Repository: ml-explore/mlx
Branch: main
Commit: 70a0da6fca8a
Files: 879
Total size: 6.3 MB

Directory structure:
gitextract_c0zbkz84/

├── .clang-format
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   └── bug_report.md
│   ├── actions/
│   │   ├── build-cuda-release/
│   │   │   └── action.yml
│   │   ├── build-docs/
│   │   │   └── action.yml
│   │   ├── build-linux/
│   │   │   └── action.yml
│   │   ├── build-linux-release/
│   │   │   └── action.yml
│   │   ├── build-macos/
│   │   │   └── action.yml
│   │   ├── build-macos-release/
│   │   │   └── action.yml
│   │   ├── build-windows/
│   │   │   └── action.yml
│   │   ├── setup-linux/
│   │   │   └── action.yml
│   │   ├── setup-macos/
│   │   │   └── action.yml
│   │   ├── setup-windows/
│   │   │   └── action.yml
│   │   ├── test-linux/
│   │   │   └── action.yml
│   │   └── test-windows/
│   │       └── action.yml
│   ├── dependabot.yml
│   ├── pull_request_template.md
│   ├── scripts/
│   │   ├── build-sanitizer-tests.sh
│   │   └── setup+build-cpp-linux-fedora-container.sh
│   └── workflows/
│       ├── build_and_test.yml
│       ├── documentation.yml
│       ├── nightly.yml
│       └── release.yml
├── .gitignore
├── .pre-commit-config.yaml
├── ACKNOWLEDGMENTS.md
├── CITATION.cff
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmarks/
│   ├── cpp/
│   │   ├── CMakeLists.txt
│   │   ├── autograd.cpp
│   │   ├── compare_devices.cpp
│   │   ├── irregular_strides.cpp
│   │   ├── single_ops.cpp
│   │   └── time_utils.h
│   ├── numpy/
│   │   ├── single_ops.py
│   │   └── time_utils.py
│   └── python/
│       ├── batch_matmul_bench.py
│       ├── blas/
│       │   ├── bench_gemm.py
│       │   └── bench_gemv.py
│       ├── comparative/
│       │   ├── README.md
│       │   ├── bench_mlx.py
│       │   ├── bench_torch.py
│       │   └── compare.py
│       ├── compile_bench.py
│       ├── conv1d_bench.py
│       ├── conv2d_bench_cpu.py
│       ├── conv2d_train_bench_cpu.py
│       ├── conv2d_transpose_bench_cpu.py
│       ├── conv3d_bench.py
│       ├── conv3d_bench_cpu.py
│       ├── conv3d_train_bench_cpu.py
│       ├── conv3d_transpose_bench_cpu.py
│       ├── conv_bench.py
│       ├── conv_transpose_bench.py
│       ├── conv_unaligned_bench.py
│       ├── distributed_bench.py
│       ├── einsum_bench.py
│       ├── fft_bench.py
│       ├── gather_bench.py
│       ├── gather_mm_bench.py
│       ├── gather_qmm_bench.py
│       ├── hadamard_bench.py
│       ├── large_gemm_bench.py
│       ├── layer_norm_bench.py
│       ├── masked_scatter.py
│       ├── rms_norm_bench.py
│       ├── rope_bench.py
│       ├── scatter_bench.py
│       ├── sdpa_bench.py
│       ├── sdpa_vector_bench.py
│       ├── segmented_mm_bench.py
│       ├── single_ops.py
│       ├── slice_update_bench.py
│       ├── synchronize_bench.py
│       └── time_utils.py
├── cmake/
│   ├── FindCUDNN.cmake
│   ├── FindNCCL.cmake
│   ├── Findnvpl.cmake
│   └── extension.cmake
├── docs/
│   ├── .clang-format
│   ├── .gitignore
│   ├── .nojekyll
│   ├── Doxyfile
│   ├── Makefile
│   ├── README.md
│   ├── index.html
│   ├── requirements.txt
│   └── src/
│       ├── _templates/
│       │   ├── module-base-class.rst
│       │   ├── nn-module-template.rst
│       │   └── optimizers-template.rst
│       ├── conf.py
│       ├── cpp/
│       │   └── ops.rst
│       ├── dev/
│       │   ├── custom_metal_kernels.rst
│       │   ├── extensions.rst
│       │   ├── metal_debugger.rst
│       │   ├── metal_logging.rst
│       │   └── mlx_in_cpp.rst
│       ├── examples/
│       │   ├── data_parallelism.rst
│       │   ├── linear_regression.rst
│       │   ├── llama-inference.rst
│       │   ├── mlp.rst
│       │   └── tensor_parallelism.rst
│       ├── index.rst
│       ├── install.rst
│       ├── python/
│       │   ├── array.rst
│       │   ├── cuda.rst
│       │   ├── data_types.rst
│       │   ├── devices_and_streams.rst
│       │   ├── distributed.rst
│       │   ├── export.rst
│       │   ├── fast.rst
│       │   ├── fft.rst
│       │   ├── linalg.rst
│       │   ├── memory_management.rst
│       │   ├── metal.rst
│       │   ├── nn/
│       │   │   ├── distributed.rst
│       │   │   ├── functions.rst
│       │   │   ├── init.rst
│       │   │   ├── layers.rst
│       │   │   ├── losses.rst
│       │   │   └── module.rst
│       │   ├── nn.rst
│       │   ├── ops.rst
│       │   ├── optimizers/
│       │   │   ├── common_optimizers.rst
│       │   │   ├── optimizer.rst
│       │   │   └── schedulers.rst
│       │   ├── optimizers.rst
│       │   ├── random.rst
│       │   ├── transforms.rst
│       │   └── tree_utils.rst
│       └── usage/
│           ├── compile.rst
│           ├── distributed.rst
│           ├── export.rst
│           ├── function_transforms.rst
│           ├── indexing.rst
│           ├── launching_distributed.rst
│           ├── lazy_evaluation.rst
│           ├── numpy.rst
│           ├── quick_start.rst
│           ├── saving_and_loading.rst
│           ├── unified_memory.rst
│           └── using_streams.rst
├── examples/
│   ├── cmake_project/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   └── example.cpp
│   ├── cpp/
│   │   ├── CMakeLists.txt
│   │   ├── distributed.cpp
│   │   ├── linear_regression.cpp
│   │   ├── logistic_regression.cpp
│   │   ├── metal_capture.cpp
│   │   ├── timer.h
│   │   └── tutorial.cpp
│   ├── export/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── eval_mlp.cpp
│   │   ├── eval_mlp.py
│   │   ├── train_mlp.cpp
│   │   └── train_mlp.py
│   ├── extensions/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── axpby/
│   │   │   ├── axpby.cpp
│   │   │   ├── axpby.h
│   │   │   └── axpby.metal
│   │   ├── bindings.cpp
│   │   ├── mlx_sample_extensions/
│   │   │   └── __init__.py
│   │   ├── pyproject.toml
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   └── test.py
│   └── python/
│       ├── linear_regression.py
│       ├── logistic_regression.py
│       └── qqmm.py
├── mlx/
│   ├── 3rdparty/
│   │   ├── .clang-format
│   │   └── pocketfft.h
│   ├── CMakeLists.txt
│   ├── allocator.h
│   ├── api.h
│   ├── array.cpp
│   ├── array.h
│   ├── backend/
│   │   ├── common/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── binary.h
│   │   │   ├── broadcasting.cpp
│   │   │   ├── broadcasting.h
│   │   │   ├── buffer_cache.h
│   │   │   ├── common.cpp
│   │   │   ├── compiled.cpp
│   │   │   ├── compiled.h
│   │   │   ├── copy.h
│   │   │   ├── hadamard.h
│   │   │   ├── load.cpp
│   │   │   ├── matmul.h
│   │   │   ├── quantized.h
│   │   │   ├── reduce.cpp
│   │   │   ├── reduce.h
│   │   │   ├── slicing.cpp
│   │   │   ├── slicing.h
│   │   │   ├── ternary.h
│   │   │   ├── unary.h
│   │   │   ├── utils.cpp
│   │   │   └── utils.h
│   │   ├── cpu/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── arange.h
│   │   │   ├── arg_reduce.cpp
│   │   │   ├── binary.cpp
│   │   │   ├── binary.h
│   │   │   ├── binary_ops.h
│   │   │   ├── binary_two.h
│   │   │   ├── cholesky.cpp
│   │   │   ├── compiled.cpp
│   │   │   ├── compiled_preamble.h
│   │   │   ├── conv.cpp
│   │   │   ├── copy.cpp
│   │   │   ├── copy.h
│   │   │   ├── device_info.cpp
│   │   │   ├── device_info.h
│   │   │   ├── distributed.cpp
│   │   │   ├── eig.cpp
│   │   │   ├── eigh.cpp
│   │   │   ├── encoder.cpp
│   │   │   ├── encoder.h
│   │   │   ├── eval.cpp
│   │   │   ├── eval.h
│   │   │   ├── fft.cpp
│   │   │   ├── gemm.h
│   │   │   ├── gemms/
│   │   │   │   ├── bnns.cpp
│   │   │   │   ├── cblas.cpp
│   │   │   │   ├── simd_bf16.cpp
│   │   │   │   ├── simd_fp16.cpp
│   │   │   │   └── simd_gemm.h
│   │   │   ├── hadamard.cpp
│   │   │   ├── indexing.cpp
│   │   │   ├── inverse.cpp
│   │   │   ├── jit_compiler.cpp
│   │   │   ├── jit_compiler.h
│   │   │   ├── lapack.h
│   │   │   ├── logsumexp.cpp
│   │   │   ├── luf.cpp
│   │   │   ├── make_compiled_preamble.ps1
│   │   │   ├── make_compiled_preamble.sh
│   │   │   ├── masked_mm.cpp
│   │   │   ├── matmul.cpp
│   │   │   ├── primitives.cpp
│   │   │   ├── qrf.cpp
│   │   │   ├── quantized.cpp
│   │   │   ├── reduce.cpp
│   │   │   ├── scan.cpp
│   │   │   ├── select.cpp
│   │   │   ├── simd/
│   │   │   │   ├── accelerate_fp16_simd.h
│   │   │   │   ├── accelerate_simd.h
│   │   │   │   ├── base_simd.h
│   │   │   │   ├── math.h
│   │   │   │   ├── neon_fp16_simd.h
│   │   │   │   ├── simd.h
│   │   │   │   └── type.h
│   │   │   ├── slicing.h
│   │   │   ├── softmax.cpp
│   │   │   ├── sort.cpp
│   │   │   ├── svd.cpp
│   │   │   ├── ternary.h
│   │   │   ├── threefry.cpp
│   │   │   ├── threefry.h
│   │   │   ├── unary.cpp
│   │   │   ├── unary.h
│   │   │   └── unary_ops.h
│   │   ├── cuda/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── allocator.cpp
│   │   │   ├── allocator.h
│   │   │   ├── arange.cu
│   │   │   ├── arg_reduce.cu
│   │   │   ├── bin2h.cmake
│   │   │   ├── binary/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   ├── add.cu
│   │   │   │   ├── arctan2.cu
│   │   │   │   ├── binary.cuh
│   │   │   │   ├── bitwise_binary.cu
│   │   │   │   ├── divide.cu
│   │   │   │   ├── equal.cu
│   │   │   │   ├── greater.cu
│   │   │   │   ├── greater_equal.cu
│   │   │   │   ├── less.cu
│   │   │   │   ├── less_equal.cu
│   │   │   │   ├── log_add_exp.cu
│   │   │   │   ├── logical_and.cu
│   │   │   │   ├── logical_or.cu
│   │   │   │   ├── maximum.cu
│   │   │   │   ├── minimum.cu
│   │   │   │   ├── multiply.cu
│   │   │   │   ├── not_equal.cu
│   │   │   │   ├── power.cu
│   │   │   │   ├── remainder.cu
│   │   │   │   └── subtract.cu
│   │   │   ├── binary_two.cu
│   │   │   ├── compiled.cpp
│   │   │   ├── conv/
│   │   │   │   ├── conv.h
│   │   │   │   ├── gemm_conv.cu
│   │   │   │   └── gemm_grouped_conv.cu
│   │   │   ├── conv.cpp
│   │   │   ├── copy/
│   │   │   │   ├── copy.cuh
│   │   │   │   ├── copy_contiguous.cu
│   │   │   │   ├── copy_general.cu
│   │   │   │   ├── copy_general_dynamic.cu
│   │   │   │   └── copy_general_input.cu
│   │   │   ├── copy.cu
│   │   │   ├── cublas_utils.cpp
│   │   │   ├── cublas_utils.h
│   │   │   ├── cuda.h
│   │   │   ├── cuda_utils.h
│   │   │   ├── cudnn_utils.cpp
│   │   │   ├── cudnn_utils.h
│   │   │   ├── custom_kernel.cpp
│   │   │   ├── cutlass_utils.cuh
│   │   │   ├── delayload.cpp
│   │   │   ├── device/
│   │   │   │   ├── atomic_ops.cuh
│   │   │   │   ├── binary_ops.cuh
│   │   │   │   ├── cast_op.cuh
│   │   │   │   ├── complex.cuh
│   │   │   │   ├── config.h
│   │   │   │   ├── fp16_math.cuh
│   │   │   │   ├── gather.cuh
│   │   │   │   ├── gather_axis.cuh
│   │   │   │   ├── hadamard.cuh
│   │   │   │   ├── indexing.cuh
│   │   │   │   ├── scatter.cuh
│   │   │   │   ├── scatter_axis.cuh
│   │   │   │   ├── scatter_ops.cuh
│   │   │   │   ├── slice_update.cuh
│   │   │   │   ├── ternary_ops.cuh
│   │   │   │   ├── unary_ops.cuh
│   │   │   │   └── utils.cuh
│   │   │   ├── device.cpp
│   │   │   ├── device.h
│   │   │   ├── device_info.cpp
│   │   │   ├── distributed.cu
│   │   │   ├── eval.cpp
│   │   │   ├── event.cu
│   │   │   ├── event.h
│   │   │   ├── fence.cpp
│   │   │   ├── fft.cu
│   │   │   ├── gemms/
│   │   │   │   ├── cublas_gemm.cpp
│   │   │   │   ├── cublas_gemm.h
│   │   │   │   ├── cublas_gemm_batched_12_0.cpp
│   │   │   │   ├── cublas_gemm_batched_12_9.cu
│   │   │   │   ├── gemv.cu
│   │   │   │   ├── gemv.h
│   │   │   │   ├── grouped_gemm.h
│   │   │   │   └── grouped_gemm_unaligned.cu
│   │   │   ├── hadamard.cu
│   │   │   ├── indexing.cpp
│   │   │   ├── jit_module.cpp
│   │   │   ├── jit_module.h
│   │   │   ├── kernel_utils.cu
│   │   │   ├── kernel_utils.cuh
│   │   │   ├── layer_norm.cu
│   │   │   ├── load.cpp
│   │   │   ├── logsumexp.cu
│   │   │   ├── lru_cache.h
│   │   │   ├── matmul.cpp
│   │   │   ├── no_cuda.cpp
│   │   │   ├── primitives.cpp
│   │   │   ├── quantized/
│   │   │   │   ├── affine_quantize.cu
│   │   │   │   ├── convert_fp8.cu
│   │   │   │   ├── cublas_qqmm.cpp
│   │   │   │   ├── cublas_qqmm.h
│   │   │   │   ├── fp_quantize.cu
│   │   │   │   ├── mxfp8_quantize.cuh
│   │   │   │   ├── no_qqmm_impl.cpp
│   │   │   │   ├── nvfp4_quantize.cuh
│   │   │   │   ├── qmm/
│   │   │   │   │   ├── CMakeLists.txt
│   │   │   │   │   ├── fp_qmv.cu
│   │   │   │   │   ├── qmm.cu
│   │   │   │   │   ├── qmm.h
│   │   │   │   │   ├── qmm_impl_sm80.cuh
│   │   │   │   │   ├── qmm_impl_sm80_m16.cu
│   │   │   │   │   ├── qmm_impl_sm80_m32.cu
│   │   │   │   │   ├── qmm_impl_sm80_m64.cu
│   │   │   │   │   ├── qmm_impl_sm90.cuh
│   │   │   │   │   ├── qmm_impl_sm90_m128_n128_m2.cu
│   │   │   │   │   ├── qmm_impl_sm90_m128_n16_m1.cu
│   │   │   │   │   ├── qmm_impl_sm90_m128_n256_m2.cu
│   │   │   │   │   ├── qmm_impl_sm90_m128_n32_m1.cu
│   │   │   │   │   ├── qmm_impl_sm90_m128_n64_m2.cu
│   │   │   │   │   └── qmv.cu
│   │   │   │   ├── qqmm.cpp
│   │   │   │   ├── qqmm_impl.cpp
│   │   │   │   ├── qqmm_impl.h
│   │   │   │   ├── qqmm_utils.cu
│   │   │   │   ├── qqmm_utils.h
│   │   │   │   ├── quantized.cpp
│   │   │   │   ├── quantized.h
│   │   │   │   └── quantized_utils.h
│   │   │   ├── random.cu
│   │   │   ├── reduce/
│   │   │   │   ├── all_reduce.cu
│   │   │   │   ├── col_reduce.cu
│   │   │   │   ├── init_reduce.cu
│   │   │   │   ├── reduce.cuh
│   │   │   │   ├── reduce_ops.cuh
│   │   │   │   ├── reduce_utils.cuh
│   │   │   │   └── row_reduce.cu
│   │   │   ├── reduce.cu
│   │   │   ├── rms_norm.cu
│   │   │   ├── rope.cu
│   │   │   ├── scaled_dot_product_attention.cpp
│   │   │   ├── scaled_dot_product_attention.cu
│   │   │   ├── scan.cu
│   │   │   ├── slicing.cpp
│   │   │   ├── softmax.cu
│   │   │   ├── sort.cu
│   │   │   ├── steel/
│   │   │   │   ├── defines.cuh
│   │   │   │   ├── gemm.cuh
│   │   │   │   ├── mma.cuh
│   │   │   │   ├── tiles.cuh
│   │   │   │   └── utils.cuh
│   │   │   ├── ternary.cu
│   │   │   ├── unary/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   ├── abs.cu
│   │   │   │   ├── arccos.cu
│   │   │   │   ├── arccosh.cu
│   │   │   │   ├── arcsin.cu
│   │   │   │   ├── arcsinh.cu
│   │   │   │   ├── arctan.cu
│   │   │   │   ├── arctanh.cu
│   │   │   │   ├── bitwise_invert.cu
│   │   │   │   ├── ceil.cu
│   │   │   │   ├── conjugate.cu
│   │   │   │   ├── cos.cu
│   │   │   │   ├── cosh.cu
│   │   │   │   ├── erf.cu
│   │   │   │   ├── erf_inv.cu
│   │   │   │   ├── exp.cu
│   │   │   │   ├── expm1.cu
│   │   │   │   ├── floor.cu
│   │   │   │   ├── imag.cu
│   │   │   │   ├── log.cu
│   │   │   │   ├── log1p.cu
│   │   │   │   ├── logical_not.cu
│   │   │   │   ├── negative.cu
│   │   │   │   ├── real.cu
│   │   │   │   ├── round.cu
│   │   │   │   ├── sigmoid.cu
│   │   │   │   ├── sign.cu
│   │   │   │   ├── sin.cu
│   │   │   │   ├── sinh.cu
│   │   │   │   ├── sqrt.cu
│   │   │   │   ├── square.cu
│   │   │   │   ├── tan.cu
│   │   │   │   ├── tanh.cu
│   │   │   │   └── unary.cuh
│   │   │   ├── utils.cpp
│   │   │   ├── utils.h
│   │   │   ├── vector_types.cuh
│   │   │   ├── worker.cpp
│   │   │   └── worker.h
│   │   ├── gpu/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── copy.cpp
│   │   │   ├── copy.h
│   │   │   ├── device_info.h
│   │   │   ├── eval.h
│   │   │   ├── primitives.cpp
│   │   │   ├── scan.h
│   │   │   ├── slicing.cpp
│   │   │   └── slicing.h
│   │   ├── metal/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── allocator.cpp
│   │   │   ├── allocator.h
│   │   │   ├── binary.cpp
│   │   │   ├── binary.h
│   │   │   ├── compiled.cpp
│   │   │   ├── conv.cpp
│   │   │   ├── copy.cpp
│   │   │   ├── custom_kernel.cpp
│   │   │   ├── device.cpp
│   │   │   ├── device.h
│   │   │   ├── device_info.cpp
│   │   │   ├── distributed.cpp
│   │   │   ├── eval.cpp
│   │   │   ├── event.cpp
│   │   │   ├── fence.cpp
│   │   │   ├── fft.cpp
│   │   │   ├── hadamard.cpp
│   │   │   ├── indexing.cpp
│   │   │   ├── jit/
│   │   │   │   ├── includes.h
│   │   │   │   └── indexing.h
│   │   │   ├── jit_kernels.cpp
│   │   │   ├── kernels/
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   ├── arange.h
│   │   │   │   ├── arange.metal
│   │   │   │   ├── arg_reduce.metal
│   │   │   │   ├── atomic.h
│   │   │   │   ├── bf16.h
│   │   │   │   ├── bf16_math.h
│   │   │   │   ├── binary.h
│   │   │   │   ├── binary.metal
│   │   │   │   ├── binary_ops.h
│   │   │   │   ├── binary_two.h
│   │   │   │   ├── binary_two.metal
│   │   │   │   ├── cexpf.h
│   │   │   │   ├── complex.h
│   │   │   │   ├── conv.metal
│   │   │   │   ├── copy.h
│   │   │   │   ├── copy.metal
│   │   │   │   ├── defines.h
│   │   │   │   ├── erf.h
│   │   │   │   ├── expm1f.h
│   │   │   │   ├── fence.metal
│   │   │   │   ├── fft/
│   │   │   │   │   ├── radix.h
│   │   │   │   │   └── readwrite.h
│   │   │   │   ├── fft.h
│   │   │   │   ├── fft.metal
│   │   │   │   ├── fp4.h
│   │   │   │   ├── fp8.h
│   │   │   │   ├── fp_quantized.h
│   │   │   │   ├── fp_quantized.metal
│   │   │   │   ├── fp_quantized_nax.h
│   │   │   │   ├── fp_quantized_nax.metal
│   │   │   │   ├── gemv.metal
│   │   │   │   ├── gemv_masked.h
│   │   │   │   ├── gemv_masked.metal
│   │   │   │   ├── hadamard.h
│   │   │   │   ├── indexing/
│   │   │   │   │   ├── gather.h
│   │   │   │   │   ├── gather_axis.h
│   │   │   │   │   ├── gather_front.h
│   │   │   │   │   ├── indexing.h
│   │   │   │   │   ├── masked_scatter.h
│   │   │   │   │   ├── scatter.h
│   │   │   │   │   └── scatter_axis.h
│   │   │   │   ├── layer_norm.metal
│   │   │   │   ├── logging.h
│   │   │   │   ├── logsumexp.h
│   │   │   │   ├── logsumexp.metal
│   │   │   │   ├── quantized.h
│   │   │   │   ├── quantized.metal
│   │   │   │   ├── quantized_nax.h
│   │   │   │   ├── quantized_nax.metal
│   │   │   │   ├── quantized_utils.h
│   │   │   │   ├── random.metal
│   │   │   │   ├── reduce.h
│   │   │   │   ├── reduce.metal
│   │   │   │   ├── reduce_utils.h
│   │   │   │   ├── reduction/
│   │   │   │   │   ├── ops.h
│   │   │   │   │   ├── reduce_all.h
│   │   │   │   │   ├── reduce_col.h
│   │   │   │   │   ├── reduce_init.h
│   │   │   │   │   └── reduce_row.h
│   │   │   │   ├── rms_norm.metal
│   │   │   │   ├── rope.metal
│   │   │   │   ├── scaled_dot_product_attention.metal
│   │   │   │   ├── scan.h
│   │   │   │   ├── scan.metal
│   │   │   │   ├── sdpa_vector.h
│   │   │   │   ├── softmax.h
│   │   │   │   ├── softmax.metal
│   │   │   │   ├── sort.h
│   │   │   │   ├── sort.metal
│   │   │   │   ├── steel/
│   │   │   │   │   ├── attn/
│   │   │   │   │   │   ├── attn.h
│   │   │   │   │   │   ├── kernels/
│   │   │   │   │   │   │   ├── steel_attention.h
│   │   │   │   │   │   │   ├── steel_attention.metal
│   │   │   │   │   │   │   ├── steel_attention_nax.h
│   │   │   │   │   │   │   └── steel_attention_nax.metal
│   │   │   │   │   │   ├── loader.h
│   │   │   │   │   │   ├── mma.h
│   │   │   │   │   │   ├── nax.h
│   │   │   │   │   │   ├── params.h
│   │   │   │   │   │   └── transforms.h
│   │   │   │   │   ├── conv/
│   │   │   │   │   │   ├── conv.h
│   │   │   │   │   │   ├── kernels/
│   │   │   │   │   │   │   ├── steel_conv.h
│   │   │   │   │   │   │   ├── steel_conv.metal
│   │   │   │   │   │   │   ├── steel_conv_3d.h
│   │   │   │   │   │   │   ├── steel_conv_3d.metal
│   │   │   │   │   │   │   ├── steel_conv_general.h
│   │   │   │   │   │   │   └── steel_conv_general.metal
│   │   │   │   │   │   ├── loader.h
│   │   │   │   │   │   ├── loaders/
│   │   │   │   │   │   │   ├── loader_channel_l.h
│   │   │   │   │   │   │   ├── loader_channel_n.h
│   │   │   │   │   │   │   └── loader_general.h
│   │   │   │   │   │   └── params.h
│   │   │   │   │   ├── defines.h
│   │   │   │   │   ├── gemm/
│   │   │   │   │   │   ├── gemm.h
│   │   │   │   │   │   ├── gemm_nax.h
│   │   │   │   │   │   ├── kernels/
│   │   │   │   │   │   │   ├── steel_gemm_fused.h
│   │   │   │   │   │   │   ├── steel_gemm_fused.metal
│   │   │   │   │   │   │   ├── steel_gemm_fused_nax.h
│   │   │   │   │   │   │   ├── steel_gemm_fused_nax.metal
│   │   │   │   │   │   │   ├── steel_gemm_gather.h
│   │   │   │   │   │   │   ├── steel_gemm_gather.metal
│   │   │   │   │   │   │   ├── steel_gemm_gather_nax.h
│   │   │   │   │   │   │   ├── steel_gemm_gather_nax.metal
│   │   │   │   │   │   │   ├── steel_gemm_masked.h
│   │   │   │   │   │   │   ├── steel_gemm_masked.metal
│   │   │   │   │   │   │   ├── steel_gemm_segmented.h
│   │   │   │   │   │   │   ├── steel_gemm_segmented.metal
│   │   │   │   │   │   │   ├── steel_gemm_splitk.h
│   │   │   │   │   │   │   ├── steel_gemm_splitk.metal
│   │   │   │   │   │   │   ├── steel_gemm_splitk_nax.h
│   │   │   │   │   │   │   └── steel_gemm_splitk_nax.metal
│   │   │   │   │   │   ├── loader.h
│   │   │   │   │   │   ├── mma.h
│   │   │   │   │   │   ├── nax.h
│   │   │   │   │   │   ├── params.h
│   │   │   │   │   │   └── transforms.h
│   │   │   │   │   ├── utils/
│   │   │   │   │   │   ├── integral_constant.h
│   │   │   │   │   │   └── type_traits.h
│   │   │   │   │   └── utils.h
│   │   │   │   ├── ternary.h
│   │   │   │   ├── ternary.metal
│   │   │   │   ├── ternary_ops.h
│   │   │   │   ├── unary.h
│   │   │   │   ├── unary.metal
│   │   │   │   ├── unary_ops.h
│   │   │   │   └── utils.h
│   │   │   ├── kernels.h
│   │   │   ├── logsumexp.cpp
│   │   │   ├── make_compiled_preamble.sh
│   │   │   ├── matmul.cpp
│   │   │   ├── matmul.h
│   │   │   ├── metal.cpp
│   │   │   ├── metal.h
│   │   │   ├── no_metal.cpp
│   │   │   ├── nojit_kernels.cpp
│   │   │   ├── normalization.cpp
│   │   │   ├── primitives.cpp
│   │   │   ├── quantized.cpp
│   │   │   ├── reduce.cpp
│   │   │   ├── reduce.h
│   │   │   ├── resident.cpp
│   │   │   ├── resident.h
│   │   │   ├── rope.cpp
│   │   │   ├── scaled_dot_product_attention.cpp
│   │   │   ├── scan.cpp
│   │   │   ├── slicing.cpp
│   │   │   ├── softmax.cpp
│   │   │   ├── sort.cpp
│   │   │   ├── ternary.cpp
│   │   │   ├── ternary.h
│   │   │   ├── unary.cpp
│   │   │   ├── unary.h
│   │   │   ├── utils.cpp
│   │   │   └── utils.h
│   │   ├── no_cpu/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── compiled.cpp
│   │   │   ├── device_info.cpp
│   │   │   └── primitives.cpp
│   │   └── no_gpu/
│   │       ├── CMakeLists.txt
│   │       ├── allocator.cpp
│   │       ├── apple_memory.h
│   │       ├── device_info.cpp
│   │       ├── eval.cpp
│   │       ├── event.cpp
│   │       ├── fence.cpp
│   │       ├── linux_memory.h
│   │       └── primitives.cpp
│   ├── compile.cpp
│   ├── compile.h
│   ├── compile_impl.h
│   ├── device.cpp
│   ├── device.h
│   ├── distributed/
│   │   ├── CMakeLists.txt
│   │   ├── distributed.cpp
│   │   ├── distributed.h
│   │   ├── distributed_impl.h
│   │   ├── jaccl/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── jaccl.cpp
│   │   │   ├── jaccl.h
│   │   │   ├── mesh.cpp
│   │   │   ├── mesh.h
│   │   │   ├── mesh_impl.h
│   │   │   ├── no_jaccl.cpp
│   │   │   ├── ring.cpp
│   │   │   ├── ring.h
│   │   │   ├── ring_impl.h
│   │   │   ├── utils.cpp
│   │   │   └── utils.h
│   │   ├── mpi/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── mpi.cpp
│   │   │   ├── mpi.h
│   │   │   ├── mpi_declarations.h
│   │   │   └── no_mpi.cpp
│   │   ├── nccl/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── nccl.cpp
│   │   │   ├── nccl.h
│   │   │   └── no_nccl.cpp
│   │   ├── ops.cpp
│   │   ├── ops.h
│   │   ├── primitives.cpp
│   │   ├── primitives.h
│   │   ├── reduction_ops.h
│   │   ├── ring/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── no_ring.cpp
│   │   │   ├── ring.cpp
│   │   │   └── ring.h
│   │   ├── utils.cpp
│   │   └── utils.h
│   ├── dtype.cpp
│   ├── dtype.h
│   ├── dtype_utils.cpp
│   ├── dtype_utils.h
│   ├── einsum.cpp
│   ├── einsum.h
│   ├── event.h
│   ├── export.cpp
│   ├── export.h
│   ├── export_impl.h
│   ├── fast.cpp
│   ├── fast.h
│   ├── fast_primitives.h
│   ├── fence.h
│   ├── fft.cpp
│   ├── fft.h
│   ├── graph_utils.cpp
│   ├── graph_utils.h
│   ├── io/
│   │   ├── CMakeLists.txt
│   │   ├── gguf.cpp
│   │   ├── gguf.h
│   │   ├── gguf_quants.cpp
│   │   ├── load.cpp
│   │   ├── load.h
│   │   ├── no_gguf.cpp
│   │   ├── no_safetensors.cpp
│   │   └── safetensors.cpp
│   ├── io.h
│   ├── linalg.cpp
│   ├── linalg.h
│   ├── memory.h
│   ├── mlx.h
│   ├── ops.cpp
│   ├── ops.h
│   ├── primitives.cpp
│   ├── primitives.h
│   ├── random.cpp
│   ├── random.h
│   ├── scheduler.cpp
│   ├── scheduler.h
│   ├── small_vector.h
│   ├── stream.h
│   ├── threadpool.h
│   ├── transforms.cpp
│   ├── transforms.h
│   ├── transforms_impl.h
│   ├── types/
│   │   ├── bf16.h
│   │   ├── complex.h
│   │   ├── fp16.h
│   │   ├── half_types.h
│   │   └── limits.h
│   ├── utils.cpp
│   ├── utils.h
│   ├── version.cpp
│   └── version.h
├── mlx.pc.in
├── pyproject.toml
├── python/
│   ├── mlx/
│   │   ├── __main__.py
│   │   ├── _distributed_utils/
│   │   │   ├── common.py
│   │   │   ├── config.py
│   │   │   └── launch.py
│   │   ├── _reprlib_fix.py
│   │   ├── _stub_patterns.txt
│   │   ├── extension.py
│   │   ├── nn/
│   │   │   ├── __init__.py
│   │   │   ├── init.py
│   │   │   ├── layers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activations.py
│   │   │   │   ├── base.py
│   │   │   │   ├── containers.py
│   │   │   │   ├── convolution.py
│   │   │   │   ├── convolution_transpose.py
│   │   │   │   ├── distributed.py
│   │   │   │   ├── dropout.py
│   │   │   │   ├── embedding.py
│   │   │   │   ├── linear.py
│   │   │   │   ├── normalization.py
│   │   │   │   ├── pooling.py
│   │   │   │   ├── positional_encoding.py
│   │   │   │   ├── quantized.py
│   │   │   │   ├── recurrent.py
│   │   │   │   ├── transformer.py
│   │   │   │   └── upsample.py
│   │   │   ├── losses.py
│   │   │   └── utils.py
│   │   ├── optimizers/
│   │   │   ├── __init__.py
│   │   │   ├── optimizers.py
│   │   │   └── schedulers.py
│   │   ├── py.typed
│   │   └── utils.py
│   ├── src/
│   │   ├── CMakeLists.txt
│   │   ├── array.cpp
│   │   ├── buffer.h
│   │   ├── constants.cpp
│   │   ├── convert.cpp
│   │   ├── convert.h
│   │   ├── cuda.cpp
│   │   ├── device.cpp
│   │   ├── distributed.cpp
│   │   ├── export.cpp
│   │   ├── fast.cpp
│   │   ├── fft.cpp
│   │   ├── indexing.cpp
│   │   ├── indexing.h
│   │   ├── linalg.cpp
│   │   ├── load.cpp
│   │   ├── load.h
│   │   ├── memory.cpp
│   │   ├── metal.cpp
│   │   ├── mlx.cpp
│   │   ├── mlx_func.cpp
│   │   ├── mlx_func.h
│   │   ├── ops.cpp
│   │   ├── random.cpp
│   │   ├── small_vector.h
│   │   ├── stream.cpp
│   │   ├── transforms.cpp
│   │   ├── trees.cpp
│   │   ├── trees.h
│   │   ├── utils.cpp
│   │   └── utils.h
│   └── tests/
│       ├── __main__.py
│       ├── cuda_skip.py
│       ├── mlx_distributed_tests.py
│       ├── mlx_tests.py
│       ├── mpi_test_distributed.py
│       ├── nccl_test_distributed.py
│       ├── ring_test_distributed.py
│       ├── test_array.py
│       ├── test_autograd.py
│       ├── test_bf16.py
│       ├── test_blas.py
│       ├── test_compile.py
│       ├── test_constants.py
│       ├── test_conv.py
│       ├── test_conv_transpose.py
│       ├── test_device.py
│       ├── test_double.py
│       ├── test_einsum.py
│       ├── test_eval.py
│       ├── test_export_import.py
│       ├── test_fast.py
│       ├── test_fast_sdpa.py
│       ├── test_fft.py
│       ├── test_graph.py
│       ├── test_init.py
│       ├── test_linalg.py
│       ├── test_load.py
│       ├── test_losses.py
│       ├── test_memory.py
│       ├── test_nn.py
│       ├── test_ops.py
│       ├── test_optimizers.py
│       ├── test_quantized.py
│       ├── test_random.py
│       ├── test_reduce.py
│       ├── test_tree.py
│       ├── test_upsample.py
│       └── test_vmap.py
├── setup.py
└── tests/
    ├── CMakeLists.txt
    ├── allocator_tests.cpp
    ├── arg_reduce_tests.cpp
    ├── array_tests.cpp
    ├── autograd_tests.cpp
    ├── blas_tests.cpp
    ├── compile_tests.cpp
    ├── creations_tests.cpp
    ├── custom_vjp_tests.cpp
    ├── device_tests.cpp
    ├── einsum_tests.cpp
    ├── eval_tests.cpp
    ├── export_import_tests.cpp
    ├── fft_tests.cpp
    ├── gpu_tests.cpp
    ├── linalg_tests.cpp
    ├── load_tests.cpp
    ├── ops_tests.cpp
    ├── random_tests.cpp
    ├── scheduler_tests.cpp
    ├── test_teardown.cpp
    ├── tests.cpp
    ├── utils_tests.cpp
    └── vmap_tests.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .clang-format
================================================
---
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands:   false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass:      false
  AfterControlStatement: false
  AfterEnum:       false
  AfterFunction:   false
  AfterNamespace:  false
  AfterObjCDeclaration: false
  AfterStruct:     false
  AfterUnion:      false
  BeforeCatch:     false
  BeforeElse:      false
  IndentBraces:    false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit:     80
CommentPragmas:  '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat:   false
ForEachMacros:   [ FOR_EACH, FOR_EACH_R, FOR_EACH_RANGE, ]
IncludeCategories:
  - Regex:           '^<.*\.h(pp)?>'
    Priority:        1
  - Regex:           '^<.*'
    Priority:        2
  - Regex:           '.*'
    Priority:        3
IndentCaseLabels: true
IndentWidth:     2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd:   ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments:  true
SortIncludes:    true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles:  false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard:        Cpp11
TabWidth:        8
UseTab:          Never
...


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report about an issue you've encountered
title: "[BUG] "
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**

Include a minimal code snippet that reproduces the bug:
```python

```

**Expected behavior**
A clear and concise description of what you expected to happen.

**Desktop (please complete the following information):**
 - OS Version: [e.g. macOS 14.1.2]
 - MLX Version: [e.g. 0.7.0]

**Additional context**
Add any other context about the problem here.


================================================
FILE: .github/actions/build-cuda-release/action.yml
================================================
# Composite action: builds the CUDA wheel with `python -m build` and repairs it
# with auditwheel for the manylinux_2_35 platform of the requested architecture.
name: 'Build CUDA wheel'
description: 'Build CUDA wheel'

inputs:
  # Architecture suffix appended to the manylinux platform tag in the
  # `auditwheel repair` call below.
  arch:
    description: 'Platform architecture tag'
    required: true
    # NOTE(review): `type`/`options` are documented for workflow_dispatch
    # inputs; confirm whether they are honored for action inputs.
    type: choice
    options:
      - x86_64
      - aarch64

runs:
  using: "composite"
  steps:
    # Installs the packaging tools, cleans earlier build output, builds the
    # wheel with MLX_BUILD_STAGE=2 and MLX_DISABLE_SM90A_KERNELS=1, then
    # repairs dist/mlx_cuda*.whl while excluding the cuBLAS, CUDA, cuDNN, NCCL
    # and NVRTC libraries from the repaired wheel.
    - name: Build package
      shell: bash
      env:
        # Turns on the CUDA backend in the CMake build.
        CMAKE_ARGS: -DMLX_BUILD_CUDA=ON
      run: |
        pip install auditwheel build patchelf setuptools
        python setup.py clean --all
        MLX_DISABLE_SM90A_KERNELS=1 MLX_BUILD_STAGE=2 python -m build -w

        auditwheel repair dist/mlx_cuda*.whl \
          --plat manylinux_2_35_${{ inputs.arch }} \
          --exclude libcublas* \
          --exclude libcuda* \
          --exclude libcudnn* \
          --exclude libnccl* \
          --exclude libnvrtc*


================================================
FILE: .github/actions/build-docs/action.yml
================================================
# Composite action that builds the documentation and uploads it as the
# `github-pages` artifact.
name: 'Build Documentation'
description: 'Build documentation'

runs:
  using: "composite"
  steps:
    # Reuses the repository's Linux setup action with its default inputs.
    - name: Setup machine
      uses: ./.github/actions/setup-linux

    # Installs doxygen system-wide, then the docs requirements and the package
    # itself inside the .venv virtual environment.
    - name: Install dependencies
      shell: bash
      run: |
        sudo apt-get install -y doxygen
        source .venv/bin/activate
        pip install -r docs/requirements.txt
        pip install . -v
  
    # Runs doxygen and then `make html` from the docs/ directory.
    # NOTE(review): `O=-W` presumably forwards -W (warnings as errors) to the
    # HTML builder — confirm against docs/Makefile.
    - name: Build documentation
      shell: bash
      run: |
        source .venv/bin/activate
        cd docs
        doxygen
        make html O=-W
    
    # Packs docs/build/html and docs/index.html into artifact.tar, following
    # symlinks (--dereference).
    - name: Create artifact tar
      shell: bash
      run: tar -cf artifact.tar -C docs --dereference build/html index.html

    # Do it manually because upload-pages-artifact requires gtar
    # The artifact is named `github-pages`, kept for one day, and the step
    # fails if artifact.tar is missing.
    - name: Upload artifact
      id: upload-artifact
      uses: actions/upload-artifact@v5
      with:
        name: github-pages
        path: artifact.tar
        retention-days: 1
        if-no-files-found: error


================================================
FILE: .github/actions/build-linux/action.yml
================================================
# Composite action that installs the Python package in editable mode and then
# builds the C++ project with CMake. Despite the name, it contains only build
# steps; no tests are run here.
name: 'Build and Test on Linux'

inputs:
  # Values starting with `cuda` switch the CUDA backend on; anything else
  # builds with -DMLX_BUILD_CUDA=OFF.
  toolkit:
    description: 'The toolkit to build with'
    required: false
    default: 'cpu'

runs:
  using: "composite"
  steps:

    # Builds and installs the Python package. The final CMAKE_ARGS value is
    # written to this step's outputs so the next step can reuse it.
    - name: Install Python package
      id: python_build
      shell: sh
      env:
        DEBUG: 1
        CMAKE_ARGS: >-
          -DCMAKE_COMPILE_WARNING_AS_ERROR=ON
          -DMLX_BUILD_CUDA=${{ startsWith(inputs.toolkit, 'cuda') && 'ON' || 'OFF' }}
      run: |
        if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
          # There is no GPU in arm64 runner, use a common arch.
          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=80"
          # Can not build tests and stubs when the built executables can not run.
          CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF -DMLX_BUILD_PYTHON_STUBS=OFF"
        fi
        # Install cpu-only torch to save space
        pip install torch --index-url https://download.pytorch.org/whl/cpu
        pip install --no-build-isolation -e ".[dev]" -v
        # Pass the CMAKE_ARGS to following steps.
        echo CMAKE_ARGS="$CMAKE_ARGS" >> $GITHUB_OUTPUT

    # Configures a Debug build in ./build with the CMAKE_ARGS exported by the
    # previous step and compiles it using all available processors.
    - name: Build CPP only
      shell: bash
      run: |
        cmake . -B build -DCMAKE_BUILD_TYPE=Debug ${{ steps.python_build.outputs.CMAKE_ARGS }}
        cmake --build build -j $(nproc)


================================================
FILE: .github/actions/build-linux-release/action.yml
================================================
# Composite action that builds the Linux `mlx` wheel (build stage 1) and,
# optionally, the `mlx_cpu` backend wheel (build stage 2), repairing each with
# auditwheel for manylinux_2_35.
name: 'Build Linux wheel'
description: 'Build Linux wheel'

inputs:
  # When true, the "Build backend package" step also runs.
  build-backend:
    description: 'Build the backend mlx-cpu package'
    type: boolean
    required: false
    default: false
  # Architecture suffix used in the manylinux platform tag.
  arch:
    description: 'Platform architecture tag'
    required: true
    type: choice
    options:
      - x86_64
      - aarch64

runs:
  using: "composite"
  steps:
    # Editable install of the package before the wheels are built.
    - name: Build MLX
      shell: bash
      run: pip install -e . -v

    # Builds the stage-1 wheel and retags it with auditwheel. libmlx.so* is
    # excluded and --only-plat is passed to `auditwheel repair`.
    - name: Build Python package
      shell: bash
      run: |
        pip install auditwheel patchelf build
        python setup.py clean --all
        MLX_BUILD_STAGE=1 python -m build -w
        auditwheel repair dist/mlx-*.whl \
          --plat manylinux_2_35_${{ inputs.arch }} \
          --exclude libmlx.so* \
          --only-plat

    # Builds the stage-2 (mlx_cpu) wheel and repairs it; skipped unless the
    # build-backend input is truthy.
    - name: Build backend package
      if: ${{ inputs.build-backend }}
      shell: bash
      run: |
        python setup.py clean --all
        MLX_BUILD_STAGE=2 python -m build -w
        auditwheel repair dist/mlx_cpu*.whl --plat manylinux_2_35_${{ inputs.arch }}


================================================
FILE: .github/actions/build-macos/action.yml
================================================
# Composite action that builds MLX on macOS and runs the Python and C++ test
# suites. Every step uses a login shell (`bash -l {0}`).
name: 'Build and Test on macOS'
description: 'Build and test MLX on macOS'

runs:
  using: "composite"
  steps:
    # Editable install with the dev extras; DEBUG=1 and warnings-as-errors are
    # passed through the environment.
    - name: Install dependencies
      env:
        DEBUG: 1
        CMAKE_ARGS: "-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
      shell: bash -l {0}
      run: |
        pip install --upgrade pip
        pip install cmake setuptools typing_extensions
        pip install -e ".[dev]" -v

    # tensorflow is installed separately, just before the tests run.
    - name: Install tests dependencies
      shell: bash -l {0}
      run: |
        pip install tensorflow

    # Runs the Python unit tests on the CPU and then on the GPU device,
    # followed by the MPI and ring distributed tests. stderr of the ring test
    # is mirrored into stderr.log so it can be scanned for warnings afterwards.
    - name: Run Python tests
      shell: bash -l {0}
      env:
        LOW_MEMORY: 1
      run: |
        DEVICE=cpu python -m unittest discover -v python/tests
        DEVICE=gpu METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 python -m unittest discover -v python/tests
        mpirun --bind-to none -host localhost:8 -np 8 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ python python/tests/mpi_test_distributed.py
        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
        # Fail the step when the ring test logged a warning. The check relies
        # on grep's exit status (-q) and matches the literal text "[WARN]" (-F);
        # the matching lines are printed first to make the failure diagnosable.
        if grep -Fq '[WARN]' stderr.log ; then
          grep -F '[WARN]' stderr.log
          echo "Distributed ring test failed"
          exit 1
        fi
    
    # Builds the example extension in place and runs its test script.
    - name: Build example extension
      shell: bash -l {0}
      run: |
        cd examples/extensions
        pip install -r requirements.txt
        python setup.py build_ext --inplace
        python test.py
    
    # Plain CMake configure + make in ./build with default options, using one
    # make job per CPU reported by `sysctl -n hw.ncpu`.
    - name: Build CPP only
      shell: bash -l {0}
      run: |
        mkdir -p build
        cd build
        cmake ..
        make -j $(sysctl -n hw.ncpu)
    
    # Runs the C++ test binary produced by the previous step with DEVICE=gpu
    # and the same METAL_* settings used for the Python GPU tests.
    - name: Run CPP tests
      shell: bash -l {0}
      env:
        DEVICE: gpu
        METAL_DEVICE_WRAPPER_TYPE: 1
        METAL_DEBUG_ERROR_MODE: 0
      run: ./build/tests/tests
    
    # Reconfigures the existing ./build directory as a MinSizeRel shared-library
    # build with the CPU backend, safetensors and GGUF support disabled and
    # Metal JIT enabled, then rebuilds.
    - name: Build small binary with JIT
      shell: bash -l {0}
      run: |
        mkdir -p build
        cd build
        cmake .. -DCMAKE_BUILD_TYPE=MinSizeRel \
          -DBUILD_SHARED_LIBS=ON \
          -DMLX_BUILD_CPU=OFF \
          -DMLX_BUILD_SAFETENSORS=OFF \
          -DMLX_BUILD_GGUF=OFF \
          -DMLX_METAL_JIT=ON
        make -j $(sysctl -n hw.ncpu)
    
    # Reinstalls the Python package with -DMLX_METAL_JIT=ON and reruns the
    # Python unit tests with DEVICE=gpu.
    - name: Run Python tests with JIT
      shell: bash -l {0}
      env:
        LOW_MEMORY: 1
        DEVICE: gpu
        METAL_DEVICE_WRAPPER_TYPE: 1
        METAL_DEBUG_ERROR_MODE: 0
      run: |
        CMAKE_ARGS="-DMLX_METAL_JIT=ON" \
          pip install -e . -v
        python -m unittest discover -v python/tests


================================================
FILE: .github/actions/build-macos-release/action.yml
================================================
# Composite action that builds the macOS release wheel (build stage 1) and,
# optionally, the backend wheel (build stage 2) for a given deployment target.
name: 'Build macOS release'
description: 'Build MLX releases macOS'

inputs:
  # Exported to the build steps as MACOSX_DEPLOYMENT_TARGET.
  macos-target:
    description: 'macOS build target'
    required: false
    default: '15.0'
  # When true, the "Build backend package" step also runs.
  build-backend:
    description: 'Build the backend mlx-metal package'
    type: boolean
    required: false
    default: false

runs:
  using: "composite"
  steps:
    # Cleans previous build output and builds the stage-1 wheel with the
    # Xcode installation selected through DEVELOPER_DIR.
    - name: Build Python package
      shell: bash -l {0}
      env:
        DEVELOPER_DIR: /Applications/Xcode-latest.app
        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
      run: |
        pip install build
        python setup.py clean --all
        MLX_BUILD_STAGE=1 python -m build -w

    # Same environment as above, but builds the stage-2 wheel; skipped unless
    # the build-backend input is truthy.
    - name: Build backend package
      if: ${{ inputs.build-backend }}
      shell: bash -l {0}
      env:
        DEVELOPER_DIR: /Applications/Xcode-latest.app
        MACOSX_DEPLOYMENT_TARGET: ${{ inputs.macos-target }}
      run: |
        python setup.py clean --all
        MLX_BUILD_STAGE=2 python -m build -w


================================================
FILE: .github/actions/build-windows/action.yml
================================================
# Composite action that installs the Python package with uv and then builds the
# C++ project with CMake, using the same CMAKE_ARGS for both.
name: 'Build on Windows'

runs:
  using: 'composite'
  steps:
    # Installs the package with the dev extras and appends CMAKE_ARGS to this
    # step's outputs so the next step can reuse it.
    - name: Install Python package
      id: python-build
      shell: cmd
      env:
        # For MSVC, Ninja/Release is the only config supported by ccache.
        CMAKE_ARGS: >-
          -G Ninja
          -DCMAKE_BUILD_TYPE=Release
          -DCMAKE_C_COMPILER=cl
          -DCMAKE_CXX_COMPILER=cl
          -DCMAKE_RC_COMPILER=rc
      run: |
        uv pip install ".[dev]" -v
        :: Pass the CMAKE_ARGS to following steps.
        >>%GITHUB_OUTPUT% ECHO CMAKE_ARGS=%CMAKE_ARGS%

    # Configures ./build with the CMAKE_ARGS output of the previous step and
    # compiles with one job per logical processor.
    - name: Build CPP only
      shell: cmd
      run: |
        cmake . -B build ${{ steps.python-build.outputs.CMAKE_ARGS }}
        cmake --build build -j %NUMBER_OF_PROCESSORS%


================================================
FILE: .github/actions/setup-linux/action.yml
================================================
# Composite action that prepares a Linux runner: system packages, optional
# ccache, Python plus a .venv virtual environment, and optionally a CUDA toolkit.
name: 'Setup Linux Environment'
description: 'Install dependencies for Linux builds'

inputs:
  # Values starting with `cuda` enable the CUDA installation steps; the exact
  # value is also used as the key into the PACKAGES map and in the ccache key.
  toolkit:
    description: 'Which toolkit to install'
    required: false
    default: 'cpu'
  python-version:
    description: 'Version of python to set up'
    required: false
    default: '3.14'
  # Compared against the string 'true' in the ccache step condition.
  use-ccache:
    description: 'Whether to enable ccache'
    required: false
    default: 'true'

runs:
  using: "composite"
  steps:
    # zip, BLAS/LAPACK development packages and OpenMPI.
    - name: Install common dependencies
      shell: bash
      run: |
        echo "::group::Install common dependencies"
        sudo apt-get update
        sudo apt-get install -y --no-install-recommends \
            zip \
            libblas-dev liblapack-dev liblapacke-dev \
            openmpi-bin openmpi-common libopenmpi-dev
        echo "::endgroup::"

    # The cache key is separated by runner OS, runner architecture and toolkit.
    - name: Use ccache
      if: ${{ inputs.use-ccache == 'true' }}
      uses: hendrikmuhs/ccache-action@v1.2
      with:
        key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ inputs.toolkit }}
        max-size: 1GB
        # ccache-action bug: running "apt-get update" fails on large arm runner.
        update-package-index: false

    - uses: actions/setup-python@v6
      with:
        python-version: ${{ inputs.python-version }}

    # Creates .venv, installs the build prerequisites into it, and writes the
    # activated PATH and a PYTHONPATH (the last sys.path entry of the venv
    # interpreter) to GITHUB_ENV so later steps see the venv.
    - name: Setup Python venv
      shell: bash
      run: |
        echo "::group::Setup Python venv"
        python -m venv .venv
        source .venv/bin/activate
        pip install setuptools cmake typing_extensions
        echo PATH=$PATH >> $GITHUB_ENV
        # Search python packages in .venv
        echo PYTHONPATH=`python -c 'import sys; print(sys.path[-1])'` >> $GITHUB_ENV
        echo "::endgroup::"

    # Registers NVIDIA's apt repository, installs NCCL plus the packages that
    # PACKAGES maps to the requested toolkit, and adds the toolkit's bin
    # directory to PATH for later steps.
    - name: Install CUDA toolkit
      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
      shell: bash
      env:
        # Note: the CI machine does not meet CUDA 13's driver requirement.
        # Compatibility matrix:
        # https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html
        PACKAGES: |
          {
            "cuda-12.6": "libcudnn9-dev-cuda-12 cuda-compiler-12-6 cuda-libraries-dev-12-6",
            "cuda-12.9": "libcudnn9-dev-cuda-12 cuda-compiler-12-9 cuda-libraries-dev-12-9",
            "cuda-13.0": "libcudnn9-dev-cuda-13 cuda-compiler-13-0 cuda-libraries-dev-13-0"
          }
      run: |
        echo "::group::Install CUDA toolkit"
        # The CUDA binaries are hosted in the "sbsa" repo, the "arm64" repo is
        # Jetson specific. SBSA means Arm Server Base System Architecture.
        ARCH=${{ runner.arch == 'arm64' && 'sbsa' || 'x86_64' }}
        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$ARCH/cuda-keyring_1.1-1_all.deb
        sudo dpkg -i cuda-keyring_1.1-1_all.deb
        sudo apt-get update
        sudo apt-get install -y --no-install-recommends \
            libnccl2 libnccl-dev \
            ${{ fromJson(env.PACKAGES)[inputs.toolkit] }}
        echo "/usr/local/${{ inputs.toolkit }}/bin" >> $GITHUB_PATH
        echo "::endgroup::"

    # Diagnostic only: lists installed CUDA/NVIDIA packages and prints
    # nvidia-smi output; a failing nvidia-smi is tolerated (`|| true`).
    - name: CUDA packages and driver report
      if: ${{ startsWith(inputs.toolkit, 'cuda') }}
      shell: bash
      run: |
        echo "::group::Installed NVIDIA and CUDA packages"
        dpkg -l | egrep "cuda|nvidia" -i
        echo "::endgroup::"
        echo "::group::NVIDIA-SMI Status"
        nvidia-smi || true
        echo "::endgroup::"


================================================
FILE: .github/actions/setup-macos/action.yml
================================================
# Composite action that prepares a macOS runner: OpenMPI via Homebrew, a check
# for the Metal toolchain, and a Miniconda-provided Python.
name: 'Setup macOS Environment'
description: 'Install dependencies for macOS builds'

inputs:
  python-version:
    description: 'Python version to use'
    required: false
    default: '3.10'

runs:
  using: "composite"
  steps:
    # Calls brew by absolute path (/opt/homebrew/bin).
    - name: Install Homebrew packages
      shell: sh
      run: /opt/homebrew/bin/brew install openmpi
    
    # Queries the MetalToolchain component through xcodebuild.
    # NOTE(review): presumably exits non-zero when the component is missing,
    # failing the job early — confirm.
    - name: Verify MetalToolchain installed
      shell: bash
      run: xcodebuild -showComponent MetalToolchain

    # Installs the latest Miniconda with the requested Python version.
    - uses: conda-incubator/setup-miniconda@v3
      with:
        miniconda-version: "latest"
        python-version: ${{ inputs.python-version }}


================================================
FILE: .github/actions/setup-windows/action.yml
================================================
# Composite action that prepares a Windows runner: optional ccache, the Visual
# Studio x64 build environment, uv, and a Python virtual environment.
name: 'Setup Windows environment'

inputs:
  python-version:
    description: 'Version of python to set up'
    required: false
    default: '3.14'
  # Compared against the string 'true' in the ccache step condition.
  use-ccache:
    description: 'Whether to enable ccache'
    required: false
    default: 'true'

runs:
  using: 'composite'
  steps:
    - name: Use ccache
      if: ${{ inputs.use-ccache == 'true' }}
      uses: hendrikmuhs/ccache-action@v1.2
      with:
        key: ccache-${{ runner.os }}-${{ runner.arch }}-cpu
        max-size: 1GB

    # Locates the latest Visual Studio with vswhere, loads its x64 variables
    # via vcvarsall.bat, then dumps the whole environment (`set`) into
    # GITHUB_ENV so every later step inherits it.
    - name: Setup Visual Studio cmd
      shell: cmd
      run: |
        :: Find out path to VS.
        pushd "C:\Program Files (x86)\Microsoft Visual Studio\Installer\"
        for /f "delims=" %%x in ('.\vswhere.exe -latest -property InstallationPath') do set VSPATH=%%x
        popd
        :: Import VS vars.
        call "%VSPATH%\VC\Auxiliary\Build\vcvarsall.bat" x64
        :: Export to all steps.
        >>%GITHUB_ENV% set

    - uses: astral-sh/setup-uv@v7

    # Creates .venv with the requested Python, activates it, and exports the
    # resulting environment to later steps the same way as above.
    - name: Setup Python venv
      shell: cmd
      run: |
        uv venv --python ${{ inputs.python-version }}
        call ".venv/Scripts/activate.bat"
        >>%GITHUB_ENV% set


================================================
FILE: .github/actions/test-linux/action.yml
================================================
# Composite action that runs the MPI, distributed, Python and C++ tests on
# Linux. The has-gpu input selects between the CPU-only and GPU step variants.
name: 'Run Linux tests'

inputs:
  # Compared against the strings 'true' / 'false' in the step conditions below.
  has-gpu:
    description: 'Run GPU tests'
    required: false
    default: false

runs:
  using: "composite"
  steps:
    # Always runs: 8 MPI processes on localhost.
    - name: Run MPI tests
      shell: bash
      run: |
        echo "::group::MPI tests"
        mpirun --bind-to none --allow-run-as-root -host localhost:8 -np 8 python python/tests/mpi_test_distributed.py
        echo "::endgroup::"

    # CPU-only: runs the ring test with 8 processes, mirrors its stderr into
    # stderr.log, and fails the step if any line contains the literal "[WARN]".
    - name: Run distributed tests
      if: ${{ inputs.has-gpu == 'false' }}
      shell: bash
      run: |
        echo "::group::Distributed tests"
        mlx.launch --verbose -n 8 python/tests/ring_test_distributed.py -v 2> >(tee -a stderr.log >&2)
        if grep -Fq '[WARN]' stderr.log ; then
          grep -F '[WARN]' stderr.log
          echo "Distributed ring test failed";
          exit 1;
        fi
        echo "::endgroup::"

    - name: Run Python tests - CPU
      if: ${{ inputs.has-gpu == 'false' }}
      shell: bash
      env:
        DEVICE: cpu
      run: |
        echo "::group::Python tests - CPU"
        python -m unittest discover python/tests -v
        echo "::endgroup::"

    # NOTE(review): this step invokes a module named `tests`, whereas the CPU
    # step above invokes `unittest` — confirm the difference is intentional.
    - name: Run Python tests - GPU
      if: ${{ inputs.has-gpu == 'true' }}
      shell: bash
      env:
        DEVICE: gpu
      run: |
        echo "::group::Python tests - GPU"
        python -m tests discover python/tests -v
        echo "::endgroup::"

    # Has no `if`, so it runs for both CPU and GPU configurations.
    - name: Run CPP tests - CPU
      shell: bash
      env:
        DEVICE: cpu
      run: |
        echo "::group::CPP tests - CPU"
        ./build/tests/tests
        echo "::endgroup::"

    # NOTE(review): -sfe presumably is the test binary's source-file-exclude
    # filter, skipping linalg_tests.cpp on GPU — confirm.
    - name: Run CPP tests - GPU
      if: ${{ inputs.has-gpu == 'true' }}
      shell: bash
      env:
        DEVICE: gpu
      run: |
        echo "::group::CPP tests - GPU"
        ./build/tests/tests -sfe="*linalg_tests.cpp"
        echo "::endgroup::"


================================================
FILE: .github/actions/test-windows/action.yml
================================================
# Composite action that runs the Python and C++ tests on Windows.
name: 'Run tests on Windows'

runs:
  using: 'composite'
  steps:
    # No DEVICE variable is set for this step.
    - name: Run Python tests - CPU
      shell: bash
      run: |
        echo "::group::Python tests - CPU"
        python -m unittest discover python/tests -v
        echo "::endgroup::"

    # NOTE(review): -tce presumably is the test binary's test-case-exclude
    # filter, skipping the gguf cases and "test random uniform" — confirm.
    - name: Run CPP tests - CPU
      shell: bash
      env:
        DEVICE: cpu
      run: |
        echo "::group::CPP tests - CPU"
        ./build/tests.exe -tce="*gguf*,test random uniform"
        echo "::endgroup::"


================================================
FILE: .github/dependabot.yml
================================================
# Dependabot configuration: checks the GitHub Actions referenced from the
# repository root for updates once a week.
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"


================================================
FILE: .github/pull_request_template.md
================================================
## Proposed changes

Please include a description of the problem or feature this PR is addressing. If there is a corresponding issue, include the issue #.

## Checklist

Put an `x` in the boxes that apply.

- [ ] I have read the [CONTRIBUTING](https://github.com/ml-explore/mlx/blob/main/CONTRIBUTING.md) document
- [ ] I have run `pre-commit run --all-files` to format my code / installed pre-commit prior to committing changes
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the necessary documentation (if needed)


================================================
FILE: .github/scripts/build-sanitizer-tests.sh
================================================
#!/bin/bash
# Configure, build and run the C++ test binary under a single sanitizer.
#
# Usage: build-sanitizer-tests.sh <asan|ubsan|tsan>   (case-insensitive)
set -ex

# NOTE(review): CMake documents CC/CXX as the environment variables used to
# pick compilers; confirm that these two exports actually take effect.
export CMAKE_C_COMPILER=/usr/bin/clang
export CMAKE_CXX_COMPILER=/usr/bin/clang++

# CMake flags shared by every sanitizer build. Metal is switched off on every
# host except Darwin.
BASE_CMAKE_ARGS="-DCMAKE_BUILD_TYPE=DEBUG -DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
case "$(uname -s)" in
  Darwin)
    ;;
  *)
    BASE_CMAKE_ARGS="${BASE_CMAKE_ARGS} -DMLX_BUILD_METAL=OFF"
    ;;
esac

# run_test NAME
#   Exports the runtime options for sanitizer NAME, rebuilds ./build from
#   scratch with -DUSE_NAME=ON, runs the test binary, and finally unsets the
#   NAME_OPTIONS variable again.
run_test() {
  local san=$1
  local san_flag="-DUSE_${san}=ON"
  echo "  Running tests with: ${san}"

  if [[ "$san" == "ASAN" ]]; then
    export ASAN_OPTIONS="detect_leaks=0"
  elif [[ "$san" == "UBSAN" ]]; then
    export UBSAN_OPTIONS="halt_on_error=0:print_stacktrace=1"
  elif [[ "$san" == "TSAN" ]]; then
    export TSAN_OPTIONS=""
  fi

  # Always start from an empty build directory.
  rm -rf build
  mkdir -p build
  pushd build > /dev/null

  cmake .. ${BASE_CMAKE_ARGS} ${san_flag}
  make -j $(nproc)
  ./tests/tests

  popd > /dev/null
  unset ${san}_OPTIONS
}

# Upper-case the first argument so the sanitizer name is case-insensitive.
requested=$(echo "$1" | tr '[:lower:]' '[:upper:]')

case "$requested" in
  ASAN|UBSAN|TSAN)
    run_test "$requested"
    echo "  ${requested} test run completed successfully."
    ;;
  *)
    echo "Error: Invalid sanitizer '$1'. Please use one of: ASAN, UBSAN, TSAN."
    exit 1
    ;;
esac


================================================
FILE: .github/scripts/setup+build-cpp-linux-fedora-container.sh
================================================
#!/bin/bash
# Installs the build dependencies inside a Fedora container, then configures,
# builds and runs the C++ tests as a compile sanity check (not a release build).
set -ex

# [Setup] Packages required by the build.
fedora_packages=(
  blas-devel
  lapack-devel
  openblas-devel
  make
  cmake
  clang
  git
)
dnf update -y
dnf install -y "${fedora_packages[@]}"
dnf clean all

# [C++] Environment for the sanity build.
# NOTE(review): the cmake call below passes none of these on its command line;
# confirm that the build actually picks them up from the environment.
export CMAKE_ARGS="-DCMAKE_COMPILE_WARNING_AS_ERROR=ON"
export DEBUG=1
export CMAKE_C_COMPILER=/usr/bin/clang
export CMAKE_CXX_COMPILER=/usr/bin/clang++

# Out-of-source debug build without Metal, followed by the test binary.
mkdir -p build
pushd build
cmake .. -DMLX_BUILD_METAL=OFF -DCMAKE_BUILD_TYPE=DEBUG
make -j $(nproc)
./tests/tests
popd


================================================
FILE: .github/workflows/build_and_test.yml
================================================
# Main CI workflow: lint first, then build and test on Linux (CPU and CUDA),
# macOS and Windows, build the documentation, and run sanitizer and Fedora
# builds. Every job except check_lint depends on check_lint.
name: Build and Test

on:
  pull_request:
  push:
    branches:
      - main
      # For testing CI without starting a pull request:
      - test/*

permissions:
  contents: read

# One run per workflow and ref; a newer run cancels an in-progress one unless
# the ref is the main branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  # Runs the repository's pre-commit hooks.
  check_lint:
    name: Check Lint
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v6
      - uses: pre-commit/action@v3.0.1

  # CPU build and tests on x86_64 and aarch64 hosted runners; prints disk
  # usage at the end.
  linux_build_and_test:
    name: Linux (cpu, ${{ matrix.arch }})
    needs: check_lint
    strategy:
      fail-fast: false
      matrix:
        arch: ['x86_64', 'aarch64']
    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
      - uses: ./.github/actions/build-linux
      - uses: ./.github/actions/test-linux
      - run: df -h

  # CUDA builds for each toolkit/arch pair, restricted to the upstream
  # repository. Tests run only on the x86_64 leg.
  cuda_build_and_test:
    name: Linux (${{ matrix.toolkit }}, ${{ matrix.arch }})
    if: github.repository == 'ml-explore/mlx'
    needs: check_lint
    strategy:
      fail-fast: false
      matrix:
        arch: ['x86_64', 'aarch64']
        toolkit: ['cuda-12.6', 'cuda-12.9']
    runs-on: ${{ matrix.arch == 'x86_64' && 'gpu-t4-4-core' || 'ubuntu-22.04-arm' }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
        with:
          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/build-linux
        with:
          toolkit: ${{ matrix.toolkit }}
      - uses: ./.github/actions/test-linux
        if: matrix.arch == 'x86_64'
        with:
          has-gpu: true

  # macOS build and tests on self-hosted runners, once per deployment target;
  # restricted to the upstream repository.
  mac_build_and_test:
    name: macOS (${{ matrix.macos-target }})
    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
        macos-target: ["14.0", "15.0", "26.0"]
    runs-on: [self-hosted, macos]
    env:
      MACOSX_DEPLOYMENT_TARGET: ${{ matrix.macos-target }}
    needs: check_lint
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-macos
      - uses: ./.github/actions/build-macos

  windows_build_and_test:
    name: Windows (cpu, x86_64)
    needs: check_lint
    runs-on: windows-2025
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-windows
      - uses: ./.github/actions/build-windows
      - uses: ./.github/actions/test-windows

  # Builds the documentation without deploying it; restricted to the upstream
  # repository.
  build_documentation:
    name: Build Documentation
    if: github.repository == 'ml-explore/mlx'
    runs-on: ubuntu-22.04
    needs: check_lint
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/build-docs

  # Builds and runs the C++ tests under each listed sanitizer on an arm runner.
  linux_sanitizer_build_and_test:
    name: Linux Sanitizer Tests (${{ matrix.sanitizer }})
    needs: check_lint
    strategy:
      fail-fast: false
      matrix:
        sanitizer: [ASAN, UBSAN]
        # todo 12/16/2025: enable TSAN later + consider enabling ASAN for GPU backend tests.
        # sanitizer: [ASAN, UBSAN, TSAN]
    runs-on: ubuntu-22.04-arm
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Install Dependencies
        run: |
          export DEBIAN_FRONTEND=noninteractive
          sudo apt-get update -y
          sudo apt-get install -y \
            build-essential \
            libblas-dev \
            liblapacke-dev \
            libopenblas-dev \
            cmake \
            clang \
            git
          sudo apt-get clean
          sudo rm -rf /var/lib/apt/lists/*

      - name: Linux Build and Test with ${{ matrix.sanitizer }}
        run: |
          bash .github/scripts/build-sanitizer-tests.sh ${{ matrix.sanitizer }}

  # C++-only build and test inside a fedora:42 container on both architectures.
  linux_fedora_build_cpp:
    name: Linux Fedora (${{ matrix.arch }})
    needs: check_lint
    strategy:
      fail-fast: false
      matrix:
        include:
          - host: ubuntu-22.04
            arch: x86_64
          - host: ubuntu-22.04-arm
            arch: aarch64

    runs-on: ${{ matrix.host }}
    container:
      image: fedora:42
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: CPP Build Test - No Release
        run: |
          bash ./.github/scripts/setup+build-cpp-linux-fedora-container.sh


================================================
FILE: .github/workflows/documentation.yml
================================================
# Manually triggered workflow that builds the documentation and deploys it to
# GitHub Pages.
name: Documentation

on:
  workflow_dispatch:

permissions:
  contents: read

jobs:
  # Produces the `github-pages` artifact through the build-docs action.
  build:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/build-docs
      
  # Deploys to GitHub Pages after `build` finishes. This job is granted the
  # pages and id-token write permissions, and the environment URL is taken
  # from the deployment step's page_url output.
  deploy:
    needs: build
    permissions:
      pages: write
      id-token: write
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4


================================================
FILE: .github/workflows/nightly.yml
================================================
# Scheduled workflow that builds wheels and runs the test suites; it can also
# be started manually.
name: Nightly Build

on:
  schedule:
    # Minute 33 of hour 6, Monday through Friday.
    - cron: 33 6 * * 1-5
  workflow_dispatch:

permissions:
  contents: read

jobs:
  # Builds the Linux x86_64 wheels for each Python version in the matrix and
  # uploads them with a 7-day retention. The mlx-cpu backend wheel is built
  # and uploaded only by the 3.10 leg.
  build_linux_release:
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.14"]
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v6
      # Set up the interpreter this matrix leg is named after; without this
      # input every leg would use the setup action's default Python version.
      - uses: ./.github/actions/setup-linux
        with:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux-release
        with:
          # The matrix key is `python_version` (underscore). A hyphenated key
          # does not exist in this matrix and would always compare unequal.
          build-backend: ${{ matrix.python_version == '3.10' }}
          arch: "x86_64"
      - name: Upload mlx artifacts
        uses: actions/upload-artifact@v7
        with:
          name: linux-wheels-${{ matrix.python_version }}
          path: wheelhouse/mlx-*.whl
          retention-days: 7
      - name: Upload mlx-cpu artifacts
        if: matrix.python_version == '3.10'
        uses: actions/upload-artifact@v7
        with:
          name: mlx-cpu
          path: wheelhouse/mlx_cpu-*.whl
          retention-days: 7
      - run: df -h

  # Builds and tests on every combination of the listed Python versions and
  # the x86_64 / arm hosted runners; prints disk usage at the end.
  build_linux_with_tests:
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.11", "3.12", "3.13", "3.14"]
        runner:
          - ubuntu-22.04
          - ubuntu-22.04-arm
    runs-on: ${{ matrix.runner }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
        with:
          python-version: ${{ matrix.python_version }}
      - uses: ./.github/actions/build-linux
      - uses: ./.github/actions/test-linux
      - run: df -h

  # Builds and tests on the self-hosted macOS runners, then builds release
  # packages for three deployment targets. The backend package is built only
  # by the 3.10 leg. Restricted to the upstream repository.
  # NOTE(review): the macos-target values below are unquoted YAML scalars
  # (26.0, 15.0, 14.0), while build_and_test.yml quotes them — confirm the
  # action receives the intended strings.
  build_mac_release:
    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
        python-version: ["3.10", "3.13"]
    runs-on: [self-hosted, macos]
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-macos
        with:
          python-version: ${{ matrix.python-version }}
      - uses: ./.github/actions/build-macos
      - name: Build macOS 26 package
        uses: ./.github/actions/build-macos-release
        with:
          macos-target: 26.0
          build-backend: ${{ matrix.python-version == '3.10' }}
      - name: Build macOS 15 package
        uses: ./.github/actions/build-macos-release
        with:
          macos-target: 15.0
          build-backend: ${{ matrix.python-version == '3.10' }}
      - name: Build macOS 14 package
        uses: ./.github/actions/build-macos-release
        with:
          macos-target: 14.0
          build-backend: ${{ matrix.python-version == '3.10' }}

  # Builds the CUDA 12.9 x86_64 wheel on the large runner and uploads it with
  # a 7-day retention. Restricted to the upstream repository.
  build_cuda_release:
    if: github.repository == 'ml-explore/mlx'
    runs-on: ubuntu-22-large
    steps:
      - uses: actions/checkout@v6
      # The toolkit is selected here; setup-linux installs it.
      - uses: ./.github/actions/setup-linux
        with:
          toolkit: 'cuda-12.9'
      # build-cuda-release declares only the `arch` input, so no toolkit is
      # passed to it.
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
          arch: 'x86_64'
      - name: Upload artifacts
        uses: actions/upload-artifact@v7
        with:
          name: mlx-cuda
          path: wheelhouse/mlx_cuda_*.whl
          retention-days: 7


================================================
FILE: .github/workflows/release.yml
================================================
# Release workflow. Runs on `v*` tags, on pushes to `test-publish/*` branches,
# and on manual dispatch with optional dry-run and dev-release switches.
name: PyPI Release

on:
  push:
    tags:
      - 'v*'
    branches:
      - 'test-publish/*'
  workflow_dispatch:
    inputs:
      # When set, the jobs and steps guarded by `!inputs.dry_run` are skipped.
      dry_run:
        description: 'Dry run (do not publish to PyPi)'
        required: false
        type: boolean
      # Mapped to DEV_RELEASE=1 (otherwise 0) in the build jobs' environment.
      dev_release:
        description: 'Development release (DEV_RELEASE=1)'
        required: false
        type: boolean

permissions:
  contents: read

jobs:
  # Builds the documentation artifact; restricted to the upstream repository.
  build_documentation:
    if: github.repository == 'ml-explore/mlx'
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/build-docs

  # Deploys the built documentation to GitHub Pages unless this is a dry run.
  # This job is granted the pages and id-token write permissions.
  deploy_documentation:
    if: ${{ !inputs.dry_run }}
    needs: build_documentation
    permissions:
      pages: write
      id-token: write
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4

  # Builds the Linux wheels for every Python version and architecture in the
  # matrix, with ccache disabled. The mlx-cpu backend wheel is built and
  # uploaded only by the 3.10 legs. Missing wheel files fail the upload steps.
  build_linux_release:
    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
        arch: ['x86_64', 'aarch64']
    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
    env:
      PYPI_RELEASE: 1
      DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
        with:
          python-version: ${{ matrix.python_version }}
          use-ccache: false
      - uses: ./.github/actions/build-linux-release
        with:
          build-backend: ${{ matrix.python_version == '3.10' }}
          arch: ${{ matrix.arch }}
      - name: Upload MLX artifacts
        uses: actions/upload-artifact@v7
        with:
          overwrite: true
          name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }}
          path: wheelhouse/mlx-*.whl
          if-no-files-found: error
      - name: Upload CPU artifacts
        if: matrix.python_version == '3.10'
        uses: actions/upload-artifact@v7
        with:
          overwrite: true
          name: mlx-cpu-${{ matrix.arch }}
          path: wheelhouse/mlx_cpu-*.whl
          if-no-files-found: error

  # Builds the macOS wheels on self-hosted runners for every Python version in
  # the matrix and for three deployment targets. The Metal backend wheel is
  # built and uploaded only by the 3.10 leg.
  # NOTE(review): the macos-target values below are unquoted YAML scalars
  # (14.0, 15.0, 26.0) — confirm the action receives the intended strings.
  build_mac_release:
    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
    runs-on: [self-hosted, macos]
    env:
      PYPI_RELEASE: 1
      DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-macos
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        shell: bash -l {0}
        run: |
          pip install --upgrade pip
          pip install cmake setuptools typing_extensions
          pip install -e . -v
      - name: Build macOS 14 package
        uses: ./.github/actions/build-macos-release
        with:
          macos-target: 14.0
          build-backend: ${{ matrix.python-version == '3.10' }}
      - name: Build macOS 15 package
        uses: ./.github/actions/build-macos-release
        with:
          macos-target: 15.0
          build-backend: ${{ matrix.python-version == '3.10' }}
      - name: Build macOS 26 package
        uses: ./.github/actions/build-macos-release
        with:
          macos-target: 26.0
          build-backend: ${{ matrix.python-version == '3.10' }}
      - name: Upload MLX artifacts
        uses: actions/upload-artifact@v7
        with:
          overwrite: true
          name: mac-wheels-${{ matrix.python-version }}
          path: dist/mlx-*.whl
          if-no-files-found: error
      - name: Upload Metal artifacts
        if: matrix.python-version == '3.10'
        uses: actions/upload-artifact@v7
        with:
          overwrite: true
          name: mlx-metal
          path: dist/mlx_metal-*.whl
          if-no-files-found: error

  # Builds the CUDA release wheels for every combination of CPU architecture
  # and CUDA toolkit in the matrix and uploads them as workflow artifacts.
  build_cuda_release:
    # Only run when the workflow executes in the ml-explore/mlx repository.
    if: github.repository == 'ml-explore/mlx'
    strategy:
      matrix:
        arch: ['x86_64', 'aarch64']
        toolkit: ['cuda-12.9', 'cuda-13.0']
    # x86_64 legs use the 'ubuntu-22-large' runner label, all other legs use
    # 'ubuntu-22-large-arm'.
    runs-on: ${{ matrix.arch == 'x86_64' && 'ubuntu-22-large' || 'ubuntu-22-large-arm' }}
    env:
      PYPI_RELEASE: 1
      # 1 when the `dev_release` workflow input is truthy, 0 otherwise.
      DEV_RELEASE: ${{ inputs.dev_release && 1 || 0 }}
    steps:
      - uses: actions/checkout@v6
      - uses: ./.github/actions/setup-linux
        with:
          toolkit: ${{ matrix.toolkit }}
          use-ccache: false
      - name: Build Python package
        uses: ./.github/actions/build-cuda-release
        with:
          arch: ${{ matrix.arch }}
      # Artifact names expand to e.g. `mlx-cuda-12.9-x86_64`, which is what the
      # `mlx-cuda-*` download pattern in pypi-publish-cuda matches.
      - name: Upload artifacts
        uses: actions/upload-artifact@v7
        with:
          overwrite: true
          name: mlx-${{ matrix.toolkit }}-${{ matrix.arch }}
          path: wheelhouse/mlx_cuda_*.whl
          if-no-files-found: error

  # Publishes the main `mlx` wheels. Waits for both the Linux and macOS build
  # jobs, gathers their wheel artifacts into `dist/`, and uploads them to PyPI
  # unless the `dry_run` input is set.
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
    needs: [build_linux_release, build_mac_release]
    permissions:
      # NOTE(review): presumably required for PyPI trusted publishing (OIDC);
      # no explicit token or password is passed to the publish step — confirm.
      id-token: write
    environment:
      # Dry runs use the 'dry-run' environment instead of 'pypi'.
      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
      url: https://pypi.org/p/mlx
    steps:
      # Both downloads target the same `dist` directory.
      - uses: actions/download-artifact@v8
        with:
          pattern: linux-wheels-*
          merge-multiple: true
          path: dist
      - uses: actions/download-artifact@v8
        with:
          pattern: mac-wheels-*
          merge-multiple: true
          path: dist
      - name: Display structure of downloaded files
        run: du -ah dist
      # Skipped entirely on dry runs; everything above still executes.
      - name: Publish package distributions to PyPI
        if: ${{ !inputs.dry_run }}
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://upload.pypi.org/legacy/

  # Publishes the CUDA wheels. Waits for build_cuda_release, gathers every
  # `mlx-cuda-*` artifact into `dist/`, and uploads to PyPI unless the `dry_run`
  # input is set.
  pypi-publish-cuda:
    name: Upload CUDA release to PyPI
    runs-on: ubuntu-latest
    needs: [build_cuda_release]
    permissions:
      # NOTE(review): presumably required for PyPI trusted publishing (OIDC);
      # no explicit token or password is passed to the publish step — confirm.
      id-token: write
    environment:
      # Dry runs use the 'dry-run' environment instead of 'pypi'.
      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
      url: https://pypi.org/p/mlx-cuda
    steps:
      - uses: actions/download-artifact@v8
        with:
          pattern: mlx-cuda-*
          merge-multiple: true
          path: dist
      - name: Display structure of downloaded files
        run: du -ah dist
      # Skipped entirely on dry runs; everything above still executes.
      - name: Publish package distributions to PyPI
        if: ${{ !inputs.dry_run }}
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://upload.pypi.org/legacy/

  # Publishes the CPU wheels. Waits for build_linux_release, gathers every
  # `mlx-cpu-*` artifact into `dist/`, and uploads to PyPI unless the `dry_run`
  # input is set.
  pypi-publish-cpu:
    name: Upload CPU release to PyPI
    runs-on: ubuntu-latest
    needs: [build_linux_release]
    permissions:
      # NOTE(review): presumably required for PyPI trusted publishing (OIDC);
      # no explicit token or password is passed to the publish step — confirm.
      id-token: write
    environment:
      # Dry runs use the 'dry-run' environment instead of 'pypi'.
      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
      url: https://pypi.org/p/mlx-cpu
    steps:
      - uses: actions/download-artifact@v8
        with:
          pattern: mlx-cpu-*
          merge-multiple: true
          path: dist
      - name: Display structure of downloaded files
        run: du -ah dist
      # Skipped entirely on dry runs; everything above still executes.
      - name: Publish package distributions to PyPI
        if: ${{ !inputs.dry_run }}
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://upload.pypi.org/legacy/

  # Publishes the Metal wheel. Waits for build_mac_release, downloads the single
  # `mlx-metal` artifact into `dist/`, and uploads to PyPI unless the `dry_run`
  # input is set.
  pypi-publish-metal:
    name: Upload Metal release to PyPI
    runs-on: ubuntu-latest
    needs: [build_mac_release]
    permissions:
      # NOTE(review): presumably required for PyPI trusted publishing (OIDC);
      # no explicit token or password is passed to the publish step — confirm.
      id-token: write
    environment:
      # Dry runs use the 'dry-run' environment instead of 'pypi'.
      name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
      url: https://pypi.org/p/mlx-metal
    steps:
      # Downloaded by exact artifact name rather than by pattern.
      - uses: actions/download-artifact@v8
        with:
          name: mlx-metal
          path: dist
      - name: Display structure of downloaded files
        run: du -ah dist
      # Skipped entirely on dry runs; everything above still executes.
      - name: Publish package distributions to PyPI
        if: ${{ !inputs.dry_run }}
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://upload.pypi.org/legacy/


================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# tensor files
*.safe
*.safetensors

# Metal libraries
*.metallib

# Distribution / packaging
python/mlx/core
python/mlx/share
python/mlx/include
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
venv/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
uv.lock
.DS_Store

# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj
*.ilk

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# Debug symbols
*.pdb

# VSCode
.vscode/
# Jetbrains
.cache/
# vim
*.swp


================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit configuration: a YAML validity check plus the clang-format, black,
# isort and cmake-format hooks, each pinned to a fixed revision.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: check-yaml
      # - id: end-of-file-fixer
      # - id: trailing-whitespace
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v21.1.8
    hooks:
      - id: clang-format
  # Using this mirror lets us use mypyc-compiled black, which is about 2x faster
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 26.1.0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/isort
    rev: 7.0.0
    hooks:
      - id: isort
        args:
          - --profile=black
  - repo: https://github.com/cheshirekow/cmake-format-precommit
    rev: v0.6.13
    hooks:
      - id: cmake-format


================================================
FILE: ACKNOWLEDGMENTS.md
================================================
# Individual Contributors

If you wish to be acknowledged for your contributions, please list your name
with a short description of your contribution(s) below. For example:

- Jane Smith: Added the `foo` and `bar` ops.

MLX was developed with contributions from the following individuals:

- Nripesh Niketan: Added `softsign`, `softmax`, `hardswish`, `logsoftmax` activation functions. Added `dropout3d` ops. Added `LogicalAnd` and `LogicalOR` ops. Added `clip_grad_norm` along with `tree_reduce`. Added `cross`. Added `orthogonal` initializer.
- Juarez Bochi: Fixed bug in cross attention.
- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, Dropout2d, linear and logistic regression python example.
- Diogo Da Cruz: Added `tri`, `tril`, `triu`, `tensordot`, `inner`, `outer`, `tile`, `StreamContext`, `stream`, safetensors support, `einsum`, and `einsum_path`.
- Gabrijel Boduljak: Added `mlx.core.linalg`, implemented `norm` method and `InstanceNorm` layer. Implemented pooling layers and `Upsample`.
- Hinrik Snær Guðmundsson: Added `atleast_1d`, `atleast_2d`, `atleast_3d` ops.
- Luca Arnaboldi: Added `Ceil` and `Floor` ops; implemented pickling, copy and deepcopy for mlx arrays.
- Brian Keene & Atila Orhon, with Argmax Inc.: Added `fast.scaled_dot_product_attention`
- AmirHossein Razlighi: Added chaining support for some of the ops in `nn.Module`. Comparison works for non array objects in `mlx.core.array`. Exception handling for invalid operations in `mlx.core.array`.
- Gleb Pobudzey: Added the `where` primitive, and groups in 1D and 2D convolutions.
- Paul Paczuski: Improved stability of BCE loss calculation
- Max-Heinrich Laves: Added `conv_transpose1d`, `conv_transpose2d`, and `conv_transpose3d` ops.
- Gökdeniz Gülmez: Added the `Muon (MomentUm Orthogonalized by Newton-Schulz)` optimizer, and the `ReLU²` activation function.

<a href="https://github.com/ml-explore/mlx/graphs/contributors">
  <img class="dark-light" src="https://contrib.rocks/image?repo=ml-explore/mlx&anon=0&columns=20&max=100&r=true" />
</a>

# Organizations

MLX has received contributions from the following companies:
- NVIDIA Corporation & Affiliates

# Third-Party Software

MLX leverages several third-party software packages, listed here together with
their licenses copied verbatim.

## PocketFFT

Copyright (C) 2010-2018 Max-Planck-Society
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

## metal-cpp

                              Apache License
                        Version 2.0, January 2004
                    http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

  "License" shall mean the terms and conditions for use, reproduction,
  and distribution as defined by Sections 1 through 9 of this document.

  "Licensor" shall mean the copyright owner or entity authorized by
  the copyright owner that is granting the License.

  "Legal Entity" shall mean the union of the acting entity and all
  other entities that control, are controlled by, or are under common
  control with that entity. For the purposes of this definition,
  "control" means (i) the power, direct or indirect, to cause the
  direction or management of such entity, whether by contract or
  otherwise, or (ii) ownership of fifty percent (50%) or more of the
  outstanding shares, or (iii) beneficial ownership of such entity.

  "You" (or "Your") shall mean an individual or Legal Entity
  exercising permissions granted by this License.

  "Source" form shall mean the preferred form for making modifications,
  including but not limited to software source code, documentation
  source, and configuration files.

  "Object" form shall mean any form resulting from mechanical
  transformation or translation of a Source form, including but
  not limited to compiled object code, generated documentation,
  and conversions to other media types.

  "Work" shall mean the work of authorship, whether in Source or
  Object form, made available under the License, as indicated by a
  copyright notice that is included in or attached to the work
  (an example is provided in the Appendix below).

  "Derivative Works" shall mean any work, whether in Source or Object
  form, that is based on (or derived from) the Work and for which the
  editorial revisions, annotations, elaborations, or other modifications
  represent, as a whole, an original work of authorship. For the purposes
  of this License, Derivative Works shall not include works that remain
  separable from, or merely link (or bind by name) to the interfaces of,
  the Work and Derivative Works thereof.

  "Contribution" shall mean any work of authorship, including
  the original version of the Work and any modifications or additions
  to that Work or Derivative Works thereof, that is intentionally
  submitted to Licensor for inclusion in the Work by the copyright owner
  or by an individual or Legal Entity authorized to submit on behalf of
  the copyright owner. For the purposes of this definition, "submitted"
  means any form of electronic, verbal, or written communication sent
  to the Licensor or its representatives, including but not limited to
  communication on electronic mailing lists, source code control systems,
  and issue tracking systems that are managed by, or on behalf of, the
  Licensor for the purpose of discussing and improving the Work, but
  excluding communication that is conspicuously marked or otherwise
  designated in writing by the copyright owner as "Not a Contribution."

  "Contributor" shall mean Licensor and any individual or Legal Entity
  on behalf of whom a Contribution has been received by Licensor and
  subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
  this License, each Contributor hereby grants to You a perpetual,
  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
  copyright license to reproduce, prepare Derivative Works of,
  publicly display, publicly perform, sublicense, and distribute the
  Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
  this License, each Contributor hereby grants to You a perpetual,
  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
  (except as stated in this section) patent license to make, have made,
  use, offer to sell, sell, import, and otherwise transfer the Work,
  where such license applies only to those patent claims licensable
  by such Contributor that are necessarily infringed by their
  Contribution(s) alone or by combination of their Contribution(s)
  with the Work to which such Contribution(s) was submitted. If You
  institute patent litigation against any entity (including a
  cross-claim or counterclaim in a lawsuit) alleging that the Work
  or a Contribution incorporated within the Work constitutes direct
  or contributory patent infringement, then any patent licenses
  granted to You under this License for that Work shall terminate
  as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
  Work or Derivative Works thereof in any medium, with or without
  modifications, and in Source or Object form, provided that You
  meet the following conditions:

  (a) You must give any other recipients of the Work or
      Derivative Works a copy of this License; and

  (b) You must cause any modified files to carry prominent notices
      stating that You changed the files; and

  (c) You must retain, in the Source form of any Derivative Works
      that You distribute, all copyright, patent, trademark, and
      attribution notices from the Source form of the Work,
      excluding those notices that do not pertain to any part of
      the Derivative Works; and

  (d) If the Work includes a "NOTICE" text file as part of its
      distribution, then any Derivative Works that You distribute must
      include a readable copy of the attribution notices contained
      within such NOTICE file, excluding those notices that do not
      pertain to any part of the Derivative Works, in at least one
      of the following places: within a NOTICE text file distributed
      as part of the Derivative Works; within the Source form or
      documentation, if provided along with the Derivative Works; or,
      within a display generated by the Derivative Works, if and
      wherever such third-party notices normally appear. The contents
      of the NOTICE file are for informational purposes only and
      do not modify the License. You may add Your own attribution
      notices within Derivative Works that You distribute, alongside
      or as an addendum to the NOTICE text from the Work, provided
      that such additional attribution notices cannot be construed
      as modifying the License.

  You may add Your own copyright statement to Your modifications and
  may provide additional or different license terms and conditions
  for use, reproduction, or distribution of Your modifications, or
  for any such Derivative Works as a whole, provided Your use,
  reproduction, and distribution of the Work otherwise complies with
  the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
  any Contribution intentionally submitted for inclusion in the Work
  by You to the Licensor shall be under the terms and conditions of
  this License, without any additional terms or conditions.
  Notwithstanding the above, nothing herein shall supersede or modify
  the terms of any separate license agreement you may have executed
  with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
  names, trademarks, service marks, or product names of the Licensor,
  except as required for reasonable and customary use in describing the
  origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
  agreed to in writing, Licensor provides the Work (and each
  Contributor provides its Contributions) on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  implied, including, without limitation, any warranties or conditions
  of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
  PARTICULAR PURPOSE. You are solely responsible for determining the
  appropriateness of using or redistributing the Work and assume any
  risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
  whether in tort (including negligence), contract, or otherwise,
  unless required by applicable law (such as deliberate and grossly
  negligent acts) or agreed to in writing, shall any Contributor be
  liable to You for damages, including any direct, indirect, special,
  incidental, or consequential damages of any character arising as a
  result of this License or out of the use or inability to use the
  Work (including but not limited to damages for loss of goodwill,
  work stoppage, computer failure or malfunction, or any and all
  other commercial damages or losses), even if such Contributor
  has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
  the Work or Derivative Works thereof, You may choose to offer,
  and charge a fee for, acceptance of support, warranty, indemnity,
  or other liability obligations and/or rights consistent with this
  License. However, in accepting such obligations, You may act only
  on Your own behalf and on Your sole responsibility, not on behalf
  of any other Contributor, and only if You agree to indemnify,
  defend, and hold each Contributor harmless for any liability
  incurred by, or claims asserted against, such Contributor by reason
  of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

  To apply the Apache License to your work, attach the following
  boilerplate notice, with the fields enclosed by brackets "[]"
  replaced with your own identifying information. (Don't include
  the brackets!)  The text should be enclosed in the appropriate
  comment syntax for the file format. We also recommend that a
  file or class name and description of purpose be included on the
  same "printed page" as the copyright notice for easier
  identification within third-party archives.

Copyright © 2023 Apple Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
title: mlx
message: >-
  If you use this software, please cite it using the
  metadata from this file.
type: software
authors:
  - given-names: Awni
    family-names: Hannun
    affiliation: Apple
  - given-names: Jagrit
    family-names: Digani
    affiliation: Apple
  - given-names: Angelos
    family-names: Katharopoulos
    affiliation: Apple
  - given-names: Ronan
    family-names: Collobert
    affiliation: Apple
repository-code: 'https://github.com/ml-explore/mlx'
abstract: >-
  MLX: efficient and flexible machine learning on Apple
  silicon
license: MIT


================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.25)

# Determine the project version.
#
# * When MLX_VERSION is not supplied, read the MLX_VERSION_MAJOR / _MINOR /
#   _PATCH defines from mlx/version.h and use "major.minor.patch" for both
#   MLX_PROJECT_VERSION and MLX_VERSION.
# * When MLX_VERSION is supplied, keep it as is and derive MLX_PROJECT_VERSION
#   from its leading "major.minor.patch" part, dropping any trailing suffix so
#   the value is acceptable to project(VERSION ...).
if(NOT MLX_VERSION)
  file(STRINGS "mlx/version.h" _mlx_h_version REGEX "^#define MLX_VERSION_.*$")
  string(REGEX MATCH "#define MLX_VERSION_MAJOR ([0-9]+)" _ "${_mlx_h_version}")
  set(_major ${CMAKE_MATCH_1})
  string(REGEX MATCH "#define MLX_VERSION_MINOR ([0-9]+)" _ "${_mlx_h_version}")
  set(_minor ${CMAKE_MATCH_1})
  string(REGEX MATCH "#define MLX_VERSION_PATCH ([0-9]+)" _ "${_mlx_h_version}")
  set(_patch ${CMAKE_MATCH_1})
  set(MLX_PROJECT_VERSION "${_major}.${_minor}.${_patch}")
  set(MLX_VERSION ${MLX_PROJECT_VERSION})
else()
  # The dots must be written as "\\." here: CMake's own string parser consumes
  # one level of backslashes, so a single "\." would reach the regex engine as
  # a bare "." and match any character instead of a literal dot.
  string(REGEX REPLACE "^([0-9]+\\.[0-9]+\\.[0-9]+).*" "\\1" MLX_PROJECT_VERSION
                       ${MLX_VERSION})
endif()

project(
  mlx
  LANGUAGES C CXX
  VERSION ${MLX_PROJECT_VERSION})

# ----------------------------- Setup -----------------------------
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_INSTALL_MESSAGE NEVER)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# ----------------------------- Configuration -----------------------------
option(MLX_BUILD_TESTS "Build tests for mlx" ON)
option(MLX_BUILD_EXAMPLES "Build examples for mlx" ON)
option(MLX_BUILD_BENCHMARKS "Build benchmarks for mlx" OFF)
option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
option(MLX_BUILD_METAL "Build metal backend" ON)
option(MLX_BUILD_CPU "Build cpu backend" ON)
option(MLX_BUILD_CUDA "Build cuda backend" OFF)
option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
option(MLX_BUILD_SAFETENSORS "Include support for safetensors format" ON)
option(MLX_BUILD_PYTHON_STUBS "Build stub files for python bindings" ON)
option(MLX_METAL_JIT "Use JIT compilation for Metal kernels" OFF)
option(MLX_USE_CCACHE "Use CCache for compilation cache when available" ON)
option(BUILD_SHARED_LIBS "Build mlx as a shared library" OFF)
option(USE_SYSTEM_FMT "Use system's provided fmt library" OFF)
option(USE_ASAN "Enable AddressSanitizer (ASan)" OFF)
option(USE_UBSAN "Enable UndefinedBehaviorSanitizer (UBSan)" OFF)
option(USE_TSAN "Enable ThreadSanitizer (TSan)" OFF)

# --------------------- Processor tests -------------------------
message(
  STATUS
    "Building MLX for ${CMAKE_SYSTEM_PROCESSOR} processor on ${CMAKE_SYSTEM_NAME}"
)

# Platform gating for the Metal backend:
#
# * Darwin + x86_64 without MLX_ENABLE_X64_MAC: configuration aborts.
# * Darwin + x86_64 with MLX_ENABLE_X64_MAC: Metal is turned off and a warning
#   is printed.
# * Any other Darwin processor: MLX_BUILD_METAL is left as configured.
# * Any non-Darwin system: Metal is turned off.
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
  if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
    if(NOT MLX_ENABLE_X64_MAC)
      message(
        FATAL_ERROR
          "Building for x86_64 on macOS is not supported."
          " If you are on an Apple silicon system, check the build"
          " documentation for possible fixes: "
          "https://ml-explore.github.io/mlx/build/html/install.html#build-from-source"
      )
    else()
      set(MLX_BUILD_METAL OFF)
      message(WARNING "Building for x86_64 arch is not officially supported.")
    endif()
  endif()
else()
  set(MLX_BUILD_METAL OFF)
endif()

# When MLX_USE_CCACHE is on and a `ccache` executable can be found, use it as
# the compiler launcher for C, C++ and CUDA. Nothing is changed if it is absent.
if(MLX_USE_CCACHE)
  find_program(CCACHE_PROGRAM ccache)
  if(CCACHE_PROGRAM)
    message(STATUS "Found CCache: ${CCACHE_PROGRAM}")
    foreach(_mlx_lang IN ITEMS C CXX CUDA)
      set(CMAKE_${_mlx_lang}_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
    endforeach()
  endif()
endif()

# Sanitizer configuration. Collects the compile and link flags requested via
# USE_ASAN / USE_UBSAN / USE_TSAN into SANITIZER_COMPILE_FLAGS and
# SANITIZER_LINK_FLAGS; both lists stay empty when no sanitizer is enabled.
# They are applied to the `mlx` target right after it is created.

# ASan and TSan cannot be combined, so reject that request up front.
if(USE_ASAN AND USE_TSAN)
  message(
    FATAL_ERROR
      "AddressSanitizer (ASan) and ThreadSanitizer (TSan) are mutually exclusive and cannot be enabled at the same time."
  )
endif()

set(SANITIZER_COMPILE_FLAGS "")
set(SANITIZER_LINK_FLAGS "")

# AddressSanitizer: MSVC uses the `/fsanitize` spelling, everything else uses
# `-fsanitize`; on Linux pthread is additionally added to the link flags.
if(USE_ASAN)
  if(WIN32 AND MSVC)
    list(APPEND SANITIZER_COMPILE_FLAGS /fsanitize=address)
    list(APPEND SANITIZER_LINK_FLAGS /fsanitize=address)
  else()
    list(APPEND SANITIZER_COMPILE_FLAGS -fsanitize=address)
    list(APPEND SANITIZER_LINK_FLAGS -fsanitize=address)
    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
      list(APPEND SANITIZER_LINK_FLAGS -lpthread)
    endif()
  endif()
endif()

# UndefinedBehaviorSanitizer: in an MSVC environment the flags are only added
# when the compiler ID is Clang; otherwise a warning is printed and no flags
# are added.
if(USE_UBSAN)
  if(WIN32 AND MSVC)
    if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
      list(APPEND SANITIZER_COMPILE_FLAGS -fsanitize=undefined)
      list(APPEND SANITIZER_LINK_FLAGS -fsanitize=undefined)
    else()
      message(
        WARNING
          "UndefinedBehaviorSanitizer (UBSan) is not directly supported via a simple flag in MSVC."
      )
    endif()
  else()
    list(APPEND SANITIZER_COMPILE_FLAGS -fsanitize=undefined)
    list(APPEND SANITIZER_LINK_FLAGS -fsanitize=undefined)
  endif()
endif()

# ThreadSanitizer: rejected for MSVC and for Darwin; elsewhere the flags are
# added and, on Linux, pthread is additionally added to the link flags.
if(USE_TSAN)
  if(WIN32 AND MSVC)
    message(
      FATAL_ERROR
        "ThreadSanitizer (TSan) is not supported by the MSVC compiler. Please use Clang or GCC."
    )
  elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
    message(FATAL_ERROR "ThreadSanitizer (TSan) is not supported on macOS.")
  else()
    list(APPEND SANITIZER_COMPILE_FLAGS -fsanitize=thread)
    list(APPEND SANITIZER_LINK_FLAGS -fsanitize=thread)
    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
      list(APPEND SANITIZER_LINK_FLAGS -lpthread)
    endif()
  endif()
endif()

# ----------------------------- Lib -----------------------------

include(FetchContent)
# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24:
cmake_policy(SET CMP0135 NEW)

add_library(mlx)

target_compile_options(mlx PUBLIC ${SANITIZER_COMPILE_FLAGS})
target_link_options(mlx PUBLIC ${SANITIZER_LINK_FLAGS})

# CUDA backend prerequisites: enable the CUDA language and require both the
# CUDAToolkit and CUDNN packages. Toolkit versions in the range [13.1, 13.2)
# are rejected with a fatal error.
if(MLX_BUILD_CUDA)
  enable_language(CUDA)
  find_package(CUDAToolkit REQUIRED)
  find_package(CUDNN REQUIRED)
  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.1" AND CUDAToolkit_VERSION
                                                          VERSION_LESS "13.2")
    message(FATAL_ERROR "CUDA Toolkit 13.1 is not supported.")
  endif()
endif()

# Metal backend prerequisites: locate the Metal, Foundation and QuartzCore
# frameworks, validate the macOS SDK and deployment target, query the Metal
# language version into MLX_METAL_VERSION, and fetch metal-cpp.
if(MLX_BUILD_METAL)
  find_library(METAL_LIB Metal)
  find_library(FOUNDATION_LIB Foundation)
  find_library(QUARTZ_LIB QuartzCore)
  if(METAL_LIB)
    message(STATUS "Metal found ${METAL_LIB}")
  else()
    message(
      FATAL_ERROR
        "Metal not found. Set MLX_BUILD_METAL=OFF to build without GPU")
  endif()

  if(MLX_METAL_DEBUG)
    add_compile_definitions(MLX_METAL_DEBUG)
  endif()

  # Throw an error if xcrun not found
  execute_process(
    COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
    OUTPUT_VARIABLE MACOS_SDK_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)

  # Compare as a version, not as a number: a numeric LESS is false for any
  # value that is not a plain real number (e.g. "13.2.1"), which would let an
  # unsupported SDK slip through the check.
  if(MACOS_SDK_VERSION VERSION_LESS 14.0)
    message(
      FATAL_ERROR
        "MLX requires macOS SDK >= 14.0 to be built with MLX_BUILD_METAL=ON")
  endif()
  message(STATUS "Building with macOS SDK version ${MACOS_SDK_VERSION}")

  set(METAL_CPP_URL
      https://developer.apple.com/metal/cpp/files/metal-cpp_26.zip)

  # When a deployment target is set, validate it (again as a version) and pass
  # it on to the metal compiler invocation below.
  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
    if(CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 14.0)
      message(FATAL_ERROR "MLX requires macOS >= 14.0")
    endif()
    set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
  endif()
  # Preprocess the __METAL_VERSION__ macro with the metal compiler and keep the
  # last output line (newline stripped) as MLX_METAL_VERSION.
  execute_process(
    COMMAND
      zsh "-c"
      "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
    OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
  FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
  FetchContent_MakeAvailable(metal_cpp)
  target_include_directories(
    mlx PUBLIC $<BUILD_INTERFACE:${metal_cpp_SOURCE_DIR}>
               $<INSTALL_INTERFACE:include/metal_cpp>)
  target_link_libraries(mlx PUBLIC ${METAL_LIB} ${FOUNDATION_LIB} ${QUARTZ_LIB})
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
  # With newer clang/gcc versions following libs are implicitly linked, but when
  # building on old distributions they need to be explicitly listed.
  target_link_libraries(mlx PRIVATE dl pthread)
endif()

if(WIN32)
  if(MSVC)
    # GGUF does not build with MSVC.
    set(MLX_BUILD_GGUF OFF)
  endif()
  # Generate DLL and EXE in the same dir, otherwise EXE will not be able to run.
  # This is only done when MLX is built as the top project.
  if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
  endif()
  # Windows implementation of dlfcn.h APIs.
  FetchContent_Declare(
    dlfcn-win32
    GIT_REPOSITORY https://github.com/dlfcn-win32/dlfcn-win32.git
    GIT_TAG v1.4.2
    EXCLUDE_FROM_ALL)
  block()
  set(BUILD_SHARED_LIBS OFF)
  FetchContent_MakeAvailable(dlfcn-win32)
  endblock()
  target_include_directories(mlx PRIVATE "${dlfcn-win32_SOURCE_DIR}/src")
  target_link_libraries(mlx PRIVATE dl)
endif()

if(MLX_BUILD_CPU)
  find_library(ACCELERATE_LIBRARY Accelerate)
  if(ACCELERATE_LIBRARY)
    message(STATUS "Accelerate found ${ACCELERATE_LIBRARY}")
    set(MLX_BUILD_ACCELERATE ON)
  else()
    message(STATUS "Accelerate not found, using default backend.")
    set(MLX_BUILD_ACCELERATE OFF)
  endif()

  if(MLX_BUILD_ACCELERATE)
    target_link_libraries(mlx PUBLIC ${ACCELERATE_LIBRARY})
    add_compile_definitions(MLX_USE_ACCELERATE)
    add_compile_definitions(ACCELERATE_NEW_LAPACK)
  elseif(WIN32)
    # Download and link prebuilt binaries of OpenBLAS. Note that we can only
    # link with the dynamic library, the prebuilt binaries were built with MinGW
    # so static-linking would require linking with MinGW's runtime.
    FetchContent_Declare(
      openblas
      URL "https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.31/OpenBLAS-0.3.31-x64.zip"
    )
    FetchContent_MakeAvailable(openblas)
    target_link_libraries(mlx
                          PRIVATE "${openblas_SOURCE_DIR}/lib/libopenblas.lib")
    target_include_directories(mlx PRIVATE "${openblas_SOURCE_DIR}/include")
    # Make sure the DLL file is placed in the same dir with executables.
    set(OPENBLAS_DLL_FILE "${openblas_SOURCE_DIR}/bin/libopenblas.dll")
    add_custom_command(
      TARGET mlx
      POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${OPENBLAS_DLL_FILE}
              ${CMAKE_BINARY_DIR})
  else()
    if(${CMAKE_HOST_APPLE})
      # The blas shipped in macOS SDK is not supported, search homebrew for
      # openblas instead.
      set(BLA_VENDOR OpenBLAS)
      set(LAPACK_ROOT
          "${LAPACK_ROOT};$ENV{LAPACK_ROOT};/usr/local/opt/openblas")
    endif()
    # Search and link with lapack.
    find_package(LAPACK REQUIRED)
    if(NOT LAPACK_FOUND)
      message(FATAL_ERROR "Must have LAPACK installed")
    endif()
    find_path(LAPACK_INCLUDE_DIRS lapacke.h /usr/include /usr/local/include
              /usr/local/opt/openblas/include)
    message(STATUS "Lapack lib " ${LAPACK_LIBRARIES})
    message(STATUS "Lapack include " ${LAPACK_INCLUDE_DIRS})
    target_include_directories(mlx PRIVATE ${LAPACK_INCLUDE_DIRS})
    target_link_libraries(mlx PRIVATE ${LAPACK_LIBRARIES})
    # List blas after lapack otherwise we may accidentally include an old
    # version of lapack.h from the include dirs of blas.
    find_package(BLAS REQUIRED)
    if(NOT BLAS_FOUND)
      message(FATAL_ERROR "Must have BLAS installed")
    endif()
    # TODO find a cleaner way to do this
    find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include /usr/local/include
              $ENV{BLAS_HOME}/include)
    message(STATUS "Blas lib " ${BLAS_LIBRARIES})
    message(STATUS "Blas include " ${BLAS_INCLUDE_DIRS})
    target_include_directories(mlx PRIVATE ${BLAS_INCLUDE_DIRS})
    target_link_libraries(mlx PRIVATE ${BLAS_LIBRARIES})
  endif()
else()
  set(MLX_BUILD_ACCELERATE OFF)
endif()

# Fetch the nlohmann/json release archive (v3.11.3) at configure time and
# expose its single-header include directory to the mlx target.
message(STATUS "Downloading json")
FetchContent_Declare(
  json
  URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_MakeAvailable(json)
# PRIVATE + BUILD_INTERFACE: the json headers are only on the include path
# while building mlx itself; they are not propagated to consumers.
target_include_directories(
  mlx PRIVATE $<BUILD_INTERFACE:${json_SOURCE_DIR}/single_include/nlohmann>)

# Pull in the build rules that live in the mlx/ subdirectory.
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/mlx)

# Public include path: the repository root when building in-tree, and the
# `include` directory of the install prefix for installed consumers.
target_include_directories(
  mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
             $<INSTALL_INTERFACE:include>)

# fmt dependency: use an already-installed package when USE_SYSTEM_FMT is set,
# otherwise fetch the pinned 12.1.0 tag from GitHub.
if(USE_SYSTEM_FMT)
  find_package(fmt REQUIRED)
else()
  FetchContent_Declare(
    fmt
    GIT_REPOSITORY https://github.com/fmtlib/fmt.git
    GIT_TAG 12.1.0
    EXCLUDE_FROM_ALL)
  FetchContent_MakeAvailable(fmt)
endif()
# Link the header-only flavour of fmt, privately and only for the build
# interface, so it does not appear in the exported link interface.
target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:fmt::fmt-header-only>)

# Optional Python bindings: require Python >= 3.10 with the interpreter and
# the Development.Module component, fetch nanobind at the pinned tag, then
# descend into python/src for the binding targets.
if(MLX_BUILD_PYTHON_BINDINGS)
  message(STATUS "Building Python bindings.")
  find_package(
    Python 3.10
    COMPONENTS Interpreter Development.Module
    REQUIRED)
  # GIT_SHALLOW keeps the clone small; EXCLUDE_FROM_ALL keeps nanobind's own
  # targets out of the default build.
  FetchContent_Declare(
    nanobind
    GIT_REPOSITORY https://github.com/wjakob/nanobind.git
    GIT_TAG v2.10.2
    GIT_SHALLOW TRUE
    EXCLUDE_FROM_ALL)
  FetchContent_MakeAvailable(nanobind)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/python/src)
endif()

# Optional sub-projects, each gated by its own MLX_BUILD_* option.

# C++ tests (CTest is included so the tests directory can register tests).
if(MLX_BUILD_TESTS)
  include(CTest)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tests)
endif()

# C++ examples.
if(MLX_BUILD_EXAMPLES)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples/cpp)
endif()

# C++ benchmarks.
if(MLX_BUILD_BENCHMARKS)
  add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/benchmarks/cpp)
endif()

# ----------------------------- Installation -----------------------------
# GNUInstallDirs provides the CMAKE_INSTALL_{LIBDIR,BINDIR,INCLUDEDIR}
# variables used by the install rules below.
include(GNUInstallDirs)

if(WIN32)
  # Install DLLs to the same dir with extension file (core.pyd) on Windows.
  # This overrides the BINDIR value set by GNUInstallDirs above.
  set(CMAKE_INSTALL_BINDIR ".")
  if(MLX_BUILD_CPU)
    # Install OpenBLAS. OPENBLAS_DLL_FILE is set in the WIN32 branch of the
    # CPU backend setup earlier in this file.
    install(FILES ${OPENBLAS_DLL_FILE} TYPE BIN)
  endif()
endif()

# Install library
# The target is added to the MLXTargets export set, which is installed as
# MLXTargets.cmake further below.
install(
  TARGETS mlx
  EXPORT MLXTargets
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
  INCLUDES
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# Install headers
# Only *.h files under mlx/ are copied; backend/metal/kernels.h is excluded.
install(
  DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/mlx
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
  COMPONENT headers
  FILES_MATCHING
  PATTERN "*.h"
  PATTERN "backend/metal/kernels.h" EXCLUDE)

# Install metal dependencies
if(MLX_BUILD_METAL)

  # Install metal cpp
  # The trailing slash copies the directory contents into
  # <includedir>/metal_cpp rather than nesting the source directory itself.
  install(
    DIRECTORY ${metal_cpp_SOURCE_DIR}/
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/metal_cpp
    COMPONENT metal_cpp_source)

endif()

# Install cmake config
# Paths of the generated package files in the build tree, and the install
# location (relative to the install prefix) for all MLX CMake package files.
set(MLX_CMAKE_BUILD_CONFIG ${CMAKE_BINARY_DIR}/MLXConfig.cmake)
set(MLX_CMAKE_BUILD_VERSION_CONFIG ${CMAKE_BINARY_DIR}/MLXConfigVersion.cmake)
set(MLX_CMAKE_INSTALL_MODULE_DIR share/cmake/MLX)

# Install the exported target definitions as MLXTargets.cmake.
install(
  EXPORT MLXTargets
  FILE MLXTargets.cmake
  DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})

include(CMakePackageConfigHelpers)

# Generate MLXConfigVersion.cmake; packages are treated as compatible when
# the major version matches.
write_basic_package_version_file(
  ${MLX_CMAKE_BUILD_VERSION_CONFIG}
  COMPATIBILITY SameMajorVersion
  VERSION ${MLX_VERSION})

# Generate MLXConfig.cmake from the mlx.pc.in template, making the listed
# install paths available to the template via PATH_VARS.
configure_package_config_file(
  ${CMAKE_CURRENT_LIST_DIR}/mlx.pc.in ${MLX_CMAKE_BUILD_CONFIG}
  INSTALL_DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR}
  NO_CHECK_REQUIRED_COMPONENTS_MACRO
  PATH_VARS CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_INCLUDEDIR
            MLX_CMAKE_INSTALL_MODULE_DIR)

# Install the generated config and version files.
install(FILES ${MLX_CMAKE_BUILD_CONFIG} ${MLX_CMAKE_BUILD_VERSION_CONFIG}
        DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})

# Install the contents of the CMAKE_MODULE_PATH directory alongside the
# package files.
install(DIRECTORY ${CMAKE_MODULE_PATH}/
        DESTINATION ${MLX_CMAKE_INSTALL_MODULE_DIR})


================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
[opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com).
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to MLX

We want to make contributing to this project as easy and transparent as
possible.

## Pull Requests

1. Fork and submit pull requests to the repo.
2. If you've added code that should be tested, add tests.
3. If a change is likely to impact efficiency, run some of the benchmarks before
   and after the change. Examples of benchmarks can be found in `benchmarks/python/`.
4. If you've changed APIs, update the documentation.
5. Every PR should have passing tests and at least one review.
6. For code formatting install `pre-commit` using something like `pip install pre-commit` and run `pre-commit install`.
   This should install hooks for running `black` and `clang-format` to ensure
   consistent style for C++ and Python code.

   You can also run the formatters manually as follows:

   ```shell
   clang-format -i file.cpp
   ```

   ```shell
   black file.py
   ```

   or run `pre-commit run --all-files` to check all files in the repo.

## Issues

We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.

## License

By contributing to MLX, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.


================================================
FILE: LICENSE
================================================
MIT License

Copyright © 2023 Apple Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: MANIFEST.in
================================================
include CMakeLists.txt
include mlx.pc.in
recursive-include mlx/ *
include cmake/*
include python/src/*
include python/mlx/py.typed # support type hinting as in PEP-561


================================================
FILE: README.md
================================================
# MLX

[**Quickstart**](#quickstart) | [**Installation**](#installation) |
[**Documentation**](https://ml-explore.github.io/mlx/build/html/index.html) |
[**Examples**](#examples)

[![CircleCI](https://circleci.com/gh/ml-explore/mlx.svg?style=svg)](https://circleci.com/gh/ml-explore/mlx)

MLX is an array framework for machine learning on Apple silicon,
brought to you by Apple machine learning research.

Some key features of MLX include:

- **Familiar APIs**: MLX has a Python API that closely follows NumPy. MLX
   also has fully featured C++, [C](https://github.com/ml-explore/mlx-c), and
   [Swift](https://github.com/ml-explore/mlx-swift/) APIs, which closely mirror
   the Python API. MLX has higher-level packages like `mlx.nn` and
   `mlx.optimizers` with APIs that closely follow PyTorch to simplify building
   more complex models.

- **Composable function transformations**: MLX supports composable function
  transformations for automatic differentiation, automatic vectorization,
  and computation graph optimization.

- **Lazy computation**: Computations in MLX are lazy. Arrays are only
  materialized when needed.

- **Dynamic graph construction**: Computation graphs in MLX are constructed
  dynamically. Changing the shapes of function arguments does not trigger
  slow compilations, and debugging is simple and intuitive.

- **Multi-device**: Operations can run on any of the supported devices
  (currently the CPU and the GPU).

- **Unified memory**: A notable difference between MLX and other frameworks
  is the *unified memory model*. Arrays in MLX live in shared memory.
  Operations on MLX arrays can be performed on any of the supported
  device types without transferring data.

MLX is designed by machine learning researchers for machine learning
researchers. The framework is intended to be user-friendly, but still efficient
to train and deploy models. The design of the framework itself is also
conceptually simple. We intend to make it easy for researchers to extend and
improve MLX with the goal of quickly exploring new ideas.

The design of MLX is inspired by frameworks like
[NumPy](https://numpy.org/doc/stable/index.html),
[PyTorch](https://pytorch.org/), [Jax](https://github.com/google/jax), and
[ArrayFire](https://arrayfire.org/).

## Examples

The [MLX examples repo](https://github.com/ml-explore/mlx-examples) has a
variety of examples, including:

- [Transformer language model](https://github.com/ml-explore/mlx-examples/tree/main/transformer_lm) training.
- Large-scale text generation with
  [LLaMA](https://github.com/ml-explore/mlx-examples/tree/main/llms/llama) and
  finetuning with [LoRA](https://github.com/ml-explore/mlx-examples/tree/main/lora).
- Generating images with [Stable Diffusion](https://github.com/ml-explore/mlx-examples/tree/main/stable_diffusion).
- Speech recognition with [OpenAI's Whisper](https://github.com/ml-explore/mlx-examples/tree/main/whisper).

## Quickstart

See the [quick start
guide](https://ml-explore.github.io/mlx/build/html/usage/quick_start.html)
in the documentation.

## Installation

MLX is available on [PyPI](https://pypi.org/project/mlx/). To install MLX on
macOS, run:

```bash
pip install mlx
```

To install the CUDA backend on Linux, run:

```bash
pip install mlx[cuda]
```

To install a CPU-only Linux package, run:

```bash
pip install mlx[cpu]
```

Check out the
[documentation](https://ml-explore.github.io/mlx/build/html/install.html#)
for more information on building the C++ and Python APIs from source.

## Contributing

Check out the [contribution guidelines](https://github.com/ml-explore/mlx/tree/main/CONTRIBUTING.md) for more information
on contributing to MLX. See the
[docs](https://ml-explore.github.io/mlx/build/html/install.html) for more
information on building from source, and running tests.

We are grateful for all of [our
contributors](https://github.com/ml-explore/mlx/tree/main/ACKNOWLEDGMENTS.md#Individual-Contributors). If you contribute
to MLX and wish to be acknowledged, please add your name to the list in your
pull request.

## Citing MLX

The MLX software suite was initially developed with equal contribution by Awni
Hannun, Jagrit Digani, Angelos Katharopoulos, and Ronan Collobert. If you find
MLX useful in your research and wish to cite it, please use the following
BibTeX entry:

```text
@software{mlx2023,
  author = {Awni Hannun and Jagrit Digani and Angelos Katharopoulos and Ronan Collobert},
  title = {{MLX}: Efficient and flexible machine learning on Apple silicon},
  url = {https://github.com/ml-explore},
  version = {0.0},
  year = {2023},
}
```


================================================
FILE: benchmarks/cpp/CMakeLists.txt
================================================
# build_benchmark(SRCFILE)
#
# Creates one executable per benchmark source file. The target is named after
# the file without directory or extension (NAME_WE) and links privately
# against the mlx library.
function(build_benchmark SRCFILE)
  get_filename_component(src_name ${SRCFILE} NAME_WE)
  set(target "${src_name}")
  add_executable(${target} ${SRCFILE})
  target_link_libraries(${target} PRIVATE mlx)
endfunction(build_benchmark)

# One benchmark executable per source file in this directory.
build_benchmark(single_ops.cpp)
build_benchmark(irregular_strides.cpp)
build_benchmark(compare_devices.cpp)
build_benchmark(autograd.cpp)


================================================
FILE: benchmarks/cpp/autograd.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <iostream>

#include "mlx/mlx.h"
#include "time_utils.h"

namespace mx = mlx::core;

void time_value_and_grad() {
  auto x = mx::ones({200, 1000});
  mx::eval(x);
  auto fn = [](mx::array x) {
    for (int i = 0; i < 20; ++i) {
      x = mx::log(mx::exp(x));
    }
    return mx::sum(x);
  };

  auto grad_fn = mx::grad(fn);
  auto independent_value_and_grad = [&]() {
    auto value = fn(x);
    auto dfdx = grad_fn(x);
    return std::vector<mx::array>{value, dfdx};
  };
  TIME(independent_value_and_grad);

  auto value_and_grad_fn = mx::value_and_grad(fn);
  auto combined_value_and_grad = [&]() {
    auto [value, dfdx] = value_and_grad_fn(x);
    return std::vector<mx::array>{value, dfdx};
  };
  TIME(combined_value_and_grad);
}

// Entry point: report the default device, then run the autograd benchmark.
int main() {
  auto device = mx::default_device();
  std::cout << "Benchmarks for " << device << std::endl;
  time_value_and_grad();
  return 0;
}


================================================
FILE: benchmarks/cpp/compare_devices.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <iostream>
#include "mlx/mlx.h"
#include "time_utils.h"

namespace mx = mlx::core;

// Times element-wise addition on the CPU and the GPU for 1-D inputs whose
// lengths are the powers of ten from 1 to 10^9.
void time_add_op() {
  std::vector<int> sizes{1};
  while (sizes.size() < 10) {
    sizes.push_back(sizes.back() * 10);
  }

  // Inputs are generated with the CPU as the default device.
  mx::set_default_device(mx::Device::cpu);
  for (const int length : sizes) {
    auto lhs = mx::random::uniform({length});
    auto rhs = mx::random::uniform({length});
    mx::eval(lhs, rhs);
    std::cout << "Size " << length << std::endl;
    TIMEM("cpu", mx::add, lhs, rhs, mx::Device::cpu);
    TIMEM("gpu", mx::add, lhs, rhs, mx::Device::gpu);
  }
}

// Entry point: run the CPU-vs-GPU addition benchmark.
int main() {
  time_add_op();
  return 0;
}


================================================
FILE: benchmarks/cpp/irregular_strides.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cstring>
#include <iostream>
#include <sstream>

#include "mlx/mlx.h"
#include "time_utils.h"

namespace mx = mlx::core;

// Times addition of two 1-D arrays that are strided views (every second
// element) of evaluated random arrays.
void time_irregular_binary_ops_1D() {
  auto target = mx::default_device();
  const int length = 1000000;
  const int stride = 2;

  auto lhs = mx::random::uniform({length});
  auto rhs = mx::random::uniform({length});
  mx::eval(lhs, rhs);

  // Replace both operands by their stride-2 slices.
  lhs = mx::slice(lhs, {0}, {length}, {stride});
  rhs = mx::slice(rhs, {0}, {length}, {stride});
  TIMEM("1D strided", mx::add, lhs, rhs, target);
}

// Times 2-D addition for a regular operand, a transposed operand, and
// operands that have to be broadcast along dim 0 or dim 1.
void time_irregular_binary_ops_2D() {
  auto target = mx::default_device();
  const int side = 2048;

  auto lhs = mx::random::uniform({side, side});
  auto rhs = mx::random::uniform({side, side});
  mx::eval(lhs, rhs);
  TIMEM("2D regular", mx::add, lhs, rhs, target);

  // Same data, transposed second operand.
  rhs = mx::transpose(rhs);
  mx::eval(rhs);
  TIMEM("2D mx::transpose", mx::add, lhs, rhs, target);

  // Vector operand of shape {side}.
  rhs = mx::random::uniform({side});
  mx::eval(rhs);
  TIMEM("2D broadcast dim 0", mx::add, lhs, rhs, target);

  // The same vector viewed as a {side, 1} column.
  rhs = mx::reshape(rhs, {side, 1});
  mx::eval(rhs);
  TIMEM("2D broadcast dim 1", mx::add, lhs, rhs, target);
}

// Times 3-D addition where the second operand is regular, transposed, or a
// smaller array that must be broadcast along one or two dimensions.
void time_irregular_binary_ops_3D() {
  auto target = mx::default_device();
  const int n0 = 32;
  const int n1 = 512;
  const int n2 = 512;

  auto lhs = mx::random::uniform({n0, n1, n2});
  auto rhs = mx::random::uniform({n0, n1, n2});
  TIMEM("3D regular", mx::add, lhs, rhs, target);

  // Swap the last two axes of the second operand.
  rhs = mx::transpose(rhs, {0, 2, 1});
  TIMEM("3D mx::transpose", mx::add, lhs, rhs, target);

  // Single broadcast dimension.
  rhs = mx::random::uniform({n1, n2});
  TIMEM("3D broadcast dim 0", mx::add, lhs, rhs, target);

  rhs = mx::random::uniform({n0, 1, n2});
  TIMEM("3D broadcast dim 1", mx::add, lhs, rhs, target);

  rhs = mx::random::uniform({n0, n1, 1});
  TIMEM("3D broadcast dim 2", mx::add, lhs, rhs, target);

  // Two broadcast dimensions.
  rhs = mx::random::uniform({n2});
  TIMEM("3D broadcast dims 0, 1", mx::add, lhs, rhs, target);

  rhs = mx::random::uniform({n1, 1});
  TIMEM("3D broadcast dims 0, 2", mx::add, lhs, rhs, target);

  rhs = mx::random::uniform({n0, 1, 1});
  TIMEM("3D broadcast dims 1, 2", mx::add, lhs, rhs, target);
}

// Times 4-D addition for a regular operand, a transposed operand, and for
// second operands that are broadcast along one, two, or three dimensions.
//
// The broadcast cases are enumerated as ordered index combinations
// i < j < k. `shape` starts as the full shape; a dimension is set to 1 while
// it takes part in the current combination and is restored to the size of
// `a` afterwards.
void time_irregular_binary_ops_4D() {
  auto device = mx::default_device();
  mx::Shape shape = {8, 8, 512, 512};
  auto a = mx::random::uniform(shape);
  auto b = mx::random::uniform(shape);

  TIMEM("4D regular", mx::add, a, b, device);

  b = mx::transpose(b, {0, 1, 3, 2});
  TIMEM("4D mx::transpose", mx::add, a, b, device);

  const std::string om = "4D broadcast dims ";
  // Signed copy of the rank so the loop counters compare against an int.
  const int ndim = static_cast<int>(shape.size());
  for (int i = 0; i < ndim; ++i) {
    // One broadcast dimension: i.
    shape[i] = 1;
    b = mx::random::uniform(shape);
    std::ostringstream msg_i;
    msg_i << om << i;
    TIMEM(msg_i.str(), mx::add, a, b, device);

    for (int j = i + 1; j < ndim; ++j) {
      // Two broadcast dimensions: i and j.
      shape[j] = 1;
      std::ostringstream msg_ij;
      msg_ij << om << i << ", " << j;
      b = mx::random::uniform(shape);
      TIMEM(msg_ij.str(), mx::add, a, b, device);

      for (int k = j + 1; k < ndim; ++k) {
        // Three broadcast dimensions: i, j and k. shape[j] must still be 1
        // here so that the measured operand matches the printed label.
        shape[k] = 1;
        std::ostringstream msg_ijk;
        msg_ijk << om << i << ", " << j << ", " << k;
        b = mx::random::uniform(shape);
        TIMEM(msg_ijk.str(), mx::add, a, b, device);
        shape[k] = a.shape(k);
      }
      // Restore dimension j only after every (i, j, k) combination has run.
      shape[j] = a.shape(j);
    }
    shape[i] = a.shape(i);
  }
}

// Times mx::reshape on a contiguous array, on transposed arrays, and on
// arrays produced by mx::broadcast_to from smaller inputs.
//
// Every input has shape {d, d, d} with d = 2 * size, and is reshaped to
// {8 * size, size, size}, which has the same number of elements
// (8 * size^3 == (2 * size)^3).
void time_irregular_reshape() {
  auto device = mx::default_device();
  mx::Shape shape;
  // The lambda reads `shape` by reference, so the target shape is whatever
  // `shape` holds at the time of the call.
  auto reshape_fn = [&shape, device](const mx::array& a) {
    return mx::reshape(a, shape, device);
  };

  int size = 64;
  int d = 2 * size;

  auto a = mx::random::uniform({d, d, d});

  shape = {8 * size, size, size};
  TIMEM("3D contiguous", reshape_fn, a);

  a = mx::transpose(a);
  shape = {8 * size, size, size};
  TIMEM("3D mx::transpose", reshape_fn, a);

  // Applied on top of the transpose above.
  a = mx::transpose(a, {1, 2, 0});
  shape = {8 * size, size, size};
  TIMEM("3D mx::transpose dims 1 2", reshape_fn, a);

  a = mx::broadcast_to(mx::random::uniform({d, d}), {d, d, d});
  TIMEM("3D broadcast dim 0", reshape_fn, a);

  a = mx::broadcast_to(mx::random::uniform({d, 1, d}), {d, d, d});
  TIMEM("3D broadcast dim 1", reshape_fn, a);

  a = mx::broadcast_to(mx::random::uniform({d, d, 1}), {d, d, d});
  TIMEM("3D broadcast dim 2", reshape_fn, a);

  a = mx::broadcast_to(mx::random::uniform({d}), {d, d, d});
  TIMEM("3D broadcast dims 0, 1", reshape_fn, a);

  a = mx::broadcast_to(mx::random::uniform({d, 1}), {d, d, d});
  TIMEM("3D broadcast dims 0, 2", reshape_fn, a);

  a = mx::broadcast_to(mx::random::uniform({d, 1, 1}), {d, d, d});
  TIMEM("3D broadcast dims 1, 2", reshape_fn, a);

  // All three dimensions are broadcast; dims are 0-indexed like the labels
  // above.
  a = mx::broadcast_to(mx::random::uniform({1, 1, 1}), {d, d, d});
  TIMEM("3D broadcast dims 0, 1, 2", reshape_fn, a);
}

// Times conversion to int32 of a 1-D array that is a stride-2 slice of a
// random array.
void time_irregular_astype_1D() {
  auto target = mx::default_device();
  const int length = 1000000;
  const int stride = 2;

  auto input = mx::random::uniform({length});
  input = mx::slice(input, {0}, {length}, {stride});
  TIMEM("1D strided", mx::astype, input, mx::int32, target);
}

void time_irregular_astype_2D() {
  auto device = mx::default_device();
  int size = 2048;
  mx::Shape shape = {size, size};

  auto a = mx::random::uniform(shape);
  TIMEM("2D regular", mx::astype, a, mx::int32, device);

  a = mx::transpose(a);
  TIMEM("2D mx::transpose", mx::astype, a, mx::int32, device);

  a = mx::broadcast_to(mx::random::uniform({size}), shape);
  TIMEM("2D broadcast dim 0", mx::astype, a, mx::int32, device);

  a = mx::broadcast_to(mx::random::uniform({size, 1}), shape);
  TIMEM("2D broadcast dim 1", mx::astype, a, mx::int32, device);
}

// Entry point. An optional first argument selects the default device: "gpu"
// selects the GPU, any other value selects the CPU. Without an argument the
// library default is left unchanged.
int main(int argc, char** argv) {
  if (argc > 1) {
    const bool wants_gpu = (std::strcmp(argv[1], "gpu") == 0);
    mx::set_default_device(wants_gpu ? mx::Device::gpu : mx::Device::cpu);
  }
  std::cout << "Benchmarks for " << mx::default_device() << std::endl;

  // Binary ops on irregularly strided / broadcast operands.
  time_irregular_binary_ops_1D();
  time_irregular_binary_ops_2D();
  time_irregular_binary_ops_3D();
  time_irregular_binary_ops_4D();

  // Reshape and dtype conversion on irregular inputs.
  time_irregular_reshape();
  time_irregular_astype_1D();
  time_irregular_astype_2D();
  return 0;
}


================================================
FILE: benchmarks/cpp/single_ops.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "mlx/mlx.h"
#include "time_utils.h"

namespace mx = mlx::core;

// Times array-creation routines: full, zeros and ones on a 2000 x 500 shape,
// plus an arange from 0 to 10 with step 1e-4.
void time_creation_ops() {
  const int rows = 2000;
  const int cols = 500;
  auto dims = {rows, cols};

  auto full_fp32 = [&]() { return mx::full(dims, 3.3f); };
  TIME(full_fp32);

  auto zeros_fp32 = [&]() { return mx::zeros(dims, mx::float32); };
  TIME(zeros_fp32);

  auto ones_fp32 = [&]() { return mx::ones(dims, mx::float32); };
  TIME(ones_fp32);

  auto arange_fp32 = [&]() { return mx::arange(0.0, 10.0, 1e-4); };
  TIME(arange_fp32);
}

// Times mx::astype between float32, int32, uint32 and bool for a 2000 x 500
// array of zeros.
void time_type_conversions() {
  const int rows = 2000;
  const int cols = 500;
  auto dims = {rows, cols};
  auto target = mx::default_device();

  // float32 source.
  auto src = mx::zeros(dims, mx::float32);
  mx::eval(src);
  TIMEM("mx::float32 to mx::int32", mx::astype, src, mx::int32, target);
  TIMEM("mx::float32 to mx::uint32", mx::astype, src, mx::uint32, target);

  // int32 source.
  src = mx::zeros(dims, mx::int32);
  mx::eval(src);
  TIMEM("mx::int32 to mx::float32", mx::astype, src, mx::float32, target);

  // bool source.
  src = mx::zeros(dims, mx::bool_);
  mx::eval(src);
  TIMEM("bool to mx::float32", mx::astype, src, mx::float32, target);
  TIMEM("bool to mx::int32", mx::astype, src, mx::int32, target);
  TIMEM("bool to mx::uint32", mx::astype, src, mx::uint32, target);
}

// Times generation of 2000 x 500 float32 arrays from the uniform and normal
// random generators.
void time_random_generation() {
  const int rows = 2000;
  const int cols = 500;

  auto uniform = [&]() {
    return mx::random::uniform({rows, cols}, mx::float32);
  };
  TIME(uniform);

  auto normal = [&]() {
    return mx::random::normal({rows, cols}, mx::float32);
  };
  TIME(normal);
}

// Times element-wise unary operations on a 2000 x 500 array.
void time_unary_ops() {
  const int rows = 2000;
  const int cols = 500;
  auto target = mx::default_device();

  // Normally distributed input for most of the ops.
  auto input = mx::random::normal({rows, cols});
  mx::eval(input);
  TIME(mlx::core::abs, input, target);
  TIME(mx::negative, input, target);
  TIME(mx::sign, input, target);
  TIME(mx::square, input, target);
  TIME(mlx::core::sqrt, input, target);
  TIME(mx::rsqrt, input, target);
  TIME(mlx::core::exp, input, target);

  // log is timed on uniform samples instead (presumably to keep its inputs
  // non-negative -- TODO confirm the default range of mx::random::uniform).
  input = mx::random::uniform({rows, cols});
  TIME(mlx::core::log, input, target);
}

// Times element-wise binary ops and mx::where for three operand layouts:
// two full 1000 x 100 x 10 arrays, a full array combined with a one-element
// array, and two one-element arrays broadcast to 1000 x 100.
void time_binary_ops() {
  const int M = 1000;
  const int N = 100;
  const int K = 10;
  auto mask = mx::random::randint(0, 2, {M, N, K});
  auto lhs = mx::random::uniform({M, N, K});
  auto rhs = mx::random::uniform({M, N, K});
  auto target = mx::default_device();
  mx::eval(lhs, rhs);

  // Both operands have the full shape.
  TIME(mx::add, lhs, rhs, target);
  TIME(mx::subtract, lhs, rhs, target);
  TIME(mx::multiply, lhs, rhs, target);
  TIME(mx::divide, lhs, rhs, target);
  TIME(mx::maximum, lhs, rhs, target);
  TIME(mx::minimum, lhs, rhs, target);
  TIME(mx::where, mask, lhs, rhs, target);

  // One operand (and the condition) holds a single element.
  mask = mx::array({true});
  rhs = mx::random::uniform({1});
  mx::eval(rhs);
  TIMEM("scalar", mx::add, lhs, rhs, target);
  TIMEM("vector-scalar", mx::subtract, lhs, rhs, target);
  TIMEM("scalar-vector", mx::subtract, rhs, lhs, target);
  TIMEM("scalar", mx::multiply, lhs, rhs, target);
  TIMEM("vector-scalar", mx::divide, lhs, rhs, target);
  TIMEM("scalar-vector", mx::divide, rhs, lhs, target);
  TIMEM("scalar-vector", mx::where, mask, lhs, rhs, target);

  // Every operand is a single element broadcast to {1000, 100}.
  mask = mx::broadcast_to(mx::array({true}), {1000, 100});
  lhs = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
  rhs = mx::broadcast_to(mx::random::uniform({1}), {1000, 100});
  mx::eval(lhs, rhs);
  TIMEM("scalar-scalar broadcast", mx::add, lhs, rhs, target);
  TIMEM("scalar-scalar broadcast", mx::subtract, lhs, rhs, target);
  TIMEM("scalar-scalar broadcast", mx::multiply, lhs, rhs, target);
  TIMEM("scalar-scalar broadcast", mx::divide, lhs, rhs, target);
  TIMEM("scalar-scalar broadcast", mx::where, mask, lhs, rhs, target);
}

// Times addition of two 50^4 arrays, first as generated and then after each
// operand has been transposed with a different axis permutation.
void time_strided_ops() {
  const int M = 50;
  const int N = 50;
  const int O = 50;
  const int P = 50;
  auto lhs = mx::random::uniform({M, N, O, P});
  auto rhs = mx::random::uniform({M, N, O, P});
  auto target = mx::default_device();
  mx::eval(lhs, rhs);
  TIMEM("non-strided", mx::add, lhs, rhs, target);

  // Permute the axes of both operands differently.
  lhs = mx::transpose(lhs, {1, 0, 2, 3});
  rhs = mx::transpose(rhs, {3, 2, 0, 1});
  mx::eval(lhs, rhs);
  TIMEM("strided", mx::add, lhs, rhs, target);
}

// Times the element-wise comparison ops on two 1000 x 100 x 10 uniform
// arrays.
void time_comparisons() {
  const int M = 1000;
  const int N = 100;
  const int K = 10;
  auto lhs = mx::random::uniform({M, N, K});
  auto rhs = mx::random::uniform({M, N, K});
  auto target = mx::default_device();
  mx::eval(lhs, rhs);

  TIME(mx::equal, lhs, rhs, target);
  TIME(mx::greater, lhs, rhs, target);
  TIME(mx::greater_equal, lhs, rhs, target);
  TIME(mx::less, lhs, rhs, target);
  TIME(mx::less_equal, lhs, rhs, target);
}

// Times matrix-vector products for a 2000 x 200 matrix: the matrix times a
// length-200 vector, and its transpose times a length-2000 vector.
void time_matvec() {
  const int M = 2000;
  const int N = 200;
  auto matrix = mx::random::uniform({M, N});
  auto vec_n = mx::random::uniform({N});
  auto vec_m = mx::random::uniform({M});
  mx::eval(matrix, vec_n, vec_m);

  auto matvec = [&]() { return mx::matmul(matrix, vec_n); };
  TIME(matvec);

  auto matvec_transpose = [&]() {
    return mx::matmul(mx::transpose(matrix), vec_m);
  };
  TIME(matvec_transpose);
}

// Times a 1000 x 1000 by 1000 x 1000 matrix product, with and without
// transposing the left operand first.
void time_matmul() {
  const int M = 1000;
  const int N = 1000;
  const int K = 1000;
  auto lhs = mx::random::uniform({M, K});
  auto rhs = mx::random::uniform({K, N});
  auto target = mx::default_device();
  mx::eval(lhs, rhs);
  TIME(mx::matmul, lhs, rhs, target);

  auto transpose_matmul = [&]() {
    return mx::matmul(mx::transpose(lhs), rhs);
  };
  TIME(transpose_matmul);
}

void time_reductions() {
  auto a = mx::random::normal({10000, 1000});
  mx::eval(a);
  auto sum_all = [&a]() { return mx::sum(a, false); };
  TIME(sum_all);

  auto sum_along_0 = [&a]() { return mx::sum(a, 0, false); };
  TIME(sum_along_0);

  auto sum_along_1 = [&a]() { return mx::sum(a, 1, false); };
  TIME(sum_along_1);

  auto prod_all = [&a]() { return mx::prod(a, false); };
  TIME(prod_all);

  auto all_true = [&a]() { return mx::all(a, false); };
  TIME(all_true);

  auto all_along_0 = [&a]() { return mx::all(a, 0, false); };
  TIME(all_along_0);

  auto all_along_1 = [&a]() { return mx::all(a, 1, false); };
  TIME(all_along_1);

  auto any_true = [&a]() { return mx::any(a, false); };
  TIME(any_true);

  auto argmin_along_0 = [&a]() { return mx::argmin(a, 0, false); };
  TIME(argmin_along_0);

  auto argmin_along_1 = [&a]() { return mx::argmin(a, 1, false); };
  TIME(argmin_along_1);

  auto indices = mx::array({1});
  auto updates = mx::reshape(mx::array({NAN}), {1, 1, 1});
  std::vector<int> axes{0};
  auto b = scatter(a, {indices}, updates, axes);
  mx::eval(b);

  auto max_along_0 = [&b]() { return mx::max(b, 0, false); };
  TIME(max_along_0);
  auto max_along_1 = [&b]() { return mx::max(b, 1, false); };
  TIME(max_along_1);

  auto min_along_0 = [&b]() { return mx::min(b, 0, false); };
  TIME(min_along_0);
  auto min_along_1 = [&b]() { return mx::min(b, 1, false); };
  TIME(min_along_1);
}

// Benchmarks gather (mx::take) and scatter / scatter_add, first with whole
// rows of a 1000x768 array and then with single elements.
//
// The lambdas capture `a`, `indices` and `updates` by reference and those
// variables are reassigned between sections, so each TIME call must run
// before the next reassignment.
void time_gather_scatter() {
  auto a = mx::random::normal({1000, 768});
  mx::eval(a);
  auto indices = mx::random::randint(0, 1000, {256});
  mx::eval(indices);

  // Gather 256 rows along axis 0.
  auto embedding_lookup = [&a, &indices]() { return mx::take(a, indices, 0); };
  TIME(embedding_lookup);

  // 256 * 768 indices drawn with randint(0, 768 * 1000), used by take()
  // without an axis argument.
  indices = mx::random::randint(0, 768 * 1000, {256 * 768});
  mx::eval(indices);

  auto single_element_lookup = [&a, &indices]() {
    return mx::take(a, indices);
  };
  TIME(single_element_lookup);

  // Row updates: 256 indices with updates of shape {256, 1, 768}.
  indices = mx::random::randint(0, 1000, {256});
  auto updates = mx::random::normal({256, 1, 768});
  mx::eval(indices, updates);

  auto embedding_update = [&a, &indices, &updates]() {
    return scatter(a, indices, updates, 0);
  };
  TIME(embedding_update);

  auto embedding_add = [&a, &indices, &updates]() {
    return scatter_add(a, indices, updates, 0);
  };
  TIME(embedding_add);

  // Element updates: flatten `a` and scatter 256 * 768 updates of shape {1}.
  a = mx::reshape(a, {-1});
  indices = mx::random::randint(0, 768 * 1000, {768 * 256});
  updates = mx::random::normal({256 * 768, 1});
  mx::eval(a, indices, updates);

  auto single_element_update = [&a, &indices, &updates]() {
    return scatter(a, indices, updates, 0);
  };
  TIME(single_element_update);

  auto single_element_add = [&a, &indices, &updates]() {
    return scatter_add(a, indices, updates, 0);
  };
  TIME(single_element_add);
}

// Compares mx::divmod against computing mx::floor_divide and mx::remainder
// separately on two 1000-element normal samples.
void time_divmod() {
  auto a = mx::random::normal({1000});
  auto b = mx::random::normal({1000});
  mx::eval({a, b});

  auto divmod_fused = [&]() { return mx::divmod(a, b); };
  TIME(divmod_fused);

  auto divmod_separate = [&]() {
    auto quotient = mx::floor_divide(a, b);
    auto rest = mx::remainder(a, b);
    return std::vector<mx::array>{quotient, rest};
  };
  TIME(divmod_separate);
}

// Prints the default device and then runs every benchmark suite in order.
int main() {
  std::cout << "Benchmarks for " << mx::default_device() << std::endl;

  using Benchmark = void (*)();
  const Benchmark suites[] = {
      time_creation_ops,
      time_type_conversions,
      time_unary_ops,
      time_binary_ops,
      time_strided_ops,
      time_random_generation,
      time_comparisons,
      time_matvec,
      time_matmul,
      time_reductions,
      time_gather_scatter,
      time_divmod,
  };
  for (Benchmark run : suites) {
    run();
  }
}


================================================
FILE: benchmarks/cpp/time_utils.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <chrono>
#include <iomanip>
#include <iostream>

#include "mlx/mlx.h"

// Converts a std::chrono duration to fractional milliseconds: the duration is
// cast to nanoseconds and the count divided by 1e6.
// NOTE(review): as a function-like macro named `milliseconds`, this also
// rewrites `std::chrono::milliseconds(...)` in any code that includes this
// header.
#define milliseconds(x) \
  (std::chrono::duration_cast<std::chrono::nanoseconds>(x).count() / 1e6)
// Current time point of std::chrono::high_resolution_clock.
#define time_now() std::chrono::high_resolution_clock::now()

// Prints "Timing <FUNC> ... <t> msec", where <FUNC> is the stringified first
// argument and <t> is the result of time_fn(FUNC, args...). The label is
// flushed before the measurement starts. The expansion already ends with a
// semicolon.
#define TIME(FUNC, ...)                                                        \
  std::cout << "Timing " << #FUNC << " ... " << std::flush                     \
            << std::setprecision(5) << time_fn(FUNC, ##__VA_ARGS__) << " msec" \
            << std::endl;

// Same as TIME, but prefixes the stringified function name with "(MSG) ".
// MSG is streamed to std::cout, so it can be any streamable expression.
#define TIMEM(MSG, FUNC, ...)                                      \
  std::cout << "Timing " << "(" << MSG << ") " << #FUNC << " ... " \
            << std::flush << std::setprecision(5)                  \
            << time_fn(FUNC, ##__VA_ARGS__) << " msec" << std::endl;

// Measures the average time of one `eval(fn(args...))` call in milliseconds.
//
// fn   - callable to benchmark; its result is handed to `eval` (found through
//        argument-dependent lookup on the result type).
// args - arguments passed to fn on every call.
//
// Runs 5 untimed warm-up calls followed by 100 timed calls and returns the
// elapsed time divided by the number of timed calls.
template <typename F, typename... Args>
double time_fn(F fn, Args&&... args) {
  constexpr int num_warmup = 5;
  constexpr int num_iters = 100;

  // The arguments are deliberately passed as lvalues rather than with
  // std::forward: fn is invoked many times, and forwarding an rvalue would
  // allow the first call to move from it, leaving later calls with a
  // moved-from object.
  for (int i = 0; i < num_warmup; ++i) {
    eval(fn(args...));
  }

  auto start = time_now();
  for (int i = 0; i < num_iters; ++i) {
    eval(fn(args...));
  }
  auto end = time_now();
  return milliseconds(end - start) / static_cast<double>(num_iters);
}


================================================
FILE: benchmarks/numpy/single_ops.py
================================================
# Copyright © 2023 Apple Inc.

import numpy as np
from time_utils import time_fn


def time_add():
    """Time ``np.add`` on two float32 arrays of ones with shape (100, 100, 10)."""
    shape = (100, 100, 10)
    lhs = np.ones(shape, dtype=np.float32)
    rhs = np.ones(shape, dtype=np.float32)
    time_fn(np.add, lhs, rhs)


def time_matmul():
    """Time ``np.matmul`` of a (1000, 500) by a (500, 1000) float32 matrix."""
    lhs = np.random.rand(1000, 500).astype(np.float32)
    rhs = np.random.rand(500, 1000).astype(np.float32)
    time_fn(np.matmul, lhs, rhs)


def time_exp():
    """Time ``np.exp`` on a (1000, 100) float32 array of normal samples."""
    samples = np.random.randn(1000, 100).astype(np.float32)
    time_fn(np.exp, samples)


def time_take():
    """Time 20 row gathers of 10 random rows each from a (10000, 500) array."""
    table = np.random.rand(10000, 500)
    index_matrix = np.random.randint(0, 10000, (20, 10))
    # One flat index vector of length 10 per row of the index matrix.
    index_rows = [index_matrix[r] for r in range(20)]

    def random_take():
        gathered = []
        for row in index_rows:
            gathered.append(np.take(table, row, 0))
        return gathered

    time_fn(random_take)


if __name__ == "__main__":
    # Run every numpy micro-benchmark in a fixed order.
    for benchmark in (time_add, time_matmul, time_exp, time_take):
        benchmark()


================================================
FILE: benchmarks/numpy/time_utils.py
================================================
# Copyright © 2023 Apple Inc.

import time


def time_fn(fn, *args):
    """Print the average wall-clock time of ``fn(*args)`` in milliseconds.

    Runs 5 untimed warm-up calls, then 100 timed calls, and prints
    ``Timing <name> ... <t> msec`` where ``<t>`` is the mean time per call.

    Args:
        fn: Callable to benchmark. Its ``__name__`` is used as the label; the
            ``repr`` is used for callables without one (e.g.
            ``functools.partial`` objects).
        *args: Positional arguments passed to ``fn`` on every call.
    """
    label = getattr(fn, "__name__", repr(fn))
    # Flush so the label is visible while a slow benchmark is still running.
    print(f"Timing {label} ...", end=" ", flush=True)

    # warmup
    for _ in range(5):
        fn(*args)

    num_iters = 100
    tic = time.perf_counter()
    for _ in range(num_iters):
        fn(*args)
    toc = time.perf_counter()

    msec = 1e3 * (toc - tic) / num_iters
    print(f"{msec:.5f} msec")


================================================
FILE: benchmarks/python/batch_matmul_bench.py
================================================
# Copyright © 2023 Apple Inc.

import argparse

import mlx.core as mx
from time_utils import time_fn

B = 8
T = 1024
D = 512


def time_batch_matmul():
    """Time a (B, T, D) @ (D, D) matmul and the VJP with respect to each input."""
    mx.random.seed(3)
    lhs = mx.random.uniform(shape=(B, T, D))
    rhs = mx.random.uniform(shape=(D, D))
    cotangent = mx.random.uniform(shape=(B, T, D))
    mx.eval(lhs, rhs, cotangent)

    time_fn(mx.matmul, lhs, rhs)

    def batch_vjp_first():
        # Entry [1] of the vjp result, then the item for the first input.
        result = mx.vjp(mx.matmul, [lhs, rhs], [cotangent])
        return result[1][0]

    time_fn(batch_vjp_first)

    def batch_vjp_second():
        # Entry [1] of the vjp result, then the item for the second input.
        result = mx.vjp(mx.matmul, [lhs, rhs], [cotangent])
        return result[1][1]

    time_fn(batch_vjp_second)


def time_unbatch_matmul():
    """Time a (B * T, D) @ (D, D) matmul and two hand-written transposed products."""
    mx.random.seed(3)
    rows = B * T
    lhs = mx.random.uniform(shape=(rows, D))
    rhs = mx.random.uniform(shape=(D, D))
    cotangent = mx.random.uniform(shape=(rows, D))
    mx.eval(lhs, rhs, cotangent)
    time_fn(mx.matmul, lhs, rhs)

    def unbatch_vjp_first():
        rhs_t = mx.transpose(rhs)
        return mx.matmul(cotangent, rhs_t)

    time_fn(unbatch_vjp_first)

    def unbatch_vjp_second():
        lhs_t = mx.transpose(lhs)
        return mx.matmul(lhs_t, cotangent)

    time_fn(unbatch_vjp_second)


if __name__ == "__main__":
    parser = argparse.ArgumentParser("MLX benchmarks.")
    parser.add_argument("--gpu", action="store_true", help="Use the Metal back-end.")
    args = parser.parse_args()

    # Default to the CPU unless --gpu was given.
    mx.set_default_device(mx.gpu if args.gpu else mx.cpu)

    for benchmark in (time_batch_matmul, time_unbatch_matmul):
        benchmark()


================================================
FILE: benchmarks/python/blas/bench_gemm.py
================================================
# Copyright © 2023 Apple Inc.

import argparse
import math
import os
import subprocess
import time

import mlx.core as mx
import numpy as np
import torch

# CPU brand string as reported by `sysctl -n machdep.cpu.brand_string`.
device_name = (
    subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
    .decode("utf-8")
    .strip("\n")
)

# Untimed calls per measurement, timed calls per measurement, and matmuls
# issued inside each timed call.
N_warmup = 8
N_iter_bench = 80
N_iter_func = 5


def bench(f, a, b):
    """Return the seconds taken by ``N_iter_bench`` calls of ``f(a, b)``.

    ``N_warmup`` untimed calls and a ``torch.mps.synchronize()`` run first.
    """
    for _ in range(N_warmup):
        f(a, b)
    torch.mps.synchronize()

    start_ns = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    stop_ns = time.perf_counter_ns()
    return (stop_ns - start_ns) * 1e-9


def gemm_nn_mlx(a, b):
    """Build ``N_iter_func`` products ``a @ b``, evaluate them and return the list."""
    products = [a @ b for _ in range(N_iter_func)]
    mx.eval(products)
    return products


def gemm_nt_mlx(a, b):
    """Like ``gemm_nn_mlx`` but with the last two axes of ``b`` swapped."""
    products = [a @ b.transpose((0, 2, 1)) for _ in range(N_iter_func)]
    mx.eval(products)
    return products


def gemm_tn_mlx(a, b):
    """Like ``gemm_nn_mlx`` but with the last two axes of ``a`` swapped."""
    products = [a.transpose((0, 2, 1)) @ b for _ in range(N_iter_func)]
    mx.eval(products)
    return products


def gemm_tt_mlx(a, b):
    """Like ``gemm_nn_mlx`` but with the last two axes of both operands swapped."""
    swap_last_two = (0, 2, 1)
    products = [
        a.transpose(swap_last_two) @ b.transpose(swap_last_two)
        for _ in range(N_iter_func)
    ]
    mx.eval(products)
    return products


@torch.no_grad()
def gemm_nn_torch(a, b):
    """Issue ``N_iter_func`` products ``a @ b``, synchronize MPS and return them."""
    products = [a @ b for _ in range(N_iter_func)]
    torch.mps.synchronize()
    return products


@torch.no_grad()
def gemm_nt_torch(a, b):
    """Like ``gemm_nn_torch`` but with the last two axes of ``b`` swapped."""
    products = [a @ b.transpose(-1, -2) for _ in range(N_iter_func)]
    torch.mps.synchronize()
    return products


@torch.no_grad()
def gemm_tn_torch(a, b):
    """Like ``gemm_nn_torch`` but with the last two axes of ``a`` swapped."""
    products = [a.transpose(-1, -2) @ b for _ in range(N_iter_func)]
    torch.mps.synchronize()
    return products


@torch.no_grad()
def gemm_tt_torch(a, b):
    """Like ``gemm_nn_torch`` but with the last two axes of both operands swapped."""
    products = [
        a.transpose(-1, -2) @ b.transpose(-1, -2) for _ in range(N_iter_func)
    ]
    torch.mps.synchronize()
    return products


def bench_shape(B, M, N, K, np_dtype, transpose="nn"):
    """Time one batched GEMM configuration in MLX and in PyTorch (MPS).

    ``transpose`` is a two-letter code; ``"n"``/``"t"`` in position 0 and 1
    say whether ``a`` and ``b`` are stored plain or with their last two axes
    swapped. After timing, the MLX product is compared against numpy and a
    message is printed on mismatch.

    Returns:
        ``(time_mlx, time_torch)`` as produced by ``bench``.
    """
    a_plain = transpose[0] == "n"
    b_plain = transpose[1] == "n"
    shape_a = (B, M, K) if a_plain else (B, K, M)
    shape_b = (B, K, N) if b_plain else (B, N, K)

    a_np = np.random.normal(0.0, 1.0 / math.sqrt(M + K), shape_a).astype(np_dtype)
    b_np = np.random.normal(0.0, 1.0 / math.sqrt(N + K), shape_b).astype(np_dtype)

    a_mx, b_mx = mx.array(a_np), mx.array(b_np)

    a_pt = torch.from_numpy(a_np).to("mps")
    b_pt = torch.from_numpy(b_np).to("mps")

    torch.mps.synchronize()

    mlx_kernels = {
        "nn": gemm_nn_mlx,
        "nt": gemm_nt_mlx,
        "tn": gemm_tn_mlx,
        "tt": gemm_tt_mlx,
    }
    torch_kernels = {
        "nn": gemm_nn_torch,
        "nt": gemm_nt_torch,
        "tn": gemm_tn_torch,
        "tt": gemm_tt_torch,
    }
    f_mx = mlx_kernels[transpose]
    f_pt = torch_kernels[transpose]

    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

    # Axis permutations that bring both operands back to plain layout.
    keep, swap = (0, 1, 2), (0, 2, 1)
    t_a = keep if a_plain else swap
    t_b = keep if b_plain else swap

    c_mlx = a_mx.transpose(t_a) @ b_mx.transpose(t_b)
    c_npy = a_np.transpose(t_a).astype(np_dtype) @ b_np.transpose(t_b).astype(np_dtype)

    atol = 1e-5 if np_dtype == np.float32 else 1e-4

    results_match = np.allclose(c_mlx, c_npy.astype(np_dtype), atol=atol)
    if not results_match:
        print(
            f"Failed at {(B, M, N, K)} [transpose = {transpose}] with max(|a - b|) = {np.max(np.abs(c_npy - c_mlx))}"
        )

    return time_mlx, time_torch


def get_gflop_count(B, M, N, K):
    """Return ``2 * B * M * N * K`` times the number of timed matmuls, divided by 1024**3."""
    total_flops = 2.0 * N_iter_bench * N_iter_func * B * M * N * K
    return float(total_flops) / float(1024.0**3)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run gemm benchmarks")

    dtypes = ("float32", "float16", "complex64")
    transposes = ("nn", "nt", "tn")
    # (B, M, N, K) problem sizes.
    shapes = (
        (16, 234, 768, 3072),
        (1, 64, 64, 25344),
        (16, 1024, 1024, 1024),
        (1, 1024, 1024, 2048),
        (4, 1024, 1024, 4096),
        (4, 1024, 4096, 1024),
        (1, 4096, 4096, 4096),
    )

    for dtype in dtypes:
        for transpose in transposes:
            for B, M, N, K in shapes:
                np_dtype = getattr(np, dtype)
                time_mlx, time_torch = bench_shape(B, M, N, K, np_dtype, transpose)

                gflop_count = get_gflop_count(B, M, N, K)
                gflops_mx = gflop_count / time_mlx
                gflops_pt = gflop_count / time_torch
                # Relative throughput of MLX over PyTorch, in percent.
                diff_pct = 100.0 * (gflops_mx / gflops_pt - 1.0)

                print(
                    f"{B:3d}, {M:4d}, {N:4d}, {K:4d}, {dtype}, {transpose}, {gflops_pt:05.3f}, {gflops_mx:05.3f}, {diff_pct:+5.2f}%"
                )
                # Flag cases where PyTorch is at least twice as fast.
                if gflops_pt >= 2.0 * gflops_mx:
                    print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/blas/bench_gemv.py
================================================
# Copyright © 2023 Apple Inc.

import os
import subprocess
import time

import matplotlib.pyplot as plt
import mlx.core as mx
import numpy as np
import torch

# Directory the PDF plots are written to.
results_dir = "./results"

# Create the output directory if needed; exist_ok avoids the check-then-create
# race and is a no-op when the directory is already there.
os.makedirs(results_dir, exist_ok=True)

# CPU brand string as reported by `sysctl -n machdep.cpu.brand_string`; used
# in plot titles and file names.
device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")

# Untimed calls per measurement, timed calls per measurement, and
# matrix-vector products issued inside each timed call.
N_warmup = 5
N_iter_bench = 50
N_iter_func = 20

# Fixed vector lengths, one subplot row per entry.
out_vec_sizes = [128, 512, 2048, 4096]
in_vec_sizes = [128, 512, 2048, 4096]

# Swept vector lengths: odd multiples (1, 3, 5, 7) of 4096, 4095 and 4097,
# plus a handful of specific sizes, in ascending order.
benchmark_vector_lens = []
benchmark_vector_lens += [(i + 1) * 4096 for i in range(8)][::2]
benchmark_vector_lens += [(i + 1) * 4095 for i in range(8)][::2]
benchmark_vector_lens += [(i + 1) * 4097 for i in range(8)][::2]
benchmark_vector_lens += [64, 128, 512, 1024, 2048, 11008, 32000]

benchmark_vector_lens.sort()


def bench(f, m, v):
    """Return the seconds taken by ``N_iter_bench`` calls of ``f(m, v)``.

    ``N_warmup`` untimed calls and a ``torch.mps.synchronize()`` run first.
    """
    for _ in range(N_warmup):
        f(m, v)
    torch.mps.synchronize()

    start_ns = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(m, v)
    stop_ns = time.perf_counter_ns()
    return (stop_ns - start_ns) * 1e-9


def gemv_mlx(m, v):
    """Build ``N_iter_func`` products ``m @ v``, evaluate them and return the list."""
    products = [m @ v for _ in range(N_iter_func)]
    mx.eval(products)
    return products


def gemv_t_mlx(m, v):
    """Build ``N_iter_func`` products ``v @ m``, evaluate them and return the list."""
    products = [v @ m for _ in range(N_iter_func)]
    mx.eval(products)
    return products


@torch.no_grad()
def gemv_torch(m, v):
    """Issue ``N_iter_func`` products ``m @ v``, synchronize MPS and return them."""
    products = [m @ v for _ in range(N_iter_func)]
    torch.mps.synchronize()
    return products


@torch.no_grad()
def gemv_t_torch(m, v):
    """Issue ``N_iter_func`` products ``v @ m``, synchronize MPS and return them."""
    products = [v @ m for _ in range(N_iter_func)]
    torch.mps.synchronize()
    return products


def bench_lens(in_vec_len, out_vec_len, np_dtype, transpose=False):
    """Time one matrix-vector product size in MLX and in PyTorch (MPS).

    With ``transpose`` the product is ``(1, in) @ (in, out)``; otherwise it is
    ``(out, in) @ (in, 1)``. The MLX result is compared against numpy and a
    message is printed on mismatch.

    Returns:
        ``(time_mlx, time_torch)`` as produced by ``bench``.
    """
    if transpose:
        shape_mat = (in_vec_len, out_vec_len)
        shape_vec = (1, in_vec_len)
    else:
        shape_mat = (out_vec_len, in_vec_len)
        shape_vec = (in_vec_len, 1)

    scale = 2.0 / in_vec_len
    mat_npy = np.random.normal(0.0, scale, shape_mat).astype(np_dtype)
    vec_npy = np.random.normal(0.0, scale, shape_vec).astype(np_dtype)
    mat_mlx = mx.array(mat_npy)
    vec_mlx = mx.array(vec_npy)
    mat_trc = torch.from_numpy(mat_npy).to("mps")
    vec_trc = torch.from_numpy(vec_npy).to("mps")

    torch.mps.synchronize()

    # PyTorch is timed first, then MLX, then the reference product is formed.
    if transpose:
        time_torch = bench(gemv_t_torch, mat_trc, vec_trc)
        time_mlx = bench(gemv_t_mlx, mat_mlx, vec_mlx)
        c_mlx = np.asarray(vec_mlx @ mat_mlx)
        c_npy = vec_npy @ mat_npy
    else:
        time_torch = bench(gemv_torch, mat_trc, vec_trc)
        time_mlx = bench(gemv_mlx, mat_mlx, vec_mlx)
        c_mlx = np.asarray(mat_mlx @ vec_mlx)
        c_npy = mat_npy @ vec_npy

    results_match = np.allclose(c_mlx, c_npy, atol=2e-5)
    if not results_match:
        print(
            f"Failed at {shape_mat} [transpose = {transpose}] with max(|a - b|) = {np.max(np.abs(c_npy - c_mlx))}"
        )

    return time_mlx, time_torch


def get_gflop_count(in_vec_len, out_vec_len):
    """Return ``2 * in * out`` times the number of timed products, divided by 1024**3."""
    total_flops = 2.0 * N_iter_bench * N_iter_func * in_vec_len * out_vec_len
    return float(total_flops) / float(1024**3)


def get_gbyte_size(in_vec_len, out_vec_len, np_dtype):
    """Return the bytes touched by all timed products, divided by 1024**3.

    One product touches the matrix plus the input and output vectors. The
    element size comes from the dtype itself, so ``complex64`` counts as
    8 bytes per element rather than being treated as a 2-byte type.

    Args:
        in_vec_len: Length of the input vector.
        out_vec_len: Length of the output vector.
        np_dtype: numpy dtype (or anything ``np.dtype`` accepts).
    """
    n_elem = in_vec_len * out_vec_len + in_vec_len + out_vec_len
    item_size = np.dtype(np_dtype).itemsize
    return float(N_iter_bench * N_iter_func * n_elem * item_size) / float(1024**3)


def bench_with_in_len(ax, in_vec_len, out_vector_lens, dtype, transpose):
    """Plot MLX and PyTorch bandwidth for a fixed input length on ``ax``.

    Runs ``bench_lens`` for every entry of ``out_vector_lens`` and plots the
    resulting GB/s curves. Only bandwidth is plotted, so the GFLOP figures
    that were previously accumulated and never used are no longer computed.

    Args:
        ax: Object exposing ``plot``, ``set_title``, ``set`` and ``legend``
            (presumably a matplotlib Axes; confirm against the caller).
        in_vec_len: Fixed input vector length.
        out_vector_lens: Output vector lengths to sweep.
        dtype: Name of a numpy dtype, e.g. ``"float32"``.
        transpose: Passed through to ``bench_lens``.
    """
    np_dtype = getattr(np, dtype)
    mlx_gb_s = []
    pyt_gb_s = []

    for out_vec_len in out_vector_lens:
        gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype)

        time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose)

        mlx_gb_s.append(gbyte_size / time_mlx)
        pyt_gb_s.append(gbyte_size / time_torch)

    if transpose:
        title = f"gemv_t ([1, {in_vec_len}] [{in_vec_len}, out_vec_len]) | {dtype}"
    else:
        title = f"gemv ([out_vec_len, {in_vec_len}] X [{in_vec_len}, 1] ) | {dtype}"

    ax.plot(out_vector_lens, mlx_gb_s, "tab:blue", label="MLX")
    ax.plot(out_vector_lens, pyt_gb_s, "tab:red", label="Torch")
    ax.set_title(title)
    ax.set(xlabel="out_vector_len", ylabel="Performance (GB/s)")
    ax.legend()


def bench_with_out_len(ax, out_vec_len, in_vector_lens, dtype, transpose):
    """Plot MLX and PyTorch bandwidth for a fixed output length on ``ax``.

    Runs ``bench_lens`` for every entry of ``in_vector_lens`` and plots the
    resulting GB/s curves. Only bandwidth is plotted, so the GFLOP figures
    that were previously accumulated and never used are no longer computed.

    Args:
        ax: Object exposing ``plot``, ``set_title``, ``set`` and ``legend``
            (presumably a matplotlib Axes; confirm against the caller).
        out_vec_len: Fixed output vector length.
        in_vector_lens: Input vector lengths to sweep.
        dtype: Name of a numpy dtype, e.g. ``"float32"``.
        transpose: Passed through to ``bench_lens``.
    """
    np_dtype = getattr(np, dtype)
    mlx_gb_s = []
    pyt_gb_s = []

    for in_vec_len in in_vector_lens:
        gbyte_size = get_gbyte_size(in_vec_len, out_vec_len, np_dtype)

        time_mlx, time_torch = bench_lens(in_vec_len, out_vec_len, np_dtype, transpose)

        mlx_gb_s.append(gbyte_size / time_mlx)
        pyt_gb_s.append(gbyte_size / time_torch)

    if transpose:
        title = f"([1, in_vec_len] [in_vec_len, {out_vec_len}])"
    else:
        title = f"([{out_vec_len}, in_vec_len] X [in_vec_len, 1] )"

    ax.plot(in_vector_lens, mlx_gb_s, "tab:blue", label="MLX")
    ax.plot(in_vector_lens, pyt_gb_s, "tab:red", label="Torch")
    ax.set_title(title)
    ax.set(xlabel="in_vector_len", ylabel="Performance (GB/s)")
    ax.legend()


# One PDF per (transpose, dtype): the left column sweeps the output length for
# each fixed input length, the right column sweeps the input length for each
# fixed output length.
for transpose in (False, True):
    for dtype in ("float32", "float16", "complex64"):
        fig, axs = plt.subplots(
            len(in_vec_sizes), 2, figsize=(8.5, 11), layout="constrained"
        )

        for i, in_vec_len in enumerate(in_vec_sizes):
            left_ax = axs[i][0]
            bench_with_in_len(
                left_ax, in_vec_len, benchmark_vector_lens, dtype, transpose
            )

        for i, out_vec_len in enumerate(out_vec_sizes):
            right_ax = axs[i][1]
            bench_with_out_len(
                right_ax, out_vec_len, benchmark_vector_lens, dtype, transpose
            )

        op_name = "gemv_t" if transpose else "gemv"
        fig.suptitle(f"{device_name}: {dtype} {op_name}")
        pdf_name = f"{device_name.replace(' ', '_')}_{dtype}_{op_name}.pdf"
        fig.savefig(os.path.join(results_dir, pdf_name))
        plt.close(fig)


================================================
FILE: benchmarks/python/comparative/README.md
================================================
Microbenchmarks comparing MLX to PyTorch
========================================

Implement the same microbenchmarks in MLX and PyTorch to compare and make a
list of the biggest possible performance improvements and/or regressions.

For instance, run `python bench_mlx.py sum_axis --size 8x1024x128 --axis 2 --cpu`
to measure the time it takes to sum across the third axis of an 8x1024x128
tensor on the CPU.

`compare.py` runs several benchmarks and compares the speed-up or lack thereof
in comparison to PyTorch.

Each bench script can be run with `--print-pid` to print the PID and wait for a
key in order to ease attaching a debugger.


================================================
FILE: benchmarks/python/comparative/bench_mlx.py
================================================
# Copyright © 2023 Apple Inc.

import argparse
import math
import os
import time
from functools import partial

import mlx.core as mx
import mlx.nn as nn


def int_or_list(x):
    """Parse ``x`` as a single int, or as a comma-separated list of ints."""
    try:
        parsed = int(x)
    except ValueError:
        parsed = [int(part) for part in x.split(",")]
    return parsed


def none_or_list(x):
    """Return ``None`` for an empty string, else a list of comma-separated ints."""
    if x != "":
        return [int(part) for part in x.split(",")]
    return None


def dtype_from_str(x):
    """Map a dtype name to the matching ``mx.Dtype``.

    An empty string selects ``mx.float32``.

    Raises:
        ValueError: If ``x`` does not name an mlx dtype. Names that do not
            exist on ``mx`` at all are reported the same way, so argparse
            turns them into a usage error instead of an ``AttributeError``
            traceback.
    """
    if x == "":
        return mx.float32
    else:
        dt = getattr(mx, x, None)
        if not isinstance(dt, mx.Dtype):
            raise ValueError(f"{x} is not an mlx dtype")
        return dt


def bench(f, *args):
    """Return the seconds taken by 100 calls of ``f(*args)`` after 10 warm-up calls."""
    for _ in range(10):
        f(*args)

    started = time.perf_counter()
    for _ in range(100):
        f(*args)
    finished = time.perf_counter()
    return finished - started


def matmul_square(x):
    """Right-multiply by ``x`` ten times in a chain, evaluate and return the result."""
    acc = x
    for _ in range(10):
        acc = acc @ x
    mx.eval(acc)
    return acc


def matmul(x, y):
    """Build ten independent products ``x @ y`` and evaluate them together."""
    products = [x @ y for _ in range(10)]
    mx.eval(products)


def _quant_matmul(x, w, s, b, transpose, group_size, bits):
    """Build ten ``mx.quantized_matmul`` calls with the given settings and evaluate them."""
    options = dict(transpose=transpose, group_size=group_size, bits=bits)
    outputs = [mx.quantized_matmul(x, w, s, b, **options) for _ in range(10)]
    mx.eval(outputs)


# Benchmark name -> pre-configured _quant_matmul. Names follow
# "quant_matmul[_t]_<group_size>_<bits>", with "_t" marking transpose=True;
# the non-transposed entries come first.
quant_matmul = {
    f"quant_matmul{'_t' if transposed else ''}_{group}_{width}": partial(
        _quant_matmul, transpose=transposed, group_size=group, bits=width
    )
    for transposed in (False, True)
    for group in (32, 64, 128)
    for width in (2, 4, 8)
}


def conv1d(x, y):
    """Build ten ``mx.conv1d(x, y)`` calls and evaluate them together."""
    outputs = [mx.conv1d(x, y) for _ in range(10)]
    mx.eval(outputs)


def conv2d(x, y):
    """Build ten ``mx.conv2d(x, y)`` calls and evaluate them together."""
    outputs = [mx.conv2d(x, y) for _ in range(10)]
    mx.eval(outputs)


def binary(op, x, y):
    """Apply the binary op ``mx.<op>`` 100 times, feeding each result back in as ``y``."""
    fn = getattr(mx, op)
    acc = y
    for _ in range(100):
        acc = fn(x, acc)
    mx.eval(acc)


def reduction(op, axis, x):
    """Build 100 independent ``mx.<op>(x, axis=axis)`` calls and evaluate them."""
    fn = getattr(mx, op)
    outputs = [fn(x, axis=axis) for _ in range(100)]
    mx.eval(outputs)


def sum_and_add(axis, x, y):
    """Sum ``x`` over ``axis``, then 50 times add ``y`` and sum over ``axis`` again."""
    acc = x.sum(axis=axis, keepdims=True)
    for _ in range(50):
        shifted = acc + y
        acc = shifted.sum(axis=axis, keepdims=True)
    mx.eval(acc)


def softmax(axis, x):
    """Build 100 softmaxes from exp/max/sum primitives and evaluate them."""
    outputs = []
    for _ in range(100):
        peak = mx.max(x, axis=axis, keepdims=True)
        exps = mx.exp(x - peak)
        outputs.append(exps / mx.sum(exps, axis=axis, keepdims=True))
    mx.eval(outputs)


def softmax_fused(axis, x):
    """Build 100 ``mx.softmax(x, axis=axis)`` calls and evaluate them."""
    outputs = [mx.softmax(x, axis=axis) for _ in range(100)]
    mx.eval(outputs)


def relu(x):
    """Chain 100 ``nn.relu`` applications starting from ``x`` and evaluate the result."""
    out = x
    for _ in range(100):
        out = nn.relu(out)
    mx.eval(out)


# Chained activation benchmarks. leaky_relu and softplus each used to be
# defined twice in this stretch with identical bodies, the later definition
# silently replacing the earlier one; each now appears once.


def leaky_relu(x):
    """Chain 100 ``nn.leaky_relu`` applications starting from ``x`` and evaluate."""
    y = x
    for i in range(100):
        y = nn.leaky_relu(y)
    mx.eval(y)


def prelu(x: mx.array):
    """Chain 100 ``nn.prelu`` applications with a ``mx.ones(1)`` weight and evaluate."""
    y = x
    for i in range(100):
        y = nn.prelu(y, mx.ones(1))
    mx.eval(y)


def softplus(x):
    """Chain 100 ``nn.softplus`` applications starting from ``x`` and evaluate."""
    y = x
    for i in range(100):
        y = nn.softplus(y)
    mx.eval(y)


def mish(x: mx.array):
    """Chain 100 ``nn.mish`` applications starting from ``x`` and evaluate."""
    y = x
    for i in range(100):
        y = nn.mish(y)
    mx.eval(y)


def elu(x):
    """Chain 100 ``nn.elu`` applications starting from ``x`` and evaluate."""
    y = x
    for i in range(100):
        y = nn.elu(y)
    mx.eval(y)


def relu6(x):
    """Chain 100 ``nn.relu6`` applications starting from ``x`` and evaluate."""
    y = x
    for i in range(100):
        y = nn.relu6(y)
    mx.eval(y)


def celu(x):
    """Chain 100 ``nn.celu`` applications starting from ``x`` and evaluate the result."""
    out = x
    for _ in range(100):
        out = nn.celu(out)
    mx.eval(out)


def log_sigmoid(x):
    """Chain 100 ``nn.log_sigmoid`` applications starting from ``x`` and evaluate."""
    out = x
    for _ in range(100):
        out = nn.log_sigmoid(out)
    mx.eval(out)


def scalar_mult(x):
    """Multiply by the scalars 1/1, 1/2, ..., 1/100 in a chain and evaluate."""
    out = x
    for step in range(100):
        factor = 1.0 / (1 + step)
        out = out * factor
    mx.eval(out)


def cross_entropy(targets, x):
    """Build 100 mean losses of ``logsumexp(x) - x[target]`` over the last axis and evaluate."""
    losses = []
    for _ in range(100):
        log_norm = mx.logsumexp(x, axis=-1, keepdims=True)
        picked = mx.take_along_axis(x, mx.reshape(targets, (-1, 1)), axis=-1)
        losses.append(mx.mean(log_norm - picked))
    mx.eval(losses)


def logsumexp(axis, x):
    """Build 100 ``mx.logsumexp(x, axis=axis)`` calls and evaluate them."""
    outputs = [mx.logsumexp(x, axis=axis) for _ in range(100)]
    mx.eval(outputs)


def linear(w, b, x):
    """Build ten ``x @ w.T + b`` computations from separate ops and evaluate them."""
    outputs = []
    for _ in range(10):
        w_t = mx.transpose(w, (1, 0))
        outputs.append(x @ w_t + b)
    mx.eval(outputs)


def linear_fused(w, b, x):
    """Build ten ``mx.addmm(b, x, w.T)`` calls and evaluate them."""
    outputs = []
    for _ in range(10):
        w_t = mx.transpose(w, (1, 0))
        outputs.append(mx.addmm(b, x, w_t))
    mx.eval(outputs)


def rope(x):
    """Build ten rotary-embedding style computations from primitives and evaluate them.

    ``N`` and ``D`` are the last two dimensions of ``x``. Every iteration
    recomputes the position/frequency angles, rotates the even/odd feature
    pairs of ``x`` and reshapes the result to ``(-1, N, D)``.
    """
    *_, N, D = x.shape
    ys = []
    for i in range(10):
        # Collapse all leading dimensions; after the first pass this reshape
        # leaves the shape unchanged.
        x = mx.reshape(x, (-1, N, D))
        positions = mx.arange(N)
        freqs = mx.exp(mx.arange(0.0, D // 2) / math.log(10000 / (D // 2 - 1)))
        theta = mx.reshape(positions, (-1, 1)) * mx.reshape(freqs, (1, -1))
        costheta = mx.cos(theta)
        sintheta = mx.sin(theta)
        # Even and odd features form the pairs that get rotated.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        rx1 = x1 * costheta - x2 * sintheta
        rx2 = x1 * sintheta + x2 * costheta
        y = mx.concatenate([rx1[..., None], rx2[..., None]], axis=-1)
        y = mx.reshape(y, (-1, N, D))
        ys.append(y)
    mx.eval(ys)


def concatenate(axis, x, y):
    """Build ten ``mx.concatenate([x, y], axis=axis)`` calls and evaluate them."""
    outputs = [mx.concatenate([x, y], axis=axis) for _ in range(10)]
    mx.eval(outputs)


def cumsum(axis, x):
    """Build ten ``mx.cumsum(x, axis)`` calls and evaluate them."""
    outputs = [mx.cumsum(x, axis) for _ in range(10)]
    mx.eval(outputs)


def sort(axis, x):
    """Build ten ``mx.sort(x, axis)`` calls and evaluate them."""
    outputs = [mx.sort(x, axis) for _ in range(10)]
    mx.eval(outputs)


def topk(axis, x):
    """Build ten ``mx.topk`` calls keeping a third of ``axis`` and evaluate them."""
    keep = x.shape[axis] // 3
    outputs = [mx.topk(x, keep, axis) for _ in range(10)]
    mx.eval(outputs)


def step_function(x):
    """Chain 100 ``nn.step`` applications starting from ``x`` and evaluate the result.

    Each iteration consumes the previous output, as the other activation
    benchmarks in this file do. Applying the op to ``x`` every time would
    leave only the last call reachable from the evaluated result.
    """
    y = x
    for i in range(100):
        y = nn.step(y)
    mx.eval(y)


def selu(x):
    """Chain 100 ``nn.selu`` applications starting from ``x`` and evaluate the result.

    Each iteration consumes the previous output, as the other activation
    benchmarks in this file do. Applying the op to ``x`` every time would
    leave only the last call reachable from the evaluated result.
    """
    y = x
    for i in range(100):
        y = nn.selu(y)
    mx.eval(y)


if __name__ == "__main__":
    # Command line: a benchmark name plus repeatable --size/--axis/--transpose/
    # --dtype options; the i-th occurrence of each applies to the i-th input.
    parser = argparse.ArgumentParser()
    parser.add_argument("benchmark", help="Choose the benchmark to run")
    parser.add_argument(
        "--size",
        default=[(1024, 1024)],
        type=lambda x: list(map(int, x.split("x"))),
        help="Set the matrix size",
        action="append",
    )
    parser.add_argument(
        "--axis",
        default=[1],
        type=int_or_list,
        help="Set a reduction axis",
        action="append",
    )
    parser.add_argument(
        "--transpose",
        type=none_or_list,
        default=[],
        help="Permute the matrix",
        action="append",
    )
    parser.add_argument(
        "--print-pid", action="store_true", help="Print the PID and pause"
    )
    parser.add_argument("--cpu", action="store_true", help="Use the CPU")
    parser.add_argument(
        "--fused", action="store_true", help="Use fused functions where possible"
    )
    parser.add_argument("--dtype", type=dtype_from_str, default=[], action="append")

    args = parser.parse_args()

    # action="append" adds to the default list instead of replacing it, so drop
    # the default entry once the user has supplied a value.
    if len(args.size) > 1:
        args.size.pop(0)
    if len(args.axis) > 1:
        args.axis.pop(0)

    if args.cpu:
        mx.set_default_device(mx.cpu)
    else:
        mx.set_default_device(mx.gpu)

    # One dtype per input; missing entries reuse the first dtype.
    types = args.dtype
    if not types:
        types = [mx.float32]
    if len(types) < len(args.size):
        types = types + [types[0]] * (len(args.size) - len(types))

    xs = []
    for size, dtype in zip(args.size, types):
        xs.append(mx.random.normal(size).astype(dtype))
    for i, t in enumerate(args.transpose):
        if t is None:
            continue
        xs[i] = mx.transpose(xs[i], t)
    mx.eval(xs)
    x = xs[0]
    axis = args.axis[0]

    if args.print_pid:
        print(os.getpid())
        input("Press enter to run")

    if args.benchmark == "matmul_square":
        print(bench(matmul_square, x))

    elif args.benchmark == "matmul":
        print(bench(matmul, *xs))

    elif args.benchmark.startswith("quant_matmul"):
        print(bench(quant_matmul[args.benchmark], *xs))

    elif args.benchmark == "linear":
        if args.fused:
            print(bench(linear_fused, *xs))
        else:
            print(bench(linear, *xs))

    elif args.benchmark == "sum_axis":
        print(bench(reduction, "sum", axis, x))

    elif args.benchmark == "sum_all":
        print(bench(reduction, "sum", None, x))

    elif args.benchmark == "argmax":
        print(bench(reduction, "argmax", axis, x))

    elif args.benchmark == "add":
        print(bench(binary, "add", *xs))

    elif args.benchmark == "mul":
        print(bench(binary, "multiply", *xs))

    elif args.benchmark == "softmax":
        if args.fused:
            print(bench(softmax_fused, axis, x))
        else:
            print(bench(softmax, axis, x))

    elif args.benchmark == "relu":
        print(bench(relu, x))

    elif args.benchmark == "elu":
        print(bench(elu, x))

    elif args.benchmark == "relu6":
        print(bench(relu6, x))

    elif args.benchmark == "celu":
        print(bench(celu, x))

    elif args.benchmark == "log_sigmoid":
        print(bench(log_sigmoid, x))

    elif args.benchmark == "leaky_relu":
        print(bench(leaky_relu, x))
    elif args.benchmark == "prelu":
        print(bench(prelu, x))
    elif args.benchmark == "softplus":
        print(bench(softplus, x))
    elif args.benchmark == "mish":
        print(bench(mish, x))
    elif args.benchmark == "scalar_mul":
        print(bench(scalar_mult, x))

    elif args.benchmark == "cross_entropy":
        # Check the rank of the array that is actually benchmarked. `size` is
        # only the variable left over from the input-building loop above, i.e.
        # the last --size given, which need not describe x.
        if len(x.shape) != 2:
            raise ValueError("Error: [cross_entropy] benchmark requires a 2 dim size")

        targets = mx.zeros((len(x),), dtype=mx.uint32)
        print(bench(cross_entropy, targets, x))

    elif args.benchmark == "logsumexp":
        print(bench(logsumexp, axis, x))

    elif args.benchmark == "rope":
        print(bench(rope, x))

    elif args.benchmark == "concatenate":
        print(bench(concatenate, axis, *xs))

    elif args.benchmark == "cumsum":
        print(bench(cumsum, axis, *xs))

    elif args.benchmark == "conv1d":
        print(bench(conv1d, *xs))

    elif args.benchmark == "conv2d":
        print(bench(conv2d, *xs))

    elif args.benchmark == "sort":
        print(bench(sort, axis, x))

    elif args.benchmark == "topk":
        print(bench(topk, axis, x))

    elif args.benchmark == "step":
        print(bench(step_function, x))

    elif args.benchmark == "selu":
        print(bench(selu, x))

    elif args.benchmark == "sum_and_add":
        print(bench(sum_and_add, axis, *xs))

    else:
        raise ValueError("Unknown benchmark")


================================================
FILE: benchmarks/python/comparative/bench_torch.py
================================================
# Copyright © 2023 Apple Inc.

import argparse
import os
import time

import torch
import torch.cuda
import torch.mps


def int_or_list(x):
    """Parse ``x`` as a single int, falling back to a comma-separated list of ints."""
    try:
        result = int(x)
    except ValueError:
        result = list(map(int, x.split(",")))
    return result


def none_or_list(x):
    """Return ``None`` for an empty string, else a list of comma-separated ints."""
    return None if x == "" else [int(part) for part in x.split(",")]


def dtype_from_str(x):
    """Map a dtype name to the matching ``torch.dtype``.

    An empty string selects ``torch.float32``.

    Raises:
        ValueError: If ``x`` does not name a torch dtype. Names that do not
            exist on ``torch`` at all are reported the same way, so argparse
            turns them into a usage error instead of an ``AttributeError``
            traceback.
    """
    if x == "":
        return torch.float32
    else:
        dt = getattr(torch, x, None)
        if not isinstance(dt, torch.dtype):
            raise ValueError(f"{x} is not a torch dtype")
        return dt


def bench(f, *args):
    """Return the seconds taken by 100 calls of ``f(*args)`` after 10 warm-up calls."""
    for _ in range(10):
        f(*args)

    t0 = time.perf_counter()
    for _ in range(100):
        f(*args)
    t1 = time.perf_counter()
    return t1 - t0


def sync_if_needed(x):
    """Wait for pending work on the device that holds ``x``.

    Calls ``torch.mps.synchronize()`` for MPS tensors and
    ``torch.cuda.synchronize()`` for CUDA tensors; does nothing otherwise.

    The device *type* is compared rather than the device object: a tensor's
    device carries an index (e.g. ``cuda:0``), and such a device does not
    compare equal to the index-less ``torch.device("cuda")``, so an equality
    check would skip the synchronization.
    """
    device_type = x.device.type
    if device_type == "mps":
        torch.mps.synchronize()
    elif device_type == "cuda":
        torch.cuda.synchronize()


@torch.no_grad()
def matmul_square(x):
    """Right-multiply by ``x`` ten times in a chain, then synchronize."""
    acc = x
    for _ in range(10):
        acc = acc @ x
    sync_if_needed(x)


@torch.no_grad()
def matmul(x, y):
    """Issue ten independent products ``x @ y``, then synchronize."""
    products = [x @ y for _ in range(10)]
    sync_if_needed(x)


@torch.no_grad()
def conv1d(x, y):
    """Swap the last two axes of both inputs, issue ten ``conv1d`` calls, then synchronize."""
    x, y = torch.transpose(x, -1, -2), torch.transpose(y, -1, -2)
    outputs = [torch.nn.functional.conv1d(x, y) for _ in range(10)]
    sync_if_needed(x)


@torch.no_grad()
def conv2d(x, y):
    """Run ten 2-D convolutions of ``x`` with kernel ``y``, then sync.

    Both operands are permuted with axis order ``(0, 3, 1, 2)`` first.
    """
    axes = (0, 3, 1, 2)
    x = torch.permute(x, axes)
    y = torch.permute(y, axes)
    outputs = [torch.nn.functional.conv2d(x, y) for _ in range(10)]
    sync_if_needed(x)


@torch.no_grad()
def binary(op, x, y):
    """Apply the binary op ``torch.<op>`` 100 times as ``acc = op(x, acc)``, then sync."""
    acc = y
    for _ in range(100):
        acc = getattr(torch, op)(x, acc)
    sync_if_needed(x)


@torch.no_grad()
def reduction(op, axis, x):
    """Call the tensor method ``x.<op>(axis)`` 100 times, keeping results, then sync."""
    reduced = [getattr(x, op)(axis) for _ in range(100)]
    sync_if_needed(x)


@torch.no_grad()
def sum_and_add(axis, x, y):
    """Reduce ``x`` over ``axis``, then 50 times add ``y`` and reduce again; sync at the end."""
    total = x.sum(axis=axis, keepdims=True)
    for _ in range(50):
        total = (total + y).sum(axis=axis, keepdims=True)
    sync_if_needed(x)


@torch.no_grad()
def softmax(axis, x):
    """Compute an unfused softmax over ``axis`` 100 times, then sync.

    Each iteration subtracts the per-axis max, exponentiates and normalises
    by the per-axis sum.
    """
    outputs = []
    for _ in range(100):
        shifted = x - torch.max(x, dim=axis, keepdims=True).values
        numer = torch.exp(shifted)
        outputs.append(numer / torch.sum(numer, dim=axis, keepdims=True))
    sync_if_needed(x)


@torch.no_grad()
def softmax_fused(axis, x):
    """Call ``torch.nn.functional.softmax`` over ``axis`` 100 times, then sync."""
    outputs = [torch.nn.functional.softmax(x, dim=axis) for _ in range(100)]
    sync_if_needed(x)


@torch.no_grad()
def relu(x):
    """Apply ReLU 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.relu(out)
    sync_if_needed(x)


@torch.no_grad()
def leaky_relu(x):
    """Apply leaky ReLU 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.leaky_relu(out)
    sync_if_needed(x)


@torch.no_grad()
def elu(x):
    """Apply ELU 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.elu(out)
    sync_if_needed(x)


@torch.no_grad()
def celu(x):
    """Apply CELU 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.celu(out)
    sync_if_needed(x)


@torch.no_grad()
def relu6(x):
    """Apply ReLU6 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.relu6(out)
    sync_if_needed(x)


@torch.no_grad()
def softplus(x):
    """Apply softplus 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.softplus(out)
    sync_if_needed(x)


@torch.no_grad()
def log_sigmoid(x):
    """Apply ``logsigmoid`` 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.logsigmoid(out)
    sync_if_needed(x)


@torch.no_grad()
def prelu(x: torch.Tensor) -> None:
    """Apply PReLU 100 times in a chain starting from ``x``, then sync.

    A fresh one-element weight of ones is created on the tensor's device at
    every iteration, so its construction is part of the timed work. Nothing
    is returned; the function exists only to be timed.
    """
    y = x
    for _ in range(100):
        y = torch.nn.functional.prelu(y, torch.ones(1).to(y.device))
    sync_if_needed(x)


@torch.no_grad()
def mish(x: torch.Tensor) -> None:
    """Apply Mish 100 times in a chain starting from ``x``, then sync.

    Nothing is returned; the function exists only to be timed.
    """
    y = x
    for _ in range(100):
        y = torch.nn.functional.mish(y)
    sync_if_needed(x)


@torch.no_grad()
def scalar_mult(x):
    """Multiply by the scalars ``1/1, 1/2, ..., 1/100`` in a chain, then sync."""
    out = x
    for step in range(100):
        out = out * (1.0 / (1 + step))
    sync_if_needed(x)


@torch.no_grad()
def cross_entropy(targets, x):
    """Compute the cross-entropy of ``x`` against ``targets`` 100 times, then sync."""
    losses = [torch.nn.functional.cross_entropy(x, targets) for _ in range(100)]
    sync_if_needed(x)


@torch.no_grad()
def logsumexp(axis, x):
    """Compute ``torch.logsumexp`` over ``axis`` 100 times, then sync."""
    outputs = [torch.logsumexp(x, dim=axis) for _ in range(100)]
    sync_if_needed(x)


@torch.no_grad()
def linear_fused(w, b, x):
    """Call ``torch.nn.functional.linear(x, w, b)`` ten times, then sync."""
    outputs = [torch.nn.functional.linear(x, w, b) for _ in range(10)]
    sync_if_needed(x)


@torch.no_grad()
def linear(w, b, x):
    """Compute the unfused linear layer ``x @ w^T + b`` ten times, then sync."""
    outputs = []
    for _ in range(10):
        projected = x @ torch.transpose(w, -2, -1)
        outputs.append(projected + b)
    sync_if_needed(x)


@torch.no_grad()
def rope(x):
    """Run a hand-written rotary-embedding style computation ten times, then sync.

    ``x`` is viewed as ``(-1, N, D)`` where ``N`` and ``D`` are its last two
    dimensions. Even and odd features along the last axis are rotated by
    per-position angles and recombined into a ``(-1, N, D)`` tensor.
    """
    *_, N, D = x.shape
    rotated = []
    for _ in range(10):
        x = x.view(-1, N, D)
        pos = torch.arange(N, device=x.device)
        freqs = 10000 ** torch.linspace(0, 1, D // 2, device=x.device)
        angles = pos[:, None] * freqs[None]
        cos_a = torch.cos(angles)
        sin_a = torch.sin(angles)
        even = x[..., ::2]
        odd = x[..., 1::2]
        rot_even = even * cos_a - odd * sin_a
        rot_odd = even * sin_a + odd * cos_a
        paired = torch.cat([rot_even[..., None], rot_odd[..., None]], dim=-1)
        rotated.append(paired.reshape(-1, N, D))
    sync_if_needed(x)


@torch.no_grad()
def concatenate(axis, x, y):
    """Concatenate ``x`` and ``y`` along ``axis`` ten times, then sync."""
    joined = [torch.cat([x, y], dim=axis) for _ in range(10)]
    sync_if_needed(x)


@torch.no_grad()
def cumsum(axis, x):
    """Compute the cumulative sum of ``x`` along ``axis`` ten times, then sync."""
    sums = [x.cumsum(axis) for _ in range(10)]
    sync_if_needed(x)


@torch.no_grad()
def sort(axis, x):
    """Sort ``x`` along ``axis`` ten times, keeping the sorted values, then sync."""
    sorted_values = []
    for _ in range(10):
        values = torch.sort(x, dim=axis)[0]
        sorted_values.append(values)
    sync_if_needed(x)


@torch.no_grad()
def topk(axis, x):
    """Take the top third of the entries of ``x`` along ``axis`` ten times, then sync."""
    k = x.shape[axis] // 3
    top_values = []
    for _ in range(10):
        values = torch.topk(x, k, dim=axis)[0]
        top_values.append(values)
    sync_if_needed(x)


@torch.no_grad()
def step_function(x):
    """Apply a 0/1 step (``0`` where negative, else ``1``) 100 times in a chain, then sync."""
    out = x
    for _ in range(100):
        is_negative = out < 0
        out = torch.where(is_negative, 0, 1)
    sync_if_needed(x)


@torch.no_grad()
def selu(x):
    """Apply SELU 100 times in a chain starting from ``x``, then sync."""
    out = x
    for _ in range(100):
        out = torch.nn.functional.selu(out)
    sync_if_needed(x)


if __name__ == "__main__":
    # Command-line driver: build the input tensors described by the flags,
    # run the requested benchmark and print the elapsed seconds to stdout.
    parser = argparse.ArgumentParser()
    parser.add_argument("benchmark", help="Choose the benchmark to run")
    parser.add_argument(
        "--size",
        default=[(1024, 1024)],
        type=lambda x: list(map(int, x.split("x"))),
        help="Set the matrix size",
        action="append",
    )
    parser.add_argument(
        "--axis",
        default=[1],
        type=int_or_list,
        help="Set a reduction axis",
        action="append",
    )
    parser.add_argument(
        "--transpose",
        type=none_or_list,
        default=[],
        help="Permute the matrix",
        action="append",
    )
    parser.add_argument(
        "--print-pid", action="store_true", help="Print the PID and pause"
    )
    parser.add_argument("--cpu", action="store_true", help="Use the CPU")
    parser.add_argument(
        "--fused", action="store_true", help="Use fused functions where possible"
    )
    parser.add_argument("--dtype", type=dtype_from_str, default=[], action="append")

    args = parser.parse_args()

    # With action="append" user values are appended after the default entry,
    # so drop the default whenever something was supplied explicitly.
    if len(args.size) > 1:
        args.size.pop(0)
    if len(args.axis) > 1:
        args.axis.pop(0)

    torch.set_num_threads(1)
    # Device preference: --cpu overrides CUDA, which overrides the MPS default.
    device = "mps"
    if torch.cuda.is_available():
        device = "cuda"
    if args.cpu:
        device = "cpu"

    # One dtype per size; missing entries reuse the first dtype given.
    types = args.dtype
    if not types:
        types = [torch.float32]
    if len(types) < len(args.size):
        types = types + [types[0]] * (len(args.size) - len(types))

    xs = []
    for size, dtype in zip(args.size, types):
        xs.append(torch.randn(*size).to(device).to(dtype))
    for i, t in enumerate(args.transpose):
        if t is None:
            continue
        xs[i] = xs[i].permute(*t)
    x = xs[0]
    axis = args.axis[0]

    if args.print_pid:
        print(os.getpid())
        input("Press enter to run")

    if args.benchmark == "matmul_square":
        print(bench(matmul_square, x))

    elif args.benchmark == "matmul":
        print(bench(matmul, *xs))

    elif args.benchmark == "linear":
        if args.fused:
            print(bench(linear_fused, *xs))
        else:
            print(bench(linear, *xs))

    elif args.benchmark == "sum_axis":
        print(bench(reduction, "sum", axis, x))

    elif args.benchmark == "sum_all":
        print(bench(reduction, "sum", None, x))

    elif args.benchmark == "argmax":
        print(bench(reduction, "argmax", axis, x))

    elif args.benchmark == "add":
        print(bench(binary, "add", *xs))

    elif args.benchmark == "mul":
        print(bench(binary, "mul", *xs))

    elif args.benchmark == "softmax":
        if args.fused:
            print(bench(softmax_fused, axis, x))
        else:
            print(bench(softmax, axis, x))

    elif args.benchmark == "relu":
        print(bench(relu, x))

    elif args.benchmark == "leaky_relu":
        print(bench(leaky_relu, x))

    elif args.benchmark == "elu":
        print(bench(elu, x))

    elif args.benchmark == "relu6":
        print(bench(relu6, x))

    elif args.benchmark == "softplus":
        print(bench(softplus, x))

    elif args.benchmark == "celu":
        print(bench(celu, x))

    elif args.benchmark == "log_sigmoid":
        print(bench(log_sigmoid, x))

    elif args.benchmark == "prelu":
        print(bench(prelu, x))
    elif args.benchmark == "mish":
        print(bench(mish, x))
    elif args.benchmark == "scalar_mul":
        print(bench(scalar_mult, x))

    elif args.benchmark == "cross_entropy":
        # Check the rank of the tensor that is actually benchmarked rather
        # than `size`, the variable left over from the construction loop
        # above (which holds the *last* --size and ignores --transpose).
        if x.dim() != 2:
            raise ValueError("Error: [cross_entropy] benchmark requires a 2 dim size")

        targets = torch.zeros(len(x), dtype=torch.long).to(x.device)
        print(bench(cross_entropy, targets, x))

    elif args.benchmark == "logsumexp":
        print(bench(logsumexp, axis, x))

    elif args.benchmark == "rope":
        print(bench(rope, x))

    elif args.benchmark == "concatenate":
        print(bench(concatenate, axis, *xs))

    elif args.benchmark == "cumsum":
        print(bench(cumsum, axis, *xs))

    elif args.benchmark == "conv1d":
        print(bench(conv1d, *xs))

    elif args.benchmark == "conv2d":
        print(bench(conv2d, *xs))

    elif args.benchmark == "sort":
        print(bench(sort, axis, x))

    elif args.benchmark == "topk":
        print(bench(topk, axis, x))

    elif args.benchmark == "step":
        print(bench(step_function, x))

    elif args.benchmark == "selu":
        print(bench(selu, x))

    elif args.benchmark == "sum_and_add":
        print(bench(sum_and_add, axis, *xs))

    else:
        raise ValueError(f"Unknown benchmark `{args.benchmark}`.")


================================================
FILE: benchmarks/python/comparative/compare.py
================================================
# Copyright © 2023 Apple Inc.

#!/usr/bin/env python

import argparse
import re
from pathlib import Path
from subprocess import run

# Paths of the two benchmark drivers, located next to this script.
BENCH_MLX = Path(__file__).parent.joinpath("bench_mlx.py")
BENCH_TORCH = Path(__file__).parent.joinpath("bench_torch.py")


def run_or_raise(*args, **kwargs):
    """Run a subprocess and return its standard output parsed as a float.

    All arguments are forwarded to ``subprocess.run`` with
    ``capture_output=True``.

    Raises:
        ValueError: if the captured stdout is not a valid float. The message
            carries the decoded stdout and stderr of the child process.
    """
    # Launch outside the ``try``: a ValueError raised by ``run`` itself (bad
    # arguments) must not reach the handler below, where ``result`` would be
    # unbound and the real error would be masked by an UnboundLocalError.
    result = run(*args, capture_output=True, **kwargs)
    try:
        return float(result.stdout)
    except ValueError as err:
        raise ValueError(
            f"stdout: {result.stdout.decode()}\nstderr: {result.stderr.decode()}"
        ) from err


def compare(args):
    """Run one benchmark under MLX and PyTorch and print the relative gain.

    Prints ``(t_torch - t_mlx) / t_torch`` followed by a tab and the
    space-joined benchmark arguments.
    """
    timings = [
        run_or_raise(["python", script] + args) for script in (BENCH_MLX, BENCH_TORCH)
    ]
    t_mlx, t_torch = timings
    relative_gain = (t_torch - t_mlx) / t_torch

    print(relative_gain, " ".join(args), sep="\t")


def compare_mlx_dtypes(args, dt1, dt2):
    """Run one MLX benchmark with two dtypes and print the relative gain of ``dt1``.

    Prints ``(t_dt2 - t_dt1) / t_dt2`` followed by a tab and the space-joined
    benchmark arguments.
    """
    base_cmd = ["python", BENCH_MLX] + args
    timings = [run_or_raise(base_cmd + ["--dtype", dt]) for dt in (dt1, dt2)]
    t_first, t_second = timings
    relative_gain = (t_second - t_first) / t_second

    print(relative_gain, " ".join(args), sep="\t")


def make_regex_search(regexes):
    """Compile ``regexes`` and return a function that searches a string with each.

    The returned function yields, lazily and in order, one boolean per
    pattern telling whether that pattern matches anywhere in its argument.
    """
    patterns = [re.compile(r) for r in regexes]

    def search(x):
        return map(lambda pattern: pattern.search(x) is not None, patterns)

    return search


def make_predicate(positive_filter, negative_filter):
    """Build a predicate that selects benchmark strings by regex filters.

    A string is accepted when it matches *every* pattern in
    ``positive_filter`` and *none* of the patterns in ``negative_filter``.
    Passing ``None`` for either filter disables that side of the test.
    """
    if positive_filter is None:

        def matches_all_required(x):
            return True

    else:
        required_search = make_regex_search(positive_filter)

        def matches_all_required(x):
            return all(required_search(x))

    if negative_filter is None:

        def matches_no_excluded(x):
            return True

    else:
        excluded_search = make_regex_search(negative_filter)

        def matches_no_excluded(x):
            return not any(excluded_search(x))

    def predicate(x):
        return matches_all_required(x) and matches_no_excluded(x)

    return predicate


if __name__ == "__main__":
    # Command-line driver: every benchmark configuration listed below is run
    # (subject to the --filter / --negative_filter regexes) and one result
    # line is printed per configuration.
    parser = argparse.ArgumentParser(description="Run comparisons against PyTorch")
    parser.add_argument(
        "--filter", "-f", help="Regex filter to select benchmarks", nargs="+"
    )
    parser.add_argument(
        "--negative_filter", "-n", help="Regex filter to remove benchmarks", nargs="+"
    )
    parser.add_argument(
        "--mlx_dtypes",
        "-d",
        help="Compare mlx benchmarks between the 2 provided data types",
        nargs=2,
    )
    # Unrecognised arguments are kept in `rest` and appended to every
    # benchmark command line.
    args, rest = parser.parse_known_args()

    _filter = make_predicate(args.filter, args.negative_filter)

    # Pick the comparison mode once: MLX dtype-vs-dtype, or MLX-vs-PyTorch.
    # Either way a configuration rejected by the filter is skipped.
    if args.mlx_dtypes:
        compare_filtered = lambda x: (
            compare_mlx_dtypes(x.split() + rest, args.mlx_dtypes[0], args.mlx_dtypes[1])
            if _filter(x)
            else None
        )
    else:
        compare_filtered = lambda x: compare(x.split() + rest) if _filter(x) else None

    # Binary ops
    compare_filtered("add --size 10x1024x128 --size 1x1024x128 --cpu")
    compare_filtered("add --size 10x1024x128 --size 1x1024x128")
    compare_filtered("add --size 1024x128 --size 1x128 --cpu")
    compare_filtered("add --size 1024x128 --size 1x128")
    compare_filtered("add --size 1024x4096 --size 1x4096 --cpu")
    compare_filtered("add --size 1024x4096 --size 1x4096")
    compare_filtered("add --size 1024x4096 --size 1x1024 --transpose 1,0 --cpu")
    compare_filtered("add --size 1024x4096 --size 1x1024 --transpose 1,0")
    compare_filtered("add --size 1024x1024 --size 1024x1024 --cpu")
    compare_filtered("add --size 1024x1024 --size 1024x1024")
    compare_filtered("add --size 1024x1024 --size 1024x1024 --transpose 1,0 --cpu")
    compare_filtered("add --size 1024x1024 --size 1024x1024 --transpose 1,0")
    compare_filtered(
        "add --size 1024x1024 --size 1024x1024 --transpose 1,0 --transpose 1,0 --cpu"
    )
    compare_filtered(
        "add --size 1024x1024 --size 1024x1024 --transpose 1,0 --transpose 1,0"
    )

    # Reduction ops
    compare_filtered("sum_all --size 10x1024x128 --cpu")
    compare_filtered("sum_all --size 10x1024x128")
    compare_filtered("sum_axis --size 16x1024x128 --axis 2 --cpu")
    compare_filtered("sum_axis --size 16x1024x128 --axis 2")
    compare_filtered("sum_axis --size 16x128x1024 --axis 2 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 2")
    compare_filtered("sum_axis --size 1024x1024 --axis 1 --cpu")
    compare_filtered("sum_axis --size 1024x1024 --axis 1")
    compare_filtered("sum_axis --size 1024x1024 --axis 0 --cpu")
    compare_filtered("sum_axis --size 1024x1024 --axis 0")
    compare_filtered("sum_axis --size 16x128x1024 --axis 1 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 1")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,1")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,2")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,1 --transpose 0,2,1")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1 --cpu")
    compare_filtered("sum_axis --size 16x128x1024 --axis 0,2 --transpose 0,2,1")
    compare_filtered("argmax --size 10x1024x128 --axis 1 --cpu")
    compare_filtered("argmax --size 10x1024x128 --axis 1")
    compare_filtered("argmax --size 10x1024x128 --axis 2 --cpu")
    compare_filtered("argmax --size 10x1024x128 --axis 2")
    compare_filtered("argmax --size 1024x1024 --axis 1 --cpu")
    compare_filtered("argmax --size 1024x1024 --axis 1")

    # Matmul ops
    compare_filtered("matmul_square --size 1024x1024")
    compare_filtered("matmul_square --size 1024x1024 --cpu")
    compare_filtered("matmul_square --size 16x1024x1024")
    compare_filtered("matmul_square --size 16x1024x1024 --cpu")
    # `--transpose=` passes an empty value, i.e. "leave this operand as is".
    compare_filtered(
        "matmul --size 16x768x768 --size 16x768x768 --transpose= --transpose 0,2,1"
    )
    compare_filtered(
        "matmul --size 16x768x768 --size 16x768x768 --transpose= --transpose 0,2,1 --cpu"
    )
    compare_filtered(
        "matmul --size 16x768x128 --size 16x768x128 --transpose= --transpose 0,2,1"
    )
    compare_filtered(
        "matmul --size 16x768x128 --size 16x768x128 --transpose= --transpose 0,2,1 --cpu"
    )
    compare_filtered("matmul --size 512x8192 --size 8192x512")
    compare_filtered("matmul --size 512x8192 --size 8192x512 --cpu")
    # compare_filtered("matmul --size 512x131072 --size 131072x512")
    # compare_filtered("matmul --size 512x131072 --size 131072x512 --cpu")
    compare_filtered("matmul --size 8192x512 --size 512x8192")
    compare_filtered("matmul --size 8192x512 --size 512x8192 --cpu")
    # compare_filtered("matmul --size 131072x512 --size 512x512")
    # compare_filtered("matmul --size 131072x512 --size 512x512 --cpu")
    compare_filtered("linear --size 1024x1024 --size 1024 --size 128x1024")
    compare_filtered("linear --size 1024x1024 --size 1024 --size 128x1024 --cpu")
    compare_filtered("linear --size 1024x1024 --size 1024 --size 128x1024 --fused")
    compare_filtered(
        "linear --size 1024x1024 --size 1024 --size 128x1024 --fused --cpu"
    )

    # Matvec ops
    compare_filtered("matmul --size 1x1x4096 --size 4096x4096 --cpu")
    compare_filtered("matmul --size 1x1x4096 --size 4096x4096")
    compare_filtered(
        "matmul --size 1x1x4096 --size 4096x4096 --transpose= --transpose 1,0 --cpu"
    )
    compare_filtered(
        "matmul --size 1x1x4096 --size 4096x4096 --transpose= --transpose 1,0"
    )
    compare_filtered("matmul --size 32x1x1000 --size 32x1000x128 --cpu")
    compare_filtered("matmul --size 32x1x1000 --size 32x1000x128")
    compare_filtered(
        "matmul --size 32x1x1000 --size 32x128x1000 --transpose= --transpose 0,2,1 --cpu"
    )
    compare_filtered(
        "matmul --size 32x1x1000 --size 32x128x1000 --transpose= --transpose 0,2,1"
    )

    # Various ops
    compare_filtered("softmax --size 32x16x1024 --axis 2")
    compare_filtered("softmax --size 32x16x1024 --axis 2 --cpu")
    compare_filtered("softmax --size 32x16x1024 --axis 2 --fused")
    compare_filtered("softmax --size 32x16x1024 --axis 2 --fused --cpu")
    compare_filtered("softmax --size 2x1024x1024 --axis 1")
    compare_filtered("softmax --size 2x1024x1024 --axis 1 --cpu")
    compare_filtered("softmax --size 2x1024x1024 --axis 1 --fused")
    compare_filtered("softmax --size 2x1024x1024 --axis 1 --fused --cpu")
    compare_filtered("relu --size 32x16x1024")
    compare_filtered("relu --size 32x16x1024 --cpu")
    compare_filtered("leaky_relu --size 32x16x1024")
    compare_filtered("leaky_relu --size 32x16x1024 --cpu")
    compare_filtered("elu --size 32x16x1024")
    compare_filtered("elu --size 32x16x1024 --cpu")
    compare_filtered("relu6 --size 32x16x1024")
    compare_filtered("relu6 --size 32x16x1024 --cpu")
    compare_filtered("softplus --size 32x16x1024")
    compare_filtered("softplus --size 32x16x1024 --cpu")
    compare_filtered("celu --size 32x16x1024")
    compare_filtered("celu --size 32x16x1024 --cpu")
    compare_filtered("log_sigmoid --size 32x16x1024")
    compare_filtered("log_sigmoid --size 32x16x1024 --cpu")
    compare_filtered("step --size 32x16x1024")
    compare_filtered("step --size 32x16x1024 --cpu")
    compare_filtered("selu --size 32x16x1024")
    compare_filtered("selu --size 32x16x1024 --cpu")
    # compare_filtered("mish --size 32x16x1024") NOTE: Torch does not implement Mish in MPS atm
    compare_filtered("mish --size 32x16x1024 --cpu")
    compare_filtered("prelu --size 32x16x1024")
    compare_filtered("prelu --size 32x16x1024 --cpu")

    compare_filtered("scalar_mul --size 32x16x1024")
    compare_filtered("scalar_mul --size 32x16x1024 --cpu")
    compare_filtered("cross_entropy --size 256x1024")
    compare_filtered("cross_entropy --size 256x1024 --cpu")
    compare_filtered("logsumexp --size 1024x1024 --axis 1")
    compare_filtered("logsumexp --size 1024x1024 --axis 1 --cpu")
    compare_filtered("logsumexp --size 1024x1024 --axis 0")
    compare_filtered("logsumexp --size 1024x1024 --axis 0 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 2")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 2 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 1")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 1 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 0")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1024x128 --axis 0 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x16x128 --axis 1")
    compare_filtered("concatenate --size 32x1024x128 --size 32x16x128 --axis 1 --cpu")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1x128 --axis 1")
    compare_filtered("concatenate --size 32x1024x128 --size 32x1x128 --axis 1 --cpu")
    compare_filtered("concatenate --size 1x32x1024x128 --size 1x32x1x128 --axis 2")
    compare_filtered(
        "concatenate --size 1x32x1024x128 --size 1x32x1x128 --axis 2 --cpu"
    )
    compare_filtered("conv1d --size 1x1000x80 --size 128x11x80")
    compare_filtered("conv1d --size 1x1000x80 --size 128x11x80 --cpu")
    compare_filtered("conv1d --size 16x1000x80 --size 128x11x80")
    compare_filtered("conv1d --size 4x1000x80 --size 128x11x80 --cpu")
    compare_filtered("conv2d --size 1x256x256x3 --size 8x3x3x3")
    compare_filtered("conv2d --size 1x256x256x3 --size 8x3x3x3 --cpu")
    compare_filtered("conv2d --size 16x256x256x3 --size 8x3x3x3")
    compare_filtered("conv2d --size 4x256x256x3 --size 8x3x3x3 --cpu")
    compare_filtered("cumsum --size 1024x1024 --axis 1 --cpu")
    compare_filtered("cumsum --size 1024x1024 --axis 0 --cpu")
    compare_filtered("cumsum --size 1024x1024 --axis 1")
    compare_filtered("cumsum --size 1024x1024 --axis 0")
    compare_filtered("cumsum --size 128x1024 --axis 1")
    compare_filtered("cumsum --size 128x1024 --axis 0")
    compare_filtered("cumsum --size 1024x4096 --axis 1")
    compare_filtered("cumsum --size 1024x4096 --axis 0")
    compare_filtered("cumsum --size 128x4096 --axis 1")
    compare_filtered("cumsum --size 128x4096 --axis 0")
    compare_filtered("cumsum --size 1024x7777 --axis 1")
    compare_filtered("cumsum --size 1024x7777 --axis 0")
    compare_filtered("cumsum --size 128x7777 --axis 1")
    compare_filtered("cumsum --size 128x7777 --axis 0")
    compare_filtered("cumsum --size 32768x128 --axis 1")
    compare_filtered("cumsum --size 32768x128 --axis 0")

    # Sort ops
    compare_filtered("sort --size 1024x1024 --axis 0")
    compare_filtered("sort --size 1024x1024 --axis 1")
    compare_filtered("sort --size 32768x128 --axis 0")
    compare_filtered("sort --size 32768x128 --axis 1")
    compare_filtered("sort --size 128x128 --axis 0 --cpu")
    compare_filtered("sort --size 128x128 --axis 1 --cpu")

    # Top-k ops
    compare_filtered("topk --size 1024x1024 --axis 0")
    compare_filtered("topk --size 1024x1024 --axis 1")
    compare_filtered("topk --size 32768x128 --axis 0")
    compare_filtered("topk --size 32768x128 --axis 1")
    compare_filtered("topk --size 128x128 --axis 0 --cpu")
    compare_filtered("topk --size 128x128 --axis 1 --cpu")


================================================
FILE: benchmarks/python/compile_bench.py
================================================
# Copyright © 2023-2024 Apple Inc.

import argparse
import math
import random

import mlx.core as mx
from time_utils import time_fn


def bench_gelu():
    """Time an erf-based GELU with and without ``mx.compile``.

    Two scenarios are measured through ``time_fn``:

    * "fixed": ten chained applications on an input of constant shape;
    * "variable": the first input is sliced to a random number of rows
      before ten applications on both inputs, run eagerly, compiled, and
      compiled with ``shapeless=True``.
    """

    def gelu(x):
        return x * (1 + mx.erf(x / math.sqrt(2))) / 2

    x = mx.random.uniform(shape=(1000, 1024))

    def gen_fun(fun):
        def bench_fun(x):
            for _ in range(10):
                x = fun(x)
            return x

        return bench_fun

    time_fn(gen_fun(gelu), x, msg="fixed gelu")
    time_fn(gen_fun(mx.compile(gelu)), x, msg="compiled fixed gelu")

    def randint():
        return random.randint(1, x.shape[0])

    def gen_fun(fun):
        def bench_fun(x, y):
            x = x[: randint()]
            for _ in range(10):
                x = fun(x)
                y = fun(y)
            return x, y

        return bench_fun

    y = mx.random.uniform(shape=(1000, 1024))
    # Re-seed before every variant so each one starts from the same random
    # state for its slice lengths, as bench_layernorm does.
    random.seed(0)
    time_fn(gen_fun(gelu), x, y, msg="variable gelu")
    random.seed(0)
    time_fn(gen_fun(mx.compile(gelu)), x, y, msg="compiled variable gelu")
    random.seed(0)
    time_fn(
        gen_fun(mx.compile(gelu, shapeless=True)),
        x,
        y,
        msg="shapeless variable gelu",
    )


def bench_layernorm():
    """Time a hand-written layer norm with and without ``mx.compile``.

    The normalisation is computed in float32 and cast back to float16 before
    the affine transform. A fixed-shape scenario and a variable-shape
    scenario (random leading slice, re-seeded before each variant) are timed
    through ``time_fn``.
    """
    weight = mx.random.uniform(shape=(4096,)).astype(mx.float16)
    bias = mx.random.uniform(shape=(4096,)).astype(mx.float16)
    mx.eval(weight, bias)

    def layernorm(x):
        x = x.astype(mx.float32)
        means = mx.mean(x, axis=-1, keepdims=True)
        var = mx.var(x, axis=-1, keepdims=True)
        x = (x - means) * mx.rsqrt(var + 1e-4)
        x = x.astype(mx.float16)
        return weight * x + bias

    x = mx.random.uniform(shape=(1000, 4096)).astype(mx.float16)

    def repeat_fixed(fun):
        # Ten chained applications on the full input.
        def bench_fun(x):
            for _ in range(10):
                x = fun(x)
            return x

        return bench_fun

    time_fn(repeat_fixed(layernorm), x, msg="fixed layernorm")
    time_fn(repeat_fixed(mx.compile(layernorm)), x, msg="compiled fixed layernorm")

    def randint():
        return random.randint(1, x.shape[0])

    def repeat_variable(fun):
        # Ten chained applications on a random-length leading slice.
        def bench_fun(x):
            x = x[: randint()]
            for _ in range(10):
                x = fun(x)
            return x

        return bench_fun

    random.seed(0)
    time_fn(repeat_variable(layernorm), x, msg="variable layernorm")
    random.seed(0)
    time_fn(
        repeat_variable(mx.compile(layernorm)), x, msg="compiled variable layernorm"
    )
    random.seed(0)
    time_fn(
        repeat_variable(mx.compile(layernorm, shapeless=True)),
        x,
        msg="shapeless variable layernorm",
    )


if __name__ == "__main__":
    # The first positional parameter of ArgumentParser is `prog`, so the text
    # must be passed as `description` to show up as the help description.
    parser = argparse.ArgumentParser(description="Compile benchmarks.")
    # No options are defined; parsing still provides -h/--help.
    parser.parse_args()

    bench_gelu()
    bench_layernorm()


================================================
FILE: benchmarks/python/conv1d_bench.py
================================================
import argparse
import math
import os
import subprocess
import time

import mlx.core as mx
import numpy as np
import torch

# CPU brand string as reported by `sysctl` (presumably macOS-only; the key
# `machdep.cpu.brand_string` is queried at import time).
device_name = (
    subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
    .decode("utf-8")
    .strip("\n")
)

# Loop counts: untimed warm-up calls, timed calls, and convolutions issued
# inside each timed call.
N_warmup = 10
N_iter_bench = 100
N_iter_func = 5


def bench(f, a, b):
    """Time ``N_iter_bench`` calls of ``f(a, b)`` after ``N_warmup`` warm-up calls.

    The MPS queue is synchronised after the warm-up so that leftover work is
    not counted. Returns the elapsed time in seconds.
    """
    for _ in range(N_warmup):
        f(a, b)
    torch.mps.synchronize()

    start_ns = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    stop_ns = time.perf_counter_ns()
    return (stop_ns - start_ns) * 1e-9


def make_mx_conv_1D(strides=1, padding=0, groups=1):
    """Return a callable that runs ``N_iter_func`` MLX 1-D convolutions and evaluates them."""

    def mx_conv_1D(a, b):
        outputs = [
            mx.conv1d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]
        # Force evaluation of the queued outputs before returning.
        mx.eval(outputs)
        return outputs

    return mx_conv_1D


def make_pt_conv_1D(strides=1, padding=0, groups=1):
    """Return a callable that runs ``N_iter_func`` PyTorch 1-D convolutions, then syncs MPS."""

    @torch.no_grad()
    def pt_conv_1D(a, b):
        outputs = [
            torch.conv1d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]
        torch.mps.synchronize()
        return outputs

    return pt_conv_1D


def bench_shape(N, iH, C, wH, O, strides, padding, np_dtype, groups):
    """Benchmark one 1-D convolution configuration on MLX and PyTorch (MPS).

    Random inputs of shape ``(N, iH, C)`` and weights of shape
    ``(O, wH, C / groups)`` are generated with NumPy and shared by both
    frameworks. After timing, the two outputs are compared and a message is
    printed when they disagree beyond the tolerance.

    Returns:
        ``(time_mlx, time_torch)`` in seconds.
    """
    weight_bound = 1.0 / math.sqrt(wH * C)
    input_np = np.random.uniform(0, 0.5, (N, iH, C)).astype(np_dtype)
    weight_np = np.random.uniform(
        -weight_bound, weight_bound, (O, wH, int(C / groups))
    ).astype(np_dtype)

    input_mx = mx.array(input_np)
    weight_mx = mx.array(weight_np)

    # PyTorch gets the same data with the last two axes swapped.
    input_pt = torch.from_numpy(input_np.transpose((0, 2, 1))).to("mps")
    weight_pt = torch.from_numpy(weight_np.transpose((0, 2, 1))).to("mps")

    torch.mps.synchronize()

    conv_mx = make_mx_conv_1D(strides, padding, groups)
    conv_pt = make_pt_conv_1D(strides, padding, groups)

    time_torch = bench(conv_pt, input_pt, weight_pt)
    time_mlx = bench(conv_mx, input_mx, weight_mx)

    # Correctness check: the PyTorch reference is computed on the CPU.
    out_mx = mx.conv1d(
        input_mx, weight_mx, stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.conv1d(
        input_pt.to("cpu"),
        weight_pt.to("cpu"),
        stride=strides,
        padding=padding,
        groups=groups,
    )
    out_pt = torch.permute(out_pt, (0, 2, 1))
    out_pt = out_pt.numpy(force=True)

    if np_dtype == np.float32:
        atol = 2e-5
    else:
        atol = 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, iH, C)}, {(O, wH, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    # Run every configuration in `shapes` and print one result line each.
    parser = argparse.ArgumentParser(description="Run conv benchmarks")
    # No options are defined, but the parser was previously never invoked, so
    # -h/--help did nothing. Parsing makes the description reachable.
    parser.parse_args()

    dtypes = ("float32",)
    # Each row: (N, iH, C, wH, O, strides, padding, groups)
    shapes = (
        (4, 32, 32, 5, 32, 1, 2, 1),
        (4, 32, 32, 5, 32, 1, 2, 2),
        (4, 32, 32, 5, 32, 1, 2, 4),
        (4, 32, 32, 5, 32, 1, 2, 8),
        (4, 32, 32, 5, 32, 1, 2, 8),
        (4, 32, 32, 5, 32, 1, 2, 16),
        (4, 32, 32, 5, 32, 1, 2, 32),
        (4, 32, 256, 5, 512, 1, 2, 2),
        (4, 32, 256, 5, 512, 1, 2, 128),
        (4, 32, 256, 5, 512, 1, 2, 256),
    )

    for dtype in dtypes:
        print("(N,  iH,  C),  (O,  wH,  C),   dtype,  stride, pads, groups, diff%")
        for N, iH, C, wH, O, strides, padding, groups in shapes:
            np_dtype = getattr(np, dtype)
            time_mlx, time_torch = bench_shape(
                N, iH, C, wH, O, strides, padding, np_dtype, groups
            )
            # Positive when MLX is faster than PyTorch.
            diff = time_torch / time_mlx - 1.0

            print(
                f"({N}, {iH:3d}, {C:3d}), ({O:3d}, {wH:2d}, {C:3d}), {dtype}, {strides:5d}, {padding:4d}, {groups:6d}, {100. * diff:+5.2f}%"
            )

            # Flag configurations where MLX is at least twice as slow.
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/conv2d_bench_cpu.py
================================================
import argparse
import math
import time

import mlx.core as mx
import numpy as np
import torch

# Run all MLX work in this script on the CPU.
mx.set_default_device(mx.cpu)

# Loop counts: untimed warm-up calls, timed calls, and convolutions issued
# inside each timed call.
N_warmup = 1
N_iter_bench = 10
N_iter_func = 5


def bench(f, a, b):
    """Time ``N_iter_bench`` calls of ``f(a, b)`` after ``N_warmup`` warm-up calls.

    Returns the elapsed time in seconds.
    """
    for _ in range(N_warmup):
        f(a, b)

    start_ns = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    stop_ns = time.perf_counter_ns()
    return (stop_ns - start_ns) * 1e-9


def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Return a callable that runs ``N_iter_func`` MLX 2-D convolutions and evaluates them."""

    def mx_conv_2D(a, b):
        outputs = [
            mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]
        # Force evaluation of the queued outputs before returning.
        mx.eval(outputs)
        return outputs

    return mx_conv_2D


def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Return a callable that runs ``N_iter_func`` PyTorch 2-D convolutions."""

    @torch.no_grad()
    def pt_conv_2D(a, b):
        return [
            torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]

    return pt_conv_2D


def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
    """Benchmark one 2D convolution configuration on MLX and PyTorch (CPU).

    Random input ``(N, H, W, C)`` and weight ``(O, kH, kW, C // groups)``
    arrays are generated with NumPy. MLX receives them as-is; PyTorch receives
    them with the last axis moved to position 1. After timing, one convolution
    is run with each framework and a message is printed when the results
    differ by more than the tolerance.

    Args:
        N, H, W, C: Input batch size, height, width and channel count.
        kH, kW: Kernel height and width.
        O: Number of output channels.
        strides, padding, groups: Forwarded to both convolution calls.
        np_dtype: NumPy dtype of the generated data.

    Returns:
        Tuple ``(time_mlx, time_torch)`` with the durations reported by
        ``bench``.
    """
    # Scale weights by the full kernel area kH * kW; the earlier kH * kH was
    # only right for square kernels.
    scale = 1.0 / math.sqrt(kH * kW * C)
    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
    # Integer division keeps the per-group channel count an exact int.
    b_np = np.random.uniform(-scale, scale, (O, kH, kW, C // groups)).astype(
        np_dtype
    )

    a_mx = mx.array(a_np)
    b_mx = mx.array(b_np)

    # Move the last axis to position 1 for the PyTorch tensors.
    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("cpu")

    f_mx = make_mx_conv_2D(strides, padding, groups)
    f_pt = make_pt_conv_2D(strides, padding, groups)

    # PyTorch is timed first, then MLX.
    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

    # Cross-check one result from each framework in the MLX axis order.
    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
    out_pt = torch.conv2d(
        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
    out_pt = out_pt.numpy(force=True)

    atol = 2e-5 if np_dtype == np.float32 else 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    # The parser is constructed but no arguments are parsed from it.
    parser = argparse.ArgumentParser(description="Run conv benchmarks")

    dtypes = ("float32",)
    # Each row: (N, H, W, C, kH, kW, O, strides, padding, groups)
    shapes = (
        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
        # (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
    )

    for dtype in dtypes:
        np_dtype = getattr(np, dtype)
        print(
            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
        )
        for shape in shapes:
            N, H, W, C, kH, kW, O, strides, padding, groups = shape
            time_mlx, time_torch = bench_shape(*shape, np_dtype)
            # Positive when MLX is faster than PyTorch.
            diff = time_torch / time_mlx - 1.0

            print(
                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
            )
            # Flag rows where MLX takes at least twice as long as PyTorch.
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/conv2d_train_bench_cpu.py
================================================
import time

import mlx.core as mx
import mlx.nn
import mlx.optimizers as opt
import torch


def bench_mlx(steps: int = 20) -> float:
    """Time ``steps`` training iterations of a small conv network in MLX.

    The default MLX device is set to the CPU. The duration of every iteration
    is printed in milliseconds.

    Args:
        steps: Number of timed training iterations.

    Returns:
        Sum of the per-iteration durations, in milliseconds.
    """
    mx.set_default_device(mx.cpu)

    class BenchNetMLX(mlx.nn.Module):
        # simple encoder-decoder net

        def __init__(self, in_channels, hidden_channels=32):
            super().__init__()

            # Two convolutions followed by two transposed convolutions (all
            # kernel_size=3, padding=1) with a ReLU between consecutive layers.
            self.net = mlx.nn.Sequential(
                mlx.nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
                mlx.nn.ReLU(),
                mlx.nn.Conv2d(
                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
                ),
                mlx.nn.ReLU(),
                mlx.nn.ConvTranspose2d(
                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
                ),
                mlx.nn.ReLU(),
                mlx.nn.ConvTranspose2d(
                    hidden_channels, in_channels, kernel_size=3, padding=1
                ),
            )

        def __call__(self, input):
            return self.net(input)

    benchNet = BenchNetMLX(3)
    mx.eval(benchNet.parameters())
    optim = opt.Adam(learning_rate=1e-3)

    # Fixed random batch; the last axis (3) matches the network's in_channels.
    inputs = mx.random.normal([10, 256, 256, 3])

    params = benchNet.parameters()
    optim.init(params)

    # Model and optimizer state that is evaluated at the end of every step.
    state = [benchNet.state, optim.state]

    def loss_fn(params, image):
        # Load the given parameters, then score the network by the mean
        # absolute difference between its output and its input.
        benchNet.update(params)
        pred_image = benchNet(image)
        return (pred_image - image).abs().mean()

    def step(params, image):
        # One optimization step: loss and gradients, then an optimizer update.
        loss, grads = mx.value_and_grad(loss_fn)(params, image)
        optim.update(benchNet, grads)
        return loss

    total_time = 0.0
    print("MLX:")
    for i in range(steps):
        start_time = time.perf_counter()

        # The loss returned by step() is not used here.
        step(benchNet.parameters(), inputs)
        # Evaluating the state happens inside the timed region.
        mx.eval(state)
        end_time = time.perf_counter()

        # perf_counter() is in seconds; report and accumulate milliseconds.
        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
        total_time += (end_time - start_time) * 1000

    return total_time


def bench_torch(steps: int = 20) -> float:
    """Time ``steps`` training iterations of a small conv network in PyTorch.

    The model and a fixed random batch of shape ``(10, 3, 256, 256)`` live on
    the CPU. The duration of every iteration is printed in milliseconds.

    Args:
        steps: Number of timed training iterations.

    Returns:
        Sum of the per-iteration durations, in milliseconds.
    """
    device = torch.device("cpu")

    class BenchNetTorch(torch.nn.Module):
        """Two convolutions followed by two transposed convolutions."""

        def __init__(self, in_channels, hidden_channels=32):
            super().__init__()

            wide_channels = 2 * hidden_channels
            conv_args = dict(kernel_size=3, padding=1)
            # Layers are created in network order, with ReLU in between.
            layers = [
                torch.nn.Conv2d(in_channels, hidden_channels, **conv_args),
                torch.nn.ReLU(),
                torch.nn.Conv2d(hidden_channels, wide_channels, **conv_args),
                torch.nn.ReLU(),
                torch.nn.ConvTranspose2d(wide_channels, hidden_channels, **conv_args),
                torch.nn.ReLU(),
                torch.nn.ConvTranspose2d(hidden_channels, in_channels, **conv_args),
            ]
            self.net = torch.nn.Sequential(*layers)

        def forward(self, input):
            return self.net(input)

    model = BenchNetTorch(3).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    inputs = torch.randn(10, 3, 256, 256, device=device)

    total_time = 0.0
    print("PyTorch:")
    for i in range(steps):
        start_time = time.perf_counter()

        optimizer.zero_grad()
        # Mean absolute difference between the network output and its input.
        loss = (model(inputs) - inputs).abs().mean()
        loss.backward()
        optimizer.step()

        end_time = time.perf_counter()

        # perf_counter() is in seconds; report and accumulate milliseconds.
        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
        total_time += (end_time - start_time) * 1000

    return total_time


def main():
    """Run the MLX and PyTorch training benchmarks and print a comparison."""
    steps = 20
    # MLX is benchmarked first, then PyTorch.
    time_mlx, time_torch = bench_mlx(steps), bench_torch(steps)

    summary = (
        f"average time of MLX:     {time_mlx/steps:9.2f} ms",
        f"total time of MLX:       {time_mlx:9.2f} ms",
        f"average time of PyTorch: {time_torch/steps:9.2f} ms",
        f"total time of PyTorch:   {time_torch:9.2f} ms",
    )
    for line in summary:
        print(line)

    # Positive when MLX is faster than PyTorch.
    diff = time_torch / time_mlx - 1.0
    print(f"torch/mlx diff: {100. * diff:+5.2f}%")


if __name__ == "__main__":
    main()


================================================
FILE: benchmarks/python/conv2d_transpose_bench_cpu.py
================================================
import argparse
import math
import time

import mlx.core as mx
import numpy as np
import torch

# Number of untimed warm-up calls bench() makes before it starts the clock.
N_warmup = 1
# Number of timed calls bench() makes.
N_iter_bench = 10
# Number of transposed convolutions issued per call of the benchmarked closures.
N_iter_func = 5


def bench(f, a, b):
    """Return the seconds taken by ``N_iter_bench`` calls of ``f(a, b)``.

    ``N_warmup`` untimed calls are made before the measurement starts.
    """
    for _ in range(N_warmup):
        f(a, b)

    t_begin = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    t_end = time.perf_counter_ns()

    # Nanoseconds -> seconds.
    return (t_end - t_begin) * 1e-9


def make_mx_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Build the MLX workload for one 2D transposed-convolution configuration.

    The returned callable issues ``N_iter_func`` identical
    ``mx.conv_transpose2d`` calls with ``stream=mx.cpu``, passes all results
    to ``mx.eval`` together and returns them as a list.
    """

    def mx_conv_transpose_2D(a, b):
        outputs = [
            mx.conv_transpose2d(
                a, b, stride=strides, padding=padding, groups=groups, stream=mx.cpu
            )
            for _ in range(N_iter_func)
        ]
        mx.eval(outputs)
        return outputs

    return mx_conv_transpose_2D


def make_pt_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Build the PyTorch workload for one 2D transposed-convolution configuration.

    The returned callable returns a list with the results of ``N_iter_func``
    identical ``torch.conv_transpose2d`` calls, computed with gradient
    tracking disabled.
    """

    def pt_conv_transpose_2D(a, b):
        with torch.no_grad():
            return [
                torch.conv_transpose2d(
                    a, b, stride=strides, padding=padding, groups=groups
                )
                for _ in range(N_iter_func)
            ]

    return pt_conv_transpose_2D


def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
    """Benchmark one 2D transposed-convolution configuration on MLX and PyTorch (CPU).

    Random input ``(N, H, W, C)`` and weight ``(O // groups, kH, kW, C)``
    arrays are generated with NumPy. MLX receives them as-is; PyTorch receives
    the input with its last axis moved to position 1 and the weight with its
    last axis moved to the front. After timing, one transposed convolution is
    run with each framework and a message is printed when the results differ
    by more than the tolerance.

    Args:
        N, H, W, C: Input batch size, height, width and channel count.
        kH, kW: Kernel height and width.
        O: Number of output channels.
        strides, padding, groups: Forwarded to both transposed-convolution calls.
        np_dtype: NumPy dtype of the generated data.

    Returns:
        Tuple ``(time_mlx, time_torch)`` with the durations reported by
        ``bench``.
    """
    # Scale weights by the full kernel area kH * kW; the earlier kH * kH was
    # only right for square kernels.
    scale = 1.0 / math.sqrt(kH * kW * C)
    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
    # Integer division keeps the per-group channel count an exact int.
    b_np = np.random.uniform(-scale, scale, (O // groups, kH, kW, C)).astype(
        np_dtype
    )

    a_mx = mx.array(a_np)
    b_mx = mx.array(b_np)

    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("cpu")
    # The weight's last axis becomes the leading axis for PyTorch.
    b_pt = torch.from_numpy(b_np.transpose((3, 0, 1, 2))).to("cpu")

    f_mx = make_mx_conv_transpose_2D(strides, padding, groups)
    f_pt = make_pt_conv_transpose_2D(strides, padding, groups)

    # PyTorch is timed first, then MLX.
    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

    # Cross-check one result from each framework in the MLX axis order.
    out_mx = mx.conv_transpose2d(
        a_mx, b_mx, stride=strides, padding=padding, groups=groups, stream=mx.cpu
    )
    out_pt = torch.conv_transpose2d(
        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
    out_pt = out_pt.numpy(force=True)

    atol = 2e-5 if np_dtype == np.float32 else 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    # The parser is constructed but no arguments are parsed from it.
    parser = argparse.ArgumentParser(description="Run conv benchmarks")

    dtypes = ("float32",)
    # Each row: (N, H, W, C, kH, kW, O, strides, padding, groups)
    shapes = (
        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
    )

    for dtype in dtypes:
        np_dtype = getattr(np, dtype)
        print(
            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
        )
        for shape in shapes:
            N, H, W, C, kH, kW, O, strides, padding, groups = shape
            time_mlx, time_torch = bench_shape(*shape, np_dtype)
            # Positive when MLX is faster than PyTorch.
            diff = time_torch / time_mlx - 1.0

            print(
                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
            )
            # Flag rows where MLX takes at least twice as long as PyTorch.
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/conv3d_bench.py
================================================
import math
import time

import mlx.core as mx
import numpy as np
import torch

# Number of untimed warm-up calls bench() makes before it starts the clock.
N_warmup = 2
# Number of timed calls bench() makes.
N_iter_bench = 10
# Loop iterations per benchmarked call; each iteration chains two convolutions.
N_iter_func = 10


def bench(f, a, b, b_prime):
    """Return the seconds taken by ``N_iter_bench`` calls of ``f(a, b, b_prime)``.

    ``N_warmup`` untimed calls are made first, followed by
    ``torch.mps.synchronize()``, before the measurement starts.
    """
    for _ in range(N_warmup):
        f(a, b, b_prime)
    torch.mps.synchronize()

    t_begin = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b, b_prime)
    elapsed_ns = time.perf_counter_ns() - t_begin
    # Nanoseconds -> seconds.
    return elapsed_ns * 1e-9


def make_mx_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
    """Build the MLX workload for one 3D convolution configuration.

    The returned callable starts from ``a`` and, ``N_iter_func`` times,
    convolves the running result with ``b`` and then with ``b_prime``. The
    final array is passed to ``mx.eval`` and returned.
    """

    def mx_conv_3D(a, b, b_prime):
        out = a
        for _ in range(N_iter_func):
            # Apply the two weights back to back.
            for weight in (b, b_prime):
                out = mx.conv3d(
                    out, weight, stride=strides, padding=padding, groups=groups
                )
        mx.eval(out)
        return out

    return mx_conv_3D


def make_pt_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
    """Build the PyTorch workload for one 3D convolution configuration.

    The returned callable starts from ``a`` and, ``N_iter_func`` times,
    convolves the running result with ``b`` and then with ``b_prime``, with
    gradient tracking disabled. It calls ``torch.mps.synchronize()`` before
    returning the final tensor.
    """

    def pt_conv_3D(a, b, b_prime):
        with torch.no_grad():
            out = a
            for _ in range(N_iter_func):
                # Apply the two weights back to back.
                for weight in (b, b_prime):
                    out = torch.conv3d(
                        out, weight, stride=strides, padding=padding, groups=groups
                    )
            torch.mps.synchronize()
            return out

    return pt_conv_3D


def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
    """Benchmark one 3D convolution configuration on MLX and PyTorch (MPS).

    Besides timing, a single convolution is run with each framework to sample
    memory counters, and one MLX result is compared against a PyTorch result
    computed on CPU copies of the tensors; a message is printed on mismatch.

    Args:
        N, D, H, W, C: Sizes of the five axes of the generated input.
        kD, kH, kW: Kernel sizes along the three middle weight axes.
        O: Leading size of the first weight (and trailing size, divided by
            ``groups``, of the second weight).
        strides, padding, groups: Forwarded to every convolution call.
        np_dtype: NumPy dtype the generated data is cast to.

    Returns:
        Tuple ``(time_mlx, time_torch, mlx_peak_mb, mlx_active_mb,
        pt_current_mb, pt_driver_mb)``.
    """
    scale = 1.0 / math.sqrt(kD * kH * kW * C)
    a_np = np.random.uniform(0, 0.5, (N, D, H, W, C))
    b_np = np.random.uniform(-scale, scale, (O, kD, kH, kW, int(C / groups)))
    # Second weight with the roles of C and O swapped, so the two convolutions
    # can be chained by the benchmarked closures.
    b_prime_np = np.random.uniform(-scale, scale, (C, kD, kH, kW, int(O / groups)))

    a_np, b_np, b_prime_np = map(lambda x: x.astype(np_dtype), (a_np, b_np, b_prime_np))
    a_mx, b_mx, b_prime_mx = map(lambda x: mx.array(x), (a_np, b_np, b_prime_np))
    # PyTorch tensors get the last axis moved to position 1 and live on "mps".
    a_pt, b_pt, b_prime_pt = map(
        lambda x: torch.from_numpy(x.transpose(0, 4, 1, 2, 3)).to("mps"),
        (a_np, b_np, b_prime_np),
    )

    torch.mps.synchronize()

    f_mx = make_mx_conv_3D(strides, padding, groups)
    f_pt = make_pt_conv_3D(strides, padding, groups)

    # PyTorch is timed first, then MLX.
    time_torch = bench(f_pt, a_pt, b_pt, b_prime_pt)
    time_mlx = bench(f_mx, a_mx, b_mx, b_prime_mx)

    # Measure MLX memory
    # Cache is cleared and the peak counter reset before a single convolution.
    mx.clear_cache()
    mx.reset_peak_memory()
    y = mx.conv3d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
    mx.eval(y)
    # Divided by 1024**2 (presumably bytes -> MiB; confirm the API's units).
    mlx_peak_mb = mx.get_peak_memory() / 1024**2
    mlx_active_mb = mx.get_active_memory() / 1024**2
    del y

    # Measure PyTorch MPS memory
    torch.mps.synchronize()
    torch.mps.empty_cache()
    y = torch.conv3d(a_pt, b_pt, stride=strides, padding=padding, groups=groups)
    torch.mps.synchronize()
    # Same 1024**2 scaling as for the MLX counters above.
    pt_current_mb = torch.mps.current_allocated_memory() / 1024**2
    pt_driver_mb = torch.mps.driver_allocated_memory() / 1024**2
    del y

    # Correctness check: the PyTorch reference runs on CPU copies and is
    # permuted so its axis 1 becomes the last axis before comparing.
    out_mx = mx.conv3d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
    out_pt = torch.conv3d(
        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1))
    out_pt = out_pt.numpy(force=True)

    # Looser tolerance for any dtype other than float32.
    atol = 2e-5 if np_dtype == np.float32 else 5e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} "
            f"[strides = {strides}, padding = {padding}, groups = {groups}] "
            f"with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch, mlx_peak_mb, mlx_active_mb, pt_current_mb, pt_driver_mb


if __name__ == "__main__":
    dtypes = ("float16", "float32")
    # Each row: (N, D, H, W, C, kD, kH, kW, O, strides, padding, groups)
    shapes = (
        # (C % 16 == 0)
        (4, 16, 16, 16, 32, 3, 3, 3, 32, (1, 1, 1), (1, 1, 1), 1),
        (4, 16, 16, 16, 64, 3, 3, 3, 64, (1, 1, 1), (1, 1, 1), 1),
        (4, 16, 16, 16, 128, 3, 3, 3, 128, (1, 1, 1), (1, 1, 1), 1),
        (4, 32, 32, 32, 64, 3, 3, 3, 64, (1, 1, 1), (1, 1, 1), 1),
        (4, 32, 32, 32, 128, 3, 3, 3, 128, (1, 1, 1), (1, 1, 1), 1),
        # Larger spatial dims
        (2, 64, 64, 64, 32, 3, 3, 3, 64, (1, 1, 1), (1, 1, 1), 1),
        (1, 64, 64, 64, 64, 3, 3, 3, 128, (1, 1, 1), (1, 1, 1), 1),
        # Strided
        (4, 32, 32, 32, 64, 3, 3, 3, 128, (2, 2, 2), (1, 1, 1), 1),
        # Asymmetric kernels
        (4, 32, 32, 32, 64, 3, 1, 1, 128, (1, 1, 1), (1, 0, 0), 1),
        (4, 32, 32, 32, 64, 1, 3, 3, 128, (1, 1, 1), (0, 1, 1), 1),
        # (C % 16 != 0)
        (4, 16, 16, 16, 21, 3, 3, 3, 21, (1, 1, 1), (1, 1, 1), 1),
        (4, 16, 16, 16, 55, 3, 3, 3, 55, (1, 1, 1), (1, 1, 1), 1),
        (4, 32, 32, 32, 55, 3, 3, 3, 55, (1, 1, 1), (1, 1, 1), 1),
        (4, 16, 16, 16, 3, 3, 3, 3, 32, (1, 1, 1), (1, 1, 1), 1),
    )

    for dtype in dtypes:
        np_dtype = getattr(np, dtype)
        # Banner and column header for this dtype.
        print(f"\n{'=' * 120}" f"\n  dtype: {dtype}" f"\n{'=' * 120}")
        print(
            f"{'(N,   D,   H,   W,   C)':<26s} {'(  O, kD, kH, kW,   C)':<24s} "
            f"{'stride':<12s} {'pads':<12s} {'groups':>6s} "
            f"{'diff%':>7s}  "
            f"{'MLX peak':>9s} {'MLX act':>8s} {'PT cur':>8s} {'PT drv':>8s}"
        )
        for shape in shapes:
            N, D, H, W, C, kD, kH, kW, O, strides, padding, groups = shape
            results = bench_shape(*shape, np_dtype)
            time_mlx, time_torch, mlx_peak, mlx_act, pt_cur, pt_drv = results
            # Positive when MLX is faster than PyTorch.
            diff = time_torch / time_mlx - 1.0

            print(
                f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), "
                f"{strides}, {padding}, {groups:6d}, "
                f"{100. * diff:+6.1f}%  "
                f"{mlx_peak:8.1f}  {mlx_act:7.1f}  {pt_cur:7.1f}  {pt_drv:7.1f}"
            )


================================================
FILE: benchmarks/python/conv3d_bench_cpu.py
================================================
import argparse
import math
import time

import mlx.core as mx
import numpy as np
import torch

# Number of untimed warm-up calls bench() makes before it starts the clock.
N_warmup = 1
# Number of timed calls bench() makes.
N_iter_bench = 10
# Number of convolutions issued per call of the benchmarked closures.
N_iter_func = 5
# Make the CPU the default MLX device for this script.
mx.set_default_device(mx.cpu)


def bench(f, a, b):
    """Measure how long ``N_iter_bench`` calls of ``f(a, b)`` take.

    ``f`` is called ``N_warmup`` times beforehand; those calls are not timed.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    for _ in range(N_warmup):
        f(a, b)

    tic = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    toc = time.perf_counter_ns()

    # Nanoseconds -> seconds.
    return (toc - tic) * 1e-9


def make_mx_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
    """Build the MLX workload for one 3D convolution configuration.

    Args:
        strides: Stride passed to ``mx.conv3d``. The default has one entry per
            spatial axis; it used to be a 2-tuple, which does not fit a 3D
            convolution.
        padding: Padding passed to ``mx.conv3d``; default fixed the same way.
        groups: Group count passed to ``mx.conv3d``.

    Returns:
        A callable ``mx_conv_3D(a, b)`` that issues ``N_iter_func`` identical
        convolutions, passes all results to ``mx.eval`` together and returns
        them as a list.
    """

    def mx_conv_3D(a, b):
        ys = [
            mx.conv3d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]
        mx.eval(ys)
        return ys

    return mx_conv_3D


def make_pt_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
    """Build the PyTorch workload for one 3D convolution configuration.

    Args:
        strides: Stride passed to ``torch.conv3d``. The default has one entry
            per spatial axis; it used to be a 2-tuple, which ``torch.conv3d``
            rejects.
        padding: Padding passed to ``torch.conv3d``; default fixed the same way.
        groups: Group count passed to ``torch.conv3d``.

    Returns:
        A callable ``pt_conv_3D(a, b)`` that returns a list with the results
        of ``N_iter_func`` identical convolutions, computed with gradient
        tracking disabled.
    """

    @torch.no_grad()
    def pt_conv_3D(a, b):
        return [
            torch.conv3d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]

    return pt_conv_3D


def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
    """Benchmark one 3D convolution configuration on MLX and PyTorch (CPU).

    Random input ``(N, D, H, W, C)`` and weight
    ``(O, kD, kH, kW, int(C / groups))`` arrays are generated with NumPy. MLX
    receives them as-is; PyTorch receives them with the last axis moved to
    position 1. After timing, one convolution is run with each framework and
    a message is printed when the results differ by more than the tolerance.

    Returns:
        Tuple ``(time_mlx, time_torch)`` with the durations reported by
        ``bench``.
    """
    conv_kwargs = dict(stride=strides, padding=padding, groups=groups)
    weight_bound = 1.0 / math.sqrt(kD * kH * kW * C)

    # The input is drawn before the weight, so RNG consumption is unchanged.
    a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
    b_np = np.random.uniform(
        -weight_bound, weight_bound, (O, kD, kH, kW, int(C / groups))
    ).astype(np_dtype)

    a_mx, b_mx = mx.array(a_np), mx.array(b_np)

    # Move the last axis to position 1 for the PyTorch tensors.
    axis_order = (0, 4, 1, 2, 3)
    a_pt = torch.from_numpy(a_np.transpose(axis_order)).to("cpu")
    b_pt = torch.from_numpy(b_np.transpose(axis_order)).to("cpu")

    # PyTorch is timed first, then MLX.
    time_torch = bench(make_pt_conv_3D(strides, padding, groups), a_pt, b_pt)
    time_mlx = bench(make_mx_conv_3D(strides, padding, groups), a_mx, b_mx)

    # Cross-check one result from each framework in the MLX axis order.
    out_mx = mx.conv3d(a_mx, b_mx, **conv_kwargs)
    out_pt = torch.conv3d(a_pt.to("cpu"), b_pt.to("cpu"), **conv_kwargs)
    out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1)).numpy(force=True)

    if np_dtype == np.float32:
        atol = 2e-5
    else:
        atol = 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    # The parser is constructed but no arguments are parsed from it.
    parser = argparse.ArgumentParser(description="Run conv benchmarks")

    dtypes = ("float32",)
    # Each row: (N, D, H, W, C, kD, kH, kW, O, strides, padding, groups)
    shapes = (
        (4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
        (4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
    )

    for dtype in dtypes:
        np_dtype = getattr(np, dtype)
        print(
            "(N,   D,   H,   W,   C), (  O, kD, kH, kW,   C),   dtype,    stride,      pads,  groups, diff%"
        )
        for shape in shapes:
            N, D, H, W, C, kD, kH, kW, O, strides, padding, groups = shape
            time_mlx, time_torch = bench_shape(*shape, np_dtype)
            # Positive when MLX is faster than PyTorch.
            diff = time_torch / time_mlx - 1.0

            print(
                f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
            )
            # Flag rows where MLX takes at least twice as long as PyTorch.
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/conv3d_train_bench_cpu.py
================================================
import time

import mlx.core as mx
import mlx.nn
import mlx.optimizers as opt
import torch


def bench_mlx(steps: int = 20, shape=(10, 32, 32, 32, 3)) -> float:
    """Time ``steps`` training iterations of a small 3D conv network in MLX.

    The default MLX device is set to the CPU. The duration of every iteration
    is printed in milliseconds.

    Args:
        steps: Number of timed training iterations.
        shape: Shape of the random input batch; the network is built with
            ``in_channels=3``, matching the last entry of the default.

    Returns:
        Sum of the per-iteration durations, in milliseconds.
    """
    mx.set_default_device(mx.cpu)

    class BenchNetMLX(mlx.nn.Module):
        # simple encoder-decoder net

        def __init__(self, in_channels, hidden_channels=16):
            super().__init__()

            # Two convolutions followed by two transposed convolutions (all
            # kernel_size=3, padding=1) with a ReLU between consecutive layers.
            self.net = mlx.nn.Sequential(
                mlx.nn.Conv3d(in_channels, hidden_channels, kernel_size=3, padding=1),
                mlx.nn.ReLU(),
                mlx.nn.Conv3d(
                    hidden_channels, 2 * hidden_channels, kernel_size=3, padding=1
                ),
                mlx.nn.ReLU(),
                mlx.nn.ConvTranspose3d(
                    2 * hidden_channels, hidden_channels, kernel_size=3, padding=1
                ),
                mlx.nn.ReLU(),
                mlx.nn.ConvTranspose3d(
                    hidden_channels, in_channels, kernel_size=3, padding=1
                ),
            )

        def __call__(self, input):
            return self.net(input)

    benchNet = BenchNetMLX(3)
    mx.eval(benchNet.parameters())
    optim = opt.Adam(learning_rate=1e-3)

    # Fixed random batch reused for every step.
    inputs = mx.random.normal(shape)

    params = benchNet.parameters()
    optim.init(params)

    # Model and optimizer state that is evaluated at the end of every step.
    state = [benchNet.state, optim.state]

    def loss_fn(params, image):
        # Load the given parameters, then score the network by the mean
        # absolute difference between its output and its input.
        benchNet.update(params)
        pred_image = benchNet(image)
        return (pred_image - image).abs().mean()

    def step(params, image):
        # One optimization step: loss and gradients, then an optimizer update.
        loss, grads = mx.value_and_grad(loss_fn)(params, image)
        optim.update(benchNet, grads)
        return loss

    total_time = 0.0
    print("MLX:")
    for i in range(steps):
        start_time = time.perf_counter()

        # The loss returned by step() is not used here.
        step(benchNet.parameters(), inputs)
        # Evaluating the state happens inside the timed region.
        mx.eval(state)
        end_time = time.perf_counter()

        # perf_counter() is in seconds; report and accumulate milliseconds.
        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
        total_time += (end_time - start_time) * 1000

    return total_time


def bench_torch(steps: int = 20, shape=(10, 3, 32, 32, 32)) -> float:
    """Time ``steps`` training iterations of a small 3D conv network in PyTorch.

    The model and a fixed random batch of the given ``shape`` live on the
    CPU. The duration of every iteration is printed in milliseconds.

    Args:
        steps: Number of timed training iterations.
        shape: Shape of the random input batch; the network is built with
            ``in_channels=3``, matching axis 1 of the default.

    Returns:
        Sum of the per-iteration durations, in milliseconds.
    """
    device = torch.device("cpu")

    class BenchNetTorch(torch.nn.Module):
        """Two convolutions followed by two transposed convolutions."""

        def __init__(self, in_channels, hidden_channels=16):
            super().__init__()

            wide_channels = 2 * hidden_channels
            conv_args = dict(kernel_size=3, padding=1)
            # Layers are created in network order, with ReLU in between.
            layers = [
                torch.nn.Conv3d(in_channels, hidden_channels, **conv_args),
                torch.nn.ReLU(),
                torch.nn.Conv3d(hidden_channels, wide_channels, **conv_args),
                torch.nn.ReLU(),
                torch.nn.ConvTranspose3d(wide_channels, hidden_channels, **conv_args),
                torch.nn.ReLU(),
                torch.nn.ConvTranspose3d(hidden_channels, in_channels, **conv_args),
            ]
            self.net = torch.nn.Sequential(*layers)

        def forward(self, input):
            return self.net(input)

    model = BenchNetTorch(3).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    inputs = torch.randn(*shape, device=device)

    total_time = 0.0
    print("PyTorch:")
    for i in range(steps):
        start_time = time.perf_counter()

        optimizer.zero_grad()
        # Mean absolute difference between the network output and its input.
        loss = (model(inputs) - inputs).abs().mean()
        loss.backward()
        optimizer.step()

        end_time = time.perf_counter()

        # perf_counter() is in seconds; report and accumulate milliseconds.
        print(f"{i:3d}, time={(end_time-start_time) * 1000:7.2f} ms")
        total_time += (end_time - start_time) * 1000

    return total_time


def main():
    """Run the MLX and PyTorch 3D training benchmarks and print a comparison."""
    steps = 10
    # MLX is benchmarked first, then PyTorch.
    time_mlx, time_torch = bench_mlx(steps), bench_torch(steps)

    summary = (
        f"average time of MLX:     {time_mlx/steps:9.2f} ms",
        f"total time of MLX:       {time_mlx:9.2f} ms",
        f"average time of PyTorch: {time_torch/steps:9.2f} ms",
        f"total time of PyTorch:   {time_torch:9.2f} ms",
    )
    for line in summary:
        print(line)

    # Positive when MLX is faster than PyTorch.
    diff = time_torch / time_mlx - 1.0
    print(f"torch/mlx diff: {100. * diff:+5.2f}%")


if __name__ == "__main__":
    main()


================================================
FILE: benchmarks/python/conv3d_transpose_bench_cpu.py
================================================
import argparse
import math
import time

import mlx.core as mx
import numpy as np
import torch

# Number of untimed warm-up calls bench() makes before it starts the clock.
N_warmup = 1
# Number of timed calls bench() makes.
N_iter_bench = 10
# Number of transposed convolutions issued per call of the benchmarked closures.
N_iter_func = 5
# Make the CPU the default MLX device for this script.
mx.set_default_device(mx.cpu)


def bench(f, a, b):
    """Time ``N_iter_bench`` calls of ``f(a, b)`` after ``N_warmup`` untimed ones.

    Returns:
        Elapsed wall-clock time of the timed calls, in seconds.
    """
    for _ in range(N_warmup):
        f(a, b)

    started_ns = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    finished_ns = time.perf_counter_ns()

    # Nanoseconds -> seconds.
    return (finished_ns - started_ns) * 1e-9


def make_mx_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
    """Build the MLX workload for one 3D transposed-convolution configuration.

    The returned callable issues ``N_iter_func`` identical
    ``mx.conv_transpose3d`` calls, passes all results to ``mx.eval`` together
    and returns them as a list.
    """

    def mx_conv_3D(a, b):
        outputs = [
            mx.conv_transpose3d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]
        mx.eval(outputs)
        return outputs

    return mx_conv_3D


def make_pt_conv_3D(strides=(1, 1, 1), padding=(0, 0, 0), groups=1):
    """Build the PyTorch workload for one 3D transposed-convolution configuration.

    The returned callable returns a list with the results of ``N_iter_func``
    identical ``torch.conv_transpose3d`` calls, computed with gradient
    tracking disabled.
    """

    def pt_conv_3D(a, b):
        with torch.no_grad():
            return [
                torch.conv_transpose3d(
                    a, b, stride=strides, padding=padding, groups=groups
                )
                for _ in range(N_iter_func)
            ]

    return pt_conv_3D


def bench_shape(N, D, H, W, C, kD, kH, kW, O, strides, padding, groups, np_dtype):
    """Benchmark one 3D transposed-convolution configuration on MLX and PyTorch (CPU).

    Random input ``(N, D, H, W, C)`` and weight
    ``(O, kD, kH, kW, int(C / groups))`` arrays are generated with NumPy. MLX
    receives them as-is; PyTorch receives the input with its last axis moved
    to position 1 and the weight with its last axis moved to the front. After
    timing, one transposed convolution is run with each framework and a
    message is printed when the results differ by more than the tolerance.

    Returns:
        Tuple ``(time_mlx, time_torch)`` with the durations reported by
        ``bench``.
    """
    conv_kwargs = dict(stride=strides, padding=padding, groups=groups)
    weight_bound = 1.0 / math.sqrt(kD * kH * kW * C)

    # The input is drawn before the weight, so RNG consumption is unchanged.
    a_np = np.random.uniform(0, 0.5, (N, D, H, W, C)).astype(np_dtype)
    b_np = np.random.uniform(
        -weight_bound, weight_bound, (O, kD, kH, kW, int(C / groups))
    ).astype(np_dtype)

    a_mx, b_mx = mx.array(a_np), mx.array(b_np)

    a_pt = torch.from_numpy(a_np.transpose((0, 4, 1, 2, 3))).to("cpu")
    # The weight's last axis becomes the leading axis for PyTorch.
    b_pt = torch.from_numpy(b_np.transpose((4, 0, 1, 2, 3))).to("cpu")

    # PyTorch is timed first, then MLX.
    time_torch = bench(make_pt_conv_3D(strides, padding, groups), a_pt, b_pt)
    time_mlx = bench(make_mx_conv_3D(strides, padding, groups), a_mx, b_mx)

    # Cross-check one result from each framework in the MLX axis order.
    out_mx = mx.conv_transpose3d(a_mx, b_mx, **conv_kwargs)
    out_pt = torch.conv_transpose3d(a_pt.to("cpu"), b_pt.to("cpu"), **conv_kwargs)
    out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1)).numpy(force=True)

    if np_dtype == np.float32:
        atol = 2e-5
    else:
        atol = 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, D, H, W, C)}, {(O, kD, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    # The parser is constructed but no arguments are parsed from it.
    parser = argparse.ArgumentParser(description="Run conv benchmarks")

    dtypes = ("float32",)
    # Each row: (N, D, H, W, C, kD, kH, kW, O, strides, padding, groups)
    shapes = (
        (4, 16, 16, 16, 16, 5, 5, 5, 16, (1, 1, 1), (2, 2, 2), 1),
        (4, 16, 16, 16, 32, 5, 5, 5, 32, (1, 1, 1), (2, 2, 2), 1),
    )

    for dtype in dtypes:
        np_dtype = getattr(np, dtype)
        print(
            "(N,   D,   H,   W,   C), (  O, kD, kH, kW,   C),   dtype,    stride,      pads,  groups, diff%"
        )
        for shape in shapes:
            N, D, H, W, C, kD, kH, kW, O, strides, padding, groups = shape
            time_mlx, time_torch = bench_shape(*shape, np_dtype)
            # Positive when MLX is faster than PyTorch.
            diff = time_torch / time_mlx - 1.0

            print(
                f"({N}, {D:3d}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kD:2d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
            )
            # Flag rows where MLX takes at least twice as long as PyTorch.
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/conv_bench.py
================================================
import argparse
import math
import os
import subprocess
import time

import mlx.core as mx
import numpy as np
import torch

# Read the CPU brand string reported by `sysctl -n machdep.cpu.brand_string`
# and strip surrounding newlines. Runs at import time.
device_name = subprocess.check_output(["sysctl", "-n", "machdep.cpu.brand_string"])
device_name = device_name.decode("utf-8").strip("\n")

# Number of untimed warm-up calls bench() makes before it starts the clock.
N_warmup = 10
# Number of timed calls bench() makes.
N_iter_bench = 100
# Number of convolutions issued per call of the benchmarked closures.
N_iter_func = 5


def bench(f, a, b):
    """Return the seconds taken by ``N_iter_bench`` calls of ``f(a, b)``.

    ``N_warmup`` untimed calls are made first, followed by
    ``torch.mps.synchronize()``, before the measurement starts.
    """
    for _ in range(N_warmup):
        f(a, b)
    torch.mps.synchronize()

    t_begin = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    elapsed_ns = time.perf_counter_ns() - t_begin
    # Nanoseconds -> seconds.
    return elapsed_ns * 1e-9


def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Build the MLX workload for one 2D convolution configuration.

    The returned callable issues ``N_iter_func`` identical ``mx.conv2d``
    calls on ``(a, b)``, passes all results to ``mx.eval`` together and
    returns them as a list.
    """

    def mx_conv_2D(a, b):
        outputs = [
            mx.conv2d(a, b, stride=strides, padding=padding, groups=groups)
            for _ in range(N_iter_func)
        ]
        mx.eval(outputs)
        return outputs

    return mx_conv_2D


def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Build the PyTorch workload for one 2D convolution configuration.

    The returned callable issues ``N_iter_func`` identical ``torch.conv2d``
    calls with gradient tracking disabled, calls ``torch.mps.synchronize()``
    and returns the results as a list.
    """

    def pt_conv_2D(a, b):
        with torch.no_grad():
            outputs = [
                torch.conv2d(a, b, stride=strides, padding=padding, groups=groups)
                for _ in range(N_iter_func)
            ]
            torch.mps.synchronize()
            return outputs

    return pt_conv_2D


def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
    """Benchmark one 2D convolution configuration on MLX and PyTorch (MPS).

    Random input ``(N, H, W, C)`` and weight ``(O, kH, kW, C // groups)``
    arrays are generated with NumPy. MLX receives them as-is; PyTorch receives
    them on the "mps" device with the last axis moved to position 1. After
    timing, one MLX result is compared against a PyTorch result computed on
    CPU copies, and a message is printed when they differ by more than the
    tolerance.

    Args:
        N, H, W, C: Input batch size, height, width and channel count.
        kH, kW: Kernel height and width.
        O: Number of output channels.
        strides, padding, groups: Forwarded to both convolution calls.
        np_dtype: NumPy dtype of the generated data.

    Returns:
        Tuple ``(time_mlx, time_torch)`` with the durations reported by
        ``bench``.
    """
    # Scale weights by the full kernel area kH * kW; the earlier kH * kH was
    # only right for square kernels.
    scale = 1.0 / math.sqrt(kH * kW * C)
    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
    # Integer division keeps the per-group channel count an exact int.
    b_np = np.random.uniform(-scale, scale, (O, kH, kW, C // groups)).astype(
        np_dtype
    )

    a_mx = mx.array(a_np)
    b_mx = mx.array(b_np)

    # Move the last axis to position 1 for the PyTorch tensors.
    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")

    torch.mps.synchronize()

    f_mx = make_mx_conv_2D(strides, padding, groups)
    f_pt = make_pt_conv_2D(strides, padding, groups)

    # PyTorch is timed first, then MLX.
    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

    # The PyTorch reference for the correctness check runs on CPU copies.
    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
    out_pt = torch.conv2d(
        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
    out_pt = out_pt.numpy(force=True)

    atol = 2e-5 if np_dtype == np.float32 else 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    # The parser is constructed but no arguments are parsed from it.
    parser = argparse.ArgumentParser(description="Run conv benchmarks")

    dtypes = ("float32",)
    # Each row: (N, H, W, C, kH, kW, O, strides, padding, groups)
    shapes = (
        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 2),
        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 16),
        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 64),
        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
    )

    for dtype in dtypes:
        np_dtype = getattr(np, dtype)
        print(
            "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
        )
        for shape in shapes:
            N, H, W, C, kH, kW, O, strides, padding, groups = shape
            time_mlx, time_torch = bench_shape(*shape, np_dtype)
            # Positive when MLX is faster than PyTorch.
            diff = time_torch / time_mlx - 1.0

            print(
                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {100. * diff:+5.2f}%"
            )
            # Flag rows where MLX takes at least twice as long as PyTorch.
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/conv_transpose_bench.py
================================================
import argparse
import math
import os
import subprocess
import time

import mlx.core as mx
import numpy as np
import torch

N_warmup = 10
N_iter_bench = 100
N_iter_func = 5


def bench(f, a, b):
    """Return the wall-clock seconds spent on ``N_iter_bench`` calls of ``f(a, b)``.

    ``f`` is first called ``N_warmup`` times and ``torch.mps.synchronize()`` is
    invoked before the clock starts, so warm-up work is not part of the
    measurement. ``f`` itself is expected to block until its work is finished.
    """
    remaining_warmup = N_warmup
    while remaining_warmup > 0:
        f(a, b)
        remaining_warmup -= 1
    torch.mps.synchronize()

    started_ns = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(a, b)
    finished_ns = time.perf_counter_ns()

    elapsed_ns = finished_ns - started_ns
    return elapsed_ns * 1e-9


def make_mx_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Build the MLX workload used by the transposed-convolution benchmark.

    The returned callable takes ``(a, b)``, issues ``N_iter_func`` calls to
    ``mx.conv_transpose2d`` with the captured stride/padding/groups, evaluates
    all of them with a single ``mx.eval`` and returns the list of outputs.
    """
    conv_options = dict(stride=strides, padding=padding, groups=groups)

    def mx_conv_transpose_2D(a, b):
        ys = [mx.conv_transpose2d(a, b, **conv_options) for _ in range(N_iter_func)]
        mx.eval(ys)
        return ys

    return mx_conv_transpose_2D


def make_pt_conv_transpose_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Build the PyTorch workload used by the transposed-convolution benchmark.

    The returned callable runs under ``torch.no_grad()``, issues
    ``N_iter_func`` calls to ``torch.conv_transpose2d`` with the captured
    settings, calls ``torch.mps.synchronize()`` and returns the outputs.
    """
    conv_options = dict(stride=strides, padding=padding, groups=groups)

    @torch.no_grad()
    def pt_conv_transpose_2D(a, b):
        ys = [
            torch.conv_transpose2d(a, b, **conv_options) for _ in range(N_iter_func)
        ]
        torch.mps.synchronize()
        return ys

    return pt_conv_transpose_2D


def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
    """Time MLX vs. PyTorch (MPS) transposed 2D convolution for one configuration.

    Args:
        N, H, W, C: Shape of the channels-last input ``(N, H, W, C)``.
        kH, kW, O: Kernel height/width and first weight dimension; the weight
            is created with shape ``(O, kH, kW, C // groups)``.
        strides, padding, groups: Forwarded to both convolution calls.
        np_dtype: Numpy dtype (or anything ``np.dtype`` accepts) for the data.

    Returns:
        ``(time_mlx, time_torch)`` in seconds as reported by ``bench``.

    A mismatch between the MLX result and a PyTorch CPU reference is reported
    on stdout; it does not raise.
    """
    # Scale by the kernel fan-in. The kernel area is kH * kW; the previous
    # kH * kH only matched for square kernels.
    scale = 1.0 / math.sqrt(kH * kW * C)
    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
    b_np = np.random.uniform(-scale, scale, (O, kH, kW, C // groups)).astype(
        np_dtype
    )

    a_mx = mx.array(a_np)
    b_mx = mx.array(b_np)

    # PyTorch wants channels-first inputs; for the transposed conv the weight
    # is permuted so that its last (C // groups) axis comes first.
    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
    b_pt = torch.from_numpy(b_np.transpose((3, 0, 1, 2))).to("mps")

    torch.mps.synchronize()

    f_mx = make_mx_conv_transpose_2D(strides, padding, groups)
    f_pt = make_pt_conv_transpose_2D(strides, padding, groups)

    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

    # Correctness check against a PyTorch CPU reference.
    out_mx = mx.conv_transpose2d(
        a_mx, b_mx, stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.conv_transpose2d(
        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
    out_pt = out_pt.numpy(force=True)

    # Normalise through np.dtype so that string specs such as "float32" also
    # select the float32 tolerance.
    atol = 2e-5 if np.dtype(np_dtype) == np.float32 else 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    # The parser is constructed but defines and parses no arguments.
    parser = argparse.ArgumentParser(description="Run conv benchmarks")

    dtypes = ("float32",)
    # Each row is (N, H, W, C, kH, kW, O, strides, padding, groups).
    shapes = (
        (4, 32, 32, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 32, 32, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 32, 32, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 32, 32, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        (4, 32, 32, 512, 5, 5, 512, (1, 1), (2, 2), 1),
        (4, 64, 64, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 64, 64, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 64, 64, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 64, 64, 256, 5, 5, 256, (1, 1), (2, 2), 1),
        (4, 128, 128, 32, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 64, (1, 1), (2, 2), 1),
        (4, 128, 128, 128, 5, 5, 128, (1, 1), (2, 2), 1),
        (4, 256, 256, 32, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 256, 256, 3, 5, 5, 32, (1, 1), (2, 2), 1),
        (4, 128, 128, 64, 5, 5, 3, (1, 1), (2, 2), 1),
        (4, 128, 128, 3, 5, 5, 64, (1, 1), (2, 2), 1),
    )

    header = "(N,   H,   W,   C), (  O, kH, kW,   C),   dtype, stride,   pads,  groups, diff%"
    for dtype in dtypes:
        np_dtype = getattr(np, dtype)
        print(header)
        for config in shapes:
            N, H, W, C, kH, kW, O, strides, padding, groups = config
            time_mlx, time_torch = bench_shape(*config, np_dtype)
            # Positive means MLX was faster than PyTorch.
            percent = 100.0 * (time_torch / time_mlx - 1.0)

            print(
                f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kH:2d}, {kW:2d}, {C:3d}), {dtype}, {strides}, {padding}, {groups:7d}, {percent:+5.2f}%"
            )
            # Flag configurations where MLX takes at least twice as long.
            if time_mlx >= 2.0 * time_torch:
                print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/conv_unaligned_bench.py
================================================
import math
import time

import mlx.core as mx
import numpy as np
import torch

N_warmup = 10
N_iter_bench = 100
N_iter_func = 5


def bench(f, a, b):
    """Measure the seconds taken by ``N_iter_bench`` consecutive ``f(a, b)`` calls.

    ``N_warmup`` untimed calls are made first, followed by a
    ``torch.mps.synchronize()`` so the timed region starts from an idle queue.
    """

    def run(times):
        for _ in range(times):
            f(a, b)

    run(N_warmup)
    torch.mps.synchronize()

    tic_ns = time.perf_counter_ns()
    run(N_iter_bench)
    toc_ns = time.perf_counter_ns()
    return (toc_ns - tic_ns) * 1e-9


def make_mx_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Return a callable running ``N_iter_func`` MLX 2D convolutions per call.

    The callable takes ``(a, b)``, builds the outputs with ``mx.conv2d`` using
    the captured settings, forces them with one ``mx.eval`` and returns them.
    """
    settings = {"stride": strides, "padding": padding, "groups": groups}

    def mx_conv_2D(a, b):
        ys = [mx.conv2d(a, b, **settings) for _ in range(N_iter_func)]
        mx.eval(ys)
        return ys

    return mx_conv_2D


def make_pt_conv_2D(strides=(1, 1), padding=(0, 0), groups=1):
    """Return a callable running ``N_iter_func`` PyTorch 2D convolutions per call.

    The callable runs without gradient tracking, calls ``torch.conv2d`` with
    the captured settings and synchronizes the MPS device before returning the
    list of outputs.
    """
    settings = {"stride": strides, "padding": padding, "groups": groups}

    @torch.no_grad()
    def pt_conv_2D(a, b):
        ys = [torch.conv2d(a, b, **settings) for _ in range(N_iter_func)]
        torch.mps.synchronize()
        return ys

    return pt_conv_2D


def bench_shape(N, H, W, C, kH, kW, O, strides, padding, groups, np_dtype):
    """Time MLX vs. PyTorch (MPS) 2D convolution for one configuration.

    Args:
        N, H, W, C: Shape of the channels-last input ``(N, H, W, C)``.
        kH, kW, O: Kernel height/width and number of output channels; the
            weight is created with shape ``(O, kH, kW, C // groups)``.
        strides, padding, groups: Forwarded to both convolution calls.
        np_dtype: Numpy dtype, or anything ``np.dtype`` accepts (for example
            the string ``"float32"``).

    Returns:
        ``(time_mlx, time_torch)`` in seconds as reported by ``bench``.

    A mismatch between the MLX result and a PyTorch CPU reference is reported
    on stdout; it does not raise.
    """
    # Scale by the kernel fan-in. The kernel area is kH * kW; the previous
    # kH * kH only matched for square kernels.
    scale = 1.0 / math.sqrt(kH * kW * C)
    a_np = np.random.uniform(0, 0.5, (N, H, W, C)).astype(np_dtype)
    b_np = np.random.uniform(-scale, scale, (O, kH, kW, C // groups)).astype(
        np_dtype
    )

    a_mx = mx.array(a_np)
    b_mx = mx.array(b_np)

    # PyTorch wants channels-first layouts for both input and weight.
    a_pt = torch.from_numpy(a_np.transpose((0, 3, 1, 2))).to("mps")
    b_pt = torch.from_numpy(b_np.transpose((0, 3, 1, 2))).to("mps")

    torch.mps.synchronize()

    f_mx = make_mx_conv_2D(strides, padding, groups)
    f_pt = make_pt_conv_2D(strides, padding, groups)

    time_torch = bench(f_pt, a_pt, b_pt)
    time_mlx = bench(f_mx, a_mx, b_mx)

    # Correctness check against a PyTorch CPU reference.
    out_mx = mx.conv2d(a_mx, b_mx, stride=strides, padding=padding, groups=groups)
    out_pt = torch.conv2d(
        a_pt.to("cpu"), b_pt.to("cpu"), stride=strides, padding=padding, groups=groups
    )
    out_pt = torch.permute(out_pt, (0, 2, 3, 1))
    out_pt = out_pt.numpy(force=True)

    # `"float32" == np.float32` is False, so a string dtype used to fall
    # through to the looser tolerance. Normalising via np.dtype fixes that.
    atol = 2e-5 if np.dtype(np_dtype) == np.float32 else 1e-4

    if not np.allclose(out_pt, out_mx, atol=atol):
        print(
            f"Failed at {(N, H, W, C)}, {(O, kH, kW, C)} [strides = {strides}, padding = {padding}, groups = {groups}] with max(|a - b|) = {np.max(np.abs(out_pt - out_mx))}"
        )

    return time_mlx, time_torch


if __name__ == "__main__":
    dtype = "float32"
    # Resolve the name to the numpy type before handing it to bench_shape,
    # whose tolerance selection compares the argument against np.float32
    # (a plain string never compares equal to it).
    np_dtype = getattr(np, dtype)
    # Each row is (N, H, W, C, kH, kW, O); stride (1, 1), padding (0, 0) and
    # groups 1 are used for every row.
    shapes = (
        (4, 32, 32, 21, 3, 3, 128),
        (4, 32, 32, 21, 3, 3, 37),
        (4, 32, 32, 370, 3, 3, 370),
        (4, 32, 32, 370, 7, 7, 128),
        (2, 320, 640, 21, 7, 7, 21),
    )
    for N, H, W, C, kh, kw, O in shapes:
        time_mlx, time_torch = bench_shape(
            N, H, W, C, kh, kw, O, (1, 1), (0, 0), 1, np_dtype
        )
        # Positive means MLX was faster than PyTorch.
        diff = time_torch / time_mlx - 1.0

        print(
            f"({N}, {H:3d}, {W:3d}, {C:3d}), ({O:3d}, {kh:2d}, {kw:2d}, {C:3d}), {dtype}, {100. * diff:+5.2f}%"
        )
        # Flag configurations where MLX takes at least twice as long.
        if time_mlx >= 2.0 * time_torch:
            print("ATTENTION ^^^^^^^")


================================================
FILE: benchmarks/python/distributed_bench.py
================================================
# Copyright © 2024 Apple Inc.

"""
Run with:
    mpirun -n 2 python /path/to/distributed_bench.py
"""

import time

import mlx.core as mx


def time_fn(fn, *args, **kwargs):
    """Time ``fn(*args, **kwargs)`` and print the mean latency on rank 0.

    Args:
        fn: Callable whose result is passed to ``mx.eval``.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``. The optional ``msg``
            key is removed first and, when truthy, replaces ``fn.__name__`` in
            the printed label.

    Every rank runs the warm-up and timed loops; only the rank whose
    ``world.rank()`` is 0 prints.
    """
    msg = kwargs.pop("msg", None)
    world = mx.distributed.init()
    if world.rank() == 0:
        label = msg if msg else fn.__name__
        print(f"Timing {label} ...", end=" ")

    # warmup
    for _ in range(5):
        mx.eval(fn(*args, **kwargs))

    num_iters = 100
    tic = time.perf_counter()
    for _ in range(num_iters):
        # The result is evaluated for its side effect only; nothing is kept.
        mx.eval(fn(*args, **kwargs))
    toc = time.perf_counter()

    msec = 1e3 * (toc - tic) / num_iters
    if world.rank() == 0:
        print(f"{msec:.5f} msec")


def time_all_sum():
    """Time three 20-step chains: sine only, all_sum only, and both interleaved.

    Each chain is applied to a uniform random vector of 4096 elements and is
    reported through ``time_fn``, which labels results with the function name.
    """
    chain_length = 20
    data = mx.random.uniform(shape=(4096,))
    mx.eval(data)

    def sine(v):
        step = 0
        while step < chain_length:
            v = mx.sin(v)
            step += 1
        return v

    def all_sum_plain(v):
        step = 0
        while step < chain_length:
            v = mx.distributed.all_sum(v)
            step += 1
        return v

    def all_sum_with_sine(v):
        step = 0
        while step < chain_length:
            v = mx.distributed.all_sum(mx.sin(v))
            step += 1
        return v

    for workload in (sine, all_sum_plain, all_sum_with_sine):
        time_fn(workload, data)


if __name__ == "__main__":
    # Script entry point: run the all_sum timing suite.
    time_all_sum()


================================================
FILE: benchmarks/python/einsum_bench.py
================================================
# Copyright © 2024 Apple Inc.

import time

import mlx.core as mx
import numpy as np


def timeit(fn, its=100, args=()):
    """Return the mean wall-clock time of ``fn(*args)`` in milliseconds.

    Args:
        fn: Callable to time.
        its: Number of timed calls.
        args: Positional arguments unpacked into every call. The default is an
            immutable empty tuple, so no mutable object is shared across calls.

    Returns:
        Average milliseconds per call over ``its`` timed calls, measured after
        five untimed warm-up calls.
    """
    for _ in range(5):
        fn(*args)
    tic = time.perf_counter()
    for _ in range(its):
        fn(*args)
    toc = time.perf_counter()
    return 1e3 * (toc - tic) / its


def time_little_einsum_path():
    """Compare ``einsum_path`` planning time in MLX and NumPy for a 32x32 matmul."""
    subscripts = "ik,kj->ij"
    mx_operands = (mx.ones((32, 32)), mx.ones((32, 32)))
    mx_time = timeit(mx.einsum_path, args=(subscripts, *mx_operands))

    # Same operands converted to NumPy arrays for the reference timing.
    np_operands = tuple(np.array(operand) for operand in mx_operands)
    np_time = timeit(np.einsum_path, args=(subscripts, *np_operands))

    for line in (
        "Timing little einsum path...",
        f"MLX ... {mx_time:.3f} ms",
        f"NumPy... {np_time:.3f} ms",
    ):
        print(line)


def time_big_einsum_path():
    """Compare ``einsum_path`` planning time for a random 10-operand contraction.

    Every operand gets five distinct labels drawn from ``"abcdefgh"``. A
    label's axis length is its position in that string, so ``"a"`` maps to a
    length of 0.
    """
    chars = list("abcdefgh")
    char_to_dim = dict(zip(chars, range(len(chars))))

    num_inputs = 10
    # One random draw per operand, in operand order.
    label_sets = [
        np.random.choice(chars, size=5, replace=False).tolist()
        for _ in range(num_inputs)
    ]
    subscripts = ",".join("".join(labels) for labels in label_sets)
    np_inputs = [np.ones([char_to_dim[c] for c in labels]) for labels in label_sets]

    np_time = timeit(np.einsum_path, args=(subscripts, *np_inputs))

    mx_inputs = [mx.array(operand) for operand in np_inputs]
    mx_time = timeit(mx.einsum_path, args=(subscripts, *mx_inputs))

    for line in (
        "Timing big einsum path...",
        f"MLX ... {mx_time:.3f} ms",
        f"NumPy... {np_time:.3f} ms",
    ):
        print(line)


def time_attention():
    """Time self-attention written with matmuls against an einsum formulation.

    Both variants use the same ``(8, 512, 32, 128)`` array as queries, keys
    and values and force evaluation of their output.
    """

    def regular_attention(x):
        # shape [batch, sequence, num_heads, head_dim]
        q_heads_first = x.transpose(0, 2, 1, 3)
        k_transposed = x.transpose(0, 2, 3, 1)
        v_heads_first = x.transpose(0, 2, 1, 3)
        weights = mx.softmax(q_heads_first @ k_transposed, axis=-1)
        mx.eval((weights @ v_heads_first).swapaxes(1, 2))

    def einsum_attention(x):
        # shape [batch, sequence, num_heads, head_dim]
        weights = mx.softmax(mx.einsum("itjk,iujk->ijtu", x, x), axis=-1)
        mx.eval(mx.einsum("ijtu,iujk->itjk", weights, x))

    x = mx.random.uniform(shape=(8, 512, 32, 128))

    regular_time = timeit(regular_attention, args=(x,))
    ein_time = timeit(einsum_attention, args=(x,))
    for line in (
        "Timing einsum attention...",
        f"Regular ... {regular_time:.3f} ms",
        f"Einsum ... {ein_time:.3f} ms",
    ):
        print(line)


if __name__ == "__main__":
    # Script entry point: run the einsum_path timings first, then the
    # attention comparison.
    time_little_einsum_path()
    time_big_einsum_path()
    time_attention()


================================================
FILE: benchmarks/python/fft_bench.py
================================================
# Copyright © 2024 Apple Inc.

import matplotlib
import mlx.core as mx
import numpy as np
import sympy
import torch
from time_utils import measure_runtime

matplotlib.use("Agg")
import matplotlib.pyplot as plt


def bandwidth_gb(runtime_ms, system_size):
    """Convert a runtime in milliseconds into an effective bandwidth in GB/s.

    Each of the ``system_size`` elements is charged twice the ``complex64``
    item size (presumably one read plus one write -- the code only shows the
    factor of two).
    """
    bytes_per_element = np.dtype(np.complex64).itemsize * 2
    bytes_per_ms = system_size * bytes_per_element / runtime_ms
    # 1e3 ms per second, 1e9 bytes per GB.
    return bytes_per_ms * 1e3 / 1e9


def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
    """Measure FFT bandwidth for every transform size in ``fft_sizes``.

    Args:
        system_size: Approximate number of complex elements per benchmark
            array; the batch size is ``system_size // n**dim``.
        fft_sizes: Iterable of transform lengths ``n``.
        backend: ``"mlx"`` (``mx.fft``) or ``"mps"`` (``torch.fft`` on the MPS
            device).
        dim: 1 for ``fft`` or 2 for ``fft2``.

    Returns:
        Numpy array with one bandwidth value (GB/s) per entry of ``fft_sizes``.

    Raises:
        ValueError: If ``dim`` is not 1 or 2.
        NotImplementedError: If ``backend`` is not recognised.
    """
    if dim not in (1, 2):
        # Previously this surfaced as an unbound local inside the FFT helper.
        raise ValueError(f"dim must be 1 or 2, got {dim}")

    def fft_mlx(x):
        out = mx.fft.fft(x) if dim == 1 else mx.fft.fft2(x)
        mx.eval(out)
        return out

    def fft_mps(x):
        out = torch.fft.fft(x) if dim == 1 else torch.fft.fft2(x)
        torch.mps.synchronize()
        return out

    bandwidths = []
    for n in fft_sizes:
        batch_size = system_size // n**dim
        shape = [batch_size] + [n for _ in range(dim)]
        if backend not in ("mlx", "mps"):
            raise NotImplementedError()

        # Build the input with the same shape that the bandwidth figure is
        # computed from. For dim == 1 this equals the former
        # (system_size // n, n); for dim == 2 the former 2-D array did not
        # match `shape`.
        x_np = np.random.uniform(size=shape).astype(np.complex64)
        if backend == "mlx":
            x = mx.array(x_np)
            mx.eval(x)
            fft = fft_mlx
        else:
            x = torch.tensor(x_np, device="mps")
            torch.mps.synchronize()
            fft = fft_mps

        runtime_ms = measure_runtime(fft, x=x)
        bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
        print(n, bandwidth)
        bandwidths.append(bandwidth)

    return np.array(bandwidths)


def time_fft():
    """Benchmark 1D FFTs for sizes 2..511 on MLX GPU, PyTorch MPS and MLX CPU.

    Saves one scatter plot per size category (all sizes, sizes whose prime
    factors are all <= 13, and the remaining sizes) and prints average
    bandwidths plus the share of sizes where MLX GPU beats MPS.
    """
    x = np.arange(2, 512)
    gpu_system_size = int(2**26)
    cpu_system_size = int(2**20)

    print("MLX GPU")
    with mx.stream(mx.gpu):
        gpu_bandwidths = run_bench(system_size=gpu_system_size, fft_sizes=x)

    print("MPS GPU")
    mps_bandwidths = run_bench(
        system_size=gpu_system_size, fft_sizes=x, backend="mps"
    )

    print("CPU")
    with mx.stream(mx.cpu):
        cpu_bandwidths = run_bench(system_size=cpu_system_size, fft_sizes=x)

    # Split the sizes by whether every prime factor is at most 13.
    small_factor_sizes = []
    large_factor_sizes = []
    for size in x:
        if all(p <= 13 for p in sympy.primefactors(size)):
            small_factor_sizes.append(size)
        else:
            large_factor_sizes.append(size)

    # Sizes start at x[0], so subtracting it turns a size into its index.
    offset = x[0]
    categories = [
        (x - offset, "All"),
        (np.array(small_factor_sizes) - offset, "Radix 2-13"),
        (np.array(large_factor_sizes) - offset, "Bluestein's"),
    ]
    series = [
        (gpu_bandwidths, "green", "GPU"),
        (mps_bandwidths, "blue", "MPS"),
        (cpu_bandwidths, "red", "CPU"),
    ]

    for indices, name in categories:
        # plot bandwidths
        print(name)
        for values, color, label in series:
            plt.scatter(x[indices], values[indices], color=color, label=label)
        plt.title(f"MLX FFT Benchmark -- {name}")
        plt.xlabel("N")
        plt.ylabel("Bandwidth (GB/s)")
        plt.legend()
        plt.savefig(f"{name}.png")
        plt.clf()

    print("Average bandwidths:")
    print("GPU:", np.mean(gpu_bandwidths))
    print("MPS:", np.mean(mps_bandwidths))
    print("CPU:", np.mean(cpu_bandwidths))

    faster_count = len(np.where(gpu_bandwidths > mps_bandwidths)[0])
    portion_faster = faster_count / len(x)
    print("Percent MLX faster than MPS: ", portion_faster * 100)


if __name__ == "__main__":
    # Script entry point: run the FFT benchmark and write the plots.
    time_fft()


================================================
FILE: benchmarks/python/gather_bench.py
================================================
# Copyright © 2023-2024 Apple Inc.

import argparse

import mlx.core as mx
import torch
from time_utils import measure_runtime


def benchmark_gather_mlx(x_shape, idx_shape):
    """Time ``x[idx]`` in MLX and print the runtime.

    Indices are drawn with ``mx.random.randint(0, x_shape[0] - 1, idx_shape)``
    and ``x`` is a float32 normal sample of shape ``x_shape``.
    """

    # measure_runtime passes these by keyword, so the names must stay.
    def gather(x, idx):
        gathered = x[idx]
        mx.eval(gathered)

    upper = x_shape[0] - 1
    indices = mx.random.randint(0, upper, idx_shape)
    values = mx.random.normal(x_shape).astype(mx.float32)

    runtime = measure_runtime(gather, x=values, idx=indices)
    print(f"MLX: {runtime:.3f}ms")


def benchmark_gather_torch(x_shape, idx_shape, device):
    """Time ``x[idx]`` in PyTorch on ``device`` and print the runtime.

    The MPS device is synchronized after each gather; no synchronization is
    done for other devices.
    """
    mps_device = torch.device("mps")

    # measure_runtime passes these by keyword, so the names must stay.
    def gather(x, idx, device):
        _ = x[idx]
        if device != mps_device:
            return
        torch.mps.synchronize()

    upper = x_shape[0] - 1
    indices = torch.randint(0, upper, idx_shape).to(device)
    values = torch.randn(x_shape, dtype=torch.float32).to(device)

    runtime = measure_runtime(gather, x=values, idx=indices, device=device)
    print(f"PyTorch: {runtime:.3f}ms")


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Gather benchmarks.")
    parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
    args = parser.parse_args()

    # --cpu switches both frameworks to the CPU; otherwise PyTorch uses MPS
    # and MLX keeps its default device.
    if args.cpu:
        mx.set_default_device(mx.cpu)
    device = torch.device("cpu" if args.cpu else "mps")

    idx_shapes = [(1_000_000,), (100_000,), ()]
    x_shapes = [(100, 64), (100, 1024), (4, 1_000_000)]
    separator = "=" * 20

    for case in range(min(len(x_shapes), len(idx_shapes))):
        x_shape, idx_shape = x_shapes[case], idx_shapes[case]
        print(separator)
        print(f"X {x_shape}, Indices {idx_shape}")
        benchmark_gather_mlx(x_shape, idx_shape)
        benchmark_gather_torch(x_shape, idx_shape, device=device)


================================================
FILE: benchmarks/python/gather_mm_bench.py
================================================
# Copyright © 2025 Apple Inc.

import mlx.core as mx
from time_utils import time_fn

N = 1024
D = 1024
M = 1024
E = 32
I = 4


def gather_sort(x, indices):
    """Reorder the rows of ``x`` so that their flattened ``indices`` are sorted.

    ``indices`` must be 2-D. Returns ``(rows, sorted_indices, inv_order)``,
    where ``inv_order`` is the argsort of the sorting permutation (used by
    ``scatter_unsort`` to undo the reordering).
    """
    _, per_row = indices.shape
    flat_indices = indices.flatten()
    order = mx.argsort(flat_indices)
    inv_order = mx.argsort(order)
    # `order // per_row` maps each flattened index slot back to its source row.
    source_rows = x.flatten(0, -3)
    return source_rows[order // per_row], flat_indices[order], inv_order


def scatter_unsort(x, inv_order, shape=None):
    """Index ``x`` with ``inv_order`` and optionally unflatten its first axis.

    When ``shape`` is given, axis 0 of the reordered array is unflattened to
    ``shape``; otherwise the reordered array is returned as is.
    """
    restored = x[inv_order]
    if shape is None:
        return restored
    return mx.unflatten(restored, 0, shape)


def gather_mm_simulate(x, w, indices):
    """Emulate two chained gathered matmuls with one explicit matmul per row.

    Rows are sorted with ``gather_sort``, multiplied twice by the transpose of
    the weight slice selected by their index, and finally restored to the
    original order and shape with ``scatter_unsort``.
    """
    rows, selected, inv_order = gather_sort(x, indices)
    for _ in range(2):
        products = []
        for row, expert in enumerate(selected.tolist()):
            products.append(rows[row] @ w[expert].T)
        # Re-insert the singleton axis so the next pass sees the same layout.
        rows = mx.concatenate(products, axis=0)[:, None]
    return scatter_unsort(rows, inv_order, indices.shape)


def time_gather_mm():
    """Time two chained ``mx.gather_mm`` calls against a plain double matmul.

    The gathered variant is timed with unsorted indices, with pre-sorted
    indices, and with sorting done inside the timed function. The baseline
    multiplies ``N * I`` rows by two dense matrices.
    """
    norm = 1024**0.5

    x = mx.random.normal((N, 1, 1, D)) / norm
    w1 = mx.random.normal((E, M, D)) / norm
    w2 = mx.random.normal((E, D, M)) / norm
    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
    mx.eval(x, w1, w2, indices, sorted_indices)

    def gather_mm(x, w1, w2, indices, sort):
        if sort:
            x, idx, inv_order = gather_sort(x, indices)
        else:
            idx, inv_order = indices, None
        for weight in (w1, w2):
            x = mx.gather_mm(
                x, weight.swapaxes(-1, -2), rhs_indices=idx, sorted_indices=sort
            )
        if not sort:
            return x
        return scatter_unsort(x, inv_order, indices.shape)

    for index_arg, sort_flag in (
        (indices, False),
        (sorted_indices, False),
        (indices, True),
    ):
        time_fn(gather_mm, x, w1, w2, index_arg, sort_flag)

    # Dense baseline with the same number of rows as the gathered variant.
    x = mx.random.normal((N * I, D)) / norm
    w1 = mx.random.normal((M, D)) / norm
    w2 = mx.random.normal((D, M)) / norm
    mx.eval(x, w1, w2)

    def equivalent_matmul(x, w1, w2):
        return (x @ w1.T) @ w2.T

    time_fn(equivalent_matmul, x, w1, w2)


if __name__ == "__main__":
    # Script entry point: run the gather_mm timings.
    time_gather_mm()


================================================
FILE: benchmarks/python/gather_qmm_bench.py
================================================
# Copyright © 2025 Apple Inc.

import mlx.core as mx
from time_utils import time_fn

N = 1024
D = 1024
M = 1024
E = 32
I = 4


def gather_sort(x, indices):
    """Reorder the rows of ``x`` so that their flattened ``indices`` are sorted.

    ``indices`` must be 2-D. Returns ``(rows, sorted_indices, inv_order)``,
    where ``inv_order`` is the argsort of the sorting permutation (used by
    ``scatter_unsort`` to undo the reordering).
    """
    _, per_row = indices.shape
    flat_indices = indices.flatten()
    order = mx.argsort(flat_indices)
    inv_order = mx.argsort(order)
    # `order // per_row` maps each flattened index slot back to its source row.
    source_rows = x.flatten(0, -3)
    return source_rows[order // per_row], flat_indices[order], inv_order


def scatter_unsort(x, inv_order, shape=None):
    """Index ``x`` with ``inv_order`` and optionally unflatten its first axis.

    When ``shape`` is given, axis 0 of the reordered array is unflattened to
    ``shape``; otherwise the reordered array is returned as is.
    """
    restored = x[inv_order]
    if shape is None:
        return restored
    return mx.unflatten(restored, 0, shape)


def gather_mm_simulate(x, w, indices):
    """Emulate two chained gathered quantized matmuls, one row at a time.

    ``w`` is indexed as ``w[0]``, ``w[1]`` and ``w[2]``, which are passed in
    that order to ``mx.quantized_matmul`` after selecting the slice for each
    row's index. Rows are sorted first and restored at the end.
    """
    rows, selected, inv_order = gather_sort(x, indices)
    for _ in range(2):
        products = []
        for row, expert in enumerate(selected.tolist()):
            products.append(
                mx.quantized_matmul(
                    rows[row],
                    w[0][expert],
                    w[1][expert],
                    w[2][expert],
                    transpose=True,
                )
            )
        # Re-insert the singleton axis so the next pass sees the same layout.
        rows = mx.concatenate(products, axis=0)[:, None]
    return scatter_unsort(rows, inv_order, indices.shape)


def time_gather_qmm():
    """Time two chained ``mx.gather_qmm`` calls against plain quantized matmuls.

    The gathered variant is timed with unsorted indices, with pre-sorted
    indices, and with sorting done inside the timed function. The baseline
    applies two ``mx.quantized_matmul`` calls to ``N * I`` rows.
    """
    norm = 1024**0.5

    x = mx.random.normal((N, 1, 1, D)) / norm
    w1 = mx.random.normal((E, M, D)) / norm
    w2 = mx.random.normal((E, D, M)) / norm
    # After quantization w1/w2 are unpacked with `*` into the matmul calls.
    w1, w2 = mx.quantize(w1), mx.quantize(w2)
    indices = (mx.random.uniform(shape=(N, I)) * E).astype(mx.uint32)
    sorted_indices = mx.sort(indices.flatten()).reshape(N, I)
    mx.eval(x, w1, w2, indices, sorted_indices)

    def gather_mm(x, w1, w2, indices, sort):
        if sort:
            x, idx, inv_order = gather_sort(x, indices)
        else:
            idx, inv_order = indices, None
        for weight in (w1, w2):
            x = mx.gather_qmm(
                x, *weight, transpose=True, rhs_indices=idx, sorted_indices=sort
            )
        if not sort:
            return x
        return scatter_unsort(x, inv_order, indices.shape)

    for index_arg, sort_flag in (
        (indices, False),
        (sorted_indices, False),
        (indices, True),
    ):
        time_fn(gather_mm, x, w1, w2, index_arg, sort_flag)

    # Dense baseline with the same number of rows as the gathered variant.
    x = mx.random.normal((N * I, D)) / norm
    w1 = mx.random.normal((M, D)) / norm
    w2 = mx.random.normal((D, M)) / norm
    w1, w2 = mx.quantize(w1), mx.quantize(w2)
    mx.eval(x, w1, w2)

    def equivalent_matmul(x, w1, w2):
        hidden = mx.quantized_matmul(x, *w1, transpose=True)
        return mx.quantized_matmul(hidden, *w2, transpose=True)

    time_fn(equivalent_matmul, x, w1, w2)


if __name__ == "__main__":
    # Script entry point: run the gather_qmm timings.
    time_gather_qmm()


================================================
FILE: benchmarks/python/hadamard_bench.py
================================================
import argparse

import matplotlib
import mlx.core as mx
import numpy as np
from time_utils import measure_runtime

matplotlib.use("Agg")
import matplotlib.pyplot as plt


def had(x):
    """Apply ``mx.hadamard_transform`` to ``x`` and force its evaluation."""
    mx.eval(mx.hadamard_transform(x))


def copy(x):
    """Evaluate ``x + 1.0``; ``run`` records this workload under the "copy" key."""
    mx.eval(x + 1.0)


def run(dtype):
    """Benchmark the Hadamard transform against an elementwise baseline.

    Sizes ``n = m * 2**k`` are tried for ``m`` in (1, 12, 20, 28) and ``k`` in
    7..13, skipping any ``n`` above ``2**15``. Bandwidths are printed and a
    scatter plot is saved to ``bench_<dtype name>.png``.

    Args:
        dtype: Numpy scalar type (its ``__name__`` is used in the title and
            file name).
    """
    system_size = 2**26
    largest_n = 2**15
    bytes_per_gb = 1e9
    ms_per_s = 1e3

    def series_key(test_fn, m):
        if test_fn == copy:
            return "copy"
        return "had_2^k" if m == 1 else "had_m*2^k"

    outputs = {}
    for test_fn in (had, copy):
        for m in [1, 12, 20, 28]:
            key = series_key(test_fn, m)
            outputs.setdefault(key, {})
            sizes = [m * 2**k for k in range(7, 14)]
            for n in (size for size in sizes if size <= largest_n):
                x_np = np.random.normal(size=(system_size // n, n)).astype(dtype)
                runtime_ms = measure_runtime(test_fn, x=mx.array(x_np))
                # Twice the item size per element, as in the original figure.
                bytes_per_had = np.dtype(x_np.dtype).itemsize * 2
                bandwidth_gb = (
                    system_size * bytes_per_had / runtime_ms * ms_per_s / bytes_per_gb
                )
                print(n, bandwidth_gb)
                outputs[key][n] = bandwidth_gb

    colors = {
        "copy": "black",
        "had_2^k": "steelblue",
        "had_m*2^k": "skyblue",
    }
    for key in outputs:
        results = outputs[key]
        plt.scatter(results.keys(), results.values(), color=colors[key], label=key)
    plt.title(f"MLX Hadamard Benchmark -- {dtype.__name__}")
    plt.xlabel("N")
    plt.ylabel("Bandwidth (GB/s)")
    plt.legend()
    plt.savefig(f"bench_{dtype.__name__}.png")
    plt.clf()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--fp16", action="store_true")
    args = parser.parse_args()
    # --fp16 selects half precision; single precision is the default.
    if args.fp16:
        dtype = np.float16
    else:
        dtype = np.float32
    run(dtype)


================================================
FILE: benchmarks/python/large_gemm_bench.py
================================================
# Copyright © 2026 Apple Inc.

import math
import time

import mlx.core as mx
import numpy as np
import torch

N_WARMUP = 5
N_BENCH = 20


def bench_mlx(a, b):
    """Time ``a @ b`` in MLX; return ``(mean, std)`` of per-call seconds.

    ``N_WARMUP`` untimed evaluations are followed by ``N_BENCH`` individually
    timed ones.
    """

    def timed_call():
        tic = time.perf_counter_ns()
        mx.eval(a @ b)
        toc = time.perf_counter_ns()
        return (toc - tic) * 1e-9

    for _ in range(N_WARMUP):
        mx.eval(a @ b)

    times = [timed_call() for _ in range(N_BENCH)]
    return np.mean(times), np.std(times)


@torch.no_grad()
def bench_torch(a, b):
    """Time ``a @ b`` in PyTorch on MPS; return ``(mean, std)`` of per-call seconds.

    Every call is followed by ``torch.mps.synchronize()``, which is included
    in the timed region. ``N_WARMUP`` untimed calls precede ``N_BENCH`` timed
    ones.
    """

    def matmul_and_sync():
        _ = a @ b
        torch.mps.synchronize()

    def timed_call():
        tic = time.perf_counter_ns()
        matmul_and_sync()
        toc = time.perf_counter_ns()
        return (toc - tic) * 1e-9

    for _ in range(N_WARMUP):
        matmul_and_sync()

    times = [timed_call() for _ in range(N_BENCH)]
    return np.mean(times), np.std(times)


def check_correctness(out_mx, out_pt, rtol, M, N, K):
    """Print a warning when ``out_mx`` and ``out_pt`` differ beyond ``rtol``.

    The comparison is purely relative (``atol=0``). Nothing is printed when
    the outputs agree, and nothing is raised in either case.
    """
    if np.allclose(out_pt, out_mx, rtol=rtol, atol=0):
        return

    abs_diff = np.abs(out_pt - out_mx)
    # Clamp the denominator so zeros in the reference do not divide by zero.
    rel_diff = abs_diff / np.maximum(np.abs(out_pt), 1e-10)
    worst_abs = np.max(abs_diff)
    worst_rel = np.max(rel_diff)

    message = (
        f"  WARNING: Correctness failed at {M}x{N}x{K}: "
        f"max_abs={worst_abs:.6e}, max_rel={worst_rel:.6e}"
    )
    print(message)


def bench_gemm(M, N, K, dtype, rtol):
    """Benchmark an ``(M, K) @ (K, N)`` matmul in MLX and PyTorch (MPS).

    Args:
        M, N, K: Matrix dimensions.
        dtype: Name of a dtype attribute present on both ``mx`` and ``torch``.
        rtol: Relative tolerance handed to ``check_correctness``.

    Returns:
        ``(mlx_mean, mlx_std, torch_mean, torch_std)`` in seconds.
    """
    mx_dtype = getattr(mx, dtype)
    pt_dtype = getattr(torch, dtype)

    # Inputs are drawn in float32 and cast to the target dtype per framework.
    scale = 0.5 / math.sqrt(K)
    a_np = np.random.uniform(0, scale, (M, K)).astype(np.float32)
    b_np = np.random.uniform(0, scale, (K, N)).astype(np.float32)

    a_mx, b_mx = (mx.array(host).astype(mx_dtype) for host in (a_np, b_np))
    a_pt, b_pt = (
        torch.from_numpy(host).to(dtype=pt_dtype, device="mps")
        for host in (a_np, b_np)
    )
    torch.mps.synchronize()

    torch_mean, torch_std = bench_torch(a_pt, b_pt)
    mlx_mean, mlx_std = bench_mlx(a_mx, b_mx)

    # Compare both results in float32.
    out_mx = (a_mx @ b_mx).astype(mx.float32)
    out_pt = (a_pt @ b_pt).to(torch.float32).to("cpu").numpy(force=True)
    check_correctness(out_mx, out_pt, rtol, M, N, K)

    return mlx_mean, mlx_std, torch_mean, torch_std


if __name__ == "__main__":
    dtypes = ("bfloat16", "float16", "float32")

    # Relative tolerance used for the correctness check of each dtype.
    rtols = {
        "float32": 1e-3,
        "float16": 5e-3,
        "bfloat16": 1e-2,
    }

    # Each row is (M, N, K).
    shapes = (
        (2048, 2048, 10240),
        (2048, 3072, 10240),
        (3072, 3072, 10240),
        (3072, 3072, 12288),
        (3072, 4096, 12288),
        (4096, 4096, 12288),
        (4096, 4096, 18432),
        (4096, 4096, 21504),
        (4096, 6144, 21504),
        (6144, 6144, 21504),
    )

    column_header = (
        f"{'M':>5s} {'N':>5s} {'K':>6s}  "
        f"{'MLX (ms)':>15s}  {'Torch (ms)':>15s}  {'Speedup':>10s}"
    )
    rule = "-" * 80

    for dtype in dtypes:
        rtol = rtols[dtype]
        print(f"\nPerformance ({dtype}):")
        print(column_header)
        print(rule)

        for M, N, K in shapes:
            mlx_mean, mlx_std, torch_mean, torch_std = bench_gemm(M, N, K, dtype, rtol)
            # Values above 1 mean MLX was faster.
            speedup = torch_mean / mlx_mean

            print(
                f"{M:5d} {N:5d} {K:6d}  "
                f"{mlx_mean*1000:7.2f}±{mlx_std*1000:5.2f}  "
                f"{torch_mean*1000:7.2f}±{torch_std*1000:5.2f}  "
                f"{speedup:8.2f}x"
            )


================================================
FILE: benchmarks/python/layer_norm_bench.py
================================================
# Copyright © 2023-2024 Apple Inc.

from functools import partial

import mlx.core as mx
import mlx.nn as nn
from time_utils import time_fn


def layer_norm(x, w, b, eps):
    """Reference layer normalization built from elementary MLX ops.

    The input is cast to float32, normalized over its last axis with the
    given ``eps``, then optionally scaled by ``w`` and shifted by ``b``.

    Args:
        x: Input array.
        w: Optional scale; skipped when ``None``.
        b: Optional shift; skipped when ``None``.
        eps: Constant added to the variance before ``rsqrt``.

    Returns:
        The normalized array.
    """
    # NOTE(review): the result is not cast back to the dtype of the incoming
    # ``x`` (a previously unused local captured that dtype) -- confirm whether
    # a cast back was intended before relying on the output dtype.
    x = x.astype(mx.float32)
    mu = mx.mean(x, -1, keepdims=True)
    v = mx.var(x, -1, keepdims=True)
    y = (x - mu) * mx.rsqrt(v + eps)
    if w is not None:
        y = y * w
    if b is not None:
        y = y + b
    return y


def time_layer_norm(N, dt):
    """Time the reference ``layer_norm`` against ``mx.fast.layer_norm``.

    Forward passes, gradients w.r.t. ``(x, w, b)`` and gradients w.r.t. ``x``
    only (no scale/shift) are each timed through ``time_fn``; the gradient
    cases are timed both as-is and wrapped in ``mx.compile``.

    Args:
        N: Size of the normalized (last) axis.
        dt: MLX dtype the random inputs are cast to.
    """
    L = 1024
    # Scalar losses: f1 uses the reference implementation, f2 the fused op.
    f1 = lambda x, w, b, y: (layer_norm(x, w, b, 1e-5) * y).sum()
    f2 = lambda x, w, b, y: (mx.fast.layer_norm(x, w, b, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0, 1, 2))
    g2 = mx.grad(f2, argnums=(0, 1, 2))

    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
    w = mx.random.uniform(shape=(N,)).astype(dt)
    b = mx.random.uniform(shape=(N,)).astype(dt)
    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

    # Chains 32 forward calls, feeding each output back in as the next input.
    def layer_norm_loop(f, x, w, b):
        for _ in range(32):
            x = f(x, w, b)
        return x

    time_fn(layer_norm_loop, partial(layer_norm, eps=1e-5), x, w, b)
    time_fn(layer_norm_loop, partial(mx.fast.layer_norm, eps=1e-5), x, w, b)

    # Chains 32 gradient calls; `y` is read from the enclosing scope.
    def layer_norm_grad_loop(g, x, w, b):
        gx, gw, gb = x, w, b
        for _ in range(32):
            gx, gw, gb = g(gx, gw, gb, y)
        return gx, gw, gb

    time_fn(layer_norm_grad_loop, g1, x, w, b)
    time_fn(layer_norm_grad_loop, g2, x, w, b)
    time_fn(layer_norm_grad_loop, mx.compile(g1), x, w, b)
    time_fn(layer_norm_grad_loop, mx.compile(g2), x, w, b)

    # Second round: no scale/shift, gradient w.r.t. x only.
    f1 = lambda x, y: (layer_norm(x, None, None, 1e-5) * y).sum()
    f2 = lambda x, y: (mx.fast.layer_norm(x, None, None, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0,))
    g2 = mx.grad(f2, argnums=(0,))

    # Fresh inputs; w and b are regenerated and evaluated here but are not
    # passed to the x-only loops below.
    x = mx.random.uniform(shape=(8, L, N)).astype(dt)
    w = mx.random.uniform(shape=(N,)).astype(dt)
    b = mx.random.uniform(shape=(N,)).astype(dt)
    y = mx.random.uniform(shape=(8, L, N)).astype(dt)
    mx.eval(x, w, b, y)

    # Chains 32 gradient calls; `y` is read from the enclosing scope.
    def layer_norm_grad_x_loop(g, x):
        gx = x
        for _ in range(32):
            gx = g(gx, y)
        return gx

    time_fn(layer_norm_grad_x_loop, g1, x)
    time_fn(layer_norm_grad_x_loop, g2, x)
    time_fn(layer_norm_grad_x_loop, mx.compile(g1), x)
    time_fn(layer_norm_grad_x_loop, mx.compile(g2), x)


if __name__ == "__main__":
    # Sweep every dtype over every normalized-axis size.
    widths = (1024, 2048, 4096, 8192, 8192 + 1024)
    for dt in (mx.float32, mx.float16, mx.bfloat16):
        for n in widths:
            print(dt, n)
            time_layer_norm(n, dt)


================================================
FILE: benchmarks/python/masked_scatter.py
================================================
import math
import os
import platform
import subprocess
import time
from copy import copy
from functools import partial

import matplotlib.pyplot as plt
import mlx.core as mx
import numpy as np
import torch
from matplotlib.ticker import FuncFormatter

# Directory that receives benchmark results; created at import time.
RESULTS_DIR = "./results"


# makedirs(exist_ok=True) avoids the check-then-create race of
# `if not isdir: mkdir` and is a no-op when the directory already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

# First available backend in the order MPS, CUDA, CPU. The availability
# checks are evaluated lazily, so CUDA is only probed when MPS is unavailable.
TORCH_DEVICE = torch.device(
    next(
        backend
        for backend, is_available in (
            ("mps", torch.backends.mps.is_available),
            ("cuda", torch.cuda.is_available),
            ("cpu", lambda: True),
        )
        if is_available()
    )
)


def get_device_name():
    """Return a human-readable name for the hardware behind ``TORCH_DEVICE``.

    CUDA devices are queried through ``nvidia-smi`` and MPS devices through
    ``sysctl``; if the query fails a fixed placeholder is returned. Any other
    device type falls back to the ``platform`` module.
    """
    kind = TORCH_DEVICE.type
    if kind not in ("cuda", "mps"):
        return platform.processor() or platform.machine() or "CPU"

    if kind == "cuda":
        command = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"]
        fallback = "CUDA_GPU"
    else:
        command = ["sysctl", "-n", "machdep.cpu.brand_string"]
        fallback = "Apple_Silicon"

    try:
        text = subprocess.check_output(command, stderr=subprocess.DEVNULL).decode(
            "utf-8"
        )
        if kind == "cuda":
            # Only the first reported GPU is used.
            return text.splitlines()[0].strip()
        return text.strip()
    except Exception:
        return fallback


# Resolved once at import time via get_device_name().
DEVICE_NAME = get_device_name()


# Untimed calls, timed calls, and scatters issued per call, respectively.
N_WARMUP, N_ITER_BENCH, N_ITER_FUNC = 5, 50, 20

# Benchmark grid: 12 vector lengths doubling from 4096, the mask densities,
# and the dtype names to run.
VECTOR_LENGTHS = [4096 << power for power in range(12)]
MASK_DENSITIES = [0.01, 0.1, 0.25, 0.5]
D_TYPES = ("float32", "float16")


def _power_of_two_formatter(value, _position):
    """Format a tick value as ``$2^{k}$`` when it is a power of two.

    Args:
        value: Tick value.
        _position: Unused; part of the two-argument formatter signature.

    Returns:
        ``""`` for non-positive values, ``$2^{k}$`` when ``value`` is within a
        relative ``1e-6`` of ``2**k``, and ``f"{value:g}"`` otherwise.
    """
    if value <= 0:
        return ""
    exponent = int(round(math.log2(value)))
    # `1 << exponent` raises ValueError for a negative exponent, so values
    # below one (e.g. 0.5) use float exponentiation instead.
    nearest_power = (1 << exponent) if exponent >= 0 else 2.0**exponent
    if abs(value - nearest_power) / value > 1e-6:
        return f"{value:g}"
    return f"$2^{{{exponent}}}$"


def torch_sync():
    """Synchronize the PyTorch backend selected by ``TORCH_DEVICE``.

    Does nothing for device types other than ``cuda`` and ``mps``.
    """
    kind = TORCH_DEVICE.type
    if kind == "cuda":
        torch.cuda.synchronize()
        return
    if kind == "mps":
        torch.mps.synchronize()


def masked_scatter_mlx(self_arr, mask_arr, src_arr):
    """Run ``N_ITER_FUNC`` masked scatters in MLX and evaluate them together.

    Each iteration writes ``src_arr`` into a copy of ``self_arr`` at the
    positions selected by ``mask_arr``. Returns the list of results.
    """

    def scatter_once():
        target = copy(self_arr)
        target[mask_arr] = src_arr
        return target

    outs = [scatter_once() for _ in range(N_ITER_FUNC)]
    mx.eval(outs)
    return outs


@torch.no_grad()
def masked_scatter_torch(self_tensor, mask_tensor, src_tensor):
    """Run ``N_ITER_FUNC`` masked scatters in PyTorch, then synchronize.

    Each iteration applies ``masked_scatter_`` in place on a clone of
    ``self_tensor``. Returns the list of results.
    """

    def scatter_once():
        target = self_tensor.clone()
        target.masked_scatter_(mask_tensor, src_tensor)
        return target

    outs = [scatter_once() for _ in range(N_ITER_FUNC)]
    torch_sync()
    return outs


def measure(fn):
    """Return the wall-clock seconds taken by ``N_ITER_BENCH`` calls of ``fn``.

    ``fn`` is first called ``N_WARMUP`` times without being timed.
    """
    for _ in range(N_WARMUP):
        fn()

    tic = time.perf_counter_ns()
    for _ in range(N_ITER_BENCH):
        fn()
    elapsed_ns = time.perf_counter_ns() - tic

    # perf_counter_ns reports nanoseconds; convert to seconds.
    return elapsed_ns * 1e-9


def bytes_touched(length, true_count, item_size):
    """Estimate the bytes moved over a whole measurement.

    Per scatter the estimate is one byte per mask element, a read plus a write
    of every destination element, and one read per scattered source element.
    That is scaled by ``N_ITER_FUNC * N_ITER_BENCH`` scatters per measurement.
    """
    per_scatter = (
        length  # mask
        + length * item_size * 2  # destination: read + write
        + true_count * item_size  # source
    )
    return per_scatter * N_ITER_FUNC * N_ITER_BENCH


def build_case(length, density, np_dtype, torch_dtype):
    """Create matching MLX and torch inputs for one benchmark configuration.

    A random destination vector of ``length`` elements, a shuffled boolean mask
    with ``true_count`` true entries (at least one) and a source vector of
    ``true_count`` elements are generated with numpy and mirrored into both
    frameworks. One masked scatter is run in each framework and the results
    are compared before the inputs are returned.

    Returns:
        ``(self_mlx, mask_mlx, src_mlx, self_torch, mask_torch, src_torch,
        true_count)``.

    Raises:
        AssertionError: If the MLX and torch results differ beyond tolerance.
    """
    true_count = max(1, int(round(length * density)))

    rng = np.random.default_rng()
    self_np = rng.normal(0.0, 1.0, length).astype(np_dtype)
    # Leading ``true_count`` entries are True, then the mask is shuffled.
    mask_np = np.arange(length) < true_count
    rng.shuffle(mask_np)
    src_np = rng.normal(0.0, 1.0, true_count).astype(np_dtype)

    def _to_torch(host_array, **to_kwargs):
        return torch.from_numpy(host_array).to(device=TORCH_DEVICE, **to_kwargs)

    self_mlx, mask_mlx, src_mlx = (
        mx.array(self_np),
        mx.array(mask_np),
        mx.array(src_np),
    )
    self_torch = _to_torch(self_np, dtype=torch_dtype)
    mask_torch = _to_torch(mask_np)
    src_torch = _to_torch(src_np, dtype=torch_dtype)

    # Correctness check once per configuration
    mlx_result = mx.array(self_np)
    mlx_result[mask_mlx] = src_mlx
    mx.eval(mlx_result)
    torch_result = self_torch.clone()
    torch_result.masked_scatter_(mask_torch, src_torch)

    atol = 5e-3 if np_dtype == np.float16 else 1e-5
    results_agree = np.allclose(
        np.array(mlx_result), torch_result.cpu().numpy(), atol=atol
    )
    if not results_agree:
        raise AssertionError("masked_scatter results diverged between MLX and Torch")

    return (self_mlx, mask_mlx, src_mlx, self_torch, mask_torch, src_torch, true_count)


def bench_case(length, density, dtype):
    """Benchmark one (length, density, dtype) configuration in both frameworks.

    Returns:
        ``(time_mlx, time_torch, mlx_gbps, torch_gbps)`` where the times are
        the seconds reported by ``measure`` and the throughputs divide the
        ``bytes_touched`` estimate (in units of 1024**3 bytes) by those times.
    """
    np_dtype, torch_dtype = getattr(np, dtype), getattr(torch, dtype)
    case = build_case(length, density, np_dtype, torch_dtype)
    mlx_inputs, torch_inputs, true_count = case[0:3], case[3:6], case[6]

    time_mlx = measure(partial(masked_scatter_mlx, *mlx_inputs))
    time_torch = measure(partial(masked_scatter_torch, *torch_inputs))

    total_bytes = bytes_touched(length, true_count, np_dtype().itemsize)
    total_gb = total_bytes / float(1024**3)

    return time_mlx, time_torch, total_gb / time_mlx, total_gb / time_torch


def plot_density(ax_perf, ax_speedup, density, dtype):
    """Benchmark every vector length at ``density`` and draw both plots.

    ``ax_perf`` receives the MLX and Torch throughput curves (GB/s) and
    ``ax_speedup`` the ratio of Torch time to MLX time, with a dashed
    reference line at 1.0.
    """
    results = [bench_case(length, density, dtype) for length in VECTOR_LENGTHS]
    mlx_times = [entry[0] for entry in results]
    torch_times = [entry[1] for entry in results]
    mlx_gbps = [entry[2] for entry in results]
    torch_gbps = [entry[3] for entry in results]

    formatter = FuncFormatter(_power_of_two_formatter)

    def _configure_length_axis(axis):
        # Log2 x-axis with one tick per benchmarked length, plus a light grid.
        axis.set_xscale("log", base=2)
        axis.set_xticks(VECTOR_LENGTHS)
        axis.xaxis.set_major_formatter(formatter)
        axis.grid(True, which="both", linestyle=":", alpha=0.4)

    ax_perf.plot(VECTOR_LENGTHS, mlx_gbps, "tab:blue", label="MLX")
    ax_perf.plot(VECTOR_LENGTHS, torch_gbps, "tab:red", label="Torch")
    _configure_length_axis(ax_perf)
    ax_perf.set_title(f"density={density:.2f}")
    ax_perf.set_ylabel("GB/s")
    ax_perf.legend()

    speedup = np.array(torch_times) / np.array(mlx_times)
    ax_speedup.plot(VECTOR_LENGTHS, speedup, "tab:green")
    ax_speedup.axhline(1.0, color="tab:gray", linestyle="--")
    _configure_length_axis(ax_speedup)
    ax_speedup.set_ylabel("Speedup (Torch_t / MLX_t)")


def main():
    """Run the full sweep and save one PNG figure per dtype in ``RESULTS_DIR``."""
    n_rows = len(MASK_DENSITIES)

    for dtype in D_TYPES:
        fig, axs = plt.subplots(
            n_rows, 2, figsize=(10, 12), layout="constrained", sharex=True
        )

        # One row of axes per mask density: throughput left, speedup right.
        for row, density in enumerate(MASK_DENSITIES):
            perf_ax, speedup_ax = axs[row][0], axs[row][1]
            plot_density(perf_ax, speedup_ax, density, dtype)
            perf_ax.set_xlabel("vector length")
            speedup_ax.set_xlabel("vector length")

        title = (
            f"{DEVICE_NAME.replace('Apple ', '')} ({TORCH_DEVICE.type}) | dtype={dtype}"
        )
        fig.suptitle(title)

        file_name = f"{DEVICE_NAME.replace(' ', '_')}_masked_scatter_{dtype}.png"
        output_path = os.path.join(RESULTS_DIR, file_name)
        fig.savefig(output_path)
        print(f"Saved benchmark image: {output_path}")
        plt.close(fig)


if __name__ == "__main__":
    main()


================================================
FILE: benchmarks/python/rms_norm_bench.py
================================================
# Copyright © 2023-2024 Apple Inc.

import mlx.core as mx
import mlx.nn as nn
from time_utils import time_fn


def rms_norm(x, w, eps):
    """Reference RMS normalization over the last axis.

    The normalization is computed in float32 and cast back to the input dtype;
    the result is multiplied by ``w`` unless ``w`` is ``None``.
    """
    out_dtype = x.dtype
    x32 = x.astype(mx.float32)
    inv_rms = mx.rsqrt(x32.square().mean(-1, keepdims=True) + eps)
    normalized = (x32 * inv_rms).astype(out_dtype)
    if w is None:
        return normalized
    return normalized * w


def time_rms_norm():
    """Time gradients of the reference ``rms_norm`` against ``mx.fast.rms_norm``.

    Two groups are timed, each eagerly and through ``mx.compile``: first with a
    weight vector (gradients w.r.t. input and weight), then without a weight
    (gradient w.r.t. the input only). Every run gets its own ``msg`` label;
    without it all eight lines would be printed as ``rms_norm_loop`` and could
    not be told apart.
    """
    f1 = lambda x, w, y: (rms_norm(x, w, 1e-5) * y).sum()
    f2 = lambda x, w, y: (mx.fast.rms_norm(x, w, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0, 1))
    g2 = mx.grad(f2, argnums=(0, 1))

    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
    w = mx.random.uniform(shape=(4096,)).astype(mx.float16)
    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
    mx.eval(x, w, y)

    def rms_norm_loop(g, x, w):
        # Feed each gradient back in as the next input to chain 32 evaluations.
        gx, gw = x, w
        for _ in range(32):
            gx, gw = g(gx, gw, y)
        return gx, gw

    time_fn(rms_norm_loop, g1, x, w, msg="rms_norm_loop [reference, weight]")
    time_fn(rms_norm_loop, g2, x, w, msg="rms_norm_loop [fast, weight]")
    time_fn(
        rms_norm_loop,
        mx.compile(g1),
        x,
        w,
        msg="rms_norm_loop [reference, weight, compiled]",
    )
    time_fn(
        rms_norm_loop,
        mx.compile(g2),
        x,
        w,
        msg="rms_norm_loop [fast, weight, compiled]",
    )

    f1 = lambda x, y: (rms_norm(x, None, 1e-5) * y).sum()
    f2 = lambda x, y: (mx.fast.rms_norm(x, None, 1e-5) * y).sum()
    g1 = mx.grad(f1, argnums=(0,))
    g2 = mx.grad(f2, argnums=(0,))

    # No weight is needed for this group, so only x and y are regenerated.
    x = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
    y = mx.random.uniform(shape=(8, 1024, 4096)).astype(mx.float16)
    mx.eval(x, y)

    def rms_norm_loop(g, x):
        gx = x
        for _ in range(32):
            gx = g(gx, y)
        return gx

    time_fn(rms_norm_loop, g1, x, msg="rms_norm_loop [reference, no weight]")
    time_fn(rms_norm_loop, g2, x, msg="rms_norm_loop [fast, no weight]")
    time_fn(
        rms_norm_loop,
        mx.compile(g1),
        x,
        msg="rms_norm_loop [reference, no weight, compiled]",
    )
    time_fn(
        rms_norm_loop,
        mx.compile(g2),
        x,
        msg="rms_norm_loop [fast, no weight, compiled]",
    )


if __name__ == "__main__":
    time_rms_norm()


================================================
FILE: benchmarks/python/rope_bench.py
================================================
# Copyright © 2023-2024 Apple Inc.

import mlx.core as mx
import mlx.nn as nn
from time_utils import time_fn


def time_rope():
    """Time 32 chained ``nn.RoPE(64)`` applications on two input shapes.

    The first input has a length-1 third axis and is rotated with
    ``offset=100``; the second has 1024 entries on that axis and uses the
    default offset.
    """
    rope = nn.RoPE(64)
    repeats = 32

    def rope_vec(x):
        for _ in range(repeats):
            x = rope(x, offset=100)
        return x

    def rope_mat(x):
        for _ in range(repeats):
            x = rope(x)
        return x

    # vec
    vec_input = mx.random.uniform(shape=(1, 32, 1, 128)).astype(mx.float16)
    mx.eval(vec_input)
    time_fn(rope_vec, vec_input)

    # matrix
    mat_input = mx.random.uniform(shape=(1, 32, 1024, 128)).astype(mx.float16)
    mx.eval(mat_input)
    time_fn(rope_mat, mat_input)


if __name__ == "__main__":
    time_rope()


================================================
FILE: benchmarks/python/scatter_bench.py
================================================
# Copyright © 2023-2024 Apple Inc.

import argparse

import mlx.core as mx
import torch
from time_utils import measure_runtime


def benchmark_scatter_mlx(dst_shape, x_shape, idx_shapes):
    """Time an indexed assignment ``dst[idx] = x`` in MLX and print the result.

    One random index array is drawn per entry of ``idx_shapes``, each bounded
    by ``dst_shape[0] - 1``. The destination and update arrays are random
    float32 arrays of the requested shapes.
    """
    index_bound = dst_shape[0] - 1
    indices = [mx.random.randint(0, index_bound, shape) for shape in idx_shapes]
    updates = mx.random.normal(x_shape).astype(mx.float32)
    destination = mx.random.normal(dst_shape).astype(mx.float32)

    def scatter(dst, x, idx):
        dst[tuple(idx)] = x
        mx.eval(dst)

    runtime = measure_runtime(scatter, dst=destination, x=updates, idx=indices)
    print(f"MLX: {runtime:.3f}ms")


def benchmark_scatter_torch(dst_shape, x_shape, idx_shapes, device):
    """Time an indexed assignment ``dst[idx] = x`` in PyTorch and print the result.

    Inputs are created on the host and moved to ``device``. The timed function
    synchronizes only when ``device`` equals ``torch.device("mps")``.
    """
    index_bound = dst_shape[0] - 1
    indices = [
        torch.randint(0, index_bound, shape).to(device) for shape in idx_shapes
    ]
    updates = torch.randn(x_shape, dtype=torch.float32).to(device)
    destination = torch.randn(dst_shape, dtype=torch.float32).to(device)

    def scatter(dst, x, idx, device):
        dst[tuple(idx)] = x
        if device == torch.device("mps"):
            torch.mps.synchronize()

    runtime = measure_runtime(
        scatter, dst=destination, x=updates, idx=indices, device=device
    )
    print(f"PyTorch: {runtime:.3f}ms")


if __name__ == "__main__":
    # The first positional argument of ArgumentParser is ``prog``, so the text
    # must be passed as ``description`` to appear as help text. The previous
    # text also named the gather benchmark by mistake.
    parser = argparse.ArgumentParser(description="Scatter benchmarks.")
    parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
    args = parser.parse_args()

    if args.cpu:
        mx.set_default_device(mx.cpu)
        device = torch.device("cpu")
    else:
        device = torch.device("mps")

    # The three lists below are parallel: entry i of each describes one case.
    dst_shapes = [
        (10, 64),
        (100_000, 64),
        (1_000_000, 64),
        (100_000,),
        (200_000,),
        (20_000_000,),
        (10000, 64),
        (100, 64),
        (100, 10_000, 64),
        (10, 100, 100, 21),
        (1_000, 1_000, 10),
    ]
    idx_shapes = [
        [(1_000_000,)],
        [(1_000_000,)],
        [(100_000,)],
        [(1_000_000,)],
        [(20_000_000,)],
        [(20_000_000,)],
        [(1000000,)],
        [(10000000,)],
        [(1_000,)],
        [(10_000,)],
        [(1_000,), (1_000,)],
    ]
    x_shapes = [
        (1_000_000, 64),
        (1_000_000, 64),
        (100_000, 64),
        (1_000_000,),
        (20_000_000,),
        (20_000_000,),
        (1000000, 64),
        (10000000, 64),
        (1_000, 10_000, 64),
        (10_000, 100, 100, 21),
        (1_000, 10),
    ]

    for dst_shape, x_shape, idx_shape in zip(dst_shapes, x_shapes, idx_shapes):
        print("=" * 20)
        print(f"Dst: {dst_shape}, X {x_shape}, Indices {idx_shape}")
        benchmark_scatter_mlx(dst_shape, x_shape, idx_shape)
        benchmark_scatter_torch(dst_shape, x_shape, idx_shape, device=device)


================================================
FILE: benchmarks/python/sdpa_bench.py
================================================
# Copyright © 2024 Apple Inc.

import argparse
import math
import os
import subprocess
import time

import mlx.core as mx
import numpy as np

# Best-effort lookup of the CPU brand string through ``sysctl``. Fall back to a
# placeholder instead of aborting the whole benchmark at import time when the
# command is missing or exits with an error.
try:
    device_name = subprocess.check_output(
        ["sysctl", "-n", "machdep.cpu.brand_string"], stderr=subprocess.DEVNULL
    )
    device_name = device_name.decode("utf-8").strip("\n")
except (OSError, subprocess.CalledProcessError):
    device_name = "unknown"

# Untimed calls made by bench() before timing starts.
N_warmup = 5
# Timed calls made by bench().
N_iter_bench = 40
# Attention calls chained inside each do_attention_bench() invocation.
N_iter_func = 8


def bench(f, *args):
    """Return the seconds taken by ``N_iter_bench`` calls of ``f(*args)``.

    ``f`` is called ``N_warmup`` times beforehand without being timed.
    """
    for _ in range(N_warmup):
        f(*args)

    tic = time.perf_counter_ns()
    for _ in range(N_iter_bench):
        f(*args)
    elapsed_ns = time.perf_counter_ns() - tic

    return elapsed_ns * 1e-9


def prepare_inputs(B, qL, kL, D, qH, kH, mask, transpose, dtype):
    """Create random q/k/v arrays, the softmax scale and an optional mask.

    With ``transpose`` the arrays are laid out as (B, length, heads, D),
    otherwise as (B, heads, length, D). ``mask`` may be ``"additive"`` or
    ``"bool"``, in which case a random (B, qH, qL, kL) array is generated;
    any other value (e.g. ``None`` or ``"causal"``) is returned unchanged.

    Returns:
        ``(q_mx, k_mx, v_mx, scale, mask)``.
    """
    np_dtype = getattr(np, dtype)

    if transpose:
        shape_q, shape_kv = (B, qL, qH, D), (B, kL, kH, D)
    else:
        shape_q, shape_kv = (B, qH, qL, D), (B, kH, kL, D)

    scale = 1.0 / math.sqrt(D)

    def _random_normal(std, shape):
        host = np.random.normal(0.0, std, shape).astype(np_dtype)
        return mx.array(host)

    q_mx = _random_normal(1.0, shape_q)
    k_mx = _random_normal(scale, shape_kv)
    v_mx = _random_normal(scale, shape_kv)

    if mask is not None:
        mask_shape = (B, qH, qL, kL)
        if mask == "additive":
            mask = mx.array(np.random.normal(0.0, 1.0, mask_shape).astype(np_dtype))
        elif mask == "bool":
            mask = mx.array(np.random.uniform(0.0, 1.0, mask_shape) < 0.5)

    return q_mx, k_mx, v_mx, scale, mask


def mlx_ref_attn(q, k, v, scale=1.0, mask=None):
    """Reference (unfused) scaled dot-product attention built from MLX ops.

    Args:
        q, k, v: Arrays whose head count is read from axis -3 and whose
            sequence length is read from axis 2.
        scale: Multiplier applied to ``q`` before the scores are computed.
        mask: ``None``, the string ``"causal"``, a boolean array (positions
            that are ``False`` get a score of ``-inf``) or an array that is
            added to the scores.

    Returns:
        The attention output; when query heads were grouped onto fewer
        key/value heads it is reshaped back to ``(B, n_q_heads, L, -1)``.
    """
    q_dtype = q.dtype
    q = q * mx.array(scale, q_dtype)
    n_q_heads = q.shape[-3]
    n_kv_heads = k.shape[-3]
    n_repeats = n_q_heads // n_kv_heads

    B = q.shape[0]
    L = q.shape[2]
    kL = k.shape[2]

    if n_repeats > 1:
        # Group the query heads that share a key/value head.
        q = mx.reshape(q, [B, n_kv_heads, n_repeats, L, -1])
        k = mx.expand_dims(k, 2)
        v = mx.expand_dims(v, 2)

    scores = q @ mx.swapaxes(k, -1, -2)

    if mask is not None:

        # Only compare against the keyword when the mask really is a string,
        # so array masks are never compared with a str.
        if isinstance(mask, str) and mask == "causal":
            q_offset = max(0, kL - L)
            q_indices = mx.arange(q_offset, q_offset + L)
            k_indices = mx.arange(kL)
            mask = q_indices[:, None] >= k_indices[None]

        if n_repeats > 1 and mask.ndim >= 3:
            if mask.shape[-3] == 1:
                mask = mx.expand_dims(mask, -3)
            else:
                mask = mx.unflatten(mask, -3, (n_kv_heads, n_repeats))

        if mask.dtype == mx.bool_:
            scores = mx.where(mask, scores, -np.float32(np.inf))
        else:
            scores += mask

    scores = mx.softmax(scores, axis=-1, precise=True)

    out = scores @ v
    if n_repeats > 1:
        out = mx.reshape(out, [B, n_q_heads, L, -1])

    return out


def mlx_fused_attn(q, k, v, scale, mask):
    """Call ``mx.fast.scaled_dot_product_attention`` with the given inputs."""
    fused_sdpa = mx.fast.scaled_dot_product_attention
    return fused_sdpa(q, k, v, scale=scale, mask=mask)


def do_attention(f, q, k, v, scale, mask=None, transpose=False):
    """Apply attention function ``f``, optionally around an axis-1/2 swap.

    With ``transpose`` set, axes 1 and 2 of ``q``, ``k`` and ``v`` are swapped
    before calling ``f`` and the output is swapped back afterwards.
    """
    if not transpose:
        return f(q, k, v, scale=scale, mask=mask)

    swap_axes_1_2 = (0, 2, 1, 3)
    q_t, k_t, v_t = (mx.transpose(tensor, swap_axes_1_2) for tensor in (q, k, v))
    o_t = f(q_t, k_t, v_t, scale=scale, mask=mask)
    return mx.transpose(o_t, swap_axes_1_2)


def do_attention_bench(f, q, k, v, scale, mask=None, transpose=False):
    """Chain ``N_iter_func`` attention calls and evaluate the final output.

    Each call's output is used as the query of the next call.
    """
    current = q
    for _ in range(N_iter_func):
        current = do_attention(
            f, current, k, v, scale, mask=mask, transpose=transpose
        )

    mx.eval(current)
    return current


def bench_shape(
    B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, dtype, transpose=True, mask_in=None
):
    """Time reference and fused attention for one shape and compare outputs.

    A message is printed when the two outputs differ beyond the dtype-dependent
    tolerance.

    Returns:
        ``(time_mlx_fused, time_mlx_unfused)`` in seconds as reported by
        ``bench``.
    """
    q_mx, k_mx, v_mx, scale, mask = prepare_inputs(
        B, qsl, ksl, head_dim, n_q_heads, n_kv_heads, mask_in, transpose, dtype
    )
    attn_args = (q_mx, k_mx, v_mx, scale, mask, transpose)

    time_mlx_unfused = bench(do_attention_bench, mlx_ref_attn, *attn_args)
    time_mlx_fused = bench(do_attention_bench, mlx_fused_attn, *attn_args)

    reference_out = do_attention(mlx_ref_attn, *attn_args)
    fused_out = do_attention(mlx_fused_attn, *attn_args)

    atol = 2e-4 if dtype != "float32" else 1e-5
    outputs_match = mx.allclose(reference_out, fused_out, atol=atol, rtol=atol)

    if not outputs_match:
        print(
            f"Failed at (B: {B}, qsl: {qsl}, ksl: {ksl}, head_dim: {head_dim}, n_qh: {n_q_heads}, n_kvh: {n_kv_heads}, mask: {mask_in}) [tpose = {transpose}] with max(|a - b|) = {mx.max(mx.abs(fused_out - reference_out)):3.2e}"
        )

    return time_mlx_fused, time_mlx_unfused


def get_gflop_count(B, M, N, K):
    """Return ``2 * B * M * N * K`` per call, over a full run, divided by 1024**3.

    The count covers ``N_iter_bench * N_iter_func`` calls.
    """
    total_ops = float(2.0 * N_iter_bench * N_iter_func * B * M * N * K)
    return total_ops / float(1024.0**3)


if __name__ == "__main__":
    # This script takes no options. Parsing still makes ``--help`` work and
    # rejects stray arguments; previously the parser was built but never used,
    # and its description referred to the gemm benchmark.
    parser = argparse.ArgumentParser(description="Run SDPA benchmarks")
    parser.parse_args()

    # Only the first dtype (float16) is benchmarked.
    dtypes = ("float16", "float32")[:1]
    transposes = (False,)

    # fmt: off
    shapes_64 = (
        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
          (  1,    32,    32,       64,   32,    32),
          (  1,    64,    64,       64,   32,    32),
          (  1,   128,   128,       64,   32,    32),
          (  1,   256,   256,       64,   32,    32),
          (  1,   512,   512,       64,   32,    32),
          (  1,  1024,  1024,       64,   32,     8),
          (  1,  2048,  2048,       64,   32,     8),
          (  1,  4096,  4096,       64,   32,     8),
    )

    shapes_80 = (
        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
          (  1,  1024,  1024,       80,   32,     8),
          (  1,  2048,  2048,       80,   32,     8),
          (  1,  4096,  4096,       80,   32,     8),
    )

    shapes_128 = (
        # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
          (  1,  1024,  1024,      128,   32,     8),
          (  1,  2048,  2048,      128,   32,     8),
          (  1,  4096,  4096,      128,   32,     8),
    )
    # fmt: on

    shapes = shapes_64 + shapes_80 + shapes_128

    masks = [None, "bool", "causal"]

    print(
        "  B,   qsl,   ksl, hdim, n_qh, n_kvh, t,   dtype,     mask, t_unfs, t_fuse, diff%"
    )

    for dtype in dtypes:
        for transpose in transposes:
            for B, qsl, ksl, head_dim, n_q_heads, n_kv_heads in shapes:
                for mask_in in masks:
                    time_mlx_fused, time_mlx_unfused = bench_shape(
                        B,
                        qsl,
                        ksl,
                        head_dim,
                        n_q_heads,
                        n_kv_heads,
                        dtype,
                        transpose,
                        mask_in,
                    )
                    # Relative extra time of the unfused path over the fused one.
                    diff = time_mlx_unfused / time_mlx_fused - 1.0
                    t_str = 1 if transpose else 0
                    print(
                        f"{B:3d}, {qsl:5d}, {ksl:5d}, {head_dim:4d}, {n_q_heads:4d}, {n_kv_heads:5d}, {t_str:1d}, {dtype}, {str(mask_in):>8}, {time_mlx_unfused: 2.3f}, {time_mlx_fused: 2.3f}, {100. * diff:+5.2f}%"
                    )


================================================
FILE: benchmarks/python/sdpa_vector_bench.py
================================================
import argparse
import math

import mlx.core as mx
from time_utils import time_fn

# Benchmark configuration shared by the timing functions below.
# Length of the key/value sequence axis.
L = 16384
# Number of query heads.
H = 32
# Number of key/value heads.
H_k = H // 4
# Last-axis size of the queries and keys.
D = 128
# Last-axis size of the values; a projection weight is created only if V != D.
V = 128
dtype = mx.float16
# Number of chained attention calls per timed function call.
loops = 10


def upproject(x, w):
    """Return ``x @ w.T``, or ``x`` unchanged when ``w`` is ``None``."""
    return x if w is None else x @ w.T


def attention(q, k, v, mask=None, w=None):
    """Run ``loops`` chained attention steps built from basic MLX ops.

    Each step groups the query heads per key/value head, applies the optional
    boolean ``mask`` (masked scores are set to the dtype minimum), takes the
    softmax in float32 and multiplies by ``v``. The output is passed through
    ``upproject`` and becomes the query of the next step.
    """

    def _sdpa(queries, keys, values):
        batch, n_q_heads, q_len, head_dim = queries.shape
        _, n_kv_heads, kv_len, _ = keys.shape
        _, _, _, value_dim = values.shape
        group = n_q_heads // n_kv_heads

        grouped_q = queries.reshape(batch, n_kv_heads, group, q_len, head_dim)
        keys_5d = mx.expand_dims(keys, 2)
        values_5d = mx.expand_dims(values, 2)

        scores = grouped_q @ mx.swapaxes(keys_5d, -1, -2)
        if mask is not None:
            full_mask = mx.broadcast_to(mask, (batch, n_q_heads, q_len, kv_len))
            full_mask = full_mask.reshape(batch, n_kv_heads, group, q_len, kv_len)
            scores = mx.where(full_mask, scores, mx.finfo(scores.dtype).min)

        probs = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
        out = probs @ values_5d
        return out.reshape(batch, n_q_heads, q_len, value_dim)

    for _ in range(loops):
        q = upproject(_sdpa(q, k, v), w)
    return q


def sdpa(q, k, v, mask=None, w=None):
    """Run ``loops`` chained ``mx.fast.scaled_dot_product_attention`` calls.

    Each output is passed through ``upproject`` and used as the next query.
    """
    for _ in range(loops):
        attended = mx.fast.scaled_dot_product_attention(
            q, k, v, scale=1.0, mask=mask
        )
        q = upproject(attended, w)
    return q


def time_self_attention_primitives():
    """Time ``attention`` (basic ops) on seeded random inputs."""
    mx.random.seed(3)

    def _random(*shape):
        return mx.random.uniform(shape=shape).astype(dtype)

    q = _random(1, H, 1, D)
    k = _random(1, H_k, L, D)
    v = _random(1, H_k, L, V)
    # A projection weight is only created when value and key widths differ.
    w = None if V == D else _random(D, V)
    mx.eval(q, k, v, w)
    time_fn(attention, q, k, v, w=w)


def time_self_attention_sdpa():
    """Time ``sdpa`` (fused kernel) on seeded random inputs."""
    mx.random.seed(3)

    def _random(*shape):
        return mx.random.uniform(shape=shape).astype(dtype)

    q = _random(1, H, 1, D)
    k = _random(1, H_k, L, D)
    v = _random(1, H_k, L, V)
    # A projection weight is only created when value and key widths differ.
    w = None if V == D else _random(D, V)
    mx.eval(q, k, v, w)
    time_fn(sdpa, q, k, v, w=w)


def time_self_attention_sdpa_with_mask():
    """Time both attention variants with a boolean mask.

    The mask has ``L`` entries; the first half is True and the second half
    False.
    """
    mx.random.seed(3)

    def _random(*shape):
        return mx.random.uniform(shape=shape).astype(dtype)

    q = _random(1, H, 1, D)
    k = _random(1, H_k, L, D)
    v = _random(1, H_k, L, V)
    w = None if V == D else _random(D, V)

    mask = mx.full((L,), True)
    mask[L // 2 :] = False
    mx.eval(q, k, v, mask, w)

    def attention_mask(*args):
        return attention(*args, mask=mask, w=w)

    def sdpa_mask(*args):
        return sdpa(*args, mask=mask, w=w)

    time_fn(attention_mask, q, k, v)
    time_fn(sdpa_mask, q, k, v)


if __name__ == "__main__":
    # Run the fused benchmark, then the basic-op one, then both with a mask.
    time_self_attention_sdpa()
    time_self_attention_primitives()
    time_self_attention_sdpa_with_mask()


================================================
FILE: benchmarks/python/segmented_mm_bench.py
================================================
# Copyright © 2026 Apple Inc.

import argparse
import time

import mlx.core as mx
import numpy as np

# Maps the accepted --dtype command-line values to MLX dtypes.
MLX_DTYPES = {
    "float16": mx.float16,
    "bfloat16": mx.bfloat16,
    "float32": mx.float32,
}


def parse_cases(cases):
    """Parse a comma-separated ``MxNxKxS`` list into ``(m, n, k, s)`` int tuples.

    Raises:
        ValueError: If a field is not an integer or a spec does not have
            exactly four fields.
    """

    def _parse_one(spec):
        fields = [int(token) for token in spec.split("x")]
        m, n, k, s = fields
        return (m, n, k, s)

    return [_parse_one(spec) for spec in cases.split(",")]


def make_segments(k, num_segments, pattern, seed):
    """Split ``[0, k]`` into ``num_segments`` consecutive ``[start, end]`` pairs.

    With ``pattern == "equal"`` the cut points are evenly spaced; any other
    pattern draws ``num_segments - 1`` sorted random cut points from a
    generator seeded with ``seed``.

    Returns:
        A ``(num_segments, 2)`` uint32 array of start/end offsets.
    """
    if pattern != "equal":
        rng = np.random.default_rng(seed)
        interior = np.sort(
            rng.integers(0, k + 1, size=(num_segments - 1,), dtype=np.int64)
        )
        cuts = np.concatenate(([0], interior, [k]))
    else:
        cuts = np.linspace(0, k, num_segments + 1, dtype=np.int64)

    starts, ends = cuts[:-1], cuts[1:]
    return np.column_stack((starts, ends)).astype(np.uint32)


def numpy_segmented_mm_ref(a, b, segments):
    """NumPy reference: one matmul per ``(start, end)`` slice of the inner axis.

    The result precision follows the dtype of ``a`` and ``b``.
    """
    partial_products = [
        a[:, start:end] @ b[start:end, :] for start, end in segments
    ]
    return np.stack(partial_products, axis=0)


def mlx_segmented_mm_loop(a, b, segments):
    """MLX baseline: one matmul per segment, stacked along a new leading axis."""
    bounds = segments.tolist()
    partial_products = [a[:, start:end] @ b[start:end, :] for start, end in bounds]
    return mx.stack(partial_products, axis=0)


def bench_mlx(a, b, segments, warmup, iters):
    """Return the mean milliseconds per ``mx.segmented_mm`` call.

    ``warmup`` untimed calls run first; each call is evaluated immediately and
    the device is synchronized before and after the timed loop.
    """

    def _run_once():
        mx.eval(mx.segmented_mm(a, b, segments))

    for _ in range(warmup):
        _run_once()
    mx.synchronize()

    tic = time.perf_counter()
    for _ in range(iters):
        _run_once()
    mx.synchronize()
    toc = time.perf_counter()

    return (toc - tic) * 1e3 / iters


def bench_mlx_loop(a, b, segments, warmup, iters):
    """Return the mean milliseconds per ``mlx_segmented_mm_loop`` call.

    ``warmup`` untimed calls run first; each call is evaluated immediately and
    the device is synchronized before and after the timed loop.
    """

    def _run_once():
        mx.eval(mlx_segmented_mm_loop(a, b, segments))

    for _ in range(warmup):
        _run_once()
    mx.synchronize()

    tic = time.perf_counter()
    for _ in range(iters):
        _run_once()
    mx.synchronize()
    toc = time.perf_counter()

    return (toc - tic) * 1e3 / iters


def print_table(headers, rows):
    """Print ``headers`` and ``rows`` as a left-aligned, pipe-delimited table.

    Every column is as wide as its longest cell (header included) and a dashed
    separator line follows the header row.
    """
    widths = [len(title) for title in headers]
    for row in rows:
        for col, cell in enumerate(row):
            if len(cell) > widths[col]:
                widths[col] = len(cell)

    def fmt_row(row):
        padded = [format(cell, f"<{widths[col]}") for col, cell in enumerate(row)]
        return "| " + " | ".join(padded) + " |"

    rule = "|-" + "-|-".join("-" * width for width in widths) + "-|"

    print(fmt_row(headers))
    print(rule)
    for row in rows:
        print(fmt_row(row))


def main():
    """Benchmark ``mx.segmented_mm`` against a loop of MLX matmuls.

    For every ``MxNxKxS`` case the inputs are generated with a per-case seed,
    both implementations are optionally checked against a reference, both are
    timed, and a summary table is printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--cases",
        default=(
            "128x128x1024x16,"
            "128x128x1024x32,"
            "256x256x2048x16,"
            "512x512x4096x32,"
            "1024x1024x4096x32,"
            "1024x1024x8192x64"
        ),
        help="Comma-separated MxNxKxS list.",
    )
    parser.add_argument(
        "--dtype",
        default="float32",
        choices=["float16", "bfloat16", "float32"],
    )
    parser.add_argument("--warmup", type=int, default=10)
    parser.add_argument("--iters", type=int, default=50)
    parser.add_argument(
        "--segments",
        choices=["equal", "random"],
        default="random",
        help="Segment generation pattern.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--no-check", action="store_true")
    args = parser.parse_args()

    mlx_dtype = MLX_DTYPES[args.dtype]

    print(
        f"dtype={args.dtype} warmup={args.warmup} iters={args.iters} segments={args.segments}"
    )

    headers = [
        "Case",
        "MLX ms",
        "Loop ms",
        "Speedup",
        "MLX err",
        "Loop err",
    ]
    rows = []

    cases = parse_cases(args.cases)
    for idx, (m, n, k, s) in enumerate(cases):
        # Offset the seed by the case index so every case gets different data.
        rng = np.random.default_rng(args.seed + idx)
        a_np = rng.standard_normal((m, k)).astype(np.float32)
        b_np = rng.standard_normal((k, n)).astype(np.float32)
        seg_np = make_segments(k, s, args.segments, args.seed + idx)

        a_mx = mx.array(a_np, dtype=mlx_dtype)
        b_mx = mx.array(b_np, dtype=mlx_dtype)
        seg_mx = mx.array(seg_np, dtype=mx.uint32)
        mx.eval(a_mx, b_mx, seg_mx)

        # The error columns stay empty when --no-check is given.
        mlx_err_str = ""
        loop_err_str = ""
        if not args.no_check:
            y_mlx = mx.segmented_mm(a_mx, b_mx, seg_mx)
            y_loop = mlx_segmented_mm_loop(a_mx, b_mx, seg_mx)
            mx.eval(y_mlx, y_loop)

            if args.dtype == "float32":
                # float32 runs are compared against a float64 NumPy reference.
                ref = numpy_segmented_mm_ref(
                    a_np.astype(np.float64),
                    b_np.astype(np.float64),
                    seg_np.tolist(),
                )
                mlx_err = np.max(np.abs(np.array(y_mlx, dtype=np.float64) - ref))
                loop_err = np.max(np.abs(np.array(y_loop, dtype=np.float64) - ref))
            else:
                # Lower-precision runs are compared against segmented_mm's own
                # float32 result on the same data.
                a_mx_f32 = mx.array(a_np, dtype=mx.float32)
                b_mx_f32 = mx.array(b_np, dtype=mx.float32)
                ref = mx.segmented_mm(a_mx_f32, b_mx_f32, seg_mx)
                mx.eval(ref)
                mlx_err = float(mx.max(mx.abs(ref - y_mlx.astype(mx.float32))).item())
                loop_err = float(mx.max(mx.abs(ref - y_loop.astype(mx.float32))).item())
            mlx_err_str = f"{mlx_err:.2e}"
            loop_err_str = f"{loop_err:.2e}"

        t_mlx = bench_mlx(a_mx, b_mx, seg_mx, args.warmup, args.iters)
        t_loop = bench_mlx_loop(a_mx, b_mx, seg_mx, args.warmup, args.iters)
        # Speedup of segmented_mm over the loop baseline (>1 means faster).
        ratio = t_loop / t_mlx if t_mlx > 0 else float("inf")
        rows.append(
            [
                f"{m}x{n}x{k}x{s}",
                f"{t_mlx:.3f}",
                f"{t_loop:.3f}",
                f"{ratio:.2f}x",
                mlx_err_str,
                loop_err_str,
            ]
        )

    print_table(headers, rows)
    if not args.no_check:
        # Footnote explaining which reference the error columns use.
        if args.dtype == "float32":
            print("err: max|result - numpy_fp64_ref|")
        else:
            print("err: max|result - own_fp32_result|")


if __name__ == "__main__":
    main()


================================================
FILE: benchmarks/python/single_ops.py
================================================
# Copyright © 2023 Apple Inc.

import argparse

import mlx.core as mx
from time_utils import time_fn


def time_add():
    """Time ``mx.add`` for same-shape, transposed and broadcast operands."""
    shape = (32, 1024, 1024)
    lhs = mx.random.uniform(shape=shape)
    rhs = mx.random.uniform(shape=shape)
    mx.eval(lhs, rhs)
    time_fn(mx.add, lhs, rhs)

    # The wrappers below only exist to give each timing its own printed name.
    def transpose_add(a, b):
        return mx.add(a, b)

    def slice_add(a, b):
        return mx.add(a, b)

    def mid_slice_add(a, b):
        return mx.add(a, b)

    lhs_transposed = mx.transpose(lhs, [0, 2, 1])
    mx.eval(lhs_transposed)
    time_fn(transpose_add, lhs_transposed, rhs)

    row = mx.random.uniform(shape=(1024,))
    mx.eval(row)
    time_fn(slice_add, lhs, row)

    column = mx.reshape(row, (1, 1024, 1))
    mx.eval(column)
    time_fn(mid_slice_add, lhs, column)


def time_matmul():
    """Time ``mx.matmul`` on two random 1024x1024 matrices."""
    shape = (1024, 1024)
    lhs = mx.random.uniform(shape=shape)
    rhs = mx.random.uniform(shape=shape)
    mx.eval(lhs, rhs)
    time_fn(mx.matmul, lhs, rhs)


def time_maximum():
    """Time the element-wise ``mx.maximum`` on two random (32, 1024, 1024) arrays."""
    shape = (32, 1024, 1024)
    lhs = mx.random.uniform(shape=shape)
    rhs = mx.random.uniform(shape=shape)
    mx.eval(lhs, rhs)
    time_fn(mx.maximum, lhs, rhs)


def time_max():
    """Time ``mx.max`` over axis 0 of an array that contains NaN entries."""
    data = mx.random.uniform(shape=(32, 1024, 1024))
    # Write NaN into one slice so the reduction also sees NaN values.
    data[1, 1] = mx.nan
    mx.eval(data)
    time_fn(mx.max, data, 0)


def time_min():
    """Time ``mx.min`` over axis 0 of an array that contains NaN entries."""
    data = mx.random.uniform(shape=(32, 1024, 1024))
    # Write NaN into one slice so the reduction also sees NaN values.
    data[1, 1] = mx.nan
    mx.eval(data)
    time_fn(mx.min, data, 0)


def time_negative():
    """Time unary negation of a random (10000, 1000) array."""
    a = mx.random.uniform(shape=(10000, 1000))
    # One evaluation is enough to materialize the input before timing; the
    # second, redundant mx.eval(a) was removed.
    mx.eval(a)

    def negative(a):
        return -a

    time_fn(negative, a)


def time_exp():
    """Time ``mx.exp`` on a random (1000, 100) array."""
    data = mx.random.uniform(shape=(1000, 100))
    mx.eval(data)
    time_fn(mx.exp, data)


def time_logsumexp():
    """Time ``mx.logsumexp`` over the last axis of a (64, 10, 10000) array."""
    data = mx.random.uniform(shape=(64, 10, 10000))
    mx.eval(data)
    time_fn(mx.logsumexp, data, axis=-1)


def time_take():
    """Time 20 ``mx.take`` row gathers from a (10000, 500) array.

    Each gather uses a flat array of 10 random row indices.
    """
    table = mx.random.uniform(shape=(10000, 500))
    index_rows = mx.random.randint(low=0, high=10000, shape=(20, 10))
    ids = [mx.reshape(row, (-1,)) for row in index_rows]
    mx.eval(ids)

    def random_take():
        return [mx.take(table, idx, 0) for idx in ids]

    time_fn(random_take)


def time_reshape_transposed():
    """Time flattening a (256, 256, 128) array after swapping its first two axes."""
    source = mx.random.uniform(shape=(256, 256, 128))
    mx.eval(source)

    def reshape_transposed():
        swapped = mx.transpose(source, (1, 0, 2))
        return mx.reshape(swapped, (-1,))

    time_fn(reshape_transposed)


if __name__ == "__main__":
    # The first positional argument of ArgumentParser is ``prog``; pass the
    # text as ``description`` so it appears as help text instead of replacing
    # the program name in the usage line.
    parser = argparse.ArgumentParser(description="MLX benchmarks.")
    parser.add_argument("--gpu", action="store_true", help="Use the Metal back-end.")
    args = parser.parse_args()
    if args.gpu:
        mx.set_default_device(mx.gpu)
    else:
        mx.set_default_device(mx.cpu)

    time_add()
    time_matmul()
    time_min()
    time_max()
    time_maximum()
    time_exp()
    time_negative()
    time_logsumexp()
    time_take()
    time_reshape_transposed()


================================================
FILE: benchmarks/python/slice_update_bench.py
================================================
# Copyright © 2023-2024 Apple Inc.

import argparse

import mlx.core as mx
import torch
from time_utils import measure_runtime


def benchmark_slice_update_mlx(dst_shape, slice_shape, slice_range, dtype, iters=10):
    """Time ``iters`` chained ``dst.at[slice_range].add(updates)`` calls in MLX.

    Returns:
        ``(runtime, bandwidth_gb_s)``: the mean milliseconds per timed call
        reported by ``measure_runtime`` and the bandwidth derived from it. The
        byte count per update is the slice counted twice plus the updates.
    """
    mx_dtype = getattr(mx, dtype)
    arguments = {
        "dst": mx.random.normal(dst_shape).astype(mx_dtype),
        "updates": mx.random.normal(slice_shape).astype(mx_dtype),
    }

    def slice_update(arguments):
        updates = arguments["updates"]
        for _ in range(iters):
            arguments["dst"] = arguments["dst"].at[slice_range].add(updates)
        mx.eval(arguments)

    runtime = measure_runtime(slice_update, arguments=arguments)

    bytes_per_update = (
        arguments["dst"][slice_range].nbytes * 2 + arguments["updates"].nbytes
    )
    bytes_processed = bytes_per_update * iters
    # runtime is in milliseconds, so bytes / ms / 1e6 gives GB/s.
    bandwidth_gb_s = bytes_processed / runtime / 1e6
    return runtime, bandwidth_gb_s


def benchmark_slice_update_torch(
    dst_shape, slice_shape, slice_range, device, dtype, iters=10
):
    """Time ``iters`` in-place slice additions in PyTorch on ``device``.

    The timed function synchronizes the device after its updates on both MPS
    and CUDA. Without the CUDA synchronization the measurement would only
    cover queueing the work, not its completion.

    Returns:
        ``(runtime, bandwidth_gb_s)``: the mean milliseconds per timed call
        reported by ``measure_runtime`` and the bandwidth derived from it.
    """
    # Compare on the device type so indexed devices such as "mps:0" or
    # "cuda:1" are synchronized too.
    device_type = torch.device(device).type

    def slice_update(dst, updates, slice_range):
        for i in range(iters):
            dst[slice_range] = dst[slice_range] + updates
        if device_type == "mps":
            torch.mps.synchronize()
        elif device_type == "cuda":
            torch.cuda.synchronize()

    dtype = getattr(torch, dtype)
    updates = torch.randn(slice_shape, dtype=dtype).to(device)
    dst = torch.randn(dst_shape, dtype=dtype).to(device)

    runtime = measure_runtime(
        slice_update, dst=dst, updates=updates, slice_range=slice_range
    )
    bytes_processed = (dst[slice_range].nbytes * 2 + updates.nbytes) * iters
    # runtime is in milliseconds, so bytes / ms / 1e6 gives GB/s.
    bandwidth_gb_s = bytes_processed / runtime / 1e6
    return runtime, bandwidth_gb_s


if __name__ == "__main__":
    # The first positional argument of ArgumentParser is ``prog``; pass the
    # text as ``description`` so it appears as help text instead of replacing
    # the program name in the usage line.
    parser = argparse.ArgumentParser(description="Slice update benchmarks.")
    parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
    args = parser.parse_args()

    if args.cpu:
        mx.set_default_device(mx.cpu)
        device = torch.device("cpu")
    # torch.backends.mps.is_available() is the documented availability probe
    # for the MPS backend.
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        raise ValueError(
            "No MPS or CUDA device is available; pass --cpu to run on the CPU."
        )

    dtypes = ["float32", "bfloat16"]

    # Each case is (destination shape, slice selecting the region, update shape).
    test_cases = [
        ((10_000_000,), slice(0, 1_000_000), (1_000_000,)),
        ((100_000,), slice(10_000, 20_000), (10_000,)),
        ((1000, 64), slice(100, 200), (100, 64)),
        ((100, 100, 64), slice(20, 40), (20, 100, 64)),
        (
            (2048, 2048, 128),
            (slice(500, 1500), slice(200, 1200), slice(32, 96)),
            (1000, 1000, 64),
        ),
        (
            (2048, 2048, 128),
            (slice(1800, 1850), slice(100, 200), slice(64, 128)),
            (50, 100, 64),
        ),
        (
            (2048, 2048, 128),
            (slice(1000, 1010), slice(1000, 1010), slice(64, 128)),
            (10, 10, 64),
        ),
    ]

    print(
        f"{'Dtype':<12} {'Dst Shape':<25} {'Update Shape':<20} "
        f"{'MLX (ms)':<12} {'MLX GB/s':<12} {'Torch (ms)':<12} {'Torch GB/s':<12}"
    )
    print("-" * 110)

    for dtype in dtypes:
        for dst_shape, slice_range, update_shape in test_cases:
            mlx_time, mlx_bw = benchmark_slice_update_mlx(
                dst_shape, update_shape, slice_range, dtype
            )
            torch_time, torch_bw = benchmark_slice_update_torch(
                dst_shape, update_shape, slice_range, device, dtype
            )
            print(
                f"{dtype:<12} {str(dst_shape):<25} {str(update_shape):<20} "
                f"{mlx_time:<12.3f} {mlx_bw:<12.2f} {torch_time:<12.3f} {torch_bw:<12.2f}"
            )


================================================
FILE: benchmarks/python/synchronize_bench.py
================================================
import time

import mlx.core as mx

# Call mx.distributed.init() once at import time and cache this process's
# rank; the benchmarks below print their result only when rank is 0.
rank = mx.distributed.init().rank()


def timeit(fn, a):
    """Return the mean wall-clock time, in milliseconds, of evaluating ``fn(a)``.

    Five untimed warmup evaluations are run first, followed by ten timed
    evaluations whose total duration is averaged.
    """
    warmup_rounds, timed_rounds = 5, 10

    # Warmup evaluations are excluded from the measurement.
    for _ in range(warmup_rounds):
        mx.eval(fn(a))

    start = time.perf_counter()
    for _ in range(timed_rounds):
        mx.eval(fn(a))
    elapsed = time.perf_counter() - start

    return 1000 * elapsed / timed_rounds


def all_reduce_benchmark():
    """Benchmark ``mx.distributed.all_sum`` on a 5x5 int32 array of ones.

    Every timed evaluation chains 100 ``all_sum`` calls, each followed by a
    subtraction of 1, so the time reported by ``timeit`` is divided by 100 to
    obtain a per-iteration figure. Only rank 0 prints the result.
    """
    chained_calls = 100
    ones = mx.ones((5, 5), mx.int32)

    def chained_all_sum(x):
        for _ in range(chained_calls):
            x = mx.distributed.all_sum(x) - 1
        return x

    per_call_ms = timeit(chained_all_sum, ones) / chained_calls
    if rank != 0:
        return
    print(f"All Reduce: time per iteration {per_call_ms:.6f} (ms)")


def all_gather_benchmark():
    """Benchmark ``mx.distributed.all_gather`` starting from a 5x5 int32 array.

    Every timed evaluation chains 100 ``all_gather`` calls, keeping only index
    0 of each gathered result as the next input. The time reported by
    ``timeit`` is divided by 100 to obtain a per-iteration figure. Only rank 0
    prints the result.
    """
    chained_calls = 100
    ones = mx.ones((5, 5), mx.int32)

    def chained_all_gather(x):
        for _ in range(chained_calls):
            gathered = mx.distributed.all_gather(x)
            x = gathered[0]
        return x

    per_call_ms = timeit(chained_all_gather, ones) / chained_calls
    if rank != 0:
        return
    print(f"All gather: time per iteration {per_call_ms:.6f} (ms)")


# Run both collective benchmarks, in order, when executed as a script.
if __name__ == "__main__":
    all_reduce_benchmark()
    all_gather_benchmark()


================================================
FILE: benchmarks/python/time_utils.py
================================================
# Copyright © 2023-2024 Apple Inc.

import time

import mlx.core as mx


def time_fn(fn, *args, **kwargs):
    """Time ``fn(*args, **kwargs)`` and print the mean latency in milliseconds.

    Args:
        fn: Callable whose result is passed to ``mx.eval``.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``. The optional ``msg``
            keyword is consumed here and used as the label in the output; it
            is not forwarded. Without it the label falls back to
            ``fn.__name__``, or ``repr(fn)`` for callables that have no
            ``__name__`` (for example ``functools.partial`` objects).

    The function runs 5 untimed warmup evaluations followed by 100 timed ones
    and prints ``"Timing <label> ... <msec> msec"``.
    """
    msg = kwargs.pop("msg", None)
    label = msg if msg else getattr(fn, "__name__", repr(fn))
    # Flush so the label is visible while the (possibly slow) timing runs,
    # rather than only once the trailing newline is printed.
    print(f"Timing {label} ...", end=" ", flush=True)

    # warmup
    for _ in range(5):
        mx.eval(fn(*args, **kwargs))

    num_iters = 100
    tic = time.perf_counter()
    for _ in range(num_iters):
        mx.eval(fn(*args, **kwargs))
    toc = time.perf_counter()

    msec = 1e3 * (toc - tic) / num_iters
    print(f"{msec:.5f} msec")


def measure_runtime(fn, **kwargs):
    """Return the mean wall-clock time of ``fn(**kwargs)`` in milliseconds.

    ``fn`` is called 5 times as an untimed warmup and then 100 more times
    under the timer; the total elapsed time is averaged over those 100 calls.
    """
    warmup_calls, timed_calls = 5, 100

    for _ in range(warmup_calls):
        fn(**kwargs)

    start = time.perf_counter()
    for _ in range(timed_calls):
        fn(**kwargs)
    elapsed_s = time.perf_counter() - start

    return elapsed_s * 1000 / timed_calls


================================================
FILE: cmake/FindCUDNN.cmake
================================================
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Modified from
# https://github.com/NVIDIA/cudnn-frontend/blob/main/cmake/cuDNN.cmake

# Return the last file matching the pattern.
#
# Sets VAR in the caller's scope to the last entry of the list produced by
# file(GLOB) for PATTERN. VAR is left untouched when nothing matches.
function(find_file_glob VAR PATTERN)
  file(GLOB _RESULT "${PATTERN}")
  if(_RESULT)
    # list() takes the *name* of the list variable. Passing ${_RESULT} would
    # hand it the matched paths themselves, so the last element would never be
    # selected (and several matches would be an argument-count error).
    list(GET _RESULT -1 _LAST_MATCH)
    set(${VAR}
        "${_LAST_MATCH}"
        PARENT_SCOPE)
  endif()
endfunction()

# Find the dir including the "cudnn.h" file.
#
# Callers can steer the search with CUDNN_INCLUDE_PATH; the CUDA toolkit
# include directories are also used as hints.
# NOTE(review): OPTIONAL follows PATH_SUFFIXES here; on CMake versions that do
# not know an OPTIONAL keyword for find_path it is presumably read as one more
# suffix to try -- confirm against the minimum supported CMake version.
find_path(
  CUDNN_INCLUDE_DIR cudnn.h
  HINTS ${CUDNN_INCLUDE_PATH} ${CUDAToolkit_INCLUDE_DIRS}
  PATH_SUFFIXES include OPTIONAL)

# Glob searching "cudnn.h" for Windows.
#
# Only used when find_path() above came back empty. The pattern looks under
# "C:/Program Files/NVIDIA/CUDNN" for a per-version include directory whose
# name starts with the CUDA toolkit major version.
if(WIN32 AND NOT CUDNN_INCLUDE_DIR)
  find_file_glob(
    CUDNN_H_PATH
    "C:/Program Files/NVIDIA/CUDNN/*/include/${CUDAToolkit_VERSION_MAJOR}.*/cudnn.h"
  )
  if(CUDNN_H_PATH)
    # Keep the directory that contains the header, not the header itself.
    get_filename_component(CUDNN_INCLUDE_DIR "${CUDNN_H_PATH}" DIRECTORY)
  endif()
endif()

# The header is mandatory: stop configuring if neither lookup found it.
if(NOT CUDNN_INCLUDE_DIR)
  message(
    FATAL_ERROR
      "Unable to find cudnn.h, please make sure cuDNN is installed and pass CUDNN_INCLUDE_PATH to cmake."
  )
endif()

# Get cudnn version.
#
# Extracts the integer after "#define CUDNN_MAJOR" in cudnn_version.h into
# CUDNN_MAJOR_VERSION. The digit class is [0-9] so that versions containing a
# zero (e.g. 10) are captured in full instead of being cut at the first 0.
file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version.h" cudnn_version_header)
string(REGEX MATCH "#define CUDNN_MAJOR [0-9]+" macrodef
             "${cudnn_version_header}")
string(REGEX MATCH "[0-9]+" CUDNN_MAJOR_VERSION "${macrodef}")

# Function for searching library files.
#
# find_cudnn_library(<NAME> [OPTIONAL])
#
# Looks for the cuDNN component library <NAME>. When it is found, an imported
# target CUDNN::<NAME> is created and <NAME>_LIBRARY is set in the caller's
# scope. When it is not found, configuration stops with a fatal error unless
# the second argument is exactly "OPTIONAL", in which case only a status
# message is printed and no target is created.
function(find_cudnn_library NAME)
  # Anything other than a literal "OPTIONAL" second argument means required.
  if(NOT "${ARGV1}" STREQUAL "OPTIONAL")
    set(_CUDNN_REQUIRED TRUE)
  else()
    set(_CUDNN_REQUIRED FALSE)
  endif()

  # Accept both the plain library name and the versioned shared object name
  # that embeds the cuDNN major version.
  find_library(
    ${NAME}_LIBRARY
    NAMES ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}" NAMES_PER_DIR
    HINTS ${CUDNN_LIBRARY_PATH} ${CUDAToolkit_LIBRARY_DIR}
    PATH_SUFFIXES lib64 lib/x64 lib OPTIONAL)

  # Windows fallback: glob under "C:/Program Files/NVIDIA/CUDNN" for the
  # import library in a directory named after the CUDA toolkit major version.
  if(WIN32 AND NOT ${NAME}_LIBRARY)
    find_file_glob(
      ${NAME}_LIBRARY
      "C:/Program Files/NVIDIA/CUDNN/*/lib/${CUDAToolkit_VERSION_MAJOR}.*/x64/${NAME}.lib"
    )
  endif()

  if(NOT ${NAME}_LIBRARY AND ${_CUDNN_REQUIRED})
    message(
      FATAL_ERROR
        "Unable to find ${NAME}, please make sure cuDNN is installed and pass CUDNN_LIBRARY_PATH to cmake."
    )
  endif()

  if(${NAME}_LIBRARY)
    # Expose the library as an imported target that also carries the cuDNN
    # include directory.
    add_library(CUDNN::${NAME} UNKNOWN IMPORTED)
    set_target_properties(
      CUDNN::${NAME}
      PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}
                 IMPORTED_LOCATION ${${NAME}_LIBRARY})
    # Make the resolved path visible to the caller as well.
    set(${NAME}_LIBRARY
        "${${NAME}_LIBRARY}"
        PARENT_SCOPE)
  else()
    message(STATUS "${NAME} not found.")
  endif()
endfunction()

# Search for the main cudnn library.
# No OPTIONAL argument is given, so a missing library is a fatal error.
find_cudnn_library(cudnn)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CUDNN REQUIRED_VARS CUDNN_INCLUDE_DIR
                                                      cudnn_LIBRARY)

# Additionally record the result as an internal cache entry named CUDNN_FOUND.
if(CUDNN_INCLUDE_DIR AND cudnn_LIBRARY)
  set(CUDNN_FOUND
      ON
      CACHE INTERNAL "cuDNN Library Found")
else()
  set(CUDNN_FOUND
      OFF
      CACHE INTERNAL "cuDNN Library Not Found")
endif()

# Find out all the DLL files for Windows.
#
# The bin directory is derived from the library directory by replacing "/lib/"
# with "/bin/"; CUDNN_DLL_NAMES receives file names relative to that directory.
if(WIN32 AND cudnn_LIBRARY)
  get_filename_component(CUDNN_BIN_DIR "${cudnn_LIBRARY}" DIRECTORY)
  string(REPLACE "/lib/" "/bin/" CUDNN_BIN_DIR "${CUDNN_BIN_DIR}")
  file(
    GLOB CUDNN_DLL_NAMES
    RELATIVE "${CUDNN_BIN_DIR}"
    "${CUDNN_BIN_DIR}/*.dll")
endif()

# Create an interface library that users can link with.
# It starts with the main library only; further components are appended to it
# after the version-specific lookups.
add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
target_link_libraries(CUDNN::cudnn_all INTERFACE CUDNN::cudnn)
target_include_directories(
  CUDNN::cudnn_all INTERFACE $<INSTALL_INTERFACE:include>
                             $<BUILD_INTERFACE:${CUDNN_INCLUDE_DIR}>)

# Add other components of cudnn.
#
# Components looked up without OPTIONAL abort configuration when missing, so
# their CUDNN::<name> targets always exist by the time they are linked.
# OPTIONAL components only get a target when the library was found, so each is
# linked only if its target exists: naming a missing "::" target in
# target_link_libraries() is an error once something links CUDNN::cudnn_all.
if(CUDNN_MAJOR_VERSION EQUAL 8)
  find_cudnn_library(cudnn_adv_infer)
  find_cudnn_library(cudnn_adv_train)
  find_cudnn_library(cudnn_cnn_infer)
  find_cudnn_library(cudnn_cnn_train)
  find_cudnn_library(cudnn_ops_infer)
  find_cudnn_library(cudnn_ops_train)

  target_link_libraries(
    CUDNN::cudnn_all
    INTERFACE CUDNN::cudnn_adv_train CUDNN::cudnn_ops_train
              CUDNN::cudnn_cnn_train CUDNN::cudnn_adv_infer
              CUDNN::cudnn_cnn_infer CUDNN::cudnn_ops_infer)

elseif(CUDNN_MAJOR_VERSION EQUAL 9)
  # Required components.
  find_cudnn_library(cudnn_graph)
  find_cudnn_library(cudnn_engines_runtime_compiled)
  target_link_libraries(
    CUDNN::cudnn_all INTERFACE CUDNN::cudnn_graph
                               CUDNN::cudnn_engines_runtime_compiled)

  # Optional components, searched and linked in the original order.
  foreach(_cudnn_optional_component
          cudnn_ops cudnn_cnn cudnn_adv cudnn_engines_precompiled
          cudnn_heuristic)
    find_cudnn_library(${_cudnn_optional_component} OPTIONAL)
    if(TARGET CUDNN::${_cudnn_optional_component})
      target_link_libraries(CUDNN::cudnn_all
                            INTERFACE CUDNN::${_cudnn_optional_component})
    endif()
  endforeach()
endif()


================================================
FILE: cmake/FindNCCL.cmake
================================================
# FindNCCL.cmake This module finds the NVIDIA NCCL library and its include
# directories.

# Inputs read by this module: the NCCL_ROOT_DIR and USE_STATIC_NCCL environment
# variables, plus the NCCL_INCLUDE_DIR, NCCL_LIB_DIR and CUDA_TOOLKIT_ROOT_DIR
# CMake variables. Results: NCCL_INCLUDE_DIRS, NCCL_LIBRARIES and, when the
# header declares it, NCCL_MAJOR_VERSION.

# Seed the NCCL_ROOT_DIR cache entry from the environment variable of the same
# name.
set(NCCL_ROOT_DIR
    $ENV{NCCL_ROOT_DIR}
    CACHE PATH "Folder contains NVIDIA NCCL")

# Locate the directory that contains nccl.h.
find_path(
  NCCL_INCLUDE_DIRS
  NAMES nccl.h
  HINTS ${NCCL_INCLUDE_DIR} ${NCCL_ROOT_DIR} ${NCCL_ROOT_DIR}/include
        ${CUDA_TOOLKIT_ROOT_DIR}/include)

# Pick the library file name: the static archive when the USE_STATIC_NCCL
# environment variable evaluates to true, the regular "nccl" name otherwise.
if($ENV{USE_STATIC_NCCL})
  message(
    STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
  set(NCCL_LIBNAME "libnccl_static.a")
else()
  set(NCCL_LIBNAME "nccl")
endif()

# Locate the library itself in the usual lib directories below the hints.
find_library(
  NCCL_LIBRARIES
  NAMES ${NCCL_LIBNAME}
  HINTS ${NCCL_LIB_DIR}
        ${NCCL_ROOT_DIR}
        ${NCCL_ROOT_DIR}/lib
        ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
        ${NCCL_ROOT_DIR}/lib64
        ${CUDA_TOOLKIT_ROOT_DIR}/lib
        ${CUDA_TOOLKIT_ROOT_DIR}/lib64)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS
                                  NCCL_LIBRARIES)

if(NCCL_FOUND)
  set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
  message(
    STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}")
  # Grab the first "#define NCCL_MAJOR <number>" line of the header.
  file(
    STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED
    REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$"
    LIMIT_COUNT 1)
  if(NCCL_MAJOR_VERSION_DEFINED)
    # Strip the "#define NCCL_MAJOR" prefix, leaving what follows it.
    string(REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" ""
                         NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED})
    message(STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}")
  endif()
  message(
    STATUS
      "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()


================================================
FILE: cmake/Findnvpl.cmake
================================================
# This file intentionally does nothing. It exists only to suppress the cmake
# warning "By not providing Findnvpl.cmake in CMAKE_MODULE_PATH...", which is
# triggered by the find_package(nvpl) call in cmake's builtin FindLAPACK.cmake
# module.


================================================
FILE: cmake/extension.cmake
================================================
include(CMakeParseArguments)

# cmake-format: off
#
# ##############################################################################
# Build metal library
#
# Adds a custom target ${TARGET} to build ${OUTPUT_DIRECTORY}/${TITLE}.metallib
# from list ${SOURCES}, including list ${INCLUDE_DIRS}, depends on list ${DEPS}
#
# Args:
#   TARGET: Custom target to be added for the metal library
#   TITLE: Name of the .metallib
#   OUTPUT_DIRECTORY: Where to place ${TITLE}.metallib
#   SOURCES: List of source files
#   INCLUDE_DIRS: List of include dirs
#   DEPS: List of dependency files (like headers)
#   DEBUG: Boolean, if true, enables debug compile options for this specific
#          library. If not provided, uses global MLX_METAL_DEBUG.
#
# cmake-format: on

macro(mlx_build_metallib)
  # Parse args
  # Every keyword is parsed into a variable prefixed with MTLLIB_.
  set(oneValueArgs TARGET TITLE OUTPUT_DIRECTORY DEBUG)
  set(multiValueArgs SOURCES INCLUDE_DIRS DEPS)
  cmake_parse_arguments(MTLLIB "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Set output
  set(MTLLIB_BUILD_TARGET "${MTLLIB_OUTPUT_DIRECTORY}/${MTLLIB_TITLE}.metallib")

  # Collect compile options
  # Debug flags are appended when either the global MLX_METAL_DEBUG or this
  # call's DEBUG argument is true.
  set(MTLLIB_COMPILE_OPTIONS -Wall -Wextra -fno-fast-math -Wno-c++17-extensions)
  if(MLX_METAL_DEBUG OR MTLLIB_DEBUG)
    set(MTLLIB_COMPILE_OPTIONS ${MTLLIB_COMPILE_OPTIONS} -gline-tables-only
                               -frecord-sources)
  endif()

  # Prepare metallib build command
  # The generator expression prefixes every include dir with -I, and
  # COMMAND_EXPAND_LISTS splits the resulting list into separate arguments.
  add_custom_command(
    OUTPUT ${MTLLIB_BUILD_TARGET}
    COMMAND
      xcrun -sdk macosx metal
      "$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
      ${MTLLIB_COMPILE_OPTIONS} ${MTLLIB_SOURCES} -o ${MTLLIB_BUILD_TARGET}
    DEPENDS ${MTLLIB_DEPS} ${MTLLIB_SOURCES}
    COMMAND_EXPAND_LISTS
    COMMENT "Building ${MTLLIB_TITLE}.metallib"
    VERBATIM)

  # Add metallib custom target
  add_custom_target(${MTLLIB_TARGET} DEPENDS ${MTLLIB_BUILD_TARGET})

endmacro(mlx_build_metallib)


================================================
FILE: docs/.clang-format
================================================
DisableFormat: true
SortIncludes: Never


================================================
FILE: docs/.gitignore
================================================
src/python/_autosummary*/
src/python/nn/_autosummary*/
src/python/optimizers/_autosummary*/


================================================
FILE: docs/.nojekyll
================================================


================================================
FILE: docs/Doxyfile
================================================
################################################################################
# Primary project setup.                                                       #
################################################################################

PROJECT_NAME           = "MLX"
OUTPUT_DIRECTORY       = build
XML_OUTPUT             = xml
HTML_OUTPUT            = html
STRIP_FROM_PATH        = ../
INPUT                  = ../mlx
FILE_PATTERNS          = *.h
EXCLUDE_PATTERNS       = */private/*
CREATE_SUBDIRS         = NO
FULL_PATH_NAMES        = YES
RECURSIVE              = YES
GENERATE_HTML          = NO
GENERATE_LATEX         = NO
GENERATE_XML           = YES
XML_PROGRAMLISTING     = YES

################################################################################
# Doxygen preprocessor / parser control.                                       #
################################################################################

ENABLE_PREPROCESSING   = YES
MACRO_EXPANSION        = YES
EXPAND_ONLY_PREDEF     = NO
SKIP_FUNCTION_MACROS   = NO
PREDEFINED             = MLX_API=

################################################################################
# Compound extraction control.                                                 #
################################################################################

EXTRACT_ALL            = YES
EXTRACT_PACKAGE        = YES
EXTRACT_STATIC         = YES
CASE_SENSE_NAMES       = NO

################################################################################
# Docstring control / customization.                                           #
################################################################################

JAVADOC_AUTOBRIEF      = YES

################################################################################
# Warning suppression.                                                         #
################################################################################

QUIET                  = YES
WARN_IF_UNDOCUMENTED   = NO


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SOURCEDIR     = src
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/README.md
================================================
## Build the Docs

### Setup (do once)

Install Doxygen:

```
brew install doxygen
```

Install Python packages:

```
pip install -r requirements.txt
```

### Build

Build the docs from `mlx/docs/`

```
doxygen && make html
```

View the docs by running a server in `mlx/docs/build/html/`:

```
python -m http.server <port>
```

and point your browser to `http://localhost:<port>`.

### Push to GitHub Pages

Check-out the `gh-pages` branch (`git switch gh-pages`) and build
the docs. Then force add the `build/html` directory:

`git add -f build/html`

Commit and push the changes to the `gh-pages` branch.

## Doc Development Setup

To enable live refresh of docs while writing:

Install sphinx autobuild
```
pip install sphinx-autobuild
```

Run auto build on docs/src folder
```
sphinx-autobuild ./src ./build/html
```


================================================
FILE: docs/index.html
================================================
<meta http-equiv="refresh" content="0; url=./build/html/index.html" />


================================================
FILE: docs/requirements.txt
================================================
sphinx
breathe
sphinx-book-theme
sphinx-copybutton
mlx


================================================
FILE: docs/src/_templates/module-base-class.rst
================================================
{{ fullname | escape | underline}}

.. currentmodule:: {{ module }}

.. add toctree option to make autodoc generate the pages

.. autoclass:: {{ objname }}

   {% block attributes %}
   {% if attributes %}
   .. rubric:: Attributes

   .. autosummary::
      :toctree: .
   {% for item in attributes %}
      ~{{ fullname }}.{{ item }}
   {%- endfor %}
   {% endif %}
   {% endblock %}

   {% block methods %}
   {% if methods %}
   .. rubric:: Methods

   .. autosummary::
      :toctree: .
   {% for item in methods %}
      {%- if item not in inherited_members and item != '__init__' %}
      ~{{ fullname }}.{{ item }}
      {%- endif -%}
   {%- endfor %}
   {% endif %}
   {% endblock %}


================================================
FILE: docs/src/_templates/nn-module-template.rst
================================================
{{ fullname | escape | underline}}

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

   {% block methods %}

   {% if methods %}
   .. rubric:: {{ _('Methods') }}

   .. autosummary::
   {% for item in methods %}
      {%- if item not in inherited_members and item != "__init__" %}
         ~{{ name }}.{{ item }}
      {%- endif %}
   {%- endfor %}
   {% endif %}
   {% endblock %}


================================================
FILE: docs/src/_templates/optimizers-template.rst
================================================
{{ fullname | escape | underline}}

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}

   {% block methods %}

   {% if methods %}
   .. rubric:: {{ _('Methods') }}

   .. autosummary::
   {% for item in methods %}
      {%- if item not in inherited_members %}
         ~{{ name }}.{{ item }}
      {%- endif %}
   {%- endfor %}
   {% endif %}
   {% endblock %}


================================================
FILE: docs/src/conf.py
================================================
# Copyright © 2023 Apple Inc.

# -*- coding: utf-8 -*-

import os
import subprocess

import mlx.core as mx

# -- Project information -----------------------------------------------------

project = "MLX"
copyright = "2023, Apple"
author = "MLX Contributors"
# Keep at most the first three dot-separated components of the installed MLX
# version string.
version = ".".join(mx.__version__.split(".")[:3])
release = version

# -- General configuration ---------------------------------------------------

extensions = [
    "sphinx_copybutton",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.intersphinx",
    "sphinx.ext.napoleon",
    "breathe",
]

python_use_unqualified_type_names = True
autosummary_generate = True
# Give the generated page for mlx.core.Stream an explicit file name.
autosummary_filename_map = {"mlx.core.Stream": "stream_class"}

intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "numpy": ("https://numpy.org/doc/stable/", None),
}

# Breathe reads Doxygen XML from ../build/xml (the Doxyfile in docs/ sets
# OUTPUT_DIRECTORY = build and XML_OUTPUT = xml).
breathe_projects = {"mlx": "../build/xml"}
breathe_default_project = "mlx"

templates_path = ["_templates"]
html_static_path = ["_static"]
source_suffix = ".rst"
# NOTE(review): Sphinx's documented option for the root document is `root_doc`
# (formerly `master_doc`); `main_doc` appears to act only as a local variable
# reused by `latex_documents` further down -- confirm.
main_doc = "index"
highlight_language = "python"
pygments_style = "sphinx"
add_module_names = False

# -- Options for HTML output -------------------------------------------------

html_theme = "sphinx_book_theme"

html_theme_options = {
    "show_toc_level": 2,
    "repository_url": "https://github.com/ml-explore/mlx",
    "use_repository_button": True,
    "navigation_with_keys": False,
    "logo": {
        "image_light": "_static/mlx_logo.png",
        "image_dark": "_static/mlx_logo_dark.png",
    },
}

# Reuse the light-theme logo image as the favicon.
html_favicon = html_theme_options["logo"]["image_light"]

# -- Options for HTMLHelp output ---------------------------------------------

htmlhelp_basename = "mlx_doc"


def setup(app):
    from sphinx.util import inspect

    wrapped_isfunc = inspect.isfunction

    def isfunc(obj):
        type_name = str(type(obj))
        if "nanobind.nb_method" in type_name or "nanobind.nb_func" in type_name:
            return True
        return wrapped_isfunc(obj)

    inspect.isfunction = isfunc


# -- Options for LaTeX output ------------------------------------------------

latex_documents = [(main_doc, "MLX.tex", "MLX Documentation", author, "manual")]
# Extra LaTeX preamble: loads enumitem, raises the list depth to 5, and gives
# itemize levels 1 through 5 a bullet label.
latex_elements = {
    "preamble": r"""
    \usepackage{enumitem}
    \setlistdepth{5}
    \setlist[itemize,1]{label=$\bullet$}
    \setlist[itemize,2]{label=$\bullet$}
    \setlist[itemize,3]{label=$\bullet$}
    \setlist[itemize,4]{label=$\bullet$}
    \setlist[itemize,5]{label=$\bullet$}
    \renewlist{itemize}{itemize}{5}
""",
}


================================================
FILE: docs/src/cpp/ops.rst
================================================
.. _cpp_ops:

Operations
==========

.. doxygengroup:: ops
   :content-only:


================================================
FILE: docs/src/dev/custom_metal_kernels.rst
================================================
.. _custom_metal_kernels:

Custom Metal Kernels
====================

MLX supports writing custom Metal kernels through the Python and C++ APIs.

Simple Example
--------------

.. currentmodule:: mlx.core

Let's write a custom kernel that computes ``exp`` elementwise:

.. code-block:: python

  source = """
      uint elem = thread_position_in_grid.x;
      T tmp = inp[elem];
      out[elem] = metal::exp(tmp);
  """

  kernel = mx.fast.metal_kernel(
      name="myexp",
      input_names=["inp"],
      output_names=["out"],
      source=source,
  )

  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
          grid=(a.size, 1, 1),
          threadgroup=(256, 1, 1),
          output_shapes=[a.shape],
          output_dtypes=[a.dtype],
      )
      return outputs[0]

  a = mx.random.normal(shape=(4, 16)).astype(mx.float16)
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

Every time you make a kernel, a new Metal library is created and possibly
JIT compiled. To reduce the overhead from that, build the kernel once with
:func:`fast.metal_kernel` and then use it many times.

.. note::
   Only pass the body of the Metal kernel in ``source``. The function
   signature is generated automatically.

The full function signature will be generated using:

* The shapes/dtypes of ``inputs``
    In the above, ``a`` is an ``mx.array`` of type ``mx.float16`` and we pass it with the key ``inp``
    so we will add ``const device float16_t* inp`` to the signature.
    ``inp_shape``, ``inp_strides`` and ``inp_ndim`` are also added for convenience if they are present
    in ``source``.
* The list of ``output_dtypes``
    In the above, ``out`` is an ``mx.array`` of type ``mx.float16``
    so we add ``device float16_t* out``.
* Template parameters passed using ``template``
    In the above, ``template=[("T", mx.float32)]`` adds a template of ``template <typename T>`` to the function
    and instantiates the template with ``custom_kernel_myexp_float<float>``.
    Template parameters can be ``mx.core.Dtype``, ``int`` or ``bool``.
* Metal attributes used in ``source`` such as ``[[thread_position_in_grid]]``
    These will be added as function arguments.
    All the attributes defined in Table 5.8 of the `Metal Shading Language Specification <https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf>`_ are supported.

Putting this all together, the generated function signature for ``myexp`` is as follows:

.. code-block:: cpp

  template <typename T>
  [[kernel]] void custom_kernel_myexp_float(
    const device float16_t* inp [[buffer(0)]],
    device float16_t* out [[buffer(1)]],
    uint3 thread_position_in_grid [[thread_position_in_grid]]) {

          uint elem = thread_position_in_grid.x;
          T tmp = inp[elem];
          out[elem] = metal::exp(tmp);

  }

  template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float<float>) custom_kernel_myexp_float<float>;

Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads
<https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/2866532-dispatchthreads>`_
function. This means we will launch ``mx.prod(grid)`` threads, subdivided into
``threadgroup`` size threadgroups.  For optimal performance, each thread group
dimension should be less than or equal to the corresponding grid dimension.

Passing ``verbose=True`` to :func:`fast.metal_kernel.__call__` will print the
generated code for debugging purposes.

Using Shape/Strides
-------------------

:func:`fast.metal_kernel` supports an argument ``ensure_row_contiguous`` which
is ``True`` by default. This will copy the array inputs if needed
before the kernel is launched to ensure that the memory layout is row
contiguous.  Generally this makes writing the kernel easier, since we don't
have to worry about gaps or the ordering of the dims when indexing.

If we want to avoid this copy, :func:`fast.metal_kernel` automatically passes
``a_shape``, ``a_strides`` and ``a_ndim`` for each input array ``a`` if any are
present in ``source``. We can then use MLX's built in indexing utils to fetch
the right elements for each thread.

Let's convert ``myexp`` above to support arbitrarily strided arrays without
relying on a copy from ``ensure_row_contiguous``:

.. code-block:: python
   
  source = """
      uint elem = thread_position_in_grid.x;
      // Utils from `mlx/backend/metal/kernels/utils.h` are automatically included
      uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
      T tmp = inp[loc];
      // Output arrays are always row contiguous
      out[elem] = metal::exp(tmp);
  """

  kernel = mx.fast.metal_kernel(
      name="myexp_strided",
      input_names=["inp"],
      output_names=["out"],
      source=source,
      ensure_row_contiguous=False,
  )

  def exp_elementwise(a: mx.array):
      outputs = kernel(
          inputs=[a],
          template=[("T", mx.float32)],
          grid=(a.size, 1, 1),
          threadgroup=(256, 1, 1),
          output_shapes=[a.shape],
          output_dtypes=[a.dtype],
      )
      return outputs[0]

  a = mx.random.normal(shape=(4, 16)).astype(mx.float16)
  # make non-contiguous
  a = a[::2]
  b = exp_elementwise(a)
  assert mx.allclose(b, mx.exp(a))

Complex Example
-----------------------------

Let's implement a more complex example: ``grid_sample`` in ``"bilinear"`` mode.

We'll start with the following MLX implementation using standard ops:

.. code-block:: python

  def grid_sample_ref(x, grid):
      N, H_in, W_in, _ = x.shape
      ix = ((grid[..., 0] + 1) * W_in - 1) / 2
      iy = ((grid[..., 1] + 1) * H_in - 1) / 2

      ix_nw = mx.floor(ix).astype(mx.int32)
      iy_nw = mx.floor(iy).astype(mx.int32)

      ix_ne = ix_nw + 1
      iy_ne = iy_nw

      ix_sw = ix_nw
      iy_sw = iy_nw + 1

      ix_se = ix_nw + 1
      iy_se = iy_nw + 1

      nw = (ix_se - ix)    * (iy_se - iy)
      ne = (ix    - ix_sw) * (iy_sw - iy)
      sw = (ix_ne - ix)    * (iy    - iy_ne)
      se = (ix    - ix_nw) * (iy    - iy_nw)

      I_nw = x[mx.arange(N)[:, None, None], iy_nw, ix_nw, :]
      I_ne = x[mx.arange(N)[:, None, None], iy_ne, ix_ne, :]
      I_sw = x[mx.arange(N)[:, None, None], iy_sw, ix_sw, :]
      I_se = x[mx.arange(N)[:, None, None], iy_se, ix_se, :]

      mask_nw = (iy_nw >= 0) & (iy_nw <= H_in - 1) & (ix_nw >= 0) & (ix_nw <= W_in - 1)
      mask_ne = (iy_ne >= 0) & (iy_ne <= H_in - 1) & (ix_ne >= 0) & (ix_ne <= W_in - 1)
      mask_sw = (iy_sw >= 0) & (iy_sw <= H_in - 1) & (ix_sw >= 0) & (ix_sw <= W_in - 1)
      mask_se = (iy_se >= 0) & (iy_se <= H_in - 1) & (ix_se >= 0) & (ix_se <= W_in - 1)

      I_nw *= mask_nw[..., None]
      I_ne *= mask_ne[..., None]
      I_sw *= mask_sw[..., None]
      I_se *= mask_se[..., None]

      output = nw[..., None] * I_nw + ne[..., None] * I_ne + sw[..., None] * I_sw + se[..., None] * I_se

      return output

Now let's use :func:`custom_function` together with :func:`fast.metal_kernel`
to write a fast GPU kernel for both the forward and backward passes.

First we'll implement the forward pass as a fused kernel:

.. code-block:: python

  source = """
      uint elem = thread_position_in_grid.x;
      int H = x_shape[1];
      int W = x_shape[2];
      int C = x_shape[3];
      int gH = grid_shape[1];
      int gW = grid_shape[2];

      int w_stride = C;
      int h_stride = W * w_stride;
      int b_stride = H * h_stride;

      uint grid_idx = elem / C * 2;
      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

      int ix_nw = floor(ix);
      int iy_nw = floor(iy);

      int ix_ne = ix_nw + 1;
      int iy_ne = iy_nw;

      int ix_sw = ix_nw;
      int iy_sw = iy_nw + 1;

      int ix_se = ix_nw + 1;
      int iy_se = iy_nw + 1;

      T nw = (ix_se - ix)    * (iy_se - iy);
      T ne = (ix    - ix_sw) * (iy_sw - iy);
      T sw = (ix_ne - ix)    * (iy    - iy_ne);
      T se = (ix    - ix_nw) * (iy    - iy_nw);

      int batch_idx = elem / C / gH / gW * b_stride;
      int channel_idx = elem % C;
      int base_idx = batch_idx + channel_idx;

      T I_nw = x[base_idx + iy_nw * h_stride + ix_nw * w_stride];
      T I_ne = x[base_idx + iy_ne * h_stride + ix_ne * w_stride];
      T I_sw = x[base_idx + iy_sw * h_stride + ix_sw * w_stride];
      T I_se = x[base_idx + iy_se * h_stride + ix_se * w_stride];

      I_nw = iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1 ? I_nw : 0;
      I_ne = iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1 ? I_ne : 0;
      I_sw = iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1 ? I_sw : 0;
      I_se = iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1 ? I_se : 0;

      out[elem] = nw * I_nw + ne * I_ne + sw * I_sw + se * I_se;
  """

  kernel = mx.fast.metal_kernel(
      name="grid_sample",
      input_names=["x", "grid"],
      output_names=["out"],
      source=source,
  )

  @mx.custom_function
  def grid_sample(x, grid):

      assert x.ndim == 4, "`x` must be 4D."
      assert grid.ndim == 4, "`grid` must be 4D."

      B, _, _, C = x.shape
      _, gN, gM, D = grid.shape
      out_shape = (B, gN, gM, C)

      assert D == 2, "Last dim of `grid` must be size 2."

      outputs = kernel(
          inputs=[x, grid],
          template=[("T", x.dtype)],
          output_shapes=[out_shape],
          output_dtypes=[x.dtype],
          grid=(np.prod(out_shape), 1, 1),
          threadgroup=(256, 1, 1),
      )
      return outputs[0]

For a reasonably sized input such as:

.. code-block:: python

  x.shape = (8, 1024, 1024, 64)
  grid.shape = (8, 256, 256, 2)

On an M1 Max, we see a big performance improvement:

``55.7ms -> 6.7ms => 8x speed up``

Grid Sample VJP
---------------

Since we decorated ``grid_sample`` with :func:`custom_function`, we can now
define its custom vjp transform so MLX can differentiate it.

The backwards pass requires atomically updating ``x_grad``/``grid_grad`` and so
requires a few extra :func:`fast.metal_kernel` features:

* ``init_value=0``
    Initialize all of the kernel's outputs to this value before it runs. This allows us to update only part of the output arrays with the kernel.

* ``atomic_outputs=True``
    Designate all of the kernel outputs as ``atomic`` in the function signature. 
    This means we can use Metal's ``atomic`` features to simultaneously update the ``x_grad`` and ``grid_grad`` arrays from multiple threadgroups. 
    See section 6.15 of the `Metal Shading Language Specification <https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf>`_ for more details.

We can then implement the backwards pass as follows:

.. code-block:: python

  # Metal body of the backward kernel. Each thread handles one
  # (grid location, padded channel) pair: it scatters its share of the
  # cotangent into `x_grad` and accumulates its contribution to `grid_grad`.
  source = """
      uint elem = thread_position_in_grid.x;
      int H = x_shape[1];
      int W = x_shape[2];
      int C = x_shape[3];
      // Pad C to the nearest larger simdgroup size multiple
      int C_padded = ceildiv(C, threads_per_simdgroup) * threads_per_simdgroup;

      int gH = grid_shape[1];
      int gW = grid_shape[2];

      int w_stride = C;
      int h_stride = W * w_stride;
      int b_stride = H * h_stride;

      uint grid_idx = elem / C_padded * 2;
      float ix = ((grid[grid_idx] + 1) * W - 1) / 2;
      float iy = ((grid[grid_idx + 1] + 1) * H - 1) / 2;

      int ix_nw = floor(ix);
      int iy_nw = floor(iy);

      int ix_ne = ix_nw + 1;
      int iy_ne = iy_nw;

      int ix_sw = ix_nw;
      int iy_sw = iy_nw + 1;

      int ix_se = ix_nw + 1;
      int iy_se = iy_nw + 1;

      T nw = (ix_se - ix)    * (iy_se - iy);
      T ne = (ix    - ix_sw) * (iy_sw - iy);
      T sw = (ix_ne - ix)    * (iy    - iy_ne);
      T se = (ix    - ix_nw) * (iy    - iy_nw);

      int batch_idx = elem / C_padded / gH / gW * b_stride;
      int channel_idx = elem % C_padded;
      int base_idx = batch_idx + channel_idx;

      T gix = T(0);
      T giy = T(0);
      if (channel_idx < C) {
          int cot_index = elem / C_padded * C + channel_idx;
          T cot = cotangent[cot_index];
          if (iy_nw >= 0 && iy_nw <= H - 1 && ix_nw >= 0 && ix_nw <= W - 1) {
              int offset = base_idx + iy_nw * h_stride + ix_nw * w_stride;
              atomic_fetch_add_explicit(&x_grad[offset], nw * cot, memory_order_relaxed);

              T I_nw = x[offset];
              gix -= I_nw * (iy_se - iy) * cot;
              giy -= I_nw * (ix_se - ix) * cot;
          }
          if (iy_ne >= 0 && iy_ne <= H - 1 && ix_ne >= 0 && ix_ne <= W - 1) {
              int offset = base_idx + iy_ne * h_stride + ix_ne * w_stride;
              atomic_fetch_add_explicit(&x_grad[offset], ne * cot, memory_order_relaxed);

              T I_ne = x[offset];
              gix += I_ne * (iy_sw - iy) * cot;
              giy -= I_ne * (ix - ix_sw) * cot;
          }
          if (iy_sw >= 0 && iy_sw <= H - 1 && ix_sw >= 0 && ix_sw <= W - 1) {
              int offset = base_idx + iy_sw * h_stride + ix_sw * w_stride;
              atomic_fetch_add_explicit(&x_grad[offset], sw * cot, memory_order_relaxed);

              T I_sw = x[offset];
              gix -= I_sw * (iy - iy_ne) * cot;
              giy += I_sw * (ix_ne - ix) * cot;
          }
          if (iy_se >= 0 && iy_se <= H - 1 && ix_se >= 0 && ix_se <= W - 1) {
              int offset = base_idx + iy_se * h_stride + ix_se * w_stride;
              atomic_fetch_add_explicit(&x_grad[offset], se * cot, memory_order_relaxed);

              T I_se = x[offset];
              gix += I_se * (iy - iy_nw) * cot;
              giy += I_se * (ix - ix_nw) * cot;
          }
      }

      // d(ix)/d(grid_x) = W / 2 and d(iy)/d(grid_y) = H / 2.
      // Divide in floating point: integer division truncates for odd W or H.
      T gix_mult = W / 2.0f;
      T giy_mult = H / 2.0f;

      // Reduce across each simdgroup first.
      // This is much faster than relying purely on atomics.
      gix = simd_sum(gix);
      giy = simd_sum(giy);

      if (thread_index_in_simdgroup == 0) {
          atomic_fetch_add_explicit(&grid_grad[grid_idx], gix * gix_mult, memory_order_relaxed);
          atomic_fetch_add_explicit(&grid_grad[grid_idx + 1], giy * giy_mult, memory_order_relaxed);
      }
  """
  # Wrap the backward `source` in a kernel named "grid_sample_grad". It reads
  # `x`, `grid` and `cotangent` and writes `x_grad` and `grid_grad`, both of
  # which are declared atomic so the kernel can accumulate into them.
  kernel = mx.fast.metal_kernel(
      name="grid_sample_grad",
      input_names=["x", "grid", "cotangent"],
      output_names=["x_grad", "grid_grad"],
      source=source,
      atomic_outputs=True,
  )

  @grid_sample.vjp
  def grid_sample_vjp(primals, cotangent, _):
      """Custom vjp of ``grid_sample``.

      ``primals`` is the pair ``(x, grid)``. Returns the pair
      ``(x_grad, grid_grad)`` with the shapes of ``x`` and ``grid`` and the
      dtype of ``x``.
      """
      x, grid = primals
      batch, _, _, channels = x.shape
      _, grid_rows, grid_cols, coord_dim = grid.shape

      assert coord_dim == 2, "Last dim of `grid` must be size 2."

      # Round the channel count up to a whole number of simdgroups so that
      # the `simd_sum`s of different grid locations don't overlap.
      simd_width = 32
      n_simdgroups = (channels + simd_width - 1) // simd_width
      padded_channels = n_simdgroups * simd_width
      n_threads = batch * grid_rows * grid_cols * padded_channels

      # Both outputs start at 0 and are accumulated into by the kernel.
      x_grad, grid_grad = kernel(
          inputs=[x, grid, cotangent],
          template=[("T", x.dtype)],
          output_shapes=[x.shape, grid.shape],
          output_dtypes=[x.dtype, x.dtype],
          grid=(n_threads, 1, 1),
          threadgroup=(256, 1, 1),
          init_value=0,
      )
      return x_grad, grid_grad

There's an even larger speed up for the vjp:

``676.4ms -> 16.7ms => 40x speed up``


================================================
FILE: docs/src/dev/extensions.rst
================================================
Custom Extensions in MLX
========================

You can extend MLX with custom operations on the CPU or GPU. This guide
explains how to do that with a simple example.

Introducing the Example
-----------------------

Let's say you would like an operation that takes in two arrays, ``x`` and
``y``, scales them both by coefficients ``alpha`` and ``beta`` respectively,
and then adds them together to get the result ``z = alpha * x + beta * y``.
You can do that in MLX directly:

.. code-block:: python

    import mlx.core as mx

    def simple_axpby(x: mx.array, y: mx.array, alpha: float, beta: float) -> mx.array:
        """Return ``alpha * x + beta * y`` built from existing operations."""
        scaled_x = alpha * x
        scaled_y = beta * y
        return scaled_x + scaled_y

This function performs that operation while leaving the implementation and
function transformations to MLX.

However, you may want to customize the underlying implementation, perhaps to
make it faster. In this tutorial we will go through adding custom extensions.
It will cover:

* The structure of the MLX library.
* Implementing a CPU operation.
* Implementing a GPU operation using metal.
* Adding the ``vjp`` and ``jvp`` function transformation.
* Building a custom extension and binding it to python.

Operations and Primitives
-------------------------

Operations in MLX build the computation graph. Primitives provide the rules for
evaluating and transforming the graph. Let's start by discussing operations in
more detail.

Operations
^^^^^^^^^^^

Operations are the front-end functions that operate on arrays. They are defined
in the C++ API (:ref:`cpp_ops`), and the Python API (:ref:`ops`) binds them.

We would like an operation :meth:`axpby` that takes in two arrays, ``x`` and
``y``, and two scalars, ``alpha`` and ``beta``. This is how to define it in
C++:

.. code-block:: C++

    /**
    *  Scale and sum two arrays element-wise:
    *  z = alpha * x + beta * y
    *
    *  Uses NumPy-style broadcasting between x and y.
    *  Inputs are upcast to floating point if needed.
    *
    *  Returns the array z.
    **/
    array axpby(
        const array& x, // Input array x
        const array& y, // Input array y
        const float alpha, // Scaling factor for x
        const float beta, // Scaling factor for y
        StreamOrDevice s = {} // Stream on which to schedule the operation
    );

The simplest way to implement this is with existing operations:

.. code-block:: C++

    array axpby(
        const array& x, // Input array x
        const array& y, // Input array y
        const float alpha, // Scaling factor for x
        const float beta, // Scaling factor for y
        StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
    ) {
        // alpha * x and beta * y, both scheduled on the stream `s`
        auto scaled_x = multiply(array(alpha), x, s);
        auto scaled_y = multiply(array(beta), y, s);

        // Their sum, on the same stream, is the result
        return add(scaled_x, scaled_y, s);
    }

The operations themselves do not contain the implementations that act on the
data, nor do they contain the rules of transformations. Rather, they are an
easy-to-use interface that uses :class:`Primitive` building blocks.

Primitives
^^^^^^^^^^^

A :class:`Primitive` is part of the computation graph of an :class:`array`. It
defines how to create output arrays given input arrays. Further, a
:class:`Primitive` has methods to run on the CPU or GPU and for function
transformations such as ``vjp`` and ``jvp``.  Let's go back to our example to be
more concrete:

.. code-block:: C++

    /**
    * Primitive for z = alpha * x + beta * y. The scalars alpha and beta are
    * stored as parameters of the primitive; x and y are its array inputs.
    */
    class Axpby : public Primitive {
      public:
        explicit Axpby(Stream stream, float alpha, float beta)
            : Primitive(stream), alpha_(alpha), beta_(beta){};

        /**
        * A primitive must know how to evaluate itself on the CPU/GPU
        * for the given inputs and populate the output array.
        *
        * To avoid unnecessary allocations, the evaluation function
        * is responsible for allocating space for the array.
        */
        void eval_cpu(
            const std::vector<array>& inputs,
            std::vector<array>& outputs) override;
        void eval_gpu(
            const std::vector<array>& inputs,
            std::vector<array>& outputs) override;

        /** The Jacobian-vector product. */
        std::vector<array> jvp(
            const std::vector<array>& primals,
            const std::vector<array>& tangents,
            const std::vector<int>& argnums) override;

        /** The vector-Jacobian product. */
        std::vector<array> vjp(
            const std::vector<array>& primals,
            const std::vector<array>& cotangents,
            const std::vector<int>& argnums,
            const std::vector<array>& outputs) override;

        /**
        * The primitive must know how to vectorize itself across
        * the given axes. The output is a pair containing the arrays
        * representing the vectorized computation and the axes which
        * correspond to the output vectorized dimensions.
        */
        std::pair<std::vector<array>, std::vector<int>> vmap(
            const std::vector<array>& inputs,
            const std::vector<int>& axes) override;

        /** The name of the primitive. */
        const char* name() const override {
          return "Axpby";
        }

        /** Equivalence check **/
        bool is_equivalent(const Primitive& other) const override;

      private:
        // Scaling factors for x and y respectively
        float alpha_;
        float beta_;
    };

The :class:`Axpby` class derives from the base :class:`Primitive` class. The
:class:`Axpby` treats ``alpha`` and ``beta`` as parameters. It then provides
implementations of how the output array is produced given the inputs through
:meth:`Axpby::eval_cpu` and :meth:`Axpby::eval_gpu`. It also provides rules
of transformations in :meth:`Axpby::jvp`, :meth:`Axpby::vjp`, and
:meth:`Axpby::vmap`.

Using the Primitive
^^^^^^^^^^^^^^^^^^^

Operations can use this :class:`Primitive` to add a new :class:`array` to the
computation graph. An :class:`array` can be constructed by providing its data
type, shape, the :class:`Primitive` that computes it, and the :class:`array`
inputs that are passed to the primitive.

Let's reimplement our operation now in terms of our :class:`Axpby` primitive.

.. code-block:: C++

    array axpby(
        const array& x, // Input array x
        const array& y, // Input array y
        const float alpha, // Scaling factor for x
        const float beta, // Scaling factor for y
        StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
    ) {
        // Find the common dtype of x and y
        auto common_dtype = promote_types(x.dtype(), y.dtype());

        // Resolve the output dtype: promote the common dtype with float32
        // unless `issubdtype(common_dtype, float32)` already holds
        auto out_dtype = !issubdtype(common_dtype, float32)
            ? promote_types(common_dtype, float32)
            : common_dtype;

        // Cast both inputs to the output dtype (on the same stream s)
        auto x_cast = astype(x, out_dtype, s);
        auto y_cast = astype(y, out_dtype, s);

        // Broadcast the two inputs against each other (on the same stream s);
        // the broadcast shape is the shape of the output
        auto operands = broadcast_arrays({x_cast, y_cast}, s);
        auto out_shape = operands[0].shape();

        // The output array is computed by the Axpby primitive from the
        // cast and broadcast inputs
        return array(
            /* shape = */ out_shape,
            /* dtype = */ out_dtype,
            /* std::shared_ptr<Primitive> primitive = */
            std::make_shared<Axpby>(to_stream(s), alpha, beta),
            /* inputs = */ operands);
    }


This operation now handles the following:

#. Upcast inputs and resolve the output data type.
#. Broadcast the inputs and resolve the output shape.
#. Construct the primitive :class:`Axpby` using the given stream, ``alpha``, and ``beta``.
#. Construct the output :class:`array` using the primitive and the inputs.

Implementing the Primitive
--------------------------

No computation happens when we call the operation alone. The operation only
builds the computation graph. When we evaluate the output array, MLX schedules
the execution of the computation graph, and calls :meth:`Axpby::eval_cpu` or
:meth:`Axpby::eval_gpu` depending on the stream/device specified by the user.

.. warning::
    When :meth:`Primitive::eval_cpu` or :meth:`Primitive::eval_gpu` are called,
    no memory has been allocated for the output array. It falls on the implementation
    of these functions to allocate memory as needed.

Implementing the CPU Back-end
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Let's start by implementing :meth:`Axpby::eval_cpu`.

The method will go over each element of the output array, find the
corresponding input elements of ``x`` and ``y`` and perform the operation
point-wise. This is captured in the templated function :meth:`axpby_impl`.

.. code-block:: C++

  template <typename T>
  void axpby_impl(
      const mx::array& x,
      const mx::array& y,
      mx::array& out,
      float alpha_,
      float beta_,
      mx::Stream stream) {
    // The output has no memory yet, so allocate it first
    out.set_data(mx::allocator::malloc(out.nbytes()));

    // Register the arrays the task reads and writes with the CPU encoder
    auto& cpu_encoder = mx::cpu::get_command_encoder(stream);
    cpu_encoder.set_input_array(x);
    cpu_encoder.set_input_array(y);
    cpu_encoder.set_output_array(out);

    // Dispatch the element-wise task; everything it uses is captured by value
    cpu_encoder.dispatch([x_data = x.data<T>(),
                          y_data = y.data<T>(),
                          out_data = out.data<T>(),
                          n_elems = out.size(),
                          dims = out.shape(),
                          x_str = x.strides(),
                          y_str = y.strides(),
                          alpha_,
                          beta_]() {

      // Bring the scalars to the element type
      T a = static_cast<T>(alpha_);
      T b = static_cast<T>(beta_);

      for (size_t i = 0; i < n_elems; i++) {
        // Translate the linear output index into offsets into x and y
        auto x_loc = mx::elem_to_loc(i, dims, x_str);
        auto y_loc = mx::elem_to_loc(i, dims, y_str);

        // The output is indexed linearly, so it needs no such mapping
        out_data[i] = a * x_data[x_loc] + b * y_data[y_loc];
      }
    });
  }

Our implementation should work for all incoming floating point arrays.
Accordingly, we add dispatches for ``float32``, ``float16``, ``bfloat16`` and
``complex64``. We throw an error if we encounter an unexpected type.

.. code-block:: C++

    void Axpby::eval_cpu(
        const std::vector<mx::array>& inputs,
        std::vector<mx::array>& outputs) {
      auto& x = inputs[0];
      auto& y = inputs[1];
      auto& out = outputs[0];

      // Pick the implementation that matches the output dtype
      const auto dtype = out.dtype();
      if (dtype == mx::float32) {
        return axpby_impl<float>(x, y, out, alpha_, beta_, stream());
      }
      if (dtype == mx::float16) {
        return axpby_impl<mx::float16_t>(x, y, out, alpha_, beta_, stream());
      }
      if (dtype == mx::bfloat16) {
        return axpby_impl<mx::bfloat16_t>(x, y, out, alpha_, beta_, stream());
      }
      if (dtype == mx::complex64) {
        return axpby_impl<mx::complex64_t>(x, y, out, alpha_, beta_, stream());
      }

      // Any other dtype is rejected
      throw std::runtime_error(
          "Axpby is only supported for floating point types.");
    }

Just this much is enough to run the operation :meth:`axpby` on a CPU stream! If
you do not plan on running the operation on the GPU or using transforms on
computation graphs that contain :class:`Axpby`, you can stop implementing the
primitive here.

Implementing the GPU Back-end
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Apple silicon devices address their GPUs using the Metal_ shading language, and
GPU kernels in MLX are written using Metal.

.. note::

    Here are some helpful resources if you are new to Metal:

    * A walkthrough of the metal compute pipeline: `Metal Example`_
    * Documentation for metal shading language: `Metal Specification`_
    * Using metal from C++: `Metal-cpp`_

Let's keep the GPU kernel simple. We will launch exactly as many threads as
there are elements in the output. Each thread will pick the element it needs
from ``x`` and ``y``, do the point-wise operation, and update its assigned
element in the output.

.. code-block:: C++

    template <typename T>
    [[kernel]] void axpby_general(
            device const T* x [[buffer(0)]],
            device const T* y [[buffer(1)]],
            device T* out [[buffer(2)]],
            constant const float& alpha [[buffer(3)]],
            constant const float& beta [[buffer(4)]],
            constant const int* shape [[buffer(5)]],
            constant const int64_t* x_strides [[buffer(6)]],
            constant const int64_t* y_strides [[buffer(7)]],
            constant const int& ndim [[buffer(8)]],
            uint index [[thread_position_in_grid]]) {
        // Locate this thread's operands in x and y from its linear index
        auto x_loc = elem_to_loc(index, shape, x_strides, ndim);
        auto y_loc = elem_to_loc(index, shape, y_strides, ndim);

        // Bring the scalars to the element type
        T a = static_cast<T>(alpha);
        T b = static_cast<T>(beta);

        // Write this thread's element of the output
        out[index] = a * x[x_loc] + b * y[y_loc];
    }

We then need to instantiate this template for all floating point types and give
each instantiation a unique host name so we can identify it.

.. code-block:: C++

    // One instantiation per element type. The string is the host name under
    // which the instantiation is looked up: "axpby_general_" + type name.
    instantiate_kernel("axpby_general_float32", axpby_general, float)
    instantiate_kernel("axpby_general_float16", axpby_general, float16_t)
    instantiate_kernel("axpby_general_bfloat16", axpby_general, bfloat16_t)
    instantiate_kernel("axpby_general_complex64", axpby_general, complex64_t)

The logic to determine the kernel, set the inputs, resolve the grid dimensions,
and dispatch to the GPU are contained in :meth:`Axpby::eval_gpu` as shown
below.

.. code-block:: C++

    /**
    * Evaluate primitive on GPU
    *
    * Allocates the output, looks up the "axpby_general_<type>" kernel in the
    * "mlx_ext" Metal library and launches one thread per output element.
    */
    void Axpby::eval_gpu(
      const std::vector<array>& inputs,
      std::vector<array>& outputs) {
        // Prepare inputs
        assert(inputs.size() == 2);
        auto& x = inputs[0];
        auto& y = inputs[1];
        auto& out = outputs[0];

        // Each primitive carries the stream it should execute on
        // and each stream carries its device identifiers
        auto& s = stream();
        // We get the needed metal device using the stream
        auto& d = metal::device(s.device);

        // Allocate output memory
        out.set_data(allocator::malloc(out.nbytes()));

        // Resolve name of kernel; it must match one of the host names
        // given to the kernel instantiations
        std::string kname = "axpby_general_" + type_to_name(out);

        // Load the metal library
        auto lib = d.get_library("mlx_ext", current_binary_dir());

        // Make a kernel from this metal library
        auto kernel = d.get_kernel(kname, lib);

        // Prepare to encode kernel
        auto& compute_encoder = d.get_command_encoder(s.index);
        compute_encoder.set_compute_pipeline_state(kernel);

        // Kernel parameters are registered with buffer indices corresponding to
        // those in the kernel declaration at axpby.metal
        int ndim = out.ndim();
        size_t nelem = out.size();

        // Encode input arrays to kernel
        compute_encoder.set_input_array(x, 0);
        compute_encoder.set_input_array(y, 1);

        // Encode output arrays to kernel
        compute_encoder.set_output_array(out, 2);

        // Encode alpha and beta
        compute_encoder.set_bytes(alpha_, 3);
        compute_encoder.set_bytes(beta_, 4);

        // Encode shape, strides and ndim. The shape and both stride vectors
        // are passed element by element with set_vector_bytes; set_bytes is
        // for single values such as ndim
        compute_encoder.set_vector_bytes(x.shape(), 5);
        compute_encoder.set_vector_bytes(x.strides(), 6);
        compute_encoder.set_vector_bytes(y.strides(), 7);
        compute_encoder.set_bytes(ndim, 8);

        // We launch 1 thread for each output element and make sure that the
        // number of threads in any given threadgroup is not higher than the
        // max allowed
        size_t tgp_size = std::min(nelem, kernel->maxTotalThreadsPerThreadgroup());

        // Fix the 3D size of each threadgroup (in terms of threads)
        MTL::Size group_dims = MTL::Size(tgp_size, 1, 1);

        // Fix the 3D size of the launch grid (in terms of threads)
        MTL::Size grid_dims = MTL::Size(nelem, 1, 1);

        // Launch the grid with the given number of threads divided among
        // the given threadgroups
        compute_encoder.dispatch_threads(grid_dims, group_dims);
    }

We can now call the :meth:`axpby` operation on both the CPU and the GPU!

A few things to note about MLX and Metal before moving on. MLX keeps track of
the active ``command_buffer`` and the ``MTLCommandBuffer`` to which it is
associated. We rely on :meth:`d.get_command_encoder` to give us the active
metal compute command encoder instead of building a new one and calling
:meth:`compute_encoder->end_encoding` at the end. MLX adds kernels (compute
pipelines) to the active command buffer until some specified limit is hit or
the command buffer needs to be flushed for synchronization.

Primitive Transforms
^^^^^^^^^^^^^^^^^^^^^

Next, let's add implementations for transformations in a :class:`Primitive`.
These transformations can be built on top of other operations, including the
one we just defined:

.. code-block:: C++

    /**
    * The Jacobian-vector product.
    *
    * `tangents` holds one tangent per entry of `argnums`. Returns a single
    * array: the tangent of the output.
    */
    std::vector<array> Axpby::jvp(
            const std::vector<array>& primals,
            const std::vector<array>& tangents,
            const std::vector<int>& argnums) {
        // Forward mode diff that pushes along the tangents
        // The jvp transform on the primitive can be built with ops
        // that are scheduled on the same stream as the primitive

        // If argnums = {0}, we only push along x in which case the
        // jvp is just the tangent scaled by alpha
        // Similarly, if argnums = {1}, the jvp is just the tangent
        // scaled by beta
        // (only one tangent is available here, so tangents[1] must not be read)
        if (argnums.size() == 1) {
            auto scale = argnums[0] == 0 ? alpha_ : beta_;
            auto scale_arr = array(scale, tangents[0].dtype());
            return {multiply(scale_arr, tangents[0], stream())};
        }
        // If argnums = {0, 1}, we take contributions from both
        // which gives us jvp = tangent_x * alpha + tangent_y * beta
        else {
            return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
        }
    }

.. code-block:: C++

    /**
    * The vector-Jacobian product.
    *
    * Returns one array per entry of `argnums`: the cotangent scaled by alpha
    * for argument 0 (x) and by beta otherwise (y).
    */
    std::vector<array> Axpby::vjp(
            const std::vector<array>& primals,
            const std::vector<array>& cotangents,
            const std::vector<int>& argnums,
            const std::vector<array>& /* unused */) {
        // Reverse mode diff
        std::vector<array> vjps;
        for (auto arg : argnums) {
            auto scale = arg == 0 ? alpha_ : beta_;
            auto scale_arr = array(scale, cotangents[0].dtype());
            vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
        }
        return vjps;
    }

Note, a transformation does not need to be fully defined to start using
the :class:`Primitive`.

.. code-block:: C++

    /**
    * Vectorize primitive along given axis.
    *
    * Not implemented in this example: calling it always throws.
    */
    std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
            const std::vector<array>& inputs,
            const std::vector<int>& axes) {
        throw std::runtime_error("[Axpby] vmap not implemented.");
    }

Building and Binding
--------------------

Let's look at the overall directory structure first.

| extensions
| ├── axpby
| │   ├── axpby.cpp
| │   ├── axpby.h
| │   └── axpby.metal
| ├── mlx_sample_extensions
| │   └── __init__.py
| ├── bindings.cpp
| ├── CMakeLists.txt
| └── setup.py

* ``extensions/axpby/`` defines the C++ extension library
* ``extensions/mlx_sample_extensions`` sets out the structure for the
  associated Python package
* ``extensions/bindings.cpp`` provides Python bindings for our operation
* ``extensions/CMakeLists.txt`` holds CMake rules to build the library and
  Python bindings
* ``extensions/setup.py`` holds the ``setuptools`` rules to build and install
  the Python package

Binding to Python
^^^^^^^^^^^^^^^^^^

We use nanobind_ to build a Python API for the C++ library. Since bindings for
components such as :class:`mlx.core.array`, :class:`mlx.core.stream`, etc. are
already provided, adding our :meth:`axpby` is simple.

.. code-block:: C++

   // Define the Python module `_ext` and expose `axpby` in it
   NB_MODULE(_ext, m) {
        m.doc() = "Sample extension for MLX";

        // Python signature: axpby(x, y, alpha, beta, *, stream=None)
        m.def(
            "axpby",
            &axpby,
            "x"_a,
            "y"_a,
            "alpha"_a,
            "beta"_a,
            // Everything after this marker is keyword-only
            nb::kw_only(),
            "stream"_a = nb::none(),
            R"(
                Scale and sum two vectors element-wise
                ``z = alpha * x + beta * y``

                Follows numpy style broadcasting between ``x`` and ``y``
                Inputs are upcasted to floats if needed

                Args:
                    x (array): Input array.
                    y (array): Input array.
                    alpha (float): Scaling factor for ``x``.
                    beta (float): Scaling factor for ``y``.

                Returns:
                    array: ``alpha * x + beta * y``
            )");
    }

Most of the complexity in the above example comes from additional bells and
whistles such as the literal names and doc-strings.

.. warning::

    :mod:`mlx.core` must be imported before importing
    :mod:`mlx_sample_extensions` as defined by the nanobind module above to
    ensure that the casters for :mod:`mlx.core` components like
    :class:`mlx.core.array` are available.

.. _Building with CMake:

Building with CMake
^^^^^^^^^^^^^^^^^^^^

Building the C++ extension library only requires that you ``find_package(MLX
CONFIG)`` and then link it to your library.

.. code-block:: cmake

    # Add the C++ extension library target
    add_library(mlx_ext)

    # Add the sources of the extension
    target_sources(
        mlx_ext
        PUBLIC
        ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.cpp
    )

    # Add include headers (relative to the directory of this CMakeLists.txt)
    target_include_directories(
        mlx_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR}
    )

    # Link to mlx
    target_link_libraries(mlx_ext PUBLIC mlx)

We also need to build the attached Metal library. For convenience, we provide a
:meth:`mlx_build_metallib` function that builds a ``.metallib`` target given
sources, headers, destinations, etc. (defined in ``cmake/extension.cmake`` and
automatically imported with MLX package).

Here is what that looks like in practice:

.. code-block:: cmake

    # Build metallib (only when MLX_BUILD_METAL is set)
    if(MLX_BUILD_METAL)

    mlx_build_metallib(
        TARGET mlx_ext_metallib
        TITLE mlx_ext
        SOURCES ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.metal
        INCLUDE_DIRS ${PROJECT_SOURCE_DIR} ${MLX_INCLUDE_DIRS}
        OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
    )

    # Make the extension library depend on the metallib target
    add_dependencies(
        mlx_ext
        mlx_ext_metallib
    )

    endif()

Finally, we build the nanobind_ bindings

.. code-block:: cmake

    # Build the Python module `_ext` from bindings.cpp
    nanobind_add_module(
      _ext
      NB_STATIC STABLE_ABI LTO NOMINSIZE
      NB_DOMAIN mlx
      ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp
    )
    # Link the bindings against the C++ extension library
    target_link_libraries(_ext PRIVATE mlx_ext)

    # For shared builds, add @loader_path to the rpath of `_ext`
    if(BUILD_SHARED_LIBS)
      target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
    endif()

Building with ``setuptools``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Once we have set out the CMake build rules as described above, we can use the
build utilities defined in :mod:`mlx.extension`:

.. code-block:: python

    from mlx import extension
    from setuptools import setup

    if __name__ == "__main__":
        setup(
            name="mlx_sample_extensions",
            version="0.0.0",
            description="Sample C++ and Metal extensions for MLX primitives.",
            # Build the `_ext` module inside the package with the CMake build
            # utilities from `mlx.extension`
            ext_modules=[extension.CMakeExtension("mlx_sample_extensions._ext")],
            cmdclass={"build_ext": extension.CMakeBuild},
            packages=["mlx_sample_extensions"],
            # Ship the built libraries and the Metal library with the package
            package_data={"mlx_sample_extensions": ["*.so", "*.dylib", "*.metallib"]},
            extras_require={"dev":[]},
            zip_safe=False,
            python_requires=">=3.8",
        )

.. note::
    We treat ``extensions/mlx_sample_extensions`` as the package directory
    even though it only contains a ``__init__.py`` to ensure the following:

    * :mod:`mlx.core` must be imported before importing :mod:`_ext`
    * The C++ extension library and the metal library are co-located with the python
      bindings and copied together if the package is installed

To build the package, first install the build dependencies with ``pip install
-r requirements.txt``.  You can then build inplace for development using
``python setup.py build_ext -j8 --inplace`` (in ``extensions/``)

This results in the directory structure:

| extensions
| ├── mlx_sample_extensions
| │   ├── __init__.py
| │   ├── libmlx_ext.dylib # C++ extension library
| │   ├── mlx_ext.metallib # Metal library
| │   └── _ext.cpython-3x-darwin.so # Python Binding
| ...

When you try to install using the command ``python -m pip install .`` (in
``extensions/``), the package will be installed with the same structure as
``extensions/mlx_sample_extensions`` and the C++ and Metal library will be
copied along with the Python binding since they are specified as
``package_data``.

Usage
-----

After installing the extension as described above, you should be able to simply
import the Python package and play with it as you would any other MLX operation.

Let's look at a simple script and its results:

.. code-block:: python

    import mlx.core as mx
    from mlx_sample_extensions import axpby

    a = mx.ones((3, 4))
    b = mx.ones((3, 4))
    c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)

    print(f"c shape: {c.shape}")
    print(f"c dtype: {c.dtype}")
    print(f"c is correct: {mx.all(c == 6.0).item()}")

Output:

.. code-block::

    c shape: (3, 4)
    c dtype: float32
    c is correct: True

Results
^^^^^^^

Let's run a quick benchmark and see how our new ``axpby`` operation compares
with the naive :meth:`simple_axpby` we first defined.

.. code-block:: python

    import mlx.core as mx
    from mlx_sample_extensions import axpby
    import time

    def simple_axpby(x: mx.array, y: mx.array, alpha: float, beta: float) -> mx.array:
        return alpha * x + beta * y

    M = 4096
    N = 4096

    x = mx.random.normal((M, N))
    y = mx.random.normal((M, N))
    alpha = 4.0
    beta = 2.0

    mx.eval(x, y)

    def bench(f):
        # Warm up
        for i in range(5):
            z = f(x, y, alpha, beta)
            mx.eval(z)

        # Timed run
        s = time.perf_counter()
        for i in range(100):
            z = f(x, y, alpha, beta)
            mx.eval(z)
        e = time.perf_counter()
        return 1000 * (e - s) / 100

    simple_time = bench(simple_axpby)
    custom_time = bench(axpby)

    print(f"Simple axpby: {simple_time:.3f} ms | Custom axpby: {custom_time:.3f} ms")

The results are ``Simple axpby: 1.559 ms | Custom axpby: 0.774 ms``. We see
modest improvements right away!

This operation can now be used to build other operations, in
:class:`mlx.nn.Module` calls, and also as a part of graph transformations like
:meth:`grad`.

Scripts
-------

.. admonition:: Download the code

   The full example code is available in `mlx <https://github.com/ml-explore/mlx/tree/main/examples/extensions/>`_.

.. _Accelerate: https://developer.apple.com/documentation/accelerate/blas?language=objc
.. _Metal: https://developer.apple.com/documentation/metal?language=objc
.. _Metal-cpp: https://developer.apple.com/metal/cpp/
.. _`Metal Specification`: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
.. _`Metal Example`: https://developer.apple.com/documentation/metal/performing_calculations_on_a_gpu?language=objc
.. _nanobind: https://nanobind.readthedocs.io/en/latest/


================================================
FILE: docs/src/dev/metal_debugger.rst
================================================
Metal Debugger
==============

.. currentmodule:: mlx.core

Profiling is a key step for performance optimization. You can build MLX with
the ``MLX_METAL_DEBUG`` option to improve the Metal debugging and
optimization workflow. The ``MLX_METAL_DEBUG`` debug option:

* Records source during Metal compilation, for later inspection while
  debugging.
* Labels Metal objects such as command queues, improving capture readability.

To build with debugging enabled in Python prepend
``CMAKE_ARGS="-DMLX_METAL_DEBUG=ON"`` to the build call.

The :func:`metal.start_capture` function initiates a capture of all MLX GPU
work.

.. note::

   To capture a GPU trace you must run the application with
   ``MTL_CAPTURE_ENABLED=1``.

.. code-block:: python

    import mlx.core as mx

    a = mx.random.uniform(shape=(512, 512))
    b = mx.random.uniform(shape=(512, 512))
    mx.eval(a, b)

    trace_file = "mlx_trace.gputrace"

    # Make sure to run with MTL_CAPTURE_ENABLED=1 and
    # that the path trace_file does not already exist.
    mx.metal.start_capture(trace_file)

    for _ in range(10):
      mx.eval(mx.add(a, b))

    mx.metal.stop_capture()

You can open and replay the GPU trace in Xcode. The ``Dependencies`` view
has a great overview of all operations. Check out the `Metal debugger
documentation`_ for more information.

.. image:: ../_static/metal_debugger/capture.png
    :class: dark-light

Xcode Workflow
--------------

You can skip saving to a path by running within Xcode. First, generate an
Xcode project using CMake.

.. code-block::

    mkdir build && cd build
    cmake .. -DMLX_METAL_DEBUG=ON -G Xcode
    open mlx.xcodeproj

Select the ``metal_capture`` example scheme and run.

.. image:: ../_static/metal_debugger/schema.png
    :class: dark-light

.. _`Metal debugger documentation`: https://developer.apple.com/documentation/xcode/metal-debugger


================================================
FILE: docs/src/dev/metal_logging.rst
================================================
Metal Logging
=============

In debug builds, MLX compiles Metal kernels with ``os_log`` enabled so shader
warnings and debug messages are visible during development.

.. note::
    Metal logging is only available with Metal 3.2 or higher (macOS 15 and up,
    iOS 18 and up).

To enable logging from kernels, first make sure to build in debug mode:

.. code-block:: bash

    DEBUG=1 python -m pip install -e .

Then, in the kernel source code include MLX's logging shim and use
``mlx::os_log``:

.. code-block::

    #include "mlx/backend/metal/kernels/logging.h"

    constant mlx::os_log logger("mlx", "my_kernel");

    kernel void my_kernel(/* ... */) {
    // ...
      logger.log_debug("unexpected state: idx=%u", idx);
    }

When you run the program, set the Metal log level to your desired level and
forward logs to ``stderr``:

.. code-block:: bash

    MTL_LOG_LEVEL=MTLLogLevelDebug MTL_LOG_TO_STDERR=1 python script.py

See the `Metal logging guide`_ for more details.

.. _`Metal logging guide`: https://developer.apple.com/documentation/metal/logging-shader-debug-messages


================================================
FILE: docs/src/dev/mlx_in_cpp.rst
================================================
.. _mlx_in_cpp:

Using MLX in C++
================

You can use MLX in a C++ project with CMake.

.. note::

  This guide is based on the following `example using MLX in C++
  <https://github.com/ml-explore/mlx/tree/main/examples/cmake_project>`_

First install MLX:

.. code-block:: bash

  pip install -U mlx

You can also install the MLX Python package from source or just the C++
library. For more information see the :ref:`documentation on installing MLX
<build_and_install>`.

Next make an example program in ``example.cpp``: 

.. code-block:: C++

  #include <iostream>

  #include "mlx/mlx.h"

  namespace mx = mlx::core;

  int main() {
    auto x = mx::array({1, 2, 3});
    auto y = mx::array({1, 2, 3});
    std::cout << x + y << std::endl;
    return 0;
  }

The next step is to set up a CMake file in ``CMakeLists.txt``:

.. code-block:: cmake

  cmake_minimum_required(VERSION 3.27)

  project(example LANGUAGES CXX)

  set(CMAKE_CXX_STANDARD 20)
  set(CMAKE_CXX_STANDARD_REQUIRED ON)


Depending on how you installed MLX, you may need to tell CMake where to
find it. 

If you installed MLX with Python, then add the following to the CMake file:

.. code-block:: cmake

  find_package(
    Python 3.9
    COMPONENTS Interpreter Development.Module
    REQUIRED)
  execute_process(
    COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
    OUTPUT_STRIP_TRAILING_WHITESPACE
    OUTPUT_VARIABLE MLX_ROOT)

If you installed the MLX C++ package to a system path, then CMake should be
able to find it. If you installed it to a non-standard location or CMake can't
find MLX then set ``MLX_ROOT`` to the location where MLX is installed:

.. code-block:: cmake

  set(MLX_ROOT "/path/to/mlx/")

Next, instruct CMake to find MLX:

.. code-block:: cmake

  find_package(MLX CONFIG REQUIRED)

Finally, add the ``example.cpp`` program as an executable and link MLX.

.. code-block:: cmake

  add_executable(example example.cpp)
  target_link_libraries(example PRIVATE mlx)

You can build the example with:

.. code-block:: bash

  cmake -B build -DCMAKE_BUILD_TYPE=Release
  cmake --build build

And run it with:

.. code-block:: bash

  ./build/example

Note that ``find_package(MLX CONFIG REQUIRED)`` sets the following variables:

.. list-table:: Package Variables
   :widths: 20 20 
   :header-rows: 1

   * - Variable 
     - Description 
   * - MLX_FOUND
     - ``True`` if MLX is found
   * - MLX_INCLUDE_DIRS
     - Include directory
   * - MLX_LIBRARIES
     - Libraries to link against
   * - MLX_CXX_FLAGS
     - Additional compiler flags
   * - MLX_BUILD_ACCELERATE
     - ``True`` if MLX was built with Accelerate 
   * - MLX_BUILD_METAL
     - ``True`` if MLX was built with Metal


================================================
FILE: docs/src/examples/data_parallelism.rst
================================================
.. _data_parallelism:

Data Parallelism
================

MLX enables efficient data parallel distributed training through its
distributed communication primitives.

.. _training_example:

Training Example
----------------

In this section we will adapt an MLX training loop to support data parallel
distributed training. Namely, we will average the gradients across a set of
hosts before applying them to the model.

Our training loop looks like the following code snippet if we omit the model,
dataset, and optimizer initialization.

.. code:: python

    model = ...
    optimizer = ...
    dataset = ...

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
        optimizer.update(model, grads)
        return loss

    for x, y in dataset:
        loss = step(model, x, y)
        mx.eval(loss, model.parameters())

All we have to do to average the gradients across machines is perform an
:func:`all_sum` and divide by the size of the :class:`Group`. Namely, we
have to :func:`mlx.utils.tree_map` the gradients with the following function.

.. code:: python

    def all_avg(x):
        return mx.distributed.all_sum(x) / mx.distributed.init().size()

Putting everything together our training loop step looks as follows with
everything else remaining the same.

.. code:: python

    from mlx.utils import tree_map

    def all_reduce_grads(grads):
        N = mx.distributed.init().size()
        if N == 1:
            return grads
        return tree_map(
            lambda x: mx.distributed.all_sum(x) / N,
            grads
        )

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
        grads = all_reduce_grads(grads)  # <--- This line was added
        optimizer.update(model, grads)
        return loss

Using ``nn.average_gradients``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Although the code example above works correctly, it performs one communication
per gradient. It is significantly more efficient to aggregate several gradients
together and perform fewer communication steps.

This is the purpose of :func:`mlx.nn.average_gradients`. The final code looks
almost identical to the example above:

.. code:: python

    model = ...
    optimizer = ...
    dataset = ...

    def step(model, x, y):
        loss, grads = loss_grad_fn(model, x, y)
        grads = mx.nn.average_gradients(grads)  # <---- This line was added
        optimizer.update(model, grads)
        return loss

    for x, y in dataset:
        loss = step(model, x, y)
        mx.eval(loss, model.parameters())


================================================
FILE: docs/src/examples/linear_regression.rst
================================================
.. _linear_regression:

Linear Regression
-----------------

Let's implement a basic linear regression model as a starting point to
learn MLX. First import the core package and set up some problem metadata:

.. code-block:: python

  import mlx.core as mx

  num_features = 100
  num_examples = 1_000
  num_iters = 10_000  # iterations of SGD
  lr = 0.01  # learning rate for SGD


We'll generate a synthetic dataset by:

1. Sampling the design matrix ``X``.
2. Sampling a ground truth parameter vector ``w_star``.
3. Computing the dependent values ``y`` by adding Gaussian noise to ``X @ w_star``.

.. code-block:: python

  # True parameters
  w_star = mx.random.normal((num_features,))

  # Input examples (design matrix)
  X = mx.random.normal((num_examples, num_features))

  # Noisy labels
  eps = 1e-2 * mx.random.normal((num_examples,))
  y = X @ w_star + eps


We will use SGD to find the optimal weights. To start, define the squared loss
and get the gradient function of the loss with respect to the parameters.

.. code-block:: python

  def loss_fn(w):
      return 0.5 * mx.mean(mx.square(X @ w - y))

  grad_fn = mx.grad(loss_fn)

Start the optimization by initializing the parameters ``w`` randomly. Then
repeatedly update the parameters for ``num_iters`` iterations. 

.. code-block:: python

  w = 1e-2 * mx.random.normal((num_features,))

  for _ in range(num_iters):
      grad = grad_fn(w)
      w = w - lr * grad
      mx.eval(w)

Finally, compute the loss of the learned parameters and verify that they are
close to the ground truth parameters.

.. code-block:: python

  loss = loss_fn(w)
  error_norm = mx.sum(mx.square(w - w_star)).item() ** 0.5

  print(
      f"Loss {loss.item():.5f}, |w-w*| = {error_norm:.5f}, "
  )
  # Should print something close to: Loss 0.00005, |w-w*| = 0.00364

Complete `linear regression
<https://github.com/ml-explore/mlx/tree/main/examples/python/linear_regression.py>`_
and `logistic regression
<https://github.com/ml-explore/mlx/tree/main/examples/python/logistic_regression.py>`_
examples are available in the MLX GitHub repo.


================================================
FILE: docs/src/examples/llama-inference.rst
================================================
LLM inference
==============

MLX enables efficient inference of large-ish transformers on Apple silicon
without compromising on ease of use. In this example we will create an
inference script for the Llama family of transformer models in which the model
is defined in fewer than 200 lines of Python.

Implementing the model
----------------------

We will use the neural network building blocks defined in the :mod:`mlx.nn`
module to concisely define the model architecture. 

Attention layer
^^^^^^^^^^^^^^^^

We will start with the Llama attention layer which notably uses the RoPE
positional encoding. [1]_ In addition, our attention layer will optionally use a
key/value cache that will be concatenated with the provided keys and values to
support efficient inference.

Our implementation uses :class:`mlx.nn.Linear` for all the projections and
:class:`mlx.nn.RoPE` for the positional encoding.

.. code-block:: python

    import mlx.core as mx
    import mlx.nn as nn

    class LlamaAttention(nn.Module):
        def __init__(self, dims: int, num_heads: int):
            super().__init__()

            self.num_heads = num_heads

            self.rope = nn.RoPE(dims // num_heads, traditional=True)
            self.query_proj = nn.Linear(dims, dims, bias=False)
            self.key_proj = nn.Linear(dims, dims, bias=False)
            self.value_proj = nn.Linear(dims, dims, bias=False)
            self.out_proj = nn.Linear(dims, dims, bias=False)

        def __call__(self, queries, keys, values, mask=None, cache=None):
            queries = self.query_proj(queries)
            keys = self.key_proj(keys)
            values = self.value_proj(values)

            # Extract some shapes
            num_heads = self.num_heads
            B, L, D = queries.shape

            # Prepare the queries, keys and values for the attention computation
            queries = queries.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3)
            keys = keys.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3)
            values = values.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3)

            # Add RoPE to the queries and keys and combine them with the cache
            if cache is not None:
                key_cache, value_cache = cache
                queries = self.rope(queries, offset=key_cache.shape[2])
                keys = self.rope(keys, offset=key_cache.shape[2])
                keys = mx.concatenate([key_cache, keys], axis=2)
                values = mx.concatenate([value_cache, values], axis=2)
            else:
                queries = self.rope(queries)
                keys = self.rope(keys)

            # Finally perform the attention computation
            scale = math.sqrt(1 / queries.shape[-1])
            scores = (queries * scale) @ keys.transpose(0, 1, 3, 2)
            if mask is not None:
                scores = scores + mask
            scores = mx.softmax(scores, axis=-1)
            values_hat = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)

            # Note that we return the keys and values to possibly be used as a cache
            return self.out_proj(values_hat), (keys, values)

Encoder layer
^^^^^^^^^^^^^

The other component of the Llama model is the encoder layer which uses RMS
normalization [2]_ and SwiGLU. [3]_ For RMS normalization we will use
:class:`mlx.nn.RMSNorm` that is already provided in :mod:`mlx.nn`.

.. code-block:: python

    class LlamaEncoderLayer(nn.Module):
        def __init__(self, dims: int, mlp_dims: int, num_heads: int):
            super().__init__()

            self.attention = LlamaAttention(dims, num_heads)

            self.norm1 = nn.RMSNorm(dims)
            self.norm2 = nn.RMSNorm(dims)

            self.linear1 = nn.Linear(dims, mlp_dims, bias=False)
            self.linear2 = nn.Linear(dims, mlp_dims, bias=False)
            self.linear3 = nn.Linear(mlp_dims, dims, bias=False)

        def __call__(self, x, mask=None, cache=None):
            y = self.norm1(x)
            y, cache = self.attention(y, y, y, mask, cache)
            x = x + y

            y = self.norm2(x)
            a = self.linear1(y)
            b = self.linear2(y)
            y = a * mx.sigmoid(a) * b
            y = self.linear3(y)
            x = x + y

            return x, cache

Full model
^^^^^^^^^^

To implement any Llama model we simply have to combine ``LlamaEncoderLayer``
instances with an :class:`mlx.nn.Embedding` to embed the input tokens.

.. code-block:: python

    class Llama(nn.Module):
        def __init__(
            self, num_layers: int, vocab_size: int, dims: int, mlp_dims: int, num_heads: int
        ):
            super().__init__()

            self.embedding = nn.Embedding(vocab_size, dims)
            self.layers = [
                LlamaEncoderLayer(dims, mlp_dims, num_heads) for _ in range(num_layers)
            ]
            self.norm = nn.RMSNorm(dims)
            self.out_proj = nn.Linear(dims, vocab_size, bias=False)

        def __call__(self, x):
            mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
            mask = mask.astype(self.embedding.weight.dtype)

            x = self.embedding(x)
            for l in self.layers:
                x, _ = l(x, mask)
            x = self.norm(x)
            return self.out_proj(x)

Note that in the implementation above we use a simple list to hold the encoder
layers but using ``model.parameters()`` will still consider these layers.

Generation
^^^^^^^^^^^

Our ``Llama`` module can be used for training but not inference as the
``__call__`` method above processes one input, completely ignores the cache and
performs no sampling whatsoever. In the rest of this subsection, we will
implement the inference function as a python generator that processes the
prompt and then autoregressively yields tokens one at a time.

.. code-block:: python

    class Llama(nn.Module):
        ...

        def generate(self, x, temp=1.0):
            cache = []

            # Make an additive causal mask. We will need that to process the prompt.
            mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
            mask = mask.astype(self.embedding.weight.dtype)

            # First we process the prompt x the same way as in __call__ but
            # save the caches in cache
            x = self.embedding(x)
            for l in self.layers:
                x, c = l(x, mask=mask)
                cache.append(c)  # <--- we store the per layer cache in a
                                 #      simple python list
            x = self.norm(x)
            y = self.out_proj(x[:, -1])  # <--- we only care about the last logits
                                         #      that generate the next token
            y = mx.random.categorical(y * (1/temp))

            # y now has size [1]
            # Since MLX is lazily evaluated nothing is computed yet.
            # Calling y.item() would force the computation to happen at
            # this point but we can also choose not to do that and let the
            # user choose when to start the computation.
            yield y

            # Now we parsed the prompt and generated the first token we
            # need to feed it back into the model and loop to generate the
            # rest.
            while True:
                # Unsqueezing the last dimension to add a sequence length
                # dimension of 1
                x = y[:, None]

                x = self.embedding(x)
                for i in range(len(cache)):
                    # We are overwriting the arrays in the cache list. When
                    # the computation will happen, MLX will be discarding the
                    # old cache the moment it is not needed anymore.
                    x, cache[i] = self.layers[i](x, mask=None, cache=cache[i])
                x = self.norm(x)
                y = self.out_proj(x[:, -1])
                y = mx.random.categorical(y * (1/temp))

                yield y

Putting it all together
^^^^^^^^^^^^^^^^^^^^^^^

We now have everything we need to create a Llama model and sample tokens from
it. In the following code, we randomly initialize a small Llama model, process
6 tokens of prompt and generate 10 tokens.

.. code-block:: python

    model = Llama(num_layers=12, vocab_size=8192, dims=512, mlp_dims=1024, num_heads=8)

    # Since MLX is lazily evaluated nothing has actually been materialized yet.
    # We could have set the `dims` to 20_000 on a machine with 8GB of RAM and the
    # code above would still run. Let's actually materialize the model.
    mx.eval(model.parameters())

    prompt = mx.array([[1, 10, 8, 32, 44, 7]])  # <-- Note the double brackets because we
                                                #     have a batch dimension even
                                                #     though it is 1 in this case

    generated = [t for i, t in zip(range(10), model.generate(prompt, 0.8))]

    # Since we haven't evaluated anything, nothing is computed yet. The list
    # `generated` contains the arrays that hold the computation graph for the
    # full processing of the prompt and the generation of 10 tokens.
    #
    # We can evaluate them one at a time, or all together. Concatenate them or
    # print them. They would all result in very similar runtimes and give exactly
    # the same results.
    mx.eval(generated)

Converting the weights
----------------------

This section assumes that you have access to the original Llama weights and the
SentencePiece model that comes with them. We will write a small script to
convert the PyTorch weights to MLX compatible ones and write them to an NPZ file
that can be loaded directly by MLX.

.. code-block:: python

    import argparse
    from itertools import starmap

    import numpy as np
    import torch

    def map_torch_to_mlx(key, value):
        if "tok_embedding" in key:
            key = "embedding.weight"

        elif "norm" in key:
            key = key.replace("attention_norm", "norm1").replace("ffn_norm", "norm2")

        elif "wq" in key or "wk" in key or "wv" in key or "wo" in key:
            key = key.replace("wq", "query_proj")
            key = key.replace("wk", "key_proj")
            key = key.replace("wv", "value_proj")
            key = key.replace("wo", "out_proj")

        elif "w1" in key or "w2" in key or "w3" in key:
            # The FFN is a separate submodule in PyTorch
            key = key.replace("feed_forward.w1", "linear1")
            key = key.replace("feed_forward.w3", "linear2")
            key = key.replace("feed_forward.w2", "linear3")

        elif "output" in key:
            key = key.replace("output", "out_proj")

        elif "rope" in key:
            return None, None

        return key, value.numpy()


    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="Convert Llama weights to MLX")
        parser.add_argument("torch_weights")
        parser.add_argument("output_file")
        args = parser.parse_args()

        state = torch.load(args.torch_weights)
        np.savez(
            args.output_file,
            **{k: v for k, v in starmap(map_torch_to_mlx, state.items()) if k is not None}
        )


Weight loading and benchmarking
-------------------------------

After converting the weights to be compatible with our implementation, all that is
left is to load them from disk and we can finally use the LLM to generate text.
We can load numpy format files using the :func:`mlx.core.load` operation.

To create a parameter dictionary from the key/value representation of NPZ files
we will use the :func:`mlx.utils.tree_unflatten` helper method as follows:

.. code-block:: python

    from mlx.utils import tree_unflatten

    model.update(tree_unflatten(list(mx.load(weight_file).items())))

:meth:`mlx.utils.tree_unflatten` will take keys from the NPZ file that look
like ``layers.2.attention.query_proj.weight`` and will transform them to

.. code-block:: python

   {"layers": [..., ..., {"attention": {"query_proj": {"weight": ...}}}]}

which can then be used to update the model. Note that the method above incurs
several unnecessary copies from disk to numpy and then from numpy to MLX. It
will be replaced in the future with direct loading to MLX.

You can download the full example code in `mlx-examples`_. Assuming the
existence of ``weights.pth`` and ``tokenizer.model`` in the current working
directory, we can play around with our inference script as follows (the timings
are representative of an M1 Ultra and the 7B parameter Llama model):

.. code-block:: bash

    $ python convert.py weights.pth llama-7B.mlx.npz
    $ python llama.py llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely'
    [INFO] Loading model from disk: 5.247 s
    Press enter to start generation
    ------
    , having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down,
    ------
    [INFO] Prompt processing: 0.437 s
    [INFO] Full generation: 4.330 s

We observe that 4.3 seconds are required to generate 100 tokens and 0.4 seconds
of those are spent processing the prompt. This amounts to roughly **39 ms
per token**.

By running with a much bigger prompt we can see that the per token generation
time as well as the prompt processing time remain almost constant.

.. code-block:: bash

    $ python llama.py llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down, nobody took the least notice of him. I stopped at last, at a little distance, as if I had been in doubt, and after looking on a few minutes, walked straight up to him. He slowly raised his eyes, and fixed them upon me for a moment, without speaking, and then resumed his place and posture as before. I stood looking at him for a while, feeling very much pain at heart, and then said to him, “What are you doing there?” Something like a smile passed over his face, as he said slowly, “I am waiting for someone; but it has been three quarters of an hour now, and he has not come.” “What is it you are waiting for?” said I. Still he made no immediate reply, but again put his face down upon his hands, and did not'
    [INFO] Loading model from disk: 5.247 s
    Press enter to start generation
    ------
    take his eyes from the ground. “What is it you are waiting for?” said I. “I am not accustomed to be thus questioned,” said he. “You look like a reasonable man—tell me, then, what are you waiting for?” “You would not understand,” he replied; “and how could you help me, if I were to tell you?” “I should not only understand, but would do all that I could,” said I. He did not
    ------
    [INFO] Prompt processing: 0.579 s
    [INFO] Full generation: 4.690 s
    $ python llama.py --num-tokens 500 llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down, nobody took the least notice of him. I stopped at last, at a little distance, as if I had been in doubt, and after looking on a few minutes, walked straight up to him. He slowly raised his eyes, and fixed them upon me for a moment, without speaking, and then resumed his place and posture as before. I stood looking at him for a while, feeling very much pain at heart, and then said to him, “What are you doing there?” Something like a smile passed over his face, as he said slowly, “I am waiting for someone; but it has been three quarters of an hour now, and he has not come.” “What is it you are waiting for?” said I. Still he made no immediate reply, but again put his face down upon his hands, and did not'
    [INFO] Loading model from disk: 5.628 s
    Press enter to start generation
    ------
    take his eyes from the ground. “What is it you are waiting for?” said I. “I am not accustomed to be thus questioned,” said he. “You look like a reasonable man—tell me, then, what are you waiting for?” “You would not understand,” he replied; “and how could you help me, if I were to tell you?” “I should not only understand, but would do all that I could,” said I. He did not reply, but still went on looking at the ground, and took hold of his bundle with a nervous trembling. I waited some time, and then resumed. “It is of no use to say you would not understand, if I were to tell you,” said he. “I have not told you why I am waiting for him,” said I. “And I am sure I should not understand,” replied he. “I will tell you then,” said I, “and, perhaps, you would not be surprised.” “No matter,” said he, “I shall be surprised anyhow; so tell me why you are waiting for him.” “He is my friend,” said I. “Yes,” said he, with a slight smile, “I know.” “He has been kind to me,” said I, “and I am waiting for him. I want to see him, and could have waited as I am now, for a much longer time.” “He will not soon come,” said he. “Unless he sees you here, he will not know of your having waited, and he will be very unlikely to come.” “No matter,” said I, “I shall wait for him.” “This is a strange thing,” said he, still with the same amused smile. “How did you know,” said I, “that he was coming? How should you be waiting?” “That is my secret,” said he. “And you expect him?” “Yes,” said I. “Are you disappointed then, if he does not come?” “No,” said I, “it is his secret, not mine.” “If he comes,” said he, “do you mean to go straight away?” “Yes,” said I, “I cannot be happy if I do not go straight away after him.” “Did you know this place before?” asked he. “Yes,” said I. “Is there any shop to buy food here?” “
    ------
    [INFO] Prompt processing: 0.633 s
    [INFO] Full generation: 21.475 s

Scripts
-------

.. admonition:: Download the code

   The full example code is available in `mlx-examples`_.

.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llms/llama

.. [1] Su, J., Lu, Y., Pan, S., Murtadha, A., Wen, B. and Liu, Y., 2021.
   Roformer: Enhanced transformer with rotary position embedding. arXiv
   preprint arXiv:2104.09864.
.. [2] Zhang, B. and Sennrich, R., 2019. Root mean square layer normalization.
   Advances in Neural Information Processing Systems, 32.
.. [3] Shazeer, N., 2020. Glu variants improve transformer. arXiv preprint
   arXiv:2002.05202.


================================================
FILE: docs/src/examples/mlp.rst
================================================
.. _mlp:

Multi-Layer Perceptron
----------------------

In this example we'll learn to use ``mlx.nn`` by implementing a simple
multi-layer perceptron to classify MNIST.

As a first step import the MLX packages we need:

.. code-block:: python

  import mlx.core as mx
  import mlx.nn as nn
  import mlx.optimizers as optim

  import numpy as np


The model is defined as the ``MLP`` class which inherits from
:class:`mlx.nn.Module`. We follow the standard idiom to make a new module:

1. Define an ``__init__`` where the parameters and/or submodules are set up. See
   the :ref:`Module class docs<module_class>` for more information on how
   :class:`mlx.nn.Module` registers parameters.
2. Define a ``__call__`` where the computation is implemented.

.. code-block:: python

  class MLP(nn.Module):
      def __init__(
          self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
      ):
          super().__init__()
          layer_sizes = [input_dim] + [hidden_dim] * num_layers + [output_dim]
          self.layers = [
              nn.Linear(idim, odim)
              for idim, odim in zip(layer_sizes[:-1], layer_sizes[1:])
          ]

      def __call__(self, x):
          for l in self.layers[:-1]:
              x = mx.maximum(l(x), 0.0)
          return self.layers[-1](x)


We define the loss function which takes the mean of the per-example cross
entropy loss.  The ``mlx.nn.losses`` sub-package has implementations of some
commonly used loss functions.

.. code-block:: python

  def loss_fn(model, X, y):
      return mx.mean(nn.losses.cross_entropy(model(X), y))

We also need a function to compute the accuracy of the model on the validation
set:

.. code-block:: python

  def eval_fn(model, X, y):
      return mx.mean(mx.argmax(model(X), axis=1) == y)

Next, set up the problem parameters and load the data. To load the data, you need our
`mnist data loader
<https://github.com/ml-explore/mlx-examples/blob/main/mnist/mnist.py>`_, which
we will import as ``mnist``.

.. code-block:: python

  num_layers = 2
  hidden_dim = 32
  num_classes = 10
  batch_size = 256
  num_epochs = 10
  learning_rate = 1e-1

  # Load the data
  import mnist 
  train_images, train_labels, test_images, test_labels = map(
      mx.array, mnist.mnist()
  )

Since we're using SGD, we need an iterator which shuffles and constructs
minibatches of examples in the training set:

.. code-block:: python

  def batch_iterate(batch_size, X, y):
      perm = mx.array(np.random.permutation(y.size))
      for s in range(0, y.size, batch_size):
          ids = perm[s : s + batch_size]
          yield X[ids], y[ids]


Finally, we put it all together by instantiating the model, the
:class:`mlx.optimizers.SGD` optimizer, and running the training loop:

.. code-block:: python

  # Load the model
  model = MLP(num_layers, train_images.shape[-1], hidden_dim, num_classes)
  mx.eval(model.parameters())

  # Get a function which gives the loss and gradient of the
  # loss with respect to the model's trainable parameters
  loss_and_grad_fn = nn.value_and_grad(model, loss_fn)

  # Instantiate the optimizer
  optimizer = optim.SGD(learning_rate=learning_rate)

  for e in range(num_epochs):
      for X, y in batch_iterate(batch_size, train_images, train_labels):
          loss, grads = loss_and_grad_fn(model, X, y)

          # Update the optimizer state and model parameters
          # in a single call
          optimizer.update(model, grads)

          # Force a graph evaluation
          mx.eval(model.parameters(), optimizer.state)

      accuracy = eval_fn(model, test_images, test_labels)
      print(f"Epoch {e}: Test accuracy {accuracy.item():.3f}")


.. note::
  The :func:`mlx.nn.value_and_grad` function is a convenience function to get
  the gradient of a loss with respect to the trainable parameters of a model.
  This should not be confused with :func:`mlx.core.value_and_grad`.

The model should train to a decent accuracy (about 95%) after just a few passes
over the training set. The `full example <https://github.com/ml-explore/mlx-examples/tree/main/mnist>`_
is available in the MLX GitHub repo.


================================================
FILE: docs/src/examples/tensor_parallelism.rst
================================================
.. _tensor_parallelism:

Tensor Parallelism
==================

In this example, we will explore how tensor parallelism (TP) works in MLX.  We
will start with an overview of the distributed layers in ``mlx.nn`` and then
show how to apply tensor parallelism to Llama-style transformer models.

Sharded Layers
--------------

:class:`AllToShardedLinear <mlx.nn.AllToShardedLinear>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This layer replicates a common input and shards the weight matrix along the
output dimension across all devices in the :class:`mlx.core.distributed.Group`.
The layer produces a sharded output.

For example, consider an :class:`mlx.nn.AllToShardedLinear` layer with
``input_dims=2`` and ``output_dims=2``, a batched input of shape ``(4, 2)``,
and a device group with 2 devices. The layer shards the weight matrix along the
output dimension across the two devices, where each device receives the full
input and computes a partial output.

.. raw:: html

    <div>
      <img src="../_static/tp_inference/all-to-sharded-linear.png" alt="column-wise tensor parallelism" style="width: 100%">
    </div>

This layer does not automatically gather all outputs from each device. This is
an intended and :ref:`useful design choice <useful_design_choices>`.

:class:`QuantizedAllToShardedLinear <mlx.nn.QuantizedAllToShardedLinear>` is
the quantized equivalent of :class:`mlx.nn.AllToShardedLinear`.  Similar to
:class:`mlx.nn.QuantizedLinear`, its parameters are frozen and will not be
included in any gradient computation.


:class:`ShardedToAllLinear <mlx.nn.ShardedToAllLinear>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This layer expects inputs that are sharded along the feature dimension and
shards the weight matrix along the input dimension across all devices in the
:class:`mlx.core.distributed.Group`. The layer automatically aggregates the
results using :func:`mlx.core.distributed.all_sum`, so all devices in the
group will have the same result.

For example, consider an :class:`mlx.nn.ShardedToAllLinear` layer with
``input_dims=2`` and ``output_dims=2``, a batched input of shape ``(4, 2)``,
and a device group with 2 devices. The layer shards the weight matrix along the
input dimension across the two devices. Each device computes a ``(4,2)``
output, which is then aggregated with all other device outputs to get the
layer output.

.. raw:: html

    <div>
      <img src="../_static/tp_inference/sharded-to-all-linear.png" alt="row-wise tensor parallelism" style="width: 100%">
    </div>

This layer does not automatically shard the inputs along the feature dimension
for you. It is necessary to create a "partial" input structure to feed into the
layer. This is an intended and :ref:`useful design choice
<useful_design_choices>`.

:class:`QuantizedShardedToAllLinear <mlx.nn.QuantizedShardedToAllLinear>` is
the quantized equivalent of :class:`mlx.nn.ShardedToAllLinear`.  Similar to
:class:`mlx.nn.QuantizedLinear`, its parameters are frozen and will not be
included in any gradient computation.


Shard Utility Functions
-----------------------

:func:`shard_linear <mlx.nn.layers.distributed.shard_linear>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Converts a regular linear layer into a tensor parallel layer that distributes
computation across multiple devices. Takes an existing :class:`mlx.nn.Linear`
or :class:`mlx.nn.QuantizedLinear` layer and returns a new distributed layer
(either :class:`mlx.nn.AllToShardedLinear` or
:class:`mlx.nn.ShardedToAllLinear`, depending on the sharding type). The
original layer is not modified.

:func:`shard_inplace <mlx.nn.layers.distributed.shard_inplace>`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Splits the parameters of an existing layer across multiple devices by modifying
the layer in-place. Unlike :func:`shard_linear
<mlx.nn.layers.distributed.shard_linear>`, this function does not create a new
layer or add distributed communication. The layer itself must handle
distributed communication if needed.


.. _useful_design_choices:

Useful Design Choices
---------------------

The design choices above regarding when operations are done automatically are intentional and make model training and inference easier.

All-to-sharded and sharded-to-all layers naturally go together because the
output of the former layer is exactly the input needed for the latter.
This removes the need for an intermediate gather step between the layers,
reducing communication overhead.

This is why :class:`mlx.nn.AllToShardedLinear` does not aggregate results
automatically and why :class:`mlx.nn.ShardedToAllLinear` does not shard inputs
automatically. It is so that they can be placed in successive order and work
together easily.

We can demonstrate this through a simple model using our two types of
distributed layers.

.. code-block:: python

  x = ... # some (4, 2) model input: batch size 4, feature size 2

  l1 = nn.AllToShardedLinear(2, 2, bias=False)   # initialize the layer
  l1_out = l1(x) # (4, 1) output

  l2 = nn.ShardedToAllLinear(2, 2, bias=False)
  l2_out = l2(l1_out) # (4, 2) output

.. raw:: html

    <div>
      <img src="../_static/tp_inference/column-row-tp.png" alt="two layer tensor parallelism" style="width: 100%">
      <p style="font-size: 0.85em; margin-top: 0.5em;"><small>A visualization of the simple MLX model using all-to-sharded then sharded-to-all tensor parallelism across 2 devices.</small></p>
    </div>


LLM Inference with Tensor Parallelism
-------------------------------------

We can apply these TP techniques to LLMs in order to enable inference for much
larger models by sharding parameters from huge layers across multiple devices.

To demonstrate this, let's apply TP to the Transformer block of our :doc:`Llama
Inference <llama-inference>` example. In this example, we will use the same
inference script as the Llama Inference example, which can be found in
`mlx-examples`_.

Our first edit is to initialize the distributed communication group and get the
current process rank:

.. code-block:: python

  world = mx.distributed.init()
  rank = world.rank()

Next, let's look at the current architecture of the transformer block and see how we can apply tensor parallelism:

.. raw:: html

    <div>
      <img src="../_static/tp_inference/llama-transformer.png" alt="llama transformer example" style="width: 100%">
    </div>


This architecture has two natural places where 
tensor parallelism can be applied: the attention block and the FFN
block. Both follow the same pattern: multiple parallel linear layers operating
on the same input, followed by a single output linear layer. In the attention
block, the Q, K, and V projections are sharded along the output dimension (all-to-sharded), and the output
projection is sharded along the input dimension (sharded-to-all). Similarly in the FFN block, the gate and up projections
become all-to-sharded layers, and the down projection becomes a sharded-to-all layer.

The intermediate operations between the linear layers (RoPE, softmax, scaled
dot-product attention in the attention block, and element-wise multiplication
in the FFN block) do not impede the use of our TP paradigm. These operations
are either:

- **Element-wise operations** (RoPE, element-wise multiplication): These
  operate independently on each element or position, preserving the sharding
  pattern without requiring cross-device communication.

- **Operations on non-sharded dimensions** (softmax, scaled dot-product
  attention): These operate along dimensions that are not sharded (such as the
  sequence length or head dimensions), so they can be computed independently on
  each device. The attention computation ``Q @ K^T`` and ``scores @ V`` work
  correctly with sharded Q, K, V tensors because the matrix multiplications are
  performed along the sharded feature dimension, and the results remain
  properly sharded for the subsequent sharded-to-all layer.

To implement sharding in our Llama inference, we use :func:`shard_linear
<mlx.nn.layers.distributed.shard_linear>` to get sharded linear layers with
distributed communication. This is easier than using :func:`shard_inplace
<mlx.nn.layers.distributed.shard_inplace>` and implementing the steps manually
in the :code:`__call__` function.

The following code shows how to shard the Attention block. The Q, K, and V
projection layers are converted to all-to-sharded layers, while the output
projection is converted to a sharded-to-all layer. The number of heads is also
adjusted to account for the sharding:

.. code-block:: python

  # ... in Attention class
  def shard(self, group: mx.distributed.Group):
    self.n_heads = self.n_heads // group.size()
    self.n_kv_heads = self.n_kv_heads // group.size()

    self.wq = nn.layers.distributed.shard_linear(self.wq, "all-to-sharded", group=group)
    self.wk = nn.layers.distributed.shard_linear(self.wk, "all-to-sharded", group=group)
    self.wv = nn.layers.distributed.shard_linear(self.wv, "all-to-sharded", group=group)
    self.wo = nn.layers.distributed.shard_linear(self.wo, "sharded-to-all", group=group)

Similarly, the FeedForward block is sharded by converting the gate (w1) and up
(w3) projections to all-to-sharded layers, and the down projection (w2) to
a sharded-to-all layer:

.. code-block:: python

  # ... in FeedForward class
  def shard(self, group: mx.distributed.Group):
    self.w1 = nn.layers.distributed.shard_linear(self.w1, "all-to-sharded", group=group)
    self.w2 = nn.layers.distributed.shard_linear(self.w2, "sharded-to-all", group=group)
    self.w3 = nn.layers.distributed.shard_linear(self.w3, "all-to-sharded", group=group)

Finally, in our :code:`load_model` function, we need to apply our sharding
functions to all transformer layers when using multiple devices:

.. code-block:: python

  # ... in load_model function
  if world.size() > 1:
    # convert Linear layers in Transformer/FFN to appropriate Sharded Layers
    for layer in model.layers:
        layer.attention.shard(group=world)
        layer.feed_forward.shard(group=world)

This allows us to use the llama inference file as normal when running
:code:`python llama.py`, but now we can also run it across two (or more)
devices via :code:`mlx.launch -n 2 llama.py`.

.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llms/llama


================================================
FILE: docs/src/index.rst
================================================
MLX
===

MLX is a NumPy-like array framework designed for efficient and flexible machine
learning on Apple silicon, brought to you by Apple machine learning research.

The Python API closely follows NumPy with a few exceptions. MLX also has a
fully featured C++ API which closely follows the Python API.

The main differences between MLX and NumPy are:

 - **Composable function transformations**: MLX has composable function
   transformations for automatic differentiation, automatic vectorization,
   and computation graph optimization.
 - **Lazy computation**: Computations in MLX are lazy. Arrays are only
   materialized when needed.
 - **Multi-device**: Operations can run on any of the supported devices (CPU,
   GPU, ...)

The design of MLX is inspired by frameworks like `PyTorch
<https://pytorch.org/>`_, `Jax <https://github.com/google/jax>`_, and
`ArrayFire <https://arrayfire.org/>`_. A notable difference between these
frameworks and MLX is the *unified memory model*. Arrays in MLX live in shared
memory. Operations on MLX arrays can be performed on any of the supported
device types without performing data copies. Currently supported device types
are the CPU and GPU.

.. toctree::
   :caption: Install
   :maxdepth: 1

   install

.. toctree::
   :caption: Usage 
   :maxdepth: 1

   usage/quick_start
   usage/lazy_evaluation
   usage/unified_memory
   usage/indexing
   usage/saving_and_loading
   usage/function_transforms
   usage/compile
   usage/numpy
   usage/distributed
   usage/using_streams
   usage/export

.. toctree::
   :caption: Examples
   :maxdepth: 1

   examples/linear_regression
   examples/mlp
   examples/llama-inference
   examples/data_parallelism
   examples/tensor_parallelism

.. toctree::
   :caption: Python API Reference
   :maxdepth: 1

   python/array
   python/data_types
   python/devices_and_streams
   python/export
   python/ops
   python/random
   python/transforms
   python/fast
   python/fft
   python/linalg
   python/metal
   python/cuda
   python/memory_management
   python/nn
   python/optimizers
   python/distributed
   python/tree_utils

.. toctree::
   :caption: C++ API Reference
   :maxdepth: 1

   cpp/ops

.. toctree::
   :caption: Further Reading
   :maxdepth: 1

   dev/extensions
   dev/metal_debugger
   dev/metal_logging
   dev/custom_metal_kernels
   dev/mlx_in_cpp


================================================
FILE: docs/src/install.rst
================================================
.. _build_and_install:

Build and Install
=================

Python Installation
-------------------

MLX is available on PyPI. All you have to do to use MLX with your own Apple
silicon computer is

.. code-block:: shell

    pip install mlx

To install from PyPI your system must meet the following requirements:

- Using `Apple silicon <https://support.apple.com/en-us/116943>`_
- Using a native Python >= 3.10
- macOS >= 14.0

.. note::
    MLX is only available on devices running macOS 14.0 or higher.

CUDA
^^^^

MLX has a CUDA backend which you can install with:

.. code-block:: shell

    pip install mlx[cuda12]


To install the CUDA package from PyPI your system must meet the following
requirements:

- Nvidia architecture >= SM 7.5
- Nvidia driver >= 550.54.14
- CUDA toolkit >= 12.0
- Linux distribution with glibc >= 2.35
- Python >= 3.10

For CUDA 13 use ``pip install mlx[cuda13]``. The CUDA 13 package requires
an Nvidia driver >= 580 or an appropriate CUDA compatibility package.

CPU-only (Linux)
^^^^^^^^^^^^^^^^

For a CPU-only version of MLX that runs on Linux use:

.. code-block:: shell

    pip install mlx[cpu]

To install the CPU-only package from PyPI your system must meet the following
requirements:

- Linux distribution with glibc >= 2.35
- Python >= 3.10


Troubleshooting
^^^^^^^^^^^^^^^

*My OS and Python versions are in the required range but pip still does not find
a matching distribution.*

Probably you are using a non-native Python. The output of

.. code-block:: shell

  python -c "import platform; print(platform.processor())"

should be ``arm``. If it is ``i386`` (and you have an M series machine) then you
are using a non-native Python. Switch your Python to a native Python. A good
way to do this is with `Conda <https://stackoverflow.com/q/65415996>`_.


Build from source
-----------------

Build Requirements
^^^^^^^^^^^^^^^^^^

- ``libblas-dev``, ``liblapack-dev``, and ``liblapacke-dev`` (Linux)
- A C++ compiler with C++20 support (e.g. Clang >= 15.0)
- `cmake <https://cmake.org/>`_ -- version 3.25 or later, and ``make``
- Xcode >= 15.0 and macOS SDK >= 14.0

.. note::
   Ensure your shell environment is native ``arm``, not ``x86`` via Rosetta. If
   the output of ``uname -p`` is ``x86``, see the :ref:`troubleshooting section <build shell>` below.

Python API
^^^^^^^^^^

.. _python install:

To build and install the MLX python library from source, first, clone MLX from
`its GitHub repo <https://github.com/ml-explore/mlx>`_:

.. code-block:: shell

   git clone git@github.com:ml-explore/mlx.git mlx && cd mlx

Then simply build and install MLX using pip:

.. code-block:: shell

  pip install .

For developing, install the package with development dependencies, and use an
editable install:

.. code-block:: shell

  pip install -e ".[dev]"

Once the development dependencies are installed, you can build faster with:

.. code-block:: shell

 python setup.py build_ext --inplace

Run the tests with:

.. code-block:: shell

  python -m unittest discover python/tests

C++ API
^^^^^^^

.. _cpp install:

Currently, MLX must be built and installed from source.

Similarly to the python library, to build and install the MLX C++ library start
by cloning MLX from `its GitHub repo
<https://github.com/ml-explore/mlx>`_:

.. code-block:: shell

   git clone git@github.com:ml-explore/mlx.git mlx && cd mlx

Create a build directory and run CMake and make:

.. code-block:: shell

   mkdir -p build && cd build
   cmake .. && make -j

Run tests with:

.. code-block:: shell

   make test

Install with:

.. code-block:: shell

   make install

Note that the built ``mlx.metallib`` file should either be in the same
directory as the executable statically linked to ``libmlx.a``, or the
preprocessor constant ``METAL_PATH`` should be defined at build time and
point to the path of the built metal library.

.. list-table:: Build Options
   :widths: 25 8
   :header-rows: 1

   * - Option
     - Default
   * - MLX_BUILD_TESTS
     - ON
   * - MLX_BUILD_EXAMPLES
     - OFF
   * - MLX_BUILD_BENCHMARKS
     - OFF
   * - MLX_BUILD_METAL
     - ON
   * - MLX_BUILD_CPU
     - ON
   * - MLX_BUILD_PYTHON_BINDINGS
     - OFF
   * - MLX_METAL_DEBUG
     - OFF
   * - MLX_BUILD_SAFETENSORS
     - ON
   * - MLX_BUILD_GGUF
     - ON
   * - MLX_METAL_JIT
     - OFF

.. note::

    If you have multiple Xcode installations and wish to use
    a specific one while building, you can do so by adding the
    following environment variable before building

    .. code-block:: shell

      export DEVELOPER_DIR="/path/to/Xcode.app/Contents/Developer/"

    Further, you can use the following command to find out which
    macOS SDK will be used

    .. code-block:: shell

      xcrun -sdk macosx --show-sdk-version


Binary Size Minimization
~~~~~~~~~~~~~~~~~~~~~~~~

To produce a smaller binary use the CMake flags ``CMAKE_BUILD_TYPE=MinSizeRel``
and ``BUILD_SHARED_LIBS=ON``.

The MLX CMake build has several additional options to make smaller binaries.
For example, if you don't need the CPU backend or support for safetensors and
GGUF, you can do:

.. code-block:: shell

  cmake .. \
    -DCMAKE_BUILD_TYPE=MinSizeRel \
    -DBUILD_SHARED_LIBS=ON \
    -DMLX_BUILD_CPU=OFF \
    -DMLX_BUILD_SAFETENSORS=OFF \
    -DMLX_BUILD_GGUF=OFF \
    -DMLX_METAL_JIT=ON

The ``MLX_METAL_JIT`` flag minimizes the size of the MLX Metal library which
contains pre-built GPU kernels. This substantially reduces the size of the
Metal library by run-time compiling kernels the first time they are used in MLX
on a given machine. Note run-time compilation incurs a cold-start cost which can
be anywhere from a few hundred milliseconds to a few seconds depending on the
application. Once a kernel is compiled, it will be cached by the system. The
Metal kernel cache persists across reboots.

Linux
^^^^^

To build from source on Linux (CPU only), install the BLAS and LAPACK headers.
For example on Ubuntu, run the following:

.. code-block:: shell

   apt-get update -y
   apt-get install libblas-dev liblapack-dev liblapacke-dev -y

From here follow the instructions to install either the :ref:`Python <python
install>` or :ref:`C++ <cpp install>` APIs.

CUDA
^^^^

To build from source on Linux with CUDA, install the BLAS and LAPACK headers
and the CUDA toolkit. For example on Ubuntu, run the following:

.. code-block:: shell

   wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
   dpkg -i cuda-keyring_1.1-1_all.deb
   apt-get update -y
   apt-get -y install cuda-toolkit-12-9
   apt-get install libblas-dev liblapack-dev liblapacke-dev libcudnn9-dev-cuda-12 -y


When building either the Python or C++ APIs make sure to pass the cmake flag
``MLX_BUILD_CUDA=ON``. For example, to build the Python API run:

.. code-block:: shell

  CMAKE_ARGS="-DMLX_BUILD_CUDA=ON" pip install -e ".[dev]"

To build the C++ package run:

.. code-block:: shell

   mkdir -p build && cd build
   cmake .. -DMLX_BUILD_CUDA=ON && make -j


Troubleshooting
^^^^^^^^^^^^^^^

Metal not found
~~~~~~~~~~~~~~~

You see the following error when you try to build:

.. code-block:: shell

  error: unable to find utility "metal", not a developer tool or in PATH

To fix this, first make sure you have Xcode installed:

.. code-block:: shell

  xcode-select --install

Then set the active developer directory:

.. code-block:: shell

  sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer

x86 Shell
~~~~~~~~~

.. _build shell:

If the output of ``uname -p``  is ``x86`` then your shell is running as x86 via
Rosetta instead of natively.

To fix this, find the application in Finder (``/Applications`` for iTerm,
``/Applications/Utilities`` for Terminal), right-click, and click “Get Info”.
Uncheck “Open using Rosetta”, close the “Get Info” window, and restart your
terminal.

Verify the terminal is now running natively with the following command:

.. code-block:: shell

  $ uname -p
  arm

Also check that cmake is using the correct architecture:

.. code-block:: shell

  $ cmake --system-information | grep CMAKE_HOST_SYSTEM_PROCESSOR
  CMAKE_HOST_SYSTEM_PROCESSOR "arm64"

If you see ``"x86_64"``, try re-installing ``cmake``. If you see ``"arm64"``
but the build errors out with "Building for x86_64 on macOS is not supported."
wipe your build cache with ``rm -rf build/`` and try again.


================================================
FILE: docs/src/python/array.rst
================================================
.. _array:

Array
=====

.. currentmodule:: mlx.core

.. autosummary:: 
   :toctree: _autosummary 

    array
    array.astype
    array.at
    array.item
    array.tolist
    array.dtype
    array.itemsize
    array.nbytes
    array.ndim
    array.shape
    array.size
    array.real
    array.imag
    array.abs
    array.all
    array.any
    array.argmax
    array.argmin
    array.conj
    array.cos
    array.cummax
    array.cummin
    array.cumprod
    array.cumsum
    array.diag
    array.diagonal
    array.exp
    array.flatten
    array.log
    array.log10
    array.log1p
    array.log2
    array.logcumsumexp
    array.logsumexp
    array.max
    array.mean
    array.min
    array.moveaxis
    array.prod
    array.reciprocal
    array.reshape
    array.round
    array.rsqrt
    array.sin
    array.split
    array.sqrt
    array.square
    array.squeeze
    array.std
    array.sum
    array.swapaxes
    array.transpose
    array.T
    array.var
    array.view


================================================
FILE: docs/src/python/cuda.rst
================================================
CUDA
=====

.. currentmodule:: mlx.core.cuda

.. autosummary::
  :toctree: _autosummary

  is_available


================================================
FILE: docs/src/python/data_types.rst
================================================
.. _data_types:

Data Types
==========

.. currentmodule:: mlx.core

The default floating point type is ``float32`` and the default integer type is
``int32``. The table below shows supported values for :obj:`Dtype`. 

.. list-table:: Supported Data Types 
   :widths: 5 3 20
   :header-rows: 1

   * - Type 
     - Bytes
     - Description
   * - ``bool_``
     - 1 
     - Boolean (``True``, ``False``) data type
   * - ``uint8``
     - 1 
     - 8-bit unsigned integer 
   * - ``uint16``
     - 2 
     - 16-bit unsigned integer 
   * - ``uint32``
     - 4 
     - 32-bit unsigned integer 
   * - ``uint64``
     - 8 
     - 64-bit unsigned integer 
   * - ``int8``
     - 1 
     - 8-bit signed integer 
   * - ``int16``
     - 2 
     - 16-bit signed integer 
   * - ``int32``
     - 4 
     - 32-bit signed integer 
   * - ``int64``
     - 8 
     - 64-bit signed integer 
   * - ``bfloat16``
     - 2 
     - 16-bit brain float (e8, m7)
   * - ``float16``
     - 2 
     - 16-bit IEEE float (e5, m10)
   * - ``float32``
     - 4 
     - 32-bit float
   * - ``float64``
     - 8
     - 64-bit double
   * - ``complex64``
     - 8 
     - 64-bit complex float


.. note::

    Arrays with type ``float64`` only work with CPU operations. Using
    ``float64`` arrays on the GPU will result in an exception.


Data types are arranged in a hierarchy. See the :obj:`DtypeCategory` object
documentation for more information. Use :func:`issubdtype` to determine if one
``dtype`` (or category) is a subtype of another category.

.. autosummary::
   :toctree: _autosummary

   Dtype
   DtypeCategory
   issubdtype
   finfo


================================================
FILE: docs/src/python/devices_and_streams.rst
================================================
.. _devices_and_streams:

Devices and Streams
===================

.. currentmodule:: mlx.core

.. autosummary::
  :toctree: _autosummary

   Device
   Stream
   default_device
   set_default_device
   default_stream
   new_stream
   set_default_stream
   stream
   synchronize
   device_count
   device_info


================================================
FILE: docs/src/python/distributed.rst
================================================
.. _distributed:

.. currentmodule:: mlx.core.distributed

Distributed Communication
==========================

MLX provides a distributed communication package using MPI. The MPI library is
loaded at runtime; if MPI is available then distributed communication is also
made available.

.. autosummary::
   :toctree: _autosummary

    Group
    is_available
    init
    all_sum
    all_gather
    send
    recv
    recv_like


================================================
FILE: docs/src/python/export.rst
================================================
.. _export:

Export Functions
================

.. currentmodule:: mlx.core

.. autosummary::
  :toctree: _autosummary

   export_function
   import_function
   exporter
   export_to_dot


================================================
FILE: docs/src/python/fast.rst
================================================
.. _fast:

Fast
====

.. currentmodule:: mlx.core.fast

.. autosummary:: 
  :toctree: _autosummary

  rms_norm
  layer_norm
  rope
  scaled_dot_product_attention
  metal_kernel
  cuda_kernel


================================================
FILE: docs/src/python/fft.rst
================================================
.. _fft:

FFT
===

.. currentmodule:: mlx.core.fft

.. autosummary:: 
  :toctree: _autosummary

  fft
  ifft
  fft2
  ifft2
  fftn
  ifftn
  rfft
  irfft
  rfft2
  irfft2
  rfftn
  irfftn
  fftshift
  ifftshift


================================================
FILE: docs/src/python/linalg.rst
================================================
.. _linalg:

Linear Algebra
==============

.. currentmodule:: mlx.core.linalg

.. autosummary::
   :toctree: _autosummary

    inv
    tri_inv
    norm
    cholesky
    cholesky_inv
    cross
    qr
    svd
    eigvals
    eig
    eigvalsh
    eigh
    lu
    lu_factor
    pinv
    solve
    solve_triangular


================================================
FILE: docs/src/python/memory_management.rst
================================================
Memory Management
=================

.. currentmodule:: mlx.core

.. autosummary::
  :toctree: _autosummary

  get_active_memory
  get_peak_memory
  reset_peak_memory
  get_cache_memory
  set_memory_limit
  set_cache_limit
  set_wired_limit
  clear_cache


================================================
FILE: docs/src/python/metal.rst
================================================
Metal
=====

.. currentmodule:: mlx.core.metal

.. autosummary::
  :toctree: _autosummary

  is_available
  device_info
  start_capture
  stop_capture


================================================
FILE: docs/src/python/nn/distributed.rst
================================================
.. _nn_distributed:

Distributed
-----------

Helper Routines
^^^^^^^^^^^^^^^

The :code:`mlx.nn.layers.distributed` package contains helpful routines to 
create sharded layers from existing :class:`Modules <mlx.nn.Module>`.

.. currentmodule:: mlx.nn.layers.distributed
.. autosummary::
   :toctree: _autosummary

   shard_linear
   shard_inplace

Layers
^^^^^^

.. currentmodule:: mlx.nn
.. autosummary::
   :toctree: _autosummary
   :template: nn-module-template.rst

   AllToShardedLinear
   ShardedToAllLinear
   QuantizedAllToShardedLinear
   QuantizedShardedToAllLinear


================================================
FILE: docs/src/python/nn/functions.rst
================================================
.. _nn_functions:

.. currentmodule:: mlx.nn

Functions
---------

Layers without parameters (e.g. activation functions) are also provided as
simple functions.

.. autosummary::
   :toctree: _autosummary_functions
   :template: nn-module-template.rst

   elu
   celu
   gelu
   gelu_approx
   gelu_fast_approx
   glu
   hard_shrink
   hard_tanh
   hardswish
   leaky_relu
   log_sigmoid
   log_softmax
   mish
   prelu
   relu
   relu2
   relu6
   selu
   sigmoid
   silu
   softmax
   softmin
   softplus
   softshrink
   step
   tanh


================================================
FILE: docs/src/python/nn/init.rst
================================================
.. _init:

.. currentmodule:: mlx.nn.init

Initializers
------------

The ``mlx.nn.init`` package contains commonly used initializers for neural
network parameters. Initializers return a function which can be applied to any
input :obj:`mlx.core.array` to produce an initialized output.

For example:

.. code:: python

   import mlx.core as mx
   import mlx.nn as nn

   init_fn = nn.init.uniform()

   # Produces a [2, 2] uniform matrix
   param = init_fn(mx.zeros((2, 2)))

To re-initialize all the parameters in an :obj:`mlx.nn.Module` from, say, a uniform
distribution, you can do:

.. code:: python
  
   import mlx.nn as nn
   model = nn.Sequential(nn.Linear(5, 10), nn.ReLU(), nn.Linear(10, 5))
   init_fn = nn.init.uniform(low=-0.1, high=0.1)
   model.apply(init_fn)
   

.. autosummary::
   :toctree: _autosummary

   constant
   normal
   uniform
   identity
   glorot_normal
   glorot_uniform
   he_normal
   he_uniform


================================================
FILE: docs/src/python/nn/layers.rst
================================================
.. _layers:

.. currentmodule:: mlx.nn

Layers
------

.. autosummary::
   :toctree: _autosummary
   :template: nn-module-template.rst

   ALiBi
   AllToShardedLinear
   AvgPool1d
   AvgPool2d
   AvgPool3d
   BatchNorm
   CELU
   Conv1d
   Conv2d
   Conv3d
   ConvTranspose1d
   ConvTranspose2d
   ConvTranspose3d
   Dropout
   Dropout2d
   Dropout3d
   Embedding
   ELU
   GELU
   GLU
   GroupNorm
   GRU
   HardShrink
   HardTanh
   Hardswish
   InstanceNorm
   LayerNorm
   LeakyReLU
   Linear
   LogSigmoid
   LogSoftmax
   LSTM
   MaxPool1d
   MaxPool2d
   MaxPool3d
   Mish
   MultiHeadAttention
   PReLU
   QuantizedAllToShardedLinear
   QuantizedEmbedding
   QuantizedLinear
   QuantizedShardedToAllLinear
   RMSNorm
   ReLU
   ReLU2
   ReLU6
   RNN
   RoPE
   SELU
   Sequential
   ShardedToAllLinear
   Sigmoid
   SiLU
   SinusoidalPositionalEncoding
   Softmin
   Softshrink
   Softsign
   Softmax
   Softplus
   Step
   Tanh
   Transformer
   Upsample


================================================
FILE: docs/src/python/nn/losses.rst
================================================
.. _losses:

.. currentmodule:: mlx.nn.losses

Loss Functions
--------------

.. autosummary::
   :toctree: _autosummary_functions
   :template: nn-module-template.rst

   binary_cross_entropy
   cosine_similarity_loss
   cross_entropy
   gaussian_nll_loss
   hinge_loss
   huber_loss
   kl_div_loss
   l1_loss
   log_cosh_loss
   margin_ranking_loss
   mse_loss
   nll_loss
   smooth_l1_loss
   triplet_loss

================================================
FILE: docs/src/python/nn/module.rst
================================================
Module
======

.. currentmodule:: mlx.nn

.. autoclass:: Module

   .. rubric:: Attributes

   .. autosummary::
      :toctree: _autosummary
   
      Module.training
      Module.state
   
   .. rubric:: Methods

   .. autosummary::
      :toctree: _autosummary
   
      Module.apply
      Module.apply_to_modules
      Module.children
      Module.eval
      Module.filter_and_map
      Module.freeze
      Module.leaf_modules
      Module.load_weights
      Module.modules
      Module.named_modules
      Module.parameters
      Module.save_weights
      Module.set_dtype
      Module.train
      Module.trainable_parameters
      Module.unfreeze
      Module.update
      Module.update_modules


================================================
FILE: docs/src/python/nn.rst
================================================
.. _nn:

.. currentmodule:: mlx.nn

Neural Networks
===============

Writing arbitrarily complex neural networks in MLX can be done using only
:class:`mlx.core.array` and :meth:`mlx.core.value_and_grad`. However, this requires the
user to write again and again the same simple neural network operations as well
as handle all the parameter state and initialization manually and explicitly.

The module :mod:`mlx.nn` solves this problem by providing an intuitive way of
composing neural network layers, initializing their parameters, freezing them
for finetuning and more.

Quick Start with Neural Networks
---------------------------------

.. code-block:: python

    import mlx.core as mx
    import mlx.nn as nn

    class MLP(nn.Module):
        def __init__(self, in_dims: int, out_dims: int):
            super().__init__()

            self.layers = [
                nn.Linear(in_dims, 128),
                nn.Linear(128, 128),
                nn.Linear(128, out_dims),
            ]

        def __call__(self, x):
            for i, l in enumerate(self.layers):
                x = mx.maximum(x, 0) if i > 0 else x
                x = l(x)
            return x

    # The model is created with all its parameters but nothing is initialized
    # yet because MLX is lazily evaluated
    mlp = MLP(2, 10)

    # We can access its parameters by calling mlp.parameters()
    params = mlp.parameters()
    print(params["layers"][0]["weight"].shape)

    # Printing a parameter will cause it to be evaluated and thus initialized
    print(params["layers"][0])

    # We can also force evaluate all parameters to initialize the model
    mx.eval(mlp.parameters())

    # A simple loss function.
    # NOTE: It doesn't matter how it uses the mlp model. It currently captures
    #       it from the local scope. It could be a positional argument or a
    #       keyword argument.
    def l2_loss(x, y):
        y_hat = mlp(x)
        return (y_hat - y).square().mean()

    # Calling `nn.value_and_grad` instead of `mx.value_and_grad` returns the
    # gradient with respect to `mlp.trainable_parameters()`
    loss_and_grad = nn.value_and_grad(mlp, l2_loss)

.. _module_class:

The Module Class
----------------

The workhorse of any neural network library is the :class:`Module` class. In
MLX the :class:`Module` class is a container of :class:`mlx.core.array` or
:class:`Module` instances. Its main function is to provide a way to
recursively **access** and **update** its parameters and those of its
submodules.

Parameters
^^^^^^^^^^

A parameter of a module is any public member of type :class:`mlx.core.array` (its
name should not start with ``_``). It can be arbitrarily nested in other
:class:`Module` instances or lists and dictionaries.

:meth:`Module.parameters` can be used to extract a nested dictionary with all
the parameters of a module and its submodules.

A :class:`Module` can also keep track of "frozen" parameters. See the
:meth:`Module.freeze` method for more details. When using
:meth:`mlx.nn.value_and_grad` the gradients returned will be with respect to
the trainable (i.e. non-frozen) parameters.


Updating the Parameters
^^^^^^^^^^^^^^^^^^^^^^^

MLX modules allow accessing and updating individual parameters. However, most
times we need to update large subsets of a module's parameters. This action is
performed by :meth:`Module.update`.


Inspecting Modules
^^^^^^^^^^^^^^^^^^

The simplest way to see the model architecture is to print it. Following along with
the above example, you can print the ``MLP`` with:

.. code-block:: python

  print(mlp)

This will display:

.. code-block:: shell

  MLP(
    (layers.0): Linear(input_dims=2, output_dims=128, bias=True)
    (layers.1): Linear(input_dims=128, output_dims=128, bias=True)
    (layers.2): Linear(input_dims=128, output_dims=10, bias=True)
  )

To get more detailed information on the arrays in a :class:`Module` you can use
:func:`mlx.utils.tree_map` on the parameters. For example, to see the shapes of
all the parameters in a :class:`Module` do:

.. code-block:: python

   from mlx.utils import tree_map
   shapes = tree_map(lambda p: p.shape, mlp.parameters())

As another example, you can count the number of parameters in a :class:`Module`
with:

.. code-block:: python

   from mlx.utils import tree_flatten
   num_params = sum(v.size for _, v in tree_flatten(mlp.parameters()))


Value and Grad
--------------

Using a :class:`Module` does not preclude using MLX's higher-order function
transformations (:meth:`mlx.core.value_and_grad`, :meth:`mlx.core.grad`, etc.). However,
these function transformations assume pure functions, namely the parameters
should be passed as an argument to the function being transformed.

There is an easy pattern to achieve that with MLX modules:

.. code-block:: python

    model = ...

    def f(params, other_inputs):
        model.update(params)  # <---- Necessary to make the model use the passed parameters
        return model(other_inputs)

    f(model.trainable_parameters(), mx.zeros((10,)))

However, :meth:`mlx.nn.value_and_grad` provides precisely this pattern and only
computes the gradients with respect to the trainable parameters of the model.

In detail:

- it wraps the passed function with a function that calls :meth:`Module.update`
  to make sure the model is using the provided parameters.
- it calls :meth:`mlx.core.value_and_grad` to transform the function into a function
  that also computes the gradients with respect to the passed parameters.
- it wraps the returned function with a function that passes the trainable
  parameters as the first argument to the function returned by
  :meth:`mlx.core.value_and_grad`

.. autosummary::
   :toctree: _autosummary

   value_and_grad
   quantize
   average_gradients
   fsdp_apply_gradients

.. toctree::

   nn/module
   nn/layers
   nn/functions
   nn/losses
   nn/init
   nn/distributed


================================================
FILE: docs/src/python/ops.rst
================================================
.. _ops:

Operations
==========

.. currentmodule:: mlx.core

.. autosummary::
  :toctree: _autosummary

   abs
   add
   addmm
   all
   allclose
   any
   arange
   arccos
   arccosh
   arcsin
   arcsinh
   arctan
   arctan2
   arctanh
   argmax
   argmin
   argpartition
   argsort
   array_equal
   as_strided
   atleast_1d
   atleast_2d
   atleast_3d
   bitwise_and
   bitwise_invert
   bitwise_or
   bitwise_xor
   block_masked_mm
   broadcast_arrays
   broadcast_to
   ceil
   clip
   concatenate
   contiguous
   conj
   conjugate
   convolve
   conv1d
   conv2d
   conv3d
   conv_transpose1d
   conv_transpose2d
   conv_transpose3d
   conv_general
   cos
   cosh
   cummax
   cummin
   cumprod
   cumsum
   degrees
   dequantize
   diag
   diagonal
   divide
   divmod
   einsum
   einsum_path
   equal
   erf
   erfinv
   exp
   expm1
   expand_dims
   eye
   flatten
   floor
   floor_divide
   full
   gather_mm
   gather_qmm
   greater
   greater_equal
   hadamard_transform
   identity
   imag
   inner
   isfinite
   isclose
   isinf
   isnan
   isneginf
   isposinf
   issubdtype
   kron
   left_shift
   less
   less_equal
   linspace
   load
   log
   log2
   log10
   log1p
   logaddexp
   logcumsumexp
   logical_not
   logical_and
   logical_or
   logsumexp
   matmul
   max
   maximum
   mean
   median
   meshgrid
   min
   minimum
   moveaxis
   multiply
   nan_to_num
   negative
   not_equal
   ones
   ones_like
   outer
   partition
   pad
   power
   prod
   put_along_axis
   quantize
   quantized_matmul
   radians
   real
   reciprocal
   remainder
   repeat
   reshape
   right_shift
   roll
   round
   rsqrt
   save
   savez
   savez_compressed
   save_gguf
   save_safetensors
   sigmoid
   sign
   sin
   sinh
   slice
   slice_update
   softmax
   sort
   split
   sqrt
   square
   squeeze
   stack
   std
   stop_gradient
   subtract
   sum
   swapaxes
   take
   take_along_axis
   tan
   tanh
   tensordot
   tile
   topk
   trace
   transpose
   tri
   tril
   triu
   unflatten
   var
   view
   where
   zeros
   zeros_like


================================================
FILE: docs/src/python/optimizers/common_optimizers.rst
================================================
.. _common_optimizers:

Common Optimizers
=================

.. currentmodule:: mlx.optimizers

.. autosummary::
   :toctree: _autosummary
   :template: optimizers-template.rst

   SGD
   RMSprop
   Adagrad
   Adafactor
   AdaDelta
   Adam
   AdamW
   Adamax
   Lion
   MultiOptimizer
   Muon


================================================
FILE: docs/src/python/optimizers/optimizer.rst
================================================
Optimizer
=========

.. currentmodule:: mlx.optimizers

.. autoclass:: Optimizer 


   .. rubric:: Attributes

   .. autosummary::
      :toctree: _autosummary

      Optimizer.state
   
   .. rubric:: Methods

   .. autosummary::
      :toctree: _autosummary
   
      Optimizer.apply_gradients
      Optimizer.init
      Optimizer.update


================================================
FILE: docs/src/python/optimizers/schedulers.rst
================================================
.. _schedulers:

Schedulers
==========

.. currentmodule:: mlx.optimizers

.. autosummary::
   :toctree: _autosummary

   cosine_decay    
   exponential_decay    
   join_schedules
   linear_schedule
   step_decay    


================================================
FILE: docs/src/python/optimizers.rst
================================================
.. _optimizers:

.. currentmodule:: mlx.optimizers

Optimizers
==========

The optimizers in MLX can be used both with :mod:`mlx.nn` and with pure
:mod:`mlx.core` functions. A typical example involves calling
:meth:`Optimizer.update` to update a model's parameters based on the loss
gradients and subsequently calling :func:`mlx.core.eval` to evaluate both the
model's parameters and the **optimizer state**.

.. code-block:: python

    # Create a model
    model = MLP(num_layers, train_images.shape[-1], hidden_dim, num_classes)
    mx.eval(model.parameters())

    # Create the gradient function and the optimizer
    loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
    optimizer = optim.SGD(learning_rate=learning_rate)

    for e in range(num_epochs):
        for X, y in batch_iterate(batch_size, train_images, train_labels):
            loss, grads = loss_and_grad_fn(model, X, y)

            # Update the model with the gradients. So far no computation has happened.
            optimizer.update(model, grads)

            # Compute the new parameters but also the optimizer state.
            mx.eval(model.parameters(), optimizer.state)

Saving and Loading
------------------

To serialize an optimizer, save its state. To load an optimizer, load and set
the saved state. Here's a simple example:

.. code-block:: python

   import mlx.core as mx
   from mlx.utils import tree_flatten, tree_unflatten
   import mlx.optimizers as optim

   optimizer = optim.Adam(learning_rate=1e-2)

   # Perform some updates with the optimizer
   model = {"w" : mx.zeros((5, 5))}
   grads = {"w" : mx.ones((5, 5))}
   optimizer.update(model, grads)

   # Save the state
   state = tree_flatten(optimizer.state, destination={})
   mx.save_safetensors("optimizer.safetensors", state)

   # Later on, for example when loading from a checkpoint,
   # recreate the optimizer and load the state
   optimizer = optim.Adam(learning_rate=1e-2)

   state = tree_unflatten(mx.load("optimizer.safetensors"))
   optimizer.state = state

Note, not every optimizer configuration parameter is saved in the state. For
example, for Adam the learning rate is saved but the ``betas`` and ``eps``
parameters are not. A good rule of thumb is if the parameter can be scheduled
then it will be included in the optimizer state.

.. toctree::

   optimizers/optimizer
   optimizers/common_optimizers
   optimizers/schedulers

.. autosummary::
   :toctree: _autosummary

   clip_grad_norm


================================================
FILE: docs/src/python/random.rst
================================================
.. _random:

Random
======

Random sampling functions in MLX use an implicit global PRNG state by default.
However, all functions take an optional ``key`` keyword argument for when more
fine-grained control or explicit state management is needed.

For example, you can generate random numbers with:

.. code-block:: python

  for _ in range(3):
    print(mx.random.uniform())

which will print a sequence of unique pseudo random numbers. Alternatively you
can explicitly set the key:

.. code-block:: python

  key = mx.random.key(0)
  for _ in range(3):
    print(mx.random.uniform(key=key))

which will yield the same pseudo random number at each iteration.

Following `JAX's PRNG design <https://jax.readthedocs.io/en/latest/jep/263-prng.html>`_
we use a splittable version of Threefry, which is a counter-based PRNG.

.. currentmodule:: mlx.core.random

.. autosummary:: 
  :toctree: _autosummary

   bernoulli
   categorical
   gumbel
   key
   normal
   multivariate_normal
   randint
   seed
   split
   truncated_normal
   uniform
   laplace
   permutation


================================================
FILE: docs/src/python/transforms.rst
================================================
.. _transforms:

Transforms
==========

.. currentmodule:: mlx.core

.. autosummary::
  :toctree: _autosummary

   eval
   async_eval
   compile
   checkpoint
   custom_function
   disable_compile
   enable_compile
   grad
   value_and_grad
   jvp
   vjp
   vmap


================================================
FILE: docs/src/python/tree_utils.rst
================================================
.. _utils:

Tree Utils
==========

In MLX we consider a python tree to be an arbitrarily nested collection of
dictionaries, lists and tuples without cycles. Functions in this module that
return python trees will be using the default python ``dict``, ``list`` and
``tuple`` but they can usually process objects that inherit from any of these.

.. note::
   Dictionaries should have keys that are valid python identifiers.

.. currentmodule:: mlx.utils

.. autosummary:: 
  :toctree: _autosummary

   tree_flatten
   tree_unflatten
   tree_map
   tree_map_with_path
   tree_reduce


================================================
FILE: docs/src/usage/compile.rst
================================================
.. _compile:

Compilation
===========

.. currentmodule:: mlx.core

MLX has a :func:`compile` function transformation which compiles computation
graphs. Function compilation results in smaller graphs by merging common work
and fusing certain operations. In many cases this can lead to big improvements
in run-time and memory use.

Getting started with :func:`compile` is simple, but there are some edge cases
that are good to be aware of for more complex graphs and advanced usage.

Basics of Compile
-----------------

Let's start with a simple example:

.. code-block:: python

  def fun(x, y):
      return mx.exp(-x) + y

  x = mx.array(1.0)
  y = mx.array(2.0)

  # Regular call, no compilation
  # Prints: array(2.36788, dtype=float32)
  print(fun(x, y))

  # Compile the function
  compiled_fun = mx.compile(fun)

  # Prints: array(2.36788, dtype=float32)
  print(compiled_fun(x, y))

The output of both the regular function and the compiled function is the same
up to numerical precision.

The first time you call a compiled function, MLX will build the compute
graph, optimize it, and generate and compile code. This can be relatively
slow. However, MLX will cache compiled functions, so calling a compiled
function multiple times will not initiate a new compilation. This means you
should typically compile functions that you plan to use more than once.

.. code-block:: python

  def fun(x, y):
      return mx.exp(-x) + y

  x = mx.array(1.0)
  y = mx.array(2.0)

  compiled_fun = mx.compile(fun)

  # Compiled here
  compiled_fun(x, y)

  # Not compiled again
  compiled_fun(x, y)

  # Not compiled again
  mx.compile(fun)(x, y)

There are some important cases to be aware of that can cause a function to
be recompiled:

* Changing the shape or number of dimensions
* Changing the type of any of the inputs
* Changing the number of inputs to the function

In certain cases only some of the compilation stack will be rerun (for
example when changing the shapes) and in other cases the full compilation
stack will be rerun (for example when changing the types). In general you
should avoid compiling functions too frequently.

Another idiom to watch out for is compiling functions which get created and
destroyed frequently. This can happen, for example, when compiling an anonymous
function in a loop:

.. code-block:: python

  a = mx.array(1.0)
  # Don't do this, compiles lambda at each iteration
  for _ in range(5):
      mx.compile(lambda x: mx.exp(mx.abs(x)))(a)

Example Speedup
---------------

The :func:`mlx.nn.gelu` is a nonlinear activation function commonly used with
Transformer-based models. The implementation involves several unary and binary
element-wise operations:

.. code-block:: python

  def gelu(x):
      return x * (1 + mx.erf(x / math.sqrt(2))) / 2

If you use this function with small arrays, it will be overhead bound. If you
use it with large arrays it will be memory bandwidth bound.  However, all of
the operations in the ``gelu`` are fusible into a single kernel with
:func:`compile`. This can speedup both cases considerably.

Let's compare the runtime of the regular function versus the compiled
function. We'll use the following timing helper which does a warm up and
handles synchronization:

.. code-block:: python

  import time

  def timeit(fun, x):
      # warm up
      for _ in range(10):
          mx.eval(fun(x))

      tic = time.perf_counter()
      for _ in range(100):
          mx.eval(fun(x))
      toc = time.perf_counter()
      tpi = 1e3 * (toc - tic) / 100
      print(f"Time per iteration {tpi:.3f} (ms)")


Now make an array, and benchmark both functions:

.. code-block:: python

  x = mx.random.uniform(shape=(32, 1000, 4096))
  timeit(gelu, x)
  timeit(mx.compile(gelu), x)

On an M1 Max the times are 15.5 and 3.1 milliseconds. The compiled ``gelu`` is
five times faster.

Debugging
---------

When a compiled function is first called, it is traced with placeholder
inputs. This means you can't evaluate arrays (for example to print their
contents) inside compiled functions.

.. code-block:: python

  @mx.compile
  def fun(x):
      z = -x
      print(z)  # Crash
      return mx.exp(z)

  fun(mx.array(5.0))

For debugging, inspecting arrays can be helpful. One way to do that is to
globally disable compilation using the :func:`disable_compile` function or
``MLX_DISABLE_COMPILE`` flag. For example the following is okay even though
``fun`` is compiled:

.. code-block:: python

  @mx.compile
  def fun(x):
      z = -x
      print(z) # Okay
      return mx.exp(z)

  mx.disable_compile()
  fun(mx.array(5.0))


Pure Functions
--------------

Compiled functions are intended to be *pure*; that is they should not have side
effects. For example:

.. code-block:: python

  state = []

  @mx.compile
  def fun(x, y):
      z = x + y
      state.append(z)
      return mx.exp(z)

  fun(mx.array(1.0), mx.array(2.0))
  # Crash!
  print(state)

After the first call of ``fun``, the ``state`` list will hold a placeholder
array. The placeholder does not have any data; it is only used to build the
computation graph. Printing such an array results in a crash.

You have two options to deal with this. The first option is to simply return
``state`` as an output:

.. code-block:: python

  state = []

  @mx.compile
  def fun(x, y):
      z = x + y
      state.append(z)
      return mx.exp(z), state

  _, state = fun(mx.array(1.0), mx.array(2.0))
  # Prints [array(3, dtype=float32)]
  print(state)

In some cases returning updated state can be pretty inconvenient. Hence,
:func:`compile` has a parameter to capture implicit outputs:

.. code-block:: python

  from functools import partial

  state = []

  # Tell compile to capture state as an output
  @partial(mx.compile, outputs=state)
  def fun(x, y):
      z = x + y
      state.append(z)
      return mx.exp(z)

  fun(mx.array(1.0), mx.array(2.0))
  # Prints [array(3, dtype=float32)]
  print(state)

This is particularly useful for compiling a function which includes an update
to a container of arrays, as is commonly done when training the parameters of a
:class:`mlx.nn.Module`.

Compiled functions will also treat any inputs not in the parameter list as
constants. For example:

.. code-block:: python

  state = [mx.array(1.0)]

  @mx.compile
  def fun(x):
      return x + state[0]

  # Prints array(2, dtype=float32)
  print(fun(mx.array(1.0)))

  # Update state
  state[0] = mx.array(5.0)

  # Still prints array(2, dtype=float32)
  print(fun(mx.array(1.0)))

In order to have the change of state reflected in the outputs of ``fun`` you
again have two options. The first option is to simply pass ``state`` as input
to the function.

.. code-block:: python

  state = [mx.array(1.0)]

  @mx.compile
  def fun(x, state):
      return x + state[0]

  # Prints array(2, dtype=float32)
  print(fun(mx.array(1.0), state))

  # Update state
  state[0] = mx.array(5.0)

  # Prints array(6, dtype=float32)
  print(fun(mx.array(1.0), state))

In some cases this can be pretty inconvenient. Hence,
:func:`compile` also has a parameter to capture implicit inputs:

.. code-block:: python

  from functools import partial
  state = [mx.array(1.0)]

  # Tell compile to capture state as an input
  @partial(mx.compile, inputs=state)
  def fun(x):
      return x + state[0]

  # Prints array(2, dtype=float32)
  print(fun(mx.array(1.0)))

  # Update state
  state[0] = mx.array(5.0)

  # Prints array(6, dtype=float32)
  print(fun(mx.array(1.0)))


Compiling Training Graphs
-------------------------

This section will step through how to use :func:`compile` with a simple example
of a common setup: training a model with :obj:`mlx.nn.Module` using an
:obj:`mlx.optimizers.Optimizer` with state. We will show how to compile the
full forward, backward, and update with :func:`compile`.

To start, here is the simple example without any compilation:

.. code-block:: python

  import mlx.core as mx
  import mlx.nn as nn
  import mlx.optimizers as optim

  # 4 examples with 10 features each
  x = mx.random.uniform(shape=(4, 10))

  # 0, 1 targets
  y = mx.array([0, 1, 0, 1])

  # Simple linear model
  model = nn.Linear(10, 1)

  # SGD with momentum
  optimizer = optim.SGD(learning_rate=0.1, momentum=0.8)

  def loss_fn(model, x, y):
      logits = model(x).squeeze()
      return nn.losses.binary_cross_entropy(logits, y)

  loss_and_grad_fn = nn.value_and_grad(model, loss_fn)

  # Perform 10 steps of gradient descent
  for it in range(10):
      loss, grads = loss_and_grad_fn(model, x, y)
      optimizer.update(model, grads)
      mx.eval(model.parameters(), optimizer.state)

To compile the update we can put it all in a function and compile it with the
appropriate input and output captures. Here's the same example but compiled:

.. code-block:: python

  import mlx.core as mx
  import mlx.nn as nn
  import mlx.optimizers as optim
  from functools import partial

  # 4 examples with 10 features each
  x = mx.random.uniform(shape=(4, 10))

  # 0, 1 targets
  y = mx.array([0, 1, 0, 1])

  # Simple linear model
  model = nn.Linear(10, 1)

  # SGD with momentum
  optimizer = optim.SGD(learning_rate=0.1, momentum=0.8)

  def loss_fn(model, x, y):
      logits = model(x).squeeze()
      return nn.losses.binary_cross_entropy(logits, y)

  # The state that will be captured as input and output
  state = [model.state, optimizer.state]

  @partial(mx.compile, inputs=state, outputs=state)
  def step(x, y):
      loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
      loss, grads = loss_and_grad_fn(model, x, y)
      optimizer.update(model, grads)
      return loss

  # Perform 10 steps of gradient descent
  for it in range(10):
      loss = step(x, y)
      # Evaluate the model and optimizer state
      mx.eval(state)
      print(loss)


.. note::

  If you are using a module which performs random sampling such as
  :func:`mlx.nn.Dropout`, make sure you also include ``mx.random.state`` in the
  ``state`` captured by :func:`compile`, i.e. ``state = [model.state,
  optimizer.state, mx.random.state]``.


.. note::

   For more examples of compiling full training graphs check out the `MLX
   Examples <https://github.com/ml-explore/mlx-examples>`_ GitHub repo.

Transformations with Compile
----------------------------

In MLX function transformations are composable. You can apply any function
transformation to the output of any other function transformation. For more on
this, see the documentation on :ref:`function transforms
<function_transforms>`.

Compiling transformed functions works just as expected:

.. code-block:: python

  grad_fn = mx.grad(mx.exp)

  compiled_grad_fn = mx.compile(grad_fn)

  # Prints: array(2.71828, dtype=float32)
  print(grad_fn(mx.array(1.0)))

  # Also prints: array(2.71828, dtype=float32)
  print(compiled_grad_fn(mx.array(1.0)))

.. note::

   In order to compile as much as possible, a transformation of a compiled
   function will not by default be compiled. To compile the transformed
   function simply pass it through :func:`compile`.

You can also compile functions which themselves call compiled functions. A
good practice is to compile the outermost function to give :func:`compile`
the most opportunity to optimize the computation graph:

.. code-block:: python

  @mx.compile
  def inner(x):
      return mx.exp(-mx.abs(x))

  def outer(x):
      return inner(inner(x))

  # Compiling the outer function is good to do as it will likely
  # be faster even though the inner functions are compiled
  fun = mx.compile(outer)


.. _shapeless_compile:

Shapeless Compilation
---------------------

When the shape of an input to a compiled function changes, the function is
recompiled. You can compile a function once and run it on inputs with
variable shapes by specifying ``shapeless=True`` to :func:`compile`. In this
case changes to the shapes of the inputs do not cause the function to be
recompiled.

.. code-block:: python

  def fun(x, y):
      return mx.abs(x + y)

  compiled_fun = mx.compile(fun, shapeless=True)

  x = mx.array(1.0)
  y = mx.array(-2.0)

  # First call compiles the function
  print(compiled_fun(x, y))

  # Second call with different shapes
  # does not recompile the function
  x = mx.array([1.0, -6.0])
  y = mx.array([-2.0, 3.0])
  print(compiled_fun(x, y))


Use shapeless compilations carefully. Since compilation is not triggered when
shapes change, any graphs which are conditional on the input shapes will not
work as expected. Shape-dependent computations are common and sometimes subtle
to detect. For example:

.. code-block:: python

  def fun(x):
      return x.reshape(x.shape[0] * x.shape[1], -1)

  compiled_fun = mx.compile(fun, shapeless=True)

  x = mx.random.uniform(shape=(2, 3, 4))

  out = compiled_fun(x)

  x = mx.random.uniform(shape=(5, 5, 3))

  # Error, can't reshape (5, 5, 3) to (6, -1)
  out = compiled_fun(x)

The second call to the ``compiled_fun`` fails because of the call to
:func:`reshape` which uses the static shape of ``x`` in the first call. We can
fix this by using :func:`flatten` to avoid hardcoding the shape of ``x``:

.. code-block:: python

  def fun(x):
      return x.flatten(0, 1)

  compiled_fun = mx.compile(fun, shapeless=True)

  x = mx.random.uniform(shape=(2, 3, 4))

  out = compiled_fun(x)

  x = mx.random.uniform(shape=(5, 5, 3))

  # Ok
  out = compiled_fun(x)


================================================
FILE: docs/src/usage/distributed.rst
================================================
.. _usage_distributed:

Distributed Communication
=========================

.. currentmodule:: mlx.core.distributed

MLX supports distributed communication operations that allow the computational cost
of training or inference to be shared across many physical machines. At the
moment we support several different communication backends introduced below.

.. list-table::
   :widths: 20 80
   :header-rows: 1

   * - Backend
     - Description
   * - :ref:`MPI <mpi_section>`
     - A full featured and mature distributed communications library.
   * - :ref:`RING <ring_section>`
     - Ring all reduce and all gather over TCP sockets. Always available and
       usually faster than MPI.
   * - :ref:`JACCL <jaccl_section>`
     - Low latency communication with RDMA over thunderbolt. Necessary for
       things like tensor parallelism.
   * - :ref:`NCCL <nccl_section>`
     - The backend of choice for CUDA environments.


The list of all currently supported operations and their documentation can be
seen in the :ref:`API docs<distributed>`.

Getting Started
---------------

A distributed program in MLX is as simple as:

.. code:: python

    import mlx.core as mx

    world = mx.distributed.init()
    x = mx.distributed.all_sum(mx.ones(10))
    print(world.rank(), x)

The program above sums the array ``mx.ones(10)`` across all
distributed processes. However, when this script is run with ``python`` only
one process is launched and no distributed communication takes place. Namely,
all operations in ``mx.distributed`` are noops when the distributed group has a
size of one. This property allows us to avoid code that checks if we are in a
distributed setting similar to the one below:

.. code:: python

    import mlx.core as mx

    x = ...
    world = mx.distributed.init()
    # No need for the check we can simply do x = mx.distributed.all_sum(x)
    if world.size() > 1:
        x = mx.distributed.all_sum(x)

Running Distributed Programs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

MLX provides ``mlx.launch``, a helper script to launch distributed programs.
Continuing with our initial example we can run it on localhost with 4 processes using:

.. code:: shell

    $ mlx.launch -n 4 my_script.py
    3 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)
    2 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)
    1 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)
    0 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)

We can also run it on some remote hosts by providing their IPs (provided that
the script exists on all hosts and they are reachable by ssh)

.. code:: shell

    $ mlx.launch --hosts ip1,ip2,ip3,ip4 my_script.py
    3 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)
    2 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)
    1 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)
    0 array([4, 4, 4, ..., 4, 4, 4], dtype=float32)

Consult the dedicated :doc:`usage guide<launching_distributed>` for more
information on using ``mlx.launch``.

Selecting Backend
^^^^^^^^^^^^^^^^^

You can select the backend you want to use when calling :func:`init` by passing
one of ``{'any', 'ring', 'jaccl', 'mpi', 'nccl'}``. When passing ``any``, MLX will try all
available backends. If they all fail then a singleton group is created.

.. note::
   After a distributed backend is successfully initialized :func:`init` will
   return **the same backend** if called without arguments or with backend set to
   ``any``.

The following examples aim to clarify the backend initialization logic in MLX:

.. code:: python

    # Case 1: Initialize MPI regardless if it was possible to initialize the ring backend
    world = mx.distributed.init(backend="mpi")
    world2 = mx.distributed.init()  # subsequent calls return the MPI backend!

    # Case 2: Initialize any backend
    world = mx.distributed.init(backend="any")  # equivalent to no arguments
    world2 = mx.distributed.init()  # same as above

    # Case 3: Initialize both backends at the same time
    world_mpi = mx.distributed.init(backend="mpi")
    world_ring = mx.distributed.init(backend="ring")
    world_any = mx.distributed.init()  # same as MPI because it was initialized first!

Distributed Program Examples
----------------------------

- :ref:`Data Parallelism <data_parallelism>`
- :ref:`Tensor Parallelism <tensor_parallelism>`

.. _ring_section:

Getting Started with Ring
-------------------------

The ring backend does not depend on any third party library so it is always
available. It uses TCP sockets so the nodes need to be reachable via a network.
As the name suggests the nodes are connected in a ring which means that rank 1
can only communicate with rank 0 and rank 2, rank 2 only with rank 1 and rank 3
and so on and so forth. As a result :func:`send` and :func:`recv` with
arbitrary sender and receiver are not supported in the ring backend.

Defining a Ring
^^^^^^^^^^^^^^^

The easiest way to define and use a ring is via a JSON hostfile and the
``mlx.launch`` :doc:`helper script <launching_distributed>`. For each node one
defines a hostname to ssh into to run commands on this node and one or more IPs
that this node will listen to for connections.

For example the hostfile below defines a 4 node ring. ``hostname1`` will be
rank 0, ``hostname2`` rank 1 etc.

.. code:: json

    [
        {"ssh": "hostname1", "ips": ["123.123.123.1"]},
        {"ssh": "hostname2", "ips": ["123.123.123.2"]},
        {"ssh": "hostname3", "ips": ["123.123.123.3"]},
        {"ssh": "hostname4", "ips": ["123.123.123.4"]}
    ]

Running ``mlx.launch --hostfile ring-4.json my_script.py`` will ssh into each
node, run the script which will listen for connections in each of the provided
IPs. Specifically, ``hostname1`` will connect to ``123.123.123.2`` and accept a
connection from ``123.123.123.4`` and so on and so forth.

Thunderbolt Ring
^^^^^^^^^^^^^^^^

Although the ring backend can have benefits over MPI even for Ethernet, its
main purpose is to use Thunderbolt rings for higher bandwidth communication.
Setting up such thunderbolt rings can be done manually, but is a relatively
tedious process. To simplify this, we provide the utility ``mlx.distributed_config``.

To use ``mlx.distributed_config`` your computers need to be accessible by ssh via
Ethernet or Wi-Fi. Subsequently, connect them via thunderbolt cables and then call the
utility as follows:

.. code:: shell

   mlx.distributed_config --verbose --hosts host1,host2,host3,host4 --backend ring

By default the script will attempt to discover the thunderbolt ring and provide
you with the commands to configure each node as well as the ``hostfile.json``
to use with ``mlx.launch``. If password-less ``sudo`` is available on the nodes
then ``--auto-setup`` can be used to configure them automatically.

If you want to go through the process manually, the steps are as follows:

* Disable the thunderbolt bridge interface
* For the cable connecting rank ``i`` to rank ``i + 1`` find the interfaces
  corresponding to that cable in nodes ``i`` and ``i + 1``.
* Set up a unique subnetwork connecting the two nodes for the corresponding
  interfaces. For instance if the cable corresponds to ``en2`` on node ``i``
  and ``en2`` also on node ``i + 1`` then we may assign IPs ``192.168.0.1`` and
  ``192.168.0.2`` respectively to the two nodes. For more details you can see
  the commands prepared by the utility script.

.. _jaccl_section:

Getting Started with JACCL
--------------------------

Starting from macOS 26.2, RDMA over thunderbolt is available and
enables low-latency communication between Macs with thunderbolt 5. MLX provides
the JACCL backend that uses this functionality to achieve communication latency
an order of magnitude lower than the ring backend.

.. note::

   The name JACCL (pronounced Jackal) stands for *Jack and Angelos' Collective
   Communication Library* and it is an obvious pun on Nvidia's NCCL but also a
   tribute to *Jack Beasley* who led the development of RDMA over Thunderbolt
   at Apple.

Enabling RDMA
^^^^^^^^^^^^^

Until the feature matures, enabling RDMA over thunderbolt is slightly more
involved and **cannot** be done remotely even with sudo. In fact, it has to be
done in macOS recovery:

1. `Start your computer in recovery <https://support.apple.com/en-us/102518>`_.
2. Open the Terminal by going to Utilities -> Terminal.
3. Run ``rdma_ctl enable``.
4. Reboot.

To verify that you have successfully enabled Thunderbolt RDMA you can run
``ibv_devices`` which should produce something like the following for an M3 Ultra.

.. code-block:: bash

    ~ % ibv_devices
    device          	   node GUID
    ------          	----------------
    rdma_en2        	8096a9d9edbaac05
    rdma_en3        	8196a9d9edbaac05
    rdma_en5        	8396a9d9edbaac05
    rdma_en4        	8296a9d9edbaac05
    rdma_en6        	8496a9d9edbaac05
    rdma_en7        	8596a9d9edbaac05

Defining a Mesh
^^^^^^^^^^^^^^^

The JACCL backend supports only fully connected topologies. Namely, there needs
to be a thunderbolt cable connecting all pairs of Macs directly. For example, in
the following topology visualizations, the left one is valid because there is a
connection from any node to any other node, while for the one on the right M3
Ultra 1 is not connected to M3 Ultra 2.

.. raw:: html

   <div style="display: flex; text-align: center; align-items: end; font-size: 80%;">
     <div>
       <img src="../_static/distributed/m3-ultra-mesh.png" alt="M3 Ultra thunderbolt mesh" style="width: 55%">
       <p>Fully connected mesh of four M3 Ultra.</p>
     </div>
     <div>
       <img src="../_static/distributed/m3-ultra-mesh-broken.png" alt="M3 Ultra broken thunderbolt mesh" style="width: 55%">
       <p>Not a valid mesh (M3 Ultra 1 is not connected to M3 Ultra 2).</p>
     </div>
   </div>

Similar to the ring backend, the easiest way to use JACCL with MLX is to write
a JSON hostfile that will be used by ``mlx.launch``. The hostfile needs to contain

- Hostnames to use for launching scripts via ssh
- An IP for rank 0 that is reachable by all nodes
- A list of rdma devices that connect each node to each other node

The following JSON defines the valid 4-node mesh from the image above.

.. code-block:: json

    [
        {
            "ssh": "m3-ultra-1",
            "ips": ["123.123.123.1"],
            "rdma": [null, "rdma_en5", "rdma_en4", "rdma_en3"]
        },
        {
            "ssh": "m3-ultra-2",
            "ips": [],
            "rdma": ["rdma_en5", null, "rdma_en3", "rdma_en4"]
        },
        {
            "ssh": "m3-ultra-3",
            "ips": [],
            "rdma": ["rdma_en4", "rdma_en3", null, "rdma_en5"]
        },
        {
            "ssh": "m3-ultra-4",
            "ips": [],
            "rdma": ["rdma_en3", "rdma_en4", "rdma_en5", null]
        }
    ]

Even though TCP/IP is not used when communicating with Thunderbolt RDMA,
disabling the thunderbolt bridge is still required as well as setting up
isolated local networks for each thunderbolt connection.

All of the above can be done instead via ``mlx.distributed_config``. This helper
script will

- ssh into each node
- extract the thunderbolt connectivity
- check for a valid mesh
- provide the commands to configure each node (or run them if sudo is available)
- generate the hostfile to be used with ``mlx.launch``

Putting It All Together
^^^^^^^^^^^^^^^^^^^^^^^^

For example launching a distributed MLX script that uses JACCL is fairly simple
if the nodes are reachable via ssh and have password-less sudo.

First, connect all the thunderbolt cables. Then we can verify the connections
by using the ``mlx.distributed_config`` script to visualize them.

.. code-block::

   mlx.distributed_config --verbose \
        --hosts m3-ultra-1,m3-ultra-2,m3-ultra-3,m3-ultra-4 \
        --over thunderbolt --dot | dot -Tpng | open -f -a Preview

After making sure that everything looks right we can auto-configure the nodes
and save the hostfile to ``m3-ultra-jaccl.json`` by running:

.. code-block::

   mlx.distributed_config --verbose \
        --hosts m3-ultra-1,m3-ultra-2,m3-ultra-3,m3-ultra-4 \
        --over thunderbolt --backend jaccl \
        --auto-setup --output m3-ultra-jaccl.json

And now we are ready to run a distributed MLX script such as distributed inference
of a gigantic model using MLX LM.

.. code-block::

   mlx.launch --verbose --backend jaccl --hostfile m3-ultra-jaccl.json \
        --env MLX_METAL_FAST_SYNCH=1 -- \  # <--- important
        /path/to/remote/python -m mlx_lm chat --model mlx-community/DeepSeek-R1-0528-4bit

.. note::

   Defining the environment variable ``MLX_METAL_FAST_SYNCH=1`` enables a
   different, faster way of synchronizing between the GPU and the CPU. It is
   not specific to the JACCL backend and can be used in all cases where the CPU
   and GPU need to collaborate for some computation and is pretty critical for
   low-latency communication since the communication is done by the CPU.

.. _nccl_section:

Getting Started with NCCL
-------------------------

MLX on CUDA environments ships with the ability to talk to `NCCL
<https://developer.nvidia.com/nccl>`_ which is a high-performance collective
communication library that supports both multi-gpu and multi-node setups.

For CUDA environments, NCCL is the default backend for ``mlx.launch`` and all
it takes to run a distributed job is

.. code-block::

   mlx.launch -n 8 test.py

   # perfect for interactive scripts
   mlx.launch -n 8 python -m mlx_lm chat --model my-model

You can also use ``mlx.launch`` to ssh to a remote node and launch a script
with the same ease

.. code-block::

   mlx.launch --hosts my-cuda-node -n 8 test.py

In many cases you may not want to use ``mlx.launch`` with the NCCL backend
because the cluster scheduler will be the one launching the processes. You can
:ref:`see which environment variables need to be defined <no_mlx_launch>` in
order for the MLX NCCL backend to be initialized correctly.

.. _mpi_section:

Getting Started with MPI
------------------------

MLX already comes with the ability to "talk" to `MPI
<https://en.wikipedia.org/wiki/Message_Passing_Interface>`_ if it is installed
on the machine. Launching distributed MLX programs that use MPI can be done
with ``mpirun`` as expected. However, in the following examples we will be
using ``mlx.launch --backend mpi`` which takes care of some nuisances such as
setting absolute paths for the ``mpirun`` executable and the ``libmpi.dylib``
shared library.

The simplest possible usage is the following which, assuming the minimal
example in the beginning of this page, should result in:

.. code:: shell

    $ mlx.launch --backend mpi -n 2 test.py
    1 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)
    0 array([2, 2, 2, ..., 2, 2, 2], dtype=float32)

The above launches two processes on the same (local) machine and we can see
both standard output streams. The processes send the array of 1s to each other
and compute the sum which is printed. Launching with ``mlx.launch -n 4 ...`` would
print 4 etc.

Installing MPI
^^^^^^^^^^^^^^

MPI can be installed with Homebrew, pip, using the Anaconda package manager, or
compiled from source. Most of our testing is done using ``openmpi`` installed
with the Anaconda package manager as follows:

.. code:: shell

    $ conda install conda-forge::openmpi

Installing with Homebrew or pip requires specifying the location of ``libmpi.dylib``
so that MLX can find it and load it at runtime. This can simply be achieved by
passing the ``DYLD_LIBRARY_PATH`` environment variable to ``mpirun`` and it is
done automatically by ``mlx.launch``. Some environments use a non-standard
library filename that can be specified using the ``MPI_LIBNAME`` environment
variable. This is automatically taken care of by ``mlx.launch`` as well.

.. code:: shell

    $ mpirun -np 2 -x DYLD_LIBRARY_PATH=/opt/homebrew/lib/ -x MPI_LIBNAME=libmpi.40.dylib python test.py
    $ # or simply
    $ mlx.launch -n 2 test.py

Setting up Remote Hosts
^^^^^^^^^^^^^^^^^^^^^^^

MPI can automatically connect to remote hosts and set up the communication over
the network if the remote hosts can be accessed via ssh. A good checklist to
debug connectivity issues is the following:

* ``ssh hostname`` works from all machines to all machines without asking for
  password or host confirmation
* ``mpirun`` is accessible on all machines.
* Ensure that the ``hostname`` used by MPI is the one that you have configured
  in the ``.ssh/config`` files on all machines.

Tuning MPI All Reduce
^^^^^^^^^^^^^^^^^^^^^

.. note::

    For faster all reduce consider using the ring backend either with Thunderbolt
    connections or over Ethernet.

Configure MPI to use N tcp connections between each host to improve bandwidth
by passing ``--mca btl_tcp_links N``.

Force MPI to use the most performant network interface by setting ``--mca
btl_tcp_if_include <iface>`` where ``<iface>`` should be the interface you want
to use.

.. _no_mlx_launch:

Distributed Without ``mlx.launch``
----------------------------------

None of the implementations of the distributed backends require launching with
``mlx.launch``. The script simply connects to each host, starts a process per
rank, and sets up the necessary environment variables before delegating to your
MLX script. See the :doc:`dedicated documentation page <launching_distributed>`
for more details.

For many use-cases this will be the easiest way to perform distributed
computations in MLX. However, there may be reasons that you cannot or should
not use ``mlx.launch``. A common such case is the use of a scheduler that
starts all the processes for you on machines undetermined at the time of
scheduling the job.

Below we list the environment variables required to use each backend.

Ring
^^^^^^

**MLX_RANK** should contain a single 0-based integer that defines the rank of
the process.

**MLX_HOSTFILE** should contain the path to a json file that contains IPs and
ports for each rank to listen to, something like the following:

.. code-block:: json

   [
     ["123.123.1.1:5000", "123.123.1.2:5000"],
     ["123.123.2.1:5000", "123.123.2.2:5000"],
     ["123.123.3.1:5000", "123.123.3.2:5000"],
     ["123.123.4.1:5000", "123.123.4.2:5000"]
   ]

**MLX_RING_VERBOSE** is optional and if set to 1 it enables some more logging
from the distributed backend.

JACCL
^^^^^

**MLX_RANK** should contain a single 0-based integer that defines the rank of
the process.

**MLX_JACCL_COORDINATOR** should contain the IP and port that rank 0 listens on
and that all the other ranks connect to in order to establish the RDMA
connections.

**MLX_IBV_DEVICES** should contain the path to a json file that contains the
ibverbs device names that connect each node to each other node, something like
the following:

.. code-block:: json

   [
      [null, "rdma_en5", "rdma_en4", "rdma_en3"],
      ["rdma_en5", null, "rdma_en3", "rdma_en4"],
      ["rdma_en4", "rdma_en3", null, "rdma_en5"],
      ["rdma_en3", "rdma_en4", "rdma_en5", null]
   ]


NCCL
^^^^^

**MLX_RANK** should contain a single 0-based integer that defines the rank of
the process.

**MLX_WORLD_SIZE** should contain the total number of processes that will be
launched.

**NCCL_HOST_IP** and **NCCL_PORT** should contain the IP and port that all
hosts can connect to in order to establish the NCCL communication.

**CUDA_VISIBLE_DEVICES** should contain the local index of the gpu that
corresponds to this process.

Of course any `other environment variable
<https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`_ that is
used by NCCL can be set.

.. _tips_and_tricks:

Tips and Tricks
----------------

This is a small collection of tips to help you better utilize the distributed
communication capabilities of MLX.

- *Test locally first.*

  You can use the pattern ``mlx.launch -n2 -- my_script.py`` to run a small
  scale test on a single node first.

- *Batch your communication.*

  As described in the :ref:`training example <training_example>`, performing a
  lot of small communications can hurt performance. Copy the approach of
  :func:`mlx.nn.average_gradients` to gather many small communications in a
  single large one.

- *Visualize the connectivity.*

  Use ``mlx.distributed_config --hosts h1,h2,h3 --over thunderbolt --dot`` to
  visualize the connections and make sure that the cables are connected
  correctly. See the :ref:`JACCL section <jaccl_section>` for examples.

- *Use the debugger.*

  ``mlx.launch`` is meant for interactive use. It broadcasts stdin to all
  processes and gathers stdout from all processes. This makes using ``pdb`` a
  breeze.


================================================
FILE: docs/src/usage/export.rst
================================================
.. _export_usage:

Exporting Functions
===================

.. currentmodule:: mlx.core

MLX has an API to export and import functions to and from a file. This lets you
run computations written in one MLX front-end (e.g. Python) in another MLX
front-end (e.g. C++).

This guide walks through the basics of the MLX export API with some examples.
To see the full list of functions check out the :ref:`API documentation
<export>`.

Basics of Exporting
-------------------

Let's start with a simple example:

.. code-block:: python

  def fun(x, y):
    return x + y

  x = mx.array(1.0)
  y = mx.array(1.0)
  mx.export_function("add.mlxfn", fun, x, y)

To export a function, provide sample input arrays that the function
can be called with. The data doesn't matter, but the shapes and types of the
arrays do. In the above example we exported ``fun`` with two ``float32``
scalar arrays. We can then import the function and run it:

.. code-block:: python

  add_fun = mx.import_function("add.mlxfn")

  out, = add_fun(mx.array(1.0), mx.array(2.0))
  # Prints: array(3, dtype=float32)
  print(out)

  out, = add_fun(mx.array(1.0), mx.array(3.0))
  # Prints: array(4, dtype=float32)
  print(out)

  # Raises an exception
  add_fun(mx.array(1), mx.array(3.0))

  # Raises an exception
  add_fun(mx.array([1.0, 2.0]), mx.array(3.0))

Notice the third and fourth calls to ``add_fun`` raise exceptions because the
shapes and types of the inputs are different than the shapes and types of the
example inputs we exported the function with.

Also notice that even though the original ``fun`` returns a single output
array, the imported function always returns a tuple of one or more arrays.

The inputs to :func:`export_function` and to an imported function can be
specified as variable positional arguments or as a tuple of arrays:

.. code-block:: python

  def fun(x, y):
    return x + y

  x = mx.array(1.0)
  y = mx.array(1.0)

  # Both arguments to fun are positional
  mx.export_function("add.mlxfn", fun, x, y)

  # Same as above
  mx.export_function("add.mlxfn", fun, (x, y))

  imported_fun = mx.import_function("add.mlxfn")

  # Ok
  out, = imported_fun(x, y)

  # Also ok
  out, = imported_fun((x, y))

You can pass example inputs to functions as positional or keyword arguments. If
you use keyword arguments to export the function, then you have to use the same
keyword arguments when calling the imported function.

.. code-block:: python

  def fun(x, y):
    return x + y

  # One argument to fun is positional, the other is a kwarg
  mx.export_function("add.mlxfn", fun, x, y=y)

  imported_fun = mx.import_function("add.mlxfn")

  # Ok
  out, = imported_fun(x, y=y)

  # Also ok
  out, = imported_fun((x,), {"y": y})

  # Raises since the keyword argument is missing
  out, = imported_fun(x, y)

  # Raises since the keyword argument has the wrong key
  out, = imported_fun(x, z=y)


Exporting Modules
-----------------

An :obj:`mlx.nn.Module` can be exported with or without the parameters included
in the exported function. Here's an example:

.. code-block:: python

   model = nn.Linear(4, 4)
   mx.eval(model.parameters())

   def call(x):
      return model(x)

   mx.export_function("model.mlxfn", call, mx.zeros(4))

In the above example, the :obj:`mlx.nn.Linear` module is exported. Its
parameters are also saved to the ``model.mlxfn`` file.

.. note::

   For enclosed arrays inside an exported function, be extra careful to ensure
   they are evaluated. The computation graph that gets exported will include
   the computation that produces enclosed inputs.

   If the above example was missing ``mx.eval(model.parameters())``, the
   exported function would include the random initialization of the
   :obj:`mlx.nn.Module` parameters.

If you only want to export the ``Module.__call__`` function without the
parameters, pass them as inputs to the ``call`` wrapper:

.. code-block:: python

   model = nn.Linear(4, 4)
   mx.eval(model.parameters())

   def call(x, **params):
     # Set the model's parameters to the input parameters
     model.update(tree_unflatten(list(params.items())))
     return model(x)

   params = tree_flatten(model.parameters(), destination={})
   mx.export_function("model.mlxfn", call, (mx.zeros(4),), params)


Exporting with a Callback
-------------------------

To inspect the exported graph, you can pass a callback instead of a file path
to :func:`export_function`.

.. code-block:: python

  def fun(x):
    return x.astype(mx.int32)

  def callback(args):
    print(args)

  mx.export_function(callback, fun, mx.array([1.0, 2.0]))

The argument to the callback (``args``) is a dictionary which includes a
``type`` field. The possible types are:

* ``"inputs"``: The ordered positional inputs to the exported function
* ``"keyword_inputs"``: The keyword specified inputs to the exported function
* ``"outputs"``: The ordered outputs of the exported function
* ``"constants"``: Any graph constants
* ``"primitives"``: Inner graph nodes representing the operations

Each type has additional fields in the ``args`` dictionary.


Shapeless Exports
-----------------

Just like :func:`compile`, functions can also be exported for dynamically shaped
inputs. Pass ``shapeless=True`` to :func:`export_function` or :func:`exporter`
to export a function which can be used for inputs with variable shapes:

.. code-block:: python

  mx.export_function("fun.mlxfn", mx.abs, mx.array([0.0]), shapeless=True)
  imported_abs = mx.import_function("fun.mlxfn")

  # Ok
  out, = imported_abs(mx.array([-1.0]))

  # Also ok
  out, = imported_abs(mx.array([-1.0, -2.0]))

With ``shapeless=False`` (which is the default), the second call to
``imported_abs`` would raise an exception with a shape mismatch.

Shapeless exporting works the same as shapeless compilation and should be
used carefully. See the :ref:`documentation on shapeless compilation
<shapeless_compile>` for more information.

Exporting Multiple Traces
-------------------------

In some cases, functions build different computation graphs for different
input arguments. A simple way to manage this is to export to a new file with
each set of inputs. This is a fine option in many cases. But it can be
suboptimal if the exported functions have a large amount of duplicate constant
data (for example the parameters of a :obj:`mlx.nn.Module`).

The export API in MLX lets you export multiple traces of the same function to
a single file by creating an exporting context manager with :func:`exporter`:

.. code-block:: python

  def fun(x, y=None):
      constant = mx.array(3.0)
      if y is not None:
        x += y
      return x + constant

  with mx.exporter("fun.mlxfn", fun) as exporter:
      exporter(mx.array(1.0))
      exporter(mx.array(1.0), y=mx.array(0.0))

  imported_function = mx.import_function("fun.mlxfn")

  # Call the function with y=None
  out, = imported_function(mx.array(1.0))
  print(out)

  # Call the function with y specified
  out, = imported_function(mx.array(1.0), y=mx.array(1.0))
  print(out)

In the above example the function's constant data (i.e. ``constant``) is only
saved once.

Transformations with Imported Functions
---------------------------------------

Function transformations like :func:`grad`, :func:`vmap`, and :func:`compile` work
on imported functions just like regular Python functions:

.. code-block:: python

  def fun(x):
      return mx.sin(x)

  x = mx.array(0.0)
  mx.export_function("sine.mlxfn", fun, x)

  imported_fun = mx.import_function("sine.mlxfn")

  # Take the derivative of the imported function
  dfdx = mx.grad(lambda x: imported_fun(x)[0])
  # Prints: array(1, dtype=float32)
  print(dfdx(x))

  # Compile the imported function
  compiled_fun = mx.compile(imported_fun)
  # Prints: array(0, dtype=float32)
  print(compiled_fun(x)[0])


Importing Functions in C++
--------------------------

Importing and running functions in C++ is basically the same as importing and
running them in Python. First, follow the :ref:`instructions <mlx_in_cpp>` to
set up a simple C++ project that uses MLX as a library.

Next, export a simple function from Python:

.. code-block:: python

  def fun(x, y):
      return mx.exp(x + y)

  x = mx.array(1.0)
  y = mx.array(1.0)
  mx.export_function("fun.mlxfn", fun, x, y)


Import and run the function in C++ with only a few lines of code:

.. code-block:: c++

  auto fun = mx::import_function("fun.mlxfn");

  auto inputs = {mx::array(1.0), mx::array(1.0)};
  auto outputs = fun(inputs);

  // Prints: array(7.38906, dtype=float32)
  std::cout << outputs[0] << std::endl;

Imported functions can be transformed in C++ just like in Python. Use
``std::vector<mx::array>`` for positional arguments and ``std::map<std::string,
mx::array>`` for keyword arguments when calling imported functions in C++.

More Examples
-------------

Here are a few more complete examples exporting more complex functions from
Python and importing and running them in C++:

* `Inference and training a multi-layer perceptron <https://github.com/ml-explore/mlx/tree/main/examples/export>`_


================================================
FILE: docs/src/usage/function_transforms.rst
================================================
.. _function_transforms:

Function Transforms
===================

.. currentmodule:: mlx.core

MLX uses composable function transformations for automatic differentiation,
vectorization, and compute graph optimizations. To see the complete list of
function transformations check out the :ref:`API documentation <transforms>`.

The key idea behind composable function transformations is that every
transformation returns a function which can be further transformed.

Here is a simple example:

.. code-block:: shell

   >>> dfdx = mx.grad(mx.sin)
   >>> dfdx(mx.array(mx.pi))
   array(-1, dtype=float32)
   >>> mx.cos(mx.array(mx.pi))
   array(-1, dtype=float32)


The output of :func:`grad` on :func:`sin` is simply another function. In this
case it is the gradient of the sine function which is exactly the cosine
function. To get the second derivative you can do:

.. code-block:: shell

   >>> d2fdx2 = mx.grad(mx.grad(mx.sin))
   >>> d2fdx2(mx.array(mx.pi / 2))
   array(-1, dtype=float32)
   >>> mx.sin(mx.array(mx.pi / 2))
   array(1, dtype=float32)

Using :func:`grad` on the output of :func:`grad` is always ok. You keep
getting higher order derivatives.

Any of the MLX function transformations can be composed in any order to any
depth. See the following sections for more information on :ref:`automatic
differentiation <auto diff>` and :ref:`automatic vectorization <vmap>`.
For more information on :func:`compile` see the :ref:`compile documentation <compile>`.


Automatic Differentiation
-------------------------

.. _auto diff:

Automatic differentiation in MLX works on functions rather than on implicit
graphs.

.. note::

   If you are coming to MLX from PyTorch, you no longer need functions like
   ``backward``, ``zero_grad``, and ``detach``, or properties like
   ``requires_grad``.

The most basic example is taking the gradient of a scalar-valued function as we
saw above. You can use the :func:`grad` and :func:`value_and_grad` functions to
compute gradients of more complex functions. By default these functions compute
the gradient with respect to the first argument:

.. code-block:: python

   def loss_fn(w, x, y):
      return mx.mean(mx.square(w * x - y))

   w = mx.array(1.0)
   x = mx.array([0.5, -0.5])
   y = mx.array([1.5, -1.5])

   # Computes the gradient of loss_fn with respect to w:
   grad_fn = mx.grad(loss_fn)
   dloss_dw = grad_fn(w, x, y)
   # Prints array(-1, dtype=float32)
   print(dloss_dw)

   # To get the gradient with respect to x we can do:
   grad_fn = mx.grad(loss_fn, argnums=1)
   dloss_dx = grad_fn(w, x, y)
   # Prints array([-1, 1], dtype=float32)
   print(dloss_dx)


One way to get the loss and gradient is to call ``loss_fn`` followed by
``grad_fn``, but this can result in a lot of redundant work. Instead, you
should use :func:`value_and_grad`. Continuing the above example:


.. code-block:: python

   # Computes the gradient of loss_fn with respect to w:
   loss_and_grad_fn = mx.value_and_grad(loss_fn)
   loss, dloss_dw = loss_and_grad_fn(w, x, y)

   # Prints array(1, dtype=float32)
   print(loss)

   # Prints array(-1, dtype=float32)
   print(dloss_dw)


You can also take the gradient with respect to arbitrarily nested Python
containers of arrays (specifically any of :obj:`list`, :obj:`tuple`, or
:obj:`dict`).

Suppose we wanted a weight and a bias parameter in the above example. A nice
way to do that is the following:

.. code-block:: python

   def loss_fn(params, x, y):
      w, b = params["weight"], params["bias"]
      h = w * x + b
      return mx.mean(mx.square(h - y))

   params = {"weight": mx.array(1.0), "bias": mx.array(0.0)}
   x = mx.array([0.5, -0.5])
   y = mx.array([1.5, -1.5])

   # Computes the gradient of loss_fn with respect to both the
   # weight and bias:
   grad_fn = mx.grad(loss_fn)
   grads = grad_fn(params, x, y)

   # Prints
   # {'weight': array(-1, dtype=float32), 'bias': array(0, dtype=float32)}
   print(grads)

Notice the tree structure of the parameters is preserved in the gradients.

In some cases you may want to stop gradients from propagating through a
part of the function. You can use :func:`stop_gradient` for that.


Automatic Vectorization
-----------------------

.. _vmap:

Use :func:`vmap` to automate vectorizing complex functions. Here we'll go
through a basic and contrived example for the sake of clarity, but :func:`vmap`
can be quite powerful for more complex functions which are difficult to optimize
by hand.

.. warning::

   Some operations are not yet supported with :func:`vmap`. If you encounter an error
   like: ``ValueError: Primitive's vmap not implemented.`` file an `issue
   <https://github.com/ml-explore/mlx/issues>`_ and include your function.
   We will prioritize including it.

A naive way to add the elements from two sets of vectors is with a loop:

.. code-block:: python

  xs = mx.random.uniform(shape=(4096, 100))
  ys = mx.random.uniform(shape=(100, 4096))

  def naive_add(xs, ys):
      return [xs[i] + ys[:, i] for i in range(xs.shape[0])]

Instead you can use :func:`vmap` to automatically vectorize the addition:

.. code-block:: python

   # Vectorize over the first dimension of x and the
   # second dimension of y
   vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(0, 1))

The ``in_axes`` parameter can be used to specify which dimensions of the
corresponding input to vectorize over. Similarly, use ``out_axes`` to specify
where the vectorized axes should be in the outputs.

Let's time these two different versions:

.. code-block:: python

  import timeit

  print(timeit.timeit(lambda: mx.eval(naive_add(xs, ys)), number=100))
  print(timeit.timeit(lambda: mx.eval(vmap_add(xs, ys)), number=100))

On an M1 Max the naive version takes in total ``5.639`` seconds whereas the
vectorized version takes only ``0.024`` seconds, more than 200 times faster.

Of course, this operation is quite contrived. A better approach is to simply do
``xs + ys.T``, but for more complex functions :func:`vmap` can be quite handy.


================================================
FILE: docs/src/usage/indexing.rst
================================================
.. _indexing:

Indexing Arrays
===============

.. currentmodule:: mlx.core

For the most part, indexing an MLX :obj:`array` works the same as indexing a
NumPy :obj:`numpy.ndarray`. See the `NumPy documentation
<https://numpy.org/doc/stable/user/basics.indexing.html>`_ for more details on
how that works.

For example, you can use regular integers and slices (:obj:`slice`) to index arrays:

.. code-block:: shell

  >>> arr = mx.arange(10)
  >>> arr[3]
  array(3, dtype=int32)
  >>> arr[-2]  # negative indexing works
  array(8, dtype=int32)
  >>> arr[2:8:2] # start, stop, stride
  array([2, 4, 6], dtype=int32)

For multi-dimensional arrays, the ``...`` or :obj:`Ellipsis` syntax works as in NumPy:

.. code-block:: shell

  >>> arr = mx.arange(8).reshape(2, 2, 2)
  >>> arr[:, :, 0]
  array([[0, 2],
         [4, 6]], dtype=int32)
  >>> arr[..., 0]
  array([[0, 2],
         [4, 6]], dtype=int32)

You can index with ``None`` to create a new axis:

.. code-block:: shell

  >>> arr = mx.arange(8)
  >>> arr.shape
  (8,)
  >>> arr[None].shape
  (1, 8)


You can also use an :obj:`array` to index another :obj:`array`:

.. code-block:: shell

  >>> arr = mx.arange(10)
  >>> idx = mx.array([5, 7])
  >>> arr[idx]
  array([5, 7], dtype=int32)

Mixing and matching integers, :obj:`slice`, ``...``, and :obj:`array` indices
works just as in NumPy.

Other functions which may be useful for indexing arrays are :func:`take` and
:func:`take_along_axis`.

Differences from NumPy
----------------------

.. Note::

  MLX indexing is different from NumPy indexing in two important ways:

  * Indexing does not perform bounds checking. Indexing out of bounds is
    undefined behavior.
  * Boolean mask based indexing is supported for assignment only (see
    :ref:`boolean-mask-assignment`).

The reason for the lack of bounds checking is that exceptions cannot propagate
from the GPU. Performing bounds checking for array indices before launching the
kernel would be extremely inefficient.

Indexing with boolean masks is something that MLX may support in the future. In
general, MLX has limited support for operations for which output
*shapes* are dependent on input *data*. Other examples of these types of
operations which MLX does not yet support include :func:`numpy.nonzero` and the
single input version of :func:`numpy.where`.

In Place Updates
----------------

In place updates to indexed arrays are possible in MLX. For example:

.. code-block:: shell

  >>> a = mx.array([1, 2, 3])
  >>> a[2] = 0
  >>> a
  array([1, 2, 0], dtype=int32)

Just as in NumPy, in place updates will be reflected in all references to the
same array:

.. code-block:: shell

  >>> a = mx.array([1, 2, 3])
  >>> b = a
  >>> b[2] = 0
  >>> b
  array([1, 2, 0], dtype=int32)
  >>> a
  array([1, 2, 0], dtype=int32)

Note that unlike NumPy, slicing an array creates a copy, not a view. So
mutating it does not mutate the original array:

.. code-block:: shell

  >>> a = mx.array([1, 2, 3])
  >>> b = a[:]
  >>> b[2] = 0
  >>> b
  array([1, 2, 0], dtype=int32)
  >>> a
  array([1, 2, 3], dtype=int32)

Also unlike NumPy, updates to the same location are nondeterministic:

.. code-block:: shell

  >>> a = mx.array([1, 2, 3])
  >>> a[[0, 0]] = mx.array([4, 5])

The first element of ``a`` could be ``4`` or ``5``.

Transformations of functions which use in-place updates are allowed and work as
expected. For example:

.. code-block:: python

   def fun(x, idx):
       x[idx] = 2.0
       return x.sum()

   dfdx = mx.grad(fun)(mx.array([1.0, 2.0, 3.0]), mx.array([1]))
   print(dfdx)  # Prints: array([1, 0, 1], dtype=float32)

In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx``
and ones elsewhere.

.. _boolean-mask-assignment:

Boolean Mask Assignment
-----------------------

MLX supports boolean indices using NumPy syntax. A mask must already be
a :class:`bool_` MLX :class:`array` or a NumPy ``ndarray`` with ``dtype=bool``.
Other index types are routed through the standard scatter code.

.. code-block:: shell

   >>> a = mx.array([1.0, 2.0, 3.0])
   >>> mask = mx.array([True, False, True])
   >>> updates = mx.array([5.0, 6.0])
   >>> a[mask] = updates
   >>> a
   array([5.0, 2.0, 6.0], dtype=float32)

Scalar assignments broadcast to every ``True`` entry in ``mask``. For non-scalar
assignments, ``updates`` must provide at least as many elements as there are
``True`` entries in ``mask``.

.. code-block:: shell

   >>> a = mx.zeros((2, 3))
   >>> mask = mx.array([[True, False, True],
                        [False, False, True]])
   >>> a[mask] = 1.0
   >>> a
   array([[1.0, 0.0, 1.0],
          [0.0, 0.0, 1.0]], dtype=float32)

Boolean masks follow NumPy semantics:

- The mask shape must match the shape of the axes it indexes exactly. The only
  exception is a scalar boolean mask, which broadcasts to the full array.
- Any axes not covered by the mask are taken in full.

.. code-block:: shell

   >>> a = mx.arange(1000).reshape(10, 10, 10)
   >>> a[mx.random.normal((10, 10)) > 0.0] = 0  # valid: mask covers axes 0 and 1

The mask of shape ``(10, 10)`` applies to the first two axes, so ``a[mask]``
selects the 1-D slices ``a[i, j, :]`` where ``mask[i, j]`` is ``True``.
Shapes such as ``(1, 10, 10)`` or ``(10, 10, 1)`` do not match the indexed
axes and therefore raise errors.


================================================
FILE: docs/src/usage/launching_distributed.rst
================================================
:orphan:

.. _usage_launch_distributed:

Launching Distributed Programs
==============================

.. currentmodule:: mlx.core.distributed

The MLX python package provides two utilities to help you configure
your Macs for distributed computation and also launch distributed programs on
multiple nodes or with many processes in a single node. These utilities are aptly named

- ``mlx.launch``
- ``mlx.distributed_config``

See the :doc:`distributed docs <distributed>` for an introduction and
getting-started guides to the various backends.

``mlx.distributed_config`` 
---------------------------

Unless you are launching distributed jobs locally for development or in
multi-GPU CUDA environments, you have several Macs that you need to configure
for distributed communication with MLX.

``mlx.distributed_config`` aims to automate the process of configuring the
network interfaces (especially for communication over thunderbolt) and also
creating the hostfile to be used with ``mlx.launch``.

We will analyse 3 cases of using ``mlx.distributed_config``:

1. RDMA over thunderbolt using JACCL
2. TCP/IP over thunderbolt using the ring backend
3. TCP/IP over ethernet using the ring backend

JACCL
^^^^^^^

After following :ref:`the steps to enable RDMA <jaccl_section>` you can run the
following command to configure the nodes and create the hostfile.

.. code-block::

   mlx.distributed_config --verbose --backend jaccl \
        --hosts m3-ultra-1,m3-ultra-2,m3-ultra-3,m3-ultra-4 --over thunderbolt \
        --auto-setup --output m3-ultra-jaccl.json

Let's walk through the steps that the script takes to configure the nodes.

1. ssh to all nodes to verify that they are reachable
2. Extract the thunderbolt connectivity. Namely run commands on each node to
   calculate which node is connected to which other node.
3. Verify that we have a valid fully connected mesh
4. Check that RDMA is enabled
5. Extract the ethernet IP from interface en0
6. Disable the thunderbolt bridge and set up peer to peer networks for each
   thunderbolt cable
7. Write the hostfile

Knowing the above steps allows you to manually configure the nodes but also
debug any configuration issue. For instance changing the Ethernet IP to a
different interface directly in the config is possible (as long as it is
reachable from all nodes).

The ``--auto-setup`` argument requires password-less sudo on each node. If it
isn't available then the configuration script will print commands to be run on
each node.

Ring over thunderbolt
^^^^^^^^^^^^^^^^^^^^^

Setting up a ring backend over thunderbolt only requires changing the
``--backend`` from ``jaccl`` to ``ring``.

The steps are very similar with the main difference being that instead of
verifying that the nodes are fully connected, the script attempts to identify a
ring topology (or multiple rings).

Ring over Ethernet
^^^^^^^^^^^^^^^^^^

Configuring the ring backend over ethernet doesn't require setting up network
interfaces and as such it simply extracts the ``en0`` IP from each node and
writes the hostfile.

Debugging cable connections
^^^^^^^^^^^^^^^^^^^^^^^^^^^

``mlx.distributed_config`` can help you debug the connectivity of your nodes
over thunderbolt by exporting a graph of the connections.

Running

.. code-block::

   mlx.distributed_config --verbose \
        --hosts host1,host2,host3,host4 \
        --over thunderbolt --dot

will export a `GraphViz <https://graphviz.org>`_ representation of the
connections between the nodes which makes it very easy to figure out which
cable is not connected correctly.

See :ref:`the JACCL section <jaccl_section>` for an example.


``mlx.launch``
--------------

The minimal usage example of ``mlx.launch`` is simply

.. code:: shell

    mlx.launch --hosts ip1,ip2 my_script.py

or for testing on localhost

.. code:: shell

    mlx.launch -n 2 my_script.py

The ``mlx.launch`` command connects to the provided hosts and launches the input
script on each host. It monitors each of the launched processes and terminates
the rest if one of them fails unexpectedly or if ``mlx.launch`` is terminated.
It also takes care of forwarding the output of each remote process to stdout
and stderr respectively.

Importantly, it also broadcasts stdin to each process which enables interactive
programs to work in distributed mode as well as debugging using the interactive
debugger.

Providing Hosts
^^^^^^^^^^^^^^^^

Hosts can be provided as command line arguments, like above, but the way to
fully define a list of hosts is via a JSON hostfile. The hostfile has
a very simple schema. It is simply a list of objects that define each host via
a hostname to ssh to and a list of IPs to utilize for the communication.

.. code:: json

    [
        {"ssh": "hostname1", "ips": ["123.123.1.1", "123.123.2.1"]},
        {"ssh": "hostname2", "ips": ["123.123.1.2", "123.123.2.2"]}
    ]

You can use ``mlx.distributed_config --over ethernet`` to create a hostfile
with IPs corresponding to the ``en0`` interface.

Setting up Remote Hosts
^^^^^^^^^^^^^^^^^^^^^^^^

In order to be able to launch the script on each host we need to be able to
connect via ssh. Moreover the input script and python binary need to be on each
host and on the same path. A good checklist to debug errors is the following:

* ``ssh hostname`` works without asking for password or host confirmation
* the python binary is available on all hosts at the same path. You can use
  ``mlx.launch --print-python`` to see what that path is.
* the script you want to run is available on all hosts at the same path

If you are launching from a node with a completely different setup than the
nodes that the program will run on, you can specify ``--no-verify-script`` so
that ``mlx.launch`` does not attempt to verify that the executable and script
exist locally before launching the distributed job.

.. _ring_specifics:

Ring Specifics
^^^^^^^^^^^^^^

The :ref:`ring <ring_section>` backend, which is also the default
backend, can be explicitly selected with the argument ``--backend ring``. The
ring backend has some specific requirements and arguments that are different to
other backends:

* The argument ``--hosts`` only accepts IPs and not hostnames. If we need to
  ssh to a hostname that does not correspond to the IP we want to bind to we
  have to provide a hostfile.
* ``--starting-port`` defines the port to bind to on the remote hosts.
  Specifically rank 0 for the first IP will use this port and each subsequent
  IP or rank will add 1 to this port.
* ``--connections-per-ip`` allows us to increase the number of connections
  between neighboring nodes. This corresponds to ``--mca btl_tcp_links 2`` for
  ``mpirun``.

.. _jaccl_specifics:

JACCL Specifics
^^^^^^^^^^^^^^^^

The :ref:`JACCL <jaccl_section>` backend can be selected with the argument
``--backend jaccl``. A hostfile is necessary to launch with this backend
because it needs to contain the RDMA devices connecting each node to each other
node.

NCCL Specifics
^^^^^^^^^^^^^^

The :ref:`NCCL <nccl_section>` backend is the default backend for CUDA
environments. When launching from a Mac to a Linux machine with CUDA, the
backend should be selected using ``--backend nccl``.

The ``--repeat-hosts, -n`` argument should be used to launch multi-node and
multi-gpu jobs. For instance

.. code-block::

   mlx.launch --backend nccl --hosts linux-1,linux-2 -n 8 --no-verify-script -- ./my-job.sh

will attempt to launch 16 processes, 8 on each node that will all run
``my-job.sh``.

.. _mpi_specifics:

MPI Specifics
^^^^^^^^^^^^^

One can use MPI by passing ``--backend mpi`` to ``mlx.launch``. In that case,
``mlx.launch`` is a thin wrapper over ``mpirun``. Moreover,

* The IPs in the hostfile are ignored
* The ssh connectivity requirement is stronger as every node needs to be able
  to connect to every other node
* ``mpirun`` needs to be available on every node at the same path

Finally, one can pass arguments to ``mpirun`` using ``--mpi-arg``. For instance
to choose a specific interface for the byte-transfer-layer of MPI we can call
``mlx.launch`` as follows:

.. code:: shell

    mlx.launch --backend mpi --mpi-arg '--mca btl_tcp_if_include en0' --hostfile hosts.json my_script.py


================================================
FILE: docs/src/usage/lazy_evaluation.rst
================================================
.. _lazy eval:

Lazy Evaluation
===============

.. currentmodule:: mlx.core

Why Lazy Evaluation
-------------------

When you perform operations in MLX, no computation actually happens. Instead a
compute graph is recorded. The actual computation only happens if an
:func:`eval` is performed.

MLX uses lazy evaluation because it has some nice features, some of which we
describe below.

Transforming Compute Graphs
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Lazy evaluation lets us record a compute graph without actually doing any
computations. This is useful for function transformations like :func:`grad` and
:func:`vmap` and graph optimizations.

Currently, MLX does not compile and rerun compute graphs. They are all
generated dynamically. However, lazy evaluation makes it much easier to
integrate compilation for future performance enhancements.

Only Compute What You Use
^^^^^^^^^^^^^^^^^^^^^^^^^

In MLX you do not need to worry as much about computing outputs that are never
used. For example:

.. code-block:: python

  def fun(x):
      a = fun1(x)
      b = expensive_fun(a)
      return a, b

  y, _ = fun(x)

Here, we never actually compute the output of ``expensive_fun``. Use this
pattern with care though, as the graph of ``expensive_fun`` is still built, and
that has some cost associated to it.

Similarly, lazy evaluation can be beneficial for saving memory while keeping
code simple. Say you have a very large model ``Model`` derived from
:obj:`mlx.nn.Module`. You can instantiate this model with ``model = Model()``.
Typically, this will initialize all of the weights as ``float32``, but the
initialization does not actually compute anything until you perform an
:func:`eval`. If you update the model with ``float16`` weights, your maximum
consumed memory will be half that required if eager computation was used
instead.

This pattern is simple to do in MLX thanks to lazy computation:

.. code-block:: python

  model = Model() # no memory used yet
  model.load_weights("weights_fp16.safetensors")

When to Evaluate
----------------

A common question is when to use :func:`eval`. The trade-off is between
letting graphs get too large and not batching enough useful work.

For example:

.. code-block:: python

  for _ in range(100):
       a = a + b
       mx.eval(a)
       b = b * 2
       mx.eval(b)

This is a bad idea because there is some fixed overhead with each graph
evaluation. On the other hand, there is some slight overhead which grows with
the compute graph size, so extremely large graphs (while computationally
correct) can be costly.

Luckily, a wide range of compute graph sizes work pretty well with MLX:
anything from a few tens of operations to many thousands of operations per
evaluation should be okay.

Most numerical computations have an iterative outer loop (e.g. the iteration in
stochastic gradient descent). A natural and usually efficient place to use
:func:`eval` is at each iteration of this outer loop.

Here is a concrete example:

.. code-block:: python

   for batch in dataset:

       # Nothing has been evaluated yet
       loss, grad = value_and_grad_fn(model, batch)

       # Still nothing has been evaluated
       optimizer.update(model, grad)

       # Evaluate the loss and the new parameters which will
       # run the full gradient computation and optimizer update
       mx.eval(loss, model.parameters())


An important behavior to be aware of is when the graph will be implicitly
evaluated. Anytime you ``print`` an array, convert it to an
:obj:`numpy.ndarray`, or otherwise access its memory via :obj:`memoryview`,
the graph will be evaluated. Saving arrays via :func:`save` (or any other MLX
saving functions) will also evaluate the array.


Calling :func:`array.item` on a scalar array will also evaluate it. In the
example above, printing the loss (``print(loss)``) or adding the loss scalar to
a list (``losses.append(loss.item())``) would cause a graph evaluation. If
these lines are before ``mx.eval(loss, model.parameters())`` then this
will be a partial evaluation, computing only the forward pass.

Also, calling :func:`eval` on an array or set of arrays multiple times is
perfectly fine. This is effectively a no-op.

.. warning::

  Using scalar arrays for control-flow will cause an evaluation.

Here is an example:

.. code-block:: python

   def fun(x):
       h, y = first_layer(x)
       if y > 0:  # An evaluation is done here!
           z  = second_layer_a(h)
       else:
           z  = second_layer_b(h)
       return z

Using arrays for control flow should be done with care. The above example works
and can even be used with gradient transformations. However, this can be very
inefficient if evaluations are done too frequently.


================================================
FILE: docs/src/usage/numpy.rst
================================================
.. _numpy:

Conversion to NumPy and Other Frameworks
========================================

MLX arrays support conversion to and from other frameworks via either:

* The `Python Buffer Protocol <https://docs.python.org/3/c-api/buffer.html>`_.
* `DLPack <https://dmlc.github.io/dlpack/latest/>`_.

Let's convert an array to NumPy and back.

.. code-block:: python

  import mlx.core as mx
  import numpy as np

  a = mx.arange(3)
  b = np.array(a) # copy of a
  c = mx.array(b) # copy of b

.. note::

    Since NumPy does not support ``bfloat16`` arrays, you will need to convert
    to ``float16`` or ``float32`` first: ``np.array(a.astype(mx.float32))``.
    Otherwise, you will receive an error like: ``Item size 2 for PEP 3118
    buffer format string does not match the dtype V item size 0.``

By default, NumPy copies data to a new array. This can be prevented by creating
an array view:

.. code-block:: python

  a = mx.arange(3)
  a_view = np.array(a, copy=False)
  print(a_view.flags.owndata) # False
  a_view[0] = 1
  print(a[0].item()) # 1

.. note::

    NumPy arrays with type ``float64`` will be converted by default to MLX
    arrays with type ``float32``.

A NumPy array view is a normal NumPy array, except that it does not own its
memory. This means writing to the view is reflected in the original array.

While this is quite powerful to prevent copying arrays, it should be noted that
external changes to the memory of arrays cannot be reflected in gradients.

Let's demonstrate this in an example:

.. code-block:: python

  def f(x):
      x_view = np.array(x, copy=False)
      x_view[:] *= x_view # modify memory without telling mx
      return x.sum()

  x = mx.array([3.0])
  y, df = mx.value_and_grad(f)(x)
  print("f(x) = x² =", y.item()) # 9.0
  print("f'(x) = 2x !=", df.item()) # 1.0


The function ``f`` indirectly modifies the array ``x`` through a memory view.
However, this modification is not reflected in the gradient, as seen in the
last line outputting ``1.0``, representing the gradient of the sum operation
alone.  The squaring of ``x`` occurs externally to MLX, meaning that no
gradient is incorporated.  It's important to note that a similar issue arises
during array conversion and copying.  For instance, a function defined as
``mx.array(np.array(x)**2).sum()`` would also result in an incorrect gradient,
even though no in-place operations on MLX memory are executed.

PyTorch
-------

.. warning::

   PyTorch Support for :obj:`memoryview` is experimental and can break for
   multi-dimensional arrays. Casting to NumPy first is advised for now.

PyTorch supports the buffer protocol, but it requires an explicit
:obj:`memoryview`.

.. code-block:: python

  import mlx.core as mx
  import torch

  a = mx.arange(3)
  b = torch.tensor(memoryview(a))
  c = mx.array(b)

JAX
---
JAX fully supports the buffer protocol.

.. code-block:: python

  import mlx.core as mx
  import jax.numpy as jnp

  a = mx.arange(3)
  b = jnp.array(a)
  c = mx.array(b)

TensorFlow
----------

TensorFlow supports the buffer protocol, but it requires an explicit
:obj:`memoryview`.

.. code-block:: python

  import mlx.core as mx
  import tensorflow as tf

  a = mx.arange(3)
  b = tf.constant(memoryview(a))
  c = mx.array(b)


================================================
FILE: docs/src/usage/quick_start.rst
================================================
Quick Start Guide
=================


Basics
------

.. currentmodule:: mlx.core

Import ``mlx.core`` and make an :class:`array`:

.. code-block:: python

  >> import mlx.core as mx
  >> a = mx.array([1, 2, 3, 4])
  >> a.shape
  (4,)
  >> a.dtype
  int32
  >> b = mx.array([1.0, 2.0, 3.0, 4.0])
  >> b.dtype
  float32

Operations in MLX are lazy. The outputs of MLX operations are not computed
until they are needed. To force an array to be evaluated use
:func:`eval`.  Arrays will automatically be evaluated in a few cases. For
example, inspecting a scalar with :meth:`array.item`, printing an array,
or converting an array from :class:`array` to :class:`numpy.ndarray` all
automatically evaluate the array.

.. code-block:: python

  >> c = a + b    # c not yet evaluated
  >> mx.eval(c)  # evaluates c
  >> c = a + b
  >> print(c)     # Also evaluates c
  array([2, 4, 6, 8], dtype=float32)
  >> c = a + b
  >> import numpy as np
  >> np.array(c)   # Also evaluates c
  array([2., 4., 6., 8.], dtype=float32)


See the page on :ref:`Lazy Evaluation <lazy eval>` for more details.

Function and Graph Transformations
----------------------------------

MLX has standard function transformations like :func:`grad` and :func:`vmap`.
Transformations can be composed arbitrarily. For example
``grad(vmap(grad(fn)))`` (or any other composition) is allowed.

.. code-block:: python

  >> x = mx.array(0.0)
  >> mx.sin(x)
  array(0, dtype=float32)
  >> mx.grad(mx.sin)(x)
  array(1, dtype=float32)
  >> mx.grad(mx.grad(mx.sin))(x)
  array(-0, dtype=float32)

Other gradient transformations include :func:`vjp` for vector-Jacobian products
and :func:`jvp` for Jacobian-vector products.

Use :func:`value_and_grad` to efficiently compute both a function's output and
gradient with respect to the function's input.


================================================
FILE: docs/src/usage/saving_and_loading.rst
================================================
.. _saving_and_loading:

Saving and Loading Arrays
=========================

.. currentmodule:: mlx.core

MLX supports multiple array serialization formats.

.. list-table:: Serialization Formats
   :widths: 20 8 25 25
   :header-rows: 1

   * - Format
     - Extension
     - Function
     - Notes
   * - NumPy
     - ``.npy``
     - :func:`save`
     - Single arrays only
   * - NumPy archive
     - ``.npz``
     - :func:`savez` and :func:`savez_compressed`
     - Multiple arrays
   * - Safetensors
     - ``.safetensors``
     - :func:`save_safetensors`
     - Multiple arrays
   * - GGUF
     - ``.gguf``
     - :func:`save_gguf`
     - Multiple arrays

The :func:`load` function will load any of the supported serialization
formats. It determines the format from the extensions. The output of
:func:`load` depends on the format.

Here's an example of saving a single array to a file:

.. code-block:: shell

   >>> a = mx.array([1.0])
   >>> mx.save("array", a)

The array ``a`` will be saved in the file ``array.npy`` (notice the extension
is automatically added). Including the extension is optional; if it is missing
it will be added. You can load the array with:

.. code-block:: shell

   >>> mx.load("array.npy")
   array([1], dtype=float32)

Here's an example of saving several arrays to a single file:

.. code-block:: shell

   >>> a = mx.array([1.0])
   >>> b = mx.array([2.0])
   >>> mx.savez("arrays", a, b=b)

For compatibility with :func:`numpy.savez` the MLX :func:`savez` takes arrays
as arguments. If the keywords are missing, then default names will be
provided. This can be loaded with:

.. code-block:: shell

   >>> mx.load("arrays.npz")
   {'b': array([2], dtype=float32), 'arr_0': array([1], dtype=float32)}

In this case :func:`load` returns a dictionary of names to arrays.

The functions :func:`save_safetensors` and :func:`save_gguf` are similar to
:func:`savez`, but they take as input a :obj:`dict` of string names to arrays:

.. code-block:: shell

   >>> a = mx.array([1.0])
   >>> b = mx.array([2.0])
   >>> mx.save_safetensors("arrays", {"a": a, "b": b})


================================================
FILE: docs/src/usage/unified_memory.rst
================================================
.. _unified_memory:

Unified Memory
==============

.. currentmodule:: mlx.core

Apple silicon has a unified memory architecture. The CPU and GPU have direct
access to the same memory pool. MLX is designed to take advantage of that.

Concretely, when you make an array in MLX you don't have to specify its location:


.. code-block:: python

  a = mx.random.normal((100,))
  b = mx.random.normal((100,))

Both ``a`` and ``b`` live in unified memory.

In MLX, rather than moving arrays to devices, you specify the device when you
run the operation. Any device can perform any operation on ``a`` and ``b``
without needing to move them from one memory location to another. For example:

.. code-block:: python

  mx.add(a, b, stream=mx.cpu)
  mx.add(a, b, stream=mx.gpu)

In the above, both the CPU and the GPU will perform the same add
operation. The operations can (and likely will) be run in parallel since
there are no dependencies between them. See :ref:`using_streams` for more
information on the semantics of streams in MLX.

In the above ``add`` example, there are no dependencies between operations, so
there is no possibility for race conditions. If there are dependencies, the
MLX scheduler will automatically manage them. For example:

.. code-block:: python

  c = mx.add(a, b, stream=mx.cpu)
  d = mx.add(a, c, stream=mx.gpu)

In the above case, the second ``add`` runs on the GPU but it depends on the
output of the first ``add`` which is running on the CPU. MLX will
automatically insert a dependency between the two streams so that the second
``add`` only starts executing after the first is complete and ``c`` is
available.

A Simple Example
~~~~~~~~~~~~~~~~

Here is a more interesting (albeit slightly contrived) example of how unified
memory can be helpful. Suppose we have the following computation:

.. code-block:: python

  def fun(a, b, d1, d2):
    x = mx.matmul(a, b, stream=d1)
    for _ in range(500):
        b = mx.exp(b, stream=d2)
    return x, b

which we want to run with the following arguments:

.. code-block:: python

  a = mx.random.uniform(shape=(4096, 512))
  b = mx.random.uniform(shape=(512, 4))

The first ``matmul`` operation is a good fit for the GPU since it's more
compute dense. The second sequence of operations is a better fit for the CPU,
since they are very small and would probably be overhead bound on the GPU.

If we time the computation fully on the GPU, we get 2.8 milliseconds. But if we
run the computation with ``d1=mx.gpu`` and ``d2=mx.cpu``, then the time is only
about 1.4 milliseconds, about twice as fast. These times were measured on an M1
Max.


================================================
FILE: docs/src/usage/using_streams.rst
================================================
.. _using_streams:

Using Streams
=============

.. currentmodule:: mlx.core

Specifying the :obj:`Stream`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

All operations (including random number generation) take an optional
keyword argument ``stream``. The ``stream`` kwarg specifies which
:obj:`Stream` the operation should run on. If the stream is unspecified then
the operation is run on the default stream of the default device:
``mx.default_stream(mx.default_device())``.  The ``stream`` kwarg can also
be a :obj:`Device` (e.g. ``stream=my_device``) in which case the operation is
run on the default stream of the provided device
``mx.default_stream(my_device)``.


================================================
FILE: examples/cmake_project/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.27)

project(example LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Locate MLX through the installed Python package: the `mlx` module is asked
# for its CMake directory and the answer is stored in MLX_ROOT.
#
# Comment out the following two commands if only the MLX C++ library is
# installed, and use set(MLX_ROOT "/path/to/mlx") directly if needed.
find_package(
  Python 3.9
  COMPONENTS Interpreter Development.Module
  REQUIRED)
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE MLX_ROOT)

find_package(MLX CONFIG REQUIRED)

# Build the example program and link it against the imported mlx target.
add_executable(example example.cpp)
target_link_libraries(example PRIVATE mlx)


================================================
FILE: examples/cmake_project/README.md
================================================
## Build and Run 

Install MLX with Python:

```bash
pip install "mlx>=0.22"
```

Build the C++ example:

```bash
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
```

Run the C++ example:

```
./build/example
```

which should output:

```
array([2, 4, 6], dtype=int32)
```


================================================
FILE: examples/cmake_project/example.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <iostream>

#include "mlx/mlx.h"

namespace mx = mlx::core;

// Adds two small integer arrays element-wise and prints the result.
int main() {
  auto lhs = mx::array({1, 2, 3});
  auto rhs = mx::array({1, 2, 3});

  // Printing the sum is what forces it to be computed
  auto total = lhs + rhs;
  std::cout << total << std::endl;
  return 0;
}


================================================
FILE: examples/cpp/CMakeLists.txt
================================================
# Build one example executable from a single source file. The target is named
# after the source file (directory and extension stripped) and links to mlx.
function(build_example SRCFILE)
  get_filename_component(example_name ${SRCFILE} NAME_WE)
  add_executable(${example_name} ${SRCFILE})
  target_link_libraries(${example_name} PRIVATE mlx)
endfunction()

# Register every C++ example shipped in this directory.
foreach(example_src tutorial.cpp linear_regression.cpp logistic_regression.cpp
                    metal_capture.cpp distributed.cpp)
  build_example(${example_src})
endforeach()


================================================
FILE: examples/cpp/distributed.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <iostream>

#include "mlx/mlx.h"

namespace mx = mlx::core;

int main() {
  if (!mx::distributed::is_available()) {
    std::cout << "No communication backend found" << std::endl;
    return 1;
  }

  auto global_group = mx::distributed::init();
  std::cout << global_group.rank() << " / " << global_group.size() << std::endl;

  mx::array x = mx::ones({10});
  mx::array out = mx::distributed::all_sum(x, global_group);

  std::cout << out << std::endl;
}


================================================
FILE: examples/cpp/linear_regression.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <chrono>
#include <cmath>
#include <iostream>

#include "mlx/mlx.h"
#include "timer.h"

/**
 * An example of linear regression with MLX.
 */
namespace mx = mlx::core;

int main() {
  int num_features = 100;
  int num_examples = 1'000;
  int num_iters = 10'000;
  float learning_rate = 0.01;

  // True parameters
  auto w_star = mx::random::normal({num_features});

  // The input examples (design matrix)
  auto X = mx::random::normal({num_examples, num_features});

  // Noisy labels
  auto eps = 1e-2 * mx::random::normal({num_examples});
  auto y = mx::matmul(X, w_star) + eps;

  // Initialize random parameters
  mx::array w = 1e-2 * mx::random::normal({num_features});

  auto loss_fn = [&](mx::array w) {
    auto yhat = mx::matmul(X, w);
    return (0.5f / num_examples) * mx::sum(mx::square(yhat - y));
  };

  auto grad_fn = mx::grad(loss_fn);

  auto tic = timer::time();
  for (int it = 0; it < num_iters; ++it) {
    auto grads = grad_fn(w);
    w = w - learning_rate * grads;
    mx::eval(w);
  }
  auto toc = timer::time();

  auto loss = loss_fn(w);
  auto error_norm = std::sqrt(mx::sum(mx::square(w - w_star)).item<float>());
  auto throughput = num_iters / timer::seconds(toc - tic);
  std::cout << "Loss " << loss << ", |w - w*| = " << error_norm
            << ", Throughput " << throughput << " (it/s)." << std::endl;
}


================================================
FILE: examples/cpp/logistic_regression.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <chrono>
#include <cmath>
#include <iostream>

#include "mlx/mlx.h"
#include "timer.h"

/**
 * An example of logistic regression with MLX.
 */
namespace mx = mlx::core;

// Fits a logistic regression model to synthetic, linearly separable data with
// plain gradient descent and reports the final loss, the training accuracy and
// the optimization throughput.
int main() {
  int num_features = 100;
  int num_examples = 1'000;
  int num_iters = 10'000;
  float learning_rate = 0.1;

  // True parameters
  auto w_star = mx::random::normal({num_features});

  // The input examples
  auto X = mx::random::normal({num_examples, num_features});

  // Labels: the sign of the true model's output, as a boolean array
  auto y = mx::matmul(X, w_star) > 0;

  // Initialize random parameters
  mx::array w = 1e-2 * mx::random::normal({num_features});

  // Mean binary cross entropy written in terms of the logits:
  // log(1 + exp(logit)) - y * logit, averaged over the examples
  auto loss_fn = [&](mx::array w) {
    auto logits = mx::matmul(X, w);
    auto scale = (1.0f / num_examples);
    return scale * mx::sum(mx::logaddexp(mx::array(0.0f), logits) - y * logits);
  };

  auto grad_fn = mx::grad(loss_fn);

  // Gradient descent; the parameters are evaluated after every update
  auto tic = timer::time();
  for (int it = 0; it < num_iters; ++it) {
    auto grads = grad_fn(w);
    w = w - learning_rate * grads;
    mx::eval(w);
  }
  auto toc = timer::time();

  auto loss = loss_fn(w);
  // Fraction of examples whose predicted label matches the true label
  auto acc = mx::sum((mx::matmul(X, w) > 0) == y) / num_examples;
  auto throughput = num_iters / timer::seconds(toc - tic);
  // Each field is printed as "<name> <value>", separated by commas
  std::cout << "Loss " << loss << ", Accuracy " << acc << ", Throughput "
            << throughput << " (it/s)." << std::endl;
}


================================================
FILE: examples/cpp/metal_capture.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <cassert>
#include <iostream>

#include "mlx/mlx.h"

namespace mx = mlx::core;

// Records a GPU trace of a few operations spread over two extra GPU streams.
int main() {
  // Metal debugging and profiling has two prerequisites:
  //   1. A build with the MLX_METAL_DEBUG CMake option
  //      (i.e. -DMLX_METAL_DEBUG=ON).
  //   2. Running with MTL_CAPTURE_ENABLED=1 in the environment.
  mx::metal::start_capture("mlx_trace.gputrace");

  // The variable names start at two: the default GPU and CPU streams already
  // have indices zero and one, respectively, and this naming matches the
  // label assigned to each stream's command queue.
  auto s2 = mx::new_stream(mx::Device::gpu);
  auto s3 = mx::new_stream(mx::Device::gpu);

  // Independent work on each of the two new streams
  auto a = mx::arange(1.f, 10.f, 1.f, mx::float32, s2);
  auto b = mx::arange(1.f, 10.f, 1.f, mx::float32, s3);
  auto doubled_a = mx::add(a, a, s2);
  auto doubled_b = mx::add(b, b, s3);

  // No stream is given here, so the multiply runs on the default stream.
  std::cout << mx::multiply(doubled_a, doubled_b) << std::endl;

  mx::metal::stop_capture();
}


================================================
FILE: examples/cpp/timer.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <chrono>

namespace timer {

using namespace std::chrono;

/**
 * Convert a std::chrono duration of any representation and period to a
 * floating point number of seconds.
 *
 * The duration is first cast to whole nanoseconds, so anything finer than a
 * nanosecond is truncated.
 */
template <typename R, typename P>
inline double seconds(duration<R, P> x) {
  return duration_cast<nanoseconds>(x).count() / 1e9;
}

/**
 * Return the current time point for measuring elapsed intervals.
 *
 * steady_clock is used because it is guaranteed to be monotonic;
 * high_resolution_clock may be an alias of system_clock, which can jump when
 * the wall clock is adjusted and then yields wrong (even negative) intervals.
 */
inline auto time() {
  return steady_clock::now();
}

} // namespace timer


================================================
FILE: examples/cpp/tutorial.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cassert>
#include <iostream>

#include "mlx/mlx.h"

namespace mx = mlx::core;

// A guided tour of the basic array API: constructing scalar and
// multidimensional arrays, inspecting size/ndim/shape/dtype, element-wise
// addition, and how lazy evaluation interacts with eval(), item() and
// printing. The asserts document the expected values at each step.
void array_basics() {
  // Make a scalar array:
  mx::array x(1.0);

  // Get the value out of it:
  auto s = x.item<float>();
  assert(s == 1.0);

  // Scalars have a size of 1:
  size_t size = x.size();
  assert(size == 1);

  // Scalars have 0 dimensions:
  int ndim = x.ndim();
  assert(ndim == 0);

  // The shape should be an empty vector:
  auto shape = x.shape();
  assert(shape.empty());

  // The datatype should be float32:
  auto dtype = x.dtype();
  assert(dtype == mx::float32);

  // Specify the dtype when constructing the array:
  x = mx::array(1, mx::int32);
  assert(x.dtype() == mx::int32);
  x.item<int>(); // OK
  // x.item<float>();  // Undefined!

  // Make a multidimensional array (flat data followed by the shape):
  x = mx::array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
  // mlx is row-major by default so the first row of this array
  // is [1.0, 2.0] and the second row is [3.0, 4.0]

  // Make an array of shape {2, 2} filled with ones:
  auto y = mx::ones({2, 2});

  // Pointwise add x and y:
  auto z = mx::add(x, y);

  // Same thing:
  z = x + y;

  // mlx is lazy by default. At this point `z` only
  // has a shape and a type but no actual data:
  assert(z.dtype() == mx::float32);
  assert(z.shape(0) == 2);
  assert(z.shape(1) == 2);

  // To actually run the computation you must evaluate `z`.
  // Under the hood, mlx records operations in a graph.
  // The variable `z` is a node in the graph which points to its operation
  // and inputs. When `eval` is called on an array (or arrays), the array and
  // all of its dependencies are recursively evaluated to produce the result.
  // Once an array is evaluated, it has data and is detached from its inputs.
  mx::eval(z);

  // Of course the array can still be an input to other operations. You can
  // even call eval on the array again, this will just be a no-op:
  mx::eval(z); // no-op

  // Some functions or methods on arrays implicitly evaluate them. For example
  // accessing a value in an array or printing the array implicitly evaluate it:
  z = mx::ones({1});
  z.item<float>(); // implicit evaluation

  z = mx::ones({2, 2});
  std::cout << z << std::endl; // implicit evaluation
}

// Demonstrates first- and second-order derivatives of f(x) = x^2 using
// mx::grad. The results are built but intentionally left unused.
void automatic_differentiation() {
  // The function to differentiate: f(x) = x^2
  auto square_fn = [](mx::array x) { return mx::square(x); };

  // grad turns a function into the function computing its derivative
  auto first_order_fn = mx::grad(square_fn);

  // The point at which the derivatives are taken
  auto point = mx::array(1.5);

  // First derivative at `point`: 2 * x
  auto first_derivative = first_order_fn(point);

  // Composing grad with grad gives the second derivative, which is 2
  auto second_order_fn = mx::grad(mx::grad(square_fn));
  auto second_derivative = second_order_fn(point);
}

// Runs the two tutorial sections in order: array basics first, then
// automatic differentiation.
int main() {
  array_basics();
  automatic_differentiation();
}


================================================
FILE: examples/export/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.27)

project(import_mlx LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Locate MLX through the installed Python package: the `mlx` module is asked
# for its CMake directory and the answer is stored in MLX_ROOT, which
# find_package(MLX) then uses.
find_package(
  Python 3.9
  COMPONENTS Interpreter Development.Module
  REQUIRED)
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE MLX_ROOT)
find_package(MLX CONFIG REQUIRED)

# C++ program that imports and runs the exported evaluation function.
add_executable(eval_mlp eval_mlp.cpp)
target_link_libraries(eval_mlp PRIVATE mlx)

# C++ program that imports and runs the exported init and training functions.
add_executable(train_mlp train_mlp.cpp)
target_link_libraries(train_mlp PRIVATE mlx)


================================================
FILE: examples/export/README.md
================================================
## Setup

Install MLX:

```bash
pip install "mlx>=0.22"
```

Build the C++ examples:

```bash
cmake -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build
```

## Run

### Eval MLP

Run the Python script to export the eval function:

```bash
python eval_mlp.py
```

Then run the C++ program to import and run the function:

```
./build/eval_mlp
```

The Python and C++ programs should output the same result.

### Train MLP

Run the Python script to export the model initialization and training
functions:

```bash
python train_mlp.py
```

Then run the C++ program to import and run the functions:

```
./build/train_mlp
```

The Python and C++ programs should output the same results.


================================================
FILE: examples/export/eval_mlp.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <mlx/mlx.h>
#include <iostream>

namespace mx = mlx::core;

// Imports the function stored in "eval_mlp.mlxfn" and runs it on a seeded
// random input, printing the first output.
int main() {
  int batch_size = 8;
  int input_dim = 32;

  // Seeded random input of shape {batch_size, input_dim}
  mx::random::seed(42);
  auto example_x = mx::random::uniform({batch_size, input_dim});

  // Load the exported function from disk
  auto forward = mx::import_function("eval_mlp.mlxfn");

  // The imported function returns a list of arrays; only the first is used
  auto outputs = forward({example_x});
  auto out = outputs[0];

  std::cout << out << std::endl;

  return 0;
}


================================================
FILE: examples/export/eval_mlp.py
================================================
# Copyright © 2024 Apple Inc.

import mlx.core as mx
import mlx.nn as nn
import mlx.utils


class MLP(nn.Module):
    """A stack of linear layers with a ReLU after every layer but the last."""

    def __init__(
        self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
    ):
        super().__init__()
        # Widths at every layer boundary: input, `num_layers` hidden, output.
        dims = [input_dim, *([hidden_dim] * num_layers), output_dim]
        # One Linear per pair of consecutive widths, created in order.
        self.layers = [
            nn.Linear(dims[i], dims[i + 1]) for i in range(len(dims) - 1)
        ]

    def __call__(self, x):
        *hidden_layers, output_layer = self.layers
        for layer in hidden_layers:
            x = nn.relu(layer(x))
        # The final layer is applied without an activation.
        return output_layer(x)


# Script entry point: build a seeded MLP, export its forward pass to
# "eval_mlp.mlxfn", then re-import it and check that the imported function
# reproduces the original output on the same input.
if __name__ == "__main__":

    batch_size = 8
    input_dim = 32
    output_dim = 10

    # Build the model
    mx.random.seed(0)  # Seed for params
    model = MLP(num_layers=5, input_dim=input_dim, hidden_dim=64, output_dim=output_dim)
    mx.eval(model)

    # Note, the model parameters are saved in the export function
    def forward(x):
        return model(x)

    mx.random.seed(42)  # Seed for input
    example_x = mx.random.uniform(shape=(batch_size, input_dim))

    # Export the forward pass, using example_x as the example input
    mx.export_function("eval_mlp.mlxfn", forward, example_x)

    # Import in Python
    imported_forward = mx.import_function("eval_mlp.mlxfn")
    expected = forward(example_x)
    # The imported function's result is unpacked as a one-element sequence
    (out,) = imported_forward(example_x)
    assert mx.allclose(expected, out)
    print(out)


================================================
FILE: examples/export/train_mlp.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <mlx/mlx.h>
#include <iostream>

namespace mx = mlx::core;

// Imports the exported init and training-step functions and runs 100 training
// steps on a fixed random batch, printing the loss every 10 steps.
int main() {
  int batch_size = 8;
  int input_dim = 32;
  int output_dim = 10;

  // Initial training state: the arrays returned by the exported init function
  auto state = mx::import_function("init_mlp.mlxfn")({});

  // Fixed, seeded batch of inputs and integer targets
  mx::random::seed(42);
  auto example_X = mx::random::normal({batch_size, input_dim});
  auto example_y = mx::random::randint(0, output_dim, {batch_size});

  // Load the exported training step
  auto step = mx::import_function("train_mlp.mlxfn");

  for (int it = 0; it < 100; ++it) {
    // The step takes the state followed by the batch (X, y) ...
    state.push_back(example_X);
    state.push_back(example_y);
    state = step(state);
    mx::eval(state);

    // ... and returns the new state followed by the loss, which is split off
    // again so `state` is ready for the next call
    auto loss = state.back();
    state.pop_back();

    if (it % 10 != 0) {
      continue;
    }
    std::cout << "Loss " << loss.item<float>() << std::endl;
  }
  return 0;
}


================================================
FILE: examples/export/train_mlp.py
================================================
# Copyright © 2024 Apple Inc.

import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
import mlx.utils


class MLP(nn.Module):
    """A stack of linear layers with a ReLU after every layer but the last."""

    def __init__(
        self, num_layers: int, input_dim: int, hidden_dim: int, output_dim: int
    ):
        super().__init__()
        # Widths at every layer boundary: input, `num_layers` hidden, output.
        dims = [input_dim, *([hidden_dim] * num_layers), output_dim]
        # One Linear per pair of consecutive widths, created in order.
        self.layers = [
            nn.Linear(dims[i], dims[i + 1]) for i in range(len(dims) - 1)
        ]

    def __call__(self, x):
        *hidden_layers, output_layer = self.layers
        for layer in hidden_layers:
            x = nn.relu(layer(x))
        # The final layer is applied without an activation.
        return output_layer(x)


# Script entry point: export a parameter/optimizer initialization function and
# a single SGD training step, then import the step back and run it for 100
# iterations on a fixed random batch.
if __name__ == "__main__":

    batch_size = 8
    input_dim = 32
    output_dim = 10

    def init():
        # Seed for the parameter initialization
        mx.random.seed(0)
        model = MLP(
            num_layers=3, input_dim=input_dim, hidden_dim=64, output_dim=output_dim
        )
        optimizer = optim.SGD(learning_rate=1e-1)
        optimizer.init(model.parameters())
        # Flatten parameters and optimizer state into parallel sequences of
        # tree keys (tree_structure) and arrays (state)
        state = [model.parameters(), optimizer.state]
        tree_structure, state = zip(*mlx.utils.tree_flatten(state))
        return model, optimizer, tree_structure, state

    # Export the model parameter initialization
    model, optimizer, tree_structure, state = init()
    mx.eval(state)
    # The exported function returns only the flat state (last item of init())
    mx.export_function("init_mlp.mlxfn", lambda: init()[-1])

    def loss_fn(params, X, y):
        model.update(params)
        return nn.losses.cross_entropy(model(X), y, reduction="mean")

    def step(*inputs):
        # Inputs are the flat state followed by the batch X and y
        *state, X, y = inputs
        # Rebuild the parameter / optimizer-state trees from the flat state
        params, opt_state = mlx.utils.tree_unflatten(list(zip(tree_structure, state)))
        optimizer.state = opt_state
        loss, grads = mx.value_and_grad(loss_fn)(params, X, y)
        params = optimizer.apply_gradients(grads, params)
        # Flatten the updated trees again and append the loss as last output
        _, state = zip(*mlx.utils.tree_flatten([params, optimizer.state]))
        return *state, loss

    # Make some random data
    mx.random.seed(42)
    example_X = mx.random.normal(shape=(batch_size, input_dim))
    example_y = mx.random.randint(low=0, high=output_dim, shape=(batch_size,))

    # Export one step of SGD
    mx.export_function("train_mlp.mlxfn", step, *state, example_X, example_y)

    # Import the exported step and use it to train on the fixed batch
    imported_step = mx.import_function("train_mlp.mlxfn")

    for it in range(100):
        *state, loss = imported_step(*state, example_X, example_y)
        if it % 10 == 0:
            print(f"Loss {loss.item():.6}")


================================================
FILE: examples/extensions/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.27)

project(_ext LANGUAGES CXX)

# ----------------------------- Setup -----------------------------
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)

# ----------------------------- Dependencies -----------------------------
# Both nanobind and MLX are located through their installed Python packages:
# each module is asked for its CMake directory, which is stored in the
# corresponding <Package>_ROOT variable for find_package.
find_package(
  Python 3.8
  COMPONENTS Interpreter Development.Module
  REQUIRED)
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE nanobind_ROOT)
find_package(nanobind CONFIG REQUIRED)

execute_process(
  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE MLX_ROOT)
find_package(MLX CONFIG REQUIRED)

# ----------------------------- Extensions -----------------------------

# Add library
add_library(mlx_ext)

# Add sources
target_sources(mlx_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.cpp)

# Add include headers
target_include_directories(mlx_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR})

# Link to mlx
target_link_libraries(mlx_ext PUBLIC mlx)

# ----------------------------- Metal -----------------------------

# Build metallib (only when MLX itself was built with Metal support)
if(MLX_BUILD_METAL)
  mlx_build_metallib(
    TARGET
    mlx_ext_metallib
    TITLE
    mlx_ext
    SOURCES
    ${CMAKE_CURRENT_LIST_DIR}/axpby/axpby.metal
    INCLUDE_DIRS
    ${PROJECT_SOURCE_DIR}
    ${MLX_INCLUDE_DIRS}
    OUTPUT_DIRECTORY
    ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})

  # Make sure the metallib is built whenever the extension library is built
  add_dependencies(mlx_ext mlx_ext_metallib)

endif()

# ----------------------------- Python Bindings -----------------------------
nanobind_add_module(
  _ext
  NB_STATIC
  STABLE_ABI
  LTO
  NOMINSIZE
  NB_DOMAIN
  mlx
  ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp)
target_link_libraries(_ext PRIVATE mlx_ext)

# When mlx_ext is a shared library, add the module's own directory
# (@loader_path) to its runtime search path so the library is found next to it.
if(BUILD_SHARED_LIBS)
  target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
endif()


================================================
FILE: examples/extensions/README.md
================================================

## Build

```
pip install -e .
```

For faster builds during development, you can also pre-install the requirements:

```
pip install -r requirements.txt
```

And then run:

```
python setup.py build_ext -j8 --inplace
```

## Test

```
python test.py
```


================================================
FILE: examples/extensions/axpby/axpby.cpp
================================================
// Copyright © 2023-2025 Apple Inc.

#include <dlfcn.h>
#include <algorithm>
#include <filesystem>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/utils.h"

#include "axpby/axpby.h"

#ifdef _METAL_
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#endif

namespace my_ext {

// Returns the directory containing the binary this function was loaded from.
// The Metal library ("mlx_ext.metallib") is expected to sit in that directory.
// The lookup runs once; later calls return the cached result. Throws
// std::runtime_error if the dynamic loader cannot resolve the address.
std::string current_binary_dir() {
  static std::string cached_dir = [] {
    // Ask the dynamic loader which object file contains this very function
    Dl_info dl_info;
    const bool resolved =
        dladdr(reinterpret_cast<void*>(&current_binary_dir), &dl_info) != 0;
    if (!resolved) {
      throw std::runtime_error("Unable to get current binary dir.");
    }
    const std::filesystem::path binary_path(dl_info.dli_fname);
    return binary_path.parent_path().string();
  }();
  return cached_dir;
}

///////////////////////////////////////////////////////////////////////////////
// Operation Implementation
///////////////////////////////////////////////////////////////////////////////

/**
 *  Scale and sum two arrays element-wise:
 *  z = alpha * x + beta * y
 *
 *  x and y are cast to a common output dtype and broadcast against each
 *  other (numpy style) before the Axpby primitive is attached.
 *  Non-floating point inputs are upcast to floats.
 *
 *  NOTE(review): the dtype check below compares against the concrete dtype
 *  float32; whether float16 / bfloat16 inputs keep their dtype depends on how
 *  issubdtype treats two concrete dtypes -- confirm against the mlx dtype API.
 **/
mx::array axpby(
    const mx::array& x, // Input mx::array x
    const mx::array& y, // Input mx::array y
    const float alpha, // Scaling factor for x
    const float beta, // Scaling factor for y
    mx::StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
) {
  // Common dtype of the two inputs
  const auto common_dtype = promote_types(x.dtype(), y.dtype());

  // Keep the common dtype if it passes the float32 check, otherwise promote
  // it together with float32
  const auto result_dtype = mx::issubdtype(common_dtype, mx::float32)
      ? common_dtype
      : promote_types(common_dtype, mx::float32);

  // Cast both inputs to the output dtype; all helper ops run on stream s
  auto x_cast = mx::astype(x, result_dtype, s);
  auto y_cast = mx::astype(y, result_dtype, s);

  // Broadcast the two casted inputs to a common shape
  auto operands = broadcast_arrays({x_cast, y_cast}, s);

  // The output is a new array whose value is produced by the Axpby primitive
  // from the broadcasted operands; it takes the operands' (common) shape
  auto primitive = std::make_shared<Axpby>(to_stream(s), alpha, beta);
  return mx::array(
      /* const mx::Shape& shape = */ operands[0].shape(),
      /* mx::Dtype dtype = */ result_dtype,
      /* std::shared_ptr<mx::Primitive> primitive = */ primitive,
      /* const std::vector<mx::array>& inputs = */ operands);
}

///////////////////////////////////////////////////////////////////////////////
// Primitive Common Backend Implementation
///////////////////////////////////////////////////////////////////////////////

/**
 * CPU kernel shared by all dtypes: computes
 * out = alpha * x + beta * y element by element for element type T.
 *
 * The output buffer is allocated here (out.nbytes() bytes) and written through
 * a linear index, while x and y are read through elem_to_loc using the output
 * shape and their own strides. The loop itself is handed to the CPU command
 * encoder of `stream` via dispatch().
 */
template <typename T>
void axpby_impl(
    const mx::array& x,
    const mx::array& y,
    mx::array& out,
    float alpha_,
    float beta_,
    mx::Stream stream) {
  // Allocate the output before taking its data pointer below
  out.set_data(mx::allocator::malloc(out.nbytes()));

  // Get the CPU command encoder and register input and output arrays
  auto& encoder = mx::cpu::get_command_encoder(stream);
  encoder.set_input_array(x);
  encoder.set_input_array(y);
  encoder.set_output_array(out);

  // Launch the CPU kernel. Everything the task needs (raw pointers, size,
  // shape and strides) is captured by value in the lambda.
  encoder.dispatch([x_ptr = x.data<T>(),
                    y_ptr = y.data<T>(),
                    out_ptr = out.data<T>(),
                    size = out.size(),
                    shape = out.shape(),
                    x_strides = x.strides(),
                    y_strides = y.strides(),
                    alpha_,
                    beta_]() {
    // Cast alpha and beta to the relevant types
    T alpha = static_cast<T>(alpha_);
    T beta = static_cast<T>(beta_);

    // Do the element-wise operation for each output
    for (size_t out_idx = 0; out_idx < size; out_idx++) {
      // Map linear indices to offsets in x and y
      auto x_offset = mx::elem_to_loc(out_idx, shape, x_strides);
      auto y_offset = mx::elem_to_loc(out_idx, shape, y_strides);

      // We allocate the output to be contiguous and regularly strided
      // (defaults to row major) and hence it doesn't need additional mapping
      out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
    }
  });
}

/**
 * Evaluate primitive on CPU.
 *
 * Selects the axpby_impl instantiation matching the output dtype and throws
 * std::runtime_error for any dtype that has no instantiation.
 */
void Axpby::eval_cpu(
    const std::vector<mx::array>& inputs,
    std::vector<mx::array>& outputs) {
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  auto& result = outputs[0];
  const auto dtype = result.dtype();

  // Each supported dtype maps to one element type of the kernel
  if (dtype == mx::float32) {
    axpby_impl<float>(lhs, rhs, result, alpha_, beta_, stream());
    return;
  }
  if (dtype == mx::float16) {
    axpby_impl<mx::float16_t>(lhs, rhs, result, alpha_, beta_, stream());
    return;
  }
  if (dtype == mx::bfloat16) {
    axpby_impl<mx::bfloat16_t>(lhs, rhs, result, alpha_, beta_, stream());
    return;
  }
  if (dtype == mx::complex64) {
    axpby_impl<mx::complex64_t>(lhs, rhs, result, alpha_, beta_, stream());
    return;
  }

  throw std::runtime_error(
      "Axpby is only supported for floating point types.");
}

///////////////////////////////////////////////////////////////////////////////
// Primitive Metal Backend Implementation
///////////////////////////////////////////////////////////////////////////////

#ifdef _METAL_

/**
 * Evaluate primitive on GPU.
 *
 * Allocates the output, picks the "contiguous" or "general" kernel variant
 * from the "mlx_ext" Metal library located next to this binary, binds the
 * inputs, output and scalars to the buffer indices declared in axpby.metal
 * and launches one thread per output element.
 */
void Axpby::eval_gpu(
    const std::vector<mx::array>& inputs,
    std::vector<mx::array>& outputs) {
  // Prepare inputs
  auto& x = inputs[0];
  auto& y = inputs[1];
  auto& out = outputs[0];

  // Each primitive carries the stream it should execute on
  // and each stream carries its device identifiers
  auto& s = stream();
  // We get the needed metal device using the stream
  auto& d = mx::metal::device(s.device);

  // Prepare to specialize based on contiguity: the contiguous kernel is used
  // only when both inputs share the same (row or column) contiguous layout
  bool contiguous_kernel =
      (x.flags().row_contiguous && y.flags().row_contiguous) ||
      (x.flags().col_contiguous && y.flags().col_contiguous);

  // Allocate output memory with strides based on specialization
  if (contiguous_kernel) {
    // Reuse x's data size, strides and flags for the output
    out.set_data(
        mx::allocator::malloc(x.data_size() * out.itemsize()),
        x.data_size(),
        x.strides(),
        x.flags());
  } else {
    out.set_data(mx::allocator::malloc(out.nbytes()));
  }

  // Resolve name of kernel (corresponds to axpby.metal)
  std::string kname = "axpby_";
  kname += (contiguous_kernel ? "contiguous_" : "general_");
  kname += type_to_name(out);

  // Load the metal library
  auto lib = d.get_library("mlx_ext", current_binary_dir());

  // Make a kernel from this metal library
  auto kernel = d.get_kernel(kname, lib);

  // Prepare to encode kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Kernel parameters are registered with buffer indices corresponding to
  // those in the kernel declaration at axpby.metal
  int ndim = out.ndim();
  size_t nelem = out.size();

  // Encode input arrays to kernel
  compute_encoder.set_input_array(x, 0);
  compute_encoder.set_input_array(y, 1);

  // Encode output arrays to kernel
  compute_encoder.set_output_array(out, 2);

  // Encode alpha and beta
  compute_encoder.set_bytes(alpha_, 3);
  compute_encoder.set_bytes(beta_, 4);

  // Encode shape, strides and ndim if needed (general kernel only)
  if (!contiguous_kernel) {
    compute_encoder.set_vector_bytes(x.shape(), 5);
    compute_encoder.set_vector_bytes(x.strides(), 6);
    compute_encoder.set_vector_bytes(y.strides(), 7);
    compute_encoder.set_bytes(ndim, 8);
  }

  // We launch 1 thread for each output element and make sure that the number
  // of threads in any given threadgroup is not higher than the max allowed
  // NOTE(review): for an empty output (nelem == 0) this yields a threadgroup
  // size of 0 -- confirm that empty arrays never reach this point.
  size_t tgp_size = std::min(nelem, kernel->maxTotalThreadsPerThreadgroup());

  // Fix the 3D size of each threadgroup (in terms of threads)
  MTL::Size group_dims = MTL::Size(tgp_size, 1, 1);

  // Fix the 3D size of the launch grid (in terms of threads)
  MTL::Size grid_dims = MTL::Size(nelem, 1, 1);

  // Launch the grid with the given number of threads divided among
  // the given threadgroups
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

#else // Metal is not available

/** Fail evaluation on GPU */
void Axpby::eval_gpu(
    const std::vector<mx::array>& inputs,
    std::vector<mx::array>& out) {
  throw std::runtime_error("Axpby has no GPU implementation.");
}

#endif

///////////////////////////////////////////////////////////////////////////////
// Primitive Transforms
///////////////////////////////////////////////////////////////////////////////

/**
 * The Jacobian-vector product.
 *
 * For z = alpha * x + beta * y the derivative with respect to x is alpha and
 * with respect to y is beta, so the pushed-forward tangent is a scaled copy
 * (or scaled sum) of the incoming tangents.
 *
 * @param primals  The primitive's inputs (unused; the op is linear).
 * @param tangents One tangent per entry of `argnums`, in the same order.
 * @param argnums  Indices of the inputs being differentiated (0 = x, 1 = y).
 * @return A single-element vector holding the output tangent.
 */
std::vector<mx::array> Axpby::jvp(
    const std::vector<mx::array>& primals,
    const std::vector<mx::array>& tangents,
    const std::vector<int>& argnums) {
  // Forward mode diff that pushes along the tangents
  // The jvp transform on the primitive can be built with ops
  // that are scheduled on the same stream as the primitive

  // If argnums = {0}, we only push along x in which case the
  // jvp is just the tangent scaled by alpha
  // Similarly, if argnums = {1}, the jvp is just the tangent
  // scaled by beta
  //
  // Only one tangent exists in this case, so this branch must be taken
  // exactly when a single argnum is given; otherwise tangents[1] below
  // would be read out of bounds.
  if (argnums.size() == 1) {
    auto scale = argnums[0] == 0 ? alpha_ : beta_;
    auto scale_arr = mx::array(scale, tangents[0].dtype());
    return {mx::multiply(scale_arr, tangents[0], stream())};
  }
  // If, argnums = {0, 1}, we take contributions from both
  // which gives us jvp = tangent_x * alpha + tangent_y * beta
  else {
    return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
  }
}

/**
 * The vector-Jacobian product.
 *
 * Produces one gradient per requested argument: the first cotangent scaled by
 * alpha_ when the argument index is 0, and by beta_ for any other index.
 * The ops are scheduled on this primitive's stream.
 */
std::vector<mx::array> Axpby::vjp(
    const std::vector<mx::array>& primals,
    const std::vector<mx::array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<mx::array>&) {
  // Reverse mode diff
  std::vector<mx::array> grads;
  for (int argnum : argnums) {
    // Pick the coefficient that multiplies this input in the forward pass
    const float factor = (argnum == 0) ? alpha_ : beta_;
    grads.push_back(mx::multiply(
        mx::array(factor, cotangents[0].dtype()), cotangents[0], stream()));
  }
  return grads;
}

/**
 * Vectorize primitive along given axis.
 *
 * Not implemented for this example primitive: both arguments are ignored and
 * the call always throws std::runtime_error.
 */
std::pair<std::vector<mx::array>, std::vector<int>> Axpby::vmap(
    const std::vector<mx::array>& inputs,
    const std::vector<int>& axes) {
  throw std::runtime_error("Axpby has no vmap implementation.");
}

/** Equivalence check **/
bool Axpby::is_equivalent(const Primitive& other) const {
  const Axpby& r_other = static_cast<const Axpby&>(other);
  return alpha_ == r_other.alpha_ && beta_ == r_other.beta_;
}

} // namespace my_ext


================================================
FILE: examples/extensions/axpby/axpby.h
================================================
// Copyright © 2023-2025 Apple Inc.

#pragma once

#include "mlx/ops.h"
#include "mlx/primitives.h"

namespace mx = mlx::core;

namespace my_ext {

///////////////////////////////////////////////////////////////////////////////
// Operation
///////////////////////////////////////////////////////////////////////////////

/**
 *  Scale and sum two vectors element-wise
 *  z = alpha * x + beta * y
 *
 *  Follow numpy style broadcasting between x and y
 *  Inputs are upcasted to floats if needed
 *
 *  @param x     First input array.
 *  @param y     Second input array.
 *  @param alpha Scalar multiplier applied to x.
 *  @param beta  Scalar multiplier applied to y.
 *  @param s     Stream or device on which to schedule the operation; the
 *               default `{}` leaves the choice to MLX.
 *  @return The array alpha * x + beta * y.
 **/
mx::array axpby(
    const mx::array& x, // Input array x
    const mx::array& y, // Input array y
    const float alpha, // Scaling factor for x
    const float beta, // Scaling factor for y
    mx::StreamOrDevice s = {} // Stream on which to schedule the operation
);

///////////////////////////////////////////////////////////////////////////////
// Primitive
///////////////////////////////////////////////////////////////////////////////

/**
 * Primitive backing the axpby operation.
 *
 * Stores the two scaling factors and overrides the evaluation, transform and
 * introspection hooks of mx::Primitive declared below.
 */
class Axpby : public mx::Primitive {
 public:
  /** Bind the primitive to `stream` and record the two scaling factors. */
  explicit Axpby(mx::Stream stream, float alpha, float beta)
      : mx::Primitive(stream), alpha_(alpha), beta_(beta) {};

  /**
   * A primitive must know how to evaluate itself on the CPU/GPU
   * for the given inputs and populate the output array.
   *
   * To avoid unnecessary allocations, the evaluation function
   * is responsible for allocating space for the array.
   */
  void eval_cpu(
      const std::vector<mx::array>& inputs,
      std::vector<mx::array>& outputs) override;
  void eval_gpu(
      const std::vector<mx::array>& inputs,
      std::vector<mx::array>& outputs) override;

  /**
   * The Jacobian-vector product.
   *
   * `tangents` holds one entry per index listed in `argnums`.
   */
  std::vector<mx::array> jvp(
      const std::vector<mx::array>& primals,
      const std::vector<mx::array>& tangents,
      const std::vector<int>& argnums) override;

  /**
   * The vector-Jacobian product.
   *
   * Returns one gradient per index listed in `argnums`.
   */
  std::vector<mx::array> vjp(
      const std::vector<mx::array>& primals,
      const std::vector<mx::array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<mx::array>& outputs) override;

  /**
   * The primitive must know how to vectorize itself across
   * the given axes. The output is a pair containing the array
   * representing the vectorized computation and the axis which
   * corresponds to the output vectorized dimension.
   */
  std::pair<std::vector<mx::array>, std::vector<int>> vmap(
      const std::vector<mx::array>& inputs,
      const std::vector<int>& axes) override;

  /** The name of primitive. */
  const char* name() const override {
    return "Axpby";
  }

  /** Equivalence check **/
  bool is_equivalent(const mx::Primitive& other) const override;

 private:
  float alpha_; // Scaling factor for x
  float beta_; // Scaling factor for y
};

} // namespace my_ext


================================================
FILE: examples/extensions/axpby/axpby.metal
================================================
// Copyright © 2023-2025 Apple Inc.

#include <metal_stdlib>

#include "mlx/backend/metal/kernels/utils.h"

// General (strided) axpby kernel: one thread per output element.
// `index` is the flat position in the output; elem_to_loc converts it into
// an offset into x and into y using the shared shape and each input's own
// strides, so the two inputs may be laid out differently.
template <typename T>
[[kernel]] void axpby_general(
    device const T* x [[buffer(0)]],
    device const T* y [[buffer(1)]],
    device T* out [[buffer(2)]],
    constant const float& alpha [[buffer(3)]],
    constant const float& beta [[buffer(4)]],
    constant const int* shape [[buffer(5)]],
    constant const int64_t* x_strides [[buffer(6)]],
    constant const int64_t* y_strides [[buffer(7)]],
    constant const int& ndim [[buffer(8)]],
    uint index [[thread_position_in_grid]]) {
  const auto x_loc = elem_to_loc(index, shape, x_strides, ndim);
  const auto y_loc = elem_to_loc(index, shape, y_strides, ndim);

  // The float coefficients are converted to T before multiplying
  const T scaled_x = static_cast<T>(alpha) * x[x_loc];
  const T scaled_y = static_cast<T>(beta) * y[y_loc];
  out[index] = scaled_x + scaled_y;
}

// Contiguous axpby kernel: one thread per element, and the same flat
// `index` addresses x, y and out directly (no shape/stride lookup).
template <typename T>
[[kernel]] void axpby_contiguous(
    device const T* x [[buffer(0)]],
    device const T* y [[buffer(1)]],
    device T* out [[buffer(2)]],
    constant const float& alpha [[buffer(3)]],
    constant const float& beta [[buffer(4)]],
    uint index [[thread_position_in_grid]]) {
  // The float coefficients are converted to T before multiplying
  const T scaled_x = static_cast<T>(alpha) * x[index];
  const T scaled_y = static_cast<T>(beta) * y[index];
  out[index] = scaled_x + scaled_y;
}

// clang-format off
// Instantiate both kernel variants for one element type. The host names are
// "axpby_general_<type_name>" and "axpby_contiguous_<type_name>".
// NOTE(review): instantiate_kernel is not defined in this file; presumably it
// comes from the included mlx/backend/metal/kernels/utils.h — confirm.
#define instantiate_axpby(type_name, type)                             \
  instantiate_kernel("axpby_general_" #type_name, axpby_general, type) \
  instantiate_kernel(                                                  \
          "axpby_contiguous_" #type_name, axpby_contiguous, type)

// Element types for which the kernels are built
instantiate_axpby(float32, float);
instantiate_axpby(float16, half);
instantiate_axpby(bfloat16, bfloat16_t);
instantiate_axpby(complex64, complex64_t);
// clang-format on


================================================
FILE: examples/extensions/bindings.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/variant.h>

#include "axpby/axpby.h"

namespace nb = nanobind;
using namespace nb::literals;

// Defines the nanobind extension module `_ext` and exposes my_ext::axpby to
// Python as `axpby`.
NB_MODULE(_ext, m) {
  m.doc() = "Sample extension for MLX";

  // x, y, alpha and beta may be passed positionally or by name; `stream`
  // follows nb::kw_only() and is therefore keyword-only, defaulting to None.
  m.def(
      "axpby",
      &my_ext::axpby,
      "x"_a,
      "y"_a,
      "alpha"_a,
      "beta"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      R"(
        Scale and sum two vectors element-wise
        ``z = alpha * x + beta * y``

        Follows numpy style broadcasting between ``x`` and ``y``
        Inputs are upcasted to floats if needed

        Args:
            x (array): Input array.
            y (array): Input array.
            alpha (float): Scaling factor for ``x``.
            beta (float): Scaling factor for ``y``.

        Returns:
            array: ``alpha * x + beta * y``
      )");
}


================================================
FILE: examples/extensions/mlx_sample_extensions/__init__.py
================================================
# Copyright © 2023 Apple Inc.

import mlx.core as mx

from ._ext import axpby


================================================
FILE: examples/extensions/pyproject.toml
================================================
[build-system]
requires = [
  "setuptools>=42",
  "cmake>=3.25",
  "mlx>=0.18.0",
  "nanobind==2.10.2",
]
build-backend = "setuptools.build_meta"


================================================
FILE: examples/extensions/requirements.txt
================================================
setuptools>=42
cmake>=3.25
mlx>=0.21.0
nanobind==2.10.2


================================================
FILE: examples/extensions/setup.py
================================================
# Copyright © 2023-2024 Apple Inc.

from setuptools import setup

from mlx import extension

# Package the sample extension. The native module is built through the CMake
# helpers imported from `mlx.extension` rather than setuptools' default
# compiler driver.
if __name__ == "__main__":
    setup(
        name="mlx_sample_extensions",
        version="0.0.0",
        description="Sample C++ and Metal extensions for MLX primitives.",
        # One CMake-built extension module: mlx_sample_extensions._ext
        ext_modules=[extension.CMakeExtension("mlx_sample_extensions._ext")],
        cmdclass={"build_ext": extension.CMakeBuild},
        packages=["mlx_sample_extensions"],
        # Ship the built shared libraries and the Metal library with the package
        package_data={"mlx_sample_extensions": ["*.so", "*.dylib", "*.metallib"]},
        zip_safe=False,
        python_requires=">=3.8",
    )


================================================
FILE: examples/extensions/test.py
================================================
import mlx.core as mx
from mlx_sample_extensions import axpby

# Smoke test for the axpby extension: with all-ones inputs,
# 4.0 * a + 2.0 * b must equal 6.0 everywhere.
a = mx.ones((3, 4))
b = mx.ones((3, 4))
# Run the same computation once on the CPU stream and once on the GPU stream
c_cpu = axpby(a, b, 4.0, 2.0, stream=mx.cpu)
c_gpu = axpby(a, b, 4.0, 2.0, stream=mx.gpu)

print(f"c shape: {c_cpu.shape}")
print(f"c dtype: {c_cpu.dtype}")
# Each check reduces the element-wise comparison to a single Python bool
print(f"c_cpu correct: {mx.all(c_cpu == 6.0).item()}")
print(f"c_gpu correct: {mx.all(c_gpu == 6.0).item()}")


================================================
FILE: examples/python/linear_regression.py
================================================
# Copyright © 2023 Apple Inc.

import time

import mlx.core as mx

# Problem size and optimisation hyper-parameters
num_features = 100
num_examples = 1_000
num_iters = 10_000
lr = 0.01  # gradient-descent step size

# True parameters
w_star = mx.random.normal((num_features,))

# Input examples (design matrix)
X = mx.random.normal((num_examples, num_features))

# Noisy labels
eps = 1e-2 * mx.random.normal((num_examples,))
y = X @ w_star + eps

# Initialize random parameters
# (scaled by 1e-2 so the starting point is close to zero)
w = 1e-2 * mx.random.normal((num_features,))


def loss_fn(w):
    """Return half the mean squared residual of ``X @ w`` against ``y``."""
    residual = X @ w - y
    squared = mx.square(residual)
    return 0.5 * mx.mean(squared)


# Function returning the gradient of loss_fn with respect to w
grad_fn = mx.grad(loss_fn)

# Plain gradient descent, timed end to end
tic = time.perf_counter()
for _ in range(num_iters):
    grad = grad_fn(w)
    w = w - lr * grad
    # Evaluate the updated weights inside the timed loop
    mx.eval(w)
toc = time.perf_counter()

# Final loss, Euclidean distance to the true parameters, and iterations/second
loss = loss_fn(w)
error_norm = mx.sum(mx.square(w - w_star)).item() ** 0.5
throughput = num_iters / (toc - tic)

print(
    f"Loss {loss.item():.5f}, L2 distance: |w-w*| = {error_norm:.5f}, "
    f"Throughput {throughput:.5f} (it/s)"
)


================================================
FILE: examples/python/logistic_regression.py
================================================
# Copyright © 2023 Apple Inc.

import time

import mlx.core as mx

# Problem size and optimisation hyper-parameters
num_features = 100
num_examples = 1_000
num_iters = 10_000
lr = 0.1  # gradient-descent step size

# True parameters
w_star = mx.random.normal((num_features,))

# Input examples
X = mx.random.normal((num_examples, num_features))

# Labels
# (boolean: True where the true linear score is positive)
y = (X @ w_star) > 0


# Initialize random parameters
# (scaled by 1e-2 so the starting point is close to zero)
w = 1e-2 * mx.random.normal((num_features,))


def loss_fn(w):
    """Return the mean of ``logaddexp(0, z) - y * z`` for ``z = X @ w``."""
    z = X @ w
    per_example = mx.logaddexp(0.0, z) - y * z
    return mx.mean(per_example)


# Function returning the gradient of loss_fn with respect to w
grad_fn = mx.grad(loss_fn)

# Plain gradient descent, timed end to end
tic = time.perf_counter()
for _ in range(num_iters):
    grad = grad_fn(w)
    w = w - lr * grad
    # Evaluate the updated weights inside the timed loop
    mx.eval(w)

toc = time.perf_counter()

# Final loss and training accuracy (fraction of predictions matching y)
loss = loss_fn(w)
final_preds = (X @ w) > 0
acc = mx.mean(final_preds == y)

throughput = num_iters / (toc - tic)
print(
    f"Loss {loss.item():.5f}, Accuracy {acc.item():.5f} "
    f"Throughput {throughput:.5f} (it/s)"
)


================================================
FILE: examples/python/qqmm.py
================================================
from itertools import product

import mlx.core as mx


# In mxfp8 mode, the results do not match exactly:
# fewer than 1% of output elements differ.
# This does not appear to be a systematic error.
# The error can exceed 1 ULP for very small values,
# and is always below 1 ULP for larger values.
# For nvfp4, the results match exactly.
# Therefore, I suspect that the discrepancy comes from
# the mxfp8 matmul implementation in cuBLASLt.
def ulp_bf16_at(x):
    """Return ``2 ** (floor(log2(|x|)) - 7)`` element-wise.

    Magnitudes below ``2 ** -126`` are replaced by ``2 ** -126`` before the
    logarithm is taken. The tests in this file use the result as the size of
    one bfloat16 ULP at ``x``.
    """
    smallest_normal = mx.array(2.0**-126)
    magnitude = mx.abs(x)
    magnitude = mx.where(magnitude < smallest_normal, smallest_normal, magnitude)
    exponent = mx.floor(mx.log2(magnitude))
    return mx.power(2.0, exponent - 7.0)


def test_qqmm():
    """Check ``mx.qqmm`` against a dequantize-then-matmul reference.

    For every quantization scheme, shape and dtype, the weights are quantized
    and fed to ``mx.qqmm``; the reference multiplies the dequantized
    activations by the transposed dequantized weights.

    Raises:
        AssertionError: if any element's absolute error is neither below
            1e-3 nor within ``ulp_bf16_at`` of the reference value.
    """
    k1, k2 = mx.random.split(mx.random.key(0))
    dtypes = [mx.bfloat16, mx.float32, mx.float16]

    # (group_size, mode, bits) for each quantization scheme
    tests = (
        (16, "nvfp4", 4),
        (32, "mxfp8", 8),
    )
    m_sizes = [64, 65, 33, 128, 256, 1024, 1024 * 8]
    n_sizes = [64, 128, 256, 1024, 1024 * 8]
    k_sizes = [64, 128, 256, 1024, 1024 * 8]

    for group_size, mode, bits in tests:
        quant_args = dict(group_size=group_size, bits=bits, mode=mode)
        for M, N, K in product(m_sizes, n_sizes, k_sizes):
            for dtype in dtypes:
                x = mx.random.normal(shape=(M, K), key=k1, dtype=dtype)
                w = mx.random.normal(shape=(N, K), key=k2, dtype=dtype)

                # Path under test: quantized weights fed to qqmm
                w_q, scales_w = mx.quantize(w, **quant_args)
                y_q = mx.qqmm(x, w_q, scales_w, **quant_args)

                # Reference path: quantize/dequantize both operands, then matmul
                x_q, scales_x = mx.quantize(x, **quant_args)
                w_dq = mx.dequantize(w_q, scales_w, dtype=dtype, **quant_args)
                x_dq = mx.dequantize(x_q, scales_x, dtype=dtype, **quant_args)
                y_hat = mx.matmul(x_dq, mx.transpose(w_dq))

                ulp = ulp_bf16_at(y_hat)
                error = (y_q - y_hat).abs()
                within_tol = mx.logical_or(error < 1e-3, error <= ulp)
                if within_tol.all():
                    continue
                raise AssertionError(
                    f"qqmm test failed for shape {(M, N, K)}, "
                    f"group_size={group_size}, bits={bits}, "
                    f"mode={mode}, dtype={dtype}"
                )


def test_qqmm_vjp():
    """Check the input gradient of ``mx.qqmm`` for each quantization scheme.

    The VJP with respect to ``x`` for an all-ones cotangent is compared with
    ``mx.qqmm`` applied to that cotangent and the quantized transposed
    weights.

    Raises:
        AssertionError: if any element's absolute error is neither below
            1e-3 nor within ``ulp_bf16_at`` of the expected value.
    """
    k1, k2 = mx.random.split(mx.random.key(0))
    M, N, K = 64, 1024, 512

    # (group_size, mode, bits) for each quantization scheme
    tests = (
        (16, "nvfp4", 4),
        (32, "mxfp8", 8),
    )
    x = mx.random.normal(shape=(M, K), key=k1)
    c = mx.ones(shape=(M, N))

    for group_size, mode, bits in tests:
        quant_args = dict(group_size=group_size, bits=bits, mode=mode)
        w = mx.random.normal(shape=(N, K), key=k2)

        def fn(x):
            return mx.qqmm(x, w, **quant_args)

        _, vjp_out = mx.vjp(fn, primals=(x,), cotangents=(c,))

        # Expected gradient: cotangent times the quantized transposed weights
        w_tq, scales_wt = mx.quantize(mx.transpose(w), **quant_args)
        expected_out = mx.qqmm(c, w_tq, scales_wt, **quant_args)

        ulp = ulp_bf16_at(expected_out)
        error = (vjp_out[0] - expected_out).abs()
        within_tol = mx.logical_or(error < 1e-3, error <= ulp)
        if within_tol.all():
            continue
        raise AssertionError(
            f"qqmm vjp test failed for shape {(M, N, K)}, "
            f"group_size={group_size}, bits={bits}, mode={mode}"
        )


# When run as a script, execute both checks; each raises AssertionError on
# the first mismatch it finds.
if __name__ == "__main__":
    test_qqmm()
    test_qqmm_vjp()


================================================
FILE: mlx/3rdparty/.clang-format
================================================
DisableFormat: true
SortIncludes: Never


================================================
FILE: mlx/3rdparty/pocketfft.h
================================================
/*
This file is part of pocketfft.

Copyright (C) 2010-2022 Max-Planck-Society
Copyright (C) 2019-2020 Peter Bell

For the odd-sized DCT-IV transforms:
  Copyright (C) 2003, 2007-14 Matteo Frigo
  Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

Authors: Martin Reinecke, Peter Bell

All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef POCKETFFT_HDRONLY_H
#define POCKETFFT_HDRONLY_H

#ifndef __cplusplus
#error This file is C++ and requires a C++ compiler.
#endif

#if !(__cplusplus >= 201103L || _MSVC_LANG+0L >= 201103L)
#error This file requires at least C++11 support.
#endif

#ifndef POCKETFFT_CACHE_SIZE
#define POCKETFFT_CACHE_SIZE 0
#endif

#include <cmath>
#include <cstdlib>
#include <stdexcept>
#include <memory>
#include <vector>
#include <complex>
#include <algorithm>
#if POCKETFFT_CACHE_SIZE!=0
#include <array>
#include <mutex>
#endif

#ifndef POCKETFFT_NO_MULTITHREADING
#include <mutex>
#include <condition_variable>
#include <thread>
#include <queue>
#include <atomic>
#include <functional>
#include <new>

#ifdef POCKETFFT_PTHREADS
#  include <pthread.h>
#endif
#endif

#if defined(__GNUC__)
#define POCKETFFT_NOINLINE __attribute__((noinline))
#define POCKETFFT_RESTRICT __restrict__
#elif defined(_MSC_VER)
#define POCKETFFT_NOINLINE __declspec(noinline)
#define POCKETFFT_RESTRICT __restrict
#else
#define POCKETFFT_NOINLINE
#define POCKETFFT_RESTRICT
#endif

namespace pocketfft {

namespace detail {
using std::size_t;
using std::ptrdiff_t;

// Always use std:: for <cmath> functions
// These deleted templates are declared in this namespace so that an
// unqualified call such as cos(x) written here selects a deleted function
// and fails to compile.
template <typename T> T cos(T) = delete;
template <typename T> T sin(T) = delete;
template <typename T> T sqrt(T) = delete;

// Extent of each dimension (unsigned) and the matching per-dimension strides
// (signed).
using shape_t = std::vector<size_t>;
using stride_t = std::vector<ptrdiff_t>;

// Named boolean values for the transform direction.
constexpr bool FORWARD  = true,
               BACKWARD = false;

// only enable vector support for gcc>=5.0 and clang>=5.0
#ifndef POCKETFFT_NO_VECTORS
#define POCKETFFT_NO_VECTORS
#if defined(__INTEL_COMPILER)
// do nothing. This is necessary because this compiler also sets __GNUC__.
#elif defined(__clang__)
// AppleClang has their own version numbering
#ifdef __apple_build_version__
#  if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1)
#     undef POCKETFFT_NO_VECTORS
#  endif
#elif __clang_major__ >= 5
#  undef POCKETFFT_NO_VECTORS
#endif
#elif defined(__GNUC__)
#if __GNUC__>=5
#undef POCKETFFT_NO_VECTORS
#endif
#endif
#endif

// VLEN<T>::val defaults to 1 for every type. When vector support is enabled,
// float and double get larger values chosen from the instruction-set macro
// that is defined (AVX-512, AVX, SSE2, VSX or NEON).
template<typename T> struct VLEN { static constexpr size_t val=1; };

#ifndef POCKETFFT_NO_VECTORS
#if (defined(__AVX512F__))
template<> struct VLEN<float> { static constexpr size_t val=16; };
template<> struct VLEN<double> { static constexpr size_t val=8; };
#elif (defined(__AVX__))
template<> struct VLEN<float> { static constexpr size_t val=8; };
template<> struct VLEN<double> { static constexpr size_t val=4; };
#elif (defined(__SSE2__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#elif (defined(__VSX__))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
template<> struct VLEN<float> { static constexpr size_t val=4; };
template<> struct VLEN<double> { static constexpr size_t val=2; };
#else
// None of the instruction sets above is available: disable vector support.
#define POCKETFFT_NO_VECTORS
#endif
#endif

// the __MINGW32__ part in the conditional below works around the problem that
// the standard C++ library on Windows does not provide aligned_alloc() even
// though the MinGW compiler and MSVC may advertise C++17 compliance.
#if (__cplusplus >= 201703L) && (!defined(__MINGW32__)) && (!defined(_MSC_VER))
// Allocate `size` bytes aligned to `align` via ::aligned_alloc.
// Throws std::bad_alloc when the allocation fails.
inline void *aligned_alloc(size_t align, size_t size)
  {
  // aligned_alloc() requires that the requested size is a multiple of "align"
  // (the mask arithmetic below rounds up assuming align is a power of two)
  void *ptr = ::aligned_alloc(align,(size+align-1)&(~(align-1)));
  if (!ptr) throw std::bad_alloc();
  return ptr;
  }
// Release memory obtained from aligned_alloc() above.
inline void aligned_dealloc(void *ptr)
    { free(ptr); }
#else // portable emulation
// Allocate `size` bytes aligned to `align` on top of plain malloc().
// Throws std::bad_alloc when the allocation fails.
inline void *aligned_alloc(size_t align, size_t size)
  {
  align = std::max(align, alignof(max_align_t));
  // Over-allocate by `align` bytes so there is room to move forward to the
  // next aligned address.
  void *ptr = malloc(size+align);
  if (!ptr) throw std::bad_alloc();
  // Round the raw address down to a multiple of `align`, then step one full
  // `align` forward; the result always lies past `ptr`.
  void *res = reinterpret_cast<void *>
    ((reinterpret_cast<uintptr_t>(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align));
  // Store the original malloc() pointer just before the returned block so
  // aligned_dealloc() can recover it.
  (reinterpret_cast<void**>(res))[-1] = ptr;
  return res;
  }
// Release memory obtained from aligned_alloc() above; a null pointer is a
// no-op. The pointer passed to free() is the one stashed at index -1.
inline void aligned_dealloc(void *ptr)
  { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
#endif

// Owning, fixed-size buffer of T with raw (uninitialised) storage.
//
// Storage comes from malloc() when POCKETFFT_NO_VECTORS is defined and from
// the 64-byte aligned_alloc() helper otherwise. Elements are never constructed
// or destroyed by this class; only the memory is managed. The class is
// movable; declaring the move constructor suppresses the implicit copy
// operations.
template<typename T> class arr
  {
  private:
    T *p;       // start of the buffer, or nullptr when empty
    size_t sz;  // number of elements the buffer can hold

#if defined(POCKETFFT_NO_VECTORS)
    // Allocate room for `num` elements; returns nullptr for num==0 and
    // throws std::bad_alloc on failure.
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *res = malloc(num*sizeof(T));
      if (!res) throw std::bad_alloc();
      return reinterpret_cast<T *>(res);
      }
    static void dealloc(T *ptr)
      { free(ptr); }
#else
    // Allocate 64-byte aligned room for `num` elements; returns nullptr for
    // num==0. aligned_alloc() throws std::bad_alloc on failure.
    static T *ralloc(size_t num)
      {
      if (num==0) return nullptr;
      void *ptr = aligned_alloc(64, num*sizeof(T));
      return static_cast<T*>(ptr);
      }
    static void dealloc(T *ptr)
      { aligned_dealloc(ptr); }
#endif

  public:
    // Empty buffer.
    arr() : p(nullptr), sz(0) {}
    // Buffer with room for n elements (contents uninitialised).
    arr(size_t n) : p(ralloc(n)), sz(n) {}
    // Take over other's buffer and leave other empty.
    arr(arr &&other)
      : p(other.p), sz(other.sz)
      { other.p=nullptr; other.sz=0; }
    ~arr() { dealloc(p); }

    // Change the capacity to n elements. The previous contents are discarded
    // (no copy is made); a call with the current size is a no-op.
    // If the allocation throws, the array is left empty.
    void resize(size_t n)
      {
      if (n==sz) return;
      dealloc(p);
      // Mark the array empty before allocating: ralloc() may throw, and
      // without this the destructor would free the stale pointer again.
      p = nullptr;
      sz = 0;
      p = ralloc(n);
      sz = n;
      }

    // Unchecked element access.
    T &operator[](size_t idx) { return p[idx]; }
    const T &operator[](size_t idx) const { return p[idx]; }

    T *data() { return p; }
    const T *data() const { return p; }

    size_t size() const { return sz; }
  };

// Lightweight complex number with real part r and imaginary part i.
// Mixed-type arithmetic is supported through the templated operators; the
// result type of the binary operators follows the usual arithmetic
// conversions of the component types.
template<typename T> struct cmplx {
  T r, i;

  // Performs no explicit initialisation of r and i.
  cmplx() {}
  cmplx(T r_, T i_) : r(r_), i(i_) {}

  // Assign both components.
  void Set(T r_, T i_)
    {
    r = r_;
    i = i_;
    }
  // Assign a purely real value; the imaginary part becomes T(0).
  void Set(T r_)
    {
    r = r_;
    i = T(0);
    }

  cmplx &operator+= (const cmplx &other)
    {
    r += other.r;
    i += other.i;
    return *this;
    }
  // Scale both components by a scalar.
  template<typename T2>cmplx &operator*= (T2 other)
    {
    r *= other;
    i *= other;
    return *this;
    }
  // Complex multiplication in place. Both products are computed from the
  // old r and i before either component is overwritten.
  template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
    {
    const T re = r*other.r - i*other.i;
    const T im = r*other.i + i*other.r;
    r = re;
    i = im;
    return *this;
    }
  template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
    {
    r += other.r;
    i += other.i;
    return *this;
    }
  template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
    {
    r -= other.r;
    i -= other.i;
    return *this;
    }

  // Scalar product.
  template<typename T2> auto operator* (const T2 &other) const
    -> cmplx<decltype(r*other)>
    {
    using Tres = cmplx<decltype(r*other)>;
    return Tres(r*other, i*other);
    }
  template<typename T2> auto operator+ (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    using Tres = cmplx<decltype(r+other.r)>;
    return Tres(r+other.r, i+other.i);
    }
  template<typename T2> auto operator- (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    using Tres = cmplx<decltype(r+other.r)>;
    return Tres(r-other.r, i-other.i);
    }
  // Complex product.
  template<typename T2> auto operator* (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    using Tres = cmplx<decltype(r+other.r)>;
    return Tres(r*other.r-i*other.i, r*other.i + i*other.r);
    }
  // fwd==true: multiply by the conjugate of other; fwd==false: plain product.
  template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
    -> cmplx<decltype(r+other.r)>
    {
    using Tres = cmplx<decltype(r+other.r)>;
    if (fwd)
      return Tres(r*other.r+i*other.i, i*other.r-r*other.i);
    return Tres(r*other.r-i*other.i, r*other.i+i*other.r);
    }
};
// "Plus/minus": store c+d in a and c-d in b.
template<typename T> inline void PM(T &a, T &b, T c, T d)
  {
  a = c+d;
  b = c-d;
  }
// In-place "plus/minus": a becomes a+b and b becomes (old a)-b.
template<typename T> inline void PMINPLACE(T &a, T &b)
  {
  T old_a = a;
  a += b;
  b = old_a-b;
  }
// In-place "minus/plus": a becomes a-b and b becomes (old a)+b.
template<typename T> inline void MPINPLACE(T &a, T &b)
  {
  T old_a = a;
  a -= b;
  b = old_a+b;
  }
// Complex conjugate: same real part, negated imaginary part.
template<typename T> cmplx<T> conj(const cmplx<T> &a)
  {
  return {a.r, -a.i};
  }
// Write into res either v1 times the conjugate of v2 (fwd==true) or the
// plain product v1*v2 (fwd==false).
template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
  {
  if (fwd)
    res = cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i);
  else
    res = cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
  }

// Rotate a by +90 degrees in place, i.e. (r, i) -> (-i, r).
template<typename T> void ROT90(cmplx<T> &a)
  {
  auto old_r = a.r;
  a.r = -a.i;
  a.i = old_r;
  }
// Rotate a by 90 degrees in place; the direction depends on fwd:
//   fwd==true : (r, i) -> ( i, -r)
//   fwd==false: (r, i) -> (-i,  r)
template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
  {
  if (fwd)
    {
    auto new_i = -a.r;
    a.r = a.i;
    a.i = new_i;
    }
  else
    {
    auto new_i = a.r;
    a.r = -a.i;
    a.i = new_i;
    }
  }

//
// twiddle factor section
//
// Table of the complex values (cos(2*pi*k/N), sin(2*pi*k/N)).
//
// Instead of storing every value, the constructor fills two short tables:
// v1 holds the values for the low `shift` bits of the index and v2 the values
// for the remaining high bits. operator[] multiplies one entry from each
// table. The tables are kept in Thigh, which is at least double precision,
// and the product is converted to T on lookup.
template<typename T> class sincos_2pibyn
  {
  private:
    // Working precision: double, or T itself if T is larger than double.
    using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type;
    size_t N, mask, shift;
    arr<cmplx<Thigh>> v1, v2;

    // Return (cos, sin) of x*8*ang. With ang = 2*pi/(8*n), as passed by the
    // constructor, this is the angle 2*pi*x/n. The index is scaled by 8 and
    // then reduced by symmetry so that every std::cos/std::sin call receives
    // an argument with integer factor below n, i.e. below one eighth of a
    // full turn.
    static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
      {
      x<<=3;
      if (x<4*n) // first half
        {
        if (x<2*n) // first quadrant
          {
          if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang));
          return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang));
          }
        else // second quadrant
          {
          x-=2*n;
          if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang));
          return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang));
          }
        }
      else
        {
        // Mirror the index about the full turn; the imaginary parts below are
        // negated accordingly.
        x=8*n-x;
        if (x<2*n) // third quadrant
          {
          if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang));
          return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang));
          }
        else // fourth quadrant
          {
          x-=2*n;
          if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang));
          return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang));
          }
        }
      }

  public:
    // Build the two tables for a transform of length n.
    POCKETFFT_NOINLINE sincos_2pibyn(size_t n)
      : N(n)
      {
      constexpr auto pi = 3.141592653589793238462643383279502884197L;
      Thigh ang = Thigh(0.25L*pi/n);
      // Only indices up to about n/2 are tabulated; operator[] maps the upper
      // half onto the lower half.
      size_t nval = (n+2)/2;
      // Smallest shift >= 1 with (2^shift)^2 >= nval, so both tables have
      // roughly sqrt(nval) entries.
      shift = 1;
      while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
      mask = (size_t(1)<<shift)-1;
      // v1[i] is the value for index i, for i in [0, mask]
      v1.resize(mask+1);
      v1[0].Set(Thigh(1), Thigh(0));
      for (size_t i=1; i<v1.size(); ++i)
        v1[i]=calc(i,n,ang);
      // v2[i] is the value for index i*(mask+1)
      v2.resize((nval+mask)/(mask+1));
      v2[0].Set(Thigh(1), Thigh(0));
      for (size_t i=1; i<v2.size(); ++i)
        v2[i]=calc(i*(mask+1),n,ang);
      }

    // Return the table value for index idx as cmplx<T>.
    // For 2*idx <= N the value is the complex product of the two table
    // entries; otherwise the entries for N-idx are used and the imaginary
    // part is negated.
    cmplx<T> operator[](size_t idx) const
      {
      if (2*idx<=N)
        {
        auto x1=v1[idx&mask], x2=v2[idx>>shift];
        return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
        }
      idx = N-idx;
      auto x1=v1[idx&mask], x2=v2[idx>>shift];
      return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
      }
  };

// Collection of static helper functions: factorisation-based size heuristics,
// argument validation and thread-count selection.
struct util // hack to avoid duplicate symbols
  {
  // Return the largest prime factor of n (1 for n==1).
  // NOTE(review): for n==0 the first loop never terminates (0 stays even);
  // callers presumably never pass 0 — confirm.
  static POCKETFFT_NOINLINE size_t largest_prime_factor (size_t n)
    {
    size_t res=1;
    // strip all factors of two
    while ((n&1)==0)
      { res=2; n>>=1; }
    // trial division by odd numbers; res tracks the latest factor found
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        { res=x; n/=x; }
    // whatever remains above 1 is itself prime and larger than any factor
    // found so far
    if (n>1) res=n;
    return res;
    }

  // Heuristic cost of a transform of length n: the sum of n's prime factors
  // (each counted with multiplicity, factors above 5 weighted by 1.1),
  // multiplied by n.
  // NOTE(review): as above, n==0 would loop forever in the first loop.
  static POCKETFFT_NOINLINE double cost_guess (size_t n)
    {
    constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
    size_t ni=n;
    double result=0.;
    while ((n&1)==0)
      { result+=2; n>>=1; }
    for (size_t x=3; x*x<=n; x+=2)
      while ((n%x)==0)
        {
        result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
        n/=x;
        }
    if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
    return result*double(ni);
    }

  /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
  static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
    {
    if (n<=12) return n;

    // 2*n is used as the initial upper bound for the search
    size_t bestfac=2*n;
    // enumerate products of powers of 11, 7 and 5 below the current best
    for (size_t f11=1; f11<bestfac; f11*=11)
      for (size_t f117=f11; f117<bestfac; f117*=7)
        for (size_t f1175=f117; f1175<bestfac; f1175*=5)
          {
          size_t x=f1175;
          // add factors of two until n is reached ...
          while (x<n) x*=2;
          // ... then trade factors of two for factors of three, recording
          // every candidate that is >= n
          for (;;)
            {
            if (x<n)
              x*=3;
            else if (x>n)
              {
              if (x<bestfac) bestfac=x;
              if (x&1) break; // no factor of two left to remove
              x>>=1;
              }
            else
              return n; // n itself is a valid size
            }
          }
    return bestfac;
    }

  /* returns the smallest composite of 2, 3, 5 which is >= n */
  static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
    {
    if (n<=6) return n;

    // 2*n is used as the initial upper bound for the search
    size_t bestfac=2*n;
    // enumerate powers of 5 below the current best
    for (size_t f5=1; f5<bestfac; f5*=5)
      {
      size_t x = f5;
      // add factors of two until n is reached ...
      while (x<n) x *= 2;
      // ... then trade factors of two for factors of three, recording every
      // candidate that is >= n
      for (;;)
        {
        if (x<n)
          x*=3;
        else if (x>n)
          {
          if (x<bestfac) bestfac=x;
          if (x&1) break; // no factor of two left to remove
          x>>=1;
          }
        else
          return n; // n itself is a valid size
        }
      }
    return bestfac;
    }

  // Product of all extents in shape (1 for an empty shape).
  static size_t prod(const shape_t &shape)
    {
    size_t res=1;
    for (auto sz: shape)
      res*=sz;
    return res;
    }

  // Validate shape/stride consistency. Throws std::runtime_error if the shape
  // is empty, if either stride vector has a different length than the shape,
  // or if an in-place call uses different input and output strides.
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
    const stride_t &stride_in, const stride_t &stride_out, bool inplace)
    {
    auto ndim = shape.size();
    if (ndim<1) throw std::runtime_error("ndim must be >= 1");
    if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
      throw std::runtime_error("stride dimension mismatch");
    if (inplace && (stride_in!=stride_out))
      throw std::runtime_error("stride mismatch");
    }

  // As above, and additionally throws std::invalid_argument if any entry of
  // axes is out of range or appears more than once.
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    const shape_t &axes)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    auto ndim = shape.size();
    // tmp[ax] counts how often axis ax has been listed
    shape_t tmp(ndim,0);
    for (auto ax : axes)
      {
      if (ax>=ndim) throw std::invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
      }
    }

  // As the four-argument overload, and additionally throws
  // std::invalid_argument if the single axis is out of range.
  static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
    const stride_t &stride_in, const stride_t &stride_out, bool inplace,
    size_t axis)
    {
    sanity_check(shape, stride_in, stride_out, inplace);
    if (axis>=shape.size()) throw std::invalid_argument("bad axis number");
    }

#ifdef POCKETFFT_NO_MULTITHREADING
  // Multithreading disabled: always one thread.
  static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/,
    size_t /*axis*/, size_t /*vlen*/)
    { return 1; }
#else
  // Choose how many threads to use for a transform along `axis`.
  // nthreads==1 short-circuits to 1; nthreads==0 means "use
  // std::thread::hardware_concurrency()". The result is capped by the amount
  // of independent work, size/(shape[axis]*vlen), which is further divided by
  // 4 when the axis is shorter than 1000, and is never less than 1.
  static size_t thread_count (size_t nthreads, const shape_t &shape,
    size_t axis, size_t vlen)
    {
    if (nthreads==1) return 1;
    size_t size = prod(shape);
    size_t parallel = size / (shape[axis] * vlen);
    if (shape[axis] < 1000)
      parallel /= 4;
    size_t max_threads = nthreads == 0 ?
      std::thread::hardware_concurrency() : nthreads;
    return std::max(size_t(1), std::min(parallel, max_threads));
    }
#endif
  };

namespace threading {

#ifdef POCKETFFT_NO_MULTITHREADING

// Stand-ins used when POCKETFFT_NO_MULTITHREADING is defined.

// Index of the calling thread within its team: always 0 here.
constexpr inline size_t thread_id()
  { return size_t(0); }

// Size of the calling thread's team: always 1 here.
constexpr inline size_t num_threads()
  { return size_t(1); }

// Invokes f exactly once on the calling thread; the requested thread count
// is ignored.
template <typename Func>
void thread_map(size_t /* nthreads */, Func f)
  {
  f();
  }

#else

// Per-thread index within the current thread_map team (0 until assigned).
// Returned by reference so that the caller can set it.
inline size_t &thread_id()
  {
  static thread_local size_t my_index=0;
  return my_index;
  }
// Per-thread size of the current thread_map team (1 until assigned).
// Returned by reference so that the caller can set it.
inline size_t &num_threads()
  {
  static thread_local size_t team_size=1;
  return team_size;
  }
// Default worker count: the reported hardware concurrency, but at least 1
// (hardware_concurrency() may report 0).
static const size_t max_threads =
  std::max<unsigned>(1u, std::thread::hardware_concurrency());

// One-shot countdown barrier: wait() blocks until count_down() has been
// called as many times as the count given to the constructor.
class latch
  {
    std::atomic<size_t> num_left_;
    std::mutex mut_;
    std::condition_variable completed_;
    using lock_t = std::unique_lock<std::mutex>;

  public:
    // n: number of count_down() calls required before wait() returns
    latch(size_t n): num_left_(n) {}

    // Decrements the counter; the call that reaches zero wakes all waiters.
    void count_down()
      {
      lock_t guard(mut_);
      const bool reached_zero = (--num_left_ == 0);
      if (reached_zero)
        completed_.notify_all();
      }

    // Blocks until the counter has reached zero.
    void wait()
      {
      lock_t guard(mut_);
      while (!is_ready())
        completed_.wait(guard);
      }

    // True once the counter has reached zero.
    bool is_ready() { return num_left_ == 0; }
  };

// FIFO queue guarded by a mutex, with an atomic element counter that lets
// empty() and the fast path of try_pop() run without taking the lock.
template <typename T> class concurrent_queue
  {
    std::queue<T> q_;
    std::mutex mut_;
    // Must be explicitly zeroed: before C++20 a default-constructed
    // std::atomic holds an indeterminate value, so empty() could report a
    // freshly created queue as non-empty.
    std::atomic<size_t> size_{0};
    using lock_t = std::lock_guard<std::mutex>;

  public:

    // Appends val to the back of the queue.
    void push(T val)
      {
      lock_t lock(mut_);
      ++size_;
      q_.push(std::move(val));
      }

    // Moves the front element into val and returns true, or returns false
    // (leaving val untouched) if the queue is empty.
    bool try_pop(T &val)
      {
      // Cheap lock-free rejection when the queue is known to be empty
      if (size_ == 0) return false;
      lock_t lock(mut_);
      // Queue might have been emptied while we acquired the lock
      if (q_.empty()) return false;

      val = std::move(q_.front());
      --size_;
      q_.pop();
      return true;
      }

    // Lock-free snapshot of whether the queue currently holds no elements.
    bool empty() const { return size_==0; }
  };

// Stateless C++ allocator that requests storage aligned to alignof(T), so it
// can be used for over-aligned element types.
template <typename T> struct aligned_allocator
  {
  using value_type = T;

  aligned_allocator() = default;
  // Converting constructor for rebinding; there is no state to copy.
  template <class U>
  aligned_allocator(const aligned_allocator<U>&) {}

  // Obtains storage for n objects of type T via aligned_alloc().
  T *allocate(size_t n)
    {
    void *raw = aligned_alloc(alignof(T), n*sizeof(T));
    return static_cast<T*>(raw);
    }

  // Releases storage obtained from allocate(); the element count is unused.
  void deallocate(T *p, size_t /*n*/)
    {
    aligned_dealloc(p);
    }
  };

// Fixed-size pool of worker threads. Each worker owns a single-slot mailbox
// (`work`); tasks that cannot be handed to an idle worker go to a shared
// overflow queue that busy workers drain after finishing their own task.
class thread_pool
  {
    // A reasonable guess, probably close enough for most hardware
    static constexpr size_t cache_line_size = 64;
    // Per-thread state, aligned to cache_line_size
    struct alignas(cache_line_size) worker
      {
      std::thread thread;
      std::condition_variable work_ready;  // signalled when `work` is set
      std::mutex mut;                      // protects `work`
      std::atomic_flag busy_flag = ATOMIC_FLAG_INIT;  // set while claimed
      std::function<void()> work;          // single-slot mailbox

      // Thread body: runs until shutdown_flag is set and no directly
      // assigned task is still expected.
      void worker_main(
        std::atomic<bool> &shutdown_flag,
        std::atomic<size_t> &unscheduled_tasks,
        concurrent_queue<std::function<void()>> &overflow_work)
        {
        using lock_t = std::unique_lock<std::mutex>;
        bool expect_work = true;
        while (!shutdown_flag || expect_work)
          {
          std::function<void()> local_work;
          if (expect_work || unscheduled_tasks == 0)
            {
            lock_t lock(mut);
            // Wait until there is work to be executed
            work_ready.wait(lock, [&]{ return (work || shutdown_flag); });
            local_work.swap(work);
            expect_work = false;
            }

          bool marked_busy = false;
          if (local_work)
            {
            // submit() already set busy_flag when it assigned this task
            marked_busy = true;
            local_work();
            }

          if (!overflow_work.empty())
            {
            // If submit() claimed this worker in the meantime, go back and
            // pick up the task it is about to place in the mailbox.
            if (!marked_busy && busy_flag.test_and_set())
              {
              expect_work = true;
              continue;
              }
            marked_busy = true;

            while (overflow_work.try_pop(local_work))
              {
              --unscheduled_tasks;
              local_work();
              }
            }

          if (marked_busy) busy_flag.clear();
          }
        }
      };

    concurrent_queue<std::function<void()>> overflow_work_;
    std::mutex mut_;
    std::vector<worker, aligned_allocator<worker>> workers_;
    // Explicit initial values: before C++20 a default-constructed
    // std::atomic is uninitialised, so without these the pool would only
    // behave correctly when placed in zero-initialised (static) storage.
    std::atomic<bool> shutdown_{false};
    std::atomic<size_t> unscheduled_tasks_{0};
    using lock_t = std::lock_guard<std::mutex>;

    // Starts one thread per entry of workers_. If starting a thread throws,
    // the threads started so far are shut down and the exception propagates.
    void create_threads()
      {
      lock_t lock(mut_);
      size_t nthreads=workers_.size();
      for (size_t i=0; i<nthreads; ++i)
        {
        try
          {
          auto *worker = &workers_[i];
          worker->busy_flag.clear();
          worker->work = nullptr;
          worker->thread = std::thread([worker, this]
            {
            worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_);
            });
          }
        catch (...)
          {
          shutdown_locked();
          throw;
          }
        }
      }

    // Requests shutdown, wakes every worker and joins all joinable threads.
    // The caller must hold mut_.
    void shutdown_locked()
      {
      shutdown_ = true;
      for (auto &worker : workers_)
        worker.work_ready.notify_all();

      for (auto &worker : workers_)
        if (worker.thread.joinable())
          worker.thread.join();
      }

  public:
    // Creates a pool with nthreads worker threads.
    explicit thread_pool(size_t nthreads):
      workers_(nthreads)
      { create_threads(); }

    // Creates a pool with max_threads worker threads.
    thread_pool(): thread_pool(max_threads) {}

    ~thread_pool() { shutdown(); }

    // Schedules `work` for execution. Throws std::runtime_error if the pool
    // has been shut down.
    void submit(std::function<void()> work)
      {
      lock_t lock(mut_);
      if (shutdown_)
        throw std::runtime_error("Work item submitted after shutdown");

      ++unscheduled_tasks_;

      // First check for any idle workers and wake those
      for (auto &worker : workers_)
        if (!worker.busy_flag.test_and_set())
          {
          --unscheduled_tasks_;
          {
          lock_t lock(worker.mut);
          worker.work = std::move(work);
          }
          worker.work_ready.notify_one();
          return;
          }

      // If no workers were idle, push onto the overflow queue for later
      overflow_work_.push(std::move(work));
      }

    // Stops and joins all worker threads.
    void shutdown()
      {
      lock_t lock(mut_);
      shutdown_locked();
      }

    // Clears the shutdown request and starts the worker threads again.
    void restart()
      {
      shutdown_ = false;
      create_threads();
      }
  };

// Returns the shared thread pool, constructing it on first use.
inline thread_pool & get_pool()
  {
  // Function-local static: built the first time control reaches this line
  static thread_pool pool;
#ifdef POCKETFFT_PTHREADS
  // Register fork handlers exactly once. The pool's threads are stopped
  // before a fork and started again afterwards in both resulting processes.
  static std::once_flag f;
  std::call_once(f,
    []{
    pthread_atfork(
      +[]{ get_pool().shutdown(); },  // prepare
      +[]{ get_pool().restart(); },   // parent
      +[]{ get_pool().restart(); }    // child
      );
    });
#endif

  return pool;
  }

/** Map a function f over nthreads */
template <typename Func>
void thread_map(size_t nthreads, Func f)
  {
  if (nthreads == 0)
    nthreads = max_threads;

  if (nthreads == 1)
    { f(); return; }

  auto & pool = get_pool();
  latch counter(nthreads);
  std::exception_ptr ex;
  std::mutex ex_mut;
  for (size_t i=0; i<nthreads; ++i)
    {
    pool.submit(
      [&f, &counter, &ex, &ex_mut, i, nthreads] {
      thread_id() = i;
      num_threads() = nthreads;
      try { f(); }
      catch (...)
        {
        std::lock_guard<std::mutex> lock(ex_mut);
        ex = std::current_exception();
        }
      counter.count_down();
      });
    }
  counter.wait();
  if (ex)
    std::rethrow_exception(ex);
  }

#endif

}

//
// complex FFTPACK transforms
//

template<typename T0> class cfftp
  {
  private:
    // Bookkeeping for one factor of the transform length.
    struct fctdata
      {
      size_t fct;      // the factor (radix) itself
      cmplx<T0> *tw;   // twiddle table of this pass, (fct-1)*(ido-1) entries
      cmplx<T0> *tws;  // extra table of fct entries, only set for fct>11
      };

    size_t length;
    arr<cmplx<T0>> mem;
    std::vector<fctdata> fact;

    void add_factor(size_t factor)
      { fact.push_back({factor, nullptr, nullptr}); }

template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
  {
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+2*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
      CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
      for (size_t i=1; i<ido; ++i)
        {
        CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
        special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
        }
      }
  }

// Helper macros for pass3. They expand inside pass3's loops and rely on the
// names CC, CH, WA, k, i (and the t0/t1/t2 introduced by PREP3) being in
// scope at the point of use.
//
// PREP3: loads input 0 into t0, combines inputs 1 and 2 into t1/t2 via PM,
// and stores t0+t1 as output 0.
#define POCKETFFT_PREP3(idx) \
        T t0 = CC(idx,0,k), t1, t2; \
        PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
        CH(idx,k,0)=t0+t1;
// PARTSTEP3a: outputs u1/u2 for element 0, which needs no twiddle factor.
#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
        { \
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
        }
// PARTSTEP3b: outputs u1/u2 for element i>0, each multiplied by its
// twiddle factor through special_mul<fwd>.
#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
        { \
        T ca=t0+t1*twr; \
        T cb{-t2.i*twi, t2.r*twi}; \
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
        }
// Radix-3 pass: reads l1 groups of 3*ido values from cc and writes the
// butterflied (and, for i>0, twiddled) results to ch.
template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
  {
  // cos(2*pi/3) and sin(2*pi/3); the sine is negated for forward transforms
  constexpr T0 tw1r=-0.5,
               tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);

  // Index helpers: output is laid out (i, k, leg), input (i, leg, k)
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+3*c)]; };
  // Twiddle for leg x+1 at index i (i>=1)
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      // Extra scope so that PREP3's locals do not clash with the loop below
      {
      POCKETFFT_PREP3(0)
      POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
      }
      for (size_t i=1; i<ido; ++i)
        {
        POCKETFFT_PREP3(i)
        POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
        }
      }
  }

#undef POCKETFFT_PARTSTEP3b
#undef POCKETFFT_PARTSTEP3a
#undef POCKETFFT_PREP3

template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
  {
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+4*c)]; };
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T t1, t2, t3, t4;
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
      ROTX90<fwd>(t4);
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      {
      T t1, t2, t3, t4;
      PM(t2,t1,CC(0,0,k),CC(0,2,k));
      PM(t3,t4,CC(0,1,k),CC(0,3,k));
      ROTX90<fwd>(t4);
      PM(CH(0,k,0),CH(0,k,2),t2,t3);
      PM(CH(0,k,1),CH(0,k,3),t1,t4);
      }
      for (size_t i=1; i<ido; ++i)
        {
        T t1, t2, t3, t4;
        T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
        PM(t2,t1,cc0,cc2);
        PM(t3,t4,cc1,cc3);
        ROTX90<fwd>(t4);
        CH(i,k,0) = t2+t3;
        special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
        special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
        special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
        }
      }
  }

// Helper macros for pass5. They expand inside pass5's loops and rely on the
// names CC, CH, WA, k, i (and the t0..t4 introduced by PREP5) being in scope
// at the point of use.
//
// PREP5: loads input 0 into t0, combines inputs 1/4 and 2/3 via PM, and
// stores t0+t1+t2 as output 0.
#define POCKETFFT_PREP5(idx) \
        T t0 = CC(idx,0,k), t1, t2, t3, t4; \
        PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
        PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
        CH(idx,k,0).i=t0.i+t1.i+t2.i;

// PARTSTEP5a: outputs u1/u2 for element 0 (no twiddle factor).
// NOTE: twbi is pasted directly after the twai term without an operator, so
// callers must pass it with an explicit leading sign (e.g. +tw2i, -tw1i).
#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
        { \
        T ca,cb; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
        }

// PARTSTEP5b: outputs u1/u2 for element i>0, each multiplied by its twiddle
// factor. The same leading-sign convention for twbi applies. The locals da
// and db are declared but not used.
#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
        { \
        T ca,cb,da,db; \
        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
        cb.i=twai*t4.r twbi*t3.r; \
        cb.r=-(twai*t4.i twbi*t3.i); \
        special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
        }
// Radix-5 pass: reads l1 groups of 5*ido values from cc and writes the
// butterflied (and, for i>0, twiddled) results to ch.
template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
  {
  // cos/sin of 2*pi/5 and 4*pi/5; sines are negated for forward transforms
  constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
               tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
               tw2r= T0(-0.8090169943749474241022934171828191L),
               tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);

  // Index helpers: output is laid out (i, k, leg), input (i, leg, k)
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+5*c)]; };
  // Twiddle for leg x+1 at index i (i>=1)
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      // Extra scope so that PREP5's locals do not clash with the loop below
      {
      POCKETFFT_PREP5(0)
      POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
      POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
      }
      for (size_t i=1; i<ido; ++i)
        {
        POCKETFFT_PREP5(i)
        POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
        POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
        }
      }
  }

#undef POCKETFFT_PARTSTEP5b
#undef POCKETFFT_PARTSTEP5a
#undef POCKETFFT_PREP5

// Helper macros for pass7. They expand inside pass7's loops and rely on the
// names CC, CH, WA, k, i (and the t1..t7 introduced by PREP7) being in scope
// at the point of use.
//
// PREP7: loads input 0 into t1, combines inputs 1/6, 2/5 and 3/4 via PM, and
// stores t1+t2+t3+t4 as output 0.
#define POCKETFFT_PREP7(idx) \
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
        PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
        PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
        PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;

// PARTSTEP7a0: shared core that forms ca/cb and writes their PM combination
// to out1/out2.
// NOTE: y2 and y3 are pasted without an operator, so callers must pass them
// with an explicit leading sign (e.g. +tw2i, -tw3i).
#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
        { \
        T ca,cb; \
        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
        PM(out1,out2,ca,cb); \
        }
// PARTSTEP7a: element 0, results go straight to outputs u1/u2.
#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
// PARTSTEP7: element i>0, results are multiplied by their twiddle factors
// before being stored in outputs u1/u2.
#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
        { \
        T da,db; \
        POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
        }

// Radix-7 pass: reads l1 groups of 7*ido values from cc and writes the
// butterflied (and, for i>0, twiddled) results to ch.
template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
  {
  // cos/sin of 2*pi*m/7 for m=1,2,3; sines are negated for forward transforms
  constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
               tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
               tw2r= T0(-0.2225209339563144042889025644967948L),
               tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
               tw3r= T0(-0.9009688679024191262361023195074451L),
               tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);

  // Index helpers: output is laid out (i, k, leg), input (i, leg, k)
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+7*c)]; };
  // Twiddle for leg x+1 at index i (i>=1)
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      // Extra scope so that PREP7's locals do not clash with the loop below
      {
      POCKETFFT_PREP7(0)
      POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
      POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
      }
      for (size_t i=1; i<ido; ++i)
        {
        POCKETFFT_PREP7(i)
        POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
        POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
        }
      }
  }

#undef POCKETFFT_PARTSTEP7
#undef POCKETFFT_PARTSTEP7a0
#undef POCKETFFT_PARTSTEP7a
#undef POCKETFFT_PREP7

// Replaces a by a*(1-i)/sqrt(2) when fwd is true and by a*(1+i)/sqrt(2)
// otherwise, i.e. a rotation by 45 degrees whose direction depends on fwd.
template <bool fwd, typename T> void ROTX45(T &a) const
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
  // Snapshot both components before either one is overwritten
  const auto re=a.r, im=a.i;
  if (fwd)
    {
    a.r = hsqt2*(re+im);
    a.i = hsqt2*(im-re);
    }
  else
    {
    a.r = hsqt2*(re-im);
    a.i = hsqt2*(im+re);
    }
  }
// Replaces a by a*(-1-i)/sqrt(2) when fwd is true and by a*(-1+i)/sqrt(2)
// otherwise, i.e. a rotation by 135 degrees whose direction depends on fwd.
template <bool fwd, typename T> void ROTX135(T &a) const
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
  // Snapshot both components before either one is overwritten
  const auto re=a.r, im=a.i;
  if (fwd)
    {
    a.r = hsqt2*(im-re);
    a.i = hsqt2*(-re-im);
    }
  else
    {
    a.r = hsqt2*(-re-im);
    a.i = hsqt2*(re-im);
    }
  }

// Radix-8 pass: reads l1 groups of 8*ido values from cc and writes the
// butterflied (and, for i>0, twiddled) results to ch. The rotations needed
// between stages are applied with ROTX90, ROTX45 and ROTX135.
template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
  {
  // Index helpers: output is laid out (i, k, leg), input (i, leg, k)
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+8*c)]; };
  // Twiddle for leg x+1 at index i (i>=1)
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      // Odd-indexed inputs first
      PM(a1,a5,CC(0,1,k),CC(0,5,k));
      PM(a3,a7,CC(0,3,k),CC(0,7,k));
      PMINPLACE(a1,a3);
      ROTX90<fwd>(a3);

      ROTX90<fwd>(a7);
      PMINPLACE(a5,a7);
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);

      // Even-indexed inputs, then the final combination into the outputs
      PM(a0,a4,CC(0,0,k),CC(0,4,k));
      PM(a2,a6,CC(0,2,k),CC(0,6,k));
      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
      ROTX90<fwd>(a6);
      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      // Element 0: same butterfly as the ido==1 case, no twiddle factors
      {
      T a0, a1, a2, a3, a4, a5, a6, a7;
      PM(a1,a5,CC(0,1,k),CC(0,5,k));
      PM(a3,a7,CC(0,3,k),CC(0,7,k));
      PMINPLACE(a1,a3);
      ROTX90<fwd>(a3);

      ROTX90<fwd>(a7);
      PMINPLACE(a5,a7);
      ROTX45<fwd>(a5);
      ROTX135<fwd>(a7);

      PM(a0,a4,CC(0,0,k),CC(0,4,k));
      PM(a2,a6,CC(0,2,k),CC(0,6,k));
      PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
      PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
      ROTX90<fwd>(a6);
      PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
      PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
      }
      // Elements i>0: every output except leg 0 is multiplied by its
      // twiddle factor WA(leg-1,i)
      for (size_t i=1; i<ido; ++i)
        {
        T a0, a1, a2, a3, a4, a5, a6, a7;
        PM(a1,a5,CC(i,1,k),CC(i,5,k));
        PM(a3,a7,CC(i,3,k),CC(i,7,k));
        ROTX90<fwd>(a7);
        PMINPLACE(a1,a3);
        ROTX90<fwd>(a3);
        PMINPLACE(a5,a7);
        ROTX45<fwd>(a5);
        ROTX135<fwd>(a7);
        PM(a0,a4,CC(i,0,k),CC(i,4,k));
        PM(a2,a6,CC(i,2,k),CC(i,6,k));
        PMINPLACE(a0,a2);
        CH(i,k,0) = a0+a1;
        special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4));
        special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2));
        special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6));
        ROTX90<fwd>(a6);
        PMINPLACE(a4,a6);
        special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1));
        special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5));
        special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3));
        special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7));
        }
      }
   }


// Helper macros for pass11. They expand inside pass11's loops and rely on
// the names CC, CH, WA, k, i (and the t1..t11 introduced by PREP11) being in
// scope at the point of use.
//
// PREP11: loads input 0 into t1, combines inputs j and 11-j (j=1..5) via PM,
// and stores t1+t2+...+t6 as output 0.
#define POCKETFFT_PREP11(idx) \
        T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
        PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
        PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
        PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
        PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
        PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;

// PARTSTEP11a0: shared core that forms ca/cb and writes their PM combination
// to out1/out2.
// NOTE: y2..y5 are pasted without an operator, so callers must pass them
// with an explicit leading sign (e.g. +tw2i, -tw5i).
#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
        { \
        T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
          cb; \
        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
        PM(out1,out2,ca,cb); \
        }
// PARTSTEP11a: element 0, results go straight to outputs u1/u2.
#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
// PARTSTEP11: element i>0, results are multiplied by their twiddle factors
// before being stored in outputs u1/u2.
#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
        { \
        T da,db; \
        POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
        special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
        special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
        }

// Radix-11 pass: reads l1 groups of 11*ido values from cc and writes the
// butterflied (and, for i>0, twiddled) results to ch.
template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa) const
  {
  // cos/sin of 2*pi*m/11 for m=1..5; sines are negated for forward transforms
  constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
               tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
               tw2r= T0(0.4154150130018864255292741492296232L),
               tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
               tw3r= T0(-0.1423148382732851404437926686163697L),
               tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
               tw4r= T0(-0.6548607339452850640569250724662936L),
               tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
               tw5r= T0(-0.9594929736144973898903680570663277L),
               tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);

  // Index helpers: output is laid out (i, k, leg), input (i, leg, k)
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+11*c)]; };
  // Twiddle for leg x+1 at index i (i>=1)
  auto WA = [wa, ido](size_t x, size_t i)
    { return wa[i-1+x*(ido-1)]; };

  if (ido==1)
    for (size_t k=0; k<l1; ++k)
      {
      POCKETFFT_PREP11(0)
      POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
      POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
      POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
      POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
      }
  else
    for (size_t k=0; k<l1; ++k)
      {
      // Extra scope so that PREP11's locals do not clash with the loop below
      {
      POCKETFFT_PREP11(0)
      POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
      POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
      POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
      POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
      POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
      }
      for (size_t i=1; i<ido; ++i)
        {
        POCKETFFT_PREP11(i)
        POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
        POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
        POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
        POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
        POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
        }
      }
  }

#undef POCKETFFT_PARTSTEP11
#undef POCKETFFT_PARTSTEP11a0
#undef POCKETFFT_PARTSTEP11a
#undef POCKETFFT_PREP11

// Generic pass for an arbitrary factor ip.
//   cc    : input data; also receives the final result (written through
//           CX/CX2), unlike the fixed-radix passes which leave it in ch
//   ch    : scratch buffer of the same size
//   wa    : twiddle table of this pass
//   csarr : ip extra twiddle values for this factor; conjugated here for
//           forward transforms
template<bool fwd, typename T> void passg (size_t ido, size_t ip,
  size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const cmplx<T0> * POCKETFFT_RESTRICT wa,
  const cmplx<T0> * POCKETFFT_RESTRICT csarr) const
  {
  const size_t cdim=ip;
  size_t ipph = (ip+1)/2;
  size_t idl1 = ido*l1;

  // Index helpers. CC reads cc in input layout (i, leg, k); CX addresses cc
  // in output layout (i, k, leg); CX2/CH2 flatten (i,k) into one index.
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+cdim*c)]; };
  auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T&
    { return cc[a+ido*(b+l1*c)]; };
  auto CX2 = [cc, idl1](size_t a, size_t b) -> T&
    { return cc[a+idl1*b]; };
  auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
    { return ch[a+idl1*b]; };

  // Local copy of csarr with the sign of the imaginary part chosen by fwd
  arr<cmplx<T0>> wal(ip);
  wal[0] = cmplx<T0>(1., 0.);
  for (size_t i=1; i<ip; ++i)
    wal[i]=cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);

  // Stage 1: move the data to ch, combining legs j and ip-j via PM
  for (size_t k=0; k<l1; ++k)
    for (size_t i=0; i<ido; ++i)
      CH(i,k,0) = CC(i,0,k);
  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
    for (size_t k=0; k<l1; ++k)
      for (size_t i=0; i<ido; ++i)
        PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
  // Output leg 0 is the sum of ch legs 0..ipph-1
  for (size_t k=0; k<l1; ++k)
    for (size_t i=0; i<ido; ++i)
      {
      T tmp = CH(i,k,0);
      for (size_t j=1; j<ipph; ++j)
        tmp+=CH(i,k,j);
      CX(i,k,0) = tmp;
      }
  // Stage 2: accumulate output legs l and ip-l from the ch legs, weighting
  // leg j with wal[(j*l) mod ip]
  for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
    {
    // j=0
    for (size_t ik=0; ik<idl1; ++ik)
      {
      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
      }

    // iwal tracks the wal index, reduced by ip whenever it exceeds ip
    size_t iwal=2*l;
    size_t j=3, jc=ip-3;
    // Remaining legs, two at a time ...
    for (; j<ipph-1; j+=2, jc-=2)
      {
      iwal+=l; if (iwal>ip) iwal-=ip;
      cmplx<T0> xwal=wal[iwal];
      iwal+=l; if (iwal>ip) iwal-=ip;
      cmplx<T0> xwal2=wal[iwal];
      for (size_t ik=0; ik<idl1; ++ik)
        {
        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
        }
      }
    // ... and whatever is left over, one at a time
    for (; j<ipph; ++j, --jc)
      {
      iwal+=l; if (iwal>ip) iwal-=ip;
      cmplx<T0> xwal=wal[iwal];
      for (size_t ik=0; ik<idl1; ++ik)
        {
        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
        }
      }
    }

  // shuffling and twiddling
  if (ido==1)
    for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
      for (size_t ik=0; ik<idl1; ++ik)
        {
        T t1=CX2(ik,j), t2=CX2(ik,jc);
        PM(CX2(ik,j),CX2(ik,jc),t1,t2);
        }
  else
    {
    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
      for (size_t k=0; k<l1; ++k)
        {
        // Element 0 carries no twiddle factor
        T t1=CX(0,k,j), t2=CX(0,k,jc);
        PM(CX(0,k,j),CX(0,k,jc),t1,t2);
        for (size_t i=1; i<ido; ++i)
          {
          T x1, x2;
          PM(x1,x2,CX(i,k,j),CX(i,k,jc));
          // wa holds ido-1 entries per leg; leg j starts at (j-1)*(ido-1)
          size_t idij=(j-1)*(ido-1)+i-1;
          special_mul<fwd>(x1,wa[idij],CX(i,k,j));
          idij=(jc-1)*(ido-1)+i-1;
          special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
          }
        }
    }
  }

// Runs all factor passes over c (in place from the caller's point of view)
// and multiplies the result by fct. The passes ping-pong between c and a
// temporary buffer; the result is copied back to c if it ends up in the
// temporary.
template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
  {
  if (length==1)
    {
    c[0]*=fct;
    return;
    }

  arr<T> ch(length);
  T *src=c, *dst=ch.data();
  size_t l1=1;

  for (size_t idx=0; idx<fact.size(); ++idx)
    {
    const auto &cur = fact[idx];
    const size_t ip = cur.fct;
    const size_t l2 = ip*l1;
    const size_t ido = length/l2;
    switch (ip)
      {
      case 4:
        pass4<fwd> (ido, l1, src, dst, cur.tw);
        break;
      case 8:
        pass8<fwd>(ido, l1, src, dst, cur.tw);
        break;
      case 2:
        pass2<fwd>(ido, l1, src, dst, cur.tw);
        break;
      case 3:
        pass3<fwd> (ido, l1, src, dst, cur.tw);
        break;
      case 5:
        pass5<fwd> (ido, l1, src, dst, cur.tw);
        break;
      case 7:
        pass7<fwd> (ido, l1, src, dst, cur.tw);
        break;
      case 11:
        pass11<fwd> (ido, l1, src, dst, cur.tw);
        break;
      default:
        passg<fwd>(ido, ip, l1, src, dst, cur.tw, cur.tws);
        // passg leaves its result in the input buffer, so cancel the
        // unconditional swap that follows
        std::swap(src,dst);
        break;
      }
    std::swap(src,dst);
    l1=l2;
    }

  const bool need_scale = (fct!=1.);
  if (src==c)
    {
    // Result is already in place; only scaling may be left to do
    if (need_scale)
      for (size_t i=0; i<length; ++i)
        c[i] *= fct;
    return;
    }

  // Result lives in the temporary: copy it back, scaling on the way
  if (need_scale)
    {
    for (size_t i=0; i<length; ++i)
      c[i] = ch[i]*fct;
    }
  else
    {
    std::copy_n (src, length, c);
    }
  }

  public:
    // Transforms the `length` elements of c in place and scales them by fct;
    // fwd selects between the forward and the backward transform.
    template<typename T> void exec(T c[], T0 fct, bool fwd) const
      {
      if (fwd)
        pass_all<true>(c, fct);
      else
        pass_all<false>(c, fct);
      }

  private:
    // Splits `length` into factors and appends them to `fact`: first as many
    // 8s as possible, then 4s, then at most one 2 (moved to the front of the
    // list), then odd factors in increasing order.
    POCKETFFT_NOINLINE void factorize()
      {
      size_t remaining=length;
      for (; (remaining&7)==0; remaining>>=3)
        add_factor(8);
      for (; (remaining&3)==0; remaining>>=2)
        add_factor(4);
      if ((remaining&1)==0)
        {
        remaining>>=1;
        // factor 2 should be at the front of the factor list
        add_factor(2);
        std::swap(fact.front().fct, fact.back().fct);
        }
      // Trial division by odd numbers up to sqrt(remaining)
      size_t divisor=3;
      while (divisor*divisor<=remaining)
        {
        while ((remaining%divisor)==0)
          {
          add_factor(divisor);
          remaining/=divisor;
          }
        divisor+=2;
        }
      // Anything left over is itself a factor
      if (remaining>1)
        add_factor(remaining);
      }

    size_t twsize() const
      {
      size_t twsize=0, l1=1;
      for (size_t k=0; k<fact.size(); ++k)
        {
        size_t ip=fact[k].fct, ido= length/(l1*ip);
        twsize+=(ip-1)*(ido-1);
        if (ip>11)
          twsize+=ip;
        l1*=ip;
        }
      return twsize;
      }

    // Fills `mem` with the twiddle factors of every pass and points each
    // factor's tw (and, for factors larger than 11, tws) at its slice.
    void comp_twiddle()
      {
      sincos_2pibyn<T0> twiddle(length);
      size_t l1=1;
      size_t offset=0;
      for (auto &entry : fact)
        {
        const size_t ip=entry.fct;
        const size_t ido=length/(l1*ip);

        // Main table: ido-1 entries for each of the legs 1..ip-1
        entry.tw = mem.data()+offset;
        offset += (ip-1)*(ido-1);
        for (size_t j=1; j<ip; ++j)
          for (size_t i=1; i<ido; ++i)
            entry.tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];

        // Extra table of ip entries, only for the generic pass
        if (ip>11)
          {
          entry.tws = mem.data()+offset;
          offset += ip;
          for (size_t j=0; j<ip; ++j)
            entry.tws[j] = twiddle[j*l1*ido];
          }
        l1 *= ip;
        }
      }

  public:
    // Prepares a transform of the given length.
    // Throws std::runtime_error for length 0. A length-1 transform needs no
    // factors or twiddles, so nothing further is set up in that case.
    POCKETFFT_NOINLINE cfftp(size_t length_)
      : length(length_)
      {
      if (length==0)
        throw std::runtime_error("zero-length FFT requested");
      if (length>1)
        {
        factorize();
        mem.resize(twsize());
        comp_twiddle();
        }
      }
  };

//
// real-valued FFTPACK transforms
//

template<typename T0> class rfftp
  {
  private:
    // Bookkeeping for one factor of the transform length.
    struct fctdata
      {
      size_t fct;  // the factor (radix) itself
      T0 *tw;      // first twiddle table pointer, null until assigned
      T0 *tws;     // second twiddle table pointer, null until assigned
      };

    size_t length;
    arr<T0> mem;
    std::vector<fctdata> fact;

    void add_factor(size_t factor)
      { fact.push_back({factor, nullptr, nullptr}); }

/* (a+ib) = conj(c+id) * (e+if)
   a receives the real part and b the imaginary part of the product. */
template<typename T1, typename T2, typename T3> inline void MULPM
  (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const
  {
  a = c*e+d*f;
  b = c*f-d*e;
  }

template<typename T> void radf2 (size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+l1*c)]; };
  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+2*c)]; };

  for (size_t k=0; k<l1; k++)
    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
  if ((ido&1)==0)
    for (size_t k=0; k<l1; k++)
      {
      CH(    0,1,k) = -CC(ido-1,k,1);
      CH(ido-1,0,k) =  CC(ido-1,k,0);
      }
  if (ido<=2) return;
  for (size_t k=0; k<l1; k++)
    for (size_t i=2; i<ido; i+=2)
      {
      size_t ic=ido-i;
      T tr2, ti2;
      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2);
      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0));
      }
  }

// a2=a+b; b2=i*(b-a);
// Treats (rx,ix) and (ry,iy) as the real/imaginary parts of two complex
// values a and b and overwrites them in place: (rx,ix) becomes a+b and
// (ry,iy) becomes i*(b-a). All four arguments must be modifiable lvalues.
#define POCKETFFT_REARRANGE(rx, ix, ry, iy) \
  {\
  auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \
  rx=t1; ix=t3; ry=t4; iy=t2; \
  }

// Radix-3 kernel for the r2hc direction (exec() calls it for factor 3).
//   ido, l1 : block sizes computed by exec() for the current factor
//   cc      : input,  addressed as CC(a,b,c) = cc[a+ido*(b+l1*c)]
//   ch      : output, addressed as CH(a,b,c) = ch[a+ido*(b+3*c)]
//   wa      : twiddles for this factor, addressed as WA(x,i) = wa[i+x*(ido-1)]
// taur/taui are cos(2*pi/3) and sin(2*pi/3).
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radf3(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);

  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+l1*c)]; };
  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+3*c)]; };

  // first element (index 0) of every block k: no twiddle involved
  for (size_t k=0; k<l1; k++)
    {
    T cr2=CC(0,k,1)+CC(0,k,2);
    CH(0,0,k) = CC(0,k,0)+cr2;
    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
    }
  if (ido==1) return;
  // remaining elements in (i-1,i) pairs; ic=ido-i is the mirrored output index
  for (size_t k=0; k<l1; k++)
    for (size_t i=2; i<ido; i+=2)
      {
      size_t ic=ido-i;
      T di2, di3, dr2, dr3;
      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1
      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2
      POCKETFFT_REARRANGE(dr2, di2, dr3, di3);
      CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add
      CH(i  ,0,k) = CC(i  ,k,0)+di2;
      T tr2 = CC(i-1,k,0)+taur*dr2; // c add
      T ti2 = CC(i  ,k,0)+taur*di2;
      T tr3 = taui*dr3;  // t3 = taui*i*(d3-d2)?
      T ti3 = taui*di3;
      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3
      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3)
      }
  }

// Radix-4 kernel for the r2hc direction (exec() calls it for factor 4).
//   ido, l1 : block sizes computed by exec() for the current factor
//   cc      : input,  addressed as CC(a,b,c) = cc[a+ido*(b+l1*c)]
//   ch      : output, addressed as CH(a,b,c) = ch[a+ido*(b+4*c)]
//   wa      : twiddles for this factor, addressed as WA(x,i) = wa[i+x*(ido-1)]
// hsqt2 is 1/sqrt(2).
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radf4(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);

  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+l1*c)]; };
  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+4*c)]; };

  // first element (index 0) of every block k: no twiddle involved
  for (size_t k=0; k<l1; k++)
    {
    T tr1,tr2;
    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
    }
  // even ido: element ido-1 is handled separately using the constant hsqt2
  if ((ido&1)==0)
    for (size_t k=0; k<l1; k++)
      {
      T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
      T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2));
      }
  if (ido<=2) return;
  // remaining elements in (i-1,i) pairs; ic=ido-i is the mirrored output index
  for (size_t k=0; k<l1; k++)
    for (size_t i=2; i<ido; i+=2)
      {
      size_t ic=ido-i;
      T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
      PM(tr1,tr4,cr4,cr2);
      PM(ti1,ti4,ci2,ci4);
      PM(tr2,tr3,CC(i-1,k,0),cr3);
      PM(ti2,ti3,CC(i  ,k,0),ci3);
      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2);
      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3);
      }
  }

// Radix-5 kernel for the r2hc direction (exec() calls it for factor 5).
//   ido, l1 : block sizes computed by exec() for the current factor
//   cc      : input,  addressed as CC(a,b,c) = cc[a+ido*(b+l1*c)]
//   ch      : output, addressed as CH(a,b,c) = ch[a+ido*(b+5*c)]
//   wa      : twiddles for this factor, addressed as WA(x,i) = wa[i+x*(ido-1)]
// tr11/ti11 are cos/sin of 2*pi/5, tr12/ti12 are cos/sin of 4*pi/5.
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radf5(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
               ti11= T0(0.9510565162951535721164393333793821L),
               tr12= T0(-0.8090169943749474241022934171828191L),
               ti12= T0(0.5877852522924731291687059546390728L);

  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+l1*c)]; };
  auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+5*c)]; };

  // first element (index 0) of every block k: no twiddle involved
  for (size_t k=0; k<l1; k++)
    {
    T cr2, cr3, ci4, ci5;
    PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
    PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
    CH(0,2,k)=ti11*ci5+ti12*ci4;
    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
    CH(0,4,k)=ti12*ci5-ti11*ci4;
    }
  if (ido==1) return;
  // remaining elements in (i-1,i) pairs; ic runs downwards so that ic==ido-i
  for (size_t k=0; k<l1;++k)
    for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
      {
      T di2, di3, di4, di5, dr2, dr3, dr4, dr5;
      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
      POCKETFFT_REARRANGE(dr2, di2, dr5, di5);
      POCKETFFT_REARRANGE(dr3, di3, dr4, di4);
      CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
      CH(i  ,0,k)=CC(i  ,k,0)+di2+di3;
      T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
      T ti2=CC(i  ,k,0)+tr11*di2+tr12*di3;
      T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
      T ti3=CC(i  ,k,0)+tr12*di2+tr11*di3;
      T tr5 = ti11*dr5 + ti12*dr4;
      T ti5 = ti11*di5 + ti12*di4;
      T tr4 = ti12*dr5 - ti11*dr4;
      T ti4 = ti12*di5 - ti11*di4;
      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2);
      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3);
      }
  }

#undef POCKETFFT_REARRANGE

// Generic-radix kernel for the r2hc direction; exec() uses it for every factor
// other than 2, 3, 4 and 5.
//   ido, l1 : block sizes computed by exec() for the current factor
//   ip      : the factor being processed
//   cc      : input AND final output; it is also modified as scratch space
//   ch      : scratch buffer
//   wa      : per-factor twiddles (fctdata::tw)
//   csarr   : the 2*ip-entry table built by comp_twiddle() (fctdata::tws),
//             read as (csarr[2*m], csarr[2*m+1]) pairs
// Unlike radf2..radf5 the result ends up in cc, not ch, which is why exec()
// performs an extra pointer swap after calling this function.
// The same buffers are viewed through several index layouts:
//   CC(a,b,c) = cc[a+ido*(b+ip*c)]   C1(a,b,c) = cc[a+ido*(b+l1*c)]
//   CH(a,b,c) = ch[a+ido*(b+l1*c)]   C2(a,b) = cc[a+idl1*b], CH2(a,b) = ch[a+idl1*b]
// The trailing numeric comments (// 114 etc.) are labels carried by the
// original source; presumably they refer to the FORTRAN FFTPACK statement
// numbers -- confirm.
// PM and MPINPLACE are defined elsewhere in this file.
template<typename T> void radfg(size_t ido, size_t ip, size_t l1,
  T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
  {
  const size_t cdim=ip;
  size_t ipph=(ip+1)/2;
  size_t idl1 = ido*l1;

  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T&
    { return cc[a+ido*(b+cdim*c)]; };
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T&
    { return ch[a+ido*(b+l1*c)]; };
  auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T&
    { return cc[a+ido*(b+l1*c)]; };
  auto C2 = [cc,idl1] (size_t a, size_t b) -> T&
    { return cc[a+idl1*b]; };
  auto CH2 = [ch,idl1] (size_t a, size_t b) -> T&
    { return ch[a+idl1*b]; };

  // Step 1 (only when ido>1): apply the twiddles from wa to the paired
  // sub-sequences j and jc=ip-j and combine them in place inside cc.
  if (ido>1)
    {
    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
      {
      size_t is=(j-1)*(ido-1),
             is2=(jc-1)*(ido-1);
      for (size_t k=0; k<l1; ++k)                            // 113
        {
        size_t idij=is;
        size_t idij2=is2;
        for (size_t i=1; i<=ido-2; i+=2)                      // 112
          {
          T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
            t3=C1(i,k,jc), t4=C1(i+1,k,jc);
          T x1=wa[idij]*t1 + wa[idij+1]*t2,
            x2=wa[idij]*t2 - wa[idij+1]*t1,
            x3=wa[idij2]*t3 + wa[idij2+1]*t4,
            x4=wa[idij2]*t4 - wa[idij2+1]*t3;
          PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
          PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
          idij+=2;
          idij2+=2;
          }
        }
      }
    }

  // Step 2: combine the index-0 elements of each (j,jc) pair in place.
  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
    for (size_t k=0; k<l1; ++k)                              // 122
      MPINPLACE(C1(0,k,jc), C1(0,k,j));

//everything in C
//memset(ch,0,ip*l1*ido*sizeof(double));

  // Step 3: for each output pair (l,lc) accumulate into ch a weighted sum of
  // the cc slices.  iang steps by l modulo ip and selects the csarr entry.
  // The sum over j is unrolled by 4, then by 2, then finished one at a time.
  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
    {
    for (size_t ik=0; ik<idl1; ++ik)                         // 124
      {
      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
      }
    size_t iang = 2*l;
    size_t j=3, jc=ip-3;
    for (; j<ipph-3; j+=4,jc-=4)              // 126
      {
      iang+=l; if (iang>=ip) iang-=ip;
      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
      iang+=l; if (iang>=ip) iang-=ip;
      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
      iang+=l; if (iang>=ip) iang-=ip;
      T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
      iang+=l; if (iang>=ip) iang-=ip;
      T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
      for (size_t ik=0; ik<idl1; ++ik)                       // 125
        {
        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
        }
      }
    for (; j<ipph-1; j+=2,jc-=2)              // 126
      {
      iang+=l; if (iang>=ip) iang-=ip;
      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
      iang+=l; if (iang>=ip) iang-=ip;
      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
      for (size_t ik=0; ik<idl1; ++ik)                       // 125
        {
        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
        }
      }
    for (; j<ipph; ++j,--jc)              // 126
      {
      iang+=l; if (iang>=ip) iang-=ip;
      T0 ar=csarr[2*iang], ai=csarr[2*iang+1];
      for (size_t ik=0; ik<idl1; ++ik)                       // 125
        {
        CH2(ik,l ) += ar*C2(ik,j );
        CH2(ik,lc) += ai*C2(ik,jc);
        }
      }
    }
  // Step 4: slice 0 of ch becomes the plain sum of cc slices 0..ipph-1.
  for (size_t ik=0; ik<idl1; ++ik)                         // 101
    CH2(ik,0) = C2(ik,0);
  for (size_t j=1; j<ipph; ++j)                              // 129
    for (size_t ik=0; ik<idl1; ++ik)                         // 128
      CH2(ik,0) += C2(ik,j);

// everything in CH at this point!
//memset(cc,0,ip*l1*ido*sizeof(double));

  // Step 5: write the result back into cc using the CC (ip-strided) layout.
  for (size_t k=0; k<l1; ++k)                                // 131
    for (size_t i=0; i<ido; ++i)                             // 130
      CC(i,0,k) = CH(i,k,0);

  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
    {
    size_t j2=2*j-1;
    for (size_t k=0; k<l1; ++k)                              // 136
      {
      CC(ido-1,j2,k) = CH(0,k,j);
      CC(0,j2+1,k) = CH(0,k,jc);
      }
    }

  if (ido==1) return;

  // ido>1: remaining elements, with ic mirroring i (ic = ido-i-2)
  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
    {
    size_t j2=2*j-1;
    for(size_t k=0; k<l1; ++k)                               // 139
      for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
        {
        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
        }
    }
  }

// Radix-2 kernel for the backward direction (exec() calls it for factor 2
// when r2hc is false).
//   ido, l1 : block sizes computed by exec() for the current factor
//   cc      : input,  addressed as CC(a,b,c) = cc[a+ido*(b+2*c)]
//   ch      : output, addressed as CH(a,b,c) = ch[a+ido*(b+l1*c)]
//   wa      : twiddles for this factor, addressed as WA(x,i) = wa[i+x*(ido-1)]
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radb2(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+2*c)]; };
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };

  // first element (index 0) of every block k: no twiddle involved
  for (size_t k=0; k<l1; k++)
    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
  // even ido: element ido-1 is handled separately, again without twiddles
  if ((ido&1)==0)
    for (size_t k=0; k<l1; k++)
      {
      CH(ido-1,k,0) = 2*CC(ido-1,0,k);
      CH(ido-1,k,1) =-2*CC(0    ,1,k);
      }
  if (ido<=2) return;
  // remaining elements in (i-1,i) pairs; ic=ido-i is the mirrored input index
  for (size_t k=0; k<l1;++k)
    for (size_t i=2; i<ido; i+=2)
      {
      size_t ic=ido-i;
      T ti2, tr2;
      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k));
      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
      }
  }

// Radix-3 kernel for the backward direction (exec() calls it for factor 3
// when r2hc is false).
//   ido, l1 : block sizes computed by exec() for the current factor
//   cc      : input,  addressed as CC(a,b,c) = cc[a+ido*(b+3*c)]
//   ch      : output, addressed as CH(a,b,c) = ch[a+ido*(b+l1*c)]
//   wa      : twiddles for this factor, addressed as WA(x,i) = wa[i+x*(ido-1)]
// taur/taui are cos(2*pi/3) and sin(2*pi/3).
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radb3(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);

  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+3*c)]; };
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };

  // first element (index 0) of every block k: no twiddle involved
  for (size_t k=0; k<l1; k++)
    {
    T tr2=2*CC(ido-1,1,k);
    T cr2=CC(0,0,k)+taur*tr2;
    CH(0,k,0)=CC(0,0,k)+tr2;
    T ci3=2*taui*CC(0,2,k);
    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
    }
  if (ido==1) return;
  // remaining elements in (i-1,i) pairs; ic runs downwards so that ic==ido-i
  for (size_t k=0; k<l1; k++)
    for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
      {
      T tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
      T ti2=CC(i  ,2,k)-CC(ic  ,1,k);
      T cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
      T ci2=CC(i  ,0,k)+taur*ti2;
      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
      T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
      T ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
      T di2, di3, dr2, dr3;
      PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
      PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2
      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
      }
  }

// Radix-4 kernel for the backward direction (exec() calls it for factor 4
// when r2hc is false).
//   ido, l1 : block sizes computed by exec() for the current factor
//   cc      : input,  addressed as CC(a,b,c) = cc[a+ido*(b+4*c)]
//   ch      : output, addressed as CH(a,b,c) = ch[a+ido*(b+l1*c)]
//   wa      : twiddles for this factor, addressed as WA(x,i) = wa[i+x*(ido-1)]
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radb4(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);

  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+4*c)]; };
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };

  // first element (index 0) of every block k: no twiddle involved
  for (size_t k=0; k<l1; k++)
    {
    T tr1, tr2;
    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
    T tr3=2*CC(ido-1,1,k);
    T tr4=2*CC(0,2,k);
    PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
    PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
    }
  // even ido: element ido-1 is handled separately using the constant sqrt2
  if ((ido&1)==0)
    for (size_t k=0; k<l1; k++)
      {
      T tr1,tr2,ti1,ti2;
      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k));
      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
      CH(ido-1,k,0)=tr2+tr2;
      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
      CH(ido-1,k,2)=ti2+ti2;
      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
      }
  if (ido<=2) return;
  // remaining elements in (i-1,i) pairs; ic=ido-i is the mirrored input index
  for (size_t k=0; k<l1;++k)
    for (size_t i=2; i<ido; i+=2)
      {
      T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
      size_t ic=ido-i;
      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k));
      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k));
      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
      PM (CH(i-1,k,0),cr3,tr2,tr3);
      PM (CH(i  ,k,0),ci3,ti2,ti3);
      PM (cr4,cr2,tr1,tr4);
      PM (ci2,ci4,ti1,ti4);
      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
      }
  }

// Radix-5 kernel for the backward direction (exec() calls it for factor 5
// when r2hc is false).
//   ido, l1 : block sizes computed by exec() for the current factor
//   cc      : input,  addressed as CC(a,b,c) = cc[a+ido*(b+5*c)]
//   ch      : output, addressed as CH(a,b,c) = ch[a+ido*(b+l1*c)]
//   wa      : twiddles for this factor, addressed as WA(x,i) = wa[i+x*(ido-1)]
// tr11/ti11 are cos/sin of 2*pi/5, tr12/ti12 are cos/sin of 4*pi/5.
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radb5(size_t ido, size_t l1,
  const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa) const
  {
  constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
               ti11= T0(0.9510565162951535721164393333793821L),
               tr12= T0(-0.8090169943749474241022934171828191L),
               ti12= T0(0.5877852522924731291687059546390728L);

  auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
  auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+5*c)]; };
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };

  // first element (index 0) of every block k: no twiddle involved
  for (size_t k=0; k<l1; k++)
    {
    T ti5=CC(0,2,k)+CC(0,2,k);
    T ti4=CC(0,4,k)+CC(0,4,k);
    T tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
    T tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
    T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
    T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
    T ci4, ci5;
    // MULPM used purely as arithmetic here: ci5=ti5*ti11+ti4*ti12,
    // ci4=ti5*ti12-ti4*ti11
    MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
    PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
    PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
    }
  if (ido==1) return;
  // remaining elements in (i-1,i) pairs; ic runs downwards so that ic==ido-i
  for (size_t k=0; k<l1;++k)
    for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
      {
      T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k));
      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k));
      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
      T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
      T ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
      T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
      T ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
      T ci4, ci5, cr5, cr4;
      MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
      MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
      T dr2, dr3, dr4, dr5, di2, di3, di4, di5;
      PM(dr4,dr3,cr3,ci4);
      PM(di3,di4,ci3,cr4);
      PM(dr5,dr2,cr2,ci5);
      PM(di2,di5,ci2,cr5);
      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
      }
  }

// Generic-radix kernel for the backward direction; exec() uses it for every
// factor other than 2, 3, 4 and 5 when r2hc is false.
//   ido, l1 : block sizes computed by exec() for the current factor
//   ip      : the factor being processed
//   cc      : input; it is also overwritten as scratch space (through C2)
//   ch      : scratch and final output
//   wa      : per-factor twiddles (fctdata::tw)
//   csarr   : the 2*ip-entry table built by comp_twiddle() (fctdata::tws),
//             read as (csarr[2*m], csarr[2*m+1]) pairs
// The same buffers are viewed through several index layouts:
//   CC(a,b,c) = cc[a+ido*(b+ip*c)]   C1(a,b,c) = cc[a+ido*(b+l1*c)]
//   CH(a,b,c) = ch[a+ido*(b+l1*c)]   C2(a,b) = cc[a+idl1*b], CH2(a,b) = ch[a+idl1*b]
// PM is defined elsewhere in this file; presumably PM(a,b,c,d) stores c+d in a
// and c-d in b -- confirm against its definition.
template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
  T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
  const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
  {
  const size_t cdim=ip;
  size_t ipph=(ip+1)/ 2;
  size_t idl1 = ido*l1;

  auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+cdim*c)]; };
  auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
    { return ch[a+ido*(b+l1*c)]; };
  auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
    { return cc[a+ido*(b+l1*c)]; };
  auto C2 = [cc,idl1](size_t a, size_t b) -> T&
    { return cc[a+idl1*b]; };
  auto CH2 = [ch,idl1](size_t a, size_t b) -> T&
    { return ch[a+idl1*b]; };

  // Step 1: unpack cc (ip-strided CC layout) into ch (l1-strided CH layout).
  for (size_t k=0; k<l1; ++k)        // 102
    for (size_t i=0; i<ido; ++i)     // 101
      CH(i,k,0) = CC(i,0,k);
  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
    {
    size_t j2=2*j-1;
    for (size_t k=0; k<l1; ++k)
      {
      CH(0,k,j ) = 2*CC(ido-1,j2,k);
      CH(0,k,jc) = 2*CC(0,j2+1,k);
      }
    }

  if (ido!=1)
    {
    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
      {
      size_t j2=2*j-1;
      for (size_t k=0; k<l1; ++k)
        for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
          {
          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
          }
      }
    }
  // Step 2: for each output pair (l,lc) accumulate into cc a weighted sum of
  // the ch slices.  iang steps by l modulo ip and selects the csarr entry.
  // The sum over j is unrolled by 4, then by 2, then finished one at a time.
  //
  // The wrap test is `iang>=ip` (as in radfg): csarr only has 2*ip entries, so
  // iang must stay strictly below ip.  For the prime factors produced by
  // factorize() iang never equals ip, so results are unchanged.
  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
    {
    for (size_t ik=0; ik<idl1; ++ik)
      {
      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
      }
    size_t iang=2*l;
    size_t j=3,jc=ip-3;
    for(; j<ipph-3; j+=4,jc-=4)
      {
      iang+=l; if(iang>=ip) iang-=ip;
      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
      iang+=l; if(iang>=ip) iang-=ip;
      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
      iang+=l; if(iang>=ip) iang-=ip;
      T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
      iang+=l; if(iang>=ip) iang-=ip;
      T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
      for (size_t ik=0; ik<idl1; ++ik)
        {
        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
        }
      }
    for(; j<ipph-1; j+=2,jc-=2)
      {
      iang+=l; if(iang>=ip) iang-=ip;
      T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
      iang+=l; if(iang>=ip) iang-=ip;
      T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
      for (size_t ik=0; ik<idl1; ++ik)
        {
        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
        }
      }
    for(; j<ipph; ++j,--jc)
      {
      iang+=l; if(iang>=ip) iang-=ip;
      T0 war=csarr[2*iang], wai=csarr[2*iang+1];
      for (size_t ik=0; ik<idl1; ++ik)
        {
        C2(ik,l ) += war*CH2(ik,j );
        C2(ik,lc) += wai*CH2(ik,jc);
        }
      }
    }
  // Step 3: slice 0 of ch accumulates ch slices 1..ipph-1; the index-0 elements
  // of the other slices are rebuilt from the sums now stored in cc.
  for (size_t j=1; j<ipph; ++j)
    for (size_t ik=0; ik<idl1; ++ik)
      CH2(ik,0) += CH2(ik,j);
  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
    for (size_t k=0; k<l1; ++k)
      PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));

  if (ido==1) return;

  // ido>1: remaining elements of each (j,jc) pair
  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
    for (size_t k=0; k<l1; ++k)
      for (size_t i=1; i<=ido-2; i+=2)
        {
        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
        }

// All in CH

  // Step 4: apply the twiddles from wa in place to slices 1..ip-1 of ch.
  for (size_t j=1; j<ip; ++j)
    {
    size_t is = (j-1)*(ido-1);
    for (size_t k=0; k<l1; ++k)
      {
      size_t idij = is;
      for (size_t i=1; i<=ido-2; i+=2)
        {
        T t1=CH(i,k,j), t2=CH(i+1,k,j);
        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
        idij+=2;
        }
      }
    }
  }

    // Leaves the `length` values of p1, multiplied by fct, in c.
    // p1 may be c itself; the multiplication is skipped when fct equals 1.
    template<typename T> void copy_and_norm(T *c, T *p1, T0 fct) const
      {
      const bool scale = (fct!=1.);
      if (p1==c)
        {
        // data is already in place: scale only if required
        if (scale)
          for (size_t idx=0; idx<length; ++idx)
            c[idx] *= fct;
        return;
        }
      if (!scale)
        {
        std::copy_n (p1, length, c);
        return;
        }
      for (size_t idx=0; idx<length; ++idx)
        c[idx] = fct*p1[idx];
      }

  public:
    template<typename T> void exec(T c[], T0 fct, bool r2hc) const
      {
      if (length==1) { c[0]*=fct; return; }
      size_t nf=fact.size();
      arr<T> ch(length);
      T *p1=c, *p2=ch.data();

      if (r2hc)
        for(size_t k1=0, l1=length; k1<nf;++k1)
          {
          size_t k=nf-k1-1;
          size_t ip=fact[k].fct;
          size_t ido=length / l1;
          l1 /= ip;
          if(ip==4)
            radf4(ido, l1, p1, p2, fact[k].tw);
          else if(ip==2)
            radf2(ido, l1, p1, p2, fact[k].tw);
          else if(ip==3)
            radf3(ido, l1, p1, p2, fact[k].tw);
          else if(ip==5)
            radf5(ido, l1, p1, p2, fact[k].tw);
          else
            { radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); std::swap (p1,p2); }
          std::swap (p1,p2);
          }
      else
        for(size_t k=0, l1=1; k<nf; k++)
          {
          size_t ip = fact[k].fct,
                 ido= length/(ip*l1);
          if(ip==4)
            radb4(ido, l1, p1, p2, fact[k].tw);
          else if(ip==2)
            radb2(ido, l1, p1, p2, fact[k].tw);
          else if(ip==3)
            radb3(ido, l1, p1, p2, fact[k].tw);
          else if(ip==5)
            radb5(ido, l1, p1, p2, fact[k].tw);
          else
            radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
          std::swap (p1,p2);
          l1*=ip;
          }

      copy_and_norm(c,p1,fct);
      }

  private:
    // Splits `length` into factors and records them via add_factor():
    // first as many 4s as possible, then at most one 2 (moved to the front of
    // the list), then odd divisors in increasing order, and finally whatever
    // remains if it is larger than 1.
    void factorize()
      {
      size_t rem=length;
      for (; (rem&3)==0; rem/=4)
        add_factor(4);
      if ((rem&1)==0)
        {
        rem/=2;
        // factor 2 should be at the front of the factor list
        add_factor(2);
        std::swap(fact.front().fct, fact.back().fct);
        }
      for (size_t cand=3; cand*cand<=rem; cand+=2)
        for (; (rem%cand)==0; rem/=cand)
          add_factor(cand);
      if (rem>1)
        add_factor(rem);
      }

    size_t twsize() const
      {
      size_t twsz=0, l1=1;
      for (size_t k=0; k<fact.size(); ++k)
        {
        size_t ip=fact[k].fct, ido=length/(l1*ip);
        twsz+=(ip-1)*(ido-1);
        if (ip>5) twsz+=2*ip;
        l1*=ip;
        }
      return twsz;
      }

    void comp_twiddle()
      {
      sincos_2pibyn<T0> twid(length);
      size_t l1=1;
      T0 *ptr=mem.data();
      for (size_t k=0; k<fact.size(); ++k)
        {
        size_t ip=fact[k].fct, ido=length/(l1*ip);
        if (k<fact.size()-1) // last factor doesn't need twiddles
          {
          fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
          for (size_t j=1; j<ip; ++j)
            for (size_t i=1; i<=(ido-1)/2; ++i)
              {
              fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
              fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
              }
          }
        if (ip>5) // special factors required by *g functions
          {
          fact[k].tws=ptr; ptr+=2*ip;
          fact[k].tws[0] = 1.;
          fact[k].tws[1] = 0.;
          for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
            {
            fact[k].tws[i  ] = twid[i/2*(length/ip)].r;
            fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
            fact[k].tws[ic]   = twid[i/2*(length/ip)].r;
            fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
            }
          }
        l1*=ip;
        }
      }

  public:
    // Sets up the object for real transforms of size `length_`.
    // Throws std::runtime_error for a zero length.  For length 1 nothing else
    // is prepared; otherwise the factor list and twiddle tables are built.
    POCKETFFT_NOINLINE rfftp(size_t length_)
      : length(length_)
      {
      if (length==0)
        throw std::runtime_error("zero-length FFT requested");
      if (length!=1)
        {
        factorize();
        mem.resize(twsize());
        comp_twiddle();
        }
      }
};

//
// complex Bluestein transforms
//

// Bluestein-type transform of length n, carried out with a complex cfftp plan
// of the padded length n2 = util::good_size_cmplx(2*n-1).
template<typename T0> class fftblue
  {
  private:
    size_t n, n2;        // requested length and padded plan length
    cfftp<T0> plan;      // complex plan of length n2
    arr<cmplx<T0>> mem;  // storage for bk (n entries) followed by bkf (n2/2+1 entries)
    cmplx<T0> *bk, *bkf; // bk: factors b_k; bkf: first n2/2+1 values of the transformed, padded b_k

    // Transforms the n complex values in c in place and scales them by fct.
    // `fwd` selects the direction through the special_mul template argument.
    template<bool fwd, typename T> void fft(cmplx<T> c[], T0 fct) const
      {
      arr<cmplx<T>> akf(n2);

      /* initialize a_k and FFT it */
      for (size_t m=0; m<n; ++m)
        special_mul<fwd>(c[m],bk[m],akf[m]);
      // zero of the element type, obtained by multiplication so that it also
      // works for element types that are not plain scalars
      auto zero = akf[0]*T0(0);
      for (size_t m=n; m<n2; ++m)
        akf[m]=zero;

      plan.exec (akf.data(),1.,true);

      /* do the convolution */
      // only bkf[0..n2/2] is stored; index n2-m reuses bkf[m]
      akf[0] = akf[0].template special_mul<!fwd>(bkf[0]);
      for (size_t m=1; m<(n2+1)/2; ++m)
        {
        akf[m] = akf[m].template special_mul<!fwd>(bkf[m]);
        akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]);
        }
      if ((n2&1)==0)
        akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]);

      /* inverse FFT */
      plan.exec (akf.data(),1.,false);

      /* multiply by b_k */
      for (size_t m=0; m<n; ++m)
        c[m] = akf[m].template special_mul<fwd>(bk[m])*fct;
      }

  public:
    // Precomputes bk and bkf for transforms of size `length`.
    POCKETFFT_NOINLINE fftblue(size_t length)
      : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
        bk(mem.data()), bkf(mem.data()+n)
      {
      /* initialize b_k */
      sincos_2pibyn<T0> tmp(2*n);
      bk[0].Set(1, 0);

      // coeff is the running sum 1+3+...+(2m-1) = m*m, kept reduced modulo 2n
      size_t coeff=0;
      for (size_t m=1; m<n; ++m)
        {
        coeff+=2*m-1;
        if (coeff>=2*n) coeff-=2*n;
        bk[m] = tmp[coeff];
        }

      /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
      arr<cmplx<T0>> tbkf(n2);
      T0 xn2 = T0(1)/T0(n2);
      tbkf[0] = bk[0]*xn2;
      for (size_t m=1; m<n; ++m)
        tbkf[m] = tbkf[n2-m] = bk[m]*xn2;
      for (size_t m=n;m<=(n2-n);++m)
        tbkf[m].Set(0.,0.);
      plan.exec(tbkf.data(),1.,true);
      for (size_t i=0; i<n2/2+1; ++i)
        bkf[i] = tbkf[i];
      }

    // Complex transform of the n values in c, in place, scaled by fct.
    template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const
      { fwd ? fft<true>(c,fct) : fft<false>(c,fct); }

    // Real-data transform of the n values in c, in place, scaled by fct.
    //   fwd==true : c holds n real samples; afterwards c[0] is the real part of
    //               output 0, followed by the first n-1 scalars of the
    //               interleaved (r,i) data starting at output 1.
    //   fwd==false: c is expected in that same packed layout; afterwards it
    //               holds n real values.
    // Declared const like exec(): it only reads members and calls the const
    // fft<>().
    template<typename T> void exec_r(T c[], T0 fct, bool fwd) const
      {
      arr<cmplx<T>> tmp(n);
      if (fwd)
        {
        auto zero = T0(0)*c[0];
        for (size_t m=0; m<n; ++m)
          tmp[m].Set(c[m], zero);
        fft<true>(tmp.data(),fct);
        c[0] = tmp[0].r;
        // copies n-1 consecutive scalars starting at tmp[1].r; this relies on
        // cmplx<T> storing r and i contiguously (defined elsewhere -- confirm)
        std::copy_n (&tmp[1].r, n-1, &c[1]);
        }
      else
        {
        tmp[0].Set(c[0],c[0]*0);
        std::copy_n (c+1, n-1, &tmp[1].r);
        // even n: the middle element gets a zero imaginary part
        if ((n&1)==0) tmp[n/2].i=T0(0)*c[0];
        // fill the upper half with the conjugates of the lower half
        for (size_t m=1; 2*m<n; ++m)
          tmp[n-m].Set(tmp[m].r, -tmp[m].i);
        fft<false>(tmp.data(),fct);
        for (size_t m=0; m<n; ++m)
          c[m] = tmp[m].r;
        }
      }
  };

//
// flexible (FFTPACK/Bluestein) complex 1D transform
//

// Complex 1D plan that owns either a cfftp plan or an fftblue plan, chosen
// once at construction time.
template<typename T0> class pocketfft_c
  {
  private:
    std::unique_ptr<cfftp<T0>> packplan;   // set when the cfftp path is chosen
    std::unique_ptr<fftblue<T0>> blueplan; // set when the Bluestein path is chosen
    size_t len;

  public:
    // Throws std::runtime_error for a zero length.
    // Lengths below 50, and lengths whose largest prime factor squared does
    // not exceed the length, always use cfftp.  Otherwise the two cost
    // estimates decide.
    POCKETFFT_NOINLINE pocketfft_c(size_t length)
      : len(length)
      {
      if (length==0) throw std::runtime_error("zero-length FFT requested");
      const size_t lpf = (length<50) ? 0 : util::largest_prime_factor(length);
      bool use_bluestein = false;
      if (lpf*lpf > length)
        {
        double cost_pack = util::cost_guess(length);
        double cost_blue = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
        cost_blue*=1.5; /* fudge factor that appears to give good overall performance */
        use_bluestein = cost_blue<cost_pack;
        }
      if (use_bluestein)
        blueplan.reset(new fftblue<T0>(length));
      else
        packplan.reset(new cfftp<T0>(length));
      }

    // Forwards to whichever plan was created by the constructor.
    template<typename T> POCKETFFT_NOINLINE void exec(cmplx<T> c[], T0 fct, bool fwd) const
      {
      if (packplan)
        packplan->exec(c,fct,fwd);
      else
        blueplan->exec(c,fct,fwd);
      }

    size_t length() const { return len; }
  };

//
// flexible (FFTPACK/Bluestein) real-valued 1D transform
//

// Real-valued 1D plan that owns either an rfftp plan or an fftblue plan,
// chosen once at construction time.
template<typename T0> class pocketfft_r
  {
  private:
    std::unique_ptr<rfftp<T0>> packplan;   // set when the rfftp path is chosen
    std::unique_ptr<fftblue<T0>> blueplan; // set when the Bluestein path is chosen
    size_t len;

  public:
    // Throws std::runtime_error for a zero length.
    // Lengths below 50, and lengths whose largest prime factor squared does
    // not exceed the length, always use rfftp.  Otherwise the two cost
    // estimates decide; the rfftp estimate is halved here.
    POCKETFFT_NOINLINE pocketfft_r(size_t length)
      : len(length)
      {
      if (length==0) throw std::runtime_error("zero-length FFT requested");
      const size_t lpf = (length<50) ? 0 : util::largest_prime_factor(length);
      bool use_bluestein = false;
      if (lpf*lpf > length)
        {
        double cost_pack = 0.5*util::cost_guess(length);
        double cost_blue = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
        cost_blue*=1.5; /* fudge factor that appears to give good overall performance */
        use_bluestein = cost_blue<cost_pack;
        }
      if (use_bluestein)
        blueplan.reset(new fftblue<T0>(length));
      else
        packplan.reset(new rfftp<T0>(length));
      }

    // Forwards to rfftp::exec or fftblue::exec_r, depending on the plan that
    // was created by the constructor.
    template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const
      {
      if (packplan)
        packplan->exec(c,fct,fwd);
      else
        blueplan->exec_r(c,fct,fwd);
      }

    size_t length() const { return len; }
  };


//
// sine/cosine transforms
//

// Transform of `length` values computed through a real FFT of size
// 2*(length-1) applied to the symmetrically extended input.
template<typename T0> class T_dct1
  {
  private:
    pocketfft_r<T0> fftplan;

  public:
    POCKETFFT_NOINLINE T_dct1(size_t length)
      : fftplan(2*(length-1)) {}

    // Transforms the length() values in c in place, scaled by fct.
    // With `ortho` set, the first and last element are multiplied by sqrt(2)
    // before and by sqrt(2)/2 after the transform.  The last two parameters
    // are unused.
    template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
      int /*type*/, bool /*cosine*/) const
      {
      constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
      const size_t fftlen=fftplan.length();
      const size_t npts=fftlen/2+1;
      if (ortho)
        {
        c[0]*=sqrt2;
        c[npts-1]*=sqrt2;
        }
      // build the mirrored sequence c[0], c[1], ..., c[npts-1], ..., c[1]
      arr<T> ext(fftlen);
      ext[0] = c[0];
      for (size_t i=1; i<npts; ++i)
        {
        ext[fftlen-i] = c[i];
        ext[i] = ext[fftlen-i];
        }
      fftplan.exec(ext.data(), fct, true);
      // keep ext[0] and then every second entry starting at index 1
      c[0] = ext[0];
      for (size_t i=1; i<npts; ++i)
        c[i] = ext[2*i-1];
      if (ortho)
        {
        c[0]*=sqrt2*T0(0.5);
        c[npts-1]*=sqrt2*T0(0.5);
        }
      }

    size_t length() const { return fftplan.length()/2+1; }
  };

// Transform of `length` values computed through a real FFT of size
// 2*(length+1) applied to the antisymmetrically extended input.
template<typename T0> class T_dst1
  {
  private:
    pocketfft_r<T0> fftplan;

  public:
    POCKETFFT_NOINLINE T_dst1(size_t length)
      : fftplan(2*(length+1)) {}

    // Transforms the length() values in c in place, scaled by fct.
    // The last three parameters are unused.
    template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
      bool /*ortho*/, int /*type*/, bool /*cosine*/) const
      {
      const size_t fftlen=fftplan.length();
      const size_t npts=fftlen/2-1;
      // build the sequence 0, c[0..npts-1], 0, -c[npts-1..0]
      arr<T> ext(fftlen);
      ext[0] = ext[npts+1] = c[0]*0;
      for (size_t i=0; i<npts; ++i)
        {
        ext[i+1] = c[i];
        ext[fftlen-1-i] = -c[i];
        }
      fftplan.exec(ext.data(), fct, true);
      // keep the negated entries at the even indices 2, 4, ...
      for (size_t i=0; i<npts; ++i)
        c[i] = -ext[2*i+2];
      }

    size_t length() const { return fftplan.length()/2-1; }
  };

// Shared plan for cosine and sine transforms of type 2 and 3. dct() and
// dst() select this class for every transform type other than 1 and 4.
// The work is done by a real FFT of the same length, combined with a
// pre- or post-processing pass that uses a table of twiddle factors.
template<typename T0> class T_dcst23
  {
  private:
    // real FFT plan with the same length as the transform
    pocketfft_r<T0> fftplan;
    // twiddle[i] holds the real part of entry i+1 of a sincos_2pibyn table
    // of size 4*length
    std::vector<T0> twiddle;

  public:
    // Builds the FFT plan and the twiddle table for `length` points.
    POCKETFFT_NOINLINE T_dcst23(size_t length)
      : fftplan(length), twiddle(length)
      {
      sincos_2pibyn<T0> tw(4*length);
      for (size_t i=0; i<length; ++i)
        twiddle[i] = tw[i+1].r;
      }

    // Transforms the length() entries of `c` in place.
    //   fct    - scaling factor handed on to the real FFT
    //   ortho  - if true, c[0] is additionally scaled: by sqrt(2)/2 after a
    //            type 2 transform, by sqrt(2) before any other type
    //   type   - 2 selects the first branch; every other value the second
    //   cosine - true for the cosine variant; false adds the sign flips and
    //            the reversal that turn it into the sine variant
    template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
      int type, bool cosine) const
      {
      constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
      size_t N=length();
      size_t NS2 = (N+1)/2;
      if (type==2)
        {
        // sine variant: negate the odd-indexed inputs
        if (!cosine)
          for (size_t k=1; k<N; k+=2)
            c[k] = -c[k];
        // rearrange the input for the real FFT (called with fwd=false)
        c[0] *= 2;
        if ((N&1)==0) c[N-1]*=2;
        for (size_t k=1; k<N-1; k+=2)
          MPINPLACE(c[k+1], c[k]);
        fftplan.exec(c, fct, false);
        // combine entries k and N-k with the twiddle factors
        for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
          {
          T t1 = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k];
          T t2 = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc];
          c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
          }
        // even N: the middle entry has no partner
        if ((N&1)==0)
          c[NS2] *= twiddle[NS2-1];
        // sine variant: reverse the output order
        if (!cosine)
          for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
            std::swap(c[k], c[kc]);
        if (ortho) c[0]*=sqrt2*T0(0.5);
        }
      else
        {
        // same steps as above, inverted and applied in reverse order
        if (ortho) c[0]*=sqrt2;
        // sine variant: reverse the input order
        if (!cosine)
          for (size_t k=0, kc=N-1; k<NS2; ++k, --kc)
            std::swap(c[k], c[kc]);
        for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
          {
          T t1=c[k]+c[kc], t2=c[k]-c[kc];
          c[k] = twiddle[k-1]*t2+twiddle[kc-1]*t1;
          c[kc]= twiddle[k-1]*t1-twiddle[kc-1]*t2;
          }
        // even N: the middle entry has no partner
        if ((N&1)==0)
          c[NS2] *= 2*twiddle[NS2-1];
        fftplan.exec(c, fct, true);
        for (size_t k=1; k<N-1; k+=2)
          MPINPLACE(c[k], c[k+1]);
        // sine variant: negate the odd-indexed outputs
        if (!cosine)
          for (size_t k=1; k<N; k+=2)
            c[k] = -c[k];
        }
      }

    // Number of points of the transform (equal to the FFT length).
    size_t length() const { return fftplan.length(); }
  };

// Plan for cosine and sine transforms of type 4 (dct() and dst() select this
// class for type==4). Odd lengths are handled with a real FFT of length N,
// even lengths with a complex FFT of length N/2 plus a table of complex
// factors; only the plan needed for the given parity is allocated.
template<typename T0> class T_dcst4
  {
  private:
    // transform length
    size_t N;
    // complex FFT of length N/2; null when N is odd
    std::unique_ptr<pocketfft_c<T0>> fft;
    // real FFT of length N; null when N is even
    std::unique_ptr<pocketfft_r<T0>> rfft;
    // factors for the even-length algorithm (N/2 entries; empty when N is odd)
    arr<cmplx<T0>> C2;

  public:
    // Allocates the plan matching the parity of `length` and, for even
    // lengths, fills C2 with the conjugates of entries 1, 9, 17, ... of a
    // sincos_2pibyn table of size 16*N.
    POCKETFFT_NOINLINE T_dcst4(size_t length)
      : N(length),
        fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)),
        rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
        C2((N&1) ? 0 : N/2)
      {
      if ((N&1)==0)
        {
        sincos_2pibyn<T0> tw(16*N);
        for (size_t i=0; i<N/2; ++i)
          C2[i] = conj(tw[8*i+1]);
        }
      }

    // Transforms the N entries of `c` in place.
    //   fct    - scaling factor handed on to the underlying FFT
    //   cosine - true for the cosine variant; false reverses the input and
    //            negates the odd-indexed outputs to obtain the sine variant
    // The `ortho` and `type` arguments are unused.
    template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
      bool /*ortho*/, int /*type*/, bool cosine) const
      {
      size_t n2 = N/2;
      // sine variant: reverse the input order
      if (!cosine)
        for (size_t k=0, kc=N-1; k<n2; ++k, --kc)
          std::swap(c[k], c[kc]);
      if (N&1)
        {
        // The following code is derived from the FFTW3 function apply_re11()
        // and is released under the 3-clause BSD license with friendly
        // permission of Matteo Frigo and Steven G. Johnson.

        // y receives a signed permutation of the input: starting at index
        // n2 and stepping by 4, the index is folded back into [0,N) with
        // the sign pattern +,-,-,+,+
        arr<T> y(N);
        {
        size_t i=0, m=n2;
        for (; m<N; ++i, m+=4)
          y[i] = c[m];
        for (; m<2*N; ++i, m+=4)
          y[i] = -c[2*N-m-1];
        for (; m<3*N; ++i, m+=4)
          y[i] = -c[m-2*N];
        for (; m<4*N; ++i, m+=4)
          y[i] = c[4*N-m-1];
        for (; i<N; ++i, m+=4)
          y[i] = c[m-4*N];
        }
        rfft->exec(y.data(), fct, true);
        {
        // returns -sqrt(2) if bit 1 of i is set, +sqrt(2) otherwise
        auto SGN = [](size_t i)
           {
           constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
           return (i&2) ? -sqrt2 : sqrt2;
           };
        c[n2] = y[0]*SGN(n2+1);
        size_t i=0, i1=1, k=1;
        // each iteration produces four output values from two consecutive
        // pairs of entries of y
        for (; k<n2; ++i, ++i1, k+=2)
          {
          c[i    ] = y[2*k-1]*SGN(i1)     + y[2*k  ]*SGN(i);
          c[N -i1] = y[2*k-1]*SGN(N -i)   - y[2*k  ]*SGN(N -i1);
          c[n2-i1] = y[2*k+1]*SGN(n2-i)   - y[2*k+2]*SGN(n2-i1);
          c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1);
          }
        // one pair of entries is left over when n2 is odd
        if (k == n2)
          {
          c[i   ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i);
          c[N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1);
          }
        }

        // FFTW-derived code ends here
        }
      else
        {
        // even length algorithm from
        // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/

        // pack the even-indexed inputs (front to back) and the odd-indexed
        // inputs (back to front) into complex numbers and pre-multiply by C2
        arr<cmplx<T>> y(n2);
        for(size_t i=0; i<n2; ++i)
          {
          y[i].Set(c[2*i],c[N-1-2*i]);
          y[i] *= C2[i];
          }
        fft->exec(y.data(), fct, true);
        // post-multiply by C2 and unpack: even outputs come from y[i], odd
        // outputs from the mirrored entry y[n2-1-i]
        for(size_t i=0, ic=n2-1; i<n2; ++i, --ic)
          {
          c[2*i  ] =  2*(y[i ].r*C2[i ].r-y[i ].i*C2[i ].i);
          c[2*i+1] = -2*(y[ic].i*C2[ic].r+y[ic].r*C2[ic].i);
          }
        }
      // sine variant: negate the odd-indexed outputs
      if (!cosine)
        for (size_t k=1; k<N; k+=2)
          c[k] = -c[k];
      }

    // Number of points of the transform.
    size_t length() const { return N; }
  };


//
// multi-D infrastructure
//

// Returns a plan of type T for transforms of size `length`.
//
// If POCKETFFT_CACHE_SIZE is 0, a new plan is built on every call.
// Otherwise up to POCKETFFT_CACHE_SIZE plans are kept per plan type T in a
// function-local cache that is protected by a mutex; when the cache is full
// the least recently used entry is replaced.
template<typename T> std::shared_ptr<T> get_plan(size_t length)
  {
#if POCKETFFT_CACHE_SIZE==0
  return std::make_shared<T>(length);
#else
  constexpr size_t nmax=POCKETFFT_CACHE_SIZE;
  static std::array<std::shared_ptr<T>, nmax> cache;
  static std::array<size_t, nmax> last_access{{0}};
  static size_t access_counter = 0;
  static std::mutex mut;

  // Marks cache slot `slot` as the most recently used one. Used by both the
  // lookup and the insertion path, so that the wrap-around handling of
  // access_counter is applied consistently.
  auto touch = [&](size_t slot)
    {
    last_access[slot] = ++access_counter;
    // Guard against overflow
    if (access_counter == 0)
      last_access.fill(0);
    };

  // Returns the cached plan for `length`, or null if there is none.
  // Must be called with `mut` held.
  auto find_in_cache = [&]() -> std::shared_ptr<T>
    {
    for (size_t i=0; i<nmax; ++i)
      if (cache[i] && (cache[i]->length()==length))
        {
        // no need to update if this is already the most recent entry
        if (last_access[i]!=access_counter)
          touch(i);
        return cache[i];
        }

    return nullptr;
    };

  {
  std::lock_guard<std::mutex> lock(mut);
  auto p = find_in_cache();
  if (p) return p;
  }
  // the plan is built without holding the lock
  auto plan = std::make_shared<T>(length);
  {
  std::lock_guard<std::mutex> lock(mut);
  // the same plan may have been inserted while the lock was released
  auto p = find_in_cache();
  if (p) return p;

  // replace the entry with the oldest access stamp
  size_t lru = 0;
  for (size_t i=1; i<nmax; ++i)
    if (last_access[i] < last_access[lru])
      lru = i;

  cache[lru] = plan;
  touch(lru);
  }
  return plan;
#endif
  }

// Shape and stride description of a multi-dimensional array, without any
// reference to the data itself.
class arr_info
  {
  protected:
    shape_t shp;    // extent of every axis
    stride_t str;   // stride of every axis

  public:
    // Stores copies of the given shape and strides.
    arr_info(const shape_t &shape_, const stride_t &stride_)
      : shp(shape_), str(stride_) {}

    // Number of axes.
    size_t ndim() const
      {
      return shp.size();
      }

    // Total number of elements (product of all extents).
    size_t size() const
      {
      return util::prod(shp);
      }

    // All extents.
    const shape_t &shape() const
      {
      return shp;
      }

    // Extent of axis i.
    size_t shape(size_t i) const
      {
      return shp[i];
      }

    // All strides.
    const stride_t &stride() const
      {
      return str;
      }

    // Stride of axis i.
    const ptrdiff_t &stride(size_t i) const
      {
      return str[i];
      }
  };

// Read-only view of a multi-dimensional array of T. Elements are addressed
// by an offset in bytes relative to the start of the data.
template<typename T> class cndarr: public arr_info
  {
  protected:
    const char *d;   // start of the data, as a byte pointer

  public:
    // Wraps the memory at `data_` with the given shape and strides.
    cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_)
      : arr_info(shape_, stride_),
        d(static_cast<const char *>(data_)) {}

    // Element located `ofs` bytes after the start of the data.
    const T &operator[](ptrdiff_t ofs) const
      {
      const char *raw = d+ofs;
      return *reinterpret_cast<const T *>(raw);
      }
  };

// Writable view of a multi-dimensional array of T; adds mutable element
// access to cndarr<T>.
template<typename T> class ndarr: public cndarr<T>
  {
  public:
    // Wraps the writable memory at `data_` with the given shape and strides.
    ndarr(void *data_, const shape_t &shape_, const stride_t &stride_)
      : cndarr<T>(data_, shape_, stride_)
      {}

    // Writable element located `ofs` bytes after the start of the data.
    T &operator[](ptrdiff_t ofs)
      {
      char *raw = const_cast<char *>(this->d)+ofs;
      return *reinterpret_cast<T *>(raw);
      }
  };

// Iterates over all 1-D lines of an input/output array pair that run along
// axis `idim`. Up to N lines can be fetched per advance() call; for each of
// them the offsets of the line start in the input and output array are
// recorded. When several threads are active (as reported by
// threading::num_threads()), every thread only visits its own contiguous
// share of the lines.
template<size_t N> class multi_iter
  {
  private:
    // current index along every axis (the entry for idim is never changed)
    shape_t pos;
    // descriptions of the input and output array (held by reference)
    const arr_info &iarr, &oarr;
    // p_ii/p_oi: offset of the next line start in the input/output array
    // p_i/p_o:   offsets of the lines fetched by the last advance() call
    // str_i/str_o: stride along idim in the input/output array
    ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o;
    // idim: axis along which the lines run; rem: lines still to be visited
    size_t idim, rem;

    // Moves to the start of the next line: increments the multi-index like
    // an odometer (last axis fastest, idim skipped) and keeps both offsets
    // in sync.
    void advance_i()
      {
      for (int i_=int(pos.size())-1; i_>=0; --i_)
        {
        auto i = size_t(i_);
        if (i==idim) continue;
        p_ii += iarr.stride(i);
        p_oi += oarr.stride(i);
        if (++pos[i] < iarr.shape(i))
          return;
        // axis i wrapped around: rewind it and carry into the next axis
        pos[i] = 0;
        p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i);
        p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i);
        }
      }

  public:
    // Prepares the iteration over all lines along axis `idim_`. With more
    // than one thread, the lines are divided into num_threads() contiguous
    // shares and the iterator is positioned at the start of the share
    // belonging to threading::thread_id().
    // Throws std::runtime_error if the thread count is zero or the thread id
    // is not smaller than the thread count.
    multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_)
      : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0),
        str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
        idim(idim_), rem(iarr.size()/iarr.shape(idim))
      {
      auto nshares = threading::num_threads();
      if (nshares==1) return;
      if (nshares==0) throw std::runtime_error("can't run with zero threads");
      auto myshare = threading::thread_id();
      if (myshare>=nshares) throw std::runtime_error("impossible share requested");
      // the first `additional` shares get one line more than the others
      size_t nbase = rem/nshares;
      size_t additional = rem%nshares;
      size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
      size_t hi = lo+nbase+(myshare<additional);
      size_t todo = hi-lo;

      // convert the linear line index `lo` into a multi-index and offsets
      size_t chunk = rem;
      for (size_t i=0; i<pos.size(); ++i)
        {
        if (i==idim) continue;
        chunk /= iarr.shape(i);
        size_t n_advance = lo/chunk;
        pos[i] += n_advance;
        p_ii += ptrdiff_t(n_advance)*iarr.stride(i);
        p_oi += ptrdiff_t(n_advance)*oarr.stride(i);
        lo -= n_advance*chunk;
        }
      rem = todo;
      }
    // Fetches the next n lines; their offsets become available through
    // iofs()/oofs(). n must not exceed N, since the offsets are stored in
    // arrays of size N. Throws std::runtime_error("underrun") if fewer than
    // n lines remain.
    void advance(size_t n)
      {
      if (rem<n) throw std::runtime_error("underrun");
      for (size_t i=0; i<n; ++i)
        {
        p_i[i] = p_ii;
        p_o[i] = p_oi;
        advance_i();
        }
      rem -= n;
      }
    // Input offset of element i of the first fetched line.
    ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; }
    // Input offset of element i of fetched line j.
    ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; }
    // Output offset of element i of the first fetched line.
    ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; }
    // Output offset of element i of fetched line j.
    ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; }
    // Line length in the input array.
    size_t length_in() const { return iarr.shape(idim); }
    // Line length in the output array.
    size_t length_out() const { return oarr.shape(idim); }
    // Stride along the line in the input array.
    ptrdiff_t stride_in() const { return str_i; }
    // Stride along the line in the output array.
    ptrdiff_t stride_out() const { return str_o; }
    // Number of lines that have not been fetched yet.
    size_t remaining() const { return rem; }
  };

// Visits every element of an array once, with the last axis varying fastest,
// and reports the offset of the current element.
class simple_iter
  {
  private:
    shape_t pos;            // current index along every axis
    const arr_info &arr;    // description of the array being traversed
    ptrdiff_t p;            // offset of the current element
    size_t rem;             // number of elements not yet visited

  public:
    // Starts at the first element of `arr_`.
    simple_iter(const arr_info &arr_)
      : pos(arr_.ndim(), 0), arr(arr_), p(0), rem(arr_.size()) {}

    // Moves to the next element.
    void advance()
      {
      --rem;
      for (size_t ax=pos.size(); ax-- > 0; )
        {
        const ptrdiff_t step = arr.stride(ax);
        p += step;
        ++pos[ax];
        if (pos[ax] < arr.shape(ax))
          return;
        // this axis wrapped around: rewind it and carry into the next one
        pos[ax] = 0;
        p -= ptrdiff_t(arr.shape(ax))*step;
        }
      }

    // Offset of the current element.
    ptrdiff_t ofs() const
      {
      return p;
      }

    // Number of elements not yet visited.
    size_t remaining() const
      {
      return rem;
      }
  };

// Iterates over an array in which the last entry of `axes` is restricted to
// its first shape/2+1 indices. Besides the offset of the current element it
// reports the offset of the "mirrored" element, i.e. the element whose index
// is replaced by (shape-index) modulo shape along every axis listed in
// `axes`, and left unchanged along all other axes.
class rev_iter
  {
  private:
    // current index along every axis
    shape_t pos;
    // description of the array being traversed
    const arr_info &arr;
    // rev_axis[i] != 0: axis i is mirrored
    std::vector<char> rev_axis;
    // rev_jump[i] != 0: the index along mirrored axis i is currently 0, so
    // the next step must jump from mirrored index 0 to shape-1
    std::vector<char> rev_jump;
    // last entry of `axes` and the reduced extent (shape/2+1) along it
    size_t last_axis, last_size;
    // shape that is actually traversed (arr.shape() with the reduced extent)
    shape_t shp;
    // offset of the current element and of its mirrored counterpart
    ptrdiff_t p, rp;
    // number of positions not yet visited
    size_t rem;

  public:
    // Starts at index 0 along every axis; `axes` must not be empty, since
    // axes.back() is used.
    rev_iter(const arr_info &arr_, const shape_t &axes)
      : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0),
        rev_jump(arr_.ndim(), 1), p(0), rp(0)
      {
      for (auto ax: axes)
        rev_axis[ax]=1;
      last_axis = axes.back();
      last_size = arr.shape(last_axis)/2 + 1;
      shp = arr.shape();
      shp[last_axis] = last_size;
      rem=1;
      for (auto i: shp)
        rem *= i;
      }
    // Moves to the next position (last axis fastest) and updates both
    // offsets.
    void advance()
      {
      --rem;
      for (int i_=int(pos.size())-1; i_>=0; --i_)
        {
        auto i = size_t(i_);
        p += arr.stride(i);
        if (!rev_axis[i])
          rp += arr.stride(i);
        else
          {
          // the mirrored index runs backwards ...
          rp -= arr.stride(i);
          // ... and wraps from 0 to shape-1 on the first step
          if (rev_jump[i])
            {
            rp += ptrdiff_t(arr.shape(i))*arr.stride(i);
            rev_jump[i] = 0;
            }
          }
        if (++pos[i] < shp[i])
          return;
        // axis i wrapped around: reset both offsets to index 0 along it
        // and carry into the next axis
        pos[i] = 0;
        p -= ptrdiff_t(shp[i])*arr.stride(i);
        if (rev_axis[i])
          {
          rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i);
          rev_jump[i] = 1;
          }
        else
          rp -= ptrdiff_t(shp[i])*arr.stride(i);
        }
      }
    // Offset of the current element.
    ptrdiff_t ofs() const { return p; }
    // Offset of the mirrored element.
    ptrdiff_t rev_ofs() const { return rp; }
    // Number of positions not yet visited.
    size_t remaining() const { return rem; }
  };

// Maps a scalar type T to the SIMD vector type used for processing several
// transforms at once. The primary template has no `type` member; the
// specializations below are only available when POCKETFFT_NO_VECTORS is not
// defined.
template<typename T> struct VTYPE {};
// Shorthand for the vector type belonging to T.
template <typename T> using vtype_t = typename VTYPE<T>::type;

#ifndef POCKETFFT_NO_VECTORS
// Each specialization declares a vector of VLEN<T>::val elements of T via
// the vector_size attribute.
template<> struct VTYPE<float>
  {
  using type = float __attribute__ ((vector_size (VLEN<float>::val*sizeof(float))));
  };
template<> struct VTYPE<double>
  {
  using type = double __attribute__ ((vector_size (VLEN<double>::val*sizeof(double))));
  };
template<> struct VTYPE<long double>
  {
  using type = long double __attribute__ ((vector_size (VLEN<long double>::val*sizeof(long double))));
  };
#endif

// Allocates scratch space for transforms along a single axis.
//   shape    - shape of the array being transformed
//   axsize   - length of the transformed axis
//   elemsize - size in bytes of one array element
// The buffer holds VLEN<T>::val lines of length axsize if the array contains
// at least that many lines, and a single line otherwise.
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
  size_t axsize, size_t elemsize)
  {
  const auto nlines = util::prod(shape)/axsize;
  size_t lanes = 1;
  if (nlines>=VLEN<T>::val)
    lanes = VLEN<T>::val;
  const auto nelem = axsize*lanes;
  return arr<char>(nelem*elemsize);
  }
// Allocates scratch space that is large enough for transforms along any of
// the given axes.
//   shape    - shape of the array being transformed
//   axes     - axes along which transforms will be carried out
//   elemsize - size in bytes of one array element
// For every axis the required element count is the axis length, multiplied
// by VLEN<T>::val if the array contains at least that many lines along the
// axis; the buffer is sized for the largest of these counts.
template<typename T> arr<char> alloc_tmp(const shape_t &shape,
  const shape_t &axes, size_t elemsize)
  {
  const size_t total=util::prod(shape);
  size_t needed=0;
  for (const auto ax : axes)
    {
    const auto axsize = shape[ax];
    const auto nlines = total/axsize;
    size_t lanes = 1;
    if (nlines>=VLEN<T>::val)
      lanes = VLEN<T>::val;
    const auto candidate = axsize*lanes;
    if (candidate>needed)
      needed = candidate;
    }
  return arr<char>(needed*elemsize);
  }

template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
  const cndarr<cmplx<T>> &src, cmplx<vtype_t<T>> *POCKETFFT_RESTRICT dst)
  {
  for (size_t i=0; i<it.length_in(); ++i)
    for (size_t j=0; j<vlen; ++j)
      {
      dst[i].r[j] = src[it.iofs(j,i)].r;
      dst[i].i[j] = src[it.iofs(j,i)].i;
      }
  }

template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
  const cndarr<T> &src, vtype_t<T> *POCKETFFT_RESTRICT dst)
  {
  for (size_t i=0; i<it.length_in(); ++i)
    for (size_t j=0; j<vlen; ++j)
      dst[i][j] = src[it.iofs(j,i)];
  }

// Copies the single input line currently selected by `it` into the
// contiguous buffer `dst`. Nothing is copied if `dst` already is the start
// of that line (in-place operation).
template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
  const cndarr<T> &src, T *POCKETFFT_RESTRICT dst)
  {
  const T *line_start = &src[it.iofs(0)];
  if (dst == line_start)
    return;  // in-place
  const size_t len = it.length_in();
  for (size_t idx=0; idx<len; ++idx)
    dst[idx] = src[it.iofs(idx)];
  }

template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
  const cmplx<vtype_t<T>> *POCKETFFT_RESTRICT src, ndarr<cmplx<T>> &dst)
  {
  for (size_t i=0; i<it.length_out(); ++i)
    for (size_t j=0; j<vlen; ++j)
      dst[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
  }

template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
  const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
  {
  for (size_t i=0; i<it.length_out(); ++i)
    for (size_t j=0; j<vlen; ++j)
      dst[it.oofs(j,i)] = src[i][j];
  }

// Copies the contiguous buffer `src` to the single output line currently
// selected by `it`. Nothing is copied if `src` already is the start of that
// line (in-place operation).
template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
  const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
  {
  const T *line_start = &dst[it.oofs(0)];
  if (src == line_start)
    return;  // in-place
  const size_t len = it.length_out();
  for (size_t idx=0; idx<len; ++idx)
    dst[it.oofs(idx)] = src[idx];
  }

// Maps an element type to its vectorized counterpart: a scalar T becomes
// vtype_t<T>, and cmplx<T> becomes cmplx<vtype_t<T>>.
template <typename T> struct add_vec { using type = vtype_t<T>; };
template <typename T> struct add_vec<cmplx<T>>
  { using type = cmplx<vtype_t<T>>; };
// Shorthand for the vectorized counterpart of T.
template <typename T> using add_vec_t = typename add_vec<T>::type;

// Applies a 1-D transform along each of the given axes, one axis after the
// other.
//   Tplan         - plan type; obtained through get_plan<Tplan>(len)
//   in, out       - input and output array; the first axis reads from `in`,
//                   every further axis reads from (and overwrites) `out`
//   axes          - axes to transform, in processing order
//   fct           - scaling factor; applied during the first axis only
//   nthreads      - thread count request handed to util::thread_count()
//   exec          - functor carrying out a single (possibly vectorized)
//                   1-D transform, including the copies from and to the arrays
//   allow_inplace - if true, a scalar transform works directly in the output
//                   array whenever the output stride along the axis equals
//                   sizeof(T)
template<typename Tplan, typename T, typename T0, typename Exec>
POCKETFFT_NOINLINE void general_nd(const cndarr<T> &in, ndarr<T> &out,
  const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec,
  const bool allow_inplace=true)
  {
  std::shared_ptr<Tplan> plan;

  for (size_t iax=0; iax<axes.size(); ++iax)
    {
    size_t len=in.shape(axes[iax]);
    // keep the previous plan if this axis has the same length
    if ((!plan) || (len!=plan->length()))
      plan = get_plan<Tplan>(len);

    threading::thread_map(
      util::thread_count(nthreads, in.shape(), axes[iax], VLEN<T>::val),
      [&] {
        constexpr auto vlen = VLEN<T0>::val;
        // per-thread scratch buffer
        auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
        const auto &tin(iax==0? in : out);
        multi_iter<vlen> it(tin, out, axes[iax]);
#ifndef POCKETFFT_NO_VECTORS
        // vectorized part: vlen lines per transform
        if (vlen>1)
          while (it.remaining()>=vlen)
            {
            it.advance(vlen);
            auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
            exec(it, tin, out, tdatav, *plan, fct);
            }
#endif
        // remaining lines are processed one at a time
        while (it.remaining()>0)
          {
          it.advance(1);
          // work directly in the output line if permitted and contiguous,
          // otherwise in the scratch buffer
          auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
            &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
          exec(it, tin, out, buf, *plan, fct);
          }
      });  // end of parallel region
    fct = T0(1); // factor has been applied, use 1 for remaining axes
    }
  }

// Functor for general_nd() performing one complex-to-complex transform.
struct ExecC2C
  {
  // direction flag handed to the plan
  bool forward;

  // Copies the line(s) selected by `it` from `in` into `buf`, transforms
  // `buf` with `plan` (scaling by `fct`) and copies the result to `out`.
  template <typename T0, typename T, size_t vlen> void operator () (
    const multi_iter<vlen> &it, const cndarr<cmplx<T0>> &in,
    ndarr<cmplx<T0>> &out, T * buf, const pocketfft_c<T0> &plan, T0 fct) const
    {
    copy_input(it, in, buf);
    plan.exec(buf, fct, forward);
    copy_output(it, buf, out);
    }
  };

// Converts the vectorized result `src` of a real FFT into Hartley transform
// values and scatters them to the `vlen` output lines selected by `it`.
// Entry 0 is copied unchanged. Each following pair (src[k], src[k+1]) with
// k=1,3,5,... yields the sum at output index (k+1)/2 and the difference at
// the mirrored index length-(k+1)/2. For even lengths the unpaired last
// entry is copied to the middle index.
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
  const vtype_t<T> *POCKETFFT_RESTRICT src, ndarr<T> &dst)
  {
  const size_t len = it.length_out();

  for (size_t lane=0; lane<vlen; ++lane)
    dst[it.oofs(lane,0)] = src[0][lane];

  size_t k=1, lo=1, hi=len-1;
  while (k<len-1)
    {
    for (size_t lane=0; lane<vlen; ++lane)
      {
      dst[it.oofs(lane,lo)] = src[k][lane]+src[k+1][lane];
      dst[it.oofs(lane,hi)] = src[k][lane]-src[k+1][lane];
      }
    k+=2;
    ++lo;
    --hi;
    }

  if (k<len)
    for (size_t lane=0; lane<vlen; ++lane)
      dst[it.oofs(lane,lo)] = src[k][lane];
  }

// Converts the result `src` of a real FFT into Hartley transform values and
// writes them to the output line selected by `it`.
// Entry 0 is copied unchanged. Each following pair (src[k], src[k+1]) with
// k=1,3,5,... yields the sum at output index (k+1)/2 and the difference at
// the mirrored index length-(k+1)/2. For even lengths the unpaired last
// entry is copied to the middle index.
template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
  const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
  {
  const size_t len = it.length_out();

  dst[it.oofs(0)] = src[0];

  size_t k=1, lo=1, hi=len-1;
  while (k<len-1)
    {
    dst[it.oofs(lo)] = src[k]+src[k+1];
    dst[it.oofs(hi)] = src[k]-src[k+1];
    k+=2;
    ++lo;
    --hi;
    }

  if (k<len)
    dst[it.oofs(lo)] = src[k];
  }

// Functor for general_nd() performing one Hartley transform by means of a
// real FFT.
struct ExecHartley
  {
  // Copies the line(s) selected by `it` from `in` into `buf`, runs the real
  // FFT `plan` on `buf` (scaling by `fct`) and lets copy_hartley() combine
  // the result into `out`.
  template <typename T0, typename T, size_t vlen> void operator () (
    const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out,
    T * buf, const pocketfft_r<T0> &plan, T0 fct) const
    {
    copy_input(it, in, buf);
    plan.exec(buf, fct, true);
    copy_hartley(it, buf, out);
    }
  };

// Functor for general_nd() performing one cosine or sine transform. The
// three members are passed on unchanged to the plan's exec() method.
struct ExecDcst
  {
  // request the additional scaling of the "ortho" variants
  bool ortho;
  // transform type (dct() and dst() accept 1 to 4)
  int type;
  // true: cosine transform, false: sine transform
  bool cosine;

  // Copies the line(s) selected by `it` from `in` into `buf`, transforms
  // `buf` with `plan` (scaling by `fct`) and copies the result to `out`.
  template <typename T0, typename T, typename Tplan, size_t vlen>
  void operator () (const multi_iter<vlen> &it, const cndarr<T0> &in,
    ndarr<T0> &out, T * buf, const Tplan &plan, T0 fct) const
    {
    copy_input(it, in, buf);
    plan.exec(buf, fct, ortho, type, cosine);
    copy_output(it, buf, out);
    }
  };

// Real-to-complex transform along a single axis.
//   in       - real input array
//   out      - complex output array; indices 0 .. len/2 along `axis` are
//              written, where len is the input length along that axis
//   axis     - axis to transform
//   forward  - if false, the imaginary parts of the result are negated
//   fct      - scaling factor handed to the plan
//   nthreads - thread count request handed to util::thread_count()
// The real FFT leaves its result in the scratch buffer as
// r0, r1, i1, r2, i2, ...; this function unpacks that layout into complex
// numbers.
template<typename T> POCKETFFT_NOINLINE void general_r2c(
  const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct,
  size_t nthreads)
  {
  auto plan = get_plan<pocketfft_r<T>>(in.shape(axis));
  size_t len=in.shape(axis);
  threading::thread_map(
    util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
    [&] {
    constexpr auto vlen = VLEN<T>::val;
    // per-thread scratch buffer
    auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
    multi_iter<vlen> it(in, out, axis);
#ifndef POCKETFFT_NO_VECTORS
    // vectorized part: vlen lines per transform
    if (vlen>1)
      while (it.remaining()>=vlen)
        {
        it.advance(vlen);
        auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
        copy_input(it, in, tdatav);
        plan->exec(tdatav, fct, true);
        // entry 0 has no imaginary part
        for (size_t j=0; j<vlen; ++j)
          out[it.oofs(j,0)].Set(tdatav[0][j]);
        size_t i=1, ii=1;
        if (forward)
          for (; i<len-1; i+=2, ++ii)
            for (size_t j=0; j<vlen; ++j)
              out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
        else
          for (; i<len-1; i+=2, ++ii)
            for (size_t j=0; j<vlen; ++j)
              out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
        // even len: the last entry has no imaginary part either
        if (i<len)
          for (size_t j=0; j<vlen; ++j)
            out[it.oofs(j,ii)].Set(tdatav[i][j]);
        }
#endif
    // remaining lines are processed one at a time
    while (it.remaining()>0)
      {
      it.advance(1);
      auto tdata = reinterpret_cast<T *>(storage.data());
      copy_input(it, in, tdata);
      plan->exec(tdata, fct, true);
      // entry 0 has no imaginary part
      out[it.oofs(0)].Set(tdata[0]);
      size_t i=1, ii=1;
      if (forward)
        for (; i<len-1; i+=2, ++ii)
          out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
      else
        for (; i<len-1; i+=2, ++ii)
          out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
      // even len: the last entry has no imaginary part either
      if (i<len)
        out[it.oofs(ii)].Set(tdata[i]);
      }
    });  // end of parallel region
  }
// Complex-to-real transform along a single axis.
//   in       - complex input array; indices 0 .. len/2 along `axis` are
//              read, where len is the output length along that axis
//   out      - real output array
//   axis     - axis to transform
//   forward  - if true, the imaginary parts of the input are negated
//   fct      - scaling factor handed to the plan
//   nthreads - thread count request handed to util::thread_count()
// The complex input is packed into the scratch buffer as
// r0, r1, i1, r2, i2, ... (the imaginary part of entry 0 and, for even len,
// of the last entry is ignored) before the real FFT plan is executed with
// its direction flag set to false.
template<typename T> POCKETFFT_NOINLINE void general_c2r(
  const cndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, bool forward, T fct,
  size_t nthreads)
  {
  auto plan = get_plan<pocketfft_r<T>>(out.shape(axis));
  size_t len=out.shape(axis);
  threading::thread_map(
    util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
    [&] {
      constexpr auto vlen = VLEN<T>::val;
      // per-thread scratch buffer
      auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
      multi_iter<vlen> it(in, out, axis);
#ifndef POCKETFFT_NO_VECTORS
      // vectorized part: vlen lines per transform
      if (vlen>1)
        while (it.remaining()>=vlen)
          {
          it.advance(vlen);
          auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
          // only the real part of entry 0 is used
          for (size_t j=0; j<vlen; ++j)
            tdatav[0][j]=in[it.iofs(j,0)].r;
          {
          size_t i=1, ii=1;
          if (forward)
            for (; i<len-1; i+=2, ++ii)
              for (size_t j=0; j<vlen; ++j)
                {
                tdatav[i  ][j] =  in[it.iofs(j,ii)].r;
                tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
                }
          else
            for (; i<len-1; i+=2, ++ii)
              for (size_t j=0; j<vlen; ++j)
                {
                tdatav[i  ][j] = in[it.iofs(j,ii)].r;
                tdatav[i+1][j] = in[it.iofs(j,ii)].i;
                }
          // even len: only the real part of the last entry is used
          if (i<len)
            for (size_t j=0; j<vlen; ++j)
              tdatav[i][j] = in[it.iofs(j,ii)].r;
          }
          plan->exec(tdatav, fct, false);
          copy_output(it, tdatav, out);
          }
#endif
      // remaining lines are processed one at a time
      while (it.remaining()>0)
        {
        it.advance(1);
        auto tdata = reinterpret_cast<T *>(storage.data());
        // only the real part of entry 0 is used
        tdata[0]=in[it.iofs(0)].r;
        {
        size_t i=1, ii=1;
        if (forward)
          for (; i<len-1; i+=2, ++ii)
            {
            tdata[i  ] =  in[it.iofs(ii)].r;
            tdata[i+1] = -in[it.iofs(ii)].i;
            }
        else
          for (; i<len-1; i+=2, ++ii)
            {
            tdata[i  ] = in[it.iofs(ii)].r;
            tdata[i+1] = in[it.iofs(ii)].i;
            }
        // even len: only the real part of the last entry is used
        if (i<len)
          tdata[i] = in[it.iofs(ii)].r;
        }
        plan->exec(tdata, fct, false);
        copy_output(it, tdata, out);
        }
    });  // end of parallel region
  }

struct ExecR2R
  {
  bool r2h, forward;

  template <typename T0, typename T, size_t vlen> void operator () (
    const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out, T * buf,
    const pocketfft_r<T0> &plan, T0 fct) const
    {
    copy_input(it, in, buf);
    if ((!r2h) && forward)
      for (size_t i=2; i<it.length_out(); i+=2)
        buf[i] = -buf[i];
    plan.exec(buf, fct, r2h);
    if (r2h && (!forward))
      for (size_t i=2; i<it.length_out(); i+=2)
        buf[i] = -buf[i];
    copy_output(it, buf, out);
    }
  };

template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in,
  const stride_t &stride_out, const shape_t &axes, bool forward,
  const std::complex<T> *data_in, std::complex<T> *data_out, T fct,
  size_t nthreads=1)
  {
  if (util::prod(shape)==0) return;
  util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
  cndarr<cmplx<T>> ain(data_in, shape, stride_in);
  ndarr<cmplx<T>> aout(data_out, shape, stride_out);
  general_nd<pocketfft_c<T>>(ain, aout, axes, fct, nthreads, ExecC2C{forward});
  }

template<typename T> void dct(const shape_t &shape,
  const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
  int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
  {
  if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type");
  if (util::prod(shape)==0) return;
  util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
  cndarr<T> ain(data_in, shape, stride_in);
  ndarr<T> aout(data_out, shape, stride_out);
  const ExecDcst exec{ortho, type, true};
  if (type==1)
    general_nd<T_dct1<T>>(ain, aout, axes, fct, nthreads, exec);
  else if (type==4)
    general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
  else
    general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
  }

template<typename T> void dst(const shape_t &shape,
  const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
  int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
  {
  if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type");
  if (util::prod(shape)==0) return;
  util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
  cndarr<T> ain(data_in, shape, stride_in);
  ndarr<T> aout(data_out, shape, stride_out);
  const ExecDcst exec{ortho, type, false};
  if (type==1)
    general_nd<T_dst1<T>>(ain, aout, axes, fct, nthreads, exec);
  else if (type==4)
    general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
  else
    general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
  }

template<typename T> void r2c(const shape_t &shape_in,
  const stride_t &stride_in, const stride_t &stride_out, size_t axis,
  bool forward, const T *data_in, std::complex<T> *data_out, T fct,
  size_t nthreads=1)
  {
  if (util::prod(shape_in)==0) return;
  util::sanity_check(shape_in, stride_in, stride_out, false, axis);
  cndarr<T> ain(data_in, shape_in, stride_in);
  shape_t shape_out(shape_in);
  shape_out[axis] = shape_in[axis]/2 + 1;
  ndarr<cmplx<T>> aout(data_out, shape_out, stride_out);
  general_r2c(ain, aout, axis, forward, fct, nthreads);
  }

template<typename T> void r2c(const shape_t &shape_in,
  const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
  bool forward, const T *data_in, std::complex<T> *data_out, T fct,
  size_t nthreads=1)
  {
  if (util::prod(shape_in)==0) return;
  util::sanity_check(shape_in, stride_in, stride_out, false, axes);
  r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out,
    fct, nthreads);
  if (axes.size()==1) return;

  shape_t shape_out(shape_in);
  shape_out[axes.back()] = shape_in[axes.back()]/2 + 1;
  auto newaxes = shape_t{axes.begin(), --axes.end()};
  c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out,
    T(1), nthreads);
  }

template<typename T> void c2r(const shape_t &shape_out,
  const stride_t &stride_in, const stride_t &stride_out, size_t axis,
  bool forward, const std::complex<T> *data_in, T *data_out, T fct,
  size_t nthreads=1)
  {
  if (util::prod(shape_out)==0) return;
  util::sanity_check(shape_out, stride_in, stride_out, false, axis);
  shape_t shape_in(shape_out);
  shape_in[axis] = shape_out[axis]/2 + 1;
  cndarr<cmplx<T>> ain(data_in, shape_in, stride_in);
  ndarr<T> aout(data_out, shape_out, stride_out);
  general_c2r(ain, aout, axis, forward, fct, nthreads);
  }

template<typename T> void c2r(const shape_t &shape_out,
  const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
  bool forward, const std::complex<T> *data_in, T *data_out, T fct,
  size_t nthreads=1)
  {
  if (util::prod(shape_out)==0) return;
  if (axes.size()==1)
    return c2r(shape_out, stride_in, stride_out, axes[0], forward,
      data_in, data_out, fct, nthreads);
  util::sanity_check(shape_out, stride_in, stride_out, false, axes);
  auto shape_in = shape_out;
  shape_in[axes.back()] = shape_out[axes.back()]/2 + 1;
  auto nval = util::prod(shape_in);
  stride_t stride_inter(shape_in.size());
  stride_inter.back() = sizeof(cmplx<T>);
  for (int i=int(shape_in.size())-2; i>=0; --i)
    stride_inter[size_t(i)] =
      stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]);
  arr<std::complex<T>> tmp(nval);
  auto newaxes = shape_t{axes.begin(), --axes.end()};
  c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(),
    T(1), nthreads);
  c2r(shape_out, stride_inter, stride_out, axes.back(), forward,
    tmp.data(), data_out, fct, nthreads);
  }

template<typename T> void r2r_fftpack(const shape_t &shape,
  const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
  bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct,
  size_t nthreads=1)
  {
  if (util::prod(shape)==0) return;
  util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
  cndarr<T> ain(data_in, shape, stride_in);
  ndarr<T> aout(data_out, shape, stride_out);
  general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads,
    ExecR2R{real2hermitian, forward});
  }

template<typename T> void r2r_separable_hartley(const shape_t &shape,
  const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
  const T *data_in, T *data_out, T fct, size_t nthreads=1)
  {
  if (util::prod(shape)==0) return;
  util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
  cndarr<T> ain(data_in, shape, stride_in);
  ndarr<T> aout(data_out, shape, stride_out);
  general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{},
    false);
  }

// Multi-dimensional Hartley transform, computed from a real-to-complex FFT
// over all given axes.
//   shape      - shape of the input and output array
//   stride_in  - input strides
//   stride_out - output strides
//   axes       - axes to transform
//   data_in    - input data
//   data_out   - output data
//   fct        - scaling factor
//   nthreads   - thread count request
// A single axis is delegated to r2r_separable_hartley(). Otherwise the FFT
// result is stored in a densely packed temporary array; for each of its
// entries v, v.r+v.i is written to the corresponding output position and
// v.r-v.i to the mirrored output position. Returns immediately if the array
// has no elements.
template<typename T> void r2r_genuine_hartley(const shape_t &shape,
  const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
  const T *data_in, T *data_out, T fct, size_t nthreads=1)
  {
  if (util::prod(shape)==0)
    return;
  if (axes.size()==1)
    {
    r2r_separable_hartley(shape, stride_in, stride_out, axes, data_in,
      data_out, fct, nthreads);
    return;
    }
  const bool inplace = (data_in==data_out);
  util::sanity_check(shape, stride_in, stride_out, inplace, axes);

  // shape of the temporary complex array
  const size_t last_axis = axes.back();
  shape_t tshp(shape);
  tshp[last_axis] = tshp[last_axis]/2+1;
  arr<std::complex<T>> tdata(util::prod(tshp));

  // strides of the temporary array: contiguous, last axis fastest
  stride_t tstride(shape.size());
  tstride.back()=sizeof(std::complex<T>);
  for (size_t ax=tstride.size(); ax-- > 1; )
    tstride[ax-1]=tstride[ax]*ptrdiff_t(tshp[ax]);

  r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads);

  cndarr<cmplx<T>> atmp(tdata.data(), tshp, tstride);
  ndarr<T> aout(data_out, shape, stride_out);
  simple_iter iin(atmp);
  rev_iter iout(aout, axes);
  for (; iin.remaining()>0; iin.advance(), iout.advance())
    {
    const auto v = atmp[iin.ofs()];
    // the second store wins where both positions coincide
    aout[iout.ofs()] = v.r+v.i;
    aout[iout.rev_ofs()] = v.r-v.i;
    }
  }

} // namespace detail

using detail::FORWARD;
using detail::BACKWARD;
using detail::shape_t;
using detail::stride_t;
using detail::c2c;
using detail::c2r;
using detail::r2c;
using detail::r2r_fftpack;
using detail::r2r_separable_hartley;
using detail::r2r_genuine_hartley;
using detail::dct;
using detail::dst;

} // namespace pocketfft

#undef POCKETFFT_NOINLINE
#undef POCKETFFT_RESTRICT

#endif // POCKETFFT_HDRONLY_H


================================================
FILE: mlx/CMakeLists.txt
================================================
# Top-level sources attached privately to the mlx target, plus
# backend/metal/metal.h. Backend-specific sources are added further down via
# add_subdirectory() / target_sources().
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/compile.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/dtype_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/einsum.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/graph_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/random.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scheduler.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/transforms.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/metal.h)

# Define MLX_VERSION only in the version.cpp file.
add_library(mlx_version OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/version.cpp)
target_compile_definitions(mlx_version PRIVATE MLX_VERSION="${MLX_VERSION}")
target_include_directories(mlx_version PRIVATE ${PROJECT_SOURCE_DIR})
target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:mlx_version>)

# Do not export symbols by default.
set_target_properties(
  mlx mlx_version
  PROPERTIES VISIBILITY_INLINES_HIDDEN ON
             CXX_VISIBILITY_PRESET hidden
             CUDA_VISIBILITY_PRESET hidden)

# Define MLX_EXPORT for shared libraries, MLX_STATIC for static libraries.
set_target_properties(mlx PROPERTIES DEFINE_SYMBOL MLX_EXPORT)
if(BUILD_SHARED_LIBS)
  target_compile_definitions(mlx_version PUBLIC MLX_EXPORT)
else()
  target_compile_definitions(mlx PUBLIC MLX_STATIC)
  target_compile_definitions(mlx_version PUBLIC MLX_STATIC)
endif()

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
  # Suppress the GCC note "parameter passing for argument of type
  # 'std::pair<float, float>' when C++17 is enabled changed to match C++14 in
  # GCC 10.1".
  target_compile_options(mlx PRIVATE -Wno-psabi)
endif()

if(MSVC)
  # Some of CUDA's headers include windows.h, which defines min/max macros.
  target_compile_definitions(mlx PRIVATE NOMINMAX WIN32_LEAN_AND_MEAN)
  # Unicode support in fmt does not compile in .cu files.
  target_compile_definitions(mlx PRIVATE FMT_UNICODE=0)
  # Disable some MSVC warnings to speed up compilation.
  target_compile_options(
    mlx
    PUBLIC $<$<COMPILE_LANGUAGE:CXX>:/wd4244 /wd4267>
    PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/wd4068
            /wd4146
            /wd4700
            /wd4804
            /wd4805>
            $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/wd4244
            -Xcompiler=/wd4267>)
  # Enable /bigobj for heavily templated code (e.g., binary.cpp) that exceeds
  # the default 65,535 section limit in COFF object files.
  target_compile_options(
    mlx PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/bigobj>
                $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/bigobj>)
  # Use modern preprocessor, otherwise CCCL would complain.
  target_compile_options(
    mlx PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/Zc:preprocessor>
                $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:preprocessor>)
endif()

# backend/common is always added, regardless of which backends are enabled.
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/common)

# CPU backend, or the backend/no_cpu directory when MLX_BUILD_CPU is off.
if(MLX_BUILD_CPU)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cpu)
else()
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_cpu)
endif()

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/distributed)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/io)

# Metal backend, or the single no_metal.cpp source when MLX_BUILD_METAL is off.
if(MLX_BUILD_METAL)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/metal)
else()
  target_sources(mlx
                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/metal/no_metal.cpp)
endif()

# CUDA backend, or the single no_cuda.cpp source when MLX_BUILD_CUDA is off.
if(MLX_BUILD_CUDA)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda)
else()
  target_sources(mlx
                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
endif()

# backend/gpu is used when either GPU backend is enabled; backend/no_gpu
# otherwise.
if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/gpu)
else()
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_gpu)
endif()


================================================
FILE: mlx/allocator.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <cstdlib>

#include "mlx/api.h"

namespace mlx::core::allocator {

// Simple wrapper around buffer pointers
// WARNING: Only Buffer objects constructed from and those that wrap
//          raw pointers from mlx::allocator are supported.
class MLX_API Buffer {
 private:
  void* ptr_;

 public:
  explicit Buffer(void* ptr) : ptr_(ptr) {};

  // Get the raw data pointer from the buffer
  void* raw_ptr();

  // Get the buffer pointer from the buffer
  const void* ptr() const {
    return ptr_;
  };
  void* ptr() {
    return ptr_;
  };
};

/** Abstract base class for a memory allocator. */
class MLX_API Allocator {
 public:
  // Operations every concrete allocator has to provide.
  virtual Buffer malloc(size_t size) = 0;
  virtual void free(Buffer buffer) = 0;
  virtual size_t size(Buffer buffer) const = 0;

  // Default: no buffer is created; the returned Buffer holds nullptr.
  virtual Buffer make_buffer(void* ptr, size_t size) {
    return Buffer(nullptr);
  }
  // Default: releasing a buffer does nothing.
  virtual void release(Buffer buffer) {}

  Allocator() = default;

  // Allocators can be neither copied nor moved.
  Allocator(const Allocator& other) = delete;
  Allocator(Allocator&& other) = delete;
  Allocator& operator=(const Allocator& other) = delete;
  Allocator& operator=(Allocator&& other) = delete;

  virtual ~Allocator() = default;
};

MLX_API Allocator& allocator();

// Request a buffer of `size` from the allocator returned by allocator().
inline Buffer malloc(size_t size) {
  Allocator& active = allocator();
  return active.malloc(size);
}

// Hand `buffer` back to the allocator returned by allocator().
inline void free(Buffer buffer) {
  Allocator& active = allocator();
  active.free(buffer);
}

// Wrap a raw pointer of the given size in a Buffer without copying. When a
// no-copy conversion is not possible the returned buffer.ptr() is nullptr.
// Every buffer obtained here must later be handed to release(buffer).
inline Buffer make_buffer(void* ptr, size_t size) {
  Allocator& active = allocator();
  return active.make_buffer(ptr, size);
}

// Give a buffer that was created with make_buffer back to the allocator.
inline void release(Buffer buffer) {
  Allocator& active = allocator();
  active.release(buffer);
}

} // namespace mlx::core::allocator


================================================
FILE: mlx/api.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

// MLX_API macro for controlling symbol visibility, must add for public APIs.
//
// Usage:
//   MLX_API void some_function(...);
//   class MLX_API SomeClass { ... };

#if defined(MLX_STATIC)

// Static library build: symbols need no import/export decoration.
#define MLX_API

#elif !defined(_WIN32)

// Shared library build outside Windows: give the symbol default visibility.
#define MLX_API __attribute__((visibility("default")))

#elif defined(MLX_EXPORT)

// Shared library build on Windows, compiling the library itself.
#define MLX_API __declspec(dllexport)

#else

// Shared library build on Windows, consuming the library.
#define MLX_API __declspec(dllimport)

#endif // defined(MLX_STATIC)


================================================
FILE: mlx/array.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <functional>
#include <unordered_map>

#include "mlx/array.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"

namespace mlx::core {

// Scalar array from a std::complex<float>: the value is first converted to
// complex64_t and then copied into the freshly initialized array.
array::array(const std::complex<float>& val, Dtype dtype /* = complex64 */)
    : array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
  complex64_t converted = static_cast<complex64_t>(val);
  init(&converted);
}

// Graph-node constructor: the array is the output of `primitive` applied to
// `inputs`. Throws std::invalid_argument when the primitive's stream is on the
// GPU and either an input or the array itself has dtype float64.
array::array(
    Shape shape,
    Dtype dtype,
    std::shared_ptr<Primitive> primitive,
    std::vector<array> inputs)
    : array_desc_(
          std::make_shared<ArrayDesc>(
              std::move(shape),
              dtype,
              std::move(primitive),
              std::move(inputs))) {
  const bool on_gpu =
      has_primitive() && this->primitive().stream().device == Device::gpu;
  if (!on_gpu) {
    return;
  }

  const auto& ins = this->inputs();
  const bool has_f64_input =
      std::any_of(ins.begin(), ins.end(), [](const array& in) {
        return in.dtype() == float64;
      });
  if (has_f64_input || this->dtype() == float64) {
    throw std::invalid_argument("float64 is not supported on the GPU");
  }
}

std::vector<array> array::make_arrays(
    std::vector<Shape> shapes,
    const std::vector<Dtype>& dtypes,
    const std::shared_ptr<Primitive>& primitive,
    const std::vector<array>& inputs) {
  std::vector<array> outputs;
  for (size_t i = 0; i < shapes.size(); ++i) {
    outputs.emplace_back(std::move(shapes[i]), dtypes[i], primitive, inputs);
  }
  // For each node in |outputs|, its siblings are the other nodes.
  for (size_t i = 0; i < outputs.size(); ++i) {
    auto siblings = outputs;
    siblings.erase(siblings.begin() + i);
    outputs[i].set_siblings(std::move(siblings), i);
  }
  return outputs;
}

// Build an array over `other`'s buffer, data size, strides, flags and offset,
// but with no primitive and no inputs. The installed deleter is a no-op, so
// destroying the result never frees the buffer.
array array::unsafe_weak_copy(const array& other) {
  auto view = array(other.shape(), other.dtype(), nullptr, {});
  auto keep_alive = [](auto) {};
  view.set_data(
      other.buffer(),
      other.data_size(),
      other.strides(),
      other.flags(),
      keep_alive);
  // set_data() resets the offset to 0; restore the source's offset.
  view.array_desc_->offset = other.array_desc_->offset;
  return view;
}

// 1-D float32 array from a brace list of floats; the single dimension equals
// the number of listed values (so an empty list gives an empty array).
array::array(std::initializer_list<float> data)
    : array_desc_(
          std::make_shared<ArrayDesc>(
              Shape{static_cast<ShapeElem>(data.size())},
              float32)) {
  init(data.begin());
}

// 1-D array of the requested dtype from a brace list of ints; the single
// dimension equals the number of listed values and init() copies them in.
array::array(std::initializer_list<int> data, Dtype dtype)
    : array_desc_(
          std::make_shared<ArrayDesc>(
              Shape{static_cast<ShapeElem>(data.size())},
              dtype)) {
  init(data.begin());
}

// Build an array from a raw pointer. If the allocator can wrap `data` without
// copying, the array uses it directly and `deleter` runs once the wrapped
// buffer is released. Otherwise the bytes are copied into a newly allocated
// buffer and `deleter` is invoked on `data` right away.
array::array(
    void* data,
    Shape shape,
    Dtype dtype,
    const std::function<void(void*)>& deleter)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
  auto buffer = allocator::make_buffer(data, nbytes());
  if (buffer.ptr() != nullptr) {
    // No-copy path: release the wrapper first, then let the caller's deleter
    // see the original raw pointer.
    set_data(buffer, [deleter](allocator::Buffer wrapped) {
      auto raw = wrapped.raw_ptr();
      allocator::release(wrapped);
      return deleter(raw);
    });
    return;
  }

  // Copy path: the source memory is no longer needed once copied.
  set_data(allocator::malloc(nbytes()));
  auto src_bytes = static_cast<char*>(data);
  std::copy(src_bytes, src_bytes + nbytes(), this->data<char>());
  deleter(data);
}

/* Build an array from a shared buffer. The buffer is installed through
 * set_data() together with `deleter`, which array::Data invokes on the buffer
 * when the data object is destroyed. */
array::array(allocator::Buffer data, Shape shape, Dtype dtype, Deleter deleter)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
  set_data(data, deleter);
}

// Detach this array and all of its siblings from the graph: afterwards none
// of them has a primitive, inputs or siblings, and every position is 0.
void array::detach() {
  array_desc_->primitive = nullptr;
  // First pass: clear every sibling's primitive before any list is emptied.
  // ~array() returns early for arrays without a primitive, so the handles
  // destroyed by the clear() calls below presumably take that early exit.
  for (auto& s : array_desc_->siblings) {
    s.array_desc_->primitive = nullptr;
  }
  // Second pass: drop the siblings' own graph links.
  for (auto& s : array_desc_->siblings) {
    s.array_desc_->inputs.clear();
    s.array_desc_->siblings.clear();
    s.array_desc_->position = 0;
  }
  // Finally drop this array's links.
  array_desc_->inputs.clear();
  array_desc_->siblings.clear();
  array_desc_->position = 0;
}

// Report whether the array is safe to read. An evaluated array whose event is
// either invalid or already signaled is promoted to Status::available (and
// its event detached) as a side effect.
bool array::is_available() const {
  if (status() == Status::available) {
    return true;
  }
  if (status() != Status::evaluated) {
    return false;
  }
  // Evaluated: still pending only while a valid event has not been signaled.
  const bool pending = event().valid() && !event().is_signaled();
  if (pending) {
    return false;
  }
  detach_event();
  set_status(Status::available);
  return true;
}

// Block until the array is available. If a valid event is attached it is
// waited on and detached; the status is then set to Status::available.
void array::wait() {
  if (is_available()) {
    return;
  }
  if (auto& pending = event(); pending.valid()) {
    pending.wait();
    detach_event();
  }
  set_status(Status::available);
}

// Make sure the array can be read: unscheduled arrays are evaluated through
// mlx::core::eval, anything else only needs to be waited on.
void array::eval() {
  if (status() != Status::unscheduled) {
    wait();
    return;
  }
  mlx::core::eval({*this});
}

// True when the array is flagged as a tracer while tracing is active, or
// whenever detail::retain_graph() reports true.
bool array::is_tracer() const {
  if (array_desc_->is_tracer && detail::in_tracing()) {
    return true;
  }
  return detail::retain_graph();
}

// Install `buffer` as the array's data with offset 0 and data_size equal to
// size(). The array is flagged contiguous and row contiguous; it is flagged
// column contiguous when it has at most one element or when its largest
// dimension equals its total size.
void array::set_data(allocator::Buffer buffer, Deleter d) {
  auto& desc = *array_desc_;
  desc.data = std::make_shared<Data>(buffer, d);
  desc.offset = 0;
  desc.data_size = size();
  desc.flags.contiguous = true;
  desc.flags.row_contiguous = true;

  const auto& dims = shape();
  auto largest = std::max_element(dims.begin(), dims.end());
  desc.flags.col_contiguous = size() <= 1 || size() == *largest;
}

// Install `buffer` as the array's data using caller-supplied data size,
// strides and flags. The offset is reset to 0.
void array::set_data(
    allocator::Buffer buffer,
    size_t data_size,
    Strides strides,
    Flags flags,
    Deleter d) {
  auto& desc = *array_desc_;
  desc.data = std::make_shared<Data>(buffer, d);
  desc.offset = 0;
  desc.data_size = data_size;
  desc.strides = std::move(strides);
  desc.flags = flags;
}

// Share `other`'s data object with this array using the given strides, flags
// and data size. `offset` is counted in elements of this array's dtype and is
// added, converted to bytes, on top of other's byte offset.
void array::copy_shared_buffer(
    const array& other,
    const Strides& strides,
    Flags flags,
    size_t data_size,
    int64_t offset /* = 0 */) {
  auto& desc = *array_desc_;
  const auto& source = *other.array_desc_;
  desc.data = source.data;
  desc.strides = strides;
  desc.flags = flags;
  desc.data_size = data_size;
  desc.offset = sizeof(char) * itemsize() * offset + source.offset;
}

// Convenience overload: share `other`'s buffer using other's own strides,
// flags and data size, with the default extra offset of 0.
void array::copy_shared_buffer(const array& other) {
  copy_shared_buffer(other, other.strides(), other.flags(), other.data_size());
}

// Destructor. Beyond dropping this handle's reference to the descriptor, it
// breaks the reference cycle between a multi-output array and its siblings
// once the use counts show that no handle outside the sibling group remains.
array::~array() {
  // Handle whose descriptor was already reset (see the nulling below).
  if (array_desc_ == nullptr) {
    return;
  }

  // Detached/detaching
  if (array_desc_->primitive == nullptr) {
    return;
  }

  // Break circular reference for non-detached arrays with siblings
  if (auto n = siblings().size(); n > 0) {
    bool do_detach = true;
    // If all siblings have siblings.size() references except
    // the one we are currently destroying (which has siblings.size() + 1)
    // then there are no more external references
    do_detach &= (array_desc_.use_count() == (n + 1));
    for (auto& s : siblings()) {
      do_detach &= (s.array_desc_.use_count() == n);
      if (!do_detach) {
        break;
      }
    }
    if (do_detach) {
      for (auto& s : siblings()) {
        for (auto& ss : s.siblings()) {
          // Set to null here to avoid descending into array destructor
          // for siblings
          ss.array_desc_ = nullptr;
        }
        s.array_desc_->siblings.clear();
      }
    }
  }
}

void array::ArrayDesc::init() {
  strides.resize(shape.size());
  size = 1;
  for (int i = shape.size() - 1; i >= 0; --i) {
    strides[i] = size;
    size *= shape[i];
  }
  for (const auto& in : inputs) {
    is_tracer |= in.is_tracer();
  }
}

// Descriptor for an array without a primitive or inputs; it starts out as
// Status::available. init() then fills in size and strides from the shape.
array::ArrayDesc::ArrayDesc(Shape shape, Dtype dtype)
    : shape(std::move(shape)), dtype(dtype), status(Status::available) {
  init();
}

// Descriptor for a graph node produced by `primitive` from `inputs`; it starts
// out as Status::unscheduled. init() then fills in size and strides and
// inherits the tracer flag from the inputs.
array::ArrayDesc::ArrayDesc(
    Shape shape,
    Dtype dtype,
    std::shared_ptr<Primitive> primitive,
    std::vector<array> inputs)
    : shape(std::move(shape)),
      dtype(dtype),
      primitive(std::move(primitive)),
      status(Status::unscheduled),
      inputs(std::move(inputs)) {
  init();
}

array::ArrayDesc::~ArrayDesc() {
  // When an array description is destroyed it will delete a bunch of arrays
  // that may also destroy their corresponding descriptions and so on and so
  // forth.
  //
  // This calls recursively the destructor and can result in stack overflow, we
  // instead put them in a vector and destroy them one at a time resulting in a
  // max stack depth of 2.
  if (inputs.empty()) {
    return;
  }

  // Work list of descriptors whose destruction is driven from this frame.
  std::vector<std::shared_ptr<ArrayDesc>> for_deletion;

  // Moves the descriptors of `ad`'s inputs (and their siblings) into
  // for_deletion when the use counts show nothing else references them.
  auto append_deletable_inputs = [&for_deletion](ArrayDesc& ad) {
    // Deduplicate inputs and their siblings by descriptor id.
    std::unordered_map<std::uintptr_t, array> input_map;
    for (array& a : ad.inputs) {
      if (a.array_desc_) {
        input_map.insert({a.id(), a});
        for (auto& s : a.siblings()) {
          input_map.insert({s.id(), s});
        }
      }
    }
    // From here on only input_map holds the handles taken from ad.inputs.
    ad.inputs.clear();
    for (auto& [_, a] : input_map) {
      // Expected references: one from each sibling plus the one in input_map.
      bool is_deletable =
          (a.array_desc_.use_count() <= a.siblings().size() + 1);
      // An array with siblings is deletable only if all of its siblings
      // are deletable
      for (auto& s : a.siblings()) {
        if (!is_deletable) {
          break;
        }
        // A sibling present in input_map carries one extra reference.
        int is_input = (input_map.find(s.id()) != input_map.end());
        is_deletable &=
            s.array_desc_.use_count() <= a.siblings().size() + is_input;
      }
      if (is_deletable) {
        for_deletion.push_back(std::move(a.array_desc_));
      }
    }
  };

  append_deletable_inputs(*this);

  while (!for_deletion.empty()) {
    // top is going to be deleted at the end of the block *after* the arrays
    // with inputs have been moved into the vector
    auto top = std::move(for_deletion.back());
    for_deletion.pop_back();
    append_deletable_inputs(*top);

    // Clear out possible siblings to break circular references
    for (auto& s : top->siblings) {
      // Set to null here to avoid descending into top-level
      // array destructor for siblings
      s.array_desc_ = nullptr;
    }
    top->siblings.clear();
  }
}

// Iterator over the first axis of `arr`, starting at index `idx`.
// Throws std::invalid_argument for 0-d arrays, which have no first axis.
array::ArrayIterator::ArrayIterator(const array& arr, int idx)
    : arr(arr), idx(idx) {
  const bool is_scalar = (arr.ndim() == 0);
  if (is_scalar) {
    throw std::invalid_argument("Cannot iterate over 0-d array.");
  }
}

// Dereference: slice out entry `idx` along the first axis and reshape it so
// that the leading axis is dropped.
array::ArrayIterator::reference array::ArrayIterator::operator*() const {
  // Result shape: the array's shape without its first dimension.
  auto entry_shape = arr.shape();
  entry_shape.erase(entry_shape.begin());

  // Slice bounds covering [idx, idx + 1) on axis 0 and everything elsewhere.
  auto lower = Shape(arr.ndim(), 0);
  auto upper = arr.shape();
  lower[0] = idx;
  upper[0] = idx + 1;

  return reshape(slice(arr, lower, upper), entry_shape);
}

} // namespace mlx::core


================================================
FILE: mlx/array.h
================================================
// Copyright © 2023 Apple Inc.
#pragma once

#include <algorithm>
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

#include "mlx/allocator.h"
#include "mlx/api.h"
#include "mlx/dtype.h"
#include "mlx/event.h"
#include "mlx/small_vector.h"

namespace mlx::core {

// Forward declaration
class Primitive;

using Deleter = std::function<void(allocator::Buffer)>;
using ShapeElem = int32_t;
using Shape = SmallVector<ShapeElem>;
using Strides = SmallVector<int64_t>;

class MLX_API array {
  /* An array is really a node in a graph. It contains a shared ArrayDesc
   * object */

 public:
  /** Construct a scalar array with zero dimensions. */
  template <typename T>
  explicit array(T val, Dtype dtype = TypeToDtype<T>());

  /* Special case since std::complex can't be implicitly converted to other
   * types. */
  explicit array(const std::complex<float>& val, Dtype dtype = complex64);

  template <typename It>
  explicit array(
      It data,
      Shape shape,
      Dtype dtype =
          TypeToDtype<typename std::iterator_traits<It>::value_type>());

  template <typename T>
  explicit array(std::initializer_list<T> data, Dtype dtype = TypeToDtype<T>());

  /* Special case so empty lists default to float32. */
  explicit array(std::initializer_list<float> data);

  /* Special case so array({}, type) is an empty array. */
  explicit array(std::initializer_list<int> data, Dtype dtype);

  template <typename T>
  explicit array(
      std::initializer_list<T> data,
      Shape shape,
      Dtype dtype = TypeToDtype<T>());

  /* Build an array from a raw pointer. The constructor will attempt to use the
   * input data without a copy. The deleter will be called when the array no
   * longer needs the underlying memory - after the array is destroyed in the
   * no-copy case and after the copy otherwise. */
  explicit array(
      void* data,
      Shape shape,
      Dtype dtype,
      const std::function<void(void*)>& deleter);

  /* Build an array from a buffer */
  explicit array(
      allocator::Buffer data,
      Shape shape,
      Dtype dtype,
      Deleter deleter = allocator::free);

  /** Assignment to rvalue does not compile. */
  array& operator=(const array& other) && = delete;
  array& operator=(array&& other) && = delete;

  /** Default copy and move constructors otherwise. */
  array& operator=(array&& other) & = default;
  array(const array& other) = default;
  array(array&& other) = default;

  // Copy assignment (lvalue only): rebinds this handle to other's descriptor.
  // Nothing is done when both handles already share the same descriptor.
  array& operator=(const array& other) & {
    const bool same_node = (this->id() == other.id());
    if (!same_node) {
      array_desc_ = other.array_desc_;
    }
    return *this;
  }

  /** The size of the array's datatype in bytes. */
  size_t itemsize() const {
    return size_of(dtype());
  }

  /** The number of elements in the array. */
  size_t size() const {
    return array_desc_->size;
  }

  /** The number of bytes in the array. */
  size_t nbytes() const {
    return size() * itemsize();
  }

  /** The number of dimensions of the array. */
  size_t ndim() const {
    return array_desc_->shape.size();
  }

  /** The shape of the array as a vector of integers. */
  const Shape& shape() const {
    return array_desc_->shape;
  }

  /**
   *  Size of dimension `dim`.
   *
   *  Negative values count from the last dimension, and the lookup goes
   *  through at(), so it is bounds checked. */
  auto shape(int dim) const {
    const int axis = dim < 0 ? dim + static_cast<int>(ndim()) : dim;
    return shape().at(axis);
  }

  /** The strides of the array. */
  const Strides& strides() const {
    return array_desc_->strides;
  }

  /**
   *  Stride of dimension `dim`.
   *
   *  Negative values count from the last dimension, and the lookup goes
   *  through at(), so it is bounds checked. */
  auto strides(int dim) const {
    const int axis = dim < 0 ? dim + static_cast<int>(ndim()) : dim;
    return strides().at(axis);
  }

  /** Get the arrays data type. */
  Dtype dtype() const {
    return array_desc_->dtype;
  }

  /** Evaluate the array. */
  void eval();

  /** Get the value from a scalar array. */
  template <typename T>
  T item();

  template <typename T>
  T item() const;

  // Iterator over an array; dereferencing yields an array by value
  // (`reference` is an alias of `value_type`, not a C++ reference).
  // The iterator stores a reference to the array it walks, so it must not
  // outlive that array.
  struct MLX_API ArrayIterator {
    using iterator_category = std::random_access_iterator_tag;
    // NOTE(review): difference_type is unsigned here, whereas standard
    // iterators use a signed type - confirm this is intentional.
    using difference_type = size_t;
    using value_type = const array;
    using reference = value_type;

    explicit ArrayIterator(const array& arr, int idx = 0);

    reference operator*() const;

    // NOTE(review): advances this iterator in place and returns *this, i.e.
    // it behaves like operator+= rather than producing a new iterator -
    // confirm callers rely on that.
    ArrayIterator& operator+(difference_type diff) {
      idx += diff;
      return *this;
    }

    ArrayIterator& operator++() {
      idx++;
      return *this;
    }

    // Equal when both refer to the same array descriptor and the same index.
    friend bool operator==(const ArrayIterator& a, const ArrayIterator& b) {
      return a.arr.id() == b.arr.id() && a.idx == b.idx;
    }
    friend bool operator!=(const ArrayIterator& a, const ArrayIterator& b) {
      return !(a == b);
    }

   private:
    const array& arr;
    int idx;
  };

  ArrayIterator begin() const {
    return ArrayIterator(*this);
  }
  ArrayIterator end() const {
    return ArrayIterator(*this, shape(0));
  }

  /**
   * The following methods should be used with caution.
   * They are intended for use by the backend implementation and the
   * API may change.
   */

  array(
      Shape shape,
      Dtype dtype,
      std::shared_ptr<Primitive> primitive,
      std::vector<array> inputs);

  static std::vector<array> make_arrays(
      std::vector<Shape> shapes,
      const std::vector<Dtype>& dtypes,
      const std::shared_ptr<Primitive>& primitive,
      const std::vector<array>& inputs);

  /**
   * Get a new array that refers to the same data as the input but with a
   * non-owning pointer to it. Note the array is detached from the graph and has
   * no inputs, siblings or primitive.
   */
  static array unsafe_weak_copy(const array& other);

  /** A unique identifier for an array. */
  std::uintptr_t id() const {
    return reinterpret_cast<std::uintptr_t>(array_desc_.get());
  }

  /** A unique identifier for an arrays primitive. */
  std::uintptr_t primitive_id() const {
    return reinterpret_cast<std::uintptr_t>(array_desc_->primitive.get());
  }

  // Owns a buffer together with the deleter that disposes of it; the deleter
  // is invoked on the buffer when the Data object is destroyed.
  struct Data {
    allocator::Buffer buffer;
    Deleter d;
    // `d` is taken by value and moved into place, so the std::function (and
    // whatever state it captured) is not copied a second time.
    Data(allocator::Buffer buffer, Deleter d = allocator::free)
        : buffer(buffer), d(std::move(d)) {}
    // Not copyable
    Data(const Data& d) = delete;
    Data& operator=(const Data& d) = delete;
    // Moving steals the deleter and leaves the source with a null buffer and
    // a no-op deleter, so the source's destructor frees nothing.
    Data(Data&& o) : buffer(o.buffer), d(std::move(o.d)) {
      o.buffer = allocator::Buffer(nullptr);
      o.d = [](allocator::Buffer) {};
    }
    ~Data() {
      d(buffer);
    }
  };

  struct Flags {
    // True iff there are no gaps in the underlying data. Each item
    // in the underlying data buffer belongs to at least one index.
    //
    // True iff:
    // prod(shape[i] for i in range(ndim) if strides[i] > 0) == data_size()
    bool contiguous : 1;

    // True iff:
    // strides[-1] == 1 and
    // all(strides[i] == (shape[i+1]*strides[i+1]) or shape[i] == 1 for i in
    // range(ndim - 1))
    bool row_contiguous : 1;

    // True iff:
    // strides[0] == 1 and
    // all(strides[i] == (shape[i-1]*strides[i-1]) or shape[i] == 1 for i in
    // range(1, ndim))
    bool col_contiguous : 1;
  };

  /** The array's primitive. */
  Primitive& primitive() const {
    return *(array_desc_->primitive);
  }

  /** A shared pointer to the array's primitive. */
  std::shared_ptr<Primitive>& primitive_ptr() const {
    return array_desc_->primitive;
  }

  /** Check if the array has an attached primitive or is a leaf node. */
  bool has_primitive() const {
    return array_desc_->primitive != nullptr;
  }

  /** The array's inputs. */
  const std::vector<array>& inputs() const {
    return array_desc_->inputs;
  }

  std::vector<array>& inputs() {
    return array_desc_->inputs;
  }

  /** True indicates the array's buffer is safe to reuse: this handle is the
   * only owner of the descriptor and the descriptor is the only owner of the
   * data. */
  bool is_donatable() const {
    const bool sole_handle = array_desc_.use_count() == 1;
    return sole_handle && (array_desc_->data.use_count() == 1);
  }

  /** The array's siblings. */
  const std::vector<array>& siblings() const {
    return array_desc_->siblings;
  }

  /** The array's siblings. */
  std::vector<array>& siblings() {
    return array_desc_->siblings;
  }

  /** The array's position in the sibling list. */
  int sibling_position() const {
    return array_desc_->position;
  }

  /**
   * Install the array's siblings and record its position in the output list.
   *
   * `position` is taken as uint32_t, the same width as ArrayDesc::position,
   * so positions above 65535 are stored unchanged instead of being wrapped
   * by a 16-bit parameter. */
  void set_siblings(std::vector<array> siblings, uint32_t position) {
    array_desc_->siblings = std::move(siblings);
    array_desc_->position = position;
  }

  /** The outputs of the array's primitive (i.e. this array and
   * its siblings) in the order the primitive expects. */
  std::vector<array> outputs() const {
    auto idx = array_desc_->position;
    std::vector<array> outputs;
    outputs.reserve(siblings().size() + 1);
    outputs.insert(outputs.end(), siblings().begin(), siblings().begin() + idx);
    outputs.push_back(*this);
    outputs.insert(outputs.end(), siblings().begin() + idx, siblings().end());
    return outputs;
  }

  /** Detach the array from the graph. */
  void detach();

  /** Get the Flags bit-field. */
  const Flags& flags() const {
    return array_desc_->flags;
  }

  /** The size (in elements) of the underlying buffer the array points to.
   *
   * This can be different than the actual size of the array if the array has
   * been broadcast or irregularly strided.  If ``first`` is the offset into
   * the data buffer of the first element of the array (i.e. the offset
   * corresponding to ``arr[0, 0, ...]``) and last is the offset into the
   * data buffer of the last element of the array (i.e. the offset
   * corresponding to ``arr[-1, -1, ...]``) then ``data_size = last - first``.
   * Note, ``data_size`` is in units of ``item_size`` (not bytes).
   *
   * NOTE(review): the two-argument ``set_data`` stores ``size()`` here, which
   * for a densely packed array equals ``last - first + 1``; the formula above
   * looks off by one - confirm.
   **/
  size_t data_size() const {
    return array_desc_->data_size;
  }

  allocator::Buffer& buffer() {
    return array_desc_->data->buffer;
  }
  const allocator::Buffer& buffer() const {
    return array_desc_->data->buffer;
  }

  size_t buffer_size() const {
    return allocator::allocator().size(buffer());
  }

  // Return the shared pointer to the array::Data struct
  const std::shared_ptr<Data>& data_shared_ptr() const {
    return array_desc_->data;
  }

  // Raw pointer to the array's data: the buffer's raw pointer advanced by the
  // array's byte offset. This function may do a copy if the underlying buffer
  // is not accessible on the CPU. When accessing the data for GPU kernels, be
  // sure to use the correct method / function for the given backend to access
  // the GPU pointer.
  template <typename T>
  T* data() {
    auto* base = static_cast<char*>(buffer().raw_ptr());
    return reinterpret_cast<T*>(base + array_desc_->offset);
  }

  // Const flavour of data<T>(): forwards to the non-const overload and hands
  // the result back as a pointer to const.
  template <typename T>
  const T* data() const {
    auto& self = const_cast<array&>(*this);
    return self.data<T>();
  }

  int64_t offset() const {
    return array_desc_->offset;
  }

  enum Status {
    // The output of a computation which has not been scheduled.
    // For example, the status of `x` in `auto x = a + b`.
    unscheduled,

    // The array's `eval_*` function has been run, but the computation is not
    // necessarily complete. The array will have memory allocated and if it is
    // not a tracer then it will be detached from the graph.
    evaluated,

    // If the array is the output of a computation then the computation
    // is complete. Constant arrays are always available (e.g. `array({1, 2,
    // 3})`)
    available
  };

  // Check if the array is safe to read.
  bool is_available() const;

  // Wait on the array to be available. After this `is_available` returns
  // `true`.
  void wait();

  Status status() const {
    return array_desc_->status;
  }

  void set_status(Status s) const {
    array_desc_->status = s;
  }

  // Get the array's shared event
  Event& event() const {
    return array_desc_->event;
  }

  // Attach an event to a not yet evaluated array
  void attach_event(Event e) const {
    array_desc_->event = std::move(e);
  }

  void detach_event() const {
    array_desc_->event = Event{};
  }

  // Mark the array as a tracer array (true) or not.
  void set_tracer(bool is_tracer) {
    array_desc_->is_tracer = is_tracer;
  }
  // Check if the array is a tracer array
  bool is_tracer() const;

  void set_data(allocator::Buffer buffer, Deleter d = allocator::free);

  void set_data(
      allocator::Buffer buffer,
      size_t data_size,
      Strides strides,
      Flags flags,
      Deleter d = allocator::free);

  void copy_shared_buffer(
      const array& other,
      const Strides& strides,
      Flags flags,
      size_t data_size,
      int64_t offset = 0);

  void copy_shared_buffer(const array& other);

  void overwrite_descriptor(const array& other) {
    array_desc_ = other.array_desc_;
  }

  ~array();

 private:
  // Initialize the arrays data
  template <typename It>
  void init(const It src);

  // Shared state behind every array handle: shape/stride metadata, the data
  // buffer and the graph links (primitive, inputs, siblings).
  struct MLX_API ArrayDesc {
    Shape shape;
    // Filled in by init() from the shape.
    Strides strides;
    // Number of elements; filled in by init().
    size_t size;
    Dtype dtype;
    // Null for arrays that are not the output of a primitive.
    std::shared_ptr<Primitive> primitive;

    Status status;

    // An event on the array used for synchronization
    Event event;

    // Indicates an array is being used in a graph transform
    // and should not be detached from the graph
    bool is_tracer{false};

    // This is a shared pointer so that *different* arrays
    // can share the underlying data buffer.
    std::shared_ptr<Data> data;

    // Offset from beginning of data pointer
    int64_t offset{0};

    // The size in elements of the data buffer the array accesses
    size_t data_size{0};

    // Contains useful meta data about the array
    Flags flags{true, true, true};

    std::vector<array> inputs;
    // An array to keep track of the siblings from a multi-output
    // primitive.
    std::vector<array> siblings;
    // The arrays position in the output list
    uint32_t position{0};

    // Descriptor without a primitive; status starts as Status::available.
    explicit ArrayDesc(Shape shape, Dtype dtype);

    // Descriptor for the output of `primitive` applied to `inputs`; status
    // starts as Status::unscheduled.
    explicit ArrayDesc(
        Shape shape,
        Dtype dtype,
        std::shared_ptr<Primitive> primitive,
        std::vector<array> inputs);

    ~ArrayDesc();

   private:
    // Initialize size, strides, and other metadata
    void init();
  };

  // The ArrayDesc contains the details of the materialized array including the
  // shape, strides, the data type. It also includes
  // the primitive which knows how to compute the array's data from its inputs
  // and the list of array's inputs for the primitive.
  std::shared_ptr<ArrayDesc> array_desc_;
};

// Scalar constructor: creates a zero-dimensional array of `dtype` and copies
// `val` into it through init().
template <typename T>
array::array(T val, Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(Shape{}, dtype)) {
  init(&val);
}

// Iterator constructor: creates an array of the given shape and dtype and
// fills it from the elements starting at `data`.
template <typename It>
array::array(
    It data,
    Shape shape,
    Dtype dtype /* = TypeToDtype<typename std::iterator_traits<It>::value_type>() */)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
  init(data);
}

template <typename T>
array::array(
    std::initializer_list<T> data,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(
          std::make_shared<ArrayDesc>(
              Shape{static_cast<ShapeElem>(data.size())},
              dtype)) {
  init(data.begin());
}

template <typename T>
array::array(
    std::initializer_list<T> data,
    Shape shape,
    Dtype dtype /* = TypeToDtype<T>() */)
    : array_desc_(std::make_shared<ArrayDesc>(std::move(shape), dtype)) {
  if (data.size() != size()) {
    throw std::invalid_argument(
        "Data size and provided shape mismatch in array construction.");
  }
  init(data.begin());
}

// Return the single element of the array as a T, evaluating the array first.
// Throws std::invalid_argument unless the array has exactly one element.
template <typename T>
T array::item() {
  const bool single_element = (size() == 1);
  if (!single_element) {
    throw std::invalid_argument("item can only be called on arrays of size 1.");
  }
  eval();
  const T* value = data<T>();
  return *value;
}

template <typename T>
T array::item() const {
  if (size() != 1) {
    throw std::invalid_argument("item can only be called on arrays of size 1.");
  }
  if (status() == Status::unscheduled) {
    throw std::invalid_argument(
        "item() const can only be called on evaled arrays");
  }
  const_cast<array*>(this)->eval();
  return *data<T>();
}

template <typename It>
void array::init(It src) {
  set_data(allocator::malloc(size() * size_of(dtype())));
  switch (dtype()) {
    case bool_:
      std::copy(src, src + size(), data<bool>());
      break;
    case uint8:
      std::copy(src, src + size(), data<uint8_t>());
      break;
    case uint16:
      std::copy(src, src + size(), data<uint16_t>());
      break;
    case uint32:
      std::copy(src, src + size(), data<uint32_t>());
      break;
    case uint64:
      std::copy(src, src + size(), data<uint64_t>());
      break;
    case int8:
      std::copy(src, src + size(), data<int8_t>());
      break;
    case int16:
      std::copy(src, src + size(), data<int16_t>());
      break;
    case int32:
      std::copy(src, src + size(), data<int32_t>());
      break;
    case int64:
      std::copy(src, src + size(), data<int64_t>());
      break;
    case float16:
      std::copy(src, src + size(), data<float16_t>());
      break;
    case float32:
      std::copy(src, src + size(), data<float>());
      break;
    case float64:
      std::copy(src, src + size(), data<double>());
      break;
    case bfloat16:
      std::copy(src, src + size(), data<bfloat16_t>());
      break;
    case complex64:
      std::copy(src, src + size(), data<complex64_t>());
      break;
  }
}

/* Utilities for determining whether a template parameter is array. */
template <typename T>
inline constexpr bool is_array_v =
    std::is_same_v<std::remove_cv_t<std::remove_reference_t<T>>, array>;

template <typename... T>
inline constexpr bool is_arrays_v = (is_array_v<T> && ...);

template <typename... T>
using enable_for_arrays_t = typename std::enable_if_t<is_arrays_v<T...>>;

} // namespace mlx::core


================================================
FILE: mlx/backend/common/CMakeLists.txt
================================================
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/broadcasting.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)


================================================
FILE: mlx/backend/common/binary.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"

namespace mlx::core {

// Classification of the two operands (a, b) of a binary op, as computed by
// get_binary_op_type().
enum class BinaryOpType {
  // Both a and b have data_size() == 1.
  ScalarScalar,
  // a has data_size() == 1 and b is contiguous.
  ScalarVector,
  // b has data_size() == 1 and a is contiguous.
  VectorScalar,
  // a and b are both row contiguous, or both column contiguous.
  VectorVector,
  // None of the above.
  General,
};

// Classify the operand pair (a, b). The checks run from most to least
// specific, and the first one that matches decides the result.
inline BinaryOpType get_binary_op_type(const array& a, const array& b) {
  const bool a_single = (a.data_size() == 1);
  const bool b_single = (b.data_size() == 1);
  const auto& a_flags = a.flags();
  const auto& b_flags = b.flags();

  if (a_single && b_single) {
    return BinaryOpType::ScalarScalar;
  }
  if (a_single && b_flags.contiguous) {
    return BinaryOpType::ScalarVector;
  }
  if (b_single && a_flags.contiguous) {
    return BinaryOpType::VectorScalar;
  }

  const bool both_row = a_flags.row_contiguous && b_flags.row_contiguous;
  const bool both_col = a_flags.col_contiguous && b_flags.col_contiguous;
  return (both_row || both_col) ? BinaryOpType::VectorVector
                                : BinaryOpType::General;
}

inline void set_binary_op_output_data(
    const array& a,
    const array& b,
    array& out,
    BinaryOpType bopt,
    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
  bool b_donatable = is_donatable(b, out);
  bool a_donatable = is_donatable(a, out);
  switch (bopt) {
    case BinaryOpType::ScalarScalar:
      out.set_data(mallocfn(out.itemsize()), 1, a.strides(), a.flags());
      break;
    case BinaryOpType::ScalarVector:
      if (b_donatable) {
        out.copy_shared_buffer(b);
      } else {
        out.set_data(
            mallocfn(b.data_size() * out.itemsize()),
            b.data_size(),
            b.strides(),
            b.flags());
      }
      break;
    case BinaryOpType::VectorScalar:
      if (a_donatable) {
        out.copy_shared_buffer(a);
      } else {
        out.set_data(
            mallocfn(a.data_size() * out.itemsize()),
            a.data_size(),
            a.strides(),
            a.flags());
      }
      break;
    case BinaryOpType::VectorVector:
      if (a_donatable) {
        out.copy_shared_buffer(a);
      } else if (b_donatable) {
        out.copy_shared_buffer(b);
      } else {
        out.set_data(
            mallocfn(a.data_size() * out.itemsize()),
            a.data_size(),
            a.strides(),
            a.flags());
      }
      break;
    case BinaryOpType::General:
      if (a_donatable && a.flags().row_contiguous && a.size() == out.size()) {
        out.copy_shared_buffer(a);
      } else if (
          b_donatable && b.flags().row_contiguous && b.size() == out.size()) {
        out.copy_shared_buffer(b);
      } else {
        out.set_data(mallocfn(out.nbytes()));
      }
      break;
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/broadcasting.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/common/utils.h"

namespace mlx::core {

// Make `out` a broadcast view of `in` without copying any data: `out` shares
// the input's buffer and uses stride 0 along every broadcast dimension.
void broadcast(const array& in, array& out) {
  // Empty output: just give it an empty allocation.
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  // Input dimensions are aligned with the trailing output dimensions; any
  // leading output dimensions keep the initial stride of 0.
  Strides strides(out.ndim(), 0);
  const int lead = out.ndim() - in.ndim();
  const int in_ndim = in.ndim();
  for (int d = 0; d < in_ndim; ++d) {
    if (in.shape()[d] == 1) {
      strides[d + lead] = 0;
    } else {
      strides[d + lead] = in.strides()[d];
    }
  }

  // Once elements are repeated the result is neither row nor column
  // contiguous.
  auto flags = in.flags();
  const bool grew = out.size() > in.size();
  if (grew) {
    flags.row_contiguous = false;
    flags.col_contiguous = false;
  }
  out.copy_shared_buffer(in, strides, flags, in.data_size());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/broadcasting.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/array.h"

namespace mlx::core {

// Set up `out` as a broadcast of `in`. Defined in broadcasting.cpp.
void broadcast(const array& in, array& out);

} // namespace mlx::core


================================================
FILE: mlx/backend/common/buffer_cache.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <algorithm>
#include <cassert>
#include <functional>
#include <map>

namespace mlx::core {

// A cache of reusable buffers, indexed by size.
//
// Recycled buffers are recorded in two structures that are kept in sync:
//  - a multimap keyed by buffer size, used to find a close-fitting buffer;
//  - a doubly linked list ordered by recycle time, with the most recently
//    recycled buffer at head_ and the oldest at tail_, used for eviction.
//
// The cache never allocates buffers itself. It stores the pointers it is
// given and calls the `free` callback on the buffers it evicts.
template <typename T>
class BufferCache {
 public:
  // page_size: bounds how much larger than the request a reused buffer may
  //            be (see reuse_from_cache).
  // get_size:  returns the size of a buffer; used as the cache key and for
  //            the running total reported by cache_size().
  // free:      releases a buffer that is evicted from the cache.
  BufferCache(
      size_t page_size,
      std::function<size_t(T*)> get_size,
      std::function<void(T*)> free)
      : page_size_(page_size),
        get_size_(std::move(get_size)),
        free_(std::move(free)) {}

  // Frees every buffer still held by the cache.
  ~BufferCache() {
    clear();
  }

  BufferCache(const BufferCache&) = delete;
  BufferCache& operator=(const BufferCache&) = delete;

  // Take the smallest cached buffer whose size is at least `size`, provided
  // it is smaller than min(2 * size, size + 2 * page_size). Returns nullptr
  // when no such buffer exists. The returned buffer is no longer tracked by
  // the cache.
  T* reuse_from_cache(size_t size) {
    // Find the closest buffer in pool.
    auto it = buffer_pool_.lower_bound(size);
    if (it == buffer_pool_.end() ||
        it->first >= std::min(2 * size, size + 2 * page_size_)) {
      return nullptr;
    }

    // Collect from the cache.
    T* buf = it->second->buf;
    pool_size_ -= it->first;

    // Remove from record.
    remove_from_list(it->second);
    buffer_pool_.erase(it);
    return buf;
  }

  // Hand `buf` (must be non-null) to the cache as its most recently recycled
  // buffer.
  void recycle_to_cache(T* buf) {
    assert(buf);
    // Add to cache.
    BufferHolder* bh = new BufferHolder(buf);
    add_at_head(bh);
    size_t size = get_size_(buf);
    pool_size_ += size;
    buffer_pool_.emplace(size, bh);
  }

  // Free cached buffers, oldest first, until at least `min_bytes_to_free`
  // bytes have been released or the cache is empty. If the request is at
  // least 90% of the cached total, the whole cache is cleared instead.
  // Returns the number of buffers freed.
  int release_cached_buffers(size_t min_bytes_to_free) {
    if (min_bytes_to_free >= 0.9 * pool_size_) {
      return clear();
    } else {
      int n_release = 0;
      size_t total_bytes_freed = 0;

      while (tail_ && (total_bytes_freed < min_bytes_to_free)) {
        // Release buffer.
        size_t size = get_size_(tail_->buf);
        total_bytes_freed += size;
        free_(tail_->buf);
        n_release++;

        // Remove from record.
        auto its = buffer_pool_.equal_range(size);
        auto it = std::find_if(its.first, its.second, [this](const auto& el) {
          return el.second == tail_;
        });
        // find_if returns the end of the searched range (its.second) on a
        // miss, which is generally not buffer_pool_.end(), so that is the
        // sentinel to check against.
        assert(it != its.second);
        buffer_pool_.erase(it);
        remove_from_list(tail_);
      }

      pool_size_ -= total_bytes_freed;
      return n_release;
    }
  }

  // Free every cached buffer and reset the cache to empty. Returns the number
  // of buffers freed.
  int clear() {
    int n_release = 0;
    for (auto& [size, holder] : buffer_pool_) {
      free_(holder->buf);
      n_release++;
      delete holder;
    }
    buffer_pool_.clear();
    pool_size_ = 0;
    head_ = nullptr;
    tail_ = nullptr;
    return n_release;
  }

  // Sum of the sizes (as reported by get_size) of all cached buffers.
  size_t cache_size() const {
    return pool_size_;
  }

  size_t page_size() const {
    return page_size_;
  }

 private:
  // Node of the recency list; also the value stored in buffer_pool_.
  struct BufferHolder {
   public:
    explicit BufferHolder(T* buf_) : buf(buf_) {}

    BufferHolder* prev{nullptr};
    BufferHolder* next{nullptr};
    T* buf;
  };

  // Link `to_add` in front of the current head.
  void add_at_head(BufferHolder* to_add) {
    if (!head_) {
      head_ = to_add;
      tail_ = to_add;
    } else {
      head_->prev = to_add;
      to_add->next = head_;
      head_ = to_add;
    }
  }

  // Unlink `to_remove` from the recency list and delete the holder. The
  // buffer it points to is left alone. The matching buffer_pool_ entry must
  // be erased by the caller.
  void remove_from_list(BufferHolder* to_remove) {
    if (to_remove->prev && to_remove->next) { // if middle
      to_remove->prev->next = to_remove->next;
      to_remove->next->prev = to_remove->prev;
    } else if (to_remove->prev && to_remove == tail_) { // if tail
      tail_ = to_remove->prev;
      tail_->next = nullptr;
    } else if (to_remove == head_ && to_remove->next) { // if head
      head_ = to_remove->next;
      head_->prev = nullptr;
    } else if (to_remove == head_ && to_remove == tail_) { // if only element
      head_ = nullptr;
      tail_ = nullptr;
    }

    delete to_remove;
  }

  // Size -> holder index; a multimap because several buffers may share a size.
  std::multimap<size_t, BufferHolder*> buffer_pool_;
  BufferHolder* head_{nullptr};
  BufferHolder* tail_{nullptr};
  size_t pool_size_{0};

  const size_t page_size_;
  std::function<size_t(T*)> get_size_;
  std::function<void(T*)> free_;
};

} // namespace mlx::core


================================================
FILE: mlx/backend/common/common.cpp
================================================
// Copyright © 2024 Apple Inc.
#include <cassert>

#include "mlx/backend/common/broadcasting.h"
#include "mlx/backend/common/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

void AsStrided::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  auto& in = inputs[0];

  if (!in.flags().row_contiguous) {
    // Just ensuring that inputs[0] came from the ops which would ensure the
    // input is row contiguous.
    throw std::runtime_error(
        "AsStrided must be used with row contiguous arrays only.");
  }

  // Compute the flags given the shape and strides
  bool row_contiguous = true, col_contiguous = true;
  size_t r = 1, c = 1;
  for (int i = strides_.size() - 1, j = 0; i >= 0; i--, j++) {
    row_contiguous &= (r == strides_[i]) || (shape_[i] == 1);
    col_contiguous &= (c == strides_[j]) || (shape_[j] == 1);
    r *= shape_[i];
    c *= shape_[j];
  }
  auto flags = in.flags();
  // TODO: Compute the contiguous flag in a better way cause now we are
  //       unnecessarily strict.
  flags.contiguous = row_contiguous || col_contiguous;
  flags.row_contiguous = row_contiguous;
  flags.col_contiguous = col_contiguous;

  // There is no easy way to compute the actual data size so we use out.size().
  // The contiguous flag will almost certainly not be set so no code should
  // rely on data_size anyway.
  size_t data_size = out.size();

  return out.copy_shared_buffer(in, strides_, flags, data_size, offset_);
}

void Broadcast::eval(const std::vector<array>& inputs, array& out) {
  broadcast(inputs[0], out);
}

void BroadcastAxes::eval(const std::vector<array>& inputs, array& out) {
  broadcast(inputs[0], out);
}

void Copy::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  out.copy_shared_buffer(inputs[0]);
}

// Each output shares the buffer of one of the *trailing* inputs: output i is
// paired with input (inputs.size() - outputs.size() + i).
void CustomTransforms::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
  const int first_source = inputs.size() - outputs.size();
  for (int i = 0; i < outputs.size(); ++i) {
    outputs[i].copy_shared_buffer(inputs[first_source + i]);
  }
}

void Depends::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() > outputs.size());
  for (int i = 0; i < outputs.size(); i++) {
    outputs[i].copy_shared_buffer(inputs[i]);
  }
}

void ExpandDims::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
  auto strides = in.strides();
  for (auto ax : axes_) {
    strides.insert(strides.begin() + ax, 1);
  }
  out.copy_shared_buffer(in, strides, in.flags(), in.data_size());
}

void NumberOfElements::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  out.set_data(allocator::malloc(out.nbytes()));

  double numel = 1;
  for (auto ax : axes_) {
    numel *= inputs[0].shape(ax);
  }

  if (inverted_) {
    numel = 1.0 / numel;
  }

  switch (out.dtype()) {
    case bool_:
      *out.data<bool>() = static_cast<bool>(numel);
      break;
    case uint8:
      *out.data<uint8_t>() = static_cast<uint8_t>(numel);
      break;
    case uint16:
      *out.data<uint16_t>() = static_cast<uint16_t>(numel);
      break;
    case uint32:
      *out.data<uint32_t>() = static_cast<uint32_t>(numel);
      break;
    case uint64:
      *out.data<uint64_t>() = static_cast<uint64_t>(numel);
      break;
    case int8:
      *out.data<int8_t>() = static_cast<int8_t>(numel);
      break;
    case int16:
      *out.data<int16_t>() = static_cast<int16_t>(numel);
      break;
    case int32:
      *out.data<int32_t>() = static_cast<int32_t>(numel);
      break;
    case int64:
      *out.data<int64_t>() = static_cast<int64_t>(numel);
      break;
    case float16:
      *out.data<float16_t>() = static_cast<float16_t>(numel);
      break;
    case float32:
      *out.data<float>() = static_cast<float>(numel);
      break;
    case bfloat16:
      *out.data<bfloat16_t>() = static_cast<bfloat16_t>(numel);
      break;
    case float64:
      *out.data<double>() = static_cast<double>(numel);
      break;
    case complex64:
      *out.data<complex64_t>() = static_cast<complex64_t>(numel);
      break;
  }
}

// Decide whether reshaping `in` to the shape of `out` requires a copy and
// compute the strides `out` would use to view the input's data directly.
//
// Returns {copy_necessary, out_strides}. When copy_necessary is true the loop
// below stopped early, so out_strides is incomplete.
std::pair<bool, Strides> prepare_reshape(const array& in, const array& out) {
  // Special case for empty arrays or row contiguous arrays
  if (in.size() == 0 || in.flags().row_contiguous) {
    return {false, out.strides()};
  }

  // Special case for scalars
  if (in.ndim() == 0) {
    return {false, Strides(out.ndim(), 0)};
  }

  // Firstly let's collapse all the contiguous dimensions of the input
  auto [shape, strides] = collapse_contiguous_dims(in);

  // If shapes fit exactly in the contiguous dims then no copy is necessary so
  // let's check.
  Strides out_strides;
  bool copy_necessary = false;
  // Index of the collapsed input dimension currently being consumed.
  int j = 0;
  for (int i = 0; i < out.ndim(); i++) {
    int N = out.shape(i);
    if (j < shape.size() && shape[j] % N == 0) {
      // Output dim i divides what is left of collapsed dim j: peel N off it.
      // The resulting stride is the remaining extent times the collapsed
      // stride.
      shape[j] /= N;
      out_strides.push_back(shape[j] * strides[j]);
      // Advance once collapsed dim j has been fully consumed.
      j += (shape[j] == 1);
    } else if (N == 1) {
      // i > 0 because otherwise j < shape.size() && shape[j] % 1 == 0
      out_strides.push_back(out_strides.back());
    } else {
      // Output dim i does not fit in the current collapsed dim.
      copy_necessary = true;
      break;
    }
  }

  return {copy_necessary, out_strides};
}

// Make `out` a reshaped view of `in` that shares the input's buffer and uses
// the given strides.
void shared_buffer_reshape(
    const array& in,
    const Strides& out_strides,
    array& out) {
  auto flags = in.flags();
  if (flags.row_contiguous) {
    // For a row contiguous input the reshape is a shallow copy of the buffer.
    // If the result is a vector (all singleton dimensions except one) it is
    // column contiguous as well.
    const auto& dims = out.shape();
    const auto largest = std::max_element(dims.begin(), dims.end());
    flags.col_contiguous = out.size() <= 1 || out.size() == *largest;
  }
  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}

// Produce the split outputs as views into the single input's buffer. Every
// output keeps the input's strides and starts at an offset computed from its
// split index along axis_; no data is copied.
void Split::eval(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);

  auto& in = inputs[0];

  // Recompute the layout flags and the data size for a slice with the given
  // shape and strides. Returns {flags, data_size}.
  auto compute_new_flags = [](const auto& shape,
                              const auto& strides,
                              size_t in_data_size,
                              auto flags) {
    size_t data_size = 1;
    size_t f_stride = 1;
    size_t b_stride = 1;
    flags.row_contiguous = true;
    flags.col_contiguous = true;
    // i walks the dimensions front to back (column-major check) while ri
    // walks them back to front (row-major check).
    for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
      flags.col_contiguous &= strides[i] == f_stride || shape[i] == 1;
      flags.row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
      f_stride *= shape[i];
      b_stride *= shape[ri];
      // Dimensions with a non-positive stride do not add to the data size.
      if (strides[i] > 0) {
        data_size *= shape[i];
      }
    }

    if (data_size == 1) {
      // Broadcasted scalar array is contiguous.
      flags.contiguous = true;
    } else if (data_size == in_data_size) {
      // Means we sliced a broadcasted dimension so leave the "no holes" flag
      // alone.
    } else {
      // We sliced something. So either we are row or col contiguous or we
      // punched a hole.
      flags.contiguous &= flags.row_contiguous || flags.col_contiguous;
    }

    return std::pair<decltype(flags), size_t>{flags, data_size};
  };

  // Start index of every output along axis_: 0 followed by the split points.
  std::vector<int> indices(1, 0);
  indices.insert(indices.end(), indices_.begin(), indices_.end());
  for (int i = 0; i < indices.size(); i++) {
    size_t offset = indices[i] * in.strides()[axis_];
    auto [new_flags, data_size] = compute_new_flags(
        outputs[i].shape(), in.strides(), in.data_size(), in.flags());
    outputs[i].copy_shared_buffer(
        in, in.strides(), new_flags, data_size, offset);
  }
}

void Squeeze::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
  Strides strides;
  for (int i = 0, j = 0; i < in.ndim(); ++i) {
    if (j < axes_.size() && i == axes_[j]) {
      j++;
    } else {
      strides.push_back(in.strides(i));
    }
  }
  out.copy_shared_buffer(in, strides, in.flags(), in.data_size());
}

void StopGradient::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  out.copy_shared_buffer(inputs[0]);
}

void Transpose::eval(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  Strides out_strides(out.ndim());
  auto& in = inputs[0];
  for (int ax = 0; ax < axes_.size(); ++ax) {
    out_strides[ax] = in.strides()[axes_[ax]];
  }

  // Conditions for {row/col}_contiguous
  // - array must be contiguous (no gaps)
  // - underlying buffer size should have the same size as the array
  // - cumulative product of shapes is equal to the strides (we can ignore axes
  //   with size == 1)
  //   - in the forward direction (column contiguous)
  //   - in the reverse direction (row contiguous)
  // - vectors are both row and col contiguous (hence if both row/col are
  //   true, they stay true)
  auto flags = in.flags();
  if (flags.contiguous && in.data_size() == in.size()) {
    int64_t f_stride = 1;
    int64_t b_stride = 1;
    flags.col_contiguous = true;
    flags.row_contiguous = true;
    for (int i = 0, ri = out.ndim() - 1; i < out.ndim(); ++i, --ri) {
      flags.col_contiguous &= (out_strides[i] == f_stride || out.shape(i) == 1);
      f_stride *= out.shape(i);
      flags.row_contiguous &=
          (out_strides[ri] == b_stride || out.shape(ri) == 1);
      b_stride *= out.shape(ri);
    }
  }
  out.copy_shared_buffer(in, out_strides, flags, in.data_size());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/compiled.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/utils.h"
#include "mlx/utils.h"

namespace mlx::core {

// Write the single value held by `x` to `os` in a form chosen by its dtype.
// 8-bit integers are widened to 32 bits before printing and booleans are
// written with std::boolalpha. Throws std::runtime_error for a dtype that is
// not handled.
void print_constant(std::ostream& os, const array& x) {
  switch (x.dtype()) {
    // Floating point and complex types.
    case float32:
      print_float_constant<float>(os, x);
      break;
    case float16:
      print_float_constant<float16_t>(os, x);
      break;
    case bfloat16:
      print_float_constant<bfloat16_t>(os, x);
      break;
    case float64:
      print_float_constant<double>(os, x);
      break;
    case complex64:
      print_complex_constant<complex64_t>(os, x);
      break;

    // Signed integers.
    case int8:
      os << static_cast<int32_t>(x.item<int8_t>());
      break;
    case int16:
      print_int_constant<int16_t>(os, x);
      break;
    case int32:
      print_int_constant<int32_t>(os, x);
      break;
    case int64:
      print_int_constant<int64_t>(os, x);
      break;

    // Unsigned integers.
    case uint8:
      os << static_cast<uint32_t>(x.item<uint8_t>());
      break;
    case uint16:
      print_int_constant<uint16_t>(os, x);
      break;
    case uint32:
      print_int_constant<uint32_t>(os, x);
      break;
    case uint64:
      print_int_constant<uint64_t>(os, x);
      break;

    case bool_:
      os << std::boolalpha << x.item<bool>();
      break;

    default:
      throw std::runtime_error("Unsupported constant type");
  }
}

// Map a dtype to the name of the corresponding type as a string.
// Throws std::runtime_error for a dtype that has no mapping.
std::string get_type_string(Dtype d) {
  switch (d) {
    case bool_:
      return "bool";

    case uint8:
      return "uint8_t";
    case uint16:
      return "uint16_t";
    case uint32:
      return "uint32_t";
    case uint64:
      return "uint64_t";

    case int8:
      return "int8_t";
    case int16:
      return "int16_t";
    case int32:
      return "int32_t";
    case int64:
      return "int64_t";

    case float16:
      return "float16_t";
    case bfloat16:
      return "bfloat16_t";
    case float32:
      return "float";
    case float64:
      return "double";

    case complex64:
      return "complex64_t";

    default:
      break;
  }

  // No mapping for this dtype.
  std::ostringstream msg;
  msg << "Unsupported compilation type " << d;
  throw std::runtime_error(msg.str());
}

// Decide whether a contiguous operation can be used for the given inputs and
// output shape. Scalar inputs are ignored; the rule applied depends on how
// many non-scalar inputs remain:
//  - none:     only when the output shape is empty;
//  - one:      it must be contiguous and have the output shape;
//  - several:  they must all have the output shape and be all row contiguous
//              or all column contiguous.
bool compiled_check_contiguity(
    const std::vector<array>& inputs,
    const Shape& shape) {
  bool every_contig = true;
  bool every_row_contig = true;
  bool every_col_contig = true;
  int n_non_scalar = 0;

  for (const auto& x : inputs) {
    if (is_scalar(x)) {
      continue;
    }
    ++n_non_scalar;
    const bool same_shape = (x.shape() == shape);
    const auto& x_flags = x.flags();
    every_contig = every_contig && x_flags.contiguous && same_shape;
    every_row_contig = every_row_contig && x_flags.row_contiguous && same_shape;
    every_col_contig = every_col_contig && x_flags.col_contiguous && same_shape;
  }

  if (n_non_scalar == 0) {
    return shape.empty();
  }
  if (n_non_scalar == 1) {
    return every_contig;
  }
  return every_row_contig || every_col_contig;
}

// Give every output a data buffer, reusing (donating) input buffers where the
// conditions listed below allow it and allocating with `mallocfn` otherwise.
//
// Inputs are scanned in order and each eligible input is donated to the next
// output that has not been assigned yet. Outputs still unassigned after the
// scan get fresh allocations. `is_constant(i)` tells whether input i is a
// constant, which excludes it from donation.
void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    const std::function<allocator::Buffer(size_t)>&
        mallocfn /* = allocator::malloc */) {
  if (contiguous) {
    int o = 0;
    // Layout of a representative input, used for the non-donated outputs.
    // NOTE(review): data_size is read below without having been initialized
    // if no input ever satisfies the "representative" condition — confirm
    // callers guarantee at least one such input on the contiguous path.
    Strides strides;
    size_t data_size;
    array::Flags flags;
    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Correct size
      // - Not a scalar
      // - Donatable
      // - Not a constant
      if (in.itemsize() == outputs[o].itemsize() && !is_scalar(in) &&
          in.is_donatable() && !is_constant(i)) {
        outputs[o++].copy_shared_buffer(in);
      }
      // Get representative input flags to properly set non-donated outputs
      if (strides.empty() && in.size() == outputs[0].size()) {
        strides = in.strides();
        flags = in.flags();
        data_size = in.data_size();
      }
    }
    // Allocate the remaining outputs with the representative layout.
    for (; o < outputs.size(); ++o) {
      outputs[o].set_data(
          mallocfn(data_size * outputs[o].itemsize()),
          data_size,
          strides,
          flags);
    }
  } else {
    int o = 0;
    for (int i = 0; i < inputs.size() && o < outputs.size(); ++i) {
      auto& in = inputs[i];
      // Conditions for donation
      // - Row contiguous
      // - Donatable
      // - Correct size
      // - Not a constant
      if (in.flags().row_contiguous && in.size() == outputs[o].size() &&
          in.itemsize() == outputs[o].itemsize() && in.is_donatable() &&
          !is_constant(i)) {
        // The donated buffer is used with the output's own strides.
        outputs[o].copy_shared_buffer(
            in, outputs[o].strides(), in.flags(), in.data_size());
        o++;
      }
    }
    // Allocate the remaining outputs at their full size.
    for (; o < outputs.size(); ++o) {
      outputs[o].set_data(mallocfn(outputs[o].nbytes()));
    }
  }
}

std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
    const std::vector<array>& inputs,
    const array& out,
    const std::function<bool(size_t)>& is_constant) {
  const Shape& shape = out.shape();
  bool contiguous = compiled_check_contiguity(inputs, shape);
  if (contiguous) {
    return {true, shape, {}};
  }

  std::vector<Strides> strides_vec{out.strides()};
  for (size_t i = 0; i < inputs.size(); ++i) {
    // Skip constants.
    if (is_constant(i)) {
      continue;
    }

    // Skip scalar inputs.
    const auto& x = inputs[i];
    if (is_scalar(x)) {
      continue;
    }

    // Broadcast the inputs to the output shape.
    Strides xstrides;
    size_t j = 0;
    for (; j < shape.size() - x.ndim(); ++j) {
      if (shape[j] == 1) {
        xstrides.push_back(out.strides()[j]);
      } else {
        xstrides.push_back(0);
      }
    }
    for (size_t i = 0; i < x.ndim(); ++i, ++j) {
      if (x.shape(i) == 1) {
        if (shape[j] == 1) {
          xstrides.push_back(out.strides()[j]);
        } else {
          xstrides.push_back(0);
        }
      } else {
        xstrides.push_back(x.strides()[i]);
      }
    }
    strides_vec.push_back(std::move(xstrides));
  }

  auto tup = collapse_contiguous_dims(shape, strides_vec, INT32_MAX);
  return {false, std::move(std::get<0>(tup)), std::move(std::get<1>(tup))};
}

// Return whether the kernel should use large index: true when the largest
// relevant size exceeds UINT32_MAX. For the contiguous case that is the
// largest input data_size(); otherwise it is the largest output size().
bool compiled_use_large_index(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    bool contiguous) {
  size_t largest = 0;
  if (contiguous) {
    for (const auto& in : inputs) {
      largest = std::max(largest, in.data_size());
    }
  } else {
    for (const auto& o : outputs) {
      largest = std::max(largest, o.size());
    }
  }
  return largest > UINT32_MAX;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/compiled.h
================================================
// Copyright © 2023-2024 Apple Inc.
#pragma once

#include <functional>
#include <iomanip>

#include "mlx/array.h"
#include "mlx/primitives.h"

namespace mlx::core {

// True when the dynamic type of `p` is exactly Broadcast or exactly AsType.
inline bool is_static_cast(const Primitive& p) {
  const std::type_info& kind = typeid(p);
  return kind == typeid(Broadcast) || kind == typeid(AsType);
}

// Return the type name for dtype `d` as a string. Defined in compiled.cpp.
std::string get_type_string(Dtype d);

// Write the single value of `x`, read as a T, to `os`. The stream precision
// is raised to digits10 + 1 of double (when T is double) or of float (for
// every other T) for the write, and restored afterwards.
template <typename T>
void print_float_constant(std::ostream& os, const array& x) {
  constexpr int digits = std::is_same_v<T, double>
      ? std::numeric_limits<double>::digits10 + 1
      : std::numeric_limits<float>::digits10 + 1;

  const auto saved_precision = os.precision();
  os << std::setprecision(digits);
  os << x.item<T>();
  os << std::setprecision(saved_precision);
}

// Write the single value of `x`, read as a T, to `os`.
template <typename T>
void print_int_constant(std::ostream& os, const array& x) {
  const T value = x.item<T>();
  os << value;
}

// Write the single complex value of `x` to `os` as
// "<type name>(<real>, <imag>)", where the type name comes from
// get_type_string(). The parts are written with float precision
// (digits10 + 1) and the previous stream precision is restored afterwards.
template <typename T>
void print_complex_constant(std::ostream& os, const array& x) {
  const auto saved_precision = os.precision();
  T constant = x.item<T>();

  os << get_type_string(x.dtype()) << "(";
  os << std::setprecision(std::numeric_limits<float>::digits10 + 1);
  os << constant.real() << ", " << constant.imag() << ")";
  os << std::setprecision(saved_precision);
}

// Write the single value held by `x` to `os`, formatted according to its
// dtype. Defined in compiled.cpp.
void print_constant(std::ostream& os, const array& x);

// An array counts as a scalar here when it has zero dimensions.
inline bool is_scalar(const array& x) {
  const bool zero_dim = (x.ndim() == 0);
  return zero_dim;
}

// Check if we can use a contiguous operation given inputs and the output shape
bool compiled_check_contiguity(
    const std::vector<array>& inputs,
    const Shape& shape);

// Allocate space for the outputs possibly with input donation
void compiled_allocate_outputs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    const std::function<allocator::Buffer(size_t)>& mallocfn =
        allocator::malloc);

// Collapse contiguous dims ignoring scalars and constants.
std::tuple<bool, Shape, std::vector<Strides>> compiled_collapse_contiguous_dims(
    const std::vector<array>& inputs,
    const array& out,
    const std::function<bool(size_t)>& is_constant);

// Return whether the kernel should use large index.
bool compiled_use_large_index(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    bool contiguous);

} // namespace mlx::core


================================================
FILE: mlx/backend/common/copy.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include "mlx/backend/common/utils.h"

namespace mlx::core {

// The kind of copy to perform. set_copy_output_data() uses it to decide how
// the output buffer is obtained.
enum class CopyType {
  // Copy a raw scalar input into the full contiguous output
  Scalar,

  // Copy the raw input buffer contiguously into a raw output buffer of the same
  // size
  Vector,

  // Copy the full virtual input to the full contiguous output
  General,

  // Copy the full virtual input to the full virtual output. We assume the
  // input and output have the same shape.
  GeneralGeneral
};

// Give `out` a data buffer for a copy of `in`.
//
// Returns true only when the input's buffer was donated to the output, which
// can happen only for CopyType::Vector. In every other case a buffer is
// obtained from `mallocfn` and false is returned.
inline bool set_copy_output_data(
    const array& in,
    array& out,
    CopyType ctype,
    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
  // Anything but a vector copy gets a fresh buffer of the output's full size.
  if (ctype != CopyType::Vector) {
    out.set_data(mallocfn(out.nbytes()));
    return false;
  }

  // If the input is donateable, we are doing a vector copy and the types
  // have the same size, then the input buffer can hold the output.
  if (is_donatable(in, out)) {
    out.copy_shared_buffer(in);
    return true;
  }

  // Otherwise allocate a buffer that mirrors the input's layout.
  out.set_data(
      mallocfn(in.data_size() * out.itemsize()),
      in.data_size(),
      in.strides(),
      in.flags());
  return false;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/hadamard.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <map>

#include "mlx/utils.h"

namespace mlx::core {

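// Hadamard matrices of order 12, 20 and 28, taken from
// http://neilsloane.com/hadamard/. Each constant stores one matrix row per
// line, written with '+' and '-' characters.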
constexpr std::string_view h12 = R"(
+-++++++++++
--+-+-+-+-+-
+++-++----++
+---+--+-++-
+++++-++----
+-+---+--+-+
++--+++-++--
+--++---+--+
++----+++-++
+--+-++---+-
++++----+++-
+-+--+-++---
)";

constexpr std::string_view h20 = R"(
+----+----++--++-++-
-+----+---+++---+-++
--+----+---+++-+-+-+
---+----+---+++++-+-
----+----++--++-++-+
-+++++-----+--+++--+
+-+++-+---+-+--+++--
++-++--+---+-+--+++-
+++-+---+---+-+--+++
++++-----++--+-+--++
--++-+-++-+-----++++
---++-+-++-+---+-+++
+---++-+-+--+--++-++
++---++-+----+-+++-+
-++---++-+----+++++-
-+--+--++-+----+----
+-+-----++-+----+---
-+-+-+---+--+----+--
--+-+++------+----+-
+--+--++------+----+
)";

constexpr std::string_view h28 = R"(
+------++----++-+--+-+--++--
-+-----+++-----+-+--+-+--++-
--+-----+++---+-+-+----+--++
---+-----+++---+-+-+-+--+--+
----+-----+++---+-+-+++--+--
-----+-----++++--+-+--++--+-
------++----++-+--+-+--++--+
--++++-+-------++--+++-+--+-
---++++-+-----+-++--+-+-+--+
+---+++--+----++-++--+-+-+--
++---++---+----++-++--+-+-+-
+++---+----+----++-++--+-+-+
++++--------+-+--++-++--+-+-
-++++--------+++--++--+--+-+
-+-++-++--++--+--------++++-
+-+-++--+--++--+--------++++
-+-+-++--+--++--+----+---+++
+-+-+-++--+--+---+---++---++
++-+-+-++--+------+--+++---+
-++-+-+-++--+------+-++++---
+-++-+---++--+------+-++++--
-++--++-+-++-+++----++------
+-++--++-+-++-+++-----+-----
++-++---+-+-++-+++-----+----
-++-++-+-+-+-+--+++-----+---
--++-++++-+-+----+++-----+--
+--++-+-++-+-+----+++-----+-
++--++-+-++-+-+----++------+
)";

// Lookup table from the order of each hard-coded Hadamard matrix (12, 20 and
// 28) to its textual definition. A fresh map is built on every call.
inline const std::map<int, std::string_view> hadamard_matrices() {
  std::map<int, std::string_view> by_order;
  by_order.emplace(12, h12);
  by_order.emplace(20, h20);
  by_order.emplace(28, h28);
  return by_order;
}

// Decompose a Hadamard transform size as n = m * 2^k.
//
// `m` is 1 when `n` is already a power of two, otherwise it is the order of
// one of the hard-coded matrices returned by hadamard_matrices().
//
// Returns {2^k, m}, i.e. the power-of-two part first and the matrix order
// second.
//
// Throws std::invalid_argument when `n` cannot be written in that form or
// when the power-of-two part exceeds 2^26.
inline std::pair<int, int> decompose_hadamard(int n) {
  // n = m*2^k
  int m = 1;
  if (!is_power_of_2(n)) {
    const auto h_matrices = hadamard_matrices();
    for (const auto& [factor, _] : h_matrices) {
      // Being divisible by the matrix order is not enough: whatever is left
      // must itself be a power of two (e.g. 36 = 12 * 3 is not supported).
      if (n % factor == 0 && is_power_of_2(n / factor)) {
        m = factor;
        n /= factor;
        break;
      }
    }
    if (m == 1) {
      throw std::invalid_argument(
          "[hadamard] Only supports n = m*2^k where m in (1, 12, 20, 28).");
    }
  }
  if (n > (1 << 26)) {
    throw std::invalid_argument(
        "[hadamard] Only supports n = m*2^k where k <= 26");
  }
  return {n, m};
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/load.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <algorithm>
#include <utility>

#include "mlx/primitives.h"
#include "mlx/scheduler.h"

namespace {

// Reverse, in place, the byte order of each of the `N` scalars stored back to
// back in `data_bytes`. Every scalar is `scalar_size` bytes wide.
template <const uint8_t scalar_size>
void swap_endianness(uint8_t* data_bytes, size_t N) {
  uint8_t* scalar = data_bytes;
  for (size_t remaining = N; remaining > 0; --remaining) {
    std::reverse(scalar, scalar + scalar_size);
    scalar += scalar_size;
  }
}

} // namespace

namespace mlx::core {

// Fill `out` with data read through the primitive's reader.
//
// The output buffer is allocated up front; the actual read runs as a task on
// the IO thread pool, and a second task that waits for it is enqueued on this
// primitive's stream. `inputs` is not used.
void Load::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  // Capture everything the task needs by value (raw output pointer, sizes,
  // offset, reader and endianness flag) since it runs after this call returns.
  auto read_task = [out_ptr = out.data<char>(),
                    size = out.size(),
                    itemsize = out.itemsize(),
                    offset = offset_,
                    reader = reader_,
                    swap_endianness_ = swap_endianness_]() mutable {
    reader->read(out_ptr, size * itemsize, offset);
    if (swap_endianness_) {
      // Only 2, 4 and 8 byte items are byte-swapped; any other item size is
      // left as read.
      switch (itemsize) {
        case 2:
          swap_endianness<2>(reinterpret_cast<uint8_t*>(out_ptr), size);
          break;
        case 4:
          swap_endianness<4>(reinterpret_cast<uint8_t*>(out_ptr), size);
          break;
        case 8:
          swap_endianness<8>(reinterpret_cast<uint8_t*>(out_ptr), size);
          break;
      }
    }
  };
  // Run the read on the IO pool and make the stream block until it is done.
  auto fut = io::thread_pool().enqueue(std::move(read_task)).share();
  scheduler::enqueue(stream(), [fut = std::move(fut)]() { fut.wait(); });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/matmul.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/common/utils.h"
#include "mlx/utils.h"

#include <sstream>

namespace mlx::core {

inline std::tuple<Shape, Strides, Strides> collapse_batches(
    const array& a,
    const array& b) {
  if (a.ndim() == 2) {
    return {Shape{1}, Strides{0}, Strides{0}};
  }

  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};

  auto [batch_shape, batch_strides] =
      collapse_contiguous_dims(A_bshape, std::vector{A_bstride, B_bstride});

  auto a_batch_strides = batch_strides[0];
  auto b_batch_strides = batch_strides[1];

  if (batch_shape.empty()) {
    batch_shape.push_back(1);
    a_batch_strides.push_back(0);
    b_batch_strides.push_back(0);
  }

  return std::make_tuple(batch_shape, a_batch_strides, b_batch_strides);
}

inline std::tuple<Shape, Strides, Strides, Strides>
collapse_batches(const array& a, const array& b, const array& c) {
  if (a.ndim() == 2) {
    return {Shape{1}, Strides{0}, Strides{0}, Strides{0}};
  }

  Shape A_bshape{a.shape().begin(), a.shape().end() - 2};
  Strides A_bstride{a.strides().begin(), a.strides().end() - 2};
  Strides B_bstride{b.strides().begin(), b.strides().end() - 2};
  Strides C_bstride{c.strides().begin(), c.strides().end() - 2};

  auto [batch_shape, batch_strides] = collapse_contiguous_dims(
      A_bshape, std::vector{A_bstride, B_bstride, C_bstride});

  auto A_batch_stride = batch_strides[0];
  auto B_batch_stride = batch_strides[1];
  auto C_batch_stride = batch_strides[2];

  if (batch_shape.empty()) {
    batch_shape.push_back(1);
    A_batch_stride.push_back(0);
    B_batch_stride.push_back(0);
    C_batch_stride.push_back(0);
  }

  return std::make_tuple(
      batch_shape, A_batch_stride, B_batch_stride, C_batch_stride);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/quantized.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

namespace mlx::core {

// Number of quantized values stored in one pack.
//
// 3 and 5 bit values come in packs of 8 and 6 bit values in packs of 4; for
// every other bit width a pack is a single `wsize`-bit word holding
// `wsize / bits` values.
inline constexpr short get_pack_factor(int bits, int wsize = 8) {
  return (bits == 3 || bits == 5) ? 8 : (bits == 6 ? 4 : wsize / bits);
}

// Number of bytes occupied by one pack (see get_pack_factor).
//
// Power-of-two bit widths use one `wsize`-bit word, i.e. `wsize / 8` bytes.
// Otherwise a pack takes 5 bytes for 5 bit values and 3 bytes for any other
// width (3 bits: 8 * 3 = 24 bits, 6 bits: 4 * 6 = 24 bits).
inline constexpr short get_bytes_per_pack(int bits, int wsize = 8) {
  bool power_of_2_bits = (bits & (bits - 1)) == 0;
  return power_of_2_bits ? (wsize / 8) : (bits == 5 ? 5 : 3);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/reduce.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/common/reduce.h"

namespace mlx::core {

// Drop the entries listed in `axes` from `shape` and `strides` and return
// what is left.
//
// The axes are erased from the last listed to the first so that, for `axes`
// given in increasing order, removing one entry does not shift the position
// of the entries still to be removed.
std::pair<Shape, Strides> shapes_without_reduction_axes(
    Shape shape,
    Strides strides,
    const std::vector<int>& axes) {
  for (auto it = axes.rbegin(); it != axes.rend(); ++it) {
    const int axis = *it;
    shape.erase(shape.begin() + axis);
    strides.erase(strides.begin() + axis);
  }

  return std::make_pair(shape, strides);
}

// Convenience overload: return the shape and strides of `x` with the entries
// listed in `axes` removed. `x` itself is not modified.
std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes) {
  // The callee takes both by value, so it works on copies.
  return shapes_without_reduction_axes(x.shape(), x.strides(), axes);
}

// Choose how to reduce `x` over `axes`.
//
// The returned plan holds the kind of reduction to run and, except for
// ContiguousAllReduce, the shape and strides of the reduction axes after
// size-1 axes are dropped and adjacent axes are merged where possible.
//
// NOTE(review): `axes[0]` and `strides.back()` are read without a check, so
// this presumably expects a non-empty `axes` with at least one axis of size
// greater than 1 — confirm against callers.
ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes) {
  // The data is all there and we are reducing over everything
  if (x.size() == x.data_size() && axes.size() == x.ndim() &&
      x.flags().contiguous) {
    return ContiguousAllReduce;
  }

  // Row contiguous input so the output is row contiguous
  if (x.flags().row_contiguous) {
    // Merge consecutive axes
    Shape shape = {x.shape(axes[0])};
    Strides strides = {x.strides()[axes[0]]};
    for (int i = 1; i < axes.size(); i++) {
      if (axes[i] - 1 == axes[i - 1] && x.shape(axes[i]) > 1) {
        shape.back() *= x.shape(axes[i]);
        strides.back() = x.strides()[axes[i]];
      } else {
        shape.push_back(x.shape(axes[i]));
        strides.push_back(x.strides()[axes[i]]);
      }
    }

    // Remove singleton axes from the plan
    for (int i = shape.size() - 1; i >= 0; i--) {
      if (shape[i] == 1) {
        shape.erase(shape.begin() + i);
        strides.erase(strides.begin() + i);
      }
    }

    // The innermost reduction axis decides between the two contiguous plans.
    // Any other stride falls through to the general analysis below.
    if (strides.back() == 1) {
      return ReductionPlan(ContiguousReduce, shape, strides);
    } else if (strides.back() > 1) {
      return ReductionPlan(ContiguousStridedReduce, shape, strides);
    }
  }

  // Let's check if we can optimize our access patterns
  //
  // 1. We have a reduction axis with stride 1. Simply call
  //    GeneralContiguousReduce and be done with it.
  // 2. We have transpositions and we are not reducing over the axis with
  //    stride 1. However, we are reducing over an axis where everything is
  //    contiguous in memory to the right of that axis. We can call strided
  //    reduce and be done with it.
  // 3. We have weird transpositions and expands. Copy the strides to the
  //    output, then call strided reduce.

  // Sort reduction axes by stride in order to merge them and figure out if we
  // have a contiguous reduction.
  std::vector<std::pair<int, int64_t>> reductions;
  for (auto a : axes) {
    if (x.shape(a) > 1) {
      reductions.push_back(std::make_pair(x.shape(a), x.strides()[a]));
    }
  }
  // Zero-stride axes are ordered first; the remaining axes follow by
  // decreasing stride, so the smallest non-zero stride ends up last.
  std::sort(reductions.begin(), reductions.end(), [](auto a, auto b) {
    bool a_is_zero = a.second == 0;
    bool b_is_zero = b.second == 0;
    return (a_is_zero != b_is_zero) ? a.second < b.second : a.second > b.second;
  });
  // Extract the two smallest and try to merge them in case the contiguous
  // reduction can be bigger than just the last axis.
  for (int i = reductions.size() - 1; i >= 1; i--) {
    auto a = reductions[i];
    auto b = reductions[i - 1];

    // b.stride = a.shape * a.stride then a and b are contiguous
    if (b.second == a.first * a.second) {
      reductions.erase(reductions.begin() + i);
      reductions[i - 1] = std::make_pair(a.first * b.first, a.second);
    }
  }

  Shape shape;
  Strides strides;
  for (auto r : reductions) {
    shape.push_back(r.first);
    strides.push_back(r.second);
  }

  // We can call the contiguous reduction op for every weird way the input is
  // structured in the rest of the axes.
  if (strides.back() == 1) {
    return ReductionPlan(GeneralContiguousReduce, shape, strides);
  }

  // Delegate to the general strided reduction op if the axes after
  // strides.back() are contiguous.
  if (strides.back() > 1) {
    // `size` accumulates the extent of the row contiguous run found by walking
    // the axes of x from the last one backwards, skipping the last reduction
    // axis.
    int64_t size = 1;
    bool have_expand = false;
    for (int i = x.ndim() - 1; i >= 0; i--) {
      if (axes.back() == i) {
        continue;
      }

      auto stride_i = x.strides()[i];
      auto shape_i = x.shape(i);
      if (stride_i == 0) {
        if (shape_i == 1) {
          continue;
        }

        have_expand = true;
        break;
      }

      if (stride_i != size && shape_i != 1) {
        break;
      }
      size *= shape_i;
    }
    // In the case of an expanded dimension we are being conservative and
    // require the smallest reduction stride to be smaller than the maximum row
    // contiguous size. The reason is that we can't easily know if the reduced
    // axis is before or after an expanded dimension.
    if (size > strides.back() || (size == strides.back() && !have_expand)) {
      return ReductionPlan(GeneralStridedReduce, shape, strides);
    }
  }

  return ReductionPlan(GeneralReduce, shape, strides);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/reduce.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/backend/common/utils.h"

namespace mlx::core {

// The reduction strategies get_reduction_plan can select. In the layout
// sketches below N marks an axis that is kept and R an axis that is reduced.
enum ReductionOpType {
  // Self-explanatory. Read everything and produce 1 output.
  ContiguousAllReduce,

  // The input is contiguous and the last axis is reduced
  // N1xR1xN2xR2x...xNnxRn
  ContiguousReduce,

  // The input is contiguous and the last axis is not reduced
  // R1xN1xR2xN2x...xRnxNn
  ContiguousStridedReduce,

  // The input is not contiguous but the last axis is and it is reduced so we
  // need to figure out the offsets but we can call the contiguous reduce after
  // that.
  // N3xR1xN1xR4x...xRn
  GeneralContiguousReduce,

  // The input is not contiguous but the last reduction axis and the last axis
  // are so we need to figure out the offset but we can call the strided reduce
  // after that.
  GeneralStridedReduce,

  // The input is not contiguous after the reduction axis and it may contain
  // 0-stride axes or transpositions. We could copy the strides and produce a
  // transposed outcome or we can read the input out of order and write the
  // output in order.
  GeneralReduce
};

// The result of get_reduction_plan: which reduction strategy to use and the
// shape and strides of the axes being reduced.
struct ReductionPlan {
  // Selected strategy.
  ReductionOpType type;
  // Sizes of the reduction axes.
  Shape shape;
  // Strides of the reduction axes, matching `shape` entry by entry.
  Strides strides;

  // Plan with explicit reduction shape and strides (taken by value and moved
  // into place).
  ReductionPlan(ReductionOpType type_, Shape shape_, Strides strides_)
      : type(type_), shape(std::move(shape_)), strides(std::move(strides_)) {}
  // Plan with empty shape and strides. Not explicit, so a bare
  // ReductionOpType converts to a plan.
  ReductionPlan(ReductionOpType type_) : type(type_) {}
};

// Pick the reduction strategy for reducing `x` over `axes`, together with the
// shape and strides of the reduction axes.
ReductionPlan get_reduction_plan(const array& x, const std::vector<int>& axes);

// Return the shape and strides of `x` with the entries listed in `axes`
// removed.
std::pair<Shape, Strides> shapes_without_reduction_axes(
    const array& x,
    const std::vector<int>& axes);
// Same as above for an explicit shape and strides pair.
std::pair<Shape, Strides> shapes_without_reduction_axes(
    Shape shape,
    Strides strides,
    const std::vector<int>& axes);

} // namespace mlx::core


================================================
FILE: mlx/backend/common/slicing.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/common/utils.h"

namespace mlx::core {

std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
    const Shape& start_indices,
    const Shape& strides) {
  int64_t data_offset = 0;
  Strides inp_strides(in.ndim(), 0);
  for (int i = 0; i < in.ndim(); ++i) {
    data_offset += start_indices[i] * in.strides()[i];
    inp_strides[i] = in.strides()[i] * strides[i];
  }
  return std::make_tuple(data_offset, inp_strides);
}

// Make `out` a view into the buffer of `in`.
//
// `out` shares the buffer of `in` with strides `out_strides`, starting
// `data_offset` elements in and spanning `data_size` elements. The flags are
// those of `in` with the contiguity bits recomputed for the shape of `out`.
void shared_buffer_slice(
    const array& in,
    const Strides& out_strides,
    int64_t data_offset,
    size_t data_size,
    array& out) {
  // Compute row/col contiguity
  auto [dense_size, row_major, col_major] =
      check_contiguity(out.shape(), out_strides);

  auto out_flags = in.flags();
  // The view is contiguous when it touches every element of its span.
  out_flags.contiguous = (dense_size == data_size);
  out_flags.row_contiguous = row_major;
  out_flags.col_contiguous = col_major;

  out.copy_shared_buffer(in, out_strides, out_flags, data_size, data_offset);
}

// Set `out` to the strided slice of `in` that starts at `start_indices` and
// steps by `strides` along each axis. The shape of the slice is the shape
// `out` already has.
//
// No data is copied: `out` becomes a view into the buffer of `in`. An empty
// `out` just gets an empty allocation.
//
// Throws std::runtime_error if the computed span of the view is negative.
void slice(
    const array& in,
    array& out,
    const Shape& start_indices,
    const Shape& strides) {
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  // Calculate out strides, initial offset
  auto [data_offset, inp_strides] = prepare_slice(in, start_indices, strides);

  // Find the lowest and highest element reached from the start of the slice:
  // axes with a positive stride extend the upper end, all others the lower.
  const auto& out_shape = out.shape();
  int64_t lowest = 0;
  int64_t highest = 0;
  for (size_t axis = 0; axis < inp_strides.size(); ++axis) {
    const auto stride = inp_strides[axis];
    const auto extent = stride * (out_shape[axis] - 1);
    (stride > 0 ? highest : lowest) += extent;
  }

  const int64_t data_size = (highest - lowest) + 1;
  if (data_size < 0) {
    std::ostringstream msg;
    msg << "[slice] Computed invalid data size: " << data_size << ".";
    throw std::runtime_error(msg.str());
  }
  shared_buffer_slice(in, inp_strides, data_offset, data_size, out);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/slicing.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/array.h"

namespace mlx::core {

// Compute the element offset of the first element of a strided slice of `in`
// and the strides of that slice.
//
// `start_indices` gives the first index and `strides` the step along each
// axis of `in`.
std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
    const Shape& start_indices,
    const Shape& strides);

// Set `out` to the strided slice of `in` that starts at `start_indices` and
// steps by `strides` along each axis. `out` shares the buffer of `in`; the
// shape of the slice is the shape `out` already has.
void slice(
    const array& in,
    array& out,
    const Shape& start_indices,
    const Shape& strides);

} // namespace mlx::core


================================================
FILE: mlx/backend/common/ternary.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once
#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/common/utils.h"

namespace mlx::core {

// How the three inputs (a, b, c) of a ternary op are laid out, as classified
// by get_ternary_op_type. In the names each word describes one input, in
// order: "Scalar" for an input with a single data element and "Vector" for a
// contiguous input.
// TODO: Add support for more combinations of input types.
enum class TernaryOpType {
  ScalarScalarScalar,
  VectorVectorVector,
  VectorVectorScalar,
  VectorScalarVector,
  // Anything that does not match one of the cases above.
  General,
};

// Classify the layout of the three inputs of a ternary op.
//
// The checks are made in this order:
// - all three hold a single element: ScalarScalarScalar
// - all three are row contiguous, or all three are column contiguous:
//   VectorVectorVector
// - b holds a single element and a and c are row contiguous:
//   VectorScalarVector
// - c holds a single element and a and b are row contiguous:
//   VectorVectorScalar
// - anything else: General
inline TernaryOpType
get_ternary_op_type(const array& a, const array& b, const array& c) {
  const bool b_is_scalar = b.data_size() == 1;
  const bool c_is_scalar = c.data_size() == 1;
  if (a.data_size() == 1 && b_is_scalar && c_is_scalar) {
    return TernaryOpType::ScalarScalarScalar;
  }

  const auto& a_flags = a.flags();
  const auto& b_flags = b.flags();
  const auto& c_flags = c.flags();

  const bool all_row_contiguous = a_flags.row_contiguous &&
      b_flags.row_contiguous && c_flags.row_contiguous;
  const bool all_col_contiguous = a_flags.col_contiguous &&
      b_flags.col_contiguous && c_flags.col_contiguous;
  if (all_row_contiguous || all_col_contiguous) {
    return TernaryOpType::VectorVectorVector;
  }

  if (b_is_scalar && a_flags.row_contiguous && c_flags.row_contiguous) {
    return TernaryOpType::VectorScalarVector;
  }
  if (c_is_scalar && a_flags.row_contiguous && b_flags.row_contiguous) {
    return TernaryOpType::VectorVectorScalar;
  }
  return TernaryOpType::General;
}

// Give `out` the storage it needs to receive the result of a ternary op on
// `a`, `b` and `c`, reusing an input buffer when one can be donated.
//
// `topt` is the layout classification of the inputs and `mallocfn` is used
// for any fresh allocation.
inline void set_ternary_op_output_data(
    const array& a,
    const array& b,
    const array& c,
    array& out,
    TernaryOpType topt,
    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
  // Make `out` share the buffer of `x` if possible. Returns whether it did.
  auto maybe_donate = [&out](const array& x) {
    if (is_donatable(x, out)) {
      out.copy_shared_buffer(x);
      return true;
    }
    return false;
  };

  switch (topt) {
    case TernaryOpType::ScalarScalarScalar:
      // A single element, with the strides and flags of b.
      out.set_data(mallocfn(out.itemsize()), 1, b.strides(), b.flags());
      break;
    case TernaryOpType::VectorVectorVector:
      // Try the inputs in order and stop at the first one that donates.
      // Otherwise allocate with the data size and layout of b.
      if (!(maybe_donate(a) || maybe_donate(b) || maybe_donate(c))) {
        out.set_data(
            mallocfn(out.itemsize() * b.data_size()),
            b.data_size(),
            b.strides(),
            b.flags());
      }
      break;
    case TernaryOpType::VectorVectorScalar:
    case TernaryOpType::VectorScalarVector:
    case TernaryOpType::General:
      // Try to donate an input which is row_contiguous
      if (!((a.flags().row_contiguous && maybe_donate(a)) ||
            (b.flags().row_contiguous && maybe_donate(b)) ||
            (c.flags().row_contiguous && maybe_donate(c)))) {
        out.set_data(mallocfn(out.nbytes()));
      }
      break;
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/unary.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/allocator.h"
#include "mlx/backend/common/utils.h"

namespace mlx::core {

// Give `out` the storage it needs to receive the result of a unary op on
// `in`.
//
// - A non-contiguous input gets a fresh buffer of out.nbytes() bytes.
// - A contiguous input donates its buffer when possible; otherwise a buffer
//   with room for in.data_size() elements is allocated and `out` takes over
//   the strides and flags of `in`.
inline void set_unary_output_data(
    const array& in,
    array& out,
    std::function<allocator::Buffer(size_t)> mallocfn = allocator::malloc) {
  if (!in.flags().contiguous) {
    out.set_data(mallocfn(out.nbytes()));
    return;
  }

  if (is_donatable(in, out)) {
    out.copy_shared_buffer(in);
    return;
  }

  const auto num_elements = in.data_size();
  out.set_data(
      mallocfn(num_elements * out.itemsize()),
      num_elements,
      in.strides(),
      in.flags());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/utils.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <dlfcn.h>

#include "mlx/backend/common/utils.h"

namespace mlx::core {

// Return the directory of the binary (executable or shared library) that
// contains this function.
//
// The path is looked up once with dladdr, using the address of this function,
// and cached in a function-local static for later calls.
//
// Throws std::runtime_error if dladdr fails.
std::filesystem::path current_binary_dir() {
  static std::filesystem::path binary_dir = []() {
    Dl_info info;
    if (!dladdr(reinterpret_cast<void*>(&current_binary_dir), &info)) {
      throw std::runtime_error("Unable to get current binary dir.");
    }
    // dli_fname is the path of the object containing the address.
    return std::filesystem::path(info.dli_fname).parent_path();
  }();
  return binary_dir;
}

// Collapse the dims of `shape` that are contiguous in every entry of
// `strides` at once.
//
// Two neighbouring dims are merged only if, for all stride vectors,
// strides[i] * shape[i] == strides[i - 1] and the merged size does not exceed
// `size_cap`. Dims of size 1 are dropped.
//
// Returns the collapsed shape and, for each input stride vector, the strides
// of the collapsed dims. A non-empty `shape` never collapses to an empty one:
// if every dim was dropped a single dim of size 1 with stride 0 is returned.
std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const Shape& shape,
    const std::vector<Strides>& strides,
    int64_t size_cap) {
  // Make a vector that has axes separated with -1. Collapse all axes between
  // -1.
  Shape to_collapse;
  if (shape.size() > 0) {
    if (shape[0] != 1) {
      to_collapse.push_back(0);
    }
    // Running size of the group of dims currently being merged.
    size_t size = shape[0];
    for (int i = 1; i < shape.size(); i++) {
      bool contiguous = true;
      size *= shape[i];
      for (const auto& st : strides) {
        if (st[i] * shape[i] != st[i - 1] || size > size_cap) {
          contiguous = false;
          // Dim i starts a new group.
          size = shape[i];
          break;
        }
      }
      if (!contiguous) {
        to_collapse.push_back(-1);
      }
      if (shape[i] != 1) {
        to_collapse.push_back(i);
      }
    }
    // Trailing separator so that every group is terminated by a -1.
    to_collapse.push_back(-1);
  }

  Shape out_shape;
  std::vector<Strides> out_strides(strides.size());
  for (int i = 0;;) {
    // Skip separators (including empty groups) to the start of the next group.
    while (i < to_collapse.size() && to_collapse[i] == -1) {
      ++i;
    };
    if (i == to_collapse.size()) {
      break;
    }
    // Multiply the sizes of all the dims in the group.
    int current_shape = shape[to_collapse[i]];
    int k = i;
    while (to_collapse[++k] != -1) {
      current_shape *= shape[to_collapse[k]];
    }
    out_shape.push_back(current_shape);
    // The collapsed dim takes the stride of the last dim of the group.
    for (int j = 0; j < strides.size(); j++) {
      const auto& st = strides[j];
      out_strides[j].push_back(st[to_collapse[k - 1]]);
    }
    i = k + 1;
  }

  if (!shape.empty() && out_shape.empty()) {
    out_shape.push_back(1);
    for (auto& out_stride : out_strides) {
      out_stride.push_back(0);
    }
  }
  return std::make_tuple(out_shape, out_strides);
}

std::pair<Shape, Strides> collapse_contiguous_dims(
    const Shape& shape,
    const Strides& strides,
    int64_t size_cap) {
  Shape collapsed_shape;
  Strides collapsed_strides;

  if (shape.size() > 0) {
    collapsed_shape.push_back(shape[0]);
    collapsed_strides.push_back(strides[0]);
    for (int i = 1; i < shape.size(); i++) {
      if (shape[i] == 1) {
        continue;
      } else if (
          strides[i] * shape[i] != collapsed_strides.back() ||
          collapsed_shape.back() * static_cast<int64_t>(shape[i]) > size_cap) {
        collapsed_shape.push_back(shape[i]);
        collapsed_strides.push_back(strides[i]);
      } else {
        collapsed_shape.back() *= shape[i];
        collapsed_strides.back() = strides[i];
      }
    }
  }

  return std::make_pair(collapsed_shape, collapsed_strides);
}

// Collapse the contiguous dims of the array `a`, using its own shape and
// strides. See the shape/strides overload for the meaning of `size_cap`.
std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
    int64_t size_cap /* = std::numeric_limits<int32_t>::max()*/) {
  return collapse_contiguous_dims(a.shape(), a.strides(), size_cap);
}

// Compute thread block dimensions that fit the given input dimensions.
//
// Every block dimension is a power of two that is no larger than the matching
// input dimension (and at least 1), and the product of the three is at most
// 2^pow2. The budget of `pow2` doublings is handed out round-robin over dim0,
// dim1 and dim2, so the block grows evenly along the dimensions that still
// have room.
Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 /* = 10 */) {
  const int dims[3] = {dim0, dim1, dim2};
  int pows[3] = {0, 0, 0};
  int sum = 0;
  while (sum < pow2) {
    const int presum = sum;
    // Offer one doubling to each dimension in turn. The budget is re-checked
    // after every doubling so that the block never exceeds 2^pow2, whatever
    // the value of pow2.
    for (int d = 0; d < 3 && sum < pow2; ++d) {
      if (dims[d] >= (1 << (pows[d] + 1))) {
        pows[d]++;
        sum++;
      }
    }
    // No dimension could grow any further.
    if (sum == presum) {
      break;
    }
  }
  return std::make_tuple(1ul << pows[0], 1ul << pows[1], 1ul << pows[2]);
}

// Factor the dims of `shape` into a 2D grid.
//
// Each non-broadcast dim is multiplied into the first grid extent as long as
// the product stays below UINT32_MAX, and into the second one otherwise. The
// larger of the two extents is returned first and the third extent is
// always 1.
//
// Throws std::runtime_error if either extent ends up above UINT32_MAX.
Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides) {
  size_t grid_x = 1;
  size_t grid_y = 1;
  for (size_t axis = 0; axis < shape.size(); ++axis) {
    // Dims with strides of 0 are ignored as they
    // correspond to broadcasted dimensions
    if (strides[axis] == 0) {
      continue;
    }
    auto& extent = (grid_x * shape[axis] < UINT32_MAX) ? grid_x : grid_y;
    extent *= shape[axis];
  }

  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
    throw std::runtime_error("Unable to safely factor shape.");
  }
  if (grid_y > grid_x) {
    std::swap(grid_x, grid_y);
  }

  const auto gx = static_cast<uint32_t>(grid_x);
  const auto gy = static_cast<uint32_t>(grid_y);
  return std::make_tuple(gx, gy, 1);
}

// Factor the dims of `shape` into a 2D grid whose total size is the product
// of the non-broadcast dims divided by `divisor`.
//
// The larger of the two extents is returned first and the third extent is
// always 1.
//
// Throws std::runtime_error if either extent ends up above UINT32_MAX.
Dims get_2d_grid_dims_common(
    const Shape& shape,
    const Strides& strides,
    size_t divisor) {
  // Compute the 2d grid dimensions such that the total size of the grid is
  // divided by divisor.
  size_t grid_x = 1;
  size_t grid_y = 1;
  for (int i = 0; i < shape.size(); ++i) {
    // Broadcast dims (stride 0) do not contribute to the grid.
    if (strides[i] == 0) {
      continue;
    }

    // No need to add this shape we can just remove it from the divisor.
    if (divisor % shape[i] == 0) {
      divisor /= shape[i];
      continue;
    }

    if (grid_x * shape[i] < UINT32_MAX) {
      grid_x *= shape[i];
    } else {
      grid_y *= shape[i];
    }

    // Divide out what is left of the divisor as soon as one of the extents is
    // a multiple of it.
    if (divisor > 1) {
      if (grid_x % divisor == 0) {
        grid_x /= divisor;
        divisor = 1;
      } else if (grid_y % divisor == 0) {
        grid_y /= divisor;
        divisor = 1;
      }
    }
  }
  if (grid_y > UINT32_MAX || grid_x > UINT32_MAX) {
    throw std::runtime_error("Unable to safely factor shape.");
  }
  if (grid_y > grid_x) {
    std::swap(grid_x, grid_y);
  }
  // The divisor could not be divided out exactly: round the first extent up
  // to the next multiple of what is left of it.
  if (divisor > 1) {
    grid_x = ((grid_x + divisor - 1) / divisor) * divisor;
  }
  return std::make_tuple(
      static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y), 1);
}

// Compute a block size for the given dimensions and the number of blocks
// needed along each dimension to cover it.
//
// Returns {grid, block}, where the grid is measured in blocks (each dimension
// is divided by its block extent, rounding up).
std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2) {
  const auto block = get_block_dims_common(dim0, dim1, dim2);
  const auto bx = std::get<0>(block);
  const auto by = std::get<1>(block);
  const auto bz = std::get<2>(block);

  // Number of blocks of `extent` needed to cover `dim`.
  auto blocks_needed = [](int dim, uint32_t extent) {
    return (dim + extent - 1) / extent;
  };

  return std::make_pair(
      std::make_tuple(
          blocks_needed(dim0, bx),
          blocks_needed(dim1, by),
          blocks_needed(dim2, bz)),
      std::make_tuple(bx, by, bz));
}

} // namespace mlx::core


================================================
FILE: mlx/backend/common/utils.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <filesystem>
#include <tuple>
#include <vector>

#include "mlx/array.h"

namespace mlx::core {

// Return the directory that contains current shared library.
std::filesystem::path current_binary_dir();

// Map the linear (row-major) index `elem` of an element of an array with the
// given `shape` to its offset in memory according to `strides`.
inline int64_t
elem_to_loc(int elem, const Shape& shape, const Strides& strides) {
  int64_t offset = 0;
  // Peel off one coordinate at a time, starting from the last axis.
  for (int axis = static_cast<int>(shape.size()); axis-- > 0;) {
    const auto split = ldiv(elem, shape[axis]);
    offset += split.rem * strides[axis];
    elem = split.quot;
  }
  return offset;
}

// Map the linear (row-major) index `elem` of an element of `a` to its offset
// in memory. For a row contiguous array the offset is the index itself.
inline int64_t elem_to_loc(int elem, const array& a) {
  return a.flags().row_contiguous
      ? elem
      : elem_to_loc(elem, a.shape(), a.strides());
}

// Return the row-major strides of a densely packed array of the given
// `shape`: the last stride is 1 and every other stride is the product of the
// sizes of the dims that follow it.
inline Strides make_contiguous_strides(const Shape& shape) {
  Strides strides(shape.size(), 1);
  int64_t running = 1;
  for (int axis = static_cast<int>(shape.size()) - 1; axis > 0; --axis) {
    running *= shape[axis];
    strides[axis - 1] = running;
  }
  return strides;
}

// Collapse dims that are contiguous to possibly route to a better kernel
// e.g. for x = transpose(array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 2, 2}), {2, 0, 1})
// should return {{2, 4}, {{1, 2}}}.
//
// When multiple arrays are passed they should all have the same shape. The
// collapsed axes are also the same so one shape is returned.
//
// Dims are not merged if the merged dim would hold more than `size_cap`
// elements.
std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const Shape& shape,
    const std::vector<Strides>& strides,
    int64_t size_cap = std::numeric_limits<int32_t>::max());

inline std::tuple<Shape, std::vector<Strides>> collapse_contiguous_dims(
    const std::vector<array>& xs,
    size_t size_cap = std::numeric_limits<int32_t>::max()) {
  std::vector<Strides> strides;
  for (auto& x : xs) {
    strides.emplace_back(x.strides());
  }
  return collapse_contiguous_dims(xs[0].shape(), strides, size_cap);
}

// Variadic convenience overload: gather the arrays `xs...` into a vector and
// collapse their contiguous dims together, with the default size cap.
template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
inline auto collapse_contiguous_dims(Arrays&&... xs) {
  return collapse_contiguous_dims(
      std::vector<array>{std::forward<Arrays>(xs)...});
}

// The single array version of the above.
//
// Returns the collapsed shape and strides. Dims are not merged if the merged
// dim would hold more than `size_cap` elements.
std::pair<Shape, Strides> collapse_contiguous_dims(
    const Shape& shape,
    const Strides& strides,
    int64_t size_cap = std::numeric_limits<int32_t>::max());
// Same as above, using the shape and strides of `a`.
std::pair<Shape, Strides> collapse_contiguous_dims(
    const array& a,
    int64_t size_cap = std::numeric_limits<int32_t>::max());

// Compute the thread block dimensions which fit the given
// input dimensions.
// - The thread block dimensions will be powers of two
// - The thread block size (the product of the three dimensions) is meant to
//   be at most 2^pow2, i.e. 1024 threads with the default
using Dims = std::tuple<uint32_t, uint32_t, uint32_t>;
Dims get_block_dims_common(int dim0, int dim1, int dim2, int pow2 = 10);

// Computes a 2D grid where each element is < UINT_MAX
// Assumes:
// - overall size (product of non-broadcasted dimensions) is < UINT_MAX^2
// - shape and strides correspond to a contiguous (no holes) but
//   possibly broadcasted array
// The third element of the returned Dims is always 1.
Dims get_2d_grid_dims_common(const Shape& shape, const Strides& strides);

// Same as above but we do an implicit division with divisor.
// Basically, equivalent to factorizing
//    Prod(s \forall s in shape if strides[s] > 0) / divisor.
// When the divisor cannot be divided out exactly, the first grid dimension is
// rounded up to a multiple of what is left of it.
Dims get_2d_grid_dims_common(
    const Shape& shape,
    const Strides& strides,
    size_t divisor);

// Get both the block and a grid of blocks that covers dim0, dim1 and dim2.
// The result is {grid, block}, with the grid measured in blocks.
std::pair<Dims, Dims> get_grid_and_block_common(int dim0, int dim1, int dim2);

// Walks the elements of a strided array in row-major order while keeping
// track of the memory offset (`loc`) of the current element.
//
// The shape and strides are collapsed with collapse_contiguous_dims on
// construction, so the iterator works on as few dims as possible.
struct ContiguousIterator {
  // Advance to the next element.
  inline void step() {
    int dims = shape_.size();
    if (dims == 0) {
      return;
    }
    int i = dims - 1;
    // Reset every trailing dim that is at its last position and carry into
    // the dim before it. The first dim is never reset.
    while (pos_[i] == (shape_[i] - 1) && i > 0) {
      pos_[i] = 0;
      loc -= (shape_[i] - 1) * strides_[i];
      i--;
    }
    pos_[i]++;
    loc += strides_[i];
  }

  // Advance by `s` elements.
  void step(int64_t s) {
    int dims = shape_.size();
    if (dims == 0) {
      return;
    }
    int i = dims - 1;
    while (s > 0) {
      if (shape_[i] - pos_[i] > 1) {
        // Take as many steps as possible within the current dim in one go.
        int steps = static_cast<int>(
            std::min(static_cast<int64_t>(shape_[i] - pos_[i] - 1), s));
        pos_[i] += steps;
        loc += strides_[i] * steps;
        s -= steps;
      } else {
        // The current dim is exhausted: carry into the previous dims, exactly
        // like a single step().
        while (pos_[i] == (shape_[i] - 1) && i > 0) {
          pos_[i] = 0;
          loc -= (shape_[i] - 1) * strides_[i];
          i--;
        }
        pos_[i]++;
        loc += strides_[i];
        s--;
      }
    }
  }

  // Size of the last (collapsed) dim if its stride is 1, otherwise 0. Also 0
  // when there are no dims.
  int64_t contiguous_suffix() {
    if (shape_.size() == 0) {
      return 0;
    }
    return (strides_.back() == 1) ? shape_.back() : 0;
  }

  // Move to the element with linear (row-major) index `n`.
  void seek(int64_t n) {
    loc = 0;
    for (int i = shape_.size() - 1; i >= 0; --i) {
      auto q_and_r = ldiv(n, shape_[i]);
      loc += q_and_r.rem * strides_[i];
      pos_[i] = q_and_r.rem;
      n = q_and_r.quot;
    }
  }

  // Go back to the first element.
  void reset() {
    loc = 0;
    std::fill(pos_.begin(), pos_.end(), 0);
  }

  // Iterator over no dims: step() is a no-op and loc stays 0.
  ContiguousIterator() {};

  // Iterate over all the dims of `a`.
  explicit ContiguousIterator(const array& a)
      : shape_(a.shape()), strides_(a.strides()) {
    if (!shape_.empty()) {
      std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
      pos_ = Shape(shape_.size(), 0);
    }
  }

  // Iterate over the first `dims` dims of the given shape and strides.
  explicit ContiguousIterator(
      const Shape& shape,
      const Strides& strides,
      int dims)
      : shape_(shape.begin(), shape.begin() + dims),
        strides_(strides.begin(), strides.begin() + dims) {
    if (!shape_.empty()) {
      std::tie(shape_, strides_) = collapse_contiguous_dims(shape_, strides_);
      pos_ = Shape(shape_.size(), 0);
    }
  }

  // Memory offset, in elements, of the current element.
  int64_t loc{0};

 private:
  // Collapsed shape and strides being iterated over.
  Shape shape_;
  Strides strides_;
  // Current position along each collapsed dim.
  Shape pos_;
};

// Inspect the layout described by `shape` and `strides`.
//
// Returns a tuple of
// - the product of the dims that have a positive stride (i.e. the number of
//   elements, not counting dims with a zero or negative stride),
// - whether the layout is row contiguous,
// - whether the layout is column contiguous.
//
// Dims of size 1 are ignored by both contiguity checks.
inline auto check_contiguity(const Shape& shape, const Strides& strides) {
  size_t no_broadcast_data_size = 1;
  // Expected stride of the current dim when walking forward (column major)
  // and backward (row major).
  int64_t f_stride = 1;
  int64_t b_stride = 1;
  bool is_row_contiguous = true;
  bool is_col_contiguous = true;

  // `i` walks the dims from the first to the last while `ri` walks them from
  // the last to the first.
  for (int i = 0, ri = shape.size() - 1; ri >= 0; i++, ri--) {
    is_col_contiguous &= strides[i] == f_stride || shape[i] == 1;
    is_row_contiguous &= strides[ri] == b_stride || shape[ri] == 1;
    f_stride *= shape[i];
    b_stride *= shape[ri];
    if (strides[i] > 0) {
      no_broadcast_data_size *= shape[i];
    }
  }

  return std::make_tuple(
      no_broadcast_data_size, is_row_contiguous, is_col_contiguous);
}

// Whether the buffer of `in` can be reused as the buffer of `out`.
//
// This requires that `in` reports itself as donatable, that both arrays have
// the same item size and that the buffer of `in` is at most `donation_extra`
// bytes larger than what `out` needs.
inline bool is_donatable(const array& in, const array& out) {
  constexpr size_t donation_extra = 16384;

  if (!in.is_donatable()) {
    return false;
  }
  if (in.itemsize() != out.itemsize()) {
    return false;
  }
  return in.buffer_size() <= out.nbytes() + donation_extra;
}

// NOTE(review): the definitions of the two functions below are not in this
// file. Judging by the names and signatures, prepare_reshape presumably
// reports whether reshaping `in` into the shape of `out` needs a copy,
// together with the strides `out` would have, and shared_buffer_reshape
// presumably makes `out` share the buffer of `in` with those strides —
// confirm against the definitions.
std::pair<bool, Strides> prepare_reshape(const array& in, const array& out);

void shared_buffer_reshape(
    const array& in,
    const Strides& out_strides,
    array& out);

// Return a copy of `vec` with the element at position `index` erased.
template <typename T>
inline SmallVector<T> remove_index(SmallVector<T> vec, size_t index) {
  auto victim = vec.begin();
  std::advance(victim, index);
  vec.erase(victim);
  return vec;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/CMakeLists.txt
================================================
# Choose the compiler handed to the preamble generation script below: the C
# compiler (and CLANG=TRUE) on Darwin, the C++ compiler everywhere else.
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
  set(COMPILER ${CMAKE_C_COMPILER})
  set(CLANG TRUE)
else()
  set(COMPILER ${CMAKE_CXX_COMPILER})
endif()

# Headers listed as dependencies of the generated compiled_preamble.cpp: a
# change to any of them re-runs the generation command below.
set(COMPILE_DEPS
    ${PROJECT_SOURCE_DIR}/mlx/types/half_types.h
    ${PROJECT_SOURCE_DIR}/mlx/types/fp16.h
    ${PROJECT_SOURCE_DIR}/mlx/types/bf16.h
    ${PROJECT_SOURCE_DIR}/mlx/types/complex.h
    simd/simd.h
    simd/base_simd.h
    simd/math.h
    simd/type.h
    unary_ops.h
    binary_ops.h)

# Pick the script flavour and interpreter used to generate the preamble:
# PowerShell (.ps1) under MSVC, bash (.sh) otherwise.
if(MSVC)
  set(SHELL_EXT ps1)
  set(SHELL_CMD powershell -ExecutionPolicy Bypass -File)
else()
  set(SHELL_EXT sh)
  set(SHELL_CMD bash)
endif()

# Generate compiled_preamble.cpp in the build directory by running
# make_compiled_preamble.<ext> with, in order: the output path, the compiler,
# the project source directory, the CLANG flag and the target processor.
# NOTE(review): in this file CLANG is only set in the Darwin branch; where it
# is unset the unquoted ${CLANG} contributes no argument, so the processor
# would become the 4th script argument — confirm the script tolerates this.
add_custom_command(
  OUTPUT compiled_preamble.cpp
  COMMAND
    ${SHELL_CMD} ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.${SHELL_EXT}
    ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp ${COMPILER}
    ${PROJECT_SOURCE_DIR} ${CLANG} ${CMAKE_SYSTEM_PROCESSOR}
  DEPENDS make_compiled_preamble.${SHELL_EXT} compiled_preamble.h
          ${COMPILE_DEPS})

# Wrap the generated file in a target and make `mlx` depend on it so the
# preamble is generated before `mlx` is built.
add_custom_target(cpu_compiled_preamble DEPENDS compiled_preamble.cpp)

add_dependencies(mlx cpu_compiled_preamble)

# CPU backend sources that are always compiled into `mlx`, plus the generated
# preamble from the build directory.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/device_info.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eig.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eigh.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/encoder.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cblas.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/masked_mm.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/select.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/threefry.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/luf.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/qrf.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/svd.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/inverse.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/cholesky.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_BINARY_DIR}/compiled_preamble.cpp)

# Half-precision GEMM implementation: gemms/bnns.cpp when
# MLX_BUILD_ACCELERATE is on, otherwise the simd_fp16 / simd_bf16 sources.
if(MLX_BUILD_ACCELERATE)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/bnns.cpp)
else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/simd_fp16.cpp
                             ${CMAKE_CURRENT_SOURCE_DIR}/gemms/simd_bf16.cpp)
endif()

# On iOS use the compiled.cpp from ../no_cpu instead of this directory's
# compiled.cpp and jit_compiler.cpp.
if(IOS)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../no_cpu/compiled.cpp)
else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
                             ${CMAKE_CURRENT_SOURCE_DIR}/jit_compiler.cpp)
endif()


================================================
FILE: mlx/backend/cpu/arange.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/array.h"
#include "mlx/backend/cpu/encoder.h"

namespace mlx::core {

namespace {

// Fill `out` with `size` values starting at `start` and advancing by
// `next - start` per element. The fill is dispatched as a task on the CPU
// command encoder of `stream`, with `out` registered as its output.
//
// The loop index is `size_t` to match `size`: an `int` index compares
// signed against unsigned and overflows for sizes above INT_MAX.
template <typename T>
void arange(T start, T next, array& out, size_t size, Stream stream) {
  auto ptr = out.data<T>();
  auto step_size = next - start;
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(out);
  encoder.dispatch([ptr, start, step_size, size]() mutable {
    for (size_t i = 0; i < size; ++i) {
      ptr[i] = start;
      start += step_size;
    }
  });
}

} // namespace

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/arg_reduce.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cassert>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

template <typename InT, typename OpT>
void arg_reduce(const array& in, array& out, const OpT& op, int axis) {
  auto axis_size = in.shape()[axis];
  auto axis_stride = in.strides()[axis];
  Strides strides = remove_index(in.strides(), axis);
  Shape shape = remove_index(in.shape(), axis);
  auto in_ptr = in.data<InT>();
  auto out_ptr = out.data<uint32_t>();

  for (uint32_t i = 0; i < out.size(); ++i) {
    auto loc = elem_to_loc(i, shape, strides);
    auto local_in_ptr = in_ptr + loc;
    uint32_t ind_v = 0;
    InT v = (*local_in_ptr);
    for (uint32_t j = 0; j < axis_size; ++j, local_in_ptr += axis_stride) {
      op(j, (*local_in_ptr), &ind_v, &v);
    }
    out_ptr[i] = ind_v;
  }
}

// Select the comparison for the requested reduction and run arg_reduce.
// ArgMin keeps a strictly smaller value, ArgMax a strictly larger one, so on
// ties the earliest index along the axis is kept.
template <typename InT>
void arg_reduce_dispatch(
    const array& in,
    array& out,
    ArgReduce::ReduceType rtype,
    int axis) {
  if (rtype == ArgReduce::ArgMin) {
    auto keep_smaller = [](auto idx, auto val, auto best_idx, auto best_val) {
      if (val < *best_val) {
        *best_val = val;
        *best_idx = idx;
      }
    };
    arg_reduce<InT>(in, out, keep_smaller, axis);
  } else if (rtype == ArgReduce::ArgMax) {
    auto keep_larger = [](auto idx, auto val, auto best_idx, auto best_val) {
      if (val > *best_val) {
        *best_val = val;
        *best_idx = idx;
      }
    };
    arg_reduce<InT>(in, out, keep_larger, axis);
  }
}

} // namespace

void ArgReduce::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  out.set_data(allocator::malloc(out.nbytes()));
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  encoder.dispatch([in = array::unsafe_weak_copy(in),
                    out = array::unsafe_weak_copy(out),
                    reduce_type_ = reduce_type_,
                    axis_ = axis_]() mutable {
    switch (in.dtype()) {
      case bool_:
        arg_reduce_dispatch<bool>(in, out, reduce_type_, axis_);
        break;
      case uint8:
        arg_reduce_dispatch<uint8_t>(in, out, reduce_type_, axis_);
        break;
      case uint16:
        arg_reduce_dispatch<uint16_t>(in, out, reduce_type_, axis_);
        break;
      case uint32:
        arg_reduce_dispatch<uint32_t>(in, out, reduce_type_, axis_);
        break;
      case uint64:
        arg_reduce_dispatch<uint64_t>(in, out, reduce_type_, axis_);
        break;
      case int8:
        arg_reduce_dispatch<int8_t>(in, out, reduce_type_, axis_);
        break;
      case int16:
        arg_reduce_dispatch<int16_t>(in, out, reduce_type_, axis_);
        break;
      case int32:
        arg_reduce_dispatch<int32_t>(in, out, reduce_type_, axis_);
        break;
      case int64:
        arg_reduce_dispatch<int64_t>(in, out, reduce_type_, axis_);
        break;
      case float16:
        arg_reduce_dispatch<float16_t>(in, out, reduce_type_, axis_);
        break;
      case float32:
        arg_reduce_dispatch<float>(in, out, reduce_type_, axis_);
        break;
      case bfloat16:
        arg_reduce_dispatch<bfloat16_t>(in, out, reduce_type_, axis_);
        break;
      case float64:
        arg_reduce_dispatch<double>(in, out, reduce_type_, axis_);
        break;
      case complex64:
        arg_reduce_dispatch<complex64_t>(in, out, reduce_type_, axis_);
        break;
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/binary.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cassert>
#include <cmath>
#include <sstream>

#include "mlx/allocator.h"
#include "mlx/backend/cpu/binary.h"
#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/cpu/binary_two.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

// Element-wise addition of the two inputs on the CPU.
void Add::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Add(), stream());
}

// CPU evaluation of DivMod: produces two outputs from inputs (a, b).
// outputs[0] receives the quotient and outputs[1] the remainder:
//   * integral types: `x / y` and `x % y`
//   * floating types: `std::trunc(x / y)` and `std::fmod(x, y)`
// Complex inputs are rejected with std::runtime_error.
void DivMod::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  auto bopt = get_binary_op_type(a, b);
  auto& out_a = outputs[0];
  auto& out_b = outputs[1];
  set_binary_op_output_data(a, b, out_a, bopt);
  set_binary_op_output_data(a, b, out_b, bopt);

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out_a);
  encoder.set_output_array(out_b);

  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    b = array::unsafe_weak_copy(b),
                    out_a = array::unsafe_weak_copy(out_a),
                    out_b = array::unsafe_weak_copy(out_b),
                    bopt]() mutable {
    auto integral_op = [](auto x, auto y) {
      return std::make_pair(x / y, x % y);
    };
    auto float_op = [](auto x, auto y) {
      return std::make_pair(std::trunc(x / y), std::fmod(x, y));
    };

    switch (out_a.dtype()) {
      case bool_:
        binary_op<bool>(a, b, out_a, out_b, integral_op, bopt);
        // Without this break the bool case fell through and the data was
        // processed a second time by the uint8 kernel.
        break;
      case uint8:
        binary_op<uint8_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case uint16:
        binary_op<uint16_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case uint32:
        binary_op<uint32_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case uint64:
        binary_op<uint64_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case int8:
        binary_op<int8_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case int16:
        binary_op<int16_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case int32:
        binary_op<int32_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case int64:
        binary_op<int64_t>(a, b, out_a, out_b, integral_op, bopt);
        break;
      case float16:
        binary_op<float16_t>(a, b, out_a, out_b, float_op, bopt);
        break;
      case float32:
        binary_op<float>(a, b, out_a, out_b, float_op, bopt);
        break;
      case float64:
        binary_op<double>(a, b, out_a, out_b, float_op, bopt);
        break;
      case bfloat16:
        binary_op<bfloat16_t>(a, b, out_a, out_b, float_op, bopt);
        break;
      case complex64:
        // Should never get here
        throw std::runtime_error("[DivMod] Complex type not supported");
        break;
    }
  });
}

// Element-wise division of the two inputs on the CPU.
void Divide::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Divide(), stream());
}

// Element-wise remainder of the two inputs on the CPU.
void Remainder::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Remainder(), stream());
}

void Equal::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  if (equal_nan_) {
    auto bopt = get_binary_op_type(a, b);
    set_binary_op_output_data(a, b, out, bopt);

    auto& encoder = cpu::get_command_encoder(stream());
    encoder.set_input_array(a);
    encoder.set_input_array(b);
    encoder.set_output_array(out);
    encoder.dispatch([a = array::unsafe_weak_copy(a),
                      b = array::unsafe_weak_copy(b),
                      out = array::unsafe_weak_copy(out),
                      bopt]() mutable {
      switch (a.dtype()) {
        case float16:
          binary_op<float16_t, bool, detail::NaNEqual>(a, b, out, bopt);
          break;
        case float32:
          binary_op<float, bool, detail::NaNEqual>(a, b, out, bopt);
          break;
        case float64:
          binary_op<double, bool, detail::NaNEqual>(a, b, out, bopt);
          break;
        case bfloat16:
          binary_op<bfloat16_t, bool, detail::NaNEqual>(a, b, out, bopt);
          break;
        case complex64:
          binary_op<complex64_t, bool, detail::NaNEqual>(a, b, out, bopt);
          break;
        default:
          throw std::runtime_error(
              "[NanEqual::eval_cpu] Only for floating point types.");
      }
    });
  } else {
    comparison_op_cpu(a, b, out, detail::Equal(), stream());
  }
}

// Element-wise `a > b` on the CPU.
void Greater::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  comparison_op_cpu(lhs, rhs, out, detail::Greater(), stream());
}

// Element-wise `a >= b` on the CPU.
void GreaterEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  comparison_op_cpu(lhs, rhs, out, detail::GreaterEqual(), stream());
}

// Element-wise `a < b` on the CPU.
void Less::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  comparison_op_cpu(lhs, rhs, out, detail::Less(), stream());
}

// Element-wise `a <= b` on the CPU.
void LessEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  comparison_op_cpu(lhs, rhs, out, detail::LessEqual(), stream());
}

// Element-wise log-add-exp of the two inputs on the CPU (floating point
// dispatch only).
void LogAddExp::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_float_op_cpu(
      inputs[0], inputs[1], out, detail::LogAddExp(), stream());
}

// Element-wise logical AND of the two inputs on the CPU.
void LogicalAnd::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // exactly two operands are expected
  binary_op_cpu(inputs[0], inputs[1], out, detail::LogicalAnd(), stream());
}

// Element-wise logical OR of the two inputs on the CPU.
void LogicalOr::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2); // exactly two operands are expected
  binary_op_cpu(inputs[0], inputs[1], out, detail::LogicalOr(), stream());
}

// Element-wise maximum of the two inputs on the CPU.
void Maximum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Maximum(), stream());
}

// Element-wise minimum of the two inputs on the CPU.
void Minimum::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Minimum(), stream());
}

// Element-wise multiplication of the two inputs on the CPU.
void Multiply::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Multiply(), stream());
}

// Element-wise `a != b` on the CPU.
void NotEqual::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  comparison_op_cpu(lhs, rhs, out, detail::NotEqual(), stream());
}

// Element-wise power (first input raised to the second) on the CPU.
void Power::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Power(), stream());
}

// Element-wise subtraction (first input minus second) on the CPU.
void Subtract::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_op_cpu(inputs[0], inputs[1], out, detail::Subtract(), stream());
}

// CPU evaluation of the bitwise binary primitive: forwards to the integer
// kernel matching `op_` (and, or, xor, left shift, right shift).
void BitwiseBinary::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  switch (op_) {
    case BitwiseBinary::And:
      binary_int_op_cpu(lhs, rhs, out, detail::BitwiseAnd(), stream());
      return;
    case BitwiseBinary::Or:
      binary_int_op_cpu(lhs, rhs, out, detail::BitwiseOr(), stream());
      return;
    case BitwiseBinary::Xor:
      binary_int_op_cpu(lhs, rhs, out, detail::BitwiseXor(), stream());
      return;
    case BitwiseBinary::LeftShift:
      binary_int_op_cpu(lhs, rhs, out, detail::LeftShift(), stream());
      return;
    case BitwiseBinary::RightShift:
      binary_int_op_cpu(lhs, rhs, out, detail::RightShift(), stream());
      return;
  }
}

// Element-wise two-argument arctangent on the CPU (floating point dispatch
// only).
void ArcTan2::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  binary_float_op_cpu(inputs[0], inputs[1], out, detail::ArcTan2(), stream());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/binary.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once
#include <cassert>

#include "mlx/array.h"
#include "mlx/backend/common/binary.h"
#include "mlx/backend/common/utils.h"

#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"

namespace mlx::core {

// Applies `Op` between a contiguous run `a[0..size)` and the single value
// `*b`, writing to `dst[0..size)`.
//
// Full SIMD chunks of `simd::max_size<T>` elements are processed first and
// the remainder one element at a time.
//
// `size` is 64-bit: callers pass `size_t` data sizes and `int64_t` strides,
// which a plain `int` parameter would silently truncate for large arrays.
template <typename Op>
struct VectorScalar {
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst, int64_t size) {
    T scalar = *b;
    constexpr int N = simd::max_size<T>;
    while (size >= N) {
      simd::store(dst, Op{}(simd::load<T, N>(a), simd::Simd<T, N>(scalar)));
      dst += N;
      a += N;
      size -= N;
    }
    while (size-- > 0) {
      *dst = Op{}(*a, scalar);
      dst++;
      a++;
    }
  }
};

// Applies `Op` between the single value `*a` and a contiguous run
// `b[0..size)`, writing to `dst[0..size)`.
//
// Full SIMD chunks of `simd::max_size<T>` elements are processed first and
// the remainder one element at a time.
//
// `size` is 64-bit: callers pass `size_t` data sizes and `int64_t` strides,
// which a plain `int` parameter would silently truncate for large arrays.
template <typename Op>
struct ScalarVector {
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst, int64_t size) {
    T scalar = *a;
    constexpr int N = simd::max_size<T>;
    while (size >= N) {
      simd::store(dst, Op{}(simd::Simd<T, N>(scalar), simd::load<T, N>(b)));
      dst += N;
      b += N;
      size -= N;
    }
    while (size-- > 0) {
      *dst = Op{}(scalar, *b);
      dst++;
      b++;
    }
  }
};

// Applies `Op` element-wise between two contiguous runs `a[0..size)` and
// `b[0..size)`, writing to `dst[0..size)`.
//
// Full SIMD chunks of `simd::max_size<T>` elements are processed first and
// the remainder one element at a time.
//
// `size` is 64-bit: callers pass `size_t` sizes and `int64_t` strides, which
// a plain `int` parameter would silently truncate for large arrays.
template <typename Op>
struct VectorVector {
  template <typename T, typename U>
  void operator()(const T* a, const T* b, U* dst, int64_t size) {
    constexpr int N = simd::max_size<T>;
    while (size >= N) {
      simd::store(dst, Op{}(simd::load<T, N>(a), simd::load<T, N>(b)));
      dst += N;
      a += N;
      b += N;
      size -= N;
    }
    while (size-- > 0) {
      *dst = Op{}(*a, *b);
      dst++;
      a++;
      b++;
    }
  }
};

// Recursive D-dimensional loop nest starting at dimension `axis`.
// At the innermost level either a single element is computed (`Strided` ==
// false) or `Op` is invoked on a run whose length is the output stride of
// that dimension (`Strided` == true).
template <typename T, typename U, typename Op, int D, bool Strided>
void binary_op_dims(
    const T* a,
    const T* b,
    U* out,
    const Shape& shape,
    const Strides& a_strides,
    const Strides& b_strides,
    const Strides& out_strides,
    int axis) {
  auto extent = shape[axis];
  auto a_step = a_strides[axis];
  auto b_step = b_strides[axis];
  auto out_step = out_strides[axis];

  for (int i = 0; i < extent; i++) {
    if constexpr (D > 1) {
      // Descend into the next dimension.
      binary_op_dims<T, U, Op, D - 1, Strided>(
          a, b, out, shape, a_strides, b_strides, out_strides, axis + 1);
    } else if constexpr (Strided) {
      Op{}(a, b, out, out_step);
    } else {
      *out = Op{}(*a, *b);
    }
    a += a_step;
    b += b_step;
    out += out_step;
  }
}

// Run `Op` over `dim` dimensions of the given shape/strides.
// Up to three dimensions are handled directly by binary_op_dims. For more,
// the leading `dim - 3` dimensions are walked with ContiguousIterator (for
// the inputs) and a flat output offset, and the trailing three dimensions
// are handled by binary_op_dims.
//
// `size` is the total number of output elements. It is 64-bit so that the
// `size_t` element counts passed by callers are not truncated to `int`.
template <typename T, typename U, bool Strided, typename Op>
void binary_op_dispatch_dims(
    const T* a,
    const T* b,
    U* out,
    int dim,
    int64_t size,
    const Shape& shape,
    const Strides& a_strides,
    const Strides& b_strides,
    const Strides& out_strides) {
  switch (dim) {
    case 1:
      binary_op_dims<T, U, Op, 1, Strided>(
          a, b, out, shape, a_strides, b_strides, out_strides, 0);
      return;
    case 2:
      binary_op_dims<T, U, Op, 2, Strided>(
          a, b, out, shape, a_strides, b_strides, out_strides, 0);
      return;
    case 3:
      binary_op_dims<T, U, Op, 3, Strided>(
          a, b, out, shape, a_strides, b_strides, out_strides, 0);
      return;
  }

  ContiguousIterator a_it(shape, a_strides, dim - 3);
  ContiguousIterator b_it(shape, b_strides, dim - 3);
  // Output elements covered by one pass over the trailing three dimensions.
  auto stride = out_strides[dim - 4];
  for (int64_t elem = 0; elem < size; elem += stride) {
    binary_op_dims<T, U, Op, 3, Strided>(
        a + a_it.loc,
        b + b_it.loc,
        out + elem,
        shape,
        a_strides,
        b_strides,
        out_strides,
        dim - 3);
    a_it.step();
    b_it.step();
  }
}

// Apply the element-wise binary `Op` to `a` and `b` (element type T) and
// write the result to `out` (element type U).
//
// `bopt` describes the layout of the operands. The four fully contiguous
// cases are handled with a single call; for the general case the dimensions
// are collapsed and the code looks for a contiguous (or broadcast-scalar)
// trailing part that can be processed with the vectorized helpers.
template <typename T, typename U, typename Op>
void binary_op(const array& a, const array& b, array& out, BinaryOpType bopt) {
  auto a_ptr = a.data<T>();
  auto b_ptr = b.data<T>();

  auto out_ptr = out.data<U>();
  // The full computation is scalar scalar so call the base op once
  if (bopt == BinaryOpType::ScalarScalar) {
    *out_ptr = Op{}(*a_ptr, *b_ptr);
    return;
  }

  // The full computation is scalar vector so delegate to the op
  if (bopt == BinaryOpType::ScalarVector) {
    ScalarVector<Op>{}(a_ptr, b_ptr, out_ptr, b.data_size());
    return;
  }

  // The full computation is vector scalar so delegate to the op
  if (bopt == BinaryOpType::VectorScalar) {
    VectorScalar<Op>{}(a_ptr, b_ptr, out_ptr, a.data_size());
    return;
  }

  // The full computation is vector vector so delegate to the op
  if (bopt == BinaryOpType::VectorVector) {
    VectorVector<Op>{}(a_ptr, b_ptr, out_ptr, a.size());
    return;
  }

  // General computation so let's try to optimize
  auto [new_shape, new_strides] = collapse_contiguous_dims(
      a.shape(), {a.strides(), b.strides(), out.strides()});
  auto& a_strides = new_strides[0];
  auto& b_strides = new_strides[1];
  auto& strides = new_strides[2];

  // Get the left-most dim such that the array is row contiguous after
  // (i.e. its strides match the output strides from that dim onwards)
  auto leftmost_rc_dim = [&strides](const auto& arr_strides) {
    int d = arr_strides.size() - 1;
    for (; d >= 0 && arr_strides[d] == strides[d]; d--) {
    }
    return d + 1;
  };
  auto a_rc_dim = leftmost_rc_dim(a_strides);
  auto b_rc_dim = leftmost_rc_dim(b_strides);

  // Get the left-most dim such that the array is a broadcasted "scalar" after
  // (i.e. all strides are zero from that dim onwards)
  auto leftmost_s_dim = [](const auto& arr_strides) {
    int d = arr_strides.size() - 1;
    for (; d >= 0 && arr_strides[d] == 0; d--) {
    }
    return d + 1;
  };
  auto a_s_dim = leftmost_s_dim(a_strides);
  auto b_s_dim = leftmost_s_dim(b_strides);

  auto ndim = new_shape.size();

  // Case 1: LxM and FxM where L and F are broadcastable and M is row
  // contiguous
  int dim = ndim;
  if (int d = std::max(a_rc_dim, b_rc_dim); d < ndim) {
    bopt = BinaryOpType::VectorVector;
    dim = d;
    // Case 2: LxM and Fx1 where L and F are broadcastable and M is row
    // contiguous
  } else if (int d = std::max(a_rc_dim, b_s_dim); d < ndim) {
    bopt = BinaryOpType::VectorScalar;
    dim = d;
    // Case 3: Lx1 and FxM where L and F are broadcastable and M is row
    // contiguous
  } else if (int d = std::max(a_s_dim, b_rc_dim); d < ndim) {
    bopt = BinaryOpType::ScalarVector;
    dim = d;
  }

  // Can be sure dim > 0 since otherwise we would have used one of the fully
  // contiguous methods above. Except for the case that the flags do not
  // correspond to the underlying contiguity.
  // The vectorized path is also skipped when the contiguous run is shorter
  // than 16 elements; the element-wise general path is used instead.
  if (dim == 0 || strides[dim - 1] < 16) {
    bopt = BinaryOpType::General;
    dim = ndim;
  }

  // Loop over the leading `dim` dimensions; the strided variants hand each
  // contiguous trailing run to the matching vectorized helper.
  switch (bopt) {
    case BinaryOpType::VectorVector:
      binary_op_dispatch_dims<T, U, true, VectorVector<Op>>(
          a_ptr,
          b_ptr,
          out_ptr,
          dim,
          a.size(),
          new_shape,
          a_strides,
          b_strides,
          strides);
      break;
    case BinaryOpType::VectorScalar:
      binary_op_dispatch_dims<T, U, true, VectorScalar<Op>>(
          a_ptr,
          b_ptr,
          out_ptr,
          dim,
          a.size(),
          new_shape,
          a_strides,
          b_strides,
          strides);
      break;
    case BinaryOpType::ScalarVector:
      binary_op_dispatch_dims<T, U, true, ScalarVector<Op>>(
          a_ptr,
          b_ptr,
          out_ptr,
          dim,
          a.size(),
          new_shape,
          a_strides,
          b_strides,
          strides);
      break;
    default:
      binary_op_dispatch_dims<T, U, false, Op>(
          a_ptr,
          b_ptr,
          out_ptr,
          dim,
          a.size(),
          new_shape,
          a_strides,
          b_strides,
          strides);
      break;
  }
}

// Convenience overload for operations whose output element type equals the
// input element type.
template <typename T, typename Op>
void binary_op(const array& a, const array& b, array& out, BinaryOpType bopt) {
  return binary_op<T, T, Op>(a, b, out, bopt);
}

template <typename Op>
void binary_op_cpu(
    const array& a,
    const array& b,
    array& out,
    Op op,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    b = array::unsafe_weak_copy(b),
                    out = array::unsafe_weak_copy(out),
                    bopt]() mutable {
    switch (out.dtype()) {
      case bool_:
        binary_op<bool, Op>(a, b, out, bopt);
        break;
      case uint8:
        binary_op<uint8_t, Op>(a, b, out, bopt);
        break;
      case uint16:
        binary_op<uint16_t, Op>(a, b, out, bopt);
        break;
      case uint32:
        binary_op<uint32_t, Op>(a, b, out, bopt);
        break;
      case uint64:
        binary_op<uint64_t, Op>(a, b, out, bopt);
        break;
      case int8:
        binary_op<int8_t, Op>(a, b, out, bopt);
        break;
      case int16:
        binary_op<int16_t, Op>(a, b, out, bopt);
        break;
      case int32:
        binary_op<int32_t, Op>(a, b, out, bopt);
        break;
      case int64:
        binary_op<int64_t, Op>(a, b, out, bopt);
        break;
      case float16:
        binary_op<float16_t, Op>(a, b, out, bopt);
        break;
      case float32:
        binary_op<float, Op>(a, b, out, bopt);
        break;
      case float64:
        binary_op<double, Op>(a, b, out, bopt);
        break;
      case bfloat16:
        binary_op<bfloat16_t, Op>(a, b, out, bopt);
        break;
      case complex64:
        binary_op<complex64_t, Op>(a, b, out, bopt);
        break;
    }
  });
}

template <typename Op>
void comparison_op_cpu(
    const array& a,
    const array& b,
    array& out,
    Op op,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    b = array::unsafe_weak_copy(b),
                    out = array::unsafe_weak_copy(out),
                    bopt]() mutable {
    switch (a.dtype()) {
      case bool_:
        binary_op<bool, bool, Op>(a, b, out, bopt);
        break;
      case uint8:
        binary_op<uint8_t, bool, Op>(a, b, out, bopt);
        break;
      case uint16:
        binary_op<uint16_t, bool, Op>(a, b, out, bopt);
        break;
      case uint32:
        binary_op<uint32_t, bool, Op>(a, b, out, bopt);
        break;
      case uint64:
        binary_op<uint64_t, bool, Op>(a, b, out, bopt);
        break;
      case int8:
        binary_op<int8_t, bool, Op>(a, b, out, bopt);
        break;
      case int16:
        binary_op<int16_t, bool, Op>(a, b, out, bopt);
        break;
      case int32:
        binary_op<int32_t, bool, Op>(a, b, out, bopt);
        break;
      case int64:
        binary_op<int64_t, bool, Op>(a, b, out, bopt);
        break;
      case float16:
        binary_op<float16_t, bool, Op>(a, b, out, bopt);
        break;
      case float32:
        binary_op<float, bool, Op>(a, b, out, bopt);
        break;
      case float64:
        binary_op<double, bool, Op>(a, b, out, bopt);
        break;
      case bfloat16:
        binary_op<bfloat16_t, bool, Op>(a, b, out, bopt);
        break;
      case complex64:
        binary_op<complex64_t, bool, Op>(a, b, out, bopt);
        break;
    }
  });
}

template <typename Op>
void binary_float_op_cpu(
    const array& a,
    const array& b,
    array& out,
    Op op,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    b = array::unsafe_weak_copy(b),
                    out = array::unsafe_weak_copy(out),
                    bopt]() mutable {
    switch (out.dtype()) {
      case float16:
        binary_op<float16_t, Op>(a, b, out, bopt);
        break;
      case float32:
        binary_op<float, Op>(a, b, out, bopt);
        break;
      case float64:
        binary_op<double, Op>(a, b, out, bopt);
        break;
      case bfloat16:
        binary_op<bfloat16_t, Op>(a, b, out, bopt);
        break;
      case complex64:
        binary_op<complex64_t, Op>(a, b, out, bopt);
        break;
      default:
        throw std::runtime_error(
            "[binary_float] Only supports floating point types.");
    }
  });
}

// Schedule an element-wise binary operation that is only defined for bool
// and integer output dtypes; any other dtype makes the dispatched task throw
// std::runtime_error. `op` is only used to deduce `Op`.
template <typename Op>
void binary_int_op_cpu(
    const array& a,
    const array& b,
    array& out,
    Op op,
    Stream stream) {
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, out, bopt);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    b = array::unsafe_weak_copy(b),
                    out = array::unsafe_weak_copy(out),
                    bopt]() mutable {
    switch (out.dtype()) {
      case bool_:
        binary_op<bool, Op>(a, b, out, bopt);
        // Without this break the bool case fell through and the uint8 kernel
        // overwrote the bool results (e.g. storing 2 for `true << true`).
        break;
      case uint8:
        binary_op<uint8_t, Op>(a, b, out, bopt);
        break;
      case uint16:
        binary_op<uint16_t, Op>(a, b, out, bopt);
        break;
      case uint32:
        binary_op<uint32_t, Op>(a, b, out, bopt);
        break;
      case uint64:
        binary_op<uint64_t, Op>(a, b, out, bopt);
        break;
      case int8:
        binary_op<int8_t, Op>(a, b, out, bopt);
        break;
      case int16:
        binary_op<int16_t, Op>(a, b, out, bopt);
        break;
      case int32:
        binary_op<int32_t, Op>(a, b, out, bopt);
        break;
      case int64:
        binary_op<int64_t, Op>(a, b, out, bopt);
        break;
      default:
        throw std::runtime_error("[binary_int] Type not supported");
        break;
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/binary_ops.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include "mlx/backend/cpu/simd/simd.h"

namespace mlx::core::detail {

using namespace mlx::core::simd;

// Adds a scalar `operator()(T, T)` to the enclosing functor. It wraps both
// operands in `Simd<T, 1>`, calls the functor's Simd overload and returns the
// `.value` of the result.
#define BINARY_SINGLE()                                 \
  template <typename T>                                 \
  T operator()(T x, T y) {                              \
    return (*this)(Simd<T, 1>(x), Simd<T, 1>(y)).value; \
  }

// Defines a functor struct `Op` whose Simd overload forwards to the function
// or operator `op`, plus the scalar overload from BINARY_SINGLE. The result
// has the same element type as the operands.
#define DEFAULT_BINARY_OP(Op, op)                       \
  struct Op {                                           \
    template <int N, typename T>                        \
    Simd<T, N> operator()(Simd<T, N> x, Simd<T, N> y) { \
      return op(x, y);                                  \
    }                                                   \
    BINARY_SINGLE()                                     \
  };

// Functors whose result type equals the operand type.
DEFAULT_BINARY_OP(Add, operator+)
DEFAULT_BINARY_OP(ArcTan2, atan2)
DEFAULT_BINARY_OP(Divide, operator/)
DEFAULT_BINARY_OP(Multiply, operator*)
DEFAULT_BINARY_OP(Subtract, operator-)
DEFAULT_BINARY_OP(LogicalAnd, operator&&)
DEFAULT_BINARY_OP(LogicalOr, operator||)
DEFAULT_BINARY_OP(BitwiseAnd, operator&)
DEFAULT_BINARY_OP(BitwiseOr, operator|)
DEFAULT_BINARY_OP(BitwiseXor, operator^)
DEFAULT_BINARY_OP(LeftShift, operator<<)
DEFAULT_BINARY_OP(RightShift, operator>>)
DEFAULT_BINARY_OP(Remainder, remainder)
DEFAULT_BINARY_OP(Maximum, maximum)
DEFAULT_BINARY_OP(Minimum, minimum)
DEFAULT_BINARY_OP(Power, pow)

// Declares a comparison functor named `Op`. The Simd overload forwards both
// operands to `op` and yields a Simd<bool, N>; the scalar overload wraps its
// arguments in single-lane Simd values and returns the `value` member of the
// result as a plain bool.
#define DEFAULT_BOOL_OP(Op, op)                                \
  struct Op {                                                  \
    template <int N, typename T>                               \
    Simd<bool, N> operator()(Simd<T, N> lhs, Simd<T, N> rhs) { \
      return op(lhs, rhs);                                     \
    }                                                          \
    template <typename T>                                      \
    bool operator()(T lhs, T rhs) {                            \
      return this                                              \
          ->operator()(Simd<T, 1>(lhs), Simd<T, 1>(rhs))       \
          .value;                                              \
    }                                                          \
  };

// Equality
DEFAULT_BOOL_OP(Equal, operator==)
DEFAULT_BOOL_OP(NotEqual, operator!=)

// Ordering
DEFAULT_BOOL_OP(Less, operator<)
DEFAULT_BOOL_OP(LessEqual, operator<=)
DEFAULT_BOOL_OP(Greater, operator>)
DEFAULT_BOOL_OP(GreaterEqual, operator>=)

// Equality that also treats two NaNs as equal.
struct NaNEqual {
  // Scalar form: routes through the single-lane Simd overload.
  template <typename T>
  bool operator()(T lhs, T rhs) {
    return this->operator()(Simd<T, 1>(lhs), Simd<T, 1>(rhs)).value;
  }

  // Simd form: true where the operands compare equal or where both are NaN.
  template <int N, typename T>
  Simd<bool, N> operator()(Simd<T, N> lhs, Simd<T, N> rhs) {
    return lhs == rhs || (isnan(lhs) && isnan(rhs));
  }
};

// Computes log(exp(x) + exp(y)) as max + log1p(exp(min - max)).
struct LogAddExp {
  template <int N, typename T>
  Simd<T, N> operator()(Simd<T, N> x, Simd<T, N> y) {
    auto larger = maximum(x, y);
    auto smaller = minimum(x, y);
    // When the smaller operand is -inf or the larger one is +inf the result is
    // simply the larger operand; this also avoids evaluating inf - inf.
    auto use_larger = smaller == -inf || larger == inf;
    auto stable = larger + log1p(exp(smaller - larger));
    return select(use_larger, Simd<T, N>(larger), Simd<T, N>(stable));
  }
  BINARY_SINGLE()
};

struct Select {
  template <typename T>
  T operator()(bool condition, T x, T y) {
    return (*this)(Simd<bool, 1>(condition), Simd<T, 1>(x), Simd<T, 1>(y))
        .value;
  }

  template <int N, typename T>
  Simd<T, N> operator()(Simd<bool, N> condition, Simd<T, N> x, Simd<T, N> y) {
    return select(condition, x, y);
  }
};

} // namespace mlx::core::detail


================================================
FILE: mlx/backend/cpu/binary_two.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/binary.h"

namespace mlx::core {

namespace {

// Applies a two-output binary op over D nested strided dimensions starting at
// `axis`. Each level walks its dimension with the matching strides; the
// innermost level evaluates `op` on one element pair and stores the pair of
// results into `out_a` / `out_b`, which share `out_strides`.
template <typename T, typename U, typename Op, int D>
void binary_op_dims(
    const T* a,
    const T* b,
    U* out_a,
    U* out_b,
    Op op,
    const Shape& shape,
    const Strides& a_strides,
    const Strides& b_strides,
    const Strides& out_strides,
    int axis) {
  const auto a_step = a_strides[axis];
  const auto b_step = b_strides[axis];
  const auto out_step = out_strides[axis];
  const auto extent = shape[axis];

  for (int idx = 0; idx < extent; ++idx) {
    if constexpr (D <= 1) {
      // Innermost dimension: compute one element of each output.
      std::tie(*out_a, *out_b) = op(*a, *b);
    } else {
      // Recurse into the next dimension from the current offsets.
      binary_op_dims<T, U, Op, D - 1>(
          a,
          b,
          out_a,
          out_b,
          op,
          shape,
          a_strides,
          b_strides,
          out_strides,
          axis + 1);
    }
    a += a_step;
    b += b_step;
    out_a += out_step;
    out_b += out_step;
  }
}

// General (arbitrarily strided) path for two-output binary ops. Contiguous
// dimensions are collapsed first; 1-D and 2-D results go straight to
// binary_op_dims, and higher ranks iterate over the leading dimensions while
// binary_op_dims handles the trailing two.
template <typename T, typename U, typename Op>
void binary_op_dispatch_dims(
    const array& a,
    const array& b,
    array& out_a,
    array& out_b,
    Op op) {
  auto [shape, strides] = collapse_contiguous_dims(
      a.shape(), {a.strides(), b.strides(), out_a.strides()});
  const auto& a_strides = strides[0];
  const auto& b_strides = strides[1];
  const auto& out_strides = strides[2];

  const T* a_ptr = a.data<T>();
  const T* b_ptr = b.data<T>();
  U* out_a_ptr = out_a.data<U>();
  U* out_b_ptr = out_b.data<U>();

  int ndim = shape.size();
  if (ndim == 1) {
    binary_op_dims<T, U, Op, 1>(
        a_ptr,
        b_ptr,
        out_a_ptr,
        out_b_ptr,
        op,
        shape,
        a_strides,
        b_strides,
        out_strides,
        0);
    return;
  }
  if (ndim == 2) {
    binary_op_dims<T, U, Op, 2>(
        a_ptr,
        b_ptr,
        out_a_ptr,
        out_b_ptr,
        op,
        shape,
        a_strides,
        b_strides,
        out_strides,
        0);
    return;
  }

  // More than two dimensions: the iterators track the input offsets of each
  // trailing 2-D slice, while the outputs advance by the output stride of the
  // dimension just outside that slice.
  ContiguousIterator a_it(shape, a_strides, ndim - 2);
  ContiguousIterator b_it(shape, b_strides, ndim - 2);
  auto slice_stride = out_strides[ndim - 3];
  size_t offset = 0;
  while (offset < a.size()) {
    binary_op_dims<T, U, Op, 2>(
        a_ptr + a_it.loc,
        b_ptr + b_it.loc,
        out_a_ptr + offset,
        out_b_ptr + offset,
        op,
        shape,
        a_strides,
        b_strides,
        out_strides,
        ndim - 2);
    a_it.step();
    b_it.step();
    offset += slice_stride;
  }
}

// Entry point for binary ops that produce two outputs per element pair.
// `bopt` selects the traversal: General uses the strided dispatcher, the
// remaining kinds walk the raw buffers linearly.
template <typename T, typename U = T, typename Op>
void binary_op(
    const array& a,
    const array& b,
    array& out_a,
    array& out_b,
    Op op,
    BinaryOpType bopt) {
  // Arbitrary strides / broadcasting are handled by the strided dispatcher.
  if (bopt == BinaryOpType::General) {
    binary_op_dispatch_dims<T, U, Op>(a, b, out_a, out_b, op);
    return;
  }

  auto lhs = a.data<T>();
  auto rhs = b.data<T>();
  auto dst_a = out_a.data<U>();
  auto dst_b = out_b.data<U>();

  switch (bopt) {
    case BinaryOpType::ScalarScalar:
      // Both inputs are scalars, so the op is evaluated exactly once.
      std::tie(*dst_a, *dst_b) = op(*lhs, *rhs);
      break;
    case BinaryOpType::ScalarVector:
      // `a` stays fixed while `b` and the outputs advance.
      for (size_t i = 0; i < b.data_size(); ++i) {
        std::tie(*dst_a, *dst_b) = op(*lhs, *rhs);
        ++dst_a;
        ++dst_b;
        ++rhs;
      }
      break;
    case BinaryOpType::VectorScalar:
      // `b` stays fixed while `a` and the outputs advance.
      for (size_t i = 0; i < a.data_size(); ++i) {
        std::tie(*dst_a, *dst_b) = op(*lhs, *rhs);
        ++dst_a;
        ++dst_b;
        ++lhs;
      }
      break;
    default: // VectorVector
      for (size_t i = 0; i < a.size(); ++i) {
        std::tie(*dst_a, *dst_b) = op(*lhs, *rhs);
        ++dst_a;
        ++dst_b;
        ++lhs;
        ++rhs;
      }
      break;
  }
}

} // namespace

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/cholesky.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Computes the Cholesky factor of every N x N matrix stored in `a` and writes
// the result to `factor`.
//
// `a` is copied into `factor` first and the LAPACK factorization then runs in
// place on that copy inside a task dispatched on `stream`. After each
// factorization the unused triangle is overwritten with zeros.
//
// Throws std::runtime_error (from the dispatched task) when potrf reports a
// negative `info`. A positive `info` is currently ignored, see the TODO
// below.
template <typename T>
void cholesky_impl(const array& a, array& factor, bool upper, Stream stream) {
  // Lapack uses the column-major convention. We take advantage of the fact that
  // the matrix should be symmetric:
  //   (A)ᵀ = A
  // and that a column-major lower triangular matrix is a row-major upper
  // triangular matrix, so uplo is the opposite of what we would expect from
  // upper

  // The decomposition is computed in place, so just copy the input to the
  // output.
  copy_cpu(
      a,
      factor,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
      stream);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(factor);
  encoder.dispatch([matrix = factor.data<T>(),
                    upper,
                    N = a.shape(-1),
                    size = a.size()]() mutable {
    // An empty trailing dimension means there is nothing to factorize, and
    // the matrix count below would otherwise divide by zero.
    if (N == 0) {
      return;
    }

    char uplo = (upper) ? 'L' : 'U';
    // Do the per-matrix element count in size_t so N * N cannot overflow int.
    const size_t matrix_elems = static_cast<size_t>(N) * static_cast<size_t>(N);
    const size_t num_matrices = size / matrix_elems;
    for (size_t i = 0; i < num_matrices; ++i) {
      // Compute Cholesky factorization.
      int info;
      potrf<T>(
          /* uplo = */ &uplo,
          /* n = */ &N,
          /* a = */ matrix,
          /* lda = */ &N,
          /* info = */ &info);

      // TODO: We do nothing when the matrix is not positive semi-definite
      // because throwing an error would result in a crash. If we figure out how
      // to catch errors from the implementation we should throw.
      if (info < 0) {
        std::stringstream msg;
        msg << "[Cholesky::eval_cpu] Cholesky decomposition failed with error code "
            << info;
        throw std::runtime_error(msg.str());
      }

      // Zero out the upper/lower triangle while advancing the pointer to the
      // next matrix at the same time.
      for (int row = 0; row < N; row++) {
        if (upper) {
          std::fill(matrix, matrix + row, 0);
        } else {
          std::fill(matrix + row + 1, matrix + N, 0);
        }
        matrix += N;
      }
    }
  });
}

// Dispatches the CPU Cholesky factorization on the dtype of the first input.
// Only float32 and float64 are supported; any other dtype throws
// std::runtime_error.
void Cholesky::eval_cpu(const std::vector<array>& inputs, array& output) {
  const auto& a = inputs[0];
  switch (a.dtype()) {
    case float64:
      cholesky_impl<double>(a, output, upper_, stream());
      return;
    case float32:
      cholesky_impl<float>(a, output, upper_, stream());
      return;
    default:
      throw std::runtime_error(
          "[Cholesky::eval_cpu] only supports float32 or float64.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/compiled.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <dlfcn.h>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <list>
#include <mutex>
#include <shared_mutex>

#include <fmt/format.h>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/cpu/compiled_preamble.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/jit_compiler.h"
#include "mlx/device.h"
#include "mlx/graph_utils.h"
#include "mlx/version.h"

namespace mlx::core {

// Process-wide cache of JIT-compiled shared libraries and the kernel entry
// points resolved from them.
struct CompilerCache {
  // Owns one dlopen() handle and closes it on destruction.
  struct DLib {
    // Loads `libname` with RTLD_NOW. Throws std::runtime_error carrying the
    // dlerror() text when the library cannot be loaded.
    explicit DLib(const std::string& libname) {
      lib = dlopen(libname.c_str(), RTLD_NOW);
      if (!lib) {
        std::ostringstream msg;
        msg << "Could not load C++ shared library " << dlerror();
        throw std::runtime_error(msg.str());
      }
    }

    // The handle is uniquely owned: a copy would dlclose() it twice.
    DLib(const DLib&) = delete;
    DLib& operator=(const DLib&) = delete;

    ~DLib() {
      dlclose(lib);
    }
    void* lib;
  };
  // Statics to cache compiled libraries and functions
  std::list<DLib> libs;
  // Maps a kernel name to the function pointer resolved from its library.
  std::unordered_map<std::string, void*> kernels;
  // Guards `libs` and `kernels`.
  std::shared_mutex mtx;
};

// Returns the single CompilerCache instance, constructed on first use.
static CompilerCache& cache() {
  static CompilerCache instance;
  return instance;
}

// GPU compile is always available if the GPU is available and since we are in
// this file CPU compile is also available.
namespace detail {

// Always reports that compilation is available; the device is not inspected.
bool compile_available_for_device(const Device& /* device */) {
  return true;
}

} // namespace detail

// Return a pointer to a compiled function.
//
// Looks `kernel_name` up in the process-wide cache first. On a miss the source
// produced by `source_builder` is written to a per-version directory under the
// system temp path, compiled into a shared library through JitCompiler (unless
// a library with the same file name already exists there), loaded, and the
// symbol `kernel_name` is resolved and cached.
//
// Throws std::runtime_error when the source cannot be written, compilation
// fails, the library cannot be loaded, or the symbol cannot be resolved.
void* compile(
    const std::string& kernel_name,
    const std::function<std::string(void)>& source_builder) {
  // Fast path: look up under a shared lock.
  {
    std::shared_lock lock(cache().mtx);
    if (auto it = cache().kernels.find(kernel_name);
        it != cache().kernels.end()) {
      return it->second;
    }
  }

  // Slow path: take the exclusive lock and check again, since another thread
  // may have inserted the kernel between the two locks.
  std::unique_lock lock(cache().mtx);
  if (auto it = cache().kernels.find(kernel_name);
      it != cache().kernels.end()) {
    return it->second;
  }
  std::string source_code = source_builder();
  std::string kernel_file_name;

  // Deal with long kernel names. Maximum length for filename on macOS is 255
  // characters, and on Windows the maximum length for whole path is 260. Clip
  // file name with a little extra room and append a 16 character hash.
#ifdef _WIN32
  constexpr int max_file_name_length = 140;
#else
  constexpr int max_file_name_length = 245;
#endif
  if (kernel_name.size() > max_file_name_length) {
    std::ostringstream file_name;
    file_name
        << std::string_view(kernel_name).substr(0, max_file_name_length - 16);
    auto file_id =
        std::hash<std::string>{}(kernel_name.substr(max_file_name_length - 16));
    // Zero-fill the hash so the suffix never contains padding spaces.
    file_name << "_" << std::hex << std::setfill('0') << std::setw(16)
              << file_id << std::dec;
    kernel_file_name = file_name.str();
  } else {
    kernel_file_name = kernel_name;
  }

  auto output_dir =
      std::filesystem::temp_directory_path() / "mlx" / version() / "cpu";
  if (!std::filesystem::exists(output_dir)) {
    std::filesystem::create_directories(output_dir);
  }

  std::string shared_lib_name = "lib" + kernel_file_name + ".so";
  auto shared_lib_path = (output_dir / shared_lib_name).string();
  bool lib_exists = false;
  {
    std::ifstream f(shared_lib_path.c_str());
    lib_exists = f.good();
  }

  if (!lib_exists) {
    // Open source file and write source code to it
    std::string source_file_name = kernel_file_name + ".cpp";
    auto source_file_path = (output_dir / source_file_name).string();

    std::ofstream source_file(source_file_path);
    source_file << source_code;
    source_file.close();
    // Report a failed open/write/close here rather than letting the compiler
    // run on a missing or stale source file.
    if (!source_file) {
      throw std::runtime_error(
          fmt::format(
              "[Compile::eval_cpu] Failed to write source file {0}",
              source_file_path));
    }

    try {
      JitCompiler::exec(
          JitCompiler::build_command(
              output_dir, source_file_name, shared_lib_name));
    } catch (const std::exception& error) {
      throw std::runtime_error(
          fmt::format(
              "[Compile::eval_cpu] Failed to compile function {0}: {1}",
              kernel_name,
              error.what()));
    }
  }

  // load library
  cache().libs.emplace_back(shared_lib_path);

  // Load function
  void* fun = dlsym(cache().libs.back().lib, kernel_name.c_str());
  if (!fun) {
    std::ostringstream msg;
    msg << "[Compile::eval_cpu] Failed to load compiled function "
        << kernel_name << std::endl
        << dlerror();
    throw std::runtime_error(msg.str());
  }
  cache().kernels.insert({kernel_name, fun});
  return fun;
}

// Writes the C++ source of one fused element-wise kernel to `os`.
//
// The emitted function has the signature
//   void <kernel_name>(int* shape, int64_t** strides, void** args)
// `args` holds, in order, the non-constant inputs, the outputs and, for the
// contiguous variant only, the element count.
//
//   os          stream receiving the generated source
//   kernel_name name of the emitted function
//   inputs      kernel inputs; those for which is_constant(i) is true are
//               emitted as literals instead of being read from `args`
//   outputs     arrays written by the kernel
//   tape        intermediate computations, emitted in the given order
//   is_constant predicate over input indices
//   contiguous  emit a single flat loop indexed by `i` instead of nested
//               strided loops
//   ndim        number of nested loops in the strided variant
inline void build_kernel(
    std::ostream& os,
    const std::string& kernel_name,
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    int ndim) {
  NodeNamer namer;

#ifdef _MSC_VER
  // Export the symbol
  os << "__declspec(dllexport) ";
#endif

  // Start the kernel
  os << "void " << kernel_name
     << "(int* shape, int64_t** strides, void** args) {" << std::endl;

  // Add the input arguments
  // `cnt` indexes into `args`. `strides_index` starts at 1, so strides[0] is
  // never read by the generated code for inputs.
  int cnt = 0;
  int strides_index = 1;
  for (size_t i = 0; i < inputs.size(); ++i) {
    // Skip constants from the input list
    if (is_constant(i)) {
      continue;
    }

    const auto& x = inputs[i];
    auto& xname = namer.get_name(x);

    auto tstr = get_type_string(x.dtype());
    os << "  " << tstr << "* " << xname << " = (" << tstr << "*)args[" << cnt++
       << "];" << std::endl;
    // Scalars and contiguous need no strides
    if (!is_scalar(x) && !contiguous) {
      os << "  const int64_t* " << xname << "_strides = strides["
         << strides_index++ << "];" << std::endl;
    }
  }

  // Add the output arguments
  for (auto& x : outputs) {
    auto tstr = get_type_string(x.dtype());
    os << "  " << tstr << "* " << namer.get_name(x) << " = (" << tstr
       << "*)args[" << cnt++ << "];" << std::endl;
  }
  // Add output size
  // The element count is passed by value in the pointer slot itself, hence
  // the direct cast to size_t in the generated code.
  if (contiguous) {
    os << "  const size_t size = (size_t)args[" << cnt++ << "];" << std::endl;
  }

  // Open the loops: one flat loop over `size`, or `ndim` nested loops over
  // `shape` with indices i0, i1, ...
  if (contiguous) {
    os << "  for (size_t i = 0; i < size; ++i) {" << std::endl;
  } else {
    for (int d = 0; d < ndim; ++d) {
      os << "  for (int i" << d << " = 0; i" << d << " < shape[" << d
         << "]; ++i" << d << ") {" << std::endl;
    }
  }

  // Read the inputs in tmps
  for (size_t i = 0; i < inputs.size(); ++i) {
    const auto& x = inputs[i];
    auto& xname = namer.get_name(x);

    if (is_constant(i)) {
      // Constants are emitted as literals.
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = ";
      print_constant(os, x);
      os << ";" << std::endl;
    } else if (is_scalar(x)) {
      // Scalars always read element 0.
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
         << xname << "[0];" << std::endl;
    } else if (contiguous) {
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = "
         << xname << "[i];" << std::endl;
    } else {
      // Strided inputs are read through the pointer, which the loop
      // epilogues below advance.
      os << "  " << get_type_string(x.dtype()) << " tmp_" << xname << " = *"
         << xname << ";" << std::endl;
    }
  }

  // Actually write the computation
  for (auto& x : tape) {
    os << "  " << get_type_string(x.dtype()) << " tmp_" << namer.get_name(x)
       << " = ";
    if (is_static_cast(x.primitive())) {
      os << "static_cast<" << get_type_string(x.dtype()) << ">(tmp_"
         << namer.get_name(x.inputs()[0]) << ");" << std::endl;
    } else {
      // Emit `<Primitive>()(tmp_a, tmp_b, ...)`: a default-constructed functor
      // named after the primitive, called on the input temporaries.
      os << x.primitive().name();
      os << "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
        os << "tmp_" << namer.get_name(x.inputs()[i]) << ", ";
      }
      os << "tmp_" << namer.get_name(x.inputs().back()) << ");" << std::endl;
    }
  }

  // Write the outputs from tmps
  for (auto& x : outputs) {
    if (contiguous) {
      os << "  " << namer.get_name(x) << "[i] = tmp_" << namer.get_name(x)
         << ";" << std::endl;
    } else {
      // In the strided variant each output pointer is post-incremented after
      // every store.
      os << "  *" << namer.get_name(x) << "++ = tmp_" << namer.get_name(x)
         << ";" << std::endl;
    }
  }

  // Close loops
  if (contiguous) {
    os << "  }" << std::endl;
  } else {
    for (int d = ndim - 1; d >= 0; --d) {
      // Update pointers
      for (size_t i = 0; i < inputs.size(); ++i) {
        const auto& x = inputs[i];
        if (is_constant(i) || is_scalar(x)) {
          continue;
        }
        auto& xname = namer.get_name(x);
        // Step along dimension d ...
        os << "  " << xname << " += " << xname << "_strides[" << d << "];"
           << std::endl;
        // ... and rewind the full extent walked along dimension d + 1.
        if (d < ndim - 1) {
          os << "  " << xname << " -= " << xname << "_strides[" << d + 1 << "]"
             << " * shape[" << d + 1 << "];" << std::endl;
        }
      }
      os << "  }" << std::endl;
    }
  }

  // Finish the kernel
  os << "}" << std::endl;
}

void Compiled::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& encoder = cpu::get_command_encoder(stream());

  // Collapse contiguous dims to route to a faster kernel if possible. Also
  // handle all broadcasting.
  auto [contiguous, shape, strides] =
      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);

  // Collect function input arguments.
  std::vector<void*> args;
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (is_constant_(i)) {
      continue;
    }
    const auto& x = inputs[i];
    encoder.set_input_array(x);
    args.push_back((void*)x.data<void>());
  }

  // Get the kernel name from the lib
  int ndim = shape.size();
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  if (!contiguous) {
    kernel_name += std::to_string(ndim);
  }

  // Get the function
  auto fn_ptr = compile(kernel_name, [&, contiguous = contiguous]() {
    std::ostringstream kernel;
    kernel << get_kernel_preamble() << std::endl;
    kernel << "extern \"C\"  {" << std::endl;
    build_kernel(
        kernel,
        kernel_name,
        inputs_,
        outputs_,
        tape_,
        is_constant_,
        contiguous,
        ndim);
    // Close extern "C"
    kernel << "}" << std::endl;
    return kernel.str();
  });

  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);

  for (auto& x : outputs) {
    args.push_back(x.data<void>());
    encoder.set_output_array(x);
  }
  if (contiguous) {
    args.push_back((void*)outputs[0].data_size());
  }
  auto fun = reinterpret_cast<void (*)(int*, int64_t**, void**)>(fn_ptr);
  encoder.dispatch([fun,
                    args = std::move(args),
                    strides = std::move(strides),
                    shape = std::move(shape)]() mutable {
    SmallVector<int64_t*> strides_ptrs;
    for (auto& s : strides) {
      strides_ptrs.push_back(s.data());
    }
    fun(shape.data(), strides_ptrs.data(), args.data());
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/compiled_preamble.h
================================================
// Copyright © 2023-24 Apple Inc.

#pragma once

// clang-format off
#include "mlx/types/half_types.h"
#include "mlx/types/complex.h"
#include "mlx/backend/cpu/unary_ops.h"
#include "mlx/backend/cpu/binary_ops.h"
// clang-format on

const char* get_kernel_preamble();


================================================
FILE: mlx/backend/cpu/conv.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cassert>
#include <numeric>

#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

///////////////////////////////////////////////////////////////////////////////
// Naive reference conv
///////////////////////////////////////////////////////////////////////////////

template <typename T>
void slow_conv_1D(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(in);
  encoder.set_input_array(wt);
  encoder.set_output_array(out);

  encoder.dispatch([start_wt_ptr = wt.data<T>(),
                    in_ptr = in.data<T>(),
                    out_ptr = out.data<T>(),

                    N = in.shape(
                        0), // Batch size, should be the same as out.shape(0)
                    iH = 1 +
                        in_dilation[0] * (in.shape(1) - 1), // Input spatial dim
                    oH = out.shape(1), // Output spatial dim
                    wH = wt.shape(1), // Weight spatial dim
                    groups = in.shape(2) / wt.shape(2),
                    O = wt.shape(0), // Out channels
                    C_per_group = wt.shape(2),

                    in_stride_N = in.strides()[0],
                    in_stride_H = in.strides()[1],
                    in_stride_C = in.strides()[2],

                    wt_stride_O = wt.strides()[0],
                    wt_stride_H = wt.strides()[1],
                    wt_stride_C = wt.strides()[2],

                    out_stride_N = out.strides()[0],
                    out_stride_H = out.strides()[1],
                    out_stride_O = out.strides()[2],

                    flip,
                    padding_lo = padding_lo[0],
                    padding_hi = padding_hi[0],
                    wt_stride = wt_strides[0],
                    wt_dilation = wt_dilation[0],
                    in_dilation = in_dilation[0]]() mutable {
    auto O_per_group = O / groups;

    for (int n = 0; n < N; ++n) {
      for (int oh = 0; oh < oH; ++oh) {
        for (int g = 0; g < groups; ++g) {
          for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
            const T* filter_wt_ptr = start_wt_ptr + o * wt_stride_O;
            float r = 0.;

            for (int wh = 0; wh < wH; ++wh) {
              const T* wt_ptr = filter_wt_ptr + wh * wt_stride_H;

              int wh_flip = flip ? (wH - wh - 1) : wh;
              int ih = oh * wt_stride - padding_lo + wh_flip * wt_dilation;

              auto ih_div = std::div(ih, in_dilation);

              if (ih >= 0 && ih < iH && ih_div.rem == 0) {
                for (int c = g * C_per_group; c < (g + 1) * C_per_group; ++c) {
                  r +=
                      static_cast<float>(
                          in_ptr[ih_div.quot * in_stride_H + c * in_stride_C]) *
                      static_cast<float>(
                          wt_ptr[(c % C_per_group) * wt_stride_C]);
                } // c

              } // ih check
            } // wh

            out_ptr[oh * out_stride_H + o * out_stride_O] = static_cast<T>(r);
          } // o
        } // g
      } // oh

      in_ptr += in_stride_N;
      out_ptr += out_stride_N;
    } // n
  });
}

// Naive reference 2D convolution with grouping, padding, weight strides,
// weight dilation, input dilation and optional kernel flipping. Products are
// accumulated in float and the sum is cast to T on store. The work is
// dispatched on `stream` through the CPU command encoder.
//
// Index usage below: dim 0 of `in`/`out` is the batch, dims 1 and 2 are the
// spatial dims and dim 3 holds channels; `wt` is indexed as
// (out channel, kH, kW, in channel per group).
template <typename T>
void slow_conv_2D(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(in);
  encoder.set_input_array(wt);
  encoder.set_output_array(out);

  encoder.dispatch(
      [st_wt_ptr = wt.data<T>(),
       st_in_ptr = in.data<T>(),
       st_out_ptr = out.data<T>(),

       N = in.shape(0), // Batch size, should be the same as out.shape(0)
       iH = 1 + in_dilation[0] * (in.shape(1) - 1), // Input spatial dim
       iW = 1 + in_dilation[1] * (in.shape(2) - 1), // Input spatial dim
       C = in.shape(3), // In channels
       oH = out.shape(1), // Output spatial dim
       oW = out.shape(2), // Output spatial dim
       O = wt.shape(0), // Out channels
       wH = wt.shape(1), // Weight spatial dim
       wW = wt.shape(2), // Weight spatial dim

       groups = in.shape(3) / wt.shape(3),
       C_per_group = wt.shape(3),

       in_stride_N = in.strides()[0],
       in_stride_H = in.strides()[1],
       in_stride_W = in.strides()[2],
       in_stride_C = in.strides()[3],

       wt_stride_O = wt.strides()[0],
       wt_stride_H = wt.strides()[1],
       wt_stride_W = wt.strides()[2],
       wt_stride_C = wt.strides()[3],

       out_stride_N = out.strides()[0],
       out_stride_H = out.strides()[1],
       out_stride_W = out.strides()[2],
       out_stride_O = out.strides()[3],

       padding_lo,
       padding_hi,
       wt_strides,
       wt_dilation,
       in_dilation,
       flip]() mutable {
        bool is_idil_one = in_dilation[0] == 1 && in_dilation[1] == 1;

        const int O_per_group = O / groups;
        // Computes one output pixel (all output channels) without any bounds
        // or dilation checks on the input coordinates. It indexes the input
        // with the raw (ih, iw), so it is only valid where every kernel tap
        // lands inside the input and the input dilation is 1.
        auto pt_conv_no_checks =
            [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
              out_ptr += oh * out_stride_H + ow * out_stride_W;
              int ih_base = oh * wt_strides[0] - padding_lo[0];
              int iw_base = ow * wt_strides[1] - padding_lo[1];

              for (int g = 0; g < groups; ++g) {
                for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
                  float r = 0.;

                  for (int wh = 0; wh < wH; ++wh) {
                    for (int ww = 0; ww < wW; ++ww) {
                      int wh_flip = flip ? wH - wh - 1 : wh;
                      int ww_flip = flip ? wW - ww - 1 : ww;
                      int ih = ih_base + wh_flip * wt_dilation[0];
                      int iw = iw_base + ww_flip * wt_dilation[1];

                      const T* wt_ptr_pt =
                          wt_ptr + wh * wt_stride_H + ww * wt_stride_W;
                      const T* in_ptr_pt =
                          in_ptr + ih * in_stride_H + iw * in_stride_W;

                      for (int c = g * C_per_group; c < (g + 1) * C_per_group;
                           ++c) {
                        r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
                            static_cast<float>(
                                 wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
                      } // c
                    } // ww
                  } // wh

                  out_ptr[0] = static_cast<T>(r);
                  out_ptr += out_stride_O;
                  wt_ptr += wt_stride_O;
                } // o
              } // g
            };

        // Per-tap step and starting offset of the input coordinate as the
        // kernel index increases; negative / shifted when the kernel is
        // flipped.
        int jump_h = flip ? -wt_dilation[0] : wt_dilation[0];
        int jump_w = flip ? -wt_dilation[1] : wt_dilation[1];

        int init_h = (flip ? (wH - 1) * wt_dilation[0] : 0);
        int init_w = (flip ? (wW - 1) * wt_dilation[1] : 0);

        // Kernel-index step used by the checked path below; derived from the
        // lcm of the input and weight dilations.
        int f_wgt_jump_h =
            std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
        int f_wgt_jump_w =
            std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];

        // Period (in output positions) of the base_h / base_w tables below.
        int f_out_jump_h =
            std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
        int f_out_jump_w =
            std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];

        std::vector<int> base_h(f_out_jump_h);
        std::vector<int> base_w(f_out_jump_w);

        // base_h[i]: first kernel row whose input coordinate is a multiple of
        // the input dilation for output rows with oh % f_out_jump_h == i
        // (wH if there is none).
        for (int i = 0; i < f_out_jump_h; ++i) {
          int ih_loop = i * wt_strides[0] - padding_lo[0] + init_h;

          int wh_base = 0;
          while (wh_base < wH && ih_loop % in_dilation[0] != 0) {
            wh_base++;
            ih_loop += jump_h;
          }

          base_h[i] = wh_base;
        }

        // Same table for the width dimension.
        for (int j = 0; j < f_out_jump_w; ++j) {
          int iw_loop = j * wt_strides[1] - padding_lo[1] + init_w;

          int ww_base = 0;
          while (ww_base < wW && iw_loop % in_dilation[1] != 0) {
            ww_base++;
            iw_loop += jump_w;
          }

          base_w[j] = ww_base;
        }

        // Computes one output pixel (all output channels) with bounds checks
        // on the dilated input coordinates. Starts at the precomputed base
        // tap and advances by f_wgt_jump_*; in-range coordinates are divided
        // by the input dilation to index the stored input.
        auto pt_conv_all_checks =
            [&](const T* in_ptr, const T* wt_ptr, T* out_ptr, int oh, int ow) {
              out_ptr += oh * out_stride_H + ow * out_stride_W;

              int ih_base = oh * wt_strides[0] - padding_lo[0];
              int iw_base = ow * wt_strides[1] - padding_lo[1];

              int wh_base = base_h[oh % f_out_jump_h];
              int ww_base = base_w[ow % f_out_jump_w];

              for (int g = 0; g < groups; ++g) {
                for (int o = g * O_per_group; o < (g + 1) * O_per_group; ++o) {
                  float r = 0.;

                  for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
                    for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
                      int wh_flip = flip ? wH - wh - 1 : wh;
                      int ww_flip = flip ? wW - ww - 1 : ww;
                      int ih = ih_base + wh_flip * wt_dilation[0];
                      int iw = iw_base + ww_flip * wt_dilation[1];

                      if (ih >= 0 && ih < iH && iw >= 0 && iw < iW) {
                        const T* wt_ptr_pt =
                            wt_ptr + wh * wt_stride_H + ww * wt_stride_W;

                        int ih_dil = !is_idil_one ? (ih / in_dilation[0]) : ih;
                        int iw_dil = !is_idil_one ? (iw / in_dilation[1]) : iw;

                        const T* in_ptr_pt = in_ptr + ih_dil * in_stride_H +
                            iw_dil * in_stride_W;

                        for (int c = g * C_per_group; c < (g + 1) * C_per_group;
                             ++c) {
                          r += static_cast<float>(in_ptr_pt[c * in_stride_C]) *
                              static_cast<float>(
                                   wt_ptr_pt[(c % C_per_group) * wt_stride_C]);
                        } // c

                      } // ih, iw check
                    } // ww
                  } // wh

                  out_ptr[0] = static_cast<T>(r);
                  out_ptr += out_stride_O;
                  wt_ptr += wt_stride_O;
                } // o
              } // g
            };

        // Split the output rows into [0, border_1), [border_1, border_2) and
        // [border_2, oH). Only the middle band may use the unchecked kernel.
        // With input dilation, border_1 is set to oH so the first loop below
        // already covers every row with the checked kernel.
        // NOTE(review): presumably border_2 never exceeds oH for valid
        // output shapes; confirm against the caller.
        int oH_border_0 = 0;
        int oH_border_1 = is_idil_one
            ? ((padding_lo[0] + wt_strides[0] - 1) / wt_strides[0])
            : oH;
        int oH_border_2 = std::max(
            oH_border_1,
            (iH + padding_lo[0] - wH * wt_dilation[0]) / wt_strides[0]);
        int oH_border_3 = oH;

        // Same split for the output columns.
        int oW_border_0 = 0;
        int oW_border_1 = is_idil_one
            ? ((padding_lo[1] + wt_strides[1] - 1) / wt_strides[1])
            : oW;
        int oW_border_2 = std::max(
            oW_border_1,
            (iW + padding_lo[1] - wW * wt_dilation[1]) / wt_strides[1]);
        int oW_border_3 = oW;

        for (int n = 0; n < N; ++n) {
          // Case 1: oh might put us out of bounds
          for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
            for (int ow = 0; ow < oW; ++ow) {
              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
            } // ow
          } // oh

          // Case 2: oh in bounds
          for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
            // Case a: ow might put us out of bounds
            for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
            } // ow

            // Case b: ow in bounds
            for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
              pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
            } // ow

            // Case c: ow might put us out of bounds
            for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
            } // ow

          } // oh

          // Case 3: oh might put us out of bounds
          for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
            for (int ow = 0; ow < oW; ++ow) {
              pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, oh, ow);
            } // ow
          } // oh

          // Advance to the next batch element; the weights are shared.
          st_in_ptr += in_stride_N;
          st_out_ptr += out_stride_N;

        } // n
      });
}

// Direct (non-GEMM) 3D convolution.
//
// Reads `in` as (N, D, H, W, C), `wt` as (O, wD, wH, wW, C) and writes `out`
// as (N, oD, oH, oW, O); every access goes through the arrays' own strides.
// Products are accumulated in float and cast back to T on store.
//
// Supports kernel strides (`wt_strides`), kernel dilation (`wt_dilation`),
// input dilation (`in_dilation`) and kernel flipping (`flip`). `padding_hi`
// is captured but not read here: the upper bound of the valid region is
// derived from the (dilated) input extent instead.
//
// The output volume is split per axis into a border region, where every tap
// is bounds-checked, and an interior region, where no checks are needed. The
// region borders are clamped to [0, o*] so that asymmetric or very large
// padding can never make the loops index outside of `out`.
template <typename T>
void slow_conv_3D(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(in);
  encoder.set_input_array(wt);
  encoder.set_output_array(out);

  encoder.dispatch([st_wt_ptr = wt.data<T>(),
                    st_in_ptr = in.data<T>(),
                    st_out_ptr = out.data<T>(),

                    N = in.shape(
                        0), // Batch size, should be the same as out.shape(0)
                    iD = 1 +
                        in_dilation[0] * (in.shape(1) - 1), // Input spatial dim
                    iH = 1 +
                        in_dilation[1] * (in.shape(2) - 1), // Input spatial dim
                    iW = 1 +
                        in_dilation[2] * (in.shape(3) - 1), // Input spatial dim
                    oD = out.shape(1), // Output spatial dim
                    oH = out.shape(2), // Output spatial dim
                    oW = out.shape(3), // Output spatial dim
                    O = wt.shape(0), // Out channels
                    C = wt.shape(4), // In channels
                    wD = wt.shape(1), // Weight spatial dim
                    wH = wt.shape(2), // Weight spatial dim
                    wW = wt.shape(3), // Weight spatial dim

                    in_stride_N = in.strides()[0],
                    in_stride_D = in.strides()[1],
                    in_stride_H = in.strides()[2],
                    in_stride_W = in.strides()[3],
                    in_stride_C = in.strides()[4],

                    wt_stride_O = wt.strides()[0],
                    wt_stride_D = wt.strides()[1],
                    wt_stride_H = wt.strides()[2],
                    wt_stride_W = wt.strides()[3],
                    wt_stride_C = wt.strides()[4],

                    out_stride_N = out.strides()[0],
                    out_stride_D = out.strides()[1],
                    out_stride_H = out.strides()[2],
                    out_stride_W = out.strides()[3],
                    out_stride_O = out.strides()[4],
                    padding_lo,
                    padding_hi,
                    wt_strides,
                    wt_dilation,
                    in_dilation,
                    flip]() mutable {
    bool is_idil_one =
        in_dilation[0] == 1 && in_dilation[1] == 1 && in_dilation[2] == 1;

    // Fast path for one output location: every kernel tap is known to land
    // inside the input, so no bounds checks are performed. Only used when the
    // input is not dilated (see the border computation below).
    auto pt_conv_no_checks = [&](const T* in_ptr,
                                 const T* wt_ptr,
                                 T* out_ptr,
                                 int od,
                                 int oh,
                                 int ow) {
      out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;
      int id_base = od * wt_strides[0] - padding_lo[0];
      int ih_base = oh * wt_strides[1] - padding_lo[1];
      int iw_base = ow * wt_strides[2] - padding_lo[2];

      for (int o = 0; o < O; ++o) {
        float r = 0.;

        for (int wd = 0; wd < wD; ++wd) {
          for (int wh = 0; wh < wH; ++wh) {
            for (int ww = 0; ww < wW; ++ww) {
              // When flipping, tap (wd, wh, ww) reads the mirrored input
              // offset while still using the un-mirrored weight.
              int wd_flip = flip ? wD - wd - 1 : wd;
              int wh_flip = flip ? wH - wh - 1 : wh;
              int ww_flip = flip ? wW - ww - 1 : ww;
              int id = id_base + wd_flip * wt_dilation[0];
              int ih = ih_base + wh_flip * wt_dilation[1];
              int iw = iw_base + ww_flip * wt_dilation[2];

              const T* wt_ptr_pt = wt_ptr + wd * wt_stride_D +
                  wh * wt_stride_H + ww * wt_stride_W;
              const T* in_ptr_pt = in_ptr + id * in_stride_D +
                  ih * in_stride_H + iw * in_stride_W;

              for (int c = 0; c < C; ++c) {
                r += static_cast<float>(in_ptr_pt[0]) *
                    static_cast<float>(wt_ptr_pt[0]);
                in_ptr_pt += in_stride_C;
                wt_ptr_pt += wt_stride_C;
              } // c

            } // ww
          } // wh
        } // wd

        out_ptr[0] = static_cast<T>(r);
        out_ptr += out_stride_O;
        wt_ptr += wt_stride_O;
      } // o
    };

    // Per-tap movement of the (dilated) input coordinate and its starting
    // offset; both are mirrored when the kernel is flipped.
    int jump_d = flip ? -wt_dilation[0] : wt_dilation[0];
    int jump_h = flip ? -wt_dilation[1] : wt_dilation[1];
    int jump_w = flip ? -wt_dilation[2] : wt_dilation[2];

    int init_d = (flip ? (wD - 1) * wt_dilation[0] : 0);
    int init_h = (flip ? (wH - 1) * wt_dilation[1] : 0);
    int init_w = (flip ? (wW - 1) * wt_dilation[2] : 0);

    // With input dilation only every in_dilation-th coordinate holds data.
    // f_wgt_jump_* is the kernel-tap step between two taps that both hit
    // real data; f_out_jump_* is the period (in output positions) after
    // which the pattern of useful taps repeats.
    int f_wgt_jump_d =
        std::lcm(in_dilation[0], wt_dilation[0]) / wt_dilation[0];
    int f_wgt_jump_h =
        std::lcm(in_dilation[1], wt_dilation[1]) / wt_dilation[1];
    int f_wgt_jump_w =
        std::lcm(in_dilation[2], wt_dilation[2]) / wt_dilation[2];

    int f_out_jump_d = std::lcm(in_dilation[0], wt_strides[0]) / wt_strides[0];
    int f_out_jump_h = std::lcm(in_dilation[1], wt_strides[1]) / wt_strides[1];
    int f_out_jump_w = std::lcm(in_dilation[2], wt_strides[2]) / wt_strides[2];

    // base_*[i] is the first kernel tap that lands on real (non-inserted)
    // input data for output positions congruent to i modulo f_out_jump_*.
    std::vector<int> base_d(f_out_jump_d);
    std::vector<int> base_h(f_out_jump_h);
    std::vector<int> base_w(f_out_jump_w);

    for (int i = 0; i < f_out_jump_d; ++i) {
      int id_loop = i * wt_strides[0] - padding_lo[0] + init_d;

      int wd_base = 0;
      while (wd_base < wD && id_loop % in_dilation[0] != 0) {
        wd_base++;
        id_loop += jump_d;
      }

      base_d[i] = wd_base;
    }

    for (int i = 0; i < f_out_jump_h; ++i) {
      int ih_loop = i * wt_strides[1] - padding_lo[1] + init_h;

      int wh_base = 0;
      while (wh_base < wH && ih_loop % in_dilation[1] != 0) {
        wh_base++;
        ih_loop += jump_h;
      }

      base_h[i] = wh_base;
    }

    for (int j = 0; j < f_out_jump_w; ++j) {
      int iw_loop = j * wt_strides[2] - padding_lo[2] + init_w;

      int ww_base = 0;
      while (ww_base < wW && iw_loop % in_dilation[2] != 0) {
        ww_base++;
        iw_loop += jump_w;
      }

      base_w[j] = ww_base;
    }

    // General path for one output location: skips taps that fall on inserted
    // zeros (input dilation) and bounds-checks every remaining tap against
    // the dilated input extent.
    auto pt_conv_all_checks = [&](const T* in_ptr,
                                  const T* wt_ptr,
                                  T* out_ptr,
                                  int od,
                                  int oh,
                                  int ow) {
      out_ptr += od * out_stride_D + oh * out_stride_H + ow * out_stride_W;

      int id_base = od * wt_strides[0] - padding_lo[0];
      int ih_base = oh * wt_strides[1] - padding_lo[1];
      int iw_base = ow * wt_strides[2] - padding_lo[2];

      int wd_base = base_d[od % f_out_jump_d];
      int wh_base = base_h[oh % f_out_jump_h];
      int ww_base = base_w[ow % f_out_jump_w];

      for (int o = 0; o < O; ++o) {
        float r = 0.;

        for (int wd = wd_base; wd < wD; wd += f_wgt_jump_d) {
          for (int wh = wh_base; wh < wH; wh += f_wgt_jump_h) {
            for (int ww = ww_base; ww < wW; ww += f_wgt_jump_w) {
              int wd_flip = flip ? wD - wd - 1 : wd;
              int wh_flip = flip ? wH - wh - 1 : wh;
              int ww_flip = flip ? wW - ww - 1 : ww;
              int id = id_base + wd_flip * wt_dilation[0];
              int ih = ih_base + wh_flip * wt_dilation[1];
              int iw = iw_base + ww_flip * wt_dilation[2];

              if (id >= 0 && id < iD && ih >= 0 && ih < iH && iw >= 0 &&
                  iw < iW) {
                const T* wt_ptr_pt = wt_ptr + wd * wt_stride_D +
                    wh * wt_stride_H + ww * wt_stride_W;

                // Map the dilated coordinate back to the stored input.
                int id_dil = !is_idil_one ? (id / in_dilation[0]) : id;
                int ih_dil = !is_idil_one ? (ih / in_dilation[1]) : ih;
                int iw_dil = !is_idil_one ? (iw / in_dilation[2]) : iw;

                const T* in_ptr_pt = in_ptr + id_dil * in_stride_D +
                    ih_dil * in_stride_H + iw_dil * in_stride_W;

                for (int c = 0; c < C; ++c) {
                  r += static_cast<float>(in_ptr_pt[0]) *
                      static_cast<float>(wt_ptr_pt[0]);
                  in_ptr_pt += in_stride_C;
                  wt_ptr_pt += wt_stride_C;
                } // c

              } // id, ih, iw check
            } // ww
          } // wh
        } // wd

        out_ptr[0] = static_cast<T>(r);
        out_ptr += out_stride_O;
        wt_ptr += wt_stride_O;
      } // o
    };

    // Keep every region border inside [0, limit]. Without this, padding_lo
    // larger than the output extent (possible with asymmetric padding) makes
    // the loops below run past the end of `out`. Positions moved by the
    // clamp are handled by pt_conv_all_checks, which is always correct.
    auto clamp_border = [](int border, int limit) {
      return std::max(0, std::min(border, limit));
    };

    // [border_0, border_1): taps may fall before the input start.
    // [border_1, border_2): all taps are inside the input.
    // [border_2, border_3): taps may fall past the input end.
    // With input dilation the checked path is used everywhere.
    int oD_border_0 = 0;
    int oD_border_1 = is_idil_one
        ? clamp_border(
              (padding_lo[0] + wt_strides[0] - 1) / wt_strides[0], oD)
        : oD;
    int oD_border_2 = std::max(
        oD_border_1,
        clamp_border(
            (iD + padding_lo[0] - wD * wt_dilation[0]) / wt_strides[0], oD));
    int oD_border_3 = oD;

    int oH_border_0 = 0;
    int oH_border_1 = is_idil_one
        ? clamp_border(
              (padding_lo[1] + wt_strides[1] - 1) / wt_strides[1], oH)
        : oH;
    int oH_border_2 = std::max(
        oH_border_1,
        clamp_border(
            (iH + padding_lo[1] - wH * wt_dilation[1]) / wt_strides[1], oH));
    int oH_border_3 = oH;

    int oW_border_0 = 0;
    int oW_border_1 = is_idil_one
        ? clamp_border(
              (padding_lo[2] + wt_strides[2] - 1) / wt_strides[2], oW)
        : oW;
    int oW_border_2 = std::max(
        oW_border_1,
        clamp_border(
            (iW + padding_lo[2] - wW * wt_dilation[2]) / wt_strides[2], oW));
    int oW_border_3 = oW;

    for (int n = 0; n < N; ++n) {
      // Case 1: od might put us out of bounds
      for (int od = oD_border_0; od < oD_border_1; ++od) {
        for (int oh = 0; oh < oH; ++oh) {
          for (int ow = 0; ow < oW; ++ow) {
            pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
          } // ow
        } // oh
      } // od

      // Case 2: od in bounds
      for (int od = oD_border_1; od < oD_border_2; ++od) {
        // Case 2.1: oh might put us out of bounds
        for (int oh = oH_border_0; oh < oH_border_1; ++oh) {
          for (int ow = 0; ow < oW; ++ow) {
            pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
          } // ow
        } // oh

        // Case 2.2: oh in bounds
        for (int oh = oH_border_1; oh < oH_border_2; ++oh) {
          // Case 2.2.1: ow might put us out of bounds
          for (int ow = oW_border_0; ow < oW_border_1; ++ow) {
            pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
          } // ow

          // Case 2.2.2: ow in bounds
          for (int ow = oW_border_1; ow < oW_border_2; ++ow) {
            pt_conv_no_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
          } // ow

          // Case 2.2.3: ow might put us out of bounds
          for (int ow = oW_border_2; ow < oW_border_3; ++ow) {
            pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
          } // ow
        } // oh

        // Case 2.3: oh might put us out of bounds
        for (int oh = oH_border_2; oh < oH_border_3; ++oh) {
          for (int ow = 0; ow < oW; ++ow) {
            pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
          } // ow
        } // oh
      } // od

      // Case 3: od might put us out of bounds
      for (int od = oD_border_2; od < oD_border_3; ++od) {
        for (int oh = 0; oh < oH; ++oh) {
          for (int ow = 0; ow < oW; ++ow) {
            pt_conv_all_checks(st_in_ptr, st_wt_ptr, st_out_ptr, od, oh, ow);
          } // ow
        } // oh
      } // od

      // Advance to the next batch element.
      st_in_ptr += in_stride_N;
      st_out_ptr += out_stride_N;

    } // n
  });
}

// Runs slow_conv_1D with the element type that matches the input dtype.
// Only float32, float16 and bfloat16 are handled; any other dtype raises
// std::invalid_argument.
void dispatch_slow_conv_1D(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  if (in.dtype() == float32) {
    slow_conv_1D<float>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  if (in.dtype() == float16) {
    slow_conv_1D<float16_t>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  if (in.dtype() == bfloat16) {
    slow_conv_1D<bfloat16_t>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  // No kernel is instantiated for the remaining dtypes.
  throw std::invalid_argument(
      "[Convolution::eval] got unsupported data type.");
}

// Runs slow_conv_2D with the element type that matches the input dtype.
// Only float32, float16 and bfloat16 are handled; any other dtype raises
// std::invalid_argument.
void dispatch_slow_conv_2D(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  if (in.dtype() == float32) {
    slow_conv_2D<float>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  if (in.dtype() == float16) {
    slow_conv_2D<float16_t>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  if (in.dtype() == bfloat16) {
    slow_conv_2D<bfloat16_t>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  // No kernel is instantiated for the remaining dtypes.
  throw std::invalid_argument(
      "[Convolution::eval] got unsupported data type.");
}

// Runs slow_conv_3D with the element type that matches the input dtype.
// Only float32, float16 and bfloat16 are handled; any other dtype raises
// std::invalid_argument.
void dispatch_slow_conv_3D(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  if (in.dtype() == float32) {
    slow_conv_3D<float>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  if (in.dtype() == float16) {
    slow_conv_3D<float16_t>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  if (in.dtype() == bfloat16) {
    slow_conv_3D<bfloat16_t>(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  // No kernel is instantiated for the remaining dtypes.
  throw std::invalid_argument(
      "[Convolution::eval] got unsupported data type.");
}

///////////////////////////////////////////////////////////////////////////////
// Explicit gemm conv
///////////////////////////////////////////////////////////////////////////////

// Reverses, in place, the order of the spatial positions of every filter.
//
// `x` is treated as a dense [out_channels][spatial_size][in_channels] buffer.
// For each output channel the in_channels-long vector at spatial position s
// is exchanged with the one at position spatial_size - 1 - s; the order of
// the channels inside each vector is left untouched.
template <typename T>
void flip_spatial_dims_inplace(
    T* x,
    size_t in_channels,
    size_t out_channels,
    size_t spatial_size) {
  const size_t filter_len = spatial_size * in_channels;
  // The middle position of an odd-sized filter stays where it is.
  const size_t num_swaps = spatial_size / 2;

  for (size_t oc = 0; oc < out_channels; ++oc) {
    T* filter = x + oc * filter_len;
    for (size_t s = 0; s < num_swaps; ++s) {
      T* front = filter + s * in_channels;
      T* back = filter + (spatial_size - 1 - s) * in_channels;
      for (size_t c = 0; c < in_channels; ++c) {
        std::swap(front[c], back[c]);
      }
    }
  }
}

// 1D convolution implemented as an explicit im2col followed by one sgemm per
// group.
//
// Steps:
//   1. Copy `in` into a zero-filled float32 buffer enlarged by
//      padding_lo/padding_hi along the spatial axis.
//   2. Build a strided view that exposes the wH-long window for every output
//      position and materialize it as a dense (N * oH, wH * C) matrix.
//   3. Bring the weights to dense float32 (transposing the last two axes for
//      grouped convolutions).
//   4. Multiply with cblas_sgemm into a float32 output, then convert into
//      `out` if its dtype differs.
//
// `wt_dilation` is not read in this function and there is no kernel
// flipping here.
void explicit_gemm_conv_1D_cpu(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    Stream stream) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const int iH = in.shape(1); // Input spatial dim
  const int C = in.shape(2); // Input channels
  const int oH = out.shape(1); // Output spatial dim
  const int O = wt.shape(0); // Out channels
  const int wH = wt.shape(1); // Weight spatial dim

  // The weight's last axis holds the per-group input channel count.
  const int groups = C / wt.shape(2);
  const int C_per_group = wt.shape(2);
  const int O_per_group = O / groups;

  // All intermediate buffers are float32 because the gemm below is sgemm.
  auto conv_dtype = float32;
  auto& encoder = cpu::get_command_encoder(stream);

  // Pad input
  Shape padded_shape = {N, iH + padding_lo[0] + padding_hi[0], C};
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
  std::vector<array> temps;
  temps.push_back(array(0, conv_dtype));
  copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
  // The slice starts padding_lo[0] spatial positions into the padded buffer
  // and shares its storage and strides.
  size_t data_offset = padding_lo[0] * in_padded.strides()[1];
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
      in_padded.strides(),
      in_padded.flags(),
      in_padded_slice.size(),
      data_offset);
  // Copy input values into the slice
  copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
  temps.push_back(in_padded_slice);

  // Make strided view
  Shape strided_shape = {N, oH, wH, C};

  // Axis 1 steps by the convolution stride (one step per output position),
  // axis 2 steps by one input position (one step per kernel tap).
  Strides strided_strides = {
      in_padded.strides()[0],
      in_padded.strides()[1] * wt_strides[0],
      in_padded.strides()[1],
      in_padded.strides()[2]};
  auto flags = in_padded.flags();
  if (groups > 1) {
    // Transpose the last two dimensions for grouped convolutions
    // so each row is laid out as (C, wH) and the channels of one group form a
    // contiguous run of C_per_group * wH values.
    std::swap(strided_shape[2], strided_shape[3]);
    std::swap(strided_strides[2], strided_strides[3]);
  }

  array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
  in_strided_view.copy_shared_buffer(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
  Shape strided_reshape = {N * oH, wH * C};
  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
  copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
  temps.push_back(in_strided);

  // Check wt dtype and prepare
  auto gemm_wt = wt;
  auto gemm_out = out;

  if (groups > 1) {
    // Transpose the last two dimensions for grouped convolutions
    // (same (channel, tap) ordering as the input rows built above), copying
    // into a dense float32 array.
    array wt_transpose(
        {wt.shape(0), wt.shape(2), wt.shape(1)}, wt.dtype(), nullptr, {});
    wt_transpose.copy_shared_buffer(
        wt,
        {wt.strides(0), wt.strides(2), wt.strides(1)},
        wt.flags(),
        wt.size(),
        0);
    gemm_wt = array(wt_transpose.shape(), float32, nullptr, {});
    copy_cpu(wt_transpose, gemm_wt, CopyType::General, stream);
    temps.push_back(gemm_wt);
  } else if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
    // The gemm needs dense float32 weights; convert and/or compact them.
    auto ctype =
        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
    gemm_wt = array(wt.shape(), float32, nullptr, {});
    copy_cpu(wt, gemm_wt, ctype, stream);
    temps.push_back(gemm_wt);
  }

  if (out.dtype() != float32) {
    // Compute into a float32 scratch buffer and convert afterwards.
    gemm_out = array(out.shape(), float32, nullptr, {});
    gemm_out.set_data(allocator::malloc(gemm_out.nbytes()));
    temps.push_back(gemm_out);
  }

  encoder.set_input_array(in_strided);
  encoder.set_input_array(gemm_wt);
  encoder.set_output_array(gemm_out);

  encoder.dispatch([in_strided_ptr = in_strided.data<float>(),
                    gemm_wt_ptr = gemm_wt.data<float>(),
                    gemm_out_ptr = gemm_out.data<float>(),
                    groups,
                    strided_reshape = strided_reshape[0],
                    O,
                    C,
                    wH,
                    O_per_group,
                    C_per_group]() {
    // One (N * oH, C_per_group * wH) x (C_per_group * wH, O_per_group)
    // product per group; each group reads its own column block of A, its own
    // rows of B, and writes its own column block of C.
    for (int g = 0; g < groups; ++g) {
      // Perform gemm
      cblas_sgemm(
          CblasRowMajor,
          CblasNoTrans, // no trans A
          CblasTrans, // transB
          strided_reshape, // M
          O_per_group, // N
          C_per_group * wH, // K
          1.0f, // alpha
          in_strided_ptr + g * C_per_group * wH, // A
          wH * C, // lda
          gemm_wt_ptr + g * O_per_group * C_per_group * wH, // B
          wH * C_per_group, // ldb
          0.0f, // beta
          gemm_out_ptr + g * O_per_group, // C
          O // ldc
      );
    }
  });

  // Copy results if needed
  if (out.dtype() != float32) {
    copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
  }
  // Hand the intermediates to the encoder together with the queued work.
  encoder.add_temporaries(std::move(temps));
}

// N-dimensional convolution implemented as an explicit im2col followed by a
// single sgemm.
//
// Steps:
//   1. Copy `in` into a zero-filled float32 buffer enlarged by
//      padding_lo/padding_hi along every spatial axis.
//   2. Build a strided view of shape (N, oDim..., wDim..., C) over the padded
//      buffer and materialize it as a dense (N * prod(oDim), prod(wDim) * C)
//      matrix.
//   3. Bring the weights to dense float32 and, when `flip` is set, reverse
//      their spatial order in a private copy.
//   4. Multiply with cblas_sgemm into a float32 output, then convert into
//      `out` if its dtype differs.
//
// `wt_dilation` is not read in this function and no group handling is done
// here.
void explicit_gemm_conv_ND_cpu(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const bool flip,
    Stream stream) {
  const int N = in.shape(0); // Batch size, should be the same as out.shape(0)
  const auto iDim =
      Shape(in.shape().begin() + 1, in.shape().end() - 1); // Input spatial dim
  const auto oDim = Shape(
      out.shape().begin() + 1, out.shape().end() - 1); // Output spatial dim
  const int O = wt.shape(0); // Out channels
  const int C = wt.shape(-1); // In channels
  const auto wDim =
      Shape(wt.shape().begin() + 1, wt.shape().end() - 1); // Weight spatial dim

  // All intermediate buffers are float32 because the gemm below is sgemm.
  auto conv_dtype = float32;

  auto& encoder = cpu::get_command_encoder(stream);

  // Pad input
  Shape padded_shape(in.shape().size());
  padded_shape.front() = N;
  for (size_t i = 0; i < iDim.size(); i++) {
    padded_shape[i + 1] = iDim[i] + padding_lo[i] + padding_hi[i];
  }
  padded_shape.back() = C;
  array in_padded(padded_shape, conv_dtype, nullptr, {});

  // Fill with zeros
  std::vector<array> temps = {array(0, conv_dtype)};
  copy_cpu(temps.back(), in_padded, CopyType::Scalar, stream);

  // Pick input slice from padded
  // The slice starts padding_lo[i] positions in along every spatial axis.
  size_t data_offset = 0;
  for (size_t i = 0; i < padding_lo.size(); i++) {
    data_offset += padding_lo[i] * in_padded.strides()[i + 1];
  }

  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
      in_padded.strides(),
      in_padded.flags(),
      in_padded_slice.size(),
      data_offset);

  // Copy input values into the slice
  copy_cpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, stream);
  temps.push_back(in_padded_slice);

  // Make strided view
  // Shape is (N, oDim..., wDim..., C).
  Shape strided_shape(oDim.size() + wDim.size() + 2);
  strided_shape.front() = N;
  for (size_t i = 0; i < oDim.size(); i++) {
    strided_shape[i + 1] = oDim[i];
  }
  for (size_t i = 0; i < wDim.size(); i++) {
    strided_shape[i + 1 + oDim.size()] = wDim[i];
  }
  strided_shape.back() = C;

  // Output axes step by the convolution stride times the padded stride; the
  // kernel axes and the channel axis reuse the padded strides unchanged.
  Strides strided_strides(in.shape().size() * 2 - 2);
  strided_strides[0] = in_padded.strides()[0];
  for (size_t i = 0; i < wt_strides.size(); i++) {
    strided_strides[i + 1] = in_padded.strides()[i + 1] * wt_strides[i];
  }
  for (size_t i = 1; i < in_padded.strides().size(); i++) {
    strided_strides[i + wt_strides.size()] = in_padded.strides()[i];
  }

  auto flags = in_padded.flags();

  array in_strided_view(strided_shape, in_padded.dtype(), nullptr, {});
  in_strided_view.copy_shared_buffer(
      in_padded, strided_strides, flags, in_strided_view.size(), 0);

  // Materialize strided view
  // Rows: N * prod(oDim); columns: prod(wDim) * C.
  Shape strided_reshape = {N, C};
  for (const auto& o : oDim) {
    strided_reshape[0] *= o;
  }
  for (const auto& w : wDim) {
    strided_reshape[1] *= w;
  }

  array in_strided(strided_reshape, in_strided_view.dtype(), nullptr, {});
  copy_cpu(in_strided_view, in_strided, CopyType::General, stream);
  temps.push_back(in_strided);

  // Check wt dtype and prepare
  auto gemm_wt = wt;
  auto gemm_out = out;

  if (wt.dtype() != float32 || !wt.flags().row_contiguous) {
    // The gemm needs dense float32 weights; convert and/or compact them.
    auto ctype =
        wt.flags().row_contiguous ? CopyType::Vector : CopyType::General;
    gemm_wt = array(wt.shape(), float32, nullptr, {});
    copy_cpu(wt, gemm_wt, ctype, stream);
    temps.push_back(gemm_wt);
  }

  if (flip) {
    // Flip a private copy so the caller's weights are never modified. At this
    // point gemm_wt is float32 and row contiguous (either `wt` itself passed
    // the check above or it is the fresh copy made there).
    auto gemm_wt_ = array(gemm_wt.shape(), float32, nullptr, {});
    copy_cpu(gemm_wt, gemm_wt_, CopyType::Vector, stream);
    temps.push_back(gemm_wt_);

    // Calculate the total size of the spatial dimensions
    int spatial_size = 1;
    for (int d = 1; d < gemm_wt.ndim() - 1; ++d) {
      spatial_size *= gemm_wt.shape(d);
    }
    encoder.set_output_array(gemm_wt_);
    encoder.dispatch([gemm_wt_ptr = gemm_wt_.data<float>(),
                      out_channels = gemm_wt.shape(0),
                      in_channels = gemm_wt.shape(-1),
                      spatial_size]() {
      flip_spatial_dims_inplace<float>(
          gemm_wt_ptr, in_channels, out_channels, spatial_size);
    });
    gemm_wt = gemm_wt_;
  }

  if (out.dtype() != float32) {
    // Compute into a float32 scratch buffer and convert afterwards.
    gemm_out = array(out.shape(), float32, nullptr, {});
    gemm_out.set_data(allocator::malloc(gemm_out.nbytes()));
    temps.push_back(gemm_out);
  }

  encoder.set_input_array(in_strided);
  encoder.set_input_array(gemm_wt);
  encoder.set_output_array(gemm_out);

  encoder.dispatch([in_strided_ptr = in_strided.data<float>(),
                    gemm_wt_ptr = gemm_wt.data<float>(),
                    gemm_out_ptr = gemm_out.data<float>(),
                    strided_reshape = std::move(strided_reshape),
                    O]() {
    // out(M x O) = in_strided(M x K) * wt(O x K)^T
    // Perform gemm
    cblas_sgemm(
        CblasRowMajor,
        CblasNoTrans, // no trans A
        CblasTrans, // transB
        strided_reshape[0], // M
        O, // N
        strided_reshape[1], // K
        1.0f, // alpha
        in_strided_ptr,
        strided_reshape[1], // lda
        gemm_wt_ptr,
        strided_reshape[1], // ldb
        0.0f, // beta
        gemm_out_ptr,
        O // ldc
    );
  });

  // Copy results if needed
  if (out.dtype() != float32) {
    copy_cpu_inplace(gemm_out, out, CopyType::Vector, stream);
  }
  // Hand the intermediates to the encoder together with the queued work.
  encoder.add_temporaries(std::move(temps));
}

///////////////////////////////////////////////////////////////////////////////
// Conv routing
///////////////////////////////////////////////////////////////////////////////

// Picks the CPU implementation for a 1D convolution.
//
//   * no dilation and no flip      -> explicit_gemm_conv_1D_cpu
//   * no dilation and groups == 1  -> explicit_gemm_conv_ND_cpu
//   * everything else              -> dispatch_slow_conv_1D
void conv_1D_cpu(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  const int groups = in.shape().back() / wt.shape().back();
  const bool no_dilation = wt_dilation[0] == 1 && in_dilation[0] == 1;

  if (no_dilation && !flip) {
    explicit_gemm_conv_1D_cpu(
        in, wt, out, padding_lo, padding_hi, wt_strides, wt_dilation, stream);
    return;
  }

  if (no_dilation && groups == 1) {
    explicit_gemm_conv_ND_cpu(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        flip,
        stream);
    return;
  }

  dispatch_slow_conv_1D(
      in,
      wt,
      out,
      padding_lo,
      padding_hi,
      wt_strides,
      wt_dilation,
      in_dilation,
      flip,
      stream);
}

// Picks the CPU implementation for a 2D convolution: the explicit-GEMM path
// when there is neither kernel nor input dilation and a single group, the
// direct (slow) kernel otherwise.
void conv_2D_cpu(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  const int groups = in.shape().back() / wt.shape().back();
  const bool gemm_eligible = wt_dilation[0] == 1 && wt_dilation[1] == 1 &&
      in_dilation[0] == 1 && in_dilation[1] == 1 && groups == 1;

  if (!gemm_eligible) {
    dispatch_slow_conv_2D(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  explicit_gemm_conv_ND_cpu(
      in,
      wt,
      out,
      padding_lo,
      padding_hi,
      wt_strides,
      wt_dilation,
      flip,
      stream);
}

// Picks the CPU implementation for a 3D convolution: the explicit-GEMM path
// when there is neither kernel nor input dilation and a single group, the
// direct (slow) kernel otherwise.
void conv_3D_cpu(
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    bool flip,
    Stream stream) {
  const int groups = in.shape().back() / wt.shape().back();
  const bool gemm_eligible = wt_dilation[0] == 1 && wt_dilation[1] == 1 &&
      wt_dilation[2] == 1 && in_dilation[0] == 1 && in_dilation[1] == 1 &&
      in_dilation[2] == 1 && groups == 1;

  if (!gemm_eligible) {
    dispatch_slow_conv_3D(
        in,
        wt,
        out,
        padding_lo,
        padding_hi,
        wt_strides,
        wt_dilation,
        in_dilation,
        flip,
        stream);
    return;
  }

  explicit_gemm_conv_ND_cpu(
      in,
      wt,
      out,
      padding_lo,
      padding_hi,
      wt_strides,
      wt_dilation,
      flip,
      stream);
}

} // namespace

// CPU entry point of the Convolution primitive.
//
// Allocates the output buffer, then routes to the 1D, 2D or 3D
// implementation according to the rank of the input (number of spatial
// dimensions + 2). Any other rank raises std::invalid_argument.
void Convolution::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));

  auto& in = inputs[0];
  auto& wt = inputs[1];

  switch (in.ndim()) {
    // 3D convolution
    case 3 + 2:
      conv_3D_cpu(
          in,
          wt,
          out,
          padding_lo_,
          padding_hi_,
          kernel_strides_,
          kernel_dilation_,
          input_dilation_,
          flip_,
          stream());
      return;

    // 2D convolution
    case 2 + 2:
      conv_2D_cpu(
          in,
          wt,
          out,
          padding_lo_,
          padding_hi_,
          kernel_strides_,
          kernel_dilation_,
          input_dilation_,
          flip_,
          stream());
      return;

    // 1D convolution
    case 1 + 2:
      conv_1D_cpu(
          in,
          wt,
          out,
          padding_lo_,
          padding_hi_,
          kernel_strides_,
          kernel_dilation_,
          input_dilation_,
          flip_,
          stream());
      return;

    // Throw error
    default: {
      std::ostringstream msg;
      msg << "[Convolution::eval] Convolution currently only supports"
          << " 1D, 2D and 3D convolutions. Got inputs with " << in.ndim() - 2
          << " spatial dimensions";
      throw std::invalid_argument(msg.str());
    }
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/copy.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <numeric>

#include "mlx/allocator.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"

namespace mlx::core {

namespace {

// Fills all dst.size() elements of `dst` with the first element of `src`,
// converted to DstT.
template <typename SrcT, typename DstT>
void copy_single(const array& src, array& dst) {
  const auto fill_value = static_cast<DstT>(src.data<SrcT>()[0]);
  auto out_begin = dst.data<DstT>();
  std::fill(out_begin, out_begin + dst.size(), fill_value);
}

// Copies the src.data_size() elements stored in `src`, in storage order, to
// the start of `dst`, converting each element from SrcT to DstT on
// assignment.
template <typename SrcT, typename DstT>
void copy_vector(const array& src, array& dst) {
  auto read = src.data<SrcT>();
  auto write = dst.data<DstT>();
  for (auto last = read + src.data_size(); read != last; ++read, ++write) {
    *write = *read;
  }
}

// Recursively copies a D-dimensional strided block starting at `axis`.
//
// Walks shape[axis] positions along `axis`, advancing `src` by
// i_strides[axis] and `dst` by o_strides[axis] per step. For D > 1 each
// position recurses into the next axis with D - 1; for D == 1 a single
// element is converted from SrcT to DstT and stored.
template <typename SrcT, typename DstT, int D>
inline void copy_dims(
    const SrcT* src,
    DstT* dst,
    const Shape& shape,
    const Strides& i_strides,
    const Strides& o_strides,
    int axis) {
  const auto src_step = i_strides[axis];
  const auto dst_step = o_strides[axis];
  const auto extent = shape[axis];

  for (int idx = 0; idx < extent; ++idx, src += src_step, dst += dst_step) {
    if constexpr (D > 1) {
      copy_dims<SrcT, DstT, D - 1>(
          src, dst, shape, i_strides, o_strides, axis + 1);
    } else {
      *dst = static_cast<DstT>(*src);
    }
  }
}

// Copies a `data_shape`-shaped region from `src` to `dst`, converting each
// element from SrcT to DstT, where both sides are addressed through
// arbitrary strides.
//
// - `i_strides` / `o_strides`: element strides of the region on the source
//   and destination side.
// - `i_offset` / `o_offset`: static element offsets added to the data
//   pointers.
// - `dynamic_i_offset` / `dynamic_o_offset`: optional arrays whose first
//   int64 element is an extra element offset for the source / destination.
//
// Dimensions that are contiguous on both sides are collapsed first. Up to
// three remaining dimensions are copied with a single copy_dims call; higher
// ranks iterate over the leading dimensions and copy the trailing three
// dimensions per step.
template <typename SrcT, typename DstT>
void copy_general_general(
    const array& src,
    array& dst,
    const Shape& data_shape,
    const Strides& i_strides,
    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    const std::optional<array>& dynamic_i_offset,
    const std::optional<array>& dynamic_o_offset) {
  auto src_ptr = src.data<SrcT>() + i_offset;
  auto dst_ptr = dst.data<DstT>() + o_offset;
  auto i_offset_ptr =
      dynamic_i_offset ? dynamic_i_offset->data<int64_t>() : nullptr;
  auto o_offset_ptr =
      dynamic_o_offset ? dynamic_o_offset->data<int64_t>() : nullptr;
  auto size = src.size();
  if (data_shape.empty()) {
    // Scalar copy.
    // NOTE(review): the dynamic offsets are not applied on this path (same
    // as before this change) - confirm that this is intended.
    auto val = static_cast<DstT>(*src_ptr);
    *dst_ptr = val;
    return;
  }
  auto [shape, strides] =
      collapse_contiguous_dims(data_shape, {i_strides, o_strides});

  // The dynamic offsets apply to every non-scalar path, so resolve them once.
  if (i_offset_ptr) {
    src_ptr += i_offset_ptr[0];
  }
  if (o_offset_ptr) {
    dst_ptr += o_offset_ptr[0];
  }

  int ndim = shape.size();
  // Up to three dimensions are handled by one direct copy_dims call. The
  // bound is inclusive so that the ndim == 3 case is actually reachable.
  if (ndim <= 3) {
    if (ndim == 1) {
      copy_dims<SrcT, DstT, 1>(
          src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
    } else if (ndim == 2) {
      copy_dims<SrcT, DstT, 2>(
          src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
    } else if (ndim == 3) {
      copy_dims<SrcT, DstT, 3>(
          src_ptr, dst_ptr, shape, strides[0], strides[1], 0);
    }
    return;
  }

  // Iterate over the leading ndim - 3 dimensions and copy the trailing three
  // dimensions in one go for each position.
  ContiguousIterator in(shape, strides[0], ndim - 3);
  ContiguousIterator out(shape, strides[1], ndim - 3);
  // Seed with int64_t so the product is accumulated in 64 bits; an `int`
  // seed would make std::accumulate truncate every partial product to int.
  auto stride = std::accumulate(
      shape.end() - 3, shape.end(), int64_t{1}, std::multiplies<int64_t>());
  for (int64_t elem = 0; elem < size; elem += stride) {
    copy_dims<SrcT, DstT, 3>(
        src_ptr + in.loc,
        dst_ptr + out.loc,
        shape,
        strides[0],
        strides[1],
        ndim - 3);
    in.step();
    out.step();
  }
}

// Convenience overload: strided copy over the full shape of `src`, using the
// arrays' own strides, zero offsets and no dynamic offsets.
template <typename SrcT, typename DstT>
inline void copy_general_general(const array& src, array& dst) {
  const auto& full_shape = src.shape();
  const auto& in_strides = src.strides();
  const auto& out_strides = dst.strides();
  copy_general_general<SrcT, DstT>(
      src,
      dst,
      full_shape,
      in_strides,
      out_strides,
      0,
      0,
      std::nullopt,
      std::nullopt);
}

// Strided-source copy into a destination laid out with the contiguous
// strides of `data_shape`. The caller-provided output strides are ignored.
template <typename SrcT, typename DstT>
void copy_general(
    const array& src,
    array& dst,
    const Shape& data_shape,
    const Strides& i_strides,
    const Strides&,
    int64_t i_offset,
    int64_t o_offset,
    const std::optional<array>& dynamic_i_offset,
    const std::optional<array>& dynamic_o_offset) {
  const auto dense_strides = make_contiguous_strides(data_shape);
  copy_general_general<SrcT, DstT>(
      src,
      dst,
      data_shape,
      i_strides,
      dense_strides,
      i_offset,
      o_offset,
      dynamic_i_offset,
      dynamic_o_offset);
}

// Convenience overload: copy the full shape of `src` through its own strides
// into a destination addressed with contiguous strides for that shape.
template <typename SrcT, typename DstT>
inline void copy_general(const array& src, array& dst) {
  const auto& full_shape = src.shape();
  const auto dense_strides = make_contiguous_strides(full_shape);
  copy_general_general<SrcT, DstT>(
      src,
      dst,
      full_shape,
      src.strides(),
      dense_strides,
      0,
      0,
      std::nullopt,
      std::nullopt);
}

// Select the copy kernel matching `ctype` for a fixed (SrcT, DstT) pair.
// Extra arguments are forwarded only to the two strided kernels.
template <typename SrcT, typename DstT, typename... Args>
void copy(const array& src, array& dst, CopyType ctype, Args&&... args) {
  if (ctype == CopyType::Scalar) {
    copy_single<SrcT, DstT>(src, dst);
  } else if (ctype == CopyType::Vector) {
    copy_vector<SrcT, DstT>(src, dst);
  } else if (ctype == CopyType::General) {
    copy_general<SrcT, DstT>(src, dst, std::forward<Args>(args)...);
  } else if (ctype == CopyType::GeneralGeneral) {
    copy_general_general<SrcT, DstT>(src, dst, std::forward<Args>(args)...);
  }
}

// Resolve the destination element type from `dst.dtype()` and forward to the
// fully typed copy for the given source element type.
template <typename SrcT, typename... Args>
void copy(const array& src, array& dst, CopyType ctype, Args&&... args) {
  switch (dst.dtype()) {
    case bool_:
      return copy<SrcT, bool>(src, dst, ctype, std::forward<Args>(args)...);
    case uint8:
      return copy<SrcT, uint8_t>(src, dst, ctype, std::forward<Args>(args)...);
    case uint16:
      return copy<SrcT, uint16_t>(
          src, dst, ctype, std::forward<Args>(args)...);
    case uint32:
      return copy<SrcT, uint32_t>(
          src, dst, ctype, std::forward<Args>(args)...);
    case uint64:
      return copy<SrcT, uint64_t>(
          src, dst, ctype, std::forward<Args>(args)...);
    case int8:
      return copy<SrcT, int8_t>(src, dst, ctype, std::forward<Args>(args)...);
    case int16:
      return copy<SrcT, int16_t>(src, dst, ctype, std::forward<Args>(args)...);
    case int32:
      return copy<SrcT, int32_t>(src, dst, ctype, std::forward<Args>(args)...);
    case int64:
      return copy<SrcT, int64_t>(src, dst, ctype, std::forward<Args>(args)...);
    case float16:
      return copy<SrcT, float16_t>(
          src, dst, ctype, std::forward<Args>(args)...);
    case float32:
      return copy<SrcT, float>(src, dst, ctype, std::forward<Args>(args)...);
    case float64:
      return copy<SrcT, double>(src, dst, ctype, std::forward<Args>(args)...);
    case bfloat16:
      return copy<SrcT, bfloat16_t>(
          src, dst, ctype, std::forward<Args>(args)...);
    case complex64:
      return copy<SrcT, complex64_t>(
          src, dst, ctype, std::forward<Args>(args)...);
  }
}

template <typename... Args>
inline void copy_inplace_dispatch(
    const array& src,
    array& dst,
    CopyType ctype,
    Args&&... args) {
  switch (src.dtype()) {
    case bool_:
      copy<bool>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case uint8:
      copy<uint8_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case uint16:
      copy<uint16_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case uint32:
      copy<uint32_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case uint64:
      copy<uint64_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case int8:
      copy<int8_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case int16:
      copy<int16_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case int32:
      copy<int32_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case int64:
      copy<int64_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case float16:
      copy<float16_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case float32:
      copy<float>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case float64:
      copy<double>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case bfloat16:
      copy<bfloat16_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
    case complex64:
      copy<complex64_t>(src, dst, ctype, std::forward<Args>(args)...);
      break;
  }
}

} // namespace

void copy_cpu_inplace(
    const array& src,
    array& dst,
    CopyType ctype,
    Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(src);
  encoder.set_output_array(dst);
  encoder.dispatch(
      [src = array::unsafe_weak_copy(src),
       dst = array::unsafe_weak_copy(dst),
       ctype]() mutable { copy_inplace_dispatch(src, dst, ctype); });
}

void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream) {
  bool donated = set_copy_output_data(src, dst, ctype);
  if (donated && src.dtype() == dst.dtype()) {
    // If the output has the same type as the input then there is nothing to
    // copy, just use the buffer.
    return;
  }
  if (ctype == CopyType::GeneralGeneral) {
    ctype = CopyType::General;
  }
  copy_cpu_inplace(src, dst, ctype, stream);
}

void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
    const Strides& i_strides,
    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype,
    Stream stream,
    const std::optional<array>& dynamic_i_offset, /* = std::nullopt */
    const std::optional<array>& dynamic_o_offset /* = std::nullopt */) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(src);
  encoder.set_output_array(dst);
  auto weak_copy_if_set = [](auto x) -> std::optional<array> {
    if (x) {
      return array::unsafe_weak_copy(*x);
    } else {
      return std::nullopt;
    }
  };
  encoder.dispatch(
      [src = array::unsafe_weak_copy(src),
       dst = array::unsafe_weak_copy(dst),
       data_shape,
       i_strides,
       o_strides,
       i_offset,
       o_offset,
       ctype,
       dynamic_i_offset = weak_copy_if_set(dynamic_i_offset),
       dynamic_o_offset = weak_copy_if_set(dynamic_o_offset)]() mutable {
        switch (ctype) {
          case CopyType::General:
          case CopyType::GeneralGeneral:
            copy_inplace_dispatch(
                src,
                dst,
                ctype,
                data_shape,
                i_strides,
                o_strides,
                i_offset,
                o_offset,
                dynamic_i_offset,
                dynamic_o_offset);
            break;
          case CopyType::Scalar:
          case CopyType::Vector:
            copy_inplace_dispatch(src, dst, ctype);
        }
      });
}

// Create a new array with the shape and dtype of `arr` and fill it through a
// General copy scheduled on `stream`.
array contiguous_copy_cpu(const array& arr, Stream stream) {
  array result(arr.shape(), arr.dtype(), nullptr, {});
  copy_cpu(arr, result, CopyType::General, stream);
  return result;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/copy.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <optional>

#include "mlx/array.h"
#include "mlx/backend/common/copy.h"
#include "mlx/backend/common/utils.h"

namespace mlx::core {

// Copy |src| into |dst| on |stream|. The output data of |dst| is set up by
// this call; if the input buffer was donated and the dtypes match, no element
// copy is scheduled.
void copy_cpu(const array& src, array& dst, CopyType ctype, Stream stream);

// Copy |src| into |dst| on |stream| without setting up |dst|'s data, using
// the arrays' own shapes and strides.
void copy_cpu_inplace(
    const array& src,
    array& dst,
    CopyType ctype,
    Stream stream);

// Copy |src| into |dst| on |stream| without setting up |dst|'s data, using an
// explicit |data_shape|, per-array strides and element offsets
// (|i_offset| / |o_offset|) into the source / destination data. When set,
// the first int64 element of |dynamic_i_offset| / |dynamic_o_offset| is
// added to the corresponding offset by the strided copy kernels.
void copy_cpu_inplace(
    const array& src,
    array& dst,
    const Shape& data_shape,
    const Strides& i_strides,
    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype,
    Stream stream,
    const std::optional<array>& dynamic_i_offset = std::nullopt,
    const std::optional<array>& dynamic_o_offset = std::nullopt);

// Return a contiguous array with same shape that copies the data of |arr|.
array contiguous_copy_cpu(const array& arr, Stream stream);

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/device_info.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cpu/device_info.h"

#ifdef __APPLE__
#include <sys/sysctl.h>
#include <sys/utsname.h>
#elif defined(_WIN32)
#include <windows.h>
#else
#include <sys/utsname.h>
#include <fstream>
#endif

namespace mlx::core::cpu {

namespace {

// Query the CPU architecture name at runtime; "unknown" when it cannot be
// determined.
std::string get_cpu_architecture() {
#ifdef _WIN32
  // GetNativeSystemInfo is used so the native architecture is reported even
  // when running under WoW64 emulation.
  SYSTEM_INFO native_info;
  GetNativeSystemInfo(&native_info);
  const auto arch = native_info.wProcessorArchitecture;
  if (arch == PROCESSOR_ARCHITECTURE_AMD64) {
    return "x86_64";
  }
  if (arch == PROCESSOR_ARCHITECTURE_ARM64) {
    return "arm64";
  }
  if (arch == PROCESSOR_ARCHITECTURE_INTEL) {
    return "x86";
  }
  if (arch == PROCESSOR_ARCHITECTURE_ARM) {
    return "arm";
  }
  return "unknown";
#else
  // On Unix-like systems the machine name reported by uname() is returned
  // verbatim.
  struct utsname sys_info;
  if (uname(&sys_info) != 0) {
    return "unknown";
  }
  return std::string(sys_info.machine);
#endif
}

// Get CPU device name (brand string).
//
// Each platform has its own lookup; when it fails (or finds nothing) the
// architecture string from get_cpu_architecture() is returned instead.
std::string get_cpu_name() {
#ifdef __APPLE__
  char model[256];
  size_t len = sizeof(model);
  if (sysctlbyname("machdep.cpu.brand_string", &model, &len, NULL, 0) == 0) {
    return std::string(model);
  }
#elif defined(_WIN32)
  // Read CPU brand string from registry
  HKEY hKey;
  if (RegOpenKeyExA(
          HKEY_LOCAL_MACHINE,
          "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
          0,
          KEY_READ,
          &hKey) == ERROR_SUCCESS) {
    char brand[256];
    // Registry string data is not guaranteed to be stored with a terminating
    // null, so keep one byte in reserve and terminate explicitly before the
    // buffer is treated as a C string.
    DWORD size = sizeof(brand) - 1;
    const auto status = RegQueryValueExA(
        hKey, "ProcessorNameString", NULL, NULL, (LPBYTE)brand, &size);
    RegCloseKey(hKey);
    if (status == ERROR_SUCCESS) {
      brand[size] = '\0';
      return std::string(brand);
    }
  }
#else
  // Try reading from /proc/cpuinfo on Linux
  std::ifstream cpuinfo("/proc/cpuinfo");
  if (cpuinfo.is_open()) {
    std::string line;
    while (std::getline(cpuinfo, line)) {
      // Use the text after ": " on the first "model name" line that has one.
      if (line.starts_with("model name")) {
        if (auto n = line.find(": "); n != std::string::npos) {
          return line.substr(n + 2);
        }
      }
    }
  }
#endif
  return get_cpu_architecture();
}

} // anonymous namespace

// The CPU backend is unconditionally reported as available.
bool is_available() {
  return true;
}

// Always reports a single CPU device.
int device_count() {
  return 1;
}

// Return the CPU device properties ("device_name" and "architecture").
// The map is built once, on first use; the device index is ignored.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int /* device_index */) {
  using InfoMap =
      std::unordered_map<std::string, std::variant<std::string, size_t>>;
  static InfoMap info = [] {
    InfoMap properties;
    properties.emplace("device_name", get_cpu_name());
    properties.emplace("architecture", get_cpu_architecture());
    return properties;
  }();
  return info;
}

} // namespace mlx::core::cpu


================================================
FILE: mlx/backend/cpu/device_info.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

namespace mlx::core::cpu {

/**
 * Whether the CPU backend is available.
 */
bool is_available();

/**
 * Get the number of available CPU devices.
 *
 * For CPU, always returns 1.
 */
int device_count();

/**
 * Get CPU device information.
 *
 * Returns a map with basic CPU device properties.
 */
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index = 0);

} // namespace mlx::core::cpu


================================================
FILE: mlx/backend/cpu/distributed.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <cassert>

#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/distributed/primitives.h"

namespace mlx::core::distributed {

// Return `arr` itself when it is row contiguous, otherwise a contiguous copy
// made on `stream`. The flag tells the caller whether a copy was made.
std::pair<array, bool> ensure_row_contiguous(const array& arr, Stream stream) {
  const bool needs_copy = !arr.flags().row_contiguous;
  if (needs_copy) {
    return {contiguous_copy_cpu(arr, stream), true};
  }
  return {arr, false};
}

// CPU evaluation of AllReduce: prepare a row-contiguous input and the output
// buffer, then run the reduction selected by `reduce_type_` over the group.
void AllReduce::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  // Returns the array to reduce from and sets up `out`'s data:
  //  - non-contiguous input: make a contiguous copy and share it with `out`
  //  - contiguous, donatable input: `out` shares the input buffer
  //  - contiguous, non-donatable input: `out` gets a fresh allocation
  auto prepare_input = [s = stream()](const array& in, array& out) {
    if (!in.flags().row_contiguous) {
      array dense = contiguous_copy_cpu(in, s);
      out.copy_shared_buffer(dense);
      return dense;
    }
    if (in.is_donatable()) {
      out.copy_shared_buffer(in);
    } else {
      out.set_data(allocator::malloc(out.nbytes()));
    }
    return in;
  };

  auto& out = outputs[0];
  auto in = prepare_input(inputs[0], out);
  switch (reduce_type_) {
    case Sum:
      distributed::detail::all_sum(group(), in, out, stream());
      break;
    case Max:
      distributed::detail::all_max(group(), in, out, stream());
      break;
    case Min:
      distributed::detail::all_min(group(), in, out, stream());
      break;
    default:
      throw std::runtime_error(
          "Only all reduce sum, min and max are supported for now");
  }
}

// CPU evaluation of AllGather: gather a row-contiguous version of the input
// into a freshly allocated output.
void AllGather::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  auto& out = outputs[0];
  auto [src, is_temp_copy] = ensure_row_contiguous(inputs[0], stream());
  out.set_data(allocator::malloc(out.nbytes()));
  distributed::detail::all_gather(group(), src, out, stream());

  // A contiguous copy made above is registered as an encoder temporary.
  if (is_temp_copy) {
    cpu::get_command_encoder(stream()).add_temporary(src);
  }
}

// CPU evaluation of Send: send a row-contiguous version of the input to
// `dst_`; the output shares the buffer of the original input.
void Send::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  const auto& original = inputs[0];
  auto [payload, is_temp_copy] = ensure_row_contiguous(original, stream());
  distributed::detail::send(group(), payload, dst_, stream());
  outputs[0].copy_shared_buffer(original);

  // A contiguous copy made above is registered as an encoder temporary.
  if (is_temp_copy) {
    cpu::get_command_encoder(stream()).add_temporary(payload);
  }
}

// CPU evaluation of Recv: allocate the output and receive into it from
// `src_`.
void Recv::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 0);
  assert(outputs.size() == 1);

  auto& out = outputs[0];
  out.set_data(allocator::malloc(out.nbytes()));
  distributed::detail::recv(group(), out, src_, stream());
}

// ReduceScatter has no CPU implementation: evaluating it always throws
// std::runtime_error.
void ReduceScatter::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  throw std::runtime_error("[ReduceScatter] Not implemented yet.");
}
} // namespace mlx::core::distributed


================================================
FILE: mlx/backend/cpu/eig.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

// Build a complex64_t from separate real and imaginary parts, converting
// both to float.
template <typename T>
complex64_t to_complex(T r, T i) {
  const auto real_part = static_cast<float>(r);
  const auto imag_part = static_cast<float>(i);
  return complex64_t(real_part, imag_part);
}

// Primary template, intentionally empty: the usable implementations are the
// specializations for floating point types and for std::complex<float>.
template <typename T, class Enable = void>
struct EigWork {};

// geev driver for real (floating point) inputs.
//
// Holds the job flags, the matrix size and the scratch buffers so the same
// workspace can be reused for every matrix of a batch. Results are written
// as complex64_t.
template <typename T>
struct EigWork<
    T,
    typename std::enable_if<std::is_floating_point<T>::value>::type> {
  using O = complex64_t;

  char jobl;
  char jobr;
  int N;
  int lwork;
  int info;
  // buffers[0]: 2 * N values of T for the eigenvalues (N real parts followed
  //             by N imaginary parts, see run()).
  // buffers[1]: N * N * 2 values of T for the eigenvectors; only allocated
  //             when eigenvectors are requested.
  // buffers.back(): geev work array of lwork values.
  std::vector<array::Data> buffers;

  EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
      : jobl(jobl_), jobr(jobr_), N(N_), lwork(-1) {
    // First geev call is made with lwork == -1 and no data pointers; the
    // value it writes to `work` is then used as the work array length
    // (the LAPACK workspace-query convention).
    T work;
    int n_vecs_l = compute_eigenvectors ? N_ : 1;
    int n_vecs_r = 1;
    geev<T>(
        &jobl,
        &jobr,
        &N,
        nullptr,
        &N,
        nullptr,
        nullptr,
        nullptr,
        &n_vecs_l,
        nullptr,
        &n_vecs_r,
        &work,
        &lwork,
        &info);
    lwork = static_cast<int>(work);

    buffers.emplace_back(allocator::malloc(sizeof(T) * N * 2));
    if (compute_eigenvectors) {
      buffers.emplace_back(allocator::malloc(sizeof(T) * N * N * 2));
    }
    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
  }

  // Decompose one N x N matrix `a`. Writes N eigenvalues to `values` and,
  // when `vectors` is non-null, N * N eigenvector entries to `vectors`.
  // The geev status is left in `info` for the caller to check.
  void run(T* a, O* values, O* vectors) {
    auto eig_tmp = static_cast<T*>(buffers[0].buffer.raw_ptr());
    T* vec_tmp = nullptr;
    if (vectors) {
      vec_tmp = static_cast<T*>(buffers[1].buffer.raw_ptr());
    }
    auto work = static_cast<T*>(buffers.back().buffer.raw_ptr());

    int n_vecs_l = vectors ? N : 1;
    int n_vecs_r = 1;
    geev<T>(
        &jobl,
        &jobr,
        &N,
        a,
        &N,
        eig_tmp,
        eig_tmp + N,
        vectors ? vec_tmp : nullptr,
        &n_vecs_l,
        nullptr,
        &n_vecs_r,
        work,
        &lwork,
        &info);

    // Combine the separate real / imaginary eigenvalue arrays.
    for (int i = 0; i < N; ++i) {
      values[i] = to_complex(eig_tmp[i], eig_tmp[N + i]);
    }

    if (vectors) {
      for (int i = 0; i < N; ++i) {
        if (values[i].imag() != 0) {
          // Complex eigenvalue: blocks i and i + 1 of vec_tmp are combined
          // as real and imaginary parts into a pair of conjugate output
          // vectors, and the extra increment below skips block i + 1.
          for (int j = 0; j < N; ++j) {
            vectors[i * N + j] =
                to_complex(vec_tmp[i * N + j], -vec_tmp[(i + 1) * N + j]);
            vectors[(i + 1) * N + j] =
                to_complex(vec_tmp[i * N + j], vec_tmp[(i + 1) * N + j]);
          }
          i += 1;
        } else {
          // Real eigenvalue: the vector is real, imaginary part is zero.
          for (int j = 0; j < N; ++j) {
            vectors[i * N + j] = to_complex(vec_tmp[i * N + j], T(0.0));
          }
        }
      }
    }
  }
};

// geev driver for std::complex<float> inputs.
//
// Eigenvalues and eigenvectors are written by geev directly into the
// caller-provided output pointers; only the work arrays are owned here.
template <>
struct EigWork<std::complex<float>> {
  using T = std::complex<float>;
  using R = float;
  using O = T;

  char jobl;
  char jobr;
  int N;
  int lwork;
  int lrwork;
  int info;
  // buffers[0]: complex work array of lwork values.
  // buffers[1]: real work array of lrwork (2 * N) values.
  std::vector<array::Data> buffers;

  EigWork(char jobl_, char jobr_, int N_, bool compute_eigenvectors)
      : jobl(jobl_), jobr(jobr_), N(N_), lwork(-1), lrwork(2 * N_) {
    // First geev call is made with lwork == -1 and no data pointers; the
    // real part of the value it writes to `work` is then used as the work
    // array length (the LAPACK workspace-query convention).
    T work;
    R rwork;
    int n_vecs_l = compute_eigenvectors ? N_ : 1;
    int n_vecs_r = 1;
    geev<T>(
        &jobl,
        &jobr,
        &N,
        nullptr,
        &N,
        nullptr,
        nullptr,
        &n_vecs_l,
        nullptr,
        &n_vecs_r,
        &work,
        &lwork,
        &rwork,
        &info);
    lwork = static_cast<int>(work.real());
    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
  }

  // Decompose one N x N matrix `a`, writing eigenvalues to `values` and
  // eigenvectors to `vectors` (which may be null when not requested).
  // The geev status is left in `info` for the caller to check.
  void run(T* a, T* values, T* vectors) {
    int n_vecs_l = vectors ? N : 1;
    int n_vecs_r = 1;
    geev<T>(
        &jobl,
        &jobr,
        &N,
        a,
        &N,
        values,
        vectors,
        &n_vecs_l,
        nullptr,
        &n_vecs_r,
        static_cast<T*>(buffers[0].buffer.raw_ptr()),
        &lwork,
        static_cast<R*>(buffers[1].buffer.raw_ptr()),
        &info);
  }
};

// Schedule the eigen-decomposition of every N x N matrix stored in `a`.
//
// Eigenvalues are written to `values`; eigenvectors are written to `vectors`
// only when `compute_eigenvectors` is set. `a` is registered as an encoder
// temporary after the task is dispatched.
template <typename T>
void eig_impl(
    array& a,
    array& vectors,
    array& values,
    bool compute_eigenvectors,
    Stream stream) {
  auto& enc = cpu::get_command_encoder(stream);
  enc.set_input_array(a);
  enc.set_output_array(values);

  auto a_ptr = a.data<T>();
  auto val_ptr = values.data<complex64_t>();
  complex64_t* vec_ptr = nullptr;
  if (compute_eigenvectors) {
    enc.set_output_array(vectors);
    vec_ptr = vectors.data<complex64_t>();
  }

  enc.dispatch([a_ptr,
                val_ptr,
                vec_ptr,
                compute_eigenvectors,
                N = vectors.shape(-1),
                size = vectors.size()]() mutable {
    char jobl = compute_eigenvectors ? 'V' : 'N';
    char jobr = 'N';

    // One workspace is shared by all matrices of the batch.
    EigWork<T> work(jobl, jobr, N, compute_eigenvectors);

    const size_t num_matrices = size / (N * N);
    for (size_t m = 0; m < num_matrices; ++m) {
      work.run(a_ptr, val_ptr, vec_ptr);
      if (work.info != 0) {
        std::stringstream msg;
        msg << "[Eig::eval_cpu] Eigenvalue decomposition failed with error code "
            << work.info;
        throw std::runtime_error(msg.str());
      }
      // Advance to the next matrix of the batch.
      a_ptr += N * N;
      val_ptr += N;
      if (vec_ptr) {
        vec_ptr += N * N;
      }
    }
  });
  enc.add_temporary(a);
}

} // namespace

// CPU evaluation of Eig.
//
// outputs[0] receives the eigenvalues; outputs[1] receives the eigenvectors
// when compute_eigenvectors_ is set. The input is copied first and the copy
// is what gets passed to the decomposition.
void Eig::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  const auto& a = inputs[0];
  auto& values = outputs[0];

  // Without eigenvectors a placeholder array (no data set) stands in so that
  // eig_impl can still read the matrix size and element count from it.
  auto vectors = compute_eigenvectors_
      ? outputs[1]
      : array(a.shape(), complex64, nullptr, {});

  // Work on a copy of the input.
  auto a_copy = array(a.shape(), a.dtype(), nullptr, {});
  copy_cpu(
      a,
      a_copy,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
      stream());

  values.set_data(allocator::malloc(values.nbytes()));

  if (compute_eigenvectors_) {
    // Set the strides and flags so the eigenvectors
    // are in the columns of the output
    auto flags = vectors.flags();
    auto strides = vectors.strides();
    auto ndim = a.ndim();
    std::swap(strides[ndim - 1], strides[ndim - 2]);

    // With swapped trailing strides the output is no longer row contiguous;
    // it is marked column contiguous only for a single (2-D) matrix.
    if (a.size() > 1) {
      flags.row_contiguous = false;
      if (ndim > 2) {
        flags.col_contiguous = false;
      } else {
        flags.col_contiguous = true;
      }
    }
    vectors.set_data(
        allocator::malloc(vectors.nbytes()), vectors.size(), strides, flags);
  }
  // Dispatch on the input dtype.
  switch (a.dtype()) {
    case float32:
      eig_impl<float>(a_copy, vectors, values, compute_eigenvectors_, stream());
      break;
    case float64:
      eig_impl<double>(
          a_copy, vectors, values, compute_eigenvectors_, stream());
      break;
    case complex64:
      eig_impl<std::complex<float>>(
          a_copy, vectors, values, compute_eigenvectors_, stream());
      break;
    default:
      throw std::runtime_error(
          "[Eig::eval_cpu] only supports float32, float64, or complex64.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/eigh.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/allocator.h"
#include "mlx/array.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/linalg.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

// Primary template, intentionally empty: the usable implementations are the
// specializations for floating point types and for std::complex<float>.
template <typename T, class Enable = void>
struct EighWork {};

// syevd driver for real (floating point) inputs.
//
// Holds the job flags, the matrix size and the work arrays so the same
// workspace can be reused for every matrix of a batch.
template <typename T>
struct EighWork<
    T,
    typename std::enable_if<std::is_floating_point<T>::value>::type> {
  using R = T;

  char jobz;
  char uplo;
  int N;
  int lwork;
  int liwork;
  int info;
  // buffers[0]: work array of lwork values of T.
  // buffers[1]: integer work array of liwork ints.
  std::vector<array::Data> buffers;

  EighWork(char jobz_, char uplo_, int N_)
      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), liwork(-1) {
    // First syevd call is made with lwork == liwork == -1 and no data
    // pointers; the values it writes to `work` / `iwork` are then used as
    // the work array lengths (the LAPACK workspace-query convention).
    T work;
    int iwork;
    syevd<T>(
        &jobz,
        &uplo,
        &N,
        nullptr,
        &N,
        nullptr,
        &work,
        &lwork,
        &iwork,
        &liwork,
        &info);
    lwork = static_cast<int>(work);
    liwork = iwork;
    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
  }

  // Decompose one N x N matrix in place: `vectors` is passed to syevd as the
  // matrix argument and `values` receives the eigenvalues. The syevd status
  // is left in `info` for the caller to check.
  void run(T* vectors, T* values) {
    syevd<T>(
        &jobz,
        &uplo,
        &N,
        vectors,
        &N,
        values,
        static_cast<T*>(buffers[0].buffer.raw_ptr()),
        &lwork,
        static_cast<int*>(buffers[1].buffer.raw_ptr()),
        &liwork,
        &info);
  }
};

// heevd driver for std::complex<float> inputs. Eigenvalues are real (float).
template <>
struct EighWork<std::complex<float>> {
  using T = std::complex<float>;
  using R = float;

  char jobz;
  char uplo;
  int N;
  int lwork;
  int lrwork;
  int liwork;
  int info;
  // buffers[0]: complex work array of lwork values.
  // buffers[1]: real work array of lrwork floats.
  // buffers[2]: integer work array of liwork ints.
  std::vector<array::Data> buffers;

  EighWork(char jobz_, char uplo_, int N_)
      : jobz(jobz_), uplo(uplo_), N(N_), lwork(-1), lrwork(-1), liwork(-1) {
    // First heevd call is made with all work lengths set to -1 and no data
    // pointers; the values it writes to `work` / `rwork` / `iwork` are then
    // used as the work array lengths (the LAPACK workspace-query convention).
    T work;
    R rwork;
    int iwork;
    heevd<T>(
        &jobz,
        &uplo,
        &N,
        nullptr,
        &N,
        nullptr,
        &work,
        &lwork,
        &rwork,
        &lrwork,
        &iwork,
        &liwork,
        &info);
    lwork = static_cast<int>(work.real());
    lrwork = static_cast<int>(rwork);
    liwork = iwork;
    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
    buffers.emplace_back(allocator::malloc(sizeof(R) * lrwork));
    buffers.emplace_back(allocator::malloc(sizeof(int) * liwork));
  }

  // Decompose one N x N matrix in place: `vectors` is passed to heevd as the
  // matrix argument and `values` receives the eigenvalues. When eigenvectors
  // are requested (jobz == 'V') all N * N entries of `vectors` are conjugated
  // afterwards. The heevd status is left in `info` for the caller to check.
  void run(T* vectors, R* values) {
    heevd<T>(
        &jobz,
        &uplo,
        &N,
        vectors,
        &N,
        values,
        static_cast<T*>(buffers[0].buffer.raw_ptr()),
        &lwork,
        static_cast<R*>(buffers[1].buffer.raw_ptr()),
        &lrwork,
        static_cast<int*>(buffers[2].buffer.raw_ptr()),
        &liwork,
        &info);
    if (jobz == 'V') {
      // We have pre-transposed the vectors but we also must conjugate them
      // when they are complex.
      //
      // We could vectorize this but it is so fast in comparison to heevd that
      // it doesn't really matter.
      for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
          *vectors = std::conj(*vectors);
          vectors++;
        }
      }
    }
  }
};

// Schedule the in-place decomposition of every N x N matrix stored in
// `vectors`, writing the eigenvalues to `values`.
//
// Only the first character of `uplo` is used. When eigenvectors are not
// requested, `vectors` is registered as an encoder temporary.
template <typename T>
void eigh_impl(
    array& vectors,
    array& values,
    const std::string& uplo,
    bool compute_eigenvectors,
    Stream stream) {
  using R = typename EighWork<T>::R;

  auto& enc = cpu::get_command_encoder(stream);
  enc.set_output_array(vectors);
  enc.set_output_array(values);

  auto vec_ptr = vectors.data<T>();
  auto eig_ptr = values.data<R>();
  char jobz = compute_eigenvectors ? 'V' : 'N';

  enc.dispatch([vec_ptr,
                eig_ptr,
                jobz,
                uplo = uplo[0],
                N = vectors.shape(-1),
                size = vectors.size()]() mutable {
    // One workspace (sized by a query in the constructor) serves the batch.
    EighWork<T> work(jobz, uplo, N);

    const size_t num_matrices = size / (N * N);
    for (size_t m = 0; m < num_matrices; ++m) {
      work.run(vec_ptr, eig_ptr);
      if (work.info != 0) {
        std::stringstream msg;
        msg << "[Eigh::eval_cpu] Eigenvalue decomposition failed with error code "
            << work.info;
        throw std::runtime_error(msg.str());
      }
      // Advance to the next matrix of the batch.
      vec_ptr += N * N;
      eig_ptr += N;
    }
  });

  if (!compute_eigenvectors) {
    enc.add_temporary(vectors);
  }
}

} // namespace

// CPU evaluation of Eigh.
//
// outputs[0] receives the eigenvalues; outputs[1] receives the eigenvectors
// when compute_eigenvectors_ is set. The input is copied into `vectors` and
// decomposed there in place.
//
// Throws std::runtime_error for input dtypes other than float32, float64 and
// complex64.
void Eigh::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  const auto& a = inputs[0];
  auto& values = outputs[0];

  // Without eigenvectors a scratch array takes the place of outputs[1]; it
  // still receives the copy of the input that the decomposition works on.
  auto vectors = compute_eigenvectors_
      ? outputs[1]
      : array(a.shape(), a.dtype(), nullptr, {});

  values.set_data(allocator::malloc(values.nbytes()));

  copy_cpu(
      a,
      vectors,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
      stream());

  if (compute_eigenvectors_) {
    // Set the strides and flags so the eigenvectors
    // are in the columns of the output
    auto flags = vectors.flags();
    auto strides = vectors.strides();
    auto ndim = a.ndim();
    std::swap(strides[ndim - 1], strides[ndim - 2]);

    // With swapped trailing strides the output is no longer row contiguous;
    // it is marked column contiguous only for a single (2-D) matrix.
    if (a.size() > 1) {
      flags.row_contiguous = false;
      if (ndim > 2) {
        flags.col_contiguous = false;
      } else {
        flags.col_contiguous = true;
      }
    }
    vectors.copy_shared_buffer(vectors, strides, flags, vectors.data_size());
  }
  // Dispatch on the input dtype.
  switch (a.dtype()) {
    case float32:
      eigh_impl<float>(vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
    case float64:
      eigh_impl<double>(
          vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
    case complex64:
      eigh_impl<std::complex<float>>(
          vectors, values, uplo_, compute_eigenvectors_, stream());
      break;
    default:
      // complex64 is handled above, so it belongs in the list of supported
      // types reported to the user.
      throw std::runtime_error(
          "[Eigh::eval_cpu] only supports float32, float64, or complex64.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/encoder.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cpu/encoder.h"

namespace mlx::core::cpu {

// Return the encoder associated with `stream.index`, creating it on first
// use. Encoders live in a function-local static map.
CommandEncoder& get_command_encoder(Stream stream) {
  static std::unordered_map<int, CommandEncoder> encoders;
  if (auto found = encoders.find(stream.index); found != encoders.end()) {
    return found->second;
  }
  return encoders.emplace(stream.index, stream).first->second;
}

} // namespace mlx::core::cpu


================================================
FILE: mlx/backend/cpu/encoder.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <unordered_map>

#include "mlx/array.h"
#include "mlx/scheduler.h"

namespace mlx::core::cpu {

// Number of dispatches per scheduler task
constexpr int DISPATCHES_PER_TASK = 10;

// Per-stream helper that enqueues CPU work on the scheduler and keeps a list
// of temporary arrays. The type is neither copyable nor movable.
struct MLX_API CommandEncoder {
  CommandEncoder(Stream stream) : stream_(stream) {}

  CommandEncoder(const CommandEncoder&) = delete;
  CommandEncoder& operator=(const CommandEncoder&) = delete;
  CommandEncoder(CommandEncoder&&) = delete;
  CommandEncoder& operator=(CommandEncoder&&) = delete;

  // Input / output registration hooks; both are no-ops in this encoder.
  void set_input_array(const array& a) {}
  void set_output_array(array& a) {}

  // Hold onto a temporary until any already scheduled tasks which use it as
  // an input are complete.
  void add_temporary(array arr) {
    temporaries_.push_back(std::move(arr));
  }

  // Same as add_temporary, for a whole vector of arrays (moved in).
  void add_temporaries(std::vector<array> arrays) {
    temporaries_.insert(
        temporaries_.end(),
        std::make_move_iterator(arrays.begin()),
        std::make_move_iterator(arrays.end()));
  }

  // Mutable access to the accumulated temporaries.
  std::vector<array>& temporaries() {
    return temporaries_;
  }

  // Bind `f` to `args` and enqueue the result on this encoder's stream.
  //
  // Every DISPATCHES_PER_TASK-th dispatch is additionally reported to the
  // scheduler: notify_new_task is called before enqueueing and the task is
  // wrapped so that notify_task_completion runs right after it. All other
  // dispatches are enqueued without any notification.
  template <class F, class... Args>
  void dispatch(F&& f, Args&&... args) {
    num_ops_ = (num_ops_ + 1) % DISPATCHES_PER_TASK;
    auto task = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
    if (num_ops_ == 0) {
      scheduler::notify_new_task(stream_);
      auto task_wrap = [s = stream_, task = std::move(task)]() mutable {
        task();
        scheduler::notify_task_completion(s);
      };
      scheduler::enqueue(stream_, std::move(task_wrap));
    } else {
      scheduler::enqueue(stream_, std::move(task));
    }
  }

 private:
  Stream stream_;
  std::vector<array> temporaries_;
  // Dispatch counter, kept modulo DISPATCHES_PER_TASK.
  int num_ops_{0};
};

MLX_API CommandEncoder& get_command_encoder(Stream stream);

} // namespace mlx::core::cpu


================================================
FILE: mlx/backend/cpu/eval.cpp
================================================
// Copyright © 2025 Apple Inc.
#include "mlx/backend/cpu/eval.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/primitives.h"
#include "mlx/scheduler.h"
#include "mlx/utils.h"

namespace mlx::core::cpu {

// Run the CPU implementation of `arr`'s primitive, then enqueue an empty task
// on the primitive's stream that owns the buffers of the inputs and siblings
// together with the encoder's pending temporaries.
void eval(array& arr) {
  auto stream = arr.primitive().stream();

  auto outputs = arr.outputs();
  {
    // A tracer holds an extra reference to its inputs while the primitive
    // runs so that they don't get donated
    std::vector<array> held_inputs;
    if (arr.is_tracer()) {
      held_inputs = arr.inputs();
    }
    arr.primitive().eval_cpu(arr.inputs(), outputs);
  }

  // Gather the data buffers of every input and sibling
  std::unordered_set<std::shared_ptr<array::Data>> buffers;
  for (auto& input : arr.inputs()) {
    buffers.insert(input.data_shared_ptr());
  }
  for (auto& sibling : arr.siblings()) {
    buffers.insert(sibling.data_shared_ptr());
  }
  // Drop the output's own buffer if it was donated to by an input
  buffers.erase(arr.data_shared_ptr());

  // The task body is empty: the closure only owns `buffers` and the
  // temporaries moved out of the encoder, so they are released together with
  // the task.
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.dispatch([buffers = std::move(buffers),
                    temps = std::move(encoder.temporaries())]() {});
}

} // namespace mlx::core::cpu


================================================
FILE: mlx/backend/cpu/eval.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/array.h"
#include "mlx/stream.h"

namespace mlx::core::cpu {

// Evaluates `arr` with the CPU backend: runs the eval_cpu of its primitive on
// the array's inputs and outputs. Implemented in eval.cpp.
void eval(array& arr);

} // namespace mlx::core::cpu


================================================
FILE: mlx/backend/cpu/fft.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <numeric>

#include "mlx/3rdparty/pocketfft.h"
#include "mlx/allocator.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Evaluate the FFT on the CPU with pocketfft.
//
// The supported (input, output) dtype pairs are
//   complex64 -> complex64 (pocketfft::c2c),
//   float32   -> complex64 (pocketfft::r2c),
//   complex64 -> float32   (pocketfft::c2r);
// any other combination throws std::runtime_error. The transform itself is
// not executed here: it is dispatched on the command encoder of this
// primitive's stream and only raw data pointers are captured by the task.
void FFT::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];

  // Convert the element strides into byte strides before handing them to
  // pocketfft.
  std::vector<std::ptrdiff_t> strides_in(
      in.strides().begin(), in.strides().end());
  for (auto& s : strides_in) {
    s *= in.itemsize();
  }
  std::vector<std::ptrdiff_t> strides_out(
      out.strides().begin(), out.strides().end());
  for (auto& s : strides_out) {
    s *= out.itemsize();
  }

  out.set_data(allocator::malloc(out.nbytes()));

  // The shape given to pocketfft is the output shape when the output is real
  // (c2r) and the input shape otherwise.
  std::vector<size_t> shape;
  if (out.dtype() == float32) {
    shape.insert(shape.end(), out.shape().begin(), out.shape().end());
  } else {
    shape.insert(shape.end(), in.shape().begin(), in.shape().end());
  }

  // Inverse transforms are scaled by one over the number of elements along
  // the transformed axes.
  float scale = 1.0f;
  if (inverse_) {
    // Seed the accumulation with a size_t so that the product is carried in
    // size_t; an int seed would narrow the running product at every step.
    size_t nelem = std::accumulate(
        axes_.begin(), axes_.end(), size_t{1}, [&shape](auto x, auto y) {
          return x * shape[y];
        });
    scale /= nelem;
  }

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(in);
  encoder.set_output_array(out);

  if (in.dtype() == complex64 && out.dtype() == complex64) {
    auto in_ptr =
        reinterpret_cast<const std::complex<float>*>(in.data<complex64_t>());
    auto out_ptr =
        reinterpret_cast<std::complex<float>*>(out.data<complex64_t>());
    encoder.dispatch([shape = std::move(shape),
                      strides_in = std::move(strides_in),
                      strides_out = std::move(strides_out),
                      axes = axes_,
                      inverse = inverse_,
                      in_ptr,
                      out_ptr,
                      scale]() {
      // pocketfft takes a "forward" flag, hence the negation of `inverse`
      pocketfft::c2c(
          shape,
          strides_in,
          strides_out,
          axes,
          !inverse,
          in_ptr,
          out_ptr,
          scale);
    });
  } else if (in.dtype() == float32 && out.dtype() == complex64) {
    auto in_ptr = in.data<float>();
    auto out_ptr =
        reinterpret_cast<std::complex<float>*>(out.data<complex64_t>());
    encoder.dispatch([shape = std::move(shape),
                      strides_in = std::move(strides_in),
                      strides_out = std::move(strides_out),
                      axes = axes_,
                      inverse = inverse_,
                      in_ptr,
                      out_ptr,
                      scale]() {
      pocketfft::r2c(
          shape,
          strides_in,
          strides_out,
          axes,
          !inverse,
          in_ptr,
          out_ptr,
          scale);
    });
  } else if (in.dtype() == complex64 && out.dtype() == float32) {
    auto in_ptr =
        reinterpret_cast<const std::complex<float>*>(in.data<complex64_t>());
    auto out_ptr = out.data<float>();
    encoder.dispatch([shape = std::move(shape),
                      strides_in = std::move(strides_in),
                      strides_out = std::move(strides_out),
                      axes = axes_,
                      inverse = inverse_,
                      in_ptr,
                      out_ptr,
                      scale]() {
      pocketfft::c2r(
          shape,
          strides_in,
          strides_out,
          axes,
          !inverse,
          in_ptr,
          out_ptr,
          scale);
    });
  } else {
    throw std::runtime_error(
        "[FFT] Received unexpected input and output type combination.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/gemm.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once
#include "mlx/array.h"

namespace mlx::core {

// Batched matrix multiplication entry point of the CPU backend. Only the
// declaration lives here; explicit specializations per element type are
// provided by the files under gemms/.
//
// As used by those specializations:
// - a, b, out: base pointers of the operands and of the result. Batch `i` of
//   the result starts at out + M * N * i, while the operand offsets are
//   computed with elem_to_loc from the shapes and strides below.
// - a_transposed, b_transposed: whether the corresponding operand is
//   transposed.
// - lda, ldb, ldc: leading dimensions. They are forwarded to CBLAS / BNNS;
//   the simd_gemm based specializations do not read them.
// - alpha, beta: scaling factors of the product and of the existing contents
//   of `out` respectively.
// - batch_size: number of matrix products to compute.
// - a_shape / a_strides, b_shape / b_strides: shapes and strides of the
//   operands. M and K are the last two dimensions of a_shape and N is the
//   last dimension of b_shape.
template <typename T>
void matmul(
    const T* a,
    const T* b,
    T* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides);

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/gemms/bnns.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <Accelerate/Accelerate.h>

#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/gemm.h"
#include "mlx/dtype.h"

namespace mlx::core {

// Maps an element type to the matching BNNSDataType. Only the specializations
// below are defined.
template <typename T>
constexpr BNNSDataType to_bnns_dtype();

// 32-bit float: the float bit combined with a width of 32
template <>
constexpr BNNSDataType to_bnns_dtype<float>() {
  return static_cast<BNNSDataType>(BNNSDataTypeFloatBit | 32);
}

// 16-bit float: the float bit combined with a width of 16
template <>
constexpr BNNSDataType to_bnns_dtype<float16_t>() {
  return static_cast<BNNSDataType>(BNNSDataTypeFloatBit | 16);
}

// bfloat16 has a dedicated enumerator
template <>
constexpr BNNSDataType to_bnns_dtype<bfloat16_t>() {
  return BNNSDataTypeBFloat16;
}

// Batched matrix multiplication through the (deprecated) BNNS broadcast
// matmul filter.
//
// M and K are read from the last two dimensions of `a_shape` and N from the
// last dimension of `b_shape`. A single filter is created and applied once
// per batch: the operand offsets are computed with elem_to_loc while batch
// `i` of the output starts at out + M * N * i.
//
// When `beta` is neither 0 nor 1 the output is scaled by `beta` up front and
// the filter is then run with beta = 1. `ldc` is not read by this
// implementation.
template <typename T>
void matmul_bnns(
    const T* a,
    const T* b,
    T* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  auto ndim = a_shape.size();
  size_t M = a_shape[ndim - 2];
  size_t N = b_shape[ndim - 1];
  size_t K = a_shape[ndim - 1];

  BNNSDataType bnns_dtype = to_bnns_dtype<T>();
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
  if (beta != 1.0 && beta != 0.0) {
    // scale the output
    // The counter is a size_t so that outputs with more than INT_MAX
    // elements are handled without overflowing a signed index.
    const size_t out_size = batch_size * M * N;
    for (size_t i = 0; i < out_size; ++i) {
      out[i] *= beta;
    }
    beta = 1.0;
  }
  const BNNSLayerParametersBroadcastMatMul gemm_params{
      /* float alpha = */ alpha,
      /* float beta = */ beta,
      /* bool transA = */ a_transposed,
      /* bool transB = */ b_transposed,
      /* bool quadratic = */ false,
      /* bool a_is_weights = */ false,
      /* bool b_is_weights = */ false,
      /* BNNSNDArrayDescriptor iA_desc = */
      BNNSNDArrayDescriptor{
          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,

          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
          {lda, (M * K) / lda, 0, 0, 0, 0, 0, 0},
          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
          {1, lda, 0, 0, 0, 0, 0, 0},

          /* void * _Nullable data = */ nullptr,
          /* BNNSDataType data_type = */ bnns_dtype,

          /* void * _Nullable table_data = */ nullptr,
          /* BNNSDataType table_data_type = */ bnns_dtype,

          /* float data_scale = */ 1.0,
          /* float data_bias = */ 0.0,
      },
      /* BNNSNDArrayDescriptor iB_desc = */
      BNNSNDArrayDescriptor{
          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,

          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
          {ldb, (K * N) / ldb, 0, 0, 0, 0, 0, 0},
          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
          {1, ldb, 0, 0, 0, 0, 0, 0},

          /* void * _Nullable data = */ nullptr,
          /* BNNSDataType data_type = */ bnns_dtype,

          /* void * _Nullable table_data = */ nullptr,
          /* BNNSDataType table_data_type = */ bnns_dtype,

          /* float data_scale = */ 1.0,
          /* float data_bias = */ 0.0,
      },
      /* BNNSNDArrayDescriptor o_desc = */
      BNNSNDArrayDescriptor{
          /* BNNSNDArrayFlags flags = */ BNNSNDArrayFlagBackpropSet,
          /* BNNSDataLayout layout = */ BNNSDataLayoutRowMajorMatrix,

          /* size_t size[BNNS_MAX_TENSOR_DIMENSION] = */
          {N, M, 0, 0, 0, 0, 0, 0},
          /* size_t stride[BNNS_MAX_TENSOR_DIMENSION] = */
          {1, N, 0, 0, 0, 0, 0, 0},

          /* void * _Nullable data = */ nullptr,
          /* BNNSDataType data_type = */ bnns_dtype,

          /* void * _Nullable table_data = */ nullptr,
          /* BNNSDataType table_data_type = */ bnns_dtype,

          /* float data_scale = */ 1.0,
          /* float data_bias = */ 0.0,
      },
  };

  auto bnns_filter =
      BNNSFilterCreateLayerBroadcastMatMul(&gemm_params, nullptr);

  // One filter application per batch; the index matches the unsigned type of
  // `batch_size`.
  for (size_t i = 0; i < batch_size; ++i) {
    BNNSFilterApplyTwoInput(
        bnns_filter,
        reinterpret_cast<const uint8_t*>(
            a + elem_to_loc(M * K * i, a_shape, a_strides)),
        reinterpret_cast<const uint8_t*>(
            b + elem_to_loc(K * N * i, b_shape, b_strides)),
        reinterpret_cast<uint8_t*>(out + M * N * i));
  }

  BNNSFilterDestroy(bnns_filter);
#pragma GCC diagnostic pop
}

// float16 matmul: forwards every argument unchanged to the BNNS backed
// implementation.
template <>
void matmul<float16_t>(
    const float16_t* a,
    const float16_t* b,
    float16_t* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  // clang-format off
  matmul_bnns<float16_t>(
      a, b, out,
      a_transposed, b_transposed,
      lda, ldb, ldc,
      alpha, beta,
      batch_size,
      a_shape, a_strides,
      b_shape, b_strides);
  // clang-format on
}

// bfloat16 matmul: forwards every argument unchanged to the BNNS backed
// implementation.
template <>
void matmul<bfloat16_t>(
    const bfloat16_t* a,
    const bfloat16_t* b,
    bfloat16_t* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  // clang-format off
  matmul_bnns<bfloat16_t>(
      a, b, out,
      a_transposed, b_transposed,
      lda, ldb, ldc,
      alpha, beta,
      batch_size,
      a_shape, a_strides,
      b_shape, b_strides);
  // clang-format on
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/gemms/cblas.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/gemm.h"
#include "mlx/backend/cpu/lapack.h"

namespace mlx::core {

// float32 matmul: one row-major cblas_sgemm call per batch. M and K come from
// the last two dimensions of `a_shape`, N from the last dimension of
// `b_shape`.
template <>
void matmul<float>(
    const float* a,
    const float* b,
    float* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  const auto rank = a_shape.size();
  const size_t M = a_shape[rank - 2];
  const size_t N = b_shape[rank - 1];
  const size_t K = a_shape[rank - 1];

  const auto trans_a = a_transposed ? CblasTrans : CblasNoTrans;
  const auto trans_b = b_transposed ? CblasTrans : CblasNoTrans;

  for (int batch = 0; batch < batch_size; ++batch) {
    // Operand offsets honor the batch strides, the output is packed
    const float* a_mat = a + elem_to_loc(M * K * batch, a_shape, a_strides);
    const float* b_mat = b + elem_to_loc(K * N * batch, b_shape, b_strides);
    float* c_mat = out + M * N * batch;
    cblas_sgemm(
        CblasRowMajor,
        trans_a,
        trans_b,
        M,
        N,
        K,
        alpha,
        a_mat,
        lda,
        b_mat,
        ldb,
        beta,
        c_mat,
        ldc);
  }
}

// float64 matmul: one row-major cblas_dgemm call per batch. `alpha` and
// `beta` arrive as float and are passed on to the double precision routine.
template <>
void matmul<double>(
    const double* a,
    const double* b,
    double* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  const auto rank = a_shape.size();
  const size_t M = a_shape[rank - 2];
  const size_t N = b_shape[rank - 1];
  const size_t K = a_shape[rank - 1];

  const auto trans_a = a_transposed ? CblasTrans : CblasNoTrans;
  const auto trans_b = b_transposed ? CblasTrans : CblasNoTrans;

  for (int batch = 0; batch < batch_size; ++batch) {
    // Operand offsets honor the batch strides, the output is packed
    const double* a_mat = a + elem_to_loc(M * K * batch, a_shape, a_strides);
    const double* b_mat = b + elem_to_loc(K * N * batch, b_shape, b_strides);
    double* c_mat = out + M * N * batch;
    cblas_dgemm(
        CblasRowMajor,
        trans_a,
        trans_b,
        M,
        N,
        K,
        alpha,
        a_mat,
        lda,
        b_mat,
        ldb,
        beta,
        c_mat,
        ldc);
  }
}

// complex64 matmul: one row-major cblas_cgemm call per batch. The real
// `alpha` and `beta` are converted to complex64_t because cblas_cgemm takes
// its scalars by pointer.
template <>
void matmul<complex64_t>(
    const complex64_t* a,
    const complex64_t* b,
    complex64_t* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  const auto rank = a_shape.size();
  const size_t M = a_shape[rank - 2];
  const size_t N = b_shape[rank - 1];
  const size_t K = a_shape[rank - 1];

  auto alpha_c = static_cast<complex64_t>(alpha);
  auto beta_c = static_cast<complex64_t>(beta);
  const auto trans_a = a_transposed ? CblasTrans : CblasNoTrans;
  const auto trans_b = b_transposed ? CblasTrans : CblasNoTrans;

  for (int batch = 0; batch < batch_size; ++batch) {
    // Operand offsets honor the batch strides, the output is packed
    const complex64_t* a_mat =
        a + elem_to_loc(M * K * batch, a_shape, a_strides);
    const complex64_t* b_mat =
        b + elem_to_loc(K * N * batch, b_shape, b_strides);
    complex64_t* c_mat = out + M * N * batch;
    cblas_cgemm(
        CblasRowMajor,
        trans_a,
        trans_b,
        M,
        N,
        K,
        &alpha_c,
        a_mat,
        lda,
        b_mat,
        ldb,
        &beta_c,
        c_mat,
        ldc);
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/gemms/simd_bf16.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/gemm.h"
#include "mlx/backend/cpu/gemms/simd_gemm.h"

namespace mlx::core {

// bfloat16 matmul on top of simd_gemm with float accumulation, one call per
// batch. NOTE: lda, ldb and ldc are not read here; simd_gemm indexes its
// operands from M, N and K only.
template <>
void matmul<bfloat16_t>(
    const bfloat16_t* a,
    const bfloat16_t* b,
    bfloat16_t* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  const auto rank = a_shape.size();
  const size_t M = a_shape[rank - 2];
  const size_t N = b_shape[rank - 1];
  const size_t K = a_shape[rank - 1];

  for (int batch = 0; batch < batch_size; ++batch) {
    // Operand offsets honor the batch strides, the output is packed
    const bfloat16_t* a_mat =
        a + elem_to_loc(M * K * batch, a_shape, a_strides);
    const bfloat16_t* b_mat =
        b + elem_to_loc(K * N * batch, b_shape, b_strides);
    bfloat16_t* c_mat = out + M * N * batch;
    simd_gemm<bfloat16_t, float>(
        a_mat, b_mat, c_mat, a_transposed, b_transposed, M, N, K, alpha, beta);
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/gemms/simd_fp16.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/gemm.h"
#include "mlx/backend/cpu/gemms/simd_gemm.h"

namespace mlx::core {

// float16 matmul on top of simd_gemm with float accumulation, one call per
// batch. NOTE: lda, ldb and ldc are not read here; simd_gemm indexes its
// operands from M, N and K only.
template <>
void matmul<float16_t>(
    const float16_t* a,
    const float16_t* b,
    float16_t* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    size_t ldc,
    float alpha,
    float beta,
    size_t batch_size,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides) {
  const auto rank = a_shape.size();
  const size_t M = a_shape[rank - 2];
  const size_t N = b_shape[rank - 1];
  const size_t K = a_shape[rank - 1];

  for (int batch = 0; batch < batch_size; ++batch) {
    // Operand offsets honor the batch strides, the output is packed
    const float16_t* a_mat =
        a + elem_to_loc(M * K * batch, a_shape, a_strides);
    const float16_t* b_mat =
        b + elem_to_loc(K * N * batch, b_shape, b_strides);
    float16_t* c_mat = out + M * N * batch;
    simd_gemm<float16_t, float>(
        a_mat, b_mat, c_mat, a_transposed, b_transposed, M, N, K, alpha, beta);
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/gemms/simd_gemm.h
================================================
// Copyright © 2025 Apple Inc.
#pragma once

#include "mlx/backend/cpu/simd/simd.h"

namespace mlx::core {

// Quotient of `a` by `b` rounded up, computed as (a + b - 1) / b.
inline int ceildiv(int a, int b) {
  const int padded = a + b - 1;
  return padded / b;
}

// Copy block (i, j) of the row-major M x N matrix `in` into the
// block_size x block_size buffer `out`, converting the elements to AccT.
// With `transpose` set the block is written transposed. Positions that fall
// outside of the matrix are skipped, so the matching entries of `out` keep
// whatever they held before the call.
template <int block_size, typename T, typename AccT>
void load_block(
    const T* in,
    AccT* out,
    int M,
    int N,
    int i,
    int j,
    bool transpose) {
  const int row_start = i * block_size;
  const int col_start = j * block_size;
  for (int r = 0; r < block_size && row_start + r < M; ++r) {
    for (int c = 0; c < block_size && col_start + c < N; ++c) {
      const int src = (row_start + r) * N + col_start + c;
      const int dst = transpose ? c * block_size + r : r * block_size + c;
      out[dst] = in[src];
    }
  }
}

// Blocked matrix multiplication: c = alpha * op(a) * op(b) + beta * c, where
// op(x) is x or its transpose depending on `a_trans` / `b_trans`.
//
// - `a` is indexed as a row-major M x K matrix (K x M when `a_trans`), `b` as
//   a row-major K x N matrix (N x K when `b_trans`) and `c` as a row-major
//   M x N matrix; no separate leading dimensions are taken.
// - Products are accumulated in AccT, in tiles of block_size x block_size,
//   and converted back to T when stored.
// - When `beta` is zero the previous contents of `c` are not read.
template <typename T, typename AccT>
void simd_gemm(
    const T* a,
    const T* b,
    T* c,
    bool a_trans,
    bool b_trans,
    int M,
    int N,
    int K,
    float alpha,
    float beta) {
  constexpr int block_size = 16;
  constexpr int simd_size = simd::max_size<AccT>;
  static_assert(
      (block_size % simd_size) == 0,
      "Block size must be divisible by SIMD size");

  // Size of the trailing partial block along K (0 when K is a multiple of
  // block_size) and the part of it that can be processed with full SIMD
  // vectors
  int last_k_block_size = K - block_size * (K / block_size);
  int last_k_simd_block = (last_k_block_size / simd_size) * simd_size;
  for (int i = 0; i < ceildiv(M, block_size); i++) {
    for (int j = 0; j < ceildiv(N, block_size); j++) {
      // Accumulator tile for c(i, j) plus scratch tiles for a and b. The
      // scratch tiles are not zeroed; only entries written by load_block for
      // the current k block are read below.
      AccT c_block[block_size * block_size] = {0.0};
      AccT a_block[block_size * block_size];
      AccT b_block[block_size * block_size];

      // Full blocks along K
      int k = 0;
      for (; k < K / block_size; k++) {
        // Load a and b blocks. The a tile ends up indexed [row][k] and the b
        // tile [column][k], so both are read contiguously along k.
        if (a_trans) {
          load_block<block_size>(a, a_block, K, M, k, i, true);
        } else {
          load_block<block_size>(a, a_block, M, K, i, k, false);
        }
        if (b_trans) {
          load_block<block_size>(b, b_block, N, K, j, k, false);
        } else {
          load_block<block_size>(b, b_block, K, N, k, j, true);
        }

        // Multiply and accumulate
        for (int ii = 0; ii < block_size && i * block_size + ii < M; ++ii) {
          for (int jj = 0; jj < block_size && j * block_size + jj < N; ++jj) {
            for (int kk = 0; kk < block_size; kk += simd_size) {
              auto av =
                  simd::load<AccT, simd_size>(a_block + ii * block_size + kk);
              auto bv =
                  simd::load<AccT, simd_size>(b_block + jj * block_size + kk);
              c_block[ii * block_size + jj] += simd::sum(av * bv);
            }
          }
        }
      }
      // Trailing partial block along K; `k` now equals K / block_size
      if (last_k_block_size) {
        // Load a and b blocks
        if (a_trans) {
          load_block<block_size>(a, a_block, K, M, k, i, true);
        } else {
          load_block<block_size>(a, a_block, M, K, i, k, false);
        }
        if (b_trans) {
          load_block<block_size>(b, b_block, N, K, j, k, false);
        } else {
          load_block<block_size>(b, b_block, K, N, k, j, true);
        }

        // Multiply and accumulate
        for (int ii = 0; ii < block_size && i * block_size + ii < M; ++ii) {
          for (int jj = 0; jj < block_size && j * block_size + jj < N; ++jj) {
            // Full SIMD vectors first, then a scalar loop for the remainder
            int kk = 0;
            for (; kk < last_k_simd_block; kk += simd_size) {
              auto av =
                  simd::load<AccT, simd_size>(a_block + ii * block_size + kk);
              auto bv =
                  simd::load<AccT, simd_size>(b_block + jj * block_size + kk);
              c_block[ii * block_size + jj] += simd::sum(av * bv);
            }
            for (; kk < last_k_block_size; ++kk) {
              c_block[ii * block_size + jj] +=
                  a_block[ii * block_size + kk] * b_block[jj * block_size + kk];
            }
          }
        }
      }

      // Store
      for (int ii = 0; ii < block_size && i * block_size + ii < M; ++ii) {
        for (int jj = 0; jj < block_size && j * block_size + jj < N; ++jj) {
          auto c_idx = (i * block_size + ii) * N + j * block_size + jj;
          // Only read the old value of c when it contributes to the result
          if (beta != 0) {
            c[c_idx] = static_cast<T>(
                alpha * c_block[ii * block_size + jj] + beta * c[c_idx]);
          } else {
            c[c_idx] = static_cast<T>(alpha * c_block[ii * block_size + jj]);
          }
        }
      }
    }
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/hadamard.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <cassert>

#include "mlx/backend/common/hadamard.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/primitives.h"

namespace mlx::core {

// n = 2^k component
// Runs an in-place butterfly pass over every consecutive run of `n` elements
// of `out` (`size` elements in total). `scale` is applied during the last
// butterfly stage, hence nothing is scaled when n == 1. `m` is not used.
template <typename T>
void hadamard_n(T* out, int n, int m, float scale, size_t size) {
  const int half = n / 2;
  for (int batch = 0; batch < size / n; batch++) {
    size_t offset = batch * n;
    T* row = out + offset;
    for (int stride = 1; stride < n; stride <<= 1) {
      const bool last_stage = (stride == half);
      for (int pair = 0; pair < n / 2; pair++) {
        // Position of the pair inside its group of `stride` butterflies
        const int within = pair & (stride - 1);
        const int lo = ((pair - within) << 1) + within;
        const int hi = lo + stride;
        float x = row[lo];
        float y = row[hi];
        row[lo] = x + y;
        row[hi] = x - y;
        if (last_stage) {
          row[lo] *= scale;
          row[hi] *= scale;
        }
      }
    }
  }
}

// m component
// Applies the m x m sign matrix registered under key `m` in
// hadamard_matrices() to every group of n * m elements of `out`: within a
// group, the m values that are `n` elements apart are combined with +/- signs
// and the result, multiplied by `scale`, is written back in place.
template <typename T>
void hadamard_m(T* out, int n, int m, float scale, size_t size) {
  auto h_matrices = hadamard_matrices();
  auto& matrix = h_matrices[m];

  // Flatten the textual matrix into booleans, one per character, where true
  // stands for '+'. Rows are '\n' terminated and parsing starts at offset 1
  // (presumably to skip a leading newline -- TODO confirm against the
  // definition of the matrices).
  std::vector<bool> signs;
  auto line_start = 1;
  for (auto line_end = matrix.find('\n', line_start);
       line_end != std::string_view::npos;
       line_end = matrix.find('\n', line_start)) {
    auto row = matrix.substr(line_start, line_end - line_start);
    for (char c : row) {
      signs.push_back(c == '+');
    }
    line_start = line_end + 1;
  }

  for (int batch = 0; batch < size / m / n; batch++) {
    size_t offset = batch * n * m;
    T* base = out + offset;
    for (int col = 0; col < n; col++) {
      // Accumulate in float before writing back to the T buffer
      std::vector<float> acc(m);
      for (int r = 0; r < m; r++) {
        for (int k = 0; k < m; k++) {
          float x = base[col + k * n];
          acc[r] = signs[k + r * m] ? acc[r] + x : acc[r] - x;
        }
      }
      for (int r = 0; r < m; r++) {
        base[col + r * n] = acc[r] * scale;
      }
    }
  }
}

// Dispatch the in-place transform of `out` on the encoder of `stream`:
// hadamard_n always runs and hadamard_m follows when m > 1. In that case the
// scale is applied by hadamard_m only, and hadamard_n runs with a scale of 1.
template <typename T>
void hadamard(array& out, int n, int m, float scale, Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(out);
  T* data = out.data<T>();
  encoder.dispatch([data, total = out.size(), n, m, scale]() {
    const bool has_m_part = m > 1;
    hadamard_n<T>(data, n, m, has_m_part ? 1.0f : scale, total);
    if (has_m_part) {
      hadamard_m<T>(data, n, m, scale, total);
    }
  });
}

// CPU evaluation of the Hadamard primitive. The input is first brought into
// `out` (sharing its buffer when it is row contiguous and donatable, copying
// otherwise), then the transform is run in place over the last axis.
// Supported dtypes: float32, float16 and bfloat16; anything else throws
// std::invalid_argument.
void Hadamard::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  // Copy input to output
  const bool row_contiguous = in.flags().row_contiguous;
  if (row_contiguous && in.is_donatable()) {
    out.copy_shared_buffer(in);
  } else {
    const auto copy_type =
        row_contiguous ? CopyType::Vector : CopyType::General;
    copy_cpu(in, out, copy_type, stream());
  }

  // Split the size of the last axis into its n and m components
  int last_axis = out.ndim() - 1;
  auto [n, m] = decompose_hadamard(out.shape(last_axis));

  switch (in.dtype()) {
    case float32:
      hadamard<float>(out, n, m, scale_, stream());
      break;
    case float16:
      hadamard<float16_t>(out, n, m, scale_, stream());
      break;
    case bfloat16:
      hadamard<bfloat16_t>(out, n, m, scale_, stream());
      break;
    default:
      throw std::invalid_argument("[hadamard] Unsupported type.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/indexing.cpp
================================================
// Copyright © 2023 Apple Inc.
#include <algorithm>
#include <cassert>
#include <cmath>

#include "mlx/allocator.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/binary.h"
#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/slicing.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Map a possibly negative index onto [0, size): negative values are shifted
// by `size`, everything else is returned as is.
template <typename IdxT>
inline size_t offset_neg_idx(IdxT idx, size_t size) {
  if (idx < 0) {
    return idx + size;
  }
  return idx;
}

// uint32_t indices can never be negative, so they are passed through.
template <>
inline size_t offset_neg_idx(uint32_t idx, size_t) {
  return idx;
}

// Update functor that overwrites the destination `*y` with `x`.
struct None {
  template <typename T>
  void operator()(T x, T* y) {
    T& dst = *y;
    dst = x;
  }
};
// Update functor that adds `x` to the destination `*y`.
struct Sum {
  template <typename T>
  void operator()(T x, T* y) {
    T& acc = *y;
    acc += x;
  }
};

// Update functor that multiplies the destination `*y` by `x`.
struct Prod {
  template <typename T>
  void operator()(T x, T* y) {
    T& acc = *y;
    acc *= x;
  }
};

// Update functor that keeps the destination `*y` only when it compares
// strictly greater than `x`; otherwise `x` is stored.
struct Max {
  template <typename T>
  void operator()(T x, T* y) {
    if (!(*y > x)) {
      *y = x;
    }
  }
};

// Update functor that keeps the destination `*y` only when it compares
// strictly less than `x`; otherwise `x` is stored.
struct Min {
  template <typename T>
  void operator()(T x, T* y) {
    if (!(*y < x)) {
      *y = x;
    }
  }
};

// Gather slices of `src` into `out`.
//
// For every index position one value is read from each array in `inds`; the
// value from inds[ii] (negative values are wrapped with offset_neg_idx)
// selects a start along axis axes[ii] of `src`. From that start a slice with
// extents `slice_sizes` is written to the next slice_size consecutive
// elements of `out`.
//
// T is the element type of `src` / `out` and IdxT the element type of the
// index arrays.
template <typename T, typename IdxT>
void gather(
    const array& src,
    const std::vector<array>& inds,
    array& out,
    const std::vector<int>& axes,
    const Shape& slice_sizes) {
  // If the array is row contiguous then we can do a contiguous copy given
  // two conditions on the slice size:
  // - Any number of leading ones in the slice sizes are allowed
  // - All other slice sizes match the corresponding dimension except the
  //   first non-singleton slice size
  // If the array is col contiguous then the reverse is the case:
  // - Any number of trailing ones in the slice sizes are allowed
  // - All other slice sizes match the corresponding dimension except the
  //   first non-singleton slice size from the end

  bool can_copy = false;
  if (src.flags().row_contiguous) {
    can_copy = true;

    // Ignore leading 1s
    int i = 0;
    for (; i < slice_sizes.size() && slice_sizes[i] == 1; ++i)
      ;

    // Check the remaining
    i++;
    for (; i < src.ndim() && can_copy; ++i) {
      can_copy = (src.shape(i) == slice_sizes[i]);
    }
  } else if (src.flags().col_contiguous) {
    can_copy = true;

    // Ignore trailing 1s
    int i = slice_sizes.size() - 1;
    for (; i >= 0 && slice_sizes[i] == 1; --i)
      ;

    // Skip the next slice size and check the remaining
    i--;
    for (; i >= 0 && can_copy; --i) {
      can_copy = (src.shape(i) == slice_sizes[i]);
    }
  }
  // Number of elements in one slice
  size_t slice_size = 1;
  for (auto s : slice_sizes) {
    slice_size *= s;
  }
  // Number of slices to gather; guarded against empty slices to avoid a
  // division by zero
  size_t ind_size = slice_size == 0 ? 0 : out.size() / slice_size;
  const T* src_ptr = src.data<T>();
  T* dst_ptr = out.data<T>();

  // One iterator per index array, plus one over a slice of the source that is
  // only set up when the slice cannot be copied contiguously
  std::vector<ContiguousIterator> its(inds.begin(), inds.end());
  ContiguousIterator src_it;
  if (!can_copy && src.ndim() > 0) {
    src_it = ContiguousIterator(slice_sizes, src.strides(), src.ndim());
  }

  size_t out_idx = 0;
  for (int idx = 0; idx < ind_size; idx++) {
    // Offset of the slice start in the source, summed over the indexed axes
    size_t src_idx = 0;
    for (int ii = 0; ii < inds.size(); ++ii) {
      auto ax = axes[ii];
      auto idx_loc = its[ii].loc;
      its[ii].step();
      auto idx_val =
          offset_neg_idx(inds[ii].data<IdxT>()[idx_loc], src.shape(ax));
      src_idx += (idx_val * src.strides()[ax]);
    }

    if (slice_size == 1) {
      // Single element slice
      dst_ptr[out_idx++] = src_ptr[src_idx];
    } else if (can_copy) {
      // The slice is contiguous in the source
      std::copy(
          src_ptr + src_idx, src_ptr + src_idx + slice_size, dst_ptr + out_idx);
      out_idx += slice_size;
    } else {
      // General case: walk the slice element by element and rewind the
      // iterator for the next slice
      for (int jj = 0; jj < slice_size; jj++) {
        dst_ptr[out_idx++] = src_ptr[src_idx + src_it.loc];
        src_it.step();
      }
      src_it.reset();
    }
  }
}

template <typename IdxT>
void dispatch_gather(
    const array& src,
    const std::vector<array>& inds,
    array& out,
    const std::vector<int>& axes,
    const Shape& size) {
  switch (out.dtype()) {
    case bool_:
      gather<bool, IdxT>(src, inds, out, axes, size);
      break;
    case uint8:
      gather<uint8_t, IdxT>(src, inds, out, axes, size);
      break;
    case uint16:
      gather<uint16_t, IdxT>(src, inds, out, axes, size);
      break;
    case uint32:
      gather<uint32_t, IdxT>(src, inds, out, axes, size);
      break;
    case uint64:
      gather<uint64_t, IdxT>(src, inds, out, axes, size);
      break;
    case int8:
      gather<int8_t, IdxT>(src, inds, out, axes, size);
      break;
    case int16:
      gather<int16_t, IdxT>(src, inds, out, axes, size);
      break;
    case int32:
      gather<int32_t, IdxT>(src, inds, out, axes, size);
      break;
    case int64:
      gather<int64_t, IdxT>(src, inds, out, axes, size);
      break;
    case float16:
      gather<float16_t, IdxT>(src, inds, out, axes, size);
      break;
    case float32:
      gather<float, IdxT>(src, inds, out, axes, size);
      break;
    case float64:
      gather<double, IdxT>(src, inds, out, axes, size);
      break;
    case bfloat16:
      gather<bfloat16_t, IdxT>(src, inds, out, axes, size);
      break;
    case complex64:
      gather<complex64_t, IdxT>(src, inds, out, axes, size);
      break;
  }
}

void Gather::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));

  auto& src = inputs[0];
  std::vector<array> inds;
  for (auto it = inputs.begin() + 1; it < inputs.end(); ++it) {
    inds.push_back(array::unsafe_weak_copy(*it));
  }
  auto& encoder = cpu::get_command_encoder(stream());
  for (auto& in : inputs) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
  encoder.dispatch([axes_ = axes_,
                    slice_sizes_ = slice_sizes_,
                    src = array::unsafe_weak_copy(src),
                    inds = std::move(inds),
                    out = array::unsafe_weak_copy(out)]() mutable {
    if (inds.empty()) {
      dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
      return;
    }

    switch (inds[0].dtype()) {
      case uint8:
        dispatch_gather<uint8_t>(src, inds, out, axes_, slice_sizes_);
        break;
      case uint16:
        dispatch_gather<uint16_t>(src, inds, out, axes_, slice_sizes_);
        break;
      case uint32:
        dispatch_gather<uint32_t>(src, inds, out, axes_, slice_sizes_);
        break;
      case uint64:
        dispatch_gather<uint64_t>(src, inds, out, axes_, slice_sizes_);
        break;
      case int8:
        dispatch_gather<int8_t>(src, inds, out, axes_, slice_sizes_);
        break;
      case int16:
        dispatch_gather<int16_t>(src, inds, out, axes_, slice_sizes_);
        break;
      case int32:
        dispatch_gather<int32_t>(src, inds, out, axes_, slice_sizes_);
        break;
      case int64:
        dispatch_gather<int64_t>(src, inds, out, axes_, slice_sizes_);
        break;
      default:
        throw std::runtime_error(
            "[Gather::eval_cpu] Cannot gather with indices type.");
        break;
    }
  });
}
// Gathers elements of `src` along a single `axis` using the indices stored in
// `ind`, writing the selected elements into `out`.
//
// T is the element type of `src`/`out` and IdxT the element type of `ind`.
// NOTE(review): the destination is addressed through a running pointer plus
// `size_post`-based offsets, which looks like it assumes `out` is
// row-contiguous with the shape of `ind` — confirm against the caller.
template <typename T, typename IdxT>
void gather_axis(
    const array& src,
    const array& ind,
    array& out,
    const int axis) {
  // Iterate over every dimension except `axis`, using the shape of `ind`
  // with the strides of `ind` and `src` respectively.
  auto shape = remove_index(ind.shape(), axis);
  ContiguousIterator ind_it(
      shape, remove_index(ind.strides(), axis), src.ndim() - 1);
  ContiguousIterator src_it(
      shape, remove_index(src.strides(), axis), src.ndim() - 1);

  auto ind_ptr = ind.data<IdxT>();
  auto src_ptr = src.data<T>();
  auto dst_ptr = out.data<T>();
  auto ind_ax_stride = ind.strides(axis);
  auto src_ax_stride = src.strides(axis);
  auto dst_ax_stride = out.strides(axis);
  auto ind_ax_size = ind.shape(axis);
  auto src_ax_size = src.shape(axis);

  // Number of index positions before and after `axis`.
  size_t size_pre = 1;
  size_t size_post = 1;
  for (int i = 0; i < axis; ++i) {
    size_pre *= ind.shape(i);
  }
  for (int i = axis + 1; i < ind.ndim(); ++i) {
    size_post *= ind.shape(i);
  }

  // Distance the destination pointer advances per leading ("pre") position.
  size_t stride_pre = size_post * ind_ax_size;
  for (size_t i = 0; i < size_pre; i++) {
    for (size_t k = 0; k < size_post; k++) {
      for (int j = 0; j < ind_ax_size; ++j) {
        // offset_neg_idx is given the axis size; presumably it maps negative
        // indices into [0, src_ax_size) — verify against its definition.
        auto ind_val = offset_neg_idx(
            ind_ptr[ind_it.loc + j * ind_ax_stride], src_ax_size);
        dst_ptr[k + j * dst_ax_stride] =
            src_ptr[src_it.loc + ind_val * src_ax_stride];
      }
      // Move both iterators to the next position outside `axis`.
      ind_it.step();
      src_it.step();
    }
    dst_ptr += stride_pre;
  }
}

// Selects the gather_axis instantiation matching the output dtype; the index
// type IdxT has already been chosen by the caller.
template <typename IdxT>
void dispatch_gather_axis(
    const array& src,
    const array& inds,
    array& out,
    const int axis) {
  switch (out.dtype()) {
    case bool_:
      gather_axis<bool, IdxT>(src, inds, out, axis);
      return;
    case uint8:
      gather_axis<uint8_t, IdxT>(src, inds, out, axis);
      return;
    case uint16:
      gather_axis<uint16_t, IdxT>(src, inds, out, axis);
      return;
    case uint32:
      gather_axis<uint32_t, IdxT>(src, inds, out, axis);
      return;
    case uint64:
      gather_axis<uint64_t, IdxT>(src, inds, out, axis);
      return;
    case int8:
      gather_axis<int8_t, IdxT>(src, inds, out, axis);
      return;
    case int16:
      gather_axis<int16_t, IdxT>(src, inds, out, axis);
      return;
    case int32:
      gather_axis<int32_t, IdxT>(src, inds, out, axis);
      return;
    case int64:
      gather_axis<int64_t, IdxT>(src, inds, out, axis);
      return;
    case float16:
      gather_axis<float16_t, IdxT>(src, inds, out, axis);
      return;
    case float32:
      gather_axis<float, IdxT>(src, inds, out, axis);
      return;
    case float64:
      gather_axis<double, IdxT>(src, inds, out, axis);
      return;
    case bfloat16:
      gather_axis<bfloat16_t, IdxT>(src, inds, out, axis);
      return;
    case complex64:
      gather_axis<complex64_t, IdxT>(src, inds, out, axis);
      return;
  }
}

void GatherAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));

  auto& src = inputs[0];
  auto& inds = inputs[1];
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(src);
  encoder.set_input_array(inds);
  encoder.set_output_array(out);
  encoder.dispatch([axis_ = axis_,
                    src = array::unsafe_weak_copy(src),
                    inds = array::unsafe_weak_copy(inds),
                    out = array::unsafe_weak_copy(out)]() mutable {
    switch (inds.dtype()) {
      case uint8:
        dispatch_gather_axis<uint8_t>(src, inds, out, axis_);
        break;
      case uint16:
        dispatch_gather_axis<uint16_t>(src, inds, out, axis_);
        break;
      case uint32:
        dispatch_gather_axis<uint32_t>(src, inds, out, axis_);
        break;
      case uint64:
        dispatch_gather_axis<uint64_t>(src, inds, out, axis_);
        break;
      case int8:
        dispatch_gather_axis<int8_t>(src, inds, out, axis_);
        break;
      case int16:
        dispatch_gather_axis<int16_t>(src, inds, out, axis_);
        break;
      case int32:
        dispatch_gather_axis<int32_t>(src, inds, out, axis_);
        break;
      case int64:
        dispatch_gather_axis<int64_t>(src, inds, out, axis_);
        break;
      default:
        throw std::runtime_error(
            "[GatherAxis::eval_cpu] Cannot gather with indices type.");
        break;
    }
  });
}

template <typename InT, typename IdxT, typename OpT>
void scatter(
    const array& updates,
    array& out,
    const std::vector<array>& inds,
    const std::vector<int>& axes) {
  int nind = inds.size();
  auto inds_ndim = updates.ndim() - out.ndim();
  size_t n_updates = nind ? inds[0].size() : 1;

  Shape update_shape(
      updates.shape().begin() + inds_ndim, updates.shape().end());
  size_t update_size = 1;
  for (auto us : update_shape) {
    update_size *= us;
  }

  std::vector<ContiguousIterator> its(inds.begin(), inds.end());
  ContiguousIterator update_it(updates);
  ContiguousIterator out_it(update_shape, out.strides(), out.ndim());

  auto out_ptr = out.data<InT>();
  auto upd_ptr = updates.data<InT>();
  for (int i = 0; i < n_updates; ++i) {
    size_t out_offset = 0;
    for (int j = 0; j < inds.size(); ++j) {
      auto ax = axes[j];
      auto idx_loc = its[j].loc;
      its[j].step();
      auto idx_val =
          offset_neg_idx(inds[j].data<IdxT>()[idx_loc], out.shape(ax));
      out_offset += (idx_val * out.strides()[ax]);
    }
    update_it.seek(i * update_size);
    for (int j = 0; j < update_size; ++j) {
      OpT{}(upd_ptr[update_it.loc], out_ptr + out_offset + out_it.loc);
      update_it.step();
      out_it.step();
    }
    out_it.reset();
    update_it.reset();
  }
}

// Maps the runtime reduction type onto the matching scatter functor
// instantiation. Element type InT and index type IdxT are already fixed.
template <typename InT, typename IdxT>
void dispatch_scatter_inds(
    array& out,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    Scatter::ReduceType rtype) {
  switch (rtype) {
    case Scatter::None:
      scatter<InT, IdxT, None>(updates, out, indices, axes);
      return;
    case Scatter::Sum:
      scatter<InT, IdxT, Sum>(updates, out, indices, axes);
      return;
    case Scatter::Prod:
      scatter<InT, IdxT, Prod>(updates, out, indices, axes);
      return;
    case Scatter::Max:
      scatter<InT, IdxT, Max>(updates, out, indices, axes);
      return;
    case Scatter::Min:
      scatter<InT, IdxT, Min>(updates, out, indices, axes);
      return;
  }
}

// Chooses the index type for a scatter from the dtype of the first index
// array, then forwards to dispatch_scatter_inds. Throws std::runtime_error
// for an index dtype that is not an integer type handled below.
template <typename InT>
void dispatch_scatter(
    array& out,
    const std::vector<array>& inds,
    const array& updates,
    const std::vector<int>& axes,
    Scatter::ReduceType rtype) {
  // No index arrays: there is no dtype to inspect, so uint8_t is used.
  if (inds.empty()) {
    dispatch_scatter_inds<InT, uint8_t>(out, inds, updates, axes, rtype);
    return;
  }

  switch (inds[0].dtype()) {
    case uint8:
      dispatch_scatter_inds<InT, uint8_t>(out, inds, updates, axes, rtype);
      return;
    case uint16:
      dispatch_scatter_inds<InT, uint16_t>(out, inds, updates, axes, rtype);
      return;
    case uint32:
      dispatch_scatter_inds<InT, uint32_t>(out, inds, updates, axes, rtype);
      return;
    case uint64:
      dispatch_scatter_inds<InT, uint64_t>(out, inds, updates, axes, rtype);
      return;
    case int8:
      dispatch_scatter_inds<InT, int8_t>(out, inds, updates, axes, rtype);
      return;
    case int16:
      dispatch_scatter_inds<InT, int16_t>(out, inds, updates, axes, rtype);
      return;
    case int32:
      dispatch_scatter_inds<InT, int32_t>(out, inds, updates, axes, rtype);
      return;
    case int64:
      dispatch_scatter_inds<InT, int64_t>(out, inds, updates, axes, rtype);
      return;
    default:
      throw std::runtime_error(
          "[Scatter::eval_cpu] Cannot scatter with indices type.");
  }
}

void Scatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() >= 2);

  auto& src = inputs[0];
  auto& updates = inputs.back();

  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
  copy_cpu(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  std::vector<array> inds;
  for (auto it = inputs.begin() + 1; it < inputs.end() - 1; ++it) {
    encoder.set_input_array(*it);
    inds.push_back(array::unsafe_weak_copy(*it));
  }
  encoder.set_input_array(updates);
  encoder.set_output_array(out);
  encoder.dispatch([axes_ = axes_,
                    reduce_type_ = reduce_type_,
                    updates = array::unsafe_weak_copy(updates),
                    inds = std::move(inds),
                    out = array::unsafe_weak_copy(out)]() mutable {
    switch (out.dtype()) {
      case bool_:
        dispatch_scatter<bool>(out, inds, updates, axes_, reduce_type_);
        break;
      case uint8:
        dispatch_scatter<uint8_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case uint16:
        dispatch_scatter<uint16_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case uint32:
        dispatch_scatter<uint32_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case uint64:
        dispatch_scatter<uint64_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case int8:
        dispatch_scatter<int8_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case int16:
        dispatch_scatter<int16_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case int32:
        dispatch_scatter<int32_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case int64:
        dispatch_scatter<int64_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case float16:
        dispatch_scatter<float16_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case float32:
        dispatch_scatter<float>(out, inds, updates, axes_, reduce_type_);
        break;
      case float64:
        dispatch_scatter<double>(out, inds, updates, axes_, reduce_type_);
        break;
      case bfloat16:
        dispatch_scatter<bfloat16_t>(out, inds, updates, axes_, reduce_type_);
        break;
      case complex64:
        dispatch_scatter<complex64_t>(out, inds, updates, axes_, reduce_type_);
        break;
    }
  });
}

// Applies `upd` to `out` along a single `axis` at the positions given by
// `idx`.
//
// T is the element type of `upd`/`out`, IdxT the element type of `idx`, and
// OpT a functor invoked as OpT{}(update_value, out_ptr) to combine one update
// element with one destination element.
//
// `idx` is taken by const reference; the function only reads it, so a
// by-value copy of the array handle is unnecessary.
// NOTE(review): the destination is addressed through a running pointer plus
// `size_post`-based offsets, which looks like it assumes `out` is
// row-contiguous — confirm against the caller.
template <typename T, typename IdxT, typename OpT>
void scatter_axis(array& out, const array& idx, const array& upd, int axis) {
  // Iterate over every dimension except `axis`, using the shape of `idx`
  // with the strides of `idx` and `upd` respectively.
  auto shape = remove_index(idx.shape(), axis);
  ContiguousIterator idx_it(
      shape, remove_index(idx.strides(), axis), upd.ndim() - 1);
  ContiguousIterator upd_it(
      shape, remove_index(upd.strides(), axis), upd.ndim() - 1);

  auto idx_ptr = idx.data<IdxT>();
  auto upd_ptr = upd.data<T>();
  auto dst_ptr = out.data<T>();
  auto idx_ax_stride = idx.strides(axis);
  auto upd_ax_stride = upd.strides(axis);
  auto dst_ax_stride = out.strides(axis);
  auto idx_ax_size = idx.shape(axis);
  auto dst_ax_size = out.shape(axis);

  // Number of index positions before and after `axis`.
  size_t size_pre = 1;
  size_t size_post = 1;
  for (int i = 0; i < axis; ++i) {
    size_pre *= idx.shape(i);
  }
  for (int i = axis + 1; i < idx.ndim(); ++i) {
    size_post *= idx.shape(i);
  }
  // Distance the destination pointer advances per leading ("pre") position;
  // it uses the destination's axis size, not the index array's.
  size_t stride_pre = size_post * dst_ax_size;
  for (size_t i = 0; i < size_pre; i++) {
    for (size_t k = 0; k < size_post; k++) {
      for (int j = 0; j < idx_ax_size; ++j) {
        auto ind_val = offset_neg_idx(
            idx_ptr[idx_it.loc + j * idx_ax_stride], dst_ax_size);
        OpT{}(
            upd_ptr[upd_it.loc + j * upd_ax_stride],
            dst_ptr + k + ind_val * dst_ax_stride);
      }
      // Move both iterators to the next position outside `axis`.
      idx_it.step();
      upd_it.step();
    }
    dst_ptr += stride_pre;
  }
}

// Maps the runtime reduction type of a ScatterAxis onto the matching
// scatter_axis functor instantiation.
template <typename InT, typename IdxT>
void dispatch_scatter_axis_op(
    array& out,
    const array& idx,
    const array& updates,
    int axis,
    ScatterAxis::ReduceType rtype) {
  switch (rtype) {
    case ScatterAxis::None:
      scatter_axis<InT, IdxT, None>(out, idx, updates, axis);
      return;
    case ScatterAxis::Sum:
      scatter_axis<InT, IdxT, Sum>(out, idx, updates, axis);
      return;
  }
}

// Chooses the index type for a ScatterAxis from the dtype of `idx`, then
// forwards to dispatch_scatter_axis_op. Throws std::runtime_error for an
// index dtype that is not an integer type handled below.
template <typename InT>
void dispatch_scatter_axis(
    array& out,
    const array& idx,
    const array& updates,
    int axis,
    ScatterAxis::ReduceType rtype) {
  switch (idx.dtype()) {
    case uint8:
      dispatch_scatter_axis_op<InT, uint8_t>(out, idx, updates, axis, rtype);
      return;
    case uint16:
      dispatch_scatter_axis_op<InT, uint16_t>(out, idx, updates, axis, rtype);
      return;
    case uint32:
      dispatch_scatter_axis_op<InT, uint32_t>(out, idx, updates, axis, rtype);
      return;
    case uint64:
      dispatch_scatter_axis_op<InT, uint64_t>(out, idx, updates, axis, rtype);
      return;
    case int8:
      dispatch_scatter_axis_op<InT, int8_t>(out, idx, updates, axis, rtype);
      return;
    case int16:
      dispatch_scatter_axis_op<InT, int16_t>(out, idx, updates, axis, rtype);
      return;
    case int32:
      dispatch_scatter_axis_op<InT, int32_t>(out, idx, updates, axis, rtype);
      return;
    case int64:
      dispatch_scatter_axis_op<InT, int64_t>(out, idx, updates, axis, rtype);
      return;
    default:
      throw std::runtime_error(
          "[ScatterAxis::eval_cpu] Cannot scatter with indices type.");
  }
}

// CPU evaluation of ScatterAxis. inputs[0] is the source array, inputs[1] the
// index array and inputs[2] the updates. The source is copied into `out` and
// the updates are then applied in place on this stream's CPU command encoder.
void ScatterAxis::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Three inputs are read below, so require at least three.
  assert(inputs.size() >= 3);

  auto& src = inputs[0];
  auto& idx = inputs[1];
  auto& updates = inputs[2];

  // Copy src into out (copy allocates memory for out)
  auto ctype =
      src.flags().row_contiguous ? CopyType::Vector : CopyType::General;
  copy_cpu(src, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(idx);
  encoder.set_input_array(updates);
  encoder.set_output_array(out);
  encoder.dispatch([axis_ = axis_,
                    reduce_type_ = reduce_type_,
                    idx = array::unsafe_weak_copy(idx),
                    updates = array::unsafe_weak_copy(updates),
                    out = array::unsafe_weak_copy(out)]() mutable {
    // The output dtype selects the element type; the index dtype is resolved
    // inside dispatch_scatter_axis.
    switch (out.dtype()) {
      case bool_:
        dispatch_scatter_axis<bool>(out, idx, updates, axis_, reduce_type_);
        break;
      case uint8:
        dispatch_scatter_axis<uint8_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case uint16:
        dispatch_scatter_axis<uint16_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case uint32:
        dispatch_scatter_axis<uint32_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case uint64:
        dispatch_scatter_axis<uint64_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case int8:
        dispatch_scatter_axis<int8_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case int16:
        dispatch_scatter_axis<int16_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case int32:
        dispatch_scatter_axis<int32_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case int64:
        dispatch_scatter_axis<int64_t>(out, idx, updates, axis_, reduce_type_);
        break;
      case float16:
        dispatch_scatter_axis<float16_t>(
            out, idx, updates, axis_, reduce_type_);
        break;
      case float32:
        dispatch_scatter_axis<float>(out, idx, updates, axis_, reduce_type_);
        break;
      case float64:
        dispatch_scatter_axis<double>(out, idx, updates, axis_, reduce_type_);
        break;
      case bfloat16:
        dispatch_scatter_axis<bfloat16_t>(
            out, idx, updates, axis_, reduce_type_);
        break;
      case complex64:
        dispatch_scatter_axis<complex64_t>(
            out, idx, updates, axis_, reduce_type_);
        break;
    }
  });
}

template <typename T>
void masked_scatter_impl(const array& mask, const array& src, array& out) {
  ContiguousIterator mask_it(mask);
  ContiguousIterator src_it(src);
  ContiguousIterator out_it(out);

  const bool* mask_ptr = mask.data<bool>();
  const T* src_ptr = src.data<T>();
  T* dst_ptr = out.data<T>();

  const size_t batch_count = mask.shape(0);
  const size_t mask_batch_size = mask.size() / batch_count;
  const size_t src_batch_size = src.size() / batch_count;

  for (size_t b = 0; b < batch_count; ++b) {
    size_t src_consumed = 0;
    src_it.seek(b * src_batch_size);

    for (size_t i = 0; i < mask_batch_size; ++i) {
      if (mask_ptr[mask_it.loc]) {
        if (src_consumed >= src_batch_size) {
          throw std::runtime_error(
              "[MaskedScatter::eval_cpu] Source does not have enough elements for mask.");
        }
        dst_ptr[out_it.loc] = src_ptr[src_it.loc];
        src_it.step();
        ++src_consumed;
      }
      mask_it.step();
      out_it.step();
    }
  }
}

void MaskedScatter::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 3);

  auto& dst = inputs[0];
  auto& mask = inputs[1];
  auto& src = inputs[2];

  // Copy dst into out (copy allocates memory for out)
  auto ctype =
      dst.flags().row_contiguous ? CopyType::Vector : CopyType::General;
  copy_cpu(dst, out, ctype, stream());

  if (mask.size() == 0) {
    return;
  }

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(mask);
  encoder.set_input_array(src);
  encoder.set_output_array(out);
  encoder.dispatch([mask = array::unsafe_weak_copy(mask),
                    src = array::unsafe_weak_copy(src),
                    out = array::unsafe_weak_copy(out)]() mutable {
    switch (out.dtype()) {
      case bool_:
        masked_scatter_impl<bool>(mask, src, out);
        break;
      case uint8:
        masked_scatter_impl<uint8_t>(mask, src, out);
        break;
      case uint16:
        masked_scatter_impl<uint16_t>(mask, src, out);
        break;
      case uint32:
        masked_scatter_impl<uint32_t>(mask, src, out);
        break;
      case uint64:
        masked_scatter_impl<uint64_t>(mask, src, out);
        break;
      case int8:
        masked_scatter_impl<int8_t>(mask, src, out);
        break;
      case int16:
        masked_scatter_impl<int16_t>(mask, src, out);
        break;
      case int32:
        masked_scatter_impl<int32_t>(mask, src, out);
        break;
      case int64:
        masked_scatter_impl<int64_t>(mask, src, out);
        break;
      case float16:
        masked_scatter_impl<float16_t>(mask, src, out);
        break;
      case float32:
        masked_scatter_impl<float>(mask, src, out);
        break;
      case float64:
        masked_scatter_impl<double>(mask, src, out);
        break;
      case bfloat16:
        masked_scatter_impl<bfloat16_t>(mask, src, out);
        break;
      case complex64:
        masked_scatter_impl<complex64_t>(mask, src, out);
        break;
    }
  });
}

// Combines `upd` into a slice of `out` in place, computing
// out[...] = op(out[...], upd[...]) for every element of the update.
//
// `data_offset` is the element offset of the slice's first element within
// `out`, and `out_strides` are the strides used to walk `out` over the shape
// of `upd`. Three paths are taken: a scalar update (upd.data_size() == 1),
// matching long contiguous runs in both arrays, and a general
// element-by-element fallback.
template <typename T, typename Op>
void slice_update_impl(
    array& out,
    const array& upd,
    int64_t data_offset,
    const Strides& out_strides) {
  ContiguousIterator out_it(upd.shape(), out_strides, upd.ndim());
  ContiguousIterator upd_it(upd);
  Op op;

  // Minimum contiguous run length for which the vectorized helpers are used.
  constexpr int SIMD_START = 32;

  T* out_ptr = out.data<T>() + data_offset;
  const T* upd_ptr = upd.data<T>();
  int64_t size = upd.size();
  int64_t suffix = out_it.contiguous_suffix();

  if (upd.data_size() == 1) {
    // The update holds a single stored value applied to every element.
    if (suffix >= SIMD_START) {
      // Process one contiguous run of the output per iteration.
      for (int64_t i = 0; i < size; i += suffix) {
        VectorScalar<Op>{}(
            out_ptr + out_it.loc, upd_ptr, out_ptr + out_it.loc, suffix);
        out_it.step(suffix);
      }
    } else {
      T update = upd_ptr[0];
      for (int64_t i = 0; i < size; i++) {
        out_ptr[out_it.loc] = op(out_ptr[out_it.loc], update);
        out_it.step();
      }
    }
  } else if (suffix == upd_it.contiguous_suffix() && suffix >= SIMD_START) {
    // Both arrays share the same contiguous run length, so whole runs can be
    // combined at once.
    for (int64_t i = 0; i < size; i += suffix) {
      VectorVector<Op>{}(
          out_ptr + out_it.loc,
          upd_ptr + upd_it.loc,
          out_ptr + out_it.loc,
          suffix);
      out_it.step(suffix);
      upd_it.step(suffix);
    }
  } else {
    // General fallback: one element at a time.
    for (int64_t i = 0; i < size; i++) {
      out_ptr[out_it.loc] = op(out_ptr[out_it.loc], upd_ptr[upd_it.loc]);
      out_it.step();
      upd_it.step();
    }
  }
}

void SliceUpdate::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  auto& in = inputs[0];
  auto& upd = inputs[1];

  if (upd.size() == 0) {
    out.copy_shared_buffer(in);
    return;
  }

  // Check if materialization is needed
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  // Calculate out strides, initial offset and if copy needs to be made
  auto [data_offset, out_strides] =
      prepare_slice(out, start_indices_, strides_);

  // Do copy
  if (reduce_type_ == SliceUpdate::None) {
    copy_cpu_inplace(
        /* const array& src = */ upd,
        /* array& dst = */ out,
        /* const std::vector<int>& data_shape = */ upd.shape(),
        /* const std::vector<stride_t>& i_strides = */ upd.strides(),
        /* const std::vector<stride_t>& o_strides = */ out_strides,
        /* int64_t i_offset = */ 0,
        /* int64_t o_offset = */ data_offset,
        /* CopyType ctype = */ CopyType::GeneralGeneral,
        stream());
    return;
  }

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(upd);
  encoder.set_output_array(out);
  encoder.dispatch([upd = array::unsafe_weak_copy(upd),
                    out = array::unsafe_weak_copy(out),
                    data_offset = data_offset,
                    out_strides = std::move(out_strides),
                    reduce_type = reduce_type_]() mutable {
    dispatch_all_types(out.dtype(), [&](auto type_tag) {
      using T = MLX_GET_TYPE(type_tag);
      switch (reduce_type) {
        case SliceUpdate::Sum:
          slice_update_impl<T, detail::Add>(out, upd, data_offset, out_strides);
          break;
        case SliceUpdate::Prod:
          slice_update_impl<T, detail::Multiply>(
              out, upd, data_offset, out_strides);
          break;
        case SliceUpdate::Max:
          slice_update_impl<T, detail::Maximum>(
              out, upd, data_offset, out_strides);
          break;
        case SliceUpdate::Min:
          slice_update_impl<T, detail::Minimum>(
              out, upd, data_offset, out_strides);
          break;
        case SliceUpdate::None:
          // Should never be here
          break;
      }
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/inverse.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Inverts one N x N matrix in place using LAPACK: an LU factorization
// (getrf), a workspace-size query (getri with lwork = -1), and the inversion
// itself (getri). Throws std::runtime_error carrying the LAPACK status code
// if any step reports a nonzero `info`.
template <typename T>
void general_inv(T* inv, int N) {
  int info;

  // Raises an error made of `prefix` followed by the current status code
  // when the preceding LAPACK call did not succeed.
  auto throw_if_failed = [&info](const char* prefix) {
    if (info == 0) {
      return;
    }
    std::stringstream ss;
    ss << prefix << info;
    throw std::runtime_error(ss.str());
  };

  // Pivot indices produced by the factorization and consumed by getri.
  auto ipiv = array::Data{allocator::malloc(sizeof(int) * N)};
  int* pivots = static_cast<int*>(ipiv.buffer.raw_ptr());

  // Step 1: LU factorization.
  getrf<T>(
      /* m = */ &N,
      /* n = */ &N,
      /* a = */ inv,
      /* lda = */ &N,
      /* ipiv = */ pivots,
      /* info = */ &info);
  throw_if_failed(
      "[Inverse::eval_cpu] LU factorization failed with error code ");

  // Step 2: ask getri how much workspace it wants.
  static const int lwork_query = -1;
  T workspace_size = 0;
  getri<T>(
      /* m = */ &N,
      /* a = */ nullptr,
      /* lda = */ &N,
      /* ipiv = */ nullptr,
      /* work = */ &workspace_size,
      /* lwork = */ &lwork_query,
      /* info = */ &info);
  throw_if_failed(
      "[Inverse::eval_cpu] LU workspace calculation failed with error code ");

  // The size comes back in a T and is truncated to int.
  const int lwork = workspace_size;
  auto scratch = array::Data{allocator::malloc(sizeof(T) * lwork)};
  T* work = static_cast<T*>(scratch.buffer.raw_ptr());

  // Step 3: compute the inverse from the LU factors.
  getri<T>(
      /* m = */ &N,
      /* a = */ inv,
      /* lda = */ &N,
      /* ipiv = */ pivots,
      /* work = */ work,
      /* lwork = */ &lwork,
      /* info = */ &info);
  throw_if_failed("[Inverse::eval_cpu] inversion failed with error code ");
}

// Inverts one N x N triangular matrix in place with LAPACK trtri, then zeroes
// the opposite triangle. Throws std::runtime_error carrying the LAPACK status
// code if trtri reports a nonzero `info`.
template <typename T>
void tri_inv(T* inv, int N, bool upper) {
  // 'L' is passed when `upper` is true and 'U' otherwise.
  // NOTE(review): presumably because the row-major buffer is read as
  // column-major by LAPACK, which swaps the triangles — confirm.
  const char uplo = upper ? 'L' : 'U';
  const char diag = 'N';
  int info;
  trtri<T>(
      /* uplo = */ &uplo,
      /* diag = */ &diag,
      /* N = */ &N,
      /* a = */ inv,
      /* lda = */ &N,
      /* info = */ &info);

  // Clear the other triangle row by row: entries left of the diagonal for an
  // upper matrix, entries right of it for a lower one.
  T* row = inv;
  for (int r = 0; r < N; ++r, row += N) {
    if (upper) {
      std::fill(row, row + r, 0.0f);
    } else {
      std::fill(row + r + 1, row + N, 0.0f);
    }
  }

  // The status is checked only after the zeroing pass.
  if (info == 0) {
    return;
  }
  std::stringstream ss;
  ss << "[Inverse::eval_cpu] triangular inversion failed with error code "
     << info;
  throw std::runtime_error(ss.str());
}

// Computes the inverse of every N x N matrix stored in `a`, writing the
// results to `inv`. When `tri` is set the matrices are treated as triangular
// (`upper` selecting which triangle); otherwise a general inverse is used.
// The per-matrix work is queued on the CPU command encoder for `stream`.
template <typename T>
void inverse_impl(
    const array& a,
    array& inv,
    bool tri,
    bool upper,
    Stream stream) {
  // Lapack uses the column-major convention. We take advantage of the following
  // identity to avoid transposing (see
  // https://math.stackexchange.com/a/340234):
  //   (A⁻¹)ᵀ = (Aᵀ)⁻¹

  // The inverse is computed in place, so just copy the input to the output.
  copy_cpu(
      a,
      inv,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
      stream);

  const int N = a.shape(-1);
  // Element count per matrix, computed in size_t so that neither it nor the
  // per-matrix pointer offset overflows int for large batches.
  const size_t matrix_size = static_cast<size_t>(N) * N;
  // A zero-sized matrix would otherwise divide by zero; there is nothing to
  // invert in that case.
  const size_t num_matrices = matrix_size == 0 ? 0 : a.size() / matrix_size;

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(inv);

  auto inv_ptr = inv.data<T>();
  if (tri) {
    encoder.dispatch([inv_ptr, N, matrix_size, num_matrices, upper]() {
      for (size_t i = 0; i < num_matrices; i++) {
        tri_inv<T>(inv_ptr + matrix_size * i, N, upper);
      }
    });
  } else {
    encoder.dispatch([inv_ptr, N, matrix_size, num_matrices]() {
      for (size_t i = 0; i < num_matrices; i++) {
        general_inv<T>(inv_ptr + matrix_size * i, N);
      }
    });
  }
}

// CPU evaluation of Inverse. Dispatches on the dtype of the single input;
// only float32 and float64 are handled, anything else throws
// std::runtime_error.
void Inverse::eval_cpu(const std::vector<array>& inputs, array& output) {
  const auto& matrix = inputs[0];
  switch (matrix.dtype()) {
    case float32:
      inverse_impl<float>(matrix, output, tri_, upper_, stream());
      return;
    case float64:
      inverse_impl<double>(matrix, output, tri_, upper_, stream());
      return;
    default:
      throw std::runtime_error(
          "[Inverse::eval_cpu] only supports float32 or float64.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/jit_compiler.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/cpu/jit_compiler.h"

#include <algorithm>
#include <sstream>
#include <vector>

#include <fmt/format.h>

namespace mlx::core {

#ifdef _MSC_VER

namespace {

// Splits `str` at every occurrence of `delimiter` and returns the pieces in
// order. Empty pieces between adjacent delimiters are kept, but a trailing
// delimiter does not produce a final empty piece, and an empty input yields
// an empty vector.
std::vector<std::string> str_split(const std::string& str, char delimiter) {
  std::vector<std::string> pieces;
  std::string::size_type start = 0;
  while (start < str.size()) {
    const std::string::size_type end = str.find(delimiter, start);
    if (end == std::string::npos) {
      // Last piece: everything up to the end of the string.
      pieces.push_back(str.substr(start));
      break;
    }
    pieces.push_back(str.substr(start, end - start));
    start = end + 1;
  }
  return pieces;
}

// Get path information about MSVC.
//
// On construction this locates the latest Visual Studio installation through
// vswhere.exe, runs vcvarsall.bat for the target architecture, and reads the
// resulting environment to fill in:
//   - arch:     "arm64" or "x64", chosen at compile time;
//   - cl_exe:   path to cl.exe derived from VCToolsInstallDir;
//   - libpaths: the entries of the LIB variable.
// Throws std::runtime_error when Visual Studio cannot be located.
struct VisualStudioInfo {
  VisualStudioInfo() {
#ifdef _M_ARM64
    arch = "arm64";
#else
    arch = "x64";
#endif
    // vswhere.exe lives under the 32-bit Program Files directory. getenv
    // returns null when the variable is unset, so check it before formatting
    // to report a clear error.
    const char* program_files = std::getenv("ProgramFiles(x86)");
    if (program_files == nullptr) {
      throw std::runtime_error(
          "Can not find Visual Studio: environment variable "
          "ProgramFiles(x86) is not set.");
    }
    // Get path of Visual Studio.
    // Use -latest to get only the most recent installation when multiple
    // versions are installed, avoiding path concatenation issues.
    std::string vs_path = JitCompiler::exec(
        fmt::format(
            "\"{0}\\Microsoft Visual Studio\\Installer\\vswhere.exe\""
            " -latest -property installationPath",
            program_files));
    if (vs_path.empty()) {
      throw std::runtime_error("Can not find Visual Studio.");
    }
    // Trim any trailing whitespace/newlines from the path
    vs_path.erase(
        std::find_if(
            vs_path.rbegin(),
            vs_path.rend(),
            [](unsigned char ch) { return !std::isspace(ch); })
            .base(),
        vs_path.end());
    // Read the envs from vcvarsall.
    std::string envs = JitCompiler::exec(
        fmt::format(
            "\"{0}\\VC\\Auxiliary\\Build\\vcvarsall.bat\" {1} >NUL && set",
            vs_path,
            arch));
    for (const std::string& line : str_split(envs, '\n')) {
      // Each line is in the format "ENV_NAME=values".
      // Lines without '=', with an empty name, or with an empty value are
      // skipped.
      auto pos = line.find_first_of('=');
      if (pos == std::string::npos || pos == 0 || pos == line.size() - 1)
        continue;
      std::string name = line.substr(0, pos);
      std::string value = line.substr(pos + 1);
      if (name == "LIB") {
        libpaths = str_split(value, ';');
      } else if (name == "VCToolsInstallDir" || name == "VCTOOLSINSTALLDIR") {
        cl_exe = fmt::format("{0}\\bin\\Host{1}\\{1}\\cl.exe", value, arch);
      }
    }
  }
  std::string arch;
  std::string cl_exe;
  std::vector<std::string> libpaths;
};

// Returns the VisualStudioInfo instance shared by all callers. It is a
// function-local static, so it is constructed on the first call and reused
// afterwards.
const VisualStudioInfo& GetVisualStudioInfo() {
  static const VisualStudioInfo instance;
  return instance;
}

} // namespace

#endif // _MSC_VER

// Builds the shell command that compiles `source_file_name` (located in
// `dir`) into the shared library `shared_lib_name` (also in `dir`). MSVC
// builds invoke cl.exe after changing into `dir`; all other builds invoke
// g++ with full paths. Compiler stderr is redirected to stdout in both cases.
std::string JitCompiler::build_command(
    const std::filesystem::path& dir,
    const std::string& source_file_name,
    const std::string& shared_lib_name) {
#ifdef _MSC_VER
  const VisualStudioInfo& vs = GetVisualStudioInfo();

  // One /libpath flag per library directory reported by vcvarsall.
  std::string link_flags;
  for (const std::string& lib_dir : vs.libpaths) {
    link_flags += fmt::format(" /libpath:\"{0}\"", lib_dir);
  }

  // The whole command line is wrapped in an extra pair of quotes.
  return fmt::format(
      "\""
      "cd /D \"{0}\" && "
      "\"{1}\" /LD /EHsc /MD /Ox /nologo /std:c++17 \"{2}\" "
      "/link /out:\"{3}\" {4} 2>&1"
      "\"",
      dir.string(),
      vs.cl_exe,
      source_file_name,
      shared_lib_name,
      link_flags);
#else
  const std::string source_path = (dir / source_file_name).string();
  const std::string library_path = (dir / shared_lib_name).string();
  return fmt::format(
      "g++ -std=c++17 -O3 -Wall -fPIC -shared \"{0}\" -o \"{1}\" 2>&1",
      source_path,
      library_path);
#endif
}

// Runs `cmd` through popen and returns everything it wrote to stdout with
// trailing whitespace removed.
//
// Throws std::runtime_error if the pipe cannot be opened or closed, or if the
// command does not finish successfully; in the latter case the message
// includes the return code, the command and the captured output.
std::string JitCompiler::exec(const std::string& cmd) {
#ifdef _MSC_VER
  FILE* pipe = _popen(cmd.c_str(), "r");
#else
  FILE* pipe = popen(cmd.c_str(), "r");
#endif
  if (!pipe) {
    throw std::runtime_error("popen() failed.");
  }
  // Read the command output in fixed-size chunks until EOF.
  char buffer[128];
  std::string ret;
  while (fgets(buffer, sizeof(buffer), pipe)) {
    ret += buffer;
  }
  // Trim trailing spaces.
  ret.erase(
      std::find_if(
          ret.rbegin(),
          ret.rend(),
          [](unsigned char ch) { return !std::isspace(ch); })
          .base(),
      ret.end());

#ifdef _MSC_VER
  int status = _pclose(pipe);
#else
  int status = pclose(pipe);
#endif
  if (status == -1) {
    throw std::runtime_error("pclose() failed.");
  }
#if defined(_WIN32) || defined(__FreeBSD__)
  int code = status;
#else
  // WEXITSTATUS is only meaningful when the child exited normally. If it was
  // terminated some other way (e.g. killed by a signal), report the raw
  // nonzero wait status so the failure is not mistaken for exit code 0.
  int code = WIFEXITED(status) ? WEXITSTATUS(status) : status;
#endif
  if (code != 0) {
    throw std::runtime_error(
        fmt::format(
            "Failed to execute command with return code {0}: \"{1}\", "
            "the output is: {2}",
            code,
            cmd,
            ret));
  }
  return ret;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/jit_compiler.h
================================================
// Copyright © 2024 Apple Inc.
#pragma once

#include <filesystem>

namespace mlx::core {

// Static helpers for compiling generated source code into a shared library
// by invoking an external compiler through the shell.
class JitCompiler {
 public:
  // Build a shell command that compiles a source code file to a shared library.
  // `dir` is the directory holding the source file and receiving the library;
  // `source_file_name` and `shared_lib_name` are file names within it.
  static std::string build_command(
      const std::filesystem::path& dir,
      const std::string& source_file_name,
      const std::string& shared_lib_name);

  // Run a command and get its output.
  static std::string exec(const std::string& cmd);
};

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/lapack.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <complex>
// Map the LAPACK complex type names and their real/imag accessors onto
// std::complex. These macros are defined ahead of the BLAS/LAPACK includes
// below so that those headers see them.
// NOTE(review): LAPACK_COMPLEX_CUSTOM looks like the opt-in switch the
// reference LAPACK headers check before defining their own complex types —
// confirm against the lapack.h in use.
#define LAPACK_COMPLEX_CUSTOM
#define lapack_complex_float std::complex<float>
#define lapack_complex_double std::complex<double>
#define lapack_complex_float_real(z) ((z).real())
#define lapack_complex_float_imag(z) ((z).imag())
#define lapack_complex_double_real(z) ((z).real())
#define lapack_complex_double_imag(z) ((z).imag())

#ifdef MLX_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#include <lapack.h>
#endif

// MLX_LAPACK_FUNC(f) expands to the symbol used to call LAPACK routine `f`:
// the LAPACK_-prefixed name when the included header provides LAPACK_GLOBAL or
// LAPACK_NAME, otherwise the routine name with a trailing underscore.
#if defined(LAPACK_GLOBAL) || defined(LAPACK_NAME)

// This is to work around a change in the function signatures of lapack >= 3.9.1
// where functions taking char* also include a strlen argument, see a similar
// change in OpenCV:
// https://github.com/opencv/opencv/blob/1eb061f89de0fb85c4c75a2deeb0f61a961a63ad/cmake/OpenCVFindLAPACK.cmake#L57
#define MLX_LAPACK_FUNC(f) LAPACK_##f

#else

// No wrapper macros available: call the underscore-suffixed symbol directly.
#define MLX_LAPACK_FUNC(f) f##_

#endif

// Defines a function template `FUNC<T>(args...)` that dispatches on T at
// compile time: T == float calls the `s`-prefixed LAPACK routine, T == double
// calls the `d`-prefixed one. The arguments are taken by value and passed
// through unchanged. For any other T neither branch is instantiated, so the
// generated function body is empty and the call does nothing.
#define INSTANTIATE_LAPACK_REAL(FUNC)                        \
  template <typename T, typename... Args>                    \
  void FUNC(Args... args) {                                  \
    if constexpr (std::is_same_v<T, float>) {                \
      MLX_LAPACK_FUNC(s##FUNC)(std::forward<Args>(args)...); \
    } else if constexpr (std::is_same_v<T, double>) {        \
      MLX_LAPACK_FUNC(d##FUNC)(std::forward<Args>(args)...); \
    }                                                        \
  }

// Routines wrapped for real (float / double) element types only.
INSTANTIATE_LAPACK_REAL(geqrf)
INSTANTIATE_LAPACK_REAL(orgqr)
INSTANTIATE_LAPACK_REAL(syevd)
INSTANTIATE_LAPACK_REAL(potrf)
INSTANTIATE_LAPACK_REAL(getrf)
INSTANTIATE_LAPACK_REAL(getri)
INSTANTIATE_LAPACK_REAL(trtri)

// Complex counterpart of INSTANTIATE_LAPACK_REAL: defines `FUNC<T>(args...)`
// that calls the `c`-prefixed routine for T == std::complex<float> and the
// `z`-prefixed routine for T == std::complex<double>. Any other T produces an
// empty function body.
#define INSTANTIATE_LAPACK_COMPLEX(FUNC)                            \
  template <typename T, typename... Args>                           \
  void FUNC(Args... args) {                                         \
    if constexpr (std::is_same_v<T, std::complex<float>>) {         \
      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
    }                                                               \
  }

// Routines wrapped for complex element types only.
INSTANTIATE_LAPACK_COMPLEX(heevd)

// Defines `FUNC<T>(args...)` covering all four LAPACK precisions: float (`s`),
// double (`d`), std::complex<float> (`c`) and std::complex<double> (`z`). As
// with the other wrappers, an unsupported T yields an empty function body.
#define INSTANTIATE_LAPACK_ALL(FUNC)                                \
  template <typename T, typename... Args>                           \
  void FUNC(Args... args) {                                         \
    if constexpr (std::is_same_v<T, float>) {                       \
      MLX_LAPACK_FUNC(s##FUNC)(std::forward<Args>(args)...);        \
    } else if constexpr (std::is_same_v<T, double>) {               \
      MLX_LAPACK_FUNC(d##FUNC)(std::forward<Args>(args)...);        \
    } else if constexpr (std::is_same_v<T, std::complex<float>>) {  \
      MLX_LAPACK_FUNC(c##FUNC)(std::forward<Args>(args)...);        \
    } else if constexpr (std::is_same_v<T, std::complex<double>>) { \
      MLX_LAPACK_FUNC(z##FUNC)(std::forward<Args>(args)...);        \
    }                                                               \
  }

// Routines wrapped for both real and complex element types.
INSTANTIATE_LAPACK_ALL(geev)
INSTANTIATE_LAPACK_ALL(gesdd)


================================================
FILE: mlx/backend/cpu/logsumexp.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cassert>
#include <cmath>

#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/primitives.h"
#include "mlx/types/limits.h"

namespace mlx::core {

namespace {

using namespace mlx::core::simd;

template <typename T, typename AccT>
void logsumexp(const array& in, array& out, Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(in);
  encoder.set_output_array(out);

  const T* in_ptr = in.data<T>();
  T* out_ptr = out.data<T>();

  int M = in.shape().back();
  int L = in.data_size() / M;

  encoder.dispatch([in_ptr, out_ptr, M, L]() mutable {
    constexpr int N = std::min(max_size<AccT>, max_size<T>);

    const T* current_in_ptr;

    for (int i = 0; i < L; i++, in_ptr += M, out_ptr += 1) {
      // Find the maximum
      current_in_ptr = in_ptr;
      Simd<AccT, N> vmaximum(-numeric_limits<AccT>::infinity());
      size_t s = M;
      while (s >= N) {
        Simd<AccT, N> vals = load<T, N>(current_in_ptr);
        vmaximum = maximum(vals, vmaximum);
        current_in_ptr += N;
        s -= N;
      }

      AccT maximum = max(vmaximum);
      while (s-- > 0) {
        maximum = std::max(maximum, static_cast<AccT>(*current_in_ptr));
        current_in_ptr++;
      }

      // Compute the normalizer and the exponentials
      Simd<AccT, N> vnormalizer(0.0);
      current_in_ptr = in_ptr;
      s = M;
      while (s >= N) {
        Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
        vexp = exp(vexp - maximum);
        vnormalizer = vnormalizer + vexp;
        current_in_ptr += N;
        s -= N;
      }
      AccT normalizer = sum(vnormalizer);
      while (s-- > 0) {
        AccT _exp = std::exp(*current_in_ptr - maximum);
        normalizer += _exp;
        current_in_ptr++;
      }
      // Normalize
      *out_ptr = std::isinf(maximum)
          ? static_cast<T>(maximum)
          : static_cast<T>(std::log(normalizer) + maximum);
    }
  });
}

} // namespace

// CPU entry point for LogSumExp: prepares a usable input layout, allocates
// the output and dispatches the kernel matching the input dtype.
void LogSumExp::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  auto s = stream();
  auto& encoder = cpu::get_command_encoder(s);

  // The kernel needs a contiguous buffer whose last axis has unit stride;
  // anything else is copied, and the copy is kept alive by the encoder.
  auto ensure_contiguous = [&s, &encoder](const array& x) {
    const bool usable =
        x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (!usable) {
      array x_copy = contiguous_copy_cpu(x, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
    return x;
  };

  auto in = ensure_contiguous(inputs[0]);
  if (!in.flags().row_contiguous) {
    // Keep the input's layout for the output: every stride shrinks by the
    // length of the last axis.
    auto n = in.shape(-1);
    auto out_flags = in.flags();
    auto out_strides = in.strides();
    for (auto& stride : out_strides) {
      stride /= n;
    }
    // Recompute column contiguity for the shrunken strides.
    bool col_contig = out_strides[0] == 1;
    for (int i = 1; col_contig && i < out_strides.size(); ++i) {
      col_contig &=
          (out.shape(i) == 1 ||
           out_strides[i - 1] == out.shape(i) * out_strides[i]);
    }
    out_flags.col_contiguous = col_contig;
    out.set_data(
        allocator::malloc(in.nbytes() / n),
        in.data_size() / n,
        std::move(out_strides),
        out_flags);
  } else {
    out.set_data(allocator::malloc(out.nbytes()));
  }

  // Half-precision types accumulate in float.
  switch (in.dtype()) {
    case float32:
      logsumexp<float, float>(in, out, s);
      break;
    case float16:
      logsumexp<float16_t, float>(in, out, s);
      break;
    case bfloat16:
      logsumexp<bfloat16_t, float>(in, out, s);
      break;
    case float64:
      logsumexp<double, double>(in, out, s);
      break;
    default:
      throw std::runtime_error(
          "[logsumexp] only supports floating point types");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/luf.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <cassert>

#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/primitives.h"

namespace mlx::core {

template <typename T>
void luf_impl(
    const array& a,
    array& lu,
    array& pivots,
    array& row_indices,
    Stream stream) {
  int M = a.shape(-2);
  int N = a.shape(-1);
  int K = std::min(M, N);

  // Copy a into lu and make it col contiguous
  auto ndim = lu.ndim();
  auto flags = lu.flags();
  flags.col_contiguous = ndim == 2;
  flags.row_contiguous = false;
  flags.contiguous = true;
  auto strides = lu.strides();
  strides[ndim - 1] = M;
  strides[ndim - 2] = 1;
  lu.set_data(allocator::malloc(lu.nbytes()), lu.nbytes(), strides, flags);
  copy_cpu_inplace(
      a,
      lu,
      a.shape(),
      a.strides(),
      strides,
      0,
      0,
      CopyType::GeneralGeneral,
      stream);

  auto a_ptr = lu.data<T>();
  pivots.set_data(allocator::malloc(pivots.nbytes()));
  row_indices.set_data(allocator::malloc(row_indices.nbytes()));
  auto pivots_ptr = pivots.data<uint32_t>();
  auto row_indices_ptr = row_indices.data<uint32_t>();
  size_t num_matrices = a.size() / (M * N);
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(lu);
  encoder.set_output_array(pivots);
  encoder.set_output_array(row_indices);

  encoder.dispatch(
      [a_ptr, pivots_ptr, row_indices_ptr, num_matrices, M, N, K]() mutable {
        int info;
        for (size_t i = 0; i < num_matrices; ++i) {
          // Compute LU factorization of A
          getrf<T>(
              /* m */ &M,
              /* n */ &N,
              /* a */ a_ptr,
              /* lda */ &M,
              /* ipiv */ reinterpret_cast<int*>(pivots_ptr),
              /* info */ &info);

          if (info != 0) {
            std::stringstream ss;
            ss << "[LUF::eval_cpu] sgetrf_ failed with code " << info
               << ((info > 0) ? " because matrix is singular"
                              : " because argument had an illegal value");
            throw std::runtime_error(ss.str());
          }

          // Subtract 1 to get 0-based index
          int j = 0;
          for (; j < K; ++j) {
            pivots_ptr[j]--;
            row_indices_ptr[j] = j;
          }
          for (; j < M; ++j) {
            row_indices_ptr[j] = j;
          }
          for (int j = K - 1; j >= 0; --j) {
            auto piv = pivots_ptr[j];
            auto t1 = row_indices_ptr[piv];
            auto t2 = row_indices_ptr[j];
            row_indices_ptr[j] = t1;
            row_indices_ptr[piv] = t2;
          }

          // Advance pointers to the next matrix
          a_ptr += M * N;
          pivots_ptr += K;
          row_indices_ptr += M;
        }
      });
}

void LUF::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  switch (inputs[0].dtype()) {
    case float32:
      luf_impl<float>(inputs[0], outputs[0], outputs[1], outputs[2], stream());
      break;
    case float64:
      luf_impl<double>(inputs[0], outputs[0], outputs[1], outputs[2], stream());
      break;
    default:
      throw std::runtime_error(
          "[LUF::eval_cpu] only supports float32 or float64.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/make_compiled_preamble.ps1
================================================
# This script generates a C++ function that provides the CPU
# code for use with kernel generation.
#
# Copyright © 2024 Apple Inc.

# Positional arguments:
#   0: path of the C++ file to generate
#   1: compiler executable to invoke (called with MSVC-style switches below)
#   2: root of the source tree, used as include directory
$OUTPUT_FILE = $args[0]
$CL = $args[1]
$SRCDIR = $args[2]

# Run the compiler on compiled_preamble.h with /EP and capture its output as
# an array of lines.
$CONTENT = & $CL /std:c++17 /EP "/I$SRCDIR" /Tp "$SRCDIR/mlx/backend/cpu/compiled_preamble.h"
# Remove empty lines.
# Otherwise there would be too many empty lines, making the result unreadable.
$CONTENT = $CONTENT | Where-Object { $_.Trim() -ne '' }
# Join the remaining lines into a single newline-separated string.
$CONTENT = $CONTENT -join "`n"

# Append the using-directives that the generated kernels rely on.
$CONTENT = @"
$($CONTENT)
using namespace mlx::core;
using namespace mlx::core::detail;
"@

# Convert each char to its ASCII code and add a terminating 0.
# Unlike the unix script that outputs a string literal directly, the output
# from MSVC is way too large to be embedded as a string and compilation would
# fail, so we store it as a static array instead.
$CHARCODES = ([System.Text.Encoding]::ASCII.GetBytes($CONTENT) -join ', ') + ', 0'

# C++ source of the accessor returning the embedded preamble.
$OUTPUT = @"
const char* get_kernel_preamble() {
  static char preamble[] = { $CHARCODES };
  return preamble;
}
"@

# Write the generated source to the requested location.
Set-Content -Path $OUTPUT_FILE -Value $OUTPUT


================================================
FILE: mlx/backend/cpu/make_compiled_preamble.sh
================================================
#!/bin/bash
#
# This script generates a C++ function that provides the CPU
# code for use with kernel generation.
#
# Copyright © 2023-24 Apple Inc.


# Positional arguments:
#   $1: path of the C++ file to generate
#   $2: compiler command used to preprocess the preamble header
#   $3: root of the source tree, used as include directory
#   $4: "TRUE" to take the clang code path below
#   $5: target architecture, only used on the clang code path
OUTPUT_FILE=$1
GCC=$2
SRCDIR=$3
CLANG=$4
ARCH=$5

# On the clang path the header is preprocessed without the builtin and
# standard include directories, and the #include lines listed here are
# emitted verbatim at the top of the generated preamble instead. On the other
# path INCLUDES stays unset and expands to nothing.
if [ "$CLANG" = "TRUE" ]; then
  read -r -d '' INCLUDES <<- EOM
#include <cmath>
#include <complex>
#include <cstdint>
#include <vector>
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
#include <arm_fp16.h>
#endif
EOM
CC_FLAGS="-arch ${ARCH} -nobuiltininc -nostdinc"
else
CC_FLAGS="-std=c++17"
fi

# Preprocess compiled_preamble.h (-E -P) and capture the result. Anything the
# compiler prints on stderr is discarded.
CONTENT=$($GCC $CC_FLAGS -I "$SRCDIR" -E -P "$SRCDIR/mlx/backend/cpu/compiled_preamble.h" 2>/dev/null)

# Emit a function returning the preamble as one raw string literal, followed
# by the using-directives the generated kernels rely on.
cat << EOF > "$OUTPUT_FILE"
const char* get_kernel_preamble() {
return R"preamble(
$INCLUDES
$CONTENT
using namespace mlx::core;
using namespace mlx::core::detail;
)preamble";
}
EOF


================================================
FILE: mlx/backend/cpu/masked_mm.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <cstring>

#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/gemm.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

// Apply a block mask to an X x Y matrix in place.
//
// The matrix is split into tiles of block_size x block_size (edge tiles may be
// smaller) and tile (i, j) reads its mask entry from
// mask[mask_offset + i * X_mask_str + j * Y_mask_str]. A mask entry equal to 1
// leaves the tile untouched. Otherwise a bool mask zeroes the tile and any
// other mask type multiplies every element of the tile by the mask entry.
// Element (x, y) of the matrix lives at data[x * X_data_str + y * Y_data_str].
template <typename T, typename mask_t>
inline void mask_matrix(
    T* data,
    const mask_t* mask,
    int block_size,
    const int X,
    const int Y,
    const int64_t X_data_str,
    const int64_t Y_data_str,
    const int64_t X_mask_str,
    const int64_t Y_mask_str,
    const size_t mask_offset) {
  const int blocks_x = (X + block_size - 1) / block_size;
  const int blocks_y = (Y + block_size - 1) / block_size;

  for (int bx = 0; bx < blocks_x; ++bx) {
    const int row0 = bx * block_size;
    const int rows = std::min(block_size, X - row0);

    for (int by = 0; by < blocks_y; ++by) {
      const mask_t weight =
          mask[mask_offset + bx * X_mask_str + by * Y_mask_str];
      if (weight == 1) {
        // Tile is kept as is.
        continue;
      }

      const int col0 = by * block_size;
      const int cols = std::min(block_size, Y - col0);
      T* block = data + row0 * X_data_str + col0 * Y_data_str;

      for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
          T& cell = block[r * X_data_str + c * Y_data_str];
          if constexpr (std::is_same_v<mask_t, bool>) {
            cell = T(0.);
          } else {
            cell *= weight;
          }
        }
      }
    }
  }
}

template <typename T>
inline void segmented_mm(
    const T* a,
    const T* b,
    const uint32_t* segments,
    T* out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    const Shape& a_shape,
    const Strides& a_strides,
    const Shape& b_shape,
    const Strides& b_strides,
    size_t num_segments,
    const Shape& segments_shape,
    const Strides& segments_strides) {
  int ndim = a_shape.size();
  Shape a_copy = a_shape;
  Shape b_copy = b_shape;
  int32_t M = a_copy[ndim - 2];
  int32_t N = b_copy[ndim - 1];
  for (int i = 0; i < num_segments; i++) {
    uint32_t k_start =
        segments[elem_to_loc(2 * i, segments_shape, segments_strides)];
    uint32_t k_end =
        segments[elem_to_loc(2 * i + 1, segments_shape, segments_strides)];
    if (k_end <= k_start) {
      std::fill_n(out + i * M * N, M * N, T(0));
      continue;
    }
    a_copy[ndim - 1] = k_end - k_start;
    b_copy[ndim - 2] = k_end - k_start;
    matmul<T>(
        a + k_start * a_strides[ndim - 1],
        b + k_start * b_strides[ndim - 2],
        out + i * M * N,
        a_transposed,
        b_transposed,
        lda,
        ldb,
        N,
        1.0,
        0.0,
        1,
        a_copy,
        a_strides,
        b_copy,
        b_strides);
  }
}

} // namespace

// CPU evaluation of a block-masked matrix multiplication (float32 only).
//
// inputs[0] and inputs[1] are the operands. With 3 or 5 inputs, inputs[2] is a
// mask applied to the output; with more than 3 inputs, the last two inputs are
// masks applied to the operands before multiplying. Masks are either bool or
// float and are applied per block of block_size_ x block_size_ elements.
//
// Throws std::runtime_error when the output dtype is not float32.
void BlockMaskedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (out.dtype() != float32) {
    throw std::runtime_error(
        "[BlockMaskedMM::eval] Currently only supports float32.");
  }
  out.set_data(allocator::malloc(out.nbytes()));

  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];

  // Returns (is_transposed, leading dimension, array to use, was_copied).
  // `do_copy` forces a copy even when the layout is directly usable, which is
  // needed when operand masks are present because they are applied to the
  // operand data in place. `expand_all` skips the layout shortcuts and always
  // makes a row-contiguous copy.
  auto check_transpose =
      [s = stream()](const array& arr, bool do_copy, bool expand_all = false) {
        auto stx = arr.strides()[arr.ndim() - 2];
        auto sty = arr.strides()[arr.ndim() - 1];
        if (!expand_all && stx == arr.shape(-1) && sty == 1) {
          // Last two axes are row-major.
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
            copy_cpu(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(false, stx, arr_copy, true);
          }
          return std::make_tuple(false, stx, arr, false);
        } else if (!expand_all && stx == 1 && sty == arr.shape(-2)) {
          // Last two axes are column-major: use as a transposed operand.
          if (do_copy) {
            array arr_copy(arr.shape(), arr.dtype(), nullptr, {});
            copy_cpu(arr, arr_copy, CopyType::Vector, s);
            return std::make_tuple(true, sty, arr_copy, true);
          }
          return std::make_tuple(true, sty, arr, false);
        } else {
          // Any other layout: fall back to a contiguous copy.
          int64_t stx = arr.shape(-1);
          array arr_copy = contiguous_copy_cpu(arr, s);
          return std::make_tuple(false, stx, arr_copy, true);
        }
      };

  bool has_op_mask = inputs.size() > 3;
  bool has_out_mask = inputs.size() == 3 || inputs.size() == 5;
  // The general copy is requested whenever the last input is not a bool array.
  auto [a_transposed, lda, a, a_copied] =
      check_transpose(a_pre, has_op_mask, inputs.back().dtype() != bool_);
  auto [b_transposed, ldb, b, b_copied] =
      check_transpose(b_pre, has_op_mask, inputs.back().dtype() != bool_);

  size_t M = a.shape(-2);
  size_t N = b.shape(-1);
  size_t K = a.shape(-1);

  // Empty output: nothing to compute.
  if (M == 0 || N == 0) {
    return;
  }

  auto& encoder = cpu::get_command_encoder(stream());
  // Empty inner dimension: the result is all zeros.
  if (K == 0) {
    encoder.set_output_array(out);
    encoder.dispatch([out_ptr = out.data<void>(), nbytes = out.nbytes()]() {
      std::memset(out_ptr, 0, nbytes);
    });
    return;
  }

  // Type-erased front end for mask_matrix: locates the mask of matrix
  // `batch_idx` and forwards to the bool or float instantiation.
  auto mask_array = [](const void* mask,
                       float* data,
                       int block_size,
                       int batch_idx,
                       int X,
                       int Y,
                       size_t X_data_str,
                       size_t Y_data_str,
                       const Shape& mask_shape,
                       const Strides& mask_strides,
                       bool is_bool) {
    auto ndim = mask_shape.size();
    // Offset of the first mask entry belonging to this batch element.
    auto mask_offset = elem_to_loc(
        mask_shape[ndim - 1] * mask_shape[ndim - 2] * batch_idx,
        mask_shape,
        mask_strides);

    auto X_mask_str = mask_strides[ndim - 2];
    auto Y_mask_str = mask_strides[ndim - 1];

    if (is_bool) {
      return mask_matrix(
          data,
          static_cast<const bool*>(mask),
          block_size,
          X,
          Y,
          X_data_str,
          Y_data_str,
          X_mask_str,
          Y_mask_str,
          mask_offset);
    } else {
      return mask_matrix(
          data,
          static_cast<const float*>(mask),
          block_size,
          X,
          Y,
          X_data_str,
          Y_data_str,
          X_mask_str,
          Y_mask_str,
          mask_offset);
    }
  };

  encoder.set_input_array(a);
  encoder.set_input_array(b);
  // Mask metadata is copied into plain values so that the dispatched task does
  // not need to reference the input arrays themselves.
  const void* a_mask_ptr = nullptr;
  const void* b_mask_ptr = nullptr;
  const void* out_mask_ptr = nullptr;
  Shape a_mask_shape;
  Shape b_mask_shape;
  Shape out_mask_shape;
  Strides a_mask_strides;
  Strides b_mask_strides;
  Strides out_mask_strides;
  bool a_mask_bool = false;
  bool b_mask_bool = false;
  bool out_mask_bool = false;
  if (has_op_mask) {
    // Operand masks are always the last two inputs.
    auto& a_mask = inputs[inputs.size() - 2];
    auto& b_mask = inputs[inputs.size() - 1];
    a_mask_ptr = a_mask.data<void>();
    b_mask_ptr = b_mask.data<void>();
    a_mask_shape = a_mask.shape();
    b_mask_shape = b_mask.shape();
    a_mask_strides = a_mask.strides();
    b_mask_strides = b_mask.strides();
    a_mask_bool = (a_mask.dtype() == bool_);
    b_mask_bool = (b_mask.dtype() == bool_);
    encoder.set_input_array(a_mask);
    encoder.set_input_array(b_mask);
  }
  if (has_out_mask) {
    // The output mask, when present, is the third input.
    auto& out_mask = inputs[2];
    out_mask_ptr = out_mask.data<void>();
    out_mask_bool = (out_mask.dtype() == bool_);
    encoder.set_input_array(out_mask);
    out_mask_shape = out_mask.shape();
    out_mask_strides = out_mask.strides();
  }
  encoder.set_output_array(out);
  auto a_ptr = a.data<float>();
  auto b_ptr = b.data<float>();
  auto out_ptr = out.data<float>();
  size_t num_matrices = out.size() / (M * size_t(N));
  auto ldc = out.shape(-1);

  encoder.dispatch([a_ptr,
                    b_ptr,
                    out_ptr,
                    a_mask_ptr,
                    b_mask_ptr,
                    out_mask_ptr,
                    has_op_mask,
                    has_out_mask,
                    block_size = block_size_,
                    num_matrices,
                    M,
                    N,
                    K,
                    a_transposed = a_transposed,
                    b_transposed = b_transposed,
                    lda = lda,
                    ldb = ldb,
                    ldc,
                    a_shape = a.shape(),
                    a_strides = a.strides(),
                    b_shape = b.shape(),
                    b_strides = b.strides(),
                    a_mask_shape = std::move(a_mask_shape),
                    b_mask_shape = std::move(b_mask_shape),
                    out_mask_shape = std::move(out_mask_shape),
                    a_mask_strides = std::move(a_mask_strides),
                    b_mask_strides = std::move(b_mask_strides),
                    out_mask_strides = std::move(out_mask_strides),
                    mask_array,
                    a_mask_bool,
                    b_mask_bool,
                    out_mask_bool]() {
    for (int i = 0; i < num_matrices; ++i) {
      // Adjust pointer
      float* ai = a_ptr + elem_to_loc(M * K * i, a_shape, a_strides);
      float* bi = b_ptr + elem_to_loc(K * N * i, b_shape, b_strides);
      float* ci = out_ptr + M * N * i;

      // Zero out blocks in a and b if needed
      if (has_op_mask) {
        // For a transposed operand the row and column strides are swapped.
        mask_array(
            a_mask_ptr,
            ai,
            block_size,
            i,
            M,
            K,
            a_transposed ? 1 : lda,
            a_transposed ? lda : 1,
            a_mask_shape,
            a_mask_strides,
            a_mask_bool);

        mask_array(
            b_mask_ptr,
            bi,
            block_size,
            i,
            K,
            N,
            b_transposed ? 1 : ldb,
            b_transposed ? ldb : 1,
            b_mask_shape,
            b_mask_strides,
            b_mask_bool);
      }

      // Do matmul
      cblas_sgemm(
          CblasRowMajor,
          a_transposed ? CblasTrans : CblasNoTrans, // transA
          b_transposed ? CblasTrans : CblasNoTrans, // transB
          M,
          N,
          K,
          1.0, // alpha
          ai,
          lda,
          bi,
          ldb,
          0.0, // beta
          ci,
          ldc);

      // Zero out blocks in out
      if (has_out_mask) {
        // The output is row-major with N elements per row.
        mask_array(
            out_mask_ptr,
            ci,
            block_size,
            i,
            M,
            N,
            N,
            1,
            out_mask_shape,
            out_mask_strides,
            out_mask_bool);
      }
    }
  });
  // Hand the operand copies to the encoder as temporaries.
  if (a_copied) {
    encoder.add_temporary(a);
  }
  if (b_copied) {
    encoder.add_temporary(b);
  }
}

// CPU evaluation of a gathered matrix multiplication (float32 only).
//
// inputs[0] and inputs[1] are the (batched) operands; inputs[2] and inputs[3]
// hold, for every matrix of the output, the index of the matrix of a and of b
// to multiply.
//
// Throws std::runtime_error when the output dtype is not float32.
void GatherMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (out.dtype() != float32) {
    throw std::runtime_error(
        "[GatherMM::eval] Currently only supports float32.");
  }
  out.set_data(allocator::malloc(out.nbytes()));

  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];

  // Returns (is_transposed, leading dimension, array to use). Operands whose
  // last two axes are neither row- nor column-major are copied; the copies are
  // collected in `temps`.
  std::vector<array> temps;
  auto check_transpose = [s = stream(), &temps](const array& arr) {
    auto stx = arr.strides()[arr.ndim() - 2];
    auto sty = arr.strides()[arr.ndim() - 1];
    if (stx == arr.shape(-1) && sty == 1) {
      return std::make_tuple(false, stx, arr);
    } else if (stx == 1 && sty == arr.shape(-2)) {
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
      copy_cpu(arr, temps.back(), CopyType::General, s);
      int64_t stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
  };

  auto [a_transposed, lda, a] = check_transpose(a_pre);
  auto [b_transposed, ldb, b] = check_transpose(b_pre);

  size_t M = a.shape(-2);
  size_t N = b.shape(-1);
  size_t K = a.shape(-1);

  // Empty output: nothing to compute.
  if (M == 0 || N == 0) {
    return;
  }

  auto& encoder = cpu::get_command_encoder(stream());
  // Empty inner dimension: the result is all zeros.
  if (K == 0) {
    encoder.set_output_array(out);
    encoder.dispatch([out_ptr = out.data<float>(), size = out.size()]() {
      std::fill_n(out_ptr, size, 0);
    });
    return;
  }

  // Get batch dims
  auto batch_size_out = out.size() / (M * N);
  size_t matrix_stride_out = M * N;

  // Everything but the last two entries of a shape or strides vector.
  auto get_batch_dims = [](const auto& v) {
    return decltype(v){v.begin(), v.end() - 2};
  };

  auto& lhs_indices = inputs[2];
  auto& rhs_indices = inputs[3];

  auto batch_shape = get_batch_dims(out.shape());

  auto batch_shape_A = get_batch_dims(a.shape());
  auto batch_strides_A = get_batch_dims(a.strides());
  auto batch_shape_B = get_batch_dims(b.shape());
  auto batch_strides_B = get_batch_dims(b.strides());

  const uint32_t* lhs_indices_ptr = lhs_indices.data<uint32_t>();
  const uint32_t* rhs_indices_ptr = rhs_indices.data<uint32_t>();
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(lhs_indices);
  encoder.set_input_array(rhs_indices);
  encoder.set_output_array(out);
  auto ldc = out.shape(-1);

  // Shapes and strides are captured by value so that the task does not
  // reference the arrays themselves.
  encoder.dispatch([a_ptr = a.data<float>(),
                    b_ptr = b.data<float>(),
                    out_ptr = out.data<float>(),
                    M,
                    N,
                    K,
                    lda = lda,
                    ldb = ldb,
                    a_transposed = a_transposed,
                    b_transposed = b_transposed,
                    ldc,
                    lhs_indices_ptr,
                    rhs_indices_ptr,
                    lhs_indices_shape = lhs_indices.shape(),
                    lhs_indices_strides = lhs_indices.strides(),
                    rhs_indices_shape = rhs_indices.shape(),
                    rhs_indices_strides = rhs_indices.strides(),
                    batch_size_out,
                    matrix_stride_out,
                    batch_shape_A = std::move(batch_shape_A),
                    batch_shape_B = std::move(batch_shape_B),
                    batch_strides_A = std::move(batch_strides_A),
                    batch_strides_B = std::move(batch_strides_B)]() {
    for (int i = 0; i < batch_size_out; i++) {
      // Get index
      uint32_t indx_A = lhs_indices_ptr[elem_to_loc(
          i, lhs_indices_shape, lhs_indices_strides)];
      uint32_t indx_B = rhs_indices_ptr[elem_to_loc(
          i, rhs_indices_shape, rhs_indices_strides)];

      // The gathered indices are turned into element offsets through the
      // batch shape and strides of each operand.
      cblas_sgemm(
          CblasRowMajor,
          a_transposed ? CblasTrans : CblasNoTrans, // transA
          b_transposed ? CblasTrans : CblasNoTrans, // transB
          M,
          N,
          K,
          1.0f, // alpha
          a_ptr + elem_to_loc(indx_A, batch_shape_A, batch_strides_A),
          lda,
          b_ptr + elem_to_loc(indx_B, batch_shape_B, batch_strides_B),
          ldb,
          0.0f, // beta
          out_ptr + matrix_stride_out * i,
          ldc);
    }
  });
  // Hand the operand copies to the encoder as temporaries.
  encoder.add_temporaries(std::move(temps));
}

void SegmentedMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));

  auto& s = stream();
  auto& encoder = cpu::get_command_encoder(stream());
  auto check_transpose = [&s, &encoder](const array& x) {
    auto stx = x.strides()[x.ndim() - 2];
    auto sty = x.strides()[x.ndim() - 1];
    if (stx == x.shape(-1) && sty == 1) {
      return std::make_tuple(false, stx, x);
    } else if (stx == 1 && sty == x.shape(-2)) {
      return std::make_tuple(true, sty, x);
    } else {
      array xc(x.shape(), x.dtype(), nullptr, {});
      copy_cpu(x, xc, CopyType::General, s);
      encoder.add_temporary(xc);
      int64_t stx = x.shape(-1);
      return std::make_tuple(false, stx, xc);
    }
  };

  auto [a_transposed, lda, a] = check_transpose(inputs[0]);
  auto [b_transposed, ldb, b] = check_transpose(inputs[1]);
  auto& segments = inputs[2];

  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(segments);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    b = array::unsafe_weak_copy(b),
                    segments = array::unsafe_weak_copy(segments),
                    out_ptr = out.data<void>(),
                    a_transposed = a_transposed,
                    b_transposed = b_transposed,
                    lda = lda,
                    ldb = ldb]() {
    switch (a.dtype()) {
      case float64:
        segmented_mm<double>(
            a.data<double>(),
            b.data<double>(),
            segments.data<uint32_t>(),
            static_cast<double*>(out_ptr),
            a_transposed,
            b_transposed,
            lda,
            ldb,
            a.shape(),
            a.strides(),
            b.shape(),
            b.strides(),
            segments.size() / 2,
            segments.shape(),
            segments.strides());
        break;
      case float32:
        segmented_mm<float>(
            a.data<float>(),
            b.data<float>(),
            segments.data<uint32_t>(),
            static_cast<float*>(out_ptr),
            a_transposed,
            b_transposed,
            lda,
            ldb,
            a.shape(),
            a.strides(),
            b.shape(),
            b.strides(),
            segments.size() / 2,
            segments.shape(),
            segments.strides());
        break;
      case float16:
        segmented_mm<float16_t>(
            a.data<float16_t>(),
            b.data<float16_t>(),
            segments.data<uint32_t>(),
            static_cast<float16_t*>(out_ptr),
            a_transposed,
            b_transposed,
            lda,
            ldb,
            a.shape(),
            a.strides(),
            b.shape(),
            b.strides(),
            segments.size() / 2,
            segments.shape(),
            segments.strides());
        break;
      case bfloat16:
        segmented_mm<bfloat16_t>(
            a.data<bfloat16_t>(),
            b.data<bfloat16_t>(),
            segments.data<uint32_t>(),
            static_cast<bfloat16_t*>(out_ptr),
            a_transposed,
            b_transposed,
            lda,
            ldb,
            a.shape(),
            a.strides(),
            b.shape(),
            b.strides(),
            segments.size() / 2,
            segments.shape(),
            segments.strides());
        break;
      default:
        throw std::invalid_argument(
            "Segmented mm supports only real float types.");
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/matmul.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cstring>
#include "mlx/array.h"
#include "mlx/backend/cpu/binary.h"
#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/gemm.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Queue a (batched) matrix multiplication of element type T on the CPU encoder
// of `stream`.
//
// Data pointers, shapes and strides of the operands are captured by value and
// handed to matmul<T> when the task runs. The leading dimension of the output
// is the size of its last axis, and the batch size is the number of matrices
// contained in `a`.
template <typename T>
void matmul_dispatch(
    const array& a,
    const array& b,
    array& out,
    bool a_transposed,
    bool b_transposed,
    size_t lda,
    size_t ldb,
    float alpha,
    float beta,
    Stream stream) {
  const T* lhs = a.data<T>();
  const T* rhs = b.data<T>();
  T* dst = out.data<T>();
  const size_t ldc = out.shape(-1);
  const size_t batch_size = a.size() / (a.shape(-2) * a.shape(-1));

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);

  auto task = [lhs,
               rhs,
               dst,
               a_transposed,
               b_transposed,
               lda,
               ldb,
               ldc,
               alpha,
               beta,
               batch_size,
               a_shape = a.shape(),
               a_strides = a.strides(),
               b_shape = b.shape(),
               b_strides = b.strides()]() {
    matmul<T>(
        lhs,
        rhs,
        dst,
        a_transposed,
        b_transposed,
        lda,
        ldb,
        ldc,
        alpha,
        beta,
        batch_size,
        a_shape,
        a_strides,
        b_shape,
        b_strides);
  };
  encoder.dispatch(std::move(task));
}

void matmul_general(
    const array& a_pre,
    const array& b_pre,
    array& out,
    Stream stream,
    float alpha = 1.0f,
    float beta = 0.0f) {
  std::vector<array> temps;
  auto check_transpose = [stream, &temps](const array& arr) {
    auto stx = arr.strides()[arr.ndim() - 2];
    auto sty = arr.strides()[arr.ndim() - 1];
    if (stx == arr.shape(-1) && sty == 1) {
      return std::make_tuple(false, stx, arr);
    } else if (stx == 1 && sty == arr.shape(-2)) {
      return std::make_tuple(true, sty, arr);
    } else {
      temps.push_back(array(arr.shape(), arr.dtype(), nullptr, {}));
      copy_cpu(arr, temps.back(), CopyType::General, stream);
      stx = arr.shape(-1);
      return std::make_tuple(false, stx, temps.back());
    }
  };

  auto [a_transposed, lda, a] = check_transpose(a_pre);
  auto [b_transposed, ldb, b] = check_transpose(b_pre);
  size_t M = a.shape(-2);
  size_t N = b.shape(-1);
  if (M == 0 || N == 0) {
    return;
  }

  if (out.dtype() == float32) {
    matmul_dispatch<float>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else if (out.dtype() == float16) {
    matmul_dispatch<float16_t>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else if (out.dtype() == bfloat16) {
    matmul_dispatch<bfloat16_t>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else if (out.dtype() == float64) {
    matmul_dispatch<double>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else if (out.dtype() == complex64) {
    matmul_dispatch<complex64_t>(
        a, b, out, a_transposed, b_transposed, lda, ldb, alpha, beta, stream);
  } else {
    throw std::runtime_error("[Matmul::eval_cpu] Invalid type.");
  }
  cpu::get_command_encoder(stream).add_temporaries(std::move(temps));
}

void Matmul::eval_cpu(const std::vector<array>& inputs, array& out) {
  out.set_data(allocator::malloc(out.nbytes()));
  if (inputs[0].shape(-1) == 0) {
    auto& encoder = cpu::get_command_encoder(stream());
    encoder.set_output_array(out);
    encoder.dispatch([out_ptr = out.data<void>(), nbytes = out.nbytes()]() {
      std::memset(out_ptr, 0, nbytes);
    });
    return;
  }
  matmul_general(inputs[0], inputs[1], out, stream());
}

void AddMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
    out.set_data(allocator::malloc(out.nbytes()));
    return;
  }

  // Handle empty matrix case (K=0)
  if (inputs[0].shape(-1) == 0) {
    auto& c = inputs[2];
    if (beta_ == 1.0f) {
      CopyType ctype = c.data_size() == 1
          ? CopyType::Scalar
          : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
      copy_cpu(c, out, ctype, stream());
    } else {
      array beta_scalar = array(beta_, c.dtype());
      auto& encoder = cpu::get_command_encoder(stream());
      binary_float_op_cpu(c, beta_scalar, out, detail::Multiply(), stream());
      encoder.add_temporary(std::move(beta_scalar));
    }
    return;
  }

  // Fill output with C
  auto& c = inputs[2];
  CopyType ctype = c.data_size() == 1
      ? CopyType::Scalar
      : (c.flags().row_contiguous ? CopyType::Vector : CopyType::General);
  copy_cpu(c, out, ctype, stream());
  matmul_general(inputs[0], inputs[1], out, stream(), alpha_, beta_);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/primitives.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>
#include <sstream>

#include "mlx/allocator.h"
#include "mlx/backend/common/slicing.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/arange.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/threefry.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

// Reshape `in` into `out`, aliasing the input buffer whenever
// prepare_reshape() reports that no copy is required.
void reshape(const array& in, array& out) {
  auto [copy_necessary, out_strides] = prepare_reshape(in, out);
  if (!copy_necessary) {
    shared_buffer_reshape(in, out_strides, out);
    return;
  }
  out.set_data(allocator::malloc(out.nbytes()));
  copy_cpu_inplace(in, out, CopyType::General, out.primitive().stream());
}

static std::pair<array, bool> compute_dynamic_offset(
    const array& indices,
    const Strides& strides,
    const std::vector<int>& axes,
    Stream stream) {
  array offset({1}, int64, nullptr, {});
  bool donate = indices.is_donatable() &&
      (indices.data_size() * indices.itemsize()) >= offset.itemsize();
  if (donate) {
    offset.copy_shared_buffer(indices);
  } else {
    offset.set_data(allocator::malloc(offset.itemsize()));
  }

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(indices);
  encoder.set_output_array(offset);
  auto compute_offset =
      [strides, axes, offset = offset.data<int64_t>()](const auto* indices) {
        int64_t offset_ = 0;
        for (int i = 0; i < axes.size(); ++i) {
          offset_ += indices[i] * strides[axes[i]];
        }
        offset[0] = offset_;
      };
  switch (indices.dtype()) {
    case int8:
    case uint8:
      encoder.dispatch(compute_offset, indices.data<uint8_t>());
      break;
    case int16:
    case uint16:
      encoder.dispatch(compute_offset, indices.data<uint16_t>());
      break;
    case int32:
    case uint32:
      encoder.dispatch(compute_offset, indices.data<uint32_t>());
      break;
    case int64:
    case uint64:
      encoder.dispatch(compute_offset, indices.data<uint64_t>());
      break;
    default:
      throw std::runtime_error("Invalid indices type.");
  }
  return {offset, donate};
}

// The primitives below have no CPU-specific kernel in this file: each
// eval_cpu simply forwards its arguments to the primitive's own eval()
// (presumably the backend-independent implementation -- confirm where it is
// defined), or, for Slice, to slice().
void AsStrided::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
void Broadcast::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
void BroadcastAxes::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
void Copy::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
// Multi-output primitive: forwards the whole output vector
void CustomTransforms::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  eval(inputs, outputs);
}
// Multi-output primitive: forwards the whole output vector
void Depends::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  eval(inputs, outputs);
}
void ExpandDims::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
void NumberOfElements::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
// Slice is the one exception: it calls slice() with the primitive's stored
// start indices and strides instead of eval().
void Slice::eval_cpu(const std::vector<array>& inputs, array& out) {
  slice(inputs[0], out, start_indices_, strides_);
}
// Multi-output primitive: forwards the whole output vector
void Split::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  eval(inputs, outputs);
}
void Squeeze::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
void StopGradient::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}
void Transpose::eval_cpu(const std::vector<array>& inputs, array& out) {
  eval(inputs, out);
}

void Arange::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 0);
  out.set_data(allocator::malloc(out.nbytes()));
  switch (out.dtype()) {
    case bool_:
      throw std::runtime_error("Bool type unsupported for arange.");
      break;
    case uint8:
      arange<uint8_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case uint16:
      arange<uint16_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case uint32:
      arange<uint32_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case uint64:
      arange<uint64_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case int8:
      arange<int8_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case int16:
      arange<int16_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case int32:
      arange<int32_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case int64:
      arange<int64_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case float16:
      arange<float16_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case float32:
      arange<float>(start_, start_ + step_, out, out.size(), stream());
      break;
    case float64:
      arange<double>(start_, start_ + step_, out, out.size(), stream());
      break;
    case bfloat16:
      arange<bfloat16_t>(start_, start_ + step_, out, out.size(), stream());
      break;
    case complex64:
      arange<complex64_t>(start_, start_ + step_, out, out.size(), stream());
      break;
  }
}

void AsType::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  CopyType ctype = in.flags().contiguous ? CopyType::Vector : CopyType::General;
  copy_cpu(in, out, ctype, stream());
}

void Concatenate::eval_cpu(const std::vector<array>& inputs, array& out) {
  std::vector<int> sizes;
  sizes.push_back(0);
  for (auto& p : inputs) {
    sizes.push_back(p.shape(axis_));
  }
  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());

  out.set_data(allocator::malloc(out.nbytes()));

  auto strides = out.strides();
  auto flags = out.flags();
  flags.row_contiguous = false;
  flags.col_contiguous = false;
  flags.contiguous = false;
  for (int i = 0; i < inputs.size(); i++) {
    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
    size_t data_offset = strides[axis_] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
    copy_cpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, stream());
  }
}

// CPU evaluation of Contiguous: shares the input buffer when its layout is
// already acceptable and the buffer is not much larger than the output
// needs; otherwise makes a general copy.
void Contiguous::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];

  // Slack allowed between the input buffer size and the output size
  constexpr size_t extra_bytes = 16384;

  const bool small_enough = src.buffer_size() <= out.nbytes() + extra_bytes;
  const bool layout_ok = src.flags().row_contiguous ||
      (allow_col_major_ && src.flags().col_contiguous);

  if (!(small_enough && layout_ok)) {
    copy_cpu(src, out, CopyType::General, stream());
    return;
  }
  out.copy_shared_buffer(src);
}

void Flatten::eval_cpu(const std::vector<array>& inputs, array& out) {
  reshape(inputs[0], out);
}

void Unflatten::eval_cpu(const std::vector<array>& inputs, array& out) {
  reshape(inputs[0], out);
}

void Full::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  assert(in.dtype() == out.dtype());
  CopyType ctype;
  if (in.data_size() == 1) {
    ctype = CopyType::Scalar;
  } else if (in.flags().contiguous) {
    ctype = CopyType::Vector;
  } else {
    ctype = CopyType::General;
  }
  copy_cpu(in, out, ctype, stream());
}

void Pad::eval_cpu(const std::vector<array>& inputs, array& out) {
  // Inputs must be base input array and scalar val array
  assert(inputs.size() == 2);
  auto& in = inputs[0];
  auto& val = inputs[1];

  // Padding value must be a scalar
  assert(val.size() == 1);

  // Padding value, input and output must be of the same type
  assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());

  // Fill output with val
  copy_cpu(val, out, CopyType::Scalar, stream());

  // Find offset for start of input values
  size_t data_offset = 0;
  for (int i = 0; i < axes_.size(); i++) {
    auto ax = axes_[i] < 0 ? out.ndim() + axes_[i] : axes_[i];
    data_offset += out.strides()[ax] * low_pad_size_[i];
  }

  // Extract slice from output where input will be pasted
  array out_slice(in.shape(), out.dtype(), nullptr, {});
  out_slice.copy_shared_buffer(
      out, out.strides(), out.flags(), out_slice.size(), data_offset);

  // Copy input values into the slice
  copy_cpu_inplace(in, out_slice, CopyType::GeneralGeneral, stream());
}

// CPU evaluation of RandomBits: every key (a pair of uint32 values) produces
// `bytes_per_key` consecutive output bytes generated with
// random::threefry2x32_hash. Each hash call yields two 32-bit words; the
// first word goes to the first half of the key's output block and the second
// word to the second half.
void RandomBits::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  // keys has shape (N1, ..., NK, 2)
  // out has shape (N1, ..., NK, M1, M2, ...)
  auto& keys = inputs[0];
  size_t num_keys = keys.size() / 2;

  size_t elems_per_key = out.size() / num_keys;
  size_t bytes_per_key = out.itemsize() * elems_per_key;
  out.set_data(allocator::malloc(out.nbytes()));

  auto kptr = inputs[0].data<uint32_t>();
  auto cptr = out.data<char>();
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(inputs[0]);
  encoder.set_output_array(out);
  encoder.dispatch([kptr,
                    cptr,
                    bytes_per_key,
                    num_keys,
                    kshape = keys.shape(),
                    kstrides = keys.strides()]() mutable {
    // Store 32-bit word `v` at word index `loc` of the current key's block.
    // If the word would run past bytes_per_key, only the bytes that still
    // fit are copied.
    auto copy_remaining = [&](char* cptr, size_t loc, uint32_t v) {
      if (4 * loc + 4 <= bytes_per_key) {
        reinterpret_cast<uint32_t*>(cptr)[loc] = v;
      } else {
        std::copy(
            reinterpret_cast<char*>(&v),
            reinterpret_cast<char*>(&v) + bytes_per_key - 4 * loc,
            cptr + 4 * loc);
      }
    };

    // Number of 32-bit words per key, rounding a partial trailing word up
    size_t out_skip = (bytes_per_key + 4 - 1) / 4;
    auto half_size = out_skip / 2;
    bool even = out_skip % 2 == 0;
    for (int i = 0; i < num_keys; ++i, cptr += bytes_per_key) {
      auto ptr = reinterpret_cast<uint32_t*>(cptr);
      // Get ith key
      // (elem_to_loc maps the logical element index to a storage location
      // using the key array's shape and strides)
      auto kidx = 2 * i;
      auto k1_elem = elem_to_loc(kidx, kshape, kstrides);
      auto k2_elem = elem_to_loc(kidx + 1, kshape, kstrides);
      auto key = std::make_pair(kptr[k1_elem], kptr[k2_elem]);

      // count.first indexes the first half of the block, count.second the
      // second half; with an odd word count the middle word (index
      // half_size) is skipped here and filled in last.
      std::pair<uintptr_t, uintptr_t> count{0, half_size + !even};
      // All pairs except the last one are whole words and stored directly
      for (; count.first + 1 < half_size; count.first++, count.second++) {
        std::tie(ptr[count.first], ptr[count.second]) =
            random::threefry2x32_hash(key, count);
      }
      // Last pair: its second word may be the partial trailing word
      if (count.first < half_size) {
        auto rb = random::threefry2x32_hash(key, count);
        ptr[count.first++] = rb.first;
        copy_remaining(cptr, count.second, rb.second);
      }
      // Odd word count: generate the middle word from the counter
      // (half_size, 0) and keep only the first word of the hash
      if (!even) {
        count.second = 0;
        copy_remaining(
            cptr, half_size, random::threefry2x32_hash(key, count).first);
      }
    }
  });
}

void Reshape::eval_cpu(const std::vector<array>& inputs, array& out) {
  reshape(inputs[0], out);
}

void DynamicSlice::eval_cpu(const std::vector<array>& inputs, array& out) {
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }
  auto& in = inputs[0];
  out.set_data(allocator::malloc(out.nbytes()));
  auto [in_offset, donated] =
      compute_dynamic_offset(inputs[1], in.strides(), axes_, stream());
  copy_cpu_inplace(
      /* const array& src = */ in,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ out.shape(),
      /* const Strides& i_strides = */ in.strides(),
      /* const Strides& o_strides = */ out.strides(),
      /* int64_t i_offset = */ 0,
      /* int64_t o_offset = */ 0,
      /* CopyType ctype = */ CopyType::GeneralGeneral,
      stream(),
      /* const std::optional<array>& dynamic_i_offset = */ in_offset,
      /* const std::optional<array>& dynamic_o_offset = */ std::nullopt);
  if (!donated) {
    cpu::get_command_encoder(stream()).add_temporary(std::move(in_offset));
  }
}

void DynamicSliceUpdate::eval_cpu(
    const std::vector<array>& inputs,
    array& out) {
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  auto& in = inputs[0];
  auto& upd = inputs[1];

  // Copy or move src to dst
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
  copy_cpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  auto [out_offset, donated] =
      compute_dynamic_offset(inputs[2], out.strides(), axes_, stream());
  copy_cpu_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const std::vector<int>& data_shape = */ upd.shape(),
      /* const std::vector<stride_t>& i_strides = */ upd.strides(),
      /* const std::vector<stride_t>& o_strides = */ out.strides(),
      /* int64_t i_offset = */ 0,
      /* int64_t o_offset = */ 0,
      /* CopyType ctype = */ CopyType::GeneralGeneral,
      stream(),
      /* const std::optional<array>& dynamic_i_offset = */ std::nullopt,
      /* const std::optional<array>& dynamic_o_offset = */ out_offset);
  if (!donated) {
    cpu::get_command_encoder(stream()).add_temporary(std::move(out_offset));
  }
}

// CPU evaluation of View: exposes the input's bytes as the output dtype.
// The input buffer is shared when possible; otherwise the input is first
// copied into a fresh dense buffer and the output shares that copy.
void View::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& src = inputs[0];
  auto src_width = size_of(src.dtype());
  auto dst_width = size_of(out.dtype());

  // The input buffer can be shared when any of these hold:
  //  - both element types have the same size
  //  - the output type is smaller and the last axis is contiguous
  //  - the entire array is row contiguous
  const bool can_share = src_width == dst_width ||
      (dst_width < src_width && src.strides().back() == 1) ||
      src.flags().row_contiguous;

  if (!can_share) {
    // bool data is moved through the copy as uint8
    const bool src_is_bool = src.dtype() == bool_;
    auto dense =
        array(src.shape(), src_is_bool ? uint8 : src.dtype(), nullptr, {});
    dense.set_data(allocator::malloc(dense.nbytes()));
    if (src_is_bool) {
      auto src_as_u8 = array(src.shape(), uint8, nullptr, {});
      src_as_u8.copy_shared_buffer(src);
      copy_cpu_inplace(src_as_u8, dense, CopyType::General, stream());
    } else {
      copy_cpu_inplace(src, dense, CopyType::General, stream());
    }

    // The copy is dense, so the output is row contiguous; it is also column
    // contiguous when at most one dimension is larger than 1.
    auto flags = out.flags();
    flags.contiguous = true;
    flags.row_contiguous = true;
    auto largest_dim = std::max_element(out.shape().begin(), out.shape().end());
    flags.col_contiguous = out.size() <= 1 || out.size() == *largest_dim;
    out.copy_shared_buffer(dense, out.strides(), flags, out.size());
    return;
  }

  // Rescale every stride except the last from input elements to output
  // elements.
  auto strides = src.strides();
  const int num_outer = static_cast<int>(strides.size()) - 1;
  for (int d = 0; d < num_outer; ++d) {
    strides[d] *= src_width;
    strides[d] /= dst_width;
  }
  out.copy_shared_buffer(
      src, strides, src.flags(), src.data_size() * src_width / dst_width);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/qrf.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Batched QR factorization on the CPU using the LAPACK wrappers geqrf<T>
// (factorize) and orgqr<T> (form Q).
//
// `a` holds one or more M x N matrices in its last two dimensions. On
// completion `q` holds M x min(M, N) matrices and `r` holds min(M, N) x N
// matrices, both written row-major and densely packed per batch entry.
//
// The LAPACK status value `info` is not inspected.
template <typename T>
void qrf_impl(const array& a, array& q, array& r, Stream stream) {
  const int M = a.shape(-2);
  const int N = a.shape(-1);
  const int lda = M;

  // Empty matrices: there are no reflectors, so nothing would be written to
  // q or r. Allocate the outputs and stop here, which also avoids dividing
  // by M * N == 0 below.
  if (M == 0 || N == 0) {
    q.set_data(allocator::malloc(q.nbytes()));
    r.set_data(allocator::malloc(r.nbytes()));
    return;
  }

  // Element count of one matrix, computed in size_t so that large matrices
  // do not overflow 32-bit int arithmetic.
  const size_t matrix_elems = static_cast<size_t>(M) * static_cast<size_t>(N);
  size_t num_matrices = a.size() / matrix_elems;

  // Copy A to inplace input and make it col-contiguous
  array in(a.shape(), a.dtype(), nullptr, {});
  auto flags = in.flags();

  // Copy the input to be column contiguous
  flags.col_contiguous = num_matrices == 1;
  flags.row_contiguous = false;
  auto strides = in.strides();
  strides[in.ndim() - 2] = 1;
  strides[in.ndim() - 1] = M;
  in.set_data(allocator::malloc(in.nbytes()), in.nbytes(), strides, flags);
  copy_cpu_inplace(a, in, CopyType::GeneralGeneral, stream);
  auto& encoder = cpu::get_command_encoder(stream);
  q.set_data(allocator::malloc(q.nbytes()));
  r.set_data(allocator::malloc(r.nbytes()));

  auto in_ptr = in.data<T>();
  auto r_ptr = r.data<T>();
  auto q_ptr = q.data<T>();

  encoder.set_input_array(in);
  encoder.set_output_array(q);
  encoder.set_output_array(r);
  encoder.dispatch(
      [in_ptr, q_ptr, r_ptr, M, N, lda, num_matrices, matrix_elems]() {
        int num_reflectors = std::min(M, N);
        const size_t reflectors = static_cast<size_t>(num_reflectors);
        // One block of reflector scalars per matrix
        auto tau = allocator::malloc(sizeof(T) * num_matrices * num_reflectors);

        T optimal_work;
        int lwork = -1;
        int info;

        // Compute workspace size (lwork == -1 is a query, nothing is
        // factorized)
        geqrf<T>(&M, &N, nullptr, &lda, nullptr, &optimal_work, &lwork, &info);

        // Update workspace size
        lwork = optimal_work;
        auto work = allocator::malloc(sizeof(T) * lwork);

        // Factorize every matrix in place
        for (size_t i = 0; i < num_matrices; ++i) {
          geqrf<T>(
              &M,
              &N,
              in_ptr + matrix_elems * i,
              &lda,
              static_cast<T*>(tau.raw_ptr()) + reflectors * i,
              static_cast<T*>(work.raw_ptr()),
              &lwork,
              &info);
        }
        allocator::free(work);

        // Extract R before orgqr overwrites the factorized matrices: the
        // upper triangle of each column-major matrix is transposed into the
        // row-major output and the strictly lower part is zeroed.
        for (size_t i = 0; i < num_matrices; ++i) {
          // num_reflectors x N
          T* r_mat = r_ptr + i * reflectors * N;
          const T* in_mat = in_ptr + i * matrix_elems;
          for (int j = 0; j < num_reflectors; ++j) {
            T* r_row = r_mat + static_cast<size_t>(j) * N;
            for (int k = 0; k < j; ++k) {
              r_row[k] = 0;
            }
            for (int k = j; k < N; ++k) {
              r_row[k] = in_mat[j + static_cast<size_t>(k) * M];
            }
          }
        }

        // Get work size
        lwork = -1;
        orgqr<T>(
            &M,
            &num_reflectors,
            &num_reflectors,
            nullptr,
            &lda,
            nullptr,
            &optimal_work,
            &lwork,
            &info);
        lwork = optimal_work;
        work = allocator::malloc(sizeof(T) * lwork);

        // Form Q in place for every matrix
        for (size_t i = 0; i < num_matrices; ++i) {
          orgqr<T>(
              &M,
              &num_reflectors,
              &num_reflectors,
              in_ptr + matrix_elems * i,
              &lda,
              static_cast<T*>(tau.raw_ptr()) + reflectors * i,
              static_cast<T*>(work.raw_ptr()),
              &lwork,
              &info);
        }

        // Transpose the first num_reflectors columns of each column-major
        // matrix into the row-major Q output.
        for (size_t i = 0; i < num_matrices; ++i) {
          // M x num_reflectors
          T* q_mat = q_ptr + i * reflectors * M;
          const T* in_mat = in_ptr + i * matrix_elems;
          for (int j = 0; j < M; ++j) {
            T* q_row = q_mat + static_cast<size_t>(j) * reflectors;
            for (int k = 0; k < num_reflectors; ++k) {
              q_row[k] = in_mat[j + static_cast<size_t>(k) * M];
            }
          }
        }

        // Cleanup
        allocator::free(work);
        allocator::free(tau);
      });
  // Keep the column-major working copy alive until the task has run
  encoder.add_temporary(in);
}

void QRF::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  switch (inputs[0].dtype()) {
    case float32:
      qrf_impl<float>(inputs[0], outputs[0], outputs[1], stream());
      break;
    case float64:
      qrf_impl<double>(inputs[0], outputs[0], outputs[1], stream());
      break;
    default:
      throw std::runtime_error(
          "[QRF::eval_cpu] only supports float32 or float64.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/quantized.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "mlx/backend/common/quantized.h"
#include "mlx/backend/common/unary.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/backend/cpu/unary.h"
#include "mlx/backend/cpu/unary_ops.h"
#include "mlx/fast_primitives.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

// Returns `arr` itself when it is already row contiguous. Otherwise returns
// a contiguous copy made on stream `s`; the copy is registered with
// `encoder` as a temporary.
array ensure_row_contiguous(
    const array& arr,
    cpu::CommandEncoder& encoder,
    Stream s) {
  if (!arr.flags().row_contiguous) {
    auto dense = contiguous_copy_cpu(arr, s);
    encoder.add_temporary(dense);
    return dense;
  }
  return arr;
}

// Lookup table from a 4-bit code to its float value. The low three bits
// select the magnitude and bit 3 is the sign: entry i + 8 is the negation of
// entry i (including a negative zero at index 8).
static const float FP4_LUT[16] = {
    // 0x0 - 0x7: non-negative values
    0.0f,
    0.5f,
    1.0f,
    1.5f,
    2.0f,
    3.0f,
    4.0f,
    6.0f,
    // 0x8 - 0xF: the same magnitudes, negated
    -0.0f,
    -0.5f,
    -1.0f,
    -1.5f,
    -2.0f,
    -3.0f,
    -4.0f,
    -6.0f};

// Decode an 8-bit quantization scale into T.
//
// For group_size == 16 the byte is decoded with detail::FromFP8. For every
// other group size the byte is shifted into bits 7 and up of a 16-bit
// pattern that is then read back as a bfloat16_t (presumably a
// power-of-two, exponent-only scale -- confirm against the quantization
// format); a zero byte is mapped to the pattern 0x40 instead of 0.
template <typename T, int group_size>
static inline T dequantize_scale(uint8_t s) {
  if constexpr (group_size != 16) {
    union {
      bfloat16_t f;
      uint16_t i;
    } pattern;
    if (s == 0) {
      pattern.i = 0x40;
    } else {
      pattern.i = (static_cast<uint16_t>(s) << 7);
    }
    return static_cast<T>(pattern.f);
  } else {
    return static_cast<T>(detail::FromFP8{}(s));
  }
}

// Unpack one pack of `bits`-wide unsigned values from a little-endian bit
// stream starting at `w_in` and store them, converted to T, in `w_out`.
//
//   bits == 3: reads 3 bytes, writes 8 values
//   bits == 5: reads 5 bytes, writes 8 values
//   bits == 6: reads 3 bytes, writes 4 values
//
// Value i occupies stream bits [i * bits, (i + 1) * bits).
template <typename T, int bits>
void extract_bits(const uint8_t* w_in, T* w_out) {
  static_assert(bits == 3 || bits == 5 || bits == 6);
  constexpr int num_bytes = (bits == 5) ? 5 : 3;
  constexpr int num_values = (num_bytes * 8) / bits;
  constexpr uint64_t mask = (uint64_t{1} << bits) - 1;

  // Assemble the pack into a single integer, lowest byte first
  uint64_t packed = 0;
  for (int b = 0; b < num_bytes; ++b) {
    packed |= static_cast<uint64_t>(w_in[b]) << (8 * b);
  }

  // Slice consecutive fields; go through int so the conversion to T is the
  // same integer-to-T conversion regardless of T
  for (int i = 0; i < num_values; ++i) {
    const int field = static_cast<int>((packed >> (bits * i)) & mask);
    w_out[i] = static_cast<T>(field);
  }
}

// Reference quantized matmul with non-transposed weights:
//   result[m, n] = sum_k x[m, k] * (scale * q[k, n] + bias)
// where q[k, n] are the `bits`-wide integers packed in `w`, stored row by
// row (K rows of N values), and one (scale, bias) pair applies to each run
// of `group_size` consecutive values along N.
//
// result is M x N, x is M x K, both dense row-major. The inner loops consume
// whole groups, so N is assumed to be a multiple of group_size.
template <typename T, int bits, int group_size>
void _qmm(
    T* result,
    const T* x,
    const uint32_t* w,
    const T* scales,
    const T* biases,
    int M,
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;
  constexpr int pack_factor = get_pack_factor(bits, 8);
  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
    // Every output row walks the full weight matrix from the start
    const uint8_t* w_local = (const uint8_t*)w;
    const T* scales_local = scales;
    const T* biases_local = biases;

    std::fill(result, result + N, 0);

    for (int k = 0; k < K; k++) {
      // Row k of the weights is scaled by x[m, k] and accumulated into the
      // whole output row
      T* result_local = result;
      T xi = *x++;

      for (int n = 0; n < N; n += group_size) {
        T scale = *scales_local++;
        T bias = *biases_local++;
        for (int ng = 0; ng < packs_in_group; ng++) {
          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            // Widths that do not divide a byte: unpack a multi-byte pack
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
#pragma clang loop unroll(full)
            for (int p = 0; p < pack_factor; p++) {
              (*result_local++) += xi * (scale * wl[p] + bias);
            }
            w_local += bytes_per_pack;

          } else {
            // Widths that divide a byte: peel values off one byte by
            // masking and shifting
            uint8_t wi = *w_local++;
#pragma clang loop unroll(full)
            for (int p = 0; p < pack_factor; p++) {
              (*result_local++) +=
                  xi * (scale * static_cast<T>(wi & bitmask) + bias);
              if (bits != 8) {
                wi >>= bits;
              }
            }
          }
        }
      }
    }

    result += N;
  }
}

// Reference quantized matmul with transposed weights:
//   result[m, n] = sum_k x[m, k] * (scale * q[n, k] + bias)
// where q[n, k] are the `bits`-wide integers packed in `w`, stored row by
// row (N rows of K values), and one (scale, bias) pair applies to each run
// of `group_size` consecutive values along K.
//
// result is M x N, x is M x K, both dense row-major. The inner loops consume
// whole groups, so K is assumed to be a multiple of group_size.
template <typename T, int bits, int group_size>
void _qmm_t(
    T* result,
    const T* x,
    const uint32_t* w,
    const T* scales,
    const T* biases,
    int M,
    int N,
    int K) {
  constexpr int bitmask = (1 << bits) - 1;

  constexpr int pack_factor = get_pack_factor(bits, 8);
  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  constexpr int packs_in_group = group_size / pack_factor;

  for (int m = 0; m < M; m++) {
    // Every output row walks the full weight matrix from the start
    const uint8_t* w_local = (const uint8_t*)w;
    const T* scales_local = scales;
    const T* biases_local = biases;

    for (int n = 0; n < N; n++) {
      // Dot product of row m of x with quantized row n of the weights
      const T* x_local = x;
      T sum = 0;
      for (int k = 0; k < K; k += group_size) {
        T scale = *scales_local++;
        T bias = *biases_local++;

        for (int kw = 0; kw < packs_in_group; kw++) {
          if constexpr (bits == 3 || bits == 5 || bits == 6) {
            // Widths that do not divide a byte: unpack a multi-byte pack
            T wl[pack_factor];
            extract_bits<T, bits>(w_local, wl);
#pragma clang loop unroll(full)
            for (int p = 0; p < pack_factor; p++) {
              sum += x_local[p] * (scale * wl[p] + bias);
            }
            w_local += bytes_per_pack;
            x_local += pack_factor;

          } else {
            // Widths that divide a byte: peel values off one byte by
            // masking and shifting
            uint8_t wi = *w_local++;
#pragma clang loop unroll(full)
            for (int p = 0; p < pack_factor; p++) {
              sum +=
                  (*x_local++) * (scale * static_cast<T>(wi & bitmask) + bias);
              if (bits != 8) {
                wi >>= bits;
              }
            }
          }
        }
      }
      *result = sum;
      result++;
    }

    x += K;
  }
}

// Unpack S consecutive `bits`-wide values starting at `w` into the lanes of
// a SIMD vector of uint32. Only (bits == 4, S == 8) and (bits == 8, S == 8)
// are implemented; any other instantiation throws std::runtime_error when
// called.
template <int bits, int S>
simd::Simd<uint32_t, S> extract_bits_simd(const uint32_t* w) {
  constexpr int bitmask = (1 << bits) - 1;
  simd::Simd<uint32_t, S> wi;
  if constexpr (bits == 4 && S == 8) {
    // One 32-bit word holds all eight 4-bit values: shift lane i right by
    // 4 * i and mask.
    // NOTE(review): assumes Simd<uint32_t, S>(scalar) broadcasts the scalar
    // to every lane -- confirm in simd.h.
    constexpr std::array<uint32_t, 8> shifts_ = {{0, 4, 8, 12, 16, 20, 24, 28}};
    auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
    wi = simd::Simd<uint32_t, S>(*w);
    wi = wi >> shifts;
    wi = wi & bitmask;
  } else if constexpr (bits == 8 && S == 8) {
    // Two consecutive words hold four 8-bit values each: the first word
    // feeds the low four lanes and the second the high four.
    // NOTE(review): assumes Simd(l, r) concatenates two half-width vectors
    // -- confirm in simd.h.
    constexpr std::array<uint32_t, 8> shifts_ = {{0, 8, 16, 24, 0, 8, 16, 24}};
    auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
    auto l = simd::Simd<uint32_t, S / 2>(*w++);
    auto r = simd::Simd<uint32_t, S / 2>(*w);
    wi = simd::Simd<uint32_t, S>(l, r);
    wi = wi >> shifts;
    wi = wi & bitmask;
  } else {
    // Appease compiler.. but should never get here
    throw std::runtime_error("Unsupported combination for simd qmm.");
  }
  return wi;
}

// SIMD variant of the transposed-weight quantized matmul:
//   result[m, n] = sum_k x[m, k] * (scale * q[n, k] + bias)
// Weights are read as 32-bit words, S values at a time (S is
// simd::max_size<T>), and products are accumulated in float before the
// final horizontal sum is converted to T.
//
// Requires S to be a multiple of the number of values per 32-bit word
// (enforced by the static_assert). The inner loops consume whole groups, so
// K is assumed to be a multiple of group_size.
template <typename T, int bits, int group_size>
void _qmm_t_simd(
    T* result,
    const T* x,
    const uint32_t* w,
    const T* scales,
    const T* biases,
    int M,
    int N,
    int K) {
  // Values per 32-bit word (not per byte as in the scalar kernels)
  constexpr int pack_factor = 32 / bits;
  constexpr int packs_in_group = group_size / pack_factor;
  constexpr int S = simd::max_size<T>;
  static_assert(
      S % pack_factor == 0, "SIMD size must be divisible by pack factor");
  // Number of 32-bit words consumed per SIMD vector of weights
  constexpr int packs_per_simd = S / pack_factor;

  for (int m = 0; m < M; m++) {
    // Every output row walks the full weight matrix from the start
    const uint32_t* w_local = w;
    const T* scales_local = scales;
    const T* biases_local = biases;

    for (int n = 0; n < N; n++) {
      // Per-lane partial sums for the dot product of x row m and weight row n
      simd::Simd<float, S> acc(0);
      auto x_local = x;
      for (int k = 0; k < K; k += group_size) {
        T scale = *scales_local++;
        T bias = *biases_local++;

        for (int kw = 0; kw < packs_in_group; kw += packs_per_simd) {
          // Dequantize S weights: unpack, convert to float, scale and shift
          auto wf = simd::Simd<float, S>(extract_bits_simd<bits, S>(w_local));
          w_local += packs_per_simd;
          wf = wf * scale;
          wf = wf + bias;
          simd::Simd<float, S> x_simd = simd::load<T, S>(x_local);
          acc = acc + x_simd * wf;
          x_local += S;
        }
      }

      // Reduce the lanes to a single value
      *result = T(simd::sum(acc));
      result++;
    }
    x += K;
  }
}

// Select the quantized matmul kernel for the weight layout.
// Non-transposed weights always use _qmm. Transposed weights use the SIMD
// kernel when the configuration allows it and the scalar _qmm_t otherwise.
template <typename T, int bits, int group_size>
void _qmm_dispatch_transpose(
    T* result,
    const T* x,
    const uint32_t* w,
    const T* scales,
    const T* biases,
    int M,
    int N,
    int K,
    bool transposed_w) {
  if (!transposed_w) {
    _qmm<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
    return;
  }

  // the simd size must be a multiple of the number of elements per word
  constexpr bool simd_ok =
      32 % bits == 0 && simd::max_size<T> % (32 / bits) == 0;
  if constexpr (simd_ok) {
    _qmm_t_simd<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
  } else {
    _qmm_t<T, bits, group_size>(result, x, w, scales, biases, M, N, K);
  }
}

// Turns the runtime `group_size` into a template argument and forwards to
// `_qmm_dispatch_transpose`. Only 32, 64 and 128 are accepted; any other
// value throws std::invalid_argument.
template <typename T, int bits>
void _qmm_dispatch_group(
    T* result,
    const T* x,
    const uint32_t* w,
    const T* scales,
    const T* biases,
    int M,
    int N,
    int K,
    int group_size,
    bool transposed_w) {
  if (group_size == 32) {
    _qmm_dispatch_transpose<T, bits, 32>(
        result, x, w, scales, biases, M, N, K, transposed_w);
  } else if (group_size == 64) {
    _qmm_dispatch_transpose<T, bits, 64>(
        result, x, w, scales, biases, M, N, K, transposed_w);
  } else if (group_size == 128) {
    _qmm_dispatch_transpose<T, bits, 128>(
        result, x, w, scales, biases, M, N, K, transposed_w);
  } else {
    throw std::invalid_argument(
        "Quantization group size must be 32, 64 or 128.");
  }
}

// Dispatches one (non-batched) affine quantized matmul on the runtime bit
// width by forwarding to `_qmm_dispatch_group<T, bits>`, which in turn
// resolves the group size.
//
// Supported bit widths are 2, 3, 4, 5, 6 and 8; any other value throws
// std::invalid_argument.
template <typename T>
void _qmm_dispatch_typed(
    T* result,
    const T* x,
    const uint32_t* w,
    const T* scales,
    const T* biases,
    int M,
    int N,
    int K,
    int group_size,
    int bits,
    bool transposed_w) {
  switch (bits) {
    case 2:
      _qmm_dispatch_group<T, 2>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
    case 3:
      _qmm_dispatch_group<T, 3>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
    case 4:
      _qmm_dispatch_group<T, 4>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
    case 5:
      _qmm_dispatch_group<T, 5>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
    case 6:
      _qmm_dispatch_group<T, 6>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
    case 8:
      _qmm_dispatch_group<T, 8>(
          result, x, w, scales, biases, M, N, K, group_size, transposed_w);
      break;
    default:
      // Keep this list in sync with the cases above.
      throw std::invalid_argument(
          "Quantization bits must be 2, 3, 4, 5, 6 or 8.");
  }
}

// Runs an affine quantized matmul for every batch element of `x`.
//
// M, K are taken from the trailing dims of `x` (M is 1 for 1-D input) and N
// from the last dim of `out`. When `w` has more than two dims, each batch
// element uses its own weight/scale/bias matrix; otherwise the per-batch
// element offset into them is zero. Offsets into the inputs are resolved
// with `elem_to_loc`, while `out` is indexed densely.
//
// NOTE(review): the parameter names `bits` and `group_size` are swapped
// relative to what they carry. The visible call chain
// (QuantizedMatmul::eval_cpu -> _qmm_dispatch) passes the group size first,
// and the call below hands `bits` to the pointer overload's `group_size`
// slot, so the values end up in the right place.
template <typename T>
void _qmm_dispatch_typed(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& biases,
    int bits,
    int group_size,
    bool transposed_w) {
  const bool batched_w = w.ndim() > 2;

  int N = out.shape(-1);
  int K = x.shape(-1);
  int M = 1;
  if (x.ndim() > 1) {
    M = x.shape(-2);
  }

  int w_els = 0;
  int g_els = 0;
  if (batched_w) {
    w_els = w.shape(-1) * w.shape(-2);
    g_els = scales.shape(-1) * scales.shape(-2);
  }
  int batch_size = x.size() / (K * M);

  auto x_ptr = x.data<T>();
  auto w_ptr = w.data<uint32_t>();
  auto scales_ptr = scales.data<T>();
  auto biases_ptr = biases.data<T>();
  auto out_ptr = out.data<T>();

  for (int b = 0; b < batch_size; ++b) {
    auto x_off = elem_to_loc(b * M * K, x.shape(), x.strides());
    auto w_off = elem_to_loc(b * w_els, w.shape(), w.strides());
    auto s_off = elem_to_loc(b * g_els, scales.shape(), scales.strides());
    auto b_off = elem_to_loc(b * g_els, biases.shape(), biases.strides());
    _qmm_dispatch_typed<T>(
        out_ptr + b * M * N,
        x_ptr + x_off,
        w_ptr + w_off,
        scales_ptr + s_off,
        biases_ptr + b_off,
        M,
        N,
        K,
        bits,
        group_size,
        transposed_w);
  }
}

// Entry point for affine quantized matmul on the CPU: selects the element
// type from `x.dtype()` (float32, float16 or bfloat16) and forwards every
// argument unchanged. Any other dtype throws std::invalid_argument.
//
// NOTE(review): QuantizedMatmul::eval_cpu calls this with
// (group_size_, bits_), so `bits` receives the group size and `group_size`
// receives the bit width. The array overload of `_qmm_dispatch_typed` swaps
// them back when it calls the pointer overload.
void _qmm_dispatch(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& biases,
    int bits,
    int group_size,
    bool transposed_w) {
  const auto in_type = x.dtype();
  if (in_type == float32) {
    _qmm_dispatch_typed<float>(
        out, x, w, scales, biases, bits, group_size, transposed_w);
  } else if (in_type == float16) {
    _qmm_dispatch_typed<float16_t>(
        out, x, w, scales, biases, bits, group_size, transposed_w);
  } else if (in_type == bfloat16) {
    _qmm_dispatch_typed<bfloat16_t>(
        out, x, w, scales, biases, bits, group_size, transposed_w);
  } else {
    throw std::invalid_argument(
        "[quantized_matmul] only floating types are supported");
  }
}

// Non-transposed matmul against fp4/fp8-packed weights.
//
// For each of the M rows of `x`, the N outputs are zeroed and then, for each
// of the K input elements, the matching weight row is walked byte by byte and
// `x[k] * scale * weight` is accumulated into the outputs. With bits == 4
// every byte holds two values (low nibble first) decoded through FP4_LUT;
// otherwise every byte is one value decoded with detail::FromFP8. One scale
// byte is consumed per group of `group_size` outputs.
template <typename T, int group_size, int bits>
void fp_qmm(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K) {
  constexpr int pack_factor = get_pack_factor(bits, 8);
  constexpr int packs_in_group = group_size / pack_factor;
  const uint8_t* const w_bytes = reinterpret_cast<const uint8_t*>(w);

  for (int row = 0; row < M; ++row, result += N) {
    // Weights and scales restart for every row of x.
    const uint8_t* w_cursor = w_bytes;
    const uint8_t* scale_cursor = scales;

    std::fill(result, result + N, 0);

    for (int k = 0; k < K; ++k) {
      T xi = *x++;
      T* out_cursor = result;

      for (int col = 0; col < N; col += group_size) {
        T scale = dequantize_scale<T, group_size>(*scale_cursor++);
        for (int pack = 0; pack < packs_in_group; ++pack, ++w_cursor) {
          if constexpr (bits == 4) {
            *out_cursor++ +=
                xi * scale * static_cast<T>(FP4_LUT[*w_cursor & 0xf]);
            *out_cursor++ +=
                xi * scale * static_cast<T>(FP4_LUT[(*w_cursor >> 4) & 0xf]);
          } else {
            *out_cursor++ +=
                xi * scale * static_cast<T>(detail::FromFP8{}(*w_cursor));
          }
        }
      }
    }
  }
}

// Transposed matmul against fp4/fp8-packed weights (scalar kernel).
//
// Each of the M x N outputs is a dot product between a row of `x` (K
// elements) and one packed weight row. The dot product is accumulated per
// quantization group, multiplied by that group's decoded scale, and summed.
// With bits == 4 each weight byte yields two values (low nibble first) via
// FP4_LUT; otherwise each byte is decoded with detail::FromFP8.
template <typename T, int group_size, int bits>
void fp_qmm_t(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K) {
  constexpr int pack_factor = get_pack_factor(bits, 8);
  constexpr int packs_in_group = group_size / pack_factor;
  const uint8_t* const w_bytes = reinterpret_cast<const uint8_t*>(w);

  for (int row = 0; row < M; ++row, x += K) {
    // Weights and scales restart for every row of x.
    const uint8_t* w_cursor = w_bytes;
    const uint8_t* scale_cursor = scales;

    for (int col = 0; col < N; ++col) {
      const T* x_cursor = x;
      T total = 0;

      for (int k = 0; k < K; k += group_size) {
        T scale = dequantize_scale<T, group_size>(*scale_cursor++);

        T group_total = 0;
        for (int pack = 0; pack < packs_in_group; ++pack, ++w_cursor) {
          if constexpr (bits == 4) {
            group_total +=
                (*x_cursor++) * static_cast<T>(FP4_LUT[*w_cursor & 0xf]);
            group_total +=
                (*x_cursor++) * static_cast<T>(FP4_LUT[(*w_cursor >> 4) & 0xf]);
          } else {
            group_total +=
                (*x_cursor++) * static_cast<T>(detail::FromFP8{}(*w_cursor));
          }
        }
        total += scale * group_total;
      }

      *result++ = total;
    }
  }
}

// Decodes S packed fp4/fp8 weights starting at `w` into a float SIMD vector.
//
// Only S == 8 is implemented:
//  - bits == 4: the eight nibbles of the single word `*w` are extracted
//    (lowest nibble first) and mapped through FP4_LUT lane by lane.
//  - bits == 8: eight consecutive bytes at `w` are loaded and converted with
//    detail::FromFP8.
// Any other combination throws std::runtime_error at run time.
template <int S, int bits>
simd::Simd<float, S> fp_extract_bits_simd(const uint32_t* w) {
  if constexpr (S == 8 && bits == 4) {
    // Per-lane shift amounts selecting nibble i of the 32-bit word.
    constexpr std::array<uint32_t, 8> shifts_ = {{0, 4, 8, 12, 16, 20, 24, 28}};
    // Reinterpret the shift table as a SIMD vector.
    auto shifts(*(simd::Simd<uint32_t, S>*)&shifts_);
    // Broadcast the packed word to all lanes, then isolate one nibble per lane.
    auto wi = simd::Simd<uint32_t, S>(*w);
    wi = wi >> shifts;
    wi = wi & 0xf;
    simd::Simd<float, S> w_out;
    // Table lookup is done lane by lane.
    for (int i = 0; i < S; ++i) {
      w_out[i] = FP4_LUT[wi[i]];
    }
    return w_out;
  } else if constexpr (S == 8 && bits == 8) {
    auto w_out = simd::load<uint8_t, S>(reinterpret_cast<const uint8_t*>(w));
    return detail::FromFP8{}(w_out);
  } else {
    // Appease compiler.. but should never get here
    throw std::runtime_error("Unsupported combination for simd qmm.");
  }
}

template <typename T, int group_size, int bits>
void fp_qmm_t_simd(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K) {
  constexpr int pack_factor = get_pack_factor(bits, 32);
  constexpr int packs_in_group = group_size / pack_factor;
  constexpr int S = simd::max_size<T>;
  static_assert(
      S % pack_factor == 0, "SIMD size must be divisible by pack factor");
  constexpr int packs_per_simd = S / pack_factor;

  for (int m = 0; m < M; m++) {
    const uint32_t* w_local = w;
    const uint8_t* scales_local = scales;

    for (int n = 0; n < N; n++) {
      simd::Simd<float, S> acc(0);
      auto x_local = x;
      for (int k = 0; k < K; k += group_size) {
        T scale = dequantize_scale<T, group_size>(*scales_local++);

        simd::Simd<float, S> g_acc(0);
        for (int kw = 0; kw < packs_in_group; kw += packs_per_simd) {
          // Extract bits
          auto wf = fp_extract_bits_simd<S, bits>(w_local);
          w_local += packs_per_simd;
          simd::Simd<float, S> x_simd = simd::load<T, S>(x_local);
          g_acc = g_acc + x_simd * wf;
          x_local += S;
        }
        acc = acc + scale * g_acc;
      }

      *result = T(simd::sum(acc));
      result++;
    }
    x += K;
  }
}

// Picks the fp-quantized matmul kernel for one matrix product.
//
// Non-transposed weights use `fp_qmm`. Transposed weights use the SIMD kernel
// when the SIMD width for T is a multiple of 8, and the scalar `fp_qmm_t`
// otherwise.
template <typename T, int group_size, int bits>
void fp_qmm_dispatch_transpose(
    T* result,
    const T* x,
    const uint32_t* w,
    const uint8_t* scales,
    int M,
    int N,
    int K,
    bool transposed_w) {
  if (!transposed_w) {
    fp_qmm<T, group_size, bits>(result, x, w, scales, M, N, K);
    return;
  }

  // the simd size must be a multiple of the number of elements per word
  constexpr bool simd_compatible = simd::max_size<T> % 8 == 0;
  if constexpr (simd_compatible) {
    fp_qmm_t_simd<T, group_size, bits>(result, x, w, scales, M, N, K);
  } else {
    fp_qmm_t<T, group_size, bits>(result, x, w, scales, M, N, K);
  }
}

// Runs an fp-quantized matmul for every batch element of `x`.
//
// M, K come from the trailing dims of `x` (M is 1 for 1-D input) and N from
// the last dim of `out`. When `w` has more than two dims each batch element
// gets its own weight/scale matrix; otherwise the per-batch element offset is
// zero. Input offsets go through `elem_to_loc`; `out` is indexed densely.
template <typename T, int group_size, int bits>
void fp_qmm_dispatch_mode(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    bool transposed_w) {
  const bool batched_w = w.ndim() > 2;

  int N = out.shape(-1);
  int K = x.shape(-1);
  int M = 1;
  if (x.ndim() > 1) {
    M = x.shape(-2);
  }

  int w_els = 0;
  int g_els = 0;
  if (batched_w) {
    w_els = w.shape(-1) * w.shape(-2);
    g_els = scales.shape(-1) * scales.shape(-2);
  }
  int batch_size = x.size() / (K * M);

  auto x_ptr = x.data<T>();
  auto w_ptr = w.data<uint32_t>();
  auto scales_ptr = scales.data<uint8_t>();
  auto out_ptr = out.data<T>();

  for (int b = 0; b < batch_size; ++b) {
    auto x_off = elem_to_loc(b * M * K, x.shape(), x.strides());
    auto w_off = elem_to_loc(b * w_els, w.shape(), w.strides());
    auto s_off = elem_to_loc(b * g_els, scales.shape(), scales.strides());
    fp_qmm_dispatch_transpose<T, group_size, bits>(
        out_ptr + b * M * N,
        x_ptr + x_off,
        w_ptr + w_off,
        scales_ptr + s_off,
        M,
        N,
        K,
        transposed_w);
  }
}

// Maps the runtime (group_size, bits) pair onto one of the three compiled
// fp-quantized configurations:
//   bits == 8            -> group size 32, 8 bits
//   group_size == 32     -> group size 32, 4 bits
//   anything else        -> group size 16, 4 bits
template <typename T>
void fp_qmm_dispatch_typed(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    int group_size,
    int bits,
    bool transposed_w) {
  if (bits == 8) {
    fp_qmm_dispatch_mode<T, 32, 8>(out, x, w, scales, transposed_w);
    return;
  }
  if (group_size == 32) {
    fp_qmm_dispatch_mode<T, 32, 4>(out, x, w, scales, transposed_w);
    return;
  }
  fp_qmm_dispatch_mode<T, 16, 4>(out, x, w, scales, transposed_w);
}

// Entry point for fp-quantized matmul on the CPU: selects the element type
// from `x.dtype()` (bfloat16, float16 or float32) and forwards every argument
// unchanged. Any other dtype throws std::invalid_argument.
void fp_qmm_dispatch(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    int group_size,
    int bits,
    bool transposed_w) {
  const auto in_type = x.dtype();
  if (in_type == bfloat16) {
    fp_qmm_dispatch_typed<bfloat16_t>(
        out, x, w, scales, group_size, bits, transposed_w);
  } else if (in_type == float16) {
    fp_qmm_dispatch_typed<float16_t>(
        out, x, w, scales, group_size, bits, transposed_w);
  } else if (in_type == float32) {
    fp_qmm_dispatch_typed<float>(
        out, x, w, scales, group_size, bits, transposed_w);
  } else {
    throw std::invalid_argument(
        "[quantized_matmul] only floating types are supported");
  }
}

// Gathered affine quantized matmul: for every entry i of the index arrays,
// multiplies the `x` matrix selected by lhs_indices[i] with the quantized
// weight matrix selected by rhs_indices[i] and writes the M x N product to
// the i-th dense slot of `out`.
//
// NOTE(review): as in the non-gathered path, `bits` and `group_size` are
// named the wrong way round here. GatherQMM::eval_cpu passes the group size
// first, and the call below hands `bits` to the pointer overload's
// `group_size` slot, so the values are ultimately correct.
template <typename T>
void _bs_qmm_dispatch_typed(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& biases,
    const array& lhs_indices,
    const array& rhs_indices,
    int bits,
    int group_size,
    bool transposed_w) {
  int N = out.shape(-1);
  int M = x.shape(-2);
  int K = x.shape(-1);

  // Element counts of one weight matrix and one scale/bias matrix.
  int w_els = w.shape(-1) * w.shape(-2);
  int g_els = scales.shape(-1) * scales.shape(-2);

  auto lhs_indices_ptr = lhs_indices.data<uint32_t>();
  auto rhs_indices_ptr = rhs_indices.data<uint32_t>();
  auto x_ptr = x.data<T>();
  auto w_ptr = w.data<uint32_t>();
  auto scales_ptr = scales.data<T>();
  auto biases_ptr = biases.data<T>();
  auto out_ptr = out.data<T>();

  const auto n_pairs = lhs_indices.size();
  for (int i = 0; i < n_pairs; ++i) {
    auto lhs_loc = elem_to_loc(i, lhs_indices.shape(), lhs_indices.strides());
    auto rhs_loc = elem_to_loc(i, rhs_indices.shape(), rhs_indices.strides());
    int x_idx = lhs_indices_ptr[lhs_loc];
    int w_idx = rhs_indices_ptr[rhs_loc];

    auto x_off = elem_to_loc(x_idx * M * K, x.shape(), x.strides());
    auto w_off = elem_to_loc(w_idx * w_els, w.shape(), w.strides());
    auto s_off = elem_to_loc(w_idx * g_els, scales.shape(), scales.strides());
    auto b_off = elem_to_loc(w_idx * g_els, biases.shape(), biases.strides());

    _qmm_dispatch_typed<T>(
        out_ptr + i * M * N,
        x_ptr + x_off,
        w_ptr + w_off,
        scales_ptr + s_off,
        biases_ptr + b_off,
        M,
        N,
        K,
        bits,
        group_size,
        transposed_w);
  }
}

// Entry point for gathered affine quantized matmul on the CPU: selects the
// element type from `x.dtype()` (float32, float16 or bfloat16) and forwards
// every argument unchanged. Any other dtype throws std::invalid_argument.
//
// NOTE(review): GatherQMM::eval_cpu calls this with (group_size_, bits_), so
// `bits` receives the group size and `group_size` the bit width; the typed
// helper swaps them back.
void _bs_qmm_dispatch(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& biases,
    const array& lhs_indices,
    const array& rhs_indices,
    int bits,
    int group_size,
    bool transposed_w) {
  const auto in_type = x.dtype();
  if (in_type == float32) {
    _bs_qmm_dispatch_typed<float>(
        out,
        x,
        w,
        scales,
        biases,
        lhs_indices,
        rhs_indices,
        bits,
        group_size,
        transposed_w);
  } else if (in_type == float16) {
    _bs_qmm_dispatch_typed<float16_t>(
        out,
        x,
        w,
        scales,
        biases,
        lhs_indices,
        rhs_indices,
        bits,
        group_size,
        transposed_w);
  } else if (in_type == bfloat16) {
    _bs_qmm_dispatch_typed<bfloat16_t>(
        out,
        x,
        w,
        scales,
        biases,
        lhs_indices,
        rhs_indices,
        bits,
        group_size,
        transposed_w);
  } else {
    throw std::invalid_argument(
        "[quantized_matmul] only floating types are supported");
  }
}
// Gathered fp-quantized matmul: for every entry i of the index arrays,
// multiplies the `x` matrix selected by lhs_indices[i] with the packed weight
// matrix selected by rhs_indices[i] and writes the M x N product to the i-th
// dense slot of `out`.
template <typename T, int group_size, int bits>
void fp_bs_qmm_dispatch_mode(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& lhs_indices,
    const array& rhs_indices,
    bool transposed_w) {
  int N = out.shape(-1);
  int M = x.shape(-2);
  int K = x.shape(-1);

  // Element counts of one weight matrix and one scale matrix.
  int w_els = w.shape(-1) * w.shape(-2);
  int g_els = scales.shape(-1) * scales.shape(-2);

  auto lhs_indices_ptr = lhs_indices.data<uint32_t>();
  auto rhs_indices_ptr = rhs_indices.data<uint32_t>();
  auto x_ptr = x.data<T>();
  auto w_ptr = w.data<uint32_t>();
  auto scales_ptr = scales.data<uint8_t>();
  auto out_ptr = out.data<T>();

  const auto n_pairs = lhs_indices.size();
  for (int i = 0; i < n_pairs; ++i) {
    auto lhs_loc = elem_to_loc(i, lhs_indices.shape(), lhs_indices.strides());
    auto rhs_loc = elem_to_loc(i, rhs_indices.shape(), rhs_indices.strides());
    int x_idx = lhs_indices_ptr[lhs_loc];
    int w_idx = rhs_indices_ptr[rhs_loc];

    auto x_off = elem_to_loc(x_idx * M * K, x.shape(), x.strides());
    auto w_off = elem_to_loc(w_idx * w_els, w.shape(), w.strides());
    auto s_off = elem_to_loc(w_idx * g_els, scales.shape(), scales.strides());

    fp_qmm_dispatch_transpose<T, group_size, bits>(
        out_ptr + i * M * N,
        x_ptr + x_off,
        w_ptr + w_off,
        scales_ptr + s_off,
        M,
        N,
        K,
        transposed_w);
  }
}

// Maps the runtime (group_size, bits) pair onto one of the three compiled
// gathered fp-quantized configurations:
//   bits == 8            -> group size 32, 8 bits
//   group_size == 32     -> group size 32, 4 bits
//   anything else        -> group size 16, 4 bits
template <typename T>
void fp_bs_qmm_dispatch_typed(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& lhs_indices,
    const array& rhs_indices,
    int group_size,
    int bits,
    bool transposed_w) {
  if (bits == 8) {
    fp_bs_qmm_dispatch_mode<T, 32, 8>(
        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
    return;
  }
  if (group_size == 32) {
    fp_bs_qmm_dispatch_mode<T, 32, 4>(
        out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
    return;
  }
  fp_bs_qmm_dispatch_mode<T, 16, 4>(
      out, x, w, scales, lhs_indices, rhs_indices, transposed_w);
}

// Entry point for gathered fp-quantized matmul on the CPU: selects the
// element type from `x.dtype()` (float32, float16 or bfloat16) and forwards
// every argument unchanged. Any other dtype throws std::invalid_argument.
void fp_bs_qmm_dispatch(
    array& out,
    const array& x,
    const array& w,
    const array& scales,
    const array& lhs_indices,
    const array& rhs_indices,
    int group_size,
    int bits,
    bool transposed_w) {
  const auto in_type = x.dtype();
  if (in_type == float32) {
    fp_bs_qmm_dispatch_typed<float>(
        out,
        x,
        w,
        scales,
        lhs_indices,
        rhs_indices,
        group_size,
        bits,
        transposed_w);
  } else if (in_type == float16) {
    fp_bs_qmm_dispatch_typed<float16_t>(
        out,
        x,
        w,
        scales,
        lhs_indices,
        rhs_indices,
        group_size,
        bits,
        transposed_w);
  } else if (in_type == bfloat16) {
    fp_bs_qmm_dispatch_typed<bfloat16_t>(
        out,
        x,
        w,
        scales,
        lhs_indices,
        rhs_indices,
        group_size,
        bits,
        transposed_w);
  } else {
    throw std::invalid_argument(
        "[quantized_matmul] only floating types are supported");
  }
}

} // namespace

// CPU evaluation of a quantized matmul.
//
// inputs[0] is x, inputs[1] the packed weights, inputs[2] the scales and, in
// affine mode only, inputs[3] the biases. All of them are passed through
// ensure_row_contiguous before use. The actual computation is queued on the
// stream's command encoder: affine mode runs `_qmm_dispatch`, every other
// mode runs `fp_qmm_dispatch`.
void QuantizedMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];

  auto& encoder = cpu::get_command_encoder(stream());
  auto x = ensure_row_contiguous(x_pre, encoder, stream());
  auto w = ensure_row_contiguous(w_pre, encoder, stream());
  auto scales = ensure_row_contiguous(scales_pre, encoder, stream());

  out.set_data(allocator::malloc(out.nbytes()));

  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
  encoder.set_output_array(out);
  if (mode_ == QuantizationMode::Affine) {
    // Only the affine mode carries a biases input.
    auto biases = ensure_row_contiguous(inputs[3], encoder, stream());
    encoder.set_input_array(biases);
    encoder.dispatch([out = array::unsafe_weak_copy(out),
                      x = array::unsafe_weak_copy(x),
                      w = array::unsafe_weak_copy(w),
                      scales = array::unsafe_weak_copy(scales),
                      biases = array::unsafe_weak_copy(biases),
                      group_size_ = group_size_,
                      bits_ = bits_,
                      transpose_ = transpose_]() mutable {
      // NOTE(review): _qmm_dispatch declares these two parameters as
      // (bits, group_size); the group size is nevertheless passed first here
      // and the callee chain swaps them back before use.
      _qmm_dispatch(out, x, w, scales, biases, group_size_, bits_, transpose_);
    });
  } else {
    encoder.dispatch([out = array::unsafe_weak_copy(out),
                      x = array::unsafe_weak_copy(x),
                      w = array::unsafe_weak_copy(w),
                      scales = array::unsafe_weak_copy(scales),
                      group_size_ = group_size_,
                      bits_ = bits_,
                      transpose_ = transpose_]() mutable {
      fp_qmm_dispatch(out, x, w, scales, group_size_, bits_, transpose_);
    });
  }
}

// CPU evaluation of a gathered quantized matmul.
//
// inputs[0] is x, inputs[1] the packed weights, inputs[2] the scales and, in
// affine mode only, inputs[3] the biases. The last two inputs are the lhs and
// rhs index arrays. The computation is queued on the stream's command
// encoder: affine mode runs `_bs_qmm_dispatch`, every other mode runs
// `fp_bs_qmm_dispatch`.
void GatherQMM::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& x_pre = inputs[0];
  auto& w_pre = inputs[1];
  auto& scales_pre = inputs[2];
  auto& lhs_indices = inputs[inputs.size() - 2];
  auto& rhs_indices = inputs[inputs.size() - 1];

  auto& encoder = cpu::get_command_encoder(stream());
  // Returns `arr` unchanged when its last two dims are already row contiguous
  // (last stride 1, second-to-last stride equal to the last dim); otherwise
  // returns a general copy that is registered as a temporary on the encoder.
  auto ensure_row_contiguous_last_dims = [s = stream(),
                                          &encoder](const array& arr) {
    auto stride_0 = arr.strides()[arr.ndim() - 2];
    auto stride_1 = arr.strides()[arr.ndim() - 1];
    if (stride_0 == arr.shape(-1) && stride_1 == 1) {
      return arr;
    } else {
      auto arr_cpy = array(arr.shape(), arr.dtype(), nullptr, {});
      copy_cpu(arr, arr_cpy, CopyType::General, s);
      encoder.add_temporary(arr_cpy);
      return arr_cpy;
    }
  };

  auto x = ensure_row_contiguous_last_dims(x_pre);
  auto w = ensure_row_contiguous_last_dims(w_pre);
  auto scales = ensure_row_contiguous_last_dims(scales_pre);

  out.set_data(allocator::malloc(out.nbytes()));

  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(scales);
  encoder.set_input_array(lhs_indices);
  encoder.set_input_array(rhs_indices);
  encoder.set_output_array(out);
  if (mode_ == QuantizationMode::Affine) {
    // Only the affine mode carries a biases input.
    auto biases = ensure_row_contiguous_last_dims(inputs[3]);
    encoder.set_input_array(biases);
    encoder.dispatch([out = array::unsafe_weak_copy(out),
                      x = array::unsafe_weak_copy(x),
                      w = array::unsafe_weak_copy(w),
                      scales = array::unsafe_weak_copy(scales),
                      biases = array::unsafe_weak_copy(biases),
                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
                      group_size_ = group_size_,
                      bits_ = bits_,
                      transpose_ = transpose_]() mutable {
      // NOTE(review): _bs_qmm_dispatch declares these two parameters as
      // (bits, group_size); the group size is nevertheless passed first here
      // and the callee chain swaps them back before use.
      _bs_qmm_dispatch(
          out,
          x,
          w,
          scales,
          biases,
          lhs_indices,
          rhs_indices,
          group_size_,
          bits_,
          transpose_);
    });
  } else {
    encoder.dispatch([out = array::unsafe_weak_copy(out),
                      x = array::unsafe_weak_copy(x),
                      w = array::unsafe_weak_copy(w),
                      scales = array::unsafe_weak_copy(scales),
                      lhs_indices = array::unsafe_weak_copy(lhs_indices),
                      rhs_indices = array::unsafe_weak_copy(rhs_indices),
                      group_size_ = group_size_,
                      bits_ = bits_,
                      transpose_ = transpose_]() mutable {
      fp_bs_qmm_dispatch(
          out,
          x,
          w,
          scales,
          lhs_indices,
          rhs_indices,
          group_size_,
          bits_,
          transpose_);
    });
  }
}

// Encodes a float as an 8-bit power-of-two exponent with a bias of 127.
//
// - Non-finite inputs (NaN, +/-inf) map to 0xFF.
// - Zero and negative inputs map to 0x00.
// - Any other input maps to round(log2(x)) clamped to [-127, 127], plus 127.
uint8_t to_fp8_e8m0(float x) {
  if (!std::isfinite(x)) {
    return 0xFF;
  }
  // log2(0) is -infinity and converting an infinite float to int is undefined
  // behavior, so (signed) zero is handled together with negative values.
  if (x <= 0.0f) {
    return 0x00;
  }
  const int exponent = static_cast<int>(std::round(std::log2(x)));
  const int clamped = std::min(std::max(exponent, -127), 127);
  return static_cast<uint8_t>(clamped + 127);
}

// Encodes a float as a 4-bit code: bit 3 is the sign, bits 0-2 select one of
// eight magnitude buckets. NaN maps to 0x7. The bucket boundaries alternate
// between strict and inclusive comparisons, so values exactly on a boundary
// go to the bucket chosen by the comparisons below.
uint8_t to_fp4_e2m1(float x) {
  if (std::isnan(x)) {
    return 0x7;
  }

  // Buckets are searched from the largest magnitude down.
  const auto magnitude_code = [](float mag) -> uint8_t {
    if (mag > 5.0f) {
      return 0x7;
    }
    if (mag >= 3.5f) {
      return 0x6;
    }
    if (mag > 2.5f) {
      return 0x5;
    }
    if (mag >= 1.75f) {
      return 0x4;
    }
    if (mag > 1.25f) {
      return 0x3;
    }
    if (mag >= 0.75f) {
      return 0x2;
    }
    if (mag > 0.25f) {
      return 0x1;
    }
    return 0x0;
  };

  const uint8_t sign_bit = std::signbit(x) ? 0x8 : 0x0;
  const uint8_t code = magnitude_code(std::abs(x));
  return code | sign_bit;
}

// Round-trips `w_arr` through fp quantization and writes the dequantized
// values to `out_arr`.
//
// The first `w_size` elements are processed in consecutive groups of
// `group_size`. For each group the largest absolute value is divided by 6
// (bits == 4) or 448 (otherwise) to get a scale, which is itself rounded
// through an 8-bit encoding (detail::ToFP8 for group_size 16, to_fp8_e8m0
// otherwise). Every element is then divided by that scale, rounded through
// fp8 (bits == 8) or fp4, and multiplied by the scale again. A scale of zero
// produces zeros.
template <typename T>
void fp_quantize_dequantize(
    const array& w_arr,
    array& out_arr,
    int bits,
    int group_size,
    size_t w_size) {
  auto w = w_arr.data<T>();
  auto out = out_arr.data<T>();

  const size_t n_groups = w_size / group_size;
  const bool use_fp8_values = bits == 8;

  for (size_t g = 0; g < n_groups; ++g) {
    const size_t base = g * group_size;

    // Largest magnitude in the group.
    float scale = -std::numeric_limits<float>::infinity();
    for (int j = 0; j < group_size; ++j) {
      scale = std::max(scale, std::abs(w[base + j]));
    }

    const float max_representable = bits == 4 ? 6.0f : 448.0f;
    scale /= max_representable;

    // Round the scale through its storage format.
    if (group_size != 16) {
      scale = dequantize_scale<float, 32>(to_fp8_e8m0(scale));
    } else {
      scale = dequantize_scale<float, 16>(detail::ToFP8()(scale));
    }

    for (int j = 0; j < group_size; ++j) {
      float w_el = scale == 0 ? 0.0f : w[base + j] / scale;
      float output;
      if (use_fp8_values) {
        output = detail::FromFP8()(detail::ToFP8()(w_el));
      } else {
        output = FP4_LUT[to_fp4_e2m1(w_el)];
      }
      out[base + j] = static_cast<T>(scale * output);
    }
  }
}

// Selects the element type from `w.dtype()` (float16, bfloat16 or float32)
// and runs `fp_quantize_dequantize` over all `w.size()` elements. Any other
// dtype throws std::runtime_error.
void dispatch_quantize_dequantize(
    const array& w,
    array& out,
    int bits,
    int group_size) {
  switch (w.dtype()) {
    case float16:
      fp_quantize_dequantize<float16_t>(w, out, bits, group_size, w.size());
      break;
    case bfloat16:
      fp_quantize_dequantize<bfloat16_t>(w, out, bits, group_size, w.size());
      break;
    case float32:
      fp_quantize_dequantize<float>(w, out, bits, group_size, w.size());
      break;
    default:
      throw std::runtime_error(
          "[quantize_dequantize] Only supports floating point inputs");
  }
}

// Affine quantization of `w` into packed integers plus per-group scale/bias.
//
// The first `w_size` elements are processed in consecutive groups of
// `group_size`. For each group:
//   1. The min and max are found and an initial scale is derived from their
//      range (floored at a small epsilon). The scale's sign and the "edge"
//      value depend on which of min/max has the larger magnitude.
//   2. If the edge quantizes to a non-zero integer, the scale is refined to
//      edge / q0 and the bias is set to the edge; otherwise the bias is 0.
//   3. Each element is quantized as rint((w - bias) / scale), clamped to
//      [0, 2^bits - 1] and packed `bits` at a time into a 64-bit word.
//   4. The word is stored whole for power-of-two bit widths, or spilled as
//      5 bytes (bits == 5) or 3 bytes (other widths), least significant
//      byte first.
template <typename T, typename U>
void quantize(
    const T* w,
    U* out,
    T* scales,
    T* biases,
    int bits,
    int group_size,
    size_t w_size) {
  const float n_bins = (1 << bits) - 1;
  const float eps = 1e-7;

  const bool power_of_2_bits = is_power_of_2(bits);
  const int el_per_int = get_pack_factor(bits, 32);
  const int bytes_per_pack = get_bytes_per_pack(bits);
  const int int_per_group = group_size * bytes_per_pack / el_per_int;
  const int packs_per_group = int_per_group / bytes_per_pack;
  const size_t n_groups = w_size / group_size;

  for (size_t g = 0; g < n_groups; ++g) {
    const T* group = w + g * group_size;

    // Range of the group.
    float lo = std::numeric_limits<float>::infinity();
    float hi = -lo;
    for (int j = 0; j < group_size; ++j) {
      const float v = (float)group[j];
      hi = std::max(hi, v);
      lo = std::min(lo, v);
    }

    const bool min_dominates = std::abs(lo) > std::abs(hi);
    float scale = std::max((hi - lo) / n_bins, eps);
    if (!min_dominates) {
      scale = -scale;
    }
    const float edge = min_dominates ? lo : hi;

    float bias = 0;
    const float q0 = std::rint(edge / scale);
    if (q0 != 0) {
      scale = edge / q0;
      bias = edge;
    }

    const size_t out_idx = g * int_per_group;
    for (int j = 0; j < packs_per_group; ++j) {
      // Pack el_per_int quantized values, lowest bits first.
      uint64_t packed = 0;
      for (int k = 0; k < el_per_int; ++k) {
        float q = group[j * el_per_int + k];
        q = std::rint((q - bias) / scale);
        q = std::min(std::max(q, 0.0f), n_bins);
        packed |= static_cast<uint64_t>(q) << (k * bits);
      }

      if (power_of_2_bits) {
        out[out_idx + j] = packed;
      } else {
        // Spill byte by byte, least significant first.
        const int n_bytes = (bits == 5) ? 5 : 3;
        for (int b = 0; b < n_bytes; ++b) {
          out[out_idx + bytes_per_pack * j + b] = (packed >> (8 * b)) & 0xff;
        }
      }
    }

    scales[g] = static_cast<T>(scale);
    biases[g] = static_cast<T>(bias);
  }
}

// Unwraps the raw data pointers of the arrays and runs `quantize<T, U>` over
// all `w.size()` elements. T is the element type of `w`, `scales` and
// `biases`; U is the storage type of the packed output.
template <typename T, typename U>
void dispatch_quantize(
    const array& w,
    array& out,
    array& scales,
    array& biases,
    int bits,
    int group_size) {
  const T* src = w.data<T>();
  U* packed = out.data<U>();
  T* scale_dst = scales.data<T>();
  T* bias_dst = biases.data<T>();
  quantize<T, U>(src, packed, scale_dst, bias_dst, bits, group_size, w.size());
}

// CPU evaluation of affine quantization.
//
// inputs[0] is the array to quantize (made row contiguous first).
// outputs[0] receives the packed weights, outputs[1] the scales and
// outputs[2] the biases; all three are allocated here and written by the
// queued kernel. Power-of-two bit widths pack into uint32_t words, other
// widths into bytes. Non-floating inputs throw std::runtime_error when the
// queued task runs.
void fast::Quantize::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& encoder = cpu::get_command_encoder(stream());
  auto w = ensure_row_contiguous(inputs[0], encoder, stream());
  auto& out = outputs[0];
  out.set_data(allocator::malloc(out.nbytes()));

  auto& scales = outputs[1];
  auto& biases = outputs[2];
  scales.set_data(allocator::malloc(scales.nbytes()));
  biases.set_data(allocator::malloc(biases.nbytes()));
  encoder.set_input_array(w);
  // scales and biases are written by the kernel, so they are registered as
  // outputs alongside the packed weights.
  encoder.set_output_array(out);
  encoder.set_output_array(scales);
  encoder.set_output_array(biases);
  encoder.dispatch([w = array::unsafe_weak_copy(w),
                    out = array::unsafe_weak_copy(out),
                    scales = array::unsafe_weak_copy(scales),
                    biases = array::unsafe_weak_copy(biases),
                    group_size_ = group_size_,
                    bits_ = bits_]() mutable {
    if (w.dtype() == float16) {
      if (is_power_of_2(bits_)) {
        dispatch_quantize<float16_t, uint32_t>(
            w, out, scales, biases, bits_, group_size_);
      } else {
        dispatch_quantize<float16_t, uint8_t>(
            w, out, scales, biases, bits_, group_size_);
      }
    } else if (w.dtype() == bfloat16) {
      if (is_power_of_2(bits_)) {
        dispatch_quantize<bfloat16_t, uint32_t>(
            w, out, scales, biases, bits_, group_size_);
      } else {
        dispatch_quantize<bfloat16_t, uint8_t>(
            w, out, scales, biases, bits_, group_size_);
      }
    } else if (w.dtype() == float32) {
      if (is_power_of_2(bits_)) {
        dispatch_quantize<float, uint32_t>(
            w, out, scales, biases, bits_, group_size_);
      } else {
        dispatch_quantize<float, uint8_t>(
            w, out, scales, biases, bits_, group_size_);
      }
    } else {
      throw std::runtime_error(
          "[fast::Quantize::eval_cpu] Only supports floating point inputs");
    }
  });
}

void fast::ConvertFP8::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& in = inputs[0];
  auto& out = outputs[0];
  set_unary_output_data(in, out);
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  encoder.dispatch([in = array::unsafe_weak_copy(in),
                    out = array::unsafe_weak_copy(out),
                    to_fp8 = to_fp8_]() mutable {
    if (to_fp8) {
      switch (in.dtype()) {
        case float16:
          unary_op<float16_t, uint8_t>(in, out, detail::ToFP8());
          break;
        case bfloat16:
          unary_op<bfloat16_t, uint8_t>(in, out, detail::ToFP8());
          break;
        default:
          unary_op<float, uint8_t>(in, out, detail::ToFP8());
          break;
      }
    } else {
      switch (out.dtype()) {
        case float16:
          unary_op<uint8_t, float16_t>(in, out, detail::FromFP8());
          break;
        case bfloat16:
          unary_op<uint8_t, bfloat16_t>(in, out, detail::FromFP8());
          break;
        default:
          unary_op<uint8_t, float>(in, out, detail::FromFP8());
          break;
      }
    }
  });
}

// CPU evaluation of a matmul where the activations are quantized on the fly.
//
// Only one case is implemented: the weights (inputs[1]) are already packed
// as uint32 and x (inputs[0]) has a second-to-last dimension of 1. In that
// case x is round-tripped through fp quantization into `xhat` and then
// multiplied with the packed weights via the transposed fp kernel. Every
// other case throws std::runtime_error.
void QQMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cpu::get_command_encoder(stream());

  bool w_quantized = (inputs[1].dtype() == uint32);
  if (w_quantized && inputs[0].shape(-2) == 1) {
    bool donate_x = inputs[0].is_donatable();
    auto x = ensure_row_contiguous(inputs[0], encoder, stream());
    auto w = ensure_row_contiguous(inputs[1], encoder, stream());
    auto scales = ensure_row_contiguous(inputs[2], encoder, stream());

    out.set_data(allocator::malloc(out.nbytes()));

    // If x is a copy it should be donatable
    donate_x |= x.is_donatable();
    // When x can be donated the round trip is written back into x itself;
    // otherwise a scratch array of the same shape and dtype is allocated and
    // kept alive as an encoder temporary.
    auto xhat = donate_x
        ? x
        : array(allocator::malloc(x.nbytes()), x.shape(), x.dtype());
    if (!donate_x) {
      encoder.add_temporary(xhat);
    }
    encoder.set_input_array(x);
    encoder.set_input_array(w);
    encoder.set_input_array(scales);
    encoder.set_output_array(out);
    encoder.dispatch([out = array::unsafe_weak_copy(out),
                      x = array::unsafe_weak_copy(x),
                      xhat = array::unsafe_weak_copy(xhat),
                      w = array::unsafe_weak_copy(w),
                      scales = array::unsafe_weak_copy(scales),
                      group_size_ = group_size_,
                      bits_ = bits_]() mutable {
      dispatch_quantize_dequantize(x, xhat, bits_, group_size_);
      // Weights are always treated as transposed here.
      fp_qmm_dispatch(out, xhat, w, scales, group_size_, bits_, true);
    });
    return;
  } else {
    throw std::runtime_error("[QQMatmul] NYI for the general case");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/reduce.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cassert>
#include <functional>
#include <limits>

#include "mlx/backend/common/reduce.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Per-type `max` / `min` constants. The primary template only declares the
// members; the values come from the specializations that follow.
template <typename U>
struct Limits {
  static const U max;
  static const U min;
};

// Specializes Limits<type> with the values reported by std::numeric_limits:
// `max` is numeric_limits<type>::max() and `min` is numeric_limits<type>::min().
#define instantiate_default_limit(type)                           \
  template <>                                                     \
  struct Limits<type> {                                           \
    static constexpr type max = std::numeric_limits<type>::max(); \
    static constexpr type min = std::numeric_limits<type>::min(); \
  };

// Integer types use the std::numeric_limits bounds.
instantiate_default_limit(uint8_t);
instantiate_default_limit(uint16_t);
instantiate_default_limit(uint32_t);
instantiate_default_limit(uint64_t);
instantiate_default_limit(int8_t);
instantiate_default_limit(int16_t);
instantiate_default_limit(int32_t);
instantiate_default_limit(int64_t);

// Floating-point and complex element types: the specialization only declares
// the two members. Their values are supplied by out-of-line definitions
// elsewhere in this file.
#define instantiate_float_limit(type) \
  template <>                         \
  struct Limits<type> {               \
    static const type max;            \
    static const type min;            \
  };

instantiate_float_limit(float16_t);
instantiate_float_limit(bfloat16_t);
instantiate_float_limit(float);
instantiate_float_limit(double);
instantiate_float_limit(complex64_t);

// bool: the largest value is `true` and the smallest is `false`.
template <>
struct Limits<bool> {
  static constexpr bool max = true;
  static constexpr bool min = false;
};

// Out-of-line values for the floating-point and complex specializations of
// Limits. `max` is +infinity and `min` is -infinity rather than the largest
// finite values. The 16-bit float types and complex64_t are initialized from a
// float infinity, double from a double infinity.
// NOTE(review): for complex64_t the float presumably initializes only the real
// part — confirm against complex64_t's converting constructor.
const float Limits<float>::max = std::numeric_limits<float>::infinity();
const float Limits<float>::min = -std::numeric_limits<float>::infinity();
const bfloat16_t Limits<bfloat16_t>::max =
    std::numeric_limits<float>::infinity();
const bfloat16_t Limits<bfloat16_t>::min =
    -std::numeric_limits<float>::infinity();
const float16_t Limits<float16_t>::max = std::numeric_limits<float>::infinity();
const float16_t Limits<float16_t>::min =
    -std::numeric_limits<float>::infinity();
const double Limits<double>::max = std::numeric_limits<double>::infinity();
const double Limits<double>::min = -std::numeric_limits<double>::infinity();
const complex64_t Limits<complex64_t>::max =
    std::numeric_limits<float>::infinity();
const complex64_t Limits<complex64_t>::min =
    -std::numeric_limits<float>::infinity();

// Folds `size` consecutive rows of `stride` contiguous elements, read from
// `x`, into the `stride` running values stored at `accumulator`:
// accumulator[j] = op(accumulator[j], row[j]) for every row in turn.
//
// The accumulators are read before being written, so the caller must have
// initialized them. Each row is processed N lanes at a time, with a scalar
// loop for the remaining elements.
template <typename T, typename U, typename Op>
void strided_reduce(
    const T* x,
    U* accumulator,
    int size,
    size_t stride,
    Op op) {
  constexpr int N = std::min(simd::max_size<T>, simd::max_size<U>);
  for (int row = 0; row < size; ++row) {
    // Every row starts again at the first accumulator; `x` keeps advancing.
    U* acc_ptr = accumulator;
    size_t remaining = stride;

    // Vector part: N accumulators and N inputs per step.
    for (; remaining >= N; remaining -= N, acc_ptr += N, x += N) {
      auto partial = simd::load<U, N>(acc_ptr);
      auto values = simd::Simd<U, N>(simd::load<T, N>(x));
      simd::store<U, N>(acc_ptr, op(partial, values));
    }

    // Scalar tail for the last (stride % N) elements of the row.
    for (; remaining > 0; --remaining, ++acc_ptr, ++x) {
      *acc_ptr = op(*acc_ptr, *x);
    }
  }
}

// Folds `size` contiguous elements starting at `x` into `*accumulator`.
//
// `*accumulator` is read before it is written, so the caller must have
// initialized it. `init` seeds the N-lane vector accumulator and therefore has
// to be the identity element of `op`. The bulk of the input is combined N
// lanes at a time, the vector accumulator is collapsed with the unary form of
// `op`, and any remaining elements are folded in one by one.
//
// `size` is a 64-bit count: reduction_op passes the full `x.size()` here for
// all-reduces, which does not fit in an `int` once the input has more than
// INT_MAX elements. Callers passing an `int` are unaffected.
template <typename T, typename U, typename Op>
void contiguous_reduce(
    const T* x,
    U* accumulator,
    int64_t size,
    Op op,
    U init) {
  constexpr int N = std::min(simd::max_size<T>, simd::max_size<U>);
  simd::Simd<U, N> accumulator_v(init);
  while (size >= N) {
    accumulator_v = op(accumulator_v, simd::Simd<U, N>(simd::load<T, N>(x)));
    x += N;
    size -= N;
  }
  // Collapse the lanes into one value and merge it with the running result.
  *accumulator = op(*accumulator, op(accumulator_v));
  // Scalar tail for the last (size % N) elements.
  while (size-- > 0) {
    *accumulator = op(*accumulator, *x);
    x++;
  }
}

// Helper for the n-dimensional strided loop.
//
// Visits every index of `shape` in row-major order (last dimension fastest)
// and calls `callback` with the element offset of that index, i.e. the sum of
// index[d] * strides[d] over all dimensions.
//
// An empty `shape` describes a single element, so the callback is invoked
// exactly once with offset 0. This matches how reduction_op treats an empty
// plan shape, and avoids evaluating `shape.size() - 1` on an empty shape,
// which wraps around and leads to an out-of-bounds read of shape[0].
void nd_loop(
    std::function<void(int)> callback,
    const Shape& shape,
    const Strides& strides) {
  const int ndim = static_cast<int>(shape.size());
  if (ndim == 0) {
    callback(0);
    return;
  }

  std::function<void(int, int)> loop_inner;
  loop_inner = [&](int dim, int offset) {
    auto size = shape[dim];
    auto stride = strides[dim];
    if (dim + 1 < ndim) {
      // Outer dimension: recurse into the next one for every index.
      for (int i = 0; i < size; i++) {
        loop_inner(dim + 1, offset + i * stride);
      }
    } else {
      // Innermost dimension: report the final offsets.
      for (int i = 0; i < size; i++) {
        callback(offset + i * stride);
      }
    }
  };
  loop_inner(0, 0);
}

// Reduces `x` over `axes` into `out`. Input elements are read as T, results
// are written as U, values are combined with a default-constructed Op, and
// every output element starts from `init`.
//
// The loop structure is selected by the ReductionPlan returned from
// get_reduction_plan(x, axes); each branch below handles one plan type and
// returns when done.
//
// Input offsets returned by elem_to_loc are stored with `auto` so that
// whatever width elem_to_loc returns is kept for the pointer arithmetic
// instead of being narrowed to `int`.
template <typename T, typename U, typename Op>
void reduction_op(
    const array& x,
    array& out,
    const std::vector<int>& axes,
    U init) {
  ReductionPlan plan = get_reduction_plan(x, axes);

  auto in_ptr = x.data<T>();
  auto out_ptr = out.data<U>();

  // Whole input reduced to a single value.
  if (plan.type == ContiguousAllReduce) {
    *out_ptr = init;
    contiguous_reduce(in_ptr, out_ptr, x.size(), Op{}, init);
    return;
  }

  // One contiguous run per output element, runs stored back to back.
  if (plan.type == ContiguousReduce && plan.shape.size() == 1) {
    int reduction_size = plan.shape[0];
    for (int i = 0; i < out.size(); i++, out_ptr++, in_ptr += reduction_size) {
      *out_ptr = init;
      contiguous_reduce(in_ptr, out_ptr, reduction_size, Op{}, init);
    }
    return;
  }

  // Contiguous innermost reduction run; the remaining plan dimensions (if
  // any) are walked with nd_loop.
  if (plan.type == GeneralContiguousReduce || plan.type == ContiguousReduce) {
    int reduction_size = plan.shape.back();
    plan.shape.pop_back();
    plan.strides.pop_back();
    // Unrolling the following loop (and implementing it in order for
    // ContiguousReduce) should hold extra performance boost.
    auto [shape, strides] = shapes_without_reduction_axes(x, axes);
    if (plan.shape.size() == 0) {
      for (int i = 0; i < out.size(); i++, out_ptr++) {
        auto offset = elem_to_loc(i, shape, strides);
        *out_ptr = init;
        contiguous_reduce(in_ptr + offset, out_ptr, reduction_size, Op{}, init);
      }
    } else {
      for (int i = 0; i < out.size(); i++, out_ptr++) {
        auto offset = elem_to_loc(i, shape, strides);
        *out_ptr = init;
        nd_loop(
            [&](int extra_offset) {
              contiguous_reduce(
                  in_ptr + offset + extra_offset,
                  out_ptr,
                  reduction_size,
                  Op{},
                  init);
            },
            plan.shape,
            plan.strides);
      }
    }
    return;
  }

  // Strided reduction with a single plan dimension: each group of
  // `reduction_stride` outputs consumes reduction_size * reduction_stride
  // consecutive inputs.
  if (plan.type == ContiguousStridedReduce && plan.shape.size() == 1) {
    int reduction_size = plan.shape.back();
    size_t reduction_stride = plan.strides.back();
    plan.shape.pop_back();
    plan.strides.pop_back();
    for (int i = 0; i < out.size(); i += reduction_stride) {
      std::fill_n(out_ptr, reduction_stride, init);
      strided_reduce(in_ptr, out_ptr, reduction_size, reduction_stride, Op{});
      in_ptr += reduction_stride * reduction_size;
      out_ptr += reduction_stride;
    }
    return;
  }

  // Strided innermost reduction; the input location of every output group is
  // looked up with elem_to_loc and the remaining plan dimensions (if any) are
  // walked with nd_loop.
  if (plan.type == GeneralStridedReduce ||
      plan.type == ContiguousStridedReduce) {
    int reduction_size = plan.shape.back();
    size_t reduction_stride = plan.strides.back();
    plan.shape.pop_back();
    plan.strides.pop_back();
    auto [shape, strides] = shapes_without_reduction_axes(x, axes);

    if (plan.shape.size() == 0) {
      for (int i = 0; i < out.size(); i += reduction_stride) {
        auto offset = elem_to_loc(i, shape, strides);
        std::fill_n(out_ptr, reduction_stride, init);
        strided_reduce(
            in_ptr + offset, out_ptr, reduction_size, reduction_stride, Op{});
        out_ptr += reduction_stride;
      }
    } else {
      for (int i = 0; i < out.size(); i += reduction_stride) {
        auto offset = elem_to_loc(i, shape, strides);
        std::fill_n(out_ptr, reduction_stride, init);
        nd_loop(
            [&](int extra_offset) {
              strided_reduce(
                  in_ptr + offset + extra_offset,
                  out_ptr,
                  reduction_size,
                  reduction_stride,
                  Op{});
            },
            plan.shape,
            plan.strides);
        out_ptr += reduction_stride;
      }
    }
    return;
  }

  // Fallback: combine one input element at a time.
  if (plan.type == GeneralReduce) {
    auto [shape, strides] = shapes_without_reduction_axes(x, axes);

    for (int i = 0; i < out.size(); i++, out_ptr++) {
      auto offset = elem_to_loc(i, shape, strides);
      U val = init;
      nd_loop(
          [&](int extra_offset) {
            val = Op{}(val, *(in_ptr + offset + extra_offset));
          },
          plan.shape,
          plan.strides);
      *out_ptr = val;
    }
  }
}

// Logical-AND reduction functor. In every binary overload the running result
// is combined with "the new value is non-zero".
struct AndReduce {
  // Scalar step: `x` is the running result, `y` the new element.
  template <typename T>
  bool operator()(bool x, T y) {
    return x & (y != 0);
  }

  // Scalar step when the new element is already a bool.
  bool operator()(bool x, bool y) {
    return x & y;
  }

  // Vector step with non-bool lanes: `y` is the running result, `x` the new
  // elements. The elements are tested against zero and the outcome is AND-ed
  // into the running result, mirroring the scalar overload. (The operands
  // used to be swapped, which AND-ed the raw element bits with the mask. The
  // reduce loops in this file convert their input to Simd<bool, N> before
  // calling the functor, so they resolve to the overload below.)
  template <int N, typename T>
  simd::Simd<bool, N> operator()(simd::Simd<bool, N> y, simd::Simd<T, N> x) {
    return y & (x != 0);
  }

  // Vector step with bool lanes.
  template <int N>
  simd::Simd<bool, N> operator()(simd::Simd<bool, N> y, simd::Simd<bool, N> x) {
    return x & y;
  }

  // Collapses a vector of partial results into one bool via simd::all.
  template <int N, typename T>
  bool operator()(simd::Simd<T, N> x) {
    return simd::all(x);
  }
};

// Logical-OR reduction functor. In every binary overload the running result
// is combined with "the new value is non-zero".
struct OrReduce {
  // Scalar step: `x` is the running result, `y` the new element.
  template <typename T>
  bool operator()(bool x, T y) {
    return x | (y != 0);
  }

  // Scalar step when the new element is already a bool.
  bool operator()(bool x, bool y) {
    return x | y;
  }

  // Vector step with non-bool lanes: `y` is the running result, `x` the new
  // elements. The elements are tested against zero and the outcome is OR-ed
  // into the running result, mirroring the scalar overload. (The operands
  // used to be swapped, which OR-ed the raw element bits with the mask. The
  // reduce loops in this file convert their input to Simd<bool, N> before
  // calling the functor, so they resolve to the overload below.)
  template <int N, typename T>
  simd::Simd<bool, N> operator()(simd::Simd<bool, N> y, simd::Simd<T, N> x) {
    return y | (x != 0);
  }

  // Vector step with bool lanes.
  template <int N>
  simd::Simd<bool, N> operator()(simd::Simd<bool, N> y, simd::Simd<bool, N> x) {
    return x | y;
  }

  // Collapses a vector of partial results into one bool via simd::any.
  template <int N, typename T>
  bool operator()(simd::Simd<T, N> x) {
    return simd::any(x);
  }
};

struct MaxReduce {
  template <typename T>
  T operator()(T y, T x) {
    return (*this)(simd::Simd<T, 1>(x), simd::Simd<T, 1>(y)).value;
  };

  template <int N, typename T>
  simd::Simd<T, N> operator()(simd::Simd<T, N> y, simd::Simd<T, N> x) {
    return simd::maximum(x, y);
  };

  template <int N, typename T>
  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
    return simd::max(x);
  };

  template <int N, typename T>
  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
    if (simd::any(x != x)) {
      return static_cast<T>(NAN);
    }
    return simd::max(x);
  };
};

struct MinReduce {
  template <typename T>
  T operator()(T y, T x) {
    return (*this)(simd::Simd<T, 1>(x), simd::Simd<T, 1>(y)).value;
  };

  template <int N, typename T>
  simd::Simd<T, N> operator()(simd::Simd<T, N> y, simd::Simd<T, N> x) {
    return simd::minimum(x, y);
  };

  template <int N, typename T>
  std::enable_if_t<std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
    return simd::min(x);
  };

  template <int N, typename T>
  std::enable_if_t<!std::is_integral_v<T>, T> operator()(simd::Simd<T, N> x) {
    if (simd::any(x != x)) {
      return static_cast<T>(NAN);
    }
    return simd::min(x);
  };
};

struct SumReduce {
  template <typename T, typename U>
  U operator()(U y, T x) {
    return x + y;
  };

  template <int N, typename T, typename U>
  simd::Simd<U, N> operator()(simd::Simd<U, N> y, simd::Simd<T, N> x) {
    return y + x;
  };

  template <int N, typename T>
  T operator()(simd::Simd<T, N> x) {
    return simd::sum(x);
  };
};

struct ProdReduce {
  template <typename T, typename U>
  U operator()(U y, T x) {
    return x * y;
  };

  template <int N, typename T, typename U>
  simd::Simd<U, N> operator()(simd::Simd<U, N> y, simd::Simd<T, N> x) {
    return x * y;
  };

  template <int N, typename T>
  T operator()(simd::Simd<T, N> x) {
    return simd::prod(x);
  };
};

// Runs a logical reduction over `axes`, reading the input as InT and writing
// bool results. Reduce::And starts from true; any other `rtype` is handled as
// Or and starts from false.
template <typename InT>
void reduce_dispatch_and_or(
    const array& in,
    array& out,
    Reduce::ReduceType rtype,
    const std::vector<int>& axes) {
  if (rtype != Reduce::And) {
    reduction_op<InT, bool, OrReduce>(in, out, axes, false);
    return;
  }
  reduction_op<InT, bool, AndReduce>(in, out, axes, true);
}

// Runs a sum or product reduction over `axes`, reading the input as InT.
// Integral inputs of at most four bytes are accumulated and written as
// int32_t; every other type is accumulated in InT itself. Reduce::Sum starts
// from 0; any other `rtype` is handled as Prod and starts from 1.
template <typename InT>
void reduce_dispatch_sum_prod(
    const array& in,
    array& out,
    Reduce::ReduceType rtype,
    const std::vector<int>& axes) {
  using AccT = std::conditional_t<
      std::is_integral_v<InT> && sizeof(InT) <= 4,
      int32_t,
      InT>;

  if (rtype == Reduce::Sum) {
    reduction_op<InT, AccT, SumReduce>(in, out, axes, 0);
  } else {
    reduction_op<InT, AccT, ProdReduce>(in, out, axes, 1);
  }
}

// Runs a min or max reduction over `axes` with InT as both the element and
// the result type. A Max reduction starts from Limits<InT>::min; any other
// `rtype` is handled as Min and starts from Limits<InT>::max.
template <typename InT>
void reduce_dispatch_min_max(
    const array& in,
    array& out,
    Reduce::ReduceType rtype,
    const std::vector<int>& axes) {
  if (rtype != Reduce::Max) {
    reduction_op<InT, InT, MinReduce>(in, out, axes, Limits<InT>::max);
    return;
  }
  reduction_op<InT, InT, MaxReduce>(in, out, axes, Limits<InT>::min);
}

// CPU evaluation of the Reduce primitive: allocates the output buffer,
// registers input and output with the stream's command encoder, and enqueues
// a task that dispatches on the reduction type and the input dtype.
void Reduce::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  out.set_data(allocator::malloc(out.nbytes()));
  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  // The task captures the reduction parameters by value because it runs after
  // this function has returned.
  // NOTE(review): unsafe_weak_copy presumably yields non-owning handles whose
  // lifetime is covered by the encoder registration above — confirm.
  encoder.dispatch([in = array::unsafe_weak_copy(in),
                    out = array::unsafe_weak_copy(out),
                    reduce_type_ = reduce_type_,
                    axes_ = axes_]() mutable {
    switch (reduce_type_) {
      // Logical reductions are dispatched by element width only: dtypes of
      // the same size share one instantiation and their data is read as the
      // signed integer type of that width.
      // NOTE(review): for floating-point inputs this makes the zero test a
      // bit-pattern test, so -0.0 would count as non-zero — confirm intended.
      case Reduce::And:
      case Reduce::Or: {
        switch (in.dtype()) {
          case bool_:
          case uint8:
          case int8:
            reduce_dispatch_and_or<int8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
          case uint16:
          case float16:
          case bfloat16:
            reduce_dispatch_and_or<int16_t>(in, out, reduce_type_, axes_);
            break;
          case uint32:
          case int32:
          case float32:
            reduce_dispatch_and_or<int32_t>(in, out, reduce_type_, axes_);
            break;
          case uint64:
          case int64:
          case float64:
          case complex64:
            reduce_dispatch_and_or<int64_t>(in, out, reduce_type_, axes_);
            break;
        }
        break;
      }
      // Arithmetic reductions use the real element type; bool shares the
      // uint8_t instantiation.
      case Reduce::Sum:
      case Reduce::Prod: {
        switch (in.dtype()) {
          case bool_:
          case uint8:
            reduce_dispatch_sum_prod<uint8_t>(in, out, reduce_type_, axes_);
            break;
          case uint16:
            reduce_dispatch_sum_prod<uint16_t>(in, out, reduce_type_, axes_);
            break;
          case uint32:
            reduce_dispatch_sum_prod<uint32_t>(in, out, reduce_type_, axes_);
            break;
          case uint64:
            reduce_dispatch_sum_prod<uint64_t>(in, out, reduce_type_, axes_);
            break;
          case int8:
            reduce_dispatch_sum_prod<int8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
            reduce_dispatch_sum_prod<int16_t>(in, out, reduce_type_, axes_);
            break;
          case int32:
            reduce_dispatch_sum_prod<int32_t>(in, out, reduce_type_, axes_);
            break;
          case int64:
            reduce_dispatch_sum_prod<int64_t>(in, out, reduce_type_, axes_);
            break;
          case float16:
            reduce_dispatch_sum_prod<float16_t>(in, out, reduce_type_, axes_);
            break;
          case bfloat16:
            reduce_dispatch_sum_prod<bfloat16_t>(in, out, reduce_type_, axes_);
            break;
          case float32:
            reduce_dispatch_sum_prod<float>(in, out, reduce_type_, axes_);
            break;
          case float64:
            reduce_dispatch_sum_prod<double>(in, out, reduce_type_, axes_);
            break;
          case complex64:
            reduce_dispatch_sum_prod<complex64_t>(in, out, reduce_type_, axes_);
            break;
        }
        break;
      }
      // Min/max keep the input element type for the result.
      case Reduce::Max:
      case Reduce::Min: {
        switch (in.dtype()) {
          case bool_:
            reduce_dispatch_min_max<bool>(in, out, reduce_type_, axes_);
            break;
          case uint8:
            reduce_dispatch_min_max<uint8_t>(in, out, reduce_type_, axes_);
            break;
          case uint16:
            reduce_dispatch_min_max<uint16_t>(in, out, reduce_type_, axes_);
            break;
          case uint32:
            reduce_dispatch_min_max<uint32_t>(in, out, reduce_type_, axes_);
            break;
          case uint64:
            reduce_dispatch_min_max<uint64_t>(in, out, reduce_type_, axes_);
            break;
          case int8:
            reduce_dispatch_min_max<int8_t>(in, out, reduce_type_, axes_);
            break;
          case int16:
            reduce_dispatch_min_max<int16_t>(in, out, reduce_type_, axes_);
            break;
          case int32:
            reduce_dispatch_min_max<int32_t>(in, out, reduce_type_, axes_);
            break;
          case int64:
            reduce_dispatch_min_max<int64_t>(in, out, reduce_type_, axes_);
            break;
          case float16:
            reduce_dispatch_min_max<float16_t>(in, out, reduce_type_, axes_);
            break;
          case float32:
            reduce_dispatch_min_max<float>(in, out, reduce_type_, axes_);
            break;
          case float64:
            reduce_dispatch_min_max<double>(in, out, reduce_type_, axes_);
            break;
          case bfloat16:
            reduce_dispatch_min_max<bfloat16_t>(in, out, reduce_type_, axes_);
            break;
          case complex64:
            reduce_dispatch_min_max<complex64_t>(in, out, reduce_type_, axes_);
            break;
        }
        break;
      }
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/scan.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cassert>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

// Scans `count` back-to-back rows of `stride` contiguous elements each.
//
// Within a row the scan starts at the first element (or at the last one when
// `reverse` is set) and moves one element at a time:
//   inclusive: out[first] = in[first],  out[next] = op(out[prev], in[next])
//   exclusive: out[first] = init,       out[next] = op(out[prev], in[prev])
//
// Expects `stride` >= 1.
template <typename T, typename U, typename Op>
void contiguous_scan(
    const T* input,
    U* output,
    int count,
    int stride,
    bool reverse,
    bool inclusive,
    const Op& op,
    U init) {
  // Position of the first element written in a row and the walking direction.
  const int first = reverse ? stride - 1 : 0;
  const int step = reverse ? -1 : 1;

  for (int row = 0; row < count; ++row, input += stride, output += stride) {
    int pos = first;
    if (inclusive) {
      output[pos] = input[pos];
      for (int j = 1; j < stride; ++j) {
        const int next = pos + step;
        output[next] = op(output[pos], input[next]);
        pos = next;
      }
    } else {
      output[pos] = init;
      for (int j = 1; j < stride; ++j) {
        const int next = pos + step;
        output[next] = op(output[pos], input[pos]);
        pos = next;
      }
    }
  }
}

// Scans `count` back-to-back blocks. Each block holds `size` slabs of `stride`
// contiguous elements, and the scan runs across slabs: element k of a slab is
// combined with element k of the neighbouring slab.
//
// Forward scans start at the first slab and walk up; reverse scans start at
// the last slab and walk down, visiting elements in decreasing address order.
//   inclusive: first slab is copied from the input,
//              out[p] = op(out[neighbour], in[p])
//   exclusive: first slab is filled with `init`,
//              out[p] = op(out[neighbour], in[neighbour])
//
// Expects `size` >= 1.
template <typename T, typename U, typename Op>
void strided_scan(
    const T* input,
    U* output,
    int count,
    int size,
    int stride,
    bool reverse,
    bool inclusive,
    const Op& op,
    U init) {
  // TODO: Vectorize the following naive implementation
  const int64_t slab = stride;
  const int64_t block = static_cast<int64_t>(size) * slab;
  // Distance from the element being written to the input element it consumes:
  // none for an inclusive scan, one slab for an exclusive scan.
  const int64_t lag = inclusive ? 0 : slab;

  for (int b = 0; b < count; ++b, input += block, output += block) {
    if (!reverse) {
      if (inclusive) {
        std::copy(input, input + slab, output);
      } else {
        std::fill(output, output + slab, init);
      }
      for (int64_t p = slab; p < block; ++p) {
        output[p] = op(output[p - slab], input[p - lag]);
      }
    } else {
      const int64_t last = block - slab;
      if (inclusive) {
        std::copy(input + last, input + last + slab, output + last);
      } else {
        std::fill(output + last, output + last + slab, init);
      }
      for (int64_t p = last - 1; p >= 0; --p) {
        output[p] = op(output[p + slab], input[p + lag]);
      }
    }
  }
}

// Scans `in` along `axis` into `out`, reading elements as T and writing them
// as U.
//
// Only row-contiguous inputs are supported; anything else throws
// std::runtime_error. When the scanned axis has stride 1 the rows are handled
// by contiguous_scan, otherwise by strided_scan.
//
// Empty inputs return immediately: there is nothing to write, and the row and
// block counts below would otherwise be computed by dividing by a zero-length
// axis.
template <typename T, typename U, typename Op>
void scan_op(
    const array& in,
    array& out,
    int axis,
    bool reverse,
    bool inclusive,
    const Op& op,
    U init) {
  if (!in.flags().row_contiguous) {
    throw std::runtime_error("Scan op supports only contiguous inputs");
  }
  if (in.size() == 0) {
    return;
  }

  if (in.strides()[axis] == 1) {
    // One row per index combination of the other axes.
    contiguous_scan(
        in.data<T>(),
        out.data<U>(),
        in.size() / in.shape(axis),
        in.shape(axis),
        reverse,
        inclusive,
        op,
        init);
  } else {
    // Blocks of shape(axis) slabs, each slab strides()[axis] elements long.
    strided_scan(
        in.data<T>(),
        out.data<U>(),
        in.size() / in.shape(axis) / in.strides()[axis],
        in.shape(axis),
        in.strides()[axis],
        reverse,
        inclusive,
        op,
        init);
  }
}

// Picks the binary operation and its starting value for `rtype` and runs the
// scan through scan_op<T, U>.
//
// Starting values: 0 for Sum and 1 for Prod. Min starts from +infinity for
// floating dtypes and from numeric_limits<U>::max() otherwise; Max and
// LogAddExp start from -infinity for floating dtypes and from
// numeric_limits<U>::min() otherwise.
template <typename T, typename U>
void scan_dispatch(
    Scan::ReduceType rtype,
    const array& in,
    array& out,
    int axis,
    bool reverse,
    bool inclusive) {
  // Forwards an operation and its starting value with the shared arguments.
  auto run = [&](const auto& op, U init) {
    scan_op<T, U>(in, out, axis, reverse, inclusive, op, init);
  };
  // Largest starting value, used by Min.
  auto highest = [&]() -> U {
    return (issubdtype(in.dtype(), floating))
        ? static_cast<U>(std::numeric_limits<float>::infinity())
        : std::numeric_limits<U>::max();
  };
  // Smallest starting value, used by Max and LogAddExp.
  auto lowest = [&]() -> U {
    return (issubdtype(in.dtype(), floating))
        ? static_cast<U>(-std::numeric_limits<float>::infinity())
        : std::numeric_limits<U>::min();
  };

  switch (rtype) {
    case Scan::Sum: {
      auto add = [](U y, T x) { return y + x; };
      run(add, static_cast<U>(0));
      break;
    }
    case Scan::Prod: {
      auto mul = [](U y, T x) { return y * x; };
      run(mul, static_cast<U>(1));
      break;
    }
    case Scan::Min: {
      auto smaller = [](U y, T x) { return x < y ? x : y; };
      run(smaller, highest());
      break;
    }
    case Scan::Max: {
      auto larger = [](U y, T x) { return x < y ? y : x; };
      run(larger, lowest());
      break;
    }
    case Scan::LogAddExp: {
      auto log_add_exp = [](U a, T b) {
        return detail::LogAddExp{}(a, static_cast<U>(b));
      };
      run(log_add_exp, lowest());
      break;
    }
  }
}

} // namespace

// CPU evaluation of the Scan primitive: makes the input row-contiguous if it
// is not already, allocates the output, and enqueues a task that dispatches
// on the input dtype.
void Scan::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  auto& encoder = cpu::get_command_encoder(stream());

  // Ensure contiguity
  auto in = inputs[0];
  if (!in.flags().row_contiguous) {
    in = contiguous_copy_cpu(in, stream());
    // The copy is only needed by this scan; hand it to the encoder as a
    // temporary.
    encoder.add_temporary(in);
  }
  out.set_data(allocator::malloc(out.nbytes()));

  encoder.set_input_array(in);
  encoder.set_output_array(out);
  // The task captures the scan parameters by value because it runs after this
  // function has returned.
  // NOTE(review): unsafe_weak_copy presumably yields non-owning handles whose
  // lifetime is covered by the encoder registration above — confirm.
  encoder.dispatch([in = array::unsafe_weak_copy(in),
                    out = array::unsafe_weak_copy(out),
                    axis_ = axis_,
                    reduce_type_ = reduce_type_,
                    reverse_ = reverse_,
                    inclusive_ = inclusive_]() mutable {
    switch (in.dtype()) {
      case bool_: {
        // We could do a full dtype x dtype switch but this is the only case
        // where we accumulate in a different type, for now.
        //
        // TODO: If we add the option to accumulate floats in higher precision
        //       floats perhaps we should add the full all-to-all dispatch.
        if (reduce_type_ == Scan::Sum && out.dtype() == int32) {
          scan_dispatch<bool, int32_t>(
              reduce_type_, in, out, axis_, reverse_, inclusive_);
        } else {
          scan_dispatch<bool, bool>(
              reduce_type_, in, out, axis_, reverse_, inclusive_);
        }
        break;
      }
      // Every other dtype is scanned with the same input and output type.
      case uint8:
        scan_dispatch<uint8_t, uint8_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case uint16:
        scan_dispatch<uint16_t, uint16_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case uint32:
        scan_dispatch<uint32_t, uint32_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case uint64:
        scan_dispatch<uint64_t, uint64_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case int8:
        scan_dispatch<int8_t, int8_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case int16:
        scan_dispatch<int16_t, int16_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case int32:
        scan_dispatch<int32_t, int32_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case int64:
        scan_dispatch<int64_t, int64_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case float16:
        scan_dispatch<float16_t, float16_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case float32:
        scan_dispatch<float, float>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case float64:
        scan_dispatch<double, double>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case bfloat16:
        scan_dispatch<bfloat16_t, bfloat16_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
      case complex64:
        scan_dispatch<complex64_t, complex64_t>(
            reduce_type_, in, out, axis_, reverse_, inclusive_);
        break;
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/select.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cassert>

#include "mlx/backend/cpu/binary_ops.h"
#include "mlx/backend/cpu/ternary.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

// Applies the ternary functor `op` element-wise to (a, b, c) and writes the
// result to `out`.
//
// The ternary op type is determined once from the three inputs and is used
// both to set up the output buffer and to run the operation. The work itself
// is enqueued on `stream`'s command encoder. `a` is always read as bool; `b`,
// `c` and the output are read as the output dtype.
template <typename Op>
void select_op(
    const array& a,
    const array& b,
    const array& c,
    array& out,
    Op op,
    Stream stream) {
  TernaryOpType topt = get_ternary_op_type(a, b, c);
  set_ternary_op_output_data(a, b, c, out, topt);

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(c);
  encoder.set_output_array(out);
  // `op` and `topt` are captured by value because the task runs after this
  // function has returned.
  // NOTE(review): unsafe_weak_copy presumably yields non-owning handles whose
  // lifetime is covered by the encoder registration above — confirm.
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    b = array::unsafe_weak_copy(b),
                    c = array::unsafe_weak_copy(c),
                    out = array::unsafe_weak_copy(out),
                    op,
                    topt]() mutable {
    switch (out.dtype()) {
      case bool_:
        ternary_op<bool, bool, bool, bool>(a, b, c, out, op, topt);
        break;
      case uint8:
        ternary_op<bool, uint8_t, uint8_t, uint8_t>(a, b, c, out, op, topt);
        break;
      case uint16:
        ternary_op<bool, uint16_t, uint16_t, uint16_t>(a, b, c, out, op, topt);
        break;
      case uint32:
        ternary_op<bool, uint32_t, uint32_t, uint32_t>(a, b, c, out, op, topt);
        break;
      case uint64:
        ternary_op<bool, uint64_t, uint64_t, uint64_t>(a, b, c, out, op, topt);
        break;
      case int8:
        ternary_op<bool, int8_t, int8_t, int8_t>(a, b, c, out, op, topt);
        break;
      case int16:
        ternary_op<bool, int16_t, int16_t, int16_t>(a, b, c, out, op, topt);
        break;
      case int32:
        ternary_op<bool, int32_t, int32_t, int32_t>(a, b, c, out, op, topt);
        break;
      case int64:
        ternary_op<bool, int64_t, int64_t, int64_t>(a, b, c, out, op, topt);
        break;
      case float16:
        ternary_op<bool, float16_t, float16_t, float16_t>(
            a, b, c, out, op, topt);
        break;
      case float32:
        ternary_op<bool, float, float, float>(a, b, c, out, op, topt);
        break;
      case float64:
        ternary_op<bool, double, double, double>(a, b, c, out, op, topt);
        break;
      case bfloat16:
        ternary_op<bool, bfloat16_t, bfloat16_t, bfloat16_t>(
            a, b, c, out, op, topt);
        break;
      case complex64:
        ternary_op<bool, complex64_t, complex64_t, complex64_t>(
            a, b, c, out, op, topt);
        break;
    }
  });
}

} // namespace

// CPU evaluation of the Select primitive. Expects exactly three inputs, in
// order: the condition followed by the two candidate arrays, and forwards
// them to select_op with the detail::Select functor on this primitive's
// stream.
void Select::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 3);
  select_op(inputs[0], inputs[1], inputs[2], out, detail::Select(), stream());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/simd/accelerate_fp16_simd.h
================================================
#pragma once

#include "mlx/backend/cpu/simd/base_simd.h"

#if MLX_SIMD_LIBRARY_VERSION < 6
#include "mlx/backend/cpu/simd/neon_fp16_simd.h"
#endif

namespace mlx::core::simd {

#if MLX_SIMD_LIBRARY_VERSION >= 6
// With SIMD library version 6 or newer, float16_t vectors have 8 lanes and use
// _Float16 as their underlying element type.
// NOTE(review): for older versions neon_fp16_simd.h is included instead; `N`
// is presumably provided by that header — confirm.
constexpr int N = 8;
template <int N>
struct ScalarT<float16_t, N> {
  using v = _Float16;
};
#endif

// Lane count used for float16_t vectors.
template <>
inline constexpr int max_size<float16_t> = N;

// Defines the float16_t specialization of the unary function `op`: the input
// vector is widened to float, the float version of `op` is applied, and the
// result is converted back to float16_t on return.
#define SIMD_FP16_DEFAULT_UNARY(op)                    \
  template <>                                          \
  inline Simd<float16_t, N> op(Simd<float16_t, N> v) { \
    Simd<float, N> in = v;                             \
    return op(in);                                     \
  }

SIMD_FP16_DEFAULT_UNARY(acos)
SIMD_FP16_DEFAULT_UNARY(acosh)
SIMD_FP16_DEFAULT_UNARY(asin)
SIMD_FP16_DEFAULT_UNARY(asinh)
SIMD_FP16_DEFAULT_UNARY(atan)
SIMD_FP16_DEFAULT_UNARY(atanh)
SIMD_FP16_DEFAULT_UNARY(cosh)
SIMD_FP16_DEFAULT_UNARY(expm1)
SIMD_FP16_DEFAULT_UNARY(log)
SIMD_FP16_DEFAULT_UNARY(log2)
SIMD_FP16_DEFAULT_UNARY(log10)
SIMD_FP16_DEFAULT_UNARY(log1p)
SIMD_FP16_DEFAULT_UNARY(sinh)
SIMD_FP16_DEFAULT_UNARY(tan)
SIMD_FP16_DEFAULT_UNARY(tanh)

// Defines the float16_t specialization of the binary function `op`: both
// operands are widened to float, the float version of `op` is applied, and
// the result is converted back to float16_t on return.
#define SIMD_FP16_DEFAULT_BINARY(op)                                         \
  template <>                                                                \
  inline Simd<float16_t, N> op(Simd<float16_t, N> x, Simd<float16_t, N> y) { \
    Simd<float, N> a = x;                                                    \
    Simd<float, N> b = y;                                                    \
    return op(a, b);                                                         \
  }
SIMD_FP16_DEFAULT_BINARY(atan2)
SIMD_FP16_DEFAULT_BINARY(remainder)
SIMD_FP16_DEFAULT_BINARY(pow)

} // namespace mlx::core::simd


================================================
FILE: mlx/backend/cpu/simd/accelerate_simd.h
================================================
#pragma once

#include <arm_neon.h>
#include <simd/math.h>
#include <simd/vector.h>

#include <stdint.h>
#include <cmath>
#include <complex>

#include "mlx/backend/cpu/simd/base_simd.h"

// Work around what appears to be a bug in simd/base_simd.h: because
// __XROS_2_0 is not defined, the header's version check evaluates to true
// instead of false, which sets the SIMD library version higher than it should
// be, even on macOS < 15. The version is therefore derived here from the
// minimum deployment target of each platform.
// NOTE(review): the __WATCH_OS_VERSION_MIN_REQUIRED test appears twice below.
// The repetition is harmless, but one of the two may have been meant for a
// different platform.
#if __MAC_OS_X_VERSION_MIN_REQUIRED >= 150000 ||  \
    __IPHONE_OS_VERSION_MIN_REQUIRED >= 180000 || \
    __WATCH_OS_VERSION_MIN_REQUIRED >= 110000 ||  \
    __WATCH_OS_VERSION_MIN_REQUIRED >= 110000 ||  \
    __TV_OS_VERSION_MIN_REQUIRED >= 180000
#define MLX_SIMD_LIBRARY_VERSION 6
#else
#define MLX_SIMD_LIBRARY_VERSION 5
#endif

namespace mlx::core::simd {

// Apple simd namespace
namespace asd = ::simd;

// This indirection is needed to remap certain element types to ones that the
// Accelerate SIMD library can handle. `ScalarT<T, N>::v` is the element type
// actually stored in the vector; it is T itself unless a specialization says
// otherwise.
template <typename T, int N>
struct ScalarT {
  using v = T;
};
// bool and int8_t are both stored as char.
template <int N>
struct ScalarT<bool, N> {
  using v = char;
};
template <int N>
struct ScalarT<int8_t, N> {
  using v = char;
};
// The 64-bit integer types are stored as (unsigned) long.
template <int N>
struct ScalarT<uint64_t, N> {
  using v = unsigned long;
};
template <int N>
struct ScalarT<int64_t, N> {
  using v = long;
};

// N-lane vector of T stored as an Apple simd packed vector whose lane type is
// ScalarT<T, N>::v.
template <typename T, int N>
struct Simd {
  static constexpr int size = N;
  using scalar_t = typename ScalarT<T, N>::v;

  // Leaves the lanes uninitialized.
  // Constructors use the injected class name (not `Simd<T, N>`): naming a
  // constructor with a template-id is not allowed from C++20 on.
  Simd() {}

  // Lane-wise conversion from a vector with a different element type.
  template <typename U>
  Simd(Simd<U, N> other) : value(asd::convert<scalar_t>(other.value)) {}

  // Initializes the underlying vector directly from `v` (e.g. a raw simd
  // vector, or a scalar as in `Simd<T, N> res = 1`).
  template <typename U>
  Simd(U v) : value(v) {}

  // Builds the vector from two half-width vectors: `x` first, then `y`.
  Simd(Simd<T, N / 2> x, Simd<T, N / 2> y) {
    value = asd::make<typename asd::Vector<scalar_t, N>::packed_t>(
        x.value, y.value);
  }

  // Lane access by reinterpreting the storage as an array of T.
  T operator[](int idx) const {
    return reinterpret_cast<const T*>(&value)[idx];
  }

  T& operator[](int idx) {
    return reinterpret_cast<T*>(&value)[idx];
  }

  typename asd::Vector<scalar_t, N>::packed_t value;
};

// Explicit specializations of the max_size<T> variable template for this
// backend. Kernels use the value as the Simd lane count for T (the softmax
// kernel, for instance, picks its vector width from max_size).
// Values chosen based on benchmarks on M3 Max
// TODO: consider choosing these more optimally
template <>
inline constexpr int max_size<int8_t> = 16;
template <>
inline constexpr int max_size<int16_t> = 16;
template <>
inline constexpr int max_size<int> = 8;
template <>
inline constexpr int max_size<int64_t> = 4;
template <>
inline constexpr int max_size<uint8_t> = 16;
template <>
inline constexpr int max_size<uint16_t> = 16;
template <>
inline constexpr int max_size<uint32_t> = 8;
template <>
inline constexpr int max_size<uint64_t> = 4;
template <>
inline constexpr int max_size<float> = 8;
template <>
inline constexpr int max_size<double> = 4;

// Defines a unary function `name` on Simd<T, N> that applies `op` to the
// underlying vector; the result is converted to Simd<T, N> by the return.
#define SIMD_DEFAULT_UNARY(name, op) \
  template <typename T, int N>       \
  Simd<T, N> name(Simd<T, N> v) {    \
    return op(v.value);              \
  }

// Unary functions that forward directly to the Apple simd function of the
// same name.
SIMD_DEFAULT_UNARY(abs, asd::abs)
SIMD_DEFAULT_UNARY(floor, asd::floor)
SIMD_DEFAULT_UNARY(acos, asd::acos)
SIMD_DEFAULT_UNARY(acosh, asd::acosh)
SIMD_DEFAULT_UNARY(asin, asd::asin)
SIMD_DEFAULT_UNARY(asinh, asd::asinh)
SIMD_DEFAULT_UNARY(atan, asd::atan)
SIMD_DEFAULT_UNARY(atanh, asd::atanh)
SIMD_DEFAULT_UNARY(ceil, asd::ceil)
SIMD_DEFAULT_UNARY(cosh, asd::cosh)
SIMD_DEFAULT_UNARY(expm1, asd::expm1)
SIMD_DEFAULT_UNARY(log, asd::log)
SIMD_DEFAULT_UNARY(log2, asd::log2)
SIMD_DEFAULT_UNARY(log10, asd::log10)
SIMD_DEFAULT_UNARY(log1p, asd::log1p)
SIMD_DEFAULT_UNARY(rint, asd::rint)
SIMD_DEFAULT_UNARY(sinh, asd::sinh)
SIMD_DEFAULT_UNARY(sqrt, asd::sqrt)
SIMD_DEFAULT_UNARY(rsqrt, asd::rsqrt)
SIMD_DEFAULT_UNARY(recip, asd::recip)
SIMD_DEFAULT_UNARY(tan, asd::tan)
SIMD_DEFAULT_UNARY(tanh, asd::tanh)

// Lane-wise arithmetic negation.
template <typename T, int N>
Simd<T, N> operator-(Simd<T, N> v) {
  auto negated = -v.value;
  return negated;
}

// Lane-wise bitwise complement.
template <typename T, int N>
Simd<T, N> operator~(Simd<T, N> v) {
  auto flipped = ~v.value;
  return flipped;
}

// No simd_boolN in accelerate, use int8_t instead: boolean results are
// produced as char lanes.

// A lane is NaN exactly when it does not compare equal to itself.
template <typename T, int N>
Simd<bool, N> isnan(Simd<T, N> v) {
  auto not_self_equal = (v.value != v.value);
  return asd::convert<char>(not_self_equal);
}

// Lane-wise logical not.
template <typename T, int N>
Simd<bool, N> operator!(Simd<T, N> v) {
  auto inverted = !v.value;
  return asd::convert<char>(inverted);
}

// Defines the three overloads of binary operator OP: (Simd, scalar),
// (scalar, Simd) and (Simd, Simd). The raw result of applying OP to the
// underlying vectors is converted back to the lane type of the Simd operand;
// when both operands are Simd the result takes the left operand's type.
#define SIMD_DEFAULT_BINARY(OP)                                              \
  template <typename T, typename U, int N>                                   \
  Simd<T, N> operator OP(Simd<T, N> x, U y) {                                \
    return asd::convert<typename Simd<T, N>::scalar_t>(x.value OP y);        \
  }                                                                          \
  template <typename T1, typename T2, int N>                                 \
  Simd<T2, N> operator OP(T1 x, Simd<T2, N> y) {                             \
    return asd::convert<typename Simd<T2, N>::scalar_t>(x OP y.value);       \
  }                                                                          \
  template <typename T1, typename T2, int N>                                 \
  Simd<T1, N> operator OP(Simd<T1, N> x, Simd<T2, N> y) {                    \
    return asd::convert<typename Simd<T1, N>::scalar_t>(x.value OP y.value); \
  }

SIMD_DEFAULT_BINARY(+)
SIMD_DEFAULT_BINARY(-)
SIMD_DEFAULT_BINARY(/)
SIMD_DEFAULT_BINARY(*)
SIMD_DEFAULT_BINARY(<<)
SIMD_DEFAULT_BINARY(>>)
SIMD_DEFAULT_BINARY(|)
SIMD_DEFAULT_BINARY(^)
SIMD_DEFAULT_BINARY(&)
SIMD_DEFAULT_BINARY(&&)
SIMD_DEFAULT_BINARY(||)

// Defines the three overloads of comparison operator OP: (Simd, scalar),
// (scalar, Simd) and (Simd, Simd). The raw comparison result is converted to
// char lanes and returned as Simd<bool, N>.
#define SIMD_DEFAULT_COMPARISONS(OP)                        \
  template <int N, typename T, typename U>                  \
  Simd<bool, N> operator OP(Simd<T, N> a, U b) {            \
    return asd::convert<char>(a.value OP b);                \
  }                                                         \
  template <int N, typename T, typename U>                  \
  Simd<bool, N> operator OP(T a, Simd<U, N> b) {            \
    return asd::convert<char>(a OP b.value);                \
  }                                                         \
  template <int N, typename T1, typename T2>                \
  Simd<bool, N> operator OP(Simd<T1, N> a, Simd<T2, N> b) { \
    return asd::convert<char>(a.value OP b.value);          \
  }

SIMD_DEFAULT_COMPARISONS(>)
SIMD_DEFAULT_COMPARISONS(<)
SIMD_DEFAULT_COMPARISONS(>=)
SIMD_DEFAULT_COMPARISONS(<=)
SIMD_DEFAULT_COMPARISONS(==)
SIMD_DEFAULT_COMPARISONS(!=)

// Count leading zeros per 32-bit lane.
// The argument is read as two consecutive uint32x4_t registers and the two
// results are joined with make_uint8.
// NOTE(review): this presumably expects 32-bit lanes with N == 8 -- confirm
// against callers.
template <typename T, int N>
Simd<T, N> clz(Simd<T, N> x) {
  auto* quads = reinterpret_cast<uint32x4_t*>(&x);
  uint32x4_t lo = vclzq_u32(quads[0]);
  uint32x4_t hi = vclzq_u32(quads[1]);
  return asd::make_uint8(lo, hi);
}

// Lane-wise atan2(a, b), forwarded to the Apple simd implementation.
template <typename T, int N>
Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {
  auto angle = asd::atan2(a.value, b.value);
  return angle;
}

// Lane-wise maximum. For non-integral types a NaN in either operand is
// propagated to the result (a NaN in `b` takes precedence over one in `a`).
template <typename T, int N>
Simd<T, N> maximum(Simd<T, N> a, Simd<T, N> b) {
  Simd<T, N> result(asd::max(a.value, b.value));
  if constexpr (std::is_integral_v<T>) {
    return result;
  } else {
    auto with_nan_a = select(isnan(a), a, result);
    return select(isnan(b), b, with_nan_a);
  }
}

// Lane-wise minimum. For non-integral types a NaN in either operand is
// propagated to the result (a NaN in `b` takes precedence over one in `a`).
template <typename T, int N>
Simd<T, N> minimum(Simd<T, N> a, Simd<T, N> b) {
  Simd<T, N> result(asd::min(a.value, b.value));
  if constexpr (std::is_integral_v<T>) {
    return result;
  } else {
    auto with_nan_a = select(isnan(a), a, result);
    return select(isnan(b), b, with_nan_a);
  }
}

// Lane-wise remainder whose non-zero results carry the sign of the divisor.
// Integers use a - b * (a / b); other types use the simd remainder. For
// signed types, `b` is then added to every lane whose non-zero remainder has
// a sign different from `b`.
template <typename T, int N>
Simd<T, N> remainder(Simd<T, N> a, Simd<T, N> b) {
  Simd<T, N> rem;
  if constexpr (std::is_integral_v<T>) {
    rem = a - b * (a / b);
  } else {
    rem = asd::remainder(a.value, b.value);
  }
  if constexpr (std::is_signed_v<T>) {
    auto nonzero = rem != 0;
    auto signs_differ = (rem < 0) != (b < 0);
    rem = select(nonzero && signs_differ, rem + b, rem);
  }
  return rem;
}

// Lane-wise choice between `x` and `y` driven by a boolean mask, implemented
// with bitselect. The mask is first converted to integer lanes as wide as T1
// so that it lines up with the operands.
template <typename MaskT, typename T1, typename T2, int N>
Simd<T1, N> select(Simd<MaskT, N> mask, Simd<T1, N> x, Simd<T2, N> y) {
  static_assert(std::is_same_v<MaskT, bool>);
  constexpr auto lane_bytes = sizeof(T1);
  if constexpr (lane_bytes == 1) {
    auto m = asd::convert<char>(mask.value);
    return asd::bitselect(y.value, x.value, m);
  } else if constexpr (lane_bytes == 2) {
    auto m = asd::convert<short>(mask.value);
    return asd::bitselect(y.value, x.value, m);
  } else if constexpr (lane_bytes == 4) {
    auto m = asd::convert<int>(mask.value);
    return asd::bitselect(y.value, x.value, m);
  } else {
    auto m = asd::convert<long>(mask.value);
    return asd::bitselect(y.value, x.value, m);
  }
}

// Lane-wise power. Non-integral types use the simd pow. Integers use
// exponentiation by squaring, advancing all lanes together until every
// exponent reaches zero.
template <typename T, int N>
Simd<T, N> pow(Simd<T, N> base, Simd<T, N> exp) {
  if constexpr (std::is_integral_v<T>) {
    // Raising an integer to a negative power is undefined: if any lane has a
    // negative exponent the whole result is zero.
    if (any(exp < 0)) {
      return 0;
    }
    Simd<T, N> acc = 1;
    for (; any(exp > 0); exp = exp >> 1) {
      // Multiply in the current power of base for lanes whose low bit is set.
      acc = select((exp & 1) != 0, acc * base, acc);
      // Only keep squaring lanes that still have exponent bits left.
      base = select(exp > 0, base * base, base);
    }
    return acc;
  } else {
    return asd::pow(base.value, exp.value);
  }
}

// Lane-wise clamp of `v` to [min, max], forwarded to the simd clamp.
template <typename T, int N>
Simd<T, N> clamp(Simd<T, N> v, Simd<T, N> min, Simd<T, N> max) {
  auto bounded = asd::clamp(v.value, min.value, max.value);
  return bounded;
}

// Lane-wise multiply-add of x, y and z via the simd muladd. `z` may be a
// scalar or a vector; it is first converted to Simd<T, N>.
template <typename T, typename U, int N>
Simd<T, N> fma(Simd<T, N> x, Simd<T, N> y, U z) {
  Simd<T, N> addend(z);
  return asd::muladd(x.value, y.value, addend.value);
}

// Reductions: each collapses the lanes of a vector into a single scalar.

// True when the simd `all` test holds for the lanes of `x`.
template <typename T, int N>
bool all(Simd<T, N> x) {
  const bool every = asd::all(x.value);
  return every;
}
// True when the simd `any` test holds for the lanes of `x`.
template <typename T, int N>
bool any(Simd<T, N> x) {
  const bool some = asd::any(x.value);
  return some;
}
// Sum of all lanes.
template <typename T, int N>
T sum(Simd<T, N> x) {
  auto total = asd::reduce_add(x.value);
  return total;
}
// Largest lane.
template <typename T, int N>
T max(Simd<T, N> x) {
  auto largest = asd::reduce_max(x.value);
  return largest;
}
// Smallest lane.
template <typename T, int N>
T min(Simd<T, N> x) {
  auto smallest = asd::reduce_min(x.value);
  return smallest;
}

// Product of all lanes, computed by repeatedly multiplying the lower half of
// the vector with the upper half and recursing on the half-width result.
template <typename T, int N>
T prod(Simd<T, N> x) {
  const T* lanes = reinterpret_cast<const T*>(&x);
  auto lower = load<T, N / 2>(lanes);
  auto upper = load<T, N / 2>(lanes + N / 2);
  return prod(lower * upper);
}

} // namespace mlx::core::simd

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "mlx/backend/cpu/simd/accelerate_fp16_simd.h"
#endif


================================================
FILE: mlx/backend/cpu/simd/base_simd.h
================================================
#pragma once

// Required for using M_LN2 in MSVC.
#define _USE_MATH_DEFINES

#include <math.h>
#include <stdint.h>
#include <algorithm>
#include <complex>
#include <functional>

#ifdef _MSC_VER
#include <intrin.h> // For _BitScanReverse
#endif

namespace mlx::core::simd {
// Vector of N lanes of T. Only the N == 1 case is defined in this header;
// wider vectors are provided by backend-specific headers.
template <typename T, int N>
struct Simd;

// Default lane count for element type T. Backends specialize this for the
// types they can vectorize. Declared `inline` (not `static`) so the header
// defines a single entity across translation units, matching the
// `inline constexpr` specializations.
template <typename T>
inline constexpr int max_size = 1;

// Single-lane vector: a thin wrapper around one T so that scalar code can use
// the same interface as the wide vectors.
template <typename T>
struct Simd<T, 1> {
  T value;
  static constexpr int size = 1;

  // Leaves `value` uninitialized.
  Simd() {}

  // Construct from anything convertible to T.
  template <typename U>
  Simd(U v) : value(v) {}

  // Convert from a single-lane vector of another element type.
  template <typename U>
  Simd(Simd<U, 1> v) : value(v.value) {}

  // The index is ignored: there is only one lane.
  T& operator[](int) {
    return value;
  }

  T operator[](int) const {
    return value;
  }
};

// Reads a Simd<T, N> from the memory at `x` by reinterpreting the pointer.
template <typename T, int N>
Simd<T, N> load(const T* x) {
  const auto* packed = reinterpret_cast<const Simd<T, N>*>(x);
  return *packed;
}

template <typename T, int N>
void store(T* dst, Simd<T, N> x) {
  // Maintain invariant that bool is either 0 or 1 as
  // simd comparison ops set all bits in the result to 1
  if constexpr (std::is_same_v<T, bool> && N > 1) {
    x = x & 1;
  }
  *(Simd<T, N>*)dst = x;
}

// Compile-time trait: is_complex<T> is true when a value of type T has a
// callable .real() member, and false otherwise.
template <typename, typename = void>
constexpr bool is_complex = false;

// Selected when the expression std::declval<T>().real() is well-formed.
template <typename T>
constexpr bool is_complex<T, std::void_t<decltype(std::declval<T>().real())>> =
    true;

// Rounds to an integral value with std::rint. Complex values have their real
// and imaginary parts rounded independently.
template <typename T>
Simd<T, 1> rint(Simd<T, 1> in) {
  if constexpr (!is_complex<T>) {
    return Simd<T, 1>{std::rint(in.value)};
  } else {
    return Simd<T, 1>{
        T{std::rint(in.value.real()), std::rint(in.value.imag())}};
  }
}

// Reciprocal square root: 1 / sqrt(in).
template <typename T>
Simd<T, 1> rsqrt(Simd<T, 1> in) {
  const T one(1.0);
  return one / sqrt(in);
}

// Reciprocal: 1 / in.
template <typename T>
Simd<T, 1> recip(Simd<T, 1> in) {
  const T one(1.0);
  return one / in;
}

// Defines a unary function `name` on Simd<T, 1> that applies the callable
// `op` to the wrapped value; the result is converted to Simd<T, 1> by the
// return.
#define DEFAULT_UNARY(name, op)    \
  template <typename T>            \
  Simd<T, 1> name(Simd<T, 1> in) { \
    return op(in.value);           \
  }

// Single-lane operators and math functions that map directly onto a standard
// library callable.
DEFAULT_UNARY(operator-, std::negate{})
DEFAULT_UNARY(operator!, std::logical_not{})
DEFAULT_UNARY(abs, std::abs)
DEFAULT_UNARY(acos, std::acos)
DEFAULT_UNARY(acosh, std::acosh)
DEFAULT_UNARY(asin, std::asin)
DEFAULT_UNARY(asinh, std::asinh)
DEFAULT_UNARY(atan, std::atan)
DEFAULT_UNARY(atanh, std::atanh)
DEFAULT_UNARY(ceil, std::ceil)
DEFAULT_UNARY(conj, std::conj)
DEFAULT_UNARY(cosh, std::cosh)
DEFAULT_UNARY(expm1, std::expm1)
DEFAULT_UNARY(floor, std::floor)
DEFAULT_UNARY(log, std::log)
DEFAULT_UNARY(log10, std::log10)
DEFAULT_UNARY(sinh, std::sinh)
DEFAULT_UNARY(sqrt, std::sqrt)
DEFAULT_UNARY(tan, std::tan)
DEFAULT_UNARY(tanh, std::tanh)

// log(1 + in). Real types defer to std::log1p. For complex input the
// imaginary part of the result is atan2(y, x + 1); the real part is
// log|1 + z|, computed through log1p when |z| is small.
template <typename T>
Simd<T, 1> log1p(Simd<T, 1> in) {
  if constexpr (!is_complex<T>) {
    return Simd<T, 1>{std::log1p(in.value)};
  } else {
    auto re = in.value.real();
    auto im = in.value.imag();
    auto magnitude = std::abs(in.value);
    auto angle = std::atan2(im, re + 1);
    if (!(magnitude < 0.5)) {
      // |z| is not small: take the log of |1 + z| directly.
      auto norm = std::hypot(re + 1, im);
      return Simd<T, 1>{T{std::log(norm), angle}};
    }
    // |1 + z|^2 - 1 == x * (2 + x) + y * y
    auto sq_minus_one = re * (2 + re) + im * im;
    if (sq_minus_one == 0) { // handle underflow
      return Simd<T, 1>{T{re, angle}};
    }
    return Simd<T, 1>{
        T{static_cast<decltype(re)>(0.5) * std::log1p(sq_minus_one), angle}};
  }
}

// Base-2 logarithm. Real types defer to std::log2; complex values are
// computed as log(z) with both components divided by ln(2).
template <typename T>
Simd<T, 1> log2(Simd<T, 1> in) {
  if constexpr (!is_complex<T>) {
    return Simd<T, 1>{std::log2(in.value)};
  } else {
    auto natural = std::log(in.value);
    auto ln2 = decltype(natural.real())(M_LN2);
    return Simd<T, 1>{T{natural.real() / ln2, natural.imag() / ln2}};
  }
}

// Bitwise complement of the wrapped value.
template <typename T>
Simd<T, 1> operator~(Simd<T, 1> in) {
  auto flipped = ~in.value;
  return flipped;
}

// Real part of the wrapped value, as returned by std::real.
template <typename T>
auto real(Simd<T, 1> in) -> Simd<decltype(std::real(in.value)), 1> {
  auto re = std::real(in.value);
  return re;
}
// Imaginary part of the wrapped value, as returned by std::imag.
template <typename T>
auto imag(Simd<T, 1> in) -> Simd<decltype(std::imag(in.value)), 1> {
  auto im = std::imag(in.value);
  return im;
}
// Whether the wrapped value is NaN, as reported by std::isnan.
template <typename T>
Simd<bool, 1> isnan(Simd<T, 1> in) {
  const bool is_nan = std::isnan(in.value);
  return is_nan;
}

// Defines the three single-lane overloads of binary operator OP:
// (Simd, Simd), (scalar, Simd) and (Simd, scalar). The element type of the
// result is whatever type the built-in expression `a OP b` yields for the
// two operand types.
#define DEFAULT_BINARY(OP)                                                 \
  template <typename T1, typename T2>                                      \
  auto operator OP(Simd<T1, 1> a, Simd<T2, 1> b)                           \
      ->Simd<decltype(a.value OP b.value), 1> {                            \
    return a.value OP b.value;                                             \
  }                                                                        \
  template <typename T1, typename T2>                                      \
  auto operator OP(T1 a, Simd<T2, 1> b)->Simd<decltype(a OP b.value), 1> { \
    return a OP b.value;                                                   \
  }                                                                        \
  template <typename T1, typename T2>                                      \
  auto operator OP(Simd<T1, 1> a, T2 b)->Simd<decltype(a.value OP b), 1> { \
    return a.value OP b;                                                   \
  }

DEFAULT_BINARY(+)
DEFAULT_BINARY(-)
DEFAULT_BINARY(*)
DEFAULT_BINARY(/)
DEFAULT_BINARY(<<)
DEFAULT_BINARY(>>)
DEFAULT_BINARY(|)
DEFAULT_BINARY(^)
DEFAULT_BINARY(&)
DEFAULT_BINARY(&&)
DEFAULT_BINARY(||)

// Count of leading zero bits of the wrapped value, treated as a 32-bit
// quantity. A zero input yields 32 on every compiler.
template <typename T>
Simd<T, 1> clz(Simd<T, 1> x_) {
#ifdef _MSC_VER
  // MSVC doesn't have __builtin_clz, use _BitScanReverse instead
  unsigned long index;
  if (_BitScanReverse(&index, static_cast<unsigned long>(x_.value))) {
    return static_cast<T>(31 - index);
  }
  return static_cast<T>(32); // All zeros case
#else
  // The result of __builtin_clz is undefined for 0, so handle the all zeros
  // case explicitly (same answer as the MSVC branch above).
  if (x_.value == 0) {
    return static_cast<T>(32);
  }
  return __builtin_clz(x_.value);
#endif
}

// Remainder of a_ / b_ whose non-zero result carries the sign of the divisor.
// Integers use %, other types use std::remainder; for signed types the
// divisor is then added when the remainder is non-zero and its sign differs
// from the divisor's.
template <typename T>
Simd<T, 1> remainder(Simd<T, 1> a_, Simd<T, 1> b_) {
  T dividend = a_.value;
  T divisor = b_.value;
  T rem;
  if constexpr (!std::is_integral_v<T>) {
    rem = std::remainder(dividend, divisor);
  } else {
    rem = dividend % divisor;
  }
  if constexpr (std::is_signed_v<T>) {
    if (rem != 0) {
      const bool rem_negative = rem < 0;
      const bool divisor_negative = divisor < 0;
      if (rem_negative != divisor_negative) {
        rem += divisor;
      }
    }
  }
  return rem;
}

// Larger of the two wrapped values. For non-integral types a NaN in the
// first operand is returned as is; a NaN in the second operand falls out of
// the comparison below, which is false for NaN.
template <typename T>
Simd<T, 1> maximum(Simd<T, 1> a_, Simd<T, 1> b_) {
  T lhs = a_.value;
  T rhs = b_.value;
  if constexpr (!std::is_integral_v<T>) {
    if (std::isnan(lhs)) {
      return lhs;
    }
  }
  if (lhs > rhs) {
    return lhs;
  }
  return rhs;
}

// Smaller of the two wrapped values. For non-integral types a NaN in the
// first operand is returned as is; a NaN in the second operand falls out of
// the comparison below, which is false for NaN.
template <typename T>
Simd<T, 1> minimum(Simd<T, 1> a_, Simd<T, 1> b_) {
  T lhs = a_.value;
  T rhs = b_.value;
  if constexpr (!std::is_integral_v<T>) {
    if (std::isnan(lhs)) {
      return lhs;
    }
  }
  if (lhs < rhs) {
    return lhs;
  }
  return rhs;
}

// a raised to the power b. Non-integral types use std::pow; integers use
// exponentiation by squaring.
template <typename T>
Simd<T, 1> pow(Simd<T, 1> a, Simd<T, 1> b) {
  T base = a.value;
  T exp = b.value;
  if constexpr (!std::is_integral_v<T>) {
    return std::pow(base, exp);
  } else {
    if constexpr (std::is_signed_v<T>) {
      // Raising an integer to a negative power is undefined. Return 0, as
      // the vector implementation of pow does; without this guard the loop
      // below never terminates because shifting a negative value right
      // never reaches zero.
      if (exp < 0) {
        return T(0);
      }
    }
    T res = 1;
    while (exp) {
      // Multiply in the current power of base when the low bit is set.
      if (exp & 1) {
        res *= base;
      }
      exp >>= 1;
      base *= base;
    }
    return res;
  }
}

// std::atan2 of the two wrapped values.
template <typename T>
Simd<T, 1> atan2(Simd<T, 1> a, Simd<T, 1> b) {
  auto angle = std::atan2(a.value, b.value);
  return angle;
}

// Defines the three single-lane overloads of comparison operator OP:
// (Simd, Simd), (scalar, Simd) and (Simd, scalar). Each applies the built-in
// comparison to the wrapped values and returns the result as Simd<bool, 1>.
#define DEFAULT_COMPARISONS(OP)                             \
  template <typename T1, typename T2>                       \
  Simd<bool, 1> operator OP(Simd<T1, 1> a, Simd<T2, 1> b) { \
    return a.value OP b.value;                              \
  }                                                         \
  template <typename T1, typename T2>                       \
  Simd<bool, 1> operator OP(T1 a, Simd<T2, 1> b) {          \
    return a OP b.value;                                    \
  }                                                         \
  template <typename T1, typename T2>                       \
  Simd<bool, 1> operator OP(Simd<T1, 1> a, T2 b) {          \
    return a.value OP b;                                    \
  }

DEFAULT_COMPARISONS(>)
DEFAULT_COMPARISONS(<)
DEFAULT_COMPARISONS(>=)
DEFAULT_COMPARISONS(<=)
DEFAULT_COMPARISONS(==)
DEFAULT_COMPARISONS(!=)

// Returns `x` when the mask value is truthy and `y` otherwise.
template <typename MaskT, typename T>
Simd<T, 1> select(Simd<MaskT, 1> mask, Simd<T, 1> x, Simd<T, 1> y) {
  if (mask.value) {
    return x.value;
  }
  return y.value;
}

// std::clamp of the wrapped value to the range [min, max].
template <typename T>
Simd<T, 1> clamp(Simd<T, 1> v, Simd<T, 1> min, Simd<T, 1> max) {
  const T& bounded = std::clamp(v.value, min.value, max.value);
  return bounded;
}

// Fused multiply-add x * y + z via std::fma. `z` is first converted to
// Simd<T, 1>.
template <typename T, typename U>
Simd<T, 1> fma(Simd<T, 1> x, Simd<T, 1> y, U z) {
  Simd<T, 1> addend(z);
  return std::fma(x.value, y.value, addend.value);
}

// Reductions
// With a single lane every reduction simply returns the wrapped value,
// converted to the declared result type `type`.
#define DEFAULT_REDUCTION(name, type) \
  template <typename T>               \
  type name(Simd<T, 1> x) {           \
    return x.value;                   \
  }

DEFAULT_REDUCTION(max, T)
DEFAULT_REDUCTION(min, T)
DEFAULT_REDUCTION(sum, T)
DEFAULT_REDUCTION(prod, T)
DEFAULT_REDUCTION(any, bool)
DEFAULT_REDUCTION(all, bool)

} // namespace mlx::core::simd


================================================
FILE: mlx/backend/cpu/simd/math.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/cpu/simd/type.h"

namespace mlx::core::simd {

// Positive float infinity; exp() below returns it for inputs above its
// overflow threshold.
constexpr float inf = std::numeric_limits<float>::infinity();

/**
 * Compute exp(x) in an optimizer friendly way as follows:
 *
 * First change the problem to computing 2**y where y = x / ln(2).
 *
 * Now we will compute 2**y as 2**y1 * 2**y2 where y1 is the integer part
 * `ipart` and y2 is the fractional part `fpart`. For the integer part we
 * perform bit shifting and for the fractional part we use a polynomial
 * approximation.
 *
 * The algorithm and the constants of the polynomial are taken from
 * https://github.com/akohlmey/fastermath/blob/master/src/exp.c which took them
 * from the Cephes math library.
 *
 * The computation is carried out in float regardless of T. NaN inputs are
 * returned unchanged, inputs above 88 give +inf and inputs below -88 give 0.
 * Complex inputs are forwarded to std::exp; that branch builds a Simd<T, 1>,
 * so it presumably is only instantiated with N == 1.
 *
 * Note: The implementation below is a general fast exp. There could be faster
 *       implementations for numbers strictly < 0.
 */
template <typename T, int N>
Simd<T, N> exp(Simd<T, N> in) {
  if constexpr (is_complex<T>) {
    return Simd<T, 1>{std::exp(in.value)};
  } else {
    Simd<float, N> x_init = in;
    auto x = x_init * 1.442695f; // multiply with log_2(e)
    Simd<float, N> ipart, fpart;
    // Round to the nearest integer; fpart = x - ipart is the remaining
    // fraction.
    ipart = floor(x + 0.5);
    fpart = x - ipart;

    // Evaluate the polynomial approximation of 2**fpart with Horner's scheme
    x = 1.535336188319500e-4f;
    x = fma(x, fpart, 1.339887440266574e-3f);
    x = fma(x, fpart, 9.618437357674640e-3f);
    x = fma(x, fpart, 5.550332471162809e-2f);
    x = fma(x, fpart, 2.402264791363012e-1f);
    x = fma(x, fpart, 6.931472028550421e-1f);
    x = fma(x, fpart, 1.000000000000000f);

    // generate 2**ipart in the floating point representation using integer
    // bitshifting: add the exponent bias (127) and shift the result into the
    // exponent field (bit 23 and up)
    Simd<int, N> epart = (Simd<int, N>(ipart) + 127) << 23;

    // Deal with NaN and Inf. The integer lanes of epart are reinterpreted
    // (not converted) as float lanes before the multiplication.
    auto result = select(isnan(x_init), x_init, (*(Simd<float, N>*)&epart) * x);
    result = select(x_init > 88.0f, Simd<float, N>(inf), result);
    result = select(x_init < -88.0f, Simd<float, N>(0), result);
    return Simd<T, N>(result);
  }
}

/* Computes sin(in) when Sine is true and cos(in) otherwise. The arithmetic is
 * done in float: the argument is reduced with the help of the integer j
 * computed below, both a cosine-like and a sine-like polynomial are evaluated
 * on the reduced argument, and the result and its sign are picked with masks
 * derived from j and from the sign of the input.
 *
 * Implementation from:
 * https://github.com/JishinMaster/simd_utils/blob/3c1433a86fb38edcc9b02039f3c9a65b16640976/neon_mathfun.h#L357
 * which originally came from the Cephes math library.
 */
template <bool Sine, typename T, int N>
Simd<T, N> sincos(Simd<T, N> in) {
  // Remember which inputs were negative, then work on |in|
  auto sign_mask_sin = in < 0;
  in = abs(in);
  Simd<float, N> x = in;

  // scale by 4/Pi
  auto y = x * 1.27323954473516f;

  // store the integer part of y in emm2
  Simd<uint32_t, N> emm2 = y;

  // j=(j+1) & (~1) (see the cephes sources)
  emm2 = emm2 + 1;
  emm2 = emm2 & ~1;

  y = emm2;

  // Get the polynomial selection mask. One polynomial is used when bit 1 of
  // j is clear and the other one when it is set. Both polynomials are
  // computed and the right one is selected at the end.
  auto poly_mask = (emm2 & 2) != 0;

  // The magic pass: "Extended precision modular arithmetic"
  // x = ((x - y * DP1) - y * DP2) - y * DP3
  // where DP1 + DP2 + DP3 add up to Pi/4 split over three floats
  x = fma(y, Simd<float, N>(-0.78515625f), x);
  x = fma(y, Simd<float, N>(-2.4187564849853515625e-4f), x);
  x = fma(y, Simd<float, N>(-3.77489497744594108e-8f), x);

  // Sign of the sine: the sign of the input, flipped when bit 2 of j is set.
  // The cosine mask is set when its result must not be negated.
  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != 0);
  auto sign_mask_cos = ((emm2 - 2) & 4) != 0;

  // Evaluate both polynomials on the reduced argument x with z = x * x:
  //   y1 = 1 - z / 2 + z * z * p1(z)   (cosine-like polynomial)
  //   y2 = x + x * z * p2(z)           (sine-like polynomial)
  auto z = x * x;

  auto y1 =
      fma(z, Simd<float, N>(2.443315711809948e-5f), -1.388731625493765e-3f);
  auto y2 = fma(z, Simd<float, N>(-1.9515295891e-4f), 8.3321608736e-3f);
  y1 = fma(y1, z, 4.166664568298827e-2f);
  y2 = fma(y2, z, -1.6666654611e-1f);
  y1 = y1 * z;
  y2 = y2 * z;
  y1 = y1 * z;
  y2 = fma(x, y2, x);
  y1 = fma(z, Simd<float, N>(-0.5f), y1);
  y1 = y1 + 1.0f;

  if constexpr (Sine) {
    // Select the polynomial for the sine and negate where the mask is set
    auto ys = select(poly_mask, y1, y2);
    return select(sign_mask_sin, -ys, ys);
  } else {
    // The cosine uses the opposite polynomial choice; negate where the mask
    // is clear
    auto yc = select(poly_mask, y2, y1);
    return select(sign_mask_cos, yc, -yc);
  }
}

// Sine. Complex values go through std::sin, everything else through the
// vectorized sincos kernel.
template <typename T, int N>
Simd<T, N> sin(Simd<T, N> x) {
  if constexpr (!is_complex<T>) {
    return sincos<true>(x);
  } else {
    return std::sin(x.value);
  }
}

// Cosine. Complex values go through std::cos, everything else through the
// vectorized sincos kernel.
template <typename T, int N>
Simd<T, N> cos(Simd<T, N> x) {
  if constexpr (!is_complex<T>) {
    return sincos<false>(x);
  } else {
    return std::cos(x.value);
  }
}

// Approximate error function, evaluated in float. With
// t = 1 / (1 + 0.3275911 * |x|) and a degree-4 polynomial p(t), the value
// 1 - exp(-x * x) * t * p(t) is computed and returned as is where x > 0 and
// negated elsewhere.
template <typename T, int N>
Simd<T, N> erf(Simd<T, N> x) {
  // https://github.com/pytorch/pytorch/blob/abf28982a8cb43342e7669d859de9543fd804cc9/aten/src/ATen/cpu/vec/vec256/vec256_float.h#L175
  Simd<float, N> xf = x;
  auto t = recip(fma(Simd<float, N>(0.3275911f), abs(xf), 1.0f));
  // Horner evaluation of the polynomial in t
  auto poly = fma(Simd<float, N>(1.061405429f), t, -1.453152027f);
  poly = fma(poly, t, 1.421413741f);
  poly = fma(poly, t, -0.284496736f);
  poly = fma(poly, t, 0.254829592f);
  auto neg_gauss = -exp(-xf * xf);
  auto magnitude = Simd<T, N>(fma(neg_gauss * t, poly, 1.0f));
  return select(x > 0, magnitude, -magnitude);
}

// Approximate inverse error function, evaluated in float. With
// t = log(1 - a * a), the result is a * p(t) where p is one of two
// polynomials chosen by comparing |t| against a threshold.
template <typename T, int N>
Simd<T, N> erfinv(Simd<T, N> a_) {
  Simd<float, N> a = a_;
  // t = log(1 - a * a)
  auto t = fma(a, 0.0f - a, 1.0f);
  t = log(t);
  // Polynomial used when |t| is above the threshold
  auto lhs = [](auto t) {
    Simd<float, N> p;
    p = 3.03697567e-10f; //  0x1.4deb44p-32
    p = fma(p, t, 2.93243101e-8f); //  0x1.f7c9aep-26
    p = fma(p, t, 1.22150334e-6f); //  0x1.47e512p-20
    p = fma(p, t, 2.84108955e-5f); //  0x1.dca7dep-16
    p = fma(p, t, 3.93552968e-4f); //  0x1.9cab92p-12
    p = fma(p, t, 3.02698812e-3f); //  0x1.8cc0dep-9
    p = fma(p, t, 4.83185798e-3f); //  0x1.3ca920p-8
    p = fma(p, t, -2.64646143e-1f); // -0x1.0eff66p-2
    return fma(p, t, 8.40016484e-1f); //  0x1.ae16a4p-1
  };
  // Polynomial used otherwise
  auto rhs = [](auto t) {
    Simd<float, N> p;
    p = 5.43877832e-9f; //  0x1.75c000p-28
    p = fma(p, t, 1.43285448e-7f); //  0x1.33b402p-23
    p = fma(p, t, 1.22774793e-6f); //  0x1.499232p-20
    p = fma(p, t, 1.12963626e-7f); //  0x1.e52cd2p-24
    p = fma(p, t, -5.61530760e-5f); // -0x1.d70bd0p-15
    p = fma(p, t, -1.47697632e-4f); // -0x1.35be90p-13
    p = fma(p, t, 2.31468678e-3f); //  0x1.2f6400p-9
    p = fma(p, t, 1.15392581e-2f); //  0x1.7a1e50p-7
    p = fma(p, t, -2.32015476e-1f); // -0x1.db2aeep-3
    return fma(p, t, 8.86226892e-1f); //  0x1.c5bf88p-1
  };
  auto thresh = 6.125f;
  // Compute both branches and select if N > 1; with a single lane only the
  // polynomial that is needed gets evaluated
  if constexpr (N == 1) {
    if ((abs(t) > thresh).value) { // maximum ulp error = 2.35793
      return a * lhs(t);
    } else { // maximum ulp error = 2.35002
      return a * rhs(t);
    }
  } else {
    return a * select(abs(t) > thresh, lhs(t), rhs(t));
  }
}

} // namespace mlx::core::simd


================================================
FILE: mlx/backend/cpu/simd/neon_fp16_simd.h
================================================
#pragma once

#include <arm_neon.h>

#include "mlx/backend/cpu/simd/base_simd.h"

namespace mlx::core::simd {

// Lane count of the float16 vector type defined in this header (a
// float16x8_t holds 8 lanes).
constexpr int N = 8;

template <>
struct Simd<float16_t, N> {
  static constexpr int size = N;
  using scalar_t = float16_t;

  Simd<float16_t, N>() {}

  template <typename U>
  Simd<float16_t, N>(U v) : value(vdupq_n_f16(v)){};

  Simd<float16_t, N>(float16x8_t v) : value(v){};

  Simd<float16_t, N>(Simd<float, N> other) {
    auto f32x4_a = *(float32x4_t*)(&other);
    auto f32x4_b = *((float32x4_t*)(&other) + 1);
    value = vcvt_high_f16_f32(vcvt_f16_f32(f32x4_a), f32x4_b);
  };

  Simd<float16_t, N>(Simd<uint16_t, N> other) {
    value = vcvtq_f16_u16(*(uint16x8_t*)(&other.value));
  };

  operator Simd<int16_t, N>() {
    auto v = vcvtq_s16_f16(value);
    return load<int16_t, N>((int16_t*)&v);
  };

  operator Simd<float, N>() {
    float32x4x2_t v;
    v.val[0] = vcvt_f32_f16(*(float16x4_t*)(&value));
    v.val[1] = vcvt_high_f32_f16(value);
    return load<float, N>((float*)&v);
  }
  float16_t operator[](int idx) const {
    return reinterpret_cast<const float16_t*>(&value)[idx];
  }

  float16_t& operator[](int idx) {
    return reinterpret_cast<float16_t*>(&value)[idx];
  }

  float16x8_t value;
};

// Defines a unary function `name` on the float16 vector that applies the
// NEON intrinsic `op` to the underlying register.
#define DEFINE_NEON_UNARY_OP(name, op)                   \
  inline Simd<float16_t, N> name(Simd<float16_t, N> a) { \
    return Simd<float16_t, N>{op(a.value)};              \
  }

// NOTE(review): rsqrt and recip map to vrsqrteq_f16 / vrecpeq_f16, which the
// Arm intrinsics reference describes as estimate instructions -- confirm the
// reduced precision is acceptable for callers.
DEFINE_NEON_UNARY_OP(abs, vabsq_f16)
DEFINE_NEON_UNARY_OP(ceil, vrndpq_f16)
DEFINE_NEON_UNARY_OP(floor, vrndmq_f16)
DEFINE_NEON_UNARY_OP(sqrt, vsqrtq_f16)
DEFINE_NEON_UNARY_OP(rsqrt, vrsqrteq_f16)
DEFINE_NEON_UNARY_OP(recip, vrecpeq_f16)
DEFINE_NEON_UNARY_OP(rint, vrndnq_f16)

// Defines the three overloads of a binary function `name` on the float16
// vector: (Simd, Simd), (Simd, T) and (T, Simd). A non-Simd operand is first
// converted to Simd<float16_t, N>, then the NEON intrinsic `op` is applied to
// the two registers.
#define DEFINE_NEON_BINARY_OP(name, op)                                        \
  inline Simd<float16_t, N> name(Simd<float16_t, N> a, Simd<float16_t, N> b) { \
    return op(a.value, b.value);                                               \
  }                                                                            \
  template <typename T>                                                        \
  Simd<float16_t, N> name(Simd<float16_t, N> a, T b) {                         \
    return op(a.value, Simd<float16_t, N>(b).value);                           \
  }                                                                            \
  template <typename T>                                                        \
  Simd<float16_t, N> name(T a, Simd<float16_t, N> b) {                         \
    return op(Simd<float16_t, N>(a).value, b.value);                           \
  }

// Lane-wise logical not: compares every lane against zero.
inline Simd<float16_t, N> operator!(Simd<float16_t, N> v) {
  auto out = vceqzq_f16(v.value);
  // Load all eight mask lanes. Dereferencing the mask as a single uint16_t
  // would read lane 0 only and broadcast it to every lane of the result.
  return load<uint16_t, N>((uint16_t*)&out);
}

// Lane-wise negation.
inline Simd<float16_t, N> operator-(Simd<float16_t, N> v) {
  float16x8_t negated = vnegq_f16(v.value);
  return negated;
}

// Lane-wise min/max and arithmetic operators on float16 vectors, each mapped
// onto the corresponding NEON intrinsic.
DEFINE_NEON_BINARY_OP(maximum, vmaxq_f16)
DEFINE_NEON_BINARY_OP(minimum, vminq_f16)
DEFINE_NEON_BINARY_OP(operator+, vaddq_f16)
DEFINE_NEON_BINARY_OP(operator-, vsubq_f16)
DEFINE_NEON_BINARY_OP(operator*, vmulq_f16)
DEFINE_NEON_BINARY_OP(operator/, vdivq_f16)

// Defines the three overloads of comparison operator `Op` on the float16
// vector: (Simd, T), (T, Simd) and (Simd, Simd). A non-Simd operand is first
// converted to Simd<float16_t, N>, then the NEON intrinsic `op` produces a
// 16-bit mask per lane. All eight mask lanes are loaded and converted to
// Simd<bool, N>; dereferencing the mask as a single uint16_t would read lane
// 0 only and broadcast it to every lane of the result.
#define DEFINE_NEON_COMPARISON(Op, op)                   \
  template <typename T>                                  \
  Simd<bool, N> operator Op(Simd<float16_t, N> a, T b) { \
    auto out = op(a.value, Simd<float16_t, N>(b).value); \
    return load<uint16_t, N>((uint16_t*)(&out));         \
  }                                                      \
  template <typename T>                                  \
  Simd<bool, N> operator Op(T a, Simd<float16_t, N> b) { \
    auto out = op(Simd<float16_t, N>(a).value, b.value); \
    return load<uint16_t, N>((uint16_t*)(&out));         \
  }                                                      \
  inline Simd<bool, N> operator Op(                      \
      Simd<float16_t, N> a, Simd<float16_t, N> b) {      \
    auto out = op(a.value, b.value);                     \
    return load<uint16_t, N>((uint16_t*)(&out));         \
  }

DEFINE_NEON_COMPARISON(==, vceqq_f16)
DEFINE_NEON_COMPARISON(>=, vcgeq_f16)
DEFINE_NEON_COMPARISON(<=, vcleq_f16)
DEFINE_NEON_COMPARISON(>, vcgtq_f16)
DEFINE_NEON_COMPARISON(<, vcltq_f16)

// Inequality is the lane-wise negation of equality.
template <typename T>
Simd<bool, N> operator!=(Simd<float16_t, N> a, T b) {
  auto equal = (a == b);
  return !equal;
}
template <typename T>
Simd<bool, N> operator!=(T a, Simd<float16_t, N> b) {
  auto equal = (a == b);
  return !equal;
}
inline Simd<bool, N> operator!=(Simd<float16_t, N> a, Simd<float16_t, N> b) {
  auto equal = (a == b);
  return !equal;
}

// Logical or: each operand is first tested against zero, the two tests are
// combined, and the outcome is converted through Simd<uint16_t, N> to the
// float16 result type.
inline Simd<float16_t, N> operator||(
    Simd<float16_t, N> a,
    Simd<float16_t, N> b) {
  auto lhs = (a != 0);
  auto rhs = (b != 0);
  return Simd<uint16_t, N>(lhs || rhs);
}
template <typename T>
Simd<float16_t, N> operator||(Simd<float16_t, N> a, T b) {
  auto lhs = (a != 0);
  auto rhs = (b != 0);
  return Simd<uint16_t, N>(lhs || rhs);
}
template <typename T>
Simd<float16_t, N> operator||(T a, Simd<float16_t, N> b) {
  auto lhs = (a != 0);
  auto rhs = (b != 0);
  return Simd<uint16_t, N>(lhs || rhs);
}
// Logical and: same scheme as logical or above.
inline Simd<float16_t, N> operator&&(
    Simd<float16_t, N> a,
    Simd<float16_t, N> b) {
  auto lhs = (a != 0);
  auto rhs = (b != 0);
  return Simd<uint16_t, N>(lhs && rhs);
}
template <typename T>
Simd<float16_t, N> operator&&(Simd<float16_t, N> a, T b) {
  auto lhs = (a != 0);
  auto rhs = (b != 0);
  return Simd<uint16_t, N>(lhs && rhs);
}
template <typename T>
Simd<float16_t, N> operator&&(T a, Simd<float16_t, N> b) {
  auto lhs = (a != 0);
  auto rhs = (b != 0);
  return Simd<uint16_t, N>(lhs && rhs);
}

// A lane is NaN exactly when it does not compare equal to itself.
template <>
inline Simd<bool, N> isnan(Simd<float16_t, N> v) {
  auto not_self_equal = (v != v);
  return not_self_equal;
}

// Clamp to [min, max]: raise to the lower bound first, then cap at the upper
// bound.
template <>
inline Simd<float16_t, N>
clamp(Simd<float16_t, N> v, Simd<float16_t, N> min, Simd<float16_t, N> max) {
  auto raised = maximum(v, min);
  return minimum(raised, max);
}

// Fused multiply-add x * y + z on float16 lanes. `z` may be a scalar or a
// vector; it is first converted to Simd<float16_t, N>.
template <typename T>
Simd<float16_t, N> fma(Simd<float16_t, N> x, Simd<float16_t, N> y, T z) {
  // vfmaq_f16(a, b, c) computes a + b * c, so the addend must be passed as
  // the first argument to obtain x * y + z (matching the other fma
  // overloads).
  return vfmaq_f16(Simd<float16_t, N>(z).value, x.value, y.value);
}

// Bitwise select between `x` and `y`: the mask is converted to 16-bit lanes
// and result bits come from `x` where the mask bit is set, from `y`
// otherwise.
template <typename MaskT>
Simd<float16_t, N>
select(Simd<MaskT, N> mask, Simd<float16_t, N> x, Simd<float16_t, N> y) {
  Simd<uint16_t, N> lane_mask(mask);
  return vbslq_f16(lane_mask.value, x.value, y.value);
}

// Reductions
// Each reduction combines the low and high halves pairwise, folds the
// resulting four lanes twice more, and returns lane 0.
inline float16_t max(Simd<float16_t, N> x) {
  float16x4_t acc =
      vpmax_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  acc = vpmax_f16(acc, acc);
  acc = vpmax_f16(acc, acc);
  return vget_lane_f16(acc, 0);
}
inline float16_t min(Simd<float16_t, N> x) {
  float16x4_t acc =
      vpmin_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  acc = vpmin_f16(acc, acc);
  acc = vpmin_f16(acc, acc);
  return vget_lane_f16(acc, 0);
}
inline float16_t sum(Simd<float16_t, N> x) {
  float16x4_t acc =
      vpadd_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  acc = vpadd_f16(acc, acc);
  acc = vpadd_f16(acc, acc);
  return vget_lane_f16(acc, 0);
}
// Product of all eight lanes: multiply the low half by the high half, then
// fold the four partial products into lane 0.
inline float16_t prod(Simd<float16_t, N> x) {
  auto hx = vmul_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  hx[0] *= hx[1];
  hx[0] *= hx[2];
  hx[0] *= hx[3];
  return hx[0];
}

} // namespace mlx::core::simd


================================================
FILE: mlx/backend/cpu/simd/simd.h
================================================
#pragma once

#include "mlx/backend/cpu/simd/math.h"
#include "mlx/backend/cpu/simd/type.h"


================================================
FILE: mlx/backend/cpu/simd/type.h
================================================
#pragma once

#include "mlx/backend/cpu/simd/base_simd.h"

#ifdef MLX_USE_ACCELERATE
#if defined(__x86_64__)
// the accelerate_simd implementation require neon -- use base implementation
#else
#include "mlx/backend/cpu/simd/accelerate_simd.h"
#endif
#endif


================================================
FILE: mlx/backend/cpu/slicing.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/array.h"

namespace mlx::core {

// Computes what is needed to slice `in` starting at `start_indices` with the
// given per-axis `strides`.
// NOTE(review): the returned pair is presumably the data offset of the first
// element and the strides of the sliced output -- confirm against the
// definition.
std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
    const Shape& start_indices,
    const Shape& strides);

// Sets up `out` from `in` using the given output strides, data offset and
// data size.
// NOTE(review): going by the name, `out` shares the buffer of `in` rather
// than copying it -- confirm against the definition.
void shared_buffer_slice(
    const array& in,
    const Strides& out_strides,
    size_t data_offset,
    size_t data_size,
    array& out);

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/softmax.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cassert>
#include <cmath>

#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/primitives.h"
#include "mlx/types/limits.h"

namespace mlx::core {

namespace {

using namespace mlx::core::simd;

template <typename T, typename AccT>
void softmax(const array& in, array& out, Stream stream) {
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(in);
  encoder.set_output_array(out);

  const T* in_ptr = in.data<T>();
  T* out_ptr = out.data<T>();

  int M = in.shape().back();
  int L = in.data_size() / M;

  encoder.dispatch([in_ptr, out_ptr, M, L]() mutable {
    constexpr bool same_t = std::is_same_v<T, AccT>;
    constexpr int N = std::min(max_size<AccT>, max_size<T>);

    const T* current_in_ptr;
    T* current_out_ptr;

    for (int i = 0; i < L; i++, in_ptr += M, out_ptr += M) {
      // Find the maximum
      current_in_ptr = in_ptr;
      Simd<AccT, N> vmaximum(-numeric_limits<AccT>::infinity());
      size_t s = M;
      while (s >= N) {
        Simd<AccT, N> vals = load<T, N>(current_in_ptr);
        vmaximum = maximum(vals, vmaximum);
        current_in_ptr += N;
        s -= N;
      }

      AccT maximum = max(vmaximum);
      while (s-- > 0) {
        maximum = std::max(maximum, static_cast<AccT>(*current_in_ptr));
        current_in_ptr++;
      }

      // Compute the normalizer and the exponentials
      Simd<AccT, N> vnormalizer(0.0);
      current_out_ptr = out_ptr;
      current_in_ptr = in_ptr;
      s = M;
      while (s >= N) {
        Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
        vexp = exp(vexp - maximum);
        if constexpr (same_t) {
          store(current_out_ptr, vexp);
        }
        vnormalizer = vnormalizer + vexp;
        current_in_ptr += N;
        current_out_ptr += N;
        s -= N;
      }
      AccT normalizer = sum(vnormalizer);
      while (s-- > 0) {
        AccT _exp = std::exp(*current_in_ptr - maximum);
        if constexpr (same_t) {
          *current_out_ptr = _exp;
        }
        normalizer += _exp;
        current_in_ptr++;
        current_out_ptr++;
      }
      normalizer = 1 / normalizer;

      // Normalize
      current_out_ptr = out_ptr;
      current_in_ptr = in_ptr;
      s = M;
      while (s >= N) {
        if constexpr (same_t) {
          store(
              current_out_ptr,
              Simd<T, N>(load<T, N>(current_out_ptr) * normalizer));
        } else {
          Simd<AccT, N> vexp = load<T, N>(current_in_ptr);
          vexp = exp(vexp - maximum) * normalizer;
          store(current_out_ptr, Simd<T, N>(vexp));
          current_in_ptr += N;
        }
        current_out_ptr += N;
        s -= N;
      }
      while (s-- > 0) {
        if constexpr (same_t) {
          *current_out_ptr *= normalizer;
        } else {
          AccT _exp = std::exp(*current_in_ptr - maximum);
          *current_out_ptr = static_cast<T>(_exp * normalizer);
          current_in_ptr++;
        }
        current_out_ptr++;
      }
    }
  });
}

} // namespace

// CPU evaluation of Softmax: prepares the output buffer, then dispatches the
// kernel matching the input dtype (accumulating in float for half types when
// `precise_` is set).
void Softmax::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  // Returns the array the kernel should read from and sets up `out`'s data.
  // The input is used directly when it is contiguous with a unit stride on
  // the last axis; otherwise a contiguous copy is made and `out` shares it.
  auto prepare = [strm = stream(), &out](const array& x) {
    const bool usable =
        x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (!usable) {
      array dense = contiguous_copy_cpu(x, strm);
      out.copy_shared_buffer(dense);
      return dense;
    }
    if (x.is_donatable()) {
      out.copy_shared_buffer(x);
    } else {
      out.set_data(
          allocator::malloc(x.data_size() * x.itemsize()),
          x.data_size(),
          x.strides(),
          x.flags());
    }
    return x;
  };

  auto in = prepare(inputs[0]);

  switch (in.dtype()) {
    case float32:
      softmax<float, float>(in, out, stream());
      return;
    case float64:
      softmax<double, double>(in, out, stream());
      return;
    case float16:
      if (precise_) {
        softmax<float16_t, float>(in, out, stream());
      } else {
        softmax<float16_t, float16_t>(in, out, stream());
      }
      return;
    case bfloat16:
      if (precise_) {
        softmax<bfloat16_t, float>(in, out, stream());
      } else {
        softmax<bfloat16_t, bfloat16_t>(in, out, stream());
      }
      return;
    default:
      throw std::runtime_error(
          "[softmax] Only defined for floating point types.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/sort.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

template <typename T>
inline constexpr bool is_floating_v = std::is_floating_point_v<T> ||
    std::is_same_v<T, float16_t> || std::is_same_v<T, bfloat16_t>;

// Less-than comparison that orders NaNs after every other value, so that
// sorting with it places NaNs at the end. The NaN checks are only compiled
// for floating point and complex64_t element types.
template <typename T>
bool nan_aware_less(T a, T b) {
  constexpr bool may_hold_nan =
      is_floating_v<T> || std::is_same_v<T, complex64_t>;
  if constexpr (may_hold_nan) {
    if (std::isnan(a)) {
      return false;
    }
    if (std::isnan(b)) {
      return true;
    }
  }
  return a < b;
}

// Random access iterator over elements of type T that are `stride` elements
// apart in memory. It lets the standard algorithms (sort, nth_element, iota)
// run along one axis of a strided buffer.
//
// Iterators are only meaningfully comparable / subtractable when they refer
// to the same sequence, i.e. share the same stride.
template <typename T>
struct StridedIterator {
  using iterator_category = std::random_access_iterator_tag;
  using difference_type = int32_t;
  using value_type = T;
  using reference = value_type&;
  using pointer = value_type*;

  // Constructors
  StridedIterator() = default;

  // Iterator positioned `offset` strided steps after `ptr`.
  explicit StridedIterator(T* ptr, int64_t stride, difference_type offset = 0)
      : stride_(stride), ptr_(ptr + offset * stride) {}

  // Iterator along `axis` of `arr`, starting at the array's data pointer.
  explicit StridedIterator(array& arr, int axis, difference_type offset = 0)
      : StridedIterator(arr.data<T>(), arr.strides()[axis], offset) {}

  // Accessors
  reference operator*() const {
    return ptr_[0];
  }

  reference operator[](difference_type idx) const {
    return ptr_[idx * stride_];
  }

  // Comparisons
  bool operator==(const StridedIterator& other) const {
    return ptr_ == other.ptr_ && stride_ == other.stride_;
  }

  // Defined as the exact negation of operator== so the two never disagree.
  bool operator!=(const StridedIterator& other) const {
    return !(*this == other);
  }

  bool operator<(const StridedIterator& other) const {
    return ptr_ < other.ptr_;
  }

  bool operator>(const StridedIterator& other) const {
    return ptr_ > other.ptr_;
  }

  bool operator<=(const StridedIterator& other) const {
    return ptr_ <= other.ptr_;
  }

  bool operator>=(const StridedIterator& other) const {
    return ptr_ >= other.ptr_;
  }

  // Number of strided steps between two iterators of the same sequence.
  difference_type operator-(const StridedIterator& other) const {
    return (ptr_ - other.ptr_) / stride_;
  }

  // Moving
  StridedIterator& operator++() {
    ptr_ += stride_;
    return *this;
  }

  StridedIterator operator++(int) {
    StridedIterator previous = *this;
    ptr_ += stride_;
    return previous;
  }

  StridedIterator& operator--() {
    ptr_ -= stride_;
    return *this;
  }

  StridedIterator operator--(int) {
    StridedIterator previous = *this;
    ptr_ -= stride_;
    return previous;
  }

  StridedIterator& operator+=(difference_type diff) {
    ptr_ += diff * stride_;
    return *this;
  }

  StridedIterator& operator-=(difference_type diff) {
    ptr_ -= diff * stride_;
    return *this;
  }

  // Const-qualified so they can be applied to const iterators.
  StridedIterator operator+(difference_type diff) const {
    return StridedIterator(ptr_, stride_, diff);
  }

  StridedIterator operator-(difference_type diff) const {
    return StridedIterator(ptr_, stride_, -diff);
  }

 private:
  // Default values keep a default-constructed iterator in a defined state.
  int64_t stride_{1};
  T* ptr_{nullptr};
};

// Sorts `out` in place along `axis` (negative values count from the end).
// Every 1-D lane along the axis is sorted independently with a stable sort
// that places NaNs last.
template <typename T>
void sort(array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;

  auto axis_size = out.shape(axis);
  if (axis_size == 0) {
    // Nothing to sort; this also avoids dividing by zero below.
    return;
  }
  size_t n_rows = out.size() / axis_size;

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);

  auto remaining_strides = out.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

  auto axis_stride = out.strides()[axis];

  // Perform sorting in place
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
  // size_t index: n_rows is a size_t and may exceed the range of int.
  for (size_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

    std::stable_sort(st, ed, nan_aware_less<T>);
    src_it.step();
  }
}

// For every 1-D lane of `in` along `axis` (negative values count from the
// end), writes into the matching lane of `out` the indices that sort the lane
// in ascending order. NaNs of floating point inputs are ordered last and
// equal values keep their original relative order.
template <typename T, typename IdxT = uint32_t>
void argsort(const array& in, array& out, int axis) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;

  auto axis_size = in.shape(axis);
  if (axis_size == 0) {
    // Nothing to sort; this also avoids dividing by zero below.
    return;
  }
  size_t n_rows = in.size() / axis_size;

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);

  auto in_remaining_strides = in.strides();
  in_remaining_strides.erase(in_remaining_strides.begin() + axis);

  auto out_remaining_shape = out.shape();
  out_remaining_shape.erase(out_remaining_shape.begin() + axis);

  auto out_remaining_strides = out.strides();
  out_remaining_strides.erase(out_remaining_strides.begin() + axis);

  auto in_stride = in.strides()[axis];
  auto out_stride = out.strides()[axis];

  // Perform sorting
  ContiguousIterator in_it(
      in_remaining_shape, in_remaining_strides, in_remaining_shape.size());
  ContiguousIterator out_it(
      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());
  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();
  // size_t index: n_rows is a size_t and may exceed the range of int.
  for (size_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;

    in_it.step();
    out_it.step();

    StridedIterator st(idx_ptr, out_stride, 0);
    StridedIterator ed(idx_ptr, out_stride, axis_size);

    // Initialize with iota (the iterators are passed by value, so `st` and
    // `ed` can be reused for the sort below)
    std::iota(st, ed, IdxT(0));

    // Sort the indices according to the values they refer to
    std::stable_sort(st, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];

      // Handle NaNs (place them at the end)
      if constexpr (is_floating_v<T>) {
        if (std::isnan(v1))
          return false;
        if (std::isnan(v2))
          return true;
      }

      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
}

// Partially sorts `out` in place along `axis` so that, in every 1-D lane, the
// element at position `kth` is the one a full sort would put there
// (std::nth_element semantics, NaNs ordered last). Negative `axis` and `kth`
// count from the end.
template <typename T>
void partition(array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + out.ndim() : axis;

  int axis_size = out.shape(axis);
  if (axis_size == 0) {
    // Nothing to partition; this also avoids dividing by zero below.
    return;
  }
  size_t n_rows = out.size() / axis_size;

  auto remaining_shape = out.shape();
  remaining_shape.erase(remaining_shape.begin() + axis);

  auto remaining_strides = out.strides();
  remaining_strides.erase(remaining_strides.begin() + axis);

  auto axis_stride = out.strides()[axis];

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition in place
  ContiguousIterator src_it(
      remaining_shape, remaining_strides, remaining_shape.size());
  auto out_ptr = out.data<T>();
  // size_t index: n_rows is a size_t and may exceed the range of int.
  for (size_t i = 0; i < n_rows; i++) {
    T* data_ptr = out_ptr + src_it.loc;
    src_it.step();

    StridedIterator st(data_ptr, axis_stride, 0);
    StridedIterator md(data_ptr, axis_stride, kth);
    StridedIterator ed(data_ptr, axis_stride, axis_size);

    std::nth_element(st, md, ed, nan_aware_less<T>);
  }
}

// Index version of partition: for every 1-D lane of `in` along `axis`, fills
// the matching lane of `out` with indices arranged so that the index at
// position `kth` refers to the element a full sort would put there
// (std::nth_element semantics). NaNs of floating point inputs are ordered
// last. Negative `axis` and `kth` count from the end.
template <typename T, typename IdxT = uint32_t>
void argpartition(const array& in, array& out, int axis, int kth) {
  // Get axis, shape and stride info
  axis = axis < 0 ? axis + in.ndim() : axis;

  auto axis_size = in.shape(axis);
  if (axis_size == 0) {
    // Nothing to partition; this also avoids dividing by zero below.
    return;
  }
  size_t n_rows = in.size() / axis_size;

  auto in_remaining_shape = in.shape();
  in_remaining_shape.erase(in_remaining_shape.begin() + axis);

  auto in_remaining_strides = in.strides();
  in_remaining_strides.erase(in_remaining_strides.begin() + axis);

  auto out_remaining_shape = out.shape();
  out_remaining_shape.erase(out_remaining_shape.begin() + axis);

  auto out_remaining_strides = out.strides();
  out_remaining_strides.erase(out_remaining_strides.begin() + axis);

  auto in_stride = in.strides()[axis];
  auto out_stride = out.strides()[axis];

  kth = kth < 0 ? kth + axis_size : kth;

  // Perform partition
  ContiguousIterator in_it(
      in_remaining_shape, in_remaining_strides, in_remaining_shape.size());
  ContiguousIterator out_it(
      out_remaining_shape, out_remaining_strides, out_remaining_shape.size());

  auto in_ptr = in.data<T>();
  auto out_ptr = out.data<IdxT>();

  // size_t index: n_rows is a size_t and may exceed the range of int.
  for (size_t i = 0; i < n_rows; i++) {
    const T* data_ptr = in_ptr + in_it.loc;
    IdxT* idx_ptr = out_ptr + out_it.loc;
    in_it.step();
    out_it.step();

    StridedIterator st(idx_ptr, out_stride, 0);
    StridedIterator md(idx_ptr, out_stride, kth);
    StridedIterator ed(idx_ptr, out_stride, axis_size);

    // Initialize with iota (the iterators are passed by value, so `st` and
    // `ed` can be reused for the partition below)
    std::iota(st, ed, IdxT(0));

    // Partition the indices according to the values they refer to
    std::nth_element(st, md, ed, [data_ptr, in_stride](IdxT a, IdxT b) {
      auto v1 = data_ptr[a * in_stride];
      auto v2 = data_ptr[b * in_stride];

      // Handle NaNs (place them at the end)
      if constexpr (is_floating_v<T>) {
        if (std::isnan(v1))
          return false;
        if (std::isnan(v2))
          return true;
      }

      return v1 < v2 || (v1 == v2 && a < b);
    });
  }
}

} // namespace

// CPU evaluation of ArgSort: allocates `out` and dispatches the argsort
// kernel instantiated for the input dtype.
void ArgSort::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  // Allocate output
  out.set_data(allocator::malloc(out.nbytes()));

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(in);
  // `out` is written by the kernel, so it is registered as an output (as
  // Sort::eval_cpu does), not as an input.
  encoder.set_output_array(out);
  encoder.dispatch([in = array::unsafe_weak_copy(in),
                    out = array::unsafe_weak_copy(out),
                    axis_ = axis_]() mutable {
    switch (in.dtype()) {
      case bool_:
        return argsort<bool>(in, out, axis_);
      case uint8:
        return argsort<uint8_t>(in, out, axis_);
      case uint16:
        return argsort<uint16_t>(in, out, axis_);
      case uint32:
        return argsort<uint32_t>(in, out, axis_);
      case uint64:
        return argsort<uint64_t>(in, out, axis_);
      case int8:
        return argsort<int8_t>(in, out, axis_);
      case int16:
        return argsort<int16_t>(in, out, axis_);
      case int32:
        return argsort<int32_t>(in, out, axis_);
      case int64:
        return argsort<int64_t>(in, out, axis_);
      case float32:
        return argsort<float>(in, out, axis_);
      case float64:
        return argsort<double>(in, out, axis_);
      case float16:
        return argsort<float16_t>(in, out, axis_);
      case bfloat16:
        return argsort<bfloat16_t>(in, out, axis_);
      case complex64:
        return argsort<complex64_t>(in, out, axis_);
    }
  });
}

void Sort::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  int axis = axis_;
  if (axis < 0) {
    axis += in.ndim();
  }

  // Copy input to output
  CopyType ctype = (in.flags().contiguous && in.strides()[axis] != 0)
      ? CopyType::Vector
      : CopyType::General;
  copy_cpu(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
  encoder.dispatch([out = array::unsafe_weak_copy(out), axis]() mutable {
    dispatch_all_types(out.dtype(), [&](auto type_tag) {
      sort<MLX_GET_TYPE(type_tag)>(out, axis);
    });
  });
}

// CPU evaluation of ArgPartition: allocates `out` and dispatches the
// argpartition kernel instantiated for the input dtype.
void ArgPartition::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  // Allocate output
  out.set_data(allocator::malloc(out.nbytes()));

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_input_array(in);
  // `out` is written by the kernel, so it is registered as an output (as
  // Sort::eval_cpu does), not as an input.
  encoder.set_output_array(out);
  encoder.dispatch([in = array::unsafe_weak_copy(in),
                    out = array::unsafe_weak_copy(out),
                    axis_ = axis_,
                    kth_ = kth_]() mutable {
    switch (in.dtype()) {
      case bool_:
        return argpartition<bool>(in, out, axis_, kth_);
      case uint8:
        return argpartition<uint8_t>(in, out, axis_, kth_);
      case uint16:
        return argpartition<uint16_t>(in, out, axis_, kth_);
      case uint32:
        return argpartition<uint32_t>(in, out, axis_, kth_);
      case uint64:
        return argpartition<uint64_t>(in, out, axis_, kth_);
      case int8:
        return argpartition<int8_t>(in, out, axis_, kth_);
      case int16:
        return argpartition<int16_t>(in, out, axis_, kth_);
      case int32:
        return argpartition<int32_t>(in, out, axis_, kth_);
      case int64:
        return argpartition<int64_t>(in, out, axis_, kth_);
      case float32:
        return argpartition<float>(in, out, axis_, kth_);
      case float64:
        return argpartition<double>(in, out, axis_, kth_);
      case float16:
        return argpartition<float16_t>(in, out, axis_, kth_);
      case bfloat16:
        return argpartition<bfloat16_t>(in, out, axis_, kth_);
      case complex64:
        return argpartition<complex64_t>(in, out, axis_, kth_);
    }
  });
}

void Partition::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  // Copy input to output
  CopyType ctype = (in.flags().contiguous && in.strides()[axis_] != 0)
      ? CopyType::Vector
      : CopyType::General;
  copy_cpu(in, out, ctype, stream());

  auto& encoder = cpu::get_command_encoder(stream());
  encoder.set_output_array(out);
  encoder.dispatch([out = array::unsafe_weak_copy(out),
                    axis_ = axis_,
                    kth_ = kth_]() mutable {
    switch (out.dtype()) {
      case bool_:
        return partition<bool>(out, axis_, kth_);
      case uint8:
        return partition<uint8_t>(out, axis_, kth_);
      case uint16:
        return partition<uint16_t>(out, axis_, kth_);
      case uint32:
        return partition<uint32_t>(out, axis_, kth_);
      case uint64:
        return partition<uint64_t>(out, axis_, kth_);
      case int8:
        return partition<int8_t>(out, axis_, kth_);
      case int16:
        return partition<int16_t>(out, axis_, kth_);
      case int32:
        return partition<int32_t>(out, axis_, kth_);
      case int64:
        return partition<int64_t>(out, axis_, kth_);
      case float32:
        return partition<float>(out, axis_, kth_);
      case float64:
        return partition<double>(out, axis_, kth_);
      case float16:
        return partition<float16_t>(out, axis_, kth_);
      case bfloat16:
        return partition<bfloat16_t>(out, axis_, kth_);
      case complex64:
        return partition<complex64_t>(out, axis_, kth_);
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/svd.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/allocator.h"
#include "mlx/backend/cpu/copy.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/lapack.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Holder for the LAPACK parameters and workspace buffers of an SVD over
// element type T. The primary template is intentionally empty; the usable
// implementations are the specializations below (built-in floating point
// types via the `Enable` parameter, and std::complex<float>).
template <typename T, class Enable = void>
struct SVDWork {};

// SVD workspace for real floating point element types.
//
// The constructor queries gesdd for the optimal workspace size and allocates
// the buffers once; run() can then be called repeatedly on matrices of the
// same size. N and M are passed to lapack as its row and column counts
// respectively; the caller swaps them to account for lapack's column-major
// convention.
template <typename T>
struct SVDWork<
    T,
    typename std::enable_if<std::is_floating_point<T>::value>::type> {
  using R = T;

  int N;
  int M;
  int K;
  int lda;
  int ldu;
  int ldvt;
  char jobz;
  // buffers[0]: integer workspace (iwork), buffers[1]: work array.
  std::vector<array::Data> buffers;
  int lwork;

  SVDWork(int N, int M, int K, char jobz)
      : N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
    T workspace_dimension = 0;

    // Integer workspace required by gesdd (8 * K entries).
    buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));

    // lwork == -1 asks lapack to only report the optimal workspace size.
    int lwork_query = -1;
    int info;

    // Compute workspace size.
    gesdd<T>(
        /* jobz = */ &jobz,
        // M and N are swapped since lapack expects column-major.
        /* m = */ &N,
        /* n = */ &M,
        /* a = */ nullptr,
        /* lda = */ &lda,
        /* s = */ nullptr,
        /* u = */ nullptr,
        /* ldu = */ &ldu,
        /* vt = */ nullptr,
        /* ldvt = */ &ldvt,
        /* work = */ &workspace_dimension,
        /* lwork = */ &lwork_query,
        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
        /* info = */ &info);

    if (info != 0) {
      std::stringstream ss;
      ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
      throw std::runtime_error(ss.str());
    }

    lwork = workspace_dimension;
    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
  }

  // Runs gesdd on one matrix. `a` is overwritten by lapack; `s` receives the
  // singular values. `u` and `vt` are forwarded as lapack's U and VT
  // arguments.
  // Throws std::runtime_error when lapack reports a non-zero info code.
  void run(T* a, R* s, T* u, T* vt) {
    int info;
    gesdd<T>(
        /* jobz = */ &jobz,
        // M and N are swapped since lapack expects column-major.
        /* m = */ &N,
        /* n = */ &M,
        /* a = */ a,
        /* lda = */ &lda,
        /* s = */ s,
        // lapack sees the transposed matrix (Aᵀ = VΣUᵀ), so it writes Vᵀᵀ
        // as U.
        /* u = */ u,
        /* ldu = */ &ldu,
        // For the same reason lapack writes Uᵀ as Vᵀ.
        /* vt = */ vt,
        /* ldvt = */ &ldvt,
        /* work = */ static_cast<T*>(buffers[1].buffer.raw_ptr()),
        /* lwork = */ &lwork,
        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
        /* info = */ &info);

    if (info != 0) {
      std::stringstream ss;
      ss << "[SVD::eval_cpu] gesdd failed with code " << info;
      throw std::runtime_error(ss.str());
    }
  }
};

// SVD workspace for single precision complex matrices.
//
// Same role as the real-valued specialization, with the additional real
// workspace (rwork) that the complex gesdd routine requires. N and M are
// passed to lapack as its row and column counts respectively; the caller
// swaps them to account for lapack's column-major convention.
template <>
struct SVDWork<std::complex<float>> {
  using T = std::complex<float>;
  using R = float;

  int N;
  int M;
  int K;
  int lda;
  int ldu;
  int ldvt;
  char jobz;
  // buffers[0]: integer workspace (iwork), buffers[1]: real workspace
  // (rwork), buffers[2]: work array.
  std::vector<array::Data> buffers;
  int lwork;

  SVDWork(int N, int M, int K, char jobz)
      : N(N), M(M), K(K), lda(N), ldu(N), ldvt(M), jobz(jobz) {
    T workspace_dimension = 0;

    // Integer workspace required by gesdd (8 * K entries).
    buffers.emplace_back(allocator::malloc(sizeof(int) * 8 * K));

    // Size of the real workspace. With mn = min(M, N) and mx = max(M, N),
    // the lapack documentation for cgesdd asks for at least 7 * mn entries
    // when no vectors are computed and otherwise for
    // max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn).
    // The size is computed in size_t so that large matrices do not overflow
    // int.
    const size_t mn = static_cast<size_t>(std::max(0, std::min(N, M)));
    const size_t mx = static_cast<size_t>(std::max(0, std::max(N, M)));
    const size_t lrwork = jobz == 'A'
        ? std::max<size_t>(
              1,
              std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn))
        : std::max<size_t>(1, 7 * mn);
    buffers.emplace_back(allocator::malloc(sizeof(float) * lrwork));

    // lwork == -1 asks lapack to only report the optimal workspace size.
    int lwork_query = -1;
    int info;

    // Compute workspace size.
    gesdd<T>(
        /* jobz = */ &jobz,
        // M and N are swapped since lapack expects column-major.
        /* m = */ &N,
        /* n = */ &M,
        /* a = */ nullptr,
        /* lda = */ &lda,
        /* s = */ nullptr,
        /* u = */ nullptr,
        /* ldu = */ &ldu,
        /* vt = */ nullptr,
        /* ldvt = */ &ldvt,
        /* work = */ &workspace_dimension,
        /* lwork = */ &lwork_query,
        /* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
        /* info = */ &info);

    if (info != 0) {
      std::stringstream ss;
      ss << "[SVD::eval_cpu] workspace calculation failed with code " << info;
      throw std::runtime_error(ss.str());
    }

    // The optimal size is reported in the real part of the first work entry.
    lwork = workspace_dimension.real();
    buffers.emplace_back(allocator::malloc(sizeof(T) * lwork));
  }

  // Runs gesdd on one matrix. `a` is overwritten by lapack; `s` receives the
  // singular values. `u` and `vt` are forwarded as lapack's U and VT
  // arguments.
  // Throws std::runtime_error when lapack reports a non-zero info code.
  void run(T* a, R* s, T* u, T* vt) {
    int info;
    gesdd<T>(
        /* jobz = */ &jobz,
        // M and N are swapped since lapack expects column-major.
        /* m = */ &N,
        /* n = */ &M,
        /* a = */ a,
        /* lda = */ &lda,
        /* s = */ s,
        // lapack sees the transposed matrix (Aᵀ = VΣUᵀ), so it writes Vᵀᵀ
        // as U.
        /* u = */ u,
        /* ldu = */ &ldu,
        // For the same reason lapack writes Uᵀ as Vᵀ.
        /* vt = */ vt,
        /* ldvt = */ &ldvt,
        /* work = */ static_cast<T*>(buffers[2].buffer.raw_ptr()),
        /* lwork = */ &lwork,
        /* rwork = */ static_cast<float*>(buffers[1].buffer.raw_ptr()),
        /* iwork = */ static_cast<int*>(buffers[0].buffer.raw_ptr()),
        /* info = */ &info);

    if (info != 0) {
      std::stringstream ss;
      ss << "[SVD::eval_cpu] gesdd failed with code " << info;
      throw std::runtime_error(ss.str());
    }
  }
};

// Computes the SVD of every matrix in the (possibly batched) array `a`.
//
// When `compute_uv` is true, `outputs` is {U, S, Vt}; otherwise it only holds
// S. The outputs are allocated here and filled by a task dispatched on
// `stream`.
template <typename T>
void svd_impl(
    const array& a,
    std::vector<array>& outputs,
    bool compute_uv,
    Stream stream) {
  // Lapack uses the column-major convention. To avoid having to transpose
  // the input and then transpose the outputs, we swap the indices/sizes of the
  // matrices and take advantage of the following identity (see
  // https://math.stackexchange.com/a/30077)
  //    A = UΣVᵀ
  //    Aᵀ = VΣUᵀ
  // As a result some of the indices/sizes are swapped as noted above.

  // Rows and cols of the original matrix in row-major order.
  const int M = a.shape(-2);
  const int N = a.shape(-1);
  const int K = std::min(M, N);

  using R = typename SVDWork<T>::R;

  // Per-matrix element counts, computed in size_t so that the batch offsets
  // below cannot overflow int for large inputs.
  const size_t a_elems = static_cast<size_t>(M) * N;
  const size_t u_elems = static_cast<size_t>(M) * M;
  const size_t vt_elems = static_cast<size_t>(N) * N;
  const size_t s_elems = static_cast<size_t>(K);

  size_t num_matrices = a.size() / a_elems;

  // lapack clobbers the input, so we have to make a copy.
  array in(a.shape(), a.dtype(), nullptr, {});
  copy_cpu(
      a,
      in,
      a.flags().row_contiguous ? CopyType::Vector : CopyType::General,
      stream);

  // Allocate outputs.
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  auto in_ptr = in.data<T>();
  T* u_ptr;
  R* s_ptr;
  T* vt_ptr;

  if (compute_uv) {
    array& u = outputs[0];
    array& s = outputs[1];
    array& vt = outputs[2];

    u.set_data(allocator::malloc(u.nbytes()));
    s.set_data(allocator::malloc(s.nbytes()));
    vt.set_data(allocator::malloc(vt.nbytes()));

    encoder.set_output_array(u);
    encoder.set_output_array(s);
    encoder.set_output_array(vt);

    s_ptr = s.data<R>();
    u_ptr = u.data<T>();
    vt_ptr = vt.data<T>();
  } else {
    array& s = outputs[0];

    s.set_data(allocator::malloc(s.nbytes()));

    encoder.set_output_array(s);

    s_ptr = s.data<R>();
    u_ptr = nullptr;
    vt_ptr = nullptr;
  }

  encoder.dispatch([in_ptr,
                    u_ptr,
                    s_ptr,
                    vt_ptr,
                    M,
                    N,
                    K,
                    a_elems,
                    u_elems,
                    vt_elems,
                    s_elems,
                    num_matrices]() {
    // A null u_ptr means only the singular values were requested.
    auto jobz = (u_ptr) ? 'A' : 'N';
    SVDWork<T> svd_work(N, M, K, jobz);
    // Loop over matrices.
    for (size_t i = 0; i < num_matrices; i++) {
      // Vt and U are passed in swapped positions because lapack operates on
      // the transposed (column-major) matrix.
      svd_work.run(
          in_ptr + a_elems * i,
          s_ptr + s_elems * i,
          vt_ptr ? vt_ptr + vt_elems * i : nullptr,
          u_ptr ? u_ptr + u_elems * i : nullptr);
    }
  });
  // Keep the clobbered copy alive until the dispatched task has run.
  encoder.add_temporary(in);
}

void SVD::eval_cpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  switch (inputs[0].dtype()) {
    case float32:
      svd_impl<float>(inputs[0], outputs, compute_uv_, stream());
      break;
    case float64:
      svd_impl<double>(inputs[0], outputs, compute_uv_, stream());
      break;
    case complex64:
      svd_impl<std::complex<float>>(inputs[0], outputs, compute_uv_, stream());
      break;
    default:
      throw std::runtime_error(
          "[SVD::eval_cpu] only supports float32, float64, or complex64.");
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/ternary.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once
#include "mlx/array.h"
#include "mlx/backend/common/ternary.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/encoder.h"

namespace mlx::core {

// Applies `op` element-wise over D nested dimensions starting at `axis`.
// Each recursion level walks one dimension, advancing the three input
// pointers and the output pointer by their respective strides; the innermost
// level (D == 1) evaluates `op` and stores the result.
template <typename T1, typename T2, typename T3, typename U, typename Op, int D>
void ternary_op_dims(
    const T1* a,
    const T2* b,
    const T3* c,
    U* out,
    Op op,
    const Shape& shape,
    const Strides& a_strides,
    const Strides& b_strides,
    const Strides& c_strides,
    const Strides& out_strides,
    int axis) {
  const auto step_a = a_strides[axis];
  const auto step_b = b_strides[axis];
  const auto step_c = c_strides[axis];
  const auto step_out = out_strides[axis];
  const auto extent = shape[axis];

  if constexpr (D > 1) {
    // Outer dimension: recurse into the next axis for every position.
    for (int i = 0; i < extent; ++i) {
      ternary_op_dims<T1, T2, T3, U, Op, D - 1>(
          a,
          b,
          c,
          out,
          op,
          shape,
          a_strides,
          b_strides,
          c_strides,
          out_strides,
          axis + 1);
      a += step_a;
      b += step_b;
      c += step_c;
      out += step_out;
    }
  } else {
    // Innermost dimension: compute the result element by element.
    for (int i = 0; i < extent; ++i) {
      *out = op(*a, *b, *c);
      a += step_a;
      b += step_b;
      c += step_c;
      out += step_out;
    }
  }
}

// Runs the strided ternary kernel for an arbitrary number of dimensions.
// `strides` holds the strides of a, b, c and out, in that order. One- and
// two-dimensional shapes go straight to ternary_op_dims; for more dimensions
// the leading ones are walked with iterators and the last two are handled by
// the 2-D kernel.
template <typename T1, typename T2, typename T3, typename U, typename Op>
void ternary_op_dispatch_dims(
    const T1* a_ptr,
    const T2* b_ptr,
    const T3* c_ptr,
    U* out_ptr,
    Op op,
    size_t size,
    Shape& shape,
    std::vector<Strides>& strides) {
  const auto& a_strides = strides[0];
  const auto& b_strides = strides[1];
  const auto& c_strides = strides[2];
  const auto& out_strides = strides[3];
  int ndim = shape.size();

  if (ndim == 1) {
    ternary_op_dims<T1, T2, T3, U, Op, 1>(
        a_ptr,
        b_ptr,
        c_ptr,
        out_ptr,
        op,
        shape,
        a_strides,
        b_strides,
        c_strides,
        out_strides,
        0);
    return;
  }
  if (ndim == 2) {
    ternary_op_dims<T1, T2, T3, U, Op, 2>(
        a_ptr,
        b_ptr,
        c_ptr,
        out_ptr,
        op,
        shape,
        a_strides,
        b_strides,
        c_strides,
        out_strides,
        0);
    return;
  }

  // Iterators over the leading ndim - 2 dimensions of each input.
  ContiguousIterator a_it(shape, a_strides, ndim - 2);
  ContiguousIterator b_it(shape, b_strides, ndim - 2);
  ContiguousIterator c_it(shape, c_strides, ndim - 2);
  // The output advances by the stride of the last leading dimension for each
  // 2-D block.
  auto block = out_strides[ndim - 3];
  for (size_t offset = 0; offset < size; offset += block) {
    ternary_op_dims<T1, T2, T3, U, Op, 2>(
        a_ptr + a_it.loc,
        b_ptr + b_it.loc,
        c_ptr + c_it.loc,
        out_ptr + offset,
        op,
        shape,
        a_strides,
        b_strides,
        c_strides,
        out_strides,
        ndim - 2);
    a_it.step();
    b_it.step();
    c_it.step();
  }
}

// Evaluates out = op(a, b, c) element-wise, choosing the traversal from
// `topt`: a single evaluation when all operands are scalars, a flat loop when
// all are vectors, and the general strided kernel otherwise.
template <typename T1, typename T2, typename T3, typename U, typename Op>
void ternary_op(
    const array& a,
    const array& b,
    const array& c,
    array& out,
    Op op,
    TernaryOpType topt) {
  const T1* a_ptr = a.data<T1>();
  const T2* b_ptr = b.data<T2>();
  const T3* c_ptr = c.data<T3>();
  U* out_ptr = out.data<U>();

  if (topt == TernaryOpType::ScalarScalarScalar) {
    out_ptr[0] = op(a_ptr[0], b_ptr[0], c_ptr[0]);
    return;
  }

  if (topt == TernaryOpType::VectorVectorVector) {
    for (size_t i = 0; i < out.size(); ++i) {
      out_ptr[i] = op(a_ptr[i], b_ptr[i], c_ptr[i]);
    }
    return;
  }

  // General case: collapse contiguous dimensions, then run the strided
  // kernel on the reduced shape.
  auto [shape, strides] = collapse_contiguous_dims(
      a.shape(), {a.strides(), b.strides(), c.strides(), out.strides()});
  ternary_op_dispatch_dims<T1, T2, T3, U>(
      a_ptr, b_ptr, c_ptr, out_ptr, op, out.size(), shape, strides);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/threefry.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "mlx/backend/cpu/threefry.h"

namespace mlx::core::random {

// Threefry 2x32 hash of the two-word counter `count` under the two-word
// `key`: 20 rounds, organised as five groups of four rounds with a key
// injection after every group.
std::pair<uint32_t, uint32_t> threefry2x32_hash(
    const std::pair<uint32_t, uint32_t>& key,
    std::pair<uint32_t, uint32_t> count) {
  // Rotation amounts; even groups use the first row, odd groups the second.
  constexpr static uint32_t rotations[2][4] = {
      {13, 15, 26, 6}, {17, 29, 16, 24}};

  // Key schedule: the two key words plus their xor with the parity constant.
  const uint32_t schedule[3] = {
      key.first, key.second, key.first ^ key.second ^ 0x1BD11BDA};

  auto& x0 = count.first;
  auto& x1 = count.second;

  // Initial key injection
  x0 += schedule[0];
  x1 += schedule[1];

  for (int group = 0; group < 5; ++group) {
    const auto& amounts = rotations[group % 2];
    for (int k = 0; k < 4; ++k) {
      const uint32_t r = amounts[k];
      // One round: add, rotate left by r, xor
      x0 += x1;
      x1 = (x1 << r) | (x1 >> (32 - r));
      x1 ^= x0;
    }
    // Key injection after every four rounds
    x0 += schedule[(group + 1) % 3];
    x1 += schedule[(group + 2) % 3] + group + 1;
  }

  return count;
}

} // namespace mlx::core::random


================================================
FILE: mlx/backend/cpu/threefry.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <cstdint>
#include <utility>

namespace mlx::core::random {

/** Applies the Threefry 2x32 hash function.
 * This code is based on the Jax counter-based and splittable PRNG
 * https://github.com/google/jax/blob/main/docs/jep/263-prng.md
 *
 * Original Threefry reference:
 * http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
 *
 * The implementation runs 20 mixing rounds (five groups of four), injecting
 * key-schedule words after each group.
 *
 * @param key   Pair of 32-bit key words.
 * @param count Pair of 32-bit counter words to hash; taken by value.
 * @return The pair of 32-bit words produced by mixing `count` under `key`.
 */
std::pair<uint32_t, uint32_t> threefry2x32_hash(
    const std::pair<uint32_t, uint32_t>& key,
    std::pair<uint32_t, uint32_t> count);

} // namespace mlx::core::random


================================================
FILE: mlx/backend/cpu/unary.cpp
================================================
// Copyright © 2024 Apple Inc.

// Required for using M_LN2 in MSVC.
#define _USE_MATH_DEFINES

#include <cassert>

#include "mlx/backend/cpu/unary.h"
#include "mlx/backend/cpu/unary_ops.h"
#include "mlx/primitives.h"

namespace mlx::core {

void Abs::eval_cpu(const std::vector<array>& inputs, array& out) {
  auto& in = inputs[0];
  if (issubdtype(in.dtype(), unsignedinteger) || in.dtype() == bool_) {
    // No-op for unsigned types
    out.copy_shared_buffer(in);
  } else {
    unary_signed(in, out, detail::Abs(), stream());
  }
}

// CPU evaluation of inverse cosine via the floating-point unary dispatcher.
void ArcCos::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::ArcCos{}, stream());
}

// CPU evaluation of inverse hyperbolic cosine via the floating-point unary
// dispatcher.
void ArcCosh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::ArcCosh{}, stream());
}

// CPU evaluation of inverse sine via the floating-point unary dispatcher.
void ArcSin::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::ArcSin{}, stream());
}

// CPU evaluation of inverse hyperbolic sine via the floating-point unary
// dispatcher.
void ArcSinh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::ArcSinh{}, stream());
}

// CPU evaluation of inverse tangent via the floating-point unary dispatcher.
void ArcTan::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::ArcTan{}, stream());
}

// CPU evaluation of inverse hyperbolic tangent via the floating-point unary
// dispatcher.
void ArcTanh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::ArcTanh{}, stream());
}

// CPU evaluation of bitwise NOT via the integer-only unary dispatcher.
void BitwiseInvert::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_int(inputs[0], out, detail::BitwiseInvert{}, stream());
}

// CPU evaluation of ceil. Only inexact dtypes need any computation; for all
// other dtypes the output shares the input buffer.
void Ceil::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  if (!issubdtype(src.dtype(), inexact)) {
    // No-op integer types
    out.copy_shared_buffer(src);
    return;
  }
  unary_fp(src, out, detail::Ceil{}, stream());
}

// CPU evaluation of complex conjugation via the complex64-only dispatcher.
void Conjugate::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& in = inputs[0];
  unary_complex(in, out, detail::Conjugate{}, stream());
}

// CPU evaluation of cosine via the floating-point unary dispatcher.
void Cos::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Cos{}, stream());
}

// CPU evaluation of hyperbolic cosine via the floating-point unary dispatcher.
void Cosh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Cosh{}, stream());
}

// CPU evaluation of erf via the real floating-point dispatcher (no complex).
void Erf::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_real_fp(inputs[0], out, detail::Erf{}, stream());
}

// CPU evaluation of inverse erf via the real floating-point dispatcher
// (no complex).
void ErfInv::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_real_fp(inputs[0], out, detail::ErfInv{}, stream());
}

// CPU evaluation of exp via the floating-point unary dispatcher.
void Exp::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Exp{}, stream());
}

// CPU evaluation of expm1 via the floating-point unary dispatcher.
void Expm1::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Expm1{}, stream());
}

// CPU evaluation of floor. Only inexact dtypes need any computation; for all
// other dtypes the output shares the input buffer.
void Floor::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  if (!issubdtype(src.dtype(), inexact)) {
    // No-op integer types
    out.copy_shared_buffer(src);
    return;
  }
  unary_fp(src, out, detail::Floor{}, stream());
}

void Imag::eval_cpu(const std::vector<array>& inputs, array& out) {
  unary_complex_to_float(inputs[0], out, detail::Imag(), stream());
}

// CPU evaluation of the logarithm; the functor is selected by the primitive's
// stored base (natural, base-2 or base-10).
void Log::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  switch (base_) {
    case Base::two:
      unary_fp(src, out, detail::Log2{}, stream());
      return;
    case Base::ten:
      unary_fp(src, out, detail::Log10{}, stream());
      return;
    case Base::e:
      unary_fp(src, out, detail::Log{}, stream());
      return;
  }
}

// CPU evaluation of log1p via the floating-point unary dispatcher.
void Log1p::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Log1p{}, stream());
}

// CPU evaluation of logical NOT via the all-dtype unary dispatcher.
void LogicalNot::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary(inputs[0], out, detail::LogicalNot{}, stream());
}

// CPU evaluation of negation via the all-dtype unary dispatcher.
void Negative::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary(inputs[0], out, detail::Negative{}, stream());
}

void Real::eval_cpu(const std::vector<array>& inputs, array& out) {
  unary_complex_to_float(inputs[0], out, detail::Real(), stream());
}

// CPU evaluation of round. Only inexact dtypes need any computation; for all
// other dtypes the output shares the input buffer.
void Round::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  if (!issubdtype(src.dtype(), inexact)) {
    // No-op integer types
    out.copy_shared_buffer(src);
    return;
  }
  unary_fp(src, out, detail::Round{}, stream());
}

// CPU evaluation of the sigmoid via the floating-point unary dispatcher.
void Sigmoid::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Sigmoid{}, stream());
}

// CPU evaluation of sign. Boolean inputs are passed through by sharing the
// input buffer; every other dtype goes through the all-dtype dispatcher.
void Sign::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  if (src.dtype() != bool_) {
    unary(src, out, detail::Sign{}, stream());
    return;
  }
  out.copy_shared_buffer(src);
}

// CPU evaluation of sine via the floating-point unary dispatcher.
void Sin::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Sin{}, stream());
}

// CPU evaluation of hyperbolic sine via the floating-point unary dispatcher.
void Sinh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Sinh{}, stream());
}

// CPU evaluation of element-wise squaring via the all-dtype unary dispatcher.
void Square::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary(inputs[0], out, detail::Square{}, stream());
}

// CPU evaluation of square root, or of its reciprocal when recip_ is set.
void Sqrt::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  if (!recip_) {
    unary_fp(src, out, detail::Sqrt{}, stream());
    return;
  }
  unary_fp(src, out, detail::Rsqrt{}, stream());
}

// CPU evaluation of tangent via the floating-point unary dispatcher.
void Tan::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Tan{}, stream());
}

// CPU evaluation of hyperbolic tangent via the floating-point unary
// dispatcher.
void Tanh::eval_cpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  unary_fp(inputs[0], out, detail::Tanh{}, stream());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/unary.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/backend/common/unary.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/backend/cpu/simd/simd.h"
#include "mlx/utils.h"

namespace mlx::core {

// Applies a default-constructed Op to `shape` input elements spaced `stride`
// apart, writing the results densely into `out`.
template <typename T, typename U = T, typename Op>
void unary_op(const T* a, U* out, size_t shape, size_t stride) {
  for (size_t idx = 0; idx != shape; ++idx) {
    out[idx] = Op{}(a[idx * stride]);
  }
}

// Applies Op element-wise from array `a` into array `out`.
//
// Contiguous inputs are processed linearly over a.data_size() elements, in
// SIMD chunks of N lanes followed by a scalar tail. Non-contiguous inputs are
// processed one innermost row at a time, with the row's start offset in the
// input supplied by a ContiguousIterator over the leading dimensions.
template <typename T, typename U = T, typename Op>
void unary_op(const array& a, array& out, Op) {
  const T* in_ptr = a.data<T>();
  U* out_ptr = out.data<U>();
  const auto ndim = a.ndim();

  if (!a.flags().contiguous) {
    const size_t row_len = ndim > 0 ? a.shape().back() : 1;
    const size_t row_stride = ndim > 0 ? a.strides().back() : 1;
    if (ndim <= 1) {
      // Scalar or single row: one strided pass covers everything.
      unary_op<T, U, Op>(in_ptr, out_ptr, row_len, row_stride);
      return;
    }
    auto it = ContiguousIterator(a.shape(), a.strides(), ndim - 1);
    for (size_t offset = 0; offset < a.size(); offset += row_len) {
      unary_op<T, U, Op>(in_ptr + it.loc, out_ptr + offset, row_len, row_stride);
      it.step();
    }
    return;
  }

  // Lane count usable for both the input and the output element type.
  constexpr int N = std::min(simd::max_size<T>, simd::max_size<U>);
  auto remaining = a.data_size();
  for (; remaining >= N; remaining -= N, in_ptr += N, out_ptr += N) {
    simd::store(out_ptr, simd::Simd<U, N>(Op{}(simd::load<T, N>(in_ptr))));
  }
  // Scalar tail for the elements that do not fill a whole vector.
  for (; remaining > 0; --remaining, ++in_ptr, ++out_ptr) {
    *out_ptr = Op{}(*in_ptr);
  }
}

template <typename Op>
void unary(const array& a, array& out, Op op, Stream stream) {
  set_unary_output_data(a, out);
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    out = array::unsafe_weak_copy(out),
                    op = op]() mutable {
    switch (out.dtype()) {
      case bool_:
        unary_op<bool>(a, out, op);
        break;
      case uint8:
        unary_op<uint8_t>(a, out, op);
        break;
      case uint16:
        unary_op<uint16_t>(a, out, op);
        break;
      case uint32:
        unary_op<uint32_t>(a, out, op);
        break;
      case uint64:
        unary_op<uint64_t>(a, out, op);
        break;
      case int8:
        unary_op<int8_t>(a, out, op);
        break;
      case int16:
        unary_op<int16_t>(a, out, op);
        break;
      case int32:
        unary_op<int32_t>(a, out, op);
        break;
      case int64:
        unary_op<int64_t>(a, out, op);
        break;
      case float16:
        unary_op<float16_t>(a, out, op);
        break;
      case float32:
        unary_op<float>(a, out, op);
        break;
      case float64:
        unary_op<double>(a, out, op);
        break;
      case bfloat16:
        unary_op<bfloat16_t>(a, out, op);
        break;
      case complex64:
        unary_op<complex64_t>(a, out, op);
        break;
    }
  });
}

// Schedules an element-wise `op` on the CPU stream for real floating-point
// dtypes only (bfloat16, float16, float32, float64).
//
// The output storage is prepared by set_unary_output_data and both arrays are
// registered with the stream's command encoder. The dispatched task throws
// std::runtime_error for any other output dtype.
template <typename Op>
void unary_real_fp(const array& a, array& out, Op op, Stream stream) {
  set_unary_output_data(a, out);
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    out = array::unsafe_weak_copy(out),
                    op = op]() mutable {
    switch (out.dtype()) {
      case bfloat16:
        unary_op<bfloat16_t>(a, out, op);
        break;
      case float16:
        unary_op<float16_t>(a, out, op);
        break;
      case float32:
        unary_op<float>(a, out, op);
        break;
      case float64:
        unary_op<double>(a, out, op);
        break;
      default:
        // Tag the message with this function's name, like unary_fp and
        // unary_int do, so the failing dispatcher can be identified.
        std::ostringstream err;
        err << "[unary_real_fp] Does not support " << out.dtype();
        throw std::runtime_error(err.str());
    }
  });
}
template <typename Op>
void unary_fp(const array& a, array& out, Op op, Stream stream) {
  set_unary_output_data(a, out);
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    out = array::unsafe_weak_copy(out),
                    op = op]() mutable {
    switch (out.dtype()) {
      case bfloat16:
        unary_op<bfloat16_t>(a, out, op);
        break;
      case float16:
        unary_op<float16_t>(a, out, op);
        break;
      case float32:
        unary_op<float>(a, out, op);
        break;
      case float64:
        unary_op<double>(a, out, op);
        break;
      case complex64:
        unary_op<complex64_t>(a, out, op);
        break;
      default:
        std::ostringstream err;
        err << "[unary_fp] Does not support " << out.dtype();
        throw std::runtime_error(err.str());
    }
  });
}

template <typename Op>
void unary_signed(const array& a, array& out, Op op, Stream stream) {
  set_unary_output_data(a, out);
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    out = array::unsafe_weak_copy(out),
                    op = op]() mutable {
    switch (out.dtype()) {
      case int8:
        unary_op<int8_t>(a, out, op);
        break;
      case int16:
        unary_op<int16_t>(a, out, op);
        break;
      case int32:
        unary_op<int32_t>(a, out, op);
        break;
      case int64:
        unary_op<int64_t>(a, out, op);
        break;
      case float16:
        unary_op<float16_t>(a, out, op);
        break;
      case float32:
        unary_op<float>(a, out, op);
        break;
      case float64:
        unary_op<double>(a, out, op);
        break;
      case bfloat16:
        unary_op<bfloat16_t>(a, out, op);
        break;
      case complex64:
        unary_op<complex64_t>(a, out, op);
        break;
      default:
        throw std::runtime_error("[Abs] Called on unsigned type");
    }
  });
}

template <typename Op>
void unary_complex(const array& a, array& out, Op op, Stream stream) {
  set_unary_output_data(a, out);
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    out = array::unsafe_weak_copy(out),
                    op = op]() mutable { unary_op<complex64_t>(a, out, op); });
}

// Schedules an element-wise `op` on the CPU stream that reads complex64_t
// elements and writes float elements (no dtype check is performed).
template <typename Op>
void unary_complex_to_float(const array& a, array& out, Op op, Stream stream) {
  set_unary_output_data(a, out);

  auto& enc = cpu::get_command_encoder(stream);
  enc.set_input_array(a);
  enc.set_output_array(out);

  enc.dispatch([src = array::unsafe_weak_copy(a),
                dst = array::unsafe_weak_copy(out),
                op]() mutable {
    unary_op<complex64_t, float>(src, dst, op);
  });
}

template <typename Op>
void unary_int(const array& a, array& out, Op op, Stream stream) {
  set_unary_output_data(a, out);
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(a);
  encoder.set_output_array(out);
  encoder.dispatch([a = array::unsafe_weak_copy(a),
                    out = array::unsafe_weak_copy(out),
                    op = op]() mutable {
    switch (out.dtype()) {
      case uint8:
        unary_op<uint8_t>(a, out, op);
        break;
      case uint16:
        unary_op<uint16_t>(a, out, op);
        break;
      case uint32:
        unary_op<uint32_t>(a, out, op);
        break;
      case uint64:
        unary_op<uint64_t>(a, out, op);
        break;
      case int8:
        unary_op<int8_t>(a, out, op);
        break;
      case int16:
        unary_op<int16_t>(a, out, op);
        break;
      case int32:
        unary_op<int32_t>(a, out, op);
        break;
      case int64:
        unary_op<int64_t>(a, out, op);
        break;
      default:
        std::ostringstream err;
        err << "[unary_int] Does not support " << out.dtype();
        throw std::runtime_error(err.str());
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cpu/unary_ops.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <stdint.h>
#include <cmath>
#include <complex>

#include "mlx/backend/cpu/simd/simd.h"

namespace mlx::core::detail {

using namespace mlx::core::simd;

// SINGLE() adds a scalar call operator to an op struct: the value is wrapped
// in a one-lane Simd<T, 1>, passed to the struct's Simd overload, and the
// result's `.value` member is returned.
#define SINGLE()                         \
  template <typename T>                  \
  T operator()(T x) {                    \
    return (*this)(Simd<T, 1>(x)).value; \
  }

// DEFAULT_OP(Op, op) defines a functor struct `Op` whose Simd overload
// forwards to `simd::op(x)` and whose scalar overload comes from SINGLE().
#define DEFAULT_OP(Op, op)                \
  struct Op {                             \
    template <int N, typename T>          \
    Simd<T, N> operator()(Simd<T, N> x) { \
      return simd::op(x);                 \
    }                                     \
    SINGLE()                              \
  };

// Functors that forward directly to the simd function of the same meaning.
// DEFAULT_OP already expands to a complete `struct ... {};` definition, so the
// invocations take no trailing semicolon (it would be an empty declaration).
DEFAULT_OP(Abs, abs)
DEFAULT_OP(ArcCos, acos)
DEFAULT_OP(ArcCosh, acosh)
DEFAULT_OP(ArcSin, asin)
DEFAULT_OP(ArcSinh, asinh)
DEFAULT_OP(ArcTan, atan)
DEFAULT_OP(ArcTanh, atanh)
DEFAULT_OP(BitwiseInvert, operator~)
DEFAULT_OP(Ceil, ceil)
DEFAULT_OP(Conjugate, conj)
DEFAULT_OP(Cos, cos)
DEFAULT_OP(Cosh, cosh)
DEFAULT_OP(Erf, erf)
DEFAULT_OP(ErfInv, erfinv)
DEFAULT_OP(Exp, exp)
DEFAULT_OP(Expm1, expm1)
DEFAULT_OP(Floor, floor)
DEFAULT_OP(Log, log)
DEFAULT_OP(Log2, log2)
DEFAULT_OP(Log10, log10)
DEFAULT_OP(Log1p, log1p)
DEFAULT_OP(LogicalNot, operator!)
DEFAULT_OP(Negative, operator-)
// Round is implemented with rint.
DEFAULT_OP(Round, rint)
DEFAULT_OP(Sin, sin)
DEFAULT_OP(Sinh, sinh)
DEFAULT_OP(Sqrt, sqrt)
DEFAULT_OP(Rsqrt, rsqrt)
DEFAULT_OP(Tan, tan)
DEFAULT_OP(Tanh, tanh)

// Functor returning the imaginary component of complex64 lanes as floats.
struct Imag {
  template <int N>
  Simd<float, N> operator()(Simd<complex64_t, N> z) {
    Simd<float, N> imag_part = simd::imag(z);
    return imag_part;
  }
  SINGLE()
};

// Functor returning the real component of complex64 lanes as floats.
struct Real {
  template <int N>
  Simd<float, N> operator()(Simd<complex64_t, N> z) {
    Simd<float, N> real_part = simd::real(z);
    return real_part;
  }
  SINGLE()
};

// Logistic sigmoid functor.
// Computes tail = 1 / (1 + exp(|x|)) and returns `tail` for negative lanes
// and `1 - tail` otherwise, so exp is only ever called on |x|.
struct Sigmoid {
  template <int N, typename T>
  Simd<T, N> operator()(Simd<T, N> x) {
    auto tail = 1.0f / (1.0f + simd::exp(simd::abs(x)));
    auto is_negative = x < Simd<T, N>{0};
    return simd::select(is_negative, tail, Simd<T, N>{1} - tail);
  }
  SINGLE()
};

// Sign functor.
//  - unsigned T:  0 where x == 0, otherwise 1
//  - complex64_t: x where x == 0, otherwise x / |x|
//  - other T:     -1, 0 or 1 for negative, zero or positive lanes
struct Sign {
  template <int N, typename T>
  Simd<T, N> operator()(Simd<T, N> x) {
    auto zero = Simd<T, N>{0};
    if constexpr (std::is_unsigned_v<T>) {
      return simd::select(x == zero, zero, Simd<T, N>{1});
    } else if constexpr (std::is_same_v<T, complex64_t>) {
      return simd::select(x == zero, x, Simd<T, N>(x / simd::abs(x)));
    } else {
      auto one = Simd<T, N>{1};
      auto minus_one = Simd<T, N>{-1};
      auto non_negative = simd::select(x > zero, one, zero);
      return simd::select(x < zero, minus_one, non_negative);
    }
  }
  SINGLE()
};

// Functor multiplying every lane by itself.
struct Square {
  template <int N, typename T>
  Simd<T, N> operator()(Simd<T, N> x) {
    Simd<T, N> squared = x * x;
    return squared;
  }
  SINGLE()
};

// Reinterprets the storage of a Simd of 32-bit unsigned integers as a Simd of
// floats; no numeric conversion takes place.
template <int N>
Simd<float, N> fp32_from_bits(Simd<uint32_t, N> x) {
  auto* as_float = reinterpret_cast<Simd<float, N>*>(&x);
  return *as_float;
}
// Reinterprets the storage of a Simd of floats as a Simd of 32-bit unsigned
// integers; no numeric conversion takes place.
template <int N>
Simd<uint32_t, N> fp32_to_bits(Simd<float, N> x) {
  auto* as_bits = reinterpret_cast<Simd<uint32_t, N>*>(&x);
  return *as_bits;
}

// Converts values to an 8-bit floating-point encoding stored in a uint8_t.
// The input is first converted to float32 and then processed purely on its
// bit pattern: magnitudes at or above `fp8_max` saturate to 0x7E, and the
// sign bit is moved from bit 31 to bit 7 of the result.
// NOTE(review): the constants (exponent offset 7 - 127, a dropped 20-bit
// mantissa tail, saturation code 0x7E) look like an E4M3-style format with
// round-to-nearest-even -- confirm against the format specification.
struct ToFP8 {
  template <typename T, int N>
  Simd<uint8_t, N> operator()(Simd<T, N> f) {
    // float32 bit pattern at or above which the result saturates.
    uint32_t fp8_max = 543 << 21;
    // float32 bit pattern used as the additive constant on the "low" path.
    auto denorm_mask = Simd<uint32_t, N>(141 << 23);
    Simd<uint32_t, N> f_bits;
    Simd<float, N> f32 = f;
    f_bits = fp32_to_bits(f32);
    Simd<uint8_t, N> result = 0u;
    // Split off the sign bit and continue with the magnitude only.
    auto sign = f_bits & 0x80000000;
    f_bits = f_bits ^ sign;

    // Low path (magnitude bits below 121 << 23): add the constant as a float,
    // then subtract its bit pattern and keep the low byte.
    auto f_bits_low =
        fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask));
    auto result_low = Simd<uint8_t, N>(f_bits_low - denorm_mask);

    // High path: adjust the exponent field and add a rounding term of
    // 0x7FFFF plus bit 20 of the magnitude, then drop the low 20 bits.
    auto mant_odd = Simd<uint8_t, N>((f_bits >> 20) & 1);
    auto f_bits_high = f_bits + (((uint32_t)(7 - 127) << 23) + 0x7FFFF);
    f_bits_high = f_bits_high + Simd<uint32_t, N>(mant_odd);

    auto result_high = Simd<uint8_t, N>(f_bits_high >> 20);
    result = select(f_bits < (121 << 23), result_low, result_high);

    // Saturate large magnitudes, then put the sign back in bit 7.
    auto result_sat = Simd<uint8_t, N>(0x7E);
    result = select(f_bits >= fp8_max, result_sat, result);
    return result | Simd<uint8_t, N>(sign >> 24);
  }

  // Scalar overload: runs the Simd overload on a single lane.
  template <typename T>
  uint8_t operator()(T x) {
    return (*this)(Simd<T, 1>(x)).value;
  }
};

// Decodes the 8-bit floating-point encoding stored in a uint8_t into float.
// The low seven bits are shifted left by 7 into a 16-bit word, that word is
// reinterpreted as a float16 and scaled by 256; bit 7 selects the sign.
// NOTE(review): the shift and the 256 scale presumably line the FP8 exponent
// and mantissa fields up with float16's and correct the bias difference --
// confirm against the format specification.
struct FromFP8 {
  template <int N>
  Simd<float, N> operator()(Simd<uint8_t, N> x) {
    // Magnitude bits positioned inside a 16-bit word.
    auto v = Simd<uint16_t, N>(x & 127) << 7;
    Simd<float, N> out;
    if constexpr (simd::max_size<float16_t> >= N) {
      // Reinterpret all lanes at once as float16.
      auto converted = *(Simd<float16_t, N>*)(&v);
      out = converted * 256.0;
    } else {
      // Otherwise reinterpret and scale one lane at a time.
      for (int i = 0; i < N; ++i) {
        auto converted = *(float16_t*)(&v[i]);
        out[i] = converted * 256.0;
      }
    }
    // Bit 7 of the input is the sign.
    auto sign = Simd<bool, N>(x & 128);
    return select(sign, -out, out);
  }
  // Scalar overload: runs the Simd overload on a single lane.
  float operator()(uint8_t x) {
    return (*this)(Simd<uint8_t, 1>(x)).value;
  }
};
} // namespace mlx::core::detail


================================================
FILE: mlx/backend/cuda/CMakeLists.txt
================================================
# Filename rules in cuda backend:
#
# * Use .cu/.cuh if code contains device code, and .cpp/.h if not.
# * Device-only code should be put in device/ subdir.
# * Files in device/ subdir should not include files outside.
#
# Sources that are compiled into the mlx target regardless of the CUDA
# toolkit version. Version-dependent sources are added further below.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/arange.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/binary_two.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_contiguous.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_dynamic.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/copy/copy_general_input.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/conv/gemm_conv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/conv/gemm_grouped_conv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/cublas_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device_info.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/gemv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/gemms/grouped_gemm_unaligned.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/jit_module.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/random.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/all_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/col_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/init_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce/row_reduce.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm_utils.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/convert_fp8.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)

# These subdirectories carry their own CMakeLists.txt (presumably adding the
# binary, quantized-matmul and unary kernel sources to the mlx target).
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/quantized/qmm)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)

# fp4 is not available with CUDA toolkits older than 12.8: those builds
# compile the no_qqmm_impl.cpp fallback instead of the real qqmm
# implementation and its cuBLAS wrapper.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8.0)
  target_include_directories(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/)
  target_sources(mlx
                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/no_qqmm_impl.cpp)
else()
  target_sources(
    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm_impl.cpp
                ${CMAKE_CURRENT_SOURCE_DIR}/quantized/cublas_qqmm.cpp)
endif()

# Pick the batched cuBLAS GEMM source that matches the toolkit: the 12.9+
# variant is a .cu file, the older one a plain .cpp file.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9.0)
  target_sources(
    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_9.cu)
else()
  target_sources(
    mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gemms/cublas_gemm_batched_12_0.cpp)
endif()

# Embed kernel sources in binary for JIT compilation.
#
# Collect every header in device/ (paths relative to this directory), join
# them into a single ':'-separated argument and hand that to bin2h.cmake,
# which is run in script mode to produce gen/cuda_jit_sources.h.
file(
  GLOB MLX_JIT_SOURCES
  RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.h"
  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.cuh")
string(JOIN ":" MLX_JIT_SOURCES_ARG ${MLX_JIT_SOURCES})
add_custom_command(
  OUTPUT gen/cuda_jit_sources.h
  COMMAND
    ${CMAKE_COMMAND} -DMLX_SOURCE_ROOT=${CMAKE_CURRENT_SOURCE_DIR}
    -DMLX_JIT_SOURCES=${MLX_JIT_SOURCES_ARG} -P
    "${CMAKE_CURRENT_SOURCE_DIR}/bin2h.cmake"
  DEPENDS bin2h.cmake ${MLX_JIT_SOURCES})
# The custom target makes mlx wait for the generated header, and the gen/
# directory is put on the include path so the header can be included.
add_custom_target(cuda_jit_sources DEPENDS gen/cuda_jit_sources.h)
add_dependencies(mlx cuda_jit_sources)
target_include_directories(mlx PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/gen")

# ------------------------ Compilation configs ------------------------

# Lets sources guard CUDA-specific code with MLX_USE_CUDA.
target_compile_definitions(mlx PRIVATE MLX_USE_CUDA)

# Enable defining device lambda functions.
target_compile_options(mlx
                       PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>")

# Enable calling host constexpr functions from device. This is needed because
# the constexpr version of isnan is host only.
target_compile_options(
  mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>")

if(MSVC)
  # Ignore warnings from CUTLASS.
  target_compile_options(
    mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=2908">)
else()
  # Required for generating optimized CUTLASS code.
  target_compile_options(
    mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fno-strict-aliasing>")
endif()

# Suppress nvcc warnings on C++ headers.
target_compile_options(
  mlx
  PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=27,997,1394,20011,20208">
)

# Ignore some valid nvcc warnings, we might want to fix them in future.
target_compile_options(
  mlx PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe="--diag_suppress=177,550">)

# Use stronger compression for the embedded binaries. This feature was
# introduced in CUDA 12.8 and requires drivers released after CUDA 12.4.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
  target_compile_options(
    mlx PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--compress-mode=size>")
endif()

# Use native CUDA arch by default.
#
# When MLX_CUDA_ARCHITECTURES is not provided, ask the __nvcc_device_query
# tool for the architecture of the local GPU. An empty answer (tool missing
# or no device) is a hard error, so the user must then set the variable.
if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
  execute_process(
    COMMAND __nvcc_device_query
    OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(MLX_CUDA_ARCHITECTURES STREQUAL "")
    message(
      FATAL_ERROR
        "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
  elseif(MLX_CUDA_ARCHITECTURES GREATER_EQUAL 90)
    # Use arch-specific compute capability whenever possible.
    set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
  endif()
endif()
message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
                                     "${MLX_CUDA_ARCHITECTURES}")

# Skip Hopper-only kernels when not building for sm90a.
# They can also be disabled explicitly by defining the
# MLX_DISABLE_SM90A_KERNELS environment variable (any value).
if(NOT DEFINED ENV{MLX_DISABLE_SM90A_KERNELS}
   AND (("90a" IN_LIST MLX_CUDA_ARCHITECTURES) OR ("90a-real" IN_LIST
                                                   MLX_CUDA_ARCHITECTURES)))
  target_compile_definitions(mlx PRIVATE MLX_CUDA_SM90A_ENABLED)
endif()

# Search CUDA libs from installed python packages.
#
# Windows: every CUDA/cuDNN DLL is delay-loaded and delayload.cpp is built
# in, so that unfound DLLs can be resolved at runtime.
# Elsewhere: the libraries are found through RPATH entries.
if(WIN32)
  # Resolve paths of unfound DLL at runtime.
  if(BUILD_SHARED_LIBS)
    target_link_libraries(mlx PRIVATE "delayimp.lib")
    target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/delayload.cpp)
  else()
    # For static library the delayload must be compiled into final executables.
    target_link_libraries(mlx PUBLIC "delayimp.lib")
    target_sources(
      mlx PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/delayload.cpp>)
  endif()
  # Get all the CUDA DLLs we could link with.
  file(
    GLOB CUDA_DLL_NAMES
    RELATIVE "${CUDAToolkit_BIN_DIR}/x64"
    "${CUDAToolkit_BIN_DIR}/x64/*.dll")
  # Delay load CUDA and cuDNN libs.
  # NOTE(review): CUDNN_DLL_NAMES is not set in this file; presumably it is
  # provided by the cuDNN find logic elsewhere -- confirm.
  foreach(CUDA_DLL ${CUDA_DLL_NAMES} ${CUDNN_DLL_NAMES})
    target_link_options(mlx PUBLIC "/DELAYLOAD:${CUDA_DLL}")
  endforeach()
  # Pass the locations where CUDA DLLs are placed.
  if(NOT MLX_LOAD_CUDA_LIBS_FROM_PYTHON)
    target_compile_definitions(
      mlx PUBLIC MLX_CUDA_BIN_DIR="${CUDAToolkit_BIN_DIR}/x64"
                 MLX_CUDNN_BIN_DIR="${CUDNN_BIN_DIR}")
  endif()
else()
  # For POSIX we rely on RPATH to search for CUDA libs.
  if(MLX_LOAD_CUDA_LIBS_FROM_PYTHON)
    set_property(
      TARGET mlx
      APPEND
      PROPERTY INSTALL_RPATH
               # The paths here should match the install_requires in setup.py.
               "$ORIGIN/../../nvidia/cublas/lib"
               "$ORIGIN/../../nvidia/cuda_nvrtc/lib"
               "$ORIGIN/../../nvidia/cudnn/lib"
               "$ORIGIN/../../nvidia/nccl/lib")
  endif()
endif()

# ------------------------ Dependencies ------------------------

# Use fixed version of CCCL.
# BEFORE puts these headers ahead of the other include directories of the
# target (presumably so they win over the copy shipped with the toolkit).
FetchContent_Declare(
  cccl
  URL "https://github.com/NVIDIA/cccl/releases/download/v3.1.3/cccl-v3.1.3.zip")
FetchContent_MakeAvailable(cccl)
target_include_directories(mlx BEFORE PRIVATE "${cccl_SOURCE_DIR}/include")

# Install CCCL headers for JIT.
install(DIRECTORY ${cccl_SOURCE_DIR}/include/cuda
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)
install(DIRECTORY ${cccl_SOURCE_DIR}/include/nv
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cccl)

# The binary of C++ tests will not be installed so it can not find the CCCL
# headers, and we have to hard-code the path.
if(MLX_BUILD_TESTS)
  target_compile_definitions(mlx
                             PRIVATE MLX_CCCL_DIR="${cccl_SOURCE_DIR}/include")
endif()

# Use fixed version of NVTX.
# Only the `c` subdirectory of the repository is configured, and the target
# is linked through BUILD_INTERFACE so it applies to the build tree only.
FetchContent_Declare(
  nvtx3
  GIT_REPOSITORY https://github.com/NVIDIA/NVTX.git
  GIT_TAG v3.1.1
  GIT_SHALLOW TRUE
  SOURCE_SUBDIR c EXCLUDE_FROM_ALL)
FetchContent_MakeAvailable(nvtx3)
target_link_libraries(mlx PUBLIC $<BUILD_INTERFACE:nvtx3-cpp>)

# Make cuda runtime APIs available in non-cuda files.
target_include_directories(mlx PRIVATE ${CUDAToolkit_INCLUDE_DIRS})

# Use cublasLt.
target_link_libraries(mlx PRIVATE CUDA::cublasLt)

# Use cuFFT.
target_link_libraries(mlx PRIVATE CUDA::cufft)

# Use NVRTC and driver APIs.
target_link_libraries(mlx PRIVATE CUDA::nvrtc CUDA::cuda_driver)

# Use the frontend APIs of cuDNN.
# The frontend's JSON library, samples, tests and Python bindings are all
# switched off before the project is made available.
FetchContent_Declare(
  cudnn
  GIT_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git
  GIT_TAG v1.16.0
  GIT_SHALLOW TRUE
  EXCLUDE_FROM_ALL)
set(CUDNN_FRONTEND_SKIP_JSON_LIB ON)
set(CUDNN_FRONTEND_BUILD_SAMPLES OFF)
set(CUDNN_FRONTEND_BUILD_TESTS OFF)
set(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS OFF)
FetchContent_MakeAvailable(cudnn)
target_link_libraries(mlx PRIVATE cudnn_frontend)
# Link with the actual cuDNN libraries.
target_link_libraries(mlx PRIVATE CUDNN::cudnn_all)

# Use header-only CUTLASS.
# SYSTEM marks the include directory as a system one, which normally keeps
# compiler warnings from those headers out of the build output.
FetchContent_Declare(
  cutlass
  GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
  GIT_TAG v4.3.5
  GIT_SHALLOW TRUE
  SOURCE_SUBDIR include EXCLUDE_FROM_ALL)
FetchContent_MakeAvailable(cutlass)
target_include_directories(
  mlx SYSTEM PRIVATE $<BUILD_INTERFACE:${cutlass_SOURCE_DIR}/include>)


================================================
FILE: mlx/backend/cuda/allocator.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/utils.h"
#include "mlx/backend/gpu/device_info.h"
#include "mlx/memory.h"
#include "mlx/scheduler.h"
#include "mlx/utils.h"

#include <cuda_runtime.h>
#include <fmt/format.h>

#include <cassert>
#include <fstream>
#include <string>

namespace mlx::core {

namespace cu {

// Allocation granularity in bytes. Requests of at least this size are rounded
// up to a multiple of it in CudaAllocator::malloc_async, and it is also the
// first argument passed to the buffer cache constructor.
constexpr int page_size = 16384;

// Any allocation of at most this many bytes will try to use the small pool
// first (the comparison in malloc_async is `size <= small_block_size`).
constexpr int small_block_size = 8;

// The small pool size in bytes. This should be a multiple of the host page
// size and small_block_size.
constexpr int small_pool_size = 4 * page_size;

// Returns true when the process runs on Windows, or on Linux under the
// Windows Subsystem for Linux. Every other platform yields false.
bool is_windows() {
#if defined(_WIN32)
  return true;
#elif defined(__linux__)
  // WSL kernels advertise themselves in the first line of /proc/version. The
  // file is inspected once and the answer is cached for subsequent calls.
  static const bool running_under_wsl = []() {
    std::ifstream proc_version("/proc/version");
    if (!proc_version.is_open()) {
      return false;
    }
    std::string first_line;
    std::getline(proc_version, first_line);
    const char* const markers[] = {"microsoft", "Microsoft", "WSL"};
    for (const char* marker : markers) {
      if (first_line.find(marker) != std::string::npos) {
        return true;
      }
    }
    return false;
  }();
  return running_under_wsl;
#else
  return false;
#endif
}

// Reports whether managed memory can be used: every device returned by
// cu::device() must report managed_memory(), and on Windows/WSL it must also
// report concurrent_managed_access(). The answer is computed on the first call
// and cached.
bool supports_managed_memory() {
  static bool managed_memory = []() {
    const int num_devices = gpu::device_count();
    for (int idx = 0; idx < num_devices; ++idx) {
      auto& dev = cu::device(idx);
      // Empirically on Windows (and WSL) managed memory does not work when
      // the device lacks concurrentManagedAccess.
      const bool usable = dev.managed_memory() &&
          !(is_windows() && !dev.concurrent_managed_access());
      if (!usable) {
        return false;
      }
    }
    return true;
  }();
  return managed_memory;
}

// Allocates |size| bytes with cudaMallocManaged when managed memory is
// supported and with cudaMallocHost otherwise. Returns the allocated pointer.
inline void* unified_malloc(size_t size) {
  void* data = nullptr;
  if (!supports_managed_memory()) {
    CHECK_CUDA_ERROR(cudaMallocHost(&data, size));
    return data;
  }
  CHECK_CUDA_ERROR(cudaMallocManaged(&data, size));
  return data;
}

// Releases memory obtained from unified_malloc, using the free routine that
// pairs with the allocation routine chosen there.
inline void unified_free(void* data) {
  if (!supports_managed_memory()) {
    CHECK_CUDA_ERROR(cudaFreeHost(data));
    return;
  }
  CHECK_CUDA_ERROR(cudaFree(data));
}

// Builds the "location" argument handed to cudaMemAdvise for device |i|.
// With CUDART_VERSION >= 13000 this is a cudaMemLocation of device type;
// with older runtimes it is the plain device index.
#if CUDART_VERSION >= 13000
inline cudaMemLocation cuda_mem_loc(int i) {
  cudaMemLocation loc;
  loc.type = cudaMemLocationTypeDevice;
  loc.id = i;
  return loc;
}
#else
inline int cuda_mem_loc(int i) {
  return i;
}
#endif // CUDART_VERSION >= 13000

// Sets up the pool: allocates the block bookkeeping array and the backing
// unified memory, advises accessibility when managed memory is in use, and
// links all blocks into the free list.
SmallSizePool::SmallSizePool() {
  constexpr int block_count = small_pool_size / small_block_size;
  buffer_ = new Block[block_count];
  next_free_ = buffer_;

  data_ = unified_malloc(small_pool_size);
  if (supports_managed_memory()) {
    const int num_devices = gpu::device_count();
    for (int i = 0; i < num_devices; ++i) {
      if (!device(i).concurrent_managed_access()) {
        continue;
      }
      auto loc = cuda_mem_loc(i);
      CHECK_CUDA_ERROR(cudaMemAdvise(
          data_, small_pool_size, cudaMemAdviseSetAccessedBy, loc));
    }
  }

  // Chain every block to its successor; the last block terminates the list.
  Block* const last = buffer_ + (block_count - 1);
  for (Block* blk = buffer_; blk != last; ++blk) {
    blk->next = blk + 1;
  }
  last->next = nullptr;
}

// Releases the backing memory obtained from unified_malloc and then the
// block bookkeeping array allocated with new[] in the constructor.
SmallSizePool::~SmallSizePool() {
  unified_free(data_);
  delete[] buffer_;
}

// Pops one block off the free list and returns its CudaBuffer, pointing at
// the matching small_block_size slice of the pool memory. Returns nullptr
// when the free list is empty.
CudaBuffer* SmallSizePool::malloc() {
  Block* block = next_free_;
  if (block == nullptr) {
    return nullptr;
  }
  // The position of the block in the bookkeeping array selects the data slot.
  const uint64_t slot = block - buffer_;
  // Read the link before the union member is overwritten by the buffer.
  next_free_ = block->next;

  CudaBuffer& out = block->buf;
  out.data = static_cast<char*>(data_) + slot * small_block_size;
  out.size = small_block_size;
  out.device = -1;
  return &out;
}

// Returns |buf| to the pool by pushing its block onto the head of the free
// list. |buf| is reinterpreted as the Block that contains it.
void SmallSizePool::free(CudaBuffer* buf) {
  Block* released = reinterpret_cast<Block*>(buf);
  released->next = std::exchange(next_free_, released);
}

// Returns true if |buf| lives inside this pool's block array.
//
// The check compares integer addresses instead of subtracting pointers:
// |buf| is frequently a CudaBuffer allocated elsewhere with `new`, and
// subtracting pointers that do not point into the same array is undefined
// behaviour (the generated exact division can also yield arbitrary results
// when the byte distance is not a multiple of sizeof(Block)).
bool SmallSizePool::in_pool(CudaBuffer* buf) {
  constexpr size_t pool_bytes =
      static_cast<size_t>(small_pool_size / small_block_size) * sizeof(Block);
  const auto addr = reinterpret_cast<uintptr_t>(buf);
  const auto begin = reinterpret_cast<uintptr_t>(buffer_);
  return addr >= begin && (addr - begin) < pool_bytes;
}

// Builds the allocator: wires the buffer cache to this allocator's free
// routine, derives the memory limits from the total memory reported by
// cudaMemGetInfo, and records a free stream plus the default memory pool for
// every device that reports memory pool support.
CudaAllocator::CudaAllocator()
    : buffer_cache_(
          page_size,
          [](CudaBuffer* cached) { return cached->size; },
          [this](CudaBuffer* evicted) { free_cuda_buffer(evicted); }) {
  size_t free;
  CHECK_CUDA_ERROR(cudaMemGetInfo(&free, &total_memory_));
  // Use 95% of the total memory as the limit; the rest is the free reserve.
  memory_limit_ = total_memory_ * 0.95;
  free_limit_ = total_memory_ - memory_limit_;
  max_pool_size_ = memory_limit_;

  const int num_devices = gpu::device_count();
  free_streams_.resize(num_devices);
  mem_pools_.resize(num_devices);
  for (int i = 0; i < num_devices; ++i) {
    auto& dev = device(i);
    if (!dev.memory_pools()) {
      // Leave the default-constructed entries for this device untouched.
      continue;
    }
    free_streams_[i] = CudaStream(dev);
    CHECK_CUDA_ERROR(cudaDeviceGetDefaultMemPool(&mem_pools_[i], i));
  }
}

// Allocates a buffer of at least |size| bytes.
//
// |device| is the CUDA device to allocate on and |stream| the stream used for
// stream-ordered allocation; when |stream| is null or the request fits in a
// small block, the buffer is allocated as unified memory (device == -1).
// Cached buffers are reused when available. Throws std::runtime_error when
// the allocation yields a null pointer.
Buffer
CudaAllocator::malloc_async(size_t size, int device, cudaStream_t stream) {
  // Zero-sized requests get a placeholder buffer with no backing memory.
  if (size == 0) {
    return Buffer{new CudaBuffer{nullptr, 0, -1}};
  }

  // Round the request: small requests to 8 bytes, sub-page requests to the
  // next power of two, everything else to a whole number of pages.
  if (size <= small_block_size) {
    size = 8;
  } else if (size < page_size) {
    size = next_power_of_2(size);
  } else {
    size = page_size * ((size + page_size - 1) / page_size);
  }

  // Small blocks and stream-less requests always use unified memory.
  if (size <= small_block_size || stream == nullptr) {
    device = -1;
  }

  // Find available buffer from cache.
  std::unique_lock lock(mutex_);
  CudaBuffer* buf = buffer_cache_.reuse_from_cache(size);
  if (!buf) {
    // If we have a lot of memory pressure try to reclaim memory from the cache.
    int64_t mem_to_free =
        get_active_memory() + get_cache_memory() + size - memory_limit_;
    if (mem_to_free > 0) {
      buffer_cache_.release_cached_buffers(mem_to_free);
    }

    // Try the scalar pool first
    if (size <= small_block_size) {
      buf = scalar_pool_.malloc();
    }
    // The CUDA allocation calls below run without holding the mutex.
    lock.unlock();
    if (!buf) {
      void* data = nullptr;
      if (device == -1) {
        data = unified_malloc(size);
      } else {
        cu::device(device).make_current();
        if (mem_pools_[device]) { // supports memory pools
          CHECK_CUDA_ERROR(cudaMallocAsync(&data, size, stream));
        } else {
          CHECK_CUDA_ERROR(cudaMalloc(&data, size));
        }
      }
      if (!data) {
        std::ostringstream msg;
        msg << "[malloc] Unable to allocate " << size << " bytes.";
        throw std::runtime_error(msg.str());
      }
      buf = new CudaBuffer{data, size, device};
    }
    // Re-acquire the mutex before touching the cache and the counters.
    lock.lock();

    // If any cuda memory pool has too much reserved memory, clear some
    // memory from the cache. This prevents graph / kernel execution failing
    // from OOM
    if (get_cache_memory() > 0) {
      for (auto p : mem_pools_) {
        if (p) {
          size_t used = 0;
          CHECK_CUDA_ERROR(cudaMemPoolGetAttribute(
              p, cudaMemPoolAttrReservedMemCurrent, &used));
          if (used > (total_memory_ - free_limit_)) {
            buffer_cache_.release_cached_buffers(free_limit_);
            break;
          }
        }
      }
    }
  }
  // Account for the buffer that is about to be handed out.
  active_memory_ += buf->size;
  peak_memory_ = std::max(active_memory_, peak_memory_);

  // Maintain the cache below the requested limit.
  if (get_cache_memory() > max_pool_size_) {
    buffer_cache_.release_cached_buffers(get_cache_memory() - max_pool_size_);
  }
  lock.unlock();
  // Copy to unified memory here if the buffer is not on the right device.
  // (This can happen when a cached device buffer is reused for a request
  // targeting another device or unified memory.)
  if (buf->device >= 0 && buf->device != device) {
    move_to_unified_memory(*buf, stream);
  }
  return Buffer{buf};
}

// Allocates |size| bytes without a device or stream, i.e. forwards to
// malloc_async with device -1 and a null stream.
Buffer CudaAllocator::malloc(size_t size) {
  return malloc_async(size, -1, nullptr);
}

// Gives |buffer| back to the allocator. Null buffers are ignored, zero-sized
// placeholders are simply deleted, and everything else is either recycled
// into the cache or released, depending on how full the cache is.
void CudaAllocator::free(Buffer buffer) {
  auto* cuda_buf = static_cast<CudaBuffer*>(buffer.ptr());
  if (cuda_buf == nullptr) {
    return;
  }
  if (cuda_buf->size == 0) {
    delete cuda_buf;
    return;
  }

  std::unique_lock lock(mutex_);
  active_memory_ -= cuda_buf->size;
  const bool cache_is_full = !(get_cache_memory() < max_pool_size_);
  if (cache_is_full) {
    free_cuda_buffer(cuda_buf);
  } else {
    buffer_cache_.recycle_to_cache(cuda_buf);
  }
}

// Returns the size in bytes recorded for |buffer|, or 0 for a null buffer.
size_t CudaAllocator::size(Buffer buffer) const {
  auto* cuda_buf = static_cast<CudaBuffer*>(buffer.ptr());
  return cuda_buf ? cuda_buf->size : 0;
}

// Replaces the device memory of |buf| with memory from unified_malloc and
// copies the contents over. Buffers that are already unified (device == -1)
// are left untouched. When |stream| is given and the buffer's device has a
// memory pool, the copy and the release of the old memory are issued on that
// stream; otherwise a blocking cudaMemcpy is used.
void CudaAllocator::move_to_unified_memory(
    CudaBuffer& buf,
    cudaStream_t stream) {
  if (buf.device == -1) {
    return;
  }
  void* data = unified_malloc(buf.size);
  // Without managed memory the destination is pinned host memory, so the
  // copy direction is spelled out explicitly.
  cudaMemcpyKind kind =
      supports_managed_memory() ? cudaMemcpyDefault : cudaMemcpyDeviceToHost;
  if (stream && mem_pools_[buf.device]) {
    CHECK_CUDA_ERROR(cudaMemcpyAsync(data, buf.data, buf.size, kind, stream));
    free_async(buf, stream);
  } else {
    CHECK_CUDA_ERROR(cudaMemcpy(data, buf.data, buf.size, kind));
    free_async(buf);
  }
  // Only now is |buf| repointed: free_async above needed the old pointer and
  // device index.
  buf.data = data;
  buf.device = -1;
}

// Releases |buf|: pool-owned buffers go back to the scalar pool, all others
// have their memory freed and the CudaBuffer object deleted.
// This must be called with mutex_ acquired.
void CudaAllocator::free_cuda_buffer(CudaBuffer* buf) {
  if (!scalar_pool_.in_pool(buf)) {
    free_async(*buf);
    delete buf;
    return;
  }
  scalar_pool_.free(buf);
}

// Frees the memory behind |buf| (the CudaBuffer object itself is untouched).
// Unified buffers use unified_free; device buffers are freed on |stream| (or
// on the device's dedicated free stream when |stream| is null) if the device
// has a memory pool, and with a plain cudaFree otherwise.
void CudaAllocator::free_async(CudaBuffer& buf, cudaStream_t stream) {
  if (buf.device == -1) {
    unified_free(buf.data);
    return;
  }
  if (!mem_pools_[buf.device]) {
    CHECK_CUDA_ERROR(cudaFree(buf.data));
    return;
  }
  // Free asynchronously when memory pools is supported.
  if (!stream) {
    stream = free_streams_[buf.device];
  }
  CHECK_CUDA_ERROR(cudaFreeAsync(buf.data, stream));
}

// Returns the number of bytes currently handed out (read without locking).
size_t CudaAllocator::get_active_memory() const {
  return active_memory_;
}

// Returns the highest value active memory has reached since the last reset
// (read without locking).
size_t CudaAllocator::get_peak_memory() const {
  return peak_memory_;
}

// Resets the recorded peak to zero under the allocator mutex.
void CudaAllocator::reset_peak_memory() {
  std::lock_guard lock(mutex_);
  peak_memory_ = 0;
}

// Returns the current memory limit in bytes (read without locking).
size_t CudaAllocator::get_memory_limit() {
  return memory_limit_;
}

// Installs |limit| as the new memory limit and returns the previous one.
size_t CudaAllocator::set_memory_limit(size_t limit) {
  std::lock_guard lock(mutex_);
  std::swap(limit, memory_limit_);
  return limit;
}

// Returns the number of bytes currently held by the buffer cache.
size_t CudaAllocator::get_cache_memory() const {
  return buffer_cache_.cache_size();
}

// Installs |limit| as the new cache size limit and returns the previous one.
size_t CudaAllocator::set_cache_limit(size_t limit) {
  std::lock_guard lk(mutex_);
  std::swap(limit, max_pool_size_);
  return limit;
}

// Empties the buffer cache under the allocator mutex.
void CudaAllocator::clear_cache() {
  std::lock_guard lk(mutex_);
  buffer_cache_.clear();
}

// Returns the process-wide CudaAllocator, creating it on first use.
CudaAllocator& allocator() {
  // The instance is intentionally heap-allocated and never deleted: the
  // CudaAllocator destructor then does not run at exit and cached buffers are
  // leaked, which can save some time at program exit.
  static CudaAllocator* const instance = []() {
    // Ensure scheduler is created before allocator.
    scheduler::scheduler();
    return new CudaAllocator();
  }();
  return *instance;
}

// Convenience wrapper: allocates |size| bytes on the device and stream that
// |encoder| is bound to.
Buffer malloc_async(size_t size, CommandEncoder& encoder) {
  auto& alloc = allocator();
  return alloc.malloc_async(
      size, encoder.device().cuda_device(), encoder.stream());
}

} // namespace cu

namespace allocator {

// The generic allocator entry point resolves to the CUDA allocator.
Allocator& allocator() {
  return cu::allocator();
}

// Returns a pointer to the buffer's data, or nullptr for an empty Buffer.
// The underlying CudaBuffer is first moved to unified memory (a no-op when it
// already is unified), so calling this may replace the buffer's storage.
void* Buffer::raw_ptr() {
  if (!ptr_) {
    return nullptr;
  }
  auto& cbuf = *static_cast<cu::CudaBuffer*>(ptr_);
  cu::allocator().move_to_unified_memory(cbuf);
  return cbuf.data;
}

} // namespace allocator

// The functions below forward the mlx::core memory API to the CUDA allocator
// singleton.

// Bytes currently handed out by the allocator.
size_t get_active_memory() {
  return cu::allocator().get_active_memory();
}
// Highest active memory recorded since the last reset.
size_t get_peak_memory() {
  return cu::allocator().get_peak_memory();
}
// Resets the recorded peak memory to zero.
void reset_peak_memory() {
  return cu::allocator().reset_peak_memory();
}
// Sets a new memory limit and returns the previous one.
size_t set_memory_limit(size_t limit) {
  return cu::allocator().set_memory_limit(limit);
}
// Current memory limit in bytes.
size_t get_memory_limit() {
  return cu::allocator().get_memory_limit();
}
// Bytes currently held in the buffer cache.
size_t get_cache_memory() {
  return cu::allocator().get_cache_memory();
}
// Sets a new cache limit and returns the previous one.
size_t set_cache_limit(size_t limit) {
  return cu::allocator().set_cache_limit(limit);
}
// Releases all cached buffers.
void clear_cache() {
  cu::allocator().clear_cache();
}

// Not supported in CUDA.
// The argument is ignored and 0 is always returned.
size_t set_wired_limit(size_t) {
  return 0;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/allocator.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/allocator.h"
#include "mlx/backend/common/buffer_cache.h"
#include "mlx/backend/cuda/cuda_utils.h"

#include <cuda_runtime.h>
#include <mutex>
#include <set>
#include <utility>

namespace mlx::core::cu {

class CommandEncoder;

using allocator::Buffer;

// Describes one allocation made by the CUDA allocator: either device memory
// on a specific device or unified memory (managed memory or pinned host
// memory).
struct CudaBuffer {
  void* data; // Start of the allocation (nullptr for zero-sized buffers).
  size_t size; // Size of the allocation in bytes.
  int device; // Device index for device memory, -1 for unified memory.
};

// A fixed-size pool of small buffers carved out of one unified memory
// allocation. Free blocks are kept in an intrusive singly linked list.
class SmallSizePool {
 private:
  // A block is either a link in the free list or a live buffer descriptor.
  union Block {
    Block* next;
    CudaBuffer buf;
  };

  Block* buffer_{nullptr}; // Array holding every block of the pool.
  void* data_{nullptr}; // Backing memory that the buffers point into.
  Block* next_free_{nullptr}; // Head of the free list, nullptr when empty.

 public:
  SmallSizePool();
  ~SmallSizePool();

  // The pool owns raw allocations, so it cannot be copied.
  SmallSizePool(const SmallSizePool&) = delete;
  SmallSizePool& operator=(const SmallSizePool&) = delete;

  // Returns a buffer from the pool, or nullptr when the pool is exhausted.
  CudaBuffer* malloc();
  // Returns |buf|, which must come from this pool, to the free list.
  void free(CudaBuffer* buf);
  // Tells whether |buf| belongs to this pool.
  bool in_pool(CudaBuffer* buf);
};

// The allocator used by the CUDA backend. It hands out CudaBuffer objects,
// reuses freed buffers through a cache and serves very small requests from a
// SmallSizePool. The single instance is obtained through allocator().
class CudaAllocator : public allocator::Allocator {
 public:
  // Allocates unified memory (no device, no stream).
  Buffer malloc(size_t size) override;
  // Allocates on |device| using |stream|; falls back to unified memory when
  // |stream| is null or the request is small.
  Buffer malloc_async(size_t size, int device, cudaStream_t stream);
  // Returns a buffer to the cache or releases it.
  void free(Buffer buffer) override;
  // Size in bytes recorded for |buffer| (0 for a null buffer).
  size_t size(Buffer buffer) const override;

  // Replace the memory of |buf| with unified memory (managed memory or pinned
  // host memory), and copy the data over. Pass |stream| to copy asynchronously.
  void move_to_unified_memory(CudaBuffer& buf, cudaStream_t stream = nullptr);

  // Memory statistics and limits; the setters return the previous value.
  size_t get_active_memory() const;
  size_t get_peak_memory() const;
  void reset_peak_memory();
  size_t get_memory_limit();
  size_t set_memory_limit(size_t limit);
  size_t get_cache_memory() const;
  size_t set_cache_limit(size_t limit);
  void clear_cache();

 private:
  // Releases a buffer for good; must be called with mutex_ held.
  void free_cuda_buffer(CudaBuffer* buf);
  // Frees only the memory behind |buf|, asynchronously where supported.
  void free_async(CudaBuffer& buf, cudaStream_t stream = nullptr);

  // Construction is restricted to the allocator() accessor.
  CudaAllocator();
  friend CudaAllocator& allocator();

  std::mutex mutex_;
  size_t memory_limit_;
  size_t free_limit_;
  size_t total_memory_;
  size_t max_pool_size_;
  BufferCache<CudaBuffer> buffer_cache_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
  // Per-device entries, indexed by device; only filled in for devices that
  // report memory pool support.
  std::vector<CudaStream> free_streams_;
  std::vector<cudaMemPool_t> mem_pools_;
  SmallSizePool scalar_pool_;
};

// Returns the process-wide CUDA allocator instance.
CudaAllocator& allocator();

// Allocates |size| bytes on the device and stream associated with |encoder|.
Buffer malloc_async(size_t size, CommandEncoder& encoder);

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/arange.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/fp16_math.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Fills out[i] = start + i * step for i in [0, size).
//
// Each thread is responsible for N_WRITES consecutive elements. Threads whose
// whole group lies inside the output emit one vectorized store; the single
// thread covering the ragged tail writes its elements one by one; threads
// past the end do nothing.
//
// The work split is decided by comparing the thread index against
// size / N_WRITES rather than computing (index + 1) * N_WRITES, because the
// latter can overflow a 32-bit IdxT for threads at or beyond the end of the
// output when size is close to the maximum value of IdxT.
template <typename T, typename IdxT, int N_WRITES>
__global__ void arange(T* out, IdxT size, T start, T step) {
  IdxT index = cg::this_grid().thread_rank();
  const IdxT num_full_vectors = size / N_WRITES;

  if (index < num_full_vectors) {
    AlignedVector<T, N_WRITES> out_vec;
#pragma unroll
    for (int i = 0; i < N_WRITES; ++i) {
      out_vec[i] = start + (index * N_WRITES + i) * step;
    }

    store_vector<N_WRITES>(out, index, out_vec);
  } else if (index == num_full_vectors) {
    // Tail: index * N_WRITES <= size here, so the product cannot overflow.
    for (IdxT i = index * N_WRITES; i < size; ++i) {
      out[i] = start + i * step;
    }
  }
}

} // namespace cu

// Evaluates Arange on the GPU: allocates the output and launches the arange
// kernel specialized on the output dtype and on the index width.
void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Arange::eval_gpu");
  // Nothing to compute for an empty output.
  if (out.size() == 0) {
    return;
  }
  auto& encoder = cu::get_command_encoder(stream());
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  encoder.set_output_array(out);

  dispatch_int_float_types(out.dtype(), "Arange", [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    using OutType = cuda_type_t<CTYPE>;
    // Each thread writes 16 bytes worth of elements.
    constexpr int N_WRITES = 16 / sizeof(OutType);
    // Use 64-bit indexing only when the output does not fit 32-bit indices.
    dispatch_bool(out.data_size() > INT32_MAX, [&](auto large) {
      using IdxT = std::conditional_t<large(), int64_t, int32_t>;
      auto [num_blocks, block_dims] = get_launch_args(out, large(), N_WRITES);
      // The step passed to the kernel is the difference of the first two
      // values after conversion to the output type.
      encoder.add_kernel_node(
          cu::arange<OutType, IdxT, N_WRITES>,
          num_blocks,
          block_dims,
          gpu_ptr<OutType>(out),
          out.data_size(),
          static_cast<CTYPE>(start_),
          static_cast<CTYPE>(start_ + step_) - static_cast<CTYPE>(start_));
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/arg_reduce.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/fp16_math.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>
#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>

#include <cassert>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// A candidate result of an arg-reduction: a value together with the index
// along the reduced axis at which it was found.
template <typename T>
struct IndexValPair {
  uint32_t index;
  T val;
};

// Reduction functor for argmin.
template <typename T>
struct ArgMin {
  // Starting value for the running minimum.
  constexpr __device__ T init() {
    return Limits<T>::max();
  }

  // Combines two candidates: the smaller value wins and, on equal values,
  // the smaller index wins.
  __device__ IndexValPair<T> operator()(
      const IndexValPair<T>& best,
      const IndexValPair<T>& current) {
    const bool current_wins = best.val > current.val ||
        (best.val == current.val && best.index > current.index);
    return current_wins ? current : best;
  }

  // Folds the N values of |vals| into |best|; element k of |vals| has index
  // |offset| + k. Only a strictly smaller value replaces the current best.
  template <int N>
  __device__ IndexValPair<T> reduce_many(
      IndexValPair<T> best,
      const AlignedVector<T, N>& vals,
      uint32_t offset) {
#pragma unroll
    for (int lane = 0; lane < N; ++lane) {
      if (!(vals[lane] < best.val)) {
        continue;
      }
      best.val = vals[lane];
      best.index = offset + lane;
    }
    return best;
  }
};

// Reduction functor for argmax.
template <typename T>
struct ArgMax {
  // Starting value for the running maximum.
  constexpr __device__ T init() {
    return Limits<T>::min();
  }

  // Combines two candidates: the larger value wins and, on equal values,
  // the smaller index wins.
  __device__ IndexValPair<T> operator()(
      const IndexValPair<T>& best,
      const IndexValPair<T>& current) {
    const bool current_wins = best.val < current.val ||
        (best.val == current.val && best.index > current.index);
    return current_wins ? current : best;
  }

  // Folds the N values of |vals| into |best|; element k of |vals| has index
  // |offset| + k. Only a strictly larger value replaces the current best.
  template <int N>
  __device__ IndexValPair<T> reduce_many(
      IndexValPair<T> best,
      const AlignedVector<T, N>& vals,
      uint32_t offset) {
#pragma unroll
    for (int lane = 0; lane < N; ++lane) {
      if (!(vals[lane] > best.val)) {
        continue;
      }
      best.val = vals[lane];
      best.index = offset + lane;
    }
    return best;
  }
};

// Arg-reduction kernel for arbitrarily strided inputs.
//
// One thread block handles one output element: the block's rank selects the
// output position, the block's threads scan the reduced axis cooperatively
// and thread 0 writes the winning index.
//
// - in / out: input values and output indices.
// - size: number of output elements.
// - shape / in_strides / out_strides / ndim: the non-reduced dimensions and
//   the matching strides of the input and the output.
// - axis_stride / axis_size: stride and length of the reduced axis.
template <typename T, typename Op, int BLOCK_DIM, int N_READS = 4>
__global__ void arg_reduce_general(
    const T* in,
    uint32_t* out,
    size_t size,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides in_strides,
    const __grid_constant__ Strides out_strides,
    int32_t ndim,
    int64_t axis_stride,
    int32_t axis_size) {
  auto block = cg::this_thread_block();

  // Blocks beyond the number of outputs have nothing to do.
  int64_t index = cg::this_grid().block_rank();
  if (index >= size) {
    return;
  }

  // Locate the start of the reduced axis in the input and the output slot.
  int64_t in_idx = elem_to_loc(index, shape.data(), in_strides.data(), ndim);
  int64_t out_idx = elem_to_loc(index, shape.data(), out_strides.data(), ndim);
  in += in_idx;

  Op op;
  T init = op.init();
  IndexValPair<T> best{0, init};

  // Each iteration the block consumes BLOCK_DIM * N_READS axis elements;
  // positions past axis_size are filled with |init| by load_vector.
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto tid = r * BLOCK_DIM + block.thread_index().x;
    auto vals = load_vector<N_READS>(in, tid, axis_size, axis_stride, init);
    best = op.reduce_many(best, vals, tid * N_READS);
  }

  // Combine the per-thread candidates across the block.
  typedef cub::BlockReduce<IndexValPair<T>, BLOCK_DIM> BlockReduceT;
  __shared__ typename BlockReduceT::TempStorage temp;

  best = BlockReduceT(temp).Reduce(best, op);

  if (block.thread_rank() == 0) {
    out[out_idx] = best.index;
  }
}

} // namespace cu

// Evaluates ArgReduce (argmin / argmax along axis_) on the GPU by launching
// arg_reduce_general with one block per output element.
void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("ArgReduce::eval_gpu");
  assert(inputs.size() == 1);
  auto& in = inputs[0];

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  // Prepare the shapes, strides and axis arguments.
  // The reduced axis is removed from the shape and strides; the output only
  // has that axis to remove when it kept the same number of dimensions.
  Shape shape = remove_index(in.shape(), axis_);
  Strides in_strides = remove_index(in.strides(), axis_);
  Strides out_strides = out.ndim() == in.ndim()
      ? remove_index(out.strides(), axis_)
      : out.strides();
  int64_t axis_stride = in.strides()[axis_];
  int32_t axis_size = in.shape()[axis_];
  int32_t ndim = shape.size();

  // ArgReduce.
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_real_types(in.dtype(), "ArgReduce", [&](auto type_tag) {
    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    constexpr uint32_t N_READS = 4;
    // The block size is chosen from the number of N_READS-wide loads needed
    // to cover the reduced axis.
    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
      dim3 num_blocks = get_2d_grid_dims(out.shape(), out.strides());
      // Default to the ArgMax instantiation and switch to ArgMin on request.
      auto kernel =
          cu::arg_reduce_general<T, cu::ArgMax<T>, block_dim(), N_READS>;
      if (reduce_type_ == ArgReduce::ArgMin) {
        kernel = cu::arg_reduce_general<T, cu::ArgMin<T>, block_dim(), N_READS>;
      }
      encoder.add_kernel_node(
          kernel,
          num_blocks,
          block_dim(),
          gpu_ptr<T>(in),
          gpu_ptr<uint32_t>(out),
          out.size(),
          const_param(shape),
          const_param(in_strides),
          const_param(out_strides),
          ndim,
          axis_stride,
          axis_size);
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/bin2h.cmake
================================================
# Based on: https://github.com/sivachandran/cmake-bin2h
#
# Copyright 2020 Sivachandran Paramasivam
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

include(CMakeParseArguments)

# Function to wrap a given string into multiple lines at the given column
# position.
#
# Every output line is prefixed with a newline and a single space. An empty
# input string yields an empty result.
#
# Parameters:
#
# * VARIABLE - The name of the CMake variable holding the string. The wrapped
#   result is written back to this variable in the caller's scope.
# * AT_COLUMN - The column position at which string will be wrapped.
function(WRAP_STRING)
  # Initialize explicitly: CMake functions see the caller's variables, so
  # without this `options` and `lines` would be inherited from the call site.
  set(options "")
  set(oneValueArgs VARIABLE AT_COLUMN)
  cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})

  # Quote the string so that an empty value is still a valid argument.
  string(LENGTH "${${WRAP_STRING_VARIABLE}}" stringLength)
  math(EXPR offset "0")
  set(lines "")

  while(stringLength GREATER 0)
    # Take a full line, or whatever is left for the final line.
    if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
      math(EXPR length "${WRAP_STRING_AT_COLUMN}")
    else()
      math(EXPR length "${stringLength}")
    endif()

    string(SUBSTRING "${${WRAP_STRING_VARIABLE}}" ${offset} ${length} line)
    set(lines "${lines}\n ${line}")

    math(EXPR stringLength "${stringLength} - ${length}")
    math(EXPR offset "${offset} + ${length}")
  endwhile()

  set(${WRAP_STRING_VARIABLE}
      "${lines}"
      PARENT_SCOPE)
endfunction()

# Function to embed the contents of files as byte arrays in a C/C++ header
# file (.h). For every source file the header will contain one
# `constexpr char` array named `<VARIABLE_NAME>_<file name as C identifier>`.
#
# Parameters:
#
# * SOURCE_FILES - The paths of source files whose contents will be embedded in
#   the header file.
# * VARIABLE_NAME - The prefix of the variable names for the byte arrays. The
#   file name (without extension, converted to a valid C identifier) is
#   appended to it after an underscore.
# * HEADER_FILE - The path of header file.
# * APPEND - If specified appends to the header file instead of overwriting it
# * HEADER_NAMESPACE - The namespace, where the arrays should be located in. If
#   omitted, the arrays are emitted at global scope.
# * NULL_TERMINATE - If specified a null byte (zero) will be appended to each
#   byte array.
#
# Usage:
#
# bin2h(SOURCE_FILES "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
function(BIN2H)
  set(options APPEND NULL_TERMINATE)
  set(oneValueArgs VARIABLE_NAME HEADER_FILE HEADER_NAMESPACE)
  set(multiValueArgs SOURCE_FILES)
  cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN})

  set(arrayDefinition "")
  foreach(SOURCE_FILE IN LISTS BIN2H_SOURCE_FILES)
    # get filename without extension
    get_filename_component(FILE_NAME_WE ${SOURCE_FILE} NAME_WE)
    # convert the filename to a valid C identifier
    string(MAKE_C_IDENTIFIER "${FILE_NAME_WE}" VALID_FILE_NAME)

    # reads source file contents as hex string
    file(READ ${SOURCE_FILE} hexString HEX)

    # append null
    if(BIN2H_NULL_TERMINATE)
      string(APPEND hexString "00")
    endif()

    # wraps the hex string into multiple lines
    wrap_string(VARIABLE hexString AT_COLUMN 24)

    # strip the © in source code (UTF-8 bytes c2 a9 become two spaces)
    string(REGEX REPLACE "c2a9" "2020" arrayValues ${hexString})

    # turn every hex byte into a ` 0xNN,` array element
    string(REGEX REPLACE "([0-9a-f][0-9a-f])" " 0x\\1," arrayValues
                         ${arrayValues})

    # make a full variable name for the array
    set(FULL_VARIABLE_NAME "${BIN2H_VARIABLE_NAME}_${VALID_FILE_NAME}")

    # declares the byte array
    string(APPEND arrayDefinition
           "constexpr char ${FULL_VARIABLE_NAME}[] = {${arrayValues}\n};\n\n")
  endforeach()

  # add namespace wrapper if defined; otherwise emit the arrays as they are so
  # that the header is never written without its array definitions
  if(DEFINED BIN2H_HEADER_NAMESPACE)
    set(namespaceStart "namespace ${BIN2H_HEADER_NAMESPACE} {")
    set(namespaceEnd "} // namespace ${BIN2H_HEADER_NAMESPACE}")
    set(declarations "${namespaceStart}\n\n${arrayDefinition}${namespaceEnd}\n")
  else()
    set(declarations "${arrayDefinition}")
  endif()

  set(arrayIncludes "#pragma once")
  string(PREPEND declarations "${arrayIncludes}\n\n")

  if(BIN2H_APPEND)
    file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
  else()
    file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
  endif()
endfunction()

# ----------------------------- CLI args -----------------------------
#
# Script entry point. Expected variables:
#
# * MLX_JIT_SOURCES - Colon-separated list of source paths, relative to
#   MLX_SOURCE_ROOT.
# * MLX_SOURCE_ROOT - Directory the source paths are resolved against.

# Turn the colon-separated list into a CMake list and make the paths absolute.
string(REPLACE ":" ";" MLX_JIT_SOURCES_LIST ${MLX_JIT_SOURCES})
foreach(source ${MLX_JIT_SOURCES_LIST})
  list(APPEND MLX_JIT_SOURCES_ABS "${MLX_SOURCE_ROOT}/${source}")
endforeach()

# Generate gen/cuda_jit_sources.h with one null-terminated `jit_source_<file>`
# array per source file, inside namespace mlx::core.
bin2h(
  SOURCE_FILES
  ${MLX_JIT_SOURCES_ABS}
  NULL_TERMINATE
  VARIABLE_NAME
  "jit_source"
  HEADER_NAMESPACE
  "mlx::core"
  HEADER_FILE
  "${CMAKE_CURRENT_BINARY_DIR}/gen/cuda_jit_sources.h")


================================================
FILE: mlx/backend/cuda/binary/CMakeLists.txt
================================================
# CUDA implementations of the binary primitives, one translation unit per
# operation. All of them are private sources of the mlx target.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/add.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/arctan2.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/bitwise_binary.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/divide.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/equal.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/greater.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/greater_equal.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/less.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/less_equal.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/logical_and.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/logical_or.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/log_add_exp.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/minimum.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/maximum.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/multiply.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/power.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/remainder.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/not_equal.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/subtract.cu)


================================================
FILE: mlx/backend/cuda/binary/add.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Expands the BINARY_GPU macro from binary.cuh for the Add primitive.
BINARY_GPU(Add)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/arctan2.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Expands the BINARY_GPU macro from binary.cuh for the ArcTan2 primitive.
BINARY_GPU(ArcTan2)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/binary.cuh
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/binary.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/binary_ops.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Upper bound on threads per block, used as the __launch_bounds__ argument of
// the contiguous binary kernels below.
constexpr int BINARY_MAX_BLOCK_DIM = 1024;

// Binary kernel where both operands are scalars: every output element is
// Op(a[0], b[0]).
//
// Each thread produces N_READS consecutive outputs. Threads whose group fits
// entirely inside the output emit one vectorized store; the remaining thread
// writes the tail element by element.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ __launch_bounds__(BINARY_MAX_BLOCK_DIM) void binary_ss(
    const In* a,
    const In* b,
    Out* out,
    IdxT size) {
  IdxT index = cg::this_grid().thread_rank();

  if ((index + 1) * N_READS > size) {
    // The loop counter must have the index type: with a 64-bit IdxT a plain
    // `int` would truncate the start offset and could never reach |size|.
    for (IdxT i = index * N_READS; i < size; ++i) {
      out[i] = Op{}(a[0], b[0]);
    }
  } else {
    AlignedVector<Out, N_READS> out_vec;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      out_vec[i] = Op{}(a[0], b[0]);
    }

    store_vector<N_READS>(out, index, out_vec);
  }
}

// Binary kernel with a scalar left operand and a contiguous right operand:
// out[i] = Op(a[0], b[i]). Each thread handles N_READS consecutive elements,
// vectorized when the whole group is in range and one by one for the tail.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ __launch_bounds__(BINARY_MAX_BLOCK_DIM) void binary_sv(
    const In* a,
    const In* b,
    Out* out,
    IdxT size) {
  IdxT index = cg::this_grid().thread_rank();

  if ((index + 1) * N_READS <= size) {
    // Fast path: the whole group of N_READS elements is inside the output.
    auto rhs = load_vector<N_READS>(b, index);

    AlignedVector<Out, N_READS> result;
#pragma unroll
    for (int k = 0; k < N_READS; ++k) {
      result[k] = Op{}(a[0], rhs[k]);
    }

    store_vector<N_READS>(out, index, result);
    return;
  }

  // Tail: process whatever elements remain individually.
  for (IdxT pos = index * N_READS; pos < size; ++pos) {
    out[pos] = Op{}(a[0], b[pos]);
  }
}

// Binary kernel with a contiguous left operand and a scalar right operand:
// out[i] = Op(a[i], b[0]). Each thread handles N_READS consecutive elements,
// vectorized when the whole group is in range and one by one for the tail.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ __launch_bounds__(BINARY_MAX_BLOCK_DIM) void binary_vs(
    const In* a,
    const In* b,
    Out* out,
    IdxT size) {
  IdxT index = cg::this_grid().thread_rank();

  if ((index + 1) * N_READS <= size) {
    // Fast path: the whole group of N_READS elements is inside the output.
    auto lhs = load_vector<N_READS>(a, index);

    AlignedVector<Out, N_READS> result;
#pragma unroll
    for (int k = 0; k < N_READS; ++k) {
      result[k] = Op{}(lhs[k], b[0]);
    }

    store_vector<N_READS>(out, index, result);
    return;
  }

  // Tail: process whatever elements remain individually.
  for (IdxT pos = index * N_READS; pos < size; ++pos) {
    out[pos] = Op{}(a[pos], b[0]);
  }
}

// Binary kernel with two contiguous operands: out[i] = Op(a[i], b[i]). Each
// thread handles N_READS consecutive elements, vectorized when the whole
// group is in range and one by one for the tail.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ __launch_bounds__(BINARY_MAX_BLOCK_DIM) void binary_vv(
    const In* a,
    const In* b,
    Out* out,
    IdxT size) {
  IdxT index = cg::this_grid().thread_rank();

  if ((index + 1) * N_READS <= size) {
    // Fast path: the whole group of N_READS elements is inside the output.
    auto lhs = load_vector<N_READS>(a, index);
    auto rhs = load_vector<N_READS>(b, index);

    AlignedVector<Out, N_READS> result;
#pragma unroll
    for (int k = 0; k < N_READS; ++k) {
      result[k] = Op{}(lhs[k], rhs[k]);
    }

    store_vector<N_READS>(out, index, result);
    return;
  }

  // Tail: process whatever elements remain individually.
  for (IdxT pos = index * N_READS; pos < size; ++pos) {
    out[pos] = Op{}(a[pos], b[pos]);
  }
}

// Binary kernel for strided operands whose number of dimensions NDIM is known
// at compile time.
//
// The grid's y dimension enumerates the "rest" index (all dimensions but the
// last) and the x dimension enumerates groups of N_READS elements along the
// last dimension. The output is written contiguously, one row of
// shape[NDIM - 1] elements per rest index.
template <
    typename Op,
    typename In,
    typename Out,
    typename IdxT,
    int NDIM,
    int N_READS>
__global__ void binary_g_nd(
    const In* a,
    const In* b,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
  auto block = cg::this_thread_block();
  auto grid = cg::this_grid();
  IdxT index_rest =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  // Threads beyond the last row have nothing to do.
  if (index_rest >= size_rest) {
    return;
  }

  auto shape_x = shape[NDIM - 1];
  auto a_stride_x = a_strides[NDIM - 1];
  auto b_stride_x = b_strides[NDIM - 1];
  IdxT index_x =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;
  // Offsets of the first element of this row in both operands.
  auto [a_idx, b_idx] = elem_to_loc_nd<NDIM>(
      index_rest * shape_x, shape.data(), a_strides.data(), b_strides.data());
  // Strided loads along the last dimension; In(0) is passed as the fallback
  // value argument of load_vector.
  auto a_vec =
      load_vector<N_READS>(a + a_idx, index_x, shape_x, a_stride_x, In(0));
  auto b_vec =
      load_vector<N_READS>(b + b_idx, index_x, shape_x, b_stride_x, In(0));

  AlignedVector<Out, N_READS> out_vec;
#pragma unroll
  for (int i = 0; i < N_READS; ++i) {
    out_vec[i] = Op{}(a_vec[i], b_vec[i]);
  }
  store_vector(out + shape_x * index_rest, index_x, out_vec, shape_x);
}

// Strided element-wise binary kernel for a runtime dimension count `ndim`.
// The y axis of the launch indexes the output with its last dimension
// removed (size_rest entries); the x axis indexes N_READS-wide chunks along
// the last dimension.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void binary_g(
    const In* a,
    const In* b,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides a_strides,
    const __grid_constant__ Strides b_strides,
    int ndim) {
  auto tb = cg::this_thread_block();
  auto launch = cg::this_grid();
  IdxT row = launch.block_index().y * tb.dim_threads().y + tb.thread_index().y;
  if (row >= size_rest) {
    return;
  }
  IdxT col = launch.block_index().x * tb.dim_threads().x + tb.thread_index().x;

  // Extent and per-input strides of the innermost dimension.
  auto inner_extent = shape[ndim - 1];
  auto a_inner_stride = a_strides[ndim - 1];
  auto b_inner_stride = b_strides[ndim - 1];

  // Offsets of the first element of this row in each input.
  auto [a_base, b_base] = elem_to_loc(
      row * inner_extent,
      shape.data(),
      a_strides.data(),
      b_strides.data(),
      ndim);
  // In(0) is handed to load_vector as the fallback value (presumably for
  // lanes past the end of the row).
  auto lhs = load_vector<N_READS>(
      a + a_base, col, inner_extent, a_inner_stride, In(0));
  auto rhs = load_vector<N_READS>(
      b + b_base, col, inner_extent, b_inner_stride, In(0));

  AlignedVector<Out, N_READS> result;
#pragma unroll
  for (int k = 0; k < N_READS; ++k) {
    result[k] = Op{}(lhs[k], rhs[k]);
  }
  // The output is written row by row with inner_extent elements per row.
  store_vector(out + inner_extent * row, col, result, inner_extent);
}

// Compile-time predicate telling whether the binary functor `Op` may be
// instantiated with input element type `In` and output element type `Out`.
// Functors that are not listed here are never supported.
template <typename Op, typename In, typename Out>
constexpr bool supports_binary_op() {
  constexpr bool same_in_out = std::is_same_v<In, Out>;
  constexpr bool bool_out = std::is_same_v<Out, bool>;

  constexpr bool is_arithmetic =
      std::is_same_v<Op, Add> || std::is_same_v<Op, Divide> ||
      std::is_same_v<Op, Maximum> || std::is_same_v<Op, Minimum> ||
      std::is_same_v<Op, Multiply> || std::is_same_v<Op, Subtract> ||
      std::is_same_v<Op, Power> || std::is_same_v<Op, Remainder>;
  constexpr bool is_comparison =
      std::is_same_v<Op, Equal> || std::is_same_v<Op, Greater> ||
      std::is_same_v<Op, GreaterEqual> || std::is_same_v<Op, Less> ||
      std::is_same_v<Op, LessEqual> || std::is_same_v<Op, NotEqual>;
  constexpr bool is_logical =
      std::is_same_v<Op, LogicalAnd> || std::is_same_v<Op, LogicalOr>;
  constexpr bool is_bitwise = std::is_same_v<Op, BitwiseAnd> ||
      std::is_same_v<Op, BitwiseOr> || std::is_same_v<Op, BitwiseXor>;
  constexpr bool is_shift =
      std::is_same_v<Op, LeftShift> || std::is_same_v<Op, RightShift>;

  if constexpr (is_arithmetic) {
    // Result has the same element type as the operands.
    return same_in_out;
  } else if constexpr (is_comparison) {
    // Any input type, boolean result.
    return bool_out;
  } else if constexpr (is_logical) {
    return bool_out && std::is_same_v<In, bool>;
  } else if constexpr (std::is_same_v<Op, NaNEqual>) {
    return bool_out && is_inexact_v<In>;
  } else if constexpr (std::is_same_v<Op, LogAddExp>) {
    return same_in_out && is_inexact_v<In>;
  } else if constexpr (std::is_same_v<Op, ArcTan2>) {
    return same_in_out && is_floating_v<In>;
  } else if constexpr (is_bitwise) {
    return same_in_out && std::is_integral_v<In>;
  } else if constexpr (is_shift) {
    // Shifts are defined for integral types other than bool.
    return same_in_out && std::is_integral_v<In> && !std::is_same_v<In, bool>;
  } else {
    return false;
  }
}

} // namespace cu

// Encodes the kernel launch that computes `out = Op(inputs[0], inputs[1])`
// element-wise on stream `s`.
//
// No allocation happens in this function: `out` is handed to the kernel
// as-is. `op` is only used to build the error message.
//
// Throws std::runtime_error when cu::supports_binary_op rejects the
// (dtype of inputs[0], dtype of out) pair for `Op`.
template <typename Op>
void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  assert(inputs.size() > 1);
  const auto& a = inputs[0];
  const auto& b = inputs[1];
  // Nothing to launch for an empty output.
  if (out.size() == 0) {
    return;
  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
  // Map the runtime dtypes of `a` and `out` to C++ types. `b` is read with
  // the same element type as `a` (see gpu_ptr<InType>(b) below).
  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
      if constexpr (cu::supports_binary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
        using InType = cuda_type_t<CTYPE_IN>;
        using OutType = cuda_type_t<CTYPE_OUT>;
        auto bopt = get_binary_op_type(a, b);
        if (bopt == BinaryOpType::General) {
          // Strided path. 64-bit indices are used only when one of the
          // data_size() values exceeds INT32_MAX.
          dispatch_bool(
              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
                  out.data_size() > INT32_MAX,
              [&](auto large) {
                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
                Shape shape;
                std::vector<Strides> strides;
                std::tie(shape, strides) = collapse_contiguous_dims(a, b, out);
                auto& a_strides = strides[0];
                auto& b_strides = strides[1];
                int ndim = shape.size();
                int work_per_thread = 1;
                // dim0 is the extent of the last (collapsed) dimension and
                // `rest` the number of rows of that length in the output.
                auto dim0 = ndim > 0 ? shape.back() : 1;
                auto rest = out.size() / dim0;
                // Each thread handles 4 elements of a row when the row has
                // at least 4 elements.
                if (dim0 >= 4) {
                  work_per_thread = 4;
                }
                // From here on dim0 is the number of threads needed per row.
                dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
                auto block_dims = get_block_dims(dim0, rest, 1);
                uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
                uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
                if (ndim <= 3) {
                  // Small rank: use the kernel specialized on the number of
                  // dimensions.
                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
                    auto kernel = cu::binary_g_nd<
                        Op,
                        InType,
                        OutType,
                        IdxT,
                        dims_constant(),
                        1>;
                    if (work_per_thread == 4) {
                      kernel = cu::binary_g_nd<
                          Op,
                          InType,
                          OutType,
                          IdxT,
                          dims_constant(),
                          4>;
                    }
                    encoder.add_kernel_node(
                        kernel,
                        {num_blocks_x, num_blocks_y},
                        block_dims,
                        gpu_ptr<InType>(a),
                        gpu_ptr<InType>(b),
                        gpu_ptr<OutType>(out),
                        rest,
                        const_param<dims_constant()>(shape),
                        const_param<dims_constant()>(a_strides),
                        const_param<dims_constant()>(b_strides));
                  });
                } else {
                  // Higher rank: kernel that takes the rank at runtime.
                  auto kernel = cu::binary_g<Op, InType, OutType, IdxT, 1>;
                  if (work_per_thread == 4) {
                    kernel = cu::binary_g<Op, InType, OutType, IdxT, 4>;
                  }
                  encoder.add_kernel_node(
                      kernel,
                      {num_blocks_x, num_blocks_y},
                      block_dims,
                      gpu_ptr<InType>(a),
                      gpu_ptr<InType>(b),
                      gpu_ptr<OutType>(out),
                      rest,
                      const_param(shape),
                      const_param(a_strides),
                      const_param(b_strides),
                      ndim);
                }
              });
        } else {
          // Non-General layouts: one launch over out.data_size() elements.
          // 64-bit indices are used only when that count exceeds UINT32_MAX.
          dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            // Number of input elements that fit in 16 bytes.
            constexpr int N_READS = 16 / sizeof(InType);
            // binary_ss is the default; it is kept for any bopt not matched
            // below (presumably ScalarScalar).
            auto kernel = cu::binary_ss<Op, InType, OutType, IdxT, N_READS>;
            if (bopt == BinaryOpType::ScalarVector) {
              kernel = cu::binary_sv<Op, InType, OutType, IdxT, N_READS>;
            } else if (bopt == BinaryOpType::VectorScalar) {
              kernel = cu::binary_vs<Op, InType, OutType, IdxT, N_READS>;
            } else if (bopt == BinaryOpType::VectorVector) {
              kernel = cu::binary_vv<Op, InType, OutType, IdxT, N_READS>;
            }
            auto [num_blocks, block_dims] = get_launch_args(
                out.data_size(),
                out.shape(),
                out.strides(),
                large(),
                N_READS,
                cu::BINARY_MAX_BLOCK_DIM);
            encoder.add_kernel_node(
                kernel,
                num_blocks,
                block_dims,
                gpu_ptr<InType>(a),
                gpu_ptr<InType>(b),
                gpu_ptr<OutType>(out),
                out.data_size());
          });
        }
      } else {
        // Reached at runtime for dtype pairs the functor does not support.
        throw std::runtime_error(
            fmt::format(
                "Can not do binary op {} on inputs of {} with result of {}.",
                op,
                dtype_to_string(a.dtype()),
                dtype_to_string(out.dtype())));
      }
    });
  });
}

// Sets up the data of `out` for the binary op on inputs[0] and inputs[1]
// (new buffers come from cu::malloc_async on the stream's command encoder),
// then encodes the computation through binary_op_gpu_inplace.
template <typename Op>
void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  auto& lhs = inputs[0];
  auto& rhs = inputs[1];
  auto layout = get_binary_op_type(lhs, rhs);
  auto& enc = cu::get_command_encoder(s);

  // Allocation callback given to set_binary_op_output_data.
  auto allocate = [&](auto nbytes) { return cu::malloc_async(nbytes, enc); };
  set_binary_op_output_data(lhs, rhs, out, layout, allocate);

  binary_op_gpu_inplace<Op>(inputs, out, op, s);
}

// Defines `func::eval_gpu` for a binary primitive `func`. The generated
// method opens an NVTX range named "<func>::eval_gpu", takes the stream from
// the primitive attached to `out`, and forwards to binary_op_gpu with the
// functor `cu::func` and the primitive's name().
#define BINARY_GPU(func)                                              \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    nvtx3::scoped_range r(#func "::eval_gpu");                        \
    auto& s = out.primitive().stream();                               \
    binary_op_gpu<cu::func>(inputs, out, name(), s);                  \
  }

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/bitwise_binary.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// GPU evaluation of BitwiseBinary: selects the cu:: functor that matches
// `op_` and runs binary_op_gpu with it on the primitive's stream. An `op_`
// value outside the listed enumerators launches nothing.
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range range("BitwiseBinary::eval_gpu");
  auto& stream = out.primitive().stream();
  switch (op_) {
    case BitwiseBinary::And:
      return binary_op_gpu<cu::BitwiseAnd>(inputs, out, name(), stream);
    case BitwiseBinary::Or:
      return binary_op_gpu<cu::BitwiseOr>(inputs, out, name(), stream);
    case BitwiseBinary::Xor:
      return binary_op_gpu<cu::BitwiseXor>(inputs, out, name(), stream);
    case BitwiseBinary::LeftShift:
      return binary_op_gpu<cu::LeftShift>(inputs, out, name(), stream);
    case BitwiseBinary::RightShift:
      return binary_op_gpu<cu::RightShift>(inputs, out, name(), stream);
  }
}
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/divide.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Divide::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::Divide functor.
BINARY_GPU(Divide)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/equal.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// GPU evaluation of Equal. Uses the cu::NaNEqual functor when `equal_nan_`
// is set and the plain cu::Equal functor otherwise.
void Equal::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range range("Equal::eval_gpu");
  auto& stream = out.primitive().stream();
  if (!equal_nan_) {
    binary_op_gpu<cu::Equal>(inputs, out, name(), stream);
    return;
  }
  binary_op_gpu<cu::NaNEqual>(inputs, out, name(), stream);
}
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/greater.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Greater::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::Greater functor.
BINARY_GPU(Greater)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/greater_equal.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines GreaterEqual::eval_gpu through the BINARY_GPU macro (expected to
// come from the included binary.cuh), using the cu::GreaterEqual functor.
BINARY_GPU(GreaterEqual)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/less.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Less::eval_gpu through the BINARY_GPU macro (expected to come from
// the included binary.cuh), using the cu::Less functor.
BINARY_GPU(Less)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/less_equal.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines LessEqual::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::LessEqual functor.
BINARY_GPU(LessEqual)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/log_add_exp.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines LogAddExp::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::LogAddExp functor.
BINARY_GPU(LogAddExp)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/logical_and.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines LogicalAnd::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::LogicalAnd functor.
BINARY_GPU(LogicalAnd)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/logical_or.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines LogicalOr::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::LogicalOr functor.
BINARY_GPU(LogicalOr)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/maximum.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Maximum::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::Maximum functor.
BINARY_GPU(Maximum)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/minimum.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Minimum::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::Minimum functor.
BINARY_GPU(Minimum)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/multiply.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Multiply::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::Multiply functor.
BINARY_GPU(Multiply)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/not_equal.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines NotEqual::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::NotEqual functor.
BINARY_GPU(NotEqual)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/power.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Power::eval_gpu through the BINARY_GPU macro (expected to come from
// the included binary.cuh), using the cu::Power functor.
BINARY_GPU(Power)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/remainder.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Remainder::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::Remainder functor.
BINARY_GPU(Remainder)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary/subtract.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/binary/binary.cuh"

namespace mlx::core {
// Defines Subtract::eval_gpu through the BINARY_GPU macro (expected to come
// from the included binary.cuh), using the cu::Subtract functor.
BINARY_GPU(Subtract)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/binary_two.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/binary.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/binary_ops.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Two-output binary kernel where both inputs are single values: every
// position i in [0, size) receives out_a[i] = Op(a[0], b[0])[0] and
// out_b[i] = Op(a[0], b[0])[1]. Each thread covers the N_READS consecutive
// positions starting at thread_rank * N_READS.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void
binary_two_ss(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
  IdxT tid = cg::this_grid().thread_rank();

  if ((tid + 1) * N_READS <= size) {
    // Full chunk: fill two vectors and write each with one vector store.
    AlignedVector<Out, N_READS> first;
    AlignedVector<Out, N_READS> second;
#pragma unroll
    for (int k = 0; k < N_READS; ++k) {
      auto res = Op{}(a[0], b[0]);
      first[k] = res[0];
      second[k] = res[1];
    }

    store_vector<N_READS>(out_a, tid, first);
    store_vector<N_READS>(out_b, tid, second);
  } else {
    // Chunk crosses (or starts past) the end: write element by element.
    for (IdxT pos = tid * N_READS; pos < size; ++pos) {
      auto res = Op{}(a[0], b[0]);
      out_a[pos] = res[0];
      out_b[pos] = res[1];
    }
  }
}

// Two-output binary kernel with a single-value left operand and a dense
// right operand: for i in [0, size), (out_a[i], out_b[i]) are the two
// components of Op(a[0], b[i]). Each thread covers the N_READS consecutive
// positions starting at thread_rank * N_READS.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void
binary_two_sv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
  IdxT tid = cg::this_grid().thread_rank();

  if ((tid + 1) * N_READS <= size) {
    // Full chunk: vector load of b, two vector stores.
    auto rhs = load_vector<N_READS>(b, tid);

    AlignedVector<Out, N_READS> first;
    AlignedVector<Out, N_READS> second;
#pragma unroll
    for (int k = 0; k < N_READS; ++k) {
      auto res = Op{}(a[0], rhs[k]);
      first[k] = res[0];
      second[k] = res[1];
    }

    store_vector<N_READS>(out_a, tid, first);
    store_vector<N_READS>(out_b, tid, second);
  } else {
    // Chunk crosses (or starts past) the end: go element by element.
    for (IdxT pos = tid * N_READS; pos < size; ++pos) {
      auto res = Op{}(a[0], b[pos]);
      out_a[pos] = res[0];
      out_b[pos] = res[1];
    }
  }
}

// Two-output binary kernel with a dense left operand and a single-value
// right operand: for i in [0, size), (out_a[i], out_b[i]) are the two
// components of Op(a[i], b[0]). Each thread covers the N_READS consecutive
// positions starting at thread_rank * N_READS.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void
binary_two_vs(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
  IdxT tid = cg::this_grid().thread_rank();

  if ((tid + 1) * N_READS <= size) {
    // Full chunk: vector load of a, two vector stores.
    auto lhs = load_vector<N_READS>(a, tid);

    AlignedVector<Out, N_READS> first;
    AlignedVector<Out, N_READS> second;
#pragma unroll
    for (int k = 0; k < N_READS; ++k) {
      auto res = Op{}(lhs[k], b[0]);
      first[k] = res[0];
      second[k] = res[1];
    }

    store_vector<N_READS>(out_a, tid, first);
    store_vector<N_READS>(out_b, tid, second);
  } else {
    // Chunk crosses (or starts past) the end: go element by element.
    for (IdxT pos = tid * N_READS; pos < size; ++pos) {
      auto res = Op{}(a[pos], b[0]);
      out_a[pos] = res[0];
      out_b[pos] = res[1];
    }
  }
}

// Two-output binary kernel for two dense inputs: for i in [0, size),
// (out_a[i], out_b[i]) are the two components of Op(a[i], b[i]). Each thread
// covers the N_READS consecutive positions starting at
// thread_rank * N_READS.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void
binary_two_vv(const In* a, const In* b, Out* out_a, Out* out_b, IdxT size) {
  IdxT tid = cg::this_grid().thread_rank();

  if ((tid + 1) * N_READS <= size) {
    // Full chunk: vector loads of both inputs, two vector stores.
    auto lhs = load_vector<N_READS>(a, tid);
    auto rhs = load_vector<N_READS>(b, tid);

    AlignedVector<Out, N_READS> first;
    AlignedVector<Out, N_READS> second;
#pragma unroll
    for (int k = 0; k < N_READS; ++k) {
      auto res = Op{}(lhs[k], rhs[k]);
      first[k] = res[0];
      second[k] = res[1];
    }

    store_vector<N_READS>(out_a, tid, first);
    store_vector<N_READS>(out_b, tid, second);
  } else {
    // Chunk crosses (or starts past) the end: go element by element.
    for (IdxT pos = tid * N_READS; pos < size; ++pos) {
      auto res = Op{}(a[pos], b[pos]);
      out_a[pos] = res[0];
      out_b[pos] = res[1];
    }
  }
}

// Strided two-output binary kernel with a compile-time dimension count.
// The y axis of the launch indexes the outputs with their last dimension
// removed (size_rest entries); the x axis indexes N_READS-wide chunks along
// the last dimension. Component 0 of Op's result goes to out_a, component 1
// to out_b.
template <
    typename Op,
    typename In,
    typename Out,
    typename IdxT,
    int NDIM,
    int N_READS>
__global__ void binary_two_g_nd(
    const In* a,
    const In* b,
    Out* out_a,
    Out* out_b,
    IdxT size_rest,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides) {
  auto tb = cg::this_thread_block();
  auto launch = cg::this_grid();
  IdxT row = launch.block_index().y * tb.dim_threads().y + tb.thread_index().y;
  if (row >= size_rest) {
    return;
  }
  IdxT col = launch.block_index().x * tb.dim_threads().x + tb.thread_index().x;

  // Extent and per-input strides of the innermost dimension.
  auto inner_extent = shape[NDIM - 1];
  auto a_inner_stride = a_strides[NDIM - 1];
  auto b_inner_stride = b_strides[NDIM - 1];

  // Offsets of the first element of this row in each input.
  auto [a_base, b_base] = elem_to_loc_nd<NDIM>(
      row * inner_extent, shape.data(), a_strides.data(), b_strides.data());
  // In(0) is handed to load_vector as the fallback value (presumably for
  // lanes past the end of the row).
  auto lhs = load_vector<N_READS>(
      a + a_base, col, inner_extent, a_inner_stride, In(0));
  auto rhs = load_vector<N_READS>(
      b + b_base, col, inner_extent, b_inner_stride, In(0));

  AlignedVector<Out, N_READS> first;
  AlignedVector<Out, N_READS> second;
#pragma unroll
  for (int k = 0; k < N_READS; ++k) {
    auto res = Op{}(lhs[k], rhs[k]);
    first[k] = res[0];
    second[k] = res[1];
  }
  // Both outputs are written row by row with inner_extent elements per row.
  store_vector(out_a + inner_extent * row, col, first, inner_extent);
  store_vector(out_b + inner_extent * row, col, second, inner_extent);
}

// Strided two-output binary kernel for a runtime dimension count `ndim`.
// The y axis of the launch indexes the outputs with their last dimension
// removed (size_rest entries); the x axis indexes N_READS-wide chunks along
// the last dimension. Component 0 of Op's result goes to out_a, component 1
// to out_b.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void binary_two_g(
    const In* a,
    const In* b,
    Out* out_a,
    Out* out_b,
    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides a_strides,
    const __grid_constant__ Strides b_strides,
    int ndim) {
  auto tb = cg::this_thread_block();
  auto launch = cg::this_grid();
  IdxT row = launch.block_index().y * tb.dim_threads().y + tb.thread_index().y;
  if (row >= size_rest) {
    return;
  }
  IdxT col = launch.block_index().x * tb.dim_threads().x + tb.thread_index().x;

  // Extent and per-input strides of the innermost dimension.
  auto inner_extent = shape[ndim - 1];
  auto a_inner_stride = a_strides[ndim - 1];
  auto b_inner_stride = b_strides[ndim - 1];

  // Offsets of the first element of this row in each input.
  auto [a_base, b_base] = elem_to_loc(
      row * inner_extent,
      shape.data(),
      a_strides.data(),
      b_strides.data(),
      ndim);
  // In(0) is handed to load_vector as the fallback value (presumably for
  // lanes past the end of the row).
  auto lhs = load_vector<N_READS>(
      a + a_base, col, inner_extent, a_inner_stride, In(0));
  auto rhs = load_vector<N_READS>(
      b + b_base, col, inner_extent, b_inner_stride, In(0));

  AlignedVector<Out, N_READS> first;
  AlignedVector<Out, N_READS> second;
#pragma unroll
  for (int k = 0; k < N_READS; ++k) {
    auto res = Op{}(lhs[k], rhs[k]);
    first[k] = res[0];
    second[k] = res[1];
  }
  // Both outputs are written row by row with inner_extent elements per row.
  store_vector(out_a + inner_extent * row, col, first, inner_extent);
  store_vector(out_b + inner_extent * row, col, second, inner_extent);
}

template <typename Op, typename In, typename Out>
constexpr bool supports_binary_two_op() {
  if (std::is_same_v<Op, DivMod>) {
    return std::is_same_v<In, Out> &&
        (std::is_integral_v<Out> || is_floating_v<Out>);
  }
  return false;
}

} // namespace cu

// Encodes the kernel launch of a two-output binary op on stream `s`:
// outputs[0] and outputs[1] receive components 0 and 1 of
// Op(inputs[0], inputs[1]), element-wise.
//
// The data of both outputs is set up here through
// set_binary_op_output_data, with cu::malloc_async as the allocation
// callback. `op` is only used to build the error message.
//
// Throws std::runtime_error when cu::supports_binary_two_op rejects the
// (dtype of inputs[0], dtype of outputs[0]) pair for `Op`.
template <typename Op>
void binary_two_op_gpu_inplace(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const char* op,
    const Stream& s) {
  assert(inputs.size() > 1);
  const auto& a = inputs[0];
  const auto& b = inputs[1];
  auto& out_a = outputs[0];
  auto& out_b = outputs[1];
  auto bopt = get_binary_op_type(a, b);
  auto& encoder = cu::get_command_encoder(s);
  set_binary_op_output_data(
      a, b, out_a, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });
  set_binary_op_output_data(
      a, b, out_b, bopt, [&](auto n) { return cu::malloc_async(n, encoder); });

  // Nothing to launch for empty outputs.
  if (out_a.size() == 0) {
    return;
  }

  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out_a);
  encoder.set_output_array(out_b);
  // Map the runtime dtypes of `a` and `out_a` to C++ types. `b` is read with
  // the element type of `a` and `out_b` written with the element type of
  // `out_a` (see the gpu_ptr calls below).
  dispatch_all_types(a.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out_a.dtype(), [&](auto out_type_tag) {
      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
      if constexpr (cu::supports_binary_two_op<Op, CTYPE_IN, CTYPE_OUT>()) {
        using InType = cuda_type_t<CTYPE_IN>;
        using OutType = cuda_type_t<CTYPE_OUT>;

        // NOTE(review): shadows the outer `bopt`; it is computed from the
        // same `a` and `b`.
        auto bopt = get_binary_op_type(a, b);
        if (bopt == BinaryOpType::General) {
          // Strided path. 64-bit indices are used only when one of the
          // data_size() values exceeds INT32_MAX.
          dispatch_bool(
              a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
                  out_a.data_size() > INT32_MAX,
              [&](auto large) {
                using IdxT = std::conditional_t<large(), int64_t, int32_t>;
                Shape shape;
                std::vector<Strides> strides;
                std::tie(shape, strides) =
                    collapse_contiguous_dims(a, b, out_a);
                auto& a_strides = strides[0];
                auto& b_strides = strides[1];
                int ndim = shape.size();
                int work_per_thread = 1;
                // dim0 is the extent of the last (collapsed) dimension and
                // `rest` the number of rows of that length in the output.
                auto dim0 = ndim > 0 ? shape.back() : 1;
                auto rest = out_a.size() / dim0;
                // Each thread handles 4 elements of a row when the row has
                // at least 4 elements.
                if (dim0 >= 4) {
                  work_per_thread = 4;
                }
                // From here on dim0 is the number of threads needed per row.
                dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
                auto block_dims = get_block_dims(dim0, rest, 1);
                uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
                uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);

                if (ndim <= 3) {
                  // Small rank: use the kernel specialized on the number of
                  // dimensions.
                  dispatch_1_2_3(ndim, [&](auto dims_constant) {
                    auto kernel = cu::binary_two_g_nd<
                        Op,
                        InType,
                        OutType,
                        IdxT,
                        dims_constant(),
                        1>;
                    if (work_per_thread == 4) {
                      kernel = cu::binary_two_g_nd<
                          Op,
                          InType,
                          OutType,
                          IdxT,
                          dims_constant(),
                          4>;
                    }
                    encoder.add_kernel_node(
                        kernel,
                        {num_blocks_x, num_blocks_y},
                        block_dims,
                        gpu_ptr<InType>(a),
                        gpu_ptr<InType>(b),
                        gpu_ptr<OutType>(out_a),
                        gpu_ptr<OutType>(out_b),
                        rest,
                        const_param<dims_constant()>(shape),
                        const_param<dims_constant()>(a_strides),
                        const_param<dims_constant()>(b_strides));
                  });
                } else {
                  // Higher rank: kernel that takes the rank at runtime.
                  auto kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 1>;
                  if (work_per_thread == 4) {
                    kernel = cu::binary_two_g<Op, InType, OutType, IdxT, 4>;
                  }
                  encoder.add_kernel_node(
                      kernel,
                      {num_blocks_x, num_blocks_y},
                      block_dims,
                      gpu_ptr<InType>(a),
                      gpu_ptr<InType>(b),
                      gpu_ptr<OutType>(out_a),
                      gpu_ptr<OutType>(out_b),
                      rest,
                      const_param(shape),
                      const_param(a_strides),
                      const_param(b_strides),
                      ndim);
                }
              });
        } else {
          // Non-General layouts: one launch over out_a.data_size() elements.
          // 64-bit indices are used only when that count exceeds UINT32_MAX.
          dispatch_bool(out_a.data_size() > UINT32_MAX, [&](auto large) {
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            // Number of input elements that fit in 16 bytes.
            constexpr int N_READS = 16 / sizeof(InType);
            // binary_two_ss is the default; it is kept for any bopt not
            // matched below (presumably ScalarScalar).
            auto kernel = cu::binary_two_ss<Op, InType, OutType, IdxT, N_READS>;
            if (bopt == BinaryOpType::ScalarVector) {
              kernel = cu::binary_two_sv<Op, InType, OutType, IdxT, N_READS>;
            } else if (bopt == BinaryOpType::VectorScalar) {
              kernel = cu::binary_two_vs<Op, InType, OutType, IdxT, N_READS>;
            } else if (bopt == BinaryOpType::VectorVector) {
              kernel = cu::binary_two_vv<Op, InType, OutType, IdxT, N_READS>;
            }
            auto [num_blocks, block_dims] = get_launch_args(
                out_a.data_size(),
                out_a.shape(),
                out_a.strides(),
                large(),
                N_READS);
            encoder.add_kernel_node(
                kernel,
                num_blocks,
                block_dims,
                gpu_ptr<InType>(a),
                gpu_ptr<InType>(b),
                gpu_ptr<OutType>(out_a),
                gpu_ptr<OutType>(out_b),
                out_a.data_size());
          });
        }
      } else {
        // Reached at runtime for dtype pairs the functor does not support.
        throw std::runtime_error(
            fmt::format(
                "Can not do binary op {} on inputs of {} with result of {}.",
                op,
                dtype_to_string(a.dtype()),
                dtype_to_string(out_a.dtype())));
      }
    });
  });
}

// Entry point for a two-output binary op on the GPU: outputs[0] and
// outputs[1] receive the two components of Op(inputs[0], inputs[1]).
//
// binary_two_op_gpu_inplace already sets up the data of both outputs (via
// set_binary_op_output_data with cu::malloc_async), so nothing is allocated
// here. Calling set_binary_op_output_data in this function as well would
// only create a first pair of buffers that is replaced immediately.
//
// `op` is the operation name used in error messages and `s` the stream the
// work is encoded on. Throws std::runtime_error (from the in-place routine)
// for unsupported dtype combinations.
template <typename Op>
void binary_two_op_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const char* op,
    const Stream& s) {
  binary_two_op_gpu_inplace<Op>(inputs, outputs, op, s);
}

// GPU evaluation of DivMod: runs the two-output binary path with the
// cu::DivMod functor on the stream of the primitive attached to the first
// output.
void DivMod::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range range("DivMod::eval_gpu");
  auto& stream = outputs[0].primitive().stream();
  binary_two_op_gpu<cu::DivMod>(inputs, outputs, name(), stream);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/compiled.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/jit_module.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/graph_utils.h"
#include "mlx/primitives.h"

#include <fmt/format.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

struct FusedKernelBuilder {
  std::string os;
  const std::string& kernel_name;
  const std::vector<array>& inputs;
  const std::vector<array>& outputs;
  const std::vector<array>& tape;
  const std::function<bool(size_t)>& is_constant;

  void build(const char* name, bool contiguous) {
    NodeNamer namer;

    // Function parameters.
    std::vector<std::string> params;
    for (size_t i = 0; i < inputs.size(); ++i) {
      if (is_constant(i)) {
        continue;
      }
      const auto& x = inputs[i];
      const std::string& xname = namer.get_name(x);
      params.push_back(
          fmt::format("const {}* {}", dtype_to_cuda_type(x.dtype()), xname));
      if (!is_scalar(x) && !contiguous) {
        params.push_back(
            fmt::format(
                "const __grid_constant__ cuda::std::array<int64_t, NDIM> {}_strides",
                xname));
      }
    }
    for (const auto& x : outputs) {
      params.push_back(
          fmt::format(
              "{}* {}", dtype_to_cuda_type(x.dtype()), namer.get_name(x)));
    }
    if (!contiguous) {
      params.push_back(
          "const __grid_constant__ cuda::std::array<int32_t, NDIM> shape");
    }
    params.push_back("IdxT size");

    // Build function signature.
    if (contiguous) {
      os += "template <typename IdxT = uint32_t, int work_per_thread = 1>\n";
    } else {
      os +=
          "template <int NDIM, typename IdxT = uint32_t, int work_per_thread = 1>\n";
    }
    os += fmt::format("__global__ void {}(\n", kernel_name + name);
    for (size_t i = 0; i < params.size(); ++i) {
      os += "    ";
      os += params[i];
      if (i != params.size() - 1) {
        os += ",\n";
      }
    }
    os += ") {\n";

    // Index. For non contiguous kernels we create a separate index
    // variable per variable otherwise everyone uses `index`.
    os +=
        "  IdxT index = cg::this_grid().thread_rank() * work_per_thread;\n"
        "  if (index >= size) {\n"
        "    return;\n"
        "  }\n";
    if (!contiguous) {
      for (size_t i = 0; i < inputs.size(); ++i) {
        const auto& x = inputs[i];
        const std::string& xname = namer.get_name(x);
        if (is_scalar(x) || is_constant(i)) {
          continue;
        }
        os += "  IdxT " + xname + "_idx = 0;\n";
      }
      os += "  {\n";
      os += "    IdxT loc = index;\n";
      os +=
          "    #pragma unroll\n"
          "    for (int i = NDIM - 1; i >= 0; i--) {\n";
      for (size_t i = 0; i < inputs.size(); ++i) {
        const auto& x = inputs[i];
        const std::string& xname = namer.get_name(x);
        if (is_scalar(x) || is_constant(i)) {
          continue;
        }
        os += "      " + xname + "_idx += (loc \% shape[i]) * IdxT(" + xname +
            "_strides[i]);\n";
      }
      os +=
          "      loc /= shape[i];\n"
          "    }\n"
          "  }\n";
    }

    // Vectorized read loop
    if (contiguous) {
      for (size_t i = 0; i < inputs.size(); ++i) {
        const auto& x = inputs[i];
        if (is_scalar(x) || is_constant(i)) {
          continue;
        }
        const std::string& xname = namer.get_name(x);
        std::string type = dtype_to_cuda_type(x.dtype());
        os += fmt::format(
            "  auto vec_{0} = load_vector<work_per_thread, {1}>({0} + index, 0, size - index, 0);\n",
            xname,
            type);
      }
    }

    // Create some space for the outputs
    for (const auto& x : outputs) {
      const std::string& xname = namer.get_name(x);
      std::string type = dtype_to_cuda_type(x.dtype());
      os += fmt::format(
          "  AlignedVector<{}, work_per_thread> vec_{};\n", type, xname);
    }

    // Work loop
    if (!contiguous) {
      os +=
          "\n"
          "  for (int i = 0; i < work_per_thread && index < size; i++) {\n";
    } else {
      os +=
          "\n"
          "  #pragma unroll\n"
          "  for (int i = 0; i < work_per_thread; i++) {\n";
    }

    // Read inputs.
    for (size_t i = 0; i < inputs.size(); ++i) {
      const auto& x = inputs[i];
      const std::string& xname = namer.get_name(x);
      std::string type = dtype_to_cuda_type(x.dtype());
      std::string value;
      if (is_constant(i)) {
        std::ostringstream ss;
        print_constant(ss, x);
        value = fmt::format("static_cast<{}>({})", type, ss.str());
      } else if (is_scalar(x)) {
        value = fmt::format("{}[0]", xname);
      } else if (contiguous) {
        value = fmt::format("vec_{}[i]", xname);
      } else {
        value = fmt::format("{}[{}_idx]", xname, xname);
      }
      os += fmt::format("    {} tmp_{} = {};\n", type, xname, value);
    }

    // Write tape.
    for (const auto& x : tape) {
      const std::string& xname = namer.get_name(x);
      std::string type = dtype_to_cuda_type(x.dtype());
      std::string value;
      if (is_static_cast(x.primitive())) {
        value = fmt::format(
            "static_cast<{}>(tmp_{})", type, namer.get_name(x.inputs()[0]));
      } else {
        value = x.primitive().name();
        value += "{}(";
        for (size_t i = 0; i < x.inputs().size() - 1; ++i) {
          value += fmt::format("tmp_{}, ", namer.get_name(x.inputs()[i]));
        }
        value += fmt::format("tmp_{})", namer.get_name(x.inputs().back()));
      }
      os += fmt::format("    {} tmp_{} = {};\n", type, xname, value);
    }

    // Write output.
    for (const auto& x : outputs) {
      os += fmt::format("    vec_{0}[i] = tmp_{0};\n", namer.get_name(x));
    }

    // End of work loop
    if (!contiguous) {
      os += "\n";
      for (size_t i = 0; i < inputs.size(); ++i) {
        const auto& x = inputs[i];
        const std::string& xname = namer.get_name(x);
        if (is_scalar(x) || is_constant(i)) {
          continue;
        }
        os += fmt::format("    {0}_idx += {0}_strides[NDIM - 1];\n", xname);
      }
    }
    os += "  }\n";

    // Store the output to global memory
    for (const auto& x : outputs) {
      os += fmt::format(
          "  store_vector({0} + index, 0, vec_{0}, size - index);\n",
          namer.get_name(x));
    }

    os += "}\n";
  }
};

} // namespace cu

// Preamble that starts the source of every JIT-compiled fused kernel module:
// the device-side op headers plus cooperative groups, which the generated
// kernels use through the `cg` alias.
// NOTE(review): the `inf` macro presumably lets constants that are printed as
// `inf` compile inside the generated source; confirm against print_constant.
constexpr const char* g_jit_includes = R"(
#include "mlx/backend/cuda/device/binary_ops.cuh"
#include "mlx/backend/cuda/device/ternary_ops.cuh"
#include "mlx/backend/cuda/device/unary_ops.cuh"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>

#define inf cuda::std::numeric_limits<float>::infinity()
)";

// Runs a compiled (fused) elementwise graph on the GPU.
//
// Steps:
//   1. Get the JIT module for lib_name(); the builder lambda generates the
//      source of the contiguous and strided kernels and lists every template
//      instantiation that may be launched (presumably the module is cached by
//      get_jit_module so the lambda runs only on a miss; confirm there).
//   2. Collapse dimensions to decide between the contiguous and the strided
//      kernel, and pick the index width.
//   3. Pack the kernel arguments in the order of the generated signature:
//      non-constant inputs (each followed by its strides in the strided
//      case), outputs, shape (strided only), then the element count.
//   4. Launch the selected instantiation through the command encoder.
void Compiled::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("Compiled::eval_gpu");
  auto& s = stream();

  // Determine the work per thread for the vectorized reads/writes. We take it
  // as 16 over the max itemsize for the outputs. Another heuristic could be
  // over the max itemsize of all arrays.
  int max_size = 1;
  for (const auto& x : outputs) {
    max_size = (max_size > x.itemsize()) ? max_size : x.itemsize();
  }
  int work_per_thread = 16 / max_size;

  cu::JitModule& mod = cu::get_jit_module(s.device, lib_name(), [&]() {
    // Build source code.
    cu::FusedKernelBuilder builder{
        g_jit_includes, lib_name(), inputs_, outputs_, tape_, is_constant_};
    builder.os +=
        "namespace mlx::core::cu {\n\n"
        "namespace cg = cooperative_groups;\n\n";
    builder.build("_contiguous", true);
    builder.os += "\n";
    builder.build("_strided", false);
    builder.os += "\n} // namespace mlx::core::cu\n";
    // Build kernel names.
    std::vector<std::string> kernel_names;
    kernel_names.push_back(
        fmt::format(
            "mlx::core::cu::{}_contiguous<uint32_t, {}>",
            lib_name(),
            work_per_thread));
    kernel_names.push_back(
        fmt::format(
            "mlx::core::cu::{}_contiguous<int64_t, {}>",
            lib_name(),
            work_per_thread));
    // Strided kernels are instantiated for both work-per-thread values
    // because the launch below may fall back to 1.
    for (int wpt : {1, work_per_thread}) {
      for (int i = 1; i <= MAX_NDIM; ++i) {
        kernel_names.push_back(
            fmt::format(
                "mlx::core::cu::{}_strided<{}, uint32_t, {}>",
                lib_name(),
                i,
                wpt));
        kernel_names.push_back(
            fmt::format(
                "mlx::core::cu::{}_strided<{}, int64_t, {}>",
                lib_name(),
                i,
                wpt));
      }
    }

    return std::make_tuple(
        false, std::move(builder.os), std::move(kernel_names));
  });

  // Collapse contiguous dims to route to a faster kernel if possible. Also
  // handle all broadcasting.
  auto [contiguous, shape, strides_vec] =
      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);

  // Whether to use large index.
  bool large = compiled_use_large_index(inputs, outputs, contiguous);

  cu::KernelArgs args;
  // Put inputs.
  // NOTE(review): input strides are read starting at index 1; presumably
  // strides_vec[0] belongs to the output. Confirm against
  // compiled_collapse_contiguous_dims.
  int strides_index = 1;
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (is_constant_(i)) {
      continue;
    }
    const auto& x = inputs[i];
    args.append(x);
    if (!contiguous && !is_scalar(x)) {
      args.append_ptr(strides_vec[strides_index++].data());
    }
  }

  auto& encoder = cu::get_command_encoder(s);

  // Put outputs.
  compiled_allocate_outputs(
      inputs, outputs, is_constant_, contiguous, [&](auto n) {
        return cu::malloc_async(n, encoder);
      });
  for (auto& x : outputs) {
    args.append(x);
  }

  // Put shape and size.
  if (!contiguous) {
    args.append_ptr(shape.data());
  }
  // The width of the size argument must match the kernel's IdxT.
  if (large) {
    args.append<int64_t>(outputs[0].data_size());
  } else {
    args.append<uint32_t>(outputs[0].data_size());
  }

  // Choose work per thread
  // The strided kernel advances along the last axis inside its work loop, so
  // the innermost dimension must be a multiple of the work per thread.
  if (!contiguous && shape.back() % work_per_thread != 0) {
    work_per_thread = 1;
  }

  // Launch kernel.
  const char* index_type = large ? "int64_t" : "uint32_t";
  std::string kernel_name = fmt::format("mlx::core::cu::{}", lib_name());
  if (contiguous) {
    kernel_name +=
        fmt::format("_contiguous<{}, {}>", index_type, work_per_thread);
  } else {
    kernel_name += fmt::format(
        "_strided<{}, {}, {}>", shape.size(), index_type, work_per_thread);
  }
  for (const auto& in : inputs) {
    encoder.set_input_array(in);
  }
  for (const auto& out : outputs) {
    encoder.set_output_array(out);
  }

  auto [kernel, max_block_dims] = mod.get_kernel_and_dims(kernel_name);
  auto [num_blocks, block_dims] =
      get_launch_args(outputs[0], large, work_per_thread, max_block_dims);
  encoder.add_kernel_node_raw(
      kernel, num_blocks, block_dims, {}, 0, args.args());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/conv/conv.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/gpu/copy.h"

namespace mlx::core {

// Plain-data description of an NDIM-dimensional convolution, built on the
// host and passed by value to the unfold kernels.
//
// The constructor reads the batch size from axis 0 and the channel count from
// the last axis of |in|, and takes the NDIM spatial extents of |in|, |wt| and
// |out| from axes 1..NDIM.
template <int NDIM>
struct ConvParams {
  int N; // Batch size
  int C; // In channels
  int O; // Out channels
  int strides[NDIM];
  int padding[NDIM];
  int kernel_dilation[NDIM];
  int input_dilation[NDIM];
  int groups;
  bool flip;
  int in_spatial_dims[NDIM];
  int wt_spatial_dims[NDIM];
  int out_spatial_dims[NDIM];
  int64_t in_strides[NDIM + 2]; // Element strides of every axis of |in|

  ConvParams(
      const array& in,
      const array& wt,
      const array& out,
      const std::vector<int>& strides,
      const std::vector<int>& padding,
      const std::vector<int>& kernel_dilation,
      const std::vector<int>& input_dilation,
      int groups,
      bool flip)
      : N(in.shape(0)),
        C(in.shape(-1)),
        O(wt.shape(0)),
        groups(groups),
        flip(flip) {
    // Copies |count| elements of |src|, starting at |offset|, into |dst|.
    auto fill = [](const auto& src, int offset, auto* dst, int count) {
      std::copy_n(src.begin() + offset, count, dst);
    };

    // Per-dimension convolution settings.
    fill(strides, 0, this->strides, NDIM);
    fill(padding, 0, this->padding, NDIM);
    fill(kernel_dilation, 0, this->kernel_dilation, NDIM);
    fill(input_dilation, 0, this->input_dilation, NDIM);

    // Spatial extents: skip the leading axis of each shape.
    fill(in.shape(), 1, this->in_spatial_dims, NDIM);
    fill(wt.shape(), 1, this->wt_spatial_dims, NDIM);
    fill(out.shape(), 1, this->out_spatial_dims, NDIM);

    // All NDIM + 2 input strides.
    fill(in.strides(), 0, this->in_strides, NDIM + 2);
  }
};

// Grouped convolution computed as an unfold of |in| followed by a GEMM
// batched over |groups|. Supports 1D, 2D and 3D convolutions and throws
// std::runtime_error for any other rank. The inline gemm_conv wrapper in this
// header makes |in| and |wt| row contiguous before calling this.
void gemm_grouped_conv(
    cu::CommandEncoder& encoder,
    const array& in,
    const array& wt,
    array& out,
    const std::vector<int>& strides,
    const std::vector<int>& padding,
    const std::vector<int>& kernel_dilation,
    const std::vector<int>& input_dilation,
    int groups,
    bool flip,
    Stream s);

// Ungrouped (groups == 1) convolution computed as an unfold of |in| followed
// by a single GEMM. Supports 1D, 2D and 3D convolutions and throws
// std::runtime_error for any other rank. The inline gemm_conv wrapper in this
// header makes |in| and |wt| row contiguous before calling this.
void gemm_conv(
    cu::CommandEncoder& encoder,
    const array& in,
    const array& wt,
    array& out,
    const std::vector<int>& strides,
    const std::vector<int>& padding,
    const std::vector<int>& kernel_dilation,
    const std::vector<int>& input_dilation,
    bool flip,
    Stream s);

// GEMM-based convolution entry point. Makes |in| and |wt| row contiguous
// (copying when needed), then dispatches to the grouped or ungrouped
// implementation depending on |groups|.
inline void gemm_conv(
    cu::CommandEncoder& encoder,
    array in,
    array wt,
    array& out,
    const std::vector<int>& strides,
    const std::vector<int>& padding,
    const std::vector<int>& kernel_dilation,
    const std::vector<int>& input_dilation,
    int groups,
    bool flip,
    Stream s) {
  // Replaces |x| with a row-contiguous copy when it is not already one; the
  // copy is registered as a temporary of the encoder.
  auto make_row_contiguous = [&](array& x) {
    if (x.flags().row_contiguous) {
      return;
    }
    x = contiguous_copy_gpu(x, s);
    encoder.add_temporary(x);
  };
  make_row_contiguous(in);
  make_row_contiguous(wt);

  if (groups != 1) {
    gemm_grouped_conv(
        encoder,
        in,
        wt,
        out,
        strides,
        padding,
        kernel_dilation,
        input_dilation,
        groups,
        flip,
        s);
    return;
  }

  gemm_conv(
      encoder,
      in,
      wt,
      out,
      strides,
      padding,
      kernel_dilation,
      input_dilation,
      flip,
      s);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/conv/gemm_conv.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/conv/conv.h"
#include "mlx/backend/cuda/gemms/cublas_gemm.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Unfolds (im2col) the input into a row-major matrix with `filter_size`
// columns, one row per (batch, output pixel) pair.
//
// Launch layout, as read by the indexing below:
//   blockIdx.z                            -> output row
//   blockIdx.y                            -> input channel
//   blockIdx.x * blockDim.x + threadIdx.x -> flattened kernel position
//
// Each thread writes exactly one element: the input value that the kernel
// position covers for that output pixel, or zero when the position falls in
// the padding or between input-dilation steps.
//
// in:          input data, addressed through params.in_strides.
// out:         unfolded matrix; columns are ordered (kernel position, channel).
// filter_size: number of columns, C * prod(wt_spatial_dims).
// out_pixels:  prod(out_spatial_dims).
template <typename T, int NDIM>
__global__ void naive_unfold_nd(
    const T* in,
    T* out,
    int filter_size,
    int out_pixels,
    const __grid_constant__ ConvParams<NDIM> params) {
  auto block = cg::this_thread_block();
  auto tid = block.group_index();
  auto lid = block.thread_index();

  int index_batch = tid.z / out_pixels; // [0, N)
  int index_out_spatial = tid.z % out_pixels; // [0, H_out * W_out)
  int index_wt_spatial =
      tid.x * block.dim_threads().x + lid.x; // [0, H_wt * W_wt)

  // Threads past the last kernel position have nothing to write.
  if (index_wt_spatial >= filter_size / params.C) {
    return;
  }

  in += tid.y; // [0, C)
  // The row offset is computed in 64 bits: the unfolded matrix can hold more
  // elements than a 32-bit offset can address.
  out += static_cast<int64_t>(tid.z) * filter_size +
      index_wt_spatial * params.C + tid.y;

  bool valid = index_batch < params.N;

  // Get the coordinates in input.
  int index_in[NDIM] = {};
#pragma unroll
  for (int i = NDIM - 1; i >= 0; --i) {
    int index_out = index_out_spatial % params.out_spatial_dims[i];
    int index_wt = index_wt_spatial % params.wt_spatial_dims[i];

    if (params.flip) {
      index_wt = params.wt_spatial_dims[i] - index_wt - 1;
    }

    // Position in the padded, input-dilated coordinate system.
    int index = index_out * params.strides[i] - params.padding[i] +
        index_wt * params.kernel_dilation[i];
    int index_max =
        1 + params.input_dilation[i] * (params.in_spatial_dims[i] - 1);

    // Only positions that land exactly on an input element are valid.
    valid &= (index >= 0) && (index < index_max) &&
        (index % params.input_dilation[i] == 0);

    index_in[i] = index / params.input_dilation[i];

    index_out_spatial /= params.out_spatial_dims[i];
    index_wt_spatial /= params.wt_spatial_dims[i];
  }

  if (valid) {
    // in_strides are 64-bit, so keep the accumulated offset 64-bit as well
    // instead of truncating it to int.
    int64_t in_offset = index_batch * params.in_strides[0];
#pragma unroll
    for (int i = 0; i < NDIM; ++i) {
      in_offset += index_in[i] * params.in_strides[i + 1];
    }
    *out = in[in_offset];
  } else {
    *out = T{0};
  }
}

} // namespace cu

// Allocates the (mat_M, mat_K) unfolded input matrix as an encoder temporary
// and enqueues the kernel that fills it from |in|.
//
// The grid uses x for the kernel positions, y for the input channels and z
// for the mat_M output rows.
// NOTE(review): CUDA limits the y and z grid dimensions to 65535; confirm
// that callers keep params.C and mat_M within that range.
template <int NDIM>
array unfold_inputs_nd(
    cu::CommandEncoder& encoder,
    const array& in,
    int mat_M,
    int mat_K,
    int mat_N,
    ConvParams<NDIM>& params) {
  array unfolded({mat_M, mat_K}, in.dtype(), nullptr, {});
  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
  encoder.add_temporary(unfolded);

  // Columns per unfolded row and output pixels per batch element.
  int filter_size = params.C;
  int out_pixels = 1;
  for (int d = 0; d < NDIM; ++d) {
    filter_size *= params.wt_spatial_dims[d];
    out_pixels *= params.out_spatial_dims[d];
  }

  // One thread per kernel position, with 32 to 1024 threads per block.
  const int wt_spatial_size = mat_K / params.C;
  dim3 block_dims;
  block_dims.x = std::min(std::max(wt_spatial_size, 32), 1024);
  dim3 num_blocks;
  num_blocks.z = mat_M;
  num_blocks.y = params.C;
  num_blocks.x = cuda::ceil_div(wt_spatial_size, block_dims.x);

  encoder.set_input_array(in);
  encoder.set_output_array(unfolded);
  dispatch_float_types(in.dtype(), "unfold", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    auto kernel = cu::naive_unfold_nd<DataType, NDIM>;
    encoder.add_kernel_node(
        kernel,
        num_blocks,
        block_dims,
        gpu_ptr<DataType>(in),
        gpu_ptr<DataType>(unfolded),
        filter_size,
        out_pixels,
        params);
  });

  return unfolded;
}

// Computes an ungrouped NDIM-dimensional convolution as one GEMM:
//   out (M x N) = unfolded input (M x K) * weight^T (K x N)
// with M = N * H_out * W_out, K = C * H_wt * W_wt and N = O.
template <int NDIM>
void gemm_conv_nd(
    cu::CommandEncoder& encoder,
    const array& in,
    const array& wt,
    array& out,
    ConvParams<NDIM>& params,
    Stream s) {
  // GEMM problem size.
  const int gemm_n = params.O; // O
  const int gemm_m = out.size() / params.O; // N * H_out * W_out
  const int gemm_k = wt.size() / params.O; // C * H_wt * W_wt

  // Left operand: the input unfolded to (M, K).
  array lhs = unfold_inputs_nd<NDIM>(encoder, in, gemm_m, gemm_k, gemm_n, params);

  // Right operand: a (K, N) column-contiguous view over the weight buffer,
  // no data is copied.
  array rhs({gemm_k, gemm_n}, wt.dtype(), nullptr, {});
  rhs.copy_shared_buffer(
      wt,
      {1, gemm_k},
      {false, false, /* col_contiguous */ true},
      wt.data_size());

  // The whole convolution is a single batch.
  Shape batch_shape{1};
  Strides lhs_batch_strides{0};
  Strides rhs_batch_strides{0};

  CublasGemm gemm(
      encoder.device(),
      in.dtype(),
      false, // a_transposed
      gemm_m, // a_rows
      gemm_k, // a_cols
      gemm_k, // lda
      true, // b_transposed
      gemm_k, // b_rows
      gemm_n, // b_cols
      gemm_k, // ldb
      batch_shape.back(),
      lhs_batch_strides.back(),
      rhs_batch_strides.back());
  gemm.run(
      encoder,
      out,
      lhs,
      rhs,
      batch_shape,
      lhs_batch_strides,
      rhs_batch_strides);
}

// Ungrouped GEMM convolution: validates the spatial rank (1 to 3), builds the
// ConvParams for that rank and runs the rank-specific implementation.
// Throws std::runtime_error for any other rank.
void gemm_conv(
    cu::CommandEncoder& encoder,
    const array& in,
    const array& wt,
    array& out,
    const std::vector<int>& strides,
    const std::vector<int>& padding,
    const std::vector<int>& kernel_dilation,
    const std::vector<int>& input_dilation,
    bool flip,
    Stream s) {
  // Everything except the batch and channel axes is spatial.
  const int conv_ndim = in.ndim() - 2;
  const bool supported = conv_ndim >= 1 && conv_ndim <= 3;
  if (!supported) {
    throw std::runtime_error(
        fmt::format("[conv] Unsupported gemm_conv for {}D conv.", conv_ndim));
  }
  dispatch_1_2_3(conv_ndim, [&](auto ndim_constant) {
    constexpr int kNdim = ndim_constant();
    ConvParams<kNdim> params(
        in,
        wt,
        out,
        strides,
        padding,
        kernel_dilation,
        input_dilation,
        1, // groups
        flip);
    gemm_conv_nd<kNdim>(encoder, in, wt, out, params, s);
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/conv/gemm_grouped_conv.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/conv/conv.h"
#include "mlx/backend/cuda/gemms/cublas_gemm.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Unfolds (im2col) the input for a grouped convolution into a row-major
// matrix with `filter_size` columns, one row per (batch, output pixel) pair.
// Unlike naive_unfold_nd, the columns of a row are ordered
// (channel, kernel position), so the channels of one group are adjacent.
//
// Launch layout, as read by the indexing below:
//   blockIdx.z                            -> output row
//   blockIdx.y                            -> input channel
//   blockIdx.x * blockDim.x + threadIdx.x -> flattened kernel position
//
// Each thread writes exactly one element: the input value that the kernel
// position covers for that output pixel, or zero when the position falls in
// the padding or between input-dilation steps.
//
// in:          input data, addressed through params.in_strides.
// out:         unfolded matrix.
// filter_size: number of columns, C * prod(wt_spatial_dims).
// out_pixels:  prod(out_spatial_dims).
template <typename T, int NDIM>
__global__ void naive_grouped_unfold_transpose_nd(
    const T* in,
    T* out,
    int filter_size,
    int out_pixels,
    const __grid_constant__ ConvParams<NDIM> params) {
  auto block = cg::this_thread_block();
  auto tid = block.group_index();
  auto lid = block.thread_index();

  int index_batch = tid.z / out_pixels; // [0, N)
  int index_out_spatial = tid.z % out_pixels; // [0, H_out * W_out)
  int index_wt_spatial =
      tid.x * block.dim_threads().x + lid.x; // [0, H_wt * W_wt)

  // Threads past the last kernel position have nothing to write.
  if (index_wt_spatial >= filter_size / params.C) {
    return;
  }

  in += tid.y; // [0, C)
  // The row offset is computed in 64 bits: the unfolded matrix can hold more
  // elements than a 32-bit offset can address.
  out += static_cast<int64_t>(tid.z) * filter_size +
      static_cast<int64_t>(tid.y) * (filter_size / params.C);

  bool valid = index_batch < params.N;

  // Get the coordinates in input.
  int index_in[NDIM] = {};
  int wt_stride = 1;
#pragma unroll
  for (int i = NDIM - 1; i >= 0; --i) {
    int index_out = index_out_spatial % params.out_spatial_dims[i];
    int index_wt = index_wt_spatial % params.wt_spatial_dims[i];
    // The output column uses the kernel position before any flip.
    out += index_wt * wt_stride;

    if (params.flip) {
      index_wt = params.wt_spatial_dims[i] - index_wt - 1;
    }

    // Position in the padded, input-dilated coordinate system.
    int index = index_out * params.strides[i] - params.padding[i] +
        index_wt * params.kernel_dilation[i];
    int index_max =
        1 + params.input_dilation[i] * (params.in_spatial_dims[i] - 1);

    // Only positions that land exactly on an input element are valid.
    valid &= (index >= 0) && (index < index_max) &&
        (index % params.input_dilation[i] == 0);

    index_in[i] = index / params.input_dilation[i];

    index_out_spatial /= params.out_spatial_dims[i];
    index_wt_spatial /= params.wt_spatial_dims[i];
    wt_stride *= params.wt_spatial_dims[i];
  }

  if (valid) {
    // in_strides are 64-bit, so keep the accumulated offset 64-bit as well
    // instead of truncating it to int.
    int64_t in_offset = index_batch * params.in_strides[0];
#pragma unroll
    for (int i = 0; i < NDIM; ++i) {
      in_offset += index_in[i] * params.in_strides[i + 1];
    }
    *out = in[in_offset];
  } else {
    *out = T{0};
  }
}

} // namespace cu

// Allocates the (mat_M, mat_K * groups) unfolded input matrix as an encoder
// temporary and enqueues the grouped unfold kernel that fills it from |in|.
//
// The grid uses x for the kernel positions, y for the input channels and z
// for the mat_M output rows.
// NOTE(review): CUDA limits the y and z grid dimensions to 65535; confirm
// that callers keep params.C and mat_M within that range.
template <int NDIM>
array grouped_unfold_transpose_inputs_nd(
    cu::CommandEncoder& encoder,
    const array& in,
    int mat_M,
    int mat_K,
    int mat_N,
    ConvParams<NDIM>& params) {
  array unfolded({mat_M, mat_K * params.groups}, in.dtype(), nullptr, {});
  unfolded.set_data(cu::malloc_async(unfolded.nbytes(), encoder));
  encoder.add_temporary(unfolded);

  // Columns per unfolded row and output pixels per batch element.
  int filter_size = params.C;
  int out_pixels = 1;
  for (int d = 0; d < NDIM; ++d) {
    filter_size *= params.wt_spatial_dims[d];
    out_pixels *= params.out_spatial_dims[d];
  }

  // One thread per kernel position, with 32 to 1024 threads per block.
  const int wt_spatial_size = (mat_K * params.groups) / params.C;
  dim3 block_dims;
  block_dims.x = std::min(std::max(wt_spatial_size, 32), 1024);
  dim3 num_blocks;
  num_blocks.z = mat_M;
  num_blocks.y = params.C;
  num_blocks.x = cuda::ceil_div(wt_spatial_size, block_dims.x);

  encoder.set_input_array(in);
  encoder.set_output_array(unfolded);
  dispatch_float_types(in.dtype(), "unfold", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    auto kernel = cu::naive_grouped_unfold_transpose_nd<DataType, NDIM>;
    encoder.add_kernel_node(
        kernel,
        num_blocks,
        block_dims,
        gpu_ptr<DataType>(in),
        gpu_ptr<DataType>(unfolded),
        filter_size,
        out_pixels,
        params);
  });

  return unfolded;
}

// Computes a grouped NDIM-dimensional convolution as one GEMM batched over
// the groups. For every group:
//   out_g (M x N) = unfolded input_g (M x K) * weight_g^T (K x N)
// with M = N * H_out * W_out, K = C_per_group * H_wt * W_wt and
// N = O_per_group. The groups are interleaved along the columns of both the
// unfolded input and the output, which is expressed through the leading
// dimensions and batch strides passed to the GEMM.
template <int NDIM>
void gemm_grouped_conv_nd(
    cu::CommandEncoder& encoder,
    const array& in,
    const array& wt,
    array& out,
    ConvParams<NDIM>& params,
    Stream s) {
  // Get gemm shapes.
  int C_per_group = params.C / params.groups;
  int O_per_group = params.O / params.groups;
  int mat_M = out.size() / params.O; // N * H_out * W_out
  int mat_K = wt.size() / params.O; // C_per_group * H_wt * W_wt
  int mat_N = O_per_group; // O_per_group

  // Unfold input to (N * H_out * W_out, C * H_wt * W_wt) for gemm.
  array in_unfolded = grouped_unfold_transpose_inputs_nd<NDIM>(
      encoder, in, mat_M, mat_K, mat_N, params);

  // Reshape weight to (O, C_per_group, H_wt * W_wt) for gemm.
  int wt_spatial_size = (wt.size() / wt.shape(0)) / wt.shape(-1);
  array wt_view(
      {params.O, C_per_group, wt_spatial_size}, wt.dtype(), nullptr, {});
  wt_view.copy_shared_buffer(
      wt, {wt.strides(0), 1, C_per_group}, wt.flags(), wt.size());
  array wt_reshaped = contiguous_copy_gpu(wt_view, s);
  // The copy is only referenced by this function, so register it as a
  // temporary of the encoder, as is done for the other contiguous copies made
  // for convolutions, to keep it alive until the enqueued work has used it.
  encoder.add_temporary(wt_reshaped);

  // Batch with size of groups.
  Shape batch_shape{params.groups};
  Strides a_batch_strides{mat_K};
  Strides b_batch_strides{mat_N * mat_K};

  // Run matmul.
  CublasGemm gemm(
      encoder.device(),
      in.dtype(),
      false, // a_transposed
      mat_M, // a_rows
      mat_K, // a_cols
      mat_K * params.groups, // lda
      true, // b_transposed
      mat_K, // b_rows
      mat_N, // b_cols
      mat_K, // ldb
      batch_shape.back(),
      a_batch_strides.back(),
      b_batch_strides.back());
  // Each group writes its own block of mat_N columns of the output.
  gemm.set_out(
      out.dtype(),
      false, // out_transposed
      mat_M, // out_rows
      mat_N, // out_cols
      mat_N * params.groups, // out_ld
      params.groups, // batch_count
      mat_N); // batch_stride
  gemm.run(
      encoder,
      out,
      in_unfolded,
      wt_reshaped,
      batch_shape,
      a_batch_strides,
      b_batch_strides);
}

// Grouped GEMM convolution: validates the spatial rank (1 to 3), builds the
// ConvParams for that rank and runs the rank-specific implementation.
// Throws std::runtime_error for any other rank.
void gemm_grouped_conv(
    cu::CommandEncoder& encoder,
    const array& in,
    const array& wt,
    array& out,
    const std::vector<int>& strides,
    const std::vector<int>& padding,
    const std::vector<int>& kernel_dilation,
    const std::vector<int>& input_dilation,
    int groups,
    bool flip,
    Stream s) {
  // Everything except the batch and channel axes is spatial.
  const int conv_ndim = in.ndim() - 2;
  const bool supported = conv_ndim >= 1 && conv_ndim <= 3;
  if (!supported) {
    throw std::runtime_error(
        fmt::format("[conv] Unsupported gemm_conv for {}D conv.", conv_ndim));
  }
  dispatch_1_2_3(conv_ndim, [&](auto ndim_constant) {
    constexpr int kNdim = ndim_constant();
    ConvParams<kNdim> params(
        in,
        wt,
        out,
        strides,
        padding,
        kernel_dilation,
        input_dilation,
        groups,
        flip);
    gemm_grouped_conv_nd<kNdim>(encoder, in, wt, out, params, s);
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/conv.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/conv/conv.h"
#include "mlx/backend/cuda/cudnn_utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/lru_cache.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/primitives.h"

#include <nvtx3/nvtx3.hpp>

#include <cassert>

namespace mlx::core {

namespace {

// Which implementation handles a convolution. The three cuDNN values select
// the operation that build_conv_graph adds to the graph.
enum ConvBackendType {
  // NOTE(review): presumably "no usable cuDNN graph, use the GEMM-based
  // path"; confirm at the use site.
  CONV_FALLBACK,
  CONV_FORWARD, // cuDNN forward convolution (conv_fprop)
  CONV_BACKWARD_INPUT, // cuDNN data gradient (conv_dgrad)
  CONV_BACKWARD_WEIGHT, // cuDNN weight gradient (conv_wgrad)
};

// Key of conv_cache(): the properties of a convolution call that determine
// which backend and cuDNN graph can be reused for it.
// NOTE(review): the key is stored in an LRUBytesKeyCache, which presumably
// hashes and compares the raw bytes of the struct. If so, instances must be
// fully zero-initialized (including padding bytes and unused array slots)
// before the fields are filled in. Confirm at the construction site.
struct ConvCacheKey {
  int device_id;
  fe::DataType_t cudnn_dtype;
  // Fixed-size arrays so that the key has no heap-allocated members.
  std::array<int, MAX_NDIM> input_shape;
  std::array<int, MAX_NDIM> weight_shape;
  std::array<int, MAX_NDIM> stride;
  std::array<int, MAX_NDIM> padding_lo;
  std::array<int, MAX_NDIM> padding_hi;
  std::array<int, MAX_NDIM> dilation;
  int groups;
  bool flip;
  uint8_t input_alignment;
  uint8_t weight_alignment;
  uint8_t output_alignment;
};

auto& conv_cache() {
  static LRUBytesKeyCache<
      ConvCacheKey,
      std::pair<ConvBackendType, std::optional<DnnGraph>>>
      cache("MLX_CUDA_CONV_CACHE_SIZE", /* default_capacity */ 128);
  return cache;
}

// Translates MLX convolution parameters into the
// (stride, padding_lo, padding_hi, dilation) tuple used to configure the conv
// op for the given backend type:
//   - backward input:  stride = input_dilation, dilation = kernel_dilation,
//                      with both paddings recomputed from the shapes;
//   - backward weight: stride = kernel_dilation, dilation = kernel_strides,
//                      with padding_hi set equal to padding_lo;
//   - otherwise:       stride = kernel_strides, dilation = kernel_dilation,
//                      with the paddings passed through.
auto get_conv_settings(
    ConvBackendType backend_type,
    array& x,
    array& w,
    array& y,
    const std::vector<int>& kernel_strides,
    const std::vector<int>& padding_lo_,
    const std::vector<int>& padding_hi_,
    const std::vector<int>& kernel_dilation,
    const std::vector<int>& input_dilation) {
  auto padding_lo = convert_vector<int64_t>(padding_lo_);
  auto padding_hi = convert_vector<int64_t>(padding_hi_);

  switch (backend_type) {
    case CONV_BACKWARD_INPUT: {
      const int num_dims = padding_lo.size();
      for (int d = 0; d < num_dims; ++d) {
        // Axis 0 is skipped; spatial axes start at 1.
        const int axis = 1 + d;
        // Dilated extents of the weight, of |y| and of |x| along this axis.
        const int wt_size = 1 + kernel_dilation[d] * (w.shape(axis) - 1);
        const int in_size = 1 + kernel_strides[d] * (y.shape(axis) - 1);
        const int out_size = 1 + input_dilation[d] * (x.shape(axis) - 1);
        padding_lo[d] = wt_size - padding_lo[d] - 1;
        padding_hi[d] = out_size - in_size + padding_hi[d];
      }
      return std::make_tuple(
          convert_vector<int64_t>(input_dilation),
          std::move(padding_lo),
          std::move(padding_hi),
          convert_vector<int64_t>(kernel_dilation));
    }

    case CONV_BACKWARD_WEIGHT:
      padding_hi = padding_lo;
      return std::make_tuple(
          convert_vector<int64_t>(kernel_dilation),
          std::move(padding_lo),
          std::move(padding_hi),
          convert_vector<int64_t>(kernel_strides));

    default:
      return std::make_tuple(
          convert_vector<int64_t>(kernel_strides),
          std::move(padding_lo),
          std::move(padding_hi),
          convert_vector<int64_t>(kernel_dilation));
  }
}

// Builds the cuDNN graph for one convolution op (forward, data gradient or
// weight gradient, selected by |backend_type|) on tensors |x|, |w| and |y|.
//
// Returns std::nullopt when graph.prepare() reports failure. A failure of the
// final build step goes through CHECK_CUDNN_FE_ERROR instead.
std::optional<DnnGraph> build_conv_graph(
    cu::CommandEncoder& encoder,
    ConvBackendType backend_type,
    Dtype dtype,
    array& x,
    array& w,
    array& y,
    const std::vector<int64_t>& stride,
    const std::vector<int64_t>& padding_lo,
    const std::vector<int64_t>& padding_hi,
    const std::vector<int64_t>& dilation) {
  // Half-precision tensors are computed in float32.
  const bool is_half = dtype == float16 || dtype == bfloat16;
  auto compute_dtype = is_half ? float32 : dtype;

  DnnGraph graph(encoder.device().get_cudnn_handle(), dtype, compute_dtype);
  auto x_ = graph.tensor_nchw("X", 'x', x);
  auto w_ = graph.tensor_nchw("W", 'w', w);

  // The settings are the same for the three attribute types.
  auto configure = [&](auto& attributes) {
    attributes.set_compute_data_type(dtype_to_cudnn_type(compute_dtype))
        .set_convolution_mode(fe::ConvolutionMode_t::CROSS_CORRELATION)
        .set_stride(stride)
        .set_pre_padding(padding_lo)
        .set_post_padding(padding_hi)
        .set_dilation(dilation);
  };

  std::shared_ptr<fe::graph::Tensor_attributes> y_;
  switch (backend_type) {
    case CONV_FORWARD: {
      auto attributes = fe::graph::Conv_fprop_attributes();
      configure(attributes);
      y_ = graph.conv_fprop(x_, w_, attributes);
      break;
    }
    case CONV_BACKWARD_INPUT: {
      auto attributes = fe::graph::Conv_dgrad_attributes();
      configure(attributes);
      y_ = graph.conv_dgrad(x_, w_, attributes);
      break;
    }
    case CONV_BACKWARD_WEIGHT: {
      auto attributes = fe::graph::Conv_wgrad_attributes();
      configure(attributes);
      // The operands are passed in the opposite order for the weight gradient.
      y_ = graph.conv_wgrad(w_, x_, attributes);
      break;
    }
    default:
      break;
  }
  graph.tensor_nchw(y_, 'y', y)->set_output(true);

  if (graph.prepare().is_bad()) {
    return std::nullopt;
  }
  graph.deselect_numeric_notes({fe::NumericalNote_t::DOWN_CONVERT_INPUTS});
  // float32 only uses tensor cores when TF32 is enabled.
  const bool allow_tensor_cores = dtype != float32 || env::enable_tf32();
  if (!allow_tensor_cores) {
    graph.deselect_numeric_notes({fe::NumericalNote_t::TENSOR_CORE});
  }
  CHECK_CUDNN_FE_ERROR(graph.build());
  return graph;
}

// Swaps |axis1| and |axis2| of |x| per group, e.g. turning
// (C_out, H, W, C_in / groups) into (C_in, H, W, C_out / groups).
//
// With groups == 1 this is a plain axis swap. Otherwise the axis |group_dim|
// is split into (groups, size / groups), the two axes are swapped on the
// split array, and the groups axis is merged back with the axis that follows
// it. Negative axis values count from the end.
array group_transpose(
    const array& x,
    int groups,
    int group_dim,
    int axis1,
    int axis2,
    Stream s) {
  if (groups == 1) {
    return swapaxes_in_eval(x, axis1, axis2);
  }

  const int ndim = x.ndim();
  auto wrap = [ndim](int axis) { return axis < 0 ? axis + ndim : axis; };
  group_dim = wrap(group_dim);
  // Inserting the groups axis at |group_dim| shifts every axis at or after it
  // by one.
  auto shift_for_split = [&](int axis) {
    axis = wrap(axis);
    return group_dim <= axis ? axis + 1 : axis;
  };
  axis1 = shift_for_split(axis1);
  axis2 = shift_for_split(axis2);

  // Split |group_dim| into (groups, size / groups).
  auto split_shape = x.shape();
  split_shape.insert(split_shape.begin() + group_dim, groups);
  split_shape[group_dim + 1] = split_shape[group_dim + 1] / groups;

  array result = reshape_in_eval(x, std::move(split_shape), s);
  result = swapaxes_in_eval(result, axis1, axis2);
  result = flatten_in_eval(result, group_dim, group_dim + 1, s);
  return result;
}

// Prepares the inputs and the output for building the cuDNN conv op by doing
// the transposes and copies that the backend type requires. It may be called
// several times within one eval_gpu; the only cost is possibly redundant
// copies.
//
// Returns the (in, wt, out) arrays to build the op with.
std::tuple<array, array, array> prepare_args(
    cu::CommandEncoder& encoder,
    ConvBackendType backend_type,
    array in,
    array wt,
    array out,
    int groups,
    Stream s) {
  // Transpose the args depending on the backend type.
  // TODO: Handle groups.
  switch (backend_type) {
    case CONV_BACKWARD_INPUT:
      wt = group_transpose(wt, groups, 0, 0, -1, s);
      break;
    case CONV_BACKWARD_WEIGHT: {
      in = group_transpose(in, groups, -1, 0, -1, s);
      wt = swapaxes_in_eval(wt, 0, -1);
      // Create a contiguous array that shares the data with |out|, but with
      // dim C_in and C_out swapped.
      Shape swapped_shape(out.shape());
      std::swap(swapped_shape.front(), swapped_shape.back());
      // Row-major strides for the swapped shape.
      Strides dense_strides(swapped_shape.size(), 1);
      for (int i = swapped_shape.size() - 2; i >= 0; --i) {
        dense_strides[i] = swapped_shape[i + 1] * dense_strides[i + 1];
      }
      array intermediate(std::move(swapped_shape), out.dtype(), nullptr, {});
      intermediate.copy_shared_buffer(
          out, std::move(dense_strides), {true, true, false}, out.data_size());
      out = intermediate;
      break;
    }
    default:
      break;
  }

  // cuDNN requires contiguous input.
  auto make_row_contiguous = [&](array& arg) {
    if (arg.flags().row_contiguous) {
      return;
    }
    arg = contiguous_copy_gpu(arg, s);
    encoder.add_temporary(arg);
  };
  make_row_contiguous(in);
  make_row_contiguous(wt);

  return {std::move(in), std::move(wt), std::move(out)};
}

// Registers the conv inputs and the final output with the encoder before the
// conv op is run. Can only be called once per eval_gpu.
//
// For CONV_BACKWARD_WEIGHT, |final_out| is additionally turned into a strided
// view of |intermediate_out|'s buffer.
void register_args(
    cu::CommandEncoder& encoder,
    ConvBackendType backend_type,
    array& in,
    array& wt,
    array& intermediate_out,
    array& final_out) {
  encoder.set_input_array(in);
  encoder.set_input_array(wt);
  encoder.set_output_array(final_out);

  if (backend_type != CONV_BACKWARD_WEIGHT) {
    return;
  }

  // Exchange the first and last strides so that |final_out| sees C_in and
  // C_out swapped; after the swap done in vjp the final |grad_weight| is then
  // contiguous.
  Strides swapped_strides = intermediate_out.strides();
  std::swap(swapped_strides.front(), swapped_strides.back());
  final_out.copy_shared_buffer(
      intermediate_out,
      std::move(swapped_strides),
      {false, false, false},
      intermediate_out.data_size());
}

} // namespace

// Evaluates the convolution of inputs[0] (input) with inputs[1] (weight) into
// |out_|.
//
// The function first looks the problem up in conv_cache(). On a hit it either
// replays the cached cuDNN graph or runs the gemm_conv() fallback, depending
// on what was stored. On a miss it tries to build a cuDNN graph for a list of
// candidate backend types, runs and caches the first one that builds, and
// otherwise runs gemm_conv() and caches that decision.
void Convolution::eval_gpu(const std::vector<array>& inputs, array& out_) {
  nvtx3::scoped_range r("Convolution::eval_gpu");
  // Nothing to compute for an empty output.
  if (out_.size() == 0) {
    return;
  }
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  assert(inputs.size() == 2);
  array in = inputs[0];
  array wt = inputs[1];
  // NOTE(review): |out| is a copy of |out_| and the buffer is set on the copy;
  // presumably both refer to the same underlying array data -- confirm against
  // the array copy semantics.
  array out = out_;
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  Dtype dtype = out.dtype();

  // Search cache.
  // The key is built from the device, dtype, input/weight shapes, all
  // convolution parameters and the get_alignment() value of each array.
  BytesKey<ConvCacheKey> cache_key;
  cache_key.pod.device_id = encoder.device().cuda_device();
  cache_key.pod.cudnn_dtype = dtype_to_cudnn_type(dtype);
  cache_key.pod.input_shape = vector_key(in.shape());
  cache_key.pod.weight_shape = vector_key(wt.shape());
  cache_key.pod.stride = vector_key(kernel_strides_);
  cache_key.pod.padding_lo = vector_key(padding_lo_);
  cache_key.pod.padding_hi = vector_key(padding_hi_);
  cache_key.pod.dilation = vector_key(kernel_dilation_);
  cache_key.pod.groups = groups_;
  cache_key.pod.flip = flip_;
  cache_key.pod.input_alignment = get_alignment(in);
  cache_key.pod.weight_alignment = get_alignment(wt);
  cache_key.pod.output_alignment = get_alignment(out);
  if (auto it = conv_cache().find(cache_key); it != conv_cache().end()) {
    // A cache entry holds the backend type and an optional graph; an empty
    // graph marks a problem that previously had to use the fallback kernel.
    auto& [backend_type, graph] = it->second;
    if (graph) {
      // Run cached graph.
      std::tie(in, wt, out) =
          prepare_args(encoder, backend_type, in, wt, out, groups_, s);
      register_args(encoder, backend_type, in, wt, out, out_);
      CHECK_CUDNN_FE_ERROR(graph->encode_capturing(
          encoder,
          {
              {'x', gpu_ptr<void>(in)},
              {'w', gpu_ptr<void>(wt)},
              {'y', gpu_ptr<void>(out)},
          }));
    } else {
      // Run fallback kernel.
      gemm_conv(
          encoder,
          in,
          wt,
          out,
          kernel_strides_,
          padding_lo_,
          kernel_dilation_,
          input_dilation_,
          groups_,
          flip_,
          s);
    }
    return;
  }

  // There is no reliable way to deduce the proper cuDNN backend for the
  // convolution, so we make a best guess and then try.
  SmallVector<ConvBackendType, 2> try_backends;
  if (flip_) {
    // When weight is flipped, we assume it is backward input convolution.
    try_backends.push_back(CONV_BACKWARD_INPUT);
  } else {
    // Otherwise it could be backward weight convolution or forward convolution,
    // mathematically there is no difference so we have to use heuristics.
    // Empirically backward convolutions have large kernel dimensions, and
    // usually have |in| and |wt| transposed.
    if (!in.flags().row_contiguous && !wt.flags().row_contiguous &&
        wt.shape(2) > out.shape(2)) {
      try_backends = {CONV_BACKWARD_WEIGHT, CONV_FORWARD};
    } else {
      try_backends = {CONV_FORWARD, CONV_BACKWARD_WEIGHT};
    }
  }

  // Try to build op graph.
  // |backend_type| is only assigned (and only read afterwards) when a graph
  // was built successfully.
  ConvBackendType backend_type;
  std::optional<DnnGraph> graph;
  for (auto try_backend : try_backends) {
    // Prepare candidate arguments without touching in/wt/out, so that a failed
    // attempt leaves the originals intact for the next backend or the
    // fallback.
    auto [x, w, y] =
        prepare_args(encoder, try_backend, in, wt, out, groups_, s);
    auto [stride, padding_lo, padding_hi, dilation] = get_conv_settings(
        try_backend,
        x,
        w,
        y,
        kernel_strides_,
        padding_lo_,
        padding_hi_,
        kernel_dilation_,
        input_dilation_);
    graph = build_conv_graph(
        encoder,
        try_backend,
        dtype,
        x,
        w,
        y,
        stride,
        padding_lo,
        padding_hi,
        dilation);
    if (graph) {
      // Commit to this backend and to the arguments prepared for it.
      backend_type = try_backend;
      in = std::move(x);
      wt = std::move(w);
      out = std::move(y);
      break;
    }
  }

  if (graph) {
    // Run the freshly built graph, then cache it for later calls with the
    // same key.
    register_args(encoder, backend_type, in, wt, out, out_);
    CHECK_CUDNN_FE_ERROR(graph->encode_capturing(
        encoder,
        {
            {'x', gpu_ptr<void>(in)},
            {'w', gpu_ptr<void>(wt)},
            {'y', gpu_ptr<void>(out)},
        }));
    conv_cache().emplace(
        cache_key, std::make_pair(backend_type, std::move(*graph)));
    return;
  }

  // Use fallback kernel for settings not supported by cuDNN.
  gemm_conv(
      encoder,
      in,
      wt,
      out,
      kernel_strides_,
      padding_lo_,
      kernel_dilation_,
      input_dilation_,
      groups_,
      flip_,
      s);
  // Remember that this key needs the fallback so the graph search is skipped
  // next time.
  conv_cache().emplace(cache_key, std::make_pair(CONV_FALLBACK, std::nullopt));
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/copy/copy.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/cast_op.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"

namespace mlx::core {

// Copies out.data_size() elements from |in| to |out| with a dtype cast.
// |offset_in| / |offset_out| are element offsets added to the typed data
// pointers. With CopyType::Vector the input is read element by element; for
// any other |ctype| the first input element is broadcast to every output.
void copy_contiguous(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t offset_in,
    int64_t offset_out);

// Copies with a dtype cast between two strided layouts: element locations on
// both sides are derived from |shape| together with |strides_in| and
// |strides_out|. |offset_in| / |offset_out| are element offsets.
void copy_general(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t offset_in,
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out);

// Same as copy_general(), but an additional int64 element offset is read on
// the device from |dynamic_offset_in| / |dynamic_offset_out| and added to
// every input / output location.
void copy_general_dynamic(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t offset_in,
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out,
    const array& dynamic_offset_in,
    const array& dynamic_offset_out);

// Copies with a dtype cast from a strided input (described by |shape| and
// |strides_in|) into an output that is written densely in row-major order.
void copy_general_input(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t offset_in,
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in);

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/copy/copy_contiguous.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/copy/copy.cuh"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Fill kernel: converts the single element at |in| with cast_to<Out> and
// writes it to the first |size| elements of |out|.
//
// Thread |index| owns the group of N_READS consecutive outputs starting at
// index * N_READS. Groups that lie entirely inside [0, size) are written with
// one vector store; the (at most one) partial trailing group is written
// element by element.
template <typename In, typename Out, typename IdxT, int N_READS>
__global__ void copy_s(const In* in, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();

  // Number of groups that fit completely into |size|. Comparing against this
  // count instead of evaluating (index + 1) * N_READS keeps the bounds check
  // from wrapping around when IdxT is a 32-bit type and |size| is within
  // N_READS of the largest representable value, which would otherwise send
  // the last thread down the vector path and store past the end of |out|.
  IdxT full_groups = size / N_READS;

  if (index < full_groups) {
    AlignedVector<Out, N_READS> out_vec;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      out_vec[i] = cast_to<Out>(in[0]);
    }

    store_vector<N_READS>(out, index, out_vec);
  } else if (index == full_groups) {
    // Partial trailing group (empty when |size| is a multiple of N_READS).
    // index * N_READS <= size here, so the product cannot overflow.
    for (IdxT i = index * N_READS; i < size; ++i) {
      out[i] = cast_to<Out>(in[0]);
    }
  }
  // Threads past the trailing group have nothing to write.
}

// Element-wise copy kernel: out[i] = cast_to<Out>(in[i]) for i in [0, size).
//
// Thread |index| owns the group of N_READS consecutive elements starting at
// index * N_READS. Groups that lie entirely inside [0, size) are moved with
// one vector load and one vector store; the (at most one) partial trailing
// group is copied element by element.
template <typename In, typename Out, typename IdxT, int N_READS>
__global__ void copy_v(const In* in, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();

  // Number of groups that fit completely into |size|. Comparing against this
  // count instead of evaluating (index + 1) * N_READS keeps the bounds check
  // from wrapping around when IdxT is a 32-bit type and |size| is within
  // N_READS of the largest representable value, which would otherwise send
  // the last thread down the vector path and access memory past the end of
  // both buffers.
  IdxT full_groups = size / N_READS;

  if (index < full_groups) {
    auto in_vec = load_vector<N_READS>(in, index);

    AlignedVector<Out, N_READS> out_vec;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      out_vec[i] = cast_to<Out>(in_vec[i]);
    }

    store_vector<N_READS>(out, index, out_vec);
  } else if (index == full_groups) {
    // Partial trailing group (empty when |size| is a multiple of N_READS).
    // index * N_READS <= size here, so the product cannot overflow.
    for (IdxT i = index * N_READS; i < size; ++i) {
      out[i] = cast_to<Out>(in[i]);
    }
  }
  // Threads past the trailing group have nothing to copy.
}

} // namespace cu

// Launches the contiguous copy kernel for the dtype pair of |in| and |out|.
// copy_v is used for CopyType::Vector and copy_s (broadcast of the first input
// element) for every other |ctype|. |in_offset| / |out_offset| are element
// offsets applied to the typed data pointers.
void copy_contiguous(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t in_offset,
    int64_t out_offset) {
  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      // 32-bit indexing is used unless the output is too large for it.
      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
        using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
        using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
        // Each thread handles 16 bytes worth of input elements.
        constexpr int N_READS = 16 / sizeof(InType);

        auto kernel = cu::copy_v<InType, OutType, IdxT, N_READS>;
        if (ctype != CopyType::Vector) {
          kernel = cu::copy_s<InType, OutType, IdxT, N_READS>;
        }

        auto [num_blocks, block_dims] = get_launch_args(
            out.data_size(), out.shape(), out.strides(), large(), N_READS);
        encoder.add_kernel_node(
            kernel,
            num_blocks,
            block_dims,
            gpu_ptr<InType>(in) + in_offset,
            gpu_ptr<OutType>(out) + out_offset,
            out.data_size());
      });
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/copy/copy_general.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/copy/copy.cuh"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Strided-to-strided copy kernel for a compile-time number of dimensions.
//
// The y coordinate of a thread selects one of |size_rest| rows (all dimensions
// except the last one flattened), the x coordinate selects a group of up to
// N_READS elements along the last dimension of that row.
template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
__global__ void copy_gg_nd(
    const In* in,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  IdxT row =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  // Rows beyond the end come from rounding the grid up to whole blocks.
  if (row >= size_rest) {
    return;
  }

  // Extent and strides of the innermost dimension.
  auto inner_extent = shape[NDIM - 1];
  auto inner_stride_in = strides_in[NDIM - 1];
  auto inner_stride_out = strides_out[NDIM - 1];

  IdxT col =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;

  // Locations of the first element of this row in the input and the output.
  auto [row_loc_in, row_loc_out] = elem_to_loc_nd<NDIM>(
      row * inner_extent,
      shape.data(),
      strides_in.data(),
      strides_out.data());

  auto src_vec = load_vector<N_READS>(
      in + row_loc_in, col, inner_extent, inner_stride_in, In(0));
  AlignedVector<Out, N_READS> dst_vec;
#pragma unroll
  for (int k = 0; k < N_READS; ++k) {
    dst_vec[k] = CastOp<In, Out>{}(src_vec[k]);
  }
  store_vector(
      out + row_loc_out, col, dst_vec, inner_extent, inner_stride_out);
}

// Strided-to-strided copy kernel for a run-time number of dimensions |ndim|.
//
// The y coordinate of a thread selects one of |size_rest| rows (all dimensions
// except the last one flattened), the x coordinate selects a group of up to
// N_READS elements along the last dimension of that row.
template <typename In, typename Out, typename IdxT, int N_READS>
__global__ void copy_gg(
    const In* in,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides strides_in,
    const __grid_constant__ Strides strides_out,
    int ndim) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  IdxT row =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  // Rows beyond the end come from rounding the grid up to whole blocks.
  if (row >= size_rest) {
    return;
  }

  // Extent and strides of the innermost dimension.
  auto inner_extent = shape[ndim - 1];
  auto inner_stride_in = strides_in[ndim - 1];
  auto inner_stride_out = strides_out[ndim - 1];

  IdxT col =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;

  // Locations of the first element of this row in the input and the output.
  auto [row_loc_in, row_loc_out] = elem_to_loc(
      row * inner_extent,
      shape.data(),
      strides_in.data(),
      strides_out.data(),
      ndim);

  auto src_vec = load_vector<N_READS>(
      in + row_loc_in, col, inner_extent, inner_stride_in, In(0));
  AlignedVector<Out, N_READS> dst_vec;
#pragma unroll
  for (int k = 0; k < N_READS; ++k) {
    dst_vec[k] = CastOp<In, Out>{}(src_vec[k]);
  }
  store_vector(
      out + row_loc_out, col, dst_vec, inner_extent, inner_stride_out);
}

} // namespace cu

// Launches a strided-to-strided copy (with dtype cast) described by |shape|,
// |strides_in| and |strides_out|. Shapes with up to three dimensions use the
// copy_gg_nd kernel specialised on the dimension count, everything else uses
// the generic copy_gg kernel.
void copy_general(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t offset_in,
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out) {
  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      // 64-bit indexing is only used when either buffer needs it.
      dispatch_bool(
          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
          [&](auto large) {
            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;

            const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
            OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;

            // Total number of elements described by |shape|.
            int ndim = shape.size();
            size_t data_size = 1;
            for (auto& extent : shape) {
              data_size *= extent;
            }

            // Split the work into the innermost dimension (x) and the rest
            // (y); each thread covers four inner elements when the innermost
            // dimension is long enough, otherwise one.
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = data_size / dim0;
            const int work_per_thread = dim0 >= 4 ? 4 : 1;

            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);

            if (ndim > 3) {
              auto kernel = cu::copy_gg<InType, OutType, IdxT, 4>;
              if (work_per_thread != 4) {
                kernel = cu::copy_gg<InType, OutType, IdxT, 1>;
              }
              encoder.add_kernel_node(
                  kernel,
                  {num_blocks_x, num_blocks_y},
                  block_dims,
                  in_ptr,
                  out_ptr,
                  rest,
                  const_param(shape),
                  const_param(strides_in),
                  const_param(strides_out),
                  ndim);
            } else {
              dispatch_1_2_3(ndim, [&](auto ndim_constant) {
                auto kernel =
                    cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 4>;
                if (work_per_thread != 4) {
                  kernel =
                      cu::copy_gg_nd<InType, OutType, IdxT, ndim_constant(), 1>;
                }
                encoder.add_kernel_node(
                    kernel,
                    {num_blocks_x, num_blocks_y},
                    block_dims,
                    in_ptr,
                    out_ptr,
                    rest,
                    const_param<ndim_constant()>(shape),
                    const_param<ndim_constant()>(strides_in),
                    const_param<ndim_constant()>(strides_out));
              });
            }
          });
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/copy/copy_general_dynamic.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/copy/copy.cuh"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Strided-to-strided copy kernel for a compile-time number of dimensions with
// additional element offsets that are read from device memory. One thread
// copies one element.
template <typename In, typename Out, typename IdxT, int NDIM>
__global__ void copy_gg_dynamic_nd(
    const In* in,
    Out* out,
    IdxT size,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_in,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides_out,
    const int64_t* offset_in,
    const int64_t* offset_out) {
  IdxT elem = cg::this_grid().thread_rank();
  // Surplus threads of the last block have no element to copy.
  if (elem >= size) {
    return;
  }

  auto [src_loc, dst_loc] = elem_to_loc_nd<NDIM>(
      elem, shape.data(), strides_in.data(), strides_out.data());
  // The dynamic offsets are dereferenced here, on the device.
  out[dst_loc + *offset_out] = CastOp<In, Out>{}(in[src_loc + *offset_in]);
}

// Strided-to-strided copy kernel for a run-time number of dimensions |ndim|
// with additional element offsets that are read from device memory. One thread
// copies one element.
template <typename In, typename Out, typename IdxT>
__global__ void copy_gg_dynamic(
    const In* in,
    Out* out,
    IdxT size,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides strides_in,
    const __grid_constant__ Strides strides_out,
    int ndim,
    const int64_t* offset_in,
    const int64_t* offset_out) {
  IdxT elem = cg::this_grid().thread_rank();
  // Surplus threads of the last block have no element to copy.
  if (elem >= size) {
    return;
  }

  auto [src_loc, dst_loc] = elem_to_loc(
      elem, shape.data(), strides_in.data(), strides_out.data(), ndim);
  // The dynamic offsets are dereferenced here, on the device.
  out[dst_loc + *offset_out] = CastOp<In, Out>{}(in[src_loc + *offset_in]);
}

} // namespace cu

// Launches a strided-to-strided copy (with dtype cast) whose input and output
// locations are additionally shifted by the int64 values stored in
// |dynamic_offset_in| and |dynamic_offset_out|. Shapes with up to three
// dimensions use the kernel specialised on the dimension count, everything
// else uses the generic kernel.
void copy_general_dynamic(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t offset_in,
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out,
    const array& dynamic_offset_in,
    const array& dynamic_offset_out) {
  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      // 64-bit indexing is only used when either buffer needs it.
      dispatch_bool(
          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
          [&](auto large) {
            using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
            using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;

            // Static element offsets are folded into the base pointers.
            const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
            OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;

            int ndim = shape.size();
            if (ndim > 3) {
              auto [num_blocks, block_dims] = get_launch_args(out, large());
              encoder.add_kernel_node(
                  cu::copy_gg_dynamic<InType, OutType, IdxT>,
                  num_blocks,
                  block_dims,
                  in_ptr,
                  out_ptr,
                  out.size(),
                  const_param(shape),
                  const_param(strides_in),
                  const_param(strides_out),
                  ndim,
                  gpu_ptr<int64_t>(dynamic_offset_in),
                  gpu_ptr<int64_t>(dynamic_offset_out));
            } else {
              dispatch_1_2_3(ndim, [&](auto dims_constant) {
                auto [num_blocks, block_dims] = get_launch_args(out, large());
                encoder.add_kernel_node(
                    cu::copy_gg_dynamic_nd<
                        InType,
                        OutType,
                        IdxT,
                        dims_constant()>,
                    num_blocks,
                    block_dims,
                    in_ptr,
                    out_ptr,
                    out.size(),
                    const_param<dims_constant()>(shape),
                    const_param<dims_constant()>(strides_in),
                    const_param<dims_constant()>(strides_out),
                    gpu_ptr<int64_t>(dynamic_offset_in),
                    gpu_ptr<int64_t>(dynamic_offset_out));
              });
            }
          });
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/copy/copy_general_input.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/copy/copy.cuh"

#include <cooperative_groups.h>

namespace mlx::core {
// Edge length, in threads, of the square thread block used by the
// column-to-row copy kernel (copy_col_row); each block transposes a tile of
// (TILE_SIZE * N_READS) x (TILE_SIZE * N_READS) elements.
static constexpr int TILE_SIZE = 16;

namespace cu {

namespace cg = cooperative_groups;

// Copy kernel from a strided input to a densely written output for a
// compile-time number of dimensions.
//
// The y coordinate of a thread selects one of |size_rest| rows (all dimensions
// except the last one flattened), the x coordinate selects a group of up to
// N_READS elements along the last dimension of that row.
template <typename In, typename Out, typename IdxT, int NDIM, int N_READS>
__global__ void copy_g_nd(
    const In* in,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> strides) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  IdxT row =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  // Rows beyond the end come from rounding the grid up to whole blocks.
  if (row >= size_rest) {
    return;
  }

  // Extent and input stride of the innermost dimension.
  auto inner_extent = shape[NDIM - 1];
  auto inner_stride = strides[NDIM - 1];

  IdxT col =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;

  // Location of the first element of this row in the input.
  auto row_loc =
      elem_to_loc_nd<NDIM>(row * inner_extent, shape.data(), strides.data());

  auto src_vec =
      load_vector<N_READS>(in + row_loc, col, inner_extent, inner_stride, In(0));
  AlignedVector<Out, N_READS> dst_vec;
#pragma unroll
  for (int k = 0; k < N_READS; ++k) {
    dst_vec[k] = CastOp<In, Out>{}(src_vec[k]);
  }
  // Output rows are stored back to back, inner_extent elements apart.
  store_vector(out + inner_extent * row, col, dst_vec, inner_extent);
}

// Copy kernel from a strided input to a densely written output for a run-time
// number of dimensions |ndim|.
//
// The y coordinate of a thread selects one of |size_rest| rows (all dimensions
// except the last one flattened), the x coordinate selects a group of up to
// N_READS elements along the last dimension of that row.
template <typename In, typename Out, typename IdxT, int N_READS>
__global__ void copy_g(
    const In* in,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides strides,
    int ndim) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  IdxT row =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  // Rows beyond the end come from rounding the grid up to whole blocks.
  if (row >= size_rest) {
    return;
  }

  // Extent and input stride of the innermost dimension.
  auto inner_extent = shape[ndim - 1];
  auto inner_stride = strides[ndim - 1];

  IdxT col =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;

  // Location of the first element of this row in the input.
  auto row_loc =
      elem_to_loc(row * inner_extent, shape.data(), strides.data(), ndim);

  auto src_vec =
      load_vector<N_READS>(in + row_loc, col, inner_extent, inner_stride, In(0));
  AlignedVector<Out, N_READS> dst_vec;
#pragma unroll
  for (int k = 0; k < N_READS; ++k) {
    dst_vec[k] = CastOp<In, Out>{}(src_vec[k]);
  }
  // Output rows are stored back to back, inner_extent elements apart.
  store_vector(out + inner_extent * row, col, dst_vec, inner_extent);
}

// Copies a rows x cols matrix stored column by column (element (r, c) at
// in[c * rows + r]) into row-by-row storage (element (r, c) at
// out[r * cols + c]), casting each element from In to Out.
//
// Every thread block handles one square tile of (N_READS * TILE_SIZE)^2
// elements: the tile is first read from |in| into shared memory and, after a
// block-wide barrier, written from shared memory to |out|. Both the reads and
// the writes move N_READS consecutive elements per vector access.
template <typename In, typename Out, int N_READS>
__global__ void
copy_col_row(const In* in, Out* out, int64_t rows, int64_t cols) {
  // Shared staging tile. The second dimension carries 4 / sizeof(Out) extra
  // elements of padding per row.
  // NOTE(review): the padding is presumably there to avoid shared-memory bank
  // conflicts -- confirm.
  __shared__ Out
      tile[N_READS * TILE_SIZE][N_READS * TILE_SIZE + 4 / sizeof(Out)];

  auto block = cg::this_thread_block();
  auto grid = cg::this_grid();

  // Origin of this block's tile: grid x walks the row index, grid y the
  // column index.
  auto tile_row = grid.block_index().x * TILE_SIZE * N_READS;
  auto tile_col = grid.block_index().y * TILE_SIZE * N_READS;

  // Each thread covers N_READS consecutive elements in x (via the vector
  // load/store) and N_READS consecutive lines in y (via the loops below).
  auto tidx = block.thread_index().x;
  auto tidy = N_READS * block.thread_index().y;

  // Start of input column (tile_col + tidy), at the first row of the tile.
  auto in_ptr = in + (tile_col + tidy) * rows + tile_row;

#pragma unroll
  for (int i = 0; i < N_READS; ++i) {
    // Skip columns that fall outside the matrix.
    if ((tile_col + tidy + i) < cols) {
      // Read N_READS consecutive rows of this column; the remaining row count
      // and the fill value In(0) are passed to load_vector for the tile edge.
      auto in_vec = load_vector<N_READS>(in_ptr, tidx, rows - tile_row, In(0));
#pragma unroll
      for (int j = 0; j < N_READS; ++j) {
        // tile is indexed [row within tile][column within tile].
        tile[N_READS * tidx + j][tidy + i] = CastOp<In, Out>{}(in_vec[j]);
      }
      // Advance to the next input column.
      in_ptr += rows;
    }
  }

  // All reads into the tile must finish before any thread reads it back.
  block.sync();

  // Start of output row (tile_row + tidy), at the first column of the tile.
  auto out_ptr = out + (tile_row + tidy) * cols + tile_col;

#pragma unroll
  for (int i = 0; i < N_READS; ++i) {
    // Skip rows that fall outside the matrix.
    if ((tile_row + tidy + i) < rows) {
      AlignedVector<Out, N_READS> out_vec;
#pragma unroll
      for (int j = 0; j < N_READS; ++j) {
        out_vec[j] = tile[tidy + i][N_READS * tidx + j];
      }
      // Write N_READS consecutive columns of this row; the remaining column
      // count is passed to store_vector for the tile edge.
      store_vector(out_ptr, tidx, out_vec, cols - tile_col);
      // Advance to the next output row.
      out_ptr += cols;
    }
  }
}

} // namespace cu

// Launches a copy (with dtype cast) from a strided input described by |shape|
// and |strides_in| into a densely written output.
//
// A 2-D input whose first dimension has stride 1 and whose second dimension
// has stride shape[0] is handled by the tiled copy_col_row kernel. All other
// inputs use copy_g_nd (up to three dimensions) or copy_g.
void copy_general_input(
    cu::CommandEncoder& encoder,
    CopyType ctype,
    const array& in,
    array& out,
    int64_t offset_in,
    int64_t offset_out,
    const Shape& shape,
    const Strides& strides_in) {
  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      using InType = cuda_type_t<MLX_GET_TYPE(in_type_tag)>;
      using OutType = cuda_type_t<MLX_GET_TYPE(out_type_tag)>;

      // Static element offsets are folded into the base pointers.
      const InType* in_ptr = gpu_ptr<InType>(in) + offset_in;
      OutType* out_ptr = gpu_ptr<OutType>(out) + offset_out;
      int ndim = shape.size();

      // Column contiguous to row contiguous specialization
      if (ndim == 2 && strides_in[0] == 1 && strides_in[1] == shape[0]) {
        // 16 bytes of output per thread, capped at 8 elements.
        constexpr int work_per_thread =
            std::min(static_cast<int>(16 / sizeof(OutType)), 8);
        dim3 block_dims = {TILE_SIZE, TILE_SIZE};
        uint32_t num_blocks_x =
            cuda::ceil_div(shape[0], TILE_SIZE * work_per_thread);
        uint32_t num_blocks_y =
            cuda::ceil_div(shape[1], TILE_SIZE * work_per_thread);
        encoder.add_kernel_node(
            cu::copy_col_row<InType, OutType, work_per_thread>,
            {num_blocks_x, num_blocks_y},
            block_dims,
            in_ptr,
            out_ptr,
            int64_t(shape[0]),
            int64_t(shape[1]));
        return;
      }

      // 64-bit indexing is only used when either buffer needs it.
      dispatch_bool(
          in.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
          [&](auto large) {
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;

            // Split the work into the innermost dimension (x) and the rest
            // (y). Each thread covers 8, 4 or 1 inner elements depending on
            // the length of the innermost dimension.
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = out.size() / dim0;
            int work_per_thread = dim0 >= 8 ? 8 : (dim0 >= 4 ? 4 : 1);

            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);

            if (ndim > 3) {
              auto kernel = cu::copy_g<InType, OutType, IdxT, 1>;
              switch (work_per_thread) {
                case 8:
                  kernel = cu::copy_g<InType, OutType, IdxT, 8>;
                  break;
                case 4:
                  kernel = cu::copy_g<InType, OutType, IdxT, 4>;
                  break;
                default:
                  break;
              }
              encoder.add_kernel_node(
                  kernel,
                  {num_blocks_x, num_blocks_y},
                  block_dims,
                  in_ptr,
                  out_ptr,
                  rest,
                  const_param(shape),
                  const_param(strides_in),
                  ndim);
            } else {
              dispatch_1_2_3(ndim, [&](auto dims_constant) {
                auto kernel =
                    cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 1>;
                switch (work_per_thread) {
                  case 8:
                    kernel =
                        cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 8>;
                    break;
                  case 4:
                    kernel =
                        cu::copy_g_nd<InType, OutType, IdxT, dims_constant(), 4>;
                    break;
                  default:
                    break;
                }
                encoder.add_kernel_node(
                    kernel,
                    {num_blocks_x, num_blocks_y},
                    block_dims,
                    in_ptr,
                    out_ptr,
                    rest,
                    const_param<dims_constant()>(shape),
                    const_param<dims_constant()>(strides_in));
              });
            }
          });
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/copy.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/copy/copy.cuh"

namespace mlx::core {

// Sets up the data of |out| for a copy of type |ctype| and then copies |in|
// into it on stream |s|. No kernel is launched when the input buffer was
// donated to the output and no dtype conversion is required.
void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
  auto& encoder = cu::get_command_encoder(s);
  bool donated = set_copy_output_data(
      in, out, ctype, [&](auto n) { return cu::malloc_async(n, encoder); });

  // A donated buffer of the same dtype already holds the right contents, so
  // there is nothing to copy.
  const bool same_dtype = in.dtype() == out.dtype();
  if (donated && same_dtype) {
    return;
  }

  // The output was just set up by set_copy_output_data(), so a
  // general-to-general copy is downgraded to a general (strided input) copy.
  copy_gpu_inplace(
      in,
      out,
      ctype == CopyType::GeneralGeneral ? CopyType::General : ctype,
      s);
}

void copy_gpu_inplace(
    const array& in,
    array& out,
    const Shape& shape,
    const Strides& strides_in,
    const Strides& strides_out,
    int64_t offset_in,
    int64_t offset_out,
    CopyType ctype,
    const Stream& s,
    std::optional<array> dynamic_offset_in,
    std::optional<array> dynamic_offset_out) {
  if (out.size() == 0) {
    return;
  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  if (ctype == CopyType::Scalar || ctype == CopyType::Vector) {
    copy_contiguous(encoder, ctype, in, out, offset_in, offset_out);
    return;
  }

  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
    auto [shape_collapsed, strides_vec] = collapse_contiguous_dims(
        shape, std::vector{strides_in, strides_out}, INT32_MAX);
    if (ctype == CopyType::General) {
      copy_general_input(
          encoder,
          ctype,
          in,
          out,
          offset_in,
          offset_out,
          shape_collapsed,
          strides_vec[0]);
    } else {
      if (dynamic_offset_in || dynamic_offset_out) {
        if (!dynamic_offset_in) {
          dynamic_offset_in = array(0, int64);
          encoder.add_temporary(*dynamic_offset_in);
        }
        if (!dynamic_offset_out) {
          dynamic_offset_out = array(0, int64);
          encoder.add_temporary(*dynamic_offset_out);
        }
        encoder.set_input_array(*dynamic_offset_in);
        encoder.set_input_array(*dynamic_offset_out);
        copy_general_dynamic(
            encoder,
            ctype,
            in,
            out,
            offset_in,
            offset_out,
            shape_collapsed,
            strides_vec[0],
            strides_vec[1],
            *dynamic_offset_in,
            *dynamic_offset_out);
      } else {
        copy_general(
            encoder,
            ctype,
            in,
            out,
            offset_in,
            offset_out,
            shape_collapsed,
            strides_vec[0],
            strides_vec[1]);
      }
    }
    return;
  }
}

// Allocates |out| and fills every element of it with the first element of
// |in| (a CopyType::Scalar copy) on stream |s|. Empty outputs are left
// untouched.
void fill_gpu(const array& in, array& out, const Stream& s) {
  const bool is_empty = out.size() == 0;
  if (is_empty) {
    return;
  }

  auto& encoder = cu::get_command_encoder(s);
  // The output buffer has to exist before it is registered with the encoder.
  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  encoder.set_input_array(in);
  encoder.set_output_array(out);
  copy_contiguous(encoder, CopyType::Scalar, in, out, 0, 0);
}

// Reshapes |in| into |out| on stream |s|. When prepare_reshape() reports that
// no copy is needed, |out| shares the input buffer with the strides it
// computed; otherwise the input is copied into a freshly allocated output laid
// out with contiguous strides.
void reshape_gpu(const array& in, array& out, Stream s) {
  auto [copy_necessary, out_strides] = prepare_reshape(in, out);

  // Cheap path: reuse the input buffer with the new strides.
  if (!copy_necessary) {
    shared_buffer_reshape(in, out_strides, out);
    return;
  }

  // Copy path: materialise the input with contiguous strides for in.shape().
  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  copy_gpu_inplace(
      in,
      out,
      in.shape(),
      in.strides(),
      make_contiguous_strides(in.shape()),
      0,
      0,
      CopyType::General,
      s);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/cublas_utils.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/cublas_utils.h"
#include "mlx/backend/cuda/cuda.h"
#include "mlx/utils.h"

namespace mlx::core {
namespace cublas_utils {

namespace {

// Owns a cublasLt matmul preference whose maximum workspace size is chosen
// from the compute capability of the device it is constructed with.
//
// The type is the sole owner of |pref_|, so it is non-copyable: a copy would
// destroy the same handle twice.
struct CublasPreference {
  explicit CublasPreference(cu::Device& device) {
    // The recommended cublas workspace size is 4 MiB for pre-Hopper and 32 MiB
    // for Hopper+:
    // https://docs.nvidia.com/cuda/cublas/#cublassetworkspace
    uint64_t MiB = 1024 * 1024;
    uint64_t workspace_size =
        device.compute_capability_major() >= 9 ? 32 * MiB : 4 * MiB;

    CHECK_CUBLAS_ERROR(cublasLtMatmulPreferenceCreate(&pref_));
    CHECK_CUBLAS_ERROR(cublasLtMatmulPreferenceSetAttribute(
        pref_,
        CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
        &workspace_size,
        sizeof(uint64_t)));
  }

  CublasPreference(const CublasPreference&) = delete;
  CublasPreference& operator=(const CublasPreference&) = delete;

  ~CublasPreference() {
    // Destructors are implicitly noexcept, so letting the throwing error check
    // run here would call std::terminate on failure (this object is destroyed
    // during static teardown). Release on a best-effort basis instead.
    if (pref_ != nullptr) {
      static_cast<void>(cublasLtMatmulPreferenceDestroy(pref_));
    }
  }

  cublasLtMatmulPreference_t pref_{nullptr};
};

} // namespace

// Return the shared cublasLt matmul preference handle.
//
// The preference lives in a function-local static, so it is constructed on the
// first call from the |device| passed at that time; every later call returns
// that same handle regardless of its |device| argument.
cublasLtMatmulPreference_t get_preference(cu::Device& device) {
  static CublasPreference shared_preference{device};
  return shared_preference.pref_;
}

// Create a cublasLt matrix layout descriptor.
//
// |rows| and |cols| are swapped first when |transposed| is set, then the
// layout is created with element type |type| and leading dimension |ld|.
// The batch count and strided-batch offset attributes are only set when
// |batch_count| is greater than 1.
//
// The descriptor is returned without being released here, so the caller is
// responsible for destroying it. Any failing cublasLt call is reported through
// CHECK_CUBLAS_ERROR.
cublasLtMatrixLayout_t create_matrix_layout(
    cudaDataType_t type,
    uint64_t rows,
    uint64_t cols,
    bool transposed,
    int64_t ld,
    int32_t batch_count,
    int64_t batch_stride) {
  cublasLtMatrixLayout_t desc;
  if (transposed) {
    std::swap(rows, cols);
  }
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutCreate(&desc, type, rows, cols, ld));
  if (batch_count > 1) {
    CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
        desc,
        CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
        &batch_count,
        sizeof(int32_t)));
    CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
        desc,
        CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
        &batch_stride,
        sizeof(int64_t)));
  }
  return desc;
}

} // namespace cublas_utils

// Release the four matrix layout descriptors and the matmul descriptor.
//
// NOTE(review): init_base() in this file assigns a_desc_, b_desc_, out_desc_
// and matmul_desc_ but not c_desc_, so c_desc_ may still be nullptr here
// unless a subclass sets it. Confirm that the destroy calls accept nullptr.
// NOTE(review): CHECK_CUBLAS_ERROR reports failure by throwing; a throw from a
// destructor terminates the program.
CublasMatmulBase::~CublasMatmulBase() {
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(a_desc_));
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(b_desc_));
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(c_desc_));
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(out_desc_));
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescDestroy(matmul_desc_));
}

// Set up the state shared by all cublasLt matmuls.
//
// Records M_ (= a_rows), N_ (= b_cols) and the scale type, fetches the
// cublasLt handle and shared preference for |device|, marks the cached
// heuristic as not yet computed, and creates the matmul descriptor plus the
// layouts for both operands and the output.
//
// The operands are deliberately swapped (see the comment in the body): the
// cublasLt "A" layout/transpose flag is built from the caller's B arguments
// and vice versa. c_desc_ is not created here.
void CublasMatmulBase::init_base(
    cu::Device& device,
    cudaDataType_t scale_type,
    cublasComputeType_t compute_type,
    cudaDataType_t data_type,
    cudaDataType_t output_type,
    bool a_transposed,
    uint64_t a_rows,
    uint64_t a_cols,
    int64_t lda,
    bool b_transposed,
    uint64_t b_rows,
    uint64_t b_cols,
    int64_t ldb,
    int32_t batch_count,
    int64_t a_batch_stride,
    int64_t b_batch_stride) {
  M_ = a_rows;
  N_ = b_cols;
  scale_type_ = scale_type;
  handle_ = device.get_cublaslt_handle();
  pref_ = cublas_utils::get_preference(device);
  // Forces execute_matmul() to query the heuristic on first use.
  heuristic_.state = CUBLAS_STATUS_NOT_INITIALIZED;

  CHECK_CUBLAS_ERROR(
      cublasLtMatmulDescCreate(&matmul_desc_, compute_type, scale_type));

  // In cublasLt matrices use column-major layout, while it is possible to use
  // the CUBLASLT_ORDER_ROW option to switch to row-major layout, the bias
  // epilogue does not work with the option. So instead we swap A and B to make
  // cublasLt return the row-major result, which works because:
  // - the data of a matrix in row-major layout is identical to its transpose in
  //   column-major layout
  // - C^T = (A @ B)^T = B^T @ A^T
  cublasOperation_t a_op = b_transposed ? CUBLAS_OP_T : CUBLAS_OP_N;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_TRANSA,
      &a_op,
      sizeof(cublasOperation_t)));
  cublasOperation_t b_op = a_transposed ? CUBLAS_OP_T : CUBLAS_OP_N;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_TRANSB,
      &b_op,
      sizeof(cublasOperation_t)));

  // Layouts are described with rows/cols exchanged to match the swapped,
  // column-major view of the row-major inputs.
  a_desc_ = cublas_utils::create_matrix_layout(
      data_type,
      b_cols,
      b_rows,
      b_transposed,
      ldb,
      batch_count,
      b_batch_stride);
  b_desc_ = cublas_utils::create_matrix_layout(
      data_type,
      a_cols,
      a_rows,
      a_transposed,
      lda,
      batch_count,
      a_batch_stride);
  // The output uses leading dimension b_cols and a batch stride of
  // b_cols * a_rows elements.
  out_desc_ = cublas_utils::create_matrix_layout(
      output_type, b_cols, a_rows, false, b_cols, batch_count, b_cols * a_rows);
}

// Run the configured matmul on |encoder|'s stream.
//
// |a| and |b| are the caller's operands; they are passed to cublasLtMatmul in
// swapped order to match the layouts built in init_base(). When |c| is null,
// |out| and out_desc_ are used in the C position instead. |alpha_ptr| and
// |beta_ptr| are forwarded unchanged.
//
// Throws std::runtime_error when the heuristic query returns no algorithm.
void CublasMatmulBase::execute_matmul(
    cu::CommandEncoder& encoder,
    void* out,
    const void* a,
    const void* b,
    const void* c,
    const void* alpha_ptr,
    const void* beta_ptr) {
  // Query the heuristic only while the stored result is not marked successful
  // (init_base() sets it to CUBLAS_STATUS_NOT_INITIALIZED).
  if (heuristic_.state != CUBLAS_STATUS_SUCCESS) {
    int ret = 0;
    CHECK_CUBLAS_ERROR(cublasLtMatmulAlgoGetHeuristic(
        handle_,
        matmul_desc_,
        a_desc_,
        b_desc_,
        c ? c_desc_ : out_desc_,
        out_desc_,
        pref_,
        1,
        &heuristic_,
        &ret));
    if (ret == 0) {
      throw std::runtime_error("Can not find algorithm for matmul.");
    }
  }

  void* workspace_ptr = allocate_workspace(encoder, heuristic_.workspaceSize);

  // Execute matmul
  // NOTE(review): |capture| is not referenced again; presumably it scopes a
  // capture on the encoder for the call below - confirm in CommandEncoder.
  auto capture = encoder.capture_context();
  CHECK_CUBLAS_ERROR(cublasLtMatmul(
      handle_,
      matmul_desc_,
      alpha_ptr,
      b, // a and b are swapped for row-major layout
      a_desc_,
      a,
      b_desc_,
      beta_ptr,
      c ? c : out,
      c ? c_desc_ : out_desc_,
      out,
      out_desc_,
      &heuristic_.algo,
      workspace_ptr,
      heuristic_.workspaceSize,
      encoder.stream()));
}

// Attach |bias| to the matmul through the cublasLt bias epilogue.
//
// Registers |bias| as an input of |encoder|, switches the matmul descriptor's
// epilogue to CUBLASLT_EPILOGUE_BIAS and stores the bias device pointer in the
// descriptor.
void CublasMatmulBase::set_bias(
    cu::CommandEncoder& encoder,
    const array& bias) {
  encoder.set_input_array(bias);
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_EPILOGUE,
      &epilogue,
      sizeof(epilogue)));
  // The attribute value is the pointer itself, hence &bias_ptr.
  auto* bias_ptr = gpu_ptr<void>(bias);
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_BIAS_POINTER,
      &bias_ptr,
      sizeof(bias_ptr)));
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/cublas_utils.h
================================================
// Copyright © 2025 Apple Inc.
#pragma once

#include <cublasLt.h>
#include "mlx/array.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/dtype_utils.h"

namespace mlx::core {
namespace cublas_utils {

// Get the shared cublas preference for a device
cublasLtMatmulPreference_t get_preference(cu::Device& device);

// Create a cublasLt matrix layout descriptor for a |rows| x |cols| matrix of
// element type |type| with leading dimension |ld|. |transposed| describes the
// transposed matrix instead. |batch_count| and |batch_stride| describe a
// strided batch.
cublasLtMatrixLayout_t create_matrix_layout(
    cudaDataType_t type,
    uint64_t rows,
    uint64_t cols,
    bool transposed,
    int64_t ld,
    int32_t batch_count,
    int64_t batch_stride);

// Map an MLX dtype to the matching cuBLAS data type.
//
// Supported: float16, bfloat16, float32, float64 and complex64. Any other
// dtype throws std::runtime_error; |tag| names the caller in the message.
inline cudaDataType_t dtype_to_cublas_type(Dtype dtype, std::string_view tag) {
  if (dtype == float16) {
    return CUDA_R_16F;
  }
  if (dtype == bfloat16) {
    return CUDA_R_16BF;
  }
  if (dtype == float32) {
    return CUDA_R_32F;
  }
  if (dtype == float64) {
    return CUDA_R_64F;
  }
  if (dtype == complex64) {
    return CUDA_C_32F;
  }
  throw std::runtime_error(
      fmt::format("Unsupported dtype in {}: {}.", tag, dtype_to_string(dtype)));
}

} // namespace cublas_utils

// Base class holding the cublasLt state shared by matmul implementations:
// the matmul descriptor, the operand/output layouts and the cached heuristic.
//
// Construction is protected; subclasses are expected to call init_base()
// before execute_matmul().
//
// NOTE(review): the class owns raw cublasLt descriptors and destroys them in
// its destructor, but copy operations are not deleted - confirm that
// subclasses are never copied.
class CublasMatmulBase {
 public:
  virtual ~CublasMatmulBase();

  // Enable the bias epilogue using |bias| as the bias vector.
  void set_bias(cu::CommandEncoder& encoder, const array& bias);

 protected:
  CublasMatmulBase() = default;

  // Common member variables shared by all matmul types
  // M_, N_ and scale_type_ are assigned by init_base() (M_ = a_rows,
  // N_ = b_cols) and have no default value before that.
  uint64_t M_;
  uint64_t N_;
  cudaDataType_t scale_type_;
  cublasLtMatmulPreference_t pref_{nullptr};
  cublasLtHandle_t handle_{nullptr};
  cublasLtMatmulDesc_t matmul_desc_{nullptr};
  cublasLtMatrixLayout_t a_desc_{nullptr};
  cublasLtMatrixLayout_t b_desc_{nullptr};
  // Not created by init_base(); used by execute_matmul() only when a |c|
  // operand is supplied.
  cublasLtMatrixLayout_t c_desc_{nullptr};
  cublasLtMatrixLayout_t out_desc_{nullptr};
  // Heuristic result reused across execute_matmul() calls; its state is
  // initialized by init_base().
  cublasLtMatmulHeuristicResult_t heuristic_;

  // Create the matmul descriptor and the A/B/output layouts.
  void init_base(
      cu::Device& device,
      cudaDataType_t scale_type,
      cublasComputeType_t compute_type,
      cudaDataType_t data_type,
      cudaDataType_t output_type,
      bool a_transposed,
      uint64_t a_rows,
      uint64_t a_cols,
      int64_t lda,
      bool b_transposed,
      uint64_t b_rows,
      uint64_t b_cols,
      int64_t ldb,
      int32_t batch_count,
      int64_t a_batch_stride,
      int64_t b_batch_stride);

  // Select an algorithm if needed and run the matmul on |encoder|'s stream.
  // |c| may be null, in which case |out| is used in its place.
  void execute_matmul(
      cu::CommandEncoder& encoder,
      void* out,
      const void* a,
      const void* b,
      const void* c,
      const void* alpha_ptr,
      const void* beta_ptr);
};

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/cuda.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

#include "mlx/api.h"

namespace mlx::core::cu {

/* Check if the CUDA backend is available. */
MLX_API bool is_available();

/* Get information about a CUDA device.
 *
 * Returns a reference to a map from property name to a value that is either a
 * string or a size_t. `device_index` selects the device and defaults to 0. */
MLX_API const
    std::unordered_map<std::string, std::variant<std::string, size_t>>&
    device_info(int device_index = 0);

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/cuda_utils.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <cublasLt.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>

namespace mlx::core {

// Throw exception if the cuda API does not succeed.
// |name| is the text used to identify the failing call; the macros below pass
// the stringified command.
void check_cublas_error(const char* name, cublasStatus_t err);
void check_cuda_error(const char* name, cudaError_t err);
void check_cuda_error(const char* name, CUresult err);
void check_cudnn_error(const char* name, cudnnStatus_t err);

// The macro version that prints the command that failed.
// Because |cmd| is stringified, the exact spelling of the argument expression
// becomes part of the reported name.
#define CHECK_CUBLAS_ERROR(cmd) check_cublas_error(#cmd, (cmd))
#define CHECK_CUDA_ERROR(cmd) check_cuda_error(#cmd, (cmd))
#define CHECK_CUDNN_ERROR(cmd) check_cudnn_error(#cmd, (cmd))

// Move-only owner of a CUDA handle that is released with |Destroy|.
//
// A null handle means "owns nothing". Moving transfers the handle and leaves
// the source empty. The handle converts implicitly to the raw |Handle| type.
template <typename Handle, cudaError_t (*Destroy)(Handle)>
class CudaHandle {
 public:
  CudaHandle(Handle handle = nullptr) : handle_(handle) {}

  // Copying would lead to the same handle being destroyed twice.
  CudaHandle(const CudaHandle&) = delete;
  CudaHandle& operator=(const CudaHandle&) = delete;

  CudaHandle(CudaHandle&& other) : handle_(other.handle_) {
    assert(this != &other);
    other.handle_ = nullptr;
  }

  CudaHandle& operator=(CudaHandle&& other) {
    assert(this != &other);
    // Release what we hold first, then take over the other handle.
    reset();
    handle_ = other.handle_;
    other.handle_ = nullptr;
    return *this;
  }

  ~CudaHandle() {
    // Only release when no CUDA error is pending, so that the error check in
    // reset() cannot throw from the destructor because of an earlier failure.
    if (cudaPeekAtLastError() == cudaSuccess) {
      reset();
    }
  }

  // Destroy the owned handle, if any, and become empty.
  void reset() {
    if (handle_ == nullptr) {
      return;
    }
    CHECK_CUDA_ERROR(Destroy(handle_));
    handle_ = nullptr;
  }

  operator Handle() const {
    return handle_;
  }

 protected:
  Handle handle_;
};

namespace cu {
class Device;
}; // namespace cu

// Wrappers of CUDA resources.
// Owning wrapper around cudaGraph_t, released with cudaGraphDestroy.
class CudaGraph : public CudaHandle<cudaGraph_t, cudaGraphDestroy> {
 public:
  // Also allow construction from an existing (or null) cudaGraph_t.
  using CudaHandle::CudaHandle;
  explicit CudaGraph(cu::Device& device);
  void end_capture(cudaStream_t stream);
};

// Owning wrapper around cudaGraphExec_t, released with cudaGraphExecDestroy.
// The base constructors are not inherited, so it starts out empty.
class CudaGraphExec : public CudaHandle<cudaGraphExec_t, cudaGraphExecDestroy> {
 public:
  void instantiate(cudaGraph_t graph);
};

// Owning wrapper around cudaStream_t, released with cudaStreamDestroy.
class CudaStream : public CudaHandle<cudaStream_t, cudaStreamDestroy> {
 public:
  // Also allow construction from an existing (or null) cudaStream_t.
  using CudaHandle::CudaHandle;
  explicit CudaStream(cu::Device& device);
};

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/cudnn_utils.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/cudnn_utils.h"
#include "mlx/backend/cuda/device.h"

namespace mlx::core {

namespace {

// Evaluate |cmd| exactly once and, if the result reports failure through
// is_bad(), return it from the enclosing function.
//
// The body is wrapped in do { } while (0) so that the macro expands to a
// single statement: it must be followed by ';' and can be used safely as the
// body of an unbraced if/else.
#define RETURN_IF_ERROR(cmd)              \
  do {                                    \
    if (auto ret = (cmd); ret.is_bad()) { \
      return ret;                         \
    }                                     \
  } while (0)

// In MLX a singleton dim (shape[dim] == 1) can have any stride, but in cuDNN
// whether a tensor is contiguous is determined with:
// shape[dim] == shape[dim + 1] * strides[dim + 1]
// So a contiguous array with singleton dims in MLX may be mistakenly treated
// as strided in cuDNN, and we work around it by normalizing the strides.
//
// Returns a copy of |x|'s strides with the following adjustments:
// - no strides at all (0-dim array): returned unchanged (empty);
// - all strides are 0: the last stride is set to 1;
// - row-contiguous with at least 2 dims: the stride of every singleton dim
//   except the last is recomputed from the dim to its right.
// Any other array gets its strides back unchanged.
std::vector<int64_t> normalized_strides(const array& x) {
  std::vector<int64_t> strides(x.strides().begin(), x.strides().end());
  // std::all_of is true for an empty range, so without this guard a 0-dim
  // array would reach strides.back() on an empty vector.
  if (strides.empty()) {
    return strides;
  }
  if (std::all_of(
          strides.begin(), strides.end(), [](int64_t s) { return s == 0; })) {
    strides.back() = 1;
    return strides;
  }
  if (!x.flags().row_contiguous || x.ndim() < 2) {
    return strides;
  }
  // Walk right-to-left so each recomputed stride sees the already-normalized
  // stride of its right neighbour.
  for (int i = x.ndim() - 2; i >= 0; --i) {
    if (x.shape(i) == 1) {
      strides[i] = x.shape(i + 1) * strides[i + 1];
    }
  }
  return strides;
}

// Return the shape and strides after transposing from NHWC to NCHW.
//
// The result is a tuple (shape, strides) of int64_t vectors in which the last
// entry of each has been moved to index 1; the strides are normalized first.
inline auto nhwc_to_nchw(const array& x) {
  // Moves the trailing entry to index 1, shifting the entries in between one
  // slot to the right.
  auto last_to_second = [](std::vector<int64_t>& dims) {
    if (dims.size() > 1) {
      std::rotate(dims.begin() + 1, dims.end() - 1, dims.end());
    }
  };

  auto shape = convert_vector<int64_t>(x.shape());
  auto strides = normalized_strides(x);
  assert(shape.size() >= 3);
  last_to_second(shape);
  last_to_second(strides);
  return std::make_tuple(std::move(shape), std::move(strides));
}

} // namespace

// Validate the graph, build its operation graph and create execution plans
// using heuristic mode A.
//
// Returns the first failing step's error, or a default-constructed error_t
// when every step succeeds. A cudnnException thrown while building the
// operation graph is converted to CUDNN_BACKEND_API_FAILED.
fe::error_t DnnGraph::prepare() {
  if (auto status = validate(); status.is_bad()) {
    return status;
  }
  try {
    if (auto status = build_operation_graph(handle_); status.is_bad()) {
      return status;
    }
  } catch (cudnn_frontend::cudnnException& error) {
    // cuDNN bug: they did not catch all exceptions in the API.
    return {fe::error_code_t::CUDNN_BACKEND_API_FAILED, error.what()};
  }
  if (auto status = create_execution_plans({fe::HeurMode_t::A});
      status.is_bad()) {
    return status;
  }
  return {};
}

// Check that the graph is supported and build its execution plans.
//
// Returns the first failing step's error, or a default-constructed error_t
// when both steps succeed.
fe::error_t DnnGraph::build() {
  if (auto status = check_support(handle_); status.is_bad()) {
    return status;
  }
  if (auto status = build_plans(handle_); status.is_bad()) {
    return status;
  }
  return {};
}

// Add this cuDNN graph to |encoder| as a child CUDA graph node.
//
// The CUDA graph is built once and cached; later calls only patch the data
// pointers from |variant_pack| and the workspace pointer. Returns the error of
// the populate/update step if it fails, otherwise a default-constructed
// error_t.
fe::error_t DnnGraph::encode_graph(
    cu::CommandEncoder& encoder,
    std::unordered_map<int64_t, void*> variant_pack) {
  cudnnSetStream(handle_, encoder.stream());
  auto* workspace_ptr = prepare_workspace(encoder);
  if (!cached_cuda_graph_) {
    // First call: populate the CUDA graph from the cuDNN execution plan.
    // Also compute and cache the subgraph key to avoid calling
    // cudaGraphKernelNodeGetAttribute on every subsequent call (expensive
    // on WDDM where each driver API call has ~40-400us overhead).
    cached_cuda_graph_.emplace(encoder.device());
    if (auto status = populate_cuda_graph(
            handle_, variant_pack, workspace_ptr, *cached_cuda_graph_);
        status.is_bad()) {
      // Drop the unpopulated graph. Otherwise the next call would see a cached
      // graph, skip population and try to update a graph that was never built,
      // with an empty cached subgraph key.
      cached_cuda_graph_.reset();
      return status;
    }
    std::tie(cached_subgraph_key_, cached_is_updatable_) =
        cu::subgraph_to_key(*cached_cuda_graph_);
  } else {
    // Subsequent calls: patch data pointers without re-running kernel setup.
    if (auto status = update_cuda_graph(
            handle_, variant_pack, workspace_ptr, *cached_cuda_graph_);
        status.is_bad()) {
      return status;
    }
  }
  // Add the cuDNN child graph to the parent CUDA graph for batched launch.
  // The pre-computed subgraph key avoids expensive per-node attribute queries.
  encoder.add_graph_node(
      *cached_cuda_graph_, cached_subgraph_key_, cached_is_updatable_);
  return {};
}

// Execute the cuDNN graph on |encoder|'s stream inside a capture context.
//
// The workspace is prepared before the capture context is created. The result
// of execute() is returned as-is; on failure the capture is flagged with
// discard = true.
fe::error_t DnnGraph::encode_capturing(
    cu::CommandEncoder& encoder,
    std::unordered_map<int64_t, void*> variant_pack) {
  auto* workspace_ptr = prepare_workspace(encoder);
  auto capture = encoder.capture_context();
  cudnnSetStream(handle_, encoder.stream());
  auto ret = execute(handle_, variant_pack, workspace_ptr);
  if (ret.is_bad()) {
    // NOTE(review): presumably makes the capture context drop whatever was
    // captured - confirm against CommandEncoder's capture context.
    capture.discard = true;
  }
  return ret;
}

// Query the workspace size required by this graph and return a workspace of
// that size obtained through allocate_workspace() for |encoder|.
// Throws std::runtime_error (via CHECK_CUDNN_FE_ERROR) if the size query
// fails.
void* DnnGraph::prepare_workspace(cu::CommandEncoder& encoder) {
  int64_t workspace_size = 0;
  CHECK_CUDNN_FE_ERROR(get_workspace_size(workspace_size));
  return allocate_workspace(encoder, workspace_size);
}

// Fill |tensor| with the given |uid|, explicit |shape| and |strides|, and the
// pointer alignment and data type derived from |x|.
void DnnGraph::set_tensor_attrs(
    std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
    int64_t uid,
    const array& x,
    const std::vector<int64_t>& shape,
    const std::vector<int64_t>& strides) {
  auto& attrs = *tensor;
  attrs.set_uid(uid);
  attrs.set_alignment(get_alignment(x));
  attrs.set_data_type(dtype_to_cudnn_type(x.dtype()));
  attrs.set_dim(shape);
  attrs.set_stride(strides);
}

// Fill |tensor| from |x| using x's own shape and its normalized strides.
void DnnGraph::set_tensor_attrs(
    std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
    int64_t uid,
    const array& x) {
  const auto dims = convert_vector<int64_t>(x.shape());
  const auto strides = normalized_strides(x);
  set_tensor_attrs(tensor, uid, x, dims, strides);
}

// Fill |tensor| from |x| with its shape and strides transposed from NHWC to
// NCHW.
void DnnGraph::set_tensor_attrs_nchw(
    std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
    int64_t uid,
    const array& x) {
  const auto layout = nhwc_to_nchw(x);
  set_tensor_attrs(tensor, uid, x, std::get<0>(layout), std::get<1>(layout));
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/cudnn_utils.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <cassert>
#include <optional>

#include "mlx/backend/cuda/cuda_utils.h"
#include "mlx/backend/cuda/device/config.h"
#include "mlx/backend/cuda/utils.h"
#include "mlx/dtype_utils.h"

#include <cudnn_frontend.h>
#include <fmt/format.h>

namespace mlx::core {

namespace cu {
class CommandEncoder;
}

namespace fe = cudnn_frontend;

// Evaluate |cmd| once and throw std::runtime_error if the resulting cuDNN
// frontend error is not good. The message contains the stringified command
// followed by the error's own message.
#define CHECK_CUDNN_FE_ERROR(cmd)                                    \
  do {                                                               \
    auto error = cmd;                                                \
    if (!error.is_good()) {                                          \
      throw std::runtime_error(                                      \
          fmt::format("{} failed: {}.", #cmd, error.get_message())); \
    }                                                                \
  } while (0)

// Return pointer alignment of |x|'s data.
//
// The result is the largest power of two, capped at 32, that divides the
// device address of |x|.
inline uint8_t get_alignment(const array& x) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(gpu_ptr<void>(x));
  uint8_t alignment = 1;
  // Keep doubling while the next power of two still divides the address.
  while (alignment < 32 && address % (alignment * 2) == 0) {
    alignment *= 2;
  }
  return alignment;
}

// Convert the type of elements in |vec| to |T|.
//
// Returns a std::vector<T> holding one converted element per element of
// |vec|, in the same order.
template <typename T, typename Vec>
inline std::vector<T> convert_vector(const Vec& vec) {
  std::vector<T> converted;
  converted.assign(vec.begin(), vec.end());
  return converted;
}

// Map dtype to cudnn data type.
//
// Supported: int8, int32, uint8, float16, bfloat16, float32 and float64.
// Any other dtype throws std::runtime_error.
inline fe::DataType_t dtype_to_cudnn_type(Dtype dtype) {
  if (dtype == int8) {
    return fe::DataType_t::INT8;
  }
  if (dtype == int32) {
    return fe::DataType_t::INT32;
  }
  if (dtype == uint8) {
    return fe::DataType_t::UINT8;
  }
  if (dtype == float16) {
    return fe::DataType_t::HALF;
  }
  if (dtype == bfloat16) {
    return fe::DataType_t::BFLOAT16;
  }
  if (dtype == float32) {
    return fe::DataType_t::FLOAT;
  }
  if (dtype == float64) {
    return fe::DataType_t::DOUBLE;
  }
  throw std::runtime_error(
      fmt::format("Unsupported dtype in cuDNN: {}.", dtype_to_string(dtype)));
}

// Return an array that can be used as map key for |vec| with size <= MAX_NDIM.
//
// There are 2 differences from the const_param util from kernel_utils.cuh:
// 1. The rest of array is filled with 0.
// 2. This util can be used in .cpp files.
//
// Throws std::runtime_error when |vec| holds more than NDIM elements.
template <int NDIM = MAX_NDIM, typename Vec>
inline std::array<typename Vec::value_type, NDIM> vector_key(const Vec& vec) {
  using Key = std::array<typename Vec::value_type, NDIM>;
  const auto count = vec.size();
  if (count > NDIM) {
    throw std::runtime_error(
        fmt::format("ndim can not be larger than {}.", NDIM));
  }
  // Value-initialized, so slots past |count| stay zero.
  Key key{};
  std::copy(vec.begin(), vec.end(), key.begin());
  return key;
}

// Extends cuDNN graph with helpers.
//
// Adds helpers for describing MLX arrays as cuDNN tensors and for encoding the
// built graph into a cu::CommandEncoder. The cuDNN handle passed to the
// constructor is stored and used for every cuDNN call.
class DnnGraph : public fe::graph::Graph {
 public:
  // |io_dtype| is used for the graph's IO data type; |compute_dtype| is used
  // for both the intermediate and the compute data type.
  DnnGraph(cudnnHandle_t handle, Dtype io_dtype, Dtype compute_dtype = float32)
      : handle_(handle) {
    set_io_data_type(dtype_to_cudnn_type(io_dtype));
    set_intermediate_data_type(dtype_to_cudnn_type(compute_dtype));
    set_compute_data_type(dtype_to_cudnn_type(compute_dtype));
  }

  // Create a cuDNN tensor description from MLX array |x|.
  // This overload fills existing |attrs| and returns a reference to it.
  auto& tensor(
      std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
      int64_t uid,
      const array& x) {
    set_tensor_attrs(attrs, uid, x);
    return attrs;
  }
  // This overload creates a new named tensor in the graph first.
  auto tensor(const char* name, int64_t uid, const array& x) {
    auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
    tensor(attrs, uid, x);
    return attrs;
  }

  // Create a cuDNN tensor description from MLX array |x|, and transpose it from
  // NHWC layout to NCHW.
  auto& tensor_nchw(
      std::shared_ptr<fe::graph::Tensor_attributes>& attrs,
      int64_t uid,
      const array& x) {
    set_tensor_attrs_nchw(attrs, uid, x);
    return attrs;
  }
  auto tensor_nchw(const char* name, int64_t uid, const array& x) {
    auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
    tensor_nchw(attrs, uid, x);
    return attrs;
  }

  // Create a 4D cuDNN tensor from 1D array, with |axis| being contiguous dim.
  // All dims are 1 except |axis|, which is x.size(). All strides are 1 except
  // the one just before |axis| (when axis > 0), which is x.size().
  auto tensor_4d(const char* name, int64_t uid, const array& x, int axis) {
    assert(x.ndim() == 1);
    auto attrs = Graph::tensor(fe::graph::Tensor_attributes().set_name(name));
    std::vector<int64_t> shape(4, 1);
    std::vector<int64_t> strides(4, 1);
    // at() throws std::out_of_range for an |axis| outside [0, 3].
    shape.at(axis) = x.size();
    if (axis > 0) {
      strides.at(axis - 1) = x.size();
    }
    set_tensor_attrs(attrs, uid, x, shape, strides);
    return attrs;
  }

  // Create a cuDNN tensor for scalar.
  // The tensor has shape and strides {1, 1, 1, 1} and is passed by value.
  auto scalar(const char* name, int64_t uid, Dtype dtype) {
    return Graph::tensor(
        fe::graph::Tensor_attributes()
            .set_name(name)
            .set_uid(uid)
            .set_dim({1, 1, 1, 1})
            .set_stride({1, 1, 1, 1})
            .set_is_pass_by_value(true)
            .set_data_type(dtype_to_cudnn_type(dtype)));
  }

  // Call this before setting notes.
  fe::error_t prepare();
  // Call this after setting notes.
  fe::error_t build();

  // Add cuDNN graph to CUDA graph, using native CUDA graph API.
  // |variant_pack| maps tensor uids to data pointers.
  fe::error_t encode_graph(
      cu::CommandEncoder& encoder,
      std::unordered_map<int64_t, void*> variant_pack);
  // Add cuDNN graph to CUDA graph, using stream capture.
  fe::error_t encode_capturing(
      cu::CommandEncoder& encoder,
      std::unordered_map<int64_t, void*> variant_pack);

 private:
  // Allocate the workspace this graph needs for |encoder|.
  void* prepare_workspace(cu::CommandEncoder& encoder);

  // Fill |tensor| with explicit |shape| and |strides|; alignment and data type
  // come from |x|.
  void set_tensor_attrs(
      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
      int64_t uid,
      const array& x,
      const std::vector<int64_t>& shape,
      const std::vector<int64_t>& strides);
  // Fill |tensor| using the shape and strides of |x| itself.
  void set_tensor_attrs(
      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
      int64_t uid,
      const array& x);
  // Fill |tensor| using the shape and strides of |x| transposed to NCHW.
  void set_tensor_attrs_nchw(
      std::shared_ptr<fe::graph::Tensor_attributes>& tensor,
      int64_t uid,
      const array& x);

  cudnnHandle_t handle_;
  // State cached by encode_graph(): the populated CUDA graph plus the key and
  // updatable flag computed for it on the first call.
  std::optional<CudaGraph> cached_cuda_graph_;
  std::string cached_subgraph_key_;
  bool cached_is_updatable_{true};
};

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/custom_kernel.cpp
================================================
// Copyright © 2025 Apple Inc.

#include <iostream>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/cuda/jit_module.h"
#include "mlx/backend/cuda/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/fast.h"
#include "mlx/fast_primitives.h"

#include <fmt/format.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core::fast {

namespace {

// Prologue placed at the very top of every generated custom kernel source,
// before the user-supplied header. It pulls in the MLX device utilities and
// cooperative groups, and defines `inf` as float infinity for use in kernel
// bodies.
constexpr const char* default_header = R"(
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>

#define inf cuda::std::numeric_limits<float>::infinity()

)";

// Build a suffix that distinguishes kernels instantiated with different
// template argument values.
//
// Each argument contributes "_<value>": ints as decimal digits, bools as "t"
// or "f", dtypes as their type string. Argument names are not part of the
// suffix. Returns an empty string when there are no template arguments.
//
// The suffix is appended to the kernel function name, so it must stay a valid
// identifier: negative ints are written as "n<magnitude>" instead of using a
// '-' sign.
std::string template_arguments_hash(
    const std::vector<std::pair<std::string, TemplateArg>>& template_args) {
  if (template_args.empty()) {
    return "";
  }

  std::string hash;
  hash.reserve(512);

  for (const auto& [name, arg] : template_args) {
    if (std::holds_alternative<int>(arg)) {
      // Widen before negating so that the most negative int is representable.
      const long long value = std::get<int>(arg);
      if (value < 0) {
        hash += fmt::format("_n{}", -value);
      } else {
        hash += fmt::format("_{}", value);
      }
    } else if (std::holds_alternative<bool>(arg)) {
      hash += (std::get<bool>(arg)) ? "_t" : "_f";
    } else if (std::holds_alternative<Dtype>(arg)) {
      hash += "_";
      hash += get_type_string(std::get<Dtype>(arg));
    }
  }

  return hash;
}

std::string build_kernel(
    const std::string& func_name,
    const std::string& header,
    const std::string& source,
    const std::vector<std::string>& input_names,
    const std::vector<array>& inputs,
    const std::vector<std::string>& output_names,
    const std::vector<Dtype>& output_dtypes,
    const std::vector<std::pair<std::string, TemplateArg>>& template_args,
    const std::vector<std::tuple<bool, bool, bool>>& shape_infos) {
  std::string kernel_source;
  kernel_source.reserve(header.size() + source.size() + 8192);
  kernel_source += default_header;
  kernel_source += header;
  kernel_source +=
      "namespace mlx::core::cu {\n\n"
      "namespace cg = cooperative_groups;\n\n";

  kernel_source += "__global__ void ";
  kernel_source += func_name;
  kernel_source += "(\n";

  // Add inputs
  for (int i = 0; i < inputs.size(); ++i) {
    const auto& name = input_names[i];
    const auto& arr = inputs[i];
    kernel_source += "    const ";
    kernel_source += dtype_to_cuda_type(arr.dtype());
    kernel_source += "* ";
    kernel_source += name;
    kernel_source += ",\n";
    // Add input shape, strides and ndim if present in the source
    if (arr.ndim() > 0) {
      if (std::get<0>(shape_infos[i])) {
        kernel_source += "    const __grid_constant__ Shape ";
        kernel_source += name;
        kernel_source += "_shape,\n";
      }
      if (std::get<1>(shape_infos[i])) {
        kernel_source += "    const __grid_constant__ Strides ";
        kernel_source += name;
        kernel_source += "_strides,\n";
      }
      if (std::get<2>(shape_infos[i])) {
        kernel_source += "    const __grid_constant__ int ";
        kernel_source += name;
        kernel_source += "_ndim,\n";
      }
    }
  }

  // Add outputs
  for (int i = 0; i < output_names.size(); ++i) {
    const auto& name = output_names[i];
    const auto& dtype = output_dtypes[i];
    kernel_source += "    ";
    kernel_source += dtype_to_cuda_type(dtype);
    kernel_source += "* ";
    kernel_source += name;
    if (i < output_names.size() - 1) {
      kernel_source += ",\n";
    } else {
      kernel_source += ") {\n";
    }
  }

  // Set compile time constants
  if (!template_args.empty()) {
    for (const auto& [name, arg] : template_args) {
      if (std::holds_alternative<int>(arg)) {
        kernel_source +=
            fmt::format("  constexpr int {} = {};\n", name, std::get<int>(arg));
      } else if (std::holds_alternative<bool>(arg)) {
        kernel_source += fmt::format(
            "  constexpr bool {} = {};\n", name, std::get<bool>(arg));
      } else {
        kernel_source += fmt::format(
            "  using {} = {};\n",
            name,
            dtype_to_cuda_type(std::get<Dtype>(arg)));
      }
    }
    kernel_source += "\n";
  }

  kernel_source += source;
  kernel_source += "\n}\n\n} // namespace mlx::core::cu\n";

  return kernel_source;
}

} // namespace

// Create a callable that builds and schedules a JIT-compiled custom CUDA
// kernel.
//
// |source| is the kernel body and |header| is extra code placed before the
// kernel. For every input name the source is scanned for "<name>_shape",
// "<name>_strides" and "<name>_ndim"; the matching parameters are added to the
// generated kernel signature only when they are mentioned.
//
// The returned function validates that |inputs|, |output_shapes| and
// |output_dtypes| match the declared names in size, requires a GPU stream,
// generates the kernel source (printing it when |verbose| is set) and returns
// the output arrays of a CustomKernel primitive.
//
// Throws std::invalid_argument when |output_names| is empty, when a size does
// not match, or when the stream is not on the GPU.
CustomKernelFunction cuda_kernel(
    const std::string& name,
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::string& source,
    const std::string& header,
    bool ensure_row_contiguous,
    int shared_memory) {
  if (output_names.empty()) {
    throw std::invalid_argument(
        "[custom_kernel] Must specify at least one output.");
  }

  // For each input: does the source refer to its shape / strides / ndim?
  std::vector<std::tuple<bool, bool, bool>> shape_infos;
  for (const auto& input_name : input_names) {
    auto mentions = [&](const char* suffix) {
      return source.find(input_name + suffix) != std::string::npos;
    };
    const bool uses_shape = mentions("_shape");
    const bool uses_strides = mentions("_strides");
    const bool uses_ndim = mentions("_ndim");
    shape_infos.emplace_back(uses_shape, uses_strides, uses_ndim);
  }

  return [=, shape_infos = std::move(shape_infos)](
             const std::vector<array>& inputs,
             const std::vector<Shape>& output_shapes,
             const std::vector<Dtype>& output_dtypes,
             std::tuple<int, int, int> grid,
             std::tuple<int, int, int> threadgroup,
             const std::vector<std::pair<std::string, TemplateArg>>&
                 template_args = {},
             std::optional<float> init_value = std::nullopt,
             bool verbose = false,
             StreamOrDevice s_ = {}) {
    // Throws when the argument called |what| does not have |expected| entries.
    auto require_size = [](const char* what, size_t expected, size_t actual) {
      if (actual == expected) {
        return;
      }
      std::ostringstream msg;
      msg << "[custom_kernel] Expected `" << what << "` to have size "
          << expected << " but got size " << actual << "." << std::endl;
      throw std::invalid_argument(msg.str());
    };
    require_size("inputs", input_names.size(), inputs.size());
    require_size("output_shapes", output_names.size(), output_shapes.size());
    require_size("output_dtypes", output_names.size(), output_dtypes.size());

    auto s = to_stream(s_);
    if (s.device != Device::gpu) {
      throw std::invalid_argument("[custom_kernel] Only supports the GPU.");
    }

    // The template argument values are part of the kernel name.
    std::string kernel_name =
        "custom_kernel_" + name + template_arguments_hash(template_args);
    std::string kernel_source = build_kernel(
        kernel_name,
        header,
        source,
        input_names,
        inputs,
        output_names,
        output_dtypes,
        template_args,
        shape_infos);

    if (verbose) {
      std::cout << "Generated source code for `" << kernel_name
                << "`:" << std::endl
                << "```" << std::endl
                << kernel_source << std::endl
                << "```" << std::endl;
    }

    auto primitive = std::make_shared<CustomKernel>(
        s,
        std::move(kernel_name),
        std::move(kernel_source),
        grid,
        threadgroup,
        shape_infos,
        ensure_row_contiguous,
        init_value,
        std::vector<ScalarArg>{},
        false,
        shared_memory);
    return array::make_arrays(
        output_shapes, output_dtypes, std::move(primitive), inputs);
  };
}

// Schedule an already compiled CUDA kernel as a CustomKernel primitive and
// return its output arrays.
//
// No shape, strides or ndim parameters are requested for any input; values in
// |scalars| are forwarded to the primitive, which is created with its
// precompiled flag set.
std::vector<array> precompiled_cuda_kernel(
    const std::string& name,
    const std::string& compiled_source,
    const std::vector<array>& inputs,
    const std::vector<Shape>& output_shapes,
    const std::vector<Dtype>& output_dtypes,
    const std::vector<ScalarArg>& scalars,
    std::tuple<int, int, int> grid,
    std::tuple<int, int, int> threadgroup,
    int shared_memory,
    std::optional<float> init_value,
    bool ensure_row_contiguous,
    StreamOrDevice s) {
  const std::vector<std::tuple<bool, bool, bool>> no_shape_infos(
      inputs.size(), std::make_tuple(false, false, false));
  auto primitive = std::make_shared<CustomKernel>(
      to_stream(s),
      name,
      compiled_source,
      grid,
      threadgroup,
      no_shape_infos,
      ensure_row_contiguous,
      init_value,
      scalars,
      true,
      shared_memory);
  return array::make_arrays(
      output_shapes, output_dtypes, std::move(primitive), inputs);
}

// Evaluates the custom kernel on the GPU stream of this primitive.
//
// Steps, in order:
//   1. allocate (or fill with `init_value_`) every output,
//   2. optionally copy non row-contiguous inputs into contiguous temporaries,
//   3. fetch the JIT module for the kernel,
//   4. pack the kernel arguments (inputs, outputs, then scalars),
//   5. derive the launch dimensions and add the kernel node to the encoder.
void CustomKernel::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("CustomKernel::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Temporaries created here; all of them are registered with the encoder
  // via add_temporary() before the launch.
  std::vector<array> copies;

  // Allocate and initialize the output arrays
  for (auto& out : outputs) {
    if (init_value_) {
      // A scalar array holding the init value is used as the fill source.
      copies.emplace_back(init_value_.value(), out.dtype());
      fill_gpu(copies.back(), out, s);
    } else {
      out.set_data(cu::malloc_async(out.nbytes(), encoder));
    }
  }

  // Create the input arrays and copy if needed
  // An input is used as-is unless `ensure_row_contiguous_` is set and the
  // input is not row contiguous, in which case a general copy is made.
  auto check_input = [&copies, &s, this](const array& x) -> const array {
    bool no_copy = x.flags().row_contiguous;
    if (!ensure_row_contiguous_ || no_copy) {
      return x;
    } else {
      copies.push_back(array(x.shape(), x.dtype(), nullptr, {}));
      copy_gpu(x, copies.back(), CopyType::General, s);
      return copies.back();
    }
  };
  std::vector<array> checked_inputs;
  for (const array& in : inputs) {
    checked_inputs.push_back(check_input(in));
  }

  // Compile the custom kernel
  // Precompiled kernels are looked up by their plain name; otherwise the
  // name is qualified with the `mlx::core::cu::` namespace.
  std::string kernel_name =
      (is_precompiled_) ? name_ : "mlx::core::cu::" + name_;
  cu::JitModule& mod = cu::get_jit_module(
      s.device,
      name_,
      [&]() {
        return std::make_tuple(
            is_precompiled_, source_, std::vector{kernel_name});
      },
      false);

  // Make the arguments
  // Per input: the array itself, then (each only if requested in
  // shape_infos_) its shape, its strides and its ndim as int32.
  cu::KernelArgs args;
  for (int i = 0; i < checked_inputs.size(); i++) {
    const array& in = checked_inputs[i];
    auto& shape_info = shape_infos_[i];
    args.append(in);
    if (std::get<0>(shape_info)) {
      args.append_ndim(in.shape());
    }
    if (std::get<1>(shape_info)) {
      args.append_ndim(in.strides());
    }
    if (std::get<2>(shape_info)) {
      args.append<int32_t>(in.ndim());
    }
  }
  for (auto& out : outputs) {
    args.append(out);
  }
  // Scalars come last. Note that `s` below shadows the stream `s` above.
  for (auto& s : scalar_arguments_) {
    if (std::holds_alternative<bool>(s)) {
      args.append(std::get<bool>(s));
    } else if (std::holds_alternative<int>(s)) {
      args.append(std::get<int>(s));
    } else if (std::holds_alternative<float>(s)) {
      args.append(std::get<float>(s));
    }
  }

  // Make the grid
  // The block is the threadgroup clamped to grid_ per dimension, and the
  // number of blocks is ceil(grid_ / threadgroup_) per dimension.
  // NOTE(review): this implies grid_ counts threads, not blocks -- confirm.
  const auto [tx, ty, tz] = threadgroup_;
  const auto [gx, gy, gz] = grid_;
  dim3 block(std::min(tx, gx), std::min(ty, gy), std::min(tz, gz));
  dim3 grid((gx + tx - 1) / tx, (gy + ty - 1) / ty, (gz + tz - 1) / tz);

  // Call the kernel
  for (const auto& in : checked_inputs) {
    encoder.set_input_array(in);
  }
  for (const auto& out : outputs) {
    encoder.set_output_array(out);
  }
  for (const auto& t : copies) {
    encoder.add_temporary(t);
  }
  // The callback raises the kernel's maximum dynamic shared memory size when
  // more than 48000 bytes are requested (`smem > 0` is implied by that test).
  auto kernel =
      mod.get_kernel(kernel_name, [smem = shared_memory_](CUfunction kernel) {
        if (smem > 0 && smem > 48000) {
          cuFuncSetAttribute(
              kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem);
        }
      });
  encoder.add_kernel_node_raw(
      kernel, grid, block, {}, shared_memory_, args.args());
}

} // namespace mlx::core::fast


================================================
FILE: mlx/backend/cuda/cutlass_utils.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/dtype.h"

#include <cutlass/bfloat16.h>
#include <cutlass/half.h>
#include <fmt/format.h>

namespace mlx::core {

// Throw exception if the cutlass API does not succeed.
inline void check_cutlass_error(const char* name, cutlass::Status status) {
  if (status != cutlass::Status::kSuccess) {
    throw std::runtime_error(
        fmt::format(
            "{} failed with code: {}.",
            name,
            cutlass::cutlassGetStatusString(status)));
  }
}

// The macro version that prints the command that failed: the stringified
// expression `cmd` is passed as the name and `cmd` itself is evaluated once
// to obtain the status.
#define CHECK_CUTLASS_ERROR(cmd) ::mlx::core::check_cutlass_error(#cmd, (cmd))

// Maps CPU types to CUTLASS types.
// The primary template is the identity mapping; the specializations below
// override it for the two 16-bit floating point types.
template <typename T>
struct CTypeToCutlassType {
  using type = T;
};

// float16_t maps to cutlass::half_t.
template <>
struct CTypeToCutlassType<float16_t> {
  using type = cutlass::half_t;
};

// bfloat16_t maps to cutlass::bfloat16_t.
template <>
struct CTypeToCutlassType<bfloat16_t> {
  using type = cutlass::bfloat16_t;
};

// Convenience alias for CTypeToCutlassType<T>::type.
template <typename T>
using cutlass_type_t = typename CTypeToCutlassType<T>::type;

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/delayload.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/common/utils.h"

// clang-format off
#include <windows.h> // must be included first
#include <delayimp.h>
// clang-format on

namespace mlx::core {

namespace fs = std::filesystem;

// Resolves `relative` against the directory returned by current_binary_dir()
// and returns the result as an absolute path.
inline fs::path relative_to_current_binary(const char* relative) {
  const fs::path joined = current_binary_dir() / relative;
  return fs::absolute(joined);
}

// Returns the directory searched for the cuBLAS DLLs: MLX_CUDA_BIN_DIR when
// that macro is defined at build time, otherwise a path relative to the
// current binary.
inline fs::path cublas_bin_dir() {
#if defined(MLX_CUDA_BIN_DIR)
  return MLX_CUDA_BIN_DIR;
#else
  return relative_to_current_binary("../nvidia/cublas/bin");
#endif
}

// Registers the NVRTC binary directory as a DLL search directory and returns
// it. The directory is MLX_CUDA_BIN_DIR when that macro is defined, otherwise
// a path relative to the current binary. Note that no DLL is loaded here.
fs::path load_nvrtc() {
#if defined(MLX_CUDA_BIN_DIR)
  fs::path nvrtc_bin_dir = MLX_CUDA_BIN_DIR;
#else
  fs::path nvrtc_bin_dir =
      relative_to_current_binary("../nvidia/cuda_nvrtc/bin");
#endif
  // Internally nvrtc loads some libs dynamically, add to search dirs.
  ::AddDllDirectory(nvrtc_bin_dir.c_str());
  return nvrtc_bin_dir;
}

// Prepares the process for loading cuDNN and returns the cuDNN binary
// directory (MLX_CUDNN_BIN_DIR when defined, otherwise a path relative to the
// current binary). The first `cudnn_graph*.dll` found in that directory is
// loaded eagerly, then the NVRTC, cuDNN and cuBLAS directories are added to
// the DLL search directories.
fs::path load_cudnn() {
#if defined(MLX_CUDNN_BIN_DIR)
  fs::path cudnn_bin_dir = MLX_CUDNN_BIN_DIR;
#else
  fs::path cudnn_bin_dir = relative_to_current_binary("../nvidia/cudnn/bin");
#endif
  // Must load cudnn_graph64_9.dll before locating symbols, otherwise we would
  // get errors like "Invalid handle. Cannot load symbol cudnnCreate".
  // NOTE(review): directory_iterator throws if the directory is missing.
  for (const auto& dll : fs::directory_iterator(cudnn_bin_dir)) {
    if (dll.path().filename().string().starts_with("cudnn_graph") &&
        dll.path().extension() == ".dll") {
      ::LoadLibraryW(dll.path().c_str());
      break;
    }
  }
  // Internally cuDNN loads some libs dynamically, add to search dirs.
  load_nvrtc();
  ::AddDllDirectory(cudnn_bin_dir.c_str());
  ::AddDllDirectory(cublas_bin_dir().c_str());
  return cudnn_bin_dir;
}

// Delay-load notification hook. Only the pre-LoadLibrary notification is
// handled: DLLs whose name starts with "cudnn", "cublas" or "nvrtc" are loaded
// from their dedicated directories. For every other notification or DLL name
// a null handle is returned.
FARPROC WINAPI delayload_helper(unsigned dliNotify, PDelayLoadInfo pdli) {
  HMODULE handle = NULL;
  if (dliNotify != dliNotePreLoadLibrary) {
    return reinterpret_cast<FARPROC>(handle);
  }

  const std::string dll_name = pdli->szDll;
  if (dll_name.starts_with("cudnn")) {
    // One-time setup of the cuDNN search paths on first use.
    static auto cudnn_bin_dir = load_cudnn();
    handle = ::LoadLibraryW((cudnn_bin_dir / dll_name).c_str());
  } else if (dll_name.starts_with("cublas")) {
    handle = ::LoadLibraryW((cublas_bin_dir() / dll_name).c_str());
  } else if (dll_name.starts_with("nvrtc")) {
    // One-time setup of the NVRTC search path on first use.
    static auto nvrtc_bin_dir = load_nvrtc();
    handle = ::LoadLibraryW((nvrtc_bin_dir / dll_name).c_str());
  }
  return reinterpret_cast<FARPROC>(handle);
}

} // namespace mlx::core

// Installs delayload_helper as the delay-load notification hook by defining
// the `__pfnDliNotifyHook2` symbol declared in <delayimp.h>.
extern "C" const PfnDliHook __pfnDliNotifyHook2 = mlx::core::delayload_helper;


================================================
FILE: mlx/backend/cuda/device/atomic_ops.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/complex.cuh"
#include "mlx/backend/cuda/device/fp16_math.cuh"

#include <cuda/atomic>

namespace mlx::core::cu {

// Atomically adds `val` to `*out` with device scope.
template <typename T>
inline __device__ void atomic_add(T* out, T val) {
  cuda::atomic_ref<T, cuda::thread_scope_device> target(*out);
  target.fetch_add(val);
}

// Atomically multiplies `*out` by `val` with device scope, using a
// compare-and-swap retry loop.
template <typename T>
inline __device__ void atomic_prod(T* out, T val) {
  cuda::atomic_ref<T, cuda::thread_scope_device> target(*out);
  T expected = target.load();
  for (;;) {
    // On failure `expected` is refreshed with the current value.
    if (target.compare_exchange_strong(expected, expected * val)) {
      break;
    }
  }
}

// Atomically stores max(*out, val) into `*out` with device scope.
template <typename T>
inline __device__ void atomic_max(T* out, T val) {
  cuda::atomic_ref<T, cuda::thread_scope_device> target(*out);
  target.fetch_max(val);
}

// Atomically stores min(*out, val) into `*out` with device scope.
template <typename T>
inline __device__ void atomic_min(T* out, T val) {
  cuda::atomic_ref<T, cuda::thread_scope_device> target(*out);
  target.fetch_min(val);
}

// Fallback atomic add built on compare-and-swap, for the types below whose
// cuda::atomic_ref does not provide an atomic add.
template <typename T>
inline __device__ void atomic_add_general(T* out, T val) {
  cuda::atomic_ref<T, cuda::thread_scope_device> target(*out);
  T expected = target.load();
  for (;;) {
    // On failure `expected` is refreshed with the current value.
    if (target.compare_exchange_strong(expected, expected + val)) {
      break;
    }
  }
}

// __half overload: forwards to the built-in atomicAdd.
inline __device__ void atomic_add(__half* out, __half val) {
  atomicAdd(out, val);
}

// complex64_t overload: uses the compare-and-swap based fallback.
inline __device__ void atomic_add(complex64_t* out, complex64_t val) {
  atomic_add_general(out, val);
}

// __nv_bfloat16 overload: the built-in atomicAdd is used when compiling for
// __CUDA_ARCH__ >= 800, the compare-and-swap fallback otherwise.
inline __device__ void atomic_add(__nv_bfloat16* out, __nv_bfloat16 val) {
#if __CUDA_ARCH__ >= 800
  atomicAdd(out, val);
#else
  atomic_add_general(out, val);
#endif
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/binary_ops.cuh
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device/unary_ops.cuh"

#include <cuda/std/array>

namespace mlx::core::cu {

// Binary functor returning x + y.
struct Add {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x + y;
  }
};

// Binary functor for the quotient part of a division: plain `/` for integral
// types, and the quotient truncated with cuda::std::trunc for all others.
struct FloorDivide {
  template <typename T>
  __device__ T operator()(T x, T y) {
    if constexpr (!cuda::std::is_integral_v<T>) {
      return cuda::std::trunc(x / y);
    } else {
      return x / y;
    }
  }
};

// Binary functor returning x / y.
struct Divide {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x / y;
  }
};

// Binary functor computing the remainder of x / y.
//
// For signed integers and real floating point types a non-zero remainder is
// shifted by y whenever its sign differs from the sign of y, so the result
// takes the sign of the divisor. Unsigned integers and complex numbers use
// their `%` operator directly.
struct Remainder {
  template <typename T>
  __device__ T operator()(T x, T y) {
    if constexpr (is_complex_v<T>) {
      return x % y;
    } else if constexpr (cuda::std::is_integral_v<T>) {
      if constexpr (!cuda::std::is_signed_v<T>) {
        return x % y;
      } else {
        auto rem = x % y;
        const bool sign_differs = (rem < 0) != (y < 0);
        if (rem != 0 && sign_differs) {
          rem += y;
        }
        return rem;
      }
    } else {
      T rem = cuda::std::fmod(x, y);
      const bool sign_differs = (rem < 0) != (y < 0);
      if (rem != 0 && sign_differs) {
        rem = rem + y;
      }
      return rem;
    }
  }
};

// Binary predicate returning x == y.
struct Equal {
  template <typename T>
  __device__ bool operator()(T x, T y) {
    return x == y;
  }
};

// Equality predicate that treats NaN as equal to NaN.
// For complex values each component is compared on its own: a component
// matches when both sides are equal or both sides are NaN.
struct NaNEqual {
  template <typename T>
  __device__ bool operator()(T x, T y) {
    using cuda::std::isnan;
    if (x == y) {
      return true;
    }
    if constexpr (is_complex_v<T>) {
      const bool real_both_nan = isnan(x.real()) && isnan(y.real());
      const bool imag_both_nan = isnan(x.imag()) && isnan(y.imag());
      if (real_both_nan && imag_both_nan) {
        return true;
      }
      if (x.real() == y.real() && imag_both_nan) {
        return true;
      }
      return real_both_nan && x.imag() == y.imag();
    } else {
      return isnan(x) && isnan(y);
    }
  }
};

// Binary predicate returning x > y.
struct Greater {
  template <typename T>
  __device__ bool operator()(T x, T y) {
    return x > y;
  }
};

// Binary predicate returning x >= y.
struct GreaterEqual {
  template <typename T>
  __device__ bool operator()(T x, T y) {
    return x >= y;
  }
};

// Binary predicate returning x < y.
struct Less {
  template <typename T>
  __device__ bool operator()(T x, T y) {
    return x < y;
  }
};

// Binary predicate returning x <= y.
struct LessEqual {
  template <typename T>
  __device__ bool operator()(T x, T y) {
    return x <= y;
  }
};

// Binary functor computing log(exp(x) + exp(y)) in a numerically stable way.
//
// Any NaN in the inputs yields NaN. Otherwise the larger operand is factored
// out so that the exponential is only ever taken of a non-positive value:
//   logaddexp(x, y) = max + log1p(exp(min - max))
// Infinite operands are handled separately to avoid computing inf - inf.
struct LogAddExp {
  template <typename T>
  __device__ T operator()(T x, T y) {
    if constexpr (is_complex_v<T>) {
      if (cuda::std::isnan(x.real()) || cuda::std::isnan(x.imag()) ||
          cuda::std::isnan(y.real()) || cuda::std::isnan(y.imag())) {
        return {
            cuda::std::numeric_limits<float>::quiet_NaN(),
            cuda::std::numeric_limits<float>::quiet_NaN()};
      }
      // Order the operands by real part. The two selections must be
      // complementary: with independent `>` / `<` tests both would pick `y`
      // when the real parts are equal and `x` would be dropped.
      const bool x_is_max = x.real() > y.real();
      auto maxval = x_is_max ? x : y;
      auto minval = x_is_max ? y : x;
      auto min_real = minval.real();
      auto max_real = maxval.real();
      if (!cuda::std::isfinite(min_real) && (min_real == max_real)) {
        // Both real parts are the same infinity.
        if (min_real < 0) {
          return minval;
        } else {
          return Log{}(Exp{}(minval) + Exp{}(maxval));
        }
      } else {
        return Log1p{}(Exp{}(minval - maxval)) + maxval;
      }
    } else {
      if (cuda::std::isnan(x) || cuda::std::isnan(y)) {
        return cuda::std::numeric_limits<T>::quiet_NaN();
      }
      T maxval = max(x, y);
      T minval = min(x, y);
      // With an infinite operand the result is simply the larger one.
      return (minval == -cuda::std::numeric_limits<T>::infinity() ||
              maxval == cuda::std::numeric_limits<T>::infinity())
          ? maxval
          : T(maxval + cuda::std::log1p(cuda::std::exp(minval - maxval)));
    }
  }
};

// Binary functor returning the larger of x and y.
// For non-integral types a NaN in x is returned as-is; for complex values x
// counts as NaN when either of its components is NaN.
struct Maximum {
  template <typename T>
  __device__ T operator()(T x, T y) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return max(x, y);
    } else {
      bool x_is_nan;
      if constexpr (is_complex_v<T>) {
        x_is_nan = cuda::std::isnan(x.real()) || cuda::std::isnan(x.imag());
      } else {
        x_is_nan = cuda::std::isnan(x);
      }
      if (x_is_nan || x > y) {
        return x;
      }
      return y;
    }
  }
};

// Binary functor returning the smaller of x and y.
// For non-integral types a NaN in x is returned as-is; for complex values x
// counts as NaN when either of its components is NaN.
struct Minimum {
  template <typename T>
  __device__ T operator()(T x, T y) {
    if constexpr (cuda::std::is_integral_v<T>) {
      return min(x, y);
    } else {
      bool x_is_nan;
      if constexpr (is_complex_v<T>) {
        x_is_nan = cuda::std::isnan(x.real()) || cuda::std::isnan(x.imag());
      } else {
        x_is_nan = cuda::std::isnan(x);
      }
      if (x_is_nan || x < y) {
        return x;
      }
      return y;
    }
  }
};

// Binary functor returning x * y.
struct Multiply {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x * y;
  }
};

// Binary predicate returning whether x and y differ. Complex values differ
// when their real parts or their imaginary parts differ.
struct NotEqual {
  template <typename T>
  __device__ bool operator()(T x, T y) {
    if constexpr (!is_complex_v<T>) {
      return x != y;
    } else {
      const bool same_real = x.real() == y.real();
      const bool same_imag = x.imag() == y.imag();
      return !(same_real && same_imag);
    }
  }
};

// Binary functor raising `base` to the power `exp`.
// Integral types use exponentiation by squaring, and a negative exponent of
// a signed integer yields 0. All other types defer to cuda::std::pow.
struct Power {
  template <typename T>
  __device__ T operator()(T base, T exp) {
    if constexpr (!cuda::std::is_integral_v<T>) {
      // Complex and real floating point types share the same call.
      return cuda::std::pow(base, exp);
    } else {
      // Raising an integer to a negative power is undefined
      if constexpr (cuda::std::is_signed_v<T>) {
        if (exp < 0) {
          return 0;
        }
      }
      T result = 1;
      // Consume the exponent bit by bit, squaring the base at every step.
      for (; exp; exp >>= 1, base *= base) {
        if (exp & 1) {
          result *= base;
        }
      }
      return result;
    }
  }
};

// Binary functor returning x - y.
struct Subtract {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x - y;
  }
};

// Binary functor returning x && y, converted to T.
struct LogicalAnd {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x && y;
  };
};

// Binary functor returning x || y, converted to T.
struct LogicalOr {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x || y;
  };
};

// Binary functor returning x & y.
struct BitwiseAnd {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x & y;
  };
};

// Binary functor returning x | y.
struct BitwiseOr {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x | y;
  };
};

// Binary functor returning x ^ y.
struct BitwiseXor {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x ^ y;
  };
};

// Binary functor returning x << y. No range check is done on the shift
// amount.
struct LeftShift {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x << y;
  };
};

// Binary functor returning x >> y. No range check is done on the shift
// amount.
struct RightShift {
  template <typename T>
  __device__ T operator()(T x, T y) {
    return x >> y;
  };
};

// Binary functor returning cuda::std::atan2(y, x). Note the argument order:
// the first operand is y, the second is x.
struct ArcTan2 {
  template <typename T>
  __device__ T operator()(T y, T x) {
    return cuda::std::atan2(y, x);
  }
};

// Binary functor returning the pair {FloorDivide(x, y), Remainder(x, y)}.
struct DivMod {
  template <typename T>
  __device__ cuda::std::array<T, 2> operator()(T x, T y) {
    T quotient = FloorDivide{}(x, y);
    T remainder = Remainder{}(x, y);
    return {quotient, remainder};
  }
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/cast_op.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/complex.cuh"

#include <cuda_bf16.h>
#include <cuda_fp16.h>

namespace mlx::core::cu {

// An op that does static_cast, with custom conversions for some types.
// `is_castable` reports whether the conversion is supported; for this primary
// template that is whether SrcT is implicitly convertible to DstT. The third
// template parameter exists only to enable the partial specializations below.
template <typename SrcT, typename DstT, typename = void>
struct CastOp {
  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, DstT>;

  __device__ DstT operator()(SrcT x) {
    return static_cast<DstT>(x);
  }
};

// Castings between complex and boolean.
//
// A complex number converts to `true` when it is non-zero, i.e. when at
// least one of its components is non-zero (so 1+0j and 0+1j are both true).
template <typename T>
struct CastOp<complex_t<T>, bool> {
  static constexpr bool is_castable = true;

  __device__ bool operator()(complex_t<T> x) {
    return x.real() != 0 || x.imag() != 0;
  }
};

// A boolean converts to the real numbers 1 or 0, with a zero imaginary part,
// like every other real-to-complex conversion in this file.
template <typename T>
struct CastOp<bool, complex_t<T>> {
  static constexpr bool is_castable = true;

  __device__ complex_t<T> operator()(bool x) {
    return x ? complex_t<T>{1, 0} : complex_t<T>{0, 0};
  }
};

// Converting a complex number to real number discards the imaginary part.
// Castable whenever the component type T is convertible to DstT.
template <typename T, typename DstT>
struct CastOp<complex_t<T>, DstT, cuda::std::enable_if_t<!is_complex_v<DstT>>> {
  static constexpr bool is_castable = cuda::std::is_convertible_v<T, DstT>;

  __device__ DstT operator()(complex_t<T> x) {
    static_assert(!is_complex_v<DstT>);
    return static_cast<DstT>(x.real());
  }
};

// Allow converting a real number to complex number.
// The value becomes the real part and the imaginary part is set to 0.
// Castable whenever SrcT is convertible to the component type T.
template <typename SrcT, typename T>
struct CastOp<SrcT, complex_t<T>, cuda::std::enable_if_t<!is_complex_v<SrcT>>> {
  static constexpr bool is_castable = cuda::std::is_convertible_v<SrcT, T>;

  __device__ complex_t<T> operator()(SrcT x) {
    static_assert(!is_complex_v<SrcT>);
    return complex_t<T>{static_cast<T>(x), 0};
  }
};

// Do nothing when no casting is needed.
// Selected when SrcT and DstT are the same type; the value is returned
// unchanged.
template <typename SrcT, typename DstT>
struct CastOp<
    SrcT,
    DstT,
    cuda::std::enable_if_t<cuda::std::is_same_v<SrcT, DstT>>> {
  static constexpr bool is_castable = true;

  __device__ SrcT operator()(SrcT x) {
    return x;
  }
};

// In CUDA 11 the half types do not define conversions between some types,
// provide fallbacks here.
#if CUDART_VERSION < 12000
// Fallback for a non-complex, non-convertible SrcT to __half / __nv_bfloat16:
// the value goes through float first.
template <typename SrcT, typename DstT>
struct CastOp<
    SrcT,
    DstT,
    cuda::std::enable_if_t<
        !cuda::std::is_convertible_v<SrcT, DstT> && !is_complex_v<SrcT> &&
        (cuda::std::is_same_v<DstT, __half> ||
         cuda::std::is_same_v<DstT, __nv_bfloat16>)>> {
  static constexpr bool is_castable = true;

  __device__ DstT operator()(SrcT x) {
    return DstT(static_cast<float>(x));
  }
};

// Fallback for __half / __nv_bfloat16 to a non-convertible DstT that is not
// itself a half type: the value goes through float first.
template <typename SrcT, typename DstT>
struct CastOp<
    SrcT,
    DstT,
    cuda::std::enable_if_t<
        !cuda::std::is_convertible_v<SrcT, DstT> && !is_complex_v<SrcT> &&
        !cuda::std::is_same_v<DstT, __half> &&
        !cuda::std::is_same_v<DstT, __nv_bfloat16> &&
        (cuda::std::is_same_v<SrcT, __half> ||
         cuda::std::is_same_v<SrcT, __nv_bfloat16>)>> {
  static constexpr bool is_castable = true;

  __device__ DstT operator()(SrcT x) {
    return DstT(static_cast<float>(x));
  }
};
#endif // CUDART_VERSION < 12000

// Convenience wrapper around CastOp: only the destination type has to be
// spelled out, e.g. cast_to<float>(x); the source type is deduced from `x`.
template <typename DstT, typename SrcT>
inline __host__ __device__ auto cast_to(SrcT x) {
  CastOp<SrcT, DstT> op{};
  return op(x);
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/complex.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

// Make multiplication and division faster.
#define LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS

#include <cuda/std/complex>
#include <cuda/std/type_traits>

namespace mlx::core::cu {

// Complex number types used by the CUDA kernels; they are aliases of
// cuda::std::complex.
// TODO: Consider using a faster implementation as cuda::std::complex has to
// conform to C++ standard.
template <typename T>
using complex_t = cuda::std::complex<T>;

// Single and double precision complex numbers.
using complex64_t = complex_t<float>;
using complex128_t = complex_t<double>;

// Type trait that is true only for specializations of cuda::std::complex.
template <typename T>
struct is_complex : cuda::std::false_type {};

template <typename T>
struct is_complex<cuda::std::complex<T>> : cuda::std::true_type {};

// Shorthand for is_complex<T>::value.
template <typename T>
inline constexpr bool is_complex_v = is_complex<T>::value;

// cuda::std::complex is missing some operators.

// Component-wise floored modulo: the real and imaginary parts are reduced
// independently as v - floor(v / m) * m.
template <typename T>
inline __host__ __device__ complex_t<T> operator%(
    complex_t<T> a,
    complex_t<T> b) {
  T real_part = a.real() - floor(a.real() / b.real()) * b.real();
  T imag_part = a.imag() - floor(a.imag() / b.imag()) * b.imag();
  return complex_t<T>{real_part, imag_part};
}

// Lexicographic ordering: real parts are compared first and the imaginary
// parts only break ties between equal real parts.
template <typename T>
inline __host__ __device__ bool operator>(complex_t<T> a, complex_t<T> b) {
  if (a.real() > b.real()) {
    return true;
  }
  if (a.real() == b.real()) {
    return a.imag() > b.imag();
  }
  return false;
}

// a < b is defined as b > a.
template <typename T>
inline __host__ __device__ bool operator<(complex_t<T> a, complex_t<T> b) {
  return b > a;
}

// a <= b is defined as the negation of a > b.
template <typename T>
inline __host__ __device__ bool operator<=(complex_t<T> a, complex_t<T> b) {
  return !(a > b);
}

// a >= b is defined as the negation of a < b.
template <typename T>
inline __host__ __device__ bool operator>=(complex_t<T> a, complex_t<T> b) {
  return !(a < b);
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/config.h
================================================
// Copyright © 2025 Apple Inc.

// This file is used by both CUDA kernel code and host-only C++ code.

#pragma once

// The maximum dimensions of shape/strides passed as kernel parameters.
#define MAX_NDIM 10

// All existing NVIDIA hardware has a fixed 32 warp size. Though a built-in
// warpSize variable exists, using it would prevent compile-time optimizations.
#define WARP_SIZE 32


================================================
FILE: mlx/backend/cuda/device/fp16_math.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda/std/type_traits>

namespace mlx::core::cu {

///////////////////////////////////////////////////////////////////////////////
// Binary ops for half types.
///////////////////////////////////////////////////////////////////////////////

// Defines a function template NAME(x, y) in this namespace that calls HALF_OP
// for __half and __nv_bfloat16 operands and the global ::NAME for every other
// type.
#define MLX_DEFINE_BINARY_OP(NAME, HALF_OP)                        \
  template <typename T>                                            \
  __forceinline__ __device__ auto NAME(T x, T y) {                 \
    if constexpr (cuda::std::is_same_v<T, __half>) {               \
      return HALF_OP(x, y);                                        \
    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) { \
      return HALF_OP(x, y);                                        \
    } else {                                                       \
      return ::NAME(x, y);                                         \
    }                                                              \
  }

// max / min that also accept the half types.
MLX_DEFINE_BINARY_OP(max, __hmax)
MLX_DEFINE_BINARY_OP(min, __hmin)

#undef MLX_DEFINE_BINARY_OP

///////////////////////////////////////////////////////////////////////////////
// Additional C++ operator overrides between half types and native types.
///////////////////////////////////////////////////////////////////////////////

// True when T is an integral type other than U.
template <typename T, typename U>
constexpr bool is_integral_except =
    cuda::std::is_integral_v<T> && !cuda::std::is_same_v<T, U>;

// True when T is an arithmetic type other than U.
template <typename T, typename U>
constexpr bool is_arithmetic_except =
    cuda::std::is_arithmetic_v<T> && !cuda::std::is_same_v<T, U>;

// Defines the arithmetic operator OP between HALF and any integral type T, in
// both operand orders. The operation is carried out in float and the result
// is converted back to HALF.
#define MLX_DEFINE_HALF_OP(HALF, HALF2FLOAT, FLOAT2HALF, OP)          \
  template <                                                          \
      typename T,                                                     \
      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
  __forceinline__ __device__ HALF operator OP(HALF x, T y) {          \
    return FLOAT2HALF(HALF2FLOAT(x) OP static_cast<float>(y));        \
  }                                                                   \
  template <                                                          \
      typename T,                                                     \
      typename = cuda::std::enable_if_t<is_integral_except<T, HALF>>> \
  __forceinline__ __device__ HALF operator OP(T x, HALF y) {          \
    return FLOAT2HALF(static_cast<float>(x) OP HALF2FLOAT(y));        \
  }

// Defines the comparison operator OP between HALF and any arithmetic type T
// other than HALF, in both operand orders. Both operands are converted to
// float and compared there.
//
// The left operand of the operator is always the left operand of the float
// comparison, so ordering operators (<, >, <=, >=) are not mirrored when the
// half value is on the right-hand side.
#define MLX_DEFINE_HALF_CMP(HALF, HALF2FLOAT, OP)                       \
  template <                                                            \
      typename T,                                                       \
      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
  __forceinline__ __device__ bool operator OP(HALF x, T y) {            \
    return HALF2FLOAT(x) OP static_cast<float>(y);                      \
  }                                                                     \
  template <                                                            \
      typename T,                                                       \
      typename = cuda::std::enable_if_t<is_arithmetic_except<T, HALF>>> \
  __forceinline__ __device__ bool operator OP(T x, HALF y) {            \
    return static_cast<float>(x) OP HALF2FLOAT(y);                      \
  }

// Mixed arithmetic (+ - * /) between each half type and integral types.
MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, +)
MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, -)
MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, *)
MLX_DEFINE_HALF_OP(__half, __half2float, __float2half, /)
MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, +)
MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, -)
MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, *)
MLX_DEFINE_HALF_OP(__nv_bfloat16, __bfloat162float, __float2bfloat16, /)
// Mixed comparisons between each half type and arithmetic types.
MLX_DEFINE_HALF_CMP(__half, __half2float, <)
MLX_DEFINE_HALF_CMP(__half, __half2float, >)
MLX_DEFINE_HALF_CMP(__half, __half2float, <=)
MLX_DEFINE_HALF_CMP(__half, __half2float, >=)
MLX_DEFINE_HALF_CMP(__half, __half2float, ==)
MLX_DEFINE_HALF_CMP(__half, __half2float, !=)
MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <)
MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >)
MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, <=)
MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, >=)
MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, ==)
MLX_DEFINE_HALF_CMP(__nv_bfloat16, __bfloat162float, !=)

// The helper macros are local to this header.
#undef MLX_DEFINE_HALF_OP
#undef MLX_DEFINE_HALF_CMP

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/gather.cuh
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device/indexing.cuh"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>

namespace mlx::core::cu {

namespace cg = cooperative_groups;

// Gather kernel: every thread produces one output element.
//
// The flat output index is split into a position inside the gathered slice
// (modulo `slice_size`) and an index-tuple number (quotient). The source
// offset is the offset of the slice position plus, for each of the NIDX index
// arrays, the looked-up index (made absolute against the size of its axis)
// times the source stride of that axis.
template <typename T, typename IdxT, int NIDX, int IDX_NDIM, typename LocT>
__global__ void gather(
    const T* src,
    T* out,
    LocT size,
    const __grid_constant__ Shape src_shape,
    const __grid_constant__ Strides src_strides,
    int32_t src_ndim,
    const __grid_constant__ Shape slice_sizes,
    uint32_t slice_size,
    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
        indices_shape,
    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
        indices_strides) {
  LocT tid = cg::this_grid().thread_rank();
  if (tid >= size) {
    return;
  }

  LocT within_slice = tid % slice_size;
  LocT which_index = tid / slice_size;

  // Offset contributed by the position inside the slice.
  LocT src_offset = elem_to_loc(
      within_slice, slice_sizes.data(), src_strides.data(), src_ndim);

  // Offset contributed by each index array; array n uses entries
  // [n * IDX_NDIM, (n + 1) * IDX_NDIM) of the packed shape/strides.
#pragma unroll
  for (int n = 0; n < NIDX; ++n) {
    LocT idx_offset = elem_to_loc_nd<IDX_NDIM>(
        which_index,
        indices_shape.data() + n * IDX_NDIM,
        indices_strides.data() + n * IDX_NDIM);
    int32_t gathered_axis = axes[n];
    LocT position =
        absolute_index(indices[n][idx_offset], src_shape[gathered_axis]);
    src_offset += position * src_strides[gathered_axis];
  }

  out[tid] = src[src_offset];
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/gather_axis.cuh
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device/indexing.cuh"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>

namespace mlx::core::cu {

namespace cg = cooperative_groups;

// Gather-along-axis kernel: every thread produces one output element.
//
// The index array is viewed as (pre, axis, post) with sizes `idx_size_pre`,
// `idx_size_axis` and `idx_size_post`. `SrcC` / `IdxC` select a simplified
// offset computation for the source / index array instead of the generic
// strided one; presumably they mean "contiguous" -- confirm with the caller.
template <
    typename T,
    typename IdxT,
    int NDIM,
    bool SrcC,
    bool IdxC,
    typename LocT>
__global__ void gather_axis(
    const T* src,
    const IdxT* indices,
    T* out,
    LocT idx_size_pre,
    LocT idx_size_axis,
    LocT idx_size_post,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> src_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
    int32_t axis,
    int32_t axis_size,
    int64_t src_stride_axis,
    int64_t idx_stride_axis) {
  LocT index = cg::this_grid().thread_rank();
  // Threads beyond the total number of index elements have nothing to do.
  if (index >= idx_size_pre * idx_size_axis * idx_size_post) {
    return;
  }

  // NOTE(review): index_to_dims is defined elsewhere; from the uses below,
  // y is the position along the axis, z scales idx_size_post and x is the
  // remaining offset -- confirm against its definition.
  auto [x, y, z] = index_to_dims(index, idx_size_axis, idx_size_pre);

  LocT elem_idx = z * idx_size_post;

  // Location of the index value to read.
  LocT idx_loc = y * idx_stride_axis;
  if constexpr (IdxC) {
    idx_loc += elem_idx * idx_size_axis + x;
  } else {
    idx_loc +=
        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), idx_strides.data());
  }

  // Make the index absolute with respect to the axis size.
  auto idx_val = absolute_index(indices[idx_loc], axis_size);

  // Location of the source element selected by that index.
  LocT src_loc = idx_val * src_stride_axis;
  if constexpr (SrcC) {
    src_loc += elem_idx * axis_size + x;
  } else {
    src_loc +=
        elem_to_loc_nd<NDIM>(elem_idx + x, shape.data(), src_strides.data());
  }

  // The output is addressed with the (pre, axis, post) layout directly.
  LocT out_idx = y * idx_size_post + elem_idx * idx_size_axis + x;

  out[out_idx] = src[src_loc];
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/hadamard.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/utils.cuh"

namespace mlx::core::cu {

// In-place transform applied by hadamard_m to each of its length-M register
// rows. Only declared here; the definition is not part of this header.
__device__ __forceinline__ void hadamard_radix_m(float* x);

// Compile-time log2 of a power of two: Pow2Log2<N>::value == log2(N).
// Any N that is not a positive power of two fails the static_assert.
template <int N>
struct Pow2Log2 {
  static_assert(
      (N > 0) && ((N & (N - 1)) == 0),
      "N must be a positive power of two.");
  static constexpr int value = 1 + Pow2Log2<N / 2>::value;
};

// Base case of the recursion: log2(1) == 0.
template <>
struct Pow2Log2<1> {
  static constexpr int value = 0;
};

// In-place butterfly transform of the R values in `x`, where R is a power of
// two. It runs log2(R) stages; in the stage with half-width `half`, each pair
// (x[j], x[j + half]) is replaced by its sum and its difference.
template <int R>
__device__ __forceinline__ void hadamard_radix_pow2(float* x) {
  constexpr int kStages = Pow2Log2<R>::value;
#pragma unroll
  for (int stage = 0; stage < kStages; ++stage) {
    const int half = 1 << stage;
#pragma unroll
    for (int pair = 0; pair < R / 2; ++pair) {
      const int low_bits = pair & (half - 1);
      const int lo = ((pair - low_bits) << 1) + low_bits;
      const int hi = lo + half;
      const float a = x[lo];
      const float b = x[hi];
      x[lo] = a + b;
      x[hi] = a - b;
    }
  }
}

// Length-N butterfly transform (N a power of two) computed by one thread
// block per transform, using a shared buffer of N elements.
//
// Each of the N / max_radix working threads handles max_radix elements per
// step. The transform runs kNumSteps radix-`max_radix` steps, plus one final
// step of radix kFinalRadix when log2(N) is not a multiple of log2(max_radix).
// Blocks iterate over transforms `blockIdx.x, blockIdx.x + gridDim.x, ...`.
// With `stride` != 1 the elements of one transform are `stride` apart in
// memory.
template <typename T, int N, int max_radix, int read_width, int stride = 1>
__global__ void
hadamard_n(const T* in, T* out, float scale, long long num_transforms) {
  constexpr int kNumThreads = N / max_radix;
  constexpr int kLogN = Pow2Log2<N>::value;
  constexpr int kLogR = Pow2Log2<max_radix>::value;
  constexpr int kNumSteps = kLogN / kLogR;
  constexpr int kLogFinal = kLogN % kLogR;
  constexpr int kFinalRadix = 1 << kLogFinal;

  // Extra threads leave before any __syncthreads() below.
  // NOTE(review): presumably the launch uses exactly kNumThreads threads per
  // block, so that this never triggers -- confirm with the host code.
  if (threadIdx.x >= kNumThreads) {
    return;
  }

  __shared__ T buf[N];
  int i = threadIdx.x;

  for (long long transform = blockIdx.x; transform < num_transforms;
       transform += gridDim.x) {
    // First element of this transform in global memory.
    long long base = (transform / stride) * static_cast<long long>(N) * stride +
        (transform % stride);

    // Load the transform into shared memory.
    if constexpr (stride == 1) {
      // Each thread loads runs of `read_width` consecutive elements.
#pragma unroll
      for (int j = 0; j < max_radix / read_width; ++j) {
        int index = j * read_width * kNumThreads + i * read_width;
#pragma unroll
        for (int r = 0; r < read_width; ++r) {
          buf[index + r] = in[base + index + r];
        }
      }
    } else {
      // Strided layout: elements are loaded one at a time.
#pragma unroll
      for (int j = 0; j < max_radix; ++j) {
        buf[j * kNumThreads + i] = in[base + (j * kNumThreads + i) * stride];
      }
    }
    __syncthreads();

    // Per-thread working set; the butterflies are computed in float.
    float x[max_radix];
    int h = 1;

    // Full radix steps: gather max_radix elements spaced h apart, transform
    // them and write them back to the same positions.
#pragma unroll
    for (int s = 0; s < kNumSteps; ++s) {
      int k = i & (h - 1);
      int j = ((i - k) << kLogR) + k;

#pragma unroll
      for (int r = 0; r < max_radix; ++r) {
        x[r] = static_cast<float>(buf[j + h * r]);
      }

      hadamard_radix_pow2<max_radix>(x);

#pragma unroll
      for (int r = 0; r < max_radix; ++r) {
        buf[j + h * r] = static_cast<T>(x[r]);
      }

      h <<= kLogR;
      __syncthreads();
    }

    // Remaining smaller-radix step; each thread processes
    // max_radix / kFinalRadix groups.
    if constexpr (kFinalRadix > 1) {
#pragma unroll
      for (int t = 0; t < max_radix / kFinalRadix; ++t) {
        int index = i + t * kNumThreads;
        int k = index & (h - 1);
        int j = ((index - k) << kLogFinal) + k;
#pragma unroll
        for (int r = 0; r < kFinalRadix; ++r) {
          x[r] = static_cast<float>(buf[j + h * r]);
        }

        hadamard_radix_pow2<kFinalRadix>(x);

#pragma unroll
        for (int r = 0; r < kFinalRadix; ++r) {
          buf[j + h * r] = static_cast<T>(x[r]);
        }
      }
      __syncthreads();
    }

    // Store the result.
    if constexpr (stride == 1) {
      // The contiguous path multiplies by `scale` on the way out.
#pragma unroll
      for (int j = 0; j < max_radix / read_width; ++j) {
        int index = j * read_width * kNumThreads + i * read_width;
#pragma unroll
        for (int r = 0; r < read_width; ++r) {
          float val = static_cast<float>(buf[index + r]);
          out[base + index + r] = static_cast<T>(val * scale);
        }
      }
    } else {
      // NOTE(review): `scale` is not applied on the strided path. Presumably
      // the scale is applied by another pass -- confirm with the host code.
#pragma unroll
      for (int j = 0; j < max_radix; ++j) {
        out[base + (j * kNumThreads + i) * stride] = buf[j * kNumThreads + i];
      }
    }

    // Make sure buf is no longer read before the next transform overwrites it.
    __syncthreads();
  }
}

// Kernel that applies hadamard_radix_m across M rows of length N.
//
// The work is split into tasks; a task covers `read_width` consecutive columns
// of one batch. For each of those columns the M values (one per row, N
// elements apart) are gathered into registers as floats, transformed with
// hadamard_radix_m, multiplied by `scale` and written to the same positions of
// `out`.
//
// Threads loop over tasks starting at their global thread index and advancing
// by the total number of launched threads.
template <typename T, int N, int M, int read_width>
__global__ void
hadamard_m(const T* in, T* out, float scale, long long num_tasks) {
  constexpr int kTasksPerBatch = N / read_width;

  // blockIdx/blockDim/gridDim components are 32-bit unsigned; widen before
  // multiplying so the products cannot wrap for very large launches.
  const long long first_task =
      static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
  const long long task_step = static_cast<long long>(blockDim.x) * gridDim.x;

  for (long long task = first_task; task < num_tasks; task += task_step) {
    // Column group inside the batch, and the batch itself.
    long long i = task % kTasksPerBatch;
    long long batch = task / kTasksPerBatch;
    long long base = batch * static_cast<long long>(M) * N;

    // x[r] holds the M row values for column i * read_width + r.
    float x[read_width][M];
#pragma unroll
    for (int c = 0; c < M; ++c) {
#pragma unroll
      for (int r = 0; r < read_width; ++r) {
        x[r][c] = static_cast<float>(in[base + c * N + i * read_width + r]);
      }
    }

#pragma unroll
    for (int r = 0; r < read_width; ++r) {
      hadamard_radix_m(x[r]);
    }

#pragma unroll
    for (int c = 0; c < M; ++c) {
#pragma unroll
      for (int r = 0; r < read_width; ++r) {
        out[base + c * N + i * read_width + r] =
            static_cast<T>(x[r][c] * scale);
      }
    }
  }
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/indexing.cuh
================================================
// Copyright © 2025 Apple Inc.

#include <cuda/std/tuple>
#include <cuda/std/type_traits>

namespace mlx::core::cu {

// Split a flat index into (x, y, z) coordinates of a 3d grid whose two
// trailing extents are dim1 and dim2, i.e. the inverse of:
// index = x * dim1 * dim2 + y * dim2 + z
template <typename T>
inline __host__ __device__ cuda::std::tuple<T, T, T>
index_to_dims(T index, T dim1, T dim2) {
  // Number of elements in one x-slice.
  const auto slice = dim1 * dim2;
  const T x = index / slice;
  const T y = (index % slice) / dim2;
  const T z = index % dim2;
  return cuda::std::tuple<T, T, T>(x, y, z);
}

// Resolve a possibly negative index against an axis of length `size`.
// Unsigned index types are returned untouched; signed ones are wrapped by
// adding `size` when negative and returned as int32_t.
template <typename IdxT>
inline __host__ __device__ auto absolute_index(IdxT idx, int32_t size) {
  if constexpr (!cuda::std::is_unsigned_v<IdxT>) {
    if (idx < 0) {
      return static_cast<int32_t>(idx + size);
    }
    return static_cast<int32_t>(idx);
  } else {
    return idx;
  }
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/scatter.cuh
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device/indexing.cuh"
#include "mlx/backend/cuda/device/scatter_ops.cuh"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>

namespace mlx::core::cu {

namespace cg = cooperative_groups;

// General scatter kernel: one thread per element of `upd`.
//
// The flat update index is split into a position inside the index arrays
// (quotient by upd_post_idx_size) and a position inside the update slice
// (remainder). The destination offset is the slice offset in `out` plus, for
// each of the NIDX index arrays, the looked-up index times the stride of the
// corresponding output axis. `Op` combines the update into `out`.
template <
    typename T,
    typename IdxT,
    typename Op,
    int NIDX,
    int IDX_NDIM,
    typename LocT>
__global__ void scatter(
    const T* upd,
    T* out,
    LocT size,
    const __grid_constant__ Shape upd_shape,
    const __grid_constant__ Strides upd_strides,
    int32_t upd_ndim,
    LocT upd_post_idx_size,
    const __grid_constant__ Shape out_shape,
    const __grid_constant__ Strides out_strides,
    int32_t out_ndim,
    const __grid_constant__ cuda::std::array<int32_t, NIDX> axes,
    const __grid_constant__ cuda::std::array<IdxT*, NIDX> indices,
    const __grid_constant__ cuda::std::array<int32_t, NIDX * IDX_NDIM>
        indices_shape,
    const __grid_constant__ cuda::std::array<int64_t, NIDX * IDX_NDIM>
        indices_strides) {
  const LocT tid = cg::this_grid().thread_rank();
  if (tid >= size) {
    return;
  }

  const LocT slice_pos = tid % upd_post_idx_size;
  const LocT index_pos = tid / upd_post_idx_size;

  // Where this thread's update value lives (tid is exactly
  // slice_pos + index_pos * upd_post_idx_size).
  const LocT upd_loc =
      elem_to_loc(tid, upd_shape.data(), upd_strides.data(), upd_ndim);

  // Offset contributed by the position inside the update slice; the slice
  // dimensions follow the IDX_NDIM leading index dimensions of upd_shape.
  LocT dst = elem_to_loc(
      slice_pos, upd_shape.data() + IDX_NDIM, out_strides.data(), out_ndim);

  // Add the offset selected by each index array.
#pragma unroll
  for (int k = 0; k < NIDX; ++k) {
    const LocT where = elem_to_loc_nd<IDX_NDIM>(
        index_pos,
        indices_shape.data() + k * IDX_NDIM,
        indices_strides.data() + k * IDX_NDIM);
    const int32_t axis = axes[k];
    const LocT resolved = absolute_index(indices[k][where], out_shape[axis]);
    dst += resolved * out_strides[axis];
  }

  Op{}(out + dst, upd[upd_loc]);
}

// Masked scatter, one thread per output element.
//
// out[index] receives dst's value unless mask[index] is set and
// scatter_offsets[index] is below src_batch_size, in which case it receives
// the source element at that offset inside the batch index / mask_batch_size.
// SrcContiguous / DstContiguous select direct indexing over strided lookup.
template <typename T, bool SrcContiguous, bool DstContiguous, typename IdxT>
__global__ void masked_scatter(
    const T* dst,
    const bool* mask,
    const int32_t* scatter_offsets,
    const T* src,
    T* out,
    IdxT size,
    IdxT src_batch_size,
    IdxT mask_batch_size,
    const __grid_constant__ Shape dst_shape,
    const __grid_constant__ Strides dst_strides,
    int32_t dst_ndim,
    const __grid_constant__ Shape src_shape,
    const __grid_constant__ Strides src_strides,
    int32_t src_ndim) {
  const IdxT index = cg::this_grid().thread_rank();
  if (index >= size) {
    return;
  }

  // Start from the destination value; it is read before any write to `out`.
  T value;
  if constexpr (DstContiguous) {
    value = dst[index];
  } else {
    const IdxT dst_loc =
        elem_to_loc(index, dst_shape.data(), dst_strides.data(), dst_ndim);
    value = dst[dst_loc];
  }

  if (mask[index]) {
    const IdxT src_index = static_cast<IdxT>(scatter_offsets[index]);
    // Offsets at or past the batch's source size leave the value untouched.
    if (src_index < src_batch_size) {
      const IdxT src_elem =
          (index / mask_batch_size) * src_batch_size + src_index;
      if constexpr (SrcContiguous) {
        value = src[src_elem];
      } else {
        const IdxT src_loc = elem_to_loc(
            src_elem, src_shape.data(), src_strides.data(), src_ndim);
        value = src[src_loc];
      }
    }
  }

  out[index] = value;
}

// Contiguous masked scatter where each thread handles N_READS consecutive
// elements: destination, mask and offsets are loaded as vectors, masked lanes
// with an in-range offset are replaced by the matching source element, and the
// vector is stored to `out`.
template <typename T, typename IdxT, int N_READS>
__global__ void masked_scatter_vec_contiguous(
    const T* dst,
    const bool* mask,
    const int32_t* scatter_offsets,
    const T* src,
    T* out,
    IdxT size,
    IdxT src_batch_size,
    IdxT mask_batch_size) {
  const IdxT vec_index = cg::this_grid().thread_rank();
  const IdxT base = vec_index * N_READS;
  if (base >= size) {
    return;
  }

  // Lanes past `size` are filled with the fallback values.
  auto out_vec = load_vector<N_READS>(dst, vec_index, size, static_cast<T>(0));
  auto mask_vec = load_vector<N_READS>(mask, vec_index, size, false);
  auto offset_vec = load_vector<N_READS>(scatter_offsets, vec_index, size, 0);

#pragma unroll
  for (int lane = 0; lane < N_READS; ++lane) {
    const IdxT index = base + lane;
    if (index >= size) {
      break;
    }
    if (!mask_vec[lane]) {
      continue;
    }
    const IdxT src_index = static_cast<IdxT>(offset_vec[lane]);
    if (src_index >= src_batch_size) {
      continue;
    }
    const IdxT batch_idx = index / mask_batch_size;
    out_vec[lane] = src[batch_idx * src_batch_size + src_index];
  }

  store_vector<N_READS>(out, vec_index, out_vec, size);
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/scatter_axis.cuh
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device/indexing.cuh"
#include "mlx/backend/cuda/device/scatter_ops.cuh"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>

namespace mlx::core::cu {

namespace cg = cooperative_groups;

// Scatter along a single axis, one thread per element of the index array.
//
// The flat thread index is decomposed with index_to_dims into three
// coordinates. The index value read from `indices` (wrapped against
// axis_size) selects the destination position along the axis; the update
// value is combined into `out` with `Op`. UpdC / IdxC select direct
// addressing of `upd` / `indices` instead of a strided lookup.
template <
    typename T,
    typename IdxT,
    typename Op,
    int NDIM,
    bool UpdC,
    bool IdxC,
    typename LocT>
__global__ void scatter_axis(
    const T* upd,
    const IdxT* indices,
    T* out,
    LocT idx_size_pre,
    LocT idx_size_axis,
    LocT idx_size_post,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> upd_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> idx_strides,
    int32_t axis,
    int32_t axis_size,
    int64_t upd_stride_axis,
    int64_t idx_stride_axis) {
  const LocT tid = cg::this_grid().thread_rank();
  if (tid >= idx_size_pre * idx_size_axis * idx_size_post) {
    return;
  }

  auto [inner, along_axis, outer] =
      index_to_dims(tid, idx_size_axis, idx_size_pre);

  const LocT elem_idx = outer * idx_size_post;

  // Location of the update value.
  LocT upd_loc = along_axis * upd_stride_axis;
  if constexpr (UpdC) {
    upd_loc += elem_idx * idx_size_axis + inner;
  } else {
    upd_loc += elem_to_loc_nd<NDIM>(
        elem_idx + inner, shape.data(), upd_strides.data());
  }

  // Location of the index value.
  LocT idx_loc = along_axis * idx_stride_axis;
  if constexpr (IdxC) {
    idx_loc += elem_idx * idx_size_axis + inner;
  } else {
    idx_loc += elem_to_loc_nd<NDIM>(
        elem_idx + inner, shape.data(), idx_strides.data());
  }

  auto idx_val = absolute_index(indices[idx_loc], axis_size);

  const LocT out_idx = idx_val * idx_size_post + elem_idx * axis_size + inner;

  Op{}(out + out_idx, upd[upd_loc]);
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/scatter_ops.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/atomic_ops.cuh"

namespace mlx::core::cu {

// Scatter op that overwrites the destination: a plain (non-atomic) store of
// `val` to `*out`.
struct ScatterAssign {
  template <typename T>
  __device__ void operator()(T* out, T val) const {
    *out = val;
  }
};

// Scatter op that accumulates `val` into `*out` through atomic_add.
struct ScatterSum {
  template <typename T>
  __device__ void operator()(T* out, T val) const {
    atomic_add(out, val);
  }
};

// Scatter op that multiplies `*out` by `val` through atomic_prod.
struct ScatterProd {
  template <typename T>
  __device__ void operator()(T* out, T val) const {
    atomic_prod(out, val);
  }
};

// Scatter op that combines `val` into `*out` through atomic_max.
struct ScatterMax {
  template <typename T>
  __device__ void operator()(T* out, T val) const {
    atomic_max(out, val);
  }
};

// Scatter op that combines `val` into `*out` through atomic_min.
struct ScatterMin {
  template <typename T>
  __device__ void operator()(T* out, T val) const {
    atomic_min(out, val);
  }
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/slice_update.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/binary_ops.cuh"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>

namespace mlx::core::cu {

namespace cg = cooperative_groups;

// Applies `out[i] = Op(out[i], updates[i])` over a slice of `out`.
//
// Each thread handles up to NWORK consecutive update elements, starting at
// thread_rank * NWORK and stopping at update_size.
//
// Template flags:
//   OUT_ROW_CONTIG - the slice of `out` is addressed directly by element index.
//   UPD_ROW_CONTIG - `updates` is addressed directly by element index.
//   UPD_SCALAR     - `updates` is a single value; always element 0 is read.
//                    Takes precedence over UPD_ROW_CONTIG.
//
// Parameters:
//   update_shape / update_strides / update_ndim - layout of `updates`.
//   output_strides - strides of `out` for the same shape.
//   output_offset  - element offset of the slice inside `out`.
//
// In the non-contiguous cases the location is computed once with elem_to_loc
// and then advanced by the stride of the last dimension.
// NOTE(review): this assumes the NWORK elements of a thread lie along the last
// dimension -- confirm against the launch code.
template <
    typename T,
    typename IdxT,
    typename Op,
    bool OUT_ROW_CONTIG,
    bool UPD_ROW_CONTIG,
    bool UPD_SCALAR,
    int NWORK>
__global__ void slice_update_op(
    const T* updates,
    T* out,
    int64_t update_size,
    const __grid_constant__ Shape update_shape,
    const __grid_constant__ Strides update_strides,
    int32_t update_ndim,
    const __grid_constant__ Strides output_strides,
    int64_t output_offset) {
  Op op;

  IdxT idx = cg::this_grid().thread_rank() * NWORK;
  IdxT out_idx;
  IdxT update_idx;

  if constexpr (OUT_ROW_CONTIG) {
    out_idx = idx;
  } else {
    out_idx = elem_to_loc<IdxT>(
        idx, update_shape.data(), output_strides.data(), update_ndim);
  }

  if constexpr (UPD_SCALAR) {
    update_idx = 0;
  } else if constexpr (UPD_ROW_CONTIG) {
    update_idx = idx;
  } else {
    update_idx = elem_to_loc<IdxT>(
        idx, update_shape.data(), update_strides.data(), update_ndim);
  }

  out += output_offset;

  for (int j = 0; j < NWORK && idx < update_size; j++) {
    out[out_idx] = op(out[out_idx], updates[update_idx]);
    idx++;

    if constexpr (OUT_ROW_CONTIG) {
      out_idx = idx;
    } else {
      out_idx += output_strides[update_ndim - 1];
    }

    // Same precedence as the initial computation above: a scalar update
    // stays at element 0 even when it is also flagged row contiguous.
    if constexpr (!UPD_SCALAR) {
      if constexpr (UPD_ROW_CONTIG) {
        update_idx = idx;
      } else {
        update_idx += update_strides[update_ndim - 1];
      }
    }
  }
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/ternary_ops.cuh
================================================
// Copyright © 2025 Apple Inc.
#pragma once

namespace mlx::core::cu {

// Ternary select: yields `x` when `condition` holds, `y` otherwise.
struct Select {
  template <typename T>
  __device__ T operator()(bool condition, T x, T y) {
    if (condition) {
      return x;
    }
    return y;
  }
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/unary_ops.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/fp16_math.cuh"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cuda_fp8.h>
#include <math_constants.h>
#include <cuda/std/cmath>

namespace mlx::core::cu {

// Absolute value. Unsigned inputs are already non-negative and are returned
// as-is; everything else goes through cuda::std::abs.
struct Abs {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (!cuda::std::is_unsigned_v<T>) {
      return cuda::std::abs(x);
    } else {
      return x;
    }
  }
};

// Elementwise inverse cosine via cuda::std::acos.
struct ArcCos {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::acos(x);
  }
};

// Elementwise inverse hyperbolic cosine via cuda::std::acosh.
struct ArcCosh {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::acosh(x);
  }
};

// Elementwise inverse sine via cuda::std::asin.
struct ArcSin {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::asin(x);
  }
};

// Elementwise inverse hyperbolic sine via cuda::std::asinh.
struct ArcSinh {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::asinh(x);
  }
};

// Elementwise inverse tangent via cuda::std::atan.
struct ArcTan {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::atan(x);
  }
};

// Elementwise inverse hyperbolic tangent via cuda::std::atanh.
struct ArcTanh {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::atanh(x);
  }
};

// Elementwise bitwise complement (`~x`), converted back to T.
struct BitwiseInvert {
  template <typename T>
  __device__ T operator()(T x) {
    return ~x;
  }
};

// Round up to the nearest integer value. Complex inputs are rounded per
// component; integral inputs are returned unchanged.
struct Ceil {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (is_complex_v<T>) {
      return T{cuda::std::ceil(x.real()), cuda::std::ceil(x.imag())};
    } else if constexpr (cuda::std::is_integral_v<T>) {
      return x;
    } else {
      return cuda::std::ceil(x);
    }
  }
};

// Complex conjugate via cuda::std::conj; only callable with complex_t<T>.
struct Conjugate {
  template <typename T>
  __device__ complex_t<T> operator()(complex_t<T> x) {
    return cuda::std::conj(x);
  }
};

// Elementwise cosine via cuda::std::cos.
struct Cos {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::cos(x);
  }
};

// Elementwise hyperbolic cosine via cuda::std::cosh.
struct Cosh {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::cosh(x);
  }
};

// Error function. 16-bit float inputs are widened to float before calling
// erf; the result converts back to T on return.
struct Erf {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
      return erf(__bfloat162float(x));
    } else if constexpr (cuda::std::is_same_v<T, __half>) {
      return erf(__half2float(x));
    } else {
      return erf(x);
    }
  }
};

// Inverse error function. 16-bit float inputs are widened to float before
// calling erfinv; the result converts back to T on return.
struct ErfInv {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
      return erfinv(__bfloat162float(x));
    } else if constexpr (cuda::std::is_same_v<T, __half>) {
      return erfinv(__half2float(x));
    } else {
      return erfinv(x);
    }
  }
};

// Elementwise exponential via cuda::std::exp.
struct Exp {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::exp(x);
  }
};

// Elementwise exp(x) - 1 via cuda::std::expm1.
struct Expm1 {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::expm1(x);
  }
};

// Round down to the nearest integer value. Complex inputs are rounded per
// component; integral inputs are returned unchanged.
struct Floor {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (is_complex_v<T>) {
      return T{cuda::std::floor(x.real()), cuda::std::floor(x.imag())};
    } else if constexpr (cuda::std::is_integral_v<T>) {
      return x;
    } else {
      return cuda::std::floor(x);
    }
  }
};

// Imaginary part of a complex value; only callable with complex_t<T>.
struct Imag {
  template <typename T>
  __device__ auto operator()(complex_t<T> x) {
    return x.imag();
  }
};

// Elementwise natural logarithm via cuda::std::log.
struct Log {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::log(x);
  }
};

// Base-2 logarithm. Real inputs use cuda::std::log2; complex inputs take the
// natural log and divide both components by ln(2).
struct Log2 {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (!is_complex_v<T>) {
      return cuda::std::log2(x);
    } else {
      auto natural = Log{}(x);
      return {natural.real() / CUDART_LN2_F, natural.imag() / CUDART_LN2_F};
    }
  }
};

// Elementwise base-10 logarithm via cuda::std::log10.
struct Log10 {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::log10(x);
  }
};

// log(1 + z). Real inputs use cuda::std::log1p. For complex inputs the
// imaginary part is atan2(y, x + 1); the real part is computed from
// |1 + z| with hypotf when |z| is not below 0.5, and from
// 0.5 * log1p(x * (2 + x) + y * y) otherwise.
struct Log1p {
  template <typename T>
  __device__ T operator()(T z) {
    if constexpr (!is_complex_v<T>) {
      return cuda::std::log1p(z);
    } else {
      float x = z.real();
      float y = z.imag();
      float magnitude = Abs{}(z).real();
      float angle = atan2f(y, x + 1);

      // Written as a negated `<` so a NaN magnitude takes this path.
      if (!(magnitude < 0.5f)) {
        float shifted_abs = hypotf(x + 1, y);
        return {logf(shifted_abs), angle};
      }

      float r = x * (2 + x) + y * y;
      if (r == 0) { // handle underflow
        return {x, angle};
      }
      return {0.5f * log1pf(r), angle};
    }
  }
};

// Boolean negation.
struct LogicalNot {
  __device__ bool operator()(bool x) {
    return !x;
  }
};

// Arithmetic negation. Complex values are negated by subtracting from zero;
// all other types use unary minus.
struct Negative {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (!is_complex_v<T>) {
      return -x;
    } else {
      return T{0, 0} - x;
    }
  }
};

// Real part of a complex value; only callable with complex_t<T>.
struct Real {
  template <typename T>
  __device__ auto operator()(complex_t<T> x) {
    return x.real();
  }
};

// Round to an integer value with cuda::std::rint; complex inputs are rounded
// per component.
struct Round {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (!is_complex_v<T>) {
      return cuda::std::rint(x);
    } else {
      return {cuda::std::rint(x.real()), cuda::std::rint(x.imag())};
    }
  }
};

// Logistic sigmoid. The exponential is always taken of |x|, so
// y = 1 / (1 + exp(|x|)); negative inputs return y and all others 1 - y.
struct Sigmoid {
  template <typename T>
  __device__ T operator()(T x) {
    T y = 1 / (1 + cuda::std::exp(cuda::std::abs(x)));
    if (x < 0) {
      return y;
    }
    return 1 - y;
  }
};

// Sign of x.
//   complex:  0 stays 0, otherwise x / |x|.
//   bfloat16: (x > 0) - (x < 0), routed through float.
//   unsigned: 1 when non-zero, else 0.
//   other:    (x > 0) - (x < 0).
struct Sign {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (is_complex_v<T>) {
      if (x.real() == 0 && x.imag() == 0) {
        return x;
      }
      return x / Abs()(x);
    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
      return static_cast<float>((x > T(0.f)) - (x < T(0.f)));
    } else if constexpr (cuda::std::is_unsigned_v<T>) {
      return x != 0;
    } else {
      return (x > T(0)) - (x < T(0));
    }
  }
};

// Elementwise sine via cuda::std::sin.
struct Sin {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::sin(x);
  }
};

// Elementwise hyperbolic sine via cuda::std::sinh.
struct Sinh {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::sinh(x);
  }
};

// Elementwise square, computed as x * x.
struct Square {
  template <typename T>
  __device__ T operator()(T x) {
    return x * x;
  }
};

// Elementwise square root via cuda::std::sqrt.
struct Sqrt {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::sqrt(x);
  }
};

// Reciprocal square root. 16-bit floats are widened to float before calling
// rsqrt; complex values use 1 / Sqrt(x); other types call rsqrt directly.
struct Rsqrt {
  template <typename T>
  __device__ T operator()(T x) {
    if constexpr (cuda::std::is_same_v<T, __half>) {
      return rsqrt(__half2float(x));
    } else if constexpr (cuda::std::is_same_v<T, __nv_bfloat16>) {
      return rsqrt(__bfloat162float(x));
    } else if constexpr (is_complex_v<T>) {
      return 1.0f / Sqrt{}(x);
    } else {
      return rsqrt(x);
    }
  }
};

// Elementwise tangent via cuda::std::tan.
struct Tan {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::tan(x);
  }
};

// Elementwise hyperbolic tangent via cuda::std::tanh.
struct Tanh {
  template <typename T>
  __device__ T operator()(T x) {
    return cuda::std::tanh(x);
  }
};

// Converts x to FP8 (e4m3) by constructing an __nv_fp8_e4m3 and returning its
// raw storage byte (`__x`).
struct ToFP8 {
  template <typename T>
  __device__ uint8_t operator()(T x) {
    return __nv_fp8_e4m3(x).__x;
  }
};

// Decodes a raw FP8 (e4m3) storage byte to float.
struct FromFP8 {
  __device__ float operator()(uint8_t x) {
    // Place the byte in the same `__x` storage field that ToFP8 reads, then
    // use the type's float conversion.
    __nv_fp8_e4m3 encoded;
    encoded.__x = x;
    return static_cast<float>(encoded);
  }
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device/utils.cuh
================================================
// Copyright © 2025 Apple Inc.

// This file must not include any host-only code, utilities that work under both
// host and device can be put here.
//
// See more about the requirements at:
// https://docs.nvidia.com/cuda/nvrtc/#language

#pragma once

#include "mlx/backend/cuda/device/complex.cuh"
#include "mlx/backend/cuda/device/config.h"

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda/std/array>
#include <cuda/std/limits>
#include <cuda/std/tuple>

namespace mlx::core::cu {

///////////////////////////////////////////////////////////////////////////////
// CUDA kernel utils
///////////////////////////////////////////////////////////////////////////////

// To pass shape/strides to kernels via constant memory, their size must be
// known at compile time.
// Both are fixed-length arrays of MAX_NDIM entries: 32-bit extents for Shape
// and 64-bit strides for Strides.
using Shape = cuda::std::array<int32_t, MAX_NDIM>;
using Strides = cuda::std::array<int64_t, MAX_NDIM>;

// Vectorized load/store.
// Pack of N values of type T whose alignment equals its total size
// (sizeof(T) * N), so a properly aligned pack can be moved as one unit.
template <typename T, int N>
struct alignas(sizeof(T) * N) AlignedVector {
  T val[N];

  // Mutable access to lane i (no bounds checking).
  __device__ T& operator[](int i) {
    return val[i];
  }

  // Read-only access to lane i, returned by value (no bounds checking).
  __device__ T operator[](int i) const {
    return val[i];
  }
};

// True when the address held by `x` is a multiple of N * sizeof(T), i.e.
// suitable for an N-wide vector access.
template <int N, typename T>
inline __host__ __device__ bool is_aligned(T* x) {
  const auto address = reinterpret_cast<uintptr_t>(x);
  constexpr auto kVectorBytes = N * sizeof(T);
  return (address % kVectorBytes) == 0;
}

// Loads the `offset`-th N-wide vector from `ptr` without checking alignment
// or bounds; the caller must guarantee both.
template <int N, typename T>
inline __device__ AlignedVector<T, N> unsafe_load_vector(
    const T* ptr,
    uint32_t offset) {
  using Vec = AlignedVector<T, N>;
  return reinterpret_cast<const Vec*>(ptr)[offset];
}

// Loads the `offset`-th N-wide vector from `ptr`. Uses a single vector load
// when `ptr` is suitably aligned and falls back to N scalar loads otherwise.
// No bounds checking is performed.
template <int N, typename T>
inline __device__ AlignedVector<T, N> load_vector(
    const T* ptr,
    uint32_t offset) {
  using Vec = AlignedVector<T, N>;
  if (!is_aligned<N>(ptr)) {
    Vec lanes;
#pragma unroll
    for (int lane = 0; lane < N; ++lane) {
      lanes[lane] = ptr[offset * N + lane];
    }
    return lanes;
  }
  return reinterpret_cast<const Vec*>(ptr)[offset];
}

// Bounds-aware load of the `offset`-th N-wide vector from `ptr`, which holds
// `size` elements. A single vector load is used when `ptr` is aligned and the
// whole vector is in range; otherwise lanes are read one by one and lanes
// past `size` receive `fallback`.
template <int N, typename T, typename SizeT>
inline __device__ AlignedVector<T, N>
load_vector(const T* ptr, uint32_t offset, SizeT size, T fallback) {
  using Vec = AlignedVector<T, N>;
  const bool whole_vector = is_aligned<N>(ptr) && (offset + 1) * N <= size;
  if (whole_vector) {
    return reinterpret_cast<const Vec*>(ptr)[offset];
  }

  Vec lanes;
#pragma unroll
  for (int lane = 0; lane < N; ++lane) {
    if ((N * offset + lane) < size) {
      lanes[lane] = ptr[offset * N + lane];
    } else {
      lanes[lane] = fallback;
    }
  }
  return lanes;
}

// Bounds-aware, strided load of the `offset`-th N-wide vector. Logical
// element e lives at ptr[stride * e]. A single vector load is used only when
// `ptr` is aligned, `stride` is 1 and the whole vector is in range; otherwise
// lanes are read one by one and lanes past `size` receive `fallback`.
template <int N, typename T, typename SizeT>
inline __device__ AlignedVector<T, N> load_vector(
    const T* ptr,
    uint32_t offset,
    SizeT size,
    int64_t stride,
    T fallback) {
  using Vec = AlignedVector<T, N>;
  const bool whole_vector =
      is_aligned<N>(ptr) && stride == 1 && (offset + 1) * N <= size;
  if (whole_vector) {
    return reinterpret_cast<const Vec*>(ptr)[offset];
  }

  Vec lanes;
#pragma unroll
  for (int lane = 0; lane < N; ++lane) {
    if ((N * offset + lane) < size) {
      lanes[lane] = ptr[stride * (offset * N + lane)];
    } else {
      lanes[lane] = fallback;
    }
  }
  return lanes;
}

// Stores `vec` as the `offset`-th N-wide vector of `ptr` without checking
// alignment or bounds; the caller must guarantee both.
template <int N, typename T>
inline __device__ void
unsafe_store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
  using Vec = AlignedVector<T, N>;
  reinterpret_cast<Vec*>(ptr)[offset] = vec;
}

// Stores `vec` as the `offset`-th N-wide vector of `ptr`. Uses a single
// vector store when `ptr` is suitably aligned and N scalar stores otherwise.
// No bounds checking is performed.
template <int N, typename T>
inline __device__ void
store_vector(T* ptr, uint32_t offset, const AlignedVector<T, N>& vec) {
  using Vec = AlignedVector<T, N>;
  if (!is_aligned<N>(ptr)) {
#pragma unroll
    for (int lane = 0; lane < N; ++lane) {
      ptr[offset * N + lane] = vec[lane];
    }
    return;
  }
  reinterpret_cast<Vec*>(ptr)[offset] = vec;
}

// Bounds-aware store of `vec` as the `offset`-th N-wide vector of `ptr`,
// which holds `size` elements. A single vector store is used when `ptr` is
// aligned and the whole vector is in range; otherwise only the lanes that
// fall below `size` are written.
template <int N, typename T, typename SizeT>
inline __device__ void store_vector(
    T* ptr,
    uint32_t offset,
    const AlignedVector<T, N>& vec,
    SizeT size) {
  using Vec = AlignedVector<T, N>;
  const bool whole_vector = is_aligned<N>(ptr) && (offset + 1) * N <= size;
  if (whole_vector) {
    reinterpret_cast<Vec*>(ptr)[offset] = vec;
    return;
  }

  for (int lane = 0; lane < N; ++lane) {
    if (!((offset * N + lane) < size)) {
      break;
    }
    ptr[offset * N + lane] = vec[lane];
  }
}

// Bounds-aware, strided store of `vec` as the `offset`-th N-wide vector.
// Logical element e is written to ptr[stride * e]. A single vector store is
// used only when `ptr` is aligned, the whole vector is in range and `stride`
// is 1; otherwise only the lanes that fall below `size` are written.
template <int N, typename T, typename SizeT>
inline __device__ void store_vector(
    T* ptr,
    uint32_t offset,
    const AlignedVector<T, N>& vec,
    SizeT size,
    int64_t stride) {
  using Vec = AlignedVector<T, N>;
  const bool whole_vector =
      is_aligned<N>(ptr) && (offset + 1) * N <= size && stride == 1;
  if (whole_vector) {
    reinterpret_cast<Vec*>(ptr)[offset] = vec;
    return;
  }

  for (int lane = 0; lane < N; ++lane) {
    if (!((offset * N + lane) < size)) {
      break;
    }
    ptr[stride * (offset * N + lane)] = vec[lane];
  }
}

///////////////////////////////////////////////////////////////////////////////
// Type limits utils
///////////////////////////////////////////////////////////////////////////////

// Extreme values of a type, usable on host and device.
//
// Primary template: every accessor forwards to cuda::std::numeric_limits<T>,
// so the finite_* variants are identical to max() / min(). Specializations
// below override this for specific types.
template <typename T, typename = void>
struct Limits {
  static constexpr __host__ __device__ T max() {
    return cuda::std::numeric_limits<T>::max();
  }
  static constexpr __host__ __device__ T min() {
    return cuda::std::numeric_limits<T>::min();
  }
  static constexpr __host__ __device__ T finite_max() {
    return cuda::std::numeric_limits<T>::max();
  }
  static constexpr __host__ __device__ T finite_min() {
    return cuda::std::numeric_limits<T>::min();
  }
};

// float / double: max() and min() are +infinity and -infinity, while
// finite_max() and finite_min() are the largest and lowest finite values.
template <typename T>
struct Limits<
    T,
    cuda::std::enable_if_t<
        cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double>>> {
  static constexpr __host__ __device__ T max() {
    return cuda::std::numeric_limits<T>::infinity();
  }
  static constexpr __host__ __device__ T min() {
    return -cuda::std::numeric_limits<T>::infinity();
  }
  static constexpr __host__ __device__ T finite_max() {
    return cuda::std::numeric_limits<T>::max();
  }
  static constexpr __host__ __device__ T finite_min() {
    return cuda::std::numeric_limits<T>::lowest();
  }
};

// __half / __nv_bfloat16: same convention as float / double (infinities for
// max() / min(), finite extremes for finite_max() / finite_min()).
//
// CUDA 11 does not have host side arithmetic operators for half types.
// When CUDART_VERSION < 12000 and __CUDA_ARCH__ < 800, the negative values are
// therefore produced from the float limits and converted to T on return.
template <typename T>
struct Limits<
    T,
    cuda::std::enable_if_t<
        cuda::std::is_same_v<T, __half> ||
        cuda::std::is_same_v<T, __nv_bfloat16>>> {
  static constexpr __host__ __device__ T max() {
    return cuda::std::numeric_limits<T>::infinity();
  }
  static constexpr __host__ __device__ T min() {
#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
    return -cuda::std::numeric_limits<float>::infinity();
#else
    return -cuda::std::numeric_limits<T>::infinity();
#endif
  }
  static constexpr __host__ __device__ T finite_max() {
    return cuda::std::numeric_limits<T>::max();
  }
  static constexpr __host__ __device__ T finite_min() {
#if CUDART_VERSION < 12000 && __CUDA_ARCH__ < 800
    return cuda::std::numeric_limits<float>::lowest();
#else
    return cuda::std::numeric_limits<T>::lowest();
#endif
  }
};

// bool: max() is true and min() is false. This specialization provides no
// finite_max() / finite_min().
template <>
struct Limits<bool> {
  static constexpr __host__ __device__ bool max() {
    return true;
  }
  static constexpr __host__ __device__ bool min() {
    return false;
  }
};

// complex_t<T>: both components take the corresponding Limits<T> value. This
// specialization provides no finite_max() / finite_min().
template <typename T>
struct Limits<complex_t<T>> {
  static constexpr __host__ __device__ complex_t<T> max() {
    return {Limits<T>::max(), Limits<T>::max()};
  }
  static constexpr __host__ __device__ complex_t<T> min() {
    return {Limits<T>::min(), Limits<T>::min()};
  }
};

///////////////////////////////////////////////////////////////////////////////
// Indexing utils
///////////////////////////////////////////////////////////////////////////////

// Maps a flat element index to a memory offset for an array with the given
// shape and strides (ndim known only at run time). Dimensions are peeled off
// from the last to the first, stopping early once the remaining index is 0.
template <typename IdxT = int64_t>
inline __host__ __device__ IdxT
elem_to_loc(IdxT elem, const int* shape, const int64_t* strides, int ndim) {
  IdxT loc = 0;
  int axis = ndim - 1;
  while (axis >= 0 && elem > 0) {
    const int extent = shape[axis];
    loc += (elem % extent) * IdxT(strides[axis]);
    elem /= extent;
    --axis;
  }
  return loc;
}

// Same mapping as elem_to_loc, with the number of dimensions fixed at compile
// time so the loop can be fully unrolled.
template <int NDIM, typename IdxT = int64_t>
inline __host__ __device__ IdxT
elem_to_loc_nd(IdxT elem, const int* shape, const int64_t* strides) {
  IdxT loc = 0;
#pragma unroll
  for (int step = 0; step < NDIM; ++step) {
    // Walk the axes from last to first.
    const int axis = NDIM - 1 - step;
    const int extent = shape[axis];
    loc += (elem % extent) * IdxT(strides[axis]);
    elem /= extent;
  }
  return loc;
}

// Compile-time-NDIM mapping of one flat element index to offsets in two
// arrays that share `shape` but have different strides.
template <int NDIM, typename IdxT = int64_t>
inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc_nd(
    IdxT elem,
    const int* shape,
    const int64_t* a_strides,
    const int64_t* b_strides) {
  IdxT a_loc = 0;
  IdxT b_loc = 0;
#pragma unroll
  for (int step = 0; step < NDIM; ++step) {
    // Walk the axes from last to first.
    const int axis = NDIM - 1 - step;
    const int extent = shape[axis];
    int coord = elem % extent;
    a_loc += coord * IdxT(a_strides[axis]);
    b_loc += coord * IdxT(b_strides[axis]);
    elem /= extent;
  }
  return cuda::std::make_tuple(a_loc, b_loc);
}

// Compile-time-NDIM mapping of one flat element index to offsets in three
// arrays that share `shape` but have different strides.
template <int NDIM, typename IdxT = int64_t>
inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc_nd(
    IdxT elem,
    const int* shape,
    const int64_t* a_strides,
    const int64_t* b_strides,
    const int64_t* c_strides) {
  IdxT a_loc = 0;
  IdxT b_loc = 0;
  IdxT c_loc = 0;
#pragma unroll
  for (int step = 0; step < NDIM; ++step) {
    // Walk the axes from last to first.
    const int axis = NDIM - 1 - step;
    const int extent = shape[axis];
    int coord = elem % extent;
    a_loc += coord * IdxT(a_strides[axis]);
    b_loc += coord * IdxT(b_strides[axis]);
    c_loc += coord * IdxT(c_strides[axis]);
    elem /= extent;
  }
  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
}

// Run-time-ndim mapping of one flat element index to offsets in two arrays
// that share `shape` but have different strides. All dimensions are visited.
template <typename IdxT = int64_t>
inline __host__ __device__ cuda::std::tuple<IdxT, IdxT> elem_to_loc(
    IdxT elem,
    const int* shape,
    const int64_t* a_strides,
    const int64_t* b_strides,
    int ndim) {
  IdxT a_loc = 0;
  IdxT b_loc = 0;
  int axis = ndim - 1;
  while (axis >= 0) {
    const int extent = shape[axis];
    int coord = elem % extent;
    a_loc += coord * IdxT(a_strides[axis]);
    b_loc += coord * IdxT(b_strides[axis]);
    elem /= extent;
    --axis;
  }
  return cuda::std::make_tuple(a_loc, b_loc);
}

// Run-time-ndim mapping of one flat element index to offsets in three arrays
// that share `shape` but have different strides. All dimensions are visited.
template <typename IdxT = int64_t>
inline __host__ __device__ cuda::std::tuple<IdxT, IdxT, IdxT> elem_to_loc(
    IdxT elem,
    const int* shape,
    const int64_t* a_strides,
    const int64_t* b_strides,
    const int64_t* c_strides,
    int ndim) {
  IdxT a_loc = 0;
  IdxT b_loc = 0;
  IdxT c_loc = 0;
  int axis = ndim - 1;
  while (axis >= 0) {
    const int extent = shape[axis];
    int coord = elem % extent;
    a_loc += coord * IdxT(a_strides[axis]);
    b_loc += coord * IdxT(b_strides[axis]);
    c_loc += coord * IdxT(c_strides[axis]);
    elem /= extent;
    --axis;
  }
  return cuda::std::make_tuple(a_loc, b_loc, c_loc);
}

///////////////////////////////////////////////////////////////////////////////
// Elem to loc in a loop utils
///////////////////////////////////////////////////////////////////////////////

// Incrementally tracks the memory offset of an element while stepping through
// an array, so the offset does not have to be recomputed from scratch at
// every step.
//
// The general template handles dimension `dim - 1` itself and delegates the
// remaining leading dimensions to a nested looper (one template level per
// dimension). `index` is the current coordinate along dimension `dim - 1` and
// `offset` the current location.
template <int DIM, bool General = true, typename OffsetT = size_t>
struct LoopedElemToLoc {
  int dim;
  LoopedElemToLoc<DIM - 1, General, OffsetT> inner_looper;
  OffsetT offset{0};
  int index{0};

  __device__ LoopedElemToLoc(int dim) : dim(dim), inner_looper(dim - 1) {}

  // Advance by one element.
  __device__ void next(const int* shape, const int64_t* strides) {
    if (dim == 0) {
      return;
    }
    index++;
    offset += OffsetT(strides[dim - 1]);
    // Ran past the end of this dimension: wrap to 0, step the nested looper
    // and restart from its offset.
    if (index >= shape[dim - 1]) {
      index = 0;
      inner_looper.next(shape, strides);
      offset = inner_looper.offset;
    }
  }

  // Advance by n elements.
  __device__ void next(int n, const int* shape, const int64_t* strides) {
    if (dim == 0) {
      return;
    }
    index += n;
    offset += n * OffsetT(strides[dim - 1]);

    if (index >= shape[dim - 1]) {
      // `extra` is how far past the end of this dimension we landed.
      int extra = index - shape[dim - 1];
      if (extra >= shape[dim - 1]) {
        // Overshot by one or more whole rows: carry them all into the nested
        // looper at once and keep only the remainder.
        inner_looper.next(1 + extra / shape[dim - 1], shape, strides);
        extra = extra % shape[dim - 1];
      } else {
        inner_looper.next(shape, strides);
      }
      index = 0;
      offset = inner_looper.offset;
      // Re-apply the remainder inside the new row.
      if (extra > 0) {
        next(extra, shape, strides);
      }
    }
  }

  // Current memory offset.
  __device__ OffsetT location() {
    return offset;
  }
};

// Innermost level of the general looper. `index` counts elements; with more
// than one run-time dimension the offset is recomputed with elem_to_loc,
// otherwise it follows strides[0] directly.
template <typename OffsetT>
struct LoopedElemToLoc<1, true, OffsetT> {
  int dim;
  OffsetT offset{0};
  int index{0};

  __device__ LoopedElemToLoc(int dim) : dim(dim) {}

  // Advance by one element.
  __device__ void next(const int* shape, const int64_t* strides) {
    ++index;
    if (dim <= 1) {
      offset += OffsetT(strides[0]);
      return;
    }
    offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
  }

  // Advance by n elements.
  __device__ void next(int n, const int* shape, const int64_t* strides) {
    index += n;
    if (dim <= 1) {
      offset = index * OffsetT(strides[0]);
      return;
    }
    offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
  }

  // Current memory offset.
  __device__ OffsetT location() {
    return offset;
  }
};

// Non-general single-dimension looper: keeps only an offset and advances it
// by strides[0] per element. The shape argument and the constructor argument
// are ignored.
template <typename OffsetT>
struct LoopedElemToLoc<1, false, OffsetT> {
  OffsetT offset{0};

  __device__ LoopedElemToLoc(int) {}

  // Advance by one element.
  __device__ void next(const int*, const int64_t* strides) {
    offset += OffsetT(strides[0]);
  }

  // Advance by n elements.
  __device__ void next(int n, const int*, const int64_t* strides) {
    offset += n * OffsetT(strides[0]);
  }

  // Current memory offset.
  __device__ OffsetT location() {
    return offset;
  }
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/jit_module.h"
#include "mlx/backend/cuda/worker.h"
#include "mlx/backend/gpu/device_info.h"
#include "mlx/utils.h"

#include <fmt/format.h>
#include <nvtx3/nvtx3.hpp>
#include <future>
#include <unordered_set>

namespace mlx::core::cu {

namespace {

// Whether work is recorded into CUDA graphs. The MLX_USE_CUDA_GRAPHS setting
// (default: true) is read on the first call and cached afterwards.
bool use_cuda_graphs() {
  static const bool enabled = env::get_var("MLX_USE_CUDA_GRAPHS", true);
  return enabled;
}

// Base filename for dumping committed graphs, taken from the
// MLX_SAVE_CUDA_GRAPHS_DOT_FILE environment variable. Returns nullptr when the
// variable is unset or set to an empty string. The lookup happens once; later
// calls return the cached pointer.
const char* save_cuda_graphs_dot_file() {
  static const char* const cached_path = []() -> const char* {
    const char* value = std::getenv("MLX_SAVE_CUDA_GRAPHS_DOT_FILE");
    const bool is_blank = value != nullptr && value[0] == '\0';
    return is_blank ? nullptr : value;
  }();
  return cached_path;
}

// A dim3 counts as "empty" when every component is 0 or every component is 1.
inline bool is_empty_dim(dim3 dim) {
  const bool all_zero = dim.x == 0 && dim.y == 0 && dim.z == 0;
  const bool all_one = dim.x == 1 && dim.y == 1 && dim.z == 1;
  return all_zero || all_one;
}

} // namespace

// Wrap the CUDA device with ordinal `device` and cache the device attributes
// that the accessors of this class report (compute capability, concurrent
// managed access, host native atomics, managed memory, memory pools). Every
// query is wrapped in CHECK_CUDA_ERROR.
Device::Device(int device) : device_(device) {
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &compute_capability_major_, cudaDevAttrComputeCapabilityMajor, device_));
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &compute_capability_minor_, cudaDevAttrComputeCapabilityMinor, device_));
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &concurrent_managed_access_,
      cudaDevAttrConcurrentManagedAccess,
      device_));
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &host_native_atomic_, cudaDevAttrHostNativeAtomicSupported, device_));
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &managed_memory_, cudaDevAttrManagedMemory, device_));
  CHECK_CUDA_ERROR(cudaDeviceGetAttribute(
      &memory_pools_, cudaDevAttrMemoryPoolsSupported, device_));
}

// Destroy the lazily created cuDNN and cuBLASLt handles, if they exist.
Device::~Device() {
  if (cudnn_handle_ != nullptr) {
    CHECK_CUDNN_ERROR(cudnnDestroy(cudnn_handle_));
  }
  if (cublaslt_handle_ != nullptr) {
    CHECK_CUBLAS_ERROR(cublasLtDestroy(cublaslt_handle_));
  }
}

// Make this device the calling thread's current CUDA device.
void Device::make_current() {
  // The current device is set/queried very frequently, so the last value set
  // by this thread is cached to avoid redundant CUDA API calls. The cache
  // starts at -1 so that the first call on every new thread always reaches
  // cudaSetDevice (which establishes the CUDA primary context); otherwise
  // device 0 would never get set on a new thread.
  static thread_local int active_device = -1;
  if (active_device == device_) {
    return;
  }
  CHECK_CUDA_ERROR(cudaSetDevice(device_));
  active_device = device_;
}

// Return the command encoder associated with stream `s`, creating it on first
// use. Encoders are keyed by the stream index.
CommandEncoder& Device::get_command_encoder(Stream s) {
  // try_emplace only constructs a new encoder when the key is absent.
  return encoders_.try_emplace(s.index, *this).first->second;
}

// Return the cuBLASLt handle of this device, creating it on first use after
// making the device current.
cublasLtHandle_t Device::get_cublaslt_handle() {
  if (cublaslt_handle_) {
    return cublaslt_handle_;
  }
  make_current();
  CHECK_CUBLAS_ERROR(cublasLtCreate(&cublaslt_handle_));
  return cublaslt_handle_;
}

// Return the cuDNN handle of this device, creating it on first use after
// making the device current.
cudnnHandle_t Device::get_cudnn_handle() {
  if (cudnn_handle_) {
    return cudnn_handle_;
  }
  make_current();
  CHECK_CUDNN_ERROR(cudnnCreate(&cudnn_handle_));
  return cudnn_handle_;
}

// Make the encoder's device current and, when CUDA graphs are enabled, begin
// a thread-local stream capture on the encoder's stream.
CommandEncoder::CaptureContext::CaptureContext(CommandEncoder& enc) : enc(enc) {
  enc.device().make_current();
  if (use_cuda_graphs()) {
    CHECK_CUDA_ERROR(
        cudaStreamBeginCapture(enc.stream(), cudaStreamCaptureModeThreadLocal));
  }
}

// Finish the capture. With graphs enabled the captured graph is added to the
// encoder as a child node unless `discard` was set; with graphs disabled only
// the encoder's node counter is bumped.
CommandEncoder::CaptureContext::~CaptureContext() {
  if (use_cuda_graphs()) {
    graph.end_capture(enc.stream());
    if (!discard) {
      enc.add_graph_node(graph);
    }
  } else {
    enc.node_count_++;
  }
}

// Put the encoder into concurrent mode: nodes added from now on are collected
// in `concurrent_nodes_` until this context is destroyed.
CommandEncoder::ConcurrentContext::ConcurrentContext(CommandEncoder& enc)
    : enc(enc) {
  this->enc.in_concurrent_ = true;
}

// Leave concurrent mode. With graphs enabled, all nodes collected while the
// context was alive are joined on a single empty node, which then becomes the
// producer recorded for the pending outputs.
CommandEncoder::ConcurrentContext::~ConcurrentContext() {
  enc.in_concurrent_ = false;
  if (!use_cuda_graphs()) {
    return;
  }

  // An empty graph node serves as the synchronization point.
  CommandEncoder::GraphNode empty{
      nullptr, "E", std::to_string(enc.node_count_++)};
  CHECK_CUDA_ERROR(cudaGraphAddEmptyNode(&empty.node, enc.graph_, NULL, 0));

  // Record the concurrent -> empty edges and their contribution to the
  // dependency key.
  for (const auto& concurrent : enc.concurrent_nodes_) {
    enc.from_nodes_.push_back(concurrent.node);
    enc.to_nodes_.push_back(empty.node);
    enc.graph_deps_key_ += concurrent.id + "-" + empty.id + "-";
  }

  // Record the input -> concurrent edges. The pending outputs are taken away
  // first so that this call does not register the concurrent nodes as their
  // producers.
  const auto pending_outputs = std::move(enc.active_outputs_);
  enc.insert_graph_dependencies(std::move(enc.concurrent_nodes_));

  // The empty node is the producer of every pending output.
  for (const auto output_id : pending_outputs) {
    enc.node_map_.insert_or_assign(output_id, empty);
  }
}

// Assign the next node id to `node` and either queue it (concurrent mode) or
// wire up its dependencies right away.
void CommandEncoder::insert_graph_dependencies(GraphNode node) {
  node.id = std::to_string(node_count_++);
  if (in_concurrent_) {
    concurrent_nodes_.push_back(std::move(node));
    return;
  }
  std::vector<GraphNode> single;
  single.push_back(std::move(node));
  insert_graph_dependencies(std::move(single));
}

void CommandEncoder::insert_graph_dependencies(std::vector<GraphNode> nodes) {
  for (auto& node : nodes) {
    graph_nodes_key_ += node.node_type;
    graph_nodes_key_ += "-";
  }
  std::vector<GraphNode> deps;
  {
    // Dependencies must be added in the same order to produce a consistent
    // topology
    std::unordered_set<cudaGraphNode_t> set_deps;
    for (auto d : active_deps_) {
      if (auto it = node_map_.find(d); it != node_map_.end()) {
        auto [_, inserted] = set_deps.insert(it->second.node);
        if (inserted) {
          deps.push_back(it->second);
        }
      }
    }
  }
  active_deps_.clear();

  for (auto o : active_outputs_) {
    for (auto& node : nodes) {
      node_map_.emplace(o, node).first->second = node;
    }
  }
  active_outputs_.clear();

  for (auto& from : deps) {
    for (auto& to : nodes) {
      from_nodes_.push_back(from.node);
      to_nodes_.push_back(to.node);
      graph_deps_key_ += from.id;
      graph_deps_key_ += "-";
      graph_deps_key_ += to.id;
      graph_deps_key_ += "-";
    }
  }
}

// Default (max ops, max MB) per graph, chosen from the device's compute
// capability. Can be tuned with MLX_MAX_OPS_PER_BUFFER, MLX_MAX_MB_PER_BUFFER
std::pair<int, int> get_graph_limits(Device& d) {
  const int cc =
      d.compute_capability_major() * 100 + d.compute_capability_minor() * 10;

  // Defaults for architectures without a dedicated entry below.
  int ops = 20;
  int mb = 100;
  if (cc == 800) { // A100
    ops = 20;
    mb = 400;
  } else if (cc == 900 || cc == 1000 || cc == 1200) {
    // H100, B200, Consumer Blackwell
    ops = 100;
    mb = 1000;
  } else if (cc == 1210) { // DGX Spark
    ops = 20;
    mb = 25;
  }
  return {env::max_ops_per_buffer(ops), env::max_mb_per_buffer(mb)};
}

// Build an encoder for device `d`: its stream, graph and worker are created
// from `d`, the graph cache capacity comes from MLX_CUDA_GRAPH_CACHE_SIZE
// (default 400), and the per-graph limits come from get_graph_limits().
CommandEncoder::CommandEncoder(Device& d)
    : device_(d),
      stream_(d),
      graph_(d),
      worker_(d),
      graph_cache_("MLX_CUDA_GRAPH_CACHE_SIZE", /* default_capacity */ 400) {
  const auto [ops_limit, mb_limit] = get_graph_limits(d);
  max_ops_per_graph_ = ops_limit;
  max_mb_per_graph_ = mb_limit;
}

// Hand `task` over to the worker as a completion handler.
void CommandEncoder::add_completed_handler(std::function<void()> task) {
  auto handler = std::move(task);
  worker_.add_task(std::move(handler));
}

// Register `arr` as an input of the next node: its data size is added to the
// graph's byte count and its buffer pointer becomes a pending dependency.
// Does nothing when CUDA graphs are disabled.
void CommandEncoder::set_input_array(const array& arr) {
  if (use_cuda_graphs()) {
    bytes_in_graph_ += arr.data_size();
    const auto buffer_id =
        reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
    active_deps_.push_back(buffer_id);
  }
}

// Register `arr` as an output of the next node. The buffer pointer is recorded
// both as a pending dependency and as a pending output. Does nothing when CUDA
// graphs are disabled.
void CommandEncoder::set_output_array(const array& arr) {
  if (use_cuda_graphs()) {
    const auto buffer_id =
        reinterpret_cast<std::uintptr_t>(arr.buffer().ptr());
    active_deps_.push_back(buffer_id);
    active_outputs_.push_back(buffer_id);
  }
}

// Add a kernel given as a CUDA runtime function pointer. With graphs enabled
// the kernel becomes a graph node (with a cluster-dimension attribute when
// `cluster_dim` is not empty); otherwise it is launched directly on the
// encoder's stream.
void CommandEncoder::add_kernel_node_raw(
    void* func,
    dim3 grid_dim,
    dim3 block_dim,
    dim3 cluster_dim,
    uint32_t smem_bytes,
    void** params) {
  const bool use_cluster = !is_empty_dim(cluster_dim);
  assert(!use_cluster || device_.compute_capability_major() >= 9);

  if (use_cuda_graphs()) {
    cudaKernelNodeParams kernel_params = {0};
    kernel_params.func = func;
    kernel_params.gridDim = grid_dim;
    kernel_params.blockDim = block_dim;
    kernel_params.kernelParams = params;
    kernel_params.sharedMemBytes = smem_bytes;
    cudaGraphNode_t node = add_kernel_node_raw(kernel_params);
    if (use_cluster) {
      cudaKernelNodeAttrValue attr = {};
      attr.clusterDim.x = cluster_dim.x;
      attr.clusterDim.y = cluster_dim.y;
      attr.clusterDim.z = cluster_dim.z;
      CHECK_CUDA_ERROR(cudaGraphKernelNodeSetAttribute(
          node, cudaLaunchAttributeClusterDimension, &attr));
    }
    return;
  }

  // Graphs disabled: count the op and launch immediately.
  node_count_++;
  cudaLaunchConfig_t config = {};
  config.gridDim = grid_dim;
  config.blockDim = block_dim;
  config.dynamicSmemBytes = smem_bytes;
  config.stream = stream();
  // `attr` must outlive the launch call because `config` points at it.
  cudaLaunchAttribute attr = {};
  if (use_cluster) {
    attr.id = cudaLaunchAttributeClusterDimension;
    attr.val.clusterDim.x = cluster_dim.x;
    attr.val.clusterDim.y = cluster_dim.y;
    attr.val.clusterDim.z = cluster_dim.z;
    config.attrs = &attr;
    config.numAttrs = 1;
  }
  CHECK_CUDA_ERROR(cudaLaunchKernelExC(&config, func, params));
}

// Add a kernel given as a CUDA driver API function handle. With graphs
// enabled the kernel becomes a graph node (with a cluster-dimension attribute
// when `cluster_dim` is not empty); otherwise it is launched directly on the
// encoder's stream.
void CommandEncoder::add_kernel_node_raw(
    CUfunction func,
    dim3 grid_dim,
    dim3 block_dim,
    dim3 cluster_dim,
    uint32_t smem_bytes,
    void** params) {
  const bool use_cluster = !is_empty_dim(cluster_dim);
  assert(!use_cluster || device_.compute_capability_major() >= 9);

  if (use_cuda_graphs()) {
    CUDA_KERNEL_NODE_PARAMS kernel_params = {};
    kernel_params.func = func;
    kernel_params.gridDimX = grid_dim.x;
    kernel_params.gridDimY = grid_dim.y;
    kernel_params.gridDimZ = grid_dim.z;
    kernel_params.blockDimX = block_dim.x;
    kernel_params.blockDimY = block_dim.y;
    kernel_params.blockDimZ = block_dim.z;
    kernel_params.kernelParams = params;
    kernel_params.sharedMemBytes = smem_bytes;
    CUgraphNode node = add_kernel_node_raw(kernel_params);
    if (use_cluster) {
      CUlaunchAttributeValue attr = {};
      attr.clusterDim.x = cluster_dim.x;
      attr.clusterDim.y = cluster_dim.y;
      attr.clusterDim.z = cluster_dim.z;
      CHECK_CUDA_ERROR(cuGraphKernelNodeSetAttribute(
          node, CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION, &attr));
    }
    return;
  }

  // Graphs disabled: count the op and launch immediately.
  node_count_++;
  CUlaunchConfig config = {};
  config.gridDimX = grid_dim.x;
  config.gridDimY = grid_dim.y;
  config.gridDimZ = grid_dim.z;
  config.blockDimX = block_dim.x;
  config.blockDimY = block_dim.y;
  config.blockDimZ = block_dim.z;
  config.sharedMemBytes = smem_bytes;
  config.hStream = stream();
  // `attr` must outlive the launch call because `config` points at it.
  CUlaunchAttribute attr = {};
  if (use_cluster) {
    attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
    attr.value.clusterDim.x = cluster_dim.x;
    attr.value.clusterDim.y = cluster_dim.y;
    attr.value.clusterDim.z = cluster_dim.z;
    config.attrs = &attr;
    config.numAttrs = 1;
  }
  CHECK_CUDA_ERROR(cuLaunchKernelEx(&config, func, params, nullptr));
}

// Add a kernel node described by runtime-API `params` to the encoder's graph.
// The node is created without explicit dependencies; edges are derived from
// the pending inputs/outputs by insert_graph_dependencies(). Returns the new
// node so callers can set further attributes on it.
cudaGraphNode_t CommandEncoder::add_kernel_node_raw(
    const cudaKernelNodeParams& params) {
  cudaGraphNode_t node;
  CHECK_CUDA_ERROR(cudaGraphAddKernelNode(&node, graph_, NULL, 0, &params));
  // "K" is the node-type tag used for kernels in the graph key.
  insert_graph_dependencies(GraphNode{node, "K"});
  return node;
}

// Add a kernel node described by driver-API `params` to the encoder's graph.
// The node is created without explicit dependencies; edges are derived from
// the pending inputs/outputs by insert_graph_dependencies(). Returns the new
// node so callers can set further attributes on it.
CUgraphNode CommandEncoder::add_kernel_node_raw(
    const CUDA_KERNEL_NODE_PARAMS& params) {
  CUgraphNode node;
  CHECK_CUDA_ERROR(cuGraphAddKernelNode(&node, graph_, NULL, 0, &params));
  // "K" is the node-type tag used for kernels in the graph key.
  insert_graph_dependencies(GraphNode{node, "K"});
  return node;
}

// Builds a parenthesized key describing the nodes of `graph` and reports
// whether the graph may be updated in place.
//
// CUDA graphs do not get updated correctly if a kernel node getting updated
// has a different cluster shape than the node it's being updated with, so the
// cluster x-dimension is made part of the key and any kernel with a cluster
// y or z dimension above 1 marks the graph as not updatable. Node types other
// than child graph, host, memset, kernel, wait-event and event-record also
// mark it as not updatable. Scanning stops at the first such node.
std::pair<std::string, bool> subgraph_to_key(cudaGraph_t graph) {
  size_t num_nodes = 0;
  CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, nullptr, &num_nodes));
  if (num_nodes == 0) {
    return {"()", true};
  }

  std::vector<cudaGraphNode_t> nodes(num_nodes);
  CHECK_CUDA_ERROR(cudaGraphGetNodes(graph, nodes.data(), &num_nodes));

  std::string key = "(";
  bool is_updatable = true;
  for (size_t i = 0; is_updatable && i < nodes.size(); ++i) {
    const cudaGraphNode_t node = nodes[i];
    cudaGraphNodeType type;
    CHECK_CUDA_ERROR(cudaGraphNodeGetType(node, &type));
    switch (type) {
      case cudaGraphNodeTypeGraph: {
        // Try to be updatable for a structure like graph -> graph -> kernel
        cudaGraph_t child;
        CHECK_CUDA_ERROR(cudaGraphChildGraphNodeGetGraph(node, &child));
        const auto nested = subgraph_to_key(child);
        is_updatable = is_updatable && nested.second;
        key += nested.first;
        break;
      }
      case cudaGraphNodeTypeHost:
        key += "H";
        break;
      case cudaGraphNodeTypeMemset:
        key += "M";
        break;
      case cudaGraphNodeTypeKernel: {
        cudaLaunchAttributeValue cluster_dim;
        CHECK_CUDA_ERROR(cudaGraphKernelNodeGetAttribute(
            node, cudaLaunchAttributeClusterDimension, &cluster_dim));
        // Only allow dim.x to be greater than 1
        const bool multi_axis_cluster =
            cluster_dim.clusterDim.y > 1 || cluster_dim.clusterDim.z > 1;
        if (multi_axis_cluster) {
          is_updatable = false;
        } else {
          key += "K";
          key += std::to_string(cluster_dim.clusterDim.x);
        }
        break;
      }
      case cudaGraphNodeTypeWaitEvent:
        key += "W";
        break;
      case cudaGraphNodeTypeEventRecord:
        key += "R";
        break;
      default:
        is_updatable = false;
        break;
    }
  }
  key += ")";
  return {key, is_updatable};
}

// Add `child` to the encoder, deriving its key and updatability from the
// graph itself. With graphs disabled the child is instantiated and launched
// immediately instead.
void CommandEncoder::add_graph_node(cudaGraph_t child) {
  if (use_cuda_graphs()) {
    const auto [sub_graph_key, is_updatable] = subgraph_to_key(child);
    // The three-argument overload performs the remaining steps: it folds the
    // flag into the graph's updatability, adds the child node and records its
    // dependencies.
    add_graph_node(child, sub_graph_key, is_updatable);
    return;
  }

  node_count_++;
  CudaGraphExec graph_exec;
  graph_exec.instantiate(child);
  device_.make_current();
  CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream()));
}

// Add `child` to the encoder using a caller-supplied key and updatability
// flag. With graphs disabled the child is instantiated and launched
// immediately instead.
void CommandEncoder::add_graph_node(
    cudaGraph_t child,
    const std::string& subgraph_key,
    bool is_updatable) {
  if (use_cuda_graphs()) {
    is_graph_updatable_ &= is_updatable;
    cudaGraphNode_t node;
    CHECK_CUDA_ERROR(
        cudaGraphAddChildGraphNode(&node, graph_, NULL, 0, child));
    insert_graph_dependencies(GraphNode{node, subgraph_key});
    return;
  }

  node_count_++;
  CudaGraphExec graph_exec;
  graph_exec.instantiate(child);
  device_.make_current();
  CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream()));
}

// True once the pending work exceeds either per-graph limit: the number of
// ops, or the accumulated input size in MB (bytes >> 20).
bool CommandEncoder::needs_commit() {
  const bool too_many_ops = node_count_ > max_ops_per_graph_;
  const bool too_many_mb = (bytes_in_graph_ >> 20) > max_mb_per_graph_;
  return too_many_ops || too_many_mb;
}

// Submit everything recorded so far.
//
// Steps:
//  1. Move the pending temporaries into a completion handler so they are held
//     by that handler rather than by the encoder.
//  2. If graphs are enabled and at least one node was recorded: add the
//     collected edges to the graph, then launch it. Updatable graphs go
//     through the executable cache keyed by node types and dependencies;
//     non-updatable graphs are instantiated from scratch every time.
//  3. Optionally dump the graph to a dot file, then reset the graph state.
//  4. Commit the worker's completion handlers and reset the op/byte counters.
void CommandEncoder::commit() {
  nvtx3::scoped_range r("CommandEncoder::commit");
  if (!temporaries_.empty()) {
    // The handler body is intentionally empty: it only owns the temporaries.
    add_completed_handler([temporaries = std::move(temporaries_)]() {});
  }
  if (use_cuda_graphs() && node_count_ > 0) {
    if (!from_nodes_.empty()) {
      // The cudaGraphAddDependencies signature gained an edge-data argument
      // in CUDA 13.
#if CUDART_VERSION >= 13000
      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
          graph_,
          from_nodes_.data(),
          to_nodes_.data(),
          nullptr, // edgeData
          from_nodes_.size()));
#else
      CHECK_CUDA_ERROR(cudaGraphAddDependencies(
          graph_, from_nodes_.data(), to_nodes_.data(), from_nodes_.size()));
#endif
    }

    device_.make_current();

    if (!is_graph_updatable_) {
      // Cannot reuse a cached executable: instantiate and launch directly.
      CudaGraphExec graph_exec;
      graph_exec.instantiate(graph_);
      CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
    } else {
      // Graphs with the same node types and dependency structure share one
      // cache slot.
      auto graph_key = graph_nodes_key_ + ":" + graph_deps_key_;
      auto& graph_exec = graph_cache_[graph_key];

      if (graph_exec != nullptr) {
        // Try to update the cached executable in place with the new graph.
        cudaGraphExecUpdateResult update_result;
#if CUDART_VERSION >= 12000
        cudaGraphExecUpdateResultInfo info;
        cudaGraphExecUpdate(graph_exec, graph_, &info);
        update_result = info.result;
#else
        cudaGraphNode_t error_node;
        cudaGraphExecUpdate(graph_exec, graph_, &error_node, &update_result);
#endif // CUDART_VERSION >= 12000
        if (update_result != cudaGraphExecUpdateSuccess) {
          // Update failed: drop the cached executable so it is rebuilt below.
          cudaGetLastError(); // reset error
          graph_exec.reset();
        }
      }
      if (graph_exec == nullptr) {
        graph_exec.instantiate(graph_);
      }

      CHECK_CUDA_ERROR(cudaGraphLaunch(graph_exec, stream_));
    }

    // Save cuda graph to dot file
    if (const char* filename = save_cuda_graphs_dot_file(); filename) {
      // Files are numbered from 1 in commit order: "<filename>_<count>.dot".
      static int count = 0;
      auto path = fmt::format("{}_{}.dot", filename, ++count);
      CHECK_CUDA_ERROR(cudaGraphDebugDotPrint(graph_, path.c_str(), 0));
    }

    // Reset state
    from_nodes_.clear();
    to_nodes_.clear();
    graph_deps_key_.clear();
    graph_nodes_key_.clear();
    node_map_.clear();
    graph_ = CudaGraph(device_);
    is_graph_updatable_ = true;
  }

  // Put completion handlers in a batch.
  worker_.commit(stream_);
  node_count_ = 0;
  bytes_in_graph_ = 0;
}

// Block until the stream is idle and a freshly queued completion handler has
// run. The handler fulfils a promise whose future is awaited after commit().
void CommandEncoder::synchronize() {
  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
  auto done = std::make_shared<std::promise<void>>();
  std::future<void> finished = done->get_future();
  add_completed_handler([done = std::move(done)]() { done->set_value(); });
  commit();
  finished.wait();
}

// Return the Device wrapper for CUDA ordinal `cuda_device`.
//
// The table of devices is built once, on first use, with one entry per device
// reported by gpu::device_count(). An out-of-range ordinal throws
// std::out_of_range (from std::vector::at).
Device& device(int cuda_device) {
  static auto devices = []() {
    std::vector<Device> devices;
    const int device_count = gpu::device_count();
    // Reserve up front so emplace_back never reallocates: this avoids
    // repeated allocations and keeps Device objects, which own library
    // handles and command encoders, from being moved while the table is built.
    devices.reserve(
        device_count > 0 ? static_cast<size_t>(device_count) : size_t{0});
    for (int i = 0; i < device_count; ++i) {
      devices.emplace_back(i);
    }
    // Initialize the jit module cache here ensures it is not unloaded before
    // any evaluation is done.
    get_jit_module_cache();
    return devices;
  }();
  return devices.at(cuda_device);
}

// Return the CUDA Device wrapper matching the index of the MLX device `d`.
Device& device(mlx::core::Device d) {
  const int cuda_ordinal = d.index;
  return device(cuda_ordinal);
}

// Return the command encoder of stream `s`, looked up on the stream's device.
CommandEncoder& get_command_encoder(Stream s) {
  Device& owner = device(s.device);
  return owner.get_command_encoder(s);
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/array.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/lru_cache.h"
#include "mlx/backend/cuda/worker.h"
#include "mlx/stream.h"

#include <cublasLt.h>
#include <cuda.h>
#include <cudnn.h>

#include <unordered_map>

namespace mlx::core::cu {

// Compute a key and updatability flag for a CUDA graph by walking its nodes.
// Returns {key, is_updatable}: `key` is a parenthesized string with one tag
// per node (nested child graphs contribute their own parenthesized key), and
// `is_updatable` is false when the graph contains a node that prevents
// updating an instantiated graph in place.
std::pair<std::string, bool> subgraph_to_key(cudaGraph_t graph);

// Records GPU work for one stream and submits it in batches.
//
// Kernels and captured sub-graphs are accumulated as nodes of a CUDA graph.
// Dependencies between nodes are derived from the arrays registered with
// set_input_array()/set_output_array() before each node is added. commit()
// launches the accumulated graph. When CUDA graphs are disabled, work is
// launched directly on the stream instead.
class CommandEncoder {
 public:
  // Scope guard that captures the work issued on the encoder's stream into
  // `graph` and, on destruction, adds it to the encoder unless `discard` is
  // set.
  struct CaptureContext {
    CaptureContext(CommandEncoder& enc);
    ~CaptureContext();
    CudaGraph graph;
    CommandEncoder& enc;
    bool discard{false};
  };
  // Scope guard marking a region whose nodes are collected together and
  // joined on a single synchronization node when the guard is destroyed.
  struct ConcurrentContext {
    ConcurrentContext(CommandEncoder& enc);
    ~ConcurrentContext();
    CommandEncoder& enc;
  };

  explicit CommandEncoder(Device& d);

  CommandEncoder(const CommandEncoder&) = delete;
  CommandEncoder& operator=(const CommandEncoder&) = delete;

  CaptureContext capture_context() {
    return CaptureContext{*this};
  }
  ConcurrentContext concurrent_context() {
    return ConcurrentContext{*this};
  }

  // Register the inputs/outputs of the node that is added next.
  void set_input_array(const array& arr);
  void set_output_array(const array& arr);

  // Add a kernel without cluster dimensions or dynamic shared memory.
  template <typename F, typename... Params>
  void
  add_kernel_node(F* func, dim3 grid_dim, dim3 block_dim, Params&&... params) {
    add_kernel_node_ex(func, grid_dim, block_dim, {}, 0, params...);
  }

  // Add a kernel with explicit cluster dimensions and dynamic shared memory
  // size. The addresses of `params` are passed to the launch as the kernel
  // argument array, so the arguments must be non-const lvalues or
  // temporaries that live until this call returns.
  template <typename F, typename... Params>
  void add_kernel_node_ex(
      F* func,
      dim3 grid_dim,
      dim3 block_dim,
      dim3 cluster_dim,
      uint32_t smem_bytes,
      Params&&... params) {
    constexpr size_t num = sizeof...(Params);
    // Keep at least one slot: a zero-length array is not valid C++, which
    // would otherwise break kernels that take no parameters.
    void* ptrs[num == 0 ? 1 : num] = {};
    size_t i = 0;
    ([&](auto&& p) { ptrs[i++] = static_cast<void*>(&p); }(
         std::forward<Params>(params)),
     ...);
    add_kernel_node_raw(
        reinterpret_cast<void*>(func),
        grid_dim,
        block_dim,
        cluster_dim,
        smem_bytes,
        ptrs);
  }

  // Add a kernel given as a CUDA runtime function pointer.
  void add_kernel_node_raw(
      void* func,
      dim3 grid_dim,
      dim3 block_dim,
      dim3 cluster_dim,
      uint32_t smem_bytes,
      void** params);

  // Add a kernel given as a CUDA driver API function handle.
  void add_kernel_node_raw(
      CUfunction func,
      dim3 grid_dim,
      dim3 block_dim,
      dim3 cluster_dim,
      uint32_t smem_bytes,
      void** params);

  // Add a child graph. The one-argument overload computes the key and
  // updatability flag with subgraph_to_key(); the other takes them from the
  // caller.
  void add_graph_node(cudaGraph_t child);
  void add_graph_node(
      cudaGraph_t child,
      const std::string& subgraph_key,
      bool is_updatable);

  // Hold on to the data of `arr` until the next commit hands it over to a
  // completion handler.
  void add_temporary(const array& arr) {
    temporaries_.push_back(arr.data_shared_ptr());
  }

  void add_completed_handler(std::function<void()> task);
  // True when the pending work exceeds the per-graph op or size limits.
  bool needs_commit();
  void commit();

  Device& device() {
    return device_;
  }

  CudaStream& stream() {
    return stream_;
  }

  // Wait until kernels and completion handlers are finished
  void synchronize();

 private:
  cudaGraphNode_t add_kernel_node_raw(const cudaKernelNodeParams& params);
  CUgraphNode add_kernel_node_raw(const CUDA_KERNEL_NODE_PARAMS& params);

  struct GraphNode {
    cudaGraphNode_t node;
    // K = kernel
    // E = empty
    // () = subgraph (with metadata)
    // Symbols ':', '-' are reserved as separators
    std::string node_type;
    std::string id;
  };

  void insert_graph_dependencies(GraphNode node);
  void insert_graph_dependencies(std::vector<GraphNode> nodes);

  Device& device_;
  CudaStream stream_;
  CudaGraph graph_;
  Worker worker_;
  // Number of ops recorded since the last commit.
  int node_count_{0};
  bool in_concurrent_{false};
  // Parallel arrays describing the edges to add at commit time.
  std::vector<cudaGraphNode_t> from_nodes_;
  std::vector<cudaGraphNode_t> to_nodes_;
  // Together these form the key of the graph executable cache.
  std::string graph_nodes_key_;
  std::string graph_deps_key_;
  std::vector<GraphNode> concurrent_nodes_;
  std::vector<std::shared_ptr<array::Data>> temporaries_;
  LRUCache<std::string, CudaGraphExec> graph_cache_;
  // Buffer pointers registered for the node that is added next.
  std::vector<std::uintptr_t> active_deps_;
  std::vector<std::uintptr_t> active_outputs_;
  // Maps a buffer pointer to the node that last produced it.
  std::unordered_map<std::uintptr_t, GraphNode> node_map_;
  size_t bytes_in_graph_{0};
  bool is_graph_updatable_{true};
  int max_ops_per_graph_;
  int max_mb_per_graph_;
};

class Device {
 public:
  explicit Device(int device);
  ~Device();

  Device(Device&&) = default;
  Device(const Device&) = delete;
  Device& operator=(const Device&) = delete;

  // Make this device the current cuda device, this method is thread-safe.
  void make_current();

  CommandEncoder& get_command_encoder(Stream s);
  cublasLtHandle_t get_cublaslt_handle();
  cudnnHandle_t get_cudnn_handle();

  int cuda_device() const {
    return device_;
  }
  int compute_capability_major() const {
    return compute_capability_major_;
  }
  int compute_capability_minor() const {
    return compute_capability_minor_;
  }
  bool concurrent_managed_access() const {
    return concurrent_managed_access_ == 1;
  }
  bool host_native_atomic() const {
    return host_native_atomic_ == 1;
  }
  bool managed_memory() const {
    return managed_memory_ == 1;
  }
  bool memory_pools() const {
    return memory_pools_ == 1;
  }

 private:
  int device_;
  int compute_capability_major_;
  int compute_capability_minor_;
  int concurrent_managed_access_;
  int host_native_atomic_;
  int managed_memory_;
  int memory_pools_;
  std::string device_name_;
  cublasLtHandle_t cublaslt_handle_{nullptr};
  cudnnHandle_t cudnn_handle_{nullptr};
  std::unordered_map<int, CommandEncoder> encoders_;
};

// Return the Device wrapper for a CUDA device ordinal.
Device& device(int cuda_device);
// Return the Device wrapper matching the index of an MLX device.
Device& device(mlx::core::Device d);
// Return the command encoder of stream `s` on that stream's device.
CommandEncoder& get_command_encoder(Stream s);

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/device_info.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/gpu/device_info.h"
#include "mlx/backend/cuda/cuda.h"

#include <cuda_runtime.h>
#include <dlfcn.h>

#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

namespace mlx::core {

namespace {

// NVML dynamic loading for accurate memory reporting
// (cudaMemGetInfo only sees current process)

// Minimal local declarations of the NVML types used below, so that the NVML
// headers are not needed at build time.
typedef int nvmlReturn_t;
typedef struct nvmlDevice_st* nvmlDevice_t;
struct nvmlMemory_t {
  unsigned long long total;
  unsigned long long free;
  unsigned long long used;
};

// Handle of the dynamically loaded NVML library plus the entry points
// resolved from it. All members stay null until nvml_init() fills them in.
struct NVMLState {
  void* handle = nullptr;
  nvmlReturn_t (*nvmlInit_v2)() = nullptr;
  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char*, nvmlDevice_t*) =
      nullptr;
  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t*) =
      nullptr;
};

// Load the NVML library, resolve the required entry points and initialize
// NVML. Returns true only if every step succeeded (nvmlInit_v2 returned 0).
bool nvml_init(NVMLState& nvml) {
#ifdef _WIN32
  nvml.handle = dlopen("nvml.dll", RTLD_LAZY);
  if (!nvml.handle) {
    nvml.handle = dlopen(
        "C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvml.dll", RTLD_LAZY);
  }
#else
  nvml.handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
#endif
  if (!nvml.handle) {
    return false;
  }

  nvml.nvmlInit_v2 = reinterpret_cast<decltype(nvml.nvmlInit_v2)>(
      dlsym(nvml.handle, "nvmlInit_v2"));
  nvml.nvmlDeviceGetHandleByUUID =
      reinterpret_cast<decltype(nvml.nvmlDeviceGetHandleByUUID)>(
          dlsym(nvml.handle, "nvmlDeviceGetHandleByUUID"));
  nvml.nvmlDeviceGetMemoryInfo =
      reinterpret_cast<decltype(nvml.nvmlDeviceGetMemoryInfo)>(
          dlsym(nvml.handle, "nvmlDeviceGetMemoryInfo"));

  if (!nvml.nvmlInit_v2 || !nvml.nvmlDeviceGetHandleByUUID ||
      !nvml.nvmlDeviceGetMemoryInfo) {
    return false;
  }
  return nvml.nvmlInit_v2() == 0;
}

// Query free and total memory (in the units NVML reports) of the device with
// the given UUID string. Returns false, leaving `*free` and `*total`
// untouched, when NVML is not usable or either NVML call fails.
bool nvml_get_memory(
    NVMLState& nvml,
    const char* uuid,
    size_t* free,
    size_t* total) {
  // A loaded library is not enough: nvml_init() can fail after dlopen()
  // succeeded, leaving `handle` set while entry points are still null.
  if (!nvml.handle || !nvml.nvmlDeviceGetHandleByUUID ||
      !nvml.nvmlDeviceGetMemoryInfo) {
    return false;
  }
  nvmlDevice_t device = nullptr;
  if (nvml.nvmlDeviceGetHandleByUUID(uuid, &device) != 0) {
    return false;
  }
  nvmlMemory_t mem{};
  if (nvml.nvmlDeviceGetMemoryInfo(device, &mem) != 0) {
    return false;
  }
  *free = mem.free;
  *total = mem.total;
  return true;
}

// Render a CUDA device UUID as "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx":
// the 16 bytes in order, each as two lowercase hex digits, grouped 4-2-2-2-6.
std::string format_uuid(const cudaUUID_t& uuid) {
  static constexpr char kHexDigits[] = "0123456789abcdef";

  std::string text = "GPU-";
  text.reserve(40);
  for (int i = 0; i < 16; ++i) {
    // Group separators go in front of bytes 4, 6, 8 and 10.
    if (i == 4 || i == 6 || i == 8 || i == 10) {
      text.push_back('-');
    }
    const auto byte = static_cast<unsigned char>(uuid.bytes[i]);
    text.push_back(kHexDigits[byte >> 4]);
    text.push_back(kHexDigits[byte & 0x0F]);
  }
  return text;
}

const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info_impl(int device_index) {
  // Static cache of device properties including UUID (needed for NVML lookup)
  static auto all_devices = []() {
    // Get device count
    int count = 0;
    cudaGetDeviceCount(&count);

    // Collect info for all devices
    struct DeviceInfo {
      std::unordered_map<std::string, std::variant<std::string, size_t>> info;
      std::string uuid;
    };

    std::vector<DeviceInfo> devices;

    for (int i = 0; i < count; ++i) {
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, i);

      DeviceInfo dev;
      dev.info["device_name"] = std::string(prop.name);
      dev.uuid = format_uuid(prop.uuid);
      dev.info["uuid"] = dev.uuid;

      // Architecture string (e.g., "sm_89")
      char arch[16];
      snprintf(arch, sizeof(arch), "sm_%d%d", prop.major, prop.minor);
      dev.info["architecture"] = std::string(arch);

      // PCI bus ID (domain:bus:device.function)
      char pci_id[32];
      snprintf(
          pci_id,
          sizeof(pci_id),
          "%04x:%02x:%02x.0",
          prop.pciDomainID,
          prop.pciBusID,
          prop.pciDeviceID);
      dev.info["pci_bus_id"] = std::string(pci_id);

      // Compute capability as size_t (to match Metal's variant type)
      dev.info["compute_capability_major"] = static_cast<size_t>(prop.major);
      dev.info["compute_capability_minor"] = static_cast<size_t>(prop.minor);

      devices.push_back(std::move(dev));
    }
    return devices;
  }();

  // Initialize NVML once for fresh memory reads
  static NVMLState nvml;
  static bool nvml_initialized = nvml_init(nvml);

  if (device_index < 0 ||
      device_index >= static_cast<int>(all_devices.size())) {
    static auto empty =
        std::unordered_map<std::string, std::variant<std::string, size_t>>();
    return empty;
  }

  // Return a copy with fresh memory info
  // Using thread_local to avoid locks while keeping free_memory fresh
  thread_local auto device_info_copy =
      std::unordered_map<std::string, std::variant<std::string, size_t>>();

  device_info_copy = all_devices[device_index].info;

  // Get fresh memory info - try NVML first (system-wide), fallback to
  // cudaMemGetInfo (process-level)
  size_t free_mem, total_mem;

  if (nvml_initialized &&
      nvml_get_memory(
          nvml,
          all_devices[device_index].uuid.c_str(),
          &free_mem,
          &total_mem)) {
    // NVML succeeded - use system-wide memory
  } else {
    // Fallback to cudaMemGetInfo (process-scoped)
    int prev_device;
    cudaGetDevice(&prev_device);
    cudaSetDevice(device_index);
    cudaMemGetInfo(&free_mem, &total_mem);
    cudaSetDevice(prev_device);
  }

  device_info_copy["free_memory"] = free_mem;
  device_info_copy["total_memory"] = total_mem;

  return device_info_copy;
}

} // anonymous namespace

namespace gpu {

// This translation unit always reports the GPU backend as available.
bool is_available() {
  constexpr bool kBackendPresent = true;
  return kBackendPresent;
}

// Number of CUDA devices as reported by cudaGetDeviceCount. The counter
// starts at 0 and the call's status is not checked.
int device_count() {
  int num_devices = 0;
  cudaGetDeviceCount(&num_devices);
  return num_devices;
}

// Property map of the device with the given index; see device_info_impl.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index) {
  const auto& properties = device_info_impl(device_index);
  return properties;
}

} // namespace gpu

namespace cu {

// This translation unit always reports the CUDA backend as available.
bool is_available() {
  constexpr bool kBackendPresent = true;
  return kBackendPresent;
}

} // namespace cu

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/distributed.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/distributed/primitives.h"
#include "mlx/primitives.h"

#include <cassert>

namespace mlx::core::distributed {
void AllReduce::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  auto set_input_output = [&](const array& in,
                              array& out) -> std::pair<array, array> {
    if (!in.flags().row_contiguous) {
      copy_gpu(in, out, CopyType::General, s);
      return {out, out};
    } else if (in.is_donatable()) {
      out.copy_shared_buffer(in);
      return {in, out};
    } else {
      out.set_data(cu::malloc_async(out.nbytes(), encoder));
      return {in, out};
    }
  };

  auto [input, output] = set_input_output(inputs[0], outputs[0]);

  encoder.set_input_array(input);
  encoder.set_output_array(output);

  auto capture = encoder.capture_context();

  switch (reduce_type_) {
    case Sum:
      distributed::detail::all_sum(group(), input, output, s);
      break;
    case Max:
      distributed::detail::all_max(group(), input, output, s);
      break;
    case Min:
      distributed::detail::all_min(group(), input, output, s);
      break;
    default:
      throw std::runtime_error(
          "Only all reduce sum, max, and min are supported.");
  }
}

// Evaluates an all-gather on the GPU stream of this primitive.
//
// Expects exactly one input and one output (asserted). The input is made
// row-contiguous if needed, the output buffer is freshly allocated, and the
// work is forwarded to distributed::detail::all_gather.
void AllGather::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Returns `x` itself when it is row-contiguous; otherwise returns a
  // contiguous copy that is registered as a temporary on the encoder.
  auto ensure_contiguous = [&s, &encoder](const array& x) {
    if (x.flags().row_contiguous) {
      return x;
    } else {
      array x_copy = contiguous_copy_gpu(x, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
  };

  auto input = ensure_contiguous(inputs[0]);
  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));

  encoder.set_input_array(input);
  encoder.set_output_array(outputs[0]);

  // Keep the capture context alive while the collective is issued.
  auto capture = encoder.capture_context();
  distributed::detail::all_gather(group(), input, outputs[0], s);
}

// Evaluates a reduce-scatter on the GPU stream of this primitive.
//
// Expects exactly one input and one output (asserted). Only the Sum
// reduction is implemented; any other `reduce_type_` throws
// std::runtime_error.
void ReduceScatter::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(inputs.size() == 1);
  assert(outputs.size() == 1);

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Returns `x` itself when it is row-contiguous; otherwise returns a
  // contiguous copy that is registered as a temporary on the encoder.
  auto ensure_contiguous = [&s, &encoder](const array& x) {
    if (x.flags().row_contiguous) {
      return x;
    } else {
      array x_copy = contiguous_copy_gpu(x, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
  };

  auto input = ensure_contiguous(inputs[0]);
  outputs[0].set_data(cu::malloc_async(outputs[0].nbytes(), encoder));

  encoder.set_input_array(input);
  encoder.set_output_array(outputs[0]);

  // Keep the capture context alive while the collective is issued.
  auto capture = encoder.capture_context();

  switch (reduce_type_) {
    case Sum:
      distributed::detail::sum_scatter(group(), input, outputs[0], s);
      break;
    default:
      throw std::runtime_error("Only sum scatter is supported. ");
  }
}
} // namespace mlx::core::distributed


================================================
FILE: mlx/backend/cuda/eval.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/gpu/eval.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/primitives.h"
#include "mlx/scheduler.h"

#include <nvtx3/nvtx3.hpp>

namespace mlx::core::gpu {

// Prepares the CUDA backend for stream `s`. The three calls are ordered on
// purpose: each one forces creation of a longer-lived object before the ones
// that depend on it, which in turn fixes their relative destruction order.
void new_stream(Stream s) {
  // Force initialization of CUDA, so that the CUDA runtime is destroyed last.
  cudaFree(nullptr);
  // Make sure the CUDA event pool is destroyed after the device and stream.
  cu::CudaEvent::init_pool();
  // Ensure the static stream objects get created.
  cu::get_command_encoder(s);
}

// Evaluates `arr` by running its primitive's eval_gpu(), then keeps the
// buffers that were used alive on the encoder and commits the encoder when it
// reports that a commit is needed.
void eval(array& arr) {
  nvtx3::scoped_range r("gpu::eval");
  // Ensure CUDA context is active on this thread. Required when MLX is called
  // from threads that have not yet established a CUDA context (e.g. thread
  // pools, language runtimes that migrate work across OS threads).
  cu::device(arr.primitive().stream().device).make_current();
  auto outputs = arr.outputs();
  {
    // If the array is a tracer hold a reference
    // to its inputs so they don't get donated
    std::vector<array> inputs;
    if (arr.is_tracer()) {
      inputs = arr.inputs();
    }
    arr.primitive().eval_gpu(arr.inputs(), outputs);
  }

  auto& stream = arr.primitive().stream();
  auto& encoder = cu::get_command_encoder(stream);
  // Keep used buffers alive until kernel finishes running.
  for (auto& in : arr.inputs()) {
    // Except for the donated one.
    if (in.data_shared_ptr() != arr.data_shared_ptr()) {
      encoder.add_temporary(in);
    }
  }
  for (auto& s : arr.siblings()) {
    encoder.add_temporary(s);
  }

  if (encoder.needs_commit()) {
    // Tell the scheduler about the in-flight task before committing, and
    // report completion from the encoder's completion handler. `stream` is
    // captured by value so the handler does not refer back to `arr`.
    scheduler::notify_new_task(stream);
    encoder.add_completed_handler(
        [stream]() { scheduler::notify_task_completion(stream); });
    encoder.commit();
  }
}

// Commits the command encoder associated with stream `s`. The call is wrapped
// in an NVTX range named "gpu::finalize" for profiling.
void finalize(Stream s) {
  nvtx3::scoped_range r("gpu::finalize");
  cu::get_command_encoder(s).commit();
}

// Calls synchronize() on the command encoder associated with stream `s`. The
// call is wrapped in an NVTX range named "gpu::synchronize" for profiling.
void synchronize(Stream s) {
  nvtx3::scoped_range r("gpu::synchronize");
  cu::get_command_encoder(s).synchronize();
}

} // namespace mlx::core::gpu


================================================
FILE: mlx/backend/cuda/event.cu
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/event.h"
#include "mlx/backend/gpu/device_info.h"
#include "mlx/event.h"
#include "mlx/scheduler.h"

#include <map>
#include <vector>

#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

///////////////////////////////////////////////////////////////////////////////
// CudaEvent implementations
///////////////////////////////////////////////////////////////////////////////

namespace {

// Manage cached cudaEvent_t objects.
class CudaEventPool {
 public:
  CudaEventHandle create(Device& d, int flags) {
    if (!on_creation_thread()) {
      return CudaEventHandle(d, flags);
    }
    auto& cache = cache_for(d, flags);
    if (cache.empty()) {
      return CudaEventHandle(d, flags);
    } else {
      CudaEventHandle ret = std::move(cache.back());
      cache.pop_back();
      return ret;
    }
  }

  void release(CudaEventHandle event) {
    if (!on_creation_thread()) {
      // Event will be destroyed directly instead of getting moved to cache.
      return;
    }
    cache_for(event.device, event.flags).push_back(std::move(event));
  }

 private:
  std::vector<CudaEventHandle>& cache_for(Device& d, int flags) {
    return cache_[d.cuda_device()][flags];
  }

  bool on_creation_thread() {
    return std::this_thread::get_id() == thread_id_;
  }

  // The CudaEvent may be created and destroyed on different threads (for
  // example when waiting on GPU work in CPU stream), we don't want to make
  // the cache thread-safe as it adds overhead, so we just skip cache when
  // using events in worker threads.
  std::thread::id thread_id_{std::this_thread::get_id()};

  // {device: {flags: [events]}}
  std::map<int, std::map<int, std::vector<CudaEventHandle>>> cache_;
};

// Returns the process-wide event pool. The pool is a function-local static,
// so it is constructed on the first call (and records that caller's thread
// as its creation thread).
CudaEventPool& cuda_event_pool() {
  static CudaEventPool pool;
  return pool;
}

} // namespace

// Creates a new CUDA event on device `d` with the given creation `flags`.
// The device is made current first; creation failures are reported through
// CHECK_CUDA_ERROR.
CudaEventHandle::CudaEventHandle(Device& d, int flags)
    : device(d), flags(flags) {
  device.make_current();
  CHECK_CUDA_ERROR(cudaEventCreateWithFlags(&handle_, flags));
  assert(handle_ != nullptr);
}

// Obtains the underlying event handle from the shared event pool, which may
// return a cached handle or create a new one.
CudaEvent::CudaEvent(Device& d, int flags)
    : event_(cuda_event_pool().create(d, flags)) {}

// Returns the underlying event handle to the shared pool for reuse.
CudaEvent::~CudaEvent() {
  // CudaEvent is movable, so this destructor also runs on moved-from objects.
  // Such an object presumably holds an empty handle (TODO confirm against
  // CudaHandle's move constructor); caching it would make a later
  // CudaEventPool::create() hand out an unusable event, so skip the pool.
  if (static_cast<cudaEvent_t>(event_) == nullptr) {
    return;
  }
  cuda_event_pool().release(std::move(event_));
}

// Blocks the calling host thread in cudaEventSynchronize() on this event,
// after making the event's device current. The CUDA return code is not
// inspected here.
void CudaEvent::wait() {
  nvtx3::scoped_range r("cu::CudaEvent::wait");
  event_.device.make_current();
  cudaEventSynchronize(event_);
}

// Enqueues a wait for this event on `stream` via cudaStreamWaitEvent(), after
// making the event's device current. The CUDA return code is not inspected
// here.
void CudaEvent::wait(cudaStream_t stream) {
  event_.device.make_current();
  cudaStreamWaitEvent(stream, event_);
}

// Records this event on `stream` via cudaEventRecord(), after making the
// event's device current. The CUDA return code is not inspected here.
void CudaEvent::record(cudaStream_t stream) {
  event_.device.make_current();
  cudaEventRecord(event_, stream);
}

// Returns true when cudaEventQuery() reports cudaSuccess for this event. Any
// other status, including error codes, yields false.
bool CudaEvent::completed() const {
  // Note: cudaEventQuery can be safely called from any device.
  return cudaEventQuery(event_) == cudaSuccess;
}

// static
// Touches the event pool accessor so that its function-local static pool is
// constructed now; the returned reference is intentionally unused.
void CudaEvent::init_pool() {
  cuda_event_pool();
}

// Wraps CudaEvent with a few features:
// 1. The class can be copied.
// 2. Make wait/record work with CPU streams.
// 3. Add checks for waiting on un-recorded event.
class CopyableCudaEvent {
 public:
  explicit CopyableCudaEvent(Device& d)
      : event_(
            std::make_shared<CudaEvent>(
                d,
                cudaEventDisableTiming | cudaEventBlockingSync)) {}

  void wait() {
    event_->wait();
  }

  void wait(Stream s) {
    if (s.device == mlx::core::Device::cpu) {
      scheduler::enqueue(s, [*this]() mutable {
        check_recorded();
        event_->wait();
      });
    } else {
      check_recorded();
      auto& encoder = cu::get_command_encoder(s);
      encoder.commit();
      event_->wait(encoder.stream());
    }
  }

  void record(Stream s) {
    if (s.device == mlx::core::Device::cpu) {
      throw std::runtime_error("CudaEvent can not wait on CPU stream.");
    } else {
      auto& encoder = cu::get_command_encoder(s);
      encoder.commit();
      event_->record(encoder.stream());
      recorded_ = true;
    }
  }

  bool is_signaled() const {
    return recorded_ && event_->completed();
  }

 private:
  void check_recorded() const {
    if (!recorded_) {
      throw std::runtime_error(
          "Should not wait on a CudaEvent before recording.");
    }
  }

  std::shared_ptr<CudaEvent> event_;
  bool recorded_{false};
};

///////////////////////////////////////////////////////////////////////////////
// AtomicEvent implementations
///////////////////////////////////////////////////////////////////////////////

// Waits until the 32-bit counter at `ptr` is at least `value`. Callable from
// host and device code. The loop re-checks after every wake-up because
// atomic wait only blocks while the counter still equals the value last read.
__host__ __device__ void event_wait(uint32_t* ptr, uint32_t value) {
  cuda::atomic_ref<uint32_t> ac(*ptr);
  uint32_t current;
  while ((current = ac.load()) < value) {
    ac.wait(current);
  }
}

// Stores `value` into the 32-bit counter at `ptr` and notifies all waiters on
// that atomic. Callable from host and device code.
__host__ __device__ void event_signal(uint32_t* ptr, uint32_t value) {
  cuda::atomic_ref<uint32_t> ac(*ptr);
  ac.store(value);
  ac.notify_all();
}

// Kernel wrapper around event_wait(), so a CUDA stream can be made to wait on
// the counter at `ptr`.
__global__ void event_wait_kernel(uint32_t* ptr, uint32_t value) {
  event_wait(ptr, value);
}

// Kernel wrapper around event_signal(), so the counter at `ptr` can be
// updated from a CUDA stream. The signal is bracketed by system-scope thread
// fences.
__global__ void event_signal_kernel(uint32_t* ptr, uint32_t value) {
  __threadfence_system();
  event_signal(ptr, value);
  __threadfence_system();
}

// Returns a tuple (concurrent_managed_access, host_native_atomic) describing
// what every visible device supports: each flag stays true only if all
// devices report it. The result is computed once, on the first call, and
// cached in a function-local static. With zero devices both flags remain
// true.
auto check_gpu_coherency() {
  static auto coherency = []() {
    int device_count = gpu::device_count();
    bool concurrent_managed_access = true;
    bool host_native_atomic = true;
    for (int i = 0; i < device_count; ++i) {
      auto& d = cu::device(i);
      concurrent_managed_access &= d.concurrent_managed_access();
      host_native_atomic &= d.host_native_atomic();
    }
    return std::make_tuple(concurrent_managed_access, host_native_atomic);
  }();
  return coherency;
}

// Allocates and zero-initializes the 32-bit counter backing this event. The
// kind of memory depends on what the system's GPUs support (see below), and
// `coherent_` records whether the host may touch the counter directly.
AtomicEvent::AtomicEvent(Device& d) {
  void* buf;
  cudaError_t (*cuda_free)(void*);
  // There are 3 kinds of systems we are implementing for:
  // 1. concurrentManagedAccess == true
  //    => use cuda::atomic_ref on managed memory
  // 2. hostNativeAtomicSupported == true
  //    => use cuda::atomic_ref on pinned host memory
  // 3. no hardware cpu/gpu coherency
  //    => use cuda::atomic_ref on device memory
  d.make_current();
  auto [concurrent_managed_access, host_native_atomic] = check_gpu_coherency();
  if (concurrent_managed_access) {
    CHECK_CUDA_ERROR(cudaMallocManaged(&buf, sizeof(uint32_t)));
    cuda_free = cudaFree;
    coherent_ = true;
  } else if (host_native_atomic) {
    CHECK_CUDA_ERROR(cudaMallocHost(&buf, sizeof(uint32_t)));
    cuda_free = cudaFreeHost;
    coherent_ = true;
  } else {
    CHECK_CUDA_ERROR(cudaMalloc(&buf, sizeof(uint32_t)));
    cuda_free = cudaFree;
    coherent_ = false;
  }
  // The deleter remembers which free function matches the allocation above.
  buf_ = std::shared_ptr<void>(
      buf, [cuda_free](void* buf) { CHECK_CUDA_ERROR(cuda_free(buf)); });
  // Host-accessible memory is cleared with a plain store; device-only memory
  // has to be cleared through the CUDA runtime.
  if (coherent_) {
    *ptr() = 0;
  } else {
    CHECK_CUDA_ERROR(cudaMemset(buf, 0, sizeof(uint32_t)));
  }
}

// Blocks the calling host thread until the counter reaches `value`. With
// coherent memory this waits on the atomic directly; otherwise it polls
// is_signaled() and yields between polls.
void AtomicEvent::wait(uint32_t value) {
  nvtx3::scoped_range r("cu::AtomicEvent::wait");
  if (coherent_) {
    event_wait(ptr(), value);
  } else {
    while (!is_signaled(value)) {
      std::this_thread::yield();
    }
  }
}

// Launches a single-thread kernel on `stream` that waits until the counter
// reaches `value`.
void AtomicEvent::wait(cudaStream_t stream, uint32_t value) {
  event_wait_kernel<<<1, 1, 0, stream>>>(ptr(), value);
}

// Makes MLX stream `s` wait until the counter reaches `value`.
//  - CPU stream: enqueues a task that performs the blocking host wait on a
//    copy of this event.
//  - GPU stream: commits the encoder and launches the wait kernel on its
//    CUDA stream.
void AtomicEvent::wait(Stream s, uint32_t value) {
  nvtx3::scoped_range r("cu::AtomicEvent::wait(s)");
  if (s.device == mlx::core::Device::cpu) {
    scheduler::enqueue(s, [*this, value]() mutable { wait(value); });
  } else {
    auto& encoder = get_command_encoder(s);
    encoder.commit();
    wait(encoder.stream(), value);
    // The empty handler only exists to hold a reference to the counter
    // buffer until the encoder runs its completion handlers.
    encoder.add_completed_handler([buf = buf_]() {});
  }
}

// Sets the counter to `value` from the calling host thread. With coherent
// memory the store is done directly; otherwise a signal kernel is launched on
// the dedicated signal stream.
void AtomicEvent::signal(uint32_t value) {
  nvtx3::scoped_range r("cu::AtomicEvent::signal");
  if (coherent_) {
    event_signal(ptr(), value);
  } else {
    signal(signal_stream(), value);
  }
}

// Launches a single-thread kernel on `stream` that stores `value` into the
// counter and notifies waiters.
void AtomicEvent::signal(cudaStream_t stream, uint32_t value) {
  event_signal_kernel<<<1, 1, 0, stream>>>(ptr(), value);
}

// Signals `value` in order with the work on MLX stream `s`.
//  - CPU stream: enqueues a task that launches the signal kernel on the
//    dedicated signal stream.
//  - GPU stream: commits the encoder and launches the signal kernel on its
//    CUDA stream.
void AtomicEvent::signal(Stream s, uint32_t value) {
  nvtx3::scoped_range r("cu::AtomicEvent::signal(s)");
  if (s.device == mlx::core::Device::cpu) {
    // Signal through a GPU stream so the atomic is updated in GPU - updating
    // the atomic in CPU sometimes does not get GPU notified.
    scheduler::enqueue(
        s, [*this, value]() mutable { signal(signal_stream(), value); });
  } else {
    auto& encoder = get_command_encoder(s);
    encoder.commit();
    signal(encoder.stream(), value);
    // The empty handler only exists to hold a reference to the counter
    // buffer until the encoder runs its completion handlers.
    encoder.add_completed_handler([buf = buf_]() {});
  }
}

// True when the current counter value is at least `val`.
bool AtomicEvent::is_signaled(uint32_t val) const {
  return value() >= val;
}

// Reads the current counter value on the host. With coherent memory this is
// an atomic load; otherwise the value is copied back from device memory with
// a device-to-host cudaMemcpy.
uint32_t AtomicEvent::value() const {
  nvtx3::scoped_range r("cu::AtomicEvent::value");
  if (coherent_) {
    cuda::atomic_ref<uint32_t> ac(*ptr());
    return ac.load();
  } else {
    uint32_t val;
    CHECK_CUDA_ERROR(
        cudaMemcpy(&val, ptr(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
    return val;
  }
}

// Returns the CUDA stream used for host-initiated signal kernels. It is a
// single function-local static created on device 0 and shared by all
// AtomicEvent instances.
const CudaStream& AtomicEvent::signal_stream() {
  static CudaStream stream(device(0));
  return stream;
}

} // namespace cu

///////////////////////////////////////////////////////////////////////////////
// Event implementations
///////////////////////////////////////////////////////////////////////////////

namespace {

// Backing state of an Event. At most one of `cuda` / `atomic` is populated,
// and it is created lazily by ensure_created().
struct EventImpl {
  // CudaEvent is preferred when possible because it is fast, however we have
  // to fallback to AtomicEvent in following cases:
  // 1. the event is used to wait/signal a cpu stream;
  // 2. signal value other than 1 has been specified.
  std::unique_ptr<cu::CopyableCudaEvent> cuda;
  std::unique_ptr<cu::AtomicEvent> atomic;

  // True once either implementation has been instantiated.
  bool is_created() const {
    return cuda || atomic;
  }

  // Instantiates the implementation on first use and is a no-op afterwards,
  // so the choice made by the first call is permanent. AtomicEvent is picked
  // when `s` is a CPU stream or `signal_value` is greater than 1; otherwise
  // the native CUDA event wrapper is used.
  void ensure_created(Stream s, uint64_t signal_value) {
    if (is_created()) {
      return;
    }
    auto& d = cu::device(s.device);
    if (s.device == mlx::core::Device::cpu || signal_value > 1) {
      nvtx3::mark("Using slow AtomicEvent");
      atomic = std::make_unique<cu::AtomicEvent>(d);
    } else {
      cuda = std::make_unique<cu::CopyableCudaEvent>(d);
    }
  }
};

} // namespace

// Creates an event bound to stream `s`. The implementation object is stored
// type-erased in a shared_ptr<void> with a deleter that restores the concrete
// EventImpl type; the underlying CUDA/atomic event is not created here.
Event::Event(Stream s) : stream_(s) {
  event_ = std::shared_ptr<void>(
      new EventImpl(), [](void* ptr) { delete static_cast<EventImpl*>(ptr); });
}

// Blocks the calling thread until the event is signaled. The event must
// already have been created by signal() (asserted). After waiting, any
// pending CUDA error is surfaced through CHECK_CUDA_ERROR.
void Event::wait() {
  auto* event = static_cast<EventImpl*>(event_.get());
  assert(event->is_created());
  if (event->cuda) {
    assert(value() == 1);
    event->cuda->wait();
  } else {
    event->atomic->wait(value());
  }
  CHECK_CUDA_ERROR(cudaPeekAtLastError());
}

// Makes stream `s` wait for the event. The event must already have been
// created by signal() (asserted); the wait is delegated to whichever
// implementation was created.
void Event::wait(Stream s) {
  auto* event = static_cast<EventImpl*>(event_.get());
  assert(event->is_created());
  if (event->cuda) {
    assert(value() == 1);
    event->cuda->wait(s);
  } else {
    event->atomic->wait(s, value());
  }
}

// Signals the event from stream `s`. The first call decides, based on `s` and
// the current value(), which implementation backs this event; later calls
// reuse it.
void Event::signal(Stream s) {
  auto* event = static_cast<EventImpl*>(event_.get());
  event->ensure_created(s, value());
  if (event->cuda) {
    assert(value() == 1);
    event->cuda->record(s);
  } else {
    event->atomic->signal(s, value());
  }
}

// Non-blocking query of the event state. Returns false when signal() has
// never been called (no implementation created yet); otherwise defers to the
// active implementation.
bool Event::is_signaled() const {
  auto* event = static_cast<EventImpl*>(event_.get());
  if (!event->is_created()) {
    return false;
  }
  if (event->cuda) {
    assert(value() == 1);
    return event->cuda->is_signaled();
  } else {
    return event->atomic->is_signaled(value());
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/event.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/allocator.h"
#include "mlx/backend/cuda/utils.h"
#include "mlx/stream.h"

#include <memory>

#include <cuda_runtime.h>
#include <cuda/atomic>

namespace mlx::core::cu {

class Device;

// RAII-managed move-only wrapper of cudaEvent_t.
// RAII-managed move-only wrapper of cudaEvent_t.
// Besides the handle itself it remembers the device it was created on and the
// creation flags.
struct CudaEventHandle : public CudaHandle<cudaEvent_t, cudaEventDestroy> {
  // Creates a new event on device `d` with the given cudaEvent* flags.
  CudaEventHandle(Device& d, int flags);
  Device& device;
  int flags;
};

// Wrapper of native cuda event. It can synchronize between GPU streams, or wait
// on GPU stream in CPU stream, but can not wait on CPU stream.
// Wrapper of native cuda event. It can synchronize between GPU streams, or wait
// on GPU stream in CPU stream, but can not wait on CPU stream.
class CudaEvent {
 public:
  // Creates an event for device `d` with the given cudaEvent* creation flags.
  CudaEvent(Device& d, int flags);
  ~CudaEvent();

  // Move-only: the underlying handle has a single owner.
  CudaEvent(CudaEvent&&) = default;
  CudaEvent& operator=(CudaEvent&&) = default;

  CudaEvent(const CudaEvent&) = delete;
  CudaEvent& operator=(const CudaEvent&) = delete;

  // Block the calling host thread until the event completes.
  void wait();
  // Make `stream` wait for the event.
  void wait(cudaStream_t stream);
  // Record the event on `stream`.
  void record(cudaStream_t stream);

  // Return whether the recorded kernels have completed. Note that this method
  // returns true if record() has not been called.
  bool completed() const;

  // Internal: make sure event pool is initialized.
  static void init_pool();

 private:
  CudaEventHandle event_;
};

// Event that can synchronize between CPU and GPU. It is much slower than
// CudaEvent so the latter should always be preferred when possible.
// Event that can synchronize between CPU and GPU. It is much slower than
// CudaEvent so the latter should always be preferred when possible.
//
// The event is a 32-bit counter; waiters block until the counter is at least
// the value they ask for. Copies share the same counter buffer.
class AtomicEvent {
 public:
  // Allocates the counter after making device `d` current.
  AtomicEvent(Device& d);

  // Block the calling host thread until the counter reaches `value`.
  void wait(uint32_t value);
  // Launch a kernel on `stream` that waits for `value`.
  void wait(cudaStream_t stream, uint32_t value);
  // Make MLX stream `s` (CPU or GPU) wait for `value`.
  void wait(Stream s, uint32_t value);
  // Set the counter to `value` from the host.
  void signal(uint32_t value);
  // Launch a kernel on `stream` that sets the counter to `value`.
  void signal(cudaStream_t stream, uint32_t value);
  // Set the counter to `value` in order with the work on MLX stream `s`.
  void signal(Stream s, uint32_t value);
  // Whether the counter is currently at least `value`.
  bool is_signaled(uint32_t value) const;
  // Current counter value.
  uint32_t value() const;

 private:
  const CudaStream& signal_stream();

  // Typed view of the counter buffer.
  uint32_t* ptr() const {
    return static_cast<uint32_t*>(buf_.get());
  }

  // True when the host can access the counter memory directly.
  bool coherent_;
  std::shared_ptr<void> buf_;
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/fence.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/fence.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/event.h"

namespace mlx::core {

// Backing state of a Fence: a counter of updates issued so far and the atomic
// event that is signaled with that counter.
struct FenceImpl {
  uint32_t count;
  cu::AtomicEvent event;
};

// Creates a fence whose atomic event lives on the device of stream `s`, with
// the update counter starting at 0. The implementation is stored type-erased
// with a deleter that restores the concrete FenceImpl type.
Fence::Fence(Stream s) {
  fence_ = std::shared_ptr<void>(
      new FenceImpl{0, cu::device(s.device)},
      [](void* ptr) { delete static_cast<FenceImpl*>(ptr); });
}

// Blocks the calling host thread until the fence's event has reached the
// current update count. Neither the stream `s` nor the array argument is
// used by this implementation.
void Fence::wait(Stream s, const array&) {
  auto* fence = static_cast<FenceImpl*>(fence_.get());
  fence->event.wait(fence->count);
}

// Advances the fence: increments the update counter and signals the new count
// on stream `s`. When `cross_device` is set, the buffer backing `a` is first
// moved to unified memory unless its `device` field is already -1.
void Fence::update(Stream s, const array& a, bool cross_device) {
  auto* fence = static_cast<FenceImpl*>(fence_.get());
  if (cross_device) {
    // Move to managed memory if there is a device switch
    auto& cbuf =
        *static_cast<cu::CudaBuffer*>(const_cast<array&>(a).buffer().ptr());
    if (cbuf.device != -1) {
      auto& encoder = cu::get_command_encoder(s);
      encoder.commit();
      cu::allocator().move_to_unified_memory(cbuf, encoder.stream());
    }
  }
  fence->count++;
  fence->event.signal(s, fence->count);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/fft.cu
================================================
// Copyright © 2025 Apple Inc.

#include <cufftXt.h>
#include <algorithm>
#include <cstdint>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/complex.cuh"
#include "mlx/backend/cuda/lru_cache.h"
#include "mlx/backend/cuda/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Multiplies each of the first `size` elements of `out` by `scale`, in place.
// One element per thread, indexed by the thread's rank in the grid; threads
// whose rank is >= `size` do nothing.
template <typename T>
__global__ void scale_fft_output(T* out, T scale, size_t size) {
  auto index = cg::this_grid().thread_rank();
  if (index < size) {
    out[index] *= scale;
  }
}

} // namespace cu

namespace {

// Throws std::runtime_error when `err` is not CUFFT_SUCCESS. The message is
// "<name> failed with code: <numeric code>.".
void check_cufft_error(const char* name, cufftResult err) {
  if (err == CUFFT_SUCCESS) {
    return;
  }
  std::string message(name);
  message += " failed with code: ";
  message += std::to_string(static_cast<int>(err));
  message += ".";
  throw std::runtime_error(message);
}

// Evaluates a cuFFT call and reports failures with the call's source text.
#define CHECK_CUFFT_ERROR(cmd) check_cufft_error(#cmd, (cmd))

// Kind of 1-D transform performed by a single cuFFT step.
enum class FFTTransformType : uint8_t {
  C2C = 0, // complex input, complex output
  R2C = 1, // real input, complex output
  C2R = 2, // complex input, real output
};

// Identifies a cached cuFFT plan: the CUDA device, the transform kind, the
// transform length `n`, and the number of batched transforms.
struct FFTPlanKey {
  int device_id;
  FFTTransformType transform_type;
  int64_t n;
  int64_t batch;
};

// Owns a cuFFT plan handle together with the device it was created on and the
// workspace size the plan requires. The handle is destroyed in the
// destructor, so the type is non-copyable: a copy would destroy the same
// handle twice.
struct CuFFTPlan {
  explicit CuFFTPlan(int device_id, cufftHandle handle, size_t workspace_size)
      : device_id(device_id), handle(handle), workspace_size(workspace_size) {}

  CuFFTPlan(const CuFFTPlan&) = delete;
  CuFFTPlan& operator=(const CuFFTPlan&) = delete;

  // Destroys the plan with its device made current. Exceptions are swallowed
  // on purpose because destructors must not throw.
  ~CuFFTPlan() {
    if (handle != 0) {
      try {
        cu::device(device_id).make_current();
        cufftDestroy(handle);
      } catch (...) {
      }
    }
  }

  int device_id;
  cufftHandle handle;
  size_t workspace_size;
};

// An array plus a record of which original axis each of its dimensions
// currently holds: `order[i]` is the original axis stored at position `i`.
struct OrderedArray {
  array arr;
  std::vector<int> order;
};

// Returns the process-wide LRU cache of cuFFT plans. It is a function-local
// static constructed on first use with a default capacity of 128; the string
// argument is presumably the environment variable that overrides the
// capacity (TODO confirm against LRUBytesKeyCache).
auto& fft_plan_cache() {
  static LRUBytesKeyCache<FFTPlanKey, std::shared_ptr<CuFFTPlan>> cache(
      "MLX_CUDA_FFT_CACHE_SIZE",
      /* default_capacity */ 128);
  return cache;
}

// Builds the cache key for a plan from its four identifying fields.
FFTPlanKey make_plan_key(
    int device_id,
    FFTTransformType transform_type,
    int64_t n,
    int64_t batch) {
  // NOTE(review): the key starts value-initialized and is then filled field
  // by field. The caller wraps it in a BytesKey, so the exact initialization
  // form may matter for byte-wise comparison — confirm before restructuring.
  FFTPlanKey key{};
  key.device_id = device_id;
  key.transform_type = transform_type;
  key.n = n;
  key.batch = batch;
  return key;
}

// Element type cuFFT reads for the given transform: real float for R2C,
// complex float for C2C and C2R. Throws std::runtime_error for any other
// enumerator value.
cudaDataType_t input_type(FFTTransformType transform_type) {
  if (transform_type == FFTTransformType::R2C) {
    return CUDA_R_32F;
  }
  if (transform_type == FFTTransformType::C2C ||
      transform_type == FFTTransformType::C2R) {
    return CUDA_C_32F;
  }
  throw std::runtime_error("[FFT] Unsupported cuFFT input transform type.");
}

// Element type cuFFT writes for the given transform: real float for C2R,
// complex float for C2C and R2C. Throws std::runtime_error for any other
// enumerator value.
cudaDataType_t output_type(FFTTransformType transform_type) {
  if (transform_type == FFTTransformType::C2R) {
    return CUDA_R_32F;
  }
  if (transform_type == FFTTransformType::C2C ||
      transform_type == FFTTransformType::R2C) {
    return CUDA_C_32F;
  }
  throw std::runtime_error("[FFT] Unsupported cuFFT output transform type.");
}

// Execution type passed to cuFFT plan creation: real float for R2C, complex
// float for C2C and C2R. Throws std::runtime_error for any other enumerator
// value.
cudaDataType_t execution_type(FFTTransformType transform_type) {
  if (transform_type == FFTTransformType::R2C) {
    return CUDA_R_32F;
  }
  if (transform_type == FFTTransformType::C2C ||
      transform_type == FFTTransformType::C2R) {
    return CUDA_C_32F;
  }
  throw std::runtime_error("[FFT] Unsupported cuFFT execution transform type.");
}

// Number of input elements per transform of length `n`: n / 2 + 1 for C2R
// (half-spectrum input), `n` otherwise.
int64_t input_embed(FFTTransformType transform_type, int64_t n) {
  if (transform_type == FFTTransformType::C2R) {
    return n / 2 + 1;
  }
  return n;
}

// Number of output elements per transform of length `n`: n / 2 + 1 for R2C
// (half-spectrum output), `n` otherwise.
int64_t output_embed(FFTTransformType transform_type, int64_t n) {
  if (transform_type == FFTTransformType::R2C) {
    return n / 2 + 1;
  }
  return n;
}

// Direction flag passed to cuFFT execution. R2C is always forward and C2R is
// always inverse; only C2C honours `inverse`. Throws std::runtime_error for
// any other enumerator value.
int exec_direction(FFTTransformType transform_type, bool inverse) {
  if (transform_type == FFTTransformType::C2C) {
    if (inverse) {
      return CUFFT_INVERSE;
    }
    return CUFFT_FORWARD;
  }
  if (transform_type == FFTTransformType::R2C) {
    return CUFFT_FORWARD;
  }
  if (transform_type == FFTTransformType::C2R) {
    return CUFFT_INVERSE;
  }
  throw std::runtime_error("[FFT] Unsupported cuFFT execution direction.");
}

// Returns a cuFFT plan for a batched 1-D transform of length `n`, creating
// and caching it on first use.
//
// Plans are keyed by (device of `encoder`, transform type, n, batch). New
// plans are created with auto-allocation disabled, so the caller has to
// provide a work area of `workspace_size` bytes before executing.
// Throws std::runtime_error if any cuFFT call fails; a partially created
// handle is destroyed before the exception propagates.
std::shared_ptr<CuFFTPlan> get_fft_plan(
    cu::CommandEncoder& encoder,
    FFTTransformType transform_type,
    int64_t n,
    int64_t batch) {
  auto key = BytesKey<FFTPlanKey>{};
  key.pod =
      make_plan_key(encoder.device().cuda_device(), transform_type, n, batch);

  auto& cache = fft_plan_cache();
  if (auto entry = cache.find(key); entry != cache.end()) {
    return entry->second;
  }

  encoder.device().make_current();

  cufftHandle handle = 0;
  size_t workspace_size = 0;
  try {
    CHECK_CUFFT_ERROR(cufftCreate(&handle));
    CHECK_CUFFT_ERROR(cufftSetAutoAllocation(handle, 0));
    CHECK_CUFFT_ERROR(cufftSetStream(handle, encoder.stream()));

    // Densely packed layout: unit stride within a transform and consecutive
    // transforms separated by exactly one embed length.
    long long plan_n[1] = {n};
    long long inembed[1] = {input_embed(transform_type, n)};
    long long onembed[1] = {output_embed(transform_type, n)};
    CHECK_CUFFT_ERROR(cufftXtMakePlanMany(
        handle,
        /* rank= */ 1,
        plan_n,
        inembed,
        /* istride= */ 1,
        /* idist= */ input_embed(transform_type, n),
        input_type(transform_type),
        onembed,
        /* ostride= */ 1,
        /* odist= */ output_embed(transform_type, n),
        output_type(transform_type),
        batch,
        &workspace_size,
        execution_type(transform_type)));
  } catch (...) {
    // Do not leak the handle when plan construction fails part-way.
    if (handle != 0) {
      encoder.device().make_current();
      cufftDestroy(handle);
    }
    throw;
  }

  auto plan = std::make_shared<CuFFTPlan>(
      encoder.device().cuda_device(), handle, workspace_size);
  return cache.emplace(key, plan).first->second;
}

// Returns the identity axis order {0, 1, ..., ndim - 1}.
std::vector<int> make_identity_order(int ndim) {
  std::vector<int> identity(ndim);
  int next_axis = 0;
  for (int& slot : identity) {
    slot = next_axis++;
  }
  return identity;
}

// Returns the permutation of 0..ndim-1 that keeps every position in its
// relative order except `axis_pos`, which is moved to the end.
std::vector<int> move_axis_to_back_permutation(int ndim, int axis_pos) {
  std::vector<int> result;
  result.reserve(ndim);
  for (int dim = 0; dim < ndim; ++dim) {
    if (dim == axis_pos) {
      continue; // appended after the loop
    }
    result.push_back(dim);
  }
  result.push_back(axis_pos);
  return result;
}

// Returns `values` reordered by `perm`: element i of the result is
// values[perm[i]]. The result has perm.size() elements. Every entry of `perm`
// must be a valid index into `values`; this is not checked.
std::vector<int> apply_permutation(
    const std::vector<int>& values,
    const std::vector<int>& perm) {
  std::vector<int> out;
  out.reserve(perm.size());
  // Iterate over the permutation directly rather than comparing a signed
  // index against the unsigned container size.
  for (const int source : perm) {
    out.push_back(values[source]);
  }
  return out;
}

// Returns the index of the first occurrence of `axis` in `order`.
// Throws std::runtime_error when `axis` is not present.
int find_axis_position(const std::vector<int>& order, int axis) {
  for (auto cursor = order.begin(); cursor != order.end(); ++cursor) {
    if (*cursor == axis) {
      return static_cast<int>(cursor - order.begin());
    }
  }
  throw std::runtime_error("[FFT] Internal axis tracking mismatch.");
}

// Produces an array whose last dimension is the original axis `axis` and
// whose data is packed row-contiguously, ready for a batched 1-D transform.
//
// `current` is returned unchanged when `allow_direct` is set, the axis is
// already last, and the array is row-contiguous. Otherwise the axis is moved
// to the back with a transposed view (if needed) and a contiguous copy is
// made and registered as a temporary on `encoder`. The returned `order`
// reflects any transposition applied.
// Throws std::runtime_error if `axis` is not tracked in `current.order`.
OrderedArray prepare_input(
    const OrderedArray& current,
    int axis,
    bool allow_direct,
    cu::CommandEncoder& encoder,
    Stream s) {
  int axis_pos = find_axis_position(current.order, axis);
  bool axis_last = axis_pos == static_cast<int>(current.order.size()) - 1;
  bool direct = allow_direct && axis_last && current.arr.flags().row_contiguous;

  if (direct) {
    return current;
  }

  array view = current.arr;
  std::vector<int> order = current.order;
  if (!axis_last) {
    auto perm = move_axis_to_back_permutation(current.arr.ndim(), axis_pos);
    view = transpose_in_eval(current.arr, perm);
    order = apply_permutation(current.order, perm);
  }

  array packed = contiguous_copy_gpu(view, s);
  encoder.add_temporary(packed);
  return {std::move(packed), std::move(order)};
}

// Runs one batched 1-D transform over the last dimension of `in`, writing
// into a freshly allocated `out`.
//
// `in` must be row-contiguous with unit stride in its last dimension,
// otherwise std::runtime_error is thrown. The transform length is taken from
// `out` for C2R and from `in` otherwise; the batch count is the number of
// last-dimension rows of `in`. cuFFT failures also throw std::runtime_error.
void execute_fft(
    const array& in,
    array& out,
    FFTTransformType transform_type,
    bool inverse,
    cu::CommandEncoder& encoder) {
  if (!in.flags().row_contiguous || in.strides(-1) != 1) {
    throw std::runtime_error("[FFT] Expected packed row-contiguous FFT input.");
  }

  int64_t n =
      transform_type == FFTTransformType::C2R ? out.shape(-1) : in.shape(-1);
  int64_t batch = in.shape().empty() ? 1 : in.size() / in.shape(-1);
  auto plan = get_fft_plan(encoder, transform_type, n, batch);

  encoder.set_input_array(in);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  encoder.set_output_array(out);
  // The empty handler holds a reference to the plan until the encoder runs
  // its completion handlers, so cache eviction cannot destroy it earlier.
  encoder.add_completed_handler([plan]() {});

  // A cached plan may have been created for another stream, so rebind the
  // stream and work area on every execution.
  encoder.device().make_current();
  CHECK_CUFFT_ERROR(cufftSetStream(plan->handle, encoder.stream()));
  auto* workspace = allocate_workspace(encoder, plan->workspace_size);
  CHECK_CUFFT_ERROR(cufftSetWorkArea(plan->handle, workspace));

  auto capture = encoder.capture_context();
  CHECK_CUFFT_ERROR(cufftXtExec(
      plan->handle,
      gpu_ptr<void>(in),
      gpu_ptr<void>(out),
      exec_direction(transform_type, inverse)));
}

void restore_output_layout(const OrderedArray& current, array& out) {
  Strides out_strides(out.ndim());
  for (int i = 0; i < current.order.size(); ++i) {
    out_strides[current.order[i]] = current.arr.strides(i);
  }

  auto [data_size, row_contiguous, col_contiguous] =
      check_contiguity(out.shape(), out_strides);
  bool contiguous =
      current.arr.flags().contiguous && data_size == current.arr.data_size();

  out.copy_shared_buffer(
      current.arr,
      out_strides,
      {contiguous, row_contiguous, col_contiguous},
      current.arr.data_size());
}

// Scales `arr` in place by 1 / prod(out.shape(axis) for axis in axes).
//
// Does nothing when `axes` is empty. Supports float32 and complex64 arrays;
// any other dtype throws std::runtime_error.
void apply_inverse_scale(
    array& arr,
    const std::vector<size_t>& axes,
    const array& out,
    cu::CommandEncoder& encoder) {
  if (axes.empty()) {
    return;
  }

  // Accumulate in double and narrow to float only once, at launch time.
  double scale = 1.0;
  for (auto axis : axes) {
    scale /= out.shape(axis);
  }

  // One thread per stored element, in blocks of 256 threads.
  size_t size = arr.data_size();
  dim3 block_dims(256);
  dim3 grid_dims((size + block_dims.x - 1) / block_dims.x);

  // The array is both read and written by the kernel.
  encoder.set_input_array(arr);
  encoder.set_output_array(arr);

  if (arr.dtype() == float32) {
    float scale_f = static_cast<float>(scale);
    encoder.add_kernel_node(
        cu::scale_fft_output<float>,
        grid_dims,
        block_dims,
        gpu_ptr<float>(arr),
        scale_f,
        size);
  } else if (arr.dtype() == complex64) {
    cu::complex64_t scale_f(static_cast<float>(scale), 0.0f);
    encoder.add_kernel_node(
        cu::scale_fft_output<cu::complex64_t>,
        grid_dims,
        block_dims,
        gpu_ptr<cu::complex64_t>(arr),
        scale_f,
        size);
  } else {
    throw std::runtime_error("[FFT] Unsupported dtype for inverse scaling.");
  }
}

} // namespace

// Evaluates a (possibly multi-axis, possibly real) FFT on the GPU as a
// sequence of batched 1-D cuFFT transforms, one per axis in `axes_`.
//
// Each step moves its axis to the last dimension (copying if necessary),
// transforms it, and feeds the result to the next step. After the last step
// inverse transforms are scaled, and `out` is set up as a strided view of
// the final buffer in the original axis order.
void FFT::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("FFT::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  auto& in = inputs[0];

  if (out.size() == 0) {
    return;
  }

  auto order = make_identity_order(in.ndim());
  OrderedArray current{in, std::move(order)};

  // Inverse transforms walk `axes_` front to back, forward transforms back to
  // front. Either way the step touching the real side of a real transform
  // (axes_.back()) is the first forward step or the last inverse step.
  std::vector<int> axis_sequence;
  axis_sequence.reserve(axes_.size());
  if (inverse_) {
    for (auto axis : axes_) {
      axis_sequence.push_back(static_cast<int>(axis));
    }
  } else {
    for (int i = static_cast<int>(axes_.size()) - 1; i >= 0; --i) {
      axis_sequence.push_back(static_cast<int>(axes_[i]));
    }
  }

  int real_axis = axes_.empty() ? -1 : static_cast<int>(axes_.back());

  for (int i = 0; i < axis_sequence.size(); ++i) {
    int axis = axis_sequence[i];
    // Only the step on `real_axis` of a real transform is R2C/C2R; every
    // other step is complex-to-complex.
    bool step_real = real_ && axis == real_axis;
    auto transform_type = step_real
        ? (inverse_ ? FFTTransformType::C2R : FFTTransformType::R2C)
        : FFTTransformType::C2C;

    // cuFFT may overwrite the input buffer for C2R, so only use the direct
    // input when the transform is out-of-place from the library's perspective
    // or when the original input may be donated to the output.
    auto prepared = prepare_input(
        current,
        axis,
        /* allow_direct= */ transform_type != FFTTransformType::C2R ||
            is_donatable(in, out),
        encoder,
        s);

    // The transformed axis is last in `prepared`; for real steps its length
    // changes to the length `out` has along that axis.
    Shape step_shape = prepared.arr.shape();
    if (step_real) {
      step_shape.back() = out.shape(axis);
    }

    Dtype step_dtype =
        transform_type == FFTTransformType::C2R ? float32 : complex64;
    array step_out(std::move(step_shape), step_dtype, nullptr, {});
    execute_fft(prepared.arr, step_out, transform_type, inverse_, encoder);
    encoder.add_temporary(step_out);

    current = {std::move(step_out), std::move(prepared.order)};
  }

  if (inverse_) {
    apply_inverse_scale(current.arr, axes_, out, encoder);
  }

  restore_output_layout(current, out);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/gemms/cublas_gemm.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/gemms/cublas_gemm.h"
#include "mlx/backend/cuda/cublas_utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/dtype_utils.h"
#include "mlx/utils.h"

#include <fmt/format.h>

namespace mlx::core {

namespace {

// Picks the cuBLAS compute type for a GEMM whose operands have |dtype|.
// Throws std::runtime_error for any dtype not handled below.
cublasComputeType_t dtype_to_compute_type(Dtype dtype) {
  switch (dtype) {
    // Both 16-bit float types use the plain 32-bit float compute type.
    case float16:
    case bfloat16:
      return CUBLAS_COMPUTE_32F;
    // float32 and complex64 switch to the TF32 variant when the environment
    // setting enables it.
    case float32:
    case complex64: {
      if (mlx::core::env::enable_tf32()) {
        return CUBLAS_COMPUTE_32F_FAST_TF32;
      }
      return CUBLAS_COMPUTE_32F;
    }
    case float64:
      return CUBLAS_COMPUTE_64F;
    default:
      throw std::runtime_error(
          fmt::format(
              "Unsupported dtype in CublasGemm: {}.", dtype_to_string(dtype)));
  }
}

} // namespace

// Sets up a matmul without a separate C operand: resolves the cuBLAS data and
// scale types for |dtype|, forwards the operand geometry to init_base(), and
// selects host pointer mode for the scaling factors.
CublasGemm::CublasGemm(
    cu::Device& device,
    Dtype dtype,
    bool a_transposed,
    uint64_t a_rows,
    uint64_t a_cols,
    int64_t lda,
    bool b_transposed,
    uint64_t b_rows,
    uint64_t b_cols,
    int64_t ldb,
    int32_t batch_count,
    int64_t a_batch_stride,
    int64_t b_batch_stride) {
  // Element type of the operands as cuBLAS names it.
  const cudaDataType_t cublas_dtype =
      cublas_utils::dtype_to_cublas_type(dtype, "CublasGemm");

  // 16-bit float inputs use 32-bit float scaling factors; every other dtype
  // scales in its own element type.
  const bool is_half_precision = dtype == bfloat16 || dtype == float16;
  scale_type_ = is_half_precision ? CUDA_R_32F : cublas_dtype;

  init_base(
      device,
      scale_type_,
      dtype_to_compute_type(dtype),
      cublas_dtype,
      cublas_dtype,
      a_transposed,
      a_rows,
      a_cols,
      lda,
      b_transposed,
      b_rows,
      b_cols,
      ldb,
      batch_count,
      a_batch_stride,
      b_batch_stride);

  // execute() hands alpha and beta over as host-side pointers.
  const cublasLtPointerMode_t host_pointer_mode = CUBLASLT_POINTER_MODE_HOST;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_POINTER_MODE,
      &host_pointer_mode,
      sizeof(host_pointer_mode)));
}

// Variant that also takes a C operand: delegates the common setup to the
// constructor without C, then creates the layout descriptor for C.
CublasGemm::CublasGemm(
    cu::Device& device,
    Dtype dtype,
    bool a_transposed,
    uint64_t a_rows,
    uint64_t a_cols,
    int64_t lda,
    bool b_transposed,
    uint64_t b_rows,
    uint64_t b_cols,
    int64_t ldb,
    int64_t ldc,
    int32_t batch_count,
    int64_t a_batch_stride,
    int64_t b_batch_stride,
    int64_t c_batch_stride)
    : CublasGemm(
          device,
          dtype,
          a_transposed,
          a_rows,
          a_cols,
          lda,
          b_transposed,
          b_rows,
          b_cols,
          ldb,
          batch_count,
          a_batch_stride,
          b_batch_stride) {
  // C uses the same element type as A and B and is never marked transposed.
  const auto c_dtype = cublas_utils::dtype_to_cublas_type(dtype, "CublasGemm");
  c_desc_ = cublas_utils::create_matrix_layout(
      c_dtype,
      b_cols,
      a_rows,
      /* transposed */ false,
      ldc,
      batch_count,
      c_batch_stride);
}

// Replaces the output layout descriptor with one describing an output of
// |dtype| with the given geometry.
//
// The replacement is created before the current descriptor is released. If
// |dtype| is unsupported or layout creation throws, out_desc_ still refers to
// a live descriptor instead of one that has already been destroyed (which
// would otherwise be destroyed a second time later).
void CublasGemm::set_out(
    Dtype dtype,
    bool transposed,
    uint64_t rows,
    uint64_t cols,
    int64_t ld,
    int32_t batch_count,
    int64_t batch_stride) {
  auto new_desc = cublas_utils::create_matrix_layout(
      cublas_utils::dtype_to_cublas_type(dtype, "CublasGemm"),
      cols,
      rows,
      transposed,
      ld,
      batch_count,
      batch_stride);

  // Install the new descriptor first so the member never dangles, then
  // release the one it replaced.
  auto old_desc = out_desc_;
  out_desc_ = new_desc;
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(old_desc));
}

// Runs the matmul of |a| and |b| into |out|, scaled by |alpha|.
// When the total batch count divided by the innermost batch dimension is
// greater than one the work is handed to run_batched(); otherwise a single
// execute() call covers it.
void CublasGemm::run(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
    const Strides& b_batch_strides,
    float alpha) {
  const int batch_count = out.size() / (M_ * N_);
  const bool use_batched_path = batch_count / batch_shape.back() > 1;

  if (!use_batched_path) {
    // Register the dependencies with the encoder, then issue one call.
    encoder.set_input_array(a);
    encoder.set_input_array(b);
    encoder.set_output_array(out);

    execute(
        encoder,
        gpu_ptr<void>(out),
        gpu_ptr<void>(a),
        gpu_ptr<void>(b),
        nullptr,
        alpha);
    return;
  }

  run_batched(
      encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides, alpha);
}

// Same as the overload without |c|, but also passes the C operand and the
// |beta| scalar through to the matmul.
// When the total batch count divided by the innermost batch dimension is
// greater than one the work is handed to run_batched(); otherwise a single
// execute() call covers it.
void CublasGemm::run(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const array& c,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
    const Strides& b_batch_strides,
    const Strides& c_batch_strides,
    float alpha,
    float beta) {
  const int batch_count = out.size() / (M_ * N_);
  const bool use_batched_path = batch_count / batch_shape.back() > 1;

  if (!use_batched_path) {
    // Register the dependencies with the encoder, then issue one call.
    encoder.set_input_array(a);
    encoder.set_input_array(b);
    encoder.set_input_array(c);
    encoder.set_output_array(out);

    execute(
        encoder,
        gpu_ptr<void>(out),
        gpu_ptr<void>(a),
        gpu_ptr<void>(b),
        gpu_ptr<void>(c),
        alpha,
        beta);
    return;
  }

  run_batched(
      encoder,
      out,
      a,
      b,
      c,
      batch_shape,
      a_batch_strides,
      b_batch_strides,
      c_batch_strides,
      alpha,
      beta);
}

// Forwards one matmul to execute_matmul(), passing alpha/beta by address in
// the representation that matches scale_type_.
void CublasGemm::execute(
    cu::CommandEncoder& encoder,
    void* out,
    const void* a,
    const void* b,
    const void* c,
    const float alpha /* = 1 */,
    const float beta /* = 0 */) {
  if (scale_type_ == CUDA_C_32F) {
    // Complex scale type: promote the real scalars to complex values with a
    // zero imaginary part before taking their addresses.
    const complex64_t complex_alpha{alpha, 0.0f};
    const complex64_t complex_beta{beta, 0.0f};
    execute_matmul(encoder, out, a, b, c, &complex_alpha, &complex_beta);
    return;
  }

  // Any other scale type: the float scalars are passed as they are.
  execute_matmul(encoder, out, a, b, c, &alpha, &beta);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/gemms/cublas_gemm.h
================================================
// Copyright © 2025 Apple Inc.
#pragma once

#include "mlx/array.h"
#include "mlx/backend/cuda/cublas_utils.h"
#include "mlx/backend/cuda/device.h"

#include <cublasLt.h>

namespace mlx::core {

// Matmul helper layered on CublasMatmulBase. The constructors configure the
// operand descriptors; run() then issues either a single matmul or defers to
// run_batched() depending on the batch layout.
class CublasGemm : public CublasMatmulBase {
 public:
  // Configures a matmul of A and B with element type |dtype|. For each
  // operand the caller supplies whether it is transposed, its rows/cols, its
  // leading dimension and its batch stride; |batch_count| is shared.
  CublasGemm(
      cu::Device& device,
      Dtype dtype,
      bool a_transposed,
      uint64_t a_rows,
      uint64_t a_cols,
      int64_t lda,
      bool b_transposed,
      uint64_t b_rows,
      uint64_t b_cols,
      int64_t ldb,
      int32_t batch_count,
      int64_t a_batch_stride,
      int64_t b_batch_stride);

  // Same as above, plus a C operand described by its leading dimension |ldc|
  // and batch stride |c_batch_stride|.
  CublasGemm(
      cu::Device& device,
      Dtype dtype,
      bool a_transposed,
      uint64_t a_rows,
      uint64_t a_cols,
      int64_t lda,
      bool b_transposed,
      uint64_t b_rows,
      uint64_t b_cols,
      int64_t ldb,
      int64_t ldc,
      int32_t batch_count,
      int64_t a_batch_stride,
      int64_t b_batch_stride,
      int64_t c_batch_stride);

  // The output's descriptor is inferred from inputs by default, use this method
  // for unusual output.
  void set_out(
      Dtype dtype,
      bool transposed,
      uint64_t rows,
      uint64_t cols,
      int64_t ld,
      int32_t batch_count,
      int64_t batch_stride);

  // Runs the matmul of |a| and |b| into |out|. |alpha| is handed to cuBLAS as
  // the alpha scalar; the batch shape/strides are only used when the call is
  // routed to run_batched().
  void run(
      cu::CommandEncoder& encoder,
      array& out,
      const array& a,
      const array& b,
      const Shape& batch_shape,
      const Strides& a_batch_strides,
      const Strides& b_batch_strides,
      float alpha = 1.0f);

  // Variant that additionally passes the C operand and the beta scalar
  // (presumably out = alpha * a @ b + beta * c per cuBLASLt semantics --
  // TODO confirm against CublasMatmulBase::execute_matmul).
  void run(
      cu::CommandEncoder& encoder,
      array& out,
      const array& a,
      const array& b,
      const array& c,
      const Shape& batch_shape,
      const Strides& a_batch_strides,
      const Strides& b_batch_strides,
      const Strides& c_batch_strides,
      float alpha,
      float beta);

 private:
  // Batched path used by run() when one matmul call cannot cover all batches.
  // NOTE(review): two definitions appear in the tree
  // (cublas_gemm_batched_12_0.cpp and cublas_gemm_batched_12_9.cu);
  // presumably the build selects one by CUDA version -- confirm in CMake.
  void run_batched(
      cu::CommandEncoder& encoder,
      array& out,
      const array& a,
      const array& b,
      const Shape& batch_shape,
      const Strides& a_batch_strides,
      const Strides& b_batch_strides,
      float alpha);

  // Batched path with the C operand and beta scalar.
  void run_batched(
      cu::CommandEncoder& encoder,
      array& out,
      const array& a,
      const array& b,
      const array& c,
      const Shape& batch_shape,
      const Strides& a_batch_strides,
      const Strides& b_batch_strides,
      const Strides& c_batch_strides,
      float alpha,
      float beta);

  // Issues one matmul on raw pointers. alpha/beta are converted to complex
  // values when the scale type is complex before being passed on.
  void execute(
      cu::CommandEncoder& encoder,
      void* out,
      const void* a,
      const void* b,
      const void* c,
      float alpha = 1,
      float beta = 0);
};

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/gemms/cublas_gemm_batched_12_0.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/gemms/cublas_gemm.h"

namespace mlx::core {

// Batched matmul issued as a loop of execute() calls. Each call covers
// batch_shape.back() matrices; the remaining (outer) batch dimensions are
// walked with ContiguousIterator to find the A and B offsets.
void CublasGemm::run_batched(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
    const Strides& b_batch_strides,
    float alpha) {
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);

  const auto num_calls = out.size() / (M_ * N_ * batch_shape.back());
  // Iterate over every batch dimension except the innermost one.
  const auto outer_dims = batch_shape.size() - 1;
  ContiguousIterator a_iter(batch_shape, a_batch_strides, outer_dims);
  ContiguousIterator b_iter(batch_shape, b_batch_strides, outer_dims);

  // Held for the duration of the loop.
  // NOTE(review): presumably marks the calls as concurrent work -- confirm in
  // CommandEncoder.
  auto concurrent = encoder.concurrent_context();
  for (size_t call = 0; call < num_calls; ++call) {
    // Offsets are computed in bytes, hence the int8_t pointers and itemsize().
    auto* out_ptr = gpu_ptr<int8_t>(out) +
        out.itemsize() * call * batch_shape.back() * M_ * N_;
    auto* a_ptr = gpu_ptr<int8_t>(a) + a.itemsize() * a_iter.loc;
    auto* b_ptr = gpu_ptr<int8_t>(b) + b.itemsize() * b_iter.loc;

    execute(encoder, out_ptr, a_ptr, b_ptr, nullptr, alpha);

    a_iter.step();
    b_iter.step();
  }
}

// Batched matmul with a C operand, issued as a loop of execute() calls. Each
// call covers batch_shape.back() matrices; the remaining (outer) batch
// dimensions are walked with ContiguousIterator to find the A, B and C
// offsets.
void CublasGemm::run_batched(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const array& c,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
    const Strides& b_batch_strides,
    const Strides& c_batch_strides,
    float alpha,
    float beta) {
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(c);
  encoder.set_output_array(out);

  const auto num_calls = out.size() / (M_ * N_ * batch_shape.back());
  // Iterate over every batch dimension except the innermost one.
  const auto outer_dims = batch_shape.size() - 1;
  ContiguousIterator a_iter(batch_shape, a_batch_strides, outer_dims);
  ContiguousIterator b_iter(batch_shape, b_batch_strides, outer_dims);
  ContiguousIterator c_iter(batch_shape, c_batch_strides, outer_dims);

  // Held for the duration of the loop.
  // NOTE(review): presumably marks the calls as concurrent work -- confirm in
  // CommandEncoder.
  auto concurrent = encoder.concurrent_context();
  for (size_t call = 0; call < num_calls; ++call) {
    // Offsets are computed in bytes, hence the int8_t pointers and itemsize().
    auto* out_ptr = gpu_ptr<int8_t>(out) +
        out.itemsize() * call * batch_shape.back() * M_ * N_;
    auto* a_ptr = gpu_ptr<int8_t>(a) + a.itemsize() * a_iter.loc;
    auto* b_ptr = gpu_ptr<int8_t>(b) + b.itemsize() * b_iter.loc;
    auto* c_ptr = gpu_ptr<int8_t>(c) + c.itemsize() * c_iter.loc;

    execute(encoder, out_ptr, a_ptr, b_ptr, c_ptr, alpha, beta);

    a_iter.step();
    b_iter.step();
    c_iter.step();
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/gemms/cublas_gemm.h"
#include "mlx/backend/cuda/kernel_utils.cuh"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Fills |pointers| with per-batch byte addresses for a pointer-array matmul.
// The buffer holds three consecutive tables of |batch_count| entries each:
// A pointers, then B pointers, then output pointers. One thread handles one
// batch index; NDIM is the compile-time number of batch dimensions.
template <int NDIM>
__global__ void set_mm_device_pointers_nd(
    int8_t** pointers,
    int8_t* a_start,
    int8_t* b_start,
    int8_t* out_start,
    int item_size,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> batch_shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_batch_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_batch_strides,
    int64_t batch_stride,
    int batch_count) {
  auto batch = cg::this_grid().thread_rank();
  if (batch < batch_count) {
    // Element offsets of this batch inside A and B, from their batch strides.
    auto [a_elems, b_elems] = elem_to_loc_nd<NDIM>(
        batch,
        batch_shape.data(),
        a_batch_strides.data(),
        b_batch_strides.data());

    int8_t** a_table = pointers;
    int8_t** b_table = pointers + batch_count;
    int8_t** out_table = pointers + 2 * batch_count;

    a_table[batch] = a_start + item_size * a_elems;
    b_table[batch] = b_start + item_size * b_elems;
    // The output is addressed densely: batch i starts i * batch_stride
    // elements in.
    out_table[batch] = out_start + item_size * batch * batch_stride;
  }
}

// General-rank version of set_mm_device_pointers_nd: the number of batch
// dimensions is passed at run time as |batch_ndim|. Fills three consecutive
// tables of |batch_count| entries (A, B, output) in |pointers|, one thread per
// batch index.
__global__ void set_mm_device_pointers_g(
    int8_t** pointers,
    int8_t* a_start,
    int8_t* b_start,
    int8_t* out_start,
    int item_size,
    const __grid_constant__ Shape batch_shape,
    const __grid_constant__ Strides a_batch_strides,
    const __grid_constant__ Strides b_batch_strides,
    int64_t batch_stride,
    int batch_ndim,
    int batch_count) {
  auto batch = cg::this_grid().thread_rank();
  if (batch < batch_count) {
    // Element offsets of this batch inside A and B, from their batch strides.
    auto [a_elems, b_elems] = elem_to_loc(
        batch,
        batch_shape.data(),
        a_batch_strides.data(),
        b_batch_strides.data(),
        batch_ndim);

    int8_t** a_table = pointers;
    int8_t** b_table = pointers + batch_count;
    int8_t** out_table = pointers + 2 * batch_count;

    a_table[batch] = a_start + item_size * a_elems;
    b_table[batch] = b_start + item_size * b_elems;
    // The output is addressed densely: batch i starts i * batch_stride
    // elements in.
    out_table[batch] = out_start + item_size * batch * batch_stride;
  }
}

// Fills |pointers| with per-batch byte addresses for a pointer-array matmul
// that also has a C operand. The buffer holds four consecutive tables of
// |batch_count| entries each: A, B, C, then output pointers. One thread
// handles one batch index; NDIM is the compile-time number of batch
// dimensions.
template <int NDIM>
__global__ void set_addmm_device_pointers_nd(
    int8_t** pointers,
    int8_t* a_start,
    int8_t* b_start,
    int8_t* c_start,
    int8_t* out_start,
    int item_size,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> batch_shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_batch_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_batch_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> c_batch_strides,
    int64_t batch_stride,
    int batch_count) {
  auto batch = cg::this_grid().thread_rank();
  if (batch < batch_count) {
    // Element offsets of this batch inside A, B and C.
    auto [a_elems, b_elems, c_elems] = elem_to_loc_nd<NDIM>(
        batch,
        batch_shape.data(),
        a_batch_strides.data(),
        b_batch_strides.data(),
        c_batch_strides.data());

    int8_t** a_table = pointers;
    int8_t** b_table = pointers + batch_count;
    int8_t** c_table = pointers + 2 * batch_count;
    int8_t** out_table = pointers + 3 * batch_count;

    a_table[batch] = a_start + item_size * a_elems;
    b_table[batch] = b_start + item_size * b_elems;
    c_table[batch] = c_start + item_size * c_elems;
    // The output is addressed densely: batch i starts i * batch_stride
    // elements in.
    out_table[batch] = out_start + item_size * batch * batch_stride;
  }
}

// General-rank version of set_addmm_device_pointers_nd: the number of batch
// dimensions is passed at run time as |batch_ndim|. Fills four consecutive
// tables of |batch_count| entries (A, B, C, output) in |pointers|, one thread
// per batch index.
__global__ void set_addmm_device_pointers_g(
    int8_t** pointers,
    int8_t* a_start,
    int8_t* b_start,
    int8_t* c_start,
    int8_t* out_start,
    int item_size,
    const __grid_constant__ Shape batch_shape,
    const __grid_constant__ Strides a_batch_strides,
    const __grid_constant__ Strides b_batch_strides,
    const __grid_constant__ Strides c_batch_strides,
    int64_t batch_stride,
    int batch_ndim,
    int batch_count) {
  auto batch = cg::this_grid().thread_rank();
  if (batch < batch_count) {
    // Element offsets of this batch inside A, B and C.
    auto [a_elems, b_elems, c_elems] = elem_to_loc(
        batch,
        batch_shape.data(),
        a_batch_strides.data(),
        b_batch_strides.data(),
        c_batch_strides.data(),
        batch_ndim);

    int8_t** a_table = pointers;
    int8_t** b_table = pointers + batch_count;
    int8_t** c_table = pointers + 2 * batch_count;
    int8_t** out_table = pointers + 3 * batch_count;

    a_table[batch] = a_start + item_size * a_elems;
    b_table[batch] = b_start + item_size * b_elems;
    c_table[batch] = c_start + item_size * c_elems;
    // The output is addressed densely: batch i starts i * batch_stride
    // elements in.
    out_table[batch] = out_start + item_size * batch * batch_stride;
  }
}

} // namespace cu

namespace {

// Puts the layout |desc| into pointer-array batch mode and records how many
// batches the pointer arrays contain.
void set_pointer_mode(cublasLtMatrixLayout_t desc, int batch_count) {
  const auto pointer_array_mode = CUBLASLT_BATCH_MODE_POINTER_ARRAY;
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
      desc,
      CUBLASLT_MATRIX_LAYOUT_BATCH_MODE,
      &pointer_array_mode,
      sizeof(pointer_array_mode)));

  // The attribute is written as a 32-bit integer.
  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
      desc,
      CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT,
      &batch_count,
      sizeof(int32_t)));
}

} // namespace

// Batched matmul using pointer-array batching: a small kernel first writes
// one A/B/output address per batch into a temporary device buffer, then a
// single execute() call is issued with those pointer tables as its operands.
void CublasGemm::run_batched(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
    const Strides& b_batch_strides,
    float alpha) {
  int batch_count = out.size() / (M_ * N_);
  // Switch every descriptor involved to pointer-array batch mode.
  set_pointer_mode(a_desc_, batch_count);
  set_pointer_mode(b_desc_, batch_count);
  set_pointer_mode(out_desc_, batch_count);

  // Launch kernel to set device offsets
  // The buffer holds three tables of batch_count pointers: A, B, then out.
  auto pointers = array(
      cu::malloc_async(batch_count * sizeof(void*) * 3, encoder),
      {batch_count * 3},
      uint64);

  encoder.add_temporary(pointers);
  encoder.set_output_array(pointers);

  // One thread per batch, at most 256 threads per block.
  int block_dims = std::min(batch_count, 256);
  int num_blocks = cuda::ceil_div(batch_count, block_dims);
  // Output batches are dense: consecutive batches are M_ * N_ elements apart.
  int64_t batch_stride = M_ * N_;
  int item_size = out.itemsize();

  int ndim = batch_shape.size();
  if (ndim <= 3) {
    // Small ranks use the kernel specialized on the number of batch dims.
    dispatch_1_2_3(ndim, [&](auto ndim_constant) {
      encoder.add_kernel_node(
          cu::set_mm_device_pointers_nd<ndim_constant()>,
          num_blocks,
          block_dims,
          gpu_ptr<int8_t*>(pointers),
          gpu_ptr<int8_t>(a),
          gpu_ptr<int8_t>(b),
          gpu_ptr<int8_t>(out),
          item_size,
          const_param<ndim_constant()>(batch_shape),
          const_param<ndim_constant()>(a_batch_strides),
          const_param<ndim_constant()>(b_batch_strides),
          batch_stride,
          batch_count);
    });
  } else {
    // Higher ranks fall back to the kernel taking the rank at run time.
    encoder.add_kernel_node(
        cu::set_mm_device_pointers_g,
        num_blocks,
        block_dims,
        gpu_ptr<int8_t*>(pointers),
        gpu_ptr<int8_t>(a),
        gpu_ptr<int8_t>(b),
        gpu_ptr<int8_t>(out),
        item_size,
        const_param(batch_shape),
        const_param(a_batch_strides),
        const_param(b_batch_strides),
        batch_stride,
        ndim,
        batch_count);
  }

  // Run matmul
  // The pointer buffer is now an input of the matmul step.
  encoder.set_input_array(pointers);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);

  // Slice the buffer into its three tables, in the order the kernel wrote them.
  auto a_pointers = gpu_ptr<int8_t*>(pointers);
  auto b_pointers = a_pointers + batch_count;
  auto out_pointers = b_pointers + batch_count;
  execute(
      encoder,
      reinterpret_cast<void*>(out_pointers),
      reinterpret_cast<void*>(a_pointers),
      reinterpret_cast<void*>(b_pointers),
      nullptr,
      alpha);
}

// Batched matmul with a C operand using pointer-array batching: a small kernel
// first writes one A/B/C/output address per batch into a temporary device
// buffer, then a single execute() call is issued with those pointer tables as
// its operands.
void CublasGemm::run_batched(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const array& c,
    const Shape& batch_shape,
    const Strides& a_batch_strides,
    const Strides& b_batch_strides,
    const Strides& c_batch_strides,
    float alpha,
    float beta) {
  int batch_count = out.size() / (M_ * N_);
  // Switch every descriptor involved to pointer-array batch mode.
  set_pointer_mode(a_desc_, batch_count);
  set_pointer_mode(b_desc_, batch_count);
  set_pointer_mode(c_desc_, batch_count);
  set_pointer_mode(out_desc_, batch_count);

  // Launch kernel to set device offsets
  // The buffer holds four tables of batch_count pointers: A, B, C, then out.
  auto pointers = array(
      cu::malloc_async(batch_count * sizeof(uint64_t) * 4, encoder),
      {batch_count * 4},
      uint64);

  encoder.add_temporary(pointers);
  encoder.set_output_array(pointers);

  // One thread per batch, at most 256 threads per block.
  int block_dims = std::min(batch_count, 256);
  int num_blocks = cuda::ceil_div(batch_count, block_dims);
  // Output batches are dense: consecutive batches are M_ * N_ elements apart.
  int64_t batch_stride = M_ * N_;
  int item_size = out.itemsize();

  int ndim = batch_shape.size();
  if (ndim <= 3) {
    // Small ranks use the kernel specialized on the number of batch dims.
    dispatch_1_2_3(ndim, [&](auto ndim_constant) {
      encoder.add_kernel_node(
          cu::set_addmm_device_pointers_nd<ndim_constant()>,
          num_blocks,
          block_dims,
          gpu_ptr<int8_t*>(pointers),
          gpu_ptr<int8_t>(a),
          gpu_ptr<int8_t>(b),
          gpu_ptr<int8_t>(c),
          gpu_ptr<int8_t>(out),
          item_size,
          const_param<ndim_constant()>(batch_shape),
          const_param<ndim_constant()>(a_batch_strides),
          const_param<ndim_constant()>(b_batch_strides),
          const_param<ndim_constant()>(c_batch_strides),
          batch_stride,
          batch_count);
    });
  } else {
    // Higher ranks fall back to the kernel taking the rank at run time.
    encoder.add_kernel_node(
        cu::set_addmm_device_pointers_g,
        num_blocks,
        block_dims,
        gpu_ptr<int8_t*>(pointers),
        gpu_ptr<int8_t>(a),
        gpu_ptr<int8_t>(b),
        gpu_ptr<int8_t>(c),
        gpu_ptr<int8_t>(out),
        item_size,
        const_param(batch_shape),
        const_param(a_batch_strides),
        const_param(b_batch_strides),
        const_param(c_batch_strides),
        batch_stride,
        ndim,
        batch_count);
  }

  // Run matmul
  // The pointer buffer is now an input of the matmul step.
  encoder.set_input_array(pointers);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(c);
  encoder.set_output_array(out);

  // Slice the buffer into its four tables, in the order the kernel wrote them.
  auto a_pointers = gpu_ptr<int8_t*>(pointers);
  auto b_pointers = a_pointers + batch_count;
  auto c_pointers = b_pointers + batch_count;
  auto out_pointers = c_pointers + batch_count;
  execute(
      encoder,
      reinterpret_cast<void*>(out_pointers),
      reinterpret_cast<void*>(a_pointers),
      reinterpret_cast<void*>(b_pointers),
      reinterpret_cast<void*>(c_pointers),
      alpha,
      beta);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/gemms/gemv.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/gemms/gemv.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace mlx::core::cu {

namespace cg = cooperative_groups;

static constexpr int rows_per_block = 8;

// Maps an input element type T to the type used to accumulate gemv dot
// products: the two 16-bit float types (__half, __nv_bfloat16) accumulate in
// float, every other type accumulates in T itself.
template <typename T>
struct GemvAccType {
  using type = std::conditional_t<
      std::is_same_v<T, __half> || std::is_same_v<T, __nv_bfloat16>,
      float,
      T>;
};

// Computes out[row] = dot(mat[row, :], vec) for one row per warp.
//
// The row is chosen from the block index and the thread's y index, so a block
// covers rows_per_block rows. The lanes of a warp stride across the columns,
// each loading n_per_thread consecutive elements at a time; partial sums are
// kept in GemvAccType<T>::type, reduced across the warp, and lane 0 writes
// the result.
//
// |mat| is read as a dense rows x cols row-major matrix. Loads are not
// bounds-checked within a vector, so callers are expected to pick
// n_per_thread such that cols is a multiple of WARP_SIZE * n_per_thread (the
// host-side gemv() does this via its K % 128 / K % 64 checks).
template <typename T, int rows_per_block, int n_per_thread>
__device__ void
gemv_impl(const T* mat, const T* vec, T* out, int rows, int cols) {
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  auto g_idx = block.group_index();
  auto t_idx = block.thread_index();
  int row = g_idx.x * rows_per_block + t_idx.y;

  if (row < rows) {
    using Acc = typename GemvAccType<T>::type;
    // Compute the row start with 64-bit arithmetic: row * cols does not fit
    // in a 32-bit int once the matrix holds 2^31 or more elements.
    const T* mat_row = mat + static_cast<int64_t>(row) * cols;
    Acc sum = Acc(0);
    for (int col = n_per_thread * warp.thread_rank(); col < cols;
         col += (WARP_SIZE * n_per_thread)) {
      auto local_mat = unsafe_load_vector<n_per_thread>(mat_row + col, 0);
      auto local_vec = unsafe_load_vector<n_per_thread>(vec + col, 0);
#pragma unroll
      for (int j = 0; j < n_per_thread; ++j) {
        sum += static_cast<Acc>(local_mat[j]) * static_cast<Acc>(local_vec[j]);
      }
    }

    // Combine the per-lane partial sums; only lane 0 stores the result.
    sum = cg::reduce(warp, sum, cg::plus<Acc>{});
    if (warp.thread_rank() == 0) {
      out[row] = static_cast<T>(sum);
    }
  }
}

// Kernel entry point for a single (non-batched) matrix-vector product; simply
// forwards its arguments to gemv_impl.
template <typename T, int rows_per_block, int n_per_thread>
__global__ void
gemv_single(const T* mat, const T* vec, T* out, int rows, int cols) {
  gemv_impl<T, rows_per_block, n_per_thread>(mat, vec, out, rows, cols);
}

// Kernel entry point for batched matrix-vector products. The y block index
// selects the batch; the matrix and vector offsets for that batch are derived
// from the batch shape and strides, while the output is written densely
// (|rows| elements per batch).
template <typename T, int rows_per_block, int n_per_thread>
__global__ void gemv_batched(
    const T* mat,
    const T* vec,
    T* out,
    int rows,
    int cols,
    const __grid_constant__ Shape batch_shape,
    const __grid_constant__ Strides mat_batch_strides,
    const __grid_constant__ Strides vec_batch_strides,
    int batch_ndim) {
  auto block = cg::this_thread_block();
  auto batch_idx = block.group_index().y;
  auto [vec_offset, mat_offset] = elem_to_loc(
      batch_idx,
      batch_shape.data(),
      vec_batch_strides.data(),
      mat_batch_strides.data(),
      batch_ndim);
  // batch_idx is a 32-bit unsigned value; widen before multiplying so the
  // output offset cannot wrap for very large outputs.
  gemv_impl<T, rows_per_block, n_per_thread>(
      mat + mat_offset,
      vec + vec_offset,
      out + static_cast<int64_t>(batch_idx) * rows,
      rows,
      cols);
}

// Kernel entry point for gathered matrix-vector products. The y block index
// selects one entry of the index arrays; that entry names which matrix batch
// and which vector batch to multiply. The output is written densely (|rows|
// elements per index entry).
template <typename T, int rows_per_block, int n_per_thread>
__global__ void gemv_gather(
    const T* mat,
    const T* vec,
    T* out,
    uint32_t* mat_indices,
    uint32_t* vec_indices,
    int rows,
    int cols,
    const __grid_constant__ Shape mat_batch_shape,
    const __grid_constant__ Strides mat_batch_strides,
    int mat_batch_ndim,
    const __grid_constant__ Shape vec_batch_shape,
    const __grid_constant__ Strides vec_batch_strides,
    int vec_batch_ndim,
    const __grid_constant__ Shape index_shape,
    const __grid_constant__ Strides mat_index_strides,
    const __grid_constant__ Strides vec_index_strides,
    int index_batch_ndim) {
  auto block = cg::this_thread_block();
  auto indices_idx = block.group_index().y;

  // Look up the matrix/vector batch indices for this output slot. Index
  // arrays with more than one dimension go through the strided lookup; the
  // other case multiplies by the first stride directly.
  uint32_t index_mat, index_vec;
  if (index_batch_ndim > 1) {
    auto [mat_idx_offset, vec_idx_offset] = elem_to_loc(
        indices_idx,
        index_shape.data(),
        mat_index_strides.data(),
        vec_index_strides.data(),
        index_batch_ndim);
    index_mat = mat_indices[mat_idx_offset];
    index_vec = vec_indices[vec_idx_offset];
  } else {
    index_mat = mat_indices[indices_idx * mat_index_strides[0]];
    index_vec = vec_indices[indices_idx * vec_index_strides[0]];
  }

  // Translate the batch index into an element offset inside |mat|.
  int64_t mat_offset;
  if (mat_batch_ndim > 1) {
    mat_offset = elem_to_loc(
        index_mat,
        mat_batch_shape.data(),
        mat_batch_strides.data(),
        mat_batch_ndim);
  } else {
    mat_offset = index_mat * mat_batch_strides[0];
  }

  // Translate the batch index into an element offset inside |vec|.
  int64_t vec_offset;
  if (vec_batch_ndim > 1) {
    vec_offset = elem_to_loc(
        index_vec,
        vec_batch_shape.data(),
        vec_batch_strides.data(),
        vec_batch_ndim);
  } else {
    vec_offset = index_vec * vec_batch_strides[0];
  }

  // indices_idx is a 32-bit unsigned value; widen before multiplying so the
  // output offset cannot wrap for very large outputs.
  gemv_impl<T, rows_per_block, n_per_thread>(
      mat + mat_offset,
      vec + vec_offset,
      out + static_cast<int64_t>(indices_idx) * rows,
      rows,
      cols);
}

// Returns true when an (M x K) @ (K x N) product can be handled by the gemv
// kernels: K must be a multiple of 32 and one side must be a vector whose
// matrix partner has the required transposition.
bool can_use_gemv(int M, int N, int K, bool a_transposed, bool b_transposed) {
  if (K % 32 != 0) {
    return false;
  }
  // A is a single row and B is transposed.
  const bool row_vector_times_matrix = M == 1 && b_transposed;
  // B is a single column and A is not transposed.
  const bool matrix_times_col_vector = N == 1 && !a_transposed;
  return row_vector_times_matrix || matrix_times_col_vector;
}

// Turns the run-time value |n_per_thread| into a compile-time constant by
// invoking |f| with the matching std::integral_constant. Only 1, 2 and 4 are
// recognized; for any other value |f| is not called.
template <typename F>
void dispatch_n_per_thread(int n_per_thread, F&& f) {
  if (n_per_thread == 1) {
    f(std::integral_constant<int, 1>{});
  } else if (n_per_thread == 2) {
    f(std::integral_constant<int, 2>{});
  } else if (n_per_thread == 4) {
    f(std::integral_constant<int, 4>{});
  }
}

// Host-side launcher for the gemv kernels. Picks which operand is the matrix
// and which is the vector, chooses how many elements each thread loads per
// step, and enqueues either gemv_single or gemv_batched on |encoder|.
void gemv(
    const array& a,
    const array& b,
    array& out,
    int M,
    int N,
    int K,
    uint32_t batch_count,
    const mlx::core::Shape& batch_shape,
    const mlx::core::Strides& a_batch_strides,
    const mlx::core::Strides& b_batch_strides,
    CommandEncoder& encoder) {
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
  dispatch_inexact_types(out.dtype(), "gemv", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    // One warp per row, rows_per_block rows per block.
    dim3 block_dims{WARP_SIZE, rows_per_block};
    const DataType* mat;
    const DataType* vec;
    int rows;
    int cols = K;
    auto mat_strides = const_param(a_batch_strides);
    auto vec_strides = const_param(b_batch_strides);

    if (M == 1) {
      // a is the vector and b the matrix, so the roles (and strides) swap.
      mat = gpu_ptr<DataType>(b);
      vec = gpu_ptr<DataType>(a);
      rows = N;
      std::swap(mat_strides, vec_strides);
    } else {
      mat = gpu_ptr<DataType>(a);
      vec = gpu_ptr<DataType>(b);
      rows = M;
    }
    // Ceiling division: enough blocks to cover every row.
    uint32_t num_blocks_x = (rows + rows_per_block - 1) / rows_per_block;
    // Elements loaded per thread per step: use the widest option for which K
    // is a multiple of WARP_SIZE-sized vector steps (K % 128 for 4, K % 64
    // for 2) and both pointers pass the corresponding is_aligned check.
    int n_per_t;
    if (K % 128 == 0 && is_aligned<4>(mat) && is_aligned<4>(vec)) {
      n_per_t = 4;
    } else if (K % 64 == 0 && is_aligned<2>(mat) && is_aligned<2>(vec)) {
      n_per_t = 2;
    } else {
      n_per_t = 1;
    }
    dispatch_n_per_thread(n_per_t, [&](auto n_per_thread) {
      if (batch_count == 1) {
        auto kernel = gemv_single<DataType, rows_per_block, n_per_thread()>;
        encoder.add_kernel_node(
            kernel,
            num_blocks_x,
            block_dims,
            mat,
            vec,
            gpu_ptr<DataType>(out),
            rows,
            cols);
      } else {
        // The grid's y dimension enumerates the batches.
        auto kernel = gemv_batched<DataType, rows_per_block, n_per_thread()>;
        encoder.add_kernel_node(
            kernel,
            dim3{num_blocks_x, batch_count},
            block_dims,
            mat,
            vec,
            gpu_ptr<DataType>(out),
            rows,
            cols,
            const_param(batch_shape),
            mat_strides,
            vec_strides,
            batch_shape.size());
      }
    });
  });
}

// Host-side launcher for gemv_gather: for every entry of the index arrays,
// multiplies the selected matrix batch with the selected vector batch and
// writes N results into |out|.
void gather_mv(
    const array& mat_,
    const array& vec_,
    const array& mat_indices,
    const array& vec_indices,
    array& out,
    int N,
    int K,
    CommandEncoder& encoder) {
  encoder.set_input_array(mat_);
  encoder.set_input_array(vec_);
  encoder.set_input_array(mat_indices);
  encoder.set_input_array(vec_indices);
  encoder.set_output_array(out);
  dispatch_inexact_types(out.dtype(), "gather_mv", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    // One warp per row, rows_per_block rows per block.
    dim3 block_dims{WARP_SIZE, rows_per_block};
    int rows = N;
    int cols = K;
    // Number of gathered products: one per N output elements.
    uint32_t batch_size = static_cast<uint32_t>(out.size() / N);
    const DataType* mat = gpu_ptr<DataType>(mat_);
    const DataType* vec = gpu_ptr<DataType>(vec_);

    // Ceiling division: enough blocks to cover every row.
    uint32_t num_blocks_x = (rows + rows_per_block - 1) / rows_per_block;
    // Elements loaded per thread per step: 4 or 2 when K is a multiple of
    // 128 or 64 respectively and both pointers pass the matching is_aligned
    // check, otherwise 1.
    int n_per_t;
    if (K % 128 == 0 && is_aligned<4>(mat) && is_aligned<4>(vec)) {
      n_per_t = 4;
    } else if (K % 64 == 0 && is_aligned<2>(mat) && is_aligned<2>(vec)) {
      n_per_t = 2;
    } else {
      n_per_t = 1;
    }

    dispatch_n_per_thread(n_per_t, [&](auto n_per_thread) {
      auto kernel = gemv_gather<DataType, rows_per_block, n_per_thread()>;
      // The grid's y dimension enumerates the index entries. The batch rank
      // of each operand is its ndim minus the two trailing matrix dims.
      encoder.add_kernel_node(
          kernel,
          dim3{num_blocks_x, batch_size},
          block_dims,
          mat,
          vec,
          gpu_ptr<DataType>(out),
          gpu_ptr<uint32_t>(mat_indices),
          gpu_ptr<uint32_t>(vec_indices),
          rows,
          cols,
          const_param(mat_.shape()),
          const_param(mat_.strides()),
          mat_.ndim() - 2,
          const_param(vec_.shape()),
          const_param(vec_.strides()),
          vec_.ndim() - 2,
          const_param(mat_indices.shape()),
          const_param(mat_indices.strides()),
          const_param(vec_indices.strides()),
          mat_indices.ndim());
    });
  });
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/gemms/gemv.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device.h"

namespace mlx::core::cu {

// Returns whether an (M x K) @ (K x N) product with the given transposition
// flags is eligible for the gemv kernels.
bool can_use_gemv(int M, int N, int K, bool a_transposed, bool b_transposed);

// Enqueues a matrix-vector product of |a| and |b| into |out| on |encoder|.
// When M == 1, |a| is treated as the vector and |b| as the matrix; otherwise
// |a| is the matrix. |batch_count| batches are addressed through
// |batch_shape| and the per-operand batch strides.
void gemv(
    const array& a,
    const array& b,
    array& out,
    int M,
    int N,
    int K,
    uint32_t batch_count,
    const mlx::core::Shape& batch_shape,
    const mlx::core::Strides& a_batch_strides,
    const mlx::core::Strides& b_batch_strides,
    CommandEncoder& encoder);

// Enqueues gathered matrix-vector products: each entry of |mat_indices| /
// |vec_indices| selects which batch of |mat| and |vec| to multiply, producing
// N outputs per entry with a reduction length of K.
void gather_mv(
    const array& mat,
    const array& vec,
    const array& mat_indices,
    const array& vec_indices,
    array& out,
    int N,
    int K,
    CommandEncoder& encoder);

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/gemms/grouped_gemm.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

namespace mlx::core {

namespace cu {
class CommandEncoder;
}

class array;

// Grouped matmul of |a| and |b| into |out|, split into |group_count| groups
// according to |indices|.
// NOTE(review): the implementation is not part of this header; judging by the
// name it is CUTLASS-based, and the preparation kernel in
// grouped_gemm_unaligned.cu expects |indices| to be sorted -- confirm there.
// |lda| / |ldb| are the leading dimensions of A and B.
void cutlass_grouped_gemm_unaligned(
    bool a_transposed,
    int lda,
    bool b_transposed,
    int ldb,
    int group_count,
    const array& a,
    const array& b,
    const array& indices,
    array& out,
    cu::CommandEncoder& encoder);

// Segmented matmul of |a| and |b| into |out| over |num_segments| segments
// described by |segments|.
// NOTE(review): the layout of |segments| and the exact meaning of M and N are
// not visible from this declaration -- see the definition before relying on
// them.
void cutlass_segmented_mm(
    bool a_transposed,
    int lda,
    bool b_transposed,
    int ldb,
    int num_segments,
    int M,
    int N,
    const array& a,
    const array& b,
    const array& segments,
    array& out,
    cu::CommandEncoder& encoder);

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/gemms/grouped_gemm_unaligned.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/cublas_utils.h"
#include "mlx/backend/cuda/cutlass_utils.cuh"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/gemms/grouped_gemm.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>
#include <cutlass/gemm/device/default_gemm_configuration.h>
#include <cutlass/gemm/device/gemm_grouped.h>
#include <cutlass/gemm/kernel/default_gemm_grouped.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

using ProblemSize = cutlass::gemm::GemmCoord;

namespace cu {

namespace cg = cooperative_groups;

// Builds the argument arrays of a grouped GEMM from sorted group indices.
//
// Must run as a single thread block: the cumulative histogram of |indices| is
// kept in dynamic shared memory, which needs group_count * sizeof(uint32_t)
// bytes.
//
// Inputs:
//   indices / size     - sorted group id of every row, and the number of rows.
//   group_count        - number of groups (problems) to describe.
//   K, N               - shared K and N extents of every problem.
//   lda, ldb           - leading dimensions written for every group.
//   item_size          - element size in bytes.
//   *_start            - base byte pointers of the operands.
//   *_batch_stride     - stride in elements between consecutive rows of
//                        a / out and between consecutive groups of b.
// Outputs (one entry per group):
//   problem_sizes, a_lds, b_lds, out_lds, a_ptrs, b_ptrs, out_ptrs.
template <int N_READS>
__global__ void prepare_grouped_mm_data(
    const uint32_t* indices,
    size_t size,
    int group_count,
    int K,
    int N,
    int lda,
    int ldb,
    int item_size,
    int8_t* a_start,
    int8_t* b_start,
    int8_t* out_start,
    int a_batch_stride,
    int b_batch_stride,
    int out_batch_stride,
    ProblemSize* problem_sizes,
    int64_t* a_lds,
    int64_t* b_lds,
    int64_t* out_lds,
    void** a_ptrs,
    void** b_ptrs,
    void** out_ptrs) {
  auto block = cg::this_thread_block();

  // cumsum(histogram(indices)) - offset for each group.
  extern __shared__ uint32_t cum_histo[];

  const int tid = block.thread_rank();
  const int n_threads = block.num_threads();

  // Block-strided so that every group is covered even when there are more
  // groups than threads in the block.
  for (int g = tid; g < group_count; g += n_threads) {
    cum_histo[g] = 0;
  }

  block.sync();

  // Since |indices| is sorted, the position where the element changes is the
  // cumulative histogram of the groups that end there.
  const size_t elems_per_block = static_cast<size_t>(n_threads) * N_READS;
  const size_t n_rounds = cuda::ceil_div(size, elems_per_block);
  for (size_t r = 0; r < n_rounds; ++r) {
    // TODO: Use vectorized read.
    for (int i = 0; i < N_READS; ++i) {
      const size_t pos =
          r * elems_per_block + static_cast<size_t>(tid) * N_READS + i;
      if (pos >= size) {
        break;
      }
      uint32_t elem = indices[pos];
      // The last element closes every remaining group.
      const uint32_t next = pos + 1 < size ? indices[pos + 1]
                                           : static_cast<uint32_t>(group_count);
      // Groups in [elem, next) all end right after |pos|; this also covers
      // empty groups that are skipped over by the sorted indices.
      while (elem < next) {
        cum_histo[elem] = pos + 1;
        elem++;
      }
    }
  }

  block.sync();

  for (int g = tid; g < group_count; g += n_threads) {
    // First row of this group, and how many rows it has.
    const uint32_t offset = g == 0 ? 0 : cum_histo[g - 1];
    const int delta = static_cast<int>(cum_histo[g] - offset);

    // Fill shapes.
    problem_sizes[g] = {delta, N, K};
    a_lds[g] = lda;
    b_lds[g] = ldb;
    out_lds[g] = N;

    // Fill pointers. Byte offsets are computed in 64 bits: the 32-bit products
    // overflow once an operand is larger than 2 GiB.
    const int64_t a_bytes =
        static_cast<int64_t>(offset) * item_size * a_batch_stride;
    const int64_t b_bytes =
        static_cast<int64_t>(g) * item_size * b_batch_stride;
    const int64_t out_bytes =
        static_cast<int64_t>(offset) * item_size * out_batch_stride;
    a_ptrs[g] = a_start + a_bytes;
    b_ptrs[g] = b_start + b_bytes;
    out_ptrs[g] = out_start + out_bytes;
  }
}

// Fills the grouped GEMM argument arrays for a segmented matmul, one thread
// per segment.
//
// |segments| holds (start, end) pairs along K. Segment i becomes an
// M x N x (end - start) problem (K is 0 when end <= start) whose output lives
// at element offset i * M * N of |out_start|.
__global__ void prepare_segmented_mm_data(
    const uint32_t* segments,
    int num_segments,
    int M,
    int N,
    int lda,
    int ldb,
    int item_size,
    bool a_transposed,
    bool b_transposed,
    int8_t* a_start,
    int8_t* b_start,
    int8_t* out_start,
    ProblemSize* problem_sizes,
    int64_t* a_lds,
    int64_t* b_lds,
    int64_t* out_lds,
    void** a_ptrs,
    void** b_ptrs,
    void** out_ptrs) {
  int seg = cg::this_grid().thread_rank();
  if (seg < num_segments) {
    int64_t k_begin = segments[2 * seg];
    int64_t k_end = segments[2 * seg + 1];

    // An inverted or empty range degenerates to a K = 0 problem.
    int k_len = 0;
    if (k_end > k_begin) {
      k_len = static_cast<int>(k_end - k_begin);
    }

    problem_sizes[seg] = {M, N, k_len};
    a_lds[seg] = lda;
    b_lds[seg] = ldb;
    out_lds[seg] = N;

    // Element offset of the segment start along K, which depends on layout:
    //   A [M,K]: row-major -> k_begin,       col-major -> k_begin * lda
    //   B [K,N]: row-major -> k_begin * ldb, col-major -> k_begin
    int64_t a_elems = k_begin;
    if (a_transposed) {
      a_elems = k_begin * lda;
    }
    int64_t b_elems = k_begin * ldb;
    if (b_transposed) {
      b_elems = k_begin;
    }

    a_ptrs[seg] = a_start + a_elems * item_size;
    b_ptrs[seg] = b_start + b_elems * item_size;
    out_ptrs[seg] = out_start + static_cast<int64_t>(seg) * M * N * item_size;
  }
}

} // namespace cu

namespace {

// Shared GEMM configuration for every type and arch.
// Type aliases shared by every GemmConfiguration:
//   Element          - the element type T.
//   Arch             - the architecture tag the kernel is built for.
//   Accumulator      - float when T is narrower than 4 bytes, otherwise T.
//   EpilogueOutputOp - linear combination epilogue that writes T with an
//                      access width of kAlignmentC elements.
template <typename T, typename ArchTag, int kAlignmentC>
struct CommonGemmConfiguration {
  using Element = T;
  using Arch = ArchTag;
  using Accumulator = std::conditional_t<(sizeof(T) < 4), float, T>;
  using EpilogueOutputOp = cutlass::epilogue::thread::
      LinearCombination<T, kAlignmentC, Accumulator, Accumulator>;
};

// Slow GEMM configuration as fallback.
// Primary template: SIMT (non tensor-op) configuration, selected whenever no
// specialization below matches.
//
// Note that the base is instantiated with an output alignment of 1 regardless
// of the kAlignmentC argument, and kEnableTF32 is not used here; both only
// participate in selecting a specialization.
template <
    typename T,
    typename Arch,
    int kAlignmentC = 1,
    bool kEnableTF32 = false,
    typename Enable = void>
struct GemmConfiguration : public CommonGemmConfiguration<T, Arch, 1> {
  using OpClass = cutlass::arch::OpClassSimt;
  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
  static const int kAlignmentAB = 1;
  static const int kStages = 2;
};

// Specialized GEMM configuration for sm80 and later.
// Tensor-op configuration for architectures with compute capability >= 80 and
// element types of at most 4 bytes.
//
// The kernel is always built with the Sm80 tag, whatever Arch was requested.
// The instruction shape's K extent is 32 / sizeof(T).
// NOTE(review): the fourth argument is fixed to `true`, so this specialization
// is only chosen when kEnableTF32 is true — also for 2-byte element types.
// With kEnableTF32 == false every type falls back to the SIMT configuration;
// confirm that this is intended.
template <typename T, typename Arch, int kAlignmentC>
struct GemmConfiguration<
    T,
    Arch,
    kAlignmentC,
    true,
    std::enable_if_t<Arch::kMinComputeCapability >= 80 && sizeof(T) <= 4>>
    : public CommonGemmConfiguration<T, cutlass::arch::Sm80, kAlignmentC> {
  using OpClass = cutlass::arch::OpClassTensorOp;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32 / sizeof(T)>;
  static const int kAlignmentAB = 1;
  static const int kStages = 2;
};

// Specialized GEMM configuration for tf32 on sm80.
// Tensor-op configuration for float on exactly the Sm80 tag with TF32 enabled.
// Same tile shapes as the generic sm80 configuration, but with three pipeline
// stages instead of two.
template <int kAlignmentC>
struct GemmConfiguration<float, cutlass::arch::Sm80, kAlignmentC, true>
    : public CommonGemmConfiguration<float, cutlass::arch::Sm80, kAlignmentC> {
  using OpClass = cutlass::arch::OpClassTensorOp;
  using ThreadblockShape = cutlass::gemm::GemmShape<256, 128, 32>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
  static const int kAlignmentAB = 1;
  static const int kStages = 3; // use SM80_CP_ASYNC
};

// Get direct access to kernel.
// Derives from the CUTLASS device-level grouped GEMM to get direct access to
// its kernel parameters, so that the kernel can be recorded through the MLX
// command encoder.
template <typename GemmKernel>
class GemmGroupedEncoder
    : public cutlass::gemm::device::GemmGrouped<GemmKernel> {
 public:
  // Adds the grouped GEMM kernel to |encoder|. The grid is
  // params_.threadblock_count blocks of GemmKernel::kThreadCount threads, with
  // sizeof(GemmKernel::SharedStorage) bytes of dynamic shared memory, and
  // params_ is passed as the only kernel argument.
  // NOTE(review): params_ is presumably filled in by initialize() on the base
  // class, so that must be called first — confirm against CUTLASS.
  void encode(cu::CommandEncoder& encoder) {
    encoder.add_kernel_node_ex(
        cutlass::Kernel<GemmKernel>,
        {static_cast<uint32_t>(this->params_.threadblock_count), 1, 1},
        {GemmKernel::kThreadCount, 1, 1},
        {},
        sizeof(typename GemmKernel::SharedStorage),
        this->params_);
  }
};

// Invoke the grouped GEMM of CUTLASS 2.x API, which supports small alignments.
// Invoke the grouped GEMM of CUTLASS 2.x API, which supports small alignments.
//
// GemmConfiguration supplies the element/accumulator types, op class, arch
// tag, tile shapes, epilogue and stage count. The runtime transposition flags
// are turned into compile-time layouts (column-major when transposed,
// row-major otherwise); the output is always row-major.
//
// problem_sizes, a_lds, b_lds, out_lds and the *_ptrs arrays each hold
// group_count entries and are handed to CUTLASS as-is.
template <typename GemmConfiguration>
void grouped_gemm_v2(
    bool a_transposed,
    bool b_transposed,
    int group_count,
    ProblemSize* problem_sizes,
    int64_t* a_lds,
    int64_t* b_lds,
    int64_t* out_lds,
    void* a_ptrs,
    void* b_ptrs,
    void* out_ptrs,
    cu::CommandEncoder& encoder) {
  dispatch_bool(a_transposed, [&](auto a_transposed_tag) {
    dispatch_bool(b_transposed, [&](auto b_transposed_tag) {
      using LayoutA = std::conditional_t<
          a_transposed_tag.value,
          cutlass::layout::ColumnMajor,
          cutlass::layout::RowMajor>;
      using LayoutB = std::conditional_t<
          b_transposed_tag.value,
          cutlass::layout::ColumnMajor,
          cutlass::layout::RowMajor>;
      using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
          typename GemmConfiguration::Element,
          LayoutA,
          cutlass::ComplexTransform::kNone,
          GemmConfiguration::kAlignmentAB,
          typename GemmConfiguration::Element,
          LayoutB,
          cutlass::ComplexTransform::kNone,
          GemmConfiguration::kAlignmentAB,
          typename GemmConfiguration::Element,
          cutlass::layout::RowMajor,
          typename GemmConfiguration::Accumulator,
          typename GemmConfiguration::OpClass,
          typename GemmConfiguration::Arch,
          typename GemmConfiguration::ThreadblockShape,
          typename GemmConfiguration::WarpShape,
          typename GemmConfiguration::InstructionShape,
          typename GemmConfiguration::EpilogueOutputOp,
          cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
          GemmConfiguration::kStages>::GemmKernel;
      using GemmGrouped = GemmGroupedEncoder<GemmKernel>;

      // Function-local static: queried once per kernel instantiation and
      // reused by every later call.
      static int threadblock_count = GemmGrouped::sufficient();
      // out_ptrs / out_lds are passed twice.
      // NOTE(review): presumably once as the C (source) and once as the D
      // (destination) operand; with beta = 0 the source would not contribute —
      // confirm against GemmGrouped::Arguments.
      typename GemmGrouped::Arguments args(
          problem_sizes,
          group_count,
          threadblock_count,
          {/* alpha */ 1, /* beta */ 0},
          reinterpret_cast<typename GemmGrouped::ElementA**>(a_ptrs),
          reinterpret_cast<typename GemmGrouped::ElementB**>(b_ptrs),
          reinterpret_cast<typename GemmGrouped::ElementC**>(out_ptrs),
          reinterpret_cast<typename GemmGrouped::ElementC**>(out_ptrs),
          a_lds,
          b_lds,
          out_lds,
          out_lds);

      // Initialize the CUTLASS parameters, then record the kernel through the
      // encoder instead of letting CUTLASS launch it.
      GemmGrouped gemm;
      CHECK_CUTLASS_ERROR(gemm.initialize(
          args,
          allocate_workspace(encoder, gemm.get_workspace_size(args)),
          encoder.stream()));
      gemm.encode(encoder);
    });
  });
}

// Calls |f| with a type_identity tag of the CUTLASS architecture that matches
// the major compute capability of |device|:
//   below 8 -> Sm75, exactly 8 -> Sm80, above 8 -> Sm90.
template <typename F>
void dispatch_cutlass_arch(cu::Device& device, F&& f) {
  const auto major = device.compute_capability_major();
  if (major > 8) {
    f(type_identity<cutlass::arch::Sm90>{});
    return;
  }
  if (major == 8) {
    f(type_identity<cutlass::arch::Sm80>{});
    return;
  }
  f(type_identity<cutlass::arch::Sm75>{});
}

// Selects the grouped_gemm_v2 instantiation for the given element type, the
// compute capability of |device|, the output alignment and the TF32 setting.
//
// The output alignment is 8 elements when N is a multiple of 8 and 1
// otherwise. TF32 is taken from env::enable_tf32().
auto* get_grouped_mm_funcion(Dtype dtype, int N, cu::Device& device) {
  // Initial value only fixes the function pointer type; it is overwritten by
  // the dispatch below when |dtype| is one of the dispatched float types.
  auto* fun = grouped_gemm_v2<GemmConfiguration<float, cutlass::arch::Sm75>>;
  dispatch_float_types(dtype, "grouped_gemm_v2", [&](auto type_tag) {
    using DataType = cutlass_type_t<MLX_GET_TYPE(type_tag)>;
    dispatch_cutlass_arch(device, [&](auto arch_tag) {
      using Arch = MLX_GET_TYPE(arch_tag);
      dispatch_bool(N % 8 == 0, [&](auto is_out_aligned) {
        constexpr int kAlignmentC = is_out_aligned ? 8 : 1;
        dispatch_bool(env::enable_tf32(), [&](auto kEnableTF32) {
          fun = grouped_gemm_v2<
              GemmConfiguration<DataType, Arch, kAlignmentC, kEnableTF32>>;
        });
      });
    });
  });
  return fun;
}

} // namespace

// Runs a grouped GEMM whose groups are described by the sorted |indices|.
//
// Two kernels are recorded on |encoder|:
//   1. prepare_grouped_mm_data, which derives the per-group problem sizes,
//      leading dimensions and operand pointers from |indices|;
//   2. the CUTLASS grouped GEMM that consumes those arrays.
// K is a.shape(-1) and N is b.shape(-1).
void cutlass_grouped_gemm_unaligned(
    bool a_transposed,
    int lda,
    bool b_transposed,
    int ldb,
    int group_count,
    const array& a,
    const array& b,
    const array& indices,
    array& out,
    cu::CommandEncoder& encoder) {
  int K = a.shape(-1);
  int N = b.shape(-1);

  // Prepare device pointers for matmul.
  // A single temporary buffer holds, in this order: the problem sizes (each
  // padded to a multiple of 8 bytes), three int64 leading-dimension arrays and
  // three pointer arrays. The total is rounded up to a multiple of 256 bytes.
  int problem_sizes_nbytes =
      group_count * cuda::ceil_div(sizeof(ProblemSize), 8) * 8;
  int nbytes = problem_sizes_nbytes +
      group_count * (3 * sizeof(void*) + 3 * sizeof(int64_t));
  nbytes = cuda::ceil_div(nbytes, 256) * 256;
  array gemm_args(cu::malloc_async(nbytes, encoder), {nbytes}, int8);
  encoder.add_temporary(gemm_args);

  // Carve the sub-arrays out of the buffer.
  ProblemSize* problem_sizes = gpu_ptr<ProblemSize>(gemm_args);
  int64_t* a_lds = gpu_ptr<int64_t>(gemm_args) + problem_sizes_nbytes / 8;
  int64_t* b_lds = a_lds + group_count;
  int64_t* out_lds = b_lds + group_count;
  void** a_ptrs = reinterpret_cast<void**>(out_lds + group_count);
  void** b_ptrs = a_ptrs + group_count;
  void** out_ptrs = b_ptrs + group_count;

  // Fill the pointers by computing offsets from indices.
  // One block only: enough threads to read N_READS indices each and at least
  // one thread per group, capped at 1024.
  constexpr int N_READS = 4;
  int n_threads = cuda::ceil_div(indices.size(), N_READS);
  n_threads = group_count < n_threads ? n_threads : group_count;
  dim3 block_dims(std::min(n_threads, 1024));
  dim3 num_blocks(1);

  encoder.set_input_array(indices);
  encoder.set_output_array(gemm_args);
  encoder.add_kernel_node_ex(
      cu::prepare_grouped_mm_data<N_READS>,
      num_blocks,
      block_dims,
      {},
      group_count * sizeof(uint32_t), // sizeof(cum_histo)
      gpu_ptr<uint32_t>(indices),
      indices.size(),
      group_count,
      K,
      N,
      lda,
      ldb,
      out.itemsize(),
      gpu_ptr<int8_t>(a),
      gpu_ptr<int8_t>(b),
      gpu_ptr<int8_t>(out),
      a.shape(-2) * a.shape(-1), // a_batch_stride
      b.shape(-2) * b.shape(-1), // b_batch_stride
      out.shape(-2) * out.shape(-1), // out_batch_stride
      problem_sizes,
      a_lds,
      b_lds,
      out_lds,
      a_ptrs,
      b_ptrs,
      out_ptrs);

  // Invoke grouped GEMM.
  // gemm_args is registered as an input here, after being the output of the
  // preparation kernel above.
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(gemm_args);
  encoder.set_output_array(out);
  auto* fun = get_grouped_mm_funcion(a.dtype(), N, encoder.device());
  fun(a_transposed,
      b_transposed,
      group_count,
      problem_sizes,
      a_lds,
      b_lds,
      out_lds,
      a_ptrs,
      b_ptrs,
      out_ptrs,
      encoder);
}

// Runs one M x N x K_i GEMM per segment as a single CUTLASS grouped GEMM.
//
// |segments| holds |num_segments| (start, end) pairs along K. Two kernels are
// recorded on |encoder|:
//   1. prepare_segmented_mm_data, which turns the segments into per-problem
//      sizes, leading dimensions and operand pointers;
//   2. the CUTLASS grouped GEMM that consumes those arrays.
// Nothing is recorded when there are no segments.
void cutlass_segmented_mm(
    bool a_transposed,
    int lda,
    bool b_transposed,
    int ldb,
    int num_segments,
    int M,
    int N,
    const array& a,
    const array& b,
    const array& segments,
    array& out,
    cu::CommandEncoder& encoder) {
  // No segments means no problems to run. Without this guard the launch
  // configuration below would use a block size of 0 and divide by it.
  if (num_segments <= 0) {
    return;
  }

  // Allocate grouped GEMM args on device.
  // A single temporary buffer holds, in this order: the problem sizes (each
  // padded to a multiple of 8 bytes), three int64 leading-dimension arrays and
  // three pointer arrays. The total is rounded up to a multiple of 256 bytes.
  int problem_sizes_nbytes =
      num_segments * cuda::ceil_div(sizeof(ProblemSize), 8) * 8;
  int nbytes = problem_sizes_nbytes +
      num_segments * (3 * sizeof(void*) + 3 * sizeof(int64_t));
  nbytes = cuda::ceil_div(nbytes, 256) * 256;
  array gemm_args(cu::malloc_async(nbytes, encoder), {nbytes}, int8);
  encoder.add_temporary(gemm_args);

  // Carve the sub-arrays out of the buffer.
  ProblemSize* problem_sizes = gpu_ptr<ProblemSize>(gemm_args);
  int64_t* a_lds = gpu_ptr<int64_t>(gemm_args) + problem_sizes_nbytes / 8;
  int64_t* b_lds = a_lds + num_segments;
  int64_t* out_lds = b_lds + num_segments;
  void** a_ptrs = reinterpret_cast<void**>(out_lds + num_segments);
  void** b_ptrs = a_ptrs + num_segments;
  void** out_ptrs = b_ptrs + num_segments;

  // Build problem descriptions from segments on the GPU, one thread per
  // segment in blocks of at most 256 threads.
  int block_size = std::min(num_segments, 256);
  int num_blocks = cuda::ceil_div(num_segments, block_size);

  encoder.set_input_array(segments);
  encoder.set_output_array(gemm_args);
  encoder.add_kernel_node_ex(
      cu::prepare_segmented_mm_data,
      dim3(num_blocks),
      dim3(block_size),
      {},
      0,
      gpu_ptr<uint32_t>(segments),
      num_segments,
      M,
      N,
      static_cast<int>(lda),
      static_cast<int>(ldb),
      static_cast<int>(out.itemsize()),
      a_transposed,
      b_transposed,
      gpu_ptr<int8_t>(a),
      gpu_ptr<int8_t>(b),
      gpu_ptr<int8_t>(out),
      problem_sizes,
      a_lds,
      b_lds,
      out_lds,
      a_ptrs,
      b_ptrs,
      out_ptrs);

  // Dispatch grouped GEMM.
  // gemm_args is registered as an input here, after being the output of the
  // preparation kernel above.
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(gemm_args);
  encoder.set_output_array(out);
  auto* fun = get_grouped_mm_funcion(a.dtype(), N, encoder.device());
  fun(a_transposed,
      b_transposed,
      num_segments,
      problem_sizes,
      a_lds,
      b_lds,
      out_lds,
      a_ptrs,
      b_ptrs,
      out_ptrs,
      encoder);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/hadamard.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/hadamard.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/jit_module.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <fmt/format.h>
#include <nvtx3/nvtx3.hpp>

#include <algorithm>
#include <cassert>
#include <sstream>
#include <stdexcept>
#include <string_view>

namespace mlx::core {

namespace {

// Upper bound on the block size used when launching the hadamard_m kernel.
constexpr int MAX_HADAMARD_THREADS_PER_BLOCK = 256;

// Generates the CUDA source of `hadamard_radix_m`, a device function that
// applies the m x m Hadamard matrix to the m floats in `x`, in place.
//
// For m == 1 the generated function has an empty body. Otherwise the matrix is
// looked up in hadamard_matrices(); every row of its text form becomes one
// signed sum stored in a temporary, which is copied back to `x` at the end.
// Throws std::runtime_error when no matrix is registered for |m|.
std::string gen_hadamard_codelet(int m) {
  std::ostringstream source;
  source << "namespace mlx::core::cu {\n"
         << "__device__ __forceinline__ void hadamard_radix_m(float* x) {\n";

  if (m != 1) {
    auto h_matrices = hadamard_matrices();
    auto entry = h_matrices.find(m);
    if (entry == h_matrices.end()) {
      throw std::runtime_error("[hadamard] Invalid radix m.");
    }
    auto& matrix = entry->second;

    source << "  float tmp[" << m << "];\n";

    // Rows are newline terminated; scanning starts at offset 1, skipping the
    // first character of the matrix text.
    auto row_begin = 1;
    int row_idx = 0;
    for (auto row_end = matrix.find('\n', row_begin);
         row_end != std::string_view::npos;
         row_end = matrix.find('\n', row_begin)) {
      auto row = matrix.substr(row_begin, row_end - row_begin);
      source << "  tmp[" << row_idx << "] =";
      // Each character of the row is emitted as the sign of x[col].
      for (int col = 0; col < row.length(); ++col) {
        source << " " << row[col] << " x[" << col << "]";
      }
      source << ";\n";
      row_begin = row_end + 1;
      ++row_idx;
    }

    source << "  #pragma unroll\n";
    source << "  for (int i = 0; i < " << m << "; ++i) { x[i] = tmp[i]; }\n";
  }

  source << "}\n"
         << "} // namespace mlx::core::cu\n";
  return source.str();
}

// Returns the fully qualified name of the `hadamard_n` kernel instantiation
// whose template arguments are, in order: the CUDA element type of |dtype|,
// |n|, |max_radix|, |read_width| and |stride|.
std::string hadamard_n_kernel_name(
    const Dtype& dtype,
    int n,
    int max_radix,
    int read_width,
    int stride) {
  return fmt::format(
      "mlx::core::cu::hadamard_n<{}, {}, {}, {}, {}>",
      dtype_to_cuda_type(dtype),
      n,
      max_radix,
      read_width,
      stride);
}

// Returns the fully qualified name of the `hadamard_m` kernel instantiation
// whose template arguments are, in order: the CUDA element type of |dtype|,
// |n|, |m| and |read_width|.
std::string
hadamard_m_kernel_name(const Dtype& dtype, int n, int m, int read_width) {
  return fmt::format(
      "mlx::core::cu::hadamard_m<{}, {}, {}, {}>",
      dtype_to_cuda_type(dtype),
      n,
      m,
      read_width);
}

// Records the kernels of a Hadamard transform of size m * n1 * n2 on stream
// |s|, reading from |x| and writing to |y| (which may be the same array).
//
// Up to three kernels from one JIT module are launched, in this order:
//   1. hadamard_n for n1 with stride n2, x -> y     (only when n1 > 1)
//   2. hadamard_n for n2 with stride 1, (y if step 1 ran, else x) -> y
//   3. hadamard_m for the radix-m part, in place on y   (only when m > 1)
// |scale| is passed to exactly one of them: to step 3 when m > 1, otherwise
// to step 2. The other steps get a scale of 1.
void hadamard_mn_contiguous(
    const array& x,
    array& y,
    int m,
    int n1,
    int n2,
    float scale,
    const Stream& s) {
  const int n = n1 * n2;
  // Read widths and radices are template arguments of the kernels.
  const int read_width_n1 = (n1 == 2) ? 2 : 4;
  const int read_width_n2 = (n2 == 2) ? 2 : 4;
  const int read_width_m = (n == 2 || m == 28) ? 2 : 4;
  const int max_radix_1 = std::min(n1, 16);
  const int max_radix_2 = std::min(n2, 16);
  const float scale_n1 = 1.0f;
  const float scale_n2 = (m == 1) ? scale : 1.0f;
  const float scale_m = scale;

  const std::string n1_kernel_name =
      hadamard_n_kernel_name(x.dtype(), n1, max_radix_1, read_width_n1, n2);
  const std::string n2_kernel_name =
      hadamard_n_kernel_name(x.dtype(), n2, max_radix_2, read_width_n2, 1);
  const std::string m_kernel_name =
      hadamard_m_kernel_name(x.dtype(), n, m, read_width_m);

  // The module name encodes everything the kernel names depend on, except the
  // radices, which are derived from n1 and n2.
  const std::string module_name = fmt::format(
      "hadamard_{}_{}_{}_{}_{}_{}_{}_{}",
      dtype_to_string(x.dtype()),
      n,
      m,
      n1,
      n2,
      read_width_n1,
      read_width_n2,
      read_width_m);

  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    // Only the kernels that will actually be launched are requested.
    std::vector<std::string> kernel_names = {n2_kernel_name};
    if (n1 > 1) {
      kernel_names.push_back(n1_kernel_name);
    }
    if (m > 1) {
      kernel_names.push_back(m_kernel_name);
    }

    // The generated radix-m codelet is placed between the two includes, i.e.
    // before hadamard.cuh.
    std::string source = R"(
        #include "mlx/backend/cuda/device/utils.cuh"
    )";
    source += gen_hadamard_codelet(m);
    source += R"(
        #include "mlx/backend/cuda/device/hadamard.cuh"
    )";

    return std::make_tuple(false, std::move(source), std::move(kernel_names));
  });

  auto& encoder = cu::get_command_encoder(s);

  // Step 1: transform of size n1, x -> y.
  if (n1 > 1) {
    const int64_t num_transforms = x.size() / n1;
    // The grid is capped at 65535 blocks; the full count is still passed to
    // the kernel as an argument.
    const uint32_t num_blocks =
        static_cast<uint32_t>(std::min<int64_t>(num_transforms, 65535));

    encoder.set_input_array(x);
    encoder.set_output_array(y);

    cu::KernelArgs args;
    args.append(x);
    args.append(y);
    args.append(scale_n1);
    args.append(num_transforms);

    auto kernel = mod.get_kernel(n1_kernel_name);
    encoder.add_kernel_node_raw(
        kernel, num_blocks, n1 / max_radix_1, {}, 0, args.args());
  }

  // Step 2: transform of size n2. Reads the result of step 1 when it ran.
  {
    const auto& in = (n1 > 1) ? y : x;
    const int64_t num_transforms = x.size() / n2;
    const uint32_t num_blocks =
        static_cast<uint32_t>(std::min<int64_t>(num_transforms, 65535));

    encoder.set_input_array(in);
    encoder.set_output_array(y);

    cu::KernelArgs args;
    args.append(in);
    args.append(y);
    args.append(scale_n2);
    args.append(num_transforms);

    auto kernel = mod.get_kernel(n2_kernel_name);
    encoder.add_kernel_node_raw(
        kernel, num_blocks, n2 / max_radix_2, {}, 0, args.args());
  }

  // Step 3: radix-m part, in place on y.
  if (m > 1) {
    const int64_t num_tasks = x.size() / (m * read_width_m);
    const uint32_t block_dim = static_cast<uint32_t>(
        std::min<int64_t>(num_tasks, MAX_HADAMARD_THREADS_PER_BLOCK));
    // NOTE(review): block_dim is 0 when num_tasks is 0, which makes the
    // division below divide by zero — confirm callers never reach this with
    // x.size() < m * read_width_m.
    const uint32_t num_blocks = static_cast<uint32_t>(
        std::min<int64_t>((num_tasks + block_dim - 1) / block_dim, 65535));

    encoder.set_input_array(y);
    encoder.set_output_array(y);

    cu::KernelArgs args;
    args.append(y);
    args.append(y);
    args.append(scale_m);
    args.append(num_tasks);

    auto kernel = mod.get_kernel(m_kernel_name);
    encoder.add_kernel_node_raw(
        kernel, num_blocks, block_dim, {}, 0, args.args());
  }
}

} // namespace

// Evaluates the Hadamard transform of the single input along its last axis.
//
// Only float16, bfloat16 and float32 are accepted; anything else throws
// std::invalid_argument. Row-contiguous inputs are transformed directly (into
// the donated input buffer when possible); other inputs are first copied into
// |out| and then transformed in place.
void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Hadamard::eval_gpu");
  assert(inputs.size() == 1);

  auto& in = inputs[0];
  const auto dtype = in.dtype();
  if (dtype != float16 && dtype != bfloat16 && dtype != float32) {
    throw std::invalid_argument("[hadamard] Unsupported type.");
  }

  // n = m * 2^k where m in (1, 12, 20, 28)
  auto [n, m] = decompose_hadamard(in.shape().back());

  // Sizes above 8192 are split into two passes n = n1 * n2, where n2 is the
  // smallest power of two with n2 * n2 >= n.
  int n1 = 1;
  int n2 = n;
  if (n > 8192) {
    n2 = 2;
    while (n2 * n2 < n) {
      n2 *= 2;
    }
    n1 = n / n2;
  }

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  if (!in.flags().row_contiguous) {
    // Make a contiguous copy in the output and transform it in place.
    copy_gpu(in, out, CopyType::General, s);
    hadamard_mn_contiguous(out, out, m, n1, n2, scale_, s);
    return;
  }

  if (in.is_donatable()) {
    out.copy_shared_buffer(in);
  } else {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
  }
  hadamard_mn_contiguous(in, out, m, n1, n2, scale_, s);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/indexing.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/slicing.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/jit_module.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/scan.h"
#include "mlx/backend/gpu/slicing.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include "cuda_jit_sources.h"

#include <cuda.h>
#include <fmt/format.h>
#include <nvrtc.h>
#include <nvtx3/nvtx3.hpp>

#include <cassert>
#include <numeric>

namespace mlx::core {

namespace {

// Reduction names, indexed by the scatter reduce type. The entry is appended
// to "mlx::core::cu::Scatter" to form the reduction functor of the kernel.
constexpr const char* g_scatter_ops[] = {"Max", "Min", "Sum", "Prod", "Assign"};
// NOTE(review): presumably the binary op names for the slice-update kernels,
// in the same order as g_scatter_ops (empty for plain assignment) — the use
// site is not visible here, confirm.
constexpr const char* g_slice_ops[] =
    {"Maximum", "Minimum", "Add", "Multiply", ""};

// Appends the three kernel arguments that describe the index arrays, which
// are inputs[1] .. inputs[nidx]:
//   1. their device pointers,
//   2. the first |idx_ndim| shape entries of each, concatenated,
//   3. the first |idx_ndim| stride entries of each, concatenated.
void append_indices_arg(
    cu::KernelArgs& args,
    const std::vector<array>& inputs,
    int nidx,
    int idx_ndim) {
  SmallVector<const void*> ptrs(nidx);
  SmallVector<int32_t> shapes(nidx * idx_ndim);
  SmallVector<int64_t> strides(nidx * idx_ndim);

  for (int i = 0; i < nidx; ++i) {
    const auto& index = inputs[i + 1];
    const auto slot = i * idx_ndim;
    ptrs[i] = gpu_ptr<void>(index);
    std::copy_n(index.shape().begin(), idx_ndim, shapes.data() + slot);
    std::copy_n(index.strides().begin(), idx_ndim, strides.data() + slot);
  }

  // The append order is the kernel's parameter order.
  args.append(std::move(ptrs));
  args.append(std::move(shapes));
  args.append(std::move(strides));
}

} // namespace

// Evaluates a gather: inputs[0] is the source, every further input is an
// index array. The output is allocated here and filled by a JIT-compiled
// `gather` kernel.
void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Gather::eval_gpu");
  assert(inputs.size() > 0);
  const auto& src = inputs[0];

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  // Nothing to launch for an empty output.
  if (out.size() == 0) {
    return;
  }

  // The dtype and rank of the first index array stand in for all of them;
  // without index arrays int32 and rank 0 are used.
  int nidx = inputs.size() - 1;
  Dtype idx_dtype = nidx > 0 ? inputs[1].dtype() : int32;
  int32_t idx_ndim = nidx > 0 ? inputs[1].ndim() : 0;

  // 64-bit indexing is selected as soon as any involved array has more than
  // INT32_MAX elements.
  bool large = (nidx > 0 && inputs[1].size() > INT32_MAX) ||
      (src.size() > INT32_MAX) || (out.size() > INT32_MAX);

  // Number of elements in one gathered slice.
  uint32_t slice_size = std::accumulate(
      slice_sizes_.begin(), slice_sizes_.end(), 1, std::multiplies<uint32_t>());

  std::string module_name = fmt::format(
      "gather_{}_{}_{}",
      dtype_to_string(out.dtype()),
      dtype_to_string(idx_dtype),
      nidx);

  // One module per (dtype, index dtype, nidx); it contains the kernels for
  // every index rank up to MAX_NDIM and for both index widths.
  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    std::vector<std::string> kernel_names;
    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
      for (int large = 0; large <= 1; ++large) {
        kernel_names.push_back(
            fmt::format(
                "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
                dtype_to_cuda_type(out.dtype()),
                dtype_to_cuda_type(idx_dtype),
                nidx,
                ndim,
                large ? "int64_t" : "int32_t"));
      }
    }
    return std::make_tuple(false, jit_source_gather, std::move(kernel_names));
  });

  // The append order is the kernel's parameter order. The output size is
  // passed with the same width as the kernel's index type.
  cu::KernelArgs args;
  args.append(src);
  args.append(out);
  if (large) {
    args.append<int64_t>(out.size());
  } else {
    args.append<int32_t>(out.size());
  }
  args.append_ndim(src.shape());
  args.append_ndim(src.strides());
  args.append<int32_t>(src.ndim());
  args.append_ndim(slice_sizes_);
  args.append(slice_size);
  args.append(axes_);
  append_indices_arg(args, inputs, nidx, idx_ndim);

  // Must match one of the names registered with the module above.
  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather<{}, {}, {}, {}, {}>",
      dtype_to_cuda_type(out.dtype()),
      dtype_to_cuda_type(idx_dtype),
      nidx,
      idx_ndim,
      large ? "int64_t" : "int32_t");

  for (const auto& in : inputs) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);

  // The launch configuration is derived from the output array.
  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] = get_launch_args(out, large);
  encoder.add_kernel_node_raw(
      kernel, num_blocks, block_dims, {}, 0, args.args());
}

// Evaluates a scatter: inputs[0] is the source, inputs.back() holds the
// updates and the inputs in between are the index arrays.
//
// The source is first copied into |out|; a JIT-compiled `scatter` kernel then
// applies the updates to |out| with the reduction selected by reduce_type_.
void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Scatter::eval_gpu");
  assert(inputs.size() > 1);
  auto& upd = inputs.back();

  // Copy src into out.
  CopyType copy_type;
  if (inputs[0].data_size() == 1) {
    copy_type = CopyType::Scalar;
  } else if (inputs[0].flags().row_contiguous) {
    copy_type = CopyType::Vector;
  } else {
    copy_type = CopyType::General;
  }
  copy_gpu(inputs[0], out, copy_type);

  // Empty update.
  if (upd.size() == 0) {
    return;
  }

  // The dtype and rank of the first index array stand in for all of them;
  // without index arrays int32 and rank 0 are used.
  int nidx = axes_.size();
  Dtype idx_dtype = nidx > 0 ? inputs[1].dtype() : int32;
  int32_t idx_ndim = nidx > 0 ? inputs[1].ndim() : 0;

  // 64-bit indexing is selected as soon as any involved array has more than
  // INT32_MAX elements.
  bool large = (nidx > 0 && inputs[1].size() > INT32_MAX) ||
      (upd.size() > INT32_MAX) || (out.size() > INT32_MAX);

  // Number of update elements per index, i.e. the product of the update
  // dimensions that follow the index dimensions. Accumulated in 64 bits so
  // that the value handed to the int64_t kernels is not truncated.
  int64_t upd_post_idx_size = std::accumulate(
      upd.shape().begin() + idx_ndim,
      upd.shape().end(),
      int64_t{1},
      std::multiplies<int64_t>());

  const char* op = g_scatter_ops[reduce_type_];
  std::string module_name = fmt::format(
      "scatter_{}_{}_{}_{}",
      dtype_to_string(out.dtype()),
      dtype_to_string(idx_dtype),
      op,
      nidx);

  // One module per (dtype, index dtype, reduction, nidx); it contains the
  // kernels for every index rank up to MAX_NDIM and for both index widths.
  auto& s = stream();
  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    std::vector<std::string> kernel_names;
    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
      for (int large = 0; large <= 1; ++large) {
        kernel_names.push_back(
            fmt::format(
                "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
                dtype_to_cuda_type(out.dtype()),
                dtype_to_cuda_type(idx_dtype),
                op,
                nidx,
                ndim,
                large ? "int64_t" : "int32_t"));
      }
    }
    return std::make_tuple(false, jit_source_scatter, std::move(kernel_names));
  });

  // The append order is the kernel's parameter order. Sizes are passed with
  // the same width as the kernel's index type.
  cu::KernelArgs args;
  args.append(upd);
  args.append(out);
  if (large) {
    args.append<int64_t>(upd.size());
  } else {
    args.append<int32_t>(upd.size());
  }
  args.append_ndim(upd.shape());
  args.append_ndim(upd.strides());
  args.append<int32_t>(upd.ndim());
  if (large) {
    args.append<int64_t>(upd_post_idx_size);
  } else {
    // Fits: the whole update has at most INT32_MAX elements on this path.
    args.append<int32_t>(static_cast<int32_t>(upd_post_idx_size));
  }
  args.append_ndim(out.shape());
  args.append_ndim(out.strides());
  args.append<int32_t>(out.ndim());
  args.append(axes_);
  append_indices_arg(args, inputs, nidx, idx_ndim);

  // Must match one of the names registered with the module above.
  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}>",
      dtype_to_cuda_type(out.dtype()),
      dtype_to_cuda_type(idx_dtype),
      op,
      nidx,
      idx_ndim,
      large ? "int64_t" : "int32_t");

  auto& encoder = cu::get_command_encoder(s);
  for (const auto& in : inputs) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
  // The launch configuration is derived from the update array.
  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] = get_launch_args(upd, large);
  encoder.add_kernel_node_raw(
      kernel, num_blocks, block_dims, {}, 0, args.args());
}

// Evaluates a gather along a single axis: inputs[0] is the source and
// inputs[1] the index array. The output is allocated here and filled by a
// JIT-compiled `gather_axis` kernel.
void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("GatherAxis::eval_gpu");
  assert(inputs.size() > 1);
  const auto& src = inputs[0];
  const auto& idx = inputs[1];

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  // Nothing to launch for an empty output.
  if (out.size() == 0) {
    return;
  }

  // 64-bit indexing is selected when the index or the source array has more
  // than INT32_MAX elements.
  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;

  std::string module_name = fmt::format(
      "gather_axis_{}_{}",
      dtype_to_string(out.dtype()),
      dtype_to_string(idx.dtype()));

  // One module per (dtype, index dtype); it contains the kernels for every
  // rank up to MAX_NDIM, every combination of the two contiguity flags and
  // both index widths.
  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    std::vector<std::string> kernel_names;
    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
      // Bit 0 and bit 1 of |contiguous| are the two boolean template
      // arguments.
      for (int contiguous = 0; contiguous < 4; ++contiguous) {
        for (int large = 0; large <= 1; ++large) {
          kernel_names.push_back(
              fmt::format(
                  "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
                  dtype_to_cuda_type(out.dtype()),
                  dtype_to_cuda_type(idx.dtype()),
                  ndim,
                  contiguous & 1 ? true : false,
                  contiguous & 2 ? true : false,
                  large ? "int64_t" : "int32_t"));
        }
      }
    }
    return std::make_tuple(
        false, jit_source_gather_axis, std::move(kernel_names));
  });

  // Split the index shape around the axis: product of the dimensions before
  // it, the axis extent itself, and product of the dimensions after it.
  size_t idx_size_pre = 1;
  size_t idx_size_post = 1;
  for (int i = 0; i < axis_; ++i) {
    idx_size_pre *= idx.shape(i);
  }
  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
    idx_size_post *= idx.shape(i);
  }
  size_t idx_size_axis = idx.shape(axis_);

  // The append order is the kernel's parameter order. Sizes are passed with
  // the same width as the kernel's index type.
  cu::KernelArgs args;
  args.append(src);
  args.append(idx);
  args.append(out);
  if (large) {
    args.append<int64_t>(idx_size_pre);
    args.append<int64_t>(idx_size_axis);
    args.append<int64_t>(idx_size_post);
  } else {
    args.append<int32_t>(idx_size_pre);
    args.append<int32_t>(idx_size_axis);
    args.append<int32_t>(idx_size_post);
  }
  // Shape and strides with the gather axis removed, followed by the axis
  // itself and the extent/strides along it.
  args.append(remove_index(idx.shape(), axis_));
  args.append(remove_index(src.strides(), axis_));
  args.append(remove_index(idx.strides(), axis_));
  args.append<int32_t>(axis_);
  args.append(src.shape(axis_));
  args.append(src.strides(axis_));
  args.append(idx.strides(axis_));

  // Must match one of the names registered with the module above. The rank
  // template argument is the source rank without the gather axis, and the two
  // flags are the row-contiguity of the source and of the index array.
  std::string kernel_name = fmt::format(
      "mlx::core::cu::gather_axis<{}, {}, {}, {}, {}, {}>",
      dtype_to_cuda_type(out.dtype()),
      dtype_to_cuda_type(idx.dtype()),
      src.ndim() - 1,
      src.flags().row_contiguous,
      idx.flags().row_contiguous,
      large ? "int64_t" : "int32_t");

  for (const auto& in : inputs) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
  // The launch configuration is derived from the index array.
  auto kernel = mod.get_kernel(kernel_name);
  auto [num_blocks, block_dims] = get_launch_args(idx, large);
  encoder.add_kernel_node_raw(
      kernel, num_blocks, block_dims, {}, 0, args.args());
}

// Evaluates ScatterAxis on the GPU.
//
// inputs[0] is the source array that is first copied into |out|, inputs[1]
// holds the indices and inputs[2] the updates. After the copy, a JIT-compiled
// `scatter_axis` kernel is launched with the updates, the indices, |out|,
// |axis_| and the reduction selected by |reduce_type_| ("Sum" or "Assign").
void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("ScatterAxis::eval_gpu");
  assert(inputs.size() > 2);
  const auto& src = inputs[0];
  const auto& idx = inputs[1];
  const auto& upd = inputs[2];

  // Copy src into out.
  CopyType copy_type;
  if (src.data_size() == 1) {
    copy_type = CopyType::Scalar;
  } else if (src.flags().row_contiguous) {
    copy_type = CopyType::Vector;
  } else {
    copy_type = CopyType::General;
  }
  copy_gpu(src, out, copy_type);

  // Empty update.
  if (upd.size() == 0) {
    return;
  }

  // Select 64-bit index arithmetic in the kernel when either the index array
  // or the source array has more elements than fit in an int32.
  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;

  // One JIT module per (output dtype, index dtype, reduction).
  const char* op = reduce_type_ == ScatterAxis::Sum ? "Sum" : "Assign";
  std::string module_name = fmt::format(
      "scatter_axis_{}_{}_{}",
      dtype_to_string(out.dtype()),
      dtype_to_string(idx.dtype()),
      op);

  auto& s = stream();
  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    // Register every instantiation the launch code below may ask for: all
    // ndim values in [0, MAX_NDIM], the four combinations of the two
    // contiguity flags, and both index types. Note that the loop variable
    // |large| shadows the outer |large|.
    std::vector<std::string> kernel_names;
    for (int ndim = 0; ndim <= MAX_NDIM; ++ndim) {
      for (int contiguous = 0; contiguous < 4; ++contiguous) {
        for (int large = 0; large <= 1; ++large) {
          kernel_names.push_back(
              fmt::format(
                  "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
                  dtype_to_cuda_type(out.dtype()),
                  dtype_to_cuda_type(idx.dtype()),
                  op,
                  ndim,
                  contiguous & 1 ? true : false,
                  contiguous & 2 ? true : false,
                  large ? "int64_t" : "int32_t"));
        }
      }
    }
    return std::make_tuple(
        false, jit_source_scatter_axis, std::move(kernel_names));
  });

  // Split the index shape around |axis_|: the product of the dimensions
  // before the axis, the axis length itself, and the product of the
  // dimensions after the axis.
  size_t idx_size_pre = 1;
  size_t idx_size_post = 1;
  for (int i = 0; i < axis_; ++i) {
    idx_size_pre *= idx.shape(i);
  }
  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
    idx_size_post *= idx.shape(i);
  }
  size_t idx_size_axis = idx.shape(axis_);

  // Kernel arguments. The order presumably has to match the parameter list of
  // the device-side `scatter_axis` kernel, which is not visible here.
  cu::KernelArgs args;
  args.append(upd);
  args.append(idx);
  args.append(out);
  // The three sizes are passed with the same integer width that is selected
  // for the kernel's index type below.
  if (large) {
    args.append<int64_t>(idx_size_pre);
    args.append<int64_t>(idx_size_axis);
    args.append<int64_t>(idx_size_post);
  } else {
    args.append<int32_t>(idx_size_pre);
    args.append<int32_t>(idx_size_axis);
    args.append<int32_t>(idx_size_post);
  }
  // Shape and strides with the scatter axis removed, followed by the axis
  // itself and the per-axis extent/strides.
  args.append(remove_index(idx.shape(), axis_));
  args.append(remove_index(upd.strides(), axis_));
  args.append(remove_index(idx.strides(), axis_));
  args.append<int32_t>(axis_);
  args.append(out.shape(axis_));
  args.append(upd.strides(axis_));
  args.append(idx.strides(axis_));

  // This name must be one of the names registered in the builder above;
  // JitModule::get_kernel throws if it is not found.
  std::string kernel_name = fmt::format(
      "mlx::core::cu::scatter_axis<{}, {}, mlx::core::cu::Scatter{}, {}, {}, {}, {}>",
      dtype_to_cuda_type(out.dtype()),
      dtype_to_cuda_type(idx.dtype()),
      op,
      idx.ndim() - 1,
      upd.flags().row_contiguous,
      idx.flags().row_contiguous,
      large ? "int64_t" : "int32_t");

  auto& encoder = cu::get_command_encoder(s);
  for (const auto& in : inputs) {
    encoder.set_input_array(in);
  }
  encoder.set_output_array(out);
  auto kernel = mod.get_kernel(kernel_name);
  // The launch configuration is derived from the index array.
  auto [num_blocks, block_dims] = get_launch_args(idx, large);
  encoder.add_kernel_node_raw(
      kernel, num_blocks, block_dims, {}, 0, args.args());
}

// Evaluates MaskedScatter on the GPU.
//
// inputs[0] is the destination array, inputs[1] the mask and inputs[2] the
// source. The function allocates |out|, computes an exclusive prefix sum of
// the flattened mask along axis 1 and launches a JIT-compiled
// `masked_scatter` (or `masked_scatter_vec_contiguous`) kernel with the
// destination, mask, offsets, source and output.
void MaskedScatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("MaskedScatter::eval_gpu");
  assert(inputs.size() == 3);

  const array& dst = inputs[0];
  const array& mask = inputs[1];
  const array& src = inputs[2];

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // The output buffer is allocated even when the mask is empty; in that case
  // no kernel is launched.
  const size_t total = mask.size();
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  if (total == 0) {
    return;
  }

  // Flatten the mask (flatten_in_eval is called with start axis 1 and end
  // axis -1). If the result does not alias the original mask buffer it is a
  // new array and is registered as a temporary with the encoder.
  array mask_flat = flatten_in_eval(mask, 1, -1, s);
  if (mask_flat.data<void>() != mask.data<void>()) {
    encoder.add_temporary(mask_flat);
  }
  // The code below needs a row-contiguous mask; copy if necessary.
  if (!mask_flat.flags().row_contiguous) {
    mask_flat = contiguous_copy_gpu(mask_flat, s);
    encoder.add_temporary(mask_flat);
  }

  // int32 array with the same shape as the flattened mask. It receives the
  // result of the scan below. Note the element type is int32 regardless of
  // |large|.
  array scatter_offsets(mask_flat.shape(), int32, nullptr, {});
  scatter_offsets.set_data(cu::malloc_async(scatter_offsets.nbytes(), encoder));
  encoder.add_temporary(scatter_offsets);

  // Exclusive (inclusive == false), forward sum scan of the mask along axis 1.
  scan_gpu_inplace(
      mask_flat,
      scatter_offsets,
      Scan::Sum,
      /* axis= */ 1,
      /* reverse= */ false,
      /* inclusive= */ false,
      s);

  // Per-batch element counts; axis 0 of the mask/source is used as the batch
  // axis.
  const size_t batch_count = mask.shape(0);
  const size_t mask_batch_size = mask_flat.size() / batch_count;
  const size_t src_batch_size = src.size() / src.shape(0);
  // Use 64-bit indexing when the mask or the source exceeds int32 range.
  bool large = total > INT32_MAX || src.size() > INT32_MAX;
  // The vectorized kernel is only used when both source and destination are
  // row-contiguous.
  bool vectorized = src.flags().row_contiguous && dst.flags().row_contiguous;
  constexpr int kMaskedScatterVecSize = 16;
  constexpr int kMaskedScatterVecBlockDim = 256;

  // One JIT module per output dtype.
  std::string module_name =
      fmt::format("masked_scatter_{}", dtype_to_string(out.dtype()));
  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    // Register the general kernel for every (src contiguous, dst contiguous,
    // index type) combination, plus the vectorized kernel for both index
    // types.
    std::vector<std::string> kernel_names;
    for (int src_contiguous = 0; src_contiguous <= 1; ++src_contiguous) {
      for (int dst_contiguous = 0; dst_contiguous <= 1; ++dst_contiguous) {
        for (int use_large = 0; use_large <= 1; ++use_large) {
          kernel_names.push_back(
              fmt::format(
                  "mlx::core::cu::masked_scatter<{}, {}, {}, {}>",
                  dtype_to_cuda_type(out.dtype()),
                  src_contiguous ? "true" : "false",
                  dst_contiguous ? "true" : "false",
                  use_large ? "int64_t" : "int32_t"));
        }
      }
    }
    for (int use_large = 0; use_large <= 1; ++use_large) {
      kernel_names.push_back(
          fmt::format(
              "mlx::core::cu::masked_scatter_vec_contiguous<{}, {}, {}>",
              dtype_to_cuda_type(out.dtype()),
              use_large ? "int64_t" : "int32_t",
              kMaskedScatterVecSize));
    }
    return std::make_tuple(false, jit_source_scatter, std::move(kernel_names));
  });

  // Kernel arguments. The order presumably has to match the device-side
  // kernel signatures, which are not visible here.
  cu::KernelArgs args;
  args.append(dst);
  args.append(mask_flat);
  args.append(scatter_offsets);
  args.append(src);
  args.append(out);
  // Sizes use the same integer width as the kernel's index type.
  if (large) {
    args.append<int64_t>(mask_flat.size());
    args.append<int64_t>(src_batch_size);
    args.append<int64_t>(mask_batch_size);
  } else {
    args.append<int32_t>(mask_flat.size());
    args.append<int32_t>(src_batch_size);
    args.append<int32_t>(mask_batch_size);
  }
  // Only the general kernel receives shape/stride information.
  if (!vectorized) {
    args.append_ndim(dst.shape());
    args.append_ndim(dst.strides());
    args.append<int32_t>(dst.ndim());
    args.append_ndim(src.shape());
    args.append_ndim(src.strides());
    args.append<int32_t>(src.ndim());
  }

  encoder.set_input_array(dst);
  encoder.set_input_array(mask_flat);
  encoder.set_input_array(scatter_offsets);
  encoder.set_input_array(src);
  encoder.set_output_array(out);

  // The chosen name must be one of the names registered in the builder above.
  std::string kernel_name = vectorized
      ? fmt::format(
            "mlx::core::cu::masked_scatter_vec_contiguous<{}, {}, {}>",
            dtype_to_cuda_type(out.dtype()),
            large ? "int64_t" : "int32_t",
            kMaskedScatterVecSize)
      : fmt::format(
            "mlx::core::cu::masked_scatter<{}, {}, {}, {}>",
            dtype_to_cuda_type(out.dtype()),
            src.flags().row_contiguous ? "true" : "false",
            dst.flags().row_contiguous ? "true" : "false",
            large ? "int64_t" : "int32_t");
  auto kernel = mod.get_kernel(kernel_name);
  // The vectorized launch handles kMaskedScatterVecSize elements per thread
  // with at most kMaskedScatterVecBlockDim threads per block.
  auto [num_blocks, block_dims] = vectorized
      ? get_launch_args(
            mask_flat, large, kMaskedScatterVecSize, kMaskedScatterVecBlockDim)
      : get_launch_args(mask_flat, large);
  encoder.add_kernel_node_raw(
      kernel, num_blocks, block_dims, {}, 0, args.args());
}

// Evaluates SliceUpdate on the GPU.
//
// inputs[0] is the array being updated and inputs[1] the update. The input is
// copied into |out| and the update is then written into the slice described
// by |start_indices_| and |strides_|: with a plain strided copy when
// |reduce_type_| is SliceUpdate::None, otherwise with a JIT-compiled
// `slice_update_op` kernel parameterized by the reduction.
void SliceUpdate::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("SliceUpdate::eval_gpu");
  assert(inputs.size() == 2);
  // Nothing to do for an empty output.
  if (out.size() == 0) {
    return;
  }

  auto& in = inputs[0];
  auto& upd = inputs[1];

  // With an empty update the output simply shares the input's buffer.
  if (upd.size() == 0) {
    out.copy_shared_buffer(in);
    return;
  }

  // Copy the input into the output: Scalar when the input holds a single
  // element, Vector when it is contiguous with no gaps, General otherwise.
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
  copy_gpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());

  // Calculate out strides, initial offset and if copy needs to be made
  auto [data_offset, out_strides] =
      prepare_slice(out, start_indices_, strides_);

  // Do copy for None reduce type
  if (reduce_type_ == SliceUpdate::None) {
    copy_gpu_inplace(
        /* const array& src = */ upd,
        /* array& dst = */ out,
        /* const Shape& data_shape = */ upd.shape(),
        /* const Strides& i_strides = */ upd.strides(),
        /* const Strides& o_strides = */ out_strides,
        /* int64_t i_offset = */ 0,
        /* int64_t o_offset = */ data_offset,
        /* CopyType ctype = */ CopyType::GeneralGeneral,
        /* const Stream& s = */ stream());
    return;
  }

  // Collapse dimensions that are contiguous in both the update strides
  // (strides[0]) and the output slice strides (strides[1]).
  auto [shape, strides] =
      collapse_contiguous_dims(upd.shape(), {upd.strides(), out_strides});
  // Elements handled per thread: 4 or 2 when the innermost collapsed
  // dimension is divisible by that amount, otherwise 1.
  int nwork = 1;
  if (shape.back() % 4 == 0) {
    nwork = 4;
  } else if (shape.back() % 2 == 0) {
    nwork = 2;
  }

  const char* op_name = g_slice_ops[reduce_type_];
  // Only the row-contiguity result for the output strides is used; |ds| and
  // |cc| are unused.
  auto [ds, rc, cc] = check_contiguity(shape, strides[1]);
  bool upd_contiguous = upd.flags().row_contiguous;
  bool upd_scalar = upd.data_size() == 1;
  bool out_contiguous = rc;
  // 64-bit indexing is selected from the update size only.
  bool large = upd.size() > INT32_MAX;
  // One JIT module per (reduction, output dtype).
  std::string module_name =
      fmt::format("slice_update_{}_{}", op_name, dtype_to_string(out.dtype()));

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    // Register all combinations of the three boolean flags, both index types
    // and nwork in {1, 2, 4, 8, 16}. This function itself only ever selects
    // nwork in {1, 2, 4}. The loop variables |large| and |nwork| shadow the
    // outer variables of the same name.
    std::vector<std::string> kernel_names;
    for (int out_c = 0; out_c <= 1; ++out_c) {
      for (int upd_c = 0; upd_c <= 1; ++upd_c) {
        for (int upd_s = 0; upd_s <= 1; ++upd_s) {
          for (int large = 0; large <= 1; ++large) {
            for (int nwork = 1; nwork <= 16; nwork *= 2) {
              kernel_names.push_back(
                  fmt::format(
                      "mlx::core::cu::slice_update_op<{}, {}, mlx::core::cu::{}, {}, {}, {}, {}>",
                      dtype_to_cuda_type(out.dtype()),
                      large ? "int64_t" : "int32_t",
                      op_name,
                      out_c ? "true" : "false",
                      upd_c ? "true" : "false",
                      upd_s ? "true" : "false",
                      nwork));
            }
          }
        }
      }
    }
    return std::make_tuple(
        false, jit_source_slice_update, std::move(kernel_names));
  });

  // Kernel arguments. The order presumably has to match the device-side
  // `slice_update_op` signature, which is not visible here.
  cu::KernelArgs args;
  args.append(upd);
  args.append(out);
  args.append<int64_t>(upd.size());
  args.append_ndim(shape);
  args.append_ndim(strides[0]);
  args.append<int32_t>(shape.size());
  args.append_ndim(strides[1]);
  args.append<int64_t>(data_offset);

  encoder.set_input_array(upd);
  encoder.set_output_array(out);

  // The bool arguments are formatted by fmt, so the resulting name must match
  // the "true"/"false" spelling used when registering the kernels above.
  std::string kernel_name;
  kernel_name = fmt::format(
      "mlx::core::cu::slice_update_op<{}, {}, mlx::core::cu::{}, {}, {}, {}, {}>",
      dtype_to_cuda_type(out.dtype()),
      large ? "int64_t" : "int32_t",
      op_name,
      out_contiguous,
      upd_contiguous,
      upd_scalar,
      nwork);

  auto kernel = mod.get_kernel(kernel_name);
  // The launch is sized from the update array with |nwork| elements per
  // thread.
  auto [num_blocks, block_dims] = get_launch_args(upd, large, nwork);
  encoder.add_kernel_node_raw(
      kernel, num_blocks, block_dims, {}, 0, args.args());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/jit_module.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/jit_module.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/version.h"

#include "cuda_jit_sources.h"

#include <cstdlib>
#include <filesystem>
#include <fstream>

#include <fmt/format.h>
#include <nvrtc.h>

namespace mlx::core::cu {

namespace {

#define CHECK_NVRTC_ERROR(cmd) check_nvrtc_error(#cmd, (cmd))

void check_nvrtc_error(const char* name, nvrtcResult err) {
  if (err != NVRTC_SUCCESS) {
    throw std::runtime_error(
        fmt::format("{} failed: {}", name, nvrtcGetErrorString(err)));
  }
}

// Return the default path to CUDA toolkit.
//
// On Windows this is the first sub-directory of the standard NVIDIA install
// location that contains include/cuda.h, or an empty path when none is found
// or the location cannot be read. On other platforms it is /usr/local/cuda.
// The result is computed once and cached.
const std::filesystem::path& default_cuda_toolkit_path() {
#if defined(_WIN32)
  static auto cached_path = []() -> std::filesystem::path {
    std::filesystem::path root(
        LR"(C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA)");
    // Use the non-throwing overloads: a missing or unreadable install
    // directory must yield an empty path so that the caller can report its
    // own error instead of receiving a filesystem_error.
    std::error_code error;
    std::filesystem::directory_iterator it(root, error);
    const std::filesystem::directory_iterator end;
    for (; !error && it != end; it.increment(error)) {
      std::error_code exists_error;
      if (std::filesystem::exists(
              it->path() / "include" / "cuda.h", exists_error)) {
        return it->path();
      }
    }
    return {};
  }();
#else
  static std::filesystem::path cached_path = "/usr/local/cuda";
#endif
  return cached_path;
}

// Builds (once) and returns the --include-path options handed to NVRTC: the
// bundled CCCL headers when they can be found, followed by the CUDA runtime
// headers. Throws std::runtime_error when no CUDA header directory exists.
const std::vector<std::string>& include_path_args() {
  static std::vector<std::string> cached_args = []() {
    namespace fs = std::filesystem;
    std::vector<std::string> flags;

    // The bundled headers live next to the binary on Windows and one level
    // above it elsewhere.
    auto root_dir = current_binary_dir();
#if !defined(_WIN32)
    root_dir = root_dir.parent_path();
#endif

    // CCCL headers: prefer the bundled copy and fall back to the build-time
    // location when one was configured.
    auto cccl_dir = root_dir / "include" / "cccl";
#if defined(MLX_CCCL_DIR)
    if (!fs::exists(cccl_dir)) {
      cccl_dir = MLX_CCCL_DIR;
    }
#endif
    if (fs::exists(cccl_dir)) {
      flags.push_back(fmt::format("--include-path={}", cccl_dir.string()));
    }

    // CUDA runtime headers: try the locally installed python package first,
    // then CUDA_HOME / CUDA_PATH, then the platform default toolkit path.
    auto cuda_include =
        root_dir.parent_path() / "nvidia" / "cuda_runtime" / "include";
    if (!fs::exists(cuda_include)) {
      const char* home = std::getenv("CUDA_HOME");
      if (home == nullptr) {
        home = std::getenv("CUDA_PATH");
      }
      const fs::path toolkit =
          home != nullptr ? fs::path(home) : default_cuda_toolkit_path();
      cuda_include = toolkit.empty() ? fs::path() : toolkit / "include";
      if (cuda_include.empty() || !fs::exists(cuda_include)) {
        throw std::runtime_error(
            "Can not find locations of CUDA headers, please set environment "
            "variable CUDA_HOME or CUDA_PATH.");
      }
    }
    flags.push_back(fmt::format("--include-path={}", cuda_include.string()));
    return flags;
  }();
  return cached_args;
}

// Get the cache directory for storing compiled results.
//
// The directory is MLX_PTX_CACHE_DIR when that environment variable is set,
// otherwise <temp dir>/mlx/<version>/ptx. It is created on first use. An empty
// path is returned when the directory cannot be determined or created, which
// callers treat as "disk cache disabled". The result is computed once.
const std::filesystem::path& ptx_cache_dir() {
  static std::filesystem::path cache = []() -> std::filesystem::path {
    std::filesystem::path cache;
    std::error_code error;
    if (auto c = std::getenv("MLX_PTX_CACHE_DIR"); c) {
      cache = c;
    } else {
      // Non-throwing lookup: without a temp directory there is no cache.
      auto tmp = std::filesystem::temp_directory_path(error);
      if (error) {
        return std::filesystem::path();
      }
      cache = tmp / "mlx" / version() / "ptx";
    }

#if defined(_WIN32)
    // Add "\\?\" prefix to support long file path.
    const wchar_t* long_path_prefix = L"\\\\?\\";
    if (cache.is_relative()) {
      cache = std::filesystem::absolute(cache, error);
      if (error) {
        return std::filesystem::path();
      }
    }
    if (!cache.native().starts_with(long_path_prefix)) {
      cache = long_path_prefix + cache.native();
    }
#endif

    // create_directories() returns false when nothing new was created, which
    // includes the case where the directory already exists (for example when
    // another process created it concurrently). Judge success by the error
    // code and by the directory actually being there, not by the return value.
    std::filesystem::create_directories(cache, error);
    if (error || !std::filesystem::is_directory(cache, error)) {
      return std::filesystem::path();
    }
    return cache;
  }();
  return cache;
}

// Maps |module_name| to the path of its cached ".ptx" file below |cache_dir|.
//
// Names of up to 245 characters become a single file name. Longer names are
// cut into 245-character pieces: every piece but the last becomes a nested
// directory and the last piece, with ".ptx" appended, is the file name.
std::filesystem::path get_ptx_path(
    const std::filesystem::path& cache_dir,
    const std::string& module_name) {
  constexpr std::size_t kMaxComponentLength = 245;

  std::filesystem::path result = cache_dir;
  std::size_t consumed = 0;
  for (; module_name.size() - consumed > kMaxComponentLength;
       consumed += kMaxComponentLength) {
    result /= module_name.substr(consumed, kMaxComponentLength);
  }
  result /= module_name.substr(consumed) + ".ptx";
  return result;
}

// Try to read the cached |ptx| and |ptx_kernels| from |cache_dir|.
//
// Returns true only when both the compiled module and its kernel-name table
// (the ".txt" file next to the ".ptx") could be read. On failure |ptx| and
// |ptx_kernels| are left untouched so that the caller can fall back to
// compiling. On success |ptx| is replaced and one (name, mangled name) pair
// per well-formed line of the table is appended to |ptx_kernels|.
bool read_cached_ptx(
    const std::filesystem::path& cache_dir,
    const std::string& module_name,
    std::string& ptx,
    std::vector<std::pair<std::string, std::string>>& ptx_kernels) {
  if (cache_dir.empty()) {
    return false;
  }

  auto ptx_path = get_ptx_path(cache_dir, module_name);
  std::error_code error;
  auto ptx_size = std::filesystem::file_size(ptx_path, error);
  if (error) {
    return false;
  }
  std::ifstream ptx_file(ptx_path, std::ios::binary);
  if (!ptx_file.good()) {
    return false;
  }

  // A module without its name table is unusable: no kernel could be looked
  // up afterwards. Treat a missing table as a cache miss.
  std::ifstream txt_file(ptx_path.replace_extension(".txt"), std::ios::binary);
  if (!txt_file.good()) {
    return false;
  }

  // Read into a local buffer first so that a short read (for example a file
  // that is still being written) does not hand back a truncated module.
  std::string cached_ptx(ptx_size, '\0');
  ptx_file.read(cached_ptx.data(), ptx_size);
  if (!ptx_file) {
    return false;
  }
  ptx = std::move(cached_ptx);

  // Each line is "<kernel name>\t<mangled name>"; lines without a tab are
  // skipped.
  std::string line;
  while (std::getline(txt_file, line)) {
    auto tab = line.find('\t');
    if (tab != std::string::npos) {
      ptx_kernels.emplace_back(line.substr(0, tab), line.substr(tab + 1));
    }
  }
  return true;
}

// Write the |ptx|, |ptx_kernels| and |source_code| of |module_name| to
// |cache_dir|.
//
// Three files are produced next to each other: "<name>.ptx" with the compiled
// module, "<name>.txt" with one "<kernel name>\t<mangled name>" line per
// kernel and "<name>.cu" with the source. Writing the cache is best effort:
// nothing is written when |cache_dir| is empty or when the target directory
// cannot be created.
void write_cached_ptx(
    const std::filesystem::path& cache_dir,
    const std::string& module_name,
    const std::string& ptx,
    const std::vector<std::pair<std::string, std::string>>& ptx_kernels,
    const std::string& source_code) {
  if (cache_dir.empty()) {
    return;
  }

  const auto ptx_path = get_ptx_path(cache_dir, module_name);

  // Ensure that the directory exists. Use the non-throwing overload so that a
  // cache problem never turns a successful compilation into an exception.
  const auto parent = ptx_path.parent_path();
  if (parent != cache_dir) {
    std::error_code error;
    std::filesystem::create_directories(parent, error);
    if (error) {
      return;
    }
  }

  // Write the mangled names and the generated code first. The reader keys off
  // the existence of the ".ptx" file, so writing that file last narrows the
  // window in which another process could see a module without its name
  // table.
  {
    auto txt_path = ptx_path;
    std::ofstream txt_file(
        txt_path.replace_extension(".txt"), std::ios::binary);
    for (const auto& [name, mangled] : ptx_kernels) {
      // '\n' instead of std::endl: no need to flush after every line.
      txt_file << name << '\t' << mangled << '\n';
    }
  }
  {
    auto source_path = ptx_path;
    std::ofstream source_file(source_path.replace_extension(".cu"));
    source_file << source_code;
  }

  // Write the compiled code.
  std::ofstream ptx_file(ptx_path, std::ios::binary);
  if (!ptx.empty()) {
    ptx_file.write(ptx.data(), ptx.size());
  }
}

// Returns true when |device|'s compute capability is at most
// |major|.|minor|, comparing the major number first and the minor number
// only on a tie.
inline bool version_lower_equal(Device& device, int major, int minor) {
  const auto device_major = device.compute_capability_major();
  if (device_major != major) {
    return device_major < major;
  }
  return device.compute_capability_minor() <= minor;
}

// Returns whether the NVRTC version in use can emit SASS for |device|.
//
// Each NVRTC release is mapped to the newest compute capability it is taken
// to support; NVRTC 11.8 and newer are assumed to support every device, and
// releases older than 9 none.
bool compiler_supports_device_sass(Device& device) {
  int nvrtc_major, nvrtc_minor;
  CHECK_NVRTC_ERROR(nvrtcVersion(&nvrtc_major, &nvrtc_minor));

  switch (nvrtc_major) {
    case 9:
      return version_lower_equal(device, 7, 2);
    case 10:
      return version_lower_equal(device, 7, 5);
    case 11:
      if (nvrtc_minor == 0) {
        return version_lower_equal(device, 8, 0);
      }
      if (nvrtc_minor < 8) {
        return version_lower_equal(device, 8, 6);
      }
      return true;
    default:
      // Anything older than 9 is unsupported, anything newer than 11
      // supports all devices.
      return nvrtc_major > 11;
  }
}

#define INCLUDE_PREFIX "mlx/backend/cuda/device/"

constexpr const char* g_include_names[] = {
    INCLUDE_PREFIX "atomic_ops.cuh",
    INCLUDE_PREFIX "binary_ops.cuh",
    INCLUDE_PREFIX "cast_op.cuh",
    INCLUDE_PREFIX "config.h",
    INCLUDE_PREFIX "complex.cuh",
    INCLUDE_PREFIX "fp16_math.cuh",
    INCLUDE_PREFIX "hadamard.cuh",
    INCLUDE_PREFIX "indexing.cuh",
    INCLUDE_PREFIX "scatter_ops.cuh",
    INCLUDE_PREFIX "unary_ops.cuh",
    INCLUDE_PREFIX "ternary_ops.cuh",
    INCLUDE_PREFIX "utils.cuh",
};

#undef INCLUDE_PREFIX

constexpr const char* g_headers[] = {
    jit_source_atomic_ops,
    jit_source_binary_ops,
    jit_source_cast_op,
    jit_source_config,
    jit_source_complex,
    jit_source_fp16_math,
    jit_source_hadamard,
    jit_source_indexing,
    jit_source_scatter_ops,
    jit_source_unary_ops,
    jit_source_ternary_ops,
    jit_source_utils,
};

// Compiles |source| with NVRTC for |device|.
//
// |module_name| is only used to name the program ("<module_name>.cu").
// |kernel_names| are registered as name expressions so that their lowered
// (mangled) names can be queried after compilation; one (name, lowered name)
// pair per entry is appended to |ptx_kernels|. The compiled image is stored in
// |ptx|: a cubin when NVRTC can target the device's SASS directly, PTX
// otherwise. Throws std::runtime_error when an NVRTC call fails or when
// compilation fails (the message then carries the program log).
void compile(
    Device& device,
    const std::string& module_name,
    const std::string& source,
    const std::vector<std::string>& kernel_names,
    std::string& ptx,
    std::vector<std::pair<std::string, std::string>>& ptx_kernels) {
  // Create the program
  nvrtcProgram prog;
  CHECK_NVRTC_ERROR(nvrtcCreateProgram(
      &prog,
      source.c_str(),
      (module_name + ".cu").c_str(),
      std::size(g_headers),
      g_headers,
      g_include_names));
  // Destroy the program on every exit path.
  // NOTE(review): the deleter uses CHECK_NVRTC_ERROR, which throws on failure;
  // a throw from a unique_ptr deleter would terminate the program.
  std::unique_ptr<nvrtcProgram, void (*)(nvrtcProgram*)> prog_freer(
      &prog,
      [](nvrtcProgram* p) { CHECK_NVRTC_ERROR(nvrtcDestroyProgram(p)); });
  // Name expressions have to be added before compiling so that the lowered
  // names can be retrieved afterwards.
  for (const auto& name : kernel_names) {
    CHECK_NVRTC_ERROR(nvrtcAddNameExpression(prog, name.c_str()));
  }

  // Compile program.
  std::vector<const char*> args;
  bool use_sass = compiler_supports_device_sass(device);
  auto cc = device.compute_capability_major();
  // Devices with major compute capability 9 or above get the "a" architecture
  // suffix.
  std::string arch_tag = (cc >= 9) ? "a" : "";
  // "sm_XY" requests SASS, "compute_XY" requests PTX.
  std::string compute = fmt::format(
      "--gpu-architecture={}_{}{}{}",
      use_sass ? "sm" : "compute",
      cc,
      device.compute_capability_minor(),
      arch_tag);
  args.push_back(compute.c_str());
  for (const auto& include : include_path_args()) {
    args.push_back(include.c_str());
  }
  nvrtcResult compile_result =
      nvrtcCompileProgram(prog, args.size(), args.data());
  if (compile_result != NVRTC_SUCCESS) {
    // Report the compiler log. The buffer has one extra zeroed byte so that
    // it is always NUL-terminated.
    size_t log_size;
    CHECK_NVRTC_ERROR(nvrtcGetProgramLogSize(prog, &log_size));
    std::vector<char> log(log_size + 1, 0);
    CHECK_NVRTC_ERROR(nvrtcGetProgramLog(prog, log.data()));
    throw std::runtime_error(
        fmt::format("Failed to compile kernel: {}.", log.data()));
  }

  // Get mangled names of kernel names.
  for (const auto& name : kernel_names) {
    const char* mangled;
    CHECK_NVRTC_ERROR(nvrtcGetLoweredName(prog, name.c_str(), &mangled));
    ptx_kernels.emplace_back(name, mangled);
  }

  // Get ptx data.
  // Despite its name, |ptx| holds a cubin when |use_sass| is true.
  size_t ptx_size;
  if (use_sass) {
    CHECK_NVRTC_ERROR(nvrtcGetCUBINSize(prog, &ptx_size));
  } else {
    CHECK_NVRTC_ERROR(nvrtcGetPTXSize(prog, &ptx_size));
  }
  ptx.resize(ptx_size);
  if (use_sass) {
    CHECK_NVRTC_ERROR(nvrtcGetCUBIN(prog, ptx.data()));
  } else {
    CHECK_NVRTC_ERROR(nvrtcGetPTX(prog, ptx.data()));
  }
}

void load_module(
    const std::string& module_name,
    const std::string& ptx,
    const std::vector<std::pair<std::string, std::string>>& ptx_kernels,
    CUmodule& module_,
    std::unordered_map<std::string, std::tuple<CUfunction, bool, uint32_t>>&
        kernels) {
  // Load module.
  char jit_log[4089] = {};
  CUjit_option options[] = {
      CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
  void* values[] = {jit_log, reinterpret_cast<void*>(std::size(jit_log) - 1)};
  CUresult jit_result = cuModuleLoadDataEx(
      &module_, ptx.data(), std::size(options), options, values);
  if (jit_result != CUDA_SUCCESS) {
    throw std::runtime_error(
        fmt::format(
            "Failed to load compiled {} kernel: {}.", module_name, jit_log));
  }

  // Load kernels.
  for (const auto& [name, mangled] : ptx_kernels) {
    CUfunction kernel;
    CHECK_CUDA_ERROR(cuModuleGetFunction(&kernel, module_, mangled.c_str()));
    kernels[name] = std::make_tuple(kernel, false, 0);
  }
}

} // namespace

// Creates the module |module_name| for |device|.
//
// The compiled image is taken from the on-disk cache when present. Otherwise
// |builder| is invoked and its result is either used as-is (precompiled) or
// compiled with NVRTC; when |use_disk_cache| is set the result is then stored
// in the cache. Finally the image is loaded and its kernels are resolved.
// NOTE(review): the disk cache is consulted even when |use_disk_cache| is
// false — confirm that this is intended.
JitModule::JitModule(
    Device& device,
    const std::string& module_name,
    const KernelBuilder& builder,
    bool use_disk_cache) {
  // Will hold the actual device executable source code and kernel names
  std::string ptx;
  std::vector<std::pair<std::string, std::string>> ptx_kernels;

  // Try to load them from the file cache
  if (!read_cached_ptx(ptx_cache_dir(), module_name, ptx, ptx_kernels)) {
    auto [precompiled, source_code, kernel_names] = builder();

    // Get the PTX or cubin
    if (precompiled) {
      // The builder already produced a loadable image; its kernel names are
      // used unmangled.
      ptx = std::move(source_code);
      // |source_code| is still passed to write_cached_ptx() below. Put the
      // moved-from string into a defined (empty) state instead of relying on
      // whatever the move left behind.
      source_code.clear();
      for (auto& name : kernel_names) {
        ptx_kernels.emplace_back(name, name);
      }
    } else {
      compile(device, module_name, source_code, kernel_names, ptx, ptx_kernels);
    }

    // If requested save them in the file cache for the next launch
    if (use_disk_cache) {
      write_cached_ptx(
          ptx_cache_dir(), module_name, ptx, ptx_kernels, source_code);
    }
  }

  // Load the module
  load_module(module_name, ptx, ptx_kernels, module_, kernels_);
}

// Unloads the driver module that was loaded by the constructor.
JitModule::~JitModule() {
  CHECK_CUDA_ERROR(cuModuleUnload(module_));
}

// Returns the function handle of |kernel_name| together with the block
// dimension computed for it.
//
// On the first request for a kernel, |configure_kernel| (when provided) is
// invoked on it and the block dimension is computed with
// max_occupancy_block_dim(); later requests return the stored values without
// calling |configure_kernel| again. Throws std::runtime_error for unknown
// kernel names.
std::pair<CUfunction, uint32_t> JitModule::get_kernel_and_dims(
    const std::string& kernel_name,
    std::function<void(CUfunction)> configure_kernel) {
  const auto entry = kernels_.find(kernel_name);
  if (entry == kernels_.end()) {
    throw std::runtime_error(
        fmt::format("There is no kernel named {}.", kernel_name));
  }

  auto& [kernel, configured, block_dim] = entry->second;
  if (!configured) {
    // One-time setup for this kernel.
    if (configure_kernel) {
      configure_kernel(kernel);
    }
    configured = true;
    block_dim = max_occupancy_block_dim(kernel);
  }

  return {kernel, block_dim};
}

// Convenience wrapper around get_kernel_and_dims() that drops the block
// dimension and returns only the function handle.
CUfunction JitModule::get_kernel(
    const std::string& kernel_name,
    std::function<void(CUfunction)> configure_kernel) {
  auto kernel_and_dims =
      get_kernel_and_dims(kernel_name, std::move(configure_kernel));
  return kernel_and_dims.first;
}

// Returns the process-wide table of loaded JIT modules, keyed by module name.
// The table is created on first use.
std::unordered_map<std::string, JitModule>& get_jit_module_cache() {
  static std::unordered_map<std::string, JitModule> modules;
  return modules;
}

// Returns the JIT module called |name|, building and loading it on first use.
//
// |builder| is only consulted when the module is not already in the in-memory
// table; |cache| is forwarded to the JitModule constructor as its disk-cache
// flag. The table is keyed by |name| only, so |device| matters solely for the
// first request of a given name.
JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
    const KernelBuilder& builder,
    bool cache) {
  auto& modules = get_jit_module_cache();
  if (auto found = modules.find(name); found != modules.end()) {
    return found->second;
  }
  return modules.try_emplace(name, cu::device(device), name, builder, cache)
      .first->second;
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/jit_module.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/array.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/config.h"

#include <deque>
#include <unordered_map>
#include <utility>
#include <variant>

#include <cuda.h>
#include <fmt/format.h>

namespace mlx::core::cu {

class Device;

// Result of a kernel builder: whether the returned text is already a loadable
// image (precompiled) rather than source to be compiled, the text itself, and
// the names of the kernels it contains.
using KernelBuilderResult = std::tuple<
    /* precompiled */ bool,
    /* source code */ std::string,
    /* kernel names */ std::vector<std::string>>;
// Callback that produces the contents of a JIT module on demand.
using KernelBuilder = std::function<KernelBuilderResult()>;

// Collects the arguments of a kernel launch.
//
// Every appended value is copied into |storage_| and a pointer to it is
// recorded in |args_|, so that args() can hand out the array of argument
// pointers expected by the launch API. The object has to outlive the use of
// the pointers returned by args().
struct KernelArgs {
  // Returns the array of pointers to the stored arguments, in append order.
  void** args() {
    return args_.data();
  }

  // Appends the device pointer of |a| (as obtained from gpu_ptr).
  void append(const array& a) {
    append(reinterpret_cast<CUdeviceptr>(gpu_ptr<void>(a)));
  }

  // Appends a single value. The recorded pointer is the address of the
  // variant element holding the value.
  // NOTE(review): this relies on the active alternative being stored at the
  // address of the std::variant object itself — confirm for the supported
  // standard libraries.
  template <typename T>
  void append(T val) {
    storage_.emplace_back(val);
    append_ptr(&storage_.back());
  }

  // Appends the elements of |vec|: the recorded pointer is the vector's data
  // pointer, not the address of the vector object.
  template <typename T>
  void append(SmallVector<T> vec) {
    storage_.emplace_back(std::move(vec));
    append_ptr(std::get<SmallVector<T>>(storage_.back()).data());
  }

  // Same as above for a std::vector, which is copied into a SmallVector.
  template <typename T>
  void append(const std::vector<T>& vec) {
    append(SmallVector<T>(vec.begin(), vec.end()));
  }

  // Make sure the arg is copied to an array with size of NDIM.
  // Throws std::runtime_error when |vec| has more than NDIM elements; shorter
  // vectors are extended to NDIM elements by resize().
  template <size_t NDIM = MAX_NDIM, typename T>
  void append_ndim(SmallVector<T> vec) {
    if (vec.size() > NDIM) {
      throw std::runtime_error(
          fmt::format("ndim can not be larger than {}.", NDIM));
    }
    vec.resize(NDIM);
    append(std::move(vec));
  }

  // Records a raw pointer as the next argument without storing anything; the
  // caller is responsible for keeping the pointee alive.
  void append_ptr(const void* v) {
    args_.push_back(const_cast<void*>(v));
  }

 private:
  // Pointers to the arguments, in the order they were appended.
  std::vector<void*> args_;

  // The cuGraphAddKernelNode API requires passing pointers to arguments so
  // store temporary values until the node is created.
  using Arg = std::variant<
      std::monostate,
      CUdeviceptr,
      bool,
      int32_t,
      uint32_t,
      int64_t,
      float,
      SmallVector<const void*>,
      SmallVector<int32_t>,
      SmallVector<int64_t>>;
  // A deque is used so that pointers to already stored elements stay valid
  // when further elements are appended at the back.
  std::deque<Arg> storage_;
};

// A loaded CUDA module together with the kernels it contains.
//
// The module is non-copyable: it owns a CUmodule handle for its lifetime.
class JitModule {
 public:
  // Obtains the contents of |module_name| via |builder| (or from the cache)
  // and loads them. |cache| is the disk-cache flag; it is named
  // |use_disk_cache| in the definition.
  JitModule(
      Device& device,
      const std::string& module_name,
      const KernelBuilder& builder,
      bool cache);
  ~JitModule();

  JitModule(const JitModule&) = delete;
  JitModule& operator=(const JitModule&) = delete;
  // Returns the function handle of |kernel_name|. |configure_kernel|, when
  // provided, is invoked the first time the kernel is requested.
  CUfunction get_kernel(
      const std::string& kernel_name,
      std::function<void(CUfunction)> configure_kernel = nullptr);
  // Same as get_kernel() but additionally returns the block dimension stored
  // for the kernel.
  std::pair<CUfunction, uint32_t> get_kernel_and_dims(
      const std::string& kernel_name,
      std::function<void(CUfunction)> configure_kernel = nullptr);

 private:
  // Handle of the loaded module.
  CUmodule module_{nullptr};
  // Per kernel name: the function handle, whether the kernel has been
  // requested (and configured) before, and its block dimension.
  std::unordered_map<std::string, std::tuple<CUfunction, bool, uint32_t>>
      kernels_;
};

// Returns the table of loaded JIT modules, keyed by module name.
std::unordered_map<std::string, JitModule>& get_jit_module_cache();

// Returns the JIT module called |name|, creating it with |builder| when it is
// not in the table yet. |use_disk_cache| controls whether a newly built module
// is also written to the on-disk cache.
JitModule& get_jit_module(
    const mlx::core::Device& device,
    const std::string& name,
    const KernelBuilder& builder,
    bool use_disk_cache = true);

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/kernel_utils.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/kernel_utils.cuh"

namespace mlx::core {

// dim3 flavour of get_block_dims_common(): forwards the arguments and repacks
// the three returned components.
dim3 get_block_dims(int dim0, int dim1, int dim2, int pow2) {
  const auto [x, y, z] = get_block_dims_common(dim0, dim1, dim2, pow2);
  return dim3(x, y, z);
}

// dim3 flavour of get_2d_grid_dims_common(): forwards the arguments and
// repacks the three returned components.
dim3 get_2d_grid_dims(const Shape& shape, const Strides& strides) {
  const auto [x, y, z] = get_2d_grid_dims_common(shape, strides);
  return dim3(x, y, z);
}

// dim3 flavour of the get_2d_grid_dims_common() overload that takes a
// |divisor|: forwards the arguments and repacks the three returned components.
dim3 get_2d_grid_dims(
    const Shape& shape,
    const Strides& strides,
    size_t divisor) {
  const auto [x, y, z] = get_2d_grid_dims_common(shape, strides, divisor);
  return dim3(x, y, z);
}

// dim3 flavour of get_grid_and_block_common(): returns the grid dimensions
// first and the block dimensions second.
std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2) {
  const auto dims = get_grid_and_block_common(dim0, dim1, dim2);
  const auto& grid = std::get<0>(dims);
  const auto& block = std::get<1>(dims);
  return std::make_pair(
      dim3(std::get<0>(grid), std::get<1>(grid), std::get<2>(grid)),
      dim3(std::get<0>(block), std::get<1>(block), std::get<2>(block)));
}

std::tuple<dim3, uint32_t> get_launch_args(
    size_t size,
    const Shape& shape,
    const Strides& strides,
    bool large,
    int work_per_thread /* = 1 */,
    uint32_t max_block_dim /* = 1024 */) {
  size_t nthreads = cuda::ceil_div(size, work_per_thread);
  uint32_t block_dim = max_block_dim < nthreads ? max_block_dim : nthreads;
  dim3 num_blocks;
  if (large) {
    num_blocks = get_2d_grid_dims(shape, strides, work_per_thread);
    num_blocks.x = cuda::ceil_div(num_blocks.x, block_dim);
  } else {
    num_blocks.x = cuda::ceil_div(nthreads, block_dim);
  }
  return std::make_tuple(num_blocks, block_dim);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/kernel_utils.cuh
================================================
// Copyright © 2025 Apple Inc.

// This file includes host-only utilities for writing CUDA kernels. It differs
// from backend/cuda/device/utils.cuh in that the latter only includes
// device-only code.

#pragma once

#include <type_traits>

#include "mlx/array.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <fmt/format.h>
#include <cuda/cmath>

namespace mlx::core {

// Calls |f| with std::integral_constant<int, n> for n equal to 1, 2 or 3, so
// that the callee receives |n| as a compile-time constant. Any other value of
// |n| results in no call.
template <typename F>
void dispatch_1_2_3(int n, F&& f) {
  if (n == 1) {
    f(std::integral_constant<int, 1>{});
  } else if (n == 2) {
    f(std::integral_constant<int, 2>{});
  } else if (n == 3) {
    f(std::integral_constant<int, 3>{});
  }
}

// Calls |f| with std::true_type or std::false_type according to |v|, turning
// the runtime flag into a compile-time constant for the callee.
template <typename F>
void dispatch_bool(bool v, F&& f) {
  if (!v) {
    f(std::false_type{});
    return;
  }
  f(std::true_type{});
}

// Calls |f| with a compile-time block dimension: the smallest of WARP_SIZE
// times 1, 2, 4, 8 or 16 that is not less than |threads|, or WARP_SIZE * 32
// when |threads| exceeds all of them.
template <typename F>
void dispatch_block_dim(int threads, F&& f) {
  if (threads > WARP_SIZE * 16) {
    f(std::integral_constant<int, WARP_SIZE * 32>{});
    return;
  }
  if (threads > WARP_SIZE * 8) {
    f(std::integral_constant<int, WARP_SIZE * 16>{});
    return;
  }
  if (threads > WARP_SIZE * 4) {
    f(std::integral_constant<int, WARP_SIZE * 8>{});
    return;
  }
  if (threads > WARP_SIZE * 2) {
    f(std::integral_constant<int, WARP_SIZE * 4>{});
    return;
  }
  if (threads > WARP_SIZE) {
    f(std::integral_constant<int, WARP_SIZE * 2>{});
    return;
  }
  f(std::integral_constant<int, WARP_SIZE>{});
}

// Maps CPU types to CUDA types.
//
// The primary template is the identity mapping; the specializations below
// replace the host-side half/bfloat16/complex types with the types used in
// device code.
template <typename T>
struct CTypeToCudaType {
  using type = T;
};

// float16_t -> __half.
template <>
struct CTypeToCudaType<float16_t> {
  using type = __half;
};

// bfloat16_t -> __nv_bfloat16.
template <>
struct CTypeToCudaType<bfloat16_t> {
  using type = __nv_bfloat16;
};

// complex64_t -> cu::complex64_t.
template <>
struct CTypeToCudaType<complex64_t> {
  using type = cu::complex64_t;
};

// Convenience alias: cuda_type_t<T> is CTypeToCudaType<T>::type.
template <typename T>
using cuda_type_t = typename CTypeToCudaType<T>::type;

// Type traits for detecting floating numbers.
// True for float, double and for both the host (float16_t, bfloat16_t) and
// CUDA (__half, __nv_bfloat16) spellings of the 16-bit types.
template <typename T>
inline constexpr bool is_floating_v =
    cuda::std::is_same_v<T, float> || cuda::std::is_same_v<T, double> ||
    cuda::std::is_same_v<T, float16_t> || cuda::std::is_same_v<T, bfloat16_t> ||
    cuda::std::is_same_v<T, __half> || cuda::std::is_same_v<T, __nv_bfloat16>;

// Type traits for detecting complex numbers.
// Only the exact types complex64_t and complex128_t are matched.
template <typename T>
inline constexpr bool is_complex_v = cuda::std::is_same_v<T, complex64_t> ||
    cuda::std::is_same_v<T, complex128_t>;

// Type traits for detecting complex or real floating point numbers.
template <typename T>
inline constexpr bool is_inexact_v = is_floating_v<T> || is_complex_v<T>;

// Utility to copy data from vector to array in host.
//
// Copies the elements of |vec| into the front of a fixed-size array of NDIM
// entries so it can be passed by value (e.g. as a kernel parameter).
//
// Throws std::runtime_error when |vec| holds more than NDIM elements.
// Entries past vec.size() are zero: the array is value-initialized so that
// the returned object never carries indeterminate values.
template <int NDIM = MAX_NDIM, typename T = int32_t>
inline cuda::std::array<T, NDIM> const_param(const SmallVector<T>& vec) {
  if (vec.size() > NDIM) {
    throw std::runtime_error(
        fmt::format("ndim can not be larger than {}.", NDIM));
  }
  // Value-initialize so the unused tail is deterministic instead of garbage.
  cuda::std::array<T, NDIM> result{};
  std::copy_n(vec.begin(), vec.size(), result.begin());
  return result;
}

// Compute the grid and block dimensions, check backend/common/utils.h for docs.
// These are declarations only; the definitions live in a separate translation
// unit.
dim3 get_block_dims(int dim0, int dim1, int dim2, int pow2 = 10);
dim3 get_2d_grid_dims(const Shape& shape, const Strides& strides);
// Overload taking an extra |divisor| (get_launch_args passes its
// work_per_thread here).
dim3 get_2d_grid_dims(
    const Shape& shape,
    const Strides& strides,
    size_t divisor);
std::pair<dim3, dim3> get_grid_and_block(int dim0, int dim1, int dim2);

// Get the num_blocks and block_dims assuming each thread handles
// |work_per_thread| elements of |arr|.
// Returns (num_blocks, block_dim); |max_block_dim| is an upper bound on the
// returned block_dim.
std::tuple<dim3, uint32_t> get_launch_args(
    size_t size,
    const Shape& shape,
    const Strides& strides,
    bool large,
    int work_per_thread = 1,
    uint32_t max_block_dim = 1024);

// Convenience overload: forwards the size, shape and strides of |arr| to the
// general get_launch_args declared above.
inline std::tuple<dim3, uint32_t> get_launch_args(
    const array& arr,
    bool large,
    int work_per_thread = 1,
    uint32_t max_block_dim = 1024) {
  const auto n_elements = arr.size();
  const auto& dims = arr.shape();
  const auto& steps = arr.strides();
  return get_launch_args(
      n_elements, dims, steps, large, work_per_thread, max_block_dim);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/layer_norm.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/reduce/reduce.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/fast_primitives.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Component-wise sum of two float3 values; used as the reduction operator for
// float3 accumulators.
inline __device__ float3 plus_f3(const float3& a, const float3& b) {
  float3 total;
  total.x = a.x + b.x;
  total.y = a.y + b.y;
  total.z = a.z + b.z;
  return total;
}

// Similar to cub::BlockReduce, but result is broadcasted to every thread.
//
// The reduction runs in two stages: each warp reduces its own values and
// publishes the partial result to |temp|, then every warp re-reduces the
// per-warp partials so that all threads end up with the same value.
template <typename T, int BLOCK_DIM>
struct BlockBroadcastReduce {
  // The second stage is done inside a single warp, so there may be at most
  // WARP_SIZE per-warp partials, i.e. at most WARP_SIZE * WARP_SIZE threads.
  static_assert(WARP_SIZE <= BLOCK_DIM && BLOCK_DIM <= WARP_SIZE * WARP_SIZE);
  static_assert(BLOCK_DIM % WARP_SIZE == 0);
  // One slot per warp of the block.
  using TempStorage = T[BLOCK_DIM / WARP_SIZE];

  cg::thread_block& block;
  TempStorage& temp;

  // Reduces |input| across the block with |op|. |init_value| is used by lanes
  // that have no per-warp partial to read in the second stage, so it should
  // be the identity of |op|.
  //
  // NOTE(review): there is no barrier after |temp| is read back, so
  // back-to-back calls that share the same |temp| appear to rely on no warp
  // reaching the next write before the others finished reading -- confirm.
  template <typename Op>
  __device__ T Reduce(const T& input, const Op& op, const T& init_value) {
    auto warp = cg::tiled_partition<WARP_SIZE>(block);
    // Stage 1: reduce within the warp, lane 0 publishes the partial.
    T x = cg::reduce(warp, input, op);
    if (warp.thread_rank() == 0) {
      temp[warp.meta_group_rank()] = x;
    }
    // Make all per-warp partials visible before they are read.
    block.sync();
    // Stage 2: every warp loads the partials (one per lane) and reduces them.
    x = warp.thread_rank() < warp.meta_group_size() ? temp[warp.thread_rank()]
                                                    : init_value;
    return cg::reduce(warp, x, op);
  }

  // Block-wide sum; T{} is used as the additive identity.
  __device__ T Sum(const T& input) {
    return Reduce(input, cg::plus<T>{}, T{});
  }
};

// Layer normalization over the last axis. One thread block handles one row of
// |axis_size| elements (rows are addressed by the block rank).
//
// For every row: out = w * ((x - mean) * rsqrt(var + eps)) + b, where mean and
// var are accumulated in float.
//
// x, out: row data, advanced by block_rank * axis_size.
// w, b:   scale and bias, read with strides |w_stride| / |b_stride|.
// Each thread works on chunks of N_READS elements; |index| below is a chunk
// index (the tail branch reads elements starting at index * N_READS).
// NOTE(review): load_vector / store_vector are defined elsewhere; they are
// presumably bounds-aware when given |axis_size| and a fill value -- confirm.
template <typename T, int BLOCK_DIM, int N_READS = 4>
__global__ void layer_norm(
    const T* x,
    const T* w,
    const T* b,
    T* out,
    float eps,
    int32_t axis_size,
    int64_t w_stride,
    int64_t b_stride) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  using BlockReduceT = BlockBroadcastReduce<float, BLOCK_DIM>;
  __shared__ typename BlockReduceT::TempStorage temp;

  // Move to the row owned by this block.
  x += grid.block_rank() * axis_size;
  out += grid.block_rank() * axis_size;

  // Sum.
  // Out-of-range elements are loaded as T(0) and so do not affect the sum.
  float sum = 0;
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      sum += static_cast<float>(xn[i]);
    }
  }
  sum = BlockReduceT{block, temp}.Sum(sum);

  // Mean.
  float mean = sum / axis_size;

  // Normalizer.
  // Sum of squared deviations. Padding cannot be used here because a padded
  // zero would contribute mean^2, so the ragged tail is handled element by
  // element.
  float normalizer = 0;
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    if ((index + 1) * N_READS <= axis_size) {
      // Whole chunk is in range.
      auto xn = load_vector<N_READS>(x, index);
#pragma unroll
      for (int i = 0; i < N_READS; ++i) {
        float t = static_cast<float>(xn[i]) - mean;
        normalizer += t * t;
      }
    } else {
      // Partial (or empty) chunk: read the remaining elements directly.
      for (int i = index * N_READS; i < axis_size; ++i) {
        float t = static_cast<float>(x[i]) - mean;
        normalizer += t * t;
      }
    }
  }
  normalizer = BlockReduceT{block, temp}.Sum(normalizer);
  // 1 / sqrt(variance + eps).
  normalizer = rsqrt(normalizer / axis_size + eps);

  // Outputs.
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
    auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
    auto bn = load_vector<N_READS>(b, index, axis_size, b_stride, T(0));
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      // Normalize in float, then apply scale and bias in T.
      float norm = (static_cast<float>(xn[i]) - mean) * normalizer;
      xn[i] = wn[i] * static_cast<T>(norm) + bn[i];
    }
    store_vector<N_READS>(out, index, xn, axis_size);
  }
}

// Backward pass of layer normalization over the last axis. One thread block
// handles one row of |axis_size| elements (rows are addressed by block rank).
//
// x:  forward input row, g: incoming cotangent row.
// w:  scale, read with stride |w_stride|.
// gx: gradient with respect to x.
// gw: per-row (unreduced) gradient with respect to w; only written when HAS_W.
//
// Each thread works on chunks of N_READS elements; |index| is a chunk index,
// so lane k of a loaded vector corresponds to element index * N_READS + k.
// NOTE(review): load_vector / store_vector are defined elsewhere; they are
// presumably bounds-aware when given |axis_size| and a fill value -- confirm.
template <typename T, bool HAS_W, int BLOCK_DIM, int N_READS = 4>
__global__ void layer_norm_vjp(
    const T* x,
    const T* w,
    const T* g,
    T* gx,
    T* gw,
    float eps,
    int32_t axis_size,
    int64_t w_stride) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  using BlockReduceF = BlockBroadcastReduce<float, BLOCK_DIM>;
  using BlockReduceF3 = BlockBroadcastReduce<float3, BLOCK_DIM>;
  // The two reductions are never in flight at the same time, so they share
  // storage.
  __shared__ union {
    typename BlockReduceF::TempStorage f;
    typename BlockReduceF3::TempStorage f3;
  } temp;

  // Move to the row owned by this block.
  x += grid.block_rank() * axis_size;
  g += grid.block_rank() * axis_size;
  gx += grid.block_rank() * axis_size;
  gw += grid.block_rank() * axis_size;

  // Sum.
  // Out-of-range elements are loaded as T(0) and so do not affect the sum.
  float sum = 0;
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      sum += static_cast<float>(xn[i]);
    }
  }
  sum = BlockReduceF{block, temp.f}.Sum(sum);

  // Mean.
  float mean = sum / axis_size;

  // Normalizer.
  // Accumulates {sum(w*g), sum(w*g*(x-mean)), sum((x-mean)^2)} in one pass.
  float3 factors = {};
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto gn = load_vector<N_READS>(g, index, axis_size, T(0));
    auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));

    if ((index + 1) * N_READS <= axis_size) {
      // Whole chunk is in range.
      auto xn = load_vector<N_READS>(x, index);
#pragma unroll
      for (int i = 0; i < N_READS; ++i) {
        float t = static_cast<float>(xn[i]) - mean;
        float wi = wn[i];
        float gi = gn[i];
        float wg = wi * gi;
        factors = plus_f3(factors, {wg, wg * t, t * t});
      }
    } else {
      // Partial (or empty) chunk: x is read element by element because a
      // padded zero would contribute mean^2 to the variance term. |i| is a
      // global element index, while wn/gn only hold this thread's N_READS
      // lanes, so they must be indexed relative to the chunk start.
      const int base = index * N_READS;
      for (int i = base; i < axis_size; ++i) {
        const int lane = i - base;
        float t = static_cast<float>(x[i]) - mean;
        float wi = wn[lane];
        float gi = gn[lane];
        float wg = wi * gi;
        factors = plus_f3(factors, {wg, wg * t, t * t});
      }
    }
  }
  factors = BlockReduceF3{block, temp.f3}.Reduce(factors, plus_f3, {});
  float meanwg = factors.x / axis_size;
  float meanwgxc = factors.y / axis_size;
  // normalizer2 = 1 / (variance + eps), normalizer = 1 / sqrt(variance + eps).
  float normalizer2 = 1 / (factors.z / axis_size + eps);
  float normalizer = sqrt(normalizer2);

  // Outputs.
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
    auto gn = load_vector<N_READS>(g, index, axis_size, T(0));
    auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));

    for (int i = 0; i < N_READS; i++) {
      // xi is the normalized input.
      float xi = (static_cast<float>(xn[i]) - mean) * normalizer;
      float wi = wn[i];
      float gi = gn[i];
      // xn and wn are reused as output buffers for gx and gw.
      xn[i] = normalizer * (wi * gi - meanwg) - xi * meanwgxc * normalizer2;
      if constexpr (HAS_W) {
        wn[i] = gi * xi;
      }
    }
    store_vector<N_READS>(gx, index, xn, axis_size);
    if constexpr (HAS_W) {
      store_vector<N_READS>(gw, index, wn, axis_size);
    }
  }
}

} // namespace cu

namespace fast {

// The fused implementation is only used off the CPU: report that the fallback
// is needed exactly when the stream's device is the CPU.
bool LayerNorm::use_fallback(Stream s) {
  const bool on_cpu = (s.device == Device::cpu);
  return on_cpu;
}

// TODO: This duplicates code in backend/metal/normalization.cpp
//
// Evaluates layer normalization: inputs are {x, w, b}, outputs[0] receives the
// normalized result. One kernel block is launched per row of the last axis.
void LayerNorm::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("LayerNorm::eval_gpu");
  auto& s = stream();
  auto& out = outputs[0];
  auto& encoder = cu::get_command_encoder(s);

  // Make sure that the last dimension is contiguous.
  // Returns the array the kernel should read from and, as a side effect,
  // gives |out| its storage.
  auto set_output = [&s, &out, &encoder](const array& x) {
    bool no_copy = x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (no_copy && x.ndim() > 1) {
      // The second-to-last stride must be either a broadcast (0) or exactly
      // one row. Note that this local |s| shadows the captured stream only
      // inside this block.
      auto s = x.strides()[x.ndim() - 2];
      no_copy &= (s == 0 || s == x.shape().back());
    }
    if (no_copy) {
      if (x.is_donatable()) {
        // Write the result in place over the input buffer.
        out.copy_shared_buffer(x);
      } else {
        // Fresh buffer with the same layout (size, strides, flags) as x.
        out.set_data(
            cu::malloc_async(x.data_size() * x.itemsize(), encoder),
            x.data_size(),
            x.strides(),
            x.flags());
      }
      return x;
    } else {
      // Make a contiguous copy and let the output reuse its buffer.
      array x_copy = contiguous_copy_gpu(x, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
  };

  const array x = set_output(inputs[0]);
  const array& w = inputs[1];
  const array& b = inputs[2];

  int32_t axis_size = x.shape().back();
  int32_t n_rows = x.data_size() / axis_size;
  // A stride of 0 is used whenever w / b are not 1-D, so every element reads
  // the same value.
  int64_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;
  int64_t b_stride = (b.ndim() == 1) ? b.strides()[0] : 0;

  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(b);
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "layernorm", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    // Each thread handles 16 bytes worth of elements at a time.
    constexpr int N_READS = 16 / sizeof(DataType);
    // Pick a block size large enough to cover the row in N_READS-sized chunks
    // (capped by dispatch_block_dim).
    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
      auto kernel = cu::layer_norm<DataType, block_dim(), N_READS>;
      // n_rows and block_dim() are presumably the grid and block dimensions
      // (add_kernel_node is defined elsewhere -- confirm).
      encoder.add_kernel_node(
          kernel,
          n_rows,
          block_dim(),
          gpu_ptr<DataType>(x),
          gpu_ptr<DataType>(w),
          gpu_ptr<DataType>(b),
          gpu_ptr<DataType>(out),
          eps_,
          axis_size,
          w_stride,
          b_stride);
    });
  });
}

// Evaluates the backward pass of layer normalization.
// inputs:  {x, w, b, g} where g is the incoming cotangent.
// outputs: {gx, gw, gb}, the gradients with respect to x, w and b.
// The kernel produces gx and a per-row gradient for w; gw and gb are then
// obtained with column reductions over the rows.
void LayerNormVJP::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("LayerNormVJP::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
  // |copied| reports whether a contiguous copy had to be made.
  auto check_input = [&s](const array& x, bool& copied) {
    if (x.flags().row_contiguous) {
      copied = false;
      return x;
    }
    copied = true;
    return contiguous_copy_gpu(x, s);
  };
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[3].is_donatable();
  bool copied;
  auto x = check_input(inputs[0], copied);
  // A freshly made copy is private to this function, so it can always be
  // reused as an output buffer.
  donate_x |= copied;
  const array& w = inputs[1];
  const array& b = inputs[2];
  bool g_copied;
  auto g = check_input(inputs[3], g_copied);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
  array& gb = outputs[2];

  // Check whether we had a weight.
  bool has_w = w.ndim() != 0;

  // Allocate space for the outputs.
  // Preference order for gx: reuse x, else reuse g, else allocate.
  bool g_in_gx = false;
  if (donate_x) {
    gx.copy_shared_buffer(x);
  } else if (donate_g) {
    gx.copy_shared_buffer(g);
    g_in_gx = true;
  } else {
    gx.set_data(cu::malloc_async(gx.nbytes(), encoder));
  }
  // Register the copy of g as a temporary unless its buffer now backs gx.
  if (g_copied && !g_in_gx) {
    encoder.add_temporary(g);
  }

  int32_t axis_size = x.shape().back();
  int32_t n_rows = x.data_size() / axis_size;
  // A stride of 0 is used whenever w is not 1-D.
  int64_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;

  // Allocate a temporary to store the gradients for w and allocate the output
  // gradient accumulators.
  // Without a weight, gw_temp is just a placeholder referring to w.
  array gw_temp =
      (has_w) ? array({n_rows, x.shape().back()}, gw.dtype(), nullptr, {}) : w;
  bool g_in_gw = false;
  if (has_w) {
    if (!g_in_gx && donate_g) {
      // g's buffer is still free to reuse, so store the per-row gw in it.
      g_in_gw = true;
      gw_temp.copy_shared_buffer(g);
    } else {
      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder));
      encoder.add_temporary(gw_temp);
    }
  }

  // The gradient for b in case we had a b.
  // gb is the sum of g over the rows; this is issued before the kernel below,
  // which may overwrite g's buffer.
  bool has_gb = (gb.ndim() == 1 && gb.size() == axis_size);
  if (has_gb) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    col_reduce(encoder, g, gb, Reduce::ReduceType::Sum, {0}, plan);
  }

  // Insert dependency if `g` was donated
  // (the kernel writes into g's buffer, so it has to wait for the gb
  // reduction that reads g).
  if ((g_in_gx || g_in_gw) && has_gb) {
    encoder.set_input_array(gb);
  }
  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
  dispatch_float_types(gx.dtype(), "layernorm_vjp", [&](auto type_tag) {
    dispatch_bool(has_w, [&](auto has_w_constant) {
      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      // Each thread handles 16 bytes worth of elements at a time.
      constexpr int N_READS = 16 / sizeof(DataType);
      dispatch_block_dim(
          cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
            auto kernel = cu::layer_norm_vjp<
                DataType,
                has_w_constant.value,
                block_dim(),
                N_READS>;
            // n_rows and block_dim() are presumably the grid and block
            // dimensions (add_kernel_node is defined elsewhere -- confirm).
            encoder.add_kernel_node(
                kernel,
                n_rows,
                block_dim(),
                gpu_ptr<DataType>(x),
                gpu_ptr<DataType>(w),
                gpu_ptr<DataType>(g),
                gpu_ptr<DataType>(gx),
                gpu_ptr<DataType>(gw_temp),
                eps_,
                axis_size,
                w_stride);
          });
    });
  });

  // Reduce the per-row weight gradients over the rows to get gw.
  if (has_w) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    col_reduce(encoder, gw_temp, gw, Reduce::ReduceType::Sum, {0}, plan);
  }
}

} // namespace fast

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/load.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <algorithm>
#include <cstdlib>
#include <memory>
#include <stdexcept>
#include <utility>

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/utils.h"
#include "mlx/primitives.h"

namespace {

// Reverses the byte order of each of the |N| consecutive scalars of
// |scalar_size| bytes stored at |data_bytes|, in place.
template <const uint8_t scalar_size>
void swap_endianness(uint8_t* data_bytes, size_t N) {
  uint8_t* scalar = data_bytes;
  for (size_t remaining = N; remaining > 0; --remaining) {
    std::reverse(scalar, scalar + scalar_size);
    scalar += scalar_size;
  }
}

} // namespace

namespace mlx::core {

// Reads the array contents from |reader_| into a host staging buffer,
// optionally swaps the byte order of every element, and schedules an
// asynchronous copy of the staging buffer into the output's device memory.
//
// The staging buffer is owned by a guard until its release has been enqueued
// on the stream, so it is not leaked if reading or one of the CUDA calls
// throws.
//
// Throws std::runtime_error if the staging buffer cannot be allocated.
void Load::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& encoder = cu::get_command_encoder(stream());
  auto size = out.size();
  auto nbytes = size * out.itemsize();
  out.set_data(cu::malloc_async(nbytes, encoder));

  std::unique_ptr<void, decltype(&free)> staging(malloc(nbytes), &free);
  // malloc(0) may legitimately return a null pointer.
  if (!staging && nbytes > 0) {
    throw std::runtime_error(
        "[Load::eval_gpu] Unable to allocate host staging buffer.");
  }
  auto out_ptr = staging.get();

  reader_->read(static_cast<char*>(out_ptr), nbytes, offset_);
  if (swap_endianness_) {
    // Only multi-byte element sizes need swapping; other sizes are left
    // untouched.
    switch (out.itemsize()) {
      case 2:
        swap_endianness<2>(reinterpret_cast<uint8_t*>(out_ptr), size);
        break;
      case 4:
        swap_endianness<4>(reinterpret_cast<uint8_t*>(out_ptr), size);
        break;
      case 8:
        swap_endianness<8>(reinterpret_cast<uint8_t*>(out_ptr), size);
        break;
    }
  }
  CHECK_CUDA_ERROR(cudaMemcpyAsync(
      gpu_ptr<void>(out),
      out_ptr,
      nbytes,
      cudaMemcpyDefault,
      encoder.stream()));
  // Free the staging buffer from the stream once the copy ahead of it is done.
  CHECK_CUDA_ERROR(cudaLaunchHostFunc(encoder.stream(), free, out_ptr));
  // The stream now owns the buffer; the guard must not free it as well.
  staging.release();
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/logsumexp.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/cast_op.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <nvtx3/nvtx3.hpp>
#include <cub/block/block_load.cuh>

#include <cassert>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Exponential used by the softmax-style accumulation below.
template <typename T>
inline __device__ T softmax_exp(T x) {
  // The fast intrinsic is sufficient: a high precision exponential is not
  // needed because x is going to be in (-oo, 0] anyway and the result is
  // subsequently divided by sum(exp(x_i)).
  const T fast_exp = __expf(x);
  return fast_exp;
}

// Computes log(sum(exp(x))) over the last axis. One thread block handles one
// row of |axis_size| elements and writes a single value to
// out[block rank].
//
// A running maximum and a running sum of exp(x - max) are kept per thread
// (accumulated in AccT) and then combined across the warp and across the
// block, rescaling the sum every time the maximum changes.
// Each thread works on chunks of N_READS elements.
template <typename T, typename AccT, int BLOCK_DIM, int N_READS = 4>
__global__ void logsumexp(const T* in, T* out, int axis_size) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  // Move to the row owned by this block.
  in += grid.block_rank() * axis_size;

  cg::greater<AccT> max_op;
  cg::plus<AccT> plus_op;

  // Thread reduce.
  AccT prevmax;
  AccT maxval = Limits<AccT>::finite_min();
  AccT normalizer = 0;
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    // NOTE(review): Limits<T>::min() is presumably a value that can neither
    // raise the maximum nor add to the sum (e.g. -inf) -- confirm.
    auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
    prevmax = maxval;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      maxval = max_op(maxval, static_cast<AccT>(vals[i]));
    }
    // Online normalizer calculation for softmax:
    // https://github.com/NVIDIA/online-softmax
    // Rescale the sum accumulated so far to the new maximum, then add the new
    // terms.
    normalizer = normalizer * softmax_exp(prevmax - maxval);
    for (int i = 0; i < N_READS; i++) {
      normalizer =
          normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
    }
  }

  // First warp reduce.
  prevmax = maxval;
  maxval = cg::reduce(warp, maxval, max_op);
  normalizer = normalizer * softmax_exp(prevmax - maxval);
  normalizer = cg::reduce(warp, normalizer, plus_op);

  // One slot per warp of the block.
  __shared__ AccT local_max[WARP_SIZE];
  __shared__ AccT local_normalizer[WARP_SIZE];

  // Write to shared memory and do second warp reduce.
  prevmax = maxval;
  if (warp.thread_rank() == 0) {
    local_max[warp.meta_group_rank()] = maxval;
  }
  block.sync();
  // Lanes without a corresponding warp contribute the identity of max.
  maxval = warp.thread_rank() < warp.meta_group_size()
      ? local_max[warp.thread_rank()]
      : Limits<AccT>::finite_min();
  maxval = cg::reduce(warp, maxval, max_op);
  // Rescale this warp's sum to the block-wide maximum before sharing it.
  normalizer = normalizer * softmax_exp(prevmax - maxval);
  if (warp.thread_rank() == 0) {
    local_normalizer[warp.meta_group_rank()] = normalizer;
  }
  block.sync();
  normalizer = warp.thread_rank() < warp.meta_group_size()
      ? local_normalizer[warp.thread_rank()]
      : AccT{};
  normalizer = cg::reduce(warp, normalizer, plus_op);

  // Write output.
  // An infinite maximum is returned as is, skipping log(normalizer) + maxval.
  if (block.thread_rank() == 0) {
    out[grid.block_rank()] = isinf(maxval) ? maxval : log(normalizer) + maxval;
  }
}

} // namespace cu

// Evaluates logsumexp over the last axis of inputs[0] into |out|, launching
// one kernel block per row.
void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("LogSumExp::eval_gpu");
  assert(inputs.size() == 1);
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Make sure that the last dimension is contiguous.
  // Otherwise a contiguous copy is made and registered as a temporary.
  auto ensure_contiguous = [&s, &encoder](const array& x) {
    if (x.flags().contiguous && x.strides()[x.ndim() - 1] == 1) {
      return x;
    } else {
      array x_copy = contiguous_copy_gpu(x, s);
      encoder.add_temporary(x_copy);
      return x_copy;
    }
  };

  auto in = ensure_contiguous(inputs[0]);
  if (in.flags().row_contiguous) {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
  } else {
    // The input is contiguous but not row contiguous: give the output the
    // same layout as the input with the reduced axis collapsed, i.e. the
    // input strides divided by the axis length.
    auto n = in.shape(-1);
    auto flags = in.flags();
    auto strides = in.strides();
    for (auto& s : strides) {
      s /= n;
    }
    // Recompute column contiguity for the scaled strides.
    bool col_contig = strides[0] == 1;
    for (int i = 1; col_contig && i < strides.size(); ++i) {
      col_contig &=
          (out.shape(i) == 1 || strides[i - 1] == out.shape(i) * strides[i]);
    }
    flags.col_contiguous = col_contig;
    out.set_data(
        cu::malloc_async(in.nbytes() / n, encoder),
        in.data_size() / n,
        std::move(strides),
        flags);
  }

  int axis_size = in.shape().back();
  int n_rows = in.data_size() / axis_size;

  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "logsumexp", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    // Each thread handles 16 bytes worth of elements at a time.
    constexpr int N_READS = 16 / sizeof(DataType);
    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
      // Accumulation is always done in float.
      auto kernel = cu::logsumexp<DataType, float, block_dim(), N_READS>;
      // n_rows and block_dim() are presumably the grid and block dimensions
      // (add_kernel_node is defined elsewhere -- confirm).
      encoder.add_kernel_node(
          kernel,
          n_rows,
          block_dim(),
          gpu_ptr<DataType>(in),
          gpu_ptr<DataType>(out),
          axis_size);
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/lru_cache.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/utils.h"

#include <cstring>
#include <list>
#include <unordered_map>
#include <utility>

#include <fmt/format.h>

namespace mlx::core {

// A least-recently-used cache.
//
// Entries live in a list ordered from most to least recently used, while a
// map from key to list iterator provides the lookup. Looking an entry up or
// inserting it moves it to the front; when the capacity is exceeded entries
// are dropped from the back.
template <
    typename K,
    typename V,
    template <typename...> typename M = std::unordered_map>
class LRUCache {
 public:
  using value_type = std::pair<K, V>;
  using list_type = std::list<value_type>;
  using iterator = typename list_type::iterator;
  using const_iterator = typename list_type::const_iterator;
  using map_type = M<K, iterator>;

  // Creates a cache holding at most |capacity| entries; throws if it is 0.
  explicit LRUCache(size_t capacity) : capacity_(capacity) {
    if (capacity == 0) {
      throw std::runtime_error("LRUCache requires capacity > 0.");
    }
  }

  // Initialize with capacity read from |env_name|.
  // Remembering |env_name| is what turns the thrashing check in emplace() on;
  // it is skipped when MLX_ENABLE_CACHE_THRASHING_CHECK is set to 0.
  LRUCache(const char* env_name, int default_capacity)
      : LRUCache(env::get_var(env_name, default_capacity)) {
    const bool check_thrashing =
        env::get_var("MLX_ENABLE_CACHE_THRASHING_CHECK", 1);
    if (check_thrashing) {
      env_name_ = env_name;
    }
  }

  // Number of cached entries.
  size_t size() const {
    return map_.size();
  }
  // Maximum number of entries kept.
  size_t capacity() const {
    return capacity_;
  }
  bool empty() const {
    return vlist_.empty();
  }

  // Changes the capacity, evicting the least recently used entries if the
  // cache is now over capacity.
  void resize(size_t new_capacity) {
    capacity_ = new_capacity;
    trim();
  }

  // Iteration goes from the most to the least recently used entry.
  iterator begin() {
    return vlist_.begin();
  }
  const_iterator begin() const {
    return vlist_.begin();
  }
  iterator end() {
    return vlist_.end();
  }
  const_iterator end() const {
    return vlist_.end();
  }

  // Drops all entries.
  void clear() {
    map_.clear();
    vlist_.clear();
  }

  // Looks |key| up and marks it as most recently used. Returns end() on a
  // miss.
  iterator find(const K& key) {
    auto hit = map_.find(key);
    if (hit != map_.end()) {
      vlist_.splice(vlist_.begin(), vlist_, hit->second);
      return hit->second;
    }
    return end();
  }

  // Inserts |value| under |key| unless the key is already cached. Either way
  // the entry becomes the most recently used one. Returns the entry and
  // whether an insertion took place.
  template <typename U>
  std::pair<iterator, bool> emplace(const K& key, U&& value) {
    if (auto hit = map_.find(key); hit != map_.end()) {
      vlist_.splice(vlist_.begin(), vlist_, hit->second);
      return {hit->second, false};
    }

    // Every insertion counts as a miss; too many of them relative to the
    // capacity is reported as thrashing.
    if (env_name_) {
      ++cache_misses_;
      if (cache_misses_ > 2 * capacity_) {
        throw std::runtime_error(
            fmt::format(
                "Cache thrashing is happening, please set the environment variable "
                "{} to a larger value than {} to fix degraded performance.",
                env_name_,
                capacity_));
      }
    }

    vlist_.emplace_front(key, std::forward<U>(value));
    map_[key] = vlist_.begin();

    trim();

    return {vlist_.begin(), true};
  }

  // Removes the entry at |pos| and returns the iterator following it.
  iterator erase(iterator pos) {
    map_.erase(pos->first);
    return vlist_.erase(pos);
  }

  // Returns the value for |key|, inserting a default constructed one first if
  // the key is not cached.
  V& operator[](const K& key) {
    if (auto hit = find(key); hit != end()) {
      return hit->second;
    }
    return emplace(key, V{}).first->second;
  }

 private:
  // Evicts from the back of the list until the cache fits its capacity.
  void trim() {
    while (map_.size() > capacity_) {
      map_.erase(vlist_.back().first);
      vlist_.pop_back();
    }
  }

  // Non-null only when the thrashing check is enabled.
  const char* env_name_{nullptr};
  size_t cache_misses_{0};

  list_type vlist_;
  map_type map_;
  size_t capacity_;
};

// Turn a POD struct into a container key by doing bytes compare.
//
// IMPORTANT: Do not use aggregate init on the pod field (key.pod = {...}).
// It creates a stack temporary whose padding bytes are uninitialized, and
// trivial copy-assignment copies the entire struct including padding —
// breaking the memcmp-based comparison. Set fields individually instead.
//
// Usage:
//   BytesKey<MyKey> key;
//   key.pod.field1 = value1;
//   key.pod.field2 = value2;
// Wraps a POD struct |T| so that it is compared (and can be hashed) by its raw
// bytes. All copies go through memcpy so that the zeroed padding established
// by the default constructor is carried along.
template <typename T>
struct BytesKey {
  T pod;
  static_assert(std::is_standard_layout_v<T>, "T is not POD");

  BytesKey() {
    // Make sure the paddings between members are filled with 0.
    memset(&pod, 0, sizeof(T));
  }

  BytesKey(const BytesKey& other) {
    memcpy(&pod, &other.pod, sizeof(T));
  }

  BytesKey(BytesKey&& other) noexcept {
    memcpy(&pod, &other.pod, sizeof(T));
  }

  // Declaring the move constructor suppresses the implicit copy assignment,
  // so assignment is provided explicitly, again as a byte-wise copy.
  BytesKey& operator=(const BytesKey& other) {
    if (this != &other) {
      memcpy(&pod, &other.pod, sizeof(T));
    }
    return *this;
  }

  BytesKey& operator=(BytesKey&& other) noexcept {
    if (this != &other) {
      memcpy(&pod, &other.pod, sizeof(T));
    }
    return *this;
  }

  // Two keys are equal when all sizeof(T) bytes match, padding included.
  bool operator==(const BytesKey& other) const {
    auto* ptr1 = reinterpret_cast<const uint8_t*>(&pod);
    auto* ptr2 = reinterpret_cast<const uint8_t*>(&other.pod);
    return memcmp(ptr1, ptr2, sizeof(T)) == 0;
  }
};

// Hashes a value by its raw bytes: a 32-bit hash is folded over all sizeof(T)
// bytes of the object, starting from a fixed seed and, for each byte, xor-ing
// the byte in and multiplying by a fixed constant.
template <typename T>
struct BytesHash {
  static_assert(std::is_standard_layout_v<T>, "T is not POD");

  size_t operator()(const T& pod) const {
    constexpr uint32_t kSeed = 0x811C9DC5;
    constexpr uint32_t kMultiplier = 0x01000193;

    const auto* cursor = reinterpret_cast<const uint8_t*>(&pod);
    const auto* const stop = cursor + sizeof(T);

    uint32_t digest = kSeed;
    while (cursor != stop) {
      digest = (digest ^ *cursor) * kMultiplier;
      ++cursor;
    }
    return digest;
  }
};

// An unordered_map whose keys are hashed by their raw bytes (see BytesHash).
template <typename K, typename V>
using BytesKeyHashMap = std::unordered_map<K, V, BytesHash<K>>;

// An LRU cache keyed by the raw bytes of the POD struct |K|: the key is
// wrapped in BytesKey and looked up through BytesKeyHashMap.
template <typename K, typename V>
using LRUBytesKeyCache = LRUCache<BytesKey<K>, V, BytesKeyHashMap>;

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/matmul.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/matmul.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/gemms/cublas_gemm.h"
#include "mlx/backend/cuda/gemms/gemv.h"
#include "mlx/backend/cuda/gemms/grouped_gemm.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/primitives.h"

#include <nvtx3/nvtx3.hpp>
#include <numeric>

namespace mlx::core {

namespace {

// Inspects the two innermost strides of |arr| and returns
// (transposed, leading dimension, array to use):
//  - rows are contiguous:    (false, row stride, arr)
//  - columns are contiguous: (true, column stride, arr)
//  - anything else:          a contiguous copy is made and registered as a
//                            temporary on |enc|; (false, arr.shape(-1), copy)
std::tuple<bool, int64_t, array>
check_transpose(cu::CommandEncoder& enc, const Stream& s, const array& arr) {
  const auto& strides = arr.strides();
  const auto row_stride = strides[arr.ndim() - 2];
  const auto col_stride = strides[arr.ndim() - 1];

  const bool row_major = col_stride == 1 && row_stride == arr.shape(-1);
  if (row_major) {
    return std::make_tuple(false, row_stride, arr);
  }

  const bool col_major = row_stride == 1 && col_stride == arr.shape(-2);
  if (col_major) {
    return std::make_tuple(true, col_stride, arr);
  }

  array dense = contiguous_copy_gpu(arr, s);
  enc.add_temporary(dense);
  return std::make_tuple(false, arr.shape(-1), dense);
}

// Returns (transposed, leading dimension, array to use) for a batched matrix
// operand:
//  - a row contiguous |x| is used as is;
//  - if the batch-stride check below passes, the two innermost dimensions are
//    classified by check_transpose (which may still copy);
//  - otherwise a contiguous copy is made and registered as a temporary.
std::tuple<bool, int64_t, array>
ensure_batch_contiguous(const array& x, cu::CommandEncoder& encoder, Stream s) {
  if (x.flags().row_contiguous) {
    return std::make_tuple(false, x.strides(-2), x);
  }

  // Compute the loop bound in signed arithmetic: with an unsigned ndim() the
  // expression `ndim() - 3` would wrap around for arrays with fewer than
  // three dimensions and the loop would index out of bounds.
  const int batch_pairs = static_cast<int>(x.ndim()) - 3;

  // NOTE(review): the check multiplies by shape(i); the usual contiguity
  // relation is strides(i) == strides(i + 1) * shape(i + 1). Kept unchanged
  // here -- confirm which one is intended.
  bool rc = true;
  for (int i = 0; i < batch_pairs; i++) {
    rc &= (x.strides(i + 1) * x.shape(i)) == x.strides(i);
  }
  if (rc) {
    return check_transpose(encoder, s, x);
  }

  array x_copy = contiguous_copy_gpu(x, s);
  encoder.add_temporary(x_copy);
  return std::make_tuple(false, x_copy.strides(-2), x_copy);
}

// Returns |x| itself when it is already row contiguous; otherwise returns a
// contiguous copy that has been registered as a temporary on |encoder|.
array ensure_row_contiguous(
    const array& x,
    cu::CommandEncoder& encoder,
    Stream s) {
  if (x.flags().row_contiguous) {
    return x;
  }
  array dense = contiguous_copy_gpu(x, s);
  encoder.add_temporary(dense);
  return dense;
}

// Computes out = alpha * (a @ b), optionally adding |bias| through the
// cublasLt epilogue.
//
// M, N, K:              matrix dimensions (a is M x K, b is K x N).
// a_transposed / lda,
// b_transposed / ldb:   layout of the two innermost dimensions of a and b.
// bias:                 optional bias; not supported for complex64.
//
// When no bias is given and cu::can_use_gemv allows it, the product is done
// with the gemv kernel instead of cublasLt.
//
// Throws std::runtime_error for a complex64 product with a bias.
void gemm_and_bias(
    cu::CommandEncoder& encoder,
    int M,
    int N,
    int K,
    bool a_transposed,
    int64_t lda,
    bool b_transposed,
    int64_t ldb,
    array& out,
    const array& a,
    const array& b,
    const std::optional<array>& bias = std::nullopt,
    float alpha = 1.0f) {
  // Check and collapse batch dimensions
  auto [batch_shape, a_batch_strides, b_batch_strides] = collapse_batches(a, b);

  // Element counts of a single matrix are computed in 64 bits: the products
  // M * N and M * K can exceed the range of int for large matrices.
  const auto out_matrix_size = static_cast<size_t>(M) * static_cast<size_t>(N);
  auto batch_count = out.size() / out_matrix_size;

  // Collapse batches into M if needed
  // (a single batch dimension over densely packed, non-transposed A matrices
  // with a broadcast B is the same as one taller matrix).
  if (batch_count > 1 && !a_transposed && batch_shape.size() == 1 &&
      a.strides()[a.ndim() - 2] == K &&
      a_batch_strides.back() == static_cast<int64_t>(M) * K &&
      b_batch_strides.back() == 0) {
    M *= batch_shape.back();
    batch_count = 1;

    a_batch_strides = {0};
    b_batch_strides = {0};
    batch_shape = {1};
  }

  // Use gemmv when possible
  if (!bias && cu::can_use_gemv(M, N, K, a_transposed, b_transposed)) {
    cu::gemv(
        a,
        b,
        out,
        M,
        N,
        K,
        batch_count,
        batch_shape,
        a_batch_strides,
        b_batch_strides,
        encoder);
    return;
  }

  // Invoke cublasLt
  CublasGemm gemm(
      encoder.device(),
      a.dtype(),
      a_transposed,
      M,
      K,
      lda,
      b_transposed,
      K,
      N,
      ldb,
      batch_shape.back(),
      a_batch_strides.back(),
      b_batch_strides.back());
  if (bias) {
    if (a.dtype() == complex64) {
      throw std::runtime_error(
          "[gemm_and_bias] complex64 bias epilogue isn’t supported in cublasLtMatmul.");
    }
    gemm.set_bias(encoder, *bias);
  }
  gemm.run(
      encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides, alpha);
}

// Gathered matrix multiplication where |indices_| selects, for every matrix
// of |a_|, which matrix of |b_| it is multiplied with. The work is handed to
// the grouped GEMM with one group per matrix of |b_|.
//
// Throws std::runtime_error when the number of matrices in |a_| differs from
// the number of indices, or when |b_| holds more than 1024 matrices.
void gather_mm_rhs(
    const array& a_,
    const array& b_,
    const array& indices_,
    array& out,
    cu::CommandEncoder& encoder,
    Stream s) {
  // One index is required per left-hand side matrix.
  const bool lhs_matches_indices =
      a_.size() / a_.shape(-2) / a_.shape(-1) == indices_.size();
  if (!lhs_matches_indices) {
    throw std::runtime_error("[gather_mm] Broadcasting lhs is not supported.");
  }

  // Number of right-hand side matrices to choose from.
  int group_count = b_.size() / b_.shape(-1) / b_.shape(-2);
  if (group_count > 1024) {
    throw std::runtime_error(
        "[gather_mm] Group count can not be larger than 1024.");
  }

  // Bring the operands into a layout the kernel accepts (this may copy).
  auto [lhs_transposed, lhs_ld, lhs] = ensure_batch_contiguous(a_, encoder, s);
  auto [rhs_transposed, rhs_ld, rhs] = ensure_batch_contiguous(b_, encoder, s);
  auto dense_indices = ensure_row_contiguous(indices_, encoder, s);

  cutlass_grouped_gemm_unaligned(
      lhs_transposed,
      lhs_ld,
      rhs_transposed,
      rhs_ld,
      group_count,
      lhs,
      rhs,
      dense_indices,
      out,
      encoder);
}

} // namespace

void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Matmul::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  assert(inputs.size() == 2);
  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];
  // Return 0s if either input is empty.
  if (a_pre.size() == 0 || b_pre.size() == 0) {
    array zero(0, a_pre.dtype());
    encoder.add_temporary(zero);
    fill_gpu(zero, out, s);
    return;
  }

  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
  int K = a_pre.shape(-1);

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);

  gemm_and_bias(
      encoder, M, N, K, a_transposed, lda, b_transposed, ldb, out, a, b);
}

// Evaluates out = alpha_ * (a @ b) + beta_ * c on the GPU.
//
// Two paths are used:
//  * when beta_ == 1, the dtype is not complex64 and c is a contiguous vector
//    with out.shape(-1) elements, c is applied through gemm_and_bias;
//  * otherwise a CublasGemm is configured with an explicit C operand. If c is
//    neither row-contiguous in its last two dims nor row-broadcast, it is
//    first copied into out and out is used as C.
void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("AddMM::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  assert(inputs.size() == 3);
  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];
  // Taken by value on purpose: c may be rebound to out below.
  auto c = inputs[2];

  /////////////////////////////////////////////////////////////////////////////
  // Init checks and prep

  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
  int K = a_pre.shape(-1);

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);

  /////////////////////////////////////////////////////////////////////////////
  // Dispatch to GEMM with epilogue or AddMM

  if (beta_ == 1 && a.dtype() != complex64 && c.strides(-1) == 1 &&
      c.data_size() == out.shape(-1)) {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
    gemm_and_bias(
        encoder,
        M,
        N,
        K,
        a_transposed,
        lda,
        b_transposed,
        ldb,
        out,
        a,
        b,
        c,
        alpha_);
    return;
  }

  int64_t ldc;
  {
    auto stx = c.strides()[c.ndim() - 2];
    auto sty = c.strides()[c.ndim() - 1];
    if (sty == 1 && stx == c.shape(-1)) {
      // Rows of c are densely packed.
      ldc = stx;
      out.set_data(cu::malloc_async(out.nbytes(), encoder));
    } else if (sty == 1 && stx == 0) {
      // A single row of c is broadcast over all rows.
      ldc = 0;
      out.set_data(cu::malloc_async(out.nbytes(), encoder));
    } else {
      // Copy C into out and set C to out
      ldc = c.shape(-1);
      copy_gpu(c, out, CopyType::General, s);
      c = out;
    }
  }

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions

  auto [batch_shape, a_batch_strides, b_batch_strides, c_batch_strides] =
      collapse_batches(a, b, c);

  // Multiply in a wide type: M * N (and M * K below) can exceed INT_MAX for
  // large problems, and signed int overflow is undefined behaviour.
  auto batch_count = out.size() / (static_cast<size_t>(M) * N);

  // Collapse batches into M if needed
  if (batch_count > 1 && !a_transposed && batch_shape.size() == 1 &&
      a.strides()[a.ndim() - 2] == K &&
      a_batch_strides.back() == static_cast<int64_t>(M) * K &&
      c_batch_strides.back() == M * c.strides()[c.ndim() - 2] &&
      b_batch_strides.back() == 0) {
    M *= batch_shape.back();
    batch_count = 1;

    a_batch_strides = {0};
    b_batch_strides = {0};
    c_batch_strides = {0};
    batch_shape = {1};
  }

  /////////////////////////////////////////////////////////////////////////////
  // Invoke cublasLt with AddMM settings

  CublasGemm gemm(
      cu::device(s.device),
      a.dtype(),
      a_transposed,
      M,
      K,
      lda,
      b_transposed,
      K,
      N,
      ldb,
      ldc,
      batch_shape.back(),
      a_batch_strides.back(),
      b_batch_strides.back(),
      c_batch_strides.back());
  gemm.run(
      encoder,
      out,
      a,
      b,
      c,
      batch_shape,
      a_batch_strides,
      b_batch_strides,
      c_batch_strides,
      alpha_,
      beta_);
}

void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("GatherMM::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  assert(inputs.size() == 4);
  auto& a = inputs[0];
  auto& b = inputs[1];
  auto& lhs_indices = inputs[2];
  auto& rhs_indices = inputs[3];

  // Return 0s if either input is empty.
  if (a.size() == 0 || b.size() == 0) {
    array zero(0, a.dtype());
    encoder.add_temporary(zero);
    fill_gpu(zero, out, s);
    return;
  }

  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  // Extract shapes from inputs.
  int M = a.shape(-2);
  int N = b.shape(-1);
  int K = a.shape(-1);

  // We are walking a in order and b is also in order so we can batch up the
  // matmuls and reuse reading a and b.
  if (M == 1 && right_sorted_ == true) {
    gather_mm_rhs(a, b, rhs_indices, out, encoder, s);
    return;
  }

  auto [transposed_a, lda, a_] = check_transpose(encoder, s, a);
  auto [transposed_b, ldb, b_] = check_transpose(encoder, s, b);
  auto use_gemv = cu::can_use_gemv(M, N, K, transposed_a, transposed_b);
  if (M == 1 && use_gemv) {
    gather_mv(b_, a_, rhs_indices, lhs_indices, out, N, K, encoder);
    return;
  }

  if (N == 1 && use_gemv) {
    gather_mv(a_, b_, lhs_indices, rhs_indices, out, M, K, encoder);
    return;
  }

  throw std::runtime_error("NYI");
}

void SegmentedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("SegmentedMM::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  assert(inputs.size() == 3);
  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];
  auto& segments_pre = inputs[2];

  // Return zeros if output is empty or either input is empty.
  if (out.size() == 0 || a_pre.size() == 0 || b_pre.size() == 0) {
    array zero(0, a_pre.dtype());
    encoder.add_temporary(zero);
    fill_gpu(zero, out, s);
    return;
  }

  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
  int num_segments = segments_pre.size() / 2;

  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
  auto segments = [&] {
    if (segments_pre.flags().row_contiguous) {
      return segments_pre;
    }
    array copy = contiguous_copy_gpu(segments_pre, s);
    encoder.add_temporary(copy);
    return copy;
  }();

  cutlass_segmented_mm(
      a_transposed,
      lda,
      b_transposed,
      ldb,
      num_segments,
      M,
      N,
      a,
      b,
      segments,
      out,
      encoder);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/no_cuda.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/cuda.h"
#include "mlx/fast.h"

namespace mlx::core {

namespace cu {

// This translation unit provides the stubs for builds without the CUDA
// back-end, so CUDA is never reported as available.
bool is_available() {
  constexpr bool cuda_compiled_in = false;
  return cuda_compiled_in;
}

} // namespace cu

namespace fast {

// Stub for builds without the CUDA back-end: ignores every argument and
// always throws std::runtime_error.
CustomKernelFunction cuda_kernel(
    const std::string&,
    const std::vector<std::string>&,
    const std::vector<std::string>&,
    const std::string&,
    const std::string&,
    bool,
    int) {
  constexpr const char* no_backend_message = "[cuda_kernel] No CUDA back-end.";
  throw std::runtime_error(no_backend_message);
}

// Stub for builds without the CUDA back-end: ignores every argument and
// always throws std::runtime_error.
//
// The unused parameters are left unnamed (matching the cuda_kernel stub) and
// the error message is tagged with this function's own name so the failure
// can be attributed to the right entry point.
std::vector<array> precompiled_cuda_kernel(
    const std::string&,
    const std::string&,
    const std::vector<array>&,
    const std::vector<Shape>&,
    const std::vector<Dtype>&,
    const std::vector<ScalarArg>&,
    std::tuple<int, int, int>,
    std::tuple<int, int, int>,
    int /* shared_memory */,
    std::optional<float> /* init_value */,
    bool /* ensure_row_contiguous */,
    StreamOrDevice) {
  throw std::runtime_error("[precompiled_cuda_kernel] No CUDA back-end.");
}

} // namespace fast

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/primitives.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/distributed/primitives.h"
#include <cuda_runtime.h>
#include "mlx/fast_primitives.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Defines func::eval_gpu for a multi-output primitive as a stub that throws
// std::runtime_error("<func> has no CUDA implementation.").
#define NO_GPU_MULTI(func)                                             \
  void func::eval_gpu(                                                 \
      const std::vector<array>& inputs, std::vector<array>& outputs) { \
    throw std::runtime_error(#func " has no CUDA implementation.");    \
  }

// Defines func::use_fallback to always return true and additionally emits the
// throwing multi-output eval_gpu stub from NO_GPU_MULTI.
#define NO_GPU_USE_FALLBACK(func)     \
  bool func::use_fallback(Stream s) { \
    return true;                      \
  }                                   \
  NO_GPU_MULTI(func)

// Defines func::eval_gpu for a single-output primitive as a stub that throws
// std::runtime_error("<func> has no CUDA implementation.").
#define NO_GPU(func)                                                  \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    throw std::runtime_error(#func " has no CUDA implementation.");   \
  }

// Primitives whose eval_gpu is a throwing stub in the CUDA back-end.
// NO_GPU is used for the single-output signature, NO_GPU_MULTI for the
// multi-output one.
NO_GPU(BlockMaskedMM)
NO_GPU(GatherQMM)
NO_GPU_MULTI(LUF)
NO_GPU_MULTI(QRF)
NO_GPU_MULTI(SVD)
NO_GPU(Inverse)
NO_GPU(Cholesky)
NO_GPU_MULTI(Eig)
NO_GPU_MULTI(Eigh)

// Distributed primitives stubbed the same way.
namespace distributed {
NO_GPU_MULTI(Send)
NO_GPU_MULTI(Recv)
} // namespace distributed

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/affine_quantize.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/quantized.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

#include <stdexcept>
#include <string>

namespace mlx::core {
namespace cu {

namespace cg = cooperative_groups;

// Affine-quantizes `w` into packed `bits`-bit codes with one (scale, bias)
// pair per group of `group_size` input values.
//
// Each thread loads `group_size / WARP_SIZE` consecutive values; min and max
// are then reduced across the WARP_SIZE-wide tile the thread belongs to, so a
// tile covers one quantization group.
//
//   w      - input values
//   out    - packed output bytes
//   scales - per-group scale, written by the thread whose first value starts
//            a group
//   biases - per-group bias, written together with the scale
//   size   - number of input elements, used for the bounds check
template <typename T, int group_size, int bits>
__global__ void
affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();

  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
  auto tidy = block_idx.y * block_size.y + idx_in_block.y;

  // Total number of threads along x, used to linearize the 2D launch.
  auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;
  constexpr float eps = 1e-7;
  constexpr int simd_size = WARP_SIZE;
  // Largest code representable with `bits` bits.
  constexpr float n_bins = (1 << bits) - 1;
  constexpr int pack_factor = get_pack_factor(bits, 8);
  constexpr int bytes_per_pack = get_bytes_per_pack(bits);
  // Values handled by one thread.
  constexpr int values_per_reduce = group_size / simd_size;
  // Number of threads whose values are combined into a single pack (0 when a
  // thread holds more values than fit in one pack).
  constexpr int writes_per_reduce = pack_factor / values_per_reduce;
  constexpr int writes_per_pack =
      writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
  constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;

  size_t offset = tidx + grid_dim_x * size_t(tidy);
  size_t in_index = offset * values_per_reduce;
  // NOTE(review): threads that return here do not take part in the tile-wide
  // reductions and shuffles below; this presumably relies on `size` being a
  // multiple of group_size so that whole tiles exit together - confirm.
  if (in_index >= size) {
    return;
  }
  size_t out_index = power_of_2_bits
      ? offset * writes_per_pack
      : offset * bytes_per_pack / writes_per_reduce;

  float w_thread[values_per_reduce];
  float w_min = Limits<float>::max();
  // w_max starts at 0, so a group with only negative values reduces to 0.
  float w_max = 0;

#pragma clang loop unroll(full)
  for (int i = 0; i < values_per_reduce; i++) {
    float val = w[in_index + i];
    w_thread[i] = val;
    w_min = min(w_min, val);
    w_max = max(w_max, val);
  }

  cg::greater<float> max_op;
  cg::less<float> min_op;
  auto warp = cg::tiled_partition<WARP_SIZE>(cg::this_thread_block());

  // Combine the per-thread extrema over the whole tile.
  w_min = cg::reduce(warp, w_min, min_op);
  w_max = cg::reduce(warp, w_max, max_op);

  // Start from a uniform step over [w_min, w_max] (never smaller than eps),
  // anchor on the extremum with the larger magnitude (`edge`) and then adjust
  // the scale so that edge / scale is exactly the integer q0. If q0 rounds to
  // zero the unadjusted scale is kept and the bias is zero.
  float scale = max((w_max - w_min) / n_bins, eps);
  bool side = abs(w_min) > abs(w_max);
  scale = side ? scale : -scale;
  float edge = side ? w_min : w_max;
  float q0 = round(edge / scale);
  bool at_zero = q0 == 0.0f;
  scale = at_zero ? scale : edge / q0;
  float bias = at_zero ? 0 : edge;

  // Write out the scales and biases
  size_t gindex = in_index / group_size;
  if (in_index % group_size == 0) {
    scales[gindex] = static_cast<T>(scale);
    biases[gindex] = static_cast<T>(bias);
  }

  // A 5-bit pack spans 40 bits, so it needs a 64-bit accumulator.
  using OutType = std::conditional_t<bits == 5, uint64_t, uint32_t>;
  OutType output = 0;

#pragma clang loop unroll(full)
  for (int i = 0; i < values_per_reduce; i++) {
    // Quantize and clamp to the largest code.
    uint8_t val = min(round((w_thread[i] - bias) / scale), n_bins);
    if (bits == 8) {
      output = val;
    } else {
      output |= val << (bits * (i % pack_factor));
    }

    if (pack_factor < values_per_reduce && i % pack_factor == pack_factor - 1) {
      // This thread alone fills several packs: flush each completed one.
      out[out_index + i / pack_factor] = output;
      output = 0;
    } else {
      // A pack spans several threads: pull the codes held by the following
      // lanes into this thread's accumulator.
#pragma clang loop unroll(full)
      for (int j = 1; j < writes_per_reduce; j++) {
        uint8_t sval = warp.shfl_down(val, j);
        output |= static_cast<OutType>(sval)
            << (bits * (j * values_per_reduce + i));
      }
    }
  }
  if constexpr (bits == 3 || bits == 6) {
    // These packs are written as three separate bytes by the thread that
    // starts the pack.
    if (in_index % pack_factor == 0 && out_index % bytes_per_pack == 0) {
      out[out_index] = output & 0xff;
      out[out_index + 1] = (output & 0xff00) >> 8;
      out[out_index + 2] = (output & 0xff0000) >> 16;
    }
  } else if constexpr (bits == 5) {
    // 5-bit packs are written as five separate bytes.
    if (in_index % pack_factor == 0 && out_index % bytes_per_pack == 0) {
      out[out_index] = output & 0xff;
      out[out_index + 1] = (output & 0xff00) >> 8;
      out[out_index + 2] = (output & 0xff0000) >> 16;
      out[out_index + 3] = (output & 0xff000000) >> 24;
      out[out_index + 4] = (output & 0xff00000000) >> 32;
    }
  } else {
    // Power-of-two widths: only the first thread of each pack stores it. When
    // writes_per_reduce is 0 every pack was already flushed inside the loop.
    if constexpr (writes_per_reduce > 0) {
      if (out_index % writes_per_reduce == 0) {
        out[out_index / writes_per_reduce] = output;
      }
    }
  }
}

// Unpacks affine-quantized codes and writes `code * scale + bias`.
//
// Each thread handles one pack: it reads the pack's bytes from `w`, looks up
// the scale and bias of the group containing its first output element, and
// writes the pack's dequantized values to `out`.
//
//   w      - packed codes
//   scales - per-group scales, indexed by output index / group_size
//   biases - per-group biases, indexed like scales
//   out    - dequantized output
//   size   - number of output elements, used for the bounds check
template <typename T, int group_size, int bits>
__global__ void affine_dequantize(
    const uint8_t* w,
    const T* scales,
    const T* biases,
    T* out,
    size_t size) {
  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();

  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
  auto tidy = block_idx.y * block_size.y + idx_in_block.y;

  // Total number of threads along x, used to linearize the 2D launch.
  auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;

  constexpr int pack_factor = get_pack_factor(bits, 8);
  constexpr int bytes_per_pack = get_bytes_per_pack(bits);

  // `offset` is the pack index, `oindex` the first output element of the pack.
  size_t offset = tidx + grid_dim_x * size_t(tidy);
  size_t oindex = offset * pack_factor;

  if (oindex >= size) {
    return;
  }

  size_t gindex = oindex / group_size;
  T scale = scales[gindex];
  T bias = biases[gindex];
  out += oindex;

  if constexpr (bits == 3) {
    // 8 codes of 3 bits spread over 3 bytes; codes 2 and 5 straddle a byte
    // boundary and are assembled from two bytes.
    w += offset * bytes_per_pack;
    out[0] = static_cast<T>(w[0] & 0x7) * scale + bias;
    out[1] = static_cast<T>((w[0] & 0x38) >> 3) * scale + bias;
    out[2] = (static_cast<T>((w[0] & 0xc0) >> 6) +
              static_cast<T>((w[1] & 0x1) << 2)) *
            scale +
        bias;
    out[3] = static_cast<T>((w[1] & 0xe) >> 1) * scale + bias;
    out[4] = static_cast<T>((w[1] & 0x70) >> 4) * scale + bias;
    out[5] = (static_cast<T>((w[1] & 0x80) >> 7) +
              static_cast<T>((w[2] & 0x3) << 1)) *
            scale +
        bias;
    out[6] = static_cast<T>((w[2] & 0x1c) >> 2) * scale + bias;
    out[7] = static_cast<T>((w[2] & 0xe0) >> 5) * scale + bias;
  } else if constexpr (bits == 5) {
    // 8 codes of 5 bits spread over 5 bytes; codes 1, 3, 4 and 6 straddle a
    // byte boundary.
    w += offset * bytes_per_pack;
    out[0] = static_cast<T>(w[0] & 0x1f) * scale + bias;
    out[1] = (static_cast<T>((w[0] & 0xe0) >> 5) +
              static_cast<T>((w[1] & 0x3) << 3)) *
            scale +
        bias;
    out[2] = static_cast<T>((w[1] & 0x7c) >> 2) * scale + bias;
    out[3] = (static_cast<T>((w[1] & 0x80) >> 7) +
              static_cast<T>((w[2] & 0xf) << 1)) *
            scale +
        bias;
    out[4] = (static_cast<T>((w[2] & 0xf0) >> 4) +
              static_cast<T>((w[3] & 0x1) << 4)) *
            scale +
        bias;
    out[5] = static_cast<T>((w[3] & 0x3e) >> 1) * scale + bias;
    out[6] = (static_cast<T>((w[3] & 0xc0) >> 6) +
              static_cast<T>((w[4] & 0x7) << 2)) *
            scale +
        bias;
    out[7] = static_cast<T>((w[4] & 0xf8) >> 3) * scale + bias;
  } else if constexpr (bits == 6) {
    // 4 codes of 6 bits spread over 3 bytes; codes 1 and 2 straddle a byte
    // boundary.
    w += offset * bytes_per_pack;
    out[0] = static_cast<T>(w[0] & 0x3f) * scale + bias;
    out[1] = (static_cast<T>((w[0] >> 6) & 0x03) +
              static_cast<T>((w[1] & 0x0f) << 2)) *
            scale +
        bias;
    out[2] = (static_cast<T>((w[1] >> 4) & 0x0f) +
              static_cast<T>((w[2] & 0x03) << 4)) *
            scale +
        bias;
    out[3] = static_cast<T>((w[2] >> 2) & 0x3f) * scale + bias;
  } else {
    // Remaining widths (2, 4, 8): every pack is a single byte holding
    // pack_factor equally sized codes.
    uint32_t val = w[offset];
#pragma clang loop unroll(full)
    for (int i = 0; i < pack_factor; i++) {
      uint8_t d;
      if (bits == 2) {
        d = (val >> (bits * i)) & 0x03;
      } else if (bits == 4) {
        d = (val >> (bits * i)) & 0x0f;
      } else if (bits == 8) {
        d = val;
      }
      out[i] = scale * static_cast<T>(d) + bias;
    }
  }
}

} // namespace cu

// Turns a runtime quantization group size into a compile-time constant.
//
// Calls `f` with std::integral_constant<int, N> for the supported sizes
// (32, 64, 128).
//
// Throws std::runtime_error for any other value. Previously an unsupported
// size fell through silently, so the caller's kernel was never launched and
// its outputs were left unwritten.
template <typename F>
void dispatch_groups(int group_size, F&& f) {
  switch (group_size) {
    case 32:
      f(std::integral_constant<int, 32>{});
      break;
    case 64:
      f(std::integral_constant<int, 64>{});
      break;
    case 128:
      f(std::integral_constant<int, 128>{});
      break;
    default:
      throw std::runtime_error(
          "[dispatch_groups] Unsupported group size: " +
          std::to_string(group_size) + ".");
  }
}

// Turns a runtime quantization bit width into a compile-time constant.
//
// Calls `f` with std::integral_constant<int, N> for the supported widths
// (2, 3, 4, 5, 6, 8).
//
// Throws std::runtime_error for any other value. Previously an unsupported
// width fell through silently, so the caller's kernel was never launched and
// its outputs were left unwritten.
template <typename F>
void dispatch_bits(int bits, F&& f) {
  switch (bits) {
    case 2:
      f(std::integral_constant<int, 2>{});
      break;
    case 3:
      f(std::integral_constant<int, 3>{});
      break;
    case 4:
      f(std::integral_constant<int, 4>{});
      break;
    case 5:
      f(std::integral_constant<int, 5>{});
      break;
    case 6:
      f(std::integral_constant<int, 6>{});
      break;
    case 8:
      f(std::integral_constant<int, 8>{});
      break;
    default:
      throw std::runtime_error(
          "[dispatch_bits] Unsupported bit width: " + std::to_string(bits) +
          ".");
  }
}

// Host-side launcher for cu::affine_quantize.
//
// Reads `w` and writes the packed codes to `wq` and the per-group parameters
// to `scales` / `biases`. The kernel is specialized on w's float dtype,
// `group_size_` and `bits_`.
void affine_quantize(
    const array& w,
    array& wq,
    array& scales,
    array& biases,
    int group_size_,
    int bits_,
    cu::CommandEncoder& enc,
    const Stream& s) {
  // Each thread of a warp handles this many consecutive input values.
  int values_per_thread = group_size_ / WARP_SIZE;
  size_t num_threads = w.size() / values_per_thread;

  // The launch grid follows w's shape with the last axis shrunk accordingly.
  bool needs_large_grid = num_threads > UINT_MAX;
  auto launch_shape = w.shape();
  launch_shape.back() /= values_per_thread;

  enc.set_input_array(w);
  enc.set_output_array(wq);
  enc.set_output_array(scales);
  enc.set_output_array(biases);

  dispatch_float_types(w.dtype(), "affine_quantize", [&](auto dtype_tag) {
    dispatch_groups(group_size_, [&](auto group_tag) {
      dispatch_bits(bits_, [&](auto bits_tag) {
        using T = cuda_type_t<MLX_GET_TYPE(dtype_tag)>;
        auto quantize_kernel =
            cu::affine_quantize<T, group_tag.value, bits_tag.value>;
        auto [num_blocks, block_dims] = get_launch_args(
            num_threads, launch_shape, w.strides(), needs_large_grid);
        enc.add_kernel_node(
            quantize_kernel,
            num_blocks,
            block_dims,
            gpu_ptr<T>(w),
            gpu_ptr<uint8_t>(wq),
            gpu_ptr<T>(scales),
            gpu_ptr<T>(biases),
            w.size());
      });
    });
  });
}

// Host-side launcher for cu::affine_dequantize.
//
// Reads the packed codes `wq` with their `scales` / `biases` and writes the
// dequantized values to `w`. The kernel is specialized on w's float dtype,
// `group_size_` and `bits_`.
void affine_dequantize(
    const array& wq,
    const array& scales,
    const array& biases,
    array& w,
    int group_size_,
    int bits_,
    cu::CommandEncoder& enc,
    const Stream& s) {
  constexpr int uint8_per_uint32 = 4;

  // Number of values one thread unpacks. For 2, 4 and 8 bits a pack is one
  // uint8, for 3 and 6 bits it is 3 uint8 and for 5 bits it is 5 uint8.
  int values_per_pack;
  if (bits_ == 3 || bits_ == 5) {
    values_per_pack = 8;
  } else if (bits_ == 6) {
    values_per_pack = 4;
  } else {
    values_per_pack = 8 / bits_;
  }

  size_t num_threads = w.size() / values_per_pack;
  bool needs_large_grid = num_threads > UINT_MAX;
  auto launch_shape = w.shape();
  launch_shape.back() *= uint8_per_uint32;

  enc.set_input_array(wq);
  enc.set_input_array(scales);
  enc.set_input_array(biases);
  enc.set_output_array(w);

  dispatch_float_types(w.dtype(), "affine_dequantize", [&](auto dtype_tag) {
    dispatch_groups(group_size_, [&](auto group_tag) {
      dispatch_bits(bits_, [&](auto bits_tag) {
        using T = cuda_type_t<MLX_GET_TYPE(dtype_tag)>;
        auto dequantize_kernel =
            cu::affine_dequantize<T, group_tag.value, bits_tag.value>;
        auto [num_blocks, block_dims] = get_launch_args(
            num_threads, launch_shape, w.strides(), needs_large_grid);
        enc.add_kernel_node(
            dequantize_kernel,
            num_blocks,
            block_dims,
            gpu_ptr<uint8_t>(wq),
            gpu_ptr<T>(scales),
            gpu_ptr<T>(biases),
            gpu_ptr<T>(w),
            w.size());
      });
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/convert_fp8.cu
================================================
// Copyright © 2025 Apple Inc.
#include "mlx/backend/cuda/unary/unary.cuh"
#include "mlx/fast_primitives.h"

namespace mlx::core {
// Converts the input to FP8 (when to_fp8_ is set) or from FP8 (otherwise) by
// running the matching element-wise unary op on the stream of the output's
// primitive. Only outputs[0] is written.
void fast::ConvertFP8::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("ConvertFP8::eval_gpu");
  // The unary op takes the whole `inputs` vector, so no separate reference to
  // inputs[0] is needed here.
  auto& out = outputs[0];
  auto& s = out.primitive().stream();
  if (to_fp8_) {
    unary_op_gpu<cu::ToFP8>(inputs, out, name(), s);
  } else {
    unary_op_gpu<cu::FromFP8>(inputs, out, name(), s);
  }
}
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/cublas_qqmm.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/quantized/cublas_qqmm.h"

#include <fmt/format.h>
#include "mlx/backend/cuda/cublas_utils.h"

#include "mlx/backend/cuda/device.h"
#include "mlx/dtype_utils.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

// cuBLASLt settings derived from a quantization mode string.
struct QuantModeConfig {
  // Element type of the quantized operands.
  cudaDataType_t data_type;
  // Element type of the block scales.
  cudaDataType_t scale_dtype;
  // cuBLASLt scale mode applied to the operand scales.
  cublasLtMatmulMatrixScale_t scale_mode;
};

// Maps a quantization mode name ("mxfp8" or "nvfp4") to its cuBLASLt
// configuration. Throws std::runtime_error for any other mode.
QuantModeConfig get_quant_mode_config(const std::string& mode) {
  if (mode == "mxfp8") {
    // FP8 E4M3 data with one UE8M0 scale per 32 elements.
    QuantModeConfig mxfp8_config{
        CUDA_R_8F_E4M3,
        CUDA_R_8F_UE8M0,
        CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0};
    return mxfp8_config;
  }

  if (mode == "nvfp4") {
    // FP4 E2M1 data with one UE4M3 scale per 16 elements.
    QuantModeConfig nvfp4_config{
        CUDA_R_4F_E2M1,
        CUDA_R_8F_UE4M3,
        CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3};
    return nvfp4_config;
  }

  throw std::runtime_error(
      fmt::format("Unsupported quantization mode in CublasQQMM: {}.", mode));
}

} // namespace

// Builds a quantized matmul without a separate C operand.
//
// The operand data type and scale mode are derived from `qmode` (throws
// std::runtime_error for unsupported modes); the output type comes from
// `out_dtype`.
CublasQQMM::CublasQQMM(
    cu::Device& device,
    bool a_transposed,
    uint64_t a_rows,
    uint64_t a_cols,
    int64_t lda,
    bool b_transposed,
    uint64_t b_rows,
    uint64_t b_cols,
    int64_t ldb,
    int32_t batch_count,
    int64_t a_batch_stride,
    int64_t b_batch_stride,
    Dtype out_dtype,
    const std::string& qmode) {
  QuantModeConfig mode_config = get_quant_mode_config(qmode);

  // The compute type must be CUBLAS_COMPUTE_32F and the scale type must be
  // CUDA_R_32F.
  cudaDataType_t alpha_beta_type = CUDA_R_32F;
  cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
  cudaDataType_t result_type =
      cublas_utils::dtype_to_cublas_type(out_dtype, "CublasQQMM");

  init_base(
      device,
      alpha_beta_type,
      compute_type,
      mode_config.data_type,
      result_type,
      a_transposed,
      a_rows,
      a_cols,
      lda,
      b_transposed,
      b_rows,
      b_cols,
      ldb,
      batch_count,
      a_batch_stride,
      b_batch_stride);

  // Both operands use the same scale mode.
  a_scale_mode_ = b_scale_mode_ = mode_config.scale_mode;

  // The cross assignment below (our `a` goes to cuBLASLt's B attribute and
  // vice versa) matches set_scales_ptrs; presumably the operands are swapped
  // when handed to cuBLASLt - confirm in init_base.
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_B_SCALE_MODE,
      &a_scale_mode_,
      sizeof(a_scale_mode_)));
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_A_SCALE_MODE,
      &b_scale_mode_,
      sizeof(b_scale_mode_)));
}

// Builds a quantized matmul with a separate C operand: delegates to the
// constructor without C and then creates the layout descriptor for C from
// `ldc` and `c_batch_stride`.
CublasQQMM::CublasQQMM(
    cu::Device& device,
    bool a_transposed,
    uint64_t a_rows,
    uint64_t a_cols,
    int64_t lda,
    bool b_transposed,
    uint64_t b_rows,
    uint64_t b_cols,
    int64_t ldb,
    int64_t ldc,
    int32_t batch_count,
    int64_t a_batch_stride,
    int64_t b_batch_stride,
    int64_t c_batch_stride,
    Dtype out_dtype,
    const std::string& qmode)
    : CublasQQMM(
          device,
          a_transposed,
          a_rows,
          a_cols,
          lda,
          b_transposed,
          b_rows,
          b_cols,
          ldb,
          batch_count,
          a_batch_stride,
          b_batch_stride,
          out_dtype,
          qmode) {
  // C must use the same element type as the output.
  auto c_type = cublas_utils::dtype_to_cublas_type(out_dtype, "CublasQQMM");

  // Extents of C taken from the non-contracted dimension of each operand.
  auto extent_from_b = b_transposed ? b_rows : b_cols;
  auto extent_from_a = a_transposed ? a_cols : a_rows;

  c_desc_ = cublas_utils::create_matrix_layout(
      c_type,
      extent_from_b,
      extent_from_a,
      false,
      ldc,
      batch_count,
      c_batch_stride);
}

// Runs the quantized matmul with alpha and beta supplied as arrays; their
// device pointers are forwarded to execute(). No C operand is passed.
void CublasQQMM::run(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const array& a_scale,
    const array& b_scale,
    const array& alpha,
    const array& beta) {
  // Register every array touched by the matmul with the encoder.
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(a_scale);
  encoder.set_input_array(b_scale);
  encoder.set_input_array(alpha);
  encoder.set_input_array(beta);
  encoder.set_output_array(out);

  auto* out_ptr = gpu_ptr<void>(out);
  auto* a_ptr = gpu_ptr<void>(a);
  auto* b_ptr = gpu_ptr<void>(b);
  auto* a_scale_ptr = gpu_ptr<void>(a_scale);
  auto* b_scale_ptr = gpu_ptr<void>(b_scale);
  auto* alpha_ptr = gpu_ptr<void>(alpha);
  auto* beta_ptr = gpu_ptr<void>(beta);

  execute(
      encoder,
      out_ptr,
      a_ptr,
      b_ptr,
      a_scale_ptr,
      b_scale_ptr,
      nullptr,
      alpha_ptr,
      beta_ptr);
}

// Runs the quantized matmul with the default scalars of the host-scalar
// execute() overload (alpha = 1, beta = 0). No C operand is passed.
void CublasQQMM::run(
    cu::CommandEncoder& encoder,
    array& out,
    const array& a,
    const array& b,
    const array& a_scale,
    const array& b_scale) {
  // Register every array touched by the matmul with the encoder.
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(a_scale);
  encoder.set_input_array(b_scale);
  encoder.set_output_array(out);

  auto* out_ptr = gpu_ptr<void>(out);
  auto* a_ptr = gpu_ptr<void>(a);
  auto* b_ptr = gpu_ptr<void>(b);
  auto* a_scale_ptr = gpu_ptr<void>(a_scale);
  auto* b_scale_ptr = gpu_ptr<void>(b_scale);

  execute(encoder, out_ptr, a_ptr, b_ptr, a_scale_ptr, b_scale_ptr, nullptr);
}

// Stores the operand scale pointers in the matmul descriptor. The scales are
// crossed on purpose-looking lines: cuBLASLt's A scale receives `b_scale` and
// its B scale receives `a_scale`, mirroring how the scale modes are set in
// the constructor. `encoder` is not used here.
void CublasQQMM::set_scales_ptrs(
    cu::CommandEncoder& encoder,
    const void* a_scale,
    const void* b_scale) {
  const void* lt_a_scale = b_scale;
  const void* lt_b_scale = a_scale;

  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
      &lt_a_scale,
      sizeof(lt_a_scale)));
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
      &lt_b_scale,
      sizeof(lt_b_scale)));
}

// Executes the matmul with alpha and beta given as device pointers.
void CublasQQMM::execute(
    cu::CommandEncoder& encoder,
    void* out,
    const void* a,
    const void* b,
    const void* a_scale,
    const void* b_scale,
    const void* c,
    const void* alpha,
    const void* beta) {
  set_scales_ptrs(encoder, a_scale, b_scale);

  // cuBLASLt reads alpha/beta from host memory by default; this overload
  // receives device pointers (needed for nvfp4), so switch the pointer mode.
  // https://docs.nvidia.com/cuda/cublas/#cublasltpointermode-t
  const cublasLtPointerMode_t device_scalars = CUBLASLT_POINTER_MODE_DEVICE;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_POINTER_MODE,
      &device_scalars,
      sizeof(device_scalars)));

  execute_matmul(encoder, out, a, b, c, alpha, beta);
}

// Executes the matmul with alpha and beta given as host floats.
void CublasQQMM::execute(
    cu::CommandEncoder& encoder,
    void* out,
    const void* a,
    const void* b,
    const void* a_scale,
    const void* b_scale,
    const void* c,
    const float alpha /* = 1 */,
    const float beta /* = 0 */) {
  set_scales_ptrs(encoder, a_scale, b_scale);

  // The scalars live in host memory for this overload.
  const cublasLtPointerMode_t host_scalars = CUBLASLT_POINTER_MODE_HOST;
  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
      matmul_desc_,
      CUBLASLT_MATMUL_DESC_POINTER_MODE,
      &host_scalars,
      sizeof(host_scalars)));

  // Hand the by-value scalars over through type-erased pointers.
  const void* host_alpha = &alpha;
  const void* host_beta = &beta;
  execute_matmul(encoder, out, a, b, c, host_alpha, host_beta);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/cublas_qqmm.h
================================================
// Copyright © 2025 Apple Inc.
#pragma once

#include "mlx/array.h"
#include "mlx/backend/cuda/cublas_utils.h"
#include "mlx/backend/cuda/device.h"

#include <cublasLt.h>

namespace mlx::core {

// cuBLASLt-backed matmul on block-scaled quantized operands. The
// implementation accepts the quantization modes "mxfp8" and "nvfp4".
class CublasQQMM : public CublasMatmulBase {
 public:
  // Configures a matmul without a separate C operand.
  // Throws std::runtime_error for an unsupported quantization mode.
  CublasQQMM(
      cu::Device& device,
      bool a_transposed,
      uint64_t a_rows,
      uint64_t a_cols,
      int64_t lda,
      bool b_transposed,
      uint64_t b_rows,
      uint64_t b_cols,
      int64_t ldb,
      int32_t batch_count,
      int64_t a_batch_stride,
      int64_t b_batch_stride,
      Dtype out_dtype,
      const std::string& quantization_mode);

  // Same as above, and additionally describes a C operand through its leading
  // dimension `ldc` and batch stride `c_batch_stride`.
  CublasQQMM(
      cu::Device& device,
      bool a_transposed,
      uint64_t a_rows,
      uint64_t a_cols,
      int64_t lda,
      bool b_transposed,
      uint64_t b_rows,
      uint64_t b_cols,
      int64_t ldb,
      int64_t ldc,
      int32_t batch_count,
      int64_t a_batch_stride,
      int64_t b_batch_stride,
      int64_t c_batch_stride,
      Dtype out_dtype,
      const std::string& quantization_mode);

  // Runs the matmul with alpha and beta read from device arrays.
  void run(
      cu::CommandEncoder& encoder,
      array& out,
      const array& a,
      const array& b,
      const array& a_scale,
      const array& b_scale,
      const array& alpha,
      const array& beta);

  // Runs the matmul with the default host scalars (alpha = 1, beta = 0).
  void run(
      cu::CommandEncoder& encoder,
      array& out,
      const array& a,
      const array& b,
      const array& a_scale,
      const array& b_scale);

 private:
  // Stores the operand scale pointers in the matmul descriptor.
  void set_scales_ptrs(
      cu::CommandEncoder& encoder,
      const void* a_scale,
      const void* b_scale);

  // Executes with alpha/beta given as device pointers.
  void execute(
      cu::CommandEncoder& encoder,
      void* out,
      const void* a,
      const void* b,
      const void* a_scale,
      const void* b_scale,
      const void* c,
      const void* alpha,
      const void* beta);

  // Executes with alpha/beta given as host values.
  void execute(
      cu::CommandEncoder& encoder,
      void* out,
      const void* a,
      const void* b,
      const void* a_scale,
      const void* b_scale,
      const void* c,
      const float alpha = 1.0f,
      const float beta = 0.0f);

  // Scale modes. All are value-initialized so that no member is ever read
  // indeterminate; the constructors assign the A and B modes, while the C and
  // output modes are not set anywhere in the current implementation.
  cublasLtMatmulMatrixScale_t a_scale_mode_{};
  cublasLtMatmulMatrixScale_t b_scale_mode_{};
  cublasLtMatmulMatrixScale_t c_scale_mode_{};
  cublasLtMatmulMatrixScale_t out_scale_mode_{};
};

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/fp_quantize.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/quantized.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/quantized/mxfp8_quantize.cuh"
#include "mlx/backend/cuda/quantized/nvfp4_quantize.cuh"
#include "mlx/backend/cuda/quantized/quantized.h"
#include "mlx/backend/cuda/vector_types.cuh"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cutlass/float8.h>
#include <cutlass/numeric_conversion.h>

// Magnitudes used to normalize a group's absolute maximum before it is turned
// into a scale: 448 for the FP8 E4M3 format and 6 for the FP4 E2M1 format.
constexpr float F8E4M3_MAX = 448.0f;
constexpr float F4E2M1_MAX = 6.0f;

namespace mlx::core {
namespace cu {

// Reinterprets a raw byte as a low-precision float and widens it to float:
// FP8 E4M3 when bits == 8, FP4 E2M1 for any other bit width.
template <int bits>
struct Dequantize {
  __device__ float operator()(uint8_t x) {
    if constexpr (bits == 8) {
      auto* as_fp8 = reinterpret_cast<cutlass::float_e4m3_t*>(&x);
      return float(*as_fp8);
    } else {
      auto* as_fp4 = reinterpret_cast<cutlass::float_e2m1_t*>(&x);
      return float(*as_fp4);
    }
  }
};

// Element-wise max(|x1|, |x2|) on a two-lane vector, stored in `out`.
// Handles __nv_bfloat162, __half2 and float2; for any other T nothing is
// written. `out` may alias `x1` or `x2`.
template <typename T>
__device__ __forceinline__ void absmax_x2(T& out, const T& x1, const T& x2) {
  constexpr bool is_half_pair =
      std::is_same<T, __nv_bfloat162>::value || std::is_same<T, __half2>::value;
  if constexpr (is_half_pair) {
    // Packed half-precision intrinsics process both lanes at once.
    out = __hmax2(__habs2(x1), __habs2(x2));
  } else if constexpr (std::is_same<T, float2>::value) {
    // Compute both lanes before writing so aliasing inputs stay intact.
    const float lane_x = fmaxf(fabsf(x1.x), fabsf(x2.x));
    const float lane_y = fmaxf(fabsf(x1.y), fabsf(x2.y));
    out.x = lane_x;
    out.y = lane_y;
  }
}

namespace cg = cooperative_groups;

// Quantizes and immediately dequantizes `w`, writing values of type T to
// `out`.
//
// Each thread processes one group of `group_size` consecutive values: it
// derives a per-group scale from the group's absolute maximum, rounds the
// scaled values through FP8 E4M3 (bits == 8) or FP4 E2M1 (otherwise) and
// multiplies the result by the decode scale.
//
//   w            - input values
//   out          - dequantized output, same layout as w
//   size         - number of input elements, used for the bounds check
//   global_scale - optional tensor-wide scale; when null no global scaling is
//                  applied
//
// The USE_SR template parameter and the local `rbits` are not used in this
// kernel body.
template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR>
__global__ void fp_quantize_dequantize(
    T* w,
    T* out,
    size_t size,
    float* global_scale = nullptr) {
  // Global encode scale and its inverse; both are 1 without a global scale.
  const bool use_global_scale = global_scale != nullptr;
  const float scale_enc =
      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;
  const float inv_scale_enc = use_global_scale ? 1.0f / scale_enc : 1.0f;

  using Tx2 = Vector2_t<T>;
  uint32_t rbits = 0; // reserved bits for future use
  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();
  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
  // Total number of threads along x, used to linearize the 2D launch.
  auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;

  // thread_idx doubles as the group index; base_idx is the group's first
  // element.
  size_t thread_idx = tidx + grid_dim_x * size_t(tidy);
  size_t base_idx = thread_idx * group_size;

  if (base_idx >= size) {
    return;
  }

  auto w_tile = load_vector<group_size, T>(w, thread_idx);
  float scale_dec_b = 0.0f;

  Tx2 amax_2x = Tx2{0.0f, 0.0f};

  // Running absolute maximum over the group, two elements at a time.
#pragma unroll
  for (int i = 0; i < group_size; i += 2) {
    auto pair = Tx2{w_tile[i], w_tile[i + 1]};
    absmax_x2<Tx2>(amax_2x, amax_2x, pair);
  }

  // Collapse the two lanes into the group's absolute maximum.
  scale_dec_b = static_cast<float>(
      max(fabsf(static_cast<float>(amax_2x.x)),
          fabsf(static_cast<float>(amax_2x.y))));

  // Normalize by the target format's maximum and apply the global scale.
  scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
  scale_dec_b *= scale_enc;
  // Convert to mx scale or nv scale
  using ScaleType = std::conditional_t<
      use_mx_scale,
      cutlass::float_ue8m0_t,
      cutlass::float_e4m3_t>;
  // Round the scale through its storage type so that encode and decode use
  // the value that would actually be stored.
  auto s = ScaleType(scale_dec_b);
  float scale_enc_b = scale_enc / float(s);
  float scale_dec = float(s) * inv_scale_enc;
  AlignedVector<T, group_size> w_hat;

  // Process the group in chunks of 8 values.
#pragma unroll
  for (int i = 0; i < group_size / 8; i++) {
    // Note: this local `w` shadows the kernel parameter of the same name.
    auto& w = *reinterpret_cast<cutlass::Array<T, 8>*>(&w_tile[i * 8]);
    cutlass::NumericArrayConverter<float, T, 8> fp32_t;
    auto scaled = fp32_t(w) * scale_enc_b;
    cutlass::Array<float, 8> dq;
    if constexpr (bits == 8) {
      // Round trip through FP8 E4M3.
      cutlass::NumericArrayConverter<cutlass::float_e4m3_t, float, 8> fp8_fp32;
      auto quant = fp8_fp32(scaled);
      cutlass::NumericArrayConverter<float, cutlass::float_e4m3_t, 8> fp32_fp8;
      dq = fp32_fp8(quant);
    } else {
      // Round trip through FP4 E2M1.
      cutlass::NumericArrayConverter<cutlass::float_e2m1_t, float, 8> fp4_fp32;
      auto quant = fp4_fp32(scaled);
      cutlass::NumericArrayConverter<float, cutlass::float_e2m1_t, 8> fp32_fp4;
      dq = fp32_fp4(quant);
    }
    // Undo the encode scaling and convert back to T.
    cutlass::NumericArrayConverter<T, float, 8> t_fp32;
    *reinterpret_cast<cutlass::Array<T, 8>*>(&w_hat[i * 8]) =
        t_fp32(dq * scale_dec);
  }
  store_vector<group_size>(out, thread_idx, w_hat);
}

// Quantization kernel for inputs whose groups are contiguous in memory: each
// thread quantizes one group of `group_size` elements of `w`, writing the
// packed FP8 / FP4 bytes to `out` and one scale byte to `scales`.
//
//   w            input values
//   out          packed quantized bytes (1 element per byte for bits == 8,
//                2 per byte otherwise)
//   scales       one byte per group, indexed by the linear group index
//   size         number of elements in `w`
//   global_scale optional pointer to a single float used for the global
//                encode scale
template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR>
__global__ void fp_quantize_rowwise(
    T* w,
    uint8_t* out,
    uint8_t* scales,
    size_t size,
    float* global_scale = nullptr) {
  // NVFP4 conversion:
  // Global encode scale: (448 × 6) / *global_scale
  // Per-block decode scale: S_dec_b = (block_amax / 6) × S_enc → stored as FP8
  // E4M3 Per-block encode scale: S_enc_b = S_enc / S_dec_b
  const bool use_global_scale = global_scale != nullptr;
  const float scale_enc =
      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;

  using Tx2 = Vector2_t<T>;
  using Tx4 = Vector4_t<T>;
  uint32_t rbits = 0; // reserved bits for future use
  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();
  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
  auto tidy = block_idx.y * block_size.y + idx_in_block.y;
  auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;

  // Flatten the 2D launch into a linear group index; one group per thread.
  size_t thread_idx = tidx + grid_dim_x * size_t(tidy);
  size_t base_idx = thread_idx * group_size;

  // Threads whose group starts past the end of the input have nothing to do.
  if (base_idx >= size) {
    return;
  }

  auto w_tile = load_vector<group_size, T>(w, thread_idx);
  float scale_dec_b = 0.0f;

  Tx2 amax_2x = Tx2{0.0f, 0.0f};

  // Running absolute maximum over the group, two lanes at a time.
#pragma unroll
  for (int i = 0; i < group_size; i += 2) {
    auto pair = Tx2{w_tile[i], w_tile[i + 1]};
    absmax_x2<Tx2>(amax_2x, amax_2x, pair);
  }

  // Collapse the two lanes into the group's absmax.
  scale_dec_b = static_cast<float>(
      max(fabsf(static_cast<float>(amax_2x.x)),
          fabsf(static_cast<float>(amax_2x.y))));

  scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
  scale_dec_b *= scale_enc;
  // Convert to mx scale or nv scale
  using ScaleType = std::conditional_t<
      use_mx_scale,
      cutlass::float_ue8m0_t,
      cutlass::float_e4m3_t>;
  auto s = ScaleType(scale_dec_b);
  // Raw byte of the rounded scale is what gets stored.
  uint8_t q_scale = s.storage;
  // Encode with the rounded scale so values match what dequantize will see.
  float scale_enc_b = scale_enc / float(s);

  scales[thread_idx] = q_scale;
  constexpr int elem_per_byte = bits == 8 ? 1 : 2;
  AlignedVector<uint8_t, group_size / elem_per_byte> quantized;

  // Convert four inputs per step: 4 bytes of FP8 or 2 bytes of packed FP4.
#pragma unroll
  for (int i = 0; i < group_size / 4; i++) {
    Tx4 w_Tx4 = *reinterpret_cast<Tx4*>(&w_tile[i * 4]);
    if constexpr (bits == 8) {
      uint32_t quantized_val =
          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
      *reinterpret_cast<uint32_t*>(&quantized[i * 4]) = quantized_val;
    } else {
      uint16_t quantized_val =
          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
      *reinterpret_cast<uint16_t*>(&quantized[i * 2]) = quantized_val;
    }
  }
  store_vector<group_size / elem_per_byte>(out, thread_idx, quantized);
}

// Quantization kernel for an input that is not contiguous along its last
// axis. Each thread gathers one group of `group_size` elements along K with a
// stride of M, quantizes it, and stages the packed bytes and the scale in
// shared memory; the whole block then copies the staged tile to the row-major
// outputs.
//
//   w            input values, indexed as w[row + col * M]
//   out          packed quantized bytes, indexed as
//                out[row * (K / elem_per_byte) + col]
//   scales       one byte per group, indexed as
//                scales[row * (K / group_size) + group]
//   size         not referenced in the kernel body
//   M, K         number of rows / columns of the input
//   global_scale optional pointer to a single float used for the global
//                encode scale
//
// The tile constants BLOCK_X / BLOCK_Y below have to agree with the launch
// configuration produced by get_columnwise_quantize_launch_args.
template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR>
__global__ void fp_quantize_columnwise(
    T* w,
    uint8_t* out,
    uint8_t* scales,
    size_t size,
    int M,
    int K,
    float* global_scale = nullptr) {
  // Input: [M, K] with strides [1, M] (M-major)
  // Quantized output: [M, K/elem_per_byte] row-major (K-major)
  // Scales: [M, K/group_size] row-major (K-major)
  // Quantize along K (last dimension, groups of group_size elements)
  const bool use_global_scale = global_scale != nullptr;
  const float scale_enc =
      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;

  using Tx2 = Vector2_t<T>;
  using Tx4 = Vector4_t<T>;
  uint32_t rbits = 0;

  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();

  constexpr int BLOCK_X = 16;
  constexpr int BLOCK_Y = 32;
  constexpr int elem_per_byte = (bits == 8) ? 1 : 2;
  constexpr int bytes_per_group = group_size / elem_per_byte;

  // One block covers BLOCK_X rows and BLOCK_Y groups along K.
  constexpr int rows_per_block = BLOCK_X;
  constexpr int cols_per_block = BLOCK_Y * group_size;
  constexpr int local_cols = cols_per_block / elem_per_byte;
  constexpr int bytes_per_block = rows_per_block * local_cols;

  // Each staged row is padded by SMEM_PAD bytes.
  // NOTE(review): presumably to reduce shared-memory bank conflicts - confirm.
  constexpr int SMEM_PAD = 4;
  constexpr int padded_local_cols = local_cols + SMEM_PAD;

  auto tidx = idx_in_block.x;
  auto tidy = idx_in_block.y;

  // The grid is 1D: decode the block's column-tile and row-tile coordinates.
  int num_col_blocks = (K + cols_per_block - 1) / cols_per_block;
  auto bidx = block_idx.x % num_col_blocks;
  auto bidy = block_idx.x / num_col_blocks;

  T thread_data[group_size];

  // NOTE(review): the uint32_t / uint16_t stores into quantized_smem below
  // assume the array starts at a suitably aligned address - confirm.
  __shared__ uint8_t quantized_smem[rows_per_block * padded_local_cols];
  __shared__ uint8_t scales_smem[BLOCK_X][BLOCK_Y + SMEM_PAD];

  int row_base = bidy * rows_per_block + tidx;
  int col_base = bidx * cols_per_block + tidy * group_size;

  // Only threads whose whole group lies inside the matrix do any work; the
  // rest still reach the barrier below.
  bool valid = (row_base < M) && (col_base + group_size <= K);
  if (valid) {
    // Strided gather of one group along K.
#pragma unroll
    for (int i = 0; i < group_size; i++) {
      auto index = row_base + (col_base + i) * M;
      thread_data[i] = w[index];
    }

    // Compute scale
    Tx2 amax_2x = Tx2{0.0f, 0.0f};
#pragma unroll
    for (int r = 0; r < group_size; r += 2) {
      auto pair = Tx2{thread_data[r], thread_data[r + 1]};
      absmax_x2<Tx2>(amax_2x, amax_2x, pair);
    }
    float scale_dec_b =
        max(fabsf(static_cast<float>(amax_2x.x)),
            fabsf(static_cast<float>(amax_2x.y)));
    scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
    scale_dec_b *= scale_enc;
    // Convert to mx scale or nv scale
    using ScaleType = std::conditional_t<
        use_mx_scale,
        cutlass::float_ue8m0_t,
        cutlass::float_e4m3_t>;
    auto s = ScaleType(scale_dec_b);
    float scale_enc_b = scale_enc / float(s);
    scales_smem[tidx][tidy] = s.storage;

    // Start of this thread's packed group inside the staged tile.
    int shared_idx = tidx * padded_local_cols + tidy * bytes_per_group;

    // Convert four inputs per step: 4 bytes of FP8 or 2 bytes of packed FP4.
#pragma unroll
    for (int j = 0; j < group_size / 4; j++) {
      Tx4 w_Tx4 = *reinterpret_cast<Tx4*>(&thread_data[j * 4]);
      if constexpr (bits == 8) {
        uint32_t quantized_val =
            scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
        *reinterpret_cast<uint32_t*>(&quantized_smem[shared_idx + j * 4]) =
            quantized_val;
      } else {
        uint16_t quantized_val =
            scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
        *reinterpret_cast<uint16_t*>(&quantized_smem[shared_idx + j * 2]) =
            quantized_val;
      }
    }
  }
  // All staged bytes and scales must be visible before the copy-out below.
  __syncthreads();

  int output_cols = K / elem_per_byte;
  int num_groups_per_row = K / group_size;
  // Linear thread id with x fastest; used to spread the copy-out over the
  // whole block.
  int linear_tid = tidx + tidy * BLOCK_X;
  // Write back quantized values
#pragma unroll
  for (int i = linear_tid; i < bytes_per_block; i += BLOCK_X * BLOCK_Y) {
    int local_row = i / local_cols;
    int local_col = i % local_cols;

    int global_row = bidy * rows_per_block + local_row;
    int global_col = bidx * local_cols + local_col;

    if (global_row < M && global_col < output_cols) {
      // Staged rows use the padded pitch; the output rows do not.
      int physical_idx = local_row * padded_local_cols + local_col;
      out[global_row * output_cols + global_col] = quantized_smem[physical_idx];
    }
  }
  // Write back scales
  constexpr int num_scales = BLOCK_X * BLOCK_Y;
#pragma unroll
  for (int i = linear_tid; i < num_scales; i += BLOCK_X * BLOCK_Y) {
    int local_row = i / BLOCK_Y;
    int local_col = i % BLOCK_Y;

    int global_row = bidy * BLOCK_X + local_row;
    int global_col = bidx * BLOCK_Y + local_col;

    if (global_row < M && global_col < num_groups_per_row) {
      scales[global_row * num_groups_per_row + global_col] =
          scales_smem[local_row][local_col];
    }
  }
}

// Dequantization kernel: each thread reads one packed byte of `w` and writes
// the one (bits == 8) or two (bits == 4) decoded elements it contains to
// `out`, multiplied by the scale of the group the elements belong to.
//
//   w            packed quantized bytes
//   scales       one scale byte per group of `group_size` output elements
//   out          decoded output values
//   size         number of output elements
//   global_scale optional pointer to a single float; only consulted when
//                `use_mx_scale` is false
template <typename T, int group_size, int bits, bool use_mx_scale>
__global__ void fp_dequantize(
    const uint8_t* w,
    const uint8_t* scales,
    T* out,
    size_t size,
    float* global_scale = nullptr) {
  // The unpacking below only knows how to split a byte into one 8-bit or two
  // 4-bit elements.
  static_assert(
      bits == 4 || bits == 8,
      "fp_dequantize only supports 4-bit and 8-bit elements.");

  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();

  auto tidx = block_idx.x * block_size.x + idx_in_block.x;
  auto tidy = block_idx.y * block_size.y + idx_in_block.y;

  auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;

  constexpr int pack_factor = bits == 8 ? 1 : 2;
  const bool use_global_scale = global_scale != nullptr;
  // Inverse of the global encode scale used at quantization time.
  const float inv_scale_enc = use_mx_scale
      ? 1.0f
      : (use_global_scale ? (*global_scale) / (F8E4M3_MAX * F4E2M1_MAX) : 1.0f);
  // `offset` indexes packed bytes, `oindex` indexes output elements.
  size_t offset = tidx + grid_dim_x * size_t(tidy);
  size_t oindex = offset * pack_factor;

  if (oindex >= size) {
    return;
  }

  size_t gindex = oindex / group_size;
  using ScaleType = std::conditional_t<
      use_mx_scale,
      cutlass::float_ue8m0_t,
      cutlass::float_e4m3_t>;
  auto scale = float(((ScaleType*)(scales))[gindex]) * inv_scale_enc;

  out += oindex;

  uint32_t val = w[offset];
#pragma unroll
  for (int i = 0; i < pack_factor; i++) {
    uint8_t d;
    if constexpr (bits == 4) {
      // Low nibble first, then high nibble.
      d = (val >> (bits * i)) & 0x0f;
    } else {
      d = val;
    }
    out[i] = static_cast<T>(scale * Dequantize<bits>{}(d));
  }
}

inline std::tuple<dim3, dim3>
get_columnwise_quantize_launch_args(size_t size, int group_size, int M, int K) {
  constexpr int BLOCK_X = 16;
  constexpr int BLOCK_Y = 32;
  int rows_per_block = BLOCK_X;
  int cols_per_block = BLOCK_Y * group_size;

  dim3 grid;
  grid.x =
      cuda::ceil_div(K, cols_per_block) * cuda::ceil_div(M, rows_per_block);
  grid.y = 1;
  grid.z = 1;

  dim3 block(BLOCK_X, BLOCK_Y);

  return std::make_tuple(grid, block);
}

} // namespace cu

// Host-side launcher for the quantize-dequantize round trip: registers the
// inputs/outputs with the command encoder, picks the kernel instantiation
// from (bits, group_size) and enqueues it.
//
//   w            input array
//   what         output array receiving the round-tripped values
//   group_size   elements per scale group; 16 selects the <16, 4, e4m3-scale>
//                kernel when bits != 8
//   bits         8 selects the FP8 kernel, anything else an FP4 kernel
//   global_scale optional single-float array forwarded to the kernel
//
// Throws std::runtime_error for float64 inputs, matching fp_quantize and
// fp_dequantize; without this the call would return without writing `what`.
void fp_quantize_dequantize(
    const array& w,
    array& what,
    int group_size,
    int bits,
    const std::optional<array>& global_scale /* = std::nullopt */,
    cu::CommandEncoder& enc,
    const Stream& s) {
  enc.set_input_array(w);
  if (global_scale.has_value()) {
    enc.set_input_array(global_scale.value());
  }
  enc.set_output_array(what);
  dispatch_float_types(w.dtype(), "fp_quantize_dequantize", [&](auto type_tag) {
    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    if constexpr (!std::is_same_v<T, double>) {
      // Default: 4-bit elements, groups of 32, ue8m0 scales.
      auto kernel = cu::fp_quantize_dequantize<T, 32, 4, true, false>;
      if (bits == 8) {
        kernel = cu::fp_quantize_dequantize<T, 32, 8, true, false>;
      } else if (group_size == 16) {
        kernel = cu::fp_quantize_dequantize<T, 16, 4, false, false>;
      }
      bool large = w.size() > UINT_MAX;
      auto [num_blocks, block_dims] =
          get_launch_args(w.size(), w.shape(), w.strides(), large, group_size);

      enc.add_kernel_node(
          kernel,
          num_blocks,
          block_dims,
          gpu_ptr<T>(w),
          gpu_ptr<T>(what),
          w.size(),
          global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
                                   : nullptr);
    } else {
      throw std::runtime_error(
          "[fp_quantize_dequantize] Can not quantize input with type float64.");
    }
  });
}

void fp_quantize(
    const array& w,
    array& wq,
    array& scales,
    int group_size,
    int bits,
    const std::optional<array>& global_scale /* = std::nullopt */,
    cu::CommandEncoder& enc,
    const Stream& s) {
  enc.set_input_array(w);
  if (global_scale.has_value()) {
    enc.set_input_array(global_scale.value());
  }
  enc.set_output_array(wq);
  enc.set_output_array(scales);
  if (w.strides().back() != 1) {
    dispatch_float_types(w.dtype(), "fp_quantize_columnwise", [&](auto type_tag) {
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      if constexpr (!std::is_same_v<T, double>) {
        auto M = w.shape(-2);
        auto K = w.shape(-1);
        auto kernel = cu::fp_quantize_columnwise<T, 32, 4, true, false>;
        if (bits == 8) {
          kernel = cu::fp_quantize_columnwise<T, 32, 8, true, false>;
        } else if (group_size == 16) {
          kernel = cu::fp_quantize_columnwise<T, 16, 4, false, false>;
        }
        auto [num_blocks, block_dims] =
            cu::get_columnwise_quantize_launch_args(w.size(), group_size, M, K);
        enc.add_kernel_node(
            kernel,
            num_blocks,
            block_dims,
            gpu_ptr<T>(w),
            gpu_ptr<uint8_t>(wq),
            gpu_ptr<uint8_t>(scales),
            w.size(),
            M,
            K,
            global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
                                     : nullptr);
      } else {
        throw std::runtime_error(
            "[Quantize::eval_gpu] Can not quantize input with type float64.");
      }
    });
  } else {
    dispatch_float_types(w.dtype(), "fp_quantize_rowwise", [&](auto type_tag) {
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      if constexpr (!std::is_same_v<T, double>) {
        auto kernel = cu::fp_quantize_rowwise<T, 32, 4, true, false>;
        if (bits == 8) {
          kernel = cu::fp_quantize_rowwise<T, 32, 8, true, false>;
        } else if (group_size == 16) {
          kernel = cu::fp_quantize_rowwise<T, 16, 4, false, false>;
        }
        bool large = w.size() > UINT_MAX;
        auto [num_blocks, block_dims] = get_launch_args(
            w.size(), w.shape(), w.strides(), large, group_size);

        enc.add_kernel_node(
            kernel,
            num_blocks,
            block_dims,
            gpu_ptr<T>(w),
            gpu_ptr<uint8_t>(wq),
            gpu_ptr<uint8_t>(scales),
            w.size(),
            global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
                                     : nullptr);
      } else {
        throw std::runtime_error(
            "[Quantize::eval_gpu] Can not quantize input with type float64.");
      }
    });
  }
}

// Host-side launcher for FP8 / FP4 dequantization. Registers the arrays with
// the command encoder, selects the kernel instantiation from
// (bits, group_size) and enqueues it with one thread per packed byte.
// Throws std::runtime_error when the output dtype is float64.
void fp_dequantize(
    const array& wq,
    const array& scales,
    array& w,
    int group_size,
    int bits,
    const std::optional<array>& global_scale /* = std::nullopt */,
    cu::CommandEncoder& enc,
    const Stream& s) {
  constexpr int uint8_per_uint32 = 4;
  const int packs_per_int = 8 / bits;

  // Number of packed bytes to process.
  const size_t size = w.size() / packs_per_int;
  const bool large = size > UINT_MAX;
  // Launch shape: the output shape with its last axis scaled by 4.
  auto grid_shape = w.shape();
  grid_shape.back() *= uint8_per_uint32;

  enc.set_input_array(wq);
  enc.set_input_array(scales);
  if (global_scale.has_value()) {
    enc.set_input_array(global_scale.value());
  }
  enc.set_output_array(w);

  dispatch_float_types(w.dtype(), "fp_dequantize", [&](auto type_tag) {
    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    if constexpr (std::is_same_v<T, double>) {
      throw std::runtime_error(
          "[Quantize::eval_gpu] Can not dequantize to output with type float64.");
    } else {
      // Default: 4-bit elements, groups of 32, ue8m0 scales.
      auto kernel = cu::fp_dequantize<T, 32, 4, true>;
      if (bits == 8) {
        kernel = cu::fp_dequantize<T, 32, 8, true>;
      } else if (group_size == 16) {
        kernel = cu::fp_dequantize<T, 16, 4, false>;
      }
      auto [grid_dims, block_dims] =
          get_launch_args(size, grid_shape, w.strides(), large);
      enc.add_kernel_node(
          kernel,
          grid_dims,
          block_dims,
          gpu_ptr<uint8_t>(wq),
          gpu_ptr<uint8_t>(scales),
          gpu_ptr<T>(w),
          w.size(),
          global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
                                   : nullptr);
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/mxfp8_quantize.cuh
================================================
#pragma once

#include "mlx/backend/cuda/vector_types.cuh"

#include <cutlass/numeric_conversion.h>

namespace mlx::core::cu {

// Scales four values of type T by `scale` and converts them to four packed
// FP8 E4M3 bytes, returned as one uint32_t.
// Placeholder for a future fast path: `USE_SR` and `rbits` are accepted for
// interface parity with the FP4 converter but are not used here.
template <typename T, bool USE_SR>
__device__ __forceinline__ uint32_t scale_cvt_Tx4_to_fp8x4(
    const Vector4_t<T>& input,
    const float scale,
    uint32_t rbits) {
  using SourceArray = cutlass::Array<T, 4>;

  cutlass::NumericArrayConverter<float, T, 4> to_float;
  cutlass::NumericArrayConverter<cutlass::float_e4m3_t, float, 4> to_fp8;

  const auto* source = reinterpret_cast<const SourceArray*>(&input);
  auto scaled_values = to_float(*source) * scale;
  auto packed = to_fp8(scaled_values);
  return *reinterpret_cast<uint32_t*>(&packed);
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/quantized/no_qqmm_impl.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qqmm_impl.h"

namespace mlx::core {
// Stub used when QQMM support is not built: every call fails with a
// std::runtime_error. All arguments are ignored.
void qqmm_impl(
    cu::CommandEncoder&,
    int,
    int,
    int,
    bool,
    int64_t,
    bool,
    int64_t,
    array&,
    const array&,
    const array&,
    const array&,
    const array&,
    QuantizationMode,
    const GemmScalars&) {
  static constexpr const char* unsupported_message =
      "[QQMatmul::eval_gpu] QQMM is only supported with CUDA 12.8 or higher.";
  throw std::runtime_error(unsupported_message);
}
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/nvfp4_quantize.cuh
================================================
#pragma once

#include "mlx/backend/cuda/vector_types.cuh"

#include <cutlass/numeric_conversion.h>

namespace mlx::core::cu {

using bf16x4 = Vector4_t<__nv_bfloat16>;
using fp16x4 = Vector4_t<__half>;
using f32x4 = Vector4_t<float>;

// Portable FP4 conversion: scales four values of type T by `scale` and
// converts them to FP4 E2M1 with cutlass converters, returning the result
// reinterpreted as a uint16_t.
// Used on architectures without the cvt instructions and on CUDA versions
// without FP4 support (< 12.8).
template <typename T>
__device__ __forceinline__ uint16_t
scale_cvt_Tx4_to_fp4x4_fallback(const Vector4_t<T>& input, const float scale) {
  using SourceArray = cutlass::Array<T, 4>;

  cutlass::NumericArrayConverter<float, T, 4> to_float;
  cutlass::NumericArrayConverter<cutlass::float_e2m1_t, float, 4> to_fp4;

  const auto* source = reinterpret_cast<const SourceArray*>(&input);
  auto scaled_values = to_float(*source) * scale;
  auto packed = to_fp4(scaled_values);
  return *reinterpret_cast<uint16_t*>(&packed);
}

// Fast-path helpers built on inline PTX. This guard must be identical to the
// one used inside scale_cvt_Tx4_to_fp4x4, which is the only caller of
// scale_cvt_Tx4_to_fp4x4_fast; otherwise a build that satisfies the caller's
// condition but not this one fails with an undeclared identifier.
#if (CUDART_VERSION >= 12080) && (__CUDA_ARCH__ >= 1000) && \
    (__CUDA_ARCH_FAMILY_SPECIFIC__ >= 1000)

// Converts four bf16 values to four packed FP4 E2M1 values using
// round-to-nearest (cvt.rn.satfinite). Each input is widened to f32 and
// multiplied by the matching lane of `scale` before conversion.
//
//   input_bf16x4  four bf16 values, reinterpreted as one 64-bit operand
//   scale         two f32 scale factors, applied to each pair of inputs
//
// Returns the two FP4x2 bytes packed into a uint16_t (first pair in the low
// byte).
__device__ __forceinline__ uint16_t
scale_cvt_bf16x4_to_fp4x4_rn(const bf16x4 input_bf16x4, const float2 scale) {
  uint16_t out_fp4x4 = 0;
  asm volatile(
      "{\n"
      ".reg.b16 x0_bf16; \n\t" // first bf16
      ".reg.b16 x1_bf16; \n\t" // second bf16
      ".reg.b16 x2_bf16; \n\t" // third bf16
      ".reg.b16 x3_bf16; \n\t" // fourth bf16
      ".reg.b32 x0; \n\t" // to hold scaled first
      ".reg.b32 x1; \n\t" // to hold scaled second
      ".reg.b32 x2; \n\t" // to hold scaled third
      ".reg.b32 x3; \n\t" // to hold scaled fourth
      ".reg.b64 x01; \n\t" // to hold vector mul
      ".reg.b64 x23; \n\t"
      ".reg.b8 q0; \n\t" // output byte fp4x2 (first pair)
      ".reg.b8 q1; \n\t" // output byte fp4x2 (second pair)
      "mov.b64 {x0_bf16, x1_bf16, x2_bf16, x3_bf16} , %1; \n\t" // unpack bf16
      "cvt.f32.bf16 x0, x0_bf16; \n\t" // convert to f32
      "cvt.f32.bf16 x1, x1_bf16; \n\t"
      "cvt.f32.bf16 x2, x2_bf16; \n\t"
      "cvt.f32.bf16 x3, x3_bf16; \n\t"
      "mov.b64 x01, {x0, x1}; \n\t"
      "mul.f32x2 x01, x01, %2; \n\t" // scale first pair
      "mov.b64 x23, {x2, x3}; \n\t"
      "mul.f32x2 x23, x23, %2; \n\t" // scale second pair
      "mov.b64 {x0, x1}, x01; \n\t"
      "mov.b64 {x2, x3}, x23; \n\t"
      "cvt.rn.satfinite.e2m1x2.f32 q0, x1, x0; \n\t" // convert to fp4x2 first
                                                     // pair
      "cvt.rn.satfinite.e2m1x2.f32 q1, x3, x2; \n\t" // convert to fp4x2 second
                                                     // pair
      "mov.b16 %0, {q0, q1}; \n\t" // pack to output
      "}"
      : "=h"(out_fp4x4)
      : "l"(reinterpret_cast<const uint64_t&>(input_bf16x4)),
        "l"(reinterpret_cast<const uint64_t&>(
            scale))); // the cast is needed because an asm operand must have
                      // scalar type
  return out_fp4x4;
}

// Stochastic-rounding variant of the bf16 -> FP4 conversion: the four bf16
// inputs are widened to f32, multiplied by `scale`, and converted together
// with a single cvt.rs.satfinite.e2m1x4 instruction.
//
//   input_bf16x4  four bf16 values, reinterpreted as one 64-bit operand
//   scale         two f32 scale factors, applied to each pair of inputs
//   rbits         32-bit operand passed to the cvt.rs instruction
//
// Returns the four FP4 values packed into a uint16_t.
__device__ __forceinline__ uint16_t scale_cvt_bf16x4_to_fp4x4_rs(
    const bf16x4 input_bf16x4,
    const float2 scale,
    uint32_t rbits) {
  uint16_t out_fp4x4 = 0;
  asm volatile(
      "{\n"
      ".reg.b16 x0_bf16; \n\t"
      ".reg.b16 x1_bf16; \n\t"
      ".reg.b16 x2_bf16; \n\t"
      ".reg.b16 x3_bf16; \n\t"
      ".reg.b32 x0; \n\t"
      ".reg.b32 x1; \n\t"
      ".reg.b32 x2; \n\t"
      ".reg.b32 x3; \n\t"
      ".reg.b64 x01; \n\t"
      ".reg.b64 x23; \n\t"
      ".reg.b16 q0; \n\t"
      "mov.b64 {x0_bf16, x1_bf16, x2_bf16, x3_bf16} , %1; \n\t" // unpack bf16
      "cvt.f32.bf16 x0, x0_bf16; \n\t" // widen to f32
      "cvt.f32.bf16 x1, x1_bf16; \n\t"
      "cvt.f32.bf16 x2, x2_bf16; \n\t"
      "cvt.f32.bf16 x3, x3_bf16; \n\t"
      "mov.b64 x01, {x0, x1}; \n\t"
      "mul.f32x2 x01, x01, %2; \n\t" // scale first pair
      "mov.b64 x23, {x2, x3}; \n\t"
      "mul.f32x2 x23, x23, %2; \n\t" // scale second pair
      "mov.b64 {x0, x1}, x01; \n\t"
      "mov.b64 {x2, x3}, x23; \n\t"
      "cvt.rs.satfinite.e2m1x4.f32 q0, {x3, x2, x1, x0}, %3; \n\t"
      "}"
      : "=h"(out_fp4x4)
      : "l"(reinterpret_cast<const uint64_t&>(input_bf16x4)),
        "l"(reinterpret_cast<const uint64_t&>(scale)),
        "r"(rbits));
  return out_fp4x4;
}

// Converts four f32 values (given as two float2 pairs) to four packed FP4
// E2M1 values using round-to-nearest (cvt.rn.satfinite), after multiplying
// each pair by `scale`.
//
//   input_fp32x2_0  first pair of inputs
//   input_fp32x2_1  second pair of inputs
//   scale           two f32 scale factors, applied to each pair
//
// Returns the two FP4x2 bytes packed into a uint16_t (first pair in the low
// byte).
__device__ __forceinline__ uint16_t scale_cvt_fp32x4_to_fp4x4_rn(
    const float2 input_fp32x2_0,
    const float2 input_fp32x2_1,
    const float2 scale) {
  uint16_t out_fp4x4 = 0;
  asm volatile(
      "{\n"
      ".reg.b32 x0; \n\t"
      ".reg.b32 x1; \n\t"
      ".reg.b32 x2; \n\t"
      ".reg.b32 x3; \n\t"
      ".reg.b64 x01; \n\t"
      ".reg.b64 x23; \n\t"
      ".reg.b8 q0; \n\t"
      ".reg.b8 q1; \n\t"
      "mov.b64 x01, {%1, %2}; \n\t"
      "mul.f32x2 x01, x01, %5; \n\t" // scale first pair
      "mov.b64 x23, {%3, %4}; \n\t"
      "mul.f32x2 x23, x23, %5; \n\t" // scale second pair
      "mov.b64 {x0, x1}, x01; \n\t"
      "mov.b64 {x2, x3}, x23; \n\t"
      "cvt.rn.satfinite.e2m1x2.f32 q0, x1, x0; \n\t" // first pair -> fp4x2
      "cvt.rn.satfinite.e2m1x2.f32 q1, x3, x2; \n\t" // second pair -> fp4x2
      "mov.b16 %0, {q0, q1}; \n\t" // pack to output
      "}"
      : "=h"(out_fp4x4)
      : "f"(input_fp32x2_0.x),
        "f"(input_fp32x2_0.y),
        "f"(input_fp32x2_1.x),
        "f"(input_fp32x2_1.y),
        "l"(reinterpret_cast<const uint64_t&>(scale)));
  return out_fp4x4;
}

// Stochastic-rounding variant of the f32 -> FP4 conversion: both input pairs
// are multiplied by `scale` and converted together with a single
// cvt.rs.satfinite.e2m1x4 instruction.
//
//   input_fp32x2_0  first pair of inputs
//   input_fp32x2_1  second pair of inputs
//   scale           two f32 scale factors, applied to each pair
//   rbits           32-bit operand passed to the cvt.rs instruction
//
// Returns the four FP4 values packed into a uint16_t.
__device__ __forceinline__ uint16_t scale_cvt_fp32x4_to_fp4x4_rs(
    const float2 input_fp32x2_0,
    const float2 input_fp32x2_1,
    const float2 scale,
    uint32_t rbits) {
  uint16_t out_fp4x4 = 0;
  asm volatile(
      "{\n"
      ".reg.b32 x0; \n\t"
      ".reg.b32 x1; \n\t"
      ".reg.b32 x2; \n\t"
      ".reg.b32 x3; \n\t"
      ".reg.b64 x01; \n\t"
      ".reg.b64 x23; \n\t"
      ".reg.b16 q0; \n\t"
      "mov.b64 x01, {%1, %2}; \n\t"
      "mul.f32x2 x01, x01, %5; \n\t" // scale first pair
      "mov.b64 x23, {%3, %4}; \n\t"
      "mul.f32x2 x23, x23, %5; \n\t" // scale second pair
      "mov.b64 {x0, x1}, x01; \n\t"
      "mov.b64 {x2, x3}, x23; \n\t"
      "cvt.rs.satfinite.e2m1x4.f32 q0, {x3, x2, x1, x0}, %6; \n\t"
      "}"
      : "=h"(out_fp4x4)
      : "f"(input_fp32x2_0.x),
        "f"(input_fp32x2_0.y),
        "f"(input_fp32x2_1.x),
        "f"(input_fp32x2_1.y),
        "l"(reinterpret_cast<const uint64_t&>(scale)),
        "r"(rbits));
  return out_fp4x4;
}

// Converts four fp16 values to four packed FP4 E2M1 values using
// round-to-nearest (cvt.rn.satfinite). Each input is widened to f32 and
// multiplied by the matching lane of `scale` before conversion.
//
//   input_fp16x4  four fp16 values, reinterpreted as one 64-bit operand
//   scale         two f32 scale factors, applied to each pair of inputs
//
// Returns the two FP4x2 bytes packed into a uint16_t (first pair in the low
// byte).
__device__ __forceinline__ uint16_t
scale_cvt_fp16x4_to_fp4x4_rn(const fp16x4 input_fp16x4, const float2 scale) {
  uint16_t out_fp4x4 = 0;
  asm volatile(
      "{\n"
      ".reg.b16 x0_fp16; \n\t"
      ".reg.b16 x1_fp16; \n\t"
      ".reg.b16 x2_fp16; \n\t"
      ".reg.b16 x3_fp16; \n\t"
      ".reg.b32 x0; \n\t"
      ".reg.b32 x1; \n\t"
      ".reg.b32 x2; \n\t"
      ".reg.b32 x3; \n\t"
      ".reg.b64 x01; \n\t"
      ".reg.b64 x23; \n\t"
      ".reg.b8 q0; \n\t"
      ".reg.b8 q1; \n\t"
      "mov.b64 {x0_fp16, x1_fp16, x2_fp16, x3_fp16} , %1; \n\t" // unpack fp16
      "cvt.f32.f16 x0, x0_fp16; \n\t" // widen to f32
      "cvt.f32.f16 x1, x1_fp16; \n\t"
      "cvt.f32.f16 x2, x2_fp16; \n\t"
      "cvt.f32.f16 x3, x3_fp16; \n\t"
      "mov.b64 x01, {x0, x1}; \n\t"
      "mul.f32x2 x01, x01, %2; \n\t" // scale first pair
      "mov.b64 x23, {x2, x3}; \n\t"
      "mul.f32x2 x23, x23, %2; \n\t" // scale second pair
      "mov.b64 {x0, x1}, x01; \n\t"
      "mov.b64 {x2, x3}, x23; \n\t"
      "cvt.rn.satfinite.e2m1x2.f32 q0, x1, x0; \n\t" // first pair -> fp4x2
      "cvt.rn.satfinite.e2m1x2.f32 q1, x3, x2; \n\t" // second pair -> fp4x2
      "mov.b16 %0, {q0, q1}; \n\t" // pack to output
      "}"
      : "=h"(out_fp4x4)
      : "l"(reinterpret_cast<const uint64_t&>(input_fp16x4)),
        "l"(reinterpret_cast<const uint64_t&>(scale)));
  return out_fp4x4;
}

// Stochastic-rounding variant of the fp16 -> FP4 conversion: the four fp16
// inputs are widened to f32, multiplied by `scale`, and converted together
// with a single cvt.rs.satfinite.e2m1x4 instruction.
//
//   input_fp16x4  four fp16 values, reinterpreted as one 64-bit operand
//   scale         two f32 scale factors, applied to each pair of inputs
//   rbits         32-bit operand passed to the cvt.rs instruction
//
// Returns the four FP4 values packed into a uint16_t.
__device__ __forceinline__ uint16_t scale_cvt_fp16x4_to_fp4x4_rs(
    const fp16x4 input_fp16x4,
    const float2 scale,
    uint32_t rbits) {
  uint16_t out_fp4x4 = 0;
  asm volatile(
      "{\n"
      ".reg.b16 x0_fp16; \n\t"
      ".reg.b16 x1_fp16; \n\t"
      ".reg.b16 x2_fp16; \n\t"
      ".reg.b16 x3_fp16; \n\t"
      ".reg.b32 x0; \n\t"
      ".reg.b32 x1; \n\t"
      ".reg.b32 x2; \n\t"
      ".reg.b32 x3; \n\t"
      ".reg.b64 x01; \n\t"
      ".reg.b64 x23; \n\t"
      ".reg.b16 q0; \n\t"
      "mov.b64 {x0_fp16, x1_fp16, x2_fp16, x3_fp16} , %1; \n\t" // unpack fp16
      "cvt.f32.f16 x0, x0_fp16; \n\t" // widen to f32
      "cvt.f32.f16 x1, x1_fp16; \n\t"
      "cvt.f32.f16 x2, x2_fp16; \n\t"
      "cvt.f32.f16 x3, x3_fp16; \n\t"
      "mov.b64 x01, {x0, x1}; \n\t"
      "mul.f32x2 x01, x01, %2; \n\t" // scale first pair
      "mov.b64 x23, {x2, x3}; \n\t"
      "mul.f32x2 x23, x23, %2; \n\t" // scale second pair
      "mov.b64 {x0, x1}, x01; \n\t"
      "mov.b64 {x2, x3}, x23; \n\t"
      "cvt.rs.satfinite.e2m1x4.f32 q0, {x3, x2, x1, x0}, %3; \n\t"
      "}"
      : "=h"(out_fp4x4)
      : "l"(reinterpret_cast<const uint64_t&>(input_fp16x4)),
        "l"(reinterpret_cast<const uint64_t&>(scale)),
        "r"(rbits));
  return out_fp4x4;
}

// bf16 -> FP4 conversion with a scalar scale. The scale is broadcast to both
// lanes of a float2, then the stochastic-rounding (_rs) or round-to-nearest
// (_rn) helper is called depending on USE_SR. `rbits` is only forwarded to
// the _rs helper.
template <bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_bf16x4_to_fp4x4(
    const bf16x4 input,
    const float scale,
    uint32_t rbits) {
  const float2 scale_x2 = make_float2(scale, scale);
  if constexpr (!USE_SR) {
    return scale_cvt_bf16x4_to_fp4x4_rn(input, scale_x2);
  } else {
    return scale_cvt_bf16x4_to_fp4x4_rs(input, scale_x2, rbits);
  }
}

// fp16 -> FP4 conversion with a scalar scale. The scale is broadcast to both
// lanes of a float2, then the stochastic-rounding (_rs) or round-to-nearest
// (_rn) helper is called depending on USE_SR. `rbits` is only forwarded to
// the _rs helper.
template <bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_fp16x4_to_fp4x4(
    const fp16x4 input,
    const float scale,
    uint32_t rbits) {
  const float2 scale_x2 = make_float2(scale, scale);
  if constexpr (!USE_SR) {
    return scale_cvt_fp16x4_to_fp4x4_rn(input, scale_x2);
  } else {
    return scale_cvt_fp16x4_to_fp4x4_rs(input, scale_x2, rbits);
  }
}

// f32 -> FP4 conversion with a scalar scale. The four inputs are split into
// an (x, y) and a (z, w) pair and the scale is broadcast to a float2, then
// the stochastic-rounding (_rs) or round-to-nearest (_rn) helper is called
// depending on USE_SR. `rbits` is only forwarded to the _rs helper.
template <bool USE_SR>
__device__ __forceinline__ uint16_t
scale_cvt_f32x4_to_fp4x4(const f32x4 input, const float scale, uint32_t rbits) {
  const float2 scale_x2 = make_float2(scale, scale);
  const float2 first_pair = make_float2(input.x, input.y);
  const float2 second_pair = make_float2(input.z, input.w);

  if constexpr (!USE_SR) {
    return scale_cvt_fp32x4_to_fp4x4_rn(first_pair, second_pair, scale_x2);
  } else {
    return scale_cvt_fp32x4_to_fp4x4_rs(
        first_pair, second_pair, scale_x2, rbits);
  }
}

// Type dispatcher for the PTX fast path: __nv_bfloat16 and __half go to their
// dedicated converters, every other T is handled by the f32 converter.
template <typename T, bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_Tx4_to_fp4x4_fast(
    const Vector4_t<T> input,
    const float scale,
    uint32_t rbits) {
  constexpr bool is_bf16 = std::is_same<T, __nv_bfloat16>::value;
  constexpr bool is_fp16 = std::is_same<T, __half>::value;

  if constexpr (is_bf16) {
    return scale_cvt_bf16x4_to_fp4x4<USE_SR>(input, scale, rbits);
  } else if constexpr (is_fp16) {
    return scale_cvt_fp16x4_to_fp4x4<USE_SR>(input, scale, rbits);
  } else {
    return scale_cvt_f32x4_to_fp4x4<USE_SR>(input, scale, rbits);
  }
}
#endif // (CUDART_VERSION >= 12080) && (__CUDA_ARCH__ >= 1000) &&
       // (__CUDA_ARCH_FAMILY_SPECIFIC__ >= 1000)

// Entry point for converting four values of type T to four packed FP4 values
// after multiplying by `scale`.
//
// When the preprocessor condition below holds, the PTX fast path
// (scale_cvt_Tx4_to_fp4x4_fast) is used and `rbits` is forwarded to it.
// Otherwise the cutlass-based fallback is used; it has no stochastic rounding,
// so instantiating with USE_SR == true is rejected at compile time.
//
// NOTE(review): the condition here must stay in sync with the #if that guards
// the definition of scale_cvt_Tx4_to_fp4x4_fast earlier in this header.
template <typename T, bool USE_SR>
__device__ __forceinline__ uint16_t scale_cvt_Tx4_to_fp4x4(
    const Vector4_t<T>& input,
    const float scale,
    uint32_t rbits) {
#if (CUDART_VERSION >= 12080) && (__CUDA_ARCH__ >= 1000) && \
    (__CUDA_ARCH_FAMILY_SPECIFIC__ >= 1000)
  return scale_cvt_Tx4_to_fp4x4_fast<T, USE_SR>(input, scale, rbits);
#else
  static_assert(
      !USE_SR,
      "Stochastic rounding (USE_SR=true) requires CUDA >= 12.8 and compute capability >= 1000.");
  return scale_cvt_Tx4_to_fp4x4_fallback(input, scale);
#endif
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/quantized/qmm/CMakeLists.txt
================================================
# Add the quantized matmul / matvec sources in this directory to the mlx
# target as private sources.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/qmm.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/fp_qmv.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm80_m16.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm80_m32.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm80_m64.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n16_m1.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n32_m1.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n64_m2.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n128_m2.cu
          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n256_m2.cu)


================================================
FILE: mlx/backend/cuda/quantized/qmm/fp_qmv.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/quantized.h"
#include "mlx/backend/cuda/device/utils.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/quantized/qmm/qmm.h"
#include "mlx/backend/cuda/quantized/quantized_utils.h"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cutlass/float8.h>
#include <cutlass/numeric_conversion.h>

namespace mlx::core {

constexpr int rows_per_block = 8;

namespace cu {

namespace cg = cooperative_groups;

// Advances the x / w / scales / y pointers (passed by reference) to the batch
// element handled by this block. The batch index is the grid's z block index.
//
//   x, w, scales, y   pointers to offset in place
//   output_stride     number of output elements per batch entry
//   x_batch_ndims     number of batch dimensions of x; with exactly one, the
//                     offset is idx * x_strides[0], otherwise it is computed
//                     with elem_to_loc
//   w_batch_ndims     same for w and scales, which share the batch shape
//                     `w_shape` but have separate strides
template <typename T>
__device__ void adjust_matrix_offsets(
    const T*& x,
    const uint32_t*& w,
    const uint8_t*& scales,
    T*& y,
    int output_stride,
    const int& x_batch_ndims,
    const Shape x_shape,
    const Strides x_strides,
    const int& w_batch_ndims,
    const Shape w_shape,
    const Strides w_strides,
    const Strides s_strides) {
  const uint32_t idx = cg::this_grid().block_index().z;
  if (x_batch_ndims == 1) {
    x += idx * x_strides[0];
  } else {
    x += elem_to_loc(idx, x_shape.data(), x_strides.data(), x_batch_ndims);
  }
  if (w_batch_ndims == 1) {
    w += idx * w_strides[0];
    scales += idx * s_strides[0];
  } else {
    auto [w_idx, s_idx] = elem_to_loc(
        idx, w_shape.data(), w_strides.data(), s_strides.data(), w_batch_ndims);
    w += w_idx;
    scales += s_idx;
  }
  // Widen before multiplying: uint32_t * int is evaluated in 32 bits and
  // would wrap once the output offset reaches 2^32 elements.
  y += static_cast<int64_t>(idx) * output_stride;
}

// Device-side body shared by the fp quantized matrix-vector kernels.
//
// Every warp produces one element of the output: the dot product between row
// `row` of the packed matrix `mat` and the input vector selected by the block's
// x index. The warp's threads walk the packed columns in steps of
// `WARP_SIZE * n_per_thread`, accumulate in float, and the partial sums are
// combined with a warp reduction; thread rank 0 writes the result.
//
// Template parameters:
//   T              - element type of `vec` and `out`.
//   rows_per_block - matrix rows handled per block (one per thread y index).
//   n_per_thread   - number of packed uint32_t words each thread loads per step.
//   bits           - 8: four e4m3 values per word; otherwise eight e2m1 values.
//   group_size     - number of consecutive values that share one scale.
//   use_mx_scale   - scales are read as float_ue8m0_t when true, float_e4m3_t
//                    otherwise.
//
// Arguments:
//   mat     - packed quantized matrix, `rows` x (`cols` / values-per-word).
//   scales_ - raw scale bytes, `cols / group_size` per matrix row.
//   vec     - input vectors, `cols` elements each, indexed by block x.
//   out     - output, `rows` elements per input vector.
//
// NOTE(review): assumes the block's x extent equals WARP_SIZE so that
// `warp.thread_rank()` matches the column lane — confirm against the launcher.
template <
    typename T,
    int rows_per_block,
    int n_per_thread,
    int bits,
    int group_size,
    bool use_mx_scale>
__device__ void fp_qmv_impl(
    const uint32_t* mat,
    const uint8_t* scales_,
    const T* vec,
    T* out,
    int rows,
    int cols) {
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  // Quantized values packed into one uint32_t word of `mat`.
  constexpr int vals_per_item = bits == 8 ? 4 : 8;
  // Unpacked values consumed by one thread per loop step.
  constexpr int nv_per_thread = vals_per_item * n_per_thread;
  auto g_idx = block.group_index();
  auto t_idx = block.thread_index();
  int row = g_idx.y * rows_per_block + t_idx.y;

  // Block x selects which input vector / output vector this block works on.
  vec += g_idx.x * cols;
  out += g_idx.x * rows;

  using ScaleType = std::conditional_t<
      use_mx_scale,
      cutlass::float_ue8m0_t,
      cutlass::float_e4m3_t>;
  auto scales = (ScaleType*)(scales_);
  auto packed_cols = cols / vals_per_item;

  // Rows past the end of the matrix (last block in y) do nothing.
  if (row < rows) {
    // Scales covered by one thread's load (at least one).
    constexpr int scales_per_step = std::max(nv_per_thread / group_size, 1);
    // Scales covered by the whole warp in one loop step.
    constexpr int scale_step = (WARP_SIZE * nv_per_thread) / group_size;
    // Packed words that share a single scale within one thread's load.
    constexpr int n_per_step = n_per_thread / scales_per_step;
    // Offset scales to correct row
    scales += row * (cols / group_size) +
        (warp.thread_rank() * nv_per_thread) / group_size;
    float sum = 0.0f;
    for (int col = n_per_thread * warp.thread_rank(); col < packed_cols;
         col += (WARP_SIZE * n_per_thread)) {
      auto local_vec =
          unsafe_load_vector<nv_per_thread>(vec + vals_per_item * col, 0);
      auto local_mat =
          unsafe_load_vector<n_per_thread>(mat + row * packed_cols + col, 0);
#pragma unroll
      for (int i = 0; i < scales_per_step; ++i) {
        // Two partial sums are kept and combined before scaling.
        float2 local_sum = {0.0f, 0.0f};
#pragma unroll
        for (int j = 0; j < n_per_step; ++j) {
          int k = n_per_step * i + j;
          if constexpr (bits == 8) {
            // Reinterpret the packed word as four e4m3 values and widen them.
            cutlass::NumericArrayConverter<float, cutlass::float_e4m3_t, 4>
                converter;
            auto v = converter(
                *reinterpret_cast<cutlass::Array<cutlass::float_e4m3_t, 4>*>(
                    &local_mat[k]));
            local_sum.x +=
                v[0] * static_cast<float>(local_vec[vals_per_item * k]);
            local_sum.x +=
                v[1] * static_cast<float>(local_vec[vals_per_item * k + 1]);
            local_sum.y +=
                v[2] * static_cast<float>(local_vec[vals_per_item * k + 2]);
            local_sum.y +=
                v[3] * static_cast<float>(local_vec[vals_per_item * k + 3]);
          } else {
            // Reinterpret the packed word as eight e2m1 values and widen them.
            cutlass::NumericArrayConverter<float, cutlass::float_e2m1_t, 8>
                converter;
            auto v = converter(
                *reinterpret_cast<cutlass::Array<cutlass::float_e2m1_t, 8>*>(
                    &local_mat[k]));
            local_sum.x +=
                v[0] * static_cast<float>(local_vec[vals_per_item * k]);
            local_sum.y +=
                v[1] * static_cast<float>(local_vec[vals_per_item * k + 1]);
            local_sum.x +=
                v[2] * static_cast<float>(local_vec[vals_per_item * k + 2]);
            local_sum.y +=
                v[3] * static_cast<float>(local_vec[vals_per_item * k + 3]);
            local_sum.x +=
                v[4] * static_cast<float>(local_vec[vals_per_item * k + 4]);
            local_sum.y +=
                v[5] * static_cast<float>(local_vec[vals_per_item * k + 5]);
            local_sum.x +=
                v[6] * static_cast<float>(local_vec[vals_per_item * k + 6]);
            local_sum.y +=
                v[7] * static_cast<float>(local_vec[vals_per_item * k + 7]);
          }
        }
        // The scale is applied once per group rather than per value.
        sum += (local_sum.x + local_sum.y) * float(scales[i]);
      }
      scales += scale_step;
    }

    // Combine the per-thread partial sums; only rank 0 stores the row result.
    sum = cg::reduce(warp, sum, cg::plus<float>{});
    if (warp.thread_rank() == 0) {
      out[row] = static_cast<T>(sum);
    }
  }
}

// Kernel entry point for the non-batched case: forwards its arguments
// unchanged to fp_qmv_impl. The template parameters have the same meaning as
// on fp_qmv_impl.
template <
    typename T,
    int rows_per_block,
    int n_per_thread,
    int bits,
    int group_size,
    bool use_mx_scale>
__global__ void fp_qmv_single(
    const uint32_t* mat,
    const uint8_t* scales,
    const T* vec,
    T* out,
    int rows,
    int cols) {
  fp_qmv_impl<T, rows_per_block, n_per_thread, bits, group_size, use_mx_scale>(
      mat, scales, vec, out, rows, cols);
}

// Kernel entry point for the batched case.
//
// Before running fp_qmv_impl, the operand pointers are moved to the batch
// element selected by the block's z index, using the batch shapes and strides
// passed as grid constants. `vec_batch_ndims` / `mat_batch_ndims` give the
// number of leading batch dimensions of the vector and matrix operands.
template <
    typename T,
    int rows_per_block,
    int n_per_thread,
    int bits,
    int group_size,
    bool use_mx_scale>
__global__ void fp_qmv_batched(
    const uint32_t* mat,
    const uint8_t* scales,
    const T* vec,
    T* out,
    int rows,
    int cols,
    int vec_batch_ndims,
    const __grid_constant__ Shape vec_shape,
    const __grid_constant__ Strides vec_strides,
    int mat_batch_ndims,
    const __grid_constant__ Shape mat_shape,
    const __grid_constant__ Strides mat_strides,
    const __grid_constant__ Strides scales_strides) {
  // The output stride per batch is rows times the first non-batch dimension
  // of the vector operand (vec_shape[vec_batch_ndims]).
  adjust_matrix_offsets<T>(
      vec,
      mat,
      scales,
      out,
      rows * vec_shape[vec_batch_ndims],
      vec_batch_ndims,
      vec_shape,
      vec_strides,
      mat_batch_ndims,
      mat_shape,
      mat_strides,
      scales_strides);
  fp_qmv_impl<T, rows_per_block, n_per_thread, bits, group_size, use_mx_scale>(
      mat, scales, vec, out, rows, cols);
}

} // namespace cu

// Turns a runtime value of 1, 2 or 4 into a compile-time constant by calling
// `f` with the matching std::integral_constant. Any other value is ignored and
// `f` is not called.
template <typename F>
void dispatch_1_2_4(int n, F&& f) {
  if (n == 1) {
    f(std::integral_constant<int, 1>{});
  } else if (n == 2) {
    f(std::integral_constant<int, 2>{});
  } else if (n == 4) {
    f(std::integral_constant<int, 4>{});
  }
}

// Encodes a floating-point quantized matrix-vector product on `encoder`.
//
// x        - input vectors; the last dimension is the reduction size K.
// w        - packed quantized weights.
// scales_  - per-group scales for `w`.
// out      - output array; its last dimension is N.
// bits     - 8 selects the 8-bit kernel; other values use the 4-bit kernels.
// group_size - 16 selects the 4-bit kernel with e4m3 scales; otherwise the
//              group-size-32 kernels with ue8m0 scales are used.
//
// When out has a single batch (B == 1) all vectors of `x` are spread over the
// grid's x dimension; otherwise the batched kernel is launched with B blocks in
// z and the batch shapes/strides passed as kernel constants. Nothing is
// launched for a double output type.
void fp_qmv(
    const array& x,
    const array& w,
    const array& scales_,
    array& out,
    int bits,
    int group_size,
    cu::CommandEncoder& encoder,
    Stream s) {
  uint32_t M = x.shape(-2);
  uint32_t N = out.shape(-1);
  uint32_t K = x.shape(-1);
  uint32_t B = out.size() / (M * N);

  // Make sure the last two dims of x and w, s, b are contiguous. This should
  // be relaxed for x.
  array vec = ensure_row_contiguous_matrix(x, encoder, s);
  array mat = ensure_row_contiguous_matrix(w, encoder, s);
  array scales = ensure_row_contiguous_matrix(scales_, encoder, s);

  encoder.set_input_array(mat);
  encoder.set_input_array(scales);
  encoder.set_input_array(vec);
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "qmv", [&](auto type_tag) {
    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    if constexpr (!std::is_same_v<T, double>) {
      // One warp per output row, rows_per_block rows per block.
      dim3 block_dims{WARP_SIZE, rows_per_block};
      uint32_t blocks_y = (N + rows_per_block - 1) / rows_per_block;
      const uint32_t* mat_ptr = gpu_ptr<uint32_t>(mat);
      const T* vec_ptr = gpu_ptr<T>(vec);
      // Pick how many packed words each thread loads at once (1, 2 or 4)
      // from the pointer alignments and, for 4, the divisibility of K.
      // NOTE(review): in both conditions the `bits == 4 && is_aligned<...>`
      // term looks subsumed by the second term if the stricter alignment
      // implies the weaker one — confirm the intended condition.
      int n = 1;
      if (K % 32 == 0 && cu::is_aligned<4>(mat_ptr) &&
          ((bits == 4 && cu::is_aligned<8>(vec_ptr)) ||
           cu::is_aligned<4>(vec_ptr))) {
        n = 4;
      } else if (
          cu::is_aligned<2>(mat_ptr) &&
          ((bits == 4 && cu::is_aligned<4>(vec_ptr)) ||
           cu::is_aligned<2>(vec_ptr))) {
        n = 2;
      }
      dispatch_1_2_4(n, [&](auto n) {
        if (B == 1) {
          // Default to the 4-bit / group 32 kernel, then override for the
          // 8-bit or group-size-16 variants. All share one function type.
          auto kernel =
              cu::fp_qmv_single<T, rows_per_block, n.value, 4, 32, true>;
          if (bits == 8) {
            kernel = cu::fp_qmv_single<T, rows_per_block, n.value, 8, 32, true>;
          } else if (group_size == 16) {
            kernel =
                cu::fp_qmv_single<T, rows_per_block, n.value, 4, 16, false>;
          }
          encoder.add_kernel_node(
              kernel,
              {uint32_t(x.size() / K), blocks_y},
              block_dims,
              mat_ptr,
              gpu_ptr<uint8_t>(scales),
              vec_ptr,
              gpu_ptr<T>(out),
              N,
              K);
        } else {
          auto kernel =
              cu::fp_qmv_batched<T, rows_per_block, n.value, 4, 32, true>;
          if (bits == 8) {
            kernel =
                cu::fp_qmv_batched<T, rows_per_block, n.value, 8, 32, true>;
          } else if (group_size == 16) {
            kernel =
                cu::fp_qmv_batched<T, rows_per_block, n.value, 4, 16, false>;
          }
          // Grid: x over the M vectors of one batch, y over row blocks, z over
          // batches. The last two dims of each operand are not batch dims.
          encoder.add_kernel_node(
              kernel,
              {M, blocks_y, B},
              block_dims,
              mat_ptr,
              gpu_ptr<uint8_t>(scales),
              vec_ptr,
              gpu_ptr<T>(out),
              N,
              K,
              vec.ndim() - 2,
              const_param(vec.shape()),
              const_param(vec.strides()),
              mat.ndim() - 2,
              const_param(mat.shape()),
              const_param(mat.strides()),
              const_param(scales.strides()));
        }
      });
    }
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm.h"

#include <cute/tensor.hpp>

namespace mlx::core {

#if defined(MLX_CUDA_SM90A_ENABLED)
// Defined in qmm_impl_sm90_xxx.cu files.
// Forward declaration of the Hopper implementation, parameterized on the tile
// shape and the cluster shape. It is only declared when the build defines
// MLX_CUDA_SM90A_ENABLED.
template <typename TileShape, typename ClusterShape>
void qmm_impl_sm90(
    const array& x,
    const array& w,
    const array& scales,
    const array& biases,
    array& out,
    int bits,
    int group_size,
    cu::CommandEncoder& encoder,
    Stream s);
#endif // defined(MLX_CUDA_SM90A_ENABLED)

bool supports_qmm_sm90(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device) {
  if (device.compute_capability_major() != 9) {
    return false;
  }
  int k = x.shape(-1);
  if (k % 64 != 0) {
    return false;
  }
  if (!biases) {
    return false;
  }
  if (!x.flags().row_contiguous || !w.flags().row_contiguous ||
      !scales.flags().row_contiguous || !biases->flags().row_contiguous) {
    return false;
  }
  if (!transpose) {
    return false;
  }
  if (bits % 2 != 0) {
    return false;
  }
  if (group_size < k) {
    return false;
  }
  if (mode != QuantizationMode::Affine) {
    return false;
  }
  return true;
}

// Runs the Hopper quantized matmul, picking the tile and cluster shapes from
// the second-to-last dimension of `out`.
//
// Throws std::runtime_error when the build does not define
// MLX_CUDA_SM90A_ENABLED.
void qmm_sm90(
    const array& x,
    const array& w,
    const array& scales,
    const array& biases,
    array& out,
    int bits,
    int group_size,
    cu::CommandEncoder& encoder,
    Stream s) {
#if defined(MLX_CUDA_SM90A_ENABLED)
  // Instantiates and runs the implementation for one tile configuration.
  auto launch = [&]<int tile_m, int tile_n, int cluster_m>() {
    using TileShapeMN = cute::Shape<cute::Int<tile_m>, cute::Int<tile_n>>;
    using ClusterShape =
        cute::Shape<cute::Int<cluster_m>, cute::Int<1>, cute::Int<1>>;
    qmm_impl_sm90<TileShapeMN, ClusterShape>(
        x, w, scales, biases, out, bits, group_size, encoder, s);
  };

  // Thresholds are checked from the largest down; each range maps to the
  // same configuration as an ascending `<=` chain would select.
  const int rows = out.shape(-2);
  if (rows > 128) {
    launch.template operator()<128, 256, 2>();
  } else if (rows > 64) {
    launch.template operator()<128, 128, 2>();
  } else if (rows > 32) {
    launch.template operator()<128, 64, 2>();
  } else if (rows > 16) {
    launch.template operator()<128, 32, 1>();
  } else {
    launch.template operator()<128, 16, 1>();
  }
#else
  throw std::runtime_error(
      "[quantized_matmul] Hopper-only kernel is not available.");
#endif // defined(MLX_CUDA_SM90A_ENABLED)
}

// Defined in qmm_impl_sm80_xxx.cu files.
// Forward declaration of the SM80 implementation, parameterized on the M
// extent of the CTA tile. `biases` is optional.
template <int TileM>
void qmm_impl_sm80(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::CommandEncoder& encoder);

// Reports whether the SM80 quantized matmul path accepts these operands.
//
// Requirements: device major version of at least 8; the last dimension of
// `out` is a multiple of 128 and the last dimension of `x` is a multiple of
// max(64, group_size); `x`, `w`, `scales` (and `biases` when present) are row
// contiguous; `x` is float16 or bfloat16; the weights are used transposed; and
// `bits` is 4 or 8.
bool supports_qmm_sm80(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device) {
  if (device.compute_capability_major() < 8) {
    return false;
  }

  const int n = out.shape(-1);
  const int k = x.shape(-1);
  const int k_multiple = std::max(64, group_size);
  const bool shape_ok = (n % 128 == 0) && (k % k_multiple == 0);
  if (!shape_ok) {
    return false;
  }

  const bool layout_ok = x.flags().row_contiguous &&
      w.flags().row_contiguous && scales.flags().row_contiguous &&
      (!biases || biases->flags().row_contiguous);
  if (!layout_ok) {
    return false;
  }

  const bool dtype_ok = (x.dtype() == float16) || (x.dtype() == bfloat16);
  const bool bits_ok = (bits == 4) || (bits == 8);
  return dtype_ok && transpose && bits_ok;
}

// Runs the SM80 quantized matmul, choosing the CTA tile M extent (16, 32 or
// 64) from the second-to-last dimension of `out`.
void qmm_sm80(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::CommandEncoder& encoder) {
  // Instantiates and runs the implementation for one tile size.
  auto launch = [&]<int TileM>() {
    qmm_impl_sm80<TileM>(
        x, w, scales, biases, out, bits, group_size, mode, encoder);
  };

  const int rows = out.shape(-2);
  if (rows > 32) {
    launch.template operator()<64>();
    return;
  }
  if (rows > 16) {
    launch.template operator()<32>();
    return;
  }
  launch.template operator()<16>();
}

// Reports whether the fp quantized matrix-vector kernel should be used.
//
// The kernel is selected only on devices whose major compute capability is
// above 9, when at most 8 vectors are multiplied per matrix, the weights are
// used transposed, and the quantization mode is not affine.
bool supports_fp_qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device) {
  // The fp_qmv kernel uses less registers and is faster for sm120. For sm80/90
  // the qmv kernel is faster. We didn't test sm89/100.
  if (device.compute_capability_major() <= 9) {
    return false;
  }
  // With 2-D weights every leading dimension of x counts as a vector;
  // otherwise only the second-to-last dimension does.
  const bool non_batched = w.ndim() == 2;
  const int k = x.shape(-1);
  const int vec_batch = non_batched ? x.size() / k : x.shape(-2);
  if (vec_batch > 8) {
    return false;
  }
  if (!transpose) {
    return false;
  }
  if (mode == QuantizationMode::Affine) {
    return false;
  }
  return true;
}

// Reports whether the general quantized matrix-vector kernel accepts these
// operands: the last dimension of `x` must be a multiple of 8, `x`, `w`,
// `scales` (and `biases` when present) must be row contiguous, and the
// weights must be used transposed.
bool supports_qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device) {
  const int k = x.shape(-1);
  if (k % 8 != 0) {
    return false;
  }

  const bool inputs_contiguous = x.flags().row_contiguous &&
      w.flags().row_contiguous && scales.flags().row_contiguous;
  if (!inputs_contiguous) {
    return false;
  }

  const bool biases_ok = !biases || biases->flags().row_contiguous;
  return biases_ok && transpose;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device.h"
#include "mlx/primitives.h"

#include <optional>

namespace mlx::core {

// Each supports_* predicate takes the same operand description and returns
// true when the corresponding kernel can be used for it. Each launcher below
// a predicate runs that kernel on the given command encoder.

// Returns true when the Hopper (compute capability 9) quantized matmul kernel
// can be used for these operands.
bool supports_qmm_sm90(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device);

// Runs the Hopper quantized matmul. Unlike the predicate, `biases` is
// required here.
void qmm_sm90(
    const array& x,
    const array& w,
    const array& scales,
    const array& biases,
    array& out,
    int bits,
    int group_size,
    cu::CommandEncoder& encoder,
    Stream s);

// Returns true when the SM80 quantized matmul kernel can be used for these
// operands.
bool supports_qmm_sm80(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device);

// Runs the SM80 quantized matmul; `biases` may be absent.
void qmm_sm80(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::CommandEncoder& encoder);

// Returns true when the fp quantized matrix-vector kernel should be used for
// these operands.
bool supports_fp_qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device);

// Runs the fp quantized matrix-vector kernel. It takes no biases.
void fp_qmv(
    const array& x,
    const array& w,
    const array& scales,
    array& out,
    int bits,
    int group_size,
    cu::CommandEncoder& encoder,
    Stream s);

// Returns true when the general quantized matrix-vector kernel can be used
// for these operands.
bool supports_qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& out,
    bool transpose,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::Device& device);

// Runs the general quantized matrix-vector kernel; `biases` may be absent.
void qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::CommandEncoder& encoder);

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm80.cuh
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm.h"
#include "mlx/dtype_utils.h"

#include <cute/tensor.hpp>
#include <cutlass/numeric_conversion.h>

// clang-format off

// We can't put kernel code in mlx::core due to name conflicts of "Shape".
namespace cutlass_gemm {

using namespace cute;

// True when values of type Quant are dequantized with an additive zero point
// in addition to the scale. Defined as the quantized types for which
// cutlass::has_negative_zero_v is false.
template <typename Quant>
constexpr bool has_zero_point_v = !cutlass::has_negative_zero_v<Quant>;

// Shared-memory layout of the kernel. The mainloop buffers (A and B tiles) and
// the epilogue buffer (C tile) are members of a union, so they occupy the same
// storage and must not be live at the same time.
template <typename Element,
          typename Quant,
          typename SmemLayoutA,
          typename SmemLayoutB,
          typename SmemLayoutC>
union SharedStorage {
  // Tiles of the A operand (Element) and the quantized B operand (Quant).
  struct {
    ArrayEngine<Element, cosize_v<SmemLayoutA>> A;
    ArrayEngine<Quant,   cosize_v<SmemLayoutB>> B;
  } mainloop;
  // Output tile staged before it is written out.
  struct {
    ArrayEngine<Element, cosize_v<SmemLayoutC>> C;
  } epilogue;
};

// Dequantizes a contiguous fragment of quantized values into `out`.
//
// w   - fragment of quantized values; must coalesce to a stride-1 layout.
// s   - scale; must cover exactly one element.
// z   - zero point; must cover exactly one element. Only read when
//       has_zero_point_v<Quant> is true.
// out - destination fragment; receives w * s (+ z), converted to its
//       element type.
template <typename Q, typename S, typename Z, typename T>
__device__ __forceinline__ void
dequant(const Q& w, const S& s, const Z& z, T out) {
  // Scale must be one element.
  CUTE_STATIC_ASSERT_V(cosize(s.layout()) == Int<1>{});
  CUTE_STATIC_ASSERT_V(cosize(z.layout()) == Int<1>{});
  // Quant must be contiguous.
  auto layout = coalesce(w.layout());
  CUTE_STATIC_ASSERT_V(stride(layout) == Int<1>{});
  // Use cutlass for conversions.
  constexpr int N = size(layout);
  using Element = typename T::value_type;
  using Quant = typename Q::value_type;
  // View the fragment as a cutlass array so the whole vector converts at once.
  auto& w_vec = *(reinterpret_cast<const cutlass::Array<Quant, N>*>(raw_pointer_cast(w.data())));
  Element scale{s[0]};
  cutlass::NumericArrayConverter<Element, Quant, N> converter;
  auto w_dq = converter(w_vec) * scale;
  if constexpr (has_zero_point_v<Quant>) {
    Element zero_point{z[0]};
    w_dq = w_dq + zero_point;
  }
  // Write the converted values out using the destination's own layout.
  copy(make_tensor(make_rmem_ptr<Element>(&w_dq), out.layout()), out);
}

// Quantized matmul kernel: for every batch l, accumulates A (M,K) against the
// dequantized B (N,K) into C (M,N).
//
// Data flow, as implemented below:
//   * A and B tiles are copied GMEM => SMEM through a multi-stage pipeline
//     (number of stages taken from the last mode of the A smem layout).
//   * Scales S and zero points Z are copied GMEM => RMEM once per K tile.
//   * A and B are copied SMEM => RMEM per MMA block; B is dequantized in
//     registers right after it is loaded.
//   * Accumulation is done in float; the epilogue converts to Element, stages
//     the tile in shared memory and copies it to C.
//
// Only the M dimension is predicated (A loads and C stores). B, S and Z are
// accessed without bounds checks.
// NOTE(review): this presumably relies on the caller guaranteeing that N and K
// are multiples of the CTA tile — confirm against the launch conditions.
//
// blockIdx is interpreted as (m tile, n tile, batch).
template <typename ProblemShape, typename CtaTiler,
          typename Element, typename Quant, typename Scale,
          typename StrideA, typename SmemLayoutA, typename TiledCopyA, typename S2RAtomA,
          typename StrideB, typename SmemLayoutB, typename TiledCopyB, typename S2RAtomB,
          typename StrideC, typename SmemLayoutC, typename TiledCopyC, typename R2SAtomC,
          typename LayoutS, typename G2RAtomS, typename TiledMma>
__global__ void qmm_sm80_kernel(
    ProblemShape shape_MNKL, CtaTiler cta_tiler,
    const Element* A, StrideA dA, SmemLayoutA sA_layout, TiledCopyA g2s_copy_a, S2RAtomA s2r_atom_a,
    const Quant*   B, StrideB dB, SmemLayoutB sB_layout, TiledCopyB g2s_copy_b, S2RAtomB s2r_atom_b,
          Element* C, StrideC dC, SmemLayoutC sC_layout, TiledCopyC s2g_copy_c, R2SAtomC r2s_atom_c,
    const Scale* S, const Element* Z, LayoutS S_layout, G2RAtomS g2r_atom_s, TiledMma mma) {
  // All tiled copies must use the same number of threads as the MMA, and the
  // strides must have the same structure as the matching problem-shape modes.
  CUTE_STATIC_ASSERT_V(size(g2s_copy_a) == size(mma));
  CUTE_STATIC_ASSERT_V(size(g2s_copy_b) == size(mma));
  CUTE_STATIC_ASSERT_V(size(s2g_copy_c) == size(mma));
  CUTE_STATIC_ASSERT_V(congruent(select<0,2,3>(shape_MNKL), dA));
  CUTE_STATIC_ASSERT_V(congruent(select<1,2,3>(shape_MNKL), dB));
  CUTE_STATIC_ASSERT_V(congruent(select<0,1,3>(shape_MNKL), dC));

  int thread_idx = int(threadIdx.x);
  auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx);

  // Represent the full tensors.
  Tensor mA_mkl = make_tensor(make_gmem_ptr(A),        select<0,2,3>(shape_MNKL), dA); // (M,K,L)
  Tensor mB_nkl = make_tensor(make_gmem_ptr<Quant>(B), select<1,2,3>(shape_MNKL), dB); // (N,K,L)
  Tensor mC_mnl = make_tensor(make_gmem_ptr(C),        select<0,1,3>(shape_MNKL), dC); // (M,N,L)

  // Scales and zero points share one layout.
  Tensor mS_nkl = make_tensor(make_gmem_ptr(S), S_layout); // (N,(group_size,K/group_size),L)
  Tensor mZ_nkl = make_tensor(make_gmem_ptr(Z), S_layout); // (N,(group_size,K/group_size),L)

  // Get batch slice.
  Tensor mA = mA_mkl(_,_,l_coord); // (M,K)
  Tensor mB = mB_nkl(_,_,l_coord); // (N,K)
  Tensor mC = mC_mnl(_,_,l_coord); // (M,N)

  Tensor mS = mS_nkl(_,_,l_coord); // (N,(group_size,K/group_size))
  Tensor mZ = mZ_nkl(_,_,l_coord); // (N,(group_size,K/group_size))

  // Get the appropriate blocks for this thread block.
  auto cta_coord = make_coord(m_coord, n_coord, _); // (m,n,k)
  Tensor gA = local_tile(mA, cta_tiler, cta_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k)
  Tensor gB = local_tile(mB, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k)
  Tensor gC = local_tile(mC, cta_tiler, cta_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N)

  Tensor gS = local_tile(mS, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k)
  Tensor gZ = local_tile(mZ, cta_tiler, cta_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k)

  // Shared memory buffers.
  extern __shared__ char shared_memory[];
  using SharedStorage = SharedStorage<Element, Quant,
                                      SmemLayoutA,
                                      SmemLayoutB,
                                      SmemLayoutC>;
  SharedStorage& smem = *reinterpret_cast<SharedStorage*>(shared_memory);
  Tensor sA = make_tensor(make_smem_ptr(smem.mainloop.A.begin()), sA_layout); // (BLK_M,BLK_K)
  Tensor sB = make_tensor(make_smem_ptr(smem.mainloop.B.begin()), sB_layout); // (BLK_N,BLK_K)
  Tensor sC = make_tensor(make_smem_ptr(smem.epilogue.C.begin()), sC_layout); // (BLK_M,BLK_N)

  // Partition the copying of A/B/C tiles across the threads.
  ThrCopy g2s_thr_copy_a = g2s_copy_a.get_slice(thread_idx);
  Tensor tAgA = g2s_thr_copy_a.partition_S(gA); // (ACPY,ACPY_M,ACPY_K,k)
  Tensor tAsA = g2s_thr_copy_a.partition_D(sA); // (ACPY,ACPY_M,ACPY_K,PIPE)

  ThrCopy g2s_thr_copy_b = g2s_copy_b.get_slice(thread_idx);
  Tensor tBgB = g2s_thr_copy_b.partition_S(gB);  // (BCPY,BCPY_N,BCPY_K,k)
  Tensor tBsB = g2s_thr_copy_b.partition_D(sB);  // (BCPY,BCPY_N,BCPY_K,PIPE)

  ThrCopy s2g_thr_copy_c = s2g_copy_c.get_slice(thread_idx);
  Tensor s2g_tCsC = s2g_thr_copy_c.partition_S(sC); // (CCPY,CCPY_M,CCPY_N)
  Tensor s2g_tCgC = s2g_thr_copy_c.partition_D(gC); // (CCPY,CCPY_M,CCPY_N)

  // MMA.
  ThrMMA thr_mma = mma.get_slice(thread_idx);
  Tensor tCrA = thr_mma.partition_fragment_A(sA(_,_,0)); // (MMA,MMA_M,MMA_K)
  Tensor tCsB = thr_mma.partition_B(sB(_,_,0));          // (MMA,MMA_N,MMA_K)
  Tensor tCrB = make_fragment_like<Quant>(tCsB);         // (MMA,MMA_N,MMA_K)
  Tensor tCrB_dq = make_fragment_like<Element>(tCsB);    // (MMA,MMA_N,MMA_K)
  Tensor tCgC = thr_mma.partition_C(gC);                 // (MMA,MMA_M,MMA_N)
  Tensor tCrC_accu = make_fragment_like<float>(tCgC);    // (MMA,MMA_M,MMA_N)
  Tensor tCrC = make_fragment_like<Element>(tCgC);       // (MMA,MMA_M,MMA_N)

  Tensor tCgS = thr_mma.partition_B(gS);         // (MMA,MMA_N,MMA_K,k)
  Tensor tCrS = make_tensor_like(tCgS(_,_,_,0)); // (MMA,MMA_N,MMA_K)
  Tensor tCgZ = thr_mma.partition_B(gZ);         // (MMA,MMA_N,MMA_K,k)
  Tensor tCrZ = make_tensor_like(tCgZ(_,_,_,0)); // (MMA,MMA_N,MMA_K)

  // Copy Atom retiling.
  TiledCopy s2r_copy_a = make_tiled_copy_A(s2r_atom_a, mma);
  ThrCopy s2r_thr_copy_a = s2r_copy_a.get_slice(thread_idx);
  Tensor s2r_tCsA = s2r_thr_copy_a.partition_S(sA); // (ACPY,MMA_M,MMA_K,PIPE)
  Tensor s2r_tCrA = s2r_thr_copy_a.retile_D(tCrA);  // (ACPY,MMA_M,MMA_K)

  TiledCopy s2r_copy_b = make_tiled_copy_B(s2r_atom_b, mma);
  ThrCopy s2r_thr_copy_b = s2r_copy_b.get_slice(thread_idx);
  Tensor s2r_tCsB = s2r_thr_copy_b.partition_S(sB); // (BCPY,MMA_N,MMA_K,PIPE)
  Tensor s2r_tCrB = s2r_thr_copy_b.retile_D(tCrB);  // (BCPY,MMA_N,MMA_K)

  TiledCopy r2s_copy_c = make_tiled_copy_C(r2s_atom_c, mma);
  ThrCopy r2s_thr_copy_c = r2s_copy_c.get_slice(thread_idx);
  Tensor r2s_tCrC = r2s_thr_copy_c.retile_S(tCrC);  // (CCPY,MMA_M,MMA_N)
  Tensor r2s_tCsC = r2s_thr_copy_c.partition_D(sC); // (CCPY,MMA_M,MMA_N)

  TiledCopy g2r_copy_s = make_tiled_copy_B(g2r_atom_s, mma);
  ThrCopy g2r_thr_copy_s = g2r_copy_s.get_slice(thread_idx);
  Tensor g2r_tCgS = g2r_thr_copy_s.partition_S(gS); // (BCPY,MMA_N,MMA_K,k)
  Tensor g2r_tCrS = g2r_thr_copy_s.retile_D(tCrS);  // (BCPY,MMA_N,MMA_K)
  Tensor g2r_tCgZ = g2r_thr_copy_s.partition_S(gZ); // (BCPY,MMA_N,MMA_K,k)
  Tensor g2r_tCrZ = g2r_thr_copy_s.retile_D(tCrZ);  // (BCPY,MMA_N,MMA_K)

  // Predicates for m bound.
  auto m_max_coord = size<0>(shape_MNKL) - size<0>(gA) * m_coord; // M - BLK_M * m_coord
  // The predicate tensors have stride 0 along the second mode, so a single
  // value per M index is stored and shared across K (for A) or N (for C).
  Tensor tApA = make_tensor<bool>(make_shape(size<1>(tAsA), size<2>(tAsA)), Stride<_1,_0>{});         // (CPY_M,CPY_K)
  Tensor tCpC = make_tensor<bool>(make_shape(size<1>(s2g_tCsC), size<2>(s2g_tCsC)), Stride<_1,_0>{}); // (CPY_M,CPY_N)
  Tensor cA = make_identity_tensor(make_shape(size<0>(sA), size<1>(sA))); // (BLK_M,BLK_K)
  Tensor cC = make_identity_tensor(make_shape(size<0>(sC), size<1>(sC))); // (BLK_M,BLK_N)
  Tensor tAcA = g2s_thr_copy_a.partition_D(cA); // (CPY,CPY_M,CPY_K)
  Tensor tCcC = s2g_thr_copy_c.partition_D(cC); // (CPY,CPY_M,CPY_N)
  CUTE_UNROLL
  for (int m = 0; m < size<0>(tApA); ++m) {
    tApA(m,0) = get<0>(tAcA(0,m,0)) < m_max_coord;
  }
  CUTE_UNROLL
  for (int m = 0; m < size<0>(tCpC); ++m) {
    tCpC(m,0) = get<0>(tCcC(0,m,0)) < m_max_coord;
  }

  // Number of pipeline stages and the stage indices currently read / written.
  auto K_PIPE_MAX = size<3>(tAsA);
  int smem_pipe_read = 0;
  int smem_pipe_write = 0;

  // Copy A/B: GMEM => SMEM.
  auto fetch_gmem = [&](int tile) {
    copy_if(g2s_copy_a, tApA, tAgA(_,_,_,tile), tAsA(_,_,_,smem_pipe_write));
    copy(g2s_copy_b, tBgB(_,_,_,tile), tBsB(_,_,_,smem_pipe_write));
    cp_async_fence();
    smem_pipe_write = (smem_pipe_write + 1) % K_PIPE_MAX;
  };
  // Copy S/Z: GMEM => RMEM.
  auto fetch_scales = [&](int tile) {
    copy(g2r_copy_s, g2r_tCgS(_,_,_,tile), g2r_tCrS);
    if constexpr (has_zero_point_v<Quant>) {
      copy(g2r_copy_s, g2r_tCgZ(_,_,_,tile), g2r_tCrZ);
    }
  };
  // Copy A/B: SMEM => RMEM.
  auto fetch_smem = [&](auto block) {
    copy(s2r_atom_a, s2r_tCsA(_,_,block,smem_pipe_read), s2r_tCrA(_,_,block));
    copy(s2r_atom_b, s2r_tCsB(_,_,block,smem_pipe_read), s2r_tCrB(_,_,block));
    // Dequantize the freshly loaded B block with the scales held in registers.
    CUTE_UNROLL
    for (int n = 0; n < size<1>(tCrB); ++n) {
      dequant(tCrB(_,n,block), tCrS(_,n,block), tCrZ(_,n,block), tCrB_dq(_,n,block));
    }
  };

  auto K_TILE_MAX = size<3>(tAgA);
  auto K_BLOCK_MAX = size<2>(tCrA);

  // Prefetch beginning tiles.
  // NOTE(review): the tile index is not clamped to K_TILE_MAX here, unlike in
  // the main loop below — confirm K always spans at least K_PIPE_MAX - 1
  // tiles or that reading past the last tile is acceptable.
  int tile_pipe = 0;
  CUTE_UNROLL
  for (; tile_pipe < K_PIPE_MAX - 1; ++tile_pipe) {
    fetch_gmem(tile_pipe);
  }

  // Clear accumulators.
  clear(tCrC_accu);

  // Prefetch first block.
  if constexpr (K_BLOCK_MAX > 1) {
    cp_async_wait<K_PIPE_MAX - 2>();
    __syncthreads();
    fetch_scales(0);
    fetch_smem(Int<0>{});
  }

  // Loop over CTA tiles.
  for (int tile = 0; tile < K_TILE_MAX; ++tile) {
    // Unroll MMA blocks.
    CUTE_UNROLL
    for (int block = 0; block < K_BLOCK_MAX; ++block) {
      // Wait for last tile.
      if (block == K_BLOCK_MAX - 1) {
        smem_pipe_read = (smem_pipe_read + 1) % K_PIPE_MAX;
        cp_async_wait<K_PIPE_MAX - 2>();
        __syncthreads();
        // Scales for the next tile are loaded before its first block is
        // dequantized; on the last tile the index is clamped.
        fetch_scales((tile + 1 < K_TILE_MAX) ? tile + 1 : tile);
      }
      // Prefetch next block.
      fetch_smem((block + 1) % K_BLOCK_MAX);
      // Prefetch next tile.
      if (block == 0) {
        fetch_gmem(tile_pipe);
        // Clamp so the prefetch never advances past the last K tile.
        tile_pipe = (tile_pipe + 1 < K_TILE_MAX) ? tile_pipe + 1 : tile_pipe;
      }
      // MMA.
      gemm(mma, tCrA(_,_,block), tCrB_dq(_,_,block), tCrC_accu);
    }
  }

  // Epilogue.
  CUTE_UNROLL
  for (int i = 0; i < size(tCrC_accu); i++) {
    tCrC(i) = Element(tCrC_accu(i));
  }
  // Stage the tile through shared memory, then write only the rows inside M.
  copy(r2s_copy_c, r2s_tCrC, r2s_tCsC);
  __syncthreads();
  copy_if(s2g_copy_c, tCpC, s2g_tCsC, s2g_tCgC);
}

// Returns the 16x8x16 MMA atom with float accumulation that matches Element:
// the F16 variant for half_t and the BF16 variant for bfloat16_t. No value is
// returned for any other element type.
template <typename Element>
inline constexpr auto make_mma_atom() {
  if constexpr (std::is_same_v<Element, bfloat16_t>) {
    return SM80_16x8x16_F32BF16BF16F32_TN{};
  } else if constexpr (std::is_same_v<Element, half_t>) {
    return SM80_16x8x16_F32F16F16F32_TN{};
  }
}

template <int TileM, typename Element>
inline constexpr auto make_tiled_mma() {
  constexpr auto atom = make_mma_atom<Element>();
  if constexpr (TileM >= 32) {
    return make_tiled_mma(atom, Layout<Shape<_2,_2,_1>>{}, Tile<_32,_32,_16>{});
  } else {
    return make_tiled_mma(atom, Layout<Shape<_1,_4,_1>>{}, Tile<_16,_32,_16>{});
  }
}

template <typename T, int bits, template <typename U> typename Atom, typename NumThreads>
inline auto make_tiled_copy(NumThreads num_threads) {
  return make_tiled_copy(
      Copy_Atom<Atom<uint_bit_t<bits>>, T>{},
      make_layout(make_shape(Int<num_threads / 8>{}, Int<8>{}), LayoutRight{}),
      make_layout(make_shape(Int<1>{}, Int<bits / sizeof_bits_v<T>>{})));
}

// Host-side setup for qmm_sm80_kernel: builds the shapes, strides, shared
// memory layouts, copy atoms and tiled MMA, then hands the kernel pointer and
// its argument array to `launch_kernel`.
//
// A, B, C    - device pointers to the input, quantized weight and output.
// S, Z       - device pointers to the scales and the zero points.
// m, n, k, l - problem extents: rows, columns, reduction size, batch count.
// group_size - static integer giving the number of values per scale.
// launch_kernel - callable invoked as
//                 (void* kernel, dim3 grid, dim3 block, size_t smem, void** args).
//
// The strides assume dense row-major A (m x k), B (n x k) and C (m x n) per
// batch, with batches laid out back to back.
template <int TileM = 16, typename Element, typename Quant, typename Scale, typename GroupSize, typename F>
void qmm_sm80(
    const Element* A,
    const Quant*   B,
    const Scale* S,
    const Element* Z,
    Element* C,
    int m, int n, int k, int l,
    GroupSize group_size,
    F&& launch_kernel) {
  // Define shapes (dynamic).
  auto prob_shape = make_shape(m, n, k, l); // (M,N,K,L)

  // Define TN strides (mixed).
  auto dA = make_stride(k, Int<1>{}, m * k); // (dM,dK,dL)
  auto dB = make_stride(k, Int<1>{}, n * k); // (dN,dK,dL)
  auto dC = make_stride(n, Int<1>{}, m * n); // (dM,dN,dL)

  // Define CTA tile sizes (static).
  auto bM = Int<TileM>{};
  auto bN = Int<128>{};
  // The K tile is at least 64 and never smaller than one quantization group.
  auto bK = Int<max(64, group_size)>{};
  auto cta_tiler = make_shape(bM, bN, bK); // (BLK_M,BLK_N,BLK_K)

  // Define MMA.
  TiledMMA mma = make_tiled_mma<TileM, Element>();
  auto num_threads = size(mma);

  // Define the A/B smem layouts (static).
  auto swizzle_ab = composition(Swizzle<3,3,3>{},
                                Layout<Shape <_8,Shape <_8, _8>>,
                                       Stride<_8,Stride<_1,_64>>>{});
  auto bP = Int<3>{}; // pipeline
  auto sA_layout = tile_to_shape(swizzle_ab, make_shape(bM, bK, bP));
  auto sB_layout = tile_to_shape(swizzle_ab, make_shape(bN, bK, bP));

  // Define the C smem layouts (static).
  // TODO: Find a better swizzle.
  auto sC_layout = tile_to_shape(swizzle_ab, make_shape(bM, bN));

  // Define the scales/biases smem layouts (static).
  // NOTE(review): sS_layout is not referenced again in this function —
  // confirm whether it is still needed.
  auto bS = ceil_div(bK, group_size);
  auto sS_layout = make_layout(make_shape(bN, make_shape(group_size, bS)),
                               make_stride(bS, Stride<_0, _1>{}));

  // Define layout of scales/biases (mixed).
  // The stride-0 mode makes every value of a group map to the same scale.
  auto S_layout = make_layout(
      make_shape(n, make_shape(group_size, k / group_size), l),
      make_stride(k / group_size, Stride<_0, _1>{}, n * k / group_size));

  // Atoms.
  constexpr int element_bits = sizeof_bits_v<Element>;
  constexpr int quant_bits = sizeof_bits_v<Quant>;
  // Bits loaded per thread for B, scaled down from 128 by the ratio of the
  // element width to the quantized width.
  constexpr int qload = 128 / (element_bits / quant_bits);
  TiledCopy g2s_copy_a = make_tiled_copy<Element, 128, SM80_CP_ASYNC_CACHEALWAYS>(num_threads);
  TiledCopy g2s_copy_b = make_tiled_copy<Quant, qload, SM80_CP_ASYNC_CACHEALWAYS>(num_threads);
  TiledCopy s2g_copy_c = make_tiled_copy<Element, 128, UniversalCopy>(num_threads);

  Copy_Atom<SM75_U32x4_LDSM_N, Element> s2r_atom_a;
  Copy_Atom<UniversalCopy<uint_bit_t<2 * quant_bits>>, Quant> s2r_atom_b;
  Copy_Atom<UniversalCopy<uint_bit_t<2 * element_bits>>, Element> r2s_atom_c;
  Copy_Atom<UniversalCopy<Scale>, Scale> g2r_atom_s;

  auto* kernel = &qmm_sm80_kernel<
      decltype(prob_shape), decltype(cta_tiler),
      Element, Quant, Scale,
      decltype(dA), decltype(sA_layout), decltype(g2s_copy_a), decltype(s2r_atom_a),
      decltype(dB), decltype(sB_layout), decltype(g2s_copy_b), decltype(s2r_atom_b),
      decltype(dC), decltype(sC_layout), decltype(s2g_copy_c), decltype(r2s_atom_c),
      decltype(S_layout), decltype(g2r_atom_s), decltype(mma)>;

  // Set L1 to be SMEM only.
  // NOTE(review): the return values of cudaFuncSetAttribute are not checked.
  size_t smem_bytes = sizeof(SharedStorage<Element, Quant,
                                           decltype(sA_layout),
                                           decltype(sB_layout),
                                           decltype(sC_layout)>);
  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
  cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100);

  // One block per (M tile, N tile, batch). The argument order below must match
  // the kernel's parameter list exactly.
  dim3 num_blocks(size(ceil_div(m, bM)), size(ceil_div(n, bN)), l);
  dim3 block_dims(num_threads);
  void* args[] = {
      &prob_shape, &cta_tiler,
      &A, &dA, &sA_layout, &g2s_copy_a, &s2r_atom_a,
      &B, &dB, &sB_layout, &g2s_copy_b, &s2r_atom_b,
      &C, &dC, &sC_layout, &s2g_copy_c, &r2s_atom_c,
      &S, &Z, &S_layout, &g2r_atom_s, &mma};
  launch_kernel(reinterpret_cast<void*>(kernel), num_blocks, block_dims, smem_bytes, args);
}

} // namespace cutlass_gemm

// clang-format on

namespace mlx::core {

// Maps a runtime MLX dtype to the matching CUTLASS element type and invokes
// f.template operator()<Element>() with it:
//   float16  -> cutlass::half_t
//   bfloat16 -> cutlass::bfloat16_t
// Any other dtype throws std::invalid_argument; the message starts with |tag|.
template <typename F>
inline void dispatch_element_types(Dtype dtype, const char* tag, F&& f) {
  if (dtype == float16) {
    f.template operator()<cutlass::half_t>();
    return;
  }
  if (dtype == bfloat16) {
    f.template operator()<cutlass::bfloat16_t>();
    return;
  }
  throw std::invalid_argument(
      fmt::format("{} Unsupported dtype: {}.", tag, dtype_to_string(dtype)));
}

// Turns a runtime quantization group size into a compile-time constant by
// calling f.template operator()<N>() for N in {32, 64, 128}. Other values
// throw std::invalid_argument; the message starts with |tag|.
template <typename F>
inline void dispatch_groups(int group_size, const char* tag, F&& f) {
  switch (group_size) {
    case 32:
      f.template operator()<32>();
      break;
    case 64:
      f.template operator()<64>();
      break;
    case 128:
      f.template operator()<128>();
      break;
    default:
      throw std::invalid_argument(
          fmt::format("{} Group size {} is not supported.", tag, group_size));
  }
}

// Selects the <Quant, Scale, group_size> template triple for a quantization
// configuration and invokes f.template operator()<Quant, Scale, N>().
//
// The floating-point modes fix all three arguments and ignore |bits| and
// |group_size|:
//   Mxfp4 -> <float_e2m1_t, float_ue8m0_t, 32>
//   Mxfp8 -> <float_e4m3_t, float_ue8m0_t, 32>
//   Nvfp4 -> <float_e2m1_t, float_e4m3_t, 16>
// Every other mode uses T as the scale type, takes the group size from
// dispatch_groups, and supports 4-bit and 8-bit weights only; other bit widths
// throw std::invalid_argument.
template <typename T, typename F>
inline void dispatch_quant_types(
    int bits,
    int group_size,
    QuantizationMode mode,
    const char* tag,
    F&& f) {
  if (mode == QuantizationMode::Mxfp4) {
    f.template operator()<cutlass::float_e2m1_t, cutlass::float_ue8m0_t, 32>();
    return;
  }
  if (mode == QuantizationMode::Mxfp8) {
    f.template operator()<cutlass::float_e4m3_t, cutlass::float_ue8m0_t, 32>();
    return;
  }
  if (mode == QuantizationMode::Nvfp4) {
    f.template operator()<cutlass::float_e2m1_t, cutlass::float_e4m3_t, 16>();
    return;
  }

  dispatch_groups(group_size, tag, [&]<int kGroupSize>() {
    switch (bits) {
      case 4:
        f.template operator()<cutlass::uint4b_t, T, kGroupSize>();
        break;
      case 8:
        f.template operator()<uint8_t, T, kGroupSize>();
        break;
      default:
        throw std::invalid_argument(
            fmt::format("{} {}-bit quantization is not supported.", tag, bits));
    }
  });
}

// Host-side entry point of the SM80 quantized matmul for one TileM.
//
// Problem sizes are read from the arrays: m/n are the last two dimensions of
// |out|, k is the last dimension of |x| and l is the number of (m, n) matrices
// in |out|. The runtime dtype / quantization configuration is resolved to
// template arguments, the arrays are registered with |encoder|, and
// cutlass_gemm::qmm_sm80<TileM> is asked to launch through
// encoder.add_kernel_node_raw.
//
// |biases| is optional; a null pointer is passed to the kernel when absent.
// Throws std::invalid_argument (via the dispatch helpers) for unsupported
// dtype, group size or bit width.
template <int TileM>
void qmm_impl_sm80(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::CommandEncoder& encoder) {
  const char* tag = "[quantized_matmul]";
  int m = out.shape(-2);
  int n = out.shape(-1);
  int k = x.shape(-1);
  // Multiply in size_t: m * n evaluated as int overflows (undefined behavior)
  // once a single output matrix has 2^31 or more elements.
  int l = out.size() / (static_cast<size_t>(m) * static_cast<size_t>(n));

  dispatch_element_types(out.dtype(), tag, [&]<typename Element>() {
    dispatch_quant_types<Element>(
        bits,
        group_size,
        mode,
        tag,
        [&]<typename Quant, typename Scale, int group_size>() {
          encoder.set_input_array(x);
          encoder.set_input_array(w);
          encoder.set_input_array(scales);
          if (biases) {
            encoder.set_input_array(*biases);
          }
          encoder.set_output_array(out);
          cutlass_gemm::qmm_sm80<TileM>(
              gpu_ptr<Element>(x),
              gpu_ptr<Quant>(w),
              gpu_ptr<Scale>(scales),
              biases ? gpu_ptr<Element>(*biases) : nullptr,
              gpu_ptr<Element>(out),
              m,
              n,
              k,
              l,
              cute::Int<group_size>{},
              // The kernel is not launched directly; it is recorded as a raw
              // kernel node on the encoder (no cluster dimensions).
              [&](auto* kernel,
                  dim3 num_blocks,
                  dim3 block_dims,
                  uint32_t smem_bytes,
                  void** args) {
                encoder.add_kernel_node_raw(
                    kernel, num_blocks, block_dims, {}, smem_bytes, args);
              });
        });
  });
}

} // namespace mlx::core

// Emits an explicit instantiation of mlx::core::qmm_impl_sm80<TileM>. Each
// qmm_impl_sm80_m*.cu translation unit expands this once with its own TileM.
#define QMM_SM80_GPU(TileM)               \
  namespace mlx::core {                   \
  template void qmm_impl_sm80<TileM>(     \
      const array& x,                     \
      const array& w,                     \
      const array& scales,                \
      const std::optional<array>& biases, \
      array& out,                         \
      int bits,                           \
      int group_size,                     \
      QuantizationMode mode,              \
      cu::CommandEncoder& encoder);       \
  }


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm80_m16.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm80.cuh"

// Explicitly instantiates mlx::core::qmm_impl_sm80<16> in this translation
// unit.
QMM_SM80_GPU(16)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm80_m32.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm80.cuh"

// Explicitly instantiates mlx::core::qmm_impl_sm80<32> in this translation
// unit.
QMM_SM80_GPU(32)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm80_m64.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm80.cuh"

// Explicitly instantiates mlx::core::qmm_impl_sm80<64> in this translation
// unit.
QMM_SM80_GPU(64)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/cutlass_utils.cuh"
#include "mlx/backend/cuda/quantized/quantized_utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"

#include <cute/tensor.hpp>
#include <cutlass/cutlass.h>
#include <cutlass/epilogue/collective/collective_builder.hpp>
#include <cutlass/gemm/collective/collective_builder.hpp>
#include <cutlass/gemm/device/gemm_universal_adapter.h>
#include <cutlass/gemm/kernel/gemm_universal.hpp>

#if defined(MLX_CUDA_SM90A_ENABLED)

// We can't put kernel code in mlx::core due to name conflicts of "Shape".
namespace cutlass_gemm {

using namespace cute;

// Configures a CUTLASS SM90 GEMM with a quantized operand and hands the
// resulting kernel to |launch_kernel| instead of launching it directly.
//
// Template parameters:
//   TileShapeMN  - first two modes of the CTA tile; the K mode is derived from
//                  the element width below.
//   ClusterShape - thread block cluster shape passed to the builders and to
//                  |launch_kernel|.
//   Element      - type of A, S, Z and D.
//   Quant        - packed type of B.
//   GroupSize    - compile-time group size (used in a static_assert below).
//
// Arguments:
//   A, B  - input pointers; B holds the quantized operand.
//   S, Z  - forwarded to the mainloop after the A/B pointers. NOTE(review):
//           presumably the per-group scales and zero points - callers in this
//           file pass the transposed scales and biases.
//   D     - output pointer.
//   m, n, k, l - problem extents; l is the batch count.
//   launch_kernel - callable invoked as
//           (kernel, grid, block, cluster, smem_bytes, kernel_params).
//
// CHECK_CUTLASS_ERROR is applied to can_implement() and initialize().
template <
    typename TileShapeMN = Shape<_128, _16>,
    typename ClusterShape = Shape<_1, _1, _1>,
    typename Element,
    typename Quant,
    typename GroupSize,
    typename F>
void qmm_sm90(
    const Element* A,
    const Quant* B,
    const Element* S,
    const Element* Z,
    Element* D,
    int64_t m,
    int64_t n,
    int64_t k,
    int64_t l,
    GroupSize group_size,
    F&& launch_kernel) {
  // Alignments are expressed in elements per 128 bits.
  constexpr int kAlignmentA = 128 / sizeof_bits<Element>::value;
  constexpr int kAlignmentB = 128 / sizeof_bits<Quant>::value;
  // K extent of the tile: 128 bytes worth of Element, but at least 64.
  constexpr int kTileShapeK =
      std::max(64, 128 * 8 / sizeof_bits<Element>::value);
  // The group size must be a whole multiple of the K tile.
  static_assert(group_size % kTileShapeK == 0);

  using Arch = cutlass::arch::Sm90;
  using Accumulator = float;
  using TileShape = decltype(append(TileShapeMN{}, Int<kTileShapeK>{}));

  // Epilogue with no source operand (ElementC is void); D is column-major in
  // the swapped problem.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      Arch,
      cutlass::arch::OpClassTensorOp,
      TileShape,
      ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      Accumulator,
      Accumulator,
      // ElementC:
      void,
      cutlass::layout::ColumnMajor,
      kAlignmentA,
      // ElementD:
      Element,
      cutlass::layout::ColumnMajor,
      kAlignmentA,
      cutlass::epilogue::TmaWarpSpecializedCooperative>::CollectiveOp;

  // Note that A/B are swapped and transposed to use TMA epilogue.
  // The builder's "A" operand is the tuple <Quant, Element, Element>, i.e. the
  // quantized data together with two Element-typed companions.
  using Mainloop = typename cutlass::gemm::collective::CollectiveBuilder<
      Arch,
      cutlass::arch::OpClassTensorOp,
      // ElementA:
      tuple<Quant, Element, Element>,
      cutlass::layout::RowMajor,
      kAlignmentB,
      // ElementB:
      Element,
      cutlass::layout::ColumnMajor,
      kAlignmentA,
      Accumulator,
      TileShape,
      ClusterShape,
      // Stage count is chosen automatically after reserving the epilogue's
      // shared storage.
      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
          sizeof(typename Epilogue::SharedStorage))>,
      cutlass::gemm::KernelTmaWarpSpecializedCooperative>::CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::
      GemmUniversal<Shape<int, int, int, int>, Mainloop, Epilogue>;
  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  // Strides; the last entry of each is the per-batch stride.
  auto dA = make_stride(k, Int<1>{}, m * k);
  auto dB = make_stride(k, Int<1>{}, n * k);
  auto dS = make_stride(Int<1>{}, n, n * k / group_size);
  auto dD = make_stride(Int<1>{}, n, m * n);

  Gemm gemm;
  // The problem is given as (n, m, k, l) with B before A, matching the swap
  // noted above. Epilogue scalars are alpha = 1, beta = 0.
  typename Gemm::Arguments args{
      cutlass::gemm::GemmUniversalMode::kGemm,
      {int(n), int(m), int(k), int(l)},
      {B, dB, A, dA, S, dS, group_size, Z},
      {{1.f, 0.f}, D, dD, D, dD}};

  CHECK_CUTLASS_ERROR(gemm.can_implement(args));
  // initialize() is called with a null workspace pointer.
  CHECK_CUTLASS_ERROR(gemm.initialize(args, nullptr));

  // Hand the device kernel, launch geometry and the initialized params to the
  // caller-provided launcher.
  auto* kernel = &cutlass::device_kernel<GemmKernel>;
  void* kernel_params[] = {const_cast<Gemm::Params*>(&gemm.params())};
  auto cluster = ClusterShape{};
  launch_kernel(
      reinterpret_cast<void*>(kernel),
      gemm.get_grid_shape(gemm.params()),
      GemmKernel::get_block_shape(),
      {static_cast<unsigned>(get<0>(cluster)),
       static_cast<unsigned>(get<1>(cluster)),
       static_cast<unsigned>(get<2>(cluster))},
      GemmKernel::SharedStorageSize,
      kernel_params);
}

} // namespace cutlass_gemm

namespace mlx::core {

// Returns a contiguous copy of |x| with its last two axes swapped. The copy is
// registered with |encoder| as a temporary.
inline array transpose_last_2_dims(
    const array& x,
    cu::CommandEncoder& encoder,
    const Stream& s) {
  // Swap the trailing axes first, then materialize the result contiguously.
  array swapped = swapaxes_in_eval(x, -1, -2);
  array materialized = contiguous_copy_gpu(swapped, s);
  encoder.add_temporary(materialized);
  return materialized;
}

// Maps a runtime MLX dtype to the matching element type and invokes
// f.template operator()<Element>() with it:
//   float32  -> float
//   float16  -> cutlass::half_t
//   bfloat16 -> cutlass::bfloat16_t
// Any other dtype throws std::invalid_argument; the message starts with |tag|.
template <typename F>
inline void dispatch_element_types(Dtype dtype, const char* tag, F&& f) {
  if (dtype == float32) {
    f.template operator()<float>();
    return;
  }
  if (dtype == float16) {
    f.template operator()<cutlass::half_t>();
    return;
  }
  if (dtype == bfloat16) {
    f.template operator()<cutlass::bfloat16_t>();
    return;
  }
  throw std::invalid_argument(
      fmt::format("{} Unsupported dtype: {}.", tag, dtype_to_string(dtype)));
}

// Maps a runtime bit width to the packed weight type and invokes
// f.template operator()<Quant>():
//   2 -> cutlass::uint2b_t, 4 -> cutlass::uint4b_t, 8 -> uint8_t
// Other widths throw std::invalid_argument; the message starts with |tag|.
template <typename F>
inline void dispatch_quant_types(int bits, const char* tag, F&& f) {
  switch (bits) {
    case 2:
      f.template operator()<cutlass::uint2b_t>();
      break;
    case 4:
      f.template operator()<cutlass::uint4b_t>();
      break;
    case 8:
      f.template operator()<uint8_t>();
      break;
    default:
      throw std::invalid_argument(
          fmt::format("{} {}-bit quantization is not supported.", tag, bits));
  }
}

// Turns a runtime group size into a compile-time cute::Int<N> value and passes
// it to |f| as an ordinary argument. Only 64 and 128 are accepted; other
// values throw std::invalid_argument whose message starts with |tag|.
template <typename F>
inline void dispatch_groups(int group_size, const char* tag, F&& f) {
  switch (group_size) {
    case 64:
      f(cute::Int<64>{});
      break;
    case 128:
      f(cute::Int<128>{});
      break;
    default:
      throw std::invalid_argument(
          fmt::format("{} Group size {} is not supported.", tag, group_size));
  }
}

// Host-side entry point of the SM90 quantized matmul for one
// (TileShapeMN, ClusterShape) configuration.
//
// Problem sizes are read from the arrays: m/n are the last two dimensions of
// |out|, k is the last dimension of |x| and l is the number of (m, n) matrices
// in |out|. Scales and biases are first copied into a layout with their last
// two axes swapped, then the runtime dtype / bit width / group size are
// resolved to template arguments and cutlass_gemm::qmm_sm90 is asked to launch
// through encoder.add_kernel_node_raw.
//
// Throws std::invalid_argument (via the dispatch helpers) for unsupported
// dtype, bit width or group size.
template <typename TileShapeMN, typename ClusterShape>
void qmm_impl_sm90(
    const array& x,
    const array& w,
    const array& scales_,
    const array& biases_,
    array& out,
    int bits,
    int group_size,
    cu::CommandEncoder& encoder,
    Stream s) {
  const char* tag = "[quantized_matmul]";
  int m = out.shape(-2);
  int n = out.shape(-1);
  int k = x.shape(-1);
  // Multiply in size_t: m * n evaluated as int overflows (undefined behavior)
  // once a single output matrix has 2^31 or more elements.
  int l = out.size() / (static_cast<size_t>(m) * static_cast<size_t>(n));

  // FIXME: Copy happens for every call.
  array scales = transpose_last_2_dims(scales_, encoder, s);
  array biases = transpose_last_2_dims(biases_, encoder, s);

  dispatch_element_types(out.dtype(), tag, [&]<typename Element>() {
    dispatch_quant_types(bits, tag, [&]<typename Quant>() {
      dispatch_groups(group_size, tag, [&](auto group_size) {
        encoder.set_input_array(x);
        encoder.set_input_array(w);
        encoder.set_input_array(scales);
        encoder.set_input_array(biases);
        encoder.set_output_array(out);
        // TileShapeMN / ClusterShape are not forwarded here, so qmm_sm90 is
        // called with its default template arguments.
        cutlass_gemm::qmm_sm90(
            gpu_ptr<Element>(x),
            gpu_ptr<Quant>(w),
            gpu_ptr<Element>(scales),
            gpu_ptr<Element>(biases),
            gpu_ptr<Element>(out),
            m,
            n,
            k,
            l,
            group_size,
            // The kernel is recorded as a raw kernel node on the encoder,
            // including the cluster dimensions.
            [&](auto* kernel,
                dim3 num_blocks,
                dim3 block_dims,
                dim3 cluster_shape,
                uint32_t smem_bytes,
                void** args) {
              encoder.add_kernel_node_raw(
                  kernel,
                  num_blocks,
                  block_dims,
                  cluster_shape,
                  smem_bytes,
                  args);
            });
      });
    });
  });
}

} // namespace mlx::core

// Emits an explicit instantiation of
// mlx::core::qmm_impl_sm90<TileShapeMN, ClusterShape>. This definition is only
// active when MLX_CUDA_SM90A_ENABLED is defined.
#define QMM_SM90_GPU(TileShapeMN, ClusterShape)           \
  namespace mlx::core {                                   \
  template void qmm_impl_sm90<TileShapeMN, ClusterShape>( \
      const array& x,                                     \
      const array& w,                                     \
      const array& scales,                                \
      const array& biases,                                \
      array& out,                                         \
      int bits,                                           \
      int group_size,                                     \
      cu::CommandEncoder& encoder,                        \
      Stream s);                                          \
  }

#else

// Fallback when MLX_CUDA_SM90A_ENABLED is not defined: the macro expands to
// nothing, so the instantiation translation units emit no SM90 code.
#define QMM_SM90_GPU(TileShapeMN, ClusterShape)

#endif // defined(MLX_CUDA_SM90A_ENABLED)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n128_m2.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"

using namespace cute;

// Template arguments for this translation unit's instantiation: a 128 x 128
// tile shape and a 2 x 1 x 1 cluster shape.
using TileShapeMN = Shape<_128, _128>;
using ClusterShape = Shape<_2, _1, _1>;

// Explicitly instantiates qmm_impl_sm90<TileShapeMN, ClusterShape>; expands to
// nothing when MLX_CUDA_SM90A_ENABLED is not defined.
QMM_SM90_GPU(TileShapeMN, ClusterShape)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n16_m1.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"

using namespace cute;

// Template arguments for this translation unit's instantiation: a 128 x 16
// tile shape and a 1 x 1 x 1 cluster shape.
using TileShapeMN = Shape<_128, _16>;
using ClusterShape = Shape<_1, _1, _1>;

// Explicitly instantiates qmm_impl_sm90<TileShapeMN, ClusterShape>; expands to
// nothing when MLX_CUDA_SM90A_ENABLED is not defined.
QMM_SM90_GPU(TileShapeMN, ClusterShape)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n256_m2.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"

using namespace cute;

// Template arguments for this translation unit's instantiation: a 128 x 256
// tile shape and a 2 x 1 x 1 cluster shape.
using TileShapeMN = Shape<_128, _256>;
using ClusterShape = Shape<_2, _1, _1>;

// Explicitly instantiates qmm_impl_sm90<TileShapeMN, ClusterShape>; expands to
// nothing when MLX_CUDA_SM90A_ENABLED is not defined.
QMM_SM90_GPU(TileShapeMN, ClusterShape)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n32_m1.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"

using namespace cute;

// Template arguments for this translation unit's instantiation: a 128 x 32
// tile shape and a 1 x 1 x 1 cluster shape.
using TileShapeMN = Shape<_128, _32>;
using ClusterShape = Shape<_1, _1, _1>;

// Explicitly instantiates qmm_impl_sm90<TileShapeMN, ClusterShape>; expands to
// nothing when MLX_CUDA_SM90A_ENABLED is not defined.
QMM_SM90_GPU(TileShapeMN, ClusterShape)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n64_m2.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"

using namespace cute;

// Template arguments for this translation unit's instantiation: a 128 x 64
// tile shape and a 2 x 1 x 1 cluster shape.
using TileShapeMN = Shape<_128, _64>;
using ClusterShape = Shape<_2, _1, _1>;

// Explicitly instantiates qmm_impl_sm90<TileShapeMN, ClusterShape>; expands to
// nothing when MLX_CUDA_SM90A_ENABLED is not defined.
QMM_SM90_GPU(TileShapeMN, ClusterShape)


================================================
FILE: mlx/backend/cuda/quantized/qmm/qmv.cu
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/quantized/qmm/qmm.h"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cute/numeric/numeric_types.hpp>
#include <cutlass/numeric_conversion.h>

namespace cutlass {

using uint3b_t = integer_subbyte<3, false>;
using uint5b_t = integer_subbyte<5, false>;

// Converts an array of packed 3-bit unsigned integers to Array<T, N>.
//
// The source is read as raw bytes: every 3 bytes hold 8 values, with value j
// of a group occupying bits [3j, 3j + 3) of the 24-bit little-endian group.
// Values that straddle a byte boundary (j = 2 and j = 5) are assembled from
// the two partial fields, each converted to T before being added.
template <typename T, int N, FloatRoundStyle Round>
struct NumericArrayConverter<T, uint3b_t, N, Round> {
  static_assert(N % 8 == 0);

  using result_type = Array<T, N>;
  using source_type = Array<uint3b_t, N>;

  CUTLASS_HOST_DEVICE
  static result_type convert(const source_type& source) {
    result_type unpacked;
    const auto* bytes = reinterpret_cast<const uint8_t*>(&source);
    CUTLASS_PRAGMA_UNROLL
    for (int g = 0; g < N / 8; ++g) {
      const auto* in = bytes + g * 3; // 3 input bytes per group
      const int o = g * 8; // 8 output values per group
      unpacked[o] = T(in[0] & 0x07);
      unpacked[o + 1] = T((in[0] & 0x38) >> 3);
      unpacked[o + 2] = T((in[0] & 0xc0) >> 6) + T((in[1] & 0x01) << 2);
      unpacked[o + 3] = T((in[1] & 0x0e) >> 1);
      unpacked[o + 4] = T((in[1] & 0x70) >> 4);
      unpacked[o + 5] = T((in[1] & 0x80) >> 7) + T((in[2] & 0x03) << 1);
      unpacked[o + 6] = T((in[2] & 0x1c) >> 2);
      unpacked[o + 7] = T((in[2] & 0xe0) >> 5);
    }
    return unpacked;
  }

  CUTLASS_HOST_DEVICE
  result_type operator()(const source_type& s) const {
    return convert(s);
  }
};

// Converts an array of packed 5-bit unsigned integers to Array<T, N>.
//
// The source is read as raw bytes: every 5 bytes hold 8 values, with value j
// of a group occupying bits [5j, 5j + 5) of the 40-bit little-endian group.
// Values that straddle a byte boundary are assembled from the two partial
// fields, each converted to T before being added.
template <typename T, int N, FloatRoundStyle Round>
struct NumericArrayConverter<T, uint5b_t, N, Round> {
  static_assert(N % 8 == 0);

  using result_type = Array<T, N>;
  using source_type = Array<uint5b_t, N>;

  CUTLASS_HOST_DEVICE
  static result_type convert(const source_type& source) {
    result_type unpacked;
    const auto* bytes = reinterpret_cast<const uint8_t*>(&source);
    CUTLASS_PRAGMA_UNROLL
    for (int g = 0; g < N / 8; ++g) {
      const auto* in = bytes + g * 5; // 5 input bytes per group
      const int o = g * 8; // 8 output values per group
      unpacked[o] = T(in[0] & 0x1f);
      unpacked[o + 1] = T((in[0] & 0xe0) >> 5) + T((in[1] & 0x03) << 3);
      unpacked[o + 2] = T((in[1] & 0x7c) >> 2);
      unpacked[o + 3] = T((in[1] & 0x80) >> 7) + T((in[2] & 0x0f) << 1);
      unpacked[o + 4] = T((in[2] & 0xf0) >> 4) + T((in[3] & 0x01) << 4);
      unpacked[o + 5] = T((in[3] & 0x3e) >> 1);
      unpacked[o + 6] = T((in[3] & 0xc0) >> 6) + T((in[4] & 0x07) << 2);
      unpacked[o + 7] = T((in[4] & 0xf8) >> 3);
    }
    return unpacked;
  }

  CUTLASS_HOST_DEVICE
  result_type operator()(const source_type& s) const {
    return convert(s);
  }
};

// Converts an array of packed 6-bit unsigned integers to Array<T, N>.
//
// The source is read as raw bytes: every 3 bytes hold 4 values, with value j
// of a group occupying bits [6j, 6j + 6) of the 24-bit little-endian group.
// The two middle values straddle a byte boundary and are assembled from the
// two partial fields, each converted to T before being added.
template <typename T, int N, FloatRoundStyle Round>
struct NumericArrayConverter<T, uint6b_t, N, Round> {
  static_assert(N % 4 == 0);

  using result_type = Array<T, N>;
  using source_type = Array<uint6b_t, N>;

  CUTLASS_HOST_DEVICE
  static result_type convert(const source_type& source) {
    result_type unpacked;
    const auto* bytes = reinterpret_cast<const uint8_t*>(&source);
    CUTLASS_PRAGMA_UNROLL
    for (int g = 0; g < N / 4; ++g) {
      const auto* in = bytes + g * 3; // 3 input bytes per group
      const int o = g * 4; // 4 output values per group
      unpacked[o] = T(in[0] & 0x3f);
      unpacked[o + 1] = T((in[0] >> 6) & 0x03) + T((in[1] & 0x0f) << 2);
      unpacked[o + 2] = T((in[1] >> 4) & 0x0f) + T((in[2] & 0x03) << 4);
      unpacked[o + 3] = T((in[2] >> 2) & 0x3f);
    }
    return unpacked;
  }

  CUTLASS_HOST_DEVICE
  result_type operator()(const source_type& s) const {
    return convert(s);
  }
};

} // namespace cutlass

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Fused vectorized dequantize and multiply-add:
// w_dq = w * scale + bias
// out = fma(x, w_dq, out)
// Overload that accumulates in the element type T.
//
// Loads N elements of x and N packed elements of w as vectors, converts w to
// T, applies the scale (and the bias when has_bias is set), and accumulates
// x * w_dq into the N values at |out|.
template <int N, bool has_bias, typename T, typename Q, typename S>
__device__ __forceinline__ void
dequant_fma(const T* x, const Q* w, S scale, T bias, T* out) {
  using VecT = cutlass::Array<T, N>;
  using VecQ = cutlass::Array<Q, N>;

  // Vector loads of the activations and the packed weights.
  const VecT x_reg = *reinterpret_cast<const VecT*>(x);
  const VecQ w_reg = *reinterpret_cast<const VecQ*>(w);
  // The accumulator is updated in place through |out|.
  VecT& acc = *reinterpret_cast<VecT*>(out);

  // Unpack the quantized weights into T.
  cutlass::NumericArrayConverter<T, Q, N> to_element;
  VecT w_deq = to_element(w_reg);

  // Apply scale and, if requested, bias.
  if constexpr (!has_bias) {
    w_deq = w_deq * T(scale);
  } else if constexpr (cuda::std::is_same_v<T, float>) {
    // float takes an explicit element-wise loop.
#pragma unroll
    for (int j = 0; j < N; ++j) {
      w_deq[j] = w_deq[j] * T(scale) + bias;
    }
  } else {
    w_deq = w_deq * T(scale) + bias;
  }

  // acc = x * w_deq + acc
  acc = cutlass::fma(x_reg, w_deq, acc);
}

// Overload for float32 accumulation when the element type T is not float.
//
// Dequantization (convert, scale, optional bias) is done in T; the x values
// and the dequantized weights are then widened to float before the fused
// multiply-add into the N floats at |out|.
template <
    int N,
    bool has_bias,
    typename T,
    typename Q,
    typename S,
    typename = cuda::std::enable_if_t<!cuda::std::is_same_v<T, float>>>
__device__ __forceinline__ void
dequant_fma(const T* x, const Q* w, S scale, T bias, float* out) {
  static_assert(!cuda::std::is_same_v<T, float>);

  using VecT = cutlass::Array<T, N>;
  using VecQ = cutlass::Array<Q, N>;
  using VecF = cutlass::Array<float, N>;

  // Vector loads of the activations and the packed weights.
  const VecT x_reg = *reinterpret_cast<const VecT*>(x);
  const VecQ w_reg = *reinterpret_cast<const VecQ*>(w);
  // The accumulator is updated in place through |out|.
  VecF& acc = *reinterpret_cast<VecF*>(out);

  // Unpack the quantized weights into T and apply scale / bias in T.
  cutlass::NumericArrayConverter<T, Q, N> to_element;
  VecT w_deq = to_element(w_reg);
  if constexpr (!has_bias) {
    w_deq = w_deq * T(scale);
  } else {
    w_deq = w_deq * T(scale) + bias;
  }

  // Widen both operands to float.
  cutlass::NumericArrayConverter<float, T, N> to_float;
  const VecF x_wide = to_float(x_reg);
  const VecF w_wide = to_float(w_deq);

  // acc = x * w + acc, in float.
  acc = cutlass::fma(x_wide, w_wide, acc);
}

// Quantized matrix-vector kernel: each warp produces one output element
// out[row] by accumulating x[i] * dequant(w[row][i]) over the k dimension.
//
// Index mapping:
//   blockIdx.x * rows_per_block + warp index within the block -> row (< n)
//   blockIdx.y                                                -> row of x/out
//   blockIdx.z                                                -> batch index
// gridDim.y is used as m, so the launch grid must be (.., m, l).
//
// Template parameters:
//   rows_per_block   - number of output rows (warps) handled per block.
//   elems_per_thread - elements of k consumed per thread per iteration.
//   group_size       - number of consecutive k elements sharing one
//                      scale/bias.
//   has_bias         - whether |biases| is read.
//   has_residue_k    - whether k is not a multiple of
//                      WARP_SIZE * elems_per_thread, enabling the tail pass.
//
// When |broadcast_w| is true the same w/scales/biases are used for every
// batch index.
//
// All pointer offsets are computed in int64_t so that large m * k * l,
// m * n * l or n * batch products cannot wrap in 32-bit arithmetic.
template <
    int rows_per_block,
    int elems_per_thread,
    int group_size,
    bool has_bias,
    bool has_residue_k,
    typename T,
    typename Q,
    typename S>
__global__ void qmv_kernel(
    const T* x,
    const Q* w,
    const S* scales,
    const T* biases,
    T* out,
    int n,
    int k,
    bool broadcast_w) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  // The row that this warp handles.
  int row = block.group_index().x * rows_per_block + warp.meta_group_rank();
  if (row >= n) {
    return;
  }

  // Advance pointers of x/out (64-bit offsets).
  const int64_t m = grid.dim_blocks().y;
  const int64_t l = block.group_index().z;
  const int64_t m_idx = block.group_index().y;
  x += m_idx * k + m * k * l;
  out += m_idx * n + m * n * l;

  // For sub-byte Q, pointer moves by 8bits for each advance, e.g. w += 1 would
  // move past 2 elements for 4-bit Q.
  constexpr int bits = cute::sizeof_bits_v<Q>;
  auto w_step = [&](int idx) { return idx * cuda::std::min(8, bits) / 8; };

  // How many groups (and scales/biases) in a row.
  int groups_per_row = k / group_size;

  // Advance w/scales/biases to current row.
  const int64_t w_batch = broadcast_w ? 0 : l;
  const int64_t w_row = static_cast<int64_t>(row) + n * w_batch;
  w += w_row * w_step(k);
  scales += w_row * groups_per_row;
  if constexpr (has_bias) {
    biases += w_row * groups_per_row;
  }

  // Accumulations of current row. Quant types of 8 bits or more accumulate in
  // float, narrower ones in T.
  cuda::std::conditional_t<(bits >= 8), float, T> sums[elems_per_thread] = {};

  // Processes elems_per_thread consecutive elements starting at idx, using the
  // scale/bias of the group that idx falls into.
  auto dequant_fma_tile = [&](int idx) {
    S scale = scales[idx / group_size];
    T bias{0};
    if constexpr (has_bias) {
      bias = biases[idx / group_size];
    }
    dequant_fma<elems_per_thread, has_bias>(
        x + idx, w + w_step(idx), scale, bias, sums);
  };

  // Loop over k dimension.
  constexpr int elems_per_warp = WARP_SIZE * elems_per_thread;
  for (int r = 0; r < k / elems_per_warp; ++r) {
    int idx = warp.thread_rank() * elems_per_thread + r * elems_per_warp;
    dequant_fma_tile(idx);
  }

  // Handle remaining elements in k dimension.
  if constexpr (has_residue_k) {
    int rest = k % elems_per_warp;
    int idx = warp.thread_rank() * elems_per_thread + k - rest;
    if (idx < k) {
      dequant_fma_tile(idx);
    }
  }

  // Result for current row: sum the per-thread partials, then reduce across
  // the warp in float.
  float sum{0};
#pragma unroll
  for (int i = 0; i < elems_per_thread; ++i) {
    sum += sums[i];
  }
  sum = cg::reduce(warp, sum, cg::plus<float>{});

  // Write result for current warp, which maps to rows 1-to-1.
  if (warp.thread_rank() == 0) {
    out[row] = static_cast<T>(sum);
  }
}

template <
    int group_size,
    bool has_bias,
    typename T,
    typename Q,
    typename S,
    typename F>
void qmv(
    const T* x,
    const Q* w,
    const S* scales,
    const T* biases,
    T* out,
    int m,
    int n,
    int k,
    int l,
    bool broadcast_w,
    F&& launch_kernel) {
  constexpr int rows_per_block = 8;
  constexpr int elems_per_thread =
      (cute::sizeof_bits_v<T> <= 16 && cute::sizeof_bits_v<Q> <= 4) ? 16 : 8;

  dim3 num_blocks{
      uint32_t(cuda::ceil_div(n, rows_per_block)), uint32_t(m), uint32_t(l)};
  dim3 block_dims{WARP_SIZE, rows_per_block};
  void* args[] = {&x, &w, &scales, &biases, &out, &n, &k, &broadcast_w};

  dispatch_bool(k % (WARP_SIZE * elems_per_thread), [&](auto has_residue_k) {
    auto* kernel = &qmv_kernel<
        rows_per_block,
        elems_per_thread,
        group_size,
        has_bias,
        has_residue_k.value,
        T,
        Q,
        S>;
    launch_kernel(
        reinterpret_cast<void*>(kernel), num_blocks, block_dims, args);
  });
}

} // namespace cu

// Maps a runtime MLX dtype to the matching element type and invokes
// f.template operator()<T>() with it:
//   float32  -> float
//   float16  -> cutlass::half_t
//   bfloat16 -> cutlass::bfloat16_t
// Any other dtype throws std::invalid_argument; the message starts with |tag|.
template <typename F>
inline void dispatch_element_types(Dtype dtype, const char* tag, F&& f) {
  if (dtype == float32) {
    f.template operator()<float>();
    return;
  }
  if (dtype == float16) {
    f.template operator()<cutlass::half_t>();
    return;
  }
  if (dtype == bfloat16) {
    f.template operator()<cutlass::bfloat16_t>();
    return;
  }
  throw std::invalid_argument(
      fmt::format("{} Unsupported dtype: {}.", tag, dtype_to_string(dtype)));
}

// Turns a runtime quantization group size into a compile-time constant by
// calling f.template operator()<N>() for N in {32, 64, 128}. Other values
// throw std::invalid_argument; the message starts with |tag|.
template <typename F>
inline void dispatch_groups(int group_size, const char* tag, F&& f) {
  switch (group_size) {
    case 32:
      f.template operator()<32>();
      break;
    case 64:
      f.template operator()<64>();
      break;
    case 128:
      f.template operator()<128>();
      break;
    default:
      throw std::invalid_argument(
          fmt::format("{} Group size {} is not supported.", tag, group_size));
  }
}

// Selects the <Q, S, group_size> template triple for a quantization
// configuration and invokes f.template operator()<Q, S, N>().
//
// The floating-point modes fix all three arguments and ignore |bits| and
// |group_size|:
//   Mxfp4 -> <float_e2m1_t, float_ue8m0_t, 32>
//   Mxfp8 -> <float_e4m3_t, float_ue8m0_t, 32>
//   Nvfp4 -> <float_e2m1_t, float_e4m3_t, 16>
// Every other mode uses T as the scale type, takes the group size from
// dispatch_groups, and supports 2/3/4/5/6/8-bit weights; other bit widths
// throw std::invalid_argument.
template <typename T, typename F>
inline void dispatch_quant_types(
    int bits,
    int group_size,
    QuantizationMode mode,
    const char* tag,
    F&& f) {
  if (mode == QuantizationMode::Mxfp4) {
    f.template operator()<cutlass::float_e2m1_t, cutlass::float_ue8m0_t, 32>();
    return;
  }
  if (mode == QuantizationMode::Mxfp8) {
    f.template operator()<cutlass::float_e4m3_t, cutlass::float_ue8m0_t, 32>();
    return;
  }
  if (mode == QuantizationMode::Nvfp4) {
    f.template operator()<cutlass::float_e2m1_t, cutlass::float_e4m3_t, 16>();
    return;
  }

  dispatch_groups(group_size, tag, [&]<int kGroupSize>() {
    switch (bits) {
      case 2:
        f.template operator()<cutlass::uint2b_t, T, kGroupSize>();
        break;
      case 3:
        f.template operator()<cutlass::uint3b_t, T, kGroupSize>();
        break;
      case 4:
        f.template operator()<cutlass::uint4b_t, T, kGroupSize>();
        break;
      case 5:
        f.template operator()<cutlass::uint5b_t, T, kGroupSize>();
        break;
      case 6:
        f.template operator()<cutlass::uint6b_t, T, kGroupSize>();
        break;
      case 8:
        f.template operator()<uint8_t, T, kGroupSize>();
        break;
      default:
        throw std::invalid_argument(
            fmt::format("{} {}-bit quantization is not supported.", tag, bits));
    }
  });
}

// Host-side entry point of the quantized matrix-vector product.
//
// Problem sizes are read from the arrays: m/n are the last two dimensions of
// |out|, k is the last dimension of |x| and l is the number of (m, n) matrices
// in |out|. A 2-D |w| is treated as shared across all batches. The runtime
// dtype / quantization configuration is resolved to template arguments, the
// arrays are registered with |encoder|, and cu::qmv launches the kernel
// through encoder.add_kernel_node_raw with no dynamic shared memory.
//
// The kernel reads biases only for quant types without a negative zero (per
// cutlass::has_negative_zero_v). |biases| is optional; a null pointer is
// passed when it is absent.
// Throws std::invalid_argument (via the dispatch helpers) for unsupported
// dtype, group size or bit width.
void qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int bits,
    int group_size,
    QuantizationMode mode,
    cu::CommandEncoder& encoder) {
  const char* tag = "[quantized_matmul]";
  int m = out.shape(-2);
  int n = out.shape(-1);
  int k = x.shape(-1);
  // Multiply in size_t: m * n evaluated as int overflows (undefined behavior)
  // once a single output matrix has 2^31 or more elements.
  int l = out.size() / (static_cast<size_t>(m) * static_cast<size_t>(n));
  bool broadcast_w = w.ndim() == 2;

  dispatch_element_types(out.dtype(), tag, [&]<typename T>() {
    dispatch_quant_types<T>(
        bits,
        group_size,
        mode,
        tag,
        [&]<typename Q, typename S, int group_size>() {
          encoder.set_input_array(x);
          encoder.set_input_array(w);
          encoder.set_input_array(scales);
          if (biases) {
            encoder.set_input_array(*biases);
          }
          encoder.set_output_array(out);
          constexpr bool has_bias = !cutlass::has_negative_zero_v<Q>;
          cu::qmv<group_size, has_bias>(
              gpu_ptr<T>(x),
              gpu_ptr<Q>(w),
              gpu_ptr<S>(scales),
              biases ? gpu_ptr<T>(*biases) : nullptr,
              gpu_ptr<T>(out),
              m,
              n,
              k,
              l,
              broadcast_w,
              [&](auto* kernel, dim3 num_blocks, dim3 block_dims, void** args) {
                encoder.add_kernel_node_raw(
                    kernel, num_blocks, block_dims, {}, 0, args);
              });
        });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qqmm.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/quantized/qmm/qmm.h"
#include "mlx/backend/cuda/quantized/qqmm_impl.h"
#include "mlx/backend/cuda/quantized/qqmm_utils.h"
#include "mlx/backend/cuda/quantized/quantized.h"
#include "mlx/backend/cuda/quantized/quantized_utils.h"
#include "mlx/primitives.h"

#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace {

// Quantizes |input| with fp_quantize and returns {quantized data, scales}.
//
// The input is first made contiguous. The quantized output is a uint32 array
// whose last dimension is shape(-1) * bits / 32; the scales are a uint8 array.
// Both outputs are allocated asynchronously and registered with |encoder| as
// temporaries. |mode| is not read by this function.
std::tuple<array, array> quantize_input(
    const array& input,
    cu::CommandEncoder& encoder,
    const Stream& s,
    QuantizationMode mode,
    int bits,
    int group_size,
    std::optional<array> global_scale = std::nullopt) {
  const array x = ensure_contiguous(input, encoder, s);

  // Shape of the packed output: the last axis shrinks by a factor of 32 / bits.
  auto packed_shape = x.shape();
  packed_shape.back() = x.shape(-1) * bits / 32;

  // One scale per group along the last axis; the padded extents size the
  // scale buffer.
  const int64_t scales_inner = x.shape(-1) / group_size;
  auto [pad_outer, pad_inner] =
      get_padded_scale_dims(x.shape(-2), scales_inner);

  // NOTE(review): the last entry of the scale shape is the unpadded group
  // count, while the buffer below is sized with pad_inner - confirm this is
  // intended.
  auto scales_shape = x.shape();
  scales_shape[x.ndim() - 2] = pad_outer;
  scales_shape.back() = scales_inner;

  // Buffer sizes in bytes.
  const int64_t xq_bytes = x.size() * bits / 8;
  const int64_t batch = x.size() / (x.shape(-2) * x.shape(-1));
  const int64_t scales_bytes = batch * (pad_outer * pad_inner);

  array x_q(
      cu::malloc_async(xq_bytes, encoder), std::move(packed_shape), uint32);
  array scales_x(
      cu::malloc_async(scales_bytes, encoder), std::move(scales_shape), uint8);
  encoder.add_temporary(x_q);
  encoder.add_temporary(scales_x);

  // global_scale is not nullopt only for NVFP4
  fp_quantize(x, x_q, scales_x, group_size, bits, global_scale, encoder, s);
  return {std::move(x_q), std::move(scales_x)};
}

// Allocates two scalar float32 device arrays (alpha, beta), has
// compute_qqmm_pointers fill them from the two global scales, registers both
// as temporaries on |encoder| and returns them as GemmScalars.
GemmScalars create_nvfp4_scalars(
    const array& global_scale_x,
    const array& global_scale_w,
    cu::CommandEncoder& encoder) {
  // NVFP4 requires alpha/beta as device pointers
  // alpha = amax_x * amax_w / (448 * 6)^2
  // beta = 0
  const auto make_device_scalar = [&encoder]() {
    return array(cu::malloc_async(sizeof(float), encoder), {}, float32);
  };
  array alpha = make_device_scalar();
  array beta = make_device_scalar();

  compute_qqmm_pointers(alpha, beta, global_scale_x, global_scale_w, encoder);

  encoder.add_temporary(alpha);
  encoder.add_temporary(beta);
  return {alpha, beta};
}

} // namespace

// Evaluates a matmul where the activations are quantized on the fly.
//
// Input layout (see the assert below):
// - inputs[0]: x
// - inputs[1]: w; treated as pre-quantized when its dtype is uint32
// - inputs[2]: scales of w (only when w is pre-quantized)
// - for NVFP4 only, two optional trailing inputs: global_scale_x and
//   global_scale_w
//
// Two execution paths:
// 1. w pre-quantized and x has a single row: x is quantized and dequantized
//    (fp_quantize_dequantize) and the product is computed with qmv.
// 2. Otherwise: both operands are used in quantized form and passed to
//    qqmm_impl. This path throws std::runtime_error when the device compute
//    capability is below 10.0.
void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("QQMatmul::eval_gpu");

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  auto& device = encoder.device();
  // A uint32 weight is taken to mean "already quantized", in which case its
  // scales follow as a third input.
  bool w_quantized = (inputs[1].dtype() == uint32);
  int base_size = w_quantized ? 3 : 2;

  assert(
      inputs.size() == base_size ||
      (mode_ == QuantizationMode::Nvfp4 && inputs.size() == base_size + 2));

  // Path 1: single-row x against a pre-quantized w.
  if (w_quantized && inputs[0].shape(-2) == 1) {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));

    // For nvfp4, get global scale for x from inputs if present
    bool has_global_scale =
        mode_ == QuantizationMode::Nvfp4 && inputs.size() > base_size;
    std::optional<array> global_scale = std::nullopt;
    if (has_global_scale) {
      // Second-to-last input is the global scale of x.
      global_scale = inputs[inputs.size() - 2];
    }

    bool donate_x = inputs[0].is_donatable();
    array x = ensure_row_contiguous(inputs[0], encoder, s);
    // If x is a copy it should be donatable
    donate_x |= x.is_donatable();
    // Write the quantize-dequantize result in place when x can be donated,
    // otherwise into a fresh temporary of the same shape and dtype.
    auto xhat = donate_x
        ? x
        : array(cu::malloc_async(x.nbytes(), encoder), x.shape(), x.dtype());
    if (!donate_x) {
      encoder.add_temporary(xhat);
    }
    fp_quantize_dequantize(
        x, xhat, group_size_, bits_, global_scale, encoder, s);

    const array& w = inputs[1];
    const array& scales = inputs[2];
    // No biases are passed to qmv on this path.
    qmv(xhat, w, scales, std::nullopt, out, bits_, group_size_, mode_, encoder);
    return;
  }

  // Path 2 requires compute capability >= 10.0, encoded as
  // major * 100 + minor * 10.
  auto cc = device.compute_capability_major() * 100 +
      device.compute_capability_minor() * 10;
  if (cc < 1000) {
    throw std::runtime_error(
        "[QQMatmul::eval_gpu] QQMM is only supported on GPUs with compute capability 10.0 or higher.");
  }

  // - 2 inputs: x, w (non-quantized w)
  // - 3 inputs: x, w, scales_w (quantized w)

  // For nvfp4, global scales are optional but must be both present or both
  // absent. If present, they add 2 more inputs (global_scale_x,
  // global_scale_w).
  bool has_global_scales =
      mode_ == QuantizationMode::Nvfp4 && inputs.size() > base_size;

  // For nvfp4, get global scales from inputs if present
  std::optional<array> global_scale_x = std::nullopt;
  std::optional<array> global_scale_w = std::nullopt;
  if (has_global_scales) {
    global_scale_x = inputs[inputs.size() - 2];
    global_scale_w = inputs[inputs.size() - 1];
  }

  // Quantize inputs (or use pre-quantized)
  auto [x_q, scale_x_pre] = quantize_input(
      inputs[0], encoder, s, mode_, bits_, group_size_, global_scale_x);
  auto [w_q, scale_w_pre] = !w_quantized
      ? quantize_input(
            inputs[1], encoder, s, mode_, bits_, group_size_, global_scale_w)
      : std::make_tuple(
            ensure_contiguous(inputs[1], encoder, s),
            ensure_contiguous(inputs[2], encoder, s));

  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  // Problem sizes are derived from the packed operands; K is recovered from
  // the number of 32-bit words per row.
  int M = x_q.shape(-2);
  int N = w_q.shape(-2); // transposed
  int K = x_q.shape(-1) * (32 / bits_);

  bool x_transposed = false;
  bool w_transposed = true; // always transposed
  int64_t lda = K;
  int64_t ldb = K;

  // Repack scales to tiled layout for tensor cores
  array scale_x = pad_and_swizzle_scales(scale_x_pre, encoder, s);
  array scale_w = pad_and_swizzle_scales(scale_w_pre, encoder, s);

  // Left empty unless global scales were supplied; qqmm_impl checks
  // scalars.has_values() to decide which run() overload to call.
  GemmScalars scalars;
  if (has_global_scales) {
    scalars = create_nvfp4_scalars(*global_scale_x, *global_scale_w, encoder);
  }

  qqmm_impl(
      encoder,
      M,
      N,
      K,
      x_transposed,
      lda,
      w_transposed,
      ldb,
      out,
      x_q,
      w_q,
      scale_x,
      scale_w,
      mode_,
      scalars);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qqmm_impl.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/quantized/qqmm_impl.h"
#include "mlx/backend/cuda/quantized/cublas_qqmm.h"

namespace mlx::core {

// Runs a single (non-batched) block-scaled quantized GEMM through CublasQQMM.
//
// `a`/`b` are the packed operands and `a_scale`/`b_scale` their scale factors.
// When `scalars` carries device alpha/beta values they are forwarded to the
// GEMM; otherwise the overload without explicit scalars is used.
void qqmm_impl(
    cu::CommandEncoder& encoder,
    int M,
    int N,
    int K,
    bool a_transposed,
    int64_t lda,
    bool b_transposed,
    int64_t ldb,
    array& out,
    const array& a,
    const array& b,
    const array& a_scale,
    const array& b_scale,
    QuantizationMode mode,
    const GemmScalars& scalars) {
  std::string mode_name = quantization_mode_to_string(mode);

  // A is described as M x K and B as K x N, with a batch count of 1 and zero
  // batch strides.
  CublasQQMM gemm(
      encoder.device(),
      a_transposed,
      M,
      K,
      lda,
      b_transposed,
      K,
      N,
      ldb,
      1, // batch_count
      0, // a_batch_stride
      0, // b_batch_stride
      out.dtype(),
      mode_name);

  if (!scalars.has_values()) {
    gemm.run(encoder, out, a, b, a_scale, b_scale);
    return;
  }

  gemm.run(
      encoder,
      out,
      a,
      b,
      a_scale,
      b_scale,
      *scalars.alpha_device,
      *scalars.beta_device);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qqmm_impl.h
================================================
// Copyright © 2025 Apple Inc.
#pragma once

#include "mlx/backend/cuda/device.h"
#include "mlx/primitives.h"

#include <optional>

namespace mlx::core {

// Optional device-resident alpha/beta scalars passed to the quantized GEMM.
struct GemmScalars {
  std::optional<array> alpha_device;
  std::optional<array> beta_device;

  // Reports whether alpha is set. Only alpha is inspected; beta is not
  // checked.
  bool has_values() const {
    return static_cast<bool>(alpha_device);
  }
};

// Runs a quantized GEMM writing into `out`.
// - M, N, K: problem sizes.
// - a_transposed / b_transposed, lda / ldb: layout and leading dimension of
//   each operand.
// - a, b: quantized operands; a_scale, b_scale: their scale factors.
// - mode: quantization mode of the operands.
// - scalars: optional device alpha/beta; defaults to empty.
void qqmm_impl(
    cu::CommandEncoder& encoder,
    int M,
    int N,
    int K,
    bool a_transposed,
    int64_t lda,
    bool b_transposed,
    int64_t ldb,
    array& out,
    const array& a,
    const array& b,
    const array& a_scale,
    const array& b_scale,
    QuantizationMode mode,
    const GemmScalars& scalars = {});

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qqmm_utils.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/quantized/qqmm_utils.h"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cg = cooperative_groups;

// Scale-factor tile geometry: one tile holds TILE_ROWS x TILE_COLS scales.
constexpr int TILE_ROWS = 128;
constexpr int TILE_COLS = 4;
// Tiles handled per thread along x, and threads along x per block; their
// product is the number of tiles one block covers (see
// get_swizzle_launch_args).
constexpr int TILES_PER_LANE = 1;
constexpr int LANES_PER_BLOCK = 32;

// To pass scales to tensor cores, they need to be repacked into a tiled layout
// https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
// Tiled layout for scale factors is very well described in CUTLASS
// documentation:
// https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/blackwell_functionality.md#scale-factor-layouts
// Conceptually, it should be like this:
// q_w = mx.zeros(shape=(M, N)) <-- zeros just for an example
// s.shape = (M, N // 16) -- packed in row contiguous order, group_size = 16
// cbg_cnt = N // 16 // 4
// rb_cnt = M // 128
// tmp = x.reshape(rb_cnt, 4, 32, cbg_cnt, 4)
// repacked_scales = tmp.transpose(0, 3, 2, 1, 4)
// example: indices of the initial 128 x 4 tile of scales (packed in a row-major
// tensor (M, K // 16), where M = 128, K = 64): array([[0, 1, 2, 3],
//       [4, 5, 6, 7],
//       [8, 9, 10, 11],
//       ...,
//       [500, 501, 502, 503],
//       [504, 505, 506, 507],
//       [508, 509, 510, 511]]
// packed scales within tile 128 x 4:
// array([[[[[0, 1, 2, 3], <-- s_0,0..s_0,3 scales
//          [128, 129, 130, 131], <-- s_32,0..s_32,3 scales
//          [256, 257, 258, 259], <-- s_64,0..s_64,3 scales
//          [384, 385, 386, 387]], <-- s_96,0..s_96,3 scales
//         [[4, 5, 6, 7], <-- s_1,0..s_1,3 scales
//          [132, 133, 134, 135], ...
//          [260, 261, 262, 263],
//          [388, 389, 390, 391]],
//         [[124, 125, 126, 127],
//          [252, 253, 254, 255],
//          [380, 381, 382, 383],
//          [508, 509, 510, 511]]]]],

// Computes the (grid, block) launch dimensions for the scale swizzle kernel.
//
// `M_swizzled` / `K_swizzled` are the padded scale dimensions. grid.y has one
// entry per row tile; grid.x covers the column tiles in groups of
// LANES_PER_BLOCK * TILES_PER_LANE.
inline std::tuple<dim3, dim3> get_swizzle_launch_args(
    size_t M_swizzled,
    size_t K_swizzled) {
  constexpr int tiles_per_block = LANES_PER_BLOCK * TILES_PER_LANE;
  constexpr int block_rows = TILE_ROWS / 4; // 128 / 4 = 32

  const int tiles_along_k = K_swizzled / TILE_COLS;
  const int tiles_along_m = M_swizzled / TILE_ROWS;

  dim3 grid(cuda::ceil_div(tiles_along_k, tiles_per_block), tiles_along_m, 1);
  // Block is always (32, 32) = 1024 threads
  dim3 block(LANES_PER_BLOCK, block_rows, 1);

  return std::make_tuple(grid, block);
}

namespace cu {

// Constants used by compute_qqmm_pointers to form
// alpha = amax_x * amax_w / (F8E4M3_MAX * F4E2M1_MAX)^2.
// NOTE(review): the names suggest these are the largest finite values of the
// FP8 E4M3 and FP4 E2M1 formats; confirm against the format specs.
constexpr float F8E4M3_MAX = 448.0f;
constexpr float F4E2M1_MAX = 6.0f;

// Writes the GEMM scalars for the NVFP4 path:
//   *alpha_out = (*tensor_amax_x) * (*tensor_amax_w) / (448 * 6)^2
//   *beta_out  = 0
// The kernel performs unguarded scalar writes; the host wrapper launches it
// with a single thread.
__global__ void compute_qqmm_pointers(
    float* alpha_out,
    float* beta_out,
    const float* tensor_amax_x,
    const float* tensor_amax_w) {
  // Reciprocal of (448 * 6)^2, folded at compile time.
  constexpr float inv_scale_sq =
      1.0f / (F8E4M3_MAX * F4E2M1_MAX * F8E4M3_MAX * F4E2M1_MAX);

  const float amax_x = *tensor_amax_x;
  const float amax_w = *tensor_amax_w;

  *alpha_out = amax_x * amax_w * inv_scale_sq;
  *beta_out = 0.0f;
}

// Repacks row-major scale factors into the tiled (128 x 4) layout, writing
// zeros wherever a tile extends past the input bounds.
//
// - scales_linear: input scales, M rows by K columns, one byte each.
// - scales_swizzled: output buffer in tiled layout.
// - M, K: input dimensions, used for bounds checks and input addressing.
// - M_swizzled, K_swizzled: padded output dimensions. Only K_swizzled is read
//   in this kernel.
//
// Work split: blockIdx.y selects the row tile; each block covers up to
// max_tiles_per_block column tiles starting at blockIdx.x *
// max_tiles_per_block. Within a block, threadIdx.x selects the column tile
// and each thread gathers 4 rows spaced block_size.x apart.
// NOTE(review): the row spacing uses block_size.x, so the 32-row spacing in
// the tiled layout relies on the (32, 32) block from get_swizzle_launch_args.
__global__ void swizzle_scales(
    const uint8_t* scales_linear,
    uint8_t* scales_swizzled,
    const size_t M,
    const size_t K,
    const size_t M_swizzled,
    const size_t K_swizzled) {
  constexpr int tile_size = TILE_ROWS * TILE_COLS;
  constexpr int num_tile_rows_per_thread = 4;
  constexpr int max_tiles_per_block = LANES_PER_BLOCK * TILES_PER_LANE;

  constexpr int tile_stride = tile_size / 16; // 32 int4s per tile

  // Each thread loads 16 scales from 4 rows (stride 32) and packs them into
  // int4. For example: thread (0, 0) loads scales at rows 0,32,64,96 of tile 0,
  // thread (1, 0) loads rows 0,32,64,96 of tile 1, etc.
  // The store is strided within a warp (stride 32 int4s), so we first
  // write to shared memory, then do a coalesced store from shared to global
  auto block_size = cg::this_thread_block().dim_threads();
  auto block_idx = cg::this_thread_block().group_index();
  auto idx_in_block = cg::this_thread_block().thread_index();

  auto tidx = idx_in_block.x;
  auto tidy = idx_in_block.y;
  auto linear_tid = tidy * block_size.x + tidx;

  const int bid_x = block_idx.x;
  const int bid_y = block_idx.y;

  // Number of 4-byte words per padded row, i.e. the number of column tiles.
  const int K_int = K_swizzled / 4;

  // Offset (in 4-byte words) of the first output tile written by this block.
  const size_t output_offset = static_cast<size_t>(bid_y) * TILE_ROWS * K_int +
      static_cast<size_t>(bid_x) * max_tiles_per_block * tile_size / 4;
  int* output_block = reinterpret_cast<int*>(scales_swizzled) + output_offset;

  // NOTE(review): grid_dim_x and grid_dim_y are not used below.
  const int grid_dim_x = cg::this_grid().dim_blocks().x;
  const int grid_dim_y = cg::this_grid().dim_blocks().y;

  // The last block along x may cover fewer than max_tiles_per_block tiles.
  int remaining = K_int - bid_x * max_tiles_per_block;
  int tiles_in_block = min(remaining, max_tiles_per_block);
  bool valid_tile = tidx * TILES_PER_LANE < tiles_in_block;

  // Staging buffer: tile_stride int4 entries per tile handled by the block.
  __shared__ int4 strided_scales_thread[max_tiles_per_block * tile_stride];

  // Initialize to zero for padding
  int thread_tile_rows[num_tile_rows_per_thread] = {0};

  if (valid_tile) {
    // First input column of the tile handled by this thread.
    const size_t col_base =
        static_cast<size_t>(bid_x) * max_tiles_per_block * TILE_COLS +
        tidx * TILE_COLS;

    const bool aligned_k = (K % 4 == 0);

    if (aligned_k) {
      // fast path: K is aligned, use vectorized loads with stride K/4
      const int K_stride = K / 4;
      const size_t block_offset =
          static_cast<size_t>(bid_y) * TILE_ROWS * K_stride +
          static_cast<size_t>(bid_x) * max_tiles_per_block;
      const int* input_block =
          reinterpret_cast<const int*>(scales_linear) + block_offset;
// load
#pragma unroll
      for (int i = 0; i < num_tile_rows_per_thread; i++) {
        const size_t row =
            static_cast<size_t>(bid_y) * TILE_ROWS + i * block_size.x + tidy;
        const int thread_offset =
            (i * block_size.x + tidy) * K_stride + tidx * TILES_PER_LANE;
        if (row < M && col_base + TILE_COLS <= K) {
          // Whole 4-scale group is in bounds: one 32-bit read-only load.
          thread_tile_rows[i] = __ldg(input_block + thread_offset);
        } else if (row < M) {
// partial tile at K boundary: load byte-by-byte
#pragma unroll
          for (int c = 0; c < TILE_COLS; c++) {
            if (col_base + c < K) {
              reinterpret_cast<uint8_t*>(&thread_tile_rows[i])[c] =
                  scales_linear[row * K + col_base + c];
            }
          }
        }
      }
    } else {
      // K is not a multiple of 4: rows are not 4-byte aligned, so every scale
      // is loaded as a single byte with bounds checks.
#pragma unroll
      for (int i = 0; i < num_tile_rows_per_thread; i++) {
        const size_t row =
            static_cast<size_t>(bid_y) * TILE_ROWS + i * block_size.x + tidy;
        if (row < M) {
          const size_t row_start = row * K;
#pragma unroll
          for (int c = 0; c < TILE_COLS; c++) {
            if (col_base + c < K) {
              reinterpret_cast<uint8_t*>(&thread_tile_rows[i])[c] =
                  scales_linear[row_start + col_base + c];
            }
          }
        }
      }
    }
    // store to shared with XOR swizzle to avoid bank conflicts
    int base_idx = tidx * tile_stride + tidy;
    int xor_bits = (tidy >> 3) & 0x3;
    int swizzled_idx = base_idx ^ xor_bits;
    strided_scales_thread[swizzled_idx] =
        *reinterpret_cast<int4*>(thread_tile_rows);
  }

  // All staged values must be visible before the copy to global memory.
  cg::thread_block block = cg::this_thread_block();
  cg::sync(block);

  // Copy the staged tiles to the output in linear order, undoing the XOR
  // swizzle applied on the shared-memory index.
  const int total_int4s = tiles_in_block * tile_stride;
#pragma unroll
  for (int i = linear_tid; i < total_int4s; i += block_size.x * block_size.y) {
    int tile_idx = i / tile_stride;
    int row_idx = i % tile_stride;
    int base_idx = tile_idx * tile_stride + row_idx;
    int xor_bits = (row_idx >> 3) & 0x3;
    int swizzled_idx = base_idx ^ xor_bits;
    reinterpret_cast<int4*>(output_block)[i] =
        strided_scales_thread[swizzled_idx];
  }
}
} // namespace cu

// Host-side launcher for cu::swizzle_scales.
//
// Reads the last two dimensions of `scales` (input) and `scales_tiled`
// (output) and launches the repacking kernel. `s` is not used here.
void swizzle_scales(
    const array& scales,
    array& scales_tiled,
    cu::CommandEncoder& enc,
    const Stream& s) {
  enc.set_input_array(scales);
  enc.set_output_array(scales_tiled);

  // Note: scales_tiled is padded to full tiles, so its dimensions may exceed
  // those of `scales` when the row or column counts are not multiples of the
  // tile sizes.
  size_t rows_in = scales.shape(-2);
  size_t cols_in = scales.shape(-1);
  size_t rows_out = scales_tiled.shape(-2);
  size_t cols_out = scales_tiled.shape(-1);

  // Launch dimensions are derived from the padded (output) sizes.
  auto [grid_dims, block_dims] = get_swizzle_launch_args(rows_out, cols_out);
  enc.add_kernel_node(
      cu::swizzle_scales,
      grid_dims,
      block_dims,
      gpu_ptr<uint8_t>(scales),
      gpu_ptr<uint8_t>(scales_tiled),
      rows_in,
      cols_in,
      rows_out,
      cols_out);
}

// Host-side launcher for cu::compute_qqmm_pointers.
//
// Registers the two amax arrays as inputs and alpha/beta as outputs, then
// launches the kernel with a single thread.
void compute_qqmm_pointers(
    array& alpha_out,
    array& beta_out,
    const array& tensor_amax_x,
    const array& tensor_amax_w,
    cu::CommandEncoder& enc) {
  enc.set_input_array(tensor_amax_x);
  enc.set_input_array(tensor_amax_w);
  enc.set_output_array(alpha_out);
  enc.set_output_array(beta_out);

  // One block containing one thread.
  const dim3 single_block(1);
  const dim3 single_thread(1);
  enc.add_kernel_node(
      cu::compute_qqmm_pointers,
      single_block,
      single_thread,
      gpu_ptr<void>(alpha_out),
      gpu_ptr<void>(beta_out),
      gpu_ptr<void>(tensor_amax_x),
      gpu_ptr<void>(tensor_amax_w));
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/qqmm_utils.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/array.h"
#include "mlx/backend/cuda/device.h"

namespace mlx::core {

// Rounds scale dimensions up to whole tiles.
// A tile is 128 rows x 4 columns and only full tiles can be allocated, so the
// result is {rows rounded up to a multiple of 128, cols rounded up to a
// multiple of 4}.
inline std::pair<int, int> get_padded_scale_dims(int num_rows, int num_cols) {
  constexpr int rows_per_tile = 128;
  constexpr int cols_per_tile = 4;

  // Smallest multiple of `multiple` that is >= `value`.
  const auto round_up = [](int value, int multiple) {
    return ((value + multiple - 1) / multiple) * multiple;
  };

  return {
      round_up(num_rows, rows_per_tile), round_up(num_cols, cols_per_tile)};
}

// Repacks `scales` into the tiled layout and writes the result to
// `scales_tiled`, which must already be allocated with padded dimensions
// (see get_padded_scale_dims).
void swizzle_scales(
    const array& scales,
    array& scales_tiled,
    cu::CommandEncoder& enc,
    const Stream& s);

// Allocates a padded 2-D scale buffer and fills it with `scale` repacked into
// the tiled layout. The result is registered as a temporary on `encoder`.
//
// cuBLAS requirements for scale factor layout:
// 1. Dimensions must be padded to full tiles (128 rows × 4 cols)
// 2. Out-of-bounds values must be filled with zeros
// 3. Starting addresses must be 16-byte aligned
// https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
inline array pad_and_swizzle_scales(
    const array& scale,
    cu::CommandEncoder& encoder,
    const Stream& s) {
  // Padded dimensions, taken from the last two axes of `scale`.
  const auto padded = get_padded_scale_dims(scale.shape(-2), scale.shape(-1));
  const int tiled_rows = padded.first;
  const int tiled_cols = padded.second;

  // Note: cu::malloc_async already provides 256-byte alignment
  array tiled(
      cu::malloc_async(tiled_rows * tiled_cols, encoder),
      Shape{tiled_rows, tiled_cols},
      scale.dtype());
  swizzle_scales(scale, tiled, encoder, s);

  encoder.add_temporary(tiled);
  return tiled;
}

// Computes alpha = tensor_amax_x * tensor_amax_w / (448 * 6)^2 on the device
// and writes it to `alpha_out`; writes zero to `beta_out`.
// Both outputs must already be allocated by the caller.
void compute_qqmm_pointers(
    array& alpha_out,
    array& beta_out,
    const array& tensor_amax_x,
    const array& tensor_amax_w,
    cu::CommandEncoder& enc);

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/quantized.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/quantized/quantized.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/quantized/qmm/qmm.h"
#include "mlx/backend/cuda/quantized/quantized_utils.h"
#include "mlx/dtype_utils.h"
#include "mlx/fast_primitives.h"
#include "mlx/primitives.h"

#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("QuantizedMatmul::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  const array& x = inputs[0];
  const array& w = inputs[1];
  const array& scales = inputs[2];
  std::optional<array> biases;
  if (inputs.size() > 3) {
    biases = inputs[3];
  }

  auto supports = [&](auto&& f) {
    return f(
        x,
        w,
        scales,
        biases,
        out,
        transpose_,
        bits_,
        group_size_,
        mode_,
        encoder.device());
  };
  bool can_use_qmm_sm90 = supports(supports_qmm_sm90);
  bool can_use_qmm_sm80 = supports(supports_qmm_sm80);
  bool can_use_fp_qmv = supports(supports_fp_qmv);
  bool can_use_qmv = supports(supports_qmv) || can_use_fp_qmv;

  auto call_qmm_sm90 = [&]() {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
    qmm_sm90(x, w, scales, *biases, out, bits_, group_size_, encoder, s);
  };
  auto call_qmm_sm80 = [&]() {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
    qmm_sm80(x, w, scales, biases, out, bits_, group_size_, mode_, encoder);
  };
  auto call_qmv = [&]() {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
    if (can_use_fp_qmv) {
      fp_qmv(x, w, scales, out, bits_, group_size_, encoder, s);
    } else {
      qmv(x, w, scales, biases, out, bits_, group_size_, mode_, encoder);
    }
  };

  int M = out.shape(-2);
  int N = out.shape(-1);
  int K = x.shape(-1);
  int B = out.size() / (M * N);

  if (can_use_qmm_sm90) {
    if (can_use_qmv && (M == 1 && B == 1 && N <= 16384 && K <= 16384)) {
      call_qmv();
    } else {
      call_qmm_sm90();
    }
    return;
  }

  if (can_use_qmm_sm80) {
    if (can_use_qmv && (M * B < 8)) {
      call_qmv();
    } else {
      call_qmm_sm80();
    }
    return;
  }

  if (can_use_qmv) {
    call_qmv();
    return;
  }

  throw std::runtime_error(
      fmt::format(
          "[quantized_matmul] No implementation for "
          "problem shape: {}x{}x{}x{}, transpose: {}, "
          "activation: {}, bits: {}, group size: {}, mode: \"{}\".",
          M,
          N,
          K,
          B,
          transpose_,
          dtype_to_string(x.dtype()),
          bits_,
          group_size_,
          quantization_mode_to_string(mode_)));
}

void fast::Quantize::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("Quantize::eval_gpu");
  auto& s = stream();
  auto& d = cu::device(s.device);
  auto& enc = d.get_command_encoder(s);
  if (dequantize_) {
    auto wq = ensure_row_contiguous(inputs[0], enc, s);
    auto scales = ensure_row_contiguous(inputs[1], enc, s);
    auto& w = outputs[0];

    w.set_data(cu::malloc_async(w.nbytes(), enc));

    if (mode_ == QuantizationMode::Affine) {
      auto biases = ensure_row_contiguous(inputs[2], enc, s);
      affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
    } else {
      // 0 -- xq, 1 -- scales, 2 -- could be global scale for nvfp4
      bool use_global_scale =
          mode_ == QuantizationMode::Nvfp4 && inputs.size() > 2;
      std::optional<array> global_scale =
          use_global_scale ? std::make_optional(inputs[2]) : std::nullopt;
      fp_dequantize(wq, scales, w, group_size_, bits_, global_scale, enc, s);
    }
  } else {
    auto w = ensure_contiguous(inputs[0], enc, s);
    auto& wq = outputs[0];
    auto& scales = outputs[1];

    wq.set_data(cu::malloc_async(wq.nbytes(), enc));
    scales.set_data(cu::malloc_async(scales.nbytes(), enc));

    if (mode_ == QuantizationMode::Affine) {
      auto& biases = outputs[2];
      biases.set_data(cu::malloc_async(biases.nbytes(), enc));
      affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
    } else {
      bool use_global_scale =
          mode_ == QuantizationMode::Nvfp4 && inputs.size() > 1;
      std::optional<array> global_scale =
          use_global_scale ? std::make_optional(inputs[1]) : std::nullopt;
      fp_quantize(w, wq, scales, group_size_, bits_, global_scale, enc, s);
    }
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/quantized.h
================================================
// Copyright © 2025 Apple Inc.

#include <optional>
#include "mlx/backend/cuda/device.h"

namespace mlx::core {

// Affine quantization: reads `w` and writes the quantized values `wq` together
// with `scales` and `biases`. Outputs must already be allocated by the caller.
void affine_quantize(
    const array& w,
    array& wq,
    array& scales,
    array& biases,
    int group_size_,
    int bits_,
    cu::CommandEncoder& enc,
    const Stream& s);

// Affine dequantization: reconstructs `w` from `wq`, `scales` and `biases`.
// `w` must already be allocated by the caller.
void affine_dequantize(
    const array& wq,
    const array& scales,
    const array& biases,
    array& w,
    int group_size_,
    int bits_,
    cu::CommandEncoder& enc,
    const Stream& s);

// Floating-point quantization: reads `w` and writes `wq` and `scales`.
// `global_scale` is optional; the visible callers only supply it for NVFP4.
void fp_quantize(
    const array& w,
    array& wq,
    array& scales,
    int group_size,
    int bits,
    const std::optional<array>& global_scale,
    cu::CommandEncoder& enc,
    const Stream& s);

// Floating-point dequantization: reconstructs `w` from `wq` and `scales`.
// `global_scale` is optional; the visible callers only supply it for NVFP4.
void fp_dequantize(
    const array& wq,
    const array& scales,
    array& w,
    int group_size,
    int bits,
    const std::optional<array>& global_scale,
    cu::CommandEncoder& enc,
    const Stream& s);

// Quantizes and immediately dequantizes `w`, writing the result to `what`.
// The visible caller may pass the same array for `w` and `what`.
void fp_quantize_dequantize(
    const array& w,
    array& what,
    int group_size,
    int bits,
    const std::optional<array>& global_scale,
    cu::CommandEncoder& enc,
    const Stream& s);

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/quantized/quantized_utils.h
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/gpu/copy.h"

namespace mlx::core {
// Returns `x` unchanged when it is row contiguous; otherwise returns a
// contiguous copy that is registered as a temporary on `enc`.
inline array ensure_row_contiguous(
    const array& x,
    cu::CommandEncoder& enc,
    const Stream& s) {
  if (x.flags().row_contiguous) {
    return x;
  }
  array packed = contiguous_copy_gpu(x, s);
  enc.add_temporary(packed);
  return packed;
}

// Returns `x` unchanged when its trailing matrix is row contiguous, i.e. the
// last stride is 1 and the second-to-last stride equals the last dimension
// (for 1-D arrays: the only stride is 1). Leading (batch) strides are not
// inspected. Otherwise returns a contiguous copy that is registered as a
// temporary on `enc`.
inline array ensure_row_contiguous_matrix(
    const array& x,
    cu::CommandEncoder& enc,
    const Stream& s) {
  if (x.ndim() == 0) {
    // A 0-d array has no strides to inspect; indexing strides()[0] would read
    // out of bounds. A single element needs no copy.
    return x;
  }
  if (x.ndim() < 2) {
    if (x.strides()[0] == 1) {
      return x;
    }
  } else {
    auto stride_0 = x.strides()[x.ndim() - 2];
    auto stride_1 = x.strides()[x.ndim() - 1];
    if (stride_0 == x.shape(-1) && stride_1 == 1) {
      return x;
    }
  }
  array x_copy = contiguous_copy_gpu(x, s);
  enc.add_temporary(x_copy);
  return x_copy;
}

// Returns `x` unchanged when it is either row or column contiguous; otherwise
// returns a contiguous copy that is registered as a temporary on `enc`.
inline array
ensure_contiguous(const array& x, cu::CommandEncoder& enc, const Stream& s) {
  const auto& layout = x.flags();
  if (!layout.row_contiguous && !layout.col_contiguous) {
    array packed = contiguous_copy_gpu(x, s);
    enc.add_temporary(packed);
    return packed;
  }
  return x;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/random.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>

#include <cassert>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Rotation amounts used by threefry2x32_hash. Each round applies the four
// rotations of one row; the row alternates with the round parity.
__constant__ constexpr uint32_t rotations[2][4] = {
    {13, 15, 26, 6},
    {17, 29, 16, 24}};

// Result of one hash call: two 32-bit words (`val.x`, `val.y`) that can also
// be addressed as two groups of four bytes (`bytes[0]`, `bytes[1]`).
union rbits {
  uint2 val;
  uint8_t bytes[2][4];
};

// Hashes a 2x32-bit counter with a 2x32-bit key and returns two 32-bit words.
//
// Five rounds are run. Each round performs four add / rotate-left / xor
// mixing steps, using the rotation row selected by the round parity, followed
// by a key injection that also adds the 1-based round number to the second
// word.
__device__ rbits threefry2x32_hash(uint2 key, uint2 count) {
  // Key schedule: the two key words plus their xor with a fixed constant.
  const uint32_t ks[] = {key.x, key.y, key.x ^ key.y ^ 0x1BD11BDA};

  rbits state;
  state.val.x = count.x + ks[0];
  state.val.y = count.y + ks[1];

  for (int round = 0; round < 5; ++round) {
    for (int step = 0; step < 4; ++step) {
      const uint32_t r = rotations[round % 2][step];
      state.val.x += state.val.y;
      // Rotate the second word left by r bits.
      state.val.y = (state.val.y << r) | (state.val.y >> (32 - r));
      state.val.y ^= state.val.x;
    }
    state.val.x += ks[(round + 1) % 3];
    state.val.y += ks[(round + 2) % 3] + round + 1;
  }

  return state;
}

// Generates random bytes for contiguous keys.
//
// - keys: consecutive pairs of 32-bit words, one pair per key.
// - out: output bytes, `bytes_per_key` bytes per key, keys laid out back to
//   back.
// - grid_dims: logical problem size; x = number of keys, y = number of hash
//   calls per key.
// - odd: whether the number of 32-bit output words per key is odd, in which
//   case the last hash call of each key only contributes its first word.
//
// Each thread performs one hash call, producing two 32-bit words: the first
// is stored at word index `index_y`, the second at `index_y + grid_dims.y`.
__global__ void rbitsc(
    const uint32_t* keys,
    uint8_t* out,
    dim3 grid_dims,
    bool odd,
    uint32_t bytes_per_key) {
  auto grid = cg::this_grid();
  uint32_t thread_index = grid.thread_rank();
  uint32_t index_x = thread_index % grid_dims.x;
  uint32_t index_y = thread_index / grid_dims.x;
  if (index_x >= grid_dims.x || index_y >= grid_dims.y) {
    return;
  }

  // Widen before multiplying so the key index cannot wrap in 32 bits.
  size_t kidx = 2 * size_t(index_x);
  auto key = uint2{keys[kidx], keys[kidx + 1]};
  auto half_size = grid_dims.y - odd;
  // Widen before multiplying: index_x * bytes_per_key can exceed 32 bits for
  // large outputs (the strided `rbits` kernel does the same).
  out += size_t(index_x) * bytes_per_key;
  bool drop_last = odd && (index_y == half_size);
  auto bits = threefry2x32_hash(
      key, uint2{index_y, drop_last ? 0 : index_y + grid_dims.y});

  // First word of the hash.
  size_t idx = size_t(index_y) << 2;
  for (int i = 0; i < 4; ++i) {
    out[idx + i] = bits.bytes[0][i];
  }

  // Second word of the hash, skipped when this thread only owns one word.
  if (!drop_last) {
    idx = (size_t(index_y) + grid_dims.y) << 2;
    if ((index_y + 1) == half_size && (bytes_per_key % 4) > 0) {
      // The final word of a key may be partial: write only the bytes that
      // belong to the output.
      int edge_bytes = (bytes_per_key % 4);
      for (int i = 0; i < edge_bytes; ++i) {
        out[idx + i] = bits.bytes[1][i];
      }
    } else {
      for (int i = 0; i < 4; ++i) {
        out[idx + i] = bits.bytes[1][i];
      }
    }
  }
}

// Generates random bytes for keys with arbitrary strides.
//
// Same contract as `rbitsc`, except that the two words of each key are
// located through `elem_to_loc` using `key_shape` / `key_strides` / `ndim`.
// Each thread performs one hash call and stores its first word at word index
// `word` and its second word at `word + grid_dims.y` within the key's output.
__global__ void rbits(
    const uint32_t* keys,
    uint8_t* out,
    dim3 grid_dims,
    bool odd,
    uint32_t bytes_per_key,
    int32_t ndim,
    const __grid_constant__ Shape key_shape,
    const __grid_constant__ Strides key_strides) {
  const uint32_t flat_index = cg::this_grid().thread_rank();
  const uint32_t key_index = flat_index % grid_dims.x;
  const uint32_t word = flat_index / grid_dims.x;
  if (key_index >= grid_dims.x || word >= grid_dims.y) {
    return;
  }

  // Locate and load the two words of this thread's key.
  const uint32_t first_elem = 2 * key_index;
  const auto loc0 =
      elem_to_loc(first_elem, key_shape.data(), key_strides.data(), ndim);
  const auto loc1 =
      elem_to_loc(first_elem + 1, key_shape.data(), key_strides.data(), ndim);
  const auto key = uint2{keys[loc0], keys[loc1]};

  const auto half_size = grid_dims.y - odd;
  // With an odd word count, the last hash call of a key keeps only one word.
  const bool drop_last = odd && (word == half_size);
  const uint32_t second_count = drop_last ? 0 : word + grid_dims.y;
  const auto bits = threefry2x32_hash(key, uint2{word, second_count});

  uint8_t* key_out = out + size_t(key_index) * bytes_per_key;

  // First word of the hash.
  const size_t first_offset = size_t(word) << 2;
  for (int i = 0; i < 4; ++i) {
    key_out[first_offset + i] = bits.bytes[0][i];
  }
  if (drop_last) {
    return;
  }

  // Second word of the hash; the final word of a key may be partial, in
  // which case only the remaining bytes are written.
  const size_t second_offset = (size_t(word) + grid_dims.y) << 2;
  const bool partial_word =
      (word + 1) == half_size && (bytes_per_key % 4) > 0;
  const int bytes_to_write =
      partial_word ? static_cast<int>(bytes_per_key % 4) : 4;
  for (int i = 0; i < bytes_to_write; ++i) {
    key_out[second_offset + i] = bits.bytes[1][i];
  }
}

} // namespace cu

// Fills `out` with random bytes derived from the keys in inputs[0].
//
// keys has shape (N1, ..., NK, 2)
// out has shape (N1, ..., NK, M1, M2, ...)
//
// Each hash call yields two 32-bit words, so a key needs
// ceil(words_per_key / 2) calls. Throws std::runtime_error when the number of
// keys or of calls per key does not fit in 32 bits.
void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("RandomBits::eval_gpu");
  assert(inputs.size() == 1);

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  // Return before any per-key arithmetic: with an empty key batch num_keys is
  // zero and the division below would be a division by zero.
  if (out.size() == 0) {
    return;
  }

  auto& keys = inputs[0];
  size_t num_keys = keys.size() / 2;

  size_t elems_per_key = out.size() / num_keys;
  size_t bytes_per_key = out.itemsize() * elems_per_key;

  // Number of 32-bit words per key (rounded up), and of hash calls that
  // produce two full words.
  size_t out_per_key = (bytes_per_key + 4 - 1) / 4;
  size_t half_size = out_per_key / 2;

  // An odd word count needs one extra call that contributes a single word.
  bool odd = out_per_key % 2;
  if ((half_size + odd) >= UINT32_MAX || num_keys >= UINT32_MAX) {
    throw std::runtime_error("[RandomBits::eval_gpu] Large size unsupported");
  }

  encoder.set_input_array(keys);
  encoder.set_output_array(out);

  // Split the total thread count into two launch dimensions so that the
  // first stays below UINT_MAX.
  int64_t total = num_keys * (half_size + odd);
  uint32_t threads_y = 1;
  while ((total / threads_y) >= UINT_MAX) {
    threads_y *= 2;
  }
  uint32_t threads_x = cuda::ceil_div(total, threads_y);

  // Logical problem size seen by the kernels: keys x hash calls per key.
  dim3 grid_dims{
      static_cast<uint32_t>(num_keys), static_cast<uint32_t>(half_size + odd)};
  auto [grid, block] = get_grid_and_block(threads_x, threads_y, 1);
  if (keys.flags().row_contiguous) {
    // Contiguous keys can be indexed directly.
    encoder.add_kernel_node(
        cu::rbitsc,
        grid,
        block,
        gpu_ptr<uint32_t>(keys),
        gpu_ptr<uint8_t>(out),
        grid_dims,
        odd,
        bytes_per_key);
  } else {
    // Strided keys need shape/stride information to locate each key word.
    encoder.add_kernel_node(
        cu::rbits,
        grid,
        block,
        gpu_ptr<uint32_t>(keys),
        gpu_ptr<uint8_t>(out),
        grid_dims,
        odd,
        bytes_per_key,
        keys.ndim(),
        const_param(keys.shape()),
        const_param(keys.strides()));
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/reduce/all_reduce.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/reduce/reduce.cuh"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cub/block/block_load.cuh>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Reduces a contiguous input to one partial result per block.
//
// Block `b` reduces the elements in [b * block_step, min((b + 1) * block_step,
// size)) with `ReduceOp` and writes its result to out[b]. Full chunks of
// block.size() * N elements use vectorized loads; the remainder is loaded with
// bounds checks and padded with the reduction's init value.
template <typename T, typename U, typename ReduceOp, int N = 4>
__global__ void all_reduce(T* in, U* out, size_t block_step, size_t size) {
  // TODO: Process multiple "rows" in each thread
  constexpr int M = 1;

  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  const U init = cu::ReduceInit<ReduceOp, T>::value();
  ReduceOp op;

  T vals[N];
  U accs[M];
  accs[0] = init;

  // Range of the input owned by this block, clamped to the input size.
  const size_t first = grid.block_rank() * block_step;
  const size_t last = first + block_step;
  const size_t limit = min(last, size);

  // Full chunks: every thread loads N consecutive values.
  size_t pos = first;
  while (pos + block.size() * N <= limit) {
    cub::LoadDirectBlockedVectorized<T, N>(block.thread_rank(), in + pos, vals);
    for (int k = 0; k < N; k++) {
      accs[0] = op(accs[0], cast_to<U>(vals[k]));
    }
    pos += block.size() * N;
  }

  // Remainder: guarded load, out-of-range slots take the init value.
  if (pos < limit) {
    cub::LoadDirectBlocked(
        block.thread_rank(), in + pos, vals, limit - pos, cast_to<T>(init));
    for (int k = 0; k < N; k++) {
      accs[0] = op(accs[0], cast_to<U>(vals[k]));
    }
  }

  // Combine the per-thread accumulators across the block.
  __shared__ U shared_accumulators[32];
  block_reduce(block, warp, accs, shared_accumulators, op, init);

  if (block.thread_rank() == 0) {
    out[grid.block_rank()] = accs[0];
  }
}

} // namespace cu

// Reduces every element of `in` with `reduce_type` and stores the result in
// `out`.
//
// When the launch heuristic asks for a single block, one kernel launch writes
// the result directly. Otherwise a first launch writes one partial result per
// block into a temporary array and a second, single-block launch reduces those
// partials into `out`.
//
// NOTE(review): the launch heuristic divides by a quantity derived from
// in.size(), so this assumes a non-empty input; presumably the caller routes
// empty inputs elsewhere (e.g. init_reduce) -- confirm.
void all_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type) {
  constexpr int N_READS = 8;

  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  // Picks (blocks, threads, block_step) for reducing `size` elements when each
  // thread reads N elements per step. `size` is a size_t: the element count of
  // an array does not have to fit in an int, and truncating it would produce a
  // launch configuration that covers only part of the input.
  auto get_args = [](size_t size, int N) {
    int threads = static_cast<int>(std::min<size_t>(512, (size + N - 1) / N));
    // Round up to a whole number of warps.
    threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
    int reductions_per_step = threads * N;
    size_t steps_needed =
        (size + reductions_per_step - 1) / reductions_per_step;

    int blocks;
    if (steps_needed < 32) {
      blocks = 1;
    } else if (steps_needed < 128) {
      blocks = 32;
    } else if (steps_needed < 512) {
      blocks = 128;
    } else if (steps_needed < 1024) {
      blocks = 512;
    } else {
      blocks = 1024;
    }

    // block_step is a multiple of threads * N so every block starts on a full
    // tile boundary.
    size_t steps_per_block = (steps_needed + blocks - 1) / blocks;
    size_t block_step = steps_per_block * reductions_per_step;

    return std::make_tuple(blocks, threads, block_step);
  };

  int blocks, threads;
  size_t block_step;
  size_t insize = in.size();
  Dtype dt = in.dtype();

  // Cub doesn't like const pointers for load (sigh).
  void* indata = const_cast<void*>(gpu_ptr<void>(in));

  // Large array so allocate an intermediate and accumulate there
  std::tie(blocks, threads, block_step) = get_args(insize, N_READS);
  encoder.set_input_array(in);
  if (blocks > 1) {
    // One slot per block of the first pass, in the output dtype.
    array intermediate({blocks}, out.dtype(), nullptr, {});
    intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder));
    encoder.add_temporary(intermediate);
    encoder.set_output_array(intermediate);
    dispatch_all_types(dt, [&](auto type_tag) {
      dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
        using OP = MLX_GET_TYPE(reduce_type_tag);
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using U = typename cu::ReduceResult<OP, T>::type;
        auto kernel = cu::all_reduce<T, U, OP, N_READS>;
        encoder.add_kernel_node(
            kernel,
            blocks,
            threads,
            static_cast<T*>(indata),
            gpu_ptr<U>(intermediate),
            block_step,
            insize);
      });
    });

    // Set the input for the next step and recalculate the blocks
    indata = gpu_ptr<void>(intermediate);
    dt = intermediate.dtype();
    insize = intermediate.size();
    std::tie(blocks, threads, block_step) = get_args(insize, N_READS);
    encoder.set_input_array(intermediate);
  }

  // Final (or only) pass: reduce the current input into `out`.
  encoder.set_output_array(out);
  dispatch_all_types(dt, [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      using OP = MLX_GET_TYPE(reduce_type_tag);
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using U = typename cu::ReduceResult<OP, T>::type;
      auto kernel = cu::all_reduce<T, U, OP, N_READS>;
      encoder.add_kernel_node(
          kernel,
          blocks,
          threads,
          static_cast<T*>(indata),
          gpu_ptr<U>(out),
          block_step,
          insize);
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/reduce/col_reduce.cu
================================================
// Copyright © 2025 Apple Inc.

#include <numeric>

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/reduce/reduce.cuh"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cub/block/block_load.cuh>
#include <cub/cub.cuh>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Argument pack describing a column (strided) reduction. It is built on the
// host from the input array and its ReductionPlan and handed to the
// col_reduce_* kernels.
struct ColReduceArgs {
  // The size of the contiguous column reduction (plan.shape.back()).
  size_t reduction_size;
  // Stride of that reduction axis (plan.strides.back()).
  int64_t reduction_stride;

  // Input shape and strides excluding the reduction axes.
  Shape shape;
  Strides strides;
  int ndim;

  // Input shape and strides of the reduction axes (including last dimension).
  Shape reduce_shape;
  Strides reduce_strides;
  int reduce_ndim;

  // The number of columns we are reducing: the product of every entry of
  // reduce_shape except the last one.
  size_t non_col_reductions;

  ColReduceArgs(
      const array& in,
      const ReductionPlan& plan,
      const std::vector<int>& axes) {
    using ShapeVector = decltype(plan.shape);
    using StridesVector = decltype(plan.strides);

    ShapeVector shape_vec;
    StridesVector strides_vec;

    assert(!plan.shape.empty());
    reduction_size = plan.shape.back();
    reduction_stride = plan.strides.back();

    // Start from the non-reduced dims and drop trailing ones until the product
    // of the dropped extents reaches reduction_stride.
    int64_t stride_back = 1;
    std::tie(shape_vec, strides_vec) = shapes_without_reduction_axes(in, axes);
    while (!shape_vec.empty() && stride_back < reduction_stride) {
      stride_back *= shape_vec.back();
      shape_vec.pop_back();
      strides_vec.pop_back();
    }
    // Order the remaining dims by decreasing stride before collapsing them.
    std::vector<int> indices(shape_vec.size());
    std::iota(indices.begin(), indices.end(), 0);
    std::sort(indices.begin(), indices.end(), [&](int left, int right) {
      return strides_vec[left] > strides_vec[right];
    });
    ShapeVector sorted_shape;
    StridesVector sorted_strides;
    for (auto idx : indices) {
      sorted_shape.push_back(shape_vec[idx]);
      sorted_strides.push_back(strides_vec[idx]);
    }
    std::tie(shape_vec, strides_vec) =
        collapse_contiguous_dims(sorted_shape, sorted_strides);
    shape = const_param(shape_vec);
    strides = const_param(strides_vec);
    ndim = shape_vec.size();

    reduce_shape = const_param(plan.shape);
    reduce_strides = const_param(plan.strides);
    reduce_ndim = plan.shape.size();

    // Multiply all reduction extents but the last (that one is
    // reduction_size).
    non_col_reductions = 1;
    for (int i = 0; i < reduce_ndim - 1; i++) {
      non_col_reductions *= reduce_shape[i];
    }
  }
};

// General strided (column) reduction kernel.
//
// Each thread block produces up to BN adjacent outputs. The block's threads
// are arranged as rows of BN / N_READS threads; every thread accumulates
// N_READS adjacent columns while striding over the reduced elements in steps
// of BM. The per-thread totals are then combined through shared memory.
//
// When BLOCKS > 1 the reduced range is split into BLOCKS chunks; the chunk a
// block works on is derived from its rank and `out_size`, and its partial
// results go to a separate slab of `out`.
template <
    typename T,
    typename U,
    typename Op,
    int NDIM,
    int BM,
    int BN,
    int N_READS = 4,
    int BLOCKS = 1>
__global__ void col_reduce_looped(
    T* in,
    U* out,
    const __grid_constant__ ColReduceArgs args,
    int64_t out_size) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  constexpr int threads_per_row = BN / N_READS;

  // Compute the indices for the tile
  // tile_x: which BN-wide slice of the reduction stride; tile_y: which output
  // row; tile_out: which chunk of the reduced range (only used if BLOCKS > 1).
  size_t tile_idx = grid.block_rank();
  size_t tile_x = tile_idx % ((args.reduction_stride + BN - 1) / BN);
  size_t tile_y = tile_idx / ((args.reduction_stride + BN - 1) / BN);
  size_t tile_out = tile_y / out_size;
  tile_y = tile_y % out_size;

  // Compute the indices for the thread within the tile
  short thread_x = block.thread_rank() % threads_per_row;
  short thread_y = block.thread_rank() / threads_per_row;

  // Move the input pointer
  in += elem_to_loc(tile_y, args.shape.data(), args.strides.data(), args.ndim) +
      tile_x * BN;

  // Initialize the running totals
  Op op;
  U totals[N_READS];
  for (int i = 0; i < N_READS; i++) {
    totals[i] = ReduceInit<Op, T>::value();
  }

  // [start, end) is the range of reduced rows this thread walks, in steps of
  // BM. With BLOCKS > 1 it is restricted to the chunk selected by tile_out.
  size_t total = args.non_col_reductions * args.reduction_size;
  size_t per_block, start, end;
  if constexpr (BLOCKS > 1) {
    per_block = (total + BLOCKS - 1) / BLOCKS;
    start = tile_out * per_block + thread_y;
    end = min((tile_out + 1) * per_block, total);
  } else {
    per_block = total;
    start = thread_y;
    end = total;
  }

  // NOTE(review): LoopedElemToLoc is defined elsewhere; presumably next()
  // advances a multi-index over the reduction axes and location() yields the
  // matching element offset -- confirm.
  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
  loop.next(start, args.reduce_shape.data(), args.reduce_strides.data());
  if (tile_x * BN + BN <= args.reduction_stride) {
    // The whole BN-wide tile is inside the row: unguarded loads.
    if (args.reduction_stride % N_READS == 0) {
      // Stride is a multiple of N_READS: use the vectorized load.
      for (size_t r = start; r < end; r += BM) {
        T vals[N_READS];
        cub::LoadDirectBlockedVectorized(thread_x, in + loop.location(), vals);
        for (int i = 0; i < N_READS; i++) {
          totals[i] = op(totals[i], cast_to<U>(vals[i]));
        }
        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
      }
    } else {
      for (size_t r = start; r < end; r += BM) {
        T vals[N_READS];
        cub::LoadDirectBlocked(thread_x, in + loop.location(), vals);
        for (int i = 0; i < N_READS; i++) {
          totals[i] = op(totals[i], cast_to<U>(vals[i]));
        }
        loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
      }
    }
  } else {
    // Partial tile at the end of the row: guarded load, columns past the end
    // are filled with the init value.
    for (size_t r = start; r < end; r += BM) {
      T vals[N_READS];
      cub::LoadDirectBlocked(
          thread_x,
          in + loop.location(),
          vals,
          args.reduction_stride - tile_x * BN,
          cast_to<T>(ReduceInit<Op, T>::value()));
      for (int i = 0; i < N_READS; i++) {
        totals[i] = op(totals[i], cast_to<U>(vals[i]));
      }
      loop.next(BM, args.reduce_shape.data(), args.reduce_strides.data());
    }
  }

  // Do warp reduce for each output.
  // Totals are written to shared memory as a BM x BN tile (row = thread_y).
  // Each warp lane then reads row `lane` of the columns owned by its warp, so
  // a warp-wide reduce combines the BM partial rows of one column.
  constexpr int n_outputs = BN / threads_per_row;
  static_assert(BM == 32 && n_outputs == N_READS);
  __shared__ U shared_vals[BM * BN];
  short s_idx = thread_y * BN + thread_x * N_READS;
  for (int i = 0; i < N_READS; i++) {
    shared_vals[s_idx + i] = totals[i];
  }
  block.sync();
  s_idx = warp.thread_rank() * BN + warp.meta_group_rank() * n_outputs;
  for (int i = 0; i < n_outputs; i++) {
    totals[i] = cg::reduce(warp, shared_vals[s_idx + i], op);
  }

  // Write result.
  // Lane 0 of every warp stores that warp's n_outputs columns; the store is
  // guarded by the number of columns left in the row.
  if (warp.thread_rank() == 0) {
    if (BLOCKS > 1) {
      out += tile_out * out_size * args.reduction_stride;
    }
    cub::StoreDirectBlocked(
        warp.meta_group_rank(),
        out + tile_y * args.reduction_stride + tile_x * BN,
        totals,
        args.reduction_stride - tile_x * BN);
  }
}

// Column reduction for short columns. Every thread owns N_READS adjacent
// outputs and walks the whole reduced column by itself, so no communication
// between threads is needed. `total` is the number of output elements.
template <typename T, typename U, typename Op, int N_READS = 4>
__global__ void col_reduce_small(
    const T* in,
    U* out,
    const __grid_constant__ ColReduceArgs args,
    size_t total) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  // Index of the first output handled by this thread.
  const auto idx = grid.thread_rank() * N_READS;
  if (idx >= total) {
    return;
  }

  // Split the output index around the reduced axis and rebuild the matching
  // input offset (the reduced axis contributes reduction_size rows).
  const auto outer = idx / args.reduction_stride;
  const auto inner = idx % args.reduction_stride;
  in += outer * args.reduction_stride * args.reduction_size + inner;
  out += idx;

  Op op;
  AlignedVector<U, N_READS> acc;
  for (int k = 0; k < N_READS; k++) {
    acc[k] = ReduceInit<Op, T>::value();
  }

  // Fold one row of N_READS values per iteration, then step to the next row.
  for (int row = 0; row < args.reduction_size;
       row++, in += args.reduction_stride) {
    auto values = load_vector<N_READS>(in, 0);
    for (int k = 0; k < N_READS; k++) {
      acc[k] = op(acc[k], cast_to<U>(values[k]));
    }
  }

  store_vector(out, 0, acc);
}

} // namespace cu

// Computes the launch grid for col_reduce_looped: one block per `bn`-wide
// tile of every output row, times `outer` chunks of the reduced range. The
// block count is spread over grid.y when it does not fit in grid.x.
inline auto output_grid_for_col_reduce(
    const array& out,
    const cu::ColReduceArgs& args,
    int bn,
    int outer = 1) {
  size_t tiles_per_row = cuda::ceil_div(args.reduction_stride, bn);
  size_t rows = out.size() / args.reduction_stride;
  size_t n_blocks = rows * tiles_per_row * outer;

  // Double grid.y until the x extent fits in a 32-bit signed integer.
  int gy = 1;
  for (; n_blocks / gy > INT32_MAX; gy *= 2) {
  }
  int gx = cuda::ceil_div(n_blocks, gy);

  return dim3(gx, gy, 1);
}

// Host launcher for the single-pass cu::col_reduce_looped kernel.
void col_reduce_looped(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan,
    const cu::ColReduceArgs& args) {
  // Give the output the same layout as the input so that it is accessed as
  // contiguously as possible.
  allocate_same_layout(out, in, axes, encoder);

  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using OP = MLX_GET_TYPE(reduce_type_tag);
        using U = typename cu::ReduceResult<OP, T>::type;

        // Tile geometry expected by the kernel.
        constexpr int BM = 32;
        constexpr int BN = 32;
        constexpr int N_READS = 4;

        // The vectorized cub loads want a non-const pointer.
        T* src = const_cast<T*>(gpu_ptr<T>(in));

        // Each thread covers N_READS columns of a BM x BN tile.
        int threads = BM * BN / N_READS;
        dim3 grid = output_grid_for_col_reduce(out, args, BN);

        auto kernel =
            cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
        encoder.add_kernel_node(
            kernel,
            grid,
            threads,
            src,
            gpu_ptr<U>(out),
            static_cast<cu::ColReduceArgs>(args),
            out.size() / args.reduction_stride);
      });
    });
  });
}

// Host launcher for cu::col_reduce_small, where every thread reduces a whole
// (short) column for 16 bytes worth of adjacent outputs.
void col_reduce_small(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan,
    const cu::ColReduceArgs& args) {
  // Give the output the same layout as the input so that it is accessed as
  // contiguously as possible.
  allocate_same_layout(out, in, axes, encoder);

  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using OP = MLX_GET_TYPE(reduce_type_tag);
      using U = typename cu::ReduceResult<OP, T>::type;

      // Number of input elements that fit in a 16-byte load.
      constexpr int N_READS = 16 / sizeof(T);

      // Derive the launch dimensions from the output's shape and strides.
      auto out_dims = get_2d_grid_dims(out.shape(), out.strides());
      auto [grid, block] = get_grid_and_block(out_dims.x, out_dims.y, 1);

      auto kernel = cu::col_reduce_small<T, U, OP, N_READS>;
      encoder.add_kernel_node(
          kernel,
          grid,
          block,
          gpu_ptr<T>(in),
          gpu_ptr<U>(out),
          static_cast<cu::ColReduceArgs>(args),
          out.size());
    });
  });
}

// Column reduction in two kernel launches, used for long columns.
//
// Pass 1 splits the reduced range into `outer` chunks and writes one partial
// result per chunk into a temporary array shaped {outer, out.shape()...}.
// Pass 2 runs the same kernel over that temporary, reducing its leading
// `outer` axis into `out`.
void col_reduce_two_pass(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan,
    const cu::ColReduceArgs& args) {
  // Allocate data for the output using in's layout to access them as
  // contiguously as possible.
  allocate_same_layout(out, in, axes, encoder);

  // Allocate an intermediate array to hold the 1st pass result
  constexpr int outer = 32;

  // The intermediate prepends an axis of extent `outer` to out's shape ...
  Shape intermediate_shape;
  intermediate_shape.push_back(outer);
  intermediate_shape.insert(
      intermediate_shape.end(), out.shape().begin(), out.shape().end());

  // ... and reuses out's strides, with a leading stride of out.size() so each
  // chunk's partial results form one slab laid out like `out`.
  Strides intermediate_strides;
  intermediate_strides.push_back(out.size());
  intermediate_strides.insert(
      intermediate_strides.end(), out.strides().begin(), out.strides().end());

  array intermediate(intermediate_shape, out.dtype(), nullptr, {});
  auto [data_size, rc, cc] =
      check_contiguity(intermediate_shape, intermediate_strides);
  auto fl = out.flags();
  fl.row_contiguous = rc;
  fl.col_contiguous = cc;
  fl.contiguous = true;
  intermediate.set_data(
      cu::malloc_async(intermediate.nbytes(), encoder),
      data_size,
      intermediate_strides,
      fl,
      allocator::free);

  // Pass 1: in -> intermediate, with the kernel's BLOCKS parameter = outer.
  encoder.add_temporary(intermediate);
  encoder.set_input_array(in);
  encoder.set_output_array(intermediate);
  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
        using OP = MLX_GET_TYPE(reduce_type_tag);
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using U = typename cu::ReduceResult<OP, T>::type;
        // Cub doesn't like const pointers for vectorized loads. (sigh)
        T* indata = const_cast<T*>(gpu_ptr<T>(in));

        constexpr int N_READS = 4;
        constexpr int BM = 32;
        constexpr int BN = 32;
        dim3 grid = output_grid_for_col_reduce(out, args, BN, outer);
        int blocks = BM * BN / N_READS;
        auto kernel = cu::
            col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS, outer>;
        encoder.add_kernel_node(
            kernel,
            grid,
            blocks,
            indata,
            gpu_ptr<U>(intermediate),
            static_cast<cu::ColReduceArgs>(args),
            out.size() / args.reduction_stride);
      });
    });
  });

  // Prepare the reduction arguments for the 2nd pass
  // The intermediate is treated as `outer` rows of out.size() elements: a
  // single reduction axis of extent `outer` and stride out.size(), with no
  // non-reduced dims left to index (ndim = 0).
  cu::ColReduceArgs second_args = args;
  second_args.reduction_size = outer;
  second_args.reduction_stride = out.size();
  second_args.ndim = 0;
  second_args.reduce_shape[0] = outer;
  second_args.reduce_strides[0] = out.size();
  second_args.reduce_ndim = 1;
  second_args.non_col_reductions = 1;

  // Pass 2: intermediate -> out with the single-pass kernel (BLOCKS = 1).
  encoder.set_input_array(intermediate);
  encoder.set_output_array(out);
  dispatch_all_types(intermediate.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      dispatch_reduce_ndim(second_args.reduce_ndim, [&](auto reduce_ndim) {
        using OP = MLX_GET_TYPE(reduce_type_tag);
        using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        using U = typename cu::ReduceResult<OP, T>::type;

        constexpr int N_READS = 4;
        constexpr int BM = 32;
        constexpr int BN = 32;
        dim3 grid = output_grid_for_col_reduce(out, second_args, BN);
        int blocks = BM * BN / N_READS;
        auto kernel =
            cu::col_reduce_looped<T, U, OP, reduce_ndim(), BM, BN, N_READS>;
        encoder.add_kernel_node(
            kernel,
            grid,
            blocks,
            gpu_ptr<T>(intermediate),
            gpu_ptr<U>(out),
            second_args,
            second_args.reduction_stride);
      });
    });
  });
}

// Entry point for column (strided) reductions: builds the argument pack and
// routes to one of the three implementations below.
void col_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan) {
  // Available strategies
  //
  // - col_reduce_small
  //
  //   For short columns. Each thread loops over its whole column and never
  //   communicates with another thread.
  //
  // - col_reduce_two_pass
  //
  //   For long columns. To expose more parallelism the reduction is split in
  //   two: first many threadblocks reduce different parts of the reduced
  //   axis, then a final column reduce combines their partial results.
  //
  // - col_reduce_looped
  //
  //   The general strided reduce and the fallback. Each threadblock computes
  //   the output for a subrow of the fast moving axis, e.g. 32 elements.
  //
  // Notes: As in row reduce we read as much in order as possible and leave
  //        transpositions as they are (contrary to our Metal backend).

  // The args struct carries everything needed to pick a kernel.
  cu::ColReduceArgs args(in, plan, axes);

  // Single (or contiguous) reduction axis, short column, and a stride that is
  // a multiple of the number of elements in a 16-byte load.
  const bool use_small = args.non_col_reductions == 1 &&
      args.reduction_size <= 32 &&
      args.reduction_stride % (16 / in.itemsize()) == 0;
  if (use_small) {
    col_reduce_small(encoder, in, out, reduce_type, axes, plan, args);
    return;
  }

  // Many reduced elements per output element: split the work in two passes,
  // otherwise fall back to the general kernel.
  size_t column_length = args.non_col_reductions * args.reduction_size;
  size_t n_outputs = out.size();
  if (column_length / n_outputs > 32) {
    col_reduce_two_pass(encoder, in, out, reduce_type, axes, plan, args);
  } else {
    col_reduce_looped(encoder, in, out, reduce_type, axes, plan, args);
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/reduce/init_reduce.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/reduce/reduce.cuh"

#include <cooperative_groups.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Fills out[0, size) with the init value of Op for input type T, one element
// per thread.
template <typename T, typename U, typename Op>
__global__ void init_reduce(U* out, size_t size) {
  const auto tid = cg::this_grid().thread_rank();
  if (tid >= size) {
    return;
  }
  out[tid] = ReduceInit<Op, T>::value();
}

} // namespace cu

// Writes the init value of `reduce_type` to every element of `out`. `in` is
// only consulted for its dtype.
void init_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type) {
  // The output may already own a buffer; allocate only when it does not.
  if (!out.data_shared_ptr()) {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
  }

  encoder.set_output_array(out);
  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using OP = MLX_GET_TYPE(reduce_type_tag);
      using U = typename cu::ReduceResult<OP, T>::type;

      // Move up to 1024 of the x extent into the block and shrink grid.x by
      // the same factor (rounded up).
      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
      dim3 block(grid.x < 1024 ? grid.x : 1024, 1, 1);
      grid.x = (grid.x + 1023) / 1024;

      auto kernel = cu::init_reduce<T, U, OP>;
      encoder.add_kernel_node(kernel, grid, block, gpu_ptr<U>(out), out.size());
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/reduce/reduce.cuh
================================================
// Copyright © 2025 Apple Inc.

#include <type_traits>

#include "mlx/backend/common/reduce.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/reduce/reduce_ops.cuh"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Turns the runtime number of reduction dims into a compile-time constant and
// calls `f` with it: 1 and 2 map to themselves, anything else maps to 5.
template <typename F>
void dispatch_reduce_ndim(int ndim, F&& f) {
  switch (ndim) {
    case 1:
      f(std::integral_constant<int, 1>{});
      break;
    case 2:
      f(std::integral_constant<int, 2>{});
      break;
    default:
      f(std::integral_constant<int, 5>{});
      break;
  }
}

// Maps the runtime reduce type to the matching cu:: op type and calls `f`
// with a type_identity tag for it. Throws std::invalid_argument for a value
// that matches none of the known reduce types.
template <typename F>
void dispatch_reduce_ops(Reduce::ReduceType reduce_type, F&& f) {
  if (reduce_type == Reduce::ReduceType::And) {
    f(type_identity<cu::And>{});
    return;
  }
  if (reduce_type == Reduce::ReduceType::Or) {
    f(type_identity<cu::Or>{});
    return;
  }
  if (reduce_type == Reduce::ReduceType::Sum) {
    f(type_identity<cu::Sum>{});
    return;
  }
  if (reduce_type == Reduce::ReduceType::Prod) {
    f(type_identity<cu::Prod>{});
    return;
  }
  if (reduce_type == Reduce::ReduceType::Max) {
    f(type_identity<cu::Max>{});
    return;
  }
  if (reduce_type == Reduce::ReduceType::Min) {
    f(type_identity<cu::Min>{});
    return;
  }
  throw std::invalid_argument("Unknown reduce type.");
}

// Reduce every element of `in` into `out` (defined in all_reduce.cu).
void all_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type);

// Reduce along `axes` following `plan`.
// NOTE(review): presumably the variant for reductions that end on the
// contiguous last dimension (see row_reduce.cu) -- confirm.
void row_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan);

// Strided (column) reduction along `axes` following `plan`; picks between the
// small, two-pass and looped kernels (defined in col_reduce.cu).
void col_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan);

// Fill `out` with the init value of `reduce_type`; `in` only provides the
// dtype (defined in init_reduce.cu).
void init_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type);

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/reduce/reduce_ops.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/atomic_ops.cuh"
#include "mlx/backend/cuda/device/cast_op.cuh"
#include "mlx/backend/cuda/device/utils.cuh"
#include "mlx/backend/cuda/reduce/reduce_utils.cuh"

namespace mlx::core::cu {

// Reduce ops.
// Logical AND reduction over bools.
struct And {
  // True only when both operands are true.
  __device__ __forceinline__ bool operator()(bool lhs, bool rhs) {
    return lhs ? rhs : false;
  }

  // Atomically folds `value` into `*target` through atomic_reduce.
  __device__ void atomic_update(bool* target, bool value) {
    atomic_reduce<bool, And>(target, value);
  }
};

// Logical OR reduction over bools.
struct Or {
  // True when at least one operand is true.
  __device__ __forceinline__ bool operator()(bool lhs, bool rhs) {
    return lhs ? true : rhs;
  }

  // Atomically folds `value` into `*target` through atomic_reduce.
  __device__ void atomic_update(bool* target, bool value) {
    atomic_reduce<bool, Or>(target, value);
  }
};

// Sum reduction.
struct Sum {
  template <typename T>
  __device__ __forceinline__ T operator()(T lhs, T rhs) {
    return lhs + rhs;
  }

  // int, float and bfloat16 are forwarded to atomic_add.
  __device__ void atomic_update(int* target, int value) {
    atomic_add(target, value);
  }

  __device__ void atomic_update(float* target, float value) {
    atomic_add(target, value);
  }

  __device__ void atomic_update(__nv_bfloat16* target, __nv_bfloat16 value) {
    atomic_add(target, value);
  }

  // Every other type goes through the generic atomic_reduce.
  template <typename T>
  __device__ void atomic_update(T* target, T value) {
    atomic_reduce<T, Sum>(target, value);
  }
};

// Product reduction.
struct Prod {
  template <typename T>
  __device__ __forceinline__ T operator()(T lhs, T rhs) {
    return lhs * rhs;
  }

  // Atomically folds `value` into `*target` through atomic_reduce.
  template <typename T>
  __device__ void atomic_update(T* target, T value) {
    atomic_reduce<T, Prod>(target, value);
  }
};

// Minimum reduction that propagates NaNs.
struct Min {
  template <typename T>
  __device__ __forceinline__ T operator()(T a, T b) {
    if constexpr (is_complex_v<T>) {
      // An operand with a NaN component is returned as is; `a` is checked
      // before `b`.
      const bool a_has_nan =
          cuda::std::isnan(a.real()) || cuda::std::isnan(a.imag());
      if (a_has_nan) {
        return a;
      }
      const bool b_has_nan =
          cuda::std::isnan(b.real()) || cuda::std::isnan(b.imag());
      if (b_has_nan) {
        return b;
      }
    } else if constexpr (!cuda::std::is_integral_v<T>) {
      // Non-integral types: any NaN operand yields a quiet NaN.
      const bool any_nan = cuda::std::isnan(a) || cuda::std::isnan(b);
      if (any_nan) {
        return cuda::std::numeric_limits<float>::quiet_NaN();
      }
    }
    if (a < b) {
      return a;
    }
    return b;
  }

  // Atomically folds `value` into `*target` through atomic_reduce.
  template <typename T>
  __device__ void atomic_update(T* target, T value) {
    atomic_reduce<T, Min>(target, value);
  }
};

// Maximum reduction that propagates NaNs.
struct Max {
  template <typename T>
  __device__ __forceinline__ T operator()(T a, T b) {
    if constexpr (is_complex_v<T>) {
      // An operand with a NaN component is returned as is; `a` is checked
      // before `b`.
      const bool a_has_nan =
          cuda::std::isnan(a.real()) || cuda::std::isnan(a.imag());
      if (a_has_nan) {
        return a;
      }
      const bool b_has_nan =
          cuda::std::isnan(b.real()) || cuda::std::isnan(b.imag());
      if (b_has_nan) {
        return b;
      }
    } else if constexpr (!cuda::std::is_integral_v<T>) {
      // Non-integral types: any NaN operand yields a quiet NaN.
      const bool any_nan = cuda::std::isnan(a) || cuda::std::isnan(b);
      if (any_nan) {
        return cuda::std::numeric_limits<float>::quiet_NaN();
      }
    }
    if (a > b) {
      return a;
    }
    return b;
  }

  // Atomically folds `value` into `*target` through atomic_reduce.
  template <typename T>
  __device__ void atomic_update(T* target, T value) {
    atomic_reduce<T, Max>(target, value);
  }
};

// ReduceResult<Op, T>::type is the type a reduction with Op over inputs of
// type T accumulates in and produces.
template <typename Op, typename T>
struct ReduceResult;

// Logical reductions always produce bool.
template <typename T>
struct ReduceResult<And, T> {
  using type = bool;
};

template <typename T>
struct ReduceResult<Or, T> {
  using type = bool;
};

// Sum and Prod use int32_t for integral inputs of at most 4 bytes and keep
// the input type otherwise.
template <typename T>
struct ReduceResult<Sum, T> {
  using type = cuda::std::conditional_t<
      (sizeof(T) <= 4 && cuda::std::is_integral_v<T>),
      int32_t,
      T>;
};

template <typename T>
struct ReduceResult<Prod, T> {
  using type = cuda::std::conditional_t<
      (sizeof(T) <= 4 && cuda::std::is_integral_v<T>),
      int32_t,
      T>;
};

// Min and Max keep the input type.
template <typename T>
struct ReduceResult<Min, T> {
  using type = T;
};

template <typename T>
struct ReduceResult<Max, T> {
  using type = T;
};

// ReduceInit<Op, T>::value() is the value a reduction with Op over inputs of
// type T starts from.
template <typename Op, typename T>
struct ReduceInit;

// And starts from true, Or from false.
template <typename T>
struct ReduceInit<And, T> {
  static constexpr __host__ __device__ bool value() {
    return true;
  }
};

template <typename T>
struct ReduceInit<Or, T> {
  static constexpr __host__ __device__ bool value() {
    return false;
  }
};

// Sum starts from zero, expressed in the result type (or as a complex T).
template <typename T>
struct ReduceInit<Sum, T> {
  static constexpr __host__ __device__ auto value() {
    if constexpr (!is_complex_v<T>) {
      return cast_to<typename ReduceResult<Sum, T>::type>(0);
    } else {
      return T{0, 0};
    }
  }
};

// Prod starts from one, expressed in the result type (or as a complex T).
template <typename T>
struct ReduceInit<Prod, T> {
  static constexpr __host__ __device__ auto value() {
    if constexpr (!is_complex_v<T>) {
      return cast_to<typename ReduceResult<Prod, T>::type>(1);
    } else {
      return T{1, 0};
    }
  }
};

// Min starts from Limits<T>::max() and Max from Limits<T>::min().
template <typename T>
struct ReduceInit<Min, T> {
  static constexpr __host__ __device__ T value() {
    return Limits<T>::max();
  }
};

template <typename T>
struct ReduceInit<Max, T> {
  static constexpr __host__ __device__ T value() {
    return Limits<T>::min();
  }
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/reduce/reduce_utils.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <numeric>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/utils.cuh"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// uint_by_size<N>::type is an unsigned integer type that is N bytes wide.
// Only 2, 4 and 8 bytes are provided.
template <size_t N>
struct uint_by_size;

// 2 bytes
template <>
struct uint_by_size<2> {
  using type = uint16_t;
};

// 4 bytes
template <>
struct uint_by_size<4> {
  using type = uint32_t;
};

// 8 bytes
template <>
struct uint_by_size<8> {
  using type = unsigned long long;
};

// Atomically performs *x = Op{}(*x, y) with a compare-and-swap loop.
//
// Types of 2, 4 or 8 bytes are reinterpreted as the unsigned integer of the
// same size. One-byte types are updated through the 2-byte aligned 16-bit
// word that contains them; the other byte of that word is written back
// unchanged.
template <typename T, typename Op>
__device__ void atomic_reduce(T* x, T y) {
  if constexpr (sizeof(T) == 1) {
    using U = uint16_t;
    // Round the address down to the enclosing 16-bit word and work out which
    // byte of that word holds *x.
    U* x_int = (U*)((char*)x - ((size_t)x % 2));
    int shift = ((char*)x - (char*)x_int) * 8;
    int mask = 0xff << shift;
    U old_val, new_val;
    do {
      old_val = *x_int;
      T result = Op{}(static_cast<T>((old_val >> shift) & 0xff), y);
      // Keep only the low 8 bits of the result before shifting it into
      // place. A negative signed 1-byte value is sign-extended when promoted
      // to int, and without the mask those extra bits would overwrite the
      // neighbouring byte when shift == 0.
      new_val = (old_val & ~mask) | ((static_cast<int>(result) & 0xff) << shift);
    } while (atomicCAS(x_int, old_val, new_val) != old_val);
  } else {
    using U = typename uint_by_size<sizeof(T)>::type;
    U* x_int = (U*)(x);
    U old_val, new_val;
    do {
      // Retry until no other thread changed the word between the read and
      // the compare-and-swap.
      old_val = *x_int;
      T result = Op{}(*((T*)&old_val), y);
      new_val = *((U*)&result);
    } while (atomicCAS(x_int, old_val, new_val) != old_val);
  }
}

// Reduces the N values held by every thread of `block` with `op`, in place.
//
// block: the thread block group, used for the barrier.
// warp:  the tile of `block` the calling thread belongs to.
// vals:  per-thread inputs; overwritten with the reduced values.
// smem:  scratch space with room for N values per warp of the block.
// init:  value used to pad lanes that have no warp result to load.
//
// If the block has a single warp only the first warp-level reduce runs.
template <typename T, int N, typename Block, typename Warp, typename Op>
inline __device__ void
block_reduce(Block block, Warp warp, T (&vals)[N], T* smem, Op op, T init) {
  // First reduce in the current warp
  for (int i = 0; i < N; i++) {
    vals[i] = cg::reduce(warp, vals[i], op);
  }

  // Reduce across warps
  if (warp.meta_group_size() > 1) {
    // Lane 0 of each warp publishes that warp's results.
    if (warp.thread_rank() == 0) {
      for (int i = 0; i < N; i++) {
        smem[warp.meta_group_rank() * N + i] = vals[i];
      }
    }
    block.sync();
    // Lane k picks up the results of warp k; lanes beyond the number of warps
    // contribute `init` so they do not affect the final reduce.
    if (warp.thread_rank() < warp.meta_group_size()) {
      for (int i = 0; i < N; i++) {
        vals[i] = smem[warp.thread_rank() * N + i];
      }
    } else {
      for (int i = 0; i < N; i++) {
        vals[i] = init;
      }
    }
    for (int i = 0; i < N; i++) {
      vals[i] = cg::reduce(warp, vals[i], op);
    }
  }
}

} // namespace cu

// Allocates the data of `out` so that its axes are ordered in memory the same
// way as the axes of `in`.
//
// A row-contiguous input gets a plain allocation. Otherwise `out` must have
// at least as many dims as `in`; a std::runtime_error is thrown if it does
// not.
inline void allocate_same_layout(
    array& out,
    const array& in,
    const std::vector<int>& axes,
    cu::CommandEncoder& encoder) {
  if (in.flags().row_contiguous) {
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
    return;
  }

  if (out.ndim() < in.ndim()) {
    throw std::runtime_error(
        "Reduction without keepdims only supported for row-contiguous inputs");
  }

  const auto ndim = in.ndim();

  // Axes of `in` ordered from the largest stride to the smallest. This is
  // the permutation that has to be applied to `out` as well.
  std::vector<int> perm(ndim);
  std::iota(perm.begin(), perm.end(), 0);
  std::sort(perm.begin(), perm.end(), [&](int lhs, int rhs) {
    return in.strides(lhs) > in.strides(rhs);
  });

  // Shape of `out` in that order, with row-major strides computed for it.
  Shape permuted_shape(ndim);
  for (int i = 0; i < ndim; i++) {
    permuted_shape[i] = out.shape(perm[i]);
  }
  Strides permuted_strides(ndim, 1);
  for (int i = ndim - 2; i >= 0; i--) {
    permuted_strides[i] = permuted_shape[i + 1] * permuted_strides[i + 1];
  }

  // Scatter the strides back to the original axis positions.
  Strides strides(ndim);
  for (int i = 0; i < ndim; i++) {
    strides[perm[i]] = permuted_strides[i];
  }

  // Work out the contiguity flags for the new strides and allocate.
  auto [data_size, row_contig, col_contig] =
      check_contiguity(out.shape(), strides);
  auto flags = in.flags();
  flags.row_contiguous = row_contig;
  flags.col_contiguous = col_contig;
  flags.contiguous = true;
  out.set_data(
      cu::malloc_async(out.nbytes(), encoder),
      data_size,
      strides,
      flags,
      allocator::free);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/reduce/row_reduce.cu
================================================
// Copyright © 2025 Apple Inc.

#include <numeric>

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/reduce/reduce.cuh"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

struct RowReduceArgs {
  // The size of the row being reduced, i.e. the size of last dimension.
  int row_size;

  // Input shape and strides excluding the reduction axes.
  Shape shape;
  Strides strides;
  int ndim;

  // Input shape and strides of the reduction axes excluding last dimension.
  Shape reduce_shape;
  Strides reduce_strides;
  int reduce_ndim;

  // The number of rows we are reducing. Namely prod(reduce_shape).
  size_t non_row_reductions;

  RowReduceArgs(
      const array& in,
      const ReductionPlan& plan,
      const std::vector<int>& axes) {
    assert(!plan.shape.empty());
    row_size = plan.shape.back();

    auto [shape_vec, strides_vec] = shapes_without_reduction_axes(in, axes);
    std::tie(shape_vec, strides_vec) =
        collapse_contiguous_dims(shape_vec, strides_vec);
    shape = const_param(shape_vec);
    strides = const_param(strides_vec);
    ndim = shape_vec.size();

    reduce_shape = const_param(plan.shape);
    reduce_strides = const_param(plan.strides);
    reduce_ndim = plan.shape.size() - 1;

    non_row_reductions = 1;
    for (int i = 0; i < reduce_ndim; i++) {
      non_row_reductions *= reduce_shape[i];
    }
  }

  // Convert shape and strides as if in was contiguous
  void sort_access_pattern(const array& in, const std::vector<int>& axes) {
    auto shape_vec = in.shape();
    auto strides_vec = in.strides();
    std::tie(shape_vec, strides_vec) =
        shapes_without_reduction_axes(shape_vec, strides_vec, axes);
    std::vector<int> indices(shape_vec.size());
    std::iota(indices.begin(), indices.end(), 0);
    std::sort(indices.begin(), indices.end(), [&](int left, int right) {
      return strides_vec[left] > strides_vec[right];
    });
    decltype(shape_vec) sorted_shape;
    decltype(strides_vec) sorted_strides;
    for (auto idx : indices) {
      sorted_shape.push_back(shape_vec[idx]);
      sorted_strides.push_back(strides_vec[idx]);
    }
    std::tie(shape_vec, strides_vec) =
        collapse_contiguous_dims(sorted_shape, sorted_strides);
    shape = const_param(shape_vec);
    strides = const_param(strides_vec);
    ndim = shape_vec.size();
  }
};

// Row reduction over the last axis: every thread block reduces M consecutive
// rows of `size` elements each and writes M outputs.
//
//   in      input; row r is read starting at in + r * size.
//   out     one value per row.
//   n_rows  number of rows (and outputs).
//   size    number of elements in a row.
//
// Template parameters:
//   T / U     input element type / accumulator and output type.
//   ReduceOp  reduction operator; ReduceInit<ReduceOp, T> gives the start value.
//   N         elements loaded by a thread per row and iteration.
//   M         rows handled by one block.
//
// The first row of a block is clamped to n_rows - M, so a block that would
// start past the last full group of M rows recomputes the final M rows and
// then stores only the rows that no earlier block owns.
// NOTE(review): the clamp assumes n_rows >= M; the visible launcher only uses
// M = 2 when the grid has at least 1024 blocks along x.
template <typename T, typename U, typename ReduceOp, int N = 4, int M = 1>
__global__ void
row_reduce_simple(const T* in, U* out, size_t n_rows, int size) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  const U init = cu::ReduceInit<ReduceOp, T>::value();
  ReduceOp op;

  // vals holds the freshly loaded elements, accs one accumulator per row.
  AlignedVector<T, N> vals[M];
  AlignedVector<U, M> accs;
  for (int i = 0; i < M; i++) {
    accs[i] = init;
  }

  const size_t start_row =
      min(n_rows - M, static_cast<size_t>(grid.block_rank() * M));
  // Part of the row that every thread of the block can read with a full
  // N-wide load; the remainder is handled element by element below.
  const size_t full_blocks = size / (block.size() * N);
  const size_t final_offset = full_blocks * (block.size() * N);
  in += start_row * size + block.thread_rank() * N;
  out += start_row;

  for (size_t r = 0; r < full_blocks; r++) {
    for (int k = 0; k < M; k++) {
      vals[k] = load_vector<N>(in + k * size, 0);
    }
    for (int k = 0; k < M; k++) {
      for (int j = 0; j < N; j++) {
        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
      }
    }

    in += block.size() * N;
  }

  // Tail of the row: elements past the end are replaced by the initial value
  // so that they do not affect the result.
  if (final_offset < size) {
    for (int k = 0; k < M; k++) {
      for (int i = 0; i < N; i++) {
        vals[k][i] = ((final_offset + block.thread_rank() * N + i) < size)
            ? in[k * size + i]
            : cast_to<T>(init);
      }
    }
    for (int k = 0; k < M; k++) {
      for (int j = 0; j < N; j++) {
        accs[k] = op(accs[k], cast_to<U>(vals[k][j]));
      }
    }
  }

  // Combine the per-thread accumulators of the block.
  __shared__ U shared_accumulators[32 * M];
  block_reduce(block, warp, accs.val, shared_accumulators, op, init);

  if (block.thread_rank() == 0) {
    if (grid.block_rank() * M + M <= n_rows) {
      store_vector(out, 0, accs);
    } else {
      // This block was clamped: its first `overlap` rows are owned by earlier
      // blocks, so only the remaining ones are written. The overlap is kept
      // in size_t because it grows with the number of surplus blocks and
      // would wrap (possibly to a negative index) in a narrower signed type.
      const size_t overlap =
          static_cast<size_t>(grid.block_rank()) * M + M - n_rows;
      for (size_t i = overlap; i < M; i++) {
        out[i] = accs[i];
      }
    }
  }
}

// General row reduction: each thread block produces a single output element.
//
// The block with rank `out_idx` finds its first input element with elem_to_loc
// over the non-reduced shape/strides stored in `args`. It then visits
// `args.non_row_reductions` rows, moving from one to the next with `loop` over
// the reduction shape/strides, and folds `args.row_size` elements of each row
// into one accumulator per thread.
//
// Template parameters:
//   T / U    input element type / accumulator and output type.
//   Op       reduction operator; ReduceInit<Op, T> gives the start value.
//   NDIM     forwarded to LoopedElemToLoc (presumably the number of looped
//            reduction dimensions -- confirm against its definition).
//   N_READS  elements a thread reads per vector load.
template <typename T, typename U, typename Op, int NDIM, int N_READS = 4>
__global__ void row_reduce_looped(
    const T* in,
    U* out,
    const __grid_constant__ RowReduceArgs args) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  // One output per block.
  size_t out_idx = grid.block_rank();

  Op op;

  // Single-element array so that it can be handed to block_reduce below.
  U total[1];
  U init = ReduceInit<Op, T>::value();
  total[0] = init;
  LoopedElemToLoc<NDIM, (NDIM > 2)> loop(args.reduce_ndim);
  // Part of a row that every thread can read with full N_READS-wide loads.
  const size_t full_blocks = args.row_size / (block.size() * N_READS);
  const size_t final_offset = full_blocks * (block.size() * N_READS);

  // Move to the first element of this output, then to this thread's slice.
  in += elem_to_loc(out_idx, args.shape.data(), args.strides.data(), args.ndim);
  in += block.thread_rank() * N_READS;

  // Unaligned reduce: the row has a tail that not every thread can read with
  // a full vector load.
  if (final_offset < args.row_size) {
    // The tail mask is the same for every row, so compute it once.
    bool mask[N_READS];
    for (int i = 0; i < N_READS; i++) {
      mask[i] =
          (final_offset + block.thread_rank() * N_READS + i) < args.row_size;
    }

    for (size_t n = 0; n < args.non_row_reductions; n++) {
      const T* inlocal = in + loop.location();

      for (size_t r = 0; r < full_blocks; r++) {
        auto vals = load_vector<N_READS>(inlocal, 0);
        for (int i = 0; i < N_READS; i++) {
          total[0] = op(total[0], cast_to<U>(vals[i]));
        }
        inlocal += block.size() * N_READS;
      }

      // Tail: masked-out elements are replaced by the initial value.
      {
        T vals[N_READS];
        for (int i = 0; i < N_READS; i++) {
          vals[i] = mask[i] ? inlocal[i] : cast_to<T>(init);
        }
        for (int i = 0; i < N_READS; i++) {
          total[0] = op(total[0], cast_to<U>(vals[i]));
        }
      }

      // Advance to the next row of the reduction.
      loop.next(args.reduce_shape.data(), args.reduce_strides.data());
    }
  }

  // Aligned case: the row is an exact multiple of block.size() * N_READS.
  else {
    for (size_t n = 0; n < args.non_row_reductions; n++) {
      const T* inlocal = in + loop.location();

      for (size_t r = 0; r < full_blocks; r++) {
        auto vals = load_vector<N_READS>(inlocal, 0);
        for (int i = 0; i < N_READS; i++) {
          total[0] = op(total[0], cast_to<U>(vals[i]));
        }
        inlocal += block.size() * N_READS;
      }

      loop.next(args.reduce_shape.data(), args.reduce_strides.data());
    }
  }

  // Combine the per-thread totals of the block; thread 0 writes the result.
  __shared__ U shared_accumulators[32];
  block_reduce(block, warp, total, shared_accumulators, op, init);

  if (block.thread_rank() == 0) {
    out[out_idx] = total[0];
  }
}

} // namespace cu

// Launches cu::row_reduce_simple for a reduction over a single axis.
void row_reduce_simple(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan) {
  // The output gets the layout of the input so that the kernel does not need
  // elem_to_loc to address it.
  allocate_same_layout(out, in, axes, encoder);

  // TODO: If out.size() < 1024 which will be a common case then write this in
  //       2 passes. Something like 32 * out.size() and then do a warp reduce.
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      using OP = MLX_GET_TYPE(reduce_type_tag);
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using U = typename cu::ReduceResult<OP, T>::type;

      // Every thread reads 16 bytes at a time.
      constexpr int N_READS = 16 / sizeof(T);

      // Kernel arguments (kept as plain, non-const locals).
      int size = plan.shape.back();
      T* indata = const_cast<T*>(gpu_ptr<T>(in));

      // Block: a quarter of the warps needed to cover a row with one vector
      // load per thread, limited to the range [1, 32].
      size_t vector_loads = (size + N_READS - 1) / N_READS;
      int n_warps = (vector_loads + WARP_SIZE - 1) / WARP_SIZE;
      n_warps = std::max(std::min(n_warps / 4, 32), 1);
      dim3 block(n_warps * WARP_SIZE, 1, 1);

      // Grid: one row per block, or two rows per block for large outputs.
      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());
      auto kernel = cu::row_reduce_simple<T, U, OP, N_READS>;
      if (grid.x >= 1024) {
        kernel = cu::row_reduce_simple<T, U, OP, N_READS, 2>;
        grid.x = (grid.x + 1) / 2;
      }

      encoder.add_kernel_node(
          kernel, grid, block, indata, gpu_ptr<U>(out), out.size(), size);
    });
  });
}

// Launches cu::row_reduce_looped, the general row reduction that computes one
// output per thread block.
void row_reduce_looped(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan,
    cu::RowReduceArgs args) {
  // The output follows the layout of the input so that both are accessed as
  // contiguously as possible.
  allocate_same_layout(out, in, axes, encoder);

  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    dispatch_reduce_ops(reduce_type, [&](auto reduce_type_tag) {
      using OP = MLX_GET_TYPE(reduce_type_tag);
      using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      using U = typename cu::ReduceResult<OP, T>::type;

      // Every thread reads 16 bytes at a time.
      constexpr int N_READS = 16 / sizeof(T);

      // Order the non-reduced axes by stride before computing the launch.
      args.sort_access_pattern(in, axes);

      // Grid: one block per output element.
      dim3 grid = get_2d_grid_dims(out.shape(), out.strides());

      // Block: a quarter of the warps needed to cover a row with one vector
      // load per thread, limited to the range [1, 32].
      size_t vector_loads = (args.row_size + N_READS - 1) / N_READS;
      int n_warps = (vector_loads + WARP_SIZE - 1) / WARP_SIZE;
      n_warps = std::max(std::min(n_warps / 4, 32), 1);
      dim3 block(n_warps * WARP_SIZE, 1, 1);

      // Start from the 1-d instantiation and let the dispatcher replace it
      // with the one matching the number of looped reduction axes.
      auto kernel = cu::row_reduce_looped<T, U, OP, 1, N_READS>;
      dispatch_reduce_ndim(args.reduce_ndim, [&](auto reduce_ndim) {
        kernel = cu::row_reduce_looped<T, U, OP, reduce_ndim.value, N_READS>;
      });

      encoder.add_kernel_node(
          kernel, grid, block, gpu_ptr<T>(in), gpu_ptr<U>(out), args);
    });
  });
}

// Entry point for row reductions: picks between the two kernels below.
//
// - row_reduce_simple
//
//   Used when the plan has a single reduction axis (which has stride 1), i.e.
//   we only reduce across the fastest moving axis. One or two rows are
//   reduced per threadblock depending on the size of the output.
//
// - row_reduce_looped
//
//   The general row reduction. One output is computed per threadblock: the
//   fastest moving axis is read vectorized and the remaining reduction axes
//   are looped over.
//
// Notes: We opt to read as much in order as possible and leave
//        transpositions as they are (contrary to our Metal backend).
void row_reduce(
    cu::CommandEncoder& encoder,
    const array& in,
    array& out,
    Reduce::ReduceType reduce_type,
    const std::vector<int>& axes,
    const ReductionPlan& plan) {
  const bool single_axis = plan.shape.size() == 1;

  if (single_axis) {
    row_reduce_simple(encoder, in, out, reduce_type, axes, plan);
  } else {
    // Fallback: build the args struct and run the looped kernel.
    row_reduce_looped(
        encoder,
        in,
        out,
        reduce_type,
        axes,
        plan,
        cu::RowReduceArgs(in, plan, axes));
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/reduce.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/reduce/reduce.cuh"
#include "mlx/backend/gpu/copy.h"

#include <nvtx3/nvtx3.hpp>

#include <cassert>

namespace mlx::core {

// Evaluates a reduction on the GPU by building a reduction plan for the input
// and forwarding to the all / row / column reduce implementation.
void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Reduce::eval_gpu");
  assert(inputs.size() == 1);
  array in = inputs[0];

  // Identity reductions are not expected to reach this point.
  assert(!axes_.empty());
  assert(out.size() != in.size());

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Empty input: there is nothing to read, init_reduce produces the output.
  if (in.size() == 0) {
    init_reduce(encoder, in, out, reduce_type_);
    return;
  }

  ReductionPlan plan = get_reduction_plan(in, axes_);

  // Look for a non-reduced axis with stride 0. `axes_` is walked in step with
  // the dimensions, so it is consumed in increasing order.
  bool broadcasted = false;
  int next_axis = 0;
  for (int dim = 0; dim < in.ndim(); dim++) {
    if (next_axis < axes_.size() && axes_[next_axis] == dim) {
      next_axis++;
      continue;
    }
    if (in.strides(dim) == 0) {
      broadcasted = true;
      break;
    }
  }

  // General, broadcasted or non-contiguous inputs are copied to a contiguous
  // array and the plan is recomputed for the copy.
  //
  // TODO: Instead of copying we can use elem-to-loc to deal with broadcasting
  //       like we do in Metal. When it comes to broadcasted reduction axes
  //       some can be ignored eg for min/max.
  const bool needs_copy =
      plan.type == GeneralReduce || broadcasted || !in.flags().contiguous;
  if (needs_copy) {
    array in_copy = contiguous_copy_gpu(in, s);
    encoder.add_temporary(in_copy);
    in = in_copy;
    plan = get_reduction_plan(in, axes_);
  }

  switch (plan.type) {
    case ContiguousAllReduce:
      all_reduce(encoder, in, out, reduce_type_);
      return;
    case ContiguousReduce:
    case GeneralContiguousReduce:
      row_reduce(encoder, in, out, reduce_type_, axes_, plan);
      return;
    case ContiguousStridedReduce:
    case GeneralStridedReduce:
      col_reduce(encoder, in, out, reduce_type_, axes_, plan);
      return;
    default:
      break;
  }

  throw std::runtime_error("No plan reached in reduce.");
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/rms_norm.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/reduce/reduce.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/fast_primitives.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Component-wise sum of two float2 values.
inline __device__ float2 plus_f2(const float2& a, const float2& b) {
  float2 sum = a;
  sum.x += b.x;
  sum.y += b.y;
  return sum;
}

// Similar to cub::BlockReduce, but the result is broadcast to every thread.
//
// BLOCK_DIM is the number of threads that cooperate on one reduction and
// GROUP_DIM the width of the tiles used for the first reduction stage. When
// BLOCK_DIM > GROUP_DIM the per-tile partials are exchanged through `temp`
// and reduced a second time inside each tile.
//
// A thread block may hold several independent groups of BLOCK_DIM threads
// (the small kernels in this file launch blocks of BLOCK_DIM x rows threads).
// Tiles are numbered across the whole block, so the scratch space has one
// slot per tile of the block and the second stage only combines the tiles of
// the caller's own group; partials of different rows are never mixed.
// Assumes blockDim.x == BLOCK_DIM and that BLOCK_DIM is a multiple of
// GROUP_DIM when it is larger -- TODO confirm for any new caller.
template <typename T, int BLOCK_DIM, int GROUP_DIM = WARP_SIZE>
struct BlockBroadcastReduce {
  static_assert(
      BLOCK_DIM <= GROUP_DIM * GROUP_DIM,
      "The per-tile partials must fit in a single tile for the second stage");

  // One slot per tile in the block. A block has at most 1024 threads, which
  // bounds the number of tiles; a single slot is enough when no exchange
  // through shared memory takes place.
  using TempStorage = T[BLOCK_DIM > GROUP_DIM ? 1024 / GROUP_DIM : 1];

  cg::thread_block& block;
  TempStorage& temp;

  // Reduces `input` over the BLOCK_DIM threads of the caller's group with
  // `op` and returns the result to every one of them. `init_value` must be
  // the identity of `op`; it pads the second stage.
  template <typename Op>
  __device__ T Reduce(const T& input, const Op& op, const T& init_value) {
    auto warp = cg::tiled_partition<GROUP_DIM>(block);
    T x = cg::reduce(warp, input, op);
    if constexpr (BLOCK_DIM > GROUP_DIM) {
      constexpr unsigned int tiles_per_group = BLOCK_DIM / GROUP_DIM;

      // Stage 1 result: one partial per tile, indexed block-wide.
      if (warp.thread_rank() == 0) {
        temp[warp.meta_group_rank()] = x;
      }
      block.sync();

      // Stage 2: every tile reduces the partials of its own group only.
      const unsigned int first_tile =
          (warp.meta_group_rank() / tiles_per_group) * tiles_per_group;
      x = warp.thread_rank() < tiles_per_group
          ? temp[first_tile + warp.thread_rank()]
          : init_value;
      return cg::reduce(warp, x, op);
    } else {
      return x;
    }
  }

  // Sum of `input` over the group, using a value-initialized T as identity.
  __device__ T Sum(const T& input) {
    return Reduce(input, cg::plus<T>{}, T{});
  }
};

// RMS norm for short rows: every thread loads its part of the row once with a
// single load_vector call and reuses it for both the normalizer and the
// output.
//
// The thread block is two dimensional: threadIdx.x selects the vector within
// a row and threadIdx.y selects one of the rows handled by the block.
//
//   x, out    rows of `axis_size` elements, addressed as row * axis_size.
//   w         weight, read through load_vector with stride `w_stride`.
//   eps       added to the mean of squares before the reciprocal square root.
//   n_rows    total number of rows; threads mapped past it return at once.
//
// BLOCK_DIM and REDUCE_DIM are forwarded to BlockBroadcastReduce as its block
// and group sizes.
// NOTE(review): load_vector / store_vector are defined elsewhere; they are
// presumably bounds-checked against `axis_size` with T(0) as the fill value --
// confirm against their definition.
template <typename T, int BLOCK_DIM, int REDUCE_DIM, int N_READS = 4>
__global__ void rms_norm_small(
    const T* x,
    const T* w,
    T* out,
    float eps,
    uint32_t axis_size,
    uint32_t n_rows,
    int64_t w_stride) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  using BlockReduceT = BlockBroadcastReduce<float, BLOCK_DIM, REDUCE_DIM>;
  __shared__ typename BlockReduceT::TempStorage temp;

  // Row handled by this thread.
  // NOTE(review): threads past the last row return before the reduction
  // below, which synchronizes the whole block when BLOCK_DIM > REDUCE_DIM --
  // confirm this is safe when n_rows is not a multiple of blockDim.y.
  auto row =
      (grid.block_rank() * block.dim_threads().y) + block.thread_index().y;
  if (row >= n_rows) {
    return;
  }
  x += row * axis_size;
  out += row * axis_size;

  // Normalizer: sum of squares of this thread's elements, accumulated in
  // float, then reduced over the row.
  float normalizer = 0;
  auto index = block.thread_index().x;
  auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
#pragma unroll
  for (int i = 0; i < N_READS; ++i) {
    float t = static_cast<float>(xn[i]);
    normalizer += t * t;
  }

  normalizer = BlockReduceT{block, temp}.Sum(normalizer);
  normalizer = rsqrt(normalizer / axis_size + eps);

  // Outputs: out = w * (x * normalizer); xn is overwritten with the result.
  auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
#pragma unroll
  for (int i = 0; i < N_READS; ++i) {
    float y = static_cast<float>(xn[i]) * normalizer;
    xn[i] = wn[i] * static_cast<T>(y);
  }
  store_vector<N_READS>(out, index, xn, axis_size);
}

// RMS norm for long rows: one thread block per row, with every thread looping
// over the row in steps of BLOCK_DIM vectors.
//
//   x, out    rows of `axis_size` elements; the block with rank b works on
//             the row starting at b * axis_size.
//   w         weight, read through load_vector with stride `w_stride`.
//   eps       added to the mean of squares before the reciprocal square root.
//
// The row is read twice: once to accumulate the sum of squares and once more
// to produce the output.
// NOTE(review): `index` is passed to load_vector / store_vector, which are
// defined elsewhere; it presumably counts N_READS-wide vectors and the
// helpers are presumably bounds-checked against `axis_size` -- confirm.
template <typename T, int BLOCK_DIM, int N_READS = 4>
__global__ void rms_norm(
    const T* x,
    const T* w,
    T* out,
    float eps,
    uint32_t axis_size,
    int64_t w_stride) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  using BlockReduceT = BlockBroadcastReduce<float, BLOCK_DIM>;
  __shared__ typename BlockReduceT::TempStorage temp;

  x += grid.block_rank() * axis_size;
  out += grid.block_rank() * axis_size;

  // Normalizer: per-thread sum of squares in float, then reduced over the
  // block and turned into 1 / sqrt(mean + eps).
  float normalizer = 0;
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      float t = static_cast<float>(xn[i]);
      normalizer += t * t;
    }
  }
  normalizer = BlockReduceT{block, temp}.Sum(normalizer);
  normalizer = rsqrt(normalizer / axis_size + eps);

  // Outputs: out = w * (x * normalizer), re-reading x.
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
    auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      float y = static_cast<float>(xn[i]) * normalizer;
      xn[i] = wn[i] * static_cast<T>(y);
    }
    store_vector<N_READS>(out, index, xn, axis_size);
  }
}

// Backward pass of RMS norm for short rows: every thread loads its part of
// x, g and w once and reuses it for the reduction and for the outputs.
//
// The thread block is two dimensional: threadIdx.x selects the vector within
// a row and threadIdx.y selects one of the rows handled by the block.
//
//   x, g      input and cotangent rows of `axis_size` elements.
//   w         weight, read through load_vector with stride `w_stride`.
//   gx        output: gradient with respect to x, one row per input row.
//   gw        output: per-row gradient with respect to w; only written when
//             HAS_W is true.
//   n_rows    total number of rows; threads mapped past it return at once.
//
// BLOCK_DIM and REDUCE_DIM are forwarded to BlockBroadcastReduce as its block
// and group sizes.
template <
    typename T,
    bool HAS_W,
    int BLOCK_DIM,
    int REDUCE_DIM,
    int N_READS = 4>
__global__ void rms_norm_vjp_small(
    const T* x,
    const T* w,
    const T* g,
    T* gx,
    T* gw,
    float eps,
    int32_t axis_size,
    int32_t n_rows,
    int64_t w_stride) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  using BlockReduceF2 = BlockBroadcastReduce<float2, BLOCK_DIM, REDUCE_DIM>;
  __shared__ typename BlockReduceF2::TempStorage temp;

  // Row handled by this thread.
  // NOTE(review): threads past the last row return before the reduction
  // below, which synchronizes the whole block when BLOCK_DIM > REDUCE_DIM --
  // confirm this is safe when n_rows is not a multiple of blockDim.y.
  auto row =
      (grid.block_rank() * block.dim_threads().y) + block.thread_index().y;
  if (row >= n_rows) {
    return;
  }

  x += row * axis_size;
  g += row * axis_size;
  gx += row * axis_size;
  gw += row * axis_size;

  // Normalizer: accumulate sum(w * g * x) in .x and sum(x * x) in .y so that
  // both are reduced together.
  float2 factors = {};
  auto index = block.thread_index().x;
  auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
  auto gn = load_vector<N_READS>(g, index, axis_size, T(0));
  auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
  for (int i = 0; i < N_READS; i++) {
    float t = static_cast<float>(xn[i]);
    float wi = wn[i];
    float gi = gn[i];
    float wg = wi * gi;
    factors = plus_f2(factors, {wg * t, t * t});
  }

  factors = BlockReduceF2{block, temp}.Reduce(factors, plus_f2, {});
  float meangwx = factors.x / axis_size;
  float normalizer = rsqrt(factors.y / axis_size + eps);
  float normalizer3 = normalizer * normalizer * normalizer;

  // Outputs: xn and wn are overwritten with the gradients before being
  // stored, so the original values are read first.
  for (int i = 0; i < N_READS; i++) {
    float xi = xn[i];
    float wi = wn[i];
    float gi = gn[i];
    xn[i] = static_cast<T>(normalizer * wi * gi - xi * meangwx * normalizer3);
    if constexpr (HAS_W) {
      wn[i] = static_cast<T>(gi * xi * normalizer);
    }
  }
  store_vector<N_READS>(gx, index, xn, axis_size);
  if constexpr (HAS_W) {
    store_vector<N_READS>(gw, index, wn, axis_size);
  }
}

// Backward pass of RMS norm for long rows: one thread block per row, with
// every thread looping over the row in steps of BLOCK_DIM vectors.
//
//   x, g      input and cotangent rows of `axis_size` elements; the block
//             with rank b works on the rows starting at b * axis_size.
//   w         weight, read through load_vector with stride `w_stride`.
//   gx        output: gradient with respect to x, one row per input row.
//   gw        output: per-row gradient with respect to w; only written when
//             HAS_W is true.
//
// x, g and w are read twice: once for the reduction and once more for the
// outputs. Each output vector is stored after the inputs at the same index
// have been loaded in the second pass.
template <typename T, bool HAS_W, int BLOCK_DIM, int N_READS = 4>
__global__ void rms_norm_vjp(
    const T* x,
    const T* w,
    const T* g,
    T* gx,
    T* gw,
    float eps,
    int32_t axis_size,
    int64_t w_stride) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  using BlockReduceF2 = BlockBroadcastReduce<float2, BLOCK_DIM>;
  __shared__ typename BlockReduceF2::TempStorage temp;

  x += grid.block_rank() * axis_size;
  g += grid.block_rank() * axis_size;
  gx += grid.block_rank() * axis_size;
  gw += grid.block_rank() * axis_size;

  // Normalizer: accumulate sum(w * g * x) in .x and sum(x * x) in .y so that
  // both are reduced together.
  float2 factors = {};
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
    auto gn = load_vector<N_READS>(g, index, axis_size, T(0));
    auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
    for (int i = 0; i < N_READS; i++) {
      float t = static_cast<float>(xn[i]);
      float wi = wn[i];
      float gi = gn[i];
      float wg = wi * gi;
      factors = plus_f2(factors, {wg * t, t * t});
    }
  }
  factors = BlockReduceF2{block, temp}.Reduce(factors, plus_f2, {});
  float meangwx = factors.x / axis_size;
  float normalizer = rsqrt(factors.y / axis_size + eps);
  float normalizer3 = normalizer * normalizer * normalizer;

  // Outputs: xn and wn are overwritten with the gradients before being
  // stored, so the original values are read first.
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); ++r) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto xn = load_vector<N_READS>(x, index, axis_size, T(0));
    auto gn = load_vector<N_READS>(g, index, axis_size, T(0));
    auto wn = load_vector<N_READS>(w, index, axis_size, w_stride, T(0));
    for (int i = 0; i < N_READS; i++) {
      float xi = xn[i];
      float wi = wn[i];
      float gi = gn[i];
      xn[i] = static_cast<T>(normalizer * wi * gi - xi * meangwx * normalizer3);
      if constexpr (HAS_W) {
        wn[i] = static_cast<T>(gi * xi * normalizer);
      }
    }
    store_vector<N_READS>(gx, index, xn, axis_size);
    if constexpr (HAS_W) {
      store_vector<N_READS>(gw, index, wn, axis_size);
    }
  }
}

} // namespace cu

namespace fast {

// The fallback implementation is used only for streams on the CPU device.
bool RMSNorm::use_fallback(Stream s) {
  const bool on_cpu = (s.device == Device::cpu);
  return on_cpu;
}

// Picks the launch shape for the small-row kernels from the row length.
//
// `f` is called exactly once with three std::integral_constant<int, ...>
// tags: (group_dim, n_groups, groups_per_block). A row is covered by
// group_dim * n_groups threads, each handling up to `n_per_thread` elements;
// rows longer than the largest bucket fall into the last one.
template <int n_per_thread, typename F>
void dispatch_group_dim(int axis_size, F&& f) {
  using std::integral_constant;

  // Rows that fit in a fraction of a warp: many rows per block.
  if (axis_size <= n_per_thread * 8) {
    f(integral_constant<int, 8>{},
      integral_constant<int, 1>{},
      integral_constant<int, 16>{});
    return;
  }
  if (axis_size <= n_per_thread * 16) {
    f(integral_constant<int, 16>{},
      integral_constant<int, 1>{},
      integral_constant<int, 8>{});
    return;
  }

  // Rows that need one full warp.
  if (axis_size <= n_per_thread * 32) {
    f(integral_constant<int, 32>{},
      integral_constant<int, 1>{},
      integral_constant<int, 4>{});
    return;
  }

  // Rows that need several warps.
  if (axis_size <= n_per_thread * 32 * 2) {
    f(integral_constant<int, 32>{},
      integral_constant<int, 2>{},
      integral_constant<int, 2>{});
    return;
  }
  if (axis_size <= n_per_thread * 32 * 4) {
    f(integral_constant<int, 32>{},
      integral_constant<int, 4>{},
      integral_constant<int, 1>{});
    return;
  }
  if (axis_size <= n_per_thread * 32 * 8) {
    f(integral_constant<int, 32>{},
      integral_constant<int, 8>{},
      integral_constant<int, 1>{});
    return;
  }
  if (axis_size <= n_per_thread * 32 * 16) {
    f(integral_constant<int, 32>{},
      integral_constant<int, 16>{},
      integral_constant<int, 1>{});
    return;
  }

  // Everything else: a full block of 32 warps per row.
  f(integral_constant<int, 32>{},
    integral_constant<int, 32>{},
    integral_constant<int, 1>{});
}

// TODO: There is duplicate code with backend/metal/normalization.cpp
//
// Forward pass of RMS norm on the GPU: prepares the output buffer (reusing or
// copying the input where needed) and launches the kernel that matches the
// row length.
void RMSNorm::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("RMSNorm::eval_gpu");
  auto& s = stream();
  auto& out = outputs[0];
  auto& encoder = cu::get_command_encoder(s);

  // Sets up the data of `out` and returns the array the kernel should read.
  // The input is used as is when it is contiguous, its last axis has stride 1
  // and its second to last axis is either broadcast or densely packed;
  // otherwise a contiguous copy is made and shared with the output.
  auto set_output = [&s, &out, &encoder](const array& x) -> array {
    bool usable = x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (usable && x.ndim() > 1) {
      auto row_stride = x.strides()[x.ndim() - 2];
      usable = (row_stride == 0 || row_stride == x.shape().back());
    }

    if (!usable) {
      array x_copy = contiguous_copy_gpu(x, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }

    if (x.is_donatable()) {
      out.copy_shared_buffer(x);
    } else {
      out.set_data(
          cu::malloc_async(x.data_size() * x.itemsize(), encoder),
          x.data_size(),
          x.strides(),
          x.flags());
    }
    return x;
  };

  const array x = set_output(inputs[0]);
  const array& w = inputs[1];

  // Kernel arguments (kept as plain, non-const locals).
  int32_t axis_size = x.shape().back();
  int32_t n_rows = x.data_size() / axis_size;
  int64_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;

  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "rms_norm", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    // Every thread reads 16 bytes at a time.
    constexpr int N_READS = 16 / sizeof(DataType);

    // Long rows: one block of 1024 threads per row.
    if (axis_size > N_READS * 1024) {
      auto kernel = cu::rms_norm<DataType, 1024, N_READS>;
      encoder.add_kernel_node(
          kernel,
          n_rows,
          1024,
          gpu_ptr<DataType>(x),
          gpu_ptr<DataType>(w),
          gpu_ptr<DataType>(out),
          eps_,
          axis_size,
          w_stride);
      return;
    }

    // Short rows: possibly several rows per block.
    dispatch_group_dim<N_READS>(
        axis_size, [&](auto group_dim, auto n_groups, auto groups_per_block) {
          constexpr int block_dim = n_groups() * group_dim();
          auto kernel =
              cu::rms_norm_small<DataType, block_dim, group_dim(), N_READS>;
          auto n_blocks =
              (n_rows + groups_per_block() - 1) / groups_per_block();
          encoder.add_kernel_node(
              kernel,
              n_blocks,
              {block_dim, groups_per_block()},
              gpu_ptr<DataType>(x),
              gpu_ptr<DataType>(w),
              gpu_ptr<DataType>(out),
              eps_,
              axis_size,
              n_rows,
              w_stride);
        });
  });
}

// Backward pass of RMS norm on the GPU.
//
// inputs:  [0] x, [1] w (a 0-d array means "no weight"), [2] g (cotangent).
// outputs: [0] gx (gradient w.r.t. x), [1] gw (gradient w.r.t. w).
//
// The kernels write one row of weight gradients per input row into a
// temporary (`gw_temp`); a column reduction over the rows then produces `gw`.
void RMSNormVJP::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("RMSNormVJP::eval_gpu");
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
  // Returns the array itself when it is row contiguous, otherwise a
  // contiguous copy; `copied` reports which of the two happened.
  auto check_input = [&s](const array& x, bool& copied) {
    if (x.flags().row_contiguous) {
      copied = false;
      return x;
    }
    copied = true;
    return contiguous_copy_gpu(x, s);
  };
  // A buffer can be reused for an output when the input is donatable or when
  // it is a copy made above.
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[2].is_donatable();
  bool copied;
  auto x = check_input(inputs[0], copied);
  donate_x |= copied;
  const array& w = inputs[1];
  bool g_copied;
  auto g = check_input(inputs[2], g_copied);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];

  // Check whether we had a weight.
  bool has_w = w.ndim() != 0;

  // Allocate space for the outputs. gx reuses the buffer of x if possible,
  // otherwise the buffer of g, otherwise it gets a fresh allocation.
  bool g_in_gx = false;
  if (donate_x) {
    gx.copy_shared_buffer(x);
  } else if (donate_g) {
    gx.copy_shared_buffer(g);
    g_in_gx = true;
  } else {
    gx.set_data(cu::malloc_async(gx.nbytes(), encoder));
  }
  // A copy of g that did not become gx is registered as a temporary with the
  // encoder.
  if (g_copied && !g_in_gx) {
    encoder.add_temporary(g);
  }

  int32_t axis_size = x.shape().back();
  int32_t n_rows = x.data_size() / axis_size;
  int64_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;

  // Allocate a temporary to store the gradients for w and allocate the output
  // gradient accumulators. Without a weight, gw_temp is just w; the kernels
  // only store through it when HAS_W is true.
  array gw_temp =
      (has_w) ? array({n_rows, x.shape().back()}, gw.dtype(), nullptr, {}) : w;
  if (has_w) {
    // Reuse the buffer of g when it is available and not already used by gx.
    if (!g_in_gx && donate_g) {
      gw_temp.copy_shared_buffer(g);
    } else {
      gw_temp.set_data(cu::malloc_async(gw_temp.nbytes(), encoder));
      encoder.add_temporary(gw_temp);
    }
  }

  encoder.set_input_array(x);
  encoder.set_input_array(w);
  encoder.set_input_array(g);
  encoder.set_output_array(gx);
  encoder.set_output_array(gw_temp);
  dispatch_float_types(gx.dtype(), "rms_norm_vjp", [&](auto type_tag) {
    dispatch_bool(has_w, [&](auto has_w_constant) {
      using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
      // Every thread reads 16 bytes at a time.
      constexpr int N_READS = 16 / sizeof(DataType);
      if (axis_size <= N_READS * 1024) {
        // Short rows: possibly several rows per block.
        dispatch_group_dim<N_READS>(
            axis_size,
            [&](auto group_dim, auto n_groups, auto groups_per_block) {
              constexpr int block_dim = group_dim() * n_groups();
              auto kernel = cu::rms_norm_vjp_small<
                  DataType,
                  has_w_constant.value,
                  block_dim,
                  group_dim(),
                  N_READS>;
              auto n_blocks =
                  (n_rows + groups_per_block() - 1) / groups_per_block();
              encoder.add_kernel_node(
                  kernel,
                  n_blocks,
                  {block_dim, groups_per_block()},
                  gpu_ptr<DataType>(x),
                  gpu_ptr<DataType>(w),
                  gpu_ptr<DataType>(g),
                  gpu_ptr<DataType>(gx),
                  gpu_ptr<DataType>(gw_temp),
                  eps_,
                  axis_size,
                  n_rows,
                  w_stride);
            });
      } else {
        // Long rows: one block of 1024 threads per row.
        auto kernel =
            cu::rms_norm_vjp<DataType, has_w_constant.value, 1024, N_READS>;
        encoder.add_kernel_node(
            kernel,
            n_rows,
            1024,
            gpu_ptr<DataType>(x),
            gpu_ptr<DataType>(w),
            gpu_ptr<DataType>(g),
            gpu_ptr<DataType>(gx),
            gpu_ptr<DataType>(gw_temp),
            eps_,
            axis_size,
            w_stride);
      }
    });
  });

  // Sum the per-row weight gradients over the rows (axis 0) into gw.
  if (has_w) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    col_reduce(encoder, gw_temp, gw, Reduce::ReduceType::Sum, {0}, plan);
  }
}

} // namespace fast

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/rope.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/fast_primitives.h"

#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

// Rotates one pair of elements for the single-position rope kernels.
//
//   in, out    input and output buffers, addressed with the same offsets.
//   offset     position used for the rotation angle (scaled by `scale`).
//   inv_freq   inverse frequency of this pair; the angle is
//              scale * offset * inv_freq.
//   stride     distance in elements between consecutive values of pos.y.
//   pos        (pair index, row index) handled by the calling thread.
//   dims       grid extents; dims.x is the number of pairs per row.
//
// With `traditional` the pair is made of adjacent elements (2 * pos.x and the
// one after it), otherwise of the elements pos.x and pos.x + dims.x. With
// `forward` the pair is rotated by the angle, otherwise by its negative.
//
// Offsets are computed in size_t: `stride` is 64 bit, so a narrower index
// would silently truncate for large inputs.
template <typename T, bool traditional, bool forward>
__device__ void rope_single_impl(
    const T* in,
    T* out,
    int32_t offset,
    float inv_freq,
    float scale,
    int64_t stride,
    uint2 pos,
    uint2 dims) {
  float L = scale * static_cast<float>(offset);

  // Compute costheta, sintheta
  float theta = L * inv_freq;
  float costheta = cos(theta);
  float sintheta = sin(theta);

  // Compute the input and output indices
  size_t index_1, index_2;
  if (traditional) {
    index_1 = 2 * pos.x + pos.y * stride;
    index_2 = index_1 + 1;
  } else {
    index_1 = pos.x + pos.y * stride;
    index_2 = index_1 + dims.x;
  }

  // Read and write the output
  float x1 = static_cast<float>(in[index_1]);
  float x2 = static_cast<float>(in[index_2]);
  float rx1;
  float rx2;
  if (forward) {
    rx1 = x1 * costheta - x2 * sintheta;
    rx2 = x1 * sintheta + x2 * costheta;
  } else {
    rx1 = x2 * sintheta + x1 * costheta;
    rx2 = x2 * costheta - x1 * sintheta;
  }
  out[index_1] = static_cast<T>(rx1);
  out[index_2] = static_cast<T>(rx2);
}

// Single-position rope with frequencies derived from `base`: the thread at
// (col, row) rotates one pair using inv_freq = exp2(-(col / dims.x) * base).
// The position is read from `*offset`.
template <typename T, bool traditional, bool forward>
__global__ void rope_single(
    const T* in,
    T* out,
    const int32_t* offset,
    float scale,
    float base,
    int64_t stride,
    uint2 dims) {
  const uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t row = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads outside of the grid extents have nothing to do.
  const bool in_bounds = col < dims.x && row < dims.y;
  if (!in_bounds) {
    return;
  }

  const float d = static_cast<float>(col) / static_cast<float>(dims.x);
  const float inv_freq = exp2(-d * base);
  rope_single_impl<T, traditional, forward>(
      in, out, *offset, inv_freq, scale, stride, make_uint2(col, row), dims);
}

// Single-position rope with explicit frequencies: the thread at (col, row)
// rotates one pair using the reciprocal of freqs[freq_stride * col]. The
// position is read from `*offset`.
template <typename T, bool traditional, bool forward>
__global__ void rope_single_freqs(
    const T* in,
    T* out,
    const int32_t* offset,
    const float* freqs,
    float scale,
    int64_t stride,
    uint2 dims,
    int64_t freq_stride) {
  const uint32_t col = blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t row = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads outside of the grid extents have nothing to do.
  const bool in_bounds = col < dims.x && row < dims.y;
  if (!in_bounds) {
    return;
  }

  const float inv_freq = 1.0 / freqs[freq_stride * col];
  rope_single_impl<T, traditional, forward>(
      in, out, *offset, inv_freq, scale, stride, make_uint2(col, row), dims);
}

// Shared body of the general (batched / multi-token) RoPE kernels.
//
// Thread coordinates:
//   pos.x  rotation pair index
//   pos.y  sequence position
//   pos.z  a group of up to N consecutive heads of one batch element
//
// The rotation angle depends only on the pair, the sequence position and the
// per-batch offset, so sine and cosine are computed once and reused for every
// head handled by the thread.
//
// Parameters:
//   in, out        source and destination buffers (the host may pass the same
//                  buffer for both)
//   offset         per-batch position offsets, indexed with `offset_stride`
//   inv_freq       inverse frequency for this rotation pair
//   scale          multiplier applied to the position
//   strides        element strides of `in`: [0] per matrix, [1] per sequence
//                  position, [2] per feature
//   out_strides    same layout, for `out`
//   n_head         number of matrices (heads) per batch element
//   dims           launch extents; dims.x is the number of rotation pairs
template <typename T, bool traditional, bool forward, int N = 4>
__device__ void rope_impl(
    const T* in,
    T* out,
    const int* offset,
    float inv_freq,
    float scale,
    const cuda::std::array<int64_t, 3> strides,
    const cuda::std::array<int64_t, 3> out_strides,
    int64_t offset_stride,
    int n_head,
    uint3 pos,
    uint3 dims) {
  // Heads are processed in groups of N, so round the head count up to a
  // multiple of N to decode pos.z into (batch, first head of the group).
  auto n_head_up = N * ((n_head + N - 1) / N);
  auto head_idx = static_cast<int>((pos.z * N) % n_head_up);
  auto batch_idx = (pos.z * N) / n_head_up;
  auto batch_offset = offset[batch_idx * offset_stride];
  // Add in a signed 64-bit type: `pos.y` is unsigned, so adding the signed
  // offset directly would wrap to a huge value whenever the sum is negative.
  float L = scale *
      static_cast<float>(static_cast<int64_t>(pos.y) + batch_offset);
  auto mat_idx = batch_idx * n_head + head_idx;

  // Compute costheta, sintheta
  float theta = L * inv_freq;
  float costheta = cos(theta);
  float sintheta = sin(theta);

  // Compute the input and output indices
  size_t in_index_1, in_index_2;
  size_t out_index_1, out_index_2;
  if (traditional) {
    // Traditional layout: the two elements of a pair are adjacent features.
    out_index_1 = 2 * pos.x * out_strides[2] + pos.y * out_strides[1] +
        mat_idx * out_strides[0];
    // Step by the feature stride, matching how the input index is formed.
    out_index_2 = out_index_1 + out_strides[2];
    in_index_1 =
        2 * pos.x * strides[2] + pos.y * strides[1] + mat_idx * strides[0];
    in_index_2 = in_index_1 + strides[2];
  } else {
    // Otherwise the two elements of a pair are dims.x features apart.
    out_index_1 = pos.x * out_strides[2] + pos.y * out_strides[1] +
        mat_idx * out_strides[0];
    out_index_2 = out_index_1 + dims.x * out_strides[2];
    in_index_1 = pos.x * strides[2] + pos.y * strides[1] + mat_idx * strides[0];
    in_index_2 = in_index_1 + dims.x * strides[2];
  }
  // Walk the heads of this group; the last group may be partial.
  for (int i = 0; i < N && head_idx + i < n_head; ++i) {
    // Read and write the output
    float x1 = static_cast<float>(in[in_index_1]);
    float x2 = static_cast<float>(in[in_index_2]);
    float rx1;
    float rx2;
    if (forward) {
      rx1 = x1 * costheta - x2 * sintheta;
      rx2 = x1 * sintheta + x2 * costheta;
    } else {
      // Rotation by -theta.
      rx1 = x2 * sintheta + x1 * costheta;
      rx2 = x2 * costheta - x1 * sintheta;
    }
    out[out_index_1] = static_cast<T>(rx1);
    out[out_index_2] = static_cast<T>(rx2);
    // Advance to the same element of the next head.
    in_index_1 += strides[0];
    in_index_2 += strides[0];
    out_index_1 += out_strides[0];
    out_index_2 += out_strides[0];
  }
}

// General RoPE kernel with frequencies derived from a base.
//
// The launch is a 3D grid whose extents are given by `dims`; threads outside
// those extents do nothing. `base` is consumed as an exponent scale for
// `exp2`, so the inverse frequency of pair `i` is `2^(-(i / dims.x) * base)`.
// All indexing and the rotation itself are delegated to `rope_impl`.
template <typename T, bool traditional, bool forward>
__global__ void rope(
    const T* in,
    T* out,
    const int32_t* offset,
    float scale,
    float base,
    const __grid_constant__ cuda::std::array<int64_t, 3> strides,
    const __grid_constant__ cuda::std::array<int64_t, 3> out_strides,
    int64_t offset_stride,
    int n_head,
    uint3 dims) {
  const unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
  const unsigned int iz = blockIdx.z * blockDim.z + threadIdx.z;

  const bool in_bounds = ix < dims.x && iy < dims.y && iz < dims.z;
  if (in_bounds) {
    // Fraction of the way through the rotated dimensions, in [0, 1).
    const float frac = static_cast<float>(ix) / static_cast<float>(dims.x);
    const float inv_freq = exp2(-frac * base);
    rope_impl<T, traditional, forward>(
        in,
        out,
        offset,
        inv_freq,
        scale,
        strides,
        out_strides,
        offset_stride,
        n_head,
        make_uint3(ix, iy, iz),
        dims);
  }
}

// General RoPE kernel that reads its frequencies from a table.
//
// The launch is a 3D grid whose extents are given by `dims`; threads outside
// those extents return immediately. The inverse frequency of pair `pos.x` is
// the reciprocal of `freqs[freq_stride * pos.x]`. `base` is accepted for
// signature parity with `rope` but is not used here. All indexing and the
// rotation itself are delegated to `rope_impl`.
template <typename T, bool traditional, bool forward>
__global__ void rope_freqs(
    const T* in,
    T* out,
    const int32_t* offset,
    const float* freqs,
    float scale,
    float base,
    const __grid_constant__ cuda::std::array<int64_t, 3> strides,
    const __grid_constant__ cuda::std::array<int64_t, 3> out_strides,
    int64_t offset_stride,
    int n_head,
    uint3 dims,
    int64_t freq_stride) {
  uint3 pos = make_uint3(
      blockIdx.x * blockDim.x + threadIdx.x,
      blockIdx.y * blockDim.y + threadIdx.y,
      blockIdx.z * blockDim.z + threadIdx.z);
  if (pos.x >= dims.x || pos.y >= dims.y || pos.z >= dims.z) {
    return;
  }

  // Float literal: keeps the reciprocal in single precision instead of
  // promoting the table entry to double for the divide.
  float inv_freq = 1.0f / freqs[freq_stride * pos.x];
  rope_impl<T, traditional, forward>(
      in,
      out,
      offset,
      inv_freq,
      scale,
      strides,
      out_strides,
      offset_stride,
      n_head,
      pos,
      dims);
}

} // namespace cu

namespace fast {

// Reports whether RoPE should take the fallback path for stream `s`: true
// exactly when the stream's device is the CPU.
bool RoPE::use_fallback(Stream s) {
  const bool on_cpu = (s.device == Device::cpu);
  return on_cpu;
}

void RoPE::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("RoPE::eval_gpu");

  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);
  auto& in = inputs[0];
  auto& offset = inputs[1];
  auto& out = outputs[0];

  cuda::std::array<int64_t, 3> strides;
  cuda::std::array<int64_t, 3> out_strides;
  bool donated = false;
  int ndim = in.ndim();

  int B = in.shape(0);
  int T = in.shape(-2);
  int D = in.shape(-1);
  size_t mat_size = T * D;
  int dispatch_ndim = ndim;
  while (in.shape(-dispatch_ndim) == 1 && dispatch_ndim > 3) {
    dispatch_ndim--;
  }

  int N = 1;
  for (int i = 1; i < (ndim - 2); ++i) {
    N *= in.shape(i);
  }

  // We apply rope to less that the whole vector so copy to output and then
  // apply in-place.
  if (dims_ < D) {
    donated = true;
    auto ctype =
        (in.flags().row_contiguous) ? CopyType::Vector : CopyType::General;
    copy_gpu(in, out, ctype, s);
    strides[0] = mat_size;
    strides[1] = out.strides()[ndim - 2];
    strides[2] = out.strides()[ndim - 1];
  }

  // Either copy or apply in-place
  else if (in.flags().row_contiguous) {
    if (in.is_donatable()) {
      donated = true;
      out.copy_shared_buffer(in);
    } else {
      out.set_data(cu::malloc_async(out.nbytes(), encoder));
    }
    strides[0] = mat_size;
    strides[1] = in.strides()[ndim - 2];
    strides[2] = in.strides()[ndim - 1];
  } else if (dispatch_ndim == 3) {
    // Handle non-contiguous 3D inputs
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
    strides[0] = in.strides()[ndim - 3];
    strides[1] = in.strides()[ndim - 2];
    strides[2] = in.strides()[ndim - 1];
  } else {
    // Copy non-contiguous > 3D inputs into the output and treat
    // input as donated
    donated = true;
    copy_gpu(in, out, CopyType::General, s);
    strides[0] = mat_size;
    strides[1] = out.strides()[ndim - 2];
    strides[2] = out.strides()[ndim - 1];
  }
  out_strides[0] = mat_size;
  out_strides[1] = out.strides()[ndim - 2];
  out_strides[2] = out.strides()[ndim - 1];

  // Some flags to help us dispatch below
  bool single = in.flags().row_contiguous && B == 1 && T == 1;
  bool with_freqs = inputs.size() == 3;

  encoder.set_input_array(donated ? out : in);
  encoder.set_input_array(offset);
  if (with_freqs) {
    encoder.set_input_array(inputs[2]);
  }
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "rope", [&](auto type_tag) {
    dispatch_bool(traditional_, [&](auto traditional) {
      dispatch_bool(forward_, [&](auto forward) {
        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
        if (single && !with_freqs) {
          auto kernel =
              cu::rope_single<DataType, traditional.value, forward.value>;
          uint2 dims = make_uint2(dims_ / 2, N);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
          encoder.add_kernel_node(
              kernel,
              grid,
              block,
              gpu_ptr<DataType>(donated ? out : in),
              gpu_ptr<DataType>(out),
              gpu_ptr<int32_t>(offset),
              scale_,
              std::log2(base_),
              mat_size,
              dims);
        } else if (single) {
          auto kernel =
              cu::rope_single_freqs<DataType, traditional.value, forward.value>;
          uint2 dims = make_uint2(dims_ / 2, N);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, 1);
          encoder.add_kernel_node(
              kernel,
              grid,
              block,
              gpu_ptr<DataType>(donated ? out : in),
              gpu_ptr<DataType>(out),
              gpu_ptr<int32_t>(offset),
              gpu_ptr<float>(inputs[2]),
              scale_,
              mat_size,
              dims,
              inputs[2].strides(0));
        } else if (with_freqs) {
          auto kernel =
              cu::rope_freqs<DataType, traditional.value, forward.value>;
          int n_per_thread = 4;
          uint32_t dimz = B * ((N + n_per_thread - 1) / n_per_thread);
          uint3 dims = make_uint3(dims_ / 2, T, dimz);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
          int64_t offset_stride = 0;
          if (inputs[1].ndim() > 0) {
            offset_stride = inputs[1].strides()[0];
          }
          encoder.add_kernel_node(
              kernel,
              grid,
              block,
              gpu_ptr<DataType>(donated ? out : in),
              gpu_ptr<DataType>(out),
              gpu_ptr<int32_t>(offset),
              gpu_ptr<float>(inputs[2]),
              scale_,
              std::log2(base_),
              strides,
              out_strides,
              offset_stride,
              N,
              dims,
              inputs[2].strides(0));
        } else {
          auto kernel = cu::rope<DataType, traditional.value, forward.value>;
          int n_per_thread = 4;
          uint32_t dimz = B * ((N + n_per_thread - 1) / n_per_thread);
          uint3 dims = make_uint3(dims_ / 2, T, dimz);
          auto [grid, block] = get_grid_and_block(dims.x, dims.y, dims.z);
          int64_t offset_stride = 0;
          if (inputs[1].ndim() > 0) {
            offset_stride = inputs[1].strides()[0];
          }
          encoder.add_kernel_node(
              kernel,
              grid,
              block,
              gpu_ptr<DataType>(donated ? out : in),
              gpu_ptr<DataType>(out),
              gpu_ptr<int32_t>(offset),
              scale_,
              std::log2(base_),
              strides,
              out_strides,
              offset_stride,
              N,
              dims);
        }
      });
    });
  });
}

} // namespace fast

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/scaled_dot_product_attention.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/cudnn_utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/lru_cache.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/fast_primitives.h"

#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace {

// Returns an array the SDPA kernels can consume directly.
//
// The kernels need the last dimension to have stride 1 and the data pointer
// to be aligned to at least 16. Inputs that already satisfy both are returned
// unchanged; anything else is replaced by a contiguous copy that is
// registered with the stream's command encoder as a temporary.
array prepare_sdpa_input(const array& x, Stream s) {
  if (x.strides(-1) == 1 && get_alignment(x) >= 16) {
    return x;
  }
  array compact = contiguous_copy_gpu(x, s);
  cu::get_command_encoder(s).add_temporary(compact);
  return compact;
}

// Returns `sinks` as a float32 array, which is what cuDNN requires.
//
// A float32 input is passed through untouched. Any other dtype is copied into
// a new float32 array with the same shape; that array is registered with the
// stream's command encoder as a temporary.
array prepare_sdpa_sinks(const array& sinks, Stream s) {
  const bool already_f32 = sinks.dtype() == float32;
  if (!already_f32) {
    array converted(sinks.shape(), float32, nullptr, {});
    copy_gpu(sinks, converted, CopyType::Vector, s);
    cu::get_command_encoder(s).add_temporary(converted);
    return converted;
  }
  return sinks;
}

// Allocates storage for `o` so that its axes are laid out in the same order
// as those of `q`.
//
// When `q` is row contiguous a plain allocation is enough. Otherwise the axes
// of `q` are ranked by stride (smallest first, non-positive strides counted
// as 1, ties kept in axis order) and `o` is given dense strides following
// that ranking, using `o`'s own shape.
void malloc_with_same_layout(
    cu::CommandEncoder& encoder,
    array& o,
    const array& q) {
  if (!q.flags().row_contiguous) {
    // Stride of an axis as used for ranking.
    auto rank_stride = [&q](int axis) {
      auto st = q.strides(axis);
      return st > 0 ? st : 1;
    };

    // axis_order = argsort(q.strides())
    Shape axis_order(q.ndim());
    std::iota(axis_order.begin(), axis_order.end(), 0);
    std::stable_sort(
        axis_order.begin(),
        axis_order.end(),
        [&rank_stride](int lhs, int rhs) {
          return rank_stride(lhs) < rank_stride(rhs);
        });

    // Assign dense strides to o, innermost axis first.
    Strides dense_strides(q.ndim());
    int64_t running = 1;
    for (int axis : axis_order) {
      dense_strides[axis] = running;
      running *= o.shape(axis);
    }

    // o is a transposed contiguous array
    o.set_data(
        cu::malloc_async(o.nbytes(), encoder),
        o.size(),
        dense_strides,
        {true, false, false});
    return;
  }
  o.set_data(cu::malloc_async(o.nbytes(), encoder));
}

// Decides whether the cuDNN SDPA path should be used for a decoding step.
//
// Returns true only when all of the following hold:
//   * the query has sequence length 1 (axis 2),
//   * there is no array mask,
//   * the key sequence is at least `kv_cache_step` long,
//   * k and v are both evaluated (so their strides can be inspected), and
//   * k and v both look like prefix slices of a dense kv cache whose full
//     sequence length is a multiple of `kv_cache_step`.
bool use_cudnn_for_decoding(
    const array& q,
    const array& k,
    const array& v,
    bool has_arr_mask) {
  if (q.shape(2) != 1) {
    return false;
  }
  if (has_arr_mask) {
    return false;
  }
  // The cuDNN SDPA is faster than vector kernel but for small sequence the
  // overhead would kill the advantage.
  constexpr int kv_cache_step = 256; // number is from mlx-lm
  if (k.shape(2) < kv_cache_step) {
    return false;
  }
  // When called during graph building the strides is not available, and we
  // rely on |supports_sdpa_vector| to decide whether to use fast sdpa since
  // we can fallback to |sdpa_vector|.
  if ((k.status() != array::evaluated) || (v.status() != array::evaluated)) {
    return false;
  }
  // Check if k/v are slices from fixed-size kv cache.
  auto is_slice = [](const array& kv) {
    // A zero sequence stride (broadcast) or an empty sequence axis cannot be
    // a slice of a dense cache, and would divide by zero below.
    if (kv.strides(2) == 0 || kv.shape(2) == 0) {
      return false;
    }
    // Get pre-sliced sequence length from strides, and check if the buffer
    // belongs to a contiguous kv cache.
    int64_t T_kv = kv.strides(1) / kv.strides(2);
    if (kv.size() / kv.shape(2) * T_kv != kv.buffer_size() / kv.itemsize()) {
      return false;
    }
    // It is possible to use heuristic to check slices, but for now just make
    // mlx-lm work.
    return T_kv % kv_cache_step == 0;
  };
  return is_slice(k) && is_slice(v);
}

// Recover the full kv cache from a slice of it, i.e. undo
// keys[..., :offset, :].
//
// The full sequence length is taken from the ratio of the strides of axes 1
// and 2. The result shares the buffer of `kv`, has contiguous strides for the
// widened shape, spans the whole buffer, and is positioned with the negated
// offset of `kv`.
array unslice_kv(const array& kv) {
  const int64_t full_seq_len = kv.strides(1) / kv.strides(2);

  Shape full_shape = kv.shape();
  full_shape[2] = full_seq_len;

  const auto buffer_elems = kv.buffer_size() / kv.itemsize();

  array full(full_shape, kv.dtype(), nullptr, {});
  full.copy_shared_buffer(
      kv,
      make_contiguous_strides(full_shape),
      {true, true, false},
      /* data_size */ buffer_elems,
      /* offset */ -kv.offset());
  return full;
}

// Number of dimensions recorded per tensor in the cache key.
constexpr int QKV_NDIM = 4;

// Plain-data description of an SDPA problem, used (wrapped in BytesKey) as the
// lookup key of the cuDNN graph caches. The fields are filled in by
// build_sdpa_cache_key.
struct SDPACacheKey {
  // CUDA device the graph is built for.
  int device_id;
  // cuDNN data type derived from the dtype of q.
  fe::DataType_t cudnn_dtype;
  // Shapes of q, k and v.
  std::array<int, QKV_NDIM> q_shape;
  std::array<int, QKV_NDIM> k_shape;
  std::array<int, QKV_NDIM> v_shape;
  // Strides of q, k and v.
  std::array<int64_t, QKV_NDIM> q_strides;
  std::array<int64_t, QKV_NDIM> k_strides;
  std::array<int64_t, QKV_NDIM> v_strides;
  // Whether a causal mask is applied.
  bool do_causal;
  // Shape and strides of the array mask; only written when a mask is present.
  std::array<int, QKV_NDIM> mask_shape;
  std::array<int64_t, QKV_NDIM> mask_strides;
  // Whether attention sinks are supplied.
  bool has_sinks;
  // Whether the graph also outputs the logsumexp statistics.
  bool output_logsumexp;
};

// Builds the cache key identifying the cuDNN graph for an SDPA call.
//
// The key records the device, the dtype of q, the shapes and strides of q, k
// and v, the causal / sinks / logsumexp flags and, when present, the shape
// and strides of the array mask.
//
// When `decoding` is set, the k/v sequence length stored in the key is
// replaced by the length implied by k's strides (strides(1) / strides(2)),
// and the recorded k/v strides are zeroed.
inline BytesKey<SDPACacheKey> build_sdpa_cache_key(
    cu::CommandEncoder& encoder,
    const array& q,
    const array& k,
    const array& v,
    bool do_causal,
    const std::optional<array>& mask_arr,
    const std::optional<array>& sinks,
    bool decoding = false,
    bool output_logsumexp = false) {
  BytesKey<SDPACacheKey> cache_key;
  auto& key = cache_key.pod;

  // Device and element type.
  key.device_id = encoder.device().cuda_device();
  key.cudnn_dtype = dtype_to_cudnn_type(q.dtype());

  // Geometry of the three main operands.
  key.q_shape = vector_key<QKV_NDIM>(q.shape());
  key.q_strides = vector_key<QKV_NDIM>(q.strides());
  key.k_shape = vector_key<QKV_NDIM>(k.shape());
  key.k_strides = vector_key<QKV_NDIM>(k.strides());
  key.v_shape = vector_key<QKV_NDIM>(v.shape());
  key.v_strides = vector_key<QKV_NDIM>(v.strides());

  // Feature flags.
  key.do_causal = do_causal;
  key.has_sinks = sinks.has_value();
  key.output_logsumexp = output_logsumexp;

  // Optional array mask.
  if (mask_arr.has_value()) {
    key.mask_shape = vector_key<QKV_NDIM>(mask_arr->shape());
    key.mask_strides = vector_key<QKV_NDIM>(mask_arr->strides());
  }

  // Decoding: key on the full sequence length rather than the slice.
  if (decoding) {
    const int64_t full_seq_len = k.strides(1) / k.strides(2);
    key.k_shape[2] = full_seq_len;
    key.v_shape[2] = full_seq_len;
    key.k_strides.fill(0);
    key.v_strides.fill(0);
  }
  return cache_key;
}

auto& sdpa_cache() {
  static LRUBytesKeyCache<SDPACacheKey, DnnGraph> cache(
      "MLX_CUDA_SDPA_CACHE_SIZE", /* default_capacity */ 256);
  return cache;
}

auto& sdpa_backward_cache() {
  static LRUBytesKeyCache<SDPACacheKey, DnnGraph> cache(
      "MLX_CUDA_SDPA_BACKWARD_CACHE_SIZE", /* default_capacity */ 64);
  return cache;
}

// Identifiers attached to the tensors of the SDPA graphs. The same values are
// used when declaring tensors while building a graph and as keys of the
// variant pack that binds device pointers at execution time.
enum UIDS {
  // Operands shared by the forward and backward graphs.
  Q,
  K,
  V,
  // Attention scale, passed as a scalar.
  SCALE,
  // Array mask, supplied to the graph as a bias.
  BIAS,
  // Attention sinks.
  SINKS,
  // Per-batch sequence lengths used with the padding mask.
  SEQ_LEN_Q,
  SEQ_LEN_KV,
  // Attention output.
  O,
  // Statistics output of the forward graph, input of the backward graph.
  STATS,
  // Backward graph:
  D_Q,
  D_K,
  D_V,
  D_O,
};

// Builds the cuDNN graph for the forward SDPA.
//
// q, k, v and o describe the operand layouts. The optional arguments enable
// the corresponding features: `mask_arr` is attached as a bias, `sinks` as a
// sink token, and `seq_len_q` / `seq_len_kv` (both required) turn on the
// padding mask. When `output_logsumexp` is set, `stats` must hold a value and
// is registered as a second output.
DnnGraph build_sdpa_graph(
    cudnnHandle_t handle,
    const array& q,
    const array& k,
    const array& v,
    bool do_causal,
    const std::optional<array>& mask_arr,
    const std::optional<array>& sinks,
    const std::optional<array>& seq_len_q,
    const std::optional<array>& seq_len_kv,
    bool output_logsumexp,
    const array& o,
    const std::optional<array>& stats) {
  DnnGraph graph(handle, q.dtype());

  // Declare the input tensors under their UIDs.
  auto q_ = graph.tensor("Q", Q, q);
  auto k_ = graph.tensor("K", K, k);
  auto v_ = graph.tensor("V", V, v);

  // The scale is a float32 scalar supplied at execution time.
  auto options = fe::graph::SDPA_attributes()
                     .set_name("sdpa_cudnn")
                     .set_attn_scale(graph.scalar("Scale", SCALE, float32))
                     .set_generate_stats(output_logsumexp);
  if (do_causal) {
    options.set_causal_mask_bottom_right(do_causal);
  }
  if (mask_arr) {
    options.set_bias(graph.tensor("BIAS", BIAS, *mask_arr));
  }
  if (sinks) {
    options.set_sink_token(graph.tensor_4d("SINKS", SINKS, *sinks, 1));
  }
  // The padding mask needs both sequence-length tensors.
  if (seq_len_q && seq_len_kv) {
    options.set_padding_mask(true);
    options.set_seq_len_q(graph.tensor("SEQ_LEN_Q", SEQ_LEN_Q, *seq_len_q));
    options.set_seq_len_kv(graph.tensor("SEQ_LEN_KV", SEQ_LEN_KV, *seq_len_kv));
  }

  // Add the SDPA node and mark its results as graph outputs.
  auto [o_, stats_] = graph.sdpa(q_, k_, v_, options);
  graph.tensor(o_, O, o)->set_output(true);
  if (output_logsumexp) {
    graph.tensor(stats_, STATS, *stats)->set_output(true);
  }

  // Prepare, restrict to engines that support the native CUDA graph API,
  // then build.
  CHECK_CUDNN_FE_ERROR(graph.prepare());
  graph.select_behavior_notes(
      {fe::BehaviorNote_t::SUPPORTS_CUDA_GRAPH_NATIVE_API});
  CHECK_CUDNN_FE_ERROR(graph.build());
  return graph;
}

// Builds the cuDNN graph for the backward SDPA.
//
// Inputs of the graph are q, k, v, the forward output `o`, its cotangent
// `d_o` and the forward statistics `stats`; outputs are `d_q`, `d_k` and
// `d_v`. `mask_arr` (attached as a bias) and `sinks` (attached as a sink
// token) are optional.
DnnGraph build_sdpa_backward_graph(
    cudnnHandle_t handle,
    const array& q,
    const array& k,
    const array& v,
    bool do_causal,
    const std::optional<array>& mask_arr,
    const std::optional<array>& sinks,
    const array& o,
    const array& d_o,
    const array& stats,
    array& d_q,
    array& d_k,
    array& d_v) {
  DnnGraph graph(handle, q.dtype());

  // Declare the input tensors under their UIDs.
  auto q_ = graph.tensor("Q", Q, q);
  auto k_ = graph.tensor("K", K, k);
  auto v_ = graph.tensor("V", V, v);
  auto o_ = graph.tensor("O", O, o);
  auto d_o_ = graph.tensor("D_O", D_O, d_o);
  auto stats_ = graph.tensor("STATS", STATS, stats);

  // The scale is a float32 scalar supplied at execution time.
  auto options = fe::graph::SDPA_backward_attributes()
                     .set_name("sdpa_backward_cudnn")
                     .set_attn_scale(graph.scalar("Scale", SCALE, float32));
  if (do_causal) {
    options.set_causal_mask_bottom_right(do_causal);
  }
  if (mask_arr) {
    options.set_bias(graph.tensor("BIAS", BIAS, *mask_arr));
  }
  if (sinks) {
    options.set_sink_token(graph.tensor_4d("SINKS", SINKS, *sinks, 1));
  }

  // Add the backward node and mark the three gradients as graph outputs.
  auto [d_q_, d_k_, d_v_] =
      graph.sdpa_backward(q_, k_, v_, o_, d_o_, stats_, options);
  graph.tensor(d_q_, D_Q, d_q)->set_output(true);
  graph.tensor(d_k_, D_K, d_k)->set_output(true);
  graph.tensor(d_v_, D_V, d_v)->set_output(true);

  // Prepare, restrict to engines that support the native CUDA graph API,
  // then build.
  CHECK_CUDNN_FE_ERROR(graph.prepare());
  graph.select_behavior_notes(
      {fe::BehaviorNote_t::SUPPORTS_CUDA_GRAPH_NATIVE_API});
  CHECK_CUDNN_FE_ERROR(graph.build());
  return graph;
}

} // namespace

// Reports whether the cuDNN SDPA path can handle the given problem.
//
// The checks, in order:
//   1. the path is not disabled through MLX_CUDA_USE_CUDNN_SDPA (read once),
//   2. the device has compute capability major version 8 or higher,
//   3. for a query of sequence length 1, use_cudnn_for_decoding agrees,
//   4. with a causal mask, the query is not longer than the keys,
//   5. the head dimensions of q and v are multiples of 8 and at most 128,
//   6. the dtype of q is float16 or bfloat16.
bool supports_sdpa_cudnn(
    const array& q,
    const array& k,
    const array& v,
    bool has_arr_mask,
    bool do_causal,
    Stream s) {
  static bool enabled = env::get_var("MLX_CUDA_USE_CUDNN_SDPA", 1);
  if (!enabled) {
    return false;
  }

  // cuDNN SDPA requires Ampere and later.
  const auto cc_major = cu::device(s.device).compute_capability_major();
  if (cc_major < 8) {
    return false;
  }

  // Only use cuDNN for decoding when k/v are slices from fixed-size kv cache.
  const bool single_query = q.shape(2) == 1;
  if (single_query && !use_cudnn_for_decoding(q, k, v, has_arr_mask)) {
    return false;
  }

  // cuDNN does not support bottom right mask when T_q > T_kv.
  if (do_causal && (q.shape(2) > k.shape(2))) {
    return false;
  }

  // D_qk and D_v must be a multiple of 8 with maximum value 128.
  auto head_dim_ok = [](auto head_dim) {
    return head_dim % 8 == 0 && head_dim <= 128;
  };
  if (!head_dim_ok(q.shape(-1)) || !head_dim_ok(v.shape(-1))) {
    return false;
  }

  // Only half precision types are supported.
  Dtype dtype = q.dtype();
  if (dtype == float16) {
    return true;
  }
  return dtype == bfloat16;
}

// Runs the forward SDPA through cuDNN.
//
// Allocates `o` (and `stats` when `output_logsumexp` is set), registers all
// operands with the command encoder, fetches or builds the matching cuDNN
// graph and encodes it with the device pointers of the operands.
//
// `k` and `v` are taken by value because, for decoding, they are replaced by
// views over the full kv cache.
void sdpa_cudnn(
    const array& q,
    array k,
    array v,
    float scale,
    array& o,
    std::optional<array>& stats,
    bool do_causal,
    const std::optional<array>& mask_arr,
    const std::optional<array>& sinks,
    bool output_logsumexp,
    Stream s) {
  auto& encoder = cu::get_command_encoder(s);
  auto handle = encoder.device().get_cudnn_handle();

  // Give the output the same axis ordering as the query.
  malloc_with_same_layout(encoder, o, q);

  // For decoding, unslice k/v and apply padding mask.
  std::optional<array> seq_len_q;
  std::optional<array> seq_len_kv;
  bool decoding = use_cudnn_for_decoding(q, k, v, mask_arr.has_value());
  if (decoding) {
    int B = q.shape(0);
    // Every batch element uses the same lengths: the query length and the
    // (sliced) key length.
    std::vector<int> seq_len_q_vec(B, q.shape(2));
    std::vector<int> seq_len_kv_vec(B, k.shape(2));
    seq_len_q = array(seq_len_q_vec.begin(), {B, 1, 1, 1});
    seq_len_kv = array(seq_len_kv_vec.begin(), {B, 1, 1, 1});
    encoder.add_temporary(*seq_len_q);
    encoder.add_temporary(*seq_len_kv);
    // From here on k and v refer to the whole cache.
    k = unslice_kv(k);
    v = unslice_kv(v);
    encoder.add_temporary(k);
    encoder.add_temporary(v);
  }

  // Register inputs and outputs with the encoder.
  encoder.set_input_array(q);
  encoder.set_input_array(k);
  encoder.set_input_array(v);
  encoder.set_output_array(o);
  if (mask_arr) {
    encoder.set_input_array(*mask_arr);
  }
  if (sinks) {
    encoder.set_input_array(*sinks);
  }
  if (seq_len_q && seq_len_kv) {
    encoder.set_input_array(*seq_len_q);
    encoder.set_input_array(*seq_len_kv);
  }
  if (output_logsumexp) {
    stats->set_data(cu::malloc_async(stats->nbytes(), encoder));
    encoder.set_output_array(*stats);
  }

  // Search cache.
  auto cache_key = build_sdpa_cache_key(
      encoder, q, k, v, do_causal, mask_arr, sinks, decoding, output_logsumexp);
  auto it = sdpa_cache().find(cache_key);
  if (it == sdpa_cache().end()) {
    // Cache miss: build the graph and remember it.
    auto graph = build_sdpa_graph(
        handle,
        q,
        k,
        v,
        do_causal,
        mask_arr,
        sinks,
        seq_len_q,
        seq_len_kv,
        output_logsumexp,
        o,
        stats);
    it = sdpa_cache().emplace(cache_key, std::move(graph)).first;
  }
  auto& graph = it->second;

  // Bind each tensor UID to its pointer. The scale is passed by address of
  // the local parameter.
  std::unordered_map<int64_t, void*> variant_pack{
      {Q, gpu_ptr<void>(q)},
      {K, gpu_ptr<void>(k)},
      {V, gpu_ptr<void>(v)},
      {SCALE, &scale},
      {O, gpu_ptr<void>(o)}};
  if (mask_arr) {
    variant_pack[BIAS] = gpu_ptr<void>(*mask_arr);
  }
  if (sinks) {
    variant_pack[SINKS] = gpu_ptr<void>(*sinks);
  }
  if (seq_len_q && seq_len_kv) {
    variant_pack[SEQ_LEN_Q] = gpu_ptr<void>(*seq_len_q);
    variant_pack[SEQ_LEN_KV] = gpu_ptr<void>(*seq_len_kv);
  }
  if (output_logsumexp) {
    variant_pack[STATS] = gpu_ptr<void>(*stats);
  }

  CHECK_CUDNN_FE_ERROR(graph.encode_graph(encoder, std::move(variant_pack)));
}

// Runs the backward SDPA through cuDNN.
//
// Allocates the gradients `d_q`, `d_k` and `d_v` with the same axis ordering
// as q, k and v, registers all operands with the command encoder, fetches or
// builds the matching cuDNN backward graph and encodes it with the device
// pointers of the operands.
void sdpa_backward_cudnn(
    const array& q,
    const array& k,
    const array& v,
    float scale,
    const array& o,
    const array& stats,
    bool do_causal,
    const std::optional<array>& mask_arr,
    const std::optional<array>& sinks,
    const array& d_o,
    array& d_q,
    array& d_k,
    array& d_v,
    Stream s) {
  auto& encoder = cu::get_command_encoder(s);
  auto handle = encoder.device().get_cudnn_handle();

  // Each gradient follows the layout of the tensor it belongs to.
  malloc_with_same_layout(encoder, d_q, q);
  malloc_with_same_layout(encoder, d_k, k);
  malloc_with_same_layout(encoder, d_v, v);

  // Register inputs and outputs with the encoder.
  encoder.set_input_array(q);
  encoder.set_input_array(k);
  encoder.set_input_array(v);
  encoder.set_input_array(o);
  encoder.set_input_array(stats);
  encoder.set_input_array(d_o);
  encoder.set_output_array(d_q);
  encoder.set_output_array(d_k);
  encoder.set_output_array(d_v);
  if (mask_arr) {
    encoder.set_input_array(*mask_arr);
  }
  if (sinks) {
    encoder.set_input_array(*sinks);
  }

  // Search cache.
  auto cache_key =
      build_sdpa_cache_key(encoder, q, k, v, do_causal, mask_arr, sinks);
  auto it = sdpa_backward_cache().find(cache_key);
  if (it == sdpa_backward_cache().end()) {
    // Cache miss: build the graph and remember it.
    auto graph = build_sdpa_backward_graph(
        handle,
        q,
        k,
        v,
        do_causal,
        mask_arr,
        sinks,
        o,
        d_o,
        stats,
        d_q,
        d_k,
        d_v);
    it = sdpa_backward_cache().emplace(cache_key, std::move(graph)).first;
  }
  auto& graph = it->second;

  // Bind each tensor UID to its pointer. The scale is passed by address of
  // the local parameter.
  std::unordered_map<int64_t, void*> variant_pack{
      {Q, gpu_ptr<void>(q)},
      {K, gpu_ptr<void>(k)},
      {V, gpu_ptr<void>(v)},
      {SCALE, &scale},
      {O, gpu_ptr<void>(o)},
      {STATS, gpu_ptr<void>(stats)},
      {D_O, gpu_ptr<void>(d_o)},
      {D_Q, gpu_ptr<void>(d_q)},
      {D_K, gpu_ptr<void>(d_k)},
      {D_V, gpu_ptr<void>(d_v)}};
  if (mask_arr) {
    variant_pack[BIAS] = gpu_ptr<void>(*mask_arr);
  }
  if (sinks) {
    variant_pack[SINKS] = gpu_ptr<void>(*sinks);
  }

  CHECK_CUDNN_FE_ERROR(graph.encode_graph(encoder, std::move(variant_pack)));
}

// Defined in scaled_dot_product_attention.cu file.

// Reports whether the vector SDPA kernels can handle the given problem.
bool supports_sdpa_vector(
    const array& q,
    const array& k,
    const array& v,
    bool has_arr_mask,
    bool output_logsumexp);
// Runs SDPA with the vector kernels, writing the result to `o`. Unlike the
// cuDNN entry point it takes neither an array mask nor a statistics output.
void sdpa_vector(
    const array& q,
    const array& k,
    const array& v,
    float scale,
    array& o,
    bool do_causal,
    const std::optional<array>& sinks,
    Stream s);

namespace fast {

// Reports whether SDPA must take the fallback path.
//
// The fallback is used on the CPU, and on the GPU when neither the cuDNN path
// nor the vector kernels support the problem. `has_mask` and `is_training`
// are part of the interface but do not influence the decision here.
bool ScaledDotProductAttention::use_fallback(
    const array& q,
    const array& k,
    const array& v,
    bool has_mask,
    bool has_arr_mask,
    bool do_causal,
    bool is_training,
    bool output_logsumexp,
    Stream s) {
  if (s.device == Device::cpu) {
    return true;
  }

  // cuDNN is tried first; the vector kernels are only consulted if it
  // declines.
  if (supports_sdpa_cudnn(q, k, v, has_arr_mask, do_causal, s)) {
    return false;
  }
  const bool vector_ok =
      supports_sdpa_vector(q, k, v, has_arr_mask, output_logsumexp);
  return !vector_ok;
}

// Boolean masks are not supported by this backend.
bool ScaledDotProductAttention::supports_bool_mask() {
  constexpr bool bool_mask_supported = false;
  return bool_mask_supported;
}

// Evaluate SDPA on the GPU.
//
// inputs holds q, k, v, then an optional mask and, when `has_sinks_` is set,
// the sinks as the last element. outputs[0] receives the attention output and
// outputs[1] the logsumexp statistics when `output_logsumexp_` is set.
//
// The cuDNN implementation is used when it supports the problem; otherwise
// the vector kernels are used.
void ScaledDotProductAttention::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("ScaledDotProductAttention::eval_gpu");

  auto& s = stream();

  // Make the main operands acceptable to the kernels.
  array q = prepare_sdpa_input(inputs[0], s);
  array k = prepare_sdpa_input(inputs[1], s);
  array v = prepare_sdpa_input(inputs[2], s);
  array& out = outputs[0];

  // A mask input only counts as an array mask when no causal mask is used.
  const bool has_mask = inputs.size() - has_sinks_ > 3;
  const bool has_arr_mask = has_mask && !do_causal_;

  // Gather the optional operands.
  std::optional<array> mask_arr;
  if (has_arr_mask) {
    mask_arr = prepare_sdpa_input(inputs[3], s);
  }
  std::optional<array> sinks;
  if (has_sinks_) {
    sinks = inputs.back();
  }
  std::optional<array> stats;
  if (output_logsumexp_) {
    stats = outputs[1];
  }

  // Vector kernels: used whenever cuDNN cannot take the problem.
  if (!supports_sdpa_cudnn(q, k, v, has_arr_mask, do_causal_, s)) {
    sdpa_vector(q, k, v, scale_, out, do_causal_, sinks, s);
    return;
  }

  // cuDNN path: sinks have to be converted first.
  if (sinks) {
    sinks = prepare_sdpa_sinks(*sinks, s);
  }
  sdpa_cudnn(
      q,
      k,
      v,
      scale_,
      out,
      stats,
      do_causal_,
      mask_arr,
      sinks,
      output_logsumexp_,
      s);
}

// Reports whether the SDPA VJP must take the fallback path: either the query
// sequence length (axis 2) is not a multiple of 128, or the stream runs on
// the CPU.
bool ScaledDotProductAttentionVJP::use_fallback(const array& q, Stream s) {
  // The frontend adds a padding mask when sequence length is not a multiple of
  // tile size.
  const bool needs_padding = q.shape(2) % 128 != 0;
  return needs_padding || s.device == Device::cpu;
}

void ScaledDotProductAttentionVJP::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  nvtx3::scoped_range r("ScaledDotProductAttentionVJP::eval_gpu");

  auto& s = stream();

  assert(inputs.size() >= 6);
  int primals_size = inputs.size() - 3;
  bool has_arr_mask = primals_size > 3 + has_sinks_;

  array q = prepare_sdpa_input(inputs[0], s);
  array k = prepare_sdpa_input(inputs[1], s);
  array v = prepare_sdpa_input(inputs[2], s);
  array o = prepare_sdpa_input(inputs[primals_size], s);
  array stats = prepare_sdpa_input(inputs[primals_size + 1], s);
  array d_o = prepare_sdpa_input(inputs[primals_size + 2], s);

  std::optional<array> mask_arr;
  if (has_arr_mask) {
    mask_arr = prepare_sdpa_input(inputs[3], s);
  }
  std::optional<array> sinks;
  if (has_sinks_) {
    sinks = prepare_sdpa_sinks(inputs.back(), s);
  }

  assert(outputs.size() == 3);
  auto& d_q = outputs[0];
  auto& d_k = outputs[1];
  auto& d_v = outputs[2];

  sdpa_backward_cudnn(
      q,
      k,
      v,
      scale_,
      o,
      stats,
      do_causal_,
      mask_arr,
      sinks,
      d_o,
      d_q,
      d_k,
      d_v,
      s);
}

} // namespace fast

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/scaled_dot_product_attention.cu
================================================
// Copyright © 2025 Apple Inc.

// Required for using M_LOG2E in MSVC.
#define _USE_MATH_DEFINES

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/config.h"
#include "mlx/backend/cuda/device/utils.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Shorthand written on its own line directly before a loop to request
// unrolling.
// NOTE(review): this depends on the toolchain honouring a `#pragma` that is
// produced by macro expansion; confirm before reusing it with another
// compiler.
#define PRAGMA_LOOP_UNROLL #pragma unroll

// Parameters passed by value to the vector SDPA kernels.
struct AttnParams {
  // Batch size — presumably; not read by the kernel code visible here.
  int B;
  // Number of query heads.
  int H;
  // Head dimension — presumably; the kernels visible here use their `D`
  // template parameter instead.
  int D;

  // Query and key sequence lengths.
  int qL;
  int kL;

  // Number of query heads that share one key/value head.
  int gqa_factor;
  // Attention scale applied to the query.
  float scale;

  // Element strides of each tensor: [0] batch, [1] head, [2] sequence.
  int64_t Q_strides[3];
  int64_t K_strides[3];
  int64_t V_strides[3];
  int64_t O_strides[3];
};

// Single-pass vector SDPA kernel.
//
// One thread block computes the attention output for one
// (batch = blockIdx.z, head = blockIdx.x, query position = blockIdx.y).
// Each warp walks the keys kv_seq_idx, kv_seq_idx + BN, ... and keeps a
// running maximum, a running sum of exponentials and a running weighted sum
// of values; the lanes of a warp split the head dimension, each owning
// `v_per_thread` consecutive elements. The per-warp partial results are then
// combined through shared memory.
//
// Scores are kept in the base-2 domain: the query is pre-multiplied by
// scale * log2(e) and exponentials are evaluated with exp2f.
//
// NOTE(review): the final combination indexes the per-warp arrays by lane and
// the output by warp, so the block presumably has to be launched with BN
// warps of 32 threads — confirm against the launch code.
template <typename T, bool do_causal, int D>
__global__ void kernel_sdpav_1pass(
    const T* Q,
    const T* K,
    const T* V,
    T* O,
    const T* sinks,
    __grid_constant__ const AttnParams params) {
  // BN: number of keys processed per step (one per warp).
  // BD: number of lanes sharing the head dimension.
  constexpr int BN = 32;
  constexpr int BD = 32;

  constexpr int v_per_thread = D / BD;

  // Distance between the keys/values a warp visits in consecutive steps.
  const int inner_k_stride = BN * int(params.K_strides[2]);
  const int inner_v_stride = BN * int(params.V_strides[2]);

  // Accumulation type.
  typedef float U;

  U q[v_per_thread];
  U k[v_per_thread];
  U o[v_per_thread];

  // Staging area for the cross-warp combination. The extra column is
  // presumably padding against shared memory bank conflicts.
  __shared__ U outputs[BN][BD + 1];
  __shared__ U max_scores[BN];
  __shared__ U sum_exp_scores[BN];

  const U scale_log2 = params.scale * M_LOG2E;

  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<32>(block);

  const int lane_idx = warp.thread_rank();
  const int warp_idx = warp.meta_group_rank();

  // Adjust to thread block and thread
  const int batch_idx = blockIdx.z;
  const int head_idx = blockIdx.x;
  // Several query heads share one key/value head.
  const int kv_head_idx = head_idx / params.gqa_factor;

  const int q_seq_idx = blockIdx.y;
  const int kv_seq_idx = warp_idx;

  Q += batch_idx * params.Q_strides[0] + // Batch
      head_idx * params.Q_strides[1] + // Head
      q_seq_idx * params.Q_strides[2]; // Sequence

  K += batch_idx * params.K_strides[0] + // Batch
      kv_head_idx * params.K_strides[1] + // Head
      kv_seq_idx * params.K_strides[2]; // Sequence

  V += batch_idx * params.V_strides[0] + // Batch
      kv_head_idx * params.V_strides[1] + // Head
      kv_seq_idx * params.V_strides[2]; // Sequence

  O += batch_idx * params.O_strides[0] + // Batch
      head_idx * params.O_strides[1] + // Head
      q_seq_idx * params.O_strides[2]; // Sequence

  // Read the query and 0 the output accumulator
  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    q[i] = scale_log2 * static_cast<U>(Q[v_per_thread * lane_idx + i]);
  }

  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    o[i] = 0.f;
  }

  U max_score = Limits<U>::finite_min();
  U sum_exp_score = 0.f;
  // With sinks, warp 0 seeds its running maximum with the sink value of this
  // head (converted to the base-2 domain) and its running sum with 1.
  if (sinks && warp_idx == 0) {
    max_score = M_LOG2E * static_cast<U>(sinks[head_idx]);
    sum_exp_score = 1.f;
  }

  // For each key
  for (int i = kv_seq_idx; i < params.kL; i += BN) {
    bool use_key = true;
    if constexpr (do_causal) {
      // Causal mask aligned to the end of the key sequence.
      use_key = i <= (params.kL - params.qL + q_seq_idx);
    }

    if (use_key) {
      // Read the key
      PRAGMA_LOOP_UNROLL
      for (int j = 0; j < v_per_thread; j++) {
        k[j] = K[v_per_thread * lane_idx + j];
      }

      // Compute the i-th score
      U score = 0.f;
      PRAGMA_LOOP_UNROLL
      for (int j = 0; j < v_per_thread; j++) {
        score += q[j] * k[j];
      }

      // Warp sum
      score = cg::reduce(warp, score, cg::plus<U>());

      // Update the accumulators
      // `factor` rescales what was accumulated under the previous maximum.
      U new_max = max(max_score, score);
      U factor = exp2f(max_score - new_max);
      U exp_score = exp2f(score - new_max);

      max_score = new_max;
      sum_exp_score = sum_exp_score * factor + exp_score;

      // Update the output accumulator
      PRAGMA_LOOP_UNROLL
      for (int j = 0; j < v_per_thread; j++) {
        o[j] = o[j] * factor +
            exp_score * static_cast<U>(V[v_per_thread * lane_idx + j]);
      }
    }

    // Move the pointers to the next kv
    K += inner_k_stride;
    V += inner_v_stride;
  }

  // Publish the per-warp statistics.
  if (lane_idx == 0) {
    max_scores[warp_idx] = max_score;
    sum_exp_scores[warp_idx] = sum_exp_score;
  }
  block.sync();

  // Each lane now picks up the statistics of the warp with the same index.
  // `factor` rescales that warp's partial results to the overall maximum.
  max_score = max_scores[lane_idx];
  U new_max = cg::reduce(warp, max_score, cg::greater<U>());
  U factor = exp2f(max_score - new_max);
  sum_exp_score =
      cg::reduce(warp, sum_exp_scores[lane_idx] * factor, cg::plus<U>());
  // Store the reciprocal of the total, leaving it at 0 when the total is 0.
  sum_exp_score = sum_exp_score == 0 ? 0 : __frcp_rn(sum_exp_score);

  // Now we need to aggregate all the outputs
  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    // Write transposed, so that after the sync each warp reads one output
    // element's partial sums from all warps.
    outputs[lane_idx][warp_idx] = o[i];
    block.sync();
    U ot = outputs[warp_idx][lane_idx] * factor;
    o[i] = cg::reduce(warp, ot, cg::plus<U>()) * sum_exp_score;
    block.sync();
  }

  // And write the output
  if (lane_idx == 0) {
    PRAGMA_LOOP_UNROLL
    for (int i = 0; i < v_per_thread; i++) {
      O[v_per_thread * warp_idx + i] = static_cast<T>(o[i]);
    }
  }
}

// First pass of the two-pass vector attention kernel.
//
// blockIdx.x selects the query head, blockIdx.y the query position and
// blockIdx.z encodes both the batch index and one of `blocks` key/value
// partitions. Inside a thread block, warp w of partition p visits keys
// p * BN + w, p * BN + w + blocks * BN, ... and keeps a running
// (max, sum-of-exp, output) triple using base-2 exponentials. The BN per-warp
// triples are then merged and the un-normalized partial output, its exp-sum
// and its max are written to `partials`, `sums` and `maxs` at the slot of this
// (batch, head, query, partition) so that a later pass can combine the
// partitions.
//
// Assumes blockDim.x == BN * 32 (the shared arrays are indexed by warp_idx
// up to BN) and that D is a multiple of BD — TODO confirm against callers.
template <typename T, bool do_causal, int D>
__global__ void kernel_sdpav_2pass_1(
    const T* Q,
    const T* K,
    const T* V,
    const T* sinks,
    float* partials,
    float* sums,
    float* maxs,
    __grid_constant__ const AttnParams params) {
  constexpr int BN = 8;
  constexpr int BD = 32;
  constexpr int blocks = 32;

  // Number of head-dim elements each lane of a warp is responsible for.
  constexpr int v_per_thread = D / BD;

  // Distance (in elements) between two consecutive keys/values visited by the
  // same warp: all `blocks * BN` warps of a query advance together.
  const int inner_k_stride = blocks * BN * int(params.K_strides[2]);
  const int inner_v_stride = blocks * BN * int(params.V_strides[2]);

  typedef float U;

  U q[v_per_thread];
  U k[v_per_thread];
  U o[v_per_thread];

  // Scratch used to merge the BN per-warp results of this thread block.
  __shared__ U outputs[BN][BD + 1];
  __shared__ U max_scores[BN];
  __shared__ U sum_exp_scores[BN];

  // Fold log2(e) into the scale so that exp2f can be used below.
  const U scale_log2 = params.scale * 1.44269504089f;

  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<32>(block);

  const int lane_idx = warp.thread_rank();
  const int warp_idx = warp.meta_group_rank();

  // Adjust to thread block and thread
  const int batch_idx = blockIdx.z / blocks;
  const int block_idx = blockIdx.z % blocks;
  const int head_idx = blockIdx.x;
  const int kv_head_idx = head_idx / params.gqa_factor;

  const int q_seq_idx = blockIdx.y;
  const int kv_seq_idx = block_idx * BN + warp_idx;

  Q += batch_idx * params.Q_strides[0] + // Batch
      head_idx * params.Q_strides[1] + // Head
      q_seq_idx * params.Q_strides[2]; // Sequence

  K += batch_idx * params.K_strides[0] + // Batch
      kv_head_idx * params.K_strides[1] + // Head
      kv_seq_idx * params.K_strides[2]; // Sequence

  V += batch_idx * params.V_strides[0] + // Batch
      kv_head_idx * params.V_strides[1] + // Head
      kv_seq_idx * params.V_strides[2]; // Sequence

  // The intermediates are indexed as [batch][head][query][partition], with D
  // trailing elements per slot for `partials`.
  const int p_stride_s = blocks;
  const int p_stride_h = params.qL * p_stride_s;
  const int p_stride_b = params.H * p_stride_h;
  const int p_offset = batch_idx * p_stride_b + // Batch
      head_idx * p_stride_h + // Head
      q_seq_idx * p_stride_s + // Sequence
      block_idx; // Block

  partials += p_offset * D;
  sums += p_offset;
  maxs += p_offset;

  // Read the query and 0 the output accumulator
  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    q[i] = scale_log2 * static_cast<U>(Q[v_per_thread * lane_idx + i]);
  }

  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    o[i] = 0.f;
  }

  U max_score = Limits<U>::finite_min();
  U sum_exp_score = 0.f;
  // The sink is accounted for exactly once per (batch, head, query): by the
  // first warp of the first partition. It contributes to the normalizer only,
  // never to the output accumulator.
  if (sinks && warp_idx == 0 && block_idx == 0) {
    max_score = M_LOG2E * static_cast<U>(sinks[head_idx]);
    sum_exp_score = 1.f;
  }

  // For each key
  for (int i = kv_seq_idx; i < params.kL; i += blocks * BN) {
    bool use_key = true;
    if constexpr (do_causal) {
      // Query q_seq_idx is aligned to the end of the key sequence.
      use_key = i <= (params.kL - params.qL + q_seq_idx);
    }

    if (use_key) {
      // Read the key
      PRAGMA_LOOP_UNROLL
      for (int j = 0; j < v_per_thread; j++) {
        k[j] = K[v_per_thread * lane_idx + j];
      }

      // Compute the i-th score
      U score = 0.f;
      PRAGMA_LOOP_UNROLL
      for (int j = 0; j < v_per_thread; j++) {
        score += q[j] * k[j];
      }

      // Warp sum
      score = cg::reduce(warp, score, cg::plus<U>());

      // Update the accumulators
      // (rescale what was accumulated so far to the new running max)
      U new_max = max(max_score, score);
      U factor = exp2f(max_score - new_max);
      U exp_score = exp2f(score - new_max);

      max_score = new_max;
      sum_exp_score = sum_exp_score * factor + exp_score;

      // Update the output accumulator
      PRAGMA_LOOP_UNROLL
      for (int j = 0; j < v_per_thread; j++) {
        o[j] = o[j] * factor +
            exp_score * static_cast<U>(V[v_per_thread * lane_idx + j]);
      }
    }

    // Move the pointers to the next kv
    K += inner_k_stride;
    V += inner_v_stride;
  }

  // Publish the per-warp max and exp-sum (identical on every lane of a warp).
  if (lane_idx == 0) {
    max_scores[warp_idx] = max_score;
    sum_exp_scores[warp_idx] = sum_exp_score;
  }

  block.sync();

  // Every warp redundantly merges the BN per-warp statistics: lane l < BN
  // holds the values of warp l, the remaining lanes hold neutral values (a
  // very low max and a zero sum) so they do not influence the reduction.
  max_score = (lane_idx < BN) ? max_scores[lane_idx] : -1e9;
  U new_max = cg::reduce(warp, max_score, cg::greater<U>());
  U factor = exp2f(max_score - new_max);
  sum_exp_score = (lane_idx < BN) ? sum_exp_scores[lane_idx] : 0.f;
  sum_exp_score = cg::reduce(warp, sum_exp_score * factor, cg::plus<U>());

  // Write the sum and new max
  if (warp_idx == 0) {
    sums[0] = sum_exp_score;
    maxs[0] = new_max;
  }

  // Now we need to aggregate all the outputs
  // `ff` rescales this warp's accumulator from its own max to the block max.
  auto ff = exp2f(max_scores[warp_idx] - new_max);
  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    outputs[warp_idx][lane_idx] = o[i] * ff;
    block.sync();

    // Warp 0 sums the BN rescaled contributions for its lane's element.
    if (warp_idx == 0) {
      U ot = outputs[0][lane_idx];
      PRAGMA_LOOP_UNROLL
      for (int j = 1; j < BN; j++) {
        ot += outputs[j][lane_idx];
        warp.sync();
      }
      o[i] = ot;
    }
    // Make sure `outputs` has been consumed before the next element reuses it.
    block.sync();
  }

  // Only warp 0 holds the merged values; it stores the un-normalized partial.
  if (warp_idx == 0) {
    PRAGMA_LOOP_UNROLL
    for (int i = 0; i < v_per_thread; i++) {
      partials[v_per_thread * lane_idx + i] = o[i];
    }
  }
}

// Second pass of the two-pass vector attention kernel.
//
// One thread block handles one (head, query position, batch) triple and
// combines the `blocks` partial results stored for it in `partials`, `sums`
// and `maxs`: lane l of every warp reads the statistics of partition l, warp w
// reads the partial output of partition w. The partials are rescaled to the
// global max, summed over partitions, normalized by the total exp-sum and
// written to O.
//
// `do_causal` is not used in the body. Assumes the block is launched with 32
// warps (warp_idx indexes the `blocks` partitions) — TODO confirm against the
// host launch.
template <typename T, bool do_causal, int D>
__global__ void kernel_sdpav_2pass_2(
    const float* partials,
    const float* sums,
    const float* maxs,
    T* O,
    __grid_constant__ const AttnParams params) {
  constexpr int BN = 32;
  constexpr int BD = 32;
  constexpr int blocks = 32;

  // Number of head-dim elements each lane of a warp is responsible for.
  constexpr int v_per_thread = D / BD;

  typedef float U;

  U o[v_per_thread];
  // Scratch used to transpose (partition, element) between warps and lanes.
  __shared__ U outputs[BN][BD + 1];

  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<32>(block);

  const int lane_idx = warp.thread_rank();
  const int warp_idx = warp.meta_group_rank();

  // Adjust to thread block and thread
  const int batch_idx = blockIdx.z;
  const int head_idx = blockIdx.x;
  const int q_seq_idx = blockIdx.y;

  // Same [batch][head][query][partition] indexing as the first pass, pointing
  // at partition 0 of this query.
  const int p_stride_s = blocks;
  const int p_stride_h = params.qL * p_stride_s;
  const int p_stride_b = params.H * p_stride_h;
  const int p_offset = batch_idx * p_stride_b + // Batch
      head_idx * p_stride_h + // Head
      q_seq_idx * p_stride_s; // Sequence

  // Warp w reads the D-element partial output of partition w.
  partials += p_offset * D + warp_idx * D;
  sums += p_offset;
  maxs += p_offset;

  O += batch_idx * params.O_strides[0] + // Batch
      head_idx * params.O_strides[1] + // Head
      q_seq_idx * params.O_strides[2]; // Sequence

  // Lane l holds the statistics of partition l; `factor` rescales that
  // partition to the global max.
  U max_score = maxs[lane_idx];
  U new_max = cg::reduce(warp, max_score, cg::greater<U>());
  U factor = exp2f(max_score - new_max);
  U sum_exp_score = cg::reduce(warp, sums[lane_idx] * factor, cg::plus<U>());
  // Turn the normalizer into a reciprocal; a zero sum yields a zero output
  // instead of a division by zero.
  sum_exp_score = sum_exp_score == 0 ? 0 : __frcp_rn(sum_exp_score);

  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    o[i] = partials[v_per_thread * lane_idx + i];
  }

  // Now we need to aggregate all the outputs
  PRAGMA_LOOP_UNROLL
  for (int i = 0; i < v_per_thread; i++) {
    // Transpose through shared memory: after the exchange, lane l of warp w
    // holds partition l's value for output element v_per_thread * w + i, so
    // the per-lane `factor` matches the partition being read.
    outputs[lane_idx][warp_idx] = o[i];
    block.sync();
    U ot = outputs[warp_idx][lane_idx] * factor;
    o[i] = cg::reduce(warp, ot, cg::plus<U>()) * sum_exp_score;
    // Make sure `outputs` has been consumed before the next element reuses it.
    block.sync();
  }

  // And write the output
  if (lane_idx == 0) {
    PRAGMA_LOOP_UNROLL
    for (int i = 0; i < v_per_thread; i++) {
      O[v_per_thread * warp_idx + i] = static_cast<T>(o[i]);
    }
  }
}

} // namespace cu

namespace {

// Turns a runtime head dimension into a compile-time constant.
//
// Calls `f` with `std::integral_constant<int, N>{}` where N == n, so that the
// callee can use the value as a template argument. Only the head dimensions
// the kernels are instantiated for (64, 96 and 128) are accepted.
//
// Throws std::runtime_error for any other value: silently skipping the call
// would skip the kernel launch and leave the caller's output unwritten.
template <typename F>
void dispatch_headdim(int n, F&& f) {
  switch (n) {
    case 64:
      f(std::integral_constant<int, 64>{});
      break;
    case 96:
      f(std::integral_constant<int, 96>{});
      break;
    case 128:
      f(std::integral_constant<int, 128>{});
      break;
    default:
      throw std::runtime_error(
          "[dispatch_headdim] Unsupported head dimension, expected 64, 96 or 128.");
  }
}

// Launches the single-pass vector attention kernel.
//
// One thread block of 1024 threads is launched per (head, query position,
// batch) and writes the attention output for that query directly into `o`.
// `sinks` is optional; a null pointer is handed to the kernel when it is
// absent. `s` is not used in the body.
void sdpa_vector_1pass_fallback(
    const Stream& s,
    cu::CommandEncoder& encoder,
    const array& q,
    const array& k,
    const array& v,
    const float scale,
    array& o,
    bool do_causal,
    const std::optional<array>& sinks) {
  // Shapes and strides the kernel needs, in the field order of AttnParams.
  cu::AttnParams params{
      /* int B = */ q.shape(0),
      /* int H = */ q.shape(1),
      /* int D = */ q.shape(3),

      /* int qL = */ q.shape(2),
      /* int kL = */ k.shape(2),

      /* int gqa_factor = */ q.shape(1) / k.shape(1),
      /* float scale = */ scale,

      /* int64_t Q_strides[3] = */ {q.strides(0), q.strides(1), q.strides(2)},
      /* int64_t K_strides[3] = */ {k.strides(0), k.strides(1), k.strides(2)},
      /* int64_t V_strides[3] = */ {v.strides(0), v.strides(1), v.strides(2)},
      /* int64_t O_strides[3] = */ {o.strides(0), o.strides(1), o.strides(2)}};

  // Declare what the kernel reads and writes.
  encoder.set_input_array(q);
  encoder.set_input_array(k);
  encoder.set_input_array(v);
  if (sinks) {
    encoder.set_input_array(*sinks);
  }
  encoder.set_output_array(o);

  dim3 grid_dim(params.H, params.qL, params.B);
  dim3 block_dim(1024, 1, 1);

  // Resolve dtype, causal flag and head dim to template arguments.
  dispatch_float_types(o.dtype(), "kernel_sdpav_1pass", [&](auto type_tag) {
    dispatch_bool(do_causal, [&](auto causal_tag) {
      dispatch_headdim(params.D, [&](auto dim_tag) {
        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;

        auto kernel =
            cu::kernel_sdpav_1pass<DataType, causal_tag.value, dim_tag.value>;
        encoder.add_kernel_node(
            kernel,
            grid_dim,
            block_dim,
            gpu_ptr<DataType>(q),
            gpu_ptr<DataType>(k),
            gpu_ptr<DataType>(v),
            gpu_ptr<DataType>(o),
            sinks ? gpu_ptr<DataType>(*sinks) : nullptr,
            params);
      });
    });
  });
}

void sdpa_vector_2pass_fallback(
    const Stream& s,
    cu::CommandEncoder& encoder,
    const array& q,
    const array& k,
    const array& v,
    const float scale,
    array& o,
    bool do_causal,
    const std::optional<array>& sinks) {
  cu::AttnParams params{
      /* int B = */ q.shape(0),
      /* int H = */ q.shape(1),
      /* int D = */ q.shape(3),

      /* int qL = */ q.shape(2),
      /* int kL = */ k.shape(2),

      /* int gqa_factor = */ q.shape(1) / k.shape(1),
      /* float scale = */ scale,

      /* int64_t Q_strides[3] = */ {q.strides(0), q.strides(1), q.strides(2)},
      /* int64_t K_strides[3] = */ {k.strides(0), k.strides(1), k.strides(2)},
      /* int64_t V_strides[3] = */ {v.strides(0), v.strides(1), v.strides(2)},
      /* int64_t O_strides[3] = */ {o.strides(0), o.strides(1), o.strides(2)}};

  // Allocate the intermediates
  int blocks = 32;

  Shape intermediate_shape;
  intermediate_shape.reserve(o.ndim() + 1);
  intermediate_shape.insert(
      intermediate_shape.end(), o.shape().begin(), o.shape().end() - 1);
  intermediate_shape.push_back(blocks);
  intermediate_shape.push_back(o.shape().back());

  array intermediate(intermediate_shape, float32, nullptr, {});
  intermediate_shape.pop_back();
  array sums(intermediate_shape, float32, nullptr, {});
  array maxs(std::move(intermediate_shape), float32, nullptr, {});

  intermediate.set_data(cu::malloc_async(intermediate.nbytes(), encoder));
  sums.set_data(cu::malloc_async(sums.nbytes(), encoder));
  maxs.set_data(cu::malloc_async(maxs.nbytes(), encoder));

  encoder.add_temporary(intermediate);
  encoder.add_temporary(sums);
  encoder.add_temporary(maxs);

  dispatch_float_types(o.dtype(), "kernel_sdpav_2pass", [&](auto type_tag) {
    dispatch_bool(do_causal, [&](auto do_causal) {
      dispatch_headdim(params.D, [&](auto headdim) {
        using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;

        {
          auto kernel = cu::
              kernel_sdpav_2pass_1<DataType, do_causal.value, headdim.value>;

          encoder.set_input_array(q);
          encoder.set_input_array(k);
          encoder.set_input_array(v);
          if (sinks) {
            encoder.set_input_array(*sinks);
          }

          encoder.set_output_array(intermediate);
          encoder.set_output_array(sums);
          encoder.set_output_array(maxs);

          dim3 grid_dim(params.H, params.qL, params.B * 32);
          dim3 block_dim(8 * 32, 1, 1);

          encoder.add_kernel_node(
              kernel,
              grid_dim,
              block_dim,
              gpu_ptr<DataType>(q),
              gpu_ptr<DataType>(k),
              gpu_ptr<DataType>(v),
              sinks ? gpu_ptr<DataType>(*sinks) : nullptr,
              gpu_ptr<float>(intermediate),
              gpu_ptr<float>(sums),
              gpu_ptr<float>(maxs),
              params);
        }

        {
          auto kernel = cu::
              kernel_sdpav_2pass_2<DataType, do_causal.value, headdim.value>;

          encoder.set_input_array(intermediate);
          encoder.set_input_array(sums);
          encoder.set_input_array(maxs);
          encoder.set_output_array(o);

          dim3 grid_dim(params.H, params.qL, params.B);
          dim3 block_dim(1024, 1, 1);

          encoder.add_kernel_node(
              kernel,
              grid_dim,
              block_dim,
              gpu_ptr<float>(intermediate),
              gpu_ptr<float>(sums),
              gpu_ptr<float>(maxs),
              gpu_ptr<DataType>(o),
              params);
        }
      });
    });
  });
}

void sdpa_vector_fallback(
    const Stream& s,
    cu::CommandEncoder& encoder,
    const array& q,
    const array& k,
    const array& v,
    const float scale,
    array& o,
    bool do_causal,
    const std::optional<array>& sinks) {
  int kL = k.shape(2);

  if (kL > 1024) {
    return sdpa_vector_2pass_fallback(
        s, encoder, q, k, v, scale, o, do_causal, sinks);
  } else {
    return sdpa_vector_1pass_fallback(
        s, encoder, q, k, v, scale, o, do_causal, sinks);
  }
}

} // namespace

// Reports whether the vector (short query) attention path can handle the
// given inputs.
//
// Returns true only when no logsumexp output is requested, no array mask is
// present, the query and value head dimensions are equal and one of 64, 96 or
// 128, and the query sequence length (axis 2 of q) is below 4.
bool supports_sdpa_vector(
    const array& q,
    const array& k,
    const array& v,
    bool has_arr_mask,
    bool output_logsumexp) {
  if (output_logsumexp) {
    return false;
  }

  const int v_head_dim = v.shape(-1);
  const int q_head_dim = q.shape(-1);
  const int q_len = q.shape(2);
  // Read but not consulted by any of the checks below.
  [[maybe_unused]] const int k_len = k.shape(2);

  bool head_dim_ok = false;
  if (q_head_dim == v_head_dim) {
    switch (q_head_dim) {
      case 64:
      case 96:
      case 128:
        head_dim_ok = true;
        break;
      default:
        break;
    }
  }

  const bool short_query = q_len < 4;
  return head_dim_ok && short_query && !has_arr_mask;
}

// Entry point of the vector attention path (query sequence length below 4).
//
// Copies any input whose layout the kernels cannot consume, gives `o` a
// buffer (reusing the query's when possible) and launches the fallback
// kernels. Throws std::runtime_error when the query sequence length is 4 or
// more.
void sdpa_vector(
    const array& q_pre,
    const array& k_pre,
    const array& v_pre,
    float scale,
    array& o,
    bool do_causal,
    const std::optional<array>& sinks_pre,
    Stream s) {
  auto& encoder = cu::get_command_encoder(s);

  // Temporaries created for inputs with an unsuitable layout. At most four are
  // made (sinks, q, k, v); reserving that many up front keeps the references
  // handed out below valid while later copies are appended.
  std::vector<array> copies;
  copies.reserve(4);

  // Returns `arr` itself when `layout_ok(arr)` holds, otherwise a reference to
  // a contiguous copy stored in `copies`.
  auto copy_unless = [&copies, &s](
                         auto layout_ok, const array& arr) -> const array& {
    if (layout_ok(arr)) {
      return arr;
    }
    copies.push_back(contiguous_copy_gpu(arr, s));
    return copies.back();
  };

  // The innermost dimension must have stride 1.
  auto has_unit_inner_stride = [](const array& arr) {
    return arr.strides(-1) == 1;
  };

  std::optional<array> sinks;
  if (sinks_pre.has_value()) {
    sinks = copy_unless(has_unit_inner_stride, *sinks_pre);
  }

  // Full attention mode should never reach here
  if (!(q_pre.shape(2) < 4)) {
    throw std::runtime_error("Doesn't support matrix yet.");
  }

  // We are in vector mode ie single query

  // The query is usable when row contiguous, or when the batch or the head
  // dimension is a singleton and the other one is swapped with the sequence
  // dimension.
  auto q_layout_ok = [](const array& arr) {
    if (arr.flags().row_contiguous) {
      return true;
    }
    const auto& shp = arr.shape();
    const auto& str = arr.strides();
    const bool has_singleton = shp[0] == 1 || shp[1] == 1;
    if (!has_singleton) {
      return false;
    }
    const auto bidx = shp[0] == 1 ? 1 : 0;
    return (str[3] == 1) && (str[2] == shp[3] * shp[bidx]) &&
        (str[bidx] == shp[3]);
  };

  // Keys and values are usable when the last dimension has stride 1 and the
  // batch and head dimensions are contiguous with each other (trivially so
  // when one of them is a singleton).
  auto kv_layout_ok = [](const array& arr) {
    const auto& str = arr.strides();
    const auto& shp = arr.shape();
    if (str.back() != 1) {
      return false;
    }
    return shp[0] == 1 || shp[1] == 1 || str[0] == str[1] * shp[1];
  };

  const auto& q = copy_unless(q_layout_ok, q_pre);
  const auto& k = copy_unless(kv_layout_ok, k_pre);
  const auto& v = copy_unless(kv_layout_ok, v_pre);

  // Donate the query if possible
  const bool reuse_query =
      q.is_donatable() && q.flags().row_contiguous && q.size() == o.size();
  if (reuse_query) {
    o.copy_shared_buffer(q);
  } else {
    // Fresh buffer in which the head axis is nested inside the sequence axis.
    const int64_t stride_d = 1;
    const int64_t stride_h = o.shape(3);
    const int64_t stride_l = o.shape(1) * stride_h;
    const int64_t stride_b = o.shape(2) * stride_l;

    array::Flags flags{
        /* bool contiguous = */ true,
        /* bool row_contiguous = */ o.shape(2) == 1,
        /* bool col_contiguous = */ o.size() == o.shape(3),
    };

    o.set_data(
        cu::malloc_async(o.nbytes(), encoder),
        o.size(),
        {stride_b, stride_h, stride_l, stride_d},
        flags);
  }

  // Keep the layout copies alive until the kernels have run.
  for (const auto& tmp : copies) {
    encoder.add_temporary(tmp);
  }

  sdpa_vector_fallback(s, encoder, q, k, v, scale, o, do_causal, sinks);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/scan.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/binary_ops.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/cuda/reduce/reduce_ops.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/scan.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>
#include <nvtx3/nvtx3.hpp>

#include <cassert>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Maps a scan operation and an input element type to the element type of the
// scan output. By default the output has the input's type.
template <typename Op, typename T>
struct ScanResult {
  using type = T;
};

// Summing booleans produces 32-bit integer results.
template <>
struct ScanResult<Sum, bool> {
  using type = int32_t;
};

// Initial (identity) value used by LogAddExp scans: Limits<T>::min().
// NOTE(review): presumably this is negative infinity for floating point T,
// i.e. log(0) — confirm against the definition of Limits.
template <typename T>
struct ReduceInit<LogAddExp, T> {
  static constexpr __host__ __device__ T value() {
    return Limits<T>::min();
  }
};

// Loads the `index`-th group of N_READS consecutive elements of a row of
// `size` elements into `values`, converting each to U.
//
// Forward scans count groups from the start of the row. Reverse scans count
// them from the end and store each group back to front, so that values[0] is
// always the first element in scan order. Slots that fall outside the row are
// filled with `init`.
template <bool reverse, typename T, typename U, int N_READS>
inline __device__ void
load_values(int index, const T* in, U (&values)[N_READS], int size, U init) {
  // Elements of the row not yet covered by groups before this one.
  const int left = size - index * N_READS;
  const bool full_group = left >= N_READS;

  if constexpr (reverse) {
    // Address of the group's lowest element; the group is read mirrored.
    const T* src = in + (left - N_READS);
    if (full_group) {
      for (int k = 0; k < N_READS; ++k) {
        values[k] = cast_to<U>(src[N_READS - 1 - k]);
      }
    } else {
      for (int k = 0; k < N_READS; ++k) {
        values[k] = (k < left) ? cast_to<U>(src[N_READS - 1 - k]) : init;
      }
    }
  } else {
    const T* src = in + index * N_READS;
    if (full_group) {
      for (int k = 0; k < N_READS; ++k) {
        values[k] = cast_to<U>(src[k]);
      }
    } else {
      for (int k = 0; k < N_READS; ++k) {
        values[k] = (k < left) ? cast_to<U>(src[k]) : init;
      }
    }
  }
}

// Stores the `index`-th group of N_READS scanned values into a row of `size`
// elements, shifted by `offset` positions in scan direction.
//
// Forward scans write values[k] at position index * N_READS + offset + k.
// Reverse scans mirror that from the end of the row. Values that would land
// outside the row are dropped.
template <bool reverse, int offset, typename T, int N_READS>
inline __device__ void
store_values(int index, T* out, T (&values)[N_READS], int size) {
  const int first = index * N_READS + offset;
  // Positions of the row still available starting at `first`.
  const int left = size - first;
  const bool full_group = left >= N_READS;

  if constexpr (reverse) {
    // Address of the group's lowest position; the group is written mirrored.
    T* dst = out + (left - N_READS);
    if (full_group) {
      for (int k = 0; k < N_READS; ++k) {
        dst[N_READS - 1 - k] = values[k];
      }
    } else {
      for (int k = 0; k < N_READS; ++k) {
        if (k < left) {
          dst[N_READS - 1 - k] = values[k];
        }
      }
    }
  } else {
    T* dst = out + first;
    if (full_group) {
      for (int k = 0; k < N_READS; ++k) {
        dst[k] = values[k];
      }
    } else {
      for (int k = 0; k < N_READS; ++k) {
        if (k < left) {
          dst[k] = values[k];
        }
      }
    }
  }
}

// Scans rows whose elements are contiguous in memory.
//
// Each thread block handles one row of `axis_size` elements. The row is
// processed in rounds of block.size() * N_READS elements: every thread scans
// its N_READS values, the per-thread totals are scanned within each warp, the
// per-warp totals are scanned by warp 0 through shared memory, and the running
// total of all previous rounds (`prefix`) is folded in. For exclusive scans
// the results are stored shifted by one position in scan direction and the
// first position receives the initial value.
//
// NOTE(review): an exclusive scan stores one position past the group a thread
// has read, so when a row needs more than one round `in` and `out` presumably
// must not alias — confirm with the callers.
template <
    typename T,
    typename U,
    typename Op,
    int N_READS,
    bool inclusive,
    bool reverse>
__global__ void contiguous_scan(const T* in, U* out, int32_t axis_size) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  in += grid.block_rank() * axis_size;
  out += grid.block_rank() * axis_size;

  __shared__ U warp_sums[WARP_SIZE];
  // The running total is exchanged through its own slot. Reusing
  // warp_sums[0] for it would let warp 0 overwrite the slot with its warp sum
  // in the next round while slower warps are still reading the prefix of this
  // round (no block barrier separates the two accesses).
  __shared__ U block_prefix;

  Op op;
  U init = ReduceInit<Op, T>::value();
  U prefix = init;

  // Scan per block.
  const auto n_rounds = cuda::ceil_div(axis_size, block.size() * N_READS);
  for (int r = 0; r < n_rounds; ++r) {
    int32_t index = r * block.size() + block.thread_rank();
    U values[N_READS];
    load_values<reverse>(index, in, values, axis_size, init);

    // Compute an inclusive scan per thread.
    for (int i = 1; i < N_READS; ++i) {
      values[i] = op(values[i], values[i - 1]);
    }

    // Compute exclusive scan of thread sums.
    U prev_thread_sum = cg::exclusive_scan(warp, values[N_READS - 1], op);
    if (warp.thread_rank() == 0) {
      prev_thread_sum = init;
    }

    // Write warp's sum to shared memory.
    if (warp.thread_rank() == WARP_SIZE - 1) {
      warp_sums[warp.meta_group_rank()] =
          op(prev_thread_sum, values[N_READS - 1]);
    }
    block.sync();

    // Compute exclusive scan of warp sums.
    if (warp.meta_group_rank() == 0) {
      U prev_warp_sum =
          cg::exclusive_scan(warp, warp_sums[warp.thread_rank()], op);
      if (warp.thread_rank() == 0) {
        prev_warp_sum = init;
      }
      warp_sums[warp.thread_rank()] = prev_warp_sum;
    }
    block.sync();

    // Compute the output.
    for (int i = 0; i < N_READS; ++i) {
      values[i] = op(values[i], prefix);
      values[i] = op(values[i], warp_sums[warp.meta_group_rank()]);
      values[i] = op(values[i], prev_thread_sum);
    }

    // Write the values.
    if constexpr (inclusive) {
      store_values<reverse, 0>(index, out, values, axis_size);
    } else {
      store_values<reverse, 1>(index, out, values, axis_size);
      // The first position in scan order has no predecessor.
      if (block.thread_rank() == 0 && index == 0) {
        if constexpr (reverse) {
          out[axis_size - 1] = init;
        } else {
          out[0] = init;
        }
      }
    }

    // Share the prefix: the last thread of the block holds the total of
    // everything scanned so far. Every earlier read of `block_prefix` happened
    // before the two barriers above, so it can be written without another
    // barrier.
    if ((warp.meta_group_rank() == warp.meta_group_size() - 1) &&
        (warp.thread_rank() == WARP_SIZE - 1)) {
      block_prefix = values[N_READS - 1];
    }
    // Publishes `block_prefix` and also guarantees that every read of
    // `warp_sums` in this round is done before the next round overwrites it.
    block.sync();
    prefix = block_prefix;
  }
}

// Scans along an axis whose elements are `stride` apart in memory.
//
// Each thread block owns BN adjacent positions of the inner (stride)
// dimension and walks the scan axis in tiles of BM rows. A tile is loaded
// into shared memory with every thread reading N_READS adjacent elements of
// one row, scanned column-wise (each warp scans n_scans columns, one row per
// lane), and written back with the same thread-to-element mapping as the
// load. `prefix` carries the running total of each column across tiles.
//
// `stride_blocks` is the number of BN-wide column tiles per scanned slice; it
// is used to split the block rank into a slice index and a column tile.
template <
    typename T,
    typename U,
    typename Op,
    int N_READS,
    int BM,
    int BN,
    bool inclusive,
    bool reverse>
__global__ void strided_scan(
    const T* in,
    U* out,
    int32_t axis_size,
    int64_t stride,
    int64_t stride_blocks) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  // Row pitch of the shared tile, wider than the tile itself.
  // NOTE(review): the padding presumably avoids shared-memory bank conflicts
  // during the column-wise reads — confirm.
  constexpr int BN_pad = WARP_SIZE + 16 / sizeof(U);
  constexpr int n_warps = BN / N_READS;
  constexpr int n_scans = BN / n_warps;

  __shared__ U read_buffer[BM * BN_pad];

  Op op;
  U init = ReduceInit<Op, T>::value();
  U values[n_scans];
  U prefix[n_scans];
  for (int i = 0; i < n_scans; ++i) {
    prefix[i] = init;
  }

  // Compute offsets.
  // `offset` is the start of the slice being scanned, `global_index_x` the
  // first column of this block's tile. The read_* offsets describe the
  // element group a thread loads/stores, the scan_* offsets the tile row and
  // first column it scans.
  int64_t offset = (grid.block_rank() / stride_blocks) * axis_size * stride;
  int64_t global_index_x = (grid.block_rank() % stride_blocks) * BN;
  uint32_t read_offset_y = (block.thread_rank() * N_READS) / BN;
  uint32_t read_offset_x = (block.thread_rank() * N_READS) % BN;
  uint32_t scan_offset_y = warp.thread_rank();
  uint32_t scan_offset_x = warp.meta_group_rank() * n_scans;

  // Number of valid columns from the start of this tile to the end of the
  // inner dimension.
  uint32_t stride_limit = stride - global_index_x;
  in += offset + global_index_x + read_offset_x;
  out += offset + global_index_x + read_offset_x;
  U* read_into = read_buffer + read_offset_y * BN_pad + read_offset_x;
  U* read_from = read_buffer + scan_offset_y * BN_pad + scan_offset_x;

  for (uint32_t j = 0; j < axis_size; j += BM) {
    // Calculate the indices for the current thread.
    // `check_index_y` counts rows in scan order and is used for bounds
    // checks; `index_y` is the matching row in memory.
    uint32_t index_y = j + read_offset_y;
    uint32_t check_index_y = index_y;
    if (reverse) {
      index_y = axis_size - 1 - index_y;
    }

    // Read in SM.
    // Fast path when the whole group is in range, otherwise check element by
    // element and pad with `init`.
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; ++i) {
        read_into[i] = in[index_y * stride + i];
      }
    } else {
      for (int i = 0; i < N_READS; ++i) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          read_into[i] = in[index_y * stride + i];
        } else {
          read_into[i] = init;
        }
      }
    }
    block.sync();

    // Read strided into registers.
    for (int i = 0; i < n_scans; ++i) {
      values[i] = read_from[i];
    }

    // Perform the scan.
    // Lanes hold consecutive rows of a column, so a warp scan runs along the
    // scan axis; the last lane's result becomes the prefix of the next tile.
    for (int i = 0; i < n_scans; ++i) {
      values[i] = cg::inclusive_scan(warp, values[i], op);
      values[i] = op(values[i], prefix[i]);
      prefix[i] = warp.shfl(values[i], WARP_SIZE - 1);
    }

    // Write to SM.
    for (int i = 0; i < n_scans; ++i) {
      read_from[i] = values[i];
    }
    block.sync();

    // Write to device memory.
    if (!inclusive) {
      // Exclusive scan: the first row in scan order receives `init` ...
      if (check_index_y == 0) {
        if ((read_offset_x + N_READS) < stride_limit) {
          for (int i = 0; i < N_READS; ++i) {
            out[index_y * stride + i] = init;
          }
        } else {
          for (int i = 0; i < N_READS; ++i) {
            if ((read_offset_x + i) < stride_limit) {
              out[index_y * stride + i] = init;
            }
          }
        }
      }
      // ... and every result is stored one row further in scan direction.
      if (reverse) {
        index_y -= 1;
        check_index_y += 1;
      } else {
        index_y += 1;
        check_index_y += 1;
      }
    }
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; ++i) {
        out[index_y * stride + i] = read_into[i];
      }
    } else {
      for (int i = 0; i < N_READS; ++i) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          out[index_y * stride + i] = read_into[i];
        }
      }
    }
  }
}

} // namespace cu

// Turns a runtime scan reduce type into a compile-time operation type.
//
// Calls `f` with `type_identity<Op>{}` where Op is the device operation that
// matches `scan_op`. Throws std::invalid_argument for an unknown reduce type.
template <typename F>
void dispatch_scan_ops(Scan::ReduceType scan_op, F&& f) {
  switch (scan_op) {
    case Scan::ReduceType::Max:
      f(type_identity<cu::Max>{});
      return;
    case Scan::ReduceType::Min:
      f(type_identity<cu::Min>{});
      return;
    case Scan::ReduceType::Sum:
      f(type_identity<cu::Sum>{});
      return;
    case Scan::ReduceType::Prod:
      f(type_identity<cu::Prod>{});
      return;
    case Scan::ReduceType::LogAddExp:
      f(type_identity<cu::LogAddExp>{});
      return;
    default:
      throw std::invalid_argument("Unknown reduce type.");
  }
}

// Returns the display name of a scan operation type, for use in error
// messages. Throws std::invalid_argument for an operation it does not know.
template <typename Op>
const char* op_to_string() {
  if constexpr (cuda::std::is_same_v<Op, cu::Max>) {
    return "Max";
  } else if constexpr (cuda::std::is_same_v<Op, cu::Min>) {
    return "Min";
  } else if constexpr (cuda::std::is_same_v<Op, cu::Sum>) {
    return "Sum";
  } else if constexpr (cuda::std::is_same_v<Op, cu::Prod>) {
    return "Prod";
  } else if constexpr (cuda::std::is_same_v<Op, cu::LogAddExp>) {
    return "LogAddExp";
  } else {
    throw std::invalid_argument("Unknown op.");
  }
}

// Compile-time check of whether scan operation Op can be applied to elements
// of type T: LogAddExp requires an inexact T, everything else is accepted.
//
// NOTE(review): `LogAddExp` is written without the `cu::` qualifier that
// dispatch_scan_ops and op_to_string use — confirm that it names the same
// type here, otherwise this check never restricts anything.
template <typename Op, typename T>
constexpr bool supports_scan_op() {
  if constexpr (!cuda::std::is_same_v<Op, LogAddExp>) {
    return true;
  } else {
    return is_inexact_v<T>;
  }
}

// Encodes the scan of `in` along `axis` into `out`, whose buffer must already
// be set by the caller.
//
// An axis with stride 1 is handled by the contiguous kernel (one thread block
// per scanned row), any other axis by the strided kernel. Element type,
// operation, `inclusive` and `reverse` are resolved to template arguments.
// Throws std::runtime_error when the operation does not support the input
// type.
void scan_gpu_inplace(
    array in,
    array& out,
    Scan::ReduceType reduce_type,
    int axis,
    bool reverse,
    bool inclusive,
    const Stream& s) {
  // Elements handled per thread in one step.
  constexpr int N_READS = 4;

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);

  const int32_t axis_size = in.shape(axis);
  const bool contiguous = in.strides()[axis] == 1;

  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    dispatch_scan_ops(reduce_type, [&](auto op_tag) {
      using Op = MLX_GET_TYPE(op_tag);
      if constexpr (!supports_scan_op<Op, T>()) {
        throw std::runtime_error(
            fmt::format(
                "Can not do scan op {} on inputs of {} with result of {}.",
                op_to_string<Op>(),
                dtype_to_string(in.dtype()),
                dtype_to_string(out.dtype())));
      } else {
        using U = typename cu::ScanResult<Op, T>::type;
        dispatch_bool(inclusive, [&](auto inclusive_tag) {
          dispatch_bool(reverse, [&](auto reverse_tag) {
            if (!contiguous) {
              // Tiles of BM rows along the scan axis by BN inner positions.
              constexpr int BM = WARP_SIZE;
              constexpr int BN = WARP_SIZE;
              auto kernel = cu::strided_scan<
                  T,
                  U,
                  Op,
                  N_READS,
                  BM,
                  BN,
                  inclusive_tag.value,
                  reverse_tag.value>;
              int64_t stride = in.strides()[axis];
              int64_t stride_blocks = cuda::ceil_div(stride, BN);
              dim3 num_blocks = get_2d_grid_dims(
                  in.shape(), in.strides(), axis_size * stride);
              // Multiply the column tiles into whichever grid dimension can
              // take them.
              if (num_blocks.x * stride_blocks <= UINT32_MAX) {
                num_blocks.x *= stride_blocks;
              } else {
                num_blocks.y *= stride_blocks;
              }
              int block_dim = (BN / N_READS) * WARP_SIZE;
              encoder.add_kernel_node(
                  kernel,
                  num_blocks,
                  block_dim,
                  gpu_ptr<T>(in),
                  gpu_ptr<U>(out),
                  axis_size,
                  stride,
                  stride_blocks);
              return;
            }

            auto kernel = cu::contiguous_scan<
                T,
                U,
                Op,
                N_READS,
                inclusive_tag.value,
                reverse_tag.value>;
            // Enough threads to cover a row in one round, rounded up to whole
            // warps and capped at WARP_SIZE warps.
            const int threads_needed = cuda::ceil_div(axis_size, N_READS);
            const int whole_warps =
                cuda::ceil_div(threads_needed, WARP_SIZE) * WARP_SIZE;
            const int block_dim = std::min(whole_warps, WARP_SIZE * WARP_SIZE);
            encoder.add_kernel_node(
                kernel,
                in.data_size() / axis_size,
                block_dim,
                gpu_ptr<T>(in),
                gpu_ptr<U>(out),
                axis_size);
          });
        });
      }
    });
  });
}

// Evaluates the scan on the GPU.
//
// Chooses the buffer for `out` and hands the work to scan_gpu_inplace. An
// input that is not contiguous (or is broadcast along the scan axis) is first
// copied to a contiguous array.
//
// `out` only aliases the input buffer when that is safe for the kernels:
//  - the element sizes must match, because the kernels write `out` with the
//    result type, which can be wider than the input type (a bool Sum produces
//    int32); and
//  - the scan must be inclusive, because an exclusive scan stores each result
//    one position further along the axis than the element it was computed
//    from and would overwrite input that a later round or tile still reads.
// Otherwise `out` gets its own buffer with the input's layout.
void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Scan::eval_gpu");
  assert(inputs.size() == 1);
  auto in = inputs[0];
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  const bool can_alias = inclusive_ && in.itemsize() == out.itemsize();

  // Gives `out` a fresh buffer laid out exactly like the current `in`.
  auto allocate_like_input = [&]() {
    out.set_data(
        cu::malloc_async(in.data_size() * out.itemsize(), encoder),
        in.data_size(),
        in.strides(),
        in.flags());
  };

  if (in.flags().contiguous && in.strides()[axis_] != 0) {
    if (can_alias && in.is_donatable()) {
      out.copy_shared_buffer(in);
    } else {
      allocate_like_input();
    }
  } else {
    in = contiguous_copy_gpu(in, s);
    if (can_alias) {
      out.copy_shared_buffer(in);
    } else {
      allocate_like_input();
      // The copy is no longer kept alive through `out`.
      encoder.add_temporary(in);
    }
  }

  scan_gpu_inplace(in, out, reduce_type_, axis_, reverse_, inclusive_, s);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/slicing.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/slicing.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/jit_module.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/slicing.h"
#include "mlx/dtype_utils.h"

#include <numeric>

namespace mlx::core {

void concatenate_gpu(
    const std::vector<array>& inputs,
    array& out,
    int axis,
    const Stream& s) {
  std::vector<int> sizes;
  sizes.push_back(0);
  for (auto& p : inputs) {
    sizes.push_back(p.shape(axis));
  }
  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());

  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));

  auto strides = out.strides();
  auto flags = out.flags();
  flags.row_contiguous = false;
  flags.col_contiguous = false;
  flags.contiguous = false;
  auto concurrent = encoder.concurrent_context();
  for (int i = 0; i < inputs.size(); i++) {
    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
    size_t data_offset = strides[axis] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
    copy_gpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, s);
  }
}

// Computes, on the device, a single int64 offset
//   sum_i indices[i] * strides[axes[i]]
// and returns it as a one-element array.
//
// indices: array holding one index per entry of `axes`.
// strides: strides that the indices are multiplied with.
// axes:    for each index, which entry of `strides` it applies to.
// s:       stream whose device / command encoder is used.
//
// The kernel is JIT-compiled; the module and kernel names encode the index
// dtype and the number of indices, so each combination gets its own
// instantiation.
array compute_dynamic_offset(
    const array& indices,
    const Strides& strides,
    const std::vector<int>& axes,
    const Stream& s) {
  Dtype dtype = indices.dtype();
  int nidx = axes.size();

  std::string module_name =
      fmt::format("compute_dynamic_offset_{}_{}", dtype_to_string(dtype), nidx);
  std::string kernel_name = fmt::format(
      "mlx::core::cu::compute_dynamic_offset<{}, {}>",
      dtype_to_cuda_type(dtype),
      nidx);

  // The lambda supplies the kernel source for this module name.
  // NOTE(review): presumably it only runs when the module is not cached yet,
  // and the leading `false` of the returned tuple is a flag whose meaning is
  // defined by get_jit_module -- confirm against jit_module.h.
  cu::JitModule& mod = cu::get_jit_module(s.device, module_name, [&]() {
    std::string source = R"(
        #include "mlx/backend/cuda/device/utils.cuh"

        namespace mlx::core::cu {

        template <typename T, int NIDX>
        __global__ void compute_dynamic_offset(
            const T* indices,
            int64_t* offset,
            const __grid_constant__ Strides strides,
            const __grid_constant__ cuda::std::array<int, NIDX> axes) {
          int64_t acc = 0;
          #pragma unroll
          for (int i = 0; i < NIDX; ++i) {
            acc += indices[i] * strides[axes[i]];
          }
          *offset = acc;
        }

        } // namespace mlx::core::cu
    )";
    return std::make_tuple(false, std::move(source), std::vector{kernel_name});
  });

  auto& encoder = cu::get_command_encoder(s);
  // Prepare output.
  array offset({1}, int64, nullptr, {});
  // Reuse the indices buffer for the result when it is donatable and holds at
  // least as many bytes as one int64; otherwise allocate a new buffer.
  bool donate = indices.is_donatable() &&
      (indices.data_size() * indices.itemsize()) >= offset.itemsize();
  if (donate) {
    offset.copy_shared_buffer(indices);
  } else {
    offset.set_data(cu::malloc_async(offset.itemsize(), encoder));
  }

  encoder.add_temporary(offset);
  encoder.set_input_array(indices);
  encoder.set_output_array(offset);

  // Arguments are appended in the same order as the kernel's parameters.
  cu::KernelArgs args;
  args.append(indices);
  args.append(offset);
  args.append_ndim(strides);
  args.append(axes);

  auto kernel = mod.get_kernel(kernel_name);
  // NOTE(review): the two `1` arguments look like grid and block sizes (the
  // kernel body has no thread indexing) -- confirm against add_kernel_node_raw.
  encoder.add_kernel_node_raw(kernel, 1, 1, {}, 0, args.args());

  return offset;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/softmax.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/cast_op.cuh"
#include "mlx/backend/cuda/device/fp16_math.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <nvtx3/nvtx3.hpp>

#include <cassert>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Exponential used by the softmax kernel, computed with the fast `__expf`
// intrinsic and converted back to T.
template <typename T>
inline __device__ T softmax_exp(T x) {
  // A high-precision exponential is not needed here: the argument lies in
  // (-oo, 0] and the result is subsequently divided by sum(exp(x_i)).
  const T result = __expf(x);
  return result;
}

// Softmax over rows of length `axis_size`.
//
// Each thread block handles the row selected by its block rank in the grid
// (`in` and `out` are advanced by block_rank * axis_size). The maximum and
// the normalizer sum(exp(x - max)) are accumulated in AccT using the online
// softmax update, reduced within each warp and then across warps through
// shared memory, and finally every element is written as
// exp(x - max) / sum.
//
// T:         element type of input and output.
// AccT:      type used for the max / normalizer accumulation.
// BLOCK_DIM: number of threads per block assumed by the indexing.
// N_READS:   number of elements handled per load_vector/store_vector call.
template <typename T, typename AccT, int BLOCK_DIM, int N_READS = 4>
__global__ void softmax(const T* in, T* out, int axis_size) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();
  auto warp = cg::tiled_partition<WARP_SIZE>(block);

  // Move to the row owned by this block.
  in += grid.block_rank() * axis_size;
  out += grid.block_rank() * axis_size;

  cg::greater<AccT> max_op;
  cg::plus<AccT> plus_op;

  // Thread reduce.
  AccT prevmax;
  AccT maxval = Limits<AccT>::finite_min();
  AccT normalizer = cast_to<AccT>(0);
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    // NOTE(review): the last argument is presumably the fill value for lanes
    // beyond axis_size -- confirm against load_vector.
    auto vals = load_vector<N_READS>(in, index, axis_size, Limits<T>::min());
    prevmax = maxval;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      maxval = max_op(maxval, static_cast<AccT>(vals[i]));
    }

    // Online normalizer calculation for softmax:
    // https://github.com/NVIDIA/online-softmax
    // Rescale the sum accumulated so far to the new maximum before adding
    // the terms of this chunk.
    normalizer = normalizer * softmax_exp(prevmax - maxval);
#pragma unroll
    for (int i = 0; i < N_READS; i++) {
      normalizer =
          normalizer + softmax_exp(static_cast<AccT>(vals[i]) - maxval);
    }
  }

  // First warp reduce.
  // Each thread rescales its partial sum to the warp-wide maximum before the
  // sums are added together.
  prevmax = maxval;
  maxval = cg::reduce(warp, maxval, max_op);
  normalizer = normalizer * softmax_exp(prevmax - maxval);
  normalizer = cg::reduce(warp, normalizer, plus_op);

  // One slot per warp of the block.
  __shared__ AccT local_max[WARP_SIZE];
  __shared__ AccT local_normalizer[WARP_SIZE];

  // Write to shared memory and do second warp reduce.
  prevmax = maxval;
  if (warp.thread_rank() == 0) {
    local_max[warp.meta_group_rank()] = maxval;
  }
  block.sync();
  // Lanes without a matching warp contribute the minimum so they do not
  // affect the max reduction.
  maxval = warp.thread_rank() < warp.meta_group_size()
      ? local_max[warp.thread_rank()]
      : Limits<AccT>::min();
  maxval = cg::reduce(warp, maxval, max_op);
  // Rescale the per-warp sum to the block-wide maximum.
  normalizer = normalizer * softmax_exp(prevmax - maxval);
  if (warp.thread_rank() == 0) {
    local_normalizer[warp.meta_group_rank()] = normalizer;
  }
  block.sync();
  // Lanes without a matching warp contribute zero to the sum.
  normalizer = warp.thread_rank() < warp.meta_group_size()
      ? local_normalizer[warp.thread_rank()]
      : AccT{};
  normalizer = cg::reduce(warp, normalizer, plus_op);
  // Store the reciprocal so the output loop multiplies instead of divides.
  normalizer = 1 / normalizer;

  // Write output.
  for (int r = 0; r < cuda::ceil_div(axis_size, BLOCK_DIM * N_READS); r++) {
    auto index = r * BLOCK_DIM + block.thread_rank();
    auto vals = load_vector<N_READS>(in, index, axis_size, T(0));
    for (int i = 0; i < N_READS; i++) {
      vals[i] = softmax_exp(static_cast<AccT>(vals[i]) - maxval) * normalizer;
    }
    store_vector<N_READS>(out, index, vals, axis_size);
  }
}

} // namespace cu

// Evaluates softmax over the last axis on the GPU. The input is used as-is
// when it is contiguous with a unit stride on its last axis; otherwise a
// contiguous copy is made first. One thread block is launched per row.
void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Softmax::eval_gpu");
  assert(inputs.size() == 1);
  auto& s = stream();
  auto& encoder = cu::get_command_encoder(s);

  // Chooses the buffer backing `out` and returns the array the kernel should
  // read from (either `x` itself or a contiguous copy of it).
  auto prepare_input = [&s, &out, &encoder](const array& x) -> array {
    const bool last_dim_dense =
        x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (!last_dim_dense) {
      array dense = contiguous_copy_gpu(x, s);
      out.copy_shared_buffer(dense);
      return dense;
    }
    if (x.is_donatable()) {
      out.copy_shared_buffer(x);
    } else {
      out.set_data(
          cu::malloc_async(x.data_size() * x.itemsize(), encoder),
          x.data_size(),
          x.strides(),
          x.flags());
    }
    return x;
  };

  array in = prepare_input(inputs[0]);

  // Float accumulation is only selected for non-float32 inputs.
  const bool precise = precise_ && in.dtype() != float32;

  int axis_size = in.shape().back();
  int n_rows = in.data_size() / axis_size;

  encoder.set_input_array(in);
  encoder.set_output_array(out);
  dispatch_float_types(out.dtype(), "softmax", [&](auto type_tag) {
    using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
    // 16 bytes worth of elements per vectorized access.
    constexpr int N_READS = 16 / sizeof(DataType);
    dispatch_block_dim(cuda::ceil_div(axis_size, N_READS), [&](auto block_dim) {
      // Default: accumulate in the data type itself.
      auto softmax_kernel =
          cu::softmax<DataType, DataType, block_dim(), N_READS>;
      if (precise) {
        softmax_kernel = cu::softmax<DataType, float, block_dim(), N_READS>;
      }
      encoder.add_kernel_node(
          softmax_kernel,
          n_rows,
          block_dim(),
          gpu_ptr<DataType>(in),
          gpu_ptr<DataType>(out),
          axis_size);
    });
  });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/sort.cu
================================================
// Copyright © 2025 Apple Inc.

#include <algorithm>
#include <cassert>
#include <cstdint>

#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/fp16_math.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/backend/gpu/copy.h"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <nvtx3/nvtx3.hpp>
#include <cuda/std/limits>
#include <cuda/std/type_traits>

namespace mlx::core {

constexpr int N_PER_THREAD = 8;

namespace cu {

// Returns a quiet NaN of type T. Only declared here; the supported types are
// provided by the explicit specializations that follow.
template <typename T>
__device__ __forceinline__ T nan_value();

template <>
__device__ __forceinline__ float nan_value<float>() {
  using FloatLimits = cuda::std::numeric_limits<float>;
  return FloatLimits::quiet_NaN();
}

template <>
__device__ __forceinline__ double nan_value<double>() {
  using DoubleLimits = cuda::std::numeric_limits<double>;
  return DoubleLimits::quiet_NaN();
}

// The 16-bit types are produced by converting a float quiet NaN.
template <>
__device__ __forceinline__ __half nan_value<__half>() {
  const float float_nan = cuda::std::numeric_limits<float>::quiet_NaN();
  return __float2half(float_nan);
}

template <>
__device__ __forceinline__ __nv_bfloat16 nan_value<__nv_bfloat16>() {
  const float float_nan = cuda::std::numeric_limits<float>::quiet_NaN();
  return __float2bfloat16(float_nan);
}

// Value used to pad slots that hold no real element.
// Generic case: the largest value Limits<T> reports for the type.
template <typename T, typename Enable = void>
struct InitValue {
  __device__ __forceinline__ static T value() {
    return Limits<T>::max();
  }
};

// Floating-point case: pad with NaN instead.
template <typename T>
struct InitValue<T, cuda::std::enable_if_t<is_floating_v<T>>> {
  __device__ __forceinline__ static T value() {
    return nan_value<T>();
  }
};

// Exchanges the values of `a` and `b` through a temporary copy.
template <typename T>
__device__ __forceinline__ void thread_swap(T& a, T& b) {
  const T saved_a = a;
  a = b;
  b = saved_a;
}

// Ordering used by the sort kernels. For floating-point types a NaN is never
// "less than" anything, while any non-NaN value is less than a NaN.
template <typename T>
struct LessThan {
  // Padding value for slots without a real element.
  __device__ __forceinline__ static T init() {
    return InitValue<T>::value();
  }

  __device__ __forceinline__ bool operator()(T a, T b) const {
    if constexpr (is_floating_v<T>) {
      const bool a_is_nan = cuda::std::isnan(a);
      const bool b_is_nan = cuda::std::isnan(b);
      if (a_is_nan || b_is_nan) {
        // True only for "number < NaN".
        return b_is_nan && !a_is_nan;
      }
    }
    return a < b;
  }
};

// Sorts N_PER_THREAD values held by a single thread with an odd-even
// transposition network. When ARG_SORT is set the index array is permuted
// together with the values; otherwise `idxs` is left untouched.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    int N_PER_THREAD,
    typename CompareOp>
struct ThreadSort {
  __device__ __forceinline__ static void sort(
      ValT (&vals)[N_PER_THREAD],
      IdxT (&idxs)[N_PER_THREAD]) {
    CompareOp less;
#pragma unroll
    for (int pass = 0; pass < N_PER_THREAD; ++pass) {
      // Even passes compare pairs (0,1), (2,3), ...; odd passes (1,2), ...
      const int first = pass % 2;
#pragma unroll
      for (int lo = first; lo + 1 < N_PER_THREAD; lo += 2) {
        const int hi = lo + 1;
        if (less(vals[hi], vals[lo])) {
          thread_swap(vals[hi], vals[lo]);
          if constexpr (ARG_SORT) {
            thread_swap(idxs[hi], idxs[lo]);
          }
        }
      }
    }
  }
};

// Sorts BLOCK_THREADS * N_PER_THREAD elements within one thread block.
// Each thread first sorts its own N_PER_THREAD elements, then sorted runs
// are merged pairwise, doubling the number of cooperating threads each
// round, using the caller-provided buffers `tgp_vals` / `tgp_idxs` for the
// exchange. Index handling is compiled in only when ARG_SORT is true.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD,
    typename CompareOp>
struct BlockMergeSort {
  using thread_sort_t =
      ThreadSort<ValT, IdxT, ARG_SORT, N_PER_THREAD, CompareOp>;

  // Binary search for the split of two sorted runs.
  // Returns how many elements of As (out of A_sz) belong to the first
  // `sort_md` elements of the merge of As and Bs; the remaining
  // sort_md - result elements come from Bs.
  __device__ __forceinline__ static int merge_partition(
      const ValT* As,
      const ValT* Bs,
      int A_sz,
      int B_sz,
      int sort_md) {
    CompareOp op;

    // The split can take at most sort_md elements from either run.
    int A_st = max(0, sort_md - B_sz);
    int A_ed = min(sort_md, A_sz);

    while (A_st < A_ed) {
      int md = A_st + (A_ed - A_st) / 2;
      auto a = As[md];
      auto b = Bs[sort_md - 1 - md];

      if (op(b, a)) {
        A_ed = md;
      } else {
        A_st = md + 1;
      }
    }

    return A_ed;
  }

  // Merges the next N_PER_THREAD elements from the sorted runs As / Bs into
  // `vals` (and, for ARG_SORT, the matching indices into `idxs`).
  // Exhausted runs are treated as if they continued with CompareOp::init().
  __device__ __forceinline__ static void merge_step(
      const ValT* As,
      const ValT* Bs,
      const IdxT* As_idx,
      const IdxT* Bs_idx,
      int A_sz,
      int B_sz,
      ValT (&vals)[N_PER_THREAD],
      IdxT (&idxs)[N_PER_THREAD]) {
    CompareOp op;
    int a_idx = 0;
    int b_idx = 0;

#pragma unroll
    for (int i = 0; i < N_PER_THREAD; ++i) {
      auto a = (a_idx < A_sz) ? As[a_idx] : ValT(CompareOp::init());
      auto b = (b_idx < B_sz) ? Bs[b_idx] : ValT(CompareOp::init());
      // Take from B only if B has elements left and either A is exhausted
      // or b orders strictly before a (ties go to A).
      bool pred = (b_idx < B_sz) && (a_idx >= A_sz || op(b, a));

      vals[i] = pred ? b : a;
      if constexpr (ARG_SORT) {
        if (pred) {
          idxs[i] = Bs_idx[b_idx];
        } else {
          // Both runs exhausted: store a placeholder index.
          idxs[i] = (a_idx < A_sz) ? As_idx[a_idx] : IdxT(0);
        }
      }

      b_idx += int(pred);
      a_idx += int(!pred);
    }
  }

  // Sorts the contents of tgp_vals (and tgp_idxs for ARG_SORT) in place.
  // Must be called by all threads of the block, since it contains
  // __syncthreads() barriers.
  __device__ __forceinline__ static void
  sort(ValT* tgp_vals, IdxT* tgp_idxs, int size_sorted_axis) {
    // First element owned by this thread.
    int idx = threadIdx.x * N_PER_THREAD;

    ValT thread_vals[N_PER_THREAD];
    IdxT thread_idxs[N_PER_THREAD];
#pragma unroll
    for (int i = 0; i < N_PER_THREAD; ++i) {
      thread_vals[i] = tgp_vals[idx + i];
      if constexpr (ARG_SORT) {
        thread_idxs[i] = tgp_idxs[idx + i];
      }
    }

    // Threads whose whole chunk lies at or beyond size_sorted_axis skip the
    // per-thread sort.
    if (idx < size_sorted_axis) {
      thread_sort_t::sort(thread_vals, thread_idxs);
    }

    for (int merge_threads = 2; merge_threads <= BLOCK_THREADS;
         merge_threads *= 2) {
      // Publish the current per-thread runs; the barriers separate the reads
      // of the previous round from these writes, and these writes from the
      // reads below.
      __syncthreads();
#pragma unroll
      for (int i = 0; i < N_PER_THREAD; ++i) {
        tgp_vals[idx + i] = thread_vals[i];
        if constexpr (ARG_SORT) {
          tgp_idxs[idx + i] = thread_idxs[i];
        }
      }
      __syncthreads();

      // Group of `merge_threads` threads that merge one pair of runs, and
      // this thread's position inside that group.
      int merge_group = threadIdx.x / merge_threads;
      int merge_lane = threadIdx.x % merge_threads;

      int sort_sz = N_PER_THREAD * merge_threads;
      int sort_st = N_PER_THREAD * merge_threads * merge_group;

      // The two sorted runs are the halves of [sort_st, sort_st + sort_sz).
      int A_st = sort_st;
      int A_ed = sort_st + sort_sz / 2;
      int B_st = sort_st + sort_sz / 2;
      int B_ed = sort_st + sort_sz;

      const ValT* As = tgp_vals + A_st;
      const ValT* Bs = tgp_vals + B_st;
      int A_sz = A_ed - A_st;
      int B_sz = B_ed - B_st;

      // This thread produces merged elements
      // [sort_md, sort_md + N_PER_THREAD) of the pair.
      int sort_md = N_PER_THREAD * merge_lane;
      int partition = merge_partition(As, Bs, A_sz, B_sz, sort_md);

      // Skip the elements that precede this thread's output range.
      As += partition;
      Bs += sort_md - partition;

      A_sz -= partition;
      B_sz -= sort_md - partition;

      const IdxT* As_idx = ARG_SORT ? tgp_idxs + A_st + partition : nullptr;
      const IdxT* Bs_idx =
          ARG_SORT ? tgp_idxs + B_st + sort_md - partition : nullptr;

      merge_step(As, Bs, As_idx, Bs_idx, A_sz, B_sz, thread_vals, thread_idxs);
    }

    // Wait until every thread finished reading before the final write-back.
    // No barrier follows the write-back here; callers in this file
    // synchronize after sort() returns.
    __syncthreads();
#pragma unroll
    for (int i = 0; i < N_PER_THREAD; ++i) {
      tgp_vals[idx + i] = thread_vals[i];
      if constexpr (ARG_SORT) {
        tgp_idxs[idx + i] = thread_idxs[i];
      }
    }
  }
};

// Sorts one segment that fits into a single thread block
// (at most BLOCK_THREADS * N_PER_THREAD elements). The segment is staged in
// `tgp_vals` / `tgp_idxs`, sorted with BlockMergeSort, and either the sorted
// values or (for ARG_SORT) the sorted indices are written to `out`.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD,
    typename CompareOp = LessThan<T>>
struct KernelMergeSort {
  using ValT = T;
  using IdxT = uint32_t;
  using block_merge_sort_t = BlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;

  static constexpr int N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;

  __device__ __forceinline__ static void block_sort(
      const T* inp,
      U* out,
      int size_sorted_axis,
      int64_t in_stride_sorted_axis,
      int64_t out_stride_sorted_axis,
      int64_t in_stride_segment_axis,
      int64_t out_stride_segment_axis,
      ValT* tgp_vals,
      IdxT* tgp_idxs) {
    // blockIdx.y selects the segment handled by this block.
    const T* segment_in = inp + blockIdx.y * in_stride_segment_axis;
    U* segment_out = out + blockIdx.y * out_stride_segment_axis;

    // Stage the segment; slots past its end are padded with the init value.
    for (int pos = threadIdx.x; pos < N_PER_BLOCK; pos += BLOCK_THREADS) {
      if (pos < size_sorted_axis) {
        tgp_vals[pos] = segment_in[pos * in_stride_sorted_axis];
      } else {
        tgp_vals[pos] = ValT(CompareOp::init());
      }
      if constexpr (ARG_SORT) {
        tgp_idxs[pos] = pos;
      }
    }

    __syncthreads();
    block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis);
    __syncthreads();

    // Only the first size_sorted_axis slots are written back.
    for (int pos = threadIdx.x; pos < size_sorted_axis; pos += BLOCK_THREADS) {
      U* dst = segment_out + pos * out_stride_sorted_axis;
      if constexpr (ARG_SORT) {
        *dst = tgp_idxs[pos];
      } else {
        *dst = tgp_vals[pos];
      }
    }
  }
};

// Single-block sort for inputs whose segments are addressed with one stride:
// the block with index blockIdx.y sorts the segment located at
// blockIdx.y * {in,out}_stride_segment_axis. The index buffer is only
// declared when ARG_SORT is set.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD>
__global__ void block_sort_kernel(
    const T* inp,
    U* out,
    int size_sorted_axis,
    int64_t in_stride_sorted_axis,
    int64_t out_stride_sorted_axis,
    int64_t in_stride_segment_axis,
    int64_t out_stride_segment_axis) {
  using Sorter = KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;

  __shared__ typename Sorter::ValT tgp_vals[Sorter::N_PER_BLOCK];

  // Stays null for a plain value sort.
  typename Sorter::IdxT* tgp_idxs = nullptr;
  if constexpr (ARG_SORT) {
    __shared__ typename Sorter::IdxT idx_storage[Sorter::N_PER_BLOCK];
    tgp_idxs = idx_storage;
  }

  Sorter::block_sort(
      inp,
      out,
      size_sorted_axis,
      in_stride_sorted_axis,
      out_stride_sorted_axis,
      in_stride_segment_axis,
      out_stride_segment_axis,
      tgp_vals,
      tgp_idxs);
}

// Single-block sort for inputs whose segments cannot be addressed with one
// stride: the start of the segment for blockIdx.y is computed with
// elem_to_loc from the shape/strides of the non-sorted axes, and the segment
// strides passed on to block_sort are therefore zero.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD>
__global__ void block_sort_nc_kernel(
    const T* inp,
    U* out,
    int size_sorted_axis,
    int64_t in_stride_sorted_axis,
    int64_t out_stride_sorted_axis,
    const __grid_constant__ Shape nc_shape,
    const __grid_constant__ Strides in_nc_strides,
    const __grid_constant__ Strides out_nc_strides,
    int nc_dim) {
  using Sorter = KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;

  // Locate this block's segment in the input and in the output.
  int64_t in_segment_start = elem_to_loc(
      int64_t(blockIdx.y), nc_shape.data(), in_nc_strides.data(), nc_dim);
  int64_t out_segment_start = elem_to_loc(
      int64_t(blockIdx.y), nc_shape.data(), out_nc_strides.data(), nc_dim);

  const T* segment_in = inp + in_segment_start;
  U* segment_out = out + out_segment_start;

  __shared__ typename Sorter::ValT tgp_vals[Sorter::N_PER_BLOCK];

  // Stays null for a plain value sort.
  typename Sorter::IdxT* tgp_idxs = nullptr;
  if constexpr (ARG_SORT) {
    __shared__ typename Sorter::IdxT idx_storage[Sorter::N_PER_BLOCK];
    tgp_idxs = idx_storage;
  }

  Sorter::block_sort(
      segment_in,
      segment_out,
      size_sorted_axis,
      in_stride_sorted_axis,
      out_stride_sorted_axis,
      0,
      0,
      tgp_vals,
      tgp_idxs);
}

// Building blocks for sorting an axis that spans several thread blocks:
// `block_sort` sorts the N_PER_BLOCK-sized tile owned by blockIdx.x, and
// `merge_partition` finds the split point between two sorted runs.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD,
    typename CompareOp = LessThan<ValT>>
struct KernelMultiBlockMergeSort {
  using block_merge_sort_t = BlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;

  static constexpr int N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;

  // Sorts tile blockIdx.x of the row and writes the sorted values and the
  // indices to `out_vals` / `out_idxs` at the same positions along the axis.
  __device__ __forceinline__ static void block_sort(
      const ValT* inp,
      ValT* out_vals,
      IdxT* out_idxs,
      int size_sorted_axis,
      int64_t stride_sorted_axis,
      ValT* tgp_vals,
      IdxT* tgp_idxs) {
    int tile_start = blockIdx.x * N_PER_BLOCK;

    // Stage the tile; positions past the end of the axis get the init value.
    for (int slot = threadIdx.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      int pos = tile_start + slot;
      if (pos < size_sorted_axis) {
        tgp_vals[slot] = inp[pos * stride_sorted_axis];
      } else {
        tgp_vals[slot] = ValT(CompareOp::init());
      }
      tgp_idxs[slot] = pos;
    }

    __syncthreads();
    block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis);
    __syncthreads();

    // Write back only the positions that exist on the axis.
    for (int slot = threadIdx.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      int pos = tile_start + slot;
      if (pos >= size_sorted_axis) {
        continue;
      }
      out_vals[pos] = tgp_vals[slot];
      out_idxs[pos] = tgp_idxs[slot];
    }
  }

  // Binary search for how many elements of As belong to the first `sort_md`
  // elements of the merge of the sorted runs As (A_sz) and Bs (B_sz).
  __device__ __forceinline__ static int merge_partition(
      const ValT* As,
      const ValT* Bs,
      int A_sz,
      int B_sz,
      int sort_md) {
    CompareOp less;

    int lo = max(0, sort_md - B_sz);
    int hi = min(sort_md, A_sz);

    while (lo < hi) {
      const int mid = lo + (hi - lo) / 2;
      const auto a = As[mid];
      const auto b = Bs[sort_md - 1 - mid];

      if (less(b, a)) {
        hi = mid;
      } else {
        lo = mid + 1;
      }
    }

    return hi;
  }
};

// First stage of the multi-block sort: block (blockIdx.x, blockIdx.y) sorts
// tile blockIdx.x of row blockIdx.y and stores the sorted values and their
// indices in the dense scratch buffers `out_vals` / `out_idxs`, which hold
// `size_sorted_axis` elements per row.
//
// inp:                input data; the row start is located with elem_to_loc
//                     from nc_shape / nc_strides / nc_dim.
// stride_sorted_axis: input stride along the sorted axis.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD>
__global__ void mb_block_sort_kernel(
    const ValT* inp,
    ValT* out_vals,
    IdxT* out_idxs,
    int size_sorted_axis,
    int64_t stride_sorted_axis,
    const __grid_constant__ Shape nc_shape,
    const __grid_constant__ Strides nc_strides,
    int nc_dim) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD>;

  int64_t block_idx = elem_to_loc(
      int64_t(blockIdx.y), nc_shape.data(), nc_strides.data(), nc_dim);

  // Row offsets are computed in 64 bits: blockIdx.y * size_sorted_axis would
  // otherwise be evaluated as a 32-bit unsigned product and wrap around once
  // the scratch buffers exceed 2^32 elements.
  const int64_t row_offset = int64_t(blockIdx.y) * size_sorted_axis;

  inp += block_idx;
  out_vals += row_offset;
  out_idxs += row_offset;

  __shared__ ValT tgp_vals[sort_kernel::N_PER_BLOCK];
  __shared__ IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];

  sort_kernel::block_sort(
      inp,
      out_vals,
      out_idxs,
      size_sorted_axis,
      stride_sorted_axis,
      tgp_vals,
      tgp_idxs);
}

// Partition stage of the multi-block merge: for every tile boundary
// i in [0, n_blocks] of a row, computes where the merged output position
// i * N_PER_BLOCK splits the pair of sorted runs being merged, and stores the
// absolute position inside run A in block_partitions[i].
//
// block_partitions: output, n_blocks + 1 entries per row.
// dev_vals:         sorted runs, size_sorted_axis elements per row.
// dev_idxs:         unused by this kernel.
// merge_tiles:      number of tiles covered by one merged pair of runs.
// n_blocks:         number of tiles per row.
//
// blockIdx.y selects the row; the threads of the block stride over the
// n_blocks + 1 boundaries.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD>
__global__ void mb_block_partition_kernel(
    IdxT* block_partitions,
    const ValT* dev_vals,
    const IdxT* dev_idxs,
    int size_sorted_axis,
    int merge_tiles,
    int n_blocks) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD>;

  (void)dev_idxs;

  // Every row owns exactly n_blocks + 1 partition entries (the loop below
  // writes indices 0..n_blocks), and mb_block_merge_kernel reads rows with a
  // pitch of num_tiles + 1. The row pitch therefore must not depend on the
  // launch block size: using blockDim.x makes rows overlap (or run past the
  // buffer) whenever blockDim.x != n_blocks + 1.
  // Offsets are computed in 64 bits to avoid 32-bit wrap-around on large
  // buffers.
  const int64_t row = blockIdx.y;
  block_partitions += row * (n_blocks + 1);
  dev_vals += row * size_sorted_axis;

  for (int i = threadIdx.x; i <= n_blocks; i += blockDim.x) {
    // Pair of runs this boundary belongs to, and its position inside it.
    int merge_group = i / merge_tiles;
    int merge_lane = i % merge_tiles;

    int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
    int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;

    // Runs A and B are the two halves of the merged range, clamped to the
    // end of the axis.
    int A_st = min(size_sorted_axis, sort_st);
    int A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
    int B_st = A_ed;
    int B_ed = min(size_sorted_axis, B_st + sort_sz / 2);

    int partition_at = min(B_ed - A_st, sort_kernel::N_PER_BLOCK * merge_lane);
    int partition = sort_kernel::merge_partition(
        dev_vals + A_st,
        dev_vals + B_st,
        A_ed - A_st,
        B_ed - B_st,
        partition_at);

    block_partitions[i] = A_st + partition;
  }
}

// Merge stage of the multi-block sort: block (blockIdx.x, blockIdx.y)
// produces output tile blockIdx.x of row blockIdx.y by merging the slices of
// runs A and B delimited by the precomputed `block_partitions`.
//
// block_partitions: num_tiles + 1 entries per row (start of the A slice for
//                   every tile boundary).
// dev_vals_in / dev_idxs_in:   sorted runs from the previous stage.
// dev_vals_out / dev_idxs_out: merged result, same layout
//                              (size_sorted_axis elements per row).
// merge_tiles: number of tiles covered by one merged pair of runs.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    int BLOCK_THREADS,
    int N_PER_THREAD,
    typename CompareOp = LessThan<ValT>>
__global__ void mb_block_merge_kernel(
    const IdxT* block_partitions,
    const ValT* dev_vals_in,
    const IdxT* dev_idxs_in,
    ValT* dev_vals_out,
    IdxT* dev_idxs_out,
    int size_sorted_axis,
    int merge_tiles,
    int num_tiles) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;

  using block_sort_t = typename sort_kernel::block_merge_sort_t;

  // Row offsets are computed in 64 bits: blockIdx.y * size_sorted_axis would
  // otherwise be a 32-bit unsigned product that wraps around once the
  // buffers exceed 2^32 elements.
  const int64_t row = blockIdx.y;
  const int64_t row_offset = row * size_sorted_axis;
  block_partitions += row * (num_tiles + 1);
  dev_vals_in += row_offset;
  dev_idxs_in += row_offset;
  dev_vals_out += row_offset;
  dev_idxs_out += row_offset;

  // Locate this tile inside its merged pair of runs.
  int block_idx = blockIdx.x;
  int merge_group = block_idx / merge_tiles;
  int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;
  int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
  int sort_md = sort_kernel::N_PER_BLOCK * block_idx - sort_st;

  // The A slice comes from the partition table; the B slice follows from the
  // fact that the merged output position equals (taken from A) + (taken
  // from B).
  int A_st = block_partitions[block_idx + 0];
  int A_ed = block_partitions[block_idx + 1];
  int B_st = min(size_sorted_axis, 2 * sort_st + sort_sz / 2 + sort_md - A_st);
  int B_ed = min(
      size_sorted_axis,
      2 * sort_st + sort_sz / 2 + sort_md + sort_kernel::N_PER_BLOCK - A_ed);

  // The last tile of a pair consumes whatever remains of both runs.
  if ((block_idx % merge_tiles) == merge_tiles - 1) {
    A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
    B_ed = min(size_sorted_axis, sort_st + sort_sz);
  }

  int A_sz = A_ed - A_st;
  int B_sz = B_ed - B_st;

  // Load the A slice followed by the B slice; slots beyond A_sz + B_sz are
  // padded with the init value.
  ValT thread_vals[N_PER_THREAD];
  IdxT thread_idxs[N_PER_THREAD];
#pragma unroll
  for (int i = 0; i < N_PER_THREAD; i++) {
    int idx = BLOCK_THREADS * i + threadIdx.x;
    if (idx < (A_sz + B_sz)) {
      thread_vals[i] = (idx < A_sz) ? dev_vals_in[A_st + idx]
                                    : dev_vals_in[B_st + idx - A_sz];
      thread_idxs[i] = (idx < A_sz) ? dev_idxs_in[A_st + idx]
                                    : dev_idxs_in[B_st + idx - A_sz];
    } else {
      thread_vals[i] = CompareOp::init();
      thread_idxs[i] = 0;
    }
  }

  // Stage both slices in shared memory: A at [0, A_sz), B right after it.
  __shared__ ValT tgp_vals[sort_kernel::N_PER_BLOCK];
  __shared__ IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
  __syncthreads();
#pragma unroll
  for (int i = 0; i < N_PER_THREAD; i++) {
    int idx = BLOCK_THREADS * i + threadIdx.x;
    tgp_vals[idx] = thread_vals[i];
    tgp_idxs[idx] = thread_idxs[i];
  }
  __syncthreads();

  // Each thread merges N_PER_THREAD consecutive output elements starting at
  // sort_md_local.
  int sort_md_local = min(A_sz + B_sz, N_PER_THREAD * int(threadIdx.x));

  int A_st_local = block_sort_t::merge_partition(
      tgp_vals, tgp_vals + A_sz, A_sz, B_sz, sort_md_local);
  int A_ed_local = A_sz;

  int B_st_local = sort_md_local - A_st_local;
  int B_ed_local = B_sz;

  int A_sz_local = A_ed_local - A_st_local;
  int B_sz_local = B_ed_local - B_st_local;

  block_sort_t::merge_step(
      tgp_vals + A_st_local,
      tgp_vals + A_ed_local + B_st_local,
      tgp_idxs + A_st_local,
      tgp_idxs + A_ed_local + B_st_local,
      A_sz_local,
      B_sz_local,
      thread_vals,
      thread_idxs);

  // All reads of the staged slices must finish before they are overwritten
  // with the merged result.
  __syncthreads();
#pragma unroll
  for (int i = 0; i < N_PER_THREAD; ++i) {
    int idx = threadIdx.x * N_PER_THREAD;
    tgp_vals[idx + i] = thread_vals[i];
    tgp_idxs[idx + i] = thread_idxs[i];
  }

  __syncthreads();
  // Write the merged tile, skipping positions past the end of the axis.
  int base_idx = blockIdx.x * sort_kernel::N_PER_BLOCK;
  for (int i = threadIdx.x; i < sort_kernel::N_PER_BLOCK; i += BLOCK_THREADS) {
    int idx = base_idx + i;
    if (idx < size_sorted_axis) {
      dev_vals_out[idx] = tgp_vals[i];
      dev_idxs_out[idx] = tgp_idxs[i];
    }
  }
}

} // namespace cu

namespace {

// Sorts (or arg-sorts) `in` along `axis` into `out` with one thread block per
// row. `bn` is forwarded to dispatch_block_dim to select the block size.
// Throws std::runtime_error for complex inputs and, on the contiguous path,
// when a stride of a non-sorted axis exceeds INT32_MAX.
void single_block_sort(
    const Stream& s,
    const array& in,
    array& out,
    int axis,
    int bn,
    bool argsort) {
  int n_rows = in.size() / in.shape(axis);

  // Copies a shape / strides container and removes the sorted axis from it.
  auto without_sorted_axis = [axis](auto dims) {
    dims.erase(dims.begin() + axis);
    return dims;
  };
  auto in_nc_str = without_sorted_axis(in.strides());
  auto out_nc_str = without_sorted_axis(out.strides());
  auto nc_shape = without_sorted_axis(in.shape());

  int nc_dim = nc_shape.size();

  int size_sorted_axis = in.shape(axis);
  int64_t in_stride_sorted_axis = in.strides()[axis];
  int64_t out_stride_sorted_axis = out.strides()[axis];

  // The single-stride kernel is used only when the input is contiguous and
  // the sorted axis has the smallest or the largest stride in both arrays.
  auto has_extreme_stride = [](const array& x, int64_t sort_stride) {
    const auto& all_strides = x.strides();
    const auto extremes =
        std::minmax_element(all_strides.begin(), all_strides.end());
    return sort_stride == *extremes.first || sort_stride == *extremes.second;
  };
  bool contiguous = in.flags().contiguous &&
      has_extreme_stride(in, in_stride_sorted_axis) &&
      has_extreme_stride(out, out_stride_sorted_axis);

  auto& encoder = cu::get_command_encoder(s);
  out.set_data(cu::malloc_async(out.nbytes(), encoder));
  encoder.set_input_array(in);
  encoder.set_output_array(out);

  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    if constexpr (std::is_same_v<CTYPE, complex64_t>) {
      throw std::runtime_error(
          "CUDA backend does not support sorting complex numbers");
    } else {
      using ValT = cuda_type_t<CTYPE>;
      dispatch_block_dim(bn, [&](auto block_dim) {
        constexpr int BLOCK_THREADS = block_dim();
        // No kernel is instantiated or launched for 1024-thread blocks.
        if constexpr (BLOCK_THREADS < 1024) {
          dim3 grid(1, n_rows, 1);
          dim3 block(BLOCK_THREADS, 1, 1);

          dispatch_bool(argsort, [&](auto arg_tag) {
            constexpr bool ARG_SORT = decltype(arg_tag)::value;
            // Arg-sort writes uint32 indices, a plain sort writes values.
            using OutT = std::conditional_t<ARG_SORT, uint32_t, ValT>;

            if (!contiguous) {
              // General layout: the kernel locates each row from the
              // shape / strides of the non-sorted axes.
              auto nc_kernel = cu::block_sort_nc_kernel<
                  ValT,
                  OutT,
                  ARG_SORT,
                  BLOCK_THREADS,
                  N_PER_THREAD>;
              auto nc_shape_param = const_param(nc_shape);
              auto in_nc_strides_param = const_param(in_nc_str);
              auto out_nc_strides_param = const_param(out_nc_str);
              encoder.add_kernel_node(
                  nc_kernel,
                  grid,
                  block,
                  gpu_ptr<ValT>(in),
                  gpu_ptr<OutT>(out),
                  size_sorted_axis,
                  in_stride_sorted_axis,
                  out_stride_sorted_axis,
                  nc_shape_param,
                  in_nc_strides_param,
                  out_nc_strides_param,
                  nc_dim);
              return;
            }

            auto contiguous_kernel = cu::block_sort_kernel<
                ValT,
                OutT,
                ARG_SORT,
                BLOCK_THREADS,
                N_PER_THREAD>;
            // Row pitch: the smallest stride among the non-sorted axes with
            // extent != 1 (stays INT64_MAX if there is no such axis).
            int64_t in_stride_segment_axis = INT64_MAX;
            int64_t out_stride_segment_axis = INT64_MAX;
            for (size_t d = 0; d < nc_shape.size(); ++d) {
              if (nc_shape[d] == 1) {
                continue;
              }
              if (in_nc_str[d] > INT32_MAX || out_nc_str[d] > INT32_MAX) {
                throw std::runtime_error(
                    "[Sort::eval_gpu] Stride too large.");
              }
              in_stride_segment_axis =
                  std::min(in_stride_segment_axis, in_nc_str[d]);
              out_stride_segment_axis =
                  std::min(out_stride_segment_axis, out_nc_str[d]);
            }
            encoder.add_kernel_node(
                contiguous_kernel,
                grid,
                block,
                gpu_ptr<ValT>(in),
                gpu_ptr<OutT>(out),
                size_sorted_axis,
                in_stride_sorted_axis,
                out_stride_sorted_axis,
                in_stride_segment_axis,
                out_stride_segment_axis);
          });
        }
      });
    }
  });
}

// Sort (or argsort) `in` along `axis` using several thread blocks per slice.
//
// The work is enqueued on the command encoder of stream `s` in three stages:
//   1. `mb_block_sort_kernel` reads `in` and writes values and uint32 indices
//      into temporary (n_rows, size_sorted_axis) buffers.
//   2. A loop of partition + merge kernels combines the per-block results,
//      ping-ponging between the `*_in` and `*_out` temporaries.
//   3. The final buffer (indices when `argsort` is true, values otherwise) is
//      either handed to `out` directly or copied into it.
//
// `axis` is used as a direct index here, so it is expected to be
// non-negative. Throws std::runtime_error for complex64 inputs.
void multi_block_sort(
    const Stream& s,
    const array& in,
    array& out,
    int axis,
    int n_blocks,
    bool argsort) {
  // Number of 1-D slices taken along the sorted axis.
  int n_rows = in.size() / in.shape(axis);

  // Strides and shape of the input with the sorted axis removed.
  auto nc_str = in.strides();
  nc_str.erase(nc_str.begin() + axis);

  auto nc_shape = in.shape();
  nc_shape.erase(nc_shape.begin() + axis);

  int nc_dim = nc_shape.size();

  // 1-D input: there is no remaining axis, so pass single-element
  // placeholders (nc_dim itself stays 0).
  if (nc_dim == 0) {
    nc_shape = {0};
    nc_str = {1};
  }

  int size_sorted_axis = in.shape(axis);
  int64_t stride_sorted_axis = in.strides()[axis];

  // Ping-pong buffers for values and indices, one row per slice.
  array dev_vals_in({n_rows, size_sorted_axis}, in.dtype(), nullptr, {});
  array dev_vals_out({n_rows, size_sorted_axis}, in.dtype(), nullptr, {});

  array dev_idxs_in({n_rows, size_sorted_axis}, uint32, nullptr, {});
  array dev_idxs_out({n_rows, size_sorted_axis}, uint32, nullptr, {});

  // n_blocks + 1 uint32 entries per row, written by the partition kernel and
  // read by the merge kernel.
  array block_partitions({n_rows, n_blocks + 1}, uint32, nullptr, {});

  auto& encoder = cu::get_command_encoder(s);

  dev_vals_in.set_data(cu::malloc_async(dev_vals_in.nbytes(), encoder));
  dev_vals_out.set_data(cu::malloc_async(dev_vals_out.nbytes(), encoder));
  dev_idxs_in.set_data(cu::malloc_async(dev_idxs_in.nbytes(), encoder));
  dev_idxs_out.set_data(cu::malloc_async(dev_idxs_out.nbytes(), encoder));
  block_partitions.set_data(
      cu::malloc_async(block_partitions.nbytes(), encoder));

  encoder.add_temporary(block_partitions);

  dispatch_all_types(in.dtype(), [&](auto type_tag) {
    using CTYPE = MLX_GET_TYPE(type_tag);
    if constexpr (!std::is_same_v<CTYPE, complex64_t>) {
      using ValT = cuda_type_t<CTYPE>;
      using IdxT = uint32_t;
      // Fewer threads per block for 8-byte value types.
      constexpr int BLOCK_THREADS = sizeof(ValT) == 8 ? 256 : 512;
      dim3 grid(n_blocks, n_rows, 1);
      dim3 block(BLOCK_THREADS, 1, 1);

      dispatch_bool(argsort, [&](auto arg_tag) {
        constexpr bool ARG_SORT = decltype(arg_tag)::value;
        auto nc_shape_param = const_param(nc_shape);
        auto nc_strides_param = const_param(nc_str);

        // Stage 1: per-block sort from `in` into the `*_in` temporaries.
        auto block_sort_kernel = cu::mb_block_sort_kernel<
            ValT,
            IdxT,
            ARG_SORT,
            BLOCK_THREADS,
            N_PER_THREAD>;
        encoder.set_input_array(in);
        encoder.set_output_array(dev_vals_in);
        encoder.set_output_array(dev_idxs_in);
        encoder.add_kernel_node(
            block_sort_kernel,
            grid,
            block,
            gpu_ptr<ValT>(in),
            gpu_ptr<ValT>(dev_vals_in),
            gpu_ptr<IdxT>(dev_idxs_in),
            size_sorted_axis,
            stride_sorted_axis,
            nc_shape_param,
            nc_strides_param,
            nc_dim);

        // Threads for the partition kernel: min(n_blocks + 1, 1024).
        int n_thr_per_group = (n_blocks + 1) < 1024 ? (n_blocks + 1) : 1024;

        // Stage 2: merge_tiles takes the values 2, 4, 8, ... and the loop
        // stops once merge_tiles / 2 reaches n_blocks.
        for (int merge_tiles = 2; (merge_tiles / 2) < n_blocks;
             merge_tiles *= 2) {
          auto partition_kernel = cu::mb_block_partition_kernel<
              ValT,
              IdxT,
              ARG_SORT,
              BLOCK_THREADS,
              N_PER_THREAD>;

          encoder.set_input_array(dev_vals_in);
          encoder.set_input_array(dev_idxs_in);
          encoder.set_output_array(block_partitions);

          encoder.add_kernel_node(
              partition_kernel,
              dim3(1, n_rows, 1),
              dim3(n_thr_per_group, 1, 1),
              gpu_ptr<IdxT>(block_partitions),
              gpu_ptr<ValT>(dev_vals_in),
              gpu_ptr<IdxT>(dev_idxs_in),
              size_sorted_axis,
              merge_tiles,
              n_blocks);

          auto merge_kernel = cu::mb_block_merge_kernel<
              ValT,
              IdxT,
              ARG_SORT,
              BLOCK_THREADS,
              N_PER_THREAD>;

          encoder.set_input_array(dev_vals_in);
          encoder.set_input_array(dev_idxs_in);
          encoder.set_input_array(block_partitions);
          encoder.set_output_array(dev_vals_out);
          encoder.set_output_array(dev_idxs_out);

          encoder.add_kernel_node(
              merge_kernel,
              dim3(n_blocks, n_rows, 1),
              dim3(BLOCK_THREADS, 1, 1),
              gpu_ptr<IdxT>(block_partitions),
              gpu_ptr<ValT>(dev_vals_in),
              gpu_ptr<IdxT>(dev_idxs_in),
              gpu_ptr<ValT>(dev_vals_out),
              gpu_ptr<IdxT>(dev_idxs_out),
              size_sorted_axis,
              merge_tiles,
              n_blocks);
          // Swap so the most recent result is always in the `*_in` arrays.
          std::swap(dev_vals_in, dev_vals_out);
          std::swap(dev_idxs_in, dev_idxs_out);
        }
      });
    } else {
      throw std::runtime_error(
          "CUDA backend does not support sorting complex numbers");
    }
  });

  // Stage 3: the `*_out` scratch buffers and whichever of values / indices is
  // not the requested result are registered as temporaries.
  encoder.add_temporary(dev_vals_out);
  encoder.add_temporary(dev_idxs_out);
  encoder.add_temporary(argsort ? dev_vals_in : dev_idxs_in);
  if (axis == in.ndim() - 1) {
    // Copy buffer to out, no need for temporary
    out.copy_shared_buffer(
        argsort ? dev_idxs_in : dev_vals_in,
        out.strides(),
        out.flags(),
        out.size());
  } else {
    // The result buffer has the sorted axis innermost, so it has to be copied
    // into a freshly allocated `out` with adjusted source strides.
    encoder.add_temporary(argsort ? dev_idxs_in : dev_vals_in);
    out.set_data(cu::malloc_async(out.nbytes(), encoder));
    // Source strides expressed in out's axis order: the sorted axis gets
    // stride 1 and every later axis is scaled by the sorted axis length.
    // NOTE(review): this presumably relies on `out` having row-contiguous
    // strides -- confirm against the caller.
    auto strides = out.strides();
    for (int ax = axis + 1; ax < strides.size(); ax++) {
      strides[ax] *= out.shape(axis);
    }
    strides[axis] = 1;
    copy_gpu_inplace(
        (argsort) ? dev_idxs_in : dev_vals_in,
        out,
        out.shape(),
        strides,
        out.strides(),
        0,
        0,
        CopyType::General,
        s);
  }
}

// Pick a block size for the sorted axis and dispatch to the single-block or
// multi-block merge sort accordingly. `axis_` may be negative.
void gpu_merge_sort(
    const Stream& s,
    const array& in,
    array& out,
    int axis_,
    bool argsort) {
  const int axis = axis_ < 0 ? axis_ + in.ndim() : axis_;
  const int size_sorted_axis = in.shape(axis);

  constexpr int tn = N_PER_THREAD;
  // Threads needed if each thread handles `tn` elements.
  const int threads_needed = (size_sorted_axis + tn - 1) / tn;

  // Smallest power of two in [32, 512] that is >= threads_needed, capped at
  // 512.
  int bn = 32;
  while (bn < 512 && threads_needed > bn) {
    bn *= 2;
  }

  // Element types wider than 4 bytes are limited to 256 threads per block.
  if (bn == 512 && size_of(in.dtype()) > 4) {
    bn = 256;
  }

  const int n_per_block = bn * tn;
  const int n_blocks = (size_sorted_axis + n_per_block - 1) / n_per_block;

  if (n_blocks > 1) {
    multi_block_sort(s, in, out, axis, n_blocks, argsort);
  } else {
    single_block_sort(s, in, out, axis, bn, argsort);
  }
}

// Common entry point for the sort-like primitives: sorts `in` along `axis`
// into `out`, producing indices instead of values when `argsort` is true.
// All requests are currently served by the merge sort implementation, which
// obtains the command encoder for `s` itself.
void gpu_sort(
    const Stream& s,
    const array& in,
    array& out,
    int axis,
    bool argsort) {
  gpu_merge_sort(s, in, out, axis, argsort);
}

} // namespace

void ArgSort::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("ArgSort::eval_gpu");
  assert(inputs.size() == 1);
  gpu_sort(stream(), inputs[0], out, axis_, true);
}

void Sort::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Sort::eval_gpu");
  assert(inputs.size() == 1);
  gpu_sort(stream(), inputs[0], out, axis_, false);
}

void ArgPartition::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("ArgPartition::eval_gpu");
  gpu_sort(stream(), inputs[0], out, axis_, true);
}

void Partition::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Partition::eval_gpu");
  gpu_sort(stream(), inputs[0], out, axis_, false);
}

} // namespace mlx::core

================================================
FILE: mlx/backend/cuda/steel/defines.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

// Expands to `#pragma unroll`; place it directly before a loop.
#define MLX_UNROLL _Pragma("unroll")

// Defined only while compiling device code for compute capability 8.0 or
// newer. Code can test it to guard instructions that require sm_80.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
#define MLX_CUDA_SM_80_ENABLED
#endif


================================================
FILE: mlx/backend/cuda/steel/gemm.cuh
================================================

#include "mlx/backend/cuda/steel/mma.cuh"
#include "mlx/backend/cuda/steel/tiles.cuh"

namespace mlx::core::cu {

/**
 * An example gemm written with the utils.
 *
 * Computes A @ B.T when A and B are all aligned with the block sizes.
 *
 * Each thread block produces one BM x BN tile of `y`: blockIdx.y selects the
 * BM rows of `a` and blockIdx.x selects the BN rows of `b`. `K` is the row
 * length of both `a` and `b`, and `N` is the row length of `y`. The block is
 * split into 2 x 2 warps (presumably launched with 128 threads -- confirm at
 * the launch site).
 *
 * Dynamic shared memory holds two BM x BK tiles of `a` followed by two
 * BN x BK tiles of `b`, used as a double buffer.
 */
template <typename T, int BM, int BN, int BK>
__global__ void ab_t_aligned(const T* a, const T* b, T* y, int N, int K) {
  constexpr int WARPS_M = 2;
  constexpr int WARPS_N = 2;
  constexpr int NUM_WARPS = WARPS_M * WARPS_N;
  constexpr int WARP_STEP_M = BM / WARPS_M;
  constexpr int WARP_STEP_N = BN / WARPS_N;

  // Precompute some offsets for each thread
  const int warpid = threadIdx.x / 32;
  const int laneid = threadIdx.x % 32;
  const int wm = warpid / WARPS_N;
  const int wn = warpid % WARPS_N;
  const int offset_m = wm * WARP_STEP_M;
  const int offset_n = wn * WARP_STEP_N;

  // Allocate shared memory
  // `as` occupies the first 2 * BM * BK elements, `bs` starts right after.
  extern __shared__ char shmem[];
  SharedTile<T, BM, BK>(&as)[2] = *(SharedTile<T, BM, BK>(*)[2])(&shmem[0]);
  SharedTile<T, BN, BK>(&bs)[2] =
      *(SharedTile<T, BN, BK>(*)[2])(&shmem[sizeof(T) * 2 * BM * BK]);

  // Allocate registers for the MMA
  // C accumulates in float; A and B hold one 16-wide slice of K at a time.
  RegisterTile<float, BM / WARPS_M, BN / WARPS_N> C;
  RegisterTile<T, BM / WARPS_M, 16> A;
  RegisterTile<T, BN / WARPS_N, 16> B;

  // Move the global pointers to the tile
  a += blockIdx.y * BM * K;
  b += blockIdx.x * BN * K;
  y += blockIdx.y * BM * N + blockIdx.x * BN;

  // Zero the accumulators
  C.fill(0);

  // Start the SM pipeline
  // The first K-block is loaded into buffer 0 before the main loop.
  load_async<NUM_WARPS>(as[0], as[0].base_addr(), a, K);
  load_async<NUM_WARPS>(bs[0], bs[0].base_addr(), b, K);
  cp_async_commit();

  // `tic` is the buffer being consumed, `tic ^ 1` the one being filled.
  int tic = 0;
  for (int k_block = BK; k_block < K; k_block += BK) {
    // Prefetch the next K-block, then wait until only that newest group is
    // still pending, i.e. the data for buffer `tic` has arrived.
    // NOTE(review): there is no barrier between the reads of buffer `tic`
    // below and the async writes into the same buffer issued at the top of
    // the next iteration -- confirm that a faster warp cannot overwrite data
    // another warp is still reading.
    load_async<NUM_WARPS>(as[tic ^ 1], as[tic ^ 1].base_addr(), a + k_block, K);
    load_async<NUM_WARPS>(bs[tic ^ 1], bs[tic ^ 1].base_addr(), b + k_block, K);
    cp_async_commit();
    cp_async_wait<1>();
    __syncthreads();

    MLX_UNROLL
    for (int k = 0; k < BK / 16; k++) {
      A.load(
          as[tic],
          as[tic].base_addr(),
          offset_m + laneid % 16,
          k * 16 + laneid / 16 * 8);
      B.load(
          bs[tic],
          bs[tic].base_addr(),
          offset_n + laneid % 16,
          k * 16 + laneid / 16 * 8);

      mma_t(C, A, B);
    }

    tic ^= 1;
  }

  // Empty the pipeline
  // The last K-block has been requested but not yet consumed.
  cp_async_wait_all();
  __syncthreads();
  MLX_UNROLL
  for (int k = 0; k < BK / 16; k++) {
    A.load(
        as[tic],
        as[tic].base_addr(),
        offset_m + laneid % 16,
        k * 16 + laneid / 16 * 8);
    B.load(
        bs[tic],
        bs[tic].base_addr(),
        offset_n + laneid % 16,
        k * 16 + laneid / 16 * 8);

    mma_t(C, A, B);
  }

  // Each warp writes its sub-tile of the block's output tile.
  C.store_global(y, N, offset_m, offset_n);
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/steel/mma.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/steel/defines.cuh"
#include "mlx/backend/cuda/steel/tiles.cuh"

namespace mlx::core::cu {

/**
 * Fallback mma.
 *
 * Selected for any tile type combination without a dedicated overload. It
 * intentionally does nothing, so C is left unchanged.
 *
 * TODO: We should probably either implement a real fallback or make unsupported
 * type combinations a compile-time error.
 */
template <typename U, typename T>
__device__ inline void
mma_t(Tile16x16<U>& C, Tile16x16<T>& A, Tile16x16<T>& B) {}

/**
 * Multiply the 16x16 bfloat16 tiles and accumulate the result in one 16x16
 * float tile.
 *
 * We actually perform C += A @ B.T
 *
 * The product is issued as two m16n8k16 mma instructions. Both read all four
 * registers of A; the first uses B.values[0] and B.values[2] and accumulates
 * into C.values[0] and C.values[1], the second uses B.values[1] and
 * B.values[3] and accumulates into C.values[2] and C.values[3].
 *
 * When MLX_CUDA_SM_80_ENABLED is not defined the body is empty and C is left
 * unchanged.
 */
__device__ __forceinline__ void mma_t(
    Tile16x16<float>& C,
    Tile16x16<__nv_bfloat16>& A,
    Tile16x16<__nv_bfloat16>& B) {
#if defined(MLX_CUDA_SM_80_ENABLED)
  asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
      "{%0, %1, %2, %3}, "
      "{%4, %5, %6, %7}, "
      "{%8, %9}, "
      "{%10, %11, %12, %13};"

      // D matrix
      : "+f"(C.values[0].x),
        "+f"(C.values[0].y),
        "+f"(C.values[1].x),
        "+f"(C.values[1].y)

      // A matrix
      : "r"(*(uint32_t*)(&A.values[0])),
        "r"(*(uint32_t*)(&A.values[1])),
        "r"(*(uint32_t*)(&A.values[2])),
        "r"(*(uint32_t*)(&A.values[3])),

        // B matrix
        "r"(*(uint32_t*)(&B.values[0])),
        "r"(*(uint32_t*)(&B.values[2])),

        // C matrix
        "f"(C.values[0].x),
        "f"(C.values[0].y),
        "f"(C.values[1].x),
        "f"(C.values[1].y));
  asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
      "{%0, %1, %2, %3}, "
      "{%4, %5, %6, %7}, "
      "{%8, %9}, "
      "{%10, %11, %12, %13};"

      // D matrix
      : "+f"(C.values[2].x),
        "+f"(C.values[2].y),
        "+f"(C.values[3].x),
        "+f"(C.values[3].y)

      // A matrix
      : "r"(*(uint32_t*)(&A.values[0])),
        "r"(*(uint32_t*)(&A.values[1])),
        "r"(*(uint32_t*)(&A.values[2])),
        "r"(*(uint32_t*)(&A.values[3])),

        // B matrix
        "r"(*(uint32_t*)(&B.values[1])),
        "r"(*(uint32_t*)(&B.values[3])),

        // C matrix
        "f"(C.values[2].x),
        "f"(C.values[2].y),
        "f"(C.values[3].x),
        "f"(C.values[3].y));
#endif
}

/**
 * Register-tile level C += A @ B.T.
 *
 * Walks the 16x16 sub-tiles of the operands and forwards every
 * (row tile, column tile, depth tile) combination to the Tile16x16 overload
 * of mma_t.
 */
template <typename U, typename T, int M, int N, int K>
__device__ __forceinline__ void mma_t(
    RegisterTile<U, M, N>& C,
    RegisterTile<T, M, K>& A,
    RegisterTile<T, N, K>& B) {
  using LhsTile = RegisterTile<T, M, K>;
  using RhsTile = RegisterTile<T, N, K>;

  constexpr int ROW_TILES = LhsTile::TILES_Y;
  constexpr int DEPTH_TILES = LhsTile::TILES_X;
  constexpr int COL_TILES = RhsTile::TILES_Y;

  MLX_UNROLL
  for (int kk = 0; kk < DEPTH_TILES; kk++) {
    MLX_UNROLL
    for (int mm = 0; mm < ROW_TILES; mm++) {
      auto& a_frag = A.data[mm * DEPTH_TILES + kk];
      MLX_UNROLL
      for (int nn = 0; nn < COL_TILES; nn++) {
        auto& b_frag = B.data[nn * DEPTH_TILES + kk];
        auto& c_frag = C.data[mm * COL_TILES + nn];
        mma_t(c_frag, a_frag, b_frag);
      }
    }
  }
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/steel/tiles.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/steel/utils.cuh"
#include "mlx/backend/cuda/vector_types.cuh"

namespace mlx::core::cu {

/**
 * The basic building block for Ampere mmas. A 16x16 tile distributed across
 * the warp.
 *
 * Each thread holds 8 values. They are distributed according to
 * https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-fragment-mma-16816-float
 *
 * For use instructions see the individual methods eg load().
 */
template <typename T>
struct Tile16x16 {
  using T2 = Vector2_t<T>;

  // The 8 per-thread values, stored as 4 pairs.
  T2 values[4];

  // Set every value held by this thread to `v`.
  __device__ inline void fill(T v) {
    T2 v2 = {v, v};
    for (int i = 0; i < 4; i++) {
      values[i] = v2;
    }
  }

  /**
   * Load a 16x16 tile from shared memory.
   *
   * The instruction is a bit weird in the sense that the address provided by
   * each thread and the elements loaded are not the same.
   *
   * We load 4 8x8 tiles. The tile rows are stored contiguously in memory. As a
   * result the warp provides 4*8 = 32 addresses one per row.
   *
   * Threads 0-7 provide the addresses for the first tile, 8-15 for the second
   * and so on. For instance to load a non swizzled tile we would do
   *
   *    base_addr + (laneid % 16) * BK + (laneid / 16) * 8
   *
   * Only implemented for bfloat16 and half tiles; for any other T this
   * function does nothing and `values` is left untouched.
   *
   * See
   * https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-ldmatrix
   */
  __device__ __forceinline__ void load(uint32_t row_address) {
    if constexpr (
        std::is_same_v<T2, __nv_bfloat162> || std::is_same_v<T2, __half2>) {
      asm volatile(
          "ldmatrix.sync.aligned.m8n8.x4.shared::cta.b16 {%0, %1, %2, %3}, [%4];\n"
          : "=r"(*(uint32_t*)&(values[0])),
            "=r"(*(uint32_t*)&(values[1])),
            "=r"(*(uint32_t*)&(values[2])),
            "=r"(*(uint32_t*)&(values[3]))
          : "r"(row_address));
    }
  }

  /**
   * Store the tile to the address pointed to by `x`.
   *
   * The provided pointer is a generic pointer but this is meant to be used to
   * store to global memory. For storing to shared memory we should use
   * `stmatrix`.
   *
   * This also showcases the format of the tile quite nicely. Each register is
   * holding two adjacent values. The indices are
   *
   *    row + 0, col + 0
   *    row + 8, col + 0
   *    row + 0, col + 8
   *    row + 8, col + 8
   *
   * Given that we are dealing with Vector2_t<U> the column offsets are 4
   * instead of 8.
   *
   * `N` is the row length of the destination in elements of U; rows are
   * indexed with N / 2 because stores are done in pairs (so N is presumably
   * expected to be even -- confirm with callers).
   *
   * Supported combinations are U == T and float -> bfloat16 (rounded with
   * __floats2bfloat162_rn). Any other combination stores nothing.
   */
  template <typename U>
  __device__ inline void store_global(U* x, int N) {
    using U2 = Vector2_t<U>;
    U2* x2 = reinterpret_cast<U2*>(x);
    const int laneid = threadIdx.x % 32;
    const int row = laneid / 4;
    const int col = laneid % 4;
    if constexpr (std::is_same_v<U2, T2>) {
      x2[(row + 0) * (N / 2) + col + 0] = values[0];
      x2[(row + 0) * (N / 2) + col + 4] = values[2];
      x2[(row + 8) * (N / 2) + col + 0] = values[1];
      x2[(row + 8) * (N / 2) + col + 4] = values[3];
    } else if constexpr (
        std::is_same_v<T2, float2> && std::is_same_v<U, __nv_bfloat16>) {
      x2[(row + 0) * (N / 2) + col + 0] =
          __floats2bfloat162_rn(values[0].x, values[0].y);
      x2[(row + 0) * (N / 2) + col + 4] =
          __floats2bfloat162_rn(values[2].x, values[2].y);
      x2[(row + 8) * (N / 2) + col + 0] =
          __floats2bfloat162_rn(values[1].x, values[1].y);
      x2[(row + 8) * (N / 2) + col + 4] =
          __floats2bfloat162_rn(values[3].x, values[3].y);
    }
  }

  /**
   * Row-guarded variant of store_global.
   *
   * Writes one element at a time (converting with static_cast<U>) and skips
   * rows that are not below `max_rows`, counted from the top of this tile.
   * Columns are not bounds checked.
   */
  template <typename U>
  __device__ inline void store_global_safe(U* x, int N, int max_rows) {
    const int laneid = threadIdx.x % 32;
    const int row = laneid / 4;
    const int col = laneid % 4;
    if (row < max_rows) {
      x[(row + 0) * N + 2 * col + 0] = static_cast<U>(values[0].x);
      x[(row + 0) * N + 2 * col + 1] = static_cast<U>(values[0].y);
      x[(row + 0) * N + 2 * col + 8] = static_cast<U>(values[2].x);
      x[(row + 0) * N + 2 * col + 9] = static_cast<U>(values[2].y);
    }
    if (row + 8 < max_rows) {
      x[(row + 8) * N + 2 * col + 0] = static_cast<U>(values[1].x);
      x[(row + 8) * N + 2 * col + 1] = static_cast<U>(values[1].y);
      x[(row + 8) * N + 2 * col + 8] = static_cast<U>(values[3].x);
      x[(row + 8) * N + 2 * col + 9] = static_cast<U>(values[3].y);
    }
  }
};

/**
 * A simple container of multiple Tile16x16.
 *
 * Provides utility functions for loading and manipulating collections of basic
 * tiles. The ROWS_ x COLS_ tile is split into TILES_Y x TILES_X sub-tiles of
 * 16x16 stored row-major in `data`.
 *
 * NOTE: this template used to be defined twice in this header; the second
 * definition was a strict subset of this one and has been folded in here.
 */
template <typename T, int ROWS_, int COLS_>
struct RegisterTile {
  static constexpr int ROWS = ROWS_;
  static constexpr int COLS = COLS_;
  static constexpr int TILES_X = COLS / 16;
  static constexpr int TILES_Y = ROWS / 16;

  Tile16x16<T> data[TILES_X * TILES_Y];

  // Set every value of every sub-tile to `v`.
  __device__ inline void fill(T v) {
    MLX_UNROLL
    for (int i = 0; i < TILES_Y; i++) {
      MLX_UNROLL
      for (int j = 0; j < TILES_X; j++) {
        data[i * TILES_X + j].fill(v);
      }
    }
  }

  // Load every sub-tile from the shared tile `tile`. `base_address` is the
  // shared memory address of the tile and (row, col) is this thread's
  // position for the top-left sub-tile; `tile.loc` resolves the final address.
  template <typename Tile>
  __device__ __forceinline__ void
  load(Tile& tile, uint32_t base_address, int row, int col) {
    MLX_UNROLL
    for (int i = 0; i < TILES_Y; i++) {
      MLX_UNROLL
      for (int j = 0; j < TILES_X; j++) {
        data[i * TILES_X + j].load(
            tile.loc(base_address, row + i * 16, col + j * 16));
      }
    }
  }

  // Same traversal as above but delegates the load of each sub-tile to the
  // callable `f(sub_tile, tile, base_address, row, col)`.
  template <typename Tile, typename F>
  __device__ __forceinline__ void
  load(Tile& tile, F f, uint32_t base_address, int row, int col) {
    MLX_UNROLL
    for (int i = 0; i < TILES_Y; i++) {
      MLX_UNROLL
      for (int j = 0; j < TILES_X; j++) {
        f(data[i * TILES_X + j],
          tile,
          base_address,
          row + i * 16,
          col + j * 16);
      }
    }
  }

  // Store every sub-tile to `x`, a matrix with row length `N`, with the
  // top-left corner of this register tile placed at (row, col).
  template <typename U>
  __device__ inline void store_global(U* x, int N, int row, int col) {
    MLX_UNROLL
    for (int i = 0; i < TILES_Y; i++) {
      MLX_UNROLL
      for (int j = 0; j < TILES_X; j++) {
        data[i * TILES_X + j].store_global(
            x + (row + i * 16) * N + col + j * 16, N);
      }
    }
  }

  // Like store_global but rows at or beyond `max_rows` (in the coordinates of
  // `x`) are not written.
  template <typename U>
  __device__ inline void
  store_global_safe(U* x, int N, int row, int col, int max_rows) {
    MLX_UNROLL
    for (int i = 0; i < TILES_Y; i++) {
      MLX_UNROLL
      for (int j = 0; j < TILES_X; j++) {
        data[i * TILES_X + j].store_global_safe(
            x + (row + i * 16) * N + col + j * 16, N, max_rows - row - i * 16);
      }
    }
  }
};

/**
 * A ROWS_ x COLS_ tile of T, addressed through a swizzle.
 *
 * Elements should be accessed through ptr()/loc()/operator()/store() rather
 * than by indexing `data` directly, since the swizzle changes where the
 * element at (row, col) is placed.
 */
template <typename T, int ROWS_, int COLS_>
struct SharedTile {
  static constexpr int ROWS = ROWS_;
  static constexpr int COLS = COLS_;
  static constexpr int TILES_X = COLS / 16;
  static constexpr int TILES_Y = ROWS / 16;
  static constexpr int NUMEL = ROWS * COLS;

  // Swizzle taken from ThunderKittens. Should be changed when we switch to
  // cute Layouts.
  //
  // See include/types/shared/st.cuh
  //
  // I do feel that it is too math heavy and can be improved. Also the math is
  // done every time although the addresses don't change from load to load. I
  // guess we are expecting the compiler to figure that out.
  //
  // The value is 128, 64 or 32 for 2-byte types and 128 or 64 for 4-byte
  // types depending on TILES_X, and 0 (no swizzle) for any other element size.
  static constexpr int swizzle_bytes =
      (sizeof(T) == 2 ? (TILES_X % 4 == 0 ? 128 : (TILES_X % 2 == 0 ? 64 : 32))
                      : (sizeof(T) == 4 ? (TILES_X % 2 == 0 ? 128 : 64) : 0));

  T data[ROWS * COLS];

  // Address of the first element converted with __cvta_generic_to_shared and
  // truncated to 32 bits.
  __device__ inline uint32_t base_addr() const {
    return __cvta_generic_to_shared(&data[0]);
  }

  // Return a pointer to the element at (row, col) using the swizzle.
  //
  // With a swizzle the tile is laid out as column groups of
  // swizzle_bytes / sizeof(T) columns, each group holding all ROWS rows, and
  // the resulting address is XORed with bits derived from the address itself.
  __device__ static inline T* ptr(T* ptr, int row, int col) {
    if constexpr (swizzle_bytes > 0) {
      static constexpr int swizzle_repeat = swizzle_bytes * 8;
      static constexpr int subtile_cols = swizzle_bytes / sizeof(T);
      const int outer_idx = col / subtile_cols;
      const uint64_t addr =
          (uint64_t)(&ptr
                         [outer_idx * ROWS * subtile_cols + row * subtile_cols +
                          col % subtile_cols]);
      const int swizzle = ((addr % swizzle_repeat) >> 7) << 4;
      return (T*)(addr ^ swizzle);
    } else {
      return ptr + row * COLS + col;
    }
  }

  // Return the location of the element at (row, col) using the swizzle.
  //
  // Same computation as ptr() but on a 32-bit address such as the one
  // returned by base_addr(), so offsets are scaled by sizeof(T).
  __device__ static inline uint32_t loc(uint32_t ptr, int row, int col) {
    if constexpr (swizzle_bytes > 0) {
      static constexpr int swizzle_repeat = swizzle_bytes * 8;
      static constexpr int subtile_cols = swizzle_bytes / sizeof(T);
      const int outer_idx = col / subtile_cols;
      const uint32_t addr = ptr +
          sizeof(T) *
              (outer_idx * ROWS * subtile_cols + row * subtile_cols +
               col % subtile_cols);
      const int swizzle = ((addr % swizzle_repeat) >> 7) << 4;
      return (addr ^ swizzle);
    } else {
      return ptr + sizeof(T) * (row * COLS + col);
    }
  }

  // Convenience functions to edit elements going through the swizzle.
  __device__ inline T& operator()(int row, int col) {
    return *ptr(data, row, col);
  }
  // Vectorized stores of 16, 8 and 4 bytes starting at (row, col). The
  // destination is reinterpreted as the vector type, so it is presumably
  // expected to be suitably aligned -- confirm with callers.
  __device__ inline void store(float4& v, int row, int col) {
    *(reinterpret_cast<float4*>(ptr(data, row, col))) = v;
  }
  __device__ inline void store(float2& v, int row, int col) {
    *(reinterpret_cast<float2*>(ptr(data, row, col))) = v;
  }
  __device__ inline void store(float& v, int row, int col) {
    *(reinterpret_cast<float*>(ptr(data, row, col))) = v;
  }
  // Store N consecutive elements starting at (row, col). Arrays of 4, 8 or 16
  // bytes are forwarded to the vectorized overloads; any other size is
  // written one element at a time.
  template <int N>
  __device__ inline void store(T (&v)[N], int row, int col) {
    if constexpr (sizeof(T) * N == 4) {
      store(*(reinterpret_cast<float*>(&v[0])), row, col);
    } else if constexpr (sizeof(T) * N == 8) {
      store(*(reinterpret_cast<float2*>(&v[0])), row, col);
    } else if constexpr (sizeof(T) * N == 16) {
      store(*(reinterpret_cast<float4*>(&v[0])), row, col);
    } else {
      MLX_UNROLL
      for (int i = 0; i < N; i++) {
        *ptr(data, row, col + i) = v[i];
      }
    }
  }
};

/**
 * Synchronous tile load from global memory.
 *
 * Every thread reads 16 bytes at a time from `x` (row length `N`) and writes
 * them straight into `tile`.
 *
 * Can also be used as a fallback for architectures before sm_80.
 */
template <int NUM_WARPS, typename T, typename Tile>
__device__ inline void load(Tile& tile, const T* x, int N) {
  constexpr int THREADS = NUM_WARPS * 32;
  constexpr int ELEMS_PER_VEC = sizeof(float4) / sizeof(T);
  constexpr int VECS_IN_TILE = Tile::NUMEL / ELEMS_PER_VEC;
  constexpr int VECS_PER_THREAD = VECS_IN_TILE / THREADS;
  constexpr int VECS_PER_ROW = Tile::COLS / ELEMS_PER_VEC;
  constexpr int ROW_STRIDE = THREADS / VECS_PER_ROW;

  // First row handled by this thread and its fixed column slot in the row.
  const int first_row = threadIdx.x / VECS_PER_ROW;
  const int vec_slot = threadIdx.x % VECS_PER_ROW;
  const int first_col = vec_slot * ELEMS_PER_VEC;

  const T* src = x + (first_row * N + first_col);

  MLX_UNROLL
  for (int step = 0; step < VECS_PER_THREAD; step++) {
    float4 chunk = *(reinterpret_cast<const float4*>(&src[step * ROW_STRIDE * N]));
    tile.store(chunk, first_row + step * ROW_STRIDE, first_col);
  }
}

/**
 * Asynchronous counterpart of load.
 *
 * Issues one 16 byte async copy per chunk from global memory into the tile.
 * The copies are not submitted until commit is called and are only
 * guaranteed to have finished once wait has been called.
 *
 * Intended usage
 *
 *    load(...)
 *    load(...)
 *    cp_async_commit()
 *    do_other_stuff()
 *    cp_async_wait_all()
 *    do_stuff_with_shmem()
 */
template <int NUM_WARPS, typename T, typename Tile>
__device__ inline void
load_async(Tile& tile, uint32_t base_address, const T* x, int N) {
  constexpr int THREADS = NUM_WARPS * 32;
  constexpr int ELEMS_PER_VEC = sizeof(float4) / sizeof(T);
  constexpr int VECS_IN_TILE = Tile::NUMEL / ELEMS_PER_VEC;
  constexpr int VECS_PER_THREAD = VECS_IN_TILE / THREADS;
  constexpr int VECS_PER_ROW = Tile::COLS / ELEMS_PER_VEC;
  constexpr int ROW_STRIDE = THREADS / VECS_PER_ROW;

  // First row handled by this thread and its fixed column slot in the row.
  const int first_row = threadIdx.x / VECS_PER_ROW;
  const int vec_slot = threadIdx.x % VECS_PER_ROW;
  const int first_col = vec_slot * ELEMS_PER_VEC;

  const T* src = x + (first_row * N + first_col);

  MLX_UNROLL
  for (int step = 0; step < VECS_PER_THREAD; step++) {
    const uint32_t dst =
        tile.loc(base_address, first_row + step * ROW_STRIDE, first_col);
    cp_async<16>(dst, src + step * ROW_STRIDE * N);
  }
}

/**
 * Row-guarded variant of load_async.
 *
 * Rows below `max_rows` are fetched with an async copy; the remaining rows of
 * the tile are filled with zeros using a regular store.
 *
 * NOTE: It should be changed to use a predicated cp async instead.
 */
template <int NUM_WARPS, typename T, typename Tile>
__device__ inline void load_async_safe(
    Tile& tile,
    uint32_t base_address,
    const T* x,
    int N,
    int max_rows) {
  constexpr int THREADS = NUM_WARPS * 32;
  constexpr int ELEMS_PER_VEC = sizeof(float4) / sizeof(T);
  constexpr int VECS_IN_TILE = Tile::NUMEL / ELEMS_PER_VEC;
  constexpr int VECS_PER_THREAD = VECS_IN_TILE / THREADS;
  constexpr int VECS_PER_ROW = Tile::COLS / ELEMS_PER_VEC;
  constexpr int ROW_STRIDE = THREADS / VECS_PER_ROW;

  // First row handled by this thread and its fixed column slot in the row.
  const int first_row = threadIdx.x / VECS_PER_ROW;
  const int vec_slot = threadIdx.x % VECS_PER_ROW;
  const int first_col = vec_slot * ELEMS_PER_VEC;

  const T* src = x + (first_row * N + first_col);

  MLX_UNROLL
  for (int step = 0; step < VECS_PER_THREAD; step++) {
    const bool row_in_bounds = first_row + step * ROW_STRIDE < max_rows;
    if (!row_in_bounds) {
      // Out of range: zero the chunk instead of reading global memory.
      float4 zeros = {0, 0, 0, 0};
      tile.store(zeros, first_row + step * ROW_STRIDE, first_col);
    } else {
      cp_async<16>(
          tile.loc(base_address, first_row + step * ROW_STRIDE, first_col),
          src + step * ROW_STRIDE * N);
    }
  }
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/steel/utils.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/device/utils.cuh"
#include "mlx/backend/cuda/steel/defines.cuh"

namespace mlx::core::cu {

/**
 * Copy N bytes from the global memory address pointed to by x to the smem
 * address pointed to by row_address.
 *
 * A simple wrapper over the PTX.
 *
 * N must be 4, 8 or 16 (checked at compile time). `row_address` is a 32-bit
 * shared memory address. The copy is asynchronous; see cp_async_commit and
 * cp_async_wait.
 *
 * When MLX_CUDA_SM_80_ENABLED is not defined no instruction is emitted and
 * nothing is copied.
 */
template <int N, typename T>
__device__ inline void cp_async(uint32_t row_address, const T* x) {
  static_assert(
      N == 16 || N == 8 || N == 4,
      "cp.async is only supported for N in {4, 8, 16}.");
#if defined(MLX_CUDA_SM_80_ENABLED)
  if constexpr (N == 16) {
    asm volatile(
        "cp.async.ca.shared::cta.global [%0], [%1], 16;\n" ::"r"(row_address),
        "l"(reinterpret_cast<const int4*>(x)));
  } else if constexpr (N == 8) {
    asm volatile(
        "cp.async.ca.shared::cta.global [%0], [%1], 8;\n" ::"r"(row_address),
        "l"(reinterpret_cast<const int2*>(x)));
  } else if constexpr (N == 4) {
    asm volatile(
        "cp.async.ca.shared::cta.global [%0], [%1], 4;\n" ::"r"(row_address),
        "l"(reinterpret_cast<const int*>(x)));
  }
#endif
}

/**
 * Submit all the previous async copies to be executed.
 *
 * Emits `cp.async.commit_group`, which bundles the copies issued so far into
 * one group that cp_async_wait can wait on. Does nothing when
 * MLX_CUDA_SM_80_ENABLED is not defined.
 */
__device__ inline void cp_async_commit() {
#if defined(MLX_CUDA_SM_80_ENABLED)
  asm volatile("cp.async.commit_group;\n" ::);
#endif
}

/**
 * Wait for all but the N most recently committed groups of async copies to
 * finish.
 *
 * N == 0 emits `cp.async.wait_all`; any other N emits
 * `cp.async.wait_group N`. Does nothing when MLX_CUDA_SM_80_ENABLED is not
 * defined.
 */
template <int N>
__device__ inline void cp_async_wait() {
#if defined(MLX_CUDA_SM_80_ENABLED)
  if constexpr (N == 0) {
    asm volatile("cp.async.wait_all;\n" ::);
  } else {
    asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
  }
#endif
}

/**
 * Wait for all the async copies to finish.
 *
 * Shorthand for cp_async_wait<0>().
 */
__device__ inline void cp_async_wait_all() {
  cp_async_wait<0>();
}

/**
 * Extract a ``bits`` wide field from the 32 bit ``value`` starting at bit
 * ``start_bit``.
 *
 * Implemented with a single bit field extract instruction; the field width
 * is passed to it as an immediate.
 */
template <int bits>
__device__ inline uint32_t extract_bits(uint32_t value, int start_bit) {
  static_assert(
      bits == 2 || bits == 4 || bits == 8,
      "extract_bits only supports 2, 4, 8 for now.");
  uint32_t field;
  asm("bfe.u32 %0, %1, %2, %3;"
      : "=r"(field)
      : "r"(value), "r"(start_bit), "n"(bits));
  return field;
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/ternary.cu
================================================
// Copyright © 2025 Apple Inc.
#include "mlx/backend/common/ternary.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/ternary_ops.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Elementwise ternary kernel over flat buffers of `size` elements:
// out[i] = Op(a[i], b[i], c[i]). Each thread handles N_READS consecutive
// elements with vector loads/stores; the thread that straddles the end of the
// buffer falls back to a scalar loop.
template <typename Op, typename T, typename IdxT, int N_READS>
__global__ void
ternary_v(const bool* a, const T* b, const T* c, T* out, IdxT size) {
  IdxT tid = cg::this_grid().thread_rank();

  const bool full_vector = !((tid + 1) * N_READS > size);
  if (full_vector) {
    auto cond = load_vector<N_READS>(a, tid);
    auto lhs = load_vector<N_READS>(b, tid);
    auto rhs = load_vector<N_READS>(c, tid);

    AlignedVector<T, N_READS> result;
#pragma unroll
    for (int lane = 0; lane < N_READS; ++lane) {
      result[lane] = Op{}(cond[lane], lhs[lane], rhs[lane]);
    }

    store_vector<N_READS>(out, tid, result);
    return;
  }

  // Tail: process whatever is left one element at a time.
  for (IdxT pos = tid * N_READS; pos < size; ++pos) {
    out[pos] = Op{}(a[pos], b[pos], c[pos]);
  }
}

// Strided ternary kernel for a compile-time number of dimensions (NDIM).
//
// The grid's y dimension enumerates the `size_rest` rows (all dimensions but
// the last one); the x dimension enumerates N_READS-wide groups along the
// last dimension. Input offsets of a row come from elem_to_loc_nd; the output
// is written as a dense array with rows of `shape[NDIM - 1]` elements.
template <typename Op, typename T, typename IdxT, int NDIM, int N_READS>
__global__ void ternary_g_nd(
    const bool* a,
    const T* b,
    const T* c,
    T* out,
    IdxT size_rest,
    const __grid_constant__ cuda::std::array<int32_t, NDIM> shape,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> a_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> b_strides,
    const __grid_constant__ cuda::std::array<int64_t, NDIM> c_strides) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  // Row handled by this thread; surplus rows of the launch exit early.
  IdxT row =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  if (row >= size_rest) {
    return;
  }
  // Group index along the innermost dimension.
  IdxT col =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;

  auto inner_size = shape[NDIM - 1];
  auto a_inner_stride = a_strides[NDIM - 1];
  auto b_inner_stride = b_strides[NDIM - 1];
  auto c_inner_stride = c_strides[NDIM - 1];

  // Offsets of the first element of this row in each input.
  auto [a_base, b_base, c_base] = elem_to_loc_nd<NDIM>(
      row * inner_size,
      shape.data(),
      a_strides.data(),
      b_strides.data(),
      c_strides.data());

  // The last argument is forwarded to load_vector unchanged
  // (presumably the value used for lanes past inner_size -- confirm).
  auto cond_vec = load_vector<N_READS>(
      a + a_base, col, inner_size, a_inner_stride, false);
  auto lhs_vec = load_vector<N_READS>(
      b + b_base, col, inner_size, b_inner_stride, T(0));
  auto rhs_vec = load_vector<N_READS>(
      c + c_base, col, inner_size, c_inner_stride, T(0));

  AlignedVector<T, N_READS> result;
#pragma unroll
  for (int lane = 0; lane < N_READS; ++lane) {
    result[lane] = Op{}(cond_vec[lane], lhs_vec[lane], rhs_vec[lane]);
  }
  store_vector(out + inner_size * row, col, result, inner_size);
}

// Strided ternary kernel for a run-time number of dimensions (`ndim`).
//
// Same work decomposition as ternary_g_nd: grid y enumerates the `size_rest`
// rows, grid x enumerates N_READS-wide groups along the last dimension, and
// the output is written densely with rows of `shape[ndim - 1]` elements.
template <typename Op, typename T, typename IdxT, int N_READS>
__global__ void ternary_g(
    const bool* a,
    const T* b,
    const T* c,
    T* out,
    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides a_strides,
    const __grid_constant__ Strides b_strides,
    const __grid_constant__ Strides c_strides,
    int ndim) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  // Row handled by this thread; surplus rows of the launch exit early.
  IdxT row =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  if (row >= size_rest) {
    return;
  }
  // Group index along the innermost dimension.
  IdxT col =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;

  auto inner_size = shape[ndim - 1];
  auto a_inner_stride = a_strides[ndim - 1];
  auto b_inner_stride = b_strides[ndim - 1];
  auto c_inner_stride = c_strides[ndim - 1];

  // Offsets of the first element of this row in each input.
  auto [a_base, b_base, c_base] = elem_to_loc(
      row * inner_size,
      shape.data(),
      a_strides.data(),
      b_strides.data(),
      c_strides.data(),
      ndim);

  // The last argument is forwarded to load_vector unchanged
  // (presumably the value used for lanes past inner_size -- confirm).
  auto cond_vec = load_vector<N_READS>(
      a + a_base, col, inner_size, a_inner_stride, false);
  auto lhs_vec = load_vector<N_READS>(
      b + b_base, col, inner_size, b_inner_stride, T(0));
  auto rhs_vec = load_vector<N_READS>(
      c + c_base, col, inner_size, c_inner_stride, T(0));

  AlignedVector<T, N_READS> result;
#pragma unroll
  for (int lane = 0; lane < N_READS; ++lane) {
    result[lane] = Op{}(cond_vec[lane], lhs_vec[lane], rhs_vec[lane]);
  }
  store_vector(out + inner_size * row, col, result, inner_size);
}

} // namespace cu

// Encode the CUDA kernel that applies the ternary functor `Op` to
// inputs[0], inputs[1], inputs[2] and writes the result to `out`.
//
// `out` is only written through gpu_ptr here; ternary_op_gpu sets up its
// storage before calling this function. inputs[0] is always read as bool,
// the other two inputs and the output are read/written as out.dtype().
// Nothing is encoded when `out` has no elements.
template <typename Op>
void ternary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const Stream& s) {
  const auto& a = inputs[0];
  const auto& b = inputs[1];
  const auto& c = inputs[2];
  if (out.size() == 0) {
    return;
  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(a);
  encoder.set_input_array(b);
  encoder.set_input_array(c);
  encoder.set_output_array(out);
  // Instantiate the kernels for the element type of the output.
  dispatch_all_types(out.dtype(), [&](auto type_tag) {
    using DType = cuda_type_t<MLX_GET_TYPE(type_tag)>;

    auto topt = get_ternary_op_type(a, b, c);
    if (topt == TernaryOpType::VectorVectorVector ||
        topt == TernaryOpType::ScalarScalarScalar) {
      // Flat path: ternary_v addresses all inputs and the output with the
      // same linear index. 64-bit indexing is only used when the output
      // does not fit in 32 bits.
      dispatch_bool(out.data_size() > UINT32_MAX, [&](auto large) {
        using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
        // Number of elements per thread so that one thread covers 16 bytes.
        constexpr int N_READS = 16 / sizeof(DType);
        auto [num_blocks, block_dims] = get_launch_args(
            out.data_size(), out.shape(), out.strides(), large(), N_READS);
        encoder.add_kernel_node(
            cu::ternary_v<Op, DType, IdxT, N_READS>,
            num_blocks,
            block_dims,
            gpu_ptr<bool>(a),
            gpu_ptr<DType>(b),
            gpu_ptr<DType>(c),
            gpu_ptr<DType>(out),
            out.data_size());
      });
    } else {
      // Strided path: use 64-bit indexing as soon as any of the buffers is
      // larger than INT32_MAX elements.
      dispatch_bool(
          a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
              c.data_size() > INT32_MAX || out.data_size() > INT32_MAX,
          [&](auto large) {
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
            Shape shape;
            std::vector<Strides> strides;
            std::tie(shape, strides) = collapse_contiguous_dims(a, b, c, out);
            auto& a_strides = strides[0];
            auto& b_strides = strides[1];
            auto& c_strides = strides[2];
            int ndim = shape.size();
            int work_per_thread = 1;
            // dim0 is the innermost (last) collapsed dimension, rest is the
            // number of rows of that length.
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = out.size() / dim0;
            // Let each thread handle 4 elements of the innermost dimension
            // when it has at least 4 elements.
            if (dim0 >= 4) {
              work_per_thread = 4;
            }
            // From here on dim0 counts threads along x, not elements.
            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);

            if (ndim <= 3) {
              // Kernels with a compile-time rank for 1, 2 or 3 dimensions.
              dispatch_1_2_3(ndim, [&](auto dims_constant) {
                auto kernel =
                    cu::ternary_g_nd<Op, DType, IdxT, dims_constant(), 1>;
                if (work_per_thread == 4) {
                  kernel =
                      cu::ternary_g_nd<Op, DType, IdxT, dims_constant(), 4>;
                }
                encoder.add_kernel_node(
                    kernel,
                    {num_blocks_x, num_blocks_y},
                    block_dims,
                    gpu_ptr<bool>(a),
                    gpu_ptr<DType>(b),
                    gpu_ptr<DType>(c),
                    gpu_ptr<DType>(out),
                    rest,
                    const_param<dims_constant()>(shape),
                    const_param<dims_constant()>(a_strides),
                    const_param<dims_constant()>(b_strides),
                    const_param<dims_constant()>(c_strides));
              });
            } else {
              // Generic kernel that receives the rank at run time.
              auto kernel = cu::ternary_g<Op, DType, IdxT, 1>;
              if (work_per_thread == 4) {
                kernel = cu::ternary_g<Op, DType, IdxT, 4>;
              }
              encoder.add_kernel_node(
                  kernel,
                  {num_blocks_x, num_blocks_y},
                  block_dims,
                  gpu_ptr<bool>(a),
                  gpu_ptr<DType>(b),
                  gpu_ptr<DType>(c),
                  gpu_ptr<DType>(out),
                  rest,
                  const_param(shape),
                  const_param(a_strides),
                  const_param(b_strides),
                  const_param(c_strides),
                  ndim);
            }
          });
    }
  });
}

// Out-of-place ternary op: set up the storage of `out` for the three inputs
// and then encode the kernel with ternary_op_gpu_inplace.
template <typename Op>
void ternary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const Stream& s) {
  const auto& first = inputs[0];
  const auto& second = inputs[1];
  const auto& third = inputs[2];
  const auto op_type = get_ternary_op_type(first, second, third);

  auto& encoder = cu::get_command_encoder(s);
  // Any new buffer needed for the output goes through the stream's encoder.
  auto allocate = [&](auto n) { return cu::malloc_async(n, encoder); };
  set_ternary_op_output_data(first, second, third, out, op_type, allocate);

  ternary_op_gpu_inplace<Op>(inputs, out, s);
}

// GPU evaluation of Select: runs the cu::Select ternary functor on the
// stream of the output's primitive.
void Select::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Select::eval_gpu");
  ternary_op_gpu<cu::Select>(inputs, out, out.primitive().stream());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/CMakeLists.txt
================================================
# Sources of the CUDA unary ops. Every op has its own .cu file in this
# directory; all of them are added as private sources of the mlx target.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/abs.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arccos.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arccosh.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arcsin.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arcsinh.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctan.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/arctanh.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bitwise_invert.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ceil.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/conjugate.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cos.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cosh.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/erf.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/erf_inv.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/exp.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/expm1.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/floor.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/imag.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/log1p.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/logical_not.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/negative.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/real.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/round.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sigmoid.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sign.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sin.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sinh.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sqrt.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/square.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tan.cu
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tanh.cu)


================================================
FILE: mlx/backend/cuda/unary/abs.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Abs::eval_gpu, which forwards to unary_op_gpu<cu::Abs>
// (macro defined in unary.cuh).
UNARY_GPU(Abs)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/arccos.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to ArcCos::eval_gpu, which forwards to unary_op_gpu<cu::ArcCos>
// (macro defined in unary.cuh).
UNARY_GPU(ArcCos)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/arccosh.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to ArcCosh::eval_gpu, which forwards to unary_op_gpu<cu::ArcCosh>
// (macro defined in unary.cuh).
UNARY_GPU(ArcCosh)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/arcsin.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to ArcSin::eval_gpu, which forwards to unary_op_gpu<cu::ArcSin>
// (macro defined in unary.cuh).
UNARY_GPU(ArcSin)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/arcsinh.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to ArcSinh::eval_gpu, which forwards to unary_op_gpu<cu::ArcSinh>
// (macro defined in unary.cuh).
UNARY_GPU(ArcSinh)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/arctan.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to ArcTan::eval_gpu, which forwards to unary_op_gpu<cu::ArcTan>
// (macro defined in unary.cuh).
UNARY_GPU(ArcTan)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/arctanh.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to ArcTanh::eval_gpu, which forwards to unary_op_gpu<cu::ArcTanh>
// (macro defined in unary.cuh).
UNARY_GPU(ArcTanh)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/bitwise_invert.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to BitwiseInvert::eval_gpu, which forwards to
// unary_op_gpu<cu::BitwiseInvert> (macro defined in unary.cuh).
UNARY_GPU(BitwiseInvert)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/ceil.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Ceil::eval_gpu, which forwards to unary_op_gpu<cu::Ceil>
// (macro defined in unary.cuh).
UNARY_GPU(Ceil)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/conjugate.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Conjugate::eval_gpu, which forwards to
// unary_op_gpu<cu::Conjugate> (macro defined in unary.cuh).
UNARY_GPU(Conjugate)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/cos.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Cos::eval_gpu, which forwards to unary_op_gpu<cu::Cos>
// (macro defined in unary.cuh).
UNARY_GPU(Cos)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/cosh.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Cosh::eval_gpu, which forwards to unary_op_gpu<cu::Cosh>
// (macro defined in unary.cuh).
UNARY_GPU(Cosh)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/erf.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Erf::eval_gpu, which forwards to unary_op_gpu<cu::Erf>
// (macro defined in unary.cuh).
UNARY_GPU(Erf)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/erf_inv.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to ErfInv::eval_gpu, which forwards to unary_op_gpu<cu::ErfInv>
// (macro defined in unary.cuh).
UNARY_GPU(ErfInv)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/exp.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Exp::eval_gpu, which forwards to unary_op_gpu<cu::Exp>
// (macro defined in unary.cuh).
UNARY_GPU(Exp)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/expm1.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Expm1::eval_gpu, which forwards to unary_op_gpu<cu::Expm1>
// (macro defined in unary.cuh).
UNARY_GPU(Expm1)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/floor.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Floor::eval_gpu, which forwards to unary_op_gpu<cu::Floor>
// (macro defined in unary.cuh).
UNARY_GPU(Floor)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/imag.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Imag::eval_gpu, which forwards to unary_op_gpu<cu::Imag>
// (macro defined in unary.cuh).
UNARY_GPU(Imag)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/log.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// GPU evaluation of Log: picks the functor that matches the logarithm base
// stored in base_ (natural, base 2 or base 10).
void Log::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Log::eval_gpu");
  auto& stream = out.primitive().stream();
  if (base_ == Base::e) {
    unary_op_gpu<cu::Log>(inputs, out, name(), stream);
  } else if (base_ == Base::two) {
    unary_op_gpu<cu::Log2>(inputs, out, name(), stream);
  } else if (base_ == Base::ten) {
    unary_op_gpu<cu::Log10>(inputs, out, name(), stream);
  }
}
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/log1p.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Log1p::eval_gpu, which forwards to unary_op_gpu<cu::Log1p>
// (macro defined in unary.cuh).
UNARY_GPU(Log1p)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/logical_not.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to LogicalNot::eval_gpu, which forwards to
// unary_op_gpu<cu::LogicalNot> (macro defined in unary.cuh).
UNARY_GPU(LogicalNot)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/negative.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Negative::eval_gpu, which forwards to
// unary_op_gpu<cu::Negative> (macro defined in unary.cuh).
UNARY_GPU(Negative)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/real.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Real::eval_gpu, which forwards to unary_op_gpu<cu::Real>
// (macro defined in unary.cuh).
UNARY_GPU(Real)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/round.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// GPU evaluation of Round. Only inexact inputs are actually rounded; for any
// other dtype the output just shares the input's buffer.
void Round::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Round::eval_gpu");
  assert(inputs.size() == 1);
  const auto& src = inputs[0];
  auto& stream = out.primitive().stream();
  if (!issubdtype(src.dtype(), inexact)) {
    // No-op integer types
    out.copy_shared_buffer(src);
    return;
  }
  unary_op_gpu<cu::Round>(inputs, out, name(), stream);
}
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/sigmoid.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Sigmoid::eval_gpu, which forwards to unary_op_gpu<cu::Sigmoid>
// (macro defined in unary.cuh).
UNARY_GPU(Sigmoid)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/sign.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Sign::eval_gpu, which forwards to unary_op_gpu<cu::Sign>
// (macro defined in unary.cuh).
UNARY_GPU(Sign)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/sin.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Sin::eval_gpu, which forwards to unary_op_gpu<cu::Sin>
// (macro defined in unary.cuh).
UNARY_GPU(Sin)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/sinh.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Sinh::eval_gpu, which forwards to unary_op_gpu<cu::Sinh>
// (macro defined in unary.cuh).
UNARY_GPU(Sinh)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/sqrt.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// GPU evaluation of Sqrt. The same primitive covers the reciprocal square
// root, selected by recip_.
void Sqrt::eval_gpu(const std::vector<array>& inputs, array& out) {
  nvtx3::scoped_range r("Sqrt::eval_gpu");
  auto& stream = out.primitive().stream();
  if (!recip_) {
    unary_op_gpu<cu::Sqrt>(inputs, out, "Sqrt", stream);
    return;
  }
  unary_op_gpu<cu::Rsqrt>(inputs, out, "Rsqrt", stream);
}
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/square.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Square::eval_gpu, which forwards to unary_op_gpu<cu::Square>
// (macro defined in unary.cuh).
UNARY_GPU(Square)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/tan.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Tan::eval_gpu, which forwards to unary_op_gpu<cu::Tan>
// (macro defined in unary.cuh).
UNARY_GPU(Tan)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/tanh.cu
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/unary/unary.cuh"

namespace mlx::core {
// Expands to Tanh::eval_gpu, which forwards to unary_op_gpu<cu::Tanh>
// (macro defined in unary.cuh).
UNARY_GPU(Tanh)
} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/unary/unary.cuh
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/unary.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/backend/cuda/device/unary_ops.cuh"
#include "mlx/backend/cuda/kernel_utils.cuh"
#include "mlx/dtype_utils.h"
#include "mlx/primitives.h"

#include <cooperative_groups.h>
#include <nvtx3/nvtx3.hpp>

namespace mlx::core {

namespace cu {

namespace cg = cooperative_groups;

// Contiguous unary kernel: input and output are addressed with the same
// linear element index.
//
// Each thread handles one group of N_READS consecutive elements through
// load_vector / store_vector. The single thread whose group would run past
// `size` handles the remaining elements one at a time; threads beyond that
// do nothing.
//
// The group bounds are derived from `size / N_READS` rather than from
// `(index + 1) * N_READS`, because the latter wraps around for 32-bit IdxT
// when `size` is within N_READS of 2^32, which would send tail/surplus
// threads down the vectorized path and out of bounds.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void unary_v(const In* in, Out* out, IdxT size) {
  IdxT index = cg::this_grid().thread_rank();
  // Number of complete N_READS-wide groups; num_full * N_READS <= size, so
  // none of the products below can overflow IdxT.
  const IdxT num_full = size / N_READS;

  if (index < num_full) {
    auto in_vec = load_vector<N_READS>(in, index);

    AlignedVector<Out, N_READS> out_vec;
#pragma unroll
    for (int i = 0; i < N_READS; ++i) {
      out_vec[i] = Op{}(in_vec[i]);
    }

    store_vector<N_READS>(out, index, out_vec);
  } else if (index == num_full) {
    // Scalar tail: fewer than N_READS elements are left.
    for (IdxT i = num_full * N_READS; i < size; ++i) {
      out[i] = Op{}(in[i]);
    }
  }
}

// Strided unary kernel for a run-time number of dimensions (`ndim`).
//
// Grid y enumerates the `size_rest` rows (all dimensions but the last one),
// grid x enumerates N_READS-wide groups along the last dimension. The input
// offset of a row comes from elem_to_loc; the output is written densely with
// rows of `shape[ndim - 1]` elements.
template <typename Op, typename In, typename Out, typename IdxT, int N_READS>
__global__ void unary_g(
    const In* in,
    Out* out,
    IdxT size_rest,
    const __grid_constant__ Shape shape,
    const __grid_constant__ Strides strides,
    int ndim) {
  auto grid = cg::this_grid();
  auto block = cg::this_thread_block();

  // Row handled by this thread; surplus rows of the launch exit early.
  IdxT row =
      grid.block_index().y * block.dim_threads().y + block.thread_index().y;
  if (row >= size_rest) {
    return;
  }
  // Group index along the innermost dimension.
  IdxT col =
      grid.block_index().x * block.dim_threads().x + block.thread_index().x;

  auto inner_size = shape[ndim - 1];
  auto inner_stride = strides[ndim - 1];

  // Offset of the first element of this row in the input.
  auto src_offset =
      elem_to_loc(row * inner_size, shape.data(), strides.data(), ndim);
  auto src_vec = load_vector<N_READS>(
      in + src_offset, col, inner_size, inner_stride, In(0));

  AlignedVector<Out, N_READS> dst_vec;
#pragma unroll
  for (int lane = 0; lane < N_READS; ++lane) {
    dst_vec[lane] = Op{}(src_vec[lane]);
  }
  store_vector(out + inner_size * row, col, dst_vec, inner_size);
}

template <typename Op, typename In, typename Out>
constexpr bool supports_unary_op() {
  if (std::is_same_v<Op, Abs> || std::is_same_v<Op, Negative> ||
      std::is_same_v<Op, Sign> || std::is_same_v<Op, Square>) {
    return std::is_same_v<In, Out>;
  }
  if (std::is_same_v<Op, ArcCosh> || std::is_same_v<Op, ArcSinh> ||
      std::is_same_v<Op, ArcTanh> || std::is_same_v<Op, Erf> ||
      std::is_same_v<Op, ErfInv> || std::is_same_v<Op, Expm1> ||
      std::is_same_v<Op, Sigmoid>) {
    return std::is_same_v<In, Out> && is_floating_v<In>;
  }
  if (std::is_same_v<Op, BitwiseInvert>) {
    return std::is_same_v<In, Out> && std::is_integral_v<In> &&
        !std::is_same_v<In, bool>;
  }
  if (std::is_same_v<Op, Ceil> || std::is_same_v<Op, Floor>) {
    return std::is_same_v<In, Out> && !mlx::core::is_complex_v<In>;
  }
  if (std::is_same_v<Op, Conjugate>) {
    return std::is_same_v<In, Out> && mlx::core::is_complex_v<In>;
  }
  if (std::is_same_v<Op, ArcCos> || std::is_same_v<Op, ArcSin> ||
      std::is_same_v<Op, ArcTan> || std::is_same_v<Op, Cos> ||
      std::is_same_v<Op, Cosh> || std::is_same_v<Op, Exp> ||
      std::is_same_v<Op, Log> || std::is_same_v<Op, Log2> ||
      std::is_same_v<Op, Log10> || std::is_same_v<Op, Log1p> ||
      std::is_same_v<Op, Round> || std::is_same_v<Op, Rsqrt> ||
      std::is_same_v<Op, Sqrt> || std::is_same_v<Op, Sin> ||
      std::is_same_v<Op, Sinh> || std::is_same_v<Op, Tan> ||
      std::is_same_v<Op, Tanh>) {
    return std::is_same_v<In, Out> && is_inexact_v<In>;
  }
  if (std::is_same_v<Op, Imag> || std::is_same_v<Op, Real>) {
    return mlx::core::is_complex_v<In> && std::is_same_v<Out, float>;
  }
  if (std::is_same_v<Op, LogicalNot>) {
    return std::is_same_v<In, Out> && std::is_same_v<In, bool>;
  }
  if (std::is_same_v<Op, ToFP8>) {
    return std::is_same_v<Out, uint8_t> && is_floating_v<In>;
  }
  if (std::is_same_v<Op, FromFP8>) {
    return std::is_same_v<In, uint8_t> && is_floating_v<Out>;
  }
  return false;
}

} // namespace cu

// Encode the CUDA kernel that applies the unary functor `Op` to inputs[0]
// and writes the result to `out`.
//
// `out` is only written through gpu_ptr here; unary_op_gpu sets up its
// storage before calling this function. `op` is the op name used in the
// error message. Nothing is encoded when the input has no elements.
//
// Throws std::runtime_error when `Op` does not support the combination of
// input and output dtypes (see cu::supports_unary_op).
template <typename Op>
void unary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  auto& in = inputs[0];
  if (in.size() == 0) {
    return;
  }
  bool contig = in.flags().contiguous;
  // Whether 64-bit indexing is needed. The contiguous kernel indexes with
  // uint32_t, the strided one with int32_t, hence the different limits.
  bool large;
  if (!contig) {
    large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
  } else {
    large = in.data_size() > UINT32_MAX;
  }

  auto& encoder = cu::get_command_encoder(s);
  encoder.set_input_array(in);
  encoder.set_output_array(out);
  // Dispatch over every (input dtype, output dtype) pair; unsupported pairs
  // are compiled to the throwing branch below.
  dispatch_all_types(in.dtype(), [&](auto in_type_tag) {
    dispatch_all_types(out.dtype(), [&](auto out_type_tag) {
      using CTYPE_IN = MLX_GET_TYPE(in_type_tag);
      using CTYPE_OUT = MLX_GET_TYPE(out_type_tag);
      if constexpr (cu::supports_unary_op<Op, CTYPE_IN, CTYPE_OUT>()) {
        // The lambda parameter shadows the run-time flag `large` and makes
        // it usable as a compile-time constant via large().
        dispatch_bool(large, [&](auto large) {
          using InType = cuda_type_t<CTYPE_IN>;
          using OutType = cuda_type_t<CTYPE_OUT>;
          if (contig) {
            // Flat path: unary_v addresses input and output with the same
            // linear index.
            using IdxT = std::conditional_t<large(), int64_t, uint32_t>;
            // Number of elements per thread so that one thread covers 16
            // bytes of the output type.
            constexpr int N_READS = 16 / sizeof(OutType);
            auto [num_blocks, block_dims] = get_launch_args(
                out.data_size(), out.shape(), out.strides(), large, N_READS);
            encoder.add_kernel_node(
                cu::unary_v<Op, InType, OutType, IdxT, N_READS>,
                num_blocks,
                block_dims,
                gpu_ptr<InType>(in),
                gpu_ptr<OutType>(out),
                out.data_size());
          } else {
            // Strided path.
            using IdxT = std::conditional_t<large(), int64_t, int32_t>;
            auto [shape, strides] = collapse_contiguous_dims(in);
            auto ndim = shape.size();
            int work_per_thread = 1;
            auto kernel = cu::unary_g<Op, InType, OutType, IdxT, 1>;
            // dim0 is the innermost (last) collapsed dimension, rest is the
            // number of rows of that length.
            auto dim0 = ndim > 0 ? shape.back() : 1;
            auto rest = out.size() / dim0;
            // Let each thread handle 4 elements of the innermost dimension
            // when it has at least 4 elements.
            if (dim0 >= 4) {
              kernel = cu::unary_g<Op, InType, OutType, IdxT, 4>;
              work_per_thread = 4;
            }
            // From here on dim0 counts threads along x, not elements.
            dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
            auto block_dims = get_block_dims(dim0, rest, 1);
            uint32_t num_blocks_x = cuda::ceil_div(dim0, block_dims.x);
            uint32_t num_blocks_y = cuda::ceil_div(rest, block_dims.y);
            encoder.add_kernel_node(
                kernel,
                {num_blocks_x, num_blocks_y},
                block_dims,
                gpu_ptr<InType>(in),
                gpu_ptr<OutType>(out),
                rest,
                const_param(shape),
                const_param(strides),
                ndim);
          }
        });
      } else {
        throw std::runtime_error(
            fmt::format(
                "Can not do unary op {} on input of {} with output of {}.",
                op,
                dtype_to_string(in.dtype()),
                dtype_to_string(out.dtype())));
      }
    });
  });
}

// Out-of-place unary op: set up the storage of `out` for inputs[0] and then
// encode the kernel with unary_op_gpu_inplace. `op` is the op name that is
// reported when the dtype combination is not supported.
template <typename Op>
void unary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  auto& encoder = cu::get_command_encoder(s);
  // Any new buffer needed for the output goes through the stream's encoder.
  auto allocate = [&](auto n) { return cu::malloc_async(n, encoder); };
  set_unary_output_data(inputs[0], out, allocate);

  unary_op_gpu_inplace<Op>(inputs, out, op, s);
}

// Defines `func::eval_gpu` for a unary primitive `func`: opens an NVTX range
// named "<func>::eval_gpu", takes the stream from the output's primitive and
// runs unary_op_gpu with the functor cu::func, passing name() as the op name.
#define UNARY_GPU(func)                                               \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    nvtx3::scoped_range r(#func "::eval_gpu");                        \
    auto& s = out.primitive().stream();                               \
    unary_op_gpu<cu::func>(inputs, out, name(), s);                   \
  }

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/utils.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/utils.h"
#include "mlx/backend/cuda/device.h"
#include "mlx/dtype_utils.h"

#include <fmt/format.h>
#include <cuda/cmath>
#include <vector>

namespace mlx::core {

// Throw std::runtime_error when a cuBLAS call did not succeed. `name` is the
// text that identifies the failed call in the message; the status is
// reported as its numeric code.
void check_cublas_error(const char* name, cublasStatus_t err) {
  if (err == CUBLAS_STATUS_SUCCESS) {
    return;
  }
  // TODO: Use cublasGetStatusString when it is widely available.
  const int code = static_cast<int>(err);
  throw std::runtime_error(
      fmt::format("{} failed with code: {}.", name, code));
}

void check_cuda_error(const char* name, cudaError_t err) {
  if (err != cudaSuccess) {
    throw std::runtime_error(
        fmt::format("{} failed: {}", name, cudaGetErrorString(err)));
  }
}

// Throw std::runtime_error when a CUDA driver call did not succeed. `name`
// identifies the failed call; the description comes from cuGetErrorString.
void check_cuda_error(const char* name, CUresult err) {
  if (err != CUDA_SUCCESS) {
    // For an unrecognized code cuGetErrorString reports failure and sets the
    // output pointer to NULL, so the fallback text has to be applied after
    // the call, not before it.
    const char* err_str = nullptr;
    if (cuGetErrorString(err, &err_str) != CUDA_SUCCESS ||
        err_str == nullptr) {
      err_str = "Unknown error";
    }
    throw std::runtime_error(fmt::format("{} failed: {}", name, err_str));
  }
}

void check_cudnn_error(const char* name, cudnnStatus_t err) {
  if (err != CUDNN_STATUS_SUCCESS) {
    throw std::runtime_error(
        fmt::format("{} failed: {}.", name, cudnnGetErrorString(err)));
  }
}

// Name of the CUDA C++ type that corresponds to `dtype`, as a string
// literal. Returns "unknown" for a dtype that has no entry below.
const char* dtype_to_cuda_type(const Dtype& dtype) {
  const char* cuda_name = "unknown";
  switch (dtype) {
    case bool_:
      cuda_name = "bool";
      break;
    case int8:
      cuda_name = "int8_t";
      break;
    case int16:
      cuda_name = "int16_t";
      break;
    case int32:
      cuda_name = "int32_t";
      break;
    case int64:
      cuda_name = "int64_t";
      break;
    case uint8:
      cuda_name = "uint8_t";
      break;
    case uint16:
      cuda_name = "uint16_t";
      break;
    case uint32:
      cuda_name = "uint32_t";
      break;
    case uint64:
      cuda_name = "uint64_t";
      break;
    case float16:
      cuda_name = "__half";
      break;
    case bfloat16:
      cuda_name = "__nv_bfloat16";
      break;
    case float32:
      cuda_name = "float";
      break;
    case float64:
      cuda_name = "double";
      break;
    case complex64:
      cuda_name = "mlx::core::cu::complex64_t";
      break;
    default:
      break;
  }
  return cuda_name;
}

// Make `device` current and create a new graph with cudaGraphCreate, storing
// it in handle_. The result of the CUDA call goes through CHECK_CUDA_ERROR.
CudaGraph::CudaGraph(cu::Device& device) {
  device.make_current();
  CHECK_CUDA_ERROR(cudaGraphCreate(&handle_, 0));
}

// End the capture on `stream` with cudaStreamEndCapture and store the
// resulting graph in handle_.
void CudaGraph::end_capture(cudaStream_t stream) {
  CHECK_CUDA_ERROR(cudaStreamEndCapture(stream, &handle_));
}

// Instantiate `graph` into handle_ with cudaGraphInstantiate. Must only be
// called while handle_ is still null (checked with an assert).
void CudaGraphExec::instantiate(cudaGraph_t graph) {
  assert(handle_ == nullptr);
  CHECK_CUDA_ERROR(cudaGraphInstantiate(&handle_, graph, nullptr, nullptr, 0));
}

// Make `device` current and create a stream with the cudaStreamNonBlocking
// flag, storing it in handle_.
CudaStream::CudaStream(cu::Device& device) {
  device.make_current();
  CHECK_CUDA_ERROR(cudaStreamCreateWithFlags(&handle_, cudaStreamNonBlocking));
}

// Allocate a temporary workspace of at least `workspace_size` bytes and
// return a device pointer to it.
//
// The size is rounded up to a multiple of 256 bytes. The allocation is
// wrapped in an int8 array that is registered with `encoder` as a temporary.
// Returns nullptr when `workspace_size` is 0.
//
// Throws std::runtime_error when the rounded size cannot be represented as
// the (32-bit) length of the backing array.
void* allocate_workspace(cu::CommandEncoder& encoder, size_t workspace_size) {
  if (workspace_size == 0) {
    return nullptr;
  }

  // Workspace allocation should not be captured.
#ifndef NDEBUG
  cudaStreamCaptureStatus status;
  CHECK_CUDA_ERROR(cudaStreamIsCapturing(encoder.stream(), &status));
  assert(status == cudaStreamCaptureStatusNone);
#endif

  // Ensure workspace is 256-byte aligned.
  const size_t aligned_size = cuda::ceil_div(workspace_size, 256) * 256;
  // The array shape stores the length as an int, so refuse sizes that would
  // silently wrap instead of allocating a buffer of the wrong size.
  if (aligned_size > static_cast<size_t>(INT32_MAX)) {
    throw std::runtime_error(
        fmt::format(
            "[allocate_workspace] Workspace of {} bytes is too large.",
            workspace_size));
  }
  int nbytes = static_cast<int>(aligned_size);
  array workspace(cu::malloc_async(nbytes, encoder), {nbytes}, int8);
  encoder.add_temporary(workspace);
  return gpu_ptr<void>(workspace);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/utils.h
================================================
// Copyright © 2025 Apple Inc.

// This file includes utilities that are used by C++ code (i.e. .cpp files).

#pragma once

#include "mlx/array.h"
#include "mlx/backend/cuda/allocator.h"
#include "mlx/backend/cuda/cuda_utils.h"

namespace mlx::core {

// Block size suggested by the CUDA occupancy API for `kernel`. A CUfunction
// is queried through the driver API, anything else through the runtime API.
template <typename T>
inline uint32_t max_occupancy_block_dim(T kernel) {
  int min_grid_size; // Required by the API but not used.
  int block_dim;
  if constexpr (!std::is_same_v<T, CUfunction>) {
    CHECK_CUDA_ERROR(
        cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_dim, kernel));
  } else {
    CHECK_CUDA_ERROR(cuOccupancyMaxPotentialBlockSize(
        &min_grid_size, &block_dim, kernel, 0, 0, 0));
  }
  return block_dim;
}

// Device pointer to the first element of `arr`, typed as T*: the data
// pointer of the array's CudaBuffer advanced by arr.offset() bytes.
template <typename T>
inline T* gpu_ptr(array& arr) {
  auto* buffer = static_cast<cu::CudaBuffer*>(arr.buffer().ptr());
  char* base = static_cast<char*>(buffer->data);
  return reinterpret_cast<T*>(base + arr.offset());
}

// For const array, keep constness in pointer unless it is untyped.
// That is, the result is `const T*`, except for T = void where it is a plain
// `void*`. Implemented on top of the non-const overload.
template <typename T>
inline std::conditional_t<std::is_same_v<T, void>, void*, const T*> gpu_ptr(
    const array& arr) {
  return gpu_ptr<T>(const_cast<array&>(arr));
}

struct Dtype;

// Convert Dtype to CUDA C++ types.
// The result is the type name as a string, "unknown" if there is no mapping.
const char* dtype_to_cuda_type(const Dtype& dtype);

// Allocate an empty array and add it as temporary.
// The size is rounded up to a multiple of 256 bytes; returns nullptr when
// workspace_size is 0.
void* allocate_workspace(cu::CommandEncoder& encoder, size_t workspace_size);

} // namespace mlx::core


================================================
FILE: mlx/backend/cuda/vector_types.cuh
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <cuda_bf16.h>
#include <cuda_fp16.h>

namespace mlx::core::cu {

// Trait mapping a scalar type T to the corresponding two-element CUDA vector
// type, exposed as |Vector2<T>::type|. Only the specializations below are
// defined; the primary template is left undefined.
template <typename T>
struct Vector2;

// double -> double2
template <>
struct Vector2<double> {
  using type = double2;
};

// float -> float2
template <>
struct Vector2<float> {
  using type = float2;
};

// __half -> __half2
template <>
struct Vector2<__half> {
  using type = __half2;
};

// __nv_bfloat16 -> __nv_bfloat162
template <>
struct Vector2<__nv_bfloat16> {
  using type = __nv_bfloat162;
};

// Shorthand for |Vector2<T>::type|.
template <typename T>
using Vector2_t = typename Vector2<T>::type;

// Aggregate of four values of type T with members named x, y, z and w.
template <typename T>
struct Vector4 {
  T x, y, z, w;
};

// Alias kept for symmetry with Vector2_t; identical to Vector4<T>.
template <typename T>
using Vector4_t = Vector4<T>;

// Convenience names for the four-element bfloat16 / half / float vectors.
using bf16x4 = Vector4_t<__nv_bfloat16>;
using fp16x4 = Vector4_t<__half>;
using fp32x4 = Vector4_t<float>;

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/worker.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/cuda/worker.h"
#include "mlx/backend/cuda/device.h"

namespace mlx::core::cu {

// Create the signaling stream and event for device |d| and start the worker
// thread, which runs thread_fn() on this object.
Worker::Worker(Device& d)
    : signal_stream_(d),
      // The event is created with timing disabled and blocking sync enabled.
      signal_event_(d, cudaEventDisableTiming | cudaEventBlockingSync),
      worker_(&Worker::thread_fn, this) {}

// Ask the worker thread to stop and block until it has exited.
Worker::~Worker() {
  {
    // Set the flag while holding the mutex that the worker's wait uses.
    std::lock_guard lock(mtx_);
    stop_ = true;
  }
  // Wake the worker so its wait predicate sees |stop_|.
  cond_.notify_one();
  worker_.join();
}

// Queue |task| in the pending list. Pending tasks are handed to the worker
// thread by the next call to commit().
void Worker::add_task(std::function<void()> task) {
  pending_tasks_.emplace_back(std::move(task));
}

// Host callback queued by commit() through cudaLaunchHostFunc. |data| is the
// Worker that queued it: bump its signaled batch counter under the mutex and
// wake the worker thread.
void Worker::signal(void* data) {
  auto* self = static_cast<Worker*>(data);
  {
    std::lock_guard guard(self->mtx_);
    ++self->signaled_batch_;
  }
  self->cond_.notify_one();
}

// Hand the pending tasks to the worker thread as a new batch and arrange for
// that batch to be signaled after the work already queued on |stream|.
void Worker::commit(cudaStream_t stream) {
  // Nothing to do when no task was added since the last commit.
  if (pending_tasks_.empty()) {
    return;
  }
  {
    std::lock_guard lock(mtx_);
    // Move pending tasks into ready tasks, keyed by the new batch id.
    worker_tasks_[++committed_batch_] = std::move(pending_tasks_);
  }
  // Record the event on |stream|, make |signal_stream_| wait for it, then
  // queue signal() as a host function on |signal_stream_| behind that wait.
  signal_event_.record(stream);
  signal_event_.wait(signal_stream_);
  CHECK_CUDA_ERROR(cudaLaunchHostFunc(signal_stream_, signal, this));
}

void Worker::thread_fn() {
  while (!stop_) {
    uint64_t current_batch = 0;
    Tasks tasks;
    {
      std::unique_lock<std::mutex> lk(mtx_);
      cond_.wait(lk, [this, &current_batch] {
        return this->signaled_batch_ > current_batch || this->stop_;
      });
      current_batch = signaled_batch_;
      auto end = worker_tasks_.upper_bound(current_batch);
      for (auto it = worker_tasks_.begin(); it != end; ++it) {
        if (tasks.empty()) {
          tasks = std::move(it->second);
        } else {
          std::move(
              it->second.begin(), it->second.end(), std::back_inserter(tasks));
        }
      }
      worker_tasks_.erase(worker_tasks_.begin(), end);
    }
    // Make sure tasks are cleared before the next wait
    for (int i = 0; i < tasks.size(); ++i) {
      auto task = std::move(tasks[i]);
      task();
    }
  }
}

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/cuda/worker.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/cuda/event.h"

#include <condition_variable>
#include <functional>
#include <map>
#include <mutex>
#include <thread>

namespace mlx::core::cu {

// Run tasks in worker thread, synchronized with cuda stream.
//
// Tasks are queued with add_task() and handed to the worker thread in batches
// by commit(). Copying is disabled.
class Worker {
 public:
  explicit Worker(Device& d);
  ~Worker();

  Worker(const Worker&) = delete;
  Worker& operator=(const Worker&) = delete;

  // Add a pending |task| that will run after it has been committed.
  void add_task(std::function<void()> task);

  // Inform worker thread to run current batches after kernels in |stream|
  // finish running.
  void commit(cudaStream_t stream);

 private:
  // Host callback passed to cudaLaunchHostFunc; the argument is the Worker.
  static void signal(void*);

  // Body of the worker thread.
  void thread_fn();
  // Protects |signaled_batch_|, |stop_| and |worker_tasks_|.
  std::mutex mtx_;
  std::condition_variable cond_;

  // Id of the most recently committed batch and of the most recently signaled
  // batch, respectively.
  uint64_t committed_batch_{0};
  uint64_t signaled_batch_{0};

  // Cuda stream and event for signaling kernel completion.
  CudaStream signal_stream_;
  CudaEvent signal_event_;

  // Set by the destructor to make the worker thread exit.
  bool stop_{false};

  // Tasks are put in |pending_tasks_| first, and then moved to
  // |worker_tasks_| when commit() is called.
  using Tasks = std::vector<std::function<void()>>;
  Tasks pending_tasks_;
  // Committed batches keyed by batch id.
  std::map<uint64_t, Tasks> worker_tasks_;
  // Declared last so that every other member is initialized before the thread
  // starts running.
  std::thread worker_;
};

} // namespace mlx::core::cu


================================================
FILE: mlx/backend/gpu/CMakeLists.txt
================================================
# Add the sources of this directory to the mlx target.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp)


================================================
FILE: mlx/backend/gpu/copy.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/backend/gpu/copy.h"
#include "mlx/primitives.h"

#include <cassert>
#include <numeric>

namespace mlx::core {

// Convenience overload: copy |in| into |out| on the stream of |out|'s
// primitive.
void copy_gpu(const array& in, array& out, CopyType ctype) {
  copy_gpu(in, out, ctype, out.primitive().stream());
}

// Copy |in| into the existing data of |out| using each array's own strides
// and zero offsets. Both arrays are expected to have the same shape.
void copy_gpu_inplace(
    const array& in,
    array& out,
    CopyType ctype,
    const Stream& s) {
  assert(in.shape() == out.shape());
  const auto& data_shape = in.shape();
  const auto& src_strides = in.strides();
  const auto& dst_strides = out.strides();
  copy_gpu_inplace(
      in, out, data_shape, src_strides, dst_strides, 0, 0, ctype, s);
}

// Copy |in| into the existing data of |out|, reading the input with the
// caller-supplied strides |i_strides| and starting offset |i_offset|. The
// output uses its own strides and a zero offset. Both arrays are expected to
// have the same shape.
void copy_gpu_inplace(
    const array& in,
    array& out,
    const Strides& i_strides,
    int64_t i_offset,
    CopyType ctype,
    const Stream& s) {
  assert(in.shape() == out.shape());
  const auto& data_shape = in.shape();
  const auto& dst_strides = out.strides();
  copy_gpu_inplace(
      in, out, data_shape, i_strides, dst_strides, i_offset, 0, ctype, s);
}

// Build a new array with the shape and dtype of |arr| and fill it with a
// General copy of |arr| on stream |s|.
array contiguous_copy_gpu(const array& arr, const Stream& s) {
  array result(arr.shape(), arr.dtype(), nullptr, {});
  copy_gpu(arr, result, CopyType::General, s);
  return result;
}

// Flatten the axes [start_axis, end_axis] of |x|. Negative axes count from the
// end; after wrapping, the range is clamped to [0, ndim - 1].
array flatten_in_eval(const array& x, int start_axis, int end_axis, Stream s) {
  const int ndim = x.ndim();

  // Wrap negative axes.
  start_axis = start_axis < 0 ? start_axis + ndim : start_axis;
  end_axis = end_axis < 0 ? end_axis + ndim : end_axis;

  // Clamp to the valid range.
  start_axis = std::max(0, start_axis);
  end_axis = std::min(ndim - 1, end_axis);

  return reshape_in_eval(x, Flatten::output_shape(x, start_axis, end_axis), s);
}

// Create an array of the given |shape| with |x|'s dtype and let reshape_gpu()
// fill it from |x| on stream |s|.
array reshape_in_eval(const array& x, Shape shape, Stream s) {
  array out(std::move(shape), x.dtype(), nullptr, {});
  reshape_gpu(x, out, s);
  return out;
}

// Return a view of |x| with its axes permuted according to |axes|. No data is
// copied: the result shares |x|'s buffer with permuted shape and strides, and
// its contiguity flags are recomputed for the new layout.
array transpose_in_eval(const array& x, const std::vector<int>& axes) {
  const size_t num_axes = axes.size();
  Shape permuted_shape(num_axes);
  Strides permuted_strides(num_axes);
  for (size_t i = 0; i < num_axes; ++i) {
    const int axis = axes[i];
    permuted_shape[i] = x.shape(axis);
    permuted_strides[i] = x.strides(axis);
  }

  auto [data_size, row_contiguous, col_contiguous] =
      check_contiguity(permuted_shape, permuted_strides);
  // The view is contiguous when the permuted layout spans all of |x|'s data.
  bool contiguous = data_size == x.data_size();

  array out(std::move(permuted_shape), x.dtype(), nullptr, {});
  out.copy_shared_buffer(
      x,
      std::move(permuted_strides),
      {contiguous, row_contiguous, col_contiguous},
      x.data_size());
  return out;
}

// Return a view of |x| with |axis1| and |axis2| exchanged. Negative axes count
// from the end. Implemented as a transpose with the identity permutation in
// which the two axes are swapped.
array swapaxes_in_eval(const array& x, int axis1, int axis2) {
  const int ndim = x.ndim();
  const int first = axis1 < 0 ? axis1 + ndim : axis1;
  const int second = axis2 < 0 ? axis2 + ndim : axis2;

  std::vector<int> permutation(ndim);
  std::iota(permutation.begin(), permutation.end(), 0);
  std::swap(permutation[first], permutation[second]);
  return transpose_in_eval(x, permutation);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/gpu/copy.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include "mlx/backend/common/copy.h"
#include "mlx/stream.h"

#include <optional>
#include <vector>

namespace mlx::core {

// Generic copy inplace
void copy_gpu_inplace(
    const array& in,
    array& out,
    const Shape& data_shape,
    const Strides& i_strides,
    const Strides& o_strides,
    int64_t i_offset,
    int64_t o_offset,
    CopyType ctype,
    const Stream& s,
    std::optional<array> dynamic_i_offset = std::nullopt,
    std::optional<array> dynamic_o_offset = std::nullopt);

void copy_gpu(const array& src, array& out, CopyType ctype, const Stream& s);
void copy_gpu(const array& src, array& out, CopyType ctype);

void copy_gpu_inplace(
    const array& in,
    array& out,
    CopyType ctype,
    const Stream& s);

void copy_gpu_inplace(
    const array& in,
    array& out,
    const Strides& i_strides,
    int64_t i_offset,
    CopyType ctype,
    const Stream& s);

// Fill the output with the scalar val
void fill_gpu(const array& val, array& out, const Stream& s);

// Return a contiguous array with same shape that copies the data of |arr|.
array contiguous_copy_gpu(const array& arr, const Stream& s);

// Copy data from |in| and reshape it to |out|'s shape.
void reshape_gpu(const array& in, array& out, Stream s);

// Like the normal ops but safe to call in eval_gpu.
array flatten_in_eval(const array& x, int start_axis, int end_axis, Stream s);
array reshape_in_eval(const array& x, Shape shape, Stream s);
array transpose_in_eval(const array& x, const std::vector<int>& axes);
array swapaxes_in_eval(const array& x, int axis1, int axis2);

} // namespace mlx::core


================================================
FILE: mlx/backend/gpu/device_info.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

#include "mlx/api.h"

namespace mlx::core::gpu {

MLX_API bool is_available();

/**
 * Get the number of available GPU devices.
 */
MLX_API int device_count();

/**
 * Get information about a GPU device.
 *
 * Returns a map of device properties. Keys vary by backend:
 *   - device_name (string): Device name
 *   - architecture (string): Architecture identifier
 *   - total_memory/memory_size (size_t): Total device memory
 *   - free_memory (size_t): Available memory (CUDA only)
 *   - uuid (string): Device UUID (CUDA only)
 *   - pci_bus_id (string): PCI bus ID (CUDA only)
 *   - compute_capability_major/minor (size_t): Compute capability (CUDA only)
 */
MLX_API const
    std::unordered_map<std::string, std::variant<std::string, size_t>>&
    device_info(int device_index = 0);

} // namespace mlx::core::gpu


================================================
FILE: mlx/backend/gpu/eval.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <future>
#include <memory>

#include "mlx/array.h"
#include "mlx/stream.h"

namespace mlx::core::gpu {

void new_stream(Stream stream);
void eval(array& arr);
void finalize(Stream s);
void synchronize(Stream s);

} // namespace mlx::core::gpu


================================================
FILE: mlx/backend/gpu/primitives.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/primitives.h"
#include "mlx/backend/common/slicing.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/slicing.h"

#if defined(MLX_USE_CUDA)
#include <nvtx3/nvtx3.hpp>
#endif

#include <cassert>

// MLX_PROFILER_RANGE(message): when MLX_USE_CUDA is defined, declares an NVTX
// scoped range named |message| as the local variable |r|; otherwise it expands
// to nothing.
#if defined(MLX_USE_CUDA)
#define MLX_PROFILER_RANGE(message) nvtx3::scoped_range r(message)
#else
#define MLX_PROFILER_RANGE(message)
#endif

namespace mlx::core {

// GPU entry point for AsStrided: delegates to eval(); no GPU-specific work is
// done here.
void AsStrided::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("AsStrided::eval_gpu");
  eval(inputs, out);
}

void AsType::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("AsType::eval_gpu");
  CopyType ctype =
      inputs[0].flags().contiguous ? CopyType::Vector : CopyType::General;
  copy_gpu(inputs[0], out, ctype);
}

// GPU entry point for Broadcast: delegates to eval(); no GPU-specific work is
// done here.
void Broadcast::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Broadcast::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for BroadcastAxes: delegates to eval(); no GPU-specific work
// is done here.
void BroadcastAxes::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("BroadcastAxes::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for Concatenate: forwards the inputs, the output and the
// stored axis to concatenate_gpu() on this primitive's stream.
void Concatenate::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Concatenate::eval_gpu");
  concatenate_gpu(inputs, out, axis_, stream());
}

// GPU entry point for Contiguous. The input buffer is shared with |out| when
// the input is already row contiguous (or column contiguous, if
// |allow_col_major_| is set) and its buffer is at most |extra_bytes| larger
// than the bytes |out| needs; otherwise the input is copied.
void Contiguous::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Contiguous::eval_gpu");
  assert(inputs.size() == 1);
  const auto& src = inputs[0];

  // Slack tolerated between the input buffer size and the output byte size.
  constexpr size_t extra_bytes = 16384;
  const bool buffer_fits = src.buffer_size() <= out.nbytes() + extra_bytes;
  const bool layout_ok = src.flags().row_contiguous ||
      (allow_col_major_ && src.flags().col_contiguous);

  if (!(buffer_fits && layout_ok)) {
    copy_gpu(src, out, CopyType::General);
    return;
  }
  out.copy_shared_buffer(src);
}

// GPU entry point for Copy: delegates to eval(); no GPU-specific work is done
// here.
void Copy::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Copy::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for CustomTransforms (multi-output): delegates to eval(); no
// GPU-specific work is done here.
void CustomTransforms::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  MLX_PROFILER_RANGE("CustomTransforms::eval_gpu");
  eval(inputs, outputs);
}

// GPU entry point for Depends (multi-output): delegates to eval(); no
// GPU-specific work is done here.
void Depends::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  MLX_PROFILER_RANGE("Depends::eval_gpu");
  eval(inputs, outputs);
}

// GPU entry point for DynamicSlice: copy a window of inputs[0] into |out|,
// where the window's start comes from the array inputs[1].
void DynamicSlice::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("DynamicSlice::eval_gpu");
  // Empty output: give it a zero-size allocation and stop.
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  auto& in = inputs[0];
  auto& start = inputs[1];
  out.set_data(allocator::malloc(out.nbytes()));

  auto s = stream();
  // The input offset is produced as an array by compute_dynamic_offset() and
  // handed to the copy as |dynamic_i_offset|; the static offsets stay 0.
  auto in_offset = compute_dynamic_offset(start, in.strides(), axes_, s);
  copy_gpu_inplace(
      /* const array& src = */ in,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ out.shape(),
      /* const Strides& i_strides = */ in.strides(),
      /* const Strides& o_strides = */ out.strides(),
      /* int64_t i_offset = */ 0,
      /* int64_t o_offset = */ 0,
      /* CopyType ctype = */ CopyType::GeneralGeneral,
      /* const Stream& s = */ s,
      /* std::optional<array> dynamic_i_offset = */ std::move(in_offset),
      /* std::optional<array> dynamic_o_offset = */ std::nullopt);
}

// GPU entry point for DynamicSliceUpdate: |out| becomes inputs[0] with
// inputs[1] written into it at a position derived from inputs[2].
void DynamicSliceUpdate::eval_gpu(
    const std::vector<array>& inputs,
    array& out) {
  MLX_PROFILER_RANGE("DynamicSliceUpdate::eval_gpu");
  // Empty output: give it a zero-size allocation and stop.
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  auto& in = inputs[0];
  auto& upd = inputs[1];
  auto& start_indices = inputs[2];

  // Nothing to write: the output simply shares the input's buffer.
  if (upd.size() == 0) {
    out.copy_shared_buffer(in);
    return;
  }

  // Copy or donate input to output
  auto s = stream();
  // Pick the copy type: Scalar for a single stored element, Vector when the
  // input is contiguous and stores every element, General otherwise.
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
  copy_gpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, s);

  // The output offset is produced as an array by compute_dynamic_offset() and
  // handed to the copy as |dynamic_o_offset|; the static offsets stay 0.
  auto out_offset =
      compute_dynamic_offset(start_indices, out.strides(), axes_, s);
  copy_gpu_inplace(
      /* const array& src = */ upd,
      /* array& dst = */ out,
      /* const Shape& data_shape = */ upd.shape(),
      /* const Strides& i_strides = */ upd.strides(),
      /* const Strides& o_strides = */ out.strides(),
      /* int64_t i_offset = */ 0,
      /* int64_t o_offset = */ 0,
      /* CopyType ctype = */ CopyType::GeneralGeneral,
      /* const Stream& s = */ s,
      /* std::optional<array> dynamic_i_offset = */ std::nullopt,
      /* std::optional<array> dynamic_o_offset = */ std::move(out_offset));
}

// GPU entry point for ExpandDims: delegates to eval(); no GPU-specific work is
// done here.
void ExpandDims::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("ExpandDims::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for Full: copy the first input into |out|. The copy type is
// Scalar when the input stores a single element, Vector when it is flagged
// contiguous, and General otherwise.
void Full::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Full::eval_gpu");
  // NOTE(review): |in| is taken by value here, unlike the other eval_gpu
  // functions in this file which bind a reference -- confirm whether the copy
  // is intentional before changing it.
  auto in = inputs[0];
  CopyType ctype;
  if (in.data_size() == 1) {
    ctype = CopyType::Scalar;
  } else if (in.flags().contiguous) {
    ctype = CopyType::Vector;
  } else {
    ctype = CopyType::General;
  }
  copy_gpu(in, out, ctype);
}

// GPU entry point for Flatten: handled by reshape_gpu() on this primitive's
// stream.
void Flatten::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Flatten::eval_gpu");
  reshape_gpu(inputs[0], out, stream());
}

// GPU entry point for NumberOfElements: delegates to eval(); no GPU-specific
// work is done here.
void NumberOfElements::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("NumberOfElements::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for Pad: checks the inputs (in debug builds) and forwards to
// pad_gpu() with the stored axes and low pad sizes.
void Pad::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Pad::eval_gpu");
  // Inputs must be base input array and scalar val array
  assert(inputs.size() == 2);
  auto& in = inputs[0];
  auto& val = inputs[1];

  // Padding value must be a scalar
  assert(val.size() == 1);

  // Padding value, input and output must be of the same type
  assert(val.dtype() == in.dtype() && in.dtype() == out.dtype());

  pad_gpu(in, val, out, axes_, low_pad_size_, stream());
}

// GPU entry point for Reshape: handled by reshape_gpu() on this primitive's
// stream.
void Reshape::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Reshape::eval_gpu");
  reshape_gpu(inputs[0], out, stream());
}

// GPU entry point for Split (multi-output): delegates to eval(); no
// GPU-specific work is done here.
void Split::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  MLX_PROFILER_RANGE("Split::eval_gpu");
  eval(inputs, outputs);
}

// GPU entry point for Slice: forwards the single input to slice_gpu() with the
// stored start indices and strides.
void Slice::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Slice::eval_gpu");
  assert(inputs.size() == 1);
  // Empty output: give it a zero-size allocation and stop.
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  auto& in = inputs[0];
  slice_gpu(in, out, start_indices_, strides_, stream());
}

// GPU entry point for Squeeze: delegates to eval(); no GPU-specific work is
// done here.
void Squeeze::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Squeeze::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for StopGradient: delegates to eval(); no GPU-specific work
// is done here.
void StopGradient::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("StopGradient::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for Transpose: delegates to eval(); no GPU-specific work is
// done here.
void Transpose::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Transpose::eval_gpu");
  eval(inputs, out);
}

// GPU entry point for Unflatten: handled by reshape_gpu() on this primitive's
// stream.
void Unflatten::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("Unflatten::eval_gpu");
  reshape_gpu(inputs[0], out, stream());
}

// GPU entry point for View: expose the input's data under |out|'s dtype. When
// the layout permits, |out| shares the input buffer with rescaled strides;
// otherwise the input is first copied into a freshly allocated temporary and
// |out| shares that buffer instead.
void View::eval_gpu(const std::vector<array>& inputs, array& out) {
  MLX_PROFILER_RANGE("View::eval_gpu");
  auto& in = inputs[0];
  auto ibytes = size_of(in.dtype());
  auto obytes = size_of(out.dtype());
  // Conditions for buffer copying (disjunction):
  // - type size is the same
  // - type size is smaller and the last axis is contiguous
  // - the entire array is row contiguous
  if (ibytes == obytes || (obytes < ibytes && in.strides().back() == 1) ||
      in.flags().row_contiguous) {
    auto strides = in.strides();
    // Convert every stride except the last from units of input elements to
    // units of output elements.
    for (int i = 0; i < static_cast<int>(strides.size()) - 1; ++i) {
      strides[i] *= ibytes;
      strides[i] /= obytes;
    }
    // The data size is converted to output elements in the same way.
    out.copy_shared_buffer(
        in, strides, in.flags(), in.data_size() * ibytes / obytes);
  } else {
    // Copy the input into a new array of the same shape and dtype.
    auto tmp = array(in.shape(), in.dtype(), nullptr, {});
    tmp.set_data(allocator::malloc(tmp.nbytes()));
    copy_gpu_inplace(in, tmp, CopyType::General, stream());

    // The output is marked contiguous and row contiguous; it is also column
    // contiguous when it has at most one element or a single dimension holds
    // all of its elements.
    auto flags = out.flags();
    flags.contiguous = true;
    flags.row_contiguous = true;
    auto max_dim = std::max_element(out.shape().begin(), out.shape().end());
    flags.col_contiguous = out.size() <= 1 || out.size() == *max_dim;
    out.copy_shared_buffer(tmp, out.strides(), flags, out.size());
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/gpu/scan.h
================================================
#pragma once

#include "mlx/array.h"
#include "mlx/primitives.h"

namespace mlx::core {

void scan_gpu_inplace(
    array in,
    array& out,
    Scan::ReduceType reduce_type,
    int axis,
    bool reverse,
    bool inclusive,
    const Stream& s);

} // namespace mlx::core


================================================
FILE: mlx/backend/gpu/slicing.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/backend/common/slicing.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/slicing.h"

namespace mlx::core {

// GPU slice: forwards to slice(). The stream argument is unused.
void slice_gpu(
    const array& in,
    array& out,
    const Shape& start_indices,
    const Shape& strides,
    const Stream&) {
  slice(in, out, start_indices, strides);
}

void pad_gpu(
    const array& in,
    const array& val,
    array& out,
    const std::vector<int>& axes,
    const Shape& low_pad_size,
    const Stream& s) {
  // Fill output with val
  fill_gpu(val, out, s);

  // Find offset for start of input values
  size_t data_offset = 0;
  for (int i = 0; i < axes.size(); i++) {
    auto ax = axes[i] < 0 ? out.ndim() + axes[i] : axes[i];
    data_offset += out.strides()[ax] * low_pad_size[i];
  }

  // Extract slice from output where input will be pasted
  array out_slice(in.shape(), out.dtype(), nullptr, {});
  out_slice.copy_shared_buffer(
      out, out.strides(), out.flags(), out_slice.size(), data_offset);

  // Copy input values into the slice
  copy_gpu_inplace(in, out_slice, CopyType::GeneralGeneral, s);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/gpu/slicing.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/array.h"

namespace mlx::core {

void slice_gpu(
    const array& in,
    array& out,
    const Shape& start_indices,
    const Shape& strides,
    const Stream& s);

void concatenate_gpu(
    const std::vector<array>& inputs,
    array& out,
    int axis,
    const Stream& s);

void pad_gpu(
    const array& in,
    const array& val,
    array& out,
    const std::vector<int>& axes,
    const Shape& low_pad_size,
    const Stream& s);

array compute_dynamic_offset(
    const array& indices,
    const Strides& strides,
    const std::vector<int>& axes,
    const Stream& s);

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/CMakeLists.txt
================================================
function(make_jit_source SRC_FILE)
  # This function takes a metal header file, runs the C preprocessor on it, and
  # makes the processed contents available as a string in a C++ function
  # mlx::core::metal::${SRC_NAME}()
  #
  # To use the function, declare it in jit/includes.h and include
  # jit/includes.h.
  #
  # Additional arguments to this function are treated as dependencies in the
  # CMake build system.
  #
  # SRC_NAME is the last path component of SRC_FILE; it names both the
  # generated jit/${SRC_NAME}.cpp and the custom target that produces it.
  get_filename_component(SRC_NAME ${SRC_FILE} NAME)
  add_custom_command(
    OUTPUT jit/${SRC_NAME}.cpp
    COMMAND
      bash ${CMAKE_CURRENT_SOURCE_DIR}/make_compiled_preamble.sh
      ${CMAKE_CURRENT_BINARY_DIR}/jit ${CMAKE_C_COMPILER} ${PROJECT_SOURCE_DIR}
      ${SRC_FILE}
    DEPENDS make_compiled_preamble.sh kernels/${SRC_FILE}.h ${ARGN})
  add_custom_target(${SRC_NAME} DEPENDS jit/${SRC_NAME}.cpp)
  add_dependencies(mlx ${SRC_NAME})
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/jit/${SRC_NAME}.cpp)
endfunction(make_jit_source)

make_jit_source(utils kernels/bf16.h kernels/bf16_math.h kernels/complex.h
                kernels/defines.h kernels/logging.h)
make_jit_source(unary_ops kernels/erf.h kernels/expm1f.h kernels/fp8.h)
make_jit_source(binary_ops)
make_jit_source(ternary_ops)
make_jit_source(reduce_utils kernels/atomic.h kernels/reduction/ops.h)
make_jit_source(indexing/scatter kernels/indexing/indexing.h)
make_jit_source(indexing/masked_scatter)
make_jit_source(indexing/gather kernels/indexing/indexing.h)
make_jit_source(indexing/gather_front kernels/indexing/indexing.h)
make_jit_source(indexing/gather_axis)
make_jit_source(indexing/scatter_axis)
make_jit_source(hadamard)

if(MLX_METAL_JIT)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/jit_kernels.cpp)
  make_jit_source(arange)
  make_jit_source(copy)
  make_jit_source(unary)
  make_jit_source(binary)
  make_jit_source(binary_two)
  make_jit_source(fft kernels/fft/radix.h kernels/fft/readwrite.h)
  make_jit_source(logsumexp)
  make_jit_source(ternary)
  make_jit_source(softmax)
  make_jit_source(scan)
  make_jit_source(sort)
  make_jit_source(
    reduce kernels/reduction/reduce_all.h kernels/reduction/reduce_col.h
    kernels/reduction/reduce_row.h kernels/reduction/reduce_init.h)
  make_jit_source(
    steel/gemm/gemm kernels/steel/utils.h kernels/steel/gemm/loader.h
    kernels/steel/gemm/mma.h kernels/steel/gemm/params.h
    kernels/steel/gemm/transforms.h)
  make_jit_source(steel/gemm/kernels/steel_gemm_fused)
  make_jit_source(steel/gemm/kernels/steel_gemm_masked kernels/steel/defines.h)
  make_jit_source(steel/gemm/kernels/steel_gemm_gather)
  make_jit_source(steel/gemm/kernels/steel_gemm_splitk)
  make_jit_source(steel/gemm/kernels/steel_gemm_segmented)
  make_jit_source(
    steel/conv/conv
    kernels/steel/utils.h
    kernels/steel/defines.h
    kernels/steel/gemm/mma.h
    kernels/steel/gemm/transforms.h
    kernels/steel/conv/params.h
    kernels/steel/conv/loader.h
    kernels/steel/conv/loaders/loader_channel_l.h
    kernels/steel/conv/loaders/loader_channel_n.h)
  make_jit_source(steel/conv/kernels/steel_conv)
  make_jit_source(steel/conv/kernels/steel_conv_3d)
  make_jit_source(steel/conv/kernels/steel_conv_general kernels/steel/defines.h
                  kernels/steel/conv/loaders/loader_general.h)

  make_jit_source(quantized_utils)
  make_jit_source(quantized kernels/quantized_utils.h)
  make_jit_source(fp_quantized kernels/quantized_utils.h kernels/fp8.h
                  kernels/fp4.h)
  make_jit_source(gemv_masked)

  make_jit_source(steel/attn/kernels/steel_attention)

  make_jit_source(
    steel/gemm/gemm_nax kernels/steel/utils.h kernels/steel/gemm/nax.h
    kernels/steel/gemm/params.h kernels/steel/gemm/transforms.h)
  make_jit_source(steel/gemm/kernels/steel_gemm_fused_nax)
  make_jit_source(steel/gemm/kernels/steel_gemm_gather_nax)
  make_jit_source(steel/gemm/kernels/steel_gemm_splitk_nax)

  make_jit_source(quantized_nax kernels/quantized_utils.h)
  make_jit_source(fp_quantized_nax kernels/quantized_utils.h kernels/fp8.h
                  kernels/fp4.h)

  make_jit_source(steel/attn/kernels/steel_attention_nax)

else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nojit_kernels.cpp)
endif()

target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device_info.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/conv.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/custom_kernel.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/hadamard.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scaled_dot_product_attention.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/metal.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/quantized.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/normalization.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/scan.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/sort.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/resident.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)

if(NOT MLX_METAL_PATH)
  set(MLX_METAL_PATH ${CMAKE_CURRENT_BINARY_DIR}/kernels/)
endif()

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels)

target_compile_definitions(mlx
                           PRIVATE METAL_PATH="${MLX_METAL_PATH}/mlx.metallib")


================================================
FILE: mlx/backend/metal/allocator.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/metal/allocator.h"
#include "mlx/backend/gpu/device_info.h"
#include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/resident.h"
#include "mlx/memory.h"

#include <mach/vm_page_size.h>
#include <unistd.h>
#include <cstdlib>

namespace mlx::core {

// Resource options used for the heap and for every buffer created in this
// file: shared storage mode with untracked hazard tracking.
constexpr size_t resource_options =
    MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeUntracked;

namespace allocator {

// The generic allocator accessor returns the Metal allocator.
Allocator& allocator() {
  return metal::allocator();
}

void* Buffer::raw_ptr() {
  if (!ptr_) {
    return nullptr;
  }
  return static_cast<MTL::Buffer*>(ptr_)->contents();
}

} // namespace allocator

namespace metal {

// Set up the allocator for the GPU device: the buffer cache, the residency
// set, the memory limits derived from the device info, and (outside of a
// paravirtual device) a heap that is added to the residency set.
MetalAllocator::MetalAllocator()
    : device_(device(mlx::core::Device::gpu).mtl_device()),
      buffer_cache_(
          vm_page_size,
          // Size of a cached buffer.
          [](MTL::Buffer* buf) { return buf->length(); },
          // Dispose of a cached buffer: buffers that do not belong to a heap
          // are removed from the residency set before being released.
          [this](MTL::Buffer* buf) {
            if (!buf->heap()) {
              residency_set_.erase(buf);
            }
            buf->release();
          }),
      residency_set_(device_) {
  auto pool = metal::new_scoped_memory_pool();
  const auto& info = gpu::device_info(0);
  auto memsize = std::get<size_t>(info.at("memory_size"));
  auto max_rec_size =
      std::get<size_t>(info.at("max_recommended_working_set_size"));
  resource_limit_ = std::get<size_t>(info.at("resource_limit"));
  // Block limit: the smaller of 1.5x the recommended working set size and 95%
  // of the memory size. GC limit: 95% of the recommended working set size,
  // capped by the block limit. The cache may grow up to the block limit.
  block_limit_ = std::min(1.5 * max_rec_size, 0.95 * memsize);
  gc_limit_ = std::min(static_cast<size_t>(0.95 * max_rec_size), block_limit_);
  max_pool_size_ = block_limit_;
  device(mlx::core::Device::gpu)
      .set_residency_set(residency_set_.mtl_residency_set());
  // No heap is created when running on the paravirtual device.
  bool is_vm = std::get<std::string>(info.at("device_name")) ==
      "Apple Paravirtual device";
  if (is_vm) {
    return;
  }
  auto heap_desc = MTL::HeapDescriptor::alloc()->init();
  heap_desc->setResourceOptions(resource_options);
  heap_desc->setSize(heap_size_);
  heap_ = device_->newHeap(heap_desc);
  heap_desc->release();
  residency_set_.insert(heap_);
}

// Release the heap (if one was created) and empty the buffer cache.
MetalAllocator::~MetalAllocator() {
  auto pool = metal::new_scoped_memory_pool();
  if (heap_) {
    heap_->release();
  }
  buffer_cache_.clear();
}

// Set the maximum size of the buffer cache to |limit| and return the previous
// value.
size_t MetalAllocator::set_cache_limit(size_t limit) {
  std::unique_lock lk(mutex_);
  const size_t previous = max_pool_size_;
  max_pool_size_ = limit;
  return previous;
}

// Set the block limit to |limit| and return the previous value. The GC limit
// is recomputed as the smaller of the new block limit and 95% of the device's
// recommended maximum working set size.
size_t MetalAllocator::set_memory_limit(size_t limit) {
  std::unique_lock lk(mutex_);
  const size_t previous = block_limit_;
  block_limit_ = limit;
  const auto working_set_cap =
      static_cast<size_t>(0.95 * device_->recommendedMaxWorkingSetSize());
  gc_limit_ = std::min(block_limit_, working_set_cap);
  return previous;
}

// Return the current block limit. The value is read without taking |mutex_|.
size_t MetalAllocator::get_memory_limit() {
  return block_limit_;
}

// Set the wired limit to |limit|, resize the residency set to match, and
// return the previous value.
size_t MetalAllocator::set_wired_limit(size_t limit) {
  std::unique_lock lk(mutex_);
  const size_t previous = wired_limit_;
  wired_limit_ = limit;
  residency_set_.resize(wired_limit_);
  return previous;
}

// Allocate a buffer of at least |size| bytes.
//
// Returns an empty Buffer for a zero-size request. Otherwise a cached buffer
// is reused when one is available; if not, a new buffer is created from the
// heap (small requests) or from the device.
//
// Throws std::runtime_error when |size| exceeds the device's maximum buffer
// length, when the resource limit is reached, or when the allocation fails.
Buffer MetalAllocator::malloc(size_t size) {
  // Metal doesn't like empty buffers
  if (size == 0) {
    return Buffer{nullptr};
  }

  // More helpful message if maximum buffer length is exceeded
  if (size > device_->maxBufferLength()) {
    std::ostringstream msg;
    msg << "[metal::malloc] Attempting to allocate " << size
        << " bytes which is greater than"
        << " the maximum allowed buffer size of " << device_->maxBufferLength()
        << " bytes.";
    throw std::runtime_error(msg.str());
  }

  // Align up memory: sizes above one page are rounded up to a page multiple.
  if (size > vm_page_size) {
    size = vm_page_size * ((size + vm_page_size - 1) / vm_page_size);
  }

  // Try the cache
  std::unique_lock lk(mutex_);
  MTL::Buffer* buf = buffer_cache_.reuse_from_cache(size);
  if (!buf) {
    size_t mem_required = get_active_memory() + get_cache_memory() + size;

    auto pool = metal::new_scoped_memory_pool();

    // If we have a lot of memory pressure try to reclaim memory from the cache
    // NOTE(review): when this branch is entered only because of the resource
    // limit, |mem_required - gc_limit_| can wrap around (unsigned) to a very
    // large request -- confirm that releasing that much is the intent.
    if (mem_required >= gc_limit_ || num_resources_ >= resource_limit_) {
      num_resources_ -=
          buffer_cache_.release_cached_buffers(mem_required - gc_limit_);
    }

    // Allocate new buffer if needed
    if (num_resources_ >= resource_limit_) {
      std::ostringstream msg;
      msg << "[metal::malloc] Resource limit (" << resource_limit_
          << ") exceeded.";
      throw std::runtime_error(msg.str());
    }
    // The mutex is released while the new buffer is created.
    lk.unlock();
    // Small requests are served from the heap when there is one; fall back to
    // a device buffer if the heap allocation did not produce a buffer.
    if (size < small_size_ && heap_) {
      buf = heap_->newBuffer(size, resource_options);
    }
    if (!buf) {
      buf = device_->newBuffer(size, resource_options);
    }
    if (!buf) {
      std::ostringstream msg;
      msg << "[malloc] Unable to allocate " << size << " bytes.";
      throw std::runtime_error(msg.str());
    }
    lk.lock();
    num_resources_++;
    // Buffers that do not belong to a heap are added to the residency set.
    if (!buf->heap()) {
      residency_set_.insert(buf);
    }
  }

  active_memory_ += buf->length();
  peak_memory_ = std::max(peak_memory_, active_memory_);

  // Maintain the cache below the requested limit
  if (get_cache_memory() > max_pool_size_) {
    auto pool = metal::new_scoped_memory_pool();
    num_resources_ -= buffer_cache_.release_cached_buffers(
        get_cache_memory() - max_pool_size_);
  }

  return Buffer{static_cast<void*>(buf)};
}

// Empties the buffer cache and subtracts the number of buffers it reports as
// released from the live-resource counter.
void MetalAllocator::clear_cache() {
  std::unique_lock guard(mutex_);
  // Scoped memory pool that lives for the duration of the cache clear.
  auto scoped_pool = metal::new_scoped_memory_pool();
  const auto released = buffer_cache_.clear();
  num_resources_ -= released;
}

// Returns a buffer to the allocator. A null buffer is ignored. While the cache
// is below its size limit the buffer is recycled into the cache; otherwise it
// is dropped from the bookkeeping and released.
void MetalAllocator::free(Buffer buffer) {
  auto* mtl_buf = static_cast<MTL::Buffer*>(buffer.ptr());
  if (!mtl_buf) {
    return;
  }

  std::unique_lock guard(mutex_);
  active_memory_ -= mtl_buf->length();

  const bool cache_has_room = get_cache_memory() < max_pool_size_;
  if (cache_has_room) {
    buffer_cache_.recycle_to_cache(mtl_buf);
    return;
  }

  // Cache is full: forget the buffer and release it.
  --num_resources_;
  if (mtl_buf->heap() == nullptr) {
    // Only buffers that do not come from a heap are removed from the
    // residency set here.
    residency_set_.erase(mtl_buf);
  }
  // The actual release happens without holding the allocator mutex.
  guard.unlock();
  auto scoped_pool = metal::new_scoped_memory_pool();
  mtl_buf->release();
}

// Returns the length in bytes reported by the underlying MTL::Buffer.
//
// A null buffer yields 0 instead of calling a member function through a null
// pointer; free() and release() in this allocator accept null buffers too.
size_t MetalAllocator::size(Buffer buffer) const {
  auto* buf = static_cast<MTL::Buffer*>(buffer.ptr());
  if (buf == nullptr) {
    return 0;
  }
  return buf->length();
}

// Wraps caller-provided memory (`ptr`, `size` bytes) in a Metal buffer created
// with a null deallocator and registers it with the allocator statistics.
// Returns a null Buffer when the device fails to create the buffer.
Buffer MetalAllocator::make_buffer(void* ptr, size_t size) {
  auto* wrapped = device_->newBuffer(ptr, size, resource_options, nullptr);
  if (wrapped == nullptr) {
    return Buffer{nullptr};
  }

  std::unique_lock guard(mutex_);
  residency_set_.insert(wrapped);
  active_memory_ += wrapped->length();
  peak_memory_ = std::max(peak_memory_, active_memory_);
  ++num_resources_;
  return Buffer{static_cast<void*>(wrapped)};
}

// Releases a buffer without going through the cache: the statistics and the
// residency set are updated under the mutex, then the Metal buffer itself is
// released with the mutex dropped. A null buffer is ignored.
void MetalAllocator::release(Buffer buffer) {
  auto* mtl_buf = static_cast<MTL::Buffer*>(buffer.ptr());
  if (!mtl_buf) {
    return;
  }

  std::unique_lock guard(mutex_);
  active_memory_ -= mtl_buf->length();
  --num_resources_;
  residency_set_.erase(mtl_buf);
  guard.unlock();

  auto scoped_pool = metal::new_scoped_memory_pool();
  mtl_buf->release();
}

// Accessor for the process-wide Metal allocator.
MetalAllocator& allocator() {
  // The instance is deliberately created with `new` and never deleted: the
  // MetalAllocator destructor therefore does not run at exit and cached
  // buffers are leaked, which can save some time at program exit.
  static MetalAllocator* const instance = new MetalAllocator;
  return *instance;
}

} // namespace metal

// Forwards to MetalAllocator::set_cache_limit on the shared allocator.
size_t set_cache_limit(size_t limit) {
  auto& alloc = metal::allocator();
  return alloc.set_cache_limit(limit);
}
// Forwards to MetalAllocator::set_memory_limit on the shared allocator.
size_t set_memory_limit(size_t limit) {
  auto& alloc = metal::allocator();
  return alloc.set_memory_limit(limit);
}
// Forwards to MetalAllocator::get_memory_limit on the shared allocator.
size_t get_memory_limit() {
  auto& alloc = metal::allocator();
  return alloc.get_memory_limit();
}
// Sets the wired limit on the shared Metal allocator.
//
// Throws std::invalid_argument when `limit` exceeds the
// "max_recommended_working_set_size" entry reported for device 0.
size_t set_wired_limit(size_t limit) {
  const auto& info = gpu::device_info(0);
  const size_t max_working_set =
      std::get<size_t>(info.at("max_recommended_working_set_size"));
  if (limit > max_working_set) {
    throw std::invalid_argument(
        "[metal::set_wired_limit] Setting a wired limit larger than "
        "the maximum working set size is not allowed.");
  }
  return metal::allocator().set_wired_limit(limit);
}
// Forwards to MetalAllocator::get_active_memory on the shared allocator.
size_t get_active_memory() {
  auto& alloc = metal::allocator();
  return alloc.get_active_memory();
}
// Forwards to MetalAllocator::get_peak_memory on the shared allocator.
size_t get_peak_memory() {
  auto& alloc = metal::allocator();
  return alloc.get_peak_memory();
}
// Forwards to MetalAllocator::reset_peak_memory on the shared allocator.
void reset_peak_memory() {
  auto& alloc = metal::allocator();
  alloc.reset_peak_memory();
}
// Forwards to MetalAllocator::get_cache_memory on the shared allocator.
size_t get_cache_memory() {
  auto& alloc = metal::allocator();
  return alloc.get_cache_memory();
}
// Forwards to MetalAllocator::clear_cache on the shared allocator.
void clear_cache() {
  auto& alloc = metal::allocator();
  alloc.clear_cache();
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/allocator.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <map>
#include <mutex>
#include <vector>

#include "mlx/allocator.h"
#include "mlx/backend/common/buffer_cache.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/resident.h"

namespace mlx::core::metal {

using allocator::Buffer;

class MetalAllocator : public allocator::Allocator {
  /**
   * Allocator for Metal GPUs.
   *
   * Keeps a cache of released MTL::Buffer objects, a residency set, and
   * running allocation statistics. Instances cannot be created directly: the
   * constructor and destructor are private and only the friend function
   * allocator() can reach them.
   */
 public:
  // allocator::Allocator interface.
  virtual Buffer malloc(size_t size) override;
  virtual void free(Buffer buffer) override;
  virtual size_t size(Buffer buffer) const override;
  virtual Buffer make_buffer(void* ptr, size_t size) override;
  virtual void release(Buffer buffer) override;

  // Bytes currently accounted as active. Reads the counter without taking
  // mutex_.
  size_t get_active_memory() {
    return active_memory_;
  };
  // Highest value of active_memory_ recorded since the last reset. Reads the
  // counter without taking mutex_.
  size_t get_peak_memory() {
    return peak_memory_;
  };
  // Resets the recorded peak to zero (takes mutex_).
  void reset_peak_memory() {
    std::unique_lock lk(mutex_);
    peak_memory_ = 0;
  };
  // Size reported by the buffer cache. Does not take mutex_.
  size_t get_cache_memory() {
    return buffer_cache_.cache_size();
  };
  // Limit setters/getters; defined out of line.
  size_t set_cache_limit(size_t limit);
  size_t set_memory_limit(size_t limit);
  size_t get_memory_limit();
  size_t set_wired_limit(size_t limit);
  // Releases every cached buffer.
  void clear_cache();

 private:
  MTL::Device* device_;

  // The size of allocations which go on the heap until it is full. This size
  // is chosen because it is the actual minimum size of a buffer allocated from
  // the heap, a heap can have at most heap.size() / 256 buffers.
  static constexpr int small_size_ = 256;
  static constexpr int heap_size_ = 1 << 20;
  MTL::Heap* heap_;
  // Private construction/destruction: only allocator() creates the instance.
  MetalAllocator();
  ~MetalAllocator();
  friend MetalAllocator& allocator();

  // Caching allocator
  BufferCache<MTL::Buffer> buffer_cache_;

  ResidencySet residency_set_;

  // Allocation stats
  // NOTE(review): block_limit_ is not used in the visible part of the
  // implementation; presumably the limit behind get/set_memory_limit -- confirm.
  size_t block_limit_;
  // Threshold compared against active + cached + requested bytes in malloc()
  // to decide when cached buffers are reclaimed.
  size_t gc_limit_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
  // Cache size limit: free() recycles into the cache only while the cache is
  // below this value, and malloc() trims the cache back down to it.
  size_t max_pool_size_;
  size_t wired_limit_{0};
  // Number of live buffers tracked by the allocator and its upper bound;
  // malloc() throws once num_resources_ reaches resource_limit_.
  size_t num_resources_{0};
  size_t resource_limit_{0};

  // Guards the cache, the residency set and the counters above in the
  // out-of-line member functions.
  std::mutex mutex_;
};

MetalAllocator& allocator();

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/binary.cpp
================================================
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/binary.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

// Defines `func::eval_gpu` for a single-output binary primitive by forwarding
// to binary_op_gpu with the primitive's name() as the op string.
#define BINARY_GPU(func)                                              \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    binary_op_gpu(inputs, out, name());                               \
  }

// Same as BINARY_GPU, but for primitives whose eval_gpu takes a vector of
// outputs.
#define BINARY_GPU_MULTI(func)                                         \
  void func::eval_gpu(                                                 \
      const std::vector<array>& inputs, std::vector<array>& outputs) { \
    binary_op_gpu(inputs, outputs, name());                            \
  }

namespace mlx::core {

// Builds the name of the binary kernel to launch.
//
// The name is "<variant>_<op><type>", where <variant> encodes the broadcast
// pattern:
//   - "ss", "sv", "vs", "vv" for the scalar/vector combinations; all but "ss"
//     get a "2" suffix when `large` is set, otherwise an "n" suffix when
//     `work_per_thread` is greater than one.
//   - "g<ndim>" for general inputs with up to three dimensions, or
//     "gn<work_per_thread>" beyond that, followed by "large" when `large` is
//     set.
// <type> is derived from `a` via type_to_name.
std::string get_kernel_name(
    BinaryOpType bopt,
    const char* op,
    const array& a,
    bool large,
    int ndim,
    int work_per_thread) {
  const bool is_general = (bopt == BinaryOpType::General);
  const bool is_scalar_scalar = (bopt == BinaryOpType::ScalarScalar);

  std::string kname;
  switch (bopt) {
    case BinaryOpType::ScalarScalar:
      kname = "ss";
      break;
    case BinaryOpType::ScalarVector:
      kname = "sv";
      break;
    case BinaryOpType::VectorScalar:
      kname = "vs";
      break;
    case BinaryOpType::VectorVector:
      kname = "vv";
      break;
    case BinaryOpType::General: {
      kname = "g";
      const bool fixed_rank = ndim <= 3;
      if (fixed_rank) {
        kname += std::to_string(ndim);
      } else {
        concatenate(kname, "n", std::to_string(work_per_thread));
      }
      if (large) {
        kname += "large";
      }
      break;
    }
  }

  // Suffix for the contiguous kernels that process vectors.
  if (!(is_general || is_scalar_scalar)) {
    const char* suffix = "";
    if (large) {
      suffix = "2";
    } else if (work_per_thread > 1) {
      suffix = "n";
    }
    kname += suffix;
  }

  concatenate(kname, "_", op, type_to_name(a));
  return kname;
}

// Encodes a binary kernel for `inputs[0]` and `inputs[1]` on stream `s`,
// writing into `outputs`. The outputs are bound as-is: no output data is set
// up here (the binary_op_gpu overloads do that before calling this function).
//
// `op` is the operation name used to build the kernel name. When `outputs`
// holds exactly two arrays the two-output kernel is used and both are bound;
// otherwise only outputs[0] is bound. Nothing is encoded for an empty output.
//
// Throws std::runtime_error for general (strided) inputs when the kernel's
// maximum threadgroup size is not 1024.
void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const char* op,
    const Stream& s) {
  auto& a = inputs[0];
  auto& b = inputs[1];
  auto bopt = get_binary_op_type(a, b);

  auto& out = outputs[0];
  if (out.size() == 0) {
    return;
  }

  // Try to collapse contiguous dims
  // Only the General case needs shape/strides; the other cases get empty ones.
  auto maybe_collapse = [bopt, &a, &b, &out]() {
    if (bopt == BinaryOpType::General) {
      auto [shape, strides] = collapse_contiguous_dims(a, b, out);
      return std::make_tuple(shape, strides[0], strides[1], strides[2]);
    } else {
      decltype(a.strides()) e{};
      return std::make_tuple(decltype(a.shape()){}, e, e, e);
    }
  };
  // Note: strides_out is not used below.
  auto [shape, strides_a, strides_b, strides_out] = maybe_collapse();

  // `large` selects the kernel variant for big arrays; the threshold differs
  // between the general and the contiguous paths.
  bool large;
  auto ndim = shape.size();
  int work_per_thread;
  if (bopt == BinaryOpType::General) {
    large = a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
        out.size() > INT32_MAX;
    work_per_thread = large ? 4 : 2;
  } else {
    large = out.data_size() > UINT32_MAX;
    work_per_thread = get_work_per_thread(a.dtype(), out.data_size());
  }
  std::string kernel_name =
      get_kernel_name(bopt, op, a, large, shape.size(), work_per_thread);
  auto& d = metal::device(s.device);

  auto kernel = outputs.size() == 2
      ? get_binary_two_kernel(d, kernel_name, a.dtype(), out.dtype(), op)
      : get_binary_kernel(d, kernel_name, a.dtype(), out.dtype(), op);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Buffer slots are assigned in order: a, b, out[, second out], then the
  // kernel-specific arguments below.
  int arg_idx = 0;
  compute_encoder.set_input_array(a, arg_idx++);
  compute_encoder.set_input_array(b, arg_idx++);
  compute_encoder.set_output_array(outputs[0], arg_idx++);
  if (outputs.size() == 2) {
    compute_encoder.set_output_array(outputs[1], arg_idx++);
  }

  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (bopt == BinaryOpType::General) {
    // Launch up to 3D grid of threads
    // dim0/dim1 are the two innermost collapsed dims; `rest` folds all
    // remaining dims into the third grid dimension.
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
    size_t rest = out.size() / (dim0 * dim1);

    if (ndim > 3) {
      compute_encoder.set_vector_bytes(shape, arg_idx++);
      compute_encoder.set_vector_bytes(strides_a, arg_idx++);
      compute_encoder.set_vector_bytes(strides_b, arg_idx++);
      compute_encoder.set_bytes<int>(ndim, arg_idx++);
      // Each thread handles work_per_thread elements along the innermost dim.
      dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    } else {
      // The shape is implicit in the grid for <= 3D
      compute_encoder.set_vector_bytes(strides_a, arg_idx++);
      compute_encoder.set_vector_bytes(strides_b, arg_idx++);
    }

    if (thread_group_size != 1024) {
      throw std::runtime_error("[Metal::binary] Must use 1024 sized block");
    }
    auto group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    // Launch a 1D or 2D grid of threads
    size_t nthreads = ceildiv(out.data_size(), work_per_thread);
    if (thread_group_size > nthreads) {
      thread_group_size = nthreads;
    }

    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
    MTL::Size grid_dims;
    // The element count is passed as int64_t to the large kernels and as int
    // otherwise.
    if (large) {
      compute_encoder.set_bytes<int64_t>(out.data_size(), arg_idx++);
      grid_dims = get_2d_grid_dims(out.shape(), out.strides(), work_per_thread);
    } else {
      compute_encoder.set_bytes<int>(out.data_size(), arg_idx++);
      grid_dims = MTL::Size(nthreads, 1, 1);
    }
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

void binary_op_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const char* op,
    const Stream& s) {
  assert(inputs.size() == 2);
  auto& a = inputs[0];
  auto& b = inputs[1];
  auto bopt = get_binary_op_type(a, b);
  set_binary_op_output_data(a, b, outputs[0], bopt);
  set_binary_op_output_data(a, b, outputs[1], bopt);
  binary_op_gpu_inplace(inputs, outputs, op, s);
}

// Multi-output convenience overload: runs on the stream of the primitive
// attached to outputs[0].
void binary_op_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const char* op) {
  binary_op_gpu(inputs, outputs, op, outputs[0].primitive().stream());
}

// Single-output convenience overload: wraps `out` in a one-element vector and
// defers to the multi-output in-place implementation.
void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  std::vector<array> single_output{out};
  binary_op_gpu_inplace(inputs, single_output, op, s);
}

// Sets up the output data for `out` from the two inputs and then encodes the
// binary kernel on stream `s`. `inputs` must hold exactly two arrays.
void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  assert(inputs.size() == 2);
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  const auto op_type = get_binary_op_type(lhs, rhs);
  set_binary_op_output_data(lhs, rhs, out, op_type);
  binary_op_gpu_inplace(inputs, out, op, s);
}

// Single-output convenience overload: runs on the stream of the primitive
// attached to `out`.
void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op) {
  binary_op_gpu(inputs, out, op, out.primitive().stream());
}

// eval_gpu definitions for the binary primitives. Each expands to a function
// that forwards to binary_op_gpu with the primitive's name(); DivMod uses the
// multi-output variant.
BINARY_GPU(Add)
BINARY_GPU(ArcTan2)
BINARY_GPU(Divide)
BINARY_GPU_MULTI(DivMod)
BINARY_GPU(Remainder)
BINARY_GPU(Equal)
BINARY_GPU(Greater)
BINARY_GPU(GreaterEqual)
BINARY_GPU(Less)
BINARY_GPU(LessEqual)
BINARY_GPU(LogicalAnd)
BINARY_GPU(LogicalOr)
BINARY_GPU(LogAddExp)
BINARY_GPU(Maximum)
BINARY_GPU(Minimum)
BINARY_GPU(Multiply)
BINARY_GPU(NotEqual)
BINARY_GPU(Power)
BINARY_GPU(Subtract)

// GPU evaluation for the bitwise binary primitive. Every listed operation is
// dispatched the same way; the op string handed to binary_op_gpu comes from
// name().
void BitwiseBinary::eval_gpu(const std::vector<array>& inputs, array& out) {
  switch (op_) {
    case BitwiseBinary::And:
    case BitwiseBinary::Or:
    case BitwiseBinary::Xor:
    case BitwiseBinary::LeftShift:
    case BitwiseBinary::RightShift:
      binary_op_gpu(inputs, out, name());
      break;
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/binary.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/array.h"

namespace mlx::core {

// Binary operation on the GPU with multiple outputs: sets up the output data
// and encodes the kernel named after `op` on stream `s`. `inputs` holds the
// two operands.
void binary_op_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const char* op,
    const Stream& s);

// Single-output variant of the above.
void binary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s);

// Encodes the kernel only; the output arrays are bound as they are, without
// setting up their data first.
void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const char* op,
    const Stream& s);

// Single-output variant of the in-place version.
void binary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s);

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/compiled.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <fmt/format.h>
#include <sstream>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/graph_utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

// Appends the Metal source of one fused element-wise kernel to `os`.
//
// Parameters:
//   os              - string the kernel source is appended to.
//   kernel_name     - name of the generated kernel (also its host_name).
//   inputs          - inputs of the compiled graph; entries for which
//                     `is_constant(i)` is true are inlined as literals instead
//                     of being passed as buffers.
//   outputs         - arrays written by the kernel.
//   tape            - intermediate arrays in evaluation order; each becomes a
//                     `tmp_<name>` local computed from its primitive.
//   is_constant     - predicate over input indices, see `inputs`.
//   contiguous      - generate the flat-index variant (no strides).
//   ndim            - static number of dimensions for the strided variants.
//   dynamic_dims    - read the number of dimensions from an `ndim` kernel
//                     argument instead of baking it in.
//   use_big_index   - use int64_t instead of uint for element indices.
//   work_per_thread - number of elements each thread processes (N_).
//
// Kernel buffer arguments are emitted in this order: non-constant inputs,
// `in_strides` (only if some non-scalar input is read with strides), outputs,
// `output_shape` (strided) or `size` (contiguous), and `ndim` (dynamic only).
//
// Throws std::runtime_error when more than 31 buffer arguments are needed.
// Note that the source has already been appended to `os` at that point.
inline void build_kernel(
    std::string& os,
    const std::string& kernel_name,
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
    const std::function<bool(size_t)>& is_constant,
    bool contiguous,
    int ndim,
    bool dynamic_dims,
    bool use_big_index = false,
    int work_per_thread = 1) {
  NodeNamer namer;
  bool add_indices = false;
  int cnt = 0;

  // Start the kernel
  os += fmt::format(
      "[[host_name(\"{0}\")]]\n[[kernel]] void {0}(\n", kernel_name);

  // Add the input arguments
  for (size_t i = 0; i < inputs.size(); ++i) {
    // Skip constants from the input list
    if (is_constant(i)) {
      continue;
    }

    const auto& x = inputs[i];
    auto& xname = namer.get_name(x);

    // Scalars and contiguous need no strides
    if (!is_scalar(x) && !contiguous) {
      add_indices = true;
    }
    os += fmt::format(
        "    device const {0}* {1} [[buffer({2})]],\n",
        get_type_string(x.dtype()),
        xname,
        cnt++);
  }

  std::string idx_type = use_big_index ? "int64_t" : "uint";
  if (add_indices) {
    os += fmt::format(
        "    constant const int64_t* in_strides [[buffer({0})]],\n", cnt++);
  }

  // Add the output arguments
  for (auto& x : outputs) {
    os += fmt::format(
        "    device {0}* {1} [[buffer({2})]],\n",
        get_type_string(x.dtype()),
        namer.get_name(x),
        cnt++);
  }
  // Strided kernels need the output shape to extract the indices; contiguous
  // kernels only need the total element count.
  if (!contiguous) {
    os += fmt::format(
        "    constant const int* output_shape [[buffer({0})]],\n", cnt++);
  } else {
    os += fmt::format(
        "    constant const {0}& size [[buffer({1})]],\n", idx_type, cnt++);
  }
  if (dynamic_dims) {
    os += fmt::format("    constant const int& ndim [[buffer({0})]],\n", cnt++);
  }

  // The thread index in the whole grid
  os += "    uint3 pos [[thread_position_in_grid]],\n";
  os += "    uint3 grid [[threads_per_grid]]) {\n";

  // Compute the output index of the first element handled by this thread
  os += fmt::format("  constexpr int N_ = {0};\n", work_per_thread);
  if (contiguous && use_big_index) {
    // This is only used for contiguous kernels which don't have
    // a third grid dimension
    os += "  int64_t index = N_ * (pos.x + grid.x * int64_t(pos.y));\n";
  } else if (contiguous) {
    os += "  uint index = N_ * pos.x;\n";
  } else if (work_per_thread > 1) {
    os += fmt::format(
        "  int xshape = output_shape[{0}];\n",
        dynamic_dims ? "ndim - 1" : std::to_string(ndim - 1));
    os += fmt::format(
        "  {0} index = N_ * pos.x + xshape * (pos.y + {0}(grid.y) * pos.z);\n",
        idx_type);
  } else {
    os += fmt::format(
        "  {0} index = pos.x + grid.x * (pos.y + {0}(grid.y) * pos.z);\n",
        idx_type);
  }
  // Open the per-thread loop for contiguous kernels (bounded by `size`)
  if (work_per_thread > 1 && contiguous) {
    os += "  for (int i = 0; i < N_ && index < size; ++i) {\n";
  }

  // Read constant / contiguous inputs in tmps
  std::vector<array> nc_inputs;
  for (int i = 0; i < inputs.size(); ++i) {
    auto& x = inputs[i];
    auto& xname = namer.get_name(x);

    if (is_constant(i)) {
      // Constants are printed into the source as a literal of their own type
      auto type_str = get_type_string(x.dtype());
      std::ostringstream ss;
      print_constant(ss, x);
      os += fmt::format(
          "  auto tmp_{0} = static_cast<{1}>({2});\n",
          xname,
          type_str,
          ss.str());
    } else if (is_scalar(x)) {
      os += fmt::format(
          "  {0} tmp_{1} = {1}[0];\n", get_type_string(x.dtype()), xname);
    } else if (contiguous) {
      os += fmt::format(
          "  {0} tmp_{1} = {1}[index];\n", get_type_string(x.dtype()), xname);
    } else {
      // Strided inputs are read later through their own index variable
      nc_inputs.push_back(x);
    }
  }

  // Initialize the indices for non-contiguous inputs
  for (int i = 0; i < nc_inputs.size(); ++i) {
    auto& xname = namer.get_name(nc_inputs[i]);
    os += fmt::format("  {0} index_{1} = ", idx_type, xname);
    if (ndim == 1) {
      int offset = i * ndim;
      os +=
          fmt::format("elem_to_loc_1<uint>(pos.x, in_strides[{0}]);\n", offset);
    } else if (ndim == 2) {
      int offset = i * ndim;
      os += fmt::format(
          "elem_to_loc_2<{0}>({{pos.x, pos.y}}, in_strides + {1});\n",
          idx_type,
          offset);
    } else if (ndim == 3) {
      int offset = i * ndim;
      os += fmt::format(
          "elem_to_loc_3<{0}>(pos, in_strides + {1});\n", idx_type, offset);
    } else if (!dynamic_dims) {
      // Only the two innermost dims are handled here; the remaining dims are
      // added by the loop over pos.z emitted below.
      int offset = (i + 1) * ndim;
      os += fmt::format(
          "N_ * pos.x * {0}(in_strides[{1}]) + pos.y * {0}(in_strides[{2}]);\n",
          idx_type,
          offset - 1,
          offset - 2);
    } else {
      os += fmt::format(
          "N_ * pos.x * {0}(in_strides[ndim * {1} + ndim - 1]) + pos.y * {0}(in_strides[ndim * {1} + ndim - 2]);\n",
          idx_type,
          i);
    }
  }

  // For more than 3 dims, unravel pos.z over the leading dimensions and
  // accumulate the corresponding strides into each input index
  if (!nc_inputs.empty() && (ndim > 3 || dynamic_dims)) {
    os += "  uint zpos = pos.z;\n";
    if (dynamic_dims) {
      os += "  for (int d = ndim - 3; d >= 0; --d) {\n";
    } else {
      os += fmt::format("  for (int d = {0}; d >= 0; --d) {{\n", ndim - 3);
    }
    os += "    uint l = zpos % output_shape[d];\n";
    for (int i = 0; i < nc_inputs.size(); ++i) {
      auto& xname = namer.get_name(nc_inputs[i]);
      os += fmt::format("    index_{0} += ", xname);
      if (dynamic_dims) {
        os +=
            fmt::format("l * {0}(in_strides[{1} * ndim + d]);\n", idx_type, i);
      } else {
        os +=
            fmt::format("l * {0}(in_strides[{1} + d]);\n", idx_type, i * ndim);
      }
    }
    os += "    zpos /= output_shape[d];\n  }\n";
  }

  // Open per-thread loop
  if (work_per_thread > 1 && !contiguous) {
    os +=
        "  for (int i = 0; i < N_ && (int(N_ * pos.x) + i) < xshape; ++i) {\n";
  }

  // Read non-contiguous inputs into tmps
  for (int i = 0; i < nc_inputs.size(); ++i) {
    auto& x = nc_inputs[i];
    auto& xname = namer.get_name(x);
    os += fmt::format(
        "  {0} tmp_{1} = {1}[index_{1}];\n", get_type_string(x.dtype()), xname);
  }

  // Actually write the computation
  for (auto& x : tape) {
    os += fmt::format(
        "  {0} tmp_{1} = ", get_type_string(x.dtype()), namer.get_name(x));
    if (is_static_cast(x.primitive())) {
      os += fmt::format(
          "static_cast<{0}>(tmp_{1});\n",
          get_type_string(x.dtype()),
          namer.get_name(x.inputs()[0]));
    } else {
      // Emit `<PrimitiveName>()(tmp_a, tmp_b, ...)`
      os += x.primitive().name();
      os += "()(";
      for (int i = 0; i < x.inputs().size() - 1; i++) {
        os += fmt::format("tmp_{0}, ", namer.get_name(x.inputs()[i]));
      }
      os += fmt::format("tmp_{0});\n", namer.get_name(x.inputs().back()));
    }
  }

  // Write the outputs from tmps
  for (auto& x : outputs) {
    os += fmt::format("  {0}[index] = tmp_{0};\n", namer.get_name(x));
  }
  // Increment indices and close per thread loop
  if (work_per_thread > 1) {
    for (int i = 0; i < nc_inputs.size(); ++i) {
      auto& x = nc_inputs[i];
      auto& xname = namer.get_name(x);
      if (!dynamic_dims) {
        os += fmt::format(
            "  index_{0} += in_strides[{1}];\n", xname, i * ndim + ndim - 1);
      } else {
        os += fmt::format(
            "  index_{0} += in_strides[{1} * ndim + ndim - 1];\n", xname, i);
      }
    }
    os += "  index++;\n  }\n";
  }

  // Finish the kernel
  os += "}\n";

  // `cnt` is the number of buffer arguments emitted above
  if (cnt > 31) {
    std::ostringstream msg;
    msg << "[compile] Too many inputs/outputs fused in the Metal Compiled "
        << "primitive which exhausted the available argument buffers for "
        << "the kernel. Please file an issue with the function that results "
        << "in this error. The name of the kernel is '" << kernel_name << "'";
    throw std::runtime_error(msg.str());
  }
}

// Evaluates a compiled (fused) primitive on the GPU.
//
// Steps:
//   1. Obtain the Metal library for `kernel_lib_`; the lambda passed to
//      get_library generates the source for every kernel variant
//      (contiguous, strided with 1-7 dims, dynamic dims, and their "_large"
//      counterparts).
//   2. Collapse contiguous dims, decide whether large indices are needed, and
//      build the name of the variant to launch.
//   3. Bind inputs, strides, outputs and shape/size in the same order in which
//      build_kernel declares the kernel arguments.
//   4. Dispatch a 1D/2D grid (contiguous) or a 3D grid (strided).
//
// Throws std::runtime_error on the strided path when the kernel's maximum
// threadgroup size is 512 or less.
void Compiled::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  // Get the kernel if someone else built it already
  auto& s = stream();
  auto& d = metal::device(s.device);
  auto lib = d.get_library(kernel_lib_, [&]() {
    int work_per_thread = get_work_per_thread(outputs_[0].dtype());
    std::string kernel = metal::utils();
    concatenate(
        kernel, metal::unary_ops(), metal::binary_ops(), metal::ternary_ops());
    // Contiguous, one element per thread
    build_kernel(
        kernel,
        kernel_lib_ + "_contiguous",
        inputs_,
        outputs_,
        tape_,
        is_constant_,
        /* contiguous = */ true,
        /* ndim = */ 0,
        /* dynamic_dims = */ false,
        /* use_big_index = */ false,
        /* work_per_thread = */ 1);
    // Contiguous, several elements per thread (only when that is beneficial
    // for the output dtype)
    if (work_per_thread > 1) {
      build_kernel(
          kernel,
          kernel_lib_ + "_contiguous_n",
          inputs_,
          outputs_,
          tape_,
          is_constant_,
          /* contiguous = */ true,
          /* ndim = */ 0,
          /* dynamic_dims = */ false,
          /* use_big_index = */ false,
          /* work_per_thread = */ work_per_thread);
    }
    // Contiguous with 64-bit indexing
    build_kernel(
        kernel,
        kernel_lib_ + "_contiguous_large",
        inputs_,
        outputs_,
        tape_,
        is_constant_,
        /* contiguous = */ true,
        /* ndim = */ 0,
        /* dynamic_dims = */ false,
        /* use_big_index = */ true,
        /* work_per_thread = */ work_per_thread);
    // Strided kernels specialized for 1 to 7 dims; the large variants are
    // only generated for 2 or more dims
    for (int i = 1; i < 8; i++) {
      build_kernel(
          kernel,
          kernel_lib_ + "_strided_" + std::to_string(i),
          inputs_,
          outputs_,
          tape_,
          is_constant_,
          /* contiguous = */ false,
          /* ndim = */ i,
          /* dynamic_dims = */ false,
          /* use_big_index = */ false,
          /* work_per_thread = */ i > 3 ? 2 : 1);
      if (i > 1) {
        build_kernel(
            kernel,
            kernel_lib_ + "_strided_" + std::to_string(i) + "_large",
            inputs_,
            outputs_,
            tape_,
            is_constant_,
            /* contiguous = */ false,
            /* ndim = */ i,
            /* dynamic_dims = */ false,
            /* use_big_index = */ true,
            /* work_per_thread = */ i > 3 ? 4 : 1);
      }
    }
    // Strided kernels that take the number of dims at runtime
    build_kernel(
        kernel,
        kernel_lib_ + "_strided_dynamic",
        inputs_,
        outputs_,
        tape_,
        is_constant_,
        /* contiguous = */ false,
        /* ndim = */ 0,
        /* dynamic_dims = */ true,
        /* use_big_index = */ false,
        /* work_per_thread = */ 2);
    build_kernel(
        kernel,
        kernel_lib_ + "_strided_dynamic_large",
        inputs_,
        outputs_,
        tape_,
        is_constant_,
        /* contiguous = */ false,
        /* ndim = */ 0,
        /* dynamic_dims = */ true,
        /* use_big_index = */ true,
        /* work_per_thread = */ 4);
    return kernel;
  });

  // Collapse contiguous dims to route to a faster kernel if possible. Also
  // handle all broadcasting.
  auto [contiguous, shape, strides] =
      compiled_collapse_contiguous_dims(inputs, outputs[0], is_constant_);

  // Whether to use large index.
  bool large = compiled_use_large_index(inputs, outputs, contiguous);

  // Get the kernel from the lib
  // The name built here must match one of the names generated above; 8 or
  // more collapsed dims fall back to the dynamic kernels.
  int ndim = shape.size();
  bool dynamic = ndim >= 8;
  auto kernel_name = kernel_lib_ + (contiguous ? "_contiguous" : "_strided_");
  int work_per_thread = 1;
  if (!contiguous) {
    if (dynamic) {
      kernel_name += "dynamic";
    } else {
      kernel_name += std::to_string(shape.size());
    }
    // Mirrors the work_per_thread values used when generating the strided
    // kernels.
    work_per_thread = ndim > 3 ? (large ? 4 : 2) : 1;
  } else {
    work_per_thread =
        get_work_per_thread(outputs[0].dtype(), outputs[0].data_size());
    if (work_per_thread > 1 && !large) {
      kernel_name += "_n";
    }
  }
  if (large) {
    kernel_name += "_large";
  }
  auto kernel = d.get_kernel(kernel_name, lib);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Put the inputs in
  // Constants take no buffer slot; strides of all non-scalar inputs are
  // concatenated into one flat vector for the strided kernels.
  int cnt = 0;
  int stride_idx = 1; // idx 0 is the output strides
  Strides in_strides;
  for (int i = 0; i < inputs.size(); i++) {
    if (is_constant_(i)) {
      continue;
    }
    auto& x = inputs[i];
    compute_encoder.set_input_array(x, cnt++);
    if (!contiguous && !is_scalar(x)) {
      in_strides.insert(
          in_strides.end(),
          strides[stride_idx].begin(),
          strides[stride_idx].end());
      stride_idx++;
    }
  }
  if (!in_strides.empty()) {
    compute_encoder.set_vector_bytes(in_strides, cnt++);
  }

  compiled_allocate_outputs(inputs, outputs, is_constant_, contiguous);

  // Put the outputs in
  for (auto& x : outputs) {
    compute_encoder.set_output_array(x, cnt++);
  }

  // Put the output shape (strided) or the element count (contiguous) in
  if (!contiguous) {
    compute_encoder.set_vector_bytes(shape, cnt++);
  } else {
    auto size = outputs[0].data_size();
    if (large) {
      compute_encoder.set_bytes<int64_t>(size, cnt++);
    } else {
      compute_encoder.set_bytes<int>(size, cnt++);
    }
  }

  // Put the number of dims in if it is dynamic
  if (dynamic) {
    compute_encoder.set_bytes(ndim, cnt++);
  }

  // Launch the kernel
  if (contiguous) {
    // One thread per work_per_thread elements; the large variant uses a 2D
    // grid.
    size_t nthreads = ceildiv(outputs[0].data_size(), work_per_thread);
    MTL::Size group_dims(
        std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);
    MTL::Size grid_dims = large
        ? get_2d_grid_dims(
              outputs[0].shape(), outputs[0].strides(), work_per_thread)
        : MTL::Size(nthreads, 1, 1);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    // 3D grid: the two innermost collapsed dims plus everything else folded
    // into the third dimension. The innermost dim is divided by
    // work_per_thread (rounded up).
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
    size_t rest = outputs[0].size() / (dim0 * dim1);
    dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
    // Exponent handed to get_block_dims: 10 for a threadgroup size of exactly
    // 1024, 9 for any other size above 512.
    int pow2;
    if (thread_group_size == 1024) {
      pow2 = 10;
    } else if (thread_group_size > 512) {
      pow2 = 9;
    } else {
      throw std::runtime_error("[Metal::compiled] Must use > 512 sized block");
    }
    auto group_dims = get_block_dims(dim0, dim1, rest, pow2);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/conv.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
#include <numeric>

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/slicing.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/matmul.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

using namespace mlx::steel;

namespace mlx::core {

namespace {

// Returns `x` unchanged when it is already row contiguous. Otherwise makes a
// contiguous copy on stream `s`, registers the copy as a temporary on the
// device for that stream, and returns it.
inline array
ensure_row_contiguous(const array& x, metal::Device& d, const Stream& s) {
  if (!x.flags().row_contiguous) {
    auto x_copy = contiguous_copy_gpu(x, s);
    d.add_temporary(x_copy, s.index);
    return x_copy;
  }
  return x;
}

template <int N>
void explicit_gemm_conv_ND_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<N>& conv_params) {
  // Get gemm shapes
  int implicit_M = out.size() / conv_params.O;
  int implicit_K = wt.size() / conv_params.O;
  int implicit_N = conv_params.O;
  // Prepare unfolding array
  Shape unfolded_shape{implicit_M, implicit_K};
  array in_unfolded(unfolded_shape, in.dtype(), nullptr, {});

  in_unfolded.set_data(allocator::malloc(in_unfolded.nbytes()));

  // Prepare unfolding kernel
  std::string kname;
  kname.reserve(32);
  concatenate(kname, "naive_unfold_nd_", type_to_name(in_unfolded), "_", N);
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname);
  compute_encoder.set_compute_pipeline_state(kernel);

  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(in_unfolded, 1);

  compute_encoder.set_bytes(conv_params, 2);

  // Launch unfolding kernel
  size_t tgp_x = std::min(conv_params.C, 64);
  tgp_x = 32 * ((tgp_x + 32 - 1) / 32);
  size_t tgp_y = 256 / tgp_x;

  MTL::Size grid_dims = MTL::Size(
      conv_params.C, unfolded_shape[1] / conv_params.C, unfolded_shape[0]);
  MTL::Size group_dims = MTL::Size(
      std::min(tgp_x, grid_dims.width), std::min(tgp_y, grid_dims.height), 1);

  compute_encoder.dispatch_threads(grid_dims, group_dims);

  // Reshape weight
  Shape wt_reshape{implicit_K, implicit_N};
  Strides wt_restride{1, implicit_K};
  array wt_reshaped(wt_reshape, wt.dtype(), nullptr, {});
  auto wt_flags = wt.flags();
  wt_flags.row_contiguous = false;
  wt_flags.col_contiguous = true;
  wt_reshaped.copy_shared_buffer(wt, wt_restride, wt_flags, wt.data_size());

  // Perform gemm
  std::vector<array> copies = {in_unfolded};
  return steel_matmul(
      s,
      d,
      /*a = */ in_unfolded,
      /*b = */ wt_reshaped,
      /*c = */ out,
      /*M = */ implicit_M,
      /*N = */ implicit_N,
      /*K = */ implicit_K,
      /*batch_size_out = */ 1,
      /*a_cols = */ implicit_K,
      /*b_cols = */ implicit_K,
      /*a_transposed = */ false,
      /*b_transposed = */ true,
      /*copies = */ copies);
}

// Grouped N-dimensional convolution implemented as unfold + batched GEMM.
//
// Steps, in encoding order:
//   1. Launch "naive_unfold_transpose_nd_<type>_<N>" to fill a freshly
//      allocated (M, K * groups) buffer from `in`.
//   2. View the weights as (wt.shape(0), C_per_group, kernel_size) with the
//      two inner strides swapped and materialize that view contiguously.
//   3. Call steel_matmul_regular with one batch entry per group, writing
//      into `out`.
//
// The unfolded input and the rearranged weights are passed to the matmul
// through its `copies` argument.
template <int N>
void explicit_gemm_conv_group_ND_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<N>& conv_params) {
  const int groups = conv_params.groups;
  const int C_per_group = conv_params.C / groups;
  const int O_per_group = conv_params.O / groups;

  // Per-group GEMM extents
  const int gemm_M = out.size() / conv_params.O;
  const int gemm_K = wt.size() / conv_params.O;
  const int gemm_N = O_per_group;

  // Product of the filter's spatial extents
  int kernel_size = 1;
  for (int dim = 0; dim != N; ++dim) {
    kernel_size *= conv_params.wS[dim];
  }

  // Destination buffer of the unfold kernel
  Shape unfold_dims{gemm_M, gemm_K * groups};
  array in_unfolded(unfold_dims, in.dtype(), nullptr, {});
  in_unfolded.set_data(allocator::malloc(in_unfolded.nbytes()));

  // Look up and bind the unfold kernel
  std::string unfold_kernel_name;
  unfold_kernel_name.reserve(32);
  concatenate(
      unfold_kernel_name,
      "naive_unfold_transpose_nd_",
      type_to_name(in_unfolded),
      "_",
      N);
  auto& enc = d.get_command_encoder(s.index);
  auto unfold_kernel = d.get_kernel(unfold_kernel_name);
  enc.set_compute_pipeline_state(unfold_kernel);

  enc.set_input_array(in, 0);
  enc.set_output_array(in_unfolded, 1);
  enc.set_bytes(conv_params, 2);

  // Threadgroup width: min(C, 64) rounded up to a multiple of 32; the height
  // is whatever fits in a budget of 256 threads.
  const size_t c_capped = std::min(conv_params.C, 64);
  const size_t threads_x = 32 * ((c_capped + 31) / 32);
  const size_t threads_y = 256 / threads_x;

  MTL::Size grid_dims = MTL::Size(
      conv_params.C, unfold_dims[1] / conv_params.C, unfold_dims[0]);
  MTL::Size group_dims = MTL::Size(
      std::min(threads_x, grid_dims.width),
      std::min(threads_y, grid_dims.height),
      1);

  enc.dispatch_threads(grid_dims, group_dims);

  // Strided view over the weights that swaps the channel and kernel-position
  // axes, so the copy below yields contiguous per-group chunks.
  array wt_view(
      {wt.shape(0), C_per_group, kernel_size}, wt.dtype(), nullptr, {});
  wt_view.copy_shared_buffer(
      wt, {wt.strides(0), 1, C_per_group}, wt.flags(), wt.size());

  // Materialize the view
  array wt_transpose = contiguous_copy_gpu(wt_view, s);

  // Leading dimensions and per-group (batch) strides for the matmul
  const int lda = gemm_K * groups;
  const int ldb = gemm_K;
  const int ldd = gemm_N * groups;
  const int64_t a_group_stride = int64_t(gemm_K);
  const int64_t b_group_stride = int64_t(gemm_N) * gemm_K;
  const int64_t out_group_stride = int64_t(gemm_N);

  std::vector<array> copies = {in_unfolded, wt_transpose};
  steel_matmul_regular(
      /* const Stream& s = */ s,
      /* Device& d = */ d,
      /* const array& a = */ in_unfolded,
      /* const array& b = */ wt_transpose,
      /* array& c = */ out,
      /* int M = */ gemm_M,
      /* int N = */ gemm_N,
      /* int K = */ gemm_K,
      /* int batch_size_out = */ groups,
      /* int lda = */ lda,
      /* int ldb = */ ldb,
      /* int ldd = */ ldd,
      /* bool transpose_a = */ false,
      /* bool transpose_b = */ true,
      /* std::vector<array>& copies = */ copies,
      /* Shape batch_shape = */ {1},
      /* Strides batch_strides = */ {0},
      /* int64_t A_batch_strides = */ a_group_stride,
      /* int64_t B_batch_strides = */ b_group_stride,
      /* int64_t matrix_stride_out = */ out_group_stride);
}

// Encodes a 2D convolution with an "implicit_gemm_conv_2d_*" kernel.
//
// The convolution is sized as one GEMM per group with
//   M = N * oS[0] * oS[1],  N = O / groups,  K = wS[0] * wS[1] * (C / groups).
// Block sizes (bm, bn, bk), the wm/wn values, the small-channel
// specialization and the small-filter flag are selected here and encoded in
// the kernel name. The threadgroup grid's z extent is the number of groups.
//
// Buffer slots: 0 = in, 1 = wt, 2 = out, 3 = conv_params, 4 = gemm_params.
// `out` is not allocated in this function.
void implicit_gemm_conv_2D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<2>& conv_params) {
  const int groups = conv_params.groups;
  const int C_per_group = conv_params.C / conv_params.groups;
  const int O_per_group = conv_params.O / conv_params.groups;

  // Deduce implicit gemm size
  const int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
  const int implicit_N = O_per_group;
  const int implicit_K = conv_params.wS[0] * conv_params.wS[1] * C_per_group;

  // Determine block and warp tiles
  int wm = 2, wn = 2;

  // bm is 64 only for large M with at least 64 channels per group; bn is 64
  // when bm is 64 or there are at least 64 output channels per group.
  int bm = implicit_M >= 8192 && C_per_group >= 64 ? 64 : 32;
  int bn = (bm == 64 || implicit_N >= 64) ? 64 : 32;
  int bk = 16;

  // Few output channels per group: use bn = 8 with wm = 4, wn = 1
  if (implicit_N <= 16) {
    bn = 8;
    wm = 4;
    wn = 1;
  }

  // Tile counts along N and M (ceil divisions)
  int tn = (implicit_N + bn - 1) / bn;
  int tm = (implicit_M + bm - 1) / bm;
  int swizzle_log = 0;

  // Fix small channel specialization
  int n_channel_specialization = 0;
  int channel_k_iters = ((C_per_group + bk - 1) / bk);
  int gemm_k_iters = conv_params.wS[0] * conv_params.wS[1] * channel_k_iters;

  // With at most 4 channels per group the K iteration count is derived from
  // the whole filter instead of per filter position: from implicit_K for
  // C_per_group <= 2, and from wS[0] * wS[1] * 4 for C_per_group of 3 or 4.
  if (C_per_group <= 2) {
    gemm_k_iters = (implicit_K + bk - 1) / bk;
    n_channel_specialization = C_per_group;
  } else if (C_per_group <= 4) {
    gemm_k_iters = ((conv_params.wS[0] * conv_params.wS[1] * 4) + bk - 1) / bk;
    n_channel_specialization = C_per_group;
  }

  // The small-filter variant is only selected when no channel specialization
  // is active and both filter extents are at most 16.
  bool small_filter = (!n_channel_specialization) &&
      (conv_params.wS[0] <= 16 && conv_params.wS[1] <= 16);

  // Fix host side helper params
  // sign is -1 when the filter is flipped. ijw / ijh are the input strides
  // along w / h scaled by the kernel dilation.
  int sign = (conv_params.flip ? -1 : 1);
  int ijw = conv_params.in_strides[2] * conv_params.kdil[1];
  int ijh = conv_params.in_strides[1] * conv_params.kdil[0];

  // NOTE(review): presumably the offsets the kernel adds to its input pointer
  // when advancing one filter column, wrapping to the next filter row, and
  // wrapping to the next block of bk channels -- confirm against the kernel.
  int inp_jump_w = sign * ijw;
  int inp_jump_h = sign * (ijh - (conv_params.wS[1] - 1) * ijw);
  int inp_jump_c = bk - sign * (conv_params.wS[0] - 1) * ijh -
      sign * (conv_params.wS[1] - 1) * ijw;

  // Build implicit gemm params
  ImplicitGemmConv2DParams gemm_params{
      /* const int M = */ implicit_M,
      /* const int N = */ implicit_N,
      /* const int K = */ implicit_K,

      /* const int gemm_k_iterations = */ gemm_k_iters,

      /* const int inp_jump_w = */ inp_jump_w,
      /* const int inp_jump_h = */ inp_jump_h,
      /* const int inp_jump_c = */ inp_jump_c,

      /* const int tiles_n = */ tn,
      /* const int tiles_m = */ tm,
      /* const int swizzle_log = */ swizzle_log};

  // Determine kernel
  // The channel field is the specialization count, or "l" when none applies;
  // the filter field is 's' (small) or 'l'.
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      "implicit_gemm_conv_2d_",
      type_to_name(out),
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn,
      "_channel_",
      n_channel_specialization ? std::to_string(n_channel_specialization) : "l",
      "_filter_",
      small_filter ? 's' : 'l');

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_conv_kernel(
      d,
      kname,
      out,
      bm,
      bn,
      bk,
      wm,
      wn,
      n_channel_specialization,
      small_filter);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Deduce grid launch dimensions
  // swizzle_log is 0 above, so tile == 1 and the grid is (tn, tm, groups).
  int tile = 1 << swizzle_log;
  size_t grid_dim_y = (tm + tile - 1) / tile;
  size_t grid_dim_x = tn * tile;

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, groups);

  // Encode arrays
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_input_array(wt, 1);
  compute_encoder.set_output_array(out, 2);

  // Encode params
  compute_encoder.set_bytes(conv_params, 3);
  compute_encoder.set_bytes(gemm_params, 4);

  // Launch kernel
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes a 2D convolution with an "implicit_gemm_conv_2d_general_*" kernel.
//
// Unlike implicit_gemm_conv_2D_gpu, this path takes the input dilation
// (idil) into account: it computes jump factors from idil, kdil and str,
// builds per-phase filter base/size tables, and launches one grid z-slice per
// (f_out_jump_h, f_out_jump_w) phase. conv_params.groups is not consulted
// here; the GEMM is sized with the full C and O.
//
// Buffer slots: 0 = in, 1 = wt, 2 = out, 3 = conv_params, 4 = gemm_params,
// 5 = jump_params, 6 = base_h, 7 = base_w.
// `out` is not allocated in this function.
void implicit_gemm_conv_2D_general_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<2>& conv_params) {
  // Deduce implicit gemm size
  int implicit_M = conv_params.N * conv_params.oS[0] * conv_params.oS[1];
  int implicit_N = conv_params.O;
  int implicit_K = conv_params.wS[0] * conv_params.wS[1] * conv_params.C;

  // Determine block and warp tiles
  int wm = 2, wn = 2;

  // Make jump params
  // f_wgt_jump_* = lcm(idil, kdil) / kdil and f_out_jump_* = lcm(idil, str) /
  // str, per spatial axis.
  // NOTE(review): presumably the filter-tap and output-position periods after
  // which the same alignment with the dilated input repeats -- confirm.
  int f_wgt_jump_h =
      std::lcm(conv_params.idil[0], conv_params.kdil[0]) / conv_params.kdil[0];
  int f_wgt_jump_w =
      std::lcm(conv_params.idil[1], conv_params.kdil[1]) / conv_params.kdil[1];

  int f_out_jump_h =
      std::lcm(conv_params.idil[0], conv_params.str[0]) / conv_params.str[0];
  int f_out_jump_w =
      std::lcm(conv_params.idil[1], conv_params.str[1]) / conv_params.str[1];

  // Output extents divided (rounding up) by the output jump factors, and the
  // resulting M used for tiling below.
  int adj_out_h = (conv_params.oS[0] + f_out_jump_h - 1) / f_out_jump_h;
  int adj_out_w = (conv_params.oS[1] + f_out_jump_w - 1) / f_out_jump_w;
  int adj_out_hw = adj_out_h * adj_out_w;
  int adj_implicit_m = conv_params.N * adj_out_hw;

  Conv2DGeneralJumpParams jump_params{
      /* const int f_wgt_jump_h = */ f_wgt_jump_h,
      /* const int f_wgt_jump_w = */ f_wgt_jump_w,

      /* const int f_out_jump_h = */ f_out_jump_h,
      /* const int f_out_jump_w = */ f_out_jump_w,

      /* const int adj_out_h = */ adj_out_h,
      /* const int adj_out_w = */ adj_out_w,
      /* const int adj_out_hw = */ adj_out_hw,
      /* const int adj_implicit_m = */ adj_implicit_m};

  // Make base info
  // One entry per output phase along each axis.
  std::vector<Conv2DGeneralBaseInfo> base_h(f_out_jump_h);
  std::vector<Conv2DGeneralBaseInfo> base_w(f_out_jump_w);

  // When the filter is flipped, taps are walked backwards starting from the
  // last (dilated) tap position.
  int jump_h = conv_params.flip ? -conv_params.kdil[0] : conv_params.kdil[0];
  int jump_w = conv_params.flip ? -conv_params.kdil[1] : conv_params.kdil[1];

  int init_h =
      (conv_params.flip ? (conv_params.wS[0] - 1) * conv_params.kdil[0] : 0);
  int init_w =
      (conv_params.flip ? (conv_params.wS[1] - 1) * conv_params.kdil[1] : 0);

  // For each h phase i: advance through the filter rows until the
  // corresponding input coordinate is a multiple of idil[0] (or the filter
  // is exhausted). wh_base is the index of that first row and wh_size the
  // number of rows from wh_base onward when stepping by f_wgt_jump_h.
  for (int i = 0; i < f_out_jump_h; ++i) {
    int ih_loop = i * conv_params.str[0] - conv_params.pad[0] + init_h;

    int wh_base = 0;
    while (wh_base < conv_params.wS[0] && ih_loop % conv_params.idil[0] != 0) {
      wh_base++;
      ih_loop += jump_h;
    }

    int wh_size =
        ((conv_params.wS[0] - wh_base) + f_wgt_jump_h - 1) / f_wgt_jump_h;
    base_h[i] = {wh_base, wh_size};
  }

  // Same computation along w.
  for (int j = 0; j < f_out_jump_w; ++j) {
    int iw_loop = j * conv_params.str[1] - conv_params.pad[1] + init_w;

    int ww_base = 0;
    while (ww_base < conv_params.wS[1] && iw_loop % conv_params.idil[1] != 0) {
      ww_base++;
      iw_loop += jump_w;
    }

    int ww_size =
        ((conv_params.wS[1] - ww_base) + f_wgt_jump_w - 1) / f_wgt_jump_w;
    base_w[j] = {ww_base, ww_size};
  }

  // Collect block sizes
  // Here bn is 64 only when bm is 64 AND there are at least 64 output
  // channels.
  int bm = adj_implicit_m >= 8192 && conv_params.C >= 64 ? 64 : 32;
  int bn = (bm == 64 && implicit_N >= 64) ? 64 : 32;
  int bk = 16;

  int tn = (implicit_N + bn - 1) / bn;
  int tm = (adj_implicit_m + bm - 1) / bm;
  int swizzle_log = 0;

  // Get channel iteration info
  // align_C records whether C is a multiple of bk; it is passed to the kernel
  // as a function constant below.
  int channel_k_iters = ((conv_params.C + bk - 1) / bk);
  int gemm_k_iters = channel_k_iters;
  bool align_C = conv_params.C % bk == 0;

  // Fix host side helper params
  int sign = (conv_params.flip ? -1 : 1);
  int ijw = conv_params.in_strides[2] * conv_params.kdil[1];
  int ijh = conv_params.in_strides[1] * conv_params.kdil[0];

  int inp_jump_w = sign * ijw;
  int inp_jump_h = sign * (ijh - (conv_params.wS[1] - 1) * ijw);
  int inp_jump_c = bk - sign * (conv_params.wS[0] - 1) * ijh -
      sign * (conv_params.wS[1] - 1) * ijw;

  // Build implicit gemm params
  // M is the unadjusted implicit_M, while tiles_m is derived from
  // adj_implicit_m.
  ImplicitGemmConv2DParams gemm_params{
      /* const int M = */ implicit_M,
      /* const int N = */ implicit_N,
      /* const int K = */ implicit_K,

      /* const int gemm_k_iterations = */ gemm_k_iters,

      /* const int inp_jump_w = */ inp_jump_w,
      /* const int inp_jump_h = */ inp_jump_h,
      /* const int inp_jump_c = */ inp_jump_c,

      /* const int tiles_n = */ tn,
      /* const int tiles_m = */ tm,
      /* const int swizzle_log = */ swizzle_log};

  // Determine kernel
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      "implicit_gemm_conv_2d_general_",
      type_to_name(out),
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn);
  // hash_name extends kname with the align_C value; align_C is supplied as
  // bool function constant 200.
  std::string hash_name;
  hash_name.reserve(64);
  concatenate(hash_name, kname, "_alC_", align_C);
  metal::MTLFCList func_consts = {
      {&align_C, MTL::DataType::DataTypeBool, 200},
  };

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_conv_general_kernel(
      d, kname, hash_name, func_consts, out, bm, bn, bk, wm, wn);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Deduce grid launch dimensions
  // One z-slice per (h phase, w phase) pair.
  int tile = 1 << swizzle_log;
  size_t grid_dim_y = (tm + tile - 1) / tile;
  size_t grid_dim_x = tn * tile;
  size_t grid_dim_z = f_out_jump_h * f_out_jump_w;

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, grid_dim_z);

  // Encode arrays
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_input_array(wt, 1);
  compute_encoder.set_output_array(out, 2);

  // Encode params
  compute_encoder.set_bytes(conv_params, 3);
  compute_encoder.set_bytes(gemm_params, 4);
  compute_encoder.set_bytes(jump_params, 5);

  compute_encoder.set_vector_bytes(base_h, 6);
  compute_encoder.set_vector_bytes(base_w, 7);

  // Launch kernel
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes a 3D convolution with an "implicit_gemm_conv_3d_*" kernel.
//
// The convolution is sized as one GEMM per group with
//   M = N * oS[0] * oS[1] * oS[2],  N = O / groups,
//   K = wS[0] * wS[1] * wS[2] * (C / groups).
// Block sizes follow the same rules as the 2D implicit GEMM path, but there
// is no small-channel specialization here. The grid's z extent is the number
// of groups.
//
// Buffer slots: 0 = in, 1 = wt, 2 = out, 3 = conv_params, 4 = gemm_params.
// `out` is not allocated in this function.
void implicit_gemm_conv_3D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<3>& conv_params) {
  const int groups = conv_params.groups;
  const int C_per_group = conv_params.C / conv_params.groups;
  const int O_per_group = conv_params.O / conv_params.groups;

  // Deduce implicit gemm size
  const int implicit_M =
      conv_params.N * conv_params.oS[0] * conv_params.oS[1] * conv_params.oS[2];
  const int implicit_N = O_per_group;
  const int implicit_K =
      conv_params.wS[0] * conv_params.wS[1] * conv_params.wS[2] * C_per_group;

  // Determine block and warp tiles
  int wm = 2, wn = 2;

  int bm = implicit_M >= 8192 && C_per_group >= 64 ? 64 : 32;
  int bn = (bm == 64 || implicit_N >= 64) ? 64 : 32;
  int bk = 16;

  // Few output channels per group: use bn = 8 with wm = 4, wn = 1
  if (implicit_N <= 16) {
    bn = 8;
    wm = 4;
    wn = 1;
  }

  // Tile counts along N and M (ceil divisions)
  int tn = (implicit_N + bn - 1) / bn;
  int tm = (implicit_M + bm - 1) / bm;
  int swizzle_log = 0;

  // Small-filter variant: every filter extent is at most 16
  bool small_filter =
      (conv_params.wS[0] <= 16 && conv_params.wS[1] <= 16 &&
       conv_params.wS[2] <= 16);

  // K iterations: one pass over the channel blocks per filter position
  int channel_k_iters = ((C_per_group + bk - 1) / bk);
  int gemm_k_iters = conv_params.wS[0] * conv_params.wS[1] * conv_params.wS[2] *
      channel_k_iters;

  // Fix host side helper params
  // sign is -1 when the filter is flipped. ijw / ijh / ijd are the input
  // strides along w / h / d scaled by the kernel dilation.
  int sign = (conv_params.flip ? -1 : 1);
  int ijw = conv_params.in_strides[3] * conv_params.kdil[2];
  int ijh = conv_params.in_strides[2] * conv_params.kdil[1];
  int ijd = conv_params.in_strides[1] * conv_params.kdil[0];

  // NOTE(review): presumably the input-pointer increments used by the kernel
  // when stepping across filter w, wrapping to the next h, wrapping to the
  // next d, and wrapping to the next block of bk channels -- confirm against
  // the kernel.
  int inp_jump_w = sign * ijw;
  int inp_jump_h = sign * (ijh - (conv_params.wS[2] - 1) * ijw);
  int inp_jump_d = sign *
      (ijd - (conv_params.wS[1] - 1) * ijh - (conv_params.wS[2] - 1) * ijw);
  int inp_jump_c = bk - sign * (conv_params.wS[0] - 1) * ijd -
      sign * (conv_params.wS[1] - 1) * ijh -
      sign * (conv_params.wS[2] - 1) * ijw;

  // Build implicit gemm params
  ImplicitGemmConv3DParams gemm_params{
      /* const int M = */ implicit_M,
      /* const int N = */ implicit_N,
      /* const int K = */ implicit_K,

      /* const int gemm_k_iterations = */ gemm_k_iters,

      /* const int inp_jump_w = */ inp_jump_w,
      /* const int inp_jump_h = */ inp_jump_h,
      /* const int inp_jump_d = */ inp_jump_d,
      /* const int inp_jump_c = */ inp_jump_c,

      /* const int tiles_n = */ tn,
      /* const int tiles_m = */ tm,
      /* const int swizzle_log = */ swizzle_log};

  // Determine kernel
  // The filter field is 's' (small) or 'l'.
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      "implicit_gemm_conv_3d_",
      type_to_name(out),
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn,
      "_filter_",
      small_filter ? 's' : 'l');

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel =
      get_steel_conv_3d_kernel(d, kname, out, bm, bn, bk, wm, wn, small_filter);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Deduce grid launch dimensions
  // swizzle_log is 0 above, so tile == 1 and the grid is (tn, tm, groups).
  int tile = 1 << swizzle_log;
  size_t grid_dim_y = (tm + tile - 1) / tile;
  size_t grid_dim_x = tn * tile;

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(grid_dim_x, grid_dim_y, groups);

  // Encode arrays
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_input_array(wt, 1);
  compute_encoder.set_output_array(out, 2);

  // Encode params
  compute_encoder.set_bytes(conv_params, 3);
  compute_encoder.set_bytes(gemm_params, 4);

  // Launch kernel
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// 3D convolution for channel counts that are not multiples of 16.
//
// The last axis of the input and the first and last axes of the weights are
// zero-padded up to the next multiple of 16, the implicit GEMM kernel is run
// into an intermediate buffer whose last axis is widened accordingly, and
// `out` is then made to share that buffer using the intermediate's strides.
void pad_and_slice_conv_3D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in_pre,
    const array& wt_pre,
    array& out,
    const MLXConvParams<3>& conv_params) {
  // For now this assumes conv_params.groups == 1.
  // Number of elements missing to reach the next multiple of 16.
  const auto deficit_to_16 = [](int count) {
    return ((count + 15) / 16) * 16 - count;
  };
  int extra_c = deficit_to_16(conv_params.C);
  int extra_o = deficit_to_16(conv_params.O);

  // Returns `x` zero-padded by `pad_ax_first` on its first axis and by
  // `pad_ax_last` on its last axis. When nothing has to be added, only row
  // contiguity is ensured.
  auto pad_array =
      [&](const array& x, int pad_ax_first, int pad_ax_last) -> array {
    const bool needs_padding = pad_ax_first != 0 || pad_ax_last != 0;
    if (!needs_padding) {
      return ensure_row_contiguous(x, d, s);
    }

    auto padded_shape = x.shape();
    padded_shape.front() += pad_ax_first;
    padded_shape.back() += pad_ax_last;

    array padded(padded_shape, x.dtype(), nullptr, {});
    array zero(0, x.dtype());
    pad_gpu(x, zero, padded, {0, -1}, {0, 0}, s);
    d.add_temporary(padded, s.index);
    return padded;
  };

  // The intermediate output is deliberately not registered as a temporary:
  // `out` ends up sharing its buffer below.
  auto wide_shape = out.shape();
  wide_shape.back() += extra_o;
  array intermediate(wide_shape, out.dtype(), nullptr, {});
  intermediate.set_data(allocator::malloc(intermediate.nbytes()));

  // Pad both operands, then convolve with the padded channel counts
  array in = pad_array(in_pre, 0, extra_c);
  array wt = pad_array(wt_pre, extra_o, extra_c);
  auto padded_params =
      MLXConvParams<3>::with_padded_channels(conv_params, extra_o, extra_c);
  implicit_gemm_conv_3D_gpu(s, d, in, wt, intermediate, padded_params);

  // Slice the result out of the intermediate buffer
  out.copy_shared_buffer(
      intermediate, intermediate.strides(), {0}, intermediate.data_size());
}

// Selects the GPU implementation for a 3D convolution.
//
// Order of preference:
//   1. No input dilation, groups == 1, channels not 16-aligned:
//      pad-and-slice implicit GEMM (allocates its own output buffer).
//   2. No input dilation, channels 16-aligned: implicit GEMM.
//   3. Otherwise explicit (unfold + matmul) GEMM, with a dedicated variant
//      for groups > 1.
//
// For cases 2 and 3 the output is allocated here and both inputs are made
// row contiguous first. `copies` is not used by this function.
void dispatch_conv_3D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in_pre,
    const array& wt_pre,
    array& out,
    const MLXConvParams<3>& conv_params,
    std::vector<array>& copies) {
  const bool unit_input_dilation = conv_params.idil[0] == 1 &&
      conv_params.idil[1] == 1 && conv_params.idil[2] == 1;

  const int C_per_group = conv_params.C / conv_params.groups;
  const int O_per_group = conv_params.O / conv_params.groups;

  const bool in_channels_ok = C_per_group % 16 == 0;
  const bool out_channels_ok = O_per_group <= 16 || O_per_group % 16 == 0;
  const bool channels_mod16 = in_channels_ok && out_channels_ok;

  // Handled before anything else: this path neither needs contiguous inputs
  // nor the regular output allocation.
  if (unit_input_dilation && conv_params.groups == 1 && !channels_mod16) {
    pad_and_slice_conv_3D_gpu(s, d, in_pre, wt_pre, out, conv_params);
    return;
  }

  // Allocate the output and ensure contiguous inputs
  out.set_data(allocator::malloc(out.nbytes()));
  auto in = ensure_row_contiguous(in_pre, d, s);
  auto wt = ensure_row_contiguous(wt_pre, d, s);

  if (unit_input_dilation && channels_mod16) {
    // Implicit gemm
    implicit_gemm_conv_3D_gpu(s, d, in, wt, out, conv_params);
  } else if (conv_params.groups > 1) {
    // Explicit gemm, grouped variant
    explicit_gemm_conv_group_ND_gpu(s, d, in, wt, out, conv_params);
  } else {
    // Explicit gemm: unfold and matmul
    explicit_gemm_conv_ND_gpu(s, d, in, wt, out, conv_params);
  }
}

// 2D convolution via a four-stage Winograd pipeline:
//   1. copy the input into a zero-filled, padded buffer,
//   2. transform the weights ("winograd_conv_2d_weight_transform_*"),
//   3. transform the padded input ("winograd_conv_2d_input_transform_*"),
//   4. batched matmul over 8 * 8 batch entries, then transform the product
//      into `out` ("winograd_conv_2d_output_transform_*").
//
// The only caller visible in this file (dispatch_conv_2D_gpu) selects this
// path for unflipped 3x3 filters with unit stride and dilation and with C and
// O multiples of 32.
// NOTE(review): the 8 * 8 leading dimension together with the 6-wide output
// tiling presumably corresponds to Winograd F(6x6, 3x3) -- confirm against
// the kernels.
//
// Every intermediate array is appended to `copies_w`. `out` is not allocated
// in this function.
void winograd_conv_2D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<2>& conv_params,
    std::vector<array>& copies_w) {
  Shape padded_shape = {
      conv_params.N,
      conv_params.iS[0] + 2 * conv_params.pad[0],
      conv_params.iS[1] + 2 * conv_params.pad[1],
      conv_params.C};

  // Round each padded spatial extent up so that (extent - 2) is a multiple
  // of 6.
  padded_shape[1] = 6 * ((padded_shape[1] - 2 + 5) / 6) + 2;
  padded_shape[2] = 6 * ((padded_shape[2] - 2 + 5) / 6) + 2;

  array in_padded(std::move(padded_shape), in.dtype(), nullptr, {});

  // Fill with zeros
  array zero_arr = array(0, in.dtype());
  fill_gpu(zero_arr, in_padded, s);
  copies_w.push_back(zero_arr);

  // Pick input slice from padded
  // The slice has the shape of `in`, uses the strides of `in_padded`, and
  // starts pad[0] rows and pad[1] columns into the padded buffer.
  size_t data_offset = conv_params.pad[0] * in_padded.strides()[1] +
      conv_params.pad[1] * in_padded.strides()[2];
  array in_padded_slice(in.shape(), in_padded.dtype(), nullptr, {});
  in_padded_slice.copy_shared_buffer(
      in_padded,
      in_padded.strides(),
      in_padded.flags(),
      in_padded_slice.size(),
      data_offset);

  // Copy input values into the slice
  copy_gpu_inplace(in, in_padded_slice, CopyType::GeneralGeneral, s);

  copies_w.push_back(in_padded_slice);
  copies_w.push_back(in_padded);

  // Parameters describing the convolution over the padded input: padding is
  // now part of the data, so pad is {0, 0}; stride and dilations are 1.
  MLXConvParams<2> conv_params_updated{
      /* const int  N = */ static_cast<int>(in_padded.shape(0)),
      /* const int  C = */ static_cast<int>(in_padded.shape(3)),
      /* const int  O = */ static_cast<int>(wt.shape(0)),
      /* const int iS[NDIM] = */
      {static_cast<int>(in_padded.shape(1)),
       static_cast<int>(in_padded.shape(2))},
      /* const int wS[NDIM] = */
      {static_cast<int>(wt.shape(1)), static_cast<int>(wt.shape(2))},
      /* const int oS[NDIM] = */
      {static_cast<int>(out.shape(1)), static_cast<int>(out.shape(2))},
      /* const int str[NDIM] = */ {1, 1},
      /* const int pad[NDIM] = */ {0, 0},
      /* const int kdil[NDIM] = */ {1, 1},
      /* const int idil[NDIM] = */ {1, 1},
      /* const size_t in_strides[NDIM + 2] = */
      {in_padded.strides()[0],
       in_padded.strides()[1],
       in_padded.strides()[2],
       in_padded.strides()[3]},
      /* const size_t wt_strides[NDIM + 2] = */
      {wt.strides()[0], wt.strides()[1], wt.strides()[2], wt.strides()[3]},
      /* const size_t out_strides[NDIM + 2] = */
      {out.strides()[0], out.strides()[1], out.strides()[2], out.strides()[3]},
      /* const int groups = */ 1,
      /* const bool flip = */ false,
  };

  int O_c = conv_params.O;
  int C_c = conv_params.C;

  // Number of 6x6 output tiles per image along h and w (rounded up), and in
  // total across the batch.
  int N_tiles_n = conv_params.N;
  int N_tiles_h = (conv_params.oS[0] + 5) / 6;
  int N_tiles_w = (conv_params.oS[1] + 5) / 6;
  int N_tiles = N_tiles_n * N_tiles_h * N_tiles_w;

  // Do filter transform
  // Output: (64, C, O). One threadgroup of (32, bo, 1) threads per bo output
  // channels; O_c / bo is an integer division.
  Shape filt_wg_shape = {8 * 8, conv_params.C, conv_params.O};
  array filt_wg(std::move(filt_wg_shape), wt.dtype(), nullptr, {});
  filt_wg.set_data(allocator::malloc(filt_wg.nbytes()));
  copies_w.push_back(filt_wg);
  {
    int bc = 32;
    int bo = 4;
    std::string kname;
    kname.reserve(32);
    concatenate(
        kname,
        "winograd_conv_2d_weight_transform_",
        type_to_name(out),
        "_bc",
        bc);
    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel(kname);
    compute_encoder.set_compute_pipeline_state(kernel);

    compute_encoder.set_input_array(wt, 0);
    compute_encoder.set_output_array(filt_wg, 1);

    compute_encoder.set_bytes(C_c, 2);
    compute_encoder.set_bytes(O_c, 3);

    MTL::Size group_dims = MTL::Size(32, bo, 1);
    MTL::Size grid_dims = MTL::Size(O_c / bo, 1, 1);

    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }

  // Do input transform
  // Output: (64, N_tiles, C). One threadgroup per output tile.
  Shape inp_wg_shape = {8 * 8, N_tiles, conv_params.C};
  array inp_wg(std::move(inp_wg_shape), in.dtype(), nullptr, {});
  inp_wg.set_data(allocator::malloc(inp_wg.nbytes()));
  copies_w.push_back(inp_wg);
  {
    int bc = 32;
    int wm = 2;
    int wn = 2;
    std::string kname;
    kname.reserve(32);
    concatenate(
        kname,
        "winograd_conv_2d_input_transform_",
        type_to_name(out),
        "_bc",
        bc);
    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel(kname);
    compute_encoder.set_compute_pipeline_state(kernel);

    compute_encoder.set_input_array(in_padded, 0);
    compute_encoder.set_output_array(inp_wg, 1);

    compute_encoder.set_bytes(conv_params_updated, 2);

    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);

    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }

  // Do batched gemm
  // (64, N_tiles, C) x (64, C, O) -> (64, N_tiles, O), neither operand
  // transposed. The matmul gets its own empty copies list.
  Shape out_wg_shape = {8 * 8, N_tiles, conv_params.O};
  array out_wg(std::move(out_wg_shape), in.dtype(), nullptr, {});
  out_wg.set_data(allocator::malloc(out_wg.nbytes()));
  copies_w.push_back(out_wg);
  {
    std::vector<array> empty_copies;
    steel_matmul(
        s,
        d,
        /*a = */ inp_wg,
        /*b = */ filt_wg,
        /*c = */ out_wg,
        /*M = */ N_tiles,
        /*N = */ conv_params.O,
        /*K = */ conv_params.C,
        /*batch_size_out = */ 8 * 8,
        /*a_cols = */ conv_params.C,
        /*b_cols = */ conv_params.O,
        /*a_transposed = */ false,
        /*b_transposed = */ false,
        /*copies = */ empty_copies);
  }

  // Do output transform
  // Reads the matmul result and writes `out`; same grid as the input
  // transform. The kernel-name suffix is labelled "_bo" while its value is
  // taken from `bc`.
  {
    int bc = 32;
    int wm = 2;
    int wn = 2;
    std::string kname;
    kname.reserve(32);
    concatenate(
        kname,
        "winograd_conv_2d_output_transform_",
        type_to_name(out),
        "_bo",
        bc);
    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel(kname);
    compute_encoder.set_compute_pipeline_state(kernel);

    compute_encoder.set_input_array(out_wg, 0);
    compute_encoder.set_output_array(out, 1);

    compute_encoder.set_bytes(conv_params_updated, 2);

    MTL::Size group_dims = MTL::Size(32, wn, wm);
    MTL::Size grid_dims = MTL::Size(N_tiles_w, N_tiles_h, N_tiles_n);

    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }
}

// Encodes the specialized depthwise 2D convolution kernel
// ("depthwise_conv_2d_<type>").
//
// Filter extents, strides, the h/w threadgroup tile and the flip flag are
// supplied as function constants and are also spelled out in the hash name
// passed to the kernel lookup.
// Buffer slots: 0 = in, 1 = wt, 2 = out, 3 = conv_params.
// The grid uses integer division of C by 8, oS[1] by 8 and oS[0] by 4.
void depthwise_conv_2D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<2>& conv_params) {
  // Threadgroup tile: channels x width x height
  const int tc = 8;
  const int tw = 8;
  const int th = 4;

  // Values specialized into the kernel. They are kept as named locals because
  // the function-constant list below stores their addresses.
  const int ker_h = conv_params.wS[0];
  const int ker_w = conv_params.wS[1];
  const int str_h = conv_params.str[0];
  const int str_w = conv_params.str[1];
  const bool do_flip = conv_params.flip;
  const int N = conv_params.N;

  metal::MTLFCList func_consts = {
      {&ker_h, MTL::DataType::DataTypeInt, 0},
      {&ker_w, MTL::DataType::DataTypeInt, 1},
      {&str_h, MTL::DataType::DataTypeInt, 10},
      {&str_w, MTL::DataType::DataTypeInt, 11},
      {&th, MTL::DataType::DataTypeInt, 100},
      {&tw, MTL::DataType::DataTypeInt, 101},
      {&do_flip, MTL::DataType::DataTypeBool, 200},
  };

  // Kernel name and the name that additionally encodes the specialization
  std::string base_name;
  base_name.reserve(32);
  concatenate(base_name, "depthwise_conv_2d_", type_to_name(out));

  const char flip_tag = do_flip ? 't' : 'n';
  std::string hash_name;
  hash_name.reserve(64);
  concatenate(
      hash_name,
      base_name,
      "_ker_h_",
      ker_h,
      "_ker_w_",
      ker_w,
      "_str_h_",
      str_h,
      "_str_w_",
      str_w,
      "_tgp_h_",
      th,
      "_tgp_w_",
      tw,
      "_do_flip_",
      flip_tag);

  auto& enc = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(base_name, hash_name, func_consts);
  enc.set_compute_pipeline_state(kernel);

  enc.set_input_array(in, 0);
  enc.set_input_array(wt, 1);
  enc.set_output_array(out, 2);
  enc.set_bytes(conv_params, 3);

  // One threadgroup per (tc channels, tw columns, th rows) tile; the batch
  // is folded into the z extent.
  const int groups_c = conv_params.C / tc;
  const int groups_w = conv_params.oS[1] / tw;
  const int groups_hn = (conv_params.oS[0] / th) * N;

  MTL::Size group_dims = MTL::Size(tc, tw, th);
  MTL::Size grid_dims = MTL::Size(groups_c, groups_w, groups_hn);

  enc.dispatch_threadgroups(grid_dims, group_dims);
}

void dispatch_conv_2D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out,
    const MLXConvParams<2>& conv_params,
    std::vector<array>& copies) {
  bool is_stride_one = conv_params.str[0] == 1 && conv_params.str[1] == 1;
  bool is_kdil_one = conv_params.kdil[0] == 1 && conv_params.kdil[1] == 1;
  bool is_idil_one = conv_params.idil[0] == 1 && conv_params.idil[1] == 1;

  if (is_idil_one && conv_params.groups > 1) {
    const int C_per_group = conv_params.C / conv_params.groups;
    const int O_per_group = conv_params.O / conv_params.groups;

    if (C_per_group == 1 && O_per_group == 1 && is_kdil_one &&
        conv_params.wS[0] <= 7 && conv_params.wS[1] <= 7 &&
        conv_params.str[0] <= 2 && conv_params.str[1] <= 2 &&
        conv_params.oS[0] % 8 == 0 && conv_params.oS[1] % 8 == 0 &&
        conv_params.wt_strides[1] == conv_params.wS[1] &&
        conv_params.C % 16 == 0 && conv_params.C == conv_params.O) {
      return depthwise_conv_2D_gpu(s, d, in, wt, out, conv_params);
    }

    if ((C_per_group <= 4 || C_per_group % 16 == 0) &&
        (O_per_group <= 16 || O_per_group % 16 == 0)) {
      return implicit_gemm_conv_2D_gpu(s, d, in, wt, out, conv_params);
    } else {
      return explicit_gemm_conv_group_ND_gpu(s, d, in, wt, out, conv_params);
    }
  }

  // Direct to winograd conv
  bool inp_large =
      (conv_params.N * conv_params.iS[0] * conv_params.iS[1]) >= 4096;
  bool channels_large = (conv_params.C + conv_params.O) >= 256;
  bool out_large =
      (conv_params.N * conv_params.oS[0] * conv_params.oS[1]) >= 256;
  if (!conv_params.flip && is_stride_one && is_kdil_one && is_idil_one &&
      conv_params.wS[0] == 3 && conv_params.wS[1] == 3 &&
      conv_params.C % 32 == 0 && conv_params.O % 32 == 0 && inp_large &&
      channels_large) {
    return winograd_conv_2D_gpu(s, d, in, wt, out, conv_params, copies);
  }

  // Direct to implicit gemm conv
  if (is_idil_one && (conv_params.C <= 4 || conv_params.C % 16 == 0) &&
      (conv_params.O <= 16 || conv_params.O % 16 == 0)) {
    return implicit_gemm_conv_2D_gpu(s, d, in, wt, out, conv_params);
  }

  else if ((conv_params.C % 16 == 0 && conv_params.O % 16 == 0) || out_large) {
    return implicit_gemm_conv_2D_general_gpu(s, d, in, wt, out, conv_params);
  }

  // Direct to explicit gemm conv
  else {
    return explicit_gemm_conv_ND_gpu(s, d, in, wt, out, conv_params);
  }
}

// Encodes the depthwise 1D convolution kernel.
//
// One thread is dispatched per (in.shape(2), out.shape(1), in.shape(0))
// element. The input strides are passed as 64-bit values when the input's
// size or data size exceeds INT32_MAX, and as 32-bit values otherwise.
// Buffer slots: 0 = in, 1 = wt, 2 = out, 3 = input strides, 4 = wt.shape(1).
void depthwise_conv_1D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array& out) {
  const bool large = in.size() > INT32_MAX || in.data_size() > INT32_MAX;

  // NOTE(review): for large inputs this builds
  // "depthwise_conv_1d__large<type>", i.e. a double underscore with the
  // suffix placed before the result of type_to_name(out). Confirm that this
  // matches the names the kernels are instantiated under.
  const char* size_tag = large ? "_large" : "";
  std::string base_name;
  base_name.reserve(32);
  concatenate(base_name, "depthwise_conv_1d_", size_tag, type_to_name(out));

  auto& enc = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(base_name);
  enc.set_compute_pipeline_state(kernel);

  auto B = in.shape(0);
  auto Tout = out.shape(1);
  auto D = in.shape(2);
  auto K = wt.shape(1);

  enc.set_input_array(in, 0);
  enc.set_input_array(wt, 1);
  enc.set_output_array(out, 2);

  // Input strides, in the width selected above
  if (!large) {
    int strides32[3] = {
        static_cast<int>(in.strides(0)),
        static_cast<int>(in.strides(1)),
        static_cast<int>(in.strides(2))};
    enc.set_bytes(strides32, 3, 3);
  } else {
    int64_t strides64[3] = {in.strides(0), in.strides(1), in.strides(2)};
    enc.set_bytes(strides64, 3, 3);
  }

  enc.set_bytes(K, 4);

  MTL::Size grid_dims = MTL::Size(D, Tout, B);
  auto group_dims = get_block_dims(D, Tout, B);

  enc.dispatch_threads(grid_dims, group_dims);
}

void conv_1D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in_pre,
    const array& wt_pre,
    array& out,
    const std::vector<int>& padding,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    int groups,
    bool flip,
    std::vector<array>& copies) {
  // Allocate space and ensure weights are contiguous
  out.set_data(allocator::malloc(out.nbytes()));
  auto in = ensure_row_contiguous(in_pre, d, s);
  auto wt = ensure_row_contiguous(wt_pre, d, s);

  bool is_idil_one = in_dilation[0] == 1;
  int C = in.shape(2);
  int O = wt.shape(0);
  // Fast path for fully separable 1D convolution
  if (is_idil_one && (groups == C) && groups == O && wt_strides[0] == 1 &&
      wt_dilation[0] == 1 && padding[0] == 0 && !flip) {
    depthwise_conv_1D_gpu(s, d, in, wt, out);
    return;
  }

  const int C_per_group = C / groups;
  const int O_per_group = O / groups;

  // Direct to implicit gemm conv
  if (is_idil_one && (C_per_group <= 4 || C_per_group % 16 == 0) &&
      (O_per_group <= 16 || O_per_group % 16 == 0)) {
    MLXConvParams<2> conv_params{
        /* const int  N = */ static_cast<int>(in.shape(0)),
        /* const int  C = */ C,
        /* const int  O = */ O,
        /* const int iS[NDIM] = */ {static_cast<int>(in.shape(1)), 1},
        /* const int wS[NDIM] = */ {static_cast<int>(wt.shape(1)), 1},
        /* const int oS[NDIM] = */ {static_cast<int>(out.shape(1)), 1},
        /* const int str[NDIM] = */ {wt_strides[0], 1},
        /* const int pad[NDIM] = */ {padding[0], 0},
        /* const int kdil[NDIM] = */ {wt_dilation[0], 1},
        /* const int idil[NDIM] = */ {in_dilation[0], 1},
        /* const size_t in_strides[NDIM + 2] = */
        {in.strides()[0], in.strides()[1], 0, in.strides()[2]},
        /* const size_t wt_strides[NDIM + 2] = */
        {wt.strides()[0], wt.strides()[1], 0, wt.strides()[2]},
        /* const size_t out_strides[NDIM + 2] = */
        {out.strides()[0], out.strides()[1], 0, out.strides()[2]},
        /* const int groups = */ groups,
        /* const bool flip = */ flip};

    dispatch_conv_2D_gpu(s, d, in, wt, out, conv_params, copies);
    return;
  }

  // Make conv params
  MLXConvParams<1> conv_params{
      /* const int  N = */ static_cast<int>(in.shape(0)),
      /* const int  C = */ static_cast<int>(in.shape(2)),
      /* const int  O = */ static_cast<int>(wt.shape(0)),
      /* const int iS[NDIM] = */ {static_cast<int>(in.shape(1))},
      /* const int wS[NDIM] = */ {static_cast<int>(wt.shape(1))},
      /* const int oS[NDIM] = */ {static_cast<int>(out.shape(1))},
      /* const int str[NDIM] = */ {wt_strides[0]},
      /* const int pad[NDIM] = */ {padding[0]},
      /* const int kdil[NDIM] = */ {wt_dilation[0]},
      /* const int idil[NDIM] = */ {in_dilation[0]},
      /* const size_t in_strides[NDIM + 2] = */
      {in.strides()[0], in.strides()[1], in.strides()[2]},
      /* const size_t wt_strides[NDIM + 2] = */
      {wt.strides()[0], wt.strides()[1], wt.strides()[2]},
      /* const size_t out_strides[NDIM + 2] = */
      {out.strides()[0], out.strides()[1], out.strides()[2]},
      /* const int groups = */ groups,
      /* const bool flip = */ flip};

  // Direct to explicit gemm conv
  if (groups > 1) {
    return explicit_gemm_conv_group_ND_gpu(s, d, in, wt, out, conv_params);
  } else {
    return explicit_gemm_conv_ND_gpu(s, d, in, wt, out, conv_params);
  }
}

void conv_2D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in_pre,
    const array& wt_pre,
    array& out,
    const std::vector<int>& padding,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    const int groups,
    bool flip,
    std::vector<array>& copies) {
  // Allocate space and ensure weights are contiguous
  out.set_data(allocator::malloc(out.nbytes()));
  auto in = ensure_row_contiguous(in_pre, d, s);
  auto wt = ensure_row_contiguous(wt_pre, d, s);

  // Make conv params
  MLXConvParams<2> conv_params{
      /* const int  N = */ static_cast<int>(in.shape(0)),
      /* const int  C = */ static_cast<int>(in.shape(3)),
      /* const int  O = */ static_cast<int>(wt.shape(0)),
      /* const int iS[NDIM] = */
      {static_cast<int>(in.shape(1)), static_cast<int>(in.shape(2))},
      /* const int wS[NDIM] = */
      {static_cast<int>(wt.shape(1)), static_cast<int>(wt.shape(2))},
      /* const int oS[NDIM] = */
      {static_cast<int>(out.shape(1)), static_cast<int>(out.shape(2))},
      /* const int str[NDIM] = */ {wt_strides[0], wt_strides[1]},
      /* const int pad[NDIM] = */ {padding[0], padding[1]},
      /* const int kdil[NDIM] = */ {wt_dilation[0], wt_dilation[1]},
      /* const int idil[NDIM] = */ {in_dilation[0], in_dilation[1]},
      /* const size_t in_strides[NDIM + 2] = */
      {in.strides(0), in.strides(1), in.strides(2), in.strides(3)},
      /* const size_t wt_strides[NDIM + 2] = */
      {wt.strides(0), wt.strides(1), wt.strides(2), wt.strides(3)},
      /* const size_t out_strides[NDIM + 2] = */
      {out.strides(0), out.strides(1), out.strides(2), out.strides(3)},
      /* const int groups = */ groups,
      /* const bool flip = */ flip,
  };
  dispatch_conv_2D_gpu(s, d, in, wt, out, conv_params, copies);
}

void conv_3D_gpu(
    const Stream& s,
    metal::Device& d,
    const array& in,
    const array& wt,
    array out,
    const std::vector<int>& padding,
    const std::vector<int>& wt_strides,
    const std::vector<int>& wt_dilation,
    const std::vector<int>& in_dilation,
    int groups,
    bool flip,
    std::vector<array>& copies) {
  // We will use the contiguous strides for the conv params because that is
  // what the rest of the code expects.
  constexpr int NDIM = 3;
  int64_t in_arr_strides[NDIM + 2];
  int64_t wt_arr_strides[NDIM + 2];
  in_arr_strides[NDIM + 1] = wt_arr_strides[NDIM + 1] = 1;
  for (int i = NDIM; i >= 0; i--) {
    in_arr_strides[i] = in_arr_strides[i + 1] * in.shape(i + 1);
    wt_arr_strides[i] = wt_arr_strides[i + 1] * wt.shape(i + 1);
  }

  // Make conv params
  MLXConvParams<3> conv_params{
      /* const int  N = */ static_cast<int>(in.shape(0)),
      /* const int  C = */ static_cast<int>(in.shape(4)),
      /* const int  O = */ static_cast<int>(wt.shape(0)),
      /* const int iS[NDIM] = */
      {static_cast<int>(in.shape(1)),
       static_cast<int>(in.shape(2)),
       static_cast<int>(in.shape(3))},
      /* const int wS[NDIM] = */
      {static_cast<int>(wt.shape(1)),
       static_cast<int>(wt.shape(2)),
       static_cast<int>(wt.shape(3))},
      /* const int oS[NDIM] = */
      {static_cast<int>(out.shape(1)),
       static_cast<int>(out.shape(2)),
       static_cast<int>(out.shape(3))},
      /* const int str[NDIM] = */ {wt_strides[0], wt_strides[1], wt_strides[2]},
      /* const int pad[NDIM] = */ {padding[0], padding[1], padding[2]},
      /* const int kdil[NDIM] = */
      {wt_dilation[0], wt_dilation[1], wt_dilation[2]},
      /* const int idil[NDIM] = */
      {in_dilation[0], in_dilation[1], in_dilation[2]},
      /* const size_t in_strides[NDIM + 2] = */
      {in_arr_strides[0],
       in_arr_strides[1],
       in_arr_strides[2],
       in_arr_strides[3],
       in_arr_strides[4]},
      /* const size_t wt_strides[NDIM + 2] = */
      {wt_arr_strides[0],
       wt_arr_strides[1],
       wt_arr_strides[2],
       wt_arr_strides[3],
       wt_arr_strides[4]},
      /* const size_t out_strides[NDIM + 2] = */
      {out.strides(0),
       out.strides(1),
       out.strides(2),
       out.strides(3),
       out.strides(4)},
      /* const int groups = */ groups,
      /* const bool flip = */ flip,
  };
  return dispatch_conv_3D_gpu(s, d, in, wt, out, conv_params, copies);
}

} // namespace

// Evaluates the convolution on the GPU. The rank of `out` selects the 1D, 2D
// or 3D implementation; any other rank throws std::invalid_argument.
void Convolution::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  // Intermediates collected here are registered with the device as
  // temporaries once the work has been encoded.
  std::vector<array> copies;

  const array& in = inputs[0];
  const array& wt = inputs[1];

  switch (out.ndim()) {
    case 5: // 3D conv
      conv_3D_gpu(
          s,
          d,
          in,
          wt,
          out,
          padding_lo_,
          kernel_strides_,
          kernel_dilation_,
          input_dilation_,
          groups_,
          flip_,
          copies);
      break;
    case 4: // 2D conv
      conv_2D_gpu(
          s,
          d,
          in,
          wt,
          out,
          padding_lo_,
          kernel_strides_,
          kernel_dilation_,
          input_dilation_,
          groups_,
          flip_,
          copies);
      break;
    case 3: // 1D conv
      conv_1D_gpu(
          s,
          d,
          in,
          wt,
          out,
          padding_lo_,
          kernel_strides_,
          kernel_dilation_,
          input_dilation_,
          groups_,
          flip_,
          copies);
      break;
    default:
      throw std::invalid_argument(
          "[Convolution::eval_gpu] Only supports 1D, 2D or 3D convolutions.");
  }

  // Record copies
  if (copies.empty()) {
    return;
  }
  d.add_temporaries(std::move(copies), s.index);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/copy.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"

namespace mlx::core {

constexpr int MAX_COPY_SPECIALIZED_DIMS = 3;

void copy_gpu(const array& in, array& out, CopyType ctype, const Stream& s) {
  bool donated = set_copy_output_data(in, out, ctype);
  if (donated && in.dtype() == out.dtype()) {
    // If the output has the same type as the input then there is nothing to
    // copy, just use the buffer.
    return;
  }
  if (ctype == CopyType::GeneralGeneral) {
    ctype = CopyType::General;
  }
  copy_gpu_inplace(in, out, ctype, s);
}

// Encodes a copy of `in` into the already-allocated `out`.
//
// `data_shape`, `strides_in_pre` and `strides_out_pre` describe the region
// being copied and are only consulted for General / GeneralGeneral copies.
// `inp_offset` and `out_offset` are given in elements and are converted to
// bytes before the buffers are bound. `dynamic_i_offset` /
// `dynamic_o_offset` optionally supply offsets as arrays; they are only
// accepted for GeneralGeneral copies (std::runtime_error otherwise).
//
// Buffer layout: 0 = input, 1 = output, 2 = shape (general, ndim > 3) or
// element count (scalar/vector), 3 = input strides, 4 = output strides
// (GeneralGeneral only), 5 = ndim (ndim > MAX_COPY_SPECIALIZED_DIMS),
// 6 / 7 = dynamic input / output offsets.
void copy_gpu_inplace(
    const array& in,
    array& out,
    const Shape& data_shape,
    const Strides& strides_in_pre,
    const Strides& strides_out_pre,
    int64_t inp_offset,
    int64_t out_offset,
    CopyType ctype,
    const Stream& s,
    std::optional<array> dynamic_i_offset /* = std::nullopt */,
    std::optional<array> dynamic_o_offset /* = std::nullopt */) {
  // Nothing to do for empty outputs
  if (out.size() == 0) {
    return;
  }

  // Try to collapse contiguous dims
  // (scalar and vector copies do not use shape/strides, so they get empty
  // ones)
  auto maybe_collapse =
      [ctype, &data_shape, &strides_in_pre, &strides_out_pre]() {
        if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
          auto [shape, strides] = collapse_contiguous_dims(
              data_shape,
              std::vector{strides_in_pre, strides_out_pre},
              /* size_cap = */ INT32_MAX);
          return std::make_tuple(shape, strides[0], strides[1]);
        } else {
          Strides e{};
          return std::make_tuple(Shape{}, e, e);
        }
      };
  auto [shape, strides_in_, strides_out_] = maybe_collapse();
  int ndim = shape.size();
  // `large` selects the kernel variants meant for sizes beyond the 32-bit
  // thresholds checked below
  bool large;
  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
    // Allow for negative strides
    large = in.data_size() > INT32_MAX || out.data_size() > INT32_MAX;
  } else {
    large = out.data_size() > UINT32_MAX;
  }
  bool dynamic = dynamic_i_offset || dynamic_o_offset;
  auto& d = metal::device(s.device);
  int work_per_thread = 1;
  // Assemble the kernel name: copy-type prefix, then dimension / work
  // suffixes, then "_copy" and the two dtype names
  std::string kernel_name;
  switch (ctype) {
    case CopyType::Scalar:
      kernel_name = large ? "s2" : "s";
      break;
    case CopyType::Vector:
      kernel_name = large ? "v2" : "v";
      break;
    case CopyType::General:
      kernel_name = "g";
      break;
    case CopyType::GeneralGeneral:
      kernel_name = "gg";
      break;
  }
  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
    if (shape.size() <= MAX_COPY_SPECIALIZED_DIMS) {
      // Dimension-specialized kernel
      kernel_name += std::to_string(shape.size());
    } else {
      // Generic n-dimensional kernel handling several elements per thread
      work_per_thread = large ? 4 : 2;
      concatenate(kernel_name, "n", std::to_string(work_per_thread));
    }
    if (large) {
      kernel_name += "large";
    }
    if (dynamic) {
      kernel_name += "_dynamic";
      if (ctype != CopyType::GeneralGeneral) {
        throw std::runtime_error(
            "[Copy::eval_gpu] Dynamic output offset requires GeneralGeneral copy");
      }
    }
  } else {
    work_per_thread = get_work_per_thread(out.dtype(), out.data_size());
    if (!large && work_per_thread > 1) {
      kernel_name += "n";
    }
  }
  concatenate(kernel_name, "_copy", type_to_name(in), type_to_name(out));
  auto kernel = dynamic ? get_dynamic_copy_kernel(d, kernel_name, in, out)
                        : get_copy_kernel(d, kernel_name, in, out);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Offsets arrive in elements; the encoder is given byte offsets
  inp_offset *= size_of(in.dtype());
  out_offset *= size_of(out.dtype());

  compute_encoder.set_input_array(in, 0, inp_offset);
  compute_encoder.set_output_array(out, 1, out_offset);

  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (ctype == CopyType::General || ctype == CopyType::GeneralGeneral) {
    Strides strides_in{strides_in_.begin(), strides_in_.end()};
    Strides strides_out{strides_out_.begin(), strides_out_.end()};
    if (ndim > 3) {
      compute_encoder.set_vector_bytes(shape, ndim, 2);
    }
    compute_encoder.set_vector_bytes(strides_in, ndim, 3);
    if (ctype == CopyType::GeneralGeneral) {
      compute_encoder.set_vector_bytes(strides_out, ndim, 4);
    }

    // Grid layout: x covers the last dim, y the second to last and z all
    // remaining dims flattened together
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;

    size_t data_size = 1;
    // NOTE: the loop variable below shadows the Stream parameter `s`
    for (auto& s : shape)
      data_size *= s;
    size_t rest = data_size / (dim0 * dim1);

    if (ndim > MAX_COPY_SPECIALIZED_DIMS) {
      compute_encoder.set_bytes(ndim, 5);
      // Each thread covers `work_per_thread` elements along the last dim
      dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    }
    if (dynamic) {
      // A missing dynamic offset is passed as a literal zero
      if (dynamic_i_offset) {
        compute_encoder.set_input_array(*dynamic_i_offset, 6);
      } else {
        compute_encoder.set_bytes(0ll, 6);
      }
      if (dynamic_o_offset) {
        compute_encoder.set_input_array(*dynamic_o_offset, 7);
      } else {
        compute_encoder.set_bytes(0ll, 7);
      }
    }

    // NB assuming thread_group_size is a power of 2 larger than 32 x 32
    if (thread_group_size != 1024) {
      throw std::runtime_error("[Metal::copy] Must use 1024 sized block");
    }

    auto group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    // Scalar / vector copies: a flat launch over the output's data
    size_t nthreads = ceildiv(out.data_size(), work_per_thread);
    if (thread_group_size > nthreads) {
      thread_group_size = nthreads;
    }
    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
    MTL::Size grid_dims;
    if (large) {
      // The large variant takes a 64-bit size and a 2D grid
      compute_encoder.set_bytes<int64_t>(out.data_size(), 2);
      grid_dims = get_2d_grid_dims(out.shape(), out.strides(), work_per_thread);
    } else {
      compute_encoder.set_bytes<int>(out.data_size(), 2);
      grid_dims = MTL::Size(nthreads, 1, 1);
    }
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

// Allocates `out` and fills it from `val` using a scalar copy kernel. Empty
// outputs are left untouched (not even allocated).
void fill_gpu(const array& val, array& out, const Stream& s) {
  if (out.size() == 0) {
    return;
  }
  out.set_data(allocator::malloc(out.nbytes()));

  const bool large = out.data_size() > UINT32_MAX;
  const int work_per_thread =
      get_work_per_thread(out.dtype(), out.data_size());
  auto& d = metal::device(s.device);

  // "s2" for large outputs, otherwise "sn" when a thread handles several
  // elements and plain "s" when it handles one
  std::string kernel_name;
  if (large) {
    kernel_name = "s2";
  } else if (work_per_thread > 1) {
    kernel_name = "sn";
  } else {
    kernel_name = "s";
  }
  concatenate(kernel_name, "_copy", type_to_name(val), type_to_name(out));

  auto kernel = get_copy_kernel(d, kernel_name, val, out);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  compute_encoder.set_input_array(val, 0);
  compute_encoder.set_output_array(out, 1);

  // Never use a threadgroup larger than the number of threads launched
  size_t nthreads = ceildiv(out.data_size(), work_per_thread);
  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (thread_group_size > nthreads) {
    thread_group_size = nthreads;
  }
  MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);

  MTL::Size grid_dims;
  if (!large) {
    compute_encoder.set_bytes<int>(out.data_size(), 2);
    grid_dims = MTL::Size(nthreads, 1, 1);
  } else {
    compute_encoder.set_bytes<int64_t>(out.data_size(), 2);
    grid_dims = get_2d_grid_dims(out.shape(), out.strides(), work_per_thread);
  }
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Reshapes `in` into `out`. When prepare_reshape reports that no copy is
// needed the output shares the input's buffer with the returned strides;
// otherwise `out` is allocated and filled with a General copy into
// contiguous strides.
void reshape_gpu(const array& in, array& out, Stream s) {
  auto [needs_copy, shared_strides] = prepare_reshape(in, out);
  if (!needs_copy) {
    shared_buffer_reshape(in, shared_strides, out);
    return;
  }

  out.set_data(allocator::malloc(out.nbytes()));
  copy_gpu_inplace(
      in,
      out,
      in.shape(),
      in.strides(),
      make_contiguous_strides(in.shape()),
      /* inp_offset = */ 0,
      /* out_offset = */ 0,
      CopyType::General,
      s);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/custom_kernel.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <iostream>
#include <regex>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/fast.h"
#include "mlx/fast_primitives.h"
#include "mlx/utils.h"

namespace mlx::core::fast {

// Remembers, per kernel name, the source string last registered under that
// name. CustomKernel::eval_gpu compares against it to decide whether the
// device's library for that name has to be cleared.
struct CustomKernelCache {
  // kernel name -> kernel source
  std::unordered_map<std::string, std::string> libraries;
};

// Returns the single CustomKernelCache instance, created on first use.
static CustomKernelCache& cache() {
  static CustomKernelCache instance;
  return instance;
}

std::string write_signature(
    std::string func_name,
    const std::string& header,
    const std::string& source,
    const std::vector<std::string>& input_names,
    const std::vector<array>& inputs,
    const std::vector<std::string>& output_names,
    const std::vector<Dtype>& output_dtypes,
    const std::vector<std::pair<std::string, TemplateArg>>& template_args,
    const std::vector<std::string>& attributes,
    const std::vector<std::tuple<bool, bool, bool>>& shape_infos,
    bool atomic_outputs) {
  std::string kernel_source;
  kernel_source.reserve(header.size() + source.size() + 16384);
  kernel_source += header;
  // Auto-generate a function signature based on `template_args`
  // and the dtype/shape of the arrays passed as `inputs`.
  if (!template_args.empty()) {
    kernel_source += "template <";
    int i = 0;
    for (const auto& [name, arg] : template_args) {
      std::string param_type;
      if (std::holds_alternative<int>(arg)) {
        param_type = "int";
      } else if (std::holds_alternative<bool>(arg)) {
        param_type = "bool";
      } else if (std::holds_alternative<Dtype>(arg)) {
        param_type = "typename";
      }
      if (i > 0) {
        kernel_source += ", ";
      }
      kernel_source += param_type;
      kernel_source += " ";
      kernel_source += name;
      i++;
    }
    kernel_source += ">\n";
  }
  kernel_source += "[[kernel]] void ";
  kernel_source += func_name;
  kernel_source += "(\n";

  int index = 0;
  constexpr int max_constant_array_size = 8;
  // Add inputs
  for (int i = 0; i < inputs.size(); ++i) {
    const auto& name = input_names[i];
    const auto& arr = inputs[i];
    auto dtype = get_type_string(arr.dtype());
    std::string location =
        arr.size() < max_constant_array_size ? "constant" : "device";
    std::string ref = arr.ndim() == 0 ? "&" : "*";
    kernel_source += "  const ";
    kernel_source += location;
    kernel_source += " ";
    kernel_source += dtype;
    kernel_source += ref;
    kernel_source += " ";
    kernel_source += name;
    kernel_source += " [[buffer(";
    kernel_source += std::to_string(index);
    kernel_source += ")]],\n";
    index++;
    // Add input shape, strides and ndim if present in the source
    if (arr.ndim() > 0) {
      if (std::get<0>(shape_infos[i])) {
        kernel_source +=
            ("  const constant int* " + name + "_shape [[buffer(" +
             std::to_string(index) + ")]],\n");
        index++;
      }
      if (std::get<1>(shape_infos[i])) {
        kernel_source +=
            ("  const constant int64_t* " + name + "_strides [[buffer(" +
             std::to_string(index) + ")]],\n");
        index++;
      }
      if (std::get<2>(shape_infos[i])) {
        kernel_source +=
            ("  const constant int& " + name + "_ndim [[buffer(" +
             std::to_string(index) + ")]],\n");
        index++;
      }
    }
  }
  // Add outputs
  for (int i = 0; i < output_names.size(); ++i) {
    const auto& name = output_names[i];
    const auto& dtype = output_dtypes[i];
    kernel_source += "  device ";
    auto type_string = get_type_string(dtype);
    if (atomic_outputs) {
      kernel_source += "atomic<";
    }
    kernel_source += type_string;
    if (atomic_outputs) {
      kernel_source += ">";
    }
    kernel_source += "* ";
    kernel_source += name;
    kernel_source += " [[buffer(";
    kernel_source += std::to_string(index);
    kernel_source += ")]]";
    if (index < inputs.size() + output_names.size() - 1 ||
        attributes.size() > 0) {
      kernel_source += ",\n";
    } else {
      kernel_source += ") {\n";
    }
    index++;
  }

  index = 0;
  for (const auto& attr : attributes) {
    kernel_source += attr;
    if (index < attributes.size() - 1) {
      kernel_source += ",\n";
    } else {
      kernel_source += ") {\n";
    }
    index++;
  }
  kernel_source += source;
  kernel_source += "\n}\n";
  return kernel_source;
}

// Renders the template arguments as "<v0, v1, ...>". Integers and booleans
// are streamed as their values, dtypes as their Metal type string. The
// argument names are not part of the result.
std::string write_template(
    const std::vector<std::pair<std::string, TemplateArg>>& template_args) {
  std::ostringstream rendered;
  rendered << "<";
  bool first = true;
  for (const auto& [name, arg] : template_args) {
    if (!first) {
      rendered << ", ";
    }
    first = false;

    if (std::holds_alternative<int>(arg)) {
      rendered << std::get<int>(arg);
    } else if (std::holds_alternative<bool>(arg)) {
      rendered << std::get<bool>(arg);
    } else if (std::holds_alternative<Dtype>(arg)) {
      rendered << get_type_string(std::get<Dtype>(arg));
    }
  }
  rendered << ">";
  return rendered.str();
}

// Creates a callable that launches a user-provided Metal kernel body.
//
// `source` is the body of the kernel function and `header` is prepended to
// the generated source. The signature is generated automatically at call
// time from the inputs, outputs and template arguments. `source` is scanned
// once, here, for `<input>_shape` / `<input>_strides` / `<input>_ndim` and
// for the names of Metal kernel attributes; only the ones found are added to
// the signature.
//
// Throws std::invalid_argument if `output_names` is empty. The returned
// callable throws std::invalid_argument if the number of inputs, output
// shapes or output dtypes does not match the declared names, or if the
// stream is not on the GPU.
CustomKernelFunction metal_kernel(
    const std::string& name,
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::string& source,
    const std::string& header /* = "" */,
    bool ensure_row_contiguous /* = true */,
    bool atomic_outputs /* = false */) {
  if (output_names.empty()) {
    throw std::invalid_argument(
        "[metal_kernel] Must specify at least one output.");
  }
  // For each input record whether the source mentions its shape, strides or
  // ndim (plain substring search)
  std::vector<std::tuple<bool, bool, bool>> shape_infos;
  for (auto& n : input_names) {
    std::tuple<bool, bool, bool> shape_info;
    std::get<0>(shape_info) = source.find(n + "_shape") != std::string::npos;
    std::get<1>(shape_info) = source.find(n + "_strides") != std::string::npos;
    std::get<2>(shape_info) = source.find(n + "_ndim") != std::string::npos;
    shape_infos.push_back(shape_info);
  }
  // Kernel attributes that can be requested by name, with their Metal types
  const std::vector<std::pair<std::string, std::string>> metal_attributes = {
      {"dispatch_quadgroups_per_threadgroup", "uint"},
      {"dispatch_simdgroups_per_threadgroup", "uint"},
      {"dispatch_threads_per_threadgroup", "uint3"},
      {"grid_origin", "uint3"},
      {"grid_size", "uint3"},
      {"quadgroup_index_in_threadgroup", "uint"},
      {"quadgroups_per_threadgroup", "uint"},
      {"simdgroup_index_in_threadgroup", "uint"},
      {"simdgroups_per_threadgroup", "uint"},
      {"thread_execution_width", "uint"},
      {"thread_index_in_quadgroup", "uint"},
      {"thread_index_in_simdgroup", "uint"},
      {"thread_index_in_threadgroup", "uint"},
      {"thread_position_in_grid", "uint3"},
      {"thread_position_in_threadgroup", "uint3"},
      {"threadgroup_position_in_grid", "uint3"},
      {"threadgroups_per_grid", "uint3"},
      {"threads_per_grid", "uint3"},
      {"threads_per_simdgroup", "uint"},
      {"threads_per_threadgroup", "uint3"},
  };

  // Declare a parameter for every attribute whose name appears in the source
  // (also a plain substring search)
  std::vector<std::string> attributes;
  for (const auto& [attr, dtype] : metal_attributes) {
    if (source.find(attr) != std::string::npos) {
      attributes.push_back("  " + dtype + " " + attr + " [[" + attr + "]]");
    }
  }

  // The closure copies the arguments of metal_kernel and takes ownership of
  // the precomputed shape infos and attribute declarations
  return [=,
          shape_infos = std::move(shape_infos),
          attributes = std::move(attributes)](
             const std::vector<array>& inputs,
             const std::vector<Shape>& output_shapes,
             const std::vector<Dtype>& output_dtypes,
             std::tuple<int, int, int> grid,
             std::tuple<int, int, int> threadgroup,
             const std::vector<std::pair<std::string, TemplateArg>>&
                 template_args = {},
             std::optional<float> init_value = std::nullopt,
             bool verbose = false,
             StreamOrDevice s_ = {}) {
    // Validate the call against the names given at construction time
    if (inputs.size() != input_names.size()) {
      std::ostringstream msg;
      msg << "[metal_kernel] Expected `inputs` to have size "
          << input_names.size() << " but got size " << inputs.size() << "."
          << std::endl;
      throw std::invalid_argument(msg.str());
    }
    if (output_shapes.size() != output_names.size()) {
      std::ostringstream msg;
      msg << "[metal_kernel] Expected `output_shapes` to have size "
          << output_names.size() << " but got size " << output_shapes.size()
          << "." << std::endl;
      throw std::invalid_argument(msg.str());
    }
    if (output_dtypes.size() != output_names.size()) {
      std::ostringstream msg;
      msg << "[metal_kernel] Expected `output_dtypes` to have size "
          << output_names.size() << " but got size " << output_dtypes.size()
          << "." << std::endl;
      throw std::invalid_argument(msg.str());
    }

    auto s = to_stream(s_);
    if (s.device != Device::gpu) {
      throw std::invalid_argument("[metal_kernel] Only supports the GPU.");
    }

    // Each template instantiation gets its own kernel name: the rendered
    // template arguments with '<', '>' and ", " replaced by '_' (and the
    // trailing '_' dropped) are appended to the base name
    std::string kernel_name = "custom_kernel_" + name;
    std::string template_def = "";
    if (!template_args.empty()) {
      std::regex disallowed_chars("\\<|\\>|(, )");
      template_def = write_template(template_args);
      auto template_hash =
          std::regex_replace(template_def, disallowed_chars, "_");
      template_hash.pop_back();
      kernel_name += "_";
      kernel_name += template_hash;
    }

    std::string kernel_source = write_signature(
        kernel_name,
        header,
        source,
        input_names,
        inputs,
        output_names,
        output_dtypes,
        template_args,
        attributes,
        shape_infos,
        atomic_outputs);

    // Explicitly instantiate the template under the host-visible kernel name
    if (!template_args.empty()) {
      template_def = kernel_name + template_def;
      kernel_source += "\ntemplate [[host_name(\"";
      kernel_source += kernel_name;
      kernel_source += "\")]] [[kernel]] decltype(";
      kernel_source += template_def;
      kernel_source += ") ";
      kernel_source += template_def;
      kernel_source += ";\n";
    }

    if (verbose) {
      std::cout << "Generated source code for `" << name << "`:" << std::endl
                << "```" << std::endl
                << kernel_source << std::endl
                << "```" << std::endl;
    }

    // Wrap everything in a CustomKernel primitive producing the outputs
    return array::make_arrays(
        std::move(output_shapes),
        std::move(output_dtypes),
        std::make_shared<CustomKernel>(
            s,
            std::move(kernel_name),
            std::move(kernel_source),
            grid,
            threadgroup,
            shape_infos,
            ensure_row_contiguous,
            init_value,
            std::vector<ScalarArg>{},
            false,
            0),
        std::move(inputs));
  };
}

void CustomKernel::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  // silence some warnings
  (void)is_precompiled_;
  (void)shared_memory_;

  auto& s = stream();

  std::vector<array> copies;

  for (auto& out : outputs) {
    if (init_value_) {
      copies.emplace_back(init_value_.value(), out.dtype());
      fill_gpu(copies.back(), out, s);
    } else {
      out.set_data(allocator::malloc(out.nbytes()));
    }
  }

  auto check_input = [&copies, &s, this](const array& x) -> const array {
    bool no_copy = x.flags().row_contiguous;
    if (!ensure_row_contiguous_ || no_copy) {
      return x;
    } else {
      copies.push_back(array(x.shape(), x.dtype(), nullptr, {}));
      copy_gpu(x, copies.back(), CopyType::General, s);
      return copies.back();
    }
  };
  std::vector<array> checked_inputs;
  for (const array& in : inputs) {
    checked_inputs.push_back(check_input(in));
  }

  auto& d = metal::device(s.device);

  {
    // Clear kernels from the device library cache if needed
    auto& kernel_cache = cache();
    if (auto it = kernel_cache.libraries.find(name_);
        it != kernel_cache.libraries.end()) {
      if (it->second != source_) {
        auto& d = metal::device(s.device);
        d.clear_library(name_);
        it->second = source_;
      }
    } else {
      kernel_cache.libraries.emplace(name_, source_);
    }
  }

  auto lib = d.get_library(name_, [this] { return metal::utils() + source_; });
  auto kernel = d.get_kernel(name_, lib);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
  int index = 0;
  for (int i = 0; i < checked_inputs.size(); i++) {
    const array& in = checked_inputs[i];
    auto& shape_info = shape_infos_[i];
    compute_encoder.set_input_array(in, index);
    index++;
    if (in.ndim() > 0) {
      int ndim = in.ndim();
      if (std::get<0>(shape_info)) {
        compute_encoder.set_vector_bytes(in.shape(), ndim, index);
        index++;
      }
      if (std::get<1>(shape_info)) {
        compute_encoder.set_vector_bytes(in.strides(), ndim, index);
        index++;
      }
      if (std::get<2>(shape_info)) {
        compute_encoder.set_bytes(ndim, index);
        index++;
      }
    }
  }
  for (auto& out : outputs) {
    compute_encoder.set_output_array(out, index);
    index++;
  }

  const auto [tx, ty, tz] = threadgroup_;
  auto tg_size = tx * ty * tz;
  auto max_tg_size = kernel->maxTotalThreadsPerThreadgroup();
  if (tg_size > max_tg_size) {
    std::ostringstream msg;
    msg << "Thread group size (" << tg_size << ") is greater than "
        << " the maximum allowed threads per threadgroup (" << max_tg_size
        << ").";
    throw std::invalid_argument(msg.str());
  }

  const auto [gx, gy, gz] = grid_;
  MTL::Size group_dims =
      MTL::Size(std::min(tx, gx), std::min(ty, gy), std::min(tz, gz));
  MTL::Size grid_dims = MTL::Size(gx, gy, gz);
  compute_encoder.dispatch_threads(grid_dims, group_dims);

  d.add_temporaries(std::move(copies), s.index);
}

} // namespace mlx::core::fast


================================================
FILE: mlx/backend/metal/device.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cstdlib>
#include <sstream>

#define NS_PRIVATE_IMPLEMENTATION
#define CA_PRIVATE_IMPLEMENTATION
#define MTL_PRIVATE_IMPLEMENTATION

#include "mlx/backend/common/utils.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/utils.h"

namespace std {

// Required for putting the pointer in unordered_set.
// Hashes an NS::SharedPtr by the address of the object it holds.
template <class T>
struct hash<NS::SharedPtr<T>> {
  size_t operator()(const NS::SharedPtr<T>& ptr) const {
    std::hash<T*> raw_pointer_hash;
    return raw_pointer_hash(ptr.get());
  }
};

} // namespace std

namespace mlx::core::metal {

namespace {

constexpr const char* default_mtllib_path = METAL_PATH;

// Returns the Metal language version to compile with, chosen from the OS
// version the process is running on. The check runs once and is cached.
auto get_metal_version() {
  static auto cached_version = []() {
    if (__builtin_available(macOS 26, iOS 26, tvOS 26, visionOS 26, *)) {
      return MTL::LanguageVersion4_0;
    }
    if (__builtin_available(macOS 15, iOS 18, tvOS 18, visionOS 2, *)) {
      return MTL::LanguageVersion3_2;
    }
    return MTL::LanguageVersion3_1;
  }();
  return cached_version;
}

auto load_device() {
  auto devices = MTL::CopyAllDevices();
  auto device = static_cast<MTL::Device*>(devices->object(0))
      ?: MTL::CreateSystemDefaultDevice();
  if (!device) {
    throw std::runtime_error("Failed to load device");
  }
  return device;
}
// Loads the metallib located at `path` on `device`.
// Returns {library, error}: `library` is null on failure. `error` is whatever
// Metal reported, and is null when Metal did not write an error (e.g. on
// success), so callers can test it safely.
std::pair<MTL::Library*, NS::Error*> load_library_from_path(
    MTL::Device* device,
    const char* path) {
  auto library = NS::String::string(path, NS::UTF8StringEncoding);
  // Must start out null: the out-parameter is only guaranteed to be written
  // on failure, and callers inspect it afterwards.
  NS::Error* error = nullptr;
  auto lib = device->newLibrary(library, &error);

  return std::make_pair(lib, error);
}

#ifdef SWIFTPM_BUNDLE
// Tries to load `<lib_name>.metallib` from the resources of the bundle
// `<url>/<SWIFTPM_BUNDLE>.bundle`. Returns the library, or nullptr when the
// bundle, its resource URL, or the metallib cannot be found.
MTL::Library* try_load_bundle(
    MTL::Device* device,
    NS::URL* url,
    const std::string& lib_name) {
  std::string bundle_path = std::string(url->fileSystemRepresentation()) + "/" +
      SWIFTPM_BUNDLE + ".bundle";
  // alloc/init returns an owned object; TransferPtr releases it on every
  // return path so the bundle handle is not leaked.
  auto bundle = NS::TransferPtr(NS::Bundle::alloc()->init(
      NS::String::string(bundle_path.c_str(), NS::UTF8StringEncoding)));
  if (!bundle) {
    return nullptr;
  }

  // A bundle without a resource URL has nothing to load from; bail out rather
  // than building a std::string from a null C string.
  auto resource_url = bundle->resourceURL();
  if (resource_url == nullptr) {
    return nullptr;
  }

  std::string resource_path =
      std::string(resource_url->fileSystemRepresentation()) + "/" + lib_name +
      ".metallib";
  auto [lib, error] = load_library_from_path(device, resource_path.c_str());
  if (lib) {
    return lib;
  }
  return nullptr;
}

// Tries to load `<url>/<lib_name>.metallib`. Returns the library, or nullptr
// when it could not be loaded.
MTL::Library* try_load_framework(
    MTL::Device* device,
    NS::URL* url,
    const std::string& lib_name) {
  const std::string base_dir(url->fileSystemRepresentation());
  const std::string metallib_path = base_dir + "/" + lib_name + ".metallib";
  // The loader already yields a null library on failure, which is exactly
  // what this function reports.
  return load_library_from_path(device, metallib_path.c_str()).first;
}
#endif

// Look for the metallib relative to the directory of this binary.
// `relative_path` may omit the extension, in which case ".metallib" is used.
// Returns the {library, error} pair produced by load_library_from_path.
std::pair<MTL::Library*, NS::Error*> load_colocated_library(
    MTL::Device* device,
    const std::string& relative_path) {
  auto candidate = current_binary_dir() / relative_path;
  if (!candidate.has_extension()) {
    candidate.replace_extension(".metallib");
  }
  const auto* candidate_cstr = candidate.c_str();
  return load_library_from_path(device, candidate_cstr);
}

// Searches SwiftPM bundle locations for `<lib_name>.metallib`.
// Returns {library, nullptr} on success and {nullptr, nullptr} otherwise
// (always the latter when SWIFTPM_BUNDLE is not defined).
std::pair<MTL::Library*, NS::Error*> load_swiftpm_library(
    MTL::Device* device,
    const std::string& lib_name) {
#ifdef SWIFTPM_BUNDLE
  // 1. The SwiftPM bundle located under the main bundle's URL.
  if (MTL::Library* from_main = try_load_bundle(
          device, NS::Bundle::mainBundle()->bundleURL(), lib_name);
      from_main != nullptr) {
    return {from_main, nullptr};
  }

  // 2. The SwiftPM bundle located under the resources of any known bundle.
  auto all_bundles = NS::Bundle::allBundles();
  const int bundle_count = (int)all_bundles->count();
  for (int idx = 0; idx < bundle_count; idx++) {
    auto candidate = reinterpret_cast<NS::Bundle*>(all_bundles->object(idx));
    MTL::Library* found =
        try_load_bundle(device, candidate->resourceURL(), lib_name);
    if (found != nullptr) {
      return {found, nullptr};
    }
  }

  // 3. If SWIFTPM_BUNDLE is a framework identifier, load from that framework.
  auto all_frameworks = NS::Bundle::allFrameworks();
  const int framework_count = (int)all_frameworks->count();
  for (int idx = 0; idx < framework_count; idx++) {
    const auto framework =
        reinterpret_cast<NS::Bundle*>(all_frameworks->object(idx));
    const auto identifier = framework->bundleIdentifier();
    if (identifier == nullptr ||
        strcmp(identifier->utf8String(), SWIFTPM_BUNDLE) != 0) {
      continue;
    }
    MTL::Library* found =
        try_load_framework(device, framework->resourceURL(), lib_name);
    if (found != nullptr) {
      return {found, nullptr};
    }
  }
#endif
  return {nullptr, nullptr};
}

// Loads the default metallib, trying in order:
//   1. mlx.metallib next to the binary,
//   2. Resources/mlx.metallib next to the binary,
//   3. default.metallib from a SwiftPM bundle,
//   4. Resources/default.metallib next to the binary,
//   5. default_mtllib_path.
// Throws std::runtime_error listing the collected errors if all attempts fail.
MTL::Library* load_default_library(MTL::Device* device) {
  constexpr int num_attempts = 5;
  NS::Error* errors[num_attempts];
  MTL::Library* library;

  // First try the colocated mlx.metallib
  std::tie(library, errors[0]) = load_colocated_library(device, "mlx");
  if (library) {
    return library;
  }

  std::tie(library, errors[1]) =
      load_colocated_library(device, "Resources/mlx");
  if (library) {
    return library;
  }

  // Then try default.metallib in a SwiftPM bundle if we have one
  std::tie(library, errors[2]) = load_swiftpm_library(device, "default");
  if (library) {
    return library;
  }

  // Try to load resources from Framework resources if SwiftPM wrapped as a
  // dynamic framework.
  std::tie(library, errors[3]) =
      load_colocated_library(device, "Resources/default");
  if (library) {
    return library;
  }

  // Finally try default_mtllib_path
  std::tie(library, errors[4]) =
      load_library_from_path(device, default_mtllib_path);
  if (library) {
    return library;
  }

  // Every location failed: report all the errors that were collected.
  std::ostringstream msg;
  msg << "Failed to load the default metallib. ";
  for (NS::Error* err : errors) {
    if (err != nullptr) {
      msg << err->localizedDescription()->utf8String() << " ";
    }
  }
  throw std::runtime_error(msg.str());
}

// Loads the metallib called `lib_name`.
//  - If `lib_path` ends in ".metallib" it is loaded directly.
//  - Otherwise, if `lib_path` is non-empty, `<lib_path>/<lib_name>.metallib`
//    is loaded.
//  - Otherwise the colocated library and then the SwiftPM locations are tried.
// Throws std::runtime_error when the library cannot be loaded.
MTL::Library* load_library(
    MTL::Device* device,
    const std::string& lib_name,
    const std::string& lib_path) {
  // Both explicit-path cases fail the same way, so share the load-or-throw.
  auto load_or_throw = [device](const std::string& file) -> MTL::Library* {
    auto [lib, error] = load_library_from_path(device, file.c_str());
    if (lib) {
      return lib;
    }
    std::ostringstream msg;
    msg << "Failed to load the metallib from <" << file << "> with error "
        << error->localizedDescription()->utf8String();
    throw std::runtime_error(msg.str());
  };

  // We have been given a path that ends in metallib so try to load it
  constexpr size_t ext_len = 9; // strlen(".metallib")
  if (lib_path.size() > ext_len &&
      lib_path.compare(lib_path.size() - ext_len, ext_len, ".metallib") == 0) {
    return load_or_throw(lib_path);
  }

  // We have been given a path so try to load from lib_path / lib_name.metallib
  if (!lib_path.empty()) {
    return load_or_throw(lib_path + "/" + lib_name + ".metallib");
  }

  // Try to load the colocated library
  if (auto colocated = load_colocated_library(device, lib_name).first) {
    return colocated;
  }

  // Try to load the library from swiftpm
  if (auto from_swiftpm = load_swiftpm_library(device, lib_name).first) {
    return from_swiftpm;
  }

  std::ostringstream msg;
  msg << "Failed to load the metallib " << lib_name << ".metallib. "
      << "We attempted to load it from <" << current_binary_dir() << "/"
      << lib_name << ".metallib>";
#ifdef SWIFTPM_BUNDLE
  msg << " and from the Swift PM bundle.";
#endif
  throw std::runtime_error(msg.str());
}

} // namespace

// Creates the command queue for this encoder, attaches `residency_set` to it
// when one is given, labels the queue with `index`, and opens the first
// command buffer. Throws std::runtime_error if the queue cannot be created.
CommandEncoder::CommandEncoder(
    Device& d,
    int index,
    const MTL::ResidencySet* residency_set)
    : device_(d) {
  auto pool = new_scoped_memory_pool();

  queue_ = NS::TransferPtr(device_.mtl_device()->newCommandQueue());
  if (queue_.get() == nullptr) {
    throw std::runtime_error(
        "[metal::CommandEncoder] Failed to make new command queue.");
  }

  if (residency_set != nullptr) {
    queue_->addResidencySet(residency_set);
  }
  debug_set_stream_queue_label(queue_.get(), index);

  buffer_ = NS::RetainPtr(queue_->commandBufferWithUnretainedReferences());
}

// Binds a raw Metal buffer at argument index `idx` with the given byte offset.
void CommandEncoder::set_buffer(
    const MTL::Buffer* buf,
    int idx,
    int64_t offset /* = 0 */) {
  // The buffer is tracked as both an input and an output so that command
  // buffers touching it are synchronized with each other.
  const void* key = buf;
  all_inputs_.insert(key);
  all_outputs_.insert(key);
  get_command_encoder()->setBuffer(buf, offset, idx);
}

void CommandEncoder::set_input_array(
    const array& a,
    int idx,
    int64_t offset /* = 0 */) {
  if (all_inputs_.insert(a.buffer().ptr()).second) {
    buffer_sizes_ += a.data_size();
  }
  auto r_buf = static_cast<MTL::Resource*>(const_cast<void*>(a.buffer().ptr()));
  needs_barrier_ =
      needs_barrier_ | (prev_outputs_.find(r_buf) != prev_outputs_.end());
  auto a_buf = static_cast<const MTL::Buffer*>(a.buffer().ptr());
  get_command_encoder()->setBuffer(a_buf, a.offset() + offset, idx);
}

// Binds array `a` as a kernel output at argument index `idx`.
void CommandEncoder::set_output_array(
    array& a,
    int idx,
    int64_t offset /* = 0 */) {
  // Bind it like an input first: that performs the barrier check against
  // earlier outputs before `a` itself joins the output set.
  const array& as_input = a;
  set_input_array(as_input, idx, offset);
  register_output_array(as_input);
}

void CommandEncoder::register_output_array(const array& a) {
  all_outputs_.insert(a.buffer().ptr());

  auto buf = static_cast<MTL::Resource*>(const_cast<void*>(a.buffer().ptr()));
  if (concurrent_) {
    concurrent_outputs_.insert(buf);
  } else {
    next_outputs_.insert(buf);
  }
}

// Takes ownership of `arr` and stores it with this encoder's temporaries.
void CommandEncoder::add_temporary(array arr) {
  temporaries_.emplace_back(std::move(arr));
}

void CommandEncoder::add_temporaries(std::vector<array> arrays) {
  temporaries_.insert(
      temporaries_.end(),
      std::make_move_iterator(arrays.begin()),
      std::make_move_iterator(arrays.end()));
}

// Inserts a buffer memory barrier if a bound input was written by an earlier
// dispatch, then folds the pending outputs into the set of previous outputs.
void CommandEncoder::maybeInsertBarrier() {
  if (!needs_barrier_) {
    // No barrier: earlier outputs stay relevant, so accumulate the new ones.
    prev_outputs_.insert(next_outputs_.begin(), next_outputs_.end());
  } else {
    get_command_encoder()->memoryBarrier(MTL::BarrierScopeBuffers);
    needs_barrier_ = false;
    // After the barrier only the newest outputs need to be tracked.
    prev_outputs_ = std::move(next_outputs_);
  }
  next_outputs_.clear();
}

// Dispatches the bound kernel via dispatchThreadgroups(grid_dims, group_dims),
// inserting a barrier first when one is pending.
void CommandEncoder::dispatch_threadgroups(
    MTL::Size grid_dims,
    MTL::Size group_dims) {
  maybeInsertBarrier();
  ++buffer_ops_; // counts towards the per-buffer op limit in needs_commit()
  auto* enc = get_command_encoder();
  enc->dispatchThreadgroups(grid_dims, group_dims);
}

// Dispatches the bound kernel via dispatchThreads(grid_dims, group_dims),
// inserting a barrier first when one is pending.
void CommandEncoder::dispatch_threads(
    MTL::Size grid_dims,
    MTL::Size group_dims) {
  maybeInsertBarrier();
  ++buffer_ops_; // counts towards the per-buffer op limit in needs_commit()
  auto* enc = get_command_encoder();
  enc->dispatchThreads(grid_dims, group_dims);
}

// Unconditionally inserts a buffer-scope memory barrier.
void CommandEncoder::barrier() {
  auto* enc = get_command_encoder();
  enc->memoryBarrier(MTL::BarrierScopeBuffers);
}

// Ends the current compute command encoder, if there is one, and sets up
// fence-based synchronization with other command encoders. Does nothing when
// no encoder is open.
void CommandEncoder::end_encoding() {
  // Each command encoder has a unique fence. We also store a map of
  // all previous outputs of command encoders to their corresponding fence.
  // - The command encoder records its inputs and outputs.
  // - Wait on a fence if any inputs in the encoder are outputs of a previous
  //   encoder.
  // - Update the map of outputs to include this command encoder's outputs.
  // - Always signal this command encoders fence.
  // - Add a completion handler for this command encoder that removes outputs
  //   from the map to limit the growth of the map and avoid unnecessary waits
  // - Temporaries are a special case as they do not cross command encoder
  //   boundaries. These can be removed early from the encoders inputs and
  //   outputs since they don't need synchronization.
  if (!encoder_) {
    return;
  }

  // Remove temporaries from inputs and outputs.
  for (auto& t : temporaries_) {
    all_outputs_.erase(t.buffer().ptr());
    all_inputs_.erase(t.buffer().ptr());
  }

  // Keep references to the fences we waited on and put them in the completion
  // handler so they are not prematurely released.
  std::unordered_set<NS::SharedPtr<MTL::Fence>> waiting_on;
  {
    // prev_ce_outputs_ is also modified by the completion handler below, so
    // every access goes through outputs_mtx_.
    std::lock_guard lk(outputs_mtx_);
    for (auto& in : all_inputs_) {
      if (auto it = prev_ce_outputs_.find(in); it != prev_ce_outputs_.end()) {
        // If we've already waited on a fence, don't wait on it again.
        if (waiting_on.find(it->second) == waiting_on.end()) {
          encoder_->waitForFence(it->second.get());
          waiting_on.insert(it->second);
        }
      }
    }
    // Later encoders that read these outputs will wait on this fence.
    for (auto& out : all_outputs_) {
      prev_ce_outputs_[out] = fence_;
    }
  }

  encoder_->updateFence(fence_.get());
  // The handler takes ownership of the fence, the temporaries, the output set
  // and the waited-on fences, keeping them alive until the command buffer
  // completes.
  // NOTE(review): temporaries_ and all_outputs_ are moved-from here and are
  // not cleared below (only all_inputs_ is) — confirm they are relied on to
  // be empty afterwards.
  buffer_->addCompletedHandler([this,
                                fence = std::move(fence_),
                                temporaries = std::move(temporaries_),
                                all_outputs = std::move(all_outputs_),
                                waiting_on = std::move(waiting_on)](
                                   MTL::CommandBuffer*) mutable {
    std::lock_guard lk(outputs_mtx_);
    for (auto& o : all_outputs) {
      if (auto it = prev_ce_outputs_.find(o); it != prev_ce_outputs_.end()) {
        // Only drop the entry if it still refers to this encoder's fence; a
        // later encoder may have re-registered the same output.
        if (it->second == fence) {
          prev_ce_outputs_.erase(it);
        }
      }
    }
  });

  // Close the encoder and reset the per-encoder tracking state.
  encoder_->endEncoding();
  encoder_.reset();
  needs_barrier_ = false;
  concurrent_ = false;
  prev_outputs_.clear();
  next_outputs_.clear();
  concurrent_outputs_.clear();
  all_inputs_.clear();
}

// True once the current command buffer exceeds the device's limit on either
// the number of dispatched ops or the megabytes of input data.
bool CommandEncoder::needs_commit() const {
  const auto limits = device_.get_max_ops_mb_per_buffer();
  const bool too_many_ops = buffer_ops_ > std::get<0>(limits);
  // buffer_sizes_ is in bytes; shifting by 20 converts it to MiB.
  const bool too_much_data = (buffer_sizes_ >> 20) > std::get<1>(limits);
  return too_many_ops || too_much_data;
}

// Commits the current command buffer and starts a fresh one with its op and
// size counters reset.
void CommandEncoder::commit() {
  buffer_->commit();

  buffer_ops_ = 0;
  buffer_sizes_ = 0;
  buffer_ = NS::RetainPtr(queue_->commandBufferWithUnretainedReferences());
}

// Returns the open compute command encoder, creating it (with concurrent
// dispatch) together with a new fence when none is open.
MTL::ComputeCommandEncoder* CommandEncoder::get_command_encoder() {
  if (encoder_) {
    return encoder_.get();
  }

  encoder_ = NS::RetainPtr(
      buffer_->computeCommandEncoder(MTL::DispatchTypeConcurrent));
  fence_ = NS::TransferPtr(device_.mtl_device()->newFence());
  return encoder_.get();
}

// Loads the Metal device and default library, determines the GPU
// architecture string and generation, and picks the per-command-buffer
// limits (which the environment may override).
Device::Device() {
  auto pool = new_scoped_memory_pool();
  device_ = load_device();
  default_library_ = load_default_library(device_);

  // An architecture supplied through the environment wins over the one
  // reported by the device.
  arch_ = env::metal_gpu_arch();
  if (arch_.empty()) {
    arch_ = std::string(device_->architecture()->name()->utf8String());
  }

  // The two characters before the last one are read as a two-digit
  // generation number; any non-digit character counts as 0.
  auto digit_or_zero = [](char c) {
    const int d = c - '0';
    return (d >= 0 && d < 10) ? d : 0;
  };
  int ag_tens = 0;
  int ag_ones = 0;
  if (arch_.size() >= 3) {
    ag_tens = digit_or_zero(arch_[arch_.size() - 3]);
    ag_ones = digit_or_zero(arch_[arch_.size() - 2]);
  }
  arch_gen_ = ag_tens * 10 + ag_ones;

  // The last character selects the device class and with it the limits.
  switch (arch_.back()) {
    case 'p': // phone
      max_ops_per_buffer_ = 20;
      max_mb_per_buffer_ = 40;
      break;
    case 's': // max
    case 'd': // ultra
      max_ops_per_buffer_ = 50;
      max_mb_per_buffer_ = 50;
      break;
    case 'g': // base, pro
    default: // default to medium
      max_ops_per_buffer_ = 40;
      max_mb_per_buffer_ = 40;
      break;
  }

  max_ops_per_buffer_ = env::max_ops_per_buffer(max_ops_per_buffer_);
  max_mb_per_buffer_ = env::max_mb_per_buffer(max_mb_per_buffer_);
}

// Releases every library that has a kernel cache together with its cached
// kernels, destroys the command encoders, then releases the device.
Device::~Device() {
  auto pool = new_scoped_memory_pool();
  for (auto& entry : library_kernels_) {
    entry.first->release();
    for (auto& named_kernel : entry.second) {
      named_kernel.second->release();
    }
  }
  encoders_.clear();
  device_->release();
}

// Whether the command buffer of the encoder at `index` should be committed.
bool Device::command_buffer_needs_commit(int index) {
  auto& encoder = get_command_encoder(index);
  return encoder.needs_commit();
}

// Returns the current command buffer of the encoder at `index`.
MTL::CommandBuffer* Device::get_command_buffer(int index) {
  auto& encoder = get_command_encoder(index);
  return encoder.get_command_buffer();
}

// Commits the current command buffer of the encoder at `index`.
void Device::commit_command_buffer(int index) {
  auto& encoder = get_command_encoder(index);
  encoder.commit();
}

// Hands `arr` over to the encoder at `index` as a temporary.
void Device::add_temporary(array arr, int index) {
  auto& encoder = get_command_encoder(index);
  encoder.add_temporary(std::move(arr));
}

// Hands all of `arrays` over to the encoder at `index` as temporaries.
void Device::add_temporaries(std::vector<array> arrays, int index) {
  auto& encoder = get_command_encoder(index);
  encoder.add_temporaries(std::move(arrays));
}

// Ends the open compute encoder of the encoder at `index`.
void Device::end_encoding(int index) {
  auto& encoder = get_command_encoder(index);
  encoder.end_encoding();
}

// Returns the command encoder registered under `index`, constructing it in
// place (with the current residency set) on first use.
CommandEncoder& Device::get_command_encoder(int index) {
  if (auto found = encoders_.find(index); found != encoders_.end()) {
    return found->second;
  }
  auto inserted = encoders_.try_emplace(index, *this, index, residency_set_);
  return inserted.first->second;
}

// Returns the library cached under `name`, loading it via load_library (using
// `path`) and caching it if it is not known yet.
MTL::Library* Device::get_library(
    const std::string& name,
    const std::string& path /* = "" */) {
  // Fast path: look it up under the shared lock.
  {
    std::shared_lock rlock(library_mtx_);
    const auto hit = library_map_.find(name);
    if (hit != library_map_.end()) {
      return hit->second;
    }
  }

  // Slow path: take the exclusive lock and check again, since another thread
  // may have loaded the library in between.
  std::unique_lock wlock(library_mtx_);
  const auto hit = library_map_.find(name);
  if (hit != library_map_.end()) {
    return hit->second;
  }

  MTL::Library* loaded = load_library(device_, name, path.c_str());
  library_map_.insert({name, loaded});
  return loaded;
}

// Compiles `source_string` into a Metal library with fast math disabled and
// the language version from get_metal_version().
// Throws std::runtime_error (including the compiler message when available)
// if compilation fails.
MTL::Library* Device::build_library_(const std::string& source_string) {
  auto pool = new_scoped_memory_pool();

  auto source =
      NS::String::string(source_string.c_str(), NS::ASCIIStringEncoding);

  auto compile_options = MTL::CompileOptions::alloc()->init();
  compile_options->setFastMathEnabled(false);
  compile_options->setLanguageVersion(get_metal_version());
#ifndef NDEBUG
  // Debug builds turn on logging where the language version is 3.2 or newer.
  if (compile_options->languageVersion() >= MTL::LanguageVersion3_2) {
    compile_options->setEnableLogging(true);
  }
#endif

  NS::Error* error = nullptr;
  auto compiled = device_->newLibrary(source, compile_options, &error);
  compile_options->release();

  if (compiled) {
    return compiled;
  }

  // Throw error if unable to compile library
  std::ostringstream msg;
  msg << "[metal::Device] Unable to build metal library from source\n";
  if (error) {
    msg << error->localizedDescription()->utf8String() << "\n";
  }
  throw std::runtime_error(msg.str());
}

// Looks up the function called `name` in `mtl_lib` and returns whatever the
// library's newFunction() yields.
MTL::Function* Device::get_function_(
    const std::string& name,
    MTL::Library* mtl_lib) {
  const char* raw_name = name.c_str();
  return mtl_lib->newFunction(
      NS::String::string(raw_name, NS::ASCIIStringEncoding));
}

// Returns the function `name` from `mtl_lib`, specialized with `func_consts`
// and registered under `specialized_name`. When there are no function
// constants and both names match, this is a plain lookup.
// Throws std::runtime_error if the specialized function cannot be created.
MTL::Function* Device::get_function_(
    const std::string& name,
    const std::string& specialized_name,
    const MTLFCList& func_consts,
    MTL::Library* mtl_lib) {
  if (func_consts.empty() && (specialized_name == name)) {
    return get_function_(name, mtl_lib);
  }

  // Prepare function constants
  auto mtl_func_consts = MTL::FunctionConstantValues::alloc()->init();

  for (auto [value, type, index] : func_consts) {
    mtl_func_consts->setConstantValue(value, type, index);
  }

  // Prepare function desc
  auto desc = MTL::FunctionDescriptor::functionDescriptor();
  desc->setName(NS::String::string(name.c_str(), NS::ASCIIStringEncoding));
  desc->setSpecializedName(
      NS::String::string(specialized_name.c_str(), NS::ASCIIStringEncoding));
  desc->setConstantValues(mtl_func_consts);

  // Pull kernel from library
  NS::Error* error = nullptr;
  auto mtl_function = mtl_lib->newFunction(desc, &error);

  // The constant values are no longer needed once the function has been
  // requested. Release them before the error check so the failure path does
  // not leak them.
  mtl_func_consts->release();

  // Throw error if unable to build metal function
  if (!mtl_function) {
    std::ostringstream msg;
    msg << "[metal::Device] Unable to load function " << name << "\n";
    if (error) {
      msg << error->localizedDescription()->utf8String() << "\n";
    }
    throw std::runtime_error(msg.str());
  }

  return mtl_function;
}

// Builds a compute pipeline state from `mtl_function`.
// Throws std::runtime_error if `mtl_function` is null or the pipeline cannot
// be created; `name` is only used in the error message.
MTL::ComputePipelineState* Device::get_kernel_(
    const std::string& name,
    const MTL::Function* mtl_function) {
  NS::Error* error = nullptr;

  // Compile kernel to compute pipeline (only possible with a function).
  MTL::ComputePipelineState* pipeline = mtl_function
      ? device_->newComputePipelineState(mtl_function, &error)
      : nullptr;
  if (pipeline) {
    return pipeline;
  }

  // Either there was no function or the pipeline failed to compile.
  std::ostringstream msg;
  msg << "[metal::Device] Unable to load kernel " << name << "\n";
  if (error) {
    msg << error->localizedDescription()->utf8String() << "\n";
  }
  throw std::runtime_error(msg.str());
}

// Builds a compute pipeline state from `mtl_function` with
// `linked_functions` attached. Without linked functions this defers to the
// two-argument overload.
// Throws std::runtime_error if `mtl_function` is null or the pipeline cannot
// be created; `name` is only used in the error message.
MTL::ComputePipelineState* Device::get_kernel_(
    const std::string& name,
    const MTL::Function* mtl_function,
    const MTL::LinkedFunctions* linked_functions) {
  // Check inputs
  if (!linked_functions) {
    return get_kernel_(name, mtl_function);
  }

  if (!mtl_function) {
    std::ostringstream msg;
    msg << "[metal::Device] Unable to load kernel " << name << "\n";
    throw std::runtime_error(msg.str());
  }

  // Prepare compute pipeline state descriptor
  auto desc = MTL::ComputePipelineDescriptor::alloc()->init();
  desc->setComputeFunction(mtl_function);
  desc->setLinkedFunctions(linked_functions);

  // Compile kernel to compute pipeline
  NS::Error* error = nullptr;
  auto kernel = device_->newComputePipelineState(
      desc, MTL::PipelineOptionNone, nullptr, &error);

  // The descriptor came from alloc/init, so this function owns it. It is no
  // longer needed once the pipeline has been requested; release it before the
  // error check so neither path leaks it.
  desc->release();

  // Throw error if unable to compile metal function
  if (!kernel) {
    std::ostringstream msg;
    msg << "[metal::Device] Unable to load kernel " << name << "\n";
    if (error) {
      msg << error->localizedDescription()->utf8String() << "\n";
    }
    throw std::runtime_error(msg.str());
  }

  return kernel;
}

// Returns the library cached under `name`, or nullptr if there is none.
MTL::Library* Device::get_library_(const std::string& name) {
  std::shared_lock lock(library_mtx_);
  if (auto hit = library_map_.find(name); hit != library_map_.end()) {
    return hit->second;
  }
  return nullptr;
}

// Returns the library cached under `name`. If it is not known yet, `builder`
// is called to produce the source, which is compiled and cached.
MTL::Library* Device::get_library(
    const std::string& name,
    const std::function<std::string(void)>& builder) {
  // Fast path: look it up under the shared lock.
  {
    std::shared_lock rlock(library_mtx_);
    const auto hit = library_map_.find(name);
    if (hit != library_map_.end()) {
      return hit->second;
    }
  }

  // Slow path: take the exclusive lock and check again, since another thread
  // may have built the library in between.
  std::unique_lock wlock(library_mtx_);
  const auto hit = library_map_.find(name);
  if (hit != library_map_.end()) {
    return hit->second;
  }

  MTL::Library* built = build_library_(builder());
  library_map_.insert({name, built});
  return built;
}

// Removes the library cached under `name`, releasing it and every kernel
// that was built from it. Does nothing if no such library is cached.
void Device::clear_library(const std::string& name) {
  std::unique_lock wlock(library_mtx_);
  auto it = library_map_.find(name);
  if (it == library_map_.end()) {
    return;
  }

  // A library only has a kernel cache once a kernel has been requested from
  // it, so the lookup may legitimately come back empty.
  if (auto kernel_map_it = library_kernels_.find(it->second);
      kernel_map_it != library_kernels_.end()) {
    for (auto& [_, kernel] : kernel_map_it->second) {
      kernel->release();
    }
    library_kernels_.erase(kernel_map_it);
  }

  it->second->release();
  library_map_.erase(it);
}

// Wraps `funcs` in an MTL::LinkedFunctions object as its private functions.
// Returns nullptr when `funcs` is empty.
MTL::LinkedFunctions* Device::get_linked_functions_(
    const std::vector<MTL::Function*>& funcs) {
  if (funcs.empty()) {
    return nullptr;
  }

  // NS::Array is built from generic object pointers, so convert each function
  // pointer to its NS::Object base first.
  std::vector<NS::Object*> as_objects(funcs.begin(), funcs.end());
  NS::Array* function_array =
      NS::Array::array(as_objects.data(), funcs.size());

  auto linked = MTL::LinkedFunctions::linkedFunctions();
  linked->setPrivateFunctions(function_array);
  return linked;
}

// Builds the kernel `hash_name` from function `base_name` in `mtl_lib`
// (specialized with `func_consts`, linked with `linked_functions`) and stores
// it in the per-library kernel cache. If the kernel is already cached it is
// returned as is. Takes kernel_mtx_ exclusively for the whole call.
// Throws std::runtime_error if the function or pipeline cannot be created.
MTL::ComputePipelineState* Device::get_kernel_(
    const std::string& base_name,
    MTL::Library* mtl_lib,
    const std::string& hash_name,
    const MTLFCList& func_consts /* = {} */,
    const std::vector<MTL::Function*>& linked_functions /* = {} */) {
  // Single writer allowed
  std::unique_lock wlock(kernel_mtx_);

  // Try loading again to avoid loading twice
  auto& kernel_map_ = library_kernels_[mtl_lib];
  if (auto it = kernel_map_.find(hash_name); it != kernel_map_.end()) {
    return it->second;
  }

  auto pool = new_scoped_memory_pool();

  // Pull kernel from library. newFunction() returns an owned object, so hold
  // it in a TransferPtr: it is released on every exit path, including when
  // building the pipeline throws.
  auto mtl_function =
      NS::TransferPtr(get_function_(base_name, hash_name, func_consts, mtl_lib));

  // Compile kernel to compute pipeline.
  // The linked-functions object is null when no functions are supplied, and
  // otherwise comes from an autoreleasing factory that the pool above cleans
  // up, so it must not be released here.
  auto mtl_linked_funcs = get_linked_functions_(linked_functions);
  auto kernel = get_kernel_(hash_name, mtl_function.get(), mtl_linked_funcs);

  // Add kernel to cache
  kernel_map_.insert({hash_name, kernel});

  return kernel;
}

// Returns the kernel built from function `base_name` of `mtl_lib`. The kernel
// is cached under `hash_name`, or under `base_name` when `hash_name` is
// empty, and is built on first request.
MTL::ComputePipelineState* Device::get_kernel(
    const std::string& base_name,
    MTL::Library* mtl_lib,
    const std::string& hash_name /* = "" */,
    const MTLFCList& func_consts /* = {} */,
    const std::vector<MTL::Function*>& linked_functions /* = {} */) {
  const auto& kname = hash_name.empty() ? base_name : hash_name;
  {
    // Multiple readers allowed
    std::shared_lock lock(kernel_mtx_);

    // Look for cached kernel. Use find() on the outer map as well:
    // operator[] would insert a new entry for an unseen library, which is a
    // write and must not happen while only the shared lock is held.
    if (auto lib_it = library_kernels_.find(mtl_lib);
        lib_it != library_kernels_.end()) {
      const auto& kernel_map = lib_it->second;
      if (auto it = kernel_map.find(kname); it != kernel_map.end()) {
        return it->second;
      }
    }
  }
  return get_kernel_(base_name, mtl_lib, kname, func_consts, linked_functions);
}

// Same as the overload taking a library, using the default library.
MTL::ComputePipelineState* Device::get_kernel(
    const std::string& base_name,
    const std::string& hash_name /*  = "" */,
    const MTLFCList& func_consts /*  = {} */,
    const std::vector<MTL::Function*>& linked_functions /*  = {} */) {
  MTL::Library* library = default_library_;
  return get_kernel(
      base_name, library, hash_name, func_consts, linked_functions);
}

// Installs the residency set for this device and attaches it to the command
// queue of every existing encoder. Passing nullptr is a no-op.
// Throws std::runtime_error if a residency set has already been installed.
void Device::set_residency_set(const MTL::ResidencySet* residency_set) {
  // The "already set" check comes first, so it applies to null arguments too.
  if (residency_set_ != nullptr) {
    throw std::runtime_error(
        "[Device::set_residency_set] Can only be set once.");
  }
  if (residency_set != nullptr) {
    residency_set_ = residency_set;
    // Attach residency set to existing command queues
    for (auto& entry : encoders_) {
      entry.second.get_command_queue()->addResidencySet(residency_set_);
    }
  }
}

// Returns the process-wide Metal device, created on first use. The argument
// is ignored.
Device& device(mlx::core::Device) {
  // The singleton is leaked on purpose: a compute kernel may return and touch
  // the device after the main thread's teardown would otherwise have freed it.
  static Device* const instance = new Device();
  return *instance;
}

std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool() {
  auto dtor = [](void* ptr) {
    static_cast<NS::AutoreleasePool*>(ptr)->release();
  };
  return std::unique_ptr<void, std::function<void(void*)>>(
      NS::AutoreleasePool::alloc()->init(), dtor);
}

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/device.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <Metal/Metal.hpp>
#include <functional>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "mlx/array.h"
#include "mlx/device.h"

namespace mlx::core::metal {

// List of function constants used to specialize a Metal function. Each entry
// is (pointer to the value, Metal data type of the value, constant index).
using MTLFCList =
    std::vector<std::tuple<const void*, MTL::DataType, NS::UInteger>>;

class Device;

// Wraps a Metal command queue together with its current command buffer and a
// lazily created compute command encoder. It records which buffers are used
// as inputs and outputs so that memory barriers (within an encoder) and
// fences (between encoders) can be inserted where needed.
class MLX_API CommandEncoder {
 public:
  // Creates the command queue (attaching `residency_set` if non-null) and the
  // first command buffer. `index` is used to label the queue.
  CommandEncoder(Device& d, int index, const MTL::ResidencySet* residency_set);
  CommandEncoder(const CommandEncoder&) = delete;
  CommandEncoder& operator=(const CommandEncoder&) = delete;

  // Scope guard for a concurrent section: sets `concurrent_` while alive and,
  // on destruction, merges the outputs collected during the section into
  // `prev_outputs_`.
  struct ConcurrentContext {
    ConcurrentContext(CommandEncoder& enc) : enc(enc) {
      enc.concurrent_ = true;
    }
    ~ConcurrentContext() {
      enc.concurrent_ = false;
      enc.prev_outputs_.insert(
          enc.concurrent_outputs_.begin(), enc.concurrent_outputs_.end());
      enc.concurrent_outputs_.clear();
    }

   private:
    CommandEncoder& enc;
  };

  // Bind a raw buffer / an input array / an output array at argument index
  // `idx`. `offset` is a byte offset (added to the array's own offset for the
  // array variants).
  void set_buffer(const MTL::Buffer* buf, int idx, int64_t offset = 0);
  void set_input_array(const array& a, int idx, int64_t offset = 0);
  void set_output_array(array& a, int idx, int64_t offset = 0);
  // Record `a` as an output without binding it to an argument index.
  void register_output_array(const array& a);

  // Store arrays with this encoder; they are kept until the completion
  // handler installed by end_encoding() is destroyed.
  void add_temporary(array arr);
  void add_temporaries(std::vector<array> arrays);

  // Dispatch the bound kernel, inserting a barrier first if one is pending.
  void dispatch_threadgroups(MTL::Size grid_dims, MTL::Size group_dims);
  void dispatch_threads(MTL::Size grid_dims, MTL::Size group_dims);
  // Insert a buffer memory barrier if an input depends on an earlier output,
  // and update the output tracking sets.
  void maybeInsertBarrier();

  void set_compute_pipeline_state(MTL::ComputePipelineState* kernel) {
    get_command_encoder()->setComputePipelineState(kernel);
  }

  // Bind the first `nelems` elements of `vec` as inline bytes at `idx`.
  template <typename Vec, typename = std::enable_if_t<is_vector_v<Vec>>>
  void set_vector_bytes(const Vec& vec, size_t nelems, int idx) {
    get_command_encoder()->setBytes(
        vec.data(), nelems * sizeof(typename Vec::value_type), idx);
  }
  // Bind all elements of `vec` as inline bytes at `idx`.
  template <typename Vec, typename = std::enable_if_t<is_vector_v<Vec>>>
  void set_vector_bytes(const Vec& vec, int idx) {
    return set_vector_bytes(vec, vec.size(), idx);
  }

  // Bind `n` values of type T starting at `v` as inline bytes at `idx`.
  template <typename T>
  void set_bytes(const T* v, int n, int idx) {
    return get_command_encoder()->setBytes(v, n * sizeof(T), idx);
  }

  // Bind the single value `v` as inline bytes at `idx`.
  template <typename T>
  void set_bytes(const T& v, int idx) {
    return get_command_encoder()->setBytes(&v, sizeof(T), idx);
  }

  void set_threadgroup_memory_length(size_t length, int idx) {
    get_command_encoder()->setThreadgroupMemoryLength(length, idx);
  }

  // Start a concurrent section that lasts for the lifetime of the returned
  // context object.
  ConcurrentContext start_concurrent() {
    return ConcurrentContext(*this);
  }

  // Unconditionally insert a buffer memory barrier.
  void barrier();
  // End the open compute encoder (if any) and set up fence synchronization.
  void end_encoding();
  // Whether the op count or input size exceeds the device's per-buffer limits.
  bool needs_commit() const;
  // Commit the current command buffer and start a new one.
  void commit();

  MTL::CommandQueue* get_command_queue() const {
    return queue_.get();
  }
  MTL::CommandBuffer* get_command_buffer() const {
    return buffer_.get();
  }

 private:
  // Returns the open compute encoder, creating it and its fence if needed.
  MTL::ComputeCommandEncoder* get_command_encoder();

  Device& device_;

  // Buffer that stores encoded commands.
  NS::SharedPtr<MTL::CommandQueue> queue_;
  NS::SharedPtr<MTL::CommandBuffer> buffer_;
  // Dispatches and input bytes accumulated in the current command buffer;
  // both are reset by commit().
  int buffer_ops_{0};
  size_t buffer_sizes_{0};

  // Encoder for issuing GPU commands.
  // The members are used within a single ComputeCommandEncoder and will be
  // reset after calling end_encoding().
  NS::SharedPtr<MTL::ComputeCommandEncoder> encoder_;
  NS::SharedPtr<MTL::Fence> fence_;
  bool needs_barrier_{false};
  bool concurrent_{false};
  std::vector<array> temporaries_;
  // Outputs of earlier dispatches, of dispatches since the last barrier
  // check, and of the active concurrent section, respectively.
  std::unordered_set<MTL::Resource*> prev_outputs_;
  std::unordered_set<MTL::Resource*> next_outputs_;
  std::unordered_set<MTL::Resource*> concurrent_outputs_;
  // Buffer pointers of every input / output bound in the current encoder.
  std::unordered_set<const void*> all_inputs_;
  std::unordered_set<const void*> all_outputs_;

  // A map of prior command encoder outputs to their corresponding fence.
  std::unordered_map<const void*, NS::SharedPtr<MTL::Fence>> prev_ce_outputs_;
  // Guards prev_ce_outputs_, which is also touched from completion handlers.
  std::mutex outputs_mtx_;
};

// Owns the Metal device, the command encoders (one per index), and the caches
// of loaded libraries and compiled kernels.
class MLX_API Device {
 public:
  Device();
  Device(const Device&) = delete;
  Device& operator=(const Device&) = delete;
  ~Device();

  MTL::Device* mtl_device() {
    return device_;
  };

  // Architecture string (from the environment override if set, otherwise the
  // name reported by the device).
  const std::string& get_architecture() const {
    return arch_;
  }
  // Generation number parsed from the two characters preceding the last
  // character of the architecture string.
  int get_architecture_gen() const {
    return arch_gen_;
  }
  // Returns (max ops per command buffer, max MB per command buffer).
  std::tuple<int, int> get_max_ops_mb_per_buffer() const {
    return std::make_tuple(max_ops_per_buffer_, max_mb_per_buffer_);
  }

  // The functions below operate on the command encoder registered under
  // `index`, which is created on first use.
  MTL::CommandBuffer* get_command_buffer(int index);
  bool command_buffer_needs_commit(int index);
  void commit_command_buffer(int index);
  CommandEncoder& get_command_encoder(int index);
  void end_encoding(int index);

  // Returns the library cached under `name`, loading it from a metallib file
  // (using `path` if given) when it is not cached yet.
  MTL::Library* get_library(
      const std::string& name,
      const std::string& path = "");

  // Returns the library cached under `name`, compiling the source produced by
  // `builder` when it is not cached yet.
  MTL::Library* get_library(
      const std::string& name,
      const std::function<std::string(void)>& builder);

  // Releases the library cached under `name` and the kernels built from it.
  void clear_library(const std::string& name);

  // Returns the kernel for function `base_name` of `mtl_lib`, cached under
  // `hash_name` (or `base_name` when `hash_name` is empty).
  MTL::ComputePipelineState* get_kernel(
      const std::string& base_name,
      MTL::Library* mtl_lib,
      const std::string& hash_name = "",
      const MTLFCList& func_consts = {},
      const std::vector<MTL::Function*>& linked_functions = {});

  // Same as above, using the default library.
  MTL::ComputePipelineState* get_kernel(
      const std::string& base_name,
      const std::string& hash_name = "",
      const MTLFCList& func_consts = {},
      const std::vector<MTL::Function*>& linked_functions = {});

  // Record temporary arrays for the given stream index
  void add_temporary(array arr, int index);
  void add_temporaries(std::vector<array> arrays, int index);

  // Installs the residency set (allowed once) and attaches it to the queues
  // of all existing encoders.
  void set_residency_set(const MTL::ResidencySet* residency_set);

 private:
  // NOTE(review): no definition of this function is visible in device.cpp —
  // confirm whether the declaration is still needed.
  MTL::Library* get_library_cache_(const std::string& name);

  // Cache lookup only; returns nullptr when `name` is not cached.
  MTL::Library* get_library_(const std::string& name);
  // Compiles Metal source into a library; throws on failure.
  MTL::Library* build_library_(const std::string& source_string);

  MTL::Function* get_function_(const std::string& name, MTL::Library* mtl_lib);

  // Returns `name` specialized with `func_consts` under `specialized_name`.
  MTL::Function* get_function_(
      const std::string& name,
      const std::string& specialized_name,
      const MTLFCList& func_consts,
      MTL::Library* mtl_lib);

  // Returns nullptr when `funcs` is empty.
  MTL::LinkedFunctions* get_linked_functions_(
      const std::vector<MTL::Function*>& funcs);

  // Pipeline builders; `name` is used in error messages.
  MTL::ComputePipelineState* get_kernel_(
      const std::string& name,
      const MTL::Function* mtl_function);

  MTL::ComputePipelineState* get_kernel_(
      const std::string& name,
      const MTL::Function* mtl_function,
      const MTL::LinkedFunctions* linked_functions);

  // Builds the kernel under an exclusive lock and adds it to the cache.
  MTL::ComputePipelineState* get_kernel_(
      const std::string& base_name,
      MTL::Library* mtl_lib,
      const std::string& hash_name,
      const MTLFCList& func_consts = {},
      const std::vector<MTL::Function*>& linked_functions = {});

  MTL::Device* device_;
  std::unordered_map<int32_t, CommandEncoder> encoders_;

  // kernel_mtx_ is taken around accesses to library_kernels_ in the kernel
  // getters; library_mtx_ is taken around accesses to library_map_.
  std::shared_mutex kernel_mtx_;
  std::shared_mutex library_mtx_;
  std::unordered_map<std::string, MTL::Library*> library_map_;
  MTL::Library* default_library_;
  // Per-library cache of compiled kernels, keyed by kernel name.
  std::unordered_map<
      MTL::Library*,
      std::unordered_map<std::string, MTL::ComputePipelineState*>>
      library_kernels_;
  const MTL::ResidencySet* residency_set_{nullptr};
  std::string arch_;
  int arch_gen_;
  int max_ops_per_buffer_;
  int max_mb_per_buffer_;
};

// Returns the Metal device wrapper for the given MLX device.
MLX_API Device& device(mlx::core::Device);

// Returns a handle to a new memory pool; the handle's deleter cleans the pool
// up when the handle is destroyed.
std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool();

// Whether NAX can be used. Always false when MLX_METAL_NO_NAX is defined;
// otherwise it requires OS version 26.2 or newer and a GPU architecture
// generation of at least 18 for 'p' devices or 17 for all others. The result
// is computed once and cached.
inline bool is_nax_available() {
#ifdef MLX_METAL_NO_NAX
  return false;
#else
  static bool cached_result = []() {
    bool os_supported = false;
    if (__builtin_available(
            macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
      os_supported = true;
    }
    // The device is queried regardless of the OS check.
    auto& dev = metal::device(mlx::core::Device::gpu);
    const auto arch_class = dev.get_architecture().back();
    const auto generation = dev.get_architecture_gen();
    const int min_generation = (arch_class == 'p') ? 18 : 17;
    return os_supported && (generation >= min_generation);
  }();
  return cached_result;
#endif
}

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/device_info.cpp
================================================
// Copyright © 2026 Apple Inc.

#include <sys/sysctl.h>

#include "mlx/backend/gpu/device_info.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/metal.h"

namespace mlx::core::gpu {

// The GPU backend is available exactly when the Metal backend reports that
// it is.
bool is_available() {
  const bool metal_available = metal::is_available();
  return metal_available;
}

// Number of GPU devices exposed by this backend: always one.
int device_count() {
  constexpr int num_devices = 1;
  return num_devices;
}

// Returns a map describing the GPU at `device_index`.
//
// Index 0 yields a map (built once, on first call) with the keys
// "device_name", "architecture", "max_buffer_length",
// "max_recommended_working_set_size", "memory_size" and "resource_limit".
// Any other index yields an empty map. The returned references point at
// function-local statics and stay valid for the life of the process.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int device_index) {
  auto init_device_info = []()
      -> std::unordered_map<std::string, std::variant<std::string, size_t>> {
    auto pool = metal::new_scoped_memory_pool();
    auto& device = metal::device(mlx::core::Device::gpu);
    auto raw_device = device.mtl_device();
    auto name = std::string(raw_device->name()->utf8String());
    auto arch = device.get_architecture();

    // sysctlbyname treats the length argument as in/out: on return it holds
    // the number of bytes written. It therefore has to be re-initialised
    // before every query instead of being carried over from the last one.
    size_t memsize = 0;
    size_t length = sizeof(memsize);
    sysctlbyname("hw.memsize", &memsize, &length, nullptr, 0);

    size_t rsrc_limit = 0;
    length = sizeof(rsrc_limit);
    sysctlbyname("iogpu.rsrc_limit", &rsrc_limit, &length, nullptr, 0);
    if (rsrc_limit == 0) {
      // Query failed or reported zero: fall back to a fixed default.
      rsrc_limit = 499000;
    }

    return {
        {"device_name", name},
        {"architecture", arch},
        {"max_buffer_length", raw_device->maxBufferLength()},
        {"max_recommended_working_set_size",
         raw_device->recommendedMaxWorkingSetSize()},
        {"memory_size", memsize},
        {"resource_limit", rsrc_limit}};
  };
  static auto device_info_ = init_device_info();
  static std::unordered_map<std::string, std::variant<std::string, size_t>>
      empty;

  if (device_index == 0) {
    return device_info_;
  } else {
    return empty;
  }
}

} // namespace mlx::core::gpu


================================================
FILE: mlx/backend/metal/distributed.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <cassert>

#include "mlx/allocator.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/distributed/ops.h"
#include "mlx/distributed/primitives.h"
#include "mlx/fence.h"
#include "mlx/scheduler.h"

namespace mlx::core::distributed {

// Stub: this file provides no GPU path for AllReduce, so evaluation always
// throws.
void AllReduce::eval_gpu(
    const std::vector<array>& /* inputs */,
    std::vector<array>& /* outputs */) {
  const char* const message =
      "[AllReduce::eval_gpu] has no GPU implementation.";
  throw std::runtime_error(message);
}

// Stub: this file provides no GPU path for AllGather, so evaluation always
// throws.
void AllGather::eval_gpu(
    const std::vector<array>& /* inputs */,
    std::vector<array>& /* outputs */) {
  const char* const message =
      "[AllGather::eval_gpu] has no GPU implementation.";
  throw std::runtime_error(message);
}

// Stub: this file provides no GPU path for Send, so evaluation always throws.
void Send::eval_gpu(
    const std::vector<array>& /* inputs */,
    std::vector<array>& /* outputs */) {
  const char* const message = "[Send::eval_gpu] has no GPU implementation.";
  throw std::runtime_error(message);
}

// Stub: this file provides no GPU path for Recv, so evaluation always throws.
void Recv::eval_gpu(
    const std::vector<array>& /* inputs */,
    std::vector<array>& /* outputs */) {
  const char* const message = "[Recv::eval_gpu] has no GPU implementation.";
  throw std::runtime_error(message);
}

// Stub: this file provides no GPU path for ReduceScatter, so evaluation
// always throws.
void ReduceScatter::eval_gpu(
    const std::vector<array>& /* inputs */,
    std::vector<array>& /* outputs */) {
  const char* const message =
      "[ReduceScatter::eval_gpu] has no GPU implementation.";
  throw std::runtime_error(message);
}

} // namespace mlx::core::distributed


================================================
FILE: mlx/backend/metal/eval.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <memory>

#include "mlx/backend/gpu/eval.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#include "mlx/scheduler.h"

namespace mlx::core::gpu {

// Called when a stream is created. For GPU streams this requests the
// stream's command encoder from the Metal device; the result is discarded.
// Streams on any other device are ignored.
void new_stream(Stream stream) {
  const bool is_gpu_stream = stream.device == mlx::core::Device::gpu;
  if (!is_gpu_stream) {
    return;
  }
  auto& metal_device = metal::device(stream.device);
  metal_device.get_command_encoder(stream.index);
}

// Throws std::runtime_error carrying the command buffer's localized error
// description when the buffer's status is CommandBufferStatusError; does
// nothing for any other status.
inline void check_error(MTL::CommandBuffer* cbuf) {
  const bool failed = cbuf->status() == MTL::CommandBufferStatusError;
  if (!failed) {
    return;
  }
  std::ostringstream description;
  description << "[METAL] Command buffer execution failed: "
              << cbuf->error()->localizedDescription()->utf8String();
  throw std::runtime_error(description.str());
}

// Encodes the GPU work for `arr`'s primitive on the primitive's stream and
// registers a completion handler on the stream's current command buffer.
//
// The handler captures the data buffers of the inputs and siblings so they
// are still referenced when the command buffer completes, and reports any
// command-buffer error through check_error. The command buffer is committed
// only when the device says it needs to be.
void eval(array& arr) {
  auto pool = metal::new_scoped_memory_pool();
  auto s = arr.primitive().stream();
  auto& d = metal::device(s.device);
  auto command_buffer = d.get_command_buffer(s.index);

  auto outputs = arr.outputs();
  {
    // If the array is a tracer hold a reference
    // to its inputs so they don't get donated
    std::vector<array> inputs;
    if (arr.is_tracer()) {
      inputs = arr.inputs();
    }

    debug_set_primitive_buffer_label(command_buffer, arr.primitive());
    arr.primitive().eval_gpu(arr.inputs(), outputs);
  }
  // Collect the data of every input and sibling; the set de-duplicates
  // buffers that are shared between several arrays.
  std::unordered_set<std::shared_ptr<array::Data>> buffers;
  for (auto& in : arr.inputs()) {
    buffers.insert(in.data_shared_ptr());
  }
  // Note: this loop variable shadows the stream `s` inside the loop body.
  for (auto& s : arr.siblings()) {
    buffers.insert(s.data_shared_ptr());
  }
  // Remove the output if it was donated to by an input
  if (auto it = buffers.find(arr.data_shared_ptr()); it != buffers.end()) {
    buffers.erase(it);
  }

  if (d.command_buffer_needs_commit(s.index)) {
    // Close the encoder, tell the scheduler a task is in flight, and commit.
    // The handler holds `buffers` until completion and then notifies the
    // scheduler that the task finished.
    d.end_encoding(s.index);
    scheduler::notify_new_task(s);
    command_buffer->addCompletedHandler(
        [s, buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
          scheduler::notify_task_completion(s);
          check_error(cbuf);
        });
    d.commit_command_buffer(s.index);
  } else {
    // No commit yet: only attach a handler that holds `buffers` until the
    // command buffer completes and checks for errors.
    command_buffer->addCompletedHandler(
        [buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
          check_error(cbuf);
        });
  }
}

// Ends encoding on stream `s`, attaches an error-checking completion handler
// to its current command buffer, and commits that buffer. Does not wait for
// the work to complete.
void finalize(Stream s) {
  auto pool = metal::new_scoped_memory_pool();
  auto& metal_device = metal::device(s.device);
  auto command_buffer = metal_device.get_command_buffer(s.index);
  metal_device.end_encoding(s.index);
  command_buffer->addCompletedHandler(
      [](MTL::CommandBuffer* completed) { check_error(completed); });
  metal_device.commit_command_buffer(s.index);
}

// Commits the current command buffer of stream `s` and blocks until it has
// completed. Throws std::runtime_error (via check_error) if the command
// buffer finished with an error.
void synchronize(Stream s) {
  auto pool = metal::new_scoped_memory_pool();
  auto& d = metal::device(s.device);
  auto cb = d.get_command_buffer(s.index);

  // Take an extra reference on the command buffer for the duration of this
  // call. The guard drops that reference on every exit path, including when
  // check_error throws; a plain release() after check_error would be skipped
  // in that case and leak the reference.
  cb->retain();
  auto release_cb = [](MTL::CommandBuffer* buf) { buf->release(); };
  std::unique_ptr<MTL::CommandBuffer, decltype(release_cb)> cb_guard(
      cb, release_cb);

  d.end_encoding(s.index);
  d.commit_command_buffer(s.index);
  cb->waitUntilCompleted();
  check_error(cb);
}

} // namespace mlx::core::gpu


================================================
FILE: mlx/backend/metal/event.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/event.h"
#include "mlx/backend/metal/device.h"
#include "mlx/scheduler.h"

namespace mlx::core {

// Creates an event backed by a new Metal shared event on the GPU device.
// Throws std::runtime_error if Metal fails to create the shared event.
Event::Event(Stream stream) : stream_(stream) {
  // Deleter for the type-erased handle: releases the shared event inside its
  // own memory pool scope.
  auto dtor = [](void* ptr) {
    auto p = metal::new_scoped_memory_pool();
    static_cast<MTL::SharedEvent*>(ptr)->release();
  };
  auto p = metal::new_scoped_memory_pool();
  auto* shared_event =
      metal::device(Device::gpu).mtl_device()->newSharedEvent();
  // Check before handing the pointer to shared_ptr: a shared_ptr built from
  // a null pointer plus a deleter still runs the deleter on destruction,
  // which would call release() through a null pointer on the failure path.
  if (shared_event == nullptr) {
    throw std::runtime_error(
        "[Event::Event] Failed to create Metal shared event.");
  }
  event_ = std::shared_ptr<void>(shared_event, dtor);
}

void Event::wait() {
  if (!static_cast<MTL::SharedEvent*>(event_.get())
           ->waitUntilSignaledValue(value(), -1)) {
    throw std::runtime_error("[Event::wait] Timed out");
  }
}

// Makes `stream` wait for this event.
//
// CPU streams get a task enqueued that performs the blocking wait() on a
// copy of this event. GPU streams end the current encoding and encode a
// wait for the event's value into the stream's command buffer.
void Event::wait(Stream stream) {
  if (stream.device == Device::cpu) {
    scheduler::enqueue(stream, [*this]() mutable { wait(); });
    return;
  }

  auto& metal_device = metal::device(stream.device);
  metal_device.end_encoding(stream.index);
  auto cbuf = metal_device.get_command_buffer(stream.index);
  auto* mtl_event = static_cast<MTL::Event*>(event_.get());
  cbuf->encodeWait(mtl_event, value());
  // The empty handler exists only for its capture: it holds a copy of this
  // event until the command buffer completes.
  cbuf->addCompletedHandler([*this](MTL::CommandBuffer*) {});
}

// Signals this event from `stream`.
//
// CPU streams get a task enqueued that sets the shared event's signaled
// value directly. GPU streams end the current encoding and encode a signal
// of the event's value into the stream's command buffer.
void Event::signal(Stream stream) {
  if (stream.device == Device::cpu) {
    scheduler::enqueue(stream, [*this]() mutable {
      static_cast<MTL::SharedEvent*>(event_.get())->setSignaledValue(value());
    });
    return;
  }

  auto& metal_device = metal::device(stream.device);
  metal_device.end_encoding(stream.index);
  auto cbuf = metal_device.get_command_buffer(stream.index);
  auto* mtl_event = static_cast<MTL::Event*>(event_.get());
  cbuf->encodeSignalEvent(mtl_event, value());
  // The empty handler exists only for its capture: it holds a copy of this
  // event until the command buffer completes.
  cbuf->addCompletedHandler([*this](MTL::CommandBuffer*) {});
}

bool Event::is_signaled() const {
  return static_cast<MTL::SharedEvent*>(event_.get())->signaledValue() >=
      value();
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/fence.cpp
================================================
// Copyright © 2024 Apple Inc.
#include "mlx/fence.h"
#include "mlx/backend/metal/device.h"
#include "mlx/scheduler.h"
#include "mlx/utils.h"

namespace mlx::core {

struct FenceImpl {
  FenceImpl() {
    auto d = metal::device(Device::gpu).mtl_device();
    if (!d->supportsFamily(MTL::GPUFamilyMetal3)) {
      use_fast = false;
    } else if (__builtin_available(macOS 15, iOS 18, *)) {
      use_fast = env::metal_fast_synch();
    }

    if (!use_fast) {
      auto p = metal::new_scoped_memory_pool();
      fence = static_cast<void*>(d->newSharedEvent());
    } else {
      auto buf = allocator::malloc(sizeof(uint32_t)).ptr();
      fence = static_cast<void*>(buf);
      cpu_value()[0] = 0;
    }
  }

  ~FenceImpl() {
    if (!use_fast) {
      // Wraps Metal SharedEvent
      auto p = metal::new_scoped_memory_pool();
      static_cast<MTL::SharedEvent*>(fence)->release();
    } else {
      allocator::free(allocator::Buffer{static_cast<MTL::Buffer*>(fence)});
    }
  }
  bool use_fast{false};
  uint32_t count{0};
  void* fence;

  std::atomic_uint* cpu_value() {
    return static_cast<std::atomic_uint*>(
        static_cast<MTL::Buffer*>(fence)->contents());
  }
};

// Creates the fence state. The stream argument is unused. The FenceImpl is
// stored type-erased, with a deleter that casts back before deleting.
Fence::Fence(Stream) {
  auto* impl = new FenceImpl{};
  fence_ = std::shared_ptr<void>(impl, [](void* erased) {
    delete static_cast<FenceImpl*>(erased);
  });
}

// Makes `stream` wait until the fence has reached its current count.
//
// CPU streams: a task is enqueued that blocks until the fence reaches the
// count captured at call time (a shared-event wait on the slow path, a spin
// on the counter buffer on the fast path).
// GPU streams, slow path: a wait on the shared event is encoded into the
// command buffer.
// GPU streams, fast path: a single-thread "fence_wait" kernel is dispatched
// with the counter buffer and the current count; `x` is registered as an
// output of that kernel.
void Fence::wait(Stream stream, const array& x) {
  auto& f = *static_cast<FenceImpl*>(fence_.get());

  if (stream.device == Device::cpu) {
    // Capture the shared_ptr (not `f`) so the FenceImpl outlives this call,
    // and capture the count by value as it is now.
    scheduler::enqueue(stream, [fence_ = fence_, count = f.count]() mutable {
      auto& f = *static_cast<FenceImpl*>(fence_.get());
      if (!f.use_fast) {
        if (!static_cast<MTL::SharedEvent*>(f.fence)->waitUntilSignaledValue(
                count, -1)) {
          throw std::runtime_error("[Fence::wait] Timed out");
        }
        return;
      }
      // Busy-wait until the counter buffer reaches the captured count.
      while (f.cpu_value()[0] < count) {
      }
    });
    return;
  }

  auto& d = metal::device(stream.device);
  auto idx = stream.index;

  if (!f.use_fast) {
    d.end_encoding(idx);
    auto command_buffer = d.get_command_buffer(idx);
    command_buffer->encodeWait(static_cast<MTL::Event*>(f.fence), f.count);
    // Empty handler: its only effect is to hold a copy of fence_ until the
    // command buffer completes.
    command_buffer->addCompletedHandler(
        [fence_ = fence_](MTL::CommandBuffer* cbuf) {});
    return;
  }

  auto& compute_encoder = d.get_command_encoder(idx);

  // Register outputs to ensure that no kernels which depends on the
  // output starts before this one is done
  compute_encoder.register_output_array(x);

  auto kernel = d.get_kernel("fence_wait");
  MTL::Size kernel_dims = MTL::Size(1, 1, 1);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Argument 0: counter buffer; argument 1: the count to wait for.
  auto buf = static_cast<MTL::Buffer*>(f.fence);
  compute_encoder.set_buffer(buf, 0);
  compute_encoder.set_bytes(f.count, 1);
  compute_encoder.dispatch_threads(kernel_dims, kernel_dims);

  // Empty handler: its only effect is to hold a copy of fence_ until the
  // command buffer completes.
  d.get_command_buffer(idx)->addCompletedHandler(
      [fence_ = fence_](MTL::CommandBuffer* cbuf) {});
}

// Increments the fence count and arranges for `stream` to publish the new
// value once its preceding work is done.
//
// CPU streams: a task is enqueued that sets the shared event (slow path) or
// writes the counter buffer (fast path) to the new count.
// GPU streams, slow path: a signal of the shared event is encoded into the
// command buffer.
// GPU streams, fast path: optionally runs the "input_coherent" kernel over
// `x` when `cross_device` is set, inserts a barrier, then dispatches a
// single-thread "fence_update" kernel with the counter buffer and new count.
void Fence::update(Stream stream, const array& x, bool cross_device) {
  auto& f = *static_cast<FenceImpl*>(fence_.get());
  f.count++;

  if (stream.device == Device::cpu) {
    // Capture the shared_ptr (not `f`) so the FenceImpl outlives this call,
    // and capture the freshly incremented count by value.
    scheduler::enqueue(stream, [fence_ = fence_, count = f.count]() mutable {
      auto& f = *static_cast<FenceImpl*>(fence_.get());
      if (!f.use_fast) {
        static_cast<MTL::SharedEvent*>(f.fence)->setSignaledValue(count);
        return;
      }

      f.cpu_value()[0] = count;
    });
    return;
  }

  auto& d = metal::device(stream.device);
  auto idx = stream.index;
  if (!f.use_fast) {
    d.end_encoding(idx);
    auto command_buffer = d.get_command_buffer(idx);
    command_buffer->encodeSignalEvent(
        static_cast<MTL::Event*>(f.fence), f.count);
    // Empty handler: its only effect is to hold a copy of fence_ until the
    // command buffer completes.
    command_buffer->addCompletedHandler(
        [fence_ = fence_](MTL::CommandBuffer* cbuf) {});
    return;
  }

  // Launch input visibility kernels
  auto& compute_encoder = d.get_command_encoder(idx);
  if (cross_device) {
    auto kernel = d.get_kernel("input_coherent");
    // Number of 32-bit words covering the array's data, rounded up.
    uint32_t nthreads = (x.data_size() * x.itemsize() + sizeof(uint32_t) - 1) /
        sizeof(uint32_t);
    // Threadgroups of 1024 threads; enough groups to cover nthreads.
    MTL::Size group_dims = MTL::Size(1024, 1, 1);
    MTL::Size grid_dims = MTL::Size((nthreads + 1024 - 1) / 1024, 1, 1);
    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(x, 0);
    compute_encoder.set_bytes(nthreads, 1);
    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }

  // Barrier on previous kernels
  compute_encoder.barrier();

  // Launch value update kernel
  auto kernel = d.get_kernel("fence_update");
  MTL::Size kernel_dims = MTL::Size(1, 1, 1);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Argument 0: counter buffer; argument 1: the new count to publish.
  auto buf = static_cast<MTL::Buffer*>(f.fence);
  compute_encoder.set_buffer(buf, 0);
  compute_encoder.set_bytes(f.count, 1);
  compute_encoder.dispatch_threads(kernel_dims, kernel_dims);

  // Empty handler: its only effect is to hold a copy of fence_ until the
  // command buffer completes.
  d.get_command_buffer(idx)->addCompletedHandler(
      [fence_ = fence_](MTL::CommandBuffer* cbuf) {});
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/fft.cpp
================================================
// Copyright © 2024 Apple Inc.
#include <cassert>
#include <complex>
#include <map>
#include <numeric>
#include <set>

#include "mlx/3rdparty/pocketfft.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/slicing.h"
#include "mlx/backend/metal/binary.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/unary.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/utils.h"

namespace mlx::core {

using MTLFC = std::tuple<const void*, MTL::DataType, NS::UInteger>;

#define MAX_STOCKHAM_FFT_SIZE 4096
#define MAX_RADER_FFT_SIZE 2048
#define MAX_BLUESTEIN_FFT_SIZE 2048
// Threadgroup memory batching improves throughput for small n
#define MIN_THREADGROUP_MEM_SIZE 256
// For strided reads/writes, coalesce at least this many complex64s
#define MIN_COALESCE_WIDTH 4

// Radices available to the decomposition, listed from most to least
// preferred. Index positions in this list are what the per-radix step
// vectors of a plan refer to.
inline const std::vector<int> supported_radices() {
  const std::vector<int> by_preference{13, 11, 8, 7, 6, 5, 4, 3, 2};
  return by_preference;
}

// Returns the prime factorisation of `n` in non-decreasing order, with
// repeated factors listed once per occurrence (12 -> {2, 2, 3}).
// Values of `n` below 2 yield an empty vector.
std::vector<int> prime_factors(int n) {
  std::vector<int> factors;
  int z = 2;
  // Trial division up to sqrt(n). The bound is written as `z <= n / z`
  // instead of `z * z <= n` so the comparison cannot overflow a signed int
  // when n is close to INT_MAX.
  while (z <= n / z) {
    if (n % z == 0) {
      factors.push_back(z);
      n /= z;
    } else {
      z++;
    }
  }
  // Whatever remains above 1 is itself prime.
  if (n > 1) {
    factors.push_back(n);
  }
  return factors;
}

// Parameters describing one pass of a four-step FFT.
struct FourStepParams {
  // Whether the four-step decomposition is in use at all.
  bool required{false};
  // True for the first of the two passes, false for the second.
  bool first_step{true};
  // Sizes of the two sub-transforms; the first pass uses n1, the second n2.
  int n1{0};
  int n2{0};
};

// Forward declaration: fft_op is defined further down in this file, but the
// helpers between here and that definition need to call it.
//   in / out          - source and destination arrays
//   axis              - axis along which the transform is taken
//   inverse, real     - select inverse and/or real-valued variants
//   four_step_params  - four-step pass description (see FourStepParams)
//   inplace           - forwarded by callers; semantics are set by the
//                       definition
//   s                 - stream the work is issued on
void fft_op(
    const array& in,
    array& out,
    size_t axis,
    bool inverse,
    bool real,
    const FourStepParams four_step_params,
    bool inplace,
    const Stream& s);

// Describes how an FFT of length `n` is to be carried out.
struct FFTPlan {
  // Transform length the plan was built for.
  int n{0};
  // Step count per supported radix for the Stockham decomposition.
  std::vector<int> stockham;
  // Step count per supported radix for the Rader decomposition.
  std::vector<int> rader;
  // The Rader factor; stays 1 when no Rader factor is used.
  int rader_n{1};
  // Padded length for Bluestein's algorithm; -1 when Bluestein is not used.
  int bluestein_n{-1};
  // Four-step FFT: flag plus the two sub-transform sizes.
  bool four_step{false};
  int n1{0};
  int n2{0};
};

// Picks the transform length to pad to: the result of next_power_of_2(n).
int next_fast_n(int n) {
  const int padded_n = next_power_of_2(n);
  return padded_n;
}

std::vector<int> plan_stockham_fft(int n) {
  auto radices = supported_radices();
  std::vector<int> plan(radices.size(), 0);
  int orig_n = n;
  if (n == 1) {
    return plan;
  }
  for (int i = 0; i < radices.size(); i++) {
    int radix = radices[i];
    // Manually tuned radices for powers of 2
    if (is_power_of_2(orig_n) && orig_n < 512 && radix > 4) {
      continue;
    }
    while (n % radix == 0) {
      plan[i] += 1;
      n /= radix;
      if (n == 1) {
        return plan;
      }
    }
  }
  throw std::runtime_error("Unplannable");
}

// Builds the execution plan for an FFT of length `n`.
//
// Sizes above MAX_STOCKHAM_FFT_SIZE always use the four-step path: directly
// for powers of two, through Bluestein's algorithm otherwise. Smaller sizes
// use a Stockham decomposition, with at most one prime factor handled by
// Rader's algorithm; anything else falls back to Bluestein's.
FFTPlan plan_fft(int n) {
  const auto radices = supported_radices();
  const std::set<int> radix_set(radices.begin(), radices.end());
  const auto is_supported_radix = [&radix_set](int value) {
    return radix_set.find(value) != radix_set.end();
  };

  FFTPlan plan;
  plan.n = n;
  plan.rader = std::vector<int>(radices.size(), 0);
  const auto factors = prime_factors(n);

  // Four Step FFT when N is too large for shared mem.
  if (n > MAX_STOCKHAM_FFT_SIZE) {
    plan.four_step = true;
    if (is_power_of_2(n)) {
      // Powers of two have a fast, no-transpose four step implementation.
      // Rough heuristic for choosing faster powers of two when we can.
      plan.n2 = n > 65536 ? 1024 : 64;
      plan.n1 = n / plan.n2;
    } else {
      // Otherwise we use a multi-upload Bluestein's
      plan.bluestein_n = next_fast_n(2 * n - 1);
    }
    return plan;
  }

  // Shared fallback: plan Bluestein's algorithm on the padded length and
  // clear any Rader steps recorded so far.
  const auto fall_back_to_bluestein = [&plan, &radices, n]() {
    plan.four_step = n > MAX_BLUESTEIN_FFT_SIZE;
    plan.bluestein_n = next_fast_n(2 * n - 1);
    plan.stockham = plan_stockham_fft(plan.bluestein_n);
    plan.rader = std::vector<int>(radices.size(), 0);
  };

  int remaining_n = n;
  for (int factor : factors) {
    if (is_supported_radix(factor)) {
      continue;
    }
    // We only support a single Rader factor currently
    // TODO(alexbarron) investigate weirdness with large
    // Rader sizes -- possibly a compiler issue?
    if (plan.rader_n > 1 || n > MAX_RADER_FFT_SIZE) {
      fall_back_to_bluestein();
      return plan;
    }
    // See if we can use Rader's algorithm to Stockham decompose n - 1.
    // We don't nest Rader's algorithm so if `factor - 1`
    // isn't Stockham decomposable we give up and do Bluestein's.
    for (int rader_factor : prime_factors(factor - 1)) {
      if (!is_supported_radix(rader_factor)) {
        fall_back_to_bluestein();
        return plan;
      }
    }
    plan.rader = plan_stockham_fft(factor - 1);
    plan.rader_n = factor;
    remaining_n /= factor;
  }

  plan.stockham = plan_stockham_fft(remaining_n);
  return plan;
}

// Heuristics for selecting how many elements each thread handles for a
// particular mixed-radix FFT plan.
int compute_elems_per_thread(FFTPlan plan) {
  const auto radices = supported_radices();

  // Gather every radix that has at least one step in either the Stockham or
  // the Rader part of the plan. The Rader counters are indexed as if they
  // were appended directly after the Stockham counters.
  std::set<int> used_radices;
  const auto collect = [&](const std::vector<int>& counts, size_t offset) {
    for (size_t k = 0; k < counts.size(); k++) {
      if (counts[k] > 0) {
        used_radices.insert(radices[(offset + k) % radices.size()]);
      }
    }
  };
  collect(plan.stockham, 0);
  collect(plan.rader, plan.stockham.size());

  const auto uses = [&used_radices](int radix) {
    return used_radices.find(radix) != used_radices.end();
  };

  // Manual tuning for 7/11/13
  if (uses(7) && (uses(11) || uses(13))) {
    return 7;
  }
  if (uses(11) && uses(13)) {
    return 11;
  }

  // TODO(alexbarron) Some really weird stuff is going on
  // for certain `elems_per_thread` on large composite n.
  // Possibly a compiler issue?
  switch (plan.n) {
    case 3159:
      return 13;
    case 3645:
      return 5;
    case 3969:
      return 7;
    case 1982:
      return 5;
    default:
      break;
  }

  if (used_radices.size() == 1) {
    return *(used_radices.begin());
  }
  // Two radices, one of them 11 or 13: use their (integer) average.
  if (used_radices.size() == 2 && (uses(11) || uses(13))) {
    return std::accumulate(used_radices.begin(), used_radices.end(), 0) / 2;
  }
  // In all other cases use the second smallest radix.
  std::vector<int> ascending(used_radices.begin(), used_radices.end());
  return ascending[1];
}

// Rader
// Modular exponentiation by repeated squaring: returns x^y mod n for y >= 0.
// With y == 0 the result is 1 without any reduction.
//
// Intermediate products are formed in 64-bit arithmetic so that moduli above
// roughly 46341 do not overflow `int`.
int mod_exp(int x, int y, int n) {
  long long base = x;
  long long out = 1;
  while (y) {
    // Multiply in the current power when the corresponding exponent bit is
    // set.
    if (y & 1) {
      out = out * base % n;
    }
    y >>= 1;
    base = base * base % n;
  }
  return static_cast<int>(out);
}

// Finds the smallest primitive root r >= 2 modulo `n` (intended for prime
// n), or returns -1 if no candidate qualifies.
//
// A candidate r qualifies when r^((n-1)/p) != 1 (mod n) for every prime
// factor p of n - 1.
int primitive_root(int n) {
  auto factors = prime_factors(n - 1);

  // Candidates run up to and including n - 1. Stopping one short would miss
  // the only primitive root of 3, which is 2.
  for (int r = 2; r < n; r++) {
    bool found = true;
    for (int factor : factors) {
      if (mod_exp(r, (n - 1) / factor, n) == 1) {
        found = false;
        break;
      }
    }
    if (found) {
      return r;
    }
  }
  return -1;
}

// Precomputes the constants needed for Rader's algorithm with factor
// `rader_n`. Returns, in order:
//   * the forward FFT (computed on the CPU) of b_q, where
//     b_q[i] = exp(-2*pi*j * g^(-i) / rader_n),
//   * g^i mod rader_n for i in [0, rader_n - 1),
//   * g^(-i) mod rader_n for i in [0, rader_n - 1),
// with g the primitive root of rader_n. The stream argument is unused.
std::tuple<array, array, array> compute_raders_constants(
    int rader_n,
    const Stream& s) {
  const int generator = primitive_root(rader_n);
  // Fermat's little theorem gives the modular inverse of the generator.
  const int generator_inv = mod_exp(generator, rader_n - 2, rader_n);

  std::vector<short> g_q(rader_n - 1);
  std::vector<short> g_minus_q(rader_n - 1);
  for (int q = 0; q < rader_n - 1; q++) {
    g_q[q] = mod_exp(generator, q, rader_n);
    g_minus_q[q] = mod_exp(generator_inv, q, rader_n);
  }
  array g_q_arr(g_q.begin(), {rader_n - 1});
  array g_minus_q_arr(g_minus_q.begin(), {rader_n - 1});

  std::vector<std::complex<float>> b_q(rader_n - 1);
  for (int q = 0; q < rader_n - 1; q++) {
    float angle = static_cast<float>(g_minus_q[q]) * -2.0 * M_PI / rader_n;
    b_q[q] = std::exp(std::complex<float>(0, angle));
  }

  array b_q_fft({rader_n - 1}, complex64, nullptr, {});
  b_q_fft.set_data(allocator::malloc(b_q_fft.nbytes()));
  auto b_q_fft_ptr =
      reinterpret_cast<std::complex<float>*>(b_q_fft.data<complex64_t>());
  std::ptrdiff_t item_size = b_q_fft.itemsize();
  size_t fft_size = rader_n - 1;
  // This FFT is always small (<4096, batch 1) so save some overhead
  // and do it on the CPU
  pocketfft::c2c(
      /* shape= */ {fft_size},
      /* stride_in= */ {item_size},
      /* stride_out= */ {item_size},
      /* axes= */ {0},
      /* forward= */ true,
      /* data_in= */ b_q.data(),
      /* data_out= */ b_q_fft_ptr,
      /* scale= */ 1.0f);
  return std::make_tuple(b_q_fft, g_q_arr, g_minus_q_arr);
}

// Bluestein
// Precomputes the two constant arrays used by Bluestein's algorithm for a
// transform of length `n` padded to `bluestein_n`. Returns {w_k, w_q}:
//   * w_k has length n, with w_k[i] = exp(-j * pi * i^2 / n),
//   * w_q is the forward FFT (computed on the CPU) of the length-bluestein_n
//     sequence holding exp(+j * pi * i^2 / n) for i in (-n, n), zero padded.
std::pair<array, array> compute_bluestein_constants(int n, int bluestein_n) {
  // We need to calculate the Bluestein twiddle factors
  // in double precision for the overall numerical stability
  // of Bluestein's FFT algorithm to be acceptable.
  //
  // Metal doesn't support float64, so instead we
  // manually implement the required operations on cpu.
  //
  // In numpy:
  // w_k = np.exp(-1j * np.pi / N * (np.arange(-N + 1, N) ** 2))
  // w_q = np.fft.fft(1/w_k)
  // return w_k, w_q
  std::vector<std::complex<float>> w_k_vec(n);
  std::vector<std::complex<float>> w_q_vec(bluestein_n, 0);

  for (int i = -n + 1; i < n; i++) {
    // Square in double: exact for these magnitudes and cheaper than pow().
    const double i_squared = static_cast<double>(i) * i;
    double theta = i_squared * M_PI / (double)n;
    w_q_vec[i + n - 1] = std::exp(std::complex<double>(0, theta));
    if (i >= 0) {
      w_k_vec[i] = std::exp(std::complex<double>(0, -theta));
    }
  }

  array w_k({n}, complex64, nullptr, {});
  w_k.set_data(allocator::malloc(w_k.nbytes()));
  std::copy(w_k_vec.begin(), w_k_vec.end(), w_k.data<complex64_t>());

  array w_q({bluestein_n}, complex64, nullptr, {});
  w_q.set_data(allocator::malloc(w_q.nbytes()));
  auto w_q_ptr =
      reinterpret_cast<std::complex<float>*>(w_q.data<complex64_t>());

  std::ptrdiff_t item_size = w_q.itemsize();
  size_t fft_size = bluestein_n;
  pocketfft::c2c(
      /* shape= */ {fft_size},
      /* stride_in= */ {item_size},
      /* stride_out= */ {item_size},
      /* axes= */ {0},
      /* forward= */ true,
      /* data_in= */ w_q_vec.data(),
      /* data_out= */ w_q_ptr,
      /* scale= */ 1.0f);
  // The function returns a pair, so build a pair rather than relying on a
  // tuple-to-pair conversion.
  return std::make_pair(w_k, w_q);
}

void multi_upload_bluestein_fft(
    const array& in,
    array& out,
    size_t axis,
    bool inverse,
    bool real,
    FFTPlan& plan,
    std::vector<array>& copies,
    const Stream& s) {
  // TODO(alexbarron) Implement fused kernels for mutli upload bluestein's
  // algorithm
  int n = inverse ? out.shape(axis) : in.shape(axis);
  auto [w_k, w_q] = compute_bluestein_constants(n, plan.bluestein_n);
  copies.push_back(w_k);
  copies.push_back(w_q);

  auto temp_shape = inverse ? out.shape() : in.shape();
  array temp(temp_shape, complex64, nullptr, {});
  array temp1(temp_shape, complex64, nullptr, {});

  if (real && !inverse) {
    // Convert float32->complex64
    copy_gpu(in, temp, CopyType::General, s);
    copies.push_back(temp);
  } else if (real && inverse) {
    int back_offset = n % 2 == 0 ? 2 : 1;
    auto slice_shape = in.shape();
    slice_shape[axis] -= back_offset;
    array slice_temp(slice_shape, complex64, nullptr, {});
    array conj_temp(in.shape(), complex64, nullptr, {});
    copies.push_back(conj_temp);

    Shape rstarts(in.ndim(), 0);
    Shape rstrides(in.ndim(), 1);
    rstarts[axis] = in.shape(axis) - back_offset;
    rstrides[axis] = -1;
    unary_op_gpu({in}, conj_temp, "Conjugate", s);
    slice_gpu(in, slice_temp, rstarts, rstrides, s);
    concatenate_gpu({conj_temp, slice_temp}, temp, (int)axis, s);
    copies.push_back(temp);
  } else if (inverse) {
    unary_op_gpu({in}, temp, "Conjugate", s);
    copies.push_back(temp);
  } else {
    temp.copy_shared_buffer(in);
  }

  Strides b_strides(in.ndim(), 0);
  b_strides[axis] = 1;
  array w_k_broadcast(temp.shape(), complex64, nullptr, {});
  w_k_broadcast.copy_shared_buffer(w_k, b_strides, {}, w_k.data_size());
  binary_op_gpu({temp, w_k_broadcast}, temp1, "Multiply", s);

  std::vector<std::pair<int, int>> pads;
  auto padded_shape = out.shape();
  padded_shape[axis] = plan.bluestein_n;
  array pad_temp(padded_shape, complex64, nullptr, {});
  auto zero = array(complex64_t{0.0f, 0.0f});
  copies.push_back(zero);
  pad_gpu(temp1, zero, pad_temp, {(int)axis}, {0}, s);
  copies.push_back(pad_temp);

  array pad_temp1(padded_shape, complex64, nullptr, {});
  fft_op(
      pad_temp,
      pad_temp1,
      axis,
      /*inverse=*/false,
      /*real=*/false,
      FourStepParams(),
      /*inplace=*/false,
      s);
  copies.push_back(pad_temp1);

  array w_q_broadcast(pad_temp1.shape(), complex64, nullptr, {});
  w_q_broadcast.copy_shared_buffer(w_q, b_strides, {}, w_q.data_size());
  binary_op_gpu_inplace({pad_temp1, w_q_broadcast}, pad_temp, "Multiply", s);

  fft_op(
      pad_temp,
      pad_temp1,
      axis,
      /* inverse= */ true,
      /* real= */ false,
      FourStepParams(),
      /*inplace=*/true,
      s);

  int offset = plan.bluestein_n - (2 * n - 1);
  Shape starts(in.ndim(), 0);
  Shape strides(in.ndim(), 1);
  starts[axis] = plan.bluestein_n - offset - n;

  array temp2(temp_shape, complex64, nullptr, {});
  slice_gpu(pad_temp1, temp2, starts, strides, s);

  binary_op_gpu_inplace({temp2, w_k_broadcast}, temp1, "Multiply", s);

  if (real && !inverse) {
    Shape rstarts(in.ndim(), 0);
    Shape rstrides(in.ndim(), 1);
    slice_gpu(temp1, out, rstarts, strides, s);
  } else if (real && inverse) {
    Strides b_strides(in.ndim(), 0);
    auto inv_n = array({1.0f / n}, {1}, float32);
    array temp_float(out.shape(), out.dtype(), nullptr, {});
    copies.push_back(temp_float);
    copies.push_back(inv_n);
    copies.push_back(temp1);

    copy_gpu(temp1, temp_float, CopyType::General, s);
    binary_op_gpu({temp_float, inv_n}, out, "Multiply", s);
  } else if (inverse) {
    auto inv_n = array({1.0f / n}, {1}, complex64);
    array temp3(temp_shape, complex64, nullptr, {});
    unary_op_gpu({temp1}, temp3, "Conjugate", s);
    binary_op_gpu({temp3, inv_n}, out, "Multiply", s);
    copies.push_back(inv_n);
    copies.push_back(temp1);
    copies.push_back(temp3);
  } else {
    out.copy_shared_buffer(temp1);
  }
}

void four_step_fft(
    const array& in,
    array& out,
    size_t axis,
    bool inverse,
    bool real,
    FFTPlan& plan,
    std::vector<array>& copies,
    const Stream& s,
    bool in_place) {
  if (plan.bluestein_n == -1) {
    // Fast no transpose implementation for powers of 2.
    FourStepParams four_step_params = {
        /* required= */ true, /* first_step= */ true, plan.n1, plan.n2};
    auto temp_shape = (real && inverse) ? out.shape() : in.shape();
    array temp(temp_shape, complex64, nullptr, {});
    fft_op(
        in, temp, axis, inverse, real, four_step_params, /*inplace=*/false, s);
    four_step_params.first_step = false;
    fft_op(
        temp,
        out,
        axis,
        inverse,
        real,
        four_step_params,
        /*inplace=*/in_place,
        s);
    copies.push_back(temp);
  } else {
    multi_upload_bluestein_fft(in, out, axis, inverse, real, plan, copies, s);
  }
}

// Encode a batched 1-D FFT of `in` along `axis` into `out` on stream `s`.
//
//  - `inverse` and `real` are forwarded to the kernel specialization
//    (function constant 0 and the four-step template respectively) and
//    influence how the batch size is derived.
//  - `four_step_params.required` marks this call as one pass of a four-step
//    FFT; the pass length is then `n1` (first step) or `n2`.
//  - `inplace == true` skips the allocation of `out` in this function.
//
// Lengths whose plan has `four_step` set are forwarded to `four_step_fft`;
// every other length is handled by a single kernel dispatch. All temporaries
// created here are registered with the device via `add_temporaries`.
void fft_op(
    const array& in,
    array& out,
    size_t axis,
    bool inverse,
    bool real,
    const FourStepParams four_step_params,
    bool inplace,
    const Stream& s) {
  auto& d = metal::device(s.device);

  // Transform length: read from `out` when the output dtype is float32,
  // otherwise from `in`.
  size_t n = out.dtype() == float32 ? out.shape(axis) : in.shape(axis);
  if (n == 1) {
    // Nothing to compute for a length-1 transform; `out` shares `in`'s buffer.
    out.copy_shared_buffer(in);
    return;
  }

  if (four_step_params.required) {
    // Four Step FFT decomposes into two FFTs: n1 on columns, n2 on rows
    n = four_step_params.first_step ? four_step_params.n1 : four_step_params.n2;
  }

  // Make sure that the array is contiguous and has stride 1 in the FFT dim
  std::vector<array> copies;
  auto check_input = [&axis, &copies, &s](const array& x) {
    // TODO: Pass the strides to the kernel so
    // we can avoid the copy when x is not contiguous.
    bool no_copy = x.strides()[axis] == 1 &&
        (x.flags().row_contiguous || x.flags().col_contiguous);
    if (no_copy) {
      return x;
    } else {
      array x_copy(x.shape(), x.dtype(), nullptr, {});
      // Build strides where `axis` has stride 1 and the remaining axes get
      // increasing strides in order of increasing axis index.
      Strides strides;
      int64_t cur_stride = x.shape(axis);
      for (int a = 0; a < x.ndim(); a++) {
        if (a == axis) {
          strides.push_back(1);
        } else {
          strides.push_back(cur_stride);
          cur_stride *= x.shape(a);
        }
      }

      // Recompute the contiguity flags for the new layout.
      auto flags = x.flags();
      auto [data_size, is_row_contiguous, is_col_contiguous] =
          check_contiguity(x.shape(), strides);

      flags.col_contiguous = is_col_contiguous;
      flags.row_contiguous = is_row_contiguous;
      flags.contiguous = data_size == x_copy.size();

      x_copy.set_data(allocator::malloc(x.nbytes()), data_size, strides, flags);
      copy_gpu_inplace(x, x_copy, CopyType::GeneralGeneral, s);
      // Keep the copy alive until the command buffer has run.
      copies.push_back(x_copy);
      return x_copy;
    }
  };
  const array& in_contiguous = check_input(in);

  // real to complex: n -> (n/2)+1
  // complex to real: (n/2)+1 -> n
  auto out_strides = in_contiguous.strides();
  size_t out_data_size = in_contiguous.data_size();
  if (in.shape(axis) != out.shape(axis)) {
    // Rescale every stride other than the unit stride (and the data size) by
    // the ratio of output to input length along `axis`.
    for (int i = 0; i < out_strides.size(); i++) {
      if (out_strides[i] != 1) {
        out_strides[i] = out_strides[i] / in.shape(axis) * out.shape(axis);
      }
    }
    out_data_size = out_data_size / in.shape(axis) * out.shape(axis);
  }

  auto plan = plan_fft(n);
  if (plan.four_step) {
    // Note: the original `in` (not `in_contiguous`) is forwarded here.
    four_step_fft(in, out, axis, inverse, real, plan, copies, s, inplace);
    d.add_temporaries(std::move(copies), s.index);
    return;
  }

  // TODO: allow donation here
  if (!inplace) {
    out.set_data(
        allocator::malloc(out.nbytes()),
        out_data_size,
        out_strides,
        in_contiguous.flags());
  }

  auto radices = supported_radices();
  // Bluestein plans run the kernel at the padded size `bluestein_n`.
  int fft_size = plan.bluestein_n > 0 ? plan.bluestein_n : n;

  // Setup function constants
  bool power_of_2 = is_power_of_2(fft_size);

  // Helpers that pair a pointer to a value with its Metal type and the
  // function-constant index it is bound to.
  auto make_int = [](int* a, int i) {
    return std::make_tuple(a, MTL::DataType::DataTypeInt, i);
  };
  auto make_bool = [](bool* a, int i) {
    return std::make_tuple(a, MTL::DataType::DataTypeBool, i);
  };

  // Function constant slots: 0 = inverse, 1 = power_of_2,
  // 2 = elems_per_thread, 3 = rader_m, 4.. = stockham steps then rader steps.
  std::vector<MTLFC> func_consts = {
      make_bool(&inverse, 0), make_bool(&power_of_2, 1)};

  // Start of radix/rader step constants
  int index = 4;
  for (int i = 0; i < plan.stockham.size(); i++) {
    func_consts.push_back(make_int(&plan.stockham[i], index));
    index += 1;
  }
  for (int i = 0; i < plan.rader.size(); i++) {
    func_consts.push_back(make_int(&plan.rader[i], index));
    index += 1;
  }
  int elems_per_thread = compute_elems_per_thread(plan);
  func_consts.push_back(make_int(&elems_per_thread, 2));

  int rader_m = n / plan.rader_n;
  func_consts.push_back(make_int(&rader_m, 3));

  // The overall number of FFTs we're going to compute for this input
  size_t size = out.dtype() == float32 ? out.size() : in.size();
  if (real && inverse && four_step_params.required) {
    size = out.size();
  }
  int total_batch_size = size / n;
  // ceil(fft_size / elems_per_thread)
  int threads_per_fft = (fft_size + elems_per_thread - 1) / elems_per_thread;

  // We batch among threadgroups for improved efficiency when n is small
  int threadgroup_batch_size = std::max(MIN_THREADGROUP_MEM_SIZE / fft_size, 1);
  if (four_step_params.required) {
    // Require a threadgroup batch size of at least 4 for four step FFT
    // so we can coalesce the memory accesses.
    threadgroup_batch_size =
        std::max(threadgroup_batch_size, MIN_COALESCE_WIDTH);
  }
  int threadgroup_mem_size = next_power_of_2(threadgroup_batch_size * fft_size);
  // FFTs up to 2^20 are currently supported
  assert(threadgroup_mem_size <= MAX_STOCKHAM_FFT_SIZE);

  // ceil divide
  int batch_size =
      (total_batch_size + threadgroup_batch_size - 1) / threadgroup_batch_size;

  if (real && !four_step_params.required) {
    // We can perform 2 RFFTs at once so the batch size is halved.
    batch_size = (batch_size + 2 - 1) / 2;
  }
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto in_type_str = in.dtype() == float32 ? "float" : "float2";
  auto out_type_str = out.dtype() == float32 ? "float" : "float2";
  // Only required by four step
  int step = -1;
  {
    // Pick the kernel family in priority order: Bluestein, Rader, four-step,
    // then the plain FFT kernel. `base_name` identifies the template
    // instantiation; `hash_name` additionally encodes the size and direction.
    std::ostringstream kname;
    std::string inv_string = inverse ? "true" : "false";
    std::string real_string = real ? "true" : "false";
    std::string func_name;
    if (plan.bluestein_n > 0) {
      kname << "bluestein_fft_mem_" << threadgroup_mem_size << "_"
            << in_type_str << "_" << out_type_str;
      func_name = "bluestein_fft";
    } else if (plan.rader_n > 1) {
      kname << "rader_fft_mem_" << threadgroup_mem_size << "_" << in_type_str
            << "_" << out_type_str;
      func_name = "rader_fft";
    } else if (four_step_params.required) {
      step = four_step_params.first_step ? 0 : 1;
      kname << "four_step_mem_" << threadgroup_mem_size << "_" << in_type_str
            << "_" << out_type_str << "_" << step << "_" << real_string;
      func_name = "four_step_fft";
    } else {
      kname << "fft_mem_" << threadgroup_mem_size << "_" << in_type_str << "_"
            << out_type_str;
      func_name = "fft";
    }
    std::string base_name = kname.str();
    // We use a specialized kernel for each FFT size
    kname << "_n" << fft_size << "_inv_" << inverse;
    std::string hash_name = kname.str();
    // The four-step template takes two extra arguments (`step`, `real`).
    auto template_def = func_name == "four_step_fft" ? get_template_definition(
                                                           base_name,
                                                           func_name,
                                                           threadgroup_mem_size,
                                                           in_type_str,
                                                           out_type_str,
                                                           step,
                                                           real)
                                                     : get_template_definition(
                                                           base_name,
                                                           func_name,
                                                           threadgroup_mem_size,
                                                           in_type_str,
                                                           out_type_str);
    auto kernel =
        get_fft_kernel(d, base_name, hash_name, func_consts, template_def);

    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(in_contiguous, 0);
    compute_encoder.set_output_array(out, 1);

    // Remaining buffer bindings depend on the kernel family chosen above.
    if (plan.bluestein_n > 0) {
      // Precomputed twiddle factors for Bluestein's
      auto [w_k, w_q] = compute_bluestein_constants(n, plan.bluestein_n);
      copies.push_back(w_q);
      copies.push_back(w_k);

      compute_encoder.set_input_array(w_q, 2); // w_q
      compute_encoder.set_input_array(w_k, 3); // w_k
      compute_encoder.set_bytes(n, 4);
      compute_encoder.set_bytes(plan.bluestein_n, 5);
      compute_encoder.set_bytes(total_batch_size, 6);
    } else if (plan.rader_n > 1) {
      auto [b_q, g_q, g_minus_q] = compute_raders_constants(plan.rader_n, s);
      copies.push_back(b_q);
      copies.push_back(g_q);
      copies.push_back(g_minus_q);

      compute_encoder.set_input_array(b_q, 2);
      compute_encoder.set_input_array(g_q, 3);
      compute_encoder.set_input_array(g_minus_q, 4);
      compute_encoder.set_bytes(n, 5);
      compute_encoder.set_bytes(total_batch_size, 6);
      compute_encoder.set_bytes(plan.rader_n, 7);
    } else if (four_step_params.required) {
      compute_encoder.set_bytes(four_step_params.n1, 2);
      compute_encoder.set_bytes(four_step_params.n2, 3);
      compute_encoder.set_bytes(total_batch_size, 4);
    } else {
      compute_encoder.set_bytes(n, 2);
      compute_encoder.set_bytes(total_batch_size, 3);
    }

    // Grid: x = threadgroup batches, y = FFTs per threadgroup,
    // z = threads cooperating on one FFT.
    auto group_dims = MTL::Size(1, threadgroup_batch_size, threads_per_fft);
    auto grid_dims =
        MTL::Size(batch_size, threadgroup_batch_size, threads_per_fft);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }

  d.add_temporaries(std::move(copies), s.index);
}

void fft_op(
    const array& in,
    array& out,
    size_t axis,
    bool inverse,
    bool real,
    bool inplace,
    const Stream& s) {
  fft_op(in, out, axis, inverse, real, FourStepParams(), inplace, s);
}

void nd_fft_op(
    const array& in,
    array& out,
    const std::vector<size_t>& axes,
    bool inverse,
    bool real,
    const Stream& s) {
  // Perform ND FFT on GPU as a series of 1D FFTs
  auto temp_shape = inverse ? in.shape() : out.shape();
  std::vector<array> temp_arrs;
  temp_arrs.emplace_back(temp_shape, complex64, nullptr, std::vector<array>{});
  if (axes.size() > 2) {
    temp_arrs.emplace_back(
        temp_shape, complex64, nullptr, std::vector<array>{});
  }
  for (int i = axes.size() - 1; i >= 0; i--) {
    int reverse_index = axes.size() - i - 1;
    // For 5D and above, we don't want to reallocate our two temporary arrays
    bool inplace = reverse_index >= 3 && i != 0;
    // Opposite order for fft vs ifft
    int index = inverse ? reverse_index : i;
    size_t axis = axes[index];
    // Mirror np.fft.(i)rfftn and perform a real transform
    // only on the final axis.
    bool step_real = (real && index == axes.size() - 1);
    const array& in_arr = i == axes.size() - 1 ? in : temp_arrs[i % 2];
    array& out_arr = i == 0 ? out : temp_arrs[1 - i % 2];
    fft_op(in_arr, out_arr, axis, inverse, step_real, inplace, s);
  }

  auto& d = metal::device(s.device);
  d.add_temporaries(std::move(temp_arrs), s.index);
}

void FFT::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& in = inputs[0];

  if (axes_.size() > 1) {
    nd_fft_op(in, out, axes_, inverse_, real_, s);
  } else {
    fft_op(in, out, axes_[0], inverse_, real_, /*inplace=*/false, s);
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/hadamard.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/common/hadamard.h"
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Upper bound on the threadgroup width used when launching the strided
// `m` pass in `hadamard_mn_contiguous`.
constexpr int MAX_HADAMARD_THREADS_PER_GROUP = 256;

std::string gen_hadamard_codelet(int m) {
  // Emit Metal source for an O(m^2) `hadamard_radix_m` codelet built from
  // the sign table that `hadamard_matrices()` stores for `m`.
  //
  // Example output for m = 2:
  // METAL_FUNC void hadamard_m(thread float *x) {
  //   float tmp[2];
  //   tmp[0] = + x[0] + x[1];
  //   tmp[1] = + x[0] - x[1];
  //   for (int i = 0; i < 2; i++) { x[i] = tmp[i]; }
  // }
  //
  auto tables = hadamard_matrices();
  auto& signs = tables[m];

  std::ostringstream code;
  code << "METAL_FUNC void hadamard_radix_m(thread float *x) {" << '\n';
  if (m == 1) {
    // m == 1 is the identity: emit an empty function body.
    code << "}" << '\n';
    return code.str();
  }
  code << "  float tmp[" << m << "];" << '\n';

  // Scanning starts at offset 1, so the table's first character is skipped.
  // Every '\n'-terminated row after that yields one `tmp[...]` assignment
  // whose terms use the row's characters as the operators.
  int row_index = 0;
  auto row_begin = 1;
  for (auto row_end = signs.find('\n', row_begin);
       row_end != std::string_view::npos;
       row_end = signs.find('\n', row_begin), ++row_index) {
    code << "  tmp[" << row_index << "] = ";
    auto row = signs.substr(row_begin, row_end - row_begin);
    int col = 0;
    for (char sign : row) {
      code << " " << sign << " x[" << col << "]";
      ++col;
    }
    code << ";" << '\n';
    row_begin = row_end + 1;
  }

  code << "  for (int i = 0; i < " << m << "; i++) { x[i] = tmp[i]; }"
       << '\n';
  code << "}" << '\n';
  return code.str();
}

// Encode the Hadamard transform of `x` into `y` as up to three kernel
// launches on stream `s`:
//   1. a strided size-`n1` pass reading `x` and writing `y` (only if n1 > 1),
//   2. a size-`n2` pass reading `y` (or `x` when step 1 was skipped),
//   3. a strided size-`m` pass over `y` (only if m > 1).
// `scale` is applied by the `m` pass, or by the `n2` pass when m == 1; the
// other passes receive a scale of 1.0.
void hadamard_mn_contiguous(
    const array& x,
    array& y,
    int m,
    int n1,
    int n2,
    float scale,
    metal::Device& d,
    const Stream& s) {
  int n = n1 * n2;
  // Per-pass template parameters: read widths and the largest radix (<= 16).
  int read_width_n1 = n1 == 2 ? 2 : 4;
  int read_width_n2 = n2 == 2 ? 2 : 4;
  int read_width_m = (n == 2 || m == 28) ? 2 : 4;
  int max_radix_1 = std::min(n1, 16);
  int max_radix_2 = std::min(n2, 16);
  float scale_n1 = 1.0;
  float scale_n2 = (m == 1) ? scale : 1.0;
  float scale_m = scale;

  // n2 is a row contiguous power of 2 hadamard transform
  MTL::Size group_dims_n2(n2 / max_radix_2, 1, 1);
  MTL::Size grid_dims_n2(n2 / max_radix_2, x.size() / n2, 1);

  // n1 is a strided power of 2 hadamard transform with stride n2
  MTL::Size group_dims_n1(n1 / max_radix_1, 1, 1);
  MTL::Size grid_dims_n1(n1 / max_radix_1, x.size() / n, n2);

  // m is a strided hadamard transform with stride n = n1 * n2
  MTL::Size group_dims_m(
      std::min(n / read_width_m, MAX_HADAMARD_THREADS_PER_GROUP), 1, 1);
  MTL::Size grid_dims_m(
      group_dims_m.width, x.size() / m / read_width_m / group_dims_m.width, 1);

  // Make the kernel
  // The library is looked up by `kname` ("hadamard_<n*m>_<type>"); the lambda
  // supplies the source, with the individual kernels named "n2"/"n1"/"m" +
  // kname. Presumably the lambda only runs when the library is not yet built.
  std::string kname;
  kname.reserve(32);
  concatenate(kname, "hadamard_", n * m, "_", type_to_name(x));
  auto lib = d.get_library(kname, [&]() {
    std::string kernel;
    concatenate(
        kernel,
        metal::utils(),
        gen_hadamard_codelet(m),
        metal::hadamard(),
        get_template_definition(
            "n2" + kname,
            "hadamard_n",
            get_type_string(x.dtype()),
            n2,
            max_radix_2,
            read_width_n2));
    if (n1 > 1) {
      // The n1 instantiation takes one extra template argument (n2).
      kernel += get_template_definition(
          "n1" + kname,
          "hadamard_n",
          get_type_string(x.dtype()),
          n1,
          max_radix_1,
          read_width_n1,
          n2);
    }
    if (m > 1) {
      kernel += get_template_definition(
          "m" + kname,
          "hadamard_m",
          get_type_string(x.dtype()),
          n,
          m,
          read_width_m);
    }
    return kernel;
  });

  // Launch the strided transform for n1
  if (n1 > 1) {
    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel("n1" + kname, lib);
    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(x, 0);
    compute_encoder.set_output_array(y, 1);
    compute_encoder.set_bytes(scale_n1, 2);
    compute_encoder.dispatch_threads(grid_dims_n1, group_dims_n1);
  }

  // Launch the transform for n2
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel("n2" + kname, lib);
  compute_encoder.set_compute_pipeline_state(kernel);
  // When the n1 pass ran, its result already lives in `y`.
  compute_encoder.set_input_array(n1 > 1 ? y : x, 0);
  compute_encoder.set_output_array(y, 1);
  compute_encoder.set_bytes(scale_n2, 2);
  compute_encoder.dispatch_threads(grid_dims_n2, group_dims_n2);

  // Launch the strided transform for m
  if (m > 1) {
    // This pass reads and writes `y`.
    auto kernel = d.get_kernel("m" + kname, lib);
    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(y, 0);
    compute_encoder.set_output_array(y, 1);
    compute_encoder.set_bytes(scale_m, 2);
    compute_encoder.dispatch_threads(grid_dims_m, group_dims_m);
  }
}

void Hadamard::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  // Split the hadamard transform so that all of them work on vectors smaller
  // than 8192 elements.
  //
  // We decompose it in the following way:
  //
  // n = m * n1 * n2 = m * 2^k1 * 2^k2
  //
  // where m is in (1, 12, 20, 28) and n1 and n2 <= 8192
  auto [n, m] = decompose_hadamard(in.shape().back());
  int n1 = 1, n2 = n;
  if (n > 8192) {
    for (n2 = 2; n2 * n2 < n; n2 *= 2) {
    }
    n1 = n / n2;
  }

  if (in.flags().row_contiguous) {
    if (in.is_donatable()) {
      out.copy_shared_buffer(in);
    } else {
      out.set_data(allocator::malloc(out.nbytes()));
    }
    hadamard_mn_contiguous(in, out, m, n1, n2, scale_, d, s);
  } else {
    copy_gpu(in, out, CopyType::General, s);
    hadamard_mn_contiguous(out, out, m, n1, n2, scale_, d, s);
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/indexing.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <fmt/format.h>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/common/slicing.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/scan.h"
#include "mlx/backend/gpu/slicing.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/jit/indexing.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/dtype.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

// Largest number of index arrays `Gather::eval_gpu` and `Scatter::eval_gpu`
// accept; both throw when given more.
constexpr int METAL_MAX_INDEX_ARRAYS = 20;

// Build the two text fragments spliced into the gather/scatter kernel
// templates for `nidx` index arrays of element type `idx_type`:
//   first  - one `const device <idx_type> *idx<i> [[buffer(20 + i)]],`
//            declaration per index array, separated by newlines;
//   second - the comma-separated list `idx0,idx1,...`.
std::pair<std::string, std::string> make_index_args(
    const std::string& idx_type,
    int nidx) {
  std::ostringstream declarations;
  std::ostringstream names;
  for (int k = 0; k < nidx; ++k) {
    // Separators go between entries, never after the last one.
    if (k > 0) {
      declarations << "\n";
      names << ",";
    }
    declarations << fmt::format(
        "const device {0} *idx{1} [[buffer({2})]],", idx_type, k, 20 + k);
    names << fmt::format("idx{0}", k);
  }
  return {declarations.str(), names.str()};
}

// Map a reduce type of primitive `T` (e.g. `Scatter`) to the text used as the
// reduction-op template argument in generated kernel source.
//
// `dt` is the element type string; it becomes the template argument of the
// op, e.g. `Sum<float>`. `T::None` maps to the bare name `None`.
//
// Throws std::invalid_argument if `r` is not one of the known enumerators.
// Previously control could flow off the end of this non-void function,
// which is undefined behaviour.
template <typename T>
inline std::string make_op(typename T::ReduceType r, const std::string& dt) {
  switch (r) {
    case T::None:
      return "None";
    case T::Sum:
      return "Sum<" + dt + ">";
    case T::Prod:
      return "Prod<" + dt + ">";
    case T::Max:
      return "Max<" + dt + ">";
    case T::Min:
      return "Min<" + dt + ">";
  }
  throw std::invalid_argument("[make_op] Unknown reduce type.");
}

// GPU implementation of Gather: reads slices of `inputs[0]` at positions
// given by the index arrays `inputs[1..]` and writes them to `out`.
//
// Two kernel paths exist:
//  - `gather_front`: used for a row-contiguous source indexed by a single
//    row-contiguous index array along axis 0, when each slice spans exactly
//    one stride-0 step of the source.
//  - the general `gather` kernel, specialized on the number of index arrays
//    and their rank.
//
// Throws std::runtime_error when more than METAL_MAX_INDEX_ARRAYS index
// arrays are supplied.
void Gather::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& src = inputs[0];
  int nidx = inputs.size() - 1;

  if (nidx > METAL_MAX_INDEX_ARRAYS) {
    std::ostringstream msg;
    msg << "[Gather::eval_gpu] Gathering with more than "
        << METAL_MAX_INDEX_ARRAYS << " index arrays not yet supported.";
    throw std::runtime_error(msg.str());
  }

  out.set_data(allocator::malloc(out.nbytes()));
  // Nothing to launch for an empty output.
  if (out.size() == 0) {
    return;
  }

  auto& s = stream();
  auto& d = metal::device(s.device);

  // Number of elements in one gathered slice.
  size_t slice_size = 1;
  for (auto s : slice_sizes_) {
    slice_size *= s;
  }

  // Use 64-bit offsets in the kernel when any size exceeds INT32_MAX.
  bool large_index = nidx && inputs[1].size() > INT32_MAX;
  bool large_src = src.size() > INT32_MAX;
  bool large_out = out.size() > INT32_MAX;
  bool large = large_index || large_src || large_out;

  std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";

  // Fast path: single contiguous index array over the leading axis.
  if (src.flags().row_contiguous && nidx == 1 && axes_[0] == 0 &&
      inputs[1].flags().row_contiguous && slice_size == src.strides()[0]) {
    // Two elements per thread for sub-4-byte types with slices longer than 8.
    int work_per_thread = (slice_size > 8 && src.dtype().size() < 4) ? 2 : 1;
    auto& indices = inputs[1];
    std::string kernel_name = fmt::format(
        "gather_front{0}_{1}_{2}_{3}",
        type_to_name(out),
        idx_type_name,
        large ? "int64_t" : "int",
        work_per_thread);
    std::string lib_name = kernel_name;

    auto lib = d.get_library(lib_name, [&]() {
      std::string kernel_source = metal::utils();
      kernel_source += metal::gather_front();
      kernel_source += get_template_definition(
          kernel_name,
          "gather_front",
          get_type_string(out.dtype()),
          get_type_string(indices.dtype()),
          large ? "int64_t" : "int",
          work_per_thread);

      return kernel_source;
    });

    auto& compute_encoder = d.get_command_encoder(s.index);
    auto kernel = d.get_kernel(kernel_name, lib);
    compute_encoder.set_compute_pipeline_state(kernel);

    // Grid: x covers one slice (ceil-divided by work_per_thread), y covers
    // the indices.
    size_t dim_x = (slice_size + work_per_thread - 1) / work_per_thread;
    size_t dim_y = indices.size();
    auto group_dims = get_block_dims(dim_x, dim_y, 1);
    MTL::Size grid_dims = MTL::Size(dim_x, dim_y, 1);

    compute_encoder.set_input_array(src, 0);
    compute_encoder.set_input_array(indices, 1);
    compute_encoder.set_output_array(out, 2);
    compute_encoder.set_bytes(slice_size, 3);
    compute_encoder.set_bytes(src.shape(0), 4);
    compute_encoder.dispatch_threads(grid_dims, group_dims);

    return;
  }

  // General path.
  int idx_ndim = nidx ? inputs[1].ndim() : 0;
  size_t ndim = src.ndim();

  std::string kernel_name = fmt::format(
      "gather{0}{1}_{2}_{3}_{4}",
      type_to_name(out),
      idx_type_name,
      nidx,
      idx_ndim,
      large ? "int64_t" : "int");
  std::string lib_name = kernel_name;

  auto lib = d.get_library(lib_name, [&]() {
    std::string kernel_source = metal::utils();
    kernel_source += metal::gather();
    std::string out_type_str = get_type_string(out.dtype());
    // "bool" is a placeholder index type when there are no index arrays.
    std::string idx_type_str =
        nidx ? get_type_string(inputs[1].dtype()) : "bool";
    auto [idx_args, idx_arr] = make_index_args(idx_type_str, nidx);

    // Index dimension specializations
    kernel_source += fmt::format(
        gather_kernels,
        type_to_name(out) + idx_type_name,
        out_type_str,
        idx_type_str,
        nidx,
        idx_args,
        idx_arr,
        idx_ndim,
        large ? "int64_t" : "int");
    return kernel_source;
  });

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kernel_name, lib);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Launch 3D grid of threads
  // First two dimensions for the indices, the last one for the slice
  size_t dim0 = 1;
  size_t dim1 = 1;
  if (nidx) {
    if (inputs[1].ndim() >= 1) {
      dim0 = inputs[1].shape(0);
    }
    if (inputs[1].ndim() >= 2) {
      // All trailing index dimensions are folded into dim1.
      dim1 = inputs[1].size() / dim0;
    }
  }
  size_t dim2 = slice_size;
  auto group_dims = get_block_dims(dim0, dim1, dim2);
  MTL::Size grid_dims = MTL::Size(dim0, dim1, dim2);

  // Collect all idx shapes and strides into one place
  std::vector<int> idx_shapes;
  std::vector<size_t> idx_strides;
  std::vector<char> idx_contigs;
  for (int i = 0; i < nidx; ++i) {
    idx_shapes.insert(
        idx_shapes.end(),
        inputs[i + 1].shape().begin(),
        inputs[i + 1].shape().end());
    idx_strides.insert(
        idx_strides.end(),
        inputs[i + 1].strides().begin(),
        inputs[i + 1].strides().end());
    idx_contigs.push_back(inputs[i + 1].flags().row_contiguous);
  }

  // Set all the buffers
  compute_encoder.set_input_array(src, 0);
  compute_encoder.set_output_array(out, 1);

  // Set source info
  compute_encoder.set_vector_bytes(src.shape(), 2);
  compute_encoder.set_vector_bytes(src.strides(), 3);
  compute_encoder.set_bytes(ndim, 4);
  compute_encoder.set_vector_bytes(slice_sizes_, 5);
  compute_encoder.set_vector_bytes(axes_, 6);

  // Set index info
  //
  // We don't need to check for empty idx_shapes because gather has a
  // idx_ndim == 0 specialization
  compute_encoder.set_vector_bytes(idx_shapes, 7);
  compute_encoder.set_vector_bytes(idx_strides, 8);
  compute_encoder.set_vector_bytes(idx_contigs, 9);
  compute_encoder.set_bytes(idx_ndim, 10);

  // Set index buffers
  // Index arrays start at buffer slot 20, matching make_index_args.
  for (int i = 0; i < nidx; ++i) {
    compute_encoder.set_input_array(inputs[i + 1], 20 + i);
  }

  // Launch grid
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// GPU implementation of Scatter: copies `inputs[0]` into `out`, then applies
// the updates in `inputs.back()` at the positions given by the index arrays
// `inputs[1..nidx]`, combining with `reduce_type_`.
//
// Throws std::invalid_argument for 8-byte output dtypes and
// std::runtime_error when more than METAL_MAX_INDEX_ARRAYS index arrays are
// supplied or when the kernel's max threadgroup size is not 1024.
void Scatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  if (size_of(out.dtype()) == 8) {
    std::ostringstream msg;
    msg << "[Scatter::eval_gpu] Does not support " << out.dtype();
    throw std::invalid_argument(msg.str());
  }

  int nidx = axes_.size();
  if (nidx > METAL_MAX_INDEX_ARRAYS) {
    std::ostringstream msg;
    // The message previously said "Gathering" (copied from Gather::eval_gpu).
    msg << "[Scatter::eval_gpu] Scattering with more than "
        << METAL_MAX_INDEX_ARRAYS << " index arrays not yet supported.";
    throw std::runtime_error(msg.str());
  }

  // Copy src into out
  CopyType copy_type;
  if (inputs[0].data_size() == 1) {
    copy_type = CopyType::Scalar;
  } else if (inputs[0].flags().row_contiguous) {
    copy_type = CopyType::Vector;
  } else {
    copy_type = CopyType::General;
  }
  copy_gpu(inputs[0], out, copy_type);

  auto& upd = inputs.back();

  // Empty update
  if (upd.size() == 0) {
    return;
  }

  // Get stream
  auto& s = stream();
  auto& d = metal::device(s.device);

  int idx_ndim = nidx ? inputs[1].ndim() : 0;
  size_t idx_size = nidx ? inputs[1].size() : 1;

  // Pick `nwork` from the ratio of index count to output size. It is passed
  // to the kernel template and used below to shrink the grid's y dimension;
  // presumably it is the number of updates handled per thread.
  auto idx_to_out = idx_size / out.size();
  int nwork;
  if (idx_ndim <= 1 || idx_to_out < 1) {
    nwork = 1;
  } else if (idx_to_out <= 4) {
    nwork = 4;
  } else if (idx_to_out < 16) {
    nwork = 8;
  } else if (idx_to_out < 32) {
    nwork = 16;
  } else {
    nwork = 32;
  }

  std::string idx_type_name = nidx ? type_to_name(inputs[1]) : "";
  std::string op_name;
  switch (reduce_type_) {
    case Scatter::None:
      op_name = "none";
      break;
    case Scatter::Sum:
      op_name = "sum";
      break;
    case Scatter::Prod:
      op_name = "prod";
      break;
    case Scatter::Max:
      op_name = "max";
      break;
    case Scatter::Min:
      op_name = "min";
      break;
  }
  auto upd_contig = upd.flags().row_contiguous;
  // Use 64-bit offsets in the kernel when any size exceeds INT32_MAX.
  bool large_out = out.size() > INT32_MAX;
  bool large_idx = nidx && (inputs[1].size() > INT32_MAX);
  bool large_upd = upd.size() > INT32_MAX;
  bool large = large_out || large_idx || large_upd;
  std::string kernel_name = fmt::format(
      "scatter{0}{1}_{2}_{3}_{4}_nwork{5}_{6}",
      type_to_name(out),
      idx_type_name,
      op_name,
      nidx,
      upd_contig ? "updc_true" : "updc_false",
      nwork,
      large ? "int64_t" : "int");
  std::string lib_name = kernel_name;

  auto lib = d.get_library(lib_name, [&]() {
    std::string kernel_source = metal::utils();
    concatenate(kernel_source, metal::reduce_utils(), metal::scatter());

    std::string out_type_str = get_type_string(out.dtype());
    // "bool" is a placeholder index type when there are no index arrays.
    std::string idx_type_str =
        nidx ? get_type_string(inputs[1].dtype()) : "bool";
    std::string op_type = make_op<Scatter>(reduce_type_, out_type_str);
    auto [idx_args, idx_arr] = make_index_args(idx_type_str, nidx);

    kernel_source += fmt::format(
        scatter_kernels,
        type_to_name(out) + idx_type_name + "_" + op_name,
        out_type_str,
        idx_type_str,
        op_type,
        nidx,
        idx_args,
        idx_arr,
        upd_contig,
        nwork,
        large ? "int64_t" : "int");
    return kernel_source;
  });

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kernel_name, lib);

  size_t nthreads = upd.size();

  compute_encoder.set_compute_pipeline_state(kernel);

  // Set all the buffers
  compute_encoder.set_input_array(upd, 1);
  compute_encoder.set_output_array(out, 2);

  // Set update info
  // upd_size is the number of elements in one update slice, i.e. the product
  // of the update dimensions that follow the leading index dimensions.
  size_t upd_ndim = upd.ndim();
  size_t upd_size = 1;
  for (int i = idx_ndim; i < upd.ndim(); ++i) {
    upd_size *= upd.shape(i);
  }
  // Collect all idx shapes and strides into one place
  Shape idx_shapes;
  Strides idx_strides;
  // To access .data() use char instead of bool
  // bool is 1 byte in Metal so this is safe
  std::vector<char> idx_contigs;
  for (int i = 0; i < nidx; ++i) {
    idx_shapes.insert(
        idx_shapes.end(),
        inputs[i + 1].shape().begin(),
        inputs[i + 1].shape().end());
    idx_strides.insert(
        idx_strides.end(),
        inputs[i + 1].strides().begin(),
        inputs[i + 1].strides().end());
    idx_contigs.push_back(inputs[i + 1].flags().row_contiguous);
  }

  if (upd_ndim == 0) {
    // Need placeholders so Metal doesn't complain
    int shape_ = 0;
    int64_t stride_ = 0;
    compute_encoder.set_bytes(shape_, 3);
    compute_encoder.set_bytes(stride_, 4);
  } else {
    compute_encoder.set_vector_bytes(upd.shape(), 3);
    compute_encoder.set_vector_bytes(upd.strides(), 4);
  }
  compute_encoder.set_bytes(upd_ndim, 5);
  compute_encoder.set_bytes(upd_size, 6);

  // Set output info
  size_t out_ndim = out.ndim();
  if (out_ndim == 0) {
    // Need placeholders so Metal doesn't complain
    int shape_ = 0;
    int64_t stride_ = 0;
    compute_encoder.set_bytes(shape_, 7);
    compute_encoder.set_bytes(stride_, 8);
  } else {
    compute_encoder.set_vector_bytes(out.shape(), 7);
    compute_encoder.set_vector_bytes(out.strides(), 8);
  }
  compute_encoder.set_bytes(out_ndim, 9);
  compute_encoder.set_vector_bytes(axes_, 10);

  // Set index info
  if (idx_ndim == 0) {
    // Add a 0 in idx_shapes and strides to avoid the missing buffer binding
    // error in the metal API.
    idx_shapes.push_back(0);
    idx_strides.push_back(0);
    idx_contigs.push_back(false);
  }
  compute_encoder.set_vector_bytes(idx_shapes, 11);
  compute_encoder.set_vector_bytes(idx_strides, 12);
  compute_encoder.set_vector_bytes(idx_contigs, 13);
  compute_encoder.set_bytes(idx_ndim, 14);
  compute_encoder.set_bytes(idx_size, 15);

  // Set index buffers
  // Index arrays start at buffer slot 20, matching make_index_args.
  for (int i = 0; i < nidx; ++i) {
    compute_encoder.set_input_array(inputs[i + 1], 20 + i);
  }

  // Launch grid
  // x covers one update slice; y covers the slices, ceil-divided by nwork.
  auto grid_y = (nthreads / upd_size);
  grid_y = (grid_y + nwork - 1) / nwork;
  MTL::Size grid_dims = MTL::Size(upd_size, grid_y, 1);
  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (thread_group_size != 1024) {
    throw std::runtime_error("[Scatter::eval_gpu] Invalid number of threads");
  }
  MTL::Size group_dims = get_block_dims(upd_size, grid_y, 1);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// GPU implementation of GatherAxis: gathers from `inputs[0]` (src) along
// `axis_` using the index array `inputs[1]` (idx) and writes to `out`.
//
// One library is built per (out type, idx type, offset width) and holds four
// kernels, one for each combination of src/idx row-contiguity; the matching
// one is selected through the "c"/"nc" suffixes on the kernel name.
void GatherAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& src = inputs[0];
  auto& idx = inputs[1];

  out.set_data(allocator::malloc(out.nbytes()));
  // Nothing to launch for an empty output.
  if (out.size() == 0) {
    return;
  }

  auto& s = stream();
  auto& d = metal::device(s.device);

  size_t ndim = src.ndim();

  // Use 64-bit offsets in the kernel when any size exceeds INT32_MAX.
  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;

  std::string kernel_name = fmt::format(
      "gather_axis{0}{1}_{2}",
      type_to_name(out),
      type_to_name(idx),
      large ? "int64_t" : "int");
  // The library name omits the contiguity suffixes appended below.
  std::string lib_name = kernel_name;
  kernel_name += src.flags().row_contiguous ? "c" : "nc";
  kernel_name += idx.flags().row_contiguous ? "c" : "nc";

  auto lib = d.get_library(lib_name, [&]() {
    std::string kernel_source = metal::utils();
    kernel_source += metal::gather_axis();
    std::string out_type_str = get_type_string(out.dtype());
    std::string idx_type_str = get_type_string(idx.dtype());
    // Instantiate all four (src contiguous, idx contiguous) combinations.
    for (int i = 0; i < 4; ++i) {
      bool sc = i & 1;
      bool ic = i & 2;
      kernel_source += get_template_definition(
          lib_name + (sc ? "c" : "nc") + (ic ? "c" : "nc"),
          "gather_axis",
          out_type_str,
          idx_type_str,
          large ? "int64_t" : "int",
          sc ? "true" : "false",
          ic ? "true" : "false");
    }
    return kernel_source;
  });

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kernel_name, lib);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Grid [size post, index size, size pre]
  // size_pre / size_post are the products of idx dimensions before / after
  // `axis_`.
  size_t size_pre = 1;
  size_t size_post = 1;
  for (int i = 0; i < axis_; ++i) {
    size_pre *= idx.shape(i);
  }
  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
    size_post *= idx.shape(i);
  }

  int idx_ax_size = idx.shape(axis_);
  auto group_dims = get_block_dims(size_post, idx_ax_size, size_pre);
  MTL::Size grid_dims = MTL::Size(size_post, idx_ax_size, size_pre);

  // Set all the buffers
  compute_encoder.set_input_array(src, 0);
  compute_encoder.set_input_array(idx, 1);
  compute_encoder.set_output_array(out, 2);

  // Set source info
  // Shapes and strides are passed with the gather axis removed; the axis'
  // own size and strides are passed separately in slots 8-10.
  compute_encoder.set_vector_bytes(remove_index(idx.shape(), axis_), 3);
  compute_encoder.set_vector_bytes(remove_index(src.strides(), axis_), 4);
  compute_encoder.set_vector_bytes(remove_index(idx.strides(), axis_), 5);
  compute_encoder.set_bytes(ndim - 1, 6);
  compute_encoder.set_bytes(axis_, 7);
  compute_encoder.set_bytes(src.shape(axis_), 8);
  compute_encoder.set_bytes(src.strides(axis_), 9);
  compute_encoder.set_bytes(idx.strides(axis_), 10);

  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

void ScatterAxis::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& src = inputs[0];
  auto& idx = inputs[1];
  auto& upd = inputs[2];

  // Copy src into out
  CopyType copy_type;
  if (src.data_size() == 1) {
    copy_type = CopyType::Scalar;
  } else if (src.flags().row_contiguous) {
    copy_type = CopyType::Vector;
  } else {
    copy_type = CopyType::General;
  }
  copy_gpu(src, out, copy_type);

  // Empty update
  if (upd.size() == 0) {
    return;
  }

  auto& s = stream();
  auto& d = metal::device(s.device);

  size_t ndim = src.ndim();

  bool large = idx.size() > INT32_MAX || src.size() > INT32_MAX;

  std::string op_name;
  switch (reduce_type_) {
    case ScatterAxis::None:
      op_name = "none";
      break;
    case ScatterAxis::Sum:
      op_name = "sum";
      break;
  }

  std::string kernel_name = fmt::format(
      "scatter_axis{0}{1}_{2}_{3}",
      type_to_name(out),
      type_to_name(idx),
      op_name,
      large ? "int64_t" : "int");
  std::string lib_name = kernel_name;
  kernel_name += upd.flags().row_contiguous ? "c" : "nc";
  kernel_name += idx.flags().row_contiguous ? "c" : "nc";

  auto lib = d.get_library(lib_name, [&]() {
    std::string kernel_source = metal::utils();
    kernel_source += metal::reduce_utils();
    kernel_source += metal::scatter_axis();
    std::string out_type_str = get_type_string(out.dtype());
    std::string idx_type_str = get_type_string(idx.dtype());
    std::string op_type;
    switch (reduce_type_) {
      case ScatterAxis::None:
        op_type = "None";
        break;
      case ScatterAxis::Sum:
        op_type = "Sum<" + out_type_str + ">";
        break;
    }

    for (int i = 0; i < 4; ++i) {
      bool uc = i & 1;
      bool ic = i & 2;
      kernel_source += get_template_definition(
          lib_name + (uc ? "c" : "nc") + (ic ? "c" : "nc"),
          "scatter_axis",
          out_type_str,
          idx_type_str,
          large ? "int64_t" : "int",
          op_type,
          uc ? "true" : "false",
          ic ? "true" : "false");
    }
    return kernel_source;
  });

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kernel_name, lib);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Grid [size post, index size, size pre]
  size_t size_pre = 1;
  size_t size_post = 1;
  for (int i = 0; i < axis_; ++i) {
    size_pre *= idx.shape(i);
  }
  for (int i = axis_ + 1; i < idx.ndim(); ++i) {
    size_post *= idx.shape(i);
  }

  int idx_ax_size = idx.shape(axis_);
  auto group_dims = get_block_dims(size_post, idx_ax_size, size_pre);
  MTL::Size grid_dims = MTL::Size(size_post, idx_ax_size, size_pre);

  // Set all the buffers
  compute_encoder.set_input_array(upd, 0);
  compute_encoder.set_input_array(idx, 1);
  compute_encoder.set_output_array(out, 2);

  // Set source info
  if (ndim > 1) {
    compute_encoder.set_vector_bytes(remove_index(idx.shape(), axis_), 3);
    compute_encoder.set_vector_bytes(remove_index(upd.strides(), axis_), 4);
    compute_encoder.set_vector_bytes(remove_index(idx.strides(), axis_), 5);
  } else {
    // The following will be ignored in the kernel but we still have to set
    // some value so that metal validation passes.
    compute_encoder.set_vector_bytes(idx.shape(), 3);
    compute_encoder.set_vector_bytes(upd.strides(), 4);
    compute_encoder.set_vector_bytes(idx.strides(), 5);
  }
  compute_encoder.set_bytes(ndim - 1, 6);
  compute_encoder.set_bytes(axis_, 7);
  compute_encoder.set_bytes(out.shape(axis_), 8);
  compute_encoder.set_bytes(upd.strides(axis_), 9);
  compute_encoder.set_bytes(idx.strides(axis_), 10);

  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

void MaskedScatter::eval_gpu(const std::vector<array>& inputs, array& out) {
  const array& dst = inputs[0];
  const array& mask = inputs[1];
  const array& src = inputs[2];

  auto& s = stream();
  auto& d = metal::device(s.device);

  const size_t total = mask.size();
  const CopyType ct = (total == 1)
      ? CopyType::Scalar
      : (dst.flags().row_contiguous ? CopyType::Vector : CopyType::General);
  copy_gpu(dst, out, ct, s);
  if (total == 0) {
    return;
  }

  array mask_flat = flatten_in_eval(mask, 1, -1, s);
  if (mask_flat.data<void>() != mask.data<void>()) {
    d.add_temporary(mask_flat, s.index);
  }

  if (!mask_flat.flags().row_contiguous) {
    mask_flat = contiguous_copy_gpu(mask_flat, s);
    d.add_temporary(mask_flat, s.index);
  }

  // Prefix (exclusive) of mask → scatter_offsets
  array scatter_offsets(mask_flat.shape(), uint32, nullptr, {});
  scatter_offsets.set_data(allocator::malloc(scatter_offsets.nbytes()));
  d.add_temporary(scatter_offsets, s.index);

  scan_gpu_inplace(
      mask_flat,
      scatter_offsets,
      Scan::Sum,
      /*axis=*/1,
      /*reverse=*/false,
      /*inclusive=*/false,
      s);

  // Kernel selection/build
  static constexpr std::string_view kBaseName = "masked_assign";
  const std::string dtype_tag = type_to_name(out.dtype());
  const std::string value_type = get_type_string(out.dtype());
  const std::string contiguous =
      (src.flags().row_contiguous) ? "true" : "false";
  const std::string kernel_name =
      fmt::format("{}_{}_{}", kBaseName, dtype_tag, contiguous);

  auto lib = d.get_library(kernel_name, [&]() {
    std::string source = metal::utils();
    source += metal::masked_scatter();
    source +=
        fmt::format(masked_assign_kernel, kernel_name, value_type, contiguous);
    return source;
  });
  auto kernel = d.get_kernel(kernel_name, lib);

  // Binding
  int bind_idx = 0;
  const int ndim = static_cast<int>(src.ndim());
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(mask_flat, bind_idx++);
  compute_encoder.set_input_array(scatter_offsets, bind_idx++);
  compute_encoder.set_input_array(src, bind_idx++);
  compute_encoder.set_output_array(out, bind_idx++);
  compute_encoder.set_vector_bytes(src.shape(), bind_idx++);
  compute_encoder.set_vector_bytes(src.strides(), bind_idx++);
  compute_encoder.set_bytes(ndim, bind_idx++);
  compute_encoder.set_bytes(src.size() / src.shape(0), bind_idx++);
  compute_encoder.set_bytes(mask_flat.size() / mask.shape(0), bind_idx++);

  // Dispatch
  auto group_dims = get_block_dims(total, 1, 1);
  MTL::Size grid_dims(total, 1, 1);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Metal evaluation of SliceUpdate.
//
// inputs[0] is the array being updated and inputs[1] the update values.
// `out` receives a copy of the input, after which the updates are written
// into the slice described by `start_indices_` / `strides_`:
//   - with no reduction, through a general strided copy;
//   - otherwise through a JIT-built `slice_update_op_impl` kernel that is
//     specialized on the reduction, dtype, index width, contiguity flags,
//     work per thread and (for up to three collapsed dims) rank.
void SliceUpdate::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  // Empty output: only an (empty) allocation is needed.
  if (out.size() == 0) {
    out.set_data(allocator::malloc(0));
    return;
  }

  auto& in = inputs[0];
  auto& upd = inputs[1];

  // Nothing to write: the output can share the input's buffer.
  if (upd.size() == 0) {
    out.copy_shared_buffer(in);
    return;
  }

  // Copy the input into the output before applying the updates.
  auto ctype = in.flags().contiguous && in.size() == in.data_size()
      ? CopyType::Vector
      : CopyType::General;
  copy_gpu(in, out, in.data_size() == 1 ? CopyType::Scalar : ctype, stream());
  // Offset of the slice start within `out` and the strides to walk it with.
  auto [data_offset, out_strides] =
      prepare_slice(out, start_indices_, strides_);

  // Plain assignment: a strided copy of the updates into the slice suffices.
  if (reduce_type_ == SliceUpdate::None) {
    copy_gpu_inplace(
        /* const array& src = */ upd,
        /* array& dst = */ out,
        /* const Shape& data_shape = */ upd.shape(),
        /* const Strides& i_strides = */ upd.strides(),
        /* const Strides& o_strides = */ out_strides,
        /* int64_t i_offset = */ 0,
        /* int64_t o_offset = */ data_offset,
        /* CopyType ctype = */ CopyType::GeneralGeneral,
        /* const Stream& s = */ stream());
    return;
  }

  // Reduction tag used in the kernel name. `None` has already returned above
  // and is listed only to keep the switch exhaustive.
  std::string op_name;
  switch (reduce_type_) {
    case SliceUpdate::None:
      op_name = "none";
      break;
    case SliceUpdate::Sum:
      op_name = "sum";
      break;
    case SliceUpdate::Prod:
      op_name = "prod";
      break;
    case SliceUpdate::Max:
      op_name = "max";
      break;
    case SliceUpdate::Min:
      op_name = "min";
      break;
  }

  bool upd_contiguous = upd.flags().row_contiguous;
  bool upd_scalar = upd.data_size() == 1;

  // Collapse dims that are contiguous in both operands. For a scalar update
  // only the output strides matter, so they are passed for both slots.
  Shape shape;
  std::vector<Strides> strides;
  if (upd_scalar) {
    std::tie(shape, strides) =
        collapse_contiguous_dims(upd.shape(), {out_strides, out_strides});
  } else {
    std::tie(shape, strides) =
        collapse_contiguous_dims(upd.shape(), {upd.strides(), out_strides});
  }

  // Ranks above 3 use the generic specialization (rank constant 0).
  int ndim_constant = shape.size();
  if (ndim_constant > 3) {
    ndim_constant = 0;
  }

  // Elements handled per thread along the innermost dim: the largest of
  // 4 / 2 / 1 that divides it. The launch code below allows for a collapsed
  // shape with no dims, so guard the lookup: `back()` on an empty vector is
  // undefined behaviour. One element per thread is used in that case.
  int nwork = 1;
  if (!shape.empty()) {
    if (shape.back() % 4 == 0) {
      nwork = 4;
    } else if (shape.back() % 2 == 0) {
      nwork = 2;
    }
  }

  // Only the row-contiguity result of the output strides is used.
  auto [ds, rc, cc] = check_contiguity(shape, strides[1]);
  bool out_contiguous = rc;
  // NOTE(review): only the update size selects 64-bit indexing here; confirm
  // that output locations computed in the kernel cannot exceed 32 bits.
  bool large = upd.size() > INT32_MAX;
  std::string kernel_name = fmt::format(
      "slice_update_{0}_{1}{2}_{3}_{4}_{5}_nw{6}_nd{7}",
      op_name,
      type_to_name(out),
      large ? "int64_t" : "int",
      out_contiguous ? "oc_true" : "oc_false",
      upd_contiguous ? "updc_true" : "updc_false",
      upd_scalar ? "upds_true" : "upds_false",
      nwork,
      ndim_constant);

  auto& s = stream();
  auto& d = metal::device(s.device);

  // The library is keyed by the kernel name and holds that one
  // instantiation.
  auto lib = d.get_library(kernel_name, [&]() {
    std::string kernel_source = metal::utils();
    concatenate(kernel_source, metal::reduce_utils(), metal::scatter());

    std::string out_type = get_type_string(out.dtype());
    std::string op_type = make_op<SliceUpdate>(reduce_type_, out_type);

    kernel_source += fmt::format(
        slice_update_op_kernel,
        kernel_name,
        out_type,
        large ? "int64_t" : "int",
        op_type,
        out_contiguous,
        upd_contiguous,
        upd_scalar,
        nwork,
        ndim_constant);

    return kernel_source;
  });

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kernel_name, lib);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Set all the buffers
  int ndim = shape.size();
  int64_t size = upd.size();
  compute_encoder.set_input_array(upd, 0);
  compute_encoder.set_output_array(out, 1);
  compute_encoder.set_vector_bytes(shape, 2);
  compute_encoder.set_vector_bytes(strides[0], 3);
  compute_encoder.set_bytes(ndim, 4);
  compute_encoder.set_bytes(size, 5);
  compute_encoder.set_vector_bytes(strides[1], 6);
  compute_encoder.set_bytes(data_offset, 7);

  // Launch grid: innermost dim (divided by the per-thread work), the next
  // dim, and everything else folded into the third grid dimension.
  int64_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
  int64_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
  int64_t rest = size / (dim0 * dim1);
  dim0 /= nwork;

  auto group_dims = get_block_dims(dim0, dim1, rest);
  MTL::Size grid_dims(dim0, dim1, rest);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/jit/includes.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

// Accessors for the kernel source fragments used by the JIT path. Each one
// returns a C string and is only declared here; the definitions live
// elsewhere. The JIT builders start a kernel source with `utils()`, append
// one or more of the other fragments, and then add the template
// instantiations they need before handing the text to `Device::get_library`.
namespace mlx::core::metal {

// Fragments that callers combine with a kernel fragment below (for example
// `unary_ops()` with `unary()`, or `reduce_utils()` with `reduce()`).
const char* utils();
const char* binary_ops();
const char* unary_ops();
const char* ternary_ops();
const char* reduce_utils();
const char* gather();
const char* scatter();
const char* masked_scatter();

// Per-operation kernel fragments.
const char* arange();
const char* unary();
const char* binary();
const char* binary_two();
const char* copy();
const char* fft();
const char* gather_axis();
const char* gather_front();
const char* hadamard();
const char* logsumexp();
const char* quantized_utils();
const char* quantized();
const char* fp_quantized();
const char* ternary();
const char* scan();
const char* scatter_axis();
const char* softmax();
const char* sort();
const char* reduce();

// Matrix multiplication, convolution and attention fragments.
const char* gemm();
const char* steel_gemm_fused();
const char* steel_gemm_masked();
const char* steel_gemm_splitk();
const char* steel_gemm_gather();
const char* steel_gemm_segmented();
const char* conv();
const char* steel_conv();
const char* steel_conv_3d();
const char* steel_conv_general();
const char* gemv_masked();
const char* steel_attention();

// `_nax` counterparts of some of the fragments above.
// NOTE(review): what the `_nax` suffix denotes is not evident from this
// header — confirm against the definitions.
const char* gemm_nax();
const char* steel_gemm_fused_nax();
const char* steel_gemm_gather_nax();
const char* steel_gemm_splitk_nax();

const char* quantized_nax();
const char* fp_quantized_nax();

const char* steel_attention_nax();

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/jit/indexing.h
================================================
// Copyright © 2023-2024 Apple Inc.

// Source template for a `gather` kernel entry point that builds an `Indices`
// value and forwards everything to `gather_impl`.
//
// `{N}` are numbered substitution slots and `{{` / `}}` stand for literal
// braces. NOTE(review): presumably filled in with fmt::format by the gather
// dispatch code — the call site is not shown here, so the meaning of each
// slot below is read off the template text only:
//   {0}, {3}, {6}, {7}      parts of the kernel name
//   {1}                     element type of `src` and `out`
//   {2}, {3}                template arguments of `Indices`
//   {4}                     text spliced into the parameter list after
//                           `idx_ndim`
//   {5}                     contents of the braced first member of `Indices`
//   {1}, {2}, {3}, {6}, {7} template arguments of `gather_impl`
constexpr std::string_view gather_kernels = R"(
[[kernel]] void gather{0}_{3}_{6}_{7}(
    const device {1}* src [[buffer(0)]],
    device {1}* out [[buffer(1)]],
    const constant int* src_shape [[buffer(2)]],
    const constant int64_t* src_strides [[buffer(3)]],
    const constant size_t& src_ndim [[buffer(4)]],
    const constant int* slice_sizes [[buffer(5)]],
    const constant int* axes [[buffer(6)]],
    const constant int* idx_shapes [[buffer(7)]],
    const constant int64_t* idx_strides [[buffer(8)]],
    const constant bool* idx_contigs [[buffer(9)]],
    const constant int& idx_ndim [[buffer(10)]],
    {4}
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {{
  Indices<{2}, {3}> idxs{{
    {{ {5} }}, idx_shapes, idx_strides, idx_contigs, idx_ndim}};

  return gather_impl<{1}, {2}, {3}, {6}, {7}>(
      src,
      out,
      src_shape,
      src_strides,
      src_ndim,
      slice_sizes,
      axes,
      idxs,
      index,
      grid_dim);
}}
)";

// Source template for a `scatter` kernel entry point that builds an
// `Indices` value and forwards everything to `scatter_impl`. The output is
// declared as `mlx_atomic<{1}>`.
//
// `{N}` are numbered substitution slots and `{{` / `}}` stand for literal
// braces. NOTE(review): presumably filled in with fmt::format by the scatter
// dispatch code — the call site is not shown here, so the meaning of each
// slot below is read off the template text only:
//   {0}, {4}, {7}, {8}, {9} parts of the kernel name
//   {1}                     element type of `updates` and of the atomic `out`
//   {2}, {4}                template arguments of `Indices`
//   {5}                     text spliced into the parameter list after
//                           `idx_size`
//   {6}                     contents of the braced first member of `Indices`
//   {1}, {2}, {3}, {4}, {7}, {8}, {9}
//                           template arguments of `scatter_impl`
// The fixed parameters start at buffer(1); nothing in this text binds
// buffer(0).
constexpr std::string_view scatter_kernels = R"(
[[kernel]] void scatter{0}_{4}_updc_{7}_nwork{8}_{9}(
    const device {1}* updates [[buffer(1)]],
    device mlx_atomic<{1}>* out [[buffer(2)]],
    const constant int* upd_shape [[buffer(3)]],
    const constant int64_t* upd_strides [[buffer(4)]],
    const constant size_t& upd_ndim [[buffer(5)]],
    const constant size_t& upd_size [[buffer(6)]],
    const constant int* out_shape [[buffer(7)]],
    const constant int64_t* out_strides [[buffer(8)]],
    const constant size_t& out_ndim [[buffer(9)]],
    const constant int* axes [[buffer(10)]],
    const constant int* idx_shapes [[buffer(11)]],
    const constant int64_t* idx_strides [[buffer(12)]],
    const constant bool* idx_contigs [[buffer(13)]],
    const constant int& idx_ndim [[buffer(14)]],
    const constant size_t& idx_size [[buffer(15)]],
    {5}
    uint2 gid [[thread_position_in_grid]]) {{
  Indices<{2}, {4}> idxs{{ {{ {6} }}, idx_shapes, idx_strides, idx_contigs, idx_ndim}};

  return scatter_impl<{1}, {2}, {3}, {4}, {7}, {8}, {9}>(
      updates,
      out,
      upd_shape,
      upd_strides,
      upd_ndim,
      upd_size,
      out_shape,
      out_strides,
      out_ndim,
      axes,
      idx_size,
      idxs,
      gid);
}}
)";

// Explicit instantiation of `masked_assign_impl<{1}, {2}>` exported under the
// host name {0}. MaskedScatter::eval_gpu formats this with the kernel name,
// the value type string and a "true"/"false" flag telling whether the source
// is row contiguous.
constexpr std::string_view masked_assign_kernel = R"(
template [[host_name("{0}")]] [[kernel]] decltype(masked_assign_impl<{1}, {2}>) masked_assign_impl<{1}, {2}>;
)";

// Explicit instantiation of `slice_update_op_impl` exported under the host
// name {0}. SliceUpdate::eval_gpu formats this with, in order: the kernel
// name {0}, output type string {1}, "int64_t" or "int" {2}, reduction op
// type {3}, output-contiguous flag {4}, update-contiguous flag {5},
// update-is-scalar flag {6}, work per thread {7} and the rank constant {8}.
constexpr std::string_view slice_update_op_kernel = R"(
template [[host_name("{0}")]]
[[kernel]] decltype(slice_update_op_impl<{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}>)
slice_update_op_impl<{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}>;
)";


================================================
FILE: mlx/backend/metal/jit_kernels.cpp
================================================
// Copyright © 2024 Apple Inc.
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/metal/jit/includes.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"

using namespace fmt::literals;

namespace mlx::core {

// Returns the pipeline state of the `arange` kernel called `kernel_name`.
// The library is keyed by the same name; its source is the shared utilities,
// the arange fragment and a single instantiation for the dtype of `out`.
MTL::ComputePipelineState* get_arange_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out) {
  auto make_source = [&]() {
    std::string source = metal::utils();
    source += metal::arange();
    source += get_template_definition(
        kernel_name, "arange", get_type_string(out.dtype()));
    return source;
  };
  auto library = d.get_library(kernel_name, make_source);
  return d.get_kernel(kernel_name, library);
}

// Returns the pipeline state of the unary kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It contains all
// unary variants for the (in_type, out_type, op) combination, each named
// `<variant>_<library name>`.
MTL::ComputePipelineState* get_unary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
    const char* op) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    const auto in_t = get_type_string(in_type);
    const auto out_t = get_type_string(out_type);

    std::string source = metal::utils();
    source += metal::unary_ops();
    source += metal::unary();

    // Appends `<prefix><lib_name>` instantiating `func<in_t, out_t, op,
    // extra...>`.
    auto add = [&](const char* prefix, const char* func, const auto&... extra) {
      source +=
          get_template_definition(prefix + lib_name, func, in_t, out_t, op, extra...);
    };

    add("v_", "unary_v", 1);
    // The "vn" variant only exists when more than one element is processed
    // per thread for this input type.
    if (get_work_per_thread(in_type) > 1) {
      add("vn_", "unary_v");
    }
    add("v2_", "unary_v2");
    add("gn1_", "unary_g", 1, "int");
    add("gn4large_", "unary_g", 4);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Appends to `kernel_source` the instantiations of every binary kernel
// variant for (in_type, out_type, op). Each one is named
// `<variant>_<lib_name>`.
void append_binary_kernels(
    const std::string& lib_name,
    Dtype in_type,
    Dtype out_type,
    const char* op,
    std::string& kernel_source) {
  const auto in_t = get_type_string(in_type);
  const auto out_t = get_type_string(out_type);

  // Appends `<prefix><lib_name>` instantiating `func<in_t, out_t, op,
  // extra...>`.
  auto add = [&](const char* prefix, const char* func, const auto&... extra) {
    kernel_source +=
        get_template_definition(prefix + lib_name, func, in_t, out_t, op, extra...);
  };

  // Variants that take no extra template arguments.
  add("ss_", "binary_ss");
  add("vs2_", "binary_vs2");
  add("sv2_", "binary_sv2");
  add("vv2_", "binary_vv2");
  add("g1large_", "binary_g_nd1");
  add("g2large_", "binary_g_nd2");
  add("g3large_", "binary_g_nd3");

  // Vector variants with an explicit trailing template argument of 1.
  add("vs_", "binary_vs", 1);
  add("sv_", "binary_sv", 1);
  add("vv_", "binary_vv", 1);

  // The "n" variants only exist when more than one element is processed per
  // thread for this input type.
  if (get_work_per_thread(in_type) > 1) {
    add("vsn_", "binary_vs");
    add("svn_", "binary_sv");
    add("vvn_", "binary_vv");
  }

  // General (strided) variants that pass "int" explicitly, then the "large"
  // one that does not.
  add("g1_", "binary_g_nd1", "int");
  add("g2_", "binary_g_nd2", "int");
  add("g3_", "binary_g_nd3", "int");
  add("gn2_", "binary_g", 2, "int");
  add("gn4large_", "binary_g", 4);
}

// Returns the pipeline state of the binary kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). Its source is
// the shared utilities, the binary op definitions, the binary kernels and
// every instantiation emitted by append_binary_kernels.
MTL::ComputePipelineState* get_binary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
    const char* op) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto make_source = [&]() {
    std::string source = metal::utils();
    concatenate(source, metal::binary_ops(), metal::binary());
    append_binary_kernels(lib_name, in_type, out_type, op, source);
    return source;
  };
  auto library = d.get_library(lib_name, make_source);
  return d.get_kernel(kernel_name, library);
}

// Same as get_binary_kernel, but the library is built from the `binary_two`
// kernel fragment instead of `binary`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_').
MTL::ComputePipelineState* get_binary_two_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
    const char* op) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto make_source = [&]() {
    std::string source = metal::utils();
    concatenate(source, metal::binary_ops(), metal::binary_two());
    append_binary_kernels(lib_name, in_type, out_type, op, source);
    return source;
  };
  auto library = d.get_library(lib_name, make_source);
  return d.get_kernel(kernel_name, library);
}

// Returns the pipeline state of the ternary kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It contains all
// ternary variants for (type, op), each named `<variant>_<library name>`.
MTL::ComputePipelineState* get_ternary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype type,
    const char* op) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    const auto t_str = get_type_string(type);

    std::string source = metal::utils();
    source += metal::ternary_ops();
    source += metal::ternary();

    // Appends `<prefix><lib_name>` instantiating `func<t_str, op, extra...>`.
    auto add = [&](const char* prefix, const char* func, const auto&... extra) {
      source +=
          get_template_definition(prefix + lib_name, func, t_str, op, extra...);
    };

    // Fixed-rank general variants without extra template arguments.
    add("g1large_", "ternary_g_nd1");
    add("g2large_", "ternary_g_nd2");
    add("g3large_", "ternary_g_nd3");

    // `ternary_v2` with the two boolean flags as (false, false),
    // (true, false) and (false, true).
    add("v2_", "ternary_v2", false, false);
    add("sv2_", "ternary_v2", true, false);
    add("vs2_", "ternary_v2", false, true);

    // The "n" variants only exist when more than one element is processed
    // per thread for this type.
    if (get_work_per_thread(type) > 1) {
      add("vn_", "ternary_v", false, false);
      add("svn_", "ternary_v", true, false);
      add("vsn_", "ternary_v", false, true);
    }

    // Same flag combinations with an explicit trailing argument of 1.
    add("v_", "ternary_v", false, false, 1);
    add("sv_", "ternary_v", true, false, 1);
    add("vs_", "ternary_v", false, true, 1);

    // General variants that pass "int" explicitly, then the "large" one that
    // does not.
    add("g1_", "ternary_g_nd1", "int");
    add("g2_", "ternary_g_nd2", "int");
    add("g3_", "ternary_g_nd3", "int");
    add("gn2_", "ternary_g", 2, "int");
    add("gn4large_", "ternary_g", 4);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state of the copy kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It contains all
// copy variants for the dtypes of `in` and `out`, each named
// `<variant>_<library name>`.
MTL::ComputePipelineState* get_copy_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    std::string source = metal::utils();
    source += metal::copy();

    const auto in_type = get_type_string(in.dtype());
    const auto out_type = get_type_string(out.dtype());

    // Appends `<prefix><lib_name>` instantiating `func<in_type, out_type,
    // extra...>`.
    auto add = [&](const char* prefix, const char* func, const auto&... extra) {
      source += get_template_definition(
          prefix + lib_name, func, in_type, out_type, extra...);
    };

    // Scalar and vector copies.
    add("s_", "copy_s", 1);
    add("s2_", "copy_s2");
    add("v_", "copy_v", 1);
    add("v2_", "copy_v2");

    // The "n" variants only exist when more than one element is processed
    // per thread for the output type.
    if (get_work_per_thread(out.dtype()) > 1) {
      add("sn_", "copy_s");
      add("vn_", "copy_v");
    }

    // General copies that pass "int" explicitly ("g" and "gg" families).
    add("g1_", "copy_g_nd1", "int");
    add("g2_", "copy_g_nd2", "int");
    add("g3_", "copy_g_nd3", "int");
    add("gn2_", "copy_g", 2, "int");
    add("gg1_", "copy_gg_nd1", "int");
    add("gg2_", "copy_gg_nd2", "int");
    add("gg3_", "copy_gg_nd3", "int");
    add("ggn2_", "copy_gg", 2, "int");

    // "large" counterparts, which do not pass that argument.
    add("g1large_", "copy_g_nd1");
    add("g2large_", "copy_g_nd2");
    add("g3large_", "copy_g_nd3");
    add("gn4large_", "copy_g", 4);
    add("gg1large_", "copy_gg_nd1");
    add("gg2large_", "copy_gg_nd2");
    add("gg3large_", "copy_gg_nd3");
    add("ggn4large_", "copy_gg", 4);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state of the dynamic copy kernel called
// `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It contains the
// `copy_gg_dynamic*` variants for the dtypes of `in` and `out`.
MTL::ComputePipelineState* get_dynamic_copy_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    std::string source = metal::utils();
    source += metal::copy();

    const auto in_type = get_type_string(in.dtype());
    const auto out_type = get_type_string(out.dtype());

    // Appends `<prefix><lib_name>` instantiating `func<in_type, out_type,
    // extra...>`.
    auto add = [&](const char* prefix, const char* func, const auto&... extra) {
      source += get_template_definition(
          prefix + lib_name, func, in_type, out_type, extra...);
    };

    // Variants that pass "int" explicitly.
    add("gg1_", "copy_gg_dynamic_nd1", "int");
    add("gg2_", "copy_gg_dynamic_nd2", "int");
    add("gg3_", "copy_gg_dynamic_nd3", "int");
    add("ggn2_", "copy_gg_dynamic", 2, "int");

    // "large" counterparts, which do not pass that argument.
    add("gg1large_", "copy_gg_dynamic_nd1");
    add("gg2large_", "copy_gg_dynamic_nd2");
    add("gg3large_", "copy_gg_dynamic_nd3");
    add("ggn4large_", "copy_gg_dynamic", 4);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state of the softmax kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It holds the
// "block" and "looped" variants for the dtype of `out`; the second template
// argument is float32 when `precise` is set and the dtype of `out` otherwise.
MTL::ComputePipelineState* get_softmax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    bool precise,
    const array& out) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto make_source = [&] {
    const auto in_type = get_type_string(out.dtype());
    const auto acc_type = get_type_string(precise ? float32 : out.dtype());

    std::string source = metal::utils();
    source += metal::softmax();
    source += get_template_definition(
        "block_" + lib_name, "softmax_single_row", in_type, acc_type);
    source += get_template_definition(
        "looped_" + lib_name, "softmax_looped", in_type, acc_type);
    return source;
  };
  auto library = d.get_library(lib_name, make_source);
  return d.get_kernel(kernel_name, library);
}

// Returns the pipeline state of the logsumexp kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It holds the
// "block" and "looped" variants for the dtype of `out`.
MTL::ComputePipelineState* get_logsumexp_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto make_source = [&] {
    const auto t_str = get_type_string(out.dtype());

    std::string source = metal::utils();
    source += metal::logsumexp();
    source += get_template_definition("block_" + lib_name, "logsumexp", t_str);
    source += get_template_definition(
        "looped_" + lib_name, "logsumexp_looped", t_str);
    return source;
  };
  auto library = d.get_library(lib_name, make_source);
  return d.get_kernel(kernel_name, library);
}

// Returns the pipeline state of the scan kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It holds the
// contiguous and strided scan variants for the given input/output dtypes,
// reduction and the `inclusive` / `reverse` flags.
MTL::ComputePipelineState* get_scan_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    bool reverse,
    bool inclusive,
    const std::string& reduce_type,
    const array& in,
    const array& out) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    const auto in_type = get_type_string(in.dtype());
    const auto out_type = get_type_string(out.dtype());

    // Op name is "Cum" + reduce_type with its first letter upper-cased,
    // parameterized on the output type.
    std::string op = "Cum" + reduce_type + "<" + out_type + ">";
    op[3] = toupper(op[3]);

    // Fourth template argument: 4 for items of at most 4 bytes, 2 otherwise.
    const int n_reads = in.itemsize() <= 4 ? 4 : 2;

    std::ostringstream kernel_source;
    kernel_source << metal::utils() << metal::scan();

    auto add = [&](const char* prefix, const char* func) {
      kernel_source << get_template_definition(
          prefix + lib_name,
          func,
          in_type,
          out_type,
          op,
          n_reads,
          inclusive,
          reverse);
    };
    add("contig_", "contiguous_scan");
    add("strided_", "strided_scan");
    return kernel_source.str();
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state of the block sort kernel called `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). For both the
// argsort and the plain sort flavour it holds a `block_sort` and a
// `block_sort_nc` instantiation with the given `bn` / `tn`.
MTL::ComputePipelineState* get_sort_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out,
    int bn,
    int tn) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    const auto in_type = get_type_string(in.dtype());
    const auto out_type = get_type_string(out.dtype());

    std::ostringstream kernel_source;
    kernel_source << metal::utils() << metal::sort();

    // Emits the `block_sort` kernel followed by its "n"-prefixed
    // `block_sort_nc` counterpart for one flavour.
    auto add_flavour = [&](bool is_argsort) {
      const std::string flag = is_argsort ? "true" : "false";
      const std::string prefix = is_argsort ? "carg_" : "c_";
      kernel_source << get_template_definition(
          prefix + lib_name, "block_sort", in_type, out_type, flag, bn, tn);
      kernel_source << get_template_definition(
          "n" + prefix + lib_name,
          "block_sort_nc",
          in_type,
          out_type,
          flag,
          bn,
          tn);
    };
    add_flavour(true);
    add_flavour(false);
    return kernel_source.str();
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state of the multi-block sort kernel called
// `kernel_name`.
//
// The library is keyed by `kernel_name` with everything up to and including
// its first '_' removed (the whole name when it has no '_'). It holds the
// sort, partition and merge kernels, each instantiated with the dtypes of
// `in` and `idx`, "true", and the given `bn` / `tn`.
MTL::ComputePipelineState* get_mb_sort_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& idx,
    int bn,
    int tn) {
  const std::string lib_name = kernel_name.substr(kernel_name.find("_") + 1);
  auto lib = d.get_library(lib_name, [&]() {
    std::ostringstream kernel_source;
    kernel_source << metal::utils() << metal::sort();

    auto add = [&](const char* prefix, const char* func) {
      kernel_source << get_template_definition(
          prefix + lib_name,
          func,
          get_type_string(in.dtype()),
          get_type_string(idx.dtype()),
          "true",
          bn,
          tn);
    };
    add("sort_", "mb_block_sort");
    add("partition_", "mb_block_partition");
    add("merge_", "mb_block_merge");
    return kernel_source.str();
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state of the reduction init kernel called
// `kernel_name`.
//
// The library is keyed by `kernel_name` and contains one instantiation of
// `func_name` for `out_type` and the op `<Op_name><out type>`, where
// `Op_name` is `op_name` with its first character upper-cased.
MTL::ComputePipelineState* get_reduce_init_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& func_name,
    const std::string& op_name,
    const Dtype& out_type) {
  auto make_source = [&]() {
    const auto out_t = get_type_string(out_type);

    std::string op = op_name;
    op[0] = std::toupper(op_name[0]);
    op += "<" + out_t + ">";

    std::string source = metal::utils();
    source += metal::reduce_utils();
    source += metal::reduce();
    source += get_template_definition(kernel_name, func_name, out_t, op);
    return source;
  };
  auto library = d.get_library(kernel_name, make_source);
  return d.get_kernel(kernel_name, library);
}

// Returns the pipeline state for a reduction kernel.
//
// The library is registered under `kernel_name` and contains one
// instantiation of `func_name`. The reduction functor is named by
// capitalising the first character of `op_name` and parameterising it with
// the output type string.
//
// The template argument list depends on which optional values are supplied:
//   bm >= 0    -> (in, out, op, idx_t, ndim, bm, bn)
//   ndim >= 0  -> (in, out, op, idx_t, ndim)
//   otherwise  -> (in, out, op, idx_t)
MTL::ComputePipelineState* get_reduce_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& func_name,
    const std::string& op_name,
    const Dtype& in_type,
    const Dtype& out_type,
    const std::string& idx_t,
    int ndim /* = -1 */,
    int bm /* = -1 */,
    int bn /* = -1 */) {
  auto lib = d.get_library(kernel_name, [&]() {
    std::string op_type = op_name;
    if (!op_type.empty()) {
      // std::toupper is undefined for negative char values, so convert through
      // unsigned char before the implicit widening to int.
      op_type[0] = static_cast<char>(
          std::toupper(static_cast<unsigned char>(op_type[0])));
    }
    auto in_t = get_type_string(in_type);
    auto out_t = get_type_string(out_type);
    std::string op = op_type + "<" + out_t + ">";
    std::string kernel_source = metal::utils();
    concatenate(kernel_source, metal::reduce_utils(), metal::reduce());
    if (bm >= 0) {
      kernel_source += get_template_definition(
          kernel_name, func_name, in_t, out_t, op, idx_t, ndim, bm, bn);
    } else if (ndim >= 0) {
      kernel_source += get_template_definition(
          kernel_name, func_name, in_t, out_t, op, idx_t, ndim);
    } else {
      kernel_source += get_template_definition(
          kernel_name, func_name, in_t, out_t, op, idx_t);
    }
    return kernel_source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the fused steel GEMM kernel.
//
// The library is registered under `kernel_name`; its source is the utils,
// gemm and steel_gemm_fused snippets followed by one instantiation of the
// `gemm` template. `hash_name` and `func_consts` are handed to the device
// when the kernel itself is looked up.
MTL::ComputePipelineState* get_steel_gemm_fused_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn) {
  auto build_source = [&]() {
    const auto out_t = get_type_string(out.dtype());
    std::string source = metal::utils();
    source += metal::gemm();
    source += metal::steel_gemm_fused();
    source += get_template_definition(
        kernel_name,
        "gemm",
        out_t,
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b);
    return source;
  };
  auto lib = d.get_library(kernel_name, build_source);
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the split-K steel GEMM kernel.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `gemm_splitk`, parameterised by the input/output type strings, the tile
// sizes, the transpose flags and the two alignment flags.
MTL::ComputePipelineState* get_steel_gemm_splitk_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool mn_aligned,
    bool k_aligned) {
  auto lib = d.get_library(kernel_name, [&]() {
    const auto in_t = get_type_string(in.dtype());
    const auto out_t = get_type_string(out.dtype());
    std::string source = metal::utils();
    source += metal::gemm();
    source += metal::steel_gemm_splitk();
    source += get_template_definition(
        kernel_name,
        "gemm_splitk",
        in_t,
        out_t,
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b,
        mn_aligned,
        k_aligned);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the split-K accumulation kernel.
//
// `axbpy` selects which template is instantiated: "gemm_splitk_accum_axpby"
// when true, "gemm_splitk_accum" otherwise.
MTL::ComputePipelineState* get_steel_gemm_splitk_accum_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out,
    bool axbpy) {
  auto lib = d.get_library(kernel_name, [&]() {
    const char* func_name =
        axbpy ? "gemm_splitk_accum_axpby" : "gemm_splitk_accum";
    std::string source = metal::utils();
    source += metal::gemm();
    source += metal::steel_gemm_splitk();
    source += get_template_definition(
        kernel_name,
        func_name,
        get_type_string(in.dtype()),
        get_type_string(out.dtype()));
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the block-masked steel GEMM kernel.
//
// Each optional mask contributes its dtype string as a template argument;
// an absent mask is represented by the type name "nomask_t".
MTL::ComputePipelineState* get_steel_gemm_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    const std::optional<array>& mask_out,
    const std::optional<array>& mask_op,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool mn_aligned,
    bool k_aligned) {
  auto lib = d.get_library(kernel_name, [&]() {
    const auto out_mask_t =
        mask_out ? get_type_string(mask_out->dtype()) : "nomask_t";
    const auto op_mask_t =
        mask_op ? get_type_string(mask_op->dtype()) : "nomask_t";
    std::string source = metal::utils();
    source += metal::gemm();
    source += metal::steel_gemm_masked();
    source += get_template_definition(
        kernel_name,
        "block_masked_gemm",
        get_type_string(out.dtype()),
        out_mask_t,
        op_mask_t,
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b,
        mn_aligned,
        k_aligned);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the gather steel GEMM kernel.
//
// `rhs` selects the instantiated template: "gather_mm_rhs" when true,
// "gather_mm" otherwise. `hash_name` and `func_consts` are handed to the
// device when the kernel is looked up.
MTL::ComputePipelineState* get_steel_gemm_gather_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool rhs) {
  auto lib = d.get_library(kernel_name, [&]() {
    const char* func_name = rhs ? "gather_mm_rhs" : "gather_mm";
    std::string source;
    source += metal::utils();
    source += metal::gemm();
    source += metal::steel_gemm_gather();
    source += get_template_definition(
        kernel_name,
        func_name,
        get_type_string(out.dtype()),
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b);
    return source;
  });
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the segmented steel GEMM kernel.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `segmented_mm`. `hash_name` and `func_consts` are handed to the device
// when the kernel is looked up.
MTL::ComputePipelineState* get_steel_gemm_segmented_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn) {
  auto build_source = [&]() {
    std::string source;
    source += metal::utils();
    source += metal::gemm();
    source += metal::steel_gemm_segmented();
    source += get_template_definition(
        kernel_name,
        "segmented_mm",
        get_type_string(out.dtype()),
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b);
    return source;
  };
  auto lib = d.get_library(kernel_name, build_source);
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the masked GEMV kernel.
//
// `transpose_mat` selects "gemv_t_masked" over "gemv_masked". Each optional
// mask contributes its dtype string, or "nomask_t" when absent. The final
// template argument is 0 when `contiguous` is true and 1 otherwise.
MTL::ComputePipelineState* get_gemv_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    const std::optional<array>& mask_out,
    const std::optional<array>& mask_op,
    bool transpose_mat,
    int bm,
    int bn,
    int sm,
    int sn,
    int tm,
    int tn,
    bool contiguous) {
  auto lib = d.get_library(kernel_name, [&]() {
    const auto out_mask_t =
        mask_out ? get_type_string(mask_out->dtype()) : "nomask_t";
    const auto op_mask_t =
        mask_op ? get_type_string(mask_op->dtype()) : "nomask_t";
    const char* func_name = transpose_mat ? "gemv_t_masked" : "gemv_masked";
    std::string source = metal::utils();
    source += metal::gemv_masked();
    source += get_template_definition(
        kernel_name,
        func_name,
        get_type_string(out.dtype()),
        out_mask_t,
        op_mask_t,
        bm,
        bn,
        sm,
        sn,
        tm,
        tn,
        contiguous ? 0 : 1);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the 2D implicit-GEMM convolution kernel.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `implicit_gemm_conv_2d`, parameterised by the output type string, the
// tile sizes, the channel specialization value and the small-filter flag.
MTL::ComputePipelineState* get_steel_conv_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    int n_channel_specialization,
    bool small_filter) {
  auto lib = d.get_library(kernel_name, [&]() {
    const auto out_t = get_type_string(out.dtype());
    std::string source = metal::utils();
    source += metal::conv();
    source += metal::steel_conv();
    source += get_template_definition(
        kernel_name,
        "implicit_gemm_conv_2d",
        out_t,
        bm,
        bn,
        bk,
        wm,
        wn,
        n_channel_specialization,
        small_filter);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the 3D implicit-GEMM convolution kernel.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `implicit_gemm_conv_3d`.
MTL::ComputePipelineState* get_steel_conv_3d_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool small_filter) {
  auto lib = d.get_library(kernel_name, [&]() {
    const auto out_t = get_type_string(out.dtype());
    std::string source = metal::utils();
    source += metal::conv();
    source += metal::steel_conv_3d();
    source += get_template_definition(
        kernel_name,
        "implicit_gemm_conv_3d",
        out_t,
        bm,
        bn,
        bk,
        wm,
        wn,
        small_filter);
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the general 2D implicit-GEMM convolution
// kernel.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `implicit_gemm_conv_2d_general`. `hash_name` and `func_consts` are
// handed to the device when the kernel is looked up.
MTL::ComputePipelineState* get_steel_conv_general_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn) {
  auto build_source = [&]() {
    const auto out_t = get_type_string(out.dtype());
    std::string source = metal::utils();
    source += metal::conv();
    source += metal::steel_conv_general();
    source += get_template_definition(
        kernel_name, "implicit_gemm_conv_2d_general", out_t, bm, bn, bk, wm, wn);
    return source;
  };
  auto lib = d.get_library(kernel_name, build_source);
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for an FFT kernel.
//
// Unlike most factories in this file, the template instantiation text is
// supplied by the caller in `template_def`; the library source is simply the
// fft snippet followed by that text. `hash_name` and `func_consts` are handed
// to the device when the kernel is looked up.
MTL::ComputePipelineState* get_fft_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const std::string& template_def) {
  const auto& lib_name = kernel_name;
  auto lib = d.get_library(lib_name, [&]() {
    std::string kernel_source = metal::fft();
    kernel_source += template_def;
    return kernel_source;
  });
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for a quantized kernel.
//
// The caller supplies the instantiation text in `template_def`. `mode`
// chooses the kernel body: the quantized snippet for "affine", the
// fp_quantized snippet for anything else.
MTL::ComputePipelineState* get_quantized_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& template_def,
    const std::string& mode) {
  auto lib = d.get_library(kernel_name, [&]() {
    const bool is_affine = mode == "affine";
    std::string source;
    source += metal::utils();
    source += metal::gemm();
    source += metal::quantized_utils();
    source += (is_affine ? metal::quantized() : metal::fp_quantized());
    source += template_def;
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the gather quantized matmul (rhs) kernel.
//
// `mode` chooses both the kernel body and the template name prefix:
// quantized / "affine_gather_qmm_rhs" for "affine", and
// fp_quantized / "fp_gather_qmm_rhs" for anything else.
MTL::ComputePipelineState* get_gather_qmm_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& x,
    int group_size,
    int bits,
    const std::string& mode,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool transpose) {
  auto lib = d.get_library(kernel_name, [&]() {
    const bool is_affine = mode == "affine";
    std::string func_name = is_affine ? "affine" : "fp";
    func_name += "_gather_qmm_rhs";

    std::string source;
    source += metal::utils();
    source += metal::quantized_utils();
    source += metal::gemm();
    source += (is_affine ? metal::quantized() : metal::fp_quantized());
    source += get_template_definition(
        kernel_name,
        func_name,
        get_type_string(x.dtype()),
        group_size,
        bits,
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose);
    return source;
  });
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the NAX variant of the fused steel GEMM.
//
// The library is registered under `kernel_name`; its source is the utils,
// gemm_nax and steel_gemm_fused_nax snippets followed by one instantiation of
// the `gemm` template. `hash_name` and `func_consts` are handed to the device
// when the kernel is looked up.
MTL::ComputePipelineState* get_steel_gemm_fused_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn) {
  auto build_source = [&]() {
    const auto out_t = get_type_string(out.dtype());
    std::string source = metal::utils();
    source += metal::gemm_nax();
    source += metal::steel_gemm_fused_nax();
    source += get_template_definition(
        kernel_name,
        "gemm",
        out_t,
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b);
    return source;
  };
  auto lib = d.get_library(kernel_name, build_source);
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the NAX variant of the gather steel GEMM.
//
// `rhs` selects the instantiated template: "gather_mm_rhs_nax" when true,
// "gather_mm_nax" otherwise. `hash_name` and `func_consts` are handed to the
// device when the kernel is looked up.
MTL::ComputePipelineState* get_steel_gemm_gather_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool rhs) {
  auto lib = d.get_library(kernel_name, [&]() {
    const char* func_name = rhs ? "gather_mm_rhs_nax" : "gather_mm_nax";
    std::string source;
    source += metal::utils();
    source += metal::gemm_nax();
    source += metal::steel_gemm_gather_nax();
    source += get_template_definition(
        kernel_name,
        func_name,
        get_type_string(out.dtype()),
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b);
    return source;
  });
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the NAX variant of the split-K steel GEMM.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `gemm_splitk_nax`. `hash_name` and `func_consts` are handed to the
// device when the kernel is looked up.
MTL::ComputePipelineState* get_steel_gemm_splitk_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn) {
  auto build_source = [&]() {
    const auto out_t = get_type_string(out.dtype());
    std::string source = metal::utils();
    source += metal::gemm_nax();
    source += metal::steel_gemm_splitk_nax();
    source += get_template_definition(
        kernel_name,
        "gemm_splitk_nax",
        out_t,
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose_a,
        transpose_b);
    return source;
  };
  auto lib = d.get_library(kernel_name, build_source);
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for a NAX quantized matmul kernel.
//
// The caller supplies the instantiation text in `template_def`. `mode`
// chooses the kernel body: the quantized_nax snippet for "affine", the
// fp_quantized_nax snippet for anything else.
MTL::ComputePipelineState* get_qmm_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& template_def,
    const std::string& mode) {
  auto lib = d.get_library(kernel_name, [&]() {
    const bool is_affine = mode == "affine";
    std::string source;
    source += metal::utils();
    source += metal::gemm_nax();
    source += metal::quantized_utils();
    source +=
        (is_affine ? metal::quantized_nax() : metal::fp_quantized_nax());
    source += template_def;
    return source;
  });
  return d.get_kernel(kernel_name, lib);
}

// Returns the pipeline state for the NAX gather quantized matmul (rhs)
// kernel.
//
// `mode` chooses both the kernel body and the template name prefix:
// quantized_nax / "affine_gather_qmm_rhs_nax" for "affine", and
// fp_quantized_nax / "fp_gather_qmm_rhs_nax" for anything else.
MTL::ComputePipelineState* get_gather_qmm_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& x,
    int group_size,
    int bits,
    const std::string& mode,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool transpose) {
  auto lib = d.get_library(kernel_name, [&]() {
    const bool is_affine = mode == "affine";
    std::string func_name = is_affine ? "affine" : "fp";
    func_name += "_gather_qmm_rhs_nax";

    std::string source;
    source += metal::utils();
    source += metal::gemm_nax();
    source += metal::quantized_utils();
    source +=
        (is_affine ? metal::quantized_nax() : metal::fp_quantized_nax());
    source += get_template_definition(
        kernel_name,
        func_name,
        get_type_string(x.dtype()),
        group_size,
        bits,
        bm,
        bn,
        bk,
        wm,
        wn,
        transpose);
    return source;
  });
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the steel attention kernel.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `attention`, parameterised by the dtype string of `q`, the block and
// warp sizes, and the dtype string of `m`. `hash_name` and `func_consts` are
// handed to the device when the kernel is looked up.
MTL::ComputePipelineState* get_steel_attention_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& q,
    int bq,
    int bk,
    int bd,
    int wm,
    int wn,
    const array& m) {
  auto build_source = [&]() {
    std::string source;
    source += metal::utils();
    source += metal::steel_attention();
    source += get_template_definition(
        kernel_name,
        "attention",
        get_type_string(q.dtype()),
        bq,
        bk,
        bd,
        wm,
        wn,
        get_type_string(m.dtype()));
    return source;
  };
  auto lib = d.get_library(kernel_name, build_source);
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

// Returns the pipeline state for the NAX variant of the steel attention
// kernel.
//
// The library is registered under `kernel_name` and holds one instantiation
// of `attention_nax`, parameterised by the dtype string of `q`, the block and
// warp sizes, and the dtype string of `m`. `hash_name` and `func_consts` are
// handed to the device when the kernel is looked up.
MTL::ComputePipelineState* get_steel_attention_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& q,
    int bq,
    int bk,
    int bd,
    int wm,
    int wn,
    const array& m) {
  auto build_source = [&]() {
    std::string source;
    source += metal::utils();
    source += metal::steel_attention_nax();
    source += get_template_definition(
        kernel_name,
        "attention_nax",
        get_type_string(q.dtype()),
        bq,
        bk,
        bd,
        wm,
        wn,
        get_type_string(m.dtype()));
    return source;
  };
  auto lib = d.get_library(kernel_name, build_source);
  return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/kernels/CMakeLists.txt
================================================
# Headers added to the DEPENDS list of every kernel compile command (see
# build_kernel_base), so editing any of them rebuilds all .air files.
set(BASE_HEADERS
    bf16.h
    bf16_math.h
    complex.h
    defines.h
    erf.h
    expm1f.h
    fp8.h
    logging.h
    utils.h)

# Add a custom command that compiles one Metal source file to an AIR object.
#
# TARGET  - base name of the output; the command produces ${TARGET}.air
# SRCFILE - path of the Metal source handed to the compiler
# DEPS    - extra files whose modification should re-run the command
function(build_kernel_base TARGET SRCFILE DEPS)
  # Flags used for every kernel.
  set(METAL_FLAGS
      -x
      metal
      -Wall
      -Wextra
      -fno-fast-math
      -Wno-c++17-extensions
      -Wno-c++20-extensions)
  # Optional flags, appended in the same order as the conditions are checked.
  if(MLX_METAL_DEBUG)
    list(APPEND METAL_FLAGS -gline-tables-only -frecord-sources)
  endif()
  if(CMAKE_BUILD_TYPE STREQUAL "Debug" AND MLX_METAL_VERSION GREATER_EQUAL 320)
    list(APPEND METAL_FLAGS -fmetal-enable-logging)
  endif()
  if(NOT CMAKE_OSX_DEPLOYMENT_TARGET STREQUAL "")
    list(APPEND METAL_FLAGS
         "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
  endif()
  add_custom_command(
    COMMAND xcrun -sdk macosx metal ${METAL_FLAGS} -c ${SRCFILE}
            -I${PROJECT_SOURCE_DIR} -o ${TARGET}.air
    DEPENDS ${SRCFILE} ${DEPS} ${BASE_HEADERS}
    OUTPUT ${TARGET}.air
    COMMENT "Building ${TARGET}.air"
    VERBATIM)
endfunction(build_kernel_base)

# Register one kernel for compilation.
#
# KERNEL is the source path relative to this directory, without the .metal
# extension. Any further arguments (ARGN) are passed on as extra dependencies.
# The output name is the stem of KERNEL, and the resulting .air file is
# prepended to KERNEL_AIR in the caller's scope.
function(build_kernel KERNEL)
  set(SRCFILE ${CMAKE_CURRENT_SOURCE_DIR}/${KERNEL}.metal)
  cmake_path(GET KERNEL STEM TARGET)
  build_kernel_base(${TARGET} ${SRCFILE} "${ARGN}")
  set(KERNEL_AIR
      ${TARGET}.air ${KERNEL_AIR}
      PARENT_SCOPE)
endfunction(build_kernel)

# Kernels that are compiled regardless of the MLX_METAL_JIT setting.
build_kernel(arg_reduce)
build_kernel(conv steel/conv/params.h)
build_kernel(gemv steel/utils.h)
build_kernel(layer_norm)
build_kernel(random)
build_kernel(rms_norm)
build_kernel(rope)
build_kernel(scaled_dot_product_attention sdpa_vector.h)
# The fence kernel is only built for Metal version 320 and above.
if(MLX_METAL_VERSION GREATER_EQUAL 320)
  build_kernel(fence)
endif()

# Dependency list passed to build_kernel for the steel conv/gemm kernels and
# the quantized kernels.
set(STEEL_HEADERS
    steel/defines.h
    steel/utils.h
    steel/conv/conv.h
    steel/conv/loader.h
    steel/conv/loaders/loader_channel_l.h
    steel/conv/loaders/loader_channel_n.h
    steel/conv/loaders/loader_general.h
    steel/conv/kernels/steel_conv.h
    steel/conv/kernels/steel_conv_3d.h
    steel/conv/kernels/steel_conv_general.h
    steel/gemm/gemm.h
    steel/gemm/mma.h
    steel/gemm/loader.h
    steel/gemm/params.h
    steel/gemm/transforms.h
    steel/gemm/kernels/steel_gemm_fused.h
    steel/gemm/kernels/steel_gemm_gather.h
    steel/gemm/kernels/steel_gemm_masked.h
    steel/gemm/kernels/steel_gemm_segmented.h
    steel/gemm/kernels/steel_gemm_splitk.h
    steel/utils/type_traits.h
    steel/utils/integral_constant.h)

# Dependency list passed to build_kernel for the steel attention kernel.
set(STEEL_ATTN_HEADERS
    steel/defines.h
    steel/utils.h
    steel/gemm/gemm.h
    steel/gemm/mma.h
    steel/gemm/loader.h
    steel/gemm/transforms.h
    steel/utils/type_traits.h
    steel/utils/integral_constant.h
    steel/attn/attn.h
    steel/attn/loader.h
    steel/attn/mma.h
    steel/attn/params.h
    steel/attn/transforms.h
    steel/attn/kernels/steel_attention.h)

# Dependency list passed to build_kernel for the NAX gemm and NAX quantized
# kernels.
set(STEEL_NAX_HEADERS
    steel/defines.h
    steel/utils.h
    steel/gemm/params.h
    steel/gemm/transforms.h
    steel/gemm/nax.h
    steel/gemm/gemm_nax.h
    steel/utils/type_traits.h
    steel/utils/integral_constant.h
    steel/gemm/kernels/steel_gemm_fused_nax.h
    steel/gemm/kernels/steel_gemm_gather_nax.h
    steel/gemm/kernels/steel_gemm_splitk_nax.h)

# Dependency list passed to build_kernel for the NAX attention kernel.
set(STEEL_NAX_ATTN_HEADERS
    steel/defines.h
    steel/utils.h
    steel/attn/nax.h
    steel/utils/type_traits.h
    steel/utils/integral_constant.h
    steel/attn/params.h
    steel/attn/kernels/steel_attention_nax.h)

# Kernels below are only compiled ahead of time when MLX_METAL_JIT is off.
# NOTE(review): presumably these are the kernels the JIT path builds at
# runtime instead -- confirm against the JIT sources.
if(NOT MLX_METAL_JIT)
  build_kernel(arange arange.h)
  build_kernel(binary binary.h binary_ops.h)
  build_kernel(binary_two binary_two.h)
  build_kernel(copy copy.h)
  build_kernel(fft fft.h fft/radix.h fft/readwrite.h)
  build_kernel(
    reduce
    atomic.h
    reduction/ops.h
    reduction/reduce_init.h
    reduction/reduce_all.h
    reduction/reduce_col.h
    reduction/reduce_row.h)
  build_kernel(quantized quantized.h quantized_utils.h ${STEEL_HEADERS})
  build_kernel(fp_quantized fp4.h fp8.h fp_quantized.h quantized_utils.h
               ${STEEL_HEADERS})
  build_kernel(scan scan.h)
  build_kernel(softmax softmax.h)
  build_kernel(logsumexp logsumexp.h)
  build_kernel(sort sort.h)
  build_kernel(ternary ternary.h ternary_ops.h)
  build_kernel(unary unary.h unary_ops.h)
  build_kernel(steel/conv/kernels/steel_conv ${STEEL_HEADERS})
  build_kernel(steel/conv/kernels/steel_conv_3d ${STEEL_HEADERS})
  build_kernel(steel/conv/kernels/steel_conv_general ${STEEL_HEADERS})
  build_kernel(steel/gemm/kernels/steel_gemm_fused ${STEEL_HEADERS})
  build_kernel(steel/gemm/kernels/steel_gemm_gather ${STEEL_HEADERS})
  build_kernel(steel/gemm/kernels/steel_gemm_masked ${STEEL_HEADERS})
  build_kernel(steel/gemm/kernels/steel_gemm_splitk ${STEEL_HEADERS})
  build_kernel(steel/gemm/kernels/steel_gemm_segmented ${STEEL_HEADERS})
  build_kernel(gemv_masked steel/utils.h)
  build_kernel(steel/attn/kernels/steel_attention ${STEEL_ATTN_HEADERS})

  # NAX kernels need Metal version >= 400 and a macOS SDK >= 26.2. When either
  # requirement is not met, MLX_METAL_NO_NAX is defined for the mlx target
  # instead.
  if((MLX_METAL_VERSION GREATER_EQUAL 400) AND (MACOS_SDK_VERSION GREATER_EQUAL
                                                26.2))

    build_kernel(steel/gemm/kernels/steel_gemm_fused_nax ${STEEL_NAX_HEADERS})
    build_kernel(steel/gemm/kernels/steel_gemm_gather_nax ${STEEL_NAX_HEADERS})
    build_kernel(steel/gemm/kernels/steel_gemm_splitk_nax ${STEEL_NAX_HEADERS})

    build_kernel(quantized_nax quantized_nax.h ${STEEL_NAX_HEADERS})
    build_kernel(fp_quantized_nax fp4.h fp8.h fp_quantized_nax.h
                 ${STEEL_NAX_HEADERS})

    build_kernel(steel/attn/kernels/steel_attention_nax
                 ${STEEL_NAX_ATTN_HEADERS})

  else()
    target_compile_definitions(mlx PRIVATE MLX_METAL_NO_NAX)
  endif()

endif()

# Link every .air file collected in KERNEL_AIR into a single mlx.metallib.
add_custom_command(
  OUTPUT ${MLX_METAL_PATH}/mlx.metallib
  COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o
          ${MLX_METAL_PATH}/mlx.metallib
  DEPENDS ${KERNEL_AIR}
  COMMENT "Building mlx.metallib"
  VERBATIM)

# Target wrapping the metallib output so other targets can depend on it.
add_custom_target(mlx-metallib DEPENDS ${MLX_METAL_PATH}/mlx.metallib)

# Make sure the metallib is built whenever the mlx target is built.
add_dependencies(mlx mlx-metallib)

# Install metallib
include(GNUInstallDirs)

install(
  FILES ${MLX_METAL_PATH}/mlx.metallib
  DESTINATION ${CMAKE_INSTALL_LIBDIR}
  COMPONENT metallib)


================================================
FILE: mlx/backend/metal/kernels/arange.h
================================================
// Copyright © 2023-2024 Apple Inc.
// Fills `out` with an arithmetic sequence: the thread at grid position
// `index` writes start + index * step to out[index].
template <typename T>
[[kernel]] void arange(
    constant const T& start,
    constant const T& step,
    device T* out,
    uint index [[thread_position_in_grid]]) {
  // Distance of this element from `start`.
  const auto offset = index * step;
  out[index] = start + offset;
}


================================================
FILE: mlx/backend/metal/kernels/arange.metal
================================================
// Copyright © 2023-2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/arange.h"

// Instantiates the arange kernel for `type` under the host-visible name
// "arange" followed by `tname` (e.g. "arangeuint8").
#define instantiate_arange(tname, type)                                 \
  instantiate_kernel("arange" #tname, arange, type)

instantiate_arange(uint8, uint8_t)
instantiate_arange(uint16, uint16_t)
instantiate_arange(uint32, uint32_t)
instantiate_arange(uint64, uint64_t)
instantiate_arange(int8, int8_t)
instantiate_arange(int16, int16_t)
instantiate_arange(int32, int32_t)
instantiate_arange(int64, int64_t)
instantiate_arange(float16, half)
instantiate_arange(float32, float)
instantiate_arange(bfloat16, bfloat16_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/arg_reduce.metal
================================================
// Copyright © 2023 Apple Inc.

#include <metal_simdgroup>

#include "mlx/backend/metal/kernels/utils.h"

using namespace metal;

// A candidate for an arg-reduction: a value together with the position along
// the reduction axis it was read from. The field order matters because the
// struct is brace-initialised as {index, val}.
template <typename U>
struct IndexValPair {
  uint32_t index;
  U val;
};

// Arg-min reduction functor over (index, value) candidates.
template <typename U>
struct ArgMin {
  // Value every thread starts from (Limits<U>::max).
  static constexpr constant U init = Limits<U>::max;

  // Combine two candidates. `current` replaces `best` when its value is
  // strictly smaller, or when the values are equal and its index is lower.
  IndexValPair<U> reduce(IndexValPair<U> best, IndexValPair<U> current) {
    const bool current_is_smaller = best.val > current.val;
    const bool current_wins_tie =
        best.val == current.val && best.index > current.index;
    return (current_is_smaller || current_wins_tie) ? current : best;
  }

  // Fold N consecutive values into `best`. vals[i] is taken to sit at index
  // offset + i. Only a strictly smaller value replaces the running best, so
  // among equal values the one seen first is kept.
  template <int N>
  IndexValPair<U>
  reduce_many(IndexValPair<U> best, thread U* vals, uint32_t offset) {
    for (int i = 0; i < N; i++) {
      if (!(vals[i] < best.val)) {
        continue;
      }
      best.val = vals[i];
      best.index = offset + i;
    }
    return best;
  }
};

// Arg-max reduction functor over (index, value) candidates.
template <typename U>
struct ArgMax {
  // Value every thread starts from (Limits<U>::min).
  static constexpr constant U init = Limits<U>::min;

  // Combine two candidates. `current` replaces `best` when its value is
  // strictly larger, or when the values are equal and its index is lower.
  IndexValPair<U> reduce(IndexValPair<U> best, IndexValPair<U> current) {
    const bool current_is_larger = best.val < current.val;
    const bool current_wins_tie =
        best.val == current.val && best.index > current.index;
    return (current_is_larger || current_wins_tie) ? current : best;
  }

  // Fold N consecutive values into `best`. vals[i] is taken to sit at index
  // offset + i. Only a strictly larger value replaces the running best, so
  // among equal values the one seen first is kept.
  template <int N>
  IndexValPair<U>
  reduce_many(IndexValPair<U> best, thread U* vals, uint32_t offset) {
    for (int i = 0; i < N; i++) {
      if (!(vals[i] > best.val)) {
        continue;
      }
      best.val = vals[i];
      best.index = offset + i;
    }
    return best;
  }
};

// simd_shuffle_down overload for IndexValPair: shuffles the index and the
// value separately with the same `delta` and re-packs them.
template <typename U>
IndexValPair<U> simd_shuffle_down(IndexValPair<U> data, uint16_t delta) {
  const uint32_t shifted_index = simd_shuffle_down(data.index, delta);
  const U shifted_val = simd_shuffle_down(data.val, delta);
  return IndexValPair<U>{shifted_index, shifted_val};
}

// Arg-reduction along one axis. For every output element, a threadgroup
// walks the reduction axis, tracks the best (index, value) pair according to
// Op, and writes the winning index to `out`.
//
// Template parameters:
//   T        element type of `in`
//   Op       reduction functor providing `init`, `reduce` and `reduce_many`
//   N_READS  number of consecutive axis elements each thread reads per
//            loop iteration
template <typename T, typename Op, int N_READS = 4>
[[kernel]] void arg_reduce_general(
    const device T* in [[buffer(0)]],
    device uint32_t* out [[buffer(1)]],
    const constant int* shape [[buffer(2)]],
    const constant int64_t* in_strides [[buffer(3)]],
    const constant int64_t* out_strides [[buffer(4)]],
    const constant size_t& ndim [[buffer(5)]],
    const constant int64_t& axis_stride [[buffer(6)]],
    const constant size_t& axis_size [[buffer(7)]],
    uint3 gid [[thread_position_in_grid]],
    uint3 gsize [[threads_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]],
    uint simd_size [[threads_per_simdgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  // Shapes and strides *do not* contain the reduction axis. The reduction size
  // and stride are provided in axis_stride and axis_size.
  //
  // Note: in shape == out shape with this convention.
  //
  // The sketch of the kernel is as follows.
  //    1. Launch prod(shape) * thread_group_size threads.
  //    2. Loop ceildiv(axis_size, N_READS * lsize) times
  //    3. Read input values
  //    4. Reduce among them and go to 3
  //    5. Reduce in each simd_group
  //    6. Write in the threadgroup memory
  //    7. Reduce them across thread group
  //    8. Write the output without need for atomic
  Op op;

  // Compute the input/output index. There is one beginning and one output for
  // the whole threadgroup.
  int64_t row_idx = gid.y + static_cast<int64_t>(gsize.y) * gid.z;
  auto in_idx = elem_to_loc(row_idx, shape, in_strides, ndim);
  auto out_idx = elem_to_loc(row_idx, shape, out_strides, ndim);

  IndexValPair<T> best{0, Op::init};

  // One slot per simdgroup.
  // NOTE(review): assumes a threadgroup never has more than 32 simdgroups --
  // confirm against the launch configuration.
  threadgroup IndexValPair<T> local_data[32];

  // Loop over the reduction axis in lsize*N_READS buckets
  for (uint r = 0; r < ceildiv(axis_size, N_READS * lsize.x); r++) {
    // Read the current value
    uint32_t current_index = r * lsize.x * N_READS + lid.x * N_READS;
    uint32_t offset = current_index;
    const device T* current_in = in + in_idx + current_index * axis_stride;
    T vals[N_READS];
    for (int i = 0; i < N_READS; i++) {
      // Positions past the end of the axis are filled with Op::init.
      vals[i] = (current_index < axis_size) ? *current_in : T(Op::init);
      current_index++;
      current_in += axis_stride;
    }
    best = op.template reduce_many<N_READS>(best, vals, offset);
  }
  // At this point we have reduced the axis into per-thread best values so we
  // need to reduce across the thread group.

  // First per simd reduction.
  for (uint offset = simd_size / 2; offset > 0; offset /= 2) {
    IndexValPair<T> neighbor = simd_shuffle_down(best, offset);
    best = op.reduce(best, neighbor);
  }

  // Write to the threadgroup memory
  if (simd_lane_id == 0) {
    local_data[simd_group_id] = best;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Only the first simdgroup takes part in the final reduction.
  if (simd_group_id != 0) {
    return;
  }

  // Read the appropriate value from local data and perform one simd reduction
  uint simd_groups = ceildiv(lsize.x, simd_size);
  if (simd_lane_id < simd_groups) {
    best = local_data[simd_lane_id];
  }
  for (uint offset = simd_size / 2; offset > 0; offset /= 2) {
    IndexValPair<T> neighbor = simd_shuffle_down(best, offset);
    best = op.reduce(best, neighbor);
  }

  // Finally write the output
  if (lid.x == 0) {
    out[out_idx] = best.index;
  }
}

// clang-format off
// Instantiates the arg-reduce kernel twice for one input type: once with
// ArgMin<itype> under the host name "argmin_<name>" and once with
// ArgMax<itype> under "argmax_<name>". Both expansions go through
// instantiate_kernel with arg_reduce_general as the kernel template.
// NOTE(review): instantiate_kernel is defined outside this section; its exact
// expansion is not visible here.
#define instantiate_arg_reduce(name, itype)                      \
  instantiate_kernel(                                            \
      "argmin_" #name, arg_reduce_general, itype, ArgMin<itype>) \
  instantiate_kernel(                                            \
      "argmax_" #name, arg_reduce_general, itype, ArgMax<itype>)

// One argmin/argmax pair per supported input type. The first argument is the
// suffix used in the kernel name, the second is the Metal element type.
instantiate_arg_reduce(bool_, bool)
instantiate_arg_reduce(uint8, uint8_t)
instantiate_arg_reduce(uint16, uint16_t)
instantiate_arg_reduce(uint32, uint32_t)
instantiate_arg_reduce(uint64, uint64_t)
instantiate_arg_reduce(int8, int8_t)
instantiate_arg_reduce(int16, int16_t)
instantiate_arg_reduce(int32, int32_t)
instantiate_arg_reduce(int64, int64_t)
instantiate_arg_reduce(float16, half)
instantiate_arg_reduce(float32, float)
instantiate_arg_reduce(bfloat16, bfloat16_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/atomic.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <metal_atomic>
#include <metal_stdlib>

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
// Atomic utils
///////////////////////////////////////////////////////////////////////////////

#pragma METAL internals : enable
// Compile-time flag: true when T is exactly one of int, uint, ulong or float.
// These are the types for which mlx_atomic<T> stores an atomic<T> directly and
// for which the "native" overloads in this header are enabled; every other
// type takes the packed-uint code path.
// NOTE(review): the surrounding pragma pair presumably exists to make the
// internal `_disjunction` helper visible -- confirm against the Metal headers.
template <typename T>
constexpr constant bool is_metal_atomic = _disjunction<
    is_same<T, int>,
    is_same<T, uint>,
    is_same<T, ulong>,
    is_same<T, float>>::value;

#pragma METAL internals : disable

// Storage cell used by all mlx_atomic_* functions.
//
// Primary template (types that are not is_metal_atomic): the cell is an
// atomic<uint>. The custom atomics in this header index such cells per uint
// word and address individual T elements inside a word via packing_size<T>.
template <typename T, typename = void>
struct mlx_atomic {
  atomic<uint> val;
};

// Specialization for types where is_metal_atomic<T> is true: the cell is an
// atomic<T>, so one cell corresponds to one element.
template <typename T>
struct mlx_atomic<T, enable_if_t<is_metal_atomic<T>>> {
  atomic<T> val;
};

///////////////////////////////////////////////////////////////////////////////
// Native metal atomics
///////////////////////////////////////////////////////////////////////////////

// Relaxed atomic read of element `offset` for natively atomic types.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC T
mlx_atomic_load_explicit(device mlx_atomic<T>* object, size_t offset) {
  device atomic<T>* slot = &(object[offset].val);
  return atomic_load_explicit(slot, memory_order_relaxed);
}

// Relaxed atomic write of `val` into element `offset` for natively atomic
// types.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, size_t offset) {
  device atomic<T>* slot = &(object[offset].val);
  atomic_store_explicit(slot, val, memory_order_relaxed);
}

// Relaxed atomic bitwise AND of `val` into element `offset` (native types).
// The previous value returned by the Metal builtin is discarded.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_and_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  device atomic<T>* slot = &((object + offset)->val);
  atomic_fetch_and_explicit(slot, val, memory_order_relaxed);
}

// Relaxed atomic bitwise OR of `val` into element `offset` (native types).
// The previous value returned by the Metal builtin is discarded.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_or_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  device atomic<T>* slot = &((object + offset)->val);
  atomic_fetch_or_explicit(slot, val, memory_order_relaxed);
}

// Relaxed atomic minimum of `val` and element `offset` (native types),
// forwarded to the Metal builtin.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_min_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  device atomic<T>* slot = &(object[offset].val);
  atomic_fetch_min_explicit(slot, val, memory_order_relaxed);
}

// Relaxed atomic maximum of `val` and element `offset` (native types),
// forwarded to the Metal builtin.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_max_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  device atomic<T>* slot = &(object[offset].val);
  atomic_fetch_max_explicit(slot, val, memory_order_relaxed);
}

// Relaxed atomic addition of `val` to element `offset` (native types).
// The previous value returned by the Metal builtin is discarded.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_add_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  device atomic<T>* slot = &((object + offset)->val);
  atomic_fetch_add_explicit(slot, val, memory_order_relaxed);
}

// Atomic multiply of element `offset` by `val` (native types), built from a
// compare-exchange retry loop: read the current value, try to swap in
// `val * current`, and repeat until the exchange succeeds.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_mul_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  T observed = mlx_atomic_load_explicit(object, offset);
  bool swapped = false;
  do {
    // Recompute the product on every attempt from the latest observed value.
    const T product = val * observed;
    swapped = mlx_atomic_compare_exchange_weak_explicit(
        object, &observed, product, offset);
  } while (!swapped);
}

// Weak compare-exchange on element `offset` (native types). Relaxed ordering
// is used for both the success and the failure case; the builtin's result is
// returned unchanged.
template <typename T, enable_if_t<is_metal_atomic<T>, bool> = true>
METAL_FUNC bool mlx_atomic_compare_exchange_weak_explicit(
    device mlx_atomic<T>* object,
    thread T* expected,
    T val,
    size_t offset) {
  device atomic<T>* slot = &(object[offset].val);
  const bool exchanged = atomic_compare_exchange_weak_explicit(
      slot, expected, val, memory_order_relaxed, memory_order_relaxed);
  return exchanged;
}

// float specialization: fetch-min implemented with a compare-exchange loop
// instead of the atomic_fetch_min_explicit builtin.
template <>
METAL_FUNC void mlx_atomic_fetch_min_explicit<float>(
    device mlx_atomic<float>* object,
    float val,
    size_t offset) {
  // Keep trying while `val` is smaller than the value last seen; stop as soon
  // as the exchange succeeds or the stored value is already <= `val`.
  for (float observed = mlx_atomic_load_explicit(object, offset);
       val < observed;) {
    if (mlx_atomic_compare_exchange_weak_explicit(
            object, &observed, val, offset)) {
      break;
    }
  }
}

// float specialization: fetch-max implemented with a compare-exchange loop
// instead of the atomic_fetch_max_explicit builtin.
template <>
METAL_FUNC void mlx_atomic_fetch_max_explicit<float>(
    device mlx_atomic<float>* object,
    float val,
    size_t offset) {
  // Keep trying while `val` is larger than the value last seen; stop as soon
  // as the exchange succeeds or the stored value is already >= `val`.
  for (float observed = mlx_atomic_load_explicit(object, offset);
       val > observed;) {
    if (mlx_atomic_compare_exchange_weak_explicit(
            object, &observed, val, offset)) {
      break;
    }
  }
}

///////////////////////////////////////////////////////////////////////////////
// Custom atomics
///////////////////////////////////////////////////////////////////////////////

namespace {

// Number of T elements that fit in one uint word, computed as
// sizeof(uint) / sizeof(T). This is integer division, so a T wider than uint
// yields 0.
template <typename T>
constexpr constant uint packing_size = sizeof(uint) / sizeof(T);

// Two views of the same word: `bits` is the raw uint that is exchanged
// atomically, `val` exposes it as packing_size<T> individual T elements.
template <typename T>
union uint_or_packed {
  T val[packing_size<T>];
  uint bits;
};

// Functor that applies Op to one element of a packed word and returns the
// resulting raw bits. `init` is taken by value, so the caller's copy of the
// word is left untouched.
template <typename T, typename Op>
struct mlx_atomic_update_helper {
  uint operator()(uint_or_packed<T> init, T update, size_t elem_offset) {
    Op combine;
    const T merged = combine(update, init.val[elem_offset]);
    init.val[elem_offset] = merged;
    return init.bits;
  }
};

// Read-modify-write of a single T element that lives inside a packed uint
// word. The word containing element `offset` is loaded, and then a
// compare-exchange loop tries to replace it with a copy in which only that
// element has been combined with `update` through Op.
//
// The loop ends either when Op::condition reports that no update is needed
// for the currently observed element, or when the exchange succeeds.
template <typename T, typename Op>
METAL_FUNC void mlx_atomic_update_and_store(
    device mlx_atomic<T>* object,
    T update,
    size_t offset) {
  // Which uint word, and which element inside that word.
  const size_t pack_offset = offset / packing_size<T>;
  const size_t elem_offset = offset % packing_size<T>;

  mlx_atomic_update_helper<T, Op> helper;
  uint_or_packed<T> expected;
  expected.bits =
      atomic_load_explicit(&(object[pack_offset].val), memory_order_relaxed);

  while (true) {
    if (!Op::condition(update, expected.val[elem_offset])) {
      break;
    }
    const bool exchanged = mlx_atomic_compare_exchange_weak_explicit(
        object,
        &(expected.bits),
        helper(expected, update, elem_offset),
        pack_offset);
    if (exchanged) {
      break;
    }
  }
}

// Plain overwrite: the update is always applied and the stored value is
// replaced by the incoming one.
template <typename T>
struct __None {
  // Both arguments are ignored; the update is unconditional.
  static bool condition(T /*update*/, T /*current*/) {
    return true;
  }

  // Returns the incoming value, ignoring the current one.
  T operator()(T update, T /*current*/) {
    return update;
  }
};

// Addition: the update is always applied and the result is the sum of the
// incoming and the stored value.
template <typename T>
struct __Add {
  // Both arguments are ignored; the update is unconditional.
  static bool condition(T /*update*/, T /*current*/) {
    return true;
  }

  T operator()(T update, T current) {
    const T sum = update + current;
    return sum;
  }
};

// Multiplication: the update is skipped when the stored value compares equal
// to zero; otherwise the result is the product of the incoming and the stored
// value.
template <typename T>
struct __Mul {
  // Only the current value decides whether an update is attempted.
  static bool condition(T /*update*/, T current) {
    return current != 0;
  }

  T operator()(T update, T current) {
    const T product = update * current;
    return product;
  }
};

// Maximum: an update is attempted only when the incoming value is strictly
// greater than the stored one; the result is max(incoming, stored).
template <typename T>
struct __Max {
  static bool condition(T update, T current) {
    const bool update_is_larger = update > current;
    return update_is_larger;
  }

  T operator()(T update, T current) {
    return max(update, current);
  }
};

// Minimum: an update is attempted only when the incoming value is strictly
// smaller than the stored one; the result is min(incoming, stored).
template <typename T>
struct __Min {
  static bool condition(T update, T current) {
    const bool update_is_smaller = update < current;
    return update_is_smaller;
  }

  T operator()(T update, T current) {
    return min(update, current);
  }
};

} // namespace

// Relaxed atomic read of element `offset` for types that are not natively
// atomic. Such elements are stored packing_size<T> to a uint word, so the
// containing word is loaded atomically and the requested element is then
// picked out of it.
//
// Parameters:
//   object - base of the packed atomic buffer (one mlx_atomic per uint word).
//   offset - index of the T element to read, counted in elements.
// Returns the T element at `offset`.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC T
mlx_atomic_load_explicit(device mlx_atomic<T>* object, size_t offset) {
  // Split the element index by the number of elements per word, exactly as
  // the packed store/fetch helpers in this header do. Dividing by sizeof(T)
  // only happens to agree for 2-byte types; for 1-byte types it would select
  // word `offset` and always element 0.
  size_t pack_offset = offset / packing_size<T>;
  size_t elem_offset = offset % packing_size<T>;
  uint_or_packed<T> packed_val;
  packed_val.bits =
      atomic_load_explicit(&(object[pack_offset].val), memory_order_relaxed);
  return packed_val.val[elem_offset];
}

// Atomic write of `val` into element `offset` for packed (non-native) types.
// Implemented as a packed read-modify-write whose operation simply replaces
// the element.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void
mlx_atomic_store_explicit(device mlx_atomic<T>* object, T val, size_t offset) {
  using Overwrite = __None<T>;
  mlx_atomic_update_and_store<T, Overwrite>(object, val, offset);
}

// Atomic bitwise AND of `val` into element `offset` for packed (non-native)
// types. A full-word mask is built that is all ones except in the target
// element, which holds `val`; AND-ing the word with it leaves the other
// elements of the word unchanged.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_and_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  const size_t word_index = offset / packing_size<T>;
  const size_t lane_index = offset % packing_size<T>;

  uint_or_packed<T> mask;
  mask.bits = __UINT32_MAX__;
  mask.val[lane_index] = val;

  device atomic<uint>* slot = &(object[word_index].val);
  atomic_fetch_and_explicit(slot, mask.bits, memory_order_relaxed);
}

// Atomic bitwise OR of `val` into element `offset` for packed (non-native)
// types. A full-word mask is built that is all zeros except in the target
// element, which holds `val`; OR-ing the word with it leaves the other
// elements of the word unchanged.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_or_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  const size_t word_index = offset / packing_size<T>;
  const size_t lane_index = offset % packing_size<T>;

  uint_or_packed<T> mask;
  mask.bits = 0;
  mask.val[lane_index] = val;

  device atomic<uint>* slot = &(object[word_index].val);
  atomic_fetch_or_explicit(slot, mask.bits, memory_order_relaxed);
}

// Atomic minimum of `val` and element `offset` for packed (non-native) types,
// expressed as a packed read-modify-write with the __Min operation.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_min_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  using KeepSmaller = __Min<T>;
  mlx_atomic_update_and_store<T, KeepSmaller>(object, val, offset);
}

// Atomic maximum of `val` and element `offset` for packed (non-native) types,
// expressed as a packed read-modify-write with the __Max operation.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_max_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  using KeepLarger = __Max<T>;
  mlx_atomic_update_and_store<T, KeepLarger>(object, val, offset);
}

// Atomic addition of `val` to element `offset` for packed (non-native) types,
// expressed as a packed read-modify-write with the __Add operation.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_add_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  using Accumulate = __Add<T>;
  mlx_atomic_update_and_store<T, Accumulate>(object, val, offset);
}

// Atomic multiplication of element `offset` by `val` for packed (non-native)
// types, expressed as a packed read-modify-write with the __Mul operation.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC void mlx_atomic_fetch_mul_explicit(
    device mlx_atomic<T>* object,
    T val,
    size_t offset) {
  using Scale = __Mul<T>;
  mlx_atomic_update_and_store<T, Scale>(object, val, offset);
}

// Weak compare-exchange for packed (non-native) types. It operates on a whole
// uint word: `offset` indexes words here, and `expected` / `val` are raw word
// bits. Relaxed ordering is used for both the success and the failure case.
template <typename T, enable_if_t<!is_metal_atomic<T>, bool> = true>
METAL_FUNC bool mlx_atomic_compare_exchange_weak_explicit(
    device mlx_atomic<T>* object,
    thread uint* expected,
    uint val,
    size_t offset) {
  device atomic<uint>* slot = &(object[offset].val);
  const bool exchanged = atomic_compare_exchange_weak_explicit(
      slot, expected, val, memory_order_relaxed, memory_order_relaxed);
  return exchanged;
}


================================================
FILE: mlx/backend/metal/kernels/bf16.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <metal_stdlib>

using namespace metal;

// Project-wide name for Metal's bfloat type.
using bfloat16_t = bfloat;
// Reinterprets the bits of a bfloat16 value as a uint16_t (no numeric
// conversion).
inline uint16_t bfloat16_to_uint16(const bfloat16_t x) {
  const uint16_t raw_bits = as_type<uint16_t>(x);
  return raw_bits;
}

// Reinterprets a uint16_t bit pattern as a bfloat16 value (no numeric
// conversion).
inline bfloat16_t uint16_to_bfloat16(const uint16_t x) {
  const bfloat16_t reinterpreted = as_type<bfloat16_t>(x);
  return reinterpreted;
}


================================================
FILE: mlx/backend/metal/kernels/bf16_math.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

///////////////////////////////////////////////////////////////////////////////
// Metal math for bfloat16
///////////////////////////////////////////////////////////////////////////////

/*

Following the Metal Shading Language Specification (Metal 3.1)

"bfloat is an extended floating point type that only allows implicit conversion
 to a type of greater floating point rank. While bfloat can be implicitly
 converted to float, it cannot be implicitly converted to half, and neither
 float nor half can be implicitly converted to bfloat."

Further, as far as I can tell, the stdlib math/simd functions are not defined
for bfloat, and calling one with an argument of type bfloat will result in that
argument getting implicitly converted to float. The function then returns an
output that is (likely) a float, which cannot be implicitly converted to bfloat.

This leads to situations where
bfloat a = 5.0bf;
bfloat b = metal::abs(a); // this is an error since abs returns float
bfloat c = static_cast<bfloat>(metal::abs(a)); // this is fine

For the moment, I will be adding overloaded instantiations of the math
functions to accordingly automatically handle the casting

*/

// Defines overloads of the math functions listed below for one element type.
//
//   itype - parameter type of the generated overloads.
//   otype - return type of the generated overloads.
//   ctype - type the arguments are cast to before calling the builtin.
//   mfast - math-mode argument forwarded to the __metal_* builtins that take
//           one.
//
// Every overload casts its arguments to ctype, calls the matching __metal_*
// builtin, and casts the result to otype. Exceptions visible in the bodies:
//   - fdim is computed inline with select() instead of calling a builtin;
//   - fma, frexp and nextafter call their builtin without mfast;
//   - abs, max, max3, median3, min and min3 forward to the same builtins as
//     their f-prefixed counterparts (fabs, fmax, fmax3, fmedian3, fmin,
//     fmin3).
#define instantiate_metal_math_funcs(itype, otype, ctype, mfast)               \
                                                                               \
  METAL_FUNC otype abs(itype x) {                                              \
    return static_cast<otype>(__metal_fabs(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype acos(itype x) {                                             \
    return static_cast<otype>(__metal_acos(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype acosh(itype x) {                                            \
    return static_cast<otype>(__metal_acosh(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype asin(itype x) {                                             \
    return static_cast<otype>(__metal_asin(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype asinh(itype x) {                                            \
    return static_cast<otype>(__metal_asinh(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype atan(itype y_over_x) {                                      \
    return static_cast<otype>(                                                 \
        __metal_atan(static_cast<ctype>(y_over_x), mfast));                    \
  }                                                                            \
  METAL_FUNC otype atan2(itype y, itype x) {                                   \
    return static_cast<otype>(                                                 \
        __metal_atan2(static_cast<ctype>(y), static_cast<ctype>(x), mfast));   \
  }                                                                            \
  METAL_FUNC otype atanh(itype x) {                                            \
    return static_cast<otype>(__metal_atanh(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype ceil(itype x) {                                             \
    return static_cast<otype>(__metal_ceil(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype cos(itype x) {                                              \
    return static_cast<otype>(__metal_cos(static_cast<ctype>(x), mfast));      \
  }                                                                            \
  METAL_FUNC otype cosh(itype x) {                                             \
    return static_cast<otype>(__metal_cosh(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype cospi(itype x) {                                            \
    return static_cast<otype>(__metal_cospi(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype divide(itype x, itype y) {                                  \
    return static_cast<otype>(                                                 \
        __metal_divide(static_cast<ctype>(x), static_cast<ctype>(y), mfast));  \
  }                                                                            \
  METAL_FUNC otype exp(itype x) {                                              \
    return static_cast<otype>(__metal_exp(static_cast<ctype>(x), mfast));      \
  }                                                                            \
  METAL_FUNC otype exp10(itype x) {                                            \
    return static_cast<otype>(__metal_exp10(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype exp2(itype x) {                                             \
    return static_cast<otype>(__metal_exp2(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype fabs(itype x) {                                             \
    return static_cast<otype>(__metal_fabs(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype fdim(itype x, itype y) {                                    \
    ctype t = static_cast<ctype>(x - y);                                       \
    return static_cast<otype>(select(t, ctype(0), t < ctype(0) || x == y));    \
  }                                                                            \
  METAL_FUNC otype floor(itype x) {                                            \
    return static_cast<otype>(__metal_floor(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype fma(itype x, itype y, itype z) {                            \
    return static_cast<otype>(__metal_fma(                                     \
        static_cast<ctype>(x), static_cast<ctype>(y), static_cast<ctype>(z))); \
  }                                                                            \
  METAL_FUNC otype fmax(itype x, itype y) {                                    \
    return static_cast<otype>(                                                 \
        __metal_fmax(static_cast<ctype>(x), static_cast<ctype>(y), mfast));    \
  }                                                                            \
  METAL_FUNC otype fmax3(itype x, itype y, itype z) {                          \
    return static_cast<otype>(__metal_fmax3(                                   \
        static_cast<ctype>(x),                                                 \
        static_cast<ctype>(y),                                                 \
        static_cast<ctype>(z),                                                 \
        mfast));                                                               \
  }                                                                            \
  METAL_FUNC otype fmedian3(itype x, itype y, itype z) {                       \
    return static_cast<otype>(__metal_fmedian3(                                \
        static_cast<ctype>(x),                                                 \
        static_cast<ctype>(y),                                                 \
        static_cast<ctype>(z),                                                 \
        mfast));                                                               \
  }                                                                            \
  METAL_FUNC otype fmin(itype x, itype y) {                                    \
    return static_cast<otype>(                                                 \
        __metal_fmin(static_cast<ctype>(x), static_cast<ctype>(y), mfast));    \
  }                                                                            \
  METAL_FUNC otype fmin3(itype x, itype y, itype z) {                          \
    return static_cast<otype>(__metal_fmin3(                                   \
        static_cast<ctype>(x),                                                 \
        static_cast<ctype>(y),                                                 \
        static_cast<ctype>(z),                                                 \
        mfast));                                                               \
  }                                                                            \
  METAL_FUNC otype fmod(itype x, itype y) {                                    \
    return static_cast<otype>(                                                 \
        __metal_fmod(static_cast<ctype>(x), static_cast<ctype>(y), mfast));    \
  }                                                                            \
  METAL_FUNC otype fract(itype x) {                                            \
    return static_cast<otype>(__metal_fract(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype frexp(itype x, thread int& exp) {                           \
    return static_cast<otype>(__metal_frexp(static_cast<ctype>(x), &exp));     \
  }                                                                            \
  METAL_FUNC otype ldexp(itype x, int k) {                                     \
    return static_cast<otype>(__metal_ldexp(static_cast<ctype>(x), k, mfast)); \
  }                                                                            \
  METAL_FUNC otype log(itype x) {                                              \
    return static_cast<otype>(__metal_log(static_cast<ctype>(x), mfast));      \
  }                                                                            \
  METAL_FUNC otype log10(itype x) {                                            \
    return static_cast<otype>(__metal_log10(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype log2(itype x) {                                             \
    return static_cast<otype>(__metal_log2(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype max(itype x, itype y) {                                     \
    return static_cast<otype>(                                                 \
        __metal_fmax(static_cast<ctype>(x), static_cast<ctype>(y), mfast));    \
  }                                                                            \
  METAL_FUNC otype max3(itype x, itype y, itype z) {                           \
    return static_cast<otype>(__metal_fmax3(                                   \
        static_cast<ctype>(x),                                                 \
        static_cast<ctype>(y),                                                 \
        static_cast<ctype>(z),                                                 \
        mfast));                                                               \
  }                                                                            \
  METAL_FUNC otype median3(itype x, itype y, itype z) {                        \
    return static_cast<otype>(__metal_fmedian3(                                \
        static_cast<ctype>(x),                                                 \
        static_cast<ctype>(y),                                                 \
        static_cast<ctype>(z),                                                 \
        mfast));                                                               \
  }                                                                            \
  METAL_FUNC otype min(itype x, itype y) {                                     \
    return static_cast<otype>(                                                 \
        __metal_fmin(static_cast<ctype>(x), static_cast<ctype>(y), mfast));    \
  }                                                                            \
  METAL_FUNC otype min3(itype x, itype y, itype z) {                           \
    return static_cast<otype>(__metal_fmin3(                                   \
        static_cast<ctype>(x),                                                 \
        static_cast<ctype>(y),                                                 \
        static_cast<ctype>(z),                                                 \
        mfast));                                                               \
  }                                                                            \
  METAL_FUNC otype nextafter(itype x, itype y) {                               \
    return static_cast<otype>(                                                 \
        __metal_nextafter(static_cast<ctype>(x), static_cast<ctype>(y)));      \
  }                                                                            \
  METAL_FUNC otype pow(itype x, itype y) {                                     \
    return static_cast<otype>(                                                 \
        __metal_pow(static_cast<ctype>(x), static_cast<ctype>(y), mfast));     \
  }                                                                            \
  METAL_FUNC otype powr(itype x, itype y) {                                    \
    return static_cast<otype>(                                                 \
        __metal_powr(static_cast<ctype>(x), static_cast<ctype>(y), mfast));    \
  }                                                                            \
  METAL_FUNC otype rint(itype x) {                                             \
    return static_cast<otype>(__metal_rint(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype round(itype x) {                                            \
    return static_cast<otype>(__metal_round(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype rsqrt(itype x) {                                            \
    return static_cast<otype>(__metal_rsqrt(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype sin(itype x) {                                              \
    return static_cast<otype>(__metal_sin(static_cast<ctype>(x), mfast));      \
  }                                                                            \
  METAL_FUNC otype sinh(itype x) {                                             \
    return static_cast<otype>(__metal_sinh(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype sinpi(itype x) {                                            \
    return static_cast<otype>(__metal_sinpi(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype sqrt(itype x) {                                             \
    return static_cast<otype>(__metal_sqrt(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype tan(itype x) {                                              \
    return static_cast<otype>(__metal_tan(static_cast<ctype>(x), mfast));      \
  }                                                                            \
  METAL_FUNC otype tanh(itype x) {                                             \
    return static_cast<otype>(__metal_tanh(static_cast<ctype>(x), mfast));     \
  }                                                                            \
  METAL_FUNC otype tanpi(itype x) {                                            \
    return static_cast<otype>(__metal_tanpi(static_cast<ctype>(x), mfast));    \
  }                                                                            \
  METAL_FUNC otype trunc(itype x) {                                            \
    return static_cast<otype>(__metal_trunc(static_cast<ctype>(x), mfast));    \
  }

// bfloat16 math overloads, generated three times so that metal::, metal::fast::
// and metal::precise:: each get a set. All three take and return bfloat16_t
// and compute in float; they differ only in the math-mode argument passed to
// the builtins.
namespace metal {

// metal::<fn> -- uses __METAL_MAYBE_FAST_MATH__.
instantiate_metal_math_funcs(
    bfloat16_t,
    bfloat16_t,
    float,
    __METAL_MAYBE_FAST_MATH__);

namespace fast {

// metal::fast::<fn> -- uses __METAL_FAST_MATH__.
instantiate_metal_math_funcs(
    bfloat16_t,
    bfloat16_t,
    float,
    __METAL_FAST_MATH__);

} // namespace fast

namespace precise {

// metal::precise::<fn> -- uses __METAL_PRECISE_MATH__.
instantiate_metal_math_funcs(
    bfloat16_t,
    bfloat16_t,
    float,
    __METAL_PRECISE_MATH__);

} // namespace precise

} // namespace metal

///////////////////////////////////////////////////////////////////////////////
// Metal simd for bfloat16
///////////////////////////////////////////////////////////////////////////////

// Defines overloads of the simd data-movement functions (broadcast and the
// shuffle family) for one element type.
//
//   itype          - parameter type of the generated overloads.
//   otype          - return type of the generated overloads.
//   ctype          - carrier type named by the caller; it is not referenced in
//                    the bodies, which rely on the two converters instead.
//   itype_to_ctype - callable converting an itype value before it is handed
//                    to the __metal_simd_* builtin.
//   ctype_to_otype - callable converting the builtin's result to otype.
//
// The simd_shuffle_and_fill_{down,up} overloads without a `modulo` parameter
// pass __metal_get_simdgroup_size(ushort()) as the modulo.
#define instantiate_metal_simd_comm_funcs(                                   \
    itype, otype, ctype, itype_to_ctype, ctype_to_otype)                     \
                                                                             \
  METAL_FUNC otype simd_broadcast(itype data, ushort broadcast_lane_id) {    \
    return ctype_to_otype(                                                   \
        __metal_simd_broadcast(itype_to_ctype(data), broadcast_lane_id));    \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle(itype data, ushort simd_lane_id) {           \
    return ctype_to_otype(                                                   \
        __metal_simd_shuffle(itype_to_ctype(data), simd_lane_id));           \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_and_fill_down(                               \
      itype data, itype filling_data, ushort delta, ushort modulo) {         \
    return ctype_to_otype(__metal_simd_shuffle_and_fill_down(                \
        itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_and_fill_down(                               \
      itype data, itype filling_data, ushort delta) {                        \
    return ctype_to_otype(__metal_simd_shuffle_and_fill_down(                \
        itype_to_ctype(data),                                                \
        itype_to_ctype(filling_data),                                        \
        delta,                                                               \
        __metal_get_simdgroup_size(ushort())));                              \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_and_fill_up(                                 \
      itype data, itype filling_data, ushort delta, ushort modulo) {         \
    return ctype_to_otype(__metal_simd_shuffle_and_fill_up(                  \
        itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_and_fill_up(                                 \
      itype data, itype filling_data, ushort delta) {                        \
    return ctype_to_otype(__metal_simd_shuffle_and_fill_up(                  \
        itype_to_ctype(data),                                                \
        itype_to_ctype(filling_data),                                        \
        delta,                                                               \
        __metal_get_simdgroup_size(ushort())));                              \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_down(itype data, ushort delta) {             \
    return ctype_to_otype(                                                   \
        __metal_simd_shuffle_down(itype_to_ctype(data), delta));             \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_rotate_down(itype data, ushort delta) {      \
    return ctype_to_otype(                                                   \
        __metal_simd_shuffle_rotate_down(itype_to_ctype(data), delta));      \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_rotate_up(itype data, ushort delta) {        \
    return ctype_to_otype(                                                   \
        __metal_simd_shuffle_rotate_up(itype_to_ctype(data), delta));        \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_up(itype data, ushort delta) {               \
    return ctype_to_otype(                                                   \
        __metal_simd_shuffle_up(itype_to_ctype(data), delta));               \
  }                                                                          \
                                                                             \
  METAL_FUNC otype simd_shuffle_xor(itype data, ushort mask) {               \
    return ctype_to_otype(                                                   \
        __metal_simd_shuffle_xor(itype_to_ctype(data), mask));               \
  }

// Defines the SIMD-group reduction / prefix-scan overloads (simd_max,
// simd_min, simd_prefix_{exclusive,inclusive}_{product,sum}, simd_product,
// simd_sum, simd_xor) for an input type `itype`.
//
// Every generated function converts its argument to `ctype` with
// static_cast, forwards it to the matching `__metal_simd_*` builtin, and
// converts the builtin's result to `otype` with static_cast.
//
//   itype - parameter type of the generated overloads
//   otype - return type of the generated overloads
//   ctype - type in which the builtin is actually invoked
#define instantiate_metal_simd_reduction_funcs(itype, otype, ctype)            \
                                                                               \
  METAL_FUNC otype simd_max(itype data) {                                      \
    return static_cast<otype>(__metal_simd_max(static_cast<ctype>(data)));     \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_min(itype data) {                                      \
    return static_cast<otype>(__metal_simd_min(static_cast<ctype>(data)));     \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_prefix_exclusive_product(itype data) {                 \
    return static_cast<otype>(                                                 \
        __metal_simd_prefix_exclusive_product(static_cast<ctype>(data)));      \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_prefix_exclusive_sum(itype data) {                     \
    return static_cast<otype>(                                                 \
        __metal_simd_prefix_exclusive_sum(static_cast<ctype>(data)));          \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_prefix_inclusive_product(itype data) {                 \
    return static_cast<otype>(                                                 \
        __metal_simd_prefix_inclusive_product(static_cast<ctype>(data)));      \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_prefix_inclusive_sum(itype data) {                     \
    return static_cast<otype>(                                                 \
        __metal_simd_prefix_inclusive_sum(static_cast<ctype>(data)));          \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_product(itype data) {                                  \
    return static_cast<otype>(__metal_simd_product(static_cast<ctype>(data))); \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_sum(itype data) {                                      \
    return static_cast<otype>(__metal_simd_sum(static_cast<ctype>(data)));     \
  }                                                                            \
                                                                               \
  METAL_FUNC otype simd_xor(itype data) {                                      \
    return static_cast<otype>(__metal_simd_xor(static_cast<ctype>(data)));     \
  }

// Emit the bfloat16_t SIMD-group overloads into namespace `metal`.
namespace metal {

// Communication functions (shuffles etc.): bfloat16_t in and out, with the
// builtin invoked on a uint16_t obtained through bfloat16_to_uint16 and the
// result converted back through uint16_to_bfloat16.
instantiate_metal_simd_comm_funcs(
    bfloat16_t,
    bfloat16_t,
    uint16_t,
    bfloat16_to_uint16,
    uint16_to_bfloat16);
// Reductions and prefix scans: bfloat16_t in and out, with the builtin
// invoked on a float.
instantiate_metal_simd_reduction_funcs(bfloat16_t, bfloat16_t, float);

} // namespace metal


================================================
FILE: mlx/backend/metal/kernels/binary.h
================================================
// Copyright © 2024 Apple Inc.

// Scalar (op) scalar kernel.
//
// Every thread applies Op to the first element of `a` and the first element
// of `b` and stores the result in its own output slot `c[index]`.
template <typename T, typename U, typename Op>
[[kernel]] void binary_ss(
    device const T* a,
    device const T* b,
    device U* c,
    uint index [[thread_position_in_grid]]) {
  Op op;
  c[index] = op(a[0], b[0]);
}

// Scalar (op) vector kernel with N output elements per thread.
//
// `a` is read only at element 0; `b` and `c` are addressed linearly. A thread
// starts at `index * N`. When the N-element chunk would run past `size`, only
// the positions below `size` are processed.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_sv(
    device const T* a,
    device const T* b,
    device U* c,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  Op op;
  index *= N;
  if (N <= 1 || index + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const uint pos = index + i;
      c[pos] = op(a[0], b[pos]);
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (uint pos = index; pos < size; ++pos) {
      c[pos] = op(a[0], b[pos]);
    }
  }
}

// Vector (op) scalar kernel with N output elements per thread.
//
// `b` is read only at element 0; `a` and `c` are addressed linearly. A thread
// starts at `index * N`. When the N-element chunk would run past `size`, only
// the positions below `size` are processed.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vs(
    device const T* a,
    device const T* b,
    device U* c,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  Op op;
  index *= N;
  if (N <= 1 || index + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const uint pos = index + i;
      c[pos] = op(a[pos], b[0]);
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (uint pos = index; pos < size; ++pos) {
      c[pos] = op(a[pos], b[0]);
    }
  }
}

// Vector (op) vector kernel with N output elements per thread.
//
// `a`, `b` and `c` are all addressed with the same linear position. A thread
// starts at `index * N`. When the N-element chunk would run past `size`, only
// the positions below `size` are processed.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vv(
    device const T* a,
    device const T* b,
    device U* c,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  Op op;
  index *= N;
  if (N <= 1 || index + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const uint pos = index + i;
      c[pos] = op(a[pos], b[pos]);
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (uint pos = index; pos < size; ++pos) {
      c[pos] = op(a[pos], b[pos]);
    }
  }
}

// Scalar (op) vector kernel launched on a 2D grid with a 64-bit linear offset.
//
// The thread's starting offset is N * (index.x + grid_dim.x * index.y),
// computed in int64_t. `a` is read only at element 0. When the N-element
// chunk would run past `size`, only the positions below `size` are processed.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_sv2(
    device const T* a,
    device const T* b,
    device U* c,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
  if (N <= 1 || offset + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const int64_t pos = offset + i;
      c[pos] = op(a[0], b[pos]);
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (int64_t pos = offset; pos < size; ++pos) {
      c[pos] = op(a[0], b[pos]);
    }
  }
}

// Vector (op) scalar kernel launched on a 2D grid with a 64-bit linear offset.
//
// The thread's starting offset is N * (index.x + grid_dim.x * index.y),
// computed in int64_t. `b` is read only at element 0. When the N-element
// chunk would run past `size`, only the positions below `size` are processed.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vs2(
    device const T* a,
    device const T* b,
    device U* c,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
  if (N <= 1 || offset + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const int64_t pos = offset + i;
      c[pos] = op(a[pos], b[0]);
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (int64_t pos = offset; pos < size; ++pos) {
      c[pos] = op(a[pos], b[0]);
    }
  }
}

// Vector (op) vector kernel launched on a 2D grid with a 64-bit linear offset.
//
// The thread's starting offset is N * (index.x + grid_dim.x * index.y),
// computed in int64_t. When the N-element chunk would run past `size`, only
// the positions below `size` are processed.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vv2(
    device const T* a,
    device const T* b,
    device U* c,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
  if (N <= 1 || offset + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const int64_t pos = offset + i;
      c[pos] = op(a[pos], b[pos]);
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (int64_t pos = offset; pos < size; ++pos) {
      c[pos] = op(a[pos], b[pos]);
    }
  }
}

// Strided 1D kernel: one output element per thread.
//
// The input locations are obtained from elem_to_loc_1 with the respective
// stride; the output is written at the thread's own `index`.
template <typename T, typename U, typename Op, typename IdxT = int64_t>
[[kernel]] void binary_g_nd1(
    device const T* a,
    device const T* b,
    device U* c,
    constant const int64_t& a_stride,
    constant const int64_t& b_stride,
    uint index [[thread_position_in_grid]]) {
  Op op;
  auto a_loc = elem_to_loc_1<IdxT>(index, a_stride);
  auto b_loc = elem_to_loc_1<IdxT>(index, b_stride);
  c[index] = op(a[a_loc], b[b_loc]);
}

// Strided 2D kernel: one output element per thread.
//
// The input locations are obtained from elem_to_loc_2 with the respective
// strides; the output is written at index.x + grid_dim.x * index.y.
template <typename T, typename U, typename Op, typename IdxT = int64_t>
[[kernel]] void binary_g_nd2(
    device const T* a,
    device const T* b,
    device U* c,
    constant const int64_t a_strides[2],
    constant const int64_t b_strides[2],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  IdxT out_loc = index.x + IdxT(grid_dim.x) * index.y;
  auto a_loc = elem_to_loc_2<IdxT>(index, a_strides);
  auto b_loc = elem_to_loc_2<IdxT>(index, b_strides);
  c[out_loc] = op(a[a_loc], b[b_loc]);
}

// Strided 3D kernel: one output element per thread.
//
// The input locations are obtained from elem_to_loc_3 with the respective
// strides; the output is written at
// index.x + grid_dim.x * (index.y + grid_dim.y * index.z).
template <typename T, typename U, typename Op, typename IdxT = int64_t>
[[kernel]] void binary_g_nd3(
    device const T* a,
    device const T* b,
    device U* c,
    constant const int64_t a_strides[3],
    constant const int64_t b_strides[3],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  Op op;
  IdxT out_loc = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);
  auto a_loc = elem_to_loc_3<IdxT>(index, a_strides);
  auto b_loc = elem_to_loc_3<IdxT>(index, b_strides);
  c[out_loc] = op(a[a_loc], b[b_loc]);
}

// General strided kernel for `ndim` dimensions.
//
// Each thread handles up to N consecutive elements along the last dimension
// (extent shape[ndim - 1]), starting at x = N * index.x. The starting input
// locations come from elem_to_loc_2_nd; after each element they advance by the
// last-dimension strides of `a` and `b`, while the output location advances
// by one. Iteration stops early once x reaches the last-dimension extent.
template <
    typename T,
    typename U,
    typename Op,
    int N = 1,
    typename IdxT = int64_t>
[[kernel]] void binary_g(
    device const T* a,
    device const T* b,
    device U* c,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    constant const int& ndim,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  Op op;
  auto loc = elem_to_loc_2_nd<IdxT>(
      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);
  auto x_extent = shape[ndim - 1];
  IdxT out_loc =
      N * index.x + x_extent * (index.y + IdxT(grid_dim.y) * index.z);
  IdxT a_step = a_strides[ndim - 1];
  IdxT b_step = b_strides[ndim - 1];
  const int x_begin = int(N * index.x);
  for (int i = 0; i < N && (x_begin + i) < x_extent; ++i) {
    c[out_loc] = op(a[loc.x], b[loc.y]);
    ++out_loc;
    loc.x += a_step;
    loc.y += b_step;
  }
}


================================================
FILE: mlx/backend/metal/kernels/binary.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_integer>
#include <metal_math>

// clang-format off
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/binary_ops.h"
#include "mlx/backend/metal/kernels/binary.h"

// Instantiates the contiguous sv / vs / vv kernels under the "svn_", "vsn_"
// and "vvn_" name prefixes. No N argument is passed, so the kernels use their
// default work-per-thread template argument.
//
// The final line intentionally carries no line-continuation backslash, so the
// macro body ends here regardless of what follows in the file.
#define instantiate_binary_work_per_thread(op, tname, itype, otype)     \
  instantiate_kernel("svn_" #op #tname, binary_sv, itype, otype, op)    \
  instantiate_kernel("vsn_" #op #tname, binary_vs, itype, otype, op)    \
  instantiate_kernel("vvn_" #op #tname, binary_vv, itype, otype, op)

// Instantiates the kernel variants that exist for every (op, type) pair.
// Kernel names are the listed prefix followed by the stringified op and tname.
//
//   ss_                  scalar/scalar
//   sv_, vs_, vv_        contiguous kernels with N fixed to 1
//   sv2_, vs2_, vv2_     2D-grid contiguous kernels (default N)
//   gn2_                 binary_g with N = 2 and int indexing
//   gn4large_            binary_g with N = 4 and the default index type
//   g1_, g2_, g3_        fixed-rank strided kernels with int indexing
//   g1large_ .. g3large_ fixed-rank strided kernels with the default index
//                        type
//
// NOTE(review): instantiate_kernel is not defined in this file; presumably it
// comes from one of the included headers -- confirm.
#define instantiate_binary_base(op, tname, itype, otype)                    \
  instantiate_kernel("ss_" #op #tname, binary_ss, itype, otype, op)         \
  instantiate_kernel("sv_" #op #tname, binary_sv, itype, otype, op, 1)      \
  instantiate_kernel("vs_" #op #tname, binary_vs, itype, otype, op, 1)      \
  instantiate_kernel("vv_" #op #tname, binary_vv, itype, otype, op, 1)      \
  instantiate_kernel("sv2_" #op #tname, binary_sv2, itype, otype, op)       \
  instantiate_kernel("vs2_" #op #tname, binary_vs2, itype, otype, op)       \
  instantiate_kernel("vv2_" #op #tname, binary_vv2, itype, otype, op)       \
  instantiate_kernel("gn2_" #op #tname, binary_g, itype, otype, op, 2, int) \
  instantiate_kernel("gn4large_" #op #tname, binary_g, itype, otype, op, 4) \
  instantiate_kernel("g1_" #op #tname, binary_g_nd1, itype, otype, op, int) \
  instantiate_kernel("g1large_" #op #tname, binary_g_nd1, itype, otype, op) \
  instantiate_kernel("g2_" #op #tname, binary_g_nd2, itype, otype, op, int) \
  instantiate_kernel("g2large_" #op #tname, binary_g_nd2, itype, otype, op) \
  instantiate_kernel("g3_" #op #tname, binary_g_nd3, itype, otype, op, int) \
  instantiate_kernel("g3large_" #op #tname, binary_g_nd3, itype, otype, op)

// Instantiates the base variants plus the "svn_" / "vsn_" / "vvn_"
// work-per-thread variants for one (op, type) pair.
#define instantiate_binary_all(op, tname, itype, otype)       \
  instantiate_binary_base(op, tname, itype, otype)            \
  instantiate_binary_work_per_thread(op, tname, itype, otype)

// Instantiates `op` for the signed and unsigned integer types, with output
// type equal to input type. The 64-bit types get only the base variants (no
// work-per-thread kernels); all other widths get the full set.
#define instantiate_binary_integer(op)                    \
  instantiate_binary_all(op, uint8, uint8_t, uint8_t)     \
  instantiate_binary_all(op, uint16, uint16_t, uint16_t)  \
  instantiate_binary_all(op, uint32, uint32_t, uint32_t)  \
  instantiate_binary_base(op, uint64, uint64_t, uint64_t) \
  instantiate_binary_all(op, int8, int8_t, int8_t)        \
  instantiate_binary_all(op, int16, int16_t, int16_t)     \
  instantiate_binary_all(op, int32, int32_t, int32_t)     \
  instantiate_binary_base(op, int64, int64_t, int64_t)

// Instantiates `op` for half, float and bfloat16_t, with output type equal to
// input type.
#define instantiate_binary_float(op)                \
  instantiate_binary_all(op, float16, half, half)   \
  instantiate_binary_all(op, float32, float, float) \
  instantiate_binary_all(op, bfloat16, bfloat16_t, bfloat16_t)

// Instantiates `op` for bool, every integer type, complex64_t (base variants
// only) and the floating-point types, with output type equal to input type.
#define instantiate_binary_types(op)                              \
  instantiate_binary_all(op, bool_, bool, bool)                   \
  instantiate_binary_integer(op)                                  \
  instantiate_binary_base(op, complex64, complex64_t, complex64_t)\
  instantiate_binary_float(op)

// Instantiates `op` for every input type with a bool output type. The 64-bit
// integer types and complex64_t get only the base variants.
#define instantiate_binary_types_bool(op)                \
  instantiate_binary_all(op, bool_, bool, bool)          \
  instantiate_binary_all(op, uint8, uint8_t, bool)       \
  instantiate_binary_all(op, uint16, uint16_t, bool)     \
  instantiate_binary_all(op, uint32, uint32_t, bool)     \
  instantiate_binary_base(op, uint64, uint64_t, bool)    \
  instantiate_binary_all(op, int8, int8_t, bool)         \
  instantiate_binary_all(op, int16, int16_t, bool)       \
  instantiate_binary_all(op, int32, int32_t, bool)       \
  instantiate_binary_base(op, int64, int64_t, bool)      \
  instantiate_binary_all(op, float16, half, bool)        \
  instantiate_binary_all(op, float32, float, bool)       \
  instantiate_binary_all(op, bfloat16, bfloat16_t, bool) \
  instantiate_binary_base(op, complex64, complex64_t, bool)

// Kernel instantiations, one line per op (or per op / type group).
//
// Arithmetic ops keep the input type as the output type; comparison ops
// produce bool.
instantiate_binary_types(Add)
instantiate_binary_types(Divide)
instantiate_binary_types_bool(Equal)
instantiate_binary_types_bool(Greater)
instantiate_binary_types_bool(GreaterEqual)
instantiate_binary_types_bool(Less)
instantiate_binary_types_bool(LessEqual)
instantiate_binary_types_bool(NotEqual)
// LogAddExp: floating-point types plus complex64 (base variants only)
instantiate_binary_float(LogAddExp)
instantiate_binary_base(LogAddExp, complex64, complex64_t, complex64_t)
instantiate_binary_types(Maximum)
instantiate_binary_types(Minimum)
instantiate_binary_types(Multiply)
instantiate_binary_types(Subtract)
instantiate_binary_types(Power)
instantiate_binary_types(Remainder)
// ArcTan2: floating-point types only
instantiate_binary_float(ArcTan2)

// NaNEqual only needed for floating point types with boolean output
instantiate_binary_all(NaNEqual, float16, half, bool)
instantiate_binary_all(NaNEqual, float32, float, bool)
instantiate_binary_all(NaNEqual, bfloat16, bfloat16_t, bool)
instantiate_binary_base(NaNEqual, complex64, complex64_t, bool)

// Logical ops are instantiated for bool only
instantiate_binary_all(LogicalOr, bool_, bool, bool)
instantiate_binary_all(LogicalAnd, bool_, bool, bool)

// Bitwise ops only need integer types and bool (except for l/r shift)
instantiate_binary_integer(BitwiseAnd)
instantiate_binary_all(BitwiseAnd, bool_, bool, bool)
instantiate_binary_integer(BitwiseOr)
instantiate_binary_all(BitwiseOr, bool_, bool, bool)
instantiate_binary_integer(BitwiseXor)
instantiate_binary_all(BitwiseXor, bool_, bool, bool)
instantiate_binary_integer(LeftShift)
instantiate_binary_integer(RightShift) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/binary_ops.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <metal_integer>
#include <metal_math>

// File-scope logger in the constant address space, constructed with the
// strings "mlx" and "binary_ops".
// NOTE(review): `mlx::os_log` is not declared by the headers included above
// (Metal's own logging type is `metal::os_log` from <metal_logging>) --
// confirm this declaration is intended and builds on all supported targets.
constant mlx::os_log logger("mlx", "binary_ops");

// Element-wise addition: returns x + y converted to T.
struct Add {
  template <typename T>
  T operator()(T x, T y) {
    T sum = x + y;
    return sum;
  }
};

// Division functor used for the quotient part of DivMod.
//
// The generic overload returns x / y unchanged. The float, half and
// bfloat16_t specializations apply trunc() to the quotient.
struct FloorDivide {
  template <typename T>
  T operator()(T x, T y) {
    T quotient = x / y;
    return quotient;
  }
  template <>
  float operator()(float x, float y) {
    float quotient = x / y;
    return trunc(quotient);
  }
  template <>
  half operator()(half x, half y) {
    half quotient = x / y;
    return trunc(quotient);
  }
  template <>
  bfloat16_t operator()(bfloat16_t x, bfloat16_t y) {
    auto quotient = x / y;
    return trunc(quotient);
  }
};

// Element-wise division: returns x / y converted to T.
struct Divide {
  template <typename T>
  T operator()(T x, T y) {
    T quotient = x / y;
    return quotient;
  }
};

// Element-wise remainder.
//
//  - unsigned integral types: plain x % y
//  - signed integral types:   x % y, shifted by y when the result is non-zero
//                             and its sign differs from the sign of y
//  - non-integral types:      fmod(x, y) with the same sign adjustment
//  - complex64_t:             x % y
struct Remainder {
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T> && !metal::is_signed_v<T>, T>
  operator()(T x, T y) {
    return x % y;
  }
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T> && metal::is_signed_v<T>, T>
  operator()(T x, T y) {
    auto rem = x % y;
    const bool sign_differs = (rem < 0) != (y < 0);
    if (rem != 0 && sign_differs) {
      rem += y;
    }
    return rem;
  }
  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
    T rem = fmod(x, y);
    const bool sign_differs = (rem < 0) != (y < 0);
    if (rem != 0 && sign_differs) {
      rem += y;
    }
    return rem;
  }
  template <>
  complex64_t operator()(complex64_t x, complex64_t y) {
    return x % y;
  }
};

// Element-wise equality test using operator==.
struct Equal {
  template <typename T>
  bool operator()(T x, T y) {
    const bool same = (x == y);
    return same;
  }
};

// Equality test that additionally treats NaN as equal to NaN.
//
// For complex64_t the real and imaginary parts are matched independently:
// each pair of components must either compare equal or both be NaN (or the
// two values compare equal as a whole).
struct NaNEqual {
  template <typename T>
  bool operator()(T x, T y) {
    if (x == y) {
      return true;
    }
    return metal::isnan(x) && metal::isnan(y);
  }
  template <>
  bool operator()(complex64_t x, complex64_t y) {
    if (x == y) {
      return true;
    }
    const bool real_both_nan = metal::isnan(x.real) && metal::isnan(y.real);
    const bool imag_both_nan = metal::isnan(x.imag) && metal::isnan(y.imag);
    if (real_both_nan && imag_both_nan) {
      return true;
    }
    if (x.real == y.real && imag_both_nan) {
      return true;
    }
    return real_both_nan && x.imag == y.imag;
  }
};

// Element-wise x > y.
struct Greater {
  template <typename T>
  bool operator()(T x, T y) {
    const bool is_greater = (x > y);
    return is_greater;
  }
};

// Element-wise x >= y.
struct GreaterEqual {
  template <typename T>
  bool operator()(T x, T y) {
    const bool is_greater_or_equal = (x >= y);
    return is_greater_or_equal;
  }
};

// Element-wise x < y.
struct Less {
  template <typename T>
  bool operator()(T x, T y) {
    const bool is_less = (x < y);
    return is_less;
  }
};

// Element-wise x <= y.
struct LessEqual {
  template <typename T>
  bool operator()(T x, T y) {
    const bool is_less_or_equal = (x <= y);
    return is_less_or_equal;
  }
};

// Element-wise log(exp(x) + exp(y)), evaluated as
// max + log1p(exp(min - max)).
//
// A NaN in either input yields a quiet NaN. When the smaller value is -inf or
// the larger value is +inf, the larger value is returned directly.
struct LogAddExp {
  template <typename T>
  T operator()(T x, T y) {
    if (metal::isnan(x) || metal::isnan(y)) {
      return metal::numeric_limits<T>::quiet_NaN();
    }
    constexpr T inf = metal::numeric_limits<T>::infinity();
    T hi = metal::max(x, y);
    T lo = metal::min(x, y);
    if (lo == -inf || hi == inf) {
      return hi;
    }
    return hi + log1p(metal::exp(lo - hi));
  }

  // complex64_t overload: the larger / smaller operand is selected with the
  // complex comparison operators, and the infinity check looks only at the
  // real parts.
  complex64_t operator()(complex64_t x, complex64_t y) {
    const bool has_nan = metal::isnan(x.real) || metal::isnan(x.imag) ||
        metal::isnan(y.real) || metal::isnan(y.imag);
    if (has_nan) {
      return metal::numeric_limits<float>::quiet_NaN();
    }
    constexpr float inf = metal::numeric_limits<float>::infinity();
    complex64_t hi = x > y ? x : y;
    complex64_t lo = x < y ? x : y;
    if (lo.real == -inf || hi.real == inf) {
      return hi;
    }
    // exp(lo - hi) in polar form: magnitude from the real part difference,
    // angle from the imaginary part difference.
    float scale = metal::exp(lo.real - hi.real);
    complex64_t delta_exp{
        scale * metal::cos(lo.imag - hi.imag),
        scale * metal::sin(lo.imag - hi.imag),
    };
    return hi + log1p(delta_exp);
  }
};

// Element-wise maximum.
//
// Integral types use metal::max. Non-integral types return x when x is NaN
// or x > y, and y otherwise. complex64_t returns x when either component of x
// is NaN or x > y, and y otherwise.
struct Maximum {
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
    return metal::max(x, y);
  }

  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
    if (metal::isnan(x) || x > y) {
      return x;
    }
    return y;
  }

  template <>
  complex64_t operator()(complex64_t x, complex64_t y) {
    if (metal::isnan(x.real) || metal::isnan(x.imag) || x > y) {
      return x;
    }
    return y;
  }
};

// Element-wise minimum.
//
// Integral types use metal::min. Non-integral types return x when x is NaN
// or x < y, and y otherwise. complex64_t returns x when either component of x
// is NaN or x < y, and y otherwise.
struct Minimum {
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T x, T y) {
    return metal::min(x, y);
  }

  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T x, T y) {
    if (metal::isnan(x) || x < y) {
      return x;
    }
    return y;
  }

  template <>
  complex64_t operator()(complex64_t x, complex64_t y) {
    if (metal::isnan(x.real) || metal::isnan(x.imag) || x < y) {
      return x;
    }
    return y;
  }
};

// Element-wise multiplication: returns x * y converted to T.
struct Multiply {
  template <typename T>
  T operator()(T x, T y) {
    T product = x * y;
    return product;
  }
};

// Element-wise inequality test. For complex64_t the values differ when
// either the real or the imaginary parts differ.
struct NotEqual {
  template <typename T>
  bool operator()(T x, T y) {
    const bool differs = (x != y);
    return differs;
  }
  template <>
  bool operator()(complex64_t x, complex64_t y) {
    if (x.real != y.real) {
      return true;
    }
    return x.imag != y.imag;
  }
};

// Element-wise power: base raised to exp.
//
//  - non-integral types: metal::pow
//  - integral types:     exponentiation by squaring; a negative exponent
//                        yields 0
//  - complex64_t:        computed in polar form (see below)
struct Power {
  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T base, T exp) {
    return metal::pow(base, exp);
  }

  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T base, T exp) {
    T res = 1;
    // Undefined to raise integer to negative power. Return 0 without any
    // logging: this functor runs once per element inside the kernels.
    if (exp < 0) {
      return 0;
    }

    // Square-and-multiply: consume the exponent one bit at a time, squaring
    // the base at every step and multiplying it in when the bit is set.
    while (exp) {
      if (exp & 1) {
        res *= base;
      }
      exp >>= 1;
      base *= base;
    }
    return res;
  }

  template <>
  complex64_t operator()(complex64_t x, complex64_t y) {
    // A zero base yields zero, unless the exponent has a NaN component, in
    // which case both components of the result are NaN.
    if (x.real == 0 && x.imag == 0) {
      if (metal::isnan(y.real) || metal::isnan(y.imag)) {
        auto nan = metal::numeric_limits<float>::quiet_NaN();
        return {nan, nan};
      }
      return {0.0, 0.0};
    }
    // x^y = exp(y * ln(x)) with ln(x) = ln|x| + i * arg(x).
    auto x_theta = metal::atan2(x.imag, x.real);
    auto x_ln_r = 0.5 * metal::log(x.real * x.real + x.imag * x.imag);
    auto mag = metal::exp(y.real * x_ln_r - y.imag * x_theta);
    auto phase = y.imag * x_ln_r + y.real * x_theta;
    return {mag * metal::cos(phase), mag * metal::sin(phase)};
  }
};

// Element-wise subtraction: returns x - y converted to T.
struct Subtract {
  template <typename T>
  T operator()(T x, T y) {
    T difference = x - y;
    return difference;
  }
};

// Element-wise logical AND: returns (x && y) converted to T.
struct LogicalAnd {
  template <typename T>
  T operator()(T x, T y) {
    T both = x && y;
    return both;
  }
};

// Element-wise logical OR: returns (x || y) converted to T.
struct LogicalOr {
  template <typename T>
  T operator()(T x, T y) {
    T either = x || y;
    return either;
  }
};

// Element-wise bitwise AND: returns (x & y) converted to T.
struct BitwiseAnd {
  template <typename T>
  T operator()(T x, T y) {
    T bits = x & y;
    return bits;
  }
};

// Element-wise bitwise OR: returns (x | y) converted to T.
struct BitwiseOr {
  template <typename T>
  T operator()(T x, T y) {
    T bits = x | y;
    return bits;
  }
};

// Element-wise bitwise XOR: returns (x ^ y) converted to T.
struct BitwiseXor {
  template <typename T>
  T operator()(T x, T y) {
    T bits = x ^ y;
    return bits;
  }
};

// Element-wise left shift: returns (x << y) converted to T.
struct LeftShift {
  template <typename T>
  T operator()(T x, T y) {
    T shifted = x << y;
    return shifted;
  }
};

// Element-wise right shift: returns (x >> y) converted to T.
struct RightShift {
  template <typename T>
  T operator()(T x, T y) {
    T shifted = x >> y;
    return shifted;
  }
};

// Element-wise two-argument arctangent of (y, x) using
// metal::precise::atan2. Note the argument order: y first, then x.
struct ArcTan2 {
  template <typename T>
  T operator()(T y, T x) {
    T angle = metal::precise::atan2(y, x);
    return angle;
  }
};

// Two-output functor: element 0 is FloorDivide(x, y), element 1 is
// Remainder(x, y).
struct DivMod {
  template <typename T>
  metal::array<T, 2> operator()(T x, T y) {
    T quotient = FloorDivide{}(x, y);
    T remainder = Remainder{}(x, y);
    return {quotient, remainder};
  }
};


================================================
FILE: mlx/backend/metal/kernels/binary_two.h
================================================
// Copyright © 2024 Apple Inc.

// Scalar (op) scalar kernel for ops with two outputs.
//
// Every thread applies Op to the first elements of `a` and `b`, then stores
// element 0 of the result in `c[index]` and element 1 in `d[index]`.
template <typename T, typename U, typename Op>
[[kernel]] void binary_ss(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    uint index [[thread_position_in_grid]]) {
  Op op;
  auto result = op(a[0], b[0]);
  c[index] = result[0];
  d[index] = result[1];
}

// Scalar (op) vector kernel for ops with two outputs, N elements per thread.
//
// `a` is read only at element 0. A thread starts at `index * N`; when the
// N-element chunk would run past `size`, only the positions below `size` are
// processed. Result element 0 goes to `c`, element 1 to `d`.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_sv(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  Op op;
  index *= N;
  if (N <= 1 || index + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const uint pos = index + i;
      auto result = op(a[0], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (uint pos = index; pos < size; ++pos) {
      auto result = op(a[0], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  }
}

// Vector (op) scalar kernel for ops with two outputs, N elements per thread.
//
// `b` is read only at element 0. A thread starts at `index * N`; when the
// N-element chunk would run past `size`, only the positions below `size` are
// processed. Result element 0 goes to `c`, element 1 to `d`.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vs(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  Op op;
  index *= N;
  if (N <= 1 || index + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const uint pos = index + i;
      auto result = op(a[pos], b[0]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (uint pos = index; pos < size; ++pos) {
      auto result = op(a[pos], b[0]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  }
}

// Vector (op) vector kernel for ops with two outputs, N elements per thread.
//
// A thread starts at `index * N`; when the N-element chunk would run past
// `size`, only the positions below `size` are processed. Result element 0
// goes to `c`, element 1 to `d`.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vv(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  Op op;
  index *= N;
  if (N <= 1 || index + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const uint pos = index + i;
      auto result = op(a[pos], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (uint pos = index; pos < size; ++pos) {
      auto result = op(a[pos], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  }
}

// Scalar (op) vector kernel for ops with two outputs, launched on a 2D grid
// with a 64-bit linear offset.
//
// The starting offset is N * (index.x + grid_dim.x * index.y), computed in
// int64_t. `a` is read only at element 0. When the N-element chunk would run
// past `size`, only the positions below `size` are processed. Result element
// 0 goes to `c`, element 1 to `d`.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_sv2(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
  if (N <= 1 || offset + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const int64_t pos = offset + i;
      auto result = op(a[0], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (int64_t pos = offset; pos < size; ++pos) {
      auto result = op(a[0], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  }
}

// Vector (op) scalar kernel for ops with two outputs, launched on a 2D grid
// with a 64-bit linear offset.
//
// The starting offset is N * (index.x + grid_dim.x * index.y), computed in
// int64_t. `b` is read only at element 0. When the N-element chunk would run
// past `size`, only the positions below `size` are processed. Result element
// 0 goes to `c`, element 1 to `d`.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vs2(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
  if (N <= 1 || offset + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const int64_t pos = offset + i;
      auto result = op(a[pos], b[0]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (int64_t pos = offset; pos < size; ++pos) {
      auto result = op(a[pos], b[0]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  }
}

// Vector (op) vector kernel for ops with two outputs, launched on a 2D grid
// with a 64-bit linear offset.
//
// The starting offset is N * (index.x + grid_dim.x * index.y), computed in
// int64_t. When the N-element chunk would run past `size`, only the positions
// below `size` are processed. Result element 0 goes to `c`, element 1 to `d`.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void binary_vv2(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
  if (N <= 1 || offset + N <= size) {
    // Whole chunk: no per-element bounds check.
    for (int i = 0; i < N; ++i) {
      const int64_t pos = offset + i;
      auto result = op(a[pos], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  } else {
    // Partial chunk at the end of the buffer.
    for (int64_t pos = offset; pos < size; ++pos) {
      auto result = op(a[pos], b[pos]);
      c[pos] = result[0];
      d[pos] = result[1];
    }
  }
}

// Strided 1D kernel for ops with two outputs: one element per thread.
//
// The input locations come from elem_to_loc_1 with the respective stride;
// result element 0 is written to `c[index]` and element 1 to `d[index]`.
template <typename T, typename U, typename Op, typename IdxT = int64_t>
[[kernel]] void binary_g_nd1(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant const int64_t& a_stride,
    constant const int64_t& b_stride,
    uint index [[thread_position_in_grid]]) {
  Op op;
  auto a_loc = elem_to_loc_1<IdxT>(index, a_stride);
  auto b_loc = elem_to_loc_1<IdxT>(index, b_stride);
  auto result = op(a[a_loc], b[b_loc]);
  c[index] = result[0];
  d[index] = result[1];
}

// Strided 2D kernel for ops with two outputs: one element per thread.
//
// The input locations come from elem_to_loc_2 with the respective strides;
// both outputs are written at index.x + grid_dim.x * index.y.
template <typename T, typename U, typename Op, typename IdxT = int64_t>
[[kernel]] void binary_g_nd2(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant const int64_t a_strides[2],
    constant const int64_t b_strides[2],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  Op op;
  IdxT out_loc = index.x + IdxT(grid_dim.x) * index.y;
  auto a_loc = elem_to_loc_2<IdxT>(index, a_strides);
  auto b_loc = elem_to_loc_2<IdxT>(index, b_strides);
  auto result = op(a[a_loc], b[b_loc]);
  c[out_loc] = result[0];
  d[out_loc] = result[1];
}

// Strided 3D kernel for ops with two outputs: one element per thread.
//
// The input locations come from elem_to_loc_3 with the respective strides;
// both outputs are written at
// index.x + grid_dim.x * (index.y + grid_dim.y * index.z).
template <typename T, typename U, typename Op, typename IdxT = int64_t>
[[kernel]] void binary_g_nd3(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant const int64_t a_strides[3],
    constant const int64_t b_strides[3],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  Op op;
  IdxT out_loc = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);
  auto a_loc = elem_to_loc_3<IdxT>(index, a_strides);
  auto b_loc = elem_to_loc_3<IdxT>(index, b_strides);
  auto result = op(a[a_loc], b[b_loc]);
  c[out_loc] = result[0];
  d[out_loc] = result[1];
}

// General strided kernel for `ndim` dimensions and ops with two outputs.
//
// Each thread handles up to N consecutive elements along the last dimension
// (extent shape[ndim - 1]), starting at x = N * index.x. The starting input
// locations come from elem_to_loc_2_nd; after each element they advance by the
// last-dimension strides of `a` and `b`, while the output location advances
// by one. Result element 0 goes to `c`, element 1 to `d`.
template <
    typename T,
    typename U,
    typename Op,
    int N = 1,
    typename IdxT = int64_t>
[[kernel]] void binary_g(
    device const T* a,
    device const T* b,
    device U* c,
    device U* d,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    constant const int& ndim,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  Op op;
  auto loc = elem_to_loc_2_nd<IdxT>(
      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);
  auto x_extent = shape[ndim - 1];
  IdxT out_loc =
      N * index.x + x_extent * (index.y + IdxT(grid_dim.y) * index.z);
  IdxT a_step = a_strides[ndim - 1];
  IdxT b_step = b_strides[ndim - 1];
  const int x_begin = int(N * index.x);
  for (int i = 0; i < N && (x_begin + i) < x_extent; ++i) {
    auto result = op(a[loc.x], b[loc.y]);
    c[out_loc] = result[0];
    d[out_loc] = result[1];
    ++out_loc;
    loc.x += a_step;
    loc.y += b_step;
  }
}


================================================
FILE: mlx/backend/metal/kernels/binary_two.metal
================================================
// Copyright © 2024 Apple Inc.
#include <metal_integer>
#include <metal_math>

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/binary_ops.h"
#include "mlx/backend/metal/kernels/binary_two.h"

// Instantiates the contiguous sv / vs / vv two-output kernels under the
// "svn_", "vsn_" and "vvn_" name prefixes. No N argument is passed, so the
// kernels use their default work-per-thread template argument.
#define instantiate_binary_work_per_thread(op, tname, itype, otype)     \
  instantiate_kernel("svn_" #op #tname, binary_sv, itype, otype, op)    \
  instantiate_kernel("vsn_" #op #tname, binary_vs, itype, otype, op)    \
  instantiate_kernel("vvn_" #op #tname, binary_vv, itype, otype, op)

// Instantiates the two-output kernel variants that exist for every
// (op, type) pair. Kernel names are the listed prefix followed by the
// stringified op and tname.
//
//   ss_                  scalar/scalar
//   sv_, vs_, vv_        contiguous kernels with N fixed to 1
//   sv2_, vs2_, vv2_     2D-grid contiguous kernels (default N)
//   gn2_                 binary_g with N = 2 and int indexing
//   gn4large_            binary_g with N = 4 and the default index type
//   g1_, g2_, g3_        fixed-rank strided kernels with int indexing
//   g1large_ .. g3large_ fixed-rank strided kernels with the default index
//                        type
#define instantiate_binary_base(op, tname, itype, otype)                    \
  instantiate_kernel("ss_" #op #tname, binary_ss, itype, otype, op)         \
  instantiate_kernel("sv_" #op #tname, binary_sv, itype, otype, op, 1)      \
  instantiate_kernel("vs_" #op #tname, binary_vs, itype, otype, op, 1)      \
  instantiate_kernel("vv_" #op #tname, binary_vv, itype, otype, op, 1)      \
  instantiate_kernel("sv2_" #op #tname, binary_sv2, itype, otype, op)       \
  instantiate_kernel("vs2_" #op #tname, binary_vs2, itype, otype, op)       \
  instantiate_kernel("vv2_" #op #tname, binary_vv2, itype, otype, op)       \
  instantiate_kernel("gn2_" #op #tname, binary_g, itype, otype, op, 2, int) \
  instantiate_kernel("gn4large_" #op #tname, binary_g, itype, otype, op, 4) \
  instantiate_kernel("g1_" #op #tname, binary_g_nd1, itype, otype, op, int) \
  instantiate_kernel("g2_" #op #tname, binary_g_nd2, itype, otype, op, int) \
  instantiate_kernel("g3_" #op #tname, binary_g_nd3, itype, otype, op, int) \
  instantiate_kernel("g1large_" #op #tname, binary_g_nd1, itype, otype, op) \
  instantiate_kernel("g2large_" #op #tname, binary_g_nd2, itype, otype, op) \
  instantiate_kernel("g3large_" #op #tname, binary_g_nd3, itype, otype, op)

// Instantiates the base variants plus the "svn_" / "vsn_" / "vvn_"
// work-per-thread variants for one (op, type) pair.
#define instantiate_binary_all(op, tname, itype, otype)       \
  instantiate_binary_base(op, tname, itype, otype)            \
  instantiate_binary_work_per_thread(op, tname, itype, otype)

// Instantiates `op` for half, float and bfloat16_t, with output type equal to
// input type.
#define instantiate_binary_float(op)                \
  instantiate_binary_all(op, float16, half, half)   \
  instantiate_binary_all(op, float32, float, float) \
  instantiate_binary_all(op, bfloat16, bfloat16_t, bfloat16_t)

// Instantiates `op` for bool, the integer types, complex64 and the real
// floating-point types. uint64, int64 and complex64 only receive the base
// variants (instantiate_binary_base); every other type receives the full set
// (instantiate_binary_all).
#define instantiate_binary_types(op)                               \
  instantiate_binary_all(op, bool_, bool, bool)                    \
  instantiate_binary_all(op, uint8, uint8_t, uint8_t)              \
  instantiate_binary_all(op, uint16, uint16_t, uint16_t)           \
  instantiate_binary_all(op, uint32, uint32_t, uint32_t)           \
  instantiate_binary_base(op, uint64, uint64_t, uint64_t)          \
  instantiate_binary_all(op, int8, int8_t, int8_t)                 \
  instantiate_binary_all(op, int16, int16_t, int16_t)              \
  instantiate_binary_all(op, int32, int32_t, int32_t)              \
  instantiate_binary_base(op, int64, int64_t, int64_t)             \
  instantiate_binary_base(op, complex64, complex64_t, complex64_t) \
  instantiate_binary_float(op)

// DivMod is the only op instantiated in this translation unit.
instantiate_binary_types(DivMod) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/cexpf.h
================================================
// Copyright © 2025 Apple Inc.
// Copyright © 2008-2013 NVIDIA Corporation
// Copyright © 2013 Filipe RNC Maia
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Forked from
// https://github.com/NVIDIA/cccl/blob/main/thrust/thrust/detail/complex/cexpf.h

// TODO: We should use thrust::exp but the thrust header in old CUDA versions
// can not be used in JIT.

#pragma once

#include <metal_math>

// Overlays a float with a 32-bit word so that the raw bit pattern of a
// single-precision value can be read (get_float_word) or written
// (set_float_word).
using ieee_float_shape_type = union {
  float value;
  uint32_t word;
};

// Stores the raw 32-bit pattern of `d` into `i` (unsigned overload).
inline void get_float_word(thread uint32_t& i, float d) {
  ieee_float_shape_type punned;
  punned.value = d;
  i = punned.word;
}

// Stores the raw 32-bit pattern of `d` into `i` (signed overload; the word is
// converted from uint32_t on assignment).
inline void get_float_word(thread int32_t& i, float d) {
  ieee_float_shape_type punned;
  punned.value = d;
  i = punned.word;
}

// Overwrites `d` with the float whose raw 32-bit pattern is `i`.
inline void set_float_word(thread float& d, uint32_t i) {
  ieee_float_shape_type punned;
  punned.word = i;
  d = punned.value;
}

inline float frexp_expf(float x, thread int* expt) {
  const uint32_t k = 235;
  const float kln2 = 162.88958740F;

  float exp_x;
  uint32_t hx;

  exp_x = metal::exp(x - kln2);
  get_float_word(hx, exp_x);
  *expt = (hx >> 23) - (0x7f + 127) + k;
  set_float_word(exp_x, (hx & 0x7fffff) | ((0x7f + 127) << 23));
  return exp_x;
}

// Evaluates exp(z) * 2^expt for arguments whose real part would overflow a
// direct exp() call.
//
// The total power of two (expt plus the exponent reported by frexp_expf) is
// split into two halves, each turned into a float scale factor, and both
// factors are applied to the cos/sin terms.
inline complex64_t ldexp_cexpf(complex64_t z, int expt) {
  int ex_expt;
  const float exp_x = frexp_expf(z.real, &ex_expt);
  expt += ex_expt;

  // Build 2^(expt / 2) and 2^(expt - expt / 2) directly from exponent bits.
  const int first_half = expt / 2;
  const int second_half = expt - first_half;

  float scale1;
  float scale2;
  set_float_word(scale1, (0x7f + first_half) << 23);
  set_float_word(scale2, (0x7f + second_half) << 23);

  const float y = z.imag;
  return complex64_t{
      metal::cos(y) * exp_x * scale1 * scale2,
      metal::sin(y) * exp_x * scale1 * scale2};
}

// Complex exponential of a complex64_t.
//
// Special cases are decided from the raw bit patterns of the real part (hx)
// and the absolute value of the imaginary part (hy), in this order:
//   1. imag == +-0              -> exp(x) + I y
//   2. real == +-0              -> cos(y) + I sin(y)
//   3. imag is Inf or NaN       -> NaN / 0 / Inf results depending on real
//   4. hx in [exp_ovfl, cexp_ovfl] -> scaled evaluation via ldexp_cexpf
//   5. everything else          -> exp(x) * (cos(y) + I sin(y))
inline complex64_t cexpf(const thread complex64_t& z) {
  const uint32_t exp_ovfl = 0x42b17218, cexp_ovfl = 0x43400074;

  const float x = z.real;
  const float y = z.imag;

  uint32_t hy;
  get_float_word(hy, y);
  hy &= 0x7fffffff;

  /* cexp(x + I 0) = exp(x) + I 0 */
  if (hy == 0) {
    return complex64_t{metal::exp(x), y};
  }

  uint32_t hx;
  get_float_word(hx, x);
  const uint32_t abs_hx = hx & 0x7fffffff;

  /* cexp(0 + I y) = cos(y) + I sin(y) */
  if (abs_hx == 0) {
    return complex64_t{metal::cos(y), metal::sin(y)};
  }

  if (hy >= 0x7f800000) {
    if (abs_hx != 0x7f800000) {
      /* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */
      return complex64_t{y - y, y - y};
    }
    if (hx & 0x80000000) {
      /* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */
      return complex64_t{0.0, 0.0};
    }
    /* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */
    return complex64_t{x, y - y};
  }

  if (hx >= exp_ovfl && hx <= cexp_ovfl) {
    /*
     * x is between 88.7 and 192, so we must scale to avoid
     * overflow in expf(x).
     */
    return ldexp_cexpf(z, 0);
  }

  /*
   * Cases covered here:
   *  -  x < exp_ovfl and exp(x) won't overflow (common case)
   *  -  x > cexp_ovfl, so exp(x) * s overflows for all s > 0
   *  -  x = +-Inf (generated by exp())
   *  -  x = NaN (spurious inexact exception from y)
   */
  const float exp_x = metal::exp(x);
  return complex64_t{exp_x * metal::cos(y), exp_x * metal::sin(y)};
}


================================================
FILE: mlx/backend/metal/kernels/complex.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <metal_stdlib>

using namespace metal;

struct complex64_t;

// True for any type other than complex64_t itself that is convertible to
// float. Used to enable the converting constructors of complex64_t.
template <typename T>
static constexpr constant bool can_convert_to_complex64 =
    !is_same_v<T, complex64_t> && is_convertible_v<T, float>;

// True for any type other than complex64_t itself that can be produced from
// either a float or a bfloat16_t. Used to enable the conversion operators of
// complex64_t.
template <typename T>
static constexpr constant bool can_convert_from_complex64 =
    !is_same_v<T, complex64_t> &&
    (is_convertible_v<float, T> || is_convertible_v<bfloat16_t, T>);

// Single-precision complex number stored as two floats (real, imag).
//
// Converting constructors and conversion operators are repeated once per
// address-space qualifier (thread, threadgroup, device, constant) so that the
// conversions are available on objects living in each of those spaces.
struct complex64_t {
  float real;
  float imag;

  // Constructors
  constexpr complex64_t(float real, float imag) : real(real), imag(imag) {};
  // Default construction yields 0 + 0i.
  constexpr complex64_t() : real(0), imag(0) {};
  constexpr complex64_t() threadgroup : real(0), imag(0) {};

  // Conversions to complex64_t
  // A real value x becomes x + 0i. Only enabled for types accepted by
  // can_convert_to_complex64.
  template <
      typename T,
      typename = typename enable_if<can_convert_to_complex64<T>>::type>
  constexpr complex64_t(T x) thread : real(x), imag(0) {}

  template <
      typename T,
      typename = typename enable_if<can_convert_to_complex64<T>>::type>
  constexpr complex64_t(T x) threadgroup : real(x), imag(0) {}

  template <
      typename T,
      typename = typename enable_if<can_convert_to_complex64<T>>::type>
  constexpr complex64_t(T x) device : real(x), imag(0) {}

  template <
      typename T,
      typename = typename enable_if<can_convert_to_complex64<T>>::type>
  constexpr complex64_t(T x) constant : real(x), imag(0) {}

  // Conversions from complex64_t
  // The imaginary part is discarded; only the real part is cast to T. Only
  // enabled for types accepted by can_convert_from_complex64.
  template <
      typename T,
      typename = typename enable_if<can_convert_from_complex64<T>>::type>
  constexpr operator T() const thread {
    return static_cast<T>(real);
  }

  template <
      typename T,
      typename = typename enable_if<can_convert_from_complex64<T>>::type>
  constexpr operator T() const threadgroup {
    return static_cast<T>(real);
  }

  template <
      typename T,
      typename = typename enable_if<can_convert_from_complex64<T>>::type>
  constexpr operator T() const device {
    return static_cast<T>(real);
  }

  template <
      typename T,
      typename = typename enable_if<can_convert_from_complex64<T>>::type>
  constexpr operator T() const constant {
    return static_cast<T>(real);
  }
};

// Unary minus: negates both components.
constexpr complex64_t operator-(complex64_t x) {
  return complex64_t(-x.real, -x.imag);
}

// Lexicographic ordering: the real parts decide, the imaginary parts break
// ties.
constexpr bool operator>=(complex64_t a, complex64_t b) {
  if (a.real > b.real) {
    return true;
  }
  return a.real == b.real && a.imag >= b.imag;
}

// Strict lexicographic ordering on (real, imag).
constexpr bool operator>(complex64_t a, complex64_t b) {
  if (a.real > b.real) {
    return true;
  }
  return a.real == b.real && a.imag > b.imag;
}

// a <= b is defined as b >= a.
constexpr bool operator<=(complex64_t a, complex64_t b) {
  return b >= a;
}

// a < b is defined as b > a.
constexpr bool operator<(complex64_t a, complex64_t b) {
  return b > a;
}

// Equal only when both the real and the imaginary parts compare equal.
constexpr bool operator==(complex64_t a, complex64_t b) {
  const bool same_real = a.real == b.real;
  const bool same_imag = a.imag == b.imag;
  return same_real && same_imag;
}

// Component-wise sum of two complex numbers.
constexpr complex64_t operator+(complex64_t a, complex64_t b) {
  return complex64_t(a.real + b.real, a.imag + b.imag);
}

// In-place addition for a value in the thread address space.
constexpr thread complex64_t& operator+=(thread complex64_t& a, complex64_t b) {
  a.real = a.real + b.real;
  a.imag = a.imag + b.imag;
  return a;
}

// In-place addition for a value in the threadgroup address space.
constexpr threadgroup complex64_t& operator+=(
    threadgroup complex64_t& a,
    complex64_t b) {
  a.real = a.real + b.real;
  a.imag = a.imag + b.imag;
  return a;
}

// In-place addition for a value in the device address space.
constexpr device complex64_t& operator+=(device complex64_t& a, complex64_t b) {
  a.real = a.real + b.real;
  a.imag = a.imag + b.imag;
  return a;
}

// real + complex: only the real component is affected.
constexpr complex64_t operator+(float a, complex64_t b) {
  return complex64_t(a + b.real, b.imag);
}
// complex + real: only the real component is affected.
constexpr complex64_t operator+(complex64_t a, float b) {
  return complex64_t(a.real + b, a.imag);
}

// Component-wise difference of two complex numbers.
constexpr complex64_t operator-(complex64_t a, complex64_t b) {
  return complex64_t(a.real - b.real, a.imag - b.imag);
}
// real - complex: the imaginary part of the result is the negated imaginary
// part of b.
constexpr complex64_t operator-(float a, complex64_t b) {
  return complex64_t(a - b.real, -b.imag);
}
// complex - real: only the real component is affected.
constexpr complex64_t operator-(complex64_t a, float b) {
  return complex64_t(a.real - b, a.imag);
}

// Complex product: (ar + i ai)(br + i bi).
constexpr complex64_t operator*(complex64_t a, complex64_t b) {
  const float re = a.real * b.real - a.imag * b.imag;
  const float im = a.real * b.imag + a.imag * b.real;
  return complex64_t(re, im);
}

// Complex quotient a / b, computed as a * conj(b) / |b|^2.
constexpr complex64_t operator/(complex64_t a, complex64_t b) {
  const float norm_sq = b.real * b.real + b.imag * b.imag;
  const float num_re = a.real * b.real + a.imag * b.imag;
  const float num_im = a.imag * b.real - a.real * b.imag;
  return complex64_t(num_re / norm_sq, num_im / norm_sq);
}

// Real divided by complex, computed as a * conj(b) / |b|^2.
constexpr complex64_t operator/(float a, complex64_t b) {
  const float norm_sq = b.real * b.real + b.imag * b.imag;
  const float num_re = a * b.real;
  const float num_im = -a * b.imag;
  return complex64_t(num_re / norm_sq, num_im / norm_sq);
}

// Component-wise remainder. Each component is reduced with a truncating
// division, then shifted by the divisor when it is non-zero and its sign
// differs from the divisor's, so the result takes the sign of the divisor.
constexpr complex64_t operator%(complex64_t a, complex64_t b) {
  float re = a.real - (b.real * static_cast<int64_t>(a.real / b.real));
  float im = a.imag - (b.imag * static_cast<int64_t>(a.imag / b.imag));

  const bool re_sign_mismatch = (re < 0) != (b.real < 0);
  if (re != 0 && re_sign_mismatch) {
    re += b.real;
  }

  const bool im_sign_mismatch = (im < 0) != (b.imag < 0);
  if (im != 0 && im_sign_mismatch) {
    im += b.imag;
  }

  return complex64_t(re, im);
}


================================================
FILE: mlx/backend/metal/kernels/conv.metal
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_simdgroup>
#include <metal_simdgroup_matrix>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/kernels/utils.h"

#define MLX_MTL_CONST static constant constexpr const

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
/// Naive unfold with dilation
///////////////////////////////////////////////////////////////////////////////

// Writes one element of the unfolded input.
//
// Grid mapping used by the indexing below:
//   gid.z -> batch index combined with the flattened output pixel
//   gid.y -> flattened filter tap
//   gid.x -> channel
// The element is written at out[gid.z * (C * prod(wS)) + gid.y * C + gid.x].
// Taps that land in padding, between input-dilation steps, or past the batch
// produce zero.
template <typename T, int N>
[[kernel]] void naive_unfold_Nd(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant MLXConvParams<N>* params [[buffer(2)]],
    uint3 gid [[thread_position_in_grid]]) {
  // Length of one unfolded row and number of output pixels per batch entry
  int row_len = params->C;
  int pixels_per_batch = 1;
  for (short d = 0; d < N; d++) {
    row_len *= params->wS[d];
    pixels_per_batch *= params->oS[d];
  }

  // Advance to the slot of this (output pixel, filter tap) pair
  out += (size_t)gid.z * row_len + (size_t)gid.y * (params->C);

  int n = (gid.z) / pixels_per_batch;
  int out_rem = (gid.z) % pixels_per_batch;
  int tap_rem = gid.y;

  // Input coordinates of the element to read, one per spatial dimension
  int in_coord[N] = {0};
  bool in_bounds = n < params->N;

  // Peel off one spatial dimension at a time, innermost first
  for (int d = N - 1; d >= 0; --d) {
    int o = (out_rem % params->oS[d]);
    int k = (tap_rem % params->wS[d]);

    if (params->flip) {
      k = params->wS[d] - k - 1;
    }

    // Position in the input-dilated coordinate system
    int pos = o * params->str[d] - params->pad[d] + k * params->kdil[d];
    int extent = 1 + params->idil[d] * (params->iS[d] - 1);

    in_bounds &= pos >= 0 && pos < extent && (pos % params->idil[d] == 0);

    in_coord[d] = pos / params->idil[d];

    out_rem /= params->oS[d];
    tap_rem /= params->wS[d];
  }

  if (!in_bounds) {
    out[gid.x] = T(0);
    return;
  }

  size_t in_offset = n * params->in_strides[0];
  for (int d = 0; d < N; ++d) {
    in_offset += in_coord[d] * params->in_strides[d + 1];
  }

  out[gid.x] = in[in_offset + gid.x];
}

// This kernel unfolds the input array of size (N, *spatial_dims, C)
// into an array of size (N x *spatial_dims, C x *kernel_dims).
//
// Same grid mapping as naive_unfold_Nd (gid.z: batch and output pixel,
// gid.y: filter tap, gid.x: channel), but within a row the channel is the
// outer index and the (unflipped) filter tap the inner one.
template <typename T, int N>
[[kernel]] void naive_unfold_transpose_Nd(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant MLXConvParams<N>* params [[buffer(2)]],
    uint3 gid [[thread_position_in_grid]]) {
  // Length of one unfolded row and number of output pixels per batch entry
  int row_len = params->C;
  int pixels_per_batch = 1;
  for (short d = 0; d < N; d++) {
    row_len *= params->wS[d];
    pixels_per_batch *= params->oS[d];
  }

  // Advance to the start of this channel's block of taps in the row
  out +=
      (size_t)gid.z * row_len + (size_t)gid.x * (row_len / params->C);

  int n = (gid.z) / pixels_per_batch;
  int out_rem = (gid.z) % pixels_per_batch;
  int tap_rem = gid.y;

  // Input coordinates of the element to read, one per spatial dimension
  int in_coord[N] = {0};
  bool in_bounds = n < params->N;

  // Peel off one spatial dimension at a time, innermost first
  int tap_stride = 1;
  for (int d = N - 1; d >= 0; --d) {
    int o = (out_rem % params->oS[d]);
    int k = (tap_rem % params->wS[d]);

    // The output position uses the tap index before any flip is applied
    out += k * tap_stride;

    if (params->flip) {
      k = params->wS[d] - k - 1;
    }

    // Position in the input-dilated coordinate system
    int pos = o * params->str[d] - params->pad[d] + k * params->kdil[d];
    int extent = 1 + params->idil[d] * (params->iS[d] - 1);

    in_bounds &= pos >= 0 && pos < extent && (pos % params->idil[d] == 0);

    in_coord[d] = pos / params->idil[d];

    out_rem /= params->oS[d];
    tap_rem /= params->wS[d];

    tap_stride *= params->wS[d];
  }

  if (!in_bounds) {
    out[0] = T(0);
    return;
  }

  size_t in_offset = n * params->in_strides[0];
  for (int d = 0; d < N; ++d) {
    in_offset += in_coord[d] * params->in_strides[d + 1];
  }

  out[0] = in[in_offset + gid.x];
}

// Explicitly instantiates naive_unfold_Nd and naive_unfold_transpose_Nd for
// element type `itype` and `n` spatial dimensions. The kernels are exported
// under the host names "naive_unfold_nd_<name>_<n>" and
// "naive_unfold_transpose_nd_<name>_<n>".
#define instantiate_naive_unfold_nd(name, itype, n)                            \
  template [[host_name("naive_unfold_nd_" #name "_" #n)]] [[kernel]] void      \
  naive_unfold_Nd(                                                             \
      const device itype* in [[buffer(0)]],                                    \
      device itype* out [[buffer(1)]],                                         \
      const constant MLXConvParams<n>* params [[buffer(2)]],                   \
      uint3 gid [[thread_position_in_grid]]);                                  \
  template                                                                     \
      [[host_name("naive_unfold_transpose_nd_" #name "_" #n)]] [[kernel]] void \
      naive_unfold_transpose_Nd(                                               \
          const device itype* in [[buffer(0)]],                                \
          device itype* out [[buffer(1)]],                                     \
          const constant MLXConvParams<n>* params [[buffer(2)]],               \
          uint3 gid [[thread_position_in_grid]]);

// Instantiates both unfold kernels for 1, 2 and 3 spatial dimensions.
#define instantiate_naive_unfold_nd_dims(name, itype)                      \
  instantiate_naive_unfold_nd(name, itype, 1) instantiate_naive_unfold_nd( \
      name, itype, 2) instantiate_naive_unfold_nd(name, itype, 3)

// The unfold kernels are provided for the three floating-point element types.
instantiate_naive_unfold_nd_dims(float32, float);
instantiate_naive_unfold_nd_dims(float16, half);
instantiate_naive_unfold_nd_dims(bfloat16, bfloat16_t);

///////////////////////////////////////////////////////////////////////////////
/// Depthwise convolution kernels
///////////////////////////////////////////////////////////////////////////////

// Function constants consumed by depthwise_conv_2d: kernel size (ker_h,
// ker_w), stride (str_h, str_w), a per-threadgroup tile size (tgp_h, tgp_w)
// and whether the filter is flipped (do_flip).
// Note: the indices 00 and 01 are octal literals for 0 and 1; 10, 11, 100, 101
// and 200 are decimal.
constant int ker_h [[function_constant(00)]];
constant int ker_w [[function_constant(01)]];
constant int str_h [[function_constant(10)]];
constant int str_w [[function_constant(11)]];
constant int tgp_h [[function_constant(100)]];
constant int tgp_w [[function_constant(101)]];
constant bool do_flip [[function_constant(200)]];

// Extent of the input window staged in threadgroup memory:
// (tile size * stride) + (kernel size - 1) along each axis.
constant int span_h = tgp_h * str_h + ker_h - 1;
constant int span_w = tgp_w * str_w + ker_w - 1;
constant int span_hw = span_h * span_w;

// Depthwise 2D convolution: every channel is convolved with its own
// ker_h x ker_w filter.
//
// Each threadgroup first stages a span_h x span_w x tc window of the input in
// threadgroup memory (zero-filled outside the image), then every thread
// accumulates one output element in float and writes it back as T.
//
// Thread mapping used by the indexing below:
//   x (gid.x / lid.x / tid.x) -> channel
//   y (gid.y / lid.y / tid.y) -> output column
//   z (lid.z / tid.z)         -> output row within a block of th rows; tid.z
//                                also encodes the batch index
// NOTE(review): the staging loop assumes th * tw * tc threads per threadgroup
// and that the staged window fits in TGH x TGW x TGC elements -- confirm
// against the host-side dispatch.
template <typename T>
[[kernel]] void depthwise_conv_2d(
    const device T* in [[buffer(0)]],
    const device T* wt [[buffer(1)]],
    device T* out [[buffer(2)]],
    const constant MLXConvParams<2>& params [[buffer(3)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 gid [[thread_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int tc = 8;
  constexpr int tw = 8;
  constexpr int th = 4;

  constexpr int c_per_thr = 8;

  // Capacity of the staging buffer along each axis
  constexpr int TGH = th * 2 + 6;
  constexpr int TGW = tw * 2 + 6;
  constexpr int TGC = tc;

  threadgroup T ins[TGH * TGW * TGC];

  // tid.z enumerates (batch, block of th output rows)
  const int n_tgblocks_h = params.oS[0] / th;
  const int n = tid.z / n_tgblocks_h;
  const int tghid = tid.z % n_tgblocks_h;
  const int oh = tghid * th + lid.z;
  const int ow = gid.y;
  const int c = gid.x;

  in += n * params.in_strides[0];

  // Load in
  {
    constexpr int n_threads = th * tw * tc;
    // Top-left input coordinate and first channel covered by this threadgroup
    const int tg_oh = (tghid * th) * str_h - params.pad[0];
    const int tg_ow = (tid.y * tw) * str_w - params.pad[1];
    const int tg_c = tid.x * tc;

    const int thread_idx = simd_gid * 32 + simd_lid;
    constexpr int thr_per_hw = tc / c_per_thr;
    constexpr int hw_per_group = n_threads / thr_per_hw;

    const int thr_c = thread_idx % thr_per_hw;
    const int thr_hw = thread_idx / thr_per_hw;

    for (int hw = thr_hw; hw < span_hw; hw += hw_per_group) {
      const int h = hw / span_w;
      const int w = hw % span_w;

      const int ih = tg_oh + h;
      const int iw = tg_ow + w;

      const int in_s_offset = h * span_w * TGC + w * TGC;

      if (ih >= 0 && ih < params.iS[0] && iw >= 0 && iw < params.iS[1]) {
        const auto in_load =
            in + ih * params.in_strides[1] + iw * params.in_strides[2] + tg_c;

        MLX_MTL_PRAGMA_UNROLL
        for (int cc = 0; cc < c_per_thr; ++cc) {
          ins[in_s_offset + c_per_thr * thr_c + cc] =
              in_load[c_per_thr * thr_c + cc];
        }
      } else {
        // Outside the image: behaves like zero padding
        MLX_MTL_PRAGMA_UNROLL
        for (int cc = 0; cc < c_per_thr; ++cc) {
          ins[in_s_offset + c_per_thr * thr_c + cc] = T(0);
        }
      }
    }
  }

  // Make the staged window visible to every thread before it is consumed
  threadgroup_barrier(mem_flags::mem_threadgroup);
  wt += c * params.wt_strides[0];

  const auto ins_ptr =
      &ins[lid.z * str_h * span_w * TGC + lid.y * str_w * TGC + lid.x];
  float o = 0.;
  for (int h = 0; h < ker_h; ++h) {
    for (int w = 0; w < ker_w; ++w) {
      int wt_h = h;
      int wt_w = w;
      if (do_flip) {
        wt_h = ker_h - h - 1;
        wt_w = ker_w - w - 1;
      }
      auto inv = ins_ptr[h * span_w * TGC + w * TGC];
      auto wtv = wt[wt_h * ker_w + wt_w];
      // Promote both operands before multiplying so that the product is not
      // rounded (or overflowed) in half / bfloat16 precision before it reaches
      // the float accumulator. This matches depthwise_conv_1d.
      o += static_cast<float>(inv) * static_cast<float>(wtv);
    }
  }
  threadgroup_barrier(mem_flags::mem_none);

  out += n * params.out_strides[0] + oh * params.out_strides[1] +
      ow * params.out_strides[2];
  out[c] = static_cast<T>(o);
}

// Instantiates depthwise_conv_2d for one element type under the host name
// "depthwise_conv_2d_<iname>".
#define instantiate_depthconv2d(iname, itype) \
  instantiate_kernel("depthwise_conv_2d_" #iname, depthwise_conv_2d, itype)

instantiate_depthconv2d(float32, float);
instantiate_depthconv2d(float16, half);
instantiate_depthconv2d(bfloat16, bfloat16_t);

// Depthwise 1D convolution: one thread produces one output element.
//
// The output is written densely in (tid.z, tid.y, tid.x) order, the input is
// addressed through `strides`, and the filter of channel tid.x occupies
// `kernel_size` consecutive weights. Successive taps step the input by
// strides[1]. Accumulation is done in float.
template <typename T, typename IdxT>
[[kernel]] void depthwise_conv_1d(
    const device T* in [[buffer(0)]],
    const device T* w [[buffer(1)]],
    device T* out [[buffer(2)]],
    constant const IdxT strides[3],
    constant const int& kernel_size,
    uint3 tid [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  // Position the three pointers for this thread
  out += (tid.z * static_cast<IdxT>(grid_dim.y) + tid.y) * grid_dim.x + tid.x;
  in += tid.z * strides[0] + tid.y * strides[1] + tid.x * strides[2];
  w += tid.x * kernel_size;

  float sum = 0.0;
  for (int tap = 0; tap < kernel_size; ++tap, in += strides[1]) {
    sum += static_cast<float>(in[0]) * w[tap];
  }

  *out = static_cast<T>(sum);
}

// Instantiates depthwise_conv_1d twice per element type: once with int32_t
// indexing ("depthwise_conv_1d_<iname>") and once with int64_t indexing
// ("depthwise_conv_1d_<iname>_large").
#define instantiate_depthconv1d(iname, itype)                         \
  instantiate_kernel(                                                 \
      "depthwise_conv_1d_" #iname, depthwise_conv_1d, itype, int32_t) \
      instantiate_kernel(                                             \
          "depthwise_conv_1d_" #iname "_large",                       \
          depthwise_conv_1d,                                          \
          itype,                                                      \
          int64_t)

instantiate_depthconv1d(float32, float);
instantiate_depthconv1d(float16, half);
instantiate_depthconv1d(bfloat16, bfloat16_t);

///////////////////////////////////////////////////////////////////////////////
/// Winograd kernels
///////////////////////////////////////////////////////////////////////////////

// Holder for the constant matrices of a Winograd transform, selected by
// output tile size M, filter size R and simdgroup matrix size S. The primary
// template is empty; only the specializations carry data.
template <int M, int R, int S>
struct WinogradTransforms {};

// Transforms for 6x6 output tiles, 3x3 filters and 8x8 simdgroup matrices.
template <>
struct WinogradTransforms<6, 3, 8> {
  MLX_MTL_CONST int OUT_TILE_SIZE = 6;
  MLX_MTL_CONST int FILTER_SIZE = 3;
  MLX_MTL_CONST int IN_TILE_SIZE = OUT_TILE_SIZE + FILTER_SIZE - 1;
  MLX_MTL_CONST int SIMD_MATRIX_SIZE = 8;
  // Input transform, a full 8x8 matrix.
  MLX_MTL_CONST float in_transform[SIMD_MATRIX_SIZE][SIMD_MATRIX_SIZE] = {
      {1.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f},
      {0.00f, 1.00f, -1.00f, 0.50f, -0.50f, 2.00f, -2.00f, -1.00f},
      {-5.25f, 1.00f, 1.00f, 0.25f, 0.25f, 4.00f, 4.00f, 0.00f},
      {0.00f, -4.25f, 4.25f, -2.50f, 2.50f, -2.50f, 2.50f, 5.25f},
      {5.25f, -4.25f, -4.25f, -1.25f, -1.25f, -5.00f, -5.00f, 0.00f},
      {0.00f, 1.00f, -1.00f, 2.00f, -2.00f, 0.50f, -0.50f, -5.25f},
      {-1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 0.00f},
      {0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 1.00f},
  };

  // Output transform. Only 6 columns are listed per row; the remaining two
  // columns of each 8-wide row are zero-initialized.
  MLX_MTL_CONST float out_transform[SIMD_MATRIX_SIZE][SIMD_MATRIX_SIZE] = {
      {1.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f},
      {1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f},
      {1.00f, -1.00f, 1.00f, -1.00f, 1.00f, -1.00f},
      {1.00f, 2.00f, 4.00f, 8.00f, 16.00f, 32.00f},
      {1.00f, -2.00f, 4.00f, -8.00f, 16.00f, -32.00f},
      {1.00f, 0.50f, 0.25f, 0.125f, 0.0625f, 0.03125f},
      {1.00f, -0.50f, 0.25f, -0.125f, 0.0625f, -0.03125f},
      {0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 1.00f},
  };

  // Weight transform. Only 3 columns are listed per row; the remaining five
  // columns of each 8-wide row are zero-initialized.
  MLX_MTL_CONST float wt_transform[SIMD_MATRIX_SIZE][SIMD_MATRIX_SIZE] = {
      {1.00, 0.00, 0.00},
      {-2.0 / 9.00, -2.0 / 9.00, -2.0 / 9.00},
      {-2.0 / 9.00, 2.0 / 9.00, -2.0 / 9.00},
      {1.0 / 90.0, 1.0 / 45.0, 2.0 / 45.0},
      {1.0 / 90.0, -1.0 / 45.0, 2.0 / 45.0},
      {32.0 / 45.0, 16.0 / 45.0, 8.0 / 45.0},
      {32.0 / 45.0, -16.0 / 45.0, 8.0 / 45.0},
      {0.00, 0.00, 1.00},
  };
};

// Out-of-class definitions of the static tables declared above.
constant constexpr const float WinogradTransforms<6, 3, 8>::wt_transform[8][8];
constant constexpr const float WinogradTransforms<6, 3, 8>::in_transform[8][8];
constant constexpr const float WinogradTransforms<6, 3, 8>::out_transform[8][8];

template <typename T, int BC = 32, int BO = 4, int M = 6, int R = 3>
[[kernel, max_total_threads_per_threadgroup(BO * 32)]] void
winograd_conv_2d_weight_transform(
    const device T* wt_in [[buffer(0)]],
    device T* wt_out [[buffer(1)]],
    const constant int& C [[buffer(2)]],
    const constant int& O [[buffer(3)]],
    uint tid [[threadgroup_position_in_grid]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]]) {
  using WGT = WinogradTransforms<M, R, 8>;

  // Get lane position in simdgroup
  const short qid = simd_lane_id / 4;
  const short sm = (qid & 4) + (simd_lane_id / 2) % 4;
  const short sn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;

  // Initialize G matrix
  simdgroup_matrix<float, 8, 8> G;
  G.thread_elements()[0] = WGT::wt_transform[sm][sn];
  G.thread_elements()[1] = WGT::wt_transform[sm][sn + 1];

  // Initialize Gt matrix
  simdgroup_matrix<float, 8, 8> Gt;
  Gt.thread_elements()[0] = WGT::wt_transform[sn][sm];
  Gt.thread_elements()[1] = WGT::wt_transform[sn + 1][sm];

  // Move to the correct output filter
  size_t ko = BO * tid + simd_group_id;
  wt_in += ko * R * R * C;

  // wt_out is stored transposed (A x A x C x O)
  short ohw_0 = sm * 8 + sn;
  short ohw_1 = sm * 8 + sn + 1;
  device T* wt_out_0 = wt_out + ohw_0 * C * O + ko;
  device T* wt_out_1 = wt_out + ohw_1 * C * O + ko;

  // Prepare shared memory
  threadgroup T Ws[BO][R][R][BC];

  // Loop over C
  for (int bc = 0; bc < C; bc += BC) {
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Read into shared memory
    for (int kh = 0; kh < R; ++kh) {
      for (int kw = 0; kw < R; ++kw) {
        for (int kc = simd_lane_id; kc < BC; kc += 32) {
          Ws[simd_group_id][kh][kw][kc] = wt_in[kh * R * C + kw * C + kc];
        }
      }
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Do transform and store the result
    for (int c = 0; c < BC; ++c) {
      simdgroup_matrix<float, 8, 8> g;
      g.thread_elements()[0] =
          sm < R && sn < R ? Ws[simd_group_id][sm][sn][c] : T(0);
      g.thread_elements()[1] =
          sm < R && sn + 1 < R ? Ws[simd_group_id][sm][sn + 1][c] : T(0);

      simdgroup_matrix<float, 8, 8> g_out = (G * g) * Gt;
      wt_out_0[c * O] = static_cast<T>(g_out.thread_elements()[0]);
      wt_out_1[c * O] = static_cast<T>(g_out.thread_elements()[1]);
    }

    wt_in += BC;
    wt_out_0 += BC * O;
    wt_out_1 += BC * O;
  }
}

// Explicitly instantiates winograd_conv_2d_weight_transform<itype, bc> under
// the host name "winograd_conv_2d_weight_transform_<name>_bc<bc>". The
// remaining template parameters keep their defaults.
#define instantiate_winograd_conv_2d_weight_transform_base(name, itype, bc)   \
  template [[host_name(                                                       \
      "winograd_conv_2d_weight_transform_" #name "_bc" #bc)]] [[kernel]] void \
  winograd_conv_2d_weight_transform<itype, bc>(                               \
      const device itype* wt_in [[buffer(0)]],                                \
      device itype* wt_out [[buffer(1)]],                                     \
      const constant int& C [[buffer(2)]],                                    \
      const constant int& O [[buffer(3)]],                                    \
      uint tid [[threadgroup_position_in_grid]],                              \
      uint simd_group_id [[simdgroup_index_in_threadgroup]],                  \
      uint simd_lane_id [[thread_index_in_simdgroup]]);

template <typename T, int BC, int WM, int WN, int M = 6, int R = 3>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
winograd_conv_2d_input_transform(
    const device T* inp_in [[buffer(0)]],
    device T* inp_out [[buffer(1)]],
    const constant MLXConvParams<2>& params [[buffer(2)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 tgp_per_grid [[threadgroups_per_grid]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]]) {
  (void)lid;

  using WGT = WinogradTransforms<M, R, 8>;
  constexpr int A = WGT::IN_TILE_SIZE;
  constexpr int N_SIMD_GROUPS = WM * WN;

  // Get lane position in simdgroup
  const short qid = simd_lane_id / 4;
  const short sm = (qid & 4) + (simd_lane_id / 2) % 4;
  const short sn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;

  // Initialize B matrix
  simdgroup_matrix<float, 8, 8> B;
  B.thread_elements()[0] = WGT::in_transform[sm][sn];
  B.thread_elements()[1] = WGT::in_transform[sm][sn + 1];

  // Initialize Bt matrix
  simdgroup_matrix<float, 8, 8> Bt;
  Bt.thread_elements()[0] = WGT::in_transform[sn][sm];
  Bt.thread_elements()[1] = WGT::in_transform[sn + 1][sm];

  // Resolve input tile
  constexpr int TH = (A / WM);
  constexpr int TW = (A / WN);
  int kh = TH * (simd_group_id / WN);
  int kw = TW * (simd_group_id % WN);
  int bh = M * tid.y + kh;
  int bw = M * tid.x + kw;

  // Move to the correct input tile
  inp_in += tid.z * params.in_strides[0] + bh * params.in_strides[1] +
      bw * params.in_strides[2];

  // Pre compute strides
  int jump_in[TH][TW];

  for (int h = 0; h < TH; h++) {
    for (int w = 0; w < TW; w++) {
      jump_in[h][w] = h * params.in_strides[1] + w * params.in_strides[2];
    }
  }

  // inp_out is stored interleaved (A x A x tiles x C)
  size_t N_TILES = tgp_per_grid.x * tgp_per_grid.y * tgp_per_grid.z;
  size_t tile_id =
      tid.z * tgp_per_grid.x * tgp_per_grid.y + tid.y * tgp_per_grid.x + tid.x;
  size_t ohw_0 = sm * 8 + sn;
  size_t ohw_1 = sm * 8 + sn + 1;
  device T* inp_out_0 =
      inp_out + ohw_0 * N_TILES * params.C + tile_id * params.C;
  device T* inp_out_1 =
      inp_out + ohw_1 * N_TILES * params.C + tile_id * params.C;

  // Prepare shared memory
  threadgroup T Is[A][A][BC];

  // Loop over C
  for (int bc = 0; bc < params.C; bc += BC) {
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Read into shared memory
    for (int h = 0; h < TH; h++) {
      for (int w = 0; w < TW; w++) {
        const device T* in_ptr = inp_in + jump_in[h][w];
        for (int c = simd_lane_id; c < BC; c += 32) {
          Is[kh + h][kw + w][c] = in_ptr[c];
        }
      }
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Do transform and store the result
    for (int c = simd_group_id; c < BC; c += N_SIMD_GROUPS) {
      simdgroup_matrix<float, 8, 8> I;
      I.thread_elements()[0] = Is[sm][sn][c];
      I.thread_elements()[1] = Is[sm][sn + 1][c];

      simdgroup_matrix<float, 8, 8> I_out = (Bt * I) * B;
      inp_out_0[c] = static_cast<T>(I_out.thread_elements()[0]);
      inp_out_1[c] = static_cast<T>(I_out.thread_elements()[1]);
    }

    inp_in += BC;
    inp_out_0 += BC;
    inp_out_1 += BC;
  }
}

// Explicitly instantiates winograd_conv_2d_input_transform<itype, bc, 2, 2>
// (WM = WN = 2, i.e. four simdgroups per threadgroup) under the host name
// "winograd_conv_2d_input_transform_<name>_bc<bc>".
#define instantiate_winograd_conv_2d_input_transform(name, itype, bc)        \
  template [[host_name(                                                      \
      "winograd_conv_2d_input_transform_" #name "_bc" #bc)]] [[kernel]] void \
  winograd_conv_2d_input_transform<itype, bc, 2, 2>(                         \
      const device itype* inp_in [[buffer(0)]],                              \
      device itype* inp_out [[buffer(1)]],                                   \
      const constant MLXConvParams<2>& params [[buffer(2)]],                 \
      uint3 tid [[threadgroup_position_in_grid]],                            \
      uint3 lid [[thread_position_in_threadgroup]],                          \
      uint3 tgp_per_grid [[threadgroups_per_grid]],                          \
      uint simd_group_id [[simdgroup_index_in_threadgroup]],                 \
      uint simd_lane_id [[thread_index_in_simdgroup]]);

template <typename T, int BO, int WM, int WN, int M = 6, int R = 3>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
winograd_conv_2d_output_transform(
    const device T* out_in [[buffer(0)]],
    device T* out_out [[buffer(1)]],
    const constant MLXConvParams<2>& params [[buffer(2)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 tgp_per_grid [[threadgroups_per_grid]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]]) {
  (void)lid;

  using WGT = WinogradTransforms<M, R, 8>;
  constexpr int N_SIMD_GROUPS = WM * WN;

  // Get lane position in simdgroup
  const short qid = simd_lane_id / 4;
  const short sm = (qid & 4) + (simd_lane_id / 2) % 4;
  const short sn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;

  // Initialize A matrix
  simdgroup_matrix<float, 8, 8> B;
  B.thread_elements()[0] = WGT::out_transform[sm][sn];
  B.thread_elements()[1] = WGT::out_transform[sm][sn + 1];

  // Initialize At matrix
  simdgroup_matrix<float, 8, 8> Bt;
  Bt.thread_elements()[0] = WGT::out_transform[sn][sm];
  Bt.thread_elements()[1] = WGT::out_transform[sn + 1][sm];

  // Out_in comes in shape (A x A x tiles x O)
  // We do transform and then write out to out_out in shape (N, H, W, O)

  // Resolve output tile
  constexpr int TH = (M / WM);
  constexpr int TW = (M / WN);
  int kh = TH * (simd_group_id / WN);
  int kw = TW * (simd_group_id % WN);
  int bh = M * tid.y + kh;
  int bw = M * tid.x + kw;

  // Move to the correct input tile
  out_out += tid.z * params.out_strides[0] + bh * params.out_strides[1] +
      bw * params.out_strides[2];

  // Pre compute strides
  int jump_in[TH][TW];

  for (int h = 0; h < TH; h++) {
    for (int w = 0; w < TW; w++) {
      bool valid = ((bh + h) < params.oS[0]) && ((bw + w) < params.oS[1]);
      jump_in[h][w] =
          valid ? h * params.out_strides[1] + w * params.out_strides[2] : -1;
    }
  }

  // out_in is stored interleaved (A x A x tiles x O)
  size_t N_TILES = tgp_per_grid.x * tgp_per_grid.y * tgp_per_grid.z;
  size_t tile_id =
      tid.z * tgp_per_grid.x * tgp_per_grid.y + tid.y * tgp_per_grid.x + tid.x;
  size_t ohw_0 = sm * 8 + sn;
  size_t ohw_1 = sm * 8 + sn + 1;
  const device T* out_in_0 =
      out_in + ohw_0 * N_TILES * params.O + tile_id * params.O;
  const device T* out_in_1 =
      out_in + ohw_1 * N_TILES * params.O + tile_id * params.O;

  // Prepare shared memory
  threadgroup T Os[M][M][BO];

  // Loop over O
  for (int bo = 0; bo < params.O; bo += BO) {
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Do transform and store the result
    for (int c = simd_group_id; c < BO; c += N_SIMD_GROUPS) {
      simdgroup_matrix<float, 8, 8> O_mat;
      O_mat.thread_elements()[0] = out_in_0[c];
      O_mat.thread_elements()[1] = out_in_1[c];

      simdgroup_matrix<float, 8, 8> O_out = (Bt * (O_mat * B));
      if ((sm < M) && (sn < M)) {
        Os[sm][sn][c] = static_cast<T>(O_out.thread_elements()[0]);
      }
      if ((sm < M) && ((sn + 1) < M)) {
        Os[sm][sn + 1][c] = static_cast<T>(O_out.thread_elements()[1]);
      }
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Read out from shared memory
    for (int h = 0; h < TH; h++) {
      for (int w = 0; w < TW; w++) {
        if (jump_in[h][w] >= 0) {
          device T* out_ptr = out_out + jump_in[h][w];
          for (int c = simd_lane_id; c < BO; c += 32) {
            out_ptr[c] = Os[kh + h][kw + w][c];
          }
        }
      }
    }

    out_out += BO;
    out_in_0 += BO;
    out_in_1 += BO;
  }
}

// Explicitly instantiates winograd_conv_2d_output_transform<itype, bo, 2, 2>
// and exports it under the host name
// "winograd_conv_2d_output_transform_<name>_bo<bo>".
//   name  - type tag spliced into the host name (e.g. float32)
//   itype - element type of both the input and output buffers
//   bo    - value passed as the second template argument
#define instantiate_winograd_conv_2d_output_transform(name, itype, bo)        \
  template [[host_name(                                                       \
      "winograd_conv_2d_output_transform_" #name "_bo" #bo)]] [[kernel]] void \
  winograd_conv_2d_output_transform<itype, bo, 2, 2>(                         \
      const device itype* out_in [[buffer(0)]],                               \
      device itype* out_out [[buffer(1)]],                                    \
      const constant MLXConvParams<2>& params [[buffer(2)]],                  \
      uint3 tid [[threadgroup_position_in_grid]],                             \
      uint3 lid [[thread_position_in_threadgroup]],                           \
      uint3 tgp_per_grid [[threadgroups_per_grid]],                           \
      uint simd_group_id [[simdgroup_index_in_threadgroup]],                  \
      uint simd_lane_id [[thread_index_in_simdgroup]]);

// clang-format off
// Instantiates the three Winograd stages (weight transform, input transform,
// output transform) for one element type, each with 32 as the trailing
// macro argument.
#define instantiate_winograd_conv_2d(name, itype)                     \
  instantiate_winograd_conv_2d_weight_transform_base(name, itype, 32) \
  instantiate_winograd_conv_2d_input_transform(name, itype, 32)       \
  instantiate_winograd_conv_2d_output_transform(name, itype, 32) // clang-format on

// Element types for which the Winograd kernels are emitted.
// clang-format off
instantiate_winograd_conv_2d(float32, float);
instantiate_winograd_conv_2d(bfloat16, bfloat16_t);
instantiate_winograd_conv_2d(float16, half); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/copy.h
================================================
// Copyright © 2024 Apple Inc.

// Scalar broadcast copy: writes static_cast<U>(src[0]) into dst.
// Each thread handles N consecutive outputs starting at index * N; the last
// thread stops at `size` when its chunk would run past the end.
template <typename T, typename U, int N = WorkPerThread<U>::n>
[[kernel]] void copy_s(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  index *= N;
  const bool whole_chunk = N <= 1 || index + N <= size;
  if (whole_chunk) {
    // The full chunk is in range: fixed trip count of N.
    for (int k = 0; k < N; ++k) {
      dst[index + k] = static_cast<U>(src[0]);
    }
    return;
  }
  // Partial chunk at the end of the buffer: bound every write by size.
  for (int k = 0; index + k < size; ++k) {
    dst[index + k] = static_cast<U>(src[0]);
  }
}

// Element-wise copy with conversion: dst[i] = static_cast<U>(src[i]).
// Each thread handles N consecutive elements starting at index * N; the last
// thread stops at `size` when its chunk would run past the end.
template <typename T, typename U, int N = WorkPerThread<U>::n>
[[kernel]] void copy_v(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  index *= N;
  const bool whole_chunk = N <= 1 || index + N <= size;
  if (whole_chunk) {
    // The full chunk is in range: fixed trip count of N.
    for (int k = 0; k < N; ++k) {
      dst[index + k] = static_cast<U>(src[index + k]);
    }
    return;
  }
  // Partial chunk at the end of the buffer: bound every access by size.
  for (int k = 0; index + k < size; ++k) {
    dst[index + k] = static_cast<U>(src[index + k]);
  }
}

// Scalar broadcast copy over a 2D grid with a 64-bit element count.
// The thread's linear position (x + grid_dim.x * y) is scaled by N to get the
// first output it owns; every owned output receives static_cast<U>(src[0]).
template <typename T, typename U, int N = WorkPerThread<U>::n>
[[kernel]] void copy_s2(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  int64_t base = N * (index.x + grid_dim.x * int64_t(index.y));
  const bool whole_chunk = N <= 1 || base + N <= size;
  if (whole_chunk) {
    // The full chunk is in range: fixed trip count of N.
    for (int k = 0; k < N; ++k) {
      dst[base + k] = static_cast<U>(src[0]);
    }
    return;
  }
  // Partial chunk at the end of the buffer: bound every write by size.
  for (int k = 0; base + k < size; ++k) {
    dst[base + k] = static_cast<U>(src[0]);
  }
}

// Element-wise converting copy over a 2D grid with a 64-bit element count.
// The thread's linear position (x + grid_dim.x * y) is scaled by N to get the
// first element it owns.
template <typename T, typename U, int N = WorkPerThread<U>::n>
[[kernel]] void copy_v2(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  int64_t base = N * (index.x + grid_dim.x * int64_t(index.y));
  const bool whole_chunk = N <= 1 || base + N <= size;
  if (whole_chunk) {
    // The full chunk is in range: fixed trip count of N.
    for (int k = 0; k < N; ++k) {
      dst[base + k] = static_cast<U>(src[base + k]);
    }
    return;
  }
  // Partial chunk at the end of the buffer: bound every access by size.
  for (int k = 0; base + k < size; ++k) {
    dst[base + k] = static_cast<U>(src[base + k]);
  }
}

// 1D strided-source copy: one thread per output element. The source location
// comes from elem_to_loc_1 with src_stride; the destination is written at
// the thread index.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_g_nd1(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t& src_stride [[buffer(3)]],
    uint index [[thread_position_in_grid]]) {
  dst[index] = static_cast<U>(src[elem_to_loc_1<IdxT>(index, src_stride)]);
}

// 2D strided-source copy: one thread per output element. The destination is
// addressed row-major over the launch grid (x fastest); the source location
// comes from elem_to_loc_2 with src_strides.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_g_nd2(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t* src_strides [[buffer(3)]],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  IdxT out_loc = index.x + IdxT(grid_dim.x) * index.y;
  auto in_loc = elem_to_loc_2<IdxT>(index, src_strides);
  dst[out_loc] = static_cast<U>(src[in_loc]);
}

// 3D strided-source copy: one thread per output element. The destination is
// addressed row-major over the launch grid (x fastest, then y, then z); the
// source location comes from elem_to_loc_3 with src_strides.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_g_nd3(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t* src_strides [[buffer(3)]],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  IdxT out_loc =
      index.x + IdxT(grid_dim.x) * (index.y + IdxT(grid_dim.y) * index.z);
  auto in_loc = elem_to_loc_3<IdxT>(index, src_strides);
  dst[out_loc] = static_cast<U>(src[in_loc]);
}

// General N-dimensional strided-source copy into a densely indexed
// destination. Each thread handles up to N consecutive elements along the
// innermost source dimension, starting at N * index.x.
template <typename T, typename U, int N = 1, typename IdxT = int64_t>
[[kernel]] void copy_g(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int* src_shape [[buffer(2)]],
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int& ndim [[buffer(5)]],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  auto in_loc = elem_to_loc<IdxT>(
      {N * index.x, index.y, index.z}, src_shape, src_strides, ndim);
  if (N != 1) {
    // Multi-element path: destination rows are src_shape[ndim - 1] long, and
    // the loop stops early when the chunk crosses the end of the row.
    auto row_len = src_shape[ndim - 1];
    IdxT out_loc =
        N * index.x + row_len * (index.y + IdxT(grid_dim.y) * index.z);
    auto in_step = src_strides[ndim - 1];
    for (int k = 0; k < N && (int(N * index.x) + k) < row_len; ++k) {
      dst[out_loc + k] = static_cast<U>(src[in_loc]);
      in_loc += in_step;
    }
    return;
  }
  // Single-element path: the destination is row-major over the launch grid.
  IdxT out_loc = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);
  dst[out_loc] = static_cast<U>(src[in_loc]);
}

// 1D copy where both source and destination are strided. Each location is
// produced by elem_to_loc_1 from the thread index and the matching stride.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_gg_nd1(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t& src_stride [[buffer(3)]],
    constant const int64_t& dst_stride [[buffer(4)]],
    uint index [[thread_position_in_grid]]) {
  auto in_loc = elem_to_loc_1<IdxT>(index, src_stride);
  auto out_loc = elem_to_loc_1<IdxT>(index, dst_stride);
  dst[out_loc] = static_cast<U>(src[in_loc]);
}

// 2D copy where both source and destination are strided. Each location is
// produced by elem_to_loc_2 from the thread index and the matching strides.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_gg_nd2(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    uint2 index [[thread_position_in_grid]]) {
  auto in_loc = elem_to_loc_2<IdxT>(index, src_strides);
  auto out_loc = elem_to_loc_2<IdxT>(index, dst_strides);
  dst[out_loc] = static_cast<U>(src[in_loc]);
}

// 3D copy where both source and destination are strided. Each location is
// produced by elem_to_loc_3 from the thread index and the matching strides.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_gg_nd3(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    uint3 index [[thread_position_in_grid]]) {
  auto in_loc = elem_to_loc_3<IdxT>(index, src_strides);
  auto out_loc = elem_to_loc_3<IdxT>(index, dst_strides);
  dst[out_loc] = static_cast<U>(src[in_loc]);
}

// General N-dimensional copy where both source and destination are strided.
// elem_to_loc_2_nd yields the pair of starting locations (.x for src, .y for
// dst); each thread then handles up to N elements along the innermost
// dimension, starting at N * index.x.
template <typename T, typename U, int N = 1, typename IdxT = int64_t>
[[kernel]] void copy_gg(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int* src_shape [[buffer(2)]],
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    constant const int& ndim [[buffer(5)]],
    uint3 index [[thread_position_in_grid]]) {
  auto loc = elem_to_loc_2_nd<IdxT>(
      {N * index.x, index.y, index.z},
      src_shape,
      src_strides,
      dst_strides,
      ndim);
  if (N != 1) {
    // Walk the innermost dimension, advancing both locations by their own
    // stride and stopping at the end of the row.
    IdxT in_step = src_strides[ndim - 1];
    IdxT out_step = dst_strides[ndim - 1];
    auto row_len = src_shape[ndim - 1];
    for (int k = 0; k < N && (int(N * index.x) + k) < row_len; ++k) {
      dst[loc.y] = static_cast<U>(src[loc.x]);
      loc.x += in_step;
      loc.y += out_step;
    }
    return;
  }
  dst[loc.y] = static_cast<U>(src[loc.x]);
}

// 1D strided-to-strided copy with additional element offsets for source and
// destination that are read from buffers 6 and 7.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_gg_dynamic_nd1(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t& src_stride [[buffer(3)]],
    constant const int64_t& dst_stride [[buffer(4)]],
    constant const int64_t& src_offset [[buffer(6)]],
    constant const int64_t& dst_offset [[buffer(7)]],
    uint index [[thread_position_in_grid]]) {
  auto in_pos = elem_to_loc_1<IdxT>(index, src_stride) + src_offset;
  auto out_pos = elem_to_loc_1<IdxT>(index, dst_stride) + dst_offset;
  dst[out_pos] = src[in_pos];
}

// 2D strided-to-strided copy with additional element offsets for source and
// destination that are read from buffers 6 and 7.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_gg_dynamic_nd2(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    constant const int64_t& src_offset [[buffer(6)]],
    constant const int64_t& dst_offset [[buffer(7)]],
    uint2 index [[thread_position_in_grid]]) {
  auto in_pos = elem_to_loc_2<IdxT>(index, src_strides) + src_offset;
  auto out_pos = elem_to_loc_2<IdxT>(index, dst_strides) + dst_offset;
  dst[out_pos] = src[in_pos];
}

// 3D strided-to-strided copy with additional element offsets for source and
// destination that are read from buffers 6 and 7.
template <typename T, typename U, typename IdxT = int64_t>
[[kernel]] void copy_gg_dynamic_nd3(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    constant const int64_t& src_offset [[buffer(6)]],
    constant const int64_t& dst_offset [[buffer(7)]],
    uint3 index [[thread_position_in_grid]]) {
  auto in_pos = elem_to_loc_3<IdxT>(index, src_strides) + src_offset;
  auto out_pos = elem_to_loc_3<IdxT>(index, dst_strides) + dst_offset;
  dst[out_pos] = src[in_pos];
}

// General N-dimensional strided-to-strided copy with element offsets read
// from buffers 6 and 7. Both base pointers are advanced by their offset
// first; each thread then handles up to N elements along the innermost
// dimension, starting at N * index.x.
template <typename T, typename U, int N = 1, typename IdxT = int64_t>
[[kernel]] void copy_gg_dynamic(
    device const T* src [[buffer(0)]],
    device U* dst [[buffer(1)]],
    constant const int* src_shape [[buffer(2)]],
    constant const int64_t* src_strides [[buffer(3)]],
    constant const int64_t* dst_strides [[buffer(4)]],
    constant const int& ndim [[buffer(5)]],
    constant const int64_t& src_offset [[buffer(6)]],
    constant const int64_t& dst_offset [[buffer(7)]],
    uint3 index [[thread_position_in_grid]]) {
  src += src_offset;
  dst += dst_offset;
  auto loc = elem_to_loc_2_nd<IdxT>(
      {N * index.x, index.y, index.z},
      src_shape,
      src_strides,
      dst_strides,
      ndim);
  if (N != 1) {
    // Walk the innermost dimension, advancing both locations by their own
    // stride and stopping at the end of the row.
    IdxT in_step = src_strides[ndim - 1];
    IdxT out_step = dst_strides[ndim - 1];
    auto row_len = src_shape[ndim - 1];
    for (int k = 0; k < N && (int(N * index.x) + k) < row_len; ++k) {
      dst[loc.y] = src[loc.x];
      loc.x += in_step;
      loc.y += out_step;
    }
    return;
  }
  dst[loc.y] = src[loc.x];
}


================================================
FILE: mlx/backend/metal/kernels/copy.metal
================================================
// Copyright © 2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/copy.h"

// Instantiates copy_s and copy_v with their default N template argument,
// under the host names "sn_copy<tname>" and "vn_copy<tname>".
#define instantiate_copy_work_per_thread(tname, itype, otype)         \
  instantiate_kernel("sn_copy" #tname, copy_s, itype, otype)          \
  instantiate_kernel("vn_copy" #tname, copy_v, itype, otype)

// Instantiates the copy kernels emitted for every (itype, otype) pair:
//   s_/v_        - copy_s / copy_v with N = 1
//   s2_/v2_      - copy_s2 / copy_v2 with the default N
//   g1_/g2_/g3_  - copy_g_nd{1,2,3} with int index arithmetic
//   gn2_         - copy_g with N = 2 and int index arithmetic
//   g*large_     - the same general kernels with the default (int64_t) IdxT;
//                  the N-dimensional variant uses N = 4
#define instantiate_copy_base(tname, itype, otype)                    \
  instantiate_kernel("s_copy" #tname, copy_s, itype, otype, 1)        \
  instantiate_kernel("v_copy" #tname, copy_v, itype, otype, 1)        \
  instantiate_kernel("s2_copy" #tname, copy_s2, itype, otype)         \
  instantiate_kernel("v2_copy" #tname, copy_v2, itype, otype)         \
  instantiate_kernel("g1_copy" #tname, copy_g_nd1, itype, otype, int) \
  instantiate_kernel("g2_copy" #tname, copy_g_nd2, itype, otype, int) \
  instantiate_kernel("g3_copy" #tname, copy_g_nd3, itype, otype, int) \
  instantiate_kernel("gn2_copy" #tname, copy_g, itype, otype, 2, int) \
  instantiate_kernel("g1large_copy" #tname, copy_g_nd1, itype, otype) \
  instantiate_kernel("g2large_copy" #tname, copy_g_nd2, itype, otype) \
  instantiate_kernel("g3large_copy" #tname, copy_g_nd3, itype, otype) \
  instantiate_kernel("gn4large_copy" #tname, copy_g, itype, otype, 4)

// Base kernels plus the "sn_"/"vn_" work-per-thread variants for one
// (itype, otype) pair.
#define instantiate_copy_all(tname, itype, otype) \
  instantiate_copy_base(tname, itype, otype)      \
  instantiate_copy_work_per_thread(tname, itype, otype)

// Instantiates the strided-to-strided kernels (copy_gg*) and their
// offset-taking "dynamic" counterparts. These are only emitted with
// identical source and destination element types.
// Names without "large" use int index arithmetic; names with "large" use
// the default (int64_t) IdxT. The N-dimensional variants use N = 2 and
// N = 4 respectively.
#define instantiate_copy_same(tname, type)                                            \
  instantiate_kernel("gg1_copy" #tname, copy_gg_nd1, type, type, int)                 \
  instantiate_kernel("gg2_copy" #tname, copy_gg_nd2, type, type, int)                 \
  instantiate_kernel("gg3_copy" #tname, copy_gg_nd3, type, type, int)                 \
  instantiate_kernel("ggn2_copy" #tname, copy_gg, type, type, 2, int)                 \
  instantiate_kernel("gg1large_copy" #tname, copy_gg_nd1, type, type)                 \
  instantiate_kernel("gg2large_copy" #tname, copy_gg_nd2, type, type)                 \
  instantiate_kernel("gg3large_copy" #tname, copy_gg_nd3, type, type)                 \
  instantiate_kernel("ggn4large_copy" #tname, copy_gg, type, type, 4)                 \
  instantiate_kernel("gg1_dynamic_copy" #tname, copy_gg_dynamic_nd1, type, type, int) \
  instantiate_kernel("gg2_dynamic_copy" #tname, copy_gg_dynamic_nd2, type, type, int) \
  instantiate_kernel("gg3_dynamic_copy" #tname, copy_gg_dynamic_nd3, type, type, int) \
  instantiate_kernel("ggn2_dynamic_copy" #tname, copy_gg_dynamic, type, type, 2, int) \
  instantiate_kernel("gg1large_dynamic_copy" #tname, copy_gg_dynamic_nd1, type, type) \
  instantiate_kernel("gg2large_dynamic_copy" #tname, copy_gg_dynamic_nd2, type, type) \
  instantiate_kernel("gg3large_dynamic_copy" #tname, copy_gg_dynamic_nd3, type, type) \
  instantiate_kernel("ggn4large_dynamic_copy" #tname, copy_gg_dynamic, type, type, 4)

// For one input type, instantiates the same-type kernels and the converting
// kernels to every output type. The kernel name suffix is the input tag
// followed by the output tag (e.g. <itname>float32).
// Outputs uint64, int64 and complex64 use instantiate_copy_base, so no
// "sn_"/"vn_" work-per-thread variants are emitted for them.
#define instantiate_copy_itype(itname, itype)                \
  instantiate_copy_same(itname ##itname, itype)              \
  instantiate_copy_all(itname ##bool_, itype, bool)          \
  instantiate_copy_all(itname ##uint8, itype, uint8_t)       \
  instantiate_copy_all(itname ##uint16, itype, uint16_t)     \
  instantiate_copy_all(itname ##uint32, itype, uint32_t)     \
  instantiate_copy_base(itname ##uint64, itype, uint64_t)    \
  instantiate_copy_all(itname ##int8, itype, int8_t)         \
  instantiate_copy_all(itname ##int16, itype, int16_t)       \
  instantiate_copy_all(itname ##int32, itype, int32_t)       \
  instantiate_copy_base(itname ##int64, itype, int64_t)      \
  instantiate_copy_all(itname ##float16, itype, half)        \
  instantiate_copy_all(itname ##float32, itype, float)       \
  instantiate_copy_all(itname ##bfloat16, itype, bfloat16_t) \
  instantiate_copy_base(itname ##complex64, itype, complex64_t)

// Emit the full set of copy kernels for every supported input type.
instantiate_copy_itype(bool_, bool)
instantiate_copy_itype(uint8, uint8_t)
instantiate_copy_itype(uint16, uint16_t)
instantiate_copy_itype(uint32, uint32_t)
instantiate_copy_itype(uint64, uint64_t)
instantiate_copy_itype(int8, int8_t)
instantiate_copy_itype(int16, int16_t)
instantiate_copy_itype(int32, int32_t)
instantiate_copy_itype(int64, int64_t)
instantiate_copy_itype(float16, half)
instantiate_copy_itype(float32, float)
instantiate_copy_itype(bfloat16, bfloat16_t)
instantiate_copy_itype(complex64, complex64_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/defines.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

// MTL_CONST expands to the Metal `constant` address-space qualifier when this
// header is compiled as Metal source (or for the Metal JIT), and to nothing
// otherwise, so the constants below can be shared with non-Metal code.
#if defined __METAL__ || defined MLX_METAL_JIT
#define MTL_CONST constant
#else
#define MTL_CONST
#endif

// Compile-time tuning constants. Their consumers are not visible in this
// header; judging by the names they configure the reduce, softmax and RMS
// kernels -- NOTE(review): confirm against the kernels that use them.
static MTL_CONST constexpr int MAX_REDUCE_SPECIALIZED_DIMS = 4;
static MTL_CONST constexpr int REDUCE_N_READS = 4;
static MTL_CONST constexpr int REDUCE_N_WRITES = 4;
static MTL_CONST constexpr int SOFTMAX_N_READS = 4;
static MTL_CONST constexpr int RMS_N_READS = 4;
static MTL_CONST constexpr int RMS_LOOPED_LIMIT = 4096;

// Explicitly instantiate a templated kernel under a given host name.
//   name - string literal used as the [[host_name(...)]] of the instantiation
//   func - the kernel function template
//   ...  - template arguments forwarded to func
// e.g. instantiate_kernel("binary_int", binary, a, b) expands to
//   template [[host_name("binary_int")]] [[kernel]]
//       decltype(binary<a, b>) binary<a, b>;
#define instantiate_kernel(name, func, ...) \
  template [[host_name(                     \
      name)]] [[kernel]] decltype(func<__VA_ARGS__>) func<__VA_ARGS__>;


================================================
FILE: mlx/backend/metal/kernels/erf.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once
#include <metal_math>
#include "mlx/backend/metal/kernels/expm1f.h"

/*
 * Single-precision approximation to the error function erf(a).
 * Two polynomial fits are used, selected by |a| against 0.927734375; the
 * large-argument branch goes through expm1f and restores the sign of a.
 * Based on code from:
 * https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff#answer-35148199
 */
float erf(float a) {
  const float mag = metal::abs(a);
  const float sq = a * a;
  if (mag > 0.927734375f) {
    // maximum error 0.99527 ulp
    const float top = metal::fma(
        -1.72853470e-5f, mag, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
    const float bottom = metal::fma(
        -3.88396438e-3f, mag, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
    float acc = metal::fma(top, sq, bottom);
    acc = metal::fma(acc, mag, -1.06777877e-1f); // -0x1.b55cb8p-4
    acc = metal::fma(acc, mag, -6.34846687e-1f); // -0x1.450aa0p-1
    acc = metal::fma(acc, mag, -1.28717512e-1f); // -0x1.079d0cp-3
    acc = metal::fma(acc, mag, -mag);
    // 1 - exp(acc), computed through expm1f, then given the sign of a.
    acc = -expm1f(acc);
    return metal::copysign(acc, a);
  }
  // maximum error 0.98929 ulp
  float acc = -5.96761703e-4f; // -0x1.38e000p-11
  acc = metal::fma(acc, sq, 4.99119423e-3f); //  0x1.471a58p-8
  acc = metal::fma(acc, sq, -2.67681349e-2f); // -0x1.b691b2p-6
  acc = metal::fma(acc, sq, 1.12819925e-1f); //  0x1.ce1c44p-4
  acc = metal::fma(acc, sq, -3.76125336e-1f); // -0x1.812700p-2
  acc = metal::fma(acc, sq, 1.28379166e-1f); //  0x1.06eba8p-3
  return metal::fma(acc, a, a);
}

// Single-precision approximation to the inverse error function.
// Evaluates a polynomial in w = log(1 - a*a), choosing one of two coefficient
// sets by |w| against 6.125, and returns a times that polynomial.
float erfinv(float a) {
  float w = metal::fma(a, 0.0f - a, 1.0f);
  w = metal::log(w);
  float poly;
  if (metal::abs(w) > 6.125f) { // maximum ulp error = 2.35793
    poly = 3.03697567e-10f; //  0x1.4deb44p-32
    poly = metal::fma(poly, w, 2.93243101e-8f); //  0x1.f7c9aep-26
    poly = metal::fma(poly, w, 1.22150334e-6f); //  0x1.47e512p-20
    poly = metal::fma(poly, w, 2.84108955e-5f); //  0x1.dca7dep-16
    poly = metal::fma(poly, w, 3.93552968e-4f); //  0x1.9cab92p-12
    poly = metal::fma(poly, w, 3.02698812e-3f); //  0x1.8cc0dep-9
    poly = metal::fma(poly, w, 4.83185798e-3f); //  0x1.3ca920p-8
    poly = metal::fma(poly, w, -2.64646143e-1f); // -0x1.0eff66p-2
    poly = metal::fma(poly, w, 8.40016484e-1f); //  0x1.ae16a4p-1
  } else { // maximum ulp error = 2.35002
    poly = 5.43877832e-9f; //  0x1.75c000p-28
    poly = metal::fma(poly, w, 1.43285448e-7f); //  0x1.33b402p-23
    poly = metal::fma(poly, w, 1.22774793e-6f); //  0x1.499232p-20
    poly = metal::fma(poly, w, 1.12963626e-7f); //  0x1.e52cd2p-24
    poly = metal::fma(poly, w, -5.61530760e-5f); // -0x1.d70bd0p-15
    poly = metal::fma(poly, w, -1.47697632e-4f); // -0x1.35be90p-13
    poly = metal::fma(poly, w, 2.31468678e-3f); //  0x1.2f6400p-9
    poly = metal::fma(poly, w, 1.15392581e-2f); //  0x1.7a1e50p-7
    poly = metal::fma(poly, w, -2.32015476e-1f); // -0x1.db2aeep-3
    poly = metal::fma(poly, w, 8.86226892e-1f); //  0x1.c5bf88p-1
  }
  return a * poly;
}


================================================
FILE: mlx/backend/metal/kernels/expm1f.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <metal_math>

// Original license copied below:
//  Copyright (c) 2015-2023 Norbert Juffa
//  All rights reserved.
//
//  Redistribution and use in source and binary forms, with or without
//  modification, are permitted provided that the following conditions
//  are met:
//
//  1. Redistributions of source code must retain the above copyright
//     notice, this list of conditions and the following disclaimer.
//
//  2. Redistributions in binary form must reproduce the above copyright
//     notice, this list of conditions and the following disclaimer in the
//     documentation and/or other materials provided with the distribution.
//
//  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
//  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
//  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
//  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
//  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
//  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
//  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
//  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
//  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
//  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
//  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/* Compute exponential base e minus 1. Maximum ulp error = 0.997458

   i = rint(a/log(2)), f = a-i*log(2). Then expm1(a) = 2**i * (expm1(f)+1) - 1.
   Compute r = expm1(f). Then expm1(a)= 2 * (0.5 * 2**i * r + 0.5 * 2**i - 0.5).
   With t = 0.5*2**i, expm1(a) = 2*(r * t + t-0.5). However, for best accuracy,
   when i == 1, expm1(a)= 2*(r + 0.5), and when i == 0, expm1(a) = r.

   NOTE: Scale factor b is only applied if i < 0 or i > 1 (should be power of 2)

   No range checking is done here; expm1f() handles arguments for which the
   scaling by 2**i would overflow or underflow.
*/
float expm1f_scaled_unchecked(float a, float b) {
  float f, j, r, s, t, u, v, x, y;
  int i;

  // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
  // Adding and subtracting 0x1.8p23 rounds a/log(2) to the nearest integer.
  j = fma(1.442695f, a, 12582912.f); // 0x1.715476p0, 0x1.8p23
  j = j - 12582912.0f; // 0x1.8p23
  i = (int)j;
  // Two-constant argument reduction: log(2) is split into a high part with
  // trailing zero bits (so j * log_2_hi is exact) and a low correction.
  // Dropping the low part leaves an error of j * 1.43e-6 in f, which is far
  // above the sub-ulp bound stated above.
  f = fma(j, -6.93145752e-1f, a); // -0x1.62e400p-1, log_2_hi
  f = fma(j, -1.42860677e-6f, f); // -0x1.7f7d1cp-20, log_2_lo

  // approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2]
  s = f * f;
  if (a == 0.0f)
    s = a; // ensure -0 is passed through
  // err = 0.997458  ulp1 = 11081805
  r = 1.97350979e-4f; // 0x1.9de000p-13
  r = fma(r, f, 1.39309070e-3f); // 0x1.6d30bcp-10
  r = fma(r, f, 8.33343994e-3f); // 0x1.1111f6p-7
  r = fma(r, f, 4.16668020e-2f); // 0x1.55559ep-5
  r = fma(r, f, 1.66666716e-1f); // 0x1.55555cp-3
  r = fma(r, f, 4.99999970e-1f); // 0x1.fffffep-2
  // For j == 1 the +0.5 of the special case 2*(r + 0.5) is folded in here.
  u = (j == 1) ? (f + 0.5f) : f;
  v = fma(r, s, u);
  // General case: 2 * (v * t + (t - s)) with t = s * 2**i and s = 0.5 * b.
  s = 0.5f * b;
  t = ldexp(s, i);
  y = t - s;
  x = (t - y) - s; // double-float canonicalization of difference
  r = fma(v, t, x) + y;
  r = r + r;
  // Special cases that skip the scaling (and the factor b) for accuracy.
  if (j == 0)
    r = v;
  if (j == 1)
    r = v + v;
  return r;
}

/* Compute exponential base e minus 1. max ulp err = 0.99746

   Arguments with |a - 1| <= 88 go through expm1f_scaled_unchecked. Outside
   that range (severe overflow or underflow) the result is pow(2, a) squared
   minus one instead.
*/
float expm1f(float a) {
  const bool in_range = !(abs(a - 1.0f) > 88.0f);
  if (in_range) {
    return expm1f_scaled_unchecked(a, 1.0f);
  }
  /* handle severe overflow and underflow */
  const float p = pow(2, a);
  return fma(p, p, -1.0f);
}


================================================
FILE: mlx/backend/metal/kernels/fence.metal
================================================
// Copyright © 2024 Apple Inc.

#pragma METAL internals : enable

// Provide a fallback value for the system memory-scope constant when the
// toolchain does not define it.
#ifndef __METAL_MEMORY_SCOPE_SYSTEM__
#define __METAL_MEMORY_SCOPE_SYSTEM__ 3
#endif
// Declare metal::thread_scope_system, built from the constant above, so it
// can be passed to atomic_thread_fence in the kernels below.
namespace metal {
constexpr constant metal::thread_scope thread_scope_system =
    static_cast<thread_scope>(__METAL_MEMORY_SCOPE_SYSTEM__);
}

#include <metal_atomic>

// Reads each in-range element of `input` through a system-coherent volatile
// pointer and writes the same value back, then issues a sequentially
// consistent, system-scope fence on device memory.
[[kernel]] void input_coherent(
    volatile coherent(system) device uint* input [[buffer(0)]],
    const constant uint& size [[buffer(1)]],
    uint index [[thread_position_in_grid]]) {
  if (index < size) {
    const uint current = input[index];
    input[index] = current;
  }
  // Every thread fences, including those past `size`.
  metal::atomic_thread_fence(
      metal::mem_flags::mem_device,
      metal::memory_order_seq_cst,
      metal::thread_scope_system);
}

// Single-thread kernel: stores `value` into the timestamp slot, then issues
// a sequentially consistent, system-scope fence on device memory.
[[kernel]] void fence_update(
    volatile coherent(system) device uint* timestamp [[buffer(0)]],
    constant uint& value [[buffer(1)]]) {
  *timestamp = value;
  metal::atomic_thread_fence(
      metal::mem_flags::mem_device,
      metal::memory_order_seq_cst,
      metal::thread_scope_system);
}

// Single-thread kernel: spins until the timestamp slot holds at least
// `value`. A system-scope fence is issued before every read of the slot.
[[kernel]] void fence_wait(
    volatile coherent(system) device uint* timestamp [[buffer(0)]],
    constant uint& value [[buffer(1)]]) {
  do {
    metal::atomic_thread_fence(
        metal::mem_flags::mem_device,
        metal::memory_order_seq_cst,
        metal::thread_scope_system);
  } while (timestamp[0] < value);
}


================================================
FILE: mlx/backend/metal/kernels/fft/radix.h
================================================
// Copyright © 2024 Apple Inc.

/* Radix kernels

We provide optimized, single threaded Radix codelets
for n=2,3,4,5,6,7,8,10,11,12,13.

For n=2,3,4,5,6 we hand write the codelets.
For n=8,10,12 we combine smaller codelets.
For n=7,11,13 we use Rader's algorithm which decomposes
them into (n-1)=6,10,12 codelets. */

#pragma once

#include <metal_common>
#include <metal_math>
#include <metal_stdlib>

// Complex product a * b, with .x holding the real part and .y the imaginary.
METAL_FUNC float2 complex_mul(float2 a, float2 b) {
  float re = a.x * b.x - a.y * b.y;
  float im = a.x * b.y + a.y * b.x;
  return float2(re, im);
}

// Conjugate of the complex product: returns conj(a * b).
METAL_FUNC float2 complex_mul_conj(float2 a, float2 b) {
  float re = a.x * b.x - a.y * b.y;
  float neg_im = -a.x * b.y - a.y * b.x;
  return float2(re, neg_im);
}

// FFT twiddle factor exp(-2*pi*i*k/p), returned as (cos, sin) of the angle
// and evaluated with the metal::fast trigonometric functions.
METAL_FUNC float2 get_twiddle(int k, int p) {
  float angle = -2.0f * k * M_PI_F / p;
  return float2(metal::fast::cos(angle), metal::fast::sin(angle));
}

// 2-point butterfly: y[0] = x[0] + x[1], y[1] = x[0] - x[1].
METAL_FUNC void radix2(thread float2* x, thread float2* y) {
  float2 sum = x[0] + x[1];
  y[0] = sum;
  y[1] = x[0] - x[1];
}

// 3-point codelet: reads x[0..2] and writes y[0..2].
METAL_FUNC void radix3(thread float2* x, thread float2* y) {
  // Negative sine of 2*pi/3.
  float neg_sin = -0.8660254037844387;

  float2 pair_sum = x[1] + x[2];
  float2 pair_diff = x[1] - x[2];

  // Bin 0 is the plain sum of the inputs.
  y[0] = x[0] + pair_sum;
  float2 center = x[0] - 0.5 * pair_sum;
  float2 scaled = neg_sin * pair_diff;

  // Multiply `scaled` by i: (re, im) -> (-im, re).
  float2 rotated = {-scaled.y, scaled.x};
  y[1] = center + rotated;
  y[2] = center - rotated;
}

// 4-point codelet: reads x[0..3] and writes y[0..3].
METAL_FUNC void radix4(thread float2* x, thread float2* y) {
  float2 even_sum = x[0] + x[2];
  float2 even_diff = x[0] - x[2];
  float2 odd_sum = x[1] + x[3];
  float2 odd_diff = x[1] - x[3];
  // Multiply the odd difference by -i: (re, im) -> (im, -re).
  float2 odd_diff_rot = {odd_diff.y, -odd_diff.x};

  y[0] = even_sum + odd_sum;
  y[1] = even_diff + odd_diff_rot;
  y[2] = even_sum - odd_sum;
  y[3] = even_diff - odd_diff_rot;
}

// 5-point codelet: reads x[0..4] and writes y[0..4].
// Inputs are combined as symmetric pairs (x[1], x[4]) and (x[2], x[3]); the
// pair sums feed the cosine terms and the pair differences the sine terms.
METAL_FUNC void radix5(thread float2* x, thread float2* y) {
  // Scalar constants broadcast to both lanes: sqrt(5)/4, sin(2*pi/5),
  // sin(pi/5).
  float2 root_5_4 = 0.5590169943749475;
  float2 sin_2pi_5 = 0.9510565162951535;
  float2 sin_1pi_5 = 0.5877852522924731;

  float2 a_1 = x[1] + x[4];
  float2 a_2 = x[2] + x[3];
  float2 a_3 = x[1] - x[4];
  float2 a_4 = x[2] - x[3];

  float2 a_5 = a_1 + a_2;
  float2 a_6 = root_5_4 * (a_1 - a_2);
  float2 a_7 = x[0] - a_5 / 4;
  float2 a_8 = a_7 + a_6;
  float2 a_9 = a_7 - a_6;
  float2 a_10 = sin_2pi_5 * a_3 + sin_1pi_5 * a_4;
  float2 a_11 = sin_1pi_5 * a_3 - sin_2pi_5 * a_4;
  // Multiply by -i: (re, im) -> (im, -re).
  float2 a_10_j = {a_10.y, -a_10.x};
  float2 a_11_j = {a_11.y, -a_11.x};

  // Bin 0 is the plain sum; bins k and 5-k differ only in the sign of the
  // rotated sine term.
  y[0] = x[0] + a_5;
  y[1] = a_8 + a_10_j;
  y[2] = a_9 + a_11_j;
  y[3] = a_9 - a_11_j;
  y[4] = a_8 - a_10_j;
}

// 6-point codelet: reads x[0..5] and writes y[0..5].
// The inputs are processed as two groups of three, (x[0], x[2], x[4]) and
// (x[3], x[5], x[1]), whose partial results are then combined by sums and
// differences.
METAL_FUNC void radix6(thread float2* x, thread float2* y) {
  float sin_pi_3 = 0.8660254037844387;
  // First group: x[0], x[2], x[4].
  float2 a_1 = x[2] + x[4];
  float2 a_2 = x[0] - a_1 / 2;
  float2 a_3 = sin_pi_3 * (x[2] - x[4]);
  // Second group: x[3], x[5], x[1].
  float2 a_4 = x[5] + x[1];
  float2 a_5 = x[3] - a_4 / 2;
  float2 a_6 = sin_pi_3 * (x[5] - x[1]);
  float2 a_7 = x[0] + a_1;

  // Multiply by -i: (re, im) -> (im, -re).
  float2 a_3_i = {a_3.y, -a_3.x};
  float2 a_6_i = {a_6.y, -a_6.x};
  float2 a_8 = a_2 + a_3_i;
  float2 a_9 = a_2 - a_3_i;
  float2 a_10 = x[3] + a_4;
  float2 a_11 = a_5 + a_6_i;
  float2 a_12 = a_5 - a_6_i;

  // Combine the two groups; bin 0 is the plain sum of all six inputs.
  y[0] = a_7 + a_10;
  y[1] = a_8 - a_11;
  y[2] = a_9 + a_12;
  y[3] = a_7 - a_10;
  y[4] = a_8 + a_11;
  y[5] = a_9 - a_12;
}

// 7-point codelet built from two radix6 calls (Rader's algorithm).
// Reads x[0..6] and writes y[0..6]. x[1..6] is overwritten with intermediate
// results; only x[0] is left intact.
METAL_FUNC void radix7(thread float2* x, thread float2* y) {
  // Rader's algorithm
  // 1/6 scale for the second radix6; the negated second lane also flips the
  // sign of the imaginary part.
  float2 inv = {1 / 6.0, -1 / 6.0};

  // fft
  // Permuted copy of x[1..6] fed to the first 6-point transform.
  float2 in1[6] = {x[1], x[3], x[2], x[6], x[4], x[5]};
  radix6(in1, y + 1);

  // y[1] is bin 0 of the 6-point transform, i.e. x[1] + ... + x[6], so this
  // is the sum of all seven inputs.
  y[0] = y[1] + x[0];

  // b_q
  // Multiply by the precomputed constants and conjugate.
  y[1] = complex_mul_conj(y[1], float2(-1, 0));
  y[2] = complex_mul_conj(y[2], float2(2.44013336, -1.02261879));
  y[3] = complex_mul_conj(y[3], float2(2.37046941, -1.17510629));
  y[4] = complex_mul_conj(y[4], float2(0, -2.64575131));
  y[5] = complex_mul_conj(y[5], float2(2.37046941, 1.17510629));
  y[6] = complex_mul_conj(y[6], float2(-2.44013336, -1.02261879));

  // ifft
  // Second radix6 on the conjugated data, written into x[1..6]; the final
  // conjugation and 1/6 scale are applied through `inv` below.
  radix6(y + 1, x + 1);

  // Scale, add x[0], and scatter to the permuted output positions.
  y[1] = x[1] * inv + x[0];
  y[5] = x[2] * inv + x[0];
  y[4] = x[3] * inv + x[0];
  y[6] = x[4] * inv + x[0];
  y[2] = x[5] * inv + x[0];
  y[3] = x[6] * inv + x[0];
}

// 8-point codelet built from two radix4 calls on the even- and odd-indexed
// inputs. Reads x[0..7] and writes y[0..7]; x is overwritten with the two
// 4-point results along the way.
METAL_FUNC void radix8(thread float2* x, thread float2* y) {
  float c = 0.7071067811865476;
  // Twiddles for the odd-half bins 1 and 3: (c, -c) and (-c, -c).
  float2 tw_1 = {c, -c};
  float2 tw_3 = {-c, -c};
  // Even-indexed inputs first, then odd-indexed inputs.
  float2 split[8] = {x[0], x[2], x[4], x[6], x[1], x[3], x[5], x[7]};
  radix4(split, x);
  radix4(split + 4, x + 4);

  // Bin 0 / 4: the odd half needs no twiddle.
  y[0] = x[0] + x[4];
  y[4] = x[0] - x[4];
  // Bin 1 / 5.
  float2 odd_1 = complex_mul(x[5], tw_1);
  y[1] = x[1] + odd_1;
  y[5] = x[1] - odd_1;
  // Bin 2 / 6: the twiddle is -i, applied as (re, im) -> (im, -re).
  float2 odd_2 = {x[6].y, -x[6].x};
  y[2] = x[2] + odd_2;
  y[6] = x[2] - odd_2;
  // Bin 3 / 7.
  float2 odd_3 = complex_mul(x[7], tw_3);
  y[3] = x[3] + odd_3;
  y[7] = x[3] - odd_3;
}

// 10-point codelet built from two radix5 calls.
// Reads x[0..9] and writes y[0..9]; x is overwritten with the two 5-point
// results along the way.
// raders_perm selects how the inputs are gathered before the 5-point
// transforms: false uses the plain even/odd split, true uses the permuted
// order that radix11 requests.
template <bool raders_perm>
METAL_FUNC void radix10(thread float2* x, thread float2* y) {
  // Twiddles w[k] = exp(-2*pi*i*(k+1)/10) for k = 0..3; the last two are
  // obtained from the first two by negating the real part.
  float2 w[4];
  w[0] = {0.8090169943749475, -0.5877852522924731};
  w[1] = {0.30901699437494745, -0.9510565162951535};
  w[2] = {-w[1].x, w[1].y};
  w[3] = {-w[0].x, w[0].y};

  if (raders_perm) {
    float2 temp[10] = {
        x[0], x[3], x[4], x[8], x[2], x[1], x[7], x[9], x[6], x[5]};
    radix5(temp, x);
    radix5(temp + 5, x + 5);
  } else {
    float2 temp[10] = {
        x[0], x[2], x[4], x[6], x[8], x[1], x[3], x[5], x[7], x[9]};
    radix5(temp, x);
    radix5(temp + 5, x + 5);
  }

  // Combine the two halves; the t == 0 pair needs no twiddle.
  y[0] = x[0] + x[5];
  y[5] = x[0] - x[5];
  for (int t = 1; t < 5; t++) {
    float2 a = complex_mul(x[t + 5], w[t - 1]);
    y[t] = x[t] + a;
    y[t + 5] = x[t] - a;
  }
}

// 11-point codelet built from two radix10 calls (Rader's algorithm).
// Reads x[0..10] and writes y[0..10]. x[1..10] is overwritten with
// intermediate results; only x[0] is left intact.
METAL_FUNC void radix11(thread float2* x, thread float2* y) {
  // Raders Algorithm
  // 1/10 scale for the second radix10; the negated second lane also flips
  // the sign of the imaginary part.
  float2 inv = {1 / 10.0, -1 / 10.0};

  // fft
  // The input permutation is applied inside radix10<true>.
  radix10<true>(x + 1, y + 1);

  // y[1] is bin 0 of the 10-point transform, i.e. x[1] + ... + x[10], so
  // this is the sum of all eleven inputs.
  y[0] = y[1] + x[0];

  // b_q
  // Multiply by the precomputed constants and conjugate.
  y[1] = complex_mul_conj(y[1], float2(-1, 0));
  y[2] = complex_mul_conj(y[2], float2(0.955301878, -3.17606649));
  y[3] = complex_mul_conj(y[3], float2(2.63610556, 2.01269656));
  y[4] = complex_mul_conj(y[4], float2(2.54127802, 2.13117479));
  y[5] = complex_mul_conj(y[5], float2(2.07016210, 2.59122150));
  y[6] = complex_mul_conj(y[6], float2(0, -3.31662479));
  y[7] = complex_mul_conj(y[7], float2(2.07016210, -2.59122150));
  y[8] = complex_mul_conj(y[8], float2(-2.54127802, 2.13117479));
  y[9] = complex_mul_conj(y[9], float2(2.63610556, -2.01269656));
  y[10] = complex_mul_conj(y[10], float2(-0.955301878, -3.17606649));

  // ifft
  // Second radix10 on the conjugated data, written into x[1..10]; the final
  // conjugation and 1/10 scale are applied through `inv` below.
  radix10<false>(y + 1, x + 1);

  // Scale, add x[0], and scatter to the permuted output positions.
  y[1] = x[1] * inv + x[0];
  y[6] = x[2] * inv + x[0];
  y[3] = x[3] * inv + x[0];
  y[7] = x[4] * inv + x[0];
  y[9] = x[5] * inv + x[0];
  y[10] = x[6] * inv + x[0];
  y[5] = x[7] * inv + x[0];
  y[8] = x[8] * inv + x[0];
  y[4] = x[9] * inv + x[0];
  y[2] = x[10] * inv + x[0];
}

template <bool raders_perm>
METAL_FUNC void radix12(thread float2* x, thread float2* y) {
  // 12-point DFT codelet: two radix6 codelets followed by a radix-2 combine.
  // `x` doubles as scratch for the radix6 outputs and is overwritten; the
  // result lands in `y`.
  //
  // With `raders_perm` the inputs are gathered through a fixed permutation
  // (radix13 uses this variant); otherwise the usual even/odd split is used.

  // Twiddles exp(-2*pi*i*t/12), t = 1..5. Only the first five slots are used.
  float2 w[6];
  float sin_pi_3 = 0.8660254037844387;
  w[0] = {sin_pi_3, -0.5};
  w[1] = {0.5, -sin_pi_3};
  w[2] = {0, -1};
  w[3] = {-0.5, -sin_pi_3};
  w[4] = {-sin_pi_3, -0.5};

  if (raders_perm) {
    float2 gathered[12] = {
        x[0], x[3], x[2], x[11], x[8], x[9],
        x[1], x[7], x[5], x[10], x[4], x[6]};
    radix6(gathered, x);
    radix6(gathered + 6, x + 6);
  } else {
    // Even-indexed inputs first, then odd-indexed inputs.
    float2 gathered[12] = {
        x[0], x[2], x[4], x[6], x[8], x[10],
        x[1], x[3], x[5], x[7], x[9], x[11]};
    radix6(gathered, x);
    radix6(gathered + 6, x + 6);
  }

  // Bin 0 / bin 6 need no twiddle.
  y[0] = x[0] + x[6];
  y[6] = x[0] - x[6];
  for (int k = 1; k < 6; k++) {
    float2 rotated = complex_mul(x[k + 6], w[k - 1]);
    y[k] = x[k] + rotated;
    y[k + 6] = x[k] - rotated;
  }
}

METAL_FUNC void radix13(thread float2* x, thread float2* y) {
  // 13-point DFT codelet using Rader's algorithm: a permuted 12-point
  // transform, a pointwise product with precomputed constants, a second
  // 12-point transform, and a final output permutation.
  // `x[1..12]` is used as scratch and is overwritten; `x[0]` is only read.

  // x[0] is untouched by the sub-transforms, so read it once up front.
  const float2 x_first = x[0];
  // Component-wise scale: divides by 12 and negates the imaginary part.
  float2 scale = {1 / 12.0, -1 / 12.0};

  // fft of the permuted x[1..12] into y[1..12]
  radix12<true>(x + 1, y + 1);

  // y[1] is bin 0 of that transform; adding x[0] gives output bin 0.
  y[0] = y[1] + x_first;

  // Pointwise product with the b_q constants
  y[1] = complex_mul_conj(y[1], float2(-1, 0));
  y[2] = complex_mul_conj(y[2], float2(3.07497206, -1.88269669));
  y[3] = complex_mul_conj(y[3], float2(3.09912468, 1.84266823));
  y[4] = complex_mul_conj(y[4], float2(3.45084438, -1.04483161));
  y[5] = complex_mul_conj(y[5], float2(0.91083583, 3.48860690));
  y[6] = complex_mul_conj(y[6], float2(-3.60286363, 0.139189267));
  y[7] = complex_mul_conj(y[7], float2(3.60555128, 0));
  y[8] = complex_mul_conj(y[8], float2(3.60286363, 0.139189267));
  y[9] = complex_mul_conj(y[9], float2(0.91083583, -3.48860690));
  y[10] = complex_mul_conj(y[10], float2(-3.45084438, -1.04483161));
  y[11] = complex_mul_conj(y[11], float2(3.09912468, -1.84266823));
  y[12] = complex_mul_conj(y[12], float2(-3.07497206, -1.88269669));

  // ifft: second transform back into x[1..12]
  radix12<false>(y + 1, x + 1);

  // Scale, add x[0] and scatter to the final positions (listed by output).
  y[1] = x[1] * scale + x_first;
  y[2] = x[12] * scale + x_first;
  y[3] = x[9] * scale + x_first;
  y[4] = x[11] * scale + x_first;
  y[5] = x[4] * scale + x_first;
  y[6] = x[8] * scale + x_first;
  y[7] = x[2] * scale + x_first;
  y[8] = x[10] * scale + x_first;
  y[9] = x[5] * scale + x_first;
  y[10] = x[3] * scale + x_first;
  y[11] = x[6] * scale + x_first;
  y[12] = x[7] * scale + x_first;
}

================================================
FILE: mlx/backend/metal/kernels/fft/readwrite.h
================================================
// Copyright © 2024 Apple Inc.

#include <metal_common>

#include "mlx/backend/metal/kernels/fft/radix.h"

/* FFT helpers for reading and writing from/to device memory.

For many sizes, GPU FFTs are memory bandwidth bound so
read/write performance is important.

Where possible, we read 128 bits sequentially in each thread,
coalesced with accesses from adjacent threads for optimal performance.

We implement specialized reading/writing for:
  - FFT
  - RFFT
  - IRFFT

Each with support for:
  - Contiguous reads
  - Padded reads
  - Strided reads
*/

// Largest radix codelet size; presumably matches radix13 in radix.h.
#define MAX_RADIX 13

using namespace metal;

// Moves FFT data between device memory (`in` / `out`) and the threadgroup
// scratch buffer (`buf`).
//
// Template parameters:
//   in_T / out_T:   element types of the device input / output buffers.
//   step:           selects the four-step FFT pass (see the specializations
//                   of load_strided / write_strided below the struct).
//   four_step_real: selects the real-valued four-step specializations.
//
// The generic member functions below handle the complex -> complex case;
// RFFT / IRFFT behaviour is provided by explicit specializations.
template <
    typename in_T,
    typename out_T,
    int step = 0,
    bool four_step_real = false>
struct ReadWriter {
  // Device input, threadgroup scratch and device output.
  const device in_T* in;
  threadgroup float2* buf;
  device out_T* out;
  // Length of each transform held in `buf`.
  int n;
  // Total number of sequences to transform.
  int batch_size;
  // Number of elements each thread reads / writes.
  int elems_per_thread;
  // Thread position and grid extent as passed in by the kernel.
  uint3 elem;
  uint3 grid;
  // Number of active threads in this threadgroup (set in the constructor).
  int threads_per_tg;
  // True when computing an inverse transform.
  bool inv;

  // Used for strided access
  int strided_device_idx = 0;
  int strided_shared_idx = 0;

  METAL_FUNC ReadWriter(
      const device in_T* in_,
      threadgroup float2* buf_,
      device out_T* out_,
      const short n_,
      const int batch_size_,
      const short elems_per_thread_,
      const uint3 elem_,
      const uint3 grid_,
      const bool inv_)
      : in(in_),
        buf(buf_),
        out(out_),
        n(n_),
        batch_size(batch_size_),
        elems_per_thread(elems_per_thread_),
        elem(elem_),
        grid(grid_),
        inv(inv_) {
    // Account for padding on last threadgroup: when elem.x is the last x
    // position only the remaining sequences (batch_size minus those covered
    // by earlier x positions) contribute threads.
    threads_per_tg = elem.x == grid.x - 1
        ? (batch_size - (grid.x - 1) * grid.y) * grid.z
        : grid.y * grid.z;
  }

  // ifft(x) = 1/n * conj(fft(conj(x)))
  // post_in applies the inner conjugate on load when `inv` is set.
  METAL_FUNC float2 post_in(float2 elem) const {
    return inv ? float2(elem.x, -elem.y) : elem;
  }

  // Handle float case for generic RFFT alg: promote to complex with a zero
  // imaginary part.
  METAL_FUNC float2 post_in(float elem) const {
    return float2(elem, 0);
  }

  // Applies the outer conjugate and the 1/n scale on store when `inv` is set.
  METAL_FUNC float2 pre_out(float2 elem) const {
    return inv ? float2(elem.x / n, -elem.y / n) : elem;
  }

  // Same as above but scales by an explicit `length` instead of `n`.
  METAL_FUNC float2 pre_out(float2 elem, int length) const {
    return inv ? float2(elem.x / length, -elem.y / length) : elem;
  }

  // True when this thread's sequence index lies past the end of the batch.
  METAL_FUNC bool out_of_bounds() const {
    // Account for possible extra threadgroups
    int grid_index = elem.x * grid.y + elem.y;
    return grid_index >= batch_size;
  }

  // Contiguous load of this threadgroup's sequences from `in` into `buf`.
  // Each thread reads pairs of adjacent elements; indices are clamped to the
  // end of the threadgroup's range rather than branching.
  METAL_FUNC void load() const {
    size_t batch_idx = size_t(elem.x * grid.y) * n;
    short tg_idx = elem.y * grid.z + elem.z;
    // Last index at which a full pair (index, index + 1) still fits.
    short max_index = grid.y * n - 2;

    // 2 complex64s = 128 bits
    constexpr int read_width = 2;
    for (short e = 0; e < (elems_per_thread / read_width); e++) {
      short index = read_width * tg_idx + read_width * threads_per_tg * e;
      index = metal::min(index, max_index);
      // vectorized reads
      buf[index] = post_in(in[batch_idx + index]);
      buf[index + 1] = post_in(in[batch_idx + index + 1]);
    }
    // Single-element accesses may reach the very last element.
    max_index += 1;
    // Odd elems_per_thread: one leftover element per thread, placed after
    // all of the paired reads.
    if (elems_per_thread % 2 != 0) {
      short index = tg_idx +
          read_width * threads_per_tg * (elems_per_thread / read_width);
      index = metal::min(index, max_index);
      buf[index] = post_in(in[batch_idx + index]);
    }
  }

  // Contiguous store of this threadgroup's sequences from `buf` to `out`,
  // mirroring the access pattern of load().
  METAL_FUNC void write() const {
    size_t batch_idx = size_t(elem.x * grid.y) * n;
    short tg_idx = elem.y * grid.z + elem.z;
    short max_index = grid.y * n - 2;

    constexpr int read_width = 2;
    for (short e = 0; e < (elems_per_thread / read_width); e++) {
      short index = read_width * tg_idx + read_width * threads_per_tg * e;
      index = metal::min(index, max_index);
      // vectorized writes
      out[batch_idx + index] = pre_out(buf[index]);
      out[batch_idx + index + 1] = pre_out(buf[index + 1]);
    }
    max_index += 1;
    if (elems_per_thread % 2 != 0) {
      short index = tg_idx +
          read_width * threads_per_tg * (elems_per_thread / read_width);
      index = metal::min(index, max_index);
      out[batch_idx + index] = pre_out(buf[index]);
    }
  }

  // Padded IO for Bluestein's algorithm
  // Loads `length` input elements multiplied by w_k and zero-fills the rest
  // of the n-element threadgroup sequence.
  METAL_FUNC void load_padded(int length, const device float2* w_k) const {
    size_t batch_idx = size_t(elem.x * grid.y) * length + elem.y * length;
    int fft_idx = elem.z;
    int m = grid.z;

    threadgroup float2* seq_buf = buf + elem.y * n;
    for (int e = 0; e < elems_per_thread; e++) {
      int index = metal::min(fft_idx + e * m, n - 1);
      if (index < length) {
        float2 elem = post_in(in[batch_idx + index]);
        seq_buf[index] = complex_mul(elem, w_k[index]);
      } else {
        seq_buf[index] = 0.0;
      }
    }
  }

  // Writes the `length` result elements, which sit at offset length - 1 in
  // the threadgroup sequence, after scaling by inv_factor and w_k.
  METAL_FUNC void write_padded(int length, const device float2* w_k) const {
    size_t batch_idx = size_t(elem.x * grid.y) * length + elem.y * length;
    int fft_idx = elem.z;
    int m = grid.z;
    // Component-wise: divide by n and negate the imaginary part.
    float2 inv_factor = {1.0f / n, -1.0f / n};

    threadgroup float2* seq_buf = buf + elem.y * n;
    for (int e = 0; e < elems_per_thread; e++) {
      int index = metal::min(fft_idx + e * m, n - 1);
      if (index < length) {
        float2 elem = seq_buf[index + length - 1] * inv_factor;
        out[batch_idx + index] = pre_out(complex_mul(elem, w_k[index]), length);
      }
    }
  }

  // Strided IO for four step FFT
  // Computes and stores strided_device_idx / strided_shared_idx, the
  // starting device and threadgroup indices for this thread.
  METAL_FUNC void compute_strided_indices(int stride, int overall_n) {
    // Use the batch threadgroup dimension to coalesce memory accesses:
    // e.g. stride = 12
    // device      | shared mem
    // 0  1  2  3  |  0 12 - -
    // -  -  -  -  |  1 13 - -
    // -  -  -  -  |  2 14 - -
    // 12 13 14 15 |  3 15 - -
    int coalesce_width = grid.y;
    int tg_idx = elem.y * grid.z + elem.z;
    int outer_batch_size = stride / coalesce_width;

    int strided_batch_idx = (elem.x % outer_batch_size) * coalesce_width +
        overall_n * (elem.x / outer_batch_size);
    strided_device_idx = strided_batch_idx +
        tg_idx / coalesce_width * elems_per_thread * stride +
        tg_idx % coalesce_width;
    strided_shared_idx = (tg_idx % coalesce_width) * n +
        tg_idx / coalesce_width * elems_per_thread;
  }

  // Four Step FFT First Step
  // Gathers elements `stride` apart in device memory into consecutive
  // threadgroup slots.
  METAL_FUNC void load_strided(int stride, int overall_n) {
    compute_strided_indices(stride, overall_n);
    for (int e = 0; e < elems_per_thread; e++) {
      buf[strided_shared_idx + e] =
          post_in(in[strided_device_idx + e * stride]);
    }
  }

  // Scatters results back with the same stride, multiplying each by its
  // four-step twiddle. Relies on the indices computed by load_strided.
  METAL_FUNC void write_strided(int stride, int overall_n) {
    for (int e = 0; e < elems_per_thread; e++) {
      float2 output = buf[strided_shared_idx + e];
      int combined_idx = (strided_device_idx + e * stride) % overall_n;
      int ij = (combined_idx / stride) * (combined_idx % stride);
      // Apply four step twiddles at end of first step
      float2 twiddle = get_twiddle(ij, overall_n);
      out[strided_device_idx + e * stride] = complex_mul(output, twiddle);
    }
  }
};

// Four Step FFT Second Step
template <>
METAL_FUNC void ReadWriter<float2, float2, /*step=*/1>::load_strided(
    int stride,
    int overall_n) {
  // The second step reads contiguously, so neither argument is needed;
  // the casts only silence unused-parameter warnings.
  (void)overall_n;
  (void)stride;
  // The inversion must not be applied between the two steps: run the plain
  // load with `inv` cleared and restore it afterwards.
  const bool saved_inv = inv;
  inv = false;
  load();
  inv = saved_inv;
}

template <>
METAL_FUNC void ReadWriter<float2, float2, /*step=*/1>::write_strided(
    int stride,
    int overall_n) {
  // Second-step store: scatter this thread's consecutive threadgroup
  // elements to device memory `stride` apart, applying the inverse
  // normalization over the full length `overall_n`.
  compute_strided_indices(stride, overall_n);
  for (int e = 0; e < elems_per_thread; e++) {
    const int dst = strided_device_idx + e * stride;
    out[dst] = pre_out(buf[strided_shared_idx + e], overall_n);
  }
}

// For RFFT, we interleave batches of two real sequences into one complex one:
//
// z_k = x_k + j.y_k
// X_k = (Z_k + Z_(N-k)*) / 2
// Y_k = -j * ((Z_k - Z_(N-k)*) / 2)
//
// This roughly doubles the throughput over the regular FFT.
template <>
METAL_FUNC bool ReadWriter<float, float2>::out_of_bounds() const {
  // RFFT packs two real sequences per slot, so a slot is past the end of
  // the batch once twice its index reaches batch_size.
  const int slot = elem.x * grid.y + elem.y;
  return slot * 2 >= batch_size;
}

template <>
METAL_FUNC void ReadWriter<float, float2>::load() const {
  // RFFT load: two consecutive real sequences are packed into one complex
  // threadgroup sequence (first -> real part, second -> imaginary part).
  threadgroup float2* seq_buf = buf + elem.y * n;
  size_t batch_idx = size_t(elem.x * grid.y) * n * 2 + elem.y * n * 2;

  // With an odd batch size the final slot has no partner sequence; a zero
  // offset makes the second read hit the same sequence and stay in bounds.
  int grid_index = elem.x * grid.y + elem.y;
  bool unpaired = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1;
  short next_in = unpaired ? 0 : n;

  short fft_idx = elem.z;
  short m = grid.z;

  for (int e = 0; e < elems_per_thread; e++) {
    int index = metal::min(fft_idx + e * m, n - 1);
    size_t src = batch_idx + index;
    seq_buf[index].x = in[src];
    seq_buf[index].y = in[src + next_in];
  }
}

template <>
METAL_FUNC void ReadWriter<float, float2>::write() const {
  // RFFT store: separate the packed transform Z into the two half-spectra
  //   X_k = (Z_k + conj(Z_(N-k))) / 2
  //   Y_k = -j * (Z_k - conj(Z_(N-k))) / 2
  // and write n / 2 + 1 bins for each sequence.
  short n_over_2 = (n / 2) + 1;
  threadgroup float2* seq_buf = buf + elem.y * n;

  size_t batch_idx =
      size_t(elem.x * grid.y) * n_over_2 * 2 + elem.y * n_over_2 * 2;

  // Odd batch size: the final slot has no partner, so both stores target
  // the same output row.
  int grid_index = elem.x * grid.y + elem.y;
  bool unpaired = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1;
  short next_out = unpaired ? 0 : n_over_2;

  float2 conj_mask = {1, -1};
  float2 minus_j = {0, -1};

  short fft_idx = elem.z;
  short m = grid.z;

  for (int e = 0; e < elems_per_thread / 2 + 1; e++) {
    int index = metal::min(fft_idx + e * m, n_over_2 - 1);
    size_t dst = batch_idx + index;
    if (index != 0) {
      float2 z_k = seq_buf[index];
      float2 z_mirror = seq_buf[n - index] * conj_mask;
      out[dst] = (z_k + z_mirror) / 2;
      out[dst + next_out] = complex_mul(((z_k - z_mirror) / 2), minus_j);
    } else {
      // Bin 0 is purely real for both sequences:
      // x_0 = z_0.real, y_0 = z_0.imag
      out[dst] = {seq_buf[index].x, 0};
      out[dst + next_out] = {seq_buf[index].y, 0};
    }
  }
}

template <>
METAL_FUNC void ReadWriter<float, float2>::load_padded(
    int length,
    const device float2* w_k) const {
  // Bluestein RFFT load: pack two real sequences of `length` elements into
  // one complex sequence, multiply by w_k and zero-fill up to n.
  threadgroup float2* seq_buf = buf + elem.y * n;
  size_t batch_idx = size_t(elem.x * grid.y) * length * 2 + elem.y * length * 2;

  // Odd batch size: the final slot has no partner sequence, so re-read the
  // same sequence instead of stepping out of bounds.
  int grid_index = elem.x * grid.y + elem.y;
  bool unpaired = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1;
  short next_in = unpaired ? 0 : length;

  short fft_idx = elem.z;
  short m = grid.z;

  for (int e = 0; e < elems_per_thread; e++) {
    int index = metal::min(fft_idx + e * m, n - 1);
    if (index >= length) {
      // Padding region
      seq_buf[index] = 0;
      continue;
    }
    float2 packed =
        float2(in[batch_idx + index], in[batch_idx + index + next_in]);
    seq_buf[index] = complex_mul(packed, w_k[index]);
  }
}

template <>
METAL_FUNC void ReadWriter<float, float2>::write_padded(
    int length,
    const device float2* w_k) const {
  // Bluestein RFFT store: the result starts at offset length - 1 of the
  // threadgroup sequence. Each value is scaled by inv_factor and multiplied
  // by w_k before the two packed spectra are separated.
  int length_over_2 = (length / 2) + 1;
  threadgroup float2* seq_buf = buf + elem.y * n + length - 1;
  size_t batch_idx =
      size_t(elem.x * grid.y) * length_over_2 * 2 + elem.y * length_over_2 * 2;

  // Odd batch size: the final slot has no partner output row.
  int grid_index = elem.x * grid.y + elem.y;
  bool unpaired = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1;
  short next_out = unpaired ? 0 : length_over_2;

  float2 conj_mask = {1, -1};
  // Component-wise: divide by n and negate the imaginary part.
  float2 inv_factor = {1.0f / n, -1.0f / n};
  float2 minus_j = {0, -1};

  short fft_idx = elem.z;
  short m = grid.z;

  for (int e = 0; e < elems_per_thread / 2 + 1; e++) {
    int index = metal::min(fft_idx + e * m, length_over_2 - 1);
    // Both cases start from the same w_k-corrected value.
    float2 z_k = complex_mul(w_k[index], seq_buf[index] * inv_factor);
    if (index == 0) {
      // x_0 = z_0.real, y_0 = z_0.imag
      out[batch_idx + index] = float2(z_k.x, 0);
      out[batch_idx + index + next_out] = float2(z_k.y, 0);
    } else {
      // The w_k correction has to happen before the spectra are separated.
      float2 z_mirror = complex_mul(
          w_k[length - index], seq_buf[length - index] * inv_factor);
      z_mirror *= conj_mask;
      out[batch_idx + index] = (z_k + z_mirror) / 2;
      out[batch_idx + index + next_out] =
          complex_mul(((z_k - z_mirror) / 2), minus_j);
    }
  }
}

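// For IRFFT, we do the opposite: two half-spectra are packed into one complex
// input Z and the two real outputs are read from the real and imaginary
// parts of the inverse transform z = ifft(Z):
//
// Z_k = X_k + j.Y_k
// x_k = Re(z_k)
// y_k = Imag(z_k)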
template <>
METAL_FUNC bool ReadWriter<float2, float>::out_of_bounds() const {
  // IRFFT packs two sequences per slot, so a slot is past the end of the
  // batch once twice its index reaches batch_size.
  const int slot = elem.x * grid.y + elem.y;
  return slot * 2 >= batch_size;
}

template <>
METAL_FUNC void ReadWriter<float2, float>::load() const {
  // IRFFT load: combine two half-spectra X and Y (n / 2 + 1 bins each) into
  // one full-length complex sequence Z = X + j.Y, rebuilding the upper half
  // from the conjugates of the inputs. Every stored value is conjugated.
  short n_over_2 = (n / 2) + 1;
  threadgroup float2* seq_buf = buf + elem.y * n;
  size_t batch_idx =
      size_t(elem.x * grid.y) * n_over_2 * 2 + elem.y * n_over_2 * 2;

  // Odd batch size: the final slot has no partner sequence, so re-read the
  // same sequence instead of stepping out of bounds.
  int grid_index = elem.x * grid.y + elem.y;
  bool unpaired = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1;
  short next_in = unpaired ? 0 : n_over_2;

  short fft_idx = elem.z;
  short m = grid.z;

  float2 conj_mask = {1, -1};
  float2 plus_j = {0, 1};

  for (int t = 0; t < elems_per_thread / 2 + 1; t++) {
    int index = metal::min(fft_idx + t * m, n_over_2 - 1);
    float2 x = in[batch_idx + index];
    float2 y = in[batch_idx + index + next_in];

    // NumPy forces the first input to be real, and the last one too for
    // even-length irffts.
    bool is_first = index == 0;
    bool is_last = n % 2 == 0 && index == n_over_2 - 1;
    if (is_first || is_last) {
      x = float2(x.x, 0);
      y = float2(y.x, 0);
    }

    float2 lower = x + complex_mul(y, plus_j);
    seq_buf[index] = float2(lower.x, -lower.y);

    // Mirror bin n - index; bins 0 and (for even n) n / 2 have no mirror.
    if (index > 0 && !is_last) {
      float2 upper = (x * conj_mask) + complex_mul(y * conj_mask, plus_j);
      seq_buf[n - index] = float2(upper.x, -upper.y);
    }
  }
}

template <>
METAL_FUNC void ReadWriter<float2, float>::write() const {
  // IRFFT store: the real part of the threadgroup sequence is the first
  // output sequence and the imaginary part the second. Values are divided
  // by n (and -n for the imaginary part).
  //
  // The base offset is computed in size_t, like the other load/write
  // variants, so it cannot wrap for large batches.
  size_t batch_idx = size_t(elem.x * grid.y) * n * 2 + elem.y * n * 2;
  threadgroup float2* seq_buf = buf + elem.y * n;

  // Odd batch size: the final slot has no partner output row, so both
  // stores target the same row.
  int grid_index = elem.x * grid.y + elem.y;
  short next_out =
      batch_size % 2 == 1 && grid_index * 2 == batch_size - 1 ? 0 : n;

  short m = grid.z;
  short fft_idx = elem.z;

  for (int e = 0; e < elems_per_thread; e++) {
    int index = metal::min(fft_idx + e * m, n - 1);
    out[batch_idx + index] = seq_buf[index].x / n;
    out[batch_idx + index + next_out] = seq_buf[index].y / -n;
  }
}

template <>
METAL_FUNC void ReadWriter<float2, float>::load_padded(
    int length,
    const device float2* w_k) const {
  // Bluestein IRFFT load: combine two half-spectra of length / 2 + 1 bins
  // into one complex sequence of `length` elements (rebuilding the upper
  // half from the conjugates), multiply by w_k and zero-fill up to n.
  int n_over_2 = (n / 2) + 1;
  int length_over_2 = (length / 2) + 1;

  size_t batch_idx =
      size_t(elem.x * grid.y) * length_over_2 * 2 + elem.y * length_over_2 * 2;
  threadgroup float2* seq_buf = buf + elem.y * n;

  // No out of bounds accesses on odd batch sizes
  int grid_index = elem.x * grid.y + elem.y;
  short next_in = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1
      ? 0
      : length_over_2;

  short m = grid.z;
  short fft_idx = elem.z;

  float2 conj = {1, -1};
  float2 plus_j = {0, 1};

  for (int t = 0; t < elems_per_thread / 2 + 1; t++) {
    int index = metal::min(fft_idx + t * m, n_over_2 - 1);
    if (index < length_over_2) {
      // Only read the input for bins that exist: `index` is clamped against
      // the padded size n, so it can exceed the input row length.
      float2 x = in[batch_idx + index];
      float2 y = in[batch_idx + index + next_in];
      // The last bin of an even-length input is forced to be real.
      bool last_val = length % 2 == 0 && index == length_over_2 - 1;
      if (last_val) {
        x = float2(x.x, 0);
        y = float2(y.x, 0);
      }
      float2 elem1 = x + complex_mul(y, plus_j);
      seq_buf[index] = complex_mul(elem1 * conj, w_k[index]);
      // Mirror bin length - index; bin 0 and the forced-real last bin have
      // no mirror.
      if (index > 0 && !last_val) {
        float2 elem2 = (x * conj) + complex_mul(y * conj, plus_j);
        seq_buf[length - index] =
            complex_mul(elem2 * conj, w_k[length - index]);
      }
    } else {
      // Padding region: each remaining index clears two elements.
      short pad_index = metal::min(length + (index - length_over_2) * 2, n - 2);
      seq_buf[pad_index] = 0;
      seq_buf[pad_index + 1] = 0;
    }
  }
}

template <>
METAL_FUNC void ReadWriter<float2, float>::write_padded(
    int length,
    const device float2* w_k) const {
  // Bluestein IRFFT store: the result starts at offset length - 1 of the
  // threadgroup sequence. After scaling by inv_factor and w_k, the real part
  // goes to the first output sequence and the imaginary part to the second.
  threadgroup float2* seq_buf = buf + elem.y * n + length - 1;
  size_t batch_idx = size_t(elem.x * grid.y) * length * 2 + elem.y * length * 2;

  // Odd batch size: the final slot has no partner output row.
  int grid_index = elem.x * grid.y + elem.y;
  bool unpaired = batch_size % 2 == 1 && grid_index * 2 == batch_size - 1;
  short next_out = unpaired ? 0 : length;

  short fft_idx = elem.z;
  short m = grid.z;

  // Component-wise: divide by n and negate the imaginary part.
  float2 inv_factor = {1.0f / n, -1.0f / n};
  for (int e = 0; e < elems_per_thread; e++) {
    int index = fft_idx + e * m;
    if (index >= length) {
      continue;
    }
    float2 z = complex_mul(seq_buf[index] * inv_factor, w_k[index]);
    out[batch_idx + index] = z.x / length;
    out[batch_idx + index + next_out] = z.y / -length;
  }
}

// Four Step RFFT
template <>
METAL_FUNC void
ReadWriter<float2, float2, /*step=*/1, /*real=*/true>::load_strided(
    int stride,
    int overall_n) {
  // The second step reads contiguously, so neither argument is needed;
  // the casts only silence unused-parameter warnings.
  (void)overall_n;
  (void)stride;
  // The inversion must not be applied between the two steps: run the plain
  // load with `inv` cleared and restore it afterwards.
  const bool saved_inv = inv;
  inv = false;
  load();
  inv = saved_inv;
}

template <>
METAL_FUNC void
ReadWriter<float2, float2, /*step=*/1, /*real=*/true>::write_strided(
    int stride,
    int overall_n) {
  // Four-step RFFT store: only overall_n / 2 + 1 bins are written per
  // sequence, so each thread stores half of its elements and the output
  // rows are overall_n / 2 + 1 elements long.
  int overall_n_over_2 = overall_n / 2 + 1;
  int lanes = grid.y; // coalescing width
  int tg_idx = elem.y * grid.z + elem.z;
  int lane = tg_idx % lanes;
  int outer_batch_size = stride / lanes;

  int strided_batch_idx = (elem.x % outer_batch_size) * lanes +
      overall_n_over_2 * (elem.x / outer_batch_size);

  // Same layout as compute_strided_indices but with a halved per-thread
  // extent; the member indices are updated as a side effect.
  strided_device_idx = strided_batch_idx +
      tg_idx / lanes * elems_per_thread / 2 * stride + lane;
  strided_shared_idx = lane * n + tg_idx / lanes * elems_per_thread / 2;

  int count = elems_per_thread / 2;
  for (int e = 0; e < count; e++) {
    out[strided_device_idx + e * stride] = buf[strided_shared_idx + e];
  }

  // Add on n/2 + 1 element
  if (tg_idx == 0 && elem.x % outer_batch_size == 0) {
    out[strided_batch_idx + overall_n / 2] = buf[n / 2];
  }
}

// Four Step IRFFT
template <>
METAL_FUNC void
ReadWriter<float2, float2, /*step=*/0, /*real=*/true>::load_strided(
    int stride,
    int overall_n) {
  // Four-step IRFFT load: the device input only stores overall_n / 2 + 1
  // bins per sequence. Indices are computed as if the full spectrum were
  // present and then mapped onto the stored half; bins in the upper half
  // are fetched from their mirror N - k.
  int overall_n_over_2 = overall_n / 2 + 1;
  auto conj_mask = float2(1, -1);

  compute_strided_indices(stride, overall_n);
  for (int e = 0; e < elems_per_thread; e++) {
    int full_idx = strided_device_idx + e * stride;
    int k = full_idx % overall_n;
    // Start of this sequence's row in the half-spectrum input.
    int row = (full_idx / overall_n) * overall_n_over_2;
    if (k < overall_n_over_2) {
      buf[strided_shared_idx + e] = in[row + k] * conj_mask;
    } else {
      buf[strided_shared_idx + e] = in[row + (overall_n - k)];
    }
  }
}

template <>
METAL_FUNC void
ReadWriter<float2, float, /*step=*/1, /*real=*/true>::load_strided(
    int stride,
    int overall_n) {
  // The second step reads contiguously, so neither argument is needed;
  // the casts only silence unused-parameter warnings.
  (void)overall_n;
  (void)stride;
  // Run the plain load with `inv` cleared and restore it afterwards.
  const bool saved_inv = inv;
  inv = false;
  load();
  inv = saved_inv;
}

template <>
METAL_FUNC void
ReadWriter<float2, float, /*step=*/1, /*real=*/true>::write_strided(
    int stride,
    int overall_n) {
  // Four-step IRFFT store: normalize over the full length and keep only
  // the real component of each result.
  compute_strided_indices(stride, overall_n);

  for (int e = 0; e < elems_per_thread; e++) {
    const float2 scaled = pre_out(buf[strided_shared_idx + e], overall_n);
    out[strided_device_idx + e * stride] = scaled.x;
  }
}


================================================
FILE: mlx/backend/metal/kernels/fft.h
================================================
// Copyright © 2024 Apple Inc.

// Metal FFT using Stockham's algorithm
//
// References:
// - VkFFT (https://github.com/DTolm/VkFFT)
// - Eric Bainville's excellent page (http://www.bealto.com/gpu-fft.html)

#include <metal_common>

#include "mlx/backend/metal/kernels/fft/radix.h"
#include "mlx/backend/metal/kernels/fft/readwrite.h"
#include "mlx/backend/metal/kernels/steel/defines.h"

using namespace metal;

// Largest radix codelet; sizes the per-thread input scratch arrays.
#define MAX_RADIX 13
// Reached when elems_per_thread_ = 6, max_radix = 13
// and some threads have to do 3 radix 6s requiring 18 float2s.
#define MAX_OUTPUT_SIZE 18

// Specialize for a particular value of N at runtime
// (Metal function constants; the numbers are the function-constant indices.)
// inv_: compute the inverse transform.
STEEL_CONST bool inv_ [[function_constant(0)]];
// is_power_of_2_: enables the bitwise index math in radix_butterfly.
STEEL_CONST bool is_power_of_2_ [[function_constant(1)]];
// elems_per_thread_: number of elements each thread handles.
STEEL_CONST int elems_per_thread_ [[function_constant(2)]];
// rader_m = n / rader_n
STEEL_CONST int rader_m_ [[function_constant(3)]];
// Stockham steps: number of passes to run for each radix.
STEEL_CONST int radix_13_steps_ [[function_constant(4)]];
STEEL_CONST int radix_11_steps_ [[function_constant(5)]];
STEEL_CONST int radix_8_steps_ [[function_constant(6)]];
STEEL_CONST int radix_7_steps_ [[function_constant(7)]];
STEEL_CONST int radix_6_steps_ [[function_constant(8)]];
STEEL_CONST int radix_5_steps_ [[function_constant(9)]];
STEEL_CONST int radix_4_steps_ [[function_constant(10)]];
STEEL_CONST int radix_3_steps_ [[function_constant(11)]];
STEEL_CONST int radix_2_steps_ [[function_constant(12)]];
// Rader steps: per-radix pass counts used by perform_fft</*rader=*/true>.
STEEL_CONST int rader_13_steps_ [[function_constant(13)]];
STEEL_CONST int rader_11_steps_ [[function_constant(14)]];
STEEL_CONST int rader_8_steps_ [[function_constant(15)]];
STEEL_CONST int rader_7_steps_ [[function_constant(16)]];
STEEL_CONST int rader_6_steps_ [[function_constant(17)]];
STEEL_CONST int rader_5_steps_ [[function_constant(18)]];
STEEL_CONST int rader_4_steps_ [[function_constant(19)]];
STEEL_CONST int rader_3_steps_ [[function_constant(20)]];
STEEL_CONST int rader_2_steps_ [[function_constant(21)]];

// See "radix.h" for radix codelets
// Signature shared by the codelets: (inputs, outputs), both in thread memory.
typedef void (*RadixFunc)(thread float2*, thread float2*);

// Perform a single radix n butterfly with appropriate twiddles
template <int radix, RadixFunc radix_func>
METAL_FUNC void radix_butterfly(
    int i,
    int p,
    thread float2* x,
    thread short* indices,
    thread float2* y) {
  // i: the index in the overall DFT that we're processing.
  // p: the size of the DFTs we're merging at this step.
  // x: the `radix` inputs; twiddled in place when p > 1.
  // indices: receives the `radix` destination indices of the outputs.
  // y: receives the `radix` butterfly outputs.

  // offset: position inside the current size-p DFT.
  // base: destination index of the first output.
  int offset;
  int base;

  // Use faster bitwise operations when working with powers of two
  constexpr bool radix_p_2 = (radix & (radix - 1)) == 0;
  if (radix_p_2 && is_power_of_2_) {
    constexpr short power = __builtin_ctz(radix);
    offset = i & (p - 1);
    base = ((i - offset) << power) + offset;
  } else {
    offset = i % p;
    base = (i / p) * radix * p + offset;
  }

  // Apply twiddles: x[t] is multiplied by the t-th power of the base
  // twiddle, built up by repeated multiplication.
  if (p > 1) {
    const float2 base_twiddle = get_twiddle(offset, radix * p);
    float2 power_twiddle = base_twiddle;
    x[1] = complex_mul(x[1], power_twiddle);

    STEEL_PRAGMA_UNROLL
    for (int t = 2; t < radix; t++) {
      power_twiddle = complex_mul(power_twiddle, base_twiddle);
      x[t] = complex_mul(x[t], power_twiddle);
    }
  }

  radix_func(x, y);

  // Outputs are spaced p apart starting at `base`.
  STEEL_PRAGMA_UNROLL
  for (int t = 0; t < radix; t++) {
    indices[t] = base + t * p;
  }
}

// Perform all the radix steps required for a
// particular radix size n.
template <int radix, RadixFunc radix_func>
METAL_FUNC void radix_n_steps(
    int i,
    thread int* p,
    int m,
    int n,
    int num_steps,
    thread float2* inputs,
    thread short* indices,
    thread float2* values,
    threadgroup float2* buf) {
  // Runs `num_steps` passes of the given radix over the n-element DFT held
  // in `buf`, multiplying *p by `radix` after each pass.
  //
  // i: this thread's index within the DFT; m: threads per DFT.
  // inputs / indices / values: per-thread scratch.

  // Number of butterflies per pass.
  const int butterflies = n / radix;
  // When combining different sized radices, we have to do
  // multiple butterflies in a single thread.
  // E.g. n = 28 = 4 * 7
  // 4 threads, 7 elems_per_thread
  // All threads do 1 radix7 butterfly.
  // 3 threads do 2 radix4 butterflies.
  // 1 thread does 1 radix4 butterfly.
  const int max_radices_per_thread = (elems_per_thread_ + radix - 1) / radix;

  for (int s = 0; s < num_steps; s++) {
    // Gather inputs and compute the butterflies into thread-local storage.
    for (int t = 0; t < max_radices_per_thread; t++) {
      const int bfly = i + t * m;
      if (bfly >= butterflies) {
        continue;
      }
      for (int r = 0; r < radix; r++) {
        inputs[r] = buf[bfly + r * butterflies];
      }
      radix_butterfly<radix, radix_func>(
          bfly, *p, inputs, indices + t * radix, values + t * radix);
    }

    // Wait until all threads have read their inputs into thread local mem
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Scatter the results to the indices chosen by the butterflies.
    for (int t = 0; t < max_radices_per_thread; t++) {
      if (i + t * m >= butterflies) {
        continue;
      }
      for (int r = 0; r < radix; r++) {
        const int slot = t * radix + r;
        buf[indices[slot]] = values[slot];
      }
    }

    // Wait until all threads have written back to threadgroup mem
    threadgroup_barrier(mem_flags::mem_threadgroup);
    *p *= radix;
  }
}

// Convenience wrapper around radix_n_steps. Expects the names fft_idx, p, m,
// n, inputs, indices, values and buf to be in scope at the call site.
#define RADIX_STEP(radix, radix_func, num_steps) \
  radix_n_steps<radix, radix_func>(              \
      fft_idx, p, m, n, num_steps, inputs, indices, values, buf);

template <bool rader = false>
METAL_FUNC void
perform_fft(int fft_idx, thread int* p, int m, int n, threadgroup float2* buf) {
  // Runs every radix pass for an n-element DFT in `buf`, from radix 2 up to
  // radix 13. The per-radix pass counts come from function constants; the
  // `rader` variant uses the rader_*_steps_ set instead of radix_*_steps_.
  //
  // fft_idx: this thread's index within the DFT; m: threads per DFT.
  // p: running size of the merged sub-DFTs, updated in place.

  // Per-thread scratch shared by all passes.
  float2 inputs[MAX_RADIX];
  short indices[MAX_OUTPUT_SIZE];
  float2 values[MAX_OUTPUT_SIZE];

  radix_n_steps<2, radix2>(
      fft_idx, p, m, n, rader ? rader_2_steps_ : radix_2_steps_,
      inputs, indices, values, buf);
  radix_n_steps<3, radix3>(
      fft_idx, p, m, n, rader ? rader_3_steps_ : radix_3_steps_,
      inputs, indices, values, buf);
  radix_n_steps<4, radix4>(
      fft_idx, p, m, n, rader ? rader_4_steps_ : radix_4_steps_,
      inputs, indices, values, buf);
  radix_n_steps<5, radix5>(
      fft_idx, p, m, n, rader ? rader_5_steps_ : radix_5_steps_,
      inputs, indices, values, buf);
  radix_n_steps<6, radix6>(
      fft_idx, p, m, n, rader ? rader_6_steps_ : radix_6_steps_,
      inputs, indices, values, buf);
  radix_n_steps<7, radix7>(
      fft_idx, p, m, n, rader ? rader_7_steps_ : radix_7_steps_,
      inputs, indices, values, buf);
  radix_n_steps<8, radix8>(
      fft_idx, p, m, n, rader ? rader_8_steps_ : radix_8_steps_,
      inputs, indices, values, buf);
  radix_n_steps<11, radix11>(
      fft_idx, p, m, n, rader ? rader_11_steps_ : radix_11_steps_,
      inputs, indices, values, buf);
  radix_n_steps<13, radix13>(
      fft_idx, p, m, n, rader ? rader_13_steps_ : radix_13_steps_,
      inputs, indices, values, buf);
}

// Each FFT is computed entirely in shared GPU memory.
//
// N is decomposed into radix-n DFTs:
// e.g. 128 = 2 * 4 * 4 * 4
template <int tg_mem_size, typename in_T, typename out_T>
[[kernel]] void fft(
    const device in_T* in [[buffer(0)]],
    device out_T* out [[buffer(1)]],
    constant const int& n,
    constant const int& batch_size,
    uint3 elem [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  // Stockham FFT of length n, computed entirely in threadgroup memory:
  // load -> radix passes -> store.
  threadgroup float2 shared_in[tg_mem_size];

  thread ReadWriter<in_T, out_T> io = ReadWriter<in_T, out_T>(
      in,
      &shared_in[0],
      out,
      n,
      batch_size,
      elems_per_thread_,
      elem,
      grid,
      inv_);

  // Threads mapped past the end of the batch have nothing to do.
  if (io.out_of_bounds()) {
    return;
  }
  io.load();

  threadgroup_barrier(mem_flags::mem_threadgroup);

  int fft_idx = elem.z; // Thread index in DFT
  int m = grid.z; // Threads per DFT
  int dft_offset = elem.y * n; // Index of this DFT in threadgroup
  threadgroup float2* dft_buf = &shared_in[dft_offset];

  int p = 1;
  perform_fft(fft_idx, &p, m, n, dft_buf);

  io.write();
}

template <int tg_mem_size, typename in_T, typename out_T>
[[kernel]] void rader_fft(
    const device in_T* in [[buffer(0)]],
    device out_T* out [[buffer(1)]],
    const device float2* raders_b_q [[buffer(2)]],
    const device short* raders_g_q [[buffer(3)]],
    const device short* raders_g_minus_q [[buffer(4)]],
    constant const int& n,
    constant const int& batch_size,
    constant const int& rader_n,
    uint3 elem [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  // Use Rader's algorithm to compute fast FFTs
  // when a prime factor `p` of `n` is greater than 13 but
  // has `p - 1` Stockham decomposable into prime factors <= 13.
  //
  // E.g. n = 102
  //        = 2 * 3 * 17
  //        = 2 * 3 * RADER(16)
  //        = 2 * 3 * RADER(4 * 4)
  //
  // In numpy:
  //   x_perm = x[g_q]
  //   y = np.fft.fft(x_perm) * b_q
  //   z = np.fft.ifft(y) + x[0]
  //   out = z[g_minus_q]
  //   out[0]  = x[1:].sum()
  //
  // Where the g_q and g_minus_q are permutations formed
  // by the group under multiplicative modulo N using the
  // primitive root of N and b_q is a constant.
  // See https://en.wikipedia.org/wiki/Rader%27s_FFT_algorithm
  //
  // Rader's uses fewer operations than Bluestein's and so
  // is more accurate. It's also faster in most cases.
  //
  // Buffers 2-4 hold the precomputed b_q, g_q and g_minus_q tables
  // described above; rader_n is the prime length handled by Rader's.
  threadgroup float2 shared_in[tg_mem_size];

  thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
      in,
      &shared_in[0],
      out,
      n,
      batch_size,
      elems_per_thread_,
      elem,
      grid,
      inv_);

  // Threads mapped past the end of the batch have nothing to do.
  if (read_writer.out_of_bounds()) {
    return;
  };
  read_writer.load();

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // The number of the threads we're using for each DFT
  int m = grid.z;

  int fft_idx = elem.z;
  int tg_idx = elem.y * n;
  threadgroup float2* buf = &shared_in[tg_idx];

  // rader_m = n / rader_n;
  int rader_m = rader_m_;

  // We have to load two x_0s for each thread since sometimes
  // elems_per_thread_ crosses a boundary.
  // E.g. with n = 34, rader_n = 17, elems_per_thread_ = 4
  // 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7 8 8
  // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  short x_0_index =
      metal::min(fft_idx * elems_per_thread_ / (rader_n - 1), rader_m - 1);
  float2 x_0[2] = {buf[x_0_index], buf[x_0_index + 1]};

  // Do the Rader permutation in shared memory
  // (read everything into thread-local temp first, then write back after a
  // barrier so no thread overwrites data another thread still has to read)
  float2 temp[MAX_RADIX];
  // Last valid index within the n - rader_m elements that follow the
  // first rader_m entries of buf.
  int max_index = n - rader_m - 1;
  for (int e = 0; e < elems_per_thread_; e++) {
    short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
    short g_q = raders_g_q[index / rader_m];
    temp[e] = buf[rader_m + (g_q - 1) * rader_m + index % rader_m];
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  for (int e = 0; e < elems_per_thread_; e++) {
    short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
    buf[index + rader_m] = temp[e];
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Rader FFT on x[rader_m:]
  int p = 1;
  perform_fft</*rader=*/true>(fft_idx, &p, m, n - rader_m, buf + rader_m);

  // x_1 + ... + x_n is computed for us in the first FFT step so
  // we save it in the first rader_m indices of the array for later.
  int x_sum_index = metal::min(fft_idx, rader_m - 1);
  buf[x_sum_index] = buf[rader_m + x_sum_index * (rader_n - 1)];

  // Multiply by b_q and conjugate (via `inv`) ahead of the second transform;
  // the index math also changes the element ordering between the two FFTs.
  float2 inv = {1.0f, -1.0f};
  for (int e = 0; e < elems_per_thread_; e++) {
    short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
    short interleaved_index =
        index / rader_m + (index % rader_m) * (rader_n - 1);
    temp[e] = complex_mul(
        buf[rader_m + interleaved_index],
        raders_b_q[interleaved_index % (rader_n - 1)]);
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  for (int e = 0; e < elems_per_thread_; e++) {
    short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
    buf[rader_m + index] = temp[e] * inv;
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Rader IFFT on x[rader_m:]
  p = 1;
  perform_fft</*rader=*/true>(fft_idx, &p, m, n - rader_m, buf + rader_m);

  // Component-wise: divide by (rader_n - 1) and negate the imaginary part.
  float2 rader_inv_factor = {1.0f / (rader_n - 1), -1.0f / (rader_n - 1)};

  for (int e = 0; e < elems_per_thread_; e++) {
    short index = metal::min(fft_idx * elems_per_thread_ + e, n - rader_m - 1);
    // Selects which of the two cached x_0 values applies to this element.
    short diff_index = index / (rader_n - 1) - x_0_index;
    temp[e] = buf[rader_m + index] * rader_inv_factor + x_0[diff_index];
  }

  // Use the sum of elements that was computed in the first FFT
  float2 x_sum = buf[x_0_index] + x_0[0];

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Scatter through the g_minus_q permutation.
  for (int e = 0; e < elems_per_thread_; e++) {
    short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
    short g_q_index = index % (rader_n - 1);
    short g_q = raders_g_minus_q[g_q_index];
    short out_index = index - g_q_index + g_q + (index / (rader_n - 1));
    buf[out_index] = temp[e];
  }

  buf[x_0_index * rader_n] = x_sum;

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Remaining Stockham passes; p starts at rader_n rather than 1.
  p = rader_n;
  perform_fft(fft_idx, &p, m, n, buf);

  read_writer.write();
}

template <int tg_mem_size, typename in_T, typename out_T>
[[kernel]] void bluestein_fft(
    const device in_T* in [[buffer(0)]],
    device out_T* out [[buffer(1)]],
    const device float2* w_q [[buffer(2)]],
    const device float2* w_k [[buffer(3)]],
    constant const int& length,
    constant const int& n,
    constant const int& batch_size,
    uint3 elem [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  // Computes arbitrary length FFTs with Bluestein's algorithm
  //
  // In numpy:
  //   bluestein_n = next_power_of_2(2*n - 1)
  //   out = w_k * np.fft.ifft(np.fft.fft(w_k * in, bluestein_n) * w_q)
  //
  // Where w_k and w_q are precomputed on CPU in high precision as:
  //   w_k = np.exp(-1j * np.pi / n * (np.arange(-n + 1, n) ** 2))
  //   w_q = np.fft.fft(1/w_k[-n:])
  threadgroup float2 shared_in[tg_mem_size];

  thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
      in,
      &shared_in[0],
      out,
      n,
      batch_size,
      elems_per_thread_,
      elem,
      grid,
      inv_);

  if (read_writer.out_of_bounds()) {
    return;
  };
  read_writer.load_padded(length, w_k);

  threadgroup_barrier(mem_flags::mem_threadgroup);

  int p = 1;
  int fft_idx = elem.z; // Thread index in DFT
  int m = grid.z; // Threads per DFT
  int tg_idx = elem.y * n; // Index of this DFT in threadgroup
  threadgroup float2* buf = &shared_in[tg_idx];

  // fft
  perform_fft(fft_idx, &p, m, n, buf);

  float2 inv = float2(1.0f, -1.0f);
  for (int t = 0; t < elems_per_thread_; t++) {
    int index = fft_idx + t * m;
    buf[index] = complex_mul(buf[index], w_q[index]) * inv;
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // ifft
  p = 1;
  perform_fft(fft_idx, &p, m, n, buf);

  read_writer.write_padded(length, w_k);
}

template <
    int tg_mem_size,
    typename in_T,
    typename out_T,
    int step,
    bool real = false>
[[kernel]] void four_step_fft(
    const device in_T* in [[buffer(0)]],
    device out_T* out [[buffer(1)]],
    constant const int& n1,
    constant const int& n2,
    constant const int& batch_size,
    uint3 elem [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  // Fast four step FFT implementation for powers of 2.
  //
  // `step` selects which factor of the n1 * n2 decomposition this launch
  // transforms: step 0 runs length-n1 DFTs strided by n2, any other step runs
  // length-n2 DFTs strided by n1.
  int overall_n = n1 * n2;
  int n;
  int stride;
  if (step == 0) {
    n = n1;
    stride = n2;
  } else {
    n = n2;
    stride = n1;
  }

  int threads_per_dft = grid.z; // Threads cooperating on each DFT
  int thread_in_dft = elem.z; // This thread's index within its DFT

  threadgroup float2 shared_in[tg_mem_size];
  // Slice of threadgroup memory that holds this thread's DFT.
  threadgroup float2* buf = &shared_in[elem.y * n];

  using read_writer_t = ReadWriter<in_T, out_T, step, real>;
  read_writer_t read_writer = read_writer_t(
      in,
      &shared_in[0],
      out,
      n,
      batch_size,
      elems_per_thread_,
      elem,
      grid,
      inv_);

  // Threads that fall outside the batch have nothing to do.
  if (read_writer.out_of_bounds()) {
    return;
  }

  read_writer.load_strided(stride, overall_n);
  threadgroup_barrier(mem_flags::mem_threadgroup);

  int p = 1;
  perform_fft(thread_in_dft, &p, threads_per_dft, n, buf);

  read_writer.write_strided(stride, overall_n);
}


================================================
FILE: mlx/backend/metal/kernels/fft.metal
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/fft.h"

// Each macro below forwards to `instantiate_kernel` (defined outside this
// file) with a host-visible kernel name, the kernel template to instantiate,
// and that template's arguments. The name is assembled by stringizing the
// macro arguments, so the spelling of each argument becomes part of the name.

// Instantiates `fft` as "fft_mem_<tg_mem_size>_<in_T>_<out_T>".
#define instantiate_fft(tg_mem_size, in_T, out_T)   \
  instantiate_kernel(                               \
      "fft_mem_" #tg_mem_size "_" #in_T "_" #out_T, \
      fft,                                          \
      tg_mem_size,                                  \
      in_T,                                         \
      out_T)

// Instantiates `rader_fft` as "rader_fft_mem_<tg_mem_size>_<in_T>_<out_T>".
#define instantiate_rader(tg_mem_size, in_T, out_T)       \
  instantiate_kernel(                                     \
      "rader_fft_mem_" #tg_mem_size "_" #in_T "_" #out_T, \
      rader_fft,                                          \
      tg_mem_size,                                        \
      in_T,                                               \
      out_T)

// Instantiates `bluestein_fft` as
// "bluestein_fft_mem_<tg_mem_size>_<in_T>_<out_T>".
#define instantiate_bluestein(tg_mem_size, in_T, out_T)       \
  instantiate_kernel(                                         \
      "bluestein_fft_mem_" #tg_mem_size "_" #in_T "_" #out_T, \
      bluestein_fft,                                          \
      tg_mem_size,                                            \
      in_T,                                                   \
      out_T)

// Instantiates `four_step_fft` as
// "four_step_mem_<tg_mem_size>_<in_T>_<out_T>_<step>_<real>".
#define instantiate_four_step(tg_mem_size, in_T, out_T, step, real)           \
  instantiate_kernel(                                                         \
      "four_step_mem_" #tg_mem_size "_" #in_T "_" #out_T "_" #step "_" #real, \
      four_step_fft,                                                          \
      tg_mem_size,                                                            \
      in_T,                                                                   \
      out_T,                                                                  \
      step,                                                                   \
      real)

// clang-format off
// Instantiates every FFT kernel variant for one threadgroup memory size:
// fft / rader / bluestein for the (float2, float2), (float, float2) and
// (float2, float) input/output type pairs, plus both steps of the four-step
// kernel in its complex (real=false) and real (real=true) forms.
#define instantiate_ffts(tg_mem_size)                        \
  instantiate_fft(tg_mem_size, float2, float2) \
  instantiate_fft(tg_mem_size, float, float2) \
  instantiate_fft(tg_mem_size, float2, float) \
  instantiate_rader(tg_mem_size, float2, float2) \
  instantiate_rader(tg_mem_size, float, float2) \
  instantiate_rader(tg_mem_size, float2, float) \
  instantiate_bluestein(tg_mem_size, float2, float2) \
  instantiate_bluestein(tg_mem_size, float, float2) \
  instantiate_bluestein(tg_mem_size, float2, float) \
  instantiate_four_step(tg_mem_size, float2, float2, 0, /*real=*/false) \
  instantiate_four_step(tg_mem_size, float2, float2, 1, /*real=*/false) \
  instantiate_four_step(tg_mem_size, float, float2, 0, /*real=*/true) \
  instantiate_four_step(tg_mem_size, float2, float2, 1, /*real=*/true) \
  instantiate_four_step(tg_mem_size, float2, float2, 0, /*real=*/true) \
  instantiate_four_step(tg_mem_size, float2, float, 1, /*real=*/true)

// It's substantially faster to statically define the
// threadgroup memory size rather than using
// `setThreadgroupMemoryLength` on the compute encoder.
// For non-power of 2 sizes we round up the shared memory.
// The sizes count float2 elements: each kernel declares
// `threadgroup float2 shared_in[tg_mem_size]`.
instantiate_ffts(256)
instantiate_ffts(512)
instantiate_ffts(1024)
instantiate_ffts(2048)
// 4096 is the max that will fit into 32KB of threadgroup memory.
instantiate_ffts(4096) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/fp4.h
================================================
#pragma once

struct fp4_e2m1 {
  fp4_e2m1(float x) {
    if (metal::isnan(x)) {
      bits = 0x7;
      return;
    }

    const uint8_t sign_bit = (metal::signbit(x)) ? 0x8 : 0x0;
    x = metal::abs(x);

    if (x > 5.0f) {
      bits = 0x7;
    } else if (x >= 3.5f) {
      bits = 0x6;
    } else if (x > 2.5f) {
      bits = 0x5;
    } else if (x >= 1.75f) {
      bits = 0x4;
    } else if (x > 1.25f) {
      bits = 0x3;
    } else if (x >= 0.75f) {
      bits = 0x2;
    } else if (x > 0.25f) {
      bits = 0x1;
    } else {
      bits = 0x0;
    }
    bits |= sign_bit;
  }

  operator float16_t() {
    half converted = as_type<half>(ushort((bits & 7) << 9));
    converted *= 16384.0;
    return bits & 8 ? -converted : converted;
  }

  operator float() {
    return static_cast<float>(this->operator float16_t());
  }

  operator bfloat16_t() {
    return static_cast<bfloat16_t>(this->operator float16_t());
  }

  uint8_t bits;
};


================================================
FILE: mlx/backend/metal/kernels/fp8.h
================================================
#pragma once

struct fp8_e4m3 {
  template <typename T>
  fp8_e4m3(T f) {
    // From PyTorch
    // https://github.com/pytorch/pytorch/blob/e3643e1e0e923f0fc063dfab6f45c956d568919d/c10/util/Float8_e4m3fn.h#L148
    uint32_t fp8_max = 543 << 21;
    uint32_t denorm_mask = 141 << 23;
    uint32_t f_bits = as_type<uint32_t>(static_cast<float>(f));
    uint32_t sign = f_bits & 0x80000000;
    f_bits ^= sign;
    if (f_bits >= fp8_max) {
      // Default behavior saturates to min/max
      bits = 0x7E;
    } else {
      if (f_bits < (121 << 23)) {
        f_bits = as_type<uint32_t>(
            as_type<float>(f_bits) + as_type<float>(denorm_mask));
        bits = static_cast<uint8_t>(f_bits - denorm_mask);
      } else {
        // resulting mantissa is odd
        uint8_t mant_odd = (f_bits >> 20) & 1;
        f_bits += ((uint32_t)(7 - 127) << 23) + 0x7FFFF;
        f_bits += mant_odd;
        bits = static_cast<uint8_t>(f_bits >> 20);
      }
    }
    bits |= static_cast<uint8_t>(sign >> 24);
  }

  operator float16_t() {
    uint16_t v = (bits & 127) << 7;
    half converted = as_type<half>(v);
    converted *= 256.0;
    auto sign = bits & 128;
    return (sign ? -converted : converted);
  }

  operator bfloat16_t() {
    return static_cast<bfloat16_t>(this->operator float16_t());
  }

  operator float() {
    return static_cast<float>(this->operator float16_t());
  }

  uint8_t bits;
};

// Exponent-only 8-bit scale: `bits` stores round(log2(x)) + 127, so the
// decoded value is a power of two. 0xFF is used for non-finite inputs.
struct fp8_e8m0 {
  // Encodes `x` as the nearest power of two (nearest in log2).
  //   - Inf / NaN  -> 0xFF
  //   - x <= 0     -> 0x00 (the smallest code)
  //   - otherwise  -> clamp(round(log2(x)), -127, 127) + 127
  fp8_e8m0(float x) {
    if (!metal::isfinite(x)) {
      bits = 0xFF;
      return;
    }
    // Zero (either sign) is handled together with negative inputs. Otherwise
    // log2(0) = -inf would reach the float -> int conversion below, and the
    // code would depend on how that out-of-range conversion behaves.
    if (x <= 0.0f) {
      bits = 0x00;
      return;
    }
    float le = metal::log2(x);
    int n = int(metal::round(le));

    n = n < -127 ? -127 : n;
    n = n > 127 ? 127 : n;
    bits = static_cast<uint8_t>(n + 127);
  }

  // Places `bits` in the bfloat16 exponent field (bit 7 upwards). Code 0 has
  // no normal encoding and is returned as the subnormal pattern 0x40 (2^-127).
  operator bfloat16_t() {
    uint16_t out = (bits == 0 ? 0x40 : (static_cast<uint16_t>(bits) << 7));
    return as_type<bfloat16_t>(out);
  }

  // Places `bits` in the float exponent field (bit 23 upwards). Code 0 is
  // returned as the subnormal pattern 0x400000 (2^-127).
  operator float() {
    uint32_t out = (bits == 0 ? 0x400000 : (static_cast<uint16_t>(bits) << 23));
    return as_type<float>(out);
  }

  uint8_t bits;
};


================================================
FILE: mlx/backend/metal/kernels/fp_quantized.h
================================================
// Copyright © 2025 Apple Inc.

#include <metal_simdgroup>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/fp4.h"
#include "mlx/backend/metal/kernels/fp8.h"

constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

using namespace metal;

#define MLX_MTL_CONST static constant constexpr const

MLX_MTL_CONST int SIMD_SIZE = 32;
MLX_MTL_CONST int QUAD_SIZE = 4;

// Number of `bits`-wide quantized values that fit in one `wsize`-bit word.
template <int wsize = 8, int bits = 4>
inline constexpr short get_pack_factor() {
  return static_cast<short>(wsize / bits);
}

// Size in bytes of one `wsize`-bit packed word.
template <int wsize = 8>
inline constexpr short get_bytes_per_pack() {
  return static_cast<short>(wsize / 8);
}

// Decodes one encoded scale byte to T. A group size of 16 selects the
// fp8_e4m3 scale encoding; every other group size uses fp8_e8m0.
template <typename T, int group_size>
static inline T dequantize_scale(uint8_t s) {
  if constexpr (group_size == 16) {
    // Use nv scale
    thread fp8_e4m3* e4m3_scale = (thread fp8_e4m3*)(&s);
    return T(*e4m3_scale);
  } else {
    thread fp8_e8m0* e8m0_scale = (thread fp8_e8m0*)(&s);
    return T(*e8m0_scale);
  }
}

// Functor that encodes a float as fp8_e4m3 when `bits` is 8 and as fp4_e2m1
// otherwise, returning the raw encoded byte.
template <int bits>
struct Quantize {
  uint8_t operator()(float x) {
    return (bits == 8) ? fp8_e4m3(x).bits : fp4_e2m1(x).bits;
  }
};

template <int bits, typename U = float>
struct Dequantize {
  U operator()(uint8_t x) {
    if constexpr (bits == 8) {
      return U(*(thread fp8_e4m3*)(&x));
    } else {
      return U(*(thread fp4_e2m1*)(&x));
    }
  }
};

// Copies `values_per_thread` elements from device memory into the per-thread
// array, converting T to U on assignment. No bounds checking is performed.
template <typename T, typename U, int values_per_thread>
inline void load_vector(const device T* x, thread U* x_thread) {
#pragma unroll
  for (int idx = 0; idx != values_per_thread; ++idx) {
    x_thread[idx] = x[idx];
  }
}

// Bounds-aware variant of load_vector: copies the first N elements from
// device memory and zero-fills positions N .. values_per_thread - 1.
template <typename T, typename U, int values_per_thread>
inline void load_vector_safe(const device T* x, thread U* x_thread, int N) {
  int idx = 0;
  while (idx < N) {
    x_thread[idx] = x[idx];
    ++idx;
  }

  idx = N;
  while (idx < values_per_thread) {
    x_thread[idx] = 0;
    ++idx;
  }
}

template <typename U, int values_per_thread, int bits>
inline U qdot(const device uint8_t* w, const thread U* x_thread, U scale) {
  U accum = 0;
  if constexpr (bits == 4) {
    const device uint16_t* ws = (const device uint16_t*)w;
    for (int i = 0; i < (values_per_thread / 4); i++) {
      accum +=
          (x_thread[4 * i] * Dequantize<4>{}(ws[i]) +
           x_thread[4 * i + 1] * Dequantize<4>{}(ws[i] >> 4) +
           x_thread[4 * i + 2] * Dequantize<4>{}(ws[i] >> 8) +
           x_thread[4 * i + 3] * Dequantize<4>{}(ws[i] >> 12));
    }
  } else {
    for (int i = 0; i < values_per_thread; i++) {
      accum += x_thread[i] * Dequantize<8>{}(w[i]);
    }
  }

  return scale * accum;
}

template <typename U, int values_per_thread, int bits>
inline U
qdot_safe(const device uint8_t* w, const thread U* x_thread, U scale, int N) {
  U accum = 0;

  if constexpr (bits == 4) {
    const device uint16_t* ws = (const device uint16_t*)w;
    for (int i = 0; i < (N / 4); i++) {
      accum +=
          (x_thread[4 * i] * Dequantize<4>{}(ws[i]) +
           x_thread[4 * i + 1] * Dequantize<4>{}(ws[i] >> 4) +
           x_thread[4 * i + 2] * Dequantize<4>{}(ws[i] >> 8) +
           x_thread[4 * i + 3] * Dequantize<4>{}(ws[i] >> 12));
    }
  } else {
    for (int i = 0; i < N; i++) {
      accum += x_thread[i] * Dequantize<8>{}(w[i]);
    }
  }
  return scale * accum;
}

template <typename U, int values_per_thread, int bits>
inline void qouter(const thread uint8_t* w, U x, U scale, thread U* result) {
  if constexpr (bits == 4) {
    for (int i = 0; i < (values_per_thread / 2); i++) {
      result[2 * i] += x * scale * Dequantize<4>{}(w[i]);
      result[2 * i + 1] += x * scale * Dequantize<4>{}(w[i] >> 4);
    }
  } else {
    for (int i = 0; i < values_per_thread; i++) {
      result[i] += x * scale * Dequantize<8>{}(w[i]);
    }
  }
}

// Decodes one packed byte into threadgroup memory, multiplying by `scale`.
// A 4-bit byte writes two values (low nibble first); an 8-bit byte writes one.
template <typename U, int bits>
inline void dequantize(uint8_t w, U scale, threadgroup U* w_local) {
  if constexpr (bits == 4) {
    Dequantize<4, U> decode;
    w_local[0] = scale * decode(w);
    w_local[1] = scale * decode(w >> 4);
  } else {
    Dequantize<8, U> decode;
    w_local[0] = scale * decode(w);
  }
}

// Cooperative loader that dequantizes a BROWS x BCOLS tile of packed weights
// into threadgroup memory.
//
// Template parameters:
//   T             element type written to threadgroup memory
//   BROWS, BCOLS  tile shape in dequantized elements
//   dst_ld        leading dimension (in elements) of the destination tile
//   reduction_dim 1: next() advances along a row of the source,
//                 0: next() advances by BROWS rows
//   tgp_size      number of threads sharing the tile
//   group_size    number of weights sharing one scale
//   bits          bits per quantized weight
template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short group_size,
    short bits>
struct QuantizedBlockLoader {
  // Weights per packed byte (8 / bits).
  MLX_MTL_CONST short pack_factor = get_pack_factor<8, bits>();
  // Bytes per pack; get_bytes_per_pack() defaults to wsize = 8, i.e. 1.
  MLX_MTL_CONST short bytes_per_pack = get_bytes_per_pack();
  // Tile width measured in packs.
  MLX_MTL_CONST short BCOLS_PACKED = BCOLS / pack_factor;
  // Packs each thread reads per tile; 1 when the tile has fewer packs than
  // threads (the surplus threads are turned away in load_*).
  MLX_MTL_CONST short n_reads =
      (BCOLS_PACKED * BROWS < tgp_size) ? 1 : (BCOLS_PACKED * BROWS) / tgp_size;
  // Number of tiles that share one scale when a group spans several tiles.
  MLX_MTL_CONST short group_steps = group_size < BCOLS ? 1 : group_size / BCOLS;
  // Number of scales covered by one tile when a tile spans several groups.
  MLX_MTL_CONST short scale_step = group_size < BCOLS ? BCOLS / group_size : 1;

  // Each thread dequantizes all of its packs with a single scale, so its
  // reads must not cross a group boundary.
  static_assert(
      (n_reads * pack_factor) <= group_size,
      "The number of reads per thread must be less than the group size.");

  // Leading dimension of the source matrix, in dequantized elements.
  const int src_ld;
  // Byte offset between consecutive tiles along the iteration direction.
  const int tile_stride;
  // Tiles consumed since the scale pointer last advanced (reduction_dim == 1).
  short group_step_cnt;
  // Scale offset between consecutive tiles when reduction_dim == 0.
  const int group_stride;

  // Linear thread index and the (row, packed column) this thread starts at.
  const short thread_idx;
  const short bi;
  const short bj;

  // Per-thread cursors into the destination tile, packed weights and scales.
  threadgroup T* dst;
  const device uint8_t* src;
  const device uint8_t* scales;

  // Positions the per-thread cursors on the first tile. The thread index is
  // simd_group_id * 32 + simd_lane_id. The initializers rely on members being
  // initialized in declaration order (src_ld, then thread_idx, bi, bj, then
  // the pointers).
  QuantizedBlockLoader(
      const device uint8_t* src_,
      const device uint8_t* scales_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(
            reduction_dim ? BCOLS_PACKED * bytes_per_pack
                          : BROWS * src_ld * bytes_per_pack / pack_factor),
        group_step_cnt(0),
        group_stride(BROWS * src_ld / group_size),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(n_reads * thread_idx / BCOLS_PACKED),
        bj((n_reads * thread_idx) % BCOLS_PACKED),
        dst(dst_ + bi * dst_ld + bj * pack_factor),
        src(src_ + bi * src_ld * bytes_per_pack / pack_factor +
            bj * bytes_per_pack),
        scales(
            scales_ + bi * src_ld / group_size +
            (bj * pack_factor) / group_size) {}

  // Dequantizes this thread's n_reads packs without any bounds checks.
  void load_unsafe() const {
    // Surplus threads (tile smaller than the threadgroup) have no work.
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    T scale = dequantize_scale<T, group_size>(*scales);
    for (int i = 0; i < n_reads; i++) {
      dequantize<T, bits>(
          src[i * bytes_per_pack], scale, dst + i * pack_factor);
    }
  }

  // Like load_unsafe, but writes zeros instead of reading when this thread's
  // row index is at or beyond the bound taken from src_tile_dim.
  void load_safe(short2 src_tile_dim) const {
    // Surplus threads (tile smaller than the threadgroup) have no work.
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    // NOTE(review): fp_qmm_t_impl (reduction_dim == 1) calls
    // load_safe(short2(BK, num_outs)), so .x holds BK and the valid row count
    // is in .y. With BN == BK, bi never reaches .x and this branch does not
    // zero-fill rows past num_outs. Confirm whether .y was intended here.
    if (reduction_dim == 1 && bi >= src_tile_dim.x) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    if (reduction_dim == 0 && bi >= src_tile_dim.y) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    T scale = dequantize_scale<T, group_size>(*scales);
    for (int i = 0; i < n_reads; i++) {
      dequantize<T, bits>(
          src[i * bytes_per_pack], scale, dst + i * pack_factor);
    }
  }

  // Advances the cursors to the next tile.
  void next() {
    src += tile_stride;
    if (reduction_dim == 1) {
      if (group_steps > 1) {
        // A group spans several tiles: move to the next scale only after
        // group_steps tiles have been consumed.
        group_step_cnt++;
        if (group_step_cnt == group_steps) {
          group_step_cnt = 0;
          scales++;
        }
      } else {
        // A tile spans scale_step groups: skip that many scales.
        scales += scale_step;
      }
    } else {
      scales += group_stride;
    }
  }
};

// Quantized matrix-vector product in which each quadgroup (QUAD_SIZE threads)
// cooperates on output rows. D is the compile-time input length handled by
// one quadgroup: every thread loads D / QUAD_SIZE inputs once and reuses them
// for results_per_quadgroup rows spaced quads_per_simd rows apart.
//
//   w       packed weights, addressed here in uint32 packs
//   scales  one encoded scale byte per group_size weights
//   x, y    input vector(s) and output vector(s); tid.x selects the vector
template <typename T, int group_size, int bits, int D>
METAL_FUNC void fp_qmv_quad_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint quad_gid [[quadgroup_index_in_threadgroup]],
    uint quad_lid [[thread_index_in_quadgroup]]) {
  constexpr int quads_per_simd = SIMD_SIZE / QUAD_SIZE;
  constexpr int pack_factor = get_pack_factor<32, bits>();
  constexpr int values_per_thread = D / QUAD_SIZE;
  // A thread's slice is processed one scale group at a time: a single step
  // when the slice is smaller than a group, otherwise one step per group.
  constexpr int steps_per_thread =
      values_per_thread < group_size ? 1 : values_per_thread / group_size;
  constexpr int values_per_step = values_per_thread / steps_per_thread;
  constexpr int packs_per_thread = values_per_thread / pack_factor;
  constexpr int packs_per_step = values_per_step / pack_factor;
  constexpr int results_per_quadgroup = 8;

  // Accumulate in float regardless of T.
  typedef float U;

  thread U x_thread[values_per_thread];
  thread U result[results_per_quadgroup] = {0};

  // Adjust positions
  const int in_vec_size_w = in_vec_size / pack_factor;
  const int in_vec_size_g = in_vec_size / group_size;
  const int out_row = tid.y * quads_per_simd * results_per_quadgroup + quad_gid;

  w += out_row * in_vec_size_w + quad_lid * packs_per_thread;
  scales +=
      out_row * in_vec_size_g + (quad_lid * values_per_thread) / group_size;
  x += tid.x * in_vec_size + quad_lid * values_per_thread;
  y += tid.x * out_vec_size + out_row;

  load_vector<T, U, values_per_thread>(x, x_thread);

  for (int row = 0; row < results_per_quadgroup; row++) {
    // Row `row` of this quadgroup sits row * quads_per_simd rows below out_row.
    auto wl = (const device uint8_t*)(w + row * in_vec_size_w * quads_per_simd);
    const device uint8_t* sl = scales + row * in_vec_size_g * quads_per_simd;
#pragma unroll
    for (int k = 0; k < steps_per_thread; ++k) {
      // The scale is read before the row bound is tested; only the weight
      // reads inside qdot are guarded.
      U s = dequantize_scale<U, group_size>(sl[0]);
      if (row * quads_per_simd + out_row < out_vec_size) {
        result[row] += qdot<U, values_per_step, bits>(
            wl, x_thread + k * values_per_step, s);
      }
      sl++;
      // wl is a byte pointer: step over packs_per_step uint32 packs.
      wl += (sizeof(uint32_t) / sizeof(uint8_t)) * packs_per_step;
    }
  }

  for (int row = 0; row < results_per_quadgroup; row++) {
    // Reduce the partial sums across the quadgroup; lane 0 writes the result.
    result[row] = quad_sum(result[row]);
    if (quad_lid == 0 && row * quads_per_simd + out_row < out_vec_size) {
      y[row * quads_per_simd] = static_cast<T>(result[row]);
    }
  }
}

// Quantized matrix-vector product without bounds checks: no read or write in
// this function is guarded against in_vec_size or out_vec_size. Each
// simdgroup produces results_per_simdgroup consecutive output rows and every
// lane consumes values_per_thread inputs per block of the reduction.
//
//   w       packed weights, walked as bytes
//   scales  one encoded scale byte per group_size weights
//   x, y    input vector(s) and output vector(s); tid.x selects the vector
template <typename T, int group_size, int bits>
METAL_FUNC void fp_qmv_fast_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Accumulate in float regardless of T.
  using U = float;

  constexpr int packs_per_thread = 2;
  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
  constexpr int pack_factor = get_pack_factor<32, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack<32>();
  constexpr int values_per_thread = pack_factor * packs_per_thread;
  constexpr int block_size = values_per_thread * SIMD_SIZE;
  constexpr int scale_step_per_thread = group_size / values_per_thread;

  // Per-block pointer increments, all compile-time constants.
  constexpr int weight_bytes_per_block =
      block_size * bytes_per_pack / pack_factor;
  constexpr int scales_per_block = block_size / group_size;

  thread U x_thread[values_per_thread];
  thread U partial[results_per_simdgroup] = {0};

  // Row strides of the weight bytes and of the scales.
  const int row_bytes = in_vec_size * bytes_per_pack / pack_factor;
  const int row_groups = in_vec_size / group_size;
  const int first_row = tid.y * (num_simdgroups * results_per_simdgroup) +
      simd_gid * results_per_simdgroup;

  // Move every cursor to this lane's first element.
  const device uint8_t* w_bytes = (const device uint8_t*)w;
  w_bytes +=
      first_row * row_bytes + simd_lid * packs_per_thread * bytes_per_pack;
  scales += first_row * row_groups + simd_lid / scale_step_per_thread;
  x += tid.x * in_vec_size + simd_lid * values_per_thread;
  y += tid.x * out_vec_size + first_row;

  for (int k = 0; k < in_vec_size; k += block_size) {
    load_vector<T, U, values_per_thread>(x, x_thread);

    for (int r = 0; r < results_per_simdgroup; ++r) {
      const device uint8_t* row_w = w_bytes + r * row_bytes;
      U row_scale = dequantize_scale<U, group_size>(scales[r * row_groups]);
      partial[r] += qdot<U, values_per_thread, bits>(row_w, x_thread, row_scale);
    }

    w_bytes += weight_bytes_per_block;
    scales += scales_per_block;
    x += block_size;
  }

  // Reduce across the simdgroup; lane 0 writes each row.
  for (int r = 0; r < results_per_simdgroup; ++r) {
    partial[r] = simd_sum(partial[r]);
    if (simd_lid == 0) {
      y[r] = static_cast<T>(partial[r]);
    }
  }
}

// Quantized matrix-vector product with bounds handling. Each simdgroup
// produces results_per_simdgroup output rows and every lane consumes
// values_per_thread inputs per block of the reduction.
//
//   w       packed weights, walked as bytes
//   scales  one encoded scale byte per group_size weights; every scale is
//           decoded with dequantize_scale before it is used
//   x, y    input vector(s) and output vector(s); tid.x selects the vector
//
// Two layouts are handled:
//   * out_vec_size smaller than one tile of rows: every row access is guarded.
//   * otherwise: the last tile is shifted back so that it ends on the final
//     row, recomputing some rows instead of guarding each access.
template <typename T, int group_size, int bits>
METAL_FUNC void fp_qmv_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
  constexpr int packs_per_thread = 1;
  constexpr int pack_factor = get_pack_factor<32, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack<32>();

  constexpr int values_per_thread = pack_factor * packs_per_thread;
  constexpr int block_size = values_per_thread * SIMD_SIZE;
  constexpr int scale_step_per_thread = group_size / values_per_thread;

  const device uint8_t* ws = (const device uint8_t*)w;

  // Accumulate in float regardless of T.
  typedef float U;

  thread U x_thread[values_per_thread];
  thread U result[results_per_simdgroup] = {0};

  // Adjust positions
  const int in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;
  const int in_vec_size_g = in_vec_size / group_size;
  const int out_row = tid.y * (num_simdgroups * results_per_simdgroup) +
      simd_gid * results_per_simdgroup;
  // First row actually processed when the last tile is shifted back.
  const int used_out_row = min(out_vec_size - results_per_simdgroup, out_row);

  if (out_row >= out_vec_size) {
    return;
  }

  // In this case we need to properly guard all our reads because there isn't
  // even 1 tile in the matrix
  if (out_vec_size < (num_simdgroups * results_per_simdgroup)) {
    ws +=
        out_row * in_vec_size_w + simd_lid * packs_per_thread * bytes_per_pack;
    scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;
    x += tid.x * in_vec_size + simd_lid * values_per_thread;
    y += tid.x * out_vec_size + out_row;

    int k = 0;
    for (; k < in_vec_size - block_size; k += block_size) {
      load_vector<T, U, values_per_thread>(x, x_thread);

      for (int row = 0;
           row < results_per_simdgroup && out_row + row < out_vec_size;
           row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device auto* sl = scales + row * in_vec_size_g;

        // Decode the scale byte. Passing the raw uint8_t to qdot would use
        // the encoded value itself as the multiplier.
        U s = dequantize_scale<U, group_size>(sl[0]);
        result[row] += qdot<U, values_per_thread, bits>(wl, x_thread, s);
      }

      ws += block_size * bytes_per_pack / pack_factor;
      scales += block_size / group_size;
      x += block_size;
    }
    // Number of inputs left for this lane in the final, possibly partial,
    // block.
    const int remaining = clamp(
        static_cast<int>(in_vec_size - k - simd_lid * values_per_thread),
        0,
        values_per_thread);
    if (remaining > 0) {
      load_vector_safe<T, U, values_per_thread>(x, x_thread, remaining);

      for (int row = 0;
           row < results_per_simdgroup && out_row + row < out_vec_size;
           row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device auto* sl = scales + row * in_vec_size_g;

        U s = dequantize_scale<U, group_size>(sl[0]);
        // x_thread is zero-filled past `remaining`, so the unguarded qdot
        // adds nothing for those positions.
        result[row] += qdot<U, values_per_thread, bits>(wl, x_thread, s);
      }
    }

    for (int row = 0;
         row < results_per_simdgroup && out_row + row < out_vec_size;
         row++) {
      result[row] = simd_sum(result[row]);
      if (simd_lid == 0) {
        y[row] = static_cast<T>(result[row]);
      }
    }
  }

  // In this case the last tile is moved back to redo some output values
  else {
    ws += used_out_row * in_vec_size_w +
        simd_lid * packs_per_thread * bytes_per_pack;
    scales += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;
    x += tid.x * in_vec_size + simd_lid * values_per_thread;
    y += tid.x * out_vec_size + used_out_row;

    int k = 0;
    for (; k < in_vec_size - block_size; k += block_size) {
      load_vector<T, U, values_per_thread>(x, x_thread);

      for (int row = 0; row < results_per_simdgroup; row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device auto* sl = scales + row * in_vec_size_g;

        U s = dequantize_scale<U, group_size>(sl[0]);
        result[row] += qdot<U, values_per_thread, bits>(wl, x_thread, s);
      }

      ws += block_size * bytes_per_pack / pack_factor;
      scales += block_size / group_size;
      x += block_size;
    }
    // Number of inputs left for this lane in the final, possibly partial,
    // block.
    const int remaining = clamp(
        static_cast<int>(in_vec_size - k - simd_lid * values_per_thread),
        0,
        values_per_thread);
    if (remaining > 0) {
      load_vector_safe<T, U, values_per_thread>(x, x_thread, remaining);

      for (int row = 0; row < results_per_simdgroup; row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device auto* sl = scales + row * in_vec_size_g;

        U s = dequantize_scale<U, group_size>(sl[0]);
        result[row] +=
            qdot_safe<U, values_per_thread, bits>(wl, x_thread, s, remaining);
      }
    }
    for (int row = 0; row < results_per_simdgroup; row++) {
      result[row] = simd_sum(result[row]);
      if (simd_lid == 0) {
        y[row] = static_cast<T>(result[row]);
      }
    }
  }
}

// Quantized vector-matrix product: each simdgroup produces tn * pack_factor
// consecutive output columns. Every lane handles one input row per block of
// SIMD_SIZE rows, accumulating x[row] * scale * w[row, cols] for its columns;
// the lanes are then summed with simd_sum.
//
//   w       packed weights
//   scales  one encoded scale byte per group_size weights
//   x, y    input vector(s) and output vector(s); tid.x selects the vector
template <typename T, const int group_size, int bits>
METAL_FUNC void fp_qvm_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const int in_vec_size,
    const int out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int num_simdgroups = 2;
  constexpr int pack_factor = get_pack_factor<32, bits>();
  // get_bytes_per_pack() uses its default wsize of 8, so this is 1.
  constexpr int bytes_per_pack = get_bytes_per_pack();

  // Number of uint32 packs covering one scale group.
  constexpr int tn = group_size / pack_factor;
  constexpr int block_size = SIMD_SIZE;

  using W_T = uint32_t;
  const device W_T* ws = (const device W_T*)w;

  // Accumulate in float regardless of T.
  typedef float U;
  // Lets one assignment copy a lane's whole weight slice.
  typedef struct {
    W_T wi[tn * bytes_per_pack];
  } vec_w;

  thread vec_w w_local;
  thread U result[tn * pack_factor] = {0};
  thread U scale = 0;
  thread U x_local = 0;

  // Adjust positions
  const int out_vec_size_w = out_vec_size * bytes_per_pack / pack_factor;
  const int out_vec_size_g = out_vec_size / group_size;
  // 32 * (tid.y * 2 + simd_gid)
  int out_col = pack_factor * tn * (tid.y * num_simdgroups + simd_gid);
  ws += out_col * bytes_per_pack / pack_factor + simd_lid * out_vec_size_w;
  scales += out_col / group_size + simd_lid * out_vec_size_g;
  x += tid.x * in_vec_size + simd_lid;
  y += tid.x * out_vec_size + out_col;

  if (out_col >= out_vec_size) {
    return;
  }

  // Loop over in_vec in blocks of block_size
  int remaining = in_vec_size % block_size;
  if (remaining == 0) {
    // in_vec_size is a multiple of the block size: every lane has a row in
    // every block.
    for (int i = 0; i < in_vec_size; i += block_size) {
      x_local = *x;
      scale = dequantize_scale<U, group_size>(*scales);
      w_local = *((device vec_w*)ws);
      qouter<U, tn * pack_factor, bits>(
          (thread uint8_t*)&w_local, x_local, scale, result);

      x += block_size;
      scales += block_size * out_vec_size_g;
      ws += block_size * out_vec_size_w;
    }
  } else {
    // Starting at block_size makes the loop run once per full block and
    // leaves the trailing partial block for the code below.
    for (int i = block_size; i < in_vec_size; i += block_size) {
      x_local = *x;
      scale = dequantize_scale<U, group_size>(*scales);
      w_local = *((device vec_w*)ws);

      qouter<U, tn * pack_factor, bits>(
          (thread uint8_t*)&w_local, x_local, scale, result);

      x += block_size;
      scales += block_size * out_vec_size_g;
      ws += block_size * out_vec_size_w;
    }
    // Partial block: only the first `remaining` lanes have a valid row.
    if (static_cast<int>(simd_lid) < remaining) {
      x_local = *x;
      scale = dequantize_scale<U, group_size>(*scales);
      w_local = *((device vec_w*)ws);
    } else {
      // w_local keeps its previous contents here; x_local and scale are both
      // zero for the multiply below.
      x_local = 0;
      scale = 0;
    }
    qouter<U, tn * pack_factor, bits>(
        (thread uint8_t*)&w_local, x_local, scale, result);
  }

// Accumulate in the simdgroup
#pragma clang loop unroll(full)
  for (int k = 0; k < tn * pack_factor; k++) {
    result[k] = simd_sum(result[k]);
  }

  // Store the result
  if (simd_lid == 0) {
#pragma clang loop unroll(full)
    for (int k = 0; k < tn * pack_factor; k++) {
      y[k] = static_cast<T>(result[k]);
    }
  }
}

// Tiled quantized matrix multiply for weights indexed as (N, K): the weight
// pointer is offset by y_col * K_w and the MMA is instantiated with its second
// operand transposed. Each threadgroup computes one BM x BN tile of y,
// iterating over K in steps of BK. Per step it loads an x tile and a
// dequantized w tile into threadgroup memory and multiply-accumulates them.
//
//   w, scales  packed weights and their per-group encoded scales
//   x, y       input (M x K) and output (M x N) matrices
//   Xs, Ws     threadgroup scratch for the x and w tiles
//   aligned_N  when true, the w tile is always loaded without bounds checks
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
METAL_FUNC void fp_qmm_t_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    threadgroup T* Xs,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  (void)lid;

  // 2 x 2 simdgroups per threadgroup.
  constexpr int WM = 2;
  constexpr int WN = 2;
  constexpr int pack_factor = get_pack_factor<8, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack();

  // Leading dimension of the threadgroup tiles: BK plus 16 bytes of padding.
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  // Instantiate the appropriate BlockMMA and Loader
  using mma_t = mlx::steel::
      BlockMMA<T, T, BM, BN, BK, WM, WN, false, true, BK_padded, BK_padded>;
  using loader_x_t =
      mlx::steel::BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE>;
  using loader_w_t = QuantizedBlockLoader<
      T,
      BN,
      BK,
      BK_padded,
      1,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // Set the block
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  auto wl = (const device uint8_t*)w;

  // The x and y offsets are widened to 64 bits before multiplying.
  x += y_row * static_cast<int64_t>(K);
  wl += y_col * K_w;
  scales += y_col * K_g;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Make the x loader and mma operation
  // Valid rows / columns in this tile; smaller than BM / BN only on the edge.
  const short num_els = min(BM, M - y_row);
  const short num_outs = min(BN, N - y_col);
  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);
  loader_w_t loader_w(wl, scales, K, Ws, simd_gid, simd_lid);
  mma_t mma_op(simd_gid, simd_lid);

  // Four variants of the same K loop, differing only in whether the x tile
  // and/or the w tile use the bounds-checked load. Each iteration is:
  // barrier, load, barrier, mma, advance loaders.
  if (num_els < BM) {
    if (!aligned_N && num_outs < BN) {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  } else {
    if (!aligned_N && num_outs < BN) {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);

        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  }

  // Store results to device memory
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Edge tiles use the bounds-checked store.
  if (num_els < BM || num_outs < BN) {
    mma_op.store_result_safe(y, N, short2(num_outs, num_els));
  } else {
    mma_op.store_result(y, N);
  }
}

// Threadgroup-level body of the "n" quantized matmul variant: the weight tile
// is staged as BK x BN elements and the weight loader is given N as its
// source stride.
//
// The threadgroup at (tid.x, tid.y) produces the BM x BN tile of y whose
// origin is (row tid.y * BM, column tid.x * BN). x is advanced by y_row rows
// of length K, the packed weights by y_col * bytes_per_pack / pack_factor
// bytes and the scales by y_col / group_size entries. K is then walked in
// steps of BK through the caller-provided threadgroup buffers Xs and Ws.
//
// Rows are bounds-checked against M (load_safe / store_result_safe); columns
// are always handled as a full BN-wide tile.
// NOTE(review): presumably the host only dispatches this kernel when N is a
// multiple of BN -- confirm against the dispatch code.
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
METAL_FUNC void fp_qmm_n_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    threadgroup T* Xs,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  // lid is part of the signature but not used here.
  (void)lid;

  // WM x WN is the simdgroup arrangement handed to BlockMMA; both loaders are
  // instantiated for WM * WN * SIMD_SIZE threads.
  constexpr int WM = 2;
  constexpr int WN = 2;
  constexpr int pack_factor = get_pack_factor<8, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack();

  // Leading dimensions of the threadgroup tiles, each padded by 16 bytes
  // worth of T elements.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  // Instantiate the appropriate BlockMMA and Loader
  using mma_t = mlx::steel::
      BlockMMA<T, T, BM, BN, BK, WM, WN, false, false, BK_padded, BN_padded>;
  using loader_x_t = mlx::steel::
      BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE, 1, 4>;
  using loader_w_t = QuantizedBlockLoader<
      T,
      BK,
      BN,
      BN_padded,
      0,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // The packed weights are addressed byte-wise from here on.
  auto wl = (const device uint8_t*)w;

  // Set the block
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;
  x += y_row * static_cast<int64_t>(K);
  wl += y_col * bytes_per_pack / pack_factor;
  scales += y_col / group_size;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Make the x loader and mma operation
  // num_els is the number of valid rows in this tile (less than BM only for
  // the last row-block when M is not a multiple of BM).
  const short num_els = min(BM, M - y_row);
  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);
  loader_w_t loader_w(wl, scales, N, Ws, simd_gid, simd_lid);
  mma_t mma_op(simd_gid, simd_lid);

  // The four cases below differ only in which loads are bounds-checked; the
  // row and K-alignment tests are kept outside the K loops.
  if (num_els < BM) {
    // Partial row tile: x loads are always bounds-checked on rows.
    if ((K % BK) != 0) {
      // Full BK-sized steps first ...
      const int k_blocks = K / BK;
      for (int k = 0; k < k_blocks; k++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
      // ... then one final step restricted to the num_k leftover K entries.
      const short num_k = K - k_blocks * BK;
      threadgroup_barrier(mem_flags::mem_threadgroup);
      loader_x.load_safe(short2(num_k, num_els));
      loader_w.load_safe(short2(BN, num_k));
      threadgroup_barrier(mem_flags::mem_threadgroup);
      mma_op.mma(Xs, Ws);
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  } else {
    // Full row tile: only the K tail (if any) needs bounds-checked loads.
    if ((K % BK) != 0) {
      const int k_blocks = K / BK;
      for (int k = 0; k < k_blocks; k++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
      const short num_k = K - k_blocks * BK;
      threadgroup_barrier(mem_flags::mem_threadgroup);
      loader_x.load_safe(short2(num_k, BM));
      loader_w.load_safe(short2(BN, num_k));
      threadgroup_barrier(mem_flags::mem_threadgroup);
      mma_op.mma(Xs, Ws);
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  }

  // Store results to device memory
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (num_els < BM) {
    // Only the first num_els rows of the tile are written.
    mma_op.store_result_safe(y, N, short2(BN, num_els));
  } else {
    mma_op.store_result(y, N);
  }
}

// Advance x, w, scales and y (all passed by reference) to the batch element
// selected by tid.z.
//
//   x              moved by x_idx * x_strides[0] when there is a single batch
//                  dimension, otherwise by elem_to_loc over x_shape/x_strides.
//   w, scales      moved the same way using w_strides / s_strides; with more
//                  than one batch dimension both offsets come from a single
//                  elem_to_loc_broadcast call over w_shape.
//   y              moved by tid.z * output_stride elements.
//   output_stride  distance, in elements of T, between consecutive batch
//                  elements of y.
template <typename T>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device uint8_t*& scales,
    device T*& y,
    int output_stride,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Set the input/output matrices
  uint32_t x_idx = tid.z;
  uint32_t w_idx = tid.z;
  if (x_batch_ndims == 1) {
    x += x_idx * x_strides[0];
  } else {
    x += elem_to_loc(x_idx, x_shape, x_strides, x_batch_ndims);
  }
  if (w_batch_ndims == 1) {
    w += w_idx * w_strides[0];
    scales += w_idx * s_strides[0];
  } else {
    ulong2 idx = elem_to_loc_broadcast(
        w_idx, w_shape, w_strides, s_strides, w_batch_ndims);
    w += idx.x;
    scales += idx.y;
  }
  // Widen before multiplying: tid.z (uint) * output_stride (int) would
  // otherwise be evaluated in 32 bits and wrap once the output offset reaches
  // 2^32 elements.
  y += static_cast<int64_t>(tid.z) * output_stride;
}

// Gather variant: advance x, w, scales and y (all passed by reference) for
// the output batch element tid.z, where the x and w batch elements are looked
// up through lhs_indices / rhs_indices.
//
//   lhs_indices / rhs_indices  read at tid.z * {lhs,rhs}_strides[0] when
//                              batch_ndims == 1, otherwise at the two
//                              locations returned by elem_to_loc_broadcast
//                              over batch_shape.
//   x                          moved using the looked-up x index and
//                              x_shape / x_strides.
//   w, scales                  moved using the looked-up w index and
//                              w_shape / w_strides / s_strides.
//   y                          moved by tid.z * output_stride elements.
template <typename T>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device uint8_t*& scales,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T*& y,
    int output_stride,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Set the input/output matrices
  uint32_t x_idx;
  uint32_t w_idx;
  if (batch_ndims == 1) {
    x_idx = lhs_indices[tid.z * lhs_strides[0]];
    w_idx = rhs_indices[tid.z * rhs_strides[0]];
  } else {
    ulong2 idx = elem_to_loc_broadcast(
        tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);
    x_idx = lhs_indices[idx.x];
    w_idx = rhs_indices[idx.y];
  }
  if (x_batch_ndims == 1) {
    x += x_idx * x_strides[0];
  } else {
    x += elem_to_loc(x_idx, x_shape, x_strides, x_batch_ndims);
  }
  if (w_batch_ndims == 1) {
    w += w_idx * w_strides[0];
    scales += w_idx * s_strides[0];
  } else {
    ulong2 idx = elem_to_loc_broadcast(
        w_idx, w_shape, w_strides, s_strides, w_batch_ndims);
    w += idx.x;
    scales += idx.y;
  }
  // Widen before multiplying: tid.z (uint) * output_stride (int) would
  // otherwise be evaluated in 32 bits and wrap once the output offset reaches
  // 2^32 elements.
  y += static_cast<int64_t>(tid.z) * output_stride;
}

// Kernel entry point that forwards to fp_qmv_quad_impl.
//
// With `batched` set, x / w / scales / y are first moved to the batch element
// selected by tid.z; otherwise the pointers are used as passed.
template <typename T, int group_size, int bits, int D, bool batched>
[[kernel]] void fp_qmv_quad(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint quad_gid [[quadgroup_index_in_threadgroup]],
    uint quad_lid [[thread_index_in_quadgroup]]) {
  if (batched) {
    // Batch elements of y are spaced out_vec_size * rows apart, where rows is
    // the x_shape entry that follows the batch dimensions.
    const int rows = x_shape[x_batch_ndims];
    const int y_batch_stride = out_vec_size * rows;
    adjust_matrix_offsets(
        x, w, scales, y, y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides, s_strides,
        tid);
  }

  fp_qmv_quad_impl<T, group_size, bits, D>(
      w, scales, x, y, in_vec_size, out_vec_size, tid, quad_gid, quad_lid);
}

// Kernel entry point that forwards to fp_qmv_fast_impl.
//
// With `batched` set, x / w / scales / y are first moved to the batch element
// selected by tid.z; otherwise the pointers are used as passed.
template <typename T, int group_size, int bits, bool batched>
[[kernel]] void fp_qmv_fast(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  if (batched) {
    // Batch elements of y are spaced out_vec_size * rows apart, where rows is
    // the x_shape entry that follows the batch dimensions.
    const int rows = x_shape[x_batch_ndims];
    const int y_batch_stride = out_vec_size * rows;
    adjust_matrix_offsets(
        x, w, scales, y, y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides, s_strides,
        tid);
  }

  fp_qmv_fast_impl<T, group_size, bits>(
      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
}

// Kernel entry point that forwards to fp_qmv_impl.
//
// With `batched` set, x / w / scales / y are first moved to the batch element
// selected by tid.z; otherwise the pointers are used as passed.
template <typename T, const int group_size, int bits, bool batched>
[[kernel]] void fp_qmv(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  if (batched) {
    // Batch elements of y are spaced out_vec_size * rows apart, where rows is
    // the x_shape entry that follows the batch dimensions.
    const int rows = x_shape[x_batch_ndims];
    const int y_batch_stride = out_vec_size * rows;
    adjust_matrix_offsets(
        x, w, scales, y, y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides, s_strides,
        tid);
  }

  fp_qmv_impl<T, group_size, bits>(
      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
}

// Kernel entry point that forwards to fp_qvm_impl.
//
// With `batched` set, x / w / scales / y are first moved to the batch element
// selected by tid.z; otherwise the pointers are used as passed.
template <typename T, const int group_size, int bits, bool batched>
[[kernel]] void fp_qvm(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  if (batched) {
    // Batch elements of y are spaced out_vec_size * rows apart, where rows is
    // the x_shape entry that follows the batch dimensions.
    const int rows = x_shape[x_batch_ndims];
    const int y_batch_stride = out_vec_size * rows;
    adjust_matrix_offsets(
        x, w, scales, y, y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides, s_strides,
        tid);
  }

  fp_qvm_impl<T, group_size, bits>(
      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
}

// Split-K kernel entry point that forwards to fp_qvm_impl.
//
// The pointers are always moved to the element selected by tid.z. The input
// length handed to fp_qvm_impl is in_vec_size, except for every split_k-th
// tid.z (tid.z % split_k == split_k - 1), which uses final_block_size.
template <typename T, const int group_size, int bits, int split_k = 32>
[[kernel]] void fp_qvm_split_k(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& final_block_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Elements of y for consecutive tid.z are spaced out_vec_size * rows apart,
  // where rows is the x_shape entry that follows the batch dimensions.
  const int rows = x_shape[x_batch_ndims];
  const int y_batch_stride = out_vec_size * rows;
  adjust_matrix_offsets(
      x, w, scales, y, y_batch_stride,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides, s_strides,
      tid);

  // When (in_vec_size % split_k != 0) the final block needs to be smaller
  int in_vec_size_adj = in_vec_size;
  if (tid.z % split_k == split_k - 1) {
    in_vec_size_adj = final_block_size;
  }

  fp_qvm_impl<T, group_size, bits>(
      w, scales, x, y, in_vec_size_adj, out_vec_size, tid, simd_gid, simd_lid);
}

// Kernel entry point that forwards to fp_qmm_t_impl.
//
// Declares the two threadgroup staging tiles (BM and BN rows of padded BK
// elements) and, when `batched` is set, moves x / w / scales / y to the batch
// element selected by tid.z before running the tile computation.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const bool batched,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void fp_qmm_t(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Row length of both tiles: BK plus 16 bytes worth of T elements.
  constexpr int padded_bk = (BK + 16 / sizeof(T));

  threadgroup T Xs[BM * padded_bk];
  threadgroup T Ws[BN * padded_bk];

  if (batched) {
    // One output matrix of M * N elements per batch element.
    const int y_batch_stride = M * N;
    adjust_matrix_offsets(
        x, w, scales, y, y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides, s_strides,
        tid);
  }

  fp_qmm_t_impl<T, group_size, bits, aligned_N, BM, BK, BN>(
      w, scales, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Kernel entry point that forwards to fp_qmm_n_impl.
//
// Declares the two threadgroup staging tiles (BM rows of padded BK elements
// for x, BK rows of padded BN elements for the weights) and, when `batched`
// is set, moves x / w / scales / y to the batch element selected by tid.z.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool batched,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void fp_qmm_n(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Tile row lengths: BK / BN plus 16 bytes worth of T elements.
  constexpr int padded_bk = (BK + 16 / sizeof(T));
  constexpr int padded_bn = (BN + 16 / sizeof(T));

  threadgroup T Xs[BM * padded_bk];
  threadgroup T Ws[BK * padded_bn];

  if (batched) {
    // One output matrix of M * N elements per batch element.
    const int y_batch_stride = M * N;
    adjust_matrix_offsets(
        x, w, scales, y, y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides, s_strides,
        tid);
  }

  fp_qmm_n_impl<T, group_size, bits, BM, BK, BN>(
      w, scales, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Gather kernel entry point that forwards to fp_qmv_fast_impl.
//
// The x and w batch elements for output element tid.z are looked up through
// lhs_indices / rhs_indices by adjust_matrix_offsets before the computation.
template <typename T, int group_size, int bits>
[[kernel]] void fp_gather_qmv_fast(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Output elements are spaced out_vec_size * rows apart, where rows is the
  // x_shape entry that follows the batch dimensions.
  const int rows = x_shape[x_batch_ndims];
  const int y_batch_stride = out_vec_size * rows;
  adjust_matrix_offsets(
      x, w, scales, lhs_indices, rhs_indices, y, y_batch_stride,
      batch_ndims, batch_shape, lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides, s_strides,
      tid);

  fp_qmv_fast_impl<T, group_size, bits>(
      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
}

// Gather kernel entry point that forwards to fp_qmv_impl.
//
// The x and w batch elements for output element tid.z are looked up through
// lhs_indices / rhs_indices by adjust_matrix_offsets before the computation.
template <typename T, int group_size, int bits>
[[kernel]] void fp_gather_qmv(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Output elements are spaced out_vec_size * rows apart, where rows is the
  // x_shape entry that follows the batch dimensions.
  const int rows = x_shape[x_batch_ndims];
  const int y_batch_stride = out_vec_size * rows;
  adjust_matrix_offsets(
      x, w, scales, lhs_indices, rhs_indices, y, y_batch_stride,
      batch_ndims, batch_shape, lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides, s_strides,
      tid);

  fp_qmv_impl<T, group_size, bits>(
      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
}

// Gather kernel entry point that forwards to fp_qvm_impl.
//
// The x and w batch elements for output element tid.z are looked up through
// lhs_indices / rhs_indices by adjust_matrix_offsets before the computation.
template <typename T, int group_size, int bits>
[[kernel]] void fp_gather_qvm(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Output elements are spaced out_vec_size * rows apart, where rows is the
  // x_shape entry that follows the batch dimensions.
  const int rows = x_shape[x_batch_ndims];
  const int y_batch_stride = out_vec_size * rows;
  adjust_matrix_offsets(
      x, w, scales, lhs_indices, rhs_indices, y, y_batch_stride,
      batch_ndims, batch_shape, lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides, s_strides,
      tid);

  fp_qvm_impl<T, group_size, bits>(
      w, scales, x, y, in_vec_size, out_vec_size, tid, simd_gid, simd_lid);
}

// Gather kernel entry point that forwards to fp_qmm_t_impl.
//
// Declares the two threadgroup staging tiles (BM and BN rows of padded BK
// elements), resolves the x / w batch elements for output element tid.z
// through lhs_indices / rhs_indices, then runs the tile computation.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void fp_gather_qmm_t(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Row length of both tiles: BK plus 16 bytes worth of T elements.
  constexpr int padded_bk = (BK + 16 / sizeof(T));

  threadgroup T Xs[BM * padded_bk];
  threadgroup T Ws[BN * padded_bk];

  // One output matrix of M * N elements per output batch element.
  const int y_batch_stride = M * N;
  adjust_matrix_offsets(
      x, w, scales, lhs_indices, rhs_indices, y, y_batch_stride,
      batch_ndims, batch_shape, lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides, s_strides,
      tid);

  fp_qmm_t_impl<T, group_size, bits, aligned_N, BM, BK, BN>(
      w, scales, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Gather kernel entry point that forwards to fp_qmm_n_impl.
//
// Declares the two threadgroup staging tiles (BM rows of padded BK elements
// for x, BK rows of padded BN elements for the weights), resolves the x / w
// batch elements for output element tid.z through lhs_indices / rhs_indices,
// then runs the tile computation.
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void fp_gather_qmm_n(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Tile row lengths: BK / BN plus 16 bytes worth of T elements.
  constexpr int padded_bk = (BK + 16 / sizeof(T));
  constexpr int padded_bn = (BN + 16 / sizeof(T));

  threadgroup T Xs[BM * padded_bk];
  threadgroup T Ws[BK * padded_bn];

  // One output matrix of M * N elements per output batch element.
  const int y_batch_stride = M * N;
  adjust_matrix_offsets(
      x, w, scales, lhs_indices, rhs_indices, y, y_batch_stride,
      batch_ndims, batch_shape, lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides, s_strides,
      tid);

  fp_qmm_n_impl<T, group_size, bits, BM, BK, BN>(
      w, scales, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Quantized matmul in which each row of x selects its own weight matrix
// through `indices`.
//
// The threadgroup at (tid.x, tid.y) owns the BM x BN tile of y whose origin
// is (row tid.y * BM, column tid.x * BN). The rows of the tile are split into
// runs of consecutive rows that share the same entry of `indices`. For every
// run a full tile product is computed against the weight matrix
// `wl + index * stride_w` (scales at `scales + index * stride_s`), and only
// the rows [offset, offset_next) of that run are written to y.
//
// `transpose` selects which weight layout is used: it swaps the loader tile
// dimensions and the source stride (K when set, N otherwise).
//
// align_M, align_N and align_K are not declared in this block; presumably
// they are function constants defined elsewhere in the file -- confirm.
template <
    typename T,
    int group_size,
    int bits,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose>
[[kernel]] void fp_gather_qmm_rhs(
    const device T* x,
    const device uint32_t* w,
    const device uint8_t* scales,
    const device uint32_t* indices,
    device T* y,
    const constant int& M,
    const constant int& N,
    const constant int& K,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]]) {
  constexpr int pack_factor = get_pack_factor<8, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack();
  // Threadgroup tile row lengths, padded by 16 bytes worth of T elements.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  using mma_t = mlx::steel::BlockMMA<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      false,
      transpose,
      BK_padded,
      transpose ? BK_padded : BN_padded>;
  using loader_x_t =
      mlx::steel::BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE>;
  using loader_w_t = QuantizedBlockLoader<
      T,
      transpose ? BN : BK,
      transpose ? BK : BN,
      transpose ? BK_padded : BN_padded,
      transpose,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  threadgroup T Xs[BM * BK_padded];
  threadgroup T Ws[transpose ? BN * BK_padded : BK * BN_padded];

  // Compute the block
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int N_w = N * bytes_per_pack / pack_factor;
  const int N_g = N / group_size;
  const int K_it = K / BK;
  // Per-matrix strides (bytes of packed weights / number of scales). The
  // first operand is widened so the product is formed in 64 bits instead of
  // overflowing as int * int before the conversion to size_t.
  const size_t stride_w = transpose ? size_t(N) * K_w : size_t(K) * N_w;
  const size_t stride_s = transpose ? size_t(N) * K_g : size_t(K) * N_g;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;
  const size_t y_row_long = size_t(y_row);
  const size_t y_col_long = size_t(y_col);

  // Prepare threadgroup bounds
  const short tgp_bm = align_M ? BM : short(min(BM, M - y_row));
  const short tgp_bn = align_N ? BN : short(min(BN, N - y_col));

  // Calculate the final tiles in the case that K is not aligned
  const int k_remain = K - K_it * BK;
  const short2 tile_x = short2(k_remain, tgp_bm);
  const short2 tile_w =
      transpose ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

  // Move x and output to the correct block
  auto wl = (const device uint8_t*)w;
  x += y_row_long * K;
  y += y_row_long * N + y_col_long;
  wl += transpose ? y_col_long * K_w : y_col * bytes_per_pack / pack_factor;
  scales += transpose ? y_col_long * K_g : y_col / group_size;

  // Do as many matmuls as necessary
  // Each pass handles one run of rows [offset, offset_next) whose entries in
  // `indices` are equal; index_next / offset_next carry the start of the
  // following run into the next pass.
  uint32_t index;
  short offset;
  uint32_t index_next = indices[y_row];
  short offset_next = 0;
  int n = 0;
  while (n < tgp_bm) {
    n++;
    offset = offset_next;
    index = index_next;
    offset_next = tgp_bm;
    // Scan forward until the index changes or the tile ends.
    for (; n < tgp_bm; n++) {
      if (indices[y_row + n] != index) {
        offset_next = n;
        index_next = indices[y_row + n];
        break;
      }
    }
    threadgroup_barrier(mem_flags::mem_none);

    // Prepare threadgroup mma operation
    thread mma_t mma_op(simd_group_id, simd_lane_id);

    // Prepare threadgroup loading operations
    thread loader_x_t loader_x(x, K, Xs, simd_group_id, simd_lane_id);
    thread loader_w_t loader_w(
        wl + index * stride_w,
        scales + index * stride_s,
        transpose ? K : N,
        Ws,
        simd_group_id,
        simd_lane_id);

    // Matrices are all aligned check nothing
    if (align_M && align_N) {
      gemm_loop_aligned(Xs, Ws, mma_op, loader_x, loader_w, K_it);
      if (!align_K) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        gemm_loop_finalize(Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
      }

      // Store results to device memory
      if (offset_next - offset == BM) {
        mma_op.store_result(y, N);
      } else {
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(BN, offset_next));
      }
    } else {
      // Tile aligned so check outside of the hot loop
      if ((align_M || tgp_bm == BM) && (align_N || tgp_bn == BN)) {
        gemm_loop_aligned(Xs, Ws, mma_op, loader_x, loader_w, K_it);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }

        // Store results to device memory
        if (offset_next - offset == BM) {
          mma_op.store_result(y, N);
        } else {
          mma_op.store_result_slice(
              y, N, short2(0, offset), short2(BN, offset_next));
        }
      }

      // Tile partially aligned check rows
      else if (align_N || tgp_bn == BN) {
        gemm_loop_unaligned<false, true, transpose>(
            Xs, Ws, mma_op, loader_x, loader_w, K_it, tgp_bm, tgp_bn, BK);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(BN, offset_next));
      }

      // Tile partially aligned check cols
      else if (align_M || tgp_bm == BM) {
        gemm_loop_unaligned<true, false, transpose>(
            Xs, Ws, mma_op, loader_x, loader_w, K_it, tgp_bm, tgp_bn, BK);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(tgp_bn, offset_next));
      }

      // Nothing aligned so check both rows and cols
      else {
        gemm_loop_unaligned<false, false, transpose>(
            Xs, Ws, mma_op, loader_x, loader_w, K_it, tgp_bm, tgp_bn, BK);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(tgp_bn, offset_next));
      }
    }
  }
}

// Quantize w element-wise: one thread per input element.
//
// Each thread reads w[index], derives a per-group scale from the maximum
// absolute value found by simd_max, stores the encoded scale once per group
// (by the thread with index % group_size == 0) and writes the quantized value
// to `out`. With bits == 4 two neighbouring values share one output byte.
//
// The scale is stored as fp8_e8m0 when group_size == 32 and as fp8_e4m3
// otherwise.
template <typename T, const int group_size, const int bits>
[[kernel]] void fp_quantize(
    const device T* w [[buffer(0)]],
    device uint8_t* out [[buffer(1)]],
    device uint8_t* scales [[buffer(2)]],
    uint2 tidx [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  constexpr bool use_mx_scale = group_size == 32;
  // Linear element index over the 2D grid.
  size_t index = tidx.x + grid_dim.x * size_t(tidx.y);

  float scale;
  float w_thread = w[index];
  if (use_mx_scale) {
    // One group per simdgroup: reduce over all of its threads.
    scale = simd_max(abs(w_thread));
  } else {
    // Two reductions, each with the other half's contribution zeroed out; a
    // thread then keeps the result for its own half.
    // NOTE(review): the halves are selected with the grid position tidx.x,
    // not the lane index within the simdgroup; this only separates the two
    // halves of a simdgroup if tidx.x stays below 32 -- confirm against the
    // dispatch code.
    float w_max_l = simd_max(tidx.x < 16 ? abs(w_thread) : 0.0);
    float w_max_r = simd_max(tidx.x >= 16 ? abs(w_thread) : 0.0);
    scale = tidx.x < 16 ? w_max_l : w_max_r;
  }
  // Normalise by 6 for 4-bit and 448 for 8-bit output (presumably the
  // largest magnitudes representable by the respective element formats).
  scale /= bits == 4 ? 6.0f : 448.0f;

  using ScaleType = metal::conditional_t<use_mx_scale, fp8_e8m0, fp8_e4m3>;
  auto s = ScaleType(scale);
  uint8_t q_scale = s.bits;
  // Continue with the scale as it reads back after encoding, so that the
  // values are quantized against the stored scale.
  scale = float(s);

  size_t gindex = index / group_size;
  if (index % group_size == 0) {
    scales[gindex] = q_scale;
  }

  // A zero scale maps the element to 0 instead of dividing by zero.
  uint8_t output = Quantize<bits>{}(scale == 0 ? 0.0f : w_thread / scale);
  if (bits == 4) {
    // Fetch the next lane's code and place it in the high nibble.
    uint8_t sval = simd_shuffle_down(output, 1);
    output |= sval << bits;
  }
  constexpr int pack_factor = bits == 8 ? 1 : 2;
  // Only the first thread of each pack writes the output byte.
  if (index % pack_factor == 0) {
    out[index / pack_factor] = output;
  }
}

// Dequantize packed values: one thread per input byte of w.
//
// A byte holds one 8-bit code or two 4-bit codes (low nibble first). Each
// code is decoded with Dequantize<bits>, multiplied by the scale of the group
// its first output element falls in, and written to consecutive elements of
// `out`. Scales are read as fp8_e8m0 when group_size == 32 and as fp8_e4m3
// otherwise.
template <typename T, const int group_size, const int bits>
[[kernel]] void fp_dequantize(
    const device uint8_t* w [[buffer(0)]],
    const device uint8_t* scales [[buffer(1)]],
    device T* out [[buffer(3)]],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  using ScaleType =
      metal::conditional_t<group_size == 32, fp8_e8m0, fp8_e4m3>;
  constexpr int vals_per_byte = bits == 8 ? 1 : 2;

  // Linear byte index over the 2D grid and the matching output position.
  size_t byte_idx = index.x + grid_dim.x * size_t(index.y);
  size_t first_out = byte_idx * vals_per_byte;
  size_t group_idx = first_out / group_size;

  device T* dst = out + first_out;

  // Copy the encoded scale into a local before converting it to float.
  auto stored_scale = ((device ScaleType*)(scales))[group_idx];
  float scale = float(stored_scale);

  uint packed = w[byte_idx];
#pragma clang loop unroll(full)
  for (int i = 0; i < vals_per_byte; i++) {
    uint8_t code;
    if (bits == 4) {
      code = (packed >> (bits * i)) & 0x0f;
    } else if (bits == 8) {
      code = packed;
    }
    dst[i] = static_cast<T>(scale * Dequantize<bits>{}(code));
  }
}

// Round-trip w through the quantized representation: one thread per element.
//
// Each thread derives the group scale as fp_quantize does, quantizes its
// element with Quantize<bits>, immediately decodes it with Dequantize<bits>
// and writes scale * decoded value to out[index]. Neither the codes nor the
// scales are stored.
template <typename T, const int group_size, const int bits>
[[kernel]] void fp_quantize_dequantize(
    const device T* w [[buffer(0)]],
    device T* out [[buffer(1)]],
    uint2 tidx [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  constexpr bool use_mx_scale = group_size == 32;
  // Linear element index over the 2D grid.
  size_t index = tidx.x + grid_dim.x * size_t(tidx.y);

  float scale;
  float w_thread = w[index];
  if (use_mx_scale) {
    // One group per simdgroup: reduce over all of its threads.
    scale = simd_max(abs(w_thread));
  } else {
    // Two reductions, each with the other half's contribution zeroed out; a
    // thread then keeps the result for its own half.
    // NOTE(review): the halves are selected with the grid position tidx.x,
    // not the lane index within the simdgroup; this only separates the two
    // halves of a simdgroup if tidx.x stays below 32 -- confirm against the
    // dispatch code.
    float w_max_l = simd_max(tidx.x < 16 ? abs(w_thread) : 0.0);
    float w_max_r = simd_max(tidx.x >= 16 ? abs(w_thread) : 0.0);
    scale = tidx.x < 16 ? w_max_l : w_max_r;
  }
  // Normalise by 6 for 4-bit and 448 for 8-bit output (presumably the
  // largest magnitudes representable by the respective element formats).
  scale /= bits == 4 ? 6.0f : 448.0f;

  using ScaleType = metal::conditional_t<use_mx_scale, fp8_e8m0, fp8_e4m3>;
  auto s = ScaleType(scale);
  // Use the scale as it reads back after encoding.
  scale = float(s);

  // A zero scale maps the element to 0 instead of dividing by zero.
  uint8_t output = Quantize<bits>{}(scale == 0 ? 0.0f : w_thread / scale);

  out[index] = static_cast<T>(scale * Dequantize<bits>{}(output));
}


================================================
FILE: mlx/backend/metal/kernels/fp_quantized.metal
================================================
// Copyright © 2025 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/quantized_utils.h"
#include "mlx/backend/metal/kernels/fp_quantized.h"

// Each macro below forwards to instantiate_kernel with (1) a name string
// assembled from the stringified macro arguments, (2) the kernel template to
// instantiate and (3) the template arguments, in that order.

// Plain kernel: fp_<name><type, group_size, bits>, named
// "<mode>_<name>_<type>_gs_<group_size>_b_<bits>".
#define instantiate_quantized(mode, name, type, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits, \
      fp_ ## name, \
      type, \
      group_size,   \
      bits)

// As above with a trailing `batched` template argument and a
// "_batch_<batched>" name suffix.
#define instantiate_quantized_batched(mode, name, type, batched, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_batch_" #batched, \
      fp_ ## name,    \
      type,    \
      group_size,      \
      bits,       \
      batched)

// As instantiate_quantized with a trailing `aligned` template argument and an
// "_alN_<aligned>" name suffix.
#define instantiate_quantized_aligned(mode, name, type, aligned, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_alN_" #aligned, \
      fp_ ## name,    \
      type,    \
      group_size,      \
      bits,       \
      aligned)

// Combines the two previous variants: template arguments end with
// (aligned, batched) and the name ends with "_alN_<aligned>_batch_<batched>".
#define instantiate_quantized_aligned_batched(mode, name, type, aligned, batched, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_alN_" #aligned "_batch_" #batched, \
      fp_ ## name,    \
      type,    \
      group_size,      \
      bits,       \
      aligned, \
      batched)

// Variant carrying an extra integer `D` template argument ("_d_<D>" in the
// name) followed by `batched`.
#define instantiate_quantized_quad(mode, name, type, D, batched, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_d_" #D "_batch_" #batched, \
      fp_ ## name,    \
      type,    \
      group_size,      \
      bits,       \
      D,       \
      batched)

// Variant carrying a trailing `split_k` template argument ("_spk_<split_k>"
// in the name).
#define instantiate_quantized_split_k(mode, name, type, split_k, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_spk_" #split_k, \
      fp_ ## name,    \
      type,    \
      group_size,      \
      bits,       \
      split_k)

// Unlike the macros above, `func` is passed through verbatim (no fp_ prefix
// is pasted on). The tile sizes bm/bn/bk/wm/wn appear both in the name and as
// template arguments; `transpose` is a template argument only, so the caller
// is expected to encode it in `name`.
#define instantiate_gather_qmm_rhs(func, name, type, bm, bn, bk, wm, wn, transpose, mode, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_bm_" #bm "_bn_" #bn "_bk_" #bk "_wm_" #wm "_wn_" #wn, \
      func,    \
      type,    \
      group_size,      \
      bits,       \
      bm,      \
      bn,      \
      bk,      \
      wm,      \
      wn,      \
      transpose)

// Emits both the batched (1) and non-batched (0) instantiation of one kernel.
// Note the argument order differs from instantiate_quantized_batched:
// (name, type, mode, ...) here versus (mode, name, type, ...) there.
#define instantiate_quantized_batched_wrap(name, type, mode, group_size, bits) \
  instantiate_quantized_batched(mode, name, type, 1, group_size, bits)         \
  instantiate_quantized_batched(mode, name, type, 0, group_size, bits)

// Batched/non-batched pairs for qmv_fast, qmv, qvm and qmm_n.
#define instantiate_quantized_all_batched(type, mode, group_size, bits) \
  instantiate_quantized_batched_wrap(qmv_fast, type, mode, group_size, bits) \
  instantiate_quantized_batched_wrap(qmv, type, mode, group_size, bits)      \
  instantiate_quantized_batched_wrap(qvm, type, mode, group_size, bits) \
  instantiate_quantized_batched_wrap(qmm_n, type, mode, group_size, bits)

// Gather kernels that take no extra template argument.
#define instantiate_quantized_all_single(type, mode, group_size, bits) \
  instantiate_quantized(mode, gather_qmv_fast, type, group_size, bits) \
  instantiate_quantized(mode, gather_qmv, type, group_size, bits)      \
  instantiate_quantized(mode, gather_qvm, type, group_size, bits) \
  instantiate_quantized(mode, gather_qmm_n, type, group_size, bits)

// qmm_t family: gather_qmm_t for aligned = true/false, and qmm_t for every
// combination of aligned = true/false and batched = 1/0.
#define instantiate_quantized_all_aligned(type, mode, group_size, bits) \
  instantiate_quantized_aligned(mode, gather_qmm_t, type, true, group_size, bits)      \
  instantiate_quantized_aligned(mode, gather_qmm_t, type, false, group_size, bits)     \
  instantiate_quantized_aligned_batched(mode, qmm_t, type, true, 1, group_size, bits)  \
  instantiate_quantized_aligned_batched(mode, qmm_t, type, true, 0, group_size, bits)  \
  instantiate_quantized_aligned_batched(mode, qmm_t, type, false, 1, group_size, bits) \
  instantiate_quantized_aligned_batched(mode, qmm_t, type, false, 0, group_size, bits)

// qmv_quad for D = 64 and D = 128, each batched and non-batched.
#define instantiate_quantized_all_quad(type, mode, group_size, bits) \
  instantiate_quantized_quad(mode, qmv_quad, type, 64, 1, group_size, bits)  \
  instantiate_quantized_quad(mode, qmv_quad, type, 64, 0, group_size, bits)  \
  instantiate_quantized_quad(mode, qmv_quad, type, 128, 1, group_size, bits) \
  instantiate_quantized_quad(mode, qmv_quad, type, 128, 0, group_size, bits)

// qvm_split_k for split factors 8 and 32.
#define instantiate_quantized_all_splitk(type, mode, group_size, bits) \
  instantiate_quantized_split_k(mode, qvm_split_k, type, 8, group_size, bits) \
  instantiate_quantized_split_k(mode, qvm_split_k, type, 32, group_size, bits)

// fp_gather_qmm_rhs with a 16x32x32 tile and 1x2 simdgroup layout, once with
// transpose = true (name "..._nt") and once with transpose = false ("..._nn").
#define instantiate_quantized_all_rhs(type, mode, group_size, bits) \
  instantiate_gather_qmm_rhs(fp_gather_qmm_rhs, gather_qmm_rhs_nt, type, 16, 32, 32, 1, 2, true, mode, group_size, bits) \
  instantiate_gather_qmm_rhs(fp_gather_qmm_rhs, gather_qmm_rhs_nn, type, 16, 32, 32, 1, 2, false, mode, group_size, bits)

// Instantiates the three element-wise conversion kernels for one
// (type, group_size, bits) combination: fp_quantize_dequantize, fp_quantize
// and fp_dequantize. Their names follow
// "<mode>_{quantize_dequantize,quantize,dequantize}_<type>_gs_<group_size>_b_<bits>".
#define instantiate_quantize_dequantize(type, mode, group_size, bits) \
  instantiate_kernel( \
    #mode "_quantize_dequantize_" #type "_gs_" #group_size "_b_" #bits, \
    fp_quantize_dequantize, \
    type, \
    group_size,  \
    bits) \
  instantiate_kernel( \
    #mode "_quantize_" #type "_gs_" #group_size "_b_" #bits, \
    fp_quantize, \
    type, \
    group_size,  \
    bits) \
  instantiate_kernel( \
    #mode "_dequantize_" #type "_gs_" #group_size "_b_" #bits, \
    fp_dequantize, \
    type, \
    group_size,  \
    bits)

// Expands every kernel family defined above for one
// (type, mode, group_size, bits) combination.
#define instantiate_quantized_modes(type, mode, group_size, bits) \
  instantiate_quantized_all_batched(type, mode, group_size, bits) \
  instantiate_quantized_all_single(type, mode, group_size, bits)  \
  instantiate_quantized_all_quad(type, mode, group_size, bits)    \
  instantiate_quantized_all_splitk(type, mode, group_size, bits)  \
  instantiate_quantized_all_aligned(type, mode, group_size, bits) \
  instantiate_quantized_all_rhs(type, mode, group_size, bits)     \
  instantiate_quantize_dequantize(type, mode, group_size, bits)

// The three supported modes as (name, group_size, bits):
// nvfp4 = (16, 4), mxfp8 = (32, 8), mxfp4 = (32, 4).
#define instantiate_quantized_types(type) \
  instantiate_quantized_modes(type, nvfp4, 16, 4) \
  instantiate_quantized_modes(type, mxfp8, 32, 8) \
  instantiate_quantized_modes(type, mxfp4, 32, 4)

// Instantiate everything for the three floating-point activation types.
instantiate_quantized_types(float)
instantiate_quantized_types(bfloat16_t)
instantiate_quantized_types(float16_t)
    // clang-format on


================================================
FILE: mlx/backend/metal/kernels/fp_quantized_nax.h
================================================
// Copyright © 2025 Apple Inc.

#include <metal_simdgroup>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/fp4.h"
#include "mlx/backend/metal/kernels/fp8.h"

// Pipeline function constants (indices 200-202). In this header they are read
// by fp_gather_qmm_rhs_nax, which skips the M / N bounds handling and the
// K remainder pass when the corresponding flag is true.
// NOTE(review): presumably set by the host when M, N, K are multiples of the
// tile sizes — confirm against the dispatch code.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

using namespace metal;

// Shorthand for compile-time constants declared in the constant address space.
#define MLX_MTL_CONST static constant constexpr const

// Number of threads per simdgroup assumed by the kernels in this header.
MLX_MTL_CONST int SIMD_SIZE = 32;
// NOTE(review): not referenced in the visible part of this header.
MLX_MTL_CONST int QUAD_SIZE = 4;

// Number of `bits`-wide values that fit into one `wsize`-bit storage word.
template <int wsize = 8, int bits>
inline constexpr short get_pack_factor() {
  constexpr short values_per_word = wsize / bits;
  return values_per_word;
}

// Size in bytes of one `wsize`-bit storage word.
template <int wsize = 8>
inline constexpr short get_bytes_per_pack() {
  constexpr int bits_per_byte = 8;
  return wsize / bits_per_byte;
}

// Reinterprets the raw scale byte `s` as the scale format implied by
// `group_size` and converts it to T: e4m3 for groups of 16 (the "nv" scale),
// e8m0 for every other group size.
template <typename T, int group_size>
static inline T dequantize_scale(uint8_t s) {
  constexpr bool use_nv_scale = group_size == 16;
  if constexpr (!use_nv_scale) {
    return T(*(thread fp8_e8m0*)(&s));
  } else {
    return T(*(thread fp8_e4m3*)(&s));
  }
}

// Functor converting a float to the raw bit pattern of the element format:
// fp8_e4m3 when bits == 8, fp4_e2m1 for any other width.
template <int bits>
struct Quantize {
  uint8_t operator()(float x) {
    if constexpr (bits != 8) {
      return fp4_e2m1(x).bits;
    } else {
      return fp8_e4m3(x).bits;
    }
  }
};

// Functor reinterpreting a raw byte as the element format (fp8_e4m3 when
// bits == 8, fp4_e2m1 otherwise) and converting the result to U.
template <int bits, typename U = float>
struct Dequantize {
  U operator()(uint8_t x) {
    if constexpr (bits != 8) {
      return U(*(thread fp4_e2m1*)(&x));
    } else {
      return U(*(thread fp8_e4m3*)(&x));
    }
  }
};

// Expands one packed byte `w` into scaled values in threadgroup memory.
// For 4-bit data the byte yields two outputs (`w` itself, then `w >> 4`);
// for any other width it yields a single 8-bit output.
template <typename U, int bits>
inline void dequantize(uint8_t w, U scale, threadgroup U* w_local) {
  if constexpr (bits != 4) {
    w_local[0] = scale * Dequantize<8, U>{}(w);
  } else {
    Dequantize<4, U> decode;
    w_local[0] = scale * decode(w);
    w_local[1] = scale * decode(w >> 4);
  }
}

// Cooperative loader that dequantizes a BROWS x BCOLS tile of packed weights
// from device memory into threadgroup memory.
//
// Template parameters:
//   T             element type written to threadgroup memory
//   BROWS, BCOLS  tile shape in (dequantized) elements
//   dst_ld        leading dimension of the threadgroup tile
//   reduction_dim 1 when successive tiles advance along the columns of the
//                 source (next() steps by BCOLS), 0 when they advance along
//                 its rows (next() steps by BROWS rows)
//   tgp_size      number of threads cooperating on the tile
//   group_size    elements sharing one scale
//   bits          bits per quantized element
//
// Each thread owns `n_reads` consecutive packed bytes starting at row `bi`,
// packed column `bj` of the tile.
template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short group_size,
    short bits>
struct QuantizedBlockLoader {
  MLX_MTL_CONST short pack_factor = get_pack_factor<8, bits>();
  MLX_MTL_CONST short bytes_per_pack = get_bytes_per_pack();
  MLX_MTL_CONST short BCOLS_PACKED = BCOLS / pack_factor;
  // Packed bytes handled by each thread (at least one).
  MLX_MTL_CONST short n_reads =
      (BCOLS_PACKED * BROWS < tgp_size) ? 1 : (BCOLS_PACKED * BROWS) / tgp_size;

  // A thread's bytes may span several scale groups: split its reads into
  // `n_steps_per_read` runs of `n_reads_per_scale` bytes sharing one scale.
  MLX_MTL_CONST short n_reads_per_scale = (n_reads * pack_factor) <= group_size
      ? n_reads
      : (group_size / pack_factor);
  MLX_MTL_CONST short n_steps_per_read = n_reads / n_reads_per_scale;

  // Scale groups covered by one tile row.
  MLX_MTL_CONST short n_groups = BCOLS / group_size;

  const int src_ld;
  // Byte distance between consecutive tiles of packed weights.
  const int tile_stride;
  // Number of scales covering BROWS full source rows.
  const int group_stride;

  const short thread_idx;
  const short bi;
  const short bj;

  const short group_id;

  threadgroup T* dst;
  const device uint8_t* src;
  const device uint8_t* scales;

  // `src_` / `scales_` point at the first tile, `src_ld_` is the source row
  // length in elements and `dst_` is the threadgroup tile.
  QuantizedBlockLoader(
      const device uint8_t* src_,
      const device uint8_t* scales_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(
            reduction_dim ? BCOLS_PACKED * bytes_per_pack
                          : BROWS * src_ld * bytes_per_pack / pack_factor),
        group_stride(BROWS * src_ld / group_size),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(n_reads * thread_idx / BCOLS_PACKED),
        bj((n_reads * thread_idx) % BCOLS_PACKED),
        group_id((bj * pack_factor) / group_size),
        dst(dst_ + bi * dst_ld + bj * pack_factor),
        src(src_ + bi * src_ld * bytes_per_pack / pack_factor +
            bj * bytes_per_pack),
        scales(scales_ + bi * src_ld / group_size + group_id) {}

  // Dequantizes this thread's share of the tile without any bounds checks.
  void load_unsafe() const {
    // When the tile has fewer packed bytes than threads, the surplus threads
    // have nothing to load.
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    int k = 0;
    for (int i = 0; i < n_steps_per_read; i++) {
      T scale = dequantize_scale<T, group_size>(scales[i]);
      for (int j = 0; j < n_reads_per_scale; j++) {
        dequantize<T, bits>(
            src[k * bytes_per_pack], scale, dst + k * pack_factor);
        k++;
      }
    }
  }

  // Like load_unsafe(), but rows at or beyond the valid row count are filled
  // with zeros instead of being read. `src_tile_dim` is (valid columns,
  // valid rows) of the tile; only the row bound is enforced.
  void load_safe(short2 src_tile_dim) const {
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    // `bi` is a row index of the tile for both orientations, so it must be
    // compared with the row bound (.y). The transposed case used to compare
    // against .x (the column bound), which either never triggered or zeroed
    // valid rows when handling a K remainder.
    if ((reduction_dim == 0 || reduction_dim == 1) && bi >= src_tile_dim.y) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    load_unsafe();
  }

  // Advances the source and scale pointers to the next tile along the
  // reduction axis.
  void next() {
    src += tile_stride;
    if (reduction_dim == 1) {
      // Moving BCOLS elements along a row skips BCOLS / group_size scales.
      scales += n_groups;
    } else {
      // Moving BROWS rows down skips BROWS * src_ld / group_size scales,
      // which is exactly `group_stride` (see the per-row offset used in the
      // constructor). The previous extra factor of `n_groups` over-advanced
      // whenever a tile row spans more than one group.
      scales += group_stride;
    }
  }
};

using namespace mlx::steel;

// Computes one BM x BN output tile of y = x * w^T, where `w` holds N rows of
// K block-scaled quantized values (transposed weight layout).
//
// Weights are dequantized tile by tile into the threadgroup buffer `Ws`
// (BN rows of BK_padded elements); `x` is read directly from device memory.
// Each of the WM x WN simdgroups accumulates an SM x SN sub-tile in float.
// Rows beyond M are always handled; columns beyond N are handled only when
// `aligned_N` is false.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2,
    typename Wtype = bfloat>
METAL_FUNC void fp_qmm_t_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    threadgroup Wtype* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  (void)lid;

  constexpr int pack_factor = get_pack_factor<8, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack();

  constexpr int BK_padded = (BK + 16 / sizeof(Wtype));

  // Instantiate Loader
  using loader_w_t = QuantizedBlockLoader<
      Wtype,
      BN,
      BK,
      BK_padded,
      1,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // Set the block
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  auto wl = (const device uint8_t*)w;

  // All block offsets are formed in 64 bits: y_col * K_w (bytes) and
  // y_col * K_g exceed the int range for large N * K just like the x / y
  // offsets do.
  x += y_row * static_cast<int64_t>(K);
  wl += y_col * static_cast<int64_t>(K_w);
  scales += y_col * static_cast<int64_t>(K_g);
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Make the weight loader
  loader_w_t loader_w(wl, scales, K, Ws, simd_gid, simd_lid);

  // Per-simdgroup sub-tile and the 16x16 fragment counts it is made of.
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;
  constexpr short TK = SK / 16;

  const short tm = SM * (simd_gid / WN);
  const short tn = SN * (simd_gid % WN);

  constexpr bool transpose_a = false;
  constexpr bool transpose_b = true;

  // Valid rows / columns of this simdgroup's sub-tile. The clamp is done in
  // int before narrowing: converting `M - ...` or `N - ...` to short first
  // wraps for remaining extents above 32767 and produced negative bounds.
  const short sgp_sm = short(min(int(SM), max(0, M - (y_row + tm))));
  const bool is_unaligned_sm = (sgp_sm != SM);

  const short sgp_sn =
      aligned_N ? SN : short(min(int(SN), max(0, N - (y_col + tn))));

  const short tgp_bn = aligned_N ? BN : min(BN, int(N - (y_col)));
  const bool is_unaligned_bn = aligned_N ? false : (tgp_bn != BN);

  using AccumType = float;

  NAXTile<AccumType, TM, TN> Dtile;
  Dtile.clear();

  x += tm * K;

  // Turn the two runtime alignment flags into compile-time constants so the
  // inner loop is specialized for the checked / unchecked loads.
  dispatch_bool(!is_unaligned_sm, [&](auto kAlignedM) {
    dispatch_bool(aligned_N || !is_unaligned_bn, [&](auto kAlignedN) {
      for (int k = 0; k < K; k += BK) {
        // Wait until every simdgroup is done reading the previous tile.
        threadgroup_barrier(mem_flags::mem_threadgroup);
        if constexpr (kAlignedN.value) {
          loader_w.load_unsafe();
        } else {
          loader_w.load_safe(short2(BK, tgp_bn));
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        STEEL_PRAGMA_NO_UNROLL
        for (int kk1 = 0; kk1 < BK; kk1 += SK) {
          NAXTile<T, TM, TK> Atile;
          NAXTile<Wtype, TN, TK> Btile;

          volatile int compiler_barrier;

          if constexpr (kAlignedM.value) {
            Atile.load(x + kk1, K);
          } else {
            Atile.load_safe(x + kk1, K, short2(SK, sgp_sm));
          }

          Btile.template load<Wtype, BK_padded, 1>(Ws + tn * BK_padded + kk1);

          tile_matmad_nax(
              Dtile,
              Atile,
              metal::bool_constant<transpose_a>{},
              Btile,
              metal::bool_constant<transpose_b>{});

          (void)compiler_barrier;
        }

        x += BK;
        loader_w.next();
      }

      // Store results to device memory
      threadgroup_barrier(mem_flags::mem_threadgroup);

      if constexpr (kAlignedM.value && kAlignedN.value) {
        Dtile.store(y + tm * N + tn, N);
      } else if (kAlignedM.value && sgp_sn == SN) {
        Dtile.store(y + tm * N + tn, N);
      } else {
        Dtile.store_safe(y + tm * N + tn, N, short2(sgp_sn, sgp_sm));
      }
    });
  });
}

// Computes one BM x BN output tile of y = x * w, where `w` holds K rows of N
// block-scaled quantized values (non-transposed weight layout).
//
// Weights are dequantized tile by tile into the threadgroup buffer `Ws`
// (BK rows of BN_padded elements); `x` is read directly from device memory.
// No bounds handling is performed: every load and the final store are
// unchecked, so M, N and K must be multiples of the tile sizes.
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2,
    typename Wtype = bfloat>
METAL_FUNC void fp_qmm_n_impl(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  (void)lid;
  (void)M;

  constexpr int pack_factor = get_pack_factor<8, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack();

  constexpr int BN_padded = (BN + 16 / sizeof(T));

  // reduction_dim == 0: successive tiles step down the rows (K) of `w`.
  using loader_w_t = QuantizedBlockLoader<
      T,
      BK,
      BN,
      BN_padded,
      0,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // Set the block
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  auto wl = (const device uint8_t*)w;

  // `w` is K x N here, so the output column block starts `y_col` elements
  // into the first weight row (and `y_col / group_size` scales into the
  // first scale row). The offsets `y_col * K_w` / `y_col * K_g` used before
  // are the ones of the transposed N x K layout. This mirrors the
  // non-transposed branch of fp_gather_qmm_rhs_nax.
  x += y_row * static_cast<int64_t>(K);
  wl += y_col * bytes_per_pack / pack_factor;
  scales += y_col / group_size;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Make the weight loader. Its leading dimension is the row length of the
  // source matrix, which is N for the K x N layout (K was passed before).
  loader_w_t loader_w(wl, scales, N, Ws, simd_gid, simd_lid);

  // Per-simdgroup sub-tile and the 16x16 fragment counts it is made of.
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;
  constexpr short TK = SK / 16;

  const short tm = SM * (simd_gid / WN);
  const short tn = SN * (simd_gid % WN);

  const short ldb_tgp = BN_padded;

  constexpr bool transpose_a = false;
  constexpr bool transpose_b = false;

  using AccumType = float;

  NAXTile<AccumType, TM, TN> Dtile;
  Dtile.clear();

  x += tm * K;

  for (int k = 0; k < K; k += BK) {
    // Wait until every simdgroup is done reading the previous tile.
    threadgroup_barrier(mem_flags::mem_threadgroup);
    loader_w.load_unsafe();
    threadgroup_barrier(mem_flags::mem_threadgroup);

    STEEL_PRAGMA_NO_UNROLL
    for (int kk1 = 0; kk1 < BK; kk1 += SK) {
      NAXTile<T, TM, TK> Atile;
      NAXTile<Wtype, TK, TN> Btile;

      volatile int compiler_barrier;

      Atile.load(x + kk1, K);
      Btile.template load<T, BN_padded, 1>(Ws + tn + kk1 * ldb_tgp);

      tile_matmad_nax(
          Dtile,
          Atile,
          metal::bool_constant<transpose_a>{},
          Btile,
          metal::bool_constant<transpose_b>{});

      (void)compiler_barrier;
    }

    x += BK;
    loader_w.next();
  }

  // Store results to device memory
  threadgroup_barrier(mem_flags::mem_threadgroup);

  Dtile.store(y + tm * N + tn, N);
}

// Advances the x / w / scales / y pointers to the matrices of batch element
// `tid.z`.
//
// x is located through (x_shape, x_strides); w and scales share `w_shape`
// and use `w_strides` / `s_strides` respectively. When a batch has a single
// dimension the offset is computed directly from the first stride. The
// output is assumed contiguous with `output_stride` elements per batch
// element.
template <typename T, typename S>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device S*& scales,
    device T*& y,
    int output_stride,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Set the input/output matrices
  uint32_t x_idx = tid.z;
  uint32_t w_idx = tid.z;
  if (x_batch_ndims == 1) {
    x += x_idx * x_strides[0];
  } else {
    x += elem_to_loc(x_idx, x_shape, x_strides, x_batch_ndims);
  }
  if (w_batch_ndims == 1) {
    w += w_idx * w_strides[0];
    scales += w_idx * s_strides[0];
  } else {
    ulong2 idx = elem_to_loc_broadcast(
        w_idx, w_shape, w_strides, s_strides, w_batch_ndims);
    w += idx.x;
    scales += idx.y;
  }
  // Widen before multiplying: `tid.z * output_stride` is otherwise a 32-bit
  // unsigned product that wraps once the output holds 2^32 or more elements.
  y += tid.z * static_cast<int64_t>(output_stride);
}

// Gather variant: advances the x / w / scales / y pointers for output batch
// element `tid.z`, where the x and w matrices to use are looked up in
// `lhs_indices` / `rhs_indices`.
//
// The index arrays are addressed through (batch_shape, lhs_strides,
// rhs_strides); the fetched indices are then resolved exactly as in the
// non-gather overload. The output is assumed contiguous with
// `output_stride` elements per batch element.
template <typename T, typename S>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device S*& scales,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T*& y,
    int output_stride,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Set the input/output matrices
  uint32_t x_idx;
  uint32_t w_idx;
  if (batch_ndims == 1) {
    x_idx = lhs_indices[tid.z * lhs_strides[0]];
    w_idx = rhs_indices[tid.z * rhs_strides[0]];
  } else {
    ulong2 idx = elem_to_loc_broadcast(
        tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);
    x_idx = lhs_indices[idx.x];
    w_idx = rhs_indices[idx.y];
  }
  if (x_batch_ndims == 1) {
    x += x_idx * x_strides[0];
  } else {
    x += elem_to_loc(x_idx, x_shape, x_strides, x_batch_ndims);
  }
  if (w_batch_ndims == 1) {
    w += w_idx * w_strides[0];
    scales += w_idx * s_strides[0];
  } else {
    ulong2 idx = elem_to_loc_broadcast(
        w_idx, w_shape, w_strides, s_strides, w_batch_ndims);
    w += idx.x;
    scales += idx.y;
  }
  // Widen before multiplying: `tid.z * output_stride` is otherwise a 32-bit
  // unsigned product that wraps once the output holds 2^32 or more elements.
  y += tid.z * static_cast<int64_t>(output_stride);
}

// Kernel entry point for y = x * w^T with block-scaled quantized weights.
// Allocates the threadgroup weight tile, optionally moves all pointers to the
// batch element selected by tid.z, then delegates to fp_qmm_t_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const bool batched,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2,
    typename Wtype = bfloat>
[[kernel]] void fp_qmm_t_nax(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Dequantized weight tile: BN rows, each padded by 16 bytes.
  constexpr int BK_padded = (BK + 16 / sizeof(Wtype));
  threadgroup Wtype Ws[BN * BK_padded];

  if (batched) {
    // One M x N output matrix per batch element.
    const int output_stride = M * N;
    adjust_matrix_offsets(
        x,
        w,
        scales,
        y,
        output_stride,
        x_batch_ndims,
        x_shape,
        x_strides,
        w_batch_ndims,
        w_shape,
        w_strides,
        s_strides,
        tid);
  }

  fp_qmm_t_impl<T, group_size, bits, aligned_N, BM, BK, BN, WM, WN, Wtype>(
      w, scales, x, y, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Kernel entry point for y = x * w with block-scaled quantized weights in the
// non-transposed (K x N) layout. Allocates the threadgroup weight tile,
// optionally moves all pointers to the batch element selected by tid.z, then
// delegates to fp_qmm_n_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool batched,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2,
    typename Wtype = bfloat>
[[kernel]] void fp_qmm_n_nax(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  constexpr int BN_padded = (BN + 16 / sizeof(T));

  // Only the weight tile is staged in threadgroup memory. fp_qmm_n_impl reads
  // x straight from device memory and takes a single threadgroup pointer, so
  // the former `Xs` buffer was both unused storage and an extra argument that
  // did not match the callee's parameter list.
  threadgroup T Ws[BK * BN_padded];

  if (batched) {
    adjust_matrix_offsets(
        x,
        w,
        scales,
        y,
        M * N,
        x_batch_ndims,
        x_shape,
        x_strides,
        w_batch_ndims,
        w_shape,
        w_strides,
        s_strides,
        tid);
  }

  fp_qmm_n_impl<T, group_size, bits, BM, BK, BN, WM, WN, Wtype>(
      w, scales, x, y, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Gather kernel entry point for y = x * w^T with block-scaled quantized
// weights. The x and w matrices of output batch element tid.z are selected
// through lhs_indices / rhs_indices before delegating to fp_qmm_t_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2,
    typename Wtype = bfloat>
[[kernel]] void fp_gather_qmm_t_nax(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Dequantized weight tile: BN rows, each padded by 16 bytes.
  constexpr int BK_padded = (BK + 16 / sizeof(Wtype));
  threadgroup Wtype Ws[BN * BK_padded];

  // One M x N output matrix per gathered pair.
  const int output_stride = M * N;
  adjust_matrix_offsets(
      x,
      w,
      scales,
      lhs_indices,
      rhs_indices,
      y,
      output_stride,
      batch_ndims,
      batch_shape,
      lhs_strides,
      rhs_strides,
      x_batch_ndims,
      x_shape,
      x_strides,
      w_batch_ndims,
      w_shape,
      w_strides,
      s_strides,
      tid);

  fp_qmm_t_impl<T, group_size, bits, aligned_N, BM, BK, BN, WM, WN, Wtype>(
      w, scales, x, y, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Gather kernel entry point for y = x * w with block-scaled quantized weights
// in the non-transposed (K x N) layout. The x and w matrices of output batch
// element tid.z are selected through lhs_indices / rhs_indices before
// delegating to fp_qmm_n_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2,
    typename Wtype = bfloat>
[[kernel]] void fp_gather_qmm_n_nax(
    const device uint32_t* w,
    const device uint8_t* scales,
    const device T* x,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T* y,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  constexpr int BN_padded = (BN + 16 / sizeof(T));

  // Only the weight tile is staged in threadgroup memory. fp_qmm_n_impl reads
  // x straight from device memory and takes a single threadgroup pointer, so
  // the former `Xs` buffer was both unused storage and an extra argument that
  // did not match the callee's parameter list.
  threadgroup T Ws[BK * BN_padded];

  adjust_matrix_offsets(
      x,
      w,
      scales,
      lhs_indices,
      rhs_indices,
      y,
      M * N,
      batch_ndims,
      batch_shape,
      lhs_strides,
      rhs_strides,
      x_batch_ndims,
      x_shape,
      x_strides,
      w_batch_ndims,
      w_shape,
      w_strides,
      s_strides,
      tid);
  fp_qmm_n_impl<T, group_size, bits, BM, BK, BN, WM, WN, Wtype>(
      w, scales, x, y, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Gathered quantized matmul where only the right-hand side is gathered:
// row r of the output is x[r, :] multiplied by weight matrix `indices[r]`.
//
// Each threadgroup owns a BM x BN output tile. It walks over its rows, splits
// them into maximal runs of consecutive rows sharing the same index, and for
// every run performs a full tile matmul against that weight matrix but stores
// only the rows belonging to the run.
// NOTE(review): this is only efficient when equal indices are adjacent
// (e.g. sorted) — confirm with the dispatch code.
//
// `transpose` selects the weight layout (true: N x K, false: K x N). The
// function constants align_M / align_N / align_K disable the M / N bounds
// handling and the K remainder pass respectively.
template <
    typename T,
    int group_size,
    const int bits,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose,
    typename Wtype = bfloat>
[[kernel]] void fp_gather_qmm_rhs_nax(
    const device T* x,
    const device uint32_t* w,
    const device uint8_t* scales,
    const device uint32_t* indices,
    device T* y,
    const constant int& M,
    const constant int& N,
    const constant int& K,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]]) {
  constexpr int pack_factor = get_pack_factor<8, bits>();
  constexpr int bytes_per_pack = get_bytes_per_pack();
  constexpr int BK_padded = (BK + 16 / sizeof(Wtype));
  constexpr int BN_padded = (BN + 16 / sizeof(Wtype));

  using loader_w_t = QuantizedBlockLoader<
      Wtype,
      transpose ? BN : BK,
      transpose ? BK : BN,
      transpose ? BK_padded : BN_padded,
      transpose,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  threadgroup Wtype Ws[transpose ? BN * BK_padded : BK * BN_padded];

  // Compute the block
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int N_w = N * bytes_per_pack / pack_factor;
  const int N_g = N / group_size;
  const int K_it = K / BK;
  // Per-matrix strides (bytes of packed weights / number of scales). The
  // products are formed in size_t so they cannot overflow int before being
  // widened.
  const size_t stride_w = transpose ? size_t(N) * K_w : size_t(K) * N_w;
  const size_t stride_s = transpose ? size_t(N) * K_g : size_t(K) * N_g;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;
  const size_t y_row_long = size_t(y_row);
  const size_t y_col_long = size_t(y_col);

  // Prepare threadgroup bounds
  const short tgp_bm = align_M ? BM : short(min(BM, M - y_row));
  const short tgp_bn = align_N ? BN : short(min(BN, N - y_col));

  // Calculate the final tiles in the case that K is not aligned
  const int k_remain = K - K_it * BK;
  const short2 tile_w =
      transpose ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

  // Move x and output to the correct block
  auto wl = (const device uint8_t*)w;
  x += y_row_long * K;
  y += y_row_long * N + y_col_long;
  wl += transpose ? y_col_long * K_w : y_col * bytes_per_pack / pack_factor;
  scales += transpose ? y_col_long * K_g : y_col / group_size;

  // Per-simdgroup sub-tile and the 16x16 fragment counts it is made of.
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;
  constexpr short TK = SK / 16;

  const short tm = SM * (simd_group_id / WN);
  const short tn = SN * (simd_group_id % WN);

  // Valid rows / columns of this simdgroup's sub-tile. The clamp is done in
  // int before narrowing so that remaining extents above 32767 do not wrap
  // when converted to short.
  const short sgp_sm =
      align_M ? SM : short(min(int(SM), max(0, (M - (y_row + tm)))));
  const short sgp_sn =
      align_N ? SN : short(min(int(SN), max(0, (N - (y_col + tn)))));

  const bool is_unaligned_sm = align_M ? false : (sgp_sm != SM);
  const bool is_unaligned_bn = align_N ? false : (tgp_bn != BN);

  // Fragment layout of the weight tile as seen by the matmul.
  constexpr short BR = transpose ? TN : TK;
  constexpr short BC = transpose ? TK : TN;

  using AccumType = float;

  // Do as many matmuls as necessary
  uint32_t index;
  short offset;
  uint32_t index_next = indices[y_row];
  short offset_next = 0;
  int n = 0;
  while (n < tgp_bm) {
    // Find the run [offset, offset_next) of rows that use `index`.
    n++;
    offset = offset_next;
    index = index_next;
    offset_next = tgp_bm;
    for (; n < tgp_bm; n++) {
      if (indices[y_row + n] != index) {
        offset_next = n;
        index_next = indices[y_row + n];
        break;
      }
    }
    threadgroup_barrier(mem_flags::mem_none);

    // Prepare threadgroup mma operation
    NAXTile<AccumType, TM, TN> Dtile;
    Dtile.clear();

    const device T* xn = x + tm * K;

    // Prepare threadgroup loading operations
    thread loader_w_t loader_w(
        wl + index * stride_w,
        scales + index * stride_s,
        transpose ? K : N,
        Ws,
        simd_group_id,
        simd_lane_id);

    // Turn the two runtime alignment flags into compile-time constants so the
    // inner loops are specialized for the checked / unchecked loads.
    dispatch_bool(align_M || !is_unaligned_sm, [&](auto kAlignedM) {
      dispatch_bool(align_N || !is_unaligned_bn, [&](auto kAlignedN) {
        // Full BK-wide steps.
        for (int k = 0; k < K_it; k++) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          if constexpr (kAlignedN.value) {
            loader_w.load_unsafe();
          } else {
            loader_w.load_safe(
                transpose ? short2(BK, tgp_bn) : short2(tgp_bn, BK));
          }

          threadgroup_barrier(mem_flags::mem_threadgroup);

          STEEL_PRAGMA_NO_UNROLL
          for (int kk1 = 0; kk1 < BK; kk1 += SK) {
            NAXTile<T, TM, TK> Atile;
            NAXTile<Wtype, BR, BC> Btile;

            volatile int compiler_barrier;

            if constexpr (kAlignedM.value) {
              Atile.load(xn + kk1, K);
            } else {
              Atile.load_safe(xn + kk1, K, short2(SK, sgp_sm));
            }

            if constexpr (transpose) {
              Btile.template load<Wtype, BK_padded, 1>(
                  Ws + tn * BK_padded + kk1);
            } else {
              Btile.template load<Wtype, BN_padded, 1>(
                  Ws + tn + kk1 * BN_padded);
            }

            tile_matmad_nax(
                Dtile,
                Atile,
                metal::bool_constant<false>{},
                Btile,
                metal::bool_constant<transpose>{});

            (void)compiler_barrier;
          }

          xn += BK;
          loader_w.next();
        }

        // Remainder step covering the last `k_remain` (< BK) columns of K.
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          loader_w.load_safe(tile_w);
          threadgroup_barrier(mem_flags::mem_threadgroup);

          STEEL_PRAGMA_NO_UNROLL
          for (int kk1 = 0; kk1 < BK; kk1 += SK) {
            NAXTile<T, TM, TK> Atile;
            NAXTile<Wtype, BR, BC> Btile;

            volatile int compiler_barrier;

            // Number of x columns that are still inside K for this sub-step.
            // It has to be measured against `k_remain`; `BK - kk1` is never
            // smaller than SK, so nothing was masked and columns past the end
            // of the row leaked into the accumulation.
            const short psk = min(int(SK), max(0, (k_remain - kk1)));
            Atile.load_safe(xn + kk1, K, short2(psk, sgp_sm));

            if constexpr (transpose) {
              Btile.template load<Wtype, BK_padded, 1>(
                  Ws + tn * BK_padded + kk1);
            } else {
              Btile.template load<Wtype, BN_padded, 1>(
                  Ws + tn + kk1 * BN_padded);
            }

            tile_matmad_nax(
                Dtile,
                Atile,
                metal::bool_constant<false>{},
                Btile,
                metal::bool_constant<transpose>{});

            (void)compiler_barrier;
          }
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Rows of this simdgroup's sub-tile that belong to the current run.
        const short m_lo_lim = min(int(sgp_sm), max(0, offset - tm));
        const short m_hi_lim = min(int(sgp_sm), max(0, offset_next - tm));

        // Store results to device memory
        if constexpr (kAlignedN.value) {
          if (m_lo_lim == 0 && m_hi_lim == SM) {
            Dtile.store(y + tm * N + tn, N);
          } else {
            Dtile.store_slice(
                y + tm * N + tn, N, short2(0, m_lo_lim), short2(SN, m_hi_lim));
          }
        } else {
          Dtile.store_slice(
              y + tm * N + tn,
              N,
              short2(0, m_lo_lim),
              short2(sgp_sn, m_hi_lim));
        }
      });
    });
  }
}


================================================
FILE: mlx/backend/metal/kernels/fp_quantized_nax.metal
================================================
// Copyright © 2025 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/quantized_utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/nax.h"
#include "mlx/backend/metal/kernels/fp_quantized_nax.h"


// Instantiates the `fp_<name>` kernel template with the template arguments
// (type, group_size, bits, batched). The tile parameters (bm, bn, bk, wm, wn)
// only contribute to the generated kernel name string; they are not forwarded
// as template arguments. Not referenced by the instantiation lists below.
#define instantiate_quantized_batched(mode, name, type, bm, bn, bk, wm, wn, batched, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_batch_" #batched, \
      fp_ ## name,  \
      type,         \
      group_size,           \
      bits,            \
      batched)

// Same as above, but the trailing template argument / name suffix is the
// `aligned` flag ("_alN_<aligned>") instead of the batched flag.
#define instantiate_quantized_aligned(mode, name, type, bm, bn, bk, wm, wn, aligned, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned, \
      fp_ ## name, \
      type,        \
      group_size,          \
      bits,           \
      aligned)

// Variant carrying both flags: template arguments are
// (type, group_size, bits, aligned, batched) and the name ends in
// "_alN_<aligned>_batch_<batched>".
#define instantiate_quantized_aligned_batched(mode, name, type, bm, bn, bk, wm, wn, aligned, batched, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned "_batch_" #batched, \
      fp_ ## name,    \
      type,    \
      group_size,      \
      bits,       \
      aligned, \
      batched)

// Instantiates `func` with (type, group_size, bits, bm, bn, bk, wm, wn,
// transpose). Unlike the macros above, the tile sizes ARE template arguments
// here, and the name uses "_bm_<bm>" (underscore before the value) rather than
// "_bm<bm>". The transpose flag is not part of the name string itself; callers
// encode it in `name` (e.g. the "_nt" / "_nn" suffixes used below).
#define instantiate_gather_qmm_rhs(func, name, type, bm, bn, bk, wm, wn, transpose, mode, group_size, bits) \
  instantiate_kernel( \
      #mode "_" #name "_" #type "_gs_" #group_size "_b_" #bits "_bm_" #bm "_bn_" #bn "_bk_" #bk "_wm_" #wm "_wn_" #wn, \
      func,    \
      type,    \
      group_size,      \
      bits,       \
      bm,      \
      bn,      \
      bk,      \
      wm,      \
      wn,      \
      transpose)


// All gather_qmm_t_nax / qmm_t_nax variants for one (type, mode, group_size,
// bits) combination: both values of the aligned flag, and for qmm_t_nax both
// values of the batched flag. Tile configuration is fixed at 64/64/64, 2x2.
#define instantiate_quantized_all_aligned(type, mode, group_size, bits) \
  instantiate_quantized_aligned(mode, gather_qmm_t_nax, type, 64, 64, 64, 2, 2, true, group_size, bits)      \
  instantiate_quantized_aligned(mode, gather_qmm_t_nax, type, 64, 64, 64, 2, 2, false, group_size, bits)     \
  instantiate_quantized_aligned_batched(mode, qmm_t_nax, type, 64, 64, 64, 2, 2, true, 1, group_size, bits)  \
  instantiate_quantized_aligned_batched(mode, qmm_t_nax, type, 64, 64, 64, 2, 2, true, 0, group_size, bits)  \
  instantiate_quantized_aligned_batched(mode, qmm_t_nax, type, 64, 64, 64, 2, 2, false, 1, group_size, bits) \
  instantiate_quantized_aligned_batched(mode, qmm_t_nax, type, 64, 64, 64, 2, 2, false, 0, group_size, bits)


// Transposed ("nt", transpose = true) and non-transposed ("nn",
// transpose = false) instantiations of fp_gather_qmm_rhs_nax.
#define instantiate_quantized_all_rhs(type, mode, group_size, bits) \
  instantiate_gather_qmm_rhs(fp_gather_qmm_rhs_nax, gather_qmm_rhs_nax_nt, type, 64, 64, 64, 2, 2, true, mode, group_size, bits) \
  instantiate_gather_qmm_rhs(fp_gather_qmm_rhs_nax, gather_qmm_rhs_nax_nn, type, 64, 64, 64, 2, 2, false, mode, group_size, bits)

// Every kernel family above for one (type, mode, group_size, bits) tuple.
#define instantiate_quantized_modes(type, mode, group_size, bits) \
  instantiate_quantized_all_aligned(type, mode, group_size, bits) \
  instantiate_quantized_all_rhs(type, mode, group_size, bits)

// Supported (mode, group_size, bits) combinations:
//   nvfp4: group size 16, 4 bits
//   mxfp8: group size 32, 8 bits
//   mxfp4: group size 32, 4 bits
#define instantiate_quantized_types(type) \
  instantiate_quantized_modes(type, nvfp4, 16, 4) \
  instantiate_quantized_modes(type, mxfp8, 32, 8) \
  instantiate_quantized_modes(type, mxfp4, 32, 4)

// Emit the kernels for each supported activation type.
instantiate_quantized_types(float)
instantiate_quantized_types(bfloat16_t)
instantiate_quantized_types(float16_t)
    // clang-format on


================================================
FILE: mlx/backend/metal/kernels/gemv.metal
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_simdgroup>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/steel/utils.h"

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
/// Matrix vector multiplication
///////////////////////////////////////////////////////////////////////////////

// Shorthand for declaring a compile-time constant in the `constant` address
// space; used for the static members of the kernel structs below.
#define MLX_MTL_CONST static constant constexpr const

/// Trait selecting the accumulator type used by the gemv kernels for a given
/// element type: every element type accumulates in `float` ...
template <typename ElemT>
struct DefaultAccT {
  using type = float;
};

/// ... except complex64_t, which accumulates in complex64_t itself.
template <>
struct DefaultAccT<complex64_t> {
  using type = complex64_t;
};

/// Threadgroup-level matrix-vector product.
///
/// Each threadgroup produces `blockM` consecutive entries of
/// `out_vec = mat * in_vec`; when `kDoAxpby` is set the stored value is
/// `alpha * (mat * in_vec) + beta * bias` instead. Accumulation happens in
/// `AccT` and the result is cast back to `T` on store.
template <
    typename T,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    const bool kDoAxpby, /* Do out = alpha * (mat * vec) + beta * bias */
    typename AccT = typename DefaultAccT<T>::type>
struct GEMVKernel {
  using acc_type = AccT;

  MLX_MTL_CONST int threadsM = BM * SM;
  MLX_MTL_CONST int threadsN = BN * SN;

  MLX_MTL_CONST int blockM = threadsM * TM;
  MLX_MTL_CONST int blockN = threadsN * TN;

  static_assert(SM * SN == 32, "simdgroup can only have 32 threads");

  static_assert(
      SN == 4 || SN == 8 || SN == 16 || SN == 32,
      "gemv block must have a width of 4, 8, 16, or 32");

  // - The matrix of size (M = out_vec_size, K = in_vec_size) is divided up
  //   into blocks of (blockM, blockN) divided among threadgroups
  // - Every thread works on a block of (TM, TN)
  // - We assume each threadgroup has (threadsN, threadsM, 1) threads
  //
  // 1. A thread loads TN elements each from mat along TM rows
  //    and the corresponding scalar from the vector
  // 2. The thread then multiplies and adds to accumulate its local result for
  //    the block
  // 3. At the end, each thread has accumulated results over all blocks across
  //    the rows. These are then summed up across the threadgroup
  // 4. Each threadgroup writes its accumulated blockM outputs
  //
  // Edge case handling:
  // - The threadgroup with the largest tid has blocks that exceed the matrix
  //   * The blocks that start outside the matrix are never read (thread results
  //     remain zero)
  //   * The last thread that partially overlaps with the matrix is shifted
  //     inwards such that the thread block fits exactly in the matrix

  // Scratch size (in AccT elements) for the cross-simdgroup reduction: one
  // slab of (blockM + TM) entries per column simdgroup. Zero when BN == 1,
  // i.e. when a single simdgroup covers the whole reduction dimension.
  MLX_MTL_CONST short tgp_mem_size = BN > 1 ? BN*(blockM + TM) : 0;
  MLX_MTL_CONST bool needs_tgp_reduction = BN > 1;

  /// Copies TN consecutive elements starting at `src[src_offset]` into `dst`,
  /// converting each to `U`. Performs no bounds checking.
  template <typename U = T>
  static METAL_FUNC void
  load_unsafe(const device T* src, thread U dst[TN], const int src_offset = 0) {
    MLX_MTL_PRAGMA_UNROLL
    for (int tn = 0; tn < TN; tn++) {
      dst[tn] = static_cast<U>(src[src_offset + tn]);
    }
  }

  /// Bounds-checked variant of `load_unsafe`: elements whose index
  /// `src_offset + tn` is not below `src_size` are not read and are set to
  /// `U(0)` instead.
  template <typename U = T>
  static METAL_FUNC void load_safe(
      const device T* src,
      thread U dst[TN],
      const int src_offset = 0,
      const int src_size = TN) {
    if (src_offset + TN <= src_size) {
      // Whole span is in bounds: same as the unchecked load
      MLX_MTL_PRAGMA_UNROLL
      for (int tn = 0; tn < TN; tn++) {
        dst[tn] = static_cast<U>(src[src_offset + tn]);
      }
    } else { // Edgecase
      MLX_MTL_PRAGMA_UNROLL
      for (int tn = 0; tn < TN; tn++) {
        dst[tn] = src_offset + tn < src_size
            ? static_cast<U>(src[src_offset + tn])
            : U(0);
      }
    }
  }

  /// Computes this threadgroup's slice of the output vector.
  ///
  /// `mat`, `in_vec`, `bias` and `out_vec` must already point at the current
  /// batch element. `matrix_ld` is the element stride between consecutive
  /// matrix rows. `alpha`, `beta`, `bias` and `bias_stride` are only read when
  /// `kDoAxpby` is true. `tgp_memory` is only dereferenced when
  /// `needs_tgp_reduction` is true and must then hold at least `tgp_mem_size`
  /// elements.
  static METAL_FUNC void run(
      const device T* mat [[buffer(0)]],
      const device T* in_vec [[buffer(1)]],
      const device T* bias [[buffer(2)]],
      device T* out_vec [[buffer(3)]],
      const constant int& in_vec_size [[buffer(4)]],
      const constant int& out_vec_size [[buffer(5)]],
      const constant int& matrix_ld [[buffer(6)]],
      const constant float& alpha [[buffer(7)]],
      const constant float& beta [[buffer(8)]],
      const constant int& bias_stride [[buffer(14)]],
      threadgroup AccT* tgp_memory [[threadgroup(0)]],
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]],
      uint simd_gid [[simdgroup_index_in_threadgroup]],
      uint simd_lid [[thread_index_in_simdgroup]]) {
    // Appease compiler
    (void)lid;

    // Thread local accumulation results
    thread AccT result[TM] = {0};
    thread T inter[TN];
    thread AccT v_coeff[TN];

    // Position of this thread inside its simdgroup (row, col)
    const int thrM = SN != 32 ? simd_lid / SN : 0;
    const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);

    // Column index of this simdgroup inside the threadgroup
    const int sgN = BN != 1 ? (simd_gid % BN) : 0;

    // Thread-granular origin of this simdgroup inside the threadgroup
    const int simdM = BN != 1 ? SM * (simd_gid / BN) : int(SM * simd_gid);
    const int simdN = BN != 1 ? SN * (simd_gid % BN) : 0;

    // Element-granular origin of this thread's (TM, TN) tile
    int bm = (simdM + thrM) * TM;
    int bn = (simdN + thrN) * TN;

    // Block position
    int out_row = tid.x * blockM + bm;

    // Exit simdgroup if rows out of bound
    if (out_row >= out_vec_size)
      return;

    // Adjust tail simdgroup to ensure in bound reads
    out_row = out_row + TM <= out_vec_size ? out_row : out_vec_size - TM;

    // Advance matrix. The product is formed in 64 bits so that the row offset
    // does not overflow `int` for matrices with more than 2^31 elements.
    mat += static_cast<int64_t>(out_row) * matrix_ld;

    constexpr const uniform<int> loop_stride = make_uniform(blockN);
    const uniform<int> in_size = make_uniform(in_vec_size);
    const uniform<int> n_iter = in_size / loop_stride;
    const uniform<int> last_iter = loop_stride * n_iter;
    const uniform<int> leftover = in_size - last_iter;

    // Loop over in_vec in blocks of blockN
    for (int i = 0; i < n_iter; ++i) {
      load_unsafe<AccT>(in_vec, v_coeff, bn);

      // Per thread work loop
      int mat_offset = 0;
      MLX_MTL_PRAGMA_UNROLL
      for (int tm = 0; tm < TM; tm++) {
        // Load for the row
        load_unsafe(mat, inter, mat_offset + bn);

        // Accumulate results
        MLX_MTL_PRAGMA_UNROLL
        for (int tn = 0; tn < TN; tn++) {
          result[tm] += inter[tn] * v_coeff[tn];
        }

        mat_offset += matrix_ld;
      }

      bn += blockN;
    }

    // Tail block: in_vec_size is not a multiple of blockN, so use the
    // bounds-checked loads (out-of-range elements contribute zero)
    if (leftover > 0) {
      load_safe<AccT>(in_vec, v_coeff, bn, in_size);

      // Per thread work loop
      MLX_MTL_PRAGMA_UNROLL
      for (int tm = 0; tm < TM; tm++) {
        // Load for the row
        load_safe(&mat[tm * matrix_ld], inter, bn, in_size);

        // Accumulate results
        MLX_MTL_PRAGMA_UNROLL
        for (int tn = 0; tn < TN; tn++) {
          result[tm] += inter[tn] * v_coeff[tn];
        }
      }
    }

    // Simdgroup accumulations: halving shuffle over the SN column lanes. Only
    // the value held by the thrN == 0 lane is consumed afterwards.
    MLX_MTL_PRAGMA_UNROLL
    for (int tm = 0; tm < TM; tm++) {
      MLX_MTL_PRAGMA_UNROLL
      for (ushort sn = (SN / 2); sn >= 1; sn >>= 1) {
        result[tm] += simd_shuffle_down(result[tm], sn);
      }
    }

    // Threadgroup accumulation results
    if (needs_tgp_reduction) {
      threadgroup AccT* tgp_results = tgp_memory + sgN * (blockM + TM) + bm;
      if (thrN == 0) {
        MLX_MTL_PRAGMA_UNROLL
        for (int tm = 0; tm < TM; tm++) {
          tgp_results[tm] = result[tm];
        }

        // The partial sums stored above are read by a different simdgroup
        // below, so the barrier must also order threadgroup-memory accesses;
        // mem_flags::mem_none would only be an execution barrier.
        // NOTE(review): the barrier is reached only by lanes with thrN == 0
        // that did not take the early return above - confirm this is
        // acceptable on all targeted GPUs.
        threadgroup_barrier(mem_flags::mem_threadgroup);

        if (sgN == 0) {
          // Column simdgroup 0 folds in the partial sums of its siblings
          MLX_MTL_PRAGMA_UNROLL
          for (int sgn = 1; sgn < BN; sgn++) {
            MLX_MTL_PRAGMA_UNROLL
            for (int tm = 0; tm < TM; tm++) {
              result[tm] += tgp_results[sgn * (blockM + TM) + tm];
            }
          }
        }
      }
    }

    // Write outputs
    if (simdN == 0 && thrN == 0) {
      MLX_MTL_PRAGMA_UNROLL
      for (int tm = 0; tm < TM; tm++) {
        if (kDoAxpby) {
          out_vec[out_row + tm] =
              static_cast<T>(alpha) * static_cast<T>(result[tm]) +
              static_cast<T>(beta) * bias[(out_row + tm) * bias_stride];
        } else {
          out_vec[out_row + tm] = static_cast<T>(result[tm]);
        }
      }
    }
  }
};

///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////

/// Threadgroup-level vector-matrix product.
///
/// Each threadgroup produces `blockN` consecutive entries of
/// `out_vec = in_vec * mat`; when `kDoAxpby` is set the stored value is
/// `alpha * (in_vec * mat) + beta * bias` instead. Accumulation happens in
/// `AccT` and the result is cast back to `T` on store.
template <
    typename T,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    const bool kDoAxpby, /* Do out = alpha * (vec * mat) + beta * bias */
    typename AccT = typename DefaultAccT<T>::type>
struct GEMVTKernel {
  using acc_type = AccT;

  MLX_MTL_CONST int threadsM = BM * SM;
  MLX_MTL_CONST int threadsN = BN * SN;

  MLX_MTL_CONST int blockM = threadsM * TM;
  MLX_MTL_CONST int blockN = threadsN * TN;

  static_assert(SM * SN == 32, "simdgroup can only have 32 threads");

  // - The matrix of size (M = in_vec_size, N = out_vec_size) is divided up
  //   into blocks of (blockM, blockN) divided among threadgroups
  // - Every thread works on a block of (TM, TN)
  // - We assume each threadgroup has (threadsN, threadsM, 1) threads
  //
  // 1. A thread loads TN elements each from mat along TM contiguous rows
  //    and the corresponding scalar from the vector
  // 2. The thread then accumulates its local result for the block
  // 3. At the end, each thread has accumulated results over all blocks across
  //    the rows. These are then summed up across the threadgroup
  // 4. Each threadgroup writes its accumulated BN * TN outputs
  //
  // Edge case handling:
  // - The threadgroup with the largest tid has blocks that exceed the matrix
  //   * The blocks that start outside the matrix are never read (thread results
  //     remain zero)
  //   * The last thread that partially overlaps with the matrix is shifted
  //     inwards such that the thread block fits exactly in the matrix

  // Scratch size (in AccT elements) for the cross-simdgroup reduction: one
  // slab of (blockN + TN) entries per row simdgroup. Zero when BM == 1.
  MLX_MTL_CONST short tgp_mem_size = BM > 1 ? BM*(blockN + TN) : 0;
  MLX_MTL_CONST bool needs_tgp_reduction = BM > 1;

  /// Computes this threadgroup's slice of the output vector.
  ///
  /// `mat`, `in_vec`, `bias` and `out_vec` must already point at the current
  /// batch element. `matrix_ld` is the element stride between consecutive
  /// matrix rows. `alpha`, `beta`, `bias` and `bias_stride` are only read when
  /// `kDoAxpby` is true. `tgp_memory` is only dereferenced when
  /// `needs_tgp_reduction` is true and must then hold at least `tgp_mem_size`
  /// elements.
  static METAL_FUNC void run(
      const device T* mat [[buffer(0)]],
      const device T* in_vec [[buffer(1)]],
      const device T* bias [[buffer(2)]],
      device T* out_vec [[buffer(3)]],
      const constant int& in_vec_size [[buffer(4)]],
      const constant int& out_vec_size [[buffer(5)]],
      const constant int& matrix_ld [[buffer(6)]],
      const constant float& alpha [[buffer(7)]],
      const constant float& beta [[buffer(8)]],
      const constant int& bias_stride [[buffer(14)]],
      threadgroup AccT* tgp_memory [[threadgroup(0)]],
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]],
      uint simd_gid [[simdgroup_index_in_threadgroup]],
      uint simd_lid [[thread_index_in_simdgroup]]) {
    // Appease compiler
    (void)lid;

    // Thread local accumulation results
    AccT result[TN] = {0};
    T inter[TN];
    AccT v_coeff[TM];

    // Position of this thread inside its simdgroup (row, col)
    const int thrM = SN != 32 ? simd_lid / SN : 0;
    const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);

    // Position of this simdgroup inside the threadgroup (row, col)
    const int sgM = BN != 1 ? (simd_gid / BN) : int(simd_gid);
    const int sgN = BN != 1 ? (simd_gid % BN) : 0;

    const int simdM = SM * sgM;
    const int simdN = SN * sgN;

    // Thread-granular coordinates inside the threadgroup
    int cm = (simdM + thrM);
    int cn = (simdN + thrN);

    // Element-granular origin of this thread's (TM, TN) tile
    int bm = cm * TM;
    int bn = cn * TN;

    int out_col = tid.x * blockN + bn;

    constexpr const uniform<int> loop_stride = make_uniform(blockM);
    const uniform<int> in_size = make_uniform(in_vec_size);
    const uniform<int> n_iter = in_size / loop_stride;
    const uniform<int> last_iter = loop_stride * n_iter;
    const uniform<int> leftover = in_size - last_iter;

    // Edgecase handling
    if (out_col < out_vec_size) {
      // Shift the last partially overlapping tile inwards so all reads and
      // writes stay inside the matrix / output vector
      out_col = out_col + TN < out_vec_size ? out_col : out_vec_size - TN;

      // Per thread accumulation main loop
      for (int i = 0; i < n_iter; ++i) {
        // Adding a threadgroup_barrier improves performance slightly,
        // possibly because it helps exploit the cache better
        threadgroup_barrier(mem_flags::mem_none);

        MLX_MTL_PRAGMA_UNROLL
        for (int tm = 0; tm < TM; tm++) {
          v_coeff[tm] = static_cast<AccT>(in_vec[bm + tm]);
        }

        // NOTE(review): the row offset (bm + tm) * matrix_ld is formed in
        // 32-bit int - confirm the host bounds the matrix size accordingly.
        MLX_MTL_PRAGMA_UNROLL
        for (int tm = 0; tm < TM; tm++) {
          auto vc = static_cast<AccT>(v_coeff[tm]);
          for (int tn = 0; tn < TN; tn++) {
            inter[tn] = mat[(bm + tm) * matrix_ld + out_col + tn];
          }
          for (int tn = 0; tn < TN; tn++) {
            result[tn] += vc * inter[tn];
          }
        }

        bm += blockM;
      }

      // Tail block: in_vec_size is not a multiple of blockM, so stop at the
      // last valid row
      if (leftover > 0) {
        for (int tm = 0; tm < TM && bm + tm < in_vec_size; tm++) {
          v_coeff[tm] = static_cast<AccT>(in_vec[bm + tm]);

          MLX_MTL_PRAGMA_UNROLL
          for (int tn = 0; tn < TN; tn++) {
            inter[tn] = mat[(bm + tm) * matrix_ld + out_col + tn];
          }

          MLX_MTL_PRAGMA_UNROLL
          for (int tn = 0; tn < TN; tn++) {
            result[tn] += v_coeff[tm] * inter[tn];
          }
        }
      }
    }

    // Simdgroup accumulations: halving shuffle over the SM row lanes (lanes
    // of the same column are SN apart). Only the value held by the thrM == 0
    // lanes is consumed afterwards.
    MLX_MTL_PRAGMA_UNROLL
    for (int tn = 0; tn < TN; tn++) {
      MLX_MTL_PRAGMA_UNROLL
      for (ushort sm = (SM / 2); sm >= 1; sm >>= 1) {
        result[tn] += simd_shuffle_down(result[tn], SN * sm);
      }
    }

    // Threadgroup accumulation results
    if (needs_tgp_reduction) {
      threadgroup AccT* tgp_results = tgp_memory + sgM * (blockN + TN) + bn;
      if (thrM == 0) {
        MLX_MTL_PRAGMA_UNROLL
        for (int tn = 0; tn < TN; tn++) {
          tgp_results[tn] = result[tn];
        }

        // The partial sums stored above are read by a different simdgroup
        // below, so the barrier must also order threadgroup-memory accesses;
        // mem_flags::mem_none would only be an execution barrier.
        threadgroup_barrier(mem_flags::mem_threadgroup);

        if (sgM == 0) {
          // Row simdgroup 0 folds in the partial sums of its siblings
          MLX_MTL_PRAGMA_UNROLL
          for (int sgm = 1; sgm < BM; sgm++) {
            MLX_MTL_PRAGMA_UNROLL
            for (int tn = 0; tn < TN; tn++) {
              result[tn] += tgp_results[sgm * (blockN + TN) + tn];
            }
          }
        }
      }
    }

    // Threadgroup accumulation and writing out results
    if (cm == 0 && out_col < out_vec_size) {
      MLX_MTL_PRAGMA_UNROLL
      for (int j = 0; j < TN; j++) {
        if (kDoAxpby) {
          out_vec[out_col + j] =
              static_cast<T>(alpha) * static_cast<T>(result[j]) +
              static_cast<T>(beta) * bias[(out_col + j) * bias_stride];
        } else {
          out_vec[out_col + j] = static_cast<T>(result[j]);
        }
      }
    }
  }
};

///////////////////////////////////////////////////////////////////////////////
/// Matrix vector multiplication
///////////////////////////////////////////////////////////////////////////////

/// Batched matrix-vector kernel entry point.
///
/// Offsets the input pointers to the batch element selected by `tid.z`, then
/// delegates the actual computation to `GEMVKernel::run`.
/// - `kDoNCBatch`: batch offsets are resolved through `elem_to_loc` using
///   `batch_shape` / `batch_ndim`; otherwise only the first batch stride of
///   each operand is used.
/// - `kDoAxpby`: `bias` is offset as well and combined with the product using
///   `alpha` / `beta` inside `GEMVKernel::run`.
template <
    typename T,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    const bool kDoNCBatch, /* Batch ndim > 1 */
    const bool kDoAxpby> /* Do out = alpha * out + beta * bias */
[[kernel, max_total_threads_per_threadgroup(BM * BN * 32)]] void gemv(
    const device T* mat [[buffer(0)]],
    const device T* in_vec [[buffer(1)]],
    const device T* bias [[buffer(2)]],
    device T* out_vec [[buffer(3)]],
    const constant int& in_vec_size [[buffer(4)]],
    const constant int& out_vec_size [[buffer(5)]],
    const constant int& marix_ld [[buffer(6)]],
    const constant float& alpha [[buffer(7)]],
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
    const constant int64_t* vector_batch_stride [[buffer(11)]],
    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const constant int64_t* bias_batch_stride [[buffer(13)]],
    const constant int& bias_stride [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using gemv_kernel = GEMVKernel<T, BM, BN, SM, SN, TM, TN, kDoAxpby>;

  // Scratch for the cross-simdgroup reduction; a 1-element dummy is declared
  // when the configuration needs none so the array size is never zero.
  threadgroup typename gemv_kernel::acc_type tgp_memory
      [gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];

  // Update batch offsets
  if (kDoNCBatch) {
    in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
    mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);

    if (kDoAxpby) {
      bias += elem_to_loc(tid.z, batch_shape, bias_batch_stride, batch_ndim);
    }

  } else {
    in_vec += tid.z * vector_batch_stride[0];
    mat += tid.z * matrix_batch_stride[0];

    if (kDoAxpby) {
      bias += tid.z * bias_batch_stride[0];
    }
  }

  // Each batch element owns out_vec_size consecutive outputs. Widen before
  // multiplying: `tid.z * out_vec_size` would otherwise be evaluated in 32-bit
  // unsigned arithmetic and wrap for outputs with 2^32 or more elements.
  out_vec += static_cast<int64_t>(tid.z) * out_vec_size;

  gemv_kernel::run(
      mat,
      in_vec,
      bias,
      out_vec,
      in_vec_size,
      out_vec_size,
      marix_ld,
      alpha,
      beta,
      bias_stride,
      gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
      tid,
      lid,
      simd_gid,
      simd_lid);
}

// Instantiates one `gemv` kernel. The host-visible name encodes every template
// argument:
//   gemv_<name>_bm<bm>_bn<bn>_sm<sm>_sn<sn>_tm<tm>_tn<tn>_nc<nc>_axpby<axpby>
// `nc` and `axpby` occupy the kDoNCBatch / kDoAxpby positions of the template
// argument list.
#define instantiate_gemv_helper(                                      \
    name, itype, bm, bn, sm, sn, tm, tn, nc, axpby)                   \
  instantiate_kernel(                                                 \
      "gemv_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn "_tm" #tm \
      "_tn" #tn "_nc" #nc "_axpby" #axpby,                            \
      gemv,                                                           \
      itype,                                                          \
      bm,                                                             \
      bn,                                                             \
      sm,                                                             \
      sn,                                                             \
      tm,                                                             \
      tn,                                                             \
      nc,                                                             \
      axpby)

// All four (nc, axpby) flag combinations for one tile configuration.
// clang-format off
#define instantiate_gemv(name, itype, bm, bn, sm, sn, tm, tn)        \
  instantiate_gemv_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 0) \
  instantiate_gemv_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 1) \
  instantiate_gemv_helper(name, itype, bm, bn, sm, sn, tm, tn, 1, 0) \
  instantiate_gemv_helper(name, itype, bm, bn, sm, sn, tm, tn, 1, 1) // clang-format on

// Tile configurations (bm, bn, sm, sn, tm, tn) compiled for every element
// type. Each satisfies sm * sn == 32, as GEMVKernel statically asserts.
// clang-format off
#define instantiate_gemv_blocks(name, itype) \
  instantiate_gemv(name, itype, 1,  8, 1, 32, 4, 4) \
  instantiate_gemv(name, itype, 1,  8, 1, 32, 1, 4) \
  instantiate_gemv(name, itype, 1,  1, 8,  4, 4, 4) \
  instantiate_gemv(name, itype, 1,  1, 8,  4, 1, 4) \
  instantiate_gemv(name, itype, 4,  1, 1, 32, 1, 4) \
  instantiate_gemv(name, itype, 4,  1, 1, 32, 4, 4) \
  instantiate_gemv(name, itype, 8,  1, 1, 32, 4, 4) // clang-format on

// Element types for which the gemv kernels are emitted.
instantiate_gemv_blocks(float32, float);
instantiate_gemv_blocks(float16, half);
instantiate_gemv_blocks(bfloat16, bfloat16_t);
instantiate_gemv_blocks(complex64, complex64_t);

/// Gathered matrix-vector kernel entry point.
///
/// For the batch element `tid.z`, looks up which vector and which matrix to
/// multiply in `vec_indices` / `mat_indices`, offsets `in_vec` and `mat`
/// accordingly and delegates to `GEMVKernel::run`.
///
/// `index_batch_strides` holds `batch_ndim` strides for `vec_indices` followed
/// by `batch_ndim` strides for `mat_indices`.
///
/// The kernel is instantiated with kDoAxpby == false, so `bias`, `alpha` and
/// `beta` are forwarded but not read by `GEMVKernel::run`.
template <
    typename T,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN> /* Thread cols (in elements) */
[[kernel, max_total_threads_per_threadgroup(BM * BN * 32)]] void gemv_gather(
    const device T* mat [[buffer(0)]],
    const device T* in_vec [[buffer(1)]],
    const device T* bias [[buffer(2)]],
    device T* out_vec [[buffer(3)]],
    const constant int& in_vec_size [[buffer(4)]],
    const constant int& out_vec_size [[buffer(5)]],
    const constant int& marix_ld [[buffer(6)]],
    const constant float& alpha [[buffer(7)]],
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
    const constant int64_t* index_batch_strides [[buffer(11)]],
    const constant int& vector_batch_ndim [[buffer(12)]],
    const constant int* vector_batch_shape [[buffer(13)]],
    const constant int64_t* vector_batch_stride [[buffer(14)]],
    const constant int& matrix_batch_ndim [[buffer(15)]],
    const constant int* matrix_batch_shape [[buffer(16)]],
    const constant int64_t* matrix_batch_stride [[buffer(17)]],
    const constant uint32_t* vec_indices [[buffer(18)]],
    const constant uint32_t* mat_indices [[buffer(19)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using gemv_kernel = GEMVKernel<T, BM, BN, SM, SN, TM, TN, false>;

  // Scratch for the cross-simdgroup reduction; a 1-element dummy is declared
  // when the configuration needs none so the array size is never zero.
  threadgroup typename gemv_kernel::acc_type tgp_memory
      [gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];

  uint32_t indx_vec;
  uint32_t indx_mat;

  // Update batch offsets: locate the index entries for this batch element
  if (batch_ndim > 1) {
    const constant auto* veci_bstrides = index_batch_strides;
    const constant auto* mati_bstrides = index_batch_strides + batch_ndim;

    ulong2 batch_offsets = elem_to_loc_broadcast(
        tid.z, batch_shape, veci_bstrides, mati_bstrides, batch_ndim);

    indx_vec = vec_indices[batch_offsets.x];
    indx_mat = mat_indices[batch_offsets.y];

  } else {
    indx_vec = vec_indices[index_batch_strides[0] * tid.z];
    indx_mat = mat_indices[index_batch_strides[batch_ndim] * tid.z];
  }

  // Move to the gathered vector
  if (vector_batch_ndim > 1) {
    in_vec += elem_to_loc(
        indx_vec, vector_batch_shape, vector_batch_stride, vector_batch_ndim);
  } else {
    in_vec += indx_vec * vector_batch_stride[0];
  }

  // Move to the gathered matrix
  if (matrix_batch_ndim > 1) {
    mat += elem_to_loc(
        indx_mat, matrix_batch_shape, matrix_batch_stride, matrix_batch_ndim);
  } else {
    mat += indx_mat * matrix_batch_stride[0];
  }

  // Each batch element owns out_vec_size consecutive outputs. Widen before
  // multiplying: `tid.z * out_vec_size` would otherwise be evaluated in 32-bit
  // unsigned arithmetic and wrap for outputs with 2^32 or more elements.
  out_vec += static_cast<int64_t>(tid.z) * out_vec_size;

  gemv_kernel::run(
      mat,
      in_vec,
      bias,
      out_vec,
      in_vec_size,
      out_vec_size,
      marix_ld,
      alpha,
      beta,
      batch_ndim, // Not used
      gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
      tid,
      lid,
      simd_gid,
      simd_lid);
}

// Instantiates one `gemv_gather` kernel. The host-visible name is
//   gemv_gather_<nm>_bm<bm>_bn<bn>_sm<sm>_sn<sn>_tm<tm>_tn<tn>
// clang-format off
#define instantiate_gemv_bs_helper(nm, itype, bm, bn, sm, sn, tm, tn) \
  instantiate_kernel(                                                 \
    "gemv_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm                  \
                       "_sn" #sn "_tm" #tm "_tn" #tn,                 \
    gemv_gather, itype, bm, bn, sm, sn, tm, tn)

// Tile configurations (bm, bn, sm, sn, tm, tn) compiled for the gathered
// kernel; all of them use bn == 1.
#define instantiate_gemv_bs_blocks(name, itype)              \
  instantiate_gemv_bs_helper(name, itype, 4, 1, 1, 32, 1, 4) \
  instantiate_gemv_bs_helper(name, itype, 4, 1, 1, 32, 4, 4) \
  instantiate_gemv_bs_helper(name, itype, 8, 1, 1, 32, 4, 4) // clang-format on

// Element types for which the gemv_gather kernels are emitted.
instantiate_gemv_bs_blocks(float32, float);
instantiate_gemv_bs_blocks(float16, half);
instantiate_gemv_bs_blocks(bfloat16, bfloat16_t);
instantiate_gemv_bs_blocks(complex64, complex64_t);

///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////

/// Batched vector-matrix kernel entry point.
///
/// Offsets the input pointers to the batch element selected by `tid.z`, then
/// delegates the actual computation to `GEMVTKernel::run`.
/// - `kDoNCBatch`: batch offsets are resolved through `elem_to_loc` using
///   `batch_shape` / `batch_ndim`; otherwise only the first batch stride of
///   each operand is used.
/// - `kDoAxpby`: `bias` is offset as well and combined with the product using
///   `alpha` / `beta` inside `GEMVTKernel::run`.
template <
    typename T,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    const bool kDoNCBatch, /* Batch ndim > 1 */
    const bool kDoAxpby> /* Do out = alpha * out + beta * bias */
[[kernel, max_total_threads_per_threadgroup(BM * BN * 32)]] void gemv_t(
    const device T* mat [[buffer(0)]],
    const device T* in_vec [[buffer(1)]],
    const device T* bias [[buffer(2)]],
    device T* out_vec [[buffer(3)]],
    const constant int& in_vec_size [[buffer(4)]],
    const constant int& out_vec_size [[buffer(5)]],
    const constant int& marix_ld [[buffer(6)]],
    const constant float& alpha [[buffer(7)]],
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
    const constant int64_t* vector_batch_stride [[buffer(11)]],
    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const constant int64_t* bias_batch_stride [[buffer(13)]],
    const constant int& bias_stride [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using gemv_kernel = GEMVTKernel<T, BM, BN, SM, SN, TM, TN, kDoAxpby>;

  // Scratch for the cross-simdgroup reduction; a 1-element dummy is declared
  // when the configuration needs none so the array size is never zero.
  threadgroup typename gemv_kernel::acc_type tgp_memory
      [gemv_kernel::tgp_mem_size == 0 ? 1 : gemv_kernel::tgp_mem_size];

  // Update batch offsets
  if (kDoNCBatch) {
    in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
    mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);

    if (kDoAxpby) {
      bias += elem_to_loc(tid.z, batch_shape, bias_batch_stride, batch_ndim);
    }

  } else {
    in_vec += tid.z * vector_batch_stride[0];
    mat += tid.z * matrix_batch_stride[0];

    if (kDoAxpby) {
      bias += tid.z * bias_batch_stride[0];
    }
  }

  // Each batch element owns out_vec_size consecutive outputs. Widen before
  // multiplying: `tid.z * out_vec_size` would otherwise be evaluated in 32-bit
  // unsigned arithmetic and wrap for outputs with 2^32 or more elements.
  out_vec += static_cast<int64_t>(tid.z) * out_vec_size;

  gemv_kernel::run(
      mat,
      in_vec,
      bias,
      out_vec,
      in_vec_size,
      out_vec_size,
      marix_ld,
      alpha,
      beta,
      bias_stride,
      gemv_kernel::tgp_mem_size == 0 ? nullptr : tgp_memory,
      tid,
      lid,
      simd_gid,
      simd_lid);
}

// Instantiates one `gemv_t` kernel. The host-visible name encodes every
// template argument:
//   gemv_t_<name>_bm<bm>_bn<bn>_sm<sm>_sn<sn>_tm<tm>_tn<tn>_nc<nc>_axpby<axpby>
// `nc` and `axpby` occupy the kDoNCBatch / kDoAxpby positions of the template
// argument list.
// clang-format off
#define instantiate_gemv_t_helper(                          \
    name, itype, bm, bn, sm, sn, tm, tn, nc, axpby)         \
  instantiate_kernel(                                       \
    "gemv_t_" #name "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn \
       "_tm" #tm "_tn" #tn "_nc" #nc "_axpby" #axpby,       \
  gemv_t, itype, bm, bn, sm, sn, tm, tn, nc, axpby)

// All four (nc, axpby) flag combinations for one tile configuration.
#define instantiate_gemv_t(name, itype, bm, bn, sm, sn, tm, tn)        \
  instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 0) \
  instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 0, 1) \
  instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 1, 0) \
  instantiate_gemv_t_helper(name, itype, bm, bn, sm, sn, tm, tn, 1, 1) // clang-format on

// Tile configurations (bm, bn, sm, sn, tm, tn) compiled for every element
// type. All of them use bm == 1, so GEMVTKernel::needs_tgp_reduction is false
// for every kernel emitted here.
// clang-format off
#define instantiate_gemv_t_blocks(name, itype) \
  instantiate_gemv_t(name, itype, 1, 2,  8, 4, 4, 1) \
  instantiate_gemv_t(name, itype, 1, 2,  8, 4, 4, 4) \
  instantiate_gemv_t(name, itype, 1, 4,  8, 4, 4, 4) \
  instantiate_gemv_t(name, itype, 1, 16, 8, 4, 4, 4) \
  instantiate_gemv_t(name, itype, 1, 16, 4, 8, 4, 4) // clang-format on

// Element types for which the gemv_t kernels are emitted.
// clang-format off
instantiate_gemv_t_blocks(float32, float);
instantiate_gemv_t_blocks(float16, half);
instantiate_gemv_t_blocks(bfloat16, bfloat16_t);
instantiate_gemv_t_blocks(complex64, complex64_t); // clang-format on

/// Vector-matrix product in which every batch element selects its operands
/// through index arrays.
///
/// For the batch element `tid.z`, one entry is read from `vec_indices` and one
/// from `mat_indices`. Those entries are converted into element offsets into
/// `in_vec` and `mat` using the operand batch shapes / strides, and the tile
/// computation is then delegated to `GEMVTKernel::run` (instantiated with its
/// final boolean template argument set to `false`).
///
/// `index_batch_strides` stores `batch_ndim` strides for `vec_indices`
/// immediately followed by `batch_ndim` strides for `mat_indices`.
template <
    typename T,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN> /* Thread cols (in elements) */
[[kernel, max_total_threads_per_threadgroup(BM * BN * 32)]] void gemv_t_gather(
    const device T* mat [[buffer(0)]],
    const device T* in_vec [[buffer(1)]],
    const device T* bias [[buffer(2)]],
    device T* out_vec [[buffer(3)]],
    const constant int& in_vec_size [[buffer(4)]],
    const constant int& out_vec_size [[buffer(5)]],
    const constant int& marix_ld [[buffer(6)]],
    const constant float& alpha [[buffer(7)]],
    const constant float& beta [[buffer(8)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
    const constant int64_t* index_batch_strides [[buffer(11)]],
    const constant int& vector_batch_ndim [[buffer(12)]],
    const constant int* vector_batch_shape [[buffer(13)]],
    const constant int64_t* vector_batch_stride [[buffer(14)]],
    const constant int& matrix_batch_ndim [[buffer(15)]],
    const constant int* matrix_batch_shape [[buffer(16)]],
    const constant int64_t* matrix_batch_stride [[buffer(17)]],
    const constant uint32_t* vec_indices [[buffer(18)]],
    const constant uint32_t* mat_indices [[buffer(19)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using gemv_impl = GEMVTKernel<T, BM, BN, SM, SN, TM, TN, false>;

  // Scratch for the cross-simdgroup reduction; a single dummy element is
  // declared when the kernel configuration needs none.
  threadgroup typename gemv_impl::acc_type tgp_memory
      [gemv_impl::tgp_mem_size == 0 ? 1 : gemv_impl::tgp_mem_size];

  // Every batch element writes its own run of out_vec_size outputs.
  out_vec += tid.z * out_vec_size;

  // Step 1: read the gather indices that belong to this batch element.
  uint32_t vec_batch_idx;
  uint32_t mat_batch_idx;

  if (batch_ndim > 1) {
    const constant auto* vec_idx_strides = index_batch_strides;
    const constant auto* mat_idx_strides = index_batch_strides + batch_ndim;

    const ulong2 idx_locs = elem_to_loc_broadcast(
        tid.z, batch_shape, vec_idx_strides, mat_idx_strides, batch_ndim);

    vec_batch_idx = vec_indices[idx_locs.x];
    mat_batch_idx = mat_indices[idx_locs.y];
  } else {
    vec_batch_idx = vec_indices[index_batch_strides[0] * tid.z];
    mat_batch_idx = mat_indices[index_batch_strides[batch_ndim] * tid.z];
  }

  // Step 2: translate the gathered indices into operand offsets.
  if (matrix_batch_ndim > 1) {
    mat += elem_to_loc(
        mat_batch_idx,
        matrix_batch_shape,
        matrix_batch_stride,
        matrix_batch_ndim);
  } else {
    mat += mat_batch_idx * matrix_batch_stride[0];
  }

  if (vector_batch_ndim > 1) {
    in_vec += elem_to_loc(
        vec_batch_idx,
        vector_batch_shape,
        vector_batch_stride,
        vector_batch_ndim);
  } else {
    in_vec += vec_batch_idx * vector_batch_stride[0];
  }

  // Step 3: run the tile kernel on the selected operands.
  gemv_impl::run(
      mat,
      in_vec,
      bias,
      out_vec,
      in_vec_size,
      out_vec_size,
      marix_ld,
      alpha,
      beta,
      batch_ndim, // Placeholder argument; the value is not used
      gemv_impl::tgp_mem_size == 0 ? nullptr : tgp_memory,
      tid,
      lid,
      simd_gid,
      simd_lid);
}

// clang-format off
// Instantiates a single `gemv_t_gather` specialization through
// `instantiate_kernel`. The host-visible kernel name encodes the type tag and
// every tile parameter; the same values are forwarded as template arguments.
#define instantiate_gemv_t_bs_helper(                  \
    nm, itype, bm, bn, sm, sn, tm, tn)                 \
  instantiate_kernel(                                  \
    "gemv_t_gather_" #nm "_bm" #bm "_bn" #bn "_sm" #sm \
       "_sn" #sn "_tm" #tm "_tn" #tn,                  \
  gemv_t_gather, itype, bm, bn, sm, sn, tm, tn)

// The five (bm, bn, sm, sn, tm, tn) tile configurations that are compiled for
// each data type.
#define instantiate_gemv_t_bs_blocks(name, itype)              \
  instantiate_gemv_t_bs_helper(name, itype, 1,  2, 8, 4, 4, 1) \
  instantiate_gemv_t_bs_helper(name, itype, 1,  2, 8, 4, 4, 4) \
  instantiate_gemv_t_bs_helper(name, itype, 1,  4, 8, 4, 4, 4) \
  instantiate_gemv_t_bs_helper(name, itype, 1, 16, 8, 4, 4, 4) \
  instantiate_gemv_t_bs_helper(name, itype, 1, 16, 4, 8, 4, 4) // clang-format on

// clang-format off
// Emit the `gemv_t_gather` kernels for each supported element type. The first
// argument is the tag used in the kernel name, the second is the Metal type.
instantiate_gemv_t_bs_blocks(float32, float);
instantiate_gemv_t_bs_blocks(float16, half);
instantiate_gemv_t_bs_blocks(bfloat16, bfloat16_t);
instantiate_gemv_t_bs_blocks(complex64, complex64_t); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/gemv_masked.h
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/backend/metal/kernels/steel/utils.h"

using namespace metal;

// Qualifier bundle used for the compile-time constants declared inside the
// kernel structs below.
#define MLX_MTL_CONST static constant constexpr const
// Placed directly before a loop to ask clang to unroll it completely.
#define MLX_MTL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")

/// Tag type standing in for "no mask was supplied".
///
/// Whatever address-space overload is selected, converting an instance to
/// `bool` yields `true`, so code that tests a mask entry treats every entry of
/// an absent mask as enabled.
struct _NoMask {
  char x;

  // Non-const overload.
  constexpr METAL_FUNC operator bool() { return true; }

  // Const overloads, one per address-space qualifier.
  constexpr METAL_FUNC operator bool() const threadgroup { return true; }
  constexpr METAL_FUNC operator bool() const device { return true; }
  constexpr METAL_FUNC operator bool() const constant { return true; }
};

using nomask_t = _NoMask;

/// Functor that converts a value to `OutT` and multiplies it by a stored
/// scale factor.
template <typename OutT, typename InT = OutT>
struct ScaleOp {
  // Factor applied by `apply`.
  OutT scale;

  // Returns `x` converted to `OutT`, multiplied by `scale`.
  METAL_FUNC OutT apply(InT x) const {
    const OutT converted = static_cast<OutT>(x);
    return converted * scale;
  }
};

// Block-masked matrix-vector product.
//
// Computes out_vec[r] = sum_k mat[r * matrix_ld + k] * in_vec[k], where whole
// blocks of the computation can be switched off (bool masks) or scaled
// (non-bool masks):
//   - out_mask_t: type of the output mask, or nomask_t when there is none.
//   - op_mask_t:  type shared by the matrix and vector masks, or nomask_t.
// A mask whose element type is `bool` only gates work; any other element type
// additionally multiplies the affected values by the mask entry.
//
// Accumulation is performed in AccT and converted to T on the final store.
template <
    typename T,
    typename out_mask_t,
    typename op_mask_t,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    typename AccT = float>
struct GEMVKernel {
  MLX_MTL_CONST int threadsM = BM * SM;
  MLX_MTL_CONST int threadsN = BN * SN;

  MLX_MTL_CONST int blockM = threadsM * TM;
  MLX_MTL_CONST int blockN = threadsN * TN;

  static_assert(SM * SN == 32, "simdgroup can only have 32 threads");

  static_assert(
      SN == 8 || SN == 16 || SN == 32,
      "gemv block must have a width of 8, 16, or 32");

  static_assert(blockN >= blockM, "Masked gemv must have blockN >= blockM");

  // Which masks are present at all
  MLX_MTL_CONST bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
  MLX_MTL_CONST bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;

  // Which masks are multiplicative (present and not bool)
  MLX_MTL_CONST bool has_mul_operand_mask =
      has_operand_mask && !metal::is_same_v<op_mask_t, bool>;
  MLX_MTL_CONST bool has_mul_output_mask =
      has_output_mask && !metal::is_same_v<out_mask_t, bool>;

  // - The matrix of size (M = out_vec_size, K = in_vec_size) is divided up
  //   into blocks of (blockM, blockN) divided among threadgroups
  // - Every thread works on a block of (TM, TN)
  // - We assume each threadgroup has (threadsN, threadsM, 1) threads
  //
  // 1. A thread loads TN elements each from mat along TM rows
  //    and the corresponding scalar from the vector
  // 2. The thread then multiplies and adds to accumulate its local result for
  //    the block
  // 3. At the end, each thread has accumulated results over all blocks across
  //    the rows. These are then summed up across the threadgroup
  // 4. Each threadgroup writes its accumulated blockM outputs
  //
  // Edge case handling:
  // - The threadgroup with the largest tid has blocks that exceed the matrix
  //   * The blocks that start outside the matrix are never read (thread results
  //     remain zero)
  //   * The last thread that partially overlaps with the matrix is shifted
  //     inwards such that the thread block fits exactly in the matrix

  // Number of AccT scratch elements needed for the cross-simdgroup reduction;
  // zero when a single simdgroup spans the columns (BN == 1).
  MLX_MTL_CONST short tgp_mem_size = BN > 1 ? BN*(blockM + TM) : 0;
  MLX_MTL_CONST bool needs_tgp_reduction = BN > 1;

  // Copies TN consecutive elements starting at src[src_offset] into dst,
  // converting each to U. Performs no bounds checking.
  template <typename U = T>
  static METAL_FUNC void
  load_unsafe(const device T* src, thread U dst[TN], const int src_offset = 0) {
    MLX_MTL_PRAGMA_UNROLL
    for (int tn = 0; tn < TN; tn++) {
      dst[tn] = static_cast<U>(src[src_offset + tn]);
    }
  }

  // Same as load_unsafe, except that positions at or beyond src_size are not
  // read from src and are filled with U(0) instead.
  template <typename U = T>
  static METAL_FUNC void load_safe(
      const device T* src,
      thread U dst[TN],
      const int src_offset = 0,
      const int src_size = TN) {
    if (src_offset + TN <= src_size) {
      MLX_MTL_PRAGMA_UNROLL
      for (int tn = 0; tn < TN; tn++) {
        dst[tn] = static_cast<U>(src[src_offset + tn]);
      }
    } else { // Edgecase
      MLX_MTL_PRAGMA_UNROLL
      for (int tn = 0; tn < TN; tn++) {
        dst[tn] = src_offset + tn < src_size
            ? static_cast<U>(src[src_offset + tn])
            : U(0);
      }
    }
  }

  // Computes this thread's share of one (blockM x in_vec_size) stripe and
  // writes the TM finished rows to out_vec.
  //
  // mat / in_vec / out_vec must already point at the current batch element.
  // matrix_ld is the element distance between consecutive rows of mat.
  // mask_strides packs two ints per mask that is present, in the order
  // output mask, matrix mask, vector mask.
  // tgp_memory must provide tgp_mem_size elements when BN > 1; it is not
  // touched otherwise.
  // tid.x selects the row block handled by the threadgroup.
  static METAL_FUNC void run(
      const device T* mat [[buffer(0)]],
      const device T* in_vec [[buffer(1)]],
      device T* out_vec [[buffer(3)]],
      const constant int& in_vec_size [[buffer(4)]],
      const constant int& out_vec_size [[buffer(5)]],
      const constant int& matrix_ld [[buffer(6)]],
      const device out_mask_t* out_mask [[buffer(20)]],
      const device op_mask_t* mat_mask [[buffer(21)]],
      const device op_mask_t* vec_mask [[buffer(22)]],
      const constant int* mask_strides [[buffer(23)]],
      threadgroup AccT* tgp_memory [[threadgroup(0)]],
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]],
      uint simd_gid [[simdgroup_index_in_threadgroup]],
      uint simd_lid [[thread_index_in_simdgroup]]) {
    // Appease compiler
    (void)lid;

    // Thread local accumulation results
    thread AccT result[TM] = {0};
    thread T inter[TN];
    thread AccT v_coeff[TN];

    // Position of the thread inside its simdgroup
    const int thrM = SN != 32 ? simd_lid / SN : 0;
    const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);

    // Column index of the simdgroup inside the threadgroup
    const int sgN = BN != 1 ? (simd_gid % BN) : 0;

    // Thread-granular origin of the simdgroup inside the threadgroup
    const int simdM = BN != 1 ? SM * (simd_gid / BN) : int(SM * simd_gid);
    const int simdN = BN != 1 ? SN * (simd_gid % BN) : 0;

    // Element-granular origin of this thread's (TM, TN) tile
    int bm = (simdM + thrM) * TM;
    int bn = (simdN + thrN) * TN;

    // Block position
    int out_row = tid.x * blockM + bm;

    // Exit simdgroup if rows out of bound
    if (out_row >= out_vec_size)
      return;

    // Adjust tail simdgroup to ensure in bound reads
    out_row = out_row + TM <= out_vec_size ? out_row : out_vec_size - TM;

    // Prepare mask offsets
    const constant int* out_mask_strides = mask_strides;
    const constant int* mat_mask_strides =
        mask_strides + (has_output_mask ? 2 : 0);
    const constant int* vec_mask_strides =
        mat_mask_strides + (has_operand_mask ? 2 : 0);

    // Index of the mask block along the output rows
    const int m_block_idx = blockN > blockM ? out_row / blockN : int(tid.x);

    const int out_mask_offset =
        !has_output_mask ? 0 : m_block_idx * out_mask_strides[1];

    // Operand mask cursors; they advance by one step per blockN-wide iteration
    int mat_mask_offset =
        !has_operand_mask ? 0 : m_block_idx * mat_mask_strides[1];
    int vec_mask_offset = 0;
    const int mat_mask_step = !has_operand_mask ? 0 : mat_mask_strides[0];
    const int vec_mask_step = !has_operand_mask ? 0 : vec_mask_strides[1];

    T out_scale{1};

    // Check output mask
    if (has_output_mask) {
      auto mask_out = out_mask[out_mask_offset];

      // Write zeros and return if mask is 0
      if (!mask_out) {
        if (simdN == 0 && thrN == 0) {
          MLX_MTL_PRAGMA_UNROLL
          for (int tm = 0; tm < TM; tm++) {
            out_vec[out_row + tm] = T(0.);
          }
        }

        return;
      }

      // Store scalar if multiplicative mask
      if (has_mul_output_mask) {
        out_scale = T(mask_out);
      }
    }

    // Advance matrix
    mat += out_row * matrix_ld;

    // Prepare for loop
    constexpr const uniform<int> loop_stride = make_uniform(blockN);
    const uniform<int> in_size = make_uniform(in_vec_size);
    const uniform<int> n_iter = in_size / loop_stride;
    const uniform<int> last_iter = loop_stride * n_iter;
    const uniform<int> leftover = in_size - last_iter;

    // Loop over in_vec in blocks of blockN
    for (int i = 0; i < n_iter; ++i) {
      // Skip the block entirely when either operand mask entry is zero
      if (!has_operand_mask ||
          (bool(mat_mask[mat_mask_offset]) &&
           bool(vec_mask[vec_mask_offset]))) {
        T block_scale{1};
        if (has_mul_operand_mask) {
          block_scale =
              T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
        }

        load_unsafe<AccT>(in_vec, v_coeff, bn);

        // Apply scale
        if (has_mul_operand_mask) {
          MLX_MTL_PRAGMA_UNROLL
          for (int tn = 0; tn < TN; tn++) {
            v_coeff[tn] *= block_scale;
          }
        }

        // Per thread work loop
        int mat_offset = 0;
        MLX_MTL_PRAGMA_UNROLL
        for (int tm = 0; tm < TM; tm++) {
          // Load for the row
          load_unsafe(mat, inter, mat_offset + bn);

          // Accumulate results
          MLX_MTL_PRAGMA_UNROLL
          for (int tn = 0; tn < TN; tn++) {
            result[tm] += inter[tn] * v_coeff[tn];
          }

          mat_offset += matrix_ld;
        }
      }

      bn += blockN;
      mat_mask_offset += mat_mask_step;
      vec_mask_offset += vec_mask_step;
    }

    // Final partial block: same work as above with bounds-checked loads
    if (leftover > 0) {
      if (!has_operand_mask ||
          (bool(mat_mask[mat_mask_offset]) &&
           bool(vec_mask[vec_mask_offset]))) {
        T block_scale{1};
        if (has_mul_operand_mask) {
          block_scale =
              T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
        }

        load_safe<AccT>(in_vec, v_coeff, bn, in_size);

        // Apply scale
        if (has_mul_operand_mask) {
          MLX_MTL_PRAGMA_UNROLL
          for (int tn = 0; tn < TN; tn++) {
            v_coeff[tn] *= block_scale;
          }
        }

        // Per thread work loop
        MLX_MTL_PRAGMA_UNROLL
        for (int tm = 0; tm < TM; tm++) {
          // Load for the row
          load_safe(&mat[tm * matrix_ld], inter, bn, in_size);

          // Accumulate results
          MLX_MTL_PRAGMA_UNROLL
          for (int tn = 0; tn < TN; tn++) {
            result[tm] += inter[tn] * v_coeff[tn];
          }
        }
      }
    }

    // Apply out scale
    if (has_mul_output_mask) {
      MLX_MTL_PRAGMA_UNROLL
      for (int tm = 0; tm < TM; tm++) {
        result[tm] *= out_scale;
      }
    }

    // Simdgroup accumulations
    MLX_MTL_PRAGMA_UNROLL
    for (int tm = 0; tm < TM; tm++) {
      MLX_MTL_PRAGMA_UNROLL
      for (ushort sn = (SN / 2); sn >= 1; sn >>= 1) {
        result[tm] += simd_shuffle_down(result[tm], sn);
      }
    }

    // Threadgroup accumulation results
    if (needs_tgp_reduction) {
      threadgroup AccT* tgp_results = tgp_memory + sgN * (blockM + TM) + bm;
      if (thrN == 0) {
        MLX_MTL_PRAGMA_UNROLL
        for (int tm = 0; tm < TM; tm++) {
          tgp_results[tm] = result[tm];
        }

        // The partial sums stored above are read by another simdgroup below,
        // so the barrier must also order threadgroup-memory accesses; a pure
        // execution barrier (mem_none) does not provide that ordering.
        // NOTE(review): this barrier is reached only by threads with
        // thrN == 0 that did not return early above - confirm that this is
        // acceptable for every configuration that sets BN > 1.
        threadgroup_barrier(mem_flags::mem_threadgroup);

        if (sgN == 0) {
          MLX_MTL_PRAGMA_UNROLL
          for (int sgn = 1; sgn < BN; sgn++) {
            MLX_MTL_PRAGMA_UNROLL
            for (int tm = 0; tm < TM; tm++) {
              result[tm] += tgp_results[sgn * (blockM + TM) + tm];
            }
          }
        }
      }
    }

    // Write outputs
    if (simdN == 0 && thrN == 0) {
      MLX_MTL_PRAGMA_UNROLL
      for (int tm = 0; tm < TM; tm++) {
        out_vec[out_row + tm] = static_cast<T>(result[tm]);
      }
    }
  }
};

///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////

// Block-masked vector-matrix product.
//
// Computes out_vec[n] = sum_m in_vec[m] * mat[m * marix_ld + n], where whole
// blocks of the computation can be switched off (bool masks) or scaled
// (non-bool masks):
//   - out_mask_t: type of the output mask, or nomask_t when there is none.
//   - op_mask_t:  type shared by the matrix and vector masks, or nomask_t.
// A mask whose element type is `bool` only gates work; any other element type
// additionally multiplies the affected values by the mask entry.
//
// Accumulation is performed in AccT and converted to T on the final store.
template <
    typename T,
    typename out_mask_t,
    typename op_mask_t,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    typename AccT = float>
struct GEMVTKernel {
  MLX_MTL_CONST int threadsM = BM * SM;
  MLX_MTL_CONST int threadsN = BN * SN;

  MLX_MTL_CONST int blockM = threadsM * TM;
  MLX_MTL_CONST int blockN = threadsN * TN;

  static_assert(SM * SN == 32, "simdgroup can only have 32 threads");

  // Which masks are present at all
  MLX_MTL_CONST bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
  MLX_MTL_CONST bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;

  // Which masks are multiplicative (present and not bool)
  MLX_MTL_CONST bool has_mul_operand_mask =
      has_operand_mask && !metal::is_same_v<op_mask_t, bool>;
  MLX_MTL_CONST bool has_mul_output_mask =
      has_output_mask && !metal::is_same_v<out_mask_t, bool>;

  // - The matrix of size (M = in_vec_size, N = out_vec_size) is divided up
  //   into blocks of (blockM, blockN) divided among threadgroups
  // - Every thread works on a block of (TM, TN)
  // - We assume each threadgroup has (threadsN, threadsM, 1) threads
  //
  // 1. A thread loads TN elements each from mat along TM contiguous rows
  //    and the corresponding scalar from the vector
  // 2. The thread then accumulates its local result for the block
  // 3. At the end, each thread has accumulated results over all blocks across
  //    the rows. These are then summed up across the threadgroup
  // 4. Each threadgroup writes its accumulated BN * TN outputs
  //
  // Edge case handling:
  // - The threadgroup with the largest tid has blocks that exceed the matrix
  //   * The blocks that start outside the matrix are never read (thread results
  //     remain zero)
  //   * The last thread that partially overlaps with the matrix is shifted
  //     inwards such that the thread block fits exactly in the matrix

  // Number of AccT scratch elements needed for the cross-simdgroup reduction;
  // zero when a single simdgroup spans the rows (BM == 1).
  MLX_MTL_CONST short tgp_mem_size = BM > 1 ? BM*(blockN + TN) : 0;
  MLX_MTL_CONST bool needs_tgp_reduction = BM > 1;

  // Computes this thread's share of one (in_vec_size x blockN) stripe and
  // writes the TN finished columns to out_vec.
  //
  // mat / in_vec / out_vec must already point at the current batch element.
  // marix_ld is the element distance between consecutive rows of mat.
  // mask_strides packs two ints per mask that is present, in the order
  // output mask, matrix mask, vector mask.
  // tgp_memory must provide tgp_mem_size elements when BM > 1; it is not
  // touched otherwise.
  // tid.x selects the column block handled by the threadgroup.
  static METAL_FUNC void run(
      const device T* mat [[buffer(0)]],
      const device T* in_vec [[buffer(1)]],
      device T* out_vec [[buffer(3)]],
      const constant int& in_vec_size [[buffer(4)]],
      const constant int& out_vec_size [[buffer(5)]],
      const constant int& marix_ld [[buffer(6)]],
      const device out_mask_t* out_mask [[buffer(20)]],
      const device op_mask_t* mat_mask [[buffer(21)]],
      const device op_mask_t* vec_mask [[buffer(22)]],
      const constant int* mask_strides [[buffer(23)]],
      threadgroup AccT* tgp_memory [[threadgroup(0)]],
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]],
      uint simd_gid [[simdgroup_index_in_threadgroup]],
      uint simd_lid [[thread_index_in_simdgroup]]) {
    // Appease compiler
    (void)lid;

    // Thread local accumulation results
    AccT result[TN] = {0};
    T inter[TN];
    AccT v_coeff[TM];

    // Position of the thread inside its simdgroup
    const int thrM = SN != 32 ? simd_lid / SN : 0;
    const int thrN = SN != 32 ? simd_lid % SN : int(simd_lid);

    // Position of the simdgroup inside the threadgroup
    const int sgM = BN != 1 ? (simd_gid / BN) : int(simd_gid);
    const int sgN = BN != 1 ? (simd_gid % BN) : 0;

    const int simdM = SM * sgM;
    const int simdN = SN * sgN;

    // Thread-granular position inside the threadgroup
    int cm = (simdM + thrM);
    int cn = (simdN + thrN);

    // Element-granular origin of this thread's (TM, TN) tile
    int bm = cm * TM;
    int bn = cn * TN;

    int out_col = tid.x * blockN + bn;

    // Prepare mask offsets
    const constant int* out_mask_strides = mask_strides;
    const constant int* mat_mask_strides =
        out_mask_strides + (has_output_mask ? 2 : 0);
    const constant int* vec_mask_strides =
        mat_mask_strides + (has_operand_mask ? 2 : 0);

    // Index of the mask block along the output columns
    const int n_block_idx = blockM > blockN ? out_col / blockM : int(tid.x);

    const int out_mask_offset =
        !has_output_mask ? 0 : n_block_idx; // * out_mask_strides[0];

    // Operand mask cursors; they advance by one step per blockM-tall iteration
    int mat_mask_offset =
        !has_operand_mask ? 0 : n_block_idx * mat_mask_strides[0];
    int vec_mask_offset = 0;
    const int mat_mask_step = !has_operand_mask ? 0 : mat_mask_strides[1];
    const int vec_mask_step = !has_operand_mask ? 0 : vec_mask_strides[0];

    T out_scale{1};

    // Check output mask
    if (has_output_mask) {
      auto mask_out = out_mask[out_mask_offset];

      // Write zeros and return if mask is 0
      if (!mask_out) {
        if (cm == 0 && out_col < out_vec_size) {
          if (out_col + TN <= out_vec_size) {
            MLX_MTL_PRAGMA_UNROLL
            for (int tn = 0; tn < TN; tn++) {
              out_vec[out_col + tn] = T(0.);
            }
          } else {
            for (int tn = 0; tn < TN && (out_col + tn) < out_vec_size; tn++) {
              out_vec[out_col + tn] = T(0.);
            }
          }
        }

        return;
      }

      // Store scalar if multiplicative mask
      if (has_mul_output_mask) {
        out_scale = T(mask_out);
      }
    }

    // Prepare for loop
    constexpr const uniform<int> loop_stride = make_uniform(blockM);
    const uniform<int> in_size = make_uniform(in_vec_size);
    const uniform<int> n_iter = in_size / loop_stride;
    const uniform<int> last_iter = loop_stride * n_iter;
    const uniform<int> leftover = in_size - last_iter;

    // Edgecase handling
    if (out_col < out_vec_size) {
      // Shift the last partially overlapping tile inwards so it fits exactly
      out_col = (out_col + TN) <= out_vec_size ? out_col : out_vec_size - TN;

      // Per thread accumulation main loop
      for (int i = 0; i < n_iter; ++i) {
        // Adding a threadgroup_barrier improves performance slightly,
        // possibly because it helps exploit the cache better. No threadgroup
        // memory is exchanged here, so an execution-only barrier is enough.
        threadgroup_barrier(mem_flags::mem_none);

        // Skip the block entirely when either operand mask entry is zero
        if (!has_operand_mask ||
            (bool(mat_mask[mat_mask_offset]) &&
             bool(vec_mask[vec_mask_offset]))) {
          T block_scale{1};
          if (has_mul_operand_mask) {
            block_scale =
                T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
          }

          MLX_MTL_PRAGMA_UNROLL
          for (int tm = 0; tm < TM; tm++) {
            v_coeff[tm] = static_cast<AccT>(in_vec[bm + tm]);
          }

          // Apply scale
          if (has_mul_operand_mask) {
            MLX_MTL_PRAGMA_UNROLL
            for (int tm = 0; tm < TM; tm++) {
              v_coeff[tm] *= block_scale;
            }
          }

          MLX_MTL_PRAGMA_UNROLL
          for (int tm = 0; tm < TM; tm++) {
            for (int tn = 0; tn < TN; tn++) {
              inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
            }
            for (int tn = 0; tn < TN; tn++) {
              result[tn] += v_coeff[tm] * inter[tn];
            }
          }
        }

        bm += blockM;
        mat_mask_offset += mat_mask_step;
        vec_mask_offset += vec_mask_step;
      }

      // Final partial block: only rows that lie inside in_vec are processed
      if (leftover > 0) {
        if (!has_operand_mask ||
            (bool(mat_mask[mat_mask_offset]) &&
             bool(vec_mask[vec_mask_offset]))) {
          T block_scale{1};
          if (has_mul_operand_mask) {
            block_scale =
                T(mat_mask[mat_mask_offset]) * T(vec_mask[vec_mask_offset]);
          }

          for (int tm = 0; tm < TM && bm + tm < in_vec_size; tm++) {
            v_coeff[tm] = static_cast<AccT>(in_vec[bm + tm]);

            if (has_mul_operand_mask) {
              v_coeff[tm] *= block_scale;
            }

            MLX_MTL_PRAGMA_UNROLL
            for (int tn = 0; tn < TN; tn++) {
              inter[tn] = mat[(bm + tm) * marix_ld + out_col + tn];
            }

            MLX_MTL_PRAGMA_UNROLL
            for (int tn = 0; tn < TN; tn++) {
              result[tn] += v_coeff[tm] * inter[tn];
            }
          }
        }
      }
    }

    // Apply out scale
    if (has_mul_output_mask) {
      MLX_MTL_PRAGMA_UNROLL
      for (int tn = 0; tn < TN; tn++) {
        result[tn] *= out_scale;
      }
    }

    // Simdgroup accumulations
    MLX_MTL_PRAGMA_UNROLL
    for (int tn = 0; tn < TN; tn++) {
      MLX_MTL_PRAGMA_UNROLL
      for (ushort sm = (SM / 2); sm >= 1; sm >>= 1) {
        result[tn] += simd_shuffle_down(result[tn], SN * sm);
      }
    }

    // Threadgroup accumulation results
    if (needs_tgp_reduction) {
      threadgroup AccT* tgp_results = tgp_memory + sgM * (blockN + TN) + bn;
      if (thrM == 0) {
        MLX_MTL_PRAGMA_UNROLL
        for (int tn = 0; tn < TN; tn++) {
          tgp_results[tn] = result[tn];
        }

        // The partial sums stored above are read by another simdgroup below,
        // so the barrier must also order threadgroup-memory accesses; a pure
        // execution barrier (mem_none) does not provide that ordering.
        // NOTE(review): this barrier is reached only by threads with
        // thrM == 0 that did not return early above - confirm that this is
        // acceptable for every configuration that sets BM > 1.
        threadgroup_barrier(mem_flags::mem_threadgroup);

        if (sgM == 0) {
          MLX_MTL_PRAGMA_UNROLL
          for (int sgm = 1; sgm < BM; sgm++) {
            MLX_MTL_PRAGMA_UNROLL
            for (int tn = 0; tn < TN; tn++) {
              result[tn] += tgp_results[sgm * (blockN + TN) + tn];
            }
          }
        }
      }
    }

    // Threadgroup accumulation and writing out results
    if (cm == 0 && out_col < out_vec_size) {
      MLX_MTL_PRAGMA_UNROLL
      for (int j = 0; j < TN; j++) {
        out_vec[out_col + j] = static_cast<T>(result[j]);
      }
    }
  }
};

///////////////////////////////////////////////////////////////////////////////
/// Matrix vector multiplication
///////////////////////////////////////////////////////////////////////////////

/// Kernel entry point for the block-masked matrix-vector product.
///
/// Moves every operand pointer, and every mask pointer that is present, to
/// the batch element selected by `tid.z`, then hands the tile computation to
/// `GEMVKernel::run`.
///
/// `mask_batch_strides` is consumed as consecutive groups of `batch_ndim`
/// strides: the output mask first (only when an output mask is present),
/// followed by the matrix mask and then the vector mask.
///
/// When `kDoNCBatch` is set the offsets are derived from the full batch
/// shape; otherwise only the first stride of each operand is used.
template <
    typename T,
    typename out_mask_t,
    typename op_mask_t,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    const bool kDoNCBatch> /* Batch ndim > 1 */
[[kernel, max_total_threads_per_threadgroup(BM * BN * 32)]] void gemv_masked(
    const device T* mat [[buffer(0)]],
    const device T* in_vec [[buffer(1)]],
    device T* out_vec [[buffer(3)]],
    const constant int& in_vec_size [[buffer(4)]],
    const constant int& out_vec_size [[buffer(5)]],
    const constant int& marix_ld [[buffer(6)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
    const constant int64_t* vector_batch_stride [[buffer(11)]],
    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const device out_mask_t* out_mask [[buffer(20)]],
    const device op_mask_t* mat_mask [[buffer(21)]],
    const device op_mask_t* vec_mask [[buffer(22)]],
    const constant int* mask_strides [[buffer(23)]],
    const constant int64_t* mask_batch_strides [[buffer(24)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using gemv_impl =
      GEMVKernel<T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN>;

  // Scratch for the cross-simdgroup reduction; a single dummy element is
  // declared when the kernel configuration needs none.
  threadgroup float tgp_memory
      [gemv_impl::tgp_mem_size == 0 ? 1 : gemv_impl::tgp_mem_size];

  constexpr bool masks_output = !metal::is_same_v<out_mask_t, nomask_t>;
  constexpr bool masks_operands = !metal::is_same_v<op_mask_t, nomask_t>;

  // Every batch element writes its own run of out_vec_size outputs.
  out_vec += tid.z * out_vec_size;

  if (kDoNCBatch) {
    // Masks first: the output-mask strides (if any) are consumed before the
    // operand-mask strides are read.
    if (masks_output) {
      out_mask +=
          elem_to_loc(tid.z, batch_shape, mask_batch_strides, batch_ndim);
      mask_batch_strides += batch_ndim;
    }

    if (masks_operands) {
      const ulong2 mask_locs = elem_to_loc_broadcast(
          tid.z,
          batch_shape,
          mask_batch_strides,
          mask_batch_strides + batch_ndim,
          batch_ndim);

      mat_mask += mask_locs.x;
      vec_mask += mask_locs.y;
    }

    // Operands
    mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);
    in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
  } else {
    // Masks first, in the same stride-group order as above.
    if (masks_output) {
      out_mask += tid.z * mask_batch_strides[0];
      mask_batch_strides += batch_ndim;
    }

    if (masks_operands) {
      mat_mask += tid.z * mask_batch_strides[0];
      vec_mask += tid.z * mask_batch_strides[batch_ndim];
    }

    // Operands
    mat += tid.z * matrix_batch_stride[0];
    in_vec += tid.z * vector_batch_stride[0];
  }

  gemv_impl::run(
      mat,
      in_vec,
      out_vec,
      in_vec_size,
      out_vec_size,
      marix_ld,
      out_mask,
      mat_mask,
      vec_mask,
      mask_strides,
      gemv_impl::tgp_mem_size == 0 ? nullptr : tgp_memory,
      tid,
      lid,
      simd_gid,
      simd_lid);
}

///////////////////////////////////////////////////////////////////////////////
/// Vector matrix multiplication
///////////////////////////////////////////////////////////////////////////////

/// Kernel entry point for the block-masked vector-matrix product.
///
/// Moves every operand pointer, and every mask pointer that is present, to
/// the batch element selected by `tid.z`, then hands the tile computation to
/// `GEMVTKernel::run`.
///
/// `mask_batch_strides` is consumed as consecutive groups of `batch_ndim`
/// strides: the output mask first (only when an output mask is present),
/// followed by the matrix mask and then the vector mask.
///
/// When `kDoNCBatch` is set the offsets are derived from the full batch
/// shape; otherwise only the first stride of each operand is used.
template <
    typename T,
    typename out_mask_t,
    typename op_mask_t,
    const int BM, /* Threadgroup rows (in simdgroups) */
    const int BN, /* Threadgroup cols (in simdgroups) */
    const int SM, /* Simdgroup rows (in threads) */
    const int SN, /* Simdgroup cols (in threads) */
    const int TM, /* Thread rows (in elements) */
    const int TN, /* Thread cols (in elements) */
    const bool kDoNCBatch> /* Batch ndim > 1 */
[[kernel, max_total_threads_per_threadgroup(BM * BN * 32)]] void gemv_t_masked(
    const device T* mat [[buffer(0)]],
    const device T* in_vec [[buffer(1)]],
    device T* out_vec [[buffer(3)]],
    const constant int& in_vec_size [[buffer(4)]],
    const constant int& out_vec_size [[buffer(5)]],
    const constant int& marix_ld [[buffer(6)]],
    const constant int& batch_ndim [[buffer(9)]],
    const constant int* batch_shape [[buffer(10)]],
    const constant int64_t* vector_batch_stride [[buffer(11)]],
    const constant int64_t* matrix_batch_stride [[buffer(12)]],
    const device out_mask_t* out_mask [[buffer(20)]],
    const device op_mask_t* mat_mask [[buffer(21)]],
    const device op_mask_t* vec_mask [[buffer(22)]],
    const constant int* mask_strides [[buffer(23)]],
    const constant int64_t* mask_batch_strides [[buffer(24)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using gemv_impl =
      GEMVTKernel<T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN>;

  // Scratch for the cross-simdgroup reduction; a single dummy element is
  // declared when the kernel configuration needs none.
  threadgroup float tgp_memory
      [gemv_impl::tgp_mem_size == 0 ? 1 : gemv_impl::tgp_mem_size];

  constexpr bool masks_output = !metal::is_same_v<out_mask_t, nomask_t>;
  constexpr bool masks_operands = !metal::is_same_v<op_mask_t, nomask_t>;

  // Every batch element writes its own run of out_vec_size outputs.
  out_vec += tid.z * out_vec_size;

  if (kDoNCBatch) {
    // Masks first: the output-mask strides (if any) are consumed before the
    // operand-mask strides are read.
    if (masks_output) {
      out_mask +=
          elem_to_loc(tid.z, batch_shape, mask_batch_strides, batch_ndim);
      mask_batch_strides += batch_ndim;
    }

    if (masks_operands) {
      const ulong2 mask_locs = elem_to_loc_broadcast(
          tid.z,
          batch_shape,
          mask_batch_strides,
          mask_batch_strides + batch_ndim,
          batch_ndim);

      mat_mask += mask_locs.x;
      vec_mask += mask_locs.y;
    }

    // Operands
    mat += elem_to_loc(tid.z, batch_shape, matrix_batch_stride, batch_ndim);
    in_vec += elem_to_loc(tid.z, batch_shape, vector_batch_stride, batch_ndim);
  } else {
    // Masks first, in the same stride-group order as above.
    if (masks_output) {
      out_mask += tid.z * mask_batch_strides[0];
      mask_batch_strides += batch_ndim;
    }

    if (masks_operands) {
      mat_mask += tid.z * mask_batch_strides[0];
      vec_mask += tid.z * mask_batch_strides[batch_ndim];
    }

    // Operands
    mat += tid.z * matrix_batch_stride[0];
    in_vec += tid.z * vector_batch_stride[0];
  }

  gemv_impl::run(
      mat,
      in_vec,
      out_vec,
      in_vec_size,
      out_vec_size,
      marix_ld,
      out_mask,
      mat_mask,
      vec_mask,
      mask_strides,
      gemv_impl::tgp_mem_size == 0 ? nullptr : tgp_memory,
      tid,
      lid,
      simd_gid,
      simd_lid);
}


================================================
FILE: mlx/backend/metal/kernels/gemv_masked.metal
================================================
// Copyright © 2023-2024 Apple Inc.

// clang-format off
#include <metal_simdgroup>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/gemv_masked.h"

// Instantiates a single `gemv_masked` kernel specialization.
// The kernel name string is assembled by concatenating the stringified
// output-mask name, operand-mask name, element type name, the six tile
// parameters (bm, bn, sm, sn, tm, tn) and the `nc` flag.
// `instantiate_kernel` is not defined in this file (presumably it comes from
// utils.h -- confirm).
#define instantiate_gemv_helper(                                           \
    outm_n, outm_t, opm_n, opm_t, name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_kernel(                                                      \
    "gemv_outmask_" #outm_n "_opmask_" #opm_n "_" #name                    \
      "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn "_tm" #tm                    \
      "_tn" #tn "_nc" #nc,                                                 \
  gemv_masked, itype, outm_t, opm_t, bm, bn, sm, sn, tm, tn, nc)

// Expands to the eight supported mask combinations for one tile
// configuration: output mask and operand mask each range over
// {bool, itype, nomask_t}, with the (nomask, nomask) pair left out.
#define instantiate_gemv_base(name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_helper(bool_, bool, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_helper(name, itype, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_helper(bool_, bool, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_helper(name, itype, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_helper(nomask, nomask_t, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_helper(nomask, nomask_t, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_helper(bool_, bool, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_helper(name, itype, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc)

// Instantiates every mask combination twice, once with nc = 0 and once with
// nc = 1 (presumably the non-contiguous-batch template flag -- confirm
// against gemv_masked.h).
#define instantiate_gemv(name, itype, bm, bn, sm, sn, tm, tn)   \
  instantiate_gemv_base(name, itype, bm, bn, sm, sn, tm, tn, 0) \
  instantiate_gemv_base(name, itype, bm, bn, sm, sn, tm, tn, 1)

// The five (bm, bn, sm, sn, tm, tn) tile configurations instantiated for
// each element type.
#define instantiate_gemv_blocks(name, itype) \
  instantiate_gemv(name, itype, 2, 1, 4,  8, 1, 4) \
  instantiate_gemv(name, itype, 2, 1, 4,  8, 4, 4) \
  instantiate_gemv(name, itype, 2, 1, 2, 16, 1, 4) \
  instantiate_gemv(name, itype, 2, 1, 2, 16, 4, 4) \
  instantiate_gemv(name, itype, 4, 1, 2, 16, 4, 4)

// Element types for which the masked gemv kernels are instantiated.
instantiate_gemv_blocks(float32, float);
instantiate_gemv_blocks(float16, half);
instantiate_gemv_blocks(bfloat16, bfloat16_t);

// Instantiates a single `gemv_t_masked` kernel specialization.
// The kernel name string uses the "gemv_t_" prefix followed by the stringified
// output-mask name, operand-mask name, element type name, the six tile
// parameters (bm, bn, sm, sn, tm, tn) and the `nc` flag.
#define instantiate_gemv_t_helper(                                           \
    outm_n, outm_t, opm_n, opm_t, name, itype, bm, bn, sm, sn, tm, tn, nc)   \
  instantiate_kernel(                                                        \
    "gemv_t_outmask_" #outm_n "_opmask_" #opm_n "_" #name                    \
      "_bm" #bm "_bn" #bn "_sm" #sm "_sn" #sn "_tm" #tm                      \
      "_tn" #tn "_nc" #nc,                                                   \
  gemv_t_masked, itype, outm_t, opm_t, bm, bn, sm, sn, tm, tn, nc)

// Expands to the eight supported mask combinations for one tile
// configuration: output mask and operand mask each range over
// {bool, itype, nomask_t}, with the (nomask, nomask) pair left out.
#define instantiate_gemv_t_base(name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_t_helper(bool_, bool, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_t_helper(name, itype, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_t_helper(bool_, bool, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_t_helper(name, itype, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc)      \
  instantiate_gemv_t_helper(nomask, nomask_t, name, itype, name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_t_helper(nomask, nomask_t, bool_, bool, name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_t_helper(bool_, bool, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc) \
  instantiate_gemv_t_helper(name, itype, nomask, nomask_t, name, itype, bm, bn, sm, sn, tm, tn, nc)

// Instantiates every mask combination twice, once with nc = 0 and once with
// nc = 1 (presumably the non-contiguous-batch template flag -- confirm
// against gemv_masked.h).
#define instantiate_gemv_t(name, itype, bm, bn, sm, sn, tm, tn)   \
  instantiate_gemv_t_base(name, itype, bm, bn, sm, sn, tm, tn, 0) \
  instantiate_gemv_t_base(name, itype, bm, bn, sm, sn, tm, tn, 1)

// The six (bm, bn, sm, sn, tm, tn) tile configurations instantiated for
// each element type.
#define instantiate_gemv_t_blocks(name, itype) \
  instantiate_gemv_t(name, itype, 1, 1,  8, 4, 4, 1) \
  instantiate_gemv_t(name, itype, 1, 2,  8, 4, 4, 4) \
  instantiate_gemv_t(name, itype, 1, 1,  8, 4, 8, 1) \
  instantiate_gemv_t(name, itype, 1, 1,  8, 4, 8, 4) \
  instantiate_gemv_t(name, itype, 1, 2,  8, 4, 8, 4) \
  instantiate_gemv_t(name, itype, 1, 4,  8, 4, 8, 4)

// Element types for which the masked transposed gemv kernels are instantiated.
instantiate_gemv_t_blocks(float32, float);
instantiate_gemv_t_blocks(float16, half);
instantiate_gemv_t_blocks(bfloat16, bfloat16_t); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/hadamard.h
================================================
// Copyright © 2024 Apple Inc.
#include <metal_common>
#include <metal_compute>

#include "mlx/backend/metal/kernels/steel/defines.h"

using namespace metal;

// In-place butterfly (Hadamard) transform over the R values pointed to by `x`.
// The number of stages is taken as the count of trailing zero bits of R, so R
// is expected to be a power of two. Each stage combines R / 2 pairs of
// elements that are `span` apart into their sum and difference, and `span`
// doubles after every stage.
template <short R>
METAL_FUNC void radix_func(thread float* x) {
  constexpr short num_stages = __builtin_ctz(R);
  constexpr short num_pairs = R / 2;
  short span = 1;
  STEEL_PRAGMA_UNROLL
  for (short stage = 0; stage < num_stages; stage++) {
    STEEL_PRAGMA_UNROLL
    for (short pair = 0; pair < num_pairs; pair++) {
      // Position of the pair inside its group of `span` elements.
      const short within = pair & (span - 1);
      const short lo = ((pair - within) << 1) + within;
      const short hi = lo + span;
      const float a = x[lo];
      const float b = x[hi];
      x[lo] = a + b;
      x[hi] = a - b;
    }
    span <<= 1;
  }
}

// Hadamard transform of size N computed cooperatively by N / max_radix
// threads through a threadgroup buffer.
//
// Template parameters:
//   T          element type of the input and output buffers
//   N          transform size (the stage counts are derived from
//              __builtin_ctz(N), so N is expected to be a power of two)
//   max_radix  number of values each thread transforms per stage
//   read_width number of consecutive elements a thread copies per step in the
//              stride == 1 load/store path
//   stride     spacing, in elements, between consecutive transform elements
//              in `in` / `out`
//
// Arguments:
//   in, out  input and output buffers
//   scale    factor applied to the output in the stride == 1 write path only
//   elem     thread position; elem.x is the thread's index within the
//            transform, elem.y and elem.z select the batch offset
//   grid     threads per grid (not referenced in this kernel)
template <typename T, int N, int max_radix, int read_width, int stride = 1>
[[kernel]] void hadamard_n(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    constant const float& scale,
    uint3 elem [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  // Compute a Hadamard transform of size N = 2^k
  //
  // Equivalent to:
  //    from scipy.linalg import hadamard
  //    y = hadamard(len(x)) @ x

  // num_steps full radix-`max_radix` stages are followed by one stage of
  // radix `final_radix` when logN is not a multiple of logR.
  constexpr short num_threads = N / max_radix;
  constexpr short logN = __builtin_ctz(N);
  constexpr short logR = __builtin_ctz(max_radix);
  constexpr short num_steps = logN / logR;
  constexpr short logFinal = logN % logR;
  constexpr short final_radix = 1 << (logFinal);

  // Offset of the first element of this thread's transform.
  int batch_idx = elem.y * N * stride + elem.z;
  short i = elem.x;

  // Staging buffer holding the whole transform in threadgroup memory.
  threadgroup T buf[N];

  // Read values from device
  if (stride == 1) {
    // Contiguous input: each thread copies runs of `read_width` consecutive
    // elements, with runs of neighbouring threads adjacent to each other.
    STEEL_PRAGMA_UNROLL
    for (short j = 0; j < max_radix / read_width; j++) {
      short index = j * read_width * num_threads + i * read_width;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < read_width; r++) {
        buf[index + r] = in[batch_idx + index + r];
      }
    }
  } else {
    // Strided input: element e of the transform lives at in[batch_idx + e * stride].
    STEEL_PRAGMA_UNROLL
    for (short j = 0; j < max_radix; j++) {
      buf[j * num_threads + i] = in[batch_idx + (j * num_threads + i) * stride];
    }
  }

  // All loads must be visible before any thread starts its butterflies.
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Per-thread working values; `h` is the distance between the elements a
  // thread combines in the current stage.
  float x[max_radix];
  short h = 1;

  STEEL_PRAGMA_UNROLL
  for (short s = 0; s < num_steps; s++) {
    // `j` is the first of the max_radix elements (spaced h apart) owned by
    // this thread in the current stage.
    short k = i & (h - 1);
    short j = ((i - k) << logR) + k;

    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < max_radix; r++) {
      x[r] = buf[j + h * r];
    }

    radix_func<max_radix>(x);

    // Results are converted back to T when stored in the staging buffer.
    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < max_radix; r++) {
      buf[j + h * r] = T(x[r]);
    }

    h <<= logR;
    threadgroup_barrier(mem_flags::mem_threadgroup);
  }

  // Do the final radix
  // e.g. max_radix = 16
  //      N = 1024 = 16 * 16 * 4
  if (final_radix > 1) {
    // Each thread does multiple butterflies
    STEEL_PRAGMA_UNROLL
    for (int t = 0; t < max_radix / final_radix; t++) {
      short index = i + t * num_threads;
      short k = index & (h - 1);
      short j = ((index - k) << logFinal) + k;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < final_radix; r++) {
        x[r] = buf[j + h * r];
      }

      radix_func<final_radix>(x);

      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < final_radix; r++) {
        buf[j + h * r] = T(x[r]);
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
  }

  // Write values to device
  if (stride == 1) {
    STEEL_PRAGMA_UNROLL
    for (short j = 0; j < max_radix / read_width; j++) {
      short index = j * read_width * num_threads + i * read_width;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < read_width; r++) {
        out[batch_idx + index + r] = T(buf[index + r] * scale);
      }
    }
  } else {
    // NOTE(review): the strided path stores the unscaled result; presumably
    // `scale` is applied by another pass in this case -- confirm against the
    // host-side dispatch.
    STEEL_PRAGMA_UNROLL
    for (short j = 0; j < max_radix; j++) {
      out[batch_idx + (j * num_threads + i) * stride] =
          buf[j * num_threads + i];
    }
  }
}

// Size-M Hadamard codelet applied across M rows of length N.
//
// This is the second stage of a transform of size M * N (N = 2^k). Each
// thread gathers `read_width` adjacent columns from each of the M rows of
// its batch, runs the size-M transform on every column, and writes the
// scaled results back to the same positions in `out`.
template <typename T, int N, int M, int read_width>
[[kernel]] void hadamard_m(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    constant const float& scale,
    uint3 elem [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  // Number of threads needed to cover one row of N elements.
  constexpr int threads_per_row = N / read_width;

  // Flatten the 2D thread position, then split it into the column group
  // handled by this thread and the start of its batch.
  const int flat_tid = elem.x * grid.y + elem.y;
  const short i = flat_tid % threads_per_row;
  const int batch_idx = flat_tid / threads_per_row * M * N;
  const int col_offset = i * read_width;

  // vals[r] holds the M values of the r-th column owned by this thread.
  float vals[read_width][M];
  STEEL_PRAGMA_UNROLL
  for (short row = 0; row < M; row++) {
    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < read_width; r++) {
      vals[r][row] = in[batch_idx + row * N + col_offset + r];
    }
  }

  STEEL_PRAGMA_UNROLL
  for (short r = 0; r < read_width; r++) {
    // hadamard_radix_m is JIT compiled for M from the Hadamard matrix
    // strings in `metal/hadamard.cpp`.
    hadamard_radix_m(vals[r]);
  }

  // Store the transformed columns, applying the output scale.
  STEEL_PRAGMA_UNROLL
  for (short row = 0; row < M; row++) {
    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < read_width; r++) {
      out[batch_idx + row * N + col_offset + r] = T(vals[r][row] * scale);
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/indexing/gather.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/indexing/indexing.h"

// Gathers one element of `src` into `out` for the calling thread.
//
// The source location is the sum of
//   * the offset selected by the NIDX index arrays (one per entry of `axes`),
//   * the offset of the thread's position inside the slice (index.z).
// index.x / index.y address the element of the index arrays; how they are
// used depends on IDX_NDIM (0: no index position, 1: index.x only, otherwise
// index.x for the leading dimension and index.y for the remaining ones).
template <typename T, typename IdxT, int NIDX, int IDX_NDIM, typename LocT>
METAL_FUNC void gather_impl(
    const device T* src [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant int* src_shape [[buffer(2)]],
    const constant int64_t* src_strides [[buffer(3)]],
    const constant size_t& src_ndim [[buffer(4)]],
    const constant int* slice_sizes [[buffer(5)]],
    const constant int* axes [[buffer(6)]],
    const thread Indices<IdxT, NIDX>& indices,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  // Offset into `src` contributed by the gathered index values.
  LocT gathered_offset = 0;
  for (int i = 0; i < NIDX; ++i) {
    // Location, inside the i-th index buffer, of the value to read.
    LocT idx_loc = 0;
    if (IDX_NDIM != 0) {
      idx_loc = index.x * static_cast<LocT>(indices.strides[indices.ndim * i]);
      if (IDX_NDIM != 1) {
        // The trailing index dimensions are addressed through index.y.
        idx_loc += indices.row_contiguous[i]
            ? index.y
            : elem_to_loc<LocT>(
                  index.y,
                  &indices.shapes[indices.ndim * i + 1],
                  &indices.strides[indices.ndim * i + 1],
                  indices.ndim - 1);
      }
    }

    const auto ax = axes[i];
    // Negative indices are wrapped relative to the size of the source axis.
    const auto idx_val =
        offset_neg_idx(indices.buffers[i][idx_loc], src_shape[ax]);
    gathered_offset +=
        static_cast<LocT>(idx_val) * static_cast<LocT>(src_strides[ax]);
  }

  // Offset of this thread's element inside the gathered slice.
  const auto slice_offset =
      elem_to_loc<LocT>(index.z, slice_sizes, src_strides, src_ndim);

  // The output is written densely: slice position fastest, then the index
  // position(s).
  LocT out_idx = index.z;
  if (IDX_NDIM == 1) {
    out_idx += static_cast<LocT>(grid_dim.z) * index.x;
  } else if (IDX_NDIM >= 2) {
    out_idx += grid_dim.z * (index.x * static_cast<LocT>(grid_dim.y) + index.y);
  }
  out[out_idx] = src[slice_offset + gathered_offset];
}


================================================
FILE: mlx/backend/metal/kernels/indexing/gather_axis.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

// Gathers along a single axis: for every output position, reads an index
// from `indices` and copies the matching element of `src`.
//
// Thread layout: index.y is the position along the gathered axis, index.x
// the position after the axis and index.z the position before it. SrcC and
// IdxC select the fast path for contiguous `src` / `indices`; otherwise the
// location is resolved with elem_to_loc and the supplied strides.
template <typename T, typename IdxT, typename LocT, bool SrcC, bool IdxC>
[[kernel]] void gather_axis(
    const device T* src [[buffer(0)]],
    const device IdxT* indices [[buffer(1)]],
    device T* out [[buffer(2)]],
    const constant int* shape [[buffer(3)]],
    const constant int64_t* src_strides [[buffer(4)]],
    const constant int64_t* idx_strides [[buffer(5)]],
    const constant size_t& ndim [[buffer(6)]],
    const constant int& axis [[buffer(7)]],
    const constant int& axis_size [[buffer(8)]],
    const constant size_t& src_ax_stride [[buffer(9)]],
    const constant size_t& idx_ax_stride [[buffer(10)]],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  const LocT outer_base = index.z * static_cast<LocT>(grid_dim.x);
  // Output location without the contribution of the gathered axis.
  const LocT out_base = outer_base * grid_dim.y + index.x;

  // Where to read the index value.
  LocT idx_loc = index.y * static_cast<LocT>(idx_ax_stride);
  if (IdxC) {
    idx_loc += out_base;
  } else {
    idx_loc +=
        elem_to_loc<LocT>(outer_base + index.x, shape, idx_strides, ndim);
  }

  // Wrap negative indices (signed index types only).
  auto idx_val = indices[idx_loc];
  if (is_signed_v<IdxT>) {
    if (idx_val < 0) {
      idx_val = idx_val + axis_size;
    }
  }

  // Where to read the source element.
  LocT src_idx = idx_val * static_cast<LocT>(src_ax_stride);
  if (SrcC) {
    src_idx += outer_base * axis_size + index.x;
  } else {
    src_idx +=
        elem_to_loc<LocT>(outer_base + index.x, shape, src_strides, ndim);
  }

  const LocT out_idx = out_base + index.y * static_cast<LocT>(grid_dim.x);
  out[out_idx] = src[src_idx];
}


================================================
FILE: mlx/backend/metal/kernels/indexing/gather_front.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/indexing/indexing.h"

// Gathers whole rows selected by an index along the leading axis.
//
// index.y selects the output row (and the entry of `indices` to read);
// index.x selects a group of up to N consecutive elements inside the row.
// `stride` is the row length in elements and `size` the extent of the
// indexed axis, used to wrap negative indices.
template <typename T, typename IdxT, typename LocT, int N>
[[kernel]] void gather_front(
    const device T* src,
    const device IdxT* indices,
    device T* out,
    const constant int64_t& stride,
    const constant int& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  const auto src_row = offset_neg_idx(indices[index.y], size);
  const LocT src_base = static_cast<LocT>(stride) * src_row;
  const LocT out_base = static_cast<LocT>(stride) * index.y;

  // Copy up to N elements, stopping at the end of the row.
  int col = N * index.x;
  for (int k = 0; k < N; ++k, ++col) {
    if (!(col < stride)) {
      break;
    }
    out[out_base + col] = src[src_base + col];
  }
}


================================================
FILE: mlx/backend/metal/kernels/indexing/indexing.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <metal_stdlib>

// Bundle of NIDX index arrays passed to the gather / scatter helpers.
template <typename IdxT, int NIDX>
struct Indices {
  // One device pointer per index array.
  const array<const device IdxT*, NIDX> buffers;
  // Shapes of all index arrays stored back to back, `ndim` entries per array
  // (array i starts at shapes[ndim * i]).
  const constant int* shapes;
  // Strides of all index arrays, laid out like `shapes`.
  const constant int64_t* strides;
  // row_contiguous[i] is true when index array i can be addressed with a
  // plain linear index instead of going through its strides.
  const constant bool* row_contiguous;
  // Number of dimensions of each index array.
  const int ndim;
};

// Maps a possibly negative index to its non-negative equivalent by adding
// `size`. Unsigned index types and non-negative values are returned as is.
template <typename IdxT>
METAL_FUNC size_t offset_neg_idx(IdxT idx, int size) {
  if (!is_unsigned_v<IdxT>) {
    if (idx < 0) {
      return idx + size;
    }
  }
  return idx;
}


================================================
FILE: mlx/backend/metal/kernels/indexing/masked_scatter.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

// Logger used to report out-of-range source reads from the kernel below.
constant mlx::os_log logger("mlx", "masked_assign");

// Writes `src` values into the positions of `out` where `mask` is true.
//
// One thread per element of `mask` / `out`. `scatter_offsets[idx]` is the
// position, inside the thread's batch, of the source element to copy;
// `mask_batch_size` and `src_batch_size` are the per-batch element counts of
// the mask and of the source. Threads whose mask entry is false, or whose
// source offset falls outside the batch, leave `out` untouched.
template <typename T, bool src_contiguous>
[[kernel]] void masked_assign_impl(
    const device bool* mask [[buffer(0)]],
    const device uint* scatter_offsets [[buffer(1)]],
    const device T* src [[buffer(2)]],
    device T* out [[buffer(3)]],
    const constant int* src_shapes [[buffer(4)]],
    const constant int64_t* src_strides [[buffer(5)]],
    const constant int& src_ndim [[buffer(6)]],
    const constant int64_t& src_batch_size [[buffer(7)]],
    const constant int64_t& mask_batch_size [[buffer(8)]],
    uint idx [[thread_position_in_grid]]) {
  // Nothing to do for unselected positions.
  if (!mask[idx]) {
    return;
  }

  // Refuse to read past the end of the source batch.
  const uint src_index = scatter_offsets[idx];
  if (src_index >= src_batch_size) {
    logger.log_debug("Out of bound read from src");
    return;
  }

  // Linear position of the source element across all batches.
  const uint batch_idx = idx / mask_batch_size;
  const auto src_elem = batch_idx * src_batch_size + src_index;

  if (src_contiguous) {
    out[idx] = src[src_elem];
  } else {
    out[idx] =
        src[elem_to_loc<uint>(src_elem, src_shapes, src_strides, src_ndim)];
  }
}


================================================
FILE: mlx/backend/metal/kernels/indexing/scatter.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/indexing/indexing.h"

// Scatters update values into `out` with the reduction `Op`, using atomic
// updates.
//
// gid.x is the position inside one update slice; gid.y selects a group of
// NWORK consecutive index positions handled by this thread. For every index
// position the destination is the slice offset plus, for each of the NIDX
// index arrays, the (wrapped) index value times the stride of the matching
// output axis.
template <
    typename T,
    typename IdxT,
    typename Op,
    int NIDX,
    bool UPD_ROW_CONTIG,
    int NWORK,
    typename LocT>
METAL_FUNC void scatter_impl(
    const device T* updates,
    device mlx_atomic<T>* out,
    const constant int* upd_shape,
    const constant int64_t* upd_strides,
    const constant size_t& upd_ndim,
    const constant size_t& upd_size,
    const constant int* out_shape,
    const constant int64_t* out_strides,
    const constant size_t& out_ndim,
    const constant int* axes,
    const constant size_t& idx_size,
    const thread Indices<IdxT, NIDX>& indices,
    uint2 gid [[thread_position_in_grid]]) {
  Op op;

  // Offset in `out` of this thread's position inside the update slice. The
  // slice dimensions follow the index dimensions in upd_shape.
  LocT slice_offset = 0;
  if (upd_size > 1) {
    slice_offset = elem_to_loc<LocT>(
        gid.x, upd_shape + indices.ndim, out_strides, out_ndim);
  }

  // First index position handled by this thread.
  auto ind_idx = gid.y * NWORK;

  for (int work = 0; work < NWORK && ind_idx < idx_size; ++work, ind_idx++) {
    // Add the contribution of every index array to the destination.
    LocT out_idx = slice_offset;
    for (int i = 0; i < NIDX; ++i) {
      auto idx_loc = indices.row_contiguous[i]
          ? ind_idx
          : elem_to_loc<LocT>(
                ind_idx,
                &indices.shapes[indices.ndim * i],
                &indices.strides[indices.ndim * i],
                indices.ndim);
      auto ax = axes[i];
      // Negative indices wrap around the size of the output axis.
      auto idx_val = offset_neg_idx(indices.buffers[i][idx_loc], out_shape[ax]);
      out_idx +=
          static_cast<LocT>(idx_val) * static_cast<LocT>(out_strides[ax]);
    }

    // Location of the update value: linear for row-contiguous updates,
    // resolved through the update strides otherwise.
    auto upd_idx = ind_idx * static_cast<LocT>(upd_size) + gid.x;
    if constexpr (!UPD_ROW_CONTIG) {
      upd_idx = elem_to_loc<LocT>(upd_idx, upd_shape, upd_strides, upd_ndim);
    }

    op.atomic_update(out, updates[upd_idx], out_idx);
  }
}

// Applies `out = op(out, update)` element by element to a slice of `out`.
//
// Each thread processes NWORK consecutive elements along the last dimension.
// The update is a plain (non-atomic) read-modify-write of `out`.
//
// Template parameters:
//   IdxT            integer type used for index arithmetic
//   Op              binary functor combining the current output and the update
//   OUT_ROW_CONTIG  use the linear thread index directly as output index
//   UPD_ROW_CONTIG  use the linear thread index directly as update index
//   UPD_SCALAR      a single update value (index 0) is used for every element
//   NWORK           elements handled per thread
//   NDIM            1, 2 or 3 select a specialization that derives the
//                   location from gid and the strides; any other value falls
//                   back to elem_to_loc
//
// `update_size` is not referenced in the body. `output_offset` is added to
// `out` before any element is touched.
template <
    typename T,
    typename IdxT,
    typename Op,
    bool OUT_ROW_CONTIG,
    bool UPD_ROW_CONTIG,
    bool UPD_SCALAR,
    int NWORK,
    int NDIM>
[[kernel]] void slice_update_op_impl(
    const device T* updates [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant int* update_shape [[buffer(2)]],
    const constant int64_t* update_strides [[buffer(3)]],
    const constant int& update_ndim [[buffer(4)]],
    const constant int64_t& update_size [[buffer(5)]],
    const constant int64_t* output_strides [[buffer(6)]],
    const constant int64_t& output_offset [[buffer(7)]],
    uint3 gid [[thread_position_in_grid]],
    uint3 gsize [[threads_per_grid]]) {
  Op op;

  // Linear index of the first element handled by this thread.
  // NOTE(review): gid.z is scaled by gsize.y only, whereas a row-major
  // linearisation of a (z, y, x) grid would usually also involve gsize.x --
  // confirm against the host-side grid configuration.
  IdxT idx = IdxT(gid.z) * gsize.y + gid.y * gsize.x + gid.x * NWORK;
  IdxT out_idx;
  IdxT update_idx;

  // Location of the first output element.
  if constexpr (OUT_ROW_CONTIG) {
    out_idx = idx;
  } else if constexpr (NDIM == 1) {
    out_idx = NWORK * gid.x * output_strides[0];
  } else if constexpr (NDIM == 2) {
    out_idx = gid.y * output_strides[0] + NWORK * gid.x * output_strides[1];
  } else if constexpr (NDIM == 3) {
    out_idx = gid.z * output_strides[0] + gid.y * output_strides[1] +
        NWORK * gid.x * output_strides[2];
  } else {
    out_idx = elem_to_loc<IdxT>(idx, update_shape, output_strides, update_ndim);
  }

  // Location of the first update element (same scheme as for the output).
  if constexpr (UPD_SCALAR) {
    update_idx = 0;
  } else if constexpr (UPD_ROW_CONTIG) {
    update_idx = idx;
  } else if constexpr (NDIM == 1) {
    update_idx = NWORK * gid.x * update_strides[0];
  } else if constexpr (NDIM == 2) {
    update_idx = gid.y * update_strides[0] + NWORK * gid.x * update_strides[1];
  } else if constexpr (NDIM == 3) {
    update_idx = gid.z * update_strides[0] + gid.y * update_strides[1] +
        NWORK * gid.x * update_strides[2];
  } else {
    update_idx =
        elem_to_loc<IdxT>(idx, update_shape, update_strides, update_ndim);
  }

  out += output_offset;

  // The NWORK loops below have no bounds check; presumably the dispatch
  // guarantees that every thread owns NWORK valid elements -- confirm.
  if constexpr (OUT_ROW_CONTIG && (UPD_ROW_CONTIG || UPD_SCALAR)) {
    // Both sides advance by one element per step.
    for (int j = 0; j < NWORK; j++) {
      out[out_idx] = op(out[out_idx], updates[update_idx]);
      out_idx++;
      if constexpr (!UPD_SCALAR) {
        update_idx++;
      }
    }
  } else {
    // Step along the last dimension using its stride on each side.
    auto out_stride = output_strides[update_ndim - 1];
    auto update_stride = update_strides[update_ndim - 1];
    for (int j = 0; j < NWORK; j++) {
      out[out_idx] = op(out[out_idx], updates[update_idx]);
      out_idx += out_stride;
      if constexpr (!UPD_SCALAR) {
        update_idx += update_stride;
      }
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/indexing/scatter_axis.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

// Scatters along a single axis: every thread reads one index and one update
// value and combines the update into `out` with `Op` through an atomic
// update.
//
// Thread layout: index.y is the position along the scattered axis of the
// updates, index.x the position after the axis and index.z the position
// before it. UpdC and IdxC select the fast path for contiguous `upd` /
// `indices`; otherwise the location is resolved with elem_to_loc and the
// supplied strides.
template <
    typename T,
    typename IdxT,
    typename LocT,
    typename Op,
    bool UpdC,
    bool IdxC>
[[kernel]] void scatter_axis(
    const device T* upd [[buffer(0)]],
    const device IdxT* indices [[buffer(1)]],
    device mlx_atomic<T>* out [[buffer(2)]],
    const constant int* shape [[buffer(3)]],
    const constant int64_t* upd_strides [[buffer(4)]],
    const constant int64_t* idx_strides [[buffer(5)]],
    const constant size_t& ndim [[buffer(6)]],
    const constant int& axis [[buffer(7)]],
    const constant int& out_axis_size [[buffer(8)]],
    const constant size_t& upd_ax_stride [[buffer(9)]],
    const constant size_t& idx_ax_stride [[buffer(10)]],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  Op op;

  const LocT outer_base = index.z * static_cast<LocT>(grid_dim.x);

  // Where to read the index value.
  LocT idx_loc = index.y * static_cast<LocT>(idx_ax_stride);
  if (IdxC) {
    idx_loc += outer_base * grid_dim.y + index.x;
  } else {
    idx_loc +=
        elem_to_loc<LocT>(outer_base + index.x, shape, idx_strides, ndim);
  }

  // Wrap negative indices (signed index types only).
  auto idx_val = indices[idx_loc];
  if (is_signed_v<IdxT>) {
    if (idx_val < 0) {
      idx_val = idx_val + out_axis_size;
    }
  }

  // Where to read the update value.
  LocT upd_idx = index.y * static_cast<LocT>(upd_ax_stride);
  if (UpdC) {
    upd_idx += outer_base * grid_dim.y + index.x;
  } else {
    upd_idx +=
        elem_to_loc<LocT>(outer_base + index.x, shape, upd_strides, ndim);
  }

  // The output is addressed as contiguous, with the scattered axis having
  // out_axis_size entries.
  const LocT out_idx = outer_base * static_cast<LocT>(out_axis_size) +
      idx_val * grid_dim.x + index.x;
  op.atomic_update(out, upd[upd_idx], out_idx);
}


================================================
FILE: mlx/backend/metal/kernels/layer_norm.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_common>
#include <metal_simdgroup>

#include "mlx/backend/metal/kernels/utils.h"

using namespace metal;

// Function constant (index 20). When true, the vjp kernels in this file also
// write the weight gradient `gw`; when false that store is skipped.
constant bool has_w [[function_constant(20)]];

// Zeroes the threadgroup scratch buffer used by threadgroup_sum.
// Only the first simdgroup writes: each of its lanes clears its own N
// consecutive slots. The trailing barrier makes the zeroed buffer visible to
// every thread in the threadgroup.
template <int N = 1>
inline void initialize_buffer(
    threadgroup float* xs,
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  if (simd_group_id == 0) {
    threadgroup float* lane_slots = xs + N * simd_lane_id;
    for (int k = 0; k < N; k++) {
      lane_slots[k] = 0;
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
}

// Sums N per-thread values across the whole threadgroup, in place.
//
// Stage 1 reduces each value within the simdgroup. Lane 0 of every simdgroup
// then publishes its N partial sums to `xs` (N slots per simdgroup). In
// stage 2 every lane loads the slots at its own lane index and reduces them
// again, so on return x[0..N) holds the threadgroup-wide sums in every
// thread. Slots that no simdgroup writes are read as they are, which is why
// `xs` is zeroed beforehand with initialize_buffer.
template <int N = 1>
inline void threadgroup_sum(
    thread float* x,
    threadgroup float* xs,
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  // Stage 1: reduce inside each simdgroup.
  for (int k = 0; k < N; k++) {
    x[k] = simd_sum(x[k]);
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Publish one set of partial sums per simdgroup.
  if (simd_lane_id == 0) {
    threadgroup float* group_slots = xs + N * simd_group_id;
    for (int k = 0; k < N; k++) {
      group_slots[k] = x[k];
    }
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Stage 2: reduce the per-simdgroup partial sums.
  for (int k = 0; k < N; k++) {
    x[k] = simd_sum(xs[N * simd_lane_id + k]);
  }
}

// Layer normalization of one row per threadgroup, for rows small enough that
// every thread holds its N_READS elements in registers.
//
// Arguments:
//   x, out     input and output, rows of `axis_size` contiguous elements
//   w, b       scale and bias, read with strides w_stride / b_stride
//   eps        added to the variance before the reciprocal square root
//   axis_size  number of elements in a row
//   gid        row handled by this threadgroup
//   lid        thread index; the thread owns elements
//              [lid * N_READS, lid * N_READS + N_READS) of the row
//
// Output: out[i] = w[i] * T((x[i] - mean) * rsqrt(var + eps)) + b[i].
template <typename T, int N_READS = 8>
[[kernel]] void layer_norm_single_row(
    const device T* x,
    const device T* w,
    const device T* b,
    device T* out,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    constant uint& b_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int SIMD_SIZE = 32;

  // Initialize the registers and threadgroup memory
  float thread_x[N_READS] = {0};
  threadgroup float local_buffer[SIMD_SIZE] = {0};
  initialize_buffer(local_buffer, simd_lane_id, simd_group_id);

  // Advance the pointers
  x += gid * size_t(axis_size) + lid * N_READS;
  w += w_stride * lid * N_READS;
  b += b_stride * lid * N_READS;
  out += gid * size_t(axis_size) + lid * N_READS;

  // Compute some variables for reading writing etc
  // `safe` means the thread's whole chunk lies inside the row; otherwise `n`
  // is the number of valid elements in the chunk. A thread that starts at or
  // past the end of the row gets n == 0: the unsigned difference would
  // otherwise wrap to a negative int and the padding loop below would index
  // thread_x out of bounds.
  const bool safe = lid * N_READS + N_READS <= axis_size;
  const int n = (lid * N_READS < axis_size)
      ? static_cast<int>(axis_size - lid * N_READS)
      : 0;

  // Read the inputs
  if (safe) {
    for (int i = 0; i < N_READS; i++) {
      thread_x[i] = x[i];
    }
  } else {
    for (int i = 0; i < n; i++) {
      thread_x[i] = x[i];
    }
  }

  // Compute the mean (slots past the row end are still zero here)
  float mean = 0;
  for (int i = 0; i < N_READS; i++) {
    mean += thread_x[i];
  }
  threadgroup_sum(&mean, local_buffer, simd_lane_id, simd_group_id);
  mean /= axis_size;

  // Compute the normalizer
  float normalizer = 0;
  if (!safe) {
    // Fill the unused slots with the mean so they contribute zero to the
    // sum of squared deviations.
    for (int i = n; i < N_READS; i++) {
      thread_x[i] = mean;
    }
  }
  for (int i = 0; i < N_READS; i++) {
    thread_x[i] -= mean;
    normalizer += thread_x[i] * thread_x[i];
  }
  threadgroup_sum(&normalizer, local_buffer, simd_lane_id, simd_group_id);
  normalizer = metal::precise::rsqrt(normalizer / axis_size + eps);

  // Write the outputs
  if (safe) {
    for (int i = 0; i < N_READS; i++) {
      thread_x[i] *= normalizer;
      out[i] = w[w_stride * i] * static_cast<T>(thread_x[i]) + b[b_stride * i];
    }
  } else {
    for (int i = 0; i < n; i++) {
      thread_x[i] *= normalizer;
      out[i] = w[w_stride * i] * static_cast<T>(thread_x[i]) + b[b_stride * i];
    }
  }
}

// Layer normalization of one row per threadgroup for rows that do not fit in
// a single pass: every thread walks the row in steps of lsize * N_READS
// elements and the row is read three times (mean, variance, output).
//
// Arguments:
//   x, out     input and output, rows of `axis_size` contiguous elements
//   w, b       scale and bias, read with strides w_stride / b_stride
//   eps        added to the variance before the reciprocal square root
//   axis_size  number of elements in a row
//   gid        row handled by this threadgroup
//   lid, lsize thread index and number of threads in the threadgroup
template <typename T, int N_READS = 4>
[[kernel]] void layer_norm_looped(
    const device T* x,
    const device T* w,
    const device T* b,
    device T* out,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    constant uint& b_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int SIMD_SIZE = 32;

  threadgroup float local_buffer[SIMD_SIZE];
  initialize_buffer(local_buffer, simd_lane_id, simd_group_id);

  // The pointers are pre-advanced by the thread's offset, so x[i + r] below
  // is element r + lid * N_READS + i of the row.
  x += gid * size_t(axis_size) + lid * N_READS;
  w += w_stride * lid * N_READS;
  b += b_stride * lid * N_READS;

  // Compute the mean
  float mean = 0;
  for (uint r = 0; r < axis_size; r += lsize * N_READS) {
    if (r + lid * N_READS + N_READS <= axis_size) {
      // Whole chunk inside the row: no per-element bounds check needed.
      for (int i = 0; i < N_READS; i++) {
        mean += x[i + r];
      }
    } else {
      // Chunk straddles or lies past the row end: check every element.
      for (int i = 0; i < N_READS; i++) {
        if ((r + lid * N_READS + i) < axis_size) {
          mean += x[i + r];
        }
      }
    }
  }
  threadgroup_sum(&mean, local_buffer, simd_lane_id, simd_group_id);
  mean /= axis_size;

  // Compute the normalizer
  // (sum of squared deviations from the mean, then rsqrt(variance + eps))
  float normalizer = 0;
  for (uint r = 0; r < axis_size; r += lsize * N_READS) {
    if (r + lid * N_READS + N_READS <= axis_size) {
      for (int i = 0; i < N_READS; i++) {
        float t = x[i + r] - mean;
        normalizer += t * t;
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if ((r + lid * N_READS + i) < axis_size) {
          float t = x[i + r] - mean;
          normalizer += t * t;
        }
      }
    }
  }
  threadgroup_sum(&normalizer, local_buffer, simd_lane_id, simd_group_id);
  normalizer = metal::precise::rsqrt(normalizer / axis_size + eps);

  // Write the outputs
  // The normalized value is converted to T before the scale and bias are
  // applied.
  out += gid * size_t(axis_size) + lid * N_READS;
  for (uint r = 0; r < axis_size; r += lsize * N_READS) {
    if (r + lid * N_READS + N_READS <= axis_size) {
      for (int i = 0; i < N_READS; i++) {
        float xi = (x[r + i] - mean) * normalizer;
        out[r + i] =
            w[w_stride * (i + r)] * static_cast<T>(xi) + b[b_stride * (i + r)];
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if ((r + lid * N_READS + i) < axis_size) {
          float xi = (x[r + i] - mean) * normalizer;
          out[r + i] = w[w_stride * (i + r)] * static_cast<T>(xi) +
              b[b_stride * (i + r)];
        }
      }
    }
  }
}

// Backward pass of layer normalization for one row per threadgroup, for rows
// small enough that every thread holds its N_READS elements in registers.
//
// Arguments:
//   x          forward input, rows of `axis_size` contiguous elements
//   w          scale, read with stride w_stride
//   g          incoming gradient, same layout as x
//   gx         output: gradient with respect to x, same layout as x
//   gw         output: per-row, per-element product g * normalized x,
//              written only when the `has_w` function constant is true
//   eps        added to the variance
//   axis_size  number of elements in a row
//   gid        row handled by this threadgroup
//   lid        thread index; the thread owns elements
//              [lid * N_READS, lid * N_READS + N_READS) of the row
template <typename T, int N_READS = 8>
[[kernel]] void vjp_layer_norm_single_row(
    const device T* x,
    const device T* w,
    const device T* g,
    device T* gx,
    device T* gw,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int SIMD_SIZE = 32;

  // Advance the input pointers
  x += gid * size_t(axis_size) + lid * N_READS;
  g += gid * size_t(axis_size) + lid * N_READS;
  w += w_stride * lid * N_READS;

  // Initialize the registers and threadgroup memory
  float thread_x[N_READS] = {0};
  float thread_w[N_READS] = {0};
  float thread_g[N_READS] = {0};
  threadgroup float local_buffer[3 * SIMD_SIZE];
  initialize_buffer<3>(local_buffer, simd_lane_id, simd_group_id);

  // Compute some variables for reading writing etc
  // `safe` means the thread's whole chunk lies inside the row; otherwise `n`
  // is the number of valid elements in the chunk. A thread that starts at or
  // past the end of the row gets n == 0: the unsigned difference would
  // otherwise wrap to a negative int and the padding loop below would index
  // thread_x out of bounds.
  const bool safe = lid * N_READS + N_READS <= axis_size;
  const int n = (lid * N_READS < axis_size)
      ? static_cast<int>(axis_size - lid * N_READS)
      : 0;

  // Read the inputs
  if (safe) {
    for (int i = 0; i < N_READS; i++) {
      thread_x[i] = x[i];
      thread_g[i] = g[i];
      thread_w[i] = w[i * w_stride];
    }
  } else {
    for (int i = 0; i < n; i++) {
      thread_x[i] = x[i];
      thread_g[i] = g[i];
      thread_w[i] = w[i * w_stride];
    }
  }

  // Compute the mean (slots past the row end are still zero here)
  float mean = 0;
  for (int i = 0; i < N_READS; i++) {
    mean += thread_x[i];
  }
  threadgroup_sum(&mean, local_buffer, simd_lane_id, simd_group_id);
  mean /= axis_size;

  // Compute the necessary scaling factors using the mean
  if (!safe) {
    // Fill the unused slots with the mean so they contribute zero to the
    // centered sums below (their w and g entries are zero as well).
    for (int i = n; i < N_READS; i++) {
      thread_x[i] = mean;
    }
  }
  // factors[meanwg]      accumulates w * g
  // factors[meanwgxc]    accumulates w * g * (x - mean)
  // factors[normalizer2] accumulates (x - mean)^2
  float factors[3] = {0};
  constexpr int meanwg = 0;
  constexpr int meanwgxc = 1;
  constexpr int normalizer2 = 2;
  for (int i = 0; i < N_READS; i++) {
    thread_x[i] -= mean;
    factors[meanwg] += thread_w[i] * thread_g[i];
    factors[meanwgxc] += thread_w[i] * thread_g[i] * thread_x[i];
    factors[normalizer2] += thread_x[i] * thread_x[i];
  }
  threadgroup_sum<3>(factors, local_buffer, simd_lane_id, simd_group_id);
  factors[meanwg] /= axis_size;
  factors[meanwgxc] /= axis_size;
  // After these two lines factors[normalizer2] is 1 / (var + eps) and
  // `normalizer` is its square root.
  factors[normalizer2] = 1 / (factors[normalizer2] / axis_size + eps);
  float normalizer = metal::precise::sqrt(factors[normalizer2]);

  // Write the outputs
  gx += gid * size_t(axis_size) + lid * N_READS;
  gw += gid * size_t(axis_size) + lid * N_READS;
  if (safe) {
    for (int i = 0; i < N_READS; i++) {
      thread_x[i] *= normalizer;
      gx[i] = static_cast<T>(
          normalizer * (thread_w[i] * thread_g[i] - factors[meanwg]) -
          thread_x[i] * factors[meanwgxc] * factors[normalizer2]);
      if (has_w) {
        gw[i] = static_cast<T>(thread_g[i] * thread_x[i]);
      }
    }
  } else {
    for (int i = 0; i < n; i++) {
      thread_x[i] *= normalizer;
      gx[i] = static_cast<T>(
          normalizer * (thread_w[i] * thread_g[i] - factors[meanwg]) -
          thread_x[i] * factors[meanwgxc] * factors[normalizer2]);
      if (has_w) {
        gw[i] = static_cast<T>(thread_g[i] * thread_x[i]);
      }
    }
  }
}

/**
 * Looped variant of the layer-norm VJP kernel.
 *
 * The threadgroup selected by `gid` works on one row of `axis_size` elements
 * and sweeps it three times in chunks of `lsize * N_READS` elements, each
 * thread touching `N_READS` consecutive elements per chunk:
 *   1. sum `x` to get the row mean,
 *   2. accumulate sum(w * g), sum(w * g * (x - mean)) and sum((x - mean)^2),
 *   3. write `gx` and, when `has_w` is set, `g * x_normalized` into `gw`.
 *
 * `w` is read with a stride of `w_stride`. `has_w`, `initialize_buffer` and
 * `threadgroup_sum` are defined outside this block.
 */
template <typename T, int N_READS = 4>
[[kernel]] void vjp_layer_norm_looped(
    const device T* x,
    const device T* w,
    const device T* g,
    device T* gx,
    device T* gw,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int SIMD_SIZE = 32;

  // Slots of the three reduced quantities inside `stats`.
  constexpr int kMeanWG = 0;
  constexpr int kMeanWGXc = 1;
  constexpr int kInvVar = 2;

  // This thread's first element inside a chunk, and the chunk length.
  const uint first = lid * N_READS;
  const uint chunk = lsize * N_READS;

  // Move every row pointer to this thread's first element of the row.
  const size_t row_offset = gid * size_t(axis_size) + first;
  x += row_offset;
  g += row_offset;
  gx += row_offset;
  gw += row_offset;
  w += w_stride * lid * N_READS;

  threadgroup float local_buffer[3 * SIMD_SIZE];
  initialize_buffer<3>(local_buffer, simd_lane_id, simd_group_id);

  // Pass 1: mean of the row
  float mean = 0;
  for (uint base = 0; base < axis_size; base += chunk) {
    if (base + first + N_READS <= axis_size) {
      // All N_READS elements are inside the row.
      for (int k = 0; k < N_READS; k++) {
        mean += x[k + base];
      }
    } else {
      // Ragged tail: skip the elements that fall past the end of the row.
      for (int k = 0; k < N_READS; k++) {
        if ((base + first + k) >= axis_size) {
          continue;
        }
        mean += x[k + base];
      }
    }
  }
  threadgroup_sum(&mean, local_buffer, simd_lane_id, simd_group_id);
  mean /= axis_size;

  // Pass 2: the scaling factors that depend on the mean
  float stats[3] = {0};
  for (uint base = 0; base < axis_size; base += chunk) {
    if (base + first + N_READS <= axis_size) {
      for (int k = 0; k < N_READS; k++) {
        float centered = x[k + base] - mean;
        float wk = w[(k + base) * w_stride];
        float gk = g[k + base];
        float wg = wk * gk;
        stats[kMeanWG] += wg;
        stats[kMeanWGXc] += wg * centered;
        stats[kInvVar] += centered * centered;
      }
    } else {
      for (int k = 0; k < N_READS; k++) {
        if ((base + first + k) >= axis_size) {
          continue;
        }
        float centered = x[k + base] - mean;
        float wk = w[(k + base) * w_stride];
        float gk = g[k + base];
        float wg = wk * gk;
        stats[kMeanWG] += wg;
        stats[kMeanWGXc] += wg * centered;
        stats[kInvVar] += centered * centered;
      }
    }
  }
  threadgroup_sum<3>(stats, local_buffer, simd_lane_id, simd_group_id);
  stats[kMeanWG] /= axis_size;
  stats[kMeanWGXc] /= axis_size;
  // 1 / (variance + eps); its square root is the normalizer applied to x.
  stats[kInvVar] = 1 / (stats[kInvVar] / axis_size + eps);
  float normalizer = metal::precise::sqrt(stats[kInvVar]);

  // Pass 3: write the gradients
  for (uint base = 0; base < axis_size; base += chunk) {
    if (base + first + N_READS <= axis_size) {
      for (int k = 0; k < N_READS; k++) {
        float xn = (x[k + base] - mean) * normalizer;
        float wk = w[(k + base) * w_stride];
        float gk = g[k + base];
        gx[k + base] = static_cast<T>(
            normalizer * (wk * gk - stats[kMeanWG]) -
            xn * stats[kMeanWGXc] * stats[kInvVar]);
        if (has_w) {
          gw[k + base] = static_cast<T>(gk * xn);
        }
      }
    } else {
      for (int k = 0; k < N_READS; k++) {
        if ((base + first + k) >= axis_size) {
          continue;
        }
        float xn = (x[k + base] - mean) * normalizer;
        float wk = w[(k + base) * w_stride];
        float gk = g[k + base];
        gx[k + base] = static_cast<T>(
            normalizer * (wk * gk - stats[kMeanWG]) -
            xn * stats[kMeanWGXc] * stats[kInvVar]);
        if (has_w) {
          gw[k + base] = static_cast<T>(gk * xn);
        }
      }
    }
  }
}

// clang-format off
// Expands to four instantiate_kernel(...) invocations for element type
// `itype`: the single-row and looped kernels, each in forward and VJP form.
// `name` is stringified and appended to every kernel name string.
// instantiate_kernel itself is defined outside this part of the file.
#define instantiate_layer_norm(name, itype)                                       \
  instantiate_kernel("layer_norm" #name, layer_norm_single_row, itype)            \
  instantiate_kernel("vjp_layer_norm" #name, vjp_layer_norm_single_row, itype)    \
  instantiate_kernel("layer_norm_looped" #name, layer_norm_looped, itype)         \
  instantiate_kernel("vjp_layer_norm_looped" #name, vjp_layer_norm_looped, itype)

// One set of kernels per supported element type.
instantiate_layer_norm(float32, float)
instantiate_layer_norm(float16, half)
instantiate_layer_norm(bfloat16, bfloat16_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/logging.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

// Use the toolchain's os_log when __METAL_VERSION__ is at least 320;
// otherwise provide a stub with the same call shape.
#if defined(__METAL_VERSION__) && (__METAL_VERSION__ >= 320)
#include <metal_logging>

namespace mlx {
using os_log = metal::os_log;
} // namespace mlx

#else

namespace mlx {
// Stand-in for metal::os_log: the constructor and both log_debug overloads
// have empty bodies, so logging calls do nothing in this configuration.
struct os_log {
  // Accepts two constant-address-space strings and ignores them.
  constexpr os_log(constant char*, constant char*) constant {}

  // Overload selected for a const os_log object.
  template <typename... Args>
  void log_debug(constant char*, Args...) const {}

  // Overload selected for a const os_log object in the constant address space.
  template <typename... Args>
  void log_debug(constant char*, Args...) const constant {}
};
} // namespace mlx

#endif

================================================
FILE: mlx/backend/metal/kernels/logsumexp.h
================================================
// Copyright © 2025 Apple Inc.

/**
 * Single-pass log-sum-exp over one row.
 *
 * The threadgroup selected by `gid` reduces the `axis_size` elements of its
 * row; every thread loads `N_READS` consecutive values (positions past the
 * end of the row are replaced by `Limits<AccT>::min`). The row maximum is
 * reduced first, then sum(exp(v - max)); lane 0 of simdgroup 0 writes
 * `log(sum) + max` to `out[gid]`, or the maximum itself when it is infinite.
 */
template <typename T, typename AccT = float, int N_READS = 4>
[[kernel]] void logsumexp(
    const device T* in,
    device T* out,
    constant int& axis_size,
    uint gid [[threadgroup_position_in_grid]],
    uint _lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int SIMD_SIZE = 32;

  // Signed copy of the thread index so the bounds checks use int arithmetic.
  int lid = _lid;

  // One slot per simdgroup for the two cross-simdgroup reductions.
  threadgroup AccT local_max[SIMD_SIZE];
  threadgroup AccT local_normalizer[SIMD_SIZE];

  AccT vals[N_READS];

  // Load this thread's slice of the row.
  in += gid * size_t(axis_size) + lid * N_READS;
  if (lid * N_READS + N_READS <= axis_size) {
    for (int k = 0; k < N_READS; k++) {
      vals[k] = AccT(in[k]);
    }
  } else {
    for (int k = 0; k < N_READS; k++) {
      if ((lid * N_READS + k) < axis_size) {
        vals[k] = AccT(in[k]);
      } else {
        vals[k] = Limits<AccT>::min;
      }
    }
  }

  // Simdgroup 0 resets every slot so slots that are never written are
  // neutral for the reductions below.
  if (simd_group_id == 0) {
    local_max[simd_lane_id] = Limits<AccT>::min;
    local_normalizer[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Row maximum: per thread, per simdgroup, then across simdgroups.
  AccT row_max = Limits<AccT>::finite_min;
  for (int k = 0; k < N_READS; k++) {
    if (row_max < vals[k]) {
      row_max = vals[k];
    }
  }
  row_max = simd_max(row_max);
  if (simd_lane_id == 0) {
    local_max[simd_group_id] = row_max;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (simd_group_id == 0) {
    row_max = simd_max(local_max[simd_lane_id]);
    if (simd_lane_id == 0) {
      local_max[0] = row_max;
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  row_max = local_max[0];

  // Sum of exp(v - max): per thread, per simdgroup, then across simdgroups.
  AccT denom = 0;
  for (int k = 0; k < N_READS; k++) {
    denom += fast::exp(vals[k] - row_max);
  }
  denom = simd_sum(denom);
  if (simd_lane_id == 0) {
    local_normalizer[simd_group_id] = denom;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (simd_group_id == 0) {
    denom = simd_sum(local_normalizer[simd_lane_id]);
    if (simd_lane_id == 0) {
      if (isinf(row_max)) {
        out[gid] = T(row_max);
      } else {
        out[gid] = T(log(denom) + row_max);
      }
    }
  }
}

/**
 * Looped log-sum-exp over one row.
 *
 * The threadgroup selected by `gid` walks its row of `axis_size` elements in
 * chunks of `lsize * N_READS`, each thread keeping a running maximum and a
 * running sum of exp(v - max) that is rescaled whenever the maximum grows.
 * The per-thread pairs are then merged inside each simdgroup and across
 * simdgroups, and thread 0 writes `log(sum) + max` to `out[gid]` (or the
 * maximum itself when it is infinite).
 */
template <typename T, typename AccT = float, int N_READS = 4>
[[kernel]] void logsumexp_looped(
    const device T* in,
    device T* out,
    constant int& axis_size,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  in += gid * size_t(axis_size);

  constexpr int SIMD_SIZE = 32;

  threadgroup AccT local_max[SIMD_SIZE];
  threadgroup AccT local_normalizer[SIMD_SIZE];

  // The cross-simdgroup reductions below read all SIMD_SIZE slots, but only
  // simdgroups that exist write one. Reset every slot to a neutral value
  // (same scheme as the single-pass `logsumexp` kernel) so a threadgroup with
  // fewer than SIMD_SIZE simdgroups never reduces over uninitialized memory.
  if (simd_group_id == 0) {
    local_max[simd_lane_id] = Limits<AccT>::min;
    local_normalizer[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Get the max and the normalizer in one go
  AccT prevmax;
  AccT maxval = Limits<AccT>::finite_min;
  AccT normalizer = 0;
  for (int r = 0; r < static_cast<int>(ceildiv(axis_size, N_READS * lsize));
       r++) {
    int offset = r * lsize * N_READS + lid * N_READS;
    AccT vals[N_READS];
    if (offset + N_READS <= axis_size) {
      for (int i = 0; i < N_READS; i++) {
        vals[i] = AccT(in[offset + i]);
      }
    } else {
      // Ragged tail: positions past the row end are replaced by
      // Limits<AccT>::min instead of being read.
      for (int i = 0; i < N_READS; i++) {
        vals[i] =
            (offset + i < axis_size) ? AccT(in[offset + i]) : Limits<AccT>::min;
      }
    }
    prevmax = maxval;
    for (int i = 0; i < N_READS; i++) {
      maxval = (maxval < vals[i]) ? vals[i] : maxval;
    }
    // Rescale the running sum to the (possibly larger) new maximum.
    normalizer *= fast::exp(prevmax - maxval);
    for (int i = 0; i < N_READS; i++) {
      normalizer += fast::exp(vals[i] - maxval);
    }
  }

  // Merge the per-thread (max, sum) pairs inside each simdgroup.
  prevmax = maxval;
  maxval = simd_max(maxval);
  normalizer *= fast::exp(prevmax - maxval);
  normalizer = simd_sum(normalizer);

  // Merge the per-simdgroup pairs through threadgroup memory.
  prevmax = maxval;
  if (simd_lane_id == 0) {
    local_max[simd_group_id] = maxval;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  maxval = simd_max(local_max[simd_lane_id]);
  normalizer *= fast::exp(prevmax - maxval);
  if (simd_lane_id == 0) {
    local_normalizer[simd_group_id] = normalizer;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  normalizer = simd_sum(local_normalizer[simd_lane_id]);

  if (lid == 0) {
    out[gid] = isinf(maxval) ? T(maxval) : T(log(normalizer) + maxval);
  }
}


================================================
FILE: mlx/backend/metal/kernels/logsumexp.metal
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_common>
#include <metal_simdgroup>

using namespace metal;

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/logsumexp.h"

// Expands to two instantiate_kernel(...) invocations for element type
// `itype`: the `logsumexp` kernel under the name "block_logsumexp_<name>" and
// the `logsumexp_looped` kernel under "looped_logsumexp_<name>".
#define instantiate_logsumexp(name, itype)                               \
  instantiate_kernel("block_logsumexp_" #name, logsumexp, itype)         \
  instantiate_kernel("looped_logsumexp_" #name, logsumexp_looped, itype) \

// One pair of kernels per supported element type.
instantiate_logsumexp(float32, float)
instantiate_logsumexp(float16, half)
instantiate_logsumexp(bfloat16, bfloat16_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/quantized.h
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_simdgroup>
#include <metal_stdlib>

// Function constants bound to indices 200-202.
// NOTE(review): presumably they flag whether the M / N / K problem sizes are
// tile-aligned; they are not used in this part of the file, so confirm at the
// use sites.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

using namespace metal;

// Shorthand for declaring a compile-time constant in the constant address
// space.
#define MLX_MTL_CONST static constant constexpr const

// Group sizes used by the kernels in this file, e.g. quads per simdgroup is
// computed as SIMD_SIZE / QUAD_SIZE.
MLX_MTL_CONST int SIMD_SIZE = 32;
MLX_MTL_CONST int QUAD_SIZE = 4;

// Number of quantized values held by one pack: 8 for 3- and 5-bit values,
// 4 for 6-bit values, and wsize / bits for every other width.
template <int bits, int wsize = 8>
inline constexpr short get_pack_factor() {
  if (bits == 3 || bits == 5) {
    return 8;
  }
  if (bits == 6) {
    return 4;
  }
  return wsize / bits;
}

// Number of bytes occupied by one pack: wsize / 8 when `bits` is a power of
// two, 5 for 5-bit values and 3 for the remaining widths.
template <int bits, int wsize = 8>
inline constexpr short get_bytes_per_pack() {
  // A power of two has a single bit set, so clearing the lowest set bit
  // leaves zero.
  if ((bits & (bits - 1)) == 0) {
    return wsize / 8;
  }
  if (bits == 5) {
    return 5;
  }
  return 3;
}

// Copies `values_per_thread` inputs from `x` into `x_thread` and returns
// their sum. Each copied value is pre-divided by a power of two that depends
// on its position inside the pack; the divisors match the bit offsets at
// which `qdot` in this file masks the packed weights without shifting them.
template <typename T, typename U, int values_per_thread, int bits>
inline U load_vector(const device T* x, thread U* x_thread) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U total = 0;

  switch (bits) {
    case 2:
      for (int k = 0; k < values_per_thread; k += 4) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3];
        out[0] = in[0];
        out[1] = in[1] / 4.0f;
        out[2] = in[2] / 16.0f;
        out[3] = in[3] / 64.0f;
      }
      break;

    case 3:
      for (int k = 0; k < values_per_thread; k += 8) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
        out[0] = in[0];
        out[1] = in[1] / 8.0f;
        out[2] = in[2] / 64.0f;
        out[3] = in[3] / 2.0f;
        out[4] = in[4] / 16.0f;
        out[5] = in[5] / 128.0f;
        out[6] = in[6] / 4.0f;
        out[7] = in[7] / 32.0f;
      }
      break;

    case 4:
      for (int k = 0; k < values_per_thread; k += 4) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3];
        out[0] = in[0];
        out[1] = in[1] / 16.0f;
        out[2] = in[2] / 256.0f;
        out[3] = in[3] / 4096.0f;
      }
      break;

    case 5:
      for (int k = 0; k < values_per_thread; k += 8) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
        out[0] = in[0];
        out[1] = in[1] / 32.0f;
        out[2] = in[2] / 4.0f;
        out[3] = in[3] / 128.0f;
        out[4] = in[4] / 16.0f;
        out[5] = in[5] / 2.0f;
        out[6] = in[6] / 64.0f;
        out[7] = in[7] / 8.0f;
      }
      break;

    case 6:
      for (int k = 0; k < values_per_thread; k += 4) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3];
        out[0] = in[0];
        out[1] = in[1] / 64.0f;
        out[2] = in[2] / 16.0f;
        out[3] = in[3] / 4.0f;
      }
      break;

    case 8:
      // Byte-aligned values need no pre-scaling.
      for (int k = 0; k < values_per_thread; k++) {
        total += x[k];
        x_thread[k] = x[k];
      }
      break;

    default:
      break;
  }

  return total;
}

// Bounded variant of `load_vector`: processes the inputs in whole packs while
// the pack start is below `N`, applies the same per-position pre-division,
// zero-fills `x_thread[N .. values_per_thread)` and returns the sum of the
// inputs that were read.
template <typename T, typename U, int values_per_thread, int bits>
inline U load_vector_safe(const device T* x, thread U* x_thread, int N) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U total = 0;

  switch (bits) {
    case 2:
      for (int k = 0; k < N; k += 4) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3];
        out[0] = in[0];
        out[1] = in[1] / 4.0f;
        out[2] = in[2] / 16.0f;
        out[3] = in[3] / 64.0f;
      }
      break;

    case 3:
      for (int k = 0; k < N; k += 8) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
        out[0] = in[0];
        out[1] = in[1] / 8.0f;
        out[2] = in[2] / 64.0f;
        out[3] = in[3] / 2.0f;
        out[4] = in[4] / 16.0f;
        out[5] = in[5] / 128.0f;
        out[6] = in[6] / 4.0f;
        out[7] = in[7] / 32.0f;
      }
      break;

    case 4:
      for (int k = 0; k < N; k += 4) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3];
        out[0] = in[0];
        out[1] = in[1] / 16.0f;
        out[2] = in[2] / 256.0f;
        out[3] = in[3] / 4096.0f;
      }
      break;

    case 5:
      for (int k = 0; k < N; k += 8) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
        out[0] = in[0];
        out[1] = in[1] / 32.0f;
        out[2] = in[2] / 4.0f;
        out[3] = in[3] / 128.0f;
        out[4] = in[4] / 16.0f;
        out[5] = in[5] / 2.0f;
        out[6] = in[6] / 64.0f;
        out[7] = in[7] / 8.0f;
      }
      break;

    case 6:
      for (int k = 0; k < N; k += 4) {
        const device T* in = x + k;
        thread U* out = x_thread + k;
        total += in[0] + in[1] + in[2] + in[3];
        out[0] = in[0];
        out[1] = in[1] / 64.0f;
        out[2] = in[2] / 16.0f;
        out[3] = in[3] / 4.0f;
      }
      break;

    case 8:
      for (int k = 0; k < N; k++) {
        total += x[k];
        x_thread[k] = x[k];
      }
      break;

    default:
      break;
  }

  // Entries past N hold zero so they contribute nothing downstream.
  for (int k = N; k < values_per_thread; k++) {
    x_thread[k] = 0;
  }

  return total;
}

// Dot product between `values_per_thread` packed quantized weights in `w` and
// the activations in `x_thread`; returns `scale * accum + sum * bias`.
//
// The weight bits are masked but not shifted, so `x_thread` is expected to
// hold activations pre-divided per position the way `load_vector` in this
// file produces them. For values that straddle a byte boundary, the high part
// is taken from the next byte and multiplied by `x * 256`.
//
// `sum` is the plain sum of the activations (the value `load_vector`
// returns); it carries the bias contribution.
template <typename U, int values_per_thread, int bits>
inline U qdot(
    const device uint8_t* w,
    const thread U* x_thread,
    U scale,
    U bias,
    U sum) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U accum = 0;

  if (bits == 2) {
    for (int i = 0; i < (values_per_thread / 4); i++) {
      accum +=
          (x_thread[4 * i] * (w[i] & 0x03) +
           x_thread[4 * i + 1] * (w[i] & 0x0c) +
           x_thread[4 * i + 2] * (w[i] & 0x30) +
           x_thread[4 * i + 3] * (w[i] & 0xc0));
    }
  }

  else if (bits == 3) {
    for (int i = 0; i < (values_per_thread / 8); i++) {
      // Base of pack i: 8 values in 3 bytes. Computed from the original
      // pointers each iteration; advancing the pointers by `8 * i` / `3 * i`
      // in place would accumulate and skip packs from the third one on.
      const thread U* xp = x_thread + 8 * i;
      const device uint8_t* wp = w + 3 * i;

      accum += (wp[0] & 0x07) * xp[0];
      accum += (wp[0] & 0x38) * xp[1];
      accum += (wp[0] & 0xc0) * xp[2];
      accum += (wp[1] & 0x01) * (xp[2] * 256.0f);

      accum += (wp[1] & 0x0e) * xp[3];
      accum += (wp[1] & 0x70) * xp[4];
      accum += (wp[1] & 0x80) * xp[5];
      accum += (wp[2] & 0x03) * (xp[5] * 256.0f);

      accum += (wp[2] & 0x1c) * xp[6];
      accum += (wp[2] & 0xe0) * xp[7];
    }
  }

  else if (bits == 4) {
    // Read the weights two bytes (four values) at a time.
    const device uint16_t* ws = (const device uint16_t*)w;
    for (int i = 0; i < (values_per_thread / 4); i++) {
      accum +=
          (x_thread[4 * i] * (ws[i] & 0x000f) +
           x_thread[4 * i + 1] * (ws[i] & 0x00f0) +
           x_thread[4 * i + 2] * (ws[i] & 0x0f00) +
           x_thread[4 * i + 3] * (ws[i] & 0xf000));
    }
  }

  else if (bits == 5) {
    for (int i = 0; i < (values_per_thread / 8); i++) {
      // Base of pack i: 8 values in 5 bytes.
      const thread U* xp = x_thread + 8 * i;
      const device uint8_t* wp = w + 5 * i;

      accum += (wp[0] & 0x1f) * xp[0];
      accum += (wp[0] & 0xe0) * xp[1];
      accum += (wp[1] & 0x3) * (xp[1] * 256.0f);
      accum += (wp[1] & 0x7c) * xp[2];
      accum += (wp[1] & 0x80) * xp[3];
      accum += (wp[2] & 0xf) * (xp[3] * 256.0f);
      accum += (wp[2] & 0xf0) * xp[4];
      accum += (wp[3] & 0x1) * (xp[4] * 256.0f);
      accum += (wp[3] & 0x3e) * xp[5];
      accum += (wp[3] & 0xc0) * xp[6];
      accum += (wp[4] & 0x7) * (xp[6] * 256.0f);
      accum += (wp[4] & 0xf8) * xp[7];
    }
  }

  else if (bits == 6) {
    for (int i = 0; i < (values_per_thread / 4); i++) {
      // Base of pack i: 4 values in 3 bytes.
      const thread U* xp = x_thread + 4 * i;
      const device uint8_t* wp = w + 3 * i;

      accum += (wp[0] & 0x3f) * xp[0];

      accum += (wp[0] & 0xc0) * xp[1];
      accum += (wp[1] & 0x0f) * (xp[1] * 256.0f);

      accum += (wp[1] & 0xf0) * xp[2];
      accum += (wp[2] & 0x03) * (xp[2] * 256.0f);

      accum += (wp[2] & 0xfc) * xp[3];
    }
  }

  else if (bits == 8) {
    for (int i = 0; i < values_per_thread; i++) {
      accum += x_thread[i] * w[i];
    }
  }

  return scale * accum + sum * bias;
}

// Bounded variant of `qdot`: same arithmetic, but only the first `N` values
// (whole packs) take part. Returns `scale * accum + sum * bias`.
//
// As in `qdot`, the weight bits are masked without shifting, so `x_thread` is
// expected to hold activations pre-divided per position the way
// `load_vector_safe` in this file produces them.
template <typename U, int values_per_thread, int bits>
inline U qdot_safe(
    const device uint8_t* w,
    const thread U* x_thread,
    U scale,
    U bias,
    U sum,
    int N) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U accum = 0;

  if (bits == 2) {
    for (int i = 0; i < (N / 4); i++) {
      accum +=
          (x_thread[4 * i] * (w[i] & 0x03) +
           x_thread[4 * i + 1] * (w[i] & 0x0c) +
           x_thread[4 * i + 2] * (w[i] & 0x30) +
           x_thread[4 * i + 3] * (w[i] & 0xc0));
    }
  }

  else if (bits == 3) {
    for (int i = 0; i < (N / 8); i++) {
      // Base of pack i: 8 values in 3 bytes. Computed from the original
      // pointers each iteration; advancing the pointers by `8 * i` / `3 * i`
      // in place would accumulate and skip packs from the third one on.
      const thread U* xp = x_thread + 8 * i;
      const device uint8_t* wp = w + 3 * i;

      accum += (wp[0] & 0x07) * xp[0];
      accum += (wp[0] & 0x38) * xp[1];
      accum += (wp[0] & 0xc0) * xp[2];
      accum += (wp[1] & 0x01) * (xp[2] * 256.0f);

      accum += (wp[1] & 0x0e) * xp[3];
      accum += (wp[1] & 0x70) * xp[4];
      accum += (wp[1] & 0x80) * xp[5];
      accum += (wp[2] & 0x03) * (xp[5] * 256.0f);

      accum += (wp[2] & 0x1c) * xp[6];
      accum += (wp[2] & 0xe0) * xp[7];
    }
  }

  else if (bits == 4) {
    // Read the weights two bytes (four values) at a time.
    const device uint16_t* ws = (const device uint16_t*)w;
    for (int i = 0; i < (N / 4); i++) {
      accum +=
          (x_thread[4 * i] * (ws[i] & 0x000f) +
           x_thread[4 * i + 1] * (ws[i] & 0x00f0) +
           x_thread[4 * i + 2] * (ws[i] & 0x0f00) +
           x_thread[4 * i + 3] * (ws[i] & 0xf000));
    }
  }

  else if (bits == 5) {
    for (int i = 0; i < (N / 8); i++) {
      // Base of pack i: 8 values in 5 bytes.
      const thread U* xp = x_thread + 8 * i;
      const device uint8_t* wp = w + 5 * i;

      accum += (wp[0] & 0x1f) * xp[0];
      accum += (wp[0] & 0xe0) * xp[1];
      accum += (wp[1] & 0x3) * (xp[1] * 256.0f);
      accum += (wp[1] & 0x7c) * xp[2];
      accum += (wp[1] & 0x80) * xp[3];
      accum += (wp[2] & 0xf) * (xp[3] * 256.0f);
      accum += (wp[2] & 0xf0) * xp[4];
      accum += (wp[3] & 0x1) * (xp[4] * 256.0f);
      accum += (wp[3] & 0x3e) * xp[5];
      accum += (wp[3] & 0xc0) * xp[6];
      accum += (wp[4] & 0x7) * (xp[6] * 256.0f);
      accum += (wp[4] & 0xf8) * xp[7];
    }
  }

  else if (bits == 6) {
    for (int i = 0; i < (N / 4); i++) {
      // Base of pack i: 4 values in 3 bytes.
      const thread U* xp = x_thread + 4 * i;
      const device uint8_t* wp = w + 3 * i;

      accum += (wp[0] & 0x3f) * xp[0];

      accum += (wp[0] & 0xc0) * xp[1];
      accum += (wp[1] & 0x0f) * (xp[1] * 256.0f);

      accum += (wp[1] & 0xf0) * xp[2];
      accum += (wp[2] & 0x03) * (xp[2] * 256.0f);

      accum += (wp[2] & 0xfc) * xp[3];
    }
  }

  else if (bits == 8) {
    for (int i = 0; i < N; i++) {
      accum += x_thread[i] * w[i];
    }
  }

  return scale * accum + sum * bias;
}

// Accumulates `x * (scale * q + bias)` into `result[k]` for each of the
// `values_per_thread` quantized values `q` packed in `w`.
//
// For 2- and 4-bit values the bits are masked without shifting and the shift
// is folded into a pre-divided scale; for 3-, 5- and 6-bit values the bits
// are extracted with explicit shifts, joining the two halves of values that
// straddle a byte boundary.
template <typename U, int values_per_thread, int bits>
inline void
qouter(const thread uint8_t* w, U x, U scale, U bias, thread U* result) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  switch (bits) {
    case 2: {
      // One scale per position inside the byte.
      const U s0 = scale;
      const U s1 = scale / 4.0f;
      const U s2 = scale / 16.0f;
      const U s3 = scale / 64.0f;
      for (int p = 0; p < (values_per_thread / 4); p++) {
        thread U* out = result + 4 * p;
        out[0] += x * (s0 * (w[p] & 0x03) + bias);
        out[1] += x * (s1 * (w[p] & 0x0c) + bias);
        out[2] += x * (s2 * (w[p] & 0x30) + bias);
        out[3] += x * (s3 * (w[p] & 0xc0) + bias);
      }
      break;
    }

    case 3: {
      for (int p = 0; p < (values_per_thread / 8); p++) {
        // 8 values in 3 bytes.
        const uint8_t b0 = w[3 * p];
        const uint8_t b1 = w[3 * p + 1];
        const uint8_t b2 = w[3 * p + 2];
        thread U* out = result + 8 * p;

        out[0] += x * ((b0 & 0x7) * scale + bias);
        out[1] += x * (((b0 & 0x38) >> 3) * scale + bias);
        out[2] +=
            x * ((((b0 & 0xc0) >> 6) + ((b1 & 0x1) << 2)) * scale + bias);
        out[3] += x * (((b1 & 0xe) >> 1) * scale + bias);
        out[4] += x * (((b1 & 0x70) >> 4) * scale + bias);
        out[5] +=
            x * ((((b1 & 0x80) >> 7) + ((b2 & 0x3) << 1)) * scale + bias);
        out[6] += x * (((b2 & 0x1c) >> 2) * scale + bias);
        out[7] += x * (((b2 & 0xe0) >> 5) * scale + bias);
      }
      break;
    }

    case 4: {
      const U s_lo = scale;
      const U s_hi = scale / 16.0f;
      for (int p = 0; p < (values_per_thread / 2); p++) {
        thread U* out = result + 2 * p;
        out[0] += x * (s_lo * (w[p] & 0x0f) + bias);
        out[1] += x * (s_hi * (w[p] & 0xf0) + bias);
      }
      break;
    }

    case 5: {
      for (int p = 0; p < (values_per_thread / 8); p++) {
        // 8 values in 5 bytes.
        const uint8_t b0 = w[5 * p];
        const uint8_t b1 = w[5 * p + 1];
        const uint8_t b2 = w[5 * p + 2];
        const uint8_t b3 = w[5 * p + 3];
        const uint8_t b4 = w[5 * p + 4];
        thread U* out = result + 8 * p;

        out[0] += x * ((b0 & 0x1f) * scale + bias);
        out[1] +=
            x * ((((b0 & 0xe0) >> 5) + ((b1 & 0x3) << 3)) * scale + bias);
        out[2] += x * (((b1 & 0x7c) >> 2) * scale + bias);
        out[3] +=
            x * ((((b1 & 0x80) >> 7) + ((b2 & 0xf) << 1)) * scale + bias);
        out[4] +=
            x * ((((b2 & 0xf0) >> 4) + ((b3 & 0x1) << 4)) * scale + bias);
        out[5] += x * (((b3 & 0x3e) >> 1) * scale + bias);
        out[6] +=
            x * ((((b3 & 0xc0) >> 6) + ((b4 & 0x7) << 2)) * scale + bias);
        out[7] += x * (((b4 & 0xf8) >> 3) * scale + bias);
      }
      break;
    }

    case 6: {
      for (int p = 0; p < (values_per_thread / 4); p++) {
        // 4 values in 3 bytes.
        const uint8_t b0 = w[3 * p];
        const uint8_t b1 = w[3 * p + 1];
        const uint8_t b2 = w[3 * p + 2];
        thread U* out = result + 4 * p;

        out[0] += x * ((b0 & 0x3f) * scale + bias);
        out[1] +=
            x * ((((b0 >> 6) & 0x03) + ((b1 & 0x0f) << 2)) * scale + bias);
        out[2] +=
            x * ((((b1 >> 4) & 0x0f) + ((b2 & 0x03) << 4)) * scale + bias);
        out[3] += x * (((b2 >> 2) & 0x3f) * scale + bias);
      }
      break;
    }

    case 8: {
      for (int k = 0; k < values_per_thread; k++) {
        result[k] += x * (scale * w[k] + bias);
      }
      break;
    }

    default:
      break;
  }
}

// Expands `N` packed quantized values from `w` into `w_local`, writing
// `scale * q + bias` for each value `q`.
//
// For 2- and 4-bit values the bits are masked without shifting and the shift
// is folded into a pre-divided scale; for 3-, 5- and 6-bit values the bits
// are extracted with explicit shifts, joining the two halves of values that
// straddle a byte boundary.
template <typename U, int N, int bits>
inline void
dequantize(const device uint8_t* w, U scale, U bias, threadgroup U* w_local) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  if (bits == 2) {
    // One scale per position inside the byte.
    U s[4] = {
        scale,
        scale / static_cast<U>(4.0f),
        scale / static_cast<U>(16.0f),
        scale / static_cast<U>(64.0f)};
    for (int i = 0; i < (N / 4); i++) {
      w_local[4 * i] = s[0] * (w[i] & 0x03) + bias;
      w_local[4 * i + 1] = s[1] * (w[i] & 0x0c) + bias;
      w_local[4 * i + 2] = s[2] * (w[i] & 0x30) + bias;
      w_local[4 * i + 3] = s[3] * (w[i] & 0xc0) + bias;
    }
  }

  else if (bits == 3) {
    for (int i = 0; i < (N / 8); i++) {
      // Base of pack i: 8 values in 3 bytes. Computed from the original
      // pointers each iteration; advancing the pointers by `8 * i` / `3 * i`
      // in place would accumulate and skip packs from the third one on.
      threadgroup U* out = w_local + 8 * i;
      const device uint8_t* wp = w + 3 * i;

      out[0] = (wp[0] & 0x7) * scale + bias;
      out[1] = ((wp[0] & 0x38) >> 3) * scale + bias;
      out[2] = (((wp[0] & 0xc0) >> 6) + ((wp[1] & 0x1) << 2)) * scale + bias;
      out[3] = ((wp[1] & 0xe) >> 1) * scale + bias;
      out[4] = ((wp[1] & 0x70) >> 4) * scale + bias;
      out[5] = (((wp[1] & 0x80) >> 7) + ((wp[2] & 0x3) << 1)) * scale + bias;
      out[6] = ((wp[2] & 0x1c) >> 2) * scale + bias;
      out[7] = ((wp[2] & 0xe0) >> 5) * scale + bias;
    }
  }

  else if (bits == 4) {
    U s[2] = {scale, scale / static_cast<U>(16.0f)};
    for (int i = 0; i < (N / 2); i++) {
      w_local[2 * i] = s[0] * (w[i] & 0x0f) + bias;
      w_local[2 * i + 1] = s[1] * (w[i] & 0xf0) + bias;
    }
  }

  else if (bits == 5) {
    for (int i = 0; i < (N / 8); i++) {
      // Base of pack i: 8 values in 5 bytes.
      threadgroup U* out = w_local + 8 * i;
      const device uint8_t* wp = w + 5 * i;

      out[0] = (wp[0] & 0x1f) * scale + bias;
      out[1] = (((wp[0] & 0xe0) >> 5) + ((wp[1] & 0x3) << 3)) * scale + bias;
      out[2] = ((wp[1] & 0x7c) >> 2) * scale + bias;
      out[3] = (((wp[1] & 0x80) >> 7) + ((wp[2] & 0xf) << 1)) * scale + bias;
      out[4] = (((wp[2] & 0xf0) >> 4) + ((wp[3] & 0x1) << 4)) * scale + bias;
      out[5] = ((wp[3] & 0x3e) >> 1) * scale + bias;
      out[6] = (((wp[3] & 0xc0) >> 6) + ((wp[4] & 0x7) << 2)) * scale + bias;
      out[7] = ((wp[4] & 0xf8) >> 3) * scale + bias;
    }
  }

  else if (bits == 6) {
    for (int i = 0; i < (N / 4); i++) {
      // Base of pack i: 4 values in 3 bytes.
      threadgroup U* out = w_local + 4 * i;
      const device uint8_t* wp = w + 3 * i;

      out[0] = (wp[0] & 0x3f) * scale + bias;
      out[1] = (((wp[0] >> 6) & 0x03) + ((wp[1] & 0x0f) << 2)) * scale + bias;
      out[2] = (((wp[1] >> 4) & 0x0f) + ((wp[2] & 0x03) << 4)) * scale + bias;
      out[3] = ((wp[2] >> 2) & 0x3f) * scale + bias;
    }
  }

  else if (bits == 8) {
    for (int i = 0; i < N; i++) {
      w_local[i] = scale * w[i] + bias;
    }
  }
}

// Loads a BROWS x BCOLS tile of packed quantized weights from `src`,
// dequantizes it with the matching scale and bias, and stores the result in
// threadgroup memory at `dst` (row stride `dst_ld`). Each thread handles
// `n_reads` consecutive packs of the tile; `next()` advances the loader to
// the following tile along the dimension selected by `reduction_dim`.
template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short group_size,
    short bits>
struct QuantizedBlockLoader {
  static_assert(
      BCOLS <= group_size,
      "The group size should be larger than the columns");
  static_assert(
      group_size % BCOLS == 0,
      "The group size should be divisible by the columns");
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  // Values per pack and bytes per pack for this bit width.
  MLX_MTL_CONST short pack_factor = get_pack_factor<bits, 8>();
  MLX_MTL_CONST short bytes_per_pack = get_bytes_per_pack<bits>();
  // Tile width measured in packs.
  MLX_MTL_CONST short BCOLS_PACKED = BCOLS / pack_factor;
  // Packs per thread; 1 when the tile has fewer packs than threads.
  MLX_MTL_CONST short n_reads =
      (BCOLS_PACKED * BROWS < tgp_size) ? 1 : (BCOLS_PACKED * BROWS) / tgp_size;
  // Number of tiles that share one quantization group along the columns.
  MLX_MTL_CONST short group_steps = group_size / BCOLS;

  const int src_ld;
  const int tile_stride;
  short group_step_cnt;
  const int group_stride;

  const short thread_idx;
  const short bi;
  const short bj;

  threadgroup T* dst;
  const device uint8_t* src;
  const device T* scales;
  const device T* biases;

  // Positions the loader at this thread's first pack of the first tile:
  // `bi` / `bj` are the thread's row and packed column inside the tile.
  QuantizedBlockLoader(
      const device uint8_t* src_,
      const device T* scales_,
      const device T* biases_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(
            reduction_dim ? BCOLS_PACKED * bytes_per_pack
                          : BROWS * src_ld * bytes_per_pack / pack_factor),
        group_step_cnt(0),
        group_stride(BROWS * src_ld / group_size),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(n_reads * thread_idx / BCOLS_PACKED),
        bj((n_reads * thread_idx) % BCOLS_PACKED),
        dst(dst_ + bi * dst_ld + bj * pack_factor),
        src(src_ + bi * src_ld * bytes_per_pack / pack_factor +
            bj * bytes_per_pack),
        scales(scales_ + bi * src_ld / group_size),
        biases(biases_ + bi * src_ld / group_size) {}

  // Dequantizes this thread's packs without any bounds check on the source.
  void load_unsafe() const {
    // Threads beyond the tile (more threads than packs) have nothing to do.
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    const T scale = *scales;
    const T bias = *biases;
    for (int r = 0; r < n_reads; r++) {
      dequantize<T, pack_factor, bits>(
          src + r * bytes_per_pack, scale, bias, dst + r * pack_factor);
    }
  }

  // Like load_unsafe, but rows at or past the valid extent of the source tile
  // (`src_tile_dim`) are written as zeros instead of being read.
  void load_safe(short2 src_tile_dim) const {
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    // The row limit is taken from .x or .y depending on `reduction_dim`.
    const bool row_outside =
        (reduction_dim == 1 && bi >= src_tile_dim.x) ||
        (reduction_dim == 0 && bi >= src_tile_dim.y);
    if (row_outside) {
      for (int k = 0; k < n_reads * pack_factor; k++) {
        dst[k] = T(0);
      }
      return;
    }

    const T scale = *scales;
    const T bias = *biases;
    for (int r = 0; r < n_reads; r++) {
      dequantize<T, pack_factor, bits>(
          src + r * bytes_per_pack, scale, bias, dst + r * pack_factor);
    }
  }

  // Advances to the next tile and moves the scale/bias pointers when the new
  // tile belongs to a different quantization group.
  void next() {
    src += tile_stride;

    if (reduction_dim != 1) {
      scales += group_stride;
      biases += group_stride;
      return;
    }

    if (group_steps > 1) {
      // Several tiles share one group: only step once the group is used up.
      group_step_cnt++;
      if (group_step_cnt != group_steps) {
        return;
      }
      group_step_cnt = 0;
    }
    scales++;
    biases++;
  }
};

// Quantized matrix-vector product where each quadgroup computes
// results_per_quadgroup (8) output rows.
//
// The reduction is done in a single pass: every thread of a quad loads
// D / QUAD_SIZE values of x and there is no loop over in_vec_size, so D is
// presumably the full input vector length (confirm against the host-side
// dispatch).
//
// Row layout: a quad starts at row `out_row` and then visits rows
// out_row + row * quads_per_simd for row in [0, results_per_quadgroup).
// Rows at or beyond out_vec_size are skipped.
template <typename T, int group_size, int bits, int D>
METAL_FUNC void qmv_quad_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint quad_gid [[quadgroup_index_in_threadgroup]],
    uint quad_lid [[thread_index_in_quadgroup]]) {
  constexpr int quads_per_simd = SIMD_SIZE / QUAD_SIZE;
  constexpr int pack_factor = 32 / bits;
  constexpr int values_per_thread = D / QUAD_SIZE;
  constexpr int packs_per_thread = values_per_thread / pack_factor;
  constexpr int scale_step_per_thread = group_size / values_per_thread;
  constexpr int results_per_quadgroup = 8;

  typedef float U;

  thread U x_thread[values_per_thread];
  thread U result[results_per_quadgroup] = {0};

  // Adjust positions
  const int in_vec_size_w = in_vec_size / pack_factor;
  const int in_vec_size_g = in_vec_size / group_size;
  const int out_row = tid.y * quads_per_simd * results_per_quadgroup + quad_gid;

  w += out_row * in_vec_size_w + quad_lid * packs_per_thread;
  scales += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;
  biases += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;
  x += tid.x * in_vec_size + quad_lid * values_per_thread;
  y += tid.x * out_vec_size + out_row;

  U sum = load_vector<T, U, values_per_thread, bits>(x, x_thread);

  for (int row = 0; row < results_per_quadgroup; row++) {
    // Only touch rows that exist. The scale and bias loads are kept inside
    // the guard so that rows past the end of the matrix never read beyond the
    // scales/biases buffers.
    if (row * quads_per_simd + out_row < out_vec_size) {
      auto wl =
          (const device uint8_t*)(w + row * in_vec_size_w * quads_per_simd);
      const device T* sl = scales + row * in_vec_size_g * quads_per_simd;
      const device T* bl = biases + row * in_vec_size_g * quads_per_simd;

      U s = sl[0];
      U b = bl[0];
      result[row] += qdot<U, values_per_thread, bits>(wl, x_thread, s, b, sum);
    }
  }

  // Reduce the partial sums across the quad; lane 0 writes the valid rows.
  for (int row = 0; row < results_per_quadgroup; row++) {
    result[row] = quad_sum(result[row]);
    if (quad_lid == 0 && row * quads_per_simd + out_row < out_vec_size) {
      y[row * quads_per_simd] = static_cast<T>(result[row]);
    }
  }
}

// Quantized matrix-vector product without any bounds checks.
//
// Each simdgroup accumulates results_per_simdgroup (4) consecutive output
// rows and a threadgroup of num_simdgroups (2) simdgroups covers 8 rows.
// The loop walks in_vec_size in steps of block_size and never guards reads
// or writes, so the sizes are presumably required to be multiples of the
// tile sizes by the caller (confirm against the host-side dispatch).
template <typename T, int group_size, int bits>
METAL_FUNC void qmv_fast_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using U = float;

  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
  constexpr int packs_per_thread = bits == 2 ? 1 : 2;
  constexpr int pack_factor = get_pack_factor<bits, 32>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits, 32>();
  constexpr int values_per_thread = pack_factor * packs_per_thread;
  constexpr int block_size = values_per_thread * SIMD_SIZE;
  constexpr int scale_step_per_thread = group_size / values_per_thread;

  // Per-row strides: bytes of packed weights and number of scale/bias groups.
  const int row_bytes = in_vec_size * bytes_per_pack / pack_factor;
  const int row_groups = in_vec_size / group_size;

  // First output row handled by this simdgroup.
  const int first_row = tid.y * (num_simdgroups * results_per_simdgroup) +
      simd_gid * results_per_simdgroup;

  // Move every pointer to this lane's starting position.
  const device uint8_t* w_bytes = (const device uint8_t*)w;
  w_bytes +=
      first_row * row_bytes + simd_lid * packs_per_thread * bytes_per_pack;
  scales += first_row * row_groups + simd_lid / scale_step_per_thread;
  biases += first_row * row_groups + simd_lid / scale_step_per_thread;
  x += tid.x * in_vec_size + simd_lid * values_per_thread;
  y += tid.x * out_vec_size + first_row;

  thread U x_vals[values_per_thread];
  thread U acc[results_per_simdgroup] = {0};

  for (int k = 0; k < in_vec_size; k += block_size) {
    // Load this lane's slice of x once and reuse it for all rows.
    U x_sum = load_vector<T, U, values_per_thread, bits>(x, x_vals);

    for (int r = 0; r < results_per_simdgroup; r++) {
      U s = scales[r * row_groups];
      U b = biases[r * row_groups];
      acc[r] += qdot<U, values_per_thread, bits>(
          (const device uint8_t*)(w_bytes + r * row_bytes),
          x_vals,
          s,
          b,
          x_sum);
    }

    // Step to the next block along the input dimension.
    w_bytes += block_size * bytes_per_pack / pack_factor;
    scales += block_size / group_size;
    biases += block_size / group_size;
    x += block_size;
  }

  // Reduce across the simdgroup; lane 0 writes the results.
  for (int r = 0; r < results_per_simdgroup; r++) {
    acc[r] = simd_sum(acc[r]);
    if (simd_lid == 0) {
      y[r] = static_cast<T>(acc[r]);
    }
  }
}

// Quantized matrix-vector product with bounds handling.
//
// Each simdgroup accumulates results_per_simdgroup (4) output rows and a
// threadgroup of num_simdgroups (2) simdgroups covers 8 rows. Two cases are
// handled:
//  - out_vec_size < 8: every row access is guarded by
//    `out_row + row < out_vec_size`.
//  - otherwise: the starting row is clamped to
//    out_vec_size - results_per_simdgroup so the last simdgroup recomputes
//    some rows instead of running past the end.
// In both cases the main loop stops one block early
// (`k < in_vec_size - block_size`), so the final block always goes through
// load_vector_safe / qdot_safe with a per-lane `remaining` count.
template <typename T, int group_size, int bits>
METAL_FUNC void qmv_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    const constant int& in_vec_size,
    const constant int& out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int num_simdgroups = 2;
  constexpr int results_per_simdgroup = 4;
  constexpr int packs_per_thread = 1;
  constexpr int pack_factor = get_pack_factor<bits, 32>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits, 32>();

  constexpr int values_per_thread = pack_factor * packs_per_thread;
  constexpr int block_size = values_per_thread * SIMD_SIZE;
  constexpr int scale_step_per_thread = group_size / values_per_thread;

  const device uint8_t* ws = (const device uint8_t*)w;

  typedef float U;

  thread U x_thread[values_per_thread];
  thread U result[results_per_simdgroup] = {0};

  // Adjust positions
  const int in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;
  const int in_vec_size_g = in_vec_size / group_size;
  const int out_row = tid.y * (num_simdgroups * results_per_simdgroup) +
      simd_gid * results_per_simdgroup;
  // Clamped starting row; only used in the second case below (it can be
  // negative when out_vec_size < results_per_simdgroup).
  const int used_out_row = min(out_vec_size - results_per_simdgroup, out_row);

  // Simdgroups that start past the last row have nothing to do.
  if (out_row >= out_vec_size) {
    return;
  }

  // In this case we need to properly guard all our reads because there isn't
  // even 1 tile in the matrix
  if (out_vec_size < (num_simdgroups * results_per_simdgroup)) {
    ws +=
        out_row * in_vec_size_w + simd_lid * packs_per_thread * bytes_per_pack;
    scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;
    biases += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;
    x += tid.x * in_vec_size + simd_lid * values_per_thread;
    y += tid.x * out_vec_size + out_row;

    // Full blocks, except the last one which is handled by the safe path.
    int k = 0;
    for (; k < in_vec_size - block_size; k += block_size) {
      U sum = load_vector<T, U, values_per_thread, bits>(x, x_thread);

      for (int row = 0;
           row < results_per_simdgroup && out_row + row < out_vec_size;
           row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device T* sl = scales + row * in_vec_size_g;
        const device T* bl = biases + row * in_vec_size_g;

        U s = sl[0];
        U b = bl[0];
        result[row] +=
            qdot<U, values_per_thread, bits>(wl, x_thread, s, b, sum);
      }

      ws += block_size * bytes_per_pack / pack_factor;
      scales += block_size / group_size;
      biases += block_size / group_size;
      x += block_size;
    }
    // Number of input values left for this lane in the final block, clamped
    // to [0, values_per_thread].
    const int remaining = clamp(
        static_cast<int>(in_vec_size - k - simd_lid * values_per_thread),
        0,
        values_per_thread);
    if (remaining > 0) {
      U sum = load_vector_safe<T, U, values_per_thread, bits>(
          x, x_thread, remaining);

      for (int row = 0;
           row < results_per_simdgroup && out_row + row < out_vec_size;
           row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device T* sl = scales + row * in_vec_size_g;
        const device T* bl = biases + row * in_vec_size_g;

        U s = sl[0];
        U b = bl[0];
        result[row] += qdot_safe<U, values_per_thread, bits>(
            wl, x_thread, s, b, sum, remaining);
      }
    }

    // Reduce across the simdgroup; lane 0 writes only the rows that exist.
    for (int row = 0;
         row < results_per_simdgroup && out_row + row < out_vec_size;
         row++) {
      result[row] = simd_sum(result[row]);
      if (simd_lid == 0) {
        y[row] = static_cast<T>(result[row]);
      }
    }
  }

  // In this case the last tile is moved back to redo some output values
  else {
    ws += used_out_row * in_vec_size_w +
        simd_lid * packs_per_thread * bytes_per_pack;
    scales += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;
    biases += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;
    x += tid.x * in_vec_size + simd_lid * values_per_thread;
    y += tid.x * out_vec_size + used_out_row;

    // Full blocks, except the last one which is handled by the safe path.
    int k = 0;
    for (; k < in_vec_size - block_size; k += block_size) {
      U sum = load_vector<T, U, values_per_thread, bits>(x, x_thread);

      for (int row = 0; row < results_per_simdgroup; row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device T* sl = scales + row * in_vec_size_g;
        const device T* bl = biases + row * in_vec_size_g;

        U s = sl[0];
        U b = bl[0];
        result[row] +=
            qdot<U, values_per_thread, bits>(wl, x_thread, s, b, sum);
      }

      ws += block_size * bytes_per_pack / pack_factor;
      scales += block_size / group_size;
      biases += block_size / group_size;
      x += block_size;
    }
    // Number of input values left for this lane in the final block, clamped
    // to [0, values_per_thread].
    const int remaining = clamp(
        static_cast<int>(in_vec_size - k - simd_lid * values_per_thread),
        0,
        values_per_thread);
    if (remaining > 0) {
      U sum = load_vector_safe<T, U, values_per_thread, bits>(
          x, x_thread, remaining);

      for (int row = 0; row < results_per_simdgroup; row++) {
        auto wl = (const device uint8_t*)(ws + row * in_vec_size_w);
        const device T* sl = scales + row * in_vec_size_g;
        const device T* bl = biases + row * in_vec_size_g;

        U s = sl[0];
        U b = bl[0];
        result[row] += qdot_safe<U, values_per_thread, bits>(
            wl, x_thread, s, b, sum, remaining);
      }
    }
    // Reduce across the simdgroup; lane 0 writes all four rows (the start row
    // was clamped above so they are all in range).
    for (int row = 0; row < results_per_simdgroup; row++) {
      result[row] = simd_sum(result[row]);
      if (simd_lid == 0) {
        y[row] = static_cast<T>(result[row]);
      }
    }
  }
}

// Quantized vector-matrix product.
//
// Each simdgroup produces tn * pack_factor consecutive outputs starting at
// out_col. Within a block of SIMD_SIZE input elements, lane `simd_lid` reads
// x[simd_lid] and the matching row of w (offset simd_lid * out_vec_size_w),
// accumulates its contribution with qouter, and the per-lane partial results
// are summed with simd_sum at the end.
//
// in_vec_size need not be a multiple of SIMD_SIZE: the leftover elements are
// handled by a final step in which lanes >= remaining contribute zero.
template <typename T, const int group_size, const int bits>
METAL_FUNC void qvm_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    const int in_vec_size,
    const int out_vec_size,
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
  constexpr int num_simdgroups = 2;
  constexpr int pack_factor = get_pack_factor<bits, 32>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();

  constexpr int tn = 32 / pack_factor;
  constexpr int block_size = SIMD_SIZE;

  // Weights are addressed as uint32_t when bits is a power of two and as
  // bytes otherwise.
  using W_T =
      typename ConditionalType<power_of_2_bits, uint32_t, uint8_t>::type;
  const device W_T* ws = (const device W_T*)w;

  typedef float U;
  // Wrapper struct so a lane's whole slice of packed weights can be copied
  // from device memory with a single assignment.
  typedef struct {
    W_T wi[tn * bytes_per_pack];
  } vec_w;

  thread vec_w w_local;
  thread U result[tn * pack_factor] = {0};
  thread U scale = 1;
  thread U bias = 0;
  thread U x_local = 0;

  // Adjust positions
  const int out_vec_size_w = out_vec_size * bytes_per_pack / pack_factor;
  const int out_vec_size_g = out_vec_size / group_size;
  int out_col = pack_factor * tn * (tid.y * num_simdgroups + simd_gid);
  ws += out_col * bytes_per_pack / pack_factor + simd_lid * out_vec_size_w;
  scales += out_col / group_size + simd_lid * out_vec_size_g;
  biases += out_col / group_size + simd_lid * out_vec_size_g;
  x += tid.x * in_vec_size + simd_lid;
  y += tid.x * out_vec_size + out_col;

  // Simdgroups that start past the last output column have nothing to do.
  // (The pointers above are only offset, not dereferenced, before this.)
  if (out_col >= out_vec_size) {
    return;
  }

  // Loop over in_vec in blocks of block_size
  int remaining = in_vec_size % block_size;
  if (remaining == 0) {
    for (int i = 0; i < in_vec_size; i += block_size) {
      x_local = *x;
      scale = *scales;
      bias = *biases;
      w_local = *((device vec_w*)ws);
      qouter<U, tn * pack_factor, bits>(
          (thread uint8_t*)&w_local, x_local, scale, bias, result);

      x += block_size;
      scales += block_size * out_vec_size_g;
      biases += block_size * out_vec_size_g;
      ws += block_size * out_vec_size_w;
    }
  } else {
    // Starting at i = block_size makes this loop run one iteration fewer than
    // the number of (partial) blocks; the last block is handled below.
    for (int i = block_size; i < in_vec_size; i += block_size) {
      x_local = *x;
      scale = *scales;
      bias = *biases;
      w_local = *((device vec_w*)ws);

      qouter<U, tn * pack_factor, bits>(
          (thread uint8_t*)&w_local, x_local, scale, bias, result);

      x += block_size;
      scales += block_size * out_vec_size_g;
      biases += block_size * out_vec_size_g;
      ws += block_size * out_vec_size_w;
    }
    // Final partial block: only lanes below `remaining` read from memory.
    // The other lanes zero x_local, scale and bias; w_local is left with
    // whatever it held before.
    if (static_cast<int>(simd_lid) < remaining) {
      x_local = *x;
      scale = *scales;
      bias = *biases;
      w_local = *((device vec_w*)ws);
    } else {
      x_local = 0;
      scale = 0;
      bias = 0;
    }
    qouter<U, tn * pack_factor, bits>(
        (thread uint8_t*)&w_local, x_local, scale, bias, result);
  }

// Accumulate in the simdgroup
#pragma clang loop unroll(full)
  for (int k = 0; k < tn * pack_factor; k++) {
    result[k] = simd_sum(result[k]);
  }

  // Store the result
  if (simd_lid == 0) {
#pragma clang loop unroll(full)
    for (int k = 0; k < tn * pack_factor; k++) {
      y[k] = static_cast<T>(result[k]);
    }
  }
}

// Tiled quantized matrix-matrix product for one BM x BN tile of y.
//
// The weights are indexed with a row stride derived from K (K_w bytes of
// packed data and K_g scale/bias groups per row), and the tile of w starts at
// row y_col; i.e. w is laid out with one row per output column (the "_t" in
// the name presumably stands for transposed).
//
// Xs and Ws are threadgroup scratch buffers for the current x and dequantized
// w tiles. The four loop variants differ only in whether x and/or w need the
// bounds-checked loader:
//  - x uses load_safe when fewer than BM rows remain (num_els < BM),
//  - w uses load_safe when aligned_N is false and fewer than BN columns
//    remain (num_outs < BN).
// The K loop advances in steps of BK without a remainder step, so K is
// presumably required to be a multiple of BK (confirm against the caller).
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
METAL_FUNC void qmm_t_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    threadgroup T* Xs,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  // lid is part of the signature but not used here.
  (void)lid;

  constexpr int WM = 2;
  constexpr int WN = 2;
  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();

  // Leading dimension of the threadgroup tiles: BK plus 16 bytes of padding.
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  // Instantiate the appropriate BlockMMA and Loader
  using mma_t = mlx::steel::
      BlockMMA<T, T, BM, BN, BK, WM, WN, false, true, BK_padded, BK_padded>;
  using loader_x_t =
      mlx::steel::BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE>;
  using loader_w_t = QuantizedBlockLoader<
      T,
      BN,
      BK,
      BK_padded,
      1,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // Set the block
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  auto wl = (const device uint8_t*)w;

  x += y_row * static_cast<int64_t>(K);
  wl += y_col * K_w;
  scales += y_col * K_g;
  biases += y_col * K_g;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Make the x loader and mma operation
  // num_els / num_outs: valid rows / columns of this output tile.
  const short num_els = min(BM, M - y_row);
  const short num_outs = min(BN, N - y_col);
  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);
  loader_w_t loader_w(wl, scales, biases, K, Ws, simd_gid, simd_lid);
  mma_t mma_op(simd_gid, simd_lid);

  // Each iteration: barrier, load both tiles into threadgroup memory,
  // barrier, multiply-accumulate, then advance both loaders along K.
  if (num_els < BM) {
    if (!aligned_N && num_outs < BN) {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  } else {
    if (!aligned_N && num_outs < BN) {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_safe(short2(BK, num_outs));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);

        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  }

  // Store results to device memory
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Partial tiles use the bounds-checked store with the valid (cols, rows).
  if (num_els < BM || num_outs < BN) {
    mma_op.store_result_safe(y, N, short2(num_outs, num_els));
  } else {
    mma_op.store_result(y, N);
  }
}

// Tiled quantized matrix-matrix product for one BM x BN tile of y, with the
// weights indexed using N as the leading dimension (the w loader is built
// with src_ld = N and the tile of w starts at column y_col).
//
// Xs and Ws are threadgroup scratch buffers for the current x and dequantized
// w tiles. Two conditions select the loop variant:
//  - num_els < BM: x is loaded with the bounds-checked loader,
//  - K % BK != 0: the full K blocks are followed by one partial block of
//    num_k elements, loaded with load_safe on both operands.
// There is no bounds handling along N here (the store always uses BN
// columns), so N is presumably required to be a multiple of BN (confirm
// against the caller).
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
METAL_FUNC void qmm_n_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    threadgroup T* Xs,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  // lid is part of the signature but not used here.
  (void)lid;

  constexpr int WM = 2;
  constexpr int WN = 2;
  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();

  // Leading dimensions of the threadgroup tiles: 16 bytes of padding each.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  // Instantiate the appropriate BlockMMA and Loader
  using mma_t = mlx::steel::
      BlockMMA<T, T, BM, BN, BK, WM, WN, false, false, BK_padded, BN_padded>;
  using loader_x_t = mlx::steel::
      BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE, 1, 4>;
  using loader_w_t = QuantizedBlockLoader<
      T,
      BK,
      BN,
      BN_padded,
      0,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  auto wl = (const device uint8_t*)w;

  // Set the block
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;
  x += y_row * static_cast<int64_t>(K);
  wl += y_col * bytes_per_pack / pack_factor;
  scales += y_col / group_size;
  biases += y_col / group_size;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Make the x loader and mma operation
  // num_els: valid rows of this output tile.
  const short num_els = min(BM, M - y_row);
  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);
  loader_w_t loader_w(wl, scales, biases, N, Ws, simd_gid, simd_lid);
  mma_t mma_op(simd_gid, simd_lid);

  // Each iteration: barrier, load both tiles into threadgroup memory,
  // barrier, multiply-accumulate, then advance both loaders along K.
  if (num_els < BM) {
    if ((K % BK) != 0) {
      // Full K blocks first, then one partial block of num_k elements.
      const int k_blocks = K / BK;
      for (int k = 0; k < k_blocks; k++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
      const short num_k = K - k_blocks * BK;
      threadgroup_barrier(mem_flags::mem_threadgroup);
      loader_x.load_safe(short2(num_k, num_els));
      loader_w.load_safe(short2(BN, num_k));
      threadgroup_barrier(mem_flags::mem_threadgroup);
      mma_op.mma(Xs, Ws);
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_safe(short2(BK, num_els));
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  } else {
    if ((K % BK) != 0) {
      // Full K blocks first, then one partial block of num_k elements.
      const int k_blocks = K / BK;
      for (int k = 0; k < k_blocks; k++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
      const short num_k = K - k_blocks * BK;
      threadgroup_barrier(mem_flags::mem_threadgroup);
      loader_x.load_safe(short2(num_k, BM));
      loader_w.load_safe(short2(BN, num_k));
      threadgroup_barrier(mem_flags::mem_threadgroup);
      mma_op.mma(Xs, Ws);
    } else {
      for (int k = 0; k < K; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_x.load_unsafe();
        loader_w.load_unsafe();
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(Xs, Ws);
        loader_x.next();
        loader_w.next();
      }
    }
  }

  // Store results to device memory
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Only the row count is bounds-checked on store.
  if (num_els < BM) {
    mma_op.store_result_safe(y, N, short2(BN, num_els));
  } else {
    mma_op.store_result(y, N);
  }
}

// Offset x, w, scales, biases and y to the matrices selected by the
// threadgroup's z coordinate.
//
// x and w each have their own batch shape/strides. With a single batch
// dimension the offset is just index * stride[0]; otherwise the index is
// converted to a location with elem_to_loc / elem_to_loc_broadcast. scales
// and biases share w's batch shape but use their own strides. y is offset by
// tid.z * output_stride.
template <typename T>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device T*& scales,
    const device T*& biases,
    device T*& y,
    int output_stride,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int64_t* b_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // The same batch index selects both the input and the weight matrix.
  uint32_t batch = tid.z;

  // Input matrix.
  if (x_batch_ndims != 1) {
    x += elem_to_loc(batch, x_shape, x_strides, x_batch_ndims);
  } else {
    x += batch * x_strides[0];
  }

  // Weight matrix together with its scales and biases.
  if (w_batch_ndims != 1) {
    ulong3 w_loc = elem_to_loc_broadcast(
        batch, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);
    w += w_loc.x;
    scales += w_loc.y;
    biases += w_loc.z;
  } else {
    w += batch * w_strides[0];
    scales += batch * s_strides[0];
    biases += batch * b_strides[0];
  }

  // Output matrix.
  y += tid.z * output_stride;
}

// Gather variant: the matrices of x and w to use are looked up in
// lhs_indices / rhs_indices instead of being taken directly from tid.z.
//
// tid.z is first mapped to positions in the two index arrays (via
// lhs_strides / rhs_strides over batch_shape); the indices read there are
// then turned into offsets for x and for w/scales/biases exactly as in the
// non-gather overload. y is offset by tid.z * output_stride.
template <typename T>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device T*& scales,
    const device T*& biases,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T*& y,
    int output_stride,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int64_t* b_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Look up which x and w matrices this threadgroup works on.
  uint32_t x_idx;
  uint32_t w_idx;
  if (batch_ndims != 1) {
    ulong2 gather_loc = elem_to_loc_broadcast(
        tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);
    x_idx = lhs_indices[gather_loc.x];
    w_idx = rhs_indices[gather_loc.y];
  } else {
    x_idx = lhs_indices[tid.z * lhs_strides[0]];
    w_idx = rhs_indices[tid.z * rhs_strides[0]];
  }

  // Input matrix.
  if (x_batch_ndims != 1) {
    x += elem_to_loc(x_idx, x_shape, x_strides, x_batch_ndims);
  } else {
    x += x_idx * x_strides[0];
  }

  // Weight matrix together with its scales and biases.
  if (w_batch_ndims != 1) {
    ulong3 w_loc = elem_to_loc_broadcast(
        w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);
    w += w_loc.x;
    scales += w_loc.y;
    biases += w_loc.z;
  } else {
    w += w_idx * w_strides[0];
    scales += w_idx * s_strides[0];
    biases += w_idx * b_strides[0];
  }

  // Output matrix.
  y += tid.z * output_stride;
}

// Kernel entry point for qmv_quad_impl.
//
// When `batched` is set, the data pointers are first moved to the matrices
// selected by tid.z; the per-batch output stride is out_vec_size times the
// x_shape entry that follows the batch dimensions.
template <typename T, int group_size, int bits, int D, bool batched>
[[kernel]] void affine_qmv_quad(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& in_vec_size [[buffer(5)]],
    const constant int& out_vec_size [[buffer(6)]],
    const constant int& x_batch_ndims [[buffer(7)]],
    const constant int* x_shape [[buffer(8)]],
    const constant int64_t* x_strides [[buffer(9)]],
    const constant int& w_batch_ndims [[buffer(10)]],
    const constant int* w_shape [[buffer(11)]],
    const constant int64_t* w_strides [[buffer(12)]],
    const constant int64_t* s_strides [[buffer(13)]],
    const constant int64_t* b_strides [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint quad_gid [[quadgroup_index_in_threadgroup]],
    uint quad_lid [[thread_index_in_quadgroup]]) {
  if (batched) {
    const int y_batch_stride = out_vec_size * x_shape[x_batch_ndims];
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
  }

  qmv_quad_impl<T, group_size, bits, D>(
      w, scales, biases, x, y,
      in_vec_size, out_vec_size,
      tid, quad_gid, quad_lid);
}

// Kernel entry point for qmv_fast_impl.
//
// When `batched` is set, the data pointers are first moved to the matrices
// selected by tid.z; the per-batch output stride is out_vec_size times the
// x_shape entry that follows the batch dimensions.
template <typename T, int group_size, int bits, bool batched>
[[kernel]] void affine_qmv_fast(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& in_vec_size [[buffer(5)]],
    const constant int& out_vec_size [[buffer(6)]],
    const constant int& x_batch_ndims [[buffer(7)]],
    const constant int* x_shape [[buffer(8)]],
    const constant int64_t* x_strides [[buffer(9)]],
    const constant int& w_batch_ndims [[buffer(10)]],
    const constant int* w_shape [[buffer(11)]],
    const constant int64_t* w_strides [[buffer(12)]],
    const constant int64_t* s_strides [[buffer(13)]],
    const constant int64_t* b_strides [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  if (batched) {
    const int y_batch_stride = out_vec_size * x_shape[x_batch_ndims];
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
  }

  qmv_fast_impl<T, group_size, bits>(
      w, scales, biases, x, y,
      in_vec_size, out_vec_size,
      tid, simd_gid, simd_lid);
}

// Kernel entry point for qmv_impl (the bounds-checked matrix-vector path).
//
// When `batched` is set, the data pointers are first moved to the matrices
// selected by tid.z; the per-batch output stride is out_vec_size times the
// x_shape entry that follows the batch dimensions.
template <typename T, const int group_size, const int bits, bool batched>
[[kernel]] void affine_qmv(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& in_vec_size [[buffer(5)]],
    const constant int& out_vec_size [[buffer(6)]],
    const constant int& x_batch_ndims [[buffer(7)]],
    const constant int* x_shape [[buffer(8)]],
    const constant int64_t* x_strides [[buffer(9)]],
    const constant int& w_batch_ndims [[buffer(10)]],
    const constant int* w_shape [[buffer(11)]],
    const constant int64_t* w_strides [[buffer(12)]],
    const constant int64_t* s_strides [[buffer(13)]],
    const constant int64_t* b_strides [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  if (batched) {
    const int y_batch_stride = out_vec_size * x_shape[x_batch_ndims];
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
  }

  qmv_impl<T, group_size, bits>(
      w, scales, biases, x, y,
      in_vec_size, out_vec_size,
      tid, simd_gid, simd_lid);
}

// Kernel entry point for qvm_impl (vector-matrix product).
//
// When `batched` is set, the data pointers are first moved to the matrices
// selected by tid.z; the per-batch output stride is out_vec_size times the
// x_shape entry that follows the batch dimensions.
template <typename T, const int group_size, const int bits, bool batched>
[[kernel]] void affine_qvm(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& in_vec_size [[buffer(5)]],
    const constant int& out_vec_size [[buffer(6)]],
    const constant int& x_batch_ndims [[buffer(7)]],
    const constant int* x_shape [[buffer(8)]],
    const constant int64_t* x_strides [[buffer(9)]],
    const constant int& w_batch_ndims [[buffer(10)]],
    const constant int* w_shape [[buffer(11)]],
    const constant int64_t* w_strides [[buffer(12)]],
    const constant int64_t* s_strides [[buffer(13)]],
    const constant int64_t* b_strides [[buffer(14)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  if (batched) {
    const int y_batch_stride = out_vec_size * x_shape[x_batch_ndims];
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        y_batch_stride,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
  }

  qvm_impl<T, group_size, bits>(
      w, scales, biases, x, y,
      in_vec_size, out_vec_size,
      tid, simd_gid, simd_lid);
}

// Kernel entry point for qvm_impl used when the input dimension is split into
// split_k pieces handled by separate threadgroups along z.
//
// The pointers are always offset by tid.z via adjust_matrix_offsets (there is
// no `batched` switch here). Each threadgroup reduces in_vec_size elements,
// except the last split of every group of split_k, which reduces
// final_block_size elements instead.
template <typename T, const int group_size, const int bits, int split_k = 32>
[[kernel]] void affine_qvm_split_k(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& in_vec_size [[buffer(5)]],
    const constant int& out_vec_size [[buffer(6)]],
    const constant int& x_batch_ndims [[buffer(7)]],
    const constant int* x_shape [[buffer(8)]],
    const constant int64_t* x_strides [[buffer(9)]],
    const constant int& w_batch_ndims [[buffer(10)]],
    const constant int* w_shape [[buffer(11)]],
    const constant int64_t* w_strides [[buffer(12)]],
    const constant int64_t* s_strides [[buffer(13)]],
    const constant int64_t* b_strides [[buffer(14)]],
    const constant int& final_block_size [[buffer(15)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Per-split output stride: out_vec_size times the x_shape entry that
  // follows the batch dimensions.
  const int y_split_stride = out_vec_size * x_shape[x_batch_ndims];
  adjust_matrix_offsets<T>(
      x, w, scales, biases, y,
      y_split_stride,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  // When (in_vec_size % split_k != 0) the final block needs to be smaller
  int in_vec_size_adj;
  if (tid.z % split_k == split_k - 1) {
    in_vec_size_adj = final_block_size;
  } else {
    in_vec_size_adj = in_vec_size;
  }

  qvm_impl<T, group_size, bits>(
      w, scales, biases, x, y,
      in_vec_size_adj, out_vec_size,
      tid, simd_gid, simd_lid);
}

// Kernel entry point for the quantized matrix-matrix product implemented by
// qmm_t_impl (the "_t" variant; the tile for w is laid out as BN x BK_padded).
//
// Declares the two threadgroup staging tiles, optionally lets
// adjust_matrix_offsets (defined elsewhere) reposition the pointers when the
// `batched` template flag is set, and then forwards everything to qmm_t_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const bool batched,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void affine_qmm_t(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& K [[buffer(5)]],
    const constant int& N [[buffer(6)]],
    const constant int& M [[buffer(7)]],
    const constant int& x_batch_ndims [[buffer(8)]],
    const constant int* x_shape [[buffer(9)]],
    const constant int64_t* x_strides [[buffer(10)]],
    const constant int& w_batch_ndims [[buffer(11)]],
    const constant int* w_shape [[buffer(12)]],
    const constant int64_t* w_strides [[buffer(13)]],
    const constant int64_t* s_strides [[buffer(14)]],
    const constant int64_t* b_strides [[buffer(15)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Each tile row is padded by 16 bytes worth of T elements.
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  // Threadgroup staging tiles for x and the dequantized w.
  threadgroup T x_tile[BM * BK_padded];
  threadgroup T w_tile[BN * BK_padded];

  // Only batched instantiations need their pointers repositioned.
  if (batched) {
    // clang-format off
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        M * N,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
    // clang-format on
  }

  // clang-format off
  qmm_t_impl<T, group_size, bits, aligned_N, BM, BK, BN>(
      w, scales, biases, x, y,
      x_tile, w_tile,
      K, N, M,
      tid, lid, simd_gid, simd_lid);
  // clang-format on
}

// Kernel entry point for the quantized matrix-matrix product implemented by
// qmm_n_impl (the "_n" variant; the tile for w is laid out as BK x BN_padded).
//
// Declares the two threadgroup staging tiles, optionally lets
// adjust_matrix_offsets (defined elsewhere) reposition the pointers when the
// `batched` template flag is set, and then forwards everything to qmm_n_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool batched,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void affine_qmm_n(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& K [[buffer(5)]],
    const constant int& N [[buffer(6)]],
    const constant int& M [[buffer(7)]],
    const constant int& x_batch_ndims [[buffer(8)]],
    const constant int* x_shape [[buffer(9)]],
    const constant int64_t* x_strides [[buffer(10)]],
    const constant int& w_batch_ndims [[buffer(11)]],
    const constant int* w_shape [[buffer(12)]],
    const constant int64_t* w_strides [[buffer(13)]],
    const constant int64_t* s_strides [[buffer(14)]],
    const constant int64_t* b_strides [[buffer(15)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Each tile row is padded by 16 bytes worth of T elements.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  // Threadgroup staging tiles for x and the dequantized w.
  threadgroup T x_tile[BM * BK_padded];
  threadgroup T w_tile[BK * BN_padded];

  // Only batched instantiations need their pointers repositioned.
  if (batched) {
    // clang-format off
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        M * N,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
    // clang-format on
  }

  // clang-format off
  qmm_n_impl<T, group_size, bits, BM, BK, BN>(
      w, scales, biases, x, y,
      x_tile, w_tile,
      K, N, M,
      tid, lid, simd_gid, simd_lid);
  // clang-format on
}

// Gather flavour of the quantized matrix-vector kernel backed by
// qmv_fast_impl.
//
// lhs_indices / rhs_indices together with the batch_* and *_strides arguments
// are passed to the gather overload of adjust_matrix_offsets (defined
// elsewhere; presumably it selects which x and w slices this threadgroup
// works on). The repositioned pointers are then forwarded to qmv_fast_impl.
template <typename T, int group_size, int bits>
[[kernel]] void affine_gather_qmv_fast(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    const device uint32_t* lhs_indices [[buffer(4)]],
    const device uint32_t* rhs_indices [[buffer(5)]],
    device T* y [[buffer(6)]],
    const constant int& in_vec_size [[buffer(7)]],
    const constant int& out_vec_size [[buffer(8)]],
    const constant int& x_batch_ndims [[buffer(9)]],
    const constant int* x_shape [[buffer(10)]],
    const constant int64_t* x_strides [[buffer(11)]],
    const constant int& w_batch_ndims [[buffer(12)]],
    const constant int* w_shape [[buffer(13)]],
    const constant int64_t* w_strides [[buffer(14)]],
    const constant int64_t* s_strides [[buffer(15)]],
    const constant int64_t* b_strides [[buffer(16)]],
    const constant int& batch_ndims [[buffer(17)]],
    const constant int* batch_shape [[buffer(18)]],
    const constant int64_t* lhs_strides [[buffer(19)]],
    const constant int64_t* rhs_strides [[buffer(20)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Entry of x_shape located right after the batch dimensions.
  const int rows = x_shape[x_batch_ndims];

  // clang-format off
  adjust_matrix_offsets<T>(
      x, w, scales, biases,
      lhs_indices, rhs_indices,
      y,
      out_vec_size * rows,
      batch_ndims, batch_shape,
      lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  qmv_fast_impl<T, group_size, bits>(
      w, scales, biases, x, y,
      in_vec_size, out_vec_size,
      tid, simd_gid, simd_lid);
  // clang-format on
}

// Gather flavour of the quantized matrix-vector kernel backed by qmv_impl.
//
// lhs_indices / rhs_indices together with the batch_* and *_strides arguments
// are passed to the gather overload of adjust_matrix_offsets (defined
// elsewhere; presumably it selects which x and w slices this threadgroup
// works on). The repositioned pointers are then forwarded to qmv_impl.
template <typename T, int group_size, int bits>
[[kernel]] void affine_gather_qmv(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    const device uint32_t* lhs_indices [[buffer(4)]],
    const device uint32_t* rhs_indices [[buffer(5)]],
    device T* y [[buffer(6)]],
    const constant int& in_vec_size [[buffer(7)]],
    const constant int& out_vec_size [[buffer(8)]],
    const constant int& x_batch_ndims [[buffer(9)]],
    const constant int* x_shape [[buffer(10)]],
    const constant int64_t* x_strides [[buffer(11)]],
    const constant int& w_batch_ndims [[buffer(12)]],
    const constant int* w_shape [[buffer(13)]],
    const constant int64_t* w_strides [[buffer(14)]],
    const constant int64_t* s_strides [[buffer(15)]],
    const constant int64_t* b_strides [[buffer(16)]],
    const constant int& batch_ndims [[buffer(17)]],
    const constant int* batch_shape [[buffer(18)]],
    const constant int64_t* lhs_strides [[buffer(19)]],
    const constant int64_t* rhs_strides [[buffer(20)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Entry of x_shape located right after the batch dimensions.
  const int rows = x_shape[x_batch_ndims];

  // clang-format off
  adjust_matrix_offsets<T>(
      x, w, scales, biases,
      lhs_indices, rhs_indices,
      y,
      out_vec_size * rows,
      batch_ndims, batch_shape,
      lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  qmv_impl<T, group_size, bits>(
      w, scales, biases, x, y,
      in_vec_size, out_vec_size,
      tid, simd_gid, simd_lid);
  // clang-format on
}

// Gather flavour of the quantized vector-matrix kernel backed by qvm_impl.
//
// lhs_indices / rhs_indices together with the batch_* and *_strides arguments
// are passed to the gather overload of adjust_matrix_offsets (defined
// elsewhere; presumably it selects which x and w slices this threadgroup
// works on). The repositioned pointers are then forwarded to qvm_impl.
template <typename T, int group_size, int bits>
[[kernel]] void affine_gather_qvm(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    const device uint32_t* lhs_indices [[buffer(4)]],
    const device uint32_t* rhs_indices [[buffer(5)]],
    device T* y [[buffer(6)]],
    const constant int& in_vec_size [[buffer(7)]],
    const constant int& out_vec_size [[buffer(8)]],
    const constant int& x_batch_ndims [[buffer(9)]],
    const constant int* x_shape [[buffer(10)]],
    const constant int64_t* x_strides [[buffer(11)]],
    const constant int& w_batch_ndims [[buffer(12)]],
    const constant int* w_shape [[buffer(13)]],
    const constant int64_t* w_strides [[buffer(14)]],
    const constant int64_t* s_strides [[buffer(15)]],
    const constant int64_t* b_strides [[buffer(16)]],
    const constant int& batch_ndims [[buffer(17)]],
    const constant int* batch_shape [[buffer(18)]],
    const constant int64_t* lhs_strides [[buffer(19)]],
    const constant int64_t* rhs_strides [[buffer(20)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Entry of x_shape located right after the batch dimensions.
  const int rows = x_shape[x_batch_ndims];

  // clang-format off
  adjust_matrix_offsets<T>(
      x, w, scales, biases,
      lhs_indices, rhs_indices,
      y,
      out_vec_size * rows,
      batch_ndims, batch_shape,
      lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  qvm_impl<T, group_size, bits>(
      w, scales, biases, x, y,
      in_vec_size, out_vec_size,
      tid, simd_gid, simd_lid);
  // clang-format on
}

// Gather flavour of the quantized matrix-matrix kernel backed by qmm_t_impl
// (the tile for w is laid out as BN x BK_padded).
//
// The gather overload of adjust_matrix_offsets (defined elsewhere) always
// runs here - there is no `batched` switch - receiving lhs_indices /
// rhs_indices and the batch metadata; the repositioned pointers and the two
// threadgroup tiles are then forwarded to qmm_t_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void affine_gather_qmm_t(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    const device uint32_t* lhs_indices [[buffer(4)]],
    const device uint32_t* rhs_indices [[buffer(5)]],
    device T* y [[buffer(6)]],
    const constant int& K [[buffer(7)]],
    const constant int& N [[buffer(8)]],
    const constant int& M [[buffer(9)]],
    const constant int& x_batch_ndims [[buffer(10)]],
    const constant int* x_shape [[buffer(11)]],
    const constant int64_t* x_strides [[buffer(12)]],
    const constant int& w_batch_ndims [[buffer(13)]],
    const constant int* w_shape [[buffer(14)]],
    const constant int64_t* w_strides [[buffer(15)]],
    const constant int64_t* s_strides [[buffer(16)]],
    const constant int64_t* b_strides [[buffer(17)]],
    const constant int& batch_ndims [[buffer(18)]],
    const constant int* batch_shape [[buffer(19)]],
    const constant int64_t* lhs_strides [[buffer(20)]],
    const constant int64_t* rhs_strides [[buffer(21)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Each tile row is padded by 16 bytes worth of T elements.
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  // Threadgroup staging tiles for x and the dequantized w.
  threadgroup T x_tile[BM * BK_padded];
  threadgroup T w_tile[BN * BK_padded];

  // clang-format off
  adjust_matrix_offsets<T>(
      x, w, scales, biases,
      lhs_indices, rhs_indices,
      y,
      M * N,
      batch_ndims, batch_shape,
      lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  qmm_t_impl<T, group_size, bits, aligned_N, BM, BK, BN>(
      w, scales, biases, x, y,
      x_tile, w_tile,
      K, N, M,
      tid, lid, simd_gid, simd_lid);
  // clang-format on
}

// Gather flavour of the quantized matrix-matrix kernel backed by qmm_n_impl
// (the tile for w is laid out as BK x BN_padded).
//
// The gather overload of adjust_matrix_offsets (defined elsewhere) always
// runs here - there is no `batched` switch - receiving lhs_indices /
// rhs_indices and the batch metadata; the repositioned pointers and the two
// threadgroup tiles are then forwarded to qmm_n_impl.
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 32,
    const int BK = 32,
    const int BN = 32>
[[kernel]] void affine_gather_qmm_n(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    const device uint32_t* lhs_indices [[buffer(4)]],
    const device uint32_t* rhs_indices [[buffer(5)]],
    device T* y [[buffer(6)]],
    const constant int& K [[buffer(7)]],
    const constant int& N [[buffer(8)]],
    const constant int& M [[buffer(9)]],
    const constant int& x_batch_ndims [[buffer(10)]],
    const constant int* x_shape [[buffer(11)]],
    const constant int64_t* x_strides [[buffer(12)]],
    const constant int& w_batch_ndims [[buffer(13)]],
    const constant int* w_shape [[buffer(14)]],
    const constant int64_t* w_strides [[buffer(15)]],
    const constant int64_t* s_strides [[buffer(16)]],
    const constant int64_t* b_strides [[buffer(17)]],
    const constant int& batch_ndims [[buffer(18)]],
    const constant int* batch_shape [[buffer(19)]],
    const constant int64_t* lhs_strides [[buffer(20)]],
    const constant int64_t* rhs_strides [[buffer(21)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Each tile row is padded by 16 bytes worth of T elements.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  // Threadgroup staging tiles for x and the dequantized w.
  threadgroup T x_tile[BM * BK_padded];
  threadgroup T w_tile[BK * BN_padded];

  // clang-format off
  adjust_matrix_offsets<T>(
      x, w, scales, biases,
      lhs_indices, rhs_indices,
      y,
      M * N,
      batch_ndims, batch_shape,
      lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  qmm_n_impl<T, group_size, bits, BM, BK, BN>(
      w, scales, biases, x, y,
      x_tile, w_tile,
      K, N, M,
      tid, lid, simd_gid, simd_lid);
  // clang-format on
}

// Quantized matmul in which only the right-hand side is gathered: `indices`
// holds one entry per row of x, and that entry selects which quantized weight
// matrix (offset `index * stride_w` into w, `index * stride_s` into
// scales/biases) the row is multiplied with.
//
// Each threadgroup owns the BM x BN output tile at (tid.y, tid.x). Rows of the
// tile are processed in runs of consecutive equal indices: for every run the
// tile product is evaluated against the selected weight matrix and only the
// rows belonging to the run are written to y.
//
// Template parameters:
//   group_size, bits - quantization layout of w / scales / biases
//   BM, BN, BK       - tile sizes; WM, WN - simdgroup layout (the loaders are
//                      built for WM * WN * SIMD_SIZE threads)
//   transpose        - selects the w layout (strides and tile shapes below)
//
// NOTE(review): align_M, align_N and align_K are not declared in this block;
// presumably they are function constants defined earlier in the file.
template <
    typename T,
    int group_size,
    int bits,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose>
[[kernel]] void affine_gather_qmm_rhs(
    const device T* x [[buffer(0)]],
    const device uint32_t* w [[buffer(1)]],
    const device T* scales [[buffer(2)]],
    const device T* biases [[buffer(3)]],
    const device uint32_t* indices [[buffer(4)]],
    device T* y [[buffer(5)]],
    const constant int& M [[buffer(6)]],
    const constant int& N [[buffer(7)]],
    const constant int& K [[buffer(8)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]]) {
  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
  // Tile rows are padded by 16 bytes worth of T elements.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  using mma_t = mlx::steel::BlockMMA<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      false,
      transpose,
      BK_padded,
      transpose ? BK_padded : BN_padded>;
  using loader_x_t =
      mlx::steel::BlockLoader<T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE>;
  using loader_w_t = QuantizedBlockLoader<
      T,
      transpose ? BN : BK,
      transpose ? BK : BN,
      transpose ? BK_padded : BN_padded,
      transpose,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // Threadgroup staging tiles for x and w.
  threadgroup T Xs[BM * BK_padded];
  threadgroup T Ws[transpose ? BN * BK_padded : BK * BN_padded];

  // Compute the block
  // K_w / N_w: packed bytes per row along K / N; K_g / N_g: quantization
  // groups per row along K / N.
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int N_w = N * bytes_per_pack / pack_factor;
  const int N_g = N / group_size;
  // Number of whole BK steps along K; the remainder is handled separately.
  const int K_it = K / BK;
  // Distance (bytes of w, elements of scales/biases) between two consecutive
  // weight matrices selectable through `indices`.
  const size_t stride_w = transpose ? N * K_w : K * N_w;
  const size_t stride_s = transpose ? N * K_g : K * N_g;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;
  const size_t y_row_long = size_t(y_row);
  const size_t y_col_long = size_t(y_col);

  // Prepare threadgroup bounds
  const short tgp_bm = align_M ? BM : short(min(BM, M - y_row));
  const short tgp_bn = align_N ? BN : short(min(BN, N - y_col));

  // Calculate the final tiles in the case that K is not aligned
  const int k_remain = K - K_it * BK;
  const short2 tile_x = short2(k_remain, tgp_bm);
  const short2 tile_w =
      transpose ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

  // Move x and output to the correct block
  // w is addressed per byte from here on.
  auto wl = (const device uint8_t*)w;
  x += y_row_long * K;
  y += y_row_long * N + y_col_long;
  wl += transpose ? y_col_long * K_w : y_col * bytes_per_pack / pack_factor;
  scales += transpose ? y_col_long * K_g : y_col / group_size;
  biases += transpose ? y_col_long * K_g : y_col / group_size;

  // Do as many matmuls as necessary
  // [offset, offset_next) is the run of tile rows that share `index`; the
  // *_next values are found one run ahead while scanning `indices`.
  uint32_t index;
  short offset;
  uint32_t index_next = indices[y_row];
  short offset_next = 0;
  int n = 0;
  while (n < tgp_bm) {
    n++;
    offset = offset_next;
    index = index_next;
    // Default: the run extends to the end of the tile unless a different
    // index is found below.
    offset_next = tgp_bm;
    for (; n < tgp_bm; n++) {
      if (indices[y_row + n] != index) {
        offset_next = n;
        index_next = indices[y_row + n];
        break;
      }
    }
    // NOTE(review): presumably keeps threads from starting the next run's
    // loads while others still use Xs/Ws from the previous run - confirm.
    threadgroup_barrier(mem_flags::mem_none);

    // Prepare threadgroup mma operation
    thread mma_t mma_op(simd_group_id, simd_lane_id);

    // Prepare threadgroup loading operations
    // loader_x starts from the tile's first row for every run; only the
    // store below is restricted to the run's rows.
    thread loader_x_t loader_x(x, K, Xs, simd_group_id, simd_lane_id);
    thread loader_w_t loader_w(
        wl + index * stride_w,
        scales + index * stride_s,
        biases + index * stride_s,
        transpose ? K : N,
        Ws,
        simd_group_id,
        simd_lane_id);

    // Matrices are all aligned check nothing
    if (align_M && align_N) {
      gemm_loop_aligned(Xs, Ws, mma_op, loader_x, loader_w, K_it);
      if (!align_K) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        gemm_loop_finalize(Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
      }

      // Store results to device memory
      // A run covering all BM rows stores the whole tile; otherwise only the
      // rows of the run are stored.
      if (offset_next - offset == BM) {
        mma_op.store_result(y, N);
      } else {
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(BN, offset_next));
      }
    } else {
      // Tile aligned so check outside of the hot loop
      if ((align_M || tgp_bm == BM) && (align_N || tgp_bn == BN)) {
        gemm_loop_aligned(Xs, Ws, mma_op, loader_x, loader_w, K_it);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }

        // Store results to device memory
        if (offset_next - offset == BM) {
          mma_op.store_result(y, N);
        } else {
          mma_op.store_result_slice(
              y, N, short2(0, offset), short2(BN, offset_next));
        }
      }

      // Tile partially aligned check rows
      else if (align_N || tgp_bn == BN) {
        gemm_loop_unaligned<false, true, transpose>(
            Xs, Ws, mma_op, loader_x, loader_w, K_it, tgp_bm, tgp_bn, BK);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(BN, offset_next));
      }

      // Tile partially aligned check cols
      else if (align_M || tgp_bm == BM) {
        gemm_loop_unaligned<true, false, transpose>(
            Xs, Ws, mma_op, loader_x, loader_w, K_it, tgp_bm, tgp_bn, BK);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }
        // Columns are clipped to tgp_bn because the tile overhangs N here.
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(tgp_bn, offset_next));
      }

      // Nothing aligned so check both rows and cols
      else {
        gemm_loop_unaligned<false, false, transpose>(
            Xs, Ws, mma_op, loader_x, loader_w, K_it, tgp_bm, tgp_bn, BK);
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          gemm_loop_finalize(
              Xs, Ws, mma_op, loader_x, loader_w, tile_x, tile_w);
        }
        mma_op.store_result_slice(
            y, N, short2(0, offset), short2(tgp_bn, offset_next));
      }
    }
  }
}

// Affine-quantizes `w` to `bits` bits per value in groups of `group_size`.
//
// Every thread reads values_per_reduce = group_size / 32 consecutive inputs;
// the per-group minimum and maximum come from simd_min / simd_max over the
// threads' partial results (assumes a simdgroup is 32 threads wide, matching
// the hard-coded simd_size - TODO confirm against the dispatch).
//
//   w      - input values
//   out    - packed quantized bytes
//   scales - one scale per group (written by the thread whose first input
//            starts the group)
//   biases - one bias per group (written by the same thread)
//
// The 2D grid position is flattened into a linear thread offset.
template <typename T, const int group_size, const int bits>
[[kernel]] void affine_quantize(
    const device T* w [[buffer(0)]],
    device uint8_t* out [[buffer(1)]],
    device T* scales [[buffer(2)]],
    device T* biases [[buffer(3)]],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  constexpr float eps = 1e-7;
  constexpr int simd_size = 32;
  // Largest quantized code: 2^bits - 1.
  constexpr float n_bins = (1 << bits) - 1;
  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
  constexpr int values_per_reduce = group_size / simd_size;
  // How many threads contribute to one pack (0 when a single thread already
  // holds more than one pack worth of values).
  constexpr int writes_per_reduce = pack_factor / values_per_reduce;
  constexpr int writes_per_pack =
      writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
  constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;

  static_assert(
      group_size % simd_size == 0,
      "Group size must be divisible by simd size.");

  size_t offset = index.x + grid_dim.x * size_t(index.y);
  size_t in_index = offset * values_per_reduce;
  size_t out_index = power_of_2_bits
      ? offset * writes_per_pack
      : offset * bytes_per_pack / writes_per_reduce;

  float w_thread[values_per_reduce];
  float w_min = Limits<T>::max;
  // Start the running maximum at the negated starting minimum rather than 0,
  // so that a group containing only negative values reports its true maximum
  // instead of 0 (which would needlessly widen the quantization range).
  float w_max = -w_min;

#pragma clang loop unroll(full)
  for (int i = 0; i < values_per_reduce; i++) {
    float val = w[in_index + i];
    w_thread[i] = val;
    w_min = min(w_min, val);
    w_max = max(w_max, val);
  }

  w_min = simd_min(w_min);
  w_max = simd_max(w_max);

  // The scale is floored at eps so constant groups do not divide by zero.
  float scale = max((w_max - w_min) / n_bins, eps);
  // Anchor the grid at whichever extreme has the larger magnitude; when that
  // is the maximum the scale is negated so codes count down from w_max.
  bool side = abs(w_min) > abs(w_max);
  scale = side ? scale : -scale;
  float edge = side ? w_min : w_max;
  // Re-derive the scale so that edge / scale is an exact integer code, unless
  // that code rounds to 0, in which case the bias is dropped instead.
  float q0 = round(edge / scale);
  bool at_zero = q0 == 0.0f;
  scale = at_zero ? scale : edge / q0;
  float bias = at_zero ? 0 : edge;

  // Write out the scales and biases
  size_t gindex = in_index / group_size;
  if (in_index % group_size == 0) {
    scales[gindex] = static_cast<T>(scale);
    biases[gindex] = static_cast<T>(bias);
  }

  // 5-bit packs span 40 bits, so they need a 64-bit accumulator.
  using OutType = metal::conditional_t<bits == 5, uint64_t, uint32_t>;
  OutType output = 0;

#pragma clang loop unroll(full)
  for (int i = 0; i < values_per_reduce; i++) {
    uint8_t val = min(round((w_thread[i] - bias) / scale), n_bins);
    if (bits == 8) {
      output = val;
    } else {
      output |= val << (bits * (i % pack_factor));
    }

    if (pack_factor < values_per_reduce && i % pack_factor == pack_factor - 1) {
      // This thread alone completed a pack: write it out immediately.
      out[out_index + i / pack_factor] = output;
      output = 0;
    } else {
      // Otherwise collect the codes held by the following lanes so the
      // writing lane ends up with the complete pack.
#pragma clang loop unroll(full)
      for (int j = 1; j < writes_per_reduce; j++) {
        uint8_t sval = simd_shuffle_down(val, j);
        output |= static_cast<OutType>(sval)
            << (bits * (j * values_per_reduce + i));
      }
    }
  }
  if (bits == 3 || bits == 6) {
    // 3- and 6-bit packs occupy 3 bytes, written by the lane that starts
    // the pack.
    if (in_index % pack_factor == 0 && out_index % bytes_per_pack == 0) {
      out[out_index] = output & 0xff;
      out[out_index + 1] = (output & 0xff00) >> 8;
      out[out_index + 2] = (output & 0xff0000) >> 16;
    }
  } else if (bits == 5) {
    // 5-bit packs occupy 5 bytes.
    if (in_index % pack_factor == 0 && out_index % bytes_per_pack == 0) {
      out[out_index] = output & 0xff;
      out[out_index + 1] = (output & 0xff00) >> 8;
      out[out_index + 2] = (output & 0xff0000) >> 16;
      out[out_index + 3] = (output & 0xff000000) >> 24;
      out[out_index + 4] = (output & 0xff00000000) >> 32;
    }
  } else {
    // Power-of-two widths: one byte per pack, written by the first of the
    // writes_per_reduce lanes that contributed to it.
    if (writes_per_reduce > 0 && out_index % writes_per_reduce == 0) {
      out[out_index / writes_per_reduce] = output;
    }
  }
}

// Expands packed affine-quantized values back to T.
//
// Each thread decodes one pack (pack_factor values) and writes
// `code * scale + bias` for every value, using the scale and bias of the
// group that the pack's first value falls in.
//
//   w      - packed quantized bytes
//   scales - per-group scale
//   biases - per-group bias
//   out    - dequantized output
template <typename T, const int group_size, const int bits>
[[kernel]] void affine_dequantize(
    const device uint8_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    device T* out [[buffer(3)]],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();

  // Flatten the 2D grid position into the index of the pack to decode.
  const size_t pack_idx = index.x + grid_dim.x * size_t(index.y);
  const size_t first_value = pack_idx * pack_factor;
  const size_t group_idx = first_value / group_size;
  T scale = scales[group_idx];
  T bias = biases[group_idx];

  out += first_value;

  if (bits == 3) {
    // 8 values spread over 3 bytes.
    const device uint8_t* packed = w + pack_idx * bytes_per_pack;
    const uint8_t b0 = packed[0];
    const uint8_t b1 = packed[1];
    const uint8_t b2 = packed[2];
    out[0] = (b0 & 0x7) * scale + bias;
    out[1] = ((b0 & 0x38) >> 3) * scale + bias;
    out[2] = (((b0 & 0xc0) >> 6) + ((b1 & 0x1) << 2)) * scale + bias;
    out[3] = ((b1 & 0xe) >> 1) * scale + bias;
    out[4] = ((b1 & 0x70) >> 4) * scale + bias;
    out[5] = (((b1 & 0x80) >> 7) + ((b2 & 0x3) << 1)) * scale + bias;
    out[6] = ((b2 & 0x1c) >> 2) * scale + bias;
    out[7] = ((b2 & 0xe0) >> 5) * scale + bias;
  } else if (bits == 5) {
    // 8 values spread over 5 bytes.
    const device uint8_t* packed = w + pack_idx * bytes_per_pack;
    const uint8_t b0 = packed[0];
    const uint8_t b1 = packed[1];
    const uint8_t b2 = packed[2];
    const uint8_t b3 = packed[3];
    const uint8_t b4 = packed[4];
    out[0] = (b0 & 0x1f) * scale + bias;
    out[1] = (((b0 & 0xe0) >> 5) + ((b1 & 0x3) << 3)) * scale + bias;
    out[2] = ((b1 & 0x7c) >> 2) * scale + bias;
    out[3] = (((b1 & 0x80) >> 7) + ((b2 & 0xf) << 1)) * scale + bias;
    out[4] = (((b2 & 0xf0) >> 4) + ((b3 & 0x1) << 4)) * scale + bias;
    out[5] = ((b3 & 0x3e) >> 1) * scale + bias;
    out[6] = (((b3 & 0xc0) >> 6) + ((b4 & 0x7) << 2)) * scale + bias;
    out[7] = ((b4 & 0xf8) >> 3) * scale + bias;
  } else if (bits == 6) {
    // 4 values spread over 3 bytes.
    const device uint8_t* packed = w + pack_idx * bytes_per_pack;
    const uint8_t b0 = packed[0];
    const uint8_t b1 = packed[1];
    const uint8_t b2 = packed[2];
    out[0] = (b0 & 0x3f) * scale + bias;
    out[1] = (((b0 >> 6) & 0x03) + ((b1 & 0x0f) << 2)) * scale + bias;
    out[2] = (((b1 >> 4) & 0x0f) + ((b2 & 0x03) << 4)) * scale + bias;
    out[3] = ((b2 >> 2) & 0x3f) * scale + bias;
  } else {
    // Remaining widths: the whole pack lives in a single byte.
    uint packed_byte = w[pack_idx];
#pragma clang loop unroll(full)
    for (int k = 0; k < pack_factor; k++) {
      uint8_t code;
      if (bits == 2) {
        code = (packed_byte >> (bits * k)) & 0x03;
      } else if (bits == 4) {
        code = (packed_byte >> (bits * k)) & 0x0f;
      } else if (bits == 8) {
        code = packed_byte;
      }
      out[k] = scale * code + bias;
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/quantized.metal
================================================
// Copyright © 2023-2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/quantized_utils.h"
#include "mlx/backend/metal/kernels/quantized.h"

// Invokes instantiate_kernel (defined elsewhere) for kernel template `name`
// with template arguments <type, group_size, bits>. The first argument is the
// string "<name>_<type>_gs_<group_size>_b_<bits>", presumably the name the
// host looks the kernel up by.
#define instantiate_quantized(name, type, group_size, bits)     \
  instantiate_kernel(                                                    \
      #name "_" #type "_gs_" #group_size "_b_" #bits,                    \
      name,                                                              \
      type,                                                              \
      group_size,                                                        \
      bits)

// Same as instantiate_quantized, with an extra `batched` template argument
// that is also appended to the kernel name as "_batch_<batched>".
#define instantiate_quantized_batched(name, type, group_size, bits, batched)     \
  instantiate_kernel(                                                    \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_batch_" #batched, \
      name,                                                              \
      type,                                                              \
      group_size,                                                        \
      bits,                                                              \
      batched)

// Same as instantiate_quantized, with an extra `aligned` template argument
// that is also appended to the kernel name as "_alN_<aligned>".
#define instantiate_quantized_aligned(name, type, group_size, bits, aligned)     \
  instantiate_kernel(                                                                     \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_alN_" #aligned, \
      name,                                                                  \
      type,                                                                  \
      group_size,                                                            \
      bits,                                                                  \
      aligned)

// Same as instantiate_quantized, with extra `aligned` and `batched` template
// arguments; the kernel name gains "_alN_<aligned>_batch_<batched>".
#define instantiate_quantized_aligned_batched(name, type, group_size, bits, aligned, batched)     \
  instantiate_kernel(                                                                     \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_alN_" #aligned "_batch_" #batched, \
      name,                                                                  \
      type,                                                                  \
      group_size,                                                            \
      bits,                                                                  \
      aligned,                                                               \
      batched)

// Same as instantiate_quantized, with extra `D` and `batched` template
// arguments; the kernel name gains "_d_<D>_batch_<batched>".
#define instantiate_quantized_quad(name, type, group_size, bits, D, batched)     \
  instantiate_kernel(                                                            \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_d_" #D "_batch_" #batched, \
      name,                                                         \
      type,                                                         \
      group_size,                                                   \
      bits,                                                         \
      D,                                                            \
      batched)

// Same as instantiate_quantized, with an extra `split_k` template argument
// that is also appended to the kernel name as "_spk_<split_k>".
#define instantiate_quantized_split_k(name, type, group_size, bits, split_k)     \
  instantiate_kernel(                                                            \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_spk_" #split_k, \
      name,                                                         \
      type,                                                         \
      group_size,                                                   \
      bits,                                                         \
      split_k)

// Invokes instantiate_kernel for kernel template `func` with template
// arguments <type, group_size, bits, bm, bn, bk, wm, wn, transpose>.
// The kernel name is built from `name` (not `func`) plus the type, group
// size, bits and tile parameters; `transpose` is NOT part of the generated
// name, so callers must encode it in `name` to keep names unique.
#define instantiate_gather_qmm_rhs(func, name, type, group_size, bits, bm, bn, bk, wm, wn, transpose)        \
  instantiate_kernel(                                                                                        \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_bm_" #bm "_bn_" #bn "_bk_" #bk "_wm_" #wm "_wn_" #wn, \
      func,                                                         \
      type,                                                         \
      group_size,                                                   \
      bits,                                                         \
      bm,                                                           \
      bn,                                                           \
      bk,                                                           \
      wm,                                                           \
      wn,                                                           \
      transpose)

// Instantiates `name` twice: once with batched = 1 and once with batched = 0.
#define instantiate_quantized_batched_wrap(name, type, group_size, bits) \
  instantiate_quantized_batched(name, type, group_size, bits, 1)      \
  instantiate_quantized_batched(name, type, group_size, bits, 0)

// Batched and non-batched instantiations of the kernels whose only extra
// template argument is `batched`: qmv_fast, qmv, qvm and qmm_n.
#define instantiate_quantized_all_batched(type, group_size, bits) \
  instantiate_quantized_batched_wrap(affine_qmv_fast, type, group_size, bits)     \
  instantiate_quantized_batched_wrap(affine_qmv, type, group_size, bits)     \
  instantiate_quantized_batched_wrap(affine_qvm, type, group_size, bits)     \
  instantiate_quantized_batched_wrap(affine_qmm_n, type, group_size, bits)

// Kernels instantiated exactly once per <type, group_size, bits>:
// quantize, dequantize and the gather variants without extra flags.
#define instantiate_quantized_all_single(type, group_size, bits) \
  instantiate_quantized(affine_quantize, type, group_size, bits) \
  instantiate_quantized(affine_dequantize, type, group_size, bits)     \
  instantiate_quantized(affine_gather_qmv_fast, type, group_size, bits)     \
  instantiate_quantized(affine_gather_qmv, type, group_size, bits)     \
  instantiate_quantized(affine_gather_qvm, type, group_size, bits)     \
  instantiate_quantized(affine_gather_qmm_n, type, group_size, bits)

// Kernels carrying the aligned-N flag: affine_gather_qmm_t for both values
// of `aligned`, and affine_qmm_t for every (aligned, batched) combination.
#define instantiate_quantized_all_aligned(type, group_size, bits)   \
  instantiate_quantized_aligned(affine_gather_qmm_t, type, group_size, bits, true) \
  instantiate_quantized_aligned(affine_gather_qmm_t, type, group_size, bits, false) \
  instantiate_quantized_aligned_batched(affine_qmm_t, type, group_size, bits, true, 1) \
  instantiate_quantized_aligned_batched(affine_qmm_t, type, group_size, bits, true, 0) \
  instantiate_quantized_aligned_batched(affine_qmm_t, type, group_size, bits, false, 1) \
  instantiate_quantized_aligned_batched(affine_qmm_t, type, group_size, bits, false, 0)

#define instantiate_quantized_all_quad(type, group_size, bits)   \
  instantiate_quantized_quad(affine_qmv_quad, type, group_size, bits, 64, 1)   \
  instantiate_quantized_quad(affine_qmv_quad, type, group_size, bits, 64, 0)   \
  instantiate_quantized_quad(affine_qmv_quad, type, group_size, bits, 128, 1)  \
  instantiate_quantized_quad(affine_qmv_quad, type, group_size, bits, 128, 0)

#define instantiate_quantized_all_splitk(type, group_size, bits)   \
  instantiate_quantized_split_k(affine_qvm_split_k, type, group_size, bits, 8)   \
  instantiate_quantized_split_k(affine_qvm_split_k, type, group_size, bits, 32)

#define instantiate_quantized_all_rhs(type, group_size, bits) \
  instantiate_gather_qmm_rhs(affine_gather_qmm_rhs, affine_gather_qmm_rhs_nt, type, group_size, bits, 16, 32, 32, 1, 2, true) \
  instantiate_gather_qmm_rhs(affine_gather_qmm_rhs, affine_gather_qmm_rhs_nn, type, group_size, bits, 16, 32, 32, 1, 2, false)

#define instantiate_quantized_funcs(type, group_size, bits) \
  instantiate_quantized_all_single(type, group_size, bits)  \
  instantiate_quantized_all_batched(type, group_size, bits) \
  instantiate_quantized_all_aligned(type, group_size, bits) \
  instantiate_quantized_all_quad(type, group_size, bits)    \
  instantiate_quantized_all_splitk(type, group_size, bits)  \
  instantiate_quantized_all_rhs(type, group_size, bits)

#define instantiate_quantized_types(group_size, bits)       \
  instantiate_quantized_funcs(float, group_size, bits)      \
  instantiate_quantized_funcs(float16_t, group_size, bits)  \
  instantiate_quantized_funcs(bfloat16_t, group_size, bits)

#define instantiate_quantized_groups(bits) \
  instantiate_quantized_types(128, bits)   \
  instantiate_quantized_types(64, bits)    \
  instantiate_quantized_types(32, bits)

#define instantiate_quantized_all() \
  instantiate_quantized_groups(2) \
  instantiate_quantized_groups(3) \
  instantiate_quantized_groups(4) \
  instantiate_quantized_groups(5) \
  instantiate_quantized_groups(6) \
  instantiate_quantized_groups(8)

instantiate_quantized_all() // clang-format on


================================================
FILE: mlx/backend/metal/kernels/quantized_nax.h
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_simdgroup>
#include <metal_stdlib>

using namespace metal;
using namespace mlx::steel;

// Function constants at indices 200-202. They are not referenced in the part
// of this header reviewed here; presumably they flag whether M, N and K are
// multiples of the tile sizes -- TODO confirm against the kernels that use
// them.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

// NOTE(review): repeats the using-directive a few lines above; redundant but
// harmless.
using namespace metal;

// Shorthand for declaring compile-time constants in the constant address
// space.
#define MLX_MTL_CONST static constant constexpr const

// Threads per simdgroup; used below to compute threadgroup sizes and linear
// thread indices.
MLX_MTL_CONST int SIMD_SIZE = 32;
// Not referenced in the visible part of this header; presumably the number of
// threads in a quadgroup.
MLX_MTL_CONST int QUAD_SIZE = 4;

// Number of quantized values held by one pack for the given bit width.
//
// 3- and 5-bit values are stored 8 to a pack and 6-bit values 4 to a pack.
// Every other width stores `wsize / bits` values in a `wsize`-bit word.
template <int bits, int wsize = 8>
inline constexpr short get_pack_factor() {
  if (bits == 3 || bits == 5) {
    return 8;
  }
  if (bits == 6) {
    return 4;
  }
  return wsize / bits;
}

// Size in bytes of one pack for the given bit width.
//
// Power-of-two widths occupy a whole `wsize`-bit word (`wsize / 8` bytes).
// 5-bit packs take 5 bytes; the remaining widths (3 and 6 bits) take 3 bytes.
template <int bits, int wsize = 8>
inline constexpr short get_bytes_per_pack() {
  constexpr bool is_power_of_two = (bits & (bits - 1)) == 0;
  if (is_power_of_two) {
    return wsize / 8;
  }
  return (bits == 5) ? 5 : 3;
}

// Copies `values_per_thread` activations from `x` into `x_thread` and returns
// their sum.
//
// Each element is divided by a power of two while being copied. The divisor
// is the bit offset of the matching weight inside its pack, so that qdot can
// multiply by the masked, unshifted weight bits. The 8-bit path copies the
// values unchanged.
//
// The running sum is formed from the unscaled inputs; each group of inputs is
// added together in T before being accumulated into U.
template <typename T, typename U, int values_per_thread, int bits>
inline U load_vector(const device T* x, thread U* x_thread) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U total = 0;

  switch (bits) {
    case 2:
      // Four 2-bit weights per byte: bit offsets 0, 2, 4, 6.
      for (int k = 0; k < values_per_thread; k += 4) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 4.0f;
        x_thread[k + 2] = x[k + 2] / 16.0f;
        x_thread[k + 3] = x[k + 3] / 64.0f;
      }
      break;

    case 3:
      // Eight 3-bit weights spread over three bytes.
      for (int k = 0; k < values_per_thread; k += 8) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3] + x[k + 4] + x[k + 5] +
            x[k + 6] + x[k + 7];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 8.0f;
        x_thread[k + 2] = x[k + 2] / 64.0f;
        x_thread[k + 3] = x[k + 3] / 2.0f;
        x_thread[k + 4] = x[k + 4] / 16.0f;
        x_thread[k + 5] = x[k + 5] / 128.0f;
        x_thread[k + 6] = x[k + 6] / 4.0f;
        x_thread[k + 7] = x[k + 7] / 32.0f;
      }
      break;

    case 4:
      // Four 4-bit weights per 16-bit word: bit offsets 0, 4, 8, 12.
      for (int k = 0; k < values_per_thread; k += 4) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 16.0f;
        x_thread[k + 2] = x[k + 2] / 256.0f;
        x_thread[k + 3] = x[k + 3] / 4096.0f;
      }
      break;

    case 5:
      // Eight 5-bit weights spread over five bytes.
      for (int k = 0; k < values_per_thread; k += 8) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3] + x[k + 4] + x[k + 5] +
            x[k + 6] + x[k + 7];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 32.0f;
        x_thread[k + 2] = x[k + 2] / 4.0f;
        x_thread[k + 3] = x[k + 3] / 128.0f;
        x_thread[k + 4] = x[k + 4] / 16.0f;
        x_thread[k + 5] = x[k + 5] / 2.0f;
        x_thread[k + 6] = x[k + 6] / 64.0f;
        x_thread[k + 7] = x[k + 7] / 8.0f;
      }
      break;

    case 6:
      // Four 6-bit weights spread over three bytes.
      for (int k = 0; k < values_per_thread; k += 4) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 64.0f;
        x_thread[k + 2] = x[k + 2] / 16.0f;
        x_thread[k + 3] = x[k + 3] / 4.0f;
      }
      break;

    case 8:
      // One weight per byte: nothing to pre-scale.
      for (int k = 0; k < values_per_thread; k++) {
        total += x[k];
        x_thread[k] = x[k];
      }
      break;

    default:
      break;
  }

  return total;
}

// Bounds-aware variant of load_vector: copies the first `N` activations from
// `x` into `x_thread`, pre-scaled exactly like load_vector, zeroes entries
// `N .. values_per_thread - 1`, and returns the sum of the values read.
//
// The packed paths step in whole groups of 4 or 8 elements. N is presumably a
// multiple of that step; otherwise the last iteration reads (and sums)
// elements past N before the tail is zeroed.
template <typename T, typename U, int values_per_thread, int bits>
inline U load_vector_safe(const device T* x, thread U* x_thread, int N) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U total = 0;

  switch (bits) {
    case 2:
      for (int k = 0; k < N; k += 4) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 4.0f;
        x_thread[k + 2] = x[k + 2] / 16.0f;
        x_thread[k + 3] = x[k + 3] / 64.0f;
      }
      break;

    case 3:
      for (int k = 0; k < N; k += 8) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3] + x[k + 4] + x[k + 5] +
            x[k + 6] + x[k + 7];

        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 8.0f;
        x_thread[k + 2] = x[k + 2] / 64.0f;
        x_thread[k + 3] = x[k + 3] / 2.0f;
        x_thread[k + 4] = x[k + 4] / 16.0f;
        x_thread[k + 5] = x[k + 5] / 128.0f;
        x_thread[k + 6] = x[k + 6] / 4.0f;
        x_thread[k + 7] = x[k + 7] / 32.0f;
      }
      break;

    case 4:
      for (int k = 0; k < N; k += 4) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 16.0f;
        x_thread[k + 2] = x[k + 2] / 256.0f;
        x_thread[k + 3] = x[k + 3] / 4096.0f;
      }
      break;

    case 5:
      for (int k = 0; k < N; k += 8) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3] + x[k + 4] + x[k + 5] +
            x[k + 6] + x[k + 7];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 32.0f;
        x_thread[k + 2] = x[k + 2] / 4.0f;
        x_thread[k + 3] = x[k + 3] / 128.0f;
        x_thread[k + 4] = x[k + 4] / 16.0f;
        x_thread[k + 5] = x[k + 5] / 2.0f;
        x_thread[k + 6] = x[k + 6] / 64.0f;
        x_thread[k + 7] = x[k + 7] / 8.0f;
      }
      break;

    case 6:
      for (int k = 0; k < N; k += 4) {
        total += x[k] + x[k + 1] + x[k + 2] + x[k + 3];
        x_thread[k] = x[k];
        x_thread[k + 1] = x[k + 1] / 64.0f;
        x_thread[k + 2] = x[k + 2] / 16.0f;
        x_thread[k + 3] = x[k + 3] / 4.0f;
      }
      break;

    case 8:
      for (int k = 0; k < N; k++) {
        total += x[k];
        x_thread[k] = x[k];
      }
      break;

    default:
      break;
  }

  // Clear the unused tail so later products over the full width add zero.
  for (int k = N; k < values_per_thread; k++) {
    x_thread[k] = 0;
  }

  return total;
}

template <typename U, int values_per_thread, int bits>
inline U qdot(
    const device uint8_t* w,
    const thread U* x_thread,
    U scale,
    U bias,
    U sum) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U accum = 0;

  if (bits == 2) {
    for (int i = 0; i < (values_per_thread / 4); i++) {
      accum +=
          (x_thread[4 * i] * (w[i] & 0x03) +
           x_thread[4 * i + 1] * (w[i] & 0x0c) +
           x_thread[4 * i + 2] * (w[i] & 0x30) +
           x_thread[4 * i + 3] * (w[i] & 0xc0));
    }
  }

  else if (bits == 3) {
    for (int i = 0; i < (values_per_thread / 8); i++) {
      x_thread += 8 * i;
      w += 3 * i;

      accum += (w[0] & 0x07) * x_thread[0];
      accum += (w[0] & 0x38) * x_thread[1];
      accum += (w[0] & 0xc0) * x_thread[2];
      accum += (w[1] & 0x01) * (x_thread[2] * 256.0f);

      accum += (w[1] & 0x0e) * x_thread[3];
      accum += (w[1] & 0x70) * x_thread[4];
      accum += (w[1] & 0x80) * x_thread[5];
      accum += (w[2] & 0x03) * (x_thread[5] * 256.0f);

      accum += (w[2] & 0x1c) * x_thread[6];
      accum += (w[2] & 0xe0) * x_thread[7];
    }
  }

  else if (bits == 4) {
    const device uint16_t* ws = (const device uint16_t*)w;
    for (int i = 0; i < (values_per_thread / 4); i++) {
      accum +=
          (x_thread[4 * i] * (ws[i] & 0x000f) +
           x_thread[4 * i + 1] * (ws[i] & 0x00f0) +
           x_thread[4 * i + 2] * (ws[i] & 0x0f00) +
           x_thread[4 * i + 3] * (ws[i] & 0xf000));
    }
  }

  else if (bits == 5) {
    for (int i = 0; i < (values_per_thread / 8); i++) {
      x_thread += 8 * i;
      w += 5 * i;

      accum += (w[0] & 0x1f) * x_thread[0];
      accum += (w[0] & 0xe0) * x_thread[1];
      accum += (w[1] & 0x3) * (x_thread[1] * 256.0f);
      accum += (w[1] & 0x7c) * x_thread[2];
      accum += (w[1] & 0x80) * x_thread[3];
      accum += (w[2] & 0xf) * (x_thread[3] * 256.0f);
      accum += (w[2] & 0xf0) * x_thread[4];
      accum += (w[3] & 0x1) * (x_thread[4] * 256.0f);
      accum += (w[3] & 0x3e) * x_thread[5];
      accum += (w[3] & 0xc0) * x_thread[6];
      accum += (w[4] & 0x7) * (x_thread[6] * 256.0f);
      accum += (w[4] & 0xf8) * x_thread[7];
    }
  }

  else if (bits == 6) {
    for (int i = 0; i < (values_per_thread / 4); i++) {
      x_thread += 4 * i;
      w += 3 * i;

      accum += (w[0] & 0x3f) * x_thread[0];

      accum += (w[0] & 0xc0) * x_thread[1];
      accum += (w[1] & 0x0f) * (x_thread[1] * 256.0f);

      accum += (w[1] & 0xf0) * x_thread[2];
      accum += (w[2] & 0x03) * (x_thread[2] * 256.0f);

      accum += (w[2] & 0xfc) * x_thread[3];
    }
  }

  else if (bits == 8) {
    for (int i = 0; i < values_per_thread; i++) {
      accum += x_thread[i] * w[i];
    }
  }

  return scale * accum + sum * bias;
}

template <typename U, int values_per_thread, int bits>
inline U qdot_safe(
    const device uint8_t* w,
    const thread U* x_thread,
    U scale,
    U bias,
    U sum,
    int N) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  U accum = 0;

  if (bits == 2) {
    for (int i = 0; i < (N / 4); i++) {
      accum +=
          (x_thread[4 * i] * (w[i] & 0x03) +
           x_thread[4 * i + 1] * (w[i] & 0x0c) +
           x_thread[4 * i + 2] * (w[i] & 0x30) +
           x_thread[4 * i + 3] * (w[i] & 0xc0));
    }
  }

  else if (bits == 3) {
    for (int i = 0; i < (N / 8); i++) {
      x_thread += 8 * i;
      w += 3 * i;

      accum += (w[0] & 0x07) * x_thread[0];
      accum += (w[0] & 0x38) * x_thread[1];
      accum += (w[0] & 0xc0) * x_thread[2];
      accum += (w[1] & 0x01) * (x_thread[2] * 256.0f);

      accum += (w[1] & 0x0e) * x_thread[3];
      accum += (w[1] & 0x70) * x_thread[4];
      accum += (w[1] & 0x80) * x_thread[5];
      accum += (w[2] & 0x03) * (x_thread[5] * 256.0f);

      accum += (w[2] & 0x1c) * x_thread[6];
      accum += (w[2] & 0xe0) * x_thread[7];
    }
  }

  else if (bits == 4) {
    const device uint16_t* ws = (const device uint16_t*)w;
    for (int i = 0; i < (N / 4); i++) {
      accum +=
          (x_thread[4 * i] * (ws[i] & 0x000f) +
           x_thread[4 * i + 1] * (ws[i] & 0x00f0) +
           x_thread[4 * i + 2] * (ws[i] & 0x0f00) +
           x_thread[4 * i + 3] * (ws[i] & 0xf000));
    }
  }

  else if (bits == 5) {
    for (int i = 0; i < (N / 8); i++) {
      x_thread += 8 * i;
      w += 5 * i;

      accum += (w[0] & 0x1f) * x_thread[0];
      accum += (w[0] & 0xe0) * x_thread[1];
      accum += (w[1] & 0x3) * (x_thread[1] * 256.0f);
      accum += (w[1] & 0x7c) * x_thread[2];
      accum += (w[1] & 0x80) * x_thread[3];
      accum += (w[2] & 0xf) * (x_thread[3] * 256.0f);
      accum += (w[2] & 0xf0) * x_thread[4];
      accum += (w[3] & 0x1) * (x_thread[4] * 256.0f);
      accum += (w[3] & 0x3e) * x_thread[5];
      accum += (w[3] & 0xc0) * x_thread[6];
      accum += (w[4] & 0x7) * (x_thread[6] * 256.0f);
      accum += (w[4] & 0xf8) * x_thread[7];
    }
  }

  else if (bits == 6) {
    for (int i = 0; i < (N / 4); i++) {
      x_thread += 4 * i;
      w += 3 * i;

      accum += (w[0] & 0x3f) * x_thread[0];

      accum += (w[0] & 0xc0) * x_thread[1];
      accum += (w[1] & 0x0f) * (x_thread[1] * 256.0f);

      accum += (w[1] & 0xf0) * x_thread[2];
      accum += (w[2] & 0x03) * (x_thread[2] * 256.0f);

      accum += (w[2] & 0xfc) * x_thread[3];
    }
  }

  else if (bits == 8) {
    for (int i = 0; i < N; i++) {
      accum += x_thread[i] * w[i];
    }
  }

  return scale * accum + sum * bias;
}

// Accumulates the outer-product contribution of one activation `x` with
// `values_per_thread` packed weights:
//
//   result[k] += x * (scale * q[k] + bias)
//
// where q[k] is the k-th quantized weight decoded from `w`. The 2- and 4-bit
// paths leave the masked bits unshifted and fold the shift into a pre-divided
// scale instead. The 3-, 5- and 6-bit paths shift the fields into place and
// stitch together the ones that straddle a byte boundary.
template <typename U, int values_per_thread, int bits>
inline void
qouter(const thread uint8_t* w, U x, U scale, U bias, thread U* result) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  switch (bits) {
    case 2: {
      // One scale per field position inside the byte.
      U s[4] = {scale, scale / 4.0f, scale / 16.0f, scale / 64.0f};
      for (int p = 0; p < (values_per_thread / 4); p++) {
        result[4 * p] += x * (s[0] * (w[p] & 0x03) + bias);
        result[4 * p + 1] += x * (s[1] * (w[p] & 0x0c) + bias);
        result[4 * p + 2] += x * (s[2] * (w[p] & 0x30) + bias);
        result[4 * p + 3] += x * (s[3] * (w[p] & 0xc0) + bias);
      }
      break;
    }

    case 3: {
      for (int p = 0; p < (values_per_thread / 8); p++) {
        // Three bytes hold eight 3-bit fields.
        uint8_t w0 = w[3 * p];
        uint8_t w1 = w[3 * p + 1];
        uint8_t w2 = w[3 * p + 2];

        result[8 * p] += x * ((w0 & 0x7) * scale + bias);
        result[8 * p + 1] += x * (((w0 & 0x38) >> 3) * scale + bias);
        result[8 * p + 2] +=
            x * ((((w0 & 0xc0) >> 6) + ((w1 & 0x1) << 2)) * scale + bias);
        result[8 * p + 3] += x * (((w1 & 0xe) >> 1) * scale + bias);
        result[8 * p + 4] += x * (((w1 & 0x70) >> 4) * scale + bias);
        result[8 * p + 5] +=
            x * ((((w1 & 0x80) >> 7) + ((w2 & 0x3) << 1)) * scale + bias);
        result[8 * p + 6] += x * (((w2 & 0x1c) >> 2) * scale + bias);
        result[8 * p + 7] += x * (((w2 & 0xe0) >> 5) * scale + bias);
      }
      break;
    }

    case 4: {
      // One scale per nibble position inside the byte.
      U s[2] = {scale, scale / 16.0f};
      for (int p = 0; p < (values_per_thread / 2); p++) {
        result[2 * p] += x * (s[0] * (w[p] & 0x0f) + bias);
        result[2 * p + 1] += x * (s[1] * (w[p] & 0xf0) + bias);
      }
      break;
    }

    case 5: {
      for (int p = 0; p < (values_per_thread / 8); p++) {
        // Five bytes hold eight 5-bit fields.
        uint8_t w0 = w[5 * p];
        uint8_t w1 = w[5 * p + 1];
        uint8_t w2 = w[5 * p + 2];
        uint8_t w3 = w[5 * p + 3];
        uint8_t w4 = w[5 * p + 4];
        result[8 * p] += x * ((w0 & 0x1f) * scale + bias);
        result[8 * p + 1] +=
            x * ((((w0 & 0xe0) >> 5) + ((w1 & 0x3) << 3)) * scale + bias);
        result[8 * p + 2] += x * (((w1 & 0x7c) >> 2) * scale + bias);
        result[8 * p + 3] +=
            x * ((((w1 & 0x80) >> 7) + ((w2 & 0xf) << 1)) * scale + bias);
        result[8 * p + 4] +=
            x * ((((w2 & 0xf0) >> 4) + ((w3 & 0x1) << 4)) * scale + bias);
        result[8 * p + 5] += x * (((w3 & 0x3e) >> 1) * scale + bias);
        result[8 * p + 6] +=
            x * ((((w3 & 0xc0) >> 6) + ((w4 & 0x7) << 2)) * scale + bias);
        result[8 * p + 7] += x * (((w4 & 0xf8) >> 3) * scale + bias);
      }
      break;
    }

    case 6: {
      for (int p = 0; p < (values_per_thread / 4); p++) {
        // Three bytes hold four 6-bit fields.
        uint8_t w0 = w[3 * p];
        uint8_t w1 = w[3 * p + 1];
        uint8_t w2 = w[3 * p + 2];

        result[4 * p] += x * ((w0 & 0x3f) * scale + bias);
        result[4 * p + 1] +=
            x * ((((w0 >> 6) & 0x03) + ((w1 & 0x0f) << 2)) * scale + bias);
        result[4 * p + 2] +=
            x * ((((w1 >> 4) & 0x0f) + ((w2 & 0x03) << 4)) * scale + bias);
        result[4 * p + 3] += x * (((w2 >> 2) & 0x3f) * scale + bias);
      }
      break;
    }

    case 8: {
      for (int p = 0; p < values_per_thread; p++) {
        result[p] += x * (scale * w[p] + bias);
      }
      break;
    }

    default:
      break;
  }
}

template <typename U, int N, int bits>
inline void
dequantize(const device uint8_t* w, U scale, U bias, threadgroup U* w_local) {
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  if (bits == 2) {
    U s[4] = {
        scale,
        scale / static_cast<U>(4.0f),
        scale / static_cast<U>(16.0f),
        scale / static_cast<U>(64.0f)};
    for (int i = 0; i < (N / 4); i++) {
      w_local[4 * i] = s[0] * (w[i] & 0x03) + bias;
      w_local[4 * i + 1] = s[1] * (w[i] & 0x0c) + bias;
      w_local[4 * i + 2] = s[2] * (w[i] & 0x30) + bias;
      w_local[4 * i + 3] = s[3] * (w[i] & 0xc0) + bias;
    }
  }

  else if (bits == 3) {
    for (int i = 0; i < (N / 8); i++) {
      w_local += 8 * i;
      w += 3 * i;

      w_local[0] = (w[0] & 0x7) * scale + bias;
      w_local[1] = ((w[0] & 0x38) >> 3) * scale + bias;
      w_local[2] = (((w[0] & 0xc0) >> 6) + ((w[1] & 0x1) << 2)) * scale + bias;
      w_local[3] = ((w[1] & 0xe) >> 1) * scale + bias;
      w_local[4] = ((w[1] & 0x70) >> 4) * scale + bias;
      w_local[5] = (((w[1] & 0x80) >> 7) + ((w[2] & 0x3) << 1)) * scale + bias;
      w_local[6] = ((w[2] & 0x1c) >> 2) * scale + bias;
      w_local[7] = ((w[2] & 0xe0) >> 5) * scale + bias;
    }
  }

  else if (bits == 4) {
    U s[2] = {scale, scale / static_cast<U>(16.0f)};
    for (int i = 0; i < (N / 2); i++) {
      w_local[2 * i] = s[0] * (w[i] & 0x0f) + bias;
      w_local[2 * i + 1] = s[1] * (w[i] & 0xf0) + bias;
    }
  }

  else if (bits == 5) {
    for (int i = 0; i < (N / 8); i++) {
      w_local += 8 * i;
      w += 5 * i;

      w_local[0] = (w[0] & 0x1f) * scale + bias;
      w_local[1] = (((w[0] & 0xe0) >> 5) + ((w[1] & 0x3) << 3)) * scale + bias;
      w_local[2] = ((w[1] & 0x7c) >> 2) * scale + bias;
      w_local[3] = (((w[1] & 0x80) >> 7) + ((w[2] & 0xf) << 1)) * scale + bias;
      w_local[4] = (((w[2] & 0xf0) >> 4) + ((w[3] & 0x1) << 4)) * scale + bias;
      w_local[5] = ((w[3] & 0x3e) >> 1) * scale + bias;
      w_local[6] = (((w[3] & 0xc0) >> 6) + ((w[4] & 0x7) << 2)) * scale + bias;
      w_local[7] = ((w[4] & 0xf8) >> 3) * scale + bias;
    }
  }

  else if (bits == 6) {
    for (int i = 0; i < (N / 4); i++) {
      w_local += 4 * i;
      w += 3 * i;
      w_local[0] = (w[0] & 0x3f) * scale + bias;
      w_local[1] = (((w[0] >> 6) & 0x03) + ((w[1] & 0x0f) << 2)) * scale + bias;
      w_local[2] = (((w[1] >> 4) & 0x0f) + ((w[2] & 0x03) << 4)) * scale + bias;
      w_local[3] = ((w[2] >> 2) & 0x3f) * scale + bias;
    }
  }

  else if (bits == 8) {
    for (int i = 0; i < N; i++) {
      w_local[i] = scale * w[i] + bias;
    }
  }
}

template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short group_size,
    short bits>
struct QuantizedBlockLoader {
  static_assert(
      BCOLS <= group_size,
      "The group size should be larger than the columns");
  static_assert(
      group_size % BCOLS == 0,
      "The group size should be divisible by the columns");
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  MLX_MTL_CONST short pack_factor = get_pack_factor<bits, 8>();
  MLX_MTL_CONST short bytes_per_pack = get_bytes_per_pack<bits>();
  MLX_MTL_CONST short BCOLS_PACKED = BCOLS / pack_factor;
  MLX_MTL_CONST short n_reads =
      (BCOLS_PACKED * BROWS < tgp_size) ? 1 : (BCOLS_PACKED * BROWS) / tgp_size;
  MLX_MTL_CONST short group_steps = group_size / BCOLS;

  const int src_ld;
  const int tile_stride;
  short group_step_cnt;
  const int group_stride;

  const short thread_idx;
  const short bi;
  const short bj;

  threadgroup T* dst;
  const device uint8_t* src;
  const device T* scales;
  const device T* biases;

  QuantizedBlockLoader(
      const device uint8_t* src_,
      const device T* scales_,
      const device T* biases_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(
            reduction_dim ? BCOLS_PACKED * bytes_per_pack
                          : BROWS * src_ld * bytes_per_pack / pack_factor),
        group_step_cnt(0),
        group_stride(BROWS * src_ld / group_size),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(n_reads * thread_idx / BCOLS_PACKED),
        bj((n_reads * thread_idx) % BCOLS_PACKED),
        dst(dst_ + bi * dst_ld + bj * pack_factor),
        src(src_ + bi * src_ld * bytes_per_pack / pack_factor +
            bj * bytes_per_pack),
        scales(scales_ + bi * src_ld / group_size),
        biases(biases_ + bi * src_ld / group_size) {}

  void load_unsafe() const {
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    T scale = *scales;
    T bias = *biases;
    for (int i = 0; i < n_reads; i++) {
      dequantize<T, pack_factor, bits>(
          src + i * bytes_per_pack, scale, bias, dst + i * pack_factor);
    }
  }

  void load_safe(short2 src_tile_dim) const {
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    if (reduction_dim == 1 && bi >= src_tile_dim.x) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    if (reduction_dim == 0 && bi >= src_tile_dim.y) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    T scale = *scales;
    T bias = *biases;
    for (int i = 0; i < n_reads; i++) {
      dequantize<T, pack_factor, bits>(
          (device uint8_t*)(src + i * bytes_per_pack),
          scale,
          bias,
          dst + i * pack_factor);
    }
  }

  void next() {
    src += tile_stride;
    if (reduction_dim == 1) {
      if (group_steps > 1) {
        group_step_cnt++;
        if (group_step_cnt == group_steps) {
          group_step_cnt = 0;
          scales++;
          biases++;
        }
      } else {
        scales++;
        biases++;
      }
    } else {
      scales += group_stride;
      biases += group_stride;
    }
  }
};

// Specialization of QuantizedBlockLoader for group_size == 32.
//
// Here a tile row spans one or more whole quantization groups
// (n_groups = BCOLS / 32) instead of being contained in a single group, so
// each thread also offsets its scale/bias pointers by the group its packed
// column `bj` falls into (`group_id`). The static_assert on
// BCOLS_PACKED / n_reads restricts the supported configurations to those
// where that quotient equals n_groups.
template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short bits>
struct QuantizedBlockLoader<
    T,
    BROWS,
    BCOLS,
    dst_ld,
    reduction_dim,
    tgp_size,
    32,
    bits> {
  MLX_MTL_CONST short group_size = 32;

  // The message now matches the condition: BCOLS must be a multiple of
  // group_size, not the other way round.
  static_assert(
      BCOLS % group_size == 0,
      "The columns should be divisible by the group size");
  static_assert(
      bits == 2 || bits == 3 || bits == 4 || bits == 5 || bits == 6 ||
          bits == 8,
      "Template undefined for bits not in {2, 3, 4, 5, 6, 8}");

  // Values per pack / bytes per pack for this bit width.
  MLX_MTL_CONST short pack_factor = get_pack_factor<bits, 8>();
  MLX_MTL_CONST short bytes_per_pack = get_bytes_per_pack<bits>();
  // Tile width measured in packs.
  MLX_MTL_CONST short BCOLS_PACKED = BCOLS / pack_factor;
  // Packs per thread; 1 when the tile has fewer packs than threads.
  MLX_MTL_CONST short n_reads =
      (BCOLS_PACKED * BROWS < tgp_size) ? 1 : (BCOLS_PACKED * BROWS) / tgp_size;
  // Quantization groups covered by one tile row.
  MLX_MTL_CONST short n_groups = BCOLS / group_size;

  static_assert(
      (BCOLS_PACKED / n_reads) == n_groups,
      "Other configurations are not yet supported");

  const int src_ld;
  const int tile_stride;
  const int group_stride;

  const short thread_idx;
  const short bi;
  const short bj;

  const short group_id;

  threadgroup T* dst;
  const device uint8_t* src;
  const device T* scales;
  const device T* biases;

  // Positions this thread's source, destination, scale and bias pointers on
  // its slice of the first tile.
  QuantizedBlockLoader(
      const device uint8_t* src_,
      const device T* scales_,
      const device T* biases_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(
            reduction_dim ? BCOLS_PACKED * bytes_per_pack
                          : BROWS * src_ld * bytes_per_pack / pack_factor),
        group_stride(BROWS * src_ld / group_size),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(n_reads * thread_idx / BCOLS_PACKED),
        bj((n_reads * thread_idx) % BCOLS_PACKED),
        group_id((bj * pack_factor) / group_size),
        dst(dst_ + bi * dst_ld + bj * pack_factor),
        src(src_ + bi * src_ld * bytes_per_pack / pack_factor +
            bj * bytes_per_pack),
        scales(scales_ + bi * src_ld / group_size + group_id),
        biases(biases_ + bi * src_ld / group_size + group_id) {}

  // Dequantizes this thread's slice without any bounds handling.
  void load_unsafe() const {
    // Surplus threads (tile has fewer packs than threads) have nothing to do.
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    T scale = *scales;
    T bias = *biases;
    for (int i = 0; i < n_reads; i++) {
      dequantize<T, pack_factor, bits>(
          src + i * bytes_per_pack, scale, bias, dst + i * pack_factor);
    }
  }

  // Like load_unsafe, but writes zeros when this thread's row is outside the
  // valid part of the tile described by `src_tile_dim`.
  //
  // NOTE(review): qmm_t_nax_tgp_impl passes short2(BK, tgp_bn) with
  // reduction_dim == 1, so the row index is compared with BK there; confirm
  // that `.x` is the intended component for that case.
  void load_safe(short2 src_tile_dim) const {
    if (BCOLS_PACKED * BROWS < tgp_size && bi >= BROWS) {
      return;
    }

    if (reduction_dim == 1 && bi >= src_tile_dim.x) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    if (reduction_dim == 0 && bi >= src_tile_dim.y) {
      for (int i = 0; i < n_reads * pack_factor; i++) {
        dst[i] = T(0);
      }
      return;
    }

    T scale = *scales;
    T bias = *biases;
    for (int i = 0; i < n_reads; i++) {
      // `src` is already const-qualified as dequantize expects; no cast is
      // needed (same call shape as load_unsafe).
      dequantize<T, pack_factor, bits>(
          src + i * bytes_per_pack, scale, bias, dst + i * pack_factor);
    }
  }

  // Moves the source, scale and bias pointers to the next tile.
  void next() {
    src += tile_stride;
    if (reduction_dim == 1) {
      // Every tile consumes n_groups whole groups along the row.
      scales += n_groups;
      biases += n_groups;
    } else {
      // NOTE(review): the generic loader advances by group_stride alone on
      // this path; confirm the extra n_groups factor is intended.
      scales += n_groups * group_stride;
      biases += n_groups * group_stride;
    }
  }
};

// Advances the operand and output pointers to the matrices handled by this
// threadgroup, using tid.z as the batch index for both x and w.
//
// x is offset through x_strides (directly when there is a single batch
// dimension, via elem_to_loc otherwise). w, scales and biases are offset
// together through w_strides, s_strides and b_strides. y is offset by
// tid.z * output_stride.
template <typename T>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device T*& scales,
    const device T*& biases,
    device T*& y,
    int output_stride,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int64_t* b_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Both operands share the same batch index.
  uint32_t batch = tid.z;

  // Activations.
  if (x_batch_ndims != 1) {
    x += elem_to_loc(batch, x_shape, x_strides, x_batch_ndims);
  } else {
    x += batch * x_strides[0];
  }

  // Quantized weights with their scales and biases.
  if (w_batch_ndims != 1) {
    ulong3 offsets = elem_to_loc_broadcast(
        batch, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);
    w += offsets.x;
    scales += offsets.y;
    biases += offsets.z;
  } else {
    w += batch * w_strides[0];
    scales += batch * s_strides[0];
    biases += batch * b_strides[0];
  }

  // Output.
  y += tid.z * output_stride;
}

// Gather variant: the batch indices for x and w are looked up in
// lhs_indices / rhs_indices at the position selected by tid.z, instead of
// being tid.z itself.
//
// The looked-up indices are then turned into pointer offsets exactly like in
// the non-gather overload. y is still offset by tid.z * output_stride.
template <typename T>
METAL_FUNC void adjust_matrix_offsets(
    const device T*& x,
    const device uint32_t*& w,
    const device T*& scales,
    const device T*& biases,
    const device uint32_t* lhs_indices,
    const device uint32_t* rhs_indices,
    device T*& y,
    int output_stride,
    const constant int& batch_ndims,
    const constant int* batch_shape,
    const constant int64_t* lhs_strides,
    const constant int64_t* rhs_strides,
    const constant int& x_batch_ndims,
    const constant int* x_shape,
    const constant int64_t* x_strides,
    const constant int& w_batch_ndims,
    const constant int* w_shape,
    const constant int64_t* w_strides,
    const constant int64_t* s_strides,
    const constant int64_t* b_strides,
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Resolve which x and w matrices this threadgroup works on.
  uint32_t lhs_batch;
  uint32_t rhs_batch;
  if (batch_ndims != 1) {
    ulong2 index_pos = elem_to_loc_broadcast(
        tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);
    lhs_batch = lhs_indices[index_pos.x];
    rhs_batch = rhs_indices[index_pos.y];
  } else {
    lhs_batch = lhs_indices[tid.z * lhs_strides[0]];
    rhs_batch = rhs_indices[tid.z * rhs_strides[0]];
  }

  // Activations.
  if (x_batch_ndims != 1) {
    x += elem_to_loc(lhs_batch, x_shape, x_strides, x_batch_ndims);
  } else {
    x += lhs_batch * x_strides[0];
  }

  // Quantized weights with their scales and biases.
  if (w_batch_ndims != 1) {
    ulong3 offsets = elem_to_loc_broadcast(
        rhs_batch, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);
    w += offsets.x;
    scales += offsets.y;
    biases += offsets.z;
  } else {
    w += rhs_batch * w_strides[0];
    scales += rhs_batch * s_strides[0];
    biases += rhs_batch * b_strides[0];
  }

  // Output.
  y += tid.z * output_stride;
}

// Threadgroup-level quantized matmul against transposed weights
// (transpose_b is true below): each threadgroup produces one BM x BN tile of
// y from rows of x and BN rows of packed weights.
//
// Per BK-wide step, the weights are dequantized cooperatively into the
// threadgroup buffer `Ws`, while each simdgroup loads its rows of x straight
// from device memory and accumulates in float.
//
// Template parameters:
//   T           element type of x, y, scales and biases
//   group_size  number of weights sharing one scale/bias
//   bits        quantization bit width
//   aligned_N   when true, no bounds handling is done along N
//   BM, BK, BN  threadgroup tile sizes
//   WM, WN      simdgroup grid inside the threadgroup (WM x WN simdgroups)
//
// Parameters:
//   w, scales, biases  quantized weights with their per-group parameters
//   x, y               input and output matrices
//   Ws                 threadgroup scratch; indexed as BN rows of BK_padded
//   K, N, M            problem sizes (K is the reduction dimension)
//   tid                threadgroup position; x selects the column tile, y the
//                      row tile
//   lid                unused
//   simd_gid/simd_lid  simdgroup index and lane within the simdgroup
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2>
METAL_FUNC void qmm_t_nax_tgp_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  (void)lid;

  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();

  // Row pitch of the dequantized tile: BK elements plus 16 bytes of padding
  // (presumably to avoid threadgroup-memory bank conflicts -- TODO confirm).
  constexpr int BK_padded = (BK + 16 / sizeof(T));

  // Weight tile loader: BN rows x BK columns, stepping along K
  // (reduction_dim = 1), shared by all WM * WN simdgroups.
  using loader_w_t = QuantizedBlockLoader<
      T,
      BN,
      BK,
      BK_padded,
      1,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // Set the block
  // K_w: bytes per packed weight row; K_g: scale/bias groups per weight row.
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  // Address the packed weights byte-wise.
  auto wl = (const device uint8_t*)w;

  x += y_row * static_cast<int64_t>(K);
  wl += y_col * K_w;
  scales += y_col * K_g;
  biases += y_col * K_g;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Make the weight loader
  loader_w_t loader_w(wl, scales, biases, K, Ws, simd_gid, simd_lid);

  // Per-simdgroup sub-tile (SM x SN) and the K step per matmul call (SK).
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  // Sub-tile sizes expressed in units of 16 (passed to NAXTile as its tile
  // counts).
  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;
  constexpr short TK = SK / 16;

  // Origin of this simdgroup's sub-tile inside the threadgroup tile.
  const short tm = SM * (simd_gid / WN);
  const short tn = SN * (simd_gid % WN);

  constexpr bool transpose_a = false;
  constexpr bool transpose_b = true;

  // Rows of this simdgroup's sub-tile that lie inside M.
  const short sgp_sm = min(SM, short(M - (y_row + tm)));
  const bool is_unaligned_sm = (sgp_sm != SM);

  // Columns of this simdgroup's sub-tile that lie inside N.
  const short sgp_sn = aligned_N ? SN : min(SN, short(N - (y_col + tn)));

  // Columns of the whole threadgroup tile that lie inside N.
  const short tgp_bn = aligned_N ? BN : min(BN, int(N - (y_col)));
  const bool is_unaligned_bn = aligned_N ? false : (tgp_bn != BN);

  using AccumType = float;

  // Output accumulator for this simdgroup, kept in float.
  NAXTile<AccumType, TM, TN> Dtile;
  Dtile.clear();

  x += tm * K;

  // dispatch_bool is defined elsewhere; it hands the lambda an object whose
  // `.value` is usable in `if constexpr`, so the aligned and unaligned code
  // paths are selected at compile time inside the loop.
  dispatch_bool(!is_unaligned_sm, [&](auto kAlignedM) {
    dispatch_bool(aligned_N || !is_unaligned_bn, [&](auto kAlignedN) {
      for (int k = 0; k < K; k += BK) {
        // Wait until every simdgroup is done reading the previous Ws tile.
        threadgroup_barrier(mem_flags::mem_threadgroup);
        if constexpr (kAlignedN.value) {
          loader_w.load_unsafe();
        } else {
          loader_w.load_safe(short2(BK, tgp_bn));
        }

        // Make the freshly dequantized tile visible to all simdgroups.
        threadgroup_barrier(mem_flags::mem_threadgroup);

        STEEL_PRAGMA_NO_UNROLL
        for (int kk1 = 0; kk1 < BK; kk1 += SK) {
          NAXTile<T, TM, TK> Atile;
          NAXTile<T, TN, TK> Btile;

          // Never written or read, only voided below; presumably present to
          // limit compiler reordering across iterations -- TODO confirm.
          volatile int compiler_barrier;

          // A comes directly from device memory with row stride K.
          if constexpr (kAlignedM.value) {
            Atile.load(x + kk1, K);
          } else {
            Atile.load_safe(x + kk1, K, short2(SK, sgp_sm));
          }

          // B comes from the dequantized threadgroup tile (row stride
          // BK_padded), starting at this simdgroup's first weight row.
          Btile.template load<T, BK_padded, 1>(Ws + tn * BK_padded + kk1);

          tile_matmad_nax(
              Dtile,
              Atile,
              metal::bool_constant<transpose_a>{},
              Btile,
              metal::bool_constant<transpose_b>{});

          (void)compiler_barrier;
        }

        x += BK;
        loader_w.next();
      }

      // Store results to device memory
      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Unchecked store when the sub-tile is fully inside the output. The
      // second branch is a runtime check: the threadgroup tile may be clipped
      // along N while this simdgroup's sub-tile is still complete.
      if constexpr (kAlignedM.value && kAlignedN.value) {
        Dtile.store(y + tm * N + tn, N);
      } else if (kAlignedM.value && sgp_sn == SN) {
        Dtile.store(y + tm * N + tn, N);
      } else {
        Dtile.store_safe(y + tm * N + tn, N, short2(sgp_sn, sgp_sm));
      }
    });
  });
}

// Threadgroup-level NAX matmul against quantized weights that are NOT
// transposed: the weight tile is staged through a QuantizedBlockLoader with
// BK rows and BN columns (reduction_dim = 0) and multiplied with
// transpose_b = false.
//
// Each threadgroup produces the BM x BN block of `y` at (tid.y, tid.x); each
// simdgroup owns an SM x SN sub-block of it and accumulates in float.
//
//   w, scales, biases  quantized weights and their per-group parameters
//   x                  activations, read with row stride K
//   y                  output, written with row stride N
//   Ws                 threadgroup staging buffer of BK * BN_padded elements
//   K, N, M            problem sizes (M is unused here)
//
// The weight block is addressed the same way as the non-transposed branch of
// affine_gather_qmm_rhs_nax: a column offset of y_col packed values, scales
// and biases offset by y_col / group_size, and N as the loader's source
// leading dimension. The previous code applied the transposed-layout offsets
// (y_col * K_w, y_col * K_g) and passed K as the leading dimension, which
// does not match a loader instantiated with reduction_dim = 0.
//
// NOTE(review): all loads and the final store are unchecked (M is unused), so
// this presumably requires M, N and K to be multiples of BM, BN and BK —
// confirm against the host-side dispatch.
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2>
METAL_FUNC void qmm_n_nax_tgp_impl(
    const device uint32_t* w,
    const device T* scales,
    const device T* biases,
    const device T* x,
    device T* y,
    threadgroup T* Ws,
    const constant int& K,
    const constant int& N,
    const constant int& M,
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;
  (void)M;

  static_assert(BK >= SIMD_SIZE, "BK should be larger than SIMD_SIZE");
  static_assert(BK % SIMD_SIZE == 0, "BK should be divisible by SIMD_SIZE");

  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();

  // Row stride of the staging tile: BN plus 16 bytes' worth of T.
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  using loader_w_t = QuantizedBlockLoader<
      T,
      BK,
      BN,
      BN_padded,
      0,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  // Set the block
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;

  auto wl = (const device uint8_t*)w;

  x += y_row * static_cast<int64_t>(K);
  // Non-transposed layout: advance along the packed N axis only.
  wl += y_col * bytes_per_pack / pack_factor;
  scales += y_col / group_size;
  biases += y_col / group_size;
  y += y_row * static_cast<int64_t>(N) + y_col;

  // Rows of the source tile are N values apart.
  loader_w_t loader_w(wl, scales, biases, N, Ws, simd_gid, simd_lid);

  // Per-simdgroup sub-block sizes and their decomposition into 16-wide tiles.
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;
  constexpr short TK = SK / 16;

  // Origin of this simdgroup's sub-block inside the threadgroup block.
  const short tm = SM * (simd_gid / WN);
  const short tn = SN * (simd_gid % WN);

  const short ldb_tgp = BN_padded;

  constexpr bool transpose_a = false;
  constexpr bool transpose_b = false;

  using AccumType = float;

  NAXTile<AccumType, TM, TN> Dtile;
  Dtile.clear();

  x += tm * K;

  for (int k = 0; k < K; k += BK) {
    // First barrier: nobody is still reading the previous tile from Ws.
    threadgroup_barrier(mem_flags::mem_threadgroup);
    loader_w.load_unsafe();
    // Second barrier: the new tile is visible to every simdgroup.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    STEEL_PRAGMA_NO_UNROLL
    for (int kk1 = 0; kk1 < BK; kk1 += SK) {
      NAXTile<T, TM, TK> Atile;
      NAXTile<T, TK, TN> Btile;

      volatile int compiler_barrier;

      // A comes straight from device memory, B from the staged tile.
      Atile.load(x + kk1, K);
      Btile.template load<T, BN_padded, 1>(Ws + tn + kk1 * ldb_tgp);

      tile_matmad_nax(
          Dtile,
          Atile,
          metal::bool_constant<transpose_a>{},
          Btile,
          metal::bool_constant<transpose_b>{});

      (void)compiler_barrier;
    }

    x += BK;
    loader_w.next();
  }

  // Store results to device memory
  threadgroup_barrier(mem_flags::mem_threadgroup);

  Dtile.store(y + tm * N + tn, N);
}

// Kernel entry point for the NAX quantized matmul with transposed weights.
//
// Allocates the threadgroup staging buffer for the weight tile (BN rows of
// BK_padded elements), optionally lets adjust_matrix_offsets rebase the
// pointers when `batched` is set, and then hands everything to
// qmm_t_nax_tgp_impl.
//
// Buffers 0-4 are w, scales, biases, x and y; buffers 5-7 are K, N and M;
// buffers 8-15 carry the batch shape/stride description that is forwarded
// unchanged to adjust_matrix_offsets.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const bool batched,
    const int BM = 64,
    const int BK = 32,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2>
[[kernel]] void affine_qmm_t_nax(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& K [[buffer(5)]],
    const constant int& N [[buffer(6)]],
    const constant int& M [[buffer(7)]],
    const constant int& x_batch_ndims [[buffer(8)]],
    const constant int* x_shape [[buffer(9)]],
    const constant int64_t* x_strides [[buffer(10)]],
    const constant int& w_batch_ndims [[buffer(11)]],
    const constant int* w_shape [[buffer(12)]],
    const constant int64_t* w_strides [[buffer(13)]],
    const constant int64_t* s_strides [[buffer(14)]],
    const constant int64_t* b_strides [[buffer(15)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Each staged row holds BK values plus 16 bytes' worth of T as padding.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  threadgroup T w_tile[BN * BK_padded];

  // `batched` is a template constant, so the branch is resolved at compile
  // time. M * N is passed as the per-batch output stride.
  if constexpr (batched) {
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        M * N,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
  }

  qmm_t_nax_tgp_impl<T, group_size, bits, aligned_N, BM, BK, BN, WM, WN>(
      w, scales, biases, x, y, w_tile, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Kernel entry point for the NAX quantized matmul with non-transposed weights.
//
// Allocates the threadgroup staging buffer for the weight tile (BK rows of
// BN_padded elements), optionally lets adjust_matrix_offsets rebase the
// pointers when `batched` is set, and then hands everything to
// qmm_n_nax_tgp_impl.
//
// Buffers 0-4 are w, scales, biases, x and y; buffers 5-7 are K, N and M;
// buffers 8-15 carry the batch shape/stride description that is forwarded
// unchanged to adjust_matrix_offsets.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool batched,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2>
[[kernel]] void affine_qmm_n_nax(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    device T* y [[buffer(4)]],
    const constant int& K [[buffer(5)]],
    const constant int& N [[buffer(6)]],
    const constant int& M [[buffer(7)]],
    const constant int& x_batch_ndims [[buffer(8)]],
    const constant int* x_shape [[buffer(9)]],
    const constant int64_t* x_strides [[buffer(10)]],
    const constant int& w_batch_ndims [[buffer(11)]],
    const constant int* w_shape [[buffer(12)]],
    const constant int64_t* w_strides [[buffer(13)]],
    const constant int64_t* s_strides [[buffer(14)]],
    const constant int64_t* b_strides [[buffer(15)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Each staged row holds BN values plus 16 bytes' worth of T as padding.
  constexpr int BN_padded = (BN + 16 / sizeof(T));
  threadgroup T w_tile[BK * BN_padded];

  // `batched` is a template constant, so the branch is resolved at compile
  // time. M * N is passed as the per-batch output stride.
  if constexpr (batched) {
    adjust_matrix_offsets<T>(
        x, w, scales, biases, y,
        M * N,
        x_batch_ndims, x_shape, x_strides,
        w_batch_ndims, w_shape, w_strides,
        s_strides, b_strides,
        tid);
  }

  qmm_n_nax_tgp_impl<T, group_size, bits, BM, BK, BN, WM, WN>(
      w, scales, biases, x, y, w_tile, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Kernel entry point for the gathered NAX quantized matmul with transposed
// weights.
//
// Unlike affine_qmm_t_nax this always calls the gather overload of
// adjust_matrix_offsets, which additionally receives lhs_indices/rhs_indices
// and the batch shape/strides used to walk them, before delegating to
// qmm_t_nax_tgp_impl.
//
// Buffers 0-3 are w, scales, biases and x; 4-5 the lhs/rhs index arrays; 6 is
// y; 7-9 are K, N and M; 10-21 are the shape/stride descriptions forwarded to
// adjust_matrix_offsets.
template <
    typename T,
    const int group_size,
    const int bits,
    const bool aligned_N,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2>
[[kernel]] void affine_gather_qmm_t_nax(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    const device uint32_t* lhs_indices [[buffer(4)]],
    const device uint32_t* rhs_indices [[buffer(5)]],
    device T* y [[buffer(6)]],
    const constant int& K [[buffer(7)]],
    const constant int& N [[buffer(8)]],
    const constant int& M [[buffer(9)]],
    const constant int& x_batch_ndims [[buffer(10)]],
    const constant int* x_shape [[buffer(11)]],
    const constant int64_t* x_strides [[buffer(12)]],
    const constant int& w_batch_ndims [[buffer(13)]],
    const constant int* w_shape [[buffer(14)]],
    const constant int64_t* w_strides [[buffer(15)]],
    const constant int64_t* s_strides [[buffer(16)]],
    const constant int64_t* b_strides [[buffer(17)]],
    const constant int& batch_ndims [[buffer(18)]],
    const constant int* batch_shape [[buffer(19)]],
    const constant int64_t* lhs_strides [[buffer(20)]],
    const constant int64_t* rhs_strides [[buffer(21)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Each staged row holds BK values plus 16 bytes' worth of T as padding.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  threadgroup T w_tile[BN * BK_padded];

  // Rebase every pointer for this threadgroup's gathered (lhs, rhs) pair.
  // M * N is passed as the per-batch output stride.
  adjust_matrix_offsets<T>(
      x, w, scales, biases,
      lhs_indices, rhs_indices,
      y,
      M * N,
      batch_ndims, batch_shape,
      lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  qmm_t_nax_tgp_impl<T, group_size, bits, aligned_N, BM, BK, BN, WM, WN>(
      w, scales, biases, x, y, w_tile, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Kernel entry point for the gathered NAX quantized matmul with
// non-transposed weights.
//
// Always calls the gather overload of adjust_matrix_offsets (which receives
// lhs_indices/rhs_indices and the batch shape/strides used to walk them) and
// then delegates to qmm_n_nax_tgp_impl.
//
// Buffers 0-3 are w, scales, biases and x; 4-5 the lhs/rhs index arrays; 6 is
// y; 7-9 are K, N and M; 10-21 are the shape/stride descriptions forwarded to
// adjust_matrix_offsets.
template <
    typename T,
    const int group_size,
    const int bits,
    const int BM = 64,
    const int BK = 64,
    const int BN = 64,
    const int WM = 2,
    const int WN = 2>
[[kernel]] void affine_gather_qmm_n_nax(
    const device uint32_t* w [[buffer(0)]],
    const device T* scales [[buffer(1)]],
    const device T* biases [[buffer(2)]],
    const device T* x [[buffer(3)]],
    const device uint32_t* lhs_indices [[buffer(4)]],
    const device uint32_t* rhs_indices [[buffer(5)]],
    device T* y [[buffer(6)]],
    const constant int& K [[buffer(7)]],
    const constant int& N [[buffer(8)]],
    const constant int& M [[buffer(9)]],
    const constant int& x_batch_ndims [[buffer(10)]],
    const constant int* x_shape [[buffer(11)]],
    const constant int64_t* x_strides [[buffer(12)]],
    const constant int& w_batch_ndims [[buffer(13)]],
    const constant int* w_shape [[buffer(14)]],
    const constant int64_t* w_strides [[buffer(15)]],
    const constant int64_t* s_strides [[buffer(16)]],
    const constant int64_t* b_strides [[buffer(17)]],
    const constant int& batch_ndims [[buffer(18)]],
    const constant int* batch_shape [[buffer(19)]],
    const constant int64_t* lhs_strides [[buffer(20)]],
    const constant int64_t* rhs_strides [[buffer(21)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint lid [[thread_index_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // Each staged row holds BN values plus 16 bytes' worth of T as padding.
  constexpr int BN_padded = (BN + 16 / sizeof(T));
  threadgroup T w_tile[BK * BN_padded];

  // Rebase every pointer for this threadgroup's gathered (lhs, rhs) pair.
  // M * N is passed as the per-batch output stride.
  adjust_matrix_offsets<T>(
      x, w, scales, biases,
      lhs_indices, rhs_indices,
      y,
      M * N,
      batch_ndims, batch_shape,
      lhs_strides, rhs_strides,
      x_batch_ndims, x_shape, x_strides,
      w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides,
      tid);

  qmm_n_nax_tgp_impl<T, group_size, bits, BM, BK, BN, WM, WN>(
      w, scales, biases, x, y, w_tile, K, N, M, tid, lid, simd_gid, simd_lid);
}

// Gathered quantized matmul where only the right-hand side is gathered:
// row r of `x` is multiplied with the weight matrix selected by indices[r].
//
// Each threadgroup owns the BM x BN output block at (tid.y, tid.x). The rows
// of that block are split into runs of consecutive rows sharing the same
// index; for every run the whole block is multiplied against that run's
// weights and only the rows belonging to the run are stored.
//
//   x, y               activations (row stride K) and output (row stride N)
//   w, scales, biases  stacked quantized weights and per-group parameters
//   indices            one weight-matrix index per row of x
//   M, N, K            problem sizes
//   transpose          selects the weight layout / loader orientation
//
// align_M, align_N and align_K are not parameters of this kernel; they are
// declared outside this block (presumably as function constants — confirm).
//
// Changes relative to the previous revision:
//   * The K-tail pass now limits the activation tile to the columns that are
//     actually left (k_remain - kk1). It used BK - kk1, which always equals
//     at least SK, so columns past K were read and multiplied in.
//   * The per-matrix strides are computed in size_t instead of int so that
//     N * K_w (or K * N_w) cannot overflow before being widened.
template <
    typename T,
    int group_size,
    int bits,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose>
[[kernel]] void affine_gather_qmm_rhs_nax(
    const device T* x [[buffer(0)]],
    const device uint32_t* w [[buffer(1)]],
    const device T* scales [[buffer(2)]],
    const device T* biases [[buffer(3)]],
    const device uint32_t* indices [[buffer(4)]],
    device T* y [[buffer(5)]],
    const constant int& M [[buffer(6)]],
    const constant int& N [[buffer(7)]],
    const constant int& K [[buffer(8)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]]) {
  constexpr int pack_factor = get_pack_factor<bits, 8>();
  constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
  // Staged rows carry 16 bytes' worth of T as padding.
  constexpr int BK_padded = (BK + 16 / sizeof(T));
  constexpr int BN_padded = (BN + 16 / sizeof(T));

  using loader_w_t = QuantizedBlockLoader<
      T,
      transpose ? BN : BK,
      transpose ? BK : BN,
      transpose ? BK_padded : BN_padded,
      transpose,
      WM * WN * SIMD_SIZE,
      group_size,
      bits>;

  threadgroup T Ws[transpose ? BN * BK_padded : BK * BN_padded];

  // Compute the block
  const int K_w = K * bytes_per_pack / pack_factor;
  const int K_g = K / group_size;
  const int N_w = N * bytes_per_pack / pack_factor;
  const int N_g = N / group_size;
  const int K_it = K / BK;
  // Distance between consecutive weight matrices; widen before multiplying.
  const size_t stride_w = transpose ? size_t(N) * K_w : size_t(K) * N_w;
  const size_t stride_s = transpose ? size_t(N) * K_g : size_t(K) * N_g;
  const int y_row = tid.y * BM;
  const int y_col = tid.x * BN;
  const size_t y_row_long = size_t(y_row);
  const size_t y_col_long = size_t(y_col);

  // Prepare threadgroup bounds
  const short tgp_bm = align_M ? BM : short(min(BM, M - y_row));
  const short tgp_bn = align_N ? BN : short(min(BN, N - y_col));

  // Calculate the final tiles in the case that K is not aligned
  const int k_remain = K - K_it * BK;
  const short2 tile_w =
      transpose ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

  // Move x and output to the correct block
  auto wl = (const device uint8_t*)w;
  x += y_row_long * K;
  y += y_row_long * N + y_col_long;
  wl += transpose ? y_col_long * K_w : y_col * bytes_per_pack / pack_factor;
  scales += transpose ? y_col_long * K_g : y_col / group_size;
  biases += transpose ? y_col_long * K_g : y_col / group_size;

  // Per-simdgroup sub-block sizes and their decomposition into 16-wide tiles.
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;
  constexpr short TK = SK / 16;

  // Origin of this simdgroup's sub-block inside the threadgroup block.
  const short tm = SM * (simd_group_id / WN);
  const short tn = SN * (simd_group_id % WN);

  // Valid rows / columns of this simdgroup's sub-block, clamped at 0.
  const short sgp_sm =
      align_M ? SM : min(SM, short(max(0, (M - (y_row + tm)))));
  const short sgp_sn =
      align_N ? SN : min(SN, short(max(0, (N - (y_col + tn)))));

  const bool is_unaligned_sm = align_M ? false : (sgp_sm != SM);
  const bool is_unaligned_bn = align_N ? false : (tgp_bn != BN);

  // Shape of the B tile depends on whether the weights are transposed.
  constexpr short BR = transpose ? TN : TK;
  constexpr short BC = transpose ? TK : TN;

  using AccumType = float;

  // Do as many matmuls as necessary
  uint32_t index;
  short offset;
  uint32_t index_next = indices[y_row];
  short offset_next = 0;
  int n = 0;
  while (n < tgp_bm) {
    // Find the run [offset, offset_next) of rows that all use `index`.
    n++;
    offset = offset_next;
    index = index_next;
    offset_next = tgp_bm;
    for (; n < tgp_bm; n++) {
      if (indices[y_row + n] != index) {
        offset_next = n;
        index_next = indices[y_row + n];
        break;
      }
    }
    threadgroup_barrier(mem_flags::mem_none);

    NAXTile<AccumType, TM, TN> Dtile;
    Dtile.clear();

    const device T* xn = x + tm * K;

    // Prepare threadgroup loading operations
    thread loader_w_t loader_w(
        wl + index * stride_w,
        scales + index * stride_s,
        biases + index * stride_s,
        transpose ? K : N,
        Ws,
        simd_group_id,
        simd_lane_id);

    dispatch_bool(align_M || !is_unaligned_sm, [&](auto kAlignedM) {
      dispatch_bool(align_N || !is_unaligned_bn, [&](auto kAlignedN) {
        // Full BK-wide steps along K.
        for (int k = 0; k < K_it; k++) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          if constexpr (kAlignedN.value) {
            loader_w.load_unsafe();
          } else {
            loader_w.load_safe(
                transpose ? short2(BK, tgp_bn) : short2(tgp_bn, BK));
          }

          threadgroup_barrier(mem_flags::mem_threadgroup);

          STEEL_PRAGMA_NO_UNROLL
          for (int kk1 = 0; kk1 < BK; kk1 += SK) {
            NAXTile<T, TM, TK> Atile;
            NAXTile<T, BR, BC> Btile;

            volatile int compiler_barrier;

            if constexpr (kAlignedM.value) {
              Atile.load(xn + kk1, K);
            } else {
              Atile.load_safe(xn + kk1, K, short2(SK, sgp_sm));
            }

            if constexpr (transpose) {
              Btile.template load<T, BK_padded, 1>(Ws + tn * BK_padded + kk1);
            } else {
              Btile.template load<T, BN_padded, 1>(Ws + tn + kk1 * BN_padded);
            }

            tile_matmad_nax(
                Dtile,
                Atile,
                metal::bool_constant<false>{},
                Btile,
                metal::bool_constant<transpose>{});

            (void)compiler_barrier;
          }

          xn += BK;
          loader_w.next();
        }

        // Partial last step when K is not a multiple of BK.
        if (!align_K) {
          threadgroup_barrier(mem_flags::mem_threadgroup);
          loader_w.load_safe(tile_w);
          threadgroup_barrier(mem_flags::mem_threadgroup);

          STEEL_PRAGMA_NO_UNROLL
          for (int kk1 = 0; kk1 < BK; kk1 += SK) {
            NAXTile<T, TM, TK> Atile;
            NAXTile<T, BR, BC> Btile;

            volatile int compiler_barrier;

            // Only the columns still inside K may be read from x; this is 0
            // once kk1 has moved past the remainder.
            const short psk = min(int(SK), max(0, (k_remain - kk1)));
            Atile.load_safe(xn + kk1, K, short2(psk, sgp_sm));

            if constexpr (transpose) {
              Btile.template load<T, BK_padded, 1>(Ws + tn * BK_padded + kk1);
            } else {
              Btile.template load<T, BN_padded, 1>(Ws + tn + kk1 * BN_padded);
            }

            tile_matmad_nax(
                Dtile,
                Atile,
                metal::bool_constant<false>{},
                Btile,
                metal::bool_constant<transpose>{});

            (void)compiler_barrier;
          }
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Rows of this simdgroup's sub-block that belong to the current run.
        const short m_lo_lim = min(int(sgp_sm), max(0, offset - tm));
        const short m_hi_lim = min(int(sgp_sm), max(0, offset_next - tm));

        // Store results to device memory
        if constexpr (kAlignedN.value) {
          if (m_lo_lim == 0 && m_hi_lim == SM) {
            Dtile.store(y + tm * N + tn, N);
          } else {
            Dtile.store_slice(
                y + tm * N + tn, N, short2(0, m_lo_lim), short2(SN, m_hi_lim));
          }
        } else {
          Dtile.store_slice(
              y + tm * N + tn,
              N,
              short2(0, m_lo_lim),
              short2(sgp_sn, m_hi_lim));
        }
      });
    });
  }
}

================================================
FILE: mlx/backend/metal/kernels/quantized_nax.metal
================================================
// Copyright © 2023-2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/steel/gemm/nax.h"
#include "mlx/backend/metal/kernels/steel/gemm/loader.h"
#include "mlx/backend/metal/kernels/quantized_nax.h"

// Instantiates `name<type, group_size, bits, bm, bk, bn, wm, wn>` under the
// host name "<name>_<type>_gs_<group_size>_b_<bits>". Note that the macro
// takes the tile sizes as (bm, bn, bk) but forwards them as (bm, bk, bn).
#define instantiate_quantized(name, type, group_size, bits, bm, bn, bk, wm, wn)  \
  instantiate_kernel(                                                    \
      #name "_" #type "_gs_" #group_size "_b_" #bits,                    \
      name,                                                              \
      type,                                                              \
      group_size,                                                        \
      bits, bm, bk, bn, wm, wn)

// Same as above with an extra `batched` template argument placed before the
// tile sizes; the host name also encodes the tile sizes and "_batch_<batched>".
#define instantiate_quantized_batched(name, type, group_size, bits, bm, bn, bk, wm, wn, batched)     \
  instantiate_kernel(                                                    \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_batch_" #batched, \
      name,                                                              \
      type,                                                              \
      group_size,                                                        \
      bits,                                                              \
      batched, bm, bk, bn, wm, wn)

// Variant with an `aligned` template argument before the tile sizes; the host
// name ends in "_alN_<aligned>".
#define instantiate_quantized_aligned(name, type, group_size, bits, bm, bn, bk, wm, wn, aligned)     \
  instantiate_kernel(                                                                     \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned, \
      name,                                                                  \
      type,                                                                  \
      group_size,                                                            \
      bits,                                                                  \
      aligned, bm, bk, bn, wm, wn)

// Variant with both `aligned` and `batched` template arguments (in that
// order); the host name ends in "_alN_<aligned>_batch_<batched>".
#define instantiate_quantized_aligned_batched(name, type, group_size, bits, bm, bn, bk, wm, wn, aligned, batched)     \
  instantiate_kernel(                                                                     \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn "_alN_" #aligned "_batch_" #batched, \
      name,                                                                  \
      type,                                                                  \
      group_size,                                                            \
      bits,                                                                  \
      aligned,                                                               \
      batched, bm, bk, bn, wm, wn)

// Instantiates `func` with template arguments in the order
// (type, group_size, bits, bm, bn, bk, wm, wn, transpose). The host name is
// built from `name`, not `func`, and does not include `transpose`, so callers
// encode the layout in `name` itself.
#define instantiate_gather_qmm_rhs(func, name, type, group_size, bits, bm, bn, bk, wm, wn, transpose)        \
  instantiate_kernel(                                                                                        \
      #name "_" #type "_gs_" #group_size "_b_" #bits "_bm_" #bm "_bn_" #bn "_bk_" #bk "_wm_" #wm "_wn_" #wn, \
      func,                                                         \
      type,                                                         \
      group_size,                                                   \
      bits,                                                         \
      bm,                                                           \
      bn,                                                           \
      bk,                                                           \
      wm,                                                           \
      wn,                                                           \
      transpose)

// Emits the batched (1) and non-batched (0) variants of `name` with
// 64x64x64 tiles and a 2x2 simdgroup layout.
#define instantiate_quantized_batched_wrap(name, type, group_size, bits) \
  instantiate_quantized_batched(name, type, group_size, bits, 64, 64, 64, 2, 2, 1)      \
  instantiate_quantized_batched(name, type, group_size, bits, 64, 64, 64, 2, 2, 0)

#define instantiate_quantized_all_batched(type, group_size, bits) \
  instantiate_quantized_batched_wrap(affine_qmm_n_nax, type, group_size, bits)


// NOTE(review): this macro is defined but is not expanded by
// instantiate_quantized_funcs below, so affine_gather_qmm_n_nax is not
// instantiated by this file — confirm whether that is intentional.
#define instantiate_quantized_all_single(type, group_size, bits) \
  instantiate_quantized(affine_gather_qmm_n_nax, type, group_size, bits, 64, 64, 64, 2, 2)

// Transposed-weight kernels: the gather variant for aligned_N in
// {true, false}, and the plain variant for every combination of aligned_N and
// batched.
#define instantiate_quantized_all_aligned(type, group_size, bits)   \
  instantiate_quantized_aligned(affine_gather_qmm_t_nax, type, group_size, bits, 64, 64, 64, 2, 2, true) \
  instantiate_quantized_aligned(affine_gather_qmm_t_nax, type, group_size, bits, 64, 64, 64, 2, 2, false) \
  instantiate_quantized_aligned_batched(affine_qmm_t_nax, type, group_size, bits, 64, 64, 64, 2, 2, true, 1) \
  instantiate_quantized_aligned_batched(affine_qmm_t_nax, type, group_size, bits, 64, 64, 64, 2, 2, true, 0) \
  instantiate_quantized_aligned_batched(affine_qmm_t_nax, type, group_size, bits, 64, 64, 64, 2, 2, false, 1) \
  instantiate_quantized_aligned_batched(affine_qmm_t_nax, type, group_size, bits, 64, 64, 64, 2, 2, false, 0)

// Right-hand-side gather kernel in both layouts: "_nt" is instantiated with
// transpose = true and "_nn" with transpose = false.
#define instantiate_quantized_all_rhs(type, group_size, bits) \
  instantiate_gather_qmm_rhs(affine_gather_qmm_rhs_nax, affine_gather_qmm_rhs_nax_nt, type, group_size, bits, 64, 64, 64, 2, 2, true) \
  instantiate_gather_qmm_rhs(affine_gather_qmm_rhs_nax, affine_gather_qmm_rhs_nax_nn, type, group_size, bits, 64, 64, 64, 2, 2, false)

// Everything instantiated for one (type, group_size, bits) combination.
#define instantiate_quantized_funcs(type, group_size, bits) \
  instantiate_quantized_all_batched(type, group_size, bits) \
  instantiate_quantized_all_aligned(type, group_size, bits) \
  instantiate_quantized_all_rhs(type, group_size, bits)

// Element types: float, float16_t and bfloat16_t.
#define instantiate_quantized_types(group_size, bits)       \
  instantiate_quantized_funcs(float, group_size, bits)      \
  instantiate_quantized_funcs(float16_t, group_size, bits)  \
  instantiate_quantized_funcs(bfloat16_t, group_size, bits)  

// Quantization group sizes: 128, 64 and 32.
#define instantiate_quantized_groups(bits) \
  instantiate_quantized_types(128, bits)   \
  instantiate_quantized_types(64, bits)    \
  instantiate_quantized_types(32, bits)

// Bit widths: 2, 3, 4, 5, 6 and 8.
#define instantiate_quantized_all() \
  instantiate_quantized_groups(2) \
  instantiate_quantized_groups(3) \
  instantiate_quantized_groups(4) \
  instantiate_quantized_groups(5) \
  instantiate_quantized_groups(6) \
  instantiate_quantized_groups(8)

// Expand the full cross product of bit widths, group sizes and element types.
instantiate_quantized_all() // clang-format on


================================================
FILE: mlx/backend/metal/kernels/quantized_utils.h
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_simdgroup>
#include <metal_stdlib>

// Main GEMM loop for the case where every tile is full: runs `k_iterations`
// steps of (unchecked load of A and B into threadgroup memory) followed by a
// multiply-accumulate on the staged tiles, advancing both loaders after each
// step.
//
//   As, Bs        threadgroup tiles consumed by mma_op.mma
//   mma_op        accumulator object; only its mma(As, Bs) member is used
//   loader_a/b    tile loaders; load_unsafe() and next() are used
//   k_iterations  number of steps; nothing happens when it is <= 0
template <typename T, typename mma_t, typename loader_a_t, typename loader_b_t>
METAL_FUNC void gemm_loop_aligned(
    threadgroup T* As,
    threadgroup T* Bs,
    thread mma_t& mma_op,
    thread loader_a_t& loader_a,
    thread loader_b_t& loader_b,
    const int k_iterations) {
  for (int steps_left = k_iterations; steps_left > 0; --steps_left) {
    // Wait until the previous step's tiles are no longer being read.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Stage the next pair of tiles without bounds checks.
    loader_a.load_unsafe();
    loader_b.load_unsafe();

    // Make the freshly staged tiles visible to the whole threadgroup.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Accumulate the product of the staged tiles.
    mma_op.mma(As, Bs);

    // Advance both loaders to the following tile.
    loader_a.next();
    loader_b.next();
  }
}

// Main GEMM loop for blocks that may be partial in M and/or N. Identical in
// structure to gemm_loop_aligned, except that a loader whose dimension is not
// aligned uses load_safe with the valid tile extent instead of load_unsafe.
//
//   rows_aligned  when true, A is loaded without bounds checks
//   cols_aligned  when true, B is loaded without bounds checks
//   transpose     selects the component order of B's extent:
//                 (tgp_bk, tgp_bn) when true, (tgp_bn, tgp_bk) otherwise
//   tgp_bm/bn/bk  valid extents of the block along M, N and K
template <
    bool rows_aligned,
    bool cols_aligned,
    bool transpose,
    typename T,
    typename mma_t,
    typename loader_a_t,
    typename loader_b_t>
METAL_FUNC void gemm_loop_unaligned(
    threadgroup T* As,
    threadgroup T* Bs,
    thread mma_t& mma_op,
    thread loader_a_t& loader_a,
    thread loader_b_t& loader_b,
    const int k_iterations,
    const short tgp_bm,
    const short tgp_bn,
    const short tgp_bk) {
  // The valid extents do not change between steps, so build them once.
  const short2 extent_a = short2(tgp_bk, tgp_bm);
  const short2 extent_b =
      transpose ? short2(tgp_bk, tgp_bn) : short2(tgp_bn, tgp_bk);

  for (int steps_left = k_iterations; steps_left > 0; --steps_left) {
    // Wait until the previous step's tiles are no longer being read.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Stage A, bounds-checked only when the rows are not aligned.
    if (!rows_aligned) {
      loader_a.load_safe(extent_a);
    } else {
      loader_a.load_unsafe();
    }

    // Stage B, bounds-checked only when the columns are not aligned.
    if (!cols_aligned) {
      loader_b.load_safe(extent_b);
    } else {
      loader_b.load_unsafe();
    }

    // Make the freshly staged tiles visible to the whole threadgroup.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Accumulate the product of the staged tiles.
    mma_op.mma(As, Bs);

    // Advance both loaders to the following tile.
    loader_a.next();
    loader_b.next();
  }
}

// Performs one final, bounds-checked GEMM step: loads A and B with the given
// tile extents and accumulates their product.
//
//   tile_a, tile_b  extents passed to load_safe for A and B respectively
//
// Unlike the loop helpers above, there is no barrier before the loads and the
// loaders are not advanced afterwards. NOTE(review): callers presumably issue
// a threadgroup barrier themselves before calling this so that As/Bs are not
// overwritten while still being read — confirm at the call sites.
template <typename T, typename mma_t, typename loader_a_t, typename loader_b_t>
METAL_FUNC void gemm_loop_finalize(
    threadgroup T* As,
    threadgroup T* Bs,
    thread mma_t& mma_op,
    thread loader_a_t& loader_a,
    thread loader_b_t& loader_b,
    const short2 tile_a,
    const short2 tile_b) {
  loader_a.load_safe(tile_a);
  loader_b.load_safe(tile_b);
  // Make the staged tiles visible to the whole threadgroup before using them.
  threadgroup_barrier(mem_flags::mem_threadgroup);
  mma_op.mma(As, Bs);
}


================================================
FILE: mlx/backend/metal/kernels/random.metal
================================================
// Copyright © 2023 Apple Inc.

#include "mlx/backend/metal/kernels/utils.h"

// Left-rotation amounts used by threefry2x32_hash: row (i % 2) is applied,
// one entry per mixing step, in outer iteration i.
static constexpr constant uint32_t rotations[2][4] = {
    {13, 15, 26, 6},
    {17, 29, 16, 24}};

// Result of one hash: two 32-bit words that can also be accessed byte by
// byte (bytes[0] overlays val.x, bytes[1] overlays val.y).
union rbits {
  uint2 val;
  uchar4 bytes[2];
};

// Hashes the 64-bit counter `count` under the 64-bit `key` and returns the
// two resulting 32-bit words.
//
// The key is extended with a third word (key.x ^ key.y ^ 0x1BD11BDA). The
// counter is first offset by the first two key words, then five iterations
// are run; each applies four add / rotate-left / xor steps using one row of
// `rotations` and then injects two words of the extended key plus the
// iteration number.
rbits threefry2x32_hash(const thread uint2& key, uint2 count) {
  uint4 ks = {key.x, key.y, key.x ^ key.y ^ 0x1BD11BDA};

  rbits state;
  state.val.x = count.x + ks[0];
  state.val.y = count.y + ks[1];

  for (int round = 0; round < 5; ++round) {
    // Even iterations use the first row of rotations, odd ones the second.
    const int row = round % 2;
    for (int step = 0; step < 4; ++step) {
      const uint32_t rot = rotations[row][step];
      state.val.x += state.val.y;
      // Rotate val.y left by `rot` bits.
      state.val.y = (state.val.y << rot) | (state.val.y >> (32 - rot));
      state.val.y ^= state.val.x;
    }
    // Key injection for this iteration.
    state.val.x += ks[(round + 1) % 3];
    state.val.y += ks[(round + 2) % 3] + round + 1;
  }

  return state;
}

// Generates random bytes for keys that are stored contiguously.
//
// Thread (x, y) hashes key number x (words keys[2x] and keys[2x + 1]) with
// the counter (y, y + grid_dim.y) and writes the first result word at byte
// offset 4 * y and the second at byte offset 4 * (y + grid_dim.y) inside the
// `bytes_per_key`-byte output slice of key x.
//
//   keys           two 32-bit words per key, indexed directly
//   out            output bytes, bytes_per_key per key
//   odd            when set, the thread with y == grid_dim.y - 1 only emits
//                  its first word
//   bytes_per_key  size of each key's output slice
//
// The per-key output offset is computed in size_t, matching the strided
// `rbits` kernel; the previous 32-bit product index.x * bytes_per_key could
// wrap for large outputs.
[[kernel]] void rbitsc(
    device const uint32_t* keys,
    device char* out,
    constant const bool& odd,
    constant const uint& bytes_per_key,
    uint2 grid_dim [[threads_per_grid]],
    uint2 index [[thread_position_in_grid]]) {
  auto kidx = 2 * index.x;
  auto key = uint2(keys[kidx], keys[kidx + 1]);
  auto half_size = grid_dim.y - odd;
  // Widen before multiplying so the byte offset cannot overflow 32 bits.
  out += size_t(index.x) * bytes_per_key;
  bool drop_last = odd && (index.y == half_size);
  auto bits = threefry2x32_hash(
      key, uint2(index.y, drop_last ? 0 : index.y + grid_dim.y));
  // First word: always written in full.
  size_t idx = size_t(index.y) << 2;
  for (int i = 0; i < 4; ++i) {
    out[idx + i] = bits.bytes[0][i];
  }
  if (!drop_last) {
    // Second word lives grid_dim.y words further into the slice.
    idx = (size_t(index.y) + grid_dim.y) << 2;
    if ((index.y + 1) == half_size && (bytes_per_key % 4) > 0) {
      // Final word of a slice whose size is not a multiple of 4: write only
      // the bytes that remain.
      int edge_bytes = (bytes_per_key % 4);
      for (int i = 0; i < edge_bytes; ++i) {
        out[idx + i] = bits.bytes[1][i];
      }
    } else {
      for (int i = 0; i < 4; ++i) {
        out[idx + i] = bits.bytes[1][i];
      }
    }
  }
}

// Generates random bytes for keys addressed through a shape/stride
// description.
//
// Thread (x, y) hashes key number x with the counter (y, y + grid_dim.y) and
// writes the first result word at byte offset 4 * y and the second at byte
// offset 4 * (y + grid_dim.y) inside the `bytes_per_key`-byte output slice of
// key x.
//
//   keys                         key words, located via elem_to_loc
//   out                          output bytes, bytes_per_key per key
//   odd                          when set, the thread with
//                                y == grid_dim.y - 1 only emits its first word
//   bytes_per_key                size of each key's output slice
//   ndim, key_shape, key_strides layout of `keys` passed to elem_to_loc
[[kernel]] void rbits(
    device const uint32_t* keys,
    device char* out,
    constant const bool& odd,
    constant const uint& bytes_per_key,
    constant const int& ndim,
    constant const int* key_shape,
    constant const int64_t* key_strides,
    uint2 grid_dim [[threads_per_grid]],
    uint2 index [[thread_position_in_grid]]) {
  // Resolve the two words of this thread's key.
  const auto first_word = 2 * index.x;
  const auto loc_lo = elem_to_loc(first_word, key_shape, key_strides, ndim);
  const auto loc_hi = elem_to_loc(first_word + 1, key_shape, key_strides, ndim);
  const auto key = uint2(keys[loc_lo], keys[loc_hi]);

  const auto half_size = grid_dim.y - odd;
  const bool drop_last = odd && (index.y == half_size);

  // Start of this key's output slice.
  device char* dst = out + size_t(index.x) * bytes_per_key;

  const auto bits = threefry2x32_hash(
      key, uint2(index.y, drop_last ? 0 : index.y + grid_dim.y));

  // First word: always written in full.
  const size_t lo_off = size_t(index.y) << 2;
  for (int b = 0; b < 4; ++b) {
    dst[lo_off + b] = bits.bytes[0][b];
  }

  // The extra thread of an odd-sized slice has no second word.
  if (drop_last) {
    return;
  }

  // Second word lives grid_dim.y words further into the slice; the final
  // word is truncated when bytes_per_key is not a multiple of 4.
  const size_t hi_off = (size_t(index.y) + grid_dim.y) << 2;
  const int tail_bytes = (bytes_per_key % 4);
  const bool is_partial = (index.y + 1) == half_size && tail_bytes > 0;
  const int n_bytes = is_partial ? tail_bytes : 4;
  for (int b = 0; b < n_bytes; ++b) {
    dst[hi_off + b] = bits.bytes[1][b];
  }
}


================================================
FILE: mlx/backend/metal/kernels/reduce.h
================================================
#pragma once
#include "mlx/backend/metal/kernels/reduction/reduce_all.h"
#include "mlx/backend/metal/kernels/reduction/reduce_col.h"
#include "mlx/backend/metal/kernels/reduction/reduce_init.h"
#include "mlx/backend/metal/kernels/reduction/reduce_row.h"


================================================
FILE: mlx/backend/metal/kernels/reduce.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_atomic>
#include <metal_simdgroup>

// clang-format off
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/atomic.h"
#include "mlx/backend/metal/kernels/reduction/ops.h"
#include "mlx/backend/metal/kernels/reduce.h"

#define instantiate_init_reduce(name, tname, type, op) \
  instantiate_kernel("init_reduce_" #name #tname, init_reduce, type, op<type>)

instantiate_init_reduce(and, bool_, bool, And)
instantiate_init_reduce(or, bool_, bool, Or)

#define instantiate_init_sum_prod(name, op)                 \
  instantiate_init_reduce(name, int32, int32_t, op)         \
  instantiate_init_reduce(name, int64, int64_t, op)         \
  instantiate_init_reduce(name, float16, float16_t, op)     \
  instantiate_init_reduce(name, bfloat16, bfloat16_t, op)   \
  instantiate_init_reduce(name, float32, float, op)         \
  instantiate_init_reduce(name, complex64, complex64_t, op)

instantiate_init_sum_prod(sum, Sum)
instantiate_init_sum_prod(prod, Prod)

// Emits `init_reduce` instantiations of `op` for every element type listed
// below: bool, the signed and unsigned 8/16/32/64-bit integers, float16,
// bfloat16, float32 and complex64.
#define instantiate_init_min_max(name, op)                   \
  instantiate_init_reduce(name, bool_, bool, op)             \
  instantiate_init_reduce(name, int8, int8_t, op)            \
  instantiate_init_reduce(name, int16, int16_t, op)          \
  instantiate_init_reduce(name, int32, int32_t, op)          \
  instantiate_init_reduce(name, int64, int64_t, op)          \
  instantiate_init_reduce(name, uint8, uint8_t, op)          \
  instantiate_init_reduce(name, uint16, uint16_t, op)        \
  instantiate_init_reduce(name, uint32, uint32_t, op)        \
  instantiate_init_reduce(name, uint64, uint64_t, op)        \
  instantiate_init_reduce(name, float16, float16_t, op)      \
  instantiate_init_reduce(name, bfloat16, bfloat16_t, op)    \
  instantiate_init_reduce(name, float32, float, op)          \
  instantiate_init_reduce(name, complex64, complex64_t, op)

instantiate_init_min_max(min, Min)
instantiate_init_min_max(max, Max)

// Instantiates `all_reduce<itype, otype, op>` under the string
// "all_reduce_" + stringized `name`. The remaining template parameters of
// `all_reduce` (IdxT, N_READS) keep their defaults.
#define instantiate_all_reduce(name, itype, otype, op) \
  instantiate_kernel("all_reduce_" #name,              \
                     all_reduce,                       \
                     itype, otype, op)

// Instantiates four kernels for a given NDIMS value `dim`:
// `col_reduce_small` and `col_reduce_longcolumn`, each once with `int` and
// once with `int64_t` as the index type. The `int64_t` variants carry
// "_large_" in their name string.
#define instantiate_col_reduce_small(name, itype, otype, op, dim)          \
  instantiate_kernel("col_reduce_small_" #dim "_reduce_" #name,            \
                     col_reduce_small,                                     \
                     itype, otype, op, int, dim)                           \
  instantiate_kernel("col_reduce_longcolumn_" #dim "_reduce_" #name,       \
                     col_reduce_longcolumn,                                \
                     itype, otype, op, int, dim)                           \
  instantiate_kernel("col_reduce_small_large_" #dim "_reduce_" #name,      \
                     col_reduce_small,                                     \
                     itype, otype, op, int64_t, dim)                       \
  instantiate_kernel("col_reduce_longcolumn_large_" #dim "_reduce_" #name, \
                     col_reduce_longcolumn,                                \
                     itype, otype, op, int64_t, dim)

// Instantiates `col_reduce_looped` for one (dim, bm, bn) tile configuration,
// once with `int` and once with `int64_t` ("_large_" in the name string) as
// the index type.
#define instantiate_col_reduce_looped_tile(name, itype, otype, op, dim, bm, bn)        \
  instantiate_kernel("col_reduce_looped_" #dim "_" #bm "_" #bn "_reduce_" #name,       \
                     col_reduce_looped,                                                \
                     itype, otype, op, int, dim, bm, bn)                               \
  instantiate_kernel("col_reduce_looped_large_" #dim "_" #bm "_" #bn "_reduce_" #name, \
                     col_reduce_looped,                                                \
                     itype, otype, op, int64_t, dim, bm, bn)

// Instantiates `col_reduce_2pass` for one (dim, bm, bn) tile configuration,
// once with `int` and once with `int64_t` ("_large_" in the name string) as
// the index type.
#define instantiate_col_reduce_2pass_tile(name, itype, otype, op, dim, bm, bn)        \
  instantiate_kernel("col_reduce_2pass_" #dim "_" #bm "_" #bn "_reduce_" #name,       \
                     col_reduce_2pass,                                                \
                     itype, otype, op, int, dim, bm, bn)                              \
  instantiate_kernel("col_reduce_2pass_large_" #dim "_" #bm "_" #bn "_reduce_" #name, \
                     col_reduce_2pass,                                                \
                     itype, otype, op, int64_t, dim, bm, bn)

// Instantiates both the looped and the two-pass column kernels for `dim`
// using a 32 x 32 tile (BM = 32, BN = 32).
#define instantiate_col_reduce_looped(name, itype, otype, op, dim)        \
  instantiate_col_reduce_looped_tile(name, itype, otype, op, dim, 32, 32) \
  instantiate_col_reduce_2pass_tile(name, itype, otype, op, dim, 32, 32)

// Instantiates every column-reduction kernel variant for NDIMS values
// 1, 2 and 5.
#define instantiate_col_reduce_general(name, itype, otype, op) \
  instantiate_col_reduce_small(name, itype, otype, op, 1)      \
  instantiate_col_reduce_small(name, itype, otype, op, 2)      \
  instantiate_col_reduce_small(name, itype, otype, op, 5)      \
  instantiate_col_reduce_looped(name, itype, otype, op, 1)     \
  instantiate_col_reduce_looped(name, itype, otype, op, 2)     \
  instantiate_col_reduce_looped(name, itype, otype, op, 5)

// Instantiates `row_reduce_small` for NDIMS value `dim`, once with `int` and
// once with `int64_t` ("_large_" in the name string) as the index type.
#define instantiate_row_reduce_small(name, itype, otype, op, dim)     \
  instantiate_kernel("row_reduce_small_" #dim "_reduce_" #name,       \
                     row_reduce_small,                                \
                     itype, otype, op, int, dim)                      \
  instantiate_kernel("row_reduce_small_large_" #dim "_reduce_" #name, \
                     row_reduce_small,                                \
                     itype, otype, op, int64_t, dim)

// Instantiates `row_reduce_looped` for NDIMS value `dim`, once with `int` and
// once with `int64_t` ("_large_" in the name string) as the index type.
#define instantiate_row_reduce_looped(name, itype, otype, op, dim)       \
  instantiate_kernel("row_reduce_looped_" #dim "_reduce_" #name,         \
                     row_reduce_looped,                                  \
                     itype, otype, op, int, dim)                         \
  instantiate_kernel("row_reduce_looped_large_" #dim "_reduce_" #name,   \
                     row_reduce_looped,                                  \
                     itype, otype, op, int64_t, dim)

// Instantiates every row-reduction kernel variant: the small and looped
// kernels for NDIMS values 1, 2 and 5, plus a single `row_reduce_simple`
// instantiation that takes no index type or NDIMS argument.
#define instantiate_row_reduce_general(name, itype, otype, op) \
  instantiate_row_reduce_small(name, itype, otype, op, 1)      \
  instantiate_row_reduce_small(name, itype, otype, op, 2)      \
  instantiate_row_reduce_small(name, itype, otype, op, 5)      \
  instantiate_row_reduce_looped(name, itype, otype, op, 1)     \
  instantiate_row_reduce_looped(name, itype, otype, op, 2)     \
  instantiate_row_reduce_looped(name, itype, otype, op, 5)     \
  instantiate_kernel("row_reduce_simple_" #name,               \
                     row_reduce_simple,                        \
                     itype, otype, op)

// Instantiates the all / row / column reduction kernels for one
// (input type, output type, op) combination. `name` and `tname` are pasted
// into a single token (e.g. sum + float32 -> sumfloat32) that the nested
// macros stringize into the kernel name, and the op template is specialized
// on the OUTPUT type (`op<otype>`).
#define instantiate_reduce_functions(name, tname, itype, otype, op)    \
  instantiate_all_reduce(name##tname, itype, otype, op<otype>)         \
  instantiate_row_reduce_general(name##tname, itype, otype, op<otype>) \
  instantiate_col_reduce_general(name##tname, itype, otype, op<otype>)

// Logical reductions: inputs of type bool, int16, int32 or int64 are reduced
// to a bool output.
// NOTE(review): no 8-bit or floating point input types are instantiated here;
// presumably the host maps other dtypes onto one of these by element size -
// confirm against the dispatch code.
#define instantiate_and_or(name, op)                           \
  instantiate_reduce_functions(name, bool_, bool, bool, op)    \
  instantiate_reduce_functions(name, int16, int16_t, bool, op) \
  instantiate_reduce_functions(name, int32, int32_t, bool, op) \
  instantiate_reduce_functions(name, int64, int64_t, bool, op)

instantiate_and_or(and, And)
instantiate_and_or(or, Or)

// Sum / product reductions. 8- and 16-bit integer inputs accumulate into a
// 32-bit output type; all other inputs keep their own type as the output.
// NOTE(review): uint8 accumulates into a signed int32_t while uint16
// accumulates into uint32_t - confirm this asymmetry matches the output dtype
// chosen on the host side.
#define instantiate_sum_prod(name, op)                                       \
  instantiate_reduce_functions(name, uint8, uint8_t, int32_t, op)            \
  instantiate_reduce_functions(name, uint16, uint16_t, uint32_t, op)         \
  instantiate_reduce_functions(name, uint32, uint32_t, uint32_t, op)         \
  instantiate_reduce_functions(name, uint64, uint64_t, uint64_t, op)         \
  instantiate_reduce_functions(name, int8, int8_t, int32_t, op)              \
  instantiate_reduce_functions(name, int16, int16_t, int32_t, op)            \
  instantiate_reduce_functions(name, int32, int32_t, int32_t, op)            \
  instantiate_reduce_functions(name, int64, int64_t, int64_t, op)            \
  instantiate_reduce_functions(name, float16, float16_t, float16_t, op)      \
  instantiate_reduce_functions(name, bfloat16, bfloat16_t, bfloat16_t, op)   \
  instantiate_reduce_functions(name, float32, float, float, op)              \
  instantiate_reduce_functions(name, complex64, complex64_t, complex64_t, op)

instantiate_sum_prod(sum, Sum)
instantiate_sum_prod(prod, Prod)

// Min / max reductions. The output type always equals the input type. Unlike
// the init kernels, no bool instantiation is emitted here.
#define instantiate_min_max(name, op)                                        \
  instantiate_reduce_functions(name, int8, int8_t, int8_t, op)               \
  instantiate_reduce_functions(name, int16, int16_t, int16_t, op)            \
  instantiate_reduce_functions(name, int32, int32_t, int32_t, op)            \
  instantiate_reduce_functions(name, int64, int64_t, int64_t, op)            \
  instantiate_reduce_functions(name, uint8, uint8_t, uint8_t, op)            \
  instantiate_reduce_functions(name, uint16, uint16_t, uint16_t, op)         \
  instantiate_reduce_functions(name, uint32, uint32_t, uint32_t, op)         \
  instantiate_reduce_functions(name, uint64, uint64_t, uint64_t, op)         \
  instantiate_reduce_functions(name, float16, float16_t, float16_t, op)      \
  instantiate_reduce_functions(name, bfloat16, bfloat16_t, bfloat16_t, op)   \
  instantiate_reduce_functions(name, float32, float, float, op)              \
  instantiate_reduce_functions(name, complex64, complex64_t, complex64_t, op)

instantiate_min_max(min, Min)
instantiate_min_max(max, Max)
    // clang-format on


================================================
FILE: mlx/backend/metal/kernels/reduce_utils.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/atomic.h"
#include "mlx/backend/metal/kernels/reduction/ops.h"


================================================
FILE: mlx/backend/metal/kernels/reduction/ops.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <metal_atomic>
#include <metal_simdgroup>

// Injects two `simd_reduce` member overloads into the enclosing op struct:
//  - for types smaller than 8 bytes it forwards to the struct's own
//    `simd_reduce_impl`;
//  - for types of exactly 8 bytes it folds the value with the struct's
//    `operator()` and `simd_shuffle_down`, halving the shuffle offset from
//    simd_size / 2 down to 1.
// Types larger than 8 bytes match neither overload.
// NOTE(review): presumably the 8-byte path exists because the built-in simd
// reductions are not available for 64-bit types - confirm against the Metal
// specification.
#define DEFINE_SIMD_REDUCE()                                             \
  template <typename T, metal::enable_if_t<sizeof(T) < 8, bool> = true>  \
  T simd_reduce(T val) {                                                 \
    return simd_reduce_impl(val);                                        \
  }                                                                      \
                                                                         \
  template <typename T, metal::enable_if_t<sizeof(T) == 8, bool> = true> \
  T simd_reduce(T val) {                                                 \
    for (short i = simd_size / 2; i > 0; i /= 2) {                       \
      val = operator()(val, simd_shuffle_down(val, i));                  \
    }                                                                    \
    return val;                                                          \
  }

// Simdgroup width assumed throughout the reduction kernels: it seeds the
// shuffle offsets in DEFINE_SIMD_REDUCE and sizes per-simdgroup scratch
// arrays.
static constant constexpr const uint8_t simd_size = 32;

// Overlays four bools with one unsigned int so that a mask built lane-by-lane
// through `b` can be handed to an integer atomic through `i` (see the packed
// `atomic_update` overloads of And / Or).
union bool4_or_uint {
  bool4 b;
  unsigned int i;
};

// "No reduction" op: its atomic update simply stores `val` into `out` at
// `offset`, overwriting whatever was there.
struct None {
  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_store_explicit(out, val, offset);
  }
};

/**
 * Logical AND reduction over bools. The identity element is `true`.
 */
template <typename U = bool>
struct And {
  static constexpr constant bool init = true;

  DEFINE_SIMD_REDUCE()

  // Simdgroup-wide AND of `val`.
  bool simd_reduce_impl(bool val) {
    return simd_all(val);
  }

  // Binary operator
  bool operator()(bool a, bool b) {
    return a && b;
  }

  // Non atomic update
  void update(device bool* out, bool val) {
    *out = *out & val;
  }

  // Atomic update of one bool inside a packed word of four. Only a false
  // `val` can change the result, so a true `val` is a no-op. Otherwise the
  // word is AND-ed with a mask that is true in every lane except `elem_idx`.
  void atomic_update(
      device mlx_atomic<unsigned int>* out,
      bool val,
      int elem_idx,
      size_t offset = 0) {
    if (val) {
      return;
    }
    bool4_or_uint mask;
    mask.b = {true, true, true, true};
    mask.b[elem_idx] = false;
    mlx_atomic_fetch_and_explicit(out, mask.i, offset);
  }

  // Atomic update of a standalone bool: a false `val` is stored, a true
  // `val` leaves the destination untouched.
  void
  atomic_update(device mlx_atomic<bool>* out, bool val, size_t offset = 0) {
    if (val) {
      return;
    }
    mlx_atomic_store_explicit(out, val, offset);
  }
};

/**
 * Logical OR reduction over bools. The identity element is `false`.
 */
template <typename U = bool>
struct Or {
  static constexpr constant bool init = false;

  DEFINE_SIMD_REDUCE()

  // Simdgroup-wide OR of `val`.
  bool simd_reduce_impl(bool val) {
    return simd_any(val);
  }

  // Binary operator
  bool operator()(bool a, bool b) {
    return a || b;
  }

  // Non atomic update
  void update(device bool* out, bool val) {
    *out = *out | val;
  }

  // Atomic update of one bool inside a packed word of four. Only a true
  // `val` can change the result, so a false `val` is a no-op. Otherwise the
  // word is OR-ed with a mask that is false in every lane except `elem_idx`.
  void atomic_update(
      device mlx_atomic<unsigned int>* out,
      bool val,
      int elem_idx,
      size_t offset = 0) {
    if (!val) {
      return;
    }
    bool4_or_uint mask;
    mask.b = {false, false, false, false};
    mask.b[elem_idx] = true;
    mlx_atomic_fetch_or_explicit(out, mask.i, offset);
  }

  // Atomic update of a standalone bool: a true `val` is stored, a false
  // `val` leaves the destination untouched.
  void
  atomic_update(device mlx_atomic<bool>* out, bool val, size_t offset = 0) {
    if (!val) {
      return;
    }
    mlx_atomic_store_explicit(out, val, offset);
  }
};

/**
 * Additive reduction with accumulator type U. The identity element is U(0).
 */
template <typename U>
struct Sum {
  static constexpr constant U init = U(0);

  DEFINE_SIMD_REDUCE()

  // Simdgroup-wide sum, used by `simd_reduce` for types below 8 bytes.
  template <typename T>
  T simd_reduce_impl(T val) {
    return simd_sum(val);
  }

  // Binary operator
  U operator()(U a, U b) {
    return a + b;
  }

  // Atomically adds `val` to `out` at `offset`.
  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_add_explicit(out, val, offset);
  }
};

/**
 * Multiplicative reduction with accumulator type U. The identity element is
 * U(1).
 */
template <typename U>
struct Prod {
  static constexpr constant U init = U(1);

  DEFINE_SIMD_REDUCE()

  // Simdgroup-wide product, used by `simd_reduce` for types below 8 bytes.
  template <typename T>
  T simd_reduce_impl(T val) {
    return simd_product(val);
  }

  // Binary operator
  U operator()(U a, U b) {
    return a * b;
  }

  // Atomically multiplies `out` at `offset` by `val`.
  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_mul_explicit(out, val, offset);
  }
};

/**
 * Minimum reduction with accumulator type U. The identity element is
 * Limits<U>::max.
 *
 * NaN handling: for non-integral types any NaN operand yields NaN. For
 * complex64_t the real and imaginary parts are checked independently and a
 * NaN in either part is propagated to that part of the result.
 */
template <typename U>
struct Min {
  DEFINE_SIMD_REDUCE()

  // Integral types: plain simdgroup minimum.
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
    return simd_min(val);
  }

  // Non-integral types: if any lane holds a NaN (val != val) the result is
  // NaN, otherwise the simdgroup minimum.
  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
    if (simd_any(val != val)) {
      return static_cast<T>(NAN);
    }
    return simd_min(val);
  }

  static constexpr constant U init = Limits<U>::max;

  // Atomically replaces `out` at `offset` with the minimum of itself and
  // `val`.
  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_min_explicit(out, val, offset);
  }

  // Operator
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T a, T b) {
    return a < b ? a : b;
  }

  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T a, T b) {
    if (metal::isnan(a) || metal::isnan(b)) {
      return static_cast<T>(NAN);
    } else {
      return a < b ? a : b;
    }
  }

  // complex64_t: when neither part is NaN the complex `<` comparison decides.
  // When only one part is NaN that part becomes NaN and the other part is the
  // minimum of the two operands' corresponding parts. When both are NaN the
  // result is (NaN, NaN).
  template <>
  complex64_t operator()(complex64_t a, complex64_t b) {
    bool real_is_nan = metal::isnan(a.real) || metal::isnan(b.real);
    bool imag_is_nan = metal::isnan(a.imag) || metal::isnan(b.imag);

    if (!real_is_nan && !imag_is_nan) {
      return a < b ? a : b;
    } else if (real_is_nan && !imag_is_nan) {
      return complex64_t(
          static_cast<float>(NAN), a.imag < b.imag ? a.imag : b.imag);
    } else if (!real_is_nan && imag_is_nan) {
      return complex64_t(
          a.real < b.real ? a.real : b.real, static_cast<float>(NAN));
    } else {
      return complex64_t(static_cast<float>(NAN), static_cast<float>(NAN));
    }
  }
};
/**
 * Maximum reduction with accumulator type U. The identity element is
 * Limits<U>::min.
 *
 * NaN handling: for non-integral types any NaN operand yields NaN. For
 * complex64_t the real and imaginary parts are checked independently and a
 * NaN in either part is propagated to that part of the result.
 */
template <typename U>
struct Max {
  static constexpr constant U init = Limits<U>::min;

  DEFINE_SIMD_REDUCE()

  // Integral types: plain simdgroup maximum.
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
    return simd_max(val);
  }

  // Non-integral types: NaN if any lane holds a NaN, otherwise the simdgroup
  // maximum.
  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> simd_reduce_impl(T val) {
    const bool any_nan = simd_any(val != val);
    if (any_nan) {
      return static_cast<T>(NAN);
    } else {
      return simd_max(val);
    }
  }

  // Atomically replaces `out` at `offset` with the maximum of itself and
  // `val`.
  template <typename T>
  void atomic_update(device mlx_atomic<T>* out, T val, size_t offset = 0) {
    mlx_atomic_fetch_max_explicit(out, val, offset);
  }

  // Operator
  template <typename T>
  metal::enable_if_t<metal::is_integral_v<T>, T> operator()(T a, T b) {
    if (a > b) {
      return a;
    }
    return b;
  }

  template <typename T>
  metal::enable_if_t<!metal::is_integral_v<T>, T> operator()(T a, T b) {
    if (metal::isnan(a) || metal::isnan(b)) {
      return static_cast<T>(NAN);
    }
    return a > b ? a : b;
  }

  // complex64_t: each part is checked for NaN separately. A part containing a
  // NaN becomes NaN; a clean part next to a NaN part is the larger of the two
  // operands' corresponding parts. With no NaN at all the complex `>`
  // comparison decides.
  template <>
  complex64_t operator()(complex64_t a, complex64_t b) {
    const bool real_is_nan = metal::isnan(a.real) || metal::isnan(b.real);
    const bool imag_is_nan = metal::isnan(a.imag) || metal::isnan(b.imag);

    if (real_is_nan && imag_is_nan) {
      return complex64_t(static_cast<float>(NAN), static_cast<float>(NAN));
    }
    if (real_is_nan) {
      return complex64_t(
          static_cast<float>(NAN), a.imag > b.imag ? a.imag : b.imag);
    }
    if (imag_is_nan) {
      return complex64_t(
          a.real > b.real ? a.real : b.real, static_cast<float>(NAN));
    }
    return a > b ? a : b;
  }
};


================================================
FILE: mlx/backend/metal/kernels/reduction/reduce_all.h
================================================
// Copyright © 2023-2024 Apple Inc.

/**
 * Reduces one contiguous slice of `in` per threadgroup and writes a single
 * value per threadgroup to `out[gid.y]`.
 *
 * The slice of threadgroup gid.y starts at gid.y * row_size and is at most
 * row_size elements long; it is clipped so it never runs past in_size. Every
 * thread reads N_READS consecutive elements at a time, and the threadgroup
 * advances by lsize.x * N_READS elements per sweep.
 */
template <
    typename T,
    typename U,
    typename Op,
    typename IdxT = int64_t,
    int N_READS = REDUCE_N_READS>
[[kernel]] void all_reduce(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& in_size [[buffer(2)]],
    const constant size_t& row_size [[buffer(3)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]],
    uint simd_per_group [[simdgroups_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;
  threadgroup U shared_vals[simd_size];

  // Slice owned by this threadgroup; the last slice may be shorter.
  IdxT row_begin = gid.y * IdxT(row_size);
  IdxT row_len =
      (row_begin + row_size <= in_size) ? row_size : in_size - row_begin;

  // Number of full sweeps plus this thread's share of the leftover elements.
  IdxT n_sweeps = row_len / (lsize.x * N_READS);
  int tail = row_len - n_sweeps * (lsize.x * N_READS);
  tail -= lid.x * N_READS;

  // A thread whose leftover share is a complete N_READS chunk handles it as
  // one more full sweep.
  if (tail >= N_READS) {
    n_sweeps++;
    tail = 0;
  }

  // Position the input pointer on this thread's first element.
  row_begin += lid.x * N_READS;
  in += row_begin;

  // Per-thread accumulation.
  U acc = Op::init;
  for (IdxT s = 0; s < n_sweeps; s++) {
    for (int k = 0; k < N_READS; k++) {
      acc = op(static_cast<U>(in[k]), acc);
    }
    in += lsize.x * N_READS;
  }
  for (int k = 0; k < tail; k++) {
    acc = op(static_cast<U>(in[k]), acc);
  }

  // Reduction within simd group
  acc = op.simd_reduce(acc);
  if (simd_per_group > 1) {
    if (simd_lane_id == 0) {
      shared_vals[simd_group_id] = acc;
    }

    // Reduction within thread group
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (lid.x < simd_per_group) {
      acc = shared_vals[lid.x];
    } else {
      acc = op.init;
    }
    acc = op.simd_reduce(acc);
  }

  if (lid.x == 0) {
    out[gid.y] = acc;
  }
}


================================================
FILE: mlx/backend/metal/kernels/reduction/reduce_col.h
================================================
// Copyright © 2023-2024 Apple Inc.

/**
 * Column reduction in which every thread accumulates `n_reads` (4) adjacent
 * columns while the threads along lid.y split the rows between them.
 *
 *  - `reduction_stride` bounds the column index and is the pitch between
 *    consecutive outputs in `out`.
 *  - `reduction_size * non_col_reductions` is the number of rows accumulated
 *    into each output.
 *  - `shape` / `strides` / `ndim` map the output index to an input offset via
 *    `elem_to_loc`.
 *  - `reduce_shape` / `reduce_strides` / `reduce_ndim` drive the
 *    `LoopedElemToLoc` iterator that yields the offset of each reduced row.
 */
template <typename T, typename U, typename Op, typename IdxT, int NDIMS>
[[kernel]] void col_reduce_small(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant int64_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant int64_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant int64_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]]) {
  constexpr int n_reads = 4;
  Op op;
  LoopedElemToLoc<NDIMS, IdxT, (NDIMS > 2)> loop(reduce_ndim);
  const device T* row;

  // One running total per column handled by this thread.
  U totals[n_reads];
  for (int i = 0; i < n_reads; i++) {
    totals[i] = Op::init;
  }

  // First column handled by this thread. Threads entirely past the last
  // column have nothing to do.
  // NOTE(review): these threads leave before the threadgroup_barrier further
  // down; confirm that this is acceptable when lsize.y > 1.
  IdxT column = IdxT(gid.x) * lsize.x * n_reads + lid.x * n_reads;
  if (column >= reduction_stride) {
    return;
  }
  // `safe` means all n_reads columns of this thread are in range.
  bool safe = column + n_reads <= reduction_stride;

  // gid.y / gid.z select the output; move `in` to its first element.
  IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);
  IdxT in_idx = elem_to_loc<IdxT>(out_idx, shape, strides, ndim);
  in += in_idx + column;

  // Rows are interleaved across lid.y: thread row lid.y takes rows
  // lid.y, lid.y + lsize.y, ...
  IdxT total_rows = IdxT(non_col_reductions) * IdxT(reduction_size);
  loop.next(lid.y, reduce_shape, reduce_strides);
  for (IdxT r = lid.y; r < total_rows; r += lsize.y) {
    row = in + loop.location();
    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(static_cast<U>(row[i]), totals[i]);
      }
    } else {
      // Out-of-range columns contribute the identity element.
      U vals[n_reads];
      for (int i = 0; i < n_reads; i++) {
        vals[i] =
            (column + i < reduction_stride) ? static_cast<U>(row[i]) : op.init;
      }
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(vals[i], totals[i]);
      }
    }
    loop.next(lsize.y, reduce_shape, reduce_strides);
  }

  // Combine the partial totals of the lsize.y thread rows through threadgroup
  // memory; the threads with lid.y == 0 do the accumulation.
  if (lsize.y > 1) {
    // lsize.y should be <= 8
    // (the scratch array below holds 32 x 8 threads worth of n_reads values)
    threadgroup U shared_vals[32 * 8 * n_reads];
    for (int i = 0; i < n_reads; i++) {
      shared_vals[lid.y * lsize.x * n_reads + lid.x * n_reads + i] = totals[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (lid.y == 0) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = shared_vals[lid.x * n_reads + i];
      }
      for (uint j = 1; j < lsize.y; j++) {
        for (int i = 0; i < n_reads; i++) {
          totals[i] =
              op(shared_vals[j * lsize.x * n_reads + lid.x * n_reads + i],
                 totals[i]);
        }
      }
    }
  }

  // Write the in-range columns of this thread to the output.
  if (lid.y == 0) {
    out += out_idx * IdxT(reduction_stride) + column;
    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        out[i] = totals[i];
      }
    } else {
      for (int i = 0; column + i < reduction_stride; i++) {
        out[i] = totals[i];
      }
    }
  }
}

/**
 * Column reduction for long columns: each thread owns the single column
 * lid.x, and the rows are split across lid.y AND across threadgroups along
 * gid.z.
 *
 * Every gid.z slice writes its own partial result, offset by
 * gid.z * out_size in `out`.
 * NOTE(review): presumably a later pass reduces these gid.z partials -
 * confirm against the host dispatch code.
 * NOTE(review): lid.x is not checked against `reduction_stride`; presumably
 * the host launches with lsize.x equal to the number of columns - confirm.
 */
template <typename T, typename U, typename Op, typename IdxT, int NDIMS>
[[kernel]] void col_reduce_longcolumn(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant size_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant int64_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant int64_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    const constant size_t& out_size [[buffer(11)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]]) {
  Op op;
  LoopedElemToLoc<NDIMS, IdxT, (NDIMS > 2)> loop(reduce_ndim);
  const device T* row;

  // gid.x / gid.y select the output; lid.x selects the column within it.
  IdxT out_idx = gid.x + gsize.x * IdxT(gid.y);
  IdxT in_idx = elem_to_loc<IdxT>(out_idx, shape, strides, ndim);
  in += in_idx + lid.x;

  // Rows are interleaved over all lsize.y * gsize.z thread rows of the grid.
  U total = Op::init;
  IdxT total_rows = IdxT(non_col_reductions) * IdxT(reduction_size);
  loop.next(gid.z * lsize.y + lid.y, reduce_shape, reduce_strides);
  for (IdxT r = gid.z * lsize.y + lid.y; r < total_rows;
       r += lsize.y * gsize.z) {
    row = in + loop.location();
    total = op(static_cast<U>(*row), total);
    loop.next(lsize.y * gsize.z, reduce_shape, reduce_strides);
  }

  // Combine the lsize.y partials of each column through threadgroup memory
  // (scratch sized for at most 32 x 32 threads); lid.y == 0 accumulates and
  // writes the partial result of this gid.z slice.
  threadgroup U shared_vals[32 * 32];
  shared_vals[lid.y * lsize.x + lid.x] = total;
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (lid.y == 0) {
    for (uint i = 1; i < lsize.y; i++) {
      total = op(total, shared_vals[i * lsize.x + lid.x]);
    }
    out[gid.z * IdxT(out_size) + out_idx * IdxT(reduction_stride) + lid.x] =
        total;
  }
}

/**
 * Our approach is the following simple looped approach:
 *  1. Each thread keeps running totals for BN / n_simdgroups outputs.
 *  2. Load a tile BM, BN in registers and accumulate in the running totals
 *  3. Move ahead by BM steps until the column axis and the non column
 *     reductions are exhausted.
 *  4. If BM == 32 then transpose in SM and simd reduce the running totals.
 *     Otherwise write in shared memory and BN threads accumulate the running
 *     totals with a loop.
 *  5. Write them to the output
 *
 * The threadgroup is treated as n_simdgroups (8) simdgroups of simd_size
 * threads, i.e. tgp_size threads that together cover one BM x BN tile with
 * n_reads = BM * BN / tgp_size elements per thread.
 */
template <
    typename T,
    typename U,
    typename Op,
    typename IdxT,
    int NDIMS,
    int BM,
    int BN>
[[kernel]] void col_reduce_looped(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant int64_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant int64_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant int64_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;
  constexpr int n_simdgroups = 8;
  constexpr short tgp_size = n_simdgroups * simd_size;
  constexpr short n_reads = (BM * BN) / tgp_size;
  constexpr short n_read_blocks = BN / n_reads;

  threadgroup U shared_vals[BN * BM];
  U totals[n_reads];
  LoopedElemToLoc<NDIMS, IdxT, (NDIMS > 2)> loop(reduce_ndim);
  const device T* row;

  for (int i = 0; i < n_reads; i++) {
    totals[i] = Op::init;
  }

  // Linear thread index and its position in the tile: offset.x is the first
  // of n_reads columns, offset.y is the row inside the tile.
  short lid = simd_group_id * simd_size + simd_lane_id;
  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);
  IdxT column = BN * gid.x + offset.x;
  // `safe` means all n_reads columns of this thread are in range.
  bool safe = column + n_reads <= reduction_stride;

  // gid.y / gid.z select the output; move `in` to its first element.
  IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);
  IdxT in_idx = elem_to_loc<IdxT>(out_idx, shape, strides, ndim);
  in += in_idx + column;

  // Walk the rows in steps of BM starting at this thread's tile row.
  IdxT total = IdxT(non_col_reductions) * IdxT(reduction_size);
  loop.next(offset.y, reduce_shape, reduce_strides);
  for (IdxT r = offset.y; r < total; r += BM) {
    row = in + loop.location();

    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(static_cast<U>(row[i]), totals[i]);
      }
    } else {
      // Out-of-range columns contribute the identity element.
      U vals[n_reads];
      for (int i = 0; i < n_reads; i++) {
        vals[i] =
            (column + i < reduction_stride) ? static_cast<U>(row[i]) : op.init;
      }
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(vals[i], totals[i]);
      }
    }

    loop.next(BM, reduce_shape, reduce_strides);
  }

  // We can use a simd reduction to accumulate across BM so each thread writes
  // the partial output to SM and then each simdgroup does BN / n_simdgroups
  // accumulations.
  if (BM == 32) {
    constexpr int n_outputs = BN / n_simdgroups;
    static_assert(
        BM != 32 || n_outputs == n_reads,
        "The tile should be selected such that n_outputs == n_reads");
    for (int i = 0; i < n_reads; i++) {
      shared_vals[offset.y * BN + offset.x + i] = totals[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Re-read transposed: each lane picks one tile row (simd_lane_id) and the
    // simdgroup reduces across rows for its n_outputs columns.
    short2 out_offset(simd_group_id * n_outputs, simd_lane_id);
    for (int i = 0; i < n_outputs; i++) {
      totals[i] =
          op.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);
    }

    // Write the output.
    if (simd_lane_id == 0) {
      IdxT out_column = BN * gid.x + out_offset.x;
      out += out_idx * IdxT(reduction_stride) + out_column;
      if (out_column + n_outputs <= reduction_stride) {
        for (int i = 0; i < n_outputs; i++) {
          out[i] = totals[i];
        }
      } else {
        for (int i = 0; out_column + i < reduction_stride; i++) {
          out[i] = totals[i];
        }
      }
    }
  }

  // Each thread holds n_reads partial results. We write them all out to shared
  // memory and threads with offset.y == 0 aggregate the columns and write the
  // outputs.
  else {
    short x_block = offset.x / n_reads;
    for (int i = 0; i < n_reads; i++) {
      shared_vals[x_block * BM * n_reads + i * BM + offset.y] = totals[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (offset.y == 0) {
      // j starts at 1 because totals[i] already holds this thread's own
      // (row 0) contribution.
      for (int i = 0; i < n_reads; i++) {
        for (int j = 1; j < BM; j++) {
          totals[i] =
              op(shared_vals[x_block * BM * n_reads + i * BM + j], totals[i]);
        }
      }
    }

    // Write the output.
    if (offset.y == 0) {
      out += out_idx * IdxT(reduction_stride) + column;
      if (safe) {
        for (int i = 0; i < n_reads; i++) {
          out[i] = totals[i];
        }
      } else {
        for (int i = 0; column + i < reduction_stride; i++) {
          out[i] = totals[i];
        }
      }
    }
  }
}

/**
 * First pass of a two-pass column reduction. It has the same tile layout as
 * `col_reduce_looped` with BM == 32, but the rows of every output are
 * additionally split into `outer_blocks` (32) interleaved blocks.
 *
 * The threadgroup index along y/z (`full_idx`) encodes both the output
 * (`full_idx % out_size`) and the row block (`full_idx / out_size`). Each
 * threadgroup writes the partial result of its row block at
 * full_idx * reduction_stride in `out`.
 * NOTE(review): presumably a second kernel launch reduces the per-block
 * partials into the final output - confirm against the host dispatch code.
 */
template <
    typename T,
    typename U,
    typename Op,
    typename IdxT,
    int NDIMS,
    int BM,
    int BN>
[[kernel]] void col_reduce_2pass(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant int64_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant int64_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant int64_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    const constant size_t& out_size [[buffer(11)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;
  constexpr int n_simdgroups = 8;
  constexpr short tgp_size = n_simdgroups * simd_size;
  constexpr short n_reads = (BM * BN) / tgp_size;
  constexpr short n_read_blocks = BN / n_reads;
  constexpr int n_outputs = BN / n_simdgroups;
  constexpr short outer_blocks = 32;
  static_assert(BM == 32, "BM should be equal to 32");

  threadgroup U shared_vals[BN * BM];
  U totals[n_reads];
  LoopedElemToLoc<NDIMS, IdxT, (NDIMS > 2)> loop(reduce_ndim);
  const device T* row;

  for (int i = 0; i < n_reads; i++) {
    totals[i] = Op::init;
  }

  // Linear thread index and its position in the tile: offset.x is the first
  // of n_reads columns, offset.y is the row inside the tile.
  short lid = simd_group_id * simd_size + simd_lane_id;
  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);
  IdxT column = BN * gid.x + offset.x;
  // `safe` means all n_reads columns of this thread are in range.
  bool safe = column + n_reads <= reduction_stride;

  // Split the threadgroup index into the row block and the output it
  // contributes to.
  IdxT full_idx = gid.y + gsize.y * IdxT(gid.z);
  IdxT block_idx = full_idx / IdxT(out_size);
  IdxT out_idx = full_idx % IdxT(out_size);
  IdxT in_idx = elem_to_loc<IdxT>(out_idx, shape, strides, ndim);
  in += in_idx + column;

  // Row block `block_idx` starts at row block_idx * BM and skips ahead by
  // outer_blocks * BM rows per iteration.
  IdxT total = IdxT(non_col_reductions) * IdxT(reduction_size);
  loop.next(offset.y + block_idx * BM, reduce_shape, reduce_strides);
  for (IdxT r = offset.y + block_idx * BM; r < total; r += outer_blocks * BM) {
    row = in + loop.location();

    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(static_cast<U>(row[i]), totals[i]);
      }
    } else {
      // Out-of-range columns contribute the identity element.
      U vals[n_reads];
      for (int i = 0; i < n_reads; i++) {
        vals[i] =
            (column + i < reduction_stride) ? static_cast<U>(row[i]) : op.init;
      }
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(vals[i], totals[i]);
      }
    }

    loop.next(outer_blocks * BM, reduce_shape, reduce_strides);
  }

  // We can use a simd reduction to accumulate across BM so each thread writes
  // the partial output to SM and then each simdgroup does BN / n_simdgroups
  // accumulations.
  for (int i = 0; i < n_reads; i++) {
    shared_vals[offset.y * BN + offset.x + i] = totals[i];
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Re-read transposed: each lane picks one tile row (simd_lane_id) and the
  // simdgroup reduces across rows for its n_outputs columns.
  short2 out_offset(simd_group_id * n_outputs, simd_lane_id);
  for (int i = 0; i < n_outputs; i++) {
    totals[i] =
        op.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);
  }

  // Write the output.
  if (simd_lane_id == 0) {
    IdxT out_column = BN * gid.x + out_offset.x;
    out += full_idx * IdxT(reduction_stride) + out_column;
    if (out_column + n_outputs <= reduction_stride) {
      for (int i = 0; i < n_outputs; i++) {
        out[i] = totals[i];
      }
    } else {
      for (int i = 0; out_column + i < reduction_stride; i++) {
        out[i] = totals[i];
      }
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/reduction/reduce_init.h
================================================
// Copyright © 2023-2024 Apple Inc.

/**
 * Initialize a reduction output buffer: every thread stores the op's `init`
 * value at its own position in the grid.
 */
template <typename T, typename Op>
[[kernel]] void init_reduce(
    device T* out [[buffer(0)]],
    uint tid [[thread_position_in_grid]]) {
  device T& slot = out[tid];
  slot = Op::init;
}


================================================
FILE: mlx/backend/metal/kernels/reduction/reduce_row.h
================================================
// Copyright © 2023-2024 Apple Inc.

// Row reduction utilities
// - `per_thread_row_reduce` collaborative partial reduction in the threadgroup
// - `threadgroup_reduce` collaborative reduction in the threadgroup such that
//   lid.x == 0 holds the reduced value
// - `thread_reduce` simple loop and reduce the row

/**
 * Bounds-checked partial row reduction performed cooperatively by the threads
 * of a threadgroup. On return `totals[w]` holds this thread's partial result
 * for the row addressed by `inputs[w]`.
 *
 * `blocks` full steps are consumed first (N_READS elements per thread per
 * step, pointers advanced by `lsize_x * N_READS` after each one); the
 * remaining `extra` elements are then handled with a bounds check. The
 * `inputs` pointers are modified in place.
 */
template <
    typename T,
    typename U,
    typename Op,
    int N_READS = REDUCE_N_READS,
    int N_WRITES = REDUCE_N_WRITES>
METAL_FUNC void per_thread_row_reduce(
    thread U totals[N_WRITES],
    const device T* inputs[N_WRITES],
    int blocks,
    int extra,
    uint lsize_x,
    uint lid_x) {
  Op op;

  // Every accumulator starts from the op's initial value
  for (int w = 0; w < N_WRITES; w++) {
    totals[w] = Op::init;
  }

  // Full steps: fold N_READS elements of each row, then move each row pointer
  // past the elements consumed by the whole threadgroup
  for (int b = 0; b < blocks; b++) {
    for (int w = 0; w < N_WRITES; w++) {
      for (int k = 0; k < N_READS; k++) {
        totals[w] = op(static_cast<U>(inputs[w][k]), totals[w]);
      }

      inputs[w] += lsize_x * N_READS;
    }
  }

  // Trailing elements: this thread's slice starts at `first` within the
  // remaining `extra` elements
  int first = lid_x * N_READS;
  if (first + N_READS > extra) {
    // The slice crosses the end of the row, stop at `extra`
    for (int w = 0; w < N_WRITES; w++) {
      for (int k = 0; first + k < extra; k++) {
        totals[w] = op(static_cast<U>(inputs[w][k]), totals[w]);
      }
    }
  } else {
    // The whole slice is in bounds
    for (int w = 0; w < N_WRITES; w++) {
      for (int k = 0; k < N_READS; k++) {
        totals[w] = op(static_cast<U>(inputs[w][k]), totals[w]);
      }
    }
  }
}

/**
 * Consecutive rows in a contiguous array.
 *
 * Builds one input pointer per output row and forwards to the pointer-array
 * overload. Row `i` starts `i * reduction_size` elements after `in`, and this
 * thread begins `lid_x * N_READS` elements into each row.
 *
 * totals          per-thread accumulators, one per row (N_WRITES entries)
 * in              pointer to the first of the N_WRITES rows
 * reduction_size  distance in elements between consecutive rows
 * blocks, extra, lsize_x, lid_x  forwarded unchanged to the pointer-array
 *                                overload
 */
template <
    typename T,
    typename U,
    typename Op,
    int N_READS = REDUCE_N_READS,
    int N_WRITES = REDUCE_N_WRITES>
METAL_FUNC void per_thread_row_reduce(
    thread U totals[N_WRITES],
    const device T* in,
    const constant size_t& reduction_size,
    int blocks,
    int extra,
    uint lsize_x,
    uint lid_x) {
  // Set up the input pointers. There is one pointer per output row, so the
  // loop is bounded by N_WRITES (the size of `inputs`), not by N_READS:
  // iterating to N_READS would overrun the array when N_READS > N_WRITES and
  // leave pointers unset when N_READS < N_WRITES.
  const device T* inputs[N_WRITES];
  inputs[0] = in + lid_x * N_READS;
  for (int i = 1; i < N_WRITES; i++) {
    inputs[i] = inputs[i - 1] + reduction_size;
  }

  per_thread_row_reduce<T, U, Op, N_READS, N_WRITES>(
      totals, inputs, blocks, extra, lsize_x, lid_x);
}

/**
 * Consecutive rows in an arbitrarily ordered array.
 *
 * Builds one input pointer per output row and forwards to the pointer-array
 * overload. The start of row `row_idx + i` is located with `elem_to_loc`
 * using `shape`, `strides` and `ndim`; this thread begins `lid_x * N_READS`
 * elements into each row.
 *
 * totals   per-thread accumulators, one per row (N_WRITES entries)
 * in       base pointer of the input array
 * row_idx  index of the first row handled by this call
 * blocks, extra, lsize_x, lid_x  forwarded unchanged to the pointer-array
 *                                overload
 */
template <
    typename T,
    typename U,
    typename Op,
    int N_READS = REDUCE_N_READS,
    int N_WRITES = REDUCE_N_WRITES>
METAL_FUNC void per_thread_row_reduce(
    thread U totals[N_WRITES],
    const device T* in,
    const int64_t row_idx,
    int blocks,
    int extra,
    const constant int* shape,
    const constant int64_t* strides,
    const constant int& ndim,
    uint lsize_x,
    uint lid_x) {
  // Set up the input pointers. There is one pointer per output row, so the
  // loop is bounded by N_WRITES (the size of `inputs`), not by N_READS:
  // iterating to N_READS would overrun the array when N_READS > N_WRITES and
  // leave pointers unset when N_READS < N_WRITES.
  const device T* inputs[N_WRITES];
  in += lid_x * N_READS;
  for (int i = 0; i < N_WRITES; i++) {
    inputs[i] = in + elem_to_loc(row_idx + i, shape, strides, ndim);
  }

  per_thread_row_reduce<T, U, Op, N_READS, N_WRITES>(
      totals, inputs, blocks, extra, lsize_x, lid_x);
}

/**
 * Combine the per-thread partial results of a whole threadgroup.
 *
 * Each of the N_WRITES accumulators is first reduced inside its simdgroup.
 * When the threadgroup has more than one simdgroup, lane 0 of every simdgroup
 * publishes its values to `shared_vals` (N_WRITES slots per simdgroup) and a
 * second simd reduction combines them; threads whose `lid.x` is not below
 * `simd_per_group` contribute the op's `init` value to that second pass.
 */
template <
    typename T,
    typename U,
    typename Op,
    int N_READS = REDUCE_N_READS,
    int N_WRITES = REDUCE_N_WRITES>
METAL_FUNC void threadgroup_reduce(
    thread U totals[N_WRITES],
    threadgroup U* shared_vals,
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_per_group [[simdgroups_per_threadgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;

  // First pass: within each simdgroup
  for (int w = 0; w < N_WRITES; w++) {
    totals[w] = op.simd_reduce(totals[w]);
  }

  // A single simdgroup has nothing left to combine. `simd_per_group` is the
  // same for every thread of the threadgroup, so all threads leave together
  // and none is left waiting at the barrier below.
  if (simd_per_group <= 1) {
    return;
  }

  // Second pass: publish one set of values per simdgroup ...
  if (simd_lane_id == 0) {
    threadgroup U* slot = shared_vals + simd_group_id * N_WRITES;
    for (int w = 0; w < N_WRITES; w++) {
      slot[w] = totals[w];
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // ... then pick them up, one simdgroup's values per thread
  U values[N_WRITES];
  for (int w = 0; w < N_WRITES; w++) {
    if (lid.x < simd_per_group) {
      values[w] = shared_vals[lid.x * N_WRITES + w];
    } else {
      values[w] = op.init;
    }
  }

  for (int w = 0; w < N_WRITES; w++) {
    totals[w] = op.simd_reduce(values[w]);
  }
}

/**
 * Fold one row into `total` from a single thread.
 *
 * The row is consumed as `blocks` groups of N_READS elements followed by
 * `extra` trailing elements. Within a group all values are loaded before any
 * of them is combined.
 */
template <typename T, typename U, typename Op, int N_READS = REDUCE_N_READS>
METAL_FUNC void
thread_reduce(thread U& total, const device T* row, int blocks, int extra) {
  Op op;

  // Full groups of N_READS elements
  const device T* cursor = row;
  for (int b = 0; b < blocks; b++, cursor += N_READS) {
    U loaded[N_READS];
    for (int k = 0; k < N_READS; k++) {
      loaded[k] = cursor[k];
    }
    for (int k = 0; k < N_READS; k++) {
      total = op(loaded[k], total);
    }
  }

  // Remaining elements, one at a time
  for (int k = 0; k < extra; k++) {
    total = op(cursor[k], total);
  }
}

// Reduction kernels
// - `row_reduce_small`: depending on the number of non-row reductions and the
//   row size, either a single thread loops over everything or a simdgroup
//   collaboratively reduces the non-row reductions. In the first case one
//   thread is responsible for one output; in the second case one simdgroup is
//   responsible for one output.
// - `row_reduce_simple`: simple contiguous row reduction.
// - `row_reduce_looped`: loops over and reduces each row for each non-row
//   reduction. One threadgroup is responsible for one output.

/**
 * Row reduction for small problems.
 *
 * Two strategies are selected from `row_size` and `non_row_reductions`:
 *  - one thread per output: the thread walks every non-row reduction itself;
 *  - one simdgroup per output: each lane handles every `simd_size`-th row and
 *    the lanes are combined with a simd reduction, lane 0 writing the result.
 */
template <
    typename T,
    typename U,
    typename Op,
    typename IdxT,
    int NDIMS,
    int N_READS = REDUCE_N_READS>
[[kernel]] void row_reduce_small(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant int64_t& row_size [[buffer(2)]],
    const constant int64_t& non_row_reductions [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant int64_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant int64_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 tid [[thread_position_in_grid]],
    uint3 tsize [[threads_per_grid]]) {
  Op op;

  U acc = Op::init;
  LoopedElemToLoc<NDIMS, IdxT, (NDIMS > 2)> loop(reduce_ndim);

  // Split every row into N_READS-wide groups plus a remainder
  int blocks = IdxT(row_size) / N_READS;
  int extra = IdxT(row_size) % N_READS;

  // The choice depends only on kernel arguments, so every thread takes the
  // same path
  const bool one_thread_per_output =
      (non_row_reductions < 32 && row_size <= 8) || non_row_reductions <= 8;

  if (one_thread_per_output) {
    // The thread loops over all non-row reductions and reduces each row
    IdxT out_idx = tid.x + tsize.x * IdxT(tid.y);
    in += elem_to_loc<IdxT>(out_idx, shape, strides, ndim);

    for (uint r = 0; r < non_row_reductions; r++) {
      thread_reduce<T, U, Op, N_READS>(
          acc, in + loop.location(), blocks, extra);
      loop.next(reduce_shape, reduce_strides);
    }

    out[out_idx] = acc;
    return;
  }

  // Simdgroup path: lane `l` starts at row `l` and strides by `simd_size`,
  // then the lanes are combined
  IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);
  in += elem_to_loc<IdxT>(out_idx, shape, strides, ndim);

  loop.next(simd_lane_id, reduce_shape, reduce_strides);

  for (uint r = simd_lane_id; r < non_row_reductions; r += simd_size) {
    thread_reduce<T, U, Op, N_READS>(acc, in + loop.location(), blocks, extra);
    loop.next(simd_size, reduce_shape, reduce_strides);
  }

  acc = op.simd_reduce(acc);

  if (simd_lane_id == 0) {
    out[out_idx] = acc;
  }
}

/**
 * Contiguous row reduction: each threadgroup reduces N_WRITES consecutive
 * rows of `reduction_size` elements and writes N_WRITES outputs.
 *
 * When the last group of rows would run past `out_size`, its start is pulled
 * back to `out_size - N_WRITES` so that it still covers N_WRITES rows.
 */
template <
    typename T,
    typename U,
    typename Op,
    typename IdxT = int64_t,
    int N_READS = REDUCE_N_READS,
    int N_WRITES = REDUCE_N_WRITES>
[[kernel]] void row_reduce_simple(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant int64_t& out_size [[buffer(3)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_per_group [[simdgroups_per_threadgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  threadgroup U shared_vals[simd_size * N_WRITES];
  U totals[N_WRITES];

  // Index of the first row owned by this threadgroup, clamped at the end
  IdxT first_row = N_WRITES * (gid.y + gsize.y * IdxT(gid.z));
  if (first_row + N_WRITES > out_size) {
    first_row = out_size - N_WRITES;
  }
  in += first_row * IdxT(reduction_size);
  out += first_row;

  // Number of elements the whole threadgroup consumes per step, and how the
  // row splits into full steps plus a remainder
  const uint per_step = lsize.x * N_READS;
  int blocks = IdxT(reduction_size) / per_step;
  int extra = reduction_size - blocks * per_step;

  // Per-thread partial reduction across the rows
  per_thread_row_reduce<T, U, Op, N_READS, N_WRITES>(
      totals, in, reduction_size, blocks, extra, lsize.x, lid.x);

  // Combine the partials of all threads in the threadgroup
  threadgroup_reduce<T, U, Op, N_READS, N_WRITES>(
      totals, shared_vals, lid, simd_lane_id, simd_per_group, simd_group_id);

  // Only the first thread stores the results
  if (lid.x != 0) {
    return;
  }
  for (int w = 0; w < N_WRITES; w++) {
    out[w] = totals[w];
  }
}

/**
 * General row reduction: one threadgroup per output. The threadgroup walks
 * every non-row reduction, cooperatively reduces the corresponding row, and
 * accumulates the per-row results before a final threadgroup reduction.
 */
template <
    typename T,
    typename U,
    typename Op,
    typename IdxT,
    int NDIMS,
    int N_READS = REDUCE_N_READS>
[[kernel]] void row_reduce_looped(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant int64_t& row_size [[buffer(2)]],
    const constant int64_t& non_row_reductions [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant int64_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant int64_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_per_group [[simdgroups_per_threadgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;
  threadgroup U shared_vals[simd_size];
  U acc = Op::init;

  IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);

  // The per-thread offset `lid.x * N_READS` is applied here rather than inside
  // per_thread_row_reduce, which bends that helper's interface a bit and may
  // deserve a small refactor.
  in += elem_to_loc<IdxT>(out_idx, shape, strides, ndim) + lid.x * N_READS;

  LoopedElemToLoc<NDIMS, IdxT, (NDIMS > 2)> loop(reduce_ndim);

  // Elements consumed by the threadgroup per step; full steps and remainder
  const uint per_step = lsize.x * N_READS;
  int blocks = IdxT(row_size) / per_step;
  int extra = row_size - blocks * per_step;

  for (IdxT r = 0; r < non_row_reductions; r++) {
    // The helper advances the pointer it is given, so hand it a fresh copy
    // for every row
    const device T* row = in + loop.location();

    // Partial reduction of this row by this thread
    U row_partial;
    per_thread_row_reduce<T, U, Op, N_READS, 1>(
        &row_partial, &row, blocks, extra, lsize.x, lid.x);

    // Fold the row into the running result
    acc = op(acc, row_partial);

    loop.next(reduce_shape, reduce_strides);
  }

  // Combine across the threadgroup
  threadgroup_reduce<T, U, Op, N_READS, 1>(
      &acc, shared_vals, lid, simd_lane_id, simd_per_group, simd_group_id);

  // The first thread stores the output
  if (lid.x == 0) {
    out[out_idx] = acc;
  }
}


================================================
FILE: mlx/backend/metal/kernels/rms_norm.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_common>
#include <metal_simdgroup>

#include "mlx/backend/metal/kernels/utils.h"

using namespace metal;

// Function constant (index 20). When true, the VJP kernels in this file also
// write the weight gradient `gw`; when false only `gx` is written.
constant bool has_w [[function_constant(20)]];

// RMS normalization of one row per threadgroup without looping over the row:
// each thread handles up to N_READS consecutive elements starting at
// `lid * N_READS`.
//
//   out[i] = w[i * w_stride] * T(x[i] * rsqrt(mean(x^2) + eps))
//
// The sum of squares is accumulated in float, reduced inside each simdgroup
// and then across simdgroups through threadgroup memory.
// NOTE(review): since there is no loop over the row, this presumably relies on
// the host launching enough threads to cover `axis_size` - confirm.
template <typename T, int N_READS = RMS_N_READS>
[[kernel]] void rms_single_row(
    const device T* x,
    const device T* w,
    device T* out,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int SIMD_SIZE = 32;

  threadgroup float local_inv_mean[1];
  threadgroup float local_sums[SIMD_SIZE];

  // Move every pointer to this thread's slice of row `gid`
  x += gid * size_t(axis_size) + lid * N_READS;
  w += w_stride * lid * N_READS;
  out += gid * size_t(axis_size) + lid * N_READS;

  // True when all N_READS elements of the slice are inside the row
  const bool whole_slice = (lid * N_READS + N_READS <= axis_size);

  // Per-thread sum of squares
  float sq_sum = 0;
  if (whole_slice) {
    for (int k = 0; k < N_READS; k++) {
      float v = x[k];
      sq_sum += v * v;
    }
  } else {
    for (int k = 0; k < N_READS; k++) {
      if ((lid * N_READS + k) < axis_size) {
        float v = x[k];
        sq_sum += v * v;
      }
    }
  }
  sq_sum = simd_sum(sq_sum);

  // Zero all slots first so that slots no simdgroup writes to add nothing to
  // the final sum
  if (simd_group_id == 0) {
    local_sums[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // One partial sum per simdgroup
  if (simd_lane_id == 0) {
    local_sums[simd_group_id] = sq_sum;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // The first simdgroup combines the partial sums and publishes the scale
  if (simd_group_id == 0) {
    sq_sum = simd_sum(local_sums[simd_lane_id]);
    if (simd_lane_id == 0) {
      local_inv_mean[0] = metal::precise::rsqrt(sq_sum / axis_size + eps);
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Scale and store this thread's slice
  if (whole_slice) {
    for (int k = 0; k < N_READS; k++) {
      out[k] = w[w_stride * k] * static_cast<T>(x[k] * local_inv_mean[0]);
    }
  } else {
    for (int k = 0; k < N_READS; k++) {
      if ((lid * N_READS + k) < axis_size) {
        out[k] = w[w_stride * k] * static_cast<T>(x[k] * local_inv_mean[0]);
      }
    }
  }
}

// RMS normalization of one row per threadgroup, looping over the row in steps
// of `lsize * N_READS` elements so that rows longer than one threadgroup pass
// are covered.
//
//   out[i] = w[i * w_stride] * T(x[i] * rsqrt(mean(x^2) + eps))
//
// The row is traversed twice: once to accumulate the sum of squares (in
// float) and once to write the scaled output.
template <typename T, int N_READS = RMS_N_READS>
[[kernel]] void rms_looped(
    const device T* x,
    const device T* w,
    device T* out,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int SIMD_SIZE = 32;
  threadgroup float local_inv_mean[1];
  threadgroup float local_sums[SIMD_SIZE];

  // Move every pointer to this thread's first slice of row `gid`
  x += gid * size_t(axis_size) + lid * N_READS;
  w += w_stride * lid * N_READS;
  out += gid * size_t(axis_size) + lid * N_READS;

  // Elements covered by the whole threadgroup in one pass
  const uint step = lsize * N_READS;

  // Pass 1: per-thread sum of squares
  float sq_sum = 0;
  for (uint r = 0; r < axis_size; r += step) {
    if (r + lid * N_READS + N_READS <= axis_size) {
      for (int k = 0; k < N_READS; k++) {
        float v = x[k + r];
        sq_sum += v * v;
      }
    } else {
      for (int k = 0; k < N_READS; k++) {
        if ((r + lid * N_READS + k) < axis_size) {
          float v = x[k + r];
          sq_sum += v * v;
        }
      }
    }
  }
  sq_sum = simd_sum(sq_sum);

  // Zero all slots first so that slots no simdgroup writes to add nothing to
  // the final sum
  if (simd_group_id == 0) {
    local_sums[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // One partial sum per simdgroup
  if (simd_lane_id == 0) {
    local_sums[simd_group_id] = sq_sum;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // The first simdgroup combines the partial sums and publishes the scale
  if (simd_group_id == 0) {
    sq_sum = simd_sum(local_sums[simd_lane_id]);
    if (simd_lane_id == 0) {
      local_inv_mean[0] = metal::precise::rsqrt(sq_sum / axis_size + eps);
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Pass 2: scale and store
  for (uint r = 0; r < axis_size; r += step) {
    if (r + lid * N_READS + N_READS <= axis_size) {
      for (int k = 0; k < N_READS; k++) {
        out[r + k] = w[w_stride * (k + r)] *
            static_cast<T>(x[r + k] * local_inv_mean[0]);
      }
    } else {
      for (int k = 0; k < N_READS; k++) {
        if ((r + lid * N_READS + k) < axis_size) {
          out[r + k] = w[w_stride * (k + r)] *
              static_cast<T>(x[r + k] * local_inv_mean[0]);
        }
      }
    }
  }
}

// Backward (VJP) kernel for RMS normalization that keeps each thread's slice
// of the row in registers. Threadgroup `gid` works on the row starting at
// `gid * axis_size`; thread `lid` handles up to N_READS elements starting at
// `lid * N_READS`.
//
// With normalizer = rsqrt(mean(x^2) + eps) and meangwx = mean(g * w * x):
//   gx[i] = g[i] * w[i] * normalizer - x[i] * meangwx * normalizer^3
//   gw[i] = g[i] * x[i] * normalizer          (only when `has_w` is set)
//
// x, w, g   inputs; `w` is read with stride `w_stride`
// gx, gw    outputs, indexed like `x`
// NOTE(review): there is no loop over the row, so this presumably relies on
// the host launching enough threads to cover `axis_size` - confirm.
template <typename T, int N_READS = RMS_N_READS>
[[kernel]] void vjp_rms_single_row(
    const device T* x,
    const device T* w,
    const device T* g,
    device T* gx,
    device T* gw,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  // Advance the input pointers
  x += gid * size_t(axis_size) + lid * N_READS;
  g += gid * size_t(axis_size) + lid * N_READS;
  w += w_stride * lid * N_READS;

  // Allocate registers for the computation and accumulators
  float thread_x[N_READS];
  float thread_w[N_READS];
  float thread_g[N_READS];
  float sumx2 = 0;
  float sumgwx = 0;

  // Allocate shared memory to implement the reduction
  constexpr int SIMD_SIZE = 32;
  threadgroup float local_sumx2[SIMD_SIZE];
  threadgroup float local_sumgwx[SIMD_SIZE];
  threadgroup float local_normalizer[1];
  threadgroup float local_meangwx[1];

  // Read and accumulate locally
  if (lid * N_READS + N_READS <= axis_size) {
    // The whole slice is inside the row, no per-element bounds check needed
    for (int i = 0; i < N_READS; i++) {
      thread_x[i] = x[i];
      thread_w[i] = w[w_stride * i];
      thread_g[i] = g[i];

      sumx2 += thread_x[i] * thread_x[i];
      sumgwx += thread_x[i] * thread_w[i] * thread_g[i];
    }
  } else {
    // The slice crosses the end of the row; out-of-range register entries are
    // left unset and are never read back (the write loop uses the same check)
    for (int i = 0; i < N_READS; i++) {
      if ((lid * N_READS + i) < axis_size) {
        thread_x[i] = x[i];
        thread_w[i] = w[w_stride * i];
        thread_g[i] = g[i];

        sumx2 += thread_x[i] * thread_x[i];
        sumgwx += thread_x[i] * thread_w[i] * thread_g[i];
      }
    }
  }

  // Accumulate across threads
  sumx2 = simd_sum(sumx2);
  sumgwx = simd_sum(sumgwx);
  // Zero every slot first so that slots no simdgroup writes to contribute
  // nothing to the final simd_sum
  if (simd_group_id == 0) {
    local_sumx2[simd_lane_id] = 0;
    local_sumgwx[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Lane 0 of each simdgroup publishes that simdgroup's partial sums
  if (simd_lane_id == 0) {
    local_sumx2[simd_group_id] = sumx2;
    local_sumgwx[simd_group_id] = sumgwx;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // The first simdgroup combines the partial sums and publishes the two
  // row-wide scalars for all threads
  if (simd_group_id == 0) {
    sumx2 = simd_sum(local_sumx2[simd_lane_id]);
    sumgwx = simd_sum(local_sumgwx[simd_lane_id]);
    if (simd_lane_id == 0) {
      local_meangwx[0] = sumgwx / axis_size;
      local_normalizer[0] = metal::precise::rsqrt(sumx2 / axis_size + eps);
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  float meangwx = local_meangwx[0];
  float normalizer = local_normalizer[0];
  float normalizer3 = normalizer * normalizer * normalizer;

  // Write the outputs
  gx += gid * size_t(axis_size) + lid * N_READS;
  gw += gid * size_t(axis_size) + lid * N_READS;
  if (lid * N_READS + N_READS <= axis_size) {
    for (int i = 0; i < N_READS; i++) {
      gx[i] = static_cast<T>(
          thread_g[i] * thread_w[i] * normalizer -
          thread_x[i] * meangwx * normalizer3);
      // The weight gradient is only produced when requested via `has_w`
      if (has_w) {
        gw[i] = static_cast<T>(thread_g[i] * thread_x[i] * normalizer);
      }
    }
  } else {
    for (int i = 0; i < N_READS; i++) {
      if ((lid * N_READS + i) < axis_size) {
        gx[i] = static_cast<T>(
            thread_g[i] * thread_w[i] * normalizer -
            thread_x[i] * meangwx * normalizer3);
        if (has_w) {
          gw[i] = static_cast<T>(thread_g[i] * thread_x[i] * normalizer);
        }
      }
    }
  }
}

// Backward (VJP) kernel for RMS normalization that loops over the row in
// steps of `lsize * N_READS` elements. Threadgroup `gid` works on the row
// starting at `gid * axis_size`. Unlike the single-row variant, inputs are
// not cached in registers: they are read once to accumulate the sums and
// again when the outputs are written.
//
// With normalizer = rsqrt(mean(x^2) + eps) and meangwx = mean(g * w * x):
//   gx[i] = g[i] * w[i] * normalizer - x[i] * meangwx * normalizer^3
//   gw[i] = g[i] * x[i] * normalizer          (only when `has_w` is set)
//
// x, w, g   inputs; `w` is read with stride `w_stride`
// gx, gw    outputs, indexed like `x`
template <typename T, int N_READS = RMS_N_READS>
[[kernel]] void vjp_rms_looped(
    const device T* x,
    const device T* w,
    const device T* g,
    device T* gx,
    device T* gw,
    constant float& eps,
    constant uint& axis_size,
    constant uint& w_stride,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  // Advance the input pointers
  x += gid * size_t(axis_size) + lid * N_READS;
  g += gid * size_t(axis_size) + lid * N_READS;
  w += w_stride * lid * N_READS;

  // Allocate registers for the accumulators
  float sumx2 = 0;
  float sumgwx = 0;

  // Allocate shared memory to implement the reduction
  constexpr int SIMD_SIZE = 32;
  threadgroup float local_sumx2[SIMD_SIZE];
  threadgroup float local_sumgwx[SIMD_SIZE];
  threadgroup float local_normalizer[1];
  threadgroup float local_meangwx[1];

  // Read and accumulate locally
  // `r` is the offset of the current threadgroup-wide step within the row
  for (uint r = 0; r < axis_size; r += lsize * N_READS) {
    if (r + lid * N_READS + N_READS <= axis_size) {
      // The whole slice is inside the row
      for (int i = 0; i < N_READS; i++) {
        float xi = x[i + r];
        float wi = w[w_stride * (i + r)];
        float gi = g[i + r];

        sumx2 += xi * xi;
        sumgwx += xi * wi * gi;
      }
    } else {
      // The slice crosses the end of the row, check every element
      for (int i = 0; i < N_READS; i++) {
        if ((r + lid * N_READS + i) < axis_size) {
          float xi = x[i + r];
          float wi = w[w_stride * (i + r)];
          float gi = g[i + r];

          sumx2 += xi * xi;
          sumgwx += xi * wi * gi;
        }
      }
    }
  }

  // Accumulate across threads
  sumx2 = simd_sum(sumx2);
  sumgwx = simd_sum(sumgwx);
  // Zero every slot first so that slots no simdgroup writes to contribute
  // nothing to the final simd_sum
  if (simd_group_id == 0) {
    local_sumx2[simd_lane_id] = 0;
    local_sumgwx[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Lane 0 of each simdgroup publishes that simdgroup's partial sums
  if (simd_lane_id == 0) {
    local_sumx2[simd_group_id] = sumx2;
    local_sumgwx[simd_group_id] = sumgwx;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // The first simdgroup combines the partial sums and publishes the two
  // row-wide scalars for all threads
  if (simd_group_id == 0) {
    sumx2 = simd_sum(local_sumx2[simd_lane_id]);
    sumgwx = simd_sum(local_sumgwx[simd_lane_id]);
    if (simd_lane_id == 0) {
      local_meangwx[0] = sumgwx / axis_size;
      local_normalizer[0] = metal::precise::rsqrt(sumx2 / axis_size + eps);
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  float meangwx = local_meangwx[0];
  float normalizer = local_normalizer[0];
  float normalizer3 = normalizer * normalizer * normalizer;

  // Write the outputs
  gx += gid * size_t(axis_size) + lid * N_READS;
  gw += gid * size_t(axis_size) + lid * N_READS;
  for (uint r = 0; r < axis_size; r += lsize * N_READS) {
    if (r + lid * N_READS + N_READS <= axis_size) {
      for (int i = 0; i < N_READS; i++) {
        // Inputs are re-read here rather than kept from the first pass
        float xi = x[i + r];
        float wi = w[w_stride * (i + r)];
        float gi = g[i + r];

        gx[i + r] =
            static_cast<T>(gi * wi * normalizer - xi * meangwx * normalizer3);
        // The weight gradient is only produced when requested via `has_w`
        if (has_w) {
          gw[i + r] = static_cast<T>(gi * xi * normalizer);
        }
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if ((r + lid * N_READS + i) < axis_size) {
          float xi = x[i + r];
          float wi = w[w_stride * (i + r)];
          float gi = g[i + r];

          gx[i + r] =
              static_cast<T>(gi * wi * normalizer - xi * meangwx * normalizer3);
          if (has_w) {
            gw[i + r] = static_cast<T>(gi * xi * normalizer);
          }
        }
      }
    }
  }
}

// clang-format off
// Instantiate the four RMS norm kernels of this file for one element type.
// `name` is appended to the host-visible kernel names ("rms", "vjp_rms",
// "rms_looped", "vjp_rms_looped"); `itype` is the template argument T.
// NOTE(review): `instantiate_kernel` is defined outside this file
// (presumably in utils.h) - see its definition for the exact expansion.
#define instantiate_rms(name, itype)                                \
  instantiate_kernel("rms" #name, rms_single_row, itype)            \
  instantiate_kernel("vjp_rms" #name, vjp_rms_single_row, itype)    \
  instantiate_kernel("rms_looped" #name, rms_looped, itype)         \
  instantiate_kernel("vjp_rms_looped" #name, vjp_rms_looped, itype)

// Supported element types: float, half and bfloat16_t
instantiate_rms(float32, float)
instantiate_rms(float16, half)
instantiate_rms(bfloat16, bfloat16_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/rope.metal
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_math>

#include "mlx/backend/metal/kernels/utils.h"

// Function constants shared by the RoPE kernels in this file.
// forward (index 1): selects which of the two rotation formulas is applied;
// the `else` branch uses the opposite sign on the sine terms.
constant bool forward [[function_constant(1)]];
// traditional (index 2): when true the two rotated elements are adjacent
// (2 * pos.x and the next one); otherwise they are `grid.x` elements apart.
constant bool traditional [[function_constant(2)]];
// hs_transpose (index 3): selects an alternative input index computation in
// `rope_impl`. NOTE(review): presumably for inputs whose head and sequence
// axes are transposed - confirm against the host code.
constant bool hs_transpose [[function_constant(3)]];

// Rotate one pair of elements by the angle `scale * offset * inv_freq`.
//
// The pair is located from `pos` and `stride`: with `traditional` the two
// elements are adjacent (2 * pos.x and the next one), otherwise they are
// `grid.x` elements apart. `forward` selects the rotation direction. The same
// indices are used for reading `in` and writing `out`.
// NOTE(review): the indices are held in 32-bit `uint` although `stride` is
// 64-bit - presumably the arrays handled here are small enough; confirm.
template <typename T>
void rope_single_impl(
    const device T* in,
    device T* out,
    constant const int& offset,
    const float inv_freq,
    constant const float& scale,
    constant const int64_t& stride,
    uint2 pos,
    uint2 grid) {
  // Rotation angle and its cosine / sine
  const float theta = (scale * static_cast<float>(offset)) * inv_freq;
  const float cos_t = metal::fast::cos(theta);
  const float sin_t = metal::fast::sin(theta);

  // Locations of the two elements of the pair
  const uint first = traditional ? 2 * pos.x + pos.y * stride
                                 : pos.x + pos.y * stride;
  const uint second = traditional ? first + 1 : first + grid.x;

  // Load in float, rotate, and store back in T
  const float a = static_cast<float>(in[first]);
  const float b = static_cast<float>(in[second]);
  float ra;
  float rb;
  if (!forward) {
    ra = b * sin_t + a * cos_t;
    rb = b * cos_t - a * sin_t;
  } else {
    ra = a * cos_t - b * sin_t;
    rb = a * sin_t + b * cos_t;
  }
  out[first] = static_cast<T>(ra);
  out[second] = static_cast<T>(rb);
}

// RoPE kernel for a single offset with frequencies derived from `base`:
// inv_freq = exp2(-(pos.x / grid.x) * base). The rotation itself is done by
// `rope_single_impl`.
template <typename T>
[[kernel]] void rope_single(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    constant const int& offset,
    constant const float& scale,
    constant const int64_t& stride,
    constant const float& base [[buffer(10)]],
    uint2 pos [[thread_position_in_grid]],
    uint2 grid [[threads_per_grid]]) {
  // Fractional position of this thread along the x dimension of the grid
  const float frac = static_cast<float>(pos.x) / static_cast<float>(grid.x);
  rope_single_impl<T>(
      in, out, offset, metal::exp2(-frac * base), scale, stride, pos, grid);
}

// RoPE kernel for a single offset with user-provided frequencies: the inverse
// frequency is the reciprocal of `freqs[freq_stride * pos.x]`. The rotation
// itself is done by `rope_single_impl`.
template <typename T>
[[kernel]] void rope_single_freqs(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    constant const int& offset,
    constant const float& scale,
    constant const int64_t& stride,
    const device float* freqs [[buffer(10)]],
    constant const int64_t& freq_stride [[buffer(11)]],
    uint2 pos [[thread_position_in_grid]],
    uint2 grid [[threads_per_grid]]) {
  const float freq = freqs[freq_stride * pos.x];
  rope_single_impl<T>(in, out, offset, 1.0 / freq, scale, stride, pos, grid);
}

// Shared implementation of the batched RoPE kernels. Each thread rotates one
// element pair for up to N consecutive heads.
//
// Grid usage as read from the indexing below:
//   pos.x  selects the pair within the rotated dimensions
//   pos.y  is added to the per-batch offset to form the rotation position
//   pos.z  enumerates groups of N heads; it is split into a batch index and a
//          head index using n_head rounded up to a multiple of N
//
// in, out         input and output arrays
// offset          per-batch position offsets, read at batch_idx * offset_stride
// inv_freq        inverse frequency for this thread's pair
// scale           multiplier applied to the position before the rotation
// strides         input strides; [0] steps between heads, [1] along pos.y,
//                 [2] along the rotated dimension
// out_strides     output strides with the same layout
// n_head          number of heads
// IdxT            integer type used for the index arithmetic
template <typename T, typename IdxT, int N = 4>
void rope_impl(
    const device T* in,
    device T* out,
    const device int* offset,
    const float inv_freq,
    constant const float& scale,
    constant const int64_t strides[3],
    constant const int64_t out_strides[3],
    constant const int64_t& offset_stride,
    constant const int& n_head,
    uint3 pos,
    uint3 grid) {
  // Round n_head up to a multiple of N so that each pos.z maps to one group of
  // N heads inside a single batch element
  auto n_head_up = N * ((n_head + N - 1) / N);
  auto head_idx = static_cast<int>((pos.z * N) % n_head_up);
  auto batch_idx = (pos.z * N) / n_head_up;
  auto batch_offset = offset[batch_idx * offset_stride];
  // Scaled position used to build the rotation angle
  float L = scale * static_cast<float>(pos.y + batch_offset);
  // Flat (batch, head) index of the first head handled by this thread
  auto mat_idx = batch_idx * n_head + head_idx;

  // Compute costheta, sintheta
  float theta = L * inv_freq;
  float costheta = metal::fast::cos(theta);
  float sintheta = metal::fast::sin(theta);
  // Compute the input and output indices
  IdxT in_index_1;
  if (hs_transpose) {
    // Here the batch stride is derived from grid.y and strides[1] instead of
    // being taken from the flat (batch, head) index
    IdxT batch_stride = grid.y * IdxT(strides[1]);
    in_index_1 =
        batch_idx * batch_stride + pos.y * strides[1] + head_idx * strides[0];
  } else {
    in_index_1 = pos.y * IdxT(strides[1]) + mat_idx * IdxT(strides[0]);
  }
  IdxT in_index_2;
  IdxT out_index_1 =
      pos.y * IdxT(out_strides[1]) + mat_idx * IdxT(out_strides[0]);
  IdxT out_index_2;
  if (traditional) {
    // Pair = elements 2 * pos.x and the one right after it. Note that the
    // second output index advances by 1 while the second input index
    // advances by strides[2].
    out_index_1 += 2 * pos.x * IdxT(out_strides[2]);
    out_index_2 = out_index_1 + 1;
    in_index_1 += 2 * pos.x * IdxT(strides[2]);
    in_index_2 = in_index_1 + IdxT(strides[2]);
  } else {
    // Pair = element pos.x and the one grid.x positions further along the
    // rotated dimension
    out_index_1 += pos.x * IdxT(out_strides[2]);
    out_index_2 = out_index_1 + grid.x * IdxT(out_strides[2]);
    in_index_1 += pos.x * IdxT(strides[2]);
    in_index_2 = in_index_1 + grid.x * IdxT(strides[2]);
  }
  // Up to N heads per thread; the bound on head_idx + i skips the padding
  // heads introduced by rounding n_head up
  for (int i = 0; i < N && head_idx + i < n_head; ++i) {
    // Read and write the output
    float x1 = static_cast<float>(in[in_index_1]);
    float x2 = static_cast<float>(in[in_index_2]);
    float rx1;
    float rx2;
    if (forward) {
      rx1 = x1 * costheta - x2 * sintheta;
      rx2 = x1 * sintheta + x2 * costheta;
    } else {
      rx1 = x2 * sintheta + x1 * costheta;
      rx2 = x2 * costheta - x1 * sintheta;
    }
    out[out_index_1] = static_cast<T>(rx1);
    out[out_index_2] = static_cast<T>(rx2);
    // Step to the same pair in the next head
    in_index_1 += IdxT(strides[0]);
    in_index_2 += IdxT(strides[0]);
    out_index_1 += IdxT(out_strides[0]);
    out_index_2 += IdxT(out_strides[0]);
  }
}

// Batched RoPE kernel with frequencies derived from `base`:
// inv_freq = exp2(-(pos.x / grid.x) * base). All indexing and the rotation
// are delegated to `rope_impl`.
template <typename T, typename IdxT, int N = 4>
[[kernel]] void rope(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    const device int* offset,
    constant const float& scale,
    constant const int64_t strides[3],
    constant const int64_t out_strides[3],
    constant const int64_t& offset_stride,
    constant const int& n_head,
    constant const float& base [[buffer(10)]],
    uint3 pos [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  // Fractional position of this thread along the x dimension of the grid
  const float frac = static_cast<float>(pos.x) / static_cast<float>(grid.x);
  rope_impl<T, IdxT, N>(
      in,
      out,
      offset,
      metal::exp2(-frac * base),
      scale,
      strides,
      out_strides,
      offset_stride,
      n_head,
      pos,
      grid);
}

// Batched RoPE kernel with user-provided frequencies: the inverse frequency
// is the reciprocal of `freqs[freq_stride * pos.x]`. All indexing and the
// rotation are delegated to `rope_impl`.
template <typename T, typename IdxT, int N = 4>
[[kernel]] void rope_freqs(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    const device int* offset,
    constant const float& scale,
    constant const int64_t strides[3],
    constant const int64_t out_strides[3],
    constant const int64_t& offset_stride,
    constant const int& n_head,
    const device float* freqs [[buffer(10)]],
    constant const int64_t& freq_stride [[buffer(11)]],
    uint3 pos [[thread_position_in_grid]],
    uint3 grid [[threads_per_grid]]) {
  const float freq = freqs[freq_stride * pos.x];
  rope_impl<T, IdxT, N>(
      in,
      out,
      offset,
      1.0 / freq,
      scale,
      strides,
      out_strides,
      offset_stride,
      n_head,
      pos,
      grid);
}

// clang-format off
// Batched kernels: `rope` and `rope_freqs`, each with 32-bit indices and with
// 64-bit indices (the "_large_" names).
// NOTE(review): `instantiate_kernel` is defined outside this file
// (presumably in utils.h) - see its definition for the exact expansion.
#define instantiate_rope_g(name, type) \
  instantiate_kernel("rope_" #name, rope, type, int32_t) \
  instantiate_kernel("rope_freqs_" #name, rope_freqs, type, int32_t) \
  instantiate_kernel("rope_large_" #name, rope, type, int64_t) \
  instantiate_kernel("rope_freqs_large_" #name, rope_freqs, type, int64_t)

// Single-offset kernels: `rope_single` and `rope_single_freqs`.
#define instantiate_rope_s(name, type) \
  instantiate_kernel("rope_single_" #name, rope_single, type) \
  instantiate_kernel("rope_single_freqs_" #name, rope_single_freqs, type)

// All RoPE kernels for one element type.
#define instantiate_rope(name, type) \
  instantiate_rope_s(name, type)     \
  instantiate_rope_g(name, type)

// Supported element types: half, bfloat16_t and float
instantiate_rope(float16, half)
instantiate_rope(bfloat16, bfloat16_t)
instantiate_rope(float32, float) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/scaled_dot_product_attention.metal
================================================
#include <metal_stdlib>

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/sdpa_vector.h"

using namespace metal;

// SDPA vector instantiations
#define instantiate_sdpa_vector_aggregation(type, value_dim) \
  instantiate_kernel(                                        \
      "sdpa_vector_2pass_2_" #type "_" #value_dim,           \
      sdpa_vector_2pass_2,                                   \
      type,                                                  \
      value_dim)

// Instantiates both `sdpa_vector` (single pass) and `sdpa_vector_2pass_1`
// (first pass of the two-pass variant) for one combination of element type,
// query/key head dimension and value head dimension.
#define instantiate_sdpa_vector(type, qk_dim, value_dim)       \
  instantiate_kernel(                                          \
      "sdpa_vector_" #type "_" #qk_dim "_" #value_dim,         \
      sdpa_vector,                                             \
      type,                                                    \
      qk_dim,                                                  \
      value_dim)                                               \
  instantiate_kernel(                                          \
      "sdpa_vector_2pass_1_" #type "_" #qk_dim "_" #value_dim, \
      sdpa_vector_2pass_1,                                     \
      type,                                                    \
      qk_dim,                                                  \
      value_dim)

// Instantiates all SDPA vector kernels for one element type over the head
// dimensions 64, 96, 128 and 256 (query/key and value dimensions are equal
// in every instantiation listed here).
#define instantiate_sdpa_vector_heads(type)      \
  instantiate_sdpa_vector(type, 64, 64)          \
  instantiate_sdpa_vector(type, 96, 96)          \
  instantiate_sdpa_vector(type, 128, 128)        \
  instantiate_sdpa_vector(type, 256, 256)        \
  instantiate_sdpa_vector_aggregation(type, 64)  \
  instantiate_sdpa_vector_aggregation(type, 96)  \
  instantiate_sdpa_vector_aggregation(type, 128) \
  instantiate_sdpa_vector_aggregation(type, 256)

// Emit the SDPA vector kernels for float, bfloat16_t and float16_t. The type
// token is stringized directly into the exported kernel names.
instantiate_sdpa_vector_heads(float)
instantiate_sdpa_vector_heads(bfloat16_t)
instantiate_sdpa_vector_heads(float16_t)
    // clang-format on


================================================
FILE: mlx/backend/metal/kernels/scan.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/binary_ops.h"

// Injects two overloads of the member function `simd_scan` (an inclusive scan
// across the lanes of a simdgroup) into the enclosing scan-operator struct:
//   - sizeof(T) < 8:  forwards to the struct's `simd_scan_impl`.
//   - sizeof(T) == 8: performs the scan manually with shuffles at lane
//     distances 1, 2, 4, 8 and 16, combining with the struct's `operator()`
//     and using the struct's `init` as the fill value for lanes that have no
//     source lane.
// The enclosing struct must therefore provide `init`, `operator()` and
// `simd_scan_impl`.
#define DEFINE_SIMD_SCAN()                                               \
  template <typename T, metal::enable_if_t<sizeof(T) < 8, bool> = true>  \
  T simd_scan(T val) {                                                   \
    return simd_scan_impl(val);                                          \
  }                                                                      \
                                                                         \
  template <typename T, metal::enable_if_t<sizeof(T) == 8, bool> = true> \
  T simd_scan(T val) {                                                   \
    for (int i = 1; i <= 16; i *= 2) {                                   \
      val = operator()(val, simd_shuffle_and_fill_up(val, init, i));     \
    }                                                                    \
    return val;                                                          \
  }

// Injects two overloads of the member function `simd_exclusive_scan` into the
// enclosing scan-operator struct:
//   - sizeof(T) < 8:  forwards to the struct's `simd_exclusive_scan_impl`.
//   - sizeof(T) == 8: computes the inclusive `simd_scan` and then shifts the
//     result up by one lane, filling the vacated lane with the struct's
//     `init`.
// The enclosing struct must provide `init`, `simd_scan` and
// `simd_exclusive_scan_impl`.
#define DEFINE_SIMD_EXCLUSIVE_SCAN()                                     \
  template <typename T, metal::enable_if_t<sizeof(T) < 8, bool> = true>  \
  T simd_exclusive_scan(T val) {                                         \
    return simd_exclusive_scan_impl(val);                                \
  }                                                                      \
                                                                         \
  template <typename T, metal::enable_if_t<sizeof(T) == 8, bool> = true> \
  T simd_exclusive_scan(T val) {                                         \
    val = simd_scan(val);                                                \
    return simd_shuffle_and_fill_up(val, init, 1);                       \
  }

// Scan operator for cumulative sums over values of type U.
template <typename U>
struct CumSum {
  // Identity of the operation: adding it leaves a value unchanged.
  static constexpr constant U init = static_cast<U>(0);

  // Provides simd_scan / simd_exclusive_scan on top of the *_impl members.
  DEFINE_SIMD_SCAN()
  DEFINE_SIMD_EXCLUSIVE_SCAN()

  // Combine an accumulated value with one more element.
  template <typename V>
  U operator()(U acc, V elem) {
    return acc + elem;
  }

  // Inclusive prefix sum across the simdgroup.
  U simd_scan_impl(U lane_value) {
    return simd_prefix_inclusive_sum(lane_value);
  }

  // Exclusive prefix sum across the simdgroup.
  U simd_exclusive_scan_impl(U lane_value) {
    return simd_prefix_exclusive_sum(lane_value);
  }
};

// Scan operator for cumulative products over values of type U.
template <typename U>
struct CumProd {
  // Identity of the operation: multiplying by it leaves a value unchanged.
  static constexpr constant U init = static_cast<U>(1.0f);

  // Provides simd_scan / simd_exclusive_scan on top of the *_impl members.
  DEFINE_SIMD_SCAN()
  DEFINE_SIMD_EXCLUSIVE_SCAN()

  // Combine an accumulated value with one more element.
  template <typename V>
  U operator()(U acc, V elem) {
    return acc * elem;
  }

  // Inclusive prefix product across the simdgroup.
  U simd_scan_impl(U lane_value) {
    return simd_prefix_inclusive_product(lane_value);
  }

  // Exclusive prefix product across the simdgroup.
  U simd_exclusive_scan_impl(U lane_value) {
    return simd_prefix_exclusive_product(lane_value);
  }
};

// Boolean specialization of the cumulative product: the product of booleans
// is computed as a logical AND, with `true` as the identity.
template <>
struct CumProd<bool> {
  static constexpr constant bool init = true;

  // AND the accumulator with the next element (converted to bool first).
  template <typename V>
  bool operator()(bool acc, V elem) {
    return acc & static_cast<bool>(elem);
  }

  // Inclusive AND-scan across the simdgroup, built from shuffles at lane
  // distances 1, 2, 4, 8 and 16; lanes without a source receive `init`.
  bool simd_scan(bool flag) {
    for (int delta = 1; delta <= 16; delta *= 2) {
      bool shifted = simd_shuffle_and_fill_up(flag, init, delta);
      flag = flag & shifted;
    }
    return flag;
  }

  // Exclusive AND-scan: the inclusive result shifted up by one lane.
  bool simd_exclusive_scan(bool flag) {
    bool inclusive_result = simd_scan(flag);
    return simd_shuffle_and_fill_up(inclusive_result, init, 1);
  }
};

// Scan operator for the running maximum over values of type U.
template <typename U>
struct CumMax {
  // Starting value of the scan, taken from Limits<U>::min.
  static constexpr constant U init = Limits<U>::min;

  // Keep the accumulator when it compares >= the new element, otherwise take
  // the new element.
  template <typename V>
  U operator()(U acc, V elem) {
    if (acc >= elem) {
      return acc;
    }
    return elem;
  }

  // Inclusive max-scan across the simdgroup, built from shuffles at lane
  // distances 1, 2, 4, 8 and 16; lanes without a source receive `init`.
  U simd_scan(U lane_value) {
    for (int delta = 1; delta <= 16; delta *= 2) {
      U shifted = simd_shuffle_and_fill_up(lane_value, init, delta);
      if (!(lane_value >= shifted)) {
        lane_value = shifted;
      }
    }
    return lane_value;
  }

  // Exclusive max-scan: the inclusive result shifted up by one lane.
  U simd_exclusive_scan(U lane_value) {
    U inclusive_result = simd_scan(lane_value);
    return simd_shuffle_and_fill_up(inclusive_result, init, 1);
  }
};

// Scan operator for the running minimum over values of type U.
template <typename U>
struct CumMin {
  // Starting value of the scan, taken from Limits<U>::max.
  static constexpr constant U init = Limits<U>::max;

  // Keep the accumulator when it compares <= the new element, otherwise take
  // the new element.
  template <typename V>
  U operator()(U acc, V elem) {
    if (acc <= elem) {
      return acc;
    }
    return elem;
  }

  // Inclusive min-scan across the simdgroup, built from shuffles at lane
  // distances 1, 2, 4, 8 and 16; lanes without a source receive `init`.
  U simd_scan(U lane_value) {
    for (int delta = 1; delta <= 16; delta *= 2) {
      U shifted = simd_shuffle_and_fill_up(lane_value, init, delta);
      if (!(lane_value <= shifted)) {
        lane_value = shifted;
      }
    }
    return lane_value;
  }

  // Exclusive min-scan: the inclusive result shifted up by one lane.
  U simd_exclusive_scan(U lane_value) {
    U inclusive_result = simd_scan(lane_value);
    return simd_shuffle_and_fill_up(inclusive_result, init, 1);
  }
};

// Scan operator that accumulates with LogAddExp over values of type U.
template <typename U>
struct CumLogaddexp {
  // Starting value of the scan, taken from Limits<U>::min.
  static constexpr constant U init = Limits<U>::min;

  // Combine the accumulator with the next element (converted to U first).
  template <typename V>
  U operator()(U acc, V elem) {
    LogAddExp combine;
    return combine(acc, static_cast<U>(elem));
  }

  // Inclusive scan across the simdgroup, built from shuffles at lane
  // distances 1, 2, 4, 8 and 16; lanes without a source receive `init`.
  U simd_scan(U lane_value) {
    LogAddExp combine;
    for (int delta = 1; delta <= 16; delta *= 2) {
      U shifted = simd_shuffle_and_fill_up(lane_value, init, delta);
      lane_value = combine(lane_value, shifted);
    }
    return lane_value;
  }

  // Exclusive scan: the inclusive result shifted up by one lane.
  U simd_exclusive_scan(U lane_value) {
    U inclusive_result = simd_scan(lane_value);
    return simd_shuffle_and_fill_up(inclusive_result, init, 1);
  }
};

// Copies N_READS consecutive elements from `input` into `values` without any
// bounds checking. When `reverse` is set the elements are stored in reversed
// order, i.e. input[0] lands in values[N_READS - 1].
template <typename T, typename U, int N_READS, bool reverse>
inline void load_unsafe(U values[N_READS], const device T* input) {
  for (int src = 0; src < N_READS; src++) {
    const int dst = reverse ? (N_READS - src - 1) : src;
    values[dst] = input[src];
  }
}

// Bounds-checked counterpart of load_unsafe. Slot `dst` of `values`
// corresponds to logical position `start + dst`; slots whose position is not
// below `total` receive `init` and the matching input element is not read.
// When `reverse` is set, input[src] maps to slot N_READS - src - 1.
template <typename T, typename U, int N_READS, bool reverse>
inline void load_safe(
    U values[N_READS],
    const device T* input,
    int start,
    int total,
    U init) {
  for (int src = 0; src < N_READS; src++) {
    const int dst = reverse ? (N_READS - src - 1) : src;
    values[dst] = (start + dst < total) ? input[src] : init;
  }
}

// Stores the N_READS entries of `values` to `out` without any bounds
// checking. When `reverse` is set the entries are written in reversed order,
// i.e. out[0] receives values[N_READS - 1].
template <typename U, int N_READS, bool reverse>
inline void write_unsafe(U values[N_READS], device U* out) {
  for (int dst = 0; dst < N_READS; dst++) {
    const int src = reverse ? (N_READS - dst - 1) : dst;
    out[dst] = values[src];
  }
}

// Bounds-checked counterpart of write_unsafe. Slot `src` of `values`
// corresponds to logical position `start + src`; only slots whose position is
// below `total` are stored. When `reverse` is set, out[dst] receives slot
// N_READS - dst - 1.
template <typename U, int N_READS, bool reverse>
inline void write_safe(U values[N_READS], device U* out, int start, int total) {
  for (int dst = 0; dst < N_READS; dst++) {
    const int src = reverse ? (N_READS - dst - 1) : dst;
    if (start + src < total) {
      out[dst] = values[src];
    }
  }
}

// Scan (prefix reduction) along an axis whose elements are contiguous in
// memory. One threadgroup processes one row of `axis_size` elements; the row
// is selected from the y/z threadgroup position.
//
// Template parameters:
//   T         input element type
//   U         output / accumulation type
//   Op        scan operator providing `init`, `operator()` and
//             `simd_exclusive_scan`
//   N_READS   number of consecutive elements handled per thread
//   inclusive whether output i includes input i
//   reverse   whether the scan runs from the end of the axis to the start
//
// Buffers:
//   in  (0)        input data
//   out (1)        output data
//   axis_size (2)  number of elements along the scanned axis
template <
    typename T,
    typename U,
    typename Op,
    int N_READS,
    bool inclusive,
    bool reverse>
[[kernel]] void contiguous_scan(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int simd_size = 32;
  Op op;

  // Position the pointers at the start of the row owned by this threadgroup
  size_t offset = (gid.y + gsize.y * size_t(gid.z)) * axis_size;
  in += offset;
  out += offset;

  // Compute the number of simd_groups
  uint simd_groups = lsize.x / simd_size;

  // Allocate memory
  // `prefix` carries the running result of all previously processed blocks.
  U prefix = Op::init;
  U values[N_READS];
  // One slot per simdgroup; sized for at most 32 simdgroups per threadgroup.
  threadgroup U simdgroup_sums[32];

  // Loop over the scanned axis in ceildiv(axis_size, N_READS * lsize.x)
  // blocks of N_READS * lsize.x elements each
  //    Read block
  //    Compute inclusive scan of the block
  //      Compute inclusive scan per thread
  //      Compute exclusive scan of thread sums in simdgroup
  //      Write simdgroup sums in threadgroup memory
  //      Compute exclusive scan of simdgroup sums
  //      Compute the output by scanning prefix, prev_simdgroup, prev_thread,
  //      value
  //    Write block

  for (uint r = 0; r < ceildiv(axis_size, N_READS * lsize.x); r++) {
    // Compute the block offset
    // NOTE(review): this 32-bit `offset` shadows the size_t row `offset`
    // declared above and would wrap for axes longer than 2^32 elements --
    // confirm such sizes never reach this kernel.
    uint offset = r * lsize.x * N_READS + lid.x * N_READS;

    // Read the values. The unchecked path is used only when the thread's
    // whole chunk lies strictly inside the axis; otherwise out-of-range slots
    // are filled with Op::init. In reverse mode the chunk is addressed from
    // the end of the axis and stored reversed in `values`.
    if (reverse) {
      if ((offset + N_READS) < axis_size) {
        load_unsafe<T, U, N_READS, reverse>(
            values, in + axis_size - offset - N_READS);
      } else {
        load_safe<T, U, N_READS, reverse>(
            values,
            in + axis_size - offset - N_READS,
            offset,
            axis_size,
            Op::init);
      }
    } else {
      if ((offset + N_READS) < axis_size) {
        load_unsafe<T, U, N_READS, reverse>(values, in + offset);
      } else {
        load_safe<T, U, N_READS, reverse>(
            values, in + offset, offset, axis_size, Op::init);
      }
    }

    // Compute an inclusive scan per thread
    for (int i = 1; i < N_READS; i++) {
      values[i] = op(values[i], values[i - 1]);
    }

    // Compute exclusive scan of thread sums
    U prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);

    // Write simdgroup_sums to threadgroup memory. The last lane holds the
    // total of its simdgroup (exclusive prefix combined with its own sum).
    // The leading barrier presumably keeps this write from racing with the
    // read of simdgroup_sums[0] at the end of the previous iteration.
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (simd_lane_id == simd_size - 1) {
      simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute exclusive scan of simdgroup_sums (done by the first simdgroup,
    // one lane per simdgroup slot)
    if (simd_group_id == 0) {
      U prev_simdgroup = op.simd_exclusive_scan(simdgroup_sums[simd_lane_id]);
      simdgroup_sums[simd_lane_id] = prev_simdgroup;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute the output: fold in the carry from previous blocks, the
    // preceding simdgroups and the preceding threads of this simdgroup
    for (int i = 0; i < N_READS; i++) {
      values[i] = op(values[i], prefix);
      values[i] = op(values[i], simdgroup_sums[simd_group_id]);
      values[i] = op(values[i], prev_thread);
    }

    // Write the values. For an exclusive scan the results are shifted by one
    // position along the scan direction and the first position in scan order
    // receives Op::init.
    if (reverse) {
      if (inclusive) {
        if ((offset + N_READS) < axis_size) {
          write_unsafe<U, N_READS, reverse>(
              values, out + axis_size - offset - N_READS);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + axis_size - offset - N_READS, offset, axis_size);
        }
      } else {
        if (lid.x == 0 && offset == 0) {
          out[axis_size - 1] = Op::init;
        }
        if ((offset + N_READS + 1) < axis_size) {
          write_unsafe<U, N_READS, reverse>(
              values, out + axis_size - offset - 1 - N_READS);
        } else {
          write_safe<U, N_READS, reverse>(
              values,
              out + axis_size - offset - 1 - N_READS,
              offset + 1,
              axis_size);
        }
      }
    } else {
      if (inclusive) {
        if ((offset + N_READS) < axis_size) {
          write_unsafe<U, N_READS, reverse>(values, out + offset);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + offset, offset, axis_size);
        }
      } else {
        if (lid.x == 0 && offset == 0) {
          out[0] = Op::init;
        }
        if ((offset + N_READS + 1) < axis_size) {
          write_unsafe<U, N_READS, reverse>(values, out + offset + 1);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + offset + 1, offset + 1, axis_size);
        }
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Share the prefix: the last lane of the last simdgroup holds the scan
    // result of the whole block, which becomes the carry for the next block
    if (simd_group_id == simd_groups - 1 && simd_lane_id == simd_size - 1) {
      simdgroup_sums[0] = values[N_READS - 1];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    prefix = simdgroup_sums[0];
  }
}

// Scan (prefix reduction) along an axis whose consecutive elements are
// `stride` elements apart in memory. Each threadgroup handles a strip of BN
// neighbouring positions in the strided direction and walks the scanned axis
// in steps of BM rows, staging a BM x BN tile in threadgroup memory.
//
// Template parameters:
//   T         input element type
//   U         output / accumulation type
//   Op        scan operator providing `init`, `operator()` and `simd_scan`
//   N_READS   number of consecutive elements read/written per thread
//   inclusive whether output i includes input i
//   reverse   whether the scan runs from the end of the axis to the start
//
// Buffers:
//   in  (0)            input data
//   out (1)            output data
//   axis_size (2)      number of elements along the scanned axis
//   stride (3)         distance in elements between consecutive axis entries
//   stride_blocks (4)  number of BN-wide strips per `stride`; used to split
//                      the flat threadgroup index into (outer, strip)
template <
    typename T,
    typename U,
    typename Op,
    int N_READS,
    bool inclusive,
    bool reverse>
[[kernel]] void strided_scan(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    const constant size_t& stride [[buffer(3)]],
    const constant size_t& stride_blocks [[buffer(4)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int simd_size = 32;
  // Tile dimensions: BM rows along the scanned axis, BN columns along stride.
  constexpr int BM = 32;
  constexpr int BN = 32;
  // Row pitch of the tile in threadgroup memory, padded by 16 bytes worth of
  // U elements (presumably to reduce threadgroup memory bank conflicts).
  constexpr int BN_pad = 32 + 16 / sizeof(U);
  // Number of simdgroups needed to cover a tile and columns scanned by each.
  constexpr int n_simds = BN / N_READS;
  constexpr int n_scans = BN / n_simds;
  Op op;

  threadgroup U read_buffer[BM * BN_pad];
  U values[n_scans];
  // Running result per column carried from one tile to the next.
  U prefix[n_scans];
  for (int i = 0; i < n_scans; i++) {
    prefix[i] = Op::init;
  }

  // Compute offsets
  size_t full_gid = gid.y + gsize.y * size_t(gid.z);
  size_t offset = full_gid / stride_blocks * axis_size * stride;
  size_t global_index_x = full_gid % stride_blocks * BN;
  // Tile position this thread loads from / stores to device memory.
  uint read_offset_y = (lid.x * N_READS) / BN;
  uint read_offset_x = (lid.x * N_READS) % BN;
  // Tile position this thread scans: lane selects the row, simdgroup selects
  // the group of n_scans columns.
  uint scan_offset_y = simd_lane_id;
  uint scan_offset_x = simd_group_id * n_scans;

  // Number of valid columns remaining from this strip's first column.
  uint stride_limit = stride - global_index_x;
  in += offset + global_index_x + read_offset_x;
  out += offset + global_index_x + read_offset_x;
  threadgroup U* read_into =
      read_buffer + read_offset_y * BN_pad + read_offset_x;
  threadgroup U* read_from =
      read_buffer + scan_offset_y * BN_pad + scan_offset_x;

  for (uint j = 0; j < axis_size; j += BM) {
    // Calculate the indices for the current thread. `check_index_y` is the
    // position in scan order (used for bounds checks); `index_y` is the
    // actual row in memory, mirrored when scanning in reverse.
    uint index_y = j + read_offset_y;
    uint check_index_y = index_y;
    if (reverse) {
      index_y = axis_size - 1 - index_y;
    }

    // Read the tile into threadgroup memory; positions outside the axis or
    // beyond the stride limit are filled with Op::init
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; i++) {
        read_into[i] = in[index_y * stride + i];
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          read_into[i] = in[index_y * stride + i];
        } else {
          read_into[i] = Op::init;
        }
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Read strided into registers
    for (int i = 0; i < n_scans; i++) {
      values[i] = read_from[i];
    }
    simdgroup_barrier(mem_flags::mem_threadgroup);

    // Perform the scan: scan each column across the lanes (rows), fold in
    // the carry from earlier tiles, and take the last lane's result as the
    // carry for the next tile
    for (int i = 0; i < n_scans; i++) {
      values[i] = op.simd_scan(values[i]);
      values[i] = op(values[i], prefix[i]);
      prefix[i] = simd_shuffle(values[i], simd_size - 1);
    }

    // Write the scanned values back to threadgroup memory
    for (int i = 0; i < n_scans; i++) {
      read_from[i] = values[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Write to device memory
    if (!inclusive) {
      // Exclusive scan: the first position in scan order receives Op::init
      // and every result is stored one position further along the scan
      // direction.
      if (check_index_y == 0) {
        if ((read_offset_x + N_READS) < stride_limit) {
          for (int i = 0; i < N_READS; i++) {
            out[index_y * stride + i] = Op::init;
          }
        } else {
          for (int i = 0; i < N_READS; i++) {
            if ((read_offset_x + i) < stride_limit) {
              out[index_y * stride + i] = Op::init;
            }
          }
        }
      }
      if (reverse) {
        index_y -= 1;
        check_index_y += 1;
      } else {
        index_y += 1;
        check_index_y += 1;
      }
    }
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; i++) {
        out[index_y * stride + i] = read_into[i];
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          out[index_y * stride + i] = read_into[i];
        }
      }
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/scan.metal
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_math>
#include <metal_simdgroup>

// clang-format off

using namespace metal;

#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/scan.h"

// Explicitly instantiates `contiguous_scan` for the given input type, output
// type, scan operator template (applied to the output type), inclusive and
// reverse flags and reads-per-thread count, and exports it under the host
// name "contig_scan_<name>".
#define instantiate_contiguous_scan(                                    \
    name, itype, otype, op, inclusive, reverse, nreads)                 \
  template [[host_name("contig_scan_" #name)]] [[kernel]] void          \
  contiguous_scan<itype, otype, op<otype>, nreads, inclusive, reverse>( \
      const device itype* in [[buffer(0)]],                             \
      device otype* out [[buffer(1)]],                                  \
      const constant size_t& axis_size [[buffer(2)]],                   \
      uint3 gid [[threadgroup_position_in_grid]],                       \
      uint3 gsize [[threadgroups_per_grid]],                            \
      uint3 lid [[thread_position_in_threadgroup]],                     \
      uint3 lsize [[threads_per_threadgroup]],                          \
      uint simd_lane_id [[thread_index_in_simdgroup]],                  \
      uint simd_group_id [[simdgroup_index_in_threadgroup]]);

// Explicitly instantiates `strided_scan` for the given input type, output
// type, scan operator template (applied to the output type), inclusive and
// reverse flags and reads-per-thread count, and exports it under the host
// name "strided_scan_<name>".
#define instantiate_strided_scan(                                    \
    name, itype, otype, op, inclusive, reverse, nreads)              \
  template [[host_name("strided_scan_" #name)]] [[kernel]] void      \
  strided_scan<itype, otype, op<otype>, nreads, inclusive, reverse>( \
      const device itype* in [[buffer(0)]],                          \
      device otype* out [[buffer(1)]],                               \
      const constant size_t& axis_size [[buffer(2)]],                \
      const constant size_t& stride [[buffer(3)]],                   \
      const constant size_t& stride_blocks [[buffer(4)]],            \
      uint3 gid [[threadgroup_position_in_grid]],                    \
      uint3 gsize [[threadgroups_per_grid]],                         \
      uint3 lid [[thread_position_in_threadgroup]],                  \
      uint simd_lane_id [[thread_index_in_simdgroup]],               \
      uint simd_group_id [[simdgroup_index_in_threadgroup]]);

// Instantiates all eight scan kernels for one (op, itype, otype) combination:
// {contiguous, strided} x {inclusive, exclusive} x {forward, reverse}. The
// variant is encoded as a prefix ("inclusive_", "exclusive_",
// "reverse_inclusive_", "reverse_exclusive_") pasted in front of `name`.
#define instantiate_scan_helper(name, itype, otype, op, nreads)                                \
  instantiate_contiguous_scan(inclusive_##name, itype, otype, op, true, false, nreads)         \
  instantiate_contiguous_scan(exclusive_##name, itype, otype, op, false, false, nreads)        \
  instantiate_contiguous_scan(reverse_inclusive_##name, itype, otype, op, true, true, nreads)  \
  instantiate_contiguous_scan(reverse_exclusive_##name, itype, otype, op, false, true, nreads) \
  instantiate_strided_scan(inclusive_##name, itype, otype, op, true, false, nreads)            \
  instantiate_strided_scan(exclusive_##name, itype, otype, op, false, false, nreads)           \
  instantiate_strided_scan(reverse_inclusive_##name, itype, otype, op, true, true, nreads)     \
  instantiate_strided_scan(reverse_exclusive_##name, itype, otype, op, false, true, nreads)

// Cumulative sum kernels. bool inputs accumulate into int32_t or uint32_t;
// every other type accumulates in its own type. The 8-byte types (uint64_t,
// int64_t, complex64_t) use 2 reads per thread, all others use 4.
instantiate_scan_helper(sum_bool__int32,         bool,        int32_t,     CumSum, 4)
instantiate_scan_helper(sum_bool__uint32,        bool,        uint32_t,    CumSum, 4)
instantiate_scan_helper(sum_uint8_uint8,         uint8_t,     uint8_t,     CumSum, 4)
instantiate_scan_helper(sum_uint16_uint16,       uint16_t,    uint16_t,    CumSum, 4)
instantiate_scan_helper(sum_uint32_uint32,       uint32_t,    uint32_t,    CumSum, 4)
instantiate_scan_helper(sum_uint64_uint64,       uint64_t,    uint64_t,    CumSum, 2)
instantiate_scan_helper(sum_int8_int8,           int8_t,      int8_t,      CumSum, 4)
instantiate_scan_helper(sum_int16_int16,         int16_t,     int16_t,     CumSum, 4)
instantiate_scan_helper(sum_int32_int32,         int32_t,     int32_t,     CumSum, 4)
instantiate_scan_helper(sum_int64_int64,         int64_t,     int64_t,     CumSum, 2)
instantiate_scan_helper(sum_float16_float16,     half,        half,        CumSum, 4)
instantiate_scan_helper(sum_float32_float32,     float,       float,       CumSum, 4)
instantiate_scan_helper(sum_bfloat16_bfloat16,   bfloat16_t,  bfloat16_t,  CumSum, 4)
instantiate_scan_helper(sum_complex64_complex64, complex64_t, complex64_t, CumSum, 2)
// Cumulative product kernels; input and output types are identical. The
// 8-byte types use 2 reads per thread, all others use 4.
instantiate_scan_helper(prod_bool__bool_,         bool,        bool,        CumProd, 4)
instantiate_scan_helper(prod_uint8_uint8,         uint8_t,     uint8_t,     CumProd, 4)
instantiate_scan_helper(prod_uint16_uint16,       uint16_t,    uint16_t,    CumProd, 4)
instantiate_scan_helper(prod_uint32_uint32,       uint32_t,    uint32_t,    CumProd, 4)
instantiate_scan_helper(prod_uint64_uint64,       uint64_t,    uint64_t,    CumProd, 2)
instantiate_scan_helper(prod_int8_int8,           int8_t,      int8_t,      CumProd, 4)
instantiate_scan_helper(prod_int16_int16,         int16_t,     int16_t,     CumProd, 4)
instantiate_scan_helper(prod_int32_int32,         int32_t,     int32_t,     CumProd, 4)
instantiate_scan_helper(prod_int64_int64,         int64_t,     int64_t,     CumProd, 2)
instantiate_scan_helper(prod_float16_float16,     half,        half,        CumProd, 4)
instantiate_scan_helper(prod_float32_float32,     float,       float,       CumProd, 4)
instantiate_scan_helper(prod_bfloat16_bfloat16,   bfloat16_t,  bfloat16_t,  CumProd, 4)
instantiate_scan_helper(prod_complex64_complex64, complex64_t, complex64_t, CumProd, 2)
// Cumulative maximum kernels; input and output types are identical. The
// 8-byte types use 2 reads per thread, all others use 4.
instantiate_scan_helper(max_bool__bool_,         bool,        bool,        CumMax, 4)
instantiate_scan_helper(max_uint8_uint8,         uint8_t,     uint8_t,     CumMax, 4)
instantiate_scan_helper(max_uint16_uint16,       uint16_t,    uint16_t,    CumMax, 4)
instantiate_scan_helper(max_uint32_uint32,       uint32_t,    uint32_t,    CumMax, 4)
instantiate_scan_helper(max_uint64_uint64,       uint64_t,    uint64_t,    CumMax, 2)
instantiate_scan_helper(max_int8_int8,           int8_t,      int8_t,      CumMax, 4)
instantiate_scan_helper(max_int16_int16,         int16_t,     int16_t,     CumMax, 4)
instantiate_scan_helper(max_int32_int32,         int32_t,     int32_t,     CumMax, 4)
instantiate_scan_helper(max_int64_int64,         int64_t,     int64_t,     CumMax, 2)
instantiate_scan_helper(max_float16_float16,     half,        half,        CumMax, 4)
instantiate_scan_helper(max_float32_float32,     float,       float,       CumMax, 4)
instantiate_scan_helper(max_bfloat16_bfloat16,   bfloat16_t,  bfloat16_t,  CumMax, 4)
instantiate_scan_helper(max_complex64_complex64, complex64_t, complex64_t, CumMax, 2)
// Cumulative minimum kernels; input and output types are identical. The
// 8-byte types use 2 reads per thread, all others use 4.
instantiate_scan_helper(min_bool__bool_,         bool,        bool,        CumMin, 4)
instantiate_scan_helper(min_uint8_uint8,         uint8_t,     uint8_t,     CumMin, 4)
instantiate_scan_helper(min_uint16_uint16,       uint16_t,    uint16_t,    CumMin, 4)
instantiate_scan_helper(min_uint32_uint32,       uint32_t,    uint32_t,    CumMin, 4)
instantiate_scan_helper(min_uint64_uint64,       uint64_t,    uint64_t,    CumMin, 2)
instantiate_scan_helper(min_int8_int8,           int8_t,      int8_t,      CumMin, 4)
instantiate_scan_helper(min_int16_int16,         int16_t,     int16_t,     CumMin, 4)
instantiate_scan_helper(min_int32_int32,         int32_t,     int32_t,     CumMin, 4)
instantiate_scan_helper(min_int64_int64,         int64_t,     int64_t,     CumMin, 2)
instantiate_scan_helper(min_float16_float16,     half,        half,        CumMin, 4)
instantiate_scan_helper(min_float32_float32,     float,       float,       CumMin, 4)
instantiate_scan_helper(min_bfloat16_bfloat16,   bfloat16_t,  bfloat16_t,  CumMin, 4)
instantiate_scan_helper(min_complex64_complex64, complex64_t, complex64_t, CumMin, 2)
// Cumulative log-add-exp kernels, instantiated only for the floating point
// and complex types. complex64_t uses 2 reads per thread, the others use 4.
instantiate_scan_helper(logaddexp_float16_float16,     half,        half,        CumLogaddexp, 4)
instantiate_scan_helper(logaddexp_float32_float32,     float,       float,       CumLogaddexp, 4)
instantiate_scan_helper(logaddexp_bfloat16_bfloat16,   bfloat16_t,  bfloat16_t,  CumLogaddexp, 4)
instantiate_scan_helper(logaddexp_complex64_complex64, complex64_t, complex64_t, CumLogaddexp, 2) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/sdpa_vector.h
================================================
// Copyright © 2024 Apple Inc.

#include <metal_simdgroup>

using namespace metal;

// Function constants that specialize the SDPA vector kernels when the
// pipeline is built.
// has_mask: gates the mask stride arguments.
constant bool has_mask [[function_constant(20)]];
// query_transposed: selects the alternative query offset computation.
constant bool query_transposed [[function_constant(21)]];
// do_causal: enables causal masking derived from the key/query positions.
constant bool do_causal [[function_constant(22)]];
// bool_mask: gates the boolean mask buffer (`bmask`).
constant bool bool_mask [[function_constant(23)]];
// float_mask: gates the additive mask buffer (`fmask`).
constant bool float_mask [[function_constant(24)]];
// has_sinks: gates the attention sink buffer (`sinks`).
constant bool has_sinks [[function_constant(25)]];
// blocks: not referenced by sdpa_vector; presumably the number of key blocks
// used by the two-pass kernels -- TODO confirm.
constant int blocks [[function_constant(26)]];

// Single-pass scaled dot-product attention for one query vector per
// threadgroup. The threadgroup position selects the (batch * head) index
// (x) and the query position (y). Each simdgroup walks the keys with a step
// of BN starting at its own index, each lane holding D / BD query/key
// elements and V / BD value elements, and keeps a running softmax
// (max, sum of exponentials, weighted value accumulator). The per-simdgroup
// partial results are then merged through threadgroup memory.
//
// Template parameters:
//   T  element type of queries/keys/values/output
//   D  query/key head dimension
//   V  value head dimension (defaults to D)
//
// NOTE(review): the merge step indexes max_scores / sum_exp_scores by lane
// and `outputs` by (lane, simdgroup), which assumes the kernel is launched
// with BN simdgroups of BD threads -- confirm against the host dispatch.
template <typename T, int D, int V = D>
[[kernel]] void sdpa_vector(
    const device T* queries [[buffer(0)]],
    const device T* keys [[buffer(1)]],
    const device T* values [[buffer(2)]],
    device T* out [[buffer(3)]],
    const constant int& gqa_factor [[buffer(4)]],
    const constant int& N [[buffer(5)]],
    const constant size_t& k_head_stride [[buffer(6)]],
    const constant size_t& k_seq_stride [[buffer(7)]],
    const constant size_t& v_head_stride [[buffer(8)]],
    const constant size_t& v_seq_stride [[buffer(9)]],
    const constant float& scale [[buffer(10)]],
    const device bool* bmask [[buffer(11), function_constant(bool_mask)]],
    const device T* fmask [[buffer(12), function_constant(float_mask)]],
    const constant int& mask_kv_seq_stride
    [[buffer(13), function_constant(has_mask)]],
    const constant int& mask_q_seq_stride
    [[buffer(14), function_constant(has_mask)]],
    const constant int& mask_head_stride
    [[buffer(15), function_constant(has_mask)]],
    const device T* sinks [[buffer(16), function_constant(has_sinks)]],
    const constant int& num_q_heads
    [[buffer(17), function_constant(has_sinks)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 tpg [[threadgroups_per_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // BN: key step per loop iteration (one key per simdgroup).
  // BD: number of lanes the head dimension is split across.
  constexpr int BN = 32;
  constexpr int BD = 32;
  constexpr int qk_per_thread = D / BD;
  constexpr int v_per_thread = V / BD;
  // Pointer advance per loop iteration (BN keys / values further on).
  int inner_k_stride = BN * int(k_seq_stride);
  int inner_v_stride = BN * int(v_seq_stride);

  // All accumulation is done in float regardless of T.
  typedef float U;

  thread U q[qk_per_thread];
  thread U k[qk_per_thread];
  thread U o[v_per_thread];

  threadgroup U outputs[BN * BD];
  threadgroup U max_scores[BN];
  threadgroup U sum_exp_scores[BN];

  // Adjust positions
  const int q_batch_head_idx = tid.x;
  const int q_seq_idx = tid.y;
  // gqa_factor query heads share one key/value head.
  const int kv_head_idx = q_batch_head_idx / gqa_factor;
  const int o_offset = q_batch_head_idx * tpg.y + q_seq_idx;
  const int q_offset =
      query_transposed ? tpg.x * q_seq_idx + q_batch_head_idx : o_offset;
  queries += q_offset * D + simd_lid * qk_per_thread;
  keys += kv_head_idx * k_head_stride + simd_gid * k_seq_stride +
      simd_lid * qk_per_thread;
  values += kv_head_idx * v_head_stride + simd_gid * v_seq_stride +
      simd_lid * v_per_thread;
  if (bool_mask) {
    bmask += q_batch_head_idx * mask_head_stride +
        simd_gid * mask_kv_seq_stride + q_seq_idx * mask_q_seq_stride;
  }
  if (float_mask) {
    fmask += q_batch_head_idx * mask_head_stride +
        simd_gid * mask_kv_seq_stride + q_seq_idx * mask_q_seq_stride;
  }

  // After the merge step each simdgroup owns v_per_thread output elements,
  // hence the offset by simdgroup index rather than lane index.
  out += o_offset * V + simd_gid * v_per_thread;

  // Read the query (pre-multiplied by scale) and 0 the output accumulator
  for (int i = 0; i < qk_per_thread; i++) {
    q[i] = static_cast<U>(scale) * queries[i];
  }
  for (int i = 0; i < v_per_thread; i++) {
    o[i] = 0;
  }

  U max_score = Limits<U>::finite_min;
  U sum_exp_score = 0;
  // With sinks, the first simdgroup seeds its running softmax with the sink
  // value of this query head: it acts as the initial max and contributes
  // exp(0) = 1 to the denominator, but nothing to the output accumulator.
  if (has_sinks && simd_gid == 0) {
    max_score = static_cast<U>(sinks[q_batch_head_idx % num_q_heads]);
    sum_exp_score = 1;
  }

  // For each key
  for (int i = simd_gid; i < N; i += BN) {
    // Decide whether key i participates. The checks are mutually exclusive:
    // causal masking takes precedence over a bool mask, which takes
    // precedence over a float mask.
    bool use_key = true;
    if (do_causal) {
      use_key = i <= (N - int(tpg.y) + int(q_seq_idx));
    } else if (bool_mask) {
      use_key = bmask[0];
    } else if (float_mask) {
      use_key = (fmask[0] >= Limits<T>::finite_min);
    }
    if (use_key) {
      // Read the key
      for (int j = 0; j < qk_per_thread; j++) {
        k[j] = keys[j];
      }

      // Compute the i-th score: per-lane partial dot product reduced across
      // the simdgroup
      U score = 0;
      for (int j = 0; j < qk_per_thread; j++) {
        score += q[j] * k[j];
      }
      score = simd_sum(score);
      if (float_mask) {
        score += static_cast<U>(fmask[0]);
      }

      // Update the accumulators: rescale the previous sum by
      // exp(old_max - new_max) and add the new term
      U new_max = max(max_score, score);
      U factor = fast::exp(max_score - new_max);
      U exp_score = fast::exp(score - new_max);

      max_score = new_max;
      sum_exp_score = sum_exp_score * factor + exp_score;

      // Update the output accumulator
      for (int j = 0; j < v_per_thread; j++) {
        o[j] = o[j] * factor + exp_score * values[j];
      }
    }

    // Move the pointers to the next kv
    keys += inner_k_stride;
    values += inner_v_stride;
    if (bool_mask) {
      bmask += BN * mask_kv_seq_stride;
    }
    if (float_mask) {
      fmask += BN * mask_kv_seq_stride;
    }
  }

  // Each thread has a partial part of the output so we need to combine them.

  // First let's communicate the max and sum_exp. After the barrier, lane l
  // of every simdgroup holds the statistics of simdgroup l, so the simd
  // reductions below run over all simdgroups.
  if (simd_lid == 0) {
    max_scores[simd_gid] = max_score;
    sum_exp_scores[simd_gid] = sum_exp_score;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  max_score = max_scores[simd_lid];
  U new_max = simd_max(max_score);
  U factor = fast::exp(max_score - new_max);
  sum_exp_score = simd_sum(sum_exp_scores[simd_lid] * factor);

  // Now we need to aggregate all the outputs. The partial outputs are
  // written transposed (lane-major) so that, when read back, each simdgroup
  // sees one output element from every simdgroup across its lanes and can
  // reduce it with simd_sum after rescaling by that simdgroup's factor.
  for (int i = 0; i < v_per_thread; i++) {
    outputs[simd_lid * BD + simd_gid] = o[i];
    threadgroup_barrier(mem_flags::mem_threadgroup);
    o[i] = simd_sum(outputs[simd_gid * BD + simd_lid] * factor);
    // Skip the normalization when no key contributed (denominator is 0).
    o[i] = sum_exp_score == 0 ? o[i] : (o[i] / sum_exp_score);
    threadgroup_barrier(mem_flags::mem_threadgroup);
  }

  // And write the output
  if (simd_lid == 0) {
    for (int i = 0; i < v_per_thread; i++) {
      out[i] = static_cast<T>(o[i]);
    }
  }
}

// First pass of the two-pass vector attention (SDPA) kernel.
//
// Keys/values are visited with a stride of `blocks` starting at `block_idx`
// (tid.z), so every z-slice of the grid processes an interleaved subset of the
// N keys. For that subset the kernel maintains a running softmax (maximum
// score and sum of exponentials) and an un-normalized weighted sum of the
// values, and stores them in `maxs`, `sums` and `out` respectively.
// NOTE(review): the partial results are presumably merged by
// sdpa_vector_2pass_2 -- confirm against the host dispatch code.
//
// `has_mask`, `bool_mask`, `float_mask` and `has_sinks` are function constants
// (they gate the optional buffer arguments below). `blocks`,
// `query_transposed` and `do_causal` are not declared in this function either
// and must be provided at file scope (NOTE(review): presumably also function
// constants -- confirm).
template <typename T, int D, int V = D>
[[kernel]] void sdpa_vector_2pass_1(
    const device T* queries [[buffer(0)]],
    const device T* keys [[buffer(1)]],
    const device T* values [[buffer(2)]],
    device T* out [[buffer(3)]],
    device float* sums [[buffer(4)]],
    device float* maxs [[buffer(5)]],
    const constant int& N [[buffer(7)]],
    const constant size_t& k_head_stride [[buffer(8)]],
    const constant size_t& k_seq_stride [[buffer(9)]],
    const constant size_t& v_head_stride [[buffer(10)]],
    const constant size_t& v_seq_stride [[buffer(11)]],
    const constant float& scale [[buffer(12)]],
    const device bool* bmask [[buffer(13), function_constant(bool_mask)]],
    const device T* fmask [[buffer(14), function_constant(float_mask)]],
    const constant int& mask_kv_seq_stride
    [[buffer(15), function_constant(has_mask)]],
    const constant int& mask_q_seq_stride
    [[buffer(16), function_constant(has_mask)]],
    const constant int& mask_head_stride
    [[buffer(17), function_constant(has_mask)]],
    const device T* sinks [[buffer(18), function_constant(has_sinks)]],
    uint3 tptg [[threads_per_threadgroup]],
    uint3 tidtg [[thread_position_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 tpg [[threadgroups_per_grid]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  // Each of the BD lanes owns a contiguous slice of the head dimension:
  // qk_per_thread elements of the query/key and v_per_thread of the value.
  constexpr int BD = 32;
  constexpr int qk_per_thread = D / BD;
  constexpr int v_per_thread = V / BD;

  // All accumulation is done in float regardless of T.
  typedef float U;

  thread U q[qk_per_thread];
  thread U o[v_per_thread] = {0};

  // Adjust positions
  // Grid: x = kv head, y = batch, z = block. Threadgroup: y = query head
  // within the kv head group, z = query sequence position.
  const int kv_head_idx = tid.x;
  const int batch_idx = tid.y;
  const int block_idx = tid.z;
  const int gqa_factor = tptg.y;
  const int q_seq_len = tptg.z;
  const int q_seq_idx = tidtg.z;
  const int q_head_idx = gqa_factor * kv_head_idx + tidtg.y;
  const int num_kv_heads = tpg.x;
  const int num_q_heads = num_kv_heads * gqa_factor;
  const int q_batch_head_idx = (batch_idx * num_q_heads + q_head_idx);
  const int o_offset = q_batch_head_idx * q_seq_len + q_seq_idx;
  // With a transposed query the sequence index is the outer dimension.
  const int q_offset =
      query_transposed ? num_q_heads * q_seq_idx + q_batch_head_idx : o_offset;

  queries += q_offset * D + simd_lid * qk_per_thread;

  const int kv_batch_head_idx = batch_idx * num_kv_heads + kv_head_idx;
  keys += kv_batch_head_idx * k_head_stride + block_idx * k_seq_stride +
      simd_lid * qk_per_thread;
  values += kv_batch_head_idx * v_head_stride + block_idx * v_seq_stride +
      simd_lid * v_per_thread;
  // The partial outputs hold `blocks` vectors of length V per query.
  out += o_offset * blocks * V + block_idx * V + simd_lid * v_per_thread;
  if (bool_mask) {
    bmask += q_batch_head_idx * mask_head_stride +
        block_idx * mask_kv_seq_stride + q_seq_idx * mask_q_seq_stride;
  }
  if (float_mask) {
    fmask += q_batch_head_idx * mask_head_stride +
        block_idx * mask_kv_seq_stride + q_seq_idx * mask_q_seq_stride;
  }
  // One (sum, max) pair per query and block.
  sums += o_offset * blocks + block_idx;
  maxs += o_offset * blocks + block_idx;

  // Read the query
  // The scale is folded into the query once instead of into every score.
  for (int i = 0; i < qk_per_thread; i++) {
    q[i] = static_cast<U>(scale) * queries[i];
  }

  U max_score = Limits<U>::finite_min;
  U sum_exp_score = 0;
  // The sink contributes exp(sink - sink) = 1 to the denominator; only block 0
  // accounts for it so it is counted once per query.
  if (has_sinks && block_idx == 0) {
    max_score = static_cast<U>(sinks[q_head_idx]);
    sum_exp_score = 1;
  }

  // For each key
  for (int i = block_idx; i < N; i += blocks) {
    bool use_key = true;
    if (do_causal) {
      // The queries are aligned to the end of the N keys.
      use_key = i <= (N - q_seq_len + int(q_seq_idx));
    } else if (bool_mask) {
      use_key = bmask[0];
    } else if (float_mask) {
      // Skip keys whose additive mask is below the finite range of T.
      use_key = (fmask[0] >= Limits<T>::finite_min);
    }
    if (use_key) {
      // Compute the i-th score
      // Note: the inner `i` shadows the key index of the enclosing loop.
      U score = 0;
      for (int i = 0; i < qk_per_thread; i++) {
        score += q[i] * keys[i];
      }
      // Combine the per-lane partial dot products.
      score = simd_sum(score);

      if (float_mask) {
        score += fmask[0];
      }

      // Update the accumulators
      // Rescale the running sum/output whenever the maximum grows.
      U new_max = max(max_score, score);
      U factor = fast::exp(max_score - new_max);
      U exp_score = fast::exp(score - new_max);

      max_score = new_max;
      sum_exp_score = sum_exp_score * factor + exp_score;

      // Update the output accumulator
      for (int i = 0; i < v_per_thread; i++) {
        o[i] = o[i] * factor + exp_score * values[i];
      }
    }

    // Move the pointers to the next kv
    keys += blocks * int(k_seq_stride);
    values += blocks * int(v_seq_stride);
    if (bool_mask) {
      bmask += blocks * mask_kv_seq_stride;
    }
    if (float_mask) {
      fmask += blocks * mask_kv_seq_stride;
    }
  }

  // Write the sum and max and outputs
  // Every lane holds the same sum/max after simd_sum, so one writer suffices.
  if (simd_lid == 0) {
    sums[0] = sum_exp_score;
    maxs[0] = max_score;
  }

  // The output is left un-normalized; each lane writes its own slice.
  for (int i = 0; i < v_per_thread; i++) {
    out[i] = static_cast<T>(o[i]);
  }
}

// Second pass of the two-pass vector attention kernel: merges `blocks` partial
// results per query into the final output row.
//
// For every query (indexed by tid.x / tid.y) the inputs are `blocks` partial
// output vectors of length D (`partials`) together with the matching running
// maxima (`maxs`) and sums of exponentials (`sums`).
//
// The loops below iterate `blocks / BN` times, so any remainder of `blocks`
// modulo BN is not visited (NOTE(review): presumably the host guarantees that
// `blocks` is a multiple of 32 and launches BN simdgroups per threadgroup --
// confirm).
template <typename T, int D>
[[kernel]] void sdpa_vector_2pass_2(
    const device T* partials [[buffer(0)]],
    const device float* sums [[buffer(1)]],
    const device float* maxs [[buffer(2)]],
    device T* out [[buffer(3)]],
    const constant int& blocks [[buffer(4)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 tpg [[threadgroups_per_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  constexpr int BN = 32;
  constexpr int BD = 32;
  constexpr int elem_per_thread = D / BD;

  typedef float U;

  thread U o[elem_per_thread] = {0};
  // Scratch tile used to transpose the per-simdgroup partial outputs.
  threadgroup U outputs[BN * BD];

  // Adjust positions
  const int head_idx = tid.x;
  const int q_seq_idx = tid.y;
  const int q_offset = head_idx * tpg.y + q_seq_idx;
  // Simdgroup g starts at partial block g; lane l owns elements
  // [l * elem_per_thread, (l + 1) * elem_per_thread) of that block.
  partials += q_offset * blocks * D + simd_gid * D + simd_lid * elem_per_thread;
  sums += q_offset * blocks;
  maxs += q_offset * blocks;
  out += q_offset * D + simd_gid * elem_per_thread;

  // Set defaults
  U sum_exp_score = 0.0;
  U max_score = Limits<U>::finite_min;

  // Reduce the max
  // Lane l scans blocks l, l + BN, ...; simd_max then yields the global max.
  for (int b = 0; b < blocks / BN; ++b) {
    max_score = max(max_score, maxs[simd_lid + BN * b]);
  }
  max_score = simd_max(max_score);

  // Reduce the denominator: rescale every block's sum to the global max
  for (int b = 0; b < blocks / BN; ++b) {
    U factor = fast::exp(maxs[simd_lid + BN * b] - max_score);
    sum_exp_score += factor * sums[simd_lid + BN * b];
  }
  sum_exp_score = simd_sum(sum_exp_score);

  // Reduce the sum exp and partials
  // Simdgroup g accumulates blocks g, g + BN, ...; the pointers advance by BN
  // blocks per iteration so `maxs[simd_gid]` always refers to the block whose
  // partial output is being read.
  for (int b = 0; b < blocks / BN; ++b) {
    U factor = fast::exp(maxs[simd_gid] - max_score);

    // Update the output accumulator
    for (int i = 0; i < elem_per_thread; i++) {
      o[i] += factor * static_cast<U>(partials[i]);
    }
    maxs += BN;
    sums += BN;
    partials += BN * D;
  }

  // Use shared memory to transpose and reduce the final block
  // After the transpose the lanes of one simdgroup hold the contributions of
  // all simdgroups to the same output elements, so simd_sum finishes the
  // reduction. The second barrier protects `outputs` before it is reused.
  for (int i = 0; i < elem_per_thread; i++) {
    outputs[simd_lid * BD + simd_gid] = o[i];
    threadgroup_barrier(mem_flags::mem_threadgroup);
    o[i] = simd_sum(outputs[simd_gid * BD + simd_lid]);
    // A zero denominator leaves the value un-normalized instead of dividing.
    o[i] = sum_exp_score == 0 ? o[i] : (o[i] / sum_exp_score);
    threadgroup_barrier(mem_flags::mem_threadgroup);
  }

  // And write the output
  if (simd_lid == 0) {
    for (int i = 0; i < elem_per_thread; i++) {
      out[i] = static_cast<T>(o[i]);
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/softmax.h
================================================
// Copyright © 2023-2024 Apple Inc.

// Exponential used by the softmax kernels.
template <typename T>
inline T softmax_exp(T x) {
  // The argument lies in (-oo, 0] and the result is divided by the sum of all
  // exponentials afterwards, so the fast (reduced precision) variant is
  // accurate enough.
  const T exp_x = fast::exp(x);
  return exp_x;
}

// Softmax over one row per threadgroup where the whole row fits in registers:
// thread `lid` handles elements [lid * N_READS, (lid + 1) * N_READS) of row
// `gid`. The computation is carried out in AccT and converted back to T when
// the result is written.
template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
[[kernel]] void softmax_single_row(
    const device T* in,
    device T* out,
    constant int& axis_size,
    uint gid [[threadgroup_position_in_grid]],
    uint _lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  // Signed copy so the bounds checks against axis_size compare ints.
  int lid = _lid;

  constexpr int SIMD_SIZE = 32;

  // One slot per simdgroup for the cross-simdgroup reductions.
  threadgroup AccT local_max[SIMD_SIZE];
  threadgroup AccT local_normalizer[SIMD_SIZE];

  AccT ld[N_READS];

  in += gid * size_t(axis_size) + lid * N_READS;
  if (lid * N_READS + N_READS <= axis_size) {
    // Fast path: all N_READS elements are in range.
    for (int i = 0; i < N_READS; i++) {
      ld[i] = AccT(in[i]);
    }
  } else {
    // Tail: pad out-of-range elements with Limits<AccT>::min.
    for (int i = 0; i < N_READS; i++) {
      ld[i] =
          ((lid * N_READS + i) < axis_size) ? AccT(in[i]) : Limits<AccT>::min;
    }
  }
  // Give every slot a neutral value so slots of simdgroups that do not exist
  // do not disturb the reductions below.
  if (simd_group_id == 0) {
    local_max[simd_lane_id] = Limits<AccT>::min;
    local_normalizer[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Get the max
  AccT maxval = Limits<AccT>::finite_min;
  for (int i = 0; i < N_READS; i++) {
    maxval = (maxval < ld[i]) ? ld[i] : maxval;
  }
  // Stage 1: reduce within the simdgroup, lane 0 publishes the result.
  maxval = simd_max(maxval);
  if (simd_lane_id == 0) {
    local_max[simd_group_id] = maxval;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Stage 2: the first simdgroup reduces the per-simdgroup maxima.
  if (simd_group_id == 0) {
    maxval = simd_max(local_max[simd_lane_id]);
    if (simd_lane_id == 0) {
      local_max[0] = maxval;
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Broadcast the row maximum to every thread.
  maxval = local_max[0];

  // Compute exp(x_i - maxval) and store the partial sums in local_normalizer
  AccT normalizer = 0;
  for (int i = 0; i < N_READS; i++) {
    AccT exp_x = softmax_exp(ld[i] - maxval);
    ld[i] = exp_x;
    normalizer += exp_x;
  }
  // Same two-stage reduction as for the maximum, now for the sum.
  normalizer = simd_sum(normalizer);
  if (simd_lane_id == 0) {
    local_normalizer[simd_group_id] = normalizer;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (simd_group_id == 0) {
    normalizer = simd_sum(local_normalizer[simd_lane_id]);
    if (simd_lane_id == 0) {
      local_normalizer[0] = normalizer;
    }
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  // Store the reciprocal so the write-out loop multiplies instead of divides.
  normalizer = 1 / local_normalizer[0];

  // Normalize and write to the output
  out += gid * size_t(axis_size) + lid * N_READS;
  if (lid * N_READS + N_READS <= axis_size) {
    for (int i = 0; i < N_READS; i++) {
      out[i] = T(ld[i] * normalizer);
    }
  } else {
    // Tail: only elements inside the row are written.
    for (int i = 0; i < N_READS; i++) {
      if ((lid * N_READS + i) < axis_size) {
        out[i] = T(ld[i] * normalizer);
      }
    }
  }
}

// Softmax over one row per threadgroup for rows that do not fit in registers:
// the threadgroup sweeps the row in chunks of N_READS * lsize elements.
//
// The first sweep computes the row maximum and the normalizer together with an
// online softmax (the partial normalizer is rescaled whenever the running
// maximum changes). The second sweep re-reads the input and writes
// exp(x - max) / normalizer. Accumulation happens in AccT.
template <typename T, typename AccT = T, int N_READS = SOFTMAX_N_READS>
[[kernel]] void softmax_looped(
    const device T* in,
    device T* out,
    constant int& axis_size,
    uint gid [[threadgroup_position_in_grid]],
    uint lid [[thread_position_in_threadgroup]],
    uint lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  in += gid * size_t(axis_size);

  constexpr int SIMD_SIZE = 32;

  // One slot per simdgroup for the cross-simdgroup reductions.
  threadgroup AccT local_max[SIMD_SIZE];
  threadgroup AccT local_normalizer[SIMD_SIZE];

  // The final reductions read all SIMD_SIZE slots, but only the simdgroups
  // that actually exist write theirs. Give every slot a neutral value first
  // (the same values softmax_single_row uses) so a threadgroup with fewer than
  // SIMD_SIZE simdgroups never reduces over uninitialized threadgroup memory.
  if (simd_group_id == 0) {
    local_max[simd_lane_id] = Limits<AccT>::min;
    local_normalizer[simd_lane_id] = 0;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Get the max and the normalizer in one go
  AccT prevmax;
  AccT maxval = Limits<AccT>::finite_min;
  AccT normalizer = 0;
  for (int r = 0; r < static_cast<int>(ceildiv(axis_size, N_READS * lsize));
       r++) {
    int offset = r * lsize * N_READS + lid * N_READS;
    AccT vals[N_READS];
    if (offset + N_READS <= axis_size) {
      for (int i = 0; i < N_READS; i++) {
        vals[i] = AccT(in[offset + i]);
      }
    } else {
      // Tail: pad out-of-range elements with Limits<AccT>::min.
      for (int i = 0; i < N_READS; i++) {
        vals[i] =
            (offset + i < axis_size) ? AccT(in[offset + i]) : Limits<AccT>::min;
      }
    }
    prevmax = maxval;
    for (int i = 0; i < N_READS; i++) {
      maxval = (maxval < vals[i]) ? vals[i] : maxval;
    }
    // Rescale what was accumulated so far to the new maximum.
    normalizer *= softmax_exp(prevmax - maxval);
    for (int i = 0; i < N_READS; i++) {
      normalizer += softmax_exp(vals[i] - maxval);
    }
  }
  // Now we got partial normalizer of N_READS * ceildiv(axis_size, N_READS *
  // lsize) parts. We need to combine them.
  //    1. We start by finding the max across simd groups
  //    2. We then change the partial normalizers to account for a possible
  //       change in max
  //    3. We sum all normalizers
  prevmax = maxval;
  maxval = simd_max(maxval);
  normalizer *= softmax_exp(prevmax - maxval);
  normalizer = simd_sum(normalizer);

  // Now the normalizer and max value is correct for each simdgroup. We write
  // them shared memory and combine them.
  prevmax = maxval;
  if (simd_lane_id == 0) {
    local_max[simd_group_id] = maxval;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  maxval = simd_max(local_max[simd_lane_id]);
  normalizer *= softmax_exp(prevmax - maxval);
  if (simd_lane_id == 0) {
    local_normalizer[simd_group_id] = normalizer;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  normalizer = simd_sum(local_normalizer[simd_lane_id]);
  // Store the reciprocal so the write-out loop multiplies instead of divides.
  normalizer = 1 / normalizer;

  // Finally given the normalizer and max value we can directly write the
  // softmax output
  out += gid * size_t(axis_size);
  for (int r = 0; r < static_cast<int>(ceildiv(axis_size, N_READS * lsize));
       r++) {
    int offset = r * lsize * N_READS + lid * N_READS;
    if (offset + N_READS <= axis_size) {
      for (int i = 0; i < N_READS; i++) {
        out[offset + i] = T(softmax_exp(in[offset + i] - maxval) * normalizer);
      }
    } else {
      // Tail: only elements inside the row are written.
      for (int i = 0; i < N_READS; i++) {
        if (offset + i < axis_size) {
          out[offset + i] =
              T(softmax_exp(in[offset + i] - maxval) * normalizer);
        }
      }
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/softmax.metal
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_common>
#include <metal_simdgroup>

using namespace metal;

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/softmax.h"

// Instantiates both softmax kernels for `itype` under the host names
// "block_softmax_<name>" and "looped_softmax_<name>". The accumulation type
// is left at its default, i.e. the same as `itype`.
#define instantiate_softmax(name, itype)                                \
  instantiate_kernel("block_softmax_" #name, softmax_single_row, itype) \
  instantiate_kernel("looped_softmax_" #name, softmax_looped, itype)

// Same as above but with float as the accumulation type (AccT); the host names
// carry a "_precise_" infix.
#define instantiate_softmax_precise(name, itype)                                       \
  instantiate_kernel("block_softmax_precise_" #name, softmax_single_row, itype, float) \
  instantiate_kernel("looped_softmax_precise_" #name, softmax_looped, itype, float)

// float32 already accumulates in float, so only the 16-bit types get a
// precise variant.
instantiate_softmax(float32, float)
instantiate_softmax(float16, half)
instantiate_softmax(bfloat16, bfloat16_t)
instantiate_softmax_precise(float16, half)
instantiate_softmax_precise(bfloat16, bfloat16_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/sort.h
================================================
// Copyright © 2023-2024 Apple Inc.

// Qualifier set for compile-time constants declared inside the sort structs.
#define MLX_MTL_CONST static constant constexpr const
// Placed in front of a loop to request full unrolling from clang.
#define MLX_MTL_LOOP_UNROLL _Pragma("clang loop unroll(full)")

using namespace metal;

// Based on GPU merge sort algorithm at
// https://github.com/NVIDIA/cccl/tree/main/cub/cub

///////////////////////////////////////////////////////////////////////////////
// Thread-level sort
///////////////////////////////////////////////////////////////////////////////

// Exchanges two values that live in the thread address space.
template <typename T>
METAL_FUNC void thread_swap(thread T& a, thread T& b) {
  const T old_a = a;
  a = b;
  b = old_a;
}

// Value exposed as LessThan<T>::init, which the sort kernels use to fill slots
// that lie beyond the end of the sorted axis. For non floating point types it
// is Limits<T>::max.
template <typename T, typename = void>
struct Init {
  static constexpr constant T v = Limits<T>::max;
};

// Floating point types are padded with a quiet NaN instead; LessThan orders
// NaNs after every number.
template <typename T>
struct Init<T, metal::enable_if_t<metal::is_floating_point_v<T>>> {
  static constexpr constant T v = metal::numeric_limits<T>::quiet_NaN();
};

// Strict "a sorts before b" comparator. For floating point and complex types
// NaNs are ordered after every number and are never before one another.
template <typename T>
struct LessThan {
  // Padding value for slots past the end of the data.
  static constexpr constant T init = Init<T>::v;

  METAL_FUNC bool operator()(T a, T b) const {
    if constexpr (
        metal::is_floating_point_v<T> || metal::is_same_v<T, complex64_t>) {
      const bool a_is_nan = isnan(a);
      const bool b_is_nan = isnan(b);
      if (b_is_nan) {
        // A number precedes a NaN; a NaN does not precede another NaN.
        return !a_is_nan;
      }
      if (a_is_nan) {
        // A NaN never precedes a number.
        return false;
      }
    }
    return a < b;
  }
};

// Sorts the N_PER_THREAD values held by a single thread with an odd-even
// transposition network; the indices follow the values when ARG_SORT is set.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short N_PER_THREAD,
    typename CompareOp>
struct ThreadSort {
  static METAL_FUNC void sort(
      thread ValT (&vals)[N_PER_THREAD],
      thread IdxT (&idxs)[N_PER_THREAD]) {
    CompareOp before;
    MLX_MTL_LOOP_UNROLL
    for (short pass = 0; pass < N_PER_THREAD; ++pass) {
      // Even passes compare pairs (0,1), (2,3), ...; odd passes (1,2), (3,4)...
      MLX_MTL_LOOP_UNROLL
      for (short lo = pass & 1; lo < N_PER_THREAD - 1; lo += 2) {
        const short hi = lo + 1;
        if (before(vals[hi], vals[lo])) {
          thread_swap(vals[hi], vals[lo]);
          if (ARG_SORT) {
            thread_swap(idxs[hi], idxs[lo]);
          }
        }
      }
    }
  }
};

///////////////////////////////////////////////////////////////////////////////
// Threadgroup-level sort
///////////////////////////////////////////////////////////////////////////////

// Merge sort of BLOCK_THREADS * N_PER_THREAD elements held in threadgroup
// memory. Every thread first sorts its own N_PER_THREAD elements, then sorted
// runs are merged pairwise, doubling the run length each round, until a single
// run covers the whole block.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp>
struct BlockMergeSort {
  using thread_sort_t =
      ThreadSort<ValT, IdxT, ARG_SORT, N_PER_THREAD, CompareOp>;
  // Binary search for the split of the sorted runs As / Bs at merged position
  // `sort_md`: returns how many of the first `sort_md` merged elements are
  // taken from As (the remaining ones come from Bs).
  static METAL_FUNC int merge_partition(
      const threadgroup ValT* As,
      const threadgroup ValT* Bs,
      short A_sz,
      short B_sz,
      short sort_md) {
    CompareOp op;

    // The split cannot take more than B_sz elements from Bs or more than A_sz
    // from As.
    short A_st = max(0, sort_md - B_sz);
    short A_ed = min(sort_md, A_sz);

    while (A_st < A_ed) {
      short md = A_st + (A_ed - A_st) / 2;
      auto a = As[md];
      auto b = Bs[sort_md - 1 - md];

      // b sorts before a: too many elements were taken from As.
      if (op(b, a)) {
        A_ed = md;
      } else {
        A_st = md + 1;
      }
    }

    return A_ed;
  }

  // Sequentially merges the heads of As / Bs into the N_PER_THREAD registers
  // of the calling thread. Exhausted inputs are treated as CompareOp::init.
  static METAL_FUNC void merge_step(
      const threadgroup ValT* As,
      const threadgroup ValT* Bs,
      const threadgroup IdxT* As_idx,
      const threadgroup IdxT* Bs_idx,
      short A_sz,
      short B_sz,
      thread ValT (&vals)[N_PER_THREAD],
      thread IdxT (&idxs)[N_PER_THREAD]) {
    CompareOp op;
    short a_idx = 0;
    short b_idx = 0;

    for (int i = 0; i < N_PER_THREAD; ++i) {
      // Guarded reads: never dereference past the end of either run.
      auto a = (a_idx < A_sz) ? As[a_idx] : ValT(CompareOp::init);
      auto b = (b_idx < B_sz) ? Bs[b_idx] : ValT(CompareOp::init);
      // Take from Bs only if it has elements left and either As is exhausted
      // or b sorts strictly before a (ties go to As).
      bool pred = (b_idx < B_sz) && (a_idx >= A_sz || op(b, a));

      vals[i] = pred ? b : a;
      if (ARG_SORT) {
        if (pred) {
          idxs[i] = Bs_idx[b_idx];
        } else {
          // Both runs exhausted: store a placeholder index.
          idxs[i] = (a_idx < A_sz) ? As_idx[a_idx] : IdxT(0);
        }
      }

      b_idx += short(pred);
      a_idx += short(!pred);
    }
  }

  // Sorts the block in place. On return tgp_vals (and tgp_idxs when ARG_SORT
  // is set) hold the sorted block; the caller has to issue a barrier before
  // reading slots written by other threads.
  static METAL_FUNC void sort(
      threadgroup ValT* tgp_vals [[threadgroup(0)]],
      threadgroup IdxT* tgp_idxs [[threadgroup(1)]],
      int size_sorted_axis,
      uint3 lid [[thread_position_in_threadgroup]]) {
    // Get thread location
    int idx = lid.x * N_PER_THREAD;

    // Load from shared memory
    thread ValT thread_vals[N_PER_THREAD];
    thread IdxT thread_idxs[N_PER_THREAD];
    for (int i = 0; i < N_PER_THREAD; ++i) {
      thread_vals[i] = tgp_vals[idx + i];
      if (ARG_SORT) {
        thread_idxs[i] = tgp_idxs[idx + i];
      }
    }

    // Per thread sort
    // Skipped when the first element of the slice is already past
    // size_sorted_axis.
    if (idx < size_sorted_axis) {
      thread_sort_t::sort(thread_vals, thread_idxs);
    }

    // Do merges using threadgroup memory
    // `merge_threads` is the number of threads cooperating on one merge.
    for (int merge_threads = 2; merge_threads <= BLOCK_THREADS;
         merge_threads *= 2) {
      // Update threadgroup memory
      // First barrier: everyone finished reading the previous round's data.
      // Second barrier: the new runs are visible before they are merged.
      threadgroup_barrier(mem_flags::mem_threadgroup);
      for (int i = 0; i < N_PER_THREAD; ++i) {
        tgp_vals[idx + i] = thread_vals[i];
        if (ARG_SORT) {
          tgp_idxs[idx + i] = thread_idxs[i];
        }
      }
      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Find location in merge step
      int merge_group = lid.x / merge_threads;
      int merge_lane = lid.x % merge_threads;

      int sort_sz = N_PER_THREAD * merge_threads;
      int sort_st = N_PER_THREAD * merge_threads * merge_group;

      // As = tgp_vals[A_st:A_ed] is sorted
      // Bs = tgp_vals[B_st:B_ed] is sorted
      int A_st = sort_st;
      int A_ed = sort_st + sort_sz / 2;
      int B_st = sort_st + sort_sz / 2;
      int B_ed = sort_st + sort_sz;

      const threadgroup ValT* As = tgp_vals + A_st;
      const threadgroup ValT* Bs = tgp_vals + B_st;
      int A_sz = A_ed - A_st;
      int B_sz = B_ed - B_st;

      // Find a partition of merge elements
      //  Ci = merge(As[partition:], Bs[sort_md - partition:])
      //       of size N_PER_THREAD for each merge lane i
      //  C = [Ci] is sorted
      int sort_md = N_PER_THREAD * merge_lane;
      int partition = merge_partition(As, Bs, A_sz, B_sz, sort_md);

      // Skip the elements that belong to the lanes before this one.
      As += partition;
      Bs += sort_md - partition;

      A_sz -= partition;
      B_sz -= sort_md - partition;

      const threadgroup IdxT* As_idx =
          ARG_SORT ? tgp_idxs + A_st + partition : nullptr;
      const threadgroup IdxT* Bs_idx =
          ARG_SORT ? tgp_idxs + B_st + sort_md - partition : nullptr;

      // Merge starting at the partition and store results in thread registers
      merge_step(As, Bs, As_idx, Bs_idx, A_sz, B_sz, thread_vals, thread_idxs);
    }

    // Write out to shared memory
    threadgroup_barrier(mem_flags::mem_threadgroup);
    for (int i = 0; i < N_PER_THREAD; ++i) {
      tgp_vals[idx + i] = thread_vals[i];
      if (ARG_SORT) {
        tgp_idxs[idx + i] = thread_idxs[i];
      }
    }
  }
};

///////////////////////////////////////////////////////////////////////////////
// Kernel sort
///////////////////////////////////////////////////////////////////////////////

// Single-block sort: one threadgroup loads a whole segment into threadgroup
// memory, sorts it with BlockMergeSort and writes either the sorted values or
// (ARG_SORT) the permutation indices.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp = LessThan<T>>
struct KernelMergeSort {
  using ValT = T;
  using IdxT = uint;
  using block_merge_sort_t = BlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;

  // Number of elements one threadgroup can sort.
  MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;

  static METAL_FUNC void block_sort(
      const device T* inp,
      device U* out,
      const constant int& size_sorted_axis,
      const constant int& in_stride_sorted_axis,
      const constant int& out_stride_sorted_axis,
      const constant int& in_stride_segment_axis,
      const constant int& out_stride_segment_axis,
      threadgroup ValT* tgp_vals,
      threadgroup IdxT* tgp_idxs,
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]]) {
    // The y coordinate of the threadgroup selects the segment to sort.
    inp += tid.y * in_stride_segment_axis;
    out += tid.y * out_stride_segment_axis;

    // Stage the segment in threadgroup memory; slots past the end of the axis
    // receive the comparator's padding value.
    for (short slot = lid.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      if (slot < size_sorted_axis) {
        tgp_vals[slot] = inp[slot * in_stride_sorted_axis];
      } else {
        tgp_vals[slot] = ValT(CompareOp::init);
      }
      if (ARG_SORT) {
        tgp_idxs[slot] = slot;
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Sort the staged block in place.
    block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Emit the valid prefix of the sorted block.
    for (int pos = lid.x; pos < size_sorted_axis; pos += BLOCK_THREADS) {
      const int dst = pos * out_stride_sorted_axis;
      if (ARG_SORT) {
        out[dst] = tgp_idxs[pos];
      } else {
        out[dst] = tgp_vals[pos];
      }
    }
  }
};

// Kernel entry point for the single-block sort of segments that are addressed
// with one stride along the segment axis.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort(
    const device T* inp [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant int& size_sorted_axis [[buffer(2)]],
    const constant int& in_stride_sorted_axis [[buffer(3)]],
    const constant int& out_stride_sorted_axis [[buffer(4)]],
    const constant int& in_stride_segment_axis [[buffer(5)]],
    const constant int& out_stride_segment_axis [[buffer(6)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using merge_sort_t =
      KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
  using ValT = typename merge_sort_t::ValT;
  using IdxT = typename merge_sort_t::IdxT;

  if (!ARG_SORT) {
    // Plain sort: no index buffer is needed in threadgroup memory.
    threadgroup ValT shared_vals[merge_sort_t::N_PER_BLOCK];
    merge_sort_t::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        in_stride_segment_axis,
        out_stride_segment_axis,
        shared_vals,
        nullptr,
        tid,
        lid);
  } else {
    // Argsort: the indices travel along with the values.
    threadgroup ValT shared_vals[merge_sort_t::N_PER_BLOCK];
    threadgroup IdxT shared_idxs[merge_sort_t::N_PER_BLOCK];
    merge_sort_t::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        in_stride_segment_axis,
        out_stride_segment_axis,
        shared_vals,
        shared_idxs,
        tid,
        lid);
  }
}

// Zero living in the constant address space. block_sort_nc passes it for the
// `const constant int&` segment-stride parameters of
// KernelMergeSort::block_sort because it applies the segment offsets itself.
constant constexpr const int zero_helper = 0;

// Kernel entry point for the single-block sort of segments whose location is
// described by a shape / strides pair instead of one segment stride.
template <
    typename T,
    typename U,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort_nc(
    const device T* inp [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant int& size_sorted_axis [[buffer(2)]],
    const constant int& in_stride_sorted_axis [[buffer(3)]],
    const constant int& out_stride_sorted_axis [[buffer(4)]],
    const constant int& nc_dim [[buffer(5)]],
    const constant int* nc_shape [[buffer(6)]],
    const constant int64_t* in_nc_strides [[buffer(7)]],
    const constant int64_t* out_nc_strides [[buffer(8)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using merge_sort_t =
      KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
  using ValT = typename merge_sort_t::ValT;
  using IdxT = typename merge_sort_t::IdxT;

  // Resolve the segment index (tid.y) to element offsets here; the helper is
  // then called with zero segment strides so the offset is not applied twice.
  const auto in_offset = elem_to_loc(tid.y, nc_shape, in_nc_strides, nc_dim);
  const auto out_offset = elem_to_loc(tid.y, nc_shape, out_nc_strides, nc_dim);
  inp += in_offset;
  out += out_offset;

  if (!ARG_SORT) {
    // Plain sort: no index buffer is needed in threadgroup memory.
    threadgroup ValT shared_vals[merge_sort_t::N_PER_BLOCK];
    merge_sort_t::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        zero_helper,
        zero_helper,
        shared_vals,
        nullptr,
        tid,
        lid);
  } else {
    // Argsort: the indices travel along with the values.
    threadgroup ValT shared_vals[merge_sort_t::N_PER_BLOCK];
    threadgroup IdxT shared_idxs[merge_sort_t::N_PER_BLOCK];
    merge_sort_t::block_sort(
        inp,
        out,
        size_sorted_axis,
        in_stride_sorted_axis,
        out_stride_sorted_axis,
        zero_helper,
        zero_helper,
        shared_vals,
        shared_idxs,
        tid,
        lid);
  }
}

// Building blocks of the multi-block sort: a per-block sort that always keeps
// values and indices, and the merge-path search on device memory.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp = LessThan<ValT>>
struct KernelMultiBlockMergeSort {
  using block_merge_sort_t = BlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;

  // Number of elements one threadgroup can sort.
  MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;

  // Sorts block `tid.x` of the axis and writes values and indices contiguously.
  static METAL_FUNC void block_sort(
      const device ValT* inp,
      device ValT* out_vals,
      device IdxT* out_idxs,
      const constant int& size_sorted_axis,
      const constant int& stride_sorted_axis,
      threadgroup ValT* tgp_vals,
      threadgroup IdxT* tgp_idxs,
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]]) {
    // tid.x selects which block of the sorted axis this threadgroup handles.
    const int block_start = tid.x * N_PER_BLOCK;

    // Stage the block in threadgroup memory; slots past the end of the axis
    // receive the comparator's padding value. Indices are global positions.
    for (short slot = lid.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      const int src = block_start + slot;
      if (src < size_sorted_axis) {
        tgp_vals[slot] = inp[src * stride_sorted_axis];
      } else {
        tgp_vals[slot] = ValT(CompareOp::init);
      }
      tgp_idxs[slot] = src;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Sort the staged block in place.
    block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Emit the part of the block that lies inside the axis.
    for (int slot = lid.x; slot < N_PER_BLOCK; slot += BLOCK_THREADS) {
      const int dst = block_start + slot;
      if (dst >= size_sorted_axis) {
        continue;
      }
      out_vals[dst] = tgp_vals[slot];
      out_idxs[dst] = tgp_idxs[slot];
    }
  }

  // Binary search for the split of the sorted runs As / Bs at merged position
  // `sort_md`: returns how many of the first `sort_md` merged elements come
  // from As.
  static METAL_FUNC int merge_partition(
      const device ValT* As,
      const device ValT* Bs,
      int A_sz,
      int B_sz,
      int sort_md) {
    CompareOp before;

    int lo = max(0, sort_md - B_sz);
    int hi = min(sort_md, A_sz);

    while (lo < hi) {
      const int mid = lo + (hi - lo) / 2;
      auto from_a = As[mid];
      auto from_b = Bs[sort_md - 1 - mid];

      if (before(from_b, from_a)) {
        // Too many elements were taken from As.
        hi = mid;
      } else {
        lo = mid + 1;
      }
    }

    return hi;
  }
};

// First stage of the multi-block sort: every threadgroup sorts one block of
// one row and stores values and indices in contiguous per-row buffers.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void mb_block_sort(
    const device ValT* inp [[buffer(0)]],
    device ValT* out_vals [[buffer(1)]],
    device IdxT* out_idxs [[buffer(2)]],
    const constant int& size_sorted_axis [[buffer(3)]],
    const constant int& stride_sorted_axis [[buffer(4)]],
    const constant int& nc_dim [[buffer(5)]],
    const constant int* nc_shape [[buffer(6)]],
    const constant int64_t* nc_strides [[buffer(7)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using mb_sort_t = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD>;

  // The input row (tid.y) is located through shape / strides, while the
  // outputs are dense with size_sorted_axis elements per row.
  const auto row_offset = elem_to_loc(tid.y, nc_shape, nc_strides, nc_dim);
  inp += row_offset;
  out_vals += tid.y * size_sorted_axis;
  out_idxs += tid.y * size_sorted_axis;

  threadgroup ValT shared_vals[mb_sort_t::N_PER_BLOCK];
  threadgroup IdxT shared_idxs[mb_sort_t::N_PER_BLOCK];

  mb_sort_t::block_sort(
      inp,
      out_vals,
      out_idxs,
      size_sorted_axis,
      stride_sorted_axis,
      shared_vals,
      shared_idxs,
      tid,
      lid);
}

// Partition stage of the multi-block merge: for every block boundary
// i in [0, n_blocks] of row tid.y, finds where the merge of the two sorted
// runs A and B of the current merge group has to be split so that output
// block i starts there, and stores the resulting position in A.
//
// `merge_tiles` is the number of blocks covered by one merge group (A and B
// together). `dev_idxs` is only offset, never read, in this kernel.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD>
[[kernel]] void mb_block_partition(
    device IdxT* block_partitions [[buffer(0)]],
    const device ValT* dev_vals [[buffer(1)]],
    const device IdxT* dev_idxs [[buffer(2)]],
    const constant int& size_sorted_axis [[buffer(3)]],
    const constant int& merge_tiles [[buffer(4)]],
    const constant int& n_blocks [[buffer(5)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 tgp_dims [[threads_per_threadgroup]]) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD>;

  // Every row holds n_blocks + 1 boundaries (the loop below writes indices
  // 0..n_blocks), which is also the row stride mb_block_merge reads with
  // (num_tiles + 1). The threadgroup width must not be used as the row stride:
  // the loop strides by tgp_dims.x precisely because the threadgroup may be
  // narrower than n_blocks + 1, in which case rows would overlap.
  // NOTE(review): assumes the host passes the same block count to both
  // kernels -- confirm.
  block_partitions += tid.y * (n_blocks + 1);
  dev_vals += tid.y * size_sorted_axis;
  dev_idxs += tid.y * size_sorted_axis;

  for (int i = lid.x; i <= n_blocks; i += tgp_dims.x) {
    // Find location in merge step
    int merge_group = i / merge_tiles;
    int merge_lane = i % merge_tiles;

    int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
    int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;

    // A is the first half of the merge group and B the second half, both
    // clamped to the end of the axis.
    int A_st = min(size_sorted_axis, sort_st);
    int A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
    int B_st = A_ed;
    int B_ed = min(size_sorted_axis, B_st + sort_sz / 2);

    // Merged position of this boundary, clamped to the merged length.
    int partition_at = min(B_ed - A_st, sort_kernel::N_PER_BLOCK * merge_lane);
    int partition = sort_kernel::merge_partition(
        dev_vals + A_st,
        dev_vals + B_st,
        A_ed - A_st,
        B_ed - B_st,
        partition_at);

    // Stored as an absolute position within the row.
    block_partitions[i] = A_st + partition;
  }
}

/// Merge pass of the multi-block merge sort.
///
/// Threadgroup (`tid.x`, `tid.y`) produces output tile `tid.x` of row `tid.y`.
/// It reads its slice of the A and B runs (bounded by the partition table),
/// stages the slice in threadgroup memory, lets each thread merge
/// `N_PER_THREAD` consecutive outputs, and writes the tile back to
/// `dev_vals_out` / `dev_idxs_out`.
///
/// @param block_partitions  Partition table, `num_tiles + 1` entries per row.
/// @param dev_vals_in       Input values, `size_sorted_axis` per row.
/// @param dev_idxs_in       Input indices paired with the values.
/// @param dev_vals_out      Output values.
/// @param dev_idxs_out      Output indices.
/// @param size_sorted_axis  Number of elements in each row.
/// @param merge_tiles       Number of tiles spanned by one merge group.
/// @param num_tiles         Number of tiles per row.
template <
    typename ValT,
    typename IdxT,
    bool ARG_SORT,
    short BLOCK_THREADS,
    short N_PER_THREAD,
    typename CompareOp = LessThan<ValT>>
[[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void
mb_block_merge(
    const device IdxT* block_partitions [[buffer(0)]],
    const device ValT* dev_vals_in [[buffer(1)]],
    const device IdxT* dev_idxs_in [[buffer(2)]],
    device ValT* dev_vals_out [[buffer(3)]],
    device IdxT* dev_idxs_out [[buffer(4)]],
    const constant int& size_sorted_axis [[buffer(5)]],
    const constant int& merge_tiles [[buffer(6)]],
    const constant int& num_tiles [[buffer(7)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  using sort_kernel = KernelMultiBlockMergeSort<
      ValT,
      IdxT,
      ARG_SORT,
      BLOCK_THREADS,
      N_PER_THREAD,
      CompareOp>;
  using block_sort_t = typename sort_kernel::block_merge_sort_t;

  // Step every buffer to the row owned by this threadgroup.
  const uint row = tid.y;
  block_partitions += row * (num_tiles + 1);
  dev_vals_in += row * size_sorted_axis;
  dev_idxs_in += row * size_sorted_axis;
  dev_vals_out += row * size_sorted_axis;
  dev_idxs_out += row * size_sorted_axis;

  // Locate this tile within its merge group.
  const int tile = tid.x;
  const int group = tile / merge_tiles;
  const int group_st = sort_kernel::N_PER_BLOCK * merge_tiles * group;
  const int group_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
  const int tile_off = sort_kernel::N_PER_BLOCK * tile - group_st;

  // A bounds come straight from the partition table; B bounds follow from
  // the tile's diagonal minus the amount taken from A.
  const int diag = 2 * group_st + group_sz / 2 + tile_off;
  int a_begin = block_partitions[tile + 0];
  int a_end = block_partitions[tile + 1];
  int b_begin = min(size_sorted_axis, diag - a_begin);
  int b_end =
      min(size_sorted_axis, diag + sort_kernel::N_PER_BLOCK - a_end);

  // The last tile of a group takes whatever remains of both runs.
  const bool last_in_group = (tile % merge_tiles) == merge_tiles - 1;
  if (last_in_group) {
    a_end = min(size_sorted_axis, group_st + group_sz / 2);
    b_end = min(size_sorted_axis, group_st + group_sz);
  }

  int a_len = a_end - a_begin;
  int b_len = b_end - b_begin;
  int n_valid = a_len + b_len;

  // Gather A followed by B from device memory; slots past the valid range
  // are filled with the comparator's init value and index 0.
  thread ValT thread_vals[N_PER_THREAD];
  thread IdxT thread_idxs[N_PER_THREAD];
  for (int i = 0; i < N_PER_THREAD; i++) {
    const int slot = BLOCK_THREADS * i + lid.x;
    if (slot < n_valid) {
      if (slot < a_len) {
        thread_vals[i] = dev_vals_in[a_begin + slot];
        thread_idxs[i] = dev_idxs_in[a_begin + slot];
      } else {
        thread_vals[i] = dev_vals_in[b_begin + slot - a_len];
        thread_idxs[i] = dev_idxs_in[b_begin + slot - a_len];
      }
    } else {
      thread_vals[i] = CompareOp::init;
      thread_idxs[i] = 0;
    }
  }

  // Stage the gathered elements in threadgroup memory.
  threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
  threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
  threadgroup_barrier(mem_flags::mem_threadgroup);
  for (int i = 0; i < N_PER_THREAD; i++) {
    const int slot = BLOCK_THREADS * i + lid.x;
    tgp_vals[slot] = thread_vals[i];
    tgp_idxs[slot] = thread_idxs[i];
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Each thread owns the diagonal N_PER_THREAD * lid.x of the staged tile and
  // asks merge_partition where that diagonal splits A.
  int diag_local = min(n_valid, N_PER_THREAD * int(lid.x));
  int a_from = block_sort_t::merge_partition(
      tgp_vals, tgp_vals + a_len, a_len, b_len, diag_local);
  int b_from = diag_local - a_from;

  int a_rem = a_len - a_from;
  int b_rem = b_len - b_from;

  // Merge this thread's share of A and B into its registers.
  block_sort_t::merge_step(
      tgp_vals + a_from,
      tgp_vals + a_len + b_from,
      tgp_idxs + a_from,
      tgp_idxs + a_len + b_from,
      a_rem,
      b_rem,
      thread_vals,
      thread_idxs);

  // All reads of the staged tile must finish before it is overwritten with
  // the merged result.
  threadgroup_barrier(mem_flags::mem_threadgroup);
  const int reg_base = lid.x * N_PER_THREAD;
  for (int i = 0; i < N_PER_THREAD; ++i) {
    tgp_vals[reg_base + i] = thread_vals[i];
    tgp_idxs[reg_base + i] = thread_idxs[i];
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Copy the merged tile out, skipping positions past the end of the row.
  const int tile_base = tid.x * sort_kernel::N_PER_BLOCK;
  for (int i = lid.x; i < sort_kernel::N_PER_BLOCK; i += BLOCK_THREADS) {
    const int dst = tile_base + i;
    if (dst >= size_sorted_axis) {
      continue;
    }
    dev_vals_out[dst] = tgp_vals[i];
    dev_idxs_out[dst] = tgp_idxs[i];
  }
}


================================================
FILE: mlx/backend/metal/kernels/sort.metal
================================================
// Copyright © 2023-2024 Apple Inc.

#include <metal_stdlib>

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/sort.h"

// Instantiates the two single-threadgroup sort kernels for one configuration:
// `block_sort` under a host name starting with "c" and `block_sort_nc` under
// the same name starting with "nc". The host name is
//   <c|nc><name>_<itname>_<otname>_bn<bn>_tn<tn>
// NOTE(review): "c"/"nc" presumably mean contiguous / non-contiguous input --
// confirm against the host-side kernel lookup. `instantiate_kernel` is
// expected to come from utils.h.
#define instantiate_block_sort(                                          \
    name, itname, itype, otname, otype, arg_sort, bn, tn)                \
  instantiate_kernel("c" #name "_" #itname "_" #otname "_bn" #bn "_tn" #tn, \
                     block_sort, itype, otype, arg_sort, bn, tn) \
  instantiate_kernel("nc" #name "_" #itname "_" #otname "_bn" #bn "_tn" #tn, \
                     block_sort_nc, itype, otype, arg_sort, bn, tn)

// Argsort flavour: output type is always uint32_t and arg_sort is true.
#define instantiate_arg_block_sort_base(itname, itype, bn, tn) \
  instantiate_block_sort(                                      \
      arg_block_sort, itname, itype, uint32, uint32_t, true, bn, tn)

// Value-sort flavour: output type equals the input type and arg_sort is
// false. The leading underscore in `_block_sort` becomes part of the host
// name (e.g. "c_block_sort_...").
#define instantiate_block_sort_base(itname, itype, bn, tn) \
  instantiate_block_sort(                                  \
      _block_sort, itname, itype, itname, itype, false, bn, tn)

// Both flavours for a given block size, with tn fixed to 4.
#define instantiate_block_sort_tn(itname, itype, bn) \
  instantiate_block_sort_base(itname, itype, bn, 4)  \
  instantiate_arg_block_sort_base(itname, itype, bn, 4)

// Block sizes 32 through 512 for one element type.
#define instantiate_block_sort_bn(itname, itype) \
  instantiate_block_sort_tn(itname, itype, 32)  \
  instantiate_block_sort_tn(itname, itype, 64)  \
  instantiate_block_sort_tn(itname, itype, 128)  \
  instantiate_block_sort_tn(itname, itype, 256)  \
  instantiate_block_sort_tn(itname, itype, 512)

instantiate_block_sort_bn(uint8, uint8_t)
instantiate_block_sort_bn(uint16, uint16_t)
instantiate_block_sort_bn(uint32, uint32_t)
instantiate_block_sort_bn(int8, int8_t)
instantiate_block_sort_bn(int16, int16_t)
instantiate_block_sort_bn(int32, int32_t)
instantiate_block_sort_bn(float16, half)
instantiate_block_sort_bn(float32, float)
instantiate_block_sort_bn(bfloat16, bfloat16_t)

// 64-bit element types stop at a block size of 256 (no bn = 512 variant).
// NOTE(review): presumably a threadgroup-memory limit -- confirm.
#define instantiate_block_sort_long(itname, itype) \
  instantiate_block_sort_tn(itname, itype, 32)     \
  instantiate_block_sort_tn(itname, itype, 64)     \
  instantiate_block_sort_tn(itname, itype, 128)    \
  instantiate_block_sort_tn(itname, itype, 256)

instantiate_block_sort_long(uint64, uint64_t)
instantiate_block_sort_long(int64, int64_t)

// Instantiates the three multi-block kernels (`mb_block_sort`,
// `mb_block_partition`, `mb_block_merge`) for one configuration, with host
// names prefixed "sort_mbsort_", "partition_mbsort_" and "merge_mbsort_".
#define instantiate_multi_block_sort(                                      \
    vtname, vtype, itname, itype, arg_sort, bn, tn)                        \
  instantiate_kernel("sort_mbsort_" #vtname "_" #itname "_bn" #bn "_tn" #tn, \
                     mb_block_sort, vtype, itype, arg_sort, bn, tn) \
  instantiate_kernel("partition_mbsort_" #vtname "_" #itname "_bn" #bn "_tn" #tn, \
                     mb_block_partition, vtype, itype, arg_sort, bn, tn) \
  instantiate_kernel("merge_mbsort_" #vtname "_" #itname "_bn" #bn "_tn" #tn, \
                     mb_block_merge, vtype, itype, arg_sort, bn, tn)

// Multi-block kernels always use uint32_t indices with arg_sort = true;
// bn = 512, tn = 4 for these element types.
#define instantiate_multi_block_sort_base(vtname, vtype) \
  instantiate_multi_block_sort(vtname, vtype, uint32, uint32_t, true, 512, 4)

instantiate_multi_block_sort_base(uint8, uint8_t)
instantiate_multi_block_sort_base(uint16, uint16_t)
instantiate_multi_block_sort_base(uint32, uint32_t)
instantiate_multi_block_sort_base(int8, int8_t)
instantiate_multi_block_sort_base(int16, int16_t)
instantiate_multi_block_sort_base(int32, int32_t)
instantiate_multi_block_sort_base(float16, half)
instantiate_multi_block_sort_base(float32, float)
instantiate_multi_block_sort_base(bfloat16, bfloat16_t)

// Same as above for 64-bit element types, with bn reduced to 256.
#define instantiate_multi_block_sort_long(vtname, vtype) \
  instantiate_multi_block_sort(vtname, vtype, uint32, uint32_t, true, 256, 4)

instantiate_multi_block_sort_long(uint64, uint64_t)
instantiate_multi_block_sort_long(int64, int64_t) // clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/attn/attn.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/attn/loader.h"
#include "mlx/backend/metal/kernels/steel/attn/mma.h"
#include "mlx/backend/metal/kernels/steel/attn/params.h"
#include "mlx/backend/metal/kernels/steel/attn/transforms.h"
#include "mlx/backend/metal/kernels/steel/gemm/params.h"
#include "mlx/backend/metal/kernels/steel/utils.h"

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
// GEMM kernel class
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

// Empty tag type carrying three compile-time alignment flags. It is used as
// a defaulted trailing parameter of GEMMKernel::gemm_loop, whose template
// parameters have the same shape.
template <bool M_aligned, bool N_aligned, bool K_aligned>
struct LoopAlignment {};

// Threadgroup-tiled GEMM building block.
//
// Each threadgroup computes one BM x BN tile of D from A and B, stepping
// through K in chunks of BK. Tiles of A and B are staged in threadgroup
// memory by the loader types and multiplied by `mma_t`.
//
// Template parameters:
//   T, U            input and output element types
//   BM, BN, BK      tile extents
//   WM, WN          factors of the threadgroup size (tgp_size = WM * WN * 32)
//   transpose_a/b   select the threadgroup layout and leading dims of A / B
//   MN_aligned      when true, `run` skips all M/N bounds handling
//   K_aligned       when false, a bounds-checked tail handles the rest of K
//   AccumType       accumulator type forwarded to BlockMMA
//   Epilogue        transform type forwarded to BlockMMA
template <
    typename T,
    typename U,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    bool MN_aligned,
    bool K_aligned,
    typename AccumType = typename AccumHelper<T>::accum_type,
    typename Epilogue = TransformNone<U, AccumType>>
struct GEMMKernel {
  // Padding added to each threadgroup row, in elements (16 bytes worth).
  STEEL_CONST short tgp_padding_a = 16 / sizeof(T);
  STEEL_CONST short tgp_padding_b = 16 / sizeof(T);
  // Element counts of the padded A and B tiles in threadgroup memory.
  STEEL_CONST short tgp_mem_size_a =
      transpose_a ? BK * (BM + tgp_padding_a) : BM * (BK + tgp_padding_a);
  STEEL_CONST short tgp_mem_size_b =
      transpose_b ? BN * (BK + tgp_padding_b) : BK * (BN + tgp_padding_b);
  STEEL_CONST short tgp_mem_size = tgp_mem_size_a + tgp_mem_size_b;

  // Threadgroup size handed to the loaders.
  // NOTE(review): presumably WM x WN simdgroups of 32 threads -- confirm.
  STEEL_CONST short tgp_size = WM * WN * 32;

  // Loader for the A tile; dimensions and padded stride follow transpose_a.
  using loader_a_t = BlockLoader<
      T,
      transpose_a ? BK : BM,
      transpose_a ? BM : BK,
      transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
      !transpose_a,
      tgp_size>;
  // Loader for the B tile; dimensions and padded stride follow transpose_b.
  using loader_b_t = BlockLoader<
      T,
      transpose_b ? BN : BK,
      transpose_b ? BK : BN,
      transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
      transpose_b,
      tgp_size>;
  // Multiply-accumulate over the staged tiles, using the same padded strides.
  using mma_t = BlockMMA<
      T,
      U,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
      transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
      AccumType,
      Epilogue>;

  /* K-loop helper used by the M/N-unaligned path of `run`.
   *
   * Runs `gemm_k_iterations` full BK steps, then (when K_aligned_ is false)
   * one bounds-checked step over the remaining `lbk` columns of K.
   * M_aligned / N_aligned choose between unchecked and bounds-checked loads
   * for A / B; `tgp_bm` and `tgp_bn` are the valid tile extents used by the
   * bounds-checked loads. */
  template <bool M_aligned, bool N_aligned, bool K_aligned_>
  static METAL_FUNC void gemm_loop(
      threadgroup T* As [[threadgroup(0)]],
      threadgroup T* Bs [[threadgroup(1)]],
      const int gemm_k_iterations,
      thread loader_a_t& loader_a,
      thread loader_b_t& loader_b,
      thread mma_t& mma_op,
      thread const short& tgp_bm,
      thread const short& tgp_bn,
      thread const short& lbk,
      LoopAlignment<M_aligned, N_aligned, K_aligned_> l = {}) {
    // Appease the compiler
    (void)l;

    // Valid extents for the bounds-checked loads of full-BK steps.
    short2 tile_dims_A = transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);

    short2 tile_dims_B = transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);

    for (int k = 0; k < gemm_k_iterations; k++) {
      // Barrier before overwriting the tiles read by the previous mma.
      threadgroup_barrier(mem_flags::mem_threadgroup);
      // Load elements into threadgroup
      if (M_aligned) {
        loader_a.load_unsafe();
      } else {
        loader_a.load_safe(tile_dims_A);
      }

      if (N_aligned) {
        loader_b.load_unsafe();
      } else {
        loader_b.load_safe(tile_dims_B);
      }

      // Barrier so the freshly loaded tiles are visible to the mma.
      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }

    // Tail step over the last `lbk` columns of K, always bounds-checked.
    if (!K_aligned_) {
      threadgroup_barrier(mem_flags::mem_threadgroup);

      short2 tile_dims_A_last =
          transpose_a ? short2(tgp_bm, lbk) : short2(lbk, tgp_bm);
      short2 tile_dims_B_last =
          transpose_b ? short2(lbk, tgp_bn) : short2(tgp_bn, lbk);

      loader_a.load_safe(tile_dims_A_last);
      loader_b.load_safe(tile_dims_B_last);

      threadgroup_barrier(mem_flags::mem_threadgroup);

      mma_op.mma(As, Bs);
    }
  }

  /* Main kernel function
   *
   * Computes this threadgroup's output tile: maps `tid` to a (row, col) tile
   * through the swizzle in `params`, offsets A, B and D to that tile, runs
   * the K loop, and stores the result into D. */
  static METAL_FUNC void run(
      const device T* A [[buffer(0)]],
      const device T* B [[buffer(1)]],
      device U* D [[buffer(2)]],
      const constant GEMMParams* params [[buffer(3)]],
      threadgroup T* As [[threadgroup(0)]],
      threadgroup T* Bs [[threadgroup(1)]],
      uint simd_lane_id [[thread_index_in_simdgroup]],
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]]) {
    // Pacifying compiler
    (void)lid;

    // Swizzled tile coordinates: the low `swizzle_log` bits of tid.x are
    // folded into the tile row, the remaining bits give the tile column.
    const int tid_y = ((tid.y) << params->swizzle_log) +
        ((tid.x) & ((1 << params->swizzle_log) - 1));
    const int tid_x = (tid.x) >> params->swizzle_log;

    // Threadgroups mapped outside the tile grid have nothing to do.
    if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
      return;
    }

    threadgroup_barrier(mem_flags::mem_none);

    // Find block in A, B, C
    const int c_row = tid_y * BM;
    const int c_col = tid_x * BN;
    // Widened so the pointer offsets below are computed in size_t.
    const size_t c_row_long = size_t(c_row);
    const size_t c_col_long = size_t(c_col);

    A += transpose_a ? c_row_long : c_row_long * params->lda;
    B += transpose_b ? c_col_long * params->ldb : c_col_long;
    D += c_row_long * params->ldd + c_col_long;

    // Prepare threadgroup loading operations
    thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
    thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);

    // Prepare threadgroup mma operation
    thread mma_t mma_op(simd_group_id, simd_lane_id);

    // Number of full BK steps; any remainder of K is handled by the tail.
    int gemm_k_iterations = params->gemm_k_iterations_aligned;

    ///////////////////////////////////////////////////////////////////////////////
    // MNK aligned loop
    if (MN_aligned) {
      for (int k = 0; k < gemm_k_iterations; k++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }

      threadgroup_barrier(mem_flags::mem_none);

      // Loop tail
      if (!K_aligned) {
        // Columns of K left after the full BK steps.
        int lbk = params->K - params->gemm_k_iterations_aligned * BK;
        short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);
        short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);

        loader_a.load_safe(tile_dims_A);
        loader_b.load_safe(tile_dims_B);

        threadgroup_barrier(mem_flags::mem_threadgroup);

        mma_op.mma(As, Bs);
      }

      // Store results to device memory
      mma_op.store_result(D, params->ldd);
      return;

    }
    ///////////////////////////////////////////////////////////////////////////////
    // MN unaligned loop
    else { // Loop over K - unaligned case
      // Valid extents of this tile; smaller than BM / BN only on the edges.
      short tgp_bm = min(BM, params->M - c_row);
      short tgp_bn = min(BN, params->N - c_col);
      short leftover_bk = params->K - params->gemm_k_iterations_aligned * BK;

      // Dispatch on which of M / N is a full tile so that only the partial
      // dimension pays for bounds-checked loads and stores.
      if (tgp_bm == BM && tgp_bn == BN) {
        gemm_loop<true, true, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result(D, params->ldd);
        return;

      } else if (tgp_bn == BN) {
        gemm_loop<false, true, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
        return;

      } else if (tgp_bm == BM) {
        gemm_loop<true, false, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
        return;

      } else {
        gemm_loop<false, false, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
        return;
      }
    }
  }
};

} // namespace steel
} // namespace mlx

================================================
FILE: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h
================================================
// Copyright © 2024-25 Apple Inc.

#include "mlx/backend/metal/kernels/steel/attn/attn.h"

using namespace mlx::steel;

///////////////////////////////////////////////////////////////////////////////
// Attention kernels
///////////////////////////////////////////////////////////////////////////////

// Function constants that specialise the `attention` kernel below.

// When false, the threadgroup whose tid.x equals params->NQ_aligned uses
// bounds-checked loads and stores for its Q / O block.
constant bool align_Q [[function_constant(200)]];
// When false, the KV block with index params->NK_aligned is loaded with
// bounds checks and its out-of-range score columns are masked.
constant bool align_K [[function_constant(201)]];

// Enables the `mask_params` / `mask` buffers and the explicit mask step.
constant bool has_mask [[function_constant(300)]];
// Enables causal masking and the matching limit on the KV loop.
constant bool do_causal [[function_constant(301)]];
// Enables the `sinks` buffer, which seeds the running max and sum.
constant bool has_sinks [[function_constant(302)]];

/// Binary functor: the larger of the two operands, via `metal::max`.
struct MaxOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return metal::max(lhs, rhs);
  }
};

/// Binary functor: the sum of the two operands.
struct SumOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return lhs + rhs;
  }
};

/// Binary functor: the product of the two operands.
struct MulOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return lhs * rhs;
  }
};

/// Binary functor: the first operand minus the second.
struct SubOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return lhs - rhs;
  }
};

/// Binary functor: two raised to (first operand minus second), computed with
/// `fast::exp2`.
struct ExpSubOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T value, T offset) {
    return fast::exp2(value - offset);
  }
};

/// Binary functor: the first operand divided by the second.
struct DivOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T numer, T denom) {
    return numer / denom;
  }
};

// Fused attention kernel.
//
// Threadgroup (tid.x, tid.y, tid.z) handles query block tid.x of head tid.y
// in batch tid.z. It loads a BQ x BD block of Q once, then walks the KV
// sequence in blocks of BK rows. For each block it forms S = Q @ K^T, scales
// and masks S, updates a running row max / row sum, rescales the output
// accumulator and adds exp2(S - max) @ V. The accumulator is divided by the
// running sum at the end and written to O.
//
// Template parameters:
//   T          element type of Q, K, V, O and sinks
//   BQ, BK     query / key-value block lengths
//   BD         head dimension handled by the kernel
//   WM, WN     factors of the threadgroup size (WM * WN * 32 threads)
//   MaskType   element type of the optional mask (bool selects keep/drop,
//              any other type is added to the scores)
//   AccumType  accumulator type for scores and outputs
// clang-format off
template <
    typename T,
    int BQ,
    int BK,
    int BD,
    int WM,
    int WN,
    typename MaskType = float,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void attention(
    const device T* Q [[buffer(0)]],
    const device T* K [[buffer(1)]],
    const device T* V [[buffer(2)]],
    device T* O [[buffer(3)]],
    const constant AttnParams* params [[buffer(4)]],
    const constant AttnMaskParams* mask_params [[buffer(5), function_constant(has_mask)]],
    const device MaskType* mask [[buffer(6), function_constant(has_mask)]],
    const device T* sinks [[buffer(7), function_constant(has_sinks)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) { // clang-format on

  // Pacifying compiler
  (void)lid;

  // Move to correct block
  // (64-bit copy of tid so the stride products below are computed in ulong)
  ulong3 tidl{tid.x, tid.y, tid.z};

  Q += tidl.z * params->Q_strides[0] + // Batch
      tidl.y * params->Q_strides[1] + // Head
      tidl.x * BQ * params->Q_strides[2]; // Sequence

  // Query head tid.y reads the K/V head tid.y / gqa_factor.
  ulong kv_head_idx = int(tid.y) / params->gqa_factor;
  K += tidl.z * params->K_strides[0] + // Batch
      kv_head_idx * params->K_strides[1]; // Head

  V += tidl.z * params->V_strides[0] + // Batch
      kv_head_idx * params->V_strides[1]; // Head

  O += tidl.z * params->O_strides[0] + // Batch
      tidl.y * params->O_strides[1] + // Head
      tidl.x * BQ * params->O_strides[2]; // Sequence

  if (has_mask) {
    mask += tidl.z * mask_params->M_strides[0] + // Batch
        tidl.y * mask_params->M_strides[1]; // Head
  }

  // Prepare threadgroup memory
  // Row padding in elements (16 bytes worth).
  constexpr short padQ = 16 / sizeof(T);
  constexpr short padK = 16 / sizeof(T);
  constexpr short padV = 16 / sizeof(T);

  constexpr short LDQ_tgp = BD + padQ;
  constexpr short LDK_tgp = BK + padK;
  constexpr short LDV_tgp = BD + padV;

  // K (stored transposed) and V share one buffer sized for the larger of the
  // two layouts.
  constexpr short tgp_mem_0 = (BK + padK) * (BD);
  constexpr short tgp_mem_1 = BK * (BD + padV);
  constexpr short tgp_mem_s = tgp_mem_0 > tgp_mem_1 ? tgp_mem_0 : tgp_mem_1;

  threadgroup T Q_smem[BQ * (BD + padQ)];
  threadgroup T KV_smem[tgp_mem_s];

  // Ks and Vs alias the same storage, so the K tile must be fully consumed
  // before the V tile is loaded (see the barrier ahead of the V load).
  threadgroup T* Qs = Q_smem;
  threadgroup T* Ks = KV_smem;
  threadgroup T* Vs = KV_smem;

  // Prepare block loaders
  using QBlockLoader = BlockLoaderT<
      /* typename T = */ T,
      /* short BROWS = */ BQ,
      /* short BCOLS = */ BD,
      /* short kDstStrRow = */ LDQ_tgp,
      /* short kDstStrCol = */ 1,
      /* short reduction_dim = */ 1,
      /* short tgp_size = */ WM * WN * 32>;

  // K is loaded in transposed
  using KBlockLoader = BlockLoaderT<
      /* typename T = */ T,
      /* short BROWS = */ BK,
      /* short BCOLS = */ BD,
      /* short kDstStrRow = */ 1,
      /* short kDstStrCol = */ LDK_tgp,
      /* short reduction_dim = */ 0,
      /* short tgp_size = */ WM * WN * 32>;

  using VBlockLoader = BlockLoaderT<
      /* typename T = */ T,
      /* short BROWS = */ BK,
      /* short BCOLS = */ BD,
      /* short kDstStrRow = */ LDV_tgp,
      /* short kDstStrCol = */ 1,
      /* short reduction_dim = */ 0,
      /* short tgp_size = */ WM * WN * 32>;

  QBlockLoader loader_q(
      Q, params->Q_strides[2], Qs, simd_group_id, simd_lane_id);
  KBlockLoader loader_k(
      K, params->K_strides[2], Ks, simd_group_id, simd_lane_id);
  VBlockLoader loader_v(
      V, params->V_strides[2], Vs, simd_group_id, simd_lane_id);

  // Scores are kept in base 2 (scale folded with log2(e)) because the
  // exponentials below use fast::exp2.
  const AccumType scale = params->scale * M_LOG2E_F;

  // Prepare MMA tiles
  constexpr short kFragSize = 8; // MMAFrag size
  using MMAFrag_acc_t = BaseMMAFrag<AccumType, kFragSize, kFragSize>;

  constexpr int kNWarps = WM * WN;
  static_assert(
      BQ >= (kNWarps * kFragSize) && BQ % (kNWarps * kFragSize) == 0,
      "Each simdgroup must host atleast 1 simdgroup matrix along Q sequence.");

  // Q seq frags per warp
  constexpr int TQ = BQ / (kNWarps * kFragSize);
  // KV sequence frags (all warps load the same frags)
  constexpr int TK = BK / kFragSize;
  // HeadDim frags (all warps load the same frags)
  constexpr int TD = BD / kFragSize;

  static_assert(TQ == 1, "Check TQ");

  MMATile<AccumType, TQ, 1, MMAFrag_acc_t> Qtile;
  MMATile<AccumType, 1, TK, MMAFrag_acc_t> Ktile;
  MMATile<AccumType, TQ, TK, MMAFrag_acc_t> Stile;
  MMATile<AccumType, 1, 1, MMAFrag_acc_t> Vtile;
  MMATile<AccumType, TQ, TD, MMAFrag_acc_t> Otile;

  Otile.clear();

  // Prepare mma tile offsets
  // sm / sn: this lane's row / column inside a fragment; tm: first Q row
  // owned by this simdgroup.
  const short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);
  const short sm = simd_coord.y;
  const short sn = simd_coord.x;
  const short tm = kFragSize * TQ * simd_group_id;

  const short Qs_offset = (tm + sm) * LDQ_tgp + sn;
  const short Ks_offset = sm * LDK_tgp + sn;
  const short Vs_offset = sm * LDV_tgp + sn;

  constexpr short Qs_tile_stride = kFragSize;
  constexpr short Ks_tile_stride = kFragSize * LDK_tgp;

  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Load Q blocks
  // Only the block at index NQ_aligned can be partial; it holds qL_rem rows.
  if (!align_Q && int(tid.x) == (params->NQ_aligned)) {
    loader_q.load_safe(short2(BD, params->qL_rem));
  } else {
    loader_q.load_unsafe();
  }

  // Init row reduction variables
  constexpr short kRowsPT = decltype(Stile)::kRowsPerThread;

  AccumType max_score[kRowsPT];
  AccumType sum_score[kRowsPT] = {0};

  // Init to the most negative finite value (used in place of -Inf)
  STEEL_PRAGMA_UNROLL
  for (short i = 0; i < kRowsPT; ++i) {
    max_score[i] = Limits<AccumType>::finite_min;
  }

  // With sinks, the running max starts at the head's sink value (converted
  // to base 2) and the running sum at 1.
  if (has_sinks) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      max_score[i] = M_LOG2E_F * static_cast<AccumType>(sinks[tidl.y]);
      sum_score[i] = 1;
    }
  }

  // kb_lim: number of KV blocks to visit. kb_min_causal: first KV block that
  // needs per-element causal masking.
  int kb_lim = params->NK;
  int kb_min_causal = params->NK;

  if (do_causal) {
    // One past the last query position of this block, shifted by qL_off;
    // KV blocks starting at or beyond it are skipped.
    int q_max = (tid.x + 1) * BQ + params->qL_off;
    kb_lim = (q_max + BK - 1) / BK;
    kb_lim = min(params->NK, kb_lim);

    // KV blocks before the one containing the first query position are not
    // masked.
    int q_min = tid.x * BQ + params->qL_off;
    q_min = max(0, q_min);
    kb_min_causal = (q_min / BK);
  }

  // Loop over KV seq length
  for (int kb = 0; kb < kb_lim; kb++) {
    // Load K block (the scale is applied to S after the matmul below)
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (!align_K && kb == (params->NK_aligned)) {
      loader_k.load_safe(short2(BD, params->kL_rem));
    } else {
      loader_k.load_unsafe();
    }

    // Do S = Q @ K.T
    Stile.clear();

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Accumulate over the head dimension, one fragment column at a time.
    STEEL_PRAGMA_UNROLL
    for (short dd = 0; dd < TD; dd++) {
      simdgroup_barrier(mem_flags::mem_none);

      Qtile.template load<T, 1, 1, LDQ_tgp, 1>(
          &Qs[Qs_offset + dd * Qs_tile_stride]);
      Ktile.template load<T, 1, 1, LDK_tgp, 1>(
          &Ks[Ks_offset + dd * Ks_tile_stride]);

      simdgroup_barrier(mem_flags::mem_none);

      tile_matmad(Stile, Qtile, Ktile, Stile);
    }

    // Apply scale in float32
    // NOTE(review): "float32" holds for the default AccumType; the multiply
    // is done in AccumType.
    STEEL_PRAGMA_UNROLL
    for (short ii = 0; ii < decltype(Stile)::kElemsPerTile; ii++) {
      Stile.elems()[ii] *= scale;
    }

    // Mask out length sequence
    // (columns at or beyond kL_rem in the partial last KV block)
    if (!align_K && kb == (params->NK_aligned)) {
      using stile_t = decltype(Stile);
      using selem_t = typename stile_t::elem_type;
      constexpr auto neg_inf = Limits<selem_t>::finite_min;

      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < stile_t::kTileRows; i++) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < stile_t::kTileCols; j++) {
          short col_pos = sn + (j * stile_t::kFragCols);
          STEEL_PRAGMA_UNROLL
          for (short jj = 0; jj < stile_t::MMAFrag_t::kElemCols; jj++) {
            if ((col_pos + jj) >= params->kL_rem) {
              Stile.frag_at(i, j)[jj] = neg_inf;
            }
          }
        }
      }
    }

    // Mask out if causal
    // (entries whose key position exceeds the query position + qL_off)
    if (do_causal && kb >= kb_min_causal) {
      using stile_t = decltype(Stile);
      using selem_t = typename stile_t::elem_type;
      constexpr auto neg_inf = Limits<selem_t>::finite_min;

      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < stile_t::kTileRows; i++) {
        const int row_pos =
            tid.x * BQ + params->qL_off + tm + sm + (i * stile_t::kFragRows);
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < stile_t::kTileCols; j++) {
          const int col_pos = kb * BK + sn + (j * stile_t::kFragCols);
          STEEL_PRAGMA_UNROLL
          for (short jj = 0; jj < stile_t::MMAFrag_t::kElemCols; jj++) {
            if (row_pos < (col_pos + jj)) {
              Stile.frag_at(i, j)[jj] = neg_inf;
            }
          }
        }
      }
    }

    // Other masking as needed
    if (has_mask) {
      using stile_t = decltype(Stile);
      using selem_t = typename stile_t::elem_type;
      constexpr auto neg_inf = Limits<selem_t>::finite_min;

      // Boolean masks are loaded as bool; any other mask type is loaded as
      // the score element type.
      constexpr bool is_bool = is_same_v<MaskType, bool>;
      using melem_t = typename metal::conditional_t<is_bool, bool, selem_t>;

      using MMAFrag_mask_t = BaseMMAFrag<melem_t, kFragSize, kFragSize>;
      using frag_t = typename MMAFrag_mask_t::frag_type;

      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < stile_t::kTileRows; i++) {
        const int row_pos = tid.x * BQ + tm + sm + (i * stile_t::kFragRows);
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < stile_t::kTileCols; j++) {
          const int col_pos = kb * BK + sn + (j * stile_t::kFragCols);

          frag_t mfrag;

          // Bounds-checked fragment load against the (qL, kL) extents.
          MMAFrag_mask_t::load_safe(
              mfrag,
              mask,
              int64_t(mask_params->M_strides[2]),
              Int<1>{},
              params->qL,
              params->kL,
              row_pos,
              col_pos);

          STEEL_PRAGMA_UNROLL
          for (short jj = 0; jj < stile_t::MMAFrag_t::kElemsPerFrag; jj++) {
            if constexpr (is_bool) {
              // false entries drop the score
              Stile.frag_at(i, j)[jj] =
                  mfrag[jj] ? Stile.frag_at(i, j)[jj] : neg_inf;
            } else {
              // additive mask, converted to base 2 like the scores
              Stile.frag_at(i, j)[jj] += M_LOG2E_F * selem_t(mfrag[jj]);
            }
          }
        }
      }
    }

    // All simdgroups must be done reading the K tile before V overwrites it.
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Load V blocks
    if (!align_K && kb == (params->NK_aligned)) {
      loader_v.load_safe(short2(BD, params->kL_rem));
    } else {
      loader_v.load_unsafe();
    }

    // Do softmax

    // Temp variables
    AccumType new_max[kRowsPT];
    AccumType factor[kRowsPT];
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      new_max[i] = max_score[i];
    }

    // Row max
    // (seeded with the running max, so new_max >= max_score)
    Stile.template row_reduce<MaxOp>(new_max);

    // exp(Si - rowmax(Si))
    Stile.template row_bin_op<ExpSubOp>(new_max);

    // Factor exp(rowmax(Si) - rowmax(Si-1))
    // (rescales what was accumulated under the previous max)
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      factor[i] = fast::exp2(max_score[i] - new_max[i]);
    }

    // Save max for next iteration
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      max_score[i] = new_max[i];
    }

    // Row Sum
    AccumType sum_score_tmp[kRowsPT] = {0};
    Stile.template row_reduce<SumOp>(sum_score_tmp);

    // Update norm
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      sum_score[i] = sum_score[i] * factor[i] + sum_score_tmp[i];
    }

    // Update O
    Otile.template row_bin_op<MulOp>(factor);

    // Load V into registers
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // O += S @ V, one (iq, id) output fragment at a time.
    STEEL_PRAGMA_UNROLL
    for (short iq = 0; iq < TQ; iq++) {
      STEEL_PRAGMA_UNROLL
      for (short id = 0; id < TD; id++) {
        STEEL_PRAGMA_UNROLL
        for (short ik = 0; ik < TK; ik++) {
          if constexpr (BD == 128) {
            simdgroup_barrier(mem_flags::mem_none);
          }

          const short kk = ik * kFragSize;
          const short dd = id * kFragSize;

          Vtile.template load<T, 1, 1, LDV_tgp, 1>(
              &Vs[Vs_offset + kk * LDV_tgp + dd]);

          if constexpr (BD == 128) {
            simdgroup_barrier(mem_flags::mem_none);
          }

          MMAFrag_acc_t::mma(
              Otile.frag_at(iq, id),
              Stile.frag_at(iq, ik),
              Vtile.frag_at(0, 0),
              Otile.frag_at(iq, id));
        }
      }
    }

    // Prepare for next iteration
    loader_k.next();
    loader_v.next();
  }

  // Normalize output
  Otile.template row_bin_op<DivOp>(sum_score);
  threadgroup_barrier(mem_flags::mem_none);

  // Store results
  O += (tm + sm) * params->O_strides[2] + sn;

  if (!align_Q && int(tid.x) == (params->NQ_aligned)) {
    // Extent still in range from this lane's first element.
    auto dst_tile_dims = short2(BD - sn, params->qL_rem - (tm + sm));

    // Lanes that start outside the valid region have nothing to store.
    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    Otile.template store_safe<T, 1, 1>(O, params->O_strides[2], dst_tile_dims);
  } else {
    Otile.template store<T, 1, 1>(O, params->O_strides[2]);
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.metal
================================================
// Copyright © 2024-25 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h"

// Instantiates one `attention` specialisation with AccumType = float under
// the host name
//   steel_attention_<tname>_bq<bq>_bk<bk>_bd<bd>_wm<wm>_wn<wn>_mask<mname>
#define instantiate_attn(tname, dtype, bq, bk, bd, wm, wn, mname, mtype) \
  instantiate_kernel(                                                    \
      "steel_attention_" #tname "_bq" #bq "_bk" #bk "_bd" #bd            \
      "_wm" #wm "_wn" #wn "_mask" #mname,                                \
  attention, dtype, bq, bk, bd, wm, wn, mtype, float)

// Tile shapes built for every type: head dims 128, 80 and 64, all with
// bq = 32, wm = 4, wn = 1 (bk = 16 for head dim 128, bk = 32 otherwise).
#define instantiate_attn_shapes_helper(iname, itype, mname, mtype)  \
    instantiate_attn(iname, itype, 32, 16, 128, 4, 1, mname, mtype) \
    instantiate_attn(iname, itype, 32, 32,  80, 4, 1, mname, mtype) \
    instantiate_attn(iname, itype, 32, 32,  64, 4, 1, mname, mtype)

// Two mask flavours per element type: a mask of the same type as the inputs
// and a boolean mask (host name suffix "bool_").
#define instantiate_attn_mask_helper(iname, itype) \
    instantiate_attn_shapes_helper(iname, itype, iname, itype) \
    instantiate_attn_shapes_helper(iname, itype, bool_, bool)

instantiate_attn_mask_helper(float16, half);
instantiate_attn_mask_helper(bfloat16, bfloat16_t);

instantiate_attn_mask_helper(float32, float);
// clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h
================================================
// Copyright © 2024-25 Apple Inc.

#include "mlx/backend/metal/kernels/steel/attn/nax.h"
#include "mlx/backend/metal/kernels/steel/attn/params.h"
#include "mlx/backend/metal/kernels/steel/attn/transforms.h"
#include "mlx/backend/metal/kernels/steel/utils.h"

using namespace mlx::steel;

///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////

// Function constants specialising the attention kernel at pipeline creation.
// The indices (200, 201, 300, 301, 302) must match the values supplied by the
// host code that builds the pipeline.

// When false, the threadgroup whose index equals params->NQ_aligned uses
// row-limited loads/stores for Q and O.
constant bool align_Q [[function_constant(200)]];
// When false, the key/value block whose index equals params->NK_aligned uses
// row-limited loads and has its out-of-range score columns masked.
constant bool align_K [[function_constant(201)]];

// Enables the explicit mask inputs (buffers 5 and 6 of the kernel).
constant bool has_mask [[function_constant(300)]];
// Enables causal masking (scores with row position < column position are
// replaced by the lowest finite value).
constant bool do_causal [[function_constant(301)]];
// Enables the per-head attention sink input (buffer 7 of the kernel).
constant bool has_sinks [[function_constant(302)]];

/// Unary transform that multiplies every value it is applied to by a fixed
/// factor.
template <typename T>
struct TransformScale {
  /// Multiplier used by `apply`.
  T scale;

  /// Captures the multiplier.
  METAL_FUNC TransformScale(T factor) : scale(factor) {}

  /// Returns `scale * x` for the given input.
  METAL_FUNC T apply(T value) const {
    const T scaled = scale * value;
    return scaled;
  }
};

/// Binary functor: larger of the two operands (via `metal::max`).
struct MaxOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return metal::max(lhs, rhs);
  }
};

/// Binary functor: sum of the two operands.
struct SumOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return lhs + rhs;
  }
};

/// Binary functor: product of the two operands.
struct MulOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return lhs * rhs;
  }
};

/// Binary functor: first operand minus the second.
struct SubOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return lhs - rhs;
  }
};

/// Binary functor: `2^(lhs - rhs)` computed with `fast::exp2`. Used to
/// exponentiate scores relative to a row maximum in the base-2 domain.
struct ExpSubOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return fast::exp2(lhs - rhs);
  }
};

/// Binary functor: first operand divided by the second.
struct DivOp {
  template <typename T>
  METAL_FUNC static constexpr T apply(T lhs, T rhs) {
    return lhs / rhs;
  }
};

/// Fused scaled-dot-product attention kernel built on NAXTile fragments.
///
/// Each threadgroup handles one block of BQ query rows for one (batch, head)
/// pair: tid.x selects the query block, tid.y the head and tid.z the batch.
/// Every simdgroup owns kU * TQ (= 16) query rows of that block and walks the
/// key/value sequence in steps of BK rows while keeping a running row maximum
/// and row sum (online softmax), so the output
///   O = softmax(scale * Q K^T [+ mask]) V
/// is produced without materialising the full score matrix.
///
/// Scores are kept in the base-2 domain: they are multiplied by
/// scale * log2(e) and exponentiated with exp2.
///
/// Template parameters:
///   T          element type of Q, K, V, O and the sinks.
///   BQ, BK     query / key rows processed per block.
///   BD         head dimension.
///   WM, WN     simdgroup grid; WM * WN simdgroups of 32 threads each.
///   MaskType   element type of the optional mask (bool => select mask,
///              otherwise additive mask).
///   AccumType  accumulation type for scores and output.
///
/// Buffers:
///   0-3  Q, K, V (read) and O (written).
///   4    AttnParams (strides, block counts, remainders, scale, ...).
///   5-6  mask parameters and mask data, only bound when has_mask is set.
///   7    per-head sinks, only bound when has_sinks is set.
///
/// NOTE(review): the Q.K^T loop steps ik by 2 and the P.V loop steps id by 2,
/// so BK / 16 and BD / 16 are presumably required to be even -- confirm
/// against the instantiated shapes.
// clang-format off
template <
    typename T,
    int BQ,
    int BK,
    int BD,
    int WM,
    int WN,
    typename MaskType = float,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void attention_nax(
    const device T* Q [[buffer(0)]],
    const device T* K [[buffer(1)]],
    const device T* V [[buffer(2)]],
    device T* O [[buffer(3)]],
    const constant AttnParams* params [[buffer(4)]],
    const constant AttnMaskParams* mask_params [[buffer(5), function_constant(has_mask)]],
    const device MaskType* mask [[buffer(6), function_constant(has_mask)]],
    const device T* sinks [[buffer(7), function_constant(has_sinks)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) { // clang-format on

  // Pacifying compiler
  (void)lid;
  (void)simd_lane_id;

  // Move to correct block (64-bit indices so the stride products do not
  // overflow 32 bits)
  ulong3 tidl{tid.x, tid.y, tid.z};

  Q += tidl.z * params->Q_strides[0] + // Batch
      tidl.y * params->Q_strides[1] + // Head
      tidl.x * BQ * params->Q_strides[2]; // Sequence

  // Several query heads share one key/value head (grouped-query attention)
  ulong kv_head_idx = int(tid.y) / params->gqa_factor;
  K += tidl.z * params->K_strides[0] + // Batch
      kv_head_idx * params->K_strides[1]; // Head

  V += tidl.z * params->V_strides[0] + // Batch
      kv_head_idx * params->V_strides[1]; // Head

  O += tidl.z * params->O_strides[0] + // Batch
      tidl.y * params->O_strides[1] + // Head
      tidl.x * BQ * params->O_strides[2]; // Sequence

  if (has_mask) {
    mask += tidl.z * mask_params->M_strides[0] + // Batch
        tidl.y * mask_params->M_strides[1]; // Head
  }

  // scale * log2(e): lets the softmax use exp2 instead of exp
  const metal::uniform<float> scale2 =
      make_uniform(params->scale) * make_uniform(1.44269504089f);

  // Prepare MMA tiles
  constexpr short kU = 16;

  constexpr int kNWarps = WM * WN;
  static_assert(
      BQ >= (kNWarps * kU) && BQ % (kNWarps * kU) == 0,
      "Each simdgroup must host atleast 1 simdgroup matrix along Q sequence.");

  // Q seq frags per warp
  constexpr int TQ = BQ / (kNWarps * kU);
  // HeadDim frags (all warps load the same frags)
  constexpr int TD = BD / kU;
  // KV seq frags per warp
  constexpr short TK = BK / kU;

  static_assert(TQ == 1, "Check TQ");
  using otile_t = NAXTile<AccumType, TQ, TD>;
  otile_t Otile;

  Otile.clear();

  // Prepare mma tile offsets: first query row (within the block) owned by
  // this simdgroup
  const short tm = kU * TQ * simd_group_id;
  Q += tm * int(params->Q_strides[2]);

  // Position of this thread's elements inside a fragment
  const short2 simd_coord = otile_t::NAXFrag_t::get_coord();
  const short sm = simd_coord.y;
  const short sn = simd_coord.x;

  // Init row reduction variables
  constexpr short kRowsPT = otile_t::kRowsPerThread;

  metal::vec<AccumType, kRowsPT> max_score;
  metal::vec<AccumType, kRowsPT> sum_score{0};

  // Init the running maximum to Limits<AccumType>::finite_min
  STEEL_PRAGMA_UNROLL
  for (short i = 0; i < kRowsPT; ++i) {
    max_score[i] = Limits<AccumType>::finite_min;
  }

  // A sink behaves like one extra score per head: it seeds the running
  // maximum (converted to the base-2 domain) and contributes exp2(0) = 1 to
  // the running sum.
  if (has_sinks) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      max_score[i] = M_LOG2E_F * static_cast<AccumType>(sinks[tidl.y]);
      sum_score[i] = 1;
    }
  }

  // kb_lim: number of key blocks to visit.
  // kb_min_causal: first key block that can contain causally masked entries.
  int kb_lim = params->NK;
  int kb_min_causal = params->NK;

  if (do_causal) {
    int q_max = (tid.x + 1) * BQ + params->qL_off;
    kb_lim = (q_max + BK - 1) / BK;
    kb_lim = min(params->NK, kb_lim);

    int q_min = tid.x * BQ + params->qL_off;
    q_min = max(0, q_min);
    kb_min_causal = (q_min / BK);
  }

  const bool is_last_bq = int(tid.x) == (params->NQ_aligned);
  // const bool is_last_tq = int(simd_group_id) >= (params->qL_rem / UQ);
  const bool is_last_q = is_last_bq;

  // Number of valid rows left for this simdgroup in the last (partial) query
  // block, and number of valid rows in the last (partial) key block
  const short lim_rows_q = params->qL_rem - tm;
  const short lim_rows_k = params->kL_rem;

  // Loop over KV seq length
  for (int kb = 0; kb < kb_lim; kb++) {
    const int is_last_k = (kb == (params->NK_aligned));

    // Do S = Q @ K.T
    using stile_t = NAXTile<AccumType, TQ, TK>;
    stile_t Stile;

    Stile.clear();

    STEEL_PRAGMA_UNROLL
    for (short iq = 0; iq < TQ; iq++) {
      STEEL_PRAGMA_UNROLL
      for (short ik = 0; ik < TK; ik += 2) {
        STEEL_PRAGMA_UNROLL
        for (short id = 0; id < TD; id++) {
          NAXTile<T, 1, 1> Qtile;
          NAXTile<T, 2, 1> Ktile;

          const int Q_load_off = iq * kU * int(params->Q_strides[2]) + id * kU;
          const int K_load_off = ik * kU * int(params->K_strides[2]) + id * kU;

          if (!align_Q && is_last_q) {
            Qtile.load_rows(
                Q + Q_load_off,
                int(params->Q_strides[2]),
                lim_rows_q - iq * kU);
          } else {
            Qtile.load(Q + Q_load_off, int(params->Q_strides[2]));
          }

          if (!align_K && is_last_k) {
            Ktile.load_rows(
                K + K_load_off,
                int(params->K_strides[2]),
                lim_rows_k - ik * kU);
          } else {
            Ktile.load(K + K_load_off, int(params->K_strides[2]));
          }

          // Two score fragments (ik, ik + 1) are accumulated per call; the
          // K fragments are flagged as transposed.
          stile_t::NAXFrag_t::mma(
              Stile.frag_at(iq, ik),
              Stile.frag_at(iq, ik + 1),
              Qtile.frag_at(0, 0),
              metal::false_type{},
              Ktile.frag_at(0, 0),
              Ktile.frag_at(1, 0),
              metal::true_type{});
        }
      }
    }

    // Scale S
    STEEL_PRAGMA_UNROLL
    for (short ii = 0; ii < stile_t::kElemsPerTile; ii++) {
      Stile.elems()[ii] *= float(scale2);
    }

    // Mask out length sequence
    if (!align_K && is_last_k) {
      constexpr auto neg_inf = Limits<AccumType>::finite_min;

      STEEL_PRAGMA_UNROLL
      for (short iq = 0; iq < TQ; iq++) {
        STEEL_PRAGMA_UNROLL
        for (short ik = 0; ik < TK; ik++) {
          // Column position relative to the current key block, so it always
          // fits in a short
          const short col_pos = ik * kU + sn;

          thread auto& fg = Stile.frag_at(iq, ik);

          STEEL_PRAGMA_UNROLL
          for (short ii = 0; ii < stile_t::kFragThrRows; ii++) {
            STEEL_PRAGMA_UNROLL
            for (short jj = 0; jj < stile_t::kFragThrCols; jj++) {
              const auto loc = ii * stile_t::kFragThrCols + jj;
              fg[loc] = ((col_pos + jj) < params->kL_rem) ? fg[loc] : neg_inf;
            }
          }
        }
      }
    }

    // Mask out if causal
    if (do_causal && kb >= kb_min_causal) {
      constexpr auto neg_inf = Limits<AccumType>::finite_min;

      const int base_row = tid.x * BQ + params->qL_off + tm;
      const int base_col = kb * BK;

      STEEL_PRAGMA_UNROLL
      for (short iq = 0; iq < TQ; iq++) {
        STEEL_PRAGMA_UNROLL
        for (short ik = 0; ik < TK; ik++) {
          // Absolute sequence positions: these must stay 32-bit. Narrowing
          // them to short wraps once a position reaches 32768 and breaks the
          // row/column comparison below.
          const int row_pos = base_row + iq * kU;
          const int col_pos = base_col + ik * kU;

          thread auto& fg = Stile.frag_at(iq, ik);

          STEEL_PRAGMA_UNROLL
          for (short ii = 0; ii < stile_t::kFragThrRows; ii++) {
            STEEL_PRAGMA_UNROLL
            for (short jj = 0; jj < stile_t::kFragThrCols; jj++) {
              const auto r = row_pos + ii * stile_t::kFragRowsJump + sm;
              const auto c = col_pos + jj + sn;
              const auto loc = ii * stile_t::kFragThrCols + jj;
              fg[loc] = (r < c) ? neg_inf : fg[loc];
            }
          }
        }
      }
    }

    // Other masking as needed
    if (has_mask) {
      constexpr auto neg_inf = Limits<AccumType>::finite_min;

      const int base_row = tid.x * BQ + tm;
      const int base_col = kb * BK;

      constexpr bool is_bool = is_same_v<MaskType, bool>;
      using melem_t = typename metal::conditional_t<is_bool, bool, AccumType>;
      using mtile_t = NAXTile<melem_t, TQ, TK>;
      using mfrag_t = typename mtile_t::frag_type;

      STEEL_PRAGMA_UNROLL
      for (short iq = 0; iq < TQ; iq++) {
        STEEL_PRAGMA_UNROLL
        for (short ik = 0; ik < TK; ik++) {
          // Absolute offsets into the mask; kept 32-bit so that sequences
          // longer than 32767 address the right mask elements.
          const int row_pos = base_row + iq * kU;
          const int col_pos = base_col + ik * kU;

          mfrag_t mfrag;
          mtile_t::NAXFrag_t::load_safe(
              mfrag,
              mask,
              int64_t(mask_params->M_strides[2]),
              Int<1>{},
              params->qL,
              params->kL,
              row_pos,
              col_pos);

          thread auto& fg = Stile.frag_at(iq, ik);

          STEEL_PRAGMA_UNROLL
          for (short jj = 0; jj < mtile_t::kElemsPerFrag; jj++) {
            if constexpr (is_bool) {
              // Boolean mask: false entries are masked out
              fg[jj] = mfrag[jj] ? fg[jj] : neg_inf;
            } else {
              // Additive mask, converted to the base-2 domain
              fg[jj] += M_LOG2E_F * AccumType(mfrag[jj]);
            }
          }
        }
      }
    }

    // Do softmax

    // Temp variables
    metal::vec<AccumType, kRowsPT> new_max;
    metal::vec<AccumType, kRowsPT> factor;
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      new_max[i] = max_score[i];
    }

    // Row max
    Stile.template row_reduce<MaxOp>(new_max);

    // exp(Si - rowmax(Si))
    Stile.template row_bin_op<ExpSubOp>(new_max);

    // Factor exp(rowmax(Si) - rowmax(Si-1))
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      factor[i] = fast::exp2(max_score[i] - new_max[i]);
      max_score[i] = new_max[i];
    }

    // Row Sum: rescale the previous sum to the new maximum, then add the
    // current block's contributions
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kRowsPT; ++i) {
      sum_score[i] = sum_score[i] * factor[i];
    }

    Stile.template row_reduce<SumOp>(sum_score);

    // Update O
    Otile.template row_bin_op<MulOp>(factor);

    simdgroup_barrier(mem_flags::mem_none);

    // Do O = P @ V
    STEEL_PRAGMA_UNROLL
    for (short iq = 0; iq < TQ; iq++) {
      STEEL_PRAGMA_UNROLL
      for (short id = 0; id < TD; id += 2) {
        if constexpr (BD == 128) {
          if (id == 4) {
            threadgroup_barrier(mem_flags::mem_none);
          }
        }

        STEEL_PRAGMA_UNROLL
        for (short ik = 0; ik < TK; ik++) {
          NAXTile<T, 1, 2> Vtile;

          const int V_load_off = ik * kU * int(params->V_strides[2]) + id * kU;

          if (!align_K && is_last_k) {
            Vtile.load_rows(
                V + V_load_off,
                int(params->V_strides[2]),
                lim_rows_k - ik * kU);
          } else {
            Vtile.load(V + V_load_off, int(params->V_strides[2]));
          }

          // Two output fragments (id, id + 1) are accumulated per call
          otile_t::NAXFrag_t::mma(
              Otile.frag_at(iq, id),
              Otile.frag_at(iq, id + 1),
              Stile.frag_at(iq, ik),
              metal::false_type{},
              Vtile.frag_at(0, 0),
              Vtile.frag_at(0, 1),
              metal::false_type{});
        }
      }
    }

    // Prepare for next iteration
    K += BK * int(params->K_strides[2]);
    V += BK * int(params->V_strides[2]);
  }

  // Normalize output

  threadgroup_barrier(mem_flags::mem_none);

  // Multiply by the reciprocal of the row sums instead of dividing each
  // element
  metal::vec<AccumType, kRowsPT> rcp;
  STEEL_PRAGMA_UNROLL
  for (short i = 0; i < kRowsPT; ++i) {
    rcp[i] = 1.f / sum_score[i];
  }

  Otile.template row_bin_op<MulOp>(rcp);

  // Store results
  O += tm * int(params->O_strides[2]);

  if (!align_Q && is_last_q) {
    // This simdgroup lies entirely past the end of the query sequence
    if (lim_rows_q <= 0)
      return;

    Otile.store_rows(O, int(params->O_strides[2]), lim_rows_q);
  } else {
    Otile.store(O, int(params->O_strides[2]));
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.metal
================================================
// Copyright © 2024-25 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h"

// Instantiates one specialization of the `attention_nax` kernel template by
// forwarding to `instantiate_kernel` (defined outside this file).
//
// The host-visible kernel name is built by stringifying the arguments:
//   steel_attention_<tname>_bq<bq>_bk<bk>_bd<bd>_wm<wm>_wn<wn>_mask<mname>
// The template arguments passed on are (dtype, bq, bk, bd, wm, wn, mtype)
// followed by `float`, which fills the kernel's AccumType parameter.
#define instantiate_attn(tname, dtype, bq, bk, bd, wm, wn, mname, mtype) \
  instantiate_kernel(                                                    \
      "steel_attention_" #tname "_bq" #bq "_bk" #bk "_bd" #bd            \
      "_wm" #wm "_wn" #wn "_mask" #mname,                                \
  attention_nax, dtype, bq, bk, bd, wm, wn, mtype, float)

// Instantiates every supported tile configuration for one (input type, mask
// type) pair: bq = 64 with bk in {32, 64} and bd in {128, 64}, all using
// wm = 4 and wn = 1 (so bq equals wm * wn * 16, as the kernel's
// static_asserts require).
#define instantiate_attn_shapes_helper(iname, itype, mname, mtype)  \
    instantiate_attn(iname, itype, 64, 32, 128, 4, 1, mname, mtype) \
    instantiate_attn(iname, itype, 64, 32,  64, 4, 1, mname, mtype) \
    instantiate_attn(iname, itype, 64, 64, 128, 4, 1, mname, mtype) \
    instantiate_attn(iname, itype, 64, 64,  64, 4, 1, mname, mtype)

// Instantiates all tile shapes for one input type twice: once with a mask of
// the same type as the input (name suffix `_mask<iname>`) and once with a
// boolean mask (name suffix `_maskbool_`).
#define instantiate_attn_mask_helper(iname, itype) \
    instantiate_attn_shapes_helper(iname, itype, iname, itype) \
    instantiate_attn_shapes_helper(iname, itype, bool_, bool)

// Kernel instantiations for the three supported input types. The first
// argument is the name fragment used in the kernel name, the second is the
// Metal element type.
instantiate_attn_mask_helper(float16, half);
instantiate_attn_mask_helper(bfloat16, bfloat);

instantiate_attn_mask_helper(float32, float);
// clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/attn/loader.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/defines.h"

///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/// Cooperative loader that copies a BROWS x BCOLS tile from device memory
/// into threadgroup memory, with the work divided among tgp_size threads.
///
/// Template parameters:
///   T              element type.
///   BROWS, BCOLS   tile extent in rows / columns.
///   dst_ld         leading dimension (row stride) of the threadgroup tile.
///   reduction_dim  selects how `next()` advances the source: non-zero moves
///                  BCOLS elements along the row, zero moves BROWS rows down.
///   tgp_size       number of threads sharing the load.
///   alignment      alignment of the vectorised read, in units of sizeof(T).
///   n_reads        contiguous elements read by each thread per row.
///   TCOLS          threads needed to cover one row.
///   TROWS          rows covered by all threads in one pass.
template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short alignment = 1,
    short n_reads = (BCOLS * BROWS) / (tgp_size),
    short TCOLS = BCOLS / n_reads,
    short TROWS = tgp_size / TCOLS>
struct BlockLoader {
  // Number of passes needed to cover BROWS rows (rounded up)
  STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
  // Contiguous elements handled by one thread per row
  STEEL_CONST short vec_size = n_reads;

  // Leading dimension for src
  const int src_ld;
  // Element offset applied to src by each call to next()
  const int tile_stride;

  // Thread location indices
  const short thread_idx; // linear thread index in the threadgroup
  const short bi; // first row handled by this thread
  const short bj; // first column handled by this thread

  // threadgroup and device memory (both already offset to (bi, bj))
  threadgroup T* dst;
  const device T* src;

  // Raw-byte container with the size of vec_size elements; lets load_unsafe
  // copy a whole per-thread row segment with one assignment.
  struct alignas(alignment * sizeof(T)) ReadVector {
    uint8_t v[sizeof(T) * vec_size];
  };

  /* Constructor */
  // src_   : top-left element of the source tile in device memory
  // src_ld_: row stride of the source
  // dst_   : top-left element of the destination tile in threadgroup memory
  // The thread index is derived as simd_group_id * 32 + simd_lane_id.
  METAL_FUNC BlockLoader(
      const device T* src_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        src(src_ + bi * src_ld + bj) {}

  /* Apply operation to threadgroup without bound checking */
  // Replaces each element owned by this thread in the threadgroup tile with
  // op.apply(element).
  template <typename UnaryOp>
  METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * dst_ld + j] = op.apply(dst[i * dst_ld + j]);
      }
    }
  }

  /* Load from device memory into threadgroup memory - without bound checking */
  // Copies vec_size elements per row step as one ReadVector.
  METAL_FUNC void load_unsafe() const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      *((threadgroup ReadVector*)(&dst[i * dst_ld])) =
          *((const device ReadVector*)(&src[i * src_ld]));
    }
  }

  /* Load from device memory into threadgroup memory - with bound checking */
  // src_tile_dim holds the valid extent of the source tile: x = columns,
  // y = rows. Elements outside that extent are written as T(0).
  METAL_FUNC void load_safe(short2 src_tile_dim) const {
    // Make the limits relative to this thread's starting position
    src_tile_dim = src_tile_dim - short2(bj, bi);

    // Skip loading if thread has no valid reads
    if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < BROWS; i += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < vec_size; j++) {
          dst[i * dst_ld + j] = T(0);
        }
      }
      return;
    }

    // Use fast thread memory for bound checks
    bool tmp_idx[vec_size];
    T tmp_val[vec_size];

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      // Make sure tmp_idx only contains valid indices
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
      }

      // Read valid indices into tmp_val
      // (out-of-range elements read src[0], which is known to be valid here,
      // and are zeroed in the next step)
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
      }

      // Zero out unneeded values
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
      }

      // Copy values to threadgroup memory
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * dst_ld + j] = tmp_val[j];
      }
    }
  }

  /* Iteration helper */
  // Advances the source pointer to the next tile; the destination is reused.
  METAL_FUNC void next() {
    src += tile_stride;
  }
};

/// Compile-time 2D shape: exposes the template arguments R and C as the
/// constants kRows and kCols.
template <int R, int C>
struct CShape {
  STEEL_CONST int kRows = R;
  STEEL_CONST int kCols = C;
};

/// Cooperative loader that copies a BROWS x BCOLS tile from device memory
/// into threadgroup memory using independent destination row and column
/// strides, so the tile can be stored in a different layout than it is read
/// (for example transposed).
///
/// Template parameters:
///   T              element type.
///   BROWS, BCOLS   tile extent in rows / columns (source layout).
///   kDstStrRow     destination stride between consecutive source rows.
///   kDstStrCol     destination stride between consecutive source columns.
///   reduction_dim  selects how `next()` advances the source: non-zero moves
///                  BCOLS elements along the row, zero moves BROWS rows down.
///   tgp_size       number of threads sharing the load.
///   n_reads        contiguous source elements read by each thread per row.
///   TCOLS          threads needed to cover one row.
///   TROWS          rows covered by all threads in one pass.
template <
    typename T,
    short BROWS,
    short BCOLS,
    short kDstStrRow,
    short kDstStrCol,
    short reduction_dim,
    short tgp_size,
    short n_reads = (BCOLS * BROWS) / (tgp_size),
    short TCOLS = BCOLS / n_reads,
    short TROWS = tgp_size / TCOLS>
struct BlockLoaderT {
  // Number of passes needed to cover BROWS rows (rounded up)
  STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
  // Contiguous source elements handled by one thread per row
  STEEL_CONST short vec_size = n_reads;

  // Leading dimension for src
  const int src_ld;
  // Element offset applied to src by each call to next()
  const int tile_stride;

  // Thread location indices
  const short thread_idx; // linear thread index in the threadgroup
  const short bi; // first row handled by this thread
  const short bj; // first column handled by this thread

  // threadgroup and device memory (both already offset to (bi, bj))
  threadgroup T* dst;
  const device T* src;

  /* Constructor */
  // src_   : top-left element of the source tile in device memory
  // src_ld_: row stride of the source
  // dst_   : origin of the destination tile in threadgroup memory
  // The thread index is derived as simd_group_id * 32 + simd_lane_id.
  METAL_FUNC BlockLoaderT(
      const device T* src_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * kDstStrRow + bj * kDstStrCol),
        src(src_ + bi * src_ld + bj) {}

  /* Apply operation to threadgroup without bound checking */
  // Replaces each element owned by this thread in the threadgroup tile with
  // op.apply(element).
  template <typename UnaryOp>
  METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * kDstStrRow + j * kDstStrCol] =
            op.apply(dst[i * kDstStrRow + j * kDstStrCol]);
      }
    }
  }

  /* Load from device memory into threadgroup memory - without bound checking */
  // Element-wise copy: the destination is strided, so no vectorised write is
  // used here.
  METAL_FUNC void load_unsafe() const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * kDstStrRow + j * kDstStrCol] = src[i * src_ld + j];
      }
    }
  }

  /* Load from device memory into threadgroup memory - with bound checking */
  // src_tile_dim holds the valid extent of the source tile: x = columns,
  // y = rows. Elements outside that extent are written as T(0).
  METAL_FUNC void load_safe(short2 src_tile_dim) const {
    // Make the limits relative to this thread's starting position
    src_tile_dim = src_tile_dim - short2(bj, bi);

    // Skip loading if thread has no valid reads
    if (src_tile_dim.x <= 0 || src_tile_dim.y <= 0) {
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < BROWS; i += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < vec_size; j++) {
          dst[i * kDstStrRow + j * kDstStrCol] = T(0);
        }
      }
      return;
    }

    // Use fast thread memory for bound checks
    bool tmp_idx[vec_size];
    T tmp_val[vec_size];

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < BROWS; i += TROWS) {
      // Make sure tmp_idx only contains valid indices
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_idx[j] = (i < src_tile_dim.y) && (j < src_tile_dim.x);
      }

      // Read valid indices into tmp_val
      // (out-of-range elements read src[0], which is known to be valid here,
      // and are zeroed in the next step)
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = src[(tmp_idx[j] ? i * src_ld + j : 0)];
      }

      // Zero out unneeded values
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);
      }

      // Copy values to threadgroup memory
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < vec_size; j++) {
        dst[i * kDstStrRow + j * kDstStrCol] = tmp_val[j];
      }
    }
  }

  /* Iteration helper */
  // Advances the source pointer to the next tile; the destination is reused.
  METAL_FUNC void next() {
    src += tile_stride;
  }
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/attn/mma.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <metal_simdgroup>
#include <metal_simdgroup_matrix>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/steel/attn/transforms.h"
#include "mlx/backend/metal/kernels/steel/defines.h"
#include "mlx/backend/metal/kernels/steel/utils/integral_constant.h"

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
// MMA helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/// Two-dimensional extent whose row and column components may have different
/// types.
template <typename RInt, typename CInt>
struct Shape2D {
  /// Row component.
  RInt r;
  /// Column component.
  CInt c;

  /// Builds a shape from its row and column components.
  Shape2D(RInt rows, CInt cols) : r(rows), c(cols) {}
};

/// Aggregate pairing a shape object with a layout object. Both types are
/// supplied by the user; this struct only stores them.
template <typename Shape, typename Layout>
struct Layout2D {
  Shape shape;
  Layout layout;
};

/// Primary template for a per-thread view of a matrix fragment.
///
/// Only the 8 x 8 shape is implemented (by the partial specialization);
/// instantiating any other shape selects this primary template and fails the
/// static_asserts below.
template <typename T, int kFragRows_, int kFragCols_>
struct BaseMMAFrag {
  static_assert(
      kFragRows_ == 8,
      "Only 8 x 8 fragment matrices are currently supported");
  static_assert(
      kFragCols_ == 8,
      "Only 8 x 8 fragment matrices are currently supported");
};

/// Per-thread view of an 8 x 8 simdgroup matrix fragment.
///
/// The 64 elements of a fragment are spread over the 32 threads of a
/// simdgroup, so each thread holds kElemsPerFrag = 2 elements, arranged as
/// 1 row x 2 columns. All members are static helpers operating on that
/// two-element vector (`frag_type`) or on `metal::simdgroup_matrix` objects.
template <typename T>
struct BaseMMAFrag<T, 8, 8> {
  STEEL_CONST int kFragRows = 8;
  STEEL_CONST int kFragCols = 8;

  // Elements held by each of the 32 threads
  STEEL_CONST int kElemsPerFrag = (kFragRows * kFragCols) / 32;

  // Per-thread element arrangement inside the fragment
  STEEL_CONST int kElemRows = 1;
  STEEL_CONST int kElemCols = 2;

  static_assert(
      kElemRows * kElemCols == kElemsPerFrag,
      "MMAFrag shape is not consistent with MMAFrag size");

  typedef metal::simdgroup_matrix<T, kFragRows, kFragCols> mat_type;
  typedef metal::vec<T, kElemsPerFrag> frag_type;
  typedef metal::vec<T, kElemRows> row_frag_type;
  typedef metal::vec<T, kElemCols> col_frag_type;

  // Same shapes as above for an arbitrary element type U
  template <typename U>
  using dtype_mat_t = typename metal::simdgroup_matrix<U, kFragRows, kFragCols>;

  template <typename U>
  using dtype_frag_t = typename metal::vec<U, kElemsPerFrag>;

  // Maps a lane index to the position of that lane's first element inside
  // the fragment. Returns short2{fn, fm}: x is the column offset, y the row
  // offset.
  // NOTE(review): the bit manipulation presumably mirrors the hardware
  // thread-element layout of simdgroup_matrix -- confirm against the Metal
  // documentation before changing it.
  METAL_FUNC static constexpr short2 get_coord(
      ushort simd_lane_id [[thread_index_in_simdgroup]]) {
    const short qid = simd_lane_id / 4;
    const short fm = (qid & 4) + ((simd_lane_id / 2) % 4);
    const short fn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
    return short2{fn, fm};
  }

  // Loads this thread's elements from src, where src already points at the
  // thread's first element. str_x is the row stride and str_y the column
  // stride; values are converted to T.
  template <typename SrcPtrType, typename StrX, typename StrY>
  METAL_FUNC static constexpr void
  load(thread frag_type& dst, SrcPtrType src, StrX str_x, StrY str_y) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        dst[i * kElemCols + j] = static_cast<T>(src[i * str_x + j * str_y]);
      }
    }
  }

  // Bounds-checked load. (off_x, off_y) is the row/column offset of the
  // fragment relative to src; an element is read only if its offset row is
  // below lim_x and its offset column is below lim_y, otherwise T(0) is
  // stored in dst.
  template <
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX,
      typename OffY>
  METAL_FUNC static constexpr void load_safe(
      thread frag_type& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = Int<0>{},
      OffY off_y = Int<0>{}) {
    // Move to the fragment, then walk the elements by pointer increments
    src += off_x * str_x + off_y * str_y;
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
          dst[i * kElemCols + j] = static_cast<T>(src[0]);
        } else {
          dst[i * kElemCols + j] = T(0);
        }
        src += str_y;
      }
      // Rewind the columns and step to the next row
      src -= kElemCols * str_y;
      src += str_x;
    }
  }

  // Stores this thread's elements to dst, where dst already points at the
  // thread's first element. Values are converted to the pointee type of dst.
  template <typename DstPtrType, typename StrX, typename StrY>
  METAL_FUNC static constexpr void
  store(const thread frag_type& src, DstPtrType dst, StrX str_x, StrY str_y) {
    using U = pointer_element_t<DstPtrType>;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        dst[i * str_x + j * str_y] = static_cast<U>(src[i * kElemCols + j]);
      }
    }
  }

  // Bounds-checked store. (off_x, off_y) is the row/column offset of the
  // fragment relative to dst; elements whose offset position falls outside
  // (lim_x, lim_y) are skipped.
  template <
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX,
      typename OffY>
  METAL_FUNC static constexpr void store_safe(
      const thread frag_type& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = Int<0>{},
      OffY off_y = Int<0>{}) {
    using U = pointer_element_t<DstPtrType>;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
          dst[(off_x + i) * str_x + (off_y + j) * str_y] =
              static_cast<U>(src[i * kElemCols + j]);
        }
      }
    }
  }

  // Fragment-level multiply-accumulate: copies the per-thread vectors A, B
  // and C into simdgroup matrices, runs the matrix overload below, and
  // copies the result back into D.
  template <typename Atype, typename Btype, typename Ctype>
  METAL_FUNC static constexpr void mma(
      thread frag_type& D,
      thread dtype_frag_t<Atype>& A,
      thread dtype_frag_t<Btype>& B,
      thread dtype_frag_t<Ctype>& C) {
    mat_type D_mat;
    dtype_mat_t<Atype> A_mat;
    dtype_mat_t<Btype> B_mat;
    dtype_mat_t<Ctype> C_mat;

    reinterpret_cast<thread dtype_frag_t<Atype>&>(A_mat.thread_elements()) = A;
    reinterpret_cast<thread dtype_frag_t<Btype>&>(B_mat.thread_elements()) = B;
    reinterpret_cast<thread dtype_frag_t<Ctype>&>(C_mat.thread_elements()) = C;

    mma(D_mat, A_mat, B_mat, C_mat);

    D = reinterpret_cast<thread frag_type&>(D_mat.thread_elements());
  }

  // Matrix-level multiply-accumulate; forwards to
  // simdgroup_multiply_accumulate(D, A, B, C).
  template <typename Atype, typename Btype, typename Ctype>
  METAL_FUNC static constexpr void mma(
      thread mat_type& D,
      thread dtype_mat_t<Atype>& A,
      thread dtype_mat_t<Btype>& B,
      thread dtype_mat_t<Ctype>& C) {
    simdgroup_multiply_accumulate(D, A, B, C);
  }

  // Reduces one fragment row with Op and folds the result into
  // reduced_vals[0]. The thread first combines its own two elements, then
  // exchanges partial results with the lanes whose index differs in bit 0
  // (xor 1) and bit 3 (xor 8).
  template <typename Op>
  METAL_FUNC static constexpr void row_reduce(
      thread const frag_type& inp_vals,
      thread T* reduced_vals) {
    T thr_reduce = Op::apply(inp_vals.x, inp_vals.y);

    T qgr_reduce = simd_shuffle_xor(thr_reduce, ushort(1));
    qgr_reduce = Op::apply(thr_reduce, qgr_reduce);

    T sgr_reduce = simd_shuffle_xor(qgr_reduce, ushort(8));
    sgr_reduce = Op::apply(qgr_reduce, sgr_reduce);

    reduced_vals[0] = Op::apply(reduced_vals[0], sgr_reduce);
  }

  // Applies Op(element, row_vals[row]) in place to every element held by
  // this thread.
  template <typename Op>
  METAL_FUNC static constexpr void row_bin_op(
      thread frag_type& inp_vals,
      thread T* row_vals) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        inp_vals[i * kElemCols + j] =
            Op::apply(inp_vals[i * kElemCols + j], row_vals[i]);
      }
    }
  }
};

/// Per-thread storage for a kTileRows_ x kTileCols_ grid of matrix fragments.
///
/// Fragments are stored row-major in `val_frags`; each entry is the
/// per-thread element vector of one fragment (see MMAFrag_). The load/store
/// helpers forward to the fragment class for every fragment in the grid.
template <
    typename T,
    int kTileRows_,
    int kTileCols_,
    class MMAFrag_ = BaseMMAFrag<T, 8, 8>>
struct MMATile {
  using MMAFrag_t = MMAFrag_;
  using elem_type = T;
  STEEL_CONST int kFragRows = MMAFrag_t::kFragRows;
  STEEL_CONST int kFragCols = MMAFrag_t::kFragCols;
  STEEL_CONST int kElemsPerFrag = MMAFrag_t::kElemsPerFrag;

  // Tile extent in fragments
  STEEL_CONST int kTileRows = kTileRows_;
  STEEL_CONST int kTileCols = kTileCols_;

  // Tile extent in elements
  STEEL_CONST int kRows = kTileRows * kFragRows;
  STEEL_CONST int kCols = kTileCols * kFragCols;

  STEEL_CONST int kNumFrags = kTileRows * kTileCols;
  // Elements of the tile held by one thread
  STEEL_CONST int kElemsPerTile = kNumFrags * kElemsPerFrag;

  // Distinct rows / columns of the tile touched by one thread
  STEEL_CONST int kRowsPerThread = kTileRows * MMAFrag_t::kElemRows;
  STEEL_CONST int kColsPerThread = kTileCols * MMAFrag_t::kElemCols;

  typedef typename MMAFrag_t::mat_type mat_type;
  typedef typename MMAFrag_t::frag_type frag_type;

  // Not initialised on construction; call clear() before accumulating
  frag_type val_frags[kNumFrags]; // = {frag_type(0)};

  METAL_FUNC MMATile() thread {}

  // Sets every element held by this thread to zero.
  METAL_FUNC constexpr void clear() {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kNumFrags; ++i) {
      val_frags[i] = frag_type(0);
    }
  }

  // Fragment at fragment-row i, fragment-column j.
  METAL_FUNC constexpr thread frag_type& frag_at(const short i, const short j) {
    return val_frags[i * kTileCols + j];
  }

  METAL_FUNC constexpr const thread frag_type& frag_at(
      const short i,
      const short j) const {
    return val_frags[i * kTileCols + j];
  }

  // Returns fragment (i, j) copied into a simdgroup matrix object.
  METAL_FUNC mat_type mat_at(const short i, const short j) {
    mat_type val_mat;
    STEEL_PRAGMA_UNROLL
    for (short ii = 0; ii < kElemsPerFrag; ++ii) {
      val_mat.thread_elements()[ii] = frag_at(i, j)[ii];
    }
    return val_mat;
  }

  // Flat view over all kElemsPerTile elements held by this thread.
  METAL_FUNC thread elem_type* elems() {
    return reinterpret_cast<thread elem_type*>(val_frags);
  }

  METAL_FUNC const thread elem_type* elems() const {
    return reinterpret_cast<const thread elem_type*>(val_frags);
  }

  // Folds every fragment of each fragment-row into vals with Op. vals is
  // both input (initial value) and output.
  template <typename Op>
  METAL_FUNC void row_reduce(thread T vals[kRowsPerThread]) const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kTileCols; ++j) {
        MMAFrag_t::template row_reduce<Op>(
            frag_at(i, j), &vals[i * MMAFrag_t::kElemRows]);
      }
    }
  }

  // Applies Op(element, vals[row]) in place to every element of the tile.
  template <typename Op>
  METAL_FUNC void row_bin_op(thread T vals[kRowsPerThread]) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kTileCols; ++j) {
        MMAFrag_t::template row_bin_op<Op>(
            frag_at(i, j), &vals[i * MMAFrag_t::kElemRows]);
      }
    }
  }

  // Loads the tile from threadgroup memory with compile-time strides.
  // Consecutive fragments are spaced kFragRows * w_x rows and
  // kFragCols * w_y columns apart.
  // NOTE(review): w_x / w_y are presumably the number of simdgroups along
  // each dimension -- confirm against the callers.
  template <typename U, int w_x, int w_y, int str_x, int str_y>
  METAL_FUNC void load(const threadgroup U* src) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kTileCols; ++j) {
        MMAFrag_t::load(
            frag_at(i, j),
            &(
                src[(i * kFragRows) * w_x * str_x +
                    (j * kFragCols) * w_y * str_y]),
            Int<str_x>{},
            Int<str_y>{});
      }
    }
  }

  // Stores the tile to threadgroup memory with compile-time strides; the
  // fragment spacing matches the threadgroup load above.
  template <typename U, int w_x, int w_y, int str_x, int str_y>
  METAL_FUNC void store(threadgroup U* dst) const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kTileCols; ++j) {
        MMAFrag_t::store(
            frag_at(i, j),
            &(
                dst[(i * kFragRows) * w_x * str_x +
                    (j * kFragCols) * w_y * str_y]),
            Int<str_x>{},
            Int<str_y>{});
      }
    }
  }

  // Loads the tile from device memory with row stride ld and unit column
  // stride, without bounds checks.
  template <typename U, int w_x, int w_y>
  METAL_FUNC void load(const device U* src, const int ld) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kTileCols; ++j) {
        MMAFrag_t::load(
            frag_at(i, j),
            &(src[(i * kFragRows) * w_x * ld + (j * kFragCols) * w_y]),
            ld,
            Int<1>{});
      }
    }
  }

  // Stores the tile to device memory with row stride ld and unit column
  // stride, without bounds checks.
  template <typename U, int w_x, int w_y>
  METAL_FUNC void store(device U* dst, const int ld) const {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kTileCols; ++j) {
        MMAFrag_t::store(
            frag_at(i, j),
            &(dst[(i * kFragRows) * w_x * ld + (j * kFragCols) * w_y]),
            ld,
            Int<1>{});
      }
    }
  }

  // Bounds-checked device load. src_tile_dims holds the valid extent
  // relative to src: x = columns, y = rows. Out-of-range elements are
  // handled by MMAFrag_t::load_safe.
  template <typename U, int w_x, int w_y>
  METAL_FUNC void
  load_safe(const device U* src, const int ld, const short2 src_tile_dims) {
    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (int j = 0; j < kTileCols; ++j) {
        MMAFrag_t::load_safe(
            frag_at(i, j),
            src,
            ld,
            Int<1>{},
            src_tile_dims.y,
            src_tile_dims.x,
            (i * kFragRows) * w_x,
            (j * kFragCols) * w_y);
      }
    }
  }

  // Bounds-checked device store. dst_tile_dims holds the valid extent
  // relative to dst: x = columns, y = rows. Out-of-range elements are
  // handled by MMAFrag_t::store_safe.
  template <typename U, int w_x, int w_y>
  METAL_FUNC void
  store_safe(device U* dst, const int ld, const short2 dst_tile_dims) const {
    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < kTileRows; ++i) {
      STEEL_PRAGMA_UNROLL
      for (int j = 0; j < kTileCols; ++j) {
        MMAFrag_t::store_safe(
            frag_at(i, j),
            dst,
            ld,
            Int<1>{},
            dst_tile_dims.y,
            dst_tile_dims.x,
            (i * kFragRows) * w_x,
            (j * kFragCols) * w_y);
      }
    }
  }
};

/**
 * Fragment-by-fragment multiply-add over register tiles.
 *
 * For every output fragment (m, n) and every k in [0, K) this issues
 * MMAFragD::mma(D(m, n), A(m, k), B(k, n), C(m, n)).
 *
 * Output columns are walked left-to-right on even fragment rows and
 * right-to-left on odd ones; fragment rows are always visited in order.
 *
 * NOTE(review): the k loop passes C(m, n) as the addend on every step, so
 * accumulation across k appears to rely on D and C being the same tile
 * (as in BlockMMA::mma) - confirm before calling with distinct tiles and
 * K > 1.
 */
template <
    typename Dtype,
    typename Atype,
    typename Btype,
    typename Ctype,
    int M,
    int N,
    int K,
    class MMAFragD,
    class MMAFragA,
    class MMAFragB,
    class MMAFragC>
METAL_FUNC void tile_matmad(
    thread MMATile<Dtype, M, N, MMAFragD>& D,
    thread MMATile<Atype, M, K, MMAFragA>& A,
    thread MMATile<Btype, K, N, MMAFragB>& B,
    thread MMATile<Ctype, M, N, MMAFragC>& C) {
  STEEL_PRAGMA_UNROLL
  for (short row = 0; row < M; ++row) {
    // Odd fragment rows traverse the columns in reverse
    const bool reverse_cols = (row % 2) != 0;

    STEEL_PRAGMA_UNROLL
    for (short step = 0; step < N; ++step) {
      short col = step;
      if (reverse_cols) {
        col = N - 1 - step;
      }

      STEEL_PRAGMA_UNROLL
      for (short kk = 0; kk < K; ++kk) {
        MMAFragD::mma(
            D.frag_at(row, col),
            A.frag_at(row, kk),
            B.frag_at(kk, col),
            C.frag_at(row, col));
      }
    }
  }
}

/**
 * Threadgroup-level (BM x BK) * (BK x BN) multiply-accumulate helper.
 *
 * Each simdgroup owns a TM x TN grid of kFragSize x kFragSize accumulator
 * fragments (Ctile). mma() streams A and B fragments out of threadgroup
 * memory and accumulates into Ctile; the store_result* and apply_epilogue*
 * members post-process Ctile and write it to device memory.
 *
 * Template parameters:
 *   T, U            element types of the threadgroup inputs / device output
 *   BM, BN, BK      block extents along M, N and K
 *   WM, WN          simdgroup grid used to split the block along M and N
 *   transpose_a/b   select which of (1, ld*_tgp) is the M/K resp. K/N stride
 *   lda_tgp/ldb_tgp leading dimensions of A and B in threadgroup memory
 *   AccumType       element type of the register tiles
 *   Epilogue        transform applied by the store_result* members
 */
template <
    typename T,
    typename U,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    short lda_tgp,
    short ldb_tgp,
    typename AccumType = float,
    typename Epilogue = TransformNone<U, AccumType>>
struct BlockMMA {
  // MMAFrag size
  STEEL_CONST short kFragSize = 8;
  using MMAFrag_acc_t = BaseMMAFrag<AccumType, kFragSize, kFragSize>;

  // Warp tile simdgroup matrix strides along M
  STEEL_CONST short TM_stride = kFragSize * WM;
  // Warp tile simdgroup matrix strides along N
  STEEL_CONST short TN_stride = kFragSize * WN;

  // Warp tile size along M
  STEEL_CONST short TM = BM / TM_stride;
  // Warp tile size along N
  STEEL_CONST short TN = BN / TN_stride;

  // Threadgroup A strides
  STEEL_CONST short A_str_m = transpose_a ? 1 : lda_tgp; // M
  STEEL_CONST short A_str_k = transpose_a ? lda_tgp : 1; // K

  // Threadgroup B strides
  STEEL_CONST short B_str_k = transpose_b ? 1 : ldb_tgp; // K
  STEEL_CONST short B_str_n = transpose_b ? ldb_tgp : 1; // N

  // Threadgroup strides along K
  STEEL_CONST short tile_stride_a = kFragSize * A_str_k;
  STEEL_CONST short tile_stride_b = kFragSize * B_str_k;

  // Simdgroup matrices
  MMATile<AccumType, TM, 1, MMAFrag_acc_t> Atile;
  MMATile<AccumType, 1, TN, MMAFrag_acc_t> Btile;
  MMATile<AccumType, TM, TN, MMAFrag_acc_t> Ctile;

  // Offsets within threadgroup: after construction sm / sn hold this
  // thread's row / column inside the block (fragment coordinate plus the
  // simdgroup offset)
  short sm;
  short sn;

  // Per-thread element offsets applied to the A and B threadgroup pointers
  short As_offset;
  short Bs_offset;

  /* Constructor */
  METAL_FUNC BlockMMA(
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]]) {
    // Determine thread position in simdgroup matrix
    short tm = kFragSize * (simd_group_id / WN);
    short tn = kFragSize * (simd_group_id % WN);

    short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);
    sm = simd_coord.y;
    sn = simd_coord.x;

    // Determine thread and simdgroup offset
    As_offset = (tm + sm) * A_str_m + (sn)*A_str_k; // M, K
    Bs_offset = (sm)*B_str_k + (tn + sn) * B_str_n; // K, N

    // From here on sm / sn are block-relative (simdgroup offset included)
    sm += tm;
    sn += tn;
  }

  /* (BM, BK) X (BK, BN) multiply accumulate function */
  METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
    // Adjust for simdgroup and thread location
    As += As_offset;
    Bs += Bs_offset;

    // Iterate over BK in blocks of kFragSize
    STEEL_PRAGMA_UNROLL
    for (short kk = 0; kk < BK; kk += kFragSize) {
      simdgroup_barrier(mem_flags::mem_none);

      Atile.template load<T, WM, 1, A_str_m, A_str_k>(As);

      simdgroup_barrier(mem_flags::mem_none);

      Btile.template load<T, 1, WN, B_str_k, B_str_n>(Bs);

      simdgroup_barrier(mem_flags::mem_none);

      // Ctile is both destination and addend, so it accumulates in place
      tile_matmad(Ctile, Atile, Btile, Ctile);

      // Progress to next simdgroup tile
      As += tile_stride_a;
      Bs += tile_stride_b;
    }
  }

  /* Store results from simdgroup_matrix results into device memory */
  METAL_FUNC void store_result(device U* D, const int ldd) {
    // Apply epilogue (in place: Ctile is modified)
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Adjust for simdgroup and thread location
    D += sm * ldd + sn;

    Ctile.template store<U, WM, WN>(D, ldd);
  }

  /* Bounds-checked store: dst_tile_dims is (x = columns, y = rows) */
  METAL_FUNC void
  store_result_safe(device U* D, const int ldd, short2 dst_tile_dims) {
    // Apply epilogue (in place: Ctile is modified)
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Adjust for simdgroup and thread location
    D += sm * ldd + sn;
    dst_tile_dims -= short2(sn, sm);

    // Nothing to write if this thread starts outside the valid region
    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    Ctile.template store_safe<U, WM, WN>(D, ldd, dst_tile_dims);
  }

  /* Apply epilogue */
  template <typename UnaryEpilogue>
  METAL_FUNC void apply_epilogue(thread const UnaryEpilogue& epilogue_op) {
    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = epilogue_op.apply(Ctile.elems()[i]);
    }
  }

  /* Apply epilogue */
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue(
      const device U* C,
      const int ldc,
      const int fdc,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < decltype(Ctile)::kElemsPerFrag; k++) {
          accum[k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
        }
      }
    }
  }

  /* Apply epilogue (bounds-checked reads of C) */
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue_safe(
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      // Fragment rows at or past dst_tile_dims.y lie outside the valid
      // region; C must not be read there (same row test as
      // store_result_safe uses before touching C)
      const bool row_in_bounds = (i * TM_stride) < dst_tile_dims.y;

      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

        // Read C; out-of-bounds elements stay zero
        U c_elems[kelems] = {0};

        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          if (row_in_bounds && (j * TN_stride + k) < dst_tile_dims.x) {
            c_elems[k] = C[offset_c + k * fdc];
          }
        }

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          accum[k] = epilogue_op.apply(accum[k], c_elems[k]);
        }
      }
    }
  }

  /* Store results from simdgroup_matrix results into device memory */
  METAL_FUNC void store_result(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;

    constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread const auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
        int offset_d = (i * TM_stride) * ldd + (j * TN_stride);

        // Apply epilogue (Ctile itself is left untouched)
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          D[offset_d + k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
        }
      }
    }
  }

  /* Bounds-checked variant: dst_tile_dims is (x = columns, y = rows) */
  METAL_FUNC void store_result_safe(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < TM; i++) {
      if (i * TM_stride < dst_tile_dims.y) {
        STEEL_PRAGMA_UNROLL
        for (int j = 0; j < TN; j++) {
          // Get accumulated result and associated offset in C
          thread const auto& accum = Ctile.frag_at(i, j);
          int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
          int offset_d = (i * TM_stride) * ldd + (j * TN_stride);

          // Apply epilogue
          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < kelems; k++) {
            if ((j * TN_stride + k) < dst_tile_dims.x) {
              D[offset_d + k] =
                  epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
            }
          }
        }
      }
    }
  }
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/attn/nax.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <metal_simdgroup>
#include <metal_simdgroup_matrix>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/steel/defines.h"
#include "mlx/backend/metal/kernels/steel/utils/integral_constant.h"

#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
// MMA helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

///////////////////////////////////////////////////////////////////////////////
// NAX Steel with new tiles
///////////////////////////////////////////////////////////////////////////////

/**
 * Fragment policy for 16 x 16 register fragments spread over a simdgroup.
 *
 * Every thread holds kElemsPerFrag = 8 values per fragment, arranged as
 * kElemRows = 2 rows (kElemRowsJump = 8 fragment rows apart) of
 * kElemCols = 4 consecutive columns. The static members provide the
 * thread-to-coordinate mapping, strided loads/stores with optional bounds
 * checks, per-row reductions, and the matmul wrappers built on
 * mpp::tensor_ops::matmul2d.
 */
struct BaseNAXFrag {
  STEEL_CONST short kFragRows = 16;
  STEEL_CONST short kFragCols = 16;

  // Values held per thread (fragment area divided over 32 lanes)
  STEEL_CONST short kElemsPerFrag = (kFragRows * kFragCols) / 32;

  STEEL_CONST short kElemRows = 2;
  STEEL_CONST short kElemCols = 4;

  // Fragment-row distance between the two rows owned by one thread
  STEEL_CONST short kElemRowsJump = 8;

  static_assert(
      kElemRows * kElemCols == kElemsPerFrag,
      "MMAFrag shape is not consistent with MMAFrag size");

  template <typename U>
  using dtype_frag_t = typename metal::vec<U, kElemsPerFrag>;

  /**
   * Coordinate of the calling thread's first element inside a fragment.
   * Returns {column, row}: .x is the column, .y is the row.
   */
  METAL_FUNC static short2 get_coord() {
    const ushort simd_lane_id = __metal_get_thread_index_in_simdgroup(ushort());
    const short qid = simd_lane_id >> 2;
    const short fm = ((qid & 4) | ((simd_lane_id >> 1) & 3));
    const short fn = ((qid & 2) | (simd_lane_id & 1)) * 4;
    return short2{fn, fm};
  }

  /**
   * Coordinate of the calling thread's idx-th element inside a fragment.
   * idx / 4 selects the thread-local row (adds 8 fragment rows per step),
   * idx % 4 selects the column within the 4-wide run.
   * Returns {column, row}.
   */
  METAL_FUNC static short2 get_coord(short idx) {
    const ushort simd_lane_id = __metal_get_thread_index_in_simdgroup(ushort());
    const short qid = simd_lane_id >> 2;
    const short fm = ((qid & 4) | ((simd_lane_id >> 1) & 3)) + (idx >> 2) * 8;
    const short fn = ((qid & 2) | (simd_lane_id & 1)) * 4 + idx % 4;
    return short2{fn, fm};
  }

  /**
   * Unchecked load of this thread's 8 values.
   * Element (r, c) is read from src[r * str_x + c * str_y] relative to the
   * thread's own coordinate; off_x / off_y shift the row / column.
   */
  template <
      typename T,
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void load(
      thread dtype_frag_t<T>& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      OffX off_x = {},
      OffY off_y = {}) {
    const short2 sc = get_coord();
    src += sc.y * str_x + sc.x * str_y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      // Unit column stride: skip the multiply by str_y
      if constexpr (metal::is_same_v<StrY, Int<1>>) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[i * kElemCols + j] = static_cast<T>(src[r * str_x + c + j]);
        }
      } else {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[i * kElemCols + j] =
              static_cast<T>(src[r * str_x + (c + j) * str_y]);
        }
      }
    }
  }

  /**
   * Row-checked load: rows at or past lim_x are filled with T(0) instead
   * of being read. Columns are not checked.
   */
  template <
      typename T,
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void load_rows(
      thread dtype_frag_t<T>& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      OffX off_x = {},
      OffY off_y = {}) {
    const short2 sc = get_coord();
    src += sc.y * str_x + sc.x * str_y;
    // Row limit expressed relative to this thread's starting row
    auto lx = lim_x - sc.y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      if (r < lx) {
        if constexpr (metal::is_same_v<StrY, Int<1>>) {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[i * kElemCols + j] = static_cast<T>(src[r * str_x + (c + j)]);
          }
        } else {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[i * kElemCols + j] =
                static_cast<T>(src[r * str_x + (c + j) * str_y]);
          }
        }

      } else {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[i * kElemCols + j] = T(0);
        }
      }
    }
  }

  /**
   * Fully checked load: any element whose row is at or past lim_x or whose
   * column is at or past lim_y is set to T(0) instead of being read.
   */
  template <
      typename T,
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void load_safe(
      thread dtype_frag_t<T>& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = {},
      OffY off_y = {}) {
    const short2 sc = get_coord();
    src += sc.y * str_x + sc.x * str_y;
    // Limits expressed relative to this thread's starting coordinate
    auto lx = lim_x - sc.y;
    auto ly = lim_y - sc.x;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if ((r < lx) && ((c + j) < ly)) {
          dst[i * kElemCols + j] =
              static_cast<T>(src[r * str_x + (c + j) * str_y]);
        } else {
          dst[i * kElemCols + j] = T(0);
        }
      }
    }
  }

  /**
   * Unchecked store of this thread's 8 values, converted to the pointee
   * type of dst. Addressing mirrors load().
   */
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      OffX off_x = {},
      OffY off_y = {}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();
    dst += sc.y * str_x + sc.x * str_y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      if constexpr (metal::is_same_v<StrY, Int<1>>) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[r * str_x + c + j] = static_cast<U>(src[i * kElemCols + j]);
        }
      } else {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[r * str_x + (c + j) * str_y] =
              static_cast<U>(src[i * kElemCols + j]);
        }
      }
    }
  }

  /**
   * Row-checked store: rows at or past lim_x are skipped. Columns are not
   * checked.
   */
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store_rows(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      OffX off_x = {},
      OffY off_y = {}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();
    dst += sc.y * str_x + sc.x * str_y;
    auto lx = lim_x - sc.y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      if (r < lx) {
        if constexpr (metal::is_same_v<StrY, Int<1>>) {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[r * str_x + c + j] = static_cast<U>(src[i * kElemCols + j]);
          }
        } else {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[r * str_x + (c + j) * str_y] =
                static_cast<U>(src[i * kElemCols + j]);
          }
        }
      }
    }
  }

  /**
   * Fully checked store: only elements with row < lim_x and column < lim_y
   * are written.
   */
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store_safe(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = {},
      OffY off_y = {}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();
    dst += sc.y * str_x + sc.x * str_y;
    auto lx = lim_x - sc.y;
    auto ly = lim_y - sc.x;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if (r < lx && (c + j) < ly) {
          dst[r * str_x + (c + j) * str_y] =
              static_cast<U>(src[i * kElemCols + j]);
        }
      }
    }
  }

  /**
   * Windowed store: only elements whose row lies in [start_x, stop_x) and
   * whose column lies in [start_y, stop_y) are written. Unlike the other
   * stores, dst is not pre-offset; the thread coordinate is added at the
   * point of the write.
   */
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename StartX,
      typename StopX,
      typename StartY,
      typename StopY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store_slice(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      StartX start_x,
      StopX stop_x,
      StartY start_y,
      StopY stop_y,
      OffX off_x = Int<0>{},
      OffY off_y = Int<0>{}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();

    const_for_loop<0, kElemRows, 1>([&](auto idx_row) {
      const auto r = off_x + idx_row * Int<kElemRowsJump>{};
      // Skip rows outside the window (bounds made thread-relative)
      if (r >= stop_x - sc.y || r < start_x - sc.y) {
        return;
      }

      const_for_loop<0, kElemCols, 1>([&](auto idx_col) {
        const auto c = off_y + idx_col;
        // Skip columns outside the window
        if (c >= stop_y - sc.x || c < start_y - sc.x) {
          return;
        }

        const auto src_idx = idx_row * Int<kElemCols>{} + idx_col;
        dst[(r + sc.y) * str_x + (c + sc.x) * str_y] =
            static_cast<U>(src[src_idx]);
      });
    });
  }

  /**
   * Reduce each of the thread's rows with Op and fold the result into
   * reduced_vals[i].
   *
   * The 4 thread-local values of a row are combined first, then the
   * partial is combined with the lanes at (lane ^ 1) and (lane ^ 8).
   */
  template <typename Op, typename T>
  METAL_FUNC static constexpr void row_reduce(
      thread const dtype_frag_t<T>& inp_vals,
      thread T* reduced_vals) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      T thr_reduce = Op::apply(
          Op::apply(inp_vals[i * kElemCols + 0], inp_vals[i * kElemCols + 1]),
          Op::apply(inp_vals[i * kElemCols + 2], inp_vals[i * kElemCols + 3]));

      T qgr_reduce = simd_shuffle_xor(thr_reduce, ushort(1));
      qgr_reduce = Op::apply(thr_reduce, qgr_reduce);

      T sgr_reduce = simd_shuffle_xor(qgr_reduce, ushort(8));
      sgr_reduce = Op::apply(qgr_reduce, sgr_reduce);

      reduced_vals[i] = Op::apply(reduced_vals[i], sgr_reduce);
    }
  }

  /**
   * Apply Op(value, row_vals[i]) in place to every value of thread-local
   * row i.
   */
  template <typename Op, typename T>
  METAL_FUNC static constexpr void row_bin_op(
      thread dtype_frag_t<T>& inp_vals,
      thread T* row_vals) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        inp_vals[i * kElemCols + j] =
            Op::apply(inp_vals[i * kElemCols + j], row_vals[i]);
      }
    }
  }

  /**
   * Multiply-accumulate with the output doubled along N:
   * one A fragment, two B fragments (Bn0, Bn1) and two C fragments
   * (Cn0, Cn1), issued as a single 16 x 32 x 16 matmul2d.
   */
  template <
      typename CType,
      typename AType,
      typename BType,
      bool transpose_a = false,
      bool transpose_b = false>
  METAL_FUNC static constexpr void mma(
      thread dtype_frag_t<CType>& Cn0,
      thread dtype_frag_t<CType>& Cn1,
      const thread dtype_frag_t<AType>& A,
      metal::bool_constant<transpose_a>,
      const thread dtype_frag_t<BType>& Bn0,
      const thread dtype_frag_t<BType>& Bn1,
      metal::bool_constant<transpose_b>) {
    // Create Matmul descriptor (M = 16, N = 32, K = 16)
    constexpr auto desc = mpp::tensor_ops::matmul2d_descriptor(
        16,
        32,
        16,
        transpose_a,
        transpose_b,
        true,
        mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate);

    // Create matmul op
    mpp::tensor_ops::matmul2d<desc, metal::execution_simdgroup> gemm_op;

    // Create matmul operands in registers
    auto ct_a =
        gemm_op
            .template get_left_input_cooperative_tensor<AType, BType, CType>();
    auto ct_b =
        gemm_op
            .template get_right_input_cooperative_tensor<AType, BType, CType>();

    // Create matmul output in register
    auto ct_c = gemm_op.template get_destination_cooperative_tensor<
        decltype(ct_a),
        decltype(ct_b),
        CType>();

    // Load A in to left operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_a[i] = A[i];
    }

    // Load B into right operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_b[i] = Bn0[i];
      ct_b[kElemsPerFrag + i] = Bn1[i];
    }

    // Load C into output registers (op handles accumulation)
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_c[i] = Cn0[i];
      ct_c[kElemsPerFrag + i] = Cn1[i];
    }

    // Do matmul
    gemm_op.run(ct_a, ct_b, ct_c);

    // Copy out results
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      Cn0[i] = ct_c[i];
      Cn1[i] = ct_c[kElemsPerFrag + i];
    }
  }

  /**
   * Multiply-accumulate with the output doubled along M:
   * two A fragments (Am0, Am1), one B fragment and two C fragments
   * (Cm0, Cm1), issued as a single 32 x 16 x 16 matmul2d.
   */
  template <
      typename CType,
      typename AType,
      typename BType,
      bool transpose_a = false,
      bool transpose_b = false>
  METAL_FUNC static constexpr void mma(
      thread dtype_frag_t<CType>& Cm0,
      thread dtype_frag_t<CType>& Cm1,
      const thread dtype_frag_t<AType>& Am0,
      const thread dtype_frag_t<AType>& Am1,
      metal::bool_constant<transpose_a>,
      const thread dtype_frag_t<BType>& B,
      metal::bool_constant<transpose_b>) {
    // Create Matmul descriptor (M = 32, N = 16, K = 16).
    // The left operand carries two fragments (2 * kElemsPerFrag values per
    // thread) and the right operand one, so M - not N - must be the
    // doubled extent for the cooperative tensors to have matching sizes.
    constexpr auto desc = mpp::tensor_ops::matmul2d_descriptor(
        32,
        16,
        16,
        transpose_a,
        transpose_b,
        true,
        mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate);

    // Create matmul op
    mpp::tensor_ops::matmul2d<desc, metal::execution_simdgroup> gemm_op;

    // Create matmul operands in registers
    auto ct_a =
        gemm_op
            .template get_left_input_cooperative_tensor<AType, BType, CType>();
    auto ct_b =
        gemm_op
            .template get_right_input_cooperative_tensor<AType, BType, CType>();

    // Create matmul output in register
    auto ct_c = gemm_op.template get_destination_cooperative_tensor<
        decltype(ct_a),
        decltype(ct_b),
        CType>();

    // Load A in to left operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_a[i] = Am0[i];
      ct_a[kElemsPerFrag + i] = Am1[i];
    }

    // Load B into right operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_b[i] = B[i];
    }

    // Load C into output registers (op handles accumulation)
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_c[i] = Cm0[i];
      ct_c[kElemsPerFrag + i] = Cm1[i];
    }

    // Do matmul
    gemm_op.run(ct_a, ct_b, ct_c);

    // Copy out results
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      Cm0[i] = ct_c[i];
      Cm1[i] = ct_c[kElemsPerFrag + i];
    }
  }
};

/**
 * Per-thread register tile made of kTileRows_ x kTileCols_ fragments.
 *
 * Fragments are stored row-major in val_frags; fragment geometry and all
 * load / store / reduce primitives come from the NAXFrag_ policy. The tile
 * members only iterate over the fragment grid and hand each fragment its
 * row / column offset (fragment index times kFragRows / kFragCols).
 */
template <
    typename T,
    short kTileRows_,
    short kTileCols_,
    class NAXFrag_ = BaseNAXFrag>
struct NAXTile {
  using NAXFrag_t = NAXFrag_;
  using elem_type = T;

  // Fragment geometry forwarded from the policy
  STEEL_CONST short kFragRows = NAXFrag_t::kFragRows;
  STEEL_CONST short kFragCols = NAXFrag_t::kFragCols;
  STEEL_CONST short kElemsPerFrag = NAXFrag_t::kElemsPerFrag;

  // Fragment grid
  STEEL_CONST short kTileRows = kTileRows_;
  STEEL_CONST short kTileCols = kTileCols_;

  // Tile extent in elements
  STEEL_CONST short kRows = kTileRows * kFragRows;
  STEEL_CONST short kCols = kTileCols * kFragCols;

  // Storage counts per thread
  STEEL_CONST short kNumFrags = kTileRows * kTileCols;
  STEEL_CONST short kElemsPerTile = kNumFrags * kElemsPerFrag;

  // Per-thread layout inside one fragment
  STEEL_CONST short kFragThrRows = NAXFrag_t::kElemRows;
  STEEL_CONST short kFragThrCols = NAXFrag_t::kElemCols;
  STEEL_CONST short kFragRowsJump = NAXFrag_t::kElemRowsJump;

  // Rows / columns of the whole tile touched by one thread
  STEEL_CONST short kRowsPerThread = kTileRows * NAXFrag_t::kElemRows;
  STEEL_CONST short kColsPerThread = kTileCols * NAXFrag_t::kElemCols;

  using frag_type = typename NAXFrag_t::template dtype_frag_t<T>;

  // Row-major fragment storage; not initialised by the constructor
  frag_type val_frags[kNumFrags];

  METAL_FUNC NAXTile() thread {}

  /** Set every fragment to frag_type(0). */
  METAL_FUNC constexpr void clear() {
    STEEL_PRAGMA_UNROLL
    for (short f = 0; f < kNumFrags; ++f) {
      val_frags[f] = frag_type(0);
    }
  }

  /** Fragment at grid position (i, j), runtime indices. */
  METAL_FUNC constexpr thread frag_type& frag_at(const short i, const short j) {
    return val_frags[i * kTileCols + j];
  }

  /** Fragment at grid position (i, j), runtime indices, read-only. */
  METAL_FUNC constexpr const thread frag_type& frag_at(
      const short i,
      const short j) const {
    return val_frags[i * kTileCols + j];
  }

  /** Fragment at grid position (i, j), compile-time indices. */
  template <int i, int j>
  METAL_FUNC constexpr thread frag_type& frag_at() {
    return val_frags[i * kTileCols + j];
  }

  /** Fragment at grid position (i, j), compile-time indices, read-only. */
  template <int i, int j>
  METAL_FUNC constexpr const thread frag_type& frag_at() const {
    return val_frags[i * kTileCols + j];
  }

  /** Fragment at (i, j), or at (j, i) when the tag requests a transpose. */
  template <bool transpose>
  METAL_FUNC constexpr thread frag_type&
  frag_at(const short i, const short j, metal::bool_constant<transpose>) {
    if constexpr (!transpose) {
      return frag_at(i, j);
    } else {
      return frag_at(j, i);
    }
  }

  /** Read-only counterpart of the tagged accessor. */
  template <bool transpose>
  METAL_FUNC constexpr const thread frag_type&
  frag_at(const short i, const short j, metal::bool_constant<transpose>) const {
    if constexpr (!transpose) {
      return frag_at(i, j);
    } else {
      return frag_at(j, i);
    }
  }

  /** Compile-time indexed accessor with optional index swap. */
  template <int i, int j, bool transpose>
  METAL_FUNC constexpr thread frag_type& frag_at() {
    if constexpr (!transpose) {
      return frag_at<i, j>();
    } else {
      return frag_at<j, i>();
    }
  }

  /** Read-only compile-time indexed accessor with optional index swap. */
  template <int i, int j, bool transpose>
  METAL_FUNC constexpr const thread frag_type& frag_at() const {
    if constexpr (!transpose) {
      return frag_at<i, j>();
    } else {
      return frag_at<j, i>();
    }
  }

  /** Flat view of the fragment storage as scalar elements. */
  METAL_FUNC thread elem_type* elems() {
    return reinterpret_cast<thread elem_type*>(val_frags);
  }

  /** Flat read-only view of the fragment storage as scalar elements. */
  METAL_FUNC const thread elem_type* elems() const {
    return reinterpret_cast<const thread elem_type*>(val_frags);
  }

  /**
   * Fold every fragment's row reduction into vals. Fragment row fr
   * contributes to the kFragThrRows entries starting at
   * fr * kFragThrRows.
   */
  template <typename Op>
  METAL_FUNC void row_reduce(thread metal::vec<T, kRowsPerThread>& vals) const {
    thread T* const per_row = reinterpret_cast<thread T*>(&vals);
    STEEL_PRAGMA_UNROLL
    for (short fr = 0; fr < kTileRows; ++fr) {
      STEEL_PRAGMA_UNROLL
      for (short fc = 0; fc < kTileCols; ++fc) {
        NAXFrag_t::template row_reduce<Op>(
            frag_at(fr, fc), &per_row[fr * kFragThrRows]);
      }
    }
  }

  /**
   * Apply Op in place between every element and the entry of vals that
   * belongs to its row.
   */
  template <typename Op>
  METAL_FUNC void row_bin_op(thread metal::vec<T, kRowsPerThread>& vals) {
    thread T* const per_row = reinterpret_cast<thread T*>(&vals);
    STEEL_PRAGMA_UNROLL
    for (short fr = 0; fr < kTileRows; ++fr) {
      STEEL_PRAGMA_UNROLL
      for (short fc = 0; fc < kTileCols; ++fc) {
        NAXFrag_t::template row_bin_op<Op>(
            frag_at(fr, fc), &per_row[fr * kFragThrRows]);
      }
    }
  }

  /** Load the tile from threadgroup memory with compile-time strides. */
  template <typename U, int str_x, int str_y>
  METAL_FUNC void load(const threadgroup U* src) {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::load(
            frag_at<tr.value, tc.value>(),
            src,
            Int<str_x>{},
            Int<str_y>{},
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /** Store the tile to threadgroup memory with compile-time strides. */
  template <typename U, int str_x, int str_y>
  METAL_FUNC void store(threadgroup U* dst) const {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::store(
            frag_at<tr.value, tc.value>(),
            dst,
            Int<str_x>{},
            Int<str_y>{},
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /** Load the tile from device memory; ld is the row stride. */
  template <typename U>
  METAL_FUNC void load(const device U* src, const int ld) {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::load(
            frag_at<tr.value, tc.value>(),
            src,
            ld,
            Int<1>{},
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /** Store the tile to device memory; ld is the row stride. */
  template <typename U>
  METAL_FUNC void store(device U* dst, const int ld) const {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::store(
            frag_at<tr.value, tc.value>(),
            dst,
            ld,
            Int<1>{},
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /** Device load with a row limit; n_rows is forwarded as the limit. */
  template <typename U>
  METAL_FUNC void
  load_rows(const device U* src, const int ld, const short n_rows) {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::load_rows(
            frag_at<tr.value, tc.value>(),
            src,
            ld,
            Int<1>{},
            n_rows,
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /**
   * Device load with row and column limits; src_tile_dims.y is forwarded
   * as the row limit and src_tile_dims.x as the column limit.
   */
  template <typename U>
  METAL_FUNC void
  load_safe(const device U* src, const int ld, const short2 src_tile_dims) {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::load_safe(
            frag_at<tr.value, tc.value>(),
            src,
            ld,
            Int<1>{},
            src_tile_dims.y,
            src_tile_dims.x,
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /** Device store with a row limit; n_rows is forwarded as the limit. */
  template <typename U>
  METAL_FUNC void store_rows(device U* dst, const int ld, const short n_rows)
      const {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::store_rows(
            frag_at<tr.value, tc.value>(),
            dst,
            ld,
            Int<1>{},
            n_rows,
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /**
   * Device store with row and column limits; dst_tile_dims.y is forwarded
   * as the row limit and dst_tile_dims.x as the column limit.
   */
  template <typename U>
  METAL_FUNC void
  store_safe(device U* dst, const int ld, const short2 dst_tile_dims) const {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::store_safe(
            frag_at<tr.value, tc.value>(),
            dst,
            ld,
            Int<1>{},
            dst_tile_dims.y,
            dst_tile_dims.x,
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }

  /**
   * Device store restricted to a window: start.y / stop.y are forwarded as
   * the row bounds and start.x / stop.x as the column bounds.
   */
  template <typename U>
  METAL_FUNC void store_slice(
      device U* dst,
      const int ld,
      const short2 start,
      const short2 stop) const {
    const_for_loop<0, kTileRows, 1>([&](auto tr) {
      const_for_loop<0, kTileCols, 1>([&](auto tc) {
        NAXFrag_t::store_slice(
            frag_at<tr.value, tc.value>(),
            dst,
            ld,
            Int<1>{},
            start.y,
            stop.y,
            start.x,
            stop.x,
            tr * Int<kFragRows>{},
            tc * Int<kFragCols>{});
      });
    });
  }
};

/// Tile-level multiply-accumulate: C += op(A) * op(B), issued fragment by
/// fragment through CTile::NAXFrag_t::mma.
///
/// The transpose flags are passed as tag arguments and decide which extent of
/// A / B is treated as M, N and K; mismatching extents fail to compile.
///
/// Two fragment pairings are implemented:
///  - TN == 1 and TM even: two vertically adjacent C fragments per mma call.
///  - TN even: two horizontally adjacent C fragments per mma call.
///
/// NOTE(review): when neither condition holds (e.g. TN odd and > 1, or
/// TN == 1 with TM odd) no branch is taken and C is left untouched.
template <
    class CTile,
    class ATile,
    class BTile,
    bool transpose_a,
    bool transpose_b>
METAL_FUNC void tile_matmad_nax(
    thread CTile& C,
    thread ATile& A,
    metal::bool_constant<transpose_a>,
    thread BTile& B,
    metal::bool_constant<transpose_b>) {
  // Fragment-grid extents of the accumulator tile
  constexpr short TM = CTile::kTileRows;
  constexpr short TN = CTile::kTileCols;

  // Extents implied by the operands once the transpose flags are applied
  constexpr short TMa = transpose_a ? ATile::kTileCols : ATile::kTileRows;
  constexpr short TNb = transpose_b ? BTile::kTileRows : BTile::kTileCols;
  constexpr short TKa = transpose_a ? ATile::kTileRows : ATile::kTileCols;
  constexpr short TK = transpose_b ? BTile::kTileCols : BTile::kTileRows;

  // Static checks
  static_assert(TMa == TM, "MXU tile matmul: M dimensions do not match");
  static_assert(TNb == TN, "MXU tile matmul: N dimensions do not match");
  static_assert(TKa == TK, "MXU tile matmul: K dimensions do not match");

  // Tag values reused for every fragment lookup and mma call
  constexpr auto ta = metal::bool_constant<transpose_a>{};
  constexpr auto tb = metal::bool_constant<transpose_b>{};

  if constexpr (TN == 1 && TM % 2 == 0) {
    // Single fragment column: pair fragment rows (row, row + 1)
    STEEL_PRAGMA_UNROLL
    for (short row = 0; row < TM; row += 2) {
      STEEL_PRAGMA_UNROLL
      for (short col = 0; col < TN; ++col) {
        STEEL_PRAGMA_UNROLL
        for (short depth = 0; depth < TK; ++depth) {
          CTile::NAXFrag_t::mma(
              C.frag_at(row, col),
              C.frag_at(row + 1, col),
              A.frag_at(row, depth, ta),
              A.frag_at(row + 1, depth, ta),
              ta,
              B.frag_at(depth, col, tb),
              tb);
        }
      }
    }
  } else if constexpr (TN % 2 == 0) {
    // Even number of fragment columns: pair columns (col, col + 1)
    STEEL_PRAGMA_UNROLL
    for (short row = 0; row < TM; ++row) {
      STEEL_PRAGMA_UNROLL
      for (short col = 0; col < TN; col += 2) {
        STEEL_PRAGMA_UNROLL
        for (short depth = 0; depth < TK; ++depth) {
          CTile::NAXFrag_t::mma(
              C.frag_at(row, col),
              C.frag_at(row, col + 1),
              A.frag_at(row, depth, ta),
              ta,
              B.frag_at(depth, col, tb),
              B.frag_at(depth, col + 1, tb),
              tb);
        }
      }
    }
  }
}

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/attn/params.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

///////////////////////////////////////////////////////////////////////////////
// Attn param classes
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/// Parameter block for the steel attention kernels.
///
/// Holds the problem sizes, the query/key block decomposition and the
/// per-tensor strides. Each stride array has three entries; the innermost
/// (D) stride is taken to be 1, as noted on the fields.
/// NOTE(review): presumably filled in by host code and read by the kernel, so
/// field order and types should be treated as a fixed layout - confirm
/// against the host-side encoder before changing anything here.
struct AttnParams {
  int B; ///< Batch Size
  int H; ///< Heads
  int D; ///< Head Dim

  int qL; ///< Query Sequence Length
  int kL; ///< Key Sequence Length

  int gqa_factor; ///< Group Query factor
  float scale; ///< Attention scale

  int NQ; ///< Number of query blocks
  int NK; ///< Number of key/value blocks

  int NQ_aligned; ///< Number of full query blocks
  int NK_aligned; ///< Number of full key/value blocks

  int qL_rem; ///< Remainder in last query block
  int kL_rem; ///< Remainder in last key/value block
  int qL_off; ///< Offset in query sequence start

  int64_t Q_strides[3]; ///< Query  strides (B, H, L, D = 1)
  int64_t K_strides[3]; ///< Key    strides (B, H, L, D = 1)
  int64_t V_strides[3]; ///< Value  strides (B, H, L, D = 1)
  int64_t O_strides[3]; ///< Output strides (B, H, L, D = 1)
};

/// Parameter block describing the attention mask tensor.
///
/// Only strides are carried; the innermost (kL) stride is taken to be 1.
struct AttnMaskParams {
  int64_t M_strides[3]; ///< Mask  strides (B, H, qL, kL = 1)
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/attn/transforms.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/utils.h"

///////////////////////////////////////////////////////////////////////////////
// Transforms and Epilogues
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/// Epilogue that only converts the input value to the output type.
template <typename OutT, typename InT>
struct TransformNone {
  /// Convert `x` from InT to OutT.
  static METAL_FUNC OutT apply(InT x) {
    const OutT converted = static_cast<OutT>(x);
    return converted;
  }

  /// Two-operand form kept for interface parity with the other transforms;
  /// the second operand is ignored.
  static METAL_FUNC OutT apply(InT x, OutT) {
    return apply(x);
  }
};

/// Epilogue that converts the input value and adds a second operand to it.
template <typename OutT, typename InT>
struct TransformAdd {
  /// Accepts two floats for signature parity with TransformAxpby; both are
  /// unused.
  TransformAdd(const float, const float) {}

  /// Convert `x` from InT to OutT.
  static METAL_FUNC OutT apply(InT x) {
    const OutT converted = static_cast<OutT>(x);
    return converted;
  }

  /// Convert `x` to OutT, then add `c`.
  static METAL_FUNC OutT apply(InT x, OutT c) {
    return apply(x) + c;
  }
};

/// Epilogue computing `alpha * x + beta * c`, converted to the output type.
template <typename OutT, typename InT>
struct TransformAxpby {
  const float alpha; // Scale applied to the input value x
  const float beta; // Scale applied to the second operand c

  /// Store the two scaling factors.
  TransformAxpby(const float alpha_, const float beta_)
      : alpha(alpha_), beta(beta_) {}

  /// Single-operand form: plain conversion, alpha and beta are not applied.
  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  /// Two-operand form: x * alpha + beta * c, evaluated as a single expression
  /// and converted to OutT once at the end.
  METAL_FUNC OutT apply(InT x, OutT c) const {
    return static_cast<OutT>(x * alpha + (beta * c));
  }
};

/// Maps an element type T to its accumulator type.
/// The primary template uses float for every T.
template <typename T>
struct AccumHelper {
  using accum_type = float;
};

/// Remaps threadgroup grid coordinates using a power-of-two swizzle.
struct BlockSwizzle {
  /// Return the swizzled (x, y) position for threadgroup `tid`.
  ///
  /// x is `tid.x` divided by 2^swizzle_log; y is `tid.y` multiplied by
  /// 2^swizzle_log plus the low `swizzle_log` bits of `tid.x`.
  static METAL_FUNC int2
  swizzle(uint3 tid [[threadgroup_position_in_grid]], const int swizzle_log) {
    // Mask selecting the low `swizzle_log` bits of tid.x
    const int low_bits_mask = (1 << swizzle_log) - 1;

    const int swizzled_y = (tid.y << swizzle_log) + (tid.x & low_bits_mask);
    const int swizzled_x = tid.x >> swizzle_log;

    return int2(swizzled_x, swizzled_y);
  }
};

} // namespace steel
} // namespace mlx

================================================
FILE: mlx/backend/metal/kernels/steel/conv/conv.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/defines.h"
#include "mlx/backend/metal/kernels/steel/utils.h"

#include "mlx/backend/metal/kernels/steel/conv/loader.h"
#include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/kernels/steel/gemm/mma.h"

using namespace metal;
using namespace mlx::steel;


================================================
FILE: mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h
================================================
// Copyright © 2024 Apple Inc.

#include <metal_stdlib>

using namespace metal;

template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    int N_CHANNELS = 0,
    bool SMALL_FILTER = false>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
implicit_gemm_conv_2d(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* C [[buffer(2)]],
    const constant MLXConvParams<2>* params [[buffer(3)]],
    const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using namespace mlx::steel;

  (void)lid;

  constexpr bool transpose_a = false;
  constexpr bool transpose_b = true;
  constexpr short tgp_padding_a = 16 / sizeof(T);
  constexpr short tgp_padding_b = 16 / sizeof(T);

  constexpr short shape_a_cols = (transpose_a ? BM : BK) + tgp_padding_a;
  constexpr short shape_b_cols = (transpose_b ? BK : BN) + tgp_padding_b;
  constexpr short shape_a_rows = (transpose_a ? BK : BM);
  constexpr short shape_b_rows = (transpose_b ? BN : BK);
  constexpr short tgp_mem_size_a = shape_a_cols * shape_a_rows;
  constexpr short tgp_mem_size_b = shape_b_cols * shape_b_rows;

  constexpr short tgp_size = WM * WN * 32;

  // Input loader

  using loader_a_t = typename metal::conditional_t<
      // Check for small channel specialization
      N_CHANNELS != 0 && N_CHANNELS <= 4,

      // Go to small channel specialization
      Conv2DInputBlockLoaderSmallChannels<
          T,
          BM,
          BN,
          BK,
          tgp_size,
          N_CHANNELS,
          tgp_padding_a>,

      // Else go to general loader
      typename metal::conditional_t<
          // Check if filter size is small enough
          SMALL_FILTER,

          // Go to small filter specialization
          Conv2DInputBlockLoaderSmallFilter<
              T,
              BM,
              BN,
              BK,
              tgp_size,
              tgp_padding_a>,

          // Else go to large filter generalization
          Conv2DInputBlockLoaderLargeFilter<
              T,
              BM,
              BN,
              BK,
              tgp_size,
              tgp_padding_a>>>;

  // Weight loader
  using loader_b_t = typename metal::conditional_t<
      // Check for small channel specialization
      N_CHANNELS != 0 && N_CHANNELS <= 4,

      // Go to small channel specialization
      Conv2DWeightBlockLoaderSmallChannels<
          T,
          BM,
          BN,
          BK,
          tgp_size,
          N_CHANNELS,
          tgp_padding_b>,

      // Else go to general loader
      Conv2DWeightBlockLoader<T, BM, BN, BK, tgp_size, tgp_padding_b>>;

  using mma_t = BlockMMA<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      shape_a_cols,
      shape_b_cols>;

  threadgroup T As[tgp_mem_size_a];
  threadgroup T Bs[tgp_mem_size_b];

  const int tid_y = ((tid.y) << gemm_params->swizzle_log) +
      ((tid.x) & ((1 << gemm_params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> gemm_params->swizzle_log;

  if (gemm_params->tiles_n <= tid_x || gemm_params->tiles_m <= tid_y) {
    return;
  }

  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const int K = gemm_params->K;
  const int N = gemm_params->N;
  const int C_per_group = params->C / params->groups;

  // Groups
  A += tid.z * C_per_group;
  B += tid.z * N * K;
  C += tid.z * N;

  B += c_col * K;
  C += c_row * (N * params->groups) + c_col;

  const int2 offsets_a(0, c_row);
  const int2 offsets_b(0, c_col);

  // Prepare threadgroup loading operations
  loader_a_t loader_a(
      A, As, offsets_a, params, gemm_params, simd_gid, simd_lid);
  loader_b_t loader_b(
      B, Bs, offsets_b, params, gemm_params, simd_gid, simd_lid);

  // Prepare threadgroup mma operation
  mma_t mma_op(simd_gid, simd_lid);

  int gemm_k_iterations = gemm_params->gemm_k_iterations;
  for (int k = 0; k < gemm_k_iterations; k++) {
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Load elements into threadgroup
    loader_a.load_unsafe();
    loader_b.load_unsafe();

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Multiply and accumulate threadgroup elements
    mma_op.mma(As, Bs);

    // Prepare for next iteration
    loader_a.next();
    loader_b.next();
  }

  threadgroup_barrier(mem_flags::mem_none);

  // Store results to device memory
  short tgp_bm = min(BM, gemm_params->M - c_row);
  short tgp_bn = min(BN, gemm_params->N - c_col);
  const int ldc = N * params->groups;
  mma_op.store_result_safe(C, ldc, short2(tgp_bn, tgp_bm));
}


================================================
FILE: mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_stdlib>

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/mma.h"
#include "mlx/backend/metal/kernels/steel/conv/conv.h"
#include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h"

// Explicitly instantiate implicit_gemm_conv_2d for one configuration.
// The host-visible kernel name is assembled from the stringified arguments:
//   implicit_gemm_conv_2d_<name>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>
//       _channel_<channel_name>_filter_<filter_name>
// n_channels and small_filter are forwarded as the N_CHANNELS and
// SMALL_FILTER template arguments.
#define instantiate_implicit_conv_2d(                                          \
    name,                                                                      \
    itype,                                                                     \
    bm,                                                                        \
    bn,                                                                        \
    bk,                                                                        \
    wm,                                                                        \
    wn,                                                                        \
    channel_name,                                                              \
    n_channels,                                                                \
    filter_name,                                                               \
    small_filter)                                                              \
  template [[host_name("implicit_gemm_conv_2d_" #name "_bm" #bm "_bn" #bn      \
                       "_bk" #bk "_wm" #wm "_wn" #wn "_channel_" #channel_name \
                       "_filter_" #filter_name)]] [[kernel]] void              \
  implicit_gemm_conv_2d<itype, bm, bn, bk, wm, wn, n_channels, small_filter>(  \
      const device itype* A [[buffer(0)]],                                     \
      const device itype* B [[buffer(1)]],                                     \
      device itype* C [[buffer(2)]],                                           \
      const constant MLXConvParams<2>* params [[buffer(3)]],                   \
      const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]],      \
      uint3 tid [[threadgroup_position_in_grid]],                              \
      uint3 lid [[thread_position_in_threadgroup]],                            \
      uint simd_gid [[simdgroup_index_in_threadgroup]],                        \
      uint simd_lid [[thread_index_in_simdgroup]]);

// For one tile configuration, instantiate the six channel / filter variants:
// general channels ("l", N_CHANNELS = 0) with the small ("s") and large ("l")
// filter loaders, plus the 1-, 2-, 3- and 4-channel specializations with the
// large filter setting.
#define instantiate_implicit_2d_filter(name, itype, bm, bn, bk, wm, wn)           \
    instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn, l, 0, s, true)  \
    instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn, l, 0, l, false) \
    instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn, 1, 1, l, false) \
    instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn, 2, 2, l, false) \
    instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn, 3, 3, l, false) \
    instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn, 4, 4, l, false)

// For one element type, instantiate every supported (bm, bn, bk, wm, wn)
// tile configuration.
#define instantiate_implicit_2d_blocks(name, itype)               \
    instantiate_implicit_2d_filter(name, itype, 32,  8, 16, 4, 1) \
    instantiate_implicit_2d_filter(name, itype, 64,  8, 16, 4, 1) \
    instantiate_implicit_2d_filter(name, itype, 32, 32, 16, 2, 2) \
    instantiate_implicit_2d_filter(name, itype, 32, 64, 16, 2, 2) \
    instantiate_implicit_2d_filter(name, itype, 64, 32, 16, 2, 2) \
    instantiate_implicit_2d_filter(name, itype, 64, 64, 16, 2, 2)

// Kernels are emitted for float, half and bfloat16_t.
instantiate_implicit_2d_blocks(float32, float);
instantiate_implicit_2d_blocks(float16, half);
instantiate_implicit_2d_blocks(bfloat16, bfloat16_t); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_3d.h
================================================
// Copyright © 2024 Apple Inc.

#include <metal_stdlib>

using namespace metal;

template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool SMALL_FILTER = false>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
implicit_gemm_conv_3d(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* C [[buffer(2)]],
    const constant MLXConvParams<3>* params [[buffer(3)]],
    const constant ImplicitGemmConv3DParams* gemm_params [[buffer(4)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using namespace mlx::steel;

  (void)lid;

  constexpr bool transpose_a = false;
  constexpr bool transpose_b = true;
  constexpr short tgp_padding_a = 16 / sizeof(T);
  constexpr short tgp_padding_b = 16 / sizeof(T);

  constexpr short shape_a_cols = (transpose_a ? BM : BK) + tgp_padding_a;
  constexpr short shape_b_cols = (transpose_b ? BK : BN) + tgp_padding_b;
  constexpr short shape_a_rows = (transpose_a ? BK : BM);
  constexpr short shape_b_rows = (transpose_b ? BN : BK);
  constexpr short tgp_mem_size_a = shape_a_cols * shape_a_rows;
  constexpr short tgp_mem_size_b = shape_b_cols * shape_b_rows;

  constexpr short tgp_size = WM * WN * 32;

  // Input loader
  using loader_a_t = typename metal::conditional_t<
      // If the filter is small we can precompute masks for bounds checking
      SMALL_FILTER,
      Conv3DInputBlockLoaderSmallFilter<T, BM, BN, BK, tgp_size, tgp_padding_a>,
      Conv3DInputBlockLoaderLargeFilter<
          T,
          BM,
          BN,
          BK,
          tgp_size,
          tgp_padding_a>>;

  // Weight loader
  using loader_b_t =
      Conv3DWeightBlockLoader<T, BM, BN, BK, tgp_size, tgp_padding_b>;

  using mma_t = BlockMMA<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      shape_a_cols,
      shape_b_cols>;

  threadgroup T As[tgp_mem_size_a];
  threadgroup T Bs[tgp_mem_size_b];

  const int tid_y = ((tid.y) << gemm_params->swizzle_log) +
      ((tid.x) & ((1 << gemm_params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> gemm_params->swizzle_log;

  if (gemm_params->tiles_n <= tid_x || gemm_params->tiles_m <= tid_y) {
    return;
  }

  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const int K = gemm_params->K;
  const int N = gemm_params->N;
  const int C_per_group = params->C / params->groups;

  // Groups
  A += tid.z * C_per_group;
  B += tid.z * N * K;
  C += tid.z * N;

  B += c_col * K;
  C += c_row * (N * params->groups) + c_col;

  const int2 offsets_a(0, c_row);
  const int2 offsets_b(0, c_col);

  // Prepare threadgroup loading operations
  loader_a_t loader_a(
      A, As, offsets_a, params, gemm_params, simd_gid, simd_lid);
  loader_b_t loader_b(
      B, Bs, offsets_b, params, gemm_params, simd_gid, simd_lid);

  // Prepare threadgroup mma operation
  mma_t mma_op(simd_gid, simd_lid);

  int gemm_k_iterations = gemm_params->gemm_k_iterations;
  for (int k = 0; k < gemm_k_iterations; k++) {
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Load elements into threadgroup
    loader_a.load_unsafe();
    loader_b.load_unsafe();

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Multiply and accumulate threadgroup elements
    mma_op.mma(As, Bs);

    // Prepare for next iteration
    loader_a.next();
    loader_b.next();
  }

  threadgroup_barrier(mem_flags::mem_none);

  // Store results to device memory
  short tgp_bm = min(BM, gemm_params->M - c_row);
  short tgp_bn = min(BN, gemm_params->N - c_col);
  const int ldc = N * params->groups;
  mma_op.store_result_safe(C, ldc, short2(tgp_bn, tgp_bm));
}


================================================
FILE: mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_3d.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_stdlib>

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/mma.h"
#include "mlx/backend/metal/kernels/steel/conv/conv.h"
#include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_3d.h"

// Instantiate implicit_gemm_conv_3d for one configuration through the
// instantiate_kernel helper (defined outside this file). The kernel name is
// assembled from the stringified arguments:
//   implicit_gemm_conv_3d_<name>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>_filter_<fn>
// `f` is forwarded as the SMALL_FILTER template argument.
#define instantiate_implicit_conv_3d(                     \
    name,                                                 \
    itype,                                                \
    bm,                                                   \
    bn,                                                   \
    bk,                                                   \
    wm,                                                   \
    wn,                                                   \
    fn,                                                   \
    f)                                                    \
  instantiate_kernel(                                     \
      "implicit_gemm_conv_3d_" #name "_bm" #bm "_bn" #bn  \
          "_bk" #bk "_wm" #wm "_wn" #wn "_filter_" #fn,   \
      implicit_gemm_conv_3d,                              \
      itype,                                              \
      bm,                                                 \
      bn,                                                 \
      bk,                                                 \
      wm,                                                 \
      wn,                                                 \
      f)

// For one tile configuration, instantiate the small ("s") and large ("l")
// filter variants.
#define instantiate_implicit_conv_3d_filter(name, itype, bm, bn, bk, wm, wn)  \
    instantiate_implicit_conv_3d(name, itype, bm, bn, bk, wm, wn, s, true)    \
    instantiate_implicit_conv_3d(name, itype, bm, bn, bk, wm, wn, l, false)

// For one element type, instantiate every supported (bm, bn, bk, wm, wn)
// tile configuration.
#define instantiate_implicit_3d_blocks(name, itype)                       \
    instantiate_implicit_conv_3d_filter(name, itype, 32,  8, 16, 4, 1)    \
    instantiate_implicit_conv_3d_filter(name, itype, 64,  8, 16, 4, 1)    \
    instantiate_implicit_conv_3d_filter(name, itype, 32, 32, 16, 2, 2)    \
    instantiate_implicit_conv_3d_filter(name, itype, 32, 64, 16, 2, 2)    \
    instantiate_implicit_conv_3d_filter(name, itype, 64, 32, 16, 2, 2)    \
    instantiate_implicit_conv_3d_filter(name, itype, 64, 64, 16, 2, 2)

// Kernels are emitted for float, half and bfloat16_t.
instantiate_implicit_3d_blocks(float32, float);
instantiate_implicit_3d_blocks(float16, half);
instantiate_implicit_3d_blocks(bfloat16, bfloat16_t); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h"

constant bool align_C [[function_constant(200)]];

template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    typename AccumType = float,
    typename Epilogue = TransformNone<T, AccumType>>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
implicit_gemm_conv_2d_general(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* C [[buffer(2)]],
    const constant MLXConvParams<2>* params [[buffer(3)]],
    const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]],
    const constant Conv2DGeneralJumpParams* jump_params [[buffer(5)]],
    const constant Conv2DGeneralBaseInfo* base_h [[buffer(6)]],
    const constant Conv2DGeneralBaseInfo* base_w [[buffer(7)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  constexpr bool transpose_a = false;
  constexpr bool transpose_b = true;
  constexpr short tgp_padding_a = 16 / sizeof(T);
  constexpr short tgp_padding_b = 16 / sizeof(T);

  constexpr short shape_a_cols = (transpose_a ? BM : BK) + tgp_padding_a;
  constexpr short shape_b_cols = (transpose_b ? BK : BN) + tgp_padding_b;
  constexpr short shape_a_rows = (transpose_a ? BK : BM);
  constexpr short shape_b_rows = (transpose_b ? BN : BK);
  constexpr short tgp_mem_size_a = shape_a_cols * shape_a_rows;
  constexpr short tgp_mem_size_b = shape_b_cols * shape_b_rows;

  constexpr short tgp_size = WM * WN * 32;

  // Input loader
  using loader_a_t =
      Conv2DInputBlockLoaderGeneral<T, BM, BN, BK, tgp_size, tgp_padding_a>;

  // Weight loader
  using loader_b_t =
      Conv2DWeightBlockLoaderGeneral<T, BM, BN, BK, tgp_size, tgp_padding_b>;

  using mma_t = BlockMMA<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      shape_a_cols,
      shape_b_cols>;

  threadgroup T As[tgp_mem_size_a];
  threadgroup T Bs[tgp_mem_size_b];

  const int tid_y = ((tid.y) << gemm_params->swizzle_log) +
      ((tid.x) & ((1 << gemm_params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> gemm_params->swizzle_log;

  if (gemm_params->tiles_n <= tid_x || gemm_params->tiles_m <= tid_y) {
    return;
  }

  const int tid_z = tid.z;

  const int base_oh = tid_z / jump_params->f_out_jump_w;
  const int base_ow = tid_z % jump_params->f_out_jump_w;

  const int base_wh = base_h[base_oh].weight_base;
  const int base_ww = base_w[base_ow].weight_base;

  const int base_wh_size = base_h[base_oh].weight_size;
  const int base_ww_size = base_w[base_ow].weight_size;

  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const int K = gemm_params->K;

  B += c_col * K;

  const int4 offsets_a(0, c_row, base_oh, base_ow);
  const int2 offsets_b(0, c_col);

  // Prepare threadgroup loading operations
  loader_a_t loader_a(
      A,
      As,
      offsets_a,
      params,
      jump_params,
      base_wh,
      base_ww,
      simd_gid,
      simd_lid);
  loader_b_t loader_b(
      B,
      Bs,
      offsets_b,
      params,
      jump_params,
      base_wh,
      base_ww,
      simd_gid,
      simd_lid);

  // Prepare threadgroup mma operation
  mma_t mma_op(simd_gid, simd_lid);

  if (align_C) {
    int gemm_k_iterations =
        base_wh_size * base_ww_size * gemm_params->gemm_k_iterations;

    for (int k = 0; k < gemm_k_iterations; k++) {
      threadgroup_barrier(mem_flags::mem_threadgroup);
      // Load elements into threadgroup
      loader_a.load_unsafe();
      loader_b.load_unsafe();

      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }
  }

  else {
    for (int k = 1; k < gemm_params->gemm_k_iterations; k++) {
      for (int j = 0; j < base_wh_size * base_ww_size; j++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }
    }
    const short remaining_k = params->C % BK;
    for (int j = 0; j < base_wh_size * base_ww_size; j++) {
      // Load elements into threadgroup
      threadgroup_barrier(mem_flags::mem_threadgroup);
      loader_a.load_safe(remaining_k);
      loader_b.load_safe(remaining_k);
      threadgroup_barrier(mem_flags::mem_threadgroup);
      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);
      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }
  }

  threadgroup_barrier(mem_flags::mem_none);

  // Store results to device memory
  {
    // Adjust for simdgroup and thread location
    int offset_m = c_row + mma_op.sm;
    int offset_n = c_col + mma_op.sn;
    C += offset_n;

    if (offset_n >= gemm_params->N)
      return;

    short diff = gemm_params->N - offset_n;

    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < mma_t::TM; i++) {
      int cm = offset_m + i * mma_t::TM_stride;

      int n = cm / jump_params->adj_out_hw;
      int hw = cm % jump_params->adj_out_hw;
      int oh =
          (hw / jump_params->adj_out_w) * jump_params->f_out_jump_h + base_oh;
      int ow =
          (hw % jump_params->adj_out_w) * jump_params->f_out_jump_w + base_ow;

      if (n < params->N && oh < params->oS[0] && ow < params->oS[1]) {
        int offset_cm = n * params->out_strides[0] +
            oh * params->out_strides[1] + ow * params->out_strides[2];

        STEEL_PRAGMA_UNROLL
        for (int j = 0; j < mma_t::TN; j++) {
          // Get accumulated result and associated offset in C
          thread const auto& accum = mma_op.Ctile.frag_at(i, j);
          int offset = offset_cm + (j * mma_t::TN_stride);

          constexpr short kelems = decltype(mma_op.Ctile)::kElemsPerFrag;

          // Apply epilogue and output C
          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < kelems; k++) {
            if ((j * mma_t::TN_stride + k) < diff) {
              C[offset + k] = Epilogue::apply(accum[k]);
            }
          }
        }
      }
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_stdlib>

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/mma.h"
#include "mlx/backend/metal/kernels/steel/conv/conv.h"
#include "mlx/backend/metal/kernels/steel/conv/params.h"
#include "mlx/backend/metal/kernels/steel/utils.h"
#include "mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h"

using namespace metal;
using namespace mlx::steel;

// Explicitly instantiate implicit_gemm_conv_2d_general for one configuration.
// The host-visible kernel name is assembled from the stringified arguments:
//   implicit_gemm_conv_2d_general_<name>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>
// AccumType and Epilogue are left at their defaults.
#define instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn)         \
  template                                                                    \
      [[host_name("implicit_gemm_conv_2d_general_" #name "_bm" #bm "_bn" #bn  \
                  "_bk" #bk "_wm" #wm "_wn" #wn)]] [[kernel]] void            \
      implicit_gemm_conv_2d_general<itype, bm, bn, bk, wm, wn>(               \
          const device itype* A [[buffer(0)]],                                \
          const device itype* B [[buffer(1)]],                                \
          device itype* C [[buffer(2)]],                                      \
          const constant MLXConvParams<2>* params [[buffer(3)]],              \
          const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]], \
          const constant Conv2DGeneralJumpParams* jump_params [[buffer(5)]],  \
          const constant Conv2DGeneralBaseInfo* base_h [[buffer(6)]],         \
          const constant Conv2DGeneralBaseInfo* base_w [[buffer(7)]],         \
          uint3 tid [[threadgroup_position_in_grid]],                         \
          uint3 lid [[thread_position_in_threadgroup]],                       \
          uint simd_gid [[simdgroup_index_in_threadgroup]],                   \
          uint simd_lid [[thread_index_in_simdgroup]]);

// Pass-through wrapper: the general kernel has a single variant per tile
// configuration.
#define instantiate_implicit_2d_filter(name, itype, bm, bn, bk, wm, wn) \
  instantiate_implicit_conv_2d(name, itype, bm, bn, bk, wm, wn)

// For one element type, instantiate every supported (bm, bn, bk, wm, wn)
// tile configuration.
#define instantiate_implicit_2d_blocks(name, itype)               \
    instantiate_implicit_2d_filter(name, itype, 32,  8, 16, 4, 1) \
    instantiate_implicit_2d_filter(name, itype, 64,  8, 16, 4, 1) \
    instantiate_implicit_2d_filter(name, itype, 32, 32, 16, 2, 2) \
    instantiate_implicit_2d_filter(name, itype, 32, 64, 16, 2, 2) \
    instantiate_implicit_2d_filter(name, itype, 64, 32, 16, 2, 2) \
    instantiate_implicit_2d_filter(name, itype, 64, 64, 16, 2, 2)

// Kernels are emitted for float, half and bfloat16_t.
instantiate_implicit_2d_blocks(float32, float);
instantiate_implicit_2d_blocks(float16, half);
instantiate_implicit_2d_blocks(bfloat16, bfloat16_t); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/conv/loader.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h"
#include "mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h"

================================================
FILE: mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/utils.h"

#include "mlx/backend/metal/kernels/steel/conv/params.h"

///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/**
 * Input-tile loader for implicit-GEMM 2D convolution with no limit on the
 * filter size.
 *
 * Copies a BM x BK tile from device memory into threadgroup memory. Every
 * thread owns `n_rows` tile rows (TROWS apart) and moves `vec_size`
 * consecutive elements per row. The input coordinate of each row is
 * recomputed and bounds-checked on every load; rows that fall outside the
 * input (or past the last batch element) are written as zeros.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv2DInputBlockLoaderLargeFilter {
  // Destination dimensions
  STEEL_CONST short BROWS = BM;
  STEEL_CONST short BCOLS = BK;

  // Read dimensions
  STEEL_CONST short dst_ld = BCOLS + tgp_padding; // row pitch of the tile
  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;

  // Thread read shape
  STEEL_CONST short TCOLS = BCOLS / vec_size; // threads across one tile row
  STEEL_CONST short TROWS = tgp_size / TCOLS; // tile rows covered per pass

  // Rows / strided reads within the block
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Thread location indices
  const short thread_idx;
  const short bi; // first tile row handled by this thread
  const short bj; // first tile column handled by this thread

  // threadgroup and device memory
  threadgroup T* dst;

  const constant MLXConvParams<2>* params;
  const constant ImplicitGemmConv2DParams* gemm_params;

  // Filter tap counters, in iteration order (width advances fastest)
  short weight_h;
  short weight_w;

  // Per-row read cursor into the input
  const device T* src[n_rows];

  // Per-row batch index and input coordinate of filter tap (0, 0)
  int read_n[n_rows];
  int read_ih[n_rows];
  int read_iw[n_rows];

  /**
   * Constructor
   *
   * @param src_          base pointer of the input in device memory
   * @param dst_          base pointer of the tile in threadgroup memory
   * @param offsets       tile offsets; only `offsets.y` (first flattened
   *                      output position of the tile) is used here
   * @param params_       convolution parameters
   * @param gemm_params_  implicit GEMM parameters (source of the pointer jumps)
   * @param simd_group_id simdgroup index within the threadgroup
   * @param simd_lane_id  lane index within the simdgroup (32 lanes assumed)
   */
  METAL_FUNC Conv2DInputBlockLoaderLargeFilter(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<2>* params_,
      const constant ImplicitGemmConv2DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        params(params_),
        gemm_params(gemm_params_),
        weight_h(0),
        weight_w(0) {
    int out_n_pixels = params->oS[0] * params->oS[1];

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < n_rows; ++i) {
      // Decompose the flattened output position into (n, oh, ow)
      int offset_nhw = offsets.y + bi + i * TROWS;
      int n = offset_nhw / out_n_pixels;
      int hw = offset_nhw % out_n_pixels;
      int oh = hw / params->oS[1];
      int ow = hw % params->oS[1];

      // Input coordinate seen by filter tap (0, 0)
      int ih = oh * params->str[0] - params->pad[0];
      int iw = ow * params->str[1] - params->pad[1];

      read_n[i] = n;
      read_ih[i] = ih;
      read_iw[i] = iw;

      // Adjust for flip: the read cursor starts at the last filter tap
      if (params->flip) {
        ih += (params->wS[0] - 1) * params->kdil[0];
        iw += (params->wS[1] - 1) * params->kdil[1];
      }

      // Initial read cursor; it may point out of bounds, load_unsafe() guards
      // every dereference
      src[i] = src_ + n * params->in_strides[0] + ih * params->in_strides[1] +
          iw * params->in_strides[2] + bj;
    }
  }

  /**
   * Load from device memory into threadgroup memory. The tile itself is
   * written unconditionally; only the input coordinate is bounds-checked.
   */
  METAL_FUNC void load_unsafe() const {
    // src[] starts at the last filter tap when the filter is flipped (see the
    // constructor), so the coordinate that gets bounds-checked has to be
    // mirrored the same way. Checking the unmirrored tap would validate one
    // input position while reading another.
    // NOTE(review): relies on the host negating the inp_jump_* values when
    // flip is set, as the flipped start position implies - confirm.
    const int tap_h = params->flip ? params->wS[0] - weight_h - 1 : weight_h;
    const int tap_w = params->flip ? params->wS[1] - weight_w - 1 : weight_w;
    const int offset_h = tap_h * params->kdil[0];
    const int offset_w = tap_w * params->kdil[1];

    STEEL_PRAGMA_UNROLL
    for (short i = 0, is = 0; i < n_rows; ++i, is += TROWS) {
      // Find bounds
      int n = read_n[i];
      int ih = read_ih[i] + offset_h;
      int iw = read_iw[i] + offset_w;

      // Read from input if in bounds
      if ((n < params->N) && (ih >= 0 && ih < params->iS[0]) &&
          (iw >= 0 && iw < params->iS[1])) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < vec_size; ++j) {
          dst[is * dst_ld + j] = src[i][j];
        }
      }

      // Zero pad otherwise
      else {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < vec_size; ++j) {
          dst[is * dst_ld + j] = T(0);
        }
      }
    }
  }

  /**
   * Iteration helper: steps to the next filter tap (width first, then
   * height). Once every tap has been visited both counters wrap to zero and
   * inp_jump_c is applied.
   */
  METAL_FUNC void next() {
    if (++weight_w < params->wS[1]) {
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < n_rows; i++) {
        src[i] += gemm_params->inp_jump_w;
      }

      return;
    }

    weight_w = 0;

    if (++weight_h < params->wS[0]) {
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < n_rows; i++) {
        src[i] += gemm_params->inp_jump_h;
      }

      return;
    }

    weight_h = 0;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < n_rows; i++) {
      src[i] += gemm_params->inp_jump_c;
    }
  }
};

/**
 * Input-tile loader for implicit-GEMM 2D convolution that precomputes
 * per-axis validity bitmasks.
 *
 * For every tile row the constructor records, one bit per filter tap, whether
 * that tap lands inside the input along height and along width. Loading then
 * only tests two bits instead of recomputing coordinates. `mask_t` is a
 * `short`, which bounds the number of taps per axis that can be tracked.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv2DInputBlockLoaderSmallFilter {
  // Tile shape in threadgroup memory
  STEEL_CONST short BROWS = BM;
  STEEL_CONST short BCOLS = BK;

  // Tile row pitch and elements copied per thread per row
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;

  // How the threadgroup is laid over the tile
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Number of tile rows each thread is responsible for
  STEEL_CONST short n_rows = BROWS / TROWS;

  // One bit per filter tap along an axis
  using mask_t = short;

  // Position of this thread within the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Destination in threadgroup memory
  threadgroup T* dst;

  const constant MLXConvParams<2>* params;
  const constant ImplicitGemmConv2DParams* gemm_params;

  // Filter tap counters
  short weight_h;
  short weight_w;

  // Per-row read cursor in device memory
  const device T* src[n_rows];

  // Per-row validity bits, indexed by tap counter
  mask_t mask_h[n_rows];
  mask_t mask_w[n_rows];

  /* Constructor: sets up the read cursors and builds the validity masks */
  METAL_FUNC Conv2DInputBlockLoaderSmallFilter(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<2>* params_,
      const constant ImplicitGemmConv2DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        params(params_),
        gemm_params(gemm_params_),
        weight_h(0),
        weight_w(0) {
    const int pixels_per_image = params->oS[0] * params->oS[1];

    // Batch index and input coordinate of tap (0, 0) for each row
    int base_n[n_rows];
    int base_ih[n_rows];
    int base_iw[n_rows];

    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < n_rows; ++r) {
      // Flattened output position -> (n, oh, ow)
      const int flat_idx = offsets.y + bi + r * TROWS;
      const int n = flat_idx / pixels_per_image;
      const int pix = flat_idx % pixels_per_image;
      const int oh = pix / params->oS[1];
      const int ow = pix % params->oS[1];

      base_n[r] = n;
      base_ih[r] = oh * params->str[0] - params->pad[0];
      base_iw[r] = ow * params->str[1] - params->pad[1];

      // With a flipped filter the cursor starts at the last tap
      int start_ih = base_ih[r];
      int start_iw = base_iw[r];
      if (params->flip) {
        start_ih += (params->wS[0] - 1) * params->kdil[0];
        start_iw += (params->wS[1] - 1) * params->kdil[1];
      }

      src[r] = src_ + n * params->in_strides[0] +
          start_ih * params->in_strides[1] + start_iw * params->in_strides[2] +
          bj;

      // Masks start empty and are filled in below
      mask_h[r] = 0;
      mask_w[r] = 0;
    }

    // Height validity; the batch bound is folded into this mask
    for (short kh = 0; kh < params->wS[0]; kh++) {
      const short tap_h = params->flip ? params->wS[0] - kh - 1 : kh;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; ++r) {
        const int ih = base_ih[r] + tap_h * params->kdil[0];

        const bool in_bounds =
            base_n[r] < params->N && ih >= 0 && ih < params->iS[0];

        mask_h[r] |= (in_bounds << kh);
      }
    }

    // Width validity
    for (short kw = 0; kw < params->wS[1]; kw++) {
      const short tap_w = params->flip ? params->wS[1] - kw - 1 : kw;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; ++r) {
        const int iw = base_iw[r] + tap_w * params->kdil[1];

        const bool in_bounds = iw >= 0 && iw < params->iS[1];

        mask_w[r] |= (in_bounds << kw);
      }
    }
  }

  /* Copy the current tap into the tile, zero-filling rows whose mask bit is
   * clear */
  METAL_FUNC void load_unsafe() const {
    const mask_t h_bit = mask_t(1) << weight_h;
    const mask_t w_bit = mask_t(1) << weight_w;

    STEEL_PRAGMA_UNROLL
    for (short r = 0, row = 0; r < n_rows; ++r, row += TROWS) {
      const bool padded = !(mask_h[r] & h_bit) || !(mask_w[r] & w_bit);

      if (padded) {
        // Outside the input: write zeros
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; ++c) {
          dst[row * dst_ld + c] = T(0);
        }
      } else {
        // Inside the input: copy from device memory
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; ++c) {
          dst[row * dst_ld + c] = src[r][c];
        }
      }
    }
  }

  /* Step to the next filter tap: width first, then height, then apply
   * inp_jump_c once all taps are done */
  METAL_FUNC void next() {
    if (++weight_w < params->wS[1]) {
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; r++) {
        src[r] += gemm_params->inp_jump_w;
      }
    } else {
      weight_w = 0;

      if (++weight_h < params->wS[0]) {
        STEEL_PRAGMA_UNROLL
        for (short r = 0; r < n_rows; r++) {
          src[r] += gemm_params->inp_jump_h;
        }
      } else {
        weight_h = 0;

        STEEL_PRAGMA_UNROLL
        for (short r = 0; r < n_rows; r++) {
          src[r] += gemm_params->inp_jump_c;
        }
      }
    }
  }
};

/**
 * Weight-tile loader for implicit-GEMM 2D convolution.
 *
 * Copies a BN x BK tile of weights from device memory into threadgroup
 * memory. Source rows are `wt_strides[0]` elements apart. For the BN == 8
 * tile shape, rows at or beyond `params->O` are zero-filled when the tile
 * does not fit entirely below `gemm_params->N`.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv2DWeightBlockLoader {
  // Tile shape in threadgroup memory
  STEEL_CONST short BROWS = BN;
  STEEL_CONST short BCOLS = BK;

  // Tile row pitch and elements copied per thread per row; the BN == 8 tile
  // copies a single element at a time
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size =
      (BN == 8) ? 1 : (tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4);

  // How the threadgroup is laid over the tile
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Number of tile rows each thread is responsible for
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Distance between consecutive source rows
  const int src_ld;

  // Position of this thread within the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Destination (threadgroup) and source (device) cursors
  threadgroup T* dst;
  const device T* src;

  const constant MLXConvParams<2>* params;

  // Flattened filter tap counter and the pointer step between taps
  int weight_hw;
  int weight_step;

  // First source row of this thread and whether all of its rows are in range
  const int read_n;
  const bool do_read;

  /* Constructor */
  METAL_FUNC Conv2DWeightBlockLoader(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<2>* params_,
      const constant ImplicitGemmConv2DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(params_->wt_strides[0]),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        src(src_ + bi * src_ld + bj),
        params(params_),
        weight_hw(0),
        weight_step(params->C / params->groups),
        read_n(offsets.y + bi),
        do_read(read_n + n_rows * TROWS <= gemm_params_->N) {}

  /* Copy the current tap of the weight tile into threadgroup memory */
  METAL_FUNC void load_unsafe() const {
    // Only the BN == 8 tile can need per-row checks, and only when the tile
    // is not known to be fully in range
    const bool check_rows = (BN == 8) && !do_read;

    if (!check_rows) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BN; row += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; c++) {
          dst[row * dst_ld + c] = src[row * src_ld + c];
        }
      }
    } else {
      for (short row = 0; row < BN; row += TROWS) {
        if ((read_n + row) >= params->O) {
          // Row beyond the last output channel: zero fill
          STEEL_PRAGMA_UNROLL
          for (short c = 0; c < vec_size; c++) {
            dst[row * dst_ld + c] = T(0);
          }
        } else {
          STEEL_PRAGMA_UNROLL
          for (short c = 0; c < vec_size; c++) {
            dst[row * dst_ld + c] = src[row * src_ld + c];
          }
        }
      }
    }
  }

  /* Step to the next filter tap; after the last tap, rewind the taps and
   * move BK elements forward */
  METAL_FUNC void next() {
    const int n_taps = params->wS[1] * params->wS[0];

    if (++weight_hw < n_taps) {
      src += weight_step;
    } else {
      weight_hw = 0;
      src += BK - (n_taps - 1) * weight_step;
    }
  }
};

/**
 * Input-tile loader for implicit-GEMM 3D convolution with no limit on the
 * filter size.
 *
 * Copies a BM x BK tile from device memory into threadgroup memory. The
 * input coordinate of every row is recomputed and bounds-checked on each
 * load; rows outside the input are written as zeros. Flipped filters are
 * handled by starting at the last tap and stepping with negated dilations.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv3DInputBlockLoaderLargeFilter {
  // Tile shape in threadgroup memory
  STEEL_CONST short BROWS = BM;
  STEEL_CONST short BCOLS = BK;

  // Tile row pitch and elements copied per thread per row
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;

  // How the threadgroup is laid over the tile
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Number of tile rows each thread is responsible for
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Position of this thread within the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Destination in threadgroup memory
  threadgroup T* dst;

  const constant MLXConvParams<3>* params;
  const constant ImplicitGemmConv3DParams* gemm_params;

  // Filter tap counters
  short weight_d;
  short weight_h;
  short weight_w;

  // Signed dilation per axis (negative when the filter is flipped)
  short kdil_d;
  short kdil_h;
  short kdil_w;

  // Per-row read cursor in device memory
  const device T* src[n_rows];

  // Per-row batch index and input coordinate of the first tap visited
  int read_n[n_rows];
  int read_id[n_rows];
  int read_ih[n_rows];
  int read_iw[n_rows];

  /* Constructor */
  METAL_FUNC Conv3DInputBlockLoaderLargeFilter(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<3>* params_,
      const constant ImplicitGemmConv3DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        params(params_),
        gemm_params(gemm_params_),
        weight_d(0),
        weight_h(0),
        weight_w(0),
        kdil_d(params_->flip ? -params_->kdil[0] : params_->kdil[0]),
        kdil_h(params_->flip ? -params_->kdil[1] : params_->kdil[1]),
        kdil_w(params_->flip ? -params_->kdil[2] : params_->kdil[2]) {
    const int pixels_per_volume = params->oS[0] * params->oS[1] * params->oS[2];

    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < n_rows; ++r) {
      // Flattened output position -> (n, od, oh, ow)
      const int flat_idx = offsets.y + bi + r * TROWS;
      const int n = flat_idx / pixels_per_volume;
      const int vox = flat_idx % pixels_per_volume;
      const int od = vox / (params->oS[1] * params->oS[2]);
      const int pix = vox % (params->oS[1] * params->oS[2]);
      const int oh = pix / params->oS[2];
      const int ow = pix % params->oS[2];

      int id = od * params->str[0] - params->pad[0];
      int ih = oh * params->str[1] - params->pad[1];
      int iw = ow * params->str[2] - params->pad[2];

      // With a flipped filter, start from the last tap along every axis
      if (params->flip) {
        id += (params->wS[0] - 1) * params->kdil[0];
        ih += (params->wS[1] - 1) * params->kdil[1];
        iw += (params->wS[2] - 1) * params->kdil[2];
      }

      // The bounds check and the read cursor share the same start position
      read_n[r] = n;
      read_id[r] = id;
      read_ih[r] = ih;
      read_iw[r] = iw;

      src[r] = src_ + n * params->in_strides[0] + id * params->in_strides[1] +
          ih * params->in_strides[2] + iw * params->in_strides[3] + bj;
    }
  }

  /* Copy the current tap into the tile, zero-filling rows that fall outside
   * the input */
  METAL_FUNC void load_unsafe() const {
    STEEL_PRAGMA_UNROLL
    for (short r = 0, row = 0; r < n_rows; ++r, row += TROWS) {
      // Input coordinate touched by the current tap
      const int n = read_n[r];
      const int id = read_id[r] + weight_d * kdil_d;
      const int ih = read_ih[r] + weight_h * kdil_h;
      const int iw = read_iw[r] + weight_w * kdil_w;

      const bool inside = (n < params->N) && (id >= 0 && id < params->iS[0]) &&
          (ih >= 0 && ih < params->iS[1]) && (iw >= 0 && iw < params->iS[2]);

      if (!inside) {
        // Outside the input: write zeros
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; ++c) {
          dst[row * dst_ld + c] = T(0);
        }
      } else {
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; ++c) {
          dst[row * dst_ld + c] = src[r][c];
        }
      }
    }
  }

  /* Step to the next filter tap: width, then height, then depth; apply
   * inp_jump_c once all taps are done */
  METAL_FUNC void next() {
    if (++weight_w < params->wS[2]) {
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; r++) {
        src[r] += gemm_params->inp_jump_w;
      }
    } else {
      weight_w = 0;

      if (++weight_h < params->wS[1]) {
        STEEL_PRAGMA_UNROLL
        for (short r = 0; r < n_rows; r++) {
          src[r] += gemm_params->inp_jump_h;
        }
      } else {
        weight_h = 0;

        if (++weight_d < params->wS[0]) {
          STEEL_PRAGMA_UNROLL
          for (short r = 0; r < n_rows; r++) {
            src[r] += gemm_params->inp_jump_d;
          }
        } else {
          weight_d = 0;

          STEEL_PRAGMA_UNROLL
          for (short r = 0; r < n_rows; r++) {
            src[r] += gemm_params->inp_jump_c;
          }
        }
      }
    }
  }
};

/**
 * Input-tile loader for implicit-GEMM 3D convolution that precomputes
 * per-axis validity bitmasks.
 *
 * For every tile row the constructor records, one bit per filter tap, whether
 * that tap lands inside the input along depth, height and width. Loading then
 * tests three bits instead of recomputing coordinates. `mask_t` is a `short`,
 * which bounds the number of taps per axis that can be tracked.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv3DInputBlockLoaderSmallFilter {
  // Tile shape in threadgroup memory
  STEEL_CONST short BROWS = BM;
  STEEL_CONST short BCOLS = BK;

  // Tile row pitch and elements copied per thread per row
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;

  // How the threadgroup is laid over the tile
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Number of tile rows each thread is responsible for
  STEEL_CONST short n_rows = BROWS / TROWS;

  // One bit per filter tap along an axis
  using mask_t = short;

  // Position of this thread within the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Destination in threadgroup memory
  threadgroup T* dst;

  const constant MLXConvParams<3>* params;
  const constant ImplicitGemmConv3DParams* gemm_params;

  // Filter tap counters
  short weight_d;
  short weight_h;
  short weight_w;

  // Per-row read cursor in device memory
  const device T* src[n_rows];

  // Per-row validity bits, indexed by tap counter
  mask_t mask_d[n_rows];
  mask_t mask_h[n_rows];
  mask_t mask_w[n_rows];

  /* Constructor: sets up the read cursors and builds the validity masks */
  METAL_FUNC Conv3DInputBlockLoaderSmallFilter(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<3>* params_,
      const constant ImplicitGemmConv3DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        params(params_),
        gemm_params(gemm_params_),
        weight_d(0),
        weight_h(0),
        weight_w(0) {
    const int pixels_per_volume = params->oS[0] * params->oS[1] * params->oS[2];

    // Batch index and input coordinate of tap (0, 0, 0) for each row
    int base_n[n_rows];
    int base_id[n_rows];
    int base_ih[n_rows];
    int base_iw[n_rows];

    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < n_rows; ++r) {
      // Flattened output position -> (n, od, oh, ow)
      const int flat_idx = offsets.y + bi + r * TROWS;
      const int n = flat_idx / pixels_per_volume;
      const int vox = flat_idx % pixels_per_volume;
      const int od = vox / (params->oS[1] * params->oS[2]);
      const int pix = vox % (params->oS[1] * params->oS[2]);
      const int oh = pix / params->oS[2];
      const int ow = pix % params->oS[2];

      base_n[r] = n;
      base_id[r] = od * params->str[0] - params->pad[0];
      base_ih[r] = oh * params->str[1] - params->pad[1];
      base_iw[r] = ow * params->str[2] - params->pad[2];

      // With a flipped filter the cursor starts at the last tap
      int start_id = base_id[r];
      int start_ih = base_ih[r];
      int start_iw = base_iw[r];
      if (params->flip) {
        start_id += (params->wS[0] - 1) * params->kdil[0];
        start_ih += (params->wS[1] - 1) * params->kdil[1];
        start_iw += (params->wS[2] - 1) * params->kdil[2];
      }

      src[r] = src_ + n * params->in_strides[0] +
          start_id * params->in_strides[1] + start_ih * params->in_strides[2] +
          start_iw * params->in_strides[3] + bj;

      // Masks start empty and are filled in below
      mask_d[r] = 0;
      mask_h[r] = 0;
      mask_w[r] = 0;
    }

    // Depth validity; the batch bound is folded into this mask
    for (short kd = 0; kd < params->wS[0]; kd++) {
      const short tap_d = params->flip ? params->wS[0] - kd - 1 : kd;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; ++r) {
        const int id = base_id[r] + tap_d * params->kdil[0];

        const bool in_bounds =
            base_n[r] < params->N && id >= 0 && id < params->iS[0];

        mask_d[r] |= (in_bounds << kd);
      }
    }

    // Height validity
    for (short kh = 0; kh < params->wS[1]; kh++) {
      const short tap_h = params->flip ? params->wS[1] - kh - 1 : kh;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; ++r) {
        const int ih = base_ih[r] + tap_h * params->kdil[1];

        const bool in_bounds = ih >= 0 && ih < params->iS[1];

        mask_h[r] |= (in_bounds << kh);
      }
    }

    // Width validity
    for (short kw = 0; kw < params->wS[2]; kw++) {
      const short tap_w = params->flip ? params->wS[2] - kw - 1 : kw;
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; ++r) {
        const int iw = base_iw[r] + tap_w * params->kdil[2];

        const bool in_bounds = iw >= 0 && iw < params->iS[2];

        mask_w[r] |= (in_bounds << kw);
      }
    }
  }

  /* Copy the current tap into the tile, zero-filling rows whose mask bit is
   * clear */
  METAL_FUNC void load_unsafe() const {
    const mask_t d_bit = mask_t(1) << weight_d;
    const mask_t h_bit = mask_t(1) << weight_h;
    const mask_t w_bit = mask_t(1) << weight_w;

    STEEL_PRAGMA_UNROLL
    for (short r = 0, row = 0; r < n_rows; ++r, row += TROWS) {
      const bool padded = !(mask_d[r] & d_bit) || !(mask_h[r] & h_bit) ||
          !(mask_w[r] & w_bit);

      if (padded) {
        // Outside the input: write zeros
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; ++c) {
          dst[row * dst_ld + c] = T(0);
        }
      } else {
        // Inside the input: copy from device memory
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; ++c) {
          dst[row * dst_ld + c] = src[r][c];
        }
      }
    }
  }

  /* Step to the next filter tap: width, then height, then depth; apply
   * inp_jump_c once all taps are done */
  METAL_FUNC void next() {
    if (++weight_w < params->wS[2]) {
      STEEL_PRAGMA_UNROLL
      for (short r = 0; r < n_rows; r++) {
        src[r] += gemm_params->inp_jump_w;
      }
    } else {
      weight_w = 0;

      if (++weight_h < params->wS[1]) {
        STEEL_PRAGMA_UNROLL
        for (short r = 0; r < n_rows; r++) {
          src[r] += gemm_params->inp_jump_h;
        }
      } else {
        weight_h = 0;

        if (++weight_d < params->wS[0]) {
          STEEL_PRAGMA_UNROLL
          for (short r = 0; r < n_rows; r++) {
            src[r] += gemm_params->inp_jump_d;
          }
        } else {
          weight_d = 0;

          STEEL_PRAGMA_UNROLL
          for (short r = 0; r < n_rows; r++) {
            src[r] += gemm_params->inp_jump_c;
          }
        }
      }
    }
  }
};

/**
 * Weight-tile loader for implicit-GEMM 3D convolution.
 *
 * Copies a BN x BK tile of weights from device memory into threadgroup
 * memory. Source rows are `wt_strides[0]` elements apart. For the BN == 8
 * tile shape, rows at or beyond `params->O` are zero-filled when the tile
 * does not fit entirely below `gemm_params->N`.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv3DWeightBlockLoader {
  // Tile shape in threadgroup memory
  STEEL_CONST short BROWS = BN;
  STEEL_CONST short BCOLS = BK;

  // Tile row pitch and elements copied per thread per row; the BN == 8 tile
  // copies a single element at a time
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size =
      (BN == 8) ? 1 : (tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4);

  // How the threadgroup is laid over the tile
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Number of tile rows each thread is responsible for
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Distance between consecutive source rows
  const int src_ld;

  // Position of this thread within the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Destination (threadgroup) and source (device) cursors
  threadgroup T* dst;
  const device T* src;

  const constant MLXConvParams<3>* params;

  // Flattened filter tap counter and the pointer step between taps
  int weight_dhw;
  int weight_step;

  // First source row of this thread and whether all of its rows are in range
  const int read_n;
  const bool do_read;

  /* Constructor */
  METAL_FUNC Conv3DWeightBlockLoader(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<3>* params_,
      const constant ImplicitGemmConv3DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(params_->wt_strides[0]),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        src(src_ + bi * src_ld + bj),
        params(params_),
        weight_dhw(0),
        weight_step(params->C / params->groups),
        read_n(offsets.y + bi),
        do_read(read_n + n_rows * TROWS <= gemm_params_->N) {}

  /* Copy the current tap of the weight tile into threadgroup memory */
  METAL_FUNC void load_unsafe() const {
    // Only the BN == 8 tile can need per-row checks, and only when the tile
    // is not known to be fully in range
    const bool check_rows = (BN == 8) && !do_read;

    if (!check_rows) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BN; row += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; c++) {
          dst[row * dst_ld + c] = src[row * src_ld + c];
        }
      }
    } else {
      for (short row = 0; row < BN; row += TROWS) {
        if ((read_n + row) >= params->O) {
          // Row beyond the last output channel: zero fill
          STEEL_PRAGMA_UNROLL
          for (short c = 0; c < vec_size; c++) {
            dst[row * dst_ld + c] = T(0);
          }
        } else {
          STEEL_PRAGMA_UNROLL
          for (short c = 0; c < vec_size; c++) {
            dst[row * dst_ld + c] = src[row * src_ld + c];
          }
        }
      }
    }
  }

  /* Step to the next filter tap; after the last tap, rewind the taps and
   * move BK elements forward */
  METAL_FUNC void next() {
    const int n_taps = params->wS[0] * params->wS[1] * params->wS[2];

    if (++weight_dhw < n_taps) {
      src += weight_step;
    } else {
      weight_dhw = 0;
      src += BK - (n_taps - 1) * weight_step;
    }
  }
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/utils.h"

#include "mlx/backend/metal/kernels/steel/conv/params.h"

///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/**
 * Compile-time read-width table keyed on the channel count.
 *
 *   n_channels  number of channels
 *   vec_size    elements reserved per read (>= n_channels)
 *   excess      vec_size - n_channels, i.e. the padding elements per read
 *
 * The primary template covers every count without a dedicated
 * specialization: 4 elements up to four channels, 8 beyond that.
 */
template <short n_channels_>
struct ChannelHelper {
  STEEL_CONST short n_channels = n_channels_;
  STEEL_CONST short vec_size = n_channels_ > 4 ? 8 : 4;
  STEEL_CONST short excess = vec_size - n_channels_;
};

// Single channel: one element per read, no padding
template <>
struct ChannelHelper<1> {
  STEEL_CONST short n_channels = 1;
  STEEL_CONST short vec_size = 1;
  STEEL_CONST short excess = vec_size - n_channels;
};

// Two channels: two elements per read, no padding
template <>
struct ChannelHelper<2> {
  STEEL_CONST short n_channels = 2;
  STEEL_CONST short vec_size = 2;
  STEEL_CONST short excess = vec_size - n_channels;
};

// Three channels: padded up to four elements per read
template <>
struct ChannelHelper<3> {
  STEEL_CONST short n_channels = 3;
  STEEL_CONST short vec_size = 4;
  STEEL_CONST short excess = vec_size - n_channels;
};

// Four channels: four elements per read, no padding
template <>
struct ChannelHelper<4> {
  STEEL_CONST short n_channels = 4;
  STEEL_CONST short vec_size = 4;
  STEEL_CONST short excess = vec_size - n_channels;
};

/**
 * Input-tile loader for implicit-GEMM 2D convolution with a small,
 * compile-time channel count.
 *
 * Each thread column of the tile is bound to one flattened filter tap
 * (`weight_hw`) rather than to a channel offset. A read copies `n_channels`
 * elements and zero-fills the remaining `vec_size - n_channels` slots. Taps
 * beyond the filter extent, and coordinates outside the input, produce
 * zeros.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short n_channels,
    short tgp_padding = 0>
struct Conv2DInputBlockLoaderSmallChannels {
  // Tile shape in threadgroup memory
  STEEL_CONST short BROWS = BM;
  STEEL_CONST short BCOLS = BK;

  // Tile row pitch and elements reserved per read
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size = ChannelHelper<n_channels>::vec_size;

  // How the threadgroup is laid over the tile
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Number of tile rows each thread is responsible for
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Position of this thread within the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Destination in threadgroup memory
  threadgroup T* dst;

  const constant MLXConvParams<2>* params;
  const constant ImplicitGemmConv2DParams* gemm_params;

  // Flattened filter tap handled by this thread in the current step
  int weight_hw;

  // Per-row pointer to the input position of tap (0, 0)
  const device T* src[n_rows];

  // Per-row batch index and input coordinate of tap (0, 0)
  int read_n[n_rows];
  int read_ih[n_rows];
  int read_iw[n_rows];

  /* Constructor */
  METAL_FUNC Conv2DInputBlockLoaderSmallChannels(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<2>* params_,
      const constant ImplicitGemmConv2DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        params(params_),
        gemm_params(gemm_params_),
        weight_hw(thread_idx % TCOLS) {
    const int pixels_per_image = params->oS[0] * params->oS[1];

    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < n_rows; ++r) {
      // Flattened output position -> (n, oh, ow)
      const int flat_idx = offsets.y + bi + r * TROWS;
      const int n = flat_idx / pixels_per_image;
      const int pix = flat_idx % pixels_per_image;
      const int oh = pix / params->oS[1];
      const int ow = pix % params->oS[1];

      const int ih = oh * params->str[0] - params->pad[0];
      const int iw = ow * params->str[1] - params->pad[1];

      read_n[r] = n;
      read_ih[r] = ih;
      read_iw[r] = iw;

      // Tap (0, 0) position; the per-tap offset is added at load time
      src[r] = src_ + n * params->in_strides[0] + ih * params->in_strides[1] +
          iw * params->in_strides[2];
    }
  }

  /* Copy this thread's filter tap into the tile, zero-filling anything that
   * is past the filter extent or outside the input */
  METAL_FUNC void load_unsafe() const {
    // Tap index past the end of the filter: the whole column is padding
    if (weight_hw >= params->wS[1] * params->wS[0]) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BROWS; row += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; c++) {
          dst[row * dst_ld + c] = T(0);
        }
      }
      return;
    }

    // Split the flattened tap index and mirror it for flipped filters
    const int tap_row = (weight_hw / params->wS[1]);
    const int tap_col = (weight_hw % params->wS[1]);

    const int eff_row = params->flip ? params->wS[0] - tap_row - 1 : tap_row;
    const int eff_col = params->flip ? params->wS[1] - tap_col - 1 : tap_col;

    // Input-space offset of the tap
    const int off_h = eff_row * params->kdil[0];
    const int off_w = eff_col * params->kdil[1];

    STEEL_PRAGMA_UNROLL
    for (short r = 0, row = 0; r < n_rows; ++r, row += TROWS) {
      const int n = read_n[r];
      const int ih = read_ih[r] + off_h;
      const int iw = read_iw[r] + off_w;

      const bool inside = (n < params->N) && (ih >= 0 && ih < params->iS[0]) &&
          (iw >= 0 && iw < params->iS[1]);

      if (!inside) {
        // Outside the input: write zeros
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < vec_size; ++c) {
          dst[row * dst_ld + c] = T(0);
        }
      } else {
        const device T* curr_src = src[r] + off_h * params->in_strides[1] +
            off_w * params->in_strides[2];

        // Real channels first ...
        STEEL_PRAGMA_UNROLL
        for (short c = 0; c < n_channels; ++c) {
          dst[row * dst_ld + c] = curr_src[c];
        }

        // ... then zero the unused slots of the read
        STEEL_PRAGMA_UNROLL
        for (short c = n_channels; c < vec_size; ++c) {
          dst[row * dst_ld + c] = T(0);
        }
      }
    }
  }

  /* Iteration helper: advance by one tile row's worth of filter taps */
  METAL_FUNC void next() {
    weight_hw += TCOLS;
  }
};

/*
 * Loads a BN x BK tile of convolution weights from device memory into
 * threadgroup memory for layers with few input channels. Each thread writes
 * vec_size elements per row for one flattened kernel tap; lanes past
 * n_channels are written as zeros.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short n_channels,
    short tgp_padding = 0>
struct Conv2DWeightBlockLoaderSmallChannels {
  // Threadgroup tile produced by this loader: BN rows by BK columns
  STEEL_CONST short BROWS = BN;
  STEEL_CONST short BCOLS = BK;

  // Row pitch of the tile and number of elements written per kernel tap
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size = ChannelHelper<n_channels>::vec_size;

  // Thread arrangement: TCOLS threads across a row, TROWS rows per pass
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Passes needed to cover all BROWS rows
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Stride between consecutive weight rows in src (wt_strides[0])
  const int src_ld;

  // Position of this thread inside the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Per-thread cursors into threadgroup (dst) and device (src) memory
  threadgroup T* dst;
  const device T* src;

  const constant MLXConvParams<2>* params;

  // Flattened kernel tap loaded on the current step
  int weight_hw;

  // First weight row owned by this thread, and whether a full block of BN
  // rows starting there stays below gemm_params->N
  const int read_n;
  const bool do_read;

  /* Constructor: derives the thread's tile position from its simdgroup and
     lane ids and points dst / src at the thread's first row. */
  METAL_FUNC Conv2DWeightBlockLoaderSmallChannels(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<2>* params_,
      const constant ImplicitGemmConv2DParams* gemm_params_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(params_->wt_strides[0]),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        src(src_ + bi * src_ld),
        params(params_),
        weight_hw(thread_idx % TCOLS),
        read_n(offsets.y + bi),
        do_read(read_n + BN <= gemm_params_->N) {}

  /*
   * Write this thread's part of the weight tile for the current kernel tap.
   *
   * Zeros are written when the thread's first row is at or past params->O or
   * when the tap index is past the end of the kernel. Rows are individually
   * checked against params->O only when BN == 8 and do_read is false; in
   * every other case all rows are read without a per-row check.
   */
  METAL_FUNC void load_unsafe() const {
    // Threads mapped outside the BROWS x BCOLS tile have nothing to write.
    if (bi >= BROWS || bj >= BCOLS) {
      return;
    }

    const bool past_filters = read_n >= params->O;
    const bool past_taps = weight_hw >= params->wS[1] * params->wS[0];

    if (past_filters || past_taps) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BROWS; row += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; lane++) {
          dst[row * dst_ld + lane] = T(0);
        }
      }

      return;
    }

    // Consecutive taps of one weight row are C / groups elements apart.
    const device T* tap_src = src + weight_hw * (params->C / params->groups);

    const bool skip_row_checks = (BN != 8) || do_read;

    if (skip_row_checks) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BROWS; row += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < n_channels; lane++) {
          dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
        }

        STEEL_PRAGMA_UNROLL
        for (short lane = n_channels; lane < vec_size; lane++) {
          dst[row * dst_ld + lane] = T(0);
        }
      }
      return;
    }

    // Ragged last block: rows at or beyond params->O become zeros.
    for (short row = 0; row < BROWS; row += TROWS) {
      if ((read_n + row) >= params->O) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; lane++) {
          dst[row * dst_ld + lane] = T(0);
        }
        continue;
      }

      STEEL_PRAGMA_UNROLL
      for (short lane = 0; lane < n_channels; lane++) {
        dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
      }

      STEEL_PRAGMA_UNROLL
      for (short lane = n_channels; lane < vec_size; lane++) {
        dst[row * dst_ld + lane] = T(0);
      }
    }
  }

  /* Step to the kernel tap this thread loads next (TCOLS taps further). */
  METAL_FUNC void next() {
    weight_hw = weight_hw + TCOLS;
  }
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/defines.h"

///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/*
 * Loads a BM x BK tile of convolution input from device memory into
 * threadgroup memory for the general 2D convolution path (kernel flip,
 * kernel dilation and input dilation are all honoured). Each thread copies
 * vec_size contiguous channels for n_rows output positions per step.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv2DInputBlockLoaderGeneral {
  // Threadgroup tile produced by this loader: BM rows by BK columns
  STEEL_CONST short BROWS = BM;
  STEEL_CONST short BCOLS = BK;

  // Row pitch of the tile and contiguous elements copied per thread.
  // NOTE(review): tgp_size / (BROWS * BCOLS) is an integer division, so it is
  // 0 (and vec_size is 4) whenever BROWS * BCOLS > tgp_size - confirm the
  // ratio is the intended way round.
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;

  // Thread arrangement: TCOLS threads across a row, TROWS rows per pass
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Passes needed to cover all BROWS rows
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Position of this thread inside the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Per-thread cursor into threadgroup memory
  threadgroup T* dst;

  const constant MLXConvParams<2>* params;
  const constant Conv2DGeneralJumpParams* jump_params;

  // Kernel tap the traversal starts from and wraps back to
  const short base_wh;
  const short base_ww;

  // Kernel tap loaded on the current step
  short weight_h;
  short weight_w;

  // Per-row device cursors (batch base plus channel offset)
  const device T* src[n_rows];

  // Per-row batch index and top-left input coordinate (before the tap offset)
  int read_n[n_rows];
  int read_ih[n_rows];
  int read_iw[n_rows];

  /* Constructor: maps each of the thread's rows to a batch index and an
     input coordinate, and points src[i] at that batch's channel slice. */
  METAL_FUNC Conv2DInputBlockLoaderGeneral(
      const device T* src_,
      threadgroup T* dst_,
      const int4 offsets,
      const constant MLXConvParams<2>* params_,
      const constant Conv2DGeneralJumpParams* jump_params_,
      const short base_wh_,
      const short base_ww_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        params(params_),
        jump_params(jump_params_),
        base_wh(base_wh_),
        base_ww(base_ww_),
        weight_h(base_wh_),
        weight_w(base_ww_) {
    STEEL_PRAGMA_UNROLL
    for (short r = 0; r < n_rows; ++r) {
      // Flattened (batch, row, col) index of this row in the adjusted output.
      const int flat_idx = offsets.y + bi + r * TROWS;
      const int batch = flat_idx / jump_params->adj_out_hw;
      const int pos = flat_idx % jump_params->adj_out_hw;

      // Output coordinate: adjusted index scaled by the jump, plus the base.
      const int out_h =
          (pos / jump_params->adj_out_w) * jump_params->f_out_jump_h +
          offsets.z;
      const int out_w =
          (pos % jump_params->adj_out_w) * jump_params->f_out_jump_w +
          offsets.w;

      read_n[r] = batch;
      read_ih[r] = out_h * params->str[0] - params->pad[0];
      read_iw[r] = out_w * params->str[1] - params->pad[1];

      // Spatial offsets are added at load time, once the tap is known.
      src[r] = src_ + batch * params->in_strides[0] + bj;
    }
  }

  /*
   * Copy vec_size channels per row for the current kernel tap. The channel
   * range is not checked; batch and spatial bounds are, and rows that fall
   * outside the input are written as zeros.
   */
  METAL_FUNC void load_unsafe() const {
    // Tap actually applied (mirrored when the kernel is flipped) and its
    // displacement after kernel dilation.
    const int tap_h = params->flip ? params->wS[0] - weight_h - 1 : weight_h;
    const int tap_w = params->flip ? params->wS[1] - weight_w - 1 : weight_w;
    const int off_h = tap_h * params->kdil[0];
    const int off_w = tap_w * params->kdil[1];

    STEEL_PRAGMA_UNROLL
    for (short r = 0, tile_row = 0; r < n_rows; ++r, tile_row += TROWS) {
      const int n = read_n[r];

      // Coordinate in the input-dilated frame, then in the real input.
      const int ih_dil = read_ih[r] + off_h;
      const int iw_dil = read_iw[r] + off_w;
      const int ih = ih_dil / params->idil[0];
      const int iw = iw_dil / params->idil[1];

      const bool inside = (n < params->N) && (ih_dil >= 0) &&
          (ih < params->iS[0]) && (iw_dil >= 0) && (iw < params->iS[1]);

      // Outside the input: zero padding.
      if (!inside) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; ++lane) {
          dst[tile_row * dst_ld + lane] = T(0);
        }
        continue;
      }

      const size_t offset =
          ih * params->in_strides[1] + iw * params->in_strides[2];

      STEEL_PRAGMA_UNROLL
      for (short lane = 0; lane < vec_size; ++lane) {
        dst[tile_row * dst_ld + lane] = (src[r])[offset + lane];
      }
    }
  }

  /*
   * Same as load_unsafe, but additionally limits the channel range: lanes
   * whose tile column (bj + lane) is not below remaining_k are written as
   * zeros instead of being read.
   */
  METAL_FUNC void load_safe(const short remaining_k) const {
    const int tap_h = params->flip ? params->wS[0] - weight_h - 1 : weight_h;
    const int tap_w = params->flip ? params->wS[1] - weight_w - 1 : weight_w;
    const int off_h = tap_h * params->kdil[0];
    const int off_w = tap_w * params->kdil[1];

    // True when the thread's whole vector lies inside the remaining channels.
    const bool whole_vec = (bj + vec_size) <= remaining_k;

    STEEL_PRAGMA_UNROLL
    for (short r = 0, tile_row = 0; r < n_rows; ++r, tile_row += TROWS) {
      const int n = read_n[r];

      const int ih_dil = read_ih[r] + off_h;
      const int iw_dil = read_iw[r] + off_w;
      const int ih = ih_dil / params->idil[0];
      const int iw = iw_dil / params->idil[1];

      const bool inside = (n < params->N) && (ih_dil >= 0) &&
          (ih < params->iS[0]) && (iw_dil >= 0) && (iw < params->iS[1]);

      // Outside the input: zero padding.
      if (!inside) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; ++lane) {
          dst[tile_row * dst_ld + lane] = T(0);
        }
        continue;
      }

      const size_t offset =
          ih * params->in_strides[1] + iw * params->in_strides[2];

      if (whole_vec) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; ++lane) {
          dst[tile_row * dst_ld + lane] = (src[r])[offset + lane];
        }
      } else {
        // Vector straddles the channel limit: check lane by lane.
        for (short lane = 0; lane < vec_size; ++lane) {
          if (bj + lane < remaining_k) {
            dst[tile_row * dst_ld + lane] = (src[r])[offset + lane];
          } else {
            dst[tile_row * dst_ld + lane] = T(0);
          }
        }
      }
    }
  }

  /*
   * Advance to the next kernel tap: width first, then height. Once both have
   * wrapped back to their base values, every row cursor moves on by BK
   * channels.
   */
  METAL_FUNC void next() {
    weight_w += jump_params->f_wgt_jump_w;

    if (weight_w >= params->wS[1]) {
      weight_w = base_ww;
      weight_h += jump_params->f_wgt_jump_h;

      if (weight_h >= params->wS[0]) {
        weight_h = base_wh;

        STEEL_PRAGMA_UNROLL
        for (short r = 0; r < n_rows; r++) {
          src[r] += BK;
        }
      }
    }
  }
};

/*
 * Loads a BN x BK tile of convolution weights from device memory into
 * threadgroup memory for the general 2D convolution path. Each thread copies
 * vec_size contiguous channels per row for the current kernel tap.
 */
template <
    typename T,
    short BM,
    short BN,
    short BK,
    short tgp_size,
    short tgp_padding = 0>
struct Conv2DWeightBlockLoaderGeneral {
  // Threadgroup tile produced by this loader: BN rows by BK columns
  STEEL_CONST short BROWS = BN;
  STEEL_CONST short BCOLS = BK;

  // Row pitch of the tile and contiguous elements copied per thread
  // (a single element when BN == 8)
  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
  STEEL_CONST short vec_size =
      (BN == 8) ? 1 : (tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4);

  // Thread arrangement: TCOLS threads across a row, TROWS rows per pass
  STEEL_CONST short TCOLS = BCOLS / vec_size;
  STEEL_CONST short TROWS = tgp_size / TCOLS;

  // Passes needed to cover all BROWS rows
  STEEL_CONST short n_rows = BROWS / TROWS;

  // Stride between consecutive weight rows in src (wt_strides[0])
  const int src_ld;

  // Position of this thread inside the tile
  const short thread_idx;
  const short bi;
  const short bj;

  // Per-thread cursors into threadgroup (dst) and device (src) memory
  threadgroup T* dst;
  const device T* src;

  const constant MLXConvParams<2>* params;
  const constant Conv2DGeneralJumpParams* jump_params;

  // Kernel tap the traversal starts from and wraps back to
  const short base_wh;
  const short base_ww;

  // Kernel tap loaded on the current step
  short weight_h;
  short weight_w;

  // First weight row owned by this thread
  const int start_row;

  /* Constructor: derives the thread's tile position from its simdgroup and
     lane ids and points dst / src at the thread's first element. */
  METAL_FUNC Conv2DWeightBlockLoaderGeneral(
      const device T* src_,
      threadgroup T* dst_,
      const int2 offsets,
      const constant MLXConvParams<2>* params_,
      const constant Conv2DGeneralJumpParams* jump_params_,
      const short base_wh_,
      const short base_ww_,
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(params_->wt_strides[0]),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        src(src_ + bi * src_ld + bj),
        params(params_),
        jump_params(jump_params_),
        base_wh(base_wh_),
        base_ww(base_ww_),
        weight_h(base_wh_),
        weight_w(base_ww_),
        start_row(offsets.y + bi) {}

  /*
   * Copy vec_size channels per row for the current kernel tap. The channel
   * range is not checked. Rows are checked against params->O only when a
   * full block of BN rows starting at start_row would overrun it; rows past
   * the end are written as zeros.
   */
  METAL_FUNC void load_unsafe() const {
    const device T* tap_src = src + weight_h * params->wt_strides[1] +
        weight_w * params->wt_strides[2];

    const bool all_rows_valid = (start_row + BN) <= params->O;

    if (all_rows_valid) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BN; row += TROWS) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; lane++) {
          dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
        }
      }
      return;
    }

    // Ragged last block: check every row.
    for (short row = 0; row < BN; row += TROWS) {
      if ((start_row + row) >= params->O) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; lane++) {
          dst[row * dst_ld + lane] = T(0);
        }
        continue;
      }

      STEEL_PRAGMA_UNROLL
      for (short lane = 0; lane < vec_size; lane++) {
        dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
      }
    }
  }

  /*
   * Same as load_unsafe, but additionally limits the channel range: lanes
   * whose tile column (bj + lane) is not below remaining_k are written as
   * zeros instead of being read.
   */
  METAL_FUNC void load_safe(const short remaining_k) const {
    const device T* tap_src = src + weight_h * params->wt_strides[1] +
        weight_w * params->wt_strides[2];

    const bool all_rows_valid = (start_row + BN) <= params->O;

    // True when the thread's whole vector lies inside the remaining channels.
    const bool whole_vec = (bj + vec_size) <= remaining_k;

    if (all_rows_valid) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BN; row += TROWS) {
        if (whole_vec) {
          STEEL_PRAGMA_UNROLL
          for (short lane = 0; lane < vec_size; lane++) {
            dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
          }
        } else {
          for (short lane = 0; lane < vec_size; lane++) {
            if (bj + lane < remaining_k) {
              dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
            } else {
              dst[row * dst_ld + lane] = T(0);
            }
          }
        }
      }
      return;
    }

    // Ragged last block: check every row as well as the channel range.
    for (short row = 0; row < BN; row += TROWS) {
      if ((start_row + row) >= params->O) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; lane++) {
          dst[row * dst_ld + lane] = T(0);
        }
        continue;
      }

      if (whole_vec) {
        STEEL_PRAGMA_UNROLL
        for (short lane = 0; lane < vec_size; lane++) {
          dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
        }
      } else {
        for (short lane = 0; lane < vec_size; lane++) {
          if (bj + lane < remaining_k) {
            dst[row * dst_ld + lane] = tap_src[row * src_ld + lane];
          } else {
            dst[row * dst_ld + lane] = T(0);
          }
        }
      }
    }
  }

  /*
   * Advance to the next kernel tap: width first, then height. Once both have
   * wrapped back to their base values, the source cursor moves on by BK
   * channels.
   */
  METAL_FUNC void next() {
    weight_w += jump_params->f_wgt_jump_w;

    if (weight_w >= params->wS[1]) {
      weight_w = base_ww;
      weight_h += jump_params->f_wgt_jump_h;

      if (weight_h >= params->wS[0]) {
        weight_h = base_wh;
        src += BK;
      }
    }
  }
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/conv/params.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

// Shape, stride and hyper-parameter record describing an NDIM-dimensional
// convolution.
template <int NDIM>
struct MLXConvParams {
  int N; // Batch size
  int C; // In channels
  int O; // Out channels
  int iS[NDIM]; // Input spatial dim
  int wS[NDIM]; // Weight spatial dim
  int oS[NDIM]; // Output spatial dim
  int str[NDIM]; // Kernel strides
  int pad[NDIM]; // Input padding
  int kdil[NDIM]; // Kernel dilation
  int idil[NDIM]; // Input dilation
  int64_t in_strides[NDIM + 2]; // In strides
  int64_t wt_strides[NDIM + 2]; // Wt strides
  int64_t out_strides[NDIM + 2]; // Out strides
  int groups; // Input channel groups
  bool flip;

  // Returns a copy of `other` describing the same convolution with `pad_in`
  // extra input channels and `pad_out` extra output channels.
  //
  // The first NDIM + 1 strides of each tensor are divided by the old channel
  // count (integer division) and multiplied by the padded one; the innermost
  // stride of each tensor is set to 1. C and O are then increased by the
  // padding. `other` itself is left untouched.
  static MLXConvParams<NDIM>
  with_padded_channels(MLXConvParams<NDIM> other, int pad_out, int pad_in) {
    MLXConvParams<NDIM> padded = other;

    const int in_channels = padded.C + pad_in;
    const int out_channels = padded.O + pad_out;

    // Rescale every non-innermost stride to the padded channel count.
    for (int d = 0; d <= NDIM; ++d) {
      padded.in_strides[d] = (padded.in_strides[d] / padded.C) * in_channels;
      padded.wt_strides[d] = (padded.wt_strides[d] / padded.C) * in_channels;
      padded.out_strides[d] = (padded.out_strides[d] / padded.O) * out_channels;
    }

    // The innermost stride is always 1.
    const int innermost = NDIM + 1;
    padded.in_strides[innermost] = 1;
    padded.wt_strides[innermost] = 1;
    padded.out_strides[innermost] = 1;

    // Publish the padded channel counts.
    padded.C = in_channels;
    padded.O = out_channels;

    return padded;
  }
};

namespace mlx {
namespace steel {

// Parameter record passed to the implicit-GEMM 2D convolution kernels.
// NOTE(review): most field meanings below are inferred from the names only;
// confirm against the host code that fills this struct.
struct ImplicitGemmConv2DParams {
  const int M; // presumably rows of the implicit GEMM - TODO confirm
  // Compared against (first weight row + BN) by the small-channel weight
  // loader to decide whether a whole block of rows can be read unchecked.
  const int N;
  const int K; // presumably the reduction extent - TODO confirm

  const int gemm_k_iterations; // presumably number of K-loop steps - confirm

  // presumably input pointer increments per width / height / channel step
  // - TODO confirm
  const int inp_jump_w;
  const int inp_jump_h;
  const int inp_jump_c;

  // presumably the tile grid extents and swizzle shift - TODO confirm
  const int tiles_n;
  const int tiles_m;
  const int swizzle_log;
};

// Parameter record for the implicit-GEMM 3D convolution kernels. It has the
// same fields as ImplicitGemmConv2DParams plus inp_jump_d.
// NOTE(review): field meanings are inferred from the names only; confirm
// against the host code that fills this struct.
struct ImplicitGemmConv3DParams {
  // presumably the implicit GEMM problem dimensions - TODO confirm
  const int M;
  const int N;
  const int K;

  const int gemm_k_iterations; // presumably number of K-loop steps - confirm

  // presumably input pointer increments per width / height / depth / channel
  // step - TODO confirm
  const int inp_jump_w;
  const int inp_jump_h;
  const int inp_jump_d;
  const int inp_jump_c;

  // presumably the tile grid extents and swizzle shift - TODO confirm
  const int tiles_n;
  const int tiles_m;
  const int swizzle_log;
};

// Step sizes and adjusted output extents used by the general 2D convolution
// loaders.
struct Conv2DGeneralJumpParams {
  // Amount added to the kernel tap (weight_h / weight_w) on each loader
  // next() step.
  const int f_wgt_jump_h;
  const int f_wgt_jump_w;

  // Factor applied to the adjusted output row / column index when the input
  // loader reconstructs the output coordinate.
  const int f_out_jump_h;
  const int f_out_jump_w;

  // NOTE(review): adj_out_h is not referenced by the loaders in this file;
  // presumably the adjusted output height - confirm.
  const int adj_out_h;
  // Divisor / modulus that splits an adjusted spatial index into row, column.
  const int adj_out_w;
  // Divisor / modulus that splits a flattened row index into batch, spatial.
  const int adj_out_hw;
  // NOTE(review): not referenced by the loaders in this file; presumably the
  // adjusted implicit GEMM M extent - confirm.
  const int adj_implicit_m;
};

// Pair of integers describing a weight range for the general 2D convolution.
// NOTE(review): semantics are not visible here; presumably the starting
// kernel index and the number of kernel taps along one axis - confirm.
struct Conv2DGeneralBaseInfo {
  int weight_base;
  int weight_size;
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/defines.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

// Qualifier bundle for compile-time constants declared inside Steel structs;
// expands to `static constant constexpr const`.
#define STEEL_CONST static constant constexpr const
// Requests full unrolling of the loop that immediately follows.
#define STEEL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")
// Disables unrolling of the loop that immediately follows.
#define STEEL_PRAGMA_NO_UNROLL _Pragma("clang loop unroll(disable)")


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/gemm.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/gemm/loader.h"
#include "mlx/backend/metal/kernels/steel/gemm/mma.h"
#include "mlx/backend/metal/kernels/steel/gemm/params.h"
#include "mlx/backend/metal/kernels/steel/gemm/transforms.h"
#include "mlx/backend/metal/kernels/steel/utils.h"

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
// GEMM kernel class
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

// Empty tag type carrying three compile-time alignment flags. It is taken as
// the defaulted last argument of GEMMKernel::gemm_loop, whose template
// parameters use the same three flags.
template <bool M_aligned, bool N_aligned, bool K_aligned>
struct LoopAlignment {};

// Threadgroup-level GEMM: stages BM x BK / BK x BN tiles of A and B in
// threadgroup memory through BlockLoader, multiplies them with BlockMMA and
// writes a BM x BN tile of D.
//
// T / U are the input / output element types, BM, BN, BK the tile sizes, and
// WM x WN the number of simdgroups per threadgroup (32 threads each).
// MN_aligned and K_aligned select at compile time whether the bounds-checked
// loads and stores are needed.
template <
    typename T,
    typename U,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    bool MN_aligned,
    bool K_aligned,
    typename AccumType = typename AccumHelper<T>::accum_type,
    typename Epilogue = TransformNone<U, AccumType>>
struct GEMMKernel {
  // Each threadgroup tile row is padded by 16 bytes worth of elements.
  STEEL_CONST short tgp_padding_a = 16 / sizeof(T);
  STEEL_CONST short tgp_padding_b = 16 / sizeof(T);
  // Element counts of the A and B tiles, including the padding, for the
  // orientation in which each tile is stored.
  STEEL_CONST short tgp_mem_size_a =
      transpose_a ? BK * (BM + tgp_padding_a) : BM * (BK + tgp_padding_a);
  STEEL_CONST short tgp_mem_size_b =
      transpose_b ? BN * (BK + tgp_padding_b) : BK * (BN + tgp_padding_b);
  STEEL_CONST short tgp_mem_size = tgp_mem_size_a + tgp_mem_size_b;

  // Threads per threadgroup: WM * WN simdgroups of 32 threads.
  STEEL_CONST short tgp_size = WM * WN * 32;

  // Loader for the A tile; rows / cols swap when A is transposed.
  using loader_a_t = BlockLoader<
      T,
      transpose_a ? BK : BM,
      transpose_a ? BM : BK,
      transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
      !transpose_a,
      tgp_size>;
  // Loader for the B tile; rows / cols swap when B is transposed.
  using loader_b_t = BlockLoader<
      T,
      transpose_b ? BN : BK,
      transpose_b ? BK : BN,
      transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
      transpose_b,
      tgp_size>;
  // Multiply-accumulate over the staged tiles; the two leading dimensions
  // match the ones given to the loaders above.
  using mma_t = BlockMMA<
      T,
      U,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      transpose_a ? BM + tgp_padding_a : BK + tgp_padding_a,
      transpose_b ? BK + tgp_padding_b : BN + tgp_padding_b,
      AccumType,
      Epilogue>;

  /* K loop for one output tile.
   *
   * Runs gemm_k_iterations full BK steps, using the unchecked load for A
   * when M_aligned (and for B when N_aligned) and the bounds-checked load
   * otherwise. When K_aligned_ is false one extra step handles the remaining
   * lbk columns of K with bounds-checked loads.
   *
   * tgp_bm / tgp_bn are the valid rows / columns of this tile; they are only
   * used to build the limits passed to load_safe.
   */
  template <bool M_aligned, bool N_aligned, bool K_aligned_>
  static METAL_FUNC void gemm_loop(
      threadgroup T* As [[threadgroup(0)]],
      threadgroup T* Bs [[threadgroup(1)]],
      const int gemm_k_iterations,
      thread loader_a_t& loader_a,
      thread loader_b_t& loader_b,
      thread mma_t& mma_op,
      thread const short& tgp_bm,
      thread const short& tgp_bn,
      thread const short& lbk,
      LoopAlignment<M_aligned, N_aligned, K_aligned_> l = {}) {
    // Appease the compiler
    (void)l;

    // Valid extents of a full-K tile, ordered to match the stored layout.
    short2 tile_dims_A = transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);

    short2 tile_dims_B = transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);

    for (int k = 0; k < gemm_k_iterations; k++) {
      // Wait until every thread is done with the previous tiles before they
      // are overwritten.
      threadgroup_barrier(mem_flags::mem_threadgroup);
      // Load elements into threadgroup
      if (M_aligned) {
        loader_a.load_unsafe();
      } else {
        loader_a.load_safe(tile_dims_A);
      }

      if (N_aligned) {
        loader_b.load_unsafe();
      } else {
        loader_b.load_safe(tile_dims_B);
      }

      // Make the freshly written tiles visible to the whole threadgroup.
      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }

    // Leftover K columns: always bounds-checked.
    if (!K_aligned_) {
      threadgroup_barrier(mem_flags::mem_threadgroup);

      short2 tile_dims_A_last =
          transpose_a ? short2(tgp_bm, lbk) : short2(lbk, tgp_bm);
      short2 tile_dims_B_last =
          transpose_b ? short2(lbk, tgp_bn) : short2(tgp_bn, lbk);

      loader_a.load_safe(tile_dims_A_last);
      loader_b.load_safe(tile_dims_B_last);

      threadgroup_barrier(mem_flags::mem_threadgroup);

      mma_op.mma(As, Bs);
    }
  }

  /* Kernel body for one threadgroup.
   *
   * Maps the threadgroup to an output tile, offsets A, B and D to that tile,
   * runs the K loop and stores the result. As / Bs are the threadgroup
   * staging buffers for the A and B tiles.
   */
  static METAL_FUNC void run(
      const device T* A [[buffer(0)]],
      const device T* B [[buffer(1)]],
      device U* D [[buffer(2)]],
      const constant GEMMParams* params [[buffer(3)]],
      threadgroup T* As [[threadgroup(0)]],
      threadgroup T* Bs [[threadgroup(1)]],
      uint simd_lane_id [[thread_index_in_simdgroup]],
      uint simd_group_id [[simdgroup_index_in_threadgroup]],
      uint3 tid [[threadgroup_position_in_grid]],
      uint3 lid [[thread_position_in_threadgroup]]) {
    // Pacifying compiler
    (void)lid;

    // Undo the grid swizzle: the low swizzle_log bits of tid.x become the
    // low bits of the row-tile index, the remaining bits the column-tile
    // index.
    const int tid_y = ((tid.y) << params->swizzle_log) +
        ((tid.x) & ((1 << params->swizzle_log) - 1));
    const int tid_x = (tid.x) >> params->swizzle_log;

    // Threadgroups mapped outside the tile grid have nothing to do.
    if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
      return;
    }

    threadgroup_barrier(mem_flags::mem_none);

    // Find block in A, B, D
    const int c_row = tid_y * BM;
    const int c_col = tid_x * BN;
    // Widen before multiplying by the leading dimensions.
    const size_t c_row_long = size_t(c_row);
    const size_t c_col_long = size_t(c_col);

    A += transpose_a ? c_row_long : c_row_long * params->lda;
    B += transpose_b ? c_col_long * params->ldb : c_col_long;
    D += c_row_long * params->ldd + c_col_long;

    // Prepare threadgroup loading operations
    thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
    thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);

    // Prepare threadgroup mma operation
    thread mma_t mma_op(simd_group_id, simd_lane_id);

    // Number of full BK steps; any remainder of K is handled separately.
    int gemm_k_iterations = params->gemm_k_iterations_aligned;

    ///////////////////////////////////////////////////////////////////////////////
    // MNK aligned loop
    if (MN_aligned) {
      for (int k = 0; k < gemm_k_iterations; k++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }

      threadgroup_barrier(mem_flags::mem_none);

      // Loop tail
      if (!K_aligned) {
        // Columns of K left over after the full BK steps.
        int lbk = params->K - params->gemm_k_iterations_aligned * BK;
        short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);
        short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);

        loader_a.load_safe(tile_dims_A);
        loader_b.load_safe(tile_dims_B);

        threadgroup_barrier(mem_flags::mem_threadgroup);

        mma_op.mma(As, Bs);
      }

      // Store results to device memory
      mma_op.store_result(D, params->ldd);
      return;

    }
    ///////////////////////////////////////////////////////////////////////////////
    // MN unaligned loop
    else { // Loop over K - unaligned case
      // Valid rows / columns of this tile (smaller than BM / BN on the
      // bottom / right edge of the output).
      short tgp_bm = min(BM, params->M - c_row);
      short tgp_bn = min(BN, params->N - c_col);
      short leftover_bk = params->K - params->gemm_k_iterations_aligned * BK;

      // Dispatch on which of the two tile extents is complete so that only
      // the partial dimension pays for bounds checks.
      if (tgp_bm == BM && tgp_bn == BN) {
        gemm_loop<true, true, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result(D, params->ldd);
        return;

      } else if (tgp_bn == BN) {
        // Partial in M only.
        gemm_loop<false, true, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
        return;

      } else if (tgp_bm == BM) {
        // Partial in N only.
        gemm_loop<true, false, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
        return;

      } else {
        // Partial in both M and N.
        gemm_loop<false, false, K_aligned>(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            leftover_bk);

        mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
        return;
      }
    }
  }
};

} // namespace steel
} // namespace mlx

================================================
FILE: mlx/backend/metal/kernels/steel/gemm/gemm_nax.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/gemm/nax.h"
#include "mlx/backend/metal/kernels/steel/gemm/params.h"
#include "mlx/backend/metal/kernels/steel/gemm/transforms.h"
#include "mlx/backend/metal/kernels/steel/utils.h"

using namespace metal;

namespace mlx::steel {

// K loop of the NAX GEMM path.
//
// Accumulates A * B into an SM x SN accumulator tile by loading tiles
// straight from device memory (no threadgroup staging appears in this
// function) and returns the accumulator.
//
// - A, B: device pointers already offset to this accumulator's sub-block;
//   lda / ldb are their leading dimensions.
// - K: total reduction length; gemm_k_iterations_aligned is the number of
//   full BK-wide steps.
// - sgp_sm / sgp_sn: valid rows / columns, used only to build the limits for
//   the bounds-checked loads when kAlignedM / kAlignedN are false.
// - kAlignedK == false adds a tail pass over the K - iterations * BK
//   leftover columns.
template <
    typename T,
    short SM,
    short SN,
    short SK,
    short BK,
    bool transpose_a,
    bool transpose_b,
    bool kAlignedM,
    bool kAlignedN,
    bool kAlignedK,
    typename AccumType = float>
auto gemm_loop(
    const device T* A,
    const device T* B,
    int lda,
    int ldb,
    int K,
    int gemm_k_iterations_aligned,
    const short sgp_sm,
    const short sgp_sn) {
  // Tile extents expressed in units of 16 elements.
  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;
  constexpr short TK = SK / 16;

  // Row / column tile counts of the A and B operands; they swap when the
  // operand is transposed.
  constexpr int RA = transpose_a ? TK : TM;
  constexpr int CA = transpose_a ? TM : TK;

  constexpr int RB = transpose_b ? TN : TK;
  constexpr int CB = transpose_b ? TK : TN;

  // Accumulator, cleared before the first multiply-add.
  NAXTile<AccumType, TM, TN> Dtile;
  Dtile.clear();

  int gemm_k_iterations_ = gemm_k_iterations_aligned;

  // Outer loop: one full BK-wide slab of K per iteration.
  STEEL_PRAGMA_NO_UNROLL
  for (int kk0 = 0; kk0 < gemm_k_iterations_; kk0++) {
    threadgroup_barrier(mem_flags::mem_none);

    // Inner loop: walk the slab in SK-wide steps.
    STEEL_PRAGMA_NO_UNROLL
    for (int kk1 = 0; kk1 < BK; kk1 += SK) {
      NAXTile<T, RA, CA> Atile;
      NAXTile<T, RB, CB> Btile;
      const int k = kk1;

      // NOTE(review): never written or read apart from the (void) cast
      // below; judging by the name it is there to constrain compiler
      // reordering - confirm before removing.
      volatile int compiler_barrier;

      // Moving along K means moving down rows of a transposed A (or of a
      // non-transposed B) and along columns otherwise.
      const int A_offset = transpose_a ? k * lda : k;
      const int B_offset = transpose_b ? k : k * ldb;

      if constexpr (kAlignedM) {
        Atile.load(A + A_offset, lda);
      } else {
        // Limits are passed as (columns, rows).
        const short rmax = transpose_a ? SK : sgp_sm;
        const short cmax = transpose_a ? sgp_sm : SK;
        Atile.load_safe(A + A_offset, lda, short2(cmax, rmax));
      }

      if constexpr (kAlignedN) {
        Btile.load(B + B_offset, ldb);
      } else {
        const short rmax = transpose_b ? sgp_sn : SK;
        const short cmax = transpose_b ? SK : sgp_sn;
        Btile.load_safe(B + B_offset, ldb, short2(cmax, rmax));
      }

      tile_matmad_nax(
          Dtile,
          Atile,
          metal::bool_constant<transpose_a>{},
          Btile,
          metal::bool_constant<transpose_b>{});

      (void)compiler_barrier;
    }

    // Advance both operands by one BK slab along K.
    A += transpose_a ? (BK * lda) : BK;
    B += transpose_b ? BK : (BK * ldb);
  }

  // Tail: leftover K columns, always loaded with bounds checks.
  if constexpr (!kAlignedK) {
    simdgroup_barrier(mem_flags::mem_none);

    const short rem_bk = K - gemm_k_iterations_ * BK;

    STEEL_PRAGMA_NO_UNROLL
    for (int kk1 = 0; kk1 < rem_bk; kk1 += SK) {
      NAXTile<T, RA, CA> Atile;
      NAXTile<T, RB, CB> Btile;

      const int k = kk1;
      // K columns still left at this step; used as the K limit of the loads.
      const short psk = max(0, rem_bk - k);

      const short2 Aklims =
          transpose_a ? short2(sgp_sm, psk) : short2(psk, sgp_sm);
      const short2 Bklims =
          transpose_b ? short2(psk, sgp_sn) : short2(sgp_sn, psk);

      const int A_offset = transpose_a ? k * lda : k;
      const int B_offset = transpose_b ? k : k * ldb;

      Atile.load_safe(A + A_offset, lda, Aklims);
      Btile.load_safe(B + B_offset, ldb, Bklims);

      tile_matmad_nax(
          Dtile,
          Atile,
          metal::bool_constant<transpose_a>{},
          Btile,
          metal::bool_constant<transpose_b>{});
    }
  }

  return Dtile;
}

} // namespace mlx::steel


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h
================================================
// Copyright © 2024 Apple Inc.

using namespace mlx::steel;

///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////

// Function constants for the fused GEMM kernel.

// Gates the batch_shape / batch_strides buffers and the per-batch offsetting
// of the operands.
constant bool has_batch [[function_constant(10)]];

// Gates the C input buffer and the addmm_params buffer.
constant bool use_out_source [[function_constant(100)]];
// NOTE(review): not referenced in the visible part of this file; presumably
// enables alpha/beta scaling of the result - confirm.
constant bool do_axpby [[function_constant(110)]];

// NOTE(review): not referenced in the visible part of this file; presumably
// whether M / N / K are multiples of the tile sizes - confirm.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

// clang-format off
// Tiled matrix-multiply kernel with an optional fused epilogue.
//
// Each threadgroup handles one BM x BN tile of D; the tile is selected from
// the threadgroup position after the swizzle remapping below. A and B tiles
// are staged through threadgroup memory (As / Bs) by the GEMMKernel loaders
// and accumulated by its mma_t object. When `use_out_source` is set, C is
// combined into the accumulators before the store, using TransformAxpby when
// `do_axpby` is set and TransformAdd otherwise.
//
// Template parameters:
//   T              element type of A, B, C and D
//   BM, BN, BK     tile extents along M, N and K
//   WM, WN         forwarded to GEMMKernel; WM * WN * 32 bounds the
//                  threadgroup size
//   transpose_a/b  select the pointer-advance and tile-dims layout for A / B
//   AccumType      forwarded to GEMMKernel and to the epilogue transforms
//
// Arguments C and addmm_params exist only when `use_out_source` is true;
// batch_shape and batch_strides exist only when `has_batch` is true.
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] void gemm(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    const device T* C [[buffer(2), function_constant(use_out_source)]],
    device T* D [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    const constant GEMMAddMMParams* addmm_params [[buffer(5), function_constant(use_out_source)]],
    const constant int* batch_shape [[buffer(6), function_constant(has_batch)]],
    const constant int64_t* batch_strides [[buffer(7), function_constant(has_batch)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) { // clang-format on
  // `lid` is unused; the cast silences the unused-parameter warning.
  (void)lid;

  using gemm_kernel = GEMMKernel<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      true,
      true,
      AccumType>;

  using loader_a_t = typename gemm_kernel::loader_a_t;
  using loader_b_t = typename gemm_kernel::loader_b_t;
  using mma_t = typename gemm_kernel::mma_t;

  // Find block: remap the launch grid position to a (tid_x, tid_y) tile
  // coordinate. The low `swizzle_log` bits of tid.x are folded into the row
  // index and the remaining bits become the column index.
  const int tid_y = ((tid.y) << params->swizzle_log) +
      ((tid.x) & ((1 << params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> params->swizzle_log;

  // Exit early if the remapped tile lies outside the tiles_m x tiles_n grid
  if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
    return;
  }

  // Adjust for batch: tid.z is the batch element
  if (has_batch) {
    // batch_strides holds the A strides, then the B strides, then (when
    // use_out_source) the C strides, batch_ndim entries each.
    const constant auto* A_bstrides = batch_strides;
    const constant auto* B_bstrides = batch_strides + params->batch_ndim;

    ulong2 batch_offsets = elem_to_loc_broadcast(
        tid.z, batch_shape, A_bstrides, B_bstrides, params->batch_ndim);

    A += batch_offsets.x;
    B += batch_offsets.y;

    if (use_out_source) {
      const constant auto* C_bstrides = B_bstrides + params->batch_ndim;
      C += elem_to_loc(tid.z, batch_shape, C_bstrides, params->batch_ndim);
    }
  } else {
    A += params->batch_stride_a * tid.z;
    B += params->batch_stride_b * tid.z;

    if (use_out_source) {
      C += addmm_params->batch_stride_c * tid.z;
    }
  }

  // D always uses a single batch stride, in both batch modes
  D += params->batch_stride_d * tid.z;

  // Prepare threadgroup memory
  threadgroup T As[gemm_kernel::tgp_mem_size_a];
  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];

  threadgroup_barrier(mem_flags::mem_none);

  // Find block in A, B, C. Offsets are computed in size_t before being
  // multiplied by the leading dimensions.
  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  D += c_row_long * params->ldd + c_col_long;

  if (use_out_source) {
    // C is addressed with a row stride (ldc) and a column stride (fdc)
    C += c_row_long * addmm_params->ldc + c_col_long * addmm_params->fdc;
  }

  // Prepare threadgroup mma operation
  thread mma_t mma_op(simd_group_id, simd_lane_id);

  // Prepare threadgroup loading operations
  thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
  thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);

  // Prepare threadgroup bounds: rows / columns of this tile that are inside
  // the M x N output (the full BM / BN when the dimension is aligned)
  const short tgp_bm = align_M ? BM : short(min(BM, params->M - c_row));
  const short tgp_bn = align_N ? BN : short(min(BN, params->N - c_col));

  // Prepare iterations
  int gemm_k_iterations = params->gemm_k_iterations_aligned;

  // Do unaligned K iterations first: handle the partial K tile that follows
  // the last full BK-sized tile, then rewind the loaders.
  if (!align_K) {
    const int k_last = params->gemm_k_iterations_aligned * BK;
    const int k_remain = params->K - k_last;
    const size_t k_jump_a =
        transpose_a ? params->lda * size_t(k_last) : size_t(k_last);
    const size_t k_jump_b =
        transpose_b ? size_t(k_last) : params->ldb * size_t(k_last);

    // Move loader source ahead to end
    loader_a.src += k_jump_a;
    loader_b.src += k_jump_b;

    // Load tile; the K extent goes in .x or .y depending on the transpose flag
    const short2 tile_dims_A =
        transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
    const short2 tile_dims_B =
        transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

    loader_a.load_safe(tile_dims_A);
    loader_b.load_safe(tile_dims_B);

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Do matmul
    mma_op.mma(As, Bs);

    // Reset source back to start
    loader_a.src -= k_jump_a;
    loader_b.src -= k_jump_b;
  }

  // Epilogue transforms, built from the alpha / beta scalars.
  // NOTE(review): addmm_params is dereferenced here outside any
  // use_out_source guard although the argument is gated on that constant; the
  // transforms are only used inside use_out_source branches below. Confirm
  // this is safe when the argument is disabled.
  const TransformAdd<AccumType, AccumType> epilogue_op_add(
      addmm_params->alpha, addmm_params->beta);
  const TransformAxpby<AccumType, AccumType> epilogue_op_axpby(
      addmm_params->alpha, addmm_params->beta);

  ///////////////////////////////////////////////////////////////////////////////
  // MNK aligned loop: M and N are aligned for the whole matrix, so no tile
  // needs bounds checks on load, epilogue or store.
  if (align_M && align_N) {
    // Do gemm
    for (int k = 0; k < gemm_k_iterations; k++) {
      threadgroup_barrier(mem_flags::mem_threadgroup);
      // Load elements into threadgroup
      loader_a.load_unsafe();
      loader_b.load_unsafe();

      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }

    threadgroup_barrier(mem_flags::mem_none);

    // Do epilogue
    if (use_out_source) {
      if (do_axpby) {
        mma_op.apply_epilogue(
            C, addmm_params->ldc, addmm_params->fdc, epilogue_op_axpby);
      } else {
        mma_op.apply_epilogue(
            C, addmm_params->ldc, addmm_params->fdc, epilogue_op_add);
      }
    }

    // Store results to device memory
    return mma_op.store_result(D, params->ldd);

  }
  ///////////////////////////////////////////////////////////////////////////////
  // MN unaligned loop: choose a gemm_loop specialization per tile from the
  // tile's actual extents. The third LoopAlignment flag is always true and
  // leftover_bk is 0 because any K remainder was already consumed above.
  else { // Loop over K - unaligned case
    const int leftover_bk = 0;

    // This tile is full in both M and N: unchecked epilogue and store
    if ((align_M || tgp_bm == BM) && (align_N || tgp_bn == BN)) {
      // Do gemm
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          leftover_bk,
          LoopAlignment<true, true, true>{});

      // Do epilogue
      if (use_out_source) {
        if (do_axpby) {
          mma_op.apply_epilogue(
              C, addmm_params->ldc, addmm_params->fdc, epilogue_op_axpby);
        } else {
          mma_op.apply_epilogue(
              C, addmm_params->ldc, addmm_params->fdc, epilogue_op_add);
        }
      }

      // Store results to device memory
      return mma_op.store_result(D, params->ldd);

    // Full in N, partial in M: bounds-checked epilogue and store
    } else if (align_N || tgp_bn == BN) {
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          leftover_bk,
          LoopAlignment<false, true, true>{});

      // Do epilogue
      if (use_out_source) {
        if (do_axpby) {
          mma_op.apply_epilogue_safe(
              C,
              addmm_params->ldc,
              addmm_params->fdc,
              short2(tgp_bn, tgp_bm),
              epilogue_op_axpby);
        } else {
          mma_op.apply_epilogue_safe(
              C,
              addmm_params->ldc,
              addmm_params->fdc,
              short2(tgp_bn, tgp_bm),
              epilogue_op_add);
        }
      }

      // Store results to device memory
      return mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));

    // Full in M, partial in N: bounds-checked epilogue and store
    } else if (align_M || tgp_bm == BM) {
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          leftover_bk,
          LoopAlignment<true, false, true>{});

      // Do epilogue
      if (use_out_source) {
        if (do_axpby) {
          mma_op.apply_epilogue_safe(
              C,
              addmm_params->ldc,
              addmm_params->fdc,
              short2(tgp_bn, tgp_bm),
              epilogue_op_axpby);
        } else {
          mma_op.apply_epilogue_safe(
              C,
              addmm_params->ldc,
              addmm_params->fdc,
              short2(tgp_bn, tgp_bm),
              epilogue_op_add);
        }
      }

      // Store results to device memory
      return mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));

    // Partial in both M and N
    } else {
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          leftover_bk,
          LoopAlignment<false, false, true>{});

      // Do epilogue
      if (use_out_source) {
        if (do_axpby) {
          mma_op.apply_epilogue_safe(
              C,
              addmm_params->ldc,
              addmm_params->fdc,
              short2(tgp_bn, tgp_bm),
              epilogue_op_axpby);
        } else {
          mma_op.apply_epilogue_safe(
              C,
              addmm_params->ldc,
              addmm_params->fdc,
              short2(tgp_bn, tgp_bm),
              epilogue_op_add);
        }
      }

      // Store results to device memory
      return mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.metal
================================================
// Copyright © 2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h"

// Instantiates one `gemm` kernel under the host-visible name
//   steel_gemm_fused_<tname>_<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>
// with element type `itype` and a float accumulator. `otype` is accepted for
// signature symmetry but is not forwarded to the template.
#define instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_kernel(                                                                             \
      "steel_gemm_fused_" #tname "_"  #iname "_" #oname                                           \
      "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn,                                          \
  gemm, itype, bm, bn, bk, wm, wn, trans_a, trans_b, float)

// Expands to the four transpose combinations (nn, nt, tn, tt) for one tile
// configuration.
#define instantiate_gemm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Expands to every (bm, bn, bk, wm, wn) tile configuration built for a given
// element type.
#define instantiate_gemm_shapes_helper(iname, itype, oname, otype) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 1, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 32, 32, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 64, 16, 1, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 32,  8, 4, 1)

// Element types for which the fused GEMM kernels are built.
instantiate_gemm_shapes_helper(float16, half, float16, half);
instantiate_gemm_shapes_helper(bfloat16, bfloat16_t, bfloat16, bfloat16_t);

instantiate_gemm_shapes_helper(float32, float, float32, float);
instantiate_gemm_shapes_helper(complex64, complex64_t, complex64, complex64_t);
// clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h
================================================
// Copyright © 2025 Apple Inc.

using namespace mlx::steel;

// Pipeline-specialization constants consumed by the kernels in this header.

// When true, per-matrix batch offsets are derived from `batch_shape` /
// `batch_strides`; otherwise the single strides in GEMMParams are used.
constant bool has_batch [[function_constant(10)]];

// `use_out_source` gates the C and addmm_params kernel arguments and the call
// to gemm_epilogue. `do_axpby` selects alpha * D + beta * C over D + C inside
// that epilogue.
constant bool use_out_source [[function_constant(100)]];
constant bool do_axpby [[function_constant(110)]];

// When align_M / align_N are true the per-simdgroup extents are not clamped
// against M / N. align_K is forwarded to gemm_loop as its K-alignment flag.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

// clang-format off
// Combines the source matrix C into the accumulator tile `Dtile` in place.
//
// For every fragment (mm, nn) of the tile, the matching fragment of C is
// loaded and merged element-wise:
//   do_axpby : D = alpha * D + beta * C
//   otherwise: D = D + C
//
// Template parameters:
//   kAlignedM / kAlignedN  when both are true C is read with the unchecked
//                          CFrag::load; otherwise CFrag::load_safe is used
//                          with the (sgp_sm, sgp_sn) limits
//   NAXTile_t              accumulator tile type; supplies elem_type,
//                          kTileRows, kTileCols, kElemsPerFrag and NAXFrag_t
//   T                      element type of C
//
// Parameters:
//   Dtile         accumulator tile, updated in place
//   C             source matrix, already offset to this tile's origin
//   params        unused
//   addmm_params  supplies ldc / fdc strides and the alpha / beta scalars
//   sgp_sm/sgp_sn row / column limits passed to load_safe
template <
    bool kAlignedM,
    bool kAlignedN,
    class NAXTile_t,
    typename T>
void gemm_epilogue(
    thread NAXTile_t& Dtile,
    const device T* C,
    const constant GEMMParams* params,
    const constant GEMMAddMMParams* addmm_params,
    const short sgp_sm,
    const short sgp_sn) { // clang-format on

  (void)params;

  using V = typename NAXTile_t::elem_type;

  constexpr short TM = NAXTile_t::kTileRows;
  constexpr short TN = NAXTile_t::kTileCols;
  constexpr short kElemsPerFrag = NAXTile_t::kElemsPerFrag;

  using CFrag = typename NAXTile_t::NAXFrag_t;
  using cfrag_t = typename CFrag::template dtype_frag_t<T>;

  STEEL_PRAGMA_UNROLL
  for (short mm = 0; mm < TM; mm++) {
    STEEL_PRAGMA_UNROLL
    for (short nn = 0; nn < TN; nn++) {
      // Element offset of fragment (mm, nn) inside the tile
      const short m = mm * CFrag::kFragRows;
      const short n = nn * CFrag::kFragCols;

      cfrag_t celems;

      if constexpr (kAlignedM && kAlignedN) {
        CFrag::load(celems, C, addmm_params->ldc, addmm_params->fdc, m, n);
      } else {
        CFrag::load_safe(
            celems,
            C,
            addmm_params->ldc,
            addmm_params->fdc,
            sgp_sm,
            sgp_sn,
            m,
            n);
      }

      // Bind by forwarding reference so the writes below land in Dtile's own
      // storage. A plain `auto` would deduce a by-value copy whenever frag_at
      // returns a reference to a vector-typed fragment, silently discarding
      // the epilogue result. Pointer-like returns behave exactly as before.
      auto&& delems = Dtile.frag_at(mm, nn);

      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < kElemsPerFrag; i++) {
        if (do_axpby) {
          delems[i] = addmm_params->alpha * delems[i] +
              addmm_params->beta * static_cast<V>(celems[i]);
        } else {
          delems[i] += static_cast<V>(celems[i]);
        }
      }
    }
  }
}

// clang-format off
// Tiled matrix-multiply kernel (NAX variant) with an optional fused epilogue.
//
// Each threadgroup handles one BM x BN tile of D, and each simdgroup within
// it handles an SM x SN sub-tile (SM = BM / WM, SN = BN / WN). The product is
// accumulated by `gemm_loop` into a per-thread NAXTile and written straight
// to D; this kernel declares no threadgroup arrays. When `use_out_source` is
// set, `gemm_epilogue` combines C into the tile before the store.
//
// Template parameters:
//   T              element type of A, B, C and D
//   BM, BN         threadgroup tile extents along M and N
//   BK             forwarded to gemm_loop
//   WM, WN         simdgroup grid inside the threadgroup; WM * WN * 32 bounds
//                  the threadgroup size
//   transpose_a/b  select the pointer-advance layout for A / B
//   AccumType      element type of the accumulator tile
//
// Arguments C and addmm_params exist only when `use_out_source` is true;
// batch_shape and batch_strides exist only when `has_batch` is true.
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] void gemm(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    const device T* C [[buffer(2), function_constant(use_out_source)]],
    device T* D [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    const constant GEMMAddMMParams* addmm_params [[buffer(5), function_constant(use_out_source)]],
    const constant int* batch_shape [[buffer(6), function_constant(has_batch)]],
    const constant int64_t* batch_strides [[buffer(7), function_constant(has_batch)]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]]) { // clang-format on
  // Find block: remap the launch grid position to a (tid_x, tid_y) tile
  // coordinate. The low `swizzle_log` bits of tid.x are folded into the row
  // index and the remaining bits become the column index.
  const int tid_y = ((tid.y) << params->swizzle_log) +
      ((tid.x) & ((1 << params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> params->swizzle_log;

  // Exit early if the remapped tile lies outside the tiles_m x tiles_n grid
  if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
    return;
  }

  // Adjust for batch: tid.z is the batch element
  if (has_batch) {
    // batch_strides holds the A strides, then the B strides, then (when
    // use_out_source) the C strides, batch_ndim entries each.
    const constant auto* A_bstrides = batch_strides;
    const constant auto* B_bstrides = batch_strides + params->batch_ndim;

    ulong2 batch_offsets = elem_to_loc_broadcast(
        tid.z, batch_shape, A_bstrides, B_bstrides, params->batch_ndim);

    A += batch_offsets.x;
    B += batch_offsets.y;

    if (use_out_source) {
      const constant auto* C_bstrides = B_bstrides + params->batch_ndim;
      C += elem_to_loc(tid.z, batch_shape, C_bstrides, params->batch_ndim);
    }
  } else {
    A += params->batch_stride_a * tid.z;
    B += params->batch_stride_b * tid.z;

    if (use_out_source) {
      C += addmm_params->batch_stride_c * tid.z;
    }
  }

  // D always uses a single batch stride, in both batch modes
  D += params->batch_stride_d * tid.z;

  // No threadgroup arrays are declared in this kernel; only the barrier
  // remains here.
  threadgroup_barrier(mem_flags::mem_none);

  // Find block in A, B, C. Offsets are computed in size_t before being
  // multiplied by the leading dimensions.
  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  D += c_row_long * params->ldd + c_col_long;

  if (use_out_source) {
    // C is addressed with a row stride (ldc) and a column stride (fdc)
    C += c_row_long * addmm_params->ldc + c_col_long * addmm_params->fdc;
  }

  // Per-simdgroup sub-tile extents, and the K step handed to gemm_loop
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  // Accumulator tile dimensions, in units of 16 rows / columns
  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;

  // Origin of this simdgroup's sub-tile inside the threadgroup tile;
  // simdgroups are laid out row-major over a WM x WN grid.
  const short tm = SM * (simd_group_id / WN);
  const short tn = SN * (simd_group_id % WN);

  // Rows of this sub-tile that fall inside M. The clamp is computed in int
  // and then narrowed to short.
  const int sgp_sm_int =
      align_M ? int(SM) : min(int(SM), params->M - (c_row + tm));
  const short sgp_sm = short(sgp_sm_int);
  const bool is_unaligned_sm = align_M ? false : (sgp_sm != SM);

  // Columns of this sub-tile that fall inside N
  const int sgp_sn_int =
      align_N ? int(SN) : min(int(SN), params->N - (c_col + tn));
  const short sgp_sn = short(sgp_sn_int);
  const bool is_unaligned_sn = align_N ? false : (sgp_sn != SN);

  // Advance all pointers to this simdgroup's sub-tile
  A += transpose_a ? tm : (tm * params->lda);
  B += transpose_b ? (tn * params->ldb) : tn;
  D += tm * params->ldd + tn;

  if (use_out_source) {
    C += tm * addmm_params->ldc + tn * addmm_params->fdc;
  }

  NAXTile<AccumType, TM, TN> Dtile;

  // Turn the three alignment flags into compile-time values (kAligned*.value)
  // so gemm_loop, gemm_epilogue and the store are specialized on them. A
  // sub-tile counts as aligned when the whole dimension is aligned or this
  // particular sub-tile is full.
  dispatch_bool(align_K, [&](auto kAlignedK) {
    dispatch_bool(align_M || !is_unaligned_sm, [&](auto kAlignedM) {
      dispatch_bool(align_N || !is_unaligned_sn, [&](auto kAlignedN) {
        Dtile = gemm_loop<
            T,
            SM,
            SN,
            SK,
            BK,
            transpose_a,
            transpose_b,
            kAlignedM.value,
            kAlignedN.value,
            kAlignedK.value,
            AccumType>(
            A,
            B,
            params->lda,
            params->ldb,
            params->K,
            params->gemm_k_iterations_aligned,
            sgp_sm,
            sgp_sn);
        // Fold C into the accumulators before storing
        if (use_out_source) {
          gemm_epilogue<kAlignedM.value, kAlignedN.value>(
              Dtile, C, params, addmm_params, sgp_sm, sgp_sn);
        }
        // Unchecked store for full sub-tiles, bounds-checked store otherwise
        if constexpr (kAlignedM && kAlignedN) {
          Dtile.store(D, int(params->ldd));
        } else {
          Dtile.store_safe(D, int(params->ldd), short2(sgp_sn, sgp_sm));
        }
      });
    });
  });
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.metal
================================================
// Copyright © 2025 Apple Inc.

#include <metal_stdlib>

#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/steel/gemm/gemm_nax.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h"

// clang-format off
// Instantiates one NAX `gemm` kernel under the host-visible name
//   steel_gemm_fused_nax_<tname>_<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>
// with element type `itype` and a float accumulator. `otype` is accepted for
// signature symmetry but is not forwarded to the template.
#define instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_kernel(                                                                             \
      "steel_gemm_fused_nax_" #tname "_"  #iname "_" #oname                                       \
      "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn,                                          \
  gemm, itype, bm, bn, bk, wm, wn, trans_a, trans_b, float)

// Expands to the four transpose combinations (nn, nt, tn, tt) for one tile
// configuration.
#define instantiate_gemm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Expands to every (bm, bn, bk, wm, wn) tile configuration built for a given
// element type.
#define instantiate_gemm_shapes_helper(iname, itype, oname, otype) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype,  64,  64, 256, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype,  64, 128,  64, 2, 4) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype,  64, 128, 256, 2, 4) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 128, 128,  64, 4, 4) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 128, 128, 256, 4, 4) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 128, 128, 512, 4, 4)

// Element types for which the NAX fused GEMM kernels are built.
instantiate_gemm_shapes_helper(float16, half, float16, half);
instantiate_gemm_shapes_helper(bfloat16, bfloat, bfloat16, bfloat);
instantiate_gemm_shapes_helper(float32, float, float32, float);
// clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h
================================================
// Copyright © 2024 Apple Inc.

using namespace mlx::steel;

// Pipeline-specialization constants consumed by the gather kernels below.

// When true, gather_mm locates its index entries through `indices_shape` and
// the lhs / rhs index strides; otherwise through the GEMMParams batch strides.
constant bool has_batch [[function_constant(10)]];
// When align_M / align_N are true the kernels use the full BM / BN tile extent
// without clamping against M / N. When align_K is false a remainder K tile is
// processed with bounds-checked loads before the main loop.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

// Matrix multiply where each row of A selects which B matrix it multiplies.
//
// `rhs_indices` is read at absolute row positions (c_row + n), i.e. one entry
// per row of A. Within a threadgroup's BM-row tile, rows are split into runs
// of consecutive equal indices. For every run the kernel performs a full tile
// GEMM against the B matrix at `B + index * params->batch_stride_b` and then
// stores only that run's rows of the result into C.
//
// Template parameters:
//   T              element type of A, B and C
//   BM, BN, BK     tile extents along M, N and K
//   WM, WN         forwarded to GEMMKernel; WM * WN * 32 bounds the
//                  threadgroup size
//   transpose_a/b  select the pointer-advance and tile-dims layout for A / B
//   AccumType      forwarded to GEMMKernel
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void gather_mm_rhs(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    const device uint32_t* rhs_indices [[buffer(2)]],
    device T* C [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]]) {
  using gemm_kernel = GEMMKernel<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      true,
      true,
      AccumType>;

  using loader_a_t = typename gemm_kernel::loader_a_t;
  using loader_b_t = typename gemm_kernel::loader_b_t;
  using mma_t = typename gemm_kernel::mma_t;

  // Exit early if this threadgroup lies outside the tiles_m x tiles_n grid
  if (params->tiles_n <= static_cast<int>(tid.x) ||
      params->tiles_m <= static_cast<int>(tid.y)) {
    return;
  }

  // Prepare threadgroup memory
  threadgroup T As[gemm_kernel::tgp_mem_size_a];
  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];

  // Find the block in A, B, C
  const int c_row = tid.y * BM;
  const int c_col = tid.x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  // Prepare threadgroup bounds: rows / columns of this tile that are inside
  // the M x N output (the full BM / BN when the dimension is aligned)
  const short tgp_bm = align_M ? BM : short(min(BM, params->M - c_row));
  const short tgp_bn = align_N ? BN : short(min(BN, params->N - c_col));

  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  C += c_row_long * params->ldd + c_col_long;

  // Do as many matmuls as necessary: one per run of equal rhs indices.
  // [offset, offset_next) is the row range of the current run inside the tile
  // and `index` is the B matrix it uses; the *_next values are carried into
  // the following iteration.
  uint32_t index;
  short offset;
  uint32_t index_next = rhs_indices[c_row];
  short offset_next = 0;
  int n = 0;
  while (n < tgp_bm) {
    n++;
    offset = offset_next;
    index = index_next;
    // Assume the run extends to the end of the tile unless the scan below
    // finds a row with a different index.
    offset_next = tgp_bm;
    for (; n < tgp_bm; n++) {
      if (rhs_indices[c_row + n] != index) {
        offset_next = n;
        index_next = rhs_indices[c_row + n];
        break;
      }
    }
    threadgroup_barrier(mem_flags::mem_none);

    // Prepare threadgroup mma operation (fresh accumulators for every run)
    thread mma_t mma_op(simd_group_id, simd_lane_id);

    // Prepare threadgroup loading operations; B is offset to the matrix
    // selected by this run's index
    thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
    thread loader_b_t loader_b(
        B + index * params->batch_stride_b,
        params->ldb,
        Bs,
        simd_group_id,
        simd_lane_id);

    // Prepare iterations
    const int gemm_k_iterations = params->gemm_k_iterations_aligned;

    // Do unaligned K iterations first: handle the partial K tile that follows
    // the last full BK-sized tile, then rewind the loaders.
    if (!align_K) {
      const int k_last = params->gemm_k_iterations_aligned * BK;
      const int k_remain = params->K - k_last;
      const size_t k_jump_a =
          transpose_a ? params->lda * size_t(k_last) : size_t(k_last);
      const size_t k_jump_b =
          transpose_b ? size_t(k_last) : params->ldb * size_t(k_last);

      // Move loader source ahead to end
      loader_a.src += k_jump_a;
      loader_b.src += k_jump_b;

      // Load tile; the K extent goes in .x or .y depending on the transpose
      // flag
      const short2 tile_dims_A =
          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
      const short2 tile_dims_B =
          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

      loader_a.load_safe(tile_dims_A);
      loader_b.load_safe(tile_dims_B);

      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Do matmul
      mma_op.mma(As, Bs);

      // Reset source back to start
      loader_a.src -= k_jump_a;
      loader_b.src -= k_jump_b;
    }

    // M and N are aligned for the whole matrix: loads are never
    // bounds-checked
    if (align_M && align_N) {
      for (int k = 0; k < gemm_k_iterations; k++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }

      // Store results to device memory: the whole tile when the run covers
      // all BM rows, otherwise only the run's rows. The two short2 arguments
      // are presumably the (column, row) start and stop bounds of the slice
      // -- TODO confirm against store_result_slice.
      if (offset_next - offset == BM) {
        mma_op.store_result(C, params->ldd);
      } else {
        mma_op.store_result_slice(
            C, params->ldd, short2(0, offset), short2(BN, offset_next));
      }
    } else {
      // The third LoopAlignment flag is always true and lbk is 0 because any
      // K remainder was already consumed above.
      const short lbk = 0;

      // This tile is full in both M and N: no bounds checks in the loop
      if ((align_M || tgp_bm == BM) && (align_N || tgp_bn == BN)) {
        gemm_kernel::gemm_loop(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            lbk,
            LoopAlignment<true, true, true>{});
        if (offset_next - offset == BM) {
          mma_op.store_result(C, params->ldd);
        } else {
          mma_op.store_result_slice(
              C, params->ldd, short2(0, offset), short2(BN, offset_next));
        }
      }

      // Full in N, partial in M: check rows
      else if (align_N || tgp_bn == BN) {
        gemm_kernel::gemm_loop(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            lbk,
            LoopAlignment<false, true, true>{});
        mma_op.store_result_slice(
            C, params->ldd, short2(0, offset), short2(BN, offset_next));
      }

      // Full in M, partial in N: check columns (slice is limited to tgp_bn)
      else if (align_M || tgp_bm == BM) {
        gemm_kernel::gemm_loop(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            lbk,
            LoopAlignment<true, false, true>{});
        mma_op.store_result_slice(
            C, params->ldd, short2(0, offset), short2(tgp_bn, offset_next));
      }

      // Nothing aligned so check both rows and cols
      else {
        gemm_kernel::gemm_loop(
            As,
            Bs,
            gemm_k_iterations,
            loader_a,
            loader_b,
            mma_op,
            tgp_bm,
            tgp_bn,
            lbk,
            LoopAlignment<false, false, true>{});
        mma_op.store_result_slice(
            C, params->ldd, short2(0, offset), short2(tgp_bn, offset_next));
      }
    }
  }
}

// Batched matrix multiply where both operands are chosen through index
// arrays.
//
// For output batch element tid.z the kernel reads one entry from
// `lhs_indices` and one from `rhs_indices`, converts each to an element
// offset into A / B with elem_to_loc over the respective batch shape and
// strides, and then runs a tiled GEMM for the threadgroup's BM x BN tile,
// writing to C at `params->batch_stride_d * tid.z`.
//
// Template parameters:
//   T              element type of A, B and C
//   BM, BN, BK     tile extents along M, N and K
//   WM, WN         forwarded to GEMMKernel; WM * WN * 32 bounds the
//                  threadgroup size
//   transpose_a/b  select the pointer-advance and tile-dims layout for A / B
//   AccumType      forwarded to GEMMKernel
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void gather_mm(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    const device uint32_t* lhs_indices [[buffer(2)]],
    const device uint32_t* rhs_indices [[buffer(3)]],
    device T* C [[buffer(4)]],
    const constant GEMMParams* params [[buffer(5)]],
    const constant int* indices_shape [[buffer(6)]],
    const constant int64_t* lhs_strides [[buffer(7)]],
    const constant int64_t* rhs_strides [[buffer(8)]],
    const constant int& batch_ndim_a [[buffer(9)]],
    const constant int* batch_shape_a [[buffer(10)]],
    const constant int64_t* batch_strides_a [[buffer(11)]],
    const constant int& batch_ndim_b [[buffer(12)]],
    const constant int* batch_shape_b [[buffer(13)]],
    const constant int64_t* batch_strides_b [[buffer(14)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]]) {
  using gemm_kernel = GEMMKernel<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      true,
      true,
      AccumType>;

  using loader_a_t = typename gemm_kernel::loader_a_t;
  using loader_b_t = typename gemm_kernel::loader_b_t;
  using mma_t = typename gemm_kernel::mma_t;

  // Exit early if this threadgroup lies outside the tiles_m x tiles_n grid
  if (params->tiles_n <= static_cast<int>(tid.x) ||
      params->tiles_m <= static_cast<int>(tid.y)) {
    return;
  }

  // Move A and B to the locations pointed by lhs_indices and rhs_indices.
  uint32_t indx_A, indx_B;
  if (has_batch) {
    // General case: locate the index entries through the indices shape and
    // the per-operand index strides
    ulong2 indices_offsets = elem_to_loc_broadcast(
        tid.z, indices_shape, lhs_strides, rhs_strides, params->batch_ndim);
    indx_A = lhs_indices[indices_offsets.x];
    indx_B = rhs_indices[indices_offsets.y];
  } else {
    // Here batch_stride_a / batch_stride_b are used as strides into the index
    // arrays, not into A / B
    indx_A = lhs_indices[params->batch_stride_a * tid.z];
    indx_B = rhs_indices[params->batch_stride_b * tid.z];
  }
  // Convert the gathered indices into element offsets for A and B
  A += elem_to_loc(indx_A, batch_shape_a, batch_strides_a, batch_ndim_a);
  B += elem_to_loc(indx_B, batch_shape_b, batch_strides_b, batch_ndim_b);
  C += params->batch_stride_d * tid.z;

  // Prepare threadgroup memory
  threadgroup T As[gemm_kernel::tgp_mem_size_a];
  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];

  // Just make sure everybody's finished with the indexing math above.
  threadgroup_barrier(mem_flags::mem_none);

  // Find block in A, B, C. Offsets are computed in size_t before being
  // multiplied by the leading dimensions.
  const int c_row = tid.y * BM;
  const int c_col = tid.x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  C += c_row_long * params->ldd + c_col_long;

  // Prepare threadgroup mma operation
  thread mma_t mma_op(simd_group_id, simd_lane_id);

  // Prepare threadgroup loading operations
  thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
  thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);

  // Prepare threadgroup bounds: rows / columns of this tile that are inside
  // the M x N output (the full BM / BN when the dimension is aligned)
  const short tgp_bm = align_M ? BM : short(min(BM, params->M - c_row));
  const short tgp_bn = align_N ? BN : short(min(BN, params->N - c_col));

  // Prepare iterations
  int gemm_k_iterations = params->gemm_k_iterations_aligned;

  // Do unaligned K iterations first: handle the partial K tile that follows
  // the last full BK-sized tile, then rewind the loaders.
  if (!align_K) {
    const int k_last = params->gemm_k_iterations_aligned * BK;
    const int k_remain = params->K - k_last;
    const size_t k_jump_a =
        transpose_a ? params->lda * size_t(k_last) : size_t(k_last);
    const size_t k_jump_b =
        transpose_b ? size_t(k_last) : params->ldb * size_t(k_last);

    // Move loader source ahead to end
    loader_a.src += k_jump_a;
    loader_b.src += k_jump_b;

    // Load tile; the K extent goes in .x or .y depending on the transpose flag
    const short2 tile_dims_A =
        transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
    const short2 tile_dims_B =
        transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

    loader_a.load_safe(tile_dims_A);
    loader_b.load_safe(tile_dims_B);

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Do matmul
    mma_op.mma(As, Bs);

    // Reset source back to start
    loader_a.src -= k_jump_a;
    loader_b.src -= k_jump_b;
  }

  // M and N are aligned for the whole matrix: loads and the store are never
  // bounds-checked
  if (align_M && align_N) {
    for (int k = 0; k < gemm_k_iterations; k++) {
      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Load elements into threadgroup
      loader_a.load_unsafe();
      loader_b.load_unsafe();

      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }

    // Store results to device memory
    mma_op.store_result(C, params->ldd);
  } else {
    // The third LoopAlignment flag is always true and lbk is 0 because any K
    // remainder was already consumed above.
    const short lbk = 0;

    // This tile is full in both M and N: no bounds checks
    if ((align_M || tgp_bm == BM) && (align_N || tgp_bn == BN)) {
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          lbk,
          LoopAlignment<true, true, true>{});
      mma_op.store_result(C, params->ldd);
    }

    // Full in N, partial in M: check rows
    else if (align_N || tgp_bn == BN) {
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          lbk,
          LoopAlignment<false, true, true>{});
      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
    }

    // Full in M, partial in N: check columns
    else if (align_M || tgp_bm == BM) {
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          lbk,
          LoopAlignment<true, false, true>{});
      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
    }

    // Nothing aligned so check both rows and cols
    else {
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iterations,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          lbk,
          LoopAlignment<false, false, true>{});
      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.metal
================================================
// Copyright © 2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h"

// Instantiates one `gather_mm_rhs` specialization through `instantiate_kernel`
// (declared outside this file section). The kernel name string is
//   "steel_gather_mm_rhs_<tname>_<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>".
// `otype` is accepted but not forwarded: the template arguments are itype, the
// tile/warp sizes, the two transpose flags and a hard-coded trailing `float`
// (presumably the accumulation type -- confirm against steel_gemm_gather.h).
#define instantiate_gather_mm_rhs(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_kernel(                                                         \
      "steel_gather_mm_rhs_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn \
      "_bk" #bk "_wm" #wm "_wn" #wn,                                          \
      gather_mm_rhs,                                                          \
      itype,                                                                  \
      bm,                                                                     \
      bn,                                                                     \
      bk,                                                                     \
      wm,                                                                     \
      wn,                                                                     \
      trans_a,                                                                \
      trans_b,                                                                \
      float)

// Same as above for the `gather_mm` kernel; only the name prefix
// ("steel_gather_mm_") and the instantiated template differ.
#define instantiate_gather_mm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_kernel(                                                     \
      "steel_gather_mm_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn \
      "_bk" #bk "_wm" #wm "_wn" #wn,                                      \
      gather_mm,                                                          \
      itype,                                                              \
      bm,                                                                 \
      bn,                                                                 \
      bk,                                                                 \
      wm,                                                                 \
      wn,                                                                 \
      trans_a,                                                            \
      trans_b,                                                            \
      float)

// Expands `gather_mm_rhs` for the two layouts with a non-transposed A:
// "nn" (trans_b = false) and "nt" (trans_b = true).
#define instantiate_gather_mm_rhs_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_gather_mm_rhs(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn)  \
  instantiate_gather_mm_rhs(nt, false,  true, iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Expands `gather_mm` for all four transpose combinations (nn, nt, tn, tt).
#define instantiate_gather_mm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_gather_mm(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
  instantiate_gather_mm(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
  instantiate_gather_mm(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
  instantiate_gather_mm(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Tile configurations (bm, bn, bk, wm, wn) emitted for every element type:
// a single 16x64x16 shape for `gather_mm_rhs` and five shapes for `gather_mm`.
// NOTE(review): the host side must request exactly these name strings; any
// shape added or removed here needs the matching change in the dispatch code.
#define instantiate_gather_mm_shapes_helper(iname, itype, oname, otype)                     \
  instantiate_gather_mm_rhs_transpose_helper(iname, itype, oname, otype, 16, 64, 16, 1, 2)  \
  instantiate_gather_mm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 2, 2)      \
  instantiate_gather_mm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 1, 2)      \
  instantiate_gather_mm_transpose_helper(iname, itype, oname, otype, 64, 32, 32, 2, 2)      \
  instantiate_gather_mm_transpose_helper(iname, itype, oname, otype, 32, 64, 16, 1, 2)      \
  instantiate_gather_mm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2)
// clang-format on

// Element types instantiated: half, bfloat16 and float, each with the output
// name/type equal to the input name/type.
instantiate_gather_mm_shapes_helper(float16, half, float16, half);
instantiate_gather_mm_shapes_helper(bfloat16, bfloat16_t, bfloat16, bfloat16_t);
instantiate_gather_mm_shapes_helper(float32, float, float32, float);


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h
================================================
// Copyright © 2024 Apple Inc.

using namespace mlx::steel;

// Function constants at indices 200-202. In gather_mm_rhs_nax below, a true
// value lets the kernel assume the corresponding dimension needs no edge
// handling: align_M / align_N skip the clamping of the simdgroup tile against
// params->M / params->N, and align_K is forwarded to gemm_loop as its K
// alignment flag.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];
constant bool align_K [[function_constant(202)]];

// Gathered right-hand-side matmul (NAX tile variant).
//
// Each threadgroup handles the BM x BN output block selected by `tid`; inside
// it every simdgroup owns an SM x SN sub-tile (SM = BM / WM, SN = BN / WN).
// The rows of that sub-tile are walked in order and split into runs of
// consecutive rows that share the same `rhs_indices` value. For every run one
// matmul against `B + index * params->batch_stride_b` is computed and only the
// rows of that run are written to `C`.
//
// Template parameters:
//   T                 element type of A, B and C.
//   BM, BN            output block size per threadgroup.
//   BK                forwarded to gemm_loop.
//   WM, WN            simdgroup layout; the kernel is capped at WM * WN * 32
//                     threads per threadgroup.
//   transpose_a/b     operand layouts (select how lda / ldb are applied).
//   AccumType         element type of the accumulator tile.
//
// Buffers:
//   A            [[buffer(0)]]  left operand.
//   B            [[buffer(1)]]  right operands, `params->batch_stride_b`
//                               elements apart.
//   rhs_indices  [[buffer(2)]]  one index per output row, selecting the B
//                               matrix used for that row.
//   C            [[buffer(3)]]  output, leading dimension `params->ldd`.
//   params       [[buffer(4)]]  GEMM sizes, strides and tile counts.
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
gather_mm_rhs_nax(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    const device uint32_t* rhs_indices [[buffer(2)]],
    device T* C [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]]) {
  // Per-simdgroup tile extents; TM / TN count 16-wide units of that tile
  // (presumably the NAX fragment size -- confirm against gemm_nax.h).
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;
  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;

  // Threadgroups launched beyond the tile grid have nothing to do.
  if (params->tiles_n <= static_cast<int>(tid.x) ||
      params->tiles_m <= static_cast<int>(tid.y)) {
    return;
  }

  // Find the block in A, B, C
  const int c_row = tid.y * BM;
  const int c_col = tid.x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  C += c_row_long * params->ldd + c_col_long;
  rhs_indices += c_row;

  // Offset of this simdgroup's sub-tile inside the threadgroup block.
  const short tm = SM * (simd_group_id / WN);
  const short tn = SN * (simd_group_id % WN);

  // Number of valid rows / columns in this simdgroup's sub-tile. When the
  // dimension is not aligned the value is clamped against the matrix edge and
  // can be zero or negative for a sub-tile that lies entirely outside.
  const int sgp_sm_int =
      align_M ? int(SM) : min(int(SM), params->M - (c_row + tm));
  const short sgp_sm = short(sgp_sm_int);
  const bool is_unaligned_sm = align_M ? false : (sgp_sm != SM);

  const int sgp_sn_int =
      align_N ? int(SN) : min(int(SN), params->N - (c_col + tn));
  const short sgp_sn = short(sgp_sn_int);
  const bool is_unaligned_sn = align_N ? false : (sgp_sn != SN);

  A += transpose_a ? tm : (tm * params->lda);
  B += transpose_b ? (tn * params->ldb) : tn;
  C += tm * params->ldd + tn;
  rhs_indices += tm;

  // Do as many matmuls as necessary
  //
  // [offset, offset_next) is the current run of rows sharing `index`;
  // index_next / offset_next are primed for the following run.
  uint32_t index;
  short offset;
  // Only read rhs_indices when this simdgroup owns at least one valid row.
  // With sgp_sm <= 0 the pointer has already been advanced past the rows the
  // loop below treats as valid, so an unconditional rhs_indices[0] would read
  // outside that range. The value is unused in that case because the loop
  // does not execute.
  uint32_t index_next = (sgp_sm > 0) ? rhs_indices[0] : uint32_t(0);
  short offset_next = 0;
  int n = 0;
  while (n < sgp_sm) {
    n++;
    offset = offset_next;
    index = index_next;
    offset_next = sgp_sm;
    // Scan forward to the first row that selects a different B matrix.
    for (; n < sgp_sm; n++) {
      if (rhs_indices[n] != index) {
        offset_next = n;
        index_next = rhs_indices[n];
        break;
      }
    }
    // NOTE(review): simdgroups with different `tm` scan different slices of
    // rhs_indices, so when WM > 1 their trip counts through this loop may
    // differ. Confirm that a threadgroup barrier is valid/needed here.
    threadgroup_barrier(mem_flags::mem_none);

    NAXTile<AccumType, TM, TN> Ctile;

    // Turn the runtime alignment state into compile-time flags so gemm_loop
    // is specialized for the aligned / unaligned M, N and K cases.
    dispatch_bool(align_K, [&](auto kAlignedK) {
      dispatch_bool(align_M || !is_unaligned_sm, [&](auto kAlignedM) {
        dispatch_bool(align_N || !is_unaligned_sn, [&](auto kAlignedN) {
          auto do_gemm = gemm_loop<
              T,
              SM,
              SN,
              SK,
              BK,
              transpose_a,
              transpose_b,
              kAlignedM.value,
              kAlignedN.value,
              kAlignedK.value,
              AccumType>;
          // The whole sub-tile is multiplied with the B matrix of the current
          // run; only the run's rows are stored below.
          Ctile = do_gemm(
              A,
              B + index * params->batch_stride_b,
              params->lda,
              params->ldb,
              params->K,
              params->gemm_k_iterations_aligned,
              sgp_sm,
              sgp_sn);

          if constexpr (kAlignedN.value) {
            if (offset_next - offset == SM) {
              // The run covers every row of a full-width tile.
              Ctile.store(C, int(params->ldd));
            } else {
              // Presumably restricts the write to rows [offset, offset_next)
              // and columns [0, SN) -- confirm against NAXTile::store_slice.
              Ctile.store_slice(
                  C,
                  int(params->ldd),
                  short2(0, offset),
                  short2(SN, offset_next));
            }
          } else {
            // Partial width: additionally limit the columns to sgp_sn.
            Ctile.store_slice(
                C,
                int(params->ldd),
                short2(0, offset),
                short2(sgp_sn, offset_next));
          }
        });
      });
    });
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_stdlib>

#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/steel/gemm/gemm_nax.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h"

// clang-format off
// Instantiates one `gather_mm_rhs_nax` specialization through
// `instantiate_kernel` (declared outside this file section). The kernel name
// string is
//   "steel_gather_mm_rhs_nax_<tname>_<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>".
// `otype` is accepted but not forwarded; the trailing `float` fills the
// kernel's AccumType template parameter.
#define instantiate_gather_mm_rhs(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_kernel(                                                             \
      "steel_gather_mm_rhs_nax_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn \
      "_bk" #bk "_wm" #wm "_wn" #wn,                                              \
      gather_mm_rhs_nax,                                                          \
      itype,                                                                      \
      bm,                                                                         \
      bn,                                                                         \
      bk,                                                                         \
      wm,                                                                         \
      wn,                                                                         \
      trans_a,                                                                    \
      trans_b,                                                                    \
      float)

// Expands the kernel for the two layouts with a non-transposed A:
// "nn" (trans_b = false) and "nt" (trans_b = true).
#define instantiate_gather_mm_rhs_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_gather_mm_rhs(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn)  \
  instantiate_gather_mm_rhs(nt, false,  true, iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Tile configurations (bm, bn, bk, wm, wn) emitted per element type: three
// block heights (16, 32, 64) with bn = bk = 128.
#define instantiate_gather_mm_shapes_helper(iname, itype, oname, otype)                      \
  instantiate_gather_mm_rhs_transpose_helper(iname, itype, oname, otype, 16, 128, 128, 1, 4) \
  instantiate_gather_mm_rhs_transpose_helper(iname, itype, oname, otype, 32, 128, 128, 1, 4) \
  instantiate_gather_mm_rhs_transpose_helper(iname, itype, oname, otype, 64, 128, 128, 2, 4)
// clang-format on

// Only the 16-bit float types are instantiated for this variant.
instantiate_gather_mm_shapes_helper(float16, half, float16, half);
instantiate_gather_mm_shapes_helper(bfloat16, bfloat, bfloat16, bfloat);


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/metal/kernels/steel/defines.h"
using namespace metal;
using namespace mlx::steel;

///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////

// Placeholder mask element type meaning "no mask supplied". block_masked_gemm
// detects it with is_same_v to compile the masking code out; every bool
// conversion yields true.
struct _NoMask {
  // Single byte of storage so the type has a definite element size.
  char x;

  // One conversion operator per address-space qualification so the object
  // converts to bool whether it is a plain (non-const) object or a const
  // object in threadgroup, device or constant memory.
  constexpr METAL_FUNC operator bool() {
    return true;
  }
  constexpr METAL_FUNC operator bool() const threadgroup {
    return true;
  }
  constexpr METAL_FUNC operator bool() const device {
    return true;
  }
  constexpr METAL_FUNC operator bool() const constant {
    return true;
  }
};

// Multiplicative functor: converts its argument to OutT and multiplies the
// result by the stored `scale`. block_masked_gemm uses it to apply
// non-boolean mask values.
template <typename OutT, typename InT = OutT>
struct ScaleOp {
  // Factor applied by apply().
  OutT scale;

  // Returns `x`, converted to OutT, times `scale`.
  METAL_FUNC OutT apply(InT x) const {
    const OutT converted = static_cast<OutT>(x);
    return converted * scale;
  }
};

// Alias used as the `out_mask_t` / `op_mask_t` template argument to disable
// the corresponding mask in block_masked_gemm.
typedef struct _NoMask nomask_t;

// Block-masked GEMM: each threadgroup produces one BM x BN tile of D from A
// and B, with optional per-block masks on the output and on both operands.
//
// Template parameters:
//   T            element type of A, B and D.
//   out_mask_t   element type of `out_mask`. `nomask_t` disables the output
//                mask, `bool` makes it an on/off gate, any other type is
//                additionally applied as a multiplicative scale.
//   op_mask_t    same convention for `lhs_mask` and `rhs_mask`.
//   BM, BN, BK   tile sizes; BM == BN and BM % BK == 0 are enforced below.
//   WM, WN       the kernel is capped at WM * WN * 32 threads per threadgroup.
//   transpose_a, transpose_b  operand layouts.
//   MN_aligned   when true, all edge handling along M and N is compiled out.
//   K_aligned    when false, the K remainder is processed before the main loop.
//
// Buffers:
//   A, B           [[buffer(0)]], [[buffer(1)]]  operands.
//   D              [[buffer(3)]]   output.
//   params         [[buffer(4)]]   GEMM sizes, strides, tile counts, swizzle.
//   batch_shape    [[buffer(6)]]   batch dimensions.
//   batch_strides  [[buffer(7)]]   groups of params->batch_ndim strides: A, B,
//                                  then the masks that are present (output
//                                  mask first, then lhs, then rhs).
//   out_mask, lhs_mask, rhs_mask  [[buffer(10..12)]]  one entry per block.
//   mask_strides   [[buffer(13)]]  two strides per present mask, in the order
//                                  output, lhs, rhs.
template <
    typename T,
    typename out_mask_t,
    typename op_mask_t,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    bool MN_aligned,
    bool K_aligned>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
block_masked_gemm(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* D [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    const constant int* batch_shape [[buffer(6)]],
    const constant int64_t* batch_strides [[buffer(7)]],
    const device out_mask_t* out_mask [[buffer(10)]],
    const device op_mask_t* lhs_mask [[buffer(11)]],
    const device op_mask_t* rhs_mask [[buffer(12)]],
    const constant int* mask_strides [[buffer(13)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  // Appease the compiler
  (void)lid;

  static_assert(
      BM == BN,
      "block_masked_gemm must have the same block M and block N size");
  static_assert(BM % BK == 0, "block_masked_gemm must have BM % BK == 0");

  // Which masks exist is decided purely by the mask element types.
  constexpr bool has_operand_mask = !metal::is_same_v<op_mask_t, nomask_t>;
  constexpr bool has_output_mask = !metal::is_same_v<out_mask_t, nomask_t>;

  // Non-bool masks are also used as multiplicative factors.
  constexpr bool has_mul_operand_mask =
      has_operand_mask && !metal::is_same_v<op_mask_t, bool>;
  constexpr bool has_mul_output_mask =
      has_output_mask && !metal::is_same_v<out_mask_t, bool>;

  // Number of BK-sized K iterations covered by one operand-mask entry.
  constexpr short k_mask_factor = short(BM / BK);

  using gemm_kernel = GEMMKernel<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      MN_aligned,
      K_aligned>;

  // Undo the threadgroup swizzle to get the logical tile coordinates.
  const int tid_y = ((tid.y) << params->swizzle_log) +
      ((tid.x) & ((1 << params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> params->swizzle_log;

  // Swizzling can launch threadgroups outside the tile grid.
  if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
    return;
  }

  // Mask batch strides follow the A and B batch strides.
  const constant auto* mask_batch_strides =
      batch_strides + 2 * params->batch_ndim;

  // Move the mask pointers to the batch element tid.z.
  if (params->batch_ndim > 1) {
    if (has_output_mask) {
      out_mask += elem_to_loc(
          tid.z, batch_shape, mask_batch_strides, params->batch_ndim);

      // Skip the output-mask strides so the operand strides come next.
      mask_batch_strides += params->batch_ndim;
    }

    if (has_operand_mask) {
      const constant auto* mask_strides_lhs = mask_batch_strides;
      const constant auto* mask_strides_rhs =
          mask_strides_lhs + params->batch_ndim;

      ulong2 batch_offsets = elem_to_loc_broadcast(
          tid.z,
          batch_shape,
          mask_strides_lhs,
          mask_strides_rhs,
          params->batch_ndim);

      lhs_mask += batch_offsets.x;
      rhs_mask += batch_offsets.y;
    }
  } else {
    // Single batch stride per mask: plain multiplication suffices.
    if (has_output_mask) {
      out_mask += tid.z * mask_batch_strides[0];
      mask_batch_strides += params->batch_ndim;
    }

    if (has_operand_mask) {
      lhs_mask += tid.z * mask_batch_strides[0];
      rhs_mask += tid.z * mask_batch_strides[params->batch_ndim];
    }
  }

  // Adjust for batch
  if (params->batch_ndim > 1) {
    const constant auto* A_bstrides = batch_strides;
    const constant auto* B_bstrides = batch_strides + params->batch_ndim;

    ulong2 batch_offsets = elem_to_loc_broadcast(
        tid.z, batch_shape, A_bstrides, B_bstrides, params->batch_ndim);

    A += batch_offsets.x;
    B += batch_offsets.y;

  } else {
    A += params->batch_stride_a * tid.z;
    B += params->batch_stride_b * tid.z;
  }

  D += params->batch_stride_d * tid.z;

  // Find block in A, B, C
  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  D += c_row_long * params->ldd + c_col_long;

  // mask_strides holds two entries per present mask, packed in the order
  // output, lhs, rhs.
  const constant int* out_mask_strides = mask_strides;
  const constant int* lhs_mask_strides =
      mask_strides + (has_output_mask ? 2 : 0);
  const constant int* rhs_mask_strides =
      lhs_mask_strides + (has_operand_mask ? 2 : 0);

  // Starting mask entries for this tile; the operand offsets advance by
  // *_mask_step each time k_factor_cnt wraps in the K loops below.
  const int out_mask_offset = !has_output_mask
      ? 0
      : tid_y * out_mask_strides[1] + tid_x * out_mask_strides[0];
  int lhs_mask_offset = !has_operand_mask ? 0 : tid_y * lhs_mask_strides[1];
  int rhs_mask_offset = !has_operand_mask ? 0 : tid_x * rhs_mask_strides[0];
  const int lhs_mask_step = !has_operand_mask ? 0 : lhs_mask_strides[0];
  const int rhs_mask_step = !has_operand_mask ? 0 : rhs_mask_strides[1];
  short k_factor_cnt = k_mask_factor;

  // `scale` members are only assigned (and only read) on the has_mul_* paths.
  ScaleOp<float> out_mask_op;
  ScaleOp<T> lhs_mask_op;
  ScaleOp<T> rhs_mask_op;

  if (has_output_mask) {
    auto mask_out = out_mask[out_mask_offset];

    if (has_mul_output_mask) {
      out_mask_op.scale = float(mask_out);
    }

    // Write zeros and return
    if (!mask_out) {
      constexpr short tgp_size = WM * WN * 32;
      constexpr short vec_size = 4;

      // Tile threads in threadgroup
      constexpr short TN = BN / vec_size;
      constexpr short TM = tgp_size / TN;

      // Each thread zeroes vec_size consecutive columns starting at (bi, bj)
      // and then strides down the tile by TM rows.
      const short thread_idx = simd_group_id * 32 + simd_lane_id;
      const short bi = thread_idx / TN;
      const short bj = vec_size * (thread_idx % TN);

      D += bi * params->ldd + bj;

      short tgp_bm = min(BM, params->M - c_row);
      short tgp_bn = min(BN, params->N - c_col);

      if (MN_aligned || (tgp_bm == BM && tgp_bn == BN)) {
        // Full tile: no bounds checks needed.
        for (short ti = 0; ti < BM; ti += TM) {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < vec_size; j++) {
            D[ti * params->ldd + j] = T(0.);
          }
        }
      } else {
        // Edge tile: clamp columns to jmax (may be <= 0, which skips the
        // inner loop) and rows to tgp_bm.
        short jmax = tgp_bn - bj;
        jmax = jmax < vec_size ? jmax : vec_size;
        for (short ti = 0; (bi + ti) < tgp_bm; ti += TM) {
          for (short j = 0; j < jmax; j++) {
            D[ti * params->ldd + j] = T(0.);
          }
        }
      }

      return;
    }
  }

  threadgroup_barrier(mem_flags::mem_none);

  // Prepare threadgroup mma operation
  thread typename gemm_kernel::mma_t mma_op(simd_group_id, simd_lane_id);

  threadgroup T As[gemm_kernel::tgp_mem_size_a];
  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];

  // Prepare threadgroup loading operations
  thread typename gemm_kernel::loader_a_t loader_a(
      A, params->lda, As, simd_group_id, simd_lane_id);
  thread typename gemm_kernel::loader_b_t loader_b(
      B, params->ldb, Bs, simd_group_id, simd_lane_id);

  // Prepare threadgroup bounds
  const short tgp_bm =
      MN_aligned ? short(BM) : short(min(BM, params->M - c_row));
  const short tgp_bn =
      MN_aligned ? short(BN) : short(min(BN, params->N - c_col));

  int gemm_k_iterations = params->gemm_k_iterations_aligned;

  ///////////////////////////////////////////////////////////////////////////////
  // Do unaligned K iterations first
  if (!K_aligned) {
    // First K index past the aligned iterations, and the operand-mask entry
    // that covers it.
    const int k_last = params->gemm_k_iterations_aligned * BK;
    const int mask_idx_last = k_last / BM;

    // Skip the remainder tile entirely when either operand block is masked.
    if (!has_operand_mask ||
        (bool(lhs_mask[lhs_mask_offset + mask_idx_last * lhs_mask_step]) &&
         bool(rhs_mask[rhs_mask_offset + mask_idx_last * rhs_mask_step]))) {
      if (has_mul_operand_mask) {
        lhs_mask_op.scale =
            lhs_mask[lhs_mask_offset + mask_idx_last * lhs_mask_step];
        rhs_mask_op.scale =
            rhs_mask[rhs_mask_offset + mask_idx_last * rhs_mask_step];
      }

      // Move loader source ahead to end
      const int k_remain = params->K - k_last;
      const size_t k_jump_a =
          transpose_a ? params->lda * size_t(k_last) : size_t(k_last);
      const size_t k_jump_b =
          transpose_b ? size_t(k_last) : params->ldb * size_t(k_last);

      loader_a.src += k_jump_a;
      loader_b.src += k_jump_b;

      // Load tile
      const short2 tile_dims_A =
          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
      const short2 tile_dims_B =
          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);

      loader_a.load_safe(tile_dims_A);
      loader_b.load_safe(tile_dims_B);

      if (has_mul_operand_mask) {
        loader_a.apply_inplace_op(lhs_mask_op);
        loader_b.apply_inplace_op(rhs_mask_op);
      }

      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Do matmul
      mma_op.mma(As, Bs);

      // Reset source back to start
      loader_a.src -= k_jump_a;
      loader_b.src -= k_jump_b;
    }
  }

  ///////////////////////////////////////////////////////////////////////////////
  // MNK aligned loop
  if (MN_aligned) {
    for (; gemm_k_iterations > 0; gemm_k_iterations--) {
      // Make sure the previous mma is done with As/Bs before reloading them.
      threadgroup_barrier(mem_flags::mem_threadgroup);

      if (!has_operand_mask ||
          (bool(lhs_mask[lhs_mask_offset]) &&
           bool(rhs_mask[rhs_mask_offset]))) {
        if (has_mul_operand_mask) {
          lhs_mask_op.scale = lhs_mask[lhs_mask_offset];
          rhs_mask_op.scale = rhs_mask[rhs_mask_offset];
        }

        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        if (has_mul_operand_mask) {
          loader_a.apply_inplace_op(lhs_mask_op);
          loader_b.apply_inplace_op(rhs_mask_op);
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);
      }

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();

      // Advance to the next operand-mask entry every k_mask_factor
      // iterations, then reset the counter.
      k_factor_cnt--;
      lhs_mask_offset += k_factor_cnt == 0 ? lhs_mask_step : 0;
      rhs_mask_offset += k_factor_cnt == 0 ? rhs_mask_step : 0;
      k_factor_cnt = k_factor_cnt == 0 ? k_mask_factor : k_factor_cnt;
    }

    if (has_mul_output_mask) {
      mma_op.apply_epilogue(out_mask_op);
    }

    // Store results to device memory
    mma_op.store_result(D, params->ldd);
    return;

  }
  ///////////////////////////////////////////////////////////////////////////////
  // MN unaligned loop
  else {
    // Whether this particular tile still happens to be full along M / N.
    const bool M_aligned = (tgp_bm == BM);
    const bool N_aligned = (tgp_bn == BN);

    const short2 tile_dims_A =
        transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);
    const short2 tile_dims_B =
        transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);

    for (; gemm_k_iterations > 0; gemm_k_iterations--) {
      threadgroup_barrier(mem_flags::mem_threadgroup);
      if (!has_operand_mask ||
          (bool(lhs_mask[lhs_mask_offset]) &&
           bool(rhs_mask[rhs_mask_offset]))) {
        if (has_mul_operand_mask) {
          lhs_mask_op.scale = lhs_mask[lhs_mask_offset];
          rhs_mask_op.scale = rhs_mask[rhs_mask_offset];
        }

        // Load elements into threadgroup
        if (M_aligned) {
          loader_a.load_unsafe();
        } else {
          loader_a.load_safe(tile_dims_A);
        }

        if (N_aligned) {
          loader_b.load_unsafe();
        } else {
          loader_b.load_safe(tile_dims_B);
        }

        if (has_mul_operand_mask) {
          loader_a.apply_inplace_op(lhs_mask_op);
          loader_b.apply_inplace_op(rhs_mask_op);
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);
      }

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();

      // Same mask stepping as in the aligned loop.
      k_factor_cnt--;
      lhs_mask_offset += k_factor_cnt == 0 ? lhs_mask_step : 0;
      rhs_mask_offset += k_factor_cnt == 0 ? rhs_mask_step : 0;
      k_factor_cnt = k_factor_cnt == 0 ? k_mask_factor : k_factor_cnt;
    }

    if (has_mul_output_mask) {
      mma_op.apply_epilogue(out_mask_op);
    }

    // Full tiles take the unchecked store; edge tiles are clamped to
    // (tgp_bn, tgp_bm).
    if (M_aligned && N_aligned) {
      mma_op.store_result(D, params->ldd);
    } else {
      mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
    }
  }
}

template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    bool MN_aligned,
    bool K_aligned,
    bool has_operand_mask = false>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void
block_masked_gemm(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* D [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    const constant int* batch_shape [[buffer(6)]],
    const constant int64_t* batch_strides [[buffer(7)]],
    const device bool* out_mask [[buffer(10)]],
    const device bool* lhs_mask [[buffer(11)]],
    const device bool* rhs_mask [[buffer(12)]],
    const constant int* mask_strides [[buffer(13)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  // Appease the compiler
  (void)lid;

  using gemm_kernel = GEMMKernel<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      MN_aligned,
      K_aligned>;

  const int tid_y = ((tid.y) << params->swizzle_log) +
      ((tid.x) & ((1 << params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> params->swizzle_log;

  if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
    return;
  }

  if (params->batch_ndim > 1) {
    const constant auto* mask_batch_strides =
        batch_strides + 2 * params->batch_ndim;
    out_mask +=
        elem_to_loc(tid.z, batch_shape, mask_batch_strides, params->batch_ndim);

    if (has_operand_mask) {
      const constant auto* mask_strides_lhs =
          mask_batch_strides + params->batch_ndim;
      const constant auto* mask_strides_rhs =
          mask_strides_lhs + params->batch_ndim;

      ulong2 batch_offsets = elem_to_loc_broadcast(
          tid.z,
          batch_shape,
          mask_strides_lhs,
          mask_strides_rhs,
          params->batch_ndim);

      lhs_mask += batch_offsets.x;
      rhs_mask += batch_offsets.y;
    }
  } else {
    out_mask += tid.z * batch_strides[2 * params->batch_ndim];
    if (has_operand_mask) {
      lhs_mask += tid.z * batch_strides[3 * params->batch_ndim];
      rhs_mask += tid.z * batch_strides[4 * params->batch_ndim];
    }
  }

  // Adjust for batch
  if (params->batch_ndim > 1) {
    const constant auto* A_bstrides = batch_strides;
    const constant auto* B_bstrides = batch_strides + params->batch_ndim;

    ulong2 batch_offsets = elem_to_loc_broadcast(
        tid.z, batch_shape, A_bstrides, B_bstrides, params->batch_ndim);

    A += batch_offsets.x;
    B += batch_offsets.y;

  } else {
    A += params->batch_stride_a * tid.z;
    B += params->batch_stride_b * tid.z;
  }

  D += params->batch_stride_d * tid.z;

  // Find block in A, B, C
  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  D += c_row_long * params->ldd + c_col_long;

  bool mask_out = out_mask[tid_y * mask_strides[1] + tid_x * mask_strides[0]];

  // Write zeros and return
  if (!mask_out) {
    constexpr short tgp_size = WM * WN * 32;
    constexpr short vec_size = 4;

    // Tile threads in threadgroup
    constexpr short TN = BN / vec_size;
    constexpr short TM = tgp_size / TN;

    const short thread_idx = simd_group_id * 32 + simd_lane_id;
    const short bi = thread_idx / TN;
    const short bj = vec_size * (thread_idx % TN);

    D += bi * params->ldd + bj;

    short tgp_bm = min(BM, params->M - c_row);
    short tgp_bn = min(BN, params->N - c_col);

    if (MN_aligned || (tgp_bm == BM && tgp_bn == BN)) {
      for (short ti = 0; ti < BM; ti += TM) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < vec_size; j++) {
          D[ti * params->ldd + j] = T(0.);
        }
      }
    } else {
      short jmax = tgp_bn - bj;
      jmax = jmax < vec_size ? jmax : vec_size;
      for (short ti = 0; (bi + ti) < tgp_bm; ti += TM) {
        for (short j = 0; j < jmax; j++) {
          D[ti * params->ldd + j] = T(0.);
        }
      }
    }

    return;
  }

  threadgroup_barrier(mem_flags::mem_none);

  // Prepare threadgroup mma operation
  thread typename gemm_kernel::mma_t mma_op(simd_group_id, simd_lane_id);

  int gemm_k_iterations = params->gemm_k_iterations_aligned;

  threadgroup T As[gemm_kernel::tgp_mem_size_a];
  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];

  // Prepare threadgroup loading operations
  thread typename gemm_kernel::loader_a_t loader_a(
      A, params->lda, As, simd_group_id, simd_lane_id);
  thread typename gemm_kernel::loader_b_t loader_b(
      B, params->ldb, Bs, simd_group_id, simd_lane_id);

  ///////////////////////////////////////////////////////////////////////////////
  // MNK aligned loop
  if (MN_aligned) {
    for (int k = 0; k < gemm_k_iterations; k++) {
      threadgroup_barrier(mem_flags::mem_threadgroup);

      if (!has_operand_mask ||
          (lhs_mask
               [tid_y * mask_strides[3] + ((k * BK) / BM) * mask_strides[2]] &&
           rhs_mask
               [((k * BK) / BM) * mask_strides[5] + tid_x * mask_strides[4]])) {
        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);
      }

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }

    threadgroup_barrier(mem_flags::mem_none);

    // Loop tail
    if (!K_aligned) {
      if (!has_operand_mask ||
          (lhs_mask
               [tid_y * mask_strides[3] + (params->K / BM) * mask_strides[2]] &&
           rhs_mask
               [(params->K / BM) * mask_strides[5] +
                tid_x * mask_strides[4]])) {
        int lbk = params->K - params->gemm_k_iterations_aligned * BK;
        short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);
        short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);

        loader_a.load_safe(tile_dims_A);
        loader_b.load_safe(tile_dims_B);

        threadgroup_barrier(mem_flags::mem_threadgroup);

        mma_op.mma(As, Bs);
      }
    }

    // Store results to device memory
    mma_op.store_result(D, params->ldd);
    return;

  }
  ///////////////////////////////////////////////////////////////////////////////
  // MN unaligned loop
  else { // Loop over K - unaligned case
    short tgp_bm = min(BM, params->M - c_row);
    short tgp_bn = min(BN, params->N - c_col);
    short lbk = params->K - params->gemm_k_iterations_aligned * BK;

    bool M_aligned = (tgp_bm == BM);
    bool N_aligned = (tgp_bn == BN);

    short2 tile_dims_A = transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);
    short2 tile_dims_B = transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);

    for (int k = 0; k < gemm_k_iterations; k++) {
      threadgroup_barrier(mem_flags::mem_threadgroup);
      if (!has_operand_mask ||
          (lhs_mask
               [tid_y * mask_strides[3] + ((k * BK) / BM) * mask_strides[2]] &&
           rhs_mask
               [((k * BK) / BM) * mask_strides[5] + tid_x * mask_strides[4]])) {
        // Load elements into threadgroup
        if (M_aligned) {
          loader_a.load_unsafe();
        } else {
          loader_a.load_safe(tile_dims_A);
        }

        if (N_aligned) {
          loader_b.load_unsafe();
        } else {
          loader_b.load_safe(tile_dims_B);
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);
      }

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }

    if (!K_aligned) {
      threadgroup_barrier(mem_flags::mem_threadgroup);

      if (!has_operand_mask ||
          (lhs_mask
               [tid_y * mask_strides[3] + (params->K / BM) * mask_strides[2]] &&
           rhs_mask
               [(params->K / BM) * mask_strides[5] +
                tid_x * mask_strides[4]])) {
        short2 tile_dims_A_last =
            transpose_a ? short2(tgp_bm, lbk) : short2(lbk, tgp_bm);
        short2 tile_dims_B_last =
            transpose_b ? short2(lbk, tgp_bn) : short2(tgp_bn, lbk);

        loader_a.load_safe(tile_dims_A_last);
        loader_b.load_safe(tile_dims_B_last);

        threadgroup_barrier(mem_flags::mem_threadgroup);

        mma_op.mma(As, Bs);
      }
    }

    if (M_aligned && N_aligned) {
      mma_op.store_result(D, params->ldd);
    } else {
      mma_op.store_result_safe(D, params->ldd, short2(tgp_bn, tgp_bm));
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.metal
================================================
// Copyright © 2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h"

// Instantiates a single `block_masked_gemm` specialization.
//
// The first argument handed to `instantiate_kernel` (defined outside this
// file) is a string assembled from the stringized macro arguments:
//   steel_gemm_block_outmask_<outmaskname>_opmask_<opmaskname>_<tname>
//     _<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>_MN_<aname>_K_<kname>
// The remaining arguments are forwarded as the template arguments
//   <itype, outmasktype, opmasktype, bm, bn, bk, wm, wn,
//    trans_a, trans_b, mn_aligned, k_aligned>.
// Note that `oname` only contributes to the name and `otype` is not forwarded
// at all.
// NOTE(review): the string is presumably the name the host uses to look the
// kernel up, so its spelling should be treated as part of the interface.
#define instantiate_gemm(                                              \
    outmaskname,                                                       \
    outmasktype,                                                       \
    opmaskname,                                                        \
    opmasktype,                                                        \
    tname,                                                             \
    trans_a,                                                           \
    trans_b,                                                           \
    iname,                                                             \
    itype,                                                             \
    oname,                                                             \
    otype,                                                             \
    bm,                                                                \
    bn,                                                                \
    bk,                                                                \
    wm,                                                                \
    wn,                                                                \
    aname,                                                             \
    mn_aligned,                                                        \
    kname,                                                             \
    k_aligned)                                                         \
  instantiate_kernel(                                                  \
    "steel_gemm_block_outmask_" #outmaskname                           \
      "_opmask_" #opmaskname "_" #tname "_" #iname "_" #oname          \
      "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn                \
      "_MN_" #aname "_K_" #kname,                                      \
    block_masked_gemm,                                                 \
      itype,                                                           \
      outmasktype,                                                     \
      opmasktype,                                                      \
      bm,                                                              \
      bn,                                                              \
      bk,                                                              \
      wm,                                                              \
      wn,                                                              \
      trans_a,                                                         \
      trans_b,                                                         \
      mn_aligned,                                                      \
      k_aligned)

// Expands `instantiate_gemm` once per (output mask, operand mask) type pair
// for a fixed transpose / dtype / tile / alignment configuration. The eight
// pairs, in order, are:
//   (bool, bool), (itype, itype), (bool, itype), (itype, bool),
//   (nomask_t, bool), (nomask_t, itype), (bool, nomask_t), (itype, nomask_t)
// The (nomask_t, nomask_t) pair is intentionally absent from this list.
#define instantiate_gemm_mask_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)                \
  instantiate_gemm(bool_, bool, bool_, bool, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)        \
  instantiate_gemm(iname, itype, iname, itype, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)      \
  instantiate_gemm(bool_, bool, iname, itype, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)       \
  instantiate_gemm(iname, itype, bool_, bool, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)       \
  instantiate_gemm(nomask, nomask_t, bool_, bool, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)   \
  instantiate_gemm(nomask, nomask_t, iname, itype, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)  \
  instantiate_gemm(bool_, bool, nomask, nomask_t, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)   \
  instantiate_gemm(iname, itype, nomask, nomask_t, tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, aname, mn_aligned, kname, k_aligned)

// Expands the mask helper for all four (MN aligned, K aligned) combinations.
// `taligned` / `naligned` are the name fragments paired with the template
// flags `true` / `false` respectively.
#define instantiate_gemm_aligned_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn)                         \
  instantiate_gemm_mask_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, taligned, true)  \
  instantiate_gemm_mask_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, naligned, false) \
  instantiate_gemm_mask_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, taligned, true) \
  instantiate_gemm_mask_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, naligned, false)

// Expands the alignment helper for the four transpose combinations. The name
// fragment encodes (trans_a, trans_b): nn = (false, false), nt = (false, true),
// tn = (true, false), tt = (true, true).
#define instantiate_gemm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn)             \
    instantiate_gemm_aligned_helper(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_aligned_helper(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_aligned_helper(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_aligned_helper(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Expands the transpose helper for each supported tile configuration
// (bm, bn, bk, wm, wn): (32, 32, 16, 2, 2) and (64, 64, 16, 2, 2).
#define instantiate_gemm_shapes_helper(iname, itype, oname, otype)                  \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 2, 2)

// Emit the masked GEMM kernels for half, bfloat16 and float inputs; the
// output name / type arguments mirror the input ones.
instantiate_gemm_shapes_helper(float16, half, float16, half);
instantiate_gemm_shapes_helper(bfloat16, bfloat16_t, bfloat16, bfloat16_t);
instantiate_gemm_shapes_helper(float32, float, float32, float); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h
================================================
// Copyright © 2025 Apple Inc.

using namespace mlx::steel;

// Function constants consumed by `segmented_mm` below.
//
// segments_contiguous: when true, segment z is read as the pair
// segments[2 * z], segments[2 * z + 1]; when false it is read as
// segments[z], segments[z + 1].
constant bool segments_contiguous [[function_constant(199)]];
// align_M: when true, every tile uses the full BM rows and the row count is
// not clamped against params->M.
constant bool align_M [[function_constant(200)]];
// align_N: when true, every tile uses the full BN columns and the column
// count is not clamped against params->N.
constant bool align_N [[function_constant(201)]];

template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void segmented_mm(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    const device uint32_t* segments [[buffer(2)]],
    device T* C [[buffer(3)]],
    const constant GEMMParams* params [[buffer(4)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]]) {
  using gemm_kernel = GEMMKernel<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      true,
      true,
      AccumType>;

  using loader_a_t = typename gemm_kernel::loader_a_t;
  using loader_b_t = typename gemm_kernel::loader_b_t;
  using mma_t = typename gemm_kernel::mma_t;

  if (params->tiles_n <= static_cast<int>(tid.x) ||
      params->tiles_m <= static_cast<int>(tid.y)) {
    return;
  }

  // Prepare threadgroup memory
  threadgroup T As[gemm_kernel::tgp_mem_size_a];
  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];

  // Find the block in A, B, C
  const int c_row = tid.y * BM;
  const int c_col = tid.x * BN;
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);

  // Prepare threadgroup bounds
  const short tgp_bm = align_M ? BM : short(min(BM, params->M - c_row));
  const short tgp_bn = align_N ? BN : short(min(BN, params->N - c_col));

  // Move the pointers to the output tile
  A += transpose_a ? c_row_long : c_row_long * params->lda;
  B += transpose_b ? c_col_long * params->ldb : c_col_long;
  C += c_row_long * params->ldd + c_col_long;

  // Move the pointers to the start of the segment
  uint32_t k_start, k_end;
  if (segments_contiguous) {
    k_start = segments[2 * tid.z];
    k_end = segments[2 * tid.z + 1];
  } else {
    // We accept either contiguous (above) or weird strides where the beginning
    // of the next one is the previous one. Basically the last two strides are
    // both 1!
    k_start = segments[tid.z];
    k_end = segments[tid.z + 1];
  }
  A += transpose_a ? k_start * params->lda : k_start;
  B += transpose_b ? k_start : k_start * params->ldb;
  C += tid.z * params->batch_stride_d;

  // Prepare threadgroup mma operation
  thread mma_t mma_op(simd_group_id, simd_lane_id);

  // Prepare threadgroup loading operations
  thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
  thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);

  // Matrix level alignment so only check K
  if (align_M && align_N) {
    uint32_t k = k_start + BK;
    for (; k <= k_end; k += BK) {
      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Load elements into threadgroup
      loader_a.load_unsafe();
      loader_b.load_unsafe();

      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }
    short k_remain = BK - short(k - k_end);
    const short2 tile_dims_A =
        transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
    const short2 tile_dims_B =
        transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
    if (k_remain > 0) {
      threadgroup_barrier(mem_flags::mem_threadgroup);
      loader_a.load_safe(tile_dims_A);
      loader_b.load_safe(tile_dims_B);
      threadgroup_barrier(mem_flags::mem_threadgroup);
      mma_op.mma(As, Bs);
    }
    mma_op.store_result(C, params->ldd);
  } else {
    // Tile aligned do the same as above
    if ((align_M || tgp_bm == BM) && (align_N || tgp_bn == BN)) {
      uint32_t k = k_start + BK;
      for (; k <= k_end; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }
      short k_remain = BK - short(k - k_end);
      const short2 tile_dims_A =
          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
      const short2 tile_dims_B =
          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
      if (k_remain > 0) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_a.load_safe(tile_dims_A);
        loader_b.load_safe(tile_dims_B);
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(As, Bs);
      }
      mma_op.store_result(C, params->ldd);
    }

    // Tile partially aligned check rows
    else if (align_N || tgp_bn == BN) {
      uint32_t k = k_start + BK;
      for (; k <= k_end; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Load elements into threadgroup
        loader_a.load_safe(
            transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm));
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }
      short k_remain = BK - short(k - k_end);
      const short2 tile_dims_A =
          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
      const short2 tile_dims_B =
          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
      if (k_remain > 0) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_a.load_safe(tile_dims_A);
        loader_b.load_safe(tile_dims_B);
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(As, Bs);
      }
      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
    }

    // Tile partially aligned check cols
    else if (align_M || tgp_bm == BM) {
      uint32_t k = k_start + BK;
      for (; k <= k_end; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_safe(
            transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK));

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }
      short k_remain = BK - short(k - k_end);
      const short2 tile_dims_A =
          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
      const short2 tile_dims_B =
          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
      if (k_remain > 0) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_a.load_safe(tile_dims_A);
        loader_b.load_safe(tile_dims_B);
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(As, Bs);
      }
      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
    }

    // Nothing aligned so check both rows and cols
    else {
      uint32_t k = k_start + BK;
      for (; k <= k_end; k += BK) {
        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Load elements into threadgroup
        loader_a.load_safe(
            transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm));
        loader_b.load_safe(
            transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK));

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }
      short k_remain = BK - short(k - k_end);
      const short2 tile_dims_A =
          transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);
      const short2 tile_dims_B =
          transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);
      if (k_remain > 0) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        loader_a.load_safe(tile_dims_A);
        loader_b.load_safe(tile_dims_B);
        threadgroup_barrier(mem_flags::mem_threadgroup);
        mma_op.mma(As, Bs);
      }
      mma_op.store_result_safe(C, params->ldd, short2(tgp_bn, tgp_bm));
    }
  }
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.metal
================================================
// Copyright © 2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h"

// Instantiates a single `segmented_mm` specialization.
//
// The string handed to `instantiate_kernel` (defined outside this file) is
//   steel_segmented_mm_<tname>_<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>
//     _wm<wm>_wn<wn>
// and the template arguments are
//   <itype, bm, bn, bk, wm, wn, trans_a, trans_b, float>
// i.e. the accumulation type is always `float`. `oname` only contributes to
// the name and `otype` is not forwarded.
#define instantiate_segmented_mm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_kernel(                                                         \
      "steel_segmented_mm_" #tname "_" #iname "_" #oname "_bm" #bm "_bn" #bn  \
      "_bk" #bk "_wm" #wm "_wn" #wn,                                          \
      segmented_mm,                                                           \
      itype,                                                                  \
      bm,                                                                     \
      bn,                                                                     \
      bk,                                                                     \
      wm,                                                                     \
      wn,                                                                     \
      trans_a,                                                                \
      trans_b,                                                                \
      float)

// Expands `instantiate_segmented_mm` for the four transpose combinations:
// nn = (false, false), nt = (false, true), tn = (true, false),
// tt = (true, true) for (trans_a, trans_b).
#define instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_segmented_mm(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
  instantiate_segmented_mm(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
  instantiate_segmented_mm(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn)      \
  instantiate_segmented_mm(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Expands the transpose helper for each supported tile configuration
// (bm, bn, bk, wm, wn):
//   (64, 64, 16, 2, 2), (64, 64, 16, 1, 2), (64, 32, 32, 2, 2),
//   (32, 64, 16, 1, 2), (32, 32, 16, 2, 2)
#define instantiate_segmented_mm_shapes_helper(iname, itype, oname, otype)                 \
  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 2, 2)  \
  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 64, 64, 16, 1, 2)  \
  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 64, 32, 32, 2, 2)  \
  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 32, 64, 16, 1, 2)  \
  instantiate_segmented_mm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2)
// clang-format on

// Emit the segmented matmul kernels for half, bfloat16 and float inputs.
instantiate_segmented_mm_shapes_helper(float16, half, float16, half);
instantiate_segmented_mm_shapes_helper(
    bfloat16,
    bfloat16_t,
    bfloat16,
    bfloat16_t);
instantiate_segmented_mm_shapes_helper(float32, float, float32, float);


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h
================================================
// Copyright © 2024 Apple Inc.

using namespace mlx::steel;

///////////////////////////////////////////////////////////////////////////////
// GEMM kernels
///////////////////////////////////////////////////////////////////////////////

template <
    typename T,
    typename U,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    bool MN_aligned,
    bool K_aligned>
[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] void gemm_splitk(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device U* C [[buffer(2)]],
    const constant GEMMSpiltKParams* params [[buffer(3)]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]]) {
  (void)lid;

  using gemm_kernel = GEMMKernel<
      T,
      U,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      MN_aligned,
      K_aligned>;
  using loader_a_t = typename gemm_kernel::loader_a_t;
  using loader_b_t = typename gemm_kernel::loader_b_t;
  using mma_t = typename gemm_kernel::mma_t;

  threadgroup T As[gemm_kernel::tgp_mem_size_a];
  threadgroup T Bs[gemm_kernel::tgp_mem_size_b];

  const int tid_x = tid.x;
  const int tid_y = tid.y;
  const int tid_z = tid.z;

  if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
    return;
  }

  // Find block in A, B, C
  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const int k_start = params->split_k_partition_size * tid_z;

  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);
  const size_t k_start_long = size_t(k_start);

  A += transpose_a ? (c_row_long + k_start_long * params->lda)
                   : (k_start_long + c_row_long * params->lda);
  B += transpose_b ? (k_start_long + c_col_long * params->ldb)
                   : (c_col_long + k_start_long * params->ldb);
  C += (size_t(params->split_k_partition_stride) * tid_z) +
      (c_row_long * params->ldc + c_col_long);

  // Prepare threadgroup loading operations
  thread loader_a_t loader_a(A, params->lda, As, simd_group_id, simd_lane_id);
  thread loader_b_t loader_b(B, params->ldb, Bs, simd_group_id, simd_lane_id);

  // Prepare threadgroup mma operation
  thread mma_t mma_op(simd_group_id, simd_lane_id);

  int gemm_k_iterations = params->gemm_k_iterations_aligned;

  short tgp_bm = min(BM, params->M - c_row);
  short tgp_bn = min(BN, params->N - c_col);
  short leftover_bk = params->K % BK;

  if (MN_aligned || (tgp_bm == BM && tgp_bn == BN)) {
    gemm_kernel::gemm_loop(
        As,
        Bs,
        gemm_k_iterations,
        loader_a,
        loader_b,
        mma_op,
        tgp_bm,
        tgp_bn,
        leftover_bk,
        LoopAlignment<true, true, true>{});
  } else if (tgp_bn == BN) {
    gemm_kernel::gemm_loop(
        As,
        Bs,
        gemm_k_iterations,
        loader_a,
        loader_b,
        mma_op,
        tgp_bm,
        tgp_bn,
        leftover_bk,
        LoopAlignment<false, true, true>{});
  } else if (tgp_bm == BM) {
    gemm_kernel::gemm_loop(
        As,
        Bs,
        gemm_k_iterations,
        loader_a,
        loader_b,
        mma_op,
        tgp_bm,
        tgp_bn,
        leftover_bk,
        LoopAlignment<true, false, true>{});
  } else {
    gemm_kernel::gemm_loop(
        As,
        Bs,
        gemm_k_iterations,
        loader_a,
        loader_b,
        mma_op,
        tgp_bm,
        tgp_bn,
        leftover_bk,
        LoopAlignment<false, false, true>{});
  }

  threadgroup_barrier(mem_flags::mem_threadgroup);

  if ((tid_z + 1) == (params->split_k_partitions)) {
    int gemm_k_iter_remaining =
        (params->K - (k_start + params->split_k_partition_size)) / BK;
    if (!K_aligned || gemm_k_iter_remaining > 0)
      gemm_kernel::gemm_loop(
          As,
          Bs,
          gemm_k_iter_remaining,
          loader_a,
          loader_b,
          mma_op,
          tgp_bm,
          tgp_bn,
          leftover_bk,
          LoopAlignment<false, false, K_aligned>{});
  }

  if (MN_aligned || (tgp_bm == BM && tgp_bn == BN)) {
    mma_op.store_result(C, params->ldc);
  } else {
    mma_op.store_result_safe(C, params->ldc, short2(tgp_bn, tgp_bm));
  }
}

///////////////////////////////////////////////////////////////////////////////
// Split k accumulation kernel
///////////////////////////////////////////////////////////////////////////////

template <
    typename AccT,
    typename OutT,
    typename Epilogue = TransformNone<OutT, AccT>>
[[kernel]] void gemm_splitk_accum(
    const device AccT* C_split [[buffer(0)]],
    device OutT* D [[buffer(1)]],
    const constant int& k_partitions [[buffer(2)]],
    const constant int& partition_stride [[buffer(3)]],
    const constant int& ldd [[buffer(4)]],
    uint2 gid [[thread_position_in_grid]]) {
  // Ajust D and C
  D += gid.x + gid.y * size_t(ldd);
  C_split += gid.x + gid.y * size_t(ldd);

  size_t offset = 0;
  AccT out = 0;

  for (int i = 0; i < k_partitions; i++) {
    out += C_split[offset];
    offset += partition_stride;
  }

  // Write output
  D[0] = Epilogue::apply(out);
}

template <
    typename AccT,
    typename OutT,
    typename Epilogue = TransformAxpby<OutT, AccT>>
[[kernel]] void gemm_splitk_accum_axpby(
    const device AccT* C_split [[buffer(0)]],
    device OutT* D [[buffer(1)]],
    const constant int& k_partitions [[buffer(2)]],
    const constant int& partition_stride [[buffer(3)]],
    const constant int& ldd [[buffer(4)]],
    const device OutT* C [[buffer(5)]],
    const constant int& ldc [[buffer(6)]],
    const constant int& fdc [[buffer(7)]],
    const constant float& alpha [[buffer(8)]],
    const constant float& beta [[buffer(9)]],
    uint2 gid [[thread_position_in_grid]]) {
  // Ajust D and C
  C += gid.x * size_t(fdc) + gid.y * size_t(ldc);
  D += gid.x + gid.y * size_t(ldd);
  C_split += gid.x + gid.y * size_t(ldd);

  size_t offset = 0;
  AccT out = 0;

  for (int i = 0; i < k_partitions; i++) {
    out += C_split[offset];
    offset += partition_stride;
  }

  // Write output
  Epilogue op(alpha, beta);
  D[0] = op.apply(out, *C);
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.metal
================================================
// Copyright © 2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/steel/gemm/gemm.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h"

// Instantiates a single `gemm_splitk` specialization.
//
// The string handed to `instantiate_kernel` (defined outside this file) is
//   steel_gemm_splitk_<tname>_<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>
//     _wm<wm>_wn<wn>_MN_<aname>_K_<kname>
// and the template arguments are
//   <itype, otype, bm, bn, bk, wm, wn, trans_a, trans_b,
//    mn_aligned, k_aligned>.
#define instantiate_gemm(                                     \
    tname,                                                    \
    trans_a,                                                  \
    trans_b,                                                  \
    iname,                                                    \
    itype,                                                    \
    oname,                                                    \
    otype,                                                    \
    bm,                                                       \
    bn,                                                       \
    bk,                                                       \
    wm,                                                       \
    wn,                                                       \
    aname,                                                    \
    mn_aligned,                                               \
    kname,                                                    \
    k_aligned)                                                \
  instantiate_kernel(                                         \
      "steel_gemm_splitk_" #tname "_" #iname "_" #oname       \
         "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn    \
         "_MN_" #aname "_K_" #kname,                          \
  gemm_splitk,                                                \
      itype,                                                  \
      otype,                                                  \
      bm,                                                     \
      bn,                                                     \
      bk,                                                     \
      wm,                                                     \
      wn,                                                     \
      trans_a,                                                \
      trans_b,                                                \
      mn_aligned,                                             \
      k_aligned)

// Expands `instantiate_gemm` for all four (MN aligned, K aligned)
// combinations. `taligned` / `naligned` are the name fragments paired with
// the template flags `true` / `false` respectively.
#define instantiate_gemm_aligned_helper(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn)             \
  instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, taligned, true)  \
  instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, taligned, true, naligned, false) \
  instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, taligned, true) \
  instantiate_gemm(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn, naligned, false, naligned, false)

// Expands the alignment helper for the four transpose combinations:
// nn = (false, false), nt = (false, true), tn = (true, false),
// tt = (true, true) for (trans_a, trans_b).
#define instantiate_gemm_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn)             \
    instantiate_gemm_aligned_helper(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_aligned_helper(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_aligned_helper(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_aligned_helper(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Expands the transpose helper for each supported tile configuration
// (bm, bn, bk, wm, wn):
//   (16, 16, 16, 2, 2), (16, 32, 16, 2, 2), (32, 16, 16, 2, 2),
//   (32, 32, 16, 2, 2)
#define instantiate_gemm_shapes_helper(iname, itype, oname, otype)                  \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 16, 16, 16, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 16, 32, 16, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 16, 16, 2, 2) \
    instantiate_gemm_transpose_helper(iname, itype, oname, otype, 32, 32, 16, 2, 2)

// Emit the split-K kernels. Half and bfloat16 inputs write float partial
// results; float and complex64 inputs write partial results of their own type.
instantiate_gemm_shapes_helper(float16, half, float32, float);
instantiate_gemm_shapes_helper(bfloat16, bfloat16_t, float32, float);
instantiate_gemm_shapes_helper(float32, float, float32, float);
instantiate_gemm_shapes_helper(complex64, complex64_t, complex64, complex64_t);

// Instantiates both accumulation kernels for one (output, accumulator) type
// pair:
//   steel_gemm_splitk_accum_<oname>_<aname>        -> gemm_splitk_accum
//   steel_gemm_splitk_accum_<oname>_<aname>_axbpy  -> gemm_splitk_accum_axpby
// Both receive the template arguments <atype, otype>.
// NOTE(review): the name suffix is spelled `_axbpy` while the kernel function
// is `..._axpby`. The string is presumably matched by name elsewhere, so do
// not change the spelling here without checking the lookup side.
// The last macro line ends in a continuation, so the blank line that follows
// is still part of the macro definition.
#define instantiate_accum(oname, otype, aname, atype)      \
  instantiate_kernel(                                      \
    "steel_gemm_splitk_accum_" #oname "_" #aname,          \
    gemm_splitk_accum, atype, otype)                       \
  instantiate_kernel(                                      \
    "steel_gemm_splitk_accum_" #oname "_" #aname "_axbpy", \
  gemm_splitk_accum_axpby, atype, otype)                   \

// Accumulation kernels for each (output, accumulator) type pair.
instantiate_accum(bfloat16, bfloat16_t, float32, float);
instantiate_accum(float16, half, float32, float);
instantiate_accum(float32, float, float32, float);
instantiate_accum(complex64, complex64_t, complex64, complex64_t); // clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk_nax.h
================================================
// Copyright © 2026 Apple Inc.

using namespace mlx::steel;

// Function constants at indices 200 / 201.
// NOTE(review): presumably these flag whether M / N are multiples of the tile
// size so bounds checks can be skipped — confirm against the kernel body that
// uses them.
constant bool align_M [[function_constant(200)]];
constant bool align_N [[function_constant(201)]];

///////////////////////////////////////////////////////////////////////////////
// NAX Split-K GEMM kernel
///////////////////////////////////////////////////////////////////////////////

// clang-format off
// Computes one (BM x BN) output tile for one split-K partition.
//
// Each threadgroup decodes a (tile column, tile row, K partition) triple from
// the x component of its grid position, multiplies the A and B slices that
// cover its K range [k_start, k_end), and stores the partial product into the
// region of C selected by params->split_k_partition_stride * partition index.
// The per-partition partial results are not summed in this kernel.
//
// Template parameters:
//   T                         element type of A and B
//   BM, BN                    output tile extent handled by one threadgroup
//   BK                        K step handed to gemm_loop; also decides whether
//                             a partition counts as K-aligned
//   WM, WN                    simdgroup grid inside the threadgroup (the thread
//                             limit is declared as WM * WN * 32)
//   transpose_a, transpose_b  select the addressing used for A and B
//   AccumType                 element type of the accumulator tile and of C
//
// Buffers:
//   A (0), B (1)  input matrices
//   C (2)         partial-result output, one region per K partition
//   params (3)    problem sizes, leading dimensions, tiling and swizzle info
//
// NAXTile, gemm_loop and dispatch_bool are declared outside this file.
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    typename AccumType = float>
[[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] void gemm_splitk_nax(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device AccumType* C [[buffer(2)]],
    const constant GEMMSpiltKParams* params [[buffer(3)]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]],
    uint3 tid [[threadgroup_position_in_grid]]) { // clang-format on

  // Only tid.x is consulted: tile position and K partition are all packed
  // into this single linear index.
  const int linear_tid = tid.x;

  // Compute swizzled tile dimensions
  // (N tiles widened by 2^swizzle_log, M tiles shrunk by the same factor,
  // rounding up).
  const int tn_swizzled = params->tiles_n << params->swizzle_log;
  const int tm_swizzled =
      (params->tiles_m + (1 << params->swizzle_log) - 1) >> params->swizzle_log;
  const int tiles_per_partition = tn_swizzled * tm_swizzled;

  // Quotient selects the K partition, remainder the tile within it.
  const int tid_z = linear_tid / tiles_per_partition;
  const int xy_flat = linear_tid % tiles_per_partition;

  // Decode 2D grid coordinates in swizzled space
  const int grid_x = xy_flat % tn_swizzled;
  const int grid_y = xy_flat / tn_swizzled;

  // Apply X-Y swizzle
  // (the low swizzle_log bits of grid_x become the low bits of the tile row).
  const int tid_y = (grid_y << params->swizzle_log) +
      (grid_x & ((1 << params->swizzle_log) - 1));
  const int tid_x = grid_x >> params->swizzle_log;

  // Exit early
  // The swizzled grid rounds tiles_m up, so some decoded tiles do not exist.
  if (params->tiles_n <= tid_x || params->tiles_m <= tid_y) {
    return;
  }

  // Calculate partition bounds
  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const int k_start = params->split_k_partition_size * tid_z;
  const int k_end = min(k_start + params->split_k_partition_size, params->K);

  // Widen to size_t before multiplying by leading dimensions.
  const size_t c_row_long = size_t(c_row);
  const size_t c_col_long = size_t(c_col);
  const size_t k_start_long = size_t(k_start);

  // Adjust pointers for split-K partition
  A += transpose_a ? (c_row_long + k_start_long * params->lda)
                   : (k_start_long + c_row_long * params->lda);
  B += transpose_b ? (k_start_long + c_col_long * params->ldb)
                   : (c_col_long + k_start_long * params->ldb);
  C += (size_t(params->split_k_partition_stride) * tid_z) +
      (c_row_long * params->ldc + c_col_long);

  // NAX tile configuration
  // SM x SN is the sub-tile owned by one simdgroup; TM x TN counts 16-wide
  // steps inside it.
  constexpr short SM = BM / WM;
  constexpr short SN = BN / WN;
  constexpr short SK = 32;

  constexpr short TM = SM / 16;
  constexpr short TN = SN / 16;

  // Calculate simdgroup offsets and alignment
  const short tm = SM * (simd_group_id / WN);
  const short tn = SN * (simd_group_id % WN);

  // Valid rows/columns left for this simdgroup; clamped against M / N only
  // when the matching align_* function constant is false.
  const int sgp_sm_int =
      align_M ? int(SM) : min(int(SM), params->M - (c_row + tm));
  const short sgp_sm = short(sgp_sm_int);
  const bool is_unaligned_sm = align_M ? false : (sgp_sm != SM);

  const int sgp_sn_int =
      align_N ? int(SN) : min(int(SN), params->N - (c_col + tn));
  const short sgp_sn = short(sgp_sn_int);
  const bool is_unaligned_sn = align_N ? false : (sgp_sn != SN);

  // Move the pointers from the threadgroup tile origin to this simdgroup's
  // sub-tile origin.
  A += transpose_a ? tm : (tm * params->lda);
  B += transpose_b ? (tn * params->ldb) : tn;
  C += tm * params->ldc + tn;

  NAXTile<AccumType, TM, TN> Dtile;

  // gemm_loop through the partition
  // Check K-alignment at runtime (partition-specific)
  const int partition_k_size = k_end - k_start;
  const int partition_k_iters = partition_k_size / BK;
  const bool partition_k_aligned = (partition_k_size % BK) == 0;

  // Turn the three runtime alignment flags into compile-time template
  // arguments of gemm_loop.
  dispatch_bool(partition_k_aligned, [&](auto kAlignedK) {
    dispatch_bool(align_M || !is_unaligned_sm, [&](auto kAlignedM) {
      dispatch_bool(align_N || !is_unaligned_sn, [&](auto kAlignedN) {
        Dtile = gemm_loop<
            T,
            SM,
            SN,
            SK,
            BK,
            transpose_a,
            transpose_b,
            kAlignedM.value,
            kAlignedN.value,
            kAlignedK.value,
            AccumType>(
            A,
            B,
            params->lda,
            params->ldb,
            partition_k_size,
            partition_k_iters,
            sgp_sm,
            sgp_sn);
      });
    });
  });

  // Store result
  // Unchecked store only when both M and N are aligned for this simdgroup;
  // otherwise the store is limited to (sgp_sn, sgp_sm).
  dispatch_bool(align_M || !is_unaligned_sm, [&](auto kAlignedM) {
    dispatch_bool(align_N || !is_unaligned_sn, [&](auto kAlignedN) {
      if constexpr (kAlignedM && kAlignedN) {
        Dtile.store(C, int(params->ldc));
      } else {
        Dtile.store_safe(C, int(params->ldc), short2(sgp_sn, sgp_sm));
      }
    });
  });
}


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk_nax.metal
================================================
// Copyright © 2026 Apple Inc.

#include <metal_stdlib>

#include "mlx/backend/metal/kernels/utils.h"

#include "mlx/backend/metal/kernels/steel/gemm/gemm_nax.h"
#include "mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk_nax.h"

// clang-format off
// Instantiates gemm_splitk_nax under the name
//   steel_gemm_splitk_nax_<tname>_<iname>_<oname>_bm<bm>_bn<bn>_bk<bk>_wm<wm>_wn<wn>
// The accumulator type is hard-coded to float; `otype` is accepted but not
// used in the expansion (only `oname` appears, in the kernel name).
#define instantiate_gemm_splitk(tname, trans_a, trans_b, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
  instantiate_kernel(                                                                             \
      "steel_gemm_splitk_nax_" #tname "_"  #iname "_" #oname                                      \
      "_bm" #bm "_bn" #bn "_bk" #bk "_wm" #wm "_wn" #wn,                                          \
  gemm_splitk_nax, itype, bm, bn, bk, wm, wn, trans_a, trans_b, float)

// Expands all four transpose combinations (nn, nt, tn, tt) for one tile shape.
#define instantiate_gemm_splitk_transpose_helper(iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_splitk(nn, false, false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_splitk(nt, false, true , iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_splitk(tn, true , false, iname, itype, oname, otype, bm, bn, bk, wm, wn) \
    instantiate_gemm_splitk(tt, true , true , iname, itype, oname, otype, bm, bn, bk, wm, wn)

// Two tile configurations are built per type: (64, 64, 256) with a 2x2
// simdgroup grid and (128, 128, 512) with a 4x4 simdgroup grid.
#define instantiate_gemm_splitk_shapes_helper(iname, itype, oname, otype) \
    instantiate_gemm_splitk_transpose_helper(iname, itype, oname, otype,  64,  64, 256, 2, 2) \
    instantiate_gemm_splitk_transpose_helper(iname, itype, oname, otype, 128, 128, 512, 4, 4)

// Input types float16, bfloat16 and float32, all named with a float32 output.
instantiate_gemm_splitk_shapes_helper(float16, half, float32, float);
instantiate_gemm_splitk_shapes_helper(bfloat16, bfloat, float32, float);
instantiate_gemm_splitk_shapes_helper(float32, float, float32, float);
// clang-format on


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/loader.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/defines.h"

///////////////////////////////////////////////////////////////////////////////
// Loading helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

// Cooperative loader that copies a BROWS x BCOLS block from device memory
// into threadgroup memory.
//
// Every thread owns `vec_size` consecutive elements of a row, starting at
// (bi, bj), and repeats that for rows bi, bi + TROWS, bi + 2 * TROWS, ...
// `dst_ld` is the row pitch of the threadgroup block and `src_ld` the row
// pitch of the device matrix. `reduction_dim` picks the direction `next()`
// advances in: BCOLS elements when non-zero, BROWS rows otherwise.
template <
    typename T,
    short BROWS,
    short BCOLS,
    short dst_ld,
    short reduction_dim,
    short tgp_size,
    short alignment = 1,
    short n_reads = (BCOLS * BROWS) / (tgp_size),
    short TCOLS = BCOLS / n_reads,
    short TROWS = tgp_size / TCOLS>
struct BlockLoader {
  // Number of row steps each thread performs (rounded up).
  STEEL_CONST short n_rows = (BROWS + TROWS - 1) / TROWS;
  // Consecutive elements handled by one thread per row.
  STEEL_CONST short vec_size = n_reads;

  // Row pitch of the source and distance between consecutive blocks.
  const int src_ld;
  const int tile_stride;

  // Position of this thread inside the block.
  const short thread_idx;
  const short bi;
  const short bj;

  // Destination (threadgroup) and source (device), both already offset to
  // this thread's first element.
  threadgroup T* dst;
  const device T* src;

  // Opaque byte bundle used to move `vec_size` elements in one assignment.
  struct alignas(alignment * sizeof(T)) ReadVector {
    uint8_t v[sizeof(T) * vec_size];
  };

  /* Constructor */
  METAL_FUNC BlockLoader(
      const device T* src_,
      const int src_ld_,
      threadgroup T* dst_,
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]])
      : src_ld(src_ld_),
        tile_stride(reduction_dim ? BCOLS : BROWS * src_ld),
        thread_idx(simd_group_id * 32 + simd_lane_id),
        bi(thread_idx / TCOLS),
        bj(vec_size * (thread_idx % TCOLS)),
        dst(dst_ + bi * dst_ld + bj),
        src(src_ + bi * src_ld + bj) {}

  /* Run `op.apply` over this thread's threadgroup elements, no bound checks */
  template <typename UnaryOp>
  METAL_FUNC void apply_inplace_op(thread const UnaryOp& op) const {
    STEEL_PRAGMA_UNROLL
    for (short row = 0; row < BROWS; row += TROWS) {
      threadgroup T* row_ptr = dst + row * dst_ld;
      STEEL_PRAGMA_UNROLL
      for (short e = 0; e < vec_size; e++) {
        row_ptr[e] = op.apply(row_ptr[e]);
      }
    }
  }

  /* Device -> threadgroup copy, no bound checks */
  METAL_FUNC void load_unsafe() const {
    STEEL_PRAGMA_UNROLL
    for (short row = 0; row < BROWS; row += TROWS) {
      threadgroup ReadVector* out =
          (threadgroup ReadVector*)(dst + row * dst_ld);
      const device ReadVector* in =
          (const device ReadVector*)(src + row * src_ld);
      *out = *in;
    }
  }

  /* Device -> threadgroup copy; elements outside src_tile_dim become zero */
  METAL_FUNC void load_safe(short2 src_tile_dim) const {
    // Valid extent measured from this thread's own starting element.
    const short2 extent = src_tile_dim - short2(bj, bi);

    // Nothing readable for this thread: zero-fill its slots and leave.
    const bool has_valid_reads = extent.x > 0 && extent.y > 0;
    if (!has_valid_reads) {
      STEEL_PRAGMA_UNROLL
      for (short row = 0; row < BROWS; row += TROWS) {
        threadgroup T* row_ptr = dst + row * dst_ld;
        STEEL_PRAGMA_UNROLL
        for (short e = 0; e < vec_size; e++) {
          row_ptr[e] = T(0);
        }
      }
      return;
    }

    // Per-thread staging for the bound flags and the values read.
    bool in_bounds[vec_size];
    T staged[vec_size];

    STEEL_PRAGMA_UNROLL
    for (short row = 0; row < BROWS; row += TROWS) {
      // Stage 1: decide which elements of this row are inside the tile.
      STEEL_PRAGMA_UNROLL
      for (short e = 0; e < vec_size; e++) {
        in_bounds[e] = (row < extent.y) && (e < extent.x);
      }

      // Stage 2: read; out-of-range slots read element 0 instead.
      STEEL_PRAGMA_UNROLL
      for (short e = 0; e < vec_size; e++) {
        const int src_offset = in_bounds[e] ? row * src_ld + e : 0;
        staged[e] = src[src_offset];
      }

      // Stage 3: discard whatever was read for out-of-range slots.
      STEEL_PRAGMA_UNROLL
      for (short e = 0; e < vec_size; e++) {
        if (!in_bounds[e]) {
          staged[e] = T(0);
        }
      }

      // Stage 4: publish to threadgroup memory.
      threadgroup T* row_ptr = dst + row * dst_ld;
      STEEL_PRAGMA_UNROLL
      for (short e = 0; e < vec_size; e++) {
        row_ptr[e] = staged[e];
      }
    }
  }

  /* Advance the source pointer to the next block */
  METAL_FUNC void next() {
    src += tile_stride;
  }
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/mma.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <metal_simdgroup>
#include <metal_simdgroup_matrix>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/steel/defines.h"
#include "mlx/backend/metal/kernels/steel/gemm/transforms.h"
#include "mlx/backend/metal/kernels/steel/utils/integral_constant.h"

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
// MMA helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

// Primary template: rejects every fragment shape other than 8 x 8 at compile
// time.
template <typename T, int kFragRows_, int kFragCols_>
struct BaseMMAFrag {
  static_assert(
      kFragRows_ == 8 && kFragCols_ == 8,
      "Only 8 x 8 fragment matrices are currently supported");
};

// 8 x 8 fragment helper built on metal::simdgroup_matrix.
//
// The 64 elements of a fragment are spread over the 32 lanes of a simdgroup,
// so each lane holds kElemsPerFrag == 2 elements arranged as 1 row x 2
// consecutive columns. All load/store helpers operate on that per-lane pair.
//
// In the helpers below `str_x` / `lim_x` / `off_x` refer to the row direction
// (index i) and `str_y` / `lim_y` / `off_y` to the column direction (index j).
template <typename T>
struct BaseMMAFrag<T, 8, 8> {
  STEEL_CONST int kFragRows = 8;
  STEEL_CONST int kFragCols = 8;

  STEEL_CONST int kElemsPerFrag = (kFragRows * kFragCols) / 32;

  STEEL_CONST int kElemRows = 1;
  STEEL_CONST int kElemCols = 2;

  static_assert(
      kElemRows * kElemCols == kElemsPerFrag,
      "MMAFrag shape is not consistent with MMAFrag size");

  typedef metal::simdgroup_matrix<T, kFragRows, kFragCols> mat_type;
  typedef metal::vec<T, kElemsPerFrag> frag_type;

  // Maps a lane index to the fragment coordinate of the lane's first element.
  // Returns short2{column, row}: the row is in [0, 8) and the column is one of
  // {0, 2, 4, 6}.
  METAL_FUNC static constexpr short2 get_coord(
      ushort simd_lane_id [[thread_index_in_simdgroup]]) {
    const short qid = simd_lane_id / 4;
    const short fm = (qid & 4) + ((simd_lane_id / 2) % 4);
    const short fn = (qid & 2) * 2 + (simd_lane_id % 2) * 2;
    return short2{fn, fm};
  }

  // Loads this lane's elements from `src` without bound checks.
  // `src` must already point at the lane's first element.
  template <typename SrcPtrType, typename StrX, typename StrY>
  METAL_FUNC static constexpr void
  load(thread frag_type& dst, SrcPtrType src, StrX str_x, StrY str_y) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        dst[i * kElemCols + j] = static_cast<T>(src[i * str_x + j * str_y]);
      }
    }
  }

  // Bound-checked load. Element (off_x + i, off_y + j) is read only when it
  // lies inside (lim_x, lim_y); every other slot is set to zero.
  template <
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX,
      typename OffY>
  METAL_FUNC static constexpr void load_safe(
      thread frag_type& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = Int<0>{},
      OffY off_y = Int<0>{}) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
          // The column index uses off_y, matching the bound check above and
          // the addressing in store_safe / store_slice.
          dst[i * kElemCols + j] =
              static_cast<T>(src[(off_x + i) * str_x + (off_y + j) * str_y]);
        } else {
          dst[i * kElemCols + j] = T(0);
        }
      }
    }
  }

  // Stores this lane's elements to `dst` without bound checks, converting to
  // the pointee type of `dst`.
  template <typename DstPtrType, typename StrX, typename StrY>
  METAL_FUNC static constexpr void
  store(const thread frag_type& src, DstPtrType dst, StrX str_x, StrY str_y) {
    using U = pointer_element_t<DstPtrType>;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        dst[i * str_x + j * str_y] = static_cast<U>(src[i * kElemCols + j]);
      }
    }
  }

  // Bound-checked store. Only elements whose (off_x + i, off_y + j) position
  // lies inside (lim_x, lim_y) are written.
  template <
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX,
      typename OffY>
  METAL_FUNC static constexpr void store_safe(
      const thread frag_type& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = Int<0>{},
      OffY off_y = Int<0>{}) {
    using U = pointer_element_t<DstPtrType>;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if ((off_x + i) < lim_x && (off_y + j) < lim_y) {
          dst[(off_x + i) * str_x + (off_y + j) * str_y] =
              static_cast<U>(src[i * kElemCols + j]);
        }
      }
    }
  }

  // Store restricted to the half-open window [start_x, stop_x) x
  // [start_y, stop_y); elements outside the window are skipped.
  template <
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename StartX,
      typename StopX,
      typename StartY,
      typename StopY,
      typename OffX,
      typename OffY>
  METAL_FUNC static constexpr void store_slice(
      const thread frag_type& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      StartX start_x,
      StopX stop_x,
      StartY start_y,
      StopY stop_y,
      OffX off_x = Int<0>{},
      OffY off_y = Int<0>{}) {
    using U = pointer_element_t<DstPtrType>;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if ((off_x + i) < stop_x && (off_x + i) >= start_x &&
            (off_y + j) < stop_y && (off_y + j) >= start_y) {
          dst[(off_x + i) * str_x + (off_y + j) * str_y] =
              static_cast<U>(src[i * kElemCols + j]);
        }
      }
    }
  }

  // D = A * B + C on per-lane fragments. The fragments are copied into
  // simdgroup matrices through their thread_elements() storage, multiplied,
  // and the result is copied back out.
  METAL_FUNC static constexpr void mma(
      thread frag_type& D,
      thread frag_type& A,
      thread frag_type& B,
      thread frag_type& C) {
    mat_type D_mat;
    mat_type A_mat;
    mat_type B_mat;
    mat_type C_mat;

    reinterpret_cast<thread frag_type&>(A_mat.thread_elements()) = A;
    reinterpret_cast<thread frag_type&>(B_mat.thread_elements()) = B;
    reinterpret_cast<thread frag_type&>(C_mat.thread_elements()) = C;

    mma(D_mat, A_mat, B_mat, C_mat);

    D = reinterpret_cast<thread frag_type&>(D_mat.thread_elements());
  }

  // D = A * B + C directly on simdgroup matrices.
  METAL_FUNC static constexpr void mma(
      thread mat_type& D,
      thread mat_type& A,
      thread mat_type& B,
      thread mat_type& C) {
    simdgroup_multiply_accumulate(D, A, B, C);
  }
};

// A kTileRows_ x kTileCols_ grid of MMA fragments held in per-thread storage.
//
// Fragments are kept row-major in `val_frags`. The load/store members walk the
// grid and forward each fragment to the matching MMAFrag_ helper. The `w_x` /
// `w_y` template arguments scale the distance between neighbouring fragments
// along rows / columns.
template <
    typename T,
    int kTileRows_,
    int kTileCols_,
    class MMAFrag_ = BaseMMAFrag<T, 8, 8>>
struct MMATile {
  using MMAFrag_t = MMAFrag_;
  using elem_type = T;
  STEEL_CONST int kFragRows = MMAFrag_t::kFragRows;
  STEEL_CONST int kFragCols = MMAFrag_t::kFragCols;
  STEEL_CONST int kElemsPerFrag = MMAFrag_t::kElemsPerFrag;

  STEEL_CONST int kTileRows = kTileRows_;
  STEEL_CONST int kTileCols = kTileCols_;

  STEEL_CONST int kRows = kTileRows * kFragRows;
  STEEL_CONST int kCols = kTileCols * kFragCols;

  STEEL_CONST int kNumFrags = kTileRows * kTileCols;
  STEEL_CONST int kElemsPerTile = kNumFrags * kElemsPerFrag;

  using mat_type = typename MMAFrag_t::mat_type;
  using frag_type = typename MMAFrag_t::frag_type;

  frag_type val_frags[kNumFrags] = {frag_type(0)};

  METAL_FUNC MMATile() thread {}

  // Resets every fragment to zero.
  METAL_FUNC constexpr void clear() {
    STEEL_PRAGMA_UNROLL
    for (short f = 0; f < kNumFrags; ++f) {
      val_frags[f] = frag_type(0);
    }
  }

  // Fragment at grid position (row, col).
  METAL_FUNC constexpr thread frag_type& frag_at(
      const short row,
      const short col) {
    return val_frags[row * kTileCols + col];
  }

  METAL_FUNC constexpr const thread frag_type& frag_at(
      const short row,
      const short col) const {
    return val_frags[row * kTileCols + col];
  }

  // Copy of fragment (row, col) as a simdgroup matrix.
  METAL_FUNC mat_type mat_at(const short row, const short col) {
    thread frag_type& frag = frag_at(row, col);
    mat_type result;
    STEEL_PRAGMA_UNROLL
    for (short e = 0; e < kElemsPerFrag; ++e) {
      result.thread_elements()[e] = frag[e];
    }
    return result;
  }

  // Flat view over all kElemsPerTile per-thread elements.
  METAL_FUNC thread elem_type* elems() {
    return reinterpret_cast<thread elem_type*>(val_frags);
  }

  METAL_FUNC const thread elem_type* elems() const {
    return reinterpret_cast<const thread elem_type*>(val_frags);
  }

  // Unchecked load from threadgroup memory with compile-time strides.
  template <typename U, int w_x, int w_y, int str_x, int str_y>
  METAL_FUNC void load(const threadgroup U* src) {
    STEEL_PRAGMA_UNROLL
    for (short fi = 0; fi < kTileRows; ++fi) {
      const int row_off = (fi * kFragRows) * w_x * str_x;
      STEEL_PRAGMA_UNROLL
      for (short fj = 0; fj < kTileCols; ++fj) {
        const int col_off = (fj * kFragCols) * w_y * str_y;
        MMAFrag_t::load(
            frag_at(fi, fj),
            src + (row_off + col_off),
            Int<str_x>{},
            Int<str_y>{});
      }
    }
  }

  // Unchecked store to threadgroup memory with compile-time strides.
  template <typename U, int w_x, int w_y, int str_x, int str_y>
  METAL_FUNC void store(threadgroup U* dst) const {
    STEEL_PRAGMA_UNROLL
    for (short fi = 0; fi < kTileRows; ++fi) {
      const int row_off = (fi * kFragRows) * w_x * str_x;
      STEEL_PRAGMA_UNROLL
      for (short fj = 0; fj < kTileCols; ++fj) {
        const int col_off = (fj * kFragCols) * w_y * str_y;
        MMAFrag_t::store(
            frag_at(fi, fj),
            dst + (row_off + col_off),
            Int<str_x>{},
            Int<str_y>{});
      }
    }
  }

  // Unchecked load from device memory; rows are `ld` apart, columns unit
  // stride.
  template <typename U, int w_x, int w_y>
  METAL_FUNC void load(const device U* src, const int ld) {
    STEEL_PRAGMA_UNROLL
    for (short fi = 0; fi < kTileRows; ++fi) {
      const int row_off = (fi * kFragRows) * w_x * ld;
      STEEL_PRAGMA_UNROLL
      for (short fj = 0; fj < kTileCols; ++fj) {
        const int col_off = (fj * kFragCols) * w_y;
        MMAFrag_t::load(
            frag_at(fi, fj), src + (row_off + col_off), ld, Int<1>{});
      }
    }
  }

  // Unchecked store to device memory; rows are `ld` apart, columns unit
  // stride.
  template <typename U, int w_x, int w_y>
  METAL_FUNC void store(device U* dst, const int ld) const {
    STEEL_PRAGMA_UNROLL
    for (short fi = 0; fi < kTileRows; ++fi) {
      const int row_off = (fi * kFragRows) * w_x * ld;
      STEEL_PRAGMA_UNROLL
      for (short fj = 0; fj < kTileCols; ++fj) {
        const int col_off = (fj * kFragCols) * w_y;
        MMAFrag_t::store(
            frag_at(fi, fj), dst + (row_off + col_off), ld, Int<1>{});
      }
    }
  }

  // Bound-checked device load. `src_tile_dims` is (columns, rows); the base
  // pointer is passed unchanged and the fragment position travels as an
  // offset.
  template <typename U, int w_x, int w_y>
  METAL_FUNC void
  load_safe(const device U* src, const int ld, const short2 src_tile_dims) {
    STEEL_PRAGMA_UNROLL
    for (int fi = 0; fi < kTileRows; ++fi) {
      const int row_off = (fi * kFragRows) * w_x;
      STEEL_PRAGMA_UNROLL
      for (int fj = 0; fj < kTileCols; ++fj) {
        const int col_off = (fj * kFragCols) * w_y;
        MMAFrag_t::load_safe(
            frag_at(fi, fj),
            src,
            ld,
            Int<1>{},
            src_tile_dims.y,
            src_tile_dims.x,
            row_off,
            col_off);
      }
    }
  }

  // Bound-checked device store. `dst_tile_dims` is (columns, rows).
  template <typename U, int w_x, int w_y>
  METAL_FUNC void
  store_safe(device U* dst, const int ld, const short2 dst_tile_dims) const {
    STEEL_PRAGMA_UNROLL
    for (int fi = 0; fi < kTileRows; ++fi) {
      const int row_off = (fi * kFragRows) * w_x;
      STEEL_PRAGMA_UNROLL
      for (int fj = 0; fj < kTileCols; ++fj) {
        const int col_off = (fj * kFragCols) * w_y;
        MMAFrag_t::store_safe(
            frag_at(fi, fj),
            dst,
            ld,
            Int<1>{},
            dst_tile_dims.y,
            dst_tile_dims.x,
            row_off,
            col_off);
      }
    }
  }

  // Device store limited to the window [start, stop); both are given as
  // (column, row).
  template <typename U, int w_x, int w_y>
  METAL_FUNC void store_slice(
      device U* dst,
      const int ld,
      const short2 start,
      const short2 stop) const {
    STEEL_PRAGMA_UNROLL
    for (int fi = 0; fi < kTileRows; ++fi) {
      const int row_off = (fi * kFragRows) * w_x;
      STEEL_PRAGMA_UNROLL
      for (int fj = 0; fj < kTileCols; ++fj) {
        const int col_off = (fj * kFragCols) * w_y;
        MMAFrag_t::store_slice(
            frag_at(fi, fj),
            dst,
            ld,
            Int<1>{},
            start.y,
            stop.y,
            start.x,
            stop.x,
            row_off,
            col_off);
      }
    }
  }
};

// Tile-level multiply-accumulate: D = A * B + C, fragment by fragment.
//
// Output columns are visited left-to-right on even rows and right-to-left on
// odd rows; for each output fragment the K fragments are applied in order.
template <typename T, typename U, int M, int N, int K>
METAL_FUNC void tile_matmad(
    thread MMATile<T, M, N>& D,
    thread MMATile<U, M, K>& A,
    thread MMATile<U, K, N>& B,
    thread MMATile<T, M, N>& C) {
  using Frag = typename MMATile<T, M, N>::MMAFrag_t;

  STEEL_PRAGMA_UNROLL
  for (short row = 0; row < M; ++row) {
    const bool reversed = (row % 2) != 0;
    STEEL_PRAGMA_UNROLL
    for (short step = 0; step < N; ++step) {
      const short col = reversed ? (N - 1 - step) : step;
      STEEL_PRAGMA_UNROLL
      for (short kk = 0; kk < K; ++kk) {
        Frag::mma(
            D.frag_at(row, col),
            A.frag_at(row, kk),
            B.frag_at(kk, col),
            C.frag_at(row, col));
      }
    }
  }
}

// Identity epilogue for complex64_t outputs: both overloads hand back the
// first argument untouched; the second argument of the binary form is ignored.
template <typename InT>
struct TransformNone<complex64_t, InT> {
  static METAL_FUNC complex64_t apply(complex64_t value) {
    return value;
  }
  static METAL_FUNC complex64_t apply(complex64_t value, complex64_t) {
    return value;
  }
};

// Threadgroup-level matrix multiply-accumulate helper.
//
// Each thread owns a TM x TN grid of 8 x 8 accumulator fragments (Ctile).
// `mma` consumes a (BM, BK) block of A and a (BK, BN) block of B from
// threadgroup memory; the store_result* / apply_epilogue* members post-process
// the accumulators and write them to device memory.
//
// Template parameters:
//   T                         element type read from threadgroup memory
//   U                         element type of the device output (and of C)
//   BM, BN, BK                threadgroup block extents
//   WM, WN                    simdgroup grid inside the threadgroup
//   transpose_a, transpose_b  select the threadgroup strides of A and B
//   lda_tgp, ldb_tgp          leading dimensions of the threadgroup blocks
//   AccumType                 accumulator element type
//   Epilogue                  transform applied on store
//
// All short2 tile-dimension arguments are (columns, rows).
template <
    typename T,
    typename U,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    short lda_tgp,
    short ldb_tgp,
    typename AccumType = float,
    typename Epilogue = TransformNone<U, AccumType>>
struct BlockMMA {
  // MMAFrag size
  STEEL_CONST short kFragSize = 8;
  using MMAFrag_acc_t = BaseMMAFrag<AccumType, kFragSize, kFragSize>;

  // Warp tile simdgroup matrix strides along M
  STEEL_CONST short TM_stride = kFragSize * WM;
  // Warp tile simdgroup matrix strides along N
  STEEL_CONST short TN_stride = kFragSize * WN;

  // Warp tile size along M
  STEEL_CONST short TM = BM / (kFragSize * WM);
  // Warp tile size along N
  STEEL_CONST short TN = BN / (kFragSize * WN);

  // Threadgroup A strides
  STEEL_CONST short A_str_m = transpose_a ? 1 : lda_tgp; // M
  STEEL_CONST short A_str_k = transpose_a ? lda_tgp : 1; // K

  // Threadgroup B strides
  STEEL_CONST short B_str_k = transpose_b ? 1 : ldb_tgp; // K
  STEEL_CONST short B_str_n = transpose_b ? ldb_tgp : 1; // N

  // Threadgroup strides along K
  STEEL_CONST short tile_stride_a = kFragSize * A_str_k;
  STEEL_CONST short tile_stride_b = kFragSize * B_str_k;

  // Simdgroup matrices
  MMATile<AccumType, TM, 1, MMAFrag_acc_t> Atile;
  MMATile<AccumType, 1, TN, MMAFrag_acc_t> Btile;
  MMATile<AccumType, TM, TN, MMAFrag_acc_t> Ctile;

  // Offsets within threadgroup
  // (after construction: row / column of this thread's first output element)
  short sm;
  short sn;

  // Element offsets of this thread into the A and B threadgroup blocks
  short As_offset;
  short Bs_offset;

  /* Constructor */
  METAL_FUNC BlockMMA(
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]]) {
    // Determine thread position in simdgroup matrix
    short tm = kFragSize * (simd_group_id / WN);
    short tn = kFragSize * (simd_group_id % WN);

    short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);
    sm = simd_coord.y;
    sn = simd_coord.x;

    // Determine thread and simdgroup offset
    As_offset = (tm + sm) * A_str_m + (sn)*A_str_k; // M, K
    Bs_offset = (sm)*B_str_k + (tn + sn) * B_str_n; // K, N

    // From here on sm / sn include the simdgroup offset.
    sm += tm;
    sn += tn;
  }

  /* (BM, BK) X (BK, BN) multiply accumulate function */
  METAL_FUNC void mma(const threadgroup T* As, const threadgroup T* Bs) {
    // Adjust for simdgroup and thread location
    As += As_offset;
    Bs += Bs_offset;

    // Iterate over BK in blocks of kFragSize
    STEEL_PRAGMA_UNROLL
    for (short kk = 0; kk < BK; kk += kFragSize) {
      simdgroup_barrier(mem_flags::mem_none);

      Atile.template load<T, WM, 1, A_str_m, A_str_k>(As);

      simdgroup_barrier(mem_flags::mem_none);

      Btile.template load<T, 1, WN, B_str_k, B_str_n>(Bs);

      simdgroup_barrier(mem_flags::mem_none);

      tile_matmad(Ctile, Atile, Btile, Ctile);

      // Progress to next simdgroup tile
      As += tile_stride_a;
      Bs += tile_stride_b;
    }
  }

  /* Store results from simdgroup_matrix results into device memory */
  // Applies Epilogue in place to every accumulator, then stores without
  // bound checks. `ldd` is the row pitch of D.
  METAL_FUNC void store_result(device U* D, const int ldd) {
    // Apply epilogue
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Adjust for simdgroup and thread location
    D += sm * ldd + sn;

    Ctile.template store<U, WM, WN>(D, ldd);
  }

  // Same as store_result, but writes only elements inside the window
  // [start, stop) of the block.
  METAL_FUNC void
  store_result_slice(device U* D, const int ldd, short2 start, short2 stop) {
    // Apply epilogue
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Re-express the window relative to this thread's first element.
    D += sm * ldd + sn;
    start -= short2(sn, sm);
    stop -= short2(sn, sm);

    // TODO: Check the start as well
    if (stop.y <= 0 || stop.x <= 0) {
      return;
    }

    Ctile.template store_slice<U, WM, WN>(D, ldd, start, stop);
  }

  // Same as store_result, but writes only elements inside dst_tile_dims.
  METAL_FUNC void
  store_result_safe(device U* D, const int ldd, short2 dst_tile_dims) {
    // Apply epilogue
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = Epilogue::apply(Ctile.elems()[i]);
    }

    // Adjust for simdgroup and thread location
    D += sm * ldd + sn;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    Ctile.template store_safe<U, WM, WN>(D, ldd, dst_tile_dims);
  }

  /* Apply epilogue */
  // Unary form: replaces every accumulator with epilogue_op.apply(value).
  template <typename UnaryEpilogue>
  METAL_FUNC void apply_epilogue(thread const UnaryEpilogue& epilogue_op) {
    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile)::kElemsPerTile; i++) {
      Ctile.elems()[i] = epilogue_op.apply(Ctile.elems()[i]);
    }
  }

  /* Apply epilogue */
  // Binary form without bound checks: combines every accumulator with the
  // matching element of C. `ldc` is the row stride and `fdc` the column
  // stride of C.
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue(
      const device U* C,
      const int ldc,
      const int fdc,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < decltype(Ctile)::kElemsPerFrag; k++) {
          accum[k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
        }
      }
    }
  }

  /* Apply epilogue */
  // Bound-checked binary form. C is read only for elements inside
  // dst_tile_dims; every other element is combined with a zero operand.
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue_safe(
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      // Row guard, same condition store_result_safe uses: fragments whose row
      // lies past the valid extent must not touch C.
      const bool row_in_bounds = (i * TM_stride) < dst_tile_dims.y;

      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

        // Read C
        U c_elems[kelems] = {0};

        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          if (row_in_bounds && (j * TN_stride + k) < dst_tile_dims.x) {
            c_elems[k] = C[offset_c + k * fdc];
          }
        }

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          accum[k] = epilogue_op.apply(accum[k], c_elems[k]);
        }
      }
    }
  }

  /* Store results from simdgroup_matrix results into device memory */
  // Fused form without bound checks: writes
  // epilogue_op.apply(accumulator, C element) straight to D and leaves the
  // accumulators unchanged.
  METAL_FUNC void store_result(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;

    constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in C
        thread const auto& accum = Ctile.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
        int offset_d = (i * TM_stride) * ldd + (j * TN_stride);

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          D[offset_d + k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
        }
      }
    }
  }

  // Bound-checked fused form: C is read and D is written only for elements
  // inside dst_tile_dims.
  METAL_FUNC void store_result_safe(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    constexpr short kelems = decltype(Ctile)::kElemsPerFrag;

    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < TM; i++) {
      if (i * TM_stride < dst_tile_dims.y) {
        STEEL_PRAGMA_UNROLL
        for (int j = 0; j < TN; j++) {
          // Get accumulated result and associated offset in C
          thread const auto& accum = Ctile.frag_at(i, j);
          int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
          int offset_d = (i * TM_stride) * ldd + (j * TN_stride);

          // Apply epilogue
          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < kelems; k++) {
            if ((j * TN_stride + k) < dst_tile_dims.x) {
              D[offset_d + k] =
                  epilogue_op.apply(accum[k], C[offset_c + k * fdc]);
            }
          }
        }
      }
    }
  }
};

// BlockMMA specialization for complex64_t inputs.
//
// Real and imaginary parts are accumulated in two separate float tiles
// (Ctile_r / Ctile_i). Each K-chunk is computed with three real tile products
// (see mma()), and results are recombined into complex64_t values when they are
// stored or passed through an epilogue.
template <
    typename U,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    bool transpose_a,
    bool transpose_b,
    short lda_tgp,
    short ldb_tgp,
    typename AccumType,
    typename Epilogue>
struct BlockMMA<
    complex64_t,
    U,
    BM,
    BN,
    BK,
    WM,
    WN,
    transpose_a,
    transpose_b,
    lda_tgp,
    ldb_tgp,
    AccumType,
    Epilogue> {
  static_assert(
      metal::is_same_v<AccumType, float>,
      "BlockMMA<complex64_t,...> expects float accumulators");
  static_assert(
      metal::is_same_v<U, complex64_t>,
      "For complex BlockMMA, U must be complex64_t; use a different epilogue for projections");
  // MMAFrag size
  STEEL_CONST short kFragSize = 8;
  using MMAFrag_acc_t = BaseMMAFrag<AccumType, kFragSize, kFragSize>;

  // Warp tile simdgroup matrix strides along M
  STEEL_CONST short TM_stride = kFragSize * WM;
  // Warp tile simdgroup matrix strides along N
  STEEL_CONST short TN_stride = kFragSize * WN;

  // Warp tile size along M
  STEEL_CONST short TM = BM / (kFragSize * WM);
  // Warp tile size along N
  STEEL_CONST short TN = BN / (kFragSize * WN);

  // Threadgroup A strides
  STEEL_CONST short A_str_m = transpose_a ? 1 : lda_tgp; // M
  STEEL_CONST short A_str_k = transpose_a ? lda_tgp : 1; // K

  // Threadgroup B strides
  STEEL_CONST short B_str_k = transpose_b ? 1 : ldb_tgp; // K
  STEEL_CONST short B_str_n = transpose_b ? ldb_tgp : 1; // N

  // Threadgroup strides along K
  STEEL_CONST short tile_stride_a = kFragSize * A_str_k;
  STEEL_CONST short tile_stride_b = kFragSize * B_str_k;

  // When indexing complex as float[2]: every stride doubles because each
  // complex element occupies two consecutive floats
  STEEL_CONST short A_str_m_f = A_str_m * 2;
  STEEL_CONST short A_str_k_f = A_str_k * 2;
  STEEL_CONST short B_str_k_f = B_str_k * 2;
  STEEL_CONST short B_str_n_f = B_str_n * 2;
  STEEL_CONST short tile_stride_a_f = tile_stride_a * 2;
  STEEL_CONST short tile_stride_b_f = tile_stride_b * 2;

  // Accumulators (real/imag)
  // NOTE(review): nothing in this struct clears these tiles; presumably
  // MMATile's default construction zero-fills them - confirm.
  MMATile<AccumType, TM, TN, MMAFrag_acc_t> Ctile_r;
  MMATile<AccumType, TM, TN, MMAFrag_acc_t> Ctile_i;

  // Offsets within threadgroup
  // sm / sn: row / column of this thread's first element in the block tile
  short sm, sn;
  // Element offsets (in complex64_t units) of this thread's first A / B element
  short As_offset, Bs_offset;

  /* Constructor */
  // Derives the per-thread offsets from the simdgroup index and lane index.
  METAL_FUNC BlockMMA(
      ushort simd_group_id [[simdgroup_index_in_threadgroup]],
      ushort simd_lane_id [[thread_index_in_simdgroup]]) {
    // Determine thread position in simdgroup matrix
    short tm = kFragSize * (simd_group_id / WN);
    short tn = kFragSize * (simd_group_id % WN);

    short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);
    sm = simd_coord.y;
    sn = simd_coord.x;

    // Determine thread and simdgroup offset
    As_offset = (tm + sm) * A_str_m + (sn)*A_str_k; // (M,K)
    Bs_offset = (sm)*B_str_k + (tn + sn) * B_str_n; // (K,N)

    // From here on sm / sn also include the simdgroup's offset in the block
    sm += tm;
    sn += tn;
  }

  /* Karatsuba MMA: 3 real MMAs per K-chunk */
  // Accumulates As * Bs (complex) into Ctile_r / Ctile_i, walking BK in steps
  // of kFragSize. As and Bs are reinterpreted as interleaved float pairs, so
  // offset +0 selects real parts and offset +1 selects imaginary parts.
  METAL_FUNC void mma(
      const threadgroup complex64_t* As,
      const threadgroup complex64_t* Bs) {
    // Adjust for simdgroup and thread location
    As += As_offset;
    Bs += Bs_offset;
    threadgroup const float* As_f =
        reinterpret_cast<threadgroup const float*>(As);
    threadgroup const float* Bs_f =
        reinterpret_cast<threadgroup const float*>(Bs);

    // Iterate over BK in blocks of kFragSize
    STEEL_PRAGMA_UNROLL
    for (short kk = 0; kk < BK; kk += kFragSize) {
      simdgroup_barrier(mem_flags::mem_none);

      // Real and imaginary planes of the A fragment column
      MMATile<AccumType, TM, 1, MMAFrag_acc_t> Ar, Ai;
      Ar.template load<float, WM, 1, A_str_m_f, A_str_k_f>(As_f + 0);
      Ai.template load<float, WM, 1, A_str_m_f, A_str_k_f>(As_f + 1);

      simdgroup_barrier(mem_flags::mem_none);

      // Real and imaginary planes of the B fragment row
      MMATile<AccumType, 1, TN, MMAFrag_acc_t> Br, Bi;
      Br.template load<float, 1, WN, B_str_k_f, B_str_n_f>(Bs_f + 0);
      Bi.template load<float, 1, WN, B_str_k_f, B_str_n_f>(Bs_f + 1);

      simdgroup_barrier(mem_flags::mem_none);

      // P = Ar*Br ; Q = Ai*Bi ; R = (Ar+Ai)*(Br+Bi)
      // NOTE(review): P, Q and R are passed as the addend right after being
      // declared; this relies on MMATile default-constructing to zero - confirm.
      MMATile<AccumType, TM, TN, MMAFrag_acc_t> P, Q, R;

      tile_matmad(P, Ar, Br, P);
      tile_matmad(Q, Ai, Bi, Q);

      // Ar and Br are overwritten in place with the sums Ar+Ai and Br+Bi
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < decltype(Ar)::kElemsPerTile; ++i)
        Ar.elems()[i] += Ai.elems()[i];
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < decltype(Br)::kElemsPerTile; ++i)
        Br.elems()[i] += Bi.elems()[i];

      tile_matmad(R, Ar, Br, R);

      // C_r += P - Q ; C_i += R - P - Q
      STEEL_PRAGMA_UNROLL
      for (short i = 0; i < decltype(Ctile_r)::kElemsPerTile; ++i) {
        const auto p = P.elems()[i];
        const auto q = Q.elems()[i];
        const auto r = R.elems()[i];
        Ctile_r.elems()[i] += (p - q);
        Ctile_i.elems()[i] += (r - p - q);
      }

      // Progress to next simdgroup tile
      As_f += tile_stride_a_f;
      Bs_f += tile_stride_b_f;
    }
  }

  /* Store results from simdgroup_matrix results into device memory */
  // Writes every accumulator element through Epilogue::apply with no bounds
  // checks. D has row stride ldd and contiguous columns.
  METAL_FUNC void store_result(device U* D, const int ldd) {
    // Adjust for simdgroup and thread location
    D += sm * ldd + sn;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        thread const auto& r = Ctile_r.frag_at(i, j);
        thread const auto& im = Ctile_i.frag_at(i, j);
        int off = (i * TM_stride) * ldd + (j * TN_stride);
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < decltype(Ctile_r)::kElemsPerFrag; k++) {
          D[off + k] = Epilogue::apply(complex64_t(r[k], im[k]));
        }
      }
    }
  }

  // Stores only the elements whose block-tile coordinates fall in
  // [start, stop), where x is the column and y is the row.
  METAL_FUNC void
  store_result_slice(device U* D, const int ldd, short2 start, short2 stop) {
    D += sm * ldd + sn;
    // Make the window relative to this thread's first element
    start -= short2(sn, sm);
    stop -= short2(sn, sm);

    if (stop.y <= 0 || stop.x <= 0)
      return;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; ++i) {
      const int row = i * TM_stride;
      if (row >= start.y && row < stop.y) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < TN; ++j) {
          const int off = row * ldd + (j * TN_stride);
          thread const auto& r = Ctile_r.frag_at(i, j);
          thread const auto& im = Ctile_i.frag_at(i, j);

          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < decltype(Ctile_r)::kElemsPerFrag; ++k) {
            const int col = j * TN_stride + k;
            if (col >= start.x && col < stop.x) {
              D[off + k] = Epilogue::apply(complex64_t(r[k], im[k]));
            }
          }
        }
      }
    }
  }

  // Bounds-checked store: only elements with column < dst_tile_dims.x and
  // row < dst_tile_dims.y (block-tile coordinates) are written.
  METAL_FUNC void
  store_result_safe(device U* D, const int ldd, short2 dst_tile_dims) {
    D += sm * ldd + sn;
    // Make the limits relative to this thread's first element
    dst_tile_dims -= short2(sn, sm);
    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      if (i * TM_stride < dst_tile_dims.y) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < TN; j++) {
          int off = (i * TM_stride) * ldd + (j * TN_stride);
          thread const auto& r = Ctile_r.frag_at(i, j);
          thread const auto& im = Ctile_i.frag_at(i, j);
          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < decltype(Ctile_r)::kElemsPerFrag; k++) {
            if ((j * TN_stride + k) < dst_tile_dims.x) {
              D[off + k] = Epilogue::apply(complex64_t(r[k], im[k]));
            }
          }
        }
      }
    }
  }

  /* Apply epilogue */
  // Unary form: replaces every accumulator element, in place, with
  // epilogue_op.apply(element).
  template <typename UnaryEpilogue>
  METAL_FUNC void apply_epilogue(thread const UnaryEpilogue& epilogue_op) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < decltype(Ctile_r)::kElemsPerTile; i++) {
      complex64_t out = epilogue_op.apply(
          complex64_t(Ctile_r.elems()[i], Ctile_i.elems()[i]));
      Ctile_r.elems()[i] = out.real;
      Ctile_i.elems()[i] = out.imag;
    }
  }

  /* Apply epilogue */
  // Binary form: replaces every accumulator element, in place, with
  // epilogue_op.apply(element, matching C element). C has row stride ldc and
  // column stride fdc. No bounds checks are performed on C.
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue(
      const device U* C,
      const int ldc,
      const int fdc,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in Cr, Ci
        thread auto& r = Ctile_r.frag_at(i, j);
        thread auto& im = Ctile_i.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < decltype(Ctile_r)::kElemsPerFrag; k++) {
          complex64_t out = epilogue_op.apply(
              complex64_t(r[k], im[k]), C[offset_c + k * fdc]);
          r[k] = out.real;
          im[k] = out.imag;
        }
      }
    }
  }

  /* Apply epilogue */
  // Bounds-checked binary form: C is only read inside dst_tile_dims
  // (x = columns, y = rows); elsewhere a zero complex value is substituted.
  // The epilogue is still applied to every accumulator element.
  template <typename BinaryEpilogue>
  METAL_FUNC void apply_epilogue_safe(
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const BinaryEpilogue& epilogue_op) {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in Cr, Ci
        thread auto& r = Ctile_r.frag_at(i, j);
        thread auto& im = Ctile_i.frag_at(i, j);
        int offset_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;

        constexpr short kelems = decltype(Ctile_r)::kElemsPerFrag;
        complex64_t tmp[kelems];

        // Gather the C operands first, zero-filling out-of-bounds positions
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          if ((j * TN_stride + k) < dst_tile_dims.x &&
              (i * TM_stride) < dst_tile_dims.y) {
            tmp[k] = C[offset_c + k * fdc];
          } else {
            tmp[k] = complex64_t(0.0f, 0.0f);
          }
        }

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          complex64_t out = epilogue_op.apply(complex64_t(r[k], im[k]), tmp[k]);
          r[k] = out.real;
          im[k] = out.imag;
        }
      }
    }
  }

  /* Store results from simdgroup_matrix results into device memory */
  // Fused epilogue + store without bounds checks: writes
  // epilogue_op.apply(accumulator, C element) to D. The accumulators themselves
  // are left untouched.
  METAL_FUNC void store_result(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;

    constexpr short kelems = decltype(Ctile_r)::kElemsPerFrag;

    // Loop over all simdgroup tiles
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < TM; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < TN; j++) {
        // Get accumulated result and associated offset in Cr, Ci
        thread const auto& r = Ctile_r.frag_at(i, j);
        thread const auto& im = Ctile_i.frag_at(i, j);
        int off_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
        int off_d = (i * TM_stride) * ldd + (j * TN_stride);

        // Apply epilogue
        STEEL_PRAGMA_UNROLL
        for (short k = 0; k < kelems; k++) {
          D[off_d + k] =
              epilogue_op.apply(complex64_t(r[k], im[k]), C[off_c + k * fdc]);
        }
      }
    }
  }

  // Fused epilogue + store with bounds checks: only positions inside
  // dst_tile_dims (x = columns, y = rows) are read from C and written to D.
  METAL_FUNC void store_result_safe(
      device U* D,
      const int ldd,
      const device U* C,
      const int ldc,
      const int fdc,
      short2 dst_tile_dims,
      thread const Epilogue& epilogue_op) const {
    // Adjust for simdgroup and thread location
    C += (sm)*ldc + (sn)*fdc;
    D += (sm)*ldd + sn;
    dst_tile_dims -= short2(sn, sm);

    if (dst_tile_dims.x <= 0 || dst_tile_dims.y <= 0)
      return;

    constexpr short kelems = decltype(Ctile_r)::kElemsPerFrag;

    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < TM; i++) {
      if (i * TM_stride < dst_tile_dims.y) {
        STEEL_PRAGMA_UNROLL
        for (int j = 0; j < TN; j++) {
          // Get accumulated result and associated offset in Cr, Ci
          thread const auto& r = Ctile_r.frag_at(i, j);
          thread const auto& im = Ctile_i.frag_at(i, j);
          int off_c = (i * TM_stride) * ldc + (j * TN_stride) * fdc;
          int off_d = (i * TM_stride) * ldd + (j * TN_stride);

          // Apply epilogue
          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < kelems; k++) {
            if ((j * TN_stride + k) < dst_tile_dims.x) {
              D[off_d + k] = epilogue_op.apply(
                  complex64_t(r[k], im[k]), C[off_c + k * fdc]);
            }
          }
        }
      }
    }
  }
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/nax.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <metal_simdgroup>
#include <metal_simdgroup_matrix>
#include <metal_stdlib>

#include "mlx/backend/metal/kernels/steel/defines.h"
#include "mlx/backend/metal/kernels/steel/utils/integral_constant.h"

#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>

using namespace metal;

///////////////////////////////////////////////////////////////////////////////
// MMA helper
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

///////////////////////////////////////////////////////////////////////////////
// NAX Steel with new tiles
///////////////////////////////////////////////////////////////////////////////

// Fragment policy for NAX tiles: describes how one 16x16 fragment is split
// across the lanes of a simdgroup and provides per-fragment load / store /
// reduce / matmul helpers. All members are static; the struct holds no state.
//
// Each lane owns kElemsPerFrag = 8 values arranged as kElemRows = 2 rows of
// kElemCols = 4 consecutive columns, the two rows being kElemRowsJump = 8 rows
// apart. In every load / store helper, str_x is the row stride, str_y the
// column stride, and off_x / off_y are row / column offsets of the fragment.
struct BaseNAXFrag {
  STEEL_CONST short kFragRows = 16;
  STEEL_CONST short kFragCols = 16;

  // Values held per lane (presumably 32 is the simdgroup width - confirm)
  STEEL_CONST short kElemsPerFrag = (kFragRows * kFragCols) / 32;

  STEEL_CONST short kElemRows = 2;
  STEEL_CONST short kElemCols = 4;

  // Distance in rows between the two rows a lane owns
  STEEL_CONST short kElemRowsJump = 8;

  static_assert(
      kElemRows * kElemCols == kElemsPerFrag,
      "MMAFrag shape is not consistent with MMAFrag size");

  // Per-lane register storage for one fragment
  template <typename U>
  using dtype_frag_t = typename metal::vec<U, kElemsPerFrag>;

  // Returns the coordinate of the calling lane's first element inside the
  // fragment as {column, row}. For lane ids 0..31 the column is one of
  // {0, 4, 8, 12} (from lane bits 0 and 3) and the row is in 0..7 (from lane
  // bits 1, 2 and 4).
  METAL_FUNC static short2 get_coord() {
    const ushort simd_lane_id = __metal_get_thread_index_in_simdgroup(ushort());
    const short qid = simd_lane_id >> 2;
    const short fm = ((qid & 4) | ((simd_lane_id >> 1) & 3));
    const short fn = ((qid & 2) | (simd_lane_id & 1)) * 4;
    return short2{fn, fm};
  }

  // Returns the {column, row} of the idx-th value owned by the calling lane:
  // idx % 4 selects the column within the lane's run of 4, and idx >> 2
  // selects which of the lane's rows (each kElemRowsJump apart).
  METAL_FUNC static short2 get_coord(short idx) {
    const ushort simd_lane_id = __metal_get_thread_index_in_simdgroup(ushort());
    const short qid = simd_lane_id >> 2;
    const short fm = ((qid & 4) | ((simd_lane_id >> 1) & 3)) + (idx >> 2) * 8;
    const short fn = ((qid & 2) | (simd_lane_id & 1)) * 4 + idx % 4;
    return short2{fn, fm};
  }

  // Unchecked load of this lane's 8 values from src into dst, converting each
  // value to T. When StrY is Int<1> the column stride multiply is omitted.
  template <
      typename T,
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void load(
      thread dtype_frag_t<T>& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      OffX off_x = {},
      OffY off_y = {}) {
    const short2 sc = get_coord();
    src += sc.y * str_x + sc.x * str_y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      if constexpr (metal::is_same_v<StrY, Int<1>>) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[i * kElemCols + j] = static_cast<T>(src[r * str_x + c + j]);
        }
      } else {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[i * kElemCols + j] =
              static_cast<T>(src[r * str_x + (c + j) * str_y]);
        }
      }
    }
  }

  // Row-checked load: rows at or beyond lim_x are filled with T(0) instead of
  // being read. Columns are not checked.
  template <
      typename T,
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void load_rows(
      thread dtype_frag_t<T>& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      OffX off_x = {},
      OffY off_y = {}) {
    const short2 sc = get_coord();
    src += sc.y * str_x + sc.x * str_y;
    // Row limit relative to this lane's first row
    auto lx = lim_x - sc.y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      if (r < lx) {
        if constexpr (metal::is_same_v<StrY, Int<1>>) {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[i * kElemCols + j] = static_cast<T>(src[r * str_x + (c + j)]);
          }
        } else {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[i * kElemCols + j] =
                static_cast<T>(src[r * str_x + (c + j) * str_y]);
          }
        }

      } else {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[i * kElemCols + j] = T(0);
        }
      }
    }
  }

  // Fully checked load: any element whose row is at or beyond lim_x, or whose
  // column is at or beyond lim_y, is set to T(0) instead of being read.
  template <
      typename T,
      typename SrcPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void load_safe(
      thread dtype_frag_t<T>& dst,
      SrcPtrType src,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = {},
      OffY off_y = {}) {
    const short2 sc = get_coord();
    src += sc.y * str_x + sc.x * str_y;
    // Limits relative to this lane's first row / column
    auto lx = lim_x - sc.y;
    auto ly = lim_y - sc.x;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if ((r < lx) && ((c + j) < ly)) {
          dst[i * kElemCols + j] =
              static_cast<T>(src[r * str_x + (c + j) * str_y]);
        } else {
          dst[i * kElemCols + j] = T(0);
        }
      }
    }
  }

  // Unchecked store of this lane's 8 values from src to dst, converting each
  // value to the destination pointer's element type.
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      OffX off_x = {},
      OffY off_y = {}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();
    dst += sc.y * str_x + sc.x * str_y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      if constexpr (metal::is_same_v<StrY, Int<1>>) {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[r * str_x + c + j] = static_cast<U>(src[i * kElemCols + j]);
        }
      } else {
        STEEL_PRAGMA_UNROLL
        for (short j = 0; j < kElemCols; j++) {
          dst[r * str_x + (c + j) * str_y] =
              static_cast<U>(src[i * kElemCols + j]);
        }
      }
    }
  }

  // Row-checked store: rows at or beyond lim_x are skipped. Columns are not
  // checked.
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store_rows(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      OffX off_x = {},
      OffY off_y = {}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();
    dst += sc.y * str_x + sc.x * str_y;
    // Row limit relative to this lane's first row
    auto lx = lim_x - sc.y;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      if (r < lx) {
        if constexpr (metal::is_same_v<StrY, Int<1>>) {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[r * str_x + c + j] = static_cast<U>(src[i * kElemCols + j]);
          }
        } else {
          STEEL_PRAGMA_UNROLL
          for (short j = 0; j < kElemCols; j++) {
            dst[r * str_x + (c + j) * str_y] =
                static_cast<U>(src[i * kElemCols + j]);
          }
        }
      }
    }
  }

  // Fully checked store: only elements with row < lim_x and column < lim_y are
  // written.
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename LimX,
      typename LimY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store_safe(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      LimX lim_x,
      LimY lim_y,
      OffX off_x = {},
      OffY off_y = {}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();
    dst += sc.y * str_x + sc.x * str_y;
    // Limits relative to this lane's first row / column
    auto lx = lim_x - sc.y;
    auto ly = lim_y - sc.x;

    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      const auto r = off_x + i * kElemRowsJump;
      const auto c = off_y;

      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        if (r < lx && (c + j) < ly) {
          dst[r * str_x + (c + j) * str_y] =
              static_cast<U>(src[i * kElemCols + j]);
        }
      }
    }
  }

  // Windowed store: only elements whose row lies in [start_x, stop_x) and
  // whose column lies in [start_y, stop_y) are written. Unlike the other
  // stores, dst is not pre-offset; the lane coordinate is added at the write.
  template <
      typename T,
      typename DstPtrType,
      typename StrX,
      typename StrY,
      typename StartX,
      typename StopX,
      typename StartY,
      typename StopY,
      typename OffX = Int<0>,
      typename OffY = Int<0>>
  METAL_FUNC static constexpr void store_slice(
      const thread dtype_frag_t<T>& src,
      DstPtrType dst,
      StrX str_x,
      StrY str_y,
      StartX start_x,
      StopX stop_x,
      StartY start_y,
      StopY stop_y,
      OffX off_x = Int<0>{},
      OffY off_y = Int<0>{}) {
    using U = pointer_element_t<DstPtrType>;

    const short2 sc = get_coord();

    const_for_loop<0, kElemRows, 1>([&](auto idx_row) {
      const auto r = off_x + idx_row * Int<kElemRowsJump>{};
      // Skip rows outside the window
      if (r >= stop_x - sc.y || r < start_x - sc.y) {
        return;
      }

      const_for_loop<0, kElemCols, 1>([&](auto idx_col) {
        const auto c = off_y + idx_col;
        // Skip columns outside the window
        if (c >= stop_y - sc.x || c < start_y - sc.x) {
          return;
        }

        const auto src_idx = idx_row * Int<kElemCols>{} + idx_col;
        dst[(r + sc.y) * str_x + (c + sc.x) * str_y] =
            static_cast<U>(src[src_idx]);
      });
    });
  }

  // Folds each fragment row with Op and merges the result into reduced_vals
  // (one entry per row this lane owns; existing contents are combined, not
  // overwritten). The lane's 4 values are folded first, then combined with the
  // lanes whose ids differ in bit 0 and bit 3 - the same lane bits that select
  // the column in get_coord().
  template <typename Op, typename T>
  METAL_FUNC static constexpr void row_reduce(
      thread const dtype_frag_t<T>& inp_vals,
      thread T* reduced_vals) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      T thr_reduce = Op::apply(
          Op::apply(inp_vals[i * kElemCols + 0], inp_vals[i * kElemCols + 1]),
          Op::apply(inp_vals[i * kElemCols + 2], inp_vals[i * kElemCols + 3]));

      T qgr_reduce = simd_shuffle_xor(thr_reduce, ushort(1));
      qgr_reduce = Op::apply(thr_reduce, qgr_reduce);

      T sgr_reduce = simd_shuffle_xor(qgr_reduce, ushort(8));
      sgr_reduce = Op::apply(qgr_reduce, sgr_reduce);

      reduced_vals[i] = Op::apply(reduced_vals[i], sgr_reduce);
    }
  }

  // Applies Op(value, row_vals[row]) in place to every value this lane owns,
  // where row_vals has one entry per row owned by the lane.
  template <typename Op, typename T>
  METAL_FUNC static constexpr void row_bin_op(
      thread dtype_frag_t<T>& inp_vals,
      thread T* row_vals) {
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemRows; i++) {
      STEEL_PRAGMA_UNROLL
      for (short j = 0; j < kElemCols; j++) {
        inp_vals[i * kElemCols + j] =
            Op::apply(inp_vals[i * kElemCols + j], row_vals[i]);
      }
    }
  }

  // Multiply-accumulate of one A fragment with two B fragments:
  //   [Cn0 Cn1] += A * [Bn0 Bn1]
  // The two destination fragments are passed in as the accumulator and
  // overwritten with the result.
  template <
      typename CType,
      typename AType,
      typename BType,
      bool transpose_a = false,
      bool transpose_b = false>
  METAL_FUNC static constexpr void mma(
      thread dtype_frag_t<CType>& Cn0,
      thread dtype_frag_t<CType>& Cn1,
      const thread dtype_frag_t<AType>& A,
      metal::bool_constant<transpose_a>,
      const thread dtype_frag_t<BType>& Bn0,
      const thread dtype_frag_t<BType>& Bn1,
      metal::bool_constant<transpose_b>) {
    // Create Matmul descriptor: M = 16, N = 32 (two B fragments), K = 16
    constexpr auto desc = mpp::tensor_ops::matmul2d_descriptor(
        16,
        32,
        16,
        transpose_a,
        transpose_b,
        true,
        mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate);

    // Create matmul op
    mpp::tensor_ops::matmul2d<desc, metal::execution_simdgroup> gemm_op;

    // Create matmul operands in registers
    auto ct_a =
        gemm_op
            .template get_left_input_cooperative_tensor<AType, BType, CType>();
    auto ct_b =
        gemm_op
            .template get_right_input_cooperative_tensor<AType, BType, CType>();

    // Create matmul output in register
    auto ct_c = gemm_op.template get_destination_cooperative_tensor<
        decltype(ct_a),
        decltype(ct_b),
        CType>();

    // Load A in to left operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_a[i] = A[i];
    }

    // Load B into right operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_b[i] = Bn0[i];
      ct_b[kElemsPerFrag + i] = Bn1[i];
    }

    // Load C into output registers (op handles accumulation)
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_c[i] = Cn0[i];
      ct_c[kElemsPerFrag + i] = Cn1[i];
    }

    // Do matmul
    gemm_op.run(ct_a, ct_b, ct_c);

    // Copy out results
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      Cn0[i] = ct_c[i];
      Cn1[i] = ct_c[kElemsPerFrag + i];
    }
  }

  // Multiply-accumulate of two A fragments with one B fragment:
  //   [Cm0; Cm1] += [Am0; Am1] * B
  // The two destination fragments are passed in as the accumulator and
  // overwritten with the result.
  template <
      typename CType,
      typename AType,
      typename BType,
      bool transpose_a = false,
      bool transpose_b = false>
  METAL_FUNC static constexpr void mma(
      thread dtype_frag_t<CType>& Cm0,
      thread dtype_frag_t<CType>& Cm1,
      const thread dtype_frag_t<AType>& Am0,
      const thread dtype_frag_t<AType>& Am1,
      metal::bool_constant<transpose_a>,
      const thread dtype_frag_t<BType>& B,
      metal::bool_constant<transpose_b>) {
    // Create Matmul descriptor: M = 32 (two A fragments), N = 16, K = 16.
    // The shape has to be 32x16x16 here: the left operand and the destination
    // each receive 2 * kElemsPerFrag values per lane below, while the right
    // operand receives kElemsPerFrag. A 16x32x16 descriptor would give the
    // left operand room for only kElemsPerFrag values and leave half of the
    // right operand unset.
    constexpr auto desc = mpp::tensor_ops::matmul2d_descriptor(
        32,
        16,
        16,
        transpose_a,
        transpose_b,
        true,
        mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate);

    // Create matmul op
    mpp::tensor_ops::matmul2d<desc, metal::execution_simdgroup> gemm_op;

    // Create matmul operands in registers
    auto ct_a =
        gemm_op
            .template get_left_input_cooperative_tensor<AType, BType, CType>();
    auto ct_b =
        gemm_op
            .template get_right_input_cooperative_tensor<AType, BType, CType>();

    // Create matmul output in register
    auto ct_c = gemm_op.template get_destination_cooperative_tensor<
        decltype(ct_a),
        decltype(ct_b),
        CType>();

    // Load A in to left operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_a[i] = Am0[i];
      ct_a[kElemsPerFrag + i] = Am1[i];
    }

    // Load B into right operand registers
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_b[i] = B[i];
    }

    // Load C into output registers (op handles accumulation)
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      ct_c[i] = Cm0[i];
      ct_c[kElemsPerFrag + i] = Cm1[i];
    }

    // Do matmul
    gemm_op.run(ct_a, ct_b, ct_c);

    // Copy out results
    STEEL_PRAGMA_UNROLL
    for (short i = 0; i < kElemsPerFrag; i++) {
      Cm0[i] = ct_c[i];
      Cm1[i] = ct_c[kElemsPerFrag + i];
    }
  }
};

// Per-thread view of a tile made of kTileRows_ x kTileCols_ fragments, stored
// row-major as an array of per-lane fragment registers. Fragment geometry and
// the element-level load / store / reduce routines come from NAXFrag_.
template <
    typename T,
    short kTileRows_,
    short kTileCols_,
    class NAXFrag_ = BaseNAXFrag>
struct NAXTile {
  using NAXFrag_t = NAXFrag_;
  using elem_type = T;

  // Fragment geometry, forwarded from the fragment policy
  STEEL_CONST short kFragRows = NAXFrag_t::kFragRows;
  STEEL_CONST short kFragCols = NAXFrag_t::kFragCols;
  STEEL_CONST short kElemsPerFrag = NAXFrag_t::kElemsPerFrag;

  // Tile shape counted in fragments
  STEEL_CONST short kTileRows = kTileRows_;
  STEEL_CONST short kTileCols = kTileCols_;

  // Tile shape counted in elements
  STEEL_CONST short kRows = kTileRows * kFragRows;
  STEEL_CONST short kCols = kTileCols * kFragCols;

  // Per-thread storage requirements
  STEEL_CONST short kNumFrags = kTileRows * kTileCols;
  STEEL_CONST short kElemsPerTile = kNumFrags * kElemsPerFrag;

  // Per-thread layout inside a single fragment
  STEEL_CONST short kFragThrRows = NAXFrag_t::kElemRows;
  STEEL_CONST short kFragThrCols = NAXFrag_t::kElemCols;
  STEEL_CONST short kFragRowsJump = NAXFrag_t::kElemRowsJump;

  // Rows / columns of the whole tile touched by one thread
  STEEL_CONST short kRowsPerThread = kTileRows * NAXFrag_t::kElemRows;
  STEEL_CONST short kColsPerThread = kTileCols * NAXFrag_t::kElemCols;

  using frag_type = typename NAXFrag_t::template dtype_frag_t<T>;

  // Fragment registers in row-major fragment order. The constructor leaves
  // them uninitialized; call clear() to zero them.
  frag_type val_frags[kNumFrags];

  METAL_FUNC NAXTile() thread {}

  // Sets every fragment to zero.
  METAL_FUNC constexpr void clear() {
    STEEL_PRAGMA_UNROLL
    for (short f = 0; f < kNumFrags; ++f) {
      val_frags[f] = frag_type(0);
    }
  }

  // Fragment at fragment-row i, fragment-column j.
  METAL_FUNC constexpr thread frag_type& frag_at(const short i, const short j) {
    const int flat = i * kTileCols + j;
    return val_frags[flat];
  }

  METAL_FUNC constexpr const thread frag_type& frag_at(
      const short i,
      const short j) const {
    const int flat = i * kTileCols + j;
    return val_frags[flat];
  }

  // Same lookup with compile-time indices.
  template <int i, int j>
  METAL_FUNC constexpr thread frag_type& frag_at() {
    constexpr int flat = i * kTileCols + j;
    return val_frags[flat];
  }

  template <int i, int j>
  METAL_FUNC constexpr const thread frag_type& frag_at() const {
    constexpr int flat = i * kTileCols + j;
    return val_frags[flat];
  }

  // Lookup that swaps the two indices when `transpose` is true.
  template <bool transpose>
  METAL_FUNC constexpr thread frag_type&
  frag_at(const short i, const short j, metal::bool_constant<transpose>) {
    return transpose ? frag_at(j, i) : frag_at(i, j);
  }

  template <bool transpose>
  METAL_FUNC constexpr const thread frag_type&
  frag_at(const short i, const short j, metal::bool_constant<transpose>) const {
    return transpose ? frag_at(j, i) : frag_at(i, j);
  }

  template <int i, int j, bool transpose>
  METAL_FUNC constexpr thread frag_type& frag_at() {
    return frag_at<(transpose ? j : i), (transpose ? i : j)>();
  }

  template <int i, int j, bool transpose>
  METAL_FUNC constexpr const thread frag_type& frag_at() const {
    return frag_at<(transpose ? j : i), (transpose ? i : j)>();
  }

  // Flat scalar view over all fragment registers.
  METAL_FUNC thread elem_type* elems() {
    return reinterpret_cast<thread elem_type*>(&val_frags[0]);
  }

  METAL_FUNC const thread elem_type* elems() const {
    return reinterpret_cast<const thread elem_type*>(&val_frags[0]);
  }

  // Folds every fragment of the tile into `vals` with Op, one entry per row
  // owned by this thread. Existing contents of `vals` take part in the fold.
  template <typename Op>
  METAL_FUNC void row_reduce(thread metal::vec<T, kRowsPerThread>& vals) const {
    thread T* row_vals = reinterpret_cast<thread T*>(&vals);
    STEEL_PRAGMA_UNROLL
    for (short fr = 0; fr < kTileRows; ++fr) {
      // Slice of `vals` that belongs to this fragment row
      thread T* frag_rows = row_vals + fr * kFragThrRows;
      STEEL_PRAGMA_UNROLL
      for (short fc = 0; fc < kTileCols; ++fc) {
        NAXFrag_t::template row_reduce<Op>(frag_at(fr, fc), frag_rows);
      }
    }
  }

  // Applies Op(element, vals[row]) in place to every element of the tile.
  template <typename Op>
  METAL_FUNC void row_bin_op(thread metal::vec<T, kRowsPerThread>& vals) {
    thread T* row_vals = reinterpret_cast<thread T*>(&vals);
    STEEL_PRAGMA_UNROLL
    for (short fr = 0; fr < kTileRows; ++fr) {
      // Slice of `vals` that belongs to this fragment row
      thread T* frag_rows = row_vals + fr * kFragThrRows;
      STEEL_PRAGMA_UNROLL
      for (short fc = 0; fc < kTileCols; ++fc) {
        NAXFrag_t::template row_bin_op<Op>(frag_at(fr, fc), frag_rows);
      }
    }
  }

  // Unchecked load from threadgroup memory with compile-time strides
  // (str_x between rows, str_y between columns).
  template <typename U, int str_x, int str_y>
  METAL_FUNC void load(const threadgroup U* src) {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::load(
            frag_at<fr.value, fc.value>(),
            src,
            Int<str_x>{},
            Int<str_y>{},
            row_off,
            col_off);
      });
    });
  }

  // Unchecked store to threadgroup memory with compile-time strides.
  template <typename U, int str_x, int str_y>
  METAL_FUNC void store(threadgroup U* dst) const {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::store(
            frag_at<fr.value, fc.value>(),
            dst,
            Int<str_x>{},
            Int<str_y>{},
            row_off,
            col_off);
      });
    });
  }

  // Unchecked load from device memory; rows are `ld` apart, columns contiguous.
  template <typename U>
  METAL_FUNC void load(const device U* src, const int ld) {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::load(
            frag_at<fr.value, fc.value>(),
            src,
            ld,
            Int<1>{},
            row_off,
            col_off);
      });
    });
  }

  // Unchecked store to device memory; rows are `ld` apart, columns contiguous.
  template <typename U>
  METAL_FUNC void store(device U* dst, const int ld) const {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::store(
            frag_at<fr.value, fc.value>(),
            dst,
            ld,
            Int<1>{},
            row_off,
            col_off);
      });
    });
  }

  // Device load that only bounds-checks rows against n_rows.
  template <typename U>
  METAL_FUNC void
  load_rows(const device U* src, const int ld, const short n_rows) {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::load_rows(
            frag_at<fr.value, fc.value>(),
            src,
            ld,
            Int<1>{},
            n_rows,
            row_off,
            col_off);
      });
    });
  }

  // Device load bounds-checked in both dimensions; src_tile_dims.y is passed
  // as the row limit and src_tile_dims.x as the column limit.
  template <typename U>
  METAL_FUNC void
  load_safe(const device U* src, const int ld, const short2 src_tile_dims) {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::load_safe(
            frag_at<fr.value, fc.value>(),
            src,
            ld,
            Int<1>{},
            src_tile_dims.y,
            src_tile_dims.x,
            row_off,
            col_off);
      });
    });
  }

  // Device store that only bounds-checks rows against n_rows.
  template <typename U>
  METAL_FUNC void store_rows(device U* dst, const int ld, const short n_rows)
      const {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::store_rows(
            frag_at<fr.value, fc.value>(),
            dst,
            ld,
            Int<1>{},
            n_rows,
            row_off,
            col_off);
      });
    });
  }

  // Device store bounds-checked in both dimensions; dst_tile_dims.y is passed
  // as the row limit and dst_tile_dims.x as the column limit.
  template <typename U>
  METAL_FUNC void
  store_safe(device U* dst, const int ld, const short2 dst_tile_dims) const {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::store_safe(
            frag_at<fr.value, fc.value>(),
            dst,
            ld,
            Int<1>{},
            dst_tile_dims.y,
            dst_tile_dims.x,
            row_off,
            col_off);
      });
    });
  }

  // Device store restricted to a window: the .y components of start / stop
  // are passed as the row bounds and the .x components as the column bounds.
  template <typename U>
  METAL_FUNC void store_slice(
      device U* dst,
      const int ld,
      const short2 start,
      const short2 stop) const {
    const_for_loop<0, kTileRows, 1>([&](auto fr) {
      const_for_loop<0, kTileCols, 1>([&](auto fc) {
        const auto row_off = fr * Int<kFragRows>{};
        const auto col_off = fc * Int<kFragCols>{};
        NAXFrag_t::store_slice(
            frag_at<fr.value, fc.value>(),
            dst,
            ld,
            Int<1>{},
            start.y,
            stop.y,
            start.x,
            stop.x,
            row_off,
            col_off);
      });
    });
  }
};

// Tile-level multiply-accumulate: issues CTile::NAXFrag_t::mma over the
// fragments of A, B and C.
//
// The tile extents (in fragments) are checked at compile time: after taking
// the transpose flags into account, A must supply M x K fragments, B must
// supply K x N fragments and C must be M x N fragments.
//
// Each mma call handles two C fragments at once:
//   * TN == 1 and TM even: pairs of consecutive rows (mm, mm + 1) share one
//     B fragment.
//   * TN even: pairs of consecutive columns (nn, nn + 1) share one A fragment.
//
// NOTE(review): when neither condition holds (e.g. TN odd and greater than 1,
// or TN == 1 with TM odd) no branch is selected and the function compiles to
// a no-op without any diagnostic. Confirm that no such instantiation exists,
// or consider adding a static_assert for the unsupported shapes.
template <
    class CTile,
    class ATile,
    class BTile,
    bool transpose_a,
    bool transpose_b>
METAL_FUNC void tile_matmad_nax(
    thread CTile& C,
    thread ATile& A,
    metal::bool_constant<transpose_a>,
    thread BTile& B,
    metal::bool_constant<transpose_b>) {
  // Static checks
  constexpr short TMa = transpose_a ? ATile::kTileCols : ATile::kTileRows;
  constexpr short TM = CTile::kTileRows;
  static_assert(TMa == TM, "MXU tile matmul: M dimensions do not match");

  constexpr short TNb = transpose_b ? BTile::kTileRows : BTile::kTileCols;
  constexpr short TN = CTile::kTileCols;
  static_assert(TNb == TN, "MXU tile matmul: N dimensions do not match");

  constexpr short TKa = transpose_a ? ATile::kTileRows : ATile::kTileCols;
  constexpr short TK = transpose_b ? BTile::kTileCols : BTile::kTileRows;
  static_assert(TKa == TK, "MXU tile matmul: K dimensions do not match");

  // Tag values forwarded to frag_at so it can account for the transposes.
  constexpr auto ta = metal::bool_constant<transpose_a>{};
  constexpr auto tb = metal::bool_constant<transpose_b>{};

  if constexpr (TN == 1 && TM % 2 == 0) {
    // Single fragment column: step two fragment rows per mma call.
    STEEL_PRAGMA_UNROLL
    for (short mm = 0; mm < TM; mm += 2) {
      STEEL_PRAGMA_UNROLL
      for (short nn = 0; nn < TN; ++nn) {
        STEEL_PRAGMA_UNROLL
        for (short kk = 0; kk < TK; ++kk) {
          CTile::NAXFrag_t::mma(
              C.frag_at(mm, nn),
              C.frag_at(mm + 1, nn),
              A.frag_at(mm, kk, ta),
              A.frag_at(mm + 1, kk, ta),
              metal::bool_constant<transpose_a>{},
              B.frag_at(kk, nn, tb),
              metal::bool_constant<transpose_b>{});
        }
      }
    }
  } else if constexpr (TN % 2 == 0) {
    // Even number of fragment columns: step two fragment columns per mma call.
    STEEL_PRAGMA_UNROLL
    for (short mm = 0; mm < TM; ++mm) {
      STEEL_PRAGMA_UNROLL
      for (short nn = 0; nn < TN; nn += 2) {
        STEEL_PRAGMA_UNROLL
        for (short kk = 0; kk < TK; ++kk) {
          CTile::NAXFrag_t::mma(
              C.frag_at(mm, nn),
              C.frag_at(mm, nn + 1),
              A.frag_at(mm, kk, ta),
              metal::bool_constant<transpose_a>{},
              B.frag_at(kk, nn, tb),
              B.frag_at(kk, nn + 1, tb),
              metal::bool_constant<transpose_b>{});
        }
      }
    }
  }
}

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/params.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

///////////////////////////////////////////////////////////////////////////////
// GEMM param classes
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

// Plain parameter block for GEMM kernels. All members are const ints or
// 64-bit batch strides.
// NOTE(review): the per-field notes below are inferred from the member names
// only; confirm against the host code that fills this struct. Member order
// defines the struct layout and must not be changed.
struct GEMMParams {
  // Presumably the problem dimensions.
  const int M;
  const int N;
  const int K;

  // Presumably the leading dimensions of the A, B and D matrices.
  const int lda;
  const int ldb;
  const int ldd;

  // Presumably the number of output tiles along N and M.
  const int tiles_n;
  const int tiles_m;

  // Presumably the element strides between consecutive batch entries.
  const int64_t batch_stride_a;
  const int64_t batch_stride_b;
  const int64_t batch_stride_d;

  // Presumably log2 of the threadgroup swizzle factor (see
  // BlockSwizzle::swizzle) and the count of full K iterations -- TODO confirm.
  const int swizzle_log;
  const int gemm_k_iterations_aligned;

  // Presumably the number of batch dimensions.
  const int batch_ndim;
};

// Plain parameter block for split-K GEMM kernels.
// NOTE(review): the per-field notes below are inferred from the member names
// only; confirm against the host code. The struct name (including its
// "Spilt" spelling) is part of the interface and is kept as is. Member order
// defines the struct layout and must not be changed.
struct GEMMSpiltKParams {
  // Presumably the problem dimensions.
  const int M;
  const int N;
  const int K;

  // Presumably the leading dimensions of A, B and C.
  const int lda;
  const int ldb;
  const int ldc;

  // Presumably the number of output tiles along N and M.
  const int tiles_n;
  const int tiles_m;

  // Presumably how K is partitioned: partition count, stride between partial
  // results, and K extent per partition -- TODO confirm.
  const int split_k_partitions;
  const int split_k_partition_stride;
  const int split_k_partition_size;

  const int swizzle_log;
  const int gemm_k_iterations_aligned;
};

// Extra parameter block for GEMM kernels that combine the product with a
// third operand C.
// NOTE(review): field meanings are inferred from names only; confirm against
// the host code. Member order defines the struct layout.
struct GEMMAddMMParams {
  // Presumably the two strides used to address C -- TODO confirm.
  const int ldc;
  const int fdc;

  // Presumably the element stride between consecutive batch entries of C.
  const int64_t batch_stride_c;

  // Presumably the scale factors consumed by TransformAxpby.
  const float alpha;
  const float beta;
};

} // namespace steel
} // namespace mlx


================================================
FILE: mlx/backend/metal/kernels/steel/gemm/transforms.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/utils.h"

///////////////////////////////////////////////////////////////////////////////
// Transforms and Epilogues
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

// Epilogue that only converts the accumulator type to the output type.
template <typename OutT, typename InT>
struct TransformNone {
  // Converts `x` from InT to OutT.
  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  // Two-argument form: the second operand is ignored and the result is the
  // same conversion as the one-argument form.
  static METAL_FUNC OutT apply(InT x, OutT) {
    return apply(x);
  }
};

// Epilogue that converts the accumulator to OutT and, in the two-argument
// form, adds an OutT operand to it.
template <typename OutT, typename InT>
struct TransformAdd {
  // Accepts two floats and ignores both.
  TransformAdd(const float, const float) {}

  // Converts `x` from InT to OutT.
  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  // Returns the converted `x` plus `c`.
  static METAL_FUNC OutT apply(InT x, OutT c) {
    const OutT converted = static_cast<OutT>(x);
    return converted + c;
  }
};

// Epilogue computing alpha * x + beta * c.
template <typename OutT, typename InT>
struct TransformAxpby {
  const float alpha;
  const float beta;

  // Stores the two scale factors.
  TransformAxpby(const float alpha_, const float beta_)
      : alpha(alpha_), beta(beta_) {}

  // One-argument form: plain conversion, no scaling is applied.
  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  // Scales `x` by alpha (cast to InT) and `c` by beta (cast to OutT), sums
  // the two terms and converts the sum to OutT.
  METAL_FUNC OutT apply(InT x, OutT c) const {
    const auto scaled_x = x * static_cast<InT>(alpha);
    const auto scaled_c = static_cast<OutT>(beta) * c;
    return static_cast<OutT>(scaled_x + scaled_c);
  }
};

// Maps an element type to its accumulation type. The primary template uses
// float for every T.
template <typename T>
struct AccumHelper {
  using accum_type = float;
};

// Remaps a threadgroup position using a power-of-two swizzle.
struct BlockSwizzle {
  // Returns (tid.x >> swizzle_log,
  //          (tid.y << swizzle_log) + (tid.x & (2^swizzle_log - 1))).
  // The low `swizzle_log` bits of tid.x are moved into the y coordinate.
  static METAL_FUNC int2
  swizzle(uint3 tid [[threadgroup_position_in_grid]], const int swizzle_log) {
    const int low_mask = (1 << swizzle_log) - 1;
    const int swizzled_x = (tid.x) >> swizzle_log;
    const int swizzled_y = ((tid.y) << swizzle_log) + ((tid.x) & low_mask);
    return int2(swizzled_x, swizzled_y);
  }
};

} // namespace steel
} // namespace mlx

================================================
FILE: mlx/backend/metal/kernels/steel/utils/integral_constant.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <metal_stdlib>
#include "mlx/backend/metal/kernels/steel/utils/type_traits.h"

#pragma METAL internals : enable

namespace mlx {
namespace steel {

///////////////////////////////////////////////////////////////////////////////
// Integral constant with casting
///////////////////////////////////////////////////////////////////////////////

// Compile-time constant of type T with value v, carried in the type itself.
// Objects of this type convert implicitly to T.
template <typename T, T v>
struct integral_constant {
  // The wrapped value.
  static constexpr constant T value = v;
  using value_type = T;
  using type = integral_constant;

  // Implicit conversion to the wrapped value.
  METAL_FUNC constexpr operator value_type() const noexcept {
    return value;
  }

  // METAL_FUNC constexpr value_type operator()() const noexcept {
  //   return value;
  // }
};

// Boolean specializations of integral_constant, plus the two concrete tag
// types used for compile-time true/false dispatch.
template <bool B>
using bool_constant = integral_constant<bool, B>;
using true_type = bool_constant<true>;
using false_type = bool_constant<false>;

// Trait that mirrors metal::is_integral for ordinary types.
template <class T>
struct is_integral : bool_constant<metal::is_integral<T>::value> {};

// For an integral_constant<T, v> the trait reports whether the wrapped type T
// is integral.
template <class T, T v>
struct is_integral<integral_constant<T, v>>
    : bool_constant<metal::is_integral<T>::value> {};

// Variable-template shorthand for is_integral<T>::value.
template <typename T>
constexpr constant bool is_integral_v = is_integral<T>::value;

// Shorthand for an int-valued integral_constant.
template <int val>
using Int = integral_constant<int, val>;

///////////////////////////////////////////////////////////////////////////////
// Binary Operators on Integral constants
///////////////////////////////////////////////////////////////////////////////

// Defines a binary operator on two integral_constant values. The operands'
// values are combined at compile time and the result is returned as a new
// integral_constant whose type and value come from that expression.
#define integral_const_binop(__op__, __operator__)          \
  template <typename T, T tv, typename U, U uv>             \
  METAL_FUNC constexpr auto __operator__(                   \
      integral_constant<T, tv>, integral_constant<U, uv>) { \
    constexpr auto res = tv __op__ uv;                      \
    return integral_constant<decltype(res), res>{};         \
  }

// Arithmetic operators.
integral_const_binop(+, operator+);
integral_const_binop(-, operator-);
integral_const_binop(*, operator*);
integral_const_binop(/, operator/);

// Comparison operators.
integral_const_binop(==, operator==);
integral_const_binop(!=, operator!=);
integral_const_binop(<, operator<);
integral_const_binop(>, operator>);
integral_const_binop(<=, operator<=);
integral_const_binop(>=, operator>=);

// Logical operators.
integral_const_binop(&&, operator&&);
integral_const_binop(||, operator||);

// Logical OR with a compile-time `true` on either side: the result is
// true_type regardless of the other operand. These overloads only participate
// when the other operand's type T is not integral according to is_integral_v.
template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator||(true_type, T) {
  return true_type{};
}
template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator||(T, true_type) {
  return true_type{};
}

// Logical AND with a compile-time `false` on either side: the result is
// false_type regardless of the other operand. These overloads only
// participate when the other operand's type T is not integral according to
// is_integral_v.
template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator&&(false_type, T) {
  return false_type{};
}

template <typename T, typename = metal::enable_if_t<!is_integral_v<T>>>
METAL_FUNC constexpr auto operator&&(T, false_type) {
  return false_type{};
}

// Dispatch utilities
// Turns a runtime bool into a compile-time tag: calls `f(true_type{})` when
// `v` is true and `f(false_type{})` otherwise.
template <typename F>
void dispatch_bool(bool v, F f) {
  if (!v) {
    f(false_type{});
    return;
  }
  f(true_type{});
}

// Compile-time loop: calls `f` with Int<start>, Int<start + step>, ... for
// as long as the index is less than `stop`. Each index is passed as a
// distinct integral_constant type, so the callee can use it as a constant
// expression.
template <int start, int stop, int step, typename F>
constexpr void const_for_loop(F f) {
  if constexpr (!(start < stop)) {
    // Past the end: the recursion stops here.
    return;
  } else {
    constexpr auto current = Int<start>{};
    f(current);
    const_for_loop<start + step, stop, step, F>(f);
  }
}

#undef integral_const_binop

///////////////////////////////////////////////////////////////////////////////
// Reduction operators
///////////////////////////////////////////////////////////////////////////////

// Base case: the sum of a single value is the value itself.
template <typename T>
METAL_FUNC constexpr T sum(T x) {
  return x;
}

// Recursive case: adds `x` to the sum of the remaining arguments. The
// remaining arguments are reduced first, so the additions associate to the
// right.
template <typename T, typename... Us>
METAL_FUNC constexpr auto sum(T x, Us... us) {
  const auto tail = sum(us...);
  return x + tail;
}

} // namespace steel
} // namespace mlx

#pragma METAL internals : disable

================================================
FILE: mlx/backend/metal/kernels/steel/utils/type_traits.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <metal_stdlib>

#pragma METAL internals : enable

// Additional type traits placed in the `metal` namespace.
namespace metal {

// True when T is an empty class type, per the __is_empty builtin.
template <typename T>
struct is_empty : metal::bool_constant<__is_empty(T)> {};

#ifdef __cpp_variable_templates
// Variable-template shorthand for is_empty<T>::value.
template <typename T>
constexpr constant bool is_empty_v = is_empty<T>::value;
#endif

// Maps any list of types to void.
template <typename... Ts>
struct make_void {
  typedef void type;
};

// Alias for make_void<Ts...>::type, i.e. always void.
template <typename... Ts>
using void_t = typename make_void<Ts...>::type;

// True when T, with cv-qualifiers removed, is an empty type.
template <class T>
struct is_static : metal::bool_constant<is_empty<remove_cv_t<T>>::value> {};

// Element type of a pointer. The primary template has no `type` member; each
// specialization below handles one address space and strips cv-qualifiers
// from the pointee.
template <typename T>
struct pointer_element {};

template <typename T>
struct pointer_element<thread T*> {
  using type = remove_cv_t<T>;
};
template <typename T>
struct pointer_element<device T*> {
  using type = remove_cv_t<T>;
};
template <typename T>
struct pointer_element<constant T*> {
  using type = remove_cv_t<T>;
};
template <typename T>
struct pointer_element<threadgroup T*> {
  using type = remove_cv_t<T>;
};

// Shorthand that also removes cv-qualifiers from the pointer type itself
// before looking up the element type.
template <typename T>
using pointer_element_t = typename pointer_element<remove_cv_t<T>>::type;

} // namespace metal

#pragma METAL internals : disable

================================================
FILE: mlx/backend/metal/kernels/steel/utils.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <metal_stdlib>

// Converts a linear element index into offsets for two operands that share
// `shape` but have their own strides.
//
// Dimensions are consumed from the last to the first: the coordinate in each
// dimension is `elem % shape[i]` and is weighted by that operand's stride.
// The walk stops early once the remaining index reaches zero.
// Returns (offset for a, offset for b).
METAL_FUNC ulong2 elem_to_loc_broadcast(
    uint elem,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    int ndim) {
  ulong offset_a{0};
  ulong offset_b{0};
  int dim = ndim - 1;
  while (dim >= 0 && elem > 0) {
    int coord = (elem % shape[dim]);
    elem /= shape[dim];
    offset_a += coord * a_strides[dim];
    offset_b += coord * b_strides[dim];
    --dim;
  }
  return ulong2(offset_a, offset_b);
}

// Three-operand variant: converts a linear element index into offsets for
// three operands that share `shape` but have their own strides.
//
// Dimensions are consumed from the last to the first, and the walk stops
// early once the remaining index reaches zero.
// Returns (offset for a, offset for b, offset for c).
METAL_FUNC ulong3 elem_to_loc_broadcast(
    uint elem,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    constant const int64_t* c_strides,
    int ndim) {
  ulong offset_a{0};
  ulong offset_b{0};
  ulong offset_c{0};
  int dim = ndim - 1;
  while (dim >= 0 && elem > 0) {
    int coord = (elem % shape[dim]);
    elem /= shape[dim];
    offset_a += coord * a_strides[dim];
    offset_b += coord * b_strides[dim];
    offset_c += coord * c_strides[dim];
    --dim;
  }
  return ulong3(offset_a, offset_b, offset_c);
}


================================================
FILE: mlx/backend/metal/kernels/ternary.h
================================================
// Copyright © 2024 Apple Inc.

// Element-wise ternary kernel over flat buffers: d[i] = Op(a[i], b[i], c[i]).
//
// Each thread handles N consecutive elements starting at index * N.
// When BSCALAR (or CSCALAR) is set, `b` (or `c`) is always read at element 0.
template <
    typename T,
    typename Op,
    bool BSCALAR,
    bool CSCALAR,
    int N = WorkPerThread<T>::n>
[[kernel]] void ternary_v(
    device const bool* a,
    device const T* b,
    device const T* c,
    device T* d,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  index *= N;
  const bool crosses_end = N > 1 && index + N > size;
  if (crosses_end) {
    // This thread's chunk runs past `size`: stop at the last valid element.
    for (int k = 0; index + k < size; ++k) {
      const auto pos = index + k;
      const auto b_pos = BSCALAR ? 0 : pos;
      const auto c_pos = CSCALAR ? 0 : pos;
      d[pos] = Op()(a[pos], b[b_pos], c[c_pos]);
    }
    return;
  }
  // Full chunk: no per-element bounds check.
  for (int k = 0; k < N; ++k) {
    const auto pos = index + k;
    const auto b_pos = BSCALAR ? 0 : pos;
    const auto c_pos = CSCALAR ? 0 : pos;
    d[pos] = Op()(a[pos], b[b_pos], c[c_pos]);
  }
}

// Variant of ternary_v that takes a 2D grid position and a 64-bit size.
//
// The starting element is N * (index.x + grid_dim.x * index.y), computed in
// 64 bits. Each thread handles N consecutive elements from there.
// When BSCALAR (or CSCALAR) is set, `b` (or `c`) is always read at element 0.
template <
    typename T,
    typename Op,
    bool BSCALAR,
    bool CSCALAR,
    int N = WorkPerThread<T>::n>
[[kernel]] void ternary_v2(
    device const bool* a,
    device const T* b,
    device const T* c,
    device T* d,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  int64_t first = N * (index.x + grid_dim.x * int64_t(index.y));
  const bool crosses_end = N > 1 && first + N > size;
  if (crosses_end) {
    // This thread's chunk runs past `size`: stop at the last valid element.
    for (int k = 0; first + k < size; ++k) {
      const auto pos = first + k;
      const auto b_pos = BSCALAR ? 0 : pos;
      const auto c_pos = CSCALAR ? 0 : pos;
      d[pos] = Op()(a[pos], b[b_pos], c[c_pos]);
    }
    return;
  }
  // Full chunk: no per-element bounds check.
  for (int k = 0; k < N; ++k) {
    const auto pos = first + k;
    const auto b_pos = BSCALAR ? 0 : pos;
    const auto c_pos = CSCALAR ? 0 : pos;
    d[pos] = Op()(a[pos], b[b_pos], c[c_pos]);
  }
}

// Strided ternary kernel for 1D inputs. Each thread produces d[index] from
// the input elements located via elem_to_loc_1 with each operand's stride.
template <typename T, typename Op, typename IdxT = int64_t>
[[kernel]] void ternary_g_nd1(
    device const bool* a,
    device const T* b,
    device const T* c,
    device T* d,
    constant const int64_t& a_strides,
    constant const int64_t& b_strides,
    constant const int64_t& c_strides,
    uint index [[thread_position_in_grid]]) {
  const auto a_loc = elem_to_loc_1<IdxT>(index, a_strides);
  const auto b_loc = elem_to_loc_1<IdxT>(index, b_strides);
  const auto c_loc = elem_to_loc_1<IdxT>(index, c_strides);
  // The output is written at the thread's linear index.
  Op op;
  d[index] = op(a[a_loc], b[b_loc], c[c_loc]);
}

// Strided ternary kernel for 2D inputs. Input locations come from
// elem_to_loc_2 with each operand's strides; the output index is
// index.x + grid_dim.x * index.y.
template <typename T, typename Op, typename IdxT = int64_t>
[[kernel]] void ternary_g_nd2(
    device const bool* a,
    device const T* b,
    device const T* c,
    device T* d,
    constant const int64_t a_strides[2],
    constant const int64_t b_strides[2],
    constant const int64_t c_strides[2],
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  const auto a_loc = elem_to_loc_2<IdxT>(index, a_strides);
  const auto b_loc = elem_to_loc_2<IdxT>(index, b_strides);
  const auto c_loc = elem_to_loc_2<IdxT>(index, c_strides);
  // Linear output position, computed in IdxT.
  IdxT d_loc = index.x + IdxT(grid_dim.x) * index.y;
  Op op;
  d[d_loc] = op(a[a_loc], b[b_loc], c[c_loc]);
}

// Strided ternary kernel for 3D inputs. Input locations come from
// elem_to_loc_3 with each operand's strides; the output index is
// index.x + grid_dim.x * (index.y + grid_dim.y * index.z).
template <typename T, typename Op, typename IdxT = int64_t>
[[kernel]] void ternary_g_nd3(
    device const bool* a,
    device const T* b,
    device const T* c,
    device T* d,
    constant const int64_t a_strides[3],
    constant const int64_t b_strides[3],
    constant const int64_t c_strides[3],
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  const auto a_loc = elem_to_loc_3<IdxT>(index, a_strides);
  const auto b_loc = elem_to_loc_3<IdxT>(index, b_strides);
  const auto c_loc = elem_to_loc_3<IdxT>(index, c_strides);
  // Linear output position, computed in IdxT.
  IdxT d_loc = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);
  Op op;
  d[d_loc] = op(a[a_loc], b[b_loc], c[c_loc]);
}

// General strided ternary kernel for `ndim`-dimensional inputs.
//
// Each thread handles up to N consecutive elements along the last dimension,
// starting at N * index.x. Input offsets for the first element come from
// elem_to_loc_3_nd and are then advanced by each operand's last-dimension
// stride; the output is written contiguously.
template <typename T, typename Op, int N = 1, typename IdxT = int64_t>
[[kernel]] void ternary_g(
    device const bool* a,
    device const T* b,
    device const T* c,
    device T* d,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    constant const int64_t* c_strides,
    constant const int& ndim,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  auto loc = elem_to_loc_3_nd<IdxT>(
      {N * index.x, index.y, index.z},
      shape,
      a_strides,
      b_strides,
      c_strides,
      ndim);
  const auto last_extent = shape[ndim - 1];
  IdxT d_loc =
      N * index.x + last_extent * (index.y + IdxT(grid_dim.y) * index.z);
  const IdxT a_step = a_strides[ndim - 1];
  const IdxT b_step = b_strides[ndim - 1];
  const IdxT c_step = c_strides[ndim - 1];
  // Stop after N elements or at the end of the last dimension, whichever
  // comes first.
  const int x_begin = int(N * index.x);
  for (int k = 0; k < N && (x_begin + k) < last_extent; ++k) {
    d[d_loc++] = Op()(a[loc.x], b[loc.y], c[loc.z]);
    loc.x += a_step;
    loc.y += b_step;
    loc.z += c_step;
  }
}


================================================
FILE: mlx/backend/metal/kernels/ternary.metal
================================================
// Copyright © 2024 Apple Inc.

#include <metal_integer>
#include <metal_math>

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/ternary_ops.h"
#include "mlx/backend/metal/kernels/ternary.h"

// Instantiates the ternary kernels that are generated for every element type.
// Kernel names are the listed prefix followed by the op name and type name:
//   v_/vs_/sv_     ternary_v with N = 1 (no scalar / scalar c / scalar b)
//   v2_/vs2_/sv2_  ternary_v2 with the default N
//   gn2_           ternary_g with N = 2 and int indexing
//   g1_/g2_/g3_    ternary_g_nd{1,2,3} with int indexing
//   g*large_       the same kernels with the default (int64_t) index type
//   gn4large_      ternary_g with N = 4 and the default index type
#define instantiate_ternary_base(op, tname, type)                    \
  instantiate_kernel("v_" #op #tname, ternary_v, type, op, false, false, 1) \
  instantiate_kernel("v2_" #op #tname, ternary_v2, type, op, false, false)  \
  instantiate_kernel("vs_" #op #tname, ternary_v, type, op, false, true, 1) \
  instantiate_kernel("vs2_" #op #tname, ternary_v2, type, op, false, true)  \
  instantiate_kernel("sv_" #op #tname, ternary_v, type, op, true, false, 1) \
  instantiate_kernel("sv2_" #op #tname, ternary_v2, type, op, true, false)  \
  instantiate_kernel("gn2_" #op #tname, ternary_g, type, op, 2, int) \
  instantiate_kernel("g1_" #op #tname, ternary_g_nd1, type, op, int) \
  instantiate_kernel("g2_" #op #tname, ternary_g_nd2, type, op, int) \
  instantiate_kernel("g3_" #op #tname, ternary_g_nd3, type, op, int) \
  instantiate_kernel("g1large_" #op #tname, ternary_g_nd1, type, op) \
  instantiate_kernel("g2large_" #op #tname, ternary_g_nd2, type, op) \
  instantiate_kernel("g3large_" #op #tname, ternary_g_nd3, type, op) \
  instantiate_kernel("gn4large_" #op #tname, ternary_g, type, op, 4) \

// Base set plus the vn_/vsn_/svn_ variants of ternary_v, which use the default
// N (WorkPerThread<T>::n).
#define instantiate_ternary_all(op, tname, type)            \
  instantiate_kernel("vn_" #op #tname, ternary_v, type, op, false, false) \
  instantiate_kernel("vsn_" #op #tname, ternary_v, type, op, false, true) \
  instantiate_kernel("svn_" #op #tname, ternary_v, type, op, true, false) \
  instantiate_ternary_base(op, tname, type)

// Instantiates an op for every supported element type. uint64, int64 and
// complex64 receive only the base set.
#define instantiate_ternary_types(op)               \
  instantiate_ternary_all(op, bool_, bool)          \
  instantiate_ternary_all(op, uint8, uint8_t)       \
  instantiate_ternary_all(op, uint16, uint16_t)     \
  instantiate_ternary_all(op, uint32, uint32_t)     \
  instantiate_ternary_base(op, uint64, uint64_t)    \
  instantiate_ternary_all(op, int8, int8_t)         \
  instantiate_ternary_all(op, int16, int16_t)       \
  instantiate_ternary_all(op, int32, int32_t)       \
  instantiate_ternary_base(op, int64, int64_t)      \
  instantiate_ternary_all(op, float16, half)        \
  instantiate_ternary_all(op, float32, float)       \
  instantiate_ternary_all(op, bfloat16, bfloat16_t) \
  instantiate_ternary_base(op, complex64, complex64_t) // clang-format on

// Select is the only ternary op instantiated here.
instantiate_ternary_types(Select)


================================================
FILE: mlx/backend/metal/kernels/ternary_ops.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

// Ternary selection: yields `x` when `condition` is true and `y` otherwise.
struct Select {
  template <typename T>
  T operator()(bool condition, T x, T y) {
    if (condition) {
      return x;
    }
    return y;
  }
};


================================================
FILE: mlx/backend/metal/kernels/unary.h
================================================
// Copyright © 2024 Apple Inc.

// Element-wise unary kernel over flat buffers: out[i] = U(Op(in[i])).
// Each thread handles N consecutive elements starting at index * N.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void unary_v(
    device const T* in,
    device U* out,
    constant uint& size,
    uint index [[thread_position_in_grid]]) {
  const uint first = index * N;
  if (N > 1 && first + N > size) {
    // This thread's chunk runs past `size`: stop at the last valid element.
    for (int k = 0; first + k < size; ++k) {
      out[first + k] = static_cast<U>(Op()(in[first + k]));
    }
    return;
  }
  // Full chunk: no per-element bounds check.
  for (int k = 0; k < N; ++k) {
    out[first + k] = static_cast<U>(Op()(in[first + k]));
  }
}

// Variant of unary_v that takes a 2D grid position and a 64-bit size.
// The starting element is N * (index.x + grid_dim.x * index.y), computed in
// 64 bits; each thread handles N consecutive elements from there.
template <typename T, typename U, typename Op, int N = WorkPerThread<T>::n>
[[kernel]] void unary_v2(
    device const T* in,
    device U* out,
    constant int64_t& size,
    uint2 index [[thread_position_in_grid]],
    uint2 grid_dim [[threads_per_grid]]) {
  const int64_t first = N * (index.x + grid_dim.x * int64_t(index.y));
  if (N > 1 && first + N > size) {
    // This thread's chunk runs past `size`: stop at the last valid element.
    for (int k = 0; first + k < size; ++k) {
      out[first + k] = static_cast<U>(Op()(in[first + k]));
    }
    return;
  }
  // Full chunk: no per-element bounds check.
  for (int k = 0; k < N; ++k) {
    out[first + k] = static_cast<U>(Op()(in[first + k]));
  }
}

// General strided unary kernel for `ndim`-dimensional inputs.
//
// Each thread handles up to N consecutive elements along the last dimension,
// starting at N * index.x. The input offset of the first element comes from
// elem_to_loc and is then advanced by the last-dimension stride; the output
// is written contiguously.
template <
    typename T,
    typename U,
    typename Op,
    int N = 1,
    typename IdxT = int64_t>
[[kernel]] void unary_g(
    device const T* in,
    device U* out,
    constant const int* in_shape,
    constant const int64_t* in_strides,
    device const int& ndim,
    uint3 index [[thread_position_in_grid]],
    uint3 grid_dim [[threads_per_grid]]) {
  auto in_loc = elem_to_loc<IdxT>(
      {N * index.x, index.y, index.z}, in_shape, in_strides, ndim);
  const auto last_extent = in_shape[ndim - 1];
  const IdxT in_step = in_strides[ndim - 1];
  IdxT out_loc =
      N * index.x + last_extent * (index.y + IdxT(grid_dim.y) * index.z);
  // Stop after N elements or at the end of the last dimension, whichever
  // comes first.
  const int x_begin = int(N * index.x);
  for (int k = 0; k < N && (x_begin + k) < last_extent; ++k) {
    out[out_loc++] = static_cast<U>(Op()(in[in_loc]));
    in_loc += in_step;
  }
}


================================================
FILE: mlx/backend/metal/kernels/unary.metal
================================================
// Copyright © 2024 Apple Inc.

// clang-format off
#include "mlx/backend/metal/kernels/utils.h"
#include "mlx/backend/metal/kernels/unary_ops.h"
#include "mlx/backend/metal/kernels/unary.h"

// Instantiates the "vn_" variant: unary_v with the default N
// (WorkPerThread<T>::n).
#define instantiate_unary_work_per_thread(op, in_tname, out_tname, in_type, out_type) \
  instantiate_kernel("vn_" #op #in_tname #out_tname, unary_v, in_type, out_type, op)

// Instantiates the kernels generated for every type combination:
//   v_         unary_v with N = 1
//   v2_        unary_v2 with the default N
//   gn1_       unary_g with N = 1 and int indexing
//   gn4large_  unary_g with N = 4 and the default (int64_t) index type
#define instantiate_unary_base(op, in_tname, out_tname, in_type, out_type)             \
  instantiate_kernel("v_" #op #in_tname #out_tname, unary_v, in_type, out_type, op, 1) \
  instantiate_kernel("v2_" #op #in_tname #out_tname, unary_v2, in_type, out_type, op)  \
  instantiate_kernel(                                                                  \
      "gn1_" #op #in_tname #out_tname, unary_g, in_type, out_type, op, 1, int)         \
  instantiate_kernel(                                                                  \
      "gn4large_" #op #in_tname #out_tname, unary_g, in_type, out_type, op, 4)

// Base set plus the "vn_" variant.
#define instantiate_unary_all(op, in_tname, out_tname, in_type, out_type)       \
  instantiate_unary_base(op, in_tname, out_tname, in_type, out_type)            \
  instantiate_unary_work_per_thread(op, in_tname, out_tname, in_type, out_type)

// Convenience forms for ops whose input and output types are the same.
#define instantiate_unary_all_same(op, tname, type)   \
  instantiate_unary_all(op, tname, tname, type, type)

#define instantiate_unary_base_same(op, tname, type)   \
  instantiate_unary_base(op, tname, tname, type, type)

// Instantiates an op for half, float and bfloat16_t.
#define instantiate_unary_float(op)                    \
  instantiate_unary_all_same(op, float16, half)        \
  instantiate_unary_all_same(op, float32, float)       \
  instantiate_unary_all_same(op, bfloat16, bfloat16_t)

// Instantiates an op for the integer types. The 64-bit types receive only the
// base set.
#define instantiate_unary_int(op)                   \
  instantiate_unary_all_same(op, uint8, uint8_t)    \
  instantiate_unary_all_same(op, uint16, uint16_t)  \
  instantiate_unary_all_same(op, uint32, uint32_t)  \
  instantiate_unary_base_same(op, uint64, uint64_t) \
  instantiate_unary_all_same(op, int8, int8_t)      \
  instantiate_unary_all_same(op, int16, int16_t)    \
  instantiate_unary_all_same(op, int32, int32_t)    \
  instantiate_unary_base_same(op, int64, int64_t)

// Instantiates an op for bool, the integer types and the floating-point types.
#define instantiate_unary_types(op)                \
  instantiate_unary_all_same(op, bool_, bool)      \
  instantiate_unary_int(op)                        \
  instantiate_unary_float(op)

// Ops instantiated for real element types. `types` covers bool, integer and
// floating-point types; `float` covers the floating-point types only; `int`
// covers the integer types only.
instantiate_unary_types(Abs)
instantiate_unary_float(ArcCos)
instantiate_unary_float(ArcCosh)
instantiate_unary_float(ArcSin)
instantiate_unary_float(ArcSinh)
instantiate_unary_float(ArcTan)
instantiate_unary_float(ArcTanh)
instantiate_unary_types(Ceil)
instantiate_unary_float(Cos)
instantiate_unary_float(Cosh)
instantiate_unary_float(Exp)
instantiate_unary_float(Expm1)
instantiate_unary_types(Floor)
instantiate_unary_float(Log)
instantiate_unary_float(Log2)
instantiate_unary_float(Log10)
instantiate_unary_float(Log1p)
instantiate_unary_types(Negative)
instantiate_unary_float(Sigmoid)
instantiate_unary_float(Erf)
instantiate_unary_float(ErfInv)
instantiate_unary_types(Sign)
instantiate_unary_float(Sin)
instantiate_unary_float(Sinh)
instantiate_unary_types(Square)
instantiate_unary_float(Sqrt)
instantiate_unary_float(Rsqrt)
instantiate_unary_float(Tan)
instantiate_unary_float(Tanh)
instantiate_unary_float(Round)
instantiate_unary_int(BitwiseInvert)

// complex64 instantiations (base set only). Real and Imag map complex64 to
// float32.
instantiate_unary_base_same(Abs, complex64, complex64_t)
instantiate_unary_base_same(ArcCos, complex64, complex64_t)
instantiate_unary_base_same(ArcSin, complex64, complex64_t)
instantiate_unary_base_same(ArcTan, complex64, complex64_t)
instantiate_unary_base_same(Conjugate, complex64, complex64_t)
instantiate_unary_base_same(Cos, complex64, complex64_t)
instantiate_unary_base_same(Cosh, complex64, complex64_t)
instantiate_unary_base_same(Exp, complex64, complex64_t)
instantiate_unary_base_same(Log, complex64, complex64_t)
instantiate_unary_base_same(Log1p, complex64, complex64_t)
instantiate_unary_base_same(Log2, complex64, complex64_t)
instantiate_unary_base_same(Log10, complex64, complex64_t)
instantiate_unary_base_same(Negative, complex64, complex64_t)
instantiate_unary_base_same(Sign, complex64, complex64_t)
instantiate_unary_base_same(Sin, complex64, complex64_t)
instantiate_unary_base_same(Sinh, complex64, complex64_t)
instantiate_unary_base_same(Square, complex64, complex64_t)
instantiate_unary_base_same(Sqrt, complex64, complex64_t)
instantiate_unary_base_same(Rsqrt, complex64, complex64_t)
instantiate_unary_base_same(Tan, complex64, complex64_t)
instantiate_unary_base_same(Tanh, complex64, complex64_t)
instantiate_unary_base_same(Round, complex64, complex64_t)
instantiate_unary_base(Real, complex64, float32, complex64_t, float)
instantiate_unary_base(Imag, complex64, float32, complex64_t, float)

// LogicalNot is instantiated for bool only.
instantiate_unary_all_same(LogicalNot, bool_, bool)

// FP8 conversions: floating-point types to uint8 storage and back.
instantiate_unary_all(ToFP8, float16, uint8, float16_t, uint8_t)
instantiate_unary_all(ToFP8, bfloat16, uint8, bfloat16_t, uint8_t)
instantiate_unary_all(ToFP8, float32, uint8, float, uint8_t)
instantiate_unary_all(FromFP8, uint8, float16, uint8_t, float16_t)
instantiate_unary_all(FromFP8, uint8, bfloat16, uint8_t, bfloat16_t)
instantiate_unary_all(FromFP8, uint8, float32, uint8_t, float)

    // clang-format on


================================================
FILE: mlx/backend/metal/kernels/unary_ops.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <metal_integer>
#include <metal_math>

#include "mlx/backend/metal/kernels/cexpf.h"
#include "mlx/backend/metal/kernels/erf.h"
#include "mlx/backend/metal/kernels/expm1f.h"
#include "mlx/backend/metal/kernels/fp8.h"

namespace {
// Positive infinity as a float, local to each translation unit that includes
// this header.
constant float inf = metal::numeric_limits<float>::infinity();
}

// Absolute value.
struct Abs {
  // Generic case: defers to metal::abs.
  template <typename T>
  T operator()(T x) {
    return metal::abs(x);
  }

  // bool and unsigned integers are returned unchanged.
  bool operator()(bool x) {
    return x;
  }
  uint8_t operator()(uint8_t x) {
    return x;
  }
  uint16_t operator()(uint16_t x) {
    return x;
  }
  uint32_t operator()(uint32_t x) {
    return x;
  }
  uint64_t operator()(uint64_t x) {
    return x;
  }

  // Complex: the magnitude sqrt(re^2 + im^2) is stored in the real part and
  // the imaginary part is zero.
  complex64_t operator()(complex64_t x) {
    const auto magnitude =
        metal::precise::sqrt(x.real * x.real + x.imag * x.imag);
    return {magnitude, 0};
  }
};

// Inverse cosine.
struct ArcCos {
  // Real types: metal::precise::acos.
  template <typename T>
  T operator()(T x) {
    return metal::precise::acos(x);
  }

  // Complex overload; only declared here, the definition is not in this part
  // of the file.
  complex64_t operator()(complex64_t x);
};

// Inverse hyperbolic cosine, via metal::precise::acosh.
struct ArcCosh {
  template <typename T>
  T operator()(T x) {
    return metal::precise::acosh(x);
  }
};

// Inverse sine.
struct ArcSin {
  // Real types: metal::precise::asin.
  template <typename T>
  T operator()(T x) {
    return metal::precise::asin(x);
  }

  // Complex overload; only declared here, the definition is not in this part
  // of the file.
  complex64_t operator()(complex64_t x);
};

// Inverse hyperbolic sine, via metal::precise::asinh.
struct ArcSinh {
  template <typename T>
  T operator()(T x) {
    return metal::precise::asinh(x);
  }
};

// Inverse tangent.
struct ArcTan {
  // Real types: metal::precise::atan.
  template <typename T>
  T operator()(T x) {
    return metal::precise::atan(x);
  }

  // Complex overload; only declared here, the definition is not in this part
  // of the file.
  complex64_t operator()(complex64_t x);
};

// Inverse hyperbolic tangent, via metal::precise::atanh.
struct ArcTanh {
  template <typename T>
  T operator()(T x) {
    return metal::precise::atanh(x);
  }
};

// Bitwise complement: flips every bit of `x` and converts the result back
// to T.
struct BitwiseInvert {
  template <typename T>
  T operator()(T x) {
    const T flipped = ~x;
    return flipped;
  }
};

// Rounds up to the nearest integer.
struct Ceil {
  // Generic case: defers to metal::ceil.
  template <typename T>
  T operator()(T x) {
    return metal::ceil(x);
  }

  // bool and every integer type are already integral, so they are returned
  // unchanged.
  bool operator()(bool x) {
    return x;
  }
  int8_t operator()(int8_t x) {
    return x;
  }
  int16_t operator()(int16_t x) {
    return x;
  }
  int32_t operator()(int32_t x) {
    return x;
  }
  int64_t operator()(int64_t x) {
    return x;
  }
  uint8_t operator()(uint8_t x) {
    return x;
  }
  uint16_t operator()(uint16_t x) {
    return x;
  }
  uint32_t operator()(uint32_t x) {
    return x;
  }
  uint64_t operator()(uint64_t x) {
    return x;
  }
};

// Cosine.
struct Cos {
  // Real types: metal::precise::cos.
  template <typename T>
  T operator()(T x) {
    return metal::precise::cos(x);
  }

  // Complex: cos(a + ib) = cos(a) cosh(b) - i sin(a) sinh(b).
  complex64_t operator()(complex64_t x) {
    const auto re = metal::precise::cos(x.real) * metal::precise::cosh(x.imag);
    const auto im =
        -metal::precise::sin(x.real) * metal::precise::sinh(x.imag);
    return {re, im};
  }
};

// Hyperbolic cosine.
struct Cosh {
  // Real types: metal::precise::cosh.
  template <typename T>
  T operator()(T x) {
    return metal::precise::cosh(x);
  }

  // Complex: cosh(a + ib) = cosh(a) cos(b) + i sinh(a) sin(b).
  complex64_t operator()(complex64_t x) {
    const auto re = metal::precise::cosh(x.real) * metal::precise::cos(x.imag);
    const auto im = metal::precise::sinh(x.real) * metal::precise::sin(x.imag);
    return {re, im};
  }
};

// Complex conjugate: keeps the real part and negates the imaginary part.
struct Conjugate {
  complex64_t operator()(complex64_t x) {
    const auto flipped_imag = -x.imag;
    return complex64_t{x.real, flipped_imag};
  }
};

// Error function. The input is converted to float, passed to `erf`, and the
// result is converted back to T.
struct Erf {
  template <typename T>
  T operator()(T x) {
    const float as_float = static_cast<float>(x);
    return static_cast<T>(erf(as_float));
  }
};

// Inverse error function. The input is converted to float, passed to
// `erfinv`, and the result is converted back to T.
struct ErfInv {
  template <typename T>
  T operator()(T x) {
    const float as_float = static_cast<float>(x);
    return static_cast<T>(erfinv(as_float));
  }
};

// Exponential.
struct Exp {
  // Real types: metal::precise::exp.
  template <typename T>
  T operator()(T x) {
    return metal::precise::exp(x);
  }

  // Complex: delegates to cexpf.
  complex64_t operator()(complex64_t x) {
    return cexpf(x);
  }
};

// exp(x) - 1. The input is converted to float, passed to `expm1f`, and the
// result is converted back to T.
struct Expm1 {
  template <typename T>
  T operator()(T x) {
    const float as_float = static_cast<float>(x);
    return static_cast<T>(expm1f(as_float));
  }
};

// Rounds down to the nearest integer.
struct Floor {
  // Generic case: defers to metal::floor.
  template <typename T>
  T operator()(T x) {
    return metal::floor(x);
  }

  // bool and every integer type are already integral, so they are returned
  // unchanged.
  bool operator()(bool x) {
    return x;
  }
  int8_t operator()(int8_t x) {
    return x;
  }
  int16_t operator()(int16_t x) {
    return x;
  }
  int32_t operator()(int32_t x) {
    return x;
  }
  int64_t operator()(int64_t x) {
    return x;
  }
  uint8_t operator()(uint8_t x) {
    return x;
  }
  uint16_t operator()(uint16_t x) {
    return x;
  }
  uint32_t operator()(uint32_t x) {
    return x;
  }
  uint64_t operator()(uint64_t x) {
    return x;
  }
};

// Extracts the imaginary part of a complex value as a float.
struct Imag {
  float operator()(complex64_t x) {
    const float imaginary = x.imag;
    return imaginary;
  }
};

// Natural logarithm.
struct Log {
  // Real types: metal::precise::log.
  template <typename T>
  T operator()(T x) {
    return metal::precise::log(x);
  }

  // Complex: the real part is log|x| (magnitude taken from Abs) and the
  // imaginary part is atan2(im, re).
  complex64_t operator()(complex64_t x) {
    const auto log_magnitude = metal::precise::log(Abs{}(x).real);
    const auto phase = metal::precise::atan2(x.imag, x.real);
    return {log_magnitude, phase};
  }
};

// Base-2 logarithm.
struct Log2 {
  // Real types: metal::precise::log2.
  template <typename T>
  T operator()(T x) {
    return metal::precise::log2(x);
  }

  // Complex: the natural log from Log, with both components divided by
  // M_LN2_F.
  complex64_t operator()(complex64_t x) {
    const auto natural = Log{}(x);
    return {natural.real / M_LN2_F, natural.imag / M_LN2_F};
  }
};

// Base-10 logarithm.
struct Log10 {
  // Real types: metal::precise::log10.
  template <typename T>
  T operator()(T x) {
    return metal::precise::log10(x);
  }

  // Complex: the natural log from Log, with both components divided by
  // M_LN10_F.
  complex64_t operator()(complex64_t x) {
    const auto natural = Log{}(x);
    return {natural.real / M_LN10_F, natural.imag / M_LN10_F};
  }
};

// log(1 + x); forwards to the `log1p` overload found by unqualified lookup
// for T.
struct Log1p {
  template <typename T>
  T operator()(T x) {
    return log1p(x);
  }
};

// Element-wise logical negation; the boolean result is converted back to T.
struct LogicalNot {
  template <typename T>
  T operator()(T x) {
    return !x;
  }
};

// Element-wise arithmetic negation.
struct Negative {
  template <typename T>
  T operator()(T x) {
    return -x;
  }
};

// Extracts the real component of a complex value.
struct Real {
  float operator()(complex64_t x) {
    const float real_part = x.real;
    return real_part;
  }
};

// Element-wise rounding via metal::rint.
struct Round {
  template <typename T>
  T operator()(T x) {
    return metal::rint(x);
  }

  // Complex inputs are rounded component by component.
  complex64_t operator()(complex64_t x) {
    const auto rounded_real = metal::rint(x.real);
    const auto rounded_imag = metal::rint(x.imag);
    return {rounded_real, rounded_imag};
  }
};

// Element-wise logistic sigmoid.
// Computes t = 1 / (1 + exp(|x|)), which equals sigmoid(-|x|); negative
// inputs return t directly and non-negative inputs return 1 - t.
struct Sigmoid {
  template <typename T>
  T operator()(T x) {
    const auto tail = 1 / (1 + metal::exp(metal::abs(x)));
    if (x < 0) {
      return tail;
    }
    return 1 - tail;
  }
};

// Element-wise sign.
struct Sign {
  // Generic path: (x > 0) - (x < 0), i.e. -1, 0 or +1 converted to T.
  template <typename T>
  T operator()(T x) {
    return (x > T(0)) - (x < T(0));
  }

  // uint32_t: 1 for any non-zero value, 0 otherwise.
  uint32_t operator()(uint32_t x) {
    return x != 0;
  }

  // Complex: zero is returned unchanged; otherwise x is divided by
  // sqrt(real^2 + imag^2).
  complex64_t operator()(complex64_t x) {
    if (x == complex64_t(0)) {
      return x;
    }
    const auto squared_norm = x.real * x.real + x.imag * x.imag;
    return x / (complex64_t)metal::precise::sqrt(squared_norm);
  }
};

// Element-wise sine.
struct Sin {
  template <typename T>
  T operator()(T x) {
    return metal::precise::sin(x);
  }

  // sin(a + ib) = sin(a) cosh(b) + i cos(a) sinh(b)
  complex64_t operator()(complex64_t x) {
    const auto real_part =
        metal::precise::sin(x.real) * metal::precise::cosh(x.imag);
    const auto imag_part =
        metal::precise::cos(x.real) * metal::precise::sinh(x.imag);
    return {real_part, imag_part};
  }
};

// Element-wise hyperbolic sine.
struct Sinh {
  template <typename T>
  T operator()(T x) {
    return metal::precise::sinh(x);
  }

  // sinh(a + ib) = sinh(a) cos(b) + i cosh(a) sin(b)
  complex64_t operator()(complex64_t x) {
    const auto real_part =
        metal::precise::sinh(x.real) * metal::precise::cos(x.imag);
    const auto imag_part =
        metal::precise::cosh(x.real) * metal::precise::sin(x.imag);
    return {real_part, imag_part};
  }
};

// Element-wise square: multiplies the input by itself.
struct Square {
  template <typename T>
  T operator()(T x) {
    return x * x;
  }
};

// Element-wise square root.
struct Sqrt {
  template <typename T>
  T operator()(T x) {
    return metal::precise::sqrt(x);
  }

  // Complex square root. With r taken from the real component of Abs(x):
  //   real = sqrt((r + x.real) / 2)
  //   imag = sqrt((r - x.real) / 2), carrying the sign of x.imag
  // An exactly-zero input short-circuits to {0, 0}.
  complex64_t operator()(complex64_t x) {
    if (x.real == 0.0 && x.imag == 0.0) {
      return {0.0, 0.0};
    }
    const auto magnitude = Abs{}(x).real;
    const auto real_part = metal::precise::sqrt((magnitude + x.real) / 2.0);
    const auto imag_magnitude =
        metal::precise::sqrt((magnitude - x.real) / 2.0);
    return {real_part, metal::copysign(imag_magnitude, x.imag)};
  }
};

// Element-wise reciprocal square root.
struct Rsqrt {
  template <typename T>
  T operator()(T x) {
    return metal::precise::rsqrt(x);
  }

  // Complex inputs: reciprocal of the complex Sqrt functor's result.
  complex64_t operator()(complex64_t x) {
    const complex64_t root = Sqrt{}(x);
    return 1.0 / root;
  }
};

// Element-wise tangent.
struct Tan {
  template <typename T>
  T operator()(T x) {
    return metal::precise::tan(x);
  }

  // tan(a + ib) = (tan a + i tanh b) / (1 - i tan a tanh b); the quotient is
  // expanded by multiplying through with the conjugate of the denominator.
  complex64_t operator()(complex64_t x) {
    float tan_re = metal::precise::tan(x.real);
    float tanh_im = metal::precise::tanh(x.imag);
    float cross = tan_re * tanh_im;
    float denom = 1. + cross * cross;
    return {
        (tan_re - tanh_im * cross) / denom,
        (tanh_im + tan_re * cross) / denom};
  }
};

// Element-wise hyperbolic tangent.
struct Tanh {
  template <typename T>
  T operator()(T x) {
    return metal::precise::tanh(x);
  }

  // tanh(a + ib) = (tanh a + i tan b) / (1 + i tanh a tan b); the quotient is
  // expanded by multiplying through with the conjugate of the denominator.
  complex64_t operator()(complex64_t x) {
    float tanh_re = metal::precise::tanh(x.real);
    float tan_im = metal::precise::tan(x.imag);
    float cross = tanh_re * tan_im;
    float denom = 1. + cross * cross;
    return {
        (tanh_re + tan_im * cross) / denom,
        (tan_im - tanh_re * cross) / denom};
  }
};

// Complex arccos: acos(x) = -i * log(x + i * sqrt(1 - x^2)).
// Multiplying w = log(...) by -i yields {w.imag, -w.real}.
complex64_t ArcCos::operator()(complex64_t x) {
  const auto imag_unit = complex64_t{0.0, 1.0};
  const auto w = Log{}(x + imag_unit * Sqrt{}(1.0 - x * x));
  return {w.imag, -w.real};
}

// Complex arcsin: asin(x) = -i * log(i * x + sqrt(1 - x^2)).
// Multiplying w = log(...) by -i yields {w.imag, -w.real}.
complex64_t ArcSin::operator()(complex64_t x) {
  const auto imag_unit = complex64_t{0.0, 1.0};
  const auto w = Log{}(imag_unit * x + Sqrt{}(1.0 - x * x));
  return {w.imag, -w.real};
}

// Complex arctan: atan(x) = (1 / 2i) * log((1 + i x) / (1 - i x)).
complex64_t ArcTan::operator()(complex64_t x) {
  const auto imag_unit = complex64_t{0.0, 1.0};
  const auto ix = imag_unit * x;
  const auto ratio = (1.0 + ix) / (1.0 - ix);
  return (1.0 / complex64_t{0.0, 2.0}) * Log{}(ratio);
}

struct ToFP8 {
  template <typename T>
  uint8_t operator()(T f) {
    return fp8_e4m3(f).bits;
  }
};

struct FromFP8 {
  float operator()(uint8_t x) {
    return float(*(thread fp8_e4m3*)(&x));
  }
};


================================================
FILE: mlx/backend/metal/kernels/utils.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <metal_math>

#include "mlx/backend/metal/kernels/bf16.h"
#include "mlx/backend/metal/kernels/bf16_math.h"
#include "mlx/backend/metal/kernels/complex.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/logging.h"

typedef half float16_t;

// Work per thread values for different types. The values here are expected to
// match get_work_per_thread in mlx/backend/metal/utils.h
//
// `n` is 8 / sizeof(U), i.e. the number of U elements that fit in 8 bytes.
// Types wider than 8 bytes are rejected at compile time.
template <typename U>
struct WorkPerThread {
  static_assert(sizeof(U) <= 8, "Type too large");
  static constexpr int constant n = 8 / sizeof(U);
};

///////////////////////////////////////////////////////////////////////////////
// Type limits utils
///////////////////////////////////////////////////////////////////////////////

// Primary template for per-type limits. All four constants are taken
// directly from metal::numeric_limits<U>: `max`/`finite_max` from max() and
// `min`/`finite_min` from min(). Explicit specializations for specific types
// follow in this file.
template <typename U>
struct Limits {
  static const constant U max = metal::numeric_limits<U>::max();
  static const constant U min = metal::numeric_limits<U>::min();
  static const constant U finite_max = metal::numeric_limits<U>::max();
  static const constant U finite_min = metal::numeric_limits<U>::min();
};

// Emits a Limits<type> specialization whose `max`/`finite_max` are
// numeric_limits<type>::max() and whose `min`/`finite_min` are
// numeric_limits<type>::min(). Applied below to the fixed-width integer
// types.
#define instantiate_default_limit(type)                                      \
  template <>                                                                \
  struct Limits<type> {                                                      \
    static constexpr constant type max = metal::numeric_limits<type>::max(); \
    static constexpr constant type min = metal::numeric_limits<type>::min(); \
    static constexpr constant type finite_max =                              \
        metal::numeric_limits<type>::max();                                  \
    static constexpr constant type finite_min =                              \
        metal::numeric_limits<type>::min();                                  \
  };

instantiate_default_limit(uint8_t);
instantiate_default_limit(uint16_t);
instantiate_default_limit(uint32_t);
instantiate_default_limit(uint64_t);
instantiate_default_limit(int8_t);
instantiate_default_limit(int16_t);
instantiate_default_limit(int32_t);
instantiate_default_limit(int64_t);

// Emits a Limits<type> specialization for floating-point types:
//   max / min               -> +infinity / -infinity
//   finite_max / finite_min -> +numeric_limits::max() / -numeric_limits::max()
// Applied below to half, float and bfloat16_t.
#define instantiate_float_limit(type)             \
  template <>                                     \
  struct Limits<type> {                           \
    static constexpr constant type max =          \
        metal::numeric_limits<type>::infinity();  \
    static constexpr constant type min =          \
        -metal::numeric_limits<type>::infinity(); \
    static constexpr constant type finite_max =   \
        metal::numeric_limits<type>::max();       \
    static constexpr constant type finite_min =   \
        -metal::numeric_limits<type>::max();      \
  };

instantiate_float_limit(half);
instantiate_float_limit(float);
instantiate_float_limit(bfloat16_t);

// Limits for bool: max is true, min is false. This specialization does not
// define finite_max / finite_min.
template <>
struct Limits<bool> {
  static constexpr constant bool max = true;
  static constexpr constant bool min = false;
};

// Limits for complex64_t: max has both components at +infinity and min has
// both components at -infinity. This specialization does not define
// finite_max / finite_min.
template <>
struct Limits<complex64_t> {
  static constexpr constant complex64_t max = complex64_t(
      metal::numeric_limits<float>::infinity(),
      metal::numeric_limits<float>::infinity());
  static constexpr constant complex64_t min = complex64_t(
      -metal::numeric_limits<float>::infinity(),
      -metal::numeric_limits<float>::infinity());
};

///////////////////////////////////////////////////////////////////////////////
// Indexing utils
///////////////////////////////////////////////////////////////////////////////

// Expands to `#pragma clang loop unroll(full)`; place it directly before a
// loop to request full unrolling.
#define MLX_MTL_PRAGMA_UNROLL _Pragma("clang loop unroll(full)")

///////////////////////////////////////////////////////////////////////////////
// Single Array with generic dims

// Converts a linear element index into a strided offset.
// Walks the axes from last to first: the coordinate along each axis is
// `elem % shape[axis]`, which is multiplied by that axis' stride and
// accumulated. Stops early once the remaining index reaches zero.
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc(
    IdxT elem,
    constant const int* shape,
    constant const int64_t* strides,
    int ndim) {
  IdxT offset = 0;
  int axis = ndim - 1;
  while (axis >= 0 && elem > 0) {
    offset += (elem % shape[axis]) * IdxT(strides[axis]);
    elem /= shape[axis];
    --axis;
  }
  return offset;
}

// Overload taking a 3-component index, for an arbitrary number of dims.
// elem.x indexes the last axis and elem.y the second-to-last; elem.z is a
// linear index over the remaining leading axes and is decomposed with
// modulo / divide against `shape`.
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc(
    uint3 elem,
    constant const int* shape,
    constant const int64_t* strides,
    int ndim) {
  IdxT offset =
      elem.x * IdxT(strides[ndim - 1]) + elem.y * IdxT(strides[ndim - 2]);
  int axis = ndim - 3;
  while (axis >= 0) {
    offset += (elem.z % shape[axis]) * IdxT(strides[axis]);
    elem.z /= shape[axis];
    --axis;
  }
  return offset;
}

///////////////////////////////////////////////////////////////////////////////
// Single Array with fixed N dims

// 1-D case: offset is the element index times the single stride.
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const int64_t& stride) {
  const IdxT step = IdxT(stride);
  return elem * step;
}

// 2-D case: elem.x pairs with strides[1] and elem.y with strides[0].
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const int64_t strides[2]) {
  const IdxT stride_of_x = IdxT(strides[1]);
  const IdxT stride_of_y = IdxT(strides[0]);
  return elem.x * stride_of_x + elem.y * stride_of_y;
}

// 3-D case: elem.x pairs with strides[2], elem.y with strides[1] and elem.z
// with strides[0].
template <typename IdxT = int64_t>
METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const int64_t strides[3]) {
  const IdxT stride_of_x = IdxT(strides[2]);
  const IdxT stride_of_y = IdxT(strides[1]);
  const IdxT stride_of_z = IdxT(strides[0]);
  return elem.x * stride_of_x + elem.y * stride_of_y + elem.z * stride_of_z;
}

///////////////////////////////////////////////////////////////////////////////
// Multiple Arrays with generic dims

// Computes strided offsets for two arrays that share `shape` but have
// separate strides. elem.x / elem.y index the last two axes; elem.z is a
// linear index over the leading axes. Result: .x is the offset into `a`,
// .y the offset into `b`.
template <typename IdxT = int64_t>
METAL_FUNC vec<IdxT, 2> elem_to_loc_2_nd(
    uint3 elem,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    int ndim) {
  const int last = ndim - 1;
  const int second_last = ndim - 2;
  vec<IdxT, 2> offsets = {
      IdxT(
          elem.x * IdxT(a_strides[last]) +
          IdxT(elem.y) * IdxT(a_strides[second_last])),
      IdxT(
          elem.x * IdxT(b_strides[last]) +
          elem.y * IdxT(b_strides[second_last]))};
  // Peel the leading axes off elem.z, innermost first.
  for (int axis = ndim - 3; axis >= 0; --axis) {
    uint coord = elem.z % shape[axis];
    offsets.x += coord * IdxT(a_strides[axis]);
    offsets.y += coord * IdxT(b_strides[axis]);
    elem.z /= shape[axis];
  }
  return offsets;
}

// Computes strided offsets for three arrays that share `shape` but have
// separate strides. elem.x / elem.y index the last two axes; elem.z is a
// linear index over the leading axes. Result: .x, .y and .z are the offsets
// into `a`, `b` and `c` respectively.
template <typename IdxT = int64_t>
METAL_FUNC vec<IdxT, 3> elem_to_loc_3_nd(
    uint3 elem,
    constant const int* shape,
    constant const int64_t* a_strides,
    constant const int64_t* b_strides,
    constant const int64_t* c_strides,
    int ndim) {
  const int last = ndim - 1;
  const int second_last = ndim - 2;
  vec<IdxT, 3> offsets = {
      IdxT(elem.x * IdxT(a_strides[last])) +
          IdxT(elem.y * IdxT(a_strides[second_last])),
      IdxT(elem.x * IdxT(b_strides[last])) +
          IdxT(elem.y * IdxT(b_strides[second_last])),
      IdxT(elem.x * IdxT(c_strides[last])) +
          IdxT(elem.y * IdxT(c_strides[second_last]))};
  // Peel the leading axes off elem.z, innermost first.
  for (int axis = ndim - 3; axis >= 0; --axis) {
    uint coord = elem.z % shape[axis];
    offsets.x += coord * IdxT(a_strides[axis]);
    offsets.y += coord * IdxT(b_strides[axis]);
    offsets.z += coord * IdxT(c_strides[axis]);
    elem.z /= shape[axis];
  }
  return offsets;
}

///////////////////////////////////////////////////////////////////////////////
// Elem to loc in a loop utils
///////////////////////////////////////////////////////////////////////////////

// Incremental element-to-offset iterator.
//
// Each level tracks a position (`index`) along axis `dim - 1` and holds an
// `inner_looper` of type LoopedElemToLoc<DIM - 1, ...> constructed with
// `dim - 1`, which handles the preceding axes. `offset` is the current
// strided offset and is what location() returns.
template <int DIM, typename OffsetT = size_t, bool General = true>
struct LoopedElemToLoc {
  int dim;
  LoopedElemToLoc<DIM - 1, OffsetT, General> inner_looper;
  OffsetT offset{0};
  int index{0};

  LoopedElemToLoc(int dim) : dim(dim), inner_looper(dim - 1) {}

  // Advance by one element. No-op when dim == 0.
  void next(const constant int* shape, const constant int64_t* strides) {
    if (dim == 0) {
      return;
    }
    index++;
    offset += OffsetT(strides[dim - 1]);
    // Ran past the end of this axis: wrap to 0, advance the inner looper by
    // one and restart from its offset.
    if (index >= shape[dim - 1]) {
      index = 0;
      inner_looper.next(shape, strides);
      offset = inner_looper.offset;
    }
  }

  // Advance by n elements. No-op when dim == 0.
  void next(int n, const constant int* shape, const constant int64_t* strides) {
    if (dim == 0) {
      return;
    }
    index += n;
    offset += n * OffsetT(strides[dim - 1]);

    if (index >= shape[dim - 1]) {
      // `extra` is how far the index overshot the axis extent.
      int extra = index - shape[dim - 1];
      if (extra >= shape[dim - 1]) {
        // Overshoot covers one or more additional full passes over this
        // axis: advance the inner looper by all of them at once and keep
        // only the remainder.
        inner_looper.next(1 + extra / shape[dim - 1], shape, strides);
        extra = extra % shape[dim - 1];
      } else {
        inner_looper.next(shape, strides);
      }
      index = 0;
      offset = inner_looper.offset;
      // Re-apply the leftover steps along this axis.
      if (extra > 0) {
        next(extra, shape, strides);
      }
    }
  }

  // Current strided offset.
  OffsetT location() {
    return offset;
  }
};

// DIM == 1 specialization for the general case. Keeps a linear `index`;
// when more than one runtime dimension is in play (dim > 1) the offset is
// recomputed from scratch with elem_to_loc, otherwise only strides[0] is
// used.
template <typename OffsetT>
struct LoopedElemToLoc<1, OffsetT, true> {
  int dim;
  OffsetT offset{0};
  uint index{0};

  LoopedElemToLoc(int dim) : dim(dim) {}

  // Advance by one element.
  void next(const constant int* shape, const constant int64_t* strides) {
    ++index;
    if (dim <= 1) {
      offset += OffsetT(strides[0]);
      return;
    }
    offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
  }

  // Advance by n elements.
  void next(int n, const constant int* shape, const constant int64_t* strides) {
    index += n;
    if (dim <= 1) {
      offset = index * OffsetT(strides[0]);
      return;
    }
    offset = elem_to_loc<OffsetT>(index, shape, strides, dim);
  }

  // Current strided offset.
  OffsetT location() {
    return offset;
  }
};

// DIM == 1 specialization for the non-general case: the shape argument is
// ignored and the offset simply accumulates multiples of strides[0].
template <typename OffsetT>
struct LoopedElemToLoc<1, OffsetT, false> {
  OffsetT offset{0};

  LoopedElemToLoc(int) {}

  // Advance by one element.
  void next(const constant int*, const constant int64_t* strides) {
    const OffsetT step = OffsetT(strides[0]);
    offset += step;
  }

  // Advance by n elements.
  void next(int n, const constant int*, const constant int64_t* strides) {
    const OffsetT step = OffsetT(strides[0]);
    offset += n * step;
  }

  // Current strided offset.
  OffsetT location() {
    return offset;
  }
};

///////////////////////////////////////////////////////////////////////////////
// Calculation utils
///////////////////////////////////////////////////////////////////////////////

/**
 * Integer ceiling division: adds (M - 1) to N before dividing by M. The
 * quotient is converted to T on return.
 */
template <typename T, typename U>
inline T ceildiv(T N, U M) {
  const auto padded = N + M - 1;
  return padded / M;
}

// log(1 + x) for float, using the correction described in
// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202
//
// - If 1 + x equals Limits<float>::max, that value is returned.
// - If 1 + x rounds to exactly 1, x itself is returned.
// - Otherwise the result is x * log(1 + x) / ((1 + x) - 1).
inline float log1p(float x) {
  const float one_plus_x = 1.0f + x;
  if (one_plus_x == Limits<float>::max) {
    return Limits<float>::max;
  }
  if (one_plus_x == 1.0f) {
    return x;
  }

  const float ratio = metal::log(one_plus_x) / (one_plus_x - 1.0f);
  return x * ratio;
}

// log(1 + x) for bfloat16_t. The sum 1 + x is formed in float and the same
// special cases as the float overload apply; the result is converted back to
// bfloat16_t.
inline bfloat16_t log1p(bfloat16_t x) {
  const float one_plus_x = 1.0f + static_cast<float>(x);
  if (one_plus_x == Limits<float>::max) {
    return Limits<bfloat16_t>::max;
  }
  if (one_plus_x == 1.0f) {
    return x;
  }

  const float ratio = metal::log(one_plus_x) / (one_plus_x - 1.0f);
  return bfloat16_t(x * ratio);
}

// log(1 + z) for complex z = x + iy.
// The imaginary part is always atan2(y, x + 1). For |z| < 0.5 the real part
// is 0.5 * log1p(x * (2 + x) + y^2); otherwise it is log(|1 + z|).
inline complex64_t log1p(complex64_t in) {
  float x = in.real;
  float y = in.imag;
  float zabs = metal::precise::sqrt(x * x + y * y);
  float theta = metal::atan2(y, x + 1);

  if (!(zabs < 0.5f)) {
    auto shifted_abs = metal::sqrt((x + 1) * (x + 1) + y * y);
    return {metal::log(shifted_abs), theta};
  }

  float r = x * (2 + x) + y * y;
  if (r == 0) { // handle underflow
    return {x, theta};
  }
  return {0.5f * log1p(r), theta};
}

///////////////////////////////////////////////////////////////////////////////
// SIMD shuffle ops
///////////////////////////////////////////////////////////////////////////////

inline uint64_t simd_shuffle_down(uint64_t data, uint16_t delta) {
  return as_type<uint64_t>(
      metal::simd_shuffle_down(as_type<uint2>(data), delta));
}

inline int64_t simd_shuffle_down(int64_t data, uint16_t delta) {
  return as_type<int64_t>(
      metal::simd_shuffle_down(as_type<uint2>(data), delta));
}

inline bool simd_shuffle_down(bool data, uint16_t delta) {
  return simd_shuffle_down(static_cast<uint32_t>(data), delta);
}

inline complex64_t simd_shuffle_down(complex64_t data, uint16_t delta) {
  return complex64_t(
      simd_shuffle_down(data.real, delta), simd_shuffle_down(data.imag, delta));
}

inline uint64_t simd_shuffle_up(uint64_t data, uint16_t delta) {
  return as_type<uint64_t>(metal::simd_shuffle_up(as_type<uint2>(data), delta));
}

inline int64_t simd_shuffle_up(int64_t data, uint16_t delta) {
  return as_type<int64_t>(metal::simd_shuffle_up(as_type<uint2>(data), delta));
}

inline bool simd_shuffle_up(bool data, uint16_t delta) {
  return simd_shuffle_up(static_cast<uint32_t>(data), delta);
}

inline complex64_t simd_shuffle_up(complex64_t data, uint16_t delta) {
  return complex64_t(
      simd_shuffle_up(data.real, delta), simd_shuffle_up(data.imag, delta));
}

inline uint64_t
simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta) {
  return as_type<uint64_t>(metal::simd_shuffle_and_fill_up(
      as_type<uint2>(data), as_type<uint2>(filling), delta));
}

inline int64_t
simd_shuffle_and_fill_up(int64_t data, int64_t filling, uint16_t delta) {
  return as_type<int64_t>(metal::simd_shuffle_and_fill_up(
      as_type<uint2>(data), as_type<uint2>(filling), delta));
}

inline bool simd_shuffle_and_fill_up(bool data, bool filling, uint16_t delta) {
  return simd_shuffle_and_fill_up(
      static_cast<uint32_t>(data), static_cast<uint32_t>(filling), delta);
}

inline complex64_t simd_shuffle_and_fill_up(
    complex64_t data,
    complex64_t filling,
    uint16_t delta) {
  return complex64_t(
      simd_shuffle_and_fill_up(data.real, filling.real, delta),
      simd_shuffle_and_fill_up(data.imag, filling.imag, delta));
}

inline uint64_t simd_shuffle(uint64_t data, uint16_t lane) {
  return as_type<uint64_t>(metal::simd_shuffle(as_type<uint2>(data), lane));
}

inline int64_t simd_shuffle(int64_t data, uint16_t lane) {
  return as_type<int64_t>(metal::simd_shuffle(as_type<uint2>(data), lane));
}

inline bool simd_shuffle(bool data, uint16_t lane) {
  return simd_shuffle(static_cast<uint32_t>(data), lane);
}

inline complex64_t simd_shuffle(complex64_t data, uint16_t lane) {
  return complex64_t(
      simd_shuffle(data.real, lane), simd_shuffle(data.imag, lane));
}

// Stand-in for std::conditional, which is not included with Metal.
// ConditionalType<Cond, TrueT, FalseT>::type is TrueT when Cond is true and
// FalseT otherwise.

// Primary template: covers the false case.
template <bool Cond, typename TrueT, typename FalseT>
struct ConditionalType {
  using type = FalseT;
};

// Partial specialization: covers the true case.
template <typename TrueT, typename FalseT>
struct ConditionalType<true, TrueT, FalseT> {
  using type = TrueT;
};


================================================
FILE: mlx/backend/metal/kernels.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <fmt/format.h>

#include "mlx/array.h"
#include "mlx/backend/metal/device.h"

namespace mlx::core {

// Pipeline-state lookups for arange, element-wise (unary / binary / ternary)
// and copy kernels. Every function takes the Metal device and the kernel's
// name and returns a `MTL::ComputePipelineState*`.
// NOTE(review): the definitions are not in this header; how the dtype, `op`
// and array arguments are used to select or build the kernel should be
// confirmed against the implementation.

MTL::ComputePipelineState* get_arange_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out);

MTL::ComputePipelineState* get_unary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
    const char* op);

MTL::ComputePipelineState* get_binary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
    const char* op);

MTL::ComputePipelineState* get_binary_two_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype in_type,
    Dtype out_type,
    const char* op);

MTL::ComputePipelineState* get_ternary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype type,
    const char* op);

MTL::ComputePipelineState* get_copy_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out);

MTL::ComputePipelineState* get_dynamic_copy_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out);

// Pipeline-state lookups for softmax, logsumexp, scan, sort and reduction
// kernels. Every function takes the Metal device and the kernel's name and
// returns a `MTL::ComputePipelineState*`.
// NOTE(review): the definitions are not in this header; the meaning of the
// block-size style parameters (`bn`, `tn`, `bm`) is presumed from their names
// only and should be confirmed.

MTL::ComputePipelineState* get_softmax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    bool precise,
    const array& out);

MTL::ComputePipelineState* get_logsumexp_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out);

MTL::ComputePipelineState* get_scan_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    bool reverse,
    bool inclusive,
    const std::string& reduce_type,
    const array& in,
    const array& out);

MTL::ComputePipelineState* get_sort_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out,
    int bn,
    int tn);

MTL::ComputePipelineState* get_mb_sort_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& idx,
    int bn,
    int tn);

MTL::ComputePipelineState* get_reduce_init_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& func_name,
    const std::string& op_name,
    const Dtype& out_type);

// `ndim`, `bm` and `bn` default to -1.
MTL::ComputePipelineState* get_reduce_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& func_name,
    const std::string& op_name,
    const Dtype& in_type,
    const Dtype& out_type,
    const std::string& idx_t,
    int ndim = -1,
    int bm = -1,
    int bn = -1);

// Pipeline-state lookups for the "steel" GEMM kernel family (fused, split-K,
// split-K accumulate, masked, gather and segmented variants). Every function
// takes the Metal device and the kernel's name and returns a
// `MTL::ComputePipelineState*`; several also take a `hash_name` together
// with a list of Metal function constants.
// NOTE(review): `bm`/`bn`/`bk` and `wm`/`wn` are presumably tile and
// per-threadgroup warp dimensions — confirm against the implementation.

MTL::ComputePipelineState* get_steel_gemm_fused_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn);

MTL::ComputePipelineState* get_steel_gemm_splitk_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool mn_aligned,
    bool k_aligned);

MTL::ComputePipelineState* get_steel_gemm_splitk_accum_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& in,
    const array& out,
    bool axbpy);

MTL::ComputePipelineState* get_steel_gemm_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    const std::optional<array>& mask_out,
    const std::optional<array>& mask_op,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool mn_aligned,
    bool k_aligned);

MTL::ComputePipelineState* get_steel_gemm_gather_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool rhs);

MTL::ComputePipelineState* get_steel_gemm_segmented_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn);

// Pipeline-state lookups for convolution, masked GEMV, FFT and quantized
// kernels. Every function takes the Metal device and the kernel's name and
// returns a `MTL::ComputePipelineState*`.
// NOTE(review): the definitions are not in this header; `template_def` is
// presumably a template instantiation string such as the one produced by
// get_template_definition — confirm against the implementation.

MTL::ComputePipelineState* get_steel_conv_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    int n_channel_specialization,
    bool small_filter);

MTL::ComputePipelineState* get_steel_conv_3d_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool small_filter);

MTL::ComputePipelineState* get_gemv_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array& out,
    const std::optional<array>& mask_out,
    const std::optional<array>& mask_op,
    bool transpose_mat,
    int bm,
    int bn,
    int sm,
    int sn,
    int tm,
    int tn,
    bool contiguous);

MTL::ComputePipelineState* get_steel_conv_general_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn);

MTL::ComputePipelineState* get_fft_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const std::string& template_def);

MTL::ComputePipelineState* get_quantized_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& template_def,
    const std::string& mode);

// Pipeline-state lookups for gather-QMM, the "_nax" GEMM / QMM variants and
// the steel attention kernels. Every function takes the Metal device and the
// kernel's name and returns a `MTL::ComputePipelineState*`.
// NOTE(review): the definitions are not in this header; what distinguishes
// the "_nax" variants from their non-nax counterparts cannot be determined
// from these declarations — confirm against the implementation.

MTL::ComputePipelineState* get_gather_qmm_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& x,
    int group_size,
    int bits,
    const std::string& mode,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool transpose);

MTL::ComputePipelineState* get_steel_gemm_fused_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn);

MTL::ComputePipelineState* get_steel_gemm_gather_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool rhs);

MTL::ComputePipelineState* get_steel_gemm_splitk_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& out,
    bool transpose_a,
    bool transpose_b,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn);

MTL::ComputePipelineState* get_qmm_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& template_def,
    const std::string& mode);

MTL::ComputePipelineState* get_gather_qmm_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& x,
    int group_size,
    int bits,
    const std::string& mode,
    int bm,
    int bn,
    int bk,
    int wm,
    int wn,
    bool transpose);

MTL::ComputePipelineState* get_steel_attention_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& q,
    int bq,
    int bk,
    int bd,
    int wm,
    int wn,
    const array& m);

MTL::ComputePipelineState* get_steel_attention_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array& q,
    int bq,
    int bk,
    int bd,
    int wm,
    int wn,
    const array& m);

// Create a GPU kernel template definition for JIT compilation.
//
// Builds the string `func<arg0, arg1, ...>` by streaming each argument and
// wraps it in an explicit instantiation statement carrying
// `[[host_name("name")]]`. The returned text starts and ends with a newline.
template <typename... Args>
std::string get_template_definition(
    std::string_view name,
    std::string_view func,
    Args... args) {
  std::ostringstream instantiation;
  instantiation << func << "<";

  // Nothing is emitted before the first argument; ", " before each later one.
  bool needs_separator = false;
  auto append_arg = [&instantiation, &needs_separator](const auto& arg) {
    if (needs_separator) {
      instantiation << ", ";
    }
    instantiation << arg;
    needs_separator = true;
  };
  (append_arg(args), ...);

  instantiation << ">";
  return fmt::format(
      "\ntemplate [[host_name(\"{0}\")]] [[kernel]] decltype({1}) {1};\n",
      name,
      instantiation.str());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/logsumexp.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Largest reduction-axis size handled by the "block_" logsumexp kernel;
// larger axes select the "looped_" kernel in LogSumExp::eval_gpu.
constexpr int LOGSUMEXP_LOOPED_LIMIT = 4096;

// GPU evaluation of logsumexp over the last axis of the single input.
//
// Steps:
//   1. Reject non-floating output dtypes.
//   2. Copy the input to a contiguous buffer unless it is already contiguous
//      with unit stride on the last axis.
//   3. Allocate `out`: plain allocation when the input is row contiguous,
//      otherwise reuse the input's strides divided by the axis size.
//   4. Pick the "block_" or "looped_" kernel from the axis size and dispatch
//      one threadgroup per row.
void LogSumExp::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  if (!issubdtype(out.dtype(), floating)) {
    throw std::runtime_error(
        "[logsumexp] Does not support non-floating point types.");
  }
  auto& s = stream();
  auto& d = metal::device(s.device);

  // Make sure that the last dimension is contiguous
  auto ensure_contiguous = [&s, &d](const array& x) {
    const bool last_dim_dense =
        x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (last_dim_dense) {
      return x;
    }
    array x_copy = contiguous_copy_gpu(x, s);
    d.add_temporary(x_copy, s.index);
    return x_copy;
  };

  auto in = ensure_contiguous(inputs[0]);
  if (!in.flags().row_contiguous) {
    // Derive the output layout from the input's by dividing every stride by
    // the size of the reduced axis.
    auto n = in.shape(-1);
    auto flags = in.flags();
    auto strides = in.strides();
    for (auto& stride : strides) {
      stride /= n;
    }
    bool col_contig = strides[0] == 1;
    for (int i = 1; col_contig && i < strides.size(); ++i) {
      col_contig =
          (out.shape(i) == 1 || strides[i - 1] == out.shape(i) * strides[i]);
    }
    flags.col_contiguous = col_contig;
    out.set_data(
        allocator::malloc(in.nbytes() / n),
        in.data_size() / n,
        std::move(strides),
        flags);
  } else {
    out.set_data(allocator::malloc(out.nbytes()));
  }

  int axis_size = in.shape().back();
  int n_rows = in.data_size() / axis_size;

  const int simd_size = 32;
  const int n_reads = 4;
  const int looped_limit = LOGSUMEXP_LOOPED_LIMIT;
  const bool use_looped = axis_size > looped_limit;

  std::string kernel_name = use_looped ? "looped_" : "block_";
  kernel_name += "logsumexp_";
  kernel_name += type_to_name(out);

  auto kernel = get_logsumexp_kernel(d, kernel_name, out);
  auto& compute_encoder = d.get_command_encoder(s.index);

  size_t threadgroup_size;
  if (use_looped) {
    threadgroup_size = kernel->maxTotalThreadsPerThreadgroup();
  } else {
    // Each thread covers n_reads elements; round the thread count up to a
    // whole number of SIMD groups.
    size_t threadgroup_needed = (axis_size + n_reads - 1) / n_reads;
    size_t simds_needed = (threadgroup_needed + simd_size - 1) / simd_size;
    threadgroup_size = simd_size * simds_needed;
    assert(threadgroup_size <= kernel->maxTotalThreadsPerThreadgroup());
  }
  size_t n_threads = n_rows * threadgroup_size;
  MTL::Size grid_dims = MTL::Size(n_threads, 1, 1);
  MTL::Size group_dims = MTL::Size(threadgroup_size, 1, 1);

  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  compute_encoder.set_bytes(axis_size, 2);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/make_compiled_preamble.sh
================================================
#!/bin/bash
#
# This script generates a C++ function that provides the Metal source code
# at runtime for use with kernel generation.
#
# The steps executed are as follows 
# - Take as input a metal header file in the mlx metal backend 
# - Use the metal compiler to expand the dependency headers 
# - Sort the headers in order of inclusion 
# - Expand the headers in order of inclusion 
# - Export the generated source code content as a C++ function
#
# Doing the expansion this way allows us to retain macros, comments, and
# formatting in the expanded source. This improves readability for users and
# also enables use of the metal macros in the source code, which can then be
# handled by the metal runtime compiler.
#
# Copyright © 2023-25 Apple Inc.

# Positional arguments:
#   $1 OUTPUT_DIR - directory that receives the generated .cpp file
#   $2 CC         - stored but not referenced below; the compiler command is
#                   the hard-coded CCC
#   $3 SRC_DIR    - repository root used to resolve header paths
#   $4 SRC_FILE   - header path (without the .h suffix) relative to
#                   mlx/backend/metal/kernels
#   $5 CFLAGS     - extra flags passed to the metal compiler
OUTPUT_DIR=$1
CC=$2
SRC_DIR=$3
SRC_FILE=$4
CFLAGS=$5
SRC_NAME=$(basename -- "${SRC_FILE}")
JIT_INCLUDES=${SRC_DIR}/mlx/backend/metal/kernels/jit
INPUT_FILE=${SRC_DIR}/mlx/backend/metal/kernels/${SRC_FILE}.h
OUTPUT_FILE=${OUTPUT_DIR}/${SRC_NAME}.cpp

# Prepare output
mkdir -p "$OUTPUT_DIR"

# Use the metal compiler to get a list of headers (with depth)
# Only stderr is captured (2>&1 1>/dev/null); the preprocessed source on
# stdout is discarded.
CCC="xcrun -sdk macosx metal -x metal"
HDRS=$( $CCC -I"$SRC_DIR" -I"$JIT_INCLUDES" -DMLX_METAL_JIT -E -P -CC -C -H "$INPUT_FILE" $CFLAGS -w 2>&1 1>/dev/null )

# Remove any included system frameworks (for MetalPerformancePrimitive headers)
HDRS=$(echo "$HDRS" | grep -v "Xcode")

# Use the header depth to sort the files in order of inclusion
# HDRS_LIST is word-split into alternating entries: a depth marker followed by
# a header path. The length of the marker string is used as the depth.
declare -a HDRS_LIST=($HDRS)
declare -a HDRS_STACK=()
declare -a HDRS_SORTED=()

length=${#HDRS_LIST[@]}

# Sentinel depth marker of length 1 so the final real entry has a "next" entry
# to compare against.
HDRS_LIST+=(".")

for ((i=0; i<${length}; i+=2));
do 

  # Header path with the leading "$SRC_DIR/" stripped
  header="${HDRS_LIST[$i+1]#$SRC_DIR/}"

  str_this="${HDRS_LIST[$i]}"
  str_next="${HDRS_LIST[$i + 2]}"

  depth_this=${#str_this}
  depth_next=${#str_next}

  # If we have a dependency then we stack it
  if [ $depth_next -gt $depth_this ]; then 
    HDRS_STACK=($header ${HDRS_STACK[@]})

  # If we are done with this level 
  else 
    # We add the header to our list
    HDRS_SORTED+=($header) 

    # Pop the stacked up dependencies
    pop_len=$((depth_this - depth_next))
    for popped_header in "${HDRS_STACK[@]:0:$pop_len}"
    do 
      HDRS_SORTED+=($popped_header)
    done 

    HDRS_STACK=(${HDRS_STACK[@]:$pop_len})
  fi  

done

# Make sure the given metal header is also expanded in the source content
HDRS_SORTED+=("${INPUT_FILE#$SRC_DIR/}")

# Expand the headers in order of inclusion 
CONTENT=$(
echo "// Copyright © 2025 Apple Inc."
echo "" 
echo "// Auto generated source for ${INPUT_FILE#$SRC_DIR/}"
echo ""

for header in "${HDRS_SORTED[@]}"
do 
  echo "///////////////////////////////////////////////////////////////////////////////"
  echo "// Contents from \"${header}\""
  echo "///////////////////////////////////////////////////////////////////////////////"
  echo ""

  echo "#line 1 \"${header}\""

  # Copy the header, dropping quoted #include lines and "#pragma once"
  grep -h -v -G -e "#include \".*.h\"" -e "#pragma once" "${SRC_DIR}/${header}" 
  
  echo ""
  
done

echo "///////////////////////////////////////////////////////////////////////////////"
)

# Export the generated source code content as a C++ function
# The function is named after SRC_NAME and returns the content as a raw
# string literal with the delimiter "preamble".
cat << EOF > "$OUTPUT_FILE"
namespace mlx::core::metal {

const char* $SRC_NAME() {
  return R"preamble(
$CONTENT
)preamble";
}

} // namespace mlx::core::metal
EOF


================================================
FILE: mlx/backend/metal/matmul.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <algorithm>
#include <cassert>
#include <numeric>
#include <sstream>

#include "mlx/backend/common/broadcasting.h"
#include "mlx/backend/common/matmul.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/binary.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/steel/gemm/params.h"
#include "mlx/backend/metal/matmul.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

// Inspect the two innermost strides of `arr` to decide how it can be read
// without copying.
//
// Returns (transposed, leading_stride, array):
//  - (false, stride[-2], arr) when the last axis has unit stride,
//  - (true,  stride[-1], arr) when the second-to-last axis has unit stride,
//  - otherwise a contiguous GPU copy is made on stream `s`, appended to
//    `copies`, and returned as (false, arr.shape(-1), copy).
//
// When `is_vector` is true the in-place cases additionally require the
// non-unit stride to equal the extent of the unit-stride axis.
std::tuple<bool, int64_t, array> check_transpose(
    std::vector<array>& copies,
    const Stream& s,
    const array& arr,
    bool is_vector) {
  const auto row_stride = arr.strides()[arr.ndim() - 2];
  const auto col_stride = arr.strides()[arr.ndim() - 1];

  // Usable as-is: unit stride along the last axis
  const bool plain_ok =
      col_stride == 1 && (!is_vector || row_stride == arr.shape(-1));
  if (plain_ok) {
    return std::make_tuple(false, row_stride, arr);
  }

  // Usable as a transposed view: unit stride along the second-to-last axis
  const bool transposed_ok =
      row_stride == 1 && (!is_vector || col_stride == arr.shape(-2));
  if (transposed_ok) {
    return std::make_tuple(true, col_stride, arr);
  }

  // Neither layout fits: fall back to a contiguous copy and keep it alive
  array arr_copy = contiguous_copy_gpu(arr, s);
  copies.push_back(arr_copy);
  return std::make_tuple(false, arr.shape(-1), arr_copy);
}

// Return `x` itself when its flags mark it row contiguous. Otherwise make a
// contiguous GPU copy on stream `s`, register the copy with the device as a
// temporary for that stream, and return the copy.
inline array
ensure_row_contiguous(const array& x, metal::Device& d, const Stream& s) {
  if (x.flags().row_contiguous) {
    return x;
  }

  array x_copy = contiguous_copy_gpu(x, s);
  d.add_temporary(x_copy, s.index);
  return x_copy;
}

// Return a view of `x` whose leading (batch) axes can be collapsed into one
// and whose trailing matrix is readable either directly or transposed.
//
// Returns (transposed, leading_stride, array):
//  - row-contiguous input is returned unchanged with stride[-2];
//  - if the batch axes are mutually contiguous and the last axis has unit
//    stride, `x` is returned with (false, stride[-2]);
//  - if the batch axes are mutually contiguous and the second-to-last axis
//    has unit stride, `x` is returned with (true, stride[-1]);
//  - otherwise a contiguous GPU copy is made on `s`, registered with the
//    device as a temporary, and returned with (false, copy stride[-2]).
//
// Only the batch axes are checked against each other; the stride between the
// innermost batch axis and the matrix is not examined here.
inline std::tuple<bool, int64_t, array>
ensure_batch_contiguous(const array& x, metal::Device& d, const Stream& s) {
  // Work with a signed rank: `ndim() - 3` in unsigned arithmetic would wrap
  // for inputs with fewer than three axes and send the loop below out of range
  const int ndim = static_cast<int>(x.ndim());

  if (x.flags().row_contiguous) {
    return std::make_tuple(false, x.strides()[ndim - 2], x);
  }

  // Adjacent batch axes i and i + 1 collapse into one exactly when
  // stride[i] == stride[i + 1] * shape[i + 1]
  bool rc = true;
  for (int i = 0; i < ndim - 3; i++) {
    rc &= x.strides()[i + 1] * x.shape(i + 1) == x.strides()[i];
  }
  if (rc) {
    auto stx = x.strides()[ndim - 2];
    auto sty = x.strides()[ndim - 1];
    auto K = x.shape(-2);
    auto N = x.shape(-1);
    // With a single column the unit stride alone is ambiguous, so the other
    // stride must match as well
    if (sty == 1 && (N != 1 || stx == N)) {
      return std::make_tuple(false, stx, x);
    }
    if (stx == 1 && (N != 1 || sty == K)) {
      return std::make_tuple(true, sty, x);
    }
  }

  array x_copy = contiguous_copy_gpu(x, s);
  d.add_temporary(x_copy, s.index);
  return std::make_tuple(false, x_copy.strides()[x_copy.ndim() - 2], x_copy);
}

} // namespace

///////////////////////////////////////////////////////////////////////////////
// Steel matmul fallback
///////////////////////////////////////////////////////////////////////////////

// Tile-parameter selection for the non-NAX steel GEMM dispatch.
//
// Expands to an if/else chain that overwrites the caller's `bm`, `bn`, `bk`,
// `wm` and `wn` variables. It is not hygienic: besides the `devc` argument it
// reads `out`, `transpose_a`, `transpose_b`, `batch_size_out`, `M`, `N` and
// `K` from the enclosing scope, and all five tile variables must already be
// declared there as assignable integers.
//
// `devc` is compared against 'g'/'p' (small device), 'd' (large device) and
// anything else (medium device). Branches that assign nothing -- float32 "nn"
// on small devices and float32 large matmuls on large devices -- leave the
// caller's initial values in place.
#define GEMM_TPARAM_MACRO(devc)                                           \
  if (devc == 'g' || devc == 'p') { /* Small device */                    \
    if (out.dtype() == complex64) {                                       \
      bm = 64;                                                            \
      bn = 32;                                                            \
      bk = 8;                                                             \
      wm = 4;                                                             \
      wn = 1;                                                             \
    } else if (!transpose_a && transpose_b) { /* nt */                    \
      bm = 64;                                                            \
      bn = 32;                                                            \
      bk = 32;                                                            \
      wm = 2;                                                             \
      wn = 2;                                                             \
    } else if (out.dtype() != float32) { /* half and bfloat */            \
      bm = 64;                                                            \
      bn = 64;                                                            \
      bk = 16;                                                            \
      wm = 1;                                                             \
      wn = 2;                                                             \
    }                                                                     \
  } else if (devc == 'd') { /* Large device */                            \
    if ((size_t)batch_size_out * M * N >= 1ul << 20) { /* large matmul */ \
      if (out.dtype() != float32) { /* half and bfloat */                 \
        if (2 * std::max(M, N) > K) { /* Reasonable K */                  \
          bm = 64;                                                        \
          bn = 64;                                                        \
          bk = 16;                                                        \
          wm = 1;                                                         \
          wn = 2;                                                         \
        } else if (!transpose_a && transpose_b) { /* nt with large k */   \
          bm = 64;                                                        \
          bn = 32;                                                        \
          bk = 32;                                                        \
          wm = 2;                                                         \
          wn = 2;                                                         \
        } else { /* nn with large K */                                    \
          bm = 32;                                                        \
          bn = 64;                                                        \
          bk = 16;                                                        \
          wm = 1;                                                         \
          wn = 2;                                                         \
        }                                                                 \
      } /* float takes default */                                         \
    } else { /* smaller matmul */                                         \
      if (out.dtype() != float32) { /* half and bfloat */                 \
        if (!transpose_a && transpose_b) { /* nt */                       \
          bm = 64;                                                        \
          bn = 32;                                                        \
          bk = 32;                                                        \
          wm = 2;                                                         \
          wn = 2;                                                         \
        } else { /* nn */                                                 \
          bm = 64;                                                        \
          bn = 64;                                                        \
          bk = 16;                                                        \
          wm = 1;                                                         \
          wn = 2;                                                         \
        }                                                                 \
      } else { /* floats */                                               \
        if (!transpose_a && transpose_b) { /* nt */                       \
          bm = 32;                                                        \
          bn = 64;                                                        \
          bk = 16;                                                        \
          wm = 1;                                                         \
          wn = 2;                                                         \
        } else { /* nn */                                                 \
          bm = 64;                                                        \
          bn = 32;                                                        \
          bk = 32;                                                        \
          wm = 2;                                                         \
          wn = 2;                                                         \
        }                                                                 \
      }                                                                   \
    }                                                                     \
  } else { /* Medium device */                                            \
    bm = 64;                                                              \
    bn = 64;                                                              \
    bk = 16;                                                              \
    wm = 2;                                                               \
    wn = 2;                                                               \
  }

///////////////////////////////////////////////////////////////////////////////
// Regular steel matmul dispatch
///////////////////////////////////////////////////////////////////////////////

template <bool CHECK_AB>
void steel_matmul_regular_axpby_nax(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    int ldd,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape,
    Strides batch_strides,
    int64_t A_batch_stride,
    int64_t B_batch_stride,
    int64_t matrix_stride_out,
    int64_t C_batch_stride /* = 0*/,
    float alpha /* = 1.0f */,
    float beta /* = 0.0f */) {
  using namespace mlx::steel;

  // Determine dispatch kernel
  int bm = 128, bn = 128, bk = 512;
  int wm = 4, wn = 4;

  // Temp routing for larger devices
  char devc = d.get_architecture().back();
  if (devc == 's' || devc == 'c' || devc == 'd') {
    bk = (K >= 8192 && K > (M + N)) ? 64 : 256;

    bm = 64;
    wm = 2;
  }

  // Prepare kernel name
  std::ostringstream kname;

  // clang-format off
  kname << "steel_gemm_fused_nax_"
        << (transpose_a ? 't' : 'n')
        << (transpose_b ? 't' : 'n')
        << "_" << type_to_name(a)
        << "_" << type_to_name(out)
        << "_bm" << bm << "_bn" << bn << "_bk" << bk
        << "_wm" << wm << "_wn" << wn; // clang-format on

  std::string base_name = kname.str();

  const bool has_batch = (batch_shape.size() > 1);
  const bool use_out_source = CHECK_AB && (alpha != 0.0f || beta != 1.0f);
  const bool do_axpby = use_out_source && (alpha != 1.0f || beta != 1.0f);
  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  metal::MTLFCList func_consts = {
      {&has_batch, MTL::DataType::DataTypeBool, 10},
      {&use_out_source, MTL::DataType::DataTypeBool, 100},
      {&do_axpby, MTL::DataType::DataTypeBool, 110},
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
      {&align_K, MTL::DataType::DataTypeBool, 202},
  };

  // clang-format off
  kname << "_has_batch_" << (has_batch ? 't' : 'n')
        << "_use_out_source_" << (use_out_source ? 't' : 'n')
        << "_do_axpby_" << (do_axpby ? 't' : 'n')
        << "_align_M_" << (align_M ? 't' : 'n')
        << "_align_N_" << (align_N ? 't' : 'n')
        << "_align_K_" << (align_K ? 't' : 'n'); // clang-format on

  std::string hash_name = kname.str();

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_fused_nax_kernel(
      /* metal::Device& d = */ d,
      /* const std::string& kernel_name = */ base_name,
      /* const std::string& hash_name = */ hash_name,
      /* const metal::MTLFCList& func_consts = */ func_consts,
      /* const array& out = */ out,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* int bm = */ bm,
      /* int bn = */ bn,
      /* int bk = */ bk,
      /* int wm = */ wm,
      /* int wn = */ wn);

  compute_encoder.set_compute_pipeline_state(kernel);

  // Use problem size to determine threadblock swizzle
  int tn = (N + bn - 1) / bn;
  int tm = (M + bm - 1) / bm;

  // TODO: Explore device-based tuning for swizzle
  int swizzle_log = tm <= 3 ? 0 : 1;
  if (devc == 's' || devc == 'c' || devc == 'd') {
    swizzle_log = 2;
  }

  // Prepare steel matmul params
  GEMMParams params{/* const int M = */ M,
                    /* const int N = */ N,
                    /* const int K = */ K,
                    /* const int lda = */ lda,
                    /* const int ldb = */ ldb,
                    /* const int ldd = */ ldd,
                    /* const int tiles_n = */ tn,
                    /* const int tiles_m = */ tm,
                    /* const int64_t batch_stride_a = */ A_batch_stride,
                    /* const int64_t batch_stride_b = */ B_batch_stride,
                    /* const int64_t batch_stride_d = */ matrix_stride_out,
                    /* const int swizzle_log = */ swizzle_log,
                    /* const int gemm_k_iterations_aligned = */ (K / bk),
                    /* const int batch_ndim = */ int(batch_shape.size())};

  // Prepare launch grid params
  int tile = 1 << swizzle_log;
  tm = (tm + tile - 1) / tile;
  tn = tn * tile;

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(tn, tm, batch_size_out);

  // Launch kernel
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_output_array(out, 3);

  compute_encoder.set_bytes(params, 4);

  if (has_batch) {
    compute_encoder.set_vector_bytes(batch_shape, 6);
    compute_encoder.set_vector_bytes(batch_strides, 7);
  }

  if (use_out_source) {
    int ldc = c.strides()[c.ndim() - 2];
    int fdc = c.strides()[c.ndim() - 1];

    GEMMAddMMParams params{/* const int ldc = */ ldc,
                           /* const int fdc = */ fdc,
                           /* const int64_t batch_stride_c = */ C_batch_stride,
                           /* const float alpha = */ alpha,
                           /* const float beta = */ beta};

    compute_encoder.set_input_array(c, 2);
    compute_encoder.set_bytes(params, 5);
  }

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Record copies
  d.add_temporaries(std::move(copies), s.index);
}

template <bool CHECK_AB>
void steel_matmul_regular_axpby(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    int ldd,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape,
    Strides batch_strides,
    int64_t A_batch_stride,
    int64_t B_batch_stride,
    int64_t matrix_stride_out,
    int64_t C_batch_stride /* = 0*/,
    float alpha /* = 1.0f */,
    float beta /* = 0.0f */) {
  if (metal::is_nax_available() && !issubdtype(a.dtype(), complexfloating) &&
      (env::enable_tf32() || a.dtype() != float32)) {
    return steel_matmul_regular_axpby_nax<CHECK_AB>(
        /* const Stream& s = */ s,
        /* metal::Device& d = */ d,
        /* const array& a = */ a,
        /* const array& b = */ b,
        /* const array& c = */ c,
        /* array& out = */ out,
        /* int M = */ M,
        /* int N = */ N,
        /* int K = */ K,
        /* int batch_size_out = */ batch_size_out,
        /* int lda = */ lda,
        /* int ldb = */ ldb,
        /* int ldd = */ ldd,
        /* bool transpose_a = */ transpose_a,
        /* bool transpose_b = */ transpose_b,
        /* std::vector<array>& copies = */ copies,
        /* Shape batch_shape = */ batch_shape,
        /* Strides batch_strides = */ batch_strides,
        /* int64_t A_batch_stride = */ A_batch_stride,
        /* int64_t B_batch_stride = */ B_batch_stride,
        /* int64_t matrix_stride_out = */ matrix_stride_out,
        /* int64_t C_batch_stride = */ C_batch_stride,
        /* float alpha = */ alpha,
        /* float beta = */ beta);
  }

  using namespace mlx::steel;

  // Determine dispatch kernel
  int bm = 64, bn = 64, bk = 16;
  int wm = 2, wn = 2;

  char devc = d.get_architecture().back();
  GEMM_TPARAM_MACRO(devc)

  // Prepare kernel name
  std::ostringstream kname;

  // clang-format off
  kname << "steel_gemm_fused_"
        << (transpose_a ? 't' : 'n')
        << (transpose_b ? 't' : 'n')
        << "_" << type_to_name(a)
        << "_" << type_to_name(out)
        << "_bm" << bm << "_bn" << bn << "_bk" << bk
        << "_wm" << wm << "_wn" << wn; // clang-format on

  std::string base_name = kname.str();

  const bool has_batch = (batch_shape.size() > 1);
  const bool use_out_source = CHECK_AB && (alpha != 0.0f || beta != 1.0f);
  const bool do_axpby = use_out_source && (alpha != 1.0f || beta != 1.0f);
  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  metal::MTLFCList func_consts = {
      {&has_batch, MTL::DataType::DataTypeBool, 10},
      {&use_out_source, MTL::DataType::DataTypeBool, 100},
      {&do_axpby, MTL::DataType::DataTypeBool, 110},
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
      {&align_K, MTL::DataType::DataTypeBool, 202},
  };

  // clang-format off
  kname << "_has_batch_" << (has_batch ? 't' : 'n')
        << "_use_out_source_" << (use_out_source ? 't' : 'n')
        << "_do_axpby_" << (do_axpby ? 't' : 'n')
        << "_align_M_" << (align_M ? 't' : 'n')
        << "_align_N_" << (align_N ? 't' : 'n')
        << "_align_K_" << (align_K ? 't' : 'n'); // clang-format on

  std::string hash_name = kname.str();

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_fused_kernel(
      /* metal::Device& d = */ d,
      /* const std::string& kernel_name = */ base_name,
      /* const std::string& hash_name = */ hash_name,
      /* const metal::MTLFCList& func_consts = */ func_consts,
      /* const array& out = */ out,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* int bm = */ bm,
      /* int bn = */ bn,
      /* int bk = */ bk,
      /* int wm = */ wm,
      /* int wn = */ wn);

  compute_encoder.set_compute_pipeline_state(kernel);

  // Use problem size to determine threadblock swizzle
  int tn = (N + bn - 1) / bn;
  int tm = (M + bm - 1) / bm;

  // TODO: Explore device-based tuning for swizzle
  int swizzle_log = 0; // tm >= 6 ? 3 : (tm <= 3 ? 0 : 2);

  // Prepare steel matmul params
  GEMMParams params{/* const int M = */ M,
                    /* const int N = */ N,
                    /* const int K = */ K,
                    /* const int lda = */ lda,
                    /* const int ldb = */ ldb,
                    /* const int ldd = */ ldd,
                    /* const int tiles_n = */ tn,
                    /* const int tiles_m = */ tm,
                    /* const int64_t batch_stride_a = */ A_batch_stride,
                    /* const int64_t batch_stride_b = */ B_batch_stride,
                    /* const int64_t batch_stride_d = */ matrix_stride_out,
                    /* const int swizzle_log = */ swizzle_log,
                    /* const int gemm_k_iterations_aligned = */ (K / bk),
                    /* const int batch_ndim = */ int(batch_shape.size())};

  // Prepare launch grid params
  int tile = 1 << swizzle_log;
  tm = (tm + tile - 1) / tile;
  tn = tn * tile;

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(tn, tm, batch_size_out);

  // Launch kernel
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_output_array(out, 3);

  compute_encoder.set_bytes(params, 4);

  if (has_batch) {
    compute_encoder.set_vector_bytes(batch_shape, 6);
    compute_encoder.set_vector_bytes(batch_strides, 7);
  }

  if (use_out_source) {
    int ldc = c.strides()[c.ndim() - 2];
    int fdc = c.strides()[c.ndim() - 1];

    GEMMAddMMParams params{/* const int ldc = */ ldc,
                           /* const int fdc = */ fdc,
                           /* const int64_t batch_stride_c = */ C_batch_stride,
                           /* const float alpha = */ alpha,
                           /* const float beta = */ beta};

    compute_encoder.set_input_array(c, 2);
    compute_encoder.set_bytes(params, 5);
  }

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Record copies
  d.add_temporaries(std::move(copies), s.index);
}

///////////////////////////////////////////////////////////////////////////////
// Split k steel matmul
///////////////////////////////////////////////////////////////////////////////

template <bool CHECK_AB = true>
void steel_gemm_splitk_axpby(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    float alpha = 1.0f,
    float beta = 0.0f) {
  using namespace mlx::steel;

  int _tm = (M + 32 - 1) / 32;
  int _tn = (N + 32 - 1) / 32;
  int _tk = K / 16;

  int bm = M < 40 ? 16 : 32;
  int bn = N < 40 ? 16 : 32;
  int bk = 16;
  int wm = 2, wn = 2;

  // As _tk grows use more partitions, as _tm * _tn grow use fewer partitions
  int split_k_partitions =
      std::min(std::max(2, next_power_of_2(_tk / (_tm * _tn))), 32);
  int split_k_partition_stride = M * N;
  int gemm_k_iterations = (K / bk) / split_k_partitions;
  int split_k_partition_size = gemm_k_iterations * bk;

  array C_split(
      {split_k_partitions, M, N},
      issubdtype(out.dtype(), complexfloating) ? complex64 : float32,
      nullptr,
      {});
  C_split.set_data(allocator::malloc(C_split.nbytes()));
  copies.push_back(C_split);

  bool mn_aligned = M % bm == 0 && N % bn == 0;
  bool k_aligned = K % bk == 0;
  std::ostringstream kname;

  // clang-format off
  kname << "steel_gemm_splitk_"
        << (transpose_a ? 't' : 'n')
        << (transpose_b ? 't' : 'n')
        << "_" << type_to_name(a)
        << "_" << type_to_name(C_split)
        << "_bm" << bm << "_bn" << bn << "_bk" << bk
        << "_wm" << wm << "_wn" << wn
        << "_MN_" << (mn_aligned ? "t" : "n") << "aligned"
        << "_K_" << (k_aligned ? "t" : "n") << "aligned"; // clang-format on

  // Encode and dispatch gemm kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_splitk_kernel(
      /* metal::Device& d = */ d,
      /* const std::string& kernel_name = */ kname.str(),
      /* const array& in = */ a,
      /* const array& out = */ C_split,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* int bm = */ bm,
      /* int bn = */ bn,
      /* int bk = */ bk,
      /* int wm = */ wm,
      /* int wn = */ wn,
      /* bool mn_aligned = */ mn_aligned,
      /* bool k_aligned = */ k_aligned);

  compute_encoder.set_compute_pipeline_state(kernel);

  int tn = (N + bn - 1) / bn;
  int tm = (M + bm - 1) / bm;

  GEMMSpiltKParams params{
      /* const int M = */ M,
      /* const int N = */ N,
      /* const int K = */ K,
      /* const int lda = */ lda,
      /* const int ldb = */ ldb,
      /* const int ldc = */ N,
      /* const int tiles_n = */ tn,
      /* const int tiles_m = */ tm,
      /* const int split_k_partitions = */ split_k_partitions,
      /* const int split_k_partition_stride = */ split_k_partition_stride,
      /* const int split_k_partition_size = */ split_k_partition_size,
      /* const int swizzle_log = */ 0, // no swizzle
      /* const int gemm_k_iterations_aligned = */ gemm_k_iterations};

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(tn, tm, split_k_partitions);

  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_output_array(C_split, 2);

  compute_encoder.set_bytes(params, 3);
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Do accum kernel
  {
    const bool do_axpby = CHECK_AB && (alpha != 1.0f || beta != 0.0f);

    auto kernel_name = "steel_gemm_splitk_accum_" + type_to_name(out) + "_" +
        type_to_name(C_split);

    if (do_axpby) {
      kernel_name = kernel_name + "_axbpy";
    }

    auto kernel = get_steel_gemm_splitk_accum_kernel(
        /* metal::Device& d = */ d,
        /* const std::string& kernel_name = */ kernel_name,
        /* const array& in = */ C_split,
        /* const array& out = */ out,
        /* bool axbpy = */ do_axpby);
    compute_encoder.set_compute_pipeline_state(kernel);

    // Set the arguments for the kernel
    compute_encoder.set_input_array(C_split, 0);
    compute_encoder.set_output_array(out, 1);
    compute_encoder.set_bytes(split_k_partitions, 2);
    compute_encoder.set_bytes(split_k_partition_stride, 3);
    compute_encoder.set_bytes(N, 4);

    if (do_axpby) {
      int ldc = c.strides()[c.ndim() - 2];
      int fdc = c.strides()[c.ndim() - 1];

      compute_encoder.set_input_array(c, 5);
      compute_encoder.set_bytes(ldc, 6);
      compute_encoder.set_bytes(fdc, 7);
      compute_encoder.set_bytes(alpha, 8);
      compute_encoder.set_bytes(beta, 9);
    }

    // Launch enough thread groups for each output
    MTL::Size grid_dims = MTL::Size(N, M, 1);
    auto group_dims = get_block_dims(N, M, 1);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }

  d.add_temporaries(std::move(copies), s.index);
}

///////////////////////////////////////////////////////////////////////////////
// NAX Split k steel matmul
///////////////////////////////////////////////////////////////////////////////

template <bool CHECK_AB = true>
void steel_gemm_splitk_axpby_nax(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    float alpha = 1.0f,
    float beta = 0.0f) {
  using namespace mlx::steel;

  constexpr int bm = 128, bn = 128, bk = 512;
  constexpr int wm = 4, wn = 4;

  // Determine how many partitions to split K into
  constexpr int split_k_partition_size = 3072;
  int split_k_partitions =
      (K + split_k_partition_size - 1) / split_k_partition_size;

  const int bk_iters_per_partition = split_k_partition_size / bk;
  const int split_k_partition_stride = M * N;

  array C_split({split_k_partitions, M, N}, float32, nullptr, {});
  C_split.set_data(allocator::malloc(C_split.nbytes()));
  copies.push_back(C_split);

  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  // Per-tile align_K is checked at runtime; only the last tile can be unaligned
  metal::MTLFCList func_consts = {
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201}};

  std::ostringstream kname;

  // clang-format off
  kname << "steel_gemm_splitk_nax_"
        << (transpose_a ? 't' : 'n')
        << (transpose_b ? 't' : 'n')
        << "_" << type_to_name(a)
        << "_" << type_to_name(C_split)
        << "_bm" << bm << "_bn" << bn << "_bk" << bk
        << "_wm" << wm << "_wn" << wn; // clang-format on

  std::string base_name = kname.str();

  // clang-format off
  kname << "_align_M_" << (align_M ? 't' : 'n')
        << "_align_N_" << (align_N ? 't' : 'n')
        << "_align_K_" << (align_K ? 't' : 'n'); // clang-format on

  std::string hash_name = kname.str();

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_splitk_nax_kernel(
      /* metal::Device& d = */ d,
      /* const std::string& kernel_name = */ base_name,
      /* const std::string& hash_name = */ hash_name,
      /* const metal::MTLFCList& func_consts = */ func_consts,
      /* const array& out = */ C_split,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* int bm = */ bm,
      /* int bn = */ bn,
      /* int bk = */ bk,
      /* int wm = */ wm,
      /* int wn = */ wn);

  compute_encoder.set_compute_pipeline_state(kernel);

  int tn = (N + bn - 1) / bn;
  int tm = (M + bm - 1) / bm;

  int swizzle_log = tm <= 3 ? 0 : 1;

  // Compute swizzled tile counts
  int tile = 1 << swizzle_log;
  int tm_swizzled = (tm + tile - 1) / tile;
  int tn_swizzled = tn * tile;

  GEMMSpiltKParams params{
      /* const int M = */ M,
      /* const int N = */ N,
      /* const int K = */ K,
      /* const int lda = */ lda,
      /* const int ldb = */ ldb,
      /* const int ldc = */ N,
      /* const int tiles_n = */ tn,
      /* const int tiles_m = */ tm,
      /* const int split_k_partitions = */ split_k_partitions,
      /* const int split_k_partition_stride = */ split_k_partition_stride,
      /* const int split_k_partition_size = */ split_k_partition_size,
      /* const int swizzle_log = */ swizzle_log,
      /* const int gemm_k_iterations_aligned = */ bk_iters_per_partition};

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  // Use 1D grid with K-partition-major layout: [Partition0: M×N
  // tiles][Partition1: M×N tiles]... Grid size is 1D to prevent driver/HW from
  // using its own heuristic to exploit 2D locality by launching threadgroups in
  // a non-linear order
  MTL::Size grid_dims =
      MTL::Size(tn_swizzled * tm_swizzled * split_k_partitions, 1, 1);

  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_output_array(C_split, 2);

  compute_encoder.set_bytes(params, 3);
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Do accum kernel
  {
    const bool do_axpby = CHECK_AB && (alpha != 1.0f || beta != 0.0f);

    auto kernel_name = "steel_gemm_splitk_accum_" + type_to_name(out) + "_" +
        type_to_name(C_split);

    if (do_axpby) {
      kernel_name = kernel_name + "_axbpy";
    }

    auto kernel = get_steel_gemm_splitk_accum_kernel(
        /* metal::Device& d = */ d,
        /* const std::string& kernel_name = */ kernel_name,
        /* const array& in = */ C_split,
        /* const array& out = */ out,
        /* bool axbpy = */ do_axpby);
    compute_encoder.set_compute_pipeline_state(kernel);

    // Set the arguments for the kernel
    compute_encoder.set_input_array(C_split, 0);
    compute_encoder.set_output_array(out, 1);
    compute_encoder.set_bytes(split_k_partitions, 2);
    compute_encoder.set_bytes(split_k_partition_stride, 3);
    compute_encoder.set_bytes(N, 4);

    if (do_axpby) {
      int ldc = c.strides()[c.ndim() - 2];
      int fdc = c.strides()[c.ndim() - 1];

      compute_encoder.set_input_array(c, 5);
      compute_encoder.set_bytes(ldc, 6);
      compute_encoder.set_bytes(fdc, 7);
      compute_encoder.set_bytes(alpha, 8);
      compute_encoder.set_bytes(beta, 9);
    }

    // Launch enough thread groups for each output
    MTL::Size grid_dims = MTL::Size(N, M, 1);
    auto group_dims = get_block_dims(N, M, 1);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }

  d.add_temporaries(std::move(copies), s.index);
}

///////////////////////////////////////////////////////////////////////////////
// Split matmul routing
///////////////////////////////////////////////////////////////////////////////

template <bool CHECK_AB>
void steel_matmul_axpby(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape /* = {} */,
    Strides A_batch_stride /* = {} */,
    Strides B_batch_stride /* = {} */,
    Strides C_batch_stride /* = {} */,
    float alpha /* = 1.0f */,
    float beta /* = 0.0f */) {
  if (batch_shape.empty()) {
    /////////////////////////////////////////////////////////////////////////////
    // Check and collapse batch dimensions
    if constexpr (CHECK_AB) {
      auto [batch_shape_, A_bstride_, B_bstride_, C_bstride_] =
          collapse_batches(a, b, c);

      batch_shape = batch_shape_;
      A_batch_stride = A_bstride_;
      B_batch_stride = B_bstride_;
      C_batch_stride = C_bstride_;
      // Collapse batches into M if needed
      if (batch_size_out > 1 && !transpose_a && batch_shape.size() == 1 &&
          a.strides()[a.ndim() - 2] == K && A_batch_stride.back() == M * K &&
          C_batch_stride.back() == M * c.strides()[c.ndim() - 2] &&
          B_batch_stride.back() == 0) {
        M *= batch_shape.back();
        batch_size_out = 1;

        A_batch_stride = {0};
        B_batch_stride = {0};
        C_batch_stride = {0};
        batch_shape = {1};
      }
    } else {
      auto [batch_shape_, A_bstride_, B_bstride_] = collapse_batches(a, b);

      batch_shape = batch_shape_;
      A_batch_stride = A_bstride_;
      B_batch_stride = B_bstride_;
      // Collapse batches into M if needed
      if (batch_size_out > 1 && !transpose_a && batch_shape.size() == 1 &&
          a.strides()[a.ndim() - 2] == K && A_batch_stride.back() == M * K &&
          B_batch_stride.back() == 0) {
        M *= batch_shape.back();
        batch_size_out = 1;

        A_batch_stride = {0};
        B_batch_stride = {0};
        batch_shape = {1};
      }
    }
  }

  /////////////////////////////////////////////////////////////////////////////
  // Split K specialization

  int _tm = (M + 16 - 1) / 16;
  int _tn = (N + 16 - 1) / 16;
  int _tk = K / 16;

  // Case 1: Small M×N with large K, use SIMD split-K
  char devc = d.get_architecture().back();
  // Max and Ultra dispatch larger sizes to splitk
  int min_tmn_threshold = (devc == 's' || devc == 'd') ? 2048 : 1024;
  if (batch_size_out == 1 && (_tm * _tn) <= min_tmn_threshold && _tk >= 8 &&
      K >= std::max(M, N)) {
    return steel_gemm_splitk_axpby<CHECK_AB>(
        /* const Stream& s = */ s,
        /* metal::Device& d = */ d,
        /* const array& a = */ a,
        /* const array& b = */ b,
        /* const array& c = */ c,
        /* array& out = */ out,
        /* int M = */ M,
        /* int N = */ N,
        /* int K = */ K,
        /* int batch_size_out = */ batch_size_out,
        /* int lda = */ lda,
        /* int ldb = */ ldb,
        /* bool transpose_a = */ transpose_a,
        /* bool transpose_b = */ transpose_b,
        /* std::vector<array>& copies = */ copies,
        /* float alpha = */ alpha,
        /* float beta = */ beta);
  }

  // Case 2: Large K with sufficient M, N, and NAX is available, use NAX split-K
  // TODO: Add device-specific tuning for more NAX GPUs in the future
  constexpr int min_mn_threshold = 2048 * 2048;
  constexpr int min_k_threshold = 10240;
  if (batch_size_out == 1 && metal::is_nax_available() &&
      !issubdtype(a.dtype(), complexfloating) &&
      (env::enable_tf32() || a.dtype() != float32) &&
      int64_t(M) * N >= min_mn_threshold && K >= min_k_threshold &&
      K >= (3 * std::max(M, N))) {
    return steel_gemm_splitk_axpby_nax<CHECK_AB>(
        /* const Stream& s = */ s,
        /* metal::Device& d = */ d,
        /* const array& a = */ a,
        /* const array& b = */ b,
        /* const array& c = */ c,
        /* array& out = */ out,
        /* int M = */ M,
        /* int N = */ N,
        /* int K = */ K,
        /* int batch_size_out = */ batch_size_out,
        /* int lda = */ lda,
        /* int ldb = */ ldb,
        /* bool transpose_a = */ transpose_a,
        /* bool transpose_b = */ transpose_b,
        /* std::vector<array>& copies = */ copies,
        /* float alpha = */ alpha,
        /* float beta = */ beta);
  }

  /////////////////////////////////////////////////////////////////////////////
  // Regular kernel dispatch
  auto batch_strides = A_batch_stride;
  batch_strides.insert(
      batch_strides.end(), B_batch_stride.begin(), B_batch_stride.end());
  if (CHECK_AB && !C_batch_stride.empty()) {
    batch_strides.insert(
        batch_strides.end(), C_batch_stride.begin(), C_batch_stride.end());
  }

  int64_t A_batch_stride_ = A_batch_stride.empty() ? 0 : A_batch_stride.back();
  int64_t B_batch_stride_ = B_batch_stride.empty() ? 0 : B_batch_stride.back();
  int64_t C_batch_stride_ = C_batch_stride.empty() ? 0 : C_batch_stride.back();

  return steel_matmul_regular_axpby<CHECK_AB>(
      /* const Stream& s = */ s,
      /* metal::Device& d = */ d,
      /* const array& a = */ a,
      /* const array& b = */ b,
      /* const array& c = */ c,
      /* array& out = */ out,
      /* int M = */ M,
      /* int N = */ N,
      /* int K = */ K,
      /* int batch_size_out = */ batch_size_out,
      /* int lda = */ lda,
      /* int ldb = */ ldb,
      /* int ldd = */ N,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* std::vector<array>& copies = */ copies,
      /* Shape batch_shape = */ std::move(batch_shape),
      /* Strides batch_strides = */ std::move(batch_strides),
      /* int64_t A_batch_stride = */ A_batch_stride_,
      /* int64_t B_batch_stride = */ B_batch_stride_,
      /* int64_t matrix_stride_out = */ int64_t(M) * N,
      /* int64_t C_batch_stride = */ C_batch_stride_,
      /* float alpha = */ alpha,
      /* float beta = */ beta);
}

///////////////////////////////////////////////////////////////////////////////
// GEMV dispatch
///////////////////////////////////////////////////////////////////////////////

template <bool CHECK_AB = true>
void gemv_axbpy(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape = {},
    Strides A_batch_stride = {},
    Strides B_batch_stride = {},
    Strides C_batch_stride = {},
    float alpha = 1.0f,
    float beta = 0.0f) {
  // Collect problem info
  bool is_b_matrix = N != 1;

  auto& mat = is_b_matrix ? b : a;
  auto& vec = is_b_matrix ? a : b;
  bool transpose_mat = is_b_matrix ? !transpose_b : transpose_a;
  int in_vector_len = K;
  int out_vector_len = is_b_matrix ? N : M;

  int mat_ld = is_b_matrix ? ldb : lda;

  auto batch_strides_mat = is_b_matrix ? B_batch_stride : A_batch_stride;
  auto batch_strides_vec = is_b_matrix ? A_batch_stride : B_batch_stride;

  // Determine if inputs have simple batching / broadcasting
  bool contiguous_kernel = (batch_shape.size() == 1);

  int batch_ndim = batch_shape.size();

  // Determine dispatch kernel
  int tm = 4, tn = 4;
  int sm = 1, sn = 32;
  int bm = 1, bn = 1;
  int n_out_per_tgp;
  std::ostringstream kname;

  if (transpose_mat) {
    if (in_vector_len >= 8192 && out_vector_len >= 2048) {
      sm = 4;
      sn = 8;
    } else {
      sm = 8;
      sn = 4;
    }

    if (out_vector_len >= 2048) {
      bn = 16;
    } else if (out_vector_len >= 512) {
      bn = 4;
    } else {
      bn = 2;
    }

    // Specialized kernel for very small outputs
    tn = out_vector_len < tn ? 1 : tn;

    n_out_per_tgp = bn * sn * tn;
    kname << "gemv_t_" << type_to_name(out);

  } else {
    bm = out_vector_len >= 4096 ? 8 : 4;
    sn = 32;

    if (K <= 64) {
      bm = 1;
      sm = 8;
      sn = 4;
    } else if (K >= 16 * out_vector_len) {
      bm = 1;
      bn = 8;
    }

    // Specialized kernel for very small outputs
    tm = out_vector_len < tm ? 1 : tm;

    n_out_per_tgp = bm * sm * tm;
    kname << "gemv_" << type_to_name(out);
  }

  const bool do_axpby = CHECK_AB && (alpha != 1.0f || beta != 0.0f);

  // clang-format off
  kname << "_bm" << bm << "_bn" << bn
        << "_sm" << sm << "_sn" << sn
        << "_tm" << tm << "_tn" << tn
        << "_nc" << !contiguous_kernel
        << "_axpby" << do_axpby; // clang-format on

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname.str());
  compute_encoder.set_compute_pipeline_state(kernel);

  int n_tgp = (out_vector_len + n_out_per_tgp - 1) / n_out_per_tgp;
  MTL::Size group_dims = MTL::Size(32, bn, bm);
  MTL::Size grid_dims = MTL::Size(n_tgp, 1, batch_size_out);

  compute_encoder.set_input_array(mat, 0);
  compute_encoder.set_input_array(vec, 1);
  compute_encoder.set_output_array(out, 3);

  compute_encoder.set_bytes(in_vector_len, 4);
  compute_encoder.set_bytes(out_vector_len, 5);
  compute_encoder.set_bytes(mat_ld, 6);

  compute_encoder.set_bytes(batch_ndim, 9);
  compute_encoder.set_vector_bytes(batch_shape, 10);
  compute_encoder.set_vector_bytes(batch_strides_vec, 11);
  compute_encoder.set_vector_bytes(batch_strides_mat, 12);

  if (do_axpby) {
    compute_encoder.set_input_array(c, 2);

    compute_encoder.set_bytes(alpha, 7);
    compute_encoder.set_bytes(beta, 8);

    compute_encoder.set_vector_bytes(C_batch_stride, 13);

    int bias_stride = c.strides()[c.ndim() - 1];
    compute_encoder.set_bytes(bias_stride, 14);
  }

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  d.add_temporaries(std::move(copies), s.index);
}

// Plain matrix-vector product: forwards to gemv_axbpy<false>.
//
// With CHECK_AB == false the callee never reads its `c` argument, so `b` is
// passed in that slot purely as a placeholder. The C batch strides, alpha and
// beta keep the callee's defaults. The by-value batch descriptors are moved
// into the callee instead of being copied a second time.
inline void gemv(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape = {},
    Strides A_batch_stride = {},
    Strides B_batch_stride = {}) {
  return gemv_axbpy<false>(
      /* const Stream& s = */ s,
      /* metal::Device& d = */ d,
      /* const array& a = */ a,
      /* const array& b = */ b,
      /* const array& c = */ b, // placeholder, unused when CHECK_AB is false
      /* array& out = */ out,
      /* int M = */ M,
      /* int N = */ N,
      /* int K = */ K,
      /* int batch_size_out = */ batch_size_out,
      /* int lda = */ lda,
      /* int ldb = */ ldb,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* std::vector<array>& copies = */ copies,
      /* Shape batch_shape = */ std::move(batch_shape),
      /* Strides A_batch_stride = */ std::move(A_batch_stride),
      /* Strides B_batch_stride = */ std::move(B_batch_stride));
}

///////////////////////////////////////////////////////////////////////////////
// Matmul implementation
///////////////////////////////////////////////////////////////////////////////

// GPU evaluation of Matmul for inputs[0] (a) and inputs[1] (b).
//
// Throws std::runtime_error when the output dtype is not inexact. If either
// input is empty the output is filled with zeros. Otherwise the output buffer
// is allocated, batch dimensions are collapsed (and folded into M when
// possible), and the work goes to gemv when M == 1 or N == 1, or to
// steel_matmul for every other shape.
void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 2);
  if (!issubdtype(out.dtype(), inexact)) {
    throw std::runtime_error("[matmul] dtype must be inexact.");
  }
  auto& s = stream();
  auto& d = metal::device(s.device);

  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];
  // Return 0s if either input is empty
  if (a_pre.size() == 0 || b_pre.size() == 0) {
    array zero = array(0, a_pre.dtype());
    fill_gpu(zero, out, s);
    d.add_temporary(std::move(zero), s.index);
    return;
  }

  out.set_data(allocator::malloc(out.nbytes()));

  /////////////////////////////////////////////////////////////////////////////
  // Init checks and prep

  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
  int K = a_pre.shape(-1);

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
  std::vector<array> copies;
  auto [a_transposed, a_cols, a] = check_transpose(copies, s, a_pre, M == 1);
  auto [b_transposed, b_cols, b] = check_transpose(copies, s, b_pre, N == 1);

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions

  auto [batch_shape, A_batch_stride, B_batch_stride] = collapse_batches(a, b);

  auto batch_size_out = out.size() / (size_t(M) * size_t(N));

  // Collapse batches into M if needed. This applies when `a` is not
  // transposed, has row stride K and batch stride M * K, and `b` has a batch
  // stride of 0. The expected batch stride is computed in 64 bits because
  // M * K does not necessarily fit in an int.
  if (batch_size_out > 1 && !a_transposed && batch_shape.size() == 1 &&
      a.strides()[a.ndim() - 2] == K &&
      A_batch_stride.back() == int64_t(M) * K &&
      B_batch_stride.back() == 0) {
    M *= batch_shape.back();
    batch_size_out = 1;

    A_batch_stride = {0};
    B_batch_stride = {0};
    batch_shape = {1};
  }

  /////////////////////////////////////////////////////////////////////////////
  // Gemv specialization

  // Route to gemv if needed
  if (std::min(M, N) == 1) {
    return gemv(
        /* const Stream& s = */ s,
        /* metal::Device& d = */ d,
        /* const array& a = */ a,
        /* const array& b = */ b,
        /* array& out = */ out,
        /* int M = */ M,
        /* int N = */ N,
        /* int K = */ K,
        /* int batch_size_out = */ batch_size_out,
        /* int lda = */ a_cols,
        /* int ldb = */ b_cols,
        /* bool transpose_a = */ a_transposed,
        /* bool transpose_b = */ b_transposed,
        /* std::vector<array>& copies = */ copies,
        /* Shape batch_shape = */ std::move(batch_shape),
        /* Strides A_batch_stride = */ std::move(A_batch_stride),
        /* Strides B_batch_stride = */ std::move(B_batch_stride));
  }

  /////////////////////////////////////////////////////////////////////////////
  // Gemm specialization

  return steel_matmul(
      /* const Stream& s = */ s,
      /* metal::Device& d = */ d,
      /* const array& a = */ a,
      /* const array& b = */ b,
      /* array& out = */ out,
      /* int M = */ M,
      /* int N = */ N,
      /* int K = */ K,
      /* int batch_size_out = */ batch_size_out,
      /* int lda = */ a_cols,
      /* int ldb = */ b_cols,
      /* bool transpose_a = */ a_transposed,
      /* bool transpose_b = */ b_transposed,
      /* std::vector<array>& copies = */ copies,
      /* Shape batch_shape = */ std::move(batch_shape),
      /* Strides A_batch_stride = */ std::move(A_batch_stride),
      /* Strides B_batch_stride = */ std::move(B_batch_stride));
}

///////////////////////////////////////////////////////////////////////////////
// AddMM implementation
///////////////////////////////////////////////////////////////////////////////

// GPU evaluation of AddMM for inputs[0] (a), inputs[1] (b) and inputs[2]
// (c), using the member scalars alpha_ and beta_.
//
// Throws std::runtime_error when the output dtype is not floating point.
// Special cases handled up front:
//   - empty output: only the (empty) buffer is allocated;
//   - K == 0: the result is `c` copied as is when beta_ == 1, otherwise `c`
//     multiplied by beta_.
// Otherwise the batch dimensions are collapsed (and folded into M when
// possible) and the work goes to gemv_axbpy when M == 1 or N == 1, or to
// steel_matmul_axpby for every other shape.
void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 3);
  if (!issubdtype(out.dtype(), floating)) {
    throw std::runtime_error(
        "[matmul] Does not yet support non-floating point types.");
  }

  // Nothing to compute for an empty output; just allocate it
  if (out.size() == 0) {
    out.set_data(allocator::malloc(out.nbytes()));
    return;
  }

  auto& s = stream();
  auto& d = metal::device(s.device);

  // Handle empty matrix case (K=0)
  if (inputs[0].shape(-1) == 0) {
    auto& c = inputs[2];
    if (beta_ == 1.0f) {
      copy_gpu(
          c,
          out,
          c.flags().row_contiguous ? CopyType::Vector : CopyType::General,
          s);
    } else {
      array beta_scalar = array(beta_, c.dtype());
      binary_op_gpu({c, beta_scalar}, out, "Multiply", s);
      d.add_temporary(std::move(beta_scalar), s.index);
    }
    return;
  }

  out.set_data(allocator::malloc(out.nbytes()));

  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];
  auto& c_pre = inputs[2];

  /////////////////////////////////////////////////////////////////////////////
  // Init checks and prep

  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
  int K = a_pre.shape(-1);

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
  std::vector<array> copies;
  auto [transpose_a, a_cols, a] = check_transpose(copies, s, a_pre, M == 1);
  auto [transpose_b, b_cols, b] = check_transpose(copies, s, b_pre, N == 1);

  array c = c_pre;

  int lda = a_cols;
  int ldb = b_cols;

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
  auto [batch_shape, A_batch_stride, B_batch_stride, C_batch_stride] =
      collapse_batches(a, b, c);

  int64_t matrix_stride_out = M * static_cast<int64_t>(N);
  auto batch_size_out = out.size() / (matrix_stride_out);

  // Collapse batches into M if needed. This applies when `a` is not
  // transposed, has row stride K and batch stride M * K, the batch stride of
  // `c` is M times its row stride, and `b` has a batch stride of 0. The
  // expected batch stride of `a` is computed in 64 bits because M * K does
  // not necessarily fit in an int.
  if (batch_size_out > 1 && !transpose_a && batch_shape.size() == 1 &&
      a.strides()[a.ndim() - 2] == K &&
      A_batch_stride.back() == static_cast<int64_t>(M) * K &&
      C_batch_stride.back() == M * c.strides()[c.ndim() - 2] &&
      B_batch_stride.back() == 0) {
    M *= batch_shape.back();
    batch_size_out = 1;

    A_batch_stride = {0};
    B_batch_stride = {0};
    C_batch_stride = {0};
    batch_shape = {1};
  }

  /////////////////////////////////////////////////////////////////////////////
  // Gemv specialization

  // Route to gemv if needed
  if (std::min(M, N) == 1) {
    return gemv_axbpy(
        /* const Stream& s = */ s,
        /* metal::Device& d = */ d,
        /* const array& a = */ a,
        /* const array& b = */ b,
        /* const array& c = */ c,
        /* array& out = */ out,
        /* int M = */ M,
        /* int N = */ N,
        /* int K = */ K,
        /* int batch_size_out = */ batch_size_out,
        /* int lda = */ lda,
        /* int ldb = */ ldb,
        /* bool transpose_a = */ transpose_a,
        /* bool transpose_b = */ transpose_b,
        /* std::vector<array>& copies = */ copies,
        /* Shape batch_shape = */ std::move(batch_shape),
        /* Strides A_batch_stride = */ std::move(A_batch_stride),
        /* Strides B_batch_stride = */ std::move(B_batch_stride),
        /* Strides C_batch_stride = */ std::move(C_batch_stride),
        /* float alpha = */ alpha_,
        /* float beta = */ beta_);
  }

  /////////////////////////////////////////////////////////////////////////////
  // Regular addmm dispatch

  return steel_matmul_axpby(
      /* const Stream& s = */ s,
      /* metal::Device& d = */ d,
      /* const array& a = */ a,
      /* const array& b = */ b,
      /* const array& c = */ c,
      /* array& out = */ out,
      /* int M = */ M,
      /* int N = */ N,
      /* int K = */ K,
      /* int batch_size_out = */ batch_size_out,
      /* int lda = */ lda,
      /* int ldb = */ ldb,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* std::vector<array>& copies = */ copies,
      /* Shape batch_shape = */ std::move(batch_shape),
      /* Strides A_batch_stride = */ std::move(A_batch_stride),
      /* Strides B_batch_stride = */ std::move(B_batch_stride),
      /* Strides C_batch_stride = */ std::move(C_batch_stride),
      /* float alpha = */ alpha_,
      /* float beta = */ beta_);
}

///////////////////////////////////////////////////////////////////////////////
// BlockMaskedMM implementation
///////////////////////////////////////////////////////////////////////////////

// GPU evaluation of a block-masked matmul.
//
// Input layout, as read below:
//   inputs[0], inputs[1]  - lhs (a) and rhs (b) matrices
//   inputs[2]             - output mask, present when there are 3 or 5 inputs
//   last two inputs       - lhs / rhs operand masks, present when there are
//                           more than 3 inputs
//
// Throws std::runtime_error when the output dtype is not floating point. If
// either matrix is empty the output is filled with zeros. Problems with
// M == 1 or N == 1 go through a masked gemv kernel; every other shape uses
// the masked steel gemm kernel with bm == bn == block_size_.
void BlockMaskedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  using namespace mlx::steel;
  // assert(inputs.size() == 2);
  if (!issubdtype(out.dtype(), floating)) {
    throw std::runtime_error(
        "[matmul] Does not yet support non-floating point types.");
  }
  auto& s = stream();
  auto& d = metal::device(s.device);

  auto& a_pre = inputs[0];
  auto& b_pre = inputs[1];
  // Return 0s if either input is empty
  if (a_pre.size() == 0 || b_pre.size() == 0) {
    array zero = array(0, a_pre.dtype());
    fill_gpu(zero, out, s);
    d.add_temporary(std::move(zero), s.index);
    return;
  }

  out.set_data(allocator::malloc(out.nbytes()));

  /////////////////////////////////////////////////////////////////////////////
  // Init checks and prep

  int M = a_pre.shape(-2);
  int N = b_pre.shape(-1);
  int K = a_pre.shape(-1);

  // Keep a vector with copies to be cleared in the completed buffer to release
  // the arrays
  std::vector<array> copies;
  auto [transpose_a, a_cols, a] = check_transpose(copies, s, a_pre, M == 1);
  auto [transpose_b, b_cols, b] = check_transpose(copies, s, b_pre, N == 1);

  int lda = a_cols;
  int ldb = b_cols;

  /////////////////////////////////////////////////////////////////////////////
  // Check and collapse batch dimensions
  // (the collapse itself is currently disabled, see the commented-out
  // collapse_contiguous_dims call below)

  // Which masks are present is decided purely by the number of inputs
  bool has_op_mask = inputs.size() > 3;
  bool has_out_mask = inputs.size() == 3 || inputs.size() == 5;

  // Prepare kernel name
  std::string out_mask_nm = has_out_mask ? type_to_name(inputs[2]) : "nomask";
  std::string op_mask_nm = has_op_mask ? type_to_name(inputs.back()) : "nomask";

  // Defaults describe a single, unbatched problem
  Shape batch_shape{1};
  Strides A_batch_stride{0};
  Strides B_batch_stride{0};
  Strides outmask_bstride{0};
  Strides Amask_bstride{0};
  Strides Bmask_bstride{0};
  int64_t A_batch_str = 0;
  int64_t B_batch_str = 0;

  // Batch strides of every input, concatenated in input order
  Strides batch_strides;

  if (out.ndim() > 2) {
    // The batch shape is every output dimension except the last two
    Shape bshape{out.shape().begin(), out.shape().end() - 2};
    std::vector<Strides> bstrides;

    // One entry per input: its strides with the last two dimensions dropped
    for (auto& arr : inputs) {
      bstrides.emplace_back(arr.strides().begin(), arr.strides().end() - 2);
    }

    // auto [bshape_c, bstrides_c] = collapse_contiguous_dims(bshape, bstrides);
    batch_shape = bshape;
    // Innermost batch stride of each matrix, used in GEMMParams below
    A_batch_str = bstrides[0].back();
    B_batch_str = bstrides[1].back();

    for (auto& bstr : bstrides) {
      batch_strides.insert(batch_strides.end(), bstr.begin(), bstr.end());
    }

    A_batch_stride = bstrides[0];
    B_batch_stride = bstrides[1];

    if (has_out_mask) {
      outmask_bstride = bstrides[2];
    }
    if (has_op_mask) {
      // Operand masks come after the optional output mask: lhs then rhs
      Amask_bstride = bstrides[has_out_mask + 2];
      Bmask_bstride = bstrides[has_out_mask + 3];
    }

  } else {
    // Unbatched: one zero stride per input
    batch_strides = Strides(inputs.size(), 0);
  }

  int64_t matrix_stride_out = static_cast<int64_t>(M) * N;
  size_t batch_size_out = out.size() / (matrix_stride_out);

  /////////////////////////////////////////////////////////////////////////////
  // Gemv specialization

  // Route to gemv if needed
  if (std::min(M, N) == 1) {
    // Collect problem info
    // The operand that is not the vector is treated as the matrix: b when
    // N != 1, otherwise a.
    bool is_b_matrix = N != 1;

    auto& mat = is_b_matrix ? b : a;
    auto& vec = is_b_matrix ? a : b;
    // When the matrix is b, its transpose flag is inverted
    bool transpose_mat = is_b_matrix ? !transpose_b : transpose_a;
    int in_vector_len = K;
    int out_vector_len = is_b_matrix ? N : M;

    int mat_ld = is_b_matrix ? b_cols : a_cols;

    auto batch_strides_mat = is_b_matrix ? B_batch_stride : A_batch_stride;
    auto batch_strides_vec = is_b_matrix ? A_batch_stride : B_batch_stride;

    auto mask_bstrides_mat = is_b_matrix ? Bmask_bstride : Amask_bstride;
    auto mask_bstrides_vec = is_b_matrix ? Amask_bstride : Bmask_bstride;

    // Position in `inputs` of the operand mask belonging to the matrix and to
    // the vector (lhs mask at 2 + has_out_mask, rhs mask at 3 + has_out_mask)
    auto mat_mask_idx = int(has_out_mask) + (is_b_matrix ? 3 : 2);
    auto vec_mask_idx = int(has_out_mask) + (is_b_matrix ? 2 : 3);

    // Determine if inputs have simple batching / broadcasting
    bool contiguous_kernel = (batch_shape.size() == 1);

    int batch_ndim = batch_shape.size();

    // Determine dispatch kernel
    int tm = 4, tn = 4;
    int sm = 1, sn = 32;
    int bm = 1, bn = 1;
    int n_out_per_tgp;
    std::ostringstream kname;

    if (transpose_mat) {
      sm = 8;
      sn = 4;
      bm = 1;
      bn = (block_size_ == 64 && out_vector_len >= 2048) ? 4 : 2;
      tm = block_size_ == 32 ? 4 : 8;
      tn = 4;

      // Specialized kernel for very small outputs
      tn = out_vector_len < tn ? 1 : tn;

      n_out_per_tgp = bn * sn * tn;
      kname << "gemv_t";

    } else {
      if (block_size_ == 32) {
        sm = 4;
        sn = 8;
        bm = 2;
      } else {
        sm = 2;
        sn = 16;
        bm = out_vector_len >= 512 ? 4 : 2;
      }

      // Specialized kernel for very small outputs
      tm = out_vector_len < tm ? 1 : tm;

      n_out_per_tgp = bm * sm * tm;
      kname << "gemv";
    }

    // The kernel name encodes the mask types, the output type, every tile
    // parameter and whether the non-contiguous batching variant is needed
    kname << "_outmask_" << out_mask_nm;
    kname << "_opmask_" << op_mask_nm;
    kname << "_" << type_to_name(out);
    kname << "_bm" << bm << "_bn" << bn;
    kname << "_sm" << sm << "_sn" << sn;
    kname << "_tm" << tm << "_tn" << tn;
    kname << "_nc" << !contiguous_kernel;

    // Encode and dispatch kernel
    auto kernel = get_gemv_masked_kernel(
        d,
        kname.str(),
        out,
        has_out_mask ? std::optional<array>{inputs[2]} : std::nullopt,
        has_op_mask ? std::optional<array>{inputs.back()} : std::nullopt,
        transpose_mat,
        bm,
        bn,
        sm,
        sn,
        tm,
        tn,
        contiguous_kernel);

    auto& compute_encoder = d.get_command_encoder(s.index);
    compute_encoder.set_compute_pipeline_state(kernel);

    // Enough threadgroups to cover the output vector (rounded up), with one
    // grid slice per batch element
    int n_tgp = (out_vector_len + n_out_per_tgp - 1) / n_out_per_tgp;
    MTL::Size group_dims = MTL::Size(32, bn, bm);
    MTL::Size grid_dims = MTL::Size(n_tgp, 1, batch_size_out);

    // Get mask params
    // Strides are appended in the order: output mask, matrix mask, vector
    // mask (two entries each); the batch strides follow the same order.
    std::vector<int> mask_strides;
    Strides mask_batch_strides;
    if (has_out_mask) {
      auto& out_mask = inputs[2];

      // The order of the two strides depends on which output dimension has
      // size 1
      if (transpose_mat) {
        mask_strides.push_back(out_mask.strides(out.shape(-2) == 1 ? -1 : -2));
        mask_strides.push_back(out_mask.strides(out.shape(-2) == 1 ? -2 : -1));
      } else {
        mask_strides.push_back(out_mask.strides(out.shape(-1) == 1 ? -1 : -2));
        mask_strides.push_back(out_mask.strides(out.shape(-1) == 1 ? -2 : -1));
      }

      mask_batch_strides.insert(
          mask_batch_strides.end(),
          outmask_bstride.begin(),
          outmask_bstride.end());

      compute_encoder.set_input_array(out_mask, 20);
    }

    if (has_op_mask) {
      auto& mat_mask = inputs[mat_mask_idx];

      if (transpose_mat) {
        mask_strides.push_back(mat_mask.strides(!is_b_matrix ? -2 : -1));
        mask_strides.push_back(mat_mask.strides(!is_b_matrix ? -1 : -2));
      } else {
        mask_strides.push_back(mat_mask.strides(is_b_matrix ? -2 : -1));
        mask_strides.push_back(mat_mask.strides(is_b_matrix ? -1 : -2));
      }

      mask_batch_strides.insert(
          mask_batch_strides.end(),
          mask_bstrides_mat.begin(),
          mask_bstrides_mat.end());

      compute_encoder.set_input_array(mat_mask, 21);

      auto& vec_mask = inputs[vec_mask_idx];
      // The order of the two strides depends on which vector dimension has
      // size 1
      if (transpose_mat) {
        mask_strides.push_back(vec_mask.strides(vec.shape(-2) == 1 ? -1 : -2));
        mask_strides.push_back(vec_mask.strides(vec.shape(-2) == 1 ? -2 : -1));
      } else {
        mask_strides.push_back(vec_mask.strides(vec.shape(-1) == 1 ? -1 : -2));
        mask_strides.push_back(vec_mask.strides(vec.shape(-1) == 1 ? -2 : -1));
      }

      mask_batch_strides.insert(
          mask_batch_strides.end(),
          mask_bstrides_vec.begin(),
          mask_bstrides_vec.end());

      compute_encoder.set_input_array(vec_mask, 22);
    }

    // Get gemv params
    compute_encoder.set_input_array(mat, 0);
    compute_encoder.set_input_array(vec, 1);
    compute_encoder.set_output_array(out, 3);

    compute_encoder.set_bytes(in_vector_len, 4);
    compute_encoder.set_bytes(out_vector_len, 5);
    compute_encoder.set_bytes(mat_ld, 6);
    compute_encoder.set_bytes(batch_ndim, 9);
    compute_encoder.set_vector_bytes(batch_shape, 10);
    compute_encoder.set_vector_bytes(batch_strides_vec, 11);
    compute_encoder.set_vector_bytes(batch_strides_mat, 12);

    compute_encoder.set_vector_bytes(mask_strides, 23);
    compute_encoder.set_vector_bytes(mask_batch_strides, 24);

    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

    // Hand the temporaries in `copies` over to the device
    d.add_temporaries(std::move(copies), s.index);
    return;
  }

  /////////////////////////////////////////////////////////////////////////////
  // Regular kernel dispatch

  // Determine dispatch kernel
  // Output tiles are block_size_ x block_size_, with a K step of 16
  int bm = block_size_, bn = block_size_, bk = 16;
  int wm = 2, wn = 2;
  bool mn_aligned = M % bm == 0 && N % bn == 0;
  bool k_aligned = K % bk == 0;

  std::ostringstream kname;
  kname << "steel_gemm_block_outmask_" << out_mask_nm << "_opmask_"
        << op_mask_nm << "_" << (transpose_a ? 't' : 'n')
        << (transpose_b ? 't' : 'n') << "_" << type_to_name(a) << "_"
        << type_to_name(out) << "_bm" << bm << "_bn" << bn << "_bk" << bk
        << "_wm" << wm << "_wn" << wn << "_MN_" << (mn_aligned ? "t" : "n")
        << "aligned"
        << "_K_" << (k_aligned ? "t" : "n") << "aligned";

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_masked_kernel(
      d,
      kname.str(),
      out,
      has_out_mask ? std::optional<array>{inputs[2]} : std::nullopt,
      has_op_mask ? std::optional<array>{inputs.back()} : std::nullopt,
      transpose_a,
      transpose_b,
      bm,
      bn,
      bk,
      wm,
      wn,
      mn_aligned,
      k_aligned);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Use problem size to determine threadblock swizzle
  // Number of output tiles along N and M, rounded up
  int tn = (N + bn - 1) / bn;
  int tm = (M + bm - 1) / bm;

  // TODO: Explore device-based tuning for swizzle
  // Swizzling is disabled for now (log2 of the swizzle tile is fixed at 0)
  int swizzle_log = 0; // tm >= 6 ? 3 : (tm <= 3 ? 0 : 2);

  // Prepare steel matmul params
  GEMMParams params{/* const int M = */ M,
                    /* const int N = */ N,
                    /* const int K = */ K,
                    /* const int lda = */ lda,
                    /* const int ldb = */ ldb,
                    /* const int ldd = */ N,
                    /* const int tiles_n = */ tn,
                    /* const int tiles_m = */ tm,
                    /* const int64_t batch_stride_a = */ A_batch_str,
                    /* const int64_t batch_stride_b = */ B_batch_str,
                    /* const int64_t batch_stride_d = */ matrix_stride_out,
                    /* const int swizzle_log = */ swizzle_log,
                    /* const int gemm_k_iterations_aligned = */ (K / bk),
                    /* const int batch_ndim = */ int(batch_shape.size())};

  // Prepare launch grid params
  // With swizzle_log == 0 the tile is 1 and tm / tn are left unchanged
  int tile = 1 << swizzle_log;
  tm = (tm + tile - 1) / tile;
  tn = tn * tile;

  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(tn, tm, batch_size_out);

  // Per-mask strides, pushed as (last dim, second-to-last dim) in the order
  // output mask, lhs mask, rhs mask
  std::vector<int> mask_strides;

  if (has_out_mask) {
    auto& out_mask = inputs[2];
    mask_strides.push_back(*(out_mask.strides().end() - 1));
    mask_strides.push_back(*(out_mask.strides().end() - 2));

    compute_encoder.set_input_array(out_mask, 10);
  }

  if (has_op_mask) {
    auto& lhs_mask = inputs[2 + has_out_mask];
    mask_strides.push_back(*(lhs_mask.strides().end() - 1));
    mask_strides.push_back(*(lhs_mask.strides().end() - 2));

    compute_encoder.set_input_array(lhs_mask, 11);

    auto& rhs_mask = inputs[3 + has_out_mask];
    mask_strides.push_back(*(rhs_mask.strides().end() - 1));
    mask_strides.push_back(*(rhs_mask.strides().end() - 2));

    compute_encoder.set_input_array(rhs_mask, 12);
  }

  // Launch kernel
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_output_array(out, 3);

  compute_encoder.set_bytes(params, 4);

  compute_encoder.set_vector_bytes(batch_shape, 6);
  compute_encoder.set_vector_bytes(batch_strides, 7);

  compute_encoder.set_vector_bytes(mask_strides, 13);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Hand the temporaries in `copies` over to the device
  d.add_temporaries(std::move(copies), s.index);
}

///////////////////////////////////////////////////////////////////////////////
// GatherMM implementation
///////////////////////////////////////////////////////////////////////////////

// Encode the gather matmul variant that only takes indices for the rhs.
//
// `a_` is broadcast against `indices_` when needed and flattened into a single
// (M x K) matrix with M = a.size() / K. `b_` goes through
// ensure_batch_contiguous, which also reports whether it is transposed and its
// leading dimension. The kernel is launched on a (tiles_n, tiles_m, 1) grid and
// receives a, b, indices, out and the GEMM params in buffers 0 through 4.
void gather_mm_rhs(
    const array& a_,
    const array& b_,
    const array& indices_,
    array& out,
    metal::Device& d,
    const Stream& s) {
  array indices = ensure_row_contiguous(indices_, d, s);
  auto [transpose_b, ldb, b] = ensure_batch_contiguous(b_, d, s);

  // Broadcast a with indices. If we are here that means lhs_indices were not
  // provided so the lhs_indices are implied to be the shape of a broadcasted
  // with rhs_indices. We need only broadcast a and copy it as if applying the
  // lhs_indices.
  auto broadcast_with_indices = [&d, &s, &indices](const array& x) {
    // The number of matrices in x already matches the number of indices, so
    // no broadcast is required.
    if (x.size() / x.shape(-2) / x.shape(-1) == indices.size()) {
      return ensure_row_contiguous(x, d, s);
    }

    // Target shape is the shape of indices followed by the matrix dims of x.
    auto x_shape = indices.shape();
    x_shape.push_back(x.shape(-2));
    x_shape.push_back(x.shape(-1));
    array new_x(std::move(x_shape), x.dtype(), nullptr, {});
    broadcast(x, new_x);
    return ensure_row_contiguous(new_x, d, s);
  };
  array a = broadcast_with_indices(a_);

  // Extract the matmul shapes
  int K = a.shape(-1);
  int M = a.size() / K; // all leading dims of a are folded into M
  int N = b.shape(-1);
  int lda = a.strides()[a.ndim() - 2]; // should be K

  // Define the dispatch blocks
  int bm = 16, bn = 64, bk = 16;
  int wm = 1, wn = 2;

  // Whether each problem dimension is an exact multiple of its tile size.
  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  // Define the kernel name
  std::string base_name;
  base_name.reserve(64);
  concatenate(
      base_name,
      "steel_gather_mm_rhs_n",
      transpose_b ? 't' : 'n',
      '_',
      type_to_name(a),
      '_',
      type_to_name(out),
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn);

  // Function constants point at the locals above, which must stay alive until
  // the kernel has been fetched.
  metal::MTLFCList func_consts = {
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
      {&align_K, MTL::DataType::DataTypeBool, 202},
  };

  // And the kernel hash that includes the function constants
  std::string hash_name;
  hash_name.reserve(128);
  concatenate(
      hash_name,
      base_name,
      "_align_M_",
      align_M ? 't' : 'n',
      "_align_N_",
      align_N ? 't' : 'n',
      "_align_K_",
      align_K ? 't' : 'n');

  // Get and set the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_gather_kernel(
      d,
      base_name,
      hash_name,
      func_consts,
      out,
      false,
      transpose_b,
      bm,
      bn,
      bk,
      wm,
      wn,
      true);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Prepare the matmul params
  // For a 2D b there is no batch axis, so the whole array size is used as the
  // batch stride.
  auto batch_stride_b = b.ndim() > 2 ? b.strides()[b.ndim() - 3] : b.size();
  steel::GEMMParams params{
      /* const int M = */ M,
      /* const int N = */ N,
      /* const int K = */ K,
      /* const int lda = */ lda,
      /* const int ldb = */ static_cast<int>(ldb),
      /* const int ldd = */ N,
      /* const int tiles_n = */ (N + bn - 1) / bn,
      /* const int tiles_m = */ (M + bm - 1) / bm,
      /* const int64_t batch_stride_a = */ 0,
      /* const int64_t batch_stride_b = */ static_cast<int64_t>(batch_stride_b),
      /* const int64_t batch_stride_d = */ 0,
      /* const int swizzle_log = */ 0,
      /* const int gemm_k_iterations_aligned = */ (K / bk),
      /* const int batch_ndim = */ 0};

  // Prepare the grid
  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(params.tiles_n, params.tiles_m, 1);

  // Launch kernel
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_input_array(indices, 2);
  compute_encoder.set_output_array(out, 3);
  compute_encoder.set_bytes(params, 4);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encode the NAX flavour of the rhs-only gather matmul.
//
// The set-up is the same as in gather_mm_rhs: `a_` is broadcast against
// `indices_` when needed and flattened into a single (M x K) matrix, and `b_`
// goes through ensure_batch_contiguous. The differences are the kernel family
// ("steel_gather_mm_rhs_nax_*") and the tile sizes, where bm/wm are picked from
// the ratio M / E with E = b.shape(0).
void gather_mm_rhs_nax(
    const array& a_,
    const array& b_,
    const array& indices_,
    array& out,
    metal::Device& d,
    const Stream& s) {
  array indices = ensure_row_contiguous(indices_, d, s);
  auto [transpose_b, ldb, b] = ensure_batch_contiguous(b_, d, s);

  // Broadcast a with indices. If we are here that means lhs_indices were not
  // provided so the lhs_indices are implied to be the shape of a broadcasted
  // with rhs_indices. We need only broadcast a and copy it as if applying the
  // lhs_indices.
  auto broadcast_with_indices = [&d, &s, &indices](const array& x) {
    // The number of matrices in x already matches the number of indices, so
    // no broadcast is required.
    if (x.size() / x.shape(-2) / x.shape(-1) == indices.size()) {
      return ensure_row_contiguous(x, d, s);
    }

    // Target shape is the shape of indices followed by the matrix dims of x.
    auto x_shape = indices.shape();
    x_shape.push_back(x.shape(-2));
    x_shape.push_back(x.shape(-1));
    array new_x(std::move(x_shape), x.dtype(), nullptr, {});
    broadcast(x, new_x);
    return ensure_row_contiguous(new_x, d, s);
  };
  array a = broadcast_with_indices(a_);

  // Extract the matmul shapes
  int K = a.shape(-1);
  int M = a.size() / K; // all leading dims of a are folded into M
  int N = b.shape(-1);
  int lda = a.strides()[a.ndim() - 2]; // should be K
  int E = b.shape(0);

  // Define the dispatch blocks. The M tile grows with the average number of
  // rows of a per leading entry of b.
  int bm, bn = 128, bk = 128, wm, wn = 4;
  if (M / E > 48) {
    bm = 64;
    wm = 2;
  } else if (M / E > 24) {
    bm = 32;
    wm = 1;
  } else {
    bm = 16;
    wm = 1;
  }

  // Whether each problem dimension is an exact multiple of its tile size.
  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  // Define the kernel name
  std::string base_name;
  base_name.reserve(64);
  concatenate(
      base_name,
      "steel_gather_mm_rhs_nax_n",
      transpose_b ? 't' : 'n',
      '_',
      type_to_name(a),
      '_',
      type_to_name(out),
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn);

  // Function constants point at the locals above, which must stay alive until
  // the kernel has been fetched.
  metal::MTLFCList func_consts = {
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
      {&align_K, MTL::DataType::DataTypeBool, 202},
  };

  // And the kernel hash that includes the function constants
  std::string hash_name;
  hash_name.reserve(128);
  concatenate(
      hash_name,
      base_name,
      "_align_M_",
      align_M ? 't' : 'n',
      "_align_N_",
      align_N ? 't' : 'n',
      "_align_K_",
      align_K ? 't' : 'n');

  // Get and set the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_gather_nax_kernel(
      d,
      base_name,
      hash_name,
      func_consts,
      out,
      false,
      transpose_b,
      bm,
      bn,
      bk,
      wm,
      wn,
      true);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Prepare the matmul params
  // For a 2D b there is no batch axis, so the whole array size is used as the
  // batch stride.
  auto batch_stride_b = b.ndim() > 2 ? b.strides()[b.ndim() - 3] : b.size();
  steel::GEMMParams params{
      /* const int M = */ M,
      /* const int N = */ N,
      /* const int K = */ K,
      /* const int lda = */ lda,
      /* const int ldb = */ static_cast<int>(ldb),
      /* const int ldd = */ N,
      /* const int tiles_n = */ (N + bn - 1) / bn,
      /* const int tiles_m = */ (M + bm - 1) / bm,
      /* const int64_t batch_stride_a = */ 0,
      /* const int64_t batch_stride_b = */ static_cast<int64_t>(batch_stride_b),
      /* const int64_t batch_stride_d = */ 0,
      /* const int swizzle_log = */ 0,
      /* const int gemm_k_iterations_aligned = */ (K / bk),
      /* const int batch_ndim = */ 0};

  // Prepare the grid
  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims = MTL::Size(params.tiles_n, params.tiles_m, 1);

  // Launch kernel
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_input_array(indices, 2);
  compute_encoder.set_output_array(out, 3);
  compute_encoder.set_bytes(params, 4);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encode a gathered matrix-vector (or vector-matrix) product.
//
// - `mat_` / `vec_`: the matrix and vector operands.
// - `mat_indices_` / `vec_indices_`: index arrays bound to the kernel next to
//   the operands (buffers 19 and 18).
// - `N`: length of each output vector; `K`: length of each input vector.
// - `is_mv`: true for matrix @ vector, false for vector @ matrix. In the
//   latter case the matrix transposition flag is flipped.
//
// One grid slice is launched per output vector (out.size() / N in total).
void gather_mv(
    const array& mat_,
    const array& vec_,
    const array& mat_indices_,
    const array& vec_indices_,
    array& out,
    int N,
    int K,
    bool is_mv,
    metal::Device& d,
    const Stream& s) {
  // Copy if needed
  std::vector<array> copies;
  auto [transpose_mat, mat_cols, mat] =
      check_transpose(copies, s, mat_, N == 1);
  auto [transpose_vec, vec_cols, vec] = check_transpose(copies, s, vec_, true);
  d.add_temporaries(std::move(copies), s.index);

  // If we are doing vector matrix instead of matrix vector we need to flip the
  // matrix transposition. Basically m @ v = v @ m.T assuming that v is treated
  // as a one dimensional array.
  transpose_mat = (!is_mv) ^ transpose_mat;

  // Define some shapes
  int in_vector_len = K;
  int out_vector_len = N;
  int mat_ld = mat_cols;

  int batch_size_out = out.size() / N;
  int batch_ndim = out.ndim() - 2;
  int batch_ndim_mat = mat.ndim() - 2;
  int batch_ndim_vec = vec.ndim() - 2;
  // Strides of the vector indices followed by the strides of the matrix
  // indices, passed to the kernel as a single buffer.
  Strides index_strides = vec_indices_.strides();
  index_strides.insert(
      index_strides.end(),
      mat_indices_.strides().begin(),
      mat_indices_.strides().end());

  // Determine dispatch kernel
  int tm = 4, tn = 4;
  int sm = 1, sn = 32;
  int bm = 1, bn = 1;
  int n_out_per_tgp;
  std::ostringstream kname;

  if (transpose_mat) {
    if (in_vector_len >= 8192 && out_vector_len >= 2048) {
      sm = 4;
      sn = 8;
    } else {
      sm = 8;
      sn = 4;
    }

    if (out_vector_len >= 2048) {
      bn = 16;
    } else if (out_vector_len >= 512) {
      bn = 4;
    } else {
      bn = 2;
    }

    // Specialized kernel for very small outputs
    tn = out_vector_len < tn ? 1 : tn;

    // Number of output elements produced by one threadgroup.
    n_out_per_tgp = bn * sn * tn;
    kname << "gemv_t_gather_" << type_to_name(out);

  } else {
    bm = out_vector_len >= 4096 ? 8 : 4;
    sn = 32;

    // Specialized kernel for very small outputs
    tm = out_vector_len < tm ? 1 : tm;

    // Number of output elements produced by one threadgroup.
    n_out_per_tgp = bm * sm * tm;
    kname << "gemv_gather_" << type_to_name(out);
  }

  kname << "_bm" << bm << "_bn" << bn << "_sm" << sm << "_sn" << sn << "_tm"
        << tm << "_tn" << tn;

  // Encode and dispatch kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname.str());
  compute_encoder.set_compute_pipeline_state(kernel);

  // Enough threadgroups to cover the output vector (ceil division).
  int n_tgp = (out_vector_len + n_out_per_tgp - 1) / n_out_per_tgp;
  MTL::Size group_dims = MTL::Size(32, bn, bm);
  MTL::Size grid_dims = MTL::Size(n_tgp, 1, batch_size_out);

  compute_encoder.set_input_array(mat, 0);
  compute_encoder.set_input_array(vec, 1);
  compute_encoder.set_output_array(out, 3);

  compute_encoder.set_bytes(in_vector_len, 4);
  compute_encoder.set_bytes(out_vector_len, 5);
  compute_encoder.set_bytes(mat_ld, 6);

  // Output batch description and the concatenated index strides.
  compute_encoder.set_bytes(batch_ndim, 9);
  compute_encoder.set_vector_bytes(out.shape(), 10);
  compute_encoder.set_vector_bytes(index_strides, 11);

  // Batch description of the vector operand.
  compute_encoder.set_bytes(batch_ndim_vec, 12);
  compute_encoder.set_vector_bytes(vec.shape(), 13);
  compute_encoder.set_vector_bytes(vec.strides(), 14);

  // Batch description of the matrix operand.
  compute_encoder.set_bytes(batch_ndim_mat, 15);
  compute_encoder.set_vector_bytes(mat.shape(), 16);
  compute_encoder.set_vector_bytes(mat.strides(), 17);

  compute_encoder.set_input_array(vec_indices_, 18);
  compute_encoder.set_input_array(mat_indices_, 19);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encode the general (non specialized) gather matmul.
//
// - `a_` / `b_`: the operands; each goes through check_transpose, which
//   reports a transposition flag and leading dimension and may add a copy.
// - `lhs_indices` / `rhs_indices`: index arrays bound in buffers 2 and 3.
// - `M`, `N`, `K`: the matmul dimensions of a single output matrix.
//
// One grid slice is launched per output matrix (out.size() / M / N in total).
void gather_mm(
    const array& a_,
    const array& b_,
    const array& lhs_indices,
    const array& rhs_indices,
    array& out,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s) {
  // Copy if needed
  std::vector<array> copies;
  auto [transpose_a, lda, a] = check_transpose(copies, s, a_, false);
  auto [transpose_b, ldb, b] = check_transpose(copies, s, b_, false);
  d.add_temporaries(std::move(copies), s.index);

  // Determine dispatch kernel
  int bm = 64, bn = 64, bk = 16;
  int wm = 2, wn = 2;
  size_t batch_size_out = out.size() / M / N;
  int batch_ndim = out.ndim() - 2;
  int batch_ndim_a = a.ndim() - 2;
  int batch_ndim_b = b.ndim() - 2;

  // NOTE(review): the macro is defined elsewhere; presumably it retunes the
  // tile parameters above based on the device architecture suffix.
  char devc = d.get_architecture().back();
  GEMM_TPARAM_MACRO(devc)

  const bool has_batch = batch_ndim > 1;
  // Whether each problem dimension is an exact multiple of its tile size.
  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  // Define the kernel name
  std::string base_name;
  base_name.reserve(128);
  concatenate(
      base_name,
      "steel_gather_mm_",
      transpose_a ? 't' : 'n',
      transpose_b ? 't' : 'n',
      "_",
      type_to_name(a),
      "_",
      type_to_name(out),
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn);

  // Function constants point at the locals above, which must stay alive until
  // the kernel has been fetched.
  metal::MTLFCList func_consts = {
      {&has_batch, MTL::DataType::DataTypeBool, 10},
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
      {&align_K, MTL::DataType::DataTypeBool, 202},
  };

  // And the kernel hash that includes the function constants
  std::string hash_name;
  hash_name.reserve(128);
  concatenate(
      hash_name,
      base_name,
      "_has_batch_",
      has_batch ? 't' : 'n',
      "_align_M_",
      align_M ? 't' : 'n',
      "_align_N_",
      align_N ? 't' : 'n',
      "_align_K_",
      align_K ? 't' : 'n');

  // Get and set the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_gather_kernel(
      d,
      base_name,
      hash_name,
      func_consts,
      out,
      transpose_a,
      transpose_b,
      bm,
      bn,
      bk,
      wm,
      wn,
      false);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Prepare the matmul params
  // The output matrix stride is computed in 64 bits: multiplying M and N as
  // ints would overflow once a single output matrix exceeds INT_MAX elements.
  steel::GEMMParams params{/* const int M = */ M,
                           /* const int N = */ N,
                           /* const int K = */ K,
                           /* const int lda = */ static_cast<int>(lda),
                           /* const int ldb = */ static_cast<int>(ldb),
                           /* const int ldd = */ N,
                           /* const int tiles_n = */ (N + bn - 1) / bn,
                           /* const int tiles_m = */ (M + bm - 1) / bm,
                           /* const int64_t batch_stride_a = */
                           (batch_ndim > 0) ? lhs_indices.strides()[0] : 0,
                           /* const int64_t batch_stride_b = */
                           (batch_ndim > 0) ? rhs_indices.strides()[0] : 0,
                           /* const int64_t batch_stride_d = */
                           static_cast<int64_t>(M) * N,
                           /* const int swizzle_log = */ 0,
                           /* const int gemm_k_iterations_aligned = */ (K / bk),
                           /* const int batch_ndim = */ batch_ndim};

  // Prepare the grid
  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims =
      MTL::Size(params.tiles_n, params.tiles_m, batch_size_out);

  // Launch kernel
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_input_array(lhs_indices, 2);
  compute_encoder.set_input_array(rhs_indices, 3);
  compute_encoder.set_output_array(out, 4);
  compute_encoder.set_bytes(params, 5);
  // Shape and strides of the index arrays.
  compute_encoder.set_vector_bytes(lhs_indices.shape(), 6);
  compute_encoder.set_vector_bytes(lhs_indices.strides(), 7);
  compute_encoder.set_vector_bytes(rhs_indices.strides(), 8);
  // Batch description of a.
  compute_encoder.set_bytes(batch_ndim_a, 9);
  compute_encoder.set_vector_bytes(a.shape(), 10);
  compute_encoder.set_vector_bytes(a.strides(), 11);
  // Batch description of b.
  compute_encoder.set_bytes(batch_ndim_b, 12);
  compute_encoder.set_vector_bytes(b.shape(), 13);
  compute_encoder.set_vector_bytes(b.strides(), 14);
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

void GatherMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  auto& a = inputs[0];
  auto& b = inputs[1];
  auto& lhs_indices = inputs[2];
  auto& rhs_indices = inputs[3];

  // Return 0s if either input is empty
  if (a.size() == 0 || b.size() == 0) {
    array zero = array(0, a.dtype());
    fill_gpu(zero, out, s);
    d.add_temporary(std::move(zero), s.index);
    return;
  }

  out.set_data(allocator::malloc(out.nbytes()));

  // Extract shapes from inputs.
  int M = a.shape(-2);
  int N = b.shape(-1);
  int K = a.shape(-1);

  // We are walking a in order and b is also in order so we can batch up the
  // matmuls and reuse reading a and b.
  if (M == 1 && right_sorted_ == true) {
    if (metal::is_nax_available() &&
        (env::enable_tf32() || a.dtype() != float32)) {
      return gather_mm_rhs_nax(a, b, rhs_indices, out, d, s);
    }
    gather_mm_rhs(a, b, rhs_indices, out, d, s);
    return;
  }

  // Route to gather gemv if any of a or b are vectors
  if (M == 1) {
    gather_mv(b, a, rhs_indices, lhs_indices, out, N, K, false, d, s);
    return;
  }
  if (N == 1) {
    gather_mv(a, b, lhs_indices, rhs_indices, out, M, K, true, d, s);
    return;
  }

  // Route to non specialized gather mm
  gather_mm(a, b, lhs_indices, rhs_indices, out, M, N, K, d, s);
}

// Encode the segmented matmul kernel.
//
// - `a_` / `b_`: the operands; each goes through check_transpose, which
//   reports a transposition flag and leading dimension and may add a copy.
// - `segments_`: the segments array, bound in buffer 2. It is used as is when
//   its layout passes the check below and copied to a contiguous array
//   otherwise.
// - `M`, `N`, `K`: the matmul dimensions of a single output matrix.
//
// One grid slice is launched per output matrix (out.size() / M / N in total).
void segmented_mm(
    const array& a_,
    const array& b_,
    const array& segments_,
    array& out,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s) {
  // Returns (segments_contiguous, array to bind). The flag is forwarded to the
  // kernel as function constant 199.
  auto check_segments_layout = [&d, &s](const array& x) {
    // Contiguous so return early
    if (x.flags().row_contiguous) {
      return std::make_tuple(true, x);
    }

    // Work with a signed rank: if ndim() is unsigned, `x.ndim() - 2` would wrap
    // around for a rank 1 array and the loop below would index out of range.
    const int ndim = static_cast<int>(x.ndim());

    // Accept a layout whose last two strides are both 1.
    // NOTE(review): the batch check multiplies by shape(i) rather than
    // shape(i + 1); kept as is, confirm this is the intended condition.
    bool rc = true;
    for (int i = 0; i < ndim - 2; i++) {
      rc &=
          (x.strides(i + 1) * x.shape(i) == x.strides(i)) || (x.shape(i) == 1);
    }
    rc &= x.strides(ndim - 1) == 1;
    if (ndim > 1) {
      rc &= x.strides(ndim - 2) == 1;
    }

    if (rc) {
      return std::make_tuple(false, x);
    }

    // Any other layout is copied to a contiguous temporary.
    array x_copy = contiguous_copy_gpu(x, s);
    d.add_temporary(x_copy, s.index);
    return std::make_tuple(true, x_copy);
  };

  // Copy if needed
  std::vector<array> copies;
  auto [transpose_a, lda, a] = check_transpose(copies, s, a_, false);
  auto [transpose_b, ldb, b] = check_transpose(copies, s, b_, false);
  auto [segments_contiguous, segments] = check_segments_layout(segments_);
  d.add_temporaries(std::move(copies), s.index);

  // Determine dispatch kernel
  int bm = 64, bn = 64, bk = 16;
  int wm = 2, wn = 2;
  size_t batch_size_out = out.size() / M / N;

  // NOTE(review): the macro is defined elsewhere; presumably it retunes the
  // tile parameters above based on the device architecture suffix.
  char devc = d.get_architecture().back();
  GEMM_TPARAM_MACRO(devc)

  // Whether each problem dimension is an exact multiple of its tile size.
  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;

  // Define the kernel name
  std::string base_name;
  base_name.reserve(128);
  concatenate(
      base_name,
      "steel_segmented_mm_",
      transpose_a ? 't' : 'n',
      transpose_b ? 't' : 'n',
      "_",
      type_to_name(a),
      "_",
      type_to_name(out),
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn);

  // Function constants point at the locals above, which must stay alive until
  // the kernel has been fetched.
  metal::MTLFCList func_consts = {
      {&segments_contiguous, MTL::DataType::DataTypeBool, 199},
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
  };

  // And the kernel hash that includes the function constants
  std::string hash_name;
  hash_name.reserve(128);
  concatenate(
      hash_name,
      base_name,
      "_segments_contiguous_",
      segments_contiguous ? 't' : 'n',
      "_align_M_",
      align_M ? 't' : 'n',
      "_align_N_",
      align_N ? 't' : 'n');

  // Get and set the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_steel_gemm_segmented_kernel(
      d,
      base_name,
      hash_name,
      func_consts,
      out,
      transpose_a,
      transpose_b,
      bm,
      bn,
      bk,
      wm,
      wn);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Prepare the matmul params
  // The output matrix stride is computed in 64 bits: multiplying M and N as
  // ints would overflow once a single output matrix exceeds INT_MAX elements.
  steel::GEMMParams params{/* const int M = */ M,
                           /* const int N = */ N,
                           /* const int K = */ K,
                           /* const int lda = */ static_cast<int>(lda),
                           /* const int ldb = */ static_cast<int>(ldb),
                           /* const int ldd = */ N,
                           /* const int tiles_n = */ (N + bn - 1) / bn,
                           /* const int tiles_m = */ (M + bm - 1) / bm,
                           /* const int64_t batch_stride_a = */ 0,
                           /* const int64_t batch_stride_b = */ 0,
                           /* const int64_t batch_stride_d = */
                           static_cast<int64_t>(M) * N,
                           /* const int swizzle_log = */ 0,
                           /* const int gemm_k_iterations_aligned = */ 0,
                           /* const int batch_ndim = */ 0};

  // Prepare the grid
  MTL::Size group_dims = MTL::Size(32, wn, wm);
  MTL::Size grid_dims =
      MTL::Size(params.tiles_n, params.tiles_m, batch_size_out);

  // Launch kernel
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_input_array(segments, 2);
  compute_encoder.set_output_array(out, 3);
  compute_encoder.set_bytes(params, 4);
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

void SegmentedMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  auto& a = inputs[0];
  auto& b = inputs[1];
  auto& segments = inputs[2];

  out.set_data(allocator::malloc(out.nbytes()));

  // Extract shapes from inputs.
  int M = a.shape(-2);
  int N = b.shape(-1);
  int K = a.shape(-1);

  segmented_mm(a, b, segments, out, M, N, K, d, s);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/matmul.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/backend/metal/device.h"

namespace mlx::core {

// Declaration of the steel matmul entry point that takes explicit leading
// dimensions (lda, ldb, ldd), a single batch stride per operand and an extra
// array `c` together with the scalars `alpha` and `beta`.
//
// NOTE(review): going by the name, this presumably computes
// out = alpha * (a @ b) + beta * c, with CHECK_AB controlling whether alpha
// and beta are taken into account. The definition is not in this header, so
// confirm against the implementation.
//
// `copies` is passed by non-const reference so the callee can append to it.
template <bool CHECK_AB = true>
void steel_matmul_regular_axpby(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    int ldd,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape,
    Strides batch_strides,
    int64_t A_batch_stride,
    int64_t B_batch_stride,
    int64_t matrix_stride_out,
    int64_t C_batch_stride = 0,
    float alpha = 1.0f,
    float beta = 0.0f);

inline void steel_matmul_regular(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    int ldd,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape,
    Strides batch_strides,
    int64_t A_batch_stride,
    int64_t B_batch_stride,
    int64_t matrix_stride_out) {
  return steel_matmul_regular_axpby<false>(
      /* const Stream& s = */ s,
      /* metal::Device& d = */ d,
      /* const array& a = */ a,
      /* const array& b = */ b,
      /* const array& c = */ b,
      /* array& out = */ out,
      /* int M = */ M,
      /* int N = */ N,
      /* int K = */ K,
      /* int batch_size_out = */ batch_size_out,
      /* int lda = */ lda,
      /* int ldb = */ ldb,
      /* int ldd = */ ldd,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* std::vector<array>& copies = */ copies,
      /* Shape batch_shape = */ batch_shape,
      /* Strides batch_strides = */ batch_strides,
      /* int64_t A_batch_stride = */ A_batch_stride,
      /* int64_t B_batch_stride = */ B_batch_stride,
      /* int64_t matrix_stride_out = */ matrix_stride_out);
}

// Declaration of the steel matmul entry point that takes per-operand batch
// stride vectors (one entry per batch dimension) instead of a single batch
// stride, plus an extra array `c` and the scalars `alpha` and `beta`.
//
// NOTE(review): going by the name, this presumably computes
// out = alpha * (a @ b) + beta * c, with CHECK_AB controlling whether alpha
// and beta are taken into account. The definition is not in this header, so
// confirm against the implementation.
//
// `copies` is passed by non-const reference so the callee can append to it.
template <bool CHECK_AB = true>
void steel_matmul_axpby(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    const array& c,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape = {},
    Strides A_batch_stride = {},
    Strides B_batch_stride = {},
    Strides C_batch_stride = {},
    float alpha = 1.0f,
    float beta = 0.0f);

inline void steel_matmul(
    const Stream& s,
    metal::Device& d,
    const array& a,
    const array& b,
    array& out,
    int M,
    int N,
    int K,
    int batch_size_out,
    int lda,
    int ldb,
    bool transpose_a,
    bool transpose_b,
    std::vector<array>& copies,
    Shape batch_shape = {},
    Strides A_batch_stride = {},
    Strides B_batch_stride = {}) {
  return steel_matmul_axpby<false>(
      /* const Stream& s = */ s,
      /* metal::Device& d = */ d,
      /* const array& a = */ a,
      /* const array& b = */ b,
      /* const array& c = */ b,
      /* array& out = */ out,
      /* int M = */ M,
      /* int N = */ N,
      /* int K = */ K,
      /* int batch_size_out = */ batch_size_out,
      /* int lda = */ lda,
      /* int ldb = */ ldb,
      /* bool transpose_a = */ transpose_a,
      /* bool transpose_b = */ transpose_b,
      /* std::vector<array>& copies = */ copies,
      /* Shape batch_shape = */ batch_shape,
      /* Strides A_batch_stride = */ A_batch_stride,
      /* Strides B_batch_stride = */ B_batch_stride);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/metal.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <memory>

#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/metal.h"
#include "mlx/backend/metal/utils.h"

namespace mlx::core::metal {

// Reports that the Metal backend is available; this definition always
// answers true.
bool is_available() {
  constexpr bool metal_backend_present = true;
  return metal_backend_present;
}

void start_capture(std::string path, NS::Object* object) {
  auto pool = new_scoped_memory_pool();

  auto descriptor = MTL::CaptureDescriptor::alloc()->init();
  descriptor->setCaptureObject(object);

  if (!path.empty()) {
    auto string = NS::String::string(path.c_str(), NS::UTF8StringEncoding);
    auto url = NS::URL::fileURLWithPath(string);
    descriptor->setDestination(MTL::CaptureDestinationGPUTraceDocument);
    descriptor->setOutputURL(url);
  }

  auto manager = MTL::CaptureManager::sharedCaptureManager();
  NS::Error* error;
  bool started = manager->startCapture(descriptor, &error);
  descriptor->release();
  if (!started) {
    std::ostringstream msg;
    msg << "[metal::start_capture] Failed to start: "
        << error->localizedDescription()->utf8String();
    throw std::runtime_error(msg.str());
  }
}

// Start a GPU capture of the default GPU device, see the two argument
// overload for the meaning of `path`.
void start_capture(std::string path) {
  auto& gpu = metal::device(mlx::core::Device::gpu);
  auto* capture_target = gpu.mtl_device();
  start_capture(path, capture_target);
}

// Stop the capture running on the shared capture manager.
void stop_capture() {
  auto pool = new_scoped_memory_pool();
  MTL::CaptureManager::sharedCaptureManager()->stopCapture();
}

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/metal.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <string>
#include <unordered_map>
#include <variant>

#include "mlx/api.h"

namespace mlx::core::metal {

/** Check if the Metal backend is available. */
MLX_API bool is_available();

/**
 * Start capturing a GPU trace. When `path` is non-empty the trace is saved to
 * the absolute file `path`. In builds without the Metal backend this does
 * nothing.
 */
MLX_API void start_capture(std::string path = "");

/**
 * Stop a GPU trace capture. In builds without the Metal backend this does
 * nothing.
 */
MLX_API void stop_capture();

/**
 * Get information about the GPU and system settings. Each entry maps a name
 * to either a string or a size_t value. Throws std::runtime_error in builds
 * without the Metal backend.
 */
MLX_API const
    std::unordered_map<std::string, std::variant<std::string, size_t>>&
    device_info();

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/no_metal.cpp
================================================
// Copyright © 2025 Apple Inc.

#include <stdexcept>

#include "mlx/backend/metal/metal.h"
#include "mlx/fast.h"

namespace mlx::core {

namespace metal {

// Reports that the Metal backend is not available; this definition always
// answers false.
bool is_available() {
  constexpr bool metal_backend_present = false;
  return metal_backend_present;
}

// Without the Metal backend there is nothing to capture: the path is ignored
// and the call does nothing.
void start_capture(std::string /* path */) {
}
// Without the Metal backend no capture can be running: the call does nothing.
void stop_capture() {
}

// Device information is not available without the Metal backend: always
// throws std::runtime_error and never returns.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info() {
  throw std::runtime_error(
      "[metal::device_info] Cannot get device info without metal backend");
}

} // namespace metal

namespace fast {

// Custom Metal kernels cannot be built without the Metal backend: all
// arguments are ignored and std::runtime_error is thrown.
CustomKernelFunction metal_kernel(
    const std::string&,
    const std::vector<std::string>&,
    const std::vector<std::string>&,
    const std::string&,
    const std::string&,
    bool,
    bool) {
  const char* const message = "[metal_kernel] No Metal back-end.";
  throw std::runtime_error(message);
}

} // namespace fast

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/nojit_kernels.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The unnamed array argument is not used by this implementation.
MTL::ComputePipelineState* get_arange_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The unnamed dtype and string arguments are not used by this implementation.
MTL::ComputePipelineState* get_unary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype,
    Dtype,
    const char*) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The unnamed dtype and string arguments are not used by this implementation.
MTL::ComputePipelineState* get_binary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype,
    Dtype,
    const char*) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The unnamed dtype and string arguments are not used by this implementation.
MTL::ComputePipelineState* get_binary_two_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype,
    Dtype,
    const char*) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The unnamed dtype and string arguments are not used by this implementation.
MTL::ComputePipelineState* get_ternary_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    Dtype,
    const char*) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The two unnamed array arguments are not used by this implementation.
MTL::ComputePipelineState* get_copy_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const array&) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The two unnamed array arguments are not used by this implementation.
MTL::ComputePipelineState* get_dynamic_copy_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const array&) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The unnamed bool and array arguments are not used by this implementation.
MTL::ComputePipelineState* get_softmax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    bool,
    const array&) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The unnamed array argument is not used by this implementation.
MTL::ComputePipelineState* get_logsumexp_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_scan_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    bool,
    bool,
    const std::string&,
    const array&,
    const array&) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_sort_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const array&,
    int,
    int) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_mb_sort_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const array&,
    int,
    int) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_reduce_init_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string&,
    const std::string&,
    const Dtype&) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_reduce_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string&,
    const std::string&,
    const Dtype&,
    const Dtype&,
    const std::string&,
    int,
    int,
    int) {
  return d.get_kernel(kernel_name);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_gemm_fused_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_steel_gemm_splitk_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const array&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int,
    bool,
    bool) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_steel_gemm_splitk_accum_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const array&,
    bool) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// Every other argument, including the two optional mask arrays, is unused by
// this implementation, so those parameters are left unnamed like in the
// sibling stubs (this also avoids unused-parameter warnings).
MTL::ComputePipelineState* get_steel_gemm_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const std::optional<array>&,
    const std::optional<array>&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int,
    bool,
    bool) {
  return d.get_kernel(kernel_name);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_gemm_gather_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int,
    bool) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_gemm_segmented_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_gemv_masked_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    const std::optional<array>&,
    const std::optional<array>&,
    bool,
    int,
    int,
    int,
    int,
    int,
    int,
    bool) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_steel_conv_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    int,
    int,
    int,
    int,
    int,
    int,
    bool) {
  return d.get_kernel(kernel_name);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The remaining, unnamed arguments are not used by this implementation.
MTL::ComputePipelineState* get_steel_conv_3d_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const array&,
    int,
    int,
    int,
    int,
    int,
    bool) {
  return d.get_kernel(kernel_name);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_conv_general_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    int,
    int,
    int,
    int,
    int) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The trailing unnamed string argument is not used by
// this implementation.
MTL::ComputePipelineState* get_fft_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const std::string&) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The two unnamed string arguments are not used by this implementation.
MTL::ComputePipelineState* get_quantized_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string&,
    const std::string&) {
  return d.get_kernel(kernel_name);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_gather_qmm_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    int,
    int,
    const std::string&,
    int,
    int,
    int,
    int,
    int,
    bool) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_gemm_fused_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_gemm_gather_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int,
    bool) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_gemm_splitk_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    bool,
    bool,
    int,
    int,
    int,
    int,
    int) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Returns the pipeline state `d.get_kernel` yields for `kernel_name`.
// The two unnamed string arguments are not used by this implementation.
MTL::ComputePipelineState* get_qmm_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string&,
    const std::string&) {
  return d.get_kernel(kernel_name);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_gather_qmm_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    int,
    int,
    const std::string&,
    int,
    int,
    int,
    int,
    int,
    bool) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_attention_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    int,
    int,
    int,
    int,
    int,
    const array&) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

// Forwards `kernel_name`, `hash_name` and `func_consts` to `d.get_kernel` and
// returns the result. The remaining, unnamed arguments are not used by this
// implementation.
MTL::ComputePipelineState* get_steel_attention_nax_kernel(
    metal::Device& d,
    const std::string& kernel_name,
    const std::string& hash_name,
    const metal::MTLFCList& func_consts,
    const array&,
    int,
    int,
    int,
    int,
    int,
    const array&) {
  return d.get_kernel(kernel_name, hash_name, func_consts);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/normalization.cpp
================================================
// Copyright © 2024 Apple Inc.
#include <algorithm>
#include <sstream>
#include <stdexcept>

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/reduce.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/fast_primitives.h"

namespace mlx::core::fast {

// The fallback implementation is requested exactly when the stream's device
// is the CPU.
bool RMSNorm::use_fallback(Stream s) {
  return s.device == Device::cpu;
}

// Encode the RMS normalization kernel on this primitive's stream.
//
// inputs[0] is the array to normalize and inputs[1] is the weight; the result
// is written to outputs[0]. The normalized axis is the last one. Rows longer
// than RMS_LOOPED_LIMIT use the "_looped" kernel variant with the maximum
// threadgroup size; shorter rows get a threadgroup sized to the row.
//
// Throws std::runtime_error if the threadgroup needed for the non-looped
// kernel exceeds what the pipeline supports (same handling as LayerNorm, and
// effective in builds where assertions are compiled out).
void RMSNorm::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& out = outputs[0];

  // Make sure that the last dimension is contiguous. Returns the array the
  // kernel should read from and sets up `out` accordingly: either sharing or
  // mirroring the layout of `x`, or sharing the buffer of a contiguous copy.
  auto set_output = [&s, &out](const array& x) {
    bool no_copy = x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (no_copy && x.ndim() > 1) {
      // Stride of the second-to-last axis (named so it does not shadow the
      // captured stream `s`).
      auto row_stride = x.strides()[x.ndim() - 2];
      no_copy &=
          (row_stride == 0 || row_stride == x.shape().back() ||
           x.shape(-2) == 1);
    }
    if (no_copy) {
      if (x.is_donatable()) {
        out.copy_shared_buffer(x);
      } else {
        out.set_data(
            allocator::malloc(x.data_size() * x.itemsize()),
            x.data_size(),
            x.strides(),
            x.flags());
      }
      return x;
    } else {
      array x_copy = contiguous_copy_gpu(x, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
  };

  const array x = set_output(inputs[0]);
  const array& w = inputs[1];

  auto axis_size = static_cast<uint32_t>(x.shape().back());
  int n_rows = x.data_size() / axis_size;

  const int simd_size = 32;
  const int n_reads = RMS_N_READS;
  const int looped_limit = RMS_LOOPED_LIMIT;
  std::string op_name = "rms";
  if (axis_size > looped_limit) {
    op_name += "_looped";
  }
  op_name += type_to_name(out);
  auto& compute_encoder = d.get_command_encoder(s.index);
  {
    auto kernel = d.get_kernel(op_name);

    MTL::Size grid_dims, group_dims;
    if (axis_size <= looped_limit) {
      // One thread per n_reads elements, rounded up to whole SIMD groups.
      size_t threadgroup_needed = (axis_size + n_reads - 1) / n_reads;
      size_t simds_needed = (threadgroup_needed + simd_size - 1) / simd_size;
      size_t threadgroup_size = simd_size * simds_needed;
      if (threadgroup_size > kernel->maxTotalThreadsPerThreadgroup()) {
        std::ostringstream msg;
        msg << "[rms_norm] Threadgroup size " << threadgroup_size
            << " is larger than the maximum allowed threadgroup size "
            << kernel->maxTotalThreadsPerThreadgroup();
        throw std::runtime_error(msg.str());
      }
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    } else {
      size_t threadgroup_size = kernel->maxTotalThreadsPerThreadgroup();
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    }

    // A weight that is not 1-D is passed with stride 0.
    uint32_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;
    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(x, 0);
    compute_encoder.set_input_array(w, 1);
    compute_encoder.set_output_array(out, 2);
    compute_encoder.set_bytes(eps_, 3);
    compute_encoder.set_bytes(axis_size, 4);
    compute_encoder.set_bytes(w_stride, 5);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

void RMSNormVJP::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
    if (x.flags().row_contiguous) {
      return {x, false};
    }
    array x_copy = contiguous_copy_gpu(x, s);
    return {x_copy, true};
  };
  bool donate_g = inputs[2].is_donatable();
  auto [x, copied] = check_input(inputs[0]);
  const array& w = inputs[1];
  auto [g, g_copied] = check_input(inputs[2]);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];

  // Check whether we had a weight
  bool has_w = w.ndim() != 0;

  // Allocate space for the outputs
  bool g_in_gx = false;
  if (x.is_donatable()) {
    gx.copy_shared_buffer(x);
  } else if (g.is_donatable()) {
    gx.copy_shared_buffer(g);
    g_in_gx = true;
  } else {
    gx.set_data(allocator::malloc(gx.nbytes()));
  }
  if (g_copied && !g_in_gx) {
    d.add_temporary(g, s.index);
  }

  auto axis_size = static_cast<uint32_t>(x.shape().back());
  int n_rows = x.data_size() / axis_size;

  // Allocate the gradient accumulator gw and a temporary to store the
  // gradients before they are accumulated.
  array gw_temp =
      (has_w) ? array({n_rows, x.shape().back()}, gw.dtype(), nullptr, {}) : w;
  if (has_w) {
    if (!g_in_gx && donate_g) {
      gw_temp.copy_shared_buffer(g);
    } else {
      gw_temp.set_data(allocator::malloc(gw_temp.nbytes()));
      d.add_temporary(gw_temp, s.index);
    }
  }
  gw.set_data(allocator::malloc(gw.nbytes()));

  const int simd_size = 32;
  const int n_reads = RMS_N_READS;
  const int looped_limit = RMS_LOOPED_LIMIT;
  std::string op_name = "vjp_rms";
  if (axis_size > looped_limit) {
    op_name += "_looped";
  }
  op_name += type_to_name(gx);

  std::string hash_name = op_name + ((has_w) ? "_w" : "_now");
  metal::MTLFCList func_consts = {
      {&has_w, MTL::DataType::DataTypeBool, 20},
  };

  auto& compute_encoder = d.get_command_encoder(s.index);
  {
    auto kernel = d.get_kernel(op_name, hash_name, func_consts);

    MTL::Size grid_dims, group_dims;
    if (axis_size <= looped_limit) {
      size_t threadgroup_needed = (axis_size + n_reads - 1) / n_reads;
      size_t simds_needed = (threadgroup_needed + simd_size - 1) / simd_size;
      size_t threadgroup_size = simd_size * simds_needed;
      assert(threadgroup_size <= kernel->maxTotalThreadsPerThreadgroup());
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    } else {
      size_t threadgroup_size = kernel->maxTotalThreadsPerThreadgroup();
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    }

    uint32_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;
    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(x, 0);
    compute_encoder.set_input_array(w, 1);
    compute_encoder.set_input_array(g, 2);
    compute_encoder.set_output_array(gx, 3);
    compute_encoder.set_output_array(gw_temp, 4);
    compute_encoder.set_bytes(eps_, 5);
    compute_encoder.set_bytes(axis_size, 6);
    compute_encoder.set_bytes(w_stride, 7);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }

  if (has_w) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    strided_reduce_general_dispatch(
        gw_temp, gw, "sum", plan, {0}, compute_encoder, d, s);
  }
}

// The fallback implementation is requested exactly when the stream's device
// is the CPU.
bool LayerNorm::use_fallback(Stream s) {
  return s.device == Device::cpu;
}

// Encode the layer normalization kernel on this primitive's stream.
//
// inputs[0] is the array to normalize, inputs[1] the weight and inputs[2] the
// bias; the result is written to outputs[0]. The normalized axis is the last
// one. Rows longer than the looped limit use the "_looped" kernel variant.
//
// Throws std::runtime_error if the threadgroup needed for the non-looped
// kernel exceeds what the pipeline supports.
void LayerNorm::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& out = outputs[0];

  // Returns the array the kernel should read from, making sure its last
  // dimension is contiguous, and sets up `out` to match.
  auto set_output = [&s, &out](const array& x) {
    bool no_copy = x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (no_copy && x.ndim() > 1) {
      auto row_stride = x.strides()[x.ndim() - 2];
      no_copy &=
          (row_stride == 0 || row_stride == x.shape().back() ||
           x.shape(-2) == 1);
    }
    if (!no_copy) {
      // Layout not usable as is: work on a contiguous copy and let the
      // output share its buffer.
      array x_copy = contiguous_copy_gpu(x, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
    if (x.is_donatable()) {
      out.copy_shared_buffer(x);
    } else {
      // Fresh buffer that mirrors the size, strides and flags of x.
      out.set_data(
          allocator::malloc(x.data_size() * x.itemsize()),
          x.data_size(),
          x.strides(),
          x.flags());
    }
    return x;
  };

  const array x = set_output(inputs[0]);
  const array& w = inputs[1];
  const array& b = inputs[2];

  auto axis_size = static_cast<uint32_t>(x.shape().back());
  int n_rows = x.data_size() / axis_size;

  int simd_size = 32;
  int n_reads = 8;
  int looped_limit = 6656;
  const bool looped = axis_size > looped_limit;

  std::string op_name = "layer_norm";
  if (looped) {
    op_name += "_looped";
    n_reads = 4;
  }
  op_name += type_to_name(out);

  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(op_name);

  // Threads per row: sized to the row for the single-pass kernel, the
  // pipeline maximum for the looped one.
  size_t threadgroup_size;
  if (looped) {
    threadgroup_size = kernel->maxTotalThreadsPerThreadgroup();
  } else {
    size_t threadgroup_needed = (axis_size + n_reads - 1) / n_reads;
    size_t simds_needed = (threadgroup_needed + simd_size - 1) / simd_size;
    threadgroup_size = simd_size * simds_needed;
    if (threadgroup_size > kernel->maxTotalThreadsPerThreadgroup()) {
      std::ostringstream msg;
      msg << "[layer_norm] Threadgroup size " << threadgroup_size
          << " is larger than the maximum allowed threadgroup size "
          << kernel->maxTotalThreadsPerThreadgroup();
      throw std::runtime_error(msg.str());
    }
  }
  size_t n_threads = n_rows * threadgroup_size;
  MTL::Size grid_dims = MTL::Size(n_threads, 1, 1);
  MTL::Size group_dims = MTL::Size(threadgroup_size, 1, 1);

  // Weight / bias that are not 1-D are passed with stride 0.
  uint32_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;
  uint32_t b_stride = (b.ndim() == 1) ? b.strides()[0] : 0;

  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(x, 0);
  compute_encoder.set_input_array(w, 1);
  compute_encoder.set_input_array(b, 2);
  compute_encoder.set_output_array(out, 3);
  compute_encoder.set_bytes(eps_, 4);
  compute_encoder.set_bytes(axis_size, 5);
  compute_encoder.set_bytes(w_stride, 6);
  compute_encoder.set_bytes(b_stride, 7);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Encode the vector-Jacobian product of layer normalization.
//
// inputs[0] is x, inputs[1] is the weight w and inputs[3] is the cotangent g.
// outputs[0..2] receive the gradients gx, gw and gb. When a weight is present
// (w.ndim() != 0) the kernel writes per-row weight gradients into a
// (n_rows, axis_size) temporary which is then sum-reduced over rows into gw.
//
// Throws std::runtime_error if the threadgroup needed for the non-looped
// kernel exceeds what the pipeline supports.
void LayerNormVJP::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  // Ensure row contiguity. We could relax this step by checking that the array
  // is contiguous (no broadcasts or holes) and that the input strides are the
  // same as the cotangent strides but for now this is simpler.
  // Returns the array to use and whether it is a fresh copy.
  auto check_input = [&s](const array& x) -> std::pair<array, bool> {
    if (x.flags().row_contiguous) {
      return {x, false};
    }
    array x_copy = contiguous_copy_gpu(x, s);
    return {x_copy, true};
  };
  // Query donatability on the original inputs before local handles are taken;
  // a fresh copy is always treated as donatable.
  bool donate_x = inputs[0].is_donatable();
  bool donate_g = inputs[3].is_donatable();
  auto [x, copied] = check_input(inputs[0]);
  donate_x |= copied;
  const array& w = inputs[1];
  auto [g, g_copied] = check_input(inputs[3]);
  donate_g |= g_copied;
  array& gx = outputs[0];
  array& gw = outputs[1];
  array& gb = outputs[2];

  // Check whether we had a weight
  bool has_w = w.ndim() != 0;

  // Allocate space for the outputs
  bool g_in_gx = false;
  if (donate_x) {
    gx.copy_shared_buffer(x);
  } else if (donate_g) {
    gx.copy_shared_buffer(g);
    g_in_gx = true;
  } else {
    gx.set_data(allocator::malloc(gx.nbytes()));
  }
  // A copied g that did not become gx is registered as a temporary.
  if (g_copied && !g_in_gx) {
    d.add_temporary(g, s.index);
  }

  auto axis_size = static_cast<uint32_t>(x.shape().back());
  int n_rows = x.data_size() / axis_size;

  // Allocate a temporary to store the gradients for w and allocate the output
  // gradient accumulators.
  array gw_temp =
      (has_w) ? array({n_rows, x.shape().back()}, gw.dtype(), nullptr, {}) : w;
  if (has_w) {
    // Reuse g's buffer for the temporary when it is free to take; otherwise
    // allocate a new temporary buffer.
    if (!g_in_gx && donate_g) {
      gw_temp.copy_shared_buffer(g);
    } else {
      gw_temp.set_data(allocator::malloc(gw_temp.nbytes()));
      d.add_temporary(gw_temp, s.index);
    }
  }
  gw.set_data(allocator::malloc(gw.nbytes()));
  gb.set_data(allocator::malloc(gb.nbytes()));

  // Finish with the gradient for b in case we had a b
  // (gb is the sum of g over rows; this is encoded before the main kernel).
  auto& compute_encoder = d.get_command_encoder(s.index);
  if (gb.ndim() == 1 && gb.size() == axis_size) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    strided_reduce_general_dispatch(
        g, gb, "sum", plan, {0}, compute_encoder, d, s);
  }

  int simd_size = 32;
  int n_reads = 8;
  int looped_limit = 8192;
  std::string op_name = "vjp_layer_norm";
  if (axis_size > looped_limit) {
    op_name += "_looped";
    n_reads = 4;
  }
  op_name += type_to_name(gx);

  // The kernel is specialized on has_w through function constant 20.
  std::string hash_name = op_name + ((has_w) ? "_w" : "_now");
  metal::MTLFCList func_consts = {
      {&has_w, MTL::DataType::DataTypeBool, 20},
  };

  {
    auto kernel = d.get_kernel(op_name, hash_name, func_consts);

    MTL::Size grid_dims, group_dims;
    if (axis_size <= looped_limit) {
      // One thread per n_reads elements, rounded up to whole SIMD groups.
      size_t threadgroup_needed = (axis_size + n_reads - 1) / n_reads;
      size_t simds_needed = (threadgroup_needed + simd_size - 1) / simd_size;
      size_t threadgroup_size = simd_size * simds_needed;
      if (threadgroup_size > kernel->maxTotalThreadsPerThreadgroup()) {
        std::ostringstream msg;
        msg << "[vjp_layer_norm] Threadgroup size " << threadgroup_size
            << " is larger than the maximum allowed threadgroup size "
            << kernel->maxTotalThreadsPerThreadgroup();
        throw std::runtime_error(msg.str());
      }
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    } else {
      size_t threadgroup_size = kernel->maxTotalThreadsPerThreadgroup();
      size_t n_threads = n_rows * threadgroup_size;
      grid_dims = MTL::Size(n_threads, 1, 1);
      group_dims = MTL::Size(threadgroup_size, 1, 1);
    }

    // A weight that is not 1-D is passed with stride 0.
    uint32_t w_stride = (w.ndim() == 1) ? w.strides()[0] : 0;
    compute_encoder.set_compute_pipeline_state(kernel);
    compute_encoder.set_input_array(x, 0);
    compute_encoder.set_input_array(w, 1);
    compute_encoder.set_input_array(g, 2);
    compute_encoder.set_output_array(gx, 3);
    compute_encoder.set_output_array(gw_temp, 4);
    compute_encoder.set_bytes(eps_, 5);
    compute_encoder.set_bytes(axis_size, 6);
    compute_encoder.set_bytes(w_stride, 7);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }

  // Sum the per-row weight gradients over the row axis into gw.
  if (has_w) {
    ReductionPlan plan(
        ReductionOpType::ContiguousStridedReduce, {n_rows}, {axis_size});
    strided_reduce_general_dispatch(
        gw_temp, gw, "sum", plan, {0}, compute_encoder, d, s);
  }
}

} // namespace mlx::core::fast


================================================
FILE: mlx/backend/metal/primitives.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
#include <numeric>
#include <sstream>

#include "mlx/backend/common/slicing.h"
#include "mlx/backend/common/utils.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/slicing.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#include "mlx/scheduler.h"
#include "mlx/utils.h"

namespace mlx::core {

// Encodes the two scalar arguments of the arange kernel: the start value at
// index 0 and the step at index 1. The step is derived in type T as the
// difference between the second value of the range and the first.
template <typename T>
void arange_set_scalars(T start, T next, metal::CommandEncoder& enc) {
  T step = next - start;
  enc.set_bytes(start, 0);
  enc.set_bytes(step, 1);
}

void Arange::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 0);
  out.set_data(allocator::malloc(out.nbytes()));
  if (out.size() == 0) {
    return;
  }
  auto& s = stream();
  auto& d = metal::device(s.device);
  auto kernel = get_arange_kernel(d, "arange" + type_to_name(out), out);
  size_t nthreads = out.size();
  MTL::Size grid_dims = MTL::Size(nthreads, 1, 1);
  MTL::Size group_dims = MTL::Size(
      std::min(nthreads, kernel->maxTotalThreadsPerThreadgroup()), 1, 1);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  switch (out.dtype()) {
    case bool_: // unsupported
      throw std::runtime_error("[Arange::eval_gpu] Does not support bool");
    case uint8:
      arange_set_scalars<uint8_t>(start_, start_ + step_, compute_encoder);
      break;
    case uint16:
      arange_set_scalars<uint16_t>(start_, start_ + step_, compute_encoder);
      break;
    case uint32:
      arange_set_scalars<uint32_t>(start_, start_ + step_, compute_encoder);
      break;
    case uint64:
      arange_set_scalars<uint64_t>(start_, start_ + step_, compute_encoder);
      break;
    case int8:
      arange_set_scalars<int8_t>(start_, start_ + step_, compute_encoder);
      break;
    case int16:
      arange_set_scalars<int16_t>(start_, start_ + step_, compute_encoder);
      break;
    case int32:
      arange_set_scalars<int32_t>(start_, start_ + step_, compute_encoder);
      break;
    case int64:
      arange_set_scalars<int64_t>(start_, start_ + step_, compute_encoder);
      break;
    case float16:
      arange_set_scalars<float16_t>(start_, start_ + step_, compute_encoder);
      break;
    case float32:
      arange_set_scalars<float>(start_, start_ + step_, compute_encoder);
      break;
    case bfloat16:
      arange_set_scalars<bfloat16_t>(start_, start_ + step_, compute_encoder);
      break;
    default:
      throw std::runtime_error("[Arange::eval_gpu] Does not support type.");
  }

  compute_encoder.set_output_array(out, 2);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Encode the argmin / argmax kernel for the single input along `axis_`.
// The kernel receives the input shape and strides with the reduced axis
// removed, plus the stride and size of that axis separately.
void ArgReduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  auto& in = inputs[0];
  out.set_data(allocator::malloc(out.nbytes()));
  auto& s = stream();
  auto& d = metal::device(s.device);

  std::string op_name;
  switch (reduce_type_) {
    case ArgReduce::ArgMin:
      op_name = "argmin_";
      break;
    case ArgReduce::ArgMax:
      op_name = "argmax_";
      break;
  }

  // Prepare the shapes, strides and axis arguments.
  auto in_strides = in.strides();
  auto shape = in.shape();
  auto out_strides = out.strides();
  auto axis_stride = in_strides[axis_];
  size_t axis_size = shape[axis_];
  // The output only carries an entry for the reduced axis when it has as many
  // dimensions as the input.
  if (out_strides.size() == in_strides.size()) {
    out_strides.erase(out_strides.begin() + axis_);
  }
  in_strides.erase(in_strides.begin() + axis_);
  shape.erase(shape.begin() + axis_);
  size_t ndim = shape.size();

  // ArgReduce
  int simd_size = 32;
  int n_reads = 4;
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(op_name + type_to_name(in));

  // One thread per n_reads elements of the axis, capped at the pipeline
  // maximum, then rounded up to the closest number divisible by simd_size.
  NS::UInteger thread_group_size = std::min(
      (axis_size + n_reads - 1) / n_reads,
      kernel->maxTotalThreadsPerThreadgroup());
  thread_group_size =
      (thread_group_size + simd_size - 1) / simd_size * simd_size;
  assert(thread_group_size <= kernel->maxTotalThreadsPerThreadgroup());

  auto out_grid = get_2d_grid_dims(out.shape(), out.strides());
  MTL::Size grid_dims =
      MTL::Size(thread_group_size, out_grid.width, out_grid.height);
  MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);

  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  if (ndim != 0) {
    compute_encoder.set_vector_bytes(shape, 2);
    compute_encoder.set_vector_bytes(in_strides, 3);
    compute_encoder.set_vector_bytes(out_strides, 4);
  } else {
    // Pass place holders so metal doesn't complain
    int placeholder_shape = 0;
    int64_t placeholder_stride = 0;
    compute_encoder.set_bytes(placeholder_shape, 2);
    compute_encoder.set_bytes(placeholder_stride, 3);
    compute_encoder.set_bytes(placeholder_stride, 4);
  }
  compute_encoder.set_bytes(ndim, 5);
  compute_encoder.set_bytes(axis_stride, 6);
  compute_encoder.set_bytes(axis_size, 7);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Not implemented for this backend: always throws std::runtime_error.
void Load::eval_gpu(const std::vector<array>& inputs, array& out) {
  throw std::runtime_error("[Load::eval_gpu] Not implemented.");
}

// Encode the random-bits kernel that fills `out` from the keys in inputs[0].
//
// The output is allocated first and empty outputs return immediately. This
// must happen before the per-key sizes are derived: an empty `keys` array
// (which, given the shapes below, implies an empty output) would otherwise
// lead to an integer division by zero.
void RandomBits::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc(out.nbytes()));
  if (out.size() == 0) {
    return;
  }

  // keys has shape (N1, ..., NK, 2)
  // out has shape (N1, ..., NK, M1, M2, ...)
  auto& keys = inputs[0];
  size_t num_keys = keys.size() / 2;

  size_t elems_per_key = out.size() / num_keys;
  size_t bytes_per_key = out.itemsize() * elems_per_key;

  // Number of 4-byte words per key (rounded up). The grid covers them in
  // pairs, with one extra thread per key when the count is odd.
  size_t out_per_key = (bytes_per_key + 4 - 1) / 4;
  size_t half_size = out_per_key / 2;
  bool odd = out_per_key % 2;

  auto& s = stream();
  auto& d = metal::device(s.device);
  // Row-contiguous keys use the kernel variant without shape/stride arguments.
  std::string kname = keys.flags().row_contiguous ? "rbitsc" : "rbits";
  auto kernel = d.get_kernel(kname);

  // organize into grid nkeys x elem_per_key
  MTL::Size grid_dims = MTL::Size(num_keys, half_size + odd, 1);
  auto group_dims = get_block_dims(num_keys, half_size + odd, 1);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(keys, 0);
  compute_encoder.set_output_array(out, 1);
  compute_encoder.set_bytes(odd, 2);
  compute_encoder.set_bytes(bytes_per_key, 3);

  if (!keys.flags().row_contiguous) {
    int ndim = keys.ndim();
    compute_encoder.set_bytes(ndim, 4);
    compute_encoder.set_vector_bytes(keys.shape(), 5);
    compute_encoder.set_vector_bytes(keys.strides(), 6);
  }

  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Not yet implemented for this backend: always throws std::runtime_error.
void QRF::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  throw std::runtime_error("[QRF::eval_gpu] Metal QR factorization NYI.");
}

// Not yet implemented for this backend: always throws std::runtime_error.
void SVD::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  throw std::runtime_error("[SVD::eval_gpu] Metal SVD NYI.");
}

// Not yet implemented for this backend: always throws std::runtime_error.
void Inverse::eval_gpu(const std::vector<array>& inputs, array& output) {
  throw std::runtime_error("[Inverse::eval_gpu] Metal inversion NYI.");
}

// Not yet implemented for this backend: always throws std::runtime_error.
void Cholesky::eval_gpu(const std::vector<array>& inputs, array& out) {
  throw std::runtime_error(
      "[Cholesky::eval_gpu] Metal Cholesky decomposition NYI.");
}

// Not yet implemented for this backend: always throws std::runtime_error.
void Eig::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  throw std::runtime_error("[Eig::eval_gpu] Metal Eig NYI.");
}

// Not yet implemented for this backend: always throws std::runtime_error.
void Eigh::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  throw std::runtime_error("[Eigh::eval_gpu] Metal Eigh NYI.");
}

// Not yet implemented for this backend: always throws std::runtime_error.
void LUF::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  throw std::runtime_error("[LUF::eval_gpu] Metal LU factorization NYI.");
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/quantized.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/backend/common/broadcasting.h"
#include "mlx/backend/common/compiled.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/reduce.h"
#include "mlx/backend/metal/unary.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/fast_primitives.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

// Convenience wrapper around get_quantized_kernel().
//
// The kernel function name is `func` prefixed with "affine_" when `mode` is
// exactly "affine" and with "fp_" for any other mode. That name, together with
// `type`, `group_size`, `bits` and the trailing `args`, is handed to
// get_template_definition(); the resulting string is passed on to
// get_quantized_kernel(), whose result is returned unchanged.
template <typename... Args>
auto get_quantized_kernel_wrapped(
    metal::Device& d,
    const std::string& name,
    const std::string& func,
    const std::string& mode,
    const std::string& type,
    int group_size,
    int bits,
    Args... args) {
  const bool is_affine = (mode == "affine");
  std::string kernel_func = (is_affine ? "affine_" : "fp_") + func;
  std::string definition = get_template_definition(
      name, kernel_func, type, group_size, bits, std::forward<Args>(args)...);
  return get_quantized_kernel(d, name, definition, mode);
}

// Convenience wrapper around get_qmm_nax_kernel().
//
// The kernel function name is `func` prefixed with "affine_" when `mode` is
// exactly "affine" and with "fp_" for any other mode. That name, together with
// `type`, `group_size`, `bits` and the trailing `args`, is handed to
// get_template_definition(); the resulting string is passed on to
// get_qmm_nax_kernel(), whose result is returned unchanged.
template <typename... Args>
auto get_qmm_nax_kernel_wrapped(
    metal::Device& d,
    const std::string& name,
    const std::string& func,
    const std::string& mode,
    const std::string& type,
    int group_size,
    int bits,
    Args... args) {
  const bool is_affine = (mode == "affine");
  std::string kernel_func = (is_affine ? "affine_" : "fp_") + func;
  std::string definition = get_template_definition(
      name, kernel_func, type, group_size, bits, std::forward<Args>(args)...);
  return get_qmm_nax_kernel(d, name, definition, mode);
}

// Returns `x` itself when its row_contiguous flag is set. Otherwise makes a
// contiguous copy on stream `s`, registers that copy with the device as a
// temporary for the stream, and returns the copy.
inline array
ensure_row_contiguous(const array& x, metal::Device& d, const Stream& s) {
  if (x.flags().row_contiguous) {
    return x;
  }

  array dense = contiguous_copy_gpu(x, s);
  d.add_temporary(dense, s.index);
  return dense;
}

// Returns an array whose trailing matrix is laid out row-contiguously.
//
// Only the innermost one or two axes are inspected:
//   * 0-d input: there are no strides to check, so `x` is returned as is.
//   * 1-d input: accepted when its single stride is 1.
//   * otherwise: accepted when the last axis has stride 1 and the second to
//     last axis has a stride equal to the size of the last axis.
// Leading (batch) axes are not examined. When the check fails, a contiguous
// copy is made on stream `s`, registered with the device as a temporary, and
// returned instead.
inline array ensure_row_contiguous_matrix(
    const array& x,
    metal::Device& d,
    const Stream& s) {
  const auto ndim = x.ndim();
  if (ndim == 0) {
    // strides() is empty here; indexing it would be undefined behaviour.
    return x;
  }

  if (ndim < 2) {
    if (x.strides()[0] == 1) {
      return x;
    }
  } else {
    auto stride_0 = x.strides()[ndim - 2];
    auto stride_1 = x.strides()[ndim - 1];
    if (stride_0 == x.shape(-1) && stride_1 == 1) {
      return x;
    }
  }

  array x_copy = contiguous_copy_gpu(x, s);
  d.add_temporary(x_copy, s.index);
  return x_copy;
}

// Looks up a batch-size limit from the device architecture and the problem
// dimensions `D` and `O`.
//
// Inputs to the lookup:
//   * the last character of the architecture string ('d' vs. anything else),
//   * the architecture generation (13 or 14 vs. anything else),
//   * a size tier: both dims <= 2048, both dims <= 4096, or larger.
// Devices whose architecture string ends in 'd' get the same limits for every
// generation.
inline int get_qmv_batch_limit(int D, int O, metal::Device& d) {
  auto arch_size = d.get_architecture().back();
  auto arch_gen = d.get_architecture_gen();

  // 0: D and O both <= 2048, 1: both <= 4096, 2: everything else.
  int tier = 2;
  if (D <= 2048 && O <= 2048) {
    tier = 0;
  } else if (D <= 4096 && O <= 4096) {
    tier = 1;
  }

  if (arch_size == 'd') {
    constexpr int limits[3] = {32, 18, 12};
    return limits[tier];
  }

  if (arch_gen == 13 || arch_gen == 14) {
    constexpr int limits[3] = {14, 10, 6};
    return limits[tier];
  }

  constexpr int limits[3] = {18, 12, 10};
  return limits[tier];
}

// Binds the batch shape/stride arguments of a quantized matmul kernel,
// starting at buffer index `offset`.
//
// When `skip` is true nothing is bound. Otherwise the following are bound at
// consecutive indices, in this order:
//   x batch ndims (x.ndim() - 2), x shape, x strides,
//   w batch ndims (w.ndim() - 2), w shape, w strides,
//   scales strides, and - only when `biases` has a value - biases strides.
//
// Returns the first buffer index that is still free after this call. When
// `skip` is true that is `offset` itself, so the result can always be used to
// continue binding further arguments.
inline int add_strides_and_shapes(
    CommandEncoder& compute_encoder,
    bool skip,
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    int offset) {
  if (skip) {
    // Nothing was bound, so the next free index is unchanged.
    return offset;
  }

  // TODO: Collapse batch dimensions

  int x_batch_ndims = x.ndim() - 2;
  int w_batch_ndims = w.ndim() - 2;
  compute_encoder.set_bytes(x_batch_ndims, offset++);
  compute_encoder.set_vector_bytes(x.shape(), offset++);
  compute_encoder.set_vector_bytes(x.strides(), offset++);
  compute_encoder.set_bytes(w_batch_ndims, offset++);
  compute_encoder.set_vector_bytes(w.shape(), offset++);
  compute_encoder.set_vector_bytes(w.strides(), offset++);
  compute_encoder.set_vector_bytes(scales.strides(), offset++);
  if (biases) {
    compute_encoder.set_vector_bytes(biases->strides(), offset++);
  }

  return offset;
}

// Binds the index-array layout for the gather kernels at four consecutive
// buffer indices starting at `offset`.
//
// The shape of `lhs_indices` and the strides of both index arrays are first
// passed through collapse_contiguous_dims(). The collapsed number of dims,
// the collapsed shape, the lhs strides and the rhs strides are then bound in
// that order. Returns `offset + 4`, the next free buffer index.
inline int add_gather_strides_and_shapes(
    CommandEncoder& compute_encoder,
    const array& lhs_indices,
    const array& rhs_indices,
    int offset) {
  auto [idx_shape, idx_strides] = collapse_contiguous_dims(
      lhs_indices.shape(), {lhs_indices.strides(), rhs_indices.strides()});
  int idx_ndim = idx_shape.size();

  compute_encoder.set_bytes(idx_ndim, offset);
  compute_encoder.set_vector_bytes(idx_shape, offset + 1);
  compute_encoder.set_vector_bytes(idx_strides[0], offset + 2);
  compute_encoder.set_vector_bytes(idx_strides[1], offset + 3);

  return offset + 4;
}

} // namespace

// Encodes the "qmv_quad" quantized kernel on stream `s`.
//
// The kernel is specialised on dtype, `group_size`, `bits`, `K` and on
// whether the problem is batched (out.size() / M / N > 1). Threadgroups hold
// 32 threads and the grid is (M, ceil(N / 64), B).
//
// Buffer order: w, scales, [biases], x, out, K, N and - for batched problems
// only - the batch shapes/strides from add_strides_and_shapes().
void qmv_quad(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  constexpr int quads_per_simd = 8;
  constexpr int results_per_quadgroup = 8;
  constexpr int simdgroup_size = 32;
  const int bn = quads_per_simd * results_per_quadgroup;

  const int B = out.size() / M / N;
  const bool batched = B > 1;

  MTL::Size group_dims(simdgroup_size, 1, 1);
  MTL::Size grid_dims(M, (N + bn - 1) / bn, B);

  // Kernel name encodes every template parameter.
  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + "_qmv_quad_",
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      "_d_",
      K,
      batched ? "_batch_1" : "_batch_0");

  auto kernel = get_quantized_kernel_wrapped(
      d, kname, "qmv_quad", mode, type_string, group_size, bits, K, batched);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  // Batch shapes/strides are only bound when there is more than one batch.
  add_strides_and_shapes(
      compute_encoder, !batched, x, w, scales, biases, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes the "qmv" quantized kernel on stream `s`.
//
// The "qmv_fast" variant is selected when N is a multiple of 8 and K is a
// multiple of 512; otherwise the general "qmv" variant is used. The kernel is
// further specialised on dtype, `group_size`, `bits` and on whether the
// problem is batched (out.size() / M / N > 1). Threadgroups are (32, 2, 1)
// and the grid is (M, ceil(N / 8), B).
//
// Buffer order: w, scales, [biases], x, out, K, N and - for batched problems
// only - the batch shapes/strides from add_strides_and_shapes().
void qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  constexpr int bn = 8;
  constexpr int bk = 32;

  const int B = out.size() / M / N;
  const bool batched = B > 1;
  const bool fast = N % bn == 0 && K % 512 == 0;

  MTL::Size group_dims(bk, 2, 1);
  MTL::Size grid_dims(M, (N + bn - 1) / bn, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + (fast ? "_qmv_fast_" : "_qmv_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      batched ? "_batch_1" : "_batch_0");

  auto kernel = get_quantized_kernel_wrapped(
      d,
      kname,
      (fast ? "qmv_fast" : "qmv"),
      mode,
      type_string,
      group_size,
      bits,
      batched);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  // Batch shapes/strides are only bound when there is more than one batch.
  add_strides_and_shapes(
      compute_encoder, !batched, x, w, scales, biases, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes the split-K variant of the "qvm" kernel followed by a sum
// reduction, both on stream `s`.
//
// The K dimension is cut into `split_k` pieces (32 when K > 8192, otherwise
// 8) of `split_D = ceil(K / split_k)` elements each. The pieces are exposed
// to the kernel as an extra batch axis by inserting a dimension into the
// shape/stride vectors of x, w, scales and biases. Partial results are
// written to a temporary `intermediate` array that has one extra axis of
// size `split_k`; that axis is then summed into `out` with
// strided_reduce_general_dispatch().
//
// `final_block_size` is the number of K elements left for the last piece,
// which is smaller than `split_D` when K is not a multiple of `split_k`.
void qvm_split_k(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  int split_k = K > 8192 ? 32 : 8;
  int split_D = (K + split_k - 1) / split_k;
  int B = out.size() / M / N;
  B *= split_k;

  constexpr int num_simdgroups = 2;
  constexpr int bk = 32;
  int bn = std::min(group_size, 32) * num_simdgroups;
  MTL::Size group_dims = MTL::Size(bk, num_simdgroups, 1);
  // Round up so that a trailing partial block of columns is still covered
  // (same grid computation as qvm). With truncating division those columns
  // of `intermediate` would never be written before being reduced, and the
  // grid would be empty for N < bn.
  // NOTE(review): relies on the kernel bounds-checking the partial block, as
  // the one dispatched by qvm must do - confirm.
  MTL::Size grid_dims = MTL::Size(M, (N + bn - 1) / bn, B);

  // Treat a 1-d x as a single-row matrix.
  auto x_shape = x.shape();
  auto x_strides = x.strides();
  if (x_shape.size() == 1) {
    x_shape.insert(x_shape.begin(), 1);
    x_strides.insert(x_strides.begin(), 0);
  }

  int x_ndim = x_shape.size();
  int x_batch_ndims = x_ndim - 2;
  int w_batch_ndims = w.ndim() - 2;
  auto w_shape = w.shape();
  auto w_strides = w.strides();
  auto s_strides = scales.strides();

  // Add split_k dim with reshapes
  x_shape.insert(x_shape.end() - 2, split_k);
  x_shape.back() /= split_k;
  x_strides.insert(x_strides.end() - 2, split_D);
  // NOTE(review): after the insert above, index x_ndim - 1 is the
  // second-to-last entry of x_strides - confirm this is the intended axis.
  x_strides[x_ndim - 1] = split_D;
  x_batch_ndims += 1;

  w_shape.insert(w_shape.end() - 2, split_k);
  w_shape[w.ndim() - 1] /= split_k;
  w_strides.insert(w_strides.end() - 2, split_D * w.shape(-1));
  w_batch_ndims += 1;
  s_strides.insert(s_strides.end() - 2, split_D * scales.shape(-1));

  int final_block_size = K - (split_k - 1) * split_D;

  // Temporary output with an extra split_k axis in front of the matrix dims.
  auto temp_shape = out.shape();
  if (temp_shape.size() == 1) {
    temp_shape.insert(temp_shape.begin(), 1);
  }
  temp_shape.insert(temp_shape.end() - 2, split_k);
  array intermediate(temp_shape, x.dtype(), nullptr, {});
  intermediate.set_data(allocator::malloc(intermediate.nbytes()));
  d.add_temporary(intermediate, s.index);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + "_qvm_split_k_",
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      "_spk_",
      split_k);

  // Encode and dispatch kernel
  auto kernel = get_quantized_kernel_wrapped(
      d, kname, "qvm_split_k", mode, type_string, group_size, bits, split_k);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int c = 0;
  compute_encoder.set_input_array(w, c++);
  compute_encoder.set_input_array(scales, c++);
  if (biases) {
    compute_encoder.set_input_array(*biases, c++);
  }
  compute_encoder.set_input_array(x, c++);
  compute_encoder.set_output_array(intermediate, c++);
  compute_encoder.set_bytes(split_D, c++);
  compute_encoder.set_bytes(N, c++);

  // Same argument layout as add_strides_and_shapes(), but using the locally
  // modified shape/stride vectors that include the split_k axis.
  compute_encoder.set_bytes(x_batch_ndims, c++);
  compute_encoder.set_vector_bytes(x_shape, c++);
  compute_encoder.set_vector_bytes(x_strides, c++);
  compute_encoder.set_bytes(w_batch_ndims, c++);
  compute_encoder.set_vector_bytes(w_shape, c++);
  compute_encoder.set_vector_bytes(w_strides, c++);
  compute_encoder.set_vector_bytes(s_strides, c++);
  if (biases) {
    auto b_strides = biases->strides();
    b_strides.insert(b_strides.end() - 2, split_D * biases->shape(-1));
    compute_encoder.set_vector_bytes(b_strides, c++);
  }
  compute_encoder.set_bytes(final_block_size, c++);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Sum the partial results over the split_k axis into `out`.
  int axis = intermediate.ndim() - 3;
  ReductionPlan plan(
      ReductionOpType::ContiguousStridedReduce,
      {intermediate.shape(axis)},
      {intermediate.strides(axis)});
  strided_reduce_general_dispatch(
      intermediate, out, "sum", plan, {axis}, compute_encoder, d, s);
}

// Encodes the "qvm" quantized kernel on stream `s`.
//
// The kernel is specialised on dtype, `group_size`, `bits` and on whether the
// problem is batched (out.size() / M / N > 1). Threadgroups are (32, 2, 1);
// the grid is (M, ceil(N / bn), B) with bn = min(group_size, 32) * 2.
//
// Buffer order: w, scales, [biases], x, out, K, N and - for batched problems
// only - the batch shapes/strides from add_strides_and_shapes().
void qvm(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  constexpr int num_simdgroups = 2;
  constexpr int bk = 32;
  const int bn = std::min(group_size, 32) * num_simdgroups;

  const int B = out.size() / M / N;
  const bool batched = B > 1;

  MTL::Size group_dims(bk, num_simdgroups, 1);
  MTL::Size grid_dims(M, (N + bn - 1) / bn, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + "_qvm_",
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      batched ? "_batch_1" : "_batch_0");

  auto kernel = get_quantized_kernel_wrapped(
      d, kname, "qvm", mode, type_string, group_size, bits, batched);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  // Batch shapes/strides are only bound when there is more than one batch.
  add_strides_and_shapes(
      compute_encoder, !batched, x, w, scales, biases, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes the NAX quantized matmul kernel ("qmm_t_nax" when `transpose` is
// set, "qmm_n_nax" otherwise) on stream `s`.
//
// Block parameters are fixed at bm = bn = bk = 64 with wm = wn = 2.
// Threadgroups are (32, wn, wm) and the grid is
// (ceil(N / bn), ceil(M / bm), B). The transposed kernel is additionally
// specialised on whether N is a multiple of 64; both variants are
// specialised on whether the problem is batched (out.size() / M / N > 1).
//
// Buffer order: w, scales, [biases], x, out, K, N, M and - for batched
// problems only - the batch shapes/strides from add_strides_and_shapes().
void qmm_nax(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    bool transpose,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  constexpr int wm = 2;
  constexpr int wn = 2;
  constexpr int bm = 64;
  constexpr int bn = 64;
  constexpr int bk = 64;

  const int B = out.size() / M / N;
  const bool batched = B > 1;
  const bool aligned = N % 64 == 0;

  MTL::Size group_dims(32, wn, wm);
  MTL::Size grid_dims((N + bn - 1) / bn, (M + bm - 1) / bm, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + (transpose ? "_qmm_t_nax_" : "_qmm_n_nax_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn,
      transpose ? (aligned ? "_alN_true" : "_alN_false") : "",
      batched ? "_batch_1" : "_batch_0");

  // Only the transposed kernel takes the `aligned` template argument.
  MTL::ComputePipelineState* kernel = transpose
      ? get_qmm_nax_kernel_wrapped(
            d,
            kname,
            "qmm_t_nax",
            mode,
            type_string,
            group_size,
            bits,
            aligned,
            batched,
            bm,
            bk,
            bn,
            wm,
            wn)
      : get_qmm_nax_kernel_wrapped(
            d,
            kname,
            "qmm_n_nax",
            mode,
            type_string,
            group_size,
            bits,
            batched,
            bm,
            bk,
            bn,
            wm,
            wn);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  compute_encoder.set_bytes(M, arg++);
  // Batch shapes/strides are only bound when there is more than one batch.
  add_strides_and_shapes(
      compute_encoder, !batched, x, w, scales, biases, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes the NAX gather quantized matmul kernel on stream `s`.
//
// Block parameters are fixed at bm = bn = 64, bk = 32 with wm = wn = 2.
// Threadgroups are (32, wn, wm) and the grid is
// (ceil(N / bn), ceil(M / bm), B) where B = out.size() / M / N. The
// transposed kernel is additionally specialised on whether N is a multiple
// of 64.
//
// Buffer order: w, scales, [biases], x, lhs_indices, rhs_indices, out, K, N,
// M, then the batch shapes/strides (always bound here) followed by the index
// shapes/strides.
void gather_qmm_nax(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& lhs_indices,
    const array& rhs_indices,
    array& out,
    bool transpose,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  constexpr int wm = 2;
  constexpr int wn = 2;
  constexpr int bm = 64;
  constexpr int bn = 64;
  constexpr int bk = 32;

  const int B = out.size() / M / N;
  const bool aligned = N % 64 == 0;

  MTL::Size group_dims(32, wn, wm);
  MTL::Size grid_dims((N + bn - 1) / bn, (M + bm - 1) / bm, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + (transpose ? "_gather_qmm_t_nax_" : "_gather_qmm_n_nax_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      "_bm",
      bm,
      "_bn",
      bn,
      "_bk",
      bk,
      "_wm",
      wm,
      "_wn",
      wn,
      transpose ? (aligned ? "_alN_true" : "_alN_false") : "");

  // Only the transposed kernel takes the `aligned` template argument.
  // NOTE(review): the function names below end in '_' whereas the non-gather
  // NAX path passes names without a trailing underscore - confirm they match
  // the kernel template names.
  MTL::ComputePipelineState* kernel = transpose
      ? get_qmm_nax_kernel_wrapped(
            d,
            kname,
            "gather_qmm_t_nax_",
            mode,
            type_string,
            group_size,
            bits,
            aligned,
            bm,
            bk,
            bn,
            wm,
            wn)
      : get_qmm_nax_kernel_wrapped(
            d,
            kname,
            "gather_qmm_n_nax_",
            mode,
            type_string,
            group_size,
            bits,
            bm,
            bk,
            bn,
            wm,
            wn);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_input_array(lhs_indices, arg++);
  compute_encoder.set_input_array(rhs_indices, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  compute_encoder.set_bytes(M, arg++);
  arg = add_strides_and_shapes(
      compute_encoder, false, x, w, scales, biases, arg);
  add_gather_strides_and_shapes(
      compute_encoder, lhs_indices, rhs_indices, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes a quantized matmul kernel on stream `s`.
//
// Work is handed to qmm_nax() when all of the following hold: NAX is
// available, `transpose` is set, K is a multiple of 64, and either TF32 is
// enabled or x is not float32. Otherwise the "qmm_t" / "qmm_n" kernel is
// encoded here with bm = bn = 32 and wm = wn = 2: threadgroups are
// (32, wn, wm) and the grid is (ceil(N / bn), ceil(M / bm), B). The
// transposed kernel is additionally specialised on whether N is a multiple
// of 32; both variants on whether the problem is batched.
//
// Buffer order: w, scales, [biases], x, out, K, N, M and - for batched
// problems only - the batch shapes/strides from add_strides_and_shapes().
void qmm(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    bool transpose,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  const bool use_nax = metal::is_nax_available() && transpose &&
      (K % 64 == 0) && (env::enable_tf32() || x.dtype() != float32);
  if (use_nax) {
    qmm_nax(
        x,
        w,
        scales,
        biases,
        out,
        transpose,
        group_size,
        bits,
        M,
        N,
        K,
        d,
        s,
        mode);
    return;
  }

  constexpr int wm = 2;
  constexpr int wn = 2;
  constexpr int bm = 32;
  constexpr int bn = 32;

  const int B = out.size() / M / N;
  const bool batched = B > 1;
  const bool aligned = N % 32 == 0;

  MTL::Size group_dims(32, wn, wm);
  MTL::Size grid_dims((N + bn - 1) / bn, (M + bm - 1) / bm, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + (transpose ? "_qmm_t_" : "_qmm_n_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      transpose ? (aligned ? "_alN_true" : "_alN_false") : "",
      batched ? "_batch_1" : "_batch_0");

  // Only the transposed kernel takes the `aligned` template argument.
  MTL::ComputePipelineState* kernel = transpose
      ? get_quantized_kernel_wrapped(
            d,
            kname,
            "qmm_t",
            mode,
            type_string,
            group_size,
            bits,
            aligned,
            batched)
      : get_quantized_kernel_wrapped(
            d, kname, "qmm_n", mode, type_string, group_size, bits, batched);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  compute_encoder.set_bytes(M, arg++);
  // Batch shapes/strides are only bound when there is more than one batch.
  add_strides_and_shapes(
      compute_encoder, !batched, x, w, scales, biases, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes a gather quantized matmul kernel on stream `s`.
//
// Work is handed to gather_qmm_nax() when all of the following hold: NAX is
// available, `transpose` is set, K is a multiple of 64, and either TF32 is
// enabled or x is not float32. Otherwise the "gather_qmm_t" /
// "gather_qmm_n" kernel is encoded here with bm = bn = 32 and wm = wn = 2:
// threadgroups are (32, wn, wm) and the grid is
// (ceil(N / bn), ceil(M / bm), B) where B = out.size() / M / N. The
// transposed kernel is additionally specialised on whether N is a multiple
// of 32.
//
// Buffer order: w, scales, [biases], x, lhs_indices, rhs_indices, out, K, N,
// M, then the batch shapes/strides (always bound here) followed by the index
// shapes/strides.
void gather_qmm(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& lhs_indices,
    const array& rhs_indices,
    array& out,
    bool transpose,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  const bool use_nax = metal::is_nax_available() && transpose &&
      (K % 64 == 0) && (env::enable_tf32() || x.dtype() != float32);
  if (use_nax) {
    gather_qmm_nax(
        x,
        w,
        scales,
        biases,
        lhs_indices,
        rhs_indices,
        out,
        transpose,
        group_size,
        bits,
        M,
        N,
        K,
        d,
        s,
        mode);
    return;
  }

  constexpr int wm = 2;
  constexpr int wn = 2;
  constexpr int bm = 32;
  constexpr int bn = 32;

  const int B = out.size() / M / N;
  const bool aligned = N % 32 == 0;

  MTL::Size group_dims(32, wn, wm);
  MTL::Size grid_dims((N + bn - 1) / bn, (M + bm - 1) / bm, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + (transpose ? "_gather_qmm_t_" : "_gather_qmm_n_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      transpose ? (aligned ? "_alN_true" : "_alN_false") : "");

  // Only the transposed kernel takes the `aligned` template argument.
  MTL::ComputePipelineState* kernel = transpose
      ? get_quantized_kernel_wrapped(
            d,
            kname,
            "gather_qmm_t",
            mode,
            type_string,
            group_size,
            bits,
            aligned)
      : get_quantized_kernel_wrapped(
            d, kname, "gather_qmm_n", mode, type_string, group_size, bits);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_input_array(lhs_indices, arg++);
  compute_encoder.set_input_array(rhs_indices, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  compute_encoder.set_bytes(M, arg++);
  arg = add_strides_and_shapes(
      compute_encoder, false, x, w, scales, biases, arg);
  add_gather_strides_and_shapes(
      compute_encoder, lhs_indices, rhs_indices, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes the gather "qmv" quantized kernel on stream `s`.
//
// The "gather_qmv_fast" variant is selected when N is a multiple of 8 and K
// is a multiple of 512; otherwise "gather_qmv" is used. Threadgroups are
// (32, 2, 1) and the grid is (M, ceil(N / 8), B) where
// B = out.size() / M / N.
//
// Buffer order: w, scales, [biases], x, lhs_indices, rhs_indices, out, K, N,
// then the batch shapes/strides (always bound here) followed by the index
// shapes/strides.
void gather_qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& lhs_indices,
    const array& rhs_indices,
    array& out,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  constexpr int bn = 8;
  constexpr int bk = 32;

  const int B = out.size() / M / N;
  const bool fast = N % bn == 0 && K % 512 == 0;

  MTL::Size group_dims(bk, 2, 1);
  MTL::Size grid_dims(M, (N + bn - 1) / bn, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + (fast ? "_gather_qmv_fast_" : "_gather_qmv_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits);

  auto kernel = get_quantized_kernel_wrapped(
      d,
      kname,
      (fast ? "gather_qmv_fast" : "gather_qmv"),
      mode,
      type_string,
      group_size,
      bits);

  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_input_array(lhs_indices, arg++);
  compute_encoder.set_input_array(rhs_indices, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  arg = add_strides_and_shapes(
      compute_encoder, false, x, w, scales, biases, arg);
  add_gather_strides_and_shapes(
      compute_encoder, lhs_indices, rhs_indices, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes the gather "qvm" quantized kernel on stream `s`.
//
// Threadgroups are (32, 2, 1); the grid is (M, ceil(N / bn), B) with
// bn = min(group_size, 32) * 2 and B = out.size() / M / N.
//
// Buffer order: w, scales, [biases], x, lhs_indices, rhs_indices, out, K, N,
// then the batch shapes/strides (always bound here) followed by the index
// shapes/strides.
void gather_qvm(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    const array& lhs_indices,
    const array& rhs_indices,
    array& out,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  constexpr int num_simdgroups = 2;
  constexpr int bk = 32;
  const int bn = std::min(group_size, 32) * num_simdgroups;

  const int B = out.size() / M / N;

  MTL::Size group_dims(bk, num_simdgroups, 1);
  MTL::Size grid_dims(M, (N + bn - 1) / bn, B);

  std::string type_string = get_type_string(x.dtype());
  std::string kname;
  kname.reserve(64);
  concatenate(
      kname,
      mode + "_gather_qvm_",
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits);

  auto kernel = get_quantized_kernel_wrapped(
      d, kname, "gather_qvm", mode, type_string, group_size, bits);
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  int arg = 0;
  compute_encoder.set_input_array(w, arg++);
  compute_encoder.set_input_array(scales, arg++);
  if (biases) {
    compute_encoder.set_input_array(*biases, arg++);
  }
  compute_encoder.set_input_array(x, arg++);
  compute_encoder.set_input_array(lhs_indices, arg++);
  compute_encoder.set_input_array(rhs_indices, arg++);
  compute_encoder.set_output_array(out, arg++);
  compute_encoder.set_bytes(K, arg++);
  compute_encoder.set_bytes(N, arg++);
  arg = add_strides_and_shapes(
      compute_encoder, false, x, w, scales, biases, arg);
  add_gather_strides_and_shapes(
      compute_encoder, lhs_indices, rhs_indices, arg);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

void gather_qmm_rhs_nax(
    const array& x_,
    const array& w_,
    const array& scales_,
    const std::optional<array>& biases_,
    const array& indices_,
    array& out,
    bool transpose,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string mode) {
  // Start by normalizing the indices
  array indices = ensure_row_contiguous(indices_, d, s);

  // Broadcast x with indices. If we are here that means lhs_indices were not
  // provided so the lhs_indices are implied to be the shape of x broadcasted
  // with rhs_indices. We need only broadcast x and copy it as if applying the
  // lhs_indices.
  auto broadcast_with_indices = [&d, &s, &indices](const array& x) {
    if (x.size() / x.shape(-2) / x.shape(-1) == indices.size()) {
      return ensure_row_contiguous(x, d, s);
    }

    auto x_shape = indices.shape();
    x_shape.push_back(x.shape(-2));
    x_shape.push_back(x.shape(-1));
    array new_x(std::move(x_shape), x.dtype(), nullptr, {});
    broadcast(x, new_x);
    return ensure_row_contiguous(new_x, d, s);
  };

  // Normalize the input arrays
  array x = broadcast_with_indices(x_);
  array w = ensure_row_contiguous(w_, d, s);
  array scales = ensure_row_contiguous(scales_, d, s);

  // TODO: Tune the block sizes
  int bm = 64, bn = 64, bk = 64;
  int wm = 2, wn = 2;

  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  // Make the kernel name
  std::string kname;
  kname.reserve(64);
  std::string type_string = get_type_string(x.dtype());
  concatenate(
      kname,
      mode +
          (transpose ? "_gather_qmm_rhs_nax_nt_" : "_gather_qmm_rhs_nax_nn_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      "_bm_",
      bm,
      "_bn_",
      bn,
      "_bk_",
      bk,
      "_wm_",
      wm,
      "_wn_",
      wn);

  metal::MTLFCList func_consts = {
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
      {&align_K, MTL::DataType::DataTypeBool, 202},
  };

  // And the kernel hash that includes the function constants
  std::string hash_name;
  hash_name.reserve(128);
  concatenate(
      hash_name,
      kname,
      "_align_M_",
      align_M ? 't' : 'n',
      "_align_N_",
      align_N ? 't' : 'n',
      "_align_K_",
      align_K ? 't' : 'n');

  // Get and set the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_gather_qmm_nax_kernel(
      d,
      kname,
      hash_name,
      func_consts,
      x,
      group_size,
      bits,
      mode,
      bm,
      bn,
      bk,
      wm,
      wn,
      transpose);
  compute_encoder.set_compute_pipeline_state(kernel);

  MTL::Size group_dims(32, wn, wm);
  MTL::Size grid_dims((N + bn - 1) / bn, (M + bm - 1) / bm, 1);

  int c = 0;
  compute_encoder.set_input_array(x, c++);
  compute_encoder.set_input_array(w, c++);
  compute_encoder.set_input_array(scales, c++);
  if (biases_) {
    array biases = ensure_row_contiguous(*biases_, d, s);
    compute_encoder.set_input_array(biases, c++);
  }
  compute_encoder.set_input_array(indices, c++);
  compute_encoder.set_output_array(out, c++);
  compute_encoder.set_bytes(M, c++);
  compute_encoder.set_bytes(N, c++);
  compute_encoder.set_bytes(K, c++);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encode a gathered quantized matmul where only rhs indices are provided.
//
// When the NAX path is available (and applicable to the dtype / transpose
// setting) the call is forwarded unchanged to gather_qmm_rhs_nax. Otherwise
// the inputs are normalized to row contiguous arrays, a kernel specialized on
// the tile configuration and the M/N/K alignment is fetched, and a grid of
// ceil(N / bn) x ceil(M / bm) threadgroups is dispatched.
//
// Kernel buffer slots are assigned sequentially in this order:
//   x, w, scales, [biases if present], indices, out, M, N, K
void gather_qmm_rhs(
    const array& x_,
    const array& w_,
    const array& scales_,
    const std::optional<array>& biases_,
    const array& indices_,
    array& out,
    bool transpose,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string mode) {
  // Prefer the NAX kernel when available. It is only used for the transposed
  // case, and for float32 inputs only when tf32 is enabled.
  if (metal::is_nax_available() && transpose &&
      (env::enable_tf32() || x_.dtype() != float32)) {
    return gather_qmm_rhs_nax(
        /* const array& x_ = */ x_,
        /* const array& w_ = */ w_,
        /* const array& scales_ = */ scales_,
        /* const std::optional<array>& biases_ = */ biases_,
        /* const array& indices_ = */ indices_,
        /* array& out = */ out,
        /* bool transpose = */ transpose,
        /* int group_size = */ group_size,
        /* int bits = */ bits,
        /* int M = */ M,
        /* int N = */ N,
        /* int K = */ K,
        /* metal::Device& d = */ d,
        /* const Stream& s = */ s,
        /* const std::string mode = */ mode);
  }

  // Start by normalizing the indices
  array indices = ensure_row_contiguous(indices_, d, s);

  // Broadcast x with indices. If we are here that means lhs_indices were not
  // provided so the lhs_indices are implied to be the shape of x broadcasted
  // with rhs_indices. We need only broadcast x and copy it as if applying the
  // lhs_indices.
  auto broadcast_with_indices = [&d, &s, &indices](const array& x) {
    // The number of matrices in x already matches the number of indices, so
    // no broadcast is needed.
    if (x.size() / x.shape(-2) / x.shape(-1) == indices.size()) {
      return ensure_row_contiguous(x, d, s);
    }

    // Target shape is the indices shape followed by the last two dims of x.
    auto x_shape = indices.shape();
    x_shape.push_back(x.shape(-2));
    x_shape.push_back(x.shape(-1));
    array new_x(std::move(x_shape), x.dtype(), nullptr, {});
    broadcast(x, new_x);
    return ensure_row_contiguous(new_x, d, s);
  };

  // Normalize the input arrays
  array x = broadcast_with_indices(x_);
  array w = ensure_row_contiguous(w_, d, s);
  array scales = ensure_row_contiguous(scales_, d, s);

  // TODO: Tune the block sizes
  int bm = 16, bn = 32, bk = 32;
  int wm = 1, wn = 2;

  // Whether each problem dimension is an exact multiple of its block size.
  const bool align_M = (M % bm) == 0;
  const bool align_N = (N % bn) == 0;
  const bool align_K = (K % bk) == 0;

  // Make the kernel name
  std::string kname;
  kname.reserve(64);
  std::string type_string = get_type_string(x.dtype());
  concatenate(
      kname,
      mode + (transpose ? "_gather_qmm_rhs_nt_" : "_gather_qmm_rhs_nn_"),
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits,
      "_bm_",
      bm,
      "_bn_",
      bn,
      "_bk_",
      bk,
      "_wm_",
      wm,
      "_wn_",
      wn);

  // The alignment flags are passed as function constants 200, 201 and 202.
  metal::MTLFCList func_consts = {
      {&align_M, MTL::DataType::DataTypeBool, 200},
      {&align_N, MTL::DataType::DataTypeBool, 201},
      {&align_K, MTL::DataType::DataTypeBool, 202},
  };

  // And the kernel hash that includes the function constants
  std::string hash_name;
  hash_name.reserve(128);
  concatenate(
      hash_name,
      kname,
      "_align_M_",
      align_M ? 't' : 'n',
      "_align_N_",
      align_N ? 't' : 'n',
      "_align_K_",
      align_K ? 't' : 'n');

  // Get and set the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = get_gather_qmm_kernel(
      d,
      kname,
      hash_name,
      func_consts,
      x,
      group_size,
      bits,
      mode,
      bm,
      bn,
      bk,
      wm,
      wn,
      transpose);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Threadgroups are 32 x wn x wm threads; the grid has one threadgroup per
  // (bn x bm) output tile, rounding up for partial tiles.
  MTL::Size group_dims(32, wn, wm);
  MTL::Size grid_dims((N + bn - 1) / bn, (M + bm - 1) / bm, 1);

  // Bind the arguments. `c` is the running buffer slot so that the optional
  // biases shift every later argument by one.
  int c = 0;
  compute_encoder.set_input_array(x, c++);
  compute_encoder.set_input_array(w, c++);
  compute_encoder.set_input_array(scales, c++);
  if (biases_) {
    array biases = ensure_row_contiguous(*biases_, d, s);
    compute_encoder.set_input_array(biases, c++);
  }
  compute_encoder.set_input_array(indices, c++);
  compute_encoder.set_output_array(out, c++);
  compute_encoder.set_bytes(M, c++);
  compute_encoder.set_bytes(N, c++);
  compute_encoder.set_bytes(K, c++);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Select and launch the quantized matrix-vector kernel.
//
// Inner dimensions of exactly 64 or 128 combined with a power-of-two bit
// width go to qmv_quad; every other configuration goes to the generic qmv.
// All arguments are forwarded unchanged to the selected routine.
void dispatch_qmv(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    array& out,
    int group_size,
    int bits,
    int M,
    int N,
    int K,
    metal::Device& d,
    const Stream& s,
    const std::string& mode) {
  const bool small_inner_dim = (K == 128 || K == 64);

  if (small_inner_dim && is_power_of_2(bits)) {
    // Small inner dimension: use the quad kernel.
    qmv_quad(x, w, scales, biases, out, group_size, bits, M, N, K, d, s, mode);
  } else {
    // Everything else: regular qmv.
    qmv(x, w, scales, biases, out, group_size, bits, M, N, K, d, s, mode);
  }
}

// Evaluate a quantized matmul on the GPU.
//
// inputs = {x, w, scales} with an optional fourth entry holding the biases.
// The output buffer is allocated here and the work is routed to one of the
// qmm / qmv / qvm / split-k qvm implementations based on M, K and transpose_.
void QuantizedMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  out.set_data(allocator::malloc(out.nbytes()));

  // The kernels want the trailing two dimensions of every operand to be
  // contiguous (this requirement could be relaxed for x).
  array x = ensure_row_contiguous_matrix(inputs[0], d, s);
  array w = ensure_row_contiguous_matrix(inputs[1], d, s);
  array scales = ensure_row_contiguous_matrix(inputs[2], d, s);
  std::optional<array> biases = std::nullopt;
  if (inputs.size() == 4) {
    biases = ensure_row_contiguous_matrix(inputs[3], d, s);
  }

  // Problem sizes. With a single 2D weight and a row contiguous x, all
  // leading dims of x are folded into M.
  bool non_batched = w.ndim() == 2 && x.flags().row_contiguous;
  int K = x.shape(-1);
  int M = non_batched ? x.size() / K : x.shape(-2);
  int N = out.shape(-1);

  // At or above this many rows we use the matrix-matrix kernel.
  int vector_limit = transpose_ ? get_qmv_batch_limit(K, N, d) : 4;
  auto mode = quantization_mode_to_string(mode_);

  if (M >= vector_limit) {
    // Matrix-matrix product
    qmm(x,
        w,
        scales,
        biases,
        out,
        transpose_,
        group_size_,
        bits_,
        M,
        N,
        K,
        d,
        s,
        mode);
  } else if (transpose_) {
    // Matrix-vector product
    dispatch_qmv(
        x, w, scales, biases, out, group_size_, bits_, M, N, K, d, s, mode);
  } else if (K < 1024) {
    // Vector-matrix product
    qvm(x, w, scales, biases, out, group_size_, bits_, M, N, K, d, s, mode);
  } else {
    // Vector-matrix product with a large K: split K for more parallelism
    qvm_split_k(
        x, w, scales, biases, out, group_size_, bits_, M, N, K, d, s, mode);
  }
}

// Evaluate a gathered quantized matmul on the GPU.
//
// inputs = {x, w, scales, [biases], lhs_indices, rhs_indices}; the biases are
// present only when there are six inputs. The two index arrays are always the
// last two entries.
void GatherQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  out.set_data(allocator::malloc(out.nbytes()));

  array x = ensure_row_contiguous_matrix(inputs[0], d, s);
  array w = ensure_row_contiguous_matrix(inputs[1], d, s);
  array scales = ensure_row_contiguous_matrix(inputs[2], d, s);
  std::optional<array> biases = std::nullopt;
  if (inputs.size() == 6) {
    biases = ensure_row_contiguous_matrix(inputs[3], d, s);
  }
  const array& lhs_indices = inputs[inputs.size() - 2];
  const array& rhs_indices = inputs[inputs.size() - 1];

  // Problem sizes: B is the number of output matrices and E the number of
  // weight matrices.
  int K = x.shape(-1);
  int M = x.shape(-2);
  int N = out.shape(-1);
  int B = out.size() / M / N;
  int E = w.size() / w.shape(-1) / w.shape(-2);
  int vector_limit = transpose_ ? get_qmv_batch_limit(K, N, d) : 4;
  auto mode = quantization_mode_to_string(mode_);

  // When x is walked in order and w is also in order (sorted rhs indices) the
  // individual products can be batched so reads of x and w are reused.
  //
  // TODO: Tune 16 and 4 here a bit better.
  const bool batch_by_rhs =
      M == 1 && B >= 16 && right_sorted_ == true && B / E >= 4;

  if (batch_by_rhs) {
    gather_qmm_rhs(
        x,
        w,
        scales,
        biases,
        rhs_indices,
        out,
        transpose_,
        group_size_,
        bits_,
        x.size() / K,
        N,
        K,
        d,
        s,
        mode);
  } else if (M >= vector_limit) {
    // Matrix-matrix product
    gather_qmm(
        x,
        w,
        scales,
        biases,
        lhs_indices,
        rhs_indices,
        out,
        transpose_,
        group_size_,
        bits_,
        M,
        N,
        K,
        d,
        s,
        mode);
  } else if (transpose_) {
    // Matrix-vector product
    gather_qmv(
        x,
        w,
        scales,
        biases,
        lhs_indices,
        rhs_indices,
        out,
        group_size_,
        bits_,
        M,
        N,
        K,
        d,
        s,
        mode);
  } else {
    // Vector-matrix product
    gather_qvm(
        x,
        w,
        scales,
        biases,
        lhs_indices,
        rhs_indices,
        out,
        group_size_,
        bits_,
        M,
        N,
        K,
        d,
        s,
        mode);
  }
}

// Encode the "<mode>_quantize_dequantize" kernel that reads `in` and writes
// the result to `out`.
//
// in          Input array; copied to a row contiguous layout if needed.
// out         Output array bound to buffer slot 1. Its data must already be
//             allocated by the caller.
// mode        Quantization mode string used as the kernel name prefix.
// group_size  Quantization group size. Each thread handles
//             max(group_size / 32, 1) elements.
// bits        Bit width; only used to select the kernel.
// d, s        Device and stream the kernel is encoded on.
void quantize_dequantize(
    const array& in,
    array& out,
    std::string mode,
    int group_size,
    int bits,
    metal::Device& d,
    const Stream& s) {
  auto& compute_encoder = d.get_command_encoder(s.index);

  auto w = ensure_row_contiguous(in, d, s);
  compute_encoder.set_input_array(w, 0);
  compute_encoder.set_output_array(out, 1);
  auto type_string = get_type_string(in.dtype());
  std::string kname;
  concatenate(
      kname,
      mode + "_quantize_dequantize_",
      type_string,
      "_gs_",
      group_size,
      "_b_",
      bits);
  auto kernel = get_quantized_kernel_wrapped(
      d, kname, "quantize_dequantize", mode, type_string, group_size, bits);

  compute_encoder.set_compute_pipeline_state(kernel);

  // Number of elements processed by each thread and the resulting number of
  // threads to launch.
  constexpr int simd_size = 32;
  int per_thread = std::max(group_size / simd_size, 1);
  size_t nthreads = w.size() / per_thread;

  // Never ask for a threadgroup larger than the total amount of work.
  NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (thread_group_size > nthreads) {
    thread_group_size = nthreads;
  }
  auto group_dims = MTL::Size(thread_group_size, 1, 1);

  // A 1D grid is used unless the thread count exceeds UINT_MAX, in which case
  // the grid is derived from the (per-thread scaled) shape of w.
  bool use_2d = nthreads > UINT_MAX;
  auto grid_shape = w.shape();
  grid_shape.back() /= per_thread;
  MTL::Size grid_dims = use_2d ? get_2d_grid_dims(grid_shape, w.strides())
                               : MTL::Size(nthreads, 1, 1);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Evaluate a matmul where the activations are quantized on the fly.
//
// Only the case of uint32 (already quantized) weights and a single row in x
// is implemented: x is passed through quantize_dequantize and the result is
// multiplied with the quantized weights via dispatch_qmv. Any other
// configuration throws std::runtime_error.
void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  auto mode = quantization_mode_to_string(mode_);
  const bool w_quantized = (inputs[1].dtype() == uint32);
  if (!w_quantized || inputs[0].shape(-2) != 1) {
    throw std::runtime_error("[QQMatmul] NYI for the general case");
  }

  out.set_data(allocator::malloc(out.nbytes()));

  // Check donatability of the input before ensure_row_contiguous is called
  // on it, then also accept a donatable x (e.g. a fresh copy).
  bool donate_x = inputs[0].is_donatable();
  array x = ensure_row_contiguous(inputs[0], d, s);
  donate_x |= x.is_donatable();

  // Write the quantize-dequantized values in place when x can be donated,
  // otherwise into a freshly allocated array of the same shape and type.
  // NOTE(review): a freshly allocated xhat is not registered through
  // d.add_temporary in this function -- confirm its buffer is kept alive
  // until the encoded kernels have run.
  auto xhat =
      donate_x ? x : array(allocator::malloc(x.nbytes()), x.shape(), x.dtype());
  quantize_dequantize(x, xhat, mode, group_size_, bits_, d, s);

  // The last two dims of the weights and the scales must be contiguous.
  array w = ensure_row_contiguous_matrix(inputs[1], d, s);
  array scales = ensure_row_contiguous_matrix(inputs[2], d, s);

  const bool non_batched = w.ndim() == 2;
  int K = x.shape(-1);
  int M = non_batched ? x.size() / K : x.shape(-2);
  int N = out.shape(-1);

  // No biases are passed in this path.
  dispatch_qmv(
      xhat,
      w,
      scales,
      std::nullopt,
      out,
      group_size_,
      bits_,
      M,
      N,
      K,
      d,
      s,
      mode);
}

// Encode either the quantize or the dequantize kernel depending on
// dequantize_.
//
// Dequantize: inputs = {w, scales, [biases]} and outputs = {out}.
//   Buffer slots: w -> 0, scales -> 1, biases -> 2 (affine mode only),
//   out -> 3.
// Quantize: inputs = {w} and outputs = {out, scales, [biases]}.
//   Buffer slots: w -> 0, out -> 1, scales -> 2, biases -> 3 (affine mode
//   only).
//
// All outputs are allocated here.
void fast::Quantize::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& w_pre = inputs[0];
  auto& out = outputs[0];
  out.set_data(allocator::malloc(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& compute_encoder = d.get_command_encoder(s.index);

  auto w = ensure_row_contiguous(w_pre, d, s);
  if (dequantize_) {
    // Scales (and biases in affine mode) are inputs when dequantizing.
    auto scales = ensure_row_contiguous(inputs[1], d, s);
    if (mode_ == QuantizationMode::Affine) {
      auto biases = ensure_row_contiguous(inputs[2], d, s);
      compute_encoder.set_input_array(biases, 2);
    }
    compute_encoder.set_input_array(w, 0);
    compute_encoder.set_input_array(scales, 1);
    compute_encoder.set_output_array(out, 3);
  } else {
    // Scales (and biases in affine mode) are outputs when quantizing.
    auto& scales = outputs[1];
    scales.set_data(allocator::malloc(scales.nbytes()));
    if (mode_ == QuantizationMode::Affine) {
      auto& biases = outputs[2];
      biases.set_data(allocator::malloc(biases.nbytes()));
      compute_encoder.set_output_array(biases, 3);
    }
    compute_encoder.set_input_array(w, 0);
    compute_encoder.set_output_array(out, 1);
    compute_encoder.set_output_array(scales, 2);
  }

  // The kernel is specialized on the floating point side of the conversion:
  // the output type when dequantizing, the input type when quantizing.
  auto type_string = dequantize_ ? get_type_string(out.dtype())
                                 : get_type_string(w_pre.dtype());
  auto mode = quantization_mode_to_string(mode_);
  std::string kname;
  concatenate(
      kname,
      mode + (dequantize_ ? "_dequantize" : "_quantize"),
      "_",
      type_string,
      "_gs_",
      group_size_,
      "_b_",
      bits_);
  auto kernel = get_quantized_kernel_wrapped(
      d,
      kname,
      dequantize_ ? "dequantize" : "quantize",
      mode,
      type_string,
      group_size_,
      bits_);

  compute_encoder.set_compute_pipeline_state(kernel);

  // Treat uint32 as uint8 in kernel
  constexpr int uint8_per_uint32 = 4;
  constexpr int simd_size = 32;
  // 8 for 3 and 5 bits, 4 for 6 bits and 8 / bits otherwise.
  // NOTE(review): presumably the number of values in one packed group --
  // confirm against the kernel.
  int packs_per_int = (bits_ == 3 || bits_ == 5) ? 8
      : bits_ == 6                               ? 4
                                                 : 8 / bits_;
  // Elements handled by each thread and the total number of threads.
  int per_thread =
      dequantize_ ? packs_per_int : std::max(group_size_ / simd_size, 1);
  size_t nthreads =
      dequantize_ ? out.size() / packs_per_int : w.size() / per_thread;

  // Never ask for a threadgroup larger than the total amount of work.
  NS::UInteger thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (thread_group_size > nthreads) {
    thread_group_size = nthreads;
  }
  auto group_dims = MTL::Size(thread_group_size, 1, 1);
  // Fall back to a 2D grid derived from the shape of w when the thread count
  // exceeds UINT_MAX.
  bool use_2d = nthreads > UINT_MAX;
  auto grid_shape = w.shape();
  if (dequantize_) {
    grid_shape.back() *= uint8_per_uint32;
  } else {
    grid_shape.back() /= per_thread;
  }
  MTL::Size grid_dims = use_2d ? get_2d_grid_dims(grid_shape, w.strides())
                               : MTL::Size(nthreads, 1, 1);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Evaluate the FP8 conversion by running the unary op kernel named after this
// primitive on the inputs, writing into the first (and only used) output.
void fast::ConvertFP8::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& out = outputs[0];
  unary_op_gpu(inputs, out, name(), stream());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/reduce.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <algorithm>
#include <cassert>

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/reduce.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

// Host-side description of a row reduction, together with the logic to bind
// it to the reduction kernels (buffer slots 2 through 9).
struct RowReduceArgs {
  // Shape and strides of the input with the reduction axes removed.
  Shape shape;
  Strides strides;
  int ndim;

  // Shape and strides of the reduction axes, excluding the row itself.
  Shape reduce_shape;
  Strides reduce_strides;
  int reduce_ndim;

  // How many rows get combined into one output, i.e. prod(reduce_shape).
  size_t non_row_reductions;

  // Length of the row.
  size_t row_size;

  RowReduceArgs(
      const array& in,
      const ReductionPlan& plan,
      const std::vector<int>& axes) {
    // The last entry of the plan is the row; the remaining entries are the
    // reductions over whole rows.
    reduce_shape = plan.shape;
    reduce_strides = plan.strides;
    row_size = reduce_shape.back();
    reduce_shape.pop_back();
    reduce_strides.pop_back();
    reduce_ndim = reduce_shape.size();

    non_row_reductions = 1;
    for (auto extent : reduce_shape) {
      non_row_reductions *= extent;
    }

    // Layout of the non-reduced part of the input, collapsed where possible.
    std::tie(shape, strides) = shapes_without_reduction_axes(in, axes);
    std::tie(shape, strides) = collapse_contiguous_dims(shape, strides);
    ndim = shape.size();
  }

  void encode(CommandEncoder& compute_encoder) {
    // Empty vectors are not encoded; temporarily pad them with a single 0.
    const bool pad_reduce = (reduce_ndim == 0);
    const bool pad_shape = (ndim == 0);
    if (pad_reduce) {
      reduce_shape.push_back(0);
      reduce_strides.push_back(0);
    }
    if (pad_shape) {
      shape.push_back(0);
      strides.push_back(0);
    }

    compute_encoder.set_bytes(row_size, 2);
    compute_encoder.set_bytes(non_row_reductions, 3);
    compute_encoder.set_vector_bytes(shape, 4);
    compute_encoder.set_vector_bytes(strides, 5);
    compute_encoder.set_bytes(ndim, 6);
    compute_encoder.set_vector_bytes(reduce_shape, 7);
    compute_encoder.set_vector_bytes(reduce_strides, 8);
    compute_encoder.set_bytes(reduce_ndim, 9);

    // Undo the padding so the struct is left as it was.
    if (pad_shape) {
      shape.pop_back();
      strides.pop_back();
    }
    if (pad_reduce) {
      reduce_shape.pop_back();
      reduce_strides.pop_back();
    }
  }
};

// Host-side description of a column (strided) reduction, together with the
// logic to bind it to the reduction kernels (buffer slots 2 through 10).
struct ColReduceArgs {
  // Shape and strides of the input with the reduction axes removed.
  Shape shape;
  Strides strides;
  int ndim;

  // Shape and strides of the reduction axes, excluding the column reduction.
  Shape reduce_shape;
  Strides reduce_strides;
  int reduce_ndim;

  // How many column reductions are performed, i.e. prod(reduce_shape).
  size_t non_col_reductions;

  // Size and stride of the contiguous column reduction.
  size_t reduction_size;
  int64_t reduction_stride;

  ColReduceArgs(
      const array& in,
      const ReductionPlan& plan,
      const std::vector<int>& axes) {
    // The last entry of the plan is the column reduction; the remaining
    // entries are the other reductions.
    reduce_shape = plan.shape;
    reduce_strides = plan.strides;
    reduction_size = reduce_shape.back();
    reduction_stride = reduce_strides.back();
    reduce_shape.pop_back();
    reduce_strides.pop_back();
    reduce_ndim = reduce_shape.size();

    non_col_reductions = 1;
    for (auto extent : reduce_shape) {
      non_col_reductions *= extent;
    }

    // Drop the trailing dims that lie inside the reduction stride. The
    // covered extent is tracked by multiplying shapes rather than reading
    // strides.back(), which could be 0 even though the right number of
    // elements has been removed. Multiplying is safe because the section is
    // contiguous.
    std::tie(shape, strides) = shapes_without_reduction_axes(in, axes);
    for (int64_t covered = 1; !shape.empty() && covered < reduction_stride;) {
      covered *= shape.back();
      shape.pop_back();
      strides.pop_back();
    }
    std::tie(shape, strides) = collapse_contiguous_dims(shape, strides);
    ndim = shape.size();
  }

  /**
   * Build the arguments for reducing the first axis of a row contiguous
   * intermediate array.
   */
  ColReduceArgs(const array& intermediate) {
    assert(intermediate.flags().row_contiguous);

    reduction_size = intermediate.shape(0);
    reduction_stride = intermediate.size() / reduction_size;
    non_col_reductions = 1;
    reduce_ndim = 0;
    ndim = 0;
  }

  void encode(CommandEncoder& compute_encoder) {
    // Empty vectors are not encoded; temporarily pad them with a single 0.
    const bool pad_reduce = (reduce_ndim == 0);
    const bool pad_shape = (ndim == 0);
    if (pad_reduce) {
      reduce_shape.push_back(0);
      reduce_strides.push_back(0);
    }
    if (pad_shape) {
      shape.push_back(0);
      strides.push_back(0);
    }

    compute_encoder.set_bytes(reduction_size, 2);
    compute_encoder.set_bytes(reduction_stride, 3);
    compute_encoder.set_vector_bytes(shape, 4);
    compute_encoder.set_vector_bytes(strides, 5);
    compute_encoder.set_bytes(ndim, 6);
    compute_encoder.set_vector_bytes(reduce_shape, 7);
    compute_encoder.set_vector_bytes(reduce_strides, 8);
    compute_encoder.set_bytes(reduce_ndim, 9);
    compute_encoder.set_bytes(non_col_reductions, 10);

    // Undo the padding so the struct is left as it was.
    if (pad_shape) {
      shape.pop_back();
      strides.pop_back();
    }
    if (pad_reduce) {
      reduce_shape.pop_back();
      reduce_strides.pop_back();
    }
  }
};

} // namespace

// Ceiling division of n by m that yields 0 instead of dividing by zero when
// m == 0.
inline auto safe_div(size_t n, size_t m) {
  if (m == 0) {
    return size_t{0};
  }
  return (n + m - 1) / m;
}

// Round n up to the next multiple of m. Yields 0 when m == 0.
inline auto safe_divup(size_t n, size_t m) {
  const auto n_blocks = safe_div(n, m);
  return n_blocks * m;
}

// True for the two 64-bit integer types (int64 and uint64).
inline bool is_64b_int(Dtype dtype) {
  const bool is_signed_64 = (dtype == int64);
  const bool is_unsigned_64 = (dtype == uint64);
  return is_signed_64 || is_unsigned_64;
}

// True for int64, uint64 and complex64.
inline bool is_64b_dtype(Dtype dtype) {
  if (dtype == complex64) {
    return true;
  }
  return dtype == int64 || dtype == uint64;
}

// Map the number of reduction dims to the dimensionality the kernels are
// specialized for: 1 (for 0 or 1 dims), 2, or 5 for anything larger.
inline int get_kernel_reduce_ndim(int reduce_ndim) {
  if (reduce_ndim > 2) {
    return 5;
  }
  return (reduce_ndim == 2) ? 2 : 1;
}

// Pick the number of threads per threadgroup for reducing a row of the given
// length:
//   - up to 512 elements: 32 threads (a single group of 32)
//   - up to 1024 elements: 128 threads (4 groups of 32)
//   - beyond that: one thread per REDUCE_N_READS elements, rounded up to a
//     multiple of 32 and capped at 1024 threads
inline int threadgroup_size_from_row_size(int row_size) {
  if (row_size > 1024) {
    const int threads_needed =
        (row_size + REDUCE_N_READS - 1) / REDUCE_N_READS;
    const int rounded_to_simd = ((threads_needed + 31) / 32) * 32;
    return std::min(1024, rounded_to_simd);
  }

  return (row_size <= 512) ? 32 : 128;
}

// Compute the 2D grid over the output of a column reduction after removing
// every trailing output dim whose stride is smaller than the reduction
// stride.
inline auto output_grid_for_col_reduce(
    const array& out,
    const ColReduceArgs& args) {
  auto grid_shape = out.shape();
  auto grid_strides = out.strides();

  for (;;) {
    if (grid_shape.empty() ||
        grid_strides.back() >= args.reduction_stride) {
      break;
    }
    grid_shape.pop_back();
    grid_strides.pop_back();
  }

  return get_2d_grid_dims(grid_shape, grid_strides);
}

// Return the {input, output} dtype pair used to pick the reduction kernel
// for `op_name` applied to `in`.
//
//   - "sum" / "prod": integers narrower than 32 bits keep their input type
//     and get a 32-bit output of the same signedness; 32- and 64-bit
//     integers map to themselves; bool_ maps to {int8, int32}; every other
//     type maps to itself.
//   - "and" / "or": the input is read as an integer of the same byte width
//     (bool_ for a single byte) and the output is bool_.
//   - any other op: both types are the dtype of `in`.
//
// Throws std::runtime_error for an integer type that is not listed.
std::pair<Dtype, Dtype> remap_reduce_types(
    const array& in,
    const std::string& op_name) {
  const bool is_accumulation = (op_name == "sum" || op_name == "prod");
  const bool is_logical =
      !is_accumulation && (op_name == "and" || op_name == "or");

  if (is_accumulation && issubdtype(in.dtype(), integer)) {
    switch (in.dtype()) {
      case uint8:
        return {uint8, uint32};
      case uint16:
        return {uint16, uint32};
      case uint32:
        return {uint32, uint32};
      case uint64:
        return {uint64, uint64};
      case int8:
        return {int8, int32};
      case int16:
        return {int16, int32};
      case int32:
        return {int32, int32};
      case int64:
        return {int64, int64};
      default:
        throw std::runtime_error("Unsupported integer type");
    }
  }

  if (is_accumulation && in.dtype() == bool_) {
    return {int8, int32};
  }

  if (is_logical) {
    const auto width = in.dtype().size();
    if (width == 1) {
      return {bool_, bool_};
    }
    if (width == 2) {
      return {int16, bool_};
    }
    if (width == 4) {
      return {int32, bool_};
    }
    return {int64, bool_};
  }

  return {in.dtype(), in.dtype()};
}

// Encode the "init_reduce" kernel for `op_name` over `out`, launching one
// thread per output element. `out` is bound to buffer slot 0.
void init_reduce(
    array& out,
    const std::string& op_name,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  auto [_, out_type] = remap_reduce_types(out, op_name);

  // Build the kernel name and fetch the kernel.
  const std::string func_name = "init_reduce";
  std::string kname = func_name;
  concatenate(kname, "_", op_name, type_to_name(out_type));
  auto kernel = get_reduce_init_kernel(d, kname, func_name, op_name, out_type);

  // One thread per element, with the threadgroup no larger than the grid.
  const size_t nthreads = out.size();
  const NS::UInteger max_threads = kernel->maxTotalThreadsPerThreadgroup();
  const NS::UInteger thread_group_size =
      (max_threads > nthreads) ? nthreads : max_threads;

  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_output_array(out, 0);
  compute_encoder.dispatch_threads(
      MTL::Size(nthreads, 1, 1), MTL::Size(thread_group_size, 1, 1));
}

// Encode a reduction of every element of `in` into `out`.
//
// Inputs of at most REDUCE_N_READS * 1024 elements are reduced by a single
// threadgroup. Larger inputs are reduced in two passes: the first pass writes
// n_rows partial results to a temporary array and the second pass reduces
// those partial results into `out` with a single threadgroup.
//
// Buffer slots for every launch: input -> 0, output -> 1, total number of
// elements -> 2, number of elements per row -> 3.
void all_reduce_dispatch(
    const array& in,
    array& out,
    const std::string& op_name,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  // Set the kernel
  auto [in_type, out_type] = remap_reduce_types(in, op_name);
  const std::string func_name = "all_reduce";
  std::string kname = func_name;
  concatenate(kname, "_", op_name, type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d, kname, func_name, op_name, in_type, out_type, "int64_t");
  compute_encoder.set_compute_pipeline_state(kernel);

  size_t in_size = in.size();

  // Small array so dispatch a single threadgroup
  if (in_size <= REDUCE_N_READS * 1024) {
    // One thread per REDUCE_N_READS elements, rounded up to a multiple of 32.
    int threadgroup_size = (in_size + REDUCE_N_READS - 1) / REDUCE_N_READS;
    threadgroup_size = ((threadgroup_size + 31) / 32) * 32;
    MTL::Size grid_dims(threadgroup_size, 1, 1);

    compute_encoder.set_input_array(in, 0);
    compute_encoder.set_output_array(out, 1);
    compute_encoder.set_bytes(in_size, 2);
    compute_encoder.set_bytes(in_size, 3);
    // The grid and the threadgroup are the same size, i.e. one threadgroup.
    compute_encoder.dispatch_threads(grid_dims, grid_dims);
  }

  // We need multiple threadgroups so we'll do it in 2 passes.
  else {
    int n_rows, threadgroup_2nd_pass;
    // Less than 2**26 bytes
    if (in.nbytes() <= (1 << 26)) {
      n_rows = 32 * REDUCE_N_READS;
      threadgroup_2nd_pass = 32;
    }

    // Really large matrix so parallelize as much as possible
    else {
      n_rows = 1024 * REDUCE_N_READS;
      threadgroup_2nd_pass = 1024;
    }

    // Allocate an intermediate tensor to hold results if needed
    array intermediate({n_rows}, out_type, nullptr, {});
    intermediate.set_data(allocator::malloc(intermediate.nbytes()));
    // Register the intermediate as a temporary of this stream.
    d.add_temporary(intermediate, s.index);

    // 1st pass
    // The input is split into n_rows rows of row_size elements and the grid
    // has one threadgroup per row.
    size_t row_size = (in_size + n_rows - 1) / n_rows;
    int threadgroup_size =
        std::min((row_size + REDUCE_N_READS - 1) / REDUCE_N_READS, 1024ul);
    threadgroup_size = ((threadgroup_size + 31) / 32) * 32;
    MTL::Size grid_dims(threadgroup_size, n_rows, 1);
    MTL::Size group_dims(threadgroup_size, 1, 1);
    compute_encoder.set_input_array(in, 0);
    compute_encoder.set_output_array(intermediate, 1);
    compute_encoder.set_bytes(in_size, 2);
    compute_encoder.set_bytes(row_size, 3);
    compute_encoder.dispatch_threads(grid_dims, group_dims);

    // 2nd pass
    // The kernel is fetched again because the input is now of the output
    // type.
    std::string kname_2nd_pass = func_name;
    concatenate(kname_2nd_pass, "_", op_name, type_to_name(intermediate));
    auto kernel_2nd_pass = get_reduce_kernel(
        d, kname_2nd_pass, func_name, op_name, out_type, out_type, "int64_t");
    compute_encoder.set_compute_pipeline_state(kernel_2nd_pass);
    size_t intermediate_size = n_rows;
    grid_dims = MTL::Size(threadgroup_2nd_pass, 1, 1);
    group_dims = MTL::Size(threadgroup_2nd_pass, 1, 1);
    compute_encoder.set_input_array(intermediate, 0);
    compute_encoder.set_output_array(out, 1);
    compute_encoder.set_bytes(intermediate_size, 2);
    compute_encoder.set_bytes(intermediate_size, 3);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

// Encode the "row_reduce_small" kernel.
//
// The kernel is specialized on the number of reduction dims, the op and the
// input type, and uses size_t indexing ("_large" variant) when the input has
// more than INT32_MAX elements. `in` is bound to slot 0, `out` to slot 1 and
// the remaining arguments are bound by args.encode().
void row_reduce_small(
    const array& in,
    array& out,
    const std::string& op_name,
    RowReduceArgs& args,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  // Pick the kernel
  auto [in_type, out_type] = remap_reduce_types(in, op_name);
  const int n = get_kernel_reduce_ndim(args.reduce_ndim);
  const bool large = in.size() > INT32_MAX;
  const std::string func_name = "row_reduce_small";
  std::string kname = large ? func_name + "_large" : func_name;
  concatenate(
      kname,
      "_",
      std::to_string(n),
      "_reduce_",
      op_name,
      type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      in_type,
      out_type,
      large ? "size_t" : "int",
      n);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Pick the launch configuration. With little work per output the grid is
  // just the output grid; otherwise 32 threads are launched per output.
  const auto out_grid = get_2d_grid_dims(out.shape(), out.strides());
  const bool little_work_per_output =
      (args.non_row_reductions < 32 && args.row_size <= 8) ||
      args.non_row_reductions <= 8;

  MTL::Size grid_dims;
  MTL::Size group_dims;
  if (little_work_per_output) {
    grid_dims = out_grid;
    group_dims =
        MTL::Size((grid_dims.width < 1024) ? grid_dims.width : 1024, 1, 1);
  } else {
    grid_dims = MTL::Size(32, out_grid.width, out_grid.height);
    group_dims = MTL::Size(32, 1, 1);
  }

  // Bind the arguments and launch
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  args.encode(compute_encoder);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Encode the "row_reduce_simple" kernel.
//
// Buffer slots: `in` -> 0, `out` -> 1, row size -> 2, number of outputs -> 3.
// The grid width is the output grid width divided (rounding up) by
// REDUCE_N_WRITES.
void row_reduce_simple(
    const array& in,
    array& out,
    const std::string& op_name,
    RowReduceArgs& args,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  // Pick the kernel
  auto [in_type, out_type] = remap_reduce_types(in, op_name);
  const std::string func_name = "row_reduce_simple";
  std::string kname = func_name;
  concatenate(kname, "_", op_name, type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d, kname, func_name, op_name, in_type, out_type, "size_t");
  compute_encoder.set_compute_pipeline_state(kernel);

  // Scalars passed to the kernel
  size_t row_size = args.row_size;
  size_t out_size = out.size();

  // Launch configuration
  auto out_grid = get_2d_grid_dims(out.shape(), out.strides());
  out_grid.width = (out_grid.width + REDUCE_N_WRITES - 1) / REDUCE_N_WRITES;

  int threads_per_group = threadgroup_size_from_row_size(row_size);
  if (in.itemsize() == 8) {
    // 8-byte inputs are limited to 512 threads per threadgroup.
    threads_per_group = std::min(threads_per_group, 512);
  }
  MTL::Size grid_dims(threads_per_group, out_grid.width, out_grid.height);
  MTL::Size group_dims(threads_per_group, 1, 1);

  // Bind the arguments and launch
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  compute_encoder.set_bytes(row_size, 2);
  compute_encoder.set_bytes(out_size, 3);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Encode the "row_reduce_looped" kernel.
//
// The kernel is specialized on the number of reduction dims, the op and the
// input type, and uses size_t indexing ("_large" variant) when the input has
// more than INT32_MAX elements. One threadgroup, sized from the row length,
// is launched per element of the output grid.
void row_reduce_looped(
    const array& in,
    array& out,
    const std::string& op_name,
    RowReduceArgs& args,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  auto [in_type, out_type] = remap_reduce_types(in, op_name);

  // Pick the kernel
  const int n = get_kernel_reduce_ndim(args.reduce_ndim);
  const bool large = in.size() > INT32_MAX;
  const std::string func_name = "row_reduce_looped";
  std::string kname = large ? func_name + "_large" : func_name;
  concatenate(
      kname,
      "_",
      std::to_string(n),
      "_reduce_",
      op_name,
      type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      in_type,
      out_type,
      large ? "size_t" : "int",
      n);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Launch configuration
  const auto out_grid = get_2d_grid_dims(out.shape(), out.strides());
  const int threads_per_group = threadgroup_size_from_row_size(args.row_size);
  MTL::Size grid_dims(threads_per_group, out_grid.width, out_grid.height);
  MTL::Size group_dims(threads_per_group, 1, 1);

  // Bind the arguments and launch
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  args.encode(compute_encoder);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Build the row reduction arguments and route to one of the three row
// reduction implementations:
//   1. rows of at most 64 elements -> row_reduce_small
//   2. a contiguous reduce with no reductions besides the row and at least
//      32 rows -> row_reduce_simple
//   3. everything else -> row_reduce_looped
void row_reduce_general_dispatch(
    const array& in,
    array& out,
    const std::string& op_name,
    const ReductionPlan& plan,
    const std::vector<int>& axes,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  RowReduceArgs args(in, plan, axes);

  if (args.row_size <= 64) {
    row_reduce_small(in, out, op_name, args, compute_encoder, d, s);
  } else if (
      plan.type == ContiguousReduce && args.reduce_ndim == 0 &&
      in.size() / args.row_size >= 32) {
    row_reduce_simple(in, out, op_name, args, compute_encoder, d, s);
  } else {
    row_reduce_looped(in, out, op_name, args, compute_encoder, d, s);
  }
}

// Encodes the "col_reduce_small" kernel for a strided (column) reduction.
//
// NOTE: `args` is modified in place. The column reduction itself is appended
// as the trailing entry of reduce_shape / reduce_strides and reduce_ndim is
// incremented before the arguments are encoded.
void strided_reduce_small(
    const array& in,
    array& out,
    const std::string& op_name,
    ColReduceArgs& args,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  auto [in_type, out_type] = remap_reduce_types(in, op_name);

  // Fold the column reduction into the generic reduce dimensions
  args.reduce_shape.push_back(args.reduction_size);
  args.reduce_strides.push_back(args.reduction_stride);
  args.reduce_ndim++;

  // Pick the kernel specialization; inputs with more than INT32_MAX elements
  // use the "_large" variant with a wider index type
  const bool large = in.size() > INT32_MAX;
  const int n = get_kernel_reduce_ndim(args.reduce_ndim);
  const std::string func_name = "col_reduce_small";
  std::string kname = func_name;
  kname += large ? "_large" : "";
  concatenate(
      kname,
      "_",
      std::to_string(n),
      "_reduce_",
      op_name,
      type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      in_type,
      out_type,
      large ? "size_t" : "int",
      n);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Threadgroup shape: x spans blocks of n_reads along the reduction stride
  // (at most 32), y is capped by 8, by the pipeline's thread limit and by the
  // total number of reduced elements
  const int n_reads = 4;
  size_t stride_blocks = (args.reduction_stride + n_reads - 1) / n_reads;
  size_t total = args.reduction_size * args.non_col_reductions;
  size_t tg_x = std::min(stride_blocks, 32ul);
  size_t tg_y_limit =
      std::min(kernel->maxTotalThreadsPerThreadgroup() / tg_x, total);
  size_t tg_y = std::min(8ul, tg_y_limit);

  // Grid (in threadgroups): x covers the stride blocks, y/z come from the
  // output grid helper
  auto out_grid = output_grid_for_col_reduce(out, args);
  MTL::Size group_dims = MTL::Size(tg_x, tg_y, 1);
  MTL::Size grid_dims = MTL::Size(
      (stride_blocks + tg_x - 1) / tg_x, out_grid.width, out_grid.height);

  // Bind buffers and launch
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  args.encode(compute_encoder);
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Two-pass strided (column) reduction:
//   1. "col_reduce_longcolumn" reduces the input into an intermediate array of
//      shape (outer_blocks, *out.shape()).
//   2. "col_reduce_looped" reduces that intermediate over its leading
//      outer_blocks axis into `out`.
//
// NOTE: `args` is modified in place (the column reduction is appended to
// reduce_shape / reduce_strides and reduce_ndim is incremented).
void strided_reduce_longcolumn(
    const array& in,
    array& out,
    const std::string& op_name,
    ColReduceArgs& args,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  auto [in_type, out_type] = remap_reduce_types(in, op_name);
  size_t total_reduction_size = args.reduction_size * args.non_col_reductions;
  // Number of partial results per output element: 32 by default, 128 once the
  // total reduction size reaches 32768
  size_t outer_blocks = 32;
  if (total_reduction_size >= 32768) {
    outer_blocks = 128;
  }

  // Prepare the temporary accumulator
  // Shape is (outer_blocks, *out.shape()) with dtype out_type; it is
  // registered with the device as a temporary for stream s.index
  Shape intermediate_shape;
  intermediate_shape.reserve(out.ndim() + 1);
  intermediate_shape.push_back(outer_blocks);
  intermediate_shape.insert(
      intermediate_shape.end(), out.shape().begin(), out.shape().end());
  array intermediate(std::move(intermediate_shape), out_type, nullptr, {});
  intermediate.set_data(allocator::malloc(intermediate.nbytes()));
  d.add_temporary(intermediate, s.index);

  // Prepare the arguments for the kernel
  args.reduce_shape.push_back(args.reduction_size);
  args.reduce_strides.push_back(args.reduction_stride);
  args.reduce_ndim++;

  // Figure out the grid dims
  // threadgroup_x is the reduction stride; threadgroup_y is the per-block
  // share of the total reduction (ceil division), capped at 32
  size_t out_size = out.size();
  size_t threadgroup_x = args.reduction_stride;
  size_t threadgroup_y =
      (args.non_col_reductions * args.reduction_size + outer_blocks - 1) /
      outer_blocks;
  threadgroup_y = std::min(32ul, threadgroup_y);

  auto out_grid_size = output_grid_for_col_reduce(out, args);
  MTL::Size grid_dims(out_grid_size.width, out_grid_size.height, outer_blocks);
  MTL::Size group_dims(threadgroup_x, threadgroup_y, 1);

  // Set the kernel
  // Inputs with more than INT32_MAX elements select the "_large" variant,
  // which is requested with an int64_t index type
  int n = get_kernel_reduce_ndim(args.reduce_ndim);
  std::string func_name = "col_reduce_longcolumn";
  std::string kname = func_name;
  bool large = in.size() > INT32_MAX;
  if (large) {
    kname += "_large";
  }
  concatenate(
      kname,
      "_",
      std::to_string(n),
      "_reduce_",
      op_name,
      type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      in_type,
      out_type,
      large ? "int64_t" : "int",
      n);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Launch
  // First pass writes partial results into `intermediate`; out_size is bound
  // at index 11 in addition to whatever args.encode binds
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(intermediate, 1);
  args.encode(compute_encoder);
  compute_encoder.set_bytes(out_size, 11);
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Make the 2nd pass arguments and grid_dims
  // The second pass reduces over the leading outer_blocks axis, whose stride
  // in the intermediate is out.size()
  ColReduceArgs second_args(intermediate);
  second_args.reduce_shape.push_back(outer_blocks);
  second_args.reduce_strides.push_back(out.size());
  second_args.reduce_ndim++;
  int BN = 32;
  grid_dims = MTL::Size(256 * ((out.size() + BN - 1) / BN), 1, 1);
  group_dims = MTL::Size(256, 1, 1);

  // Set the 2nd kernel
  // Fixed specialization: 1 reduce dim, BM = 32, BN = 32
  func_name = "col_reduce_looped";
  kname = func_name;
  large = intermediate.size() > INT32_MAX;
  if (large) {
    kname += "_large";
  }
  concatenate(kname, "_1_32_32_reduce_", op_name, type_to_name(intermediate));
  kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      intermediate.dtype(),
      out_type,
      large ? "int64_t" : "int",
      1,
      32,
      32);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Second pass: intermediate -> out. Note this uses dispatch_threads (grid
  // given in threads), unlike the first pass above
  compute_encoder.set_input_array(intermediate, 0);
  compute_encoder.set_output_array(out, 1);
  second_args.encode(compute_encoder);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Encodes the single-pass "col_reduce_looped" kernel for a strided (column)
// reduction, specialized on (reduce ndim, BM = 32, BN = 32).
//
// NOTE: `args` is modified in place (the column reduction is appended to
// reduce_shape / reduce_strides and reduce_ndim is incremented).
void strided_reduce_looped(
    const array& in,
    array& out,
    const std::string& op_name,
    ColReduceArgs& args,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  auto [in_type, out_type] = remap_reduce_types(in, op_name);

  // Fold the column reduction into the generic reduce dimensions
  args.reduce_shape.push_back(args.reduction_size);
  args.reduce_strides.push_back(args.reduction_stride);
  args.reduce_ndim++;

  // Tile sizes and launch geometry: one 256-thread group per BN-wide slice of
  // the reduction stride, replicated over the output grid
  const int BN = 32;
  const int BM = 1024 / BN;
  const int threadgroup_size = 8 * 32;
  auto out_grid = output_grid_for_col_reduce(out, args);
  MTL::Size group_dims(threadgroup_size, 1, 1);
  MTL::Size grid_dims(
      threadgroup_size * ((args.reduction_stride + BN - 1) / BN),
      out_grid.width,
      out_grid.height);

  // Resolve the kernel specialization; inputs with more than INT32_MAX
  // elements use the "_large" variant with an int64_t index type
  const bool large = in.size() > INT32_MAX;
  const int n = get_kernel_reduce_ndim(args.reduce_ndim);
  const std::string func_name = "col_reduce_looped";
  std::string kname = func_name;
  kname += large ? "_large" : "";
  concatenate(
      kname,
      "_",
      std::to_string(n),
      "_",
      std::to_string(BM),
      "_",
      std::to_string(BN),
      "_reduce_",
      op_name,
      type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      in_type,
      out_type,
      large ? "int64_t" : "int",
      n,
      BM,
      BN);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Bind buffers and launch
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  args.encode(compute_encoder);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Two-pass strided (column) reduction:
//   1. "col_reduce_2pass" reduces the input into an intermediate array of
//      shape (32, *out.shape()).
//   2. "col_reduce_looped" reduces that intermediate over its leading axis
//      into `out`.
//
// NOTE: `args` is modified in place (the column reduction is appended to
// reduce_shape / reduce_strides and reduce_ndim is incremented).
void strided_reduce_2pass(
    const array& in,
    array& out,
    const std::string& op_name,
    ColReduceArgs& args,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  auto [in_type, out_type] = remap_reduce_types(in, op_name);

  // Prepare the temporary accumulator
  // Shape is (32, *out.shape()) with dtype out_type; the leading 32 matches
  // outer_blocks below. Registered with the device as a temporary
  Shape intermediate_shape;
  intermediate_shape.reserve(out.ndim() + 1);
  intermediate_shape.push_back(32);
  intermediate_shape.insert(
      intermediate_shape.end(), out.shape().begin(), out.shape().end());
  array intermediate(std::move(intermediate_shape), out_type, nullptr, {});
  intermediate.set_data(allocator::malloc(intermediate.nbytes()));
  d.add_temporary(intermediate, s.index);

  // Prepare the arguments for the kernel
  args.reduce_shape.push_back(args.reduction_size);
  args.reduce_strides.push_back(args.reduction_stride);
  args.reduce_ndim++;

  // Figure out the grid dims
  // The grid's y dimension is the output grid width multiplied by
  // outer_blocks; x covers the reduction stride in BN-wide slices with one
  // 256-thread group each
  size_t out_size = out.size() / args.reduction_stride;
  auto out_grid_size = output_grid_for_col_reduce(out, args);
  int outer_blocks = 32;
  int BN = 32;
  int BM = 1024 / BN;
  int threadgroup_size = 8 * 32;
  MTL::Size grid_dims(
      threadgroup_size * ((args.reduction_stride + BN - 1) / BN),
      out_grid_size.width * outer_blocks,
      out_grid_size.height);
  MTL::Size group_dims(threadgroup_size, 1, 1);

  // Set the kernel
  // Inputs with more than INT32_MAX elements select the "_large" variant,
  // which is requested with an int64_t index type
  int n = get_kernel_reduce_ndim(args.reduce_ndim);
  std::string func_name = "col_reduce_2pass";
  std::string kname = func_name;
  bool large = in.size() > INT32_MAX;
  if (large) {
    kname += "_large";
  }
  concatenate(
      kname,
      "_",
      std::to_string(n),
      "_",
      std::to_string(BM),
      "_",
      std::to_string(BN),
      "_reduce_",
      op_name,
      type_to_name(in_type));
  auto kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      in_type,
      out_type,
      large ? "int64_t" : "int",
      n,
      BM,
      BN);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Launch
  // First pass writes partial results into `intermediate`; out_size is bound
  // at index 11 in addition to whatever args.encode binds
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(intermediate, 1);
  args.encode(compute_encoder);
  compute_encoder.set_bytes(out_size, 11);
  compute_encoder.dispatch_threads(grid_dims, group_dims);

  // Make the 2nd pass arguments and grid_dims
  // The second pass reduces over the leading outer_blocks axis, whose stride
  // in the intermediate is out.size(). group_dims is reused from pass one
  ColReduceArgs second_args(intermediate);
  second_args.reduce_shape.push_back(outer_blocks);
  second_args.reduce_strides.push_back(out.size());
  second_args.reduce_ndim++;
  grid_dims = MTL::Size(threadgroup_size * ((out.size() + BN - 1) / BN), 1, 1);

  // Set the 2nd kernel
  // Fixed specialization: 1 reduce dim, BM = 32, BN = 32
  func_name = "col_reduce_looped";
  kname = func_name;
  large = intermediate.size() > INT32_MAX;
  if (large) {
    kname += "_large";
  }
  concatenate(kname, "_1_32_32_reduce_", op_name, type_to_name(intermediate));
  kernel = get_reduce_kernel(
      d,
      kname,
      func_name,
      op_name,
      intermediate.dtype(),
      out_type,
      large ? "int64_t" : "int",
      1,
      32,
      32);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Second pass: intermediate -> out
  compute_encoder.set_input_array(intermediate, 0);
  compute_encoder.set_output_array(out, 1);
  second_args.encode(compute_encoder);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Builds the column-reduce kernel arguments for `in` and forwards to one of
// the four strided-reduce launchers (small, longcolumn, 2pass or looped)
// based on the total column length, the reduction stride and the output size.
void strided_reduce_general_dispatch(
    const array& in,
    array& out,
    const std::string& op_name,
    const ReductionPlan& plan,
    const std::vector<int>& axes,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s) {
  // Kernel arguments derived from the input, the plan and the reduced axes
  ColReduceArgs args(in, plan, axes);

  // Total number of elements reduced into each output element
  const auto column_length = args.reduction_size * args.non_col_reductions;

  if (column_length < 32) {
    // Small column
    strided_reduce_small(in, out, op_name, args, compute_encoder, d, s);
  } else if (args.reduction_stride < 32 && column_length >= 1024) {
    // Long column but small row
    strided_reduce_longcolumn(in, out, op_name, args, compute_encoder, d, s);
  } else if (column_length > 256 && out.size() / 32 < 1024) {
    // Sizeable column with a small output: split the work over two passes
    strided_reduce_2pass(in, out, op_name, args, compute_encoder, d, s);
  } else {
    // Default single-pass kernel
    strided_reduce_looped(in, out, op_name, args, compute_encoder, d, s);
  }
}

// GPU evaluation of a reduction over `axes_`.
//
// Allocates the output, maps the reduce type to a kernel op name, and then
// either initializes the output (empty input) or dispatches to the all / row /
// strided reduce launchers according to the reduction plan.
void Reduce::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  // Local handle: rebound to a contiguous copy below for general reductions
  array in = inputs[0];

  // Identity reductions must have been handled before reaching this point
  assert(!axes_.empty());
  assert(out.size() != in.size());

  // Allocate at least 4 bytes: all reduce kernels use size 4 structs and
  // Metal complains otherwise
  out.set_data(allocator::malloc(std::max(out.nbytes(), 4ul)));

  // Kernel op name; min / max over booleans are expressed as and / or
  const std::string op_name = [&]() -> std::string {
    switch (reduce_type_) {
      case Reduce::And:
        return "and";
      case Reduce::Or:
        return "or";
      case Reduce::Sum:
        return "sum";
      case Reduce::Prod:
        return "prod";
      case Reduce::Min:
        return out.dtype() == bool_ ? "and" : "min";
      case Reduce::Max:
        return out.dtype() == bool_ ? "or" : "max";
    }
    return "";
  }();

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& compute_encoder = d.get_command_encoder(s.index);

  // Nothing to reduce: just initialize the output
  if (in.size() == 0) {
    init_reduce(out, op_name, compute_encoder, d, s);
    return;
  }

  ReductionPlan plan = get_reduction_plan(in, axes_);

  // A general reduce is handled by copying the input to a contiguous array
  // and recomputing the plan.
  //
  // TODO: This can be avoided by making the output have the same strides as
  //       input for the axes with stride smaller than the minimum reduction
  //       stride.
  if (plan.type == GeneralReduce) {
    array in_copy = contiguous_copy_gpu(in, s);
    d.add_temporary(in_copy, s.index);
    in = in_copy;
    plan = get_reduction_plan(in, axes_);
  }

  switch (plan.type) {
    // Reducing over everything and the data is all there: no broadcasting,
    // slicing etc.
    case ContiguousAllReduce:
      all_reduce_dispatch(in, out, op_name, compute_encoder, d, s);
      break;

    // At least the last dimension is row contiguous and we are reducing over
    // the last dim.
    case ContiguousReduce:
    case GeneralContiguousReduce:
      row_reduce_general_dispatch(
          in, out, op_name, plan, axes_, compute_encoder, d, s);
      break;

    // At least the last two dimensions are contiguous and we are doing a
    // strided reduce over these.
    case ContiguousStridedReduce:
    case GeneralStridedReduce:
      strided_reduce_general_dispatch(
          in, out, op_name, plan, axes_, compute_encoder, d, s);
      break;

    // Any other plan type encodes nothing
    default:
      break;
  }
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/reduce.h
================================================
// Copyright @ 2023 - 2024 Apple Inc.

#pragma once

#include "mlx/backend/common/reduce.h"
#include "mlx/backend/metal/device.h"
#include "mlx/stream.h"

namespace mlx::core {

using metal::CommandEncoder;

// Encodes a reduction of `in` into `out` using the reduce op named `op_name`.
// Reduce::eval_gpu calls this when the reduction plan type is
// ContiguousAllReduce.
void all_reduce_dispatch(
    const array& in,
    array& out,
    const std::string& op_name,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s);

// Encodes a row reduction of `in` over `axes` into `out`, choosing between the
// small, simple and looped row-reduce kernels. Reduce::eval_gpu calls this for
// ContiguousReduce and GeneralContiguousReduce plans.
void row_reduce_general_dispatch(
    const array& in,
    array& out,
    const std::string& op_name,
    const ReductionPlan& plan,
    const std::vector<int>& axes,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s);

// Encodes a strided (column) reduction of `in` over `axes` into `out`,
// choosing between the small, longcolumn, 2pass and looped column-reduce
// kernels. Reduce::eval_gpu calls this for ContiguousStridedReduce and
// GeneralStridedReduce plans.
void strided_reduce_general_dispatch(
    const array& in,
    array& out,
    const std::string& op_name,
    const ReductionPlan& plan,
    const std::vector<int>& axes,
    CommandEncoder& compute_encoder,
    metal::Device& d,
    const Stream& s);

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/resident.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/metal/resident.h"

namespace mlx::core::metal {

// Creates the wired residency set on device `d` and requests residency for it.
//
// The set is only created when the device supports the Metal3 GPU family and
// the OS is at least macOS 15 / iOS 18; otherwise wired_set_ stays null and
// the other member functions return without doing anything.
//
// Throws std::runtime_error if the device fails to create the residency set.
ResidencySet::ResidencySet(MTL::Device* d) {
  if (!d->supportsFamily(MTL::GPUFamilyMetal3)) {
    return;
  } else if (__builtin_available(macOS 15, iOS 18, *)) {
    auto pool = new_scoped_memory_pool();
    auto desc = MTL::ResidencySetDescriptor::alloc()->init();
    // Must start out null: the failure path below reads this pointer, and it
    // would be indeterminate if the call does not write an error object.
    NS::Error* error = nullptr;
    wired_set_ = d->newResidencySet(desc, &error);
    desc->release();
    if (!wired_set_) {
      std::ostringstream msg;
      msg << "[metal::Device] Unable to construct residency set.\n";
      if (error) {
        msg << error->localizedDescription()->utf8String() << "\n";
      }
      throw std::runtime_error(msg.str());
    }
    wired_set_->requestResidency();
  }
}

// Tracks `buf`: it is added to the wired set (and committed) when it fits
// within capacity_, otherwise it is remembered in the unwired set.
// No-op when no wired set exists.
void ResidencySet::insert(MTL::Allocation* buf) {
  if (wired_set_ == nullptr) {
    return;
  }

  // Size the wired set would have with `buf` included
  const auto projected_size =
      wired_set_->allocatedSize() + buf->allocatedSize();
  if (projected_size > capacity_) {
    // Over capacity: keep it around unwired for a later resize
    unwired_set_.insert(buf);
    return;
  }

  wired_set_->addAllocation(buf);
  wired_set_->commit();
}

// Stops tracking `buf`: drops it from the unwired set if it is there,
// otherwise removes it from the wired set and commits.
// No-op when no wired set exists.
void ResidencySet::erase(MTL::Allocation* buf) {
  if (wired_set_ == nullptr) {
    return;
  }

  const auto unwired_pos = unwired_set_.find(buf);
  if (unwired_pos == unwired_set_.end()) {
    // Not parked in the unwired set, so take it out of the wired set
    wired_set_->removeAllocation(buf);
    wired_set_->commit();
    return;
  }

  unwired_set_.erase(unwired_pos);
}

// Changes the capacity of the wired set to `size` bytes.
//
// When the wired set is below the new capacity, unwired allocations that
// still fit are moved into it. When it is above, wired allocations are moved
// to the unwired set until the running size is no longer above `size`.
// No-op when no wired set exists or the capacity is unchanged.
void ResidencySet::resize(size_t size) {
  if (wired_set_ == nullptr || capacity_ == size) {
    return;
  }
  capacity_ = size;

  size_t wired_bytes = wired_set_->allocatedSize();
  if (wired_bytes == size) {
    // Already exactly at the new capacity
    return;
  }

  auto pool = new_scoped_memory_pool();
  if (wired_bytes < size) {
    // Promote every unwired allocation that still fits
    auto it = unwired_set_.begin();
    while (it != unwired_set_.end()) {
      const auto buf_bytes = (*it)->allocatedSize();
      if (wired_bytes + buf_bytes <= size) {
        wired_bytes += buf_bytes;
        wired_set_->addAllocation(*it);
        it = unwired_set_.erase(it);
      } else {
        ++it;
      }
    }
  } else {
    // Demote wired allocations until the running size is within capacity
    auto allocations = wired_set_->allAllocations();
    auto num_allocations = wired_set_->allocationCount();
    int i = 0;
    while (i < num_allocations && wired_bytes > size) {
      auto buf = static_cast<const MTL::Allocation*>(allocations->object(i));
      wired_set_->removeAllocation(buf);
      wired_bytes -= buf->allocatedSize();
      unwired_set_.insert(buf);
      ++i;
    }
  }
  wired_set_->commit();
}

// Releases the wired residency set, if one was created.
ResidencySet::~ResidencySet() {
  if (wired_set_ == nullptr) {
    return;
  }
  auto pool = new_scoped_memory_pool();
  wired_set_->release();
}

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/resident.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/device.h"

namespace mlx::core::metal {

// Tracks Metal allocations in a capacity-limited wired residency set.
// Allocations that do not fit within the capacity are remembered in an
// unwired set and can be promoted later by resize(). Non-copyable.
class ResidencySet {
 public:
  // Creates the underlying MTL::ResidencySet when the device / OS support it;
  // otherwise the object stays inert (see wired_set_).
  ResidencySet(MTL::Device* d);
  ~ResidencySet();

  ResidencySet(const ResidencySet&) = delete;
  ResidencySet& operator=(const ResidencySet&) = delete;

  // Returns the underlying residency set; null when none was created.
  const MTL::ResidencySet* mtl_residency_set() {
    return wired_set_;
  }

  // Start / stop tracking an allocation.
  void insert(MTL::Allocation* buf);
  void erase(MTL::Allocation* buf);

  // Set the capacity (in bytes) of the wired set and rebalance allocations
  // between the wired and unwired sets.
  void resize(size_t size);

 private:
  // Null when residency sets are unsupported; all operations are then no-ops.
  MTL::ResidencySet* wired_set_{nullptr};
  // Tracked allocations that are currently not in the wired set.
  std::unordered_set<const MTL::Allocation*> unwired_set_;
  // Maximum allocated size allowed in the wired set; starts at 0.
  size_t capacity_{0};
};

} // namespace mlx::core::metal


================================================
FILE: mlx/backend/metal/rope.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/fast_primitives.h"

namespace mlx::core::fast {

// Divisor used by RoPE::eval_gpu when sizing the third grid dimension of the
// non-"single" kernel: that dimension is B * ceil(N / n_per_thread).
// NOTE(review): presumably each thread handles this many of the N entries;
// confirm against the Metal kernel source.
constexpr int n_per_thread = 4;

// Returns true when stream `s` is on the CPU device.
bool RoPE::use_fallback(Stream s) {
  const bool on_cpu = (s.device == Device::cpu);
  return on_cpu;
}

// Encodes the RoPE Metal kernel.
//
// inputs[0] is the array to rotate, inputs[1] the offset(s), and the optional
// inputs[2] holds explicit frequencies. outputs[0] receives the result.
//
// Depending on layout the input is donated to the output, copied into it, or
// read in place with its own strides. Only the last three (collapsed)
// dimensions' strides are passed to the kernel.
void RoPE::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  assert(outputs.size() == 1);
  auto& in = inputs[0];
  auto& out = outputs[0];

  auto& s = out.primitive().stream();
  auto& d = metal::device(s.device);

  int64_t strides[3];
  int64_t out_strides[3];
  bool donated = false;
  int ndim = in.ndim();
  int B = in.shape(0);
  int T = in.shape(-2);
  int D = in.shape(-1);
  // Widen before multiplying so the product cannot overflow `int` on very
  // large inputs (the `large` kernel variant exists for exactly those).
  size_t mat_size = static_cast<size_t>(T) * D;
  bool large = in.data_size() > INT32_MAX || in.size() > INT32_MAX;

  // Leading singleton dimensions are ignored when deciding whether the input
  // is effectively 3D
  int dispatch_ndim = ndim;
  while (in.shape(-dispatch_ndim) == 1 && dispatch_ndim > 3) {
    dispatch_ndim--;
  }

  // Product of the dimensions between the first and the last two
  int N = 1;
  for (int i = 1; i < (ndim - 2); ++i) {
    N *= in.shape(i);
  }

  // Strides a 4D input would have if its sequence and head dimensions were
  // transposed; computed in 64 bits to match the stride type and to avoid
  // `int` overflow
  const int64_t transposed_seq_stride = static_cast<int64_t>(N) * D;
  const int64_t transposed_batch_stride = transposed_seq_stride * T;

  bool head_seq_transpose = false;

  if (dims_ < D) {
    // Only part of the feature dimension is rotated: copy everything into
    // the output first and run the kernel on the output
    donated = true;
    auto ctype =
        (in.flags().row_contiguous) ? CopyType::Vector : CopyType::General;
    copy_gpu(in, out, ctype, s);
    strides[0] = mat_size;
    strides[1] = out.strides()[ndim - 2];
    strides[2] = out.strides()[ndim - 1];
  } else if (in.flags().row_contiguous) {
    if (in.is_donatable()) {
      donated = true;
      out.copy_shared_buffer(in);
    } else {
      out.set_data(allocator::malloc(out.nbytes()));
    }
    strides[0] = mat_size;
    strides[1] = in.strides()[ndim - 2];
    strides[2] = in.strides()[ndim - 1];
  } else if (dispatch_ndim == 3) {
    // Handle non-contiguous 3D inputs
    out.set_data(allocator::malloc(out.nbytes()));
    strides[0] = in.strides()[ndim - 3];
    strides[1] = in.strides()[ndim - 2];
    strides[2] = in.strides()[ndim - 1];
  } else if (
      ndim == 4 &&
      // batch dim is regularly strided
      in.strides()[0] == transposed_batch_stride &&
      // sequence and head dimensions are transposed
      in.strides()[1] == D && in.strides()[2] == transposed_seq_stride) {
    head_seq_transpose = true;
    out.set_data(allocator::malloc(out.nbytes()));
    strides[0] = in.strides()[1];
    strides[1] = in.strides()[2];
    strides[2] = in.strides()[3];
  } else {
    // Copy non-contiguous > 3D inputs into the output and treat
    // input as donated
    donated = true;
    copy_gpu(in, out, CopyType::General, s);
    strides[0] = mat_size;
    strides[1] = out.strides()[ndim - 2];
    strides[2] = out.strides()[ndim - 1];
  }
  out_strides[0] = mat_size;
  out_strides[1] = out.strides()[ndim - 2];
  out_strides[2] = out.strides()[ndim - 1];

  // Special case for inference (single time step, contiguous, one offset)
  auto& offset = inputs[1];
  bool single = in.flags().row_contiguous && T == 1 && offset.size() == 1;

  // Kernel name selects the variant; the hash name additionally encodes the
  // function constants so each combination is cached separately
  bool with_freqs = inputs.size() == 3;
  std::string kname;
  concatenate(
      kname,
      "rope_",
      single ? "single_" : "",
      (with_freqs) ? "freqs_" : "",
      large ? "large_" : "",
      type_to_name(in));
  std::string hash_name;
  concatenate(
      hash_name,
      kname,
      "_",
      forward_ ? "" : "vjp_",
      traditional_ ? "traditional_" : "",
      head_seq_transpose ? "transpose" : "");
  metal::MTLFCList func_consts = {
      {&forward_, MTL::DataType::DataTypeBool, 1},
      {&traditional_, MTL::DataType::DataTypeBool, 2},
      {&head_seq_transpose, MTL::DataType::DataTypeBool, 3}};

  auto kernel = d.get_kernel(kname, hash_name, func_consts);
  auto& compute_encoder = d.get_command_encoder(s.index);

  float base = std::log2(base_);
  compute_encoder.set_compute_pipeline_state(kernel);
  // When the input was donated / copied, the kernel reads from the output
  compute_encoder.set_input_array(donated ? out : in, 0);
  compute_encoder.set_output_array(out, 1);

  compute_encoder.set_input_array(offset, 2);
  compute_encoder.set_bytes(scale_, 3);

  MTL::Size group_dims;
  MTL::Size grid_dims;
  if (single) {
    // Single time step variant: only the first output stride is passed
    compute_encoder.set_bytes(out_strides, 1, 4);
    uint32_t dim0 = dims_ / 2;
    group_dims = get_block_dims(dim0, N, 1);
    grid_dims = MTL::Size(dim0, N, 1);
  } else {
    compute_encoder.set_bytes(strides, 3, 4);
    compute_encoder.set_bytes(out_strides, 3, 5);
    // A scalar offset is passed with stride 0
    int64_t offset_stride = 0;
    if (offset.ndim() > 0) {
      offset_stride = offset.strides()[0];
    }
    compute_encoder.set_bytes(offset_stride, 6);
    compute_encoder.set_bytes(N, 7);
    uint32_t dim0 = dims_ / 2;
    uint32_t dim1 = T;
    uint32_t dim2 = B * ((N + n_per_thread - 1) / n_per_thread);
    group_dims = get_block_dims(dim0, dim1, dim2);
    grid_dims = MTL::Size(dim0, dim1, dim2);
  }

  // Slot 10 carries either the explicit frequencies or log2(base)
  if (with_freqs) {
    auto& freqs = inputs[2];
    compute_encoder.set_input_array(freqs, 10);
    auto freq_stride = freqs.strides()[0];
    compute_encoder.set_bytes(freq_stride, 11);
  } else {
    compute_encoder.set_bytes(base, 10);
  }
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

} // namespace mlx::core::fast


================================================
FILE: mlx/backend/metal/scaled_dot_product_attention.cpp
================================================
// Copyright © 2024 Apple Inc.
#include <sstream>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/kernels/steel/attn/params.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/fast_primitives.h"
#include "mlx/utils.h"

namespace mlx::core::fast {

namespace {

// Encodes the NAX variant of the steel attention kernel for q, k, v -> o.
//
// Reads q.shape(0..3) as (B, H, qL, D) and k.shape(2) as kL. Uses fixed tiles
// bq = 64, bk = 32 with bd = q.shape(-1), and launches one threadgroup of
// (32, wm, wn) threads per (query block, head, batch).
// `mask` and `sinks` are optional and only bound when present.
void sdpa_full_self_attention_nax(
    const Stream& s,
    metal::Device& d,
    const array& q,
    const array& k,
    const array& v,
    const float scale,
    array& o,
    bool do_causal_,
    const std::optional<array>& mask,
    const std::optional<array>& sinks) {
  using namespace mlx::steel;

  // Values passed to the kernel getter as wm / wn; also the y / z extents of
  // the threadgroup
  int wm = 4;
  int wn = 1;

  // Tile sizes: bd is the head dimension, bq / bk the query / key block sizes
  int bd = q.shape(-1);
  int bq = 64;
  int bk = 32;

  int B = q.shape(0);
  int H = q.shape(1);
  int D = q.shape(3);
  // Ratio of query heads to key heads
  int gqa_factor = q.shape(1) / k.shape(1);

  int qL = q.shape(2);
  int kL = k.shape(2);

  // Function constants. The list below stores pointers to these locals, so
  // they must stay alive until the kernel has been fetched
  const bool align_Q = (qL % bq) == 0;
  const bool align_K = (kL % bk) == 0;
  const bool has_mask = mask.has_value();
  const bool do_causal = do_causal_;
  const bool has_sinks = sinks.has_value();

  metal::MTLFCList func_consts = {
      {&align_Q, MTL::DataType::DataTypeBool, 200},
      {&align_K, MTL::DataType::DataTypeBool, 201},
      {&has_mask, MTL::DataType::DataTypeBool, 300},
      {&do_causal, MTL::DataType::DataTypeBool, 301},
      {&has_sinks, MTL::DataType::DataTypeBool, 302}};

  // Base name identifies the template instantiation; without a mask the mask
  // type falls back to the type of q
  std::string base_name;
  concatenate(
      base_name,
      "steel_attention_",
      type_to_name(q),
      "_bq",
      bq,
      "_bk",
      bk,
      "_bd",
      bd,
      "_wm",
      wm,
      "_wn",
      wn,
      "_mask",
      type_to_name(has_mask ? *mask : q));

  // Hash name additionally encodes every function constant
  std::string hash_name;
  concatenate(
      hash_name,
      base_name,
      "_align_Q_",
      (align_Q ? 't' : 'n'),
      "_align_K_",
      (align_K ? 't' : 'n'),
      "_has_mask_",
      (has_mask ? 't' : 'n'),
      "_do_causal_",
      (do_causal ? 't' : 'n'),
      "_has_sinks_",
      (has_sinks ? 't' : 'n'));

  auto& compute_encoder = d.get_command_encoder(s.index);

  auto kernel = get_steel_attention_nax_kernel(
      d,
      base_name,
      hash_name,
      func_consts,
      q,
      bq,
      bk,
      bd,
      wm,
      wn,
      (has_mask ? *mask : q));

  compute_encoder.set_compute_pipeline_state(kernel);

  // Block counts: NQ / NK round up, the *_aligned counts round down
  const int NQ = (qL + bq - 1) / bq;
  const int NK = (kL + bk - 1) / bk;

  const int NQ_aligned = qL / bq;
  const int NK_aligned = kL / bk;

  // Positional aggregate initialization: the order must match the field order
  // of AttnParams
  AttnParams params{
      /* int B = */ B,
      /* int H = */ H,
      /* int D = */ D,

      /* int qL = */ qL,
      /* int kL = */ kL,

      /* int gqa_factor = */ gqa_factor,
      /* float scale = */ scale,

      /* int NQ = */ NQ,
      /* int NK = */ NK,

      /* int NQ_aligned = */ NQ_aligned,
      /* int NK_aligned = */ NK_aligned,

      /* int qL_rem = */ (qL - NQ_aligned * bq),
      /* int kL_rem = */ (kL - NK_aligned * bk),
      /* int qL_off = */ (kL - qL),

      /* int64_t Q_strides[3] = */ {q.strides(0), q.strides(1), q.strides(2)},
      /* int64_t K_strides[3] = */ {k.strides(0), k.strides(1), k.strides(2)},
      /* int64_t V_strides[3] = */ {v.strides(0), v.strides(1), v.strides(2)},
      /* int64_t O_strides[3] = */ {o.strides(0), o.strides(1), o.strides(2)}};

  // Buffer slots: 0 = q, 1 = k, 2 = v, 3 = o, 4 = params,
  // 5 = mask params, 6 = mask, 7 = sinks
  compute_encoder.set_input_array(q, 0);
  compute_encoder.set_input_array(k, 1);
  compute_encoder.set_input_array(v, 2);
  compute_encoder.set_output_array(o, 3);
  compute_encoder.set_bytes(params, 4);

  if (has_mask) {
    auto& m = *mask;

    AttnMaskParams mask_params{/* int64_t M_strides[3] = */ {
        m.strides(0), m.strides(1), m.strides(2)}};

    compute_encoder.set_bytes(mask_params, 5);
    compute_encoder.set_input_array(m, 6);
  }
  if (has_sinks) {
    compute_encoder.set_input_array(*sinks, 7);
  }

  // One threadgroup per (query block, head, batch)
  MTL::Size grid_dims = MTL::Size(NQ, H, B);
  MTL::Size group_dims = MTL::Size(32, wm, wn);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Encodes the steel attention kernel for q, k, v -> o.
//
// Forwards to sdpa_full_self_attention_nax when NAX is available, the head
// dimension q.shape(3) is not 80, and either TF32 is enabled or q is not
// float32. Otherwise uses tiles bq = 32 and bk = 32 (16 when the head
// dimension is >= 128) and launches one threadgroup of (32, wm, wn) threads
// per (query block, head, batch).
// `mask` and `sinks` are optional and only bound when present.
void sdpa_full_self_attention_metal(
    const Stream& s,
    metal::Device& d,
    const array& q,
    const array& k,
    const array& v,
    const float scale,
    array& o,
    bool do_causal_,
    const std::optional<array>& mask,
    const std::optional<array>& sinks) {
  if (metal::is_nax_available() && q.shape(3) != 80 &&
      (env::enable_tf32() || q.dtype() != float32)) {
    return sdpa_full_self_attention_nax(
        /* const Stream& s = */ s,
        /* metal::Device& d = */ d,
        /* const array& q = */ q,
        /* const array& k = */ k,
        /* const array& v = */ v,
        /* const float scale = */ scale,
        /* array& o = */ o,
        /* bool do_causal_ = */ do_causal_,
        /* const std::optional<array>& mask = */ mask,
        /* const std::optional<array>& sinks = */ sinks);
  }

  using namespace mlx::steel;

  // Values passed to the kernel getter as wm / wn; also the y / z extents of
  // the threadgroup
  int wm = 4;
  int wn = 1;

  // Tile sizes: bd is the head dimension; the key block shrinks to 16 for
  // head dimensions of 128 and above
  int bd = q.shape(-1);
  int bq = 32;
  int bk = bd < 128 ? 32 : 16;

  int B = q.shape(0);
  int H = q.shape(1);
  int D = q.shape(3);
  // Ratio of query heads to key heads
  int gqa_factor = q.shape(1) / k.shape(1);

  int qL = q.shape(2);
  int kL = k.shape(2);

  // Function constants. The list below stores pointers to these locals, so
  // they must stay alive until the kernel has been fetched
  const bool align_Q = (qL % bq) == 0;
  const bool align_K = (kL % bk) == 0;
  const bool has_mask = mask.has_value();
  const bool do_causal = do_causal_;
  const bool has_sinks = sinks.has_value();

  metal::MTLFCList func_consts = {
      {&align_Q, MTL::DataType::DataTypeBool, 200},
      {&align_K, MTL::DataType::DataTypeBool, 201},
      {&has_mask, MTL::DataType::DataTypeBool, 300},
      {&do_causal, MTL::DataType::DataTypeBool, 301},
      {&has_sinks, MTL::DataType::DataTypeBool, 302}};

  // Base name identifies the template instantiation; without a mask the mask
  // type falls back to the type of q
  std::string base_name;
  concatenate(
      base_name,
      "steel_attention_",
      type_to_name(q),
      "_bq",
      bq,
      "_bk",
      bk,
      "_bd",
      bd,
      "_wm",
      wm,
      "_wn",
      wn,
      "_mask",
      type_to_name(has_mask ? *mask : q));

  // Hash name additionally encodes every function constant
  std::string hash_name;
  concatenate(
      hash_name,
      base_name,
      "_align_Q_",
      (align_Q ? 't' : 'n'),
      "_align_K_",
      (align_K ? 't' : 'n'),
      "_has_mask_",
      (has_mask ? 't' : 'n'),
      "_do_causal_",
      (do_causal ? 't' : 'n'),
      "_has_sinks_",
      (has_sinks ? 't' : 'n'));

  auto& compute_encoder = d.get_command_encoder(s.index);

  auto kernel = get_steel_attention_kernel(
      d,
      base_name,
      hash_name,
      func_consts,
      q,
      bq,
      bk,
      bd,
      wm,
      wn,
      (has_mask ? *mask : q));

  compute_encoder.set_compute_pipeline_state(kernel);

  // Block counts: NQ / NK round up, the *_aligned counts round down
  const int NQ = (qL + bq - 1) / bq;
  const int NK = (kL + bk - 1) / bk;

  const int NQ_aligned = qL / bq;
  const int NK_aligned = kL / bk;

  // Positional aggregate initialization: the order must match the field order
  // of AttnParams
  AttnParams params{
      /* int B = */ B,
      /* int H = */ H,
      /* int D = */ D,

      /* int qL = */ qL,
      /* int kL = */ kL,

      /* int gqa_factor = */ gqa_factor,
      /* float scale = */ scale,

      /* int NQ = */ NQ,
      /* int NK = */ NK,

      /* int NQ_aligned = */ NQ_aligned,
      /* int NK_aligned = */ NK_aligned,

      /* int qL_rem = */ (qL - NQ_aligned * bq),
      /* int kL_rem = */ (kL - NK_aligned * bk),
      /* int qL_off = */ (kL - qL),

      /* int64_t Q_strides[3] = */ {q.strides(0), q.strides(1), q.strides(2)},
      /* int64_t K_strides[3] = */ {k.strides(0), k.strides(1), k.strides(2)},
      /* int64_t V_strides[3] = */ {v.strides(0), v.strides(1), v.strides(2)},
      /* int64_t O_strides[3] = */ {o.strides(0), o.strides(1), o.strides(2)}};

  // Buffer slots: 0 = q, 1 = k, 2 = v, 3 = o, 4 = params,
  // 5 = mask params, 6 = mask, 7 = sinks
  compute_encoder.set_input_array(q, 0);
  compute_encoder.set_input_array(k, 1);
  compute_encoder.set_input_array(v, 2);
  compute_encoder.set_output_array(o, 3);
  compute_encoder.set_bytes(params, 4);

  if (has_mask) {
    auto& m = *mask;

    AttnMaskParams mask_params{/* int64_t M_strides[3] = */ {
        m.strides(0), m.strides(1), m.strides(2)}};

    compute_encoder.set_bytes(mask_params, 5);
    compute_encoder.set_input_array(m, 6);
  }
  if (has_sinks) {
    compute_encoder.set_input_array(*sinks, 7);
  }

  // One threadgroup per (query block, head, batch)
  MTL::Size grid_dims = MTL::Size(NQ, H, B);
  MTL::Size group_dims = MTL::Size(32, wm, wn);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

void sdpa_vector(
    const Stream& s,
    metal::Device& d,
    const array& q,
    const array& k,
    const array& v,
    array& out,
    float scale,
    bool do_causal,
    const std::optional<array>& mask,
    const std::optional<array>& sinks) {
  // Set the kernel name
  std::string kname;
  kname.reserve(64);
  kname += "sdpa_vector_";
  kname += get_type_string(q.dtype());
  kname += "_";
  kname += std::to_string(q.shape(-1));
  kname += "_";
  kname += std::to_string(v.shape(-1));

  // Compute the necessary sizes
  int gqa_factor = q.shape(1) / k.shape(1);
  int N = k.shape(2);
  size_t k_head_stride = k.shape(1) == 1 ? k.strides(0) : k.strides(1);
  size_t k_seq_stride = k.strides()[2];
  size_t v_head_stride = v.shape(1) == 1 ? v.strides(0) : v.strides(1);
  size_t v_seq_stride = v.strides()[2];

  MTL::Size group_dims(1024, 1, 1);
  MTL::Size grid_dims(q.shape(0) * q.shape(1), q.shape(2), 1);

  bool has_mask = mask.has_value();
  bool bool_mask = has_mask && (*mask).dtype() == bool_;
  bool float_mask = has_mask && !bool_mask;
  bool query_transposed = !q.flags().row_contiguous;
  bool has_sinks = sinks.has_value();
  metal::MTLFCList func_consts = {
      {&has_mask, MTL::DataType::DataTypeBool, 20},
      {&query_transposed, MTL::DataType::DataTypeBool, 21},
      {&do_causal, MTL::DataType::DataTypeBool, 22},
      {&bool_mask, MTL::DataType::DataTypeBool, 23},
      {&float_mask, MTL::DataType::DataTypeBool, 24},
      {&has_sinks, MTL::DataType::DataTypeBool, 25},
  };
  std::string hash_name = kname;
  hash_name += has_mask ? (bool_mask ? "_boolmask" : "_floatmask") : "_nomask";
  hash_name += query_transposed ? "_qt" : "_qnt";
  hash_name += do_causal ? "_c" : "_nc";
  hash_name += has_sinks ? "_sinks" : "_nosinks";

  // Get the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname, hash_name, func_consts);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Set its arguments
  compute_encoder.set_input_array(q, 0);
  compute_encoder.set_input_array(k, 1);
  compute_encoder.set_input_array(v, 2);
  compute_encoder.set_output_array(out, 3);
  compute_encoder.set_bytes(gqa_factor, 4);
  compute_encoder.set_bytes(N, 5);
  compute_encoder.set_bytes(k_head_stride, 6);
  compute_encoder.set_bytes(k_seq_stride, 7);
  compute_encoder.set_bytes(v_head_stride, 8);
  compute_encoder.set_bytes(v_seq_stride, 9);

  compute_encoder.set_bytes(scale, 10);
  if (has_mask) {
    auto& m = *mask;
    compute_encoder.set_input_array(m, 11 + float_mask);
    int32_t kv_seq_stride = m.shape(3) > 1 ? m.strides(3) : 0;
    int32_t q_seq_stride = m.shape(2) > 1 ? m.strides(2) : 0;
    int32_t head_stride =
        m.shape(1) > 1 ? m.strides(1) : (m.shape(0) > 1 ? m.strides(0) : 0);
    compute_encoder.set_bytes(kv_seq_stride, 13);
    compute_encoder.set_bytes(q_seq_stride, 14);
    compute_encoder.set_bytes(head_stride, 15);
  }
  if (has_sinks) {
    compute_encoder.set_input_array(*sinks, 16);
    compute_encoder.set_bytes(q.shape(1), 17);
  }

  // Launch
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

void sdpa_vector_2pass(
    const Stream& s,
    metal::Device& d,
    const array& q,
    const array& k,
    const array& v,
    array& out,
    float scale,
    bool do_causal,
    const std::optional<array>& mask,
    const std::optional<array>& sinks) {
  // Set the kernel name
  std::string kname;
  kname.reserve(64);
  kname += "sdpa_vector_2pass_1_";
  kname += get_type_string(q.dtype());
  kname += "_";
  kname += std::to_string(q.shape(-1));
  kname += "_";
  kname += std::to_string(v.shape(-1));

  // Compute the necessary sizes
  int gqa_factor = q.shape(1) / k.shape(1);
  int n_simds = gqa_factor * q.shape(2);

  char devc = d.get_architecture().back();
  int N = k.shape(2);
  int blocks;
  if (devc == 's') {
    blocks = 64;
    if (N > 1024 && n_simds > 4) {
      if (N <= 8192) {
        blocks = 128;
      } else if (N <= 32768) {
        blocks = 256;
      } else if (N <= 65536) {
        blocks = 512;
      } else {
        blocks = 1024;
      }
    }
  } else if (devc == 'd') {
    blocks = 128;
    if (n_simds <= 2 && N > 8192) {
      blocks = 256;
    } else if (n_simds >= 6) {
      if (N >= 16384 && N < 65536) {
        blocks = 512;
      } else if (N >= 65536) {
        blocks = 1024;
      }
    }
  } else {
    if (n_simds >= 4) {
      blocks = 64;
    } else {
      blocks = 32;
    }
  }
  size_t k_head_stride = k.shape(1) == 1 ? k.strides(0) : k.strides(1);
  size_t k_seq_stride = k.strides()[2];
  size_t v_head_stride = v.shape(1) == 1 ? v.strides(0) : v.strides(1);
  size_t v_seq_stride = v.strides()[2];
  MTL::Size group_dims(32, gqa_factor, q.shape(2));
  MTL::Size grid_dims(k.shape(1), q.shape(0), blocks);

  // Allocate the intermediates
  Shape intermediate_shape;
  intermediate_shape.reserve(out.ndim() + 1);
  intermediate_shape.insert(
      intermediate_shape.end(), out.shape().begin(), out.shape().end() - 1);
  intermediate_shape.push_back(blocks);
  intermediate_shape.push_back(out.shape().back());
  array intermediate(intermediate_shape, q.dtype(), nullptr, {});
  intermediate_shape.pop_back();
  array sums(intermediate_shape, float32, nullptr, {});
  array maxs(std::move(intermediate_shape), float32, nullptr, {});
  intermediate.set_data(allocator::malloc(intermediate.nbytes()));
  sums.set_data(allocator::malloc(sums.nbytes()));
  maxs.set_data(allocator::malloc(maxs.nbytes()));
  d.add_temporary(intermediate, s.index);
  d.add_temporary(sums, s.index);
  d.add_temporary(maxs, s.index);

  bool has_mask = mask.has_value();
  bool bool_mask = has_mask && (*mask).dtype() == bool_;
  bool float_mask = has_mask && !bool_mask;
  bool query_transposed = !q.flags().row_contiguous;
  bool has_sinks = sinks.has_value();
  metal::MTLFCList func_consts = {
      {&has_mask, MTL::DataType::DataTypeBool, 20},
      {&query_transposed, MTL::DataType::DataTypeBool, 21},
      {&do_causal, MTL::DataType::DataTypeBool, 22},
      {&bool_mask, MTL::DataType::DataTypeBool, 23},
      {&float_mask, MTL::DataType::DataTypeBool, 24},
      {&has_sinks, MTL::DataType::DataTypeBool, 25},
      {&blocks, MTL::DataType::DataTypeInt, 26},
  };
  std::string hash_name = kname;
  hash_name += has_mask ? (bool_mask ? "_boolmask" : "_floatmask") : "_nomask";
  hash_name += query_transposed ? "_qt" : "_qnt";
  hash_name += do_causal ? "_c" : "_nc";
  hash_name += has_sinks ? "_sinks_" : "_nosinks_";
  hash_name += std::to_string(blocks);

  // Get the kernel
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto kernel = d.get_kernel(kname, hash_name, func_consts);
  check_kernel_threadgroup_size(kernel, group_dims, hash_name);

  compute_encoder.set_compute_pipeline_state(kernel);

  // Set its arguments
  compute_encoder.set_input_array(q, 0);
  compute_encoder.set_input_array(k, 1);
  compute_encoder.set_input_array(v, 2);
  compute_encoder.set_output_array(intermediate, 3);
  compute_encoder.set_output_array(sums, 4);
  compute_encoder.set_output_array(maxs, 5);
  compute_encoder.set_bytes(N, 7);
  compute_encoder.set_bytes(k_head_stride, 8);
  compute_encoder.set_bytes(k_seq_stride, 9);
  compute_encoder.set_bytes(v_head_stride, 10);
  compute_encoder.set_bytes(v_seq_stride, 11);
  compute_encoder.set_bytes(scale, 12);
  if (has_mask) {
    auto& m = *mask;
    compute_encoder.set_input_array(m, 13 + float_mask);
    int32_t kv_seq_stride = m.shape(3) > 1 ? m.strides(3) : 0;
    int32_t q_seq_stride = m.shape(2) > 1 ? m.strides(2) : 0;
    int32_t head_stride =
        m.shape(1) > 1 ? m.strides(1) : (m.shape(0) > 1 ? m.strides(0) : 0);
    compute_encoder.set_bytes(kv_seq_stride, 15);
    compute_encoder.set_bytes(q_seq_stride, 16);
    compute_encoder.set_bytes(head_stride, 17);
  }
  if (has_sinks) {
    compute_encoder.set_input_array(*sinks, 18);
  }

  // Launch
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);

  // Final pass
  kname.clear();
  kname = "sdpa_vector_2pass_2_";
  kname += get_type_string(q.dtype());
  kname += "_";
  kname += std::to_string(v.shape(-1));

  // Get the kernel
  kernel = d.get_kernel(kname);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Set its arguments
  compute_encoder.set_input_array(intermediate, 0);
  compute_encoder.set_input_array(sums, 1);
  compute_encoder.set_input_array(maxs, 2);
  compute_encoder.set_output_array(out, 3);
  compute_encoder.set_bytes(blocks, 4);

  // Launch
  group_dims = MTL::Size(1024, 1, 1);
  grid_dims = MTL::Size(q.shape(0) * q.shape(1), q.shape(2), 1);
  check_kernel_threadgroup_size(kernel, group_dims, kname);
  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

} // namespace

// Decides whether the unfused fallback has to be used instead of the fused
// Metal kernels. Returns true when neither the "full" nor the "vector" kernel
// family covers the given configuration.
bool ScaledDotProductAttention::use_fallback(
    const array& q,
    const array& k,
    const array& v,
    bool has_mask,
    bool has_arr_mask,
    bool do_causal,
    bool is_training,
    bool output_logsumexp,
    Stream s) {
  // Training uses the unfused SDPA for both forward and backward on Metal
  // because it is faster; logsumexp output and CPU streams are not handled by
  // the fused path either.
  if (is_training || output_logsumexp || s.device == Device::cpu) {
    return true;
  }

  const int q_head_dim = q.shape(-1);
  const int v_head_dim = v.shape(-1);
  const int q_len = q.shape(2);
  const int k_len = k.shape(2);
  const int gqa_factor = q.shape(1) / k.shape(1);

  // Both kernel families require matching query/value head dimensions.
  const bool same_head_dim = q_head_dim == v_head_dim;

  if (q_len > 8) {
    // Only the full-attention kernels are candidates in this regime.
    const bool head_dim_ok = same_head_dim &&
        (q_head_dim == 64 || q_head_dim == 80 || q_head_dim == 128);
    const bool mask_ok =
        !has_mask || has_arr_mask || (q_len <= k_len && do_causal);
    return !(mask_ok && head_dim_ok);
  }

  // Only the vector kernels are candidates for q_len <= 8.
  const bool head_dim_ok = same_head_dim &&
      (q_head_dim == 64 || q_head_dim == 96 || q_head_dim == 128 ||
       q_head_dim == 256);
  const bool vector_ok =
      (q_len <= k_len) && head_dim_ok && (q_len * gqa_factor) <= 32;
  return !vector_ok;
}

// Always returns true: boolean masks are reported as supported here.
bool ScaledDotProductAttention::supports_bool_mask() {
  return true;
}

// Evaluates scaled dot-product attention on the GPU.
//
// inputs[0..2] are the query, keys and values. When `has_sinks_` is set the
// last input is the sinks array, and any input at index 3 beyond that is an
// array mask. Queries with q.shape(2) <= 8 go to the vector kernels, all
// others to the full-attention kernel. Inputs whose layout a kernel cannot
// consume are copied first; those copies are registered as temporaries on the
// stream at the end.
void ScaledDotProductAttention::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  auto& s = stream();
  auto& d = metal::device(s.device);

  auto& q_pre = inputs[0];
  auto& k_pre = inputs[1];
  auto& v_pre = inputs[2];
  auto& o = outputs[0];

  std::vector<array> copies;

  // Define some copy functions to ensure the layout of the inputs is as
  // expected.
  // copy_unless returns a reference into `copies`, so the capacity is
  // reserved up front to keep those references valid across later
  // push_backs. NOTE(review): this relies on never making more copies than
  // there are inputs — confirm if new copy sites are added.
  copies.reserve(inputs.size());
  auto copy_unless = [&copies, &s](
                         auto predicate, const array& arr) -> const array& {
    if (!predicate(arr)) {
      array arr_copy = contiguous_copy_gpu(arr, s);
      copies.push_back(std::move(arr_copy));
      return copies.back();
    } else {
      return arr;
    }
  };

  // Checks that the headdim dimension has stride 1.
  auto is_matrix_contiguous = [](const array& arr) {
    return arr.strides(-1) == 1;
  };

  // Sinks, when present, are always the last input.
  std::optional<array> sinks = std::nullopt;
  if (has_sinks_) {
    sinks = copy_unless(is_matrix_contiguous, inputs.back());
  }
  // An array mask is present when there is an input besides q, k, v and the
  // optional sinks.
  bool has_arr_mask = inputs.size() > (3 + has_sinks_);

  // We are in vector mode ie single query
  if (q_pre.shape(2) <= 8) {
    // Returns true when the query can be used without a copy.
    auto q_copy_unless = [](const array& arr) {
      if (arr.flags().row_contiguous) {
        return true;
      }
      auto& strides = arr.strides();
      auto& shape = arr.shape();
      if (shape[0] == 1 || shape[1] == 1) {
        // If either the batch or head dimension is a singleton, the other can
        // be transposed with the sequence dimension
        auto bidx = shape[0] == 1 ? 1 : 0;
        return (strides[3] == 1) && (strides[2] == shape[3] * shape[bidx]) &&
            (strides[bidx] == shape[3]);
      }
      return false;
    };

    // Returns true when keys/values can be used without a copy.
    auto kv_copy_unless = [](const array& arr) {
      // keys and values should be copied if:
      // - the last dimension is not contiguous
      // - the batch and head dim are not contiguous
      auto& strides = arr.strides();
      auto& shape = arr.shape();
      if (strides.back() != 1) {
        return false;
      }
      if (shape[0] == 1 || shape[1] == 1) {
        return true;
      }
      return (strides[0] == strides[1] * shape[1]);
    };

    // The query copy is held by value (not placed in `copies` yet) so that it
    // can still be considered for donation below.
    bool q_copied = !q_copy_unless(q_pre);
    array q = (q_copied) ? contiguous_copy_gpu(q_pre, s) : q_pre;
    const auto& k = copy_unless(kv_copy_unless, k_pre);
    const auto& v = copy_unless(kv_copy_unless, v_pre);

    // Donate the query if possible
    // NOTE(review): when no copy was made, `q` is a second handle to q_pre;
    // whether is_donatable() can succeed in that case depends on its
    // ownership rules — confirm.
    if (q.is_donatable() && q.flags().row_contiguous && q.size() == o.size()) {
      o.copy_shared_buffer(q);
    } else {
      // The copy was not donated, so register it with the other temporaries.
      if (q_copied) {
        copies.push_back(q);
      }
      o.set_data(allocator::malloc(o.nbytes()));
    }

    // Returns true when the mask can be used without a copy.
    auto mask_copy_unless = [&q](const array& arr) {
      auto& strides = arr.strides();
      auto& shape = arr.shape();
      return arr.flags().row_contiguous || q.shape(0) == 1 || q.shape(1) == 1 ||
          (strides[0] == strides[1] * shape[1]);
    };

    auto mask = has_arr_mask
        ? std::optional<array>{copy_unless(mask_copy_unless, inputs[3])}
        : std::nullopt;

    // We route to the 2 pass fused attention if
    // - The device is large and the sequence length long
    // - The sequence length is even longer and we have gqa
    // Causal masking is dropped when there is a single query position.
    bool do_causal = do_causal_ && q.shape(2) > 1;
    char devc = d.get_architecture().back();
    if (((devc == 'd' || devc == 's') && k.shape(2) >= 1024) ||
        (k.shape(1) < q.shape(1) && k.shape(2) >= 4096)) {
      sdpa_vector_2pass(s, d, q, k, v, o, scale_, do_causal, mask, sinks);
    } else {
      sdpa_vector(s, d, q, k, v, o, scale_, do_causal, mask, sinks);
    }
  }

  // Full attention mode
  else {
    const auto& q = copy_unless(is_matrix_contiguous, q_pre);
    const auto& k = copy_unless(is_matrix_contiguous, k_pre);
    const auto& v = copy_unless(is_matrix_contiguous, v_pre);

    // Output strides for axes (0, 1, 2, 3), named B, H, L, D here: the last
    // axis is dense, axis 1 steps by o.shape(3), axis 2 steps by
    // o.shape(1) * o.shape(3), so in memory axis 2 is outside axis 1.
    int64_t str_oD = 1;
    int64_t str_oH = o.shape(3);
    int64_t str_oL = o.shape(1) * str_oH;
    int64_t str_oB = o.shape(2) * str_oL;
    size_t data_size = o.shape(0) * str_oB;

    // The buffer has no gaps but is neither row- nor column-major with
    // respect to the logical shape.
    array::Flags flags{
        /* bool contiguous = */ 1,
        /* bool row_contiguous = */ 0,
        /* bool col_contiguous = */ 0,
    };

    o.set_data(
        allocator::malloc(o.nbytes()),
        data_size,
        {str_oB, str_oH, str_oL, str_oD},
        flags);

    auto mask = has_arr_mask
        ? std::optional<array>{copy_unless(is_matrix_contiguous, inputs[3])}
        : std::nullopt;

    sdpa_full_self_attention_metal(
        s, d, q, k, v, scale_, o, do_causal_, mask, sinks);
  }

  // Hand all layout copies to the device as temporaries for this stream.
  d.add_temporaries(std::move(copies), s.index);
}

// Always requests the fallback; neither argument is inspected.
bool ScaledDotProductAttentionVJP::use_fallback(const array& q, Stream s) {
  return true;
}

// Not implemented: unconditionally throws std::runtime_error("NYI").
void ScaledDotProductAttentionVJP::eval_gpu(
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  throw std::runtime_error("NYI");
}

} // namespace mlx::core::fast


================================================
FILE: mlx/backend/metal/scan.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cassert>
#include <sstream>

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/scan.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Encodes a scan (prefix reduction) of `in` along `axis` into `out`.
//
// The "contig_" kernel is used when the scanned axis has stride 1, the
// "strided_" kernel otherwise. The output buffer must already be set on
// `out` by the caller.
void scan_gpu_inplace(
    array in,
    array& out,
    Scan::ReduceType reduce_type,
    int axis,
    bool reverse,
    bool inclusive,
    const Stream& s) {
  auto& d = metal::device(s.device);

  const bool contiguous = in.strides()[axis] == 1;

  // Name fragment for the reduction; left empty for an unlisted value.
  const std::string reduce_type_str = [reduce_type]() -> std::string {
    switch (reduce_type) {
      case Scan::Sum:
        return "sum";
      case Scan::Prod:
        return "prod";
      case Scan::Max:
        return "max";
      case Scan::Min:
        return "min";
      case Scan::LogAddExp:
        return "logaddexp";
    }
    return "";
  }();

  const char* layout_prefix = contiguous ? "contig_" : "strided_";
  const char* direction = reverse ? "reverse_" : "";
  const char* bounds = inclusive ? "inclusive_" : "exclusive_";

  std::string kname;
  concatenate(
      kname,
      layout_prefix,
      "scan_",
      direction,
      bounds,
      reduce_type_str,
      "_",
      type_to_name(in),
      "_",
      type_to_name(out));

  auto kernel =
      get_scan_kernel(d, kname, reverse, inclusive, reduce_type_str, in, out);

  // Arguments shared by both kernels: input, output and the scan length.
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  const size_t size = in.shape(axis);
  compute_encoder.set_bytes(size, 2);

  // Elements up to 4 bytes wide use 4 reads per thread, wider ones 2.
  const int n_reads = (in.itemsize() <= 4) ? 4 : 2;

  int thread_group_size;
  MTL::Size outer_dims;
  if (contiguous) {
    constexpr int simd_size = 32;
    const int elements_per_simd = n_reads * simd_size;
    const int max_threads =
        static_cast<int>(kernel->maxTotalThreadsPerThreadgroup());

    // Short rows get just enough simdgroups to cover the row (or half of it
    // for medium rows); longer rows use the maximum group size.
    thread_group_size = max_threads;
    if (size <= n_reads * 1024) {
      thread_group_size =
          ((size + elements_per_simd - 1) / elements_per_simd) * simd_size;
    } else if (size <= n_reads * 2048) {
      thread_group_size =
          ((size / 2 + elements_per_simd - 1) / elements_per_simd) * simd_size;
    }
    thread_group_size = std::min(thread_group_size, max_threads);

    outer_dims = get_2d_grid_dims(in.shape(), in.strides(), /*divisor=*/size);
  } else {
    const size_t stride = in.strides()[axis];
    const int bn = 32;
    const size_t stride_blocks = (stride + bn - 1) / bn;
    compute_encoder.set_bytes(stride, 3);
    compute_encoder.set_bytes(stride_blocks, 4);

    const int n_simdgroups = bn / n_reads;
    thread_group_size = n_simdgroups * 32;

    outer_dims =
        get_2d_grid_dims(in.shape(), in.strides(), /*divisor=*/size * stride);
    // Fold the stride blocks into whichever grid dimension still has room.
    if (outer_dims.width * stride_blocks <= UINT_MAX) {
      outer_dims.width *= stride_blocks;
    } else {
      outer_dims.height *= stride_blocks;
    }
  }

  MTL::Size grid_dims(thread_group_size, outer_dims.width, outer_dims.height);
  MTL::Size group_dims(thread_group_size, 1, 1);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

// Prepares the output buffer for a scan and encodes the kernel.
//
// - If the input is contiguous and has a non-zero stride along the scanned
//   axis, the output either takes over the input buffer (when the input is
//   donatable and the item sizes match) or gets a fresh buffer with the
//   input's strides and flags.
// - Otherwise the input is copied to a contiguous array and the output
//   shares that copy's buffer.
void Scan::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  // Query donation through the caller-owned reference *before* a local
  // handle is created. The local `in` below is an additional handle to the
  // same array, so asking `in.is_donatable()` would (assuming donation
  // requires the array to have no other live handles) always report false
  // and the donation branch would never be taken.
  const bool donatable = inputs[0].is_donatable();

  auto in = inputs[0];
  if (in.flags().contiguous && in.strides()[axis_] != 0) {
    if (donatable && in.itemsize() == out.itemsize()) {
      // Scan directly into the input's buffer.
      out.copy_shared_buffer(in);
    } else {
      out.set_data(
          allocator::malloc(in.data_size() * out.itemsize()),
          in.data_size(),
          in.strides(),
          in.flags());
    }
  } else {
    // The copy is private to this call, so the scan can run in its buffer.
    in = contiguous_copy_gpu(in, stream());
    out.copy_shared_buffer(in);
  }

  scan_gpu_inplace(
      in, out, reduce_type_, axis_, reverse_, inclusive_, stream());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/slicing.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <numeric>

#include "mlx/backend/common/compiled.h"
#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/gpu/slicing.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"

namespace mlx::core {

void concatenate_gpu(
    const std::vector<array>& inputs,
    array& out,
    int axis,
    const Stream& s) {
  std::vector<int> sizes;
  sizes.push_back(0);
  for (auto& p : inputs) {
    sizes.push_back(p.shape(axis));
  }
  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());

  out.set_data(allocator::malloc(out.nbytes()));

  auto strides = out.strides();
  auto flags = out.flags();
  flags.row_contiguous = false;
  flags.col_contiguous = false;
  flags.contiguous = false;
  auto& d = metal::device(s.device);
  auto& compute_encoder = d.get_command_encoder(s.index);
  auto concurrent_ctx = compute_encoder.start_concurrent();
  for (int i = 0; i < inputs.size(); i++) {
    array out_slice(inputs[i].shape(), out.dtype(), nullptr, {});
    size_t data_offset = strides[axis] * sizes[i];
    out_slice.copy_shared_buffer(
        out, strides, flags, out_slice.size(), data_offset);
    copy_gpu_inplace(inputs[i], out_slice, CopyType::GeneralGeneral, s);
  }
}

// Computes sum_i indices[i] * strides[axes[i]] on the device and returns it
// as a one-element int64 array. The kernel is generated per index dtype and
// launched with a single thread.
array compute_dynamic_offset(
    const array& indices,
    const Strides& strides,
    const std::vector<int>& axes,
    const Stream& s) {
  auto& d = metal::device(s.device);

  // Kernel to compute offset here.
  array offset({1}, int64, nullptr, {});

  // Reuse the indices buffer for the result when it is donatable and large
  // enough to hold one int64; otherwise allocate a new one.
  const bool indices_fit =
      (indices.data_size() * indices.itemsize()) >= offset.itemsize();
  if (indices.is_donatable() && indices_fit) {
    offset.copy_shared_buffer(indices);
  } else {
    offset.set_data(allocator::malloc(offset.itemsize()));
  }
  d.add_temporary(offset, s.index);

  // One library (and kernel of the same name) per index dtype.
  auto dtype = indices.dtype();
  std::string lib_name = "compute_dynamic_offset_" + type_to_name(dtype);
  auto lib = d.get_library(lib_name, [dtype]() {
    return fmt::format(
        R"(
        [[kernel]] void compute_dynamic_offset_{0}(
            constant const {1}* indices [[buffer(0)]],
            device int64_t& offset [[buffer(1)]],
            constant const int64_t* strides [[buffer(2)]],
            constant const int* axes [[buffer(3)]],
            constant const int& n_axes [[buffer(4)]],
            uint index [[thread_position_in_grid]]) {{
          int64_t acc = 0;
          for (int i = 0; i < n_axes; ++i) {{
            acc += indices[i] * strides[axes[i]];
          }}
          offset = acc;
        }})",
        type_to_name(dtype),
        get_type_string(dtype));
  });
  auto kernel = d.get_kernel(lib_name, lib);

  // Bind arguments in the order declared by the kernel signature above.
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(indices, 0);
  compute_encoder.set_output_array(offset, 1);
  compute_encoder.set_vector_bytes(strides, 2);
  compute_encoder.set_vector_bytes(axes, 3);
  int n_axes = axes.size();
  compute_encoder.set_bytes(n_axes, 4);

  // A single thread does the whole accumulation.
  const MTL::Size single_thread(1, 1, 1);
  compute_encoder.dispatch_threads(single_thread, single_thread);
  return offset;
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/softmax.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/kernels/defines.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Rows with more elements than this use the "looped_" softmax kernel instead
// of the "block_" kernel.
constexpr int SOFTMAX_LOOPED_LIMIT = 4096;

// Encodes a softmax over the last axis of inputs[0] into `out`.
//
// Throws std::runtime_error when the output dtype is not floating point.
// Rows up to SOFTMAX_LOOPED_LIMIT elements use the "block_" kernel with a
// threadgroup sized to the row; longer rows use the "looped_" kernel with
// the kernel's maximum threadgroup size.
void Softmax::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  if (!issubdtype(out.dtype(), floating)) {
    throw std::runtime_error(
        "[softmax] Does not support non-floating point types.");
  }
  auto& s = stream();
  auto& d = metal::device(s.device);

  // Returns the array the kernel should read from and sets the output
  // buffer: an input that is not contiguous with a dense last axis is
  // copied and the output shares the copy's buffer; otherwise the output
  // takes over the input buffer when donatable, or gets a fresh buffer with
  // the input's layout.
  auto bind_output = [&s, &out](const array& x) -> array {
    const bool last_axis_dense =
        x.flags().contiguous && x.strides()[x.ndim() - 1] == 1;
    if (!last_axis_dense) {
      array x_copy = contiguous_copy_gpu(x, s);
      out.copy_shared_buffer(x_copy);
      return x_copy;
    }
    if (x.is_donatable()) {
      out.copy_shared_buffer(x);
    } else {
      out.set_data(
          allocator::malloc(x.data_size() * x.itemsize()),
          x.data_size(),
          x.strides(),
          x.flags());
    }
    return x;
  };

  const array in = bind_output(inputs[0]);

  int axis_size = in.shape().back();
  int n_rows = in.data_size() / axis_size;

  const int simd_size = 32;
  const int n_reads = SOFTMAX_N_READS;
  const int looped_limit = SOFTMAX_LOOPED_LIMIT;
  const bool looped = axis_size > looped_limit;

  // Kernel name: {looped_|block_}softmax_[precise_]<dtype>
  std::string kernel_name = looped ? "looped_" : "block_";
  kernel_name += "softmax_";
  if (in.dtype() != float32 && precise_) {
    kernel_name += "precise_";
  }
  kernel_name += type_to_name(out);

  auto kernel = get_softmax_kernel(d, kernel_name, precise_, out);
  auto& compute_encoder = d.get_command_encoder(s.index);

  // One threadgroup per row
  size_t threadgroup_size;
  if (looped) {
    threadgroup_size = kernel->maxTotalThreadsPerThreadgroup();
  } else {
    // Enough threads for n_reads elements each, rounded up to whole
    // simdgroups.
    size_t threadgroup_needed = (axis_size + n_reads - 1) / n_reads;
    size_t simds_needed = (threadgroup_needed + simd_size - 1) / simd_size;
    threadgroup_size = simd_size * simds_needed;
    assert(threadgroup_size <= kernel->maxTotalThreadsPerThreadgroup());
  }
  size_t n_threads = n_rows * threadgroup_size;
  MTL::Size grid_dims = MTL::Size(n_threads, 1, 1);
  MTL::Size group_dims = MTL::Size(threadgroup_size, 1, 1);

  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  compute_encoder.set_bytes(axis_size, 2);
  compute_encoder.dispatch_threads(grid_dims, group_dims);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/sort.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <algorithm>

#include "mlx/backend/gpu/copy.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace {

void single_block_sort(
    const Stream& s,
    metal::Device& d,
    const array& in,
    array& out,
    int axis,
    int bn,
    int tn,
    bool argsort) {
  // Prepare shapes
  int n_rows = in.size() / in.shape(axis);

  auto in_nc_str = in.strides();
  in_nc_str.erase(in_nc_str.begin() + axis);

  auto out_nc_str = out.strides();
  out_nc_str.erase(out_nc_str.begin() + axis);

  auto nc_shape = in.shape();
  nc_shape.erase(nc_shape.begin() + axis);

  int nc_dim = nc_shape.size();

  int size_sorted_axis = in.shape(axis);
  int in_stride_sorted_axis = in.strides()[axis];
  int out_stride_sorted_axis = out.strides()[axis];

  // We can only use the contiguous kernel if the sorted axis
  // has the largest or smallest stride.
  // We also need the input to be contiguous
  bool contiguous = in.flags().contiguous;
  auto check_strides = [](array x, int sort_stride) {
    int min_stride = *std::min_element(x.strides().begin(), x.strides().end());
    int max_stride = *std::max_element(x.strides().begin(), x.strides().end());
    return sort_stride == min_stride || sort_stride == max_stride;
  };
  contiguous &= check_strides(in, in_stride_sorted_axis);
  contiguous &= check_strides(out, out_stride_sorted_axis);

  // Prepare kernel name
  std::ostringstream kname;
  kname << (contiguous ? "c" : "nc");
  if (argsort) {
    kname << "arg";
  }

  kname << "_block_sort_" << type_to_name(in) << "_" << type_to_name(out)
        << "_bn" << bn << "_tn" << tn;
  auto kernel = get_sort_kernel(d, kname.str(), in, out, bn, tn);

  // Prepare command encoder
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);

  // Set inputs
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  compute_encoder.set_bytes(size_sorted_axis, 2);
  compute_encoder.set_bytes(in_stride_sorted_axis, 3);
  compute_encoder.set_bytes(out_stride_sorted_axis, 4);

  if (contiguous) {
    int in_stride_segment_axis = INT32_MAX;
    int out_stride_segment_axis = INT32_MAX;
    for (int i = 0; i < in_nc_str.size(); i++) {
      if (nc_shape[i] == 1) {
        continue;
      }
      if (in_nc_str[i] > INT32_MAX || out_nc_str[i] > INT32_MAX) {
        throw std::runtime_error("[Sort::eval_gpu] Stride too large.");
      }
      in_stride_segment_axis =
          std::min(in_stride_segment_axis, static_cast<int>(in_nc_str[i]));
      out_stride_segment_axis =
          std::min(out_stride_segment_axis, static_cast<int>(out_nc_str[i]));
    }
    compute_encoder.set_bytes(in_stride_segment_axis, 5);
    compute_encoder.set_bytes(out_stride_segment_axis, 6);
  } else {
    compute_encoder.set_bytes(nc_dim, 5);
    if (nc_shape.empty()) {
      int shape = 0;
      int64_t stride = 0;
      compute_encoder.set_bytes(shape, 6);
      compute_encoder.set_bytes(stride, 7);
      compute_encoder.set_bytes(stride, 8);
    } else {
      compute_encoder.set_vector_bytes(nc_shape, 6);
      compute_encoder.set_vector_bytes(in_nc_str, 7);
      compute_encoder.set_vector_bytes(out_nc_str, 8);
    }
  }

  MTL::Size group_dims = MTL::Size(bn, 1, 1);
  MTL::Size grid_dims = MTL::Size(1, n_rows, 1);

  compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
}

// Sort (or argsort) `in` along `axis` when the sorted axis does not fit in a
// single threadgroup. The work is encoded in three stages:
//   1. a block-wise sort that writes values and indices to scratch buffers,
//   2. a sequence of partition + merge passes that ping-pong between two
//      pairs of scratch buffers, doubling `merge_tiles` each pass,
//   3. a strided copy of the final values (or indices when `argsort` is
//      true) into `out`.
// `bn` is the threadgroup width, `tn` the per-thread element count used in
// the kernel names, and `n_blocks` the number of blocks per sorted row.
void multi_block_sort(
    const Stream& s,
    metal::Device& d,
    const array& in,
    array& out,
    int axis,
    int bn,
    int tn,
    int n_blocks,
    bool argsort) {
  // Prepare shapes
  // Number of independent slices along the sorted axis
  int n_rows = in.size() / in.shape(axis);

  // Strides and shape of the input with the sorted axis removed
  auto nc_str = in.strides();
  nc_str.erase(nc_str.begin() + axis);

  auto nc_shape = in.shape();
  nc_shape.erase(nc_shape.begin() + axis);

  int nc_dim = nc_shape.size();

  // 1-D input: use one-element placeholders (presumably so the vectors bound
  // below are never empty); nc_dim stays 0
  if (nc_dim == 0) {
    nc_shape = {0};
    nc_str = {1};
  }

  int size_sorted_axis = in.shape(axis);
  int stride_sorted_axis = in.strides()[axis];

  // Make temporary copies
  // Two value buffers and two index buffers, laid out as
  // (n_rows, size_sorted_axis), used alternately by the merge passes
  array dev_vals_0({n_rows, size_sorted_axis}, in.dtype(), nullptr, {});
  array dev_vals_1({n_rows, size_sorted_axis}, in.dtype(), nullptr, {});

  array dev_idxs_0({n_rows, size_sorted_axis}, uint32, nullptr, {});
  array dev_idxs_1({n_rows, size_sorted_axis}, uint32, nullptr, {});

  // n_blocks + 1 partition entries per row
  array block_partitions({n_rows, n_blocks + 1}, uint32, nullptr, {});

  // Do allocations
  dev_vals_0.set_data(allocator::malloc(dev_vals_0.nbytes()));
  dev_vals_1.set_data(allocator::malloc(dev_vals_1.nbytes()));
  dev_idxs_0.set_data(allocator::malloc(dev_idxs_0.nbytes()));
  dev_idxs_1.set_data(allocator::malloc(dev_idxs_1.nbytes()));
  block_partitions.set_data(allocator::malloc(block_partitions.nbytes()));

  // Collected here so they can be handed to the device as temporaries at the
  // end of the function
  std::vector<array> copies = {
      dev_vals_0, dev_vals_1, dev_idxs_0, dev_idxs_1, block_partitions};

  // Prepare command encoder
  auto& compute_encoder = d.get_command_encoder(s.index);

  // Do blockwise sort
  {
    std::ostringstream kname;
    kname << "sort_mbsort_" << type_to_name(dev_vals_0) << "_"
          << type_to_name(dev_idxs_0) << "_bn" << bn << "_tn" << tn;
    auto kernel =
        get_mb_sort_kernel(d, kname.str(), dev_vals_0, dev_idxs_0, bn, tn);
    compute_encoder.set_compute_pipeline_state(kernel);

    compute_encoder.set_input_array(in, 0);
    compute_encoder.set_output_array(dev_vals_0, 1);
    compute_encoder.set_output_array(dev_idxs_0, 2);
    compute_encoder.set_bytes(size_sorted_axis, 3);
    compute_encoder.set_bytes(stride_sorted_axis, 4);
    compute_encoder.set_bytes(nc_dim, 5);
    compute_encoder.set_vector_bytes(nc_shape, 6);
    compute_encoder.set_vector_bytes(nc_str, 7);

    // One threadgroup of `bn` threads per (block, row) pair
    MTL::Size group_dims = MTL::Size(bn, 1, 1);
    MTL::Size grid_dims = MTL::Size(n_blocks, n_rows, 1);

    compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
  }

  // Do merges
  // `ping` selects which buffer pair is the input of the current pass; the
  // *_out arrays left after the loop hold the final result
  bool ping = false;
  array dev_vals_in = dev_vals_0;
  array dev_idxs_in = dev_idxs_0;
  array dev_vals_out = dev_vals_1;
  array dev_idxs_out = dev_idxs_1;

  // Threadgroup width for the partition kernel: n_blocks + 1, capped at 1024
  int n_thr_per_group = (n_blocks + 1) < 1024 ? (n_blocks + 1) : 1024;

  // merge_tiles doubles every pass and the loop stops once merge_tiles / 2
  // reaches n_blocks
  for (int merge_tiles = 2; (merge_tiles / 2) < n_blocks; merge_tiles *= 2) {
    dev_vals_in = ping ? dev_vals_1 : dev_vals_0;
    dev_idxs_in = ping ? dev_idxs_1 : dev_idxs_0;
    dev_vals_out = ping ? dev_vals_0 : dev_vals_1;
    dev_idxs_out = ping ? dev_idxs_0 : dev_idxs_1;
    ping = !ping;

    // Do partition
    {
      std::ostringstream kname;
      kname << "partition_mbsort_" << type_to_name(dev_vals_in) << "_"
            << type_to_name(dev_idxs_in) << "_bn" << bn << "_tn" << tn;

      auto kernel =
          get_mb_sort_kernel(d, kname.str(), dev_vals_0, dev_idxs_0, bn, tn);
      compute_encoder.set_compute_pipeline_state(kernel);

      compute_encoder.set_output_array(block_partitions, 0);
      compute_encoder.set_input_array(dev_vals_in, 1);
      compute_encoder.set_input_array(dev_idxs_in, 2);
      compute_encoder.set_bytes(size_sorted_axis, 3);
      compute_encoder.set_bytes(merge_tiles, 4);
      compute_encoder.set_bytes(n_blocks, 5);

      // One threadgroup per row
      MTL::Size group_dims = MTL::Size(n_thr_per_group, 1, 1);
      MTL::Size grid_dims = MTL::Size(1, n_rows, 1);

      compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
    }

    // Do merge
    {
      std::ostringstream kname;
      kname << "merge_mbsort_" << type_to_name(dev_vals_in) << "_"
            << type_to_name(dev_idxs_in) << "_bn" << bn << "_tn" << tn;

      auto kernel =
          get_mb_sort_kernel(d, kname.str(), dev_vals_0, dev_idxs_0, bn, tn);
      compute_encoder.set_compute_pipeline_state(kernel);

      // Reads the partitions written by the partition kernel above
      compute_encoder.set_input_array(block_partitions, 0);
      compute_encoder.set_input_array(dev_vals_in, 1);
      compute_encoder.set_input_array(dev_idxs_in, 2);
      compute_encoder.set_output_array(dev_vals_out, 3);
      compute_encoder.set_output_array(dev_idxs_out, 4);
      compute_encoder.set_bytes(size_sorted_axis, 5);
      compute_encoder.set_bytes(merge_tiles, 6);
      compute_encoder.set_bytes(n_blocks, 7);

      MTL::Size group_dims = MTL::Size(bn, 1, 1);
      MTL::Size grid_dims = MTL::Size(n_blocks, n_rows, 1);

      compute_encoder.dispatch_threadgroups(grid_dims, group_dims);
    }
  }

  // Copy outputs with appropriate strides
  // Build source strides that index the (n_rows, size_sorted_axis) scratch
  // buffer using out's shape: the sorted axis gets stride 1 and every axis
  // after it is scaled by the sorted axis length.
  // NOTE(review): this starts from out.strides(), so it appears to assume
  // `out` is row-contiguous -- confirm.
  auto strides = out.strides();
  for (int ax = axis + 1; ax < strides.size(); ax++) {
    strides[ax] *= out.shape(axis);
  }
  strides[axis] = 1;
  copy_gpu_inplace(
      (argsort) ? dev_idxs_out : dev_vals_out,
      out,
      out.shape(),
      strides,
      out.strides(),
      0,
      0,
      (axis == in.ndim() - 1) ? CopyType::Vector : CopyType::General,
      s);

  // Hand the scratch arrays to the device for stream s.index (presumably so
  // they stay alive until the encoded work has run)
  d.add_temporaries(std::move(copies), s.index);
}

// Sort (or argsort, when `argsort` is true) `in` along `axis_` into `out`.
// Picks the threadgroup width from the length of the sorted axis and then
// dispatches to the single-block or the multi-block implementation.
void gpu_merge_sort(
    const Stream& s,
    metal::Device& d,
    const array& in,
    array& out,
    int axis_,
    bool argsort) {
  // Resolve a negative axis against the number of dimensions
  const int axis = (axis_ >= 0) ? axis_ : axis_ + in.ndim();
  const int axis_len = in.shape(axis);

  // Elements handled per thread and the number of threads that implies
  const int tn = 4;
  const int threads_needed = (axis_len + tn - 1) / tn;

  // Smallest power of two in [32, 512] that is >= threads_needed
  int bn = 32;
  while (bn < 512 && bn < threads_needed) {
    bn *= 2;
  }

  // Element types wider than 4 bytes are limited to 256 threads
  if (bn == 512 && size_of(in.dtype()) > 4) {
    bn = 256;
  }

  const int elems_per_block = bn * tn;
  const int n_blocks = (axis_len + elems_per_block - 1) / elems_per_block;

  // A single block covers the whole axis
  if (n_blocks <= 1) {
    single_block_sort(s, d, in, out, axis, bn, tn, argsort);
    return;
  }

  multi_block_sort(s, d, in, out, axis, bn, tn, n_blocks, argsort);
}

} // namespace

void ArgSort::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort(s, d, in, out, axis_, true);
}

void Sort::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort(s, d, in, out, axis_, false);
}

void ArgPartition::eval_gpu(const std::vector<array>& inputs, array& out) {
  // We direct arg partition to sort for now
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort(s, d, in, out, axis_, true);
}

void Partition::eval_gpu(const std::vector<array>& inputs, array& out) {
  // We direct partition to sort for now
  assert(inputs.size() == 1);

  out.set_data(allocator::malloc(out.nbytes()));

  auto& s = stream();
  auto& d = metal::device(s.device);
  auto& in = inputs[0];

  gpu_merge_sort(s, d, in, out, axis_, false);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/ternary.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/common/ternary.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Encode the ternary kernel named by `op` for the three arrays in `inputs`,
// writing into `out`, whose data must already be set by the caller. Returns
// without encoding anything when `out` has no elements.
//
// The kernel variant is selected from the ternary op type of the inputs, the
// collapsed dimensionality (general case only), whether 32-bit indexing is
// sufficient (`large`), and the amount of work assigned to each thread.
// Throws std::runtime_error in the general case if the kernel's maximum
// threadgroup size is not 1024.
void ternary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  assert(inputs.size() == 3);
  auto& a = inputs[0];
  auto& b = inputs[1];
  auto& c = inputs[2];
  TernaryOpType topt = get_ternary_op_type(a, b, c);

  if (out.size() == 0) {
    return;
  }

  // Try to collapse contiguous dims
  // Only the general case needs a shape and strides; the other cases get
  // empty ones
  auto maybe_collapse = [topt, &a, &b, &c, &out]() {
    if (topt == TernaryOpType::General) {
      auto [shape, strides] = collapse_contiguous_dims(a, b, c, out);
      return std::make_tuple(
          shape, strides[0], strides[1], strides[2], strides[3]);
    } else {
      Strides e;
      return std::make_tuple(Shape{}, e, e, e, e);
    }
  };
  auto [shape, strides_a, strides_b, strides_c, strides_out] = maybe_collapse();

  // `large` selects the kernel variants used when sizes exceed INT32_MAX
  bool large;
  auto ndim = shape.size();
  int work_per_thread;
  if (topt == TernaryOpType::General) {
    large = a.data_size() > INT32_MAX || b.data_size() > INT32_MAX ||
        c.data_size() > INT32_MAX || out.size() > INT32_MAX;
    work_per_thread = large ? 4 : 2;
  } else {
    large = out.data_size() > INT32_MAX;
    work_per_thread = get_work_per_thread(b.dtype(), out.data_size());
  }
  // Build the kernel name:
  //   general: "g" + ndim (for ndim <= 3) or "gn" + work_per_thread,
  //            followed by "large" when large
  //   other:   "sv" / "vs" / "v", followed by "2" when large or "n" when
  //            work_per_thread > 1
  // and finally "_" + op + the type name of b.
  std::string kernel_name;
  if (topt == TernaryOpType::General) {
    kernel_name = "g";
    if (shape.size() <= 3) {
      kernel_name += std::to_string(shape.size());
    } else if (work_per_thread > 1) {
      concatenate(kernel_name, "n", std::to_string(work_per_thread));
    }
    if (large) {
      kernel_name += "large";
    }
  } else {
    if (topt == TernaryOpType::VectorScalarVector) {
      kernel_name = "sv";
    } else if (topt == TernaryOpType::VectorVectorScalar) {
      kernel_name = "vs";
    } else {
      kernel_name = "v";
    }
    if (large) {
      kernel_name += "2";
    } else if (work_per_thread > 1) {
      kernel_name += "n";
    }
  }
  concatenate(kernel_name, "_", op, type_to_name(b));

  auto& d = metal::device(s.device);

  auto kernel = get_ternary_kernel(d, kernel_name, out.dtype(), op);

  // Buffers 0-2 are the inputs and buffer 3 the output in every variant
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(a, 0);
  compute_encoder.set_input_array(b, 1);
  compute_encoder.set_input_array(c, 2);
  compute_encoder.set_output_array(out, 3);

  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  if (topt == TernaryOpType::General) {
    // Launch up to 3D grid of threads
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
    size_t rest = out.size() / (dim0 * dim1);

    if (ndim > 3) {
      compute_encoder.set_vector_bytes(shape, 4);
      compute_encoder.set_vector_bytes(strides_a, 5);
      compute_encoder.set_vector_bytes(strides_b, 6);
      compute_encoder.set_vector_bytes(strides_c, 7);

      compute_encoder.set_bytes(ndim, 8);
      // Each thread covers work_per_thread elements along the last dimension
      dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    } else {
      // The shape is implicit in the grid for <= 3D
      compute_encoder.set_vector_bytes(strides_a, 4);
      compute_encoder.set_vector_bytes(strides_b, 5);
      compute_encoder.set_vector_bytes(strides_c, 6);
    }

    if (thread_group_size != 1024) {
      throw std::runtime_error("[Metal::ternary] Must use 1024 sized block");
    }
    MTL::Size group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    // Launch a 1D or 2D grid of threads
    size_t nthreads = ceildiv(out.data_size(), work_per_thread);
    // Never request a threadgroup wider than the total thread count
    if (thread_group_size > nthreads) {
      thread_group_size = nthreads;
    }
    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
    MTL::Size grid_dims;
    if (large) {
      // The size is passed as a 64-bit value and the grid is 2D
      compute_encoder.set_bytes<int64_t>(out.data_size(), 4);
      grid_dims = get_2d_grid_dims(out.shape(), out.strides(), work_per_thread);
    } else {
      compute_encoder.set_bytes<int>(out.data_size(), 4);
      grid_dims = MTL::Size(nthreads, 1, 1);
    }
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

void ternary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  auto& a = inputs[0];
  auto& b = inputs[1];
  auto& c = inputs[2];
  TernaryOpType topt = get_ternary_op_type(a, b, c);
  set_ternary_op_output_data(a, b, c, out, topt);
  ternary_op_gpu_inplace(inputs, out, op, s);
}

// Convenience overload that runs on the stream of the primitive attached to
// `out`.
void ternary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op) {
  ternary_op_gpu(inputs, out, op, out.primitive().stream());
}

// Evaluate Select on the GPU via the generic ternary op path, using the
// primitive's name() as the op name.
void Select::eval_gpu(const std::vector<array>& inputs, array& out) {
  ternary_op_gpu(inputs, out, name());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/ternary.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/array.h"

namespace mlx::core {

// Run the ternary kernel named by `op` on the three arrays in `inputs` using
// stream `s`. Sets up the data of `out` before encoding the kernel.
void ternary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s);

// Same as ternary_op_gpu but does not set up the data of `out`; the caller
// must have done so already.
void ternary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s);

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/unary.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/backend/common/unary.h"
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/kernels.h"
#include "mlx/backend/metal/utils.h"
#include "mlx/primitives.h"

// Defines func::eval_gpu as a call to unary_op_gpu that uses the primitive's
// name() as the op name.
#define UNARY_GPU(func)                                               \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    unary_op_gpu(inputs, out, name());                                \
  }

namespace mlx::core {

void unary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  auto& in = inputs[0];
  bool contig = in.flags().contiguous;
  if (in.size() == 0) {
    return;
  }

  auto& d = metal::device(s.device);

  auto maybe_collapse = [contig, &in]() {
    if (!contig) {
      return collapse_contiguous_dims(in);
    } else {
      return std::make_pair(Shape{}, Strides{});
    }
  };
  auto [shape, strides] = maybe_collapse();
  int ndim = shape.size();
  bool large;
  if (!contig) {
    large = in.data_size() > INT32_MAX || out.size() > INT32_MAX;
  } else {
    large = in.data_size() > UINT32_MAX;
  }
  int work_per_thread;
  std::string kernel_name;
  if (contig) {
    work_per_thread = get_work_per_thread(in.dtype(), in.data_size());
    kernel_name = (large ? "v2" : (work_per_thread > 1 ? "vn" : "v"));
  } else {
    work_per_thread = large ? 4 : 1;
    kernel_name = "gn" + std::to_string(work_per_thread);
    if (large) {
      kernel_name += "large";
    }
  }
  concatenate(kernel_name, "_", op, type_to_name(in), type_to_name(out));
  auto kernel = get_unary_kernel(d, kernel_name, in.dtype(), out.dtype(), op);

  auto thread_group_size = kernel->maxTotalThreadsPerThreadgroup();
  auto& compute_encoder = d.get_command_encoder(s.index);
  compute_encoder.set_compute_pipeline_state(kernel);
  compute_encoder.set_input_array(in, 0);
  compute_encoder.set_output_array(out, 1);
  if (!contig) {
    // Launch up to 3D grid of threads
    size_t dim0 = ndim > 0 ? shape[ndim - 1] : 1;
    size_t dim1 = ndim > 1 ? shape[ndim - 2] : 1;
    size_t rest = out.size() / (dim0 * dim1);
    compute_encoder.set_vector_bytes(shape, 2);
    compute_encoder.set_vector_bytes(strides, 3);
    compute_encoder.set_bytes(ndim, 4);
    if (thread_group_size != 1024) {
      throw std::runtime_error("[Metal::unary] Must use 1024 sized block");
    }
    dim0 = (dim0 + work_per_thread - 1) / work_per_thread;
    auto group_dims = get_block_dims(dim0, dim1, rest);
    MTL::Size grid_dims = MTL::Size(dim0, dim1, rest);
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  } else {
    size_t nthreads = ceildiv(in.data_size(), work_per_thread);
    if (thread_group_size > nthreads) {
      thread_group_size = nthreads;
    }

    MTL::Size group_dims = MTL::Size(thread_group_size, 1, 1);
    MTL::Size grid_dims;
    if (large) {
      compute_encoder.set_bytes<int64_t>(in.data_size(), 2);
      grid_dims = get_2d_grid_dims(out.shape(), out.strides(), work_per_thread);
    } else {
      compute_encoder.set_bytes<int>(in.data_size(), 2);
      grid_dims = MTL::Size(nthreads, 1, 1);
    }
    compute_encoder.dispatch_threads(grid_dims, group_dims);
  }
}

// Set up the output data for the unary op and then encode the kernel on
// stream `s`.
void unary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s) {
  const auto& in = inputs[0];
  set_unary_output_data(in, out);
  unary_op_gpu_inplace(inputs, out, op, s);
}

// Convenience overload that runs on the stream of the primitive attached to
// `out`.
void unary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op) {
  unary_op_gpu(inputs, out, op, out.primitive().stream());
}

// eval_gpu definitions generated by the UNARY_GPU macro, one per primitive.
UNARY_GPU(Abs)
UNARY_GPU(ArcCos)
UNARY_GPU(ArcCosh)
UNARY_GPU(ArcSin)
UNARY_GPU(ArcSinh)
UNARY_GPU(ArcTan)
UNARY_GPU(ArcTanh)
UNARY_GPU(BitwiseInvert)
UNARY_GPU(Conjugate)
UNARY_GPU(Cos)
UNARY_GPU(Cosh)
UNARY_GPU(Erf)
UNARY_GPU(ErfInv)
UNARY_GPU(Exp)
UNARY_GPU(Expm1)
UNARY_GPU(Imag)
UNARY_GPU(Log1p)
UNARY_GPU(LogicalNot)
UNARY_GPU(Floor)
UNARY_GPU(Ceil)
UNARY_GPU(Negative)
UNARY_GPU(Real)
UNARY_GPU(Sigmoid)
UNARY_GPU(Sign)
UNARY_GPU(Sin)
UNARY_GPU(Sinh)
UNARY_GPU(Square)
UNARY_GPU(Sqrt)
UNARY_GPU(Tan)
UNARY_GPU(Tanh)

// Evaluate Log on the GPU via the generic unary op path, using the
// primitive's name() as the op name.
void Log::eval_gpu(const std::vector<array>& inputs, array& out) {
  unary_op_gpu(inputs, out, name());
}

// Round on the GPU. Only inexact dtypes run a kernel; for every other dtype
// the output simply shares the input's buffer.
void Round::eval_gpu(const std::vector<array>& inputs, array& out) {
  assert(inputs.size() == 1);
  const auto& src = inputs[0];

  // No-op integer types
  if (!issubdtype(src.dtype(), inexact)) {
    out.copy_shared_buffer(src);
    return;
  }

  unary_op_gpu(inputs, out, name());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/unary.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/array.h"

namespace mlx::core {

// Run the unary kernel named by `op` on inputs[0] using stream `s`. Sets up
// the data of `out` before encoding the kernel.
void unary_op_gpu(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s);

// Same as unary_op_gpu but does not set up the data of `out`; the caller
// must have done so already.
void unary_op_gpu_inplace(
    const std::vector<array>& inputs,
    array& out,
    const char* op,
    const Stream& s);

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/utils.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/backend/metal/utils.h"
#include "mlx/backend/common/utils.h"

namespace mlx::core {

// Map a dtype to the type name used when composing kernel names. Note that
// float64 maps to "double". A value not covered by the switch yields an
// empty string.
std::string type_to_name(const Dtype& t) {
  switch (t) {
    case bool_:
      return "bool_";
    case uint8:
      return "uint8";
    case uint16:
      return "uint16";
    case uint32:
      return "uint32";
    case uint64:
      return "uint64";
    case int8:
      return "int8";
    case int16:
      return "int16";
    case int32:
      return "int32";
    case int64:
      return "int64";
    case float16:
      return "float16";
    case float32:
      return "float32";
    case float64:
      return "double";
    case bfloat16:
      return "bfloat16";
    case complex64:
      return "complex64";
  }
  return std::string{};
}

// Overload that returns the type name for the dtype of `a`.
std::string type_to_name(const array& a) {
  return type_to_name(a.dtype());
}

// Wrap the backend-independent block dimension computation in an MTL::Size.
MTL::Size get_block_dims(int dim0, int dim1, int dim2, int pow2) {
  auto [bx, by, bz] = get_block_dims_common(dim0, dim1, dim2, pow2);
  return MTL::Size(bx, by, bz);
}

// Wrap the backend-independent 2D grid computation in an MTL::Size.
MTL::Size get_2d_grid_dims(const Shape& shape, const Strides& strides) {
  auto [gx, gy, gz] = get_2d_grid_dims_common(shape, strides);
  return MTL::Size(gx, gy, gz);
}

// Same as the two-argument overload, forwarding `divisor` to the common
// 2D grid computation.
MTL::Size
get_2d_grid_dims(const Shape& shape, const Strides& strides, size_t divisor) {
  auto [gx, gy, gz] = get_2d_grid_dims_common(shape, strides, divisor);
  return MTL::Size(gx, gy, gz);
}

} // namespace mlx::core


================================================
FILE: mlx/backend/metal/utils.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <type_traits>

#include "mlx/array.h"
#include "mlx/backend/metal/device.h"
#include "mlx/primitives.h"

namespace mlx::core {

// Return the type name string for a dtype (or for the dtype of an array).
MLX_API std::string type_to_name(const Dtype& t);
MLX_API std::string type_to_name(const array& a);

// Compute the grid and block dimensions, check backend/common/utils.h for docs.
// The results are returned as MTL::Size; `pow2` defaults to 10.
MTL::Size get_block_dims(int dim0, int dim1, int dim2, int pow2 = 10);
MTL::Size get_2d_grid_dims(const Shape& shape, const Strides& strides);
MTL::Size
get_2d_grid_dims(const Shape& shape, const Strides& strides, size_t divisor);

// Build an NS::String (UTF-8) from the current contents of `os`.
inline NS::String* make_string(std::ostringstream& os) {
  return NS::String::string(os.str().c_str(), NS::UTF8StringEncoding);
}

// Label `queue` as "Stream <index>". Compiles to nothing unless
// MLX_METAL_DEBUG is defined.
inline void debug_set_stream_queue_label(MTL::CommandQueue* queue, int index) {
#ifdef MLX_METAL_DEBUG
  std::ostringstream text;
  text << "Stream ";
  text << index;
  queue->setLabel(make_string(text));
#endif
}

// Append the primitive's name to the command buffer's existing label (if it
// has one). Compiles to nothing unless MLX_METAL_DEBUG is defined.
inline void debug_set_primitive_buffer_label(
    MTL::CommandBuffer* command_buffer,
    Primitive& primitive) {
#ifdef MLX_METAL_DEBUG
  std::ostringstream text;
  auto existing = command_buffer->label();
  if (existing) {
    text << existing->utf8String();
  }
  text << primitive.name();
  command_buffer->setLabel(make_string(text));
#endif
}

// True for arithmetic types other than the character types char,
// signed char, unsigned char and wchar_t.
template <typename T>
constexpr bool is_numeric_except_char = std::is_arithmetic_v<T> &&
    !(std::is_same_v<T, char> || std::is_same_v<T, signed char> ||
      std::is_same_v<T, unsigned char> || std::is_same_v<T, wchar_t>);

// Append one value to `acc`: numbers are formatted with std::to_string,
// anything else is appended with operator+=.
template <typename T>
void concatenate(std::string& acc, T first) {
  if constexpr (!is_numeric_except_char<T>) {
    acc += first;
  } else {
    acc += std::to_string(first);
  }
}

// Append several values to `acc` from left to right.
template <typename T, typename... Args>
void concatenate(std::string& acc, T first, Args... args) {
  // Resolves to the single-value overload above
  concatenate(acc, first);
  concatenate(acc, args...);
}

// Number of elements a thread should process for `dtype`: 8 divided by the
// element size, but never less than 1.
inline int get_work_per_thread(Dtype dtype) {
  const int per_thread = 8 / dtype.size();
  return per_thread > 1 ? per_thread : 1;
}
// Same as above, except that inputs with fewer than 2^16 elements always
// use one element per thread.
inline int get_work_per_thread(Dtype dtype, size_t size) {
  constexpr size_t small_input_limit = 1 << 16;
  if (size < small_input_limit) {
    return 1;
  }
  return get_work_per_thread(dtype);
}

// Integer division of `n` by `m`, rounded up. `m` must be non-zero.
//
// Computed as quotient plus a remainder test rather than (n + m - 1) / m,
// because the latter wraps around when n is close to the maximum size_t
// value and then returns a result that is far too small.
inline size_t ceildiv(size_t n, size_t m) {
  return n / m + (n % m != 0 ? 1 : 0);
}

// Throw std::runtime_error if `group_dims` asks for more threads per
// threadgroup than `kernel` supports. `name` is only used in the message.
inline void check_kernel_threadgroup_size(
    const MTL::ComputePipelineState* kernel,
    MTL::Size group_dims,
    const std::string& name) {
  const auto limit = kernel->maxTotalThreadsPerThreadgroup();
  const auto wanted = group_dims.width * group_dims.height * group_dims.depth;

  if (!(limit < wanted)) {
    return;
  }

  std::ostringstream msg;
  msg << "Maximum threads per threadgroup is " << limit;
  msg << " but requested " << wanted;
  msg << " for kernel " << name << ".";
  throw std::runtime_error(msg.str());
}

} // namespace mlx::core


================================================
FILE: mlx/backend/no_cpu/CMakeLists.txt
================================================
# Sources for the build without a CPU backend. eval.cpp and encoder.cpp are
# taken from the sibling cpu/ directory; the rest live in this directory.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/device_info.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/../cpu/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/../cpu/encoder.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp)


================================================
FILE: mlx/backend/no_cpu/compiled.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/compile_impl.h"
#include "mlx/primitives.h"

namespace mlx::core {

namespace detail {
// This file is only built when CPU compilation is unavailable, so compile
// support reduces to asking whether the device is the GPU (GPU compile is
// always available when the GPU is).
bool compile_available_for_device(const Device& device) {
  const bool is_gpu = (device == Device::gpu);
  return is_gpu;
}
} // namespace detail

// CPU evaluation of compiled functions is not supported here; always throws
// std::runtime_error.
void Compiled::eval_cpu(
    const std::vector<array>& /* inputs */,
    std::vector<array>& /* outputs */) {
  throw std::runtime_error(
      "[Compiled::eval_cpu] CPU compilation not supported on the platform.");
}

} // namespace mlx::core


================================================
FILE: mlx/backend/no_cpu/device_info.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/cpu/device_info.h"

namespace mlx::core::cpu {

// Stub: this build has no CPU backend.
bool is_available() {
  return false;
}

// Stub: there are no CPU devices to report.
int device_count() {
  return 0;
}

// Stub: every device index maps to the same empty info map.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int /* device_index */) {
  static const std::unordered_map<
      std::string,
      std::variant<std::string, size_t>>
      no_info{};
  return no_info;
}

} // namespace mlx::core::cpu


================================================
FILE: mlx/backend/no_cpu/primitives.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/primitives.h"
#include "mlx/distributed/primitives.h"
#include "mlx/fast_primitives.h"

// Defines func::eval_cpu for a multi-output primitive as a stub that throws
// std::runtime_error with the message "<func> has no CPU implementation.".
#define NO_CPU_MULTI(func)                                             \
  void func::eval_cpu(                                                 \
      const std::vector<array>& inputs, std::vector<array>& outputs) { \
    throw std::runtime_error(#func " has no CPU implementation.");     \
  }

// Same as NO_CPU_MULTI, for primitives with a single output array.
#define NO_CPU(func)                                                  \
  void func::eval_cpu(const std::vector<array>& inputs, array& out) { \
    throw std::runtime_error(#func " has no CPU implementation.");    \
  }

namespace mlx::core {

// Throwing eval_cpu stubs for the build without a CPU backend. NO_CPU is
// used for single-output primitives and NO_CPU_MULTI for multi-output ones.
NO_CPU(Abs)
NO_CPU(Add)
NO_CPU(AddMM)
NO_CPU(Arange)
NO_CPU(ArcCos)
NO_CPU(ArcCosh)
NO_CPU(ArcSin)
NO_CPU(ArcSinh)
NO_CPU(ArcTan)
NO_CPU(ArcTan2)
NO_CPU(ArcTanh)
NO_CPU(ArgPartition)
NO_CPU(ArgReduce)
NO_CPU(ArgSort)
NO_CPU(AsType)
NO_CPU(AsStrided)
NO_CPU(BitwiseBinary)
NO_CPU(BitwiseInvert)
NO_CPU(BlockMaskedMM)
NO_CPU(Broadcast)
NO_CPU(BroadcastAxes)
NO_CPU(Ceil)
NO_CPU(Cholesky)
NO_CPU(Concatenate)
NO_CPU(Conjugate)
NO_CPU(Contiguous)
NO_CPU(Convolution)
NO_CPU(Copy)
NO_CPU(Cos)
NO_CPU(Cosh)
NO_CPU_MULTI(CustomTransforms)
NO_CPU_MULTI(Depends)
NO_CPU(Divide)
NO_CPU_MULTI(DivMod)
NO_CPU(DynamicSlice)
NO_CPU(DynamicSliceUpdate)
NO_CPU(NumberOfElements)
NO_CPU(Remainder)
NO_CPU_MULTI(Eig)
NO_CPU_MULTI(Eigh)
NO_CPU(Equal)
NO_CPU(Erf)
NO_CPU(ErfInv)
NO_CPU(Exp)
NO_CPU(ExpandDims)
NO_CPU(Expm1)
NO_CPU(FFT)
NO_CPU(Flatten)
NO_CPU(Floor)
NO_CPU(Full)
NO_CPU(Gather)
NO_CPU(GatherAxis)
NO_CPU(GatherMM)
NO_CPU(GatherQMM)
NO_CPU(Greater)
NO_CPU(GreaterEqual)
NO_CPU(Hadamard)
NO_CPU(Imag)
NO_CPU(Less)
NO_CPU(LessEqual)
NO_CPU(Log)
NO_CPU(Log1p)
NO_CPU(LogicalNot)
NO_CPU(LogicalAnd)
NO_CPU(LogicalOr)
NO_CPU(LogAddExp)
NO_CPU(LogSumExp)
NO_CPU_MULTI(LUF)
NO_CPU(Matmul)
NO_CPU(Maximum)
NO_CPU(MaskedScatter)
NO_CPU(Minimum)
NO_CPU(Multiply)
NO_CPU(Negative)
NO_CPU(NotEqual)
NO_CPU(Pad)
NO_CPU(Partition)
NO_CPU(Power)
NO_CPU_MULTI(QRF)
NO_CPU(QuantizedMatmul)
NO_CPU(QQMatmul)
NO_CPU(RandomBits)
NO_CPU(Real)
NO_CPU(Reduce)
NO_CPU(Reshape)
NO_CPU(Round)
NO_CPU(Scan)
NO_CPU(Scatter)
NO_CPU(ScatterAxis)
NO_CPU(Select)
NO_CPU(SegmentedMM)
NO_CPU(Sigmoid)
NO_CPU(Sign)
NO_CPU(Sin)
NO_CPU(Sinh)
NO_CPU(Slice)
NO_CPU(SliceUpdate)
NO_CPU(Softmax)
NO_CPU(Sort)
NO_CPU_MULTI(Split)
NO_CPU(Square)
NO_CPU(Squeeze)
NO_CPU(Sqrt)
NO_CPU(StopGradient)
NO_CPU(Subtract)
NO_CPU_MULTI(SVD)
NO_CPU(Tan)
NO_CPU(Tanh)
NO_CPU(Transpose)
NO_CPU(Unflatten)
NO_CPU(Inverse)
NO_CPU(View)

// Stubs for primitives declared in the fast namespace
namespace fast {
NO_CPU_MULTI(Quantize)
NO_CPU_MULTI(ConvertFP8)
} // namespace fast

// Stubs for primitives declared in the distributed namespace
namespace distributed {
NO_CPU_MULTI(AllReduce)
NO_CPU_MULTI(AllGather)
NO_CPU_MULTI(Send)
NO_CPU_MULTI(Recv)
NO_CPU_MULTI(ReduceScatter)
} // namespace distributed

} // namespace mlx::core


================================================
FILE: mlx/backend/no_gpu/CMakeLists.txt
================================================
# Sources for the build without a GPU backend; all of them live in this
# directory.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/device_info.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp)


================================================
FILE: mlx/backend/no_gpu/allocator.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <algorithm>
#include <mutex>

#include "mlx/allocator.h"
#include "mlx/memory.h"

#ifdef __APPLE__
#include "mlx/backend/no_gpu/apple_memory.h"
#elif defined(__linux__)
#include "mlx/backend/no_gpu/linux_memory.h"
#else
// Fallback for platforms that are neither Apple nor Linux: the memory size
// is reported as 0, which makes CommonAllocator use its built-in default
// memory limit.
size_t get_memory_size() {
  return 0;
}
#endif

namespace mlx::core {

namespace allocator {

/**
 * A general CPU allocator.
 *
 * Every allocation is made with std::malloc and is prefixed by a size_t
 * header that stores the requested size. The header is what lets size() and
 * free() recover the size of a buffer. The allocator also keeps running
 * totals of the active and peak number of requested bytes.
 */
class CommonAllocator : public Allocator {
 public:
  virtual Buffer malloc(size_t size) override;
  virtual void free(Buffer buffer) override;
  virtual size_t size(Buffer buffer) const override;

  /** Bytes currently allocated (headers excluded). */
  size_t get_active_memory() const {
    return active_memory_;
  }
  /** Largest value active memory has reached since the last reset. */
  size_t get_peak_memory() const {
    return peak_memory_;
  }
  /** Reset the peak memory statistic to zero. */
  void reset_peak_memory() {
    std::unique_lock lk(mutex_);
    peak_memory_ = 0;
  }
  /** Current memory limit in bytes. */
  size_t get_memory_limit() {
    return memory_limit_;
  }
  /** Install a new memory limit and return the previous one. */
  size_t set_memory_limit(size_t limit) {
    std::unique_lock lk(mutex_);
    std::swap(memory_limit_, limit);
    return limit;
  }

 private:
  size_t memory_limit_;
  size_t active_memory_{0};
  size_t peak_memory_{0};
  std::mutex mutex_;

  // The default limit is 80% of the reported memory size. When the size is
  // unknown (reported as 0) fall back to 2^33 bytes.
  CommonAllocator() : memory_limit_(0.8 * get_memory_size()) {
    if (memory_limit_ == 0) {
      // Shift an unsigned long long: `1UL << 33` is undefined behaviour on
      // targets where unsigned long is only 32 bits wide (e.g. LLP64).
      memory_limit_ = static_cast<size_t>(1ULL << 33);
    }
  }

  friend CommonAllocator& common_allocator();
};

// The process-wide CommonAllocator instance, created on first use.
CommonAllocator& common_allocator() {
  static CommonAllocator allocator_;
  return allocator_;
}

// The allocator used by the rest of the library in this build.
Allocator& allocator() {
  return common_allocator();
}

// Pointer to the usable bytes of the buffer, i.e. just past the size header,
// or nullptr for an empty buffer.
void* Buffer::raw_ptr() {
  if (!ptr_) {
    return nullptr;
  }
  return static_cast<size_t*>(ptr_) + 1;
}

// Allocate `size` usable bytes plus the size header. Returns a buffer holding
// a null pointer if the underlying allocation fails.
Buffer CommonAllocator::malloc(size_t size) {
  void* ptr = std::malloc(size + sizeof(size_t));
  if (ptr == nullptr) {
    // Do not account for memory that was never obtained: free() cannot
    // subtract it again because size() reports 0 for a null buffer.
    return Buffer{ptr};
  }
  *static_cast<size_t*>(ptr) = size;
  std::unique_lock lk(mutex_);
  active_memory_ += size;
  peak_memory_ = std::max(active_memory_, peak_memory_);
  return Buffer{ptr};
}

// Release `buffer` and subtract its recorded size from the active total.
void CommonAllocator::free(Buffer buffer) {
  // Read the header before the memory is released
  auto sz = size(buffer);
  std::free(buffer.ptr());
  std::unique_lock lk(mutex_);
  active_memory_ -= sz;
}

// Requested size of `buffer` as stored in its header; 0 for a null buffer.
size_t CommonAllocator::size(Buffer buffer) const {
  if (buffer.ptr() == nullptr) {
    return 0;
  }
  return *static_cast<size_t*>(buffer.ptr());
}

} // namespace allocator

// Bytes currently allocated through the common allocator.
size_t get_active_memory() {
  auto& alloc = allocator::common_allocator();
  return alloc.get_active_memory();
}
// Peak number of bytes allocated since the last reset.
size_t get_peak_memory() {
  auto& alloc = allocator::common_allocator();
  return alloc.get_peak_memory();
}
// Reset the peak memory statistic.
void reset_peak_memory() {
  allocator::common_allocator().reset_peak_memory();
}
// Install a new memory limit; returns the previous limit.
size_t set_memory_limit(size_t limit) {
  auto& alloc = allocator::common_allocator();
  return alloc.set_memory_limit(limit);
}
// Current memory limit of the common allocator.
size_t get_memory_limit() {
  auto& alloc = allocator::common_allocator();
  return alloc.get_memory_limit();
}

// No-ops for common allocator
// The three functions below ignore their argument (where they take one) and
// always return 0; clear_cache does nothing.
size_t get_cache_memory() {
  return 0;
}
size_t set_cache_limit(size_t) {
  return 0;
}
size_t set_wired_limit(size_t) {
  return 0;
}
void clear_cache() {}

} // namespace mlx::core


================================================
FILE: mlx/backend/no_gpu/apple_memory.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <sys/sysctl.h>

namespace {

// Read the "hw.memsize" sysctl. The call's return value is not checked; the
// result stays 0 unless the call stores a value.
size_t get_memory_size() {
  size_t total = 0;
  size_t total_len = sizeof(total);
  sysctlbyname("hw.memsize", &total, &total_len, nullptr, 0);
  return total;
}

} // namespace


================================================
FILE: mlx/backend/no_gpu/device_info.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/backend/gpu/device_info.h"

namespace mlx::core::gpu {

// This backend never reports a GPU as available.
bool is_available() {
  constexpr bool gpu_present = false;
  return gpu_present;
}

// This backend always reports zero GPU devices.
int device_count() {
  constexpr int gpu_devices = 0;
  return gpu_devices;
}

// Returns a reference to a single, always-empty info map regardless of the
// requested device index.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(int /* device_index */) {
  using InfoMap =
      std::unordered_map<std::string, std::variant<std::string, size_t>>;
  static InfoMap no_info{};
  return no_info;
}

} // namespace mlx::core::gpu


================================================
FILE: mlx/backend/no_gpu/eval.cpp
================================================
// Copyright © 2025 Apple Inc.

#include <stdexcept>

#include "mlx/backend/gpu/device_info.h"
#include "mlx/backend/gpu/eval.h"

namespace mlx::core::gpu {

// Nothing to set up for a new stream in this backend.
void new_stream(Stream /* stream */) {
  return;
}

// Always throws std::runtime_error.
void eval(array& /* arr */) {
  const char* message = "[gpu::eval] GPU backend is not available";
  throw std::runtime_error(message);
}

// Always throws std::runtime_error.
void finalize(Stream /* stream */) {
  const char* message = "[gpu::finalize] GPU backend is not available";
  throw std::runtime_error(message);
}

// Always throws std::runtime_error.
void synchronize(Stream /* stream */) {
  const char* message = "[gpu::synchronize]  GPU backend is not available";
  throw std::runtime_error(message);
}

} // namespace mlx::core::gpu


================================================
FILE: mlx/backend/no_gpu/event.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/event.h"
#include "mlx/scheduler.h"

#include <condition_variable>
#include <mutex>

namespace mlx::core {

// Shared state behind an Event: the last signaled value, plus the mutex and
// condition variable used to wait for it.
struct EventCounter {
  // Last value written by Event::signal; starts at 0.
  uint64_t value = 0;
  // Guards `value`.
  std::mutex mtx;
  // Notified whenever `value` is updated.
  std::condition_variable cv;
};

// Creates the event for `stream` with a fresh EventCounter, stored behind a
// type-erased shared_ptr whose deleter knows the concrete type.
Event::Event(Stream stream) : stream_(stream) {
  event_ = std::shared_ptr<void>(new EventCounter{}, [](void* raw) {
    delete static_cast<EventCounter*>(raw);
  });
}

// Blocks the calling thread until the counter has reached this event's value.
void Event::wait() {
  auto* counter = static_cast<EventCounter*>(event_.get());
  std::unique_lock<std::mutex> lock(counter->mtx);
  const auto target = value();
  // Loop to absorb spurious wake-ups; returns immediately when the counter is
  // already at or past the target.
  while (counter->value < target) {
    counter->cv.wait(lock);
  }
}

// Enqueues a blocking wait on `stream`. The task owns a copy of the event.
void Event::wait(Stream stream) {
  scheduler::enqueue(stream, [*this]() mutable {
    // `this` refers to the copy captured by the task.
    this->wait();
  });
}

// Enqueues a task on `stream` that stores this event's value in the shared
// counter and wakes every waiter.
void Event::signal(Stream stream) {
  scheduler::enqueue(stream, [*this]() mutable {
    auto* counter = static_cast<EventCounter*>(event_.get());
    std::unique_lock<std::mutex> lock(counter->mtx);
    counter->value = value();
    // Release the mutex before notifying, as the waiters need to reacquire it.
    lock.unlock();
    counter->cv.notify_all();
  });
}

// Returns true when the shared counter has reached this event's value.
bool Event::is_signaled() const {
  auto* counter = static_cast<EventCounter*>(event_.get());
  std::lock_guard<std::mutex> lock(counter->mtx);
  const bool reached = counter->value >= value();
  return reached;
}
} // namespace mlx::core


================================================
FILE: mlx/backend/no_gpu/fence.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <condition_variable>
#include <mutex>

#include "mlx/fence.h"
#include "mlx/scheduler.h"

namespace mlx::core {

// Shared state behind a Fence.
struct FenceImpl {
  // Incremented by Fence::update on the calling thread.
  uint32_t count = 0;
  // Written by the task that Fence::update enqueues.
  uint32_t value = 0;
  // Guards `value`.
  std::mutex mtx;
  // Notified whenever `value` is written.
  std::condition_variable cv;
};

// Creates the fence with a fresh FenceImpl, stored behind a type-erased
// shared_ptr whose deleter knows the concrete type. The stream is unused.
Fence::Fence(Stream /* stream */) {
  fence_ = std::shared_ptr<void>(new FenceImpl{}, [](void* raw) {
    delete static_cast<FenceImpl*>(raw);
  });
}

// Enqueues a task on a CPU stream that blocks until the fence value has
// reached the update count observed when wait() was called.
// Throws std::runtime_error for any non-CPU stream.
void Fence::wait(Stream stream, const array&) {
  if (!(stream.device == Device::cpu)) {
    throw std::runtime_error("[Fence::wait] Invalid stream.");
  }

  auto* impl = static_cast<FenceImpl*>(fence_.get());
  // Capture the shared_ptr so the state outlives this Fence if needed.
  scheduler::enqueue(
      stream, [count = impl->count, fence_ = fence_]() mutable {
        auto& state = *static_cast<FenceImpl*>(fence_.get());
        std::unique_lock<std::mutex> lock(state.mtx);
        while (state.value < count) {
          state.cv.wait(lock);
        }
      });
}

// Bumps the fence count and enqueues a task on a CPU stream that publishes the
// new count as the fence value and wakes all waiters.
// Throws std::runtime_error for any non-CPU stream; the count has already
// been incremented by then.
void Fence::update(Stream stream, const array&, bool) {
  auto* impl = static_cast<FenceImpl*>(fence_.get());
  impl->count++;

  if (!(stream.device == Device::cpu)) {
    throw std::runtime_error("[Fence::update] Invalid stream.");
  }

  scheduler::enqueue(
      stream, [count = impl->count, fence_ = fence_]() mutable {
        auto& state = *static_cast<FenceImpl*>(fence_.get());
        std::unique_lock<std::mutex> lock(state.mtx);
        state.value = count;
        state.cv.notify_all();
      });
}

} // namespace mlx::core


================================================
FILE: mlx/backend/no_gpu/linux_memory.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <sys/sysinfo.h>

namespace {

size_t get_memory_size() {
  struct sysinfo info;

  if (sysinfo(&info) != 0) {
    return 0;
  }

  size_t total_ram = info.totalram;
  total_ram *= info.mem_unit;

  return total_ram;
}

} // namespace


================================================
FILE: mlx/backend/no_gpu/primitives.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/primitives.h"
#include "mlx/distributed/primitives.h"
#include "mlx/fast_primitives.h"

// Defines `func::eval_gpu` for the overload that takes a vector of outputs as
// a stub that always throws std::runtime_error naming the primitive.
#define NO_GPU_MULTI(func)                                             \
  void func::eval_gpu(                                                 \
      const std::vector<array>& inputs, std::vector<array>& outputs) { \
    throw std::runtime_error(#func " has no GPU implementation.");     \
  }

// Defines `func::use_fallback` to always return true and, through
// NO_GPU_MULTI, a throwing `func::eval_gpu`.
#define NO_GPU_USE_FALLBACK(func)     \
  bool func::use_fallback(Stream s) { \
    return true;                      \
  }                                   \
  NO_GPU_MULTI(func)

// Defines `func::eval_gpu` for the overload that takes a single output array
// as a stub that always throws std::runtime_error naming the primitive.
#define NO_GPU(func)                                                  \
  void func::eval_gpu(const std::vector<array>& inputs, array& out) { \
    throw std::runtime_error(#func " has no GPU implementation.");    \
  }

namespace mlx::core {

// Always requests the fallback implementation; none of the arguments are
// inspected.
bool fast::ScaledDotProductAttention::use_fallback(
    const array& /* q */,
    const array& /* k */,
    const array& /* v */,
    bool /* has_mask */,
    bool /* has_arr_mask */,
    bool /* do_causal */,
    bool /* is_training */,
    bool /* output_logsumexp */,
    Stream /* s */) {
  constexpr bool always_fallback = true;
  return always_fallback;
}

// Always reports that boolean masks are not supported.
bool fast::ScaledDotProductAttention::supports_bool_mask() {
  constexpr bool bool_mask_supported = false;
  return bool_mask_supported;
}

// Always requests the fallback implementation; the arguments are not
// inspected.
bool fast::ScaledDotProductAttentionVJP::use_fallback(
    const array& /* q */,
    Stream /* s */) {
  constexpr bool always_fallback = true;
  return always_fallback;
}

// Throwing eval_gpu stubs for the core primitives. NO_GPU is used for the
// single-output eval_gpu signature and NO_GPU_MULTI for the one that takes a
// vector of outputs.
NO_GPU(Abs)
NO_GPU(Add)
NO_GPU(AddMM)
NO_GPU(Arange)
NO_GPU(ArcCos)
NO_GPU(ArcCosh)
NO_GPU(ArcSin)
NO_GPU(ArcSinh)
NO_GPU(ArcTan)
NO_GPU(ArcTan2)
NO_GPU(ArcTanh)
NO_GPU(ArgPartition)
NO_GPU(ArgReduce)
NO_GPU(ArgSort)
NO_GPU(AsType)
NO_GPU(AsStrided)
NO_GPU(BitwiseBinary)
NO_GPU(BitwiseInvert)
NO_GPU(BlockMaskedMM)
NO_GPU(Broadcast)
NO_GPU(BroadcastAxes)
NO_GPU(Ceil)
NO_GPU_MULTI(Compiled)
NO_GPU(Concatenate)
NO_GPU(Conjugate)
NO_GPU(Contiguous)
NO_GPU(Convolution)
NO_GPU(Copy)
NO_GPU(Cos)
NO_GPU(Cosh)
NO_GPU_MULTI(CustomTransforms)
NO_GPU_MULTI(Depends)
NO_GPU(Divide)
NO_GPU_MULTI(DivMod)
NO_GPU(DynamicSlice)
NO_GPU(DynamicSliceUpdate)
NO_GPU(NumberOfElements)
NO_GPU(Remainder)
NO_GPU(Equal)
NO_GPU(Erf)
NO_GPU(ErfInv)
NO_GPU(Exp)
NO_GPU(ExpandDims)
NO_GPU(Expm1)
NO_GPU(FFT)
NO_GPU(Flatten)
NO_GPU(Floor)
NO_GPU(Full)
NO_GPU(Gather)
NO_GPU(GatherAxis)
NO_GPU(GatherMM)
NO_GPU(GatherQMM)
NO_GPU(Greater)
NO_GPU(GreaterEqual)
NO_GPU(Hadamard)
NO_GPU(Imag)
NO_GPU(Less)
NO_GPU(LessEqual)
NO_GPU(Load)
NO_GPU(Log)
NO_GPU(Log1p)
NO_GPU(LogicalNot)
NO_GPU(LogicalAnd)
NO_GPU(LogicalOr)
NO_GPU(LogAddExp)
NO_GPU(LogSumExp)
NO_GPU_MULTI(LUF)
NO_GPU(Matmul)
NO_GPU(Maximum)
NO_GPU(Minimum)
NO_GPU(Multiply)
NO_GPU(Negative)
NO_GPU(NotEqual)
NO_GPU(Pad)
NO_GPU(Partition)
NO_GPU(Power)
NO_GPU_MULTI(QRF)
NO_GPU(QuantizedMatmul)
NO_GPU(QQMatmul)
NO_GPU(RandomBits)
NO_GPU(Real)
NO_GPU(Reduce)
NO_GPU(Reshape)
NO_GPU(Round)
NO_GPU(Scan)
NO_GPU(Scatter)
NO_GPU(ScatterAxis)
NO_GPU(Select)
NO_GPU(SegmentedMM)
NO_GPU(Sigmoid)
NO_GPU(Sign)
NO_GPU(Sin)
NO_GPU(Sinh)
NO_GPU(Slice)
NO_GPU(SliceUpdate)
NO_GPU(Softmax)
NO_GPU(Sort)
NO_GPU_MULTI(Split)
NO_GPU(Square)
NO_GPU(Squeeze)
NO_GPU(Sqrt)
NO_GPU(StopGradient)
NO_GPU(Subtract)
NO_GPU_MULTI(SVD)
NO_GPU(Tan)
NO_GPU(Tanh)
NO_GPU(Transpose)
NO_GPU(Unflatten)
NO_GPU(Inverse)
NO_GPU(Cholesky)
NO_GPU_MULTI(Eigh)
NO_GPU_MULTI(Eig)
NO_GPU(View)
NO_GPU(MaskedScatter)

// Stubs for the fast:: primitives. The NO_GPU_USE_FALLBACK entries also get a
// use_fallback() that always returns true; every entry gets a throwing
// eval_gpu.
namespace fast {
NO_GPU_USE_FALLBACK(LayerNorm)
NO_GPU_MULTI(LayerNormVJP)
NO_GPU_USE_FALLBACK(RMSNorm)
NO_GPU_MULTI(RMSNormVJP)
NO_GPU_USE_FALLBACK(RoPE)
NO_GPU_MULTI(ScaledDotProductAttention)
NO_GPU_MULTI(ScaledDotProductAttentionVJP)
NO_GPU_MULTI(ConvertFP8)
NO_GPU_MULTI(Quantize)
NO_GPU_MULTI(CustomKernel)
} // namespace fast

// Throwing eval_gpu stubs for the distributed:: primitives.
namespace distributed {
NO_GPU_MULTI(AllReduce)
NO_GPU_MULTI(AllGather)
NO_GPU_MULTI(Send)
NO_GPU_MULTI(Recv)
NO_GPU_MULTI(ReduceScatter)
} // namespace distributed

} // namespace mlx::core


================================================
FILE: mlx/compile.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <atomic>
#include <cstdlib>
#include <map>
#include <sstream>
#include <unordered_map>
#include <unordered_set>

#include "mlx/allocator.h"
#include "mlx/backend/common/compiled.h"
#include "mlx/compile.h"
#include "mlx/compile_impl.h"
#include "mlx/fast_primitives.h"
#include "mlx/graph_utils.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"
#include "mlx/utils.h"

namespace mlx::core {

// Recursion depth at which compile_fuse stops growing a fused sub-graph.
constexpr int max_compile_depth = 11;
// NOTE(review): presumably caps how many arrays a single fused sub-graph may
// reference; its use is not visible in this part of the file — confirm.
constexpr int max_compile_arrays = 24;

bool is_unary(const Primitive& p) {
  return (
      typeid(p) == typeid(Abs) || typeid(p) == typeid(ArcCos) ||
      typeid(p) == typeid(ArcCosh) || typeid(p) == typeid(ArcSin) ||
      typeid(p) == typeid(ArcSinh) || typeid(p) == typeid(ArcTan) ||
      typeid(p) == typeid(ArcTanh) || typeid(p) == typeid(AsType) ||
      typeid(p) == typeid(Ceil) || typeid(p) == typeid(Cos) ||
      typeid(p) == typeid(Conjugate) || typeid(p) == typeid(Cosh) ||
      typeid(p) == typeid(Remainder) || typeid(p) == typeid(Erf) ||
      typeid(p) == typeid(ErfInv) || typeid(p) == typeid(Exp) ||
      typeid(p) == typeid(Floor) || typeid(p) == typeid(Log) ||
      typeid(p) == typeid(Log1p) || typeid(p) == typeid(LogicalNot) ||
      typeid(p) == typeid(Negative) || typeid(p) == typeid(Round) ||
      typeid(p) == typeid(Sigmoid) || typeid(p) == typeid(Sign) ||
      typeid(p) == typeid(Sin) || typeid(p) == typeid(Sinh) ||
      typeid(p) == typeid(Square) || typeid(p) == typeid(Sqrt) ||
      typeid(p) == typeid(Tan) || typeid(p) == typeid(Tanh) ||
      typeid(p) == typeid(Expm1) || typeid(p) == typeid(Real) ||
      typeid(p) == typeid(Imag) || typeid(p) == typeid(BitwiseInvert));
}

bool is_binary(const Primitive& p) {
  return (
      typeid(p) == typeid(Add) || typeid(p) == typeid(Divide) ||
      typeid(p) == typeid(Equal) || typeid(p) == typeid(Greater) ||
      typeid(p) == typeid(GreaterEqual) || typeid(p) == typeid(Less) ||
      typeid(p) == typeid(LessEqual) || typeid(p) == typeid(LogicalNot) ||
      typeid(p) == typeid(LogicalAnd) || typeid(p) == typeid(LogicalOr) ||
      typeid(p) == typeid(LogAddExp) || typeid(p) == typeid(Maximum) ||
      typeid(p) == typeid(Minimum) || typeid(p) == typeid(Multiply) ||
      typeid(p) == typeid(NotEqual) || typeid(p) == typeid(Power) ||
      typeid(p) == typeid(Subtract) || typeid(p) == typeid(BitwiseBinary) ||
      typeid(p) == typeid(ArcTan2));
}

// True only for the Select primitive.
bool is_ternary(const Primitive& p) {
  const auto& kind = typeid(p);
  return kind == typeid(Select);
}

// True only for the Broadcast primitive.
bool is_broadcast(const Primitive& p) {
  const auto& kind = typeid(p);
  return kind == typeid(Broadcast);
}

// True for Copy and StopGradient.
bool is_noop(const Primitive& p) {
  const auto& kind = typeid(p);
  if (kind == typeid(Copy)) {
    return true;
  }
  return kind == typeid(StopGradient);
}

// True for Reduce and ArgReduce.
bool is_reduction(const Primitive& p) {
  const auto& kind = typeid(p);
  if (kind == typeid(Reduce)) {
    return true;
  }
  return kind == typeid(ArgReduce);
}

// A primitive can be fused when it is unary, binary, ternary or a broadcast.
bool is_fusable(const Primitive& p) {
  if (is_unary(p)) {
    return true;
  }
  if (is_binary(p)) {
    return true;
  }
  if (is_ternary(p)) {
    return true;
  }
  return is_broadcast(p);
}

// Builds a fused primitive from a traced sub-graph and derives the kernel
// library name (`kernel_lib_`) that identifies it.
//
// The name has four "_"-separated sections:
//   1. for every tape entry: its assigned name, dtype kind and item size, "O"
//      when it is an output or "I" otherwise, the primitive name and the names
//      of its inputs;
//   2. one letter per input: "C" for a constant, otherwise "S" or "V"
//      depending on is_scalar();
//   3. dtype kind and item size of every non-constant input;
//   4. a hash of the printed values of the constant inputs.
Compiled::Compiled(
    Stream stream,
    std::vector<array> inputs,
    std::vector<array> outputs,
    std::vector<array> tape,
    std::unordered_set<uintptr_t> constant_ids)
    : Primitive(stream),
      inputs_(std::move(inputs)),
      outputs_(std::move(outputs)),
      tape_(std::move(tape)),
      constant_ids_(std::move(constant_ids)),
      is_constant_([this](size_t i) {
        return constant_ids_.find(inputs_[i].id()) != constant_ids_.end();
      }) {
  // NOTE: the constructor parameters have been moved from above. Only the
  // members (`inputs_`, `outputs_`, `tape_`, `constant_ids_`) may be read
  // from here on.

  // Build the kernel name.
  NodeNamer namer;
  std::ostringstream os;
  std::ostringstream constant_hasher;

  std::unordered_set<uintptr_t> output_ids;
  for (auto& o : outputs_) {
    output_ids.insert(o.id());
  }

  // Fill the input names. This is not really necessary, I just like having A,
  // B, C, ... as the inputs.
  for (const auto& x : inputs_) {
    namer.get_name(x);
  }

  // The primitives describing the tape. For unary and binary primitives this
  // must be enough to describe the full computation.
  for (const auto& a : tape_) {
    // name and type of output
    os << namer.get_name(a) << kindof(a.dtype()) << a.itemsize();
    // whether or not it's an output
    if (output_ids.find(a.id()) != output_ids.end()) {
      os << "O";
    } else {
      os << "I";
    }
    // computation performed
    os << a.primitive().name();
    // name of inputs to the function
    for (auto& inp : a.inputs()) {
      os << namer.get_name(inp);
    }
  }
  os << "_";

  // Kind of every input: constant, scalar or vector.
  for (const auto& x : inputs_) {
    if (constant_ids_.find(x.id()) != constant_ids_.end()) {
      os << "C";
      print_constant(constant_hasher, x);
    } else {
      os << (is_scalar(x) ? "S" : "V");
    }
  }
  os << "_";

  // Dtypes of the non-constant inputs. This must iterate the members: the
  // `inputs` / `constant_ids` parameters were moved from in the initializer
  // list, so looping over them would silently emit nothing.
  for (const auto& x : inputs_) {
    if (constant_ids_.find(x.id()) != constant_ids_.end()) {
      continue;
    }
    os << kindof(x.dtype()) << x.itemsize();
  }
  os << "_" << std::hash<std::string>{}(constant_hasher.str());

  kernel_lib_ = os.str();
}

// Not supported for compiled primitives: always throws std::runtime_error.
std::vector<array> Compiled::vjp(
    const std::vector<array>&,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  const char* message = "[Compiled] Cannot vjp primitive.";
  throw std::runtime_error(message);
}

// Not supported for compiled primitives: always throws std::runtime_error.
std::vector<array> Compiled::jvp(
    const std::vector<array>&,
    const std::vector<array>&,
    const std::vector<int>&) {
  const char* message = "[Compiled] Cannot jvp primitive.";
  throw std::runtime_error(message);
}

// Not supported for compiled primitives: always throws std::runtime_error.
std::pair<std::vector<array>, std::vector<int>> Compiled::vmap(
    const std::vector<array>&,
    const std::vector<int>&) {
  const char* message = "[Compiled] Cannot vmap primitive.";
  throw std::runtime_error(message);
}

bool Compiled::is_equivalent(const Primitive& other) const {
  const Compiled& a_other = static_cast<const Compiled&>(other);
  return std::equal(
      tape_.begin(),
      tape_.end(),
      a_other.tape_.begin(),
      a_other.tape_.end(),
      [](const array& a1, const array& a2) {
        auto& p1 = a1.primitive();
        auto& p2 = a2.primitive();
        return typeid(p1) == typeid(p2) && p1.is_equivalent(p2);
      });
}

// Returns "Compiled" followed by the names of all primitives on the tape. The
// string is built on first use and cached in `name_`.
const char* Compiled::name() const {
  if (!name_.empty()) {
    return name_.c_str();
  }
  std::ostringstream builder;
  builder << "Compiled";
  for (const auto& node : tape_) {
    builder << node.primitive().name();
  }
  name_ = builder.str();
  return name_.c_str();
}

std::vector<Shape> Compiled::output_shapes(const std::vector<array>& inputs) {
  size_t nd = 0;
  for (auto& in : inputs) {
    nd = std::max(nd, in.ndim());
  }
  Shape out_shape(nd, 0);
  for (auto& in : inputs) {
    auto dd = nd - in.ndim();
    for (auto i = dd; i < nd; ++i) {
      out_shape[i] = std::max(out_shape[i], in.shape()[i - dd]);
    }
  }
  // All outputs have the same shape
  return std::vector<Shape>(outputs_.size(), out_shape);
}

namespace detail {

std::atomic<CompileMode>& compile_mode() {
  auto get_val = []() {
    if (std::getenv("MLX_DISABLE_COMPILE")) {
      return CompileMode::disabled;
    } else {
      return CompileMode::enabled;
    }
  };
  static std::atomic<CompileMode> compile_mode_ = get_val();
  return compile_mode_;
}

// Helper like below but only merges the two provided arrays. If the src has
// siblings then these won't be merged to the dst.
// Re-points every parent of `src` to `dst` and moves those parent records to
// `dst`'s entry in `parents_map`. Siblings of `src` are not touched. Does
// nothing when `src` has no entry in the map.
void merge_one(array& dst, array& src, ParentsMap& parents_map) {
  const auto src_id = src.id();
  auto src_it = parents_map.find(src_id);
  if (src_it == parents_map.end()) {
    return;
  }

  // operator[] below may insert a new entry and rehash the map, which
  // invalidates iterators but not references to elements. Hold on to a
  // reference and erase by key afterwards instead of reusing `src_it`.
  auto& src_parents = src_it->second;
  auto& pairs = parents_map[dst.id()];
  for (auto& parent : src_parents) {
    parent.first.inputs()[parent.second] = dst;
    pairs.push_back(parent);
  }

  // If src is a parent of dst, remove it from dst's parents
  for (auto it = pairs.begin(); it != pairs.end();) {
    if (it->first.id() == src_id) {
      it = pairs.erase(it);
    } else {
      ++it;
    }
  }

  // Remove the source from the map to avoid fusing with it again
  parents_map.erase(src_id);
}

// Helper that merges two arrays in the graph by setting the parents of the
// source to point to the destination. The arrays are assumed to be coming from
// equivalent primitives so their siblings are merged as well.
// Merges `src` into `dst` together with their siblings: output i of the source
// primitive is merged into output i of the destination primitive.
void merge(array& dst, array& src, ParentsMap& parents_map) {
  // outputs() lists the primitive's outputs in a canonical order
  auto src_outputs = src.outputs();
  auto dst_outputs = dst.outputs();
  const auto count = src_outputs.size();
  for (size_t idx = 0; idx < count; ++idx) {
    merge_one(dst_outputs[idx], src_outputs[idx], parents_map);
  }
}

// Creates a copy `y` of the operation that produces `x` (same shape, dtype,
// primitive and inputs) and splits the parents of `x` between the two.
//
// As written, every parent whose id IS in `divider` is re-pointed to the copy
// `y` and moved to `y`'s entry in `parents_map`; parents not in `divider`
// keep referring to `x`.
// NOTE(review): the previous comment stated the opposite assignment (parents
// in the divider keep `x`). Confirm against the callers which one is intended.
//
// Returns the copy `y`.
array split_one(
    const array& x,
    ParentsMap& parents_map,
    const std::unordered_set<uintptr_t>& divider) {
  array y(x.shape(), x.dtype(), x.primitive_ptr(), x.inputs());

  // operator[] creates the entries when missing
  auto& x_parents = parents_map[x.id()];
  auto& y_parents = parents_map[y.id()];

  for (auto it = x_parents.begin(); it != x_parents.end();) {
    if (divider.find(it->first.id()) != divider.end()) {
      // Rewire the parent's input slot to the copy and hand the record over
      it->first.inputs()[it->second] = y;
      y_parents.emplace_back(std::move(*it));
      it = x_parents.erase(it);
    } else {
      it++;
    }
  }

  return y;
}

// Returns the address of the plain function pointer stored in `fun`, or 0
// when `fun` does not hold a target of type T (*)(U...) (for example when it
// is empty or wraps a lambda object).
template <typename T, typename... U>
std::uintptr_t get_function_address(const std::function<T(U...)>& fun) {
  using RawFn = T (*)(U...);
  if (const RawFn* target = fun.template target<RawFn>()) {
    return reinterpret_cast<std::uintptr_t>(*target);
  }
  return 0;
}

class CompilerCache {
 public:
  struct CacheEntry {
    CacheEntry(Stream stream, bool shapeless)
        : stream(stream), shapeless(shapeless) {};
    Stream stream;
    bool shapeless;
    std::vector<array> inputs;
    std::vector<array> outputs;
    std::vector<array> tape;
    bool empty{true};
    std::vector<uint64_t> constants;
    std::shared_ptr<void> extra;
  };

  // Returns a reference to a CacheEntry which can be updated
  // by the caller to avoid copying large tapes / inputs / outputs
  CacheEntry& find(
      std::uintptr_t fun_id,
      const std::vector<array>& inputs,
      bool shapeless,
      const std::vector<uint64_t>& constants) {
    // Find the cache entries for |fun_id|.
    std::vector<CacheEntry>& entries = cache_[fun_id];

    // Compare if 2 arrays have same shape and dtype.
    auto has_same_shape_and_dtype = [shapeless](
                                        const std::vector<array>& in1,
                                        const std::vector<array>& in2) {
      if (in1.size() != in2.size()) {
        return false;
      }
      for (size_t i = 0; i < in1.size(); ++i) {
        if (in1[i].ndim() != in2[i].ndim()) {
          return false;
        }
        if (!shapeless && in1[i].shape() != in2[i].shape()) {
          return false;
        }
        if (in1[i].dtype() != in2[i].dtype()) {
          return false;
        }
      }
      return true;
    };
    // Loop over entries and check:
    // - Default stream and device match the entry's default stream
    // - Inputs match i.e. shapes and types must be equal.
    auto stream = default_stream(default_device());
    for (CacheEntry& entry : entries) {
      // Check that the default stream and device match
      if (entry.stream != stream) {
        continue;
      }
      if (entry.shapeless != shapeless) {
        continue;
      }

      // Check the inputs match and return if so
      if (has_same_shape_and_dtype(inputs, entry.inputs) &&
          constants == entry.constants) {
        return entry;
      }
    }
    // Otherwise append a new cache entry
    entries.push_back(CacheEntry{stream, shapeless});
    return entries.back();
  }

  void erase(std::uintptr_t fun_id) {
    cache_.erase(fun_id);
  }

  void clear() {
    cache_.clear();
  }

 private:
  CompilerCache() {
    // Make sure the allocator is fully
    // initialized before the compiler cache
    allocator::allocator();
  }

  friend CompilerCache& compiler_cache();
  std::unordered_map<std::uintptr_t, std::vector<CacheEntry>> cache_;
};

// Returns the calling thread's compiler cache, created on first use.
CompilerCache& compiler_cache() {
  thread_local static CompilerCache per_thread_cache;
  return per_thread_cache;
}

std::tuple<std::vector<array>, std::vector<array>, std::shared_ptr<void>>
compile_trace(
    const ArrayFnWithExtra& fun,
    const std::vector<array>& inputs,
    bool shapeless) {
  // Set the global tracing flag.
  detail::InTracing in_tracing{shapeless};

  // Run the function on placeholder inputs
  // to get compute graph
  std::vector<array> tracer_inputs;
  for (int i = 0; i < inputs.size(); ++i) {
    array in(inputs[i].shape(), inputs[i].dtype(), nullptr, {});
    in.set_tracer(true);
    tracer_inputs.push_back(std::move(in));
  }

  auto output = fun(tracer_inputs);
  return {tracer_inputs, output.first, output.second};
}

// Traverses the graph to build a tape and a map of array ids to their parents.
//
// The traversal starts from `outputs` and pushes an array onto the tape after
// its inputs have been visited. The tape and the parents map are then deep
// copied so that only the arrays in `inputs` and `outputs` (and the outputs'
// siblings) are shared with the traced graph; the inputs of `outputs` are
// re-pointed to the copies.
//
// Throws std::invalid_argument when the traversal reaches an array whose id
// matches one of `original_inputs`.
std::pair<std::vector<array>, ParentsMap> compile_dfs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const std::vector<array>& original_inputs) {
  std::vector<array> tape;
  std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>
      parents_map;
  {
    std::function<void(const array&)> recurse;
    std::unordered_set<std::uintptr_t> input_set;
    std::unordered_set<std::uintptr_t> original_input_set;
    // Indexes both vectors with the same index, so `original_inputs` must have
    // at least as many elements as `inputs`.
    for (int i = 0; i < inputs.size(); ++i) {
      input_set.insert(inputs[i].id());
      original_input_set.insert(original_inputs[i].id());
    }

    // DFS the graph to build the tape, and log parents and scalars
    std::unordered_set<std::uintptr_t> cache;
    recurse = [&](const array& a) {
      auto id = a.id();
      if (original_input_set.find(id) != original_input_set.end()) {
        throw std::invalid_argument(
            "[compile] Attempting to compile a function with uncaptured inputs is not allowed.");
      }
      // Already visited (directly or as a sibling)
      if (cache.find(id) != cache.end()) {
        return;
      }
      for (int i = 0; i < a.inputs().size(); i++) {
        auto& in = a.inputs()[i];
        // Record `a` and each of its siblings as parents of input slot i
        parents_map[in.id()].push_back({a, i});
        for (auto& s : a.siblings()) {
          parents_map[in.id()].push_back({s, i});
        }
        // Don't recurse on inputs (but add them to the tape for the purpose
        // of future optimizations)
        // Note that the test is on `a`, the array being visited, not on `in`.
        if (input_set.find(a.id()) == input_set.end()) {
          recurse(in);
        }
      }
      cache.insert(id);
      for (auto& s : a.siblings()) {
        cache.insert(s.id());
      }
      // Only `a` goes on the tape; its siblings are just marked as visited
      tape.push_back(a);
    };
    for (auto& a : outputs) {
      recurse(a);
    }
  }

  // Deep copy the tape and parents map while preserving inputs and outputs
  std::vector<array> new_tape;
  std::unordered_set<uintptr_t> io_set;
  std::unordered_map<uintptr_t, array> old_to_new;
  // Inputs, outputs and the outputs' siblings map to themselves
  for (auto& o : outputs) {
    old_to_new.insert({o.id(), o});
    io_set.insert(o.id());
    for (auto& s : o.siblings()) {
      old_to_new.insert({s.id(), s});
      io_set.insert(s.id());
    }
  }
  for (auto& i : inputs) {
    io_set.insert(i.id());
    old_to_new.insert({i.id(), i});
  }

  new_tape.reserve(tape.size());
  for (auto& arr : tape) {
    // Arrays without a primitive and the i/o arrays are kept as they are
    if (!arr.has_primitive() || (io_set.find(arr.id()) != io_set.end())) {
      old_to_new.insert({arr.id(), arr});
      new_tape.push_back(arr);
      continue;
    }
    // The tape lists inputs before their consumers, so the copies of this
    // array's inputs are already in old_to_new
    std::vector<array> inputs;
    inputs.reserve(arr.inputs().size());
    for (auto& i : arr.inputs()) {
      inputs.push_back(old_to_new.find(i.id())->second);
    }
    if (arr.siblings().size() > 0) {
      // Multi-output primitive: recreate all outputs together
      std::vector<Dtype> types;
      std::vector<Shape> shapes;
      auto out = arr.outputs();
      for (auto& o : out) {
        types.push_back(o.dtype());
        shapes.push_back(o.shape());
      }
      auto as = array::make_arrays(
          std::move(shapes), types, arr.primitive_ptr(), std::move(inputs));
      for (int i = 0; i < out.size(); ++i) {
        old_to_new.insert({out[i].id(), as[i]});
      }
      // Keep the copy at the same position among the outputs as `arr`
      new_tape.push_back(as[arr.sibling_position()]);
    } else {
      auto a = array(
          arr.shape(), arr.dtype(), arr.primitive_ptr(), std::move(inputs));
      old_to_new.insert({arr.id(), a});
      new_tape.push_back(a);
    }
  }
  // Re-point the inputs of the outputs (and of their siblings) to the copies.
  // io_set is reused here to skip outputs that were already processed.
  io_set.clear();
  for (auto& o : outputs) {
    if (!(io_set.insert(o.id()).second)) {
      continue;
    }
    for (auto& i : o.inputs()) {
      i = old_to_new.find(i.id())->second;
    }
    for (auto& s : o.siblings()) {
      io_set.insert(s.id());
      for (auto& i : s.inputs()) {
        i = old_to_new.find(i.id())->second;
      }
    }
  }
  tape = std::move(new_tape);

  // Rebuild the parents map with the new ids as keys and the copies as parents
  std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>
      new_parents_map;
  for (auto& [id, vec] : parents_map) {
    for (auto& [a, _] : vec) {
      a = old_to_new.find(a.id())->second;
    }
    new_parents_map[old_to_new.find(id)->second.id()] = std::move(vec);
  }
  parents_map = std::move(new_parents_map);
  return {tape, parents_map};
}

// 64-bit mixing function: adds a constant to `x` and scrambles the result
// with two xor-shift / multiply rounds and a final xor-shift.
static inline uint64_t splitmix64(uint64_t x) noexcept {
  constexpr uint64_t increment = 0x9e3779b97f4a7c15ull;
  constexpr uint64_t first_multiplier = 0xbf58476d1ce4e5b9ull;
  constexpr uint64_t second_multiplier = 0x94d049bb133111ebull;

  uint64_t z = x + increment;
  z ^= z >> 30;
  z *= first_multiplier;
  z ^= z >> 27;
  z *= second_multiplier;
  z ^= z >> 31;
  return z;
}

// Hash functor for vectors of 64-bit integers. The seed depends on the vector
// length and every element is folded in through splitmix64.
struct VecU64Hash {
  size_t operator()(const std::vector<uint64_t>& s) const noexcept {
    constexpr uint64_t golden = 0x9e3779b97f4a7c15ull;
    constexpr uint64_t seed = 0x243f6a8885a308d3ull;

    // The multiplication binds tighter than the xor.
    uint64_t state = seed ^ (static_cast<uint64_t>(s.size()) * golden);
    for (const uint64_t word : s) {
      state = splitmix64(word ^ splitmix64(state + golden));
    }
    return static_cast<size_t>(state);
  }
};

// Simplify the tape. Note, this function modifies in place the tape, the
// parents map (to remove orphaned arrays), and potentially the outputs.
//
// It runs three stages in order:
//   1. merge available scalars that have the same bits and dtype;
//   2. remove no-op primitives (see is_noop) by wiring their parents to the
//      no-op's input;
//   3. `passes` rounds of merging parents that perform equivalent operations
//      on the same inputs, dropping arrays that end up with no parents and
//      are not outputs.
void compile_simplify(
    std::vector<array>& tape,
    ParentsMap& parents_map,
    std::vector<array>& outputs,
    int passes) {
  // Helpers to identify identical scalars
  std::map<std::pair<uint64_t, Dtype::Val>, array> scalars;
  auto is_scalar = [](const array& a) {
    // Condition for when it's safe to read an array
    return a.is_available() && a.ndim() == 0;
  };
  // Key for a scalar: its raw bits widened to 64 bits, plus its dtype. Item
  // sizes other than 1, 2, 4 or 8 bytes leave the bits at 0.
  auto get_scalar_rep = [](const array& a) {
    uint64_t v = 0;
    switch (a.dtype().size()) {
      case 1:
        v = *a.data<uint8_t>();
        break;
      case 2:
        v = *a.data<uint16_t>();
        break;
      case 4:
        v = *a.data<uint32_t>();
        break;
      case 8:
        v = *a.data<uint64_t>();
        break;
    }
    return std::make_pair(v, a.dtype().val());
  };

  // insert() keeps the first array seen for each key; that array becomes the
  // representative the duplicates are merged into.
  for (auto& a : tape) {
    if (is_scalar(a)) {
      scalars.insert({get_scalar_rep(a), a});
    }
  }

  // Depth-1 array equivalence check.
  // Arrays with the same primitive id are reported as not equivalent.
  auto array_equivalent = [](const array& a, const array& b) {
    if (!a.has_primitive() || !b.has_primitive()) {
      return false;
    }
    if (a.primitive_id() == b.primitive_id()) {
      return false;
    }
    const auto& pa = a.primitive();
    const auto& pb = b.primitive();
    if (typeid(pa) != typeid(pb)) {
      return false;
    }

    if (a.inputs().size() != b.inputs().size()) {
      return false;
    }

    // The inputs must be the very same arrays, in the same order
    for (int i = 0; i < a.inputs().size(); i++) {
      if (a.inputs()[i].id() != b.inputs()[i].id()) {
        return false;
      }
    }

    return pa.is_equivalent(pb);
  };

  // Merge scalars
  std::vector<array> new_tape;
  for (auto& arr : tape) {
    // Check if we can merge scalars
    if (is_scalar(arr)) {
      auto scalar = scalars.find(get_scalar_rep(arr));
      if (scalar->second.id() != arr.id()) {
        merge(scalar->second, arr, parents_map);
        // Don't keep orphaned scalars in the tape
        continue;
      }
    }
    new_tape.push_back(std::move(arr));
  }
  // NOTE(review): new_tape is appended to again below after being moved from;
  // this relies on a moved-from vector being empty.
  tape = std::move(new_tape);

  // Remove no-ops
  {
    // Maps each output id to the array that should replace it
    std::unordered_map<uintptr_t, array> output_map;
    for (auto& o : outputs) {
      output_map.insert({o.id(), o});
    }
    for (auto& arr : tape) {
      if (!arr.has_primitive() || !is_noop(arr.primitive())) {
        new_tape.push_back(std::move(arr));
        continue;
      }
      // Parents of the no-op now read its input directly
      merge_one(arr.inputs()[0], arr, parents_map);
      // A no-op that is an output is replaced by its input
      if (auto it = output_map.find(arr.id()); it != output_map.end()) {
        it->second = arr.inputs()[0];
      }
    }
    tape = std::move(new_tape);
    for (auto& o : outputs) {
      o = output_map.at(o.id());
    }
  }

  // Position of each array in the tape, used to decide which of two
  // equivalent arrays is kept. It is computed once, before the passes.
  std::unordered_map<std::uintptr_t, uint32_t> tape_order;
  for (uint32_t i = 0; i < tape.size(); ++i) {
    tape_order.insert({tape[i].id(), i});
  }

  std::unordered_set<uintptr_t> output_set;
  for (auto& o : outputs) {
    output_set.insert(o.id());
  }

  // Multi-pass merge only keeping non-orphaned arrays in the tape
  for (int pass = 0; pass < passes; ++pass) {
    for (auto& arr : tape) {
      // Helper to check if we can merge the parents of the
      // given array
      // Returns true only when `a` has no entry in the parents map and is not
      // an output, i.e. when it can be discarded.
      auto maybe_merge_parents = [&](auto& a) {
        auto parents = parents_map.find(a.id());
        if (parents != parents_map.end()) {
          auto N = parents->second.size();
          // mask[i] is set once parent i has been merged into another parent
          std::vector<bool> mask(N, false);

          auto try_merge = [&](int dst_idx, int src_idx) {
            // Keep the parent that comes first in the tape as destination
            if (tape_order[parents->second[src_idx].first.id()] <
                tape_order[parents->second[dst_idx].first.id()]) {
              std::swap(src_idx, dst_idx);
            }
            auto& src = parents->second[src_idx].first;
            auto& dst = parents->second[dst_idx].first;
            // Outputs are never merged away
            if (src.id() != dst.id() && array_equivalent(src, dst) &&
                output_set.find(src.id()) == output_set.end()) {
              merge(dst, src, parents_map);
              mask[src_idx] = true;
            }
          };

          if (N > 100) {
            // With many parents, bucket them by (input ids, input count,
            // primitive type) first so that only parents in the same bucket
            // are compared pairwise.
            std::unordered_map<
                std::vector<uint64_t>,
                std::vector<int>,
                VecU64Hash>
                dst_map;
            // Find possibly mergeable groups
            for (int i = 0; i < N; i++) {
              // Make the hash key
              std::vector<uint64_t> key;
              auto& curr = parents->second[i].first;
              key.reserve(curr.inputs().size() + 2);
              for (auto& in : curr.inputs()) {
                key.push_back(in.id());
              }
              auto& p = curr.primitive();
              key.push_back(curr.inputs().size());
              key.push_back(typeid(p).hash_code());
              auto it = dst_map.find(key);
              if (it == dst_map.end()) {
                bool _;
                std::tie(it, _) = dst_map.insert({key, std::vector<int>{}});
              }
              it->second.push_back(i);
            }
            for (auto& [_, group] : dst_map) {
              for (int i = 0; i < group.size(); ++i) {
                if (mask[group[i]]) {
                  continue;
                }
                for (int j = i + 1; j < group.size(); ++j) {
                  if (mask[group[j]]) {
                    continue;
                  }
                  try_merge(group[i], group[j]);
                }
              }
            }
          } else {
            // Few parents: compare every pair directly
            for (int i = 0; i < N; ++i) {
              if (mask[i]) {
                continue;
              }
              for (int j = i + 1; j < N; ++j) {
                if (mask[j]) {
                  continue;
                }
                try_merge(i, j);
              }
            }
          }

          // Erase orphaned parents so we don't keep fusing with them
          // Going from the back keeps the remaining indices valid.
          for (int i = N - 1; i >= 0; --i) {
            if (mask[i]) {
              parents->second.erase(parents->second.begin() + i);
            }
          }
          return false;
        } else {
          return output_set.find(a.id()) == output_set.end();
        }
      };
      bool discard = maybe_merge_parents(arr);
      for (auto& s : arr.siblings()) {
        discard &= maybe_merge_parents(s);
      }
      // If an array and its siblings have no parents, and none of them are
      // outputs, it is safe to remove it from the tape
      if (!discard) {
        new_tape.push_back(std::move(arr));
      }
    }
    tape = std::move(new_tape);
  }
}

// Extract sub-graphs of the graph that can be compiled
// and replace them with a Compiled Primitive.
//
// Arguments:
//  - tape: replaced on return by a new tape where every fused section is
//    represented by one array produced by a Compiled primitive.
//  - parents_map: updated in place while arrays are fused.
//  - inputs: the traced inputs. Only their ids are used, to tell real inputs
//    apart from scalar constants.
//  - outputs: each entry is replaced by the compiled array that took its
//    place (entries that were not fused are left unchanged).
//
// Throws std::runtime_error if the outputs collected for one fused section
// do not all have the same shape.
void compile_fuse(
    std::vector<array>& tape,
    ParentsMap& parents_map,
    const std::vector<array>& inputs,
    std::vector<array>& outputs) {
  // Track outputs to replace with new compiled outputs
  std::unordered_map<uintptr_t, array> output_map;
  for (auto& o : outputs) {
    output_map.insert({o.id(), o});
  }

  // Set of inputs to distinguish constants
  std::unordered_set<uintptr_t> input_ids;
  for (auto& in : inputs) {
    input_ids.insert(in.id());
  }

  // Go through the tape in reverse order and check for fusable sub-graphs
  std::vector<array> new_tape;
  // Ids of arrays that were already absorbed into an earlier fused section
  std::unordered_set<uintptr_t> global_cache;
  for (int i = tape.size() - 1; i >= 0; --i) {
    auto& arr = tape[i];

    // Already compiled
    if (global_cache.find(arr.id()) != global_cache.end()) {
      continue;
    }

    // Two pass recursion:
    // First pass:
    //  - Collect all the primitives which we can fuse with
    //  - Keeps a cache of fusable primitives which may be added out of
    //    DAG order. We have to determine if all of a fused primitive's
    //    outputs are also in the fused section, and this may not be the
    //    case the first time we visit it.
    // Second pass:
    //  - Collect inputs to the new compiled primitive
    //  - Add fusable primitives to a tape in the correct order

    std::function<void(const array&, int, const Stream&, const Shape&)> recurse;
    // `cache` holds the ids selected for fusion in this iteration and
    // `input_set` the ids currently considered inputs of the fused section
    std::unordered_set<uintptr_t> cache;
    std::unordered_set<uintptr_t> input_set;
    recurse = [&](const array& a,
                  int depth,
                  const Stream& s,
                  const Shape& shape) {
      if (cache.find(a.id()) != cache.end()) {
        return;
      }

      // Stop fusing if:
      // - Depth limit exceeded
      // - Constant input
      // - Stream mismatch
      // - Non fusable primitive
      // - Is global output but has a different shape
      if (depth >= max_compile_depth || !a.has_primitive() ||
          a.primitive().stream() != s || !is_fusable(a.primitive()) ||
          (output_map.find(a.id()) != output_map.end() && a.shape() != shape)) {
        // Possible input
        input_set.insert(a.id());
        return;
      }

      bool all_parents_in = true;
      if (depth > 0) {
        // Guaranteed to have a parent since nested in the
        // recursion.
        auto& parents = parents_map.at(a.id());
        for (auto& [p, idx] : parents) {
          auto in_cache = cache.find(p.id()) != cache.end();
          if (!in_cache) {
            all_parents_in = false;
            break;
          }
        }
      }

      // Arrays with a mix of parents outside the compilable section
      // are not fusable except for broadcast which we can split to avoid
      // stopping fusion
      if (!all_parents_in) {
        if (a.has_primitive() && is_broadcast(a.primitive()) &&
            input_set.size() < max_compile_arrays) {
          array b = split_one(a, parents_map, cache);
          recurse(b, depth, s, shape);
        } else {
          // Possible input
          input_set.insert(a.id());
        }
        return;
      }

      // A fused array that is also a global output still counts towards the
      // array limit checked just below
      if (output_map.find(a.id()) != output_map.end()) {
        input_set.insert(a.id());
      } else {
        // Not an input anymore since fusing it
        input_set.erase(a.id());
      }
      if (input_set.size() >= max_compile_arrays) {
        return;
      }
      cache.insert({a.id()});

      for (auto& in : a.inputs()) {
        recurse(in, depth + 1, s, shape);
      }
    };

    // This will be the result of the fused operation so it needs
    //   a) to not be already computed ie have a primitive
    //   b) that primitive to not be a broadcast since it will unnecessarily
    //      cast to a contiguous array potentially blowing up memory
    if (arr.has_primitive() && !is_broadcast(arr.primitive())) {
      Stream s = arr.primitive().stream();
      recurse(arr, 0, s, arr.shape());
    }

    // Not worth fusing a single primitive
    if (cache.size() <= 1) {
      new_tape.push_back(arr);
      continue;
    }

    // Recurse a second time to build the tape in the right
    // order and collect the inputs
    input_set.clear();
    // NOTE: this local shadows the function parameter `inputs` for the rest
    // of the iteration; it holds the inputs of the fused section only.
    std::vector<array> inputs;
    std::vector<array> fused_tape;
    std::unordered_set<uintptr_t> tape_set;
    std::function<void(const array&)> recurse_tape;
    recurse_tape = [&](const array& a) {
      // Anything outside the fused set is an input, recorded once in
      // first-visit order
      if (cache.find(a.id()) == cache.end()) {
        if (input_set.find(a.id()) == input_set.end()) {
          input_set.insert(a.id());
          inputs.push_back(a);
        }
        return;
      }
      if (tape_set.find(a.id()) != tape_set.end()) {
        return;
      }
      tape_set.insert(a.id());
      for (auto& in : a.inputs()) {
        recurse_tape(in);
      }
      // Pushed after its inputs, so the fused tape is in evaluation order
      fused_tape.push_back(a);
    };
    recurse_tape(arr);

    std::vector<array> old_outputs;
    // Add to global cache and add any global outputs to outputs
    // of new primitive
    // The last entry of fused_tape is `arr` itself (pushed after all of its
    // inputs), so it is skipped here and appended to old_outputs below.
    for (int j = 0; j < fused_tape.size() - 1; ++j) {
      auto& f = fused_tape[j];
      if (output_map.find(f.id()) != output_map.end()) {
        old_outputs.push_back(f);
        // Parents are now siblings, update the parent map
        auto& pairs = parents_map[f.id()];
        pairs.erase(
            std::remove_if(
                pairs.begin(),
                pairs.end(),
                [&](auto& p) {
                  return cache.find(p.first.id()) != cache.end();
                }),
            pairs.end());
      } else {
        // Remove inner fused arrays parents from the parents map
        // to keep the parents map in a valid state
        parents_map.erase(f.id());
      }
      global_cache.insert({f.id()});
    }
    old_outputs.push_back(arr);

    // Every output of the fused section must have the shape of `arr`
    std::vector<Shape> shapes;
    std::vector<Dtype> types;
    for (auto& o : old_outputs) {
      if (o.shape() != old_outputs.back().shape()) {
        throw std::runtime_error(
            "[compile] Compilation failed. Tried to fuse operations with different output shapes");
      }
      shapes.push_back(o.shape());
      types.push_back(o.dtype());
    }
    std::unordered_set<uintptr_t> constant_ids;
    for (auto& in : inputs) {
      // Scalar constant
      // (one element, no primitive, and not one of the traced inputs)
      if (in.size() == 1 && !in.has_primitive() &&
          input_ids.find(in.id()) == input_ids.end()) {
        constant_ids.insert(in.id());
      }
    }
    auto compiled_outputs = array::make_arrays(
        std::move(shapes),
        types,
        std::make_shared<Compiled>(
            old_outputs.back().primitive().stream(),
            inputs,
            old_outputs,
            std::move(fused_tape),
            std::move(constant_ids)),
        inputs);

    // One output per primitive
    new_tape.push_back(compiled_outputs.back());

    // Replace inputs old parents with compiled_outputs
    // (this `i` shadows the tape index of the enclosing loop)
    for (int i = 0; i < inputs.size(); ++i) {
      auto& pairs = parents_map[inputs[i].id()];
      pairs.erase(
          std::remove_if(
              pairs.begin(),
              pairs.end(),
              [&](auto& p) { return cache.find(p.first.id()) != cache.end(); }),
          pairs.end());
      for (auto& o : compiled_outputs) {
        pairs.push_back({o, i});
      }
    }

    // - Update outputs parents to point to compiled outputs
    // - Update any overall graph outputs to be compiled outputs
    for (int o = 0; o < old_outputs.size(); ++o) {
      merge_one(compiled_outputs[o], old_outputs[o], parents_map);
      if (auto it = output_map.find(old_outputs[o].id());
          it != output_map.end()) {
        it->second = compiled_outputs[o];
      }
    }
  }

  // The tape was walked back to front, so restore evaluation order
  std::reverse(new_tape.begin(), new_tape.end());
  tape = std::move(new_tape);

  // Replace output with potentially compiled output
  for (auto& o : outputs) {
    o = output_map.at(o.id());
  }
}

std::vector<array> compile_replace(
    const std::vector<array>& tape,
    const std::vector<array>& trace_inputs,
    const std::vector<array>& trace_outputs,
    const std::vector<array>& inputs,
    bool shapeless) {
  std::unordered_map<uintptr_t, array> trace_to_real;
  for (int i = 0; i < inputs.size(); ++i) {
    trace_to_real.insert({trace_inputs[i].id(), inputs[i]});
  }

  auto is_load = [](const Primitive& p) { return typeid(p) == typeid(Load); };

  for (auto& a : tape) {
    // Arrays in the tape without primitives are either:
    // - inputs, which are already in the map
    // - constants, which can be used directly
    // - a load primitive which has no inputs and will become a constant
    //   after the first eval
    if (!a.has_primitive() || is_load(a.primitive())) {
      trace_to_real.insert({a.id(), a});
    } else {
      // Find real inputs
      std::vector<array> real_inputs;
      for (auto& in : a.inputs()) {
        real_inputs.push_back(trace_to_real.at(in.id()));
      }
      if (a.siblings().empty()) {
        auto shape =
            shapeless ? a.primitive().output_shapes(real_inputs)[0] : a.shape();
        auto real_a = array(
            std::move(shape),
            a.dtype(),
            a.primitive_ptr(),
            std::move(real_inputs));
        trace_to_real.insert({a.id(), std::move(real_a)});
      } else {
        // Ensure the order is correct for multi-output primitives
        std::vector<Dtype> types;
        auto trace_out = a.outputs();
        for (auto& o : trace_out) {
          types.push_back(o.dtype());
        }
        std::vector<Shape> shapes;
        if (shapeless) {
          shapes = a.primitive().output_shapes(real_inputs);
        } else {
          for (auto& o : trace_out) {
            shapes.push_back(o.shape());
          }
        }
        auto real_out = array::make_arrays(
            std::move(shapes), types, a.primitive_ptr(), real_inputs);
        for (int i = 0; i < trace_out.size(); ++i) {
          trace_to_real.insert({trace_out[i].id(), std::move(real_out[i])});
        }
      }
    }
  }

  std::vector<array> outputs;
  for (auto& o : trace_outputs) {
    outputs.push_back(trace_to_real.at(o.id()));
  }
  return outputs;
}

// True when compilation should be bypassed: either the compile mode is
// `disabled` or compilation is not available for the default device.
bool skip_compile() {
  if (compile_mode() == CompileMode::disabled) {
    return true;
  }
  return !compile_available_for_device(default_device());
}

// Wrap `fun` in a function that traces, simplifies and fuses its graph the
// first time it is called with a given set of inputs and replays the cached
// tape afterwards.
//
// - fun_id is the key under which compiled entries are stored in the
//   compiler cache.
// - shapeless and constants are forwarded to the cache lookup.
//
// Returns `fun` unchanged when compilation is skipped. Throws
// std::invalid_argument when `fun` is empty.
ArrayFnWithExtra compile(
    ArrayFnWithExtra fun,
    std::uintptr_t fun_id,
    bool shapeless /* = false */,
    std::vector<uint64_t> constants /* = {} */) {
  if (skip_compile()) {
    return fun;
  }
  if (!fun) {
    throw std::invalid_argument(
        "[compile] Cannot compile a function without a target.");
  }

  return [fun = std::move(fun),
          fun_id,
          shapeless,
          constants = std::move(constants)](const std::vector<array>& inputs) {
    // If any input is a tracer, trace the original graph instead
    for (auto& in : inputs) {
      if (in.is_tracer()) {
        return fun(inputs);
      }
    }

    // Look up the cache entry matching these inputs
    auto& entry = compiler_cache().find(fun_id, inputs, shapeless, constants);

    // An empty entry means nothing was compiled for these inputs yet
    if (entry.empty) {
      // Flag the entry as filled before doing the work
      entry.empty = false;
      // The closure is not mutable, so its captured constants are copied
      entry.constants = constants;

      // Trace the function to build the graph
      std::tie(entry.inputs, entry.outputs, entry.extra) =
          compile_trace(fun, inputs, shapeless);

      // DFS the graph to get the tape and the map from array id to
      // (parent, position in the parent's inputs)
      ParentsMap parents_map;
      std::tie(entry.tape, parents_map) =
          compile_dfs(entry.inputs, entry.outputs, inputs);

      const auto mode = compile_mode().load();

      // Simplification pass
      if (mode != CompileMode::no_simplify) {
        compile_simplify(
            entry.tape, parents_map, entry.outputs, /* passes */ 3);
      }

      // Kernel fusion: produces Compiled primitives and rewrites the tape
      // and the outputs accordingly
      if (mode != CompileMode::no_fuse) {
        compile_fuse(entry.tape, parents_map, entry.inputs, entry.outputs);
      }
    }

    // The entry has a tape by now: swap the placeholders for real arrays
    // that can be evaluated
    auto real_outputs = compile_replace(
        entry.tape, entry.inputs, entry.outputs, inputs, shapeless);
    return ArraysAndExtra{std::move(real_outputs), entry.extra};
  };
}

// Overload for functions that only return arrays. The function is adapted to
// the ArrayFnWithExtra form (with a null extra), compiled through that
// overload, and the extra is dropped again from the result.
//
// Returns `fun` unchanged when compilation is skipped. Throws
// std::invalid_argument when `fun` is empty.
std::function<std::vector<array>(const std::vector<array>&)> compile(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    std::uintptr_t fun_id,
    bool shapeless /* = false */,
    std::vector<uint64_t> constants /* = {} */) {
  if (skip_compile()) {
    return fun;
  }
  if (!fun) {
    throw std::invalid_argument(
        "[compile] Cannot compile a function without a target.");
  }

  // Adapter: same outputs, no extra payload
  ArrayFnWithExtra with_extra =
      [fun = std::move(fun)](const std::vector<array>& args) {
        return ArraysAndExtra{fun(args), nullptr};
      };

  auto compiled =
      compile(std::move(with_extra), fun_id, shapeless, std::move(constants));

  // Strip the extra so the caller sees the original signature
  return [compiled = std::move(compiled)](const std::vector<array>& args) {
    return compiled(args).first;
  };
}

// Erase what the compiler cache holds for the function id `fun_id`.
void compile_erase(std::uintptr_t fun_id) {
  detail::compiler_cache().erase(fun_id);
}

// Clear the whole compiler cache.
void compile_clear_cache() {
  detail::compiler_cache().clear();
}

} // namespace detail

// Compile `fun` and return the compiled function.
//
// The compiler cache is keyed on a function id:
//  - if `fun` has an addressable target (detail::get_function_address returns
//    a non-zero value) that address is the id;
//  - otherwise `fun` is moved into a heap-allocated std::function whose
//    address is the id. The shared_ptr deleter erases the cache entries for
//    that id before freeing the wrapper, so they go away with the last copy
//    of the returned function.
//
// Returns `fun` unchanged when compilation is skipped. Throws
// std::invalid_argument when `fun` is empty.
std::function<std::vector<array>(const std::vector<array>&)> compile(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    bool shapeless /* false */) {
  if (detail::skip_compile()) {
    return fun;
  }
  // Reject an empty function here. On the non-addressable path below it would
  // be hidden inside a non-empty wrapper lambda, so the same check in
  // detail::compile could not catch it.
  if (!fun) {
    throw std::invalid_argument(
        "[compile] Cannot compile a function without a target.");
  }
  auto fun_id = detail::get_function_address(fun);
  if (fun_id) {
    // If the function has an addressable target then no need to manage its
    // lifetime
    return detail::compile(std::move(fun), fun_id, shapeless);
  }

  using Fn = std::function<std::vector<array>(const std::vector<array>&)>;
  // `fun` is not used after this point, so move it instead of copying the
  // wrapped callable
  auto pfun = std::shared_ptr<Fn>(new Fn{std::move(fun)}, [](auto* p) {
    detail::compile_erase(reinterpret_cast<std::uintptr_t>(p));
    delete p;
  });
  fun_id = reinterpret_cast<std::uintptr_t>(pfun.get());
  return detail::compile(
      [pfun = std::move(pfun)](const auto& inputs) {
        return (*pfun)(inputs);
      },
      fun_id,
      shapeless);
}

// Compile a plain function pointer. The pointer value itself is used as the
// function id for the compiler cache.
std::function<std::vector<array>(const std::vector<array>&)> compile(
    std::vector<array> (*fun)(const std::vector<array>&),
    bool shapeless /* = false */) {
  if (detail::skip_compile()) {
    return fun;
  }
  const auto fun_id = reinterpret_cast<std::uintptr_t>(fun);
  return detail::compile(fun, fun_id, shapeless);
}

// Set the global compile mode to CompileMode::disabled.
void disable_compile() {
  detail::compile_mode() = CompileMode::disabled;
}

// Set the global compile mode to CompileMode::enabled.
void enable_compile() {
  detail::compile_mode() = CompileMode::enabled;
}

// Set the global compile mode to `mode`.
void set_compile_mode(CompileMode mode) {
  detail::compile_mode() = mode;
}

} // namespace mlx::core


================================================
FILE: mlx/compile.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include "mlx/api.h"
#include "mlx/array.h"

namespace mlx::core {

/**
 * Global compilation mode.
 *  - disabled: compile() hands back the function it was given.
 *  - no_simplify: the graph simplification pass is skipped.
 *  - no_fuse: the kernel fusion pass is skipped.
 *  - enabled: both passes run.
 */
enum class CompileMode { disabled, no_simplify, no_fuse, enabled };

/** Compile takes a function and returns a compiled function. */
MLX_API std::function<std::vector<array>(const std::vector<array>&)> compile(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    bool shapeless = false);

/** Overload of compile() for plain function pointers. */
MLX_API std::function<std::vector<array>(const std::vector<array>&)> compile(
    std::vector<array> (*fun)(const std::vector<array>&),
    bool shapeless = false);

// Convert capture-less lambdas to function pointers.
// Only participates in overload resolution when F converts to the type of
// `+f`; the unary plus is what turns the lambda into a function pointer, so
// the call below lands on the function pointer overload.
template <
    typename F,
    typename = std::enable_if_t<
        std::is_convertible_v<F, decltype(+std::declval<F>())>>>
std::function<std::vector<array>(const std::vector<array>&)> compile(
    F&& f,
    bool shapeless = false) {
  return compile(+f, shapeless);
}

/** Globally disable compilation.
 * Setting the environment variable ``MLX_DISABLE_COMPILE`` can also
 * be used to disable compilation.
 */
MLX_API void disable_compile();

/** Globally enable compilation.
 * This will override the environment variable ``MLX_DISABLE_COMPILE``.
 */
MLX_API void enable_compile();

/** Set the compiler mode to the given value. */
MLX_API void set_compile_mode(CompileMode mode);
} // namespace mlx::core


================================================
FILE: mlx/compile_impl.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <unordered_map>

#include "mlx/api.h"
#include "mlx/array.h"

namespace mlx::core::detail {

// Output arrays of a function together with an opaque, type-erased payload
// that is carried alongside them.
using ArraysAndExtra = std::pair<std::vector<array>, std::shared_ptr<void>>;
// A function from input arrays to ArraysAndExtra.
using ArrayFnWithExtra =
    std::function<ArraysAndExtra(const std::vector<array>&)>;

// This is not part of the general C++ API as calling with a bad id is a bad
// idea.
// `fun_id` identifies the function in the compiler cache.
MLX_API std::function<std::vector<array>(const std::vector<array>&)> compile(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    std::uintptr_t fun_id,
    bool shapeless = false,
    std::vector<uint64_t> constants = {});

// Same as above for functions that also return an extra payload. Note that
// this overload has no default arguments.
MLX_API ArrayFnWithExtra compile(
    ArrayFnWithExtra fun,
    std::uintptr_t fun_id,
    bool shapeless,
    std::vector<uint64_t> constants);

// Erase cached compile functions
MLX_API void compile_erase(std::uintptr_t fun_id);

// Clear the compiler cache causing a recompilation of all compiled functions
// when called again.
MLX_API void compile_clear_cache();

// Whether compilation can be used with the given device.
bool compile_available_for_device(const Device& device);

// Trace `fun` with `inputs`. The tuple holds, in order, the traced inputs,
// the traced outputs and the extra payload returned by `fun`.
std::tuple<std::vector<array>, std::vector<array>, std::shared_ptr<void>>
compile_trace(
    const ArrayFnWithExtra& fun,
    const std::vector<array>& inputs,
    bool shapeless);

// Array id -> list of (parent array, position of the array in the parent's
// inputs).
using ParentsMap =
    std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>;

// Traverses the graph to build a tape and a map of array ids to their parents
std::pair<std::vector<array>, ParentsMap> compile_dfs(
    const std::vector<array>& inputs,
    std::vector<array>& outputs,
    const std::vector<array>& original_inputs);

// Simplify the tape.
// `tape`, `parents_map` and `outputs` are taken by non-const reference;
// `passes` is presumably the number of simplification passes to run.
void compile_simplify(
    std::vector<array>& tape,
    ParentsMap& parents_map,
    std::vector<array>& outputs,
    int passes);

// Rebuild the traced graph described by `tape` on top of the real `inputs`
// (which take the place of `trace_inputs`) and return the arrays that
// correspond to `trace_outputs`.
std::vector<array> compile_replace(
    const std::vector<array>& tape,
    const std::vector<array>& trace_inputs,
    const std::vector<array>& trace_outputs,
    const std::vector<array>& inputs,
    bool shapeless);

// NOTE(review): presumably checks that the tape can be used in shapeless
// mode; the definition is not visible from this header, confirm there.
void compile_validate_shapeless(const std::vector<array>& tape);

} // namespace mlx::core::detail


================================================
FILE: mlx/device.cpp
================================================
// Copyright © 2023-2026 Apple Inc.

#include <stdexcept>

#include "mlx/backend/cpu/device_info.h"
#include "mlx/backend/gpu/device_info.h"
#include "mlx/device.h"

namespace mlx::core {

// Mutable access to the default device. It is created on first use: the gpu
// when a gpu backend is available, the cpu otherwise.
Device& mutable_default_device() {
  static Device device{gpu::is_available() ? Device::gpu : Device::cpu};
  return device;
}

// Read-only view of the current default device.
const Device& default_device() {
  return mutable_default_device();
}

// Make `d` the default device.
// Throws std::invalid_argument when `d` is the gpu but no gpu backend is
// available.
void set_default_device(const Device& d) {
  const bool gpu_backend_missing = !gpu::is_available();
  if (gpu_backend_missing && d == Device::gpu) {
    throw std::invalid_argument(
        "[set_default_device] Cannot set gpu device without gpu backend.");
  }
  mutable_default_device() = d;
}

// Two devices are equal when both their type and their index match.
bool operator==(const Device& lhs, const Device& rhs) {
  if (lhs.type != rhs.type) {
    return false;
  }
  return lhs.index == rhs.index;
}

// Negation of operator==.
bool operator!=(const Device& lhs, const Device& rhs) {
  return !(lhs == rhs);
}

// A device is available when the backend for its type is available and its
// index is below that backend's device count.
bool is_available(const Device& d) {
  if (d.type == Device::cpu) {
    return cpu::is_available() && (d.index < cpu::device_count());
  }
  if (d.type == Device::gpu) {
    return gpu::is_available() && (d.index < gpu::device_count());
  }
  // Neither cpu nor gpu
  return false;
}

// Number of devices reported by the backend for the given device type.
int device_count(Device::DeviceType type) {
  if (type == Device::cpu) {
    return cpu::device_count();
  }
  if (type == Device::gpu) {
    return gpu::device_count();
  }
  // Neither cpu nor gpu
  return 0;
}

// Properties of device `d`, as reported by the backend for its type. For a
// type that is neither cpu nor gpu a reference to an empty map is returned.
const std::unordered_map<std::string, std::variant<std::string, size_t>>&
device_info(const Device& d) {
  if (d.type == Device::cpu) {
    return cpu::device_info(d.index);
  }
  if (d.type == Device::gpu) {
    return gpu::device_info(d.index);
  }
  // Static so that the returned reference stays valid
  static std::unordered_map<std::string, std::variant<std::string, size_t>>
      no_info;
  return no_info;
}

} // namespace mlx::core


================================================
FILE: mlx/device.h
================================================
// Copyright © 2023-2025 Apple Inc.

#pragma once

#include "mlx/api.h"

#include <string>
#include <unordered_map>
#include <variant>

namespace mlx::core {

/** A device is identified by its type (cpu or gpu) and an index. */
struct MLX_API Device {
  enum class DeviceType {
    cpu,
    gpu,
  };

  // Shorthands so that callers can write Device::cpu / Device::gpu
  static constexpr DeviceType cpu = DeviceType::cpu;
  static constexpr DeviceType gpu = DeviceType::gpu;

  // Not explicit: a DeviceType converts implicitly to a Device with index 0
  Device(DeviceType type, int index = 0) : type(type), index(index) {}

  DeviceType type;
  int index;
};

/** Get the current default device. */
MLX_API const Device& default_device();

/** Set the default device. */
MLX_API void set_default_device(const Device& d);

/** Devices compare equal when both type and index are equal. */
MLX_API bool operator==(const Device& lhs, const Device& rhs);
MLX_API bool operator!=(const Device& lhs, const Device& rhs);

/** Check whether the given device is available. */
MLX_API bool is_available(const Device& d);

/** Get the number of available devices for the given device type. */
MLX_API int device_count(Device::DeviceType type);

/**
 * Get information about a device.
 *
 * Returns a map of device properties. Keys vary by backend:
 *   - device_name (string): Device name
 *   - architecture (string): Architecture identifier
 *   - total_memory/memory_size (size_t): Total device memory
 *   - free_memory (size_t): Available memory (CUDA only)
 *   - uuid (string): Device UUID (CUDA only)
 *   - pci_bus_id (string): PCI bus ID (CUDA only)
 *   - compute_capability_major/minor (size_t): Compute capability (CUDA only)
 */
MLX_API const
    std::unordered_map<std::string, std::variant<std::string, size_t>>&
    device_info(const Device& d = default_device());

} // namespace mlx::core


================================================
FILE: mlx/distributed/CMakeLists.txt
================================================
# Sources of the distributed layer that are always added to the mlx target.
target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/primitives.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp)

# utils.cpp is only compiled when the CPU backend is built and the target is
# not Windows.
if(MLX_BUILD_CPU AND NOT WIN32)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)
endif()

# One subdirectory per communication backend. Each one is always added and
# decides in its own CMakeLists.txt which sources to contribute.
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mpi)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ring)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/nccl)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/jaccl)


================================================
FILE: mlx/distributed/distributed.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <unordered_map>

#include "mlx/backend/cuda/cuda.h"
#include "mlx/distributed/distributed.h"
#include "mlx/distributed/distributed_impl.h"
#include "mlx/distributed/jaccl/jaccl.h"
#include "mlx/distributed/mpi/mpi.h"
#include "mlx/distributed/nccl/nccl.h"
#include "mlx/distributed/ring/ring.h"

namespace mlx::core::distributed {

namespace detail {

// Ask the group's implementation which stream communication should run on.
Stream communication_stream(Group group, StreamOrDevice s /* = {} */) {
  return group.raw_group()->communication_stream(s);
}

// Forward to the all_sum of the group's implementation.
void all_sum(Group group, const array& input, array& output, Stream stream) {
  group.raw_group()->all_sum(input, output, stream);
}

// Forward to the all_max of the group's implementation.
void all_max(Group group, const array& input, array& output, Stream stream) {
  group.raw_group()->all_max(input, output, stream);
}

// Forward to the all_min of the group's implementation.
void all_min(Group group, const array& input, array& output, Stream stream) {
  group.raw_group()->all_min(input, output, stream);
}

// Forward to the all_gather of the group's implementation.
void all_gather(Group group, const array& input, array& output, Stream stream) {
  group.raw_group()->all_gather(input, output, stream);
}

// Forward to the send of the group's implementation (`dst` is the receiving
// rank).
void send(Group group, const array& input, int dst, Stream stream) {
  group.raw_group()->send(input, dst, stream);
}

// Forward to the recv of the group's implementation (`src` is the sending
// rank).
void recv(Group group, array& out, int src, Stream stream) {
  group.raw_group()->recv(out, src, stream);
}

// Forward to the sum_scatter of the group's implementation.
void sum_scatter(
    Group group,
    const array& input,
    array& output,
    Stream stream) {
  group.raw_group()->sum_scatter(input, output, stream);
}

// Group used when no communication backend could be initialized. It reports
// a single process (rank 0, size 1), cannot be split, and every
// communication operation throws std::runtime_error.
class EmptyGroup : public GroupImpl {
 public:
  Stream communication_stream(StreamOrDevice s) override {
    return to_stream(s);
  }

  int rank() override {
    return 0;
  }

  int size() override {
    return 1;
  }

  std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
    throw std::runtime_error("Cannot split the distributed group further.");
  }

  void all_sum(const array&, array&, Stream) override {
    no_communication();
  }

  void all_gather(const array&, array&, Stream) override {
    no_communication();
  }

  void send(const array&, int, Stream) override {
    no_communication();
  }

  void recv(array&, int, Stream) override {
    no_communication();
  }

  void all_max(const array&, array&, Stream) override {
    no_communication();
  }

  void all_min(const array&, array&, Stream) override {
    no_communication();
  }

  void sum_scatter(const array&, array&, Stream) override {
    no_communication();
  }

 private:
  // Shared failure path of all the communication operations
  [[noreturn]] static void no_communication() {
    throw std::runtime_error(
        "Communication not implemented in an empty distributed group.");
  }
};

} // namespace detail

// True when at least one communication backend is available. Backends are
// probed in the order mpi, ring, nccl, jaccl and probing stops at the first
// one that is available.
bool is_available() {
  if (mpi::is_available()) {
    return true;
  }
  if (ring::is_available()) {
    return true;
  }
  if (nccl::is_available()) {
    return true;
  }
  return jaccl::is_available();
}

// Availability of the backend named `bk`. "any" asks whether at least one
// backend is available; an unknown name yields false.
bool is_available(const std::string& bk) {
  return bk == "any" ? is_available()
      : bk == "mpi"  ? mpi::is_available()
      : bk == "ring" ? ring::is_available()
      : bk == "nccl" ? nccl::is_available()
      : bk == "jaccl" ? jaccl::is_available()
                      : false;
}

// Rank reported by the underlying group implementation.
int Group::rank() const {
  return group_->rank();
}

// Size reported by the underlying group implementation.
int Group::size() const {
  return group_->size();
}

// Split through the underlying implementation and wrap the result in a Group.
Group Group::split(int color, int key /* = -1 */) const {
  return Group(group_->split(color, key));
}

// Initialize the backend named `bk` and return its group.
//
// Groups are cached per backend name in a function-local static map, so a
// name that was initialized before returns the cached group without
// contacting the backend again.
//
// - A specific backend ("mpi", "ring", "nccl", "jaccl") is initialized with
//   the given `strict` flag.
// - "any" tries nccl (only when CUDA is available), then ring, mpi and jaccl,
//   each non-strictly, and keeps the first one that yields a group. If none
//   does and `strict` is set, std::runtime_error is thrown.
// - Any other name throws std::invalid_argument.
//
// When no group could be created an EmptyGroup is returned instead.
Group init(bool strict /* = false */, const std::string& bk /* = "any" */) {
  static std::unordered_map<std::string, std::shared_ptr<detail::GroupImpl>>
      backends;

  // Already initialized so return the group.
  if (auto g = backends.find(bk); g != backends.end()) {
    return Group(g->second);
  }

  // Create the requested communication group
  std::shared_ptr<detail::GroupImpl> group{nullptr};
  // Name the group gets cached under; "any" is narrowed down to the backend
  // that was tried last
  std::string bk_ = bk;
  if (bk == "mpi") {
    group = mpi::init(strict);
  } else if (bk == "ring") {
    group = ring::init(strict);
  } else if (bk == "nccl") {
    group = nccl::init(strict);
  } else if (bk == "jaccl") {
    group = jaccl::init(strict);
  } else if (bk == "any") {
    if (mlx::core::cu::is_available()) {
      group = nccl::init(false);
      bk_ = "nccl";
    }
    if (group == nullptr) {
      group = ring::init(false);
      bk_ = "ring";
    }
    if (group == nullptr) {
      group = mpi::init(false);
      bk_ = "mpi";
    }
    if (group == nullptr) {
      group = jaccl::init(false);
      bk_ = "jaccl";
    }
    if (group == nullptr && strict) {
      throw std::runtime_error("[distributed] Couldn't initialize any backend");
    }
  } else {
    // Plain string concatenation: this file does not include <sstream>, so
    // the message is built without std::ostringstream
    throw std::invalid_argument(
        "[distributed] The only valid values for backend are 'any', 'mpi', 'nccl', "
        "'jaccl' and 'ring' but '" +
        bk + "' was provided.");
  }

  if (group == nullptr) {
    group = std::make_shared<detail::EmptyGroup>();
  } else {
    // The first backend that initializes successfully also serves "any"
    // (insert does not overwrite an existing entry)
    backends.insert({"any", group});
  }
  // NOTE(review): when "any" was requested and every backend failed, bk_ is
  // "jaccl" at this point, so the EmptyGroup is cached under "jaccl" and not
  // under "any". A later init(true, "jaccl") then returns the cached empty
  // group instead of throwing. Confirm whether this is intended.
  backends.insert({std::move(bk_), group});
  return Group(group);
}

} // namespace mlx::core::distributed


================================================
FILE: mlx/distributed/distributed.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <memory>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/utils.h"

namespace mlx::core::distributed {

// Forward declaration of the base group implementation.
namespace detail {
class GroupImpl;
} // namespace detail

/* Check if a communication backend is available */
MLX_API bool is_available();
/* Check if the backend named `bk` is available ("any" matches any backend) */
MLX_API bool is_available(const std::string& bk);

/**
 * A distributed::Group represents a group of independent mlx processes that
 * can communicate. We must also be able to create sub-groups from a group in
 * order to define more granular communication.
 */
struct MLX_API Group {
  Group(std::shared_ptr<detail::GroupImpl> group) : group_(std::move(group)) {}

  /** Rank of this process in the group. */
  int rank() const;
  /** Number of processes in the group. */
  int size() const;

  /**
   * Split the group according to the provided color. Namely processes that use
   * the same color will go to the same group.
   *
   * The key defines the rank of the processes in the new group. The smaller
   * the key the smaller the rank. If the provided key is negative, then the
   * rank in the current group is used.
   */
  Group split(int color, int key = -1) const;

  /** Access the underlying implementation. */
  const std::shared_ptr<detail::GroupImpl>& raw_group() const {
    return group_;
  }

 private:
  std::shared_ptr<detail::GroupImpl> group_{nullptr};
};

/**
 * Initialize the distributed backend and return the group containing all
 * discoverable processes.
 *
 * If strict is true then throw an error if we couldn't initialize the
 * distributed subsystem. Otherwise simply return a singleton group which will
 * render communication operations as no-op.
 *
 * `bk` selects the backend by name; the default "any" lets the library pick
 * one.
 */
MLX_API Group init(bool strict = false, const std::string& bk = "any");

} // namespace mlx::core::distributed


================================================
FILE: mlx/distributed/distributed_impl.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/distributed/distributed.h"

namespace mlx::core::distributed::detail {

/**
 * Abstract base class of a distributed group implementation.
 *
 * Every member function is pure virtual, so a backend has to provide all of
 * them.
 */
class GroupImpl {
 public:
  virtual ~GroupImpl() {}

  // Choose the stream this communication group can operate on
  virtual Stream communication_stream(StreamOrDevice s = {}) = 0;

  // Group operations
  virtual int rank() = 0;
  virtual int size() = 0;
  virtual std::shared_ptr<GroupImpl> split(int color, int key = -1) = 0;

  // Actual communication operations
  virtual void all_sum(const array& input, array& output, Stream stream) = 0;
  virtual void all_gather(const array& input, array& output, Stream stream) = 0;
  virtual void send(const array& input, int dst, Stream stream) = 0;
  virtual void recv(array& out, int src, Stream stream) = 0;
  virtual void all_max(const array& input, array& output, Stream stream) = 0;
  virtual void all_min(const array& input, array& output, Stream stream) = 0;
  virtual void
  sum_scatter(const array& input, array& output, Stream stream) = 0;
};

// The free functions below take a Group and a stream and are declared for
// each operation of GroupImpl.

/* Define the MLX stream that the communication should happen in. */
Stream communication_stream(Group group, StreamOrDevice s = {});

/* Perform an all reduce sum operation */
void all_sum(Group group, const array& input, array& output, Stream stream);

/* Perform an all gather operation */
void all_gather(Group group, const array& input, array& output, Stream stream);

/** Send an array to the dst rank */
void send(Group group, const array& input, int dst, Stream stream);

/** Recv an array from the src rank */
void recv(Group group, array& out, int src, Stream stream);

/** Max reduction */
void all_max(Group group, const array& input, array& output, Stream stream);

/** Min reduction */
void all_min(Group group, const array& input, array& output, Stream stream);

/**
 * Reduce scatter.
 * NOTE(review): the previous comment said "with average operation" while the
 * function is named sum_scatter; presumably the reduction is a sum. Confirm
 * against the backends.
 */
void sum_scatter(Group group, const array& input, array& output, Stream stream);

} // namespace mlx::core::distributed::detail


================================================
FILE: mlx/distributed/jaccl/CMakeLists.txt
================================================
# Select the JACCL sources. The real backend is only compiled when the CPU
# backend is enabled, the target system is Darwin and the macOS SDK version is
# at least 26.2; otherwise the stub in no_jaccl.cpp is compiled instead.
if(MLX_BUILD_CPU
   AND ${CMAKE_SYSTEM_NAME} MATCHES "Darwin"
   AND MACOS_SDK_VERSION GREATER_EQUAL 26.2)
  target_sources(
    mlx
    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/jaccl.cpp
            ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
            ${CMAKE_CURRENT_SOURCE_DIR}/mesh.cpp
            ${CMAKE_CURRENT_SOURCE_DIR}/ring.cpp)
else()
  # Stub implementation reporting the backend as unavailable.
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_jaccl.cpp)
endif()


================================================
FILE: mlx/distributed/jaccl/jaccl.cpp
================================================
// Copyright © 2025 Apple Inc.

#include <fstream>
#include <sstream>

#include <json.hpp>

#include "mlx/distributed/distributed_impl.h"
#include "mlx/distributed/jaccl/mesh.h"
#include "mlx/distributed/jaccl/ring.h"
#include "mlx/distributed/jaccl/utils.h"

using GroupImpl = mlx::core::distributed::detail::GroupImpl;
using json = nlohmann::json;

namespace {

struct DeviceFile {
  DeviceFile(const char* dev_file) {
    std::ifstream f(dev_file);
    json devices = json::parse(f);
    if (!devices.is_array()) {
      throw std::runtime_error(
          "[jaccl] The device file should start with an array");
    }

    devices_.resize(devices.size());
    for (int rank = 0; rank < devices.size(); rank++) {
      auto conn = devices[rank];
      if (!conn.is_array()) {
        throw std::runtime_error(
            "[jaccl] The device file should have an array of arrays");
      }
      if (conn.size() != devices_.size()) {
        std::ostringstream msg;
        msg << "[jaccl] The device file should contain the connectivity of each rank to "
            << "all other ranks but rank " << rank << " contains only "
            << conn.size() << " entries.";
        throw std::runtime_error(msg.str());
      }

      devices_[rank].resize(conn.size());
      for (int dst = 0; dst < conn.size(); dst++) {
        auto names = conn[dst];
        if (names.is_string()) {
          devices_[rank][dst].push_back(names);
        } else if (names.is_array()) {
          for (auto name_it = names.begin(); name_it != names.end();
               name_it++) {
            devices_[rank][dst].push_back(*name_it);
          }
        } else if (!names.is_null()) {
          throw std::runtime_error(
              "[jaccl] Device names should be null, a string or array of strings.");
        }
      }
    }
  }

  int size() {
    return devices_.size();
  }

  bool is_valid_mesh() {
    for (int src = 0; src < size(); src++) {
      for (int dst = 0; dst < size(); dst++) {
        if (devices_[src][dst].size() != static_cast<size_t>(src != dst)) {
          return false;
        }
      }
    }

    return true;
  }

  bool is_valid_ring() {
    int num_connections = devices_[0][1].size();
    if (num_connections == 0) {
      return false;
    }

    for (int src = 0; src < size(); src++) {
      int left = (src + size() - 1) % size();
      int right = (src + 1) % size();
      for (int dst = 0; dst < size(); dst++) {
        if (dst != left && dst != right) {
          if (devices_[src][dst].size() != 0) {
            return false;
          }
        } else {
          if (devices_[src][dst].size() != num_connections) {
            return false;
          }
        }
      }
    }

    return true;
  }

  std::vector<std::string> extract_mesh_connectivity(int rank) {
    std::vector<std::string> devices(size());
    for (int dst = 0; dst < size(); dst++) {
      if (dst != rank) {
        devices[dst] = devices_[rank][dst][0];
      }
    }
    return devices;
  }

  std::pair<std::vector<std::string>, std::vector<std::string>>
  extract_ring_connectivity(int rank) {
    int left = (rank + size() - 1) % size();
    int right = (rank + 1) % size();

    return std::make_pair(devices_[rank][left], devices_[rank][right]);
  }

  std::vector<std::vector<std::vector<std::string>>> devices_;
};

} // namespace

namespace mlx::core::distributed::jaccl {

// Report whether the JACCL backend can be used. This simply forwards the
// answer of the object returned by ibv().
bool is_available() {
  return ibv().is_available();
}

/**
 * Initialize the JACCL group from the environment.
 *
 * Reads MLX_RANK, MLX_IBV_DEVICES (path of the JSON device file) and
 * MLX_JACCL_COORDINATOR. If the backend is not available or any of these is
 * missing then nullptr is returned, or std::runtime_error is thrown when
 * `strict` is true.
 *
 * When MLX_JACCL_RING is set (to any value) a ring is preferred if the device
 * file describes a valid one. Otherwise a mesh is used if valid, falling back
 * to a ring. Throws std::runtime_error if the rank is not a valid integer in
 * range or if the device file describes neither a valid mesh nor a valid ring.
 */
std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  const char* dev_file = std::getenv("MLX_IBV_DEVICES");
  const char* coordinator = std::getenv("MLX_JACCL_COORDINATOR");
  const char* rank_str = std::getenv("MLX_RANK");
  const bool prefer_ring = std::getenv("MLX_JACCL_RING") != nullptr;

  if (!is_available() || !dev_file || !coordinator || !rank_str) {
    if (strict) {
      std::ostringstream msg;
      msg << "[jaccl] You need to provide via environment variables a rank (MLX_RANK), "
          << "a device file (MLX_IBV_DEVICES) and a coordinator ip/port (MLX_JACCL_COORDINATOR) "
          << "but provided MLX_RANK=\"" << ((rank_str) ? rank_str : "")
          << "\", MLX_IBV_DEVICES=\"" << ((dev_file) ? dev_file : "")
          << "\" and MLX_JACCL_COORDINATOR=\""
          << ((coordinator) ? coordinator : "") << "\"";
      throw std::runtime_error(msg.str());
    }
    return nullptr;
  }

  // Parse the rank with strtol so that a non-numeric value is reported
  // instead of being silently treated as rank 0.
  char* rank_end = nullptr;
  const long parsed_rank = std::strtol(rank_str, &rank_end, 10);
  if (rank_end == rank_str) {
    std::ostringstream msg;
    msg << "[jaccl] MLX_RANK should be an integer but \"" << rank_str
        << "\" was provided.";
    throw std::runtime_error(msg.str());
  }

  DeviceFile devices(dev_file);

  if (parsed_rank < 0 || parsed_rank >= devices.size()) {
    std::ostringstream msg;
    msg << "[jaccl] Invalid rank " << parsed_rank
        << ". It should be in the range [0, " << devices.size() << ").";
    throw std::runtime_error(msg.str());
  }
  const int rank = static_cast<int>(parsed_rank);

  auto make_ring = [&]() -> std::shared_ptr<GroupImpl> {
    auto [left, right] = devices.extract_ring_connectivity(rank);
    return std::make_shared<RingGroup>(
        rank, devices.size(), left, right, coordinator);
  };

  // The validity checks are evaluated lazily and in this order on purpose:
  // preferred ring, then mesh, then ring as a fallback.
  if (prefer_ring && devices.is_valid_ring()) {
    return make_ring();
  }
  if (devices.is_valid_mesh()) {
    auto device_names = devices.extract_mesh_connectivity(rank);
    return std::make_shared<MeshGroup>(rank, device_names, coordinator);
  }
  if (devices.is_valid_ring()) {
    return make_ring();
  }

  throw std::runtime_error(
      "[jaccl] The device file should define a valid mesh or a valid ring.");
}

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/jaccl.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/distributed/distributed.h"

namespace mlx::core::distributed::jaccl {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

/** Returns true if the JACCL backend can be used. */
bool is_available();

/**
 * Initialize the JACCL backend and return its group implementation.
 *
 * Returns nullptr if the backend cannot be initialized, unless `strict` is
 * true in which case an exception is thrown instead.
 */
std::shared_ptr<GroupImpl> init(bool strict = false);

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/mesh.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/distributed/jaccl/mesh.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/distributed/reduction_ops.h"
#include "mlx/dtype_utils.h"

namespace mlx::core::distributed::jaccl {

// Build a mesh group for `rank` given one device name per peer. The group
// size is the number of entries in `device_names` and `coordinator_addr` is
// handed to the side channel used to exchange connection information.
MeshGroup::MeshGroup(
    int rank,
    const std::vector<std::string>& device_names,
    const char* coordinator_addr)
    : rank_(rank),
      size_(device_names.size()),
      side_channel_(rank_, size_, coordinator_addr),
      connections_(create_connections(device_names)) {
  if (size_ > MESH_MAX_PEERS) {
    std::ostringstream err;
    err << "[jaccl] The JACCL mesh supports up to " << MESH_MAX_PEERS
        << " peers but " << size_ << " were provided.";
    throw std::runtime_error(err.str());
  }

  // Bring up every connection and allocate the communication buffers.
  initialize();

  // Barrier: wait until all nodes have finished their initialization.
  side_channel_.all_gather<int>(0);

  // The mesh implementation talks to every peer directly.
  mesh_ = MeshImpl(rank_, size_, connections_, buffers_);

  // The ring implementation reuses the connections to the two neighbors.
  const int left = (rank_ + size_ - 1) % size_;
  const int right = (rank_ + 1) % size_;
  ring_ = RingImpl(
      rank_,
      size_,
      &connections_[left],
      &connections_[right],
      1,
      ring_send_buffers_,
      ring_recv_buffers_);
}

void MeshGroup::initialize() {
  // Create the queue pairs
  for (auto& conn : connections_) {
    if (conn.ctx == nullptr) {
      continue;
    }
    conn.allocate_protection_domain();
    conn.create_completion_queue(MAX_SEND_WR + MAX_RECV_WR);
    conn.create_queue_pair();
  }

  allocate_buffers();

  // First init all connections
  for (int peer = 0; peer < size_; peer++) {
    if (peer == rank_) {
      continue;
    }
    connections_[peer].queue_pair_init();
  }

  // Gather the information to be exchanged, this also serves as a barrier so
  // that all peers have initialized their connections before attempting to
  // transition to RTS.
  std::vector<Destination> info;
  for (auto& conn : connections_) {
    info.emplace_back(conn.info());
  }
  auto all_infos = side_channel_.all_gather(info);

  // Transition queue pairs to RTS
  for (int peer = 0; peer < size_; peer++) {
    if (peer == rank_) {
      continue;
    }
    auto peer_info = all_infos[peer][rank_];
    connections_[peer].queue_pair_rtr(peer_info);
    connections_[peer].queue_pair_rts();
  }
}

// (Re)allocate the mesh and ring buffers and register them with the
// protection domains of the connections they will be used with.
void MeshGroup::allocate_buffers() {
  // Dropping the old buffers deregisters them and frees their memory.
  buffers_.clear();
  ring_send_buffers_.clear();
  ring_recv_buffers_.clear();

  // Allocation pass: for every buffer size and pipeline slot create one mesh
  // buffer per rank and one ring send/recv buffer per direction.
  for (int k = 0; k < BUFFER_SIZES; k++) {
    for (int i = 0; i < NUM_BUFFERS; i++) {
      for (int j = 0; j < size_; j++) {
        buffers_.emplace_back(FRAME_SIZE * (1 << k));
      }
      for (int dir = 0; dir < 2; dir++) {
        ring_send_buffers_.emplace_back(FRAME_SIZE * (1 << k));
        ring_recv_buffers_.emplace_back(FRAME_SIZE * (1 << k));
      }
    }
  }

  // Ring neighbors (see the ring group for the same logic).
  const int left = (rank_ + size_ - 1) % size_;
  const int right = (rank_ + 1) % size_;

  // Registration pass.
  for (int k = 0; k < BUFFER_SIZES; k++) {
    for (int i = 0; i < NUM_BUFFERS; i++) {
      const int mesh_base = (k * NUM_BUFFERS + i) * size_;
      for (int j = 0; j < size_; j++) {
        auto& buffer = buffers_[mesh_base + j];
        if (j != rank_) {
          // Receive buffer for rank j: register it with the protection
          // domain of the connection to rank j.
          buffer.register_to_protection_domain(
              connections_[j].protection_domain);
          continue;
        }

        // Our own slot is the send buffer, so register it with every
        // connected device to be able to send it to all of them.
        for (auto& conn : connections_) {
          if (conn.ctx != nullptr) {
            buffer.register_to_protection_domain(conn.protection_domain);
          }
        }
      }

      // Ring buffers: slot 0 sends right and receives from the left, slot 1
      // sends left and receives from the right.
      const int ring_base = (k * NUM_BUFFERS + i) * 2;
      ring_send_buffers_[ring_base + 0].register_to_protection_domain(
          connections_[right].protection_domain);
      ring_recv_buffers_[ring_base + 0].register_to_protection_domain(
          connections_[left].protection_domain);
      ring_send_buffers_[ring_base + 1].register_to_protection_domain(
          connections_[left].protection_domain);
      ring_recv_buffers_[ring_base + 1].register_to_protection_domain(
          connections_[right].protection_domain);
    }
  }
}

void MeshGroup::all_sum(const array& input, array& output, Stream stream) {
  dispatch_all_types(output.dtype(), [&](auto type_tag) {
    using T = MLX_GET_TYPE(type_tag);
    all_reduce<T>(input, output, stream, detail::SumOp<T>{});
  });
}

void MeshGroup::all_max(const array& input, array& output, Stream stream) {
  dispatch_all_types(output.dtype(), [&](auto type_tag) {
    using T = MLX_GET_TYPE(type_tag);
    all_reduce<T>(input, output, stream, detail::MaxOp<T>{});
  });
}

void MeshGroup::all_min(const array& input, array& output, Stream stream) {
  dispatch_all_types(output.dtype(), [&](auto type_tag) {
    using T = MLX_GET_TYPE(type_tag);
    all_reduce<T>(input, output, stream, detail::MinOp<T>{});
  });
}

void MeshGroup::all_gather(const array& input, array& output, Stream stream) {
  auto in_ptr = input.data<char>();
  auto out_ptr = output.data<char>();
  size_t n_bytes = input.nbytes();
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(input);
  encoder.set_output_array(output);
  encoder.dispatch([in_ptr, out_ptr, n_bytes, this]() {
    mesh_.all_gather(in_ptr, out_ptr, n_bytes);
  });
}

void MeshGroup::send(const array& input, int dst, Stream stream) {
  auto data = input.data<char>();
  int64_t n_bytes = input.nbytes();
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(input);
  encoder.dispatch(
      [data, n_bytes, dst, this]() { mesh_.send(data, n_bytes, dst); });
}

// Schedule receiving `out.nbytes()` bytes from rank `src` into `out` on the
// CPU command encoder of `stream`.
void MeshGroup::recv(array& out, int src, Stream stream) {
  const auto target = out.data<char>();
  const int64_t num_bytes = out.nbytes();

  auto& enc = cpu::get_command_encoder(stream);
  enc.set_output_array(out);
  enc.dispatch([this, target, num_bytes, src]() {
    mesh_.recv(target, num_bytes, src);
  });
}

template <typename T, typename ReduceOp>
void MeshGroup::all_reduce(
    const array& input,
    array& output,
    Stream stream,
    ReduceOp reduce_op) {
  auto in_ptr = input.data<T>();
  auto out_ptr = output.data<T>();
  int64_t size = input.size();
  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(input);
  encoder.set_output_array(output);
  encoder.dispatch([in_ptr, out_ptr, size, this, reduce_op]() {
    if (size_ > 2 &&
        ((std::is_same_v<T, bfloat16_t> && size > 65536) ||
         size >= 8 * 1024 * 1024 / sizeof(T))) {
      ring_.all_reduce<2>(in_ptr, out_ptr, size, 1, reduce_op);
    } else {
      mesh_.all_reduce(in_ptr, out_ptr, size, reduce_op);
    }
  });
}

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/mesh.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

#include "mlx/distributed/distributed_impl.h"
#include "mlx/distributed/jaccl/mesh_impl.h"
#include "mlx/distributed/jaccl/ring_impl.h"
#include "mlx/distributed/jaccl/utils.h"

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

namespace mlx::core::distributed::jaccl {

/**
 * The JACCL communication group for a fully connected mesh. We expect one
 * connection per peer and it should be the lowest latency communication group
 * for small to medium size messages.
 *
 * Like all JACCL groups it uses a side channel to exchange the necessary
 * information and then configure the connections to be ready for RDMA
 * operations.
 */
class MeshGroup : public GroupImpl {
 public:
  /**
   * Create the group for `rank`. `device_names` holds one entry per rank in
   * the group (its length defines the group size) and `coordinator_addr` is
   * passed to the side channel.
   */
  MeshGroup(
      int rank,
      const std::vector<std::string>& device_names,
      const char* coordinator_addr);

  // Resolve the requested stream or device via to_stream, passing the CPU as
  // the device argument.
  Stream communication_stream(StreamOrDevice s) override {
    return to_stream(s, Device::cpu);
  }

  // Rank of this process in the group.
  int rank() override {
    return rank_;
  }

  // Number of ranks in the group.
  int size() override {
    return size_;
  }

  // Communication operations, all scheduled on the provided stream.
  void all_sum(const array& input, array& output, Stream stream) override;
  void all_max(const array& input, array& output, Stream stream) override;
  void all_min(const array& input, array& output, Stream stream) override;
  void all_gather(const array& input, array& output, Stream stream) override;
  void send(const array& input, int dst, Stream stream) override;
  void recv(array& out, int src, Stream stream) override;

  // Not implemented for this group, always throws std::runtime_error.
  void sum_scatter(const array& input, array& output, Stream stream) override {
    throw std::runtime_error("[jaccl] sum_scatter not supported.");
  }

  // Not implemented for this group, always throws std::runtime_error.
  std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
    throw std::runtime_error("[jaccl] Group split not supported.");
  }

 private:
  /**
   * Shared implementation of all_sum, all_max and all_min. T is the element
   * type and reduce_op the reduction functor to apply.
   */
  template <typename T, typename ReduceOp>
  void all_reduce(
      const array& input,
      array& output,
      Stream stream,
      ReduceOp reduce_op);

  /**
   * Performs the connection initialization. Namely, after this call all
   * Connection objects should have a queue pair in RTS state and all buffers
   * should have been allocated.
   */
  void initialize();

  /**
   * Allocate all the buffers that we will use in the communication group.
   */
  void allocate_buffers();

  // NOTE: the constructor's initializer list relies on the declaration order
  // below (rank_ and size_ are used to build side_channel_), do not reorder.
  int rank_;
  int size_;
  SideChannel side_channel_;
  std::vector<Connection> connections_;
  // Buffers used by the mesh implementation.
  std::vector<SharedBuffer> buffers_;
  // Buffers used by the ring implementation, one send and one recv buffer per
  // direction.
  std::vector<SharedBuffer> ring_send_buffers_;
  std::vector<SharedBuffer> ring_recv_buffers_;

  // Implementation objects. They are given the connections and buffers above
  // in the constructor.
  MeshImpl mesh_;
  RingImpl ring_;
};

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/mesh_impl.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

#include <span>

#include "mlx/distributed/jaccl/utils.h"

constexpr int MESH_MAX_PEERS = 8;

namespace mlx::core::distributed::jaccl {

/**
 * Communication primitives over a fully connected mesh.
 *
 * The object does not own the connections or the buffers, it only keeps views
 * to them. Messages are split in chunks that fit in the selected buffer size
 * and are pipelined PIPELINE chunks at a time. Work requests are tagged with
 * an id that encodes the work type (bits 16 and above), the pipeline buffer
 * (bits 8-15) and the peer rank (bits 0-7).
 *
 * All offsets are kept in int64_t so that messages of 2GiB or more do not
 * overflow.
 */
class MeshImpl {
 public:
  MeshImpl(
      int rank,
      int size,
      std::vector<Connection>& conns,
      std::vector<SharedBuffer>& buffers)
      : rank_(rank), size_(size), connections_(conns), buffers_(buffers) {}

  // Placeholder single rank mesh with no connections or buffers.
  MeshImpl() : rank_(0), size_(1) {}

  /**
   * All reduce `size` elements from in_ptr to out_ptr using reduce_op. The
   * operation may be in-place (in_ptr == out_ptr).
   */
  template <typename T, typename ReduceOp>
  void
  all_reduce(const T* in_ptr, T* out_ptr, int64_t size, ReduceOp reduce_op) {
    // If not inplace all reduce then copy the input to the output first
    if (in_ptr != out_ptr) {
      std::memcpy(out_ptr, in_ptr, size * sizeof(T));
    }

    // Fully connected all reduce
    T* data = out_ptr;
    auto [sz, buffer_size] = buffer_size_from_message(size * sizeof(T));
    int64_t N = buffer_size / sizeof(T);
    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE * MESH_MAX_PEERS * 2;
    int64_t total = static_cast<int64_t>(size);
    int num_peers = size_ - 1;

    // Counters to maintain the state of transfers
    int in_flight = 0;
    int64_t read_offset = 0;
    int completed_send_count[PIPELINE] = {0};
    int completed_recv_begin[MESH_MAX_PEERS] = {0};
    int completed_recv_end[MESH_MAX_PEERS] = {0};

    // Prefill the pipeline
    int buff = 0;
    while (read_offset < total && buff < PIPELINE) {
      post_recv_all(sz, buff);
      std::copy(
          data + read_offset,
          data + std::min(read_offset + N, total),
          send_buffer(sz, buff).begin<T>());
      post_send_all(sz, buff);

      buff++;
      in_flight += 2 * num_peers;
      read_offset += N;
    }

    // Main loop
    //
    // Keep going until we have no longer data in flight.
    while (in_flight > 0) {
      // Poll the hardware for completions.
      //
      // If a send was completed mark how many completions we have received
      // for that buffer. If we have sent the buffer to all peers we can
      // reuse the buffer so copy the next chunk of data and send it to all.
      //
      // If a receive is completed then advance the pointer of completed
      // receives.
      ibv_wc wc[WC_NUM];
      int n = poll(connections_, WC_NUM, wc);
      for (int i = 0; i < n; i++) {
        int work_type = wc[i].wr_id >> 16;
        int buff = (wc[i].wr_id >> 8) & 0xff;
        int rank = wc[i].wr_id & 0xff;

        in_flight--;

        if (work_type == SEND_WR && read_offset < total) {
          completed_send_count[buff]++;
          if (completed_send_count[buff] == num_peers) {
            std::copy(
                data + read_offset,
                data + std::min(read_offset + N, total),
                send_buffer(sz, buff).begin<T>());
            post_send_all(sz, buff);

            completed_send_count[buff] = 0;
            in_flight += num_peers;
            read_offset += N;
          }
        }

        else if (work_type == RECV_WR) {
          completed_recv_end[rank]++;
        }
      }

      // Process the completed recv
      //
      // For each rank we have a range of completed recv defined by a begin
      // and end inclusive and exclusive in standard C++ fashion.
      //
      // When there is an unprocessed receive we first check if we have
      // finished sending the write location. If so then we reduce in-place
      // and then check if there is more to be received and post a recv.
      for (int r = 0; r < size_; r++) {
        int s = completed_recv_begin[r];
        int e = completed_recv_end[r];
        // Element offset of chunk s, computed in 64 bits to avoid truncation
        // for large messages.
        int64_t w = static_cast<int64_t>(s) * N;
        while (w < read_offset && e - s > 0) {
          int buff = s % PIPELINE;
          reduce_op(
              recv_buffer(sz, buff, r).begin<T>(),
              data + w,
              std::min(N, total - w));
          w += N;
          s++;
          if (w + (PIPELINE - 1) * N < total) {
            recv_from(sz, r, buff);
            in_flight++;
          }
        }
        completed_recv_begin[r] = s;
      }
    }
  }

  /**
   * Gather n_bytes from every rank. The data of rank r ends up in
   * out_ptr + r * n_bytes.
   */
  void all_gather(const char* in_ptr, char* out_ptr, int64_t n_bytes) {
    // Copy our data to the appropriate place
    std::memcpy(out_ptr + rank_ * n_bytes, in_ptr, n_bytes);

    // Fully connected all gather
    char* data = out_ptr;
    char* our_data = out_ptr + rank_ * n_bytes;
    auto [sz, N] = buffer_size_from_message(n_bytes);
    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE * MESH_MAX_PEERS * 2;
    int64_t total = static_cast<int64_t>(n_bytes);
    int num_peers = size_ - 1;

    // Counters to maintain the state of transfers. The offsets are byte
    // offsets so they need to be 64 bits to support large messages.
    int in_flight = 0;
    int64_t read_offset = 0;
    int completed_send_count[PIPELINE] = {0};
    int64_t write_offset[MESH_MAX_PEERS] = {0};

    // Prefill the pipeline
    int buff = 0;
    while (read_offset < total && buff < PIPELINE) {
      post_recv_all(sz, buff);
      std::copy(
          our_data + read_offset,
          our_data + std::min(read_offset + N, total),
          send_buffer(sz, buff).begin<char>());
      post_send_all(sz, buff);

      buff++;
      in_flight += 2 * num_peers;
      read_offset += N;
    }

    // Main loop
    //
    // Keep going until we have no longer data in flight.
    while (in_flight > 0) {
      ibv_wc wc[WC_NUM];
      int n = poll(connections_, WC_NUM, wc);
      for (int i = 0; i < n; i++) {
        int work_type = wc[i].wr_id >> 16;
        int buff = (wc[i].wr_id >> 8) & 0xff;
        int rank = wc[i].wr_id & 0xff;

        in_flight--;

        // Send completed. If all sends completed then send the next chunk.
        if (work_type == SEND_WR && read_offset < total) {
          completed_send_count[buff]++;
          if (completed_send_count[buff] == num_peers) {
            std::copy(
                our_data + read_offset,
                our_data + std::min(read_offset + N, total),
                send_buffer(sz, buff).begin<char>());
            post_send_all(sz, buff);

            completed_send_count[buff] = 0;
            in_flight += num_peers;
            read_offset += N;
          }
        }

        // Recv completed. If we have more chunks then post another recv.
        else if (work_type == RECV_WR) {
          std::copy(
              recv_buffer(sz, buff, rank).begin<char>(),
              recv_buffer(sz, buff, rank).begin<char>() +
                  std::min(N, total - write_offset[rank]),
              data + rank * n_bytes + write_offset[rank]);
          write_offset[rank] += N;
          if (write_offset[rank] + N * (PIPELINE - 1) < total) {
            recv_from(sz, rank, buff);
            in_flight++;
          }
        }
      }
    }
  }

  /** Send n_bytes starting at in_ptr to rank dst. */
  void send(const char* in_ptr, int64_t n_bytes, int dst) {
    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE;
    auto [sz, N] = buffer_size_from_message(n_bytes);

    int in_flight = 0;
    int64_t read_offset = 0;

    // Prefill the pipeline
    int buff = 0;
    while (read_offset < n_bytes && buff < PIPELINE) {
      std::copy(
          in_ptr + read_offset,
          in_ptr + std::min(read_offset + N, n_bytes),
          send_buffer(sz, buff).begin<char>());
      send_to(sz, dst, buff);

      buff++;
      read_offset += N;
      in_flight++;
    }

    // Main loop
    while (in_flight > 0) {
      // Poll the hardware for completions.
      //
      // If a send was completed and we have more data to send then go ahead
      // and send them.
      ibv_wc wc[WC_NUM];
      int n = connections_[dst].poll(WC_NUM, wc);
      for (int i = 0; i < n; i++) {
        int buff = (wc[i].wr_id >> 8) & 0xff;

        in_flight--;

        if (read_offset < n_bytes) {
          std::copy(
              in_ptr + read_offset,
              in_ptr + std::min(read_offset + N, n_bytes),
              send_buffer(sz, buff).begin<char>());
          send_to(sz, dst, buff);

          read_offset += N;
          in_flight++;
        }
      }
    }
  }

  /** Receive n_bytes from rank src into out_ptr. */
  void recv(char* out_ptr, int64_t n_bytes, int src) {
    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE;
    auto [sz, N] = buffer_size_from_message(n_bytes);

    int in_flight = 0;
    int64_t write_offset = 0;

    // Prefill the pipeline
    int buff = 0;
    while (N * buff < n_bytes && buff < PIPELINE) {
      recv_from(sz, src, buff);

      in_flight++;
      buff++;
    }

    // Main loop
    while (in_flight > 0) {
      // Poll the hardware for completions.
      //
      // If a recv was completed copy it to the output and if we have more
      // data to fetch post another recv.
      ibv_wc wc[WC_NUM];
      int n = connections_[src].poll(WC_NUM, wc);
      for (int i = 0; i < n; i++) {
        int buff = (wc[i].wr_id >> 8) & 0xff;

        in_flight--;

        std::copy(
            recv_buffer(sz, buff, src).begin<char>(),
            recv_buffer(sz, buff, src).begin<char>() +
                std::min(n_bytes - write_offset, static_cast<int64_t>(N)),
            out_ptr + write_offset);
        write_offset += N;

        if (write_offset + (PIPELINE - 1) * N < n_bytes) {
          recv_from(sz, src, buff);

          in_flight++;
        }
      }
    }
  }

 private:
  // Post a send of our send buffer `buff` of size class `sz` to `rank`.
  void send_to(int sz, int rank, int buff) {
    connections_[rank].post_send(
        send_buffer(sz, buff), SEND_WR << 16 | buff << 8 | rank);
  }

  // Post a recv from `rank` into its recv buffer `buff` of size class `sz`.
  void recv_from(int sz, int rank, int buff) {
    connections_[rank].post_recv(
        recv_buffer(sz, buff, rank), RECV_WR << 16 | buff << 8 | rank);
  }

  // Buffers are laid out as [size class][pipeline buffer][rank] and the slot
  // of our own rank is the send buffer.
  SharedBuffer& send_buffer(int sz, int buff) {
    return buffers_[sz * NUM_BUFFERS * size_ + buff * size_ + rank_];
  }

  SharedBuffer& recv_buffer(int sz, int buff, int rank) {
    return buffers_[sz * NUM_BUFFERS * size_ + buff * size_ + rank];
  }

  // Post a send of our send buffer to every peer.
  void post_send_all(int sz, int buff) {
    auto& b = send_buffer(sz, buff);
    int wr_id = SEND_WR << 16 | buff << 8;
    for (int i = 0; i < size_; i++) {
      if (i == rank_) {
        continue;
      }
      connections_[i].post_send(b, wr_id | i);
    }
  }

  // Post a recv from every peer to the corresponding recv buffer.
  void post_recv_all(int sz, int buff) {
    int b = sz * NUM_BUFFERS * size_ + buff * size_;
    int wr_id = RECV_WR << 16 | buff << 8;
    for (int i = 0; i < size_; i++) {
      if (i == rank_) {
        continue;
      }
      connections_[i].post_recv(buffers_[b + i], wr_id | i);
    }
  }

  int rank_;
  int size_;
  // Non-owning views of the connections and buffers passed at construction.
  std::span<Connection> connections_;
  std::span<SharedBuffer> buffers_;
};

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/no_jaccl.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/distributed/jaccl/jaccl.h"

namespace mlx::core::distributed::jaccl {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

// Stub used when JACCL is not compiled in: the backend is never available.
bool is_available() {
  constexpr bool jaccl_compiled_in = false;
  return jaccl_compiled_in;
}

// Stub used when JACCL is not compiled in: there is never a group to return,
// so either report it with nullptr or throw when `strict` is requested.
std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  if (!strict) {
    return nullptr;
  }
  throw std::runtime_error("Cannot initialize jaccl distributed backend.");
}

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/ring.cpp
================================================
// Copyright © 2026 Apple Inc.

#include "mlx/distributed/jaccl/ring.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/distributed/reduction_ops.h"
#include "mlx/dtype_utils.h"

namespace mlx::core::distributed::jaccl {

// Build a ring group of `size` ranks for `rank`. `left_devices` and
// `right_devices` name the devices connecting us to each neighbor and
// `coordinator_addr` is handed to the side channel.
RingGroup::RingGroup(
    int rank,
    int size,
    const std::vector<std::string>& left_devices,
    const std::vector<std::string>& right_devices,
    const char* coordinator_addr)
    : rank_(rank),
      size_(size),
      n_conns_(left_devices.size()),
      side_channel_(rank_, size_, coordinator_addr),
      left_(create_connections(left_devices)),
      right_(create_connections(right_devices)) {
  const bool too_many_connections =
      left_.size() > RING_MAX_CONNS || right_.size() > RING_MAX_CONNS;
  if (too_many_connections) {
    std::ostringstream err;
    err << "[jaccl] Up to " << RING_MAX_CONNS << " per direction supported but "
        << left_.size() << " were provided.";
    throw std::runtime_error(err.str());
  }

  // Bring up every connection and allocate the communication buffers.
  initialize();

  // Barrier: wait until all nodes have finished their initialization.
  side_channel_.all_gather<int>(0);

  // Everything is ready so the ring implementation can be created.
  ring_ = RingImpl(rank_, size_, left_, right_, send_buffers_, recv_buffers_);
}

void RingGroup::initialize() {
  // Create the queue pairs
  for (auto& conn : left_) {
    conn.allocate_protection_domain();
    conn.create_completion_queue(MAX_SEND_WR + MAX_RECV_WR);
    conn.create_queue_pair();
  }
  for (auto& conn : right_) {
    conn.allocate_protection_domain();
    conn.create_completion_queue(MAX_SEND_WR + MAX_RECV_WR);
    conn.create_queue_pair();
  }

  // Allocate the buffers
  allocate_buffers();

  // Initialize the conections
  for (auto& conn : left_) {
    conn.queue_pair_init();
  }
  for (auto& conn : right_) {
    conn.queue_pair_init();
  }

  // Gather the information to be exchanged, this also serves as a barrier so
  // that all peers have initialized their connections before attempting to
  // transition to RTS.
  std::vector<Destination> left_info;
  for (auto& conn : left_) {
    left_info.emplace_back(conn.info());
  }
  std::vector<Destination> right_info;
  for (auto& conn : right_) {
    right_info.emplace_back(conn.info());
  }
  auto all_left_infos = side_channel_.all_gather(left_info);
  auto all_right_infos = side_channel_.all_gather(right_info);

  // Transition queue pairs to RTS
  int left_peer = (rank_ + size_ - 1) % size_;
  for (int i = 0; i < left_.size(); i++) {
    auto peer_info = all_right_infos[left_peer][i];
    left_[i].queue_pair_rtr(peer_info);
    left_[i].queue_pair_rts();
  }
  int right_peer = (rank_ + 1) % size_;
  for (int i = 0; i < right_.size(); i++) {
    auto peer_info = all_left_infos[right_peer][i];
    right_[i].queue_pair_rtr(peer_info);
    right_[i].queue_pair_rts();
  }
}

// (Re)allocate the send and recv buffers and register each of them with the
// protection domain of the connection it will be used with.
void RingGroup::allocate_buffers() {
  // Dropping the old buffers deregisters them and frees their memory.
  send_buffers_.clear();
  recv_buffers_.clear();

  const int per_slot = n_conns_ * 2;

  // Allocation pass: one send and one recv buffer per wire and direction for
  // every buffer size and pipeline slot.
  for (int k = 0; k < BUFFER_SIZES; k++) {
    for (int i = 0; i < NUM_BUFFERS; i++) {
      for (int j = 0; j < per_slot; j++) {
        send_buffers_.emplace_back(FRAME_SIZE * (1 << k));
        recv_buffers_.emplace_back(FRAME_SIZE * (1 << k));
      }
    }
  }

  // Registration pass. Within a slot the first n_conns_ buffers send to the
  // right and receive from the left, the remaining ones do the opposite.
  for (int k = 0; k < BUFFER_SIZES; k++) {
    for (int i = 0; i < NUM_BUFFERS; i++) {
      const int base = (k * NUM_BUFFERS + i) * per_slot;
      for (int j = 0; j < per_slot; j++) {
        const int wire = j % n_conns_;
        const bool sends_left = (j / n_conns_) != 0;

        auto& send_conn = sends_left ? left_[wire] : right_[wire];
        auto& recv_conn = sends_left ? right_[wire] : left_[wire];

        send_buffers_[base + j].register_to_protection_domain(
            send_conn.protection_domain);
        recv_buffers_[base + j].register_to_protection_domain(
            recv_conn.protection_domain);
      }
    }
  }
}

// Element-wise sum across all ranks, dispatched on the output dtype.
void RingGroup::all_sum(const array& input, array& output, Stream stream) {
  auto run = [&](auto tag) {
    using T = MLX_GET_TYPE(tag);
    all_reduce<T>(input, output, stream, detail::SumOp<T>{});
  };
  dispatch_all_types(output.dtype(), run);
}

// Element-wise maximum across all ranks, dispatched on the output dtype.
void RingGroup::all_max(const array& input, array& output, Stream stream) {
  auto run = [&](auto tag) {
    using T = MLX_GET_TYPE(tag);
    all_reduce<T>(input, output, stream, detail::MaxOp<T>{});
  };
  dispatch_all_types(output.dtype(), run);
}

// Element-wise minimum across all ranks, dispatched on the output dtype.
void RingGroup::all_min(const array& input, array& output, Stream stream) {
  auto run = [&](auto tag) {
    using T = MLX_GET_TYPE(tag);
    all_reduce<T>(input, output, stream, detail::MinOp<T>{});
  };
  dispatch_all_types(output.dtype(), run);
}

// Schedules a ring all-gather on the CPU command encoder of `stream`. The
// arrays are treated as raw bytes; every rank contributes input.nbytes()
// bytes and all available connections per neighbor are used.
void RingGroup::all_gather(const array& input, array& output, Stream stream) {
  const char* src = input.data<char>();
  char* dst = output.data<char>();
  const int64_t bytes_per_rank = input.nbytes();

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(input);
  encoder.set_output_array(output);

  auto task = [src, dst, bytes_per_rank, this]() {
    ring_.all_gather(src, dst, bytes_per_rank, n_conns_);
  };
  encoder.dispatch(task);
}

// Schedules a point-to-point send of `input` to rank `dst`. In ring mode only
// the two direct neighbors are reachable; any other destination throws
// std::runtime_error before anything is scheduled.
void RingGroup::send(const array& input, int dst, Stream stream) {
  const int next_rank = (rank_ + 1) % size_;
  const int prev_rank = (rank_ + size_ - 1) % size_;
  const bool is_neighbor = (dst == next_rank) || (dst == prev_rank);
  if (!is_neighbor) {
    std::ostringstream msg;
    msg << "[jaccl] In ring mode send is only supported to direct neighbors "
        << "but tried to send to " << dst << " from " << rank_ << std::endl;
    throw std::runtime_error(msg.str());
  }

  const char* payload = input.data<char>();
  const int64_t payload_bytes = input.nbytes();

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(input);

  auto task = [payload, payload_bytes, dst, this]() {
    ring_.send(payload, payload_bytes, dst, n_conns_);
  };
  encoder.dispatch(task);
}

// Schedules a point-to-point receive into `out` from rank `src`. In ring mode
// only the two direct neighbors are reachable; any other source throws
// std::runtime_error before anything is scheduled.
void RingGroup::recv(array& out, int src, Stream stream) {
  const int next_rank = (rank_ + 1) % size_;
  const int prev_rank = (rank_ + size_ - 1) % size_;
  const bool is_neighbor = (src == next_rank) || (src == prev_rank);
  if (!is_neighbor) {
    std::ostringstream msg;
    msg << "[jaccl] In ring mode recv is only supported to direct neighbors "
        << "but tried to recv from " << src << " to " << rank_ << std::endl;
    throw std::runtime_error(msg.str());
  }

  char* destination = out.data<char>();
  const int64_t expected_bytes = out.nbytes();

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_output_array(out);

  auto task = [destination, expected_bytes, src, this]() {
    ring_.recv(destination, expected_bytes, src, n_conns_);
  };
  encoder.dispatch(task);
}

// Schedules a ring all-reduce of `input` into `output` with `reduce_op`.
//
// The strategy is picked from the message size when the task runs:
//   - fewer than size_ * 2 * n_conns_ elements: one direction, one wire
//   - at most 64 KiB: both directions, one wire each
//   - otherwise: both directions, all n_conns_ wires
template <typename T, typename ReduceOp>
void RingGroup::all_reduce(
    const array& input,
    array& output,
    Stream stream,
    ReduceOp reduce_op) {
  auto in_ptr = input.data<T>();
  auto out_ptr = output.data<T>();
  const int64_t size = input.size();
  const int64_t n_bytes = input.nbytes();

  auto& encoder = cpu::get_command_encoder(stream);
  encoder.set_input_array(input);
  encoder.set_output_array(output);

  encoder.dispatch([in_ptr, out_ptr, size, n_bytes, this, reduce_op]() {
    const bool too_few_elements = size < size_ * 2 * n_conns_;
    if (too_few_elements) {
      ring_.all_reduce<1, T, ReduceOp>(in_ptr, out_ptr, size, 1, reduce_op);
    } else if (n_bytes <= 65536) {
      ring_.all_reduce<2, T, ReduceOp>(in_ptr, out_ptr, size, 1, reduce_op);
    } else {
      ring_.all_reduce<2, T, ReduceOp>(
          in_ptr, out_ptr, size, n_conns_, reduce_op);
    }
  });
}

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/ring.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

#include "mlx/distributed/distributed_impl.h"
#include "mlx/distributed/jaccl/ring_impl.h"
#include "mlx/distributed/jaccl/utils.h"

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

namespace mlx::core::distributed::jaccl {

/**
 * The JACCL communication group for a ring where each node is connected to its
 * two neighboring nodes. It should be the highest bandwidth communication
 * group for large messages when many connections per peer are used.
 *
 * Like all JACCL groups it uses a side channel to exchange the necessary
 * information and then configure the connections to be ready for RDMA
 * operations.
 */
class RingGroup : public GroupImpl {
 public:
  /**
   * @param rank This process' position in the ring.
   * @param size Number of processes in the ring.
   * @param left_devices Device names used for the connections to the left
   *     neighbor.
   * @param right_devices Device names used for the connections to the right
   *     neighbor.
   * @param coordinator_addr Address handed to the side channel
   *     (NOTE(review): exact format is defined by SideChannel, not visible
   *     here).
   */
  RingGroup(
      int rank,
      int size,
      const std::vector<std::string>& left_devices,
      const std::vector<std::string>& right_devices,
      const char* coordinator_addr);

  // Communication for this group always runs on a CPU stream.
  Stream communication_stream(StreamOrDevice s) override {
    return to_stream(s, Device::cpu);
  }

  int rank() override {
    return rank_;
  }

  int size() override {
    return size_;
  }

  // Collectives and point-to-point operations. send/recv only accept the two
  // direct ring neighbors as peers and throw std::runtime_error otherwise.
  void all_sum(const array& input, array& output, Stream stream) override;
  void all_max(const array& input, array& output, Stream stream) override;
  void all_min(const array& input, array& output, Stream stream) override;
  void all_gather(const array& input, array& output, Stream stream) override;
  void send(const array& input, int dst, Stream stream) override;
  void recv(array& out, int src, Stream stream) override;

  // Not implemented for the ring group; always throws.
  void sum_scatter(const array& input, array& output, Stream stream) override {
    throw std::runtime_error("[jaccl] sum_scatter not supported.");
  }

  // Not implemented for the ring group; always throws.
  std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
    throw std::runtime_error("[jaccl] Group split not supported.");
  }

 private:
  /**
   * Shared implementation of all_sum/all_max/all_min: schedules a ring
   * all-reduce over elements of type T combined with reduce_op.
   */
  template <typename T, typename ReduceOp>
  void all_reduce(
      const array& input,
      array& output,
      Stream stream,
      ReduceOp reduce_op);

  /**
   * Performs the connection initialization. Namely, after this call all
   * Connection objects should have a queue pair in RTS state and all buffers
   * should have been allocated.
   */
  void initialize();

  /**
   * Allocate all the buffers that we will use in the communication group.
   */
  void allocate_buffers();

  int rank_;
  int size_;
  // Number of connections (wires) to each neighbor.
  int n_conns_;
  SideChannel side_channel_;
  // Connections to the left and right neighbor, one entry per wire.
  std::vector<Connection> left_;
  std::vector<Connection> right_;
  // Staging buffers indexed by (size class, pipeline slot, direction, wire).
  std::vector<SharedBuffer> send_buffers_;
  std::vector<SharedBuffer> recv_buffers_;
  // Runs the ring algorithms on top of the connections and buffers above.
  RingImpl ring_;
};

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/ring_impl.h
================================================
// Copyright © 2026 Apple Inc.

#pragma once

#include <span>

#include "mlx/distributed/jaccl/utils.h"

constexpr int RING_MAX_CONNS = 4;

namespace mlx::core::distributed::jaccl {

class RingImpl {
 public:
  // Builds a ring over all connections of `left` and `right`. Only views of
  // the vectors are stored, so the caller must keep them alive and unmodified
  // for as long as this object is used. The number of wires is taken from
  // left.size().
  RingImpl(
      int rank,
      int size,
      std::vector<Connection>& left,
      std::vector<Connection>& right,
      std::vector<SharedBuffer>& send_buffers,
      std::vector<SharedBuffer>& recv_buffers)
      : rank_(rank),
        size_(size),
        n_conns_(static_cast<int>(left.size())),
        left_(left),
        right_(right),
        send_buffers_(send_buffers),
        recv_buffers_(recv_buffers) {}

  // Builds a ring over `n_conns` connections starting at `left_begin` and
  // `right_begin`. Only views are stored, so the pointed-to connections and
  // the buffer vectors must outlive this object.
  RingImpl(
      int rank,
      int size,
      Connection* left_begin,
      Connection* right_begin,
      size_t n_conns,
      std::vector<SharedBuffer>& send_buffers,
      std::vector<SharedBuffer>& recv_buffers)
      : rank_(rank),
        size_(size),
        n_conns_(static_cast<int>(n_conns)),
        left_(left_begin, n_conns),
        right_(right_begin, n_conns),
        send_buffers_(send_buffers),
        recv_buffers_(recv_buffers) {}

  // Placeholder ring: a single rank with no connections and empty views.
  RingImpl() : rank_{0}, size_{1}, n_conns_{0} {}

  /**
   * Ring all-reduce of `size` elements of type T.
   *
   * The data is reduced in place in `out_ptr` (after copying `in_ptr` into it
   * when they differ) in two phases of size_ - 1 steps each: a reduce-scatter
   * followed by an all-gather.
   *
   * @tparam MAX_DIR Number of ring directions used (1 or 2). With 2 directions
   *     every chunk is split in two parts that travel in opposite directions.
   * @param in_ptr Input elements.
   * @param out_ptr Output elements, may alias in_ptr.
   * @param size Number of elements.
   * @param n_wires Number of connections used per direction.
   * @param reduce_op Called as reduce_op(received, accumulator, count).
   */
  template <int MAX_DIR, typename T, typename ReduceOp>
  void all_reduce(
      const T* in_ptr,
      T* out_ptr,
      int64_t size,
      int n_wires,
      ReduceOp reduce_op) {
    // Nothing to reduce. Returning early also avoids the modulo by
    // chunk_multiple_size below, which would be a modulo by zero for an empty
    // input.
    if (size <= 0) {
      return;
    }

    // If not inplace all reduce then copy the input to the output first
    if (in_ptr != out_ptr) {
      std::memcpy(out_ptr, in_ptr, size * sizeof(T));
    }

    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE * RING_MAX_CONNS * 2 * MAX_DIR;
    // Every rank owns one chunk; a chunk is split across directions and wires
    // and each wire moves its part N elements at a time.
    int64_t chunk_size = (size + size_ - 1) / size_;
    int64_t size_per_wire =
        (chunk_size + (MAX_DIR * n_wires) - 1) / (MAX_DIR * n_wires);
    auto [sz, N] = buffer_size_from_message(size_per_wire * sizeof(T));
    N /= sizeof(T);
    int64_t n_steps = (size_per_wire + N - 1) / N;

    // Counters to maintain the state of transfers
    int in_flight = 0;
    int64_t chunk_multiple_size = size_ * chunk_size;
    int64_t send_offset[MAX_DIR];
    int64_t recv_offset[MAX_DIR];
    int64_t send_limits[MAX_DIR];
    int64_t recv_limits[MAX_DIR];
    // Indexed by lr * RING_MAX_CONNS + lw, the same value that is stored in
    // the low byte of the work request id.
    int send_count[MAX_DIR * RING_MAX_CONNS] = {0};
    int recv_count[MAX_DIR * RING_MAX_CONNS] = {0};
    send_offset[0] = rank_ * chunk_size;
    recv_offset[0] = ((rank_ + size_ - 1) % size_) * chunk_size;
    if constexpr (MAX_DIR == 2) {
      send_offset[1] = rank_ * chunk_size;
      recv_offset[1] = ((rank_ + 1) % size_) * chunk_size;
      // Direction 0 handles the first n_wires * size_per_wire elements of a
      // chunk and direction 1 the remainder. The limits are relative to the
      // chunk start and clipped so that we never go past `size`.
      send_limits[0] = std::min(
          n_wires * size_per_wire, std::max<int64_t>(0, size - send_offset[0]));
      send_limits[1] =
          std::min(chunk_size, std::max<int64_t>(0, size - send_offset[1]));
      recv_limits[0] = std::min(
          n_wires * size_per_wire, std::max<int64_t>(0, size - recv_offset[0]));
      recv_limits[1] =
          std::min(chunk_size, std::max<int64_t>(0, size - recv_offset[1]));
    } else {
      send_limits[0] =
          std::min(chunk_size, std::max<int64_t>(0, size - send_offset[0]));
      recv_limits[0] =
          std::min(chunk_size, std::max<int64_t>(0, size - recv_offset[0]));
    }

    // First reduce scatter
    //
    // Possible perf improvement by not syncing at every step but running ahead
    // as needed.
    for (int k = 0; k < size_ - 1; k++) {
      // Prefill the pipeline
      int buff = 0;
      while (buff < n_steps && buff < PIPELINE) {
        post_recv_all<MAX_DIR>(sz, buff, n_wires);
        for (int lr = 0; lr < MAX_DIR; lr++) {
          for (int lw = 0; lw < n_wires; lw++) {
            int64_t offset = lw * N +
                send_count[lr * RING_MAX_CONNS + lw] * n_wires * N +
                lr * n_wires * size_per_wire;
            // The max() makes the range empty when offset is past the limit.
            std::copy(
                out_ptr + send_offset[lr] + offset,
                out_ptr + send_offset[lr] +
                    std::max(offset, std::min(offset + N, send_limits[lr])),
                send_buffer(sz, buff, lr, lw).begin<T>());
            send_count[lr * RING_MAX_CONNS + lw]++;
          }
        }
        post_send_all<MAX_DIR>(sz, buff, n_wires);

        buff++;
        in_flight += 2 * MAX_DIR * n_wires;
      }

      // Main loop
      //
      // Keep going until we have no longer data in flight.
      while (in_flight > 0) {
        ibv_wc wc[WC_NUM];
        int n = poll(left_, right_, WC_NUM, wc);
        for (int i = 0; i < n; i++) {
          // wr_id layout: work type << 16 | buffer slot << 8 | wire, where
          // wire == lr * RING_MAX_CONNS + lw.
          int work_type = wc[i].wr_id >> 16;
          int buff = (wc[i].wr_id >> 8) & 0xff;
          int wire = wc[i].wr_id & 0xff;
          int lr = wire / RING_MAX_CONNS;
          int lw = wire % RING_MAX_CONNS;

          in_flight--;

          // A send buffer was released, reuse it for the next piece if any.
          if (work_type == SEND_WR && send_count[wire] < n_steps) {
            int64_t offset = lw * N + send_count[wire] * n_wires * N +
                lr * n_wires * size_per_wire;
            std::copy(
                out_ptr + send_offset[lr] + offset,
                out_ptr + send_offset[lr] +
                    std::max(offset, std::min(offset + N, send_limits[lr])),
                send_buffer(sz, buff, lr, lw).begin<T>());
            send_to(sz, buff, lr, lw);
            in_flight++;
            send_count[wire]++;
          }

          // A piece arrived: reduce it into the output and repost the buffer
          // if there are more pieces to come on this wire.
          else if (work_type == RECV_WR) {
            int64_t offset = lw * N + recv_count[wire] * n_wires * N +
                lr * n_wires * size_per_wire;
            reduce_op(
                recv_buffer(sz, buff, lr, lw).begin<T>(),
                out_ptr + recv_offset[lr] + offset,
                std::max<int64_t>(0, std::min(N, recv_limits[lr] - offset)));
            recv_count[wire]++;
            if (recv_count[wire] + (PIPELINE - 1) < n_steps) {
              recv_from(sz, buff, lr, lw);
              in_flight++;
            }
          }
        }
      }

      // Rotate the chunks: direction 0 moves one chunk backwards, direction 1
      // one chunk forwards (both modulo the padded total size).
      send_offset[0] = (send_offset[0] + chunk_multiple_size - chunk_size) %
          chunk_multiple_size;
      recv_offset[0] = (recv_offset[0] + chunk_multiple_size - chunk_size) %
          chunk_multiple_size;
      if constexpr (MAX_DIR == 2) {
        send_offset[1] = (send_offset[1] + chunk_size) % chunk_multiple_size;
        recv_offset[1] = (recv_offset[1] + chunk_size) % chunk_multiple_size;
        send_limits[0] = std::min(
            n_wires * size_per_wire,
            std::max<int64_t>(0, size - send_offset[0]));
        send_limits[1] =
            std::min(chunk_size, std::max<int64_t>(0, size - send_offset[1]));
        recv_limits[0] = std::min(
            n_wires * size_per_wire,
            std::max<int64_t>(0, size - recv_offset[0]));
        recv_limits[1] =
            std::min(chunk_size, std::max<int64_t>(0, size - recv_offset[1]));
      } else {
        send_limits[0] =
            std::min(chunk_size, std::max<int64_t>(0, size - send_offset[0]));
        recv_limits[0] =
            std::min(chunk_size, std::max<int64_t>(0, size - recv_offset[0]));
      }
      for (int i = 0; i < MAX_DIR * RING_MAX_CONNS; i++) {
        send_count[i] = recv_count[i] = 0;
      }
    }

    // Secondly all gather
    //
    // The offsets are correct from the scatter reduce
    for (int k = 0; k < size_ - 1; k++) {
      // Prefill the pipeline
      int buff = 0;
      while (buff < n_steps && buff < PIPELINE) {
        post_recv_all<MAX_DIR>(sz, buff, n_wires);
        for (int lr = 0; lr < MAX_DIR; lr++) {
          for (int lw = 0; lw < n_wires; lw++) {
            int64_t offset = lw * N +
                send_count[lr * RING_MAX_CONNS + lw] * n_wires * N +
                lr * n_wires * size_per_wire;
            std::copy(
                out_ptr + send_offset[lr] + offset,
                out_ptr + send_offset[lr] +
                    std::max(offset, std::min(offset + N, send_limits[lr])),
                send_buffer(sz, buff, lr, lw).begin<T>());
            send_count[lr * RING_MAX_CONNS + lw]++;
          }
        }
        post_send_all<MAX_DIR>(sz, buff, n_wires);

        buff++;
        in_flight += 2 * MAX_DIR * n_wires;
      }

      // Main loop
      //
      // Keep going until we have no longer data in flight.
      while (in_flight > 0) {
        ibv_wc wc[WC_NUM];
        int n = poll(left_, right_, WC_NUM, wc);
        for (int i = 0; i < n; i++) {
          int work_type = wc[i].wr_id >> 16;
          int buff = (wc[i].wr_id >> 8) & 0xff;
          int wire = wc[i].wr_id & 0xff;
          int lr = wire / RING_MAX_CONNS;
          int lw = wire % RING_MAX_CONNS;

          in_flight--;

          if (work_type == SEND_WR && send_count[wire] < n_steps) {
            int64_t offset = lw * N + send_count[wire] * n_wires * N +
                lr * n_wires * size_per_wire;
            std::copy(
                out_ptr + send_offset[lr] + offset,
                out_ptr + send_offset[lr] +
                    std::max(offset, std::min(offset + N, send_limits[lr])),
                send_buffer(sz, buff, lr, lw).begin<T>());
            send_to(sz, buff, lr, lw);
            in_flight++;
            send_count[wire]++;
          }

          // Same as in the reduce scatter but the received piece replaces the
          // local data instead of being reduced into it.
          else if (work_type == RECV_WR) {
            int64_t offset = lw * N + recv_count[wire] * n_wires * N +
                lr * n_wires * size_per_wire;
            std::copy(
                recv_buffer(sz, buff, lr, lw).begin<T>(),
                recv_buffer(sz, buff, lr, lw).begin<T>() +
                    std::max<int64_t>(0, std::min(N, recv_limits[lr] - offset)),
                out_ptr + recv_offset[lr] + offset);
            recv_count[wire]++;
            if (recv_count[wire] + (PIPELINE - 1) < n_steps) {
              recv_from(sz, buff, lr, lw);
              in_flight++;
            }
          }
        }
      }

      send_offset[0] = (send_offset[0] + chunk_multiple_size - chunk_size) %
          chunk_multiple_size;
      recv_offset[0] = (recv_offset[0] + chunk_multiple_size - chunk_size) %
          chunk_multiple_size;
      if constexpr (MAX_DIR == 2) {
        send_offset[1] = (send_offset[1] + chunk_size) % chunk_multiple_size;
        recv_offset[1] = (recv_offset[1] + chunk_size) % chunk_multiple_size;
        send_limits[0] = std::min(
            n_wires * size_per_wire,
            std::max<int64_t>(0, size - send_offset[0]));
        send_limits[1] =
            std::min(chunk_size, std::max<int64_t>(0, size - send_offset[1]));
        recv_limits[0] = std::min(
            n_wires * size_per_wire,
            std::max<int64_t>(0, size - recv_offset[0]));
        recv_limits[1] =
            std::min(chunk_size, std::max<int64_t>(0, size - recv_offset[1]));
      } else {
        send_limits[0] =
            std::min(chunk_size, std::max<int64_t>(0, size - send_offset[0]));
        recv_limits[0] =
            std::min(chunk_size, std::max<int64_t>(0, size - recv_offset[0]));
      }
      for (int i = 0; i < MAX_DIR * RING_MAX_CONNS; i++) {
        send_count[i] = recv_count[i] = 0;
      }
    }
  }

  /**
   * Ring all-gather of raw bytes.
   *
   * Every rank contributes n_bytes which end up at offset rank * n_bytes of
   * out_ptr on all ranks. Each slot is split in two parts that travel around
   * the ring in opposite directions, each part spread over n_wires
   * connections.
   *
   * @param in_ptr The n_bytes contributed by this rank.
   * @param out_ptr Output of size_ * n_bytes bytes.
   * @param n_bytes Bytes contributed per rank.
   * @param n_wires Number of connections used per direction.
   */
  void
  all_gather(const char* in_ptr, char* out_ptr, int64_t n_bytes, int n_wires) {
    // Nothing to gather. Returning early also avoids the modulo by out_bytes
    // below, which would be a modulo by zero for an empty input.
    if (n_bytes <= 0) {
      return;
    }

    // Copy our data to the appropriate place
    std::memcpy(out_ptr + rank_ * n_bytes, in_ptr, n_bytes);

    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE * RING_MAX_CONNS * 2 * 2;
    size_t n_bytes_per_wire = (n_bytes + (2 * n_wires) - 1) / (2 * n_wires);
    size_t out_bytes = n_bytes * size_;
    auto [sz, N] = buffer_size_from_message(n_bytes_per_wire);
    int n_steps = (n_bytes_per_wire + N - 1) / N;

    // Counters to maintain the state of transfers
    int in_flight = 0;
    int64_t send_offset[2];
    int64_t recv_offset[2];
    int64_t limits[2];
    // Indexed by lr * RING_MAX_CONNS + lw, the same value that is stored in
    // the low byte of the work request id.
    int send_count[2 * RING_MAX_CONNS] = {0};
    int recv_count[2 * RING_MAX_CONNS] = {0};
    send_offset[0] = send_offset[1] = rank_ * n_bytes;
    recv_offset[0] = ((rank_ + size_ - 1) % size_) * n_bytes;
    recv_offset[1] = ((rank_ + 1) % size_) * n_bytes;
    // Direction 0 moves the first n_wires * n_bytes_per_wire bytes of a slot
    // and direction 1 the remainder. For tiny messages the rounded up product
    // can exceed n_bytes so it has to be clamped, otherwise the copies below
    // would spill into the neighboring slot or past the end of the output.
    limits[0] = std::min<int64_t>(n_wires * n_bytes_per_wire, n_bytes);
    limits[1] = n_bytes;

    // Possible perf improvement by not syncing at every step but running ahead
    // as needed.
    for (int k = 0; k < size_ - 1; k++) {
      // Prefill the pipeline
      int buff = 0;
      while (buff < n_steps && buff < PIPELINE) {
        // Post on exactly the n_wires wires accounted for in in_flight.
        post_recv_all<2>(sz, buff, n_wires);
        for (int lr = 0; lr < 2; lr++) {
          for (int lw = 0; lw < n_wires; lw++) {
            int64_t offset = lw * N +
                send_count[lr * RING_MAX_CONNS + lw] * n_wires * N +
                lr * n_wires * n_bytes_per_wire;
            // The max() makes the range empty when offset is past the limit.
            std::copy(
                out_ptr + send_offset[lr] + offset,
                out_ptr + send_offset[lr] +
                    std::max(offset, std::min(offset + N, limits[lr])),
                send_buffer(sz, buff, lr, lw).begin<char>());
            send_count[lr * RING_MAX_CONNS + lw]++;
          }
        }
        post_send_all<2>(sz, buff, n_wires);

        buff++;
        in_flight += 2 * 2 * n_wires;
      }

      // Main loop
      //
      // Keep going until we have no longer data in flight.
      while (in_flight > 0) {
        ibv_wc wc[WC_NUM];
        int n = poll(left_, right_, WC_NUM, wc);
        for (int i = 0; i < n; i++) {
          // wr_id layout: work type << 16 | buffer slot << 8 | wire, where
          // wire == lr * RING_MAX_CONNS + lw.
          int work_type = wc[i].wr_id >> 16;
          int buff = (wc[i].wr_id >> 8) & 0xff;
          int wire = wc[i].wr_id & 0xff;
          int lr = wire / RING_MAX_CONNS;
          int lw = wire % RING_MAX_CONNS;

          in_flight--;

          // A send buffer was released, reuse it for the next piece if any.
          if (work_type == SEND_WR && send_count[wire] < n_steps) {
            int64_t offset = lw * N + send_count[wire] * n_wires * N +
                lr * n_wires * n_bytes_per_wire;
            std::copy(
                out_ptr + send_offset[lr] + offset,
                out_ptr + send_offset[lr] +
                    std::max(offset, std::min(offset + N, limits[lr])),
                send_buffer(sz, buff, lr, lw).begin<char>());
            send_to(sz, buff, lr, lw);
            in_flight++;
            send_count[wire]++;
          }

          // A piece arrived: copy it to the output and repost the buffer if
          // there are more pieces to come on this wire.
          else if (work_type == RECV_WR) {
            int64_t offset = lw * N + recv_count[wire] * n_wires * N +
                lr * n_wires * n_bytes_per_wire;
            std::copy(
                recv_buffer(sz, buff, lr, lw).begin<char>(),
                recv_buffer(sz, buff, lr, lw).begin<char>() +
                    std::max<int64_t>(0, std::min(N, limits[lr] - offset)),
                out_ptr + recv_offset[lr] + offset);
            recv_count[wire]++;
            if (recv_count[wire] + (PIPELINE - 1) < n_steps) {
              recv_from(sz, buff, lr, lw);
              in_flight++;
            }
          }
        }
      }

      // Rotate the slots: direction 0 moves one slot backwards, direction 1
      // one slot forwards.
      send_offset[0] = (send_offset[0] + out_bytes - n_bytes) % out_bytes;
      recv_offset[0] = (recv_offset[0] + out_bytes - n_bytes) % out_bytes;
      send_offset[1] = (send_offset[1] + n_bytes) % out_bytes;
      recv_offset[1] = (recv_offset[1] + n_bytes) % out_bytes;
      for (int i = 0; i < 2 * RING_MAX_CONNS; i++) {
        send_count[i] = recv_count[i] = 0;
      }
    }
  }

  void send(const char* in_ptr, int64_t n_bytes, int dst, int n_wires) {
    int left = (rank_ + size_ - 1) % size_;

    // In the case that size_ == 2 then left == right so we bias send towards
    // left and recv towards right so that the selections will be correct for
    // the 2 node case.
    auto& conns = (dst == left) ? left_ : right_;
    int dir = dst == left;

    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE * RING_MAX_CONNS;

    int64_t bytes_per_wire = (n_bytes + n_wires - 1) / n_wires;
    auto [sz, N] = buffer_size_from_message(bytes_per_wire);

    int in_flight = 0;
    int64_t read_offset[RING_MAX_CONNS];
    int64_t limits[RING_MAX_CONNS];
    for (int lw = 0; lw < n_wires; lw++) {
      read_offset[lw] = std::min(lw * bytes_per_wire, n_bytes);
      limits[lw] = std::min((lw + 1) * bytes_per_wire, n_bytes);
    }

    // Prefill the pipeline
    for (int lw = 0; lw < n_wires; lw++) {
      int buff = 0;
      while (read_offset[lw] < limits[lw] && buff < PIPELINE) {
        std::copy(
            in_ptr + read_offset[lw],
            in_ptr + std::min(read_offset[lw] + N, limits[lw]),
            send_buffer(sz, buff, dir, lw).begin<char>());
        send_to(sz, buff, dir, lw);

        buff++;
        read_offset[lw] += N;
        in_flight++;
      }
    }

    // Main loop
    while (in_flight > 0) {
      // Poll the hardware for completions.
      //
      // If a send was completed and we have more data to send then go ahead
      // and send them.
      ibv_wc wc[WC_NUM];
      int n = poll(conns, WC_NUM, wc);
      for (int i = 0; i < n; i++) {
        int buff = (wc[i].wr_id >> 8) & 0xff;
        int wire = wc[i].wr_id & 0xff;
        int lw = wire % RING_MAX_CONNS;

        in_flight--;

        if (read_offset[lw] < limits[lw]) {
          std::copy(
              in_ptr + read_offset[lw],
              in_ptr + std::min(read_offset[lw] + N, limits[lw]),
              send_buffer(sz, buff, dir, lw).begin<char>());
          send_to(sz, buff, dir, lw);

          read_offset[lw] += N;
          in_flight++;
        }
      }
    }
  }

  /**
   * Receives n_bytes from the neighbor `src` into out_ptr.
   *
   * The message is split into one contiguous slice per wire, mirroring send(),
   * and up to PIPELINE receives are kept posted on each wire.
   *
   * @param out_ptr Destination of the received bytes.
   * @param n_bytes Total number of bytes to receive.
   * @param src Rank to receive from, one of the two ring neighbors.
   * @param n_wires Number of connections the message is split across.
   */
  void recv(char* out_ptr, int64_t n_bytes, int src, int n_wires) {
    int right = (rank_ + 1) % size_;

    // In the case that size_ == 2 then left == right so we bias send towards
    // left and recv towards right so that the selections will be correct for
    // the 2 node case.
    auto& conns = (src == right) ? right_ : left_;
    int dir = src == right;

    constexpr int PIPELINE = 2;
    constexpr int WC_NUM = PIPELINE * RING_MAX_CONNS;

    int64_t bytes_per_wire = (n_bytes + n_wires - 1) / n_wires;
    auto [sz, N] = buffer_size_from_message(bytes_per_wire);

    // Per wire: where the next piece is written and one past the last byte of
    // the slice. Both are absolute offsets into out_ptr.
    int in_flight = 0;
    int64_t write_offset[RING_MAX_CONNS];
    int64_t limits[RING_MAX_CONNS];
    for (int lw = 0; lw < n_wires; lw++) {
      write_offset[lw] = std::min(lw * bytes_per_wire, n_bytes);
      limits[lw] = std::min((lw + 1) * bytes_per_wire, n_bytes);
    }

    // Prefill the pipeline
    //
    // The number of receives posted has to match the number of sends the peer
    // is going to issue on this wire, so the remaining bytes are measured from
    // the start of the wire's slice and not from the start of the message.
    // Otherwise wires other than the first one could post receives that are
    // never completed.
    for (int lw = 0; lw < n_wires; lw++) {
      int buff = 0;
      while (write_offset[lw] + N * buff < limits[lw] && buff < PIPELINE) {
        recv_from(sz, buff, dir, lw);

        buff++;
        in_flight++;
      }
    }

    // Main loop
    while (in_flight > 0) {
      // Poll the hardware for completions.
      //
      // If a recv was completed copy it to the output and if we have more
      // data to fetch post another recv.
      ibv_wc wc[WC_NUM];
      int n = poll(conns, WC_NUM, wc);
      for (int i = 0; i < n; i++) {
        int buff = (wc[i].wr_id >> 8) & 0xff;
        int wire = wc[i].wr_id & 0xff;
        int lw = wire % RING_MAX_CONNS;

        in_flight--;

        std::copy(
            recv_buffer(sz, buff, dir, lw).begin<char>(),
            recv_buffer(sz, buff, dir, lw).begin<char>() +
                std::max<int64_t>(
                    0, std::min<int64_t>(limits[lw] - write_offset[lw], N)),
            out_ptr + write_offset[lw]);
        write_offset[lw] += N;

        // PIPELINE - 1 receives are still posted for this wire, only post
        // another one if there is data left beyond those.
        if (write_offset[lw] + (PIPELINE - 1) * N < limits[lw]) {
          recv_from(sz, buff, dir, lw);

          in_flight++;
        }
      }
    }
  }

 private:
  // Posts the send buffer (sz, buff, left_right, wire). A non-zero left_right
  // sends through left_, zero sends through right_. The work request id packs
  // the work type, the buffer slot and left_right * RING_MAX_CONNS + wire.
  void send_to(int sz, int buff, int left_right, int wire) {
    const auto id_prefix = SEND_WR << 16 | buff << 8;
    if (!left_right) {
      right_[wire].post_send(send_buffer_right(sz, buff, wire), id_prefix | wire);
      return;
    }
    left_[wire].post_send(
        send_buffer_left(sz, buff, wire), id_prefix | (RING_MAX_CONNS + wire));
  }

  // Posts the recv buffer (sz, buff, left_right, wire). A non-zero left_right
  // receives through right_, zero receives through left_. The work request id
  // packs the work type, the buffer slot and
  // left_right * RING_MAX_CONNS + wire.
  void recv_from(int sz, int buff, int left_right, int wire) {
    const auto id_prefix = RECV_WR << 16 | buff << 8;
    if (!left_right) {
      left_[wire].post_recv(recv_buffer_left(sz, buff, wire), id_prefix | wire);
      return;
    }
    right_[wire].post_recv(
        recv_buffer_right(sz, buff, wire), id_prefix | (RING_MAX_CONNS + wire));
  }

  // Send buffer for the connection right_[wire]: first half of the slot
  // (sz, buff).
  SharedBuffer& send_buffer_right(int sz, int buff, int wire) {
    const int per_slot = n_conns_ * 2;
    const auto slot_start = (sz * NUM_BUFFERS + buff) * per_slot;
    return send_buffers_[slot_start + wire];
  }

  // Send buffer for the connection left_[wire]: second half of the slot
  // (sz, buff).
  SharedBuffer& send_buffer_left(int sz, int buff, int wire) {
    const int per_slot = n_conns_ * 2;
    const auto slot_start = (sz * NUM_BUFFERS + buff) * per_slot;
    return send_buffers_[slot_start + n_conns_ + wire];
  }

  // Send buffer of size class sz and pipeline slot buff for the given
  // direction (0 or 1) and wire.
  SharedBuffer& send_buffer(int sz, int buff, int left_right, int wire) {
    const int per_slot = n_conns_ * 2;
    const auto slot_start = (sz * NUM_BUFFERS + buff) * per_slot;
    return send_buffers_[slot_start + left_right * n_conns_ + wire];
  }

  // Recv buffer for the connection left_[wire]: first half of the slot
  // (sz, buff).
  SharedBuffer& recv_buffer_left(int sz, int buff, int wire) {
    const int per_slot = n_conns_ * 2;
    const auto slot_start = (sz * NUM_BUFFERS + buff) * per_slot;
    return recv_buffers_[slot_start + wire];
  }

  // Recv buffer for the connection right_[wire]: second half of the slot
  // (sz, buff).
  SharedBuffer& recv_buffer_right(int sz, int buff, int wire) {
    const int per_slot = n_conns_ * 2;
    const auto slot_start = (sz * NUM_BUFFERS + buff) * per_slot;
    return recv_buffers_[slot_start + n_conns_ + wire];
  }

  // Recv buffer of size class sz and pipeline slot buff for the given
  // direction (0 or 1) and wire.
  SharedBuffer& recv_buffer(int sz, int buff, int left_right, int wire) {
    const int per_slot = n_conns_ * 2;
    const auto slot_start = (sz * NUM_BUFFERS + buff) * per_slot;
    return recv_buffers_[slot_start + left_right * n_conns_ + wire];
  }

  // Posts the recv buffers of slot (sz, buff) on the first n_wires wires of
  // the first MAX_DIR directions, direction by direction.
  template <int MAX_DIR>
  void post_recv_all(int sz, int buff, int n_wires) {
    for (int dir = 0; dir != MAX_DIR; ++dir) {
      for (int w = 0; w < n_wires; ++w) {
        recv_from(sz, buff, dir, w);
      }
    }
  }

  // Posts the recv buffers of slot (sz, buff) on every wire of both
  // directions.
  void post_recv_all(int sz, int buff) {
    constexpr int BOTH_DIRECTIONS = 2;
    post_recv_all<BOTH_DIRECTIONS>(sz, buff, n_conns_);
  }

  // Posts the send buffers of slot (sz, buff) on the first n_wires wires of
  // the first MAX_DIR directions, direction by direction.
  template <int MAX_DIR>
  void post_send_all(int sz, int buff, int n_wires) {
    for (int dir = 0; dir != MAX_DIR; ++dir) {
      for (int w = 0; w < n_wires; ++w) {
        send_to(sz, buff, dir, w);
      }
    }
  }

  // Posts the send buffers of slot (sz, buff) on every wire of both
  // directions.
  void post_send_all(int sz, int buff) {
    constexpr int BOTH_DIRECTIONS = 2;
    post_send_all<BOTH_DIRECTIONS>(sz, buff, n_conns_);
  }

  int rank_;
  int size_;
  int n_conns_;
  std::span<Connection> left_;
  std::span<Connection> right_;
  std::span<SharedBuffer> send_buffers_;
  std::span<SharedBuffer> recv_buffers_;
};

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/utils.cpp
================================================
// Copyright © 2025 Apple Inc.

#include <dlfcn.h>
#include <unistd.h>
#include <iostream>
#include <sstream>

#include "mlx/distributed/jaccl/utils.h"

// Resolves `symbol` from librdma_handle_ into `variable`.
//
// dlerror() is called once before dlsym() to discard any stale error left by
// an earlier dl* call; otherwise that old error would be mistaken for a
// failure to resolve this symbol. If the lookup fails the error is printed,
// librdma_handle_ is reset to nullptr and the enclosing function returns.
#define LOAD_SYMBOL(symbol, variable)                               \
  {                                                                 \
    (void)dlerror();                                                \
    variable = (decltype(variable))dlsym(librdma_handle_, #symbol); \
    char* error = dlerror();                                        \
    if (error != nullptr) {                                         \
      std::cerr << IBV_TAG << " " << error << std::endl;            \
      librdma_handle_ = nullptr;                                    \
      return;                                                       \
    }                                                               \
  }

namespace {

// Allocates num_bytes aligned to the system page size. Returns nullptr when
// the allocation fails; the memory must be released with free().
void* page_aligned_alloc(size_t num_bytes) {
  static const size_t alignment = sysconf(_SC_PAGESIZE);
  void* memory = nullptr;
  const int status = posix_memalign(&memory, alignment, num_bytes);
  return status == 0 ? memory : nullptr;
}

} // namespace

namespace mlx::core::distributed::jaccl {

// Loads librdma.dylib and resolves the verbs entry points used by jaccl.
//
// If the library cannot be opened librdma_handle_ stays nullptr. Each
// LOAD_SYMBOL resets librdma_handle_ to nullptr and returns from the
// constructor as soon as a symbol cannot be resolved, so a non-null handle
// after construction means that every symbol below was found.
IBVWrapper::IBVWrapper() {
  librdma_handle_ = dlopen("librdma.dylib", RTLD_NOW | RTLD_GLOBAL);
  if (librdma_handle_ == nullptr) {
    return;
  }

  // Device discovery and lifetime
  LOAD_SYMBOL(ibv_get_device_list, get_device_list);
  LOAD_SYMBOL(ibv_get_device_name, get_device_name);
  LOAD_SYMBOL(ibv_open_device, open_device);
  LOAD_SYMBOL(ibv_free_device_list, free_device_list);
  LOAD_SYMBOL(ibv_close_device, close_device);

  // Protection domains, completion queues and queue pairs
  LOAD_SYMBOL(ibv_alloc_pd, alloc_pd);
  LOAD_SYMBOL(ibv_create_qp, create_qp);
  LOAD_SYMBOL(ibv_create_cq, create_cq);
  LOAD_SYMBOL(ibv_destroy_cq, destroy_cq);
  LOAD_SYMBOL(ibv_destroy_qp, destroy_qp);
  LOAD_SYMBOL(ibv_dealloc_pd, dealloc_pd);

  // Port queries, queue pair state changes and memory registration
  LOAD_SYMBOL(ibv_query_port, query_port);
  LOAD_SYMBOL(ibv_query_gid, query_gid);
  LOAD_SYMBOL(ibv_modify_qp, modify_qp);
  LOAD_SYMBOL(ibv_reg_mr, reg_mr);
  LOAD_SYMBOL(ibv_dereg_mr, dereg_mr);

  // Not really symbols but leaving them here in case they become symbols in
  // the future.
  //
  // LOAD_SYMBOL(ibv_post_send, post_send);
  // LOAD_SYMBOL(ibv_post_recv, post_recv);
  // LOAD_SYMBOL(ibv_poll_cq, poll_cq);
}

// Process-wide IBVWrapper, constructed on first use.
IBVWrapper& ibv() {
  static IBVWrapper instance;
  return instance;
}

// Allocates a page aligned buffer of num_bytes bytes.
//
// Throws std::runtime_error if the allocation fails instead of silently
// keeping a null data pointer that would only surface later, when the buffer
// is registered or written to. A null result for a zero sized request is not
// treated as an error.
SharedBuffer::SharedBuffer(size_t num_bytes)
    : data_(page_aligned_alloc(num_bytes)), num_bytes_(num_bytes) {
  if (data_ == nullptr && num_bytes > 0) {
    throw std::runtime_error("[jaccl] Couldn't allocate buffer memory");
  }
}

// Move constructor. Starts out empty and trades state with `b`, which is left
// without memory and without registrations so that its destructor has
// nothing to release.
SharedBuffer::SharedBuffer(SharedBuffer&& b) : data_(nullptr), num_bytes_(0) {
  std::swap(memory_regions_, b.memory_regions_);
  std::swap(num_bytes_, b.num_bytes_);
  std::swap(data_, b.data_);
}

// Deregisters the buffer from every protection domain and frees the memory.
SharedBuffer::~SharedBuffer() {
  for (auto& [pd, mr] : memory_regions_) {
    // An entry can hold a null region if a registration attempt failed; there
    // is nothing to deregister in that case and passing nullptr on is not
    // safe.
    if (mr != nullptr) {
      ibv().dereg_mr(mr);
    }
  }
  if (data_ != nullptr) {
    std::free(data_);
  }
}

// Registers the buffer as a memory region of `protection_domain` with local
// write, remote read and remote write access.
//
// Throws std::runtime_error if the buffer is already registered with this
// protection domain or if the registration fails. A failed registration
// leaves no entry behind, so it can be retried and the destructor never sees
// a null memory region.
void SharedBuffer::register_to_protection_domain(ibv_pd* protection_domain) {
  auto [it, inserted] = memory_regions_.insert({protection_domain, nullptr});
  if (!inserted) {
    throw std::runtime_error(
        "[jaccl] Buffer can be registered once per protection domain");
  }

  it->second = ibv().reg_mr(
      protection_domain,
      data_,
      num_bytes_,
      IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
          IBV_ACCESS_REMOTE_WRITE);
  if (!it->second) {
    // Roll back the placeholder entry inserted above.
    memory_regions_.erase(it);
    throw std::runtime_error("[jaccl] Register memory region failed");
  }
}

Connection::Connection(ibv_context* ctx_)
    : ctx(ctx_),
      protection_domain(nullptr),
      completion_queue(nullptr),
      queue_pair(nullptr) {
  src.local_id = -1;
}

// Move constructor. Starts out as an empty connection and trades state with
// `c`, which is left holding no resources.
Connection::Connection(Connection&& c) : Connection(nullptr) {
  std::swap(src, c.src);
  std::swap(queue_pair, c.queue_pair);
  std::swap(completion_queue, c.completion_queue);
  std::swap(protection_domain, c.protection_domain);
  std::swap(ctx, c.ctx);
}

// Releases whatever was created, in reverse order of creation: queue pair,
// completion queue, protection domain and finally the device context.
Connection::~Connection() {
  if (queue_pair) {
    ibv().destroy_qp(queue_pair);
  }
  if (completion_queue) {
    ibv().destroy_cq(completion_queue);
  }
  if (protection_domain) {
    ibv().dealloc_pd(protection_domain);
  }
  if (ctx) {
    ibv().close_device(ctx);
  }
}

// Allocates the protection domain for this connection's context.
// Throws std::runtime_error when the allocation returns null.
void Connection::allocate_protection_domain() {
  if (protection_domain = ibv().alloc_pd(ctx); !protection_domain) {
    throw std::runtime_error("[jaccl] Couldn't allocate protection domain");
  }
}

// Creates a completion queue with room for `num_entries` entries; no
// completion channel or user context is attached.
// Throws std::runtime_error when the creation returns null.
void Connection::create_completion_queue(int num_entries) {
  if (completion_queue =
          ibv().create_cq(ctx, num_entries, nullptr, nullptr, 0);
      !completion_queue) {
    throw std::runtime_error("[jaccl] Couldn't create completion queue");
  }
}

// Creates the queue pair of this connection inside its protection domain.
//
// The queue pair uses the connection's single completion queue for both send
// and receive completions, no shared receive queue, one scatter/gather entry
// per work request and the IBV_QPT_UC transport. Work requests are only
// signaled when they ask for it (sq_sig_all = 0).
//
// Throws std::runtime_error when the creation returns null.
void Connection::create_queue_pair() {
  // Value-initialize so that no field is ever handed over uninitialized.
  ibv_qp_init_attr init_attr = {};
  init_attr.qp_context = ctx;
  init_attr.send_cq = completion_queue;
  init_attr.recv_cq = completion_queue;
  init_attr.srq = nullptr;
  init_attr.cap.max_send_wr = MAX_SEND_WR;
  init_attr.cap.max_recv_wr = MAX_RECV_WR;
  init_attr.cap.max_send_sge = 1;
  init_attr.cap.max_recv_sge = 1;
  init_attr.cap.max_inline_data = 0;
  init_attr.qp_type = IBV_QPT_UC;
  init_attr.sq_sig_all = 0;

  queue_pair = ibv().create_qp(protection_domain, &init_attr);

  if (queue_pair == nullptr) {
    throw std::runtime_error("[jaccl] Couldn't create queue pair");
  }
}

// Returns the local destination information of this connection, computing it
// on first use.
//
// The information is only computed once a queue pair exists; before that (or
// once it has been computed, i.e. local_id >= 0) the stored value is returned
// unchanged. Port 1 and GID index 1 are queried.
const Destination& Connection::info() {
  if (queue_pair == nullptr || src.local_id >= 0) {
    return src;
  }

  // The query results are zero-initialized so that a failing query yields
  // well-defined zeros instead of indeterminate stack contents.
  ibv_port_attr port_attr = {};
  ibv().query_port(ctx, 1, &port_attr);
  ibv_gid gid = {};
  ibv().query_gid(ctx, 1, 1, &gid);

  src.local_id = port_attr.lid;
  src.queue_pair_number = queue_pair->qp_num;
  src.packet_sequence_number = 7; // TODO: Change to sth random
  src.global_identifier = gid;

  return src;
}

// Moves the queue pair to the INIT state on port 1 with partition key index 0
// and local write / remote read / remote write access.
// Throws std::invalid_argument carrying the returned status on failure.
void Connection::queue_pair_init() {
  const int access_flags =
      IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
  const int attr_mask =
      IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;

  ibv_qp_attr attr = {};
  attr.qp_access_flags = access_flags;
  attr.pkey_index = 0;
  attr.port_num = 1;
  attr.qp_state = IBV_QPS_INIT;

  const int status = ibv().modify_qp(queue_pair, &attr, attr_mask);
  if (status == 0) {
    return;
  }

  std::ostringstream msg;
  msg << "[jaccl] Changing queue pair to INIT failed with errno " << status;
  throw std::invalid_argument(msg.str());
}

// Moves the queue pair to the RTR (ready to receive) state, pointing it at
// the remote queue pair described by `dst`.
//
// The address handle uses port 1. When the destination carries a non-zero
// interface id in its global identifier, global routing is enabled with a hop
// limit of 1 and source GID index 1.
//
// Throws std::invalid_argument carrying the returned status on failure.
void Connection::queue_pair_rtr(const Destination& dst) {
  // Value-initialization already zeroes every field, as in the sibling
  // queue_pair_init() / queue_pair_rts(); no additional memset is needed.
  ibv_qp_attr attr = {};
  attr.qp_state = IBV_QPS_RTR;
  attr.path_mtu = IBV_MTU_1024;
  attr.rq_psn = dst.packet_sequence_number;
  attr.dest_qp_num = dst.queue_pair_number;
  attr.ah_attr.dlid = dst.local_id;
  attr.ah_attr.sl = 0;
  attr.ah_attr.src_path_bits = 0;
  attr.ah_attr.port_num = 1;
  attr.ah_attr.is_global = 0;

  if (dst.global_identifier.global.interface_id) {
    attr.ah_attr.is_global = 1;
    attr.ah_attr.grh.hop_limit = 1;
    attr.ah_attr.grh.dgid = dst.global_identifier;
    attr.ah_attr.grh.sgid_index = 1;
  }

  int mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
      IBV_QP_RQ_PSN;

  if (int status = ibv().modify_qp(queue_pair, &attr, mask); status != 0) {
    std::ostringstream msg;
    msg << "[jaccl] Changing queue pair to RTR failed with errno " << status;
    throw std::invalid_argument(msg.str());
  }
}

// Moves the queue pair to the RTS (ready to send) state, using the local
// packet sequence number as the send queue PSN.
// Throws std::invalid_argument carrying the returned status on failure.
void Connection::queue_pair_rts() {
  const int attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

  ibv_qp_attr attr = {};
  attr.sq_psn = src.packet_sequence_number;
  attr.qp_state = IBV_QPS_RTS;

  const int status = ibv().modify_qp(queue_pair, &attr, attr_mask);
  if (status == 0) {
    return;
  }

  std::ostringstream msg;
  msg << "[jaccl] Changing queue pair to RTS failed with errno " << status;
  throw std::invalid_argument(msg.str());
}

std::vector<Connection> create_connections(
    const std::vector<std::string>& device_names) {
  std::vector<Connection> connections;
  int num_devices = 0;
  ibv_device** devices = ibv().get_device_list(&num_devices);
  for (auto& name : device_names) {
    // Empty so add a nullptr context
    if (name.empty()) {
      connections.emplace_back(nullptr);
      continue;
    }

    // Search for the name and try to open the device
    for (int i = 0; i < num_devices; i++) {
      if (name == ibv().get_device_name(devices[i])) {
        auto ctx = ibv().open_device(devices[i]);
        if (ctx == nullptr) {
          std::ostringstream msg;
          msg << "[jaccl] Could not open device " << name;
          throw std::runtime_error(msg.str());
        }
        connections.emplace_back(ctx);
        break;
      }
    }
  }
  ibv().free_device_list(devices);

  return connections;
}

// Establishes the TCP side channel between `size` ranks.
//
// Rank 0 listens on `addr`, accepts size - 1 peers and reads the rank each
// peer announces, then reorders its sockets so that sockets_[r - 1] talks to
// rank r. Every other rank connects to `addr` (retrying through
// TCPSocket::connect) and announces its own rank.
//
// Throws std::runtime_error if a peer announces a rank outside [1, size - 1]
// or a rank that another peer already announced; without that check the
// reordering below would index out of bounds or never terminate.
SideChannel::SideChannel(int rank, int size, const char* addr)
    : rank_(rank), size_(size) {
  auto address = detail::parse_address(addr);

  if (rank_ == 0) {
    detail::TCPSocket server(IBV_TAG);
    server.listen(IBV_TAG, address);

    for (int i = 0; i < size - 1; i++) {
      sockets_.push_back(server.accept(IBV_TAG));
    }

    // ranks[i] is the zero-based socket slot (announced rank - 1) that the
    // i-th accepted socket belongs to.
    std::vector<int> ranks(size - 1);
    std::vector<bool> seen(size - 1, false);
    for (int i = 0; i < size - 1; i++) {
      sockets_[i].recv(
          IBV_TAG, reinterpret_cast<char*>(&ranks[i]), sizeof(int));
      ranks[i]--;
      if (ranks[i] < 0 || ranks[i] >= size - 1 || seen[ranks[i]]) {
        throw std::runtime_error(
            "[jaccl] Side channel received an invalid or duplicate rank");
      }
      seen[ranks[i]] = true;
    }

    // Apply the permutation in place: keep swapping the socket at slot i to
    // its target slot until slot i holds the socket that belongs there.
    for (int i = 0; i < size - 1; i++) {
      while (i != ranks[i]) {
        std::swap(sockets_[i], sockets_[ranks[i]]);
        std::swap(ranks[i], ranks[ranks[i]]);
      }
    }
  } else {
    sockets_.push_back(
        detail::TCPSocket::connect(
            IBV_TAG, address, 4, 1000, [](int attempt, int wait) {
              std::cerr << IBV_TAG << " Connection attempt " << attempt
                        << " waiting " << wait << " ms" << std::endl;
            }));
    sockets_[0].send(IBV_TAG, reinterpret_cast<char*>(&rank_), sizeof(int));
  }
}

// Move constructor: takes over the sockets of `sc` and copies its rank and
// size; `sc` is left with rank and size set to -1.
SideChannel::SideChannel(SideChannel&& sc)
    : rank_(sc.rank_), size_(sc.size_), sockets_(std::move(sc.sockets_)) {
  sc.size_ = -1;
  sc.rank_ = -1;
}

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/jaccl/utils.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <infiniband/verbs.h>

#include <span>
#include <unordered_map>
#include <vector>

#include "mlx/distributed/utils.h"

// Tag passed to the TCP side-channel socket calls and printed in log lines.
constexpr const char* IBV_TAG = "[jaccl]";
// NOTE(review): SEND_WR / RECV_WR are not used in this header; presumably
// they tag work requests as send vs. receive -- confirm against the users.
constexpr int SEND_WR = 1;
constexpr int RECV_WR = 2;
// Send / receive work request capacities requested when creating a queue
// pair (see Connection::create_queue_pair).
constexpr int MAX_SEND_WR = 32;
constexpr int MAX_RECV_WR = 32;
// Number of buffer size classes: class k holds FRAME_SIZE * 2^k bytes (see
// buffer_size_from_message).
constexpr int BUFFER_SIZES = 8;
// NOTE(review): not used in this header; presumably the number of buffers
// kept per size class -- confirm against the users.
constexpr int NUM_BUFFERS = 2;
// Size in bytes of the smallest buffer class.
constexpr int FRAME_SIZE = 4096;

// Shorthand for the shared distributed helpers (address parsing, TCPSocket).
namespace detail = mlx::core::distributed::detail;

namespace {

// Detection trait: `value` is true for types that expose both a nested
// `value_type` and a nested `iterator` (e.g. std::vector, std::string) and
// false for everything else.
template <typename T, typename Enable = void>
struct is_container : std::false_type {};

// Selected only when both nested types name valid types.
template <typename T>
struct is_container<
    T,
    std::void_t<typename T::iterator, typename T::value_type>>
    : std::true_type {};

// Picks the buffer size class for a message of `msg` bytes.
//
// Returns {k, bytes} where bytes = FRAME_SIZE * 2^k. On OS versions accepted
// by the availability check the largest class k in [1, BUFFER_SIZES - 1]
// whose size does not exceed `msg` is chosen; otherwise, or when the message
// is smaller than every such class, class 0 (FRAME_SIZE bytes) is returned.
inline std::pair<int, int64_t> buffer_size_from_message(int64_t msg) {
  if (__builtin_available(macOS 26.3, iOS 26.3, tvOS 26.3, visionOS 26.3, *)) {
    int k = BUFFER_SIZES - 1;
    while (k > 0) {
      const int64_t bytes = FRAME_SIZE * (1 << k);
      if (msg >= bytes) {
        return {k, bytes};
      }
      k--;
    }
  }
  return {0, FRAME_SIZE};
}

} // namespace

namespace mlx::core::distributed::jaccl {

/**
 * Wrapper for the ibverbs API.
 *
 * Holds one function pointer per verbs entry point used by this backend plus
 * the handle of the library they come from. The constructor is defined out of
 * line; presumably it loads the library and resolves the pointers -- the
 * pointers must only be used when is_available() returns true.
 */
struct IBVWrapper {
  IBVWrapper();
  // True when the library handle is non-null.
  bool is_available() {
    return librdma_handle_ != nullptr;
  }

  // API
  // Device discovery and lifetime.
  ibv_device** (*get_device_list)(int*);
  const char* (*get_device_name)(ibv_device*);
  ibv_context* (*open_device)(ibv_device*);
  void (*free_device_list)(ibv_device**);
  int (*close_device)(ibv_context*);

  // Creation and destruction of protection domains, queue pairs and
  // completion queues.
  ibv_pd* (*alloc_pd)(ibv_context*);
  ibv_qp* (*create_qp)(ibv_pd*, ibv_qp_init_attr*);
  ibv_cq* (*create_cq)(ibv_context*, int, void*, ibv_comp_channel*, int);
  int (*destroy_cq)(ibv_cq*);
  int (*destroy_qp)(ibv_qp*);
  int (*dealloc_pd)(ibv_pd*);

  // Port / GID queries, queue pair state changes and memory registration.
  int (*query_port)(ibv_context*, uint8_t, ibv_port_attr*);
  int (*query_gid)(ibv_context*, uint8_t, int, ibv_gid*);
  int (*modify_qp)(ibv_qp*, ibv_qp_attr*, int);
  ibv_mr* (*reg_mr)(ibv_pd*, void*, size_t, int);
  int (*dereg_mr)(ibv_mr*);

 private:
  // Handle of the loaded library; null when the API is unavailable.
  void* librdma_handle_;
};

// Returns the IBVWrapper instance shared by all callers (a function-local
// static in the implementation file).
IBVWrapper& ibv();

/**
 * Contains the information that defines a destination to a remote device.
 * Basically we can compute our own destination and share it with remote hosts
 * over the side channel.
 */
struct Destination {
  // Port LID as reported by query_port; -1 marks "not computed yet" on the
  // local side (see Connection).
  int local_id;
  // Number of the queue pair (qp_num).
  int queue_pair_number;
  // Packet sequence number used as rq_psn on the receiver and sq_psn on the
  // sender when the queue pair changes state.
  int packet_sequence_number;
  // GID as reported by query_gid; a non-zero interface id enables global
  // routing in Connection::queue_pair_rtr.
  ibv_gid global_identifier;
};

/**
 * A buffer that can be registered to a number of protection domains.
 */
class SharedBuffer {
 public:
  SharedBuffer(size_t num_bytes);
  SharedBuffer(SharedBuffer&& b);
  ~SharedBuffer();

  SharedBuffer(const SharedBuffer&) = delete;
  SharedBuffer& operator=(const SharedBuffer&) = delete;

  void register_to_protection_domain(ibv_pd* protection_domain);

  size_t size() const {
    return num_bytes_;
  }

  uint32_t local_key(ibv_pd* protection_domain) const {
    return memory_regions_.at(protection_domain)->lkey;
  }

  ibv_sge to_scatter_gather_entry(ibv_pd* protection_domain) const {
    ibv_sge entry;
    entry.addr = reinterpret_cast<uintptr_t>(data_);
    entry.length = size();
    entry.lkey = local_key(protection_domain);
    return entry;
  }

  template <typename T>
  T* data() {
    return static_cast<T*>(data_);
  }

  template <typename T>
  T* begin() {
    return static_cast<T*>(data_);
  }

  template <typename T>
  T* end() {
    return static_cast<T*>(data_) + size() / sizeof(T);
  }

 private:
  void* data_;
  size_t num_bytes_;
  std::unordered_map<ibv_pd*, ibv_mr*> memory_regions_;
};

/**
 * Manipulates an RDMA connection. Enables (among other things)
 *
 *   - Creating a queue pair
 *   - Sending and receiving
 *   - Checking completion
 *
 * The connection owns its verbs objects: the destructor releases the queue
 * pair, completion queue, protection domain and context that are non-null.
 * It is move-only.
 */
struct Connection {
  ibv_context* ctx;
  ibv_pd* protection_domain;
  ibv_cq* completion_queue;
  ibv_qp* queue_pair;
  Destination src; // holds the local information

  Connection(ibv_context* ctx_);
  Connection(Connection&& c);

  Connection(const Connection&) = delete;
  Connection& operator=(const Connection&) = delete;

  ~Connection();
  void allocate_protection_domain();
  void create_completion_queue(int num_entries);
  void create_queue_pair();

  // Local destination information, computed lazily once a queue pair exists.
  const Destination& info();
  // Queue pair state transitions: INIT, then RTR towards `dst`, then RTS.
  void queue_pair_init();
  void queue_pair_rtr(const Destination& dst);
  void queue_pair_rts();

  // Posts a signaled send of the whole buffer, tagged with
  // `work_request_id`. The buffer must be registered with this connection's
  // protection domain. Throws std::invalid_argument if posting fails.
  void post_send(const SharedBuffer& buff, uint64_t work_request_id) {
    // Zero the request so that fields that are not set below (immediate data,
    // opcode specific members, ...) never carry indeterminate values.
    ibv_send_wr work_request = {};
    ibv_send_wr* bad_work_request = nullptr;

    auto entry = buff.to_scatter_gather_entry(protection_domain);
    work_request.wr_id = work_request_id;
    work_request.sg_list = &entry;
    work_request.num_sge = 1;
    work_request.opcode = IBV_WR_SEND;
    work_request.send_flags = IBV_SEND_SIGNALED;
    work_request.next = nullptr;

    if (int status =
            ibv_post_send(queue_pair, &work_request, &bad_work_request);
        status != 0) {
      std::ostringstream msg;
      msg << "[jaccl] Send failed with error code " << status;
      throw std::invalid_argument(msg.str());
    }
  }

  // Posts a receive into the whole buffer, tagged with `work_request_id`.
  // The buffer must be registered with this connection's protection domain.
  // Throws std::invalid_argument if posting fails.
  void post_recv(const SharedBuffer& buff, uint64_t work_request_id) {
    ibv_recv_wr work_request = {};
    ibv_recv_wr* bad_work_request = nullptr;

    auto entry = buff.to_scatter_gather_entry(protection_domain);
    work_request.wr_id = work_request_id;
    work_request.sg_list = &entry;
    work_request.num_sge = 1;
    work_request.next = nullptr;

    if (int status =
            ibv_post_recv(queue_pair, &work_request, &bad_work_request);
        status != 0) {
      std::ostringstream msg;
      msg << "[jaccl] Recv failed with error code " << status;
      throw std::invalid_argument(msg.str());
    }
  }

  // Polls the completion queue for up to `num_completions` completions,
  // written to `work_completions`. Returns the value of ibv_poll_cq.
  int poll(int num_completions, ibv_wc* work_completions) {
    return ibv_poll_cq(completion_queue, num_completions, work_completions);
  }
};

// Opens a Connection for each entry of `device_names`. An empty name yields a
// Connection with a null context; a name that cannot be opened raises
// std::runtime_error.
std::vector<Connection> create_connections(
    const std::vector<std::string>& device_names);

// Polls the completion queues of `connections` in order and gathers at most
// `num_completions` completions into `work_completions`.
//
// Connections with a null context are skipped. Returns the number of
// completions written. Throws std::runtime_error if polling a completion
// queue reports a failure (negative return), instead of folding the negative
// value into the count and the output offset.
inline int poll(
    std::span<const Connection> connections,
    int num_completions,
    ibv_wc* work_completions) {
  int completions = 0;
  for (auto& c : connections) {
    if (c.ctx == nullptr) {
      continue;
    }
    if (completions >= num_completions) {
      return completions;
    }

    int n = ibv_poll_cq(
        c.completion_queue,
        num_completions - completions,
        work_completions + completions);
    if (n < 0) {
      throw std::runtime_error("[jaccl] Polling the completion queue failed");
    }

    completions += n;
  }
  return completions;
}

// Polls two groups of connections one after the other, sharing a single
// budget of `num_completions` and a single output array. Completions of the
// second group are written right after those of the first. Returns the total
// number of completions.
inline int poll(
    std::span<const Connection> connections_1,
    std::span<const Connection> connections_2,
    int num_completions,
    ibv_wc* work_completions) {
  const int first = poll(connections_1, num_completions, work_completions);
  const int second = poll(
      connections_2, num_completions - first, work_completions + first);
  return first + second;
}

/**
 * Implement a TCP side channel to exchange information about the RDMA
 * connections.
 *
 * Implements a simple all gather where every node sends to rank 0 and rank 0
 * broadcasts to every node.
 *
 * On rank 0, sockets_[i - 1] is the socket to rank i; on every other rank
 * sockets_[0] is the socket to rank 0.
 */
class SideChannel {
 public:
  SideChannel(int rank, int size, const char* addr);
  SideChannel(SideChannel&& sc);

  SideChannel(const SideChannel&) = delete;
  SideChannel& operator=(const SideChannel&) = delete;

  /**
   * Gathers one value from every rank.
   *
   * Returns a vector of size_ elements where element i is the value passed by
   * rank i. T is either a container (has value_type and iterator), whose
   * elements are sent as raw bytes, or any other type, which is sent as
   * sizeof(T) raw bytes. Every rank must call this with the same T.
   */
  template <typename T>
  std::vector<T> all_gather(const T& v) {
    std::vector<T> result(size_);

    // T is a container of stuff like std::vector or std::string
    if constexpr (is_container<T>::value) {
      using U = typename T::value_type;

      // Share the lengths first and set the communication size to be the
      // maximum length of the containers.
      auto lengths = all_gather<int>(v.size());
      auto max_len = *std::max_element(lengths.begin(), lengths.end());
      for (auto& s : result) {
        s.resize(max_len);
      }

      // All gather of length max_len
      if (rank_ == 0) {
        std::copy(v.begin(), v.end(), result[rank_].begin());
        // Collect the padded container of every other rank ...
        for (int i = 1; i < size_; i++) {
          sockets_[i - 1].recv(IBV_TAG, result[i].data(), sizeof(U) * max_len);
        }
        // ... then send all size_ containers to each of them, one by one.
        for (int i = 1; i < size_; i++) {
          for (int j = 0; j < size_; j++) {
            sockets_[i - 1].send(
                IBV_TAG, result[j].data(), sizeof(U) * max_len);
          }
        }
      } else {
        std::copy(v.begin(), v.end(), result[rank_].begin());
        sockets_[0].send(IBV_TAG, result[rank_].data(), sizeof(U) * max_len);
        // Rank 0 sends back every container, including this rank's own.
        for (int i = 0; i < size_; i++) {
          sockets_[0].recv(IBV_TAG, result[i].data(), sizeof(U) * max_len);
        }
      }

      // Resize the outputs back to the original length
      for (int i = 0; i < size_; i++) {
        result[i].resize(lengths[i]);
      }
    }

    // T is a scalar
    else {
      if (rank_ == 0) {
        result[rank_] = v;
        for (int i = 1; i < size_; i++) {
          sockets_[i - 1].recv(IBV_TAG, &result[i], sizeof(T));
        }
        // The gathered values are contiguous, so they go out in one message.
        for (int i = 1; i < size_; i++) {
          sockets_[i - 1].send(IBV_TAG, result.data(), size_ * sizeof(T));
        }
      } else {
        sockets_[0].send(IBV_TAG, &v, sizeof(T));
        sockets_[0].recv(IBV_TAG, result.data(), size_ * sizeof(T));
      }
    }

    return result;
  }

 private:
  int rank_;
  int size_;
  std::vector<detail::TCPSocket> sockets_;
};

} // namespace mlx::core::distributed::jaccl


================================================
FILE: mlx/distributed/mpi/CMakeLists.txt
================================================
# The MPI backend is compiled only together with the CPU backend; otherwise
# no_mpi.cpp is compiled in its place.
if(NOT MLX_BUILD_CPU)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_mpi.cpp)
else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/mpi.cpp)
endif()


================================================
FILE: mlx/distributed/mpi/mpi.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <dlfcn.h>
#include <cstdlib>
#include <iostream>

#include "mlx/backend/cpu/encoder.h"
#include "mlx/distributed/distributed.h"
#include "mlx/distributed/distributed_impl.h"
#include "mlx/distributed/mpi/mpi.h"
#include "mlx/distributed/mpi/mpi_declarations.h"

// Resolves `symbol` from libmpi_handle_ into `variable`. If the lookup
// reports an error, libmpi_handle_ is reset to nullptr and the enclosing
// function returns, so it can only be used inside a void function (or
// constructor) that has libmpi_handle_ in scope.
//
// dlerror() is called once before the lookup to clear any stale error left by
// an earlier dl* call, as recommended for telling a failed dlsym() apart from
// a symbol whose value is null.
#define LOAD_SYMBOL(symbol, variable)                              \
  {                                                                \
    dlerror(); /* clear any stale error before the lookup */       \
    variable = (decltype(variable))dlsym(libmpi_handle_, #symbol); \
    char* error = dlerror();                                       \
    if (error != nullptr) {                                        \
      libmpi_handle_ = nullptr;                                    \
      return;                                                      \
    }                                                              \
  }

// Name of the MPI shared library to load: the value of the MLX_MPI_LIBNAME
// environment variable when it is set, otherwise the platform default.
static const char* get_libmpi_name() {
  if (const char* from_env = std::getenv("MLX_MPI_LIBNAME")) {
    return from_env;
  }
#if defined(__APPLE__)
  return "libmpi.dylib";
#else
  return "libmpi.so";
#endif
}

namespace mlx::core::distributed::mpi {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

namespace {

// Element-wise sum with the signature of an MPI user reduction function:
// accumulator[i] += input[i] for the first *len elements, both buffers being
// interpreted as arrays of T. `datatype` is not used.
template <typename T>
void simple_sum(
    void* input,
    void* accumulator,
    int* len,
    MPI_Datatype* datatype) {
  T* src = static_cast<T*>(input);
  T* dst = static_cast<T*>(accumulator);

  for (int i = 0, count = *len; i < count; ++i) {
    dst[i] += src[i];
  }
}
template void simple_sum<float16_t>(void*, void*, int*, MPI_Datatype*);
template void simple_sum<bfloat16_t>(void*, void*, int*, MPI_Datatype*);

// Element-wise maximum with the signature of an MPI user reduction function:
// accumulator[i] = std::max(accumulator[i], input[i]) for the first *len
// elements, both buffers being interpreted as arrays of T. `datatype` is not
// used.
template <typename T>
void simple_max(
    void* input,
    void* accumulator,
    int* len,
    MPI_Datatype* datatype) {
  T* src = static_cast<T*>(input);
  T* dst = static_cast<T*>(accumulator);

  for (int i = 0, count = *len; i < count; ++i) {
    dst[i] = std::max(dst[i], src[i]);
  }
}
template void simple_max<float16_t>(void*, void*, int*, MPI_Datatype*);
template void simple_max<bfloat16_t>(void*, void*, int*, MPI_Datatype*);
template void simple_max<complex64_t>(void*, void*, int*, MPI_Datatype*);

// Element-wise minimum with the signature of an MPI user reduction function:
// accumulator[i] = std::min(accumulator[i], input[i]) for the first *len
// elements, both buffers being interpreted as arrays of T. `datatype` is not
// used.
template <typename T>
void simple_min(
    void* input,
    void* accumulator,
    int* len,
    MPI_Datatype* datatype) {
  T* src = static_cast<T*>(input);
  T* dst = static_cast<T*>(accumulator);

  for (int i = 0, count = *len; i < count; ++i) {
    dst[i] = std::min(dst[i], src[i]);
  }
}
template void simple_min<float16_t>(void*, void*, int*, MPI_Datatype*);
template void simple_min<bfloat16_t>(void*, void*, int*, MPI_Datatype*);
template void simple_min<complex64_t>(void*, void*, int*, MPI_Datatype*);

// Runtime binding to an Open MPI shared library.
//
// The constructor opens the library named by get_libmpi_name() and resolves
// every function and object used by the backend. If the library cannot be
// opened, is not Open MPI, or lacks a symbol, libmpi_handle_ is null and
// is_available() returns false; none of the function pointers may be used in
// that case.
struct MPIWrapper {
  MPIWrapper() {
    initialized_ = false;

    libmpi_handle_ = dlopen(get_libmpi_name(), RTLD_NOW | RTLD_GLOBAL);
    if (libmpi_handle_ == nullptr) {
      return;
    }

    // Check library version and warn if it isn't Open MPI
    int (*get_version)(char*, int*);
    LOAD_SYMBOL(MPI_Get_library_version, get_version);
    char version_ptr[MPI_MAX_LIBRARY_VERSION_STRING];
    int version_length = 0;
    get_version(version_ptr, &version_length);
    std::string_view version(version_ptr, version_length);
    if (version.find("Open MPI") == std::string_view::npos) {
      std::cerr << "[mpi] MPI found but it does not appear to be Open MPI. "
                << "MLX requires Open MPI but this is " << version << std::endl;
      libmpi_handle_ = nullptr;
      return;
    }

    // API
    LOAD_SYMBOL(MPI_Init, init);
    LOAD_SYMBOL(MPI_Finalize, finalize);
    LOAD_SYMBOL(MPI_Comm_rank, rank);
    LOAD_SYMBOL(MPI_Comm_size, size);
    LOAD_SYMBOL(MPI_Comm_split, comm_split);
    LOAD_SYMBOL(MPI_Comm_free, comm_free);
    LOAD_SYMBOL(MPI_Allreduce, all_reduce);
    LOAD_SYMBOL(MPI_Allgather, all_gather);
    LOAD_SYMBOL(MPI_Send, send);
    LOAD_SYMBOL(MPI_Recv, recv);
    LOAD_SYMBOL(MPI_Type_contiguous, mpi_type_contiguous);
    LOAD_SYMBOL(MPI_Type_commit, mpi_type_commit);
    LOAD_SYMBOL(MPI_Op_create, mpi_op_create);

    // Objects
    LOAD_SYMBOL(ompi_mpi_comm_world, comm_world_);

    // Ops
    LOAD_SYMBOL(ompi_mpi_op_sum, op_sum_);
    LOAD_SYMBOL(ompi_mpi_op_max, op_max_);
    LOAD_SYMBOL(ompi_mpi_op_min, op_min_);

    // Datatypes
    LOAD_SYMBOL(ompi_mpi_c_bool, mpi_bool_);
    LOAD_SYMBOL(ompi_mpi_int8_t, mpi_int8_);
    LOAD_SYMBOL(ompi_mpi_uint8_t, mpi_uint8_);
    LOAD_SYMBOL(ompi_mpi_int16_t, mpi_int16_);
    LOAD_SYMBOL(ompi_mpi_uint16_t, mpi_uint16_);
    LOAD_SYMBOL(ompi_mpi_int32_t, mpi_int32_);
    LOAD_SYMBOL(ompi_mpi_uint32_t, mpi_uint32_);
    LOAD_SYMBOL(ompi_mpi_int64_t, mpi_int64_);
    LOAD_SYMBOL(ompi_mpi_uint64_t, mpi_uint64_);
    LOAD_SYMBOL(ompi_mpi_float, mpi_float_);
    LOAD_SYMBOL(ompi_mpi_double, mpi_double_);
    LOAD_SYMBOL(ompi_mpi_c_complex, mpi_complex_);
  }

  // True when the library was loaded and every symbol was resolved.
  bool is_available() {
    return libmpi_handle_ != nullptr;
  }

  // Calls MPI_Init and, the first time it succeeds, creates the custom
  // 16-bit float datatypes and the custom reduction ops. Returns whether
  // MPI_Init succeeded (false when the library is unavailable).
  bool init_safe() {
    if (!is_available()) {
      return false;
    }
    bool success = init(nullptr, nullptr) == MPI_SUCCESS;

    // Initialize custom types and ops
    if (success && !initialized_) {
      // Custom float16 dtypes
      // Both are described to MPI as two contiguous bytes.
      mpi_type_contiguous(2, mpi_uint8_, &mpi_float16_);
      mpi_type_commit(&mpi_float16_);
      mpi_type_contiguous(2, mpi_uint8_, &mpi_bfloat16_);
      mpi_type_commit(&mpi_bfloat16_);

      // Custom reduction ops
      // The second argument (1) declares the ops as commutative.
      mpi_op_create(&simple_sum<float16_t>, 1, &op_sum_f16_);
      mpi_op_create(&simple_sum<bfloat16_t>, 1, &op_sum_bf16_);
      mpi_op_create(&simple_max<float16_t>, 1, &op_max_f16_);
      mpi_op_create(&simple_max<bfloat16_t>, 1, &op_max_bf16_);
      mpi_op_create(&simple_max<complex64_t>, 1, &op_max_c64_);
      mpi_op_create(&simple_min<float16_t>, 1, &op_min_f16_);
      mpi_op_create(&simple_min<bfloat16_t>, 1, &op_min_bf16_);
      mpi_op_create(&simple_min<complex64_t>, 1, &op_min_c64_);

      initialized_ = true;
    }

    return success;
  }

  // Calls MPI_Finalize when the library is available.
  void finalize_safe() {
    if (is_available()) {
      finalize();
    }
  }

  // The world communicator.
  MPI_Comm world() {
    return comm_world_;
  }

  // MPI datatype matching the dtype of `arr`.
  // Throws std::runtime_error for a dtype that has no mapping.
  MPI_Datatype datatype(const array& arr) {
    switch (arr.dtype()) {
      case bool_:
        return mpi_bool_;
      case int8:
        return mpi_int8_;
      case uint8:
        return mpi_uint8_;
      case int16:
        return mpi_int16_;
      case uint16:
        return mpi_uint16_;
      case int32:
        return mpi_int32_;
      case uint32:
        return mpi_uint32_;
      case int64:
        return mpi_int64_;
      case uint64:
        return mpi_uint64_;
      case float32:
        return mpi_float_;
      case complex64:
        return mpi_complex_;
      case float16:
        return mpi_float16_;
      case bfloat16:
        return mpi_bfloat16_;
      case float64:
        return mpi_double_;
      default:
        throw std::runtime_error("Invalid type");
    }
  }

  // Sum reduction op for the dtype of `arr`: the custom op for the 16-bit
  // float types, the built-in op otherwise.
  MPI_Op op_sum(const array& arr) {
    switch (arr.dtype()) {
      case float16:
        return op_sum_f16_;
      case bfloat16:
        return op_sum_bf16_;
      default:
        return op_sum_;
    }
  }

  // Max reduction op for the dtype of `arr`: the custom op for the 16-bit
  // float types and complex64, the built-in op otherwise.
  MPI_Op op_max(const array& arr) {
    switch (arr.dtype()) {
      case float16:
        return op_max_f16_;
      case bfloat16:
        return op_max_bf16_;
      case complex64:
        return op_max_c64_;
      default:
        return op_max_;
    }
  }

  // Min reduction op for the dtype of `arr`: the custom op for the 16-bit
  // float types and complex64, the built-in op otherwise.
  MPI_Op op_min(const array& arr) {
    switch (arr.dtype()) {
      case float16:
        return op_min_f16_;
      case bfloat16:
        return op_min_bf16_;
      case complex64:
        return op_min_c64_;
      default:
        return op_min_;
    }
  }

  // Handle returned by dlopen; null when MPI is unavailable.
  void* libmpi_handle_;

  // API
  int (*init)(int*, char***);
  int (*finalize)();
  int (*rank)(MPI_Comm, int*);
  int (*size)(MPI_Comm, int*);
  int (*all_reduce)(const void*, void*, int, MPI_Datatype, MPI_Op, MPI_Comm);
  int (*all_gather)(
      const void*,
      int,
      MPI_Datatype,
      void*,
      int,
      MPI_Datatype,
      MPI_Comm);
  int (*comm_split)(MPI_Comm, int, int, MPI_Comm*);
  int (*comm_free)(MPI_Comm*);
  int (*send)(const void*, int, MPI_Datatype, int, int, MPI_Comm);
  int (*recv)(void*, int, MPI_Datatype, int, int, MPI_Comm, MPI_Status*);

  // Objects
  MPI_Comm comm_world_;

  // Ops
  MPI_Op op_sum_;
  MPI_Op op_sum_f16_;
  MPI_Op op_sum_bf16_;
  MPI_Op op_max_;
  MPI_Op op_max_f16_;
  MPI_Op op_max_bf16_;
  MPI_Op op_max_c64_;
  MPI_Op op_min_;
  MPI_Op op_min_f16_;
  MPI_Op op_min_bf16_;
  MPI_Op op_min_c64_;

  // Datatypes
  MPI_Datatype mpi_bool_;
  MPI_Datatype mpi_int8_;
  MPI_Datatype mpi_uint8_;
  MPI_Datatype mpi_int16_;
  MPI_Datatype mpi_uint16_;
  MPI_Datatype mpi_int32_;
  MPI_Datatype mpi_uint32_;
  MPI_Datatype mpi_int64_;
  MPI_Datatype mpi_uint64_;
  MPI_Datatype mpi_float_;
  MPI_Datatype mpi_double_;
  MPI_Datatype mpi_complex_;
  MPI_Datatype mpi_float16_;
  MPI_Datatype mpi_bfloat16_;

 private:
  // Whether the custom datatypes and ops have been created.
  bool initialized_;

  // Private API
  int (*mpi_type_contiguous)(int, MPI_Datatype, MPI_Datatype*);
  int (*mpi_type_commit)(MPI_Datatype*);
  int (*mpi_op_create)(MPI_User_function*, int, MPI_Op*);
};

// Accessor for the MPIWrapper instance. The wrapper is a function-local
// static, so the library is loaded on the first call only.
MPIWrapper& mpi() {
  static MPIWrapper instance;
  return instance;
}

} // namespace

class MPIGroup : public GroupImpl {
 public:
  MPIGroup(MPI_Comm comm, bool global)
      : comm_(comm), global_(global), rank_(-1), size_(-1) {}

  virtual ~MPIGroup() {
    if (global_) {
      mpi().finalize_safe();
    } else {
      mpi().comm_free(&comm_);
    }
  }

  Stream communication_stream(StreamOrDevice s) override {
    return to_stream(s, Device::cpu);
  }

  int rank() override {
    if (rank_ < 0) {
      mpi().rank(comm_, &rank_);
    }
    return rank_;
  }

  int size() override {
    if (size_ < 0) {
      mpi().size(comm_, &size_);
    }
    return size_;
  }

  std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
    key = (key < 0) ? rank() : key;

    MPI_Comm new_comm;
    int result = mpi().comm_split(comm_, color, key, &new_comm);
    if (result != MPI_SUCCESS) {
      throw std::runtime_error("MPI could not split this group");
    }

    return std::make_shared<MPIGroup>(new_comm, false);
  }

  void all_sum(const array& input, array& output, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_input_array(input);
    encoder.set_output_array(output);
    encoder.dispatch(
        mpi().all_reduce,
        (input.data<void>() == output.data<void>()) ? MPI_IN_PLACE
                                                    : input.data<void>(),
        output.data<void>(),
        input.size(),
        mpi().datatype(input),
        mpi().op_sum(input),
        comm_);
  }

  void all_max(const array& input, array& output, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_input_array(input);
    encoder.set_output_array(output);
    encoder.dispatch(
        mpi().all_reduce,
        (input.data<void>() == output.data<void>()) ? MPI_IN_PLACE
                                                    : input.data<void>(),
        output.data<void>(),
        input.size(),
        mpi().datatype(input),
        mpi().op_max(input),
        comm_);
  }

  void all_min(const array& input, array& output, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_input_array(input);
    encoder.set_output_array(output);
    encoder.dispatch(
        mpi().all_reduce,
        (input.data<void>() == output.data<void>()) ? MPI_IN_PLACE
                                                    : input.data<void>(),
        output.data<void>(),
        input.size(),
        mpi().datatype(input),
        mpi().op_min(input),
        comm_);
  }

  void all_gather(const array& input, array& output, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_input_array(input);
    encoder.set_output_array(output);
    encoder.dispatch(
        mpi().all_gather,
        input.data<void>(),
        input.size(),
        mpi().datatype(input),
        output.data<void>(),
        input.size(),
        mpi().datatype(output),
        comm_);
  }

  void send(const array& input, int dst, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_input_array(input);
    encoder.dispatch(
        mpi().send,
        input.data<void>(),
        input.size(),
        mpi().datatype(input),
        dst,
        0,
        comm_);
  }

  void recv(array& out, int src, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_output_array(out);
    encoder.dispatch([out_ptr = out.data<void>(),
                      out_size = out.size(),
                      out_type = mpi().datatype(out),
                      src,
                      comm = comm_]() {
      MPI_Status status;
      mpi().recv(out_ptr, out_size, out_type, src, MPI_ANY_TAG, comm, &status);
    });
  }

  void sum_scatter(const array& input, array& output, Stream stream) override {
    throw std::runtime_error("[mpi] sum_scatter not yet implemented.");
  }

 private:
  MPI_Comm comm_;
  bool global_;
  int rank_;
  int size_;
};

// Whether the MPI library could be loaded with every required symbol.
bool is_available() {
  auto& wrapper = mpi();
  return wrapper.is_available();
}

// Initializes MPI and returns the group of the world communicator. When MPI
// cannot be initialized, returns nullptr, or throws std::runtime_error if
// `strict` is set.
std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  if (mpi().init_safe()) {
    return std::make_shared<MPIGroup>(mpi().world(), true);
  }

  if (strict) {
    throw std::runtime_error("Cannot initialize MPI");
  }
  return nullptr;
}

} // namespace mlx::core::distributed::mpi


================================================
FILE: mlx/distributed/mpi/mpi.h
================================================
// Copyright © 2024 Apple Inc.

// Guard against multiple inclusion: init() below declares a default argument,
// which may not be repeated in a second declaration.
#pragma once

#include "mlx/distributed/distributed.h"

namespace mlx::core::distributed::mpi {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

// Whether the MPI backend can be used in this process.
bool is_available();

// Initializes the MPI backend and returns its world group. Returns nullptr
// when MPI cannot be initialized, or throws std::runtime_error if `strict` is
// set.
std::shared_ptr<GroupImpl> init(bool strict = false);

} // namespace mlx::core::distributed::mpi


================================================
FILE: mlx/distributed/mpi/mpi_declarations.h
================================================
// Copyright © 2024 Apple Inc.

// Guard against multiple inclusion: the status struct below may only be
// defined once per translation unit.
#pragma once

#include <cstddef> // size_t

// Constants

#define MPI_SUCCESS 0
#define MPI_ANY_SOURCE -1
#define MPI_ANY_TAG -1
#define MPI_IN_PLACE ((void*)1)
#define MPI_MAX_LIBRARY_VERSION_STRING 256

// Define all the types that we use so that we don't include <mpi.h> which
// causes linker errors on some platforms.
//
// NOTE: We define everything for openmpi.

// Handles are treated as opaque pointers.
typedef void* MPI_Comm;
typedef void* MPI_Datatype;
typedef void* MPI_Op;

// Signature of a user-defined reduction: (input, accumulator, length, type).
typedef void(MPI_User_function)(void*, void*, int*, MPI_Datatype*);

typedef struct ompi_status_public_t {
  int MPI_SOURCE;
  int MPI_TAG;
  int MPI_ERROR;
  int _cancelled;
  size_t _ucount;
} MPI_Status;


================================================
FILE: mlx/distributed/mpi/no_mpi.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/distributed/mpi/mpi.h"

namespace mlx::core::distributed::mpi {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

// MPI support is not compiled into this build, so it is never available.
bool is_available() {
  constexpr bool mpi_compiled_in = false;
  return mpi_compiled_in;
}

// MPI support is not compiled into this build: returns nullptr, or throws
// std::runtime_error if `strict` is set.
std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  if (!strict) {
    return nullptr;
  }
  throw std::runtime_error("Cannot initialize MPI");
}

} // namespace mlx::core::distributed::mpi


================================================
FILE: mlx/distributed/nccl/CMakeLists.txt
================================================
# NCCL is only looked for in CUDA builds on non-Windows platforms. Whenever it
# is not looked for, or not found, no_nccl.cpp is compiled instead of nccl.cpp.
if(NOT MLX_BUILD_CUDA OR WIN32)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_nccl.cpp)
else()
  find_package(NCCL)
  if(NOT NCCL_FOUND)
    target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_nccl.cpp)
  else()
    target_link_libraries(mlx PRIVATE ${NCCL_LIBRARIES})
    target_include_directories(mlx PRIVATE ${NCCL_INCLUDE_DIRS})
    target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/nccl.cpp)
  endif()
endif()


================================================
FILE: mlx/distributed/nccl/nccl.cpp
================================================
// NCCL distributed support currently requires Unix socket APIs
// TODO: Add Windows Winsock2 support for Windows builds
#ifndef _WIN32
#include <arpa/inet.h>
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
#endif

#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <string>
#include <type_traits>

#include "mlx/backend/cuda/device.h"
#include "mlx/distributed/distributed.h"
#include "mlx/distributed/distributed_impl.h"
#include "mlx/dtype_utils.h"
#include "mlx/utils.h"

namespace mlx::core::distributed::nccl {

// Default bootstrap connection timeout, handed to env::nccl_timeout as the
// fallback value. Can be tuned with MLX_NCCL_TIMEOUT.
constexpr int nccl_timeout = 300000; // milliseconds

// Runs a CUDA runtime call once. Any result other than cudaSuccess is
// reported on stderr (file, line and CUDA error string) and the process is
// terminated with exit code 1.
#define CHECK_CUDA(cmd)                    \
  do {                                     \
    const cudaError_t cuda_status = cmd;   \
    if (cuda_status == cudaSuccess) {      \
      break;                               \
    }                                      \
    fprintf(                               \
        stderr,                            \
        "CUDA error %s:%d '%s'\n",         \
        __FILE__,                          \
        __LINE__,                          \
        cudaGetErrorString(cuda_status));  \
    exit(1);                               \
  } while (0)

// Runs an NCCL call once. Any result other than ncclSuccess is reported on
// stderr (file, line and NCCL error string) and the process is terminated
// with exit code 1.
#define CHECK_NCCL(cmd)                    \
  do {                                     \
    const ncclResult_t nccl_status = cmd;  \
    if (nccl_status == ncclSuccess) {      \
      break;                               \
    }                                      \
    fprintf(                               \
        stderr,                            \
        "NCCL error %s:%d '%s'\n",         \
        __FILE__,                          \
        __LINE__,                          \
        ncclGetErrorString(nccl_status));  \
    exit(1);                               \
  } while (0)

// X-macro listing every (C++ element type, ncclDataType_t enumerator) pair
// the backend supports. `X` is invoked once per pair; types that are not
// listed here (for example the 16-bit integers) get no mapping.
#define MLX_NCCL_TYPE_LIST(X) \
  X(int8_t, ncclChar)         \
  X(uint8_t, ncclUint8)       \
  X(int32_t, ncclInt)         \
  X(uint32_t, ncclUint32)     \
  X(int64_t, ncclInt64)       \
  X(uint64_t, ncclUint64)     \
  X(float16_t, ncclHalf)      \
  X(bfloat16_t, ncclBfloat16) \
  X(float, ncclFloat)         \
  X(double, ncclDouble)

// Compile-time lookup from an element type to its NCCL data type. The primary
// template marks a type as unsupported (`ok == false`, no `value` member).
template <typename T>
struct nccl_map {
  static constexpr bool ok = false;
};

// Emits the specialization for one supported (type, enumerator) pair.
#define MLX_DEF_NCCL_MAP(CppType, NcclEnum)           \
  template <>                                         \
  struct nccl_map<CppType> {                          \
    static constexpr ncclDataType_t value = NcclEnum; \
    static constexpr bool ok = true;                  \
  };

MLX_NCCL_TYPE_LIST(MLX_DEF_NCCL_MAP)
#undef MLX_DEF_NCCL_MAP

namespace detail {

template <typename F>
void dispatch_dtype(const array& arr, F&& f) {
  dispatch_all_types(arr.dtype(), [&](auto type_tag) {
    using T = MLX_GET_TYPE(type_tag);
    if constexpr (nccl_map<T>::ok) {
      f(type_tag, nccl_map<T>::value);
    } else {
      throw std::invalid_argument("[nccl] Unknown or unsupported dtype");
    }
  });
}

#ifndef _WIN32
// Writes exactly `len` bytes from `buf` to the connected socket `sock`,
// looping over partial sends. A send() interrupted by a signal (EINTR) is
// retried instead of being treated as fatal; any other failure is reported
// with perror and terminates the process with exit code 1.
inline void sendAll(int sock, const void* buf, size_t len) {
  const char* cursor = static_cast<const char*>(buf);
  size_t remaining = len;
  while (remaining > 0) {
    const ssize_t sent = send(sock, cursor, remaining, 0);
    if (sent < 0 && errno == EINTR) {
      // Nothing was transferred; simply try again.
      continue;
    }
    if (sent <= 0) {
      perror("send");
      exit(1);
    }
    cursor += sent;
    remaining -= static_cast<size_t>(sent);
  }
}

// Reads exactly `len` bytes from the connected socket `sock` into `buf`,
// looping over partial reads. A recv() interrupted by a signal (EINTR) is
// retried. If the peer closes the connection before `len` bytes arrive, or
// recv() fails for another reason, the problem is reported on stderr and the
// process terminates with exit code 1.
inline void recvAll(int sock, void* buf, size_t len) {
  char* cursor = static_cast<char*>(buf);
  size_t remaining = len;
  while (remaining > 0) {
    const ssize_t got = recv(sock, cursor, remaining, 0);
    if (got < 0 && errno == EINTR) {
      // Nothing was transferred; simply try again.
      continue;
    }
    if (got == 0) {
      // Orderly shutdown by the peer: errno is not set, so perror would
      // print an unrelated message.
      fprintf(stderr, "recv: connection closed by peer\n");
      exit(1);
    }
    if (got < 0) {
      perror("recv");
      exit(1);
    }
    cursor += got;
    remaining -= static_cast<size_t>(got);
  }
}
#endif // _WIN32

#ifndef _WIN32
// Distributes the NCCL unique id from rank 0 to every other rank over TCP.
//
// `initMethod` must have the form "tcp://<host>:<port>".
//  - Rank 0 creates the id with ncclGetUniqueId, listens on <port> on all
//    interfaces and sends the id to each of the `size - 1` connections it
//    accepts.
//  - Every other rank resolves <host>, retries connect() every 500 ms while
//    the connection is refused (until env::nccl_timeout milliseconds have
//    elapsed) and then reads the id into `id`.
//
// Throws std::invalid_argument for a malformed `initMethod` and
// std::runtime_error when a socket call or the host lookup fails. All sockets
// opened here are closed on every exit path. sendAll/recvAll terminate the
// process on transfer errors.
inline void bootstrap_unique_id(
    ncclUniqueId& id,
    int rank,
    int size,
    const std::string& initMethod) {
  // Minimal owner for a file descriptor so that error paths cannot leak it.
  struct FdCloser {
    int fd = -1;
    ~FdCloser() {
      reset();
    }
    void reset(int new_fd = -1) {
      if (fd >= 0) {
        ::close(fd);
      }
      fd = new_fd;
    }
  };

  // Parse the init method to extract the host and port
  const std::string scheme = "tcp://";
  if (initMethod.rfind(scheme, 0) != 0) {
    throw std::invalid_argument(
        "[nccl] Init method must start with 'tcp://', got: " + initMethod);
  }
  const std::string hostport = initMethod.substr(scheme.size());
  const auto colon = hostport.find(':');
  if (colon == std::string::npos) {
    throw std::invalid_argument(
        "[nccl] Init method must have the form tcp://<host>:<port>, got: " +
        initMethod);
  }
  const std::string host = hostport.substr(0, colon);
  const int port = std::stoi(hostport.substr(colon + 1));
  if (port < 0 || port > 65535) {
    throw std::invalid_argument(
        "[nccl] Port out of range in init method: " + initMethod);
  }

  if (rank == 0) {
    // create a unique id on the rank 0
    CHECK_NCCL(ncclGetUniqueId(&id));

    // create a socket to send the unique id to all other ranks
    FdCloser listener;
    listener.reset(socket(AF_INET, SOCK_STREAM, 0));
    if (listener.fd < 0) {
      std::ostringstream msg;
      msg << "[nccl] Couldn't create socket (error: " << errno << ")";
      throw std::runtime_error(msg.str());
    }

    sockaddr_in serv = {};
    serv.sin_family = AF_INET;
    serv.sin_addr.s_addr = htonl(INADDR_ANY);
    serv.sin_port = htons(port);

    // Without SO_REUSEADDR, if rank 0 crashes or restarts quickly the OS
    // might refuse to bind to the same port again.
    int reuse = 1;
    if (setsockopt(
            listener.fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)) <
        0) {
      std::ostringstream msg;
      msg << "[nccl] setsockopt() failed: " << strerror(errno);
      throw std::runtime_error(msg.str());
    }

    if (bind(listener.fd, reinterpret_cast<sockaddr*>(&serv), sizeof(serv)) <
        0) {
      std::ostringstream msg;
      msg << "[nccl] bind() failed: " << strerror(errno);
      throw std::runtime_error(msg.str());
    }
    if (listen(listener.fd, size - 1) < 0) {
      std::ostringstream msg;
      msg << "[nccl] listen() failed: " << strerror(errno);
      throw std::runtime_error(msg.str());
    }

    for (int peer = 1; peer < size; ++peer) {
      FdCloser conn;
      conn.reset(accept(listener.fd, nullptr, nullptr));
      if (conn.fd < 0) {
        std::ostringstream msg;
        msg << "[nccl] accept() failed: " << strerror(errno);
        throw std::runtime_error(msg.str());
      }
      sendAll(conn.fd, &id, sizeof(id));
    }
    return;
  }

  // Non-root ranks: make sure that rank 0 has enough time to bind, so retry
  // connecting until the elapsed time exceeds the timeout. This is
  // particularly important for multi-node setups.
  hostent* he = gethostbyname(host.c_str());
  if (!he) {
    throw std::runtime_error("[nccl] lookup failed for host: " + host);
  }
  sockaddr_in serv = {};
  serv.sin_family = AF_INET;
  // Only copy the address when it really is an IPv4 address of the expected
  // size; otherwise the memcpy could overrun sin_addr.
  if (he->h_addrtype != AF_INET ||
      he->h_length != static_cast<int>(sizeof(serv.sin_addr)) ||
      he->h_addr_list[0] == nullptr) {
    throw std::runtime_error("[nccl] no IPv4 address found for host: " + host);
  }
  memcpy(&serv.sin_addr, he->h_addr_list[0], sizeof(serv.sin_addr));
  serv.sin_port = htons(port);

  const int timeout_ms = env::nccl_timeout(nccl_timeout);
  bool connected = false;

  const char* dbg = std::getenv("NCCL_DEBUG");
  const bool do_log = (dbg && std::string(dbg) == "INFO");

  const auto start = std::chrono::steady_clock::now();
  int attempt = 0;
  // errno of the last failed connect(); reported if we give up.
  int last_errno = ETIMEDOUT;
  FdCloser sock;

  while (true) {
    const auto elapsed_ms =
        std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - start)
            .count();
    if (elapsed_ms > timeout_ms) {
      break;
    }
    if (sock.fd < 0) {
      sock.reset(socket(AF_INET, SOCK_STREAM, 0));
      if (sock.fd < 0) {
        std::ostringstream msg;
        msg << "[nccl] socket() failed: " << strerror(errno);
        throw std::runtime_error(msg.str());
      }
    }
    if (connect(sock.fd, reinterpret_cast<sockaddr*>(&serv), sizeof(serv)) ==
        0) {
      connected = true;
      if (do_log) {
        std::cout << "[Rank " << rank << "] Connected successfully after "
                  << elapsed_ms << " miliseconds" << std::endl;
      }
      // Stop retrying as soon as the connection is up, whether or not
      // logging is enabled.
      break;
    }
    last_errno = errno;
    if (last_errno != ECONNREFUSED) {
      break;
    }
    // The state of a socket is unspecified after a failed connect(), so use
    // a fresh one for the next attempt.
    sock.reset();
    ++attempt;
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
  }

  if (!connected) {
    std::ostringstream msg;
    msg << "[Rank " << rank << "] connect() failed after " << timeout_ms
        << " milliseconds and " << attempt
        << " retries: " << strerror(last_errno);
    throw std::runtime_error(msg.str());
  }
  recvAll(sock.fd, &id, sizeof(id));
}
#else // _WIN32
inline void bootstrap_unique_id(
    ncclUniqueId& id,
    int rank,
    int size,
    const std::string& initMethod) {
  throw std::runtime_error(
      "[nccl] Distributed NCCL is not yet supported on Windows");
}
#endif // _WIN32

} // namespace detail

// helper struct to manage communicator
struct NCCLComm {
  ncclComm_t comm;
  int rank_;
  int size_;

  NCCLComm(ncclComm_t c, int rank, int size)
      : comm(c), rank_(rank), size_(size) {}

  static std::shared_ptr<NCCLComm>
  create(int numRanks, int rank, ncclUniqueId commId) {
    ncclComm_t raw;
    CHECK_NCCL(ncclCommInitRank(&raw, numRanks, commId, rank));
    return std::make_shared<NCCLComm>(raw, rank, numRanks);
  }

  static std::shared_ptr<NCCLComm> split(NCCLComm* source, int color, int key) {
    ncclComm_t raw;
    // default config, blocking comm creation
    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
    CHECK_NCCL(ncclCommSplit(source->comm, color, key, &raw, &config));
    int new_rank, new_size;
    CHECK_NCCL(ncclCommUserRank(raw, &new_rank));
    CHECK_NCCL(ncclCommCount(raw, &new_size));
    return std::make_shared<NCCLComm>(raw, new_rank, new_size);
  }

  NCCLComm(const NCCLComm&) = delete;
  NCCLComm& operator=(const NCCLComm&) = delete;
};

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

// GroupImpl backed by an NCCL communicator.
//
// Collectives are enqueued on the CUDA stream of the command encoder that
// belongs to the MLX stream passed in. Point-to-point send/recv are not
// supported and throw std::runtime_error.
class NCCLGroup : public GroupImpl {
 public:
  // Creates the world group: selects the CUDA device `worldRank % ndev`,
  // exchanges the NCCL unique id via `initMethod` ("tcp://<host>:<port>") and
  // initializes the communicator.
  NCCLGroup(int worldRank, int worldSize, const std::string& initMethod)
      : rank_(worldRank), size_(worldSize), initMethod_(initMethod) {
    int ndev;
    CHECK_CUDA(cudaGetDeviceCount(&ndev));
    CHECK_CUDA(cudaSetDevice(rank_ % ndev));
    detail::bootstrap_unique_id(uniqueId_, rank_, size_, initMethod_);
    comm_ = NCCLComm::create(size_, rank_, uniqueId_);
    initialized_ = true;
  }

  // Used by split() to wrap an already-created communicator
  NCCLGroup(std::shared_ptr<NCCLComm> comm, int rank, int size)
      : rank_(rank), size_(size), comm_(std::move(comm)), initialized_(true) {}

  // Communication for this backend always runs on a GPU stream.
  Stream communication_stream(StreamOrDevice s) override {
    return to_stream(s, Device::gpu);
  }

  int rank() override {
    return rank_;
  }

  int size() override {
    return size_;
  }

  void all_sum(const array& input, array& output, Stream stream) override {
    detail::dispatch_dtype(input, [&](auto type_tag, ncclDataType_t dt) {
      using T = typename decltype(type_tag)::type;
      all_reduce_impl<T>(input, output, stream, dt, ncclSum);
    });
  }

  // Splits the communicator by `color`; a negative `key` defaults to this
  // rank so the relative ordering of ranks is kept.
  std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
    key = (key < 0) ? rank() : key;
    auto new_comm = NCCLComm::split(comm_.get(), color, key);
    return std::make_shared<NCCLGroup>(
        new_comm, new_comm->rank_, new_comm->size_);
  }

  void all_gather(const array& input, array& output, Stream stream) override {
    detail::dispatch_dtype(input, [&](auto type_tag, ncclDataType_t dt) {
      using T = typename decltype(type_tag)::type;
      auto& encoder = cu::get_command_encoder(stream);
      CHECK_NCCL(ncclAllGather(
          gpu_ptr<T>(input),
          gpu_ptr<T>(output),
          input.size(),
          dt,
          comm_->comm,
          encoder.stream()));
    });
  }

  void send(const array& input, int dst, Stream stream) override {
    throw std::runtime_error("[nccl] Send not supported in NCCL backend.");
  }

  void recv(array& output, int src, Stream stream) override {
    throw std::runtime_error("[nccl] Recv not supported in NCCL backend.");
  }

  void all_max(const array& input, array& output, Stream stream) override {
    detail::dispatch_dtype(input, [&](auto type_tag, ncclDataType_t dt) {
      using T = typename decltype(type_tag)::type;
      all_reduce_impl<T>(input, output, stream, dt, ncclMax);
    });
  }

  void all_min(const array& input, array& output, Stream stream) override {
    detail::dispatch_dtype(input, [&](auto type_tag, ncclDataType_t dt) {
      using T = typename decltype(type_tag)::type;
      all_reduce_impl<T>(input, output, stream, dt, ncclMin);
    });
  }

  void sum_scatter(const array& input, array& output, Stream stream) override {
    detail::dispatch_dtype(input, [&](auto type_tag, ncclDataType_t dt) {
      using T = typename decltype(type_tag)::type;
      reduce_scatter_impl<T>(input, output, stream, dt, ncclSum);
    });
  }

  // Enqueues ncclAllReduce with reduction `op` over `input.size()` elements.
  template <typename T>
  void all_reduce_impl(
      const array& input,
      array& output,
      Stream stream,
      ncclDataType_t dt,
      ncclRedOp_t op) {
    auto& encoder = cu::get_command_encoder(stream);

    CHECK_NCCL(ncclAllReduce(
        gpu_ptr<T>(input),
        gpu_ptr<T>(output),
        input.size(),
        dt,
        op,
        comm_->comm,
        encoder.stream()));
  }

  // Enqueues ncclReduceScatter with reduction `op`; the element count passed
  // to NCCL is the per-rank output size.
  template <typename T>
  void reduce_scatter_impl(
      const array& input,
      array& output,
      Stream stream,
      ncclDataType_t dt,
      ncclRedOp_t op) {
    auto& encoder = cu::get_command_encoder(stream);

    CHECK_NCCL(ncclReduceScatter(
        gpu_ptr<T>(input),
        gpu_ptr<T>(output),
        output.size(),
        dt,
        op,
        comm_->comm,
        encoder.stream()));
  }

  int rank_;
  int size_;
  std::string initMethod_;
  // Value-initialized so groups created through split(), which never run the
  // bootstrap, do not carry indeterminate bytes.
  ncclUniqueId uniqueId_{};
  std::shared_ptr<NCCLComm> comm_;
  bool initialized_ = false;
};

// Always true in this translation unit, which provides the real NCCL
// implementation of the backend entry points.
bool is_available() {
  return true;
}

namespace detail {
// Looks up `env_var_name` in the process environment.
// Returns its value when set. When unset, returns an empty string if `strict`
// is false and throws std::runtime_error if `strict` is true.
std::string get_env_var_or_throw(const char* env_var_name, bool strict) {
  if (const char* raw = std::getenv(env_var_name); raw != nullptr) {
    return std::string(raw);
  }
  if (!strict) {
    return "";
  }
  std::ostringstream msg;
  msg << "[nccl] Required environment variable '" << env_var_name
      << "' is not set. "
      << "Please set it before initializing the distributed backend.";
  throw std::runtime_error(msg.str());
}
} // namespace detail

std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  std::string host = detail::get_env_var_or_throw("NCCL_HOST_IP", strict);
  std::string port = detail::get_env_var_or_throw("NCCL_PORT", strict);
  std::string rank_str = detail::get_env_var_or_throw("MLX_RANK", strict);
  std::string n_nodes_str =
      detail::get_env_var_or_throw("MLX_WORLD_SIZE", strict);
  if (!strict &&
      (host.empty() || port.empty() || rank_str.empty() ||
       n_nodes_str.empty())) {
    return nullptr;
  }

  int rank = std::stoi(rank_str);
  int n_nodes = std::stoi(n_nodes_str);
  std::string init_method = "tcp://" + host + ":" + port;

  return std::make_shared<NCCLGroup>(rank, n_nodes, init_method);
}
} // namespace mlx::core::distributed::nccl


================================================
FILE: mlx/distributed/nccl/nccl.h
================================================
// Copyright © 2024 Apple Inc.

// Include guard: this header declares a default argument, so including it
// twice in one translation unit would otherwise be an error.
#pragma once

#include <memory>

#include "mlx/distributed/distributed.h"

namespace mlx::core::distributed::nccl {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

// Whether the NCCL backend was compiled into this build.
bool is_available();

// Initializes the NCCL backend. When initialization is not possible the
// result is nullptr, or an exception if `strict` is true.
std::shared_ptr<GroupImpl> init(bool strict = false);

} // namespace mlx::core::distributed::nccl


================================================
FILE: mlx/distributed/nccl/no_nccl.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/distributed/nccl/nccl.h"

namespace mlx::core::distributed::nccl {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

// Stub: the NCCL backend is not part of this build.
bool is_available() {
  return false;
}

// Stub initializer: yields nullptr, or throws std::runtime_error when the
// caller requires the backend (`strict`).
std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  if (!strict) {
    return nullptr;
  }
  throw std::runtime_error("Cannot initialize nccl distributed backend.");
}

} // namespace mlx::core::distributed::nccl


================================================
FILE: mlx/distributed/ops.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <sstream>

#include "mlx/backend/cuda/cuda.h"
#include "mlx/backend/metal/metal.h"
#include "mlx/distributed/distributed_impl.h"
#include "mlx/distributed/ops.h"
#include "mlx/distributed/primitives.h"

namespace mlx::core::distributed {

namespace {

// Resolves an optional group argument: the supplied group if present,
// otherwise whatever distributed::init() returns.
Group to_group(std::optional<Group> group) {
  return group.has_value() ? group.value() : distributed::init();
}

} // namespace

// Creates an AllReduce(Sum) node over `x` for the given (or default) group.
// A single-member group needs no communication and gets `x` back unchanged.
array all_sum(
    const array& x,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto group = to_group(group_);
  if (group.size() == 1) {
    return x;
  }

  auto stream = detail::communication_stream(group, s);
  auto primitive = std::make_shared<AllReduce>(stream, group, AllReduce::Sum);
  return array(x.shape(), x.dtype(), std::move(primitive), {x});
}

// Creates an AllReduce(Max) node over `x` for the given (or default) group.
// A single-member group needs no communication and gets `x` back unchanged.
array all_max(
    const array& x,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto group = to_group(group_);
  if (group.size() == 1) {
    return x;
  }

  auto stream = detail::communication_stream(group, s);
  auto primitive = std::make_shared<AllReduce>(stream, group, AllReduce::Max);
  return array(x.shape(), x.dtype(), std::move(primitive), {x});
}

// Creates an AllReduce(Min) node over `x` for the given (or default) group.
// A single-member group needs no communication and gets `x` back unchanged.
array all_min(
    const array& x,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto group = to_group(group_);
  if (group.size() == 1) {
    return x;
  }

  auto stream = detail::communication_stream(group, s);
  auto primitive = std::make_shared<AllReduce>(stream, group, AllReduce::Min);
  return array(x.shape(), x.dtype(), std::move(primitive), {x});
}

// Creates an AllGather node over `x`. The output's first dimension is the
// input's multiplied by the group size; a 0-dim input produces a 1-D output
// with one entry per group member. A single-member group returns `x` as is.
array all_gather(
    const array& x,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto group = to_group(group_);
  if (group.size() == 1) {
    return x;
  }

  auto stream = detail::communication_stream(group, s);

  auto gathered_shape = x.shape();
  const bool scalar_input = gathered_shape.size() == 0;
  if (scalar_input) {
    gathered_shape.push_back(group.size());
  } else {
    gathered_shape[0] *= group.size();
  }

  auto primitive = std::make_shared<AllGather>(stream, group);
  return array(
      std::move(gathered_shape), x.dtype(), std::move(primitive), {x});
}

// Creates a Send node that transmits `x` to rank `dst`; the returned array
// has the shape and dtype of `x`.
// Throws std::invalid_argument for a single-member group or when `dst` is not
// a valid rank of the group.
array send(
    const array& x,
    int dst,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto group = to_group(group_);
  if (group.size() == 1) {
    throw std::invalid_argument("Cannot send to a singleton group");
  }

  auto stream = detail::communication_stream(group, s);

  const bool dst_in_range = dst >= 0 && dst < group.size();
  if (!dst_in_range) {
    std::ostringstream msg;
    msg << "Invalid destination=" << dst << " for a group of size "
        << group.size();
    throw std::invalid_argument(msg.str());
  }

  auto primitive = std::make_shared<Send>(stream, group, dst);
  return array(x.shape(), x.dtype(), std::move(primitive), {x});
}

// Creates a Recv node (no inputs) producing an array of the requested shape
// and dtype received from rank `src`.
// Throws std::invalid_argument for a single-member group or when `src` is not
// a valid rank of the group.
array recv(
    Shape shape,
    Dtype dtype,
    int src,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto group = to_group(group_);
  if (group.size() == 1) {
    throw std::invalid_argument("Cannot recv from a singleton group");
  }

  auto stream = detail::communication_stream(group, s);

  const bool src_in_range = src >= 0 && src < group.size();
  if (!src_in_range) {
    std::ostringstream msg;
    msg << "Invalid source=" << src << " for a group of size " << group.size();
    throw std::invalid_argument(msg.str());
  }

  auto primitive = std::make_shared<Recv>(stream, group, src);
  return array(
      std::move(shape),
      std::move(dtype),
      std::move(primitive),
      std::vector<array>{});
}

// Convenience wrapper around recv(): receives from rank `src` an array with
// the same shape and dtype as `x`. `x` only serves as the template; its data
// is not used here.
array recv_like(
    const array& x,
    int src,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  return recv(x.shape(), x.dtype(), src, group_, s);
}

// Creates a ReduceScatter(Sum) node over `x`. The output has the shape of
// `x` with the first dimension divided by the group size.
// A single-member group returns `x` unchanged. Otherwise throws
// std::invalid_argument when `x` has no dimensions or when its first
// dimension is not divisible by the group size.
array sum_scatter(
    const array& x,
    std::optional<Group> group_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto group = to_group(group_);
  if (group.size() == 1) {
    return x;
  }
  // A 0-dim array has no axis 0; indexing shape()[0] would be out of bounds.
  if (x.ndim() == 0) {
    std::ostringstream msg;
    msg << "[sum_scatter] Cannot scatter a scalar array across a group of size "
        << group.size() << ". The input must have at least one dimension.";
    throw std::invalid_argument(msg.str());
  }
  if (x.shape()[0] % group.size() != 0) {
    std::ostringstream msg;
    msg << "[sum_scatter] Invalid shape=" << x.shape()
        << " for a group of size " << group.size()
        << ". The first dimension (axis 0) must be divisible by the group size.";
    throw std::invalid_argument(msg.str());
  }

  auto result_shape = x.shape();
  result_shape[0] /= group.size();
  auto stream = detail::communication_stream(group, s);

  return array(
      std::move(result_shape),
      x.dtype(),
      std::make_shared<ReduceScatter>(stream, group, ReduceScatter::Sum),
      {x});
}
} // namespace mlx::core::distributed


================================================
FILE: mlx/distributed/ops.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <optional>

#include "mlx/api.h"
#include "mlx/distributed/distributed.h"
#include "mlx/utils.h"

namespace mlx::core::distributed {

// In every function below, omitting `group` selects the group returned by
// distributed::init().

/**
 * All-reduce of `x` with a sum over `group`. The result has the shape and
 * dtype of `x`; for a group of size 1 `x` itself is returned.
 */
MLX_API array all_sum(
    const array& x,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

/**
 * Gathers `x` from all members of `group`. The first dimension of the result
 * is the first dimension of `x` multiplied by the group size (a 0-dim input
 * yields a 1-D result with one entry per member). For a group of size 1 `x`
 * itself is returned.
 */
MLX_API array all_gather(
    const array& x,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

/**
 * Sends `x` to rank `dst`. Returns an array with the shape and dtype of `x`.
 * Throws std::invalid_argument for a group of size 1 or when `dst` is not a
 * valid rank of the group.
 */
MLX_API array send(
    const array& x,
    int dst,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

/**
 * Receives an array of the given `shape` and `dtype` from rank `src`.
 * Throws std::invalid_argument for a group of size 1 or when `src` is not a
 * valid rank of the group.
 */
MLX_API array recv(
    Shape shape,
    Dtype dtype,
    int src,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

/**
 * Same as recv(), taking the shape and dtype from `x`.
 */
MLX_API array recv_like(
    const array& x,
    int src,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

/**
 * All-reduce of `x` with a maximum over `group`. The result has the shape and
 * dtype of `x`; for a group of size 1 `x` itself is returned.
 */
MLX_API array all_max(
    const array& x,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

/**
 * All-reduce of `x` with a minimum over `group`. The result has the shape and
 * dtype of `x`; for a group of size 1 `x` itself is returned.
 */
MLX_API array all_min(
    const array& x,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

/**
 * Reduce-scatter of `x` with a sum over `group`. The first dimension of `x`
 * must be divisible by the group size (std::invalid_argument otherwise); the
 * result has that dimension divided by the group size. For a group of size 1
 * `x` itself is returned.
 */
MLX_API array sum_scatter(
    const array& x,
    std::optional<Group> group = std::nullopt,
    StreamOrDevice s = {});

} // namespace mlx::core::distributed


================================================
FILE: mlx/distributed/primitives.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <cassert>

#include "mlx/allocator.h"
#include "mlx/distributed/ops.h"
#include "mlx/distributed/primitives.h"
#include "mlx/ops.h"

namespace mlx::core::distributed {

std::pair<std::vector<array>, std::vector<int>> AllReduce::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  switch (reduce_type_) {
    case Sum:
      return {{all_sum(inputs[0], group(), stream())}, axes};
    case Max:
      return {{all_max(inputs[0], group(), stream())}, axes};
    case Min:
      return {{all_min(inputs[0], group(), stream())}, axes};
    default:

      throw std::runtime_error(
          "Only all reduce sum, max and min are supported for now");
  }
}

std::vector<array> AllReduce::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  switch (reduce_type_) {
    case Sum:
      return {all_sum(tangents[0], group(), stream())};
    case Max:
      return {all_max(tangents[0], group(), stream())};
    case Min:
      return {all_min(tangents[0], group(), stream())};
    default:
      throw std::runtime_error(
          "Only all reduce sum, max and min are supported for now");
  }
}

// Reverse-mode derivative: the incoming cotangents are returned unchanged,
// independently of the reduce type. The primals, argnums and outputs are not
// used.
std::vector<array> AllReduce::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>& outputs) {
  return cotangents;
}

// Vectorized form: gathers the batched input with the same group and stream
// and passes the vmapped axes through unchanged.
std::pair<std::vector<array>, std::vector<int>> AllGather::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  return {{all_gather(inputs[0], group(), stream())}, axes};
}

// Forward-mode derivative: the tangent of a gather is the gather of the
// first tangent, using the same group and stream. The primals and argnums
// are not used.
std::vector<array> AllGather::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {all_gather(tangents[0], group(), stream())};
}

std::vector<array> AllGather::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto g = group();
  auto ndim = primals[0].ndim();
  Shape starts(primals[0].ndim(), 0);
  auto stops = primals[0].shape();
  if (ndim == 0) {
    starts.push_back(0);
    stops.push_back(1);
  }
  starts[0] = g.rank() * stops[0];
  stops[0] += starts[0];
  auto out = slice(cotangents[0], starts, stops);
  if (ndim == 0) {
    out = squeeze(out, 0);
  }
  return {out};
}

// Vectorized form: sends the batched input to the same destination rank with
// the same group and stream, and passes the vmapped axes through unchanged.
std::pair<std::vector<array>, std::vector<int>> Send::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  return {{send(inputs[0], dst_, group(), stream())}, axes};
}

} // namespace mlx::core::distributed


================================================
FILE: mlx/distributed/primitives.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/distributed/distributed.h"
#include "mlx/distributed/distributed_impl.h"
#include "mlx/primitives.h"

namespace mlx::core::distributed {

// Common base of the distributed primitives: a Primitive that additionally
// stores the communication Group it operates on.
class DistPrimitive : public Primitive {
 public:
  DistPrimitive(Stream stream, Group group)
      : Primitive(stream), group_(group) {}

  // The group this primitive communicates over.
  const Group& group() const {
    return group_;
  }

 private:
  Group group_;
};

// All-reduce primitive. `ReduceType` selects the reduction applied across the
// group; eval_cpu/eval_gpu are defined outside this header.
class AllReduce : public DistPrimitive {
 public:
  // Reductions this primitive can be tagged with.
  enum ReduceType { And, Or, Sum, Prod, Min, Max };

  AllReduce(Stream stream, Group group, ReduceType reduce_type)
      : DistPrimitive(stream, group), reduce_type_(reduce_type) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes) override;
  std::vector<array> jvp(
      const std::vector<array>& primals,
      const std::vector<array>& tangents,
      const std::vector<int>& argnums) override;
  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  // Human-readable name that includes the reduce type. The trailing return
  // is only reached if reduce_type_ holds a value outside the enum.
  const char* name() const override {
    switch (reduce_type_) {
      case And:
        return "And AllReduce";
      case Or:
        return "Or AllReduce";
      case Sum:
        return "Sum AllReduce";
      case Prod:
        return "Prod AllReduce";
      case Min:
        return "Min AllReduce";
      case Max:
        return "Max AllReduce";
    }
    return "<unknwon AllReduce>";
  }

 private:
  ReduceType reduce_type_;
};

// All-gather primitive over a group; eval_cpu/eval_gpu are defined outside
// this header.
class AllGather : public DistPrimitive {
 public:
  AllGather(Stream stream, Group group) : DistPrimitive(stream, group) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes) override;
  std::vector<array> jvp(
      const std::vector<array>& primals,
      const std::vector<array>& tangents,
      const std::vector<int>& argnums) override;
  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  // DEFINE_NAME is declared elsewhere; presumably it supplies name().
  DEFINE_NAME(AllGather);
};

// Point-to-point send primitive: carries the destination rank `dst_` within
// the group. Declares vmap but no jvp/vjp overrides.
class Send : public DistPrimitive {
 public:
  Send(Stream stream, Group group, int dst)
      : DistPrimitive(stream, group), dst_(dst) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes) override;

  // DEFINE_NAME is declared elsewhere; presumably it supplies name().
  DEFINE_NAME(Send);

 private:
  // Destination rank within the group.
  int dst_;
};

// Point-to-point receive primitive: carries the source rank `src_` within the
// group. Declares no vmap/jvp/vjp overrides.
class Recv : public DistPrimitive {
 public:
  Recv(Stream stream, Group group, int src)
      : DistPrimitive(stream, group), src_(src) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  // DEFINE_NAME is declared elsewhere; presumably it supplies name().
  DEFINE_NAME(Recv);

 private:
  // Source rank within the group.
  int src_;
};

// Reduce-scatter primitive. `ReduceType` selects the reduction; eval_cpu and
// eval_gpu are defined outside this header. Declares no vmap/jvp/vjp
// overrides.
class ReduceScatter : public DistPrimitive {
 public:
  // Reductions this primitive can be tagged with.
  enum ReduceType { Sum, Min, Max };
  ReduceScatter(Stream stream, Group group, ReduceType reduce_type)
      : DistPrimitive(stream, group), reduce_type_(reduce_type) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  // Human-readable name that includes the reduce type. The trailing return
  // is only reached if reduce_type_ holds a value outside the enum.
  const char* name() const override {
    switch (reduce_type_) {
      case Sum:
        return "Sum ReduceScatter";
      case Min:
        return "Min ReduceScatter";
      case Max:
        return "Max ReduceScatter";
    }
    return "<unknwon ReduceScatter>";
  }

 private:
  ReduceType reduce_type_;
};
} // namespace mlx::core::distributed


================================================
FILE: mlx/distributed/reduction_ops.h
================================================
// Copyright © 2025 Apple Inc.

// Include guard: this header defines class templates, so a second inclusion
// in the same translation unit would otherwise be a redefinition error.
#pragma once

#include <algorithm>
#include <cstddef>

namespace mlx::core::distributed::detail {

// Element-wise accumulation functors. Each one combines `N` elements of
// `input` into `output` in place: output[i] = op(output[i], input[i]).

// output[i] += input[i]
template <typename T>
struct SumOp {
  void operator()(const T* input, T* output, size_t N) const {
    for (size_t i = 0; i < N; ++i) {
      output[i] += input[i];
    }
  }
};

// output[i] = max(output[i], input[i])
template <typename T>
struct MaxOp {
  void operator()(const T* input, T* output, size_t N) const {
    for (size_t i = 0; i < N; ++i) {
      output[i] = std::max(output[i], input[i]);
    }
  }
};

// output[i] = min(output[i], input[i])
template <typename T>
struct MinOp {
  void operator()(const T* input, T* output, size_t N) const {
    for (size_t i = 0; i < N; ++i) {
      output[i] = std::min(output[i], input[i]);
    }
  }
};

} // namespace mlx::core::distributed::detail


================================================
FILE: mlx/distributed/ring/CMakeLists.txt
================================================
# The ring backend is only compiled for CPU builds on non-Windows platforms;
# every other configuration gets the stub implementation.
if(NOT MLX_BUILD_CPU OR WIN32)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_ring.cpp)
else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ring.cpp)
endif()


================================================
FILE: mlx/distributed/ring/no_ring.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/distributed/ring/ring.h"

namespace mlx::core::distributed::ring {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

// Stub: the ring backend is not part of this build.
bool is_available() {
  return false;
}

// Stub initializer: yields nullptr, or throws std::runtime_error when the
// caller requires the backend (`strict`).
std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  if (!strict) {
    return nullptr;
  }
  throw std::runtime_error("Cannot initialize ring distributed backend.");
}

} // namespace mlx::core::distributed::ring


================================================
FILE: mlx/distributed/ring/ring.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <fcntl.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

#include <chrono>
#include <fstream>
#include <future>
#include <iostream>
#include <list>
#include <sstream>
#include <thread>
#include <unordered_map>

#include <json.hpp>

#include "mlx/backend/cpu/encoder.h"
#include "mlx/distributed/distributed.h"
#include "mlx/distributed/distributed_impl.h"
#include "mlx/distributed/reduction_ops.h"
#include "mlx/distributed/utils.h"
#include "mlx/threadpool.h"

#ifndef SOL_TCP
#define SOL_TCP IPPROTO_TCP
#endif

// Dispatches on the runtime dtype of `x`: inside each case the alias `T`
// names the matching C++ element type and the statement(s) passed as the
// remaining macro arguments are executed once. There is no default case, so a
// dtype not listed here executes nothing.
#define SWITCH_TYPE(x, ...)  \
  switch ((x).dtype()) {     \
    case bool_: {            \
      using T = bool;        \
      __VA_ARGS__;           \
    } break;                 \
    case int8: {             \
      using T = int8_t;      \
      __VA_ARGS__;           \
    } break;                 \
    case int16: {            \
      using T = int16_t;     \
      __VA_ARGS__;           \
    } break;                 \
    case int32: {            \
      using T = int32_t;     \
      __VA_ARGS__;           \
    } break;                 \
    case int64: {            \
      using T = int64_t;     \
      __VA_ARGS__;           \
    } break;                 \
    case uint8: {            \
      using T = uint8_t;     \
      __VA_ARGS__;           \
    } break;                 \
    case uint16: {           \
      using T = uint16_t;    \
      __VA_ARGS__;           \
    } break;                 \
    case uint32: {           \
      using T = uint32_t;    \
      __VA_ARGS__;           \
    } break;                 \
    case uint64: {           \
      using T = uint64_t;    \
      __VA_ARGS__;           \
    } break;                 \
    case bfloat16: {         \
      using T = bfloat16_t;  \
      __VA_ARGS__;           \
    } break;                 \
    case float16: {          \
      using T = float16_t;   \
      __VA_ARGS__;           \
    } break;                 \
    case float32: {          \
      using T = float;       \
      __VA_ARGS__;           \
    } break;                 \
    case float64: {          \
      using T = double;      \
      __VA_ARGS__;           \
    } break;                 \
    case complex64: {        \
      using T = complex64_t; \
      __VA_ARGS__;           \
    } break;                 \
  }

namespace mlx::core::distributed::ring {

// Size in bytes (8 MiB) of one all-reduce receive buffer; RingGroup allocates
// ALL_SUM_BUFFERS of these per socket.
constexpr const size_t ALL_SUM_SIZE = 8 * 1024 * 1024;
// Number of receive buffers each all-reduce alternates between.
constexpr const size_t ALL_SUM_BUFFERS = 2;
// Number of connection attempts passed to TCPSocket::connect.
constexpr const int CONN_ATTEMPTS = 5;
// Wait in milliseconds after the first failed connection attempt;
// TCPSocket::connect doubles it after every failed attempt.
constexpr const int CONN_WAIT = 1000;
// Prefix handed to the TCPSocket helpers for their error messages.
constexpr const char* RING_TAG = "[ring]";

using GroupImpl = mlx::core::distributed::detail::GroupImpl;
using json = nlohmann::json;
using namespace std::chrono_literals;

namespace {

// Single-value overload: write the value and finish the line. std::endl also
// flushes the stream.
template <typename T>
void log(std::ostream& os, T first) {
  os << first;
  os << std::endl;
}

// Multi-value overload: write every argument in order, separated by a single
// space, then finish the line exactly like the single-value overload.
template <typename T, typename... Args>
void log(std::ostream& os, T first, Args... args) {
  os << first;
  ((os << " " << args), ...);
  os << std::endl;
}

// Write the arguments to std::cerr through `log`, prefixed with "[ring]".
// Nothing is written unless `verbose` is true.
template <typename... Args>
void log_info(bool verbose, Args... args) {
  if (verbose) {
    log(std::cerr, "[ring]", args...);
  }
}

// Divide `a` by `b` rounding up (for non-negative `a` and positive `b`). The
// return type is the type of the product of a `T` and a `U`.
template <typename T, typename U>
decltype(T() * U()) ceildiv(T a, U b) {
  const auto padded = a + b - 1;
  return padded / b;
}

/**
 * Runs one background thread that performs every send and receive for a
 * single socket.
 *
 * The socket is switched to non-blocking mode while the object is alive and
 * switched back on destruction. `send` and `recv` enqueue a buffer and return
 * a future that becomes ready once the whole buffer has been transferred.
 * Sends are served in FIFO order and so are receives; the worker advances at
 * most one pending receive and one pending send per loop iteration.
 *
 * The caller must keep every buffer alive until the matching future is ready.
 *
 * NOTE: when the worker gives up after 10 consecutive errors it simply exits;
 * tasks that are still queued are not completed by the worker.
 */
class SocketThread {
 public:
  SocketThread(int fd) : fd_(fd), stop_(false) {
    // Switch to non-blocking before the worker exists so it can never issue a
    // blocking call on the descriptor.
    int flags = fcntl(fd_, F_GETFL, 0);
    if (flags != -1) {
      fcntl(fd_, F_SETFL, flags | O_NONBLOCK);
    }
    worker_ = std::thread(&SocketThread::worker, this);
  }
  ~SocketThread() {
    {
      // stop_ is part of the condition variable predicate, so it has to be
      // changed under the mutex; otherwise the worker can miss the
      // notification and join() below would never return.
      std::lock_guard<std::mutex> lock(queue_mutex_);
      stop_ = true;
    }
    condition_.notify_all();
    worker_.join();
    int flags = fcntl(fd_, F_GETFL, 0);
    if (flags != -1) {
      fcntl(fd_, F_SETFL, flags & ~O_NONBLOCK);
    }
  }

  /** Queue `size` elements of `buffer` for sending. */
  template <typename T>
  std::future<void> send(const T* buffer, size_t size) {
    return send_impl(reinterpret_cast<const char*>(buffer), size * sizeof(T));
  }

  /** Queue the reception of `size` elements into `buffer`. */
  template <typename T>
  std::future<void> recv(T* buffer, size_t size) {
    return recv_impl(reinterpret_cast<char*>(buffer), size * sizeof(T));
  }

 private:
  // One pending transfer: the not yet transferred part of the buffer and the
  // promise to fulfil once `size` reaches zero.
  struct SocketTask {
    SocketTask(void* b, size_t s, std::promise<void>&& p)
        : buffer(b), size(s), promise(std::move(p)) {}
    SocketTask(SocketTask&& t)
        : buffer(t.buffer), size(t.size), promise(std::move(t.promise)) {}
    void* buffer;
    size_t size;
    std::promise<void> promise;
  };

  // Errors that only mean "nothing could be transferred right now".
  static bool is_transient(int err) {
    return err == EAGAIN || err == EWOULDBLOCK || err == EINTR;
  }

  std::future<void> send_impl(const char* buffer, size_t size) {
    std::promise<void> send_completed_promise;
    auto send_completed_future = send_completed_promise.get_future();
    // Nothing to transfer: complete immediately without waking the worker.
    if (size == 0) {
      send_completed_promise.set_value();
      return send_completed_future;
    }

    {
      std::unique_lock lock(queue_mutex_);
      // The task type is shared with recv and stores a mutable pointer; the
      // worker only reads through it for sends.
      sends_.emplace_back(SocketTask(
          const_cast<char*>(buffer), size, std::move(send_completed_promise)));
    }
    condition_.notify_one();
    return send_completed_future;
  }

  std::future<void> recv_impl(char* buffer, size_t size) {
    std::promise<void> recv_completed_promise;
    auto recv_completed_future = recv_completed_promise.get_future();
    // Nothing to transfer: complete immediately without waking the worker.
    if (size == 0) {
      recv_completed_promise.set_value();
      return recv_completed_future;
    }

    {
      std::unique_lock lock(queue_mutex_);
      recvs_.emplace_back(
          SocketTask(buffer, size, std::move(recv_completed_promise)));
    }
    condition_.notify_one();
    return recv_completed_future;
  }

  // Must be called with queue_mutex_ held.
  bool have_tasks() {
    return !(sends_.empty() && recvs_.empty());
  }

  void worker() {
    int error_count = 0;
    bool delete_recv = false;
    bool delete_send = false;
    while (true) {
      // Front tasks for this iteration. They are looked up under the lock
      // because producers append to the lists concurrently. The pointers stay
      // valid afterwards: std::list never relocates elements and only this
      // thread removes them.
      SocketTask* recv_task = nullptr;
      SocketTask* send_task = nullptr;
      {
        std::unique_lock lock(queue_mutex_);

        // Fulfil and drop the tasks completed in the previous iteration.
        if (delete_recv) {
          recvs_.front().promise.set_value();
          recvs_.pop_front();
          delete_recv = false;
        }
        if (delete_send) {
          sends_.front().promise.set_value();
          sends_.pop_front();
          delete_send = false;
        }

        // Returns immediately when there is work or a stop was requested.
        condition_.wait(lock, [this] { return stop_ || have_tasks(); });
        if (stop_) {
          return;
        }

        if (!recvs_.empty()) {
          recv_task = &recvs_.front();
        }
        if (!sends_.empty()) {
          send_task = &sends_.front();
        }
      }

      if (recv_task != nullptr) {
        ssize_t r = ::recv(fd_, recv_task->buffer, recv_task->size, 0);
        // errno is only meaningful when the call reported a failure.
        const int err = (r < 0) ? errno : 0;
        if (r > 0) {
          recv_task->buffer = static_cast<char*>(recv_task->buffer) + r;
          recv_task->size -= r;
          delete_recv = recv_task->size == 0;
          error_count = 0;
        } else if (r == 0) {
          // A zero return for a non-empty request means the peer closed the
          // connection; retrying silently would spin forever.
          error_count++;
          log_info(
              true,
              "Receiving from socket",
              fd_,
              "failed: connection closed by peer");
        } else if (!is_transient(err)) {
          error_count++;
          log_info(
              true, "Receiving from socket", fd_, "failed with errno", err);
        }
      }
      if (send_task != nullptr) {
        ssize_t r = ::send(fd_, send_task->buffer, send_task->size, 0);
        const int err = (r < 0) ? errno : 0;
        if (r > 0) {
          send_task->buffer = static_cast<char*>(send_task->buffer) + r;
          send_task->size -= r;
          delete_send = send_task->size == 0;
          error_count = 0;
        } else if (r < 0 && !is_transient(err)) {
          error_count++;
          log_info(true, "Sending to socket", fd_, "failed with errno", err);
        }
      }

      if (error_count >= 10) {
        log_info(true, "Too many send/recv errors. Aborting...");
        return;
      }
    }
  }

  int fd_;
  bool stop_; // guarded by queue_mutex_
  std::thread worker_;
  std::mutex queue_mutex_;
  std::condition_variable condition_;
  std::list<SocketTask> sends_;
  std::list<SocketTask> recvs_;
};

class CommunicationThreads {
 public:
  void add(const std::vector<int>& sockets) {
    for (int sock : sockets) {
      threads_.emplace(sock, sock);
    }
  }

  template <typename T>
  std::future<void> send(int socket, T* buffer, size_t size) {
    return threads_.at(socket).send<T>(buffer, size);
  }

  template <typename T>
  std::future<void> recv(int socket, T* buffer, size_t size) {
    return threads_.at(socket).recv<T>(buffer, size);
  }

 private:
  std::unordered_map<int, SocketThread> threads_;
};

/**
 * Load all addresses from the json hostfile. The hostfile is a list of
 * addresses in order of rank. For each rank there can be many addresses so
 * that we can have multiple connections between peers.
 *
 * For example:
 *  [
 *    ["ip1:5000", "ip1:5001"],
 *    ["ip2:5000", "ip2:5001"],
 *    ["ip3:5000", "ip3:5001"]
 *  ]
 *
 * Throws std::runtime_error when the hostfile cannot be opened. Malformed
 * content is reported by the json parser or by `detail::parse_address`.
 */
std::vector<std::vector<detail::address_t>> load_nodes(const char* hostfile) {
  std::ifstream f(hostfile);
  // Fail with a message that names the file instead of letting the parser
  // report an unexpected end of input on a stream that never opened.
  if (!f.is_open()) {
    std::ostringstream msg;
    msg << "[ring] Couldn't open hostfile \"" << hostfile << "\"";
    throw std::runtime_error(msg.str());
  }

  std::vector<std::vector<detail::address_t>> nodes;
  json hosts = json::parse(f);
  for (auto& h : hosts) {
    std::vector<detail::address_t> host;
    for (auto& ips : h) {
      host.push_back(detail::parse_address(ips.get<std::string>()));
    }
    nodes.push_back(std::move(host));
  }

  return nodes;
}

/**
 * Create a socket and accept one connection for each of the provided
 * addresses.
 *
 * Returns the descriptors of the accepted connections, in the order of
 * `addresses`. The caller owns them and is responsible for closing them.
 */
std::vector<int> accept_connections(
    const std::vector<detail::address_t>& addresses) {
  std::vector<int> sockets;
  sockets.reserve(addresses.size());

  for (const auto& address : addresses) {
    // The listening socket is released by its destructor at the end of the
    // iteration; only the accepted peer descriptor is detached and kept.
    detail::TCPSocket socket(RING_TAG);
    socket.listen(RING_TAG, address);
    sockets.push_back(socket.accept(RING_TAG).detach());
  }

  return sockets;
}

/**
 * The counterpart of `accept_connections`. Basically connect to each of the
 * provided addresses.
 *
 * Every connection is attempted up to CONN_ATTEMPTS times starting with a
 * wait of CONN_WAIT ms. When `verbose` is set each failed attempt is logged
 * together with the current errno.
 *
 * Returns the connected descriptors in the order of `addresses`. The caller
 * owns them and is responsible for closing them.
 */
std::vector<int> make_connections(
    const std::vector<detail::address_t>& addresses,
    bool verbose) {
  std::vector<int> sockets;
  sockets.reserve(addresses.size());

  for (const auto& address : addresses) {
    sockets.push_back(
        detail::TCPSocket::connect(
            RING_TAG,
            address,
            CONN_ATTEMPTS,
            CONN_WAIT,
            [verbose](int attempt, int wait) {
              log_info(
                  verbose,
                  "Attempt",
                  attempt,
                  "waiting",
                  wait,
                  "ms (error:",
                  errno,
                  ")");
            })
            .detach());
  }

  return sockets;
}

} // namespace

/**
 * Distributed group whose ranks are arranged in a ring. Every rank keeps one
 * or more TCP connections to its right neighbor, (rank + 1) % size, and the
 * same number of connections accepted on its own addresses (its left side).
 * All collectives are built from transfers over these connections.
 */
class RingGroup : public GroupImpl {
 public:
  /**
   * Establish the ring connections for `rank`.
   *
   * `nodes` holds one list of addresses per rank, indexed by rank. This rank
   * listens on `nodes[rank]` and connects to `nodes[(rank + 1) % size]`.
   *
   * Throws std::runtime_error when `rank` is outside [0, nodes.size()) and
   * std::invalid_argument when a side ends up without sockets or the two
   * sides have a different number of sockets.
   */
  RingGroup(
      int rank,
      std::vector<std::vector<detail::address_t>> nodes,
      bool verbose)
      : rank_(rank), verbose_(verbose), pool_(0) {
    // The rank is used below both as an index into `nodes` and as an operand
    // of `% size_`, so it has to be validated first. This also rejects an
    // empty hostfile (size 0).
    if (rank_ < 0) {
      throw std::runtime_error("[ring] Rank cannot be negative");
    }
    if (static_cast<size_t>(rank_) >= nodes.size()) {
      throw std::runtime_error(
          "[ring] Rank cannot be larger than the size of the group");
    }

    size_ = nodes.size();
    int connect_to = (rank_ + 1) % size_;

    // We define the connection order by having the rank_ == size_ - 1 connect
    // first and accept after.
    if (rank_ < connect_to) {
      log_info(verbose_, "Rank", rank_, "accepting");
      sockets_left_ = accept_connections(nodes[rank_]);
      log_info(verbose_, "Rank", rank_, "connecting to", connect_to);
      sockets_right_ = make_connections(nodes[connect_to], verbose);
    } else {
      log_info(verbose_, "Rank", rank_, "connecting to", connect_to);
      sockets_right_ = make_connections(nodes[connect_to], verbose);
      log_info(verbose_, "Rank", rank_, "accepting");
      sockets_left_ = accept_connections(nodes[rank_]);
    }

    // Failure if we couldn't make right or left sockets
    if (sockets_right_.empty()) {
      std::ostringstream msg;
      msg << "[ring] Rank " << rank_ << " has no sockets to the right.";
      throw std::invalid_argument(msg.str());
    }
    if (sockets_left_.empty()) {
      std::ostringstream msg;
      msg << "[ring] Rank " << rank_ << " has no sockets to the left.";
      throw std::invalid_argument(msg.str());
    }

    // The following could be relaxed since we can define non-homogeneous rings
    // but it makes things a bit simpler for now.
    if (sockets_right_.size() != sockets_left_.size()) {
      std::ostringstream msg;
      msg << "[ring] It is required to have as many connections to the left as "
          << "to the right but rank " << rank_ << " has "
          << sockets_right_.size() << " connections to the right and "
          << sockets_left_.size() << " to the left.";
      throw std::invalid_argument(msg.str());
    }

    // Configure all sockets to use TCP no delay. Both vectors have the same
    // size at this point. The result is not checked (best effort).
    int one = 1;
    for (size_t i = 0; i < sockets_right_.size(); i++) {
      setsockopt(sockets_right_[i], SOL_TCP, TCP_NODELAY, &one, sizeof(one));
      setsockopt(sockets_left_[i], SOL_TCP, TCP_NODELAY, &one, sizeof(one));
    }

    // Start the all reduce threads. One all reduce per direction per ring.
    pool_.resize(sockets_right_.size() + sockets_left_.size());

    // Create a communication thread per socket. This also converts them to
    // non-blocking.
    comm_.add(sockets_right_);
    comm_.add(sockets_left_);

    // Allocate buffers for the all sum
    buffers_.resize(
        (sockets_right_.size() + sockets_left_.size()) * ALL_SUM_BUFFERS *
        ALL_SUM_SIZE);
  }

  /** Shut down and close every socket of the ring. */
  ~RingGroup() {
    for (auto s : sockets_right_) {
      shutdown(s, SHUT_RDWR);
      close(s);
    }
    for (auto s : sockets_left_) {
      shutdown(s, SHUT_RDWR);
      close(s);
    }
  }

  /** Communication is always scheduled on a CPU stream. */
  Stream communication_stream(StreamOrDevice s) override {
    return to_stream(s, Device::cpu);
  }

  int rank() override {
    return rank_;
  }

  int size() override {
    return size_;
  }

  /** Element-wise sum of `input` across all ranks, written to `output`. */
  void all_sum(const array& input, array& output, Stream stream) override {
    SWITCH_TYPE(
        output, all_reduce<T>(input, output, stream, detail::SumOp<T>()));
  }

  /** Element-wise maximum of `input` across all ranks. */
  void all_max(const array& input, array& output, Stream stream) override {
    SWITCH_TYPE(
        output, all_reduce<T>(input, output, stream, detail::MaxOp<T>()));
  }

  /** Element-wise minimum of `input` across all ranks. */
  void all_min(const array& input, array& output, Stream stream) override {
    SWITCH_TYPE(
        output, all_reduce<T>(input, output, stream, detail::MinOp<T>()));
  }

  /** Not supported by the ring backend; always throws. */
  std::shared_ptr<GroupImpl> split(int color, int key = -1) override {
    throw std::runtime_error("[ring] Group split not supported.");
  }

  /**
   * Gather `input` from every rank into `output`. The segment of rank r is
   * stored at byte offset r * input.nbytes() of `output`.
   *
   * The input is cut into up to one byte range per socket (ranges of at least
   * `min_send_size` bytes when the input is large enough) and every range is
   * gathered by its own task on the thread pool.
   */
  void all_gather(const array& input, array& output, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_input_array(input);
    encoder.set_output_array(output);
    encoder.dispatch([input_ptr = input.data<char>(),
                      nbytes = input.nbytes(),
                      output_ptr = output.data<char>(),
                      this]() {
      constexpr size_t min_send_size = 262144;
      size_t n_gathers = std::max(
          std::min(
              sockets_right_.size() + sockets_left_.size(),
              nbytes / min_send_size),
          size_t(1));
      size_t bytes_per_gather = ceildiv(nbytes, n_gathers);
      std::vector<std::future<void>> all_gathers;
      for (int i = 0; i < n_gathers; i++) {
        auto offset = i * bytes_per_gather;
        // Consecutive tasks share a socket pair and run in opposite
        // directions around the ring.
        all_gathers.emplace_back(pool_.enqueue(
            std::bind(
                &RingGroup::all_gather_impl,
                this,
                input_ptr + offset,
                output_ptr + offset,
                nbytes,
                offset + bytes_per_gather > nbytes ? nbytes - offset
                                                   : bytes_per_gather,
                sockets_right_[i / 2],
                sockets_left_[i / 2],
                (i % 2) ? -1 : 1)));
      }
      for (auto& f : all_gathers) {
        f.wait();
      }
    });
  }

  /**
   * Send `input` to rank `dst`, which must be the right or the left neighbor.
   * The dispatched task throws std::runtime_error for any other destination.
   */
  void send(const array& input, int dst, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_input_array(input);
    encoder.dispatch(
        [input_ptr = input.data<char>(), nbytes = input.nbytes(), dst, this]() {
          int right = (rank_ + 1) % size_;
          int left = (rank_ + size_ - 1) % size_;
          if (dst == right) {
            send(sockets_right_, input_ptr, nbytes);
          } else if (dst == left) {
            send(sockets_left_, input_ptr, nbytes);
          } else {
            std::ostringstream msg;
            msg << "[ring] Send only supported to direct neighbors "
                << "but tried to send to " << dst << " from " << rank_
                << std::endl;
            throw std::runtime_error(msg.str());
          }
        });
  }

  /**
   * Receive into `out` from rank `src`, which must be the left or the right
   * neighbor. The dispatched task throws std::runtime_error for any other
   * source.
   */
  void recv(array& out, int src, Stream stream) override {
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_output_array(out);
    encoder.dispatch(
        [out_ptr = out.data<char>(), nbytes = out.nbytes(), src, this]() {
          // NOTE: We check the sockets in the opposite order of send so that
          // they work even with 2 nodes where left and right is the same
          // neighbor.
          int right = (rank_ + 1) % size_;
          int left = (rank_ + size_ - 1) % size_;
          if (src == left) {
            recv(sockets_left_, out_ptr, nbytes);
          } else if (src == right) {
            recv(sockets_right_, out_ptr, nbytes);
          } else {
            std::ostringstream msg;
            msg << "[ring] Recv only supported from direct neighbors "
                << "but tried to recv from " << src << " to " << rank_
                << std::endl;
            throw std::runtime_error(msg.str());
          }
        });
  }

  /** Not supported by the ring backend; always throws. */
  void sum_scatter(const array& input, array& output, Stream stream) override {
    throw std::runtime_error("[ring] sum_scatter not supported.");
  }

 private:
  /**
   * Schedule the reduction of `input` into `output` with `reduce_op` on the
   * CPU command encoder of `stream`.
   *
   * Inputs with fewer elements than ranks are padded into a small local
   * buffer and reduced with a single ring pass. Larger inputs are copied to
   * `output` (unless the reduction is in place) and split into up to one
   * chunk per socket, each reduced by its own task on the thread pool.
   */
  template <typename T, typename ReduceOp>
  void all_reduce(
      const array& input,
      array& output,
      Stream stream,
      ReduceOp reduce_op) {
    auto in_ptr = input.data<char>();
    auto out_ptr = output.data<char>();
    auto& encoder = cpu::get_command_encoder(stream);
    encoder.set_output_array(output);
    encoder.dispatch([in_ptr, out_ptr, size = input.size(), this, reduce_op]() {
      // If the input data cannot be split into size_ segments then copy it and
      // all reduce a local buffer prefilled with 0s.
      size_t nbytes = size * sizeof(T);
      if (size < size_) {
        // TODO: Maybe allocate dynamically so we don't have the constraint
        // below?
        if (sizeof(T) * size_ > 1024) {
          std::ostringstream msg;
          msg << "Can't perform the ring all reduce of " << size
              << " elements with a ring of size " << size_;
          throw std::runtime_error(msg.str());
        }

        // The buffer is accessed through a T*, so it must be aligned for T.
        alignas(T) char buffer[1024];
        std::memset(buffer, 0, size_ * sizeof(T));
        std::memcpy(buffer, in_ptr, nbytes);
        all_reduce_impl<T, ReduceOp>(
            reinterpret_cast<T*>(buffers_.data()),
            reinterpret_cast<T*>(buffer),
            size_,
            sockets_right_[0],
            sockets_left_[0],
            -1,
            reduce_op);
        std::memcpy(out_ptr, buffer, nbytes);
        return;
      }

      // If not inplace all reduce then copy the input to the output first
      if (in_ptr != out_ptr) {
        std::memcpy(out_ptr, in_ptr, nbytes);
      }

      // Split the all reduces so that each member has at least 1 buffer to
      // send/recv per segment.
      constexpr size_t min_send_size = 262144;
      size_t n_reduces = std::max(
          std::min(
              sockets_right_.size() + sockets_left_.size(),
              nbytes / (size_ * min_send_size)),
          size_t(1));
      size_t step = ceildiv(size, n_reduces);
      std::vector<std::future<void>> all_sums;

      for (int i = 0; i < n_reduces; i++) {
        // Every task gets its own ALL_SUM_BUFFERS receive buffers and its own
        // slice of the output. Consecutive tasks share a socket pair and run
        // in opposite directions around the ring.
        all_sums.emplace_back(pool_.enqueue(
            std::bind(
                &RingGroup::all_reduce_impl<T, ReduceOp>,
                this,
                reinterpret_cast<T*>(
                    buffers_.data() + i * ALL_SUM_SIZE * ALL_SUM_BUFFERS),
                reinterpret_cast<T*>(out_ptr) + i * step,
                std::min(size, (i + 1) * step) - i * step,
                sockets_right_[i / 2],
                sockets_left_[i / 2],
                (i % 2) ? -1 : 1,
                reduce_op)));
      }
      for (auto& f : all_sums) {
        f.wait();
      }
    });
  }

  /**
   * Ring all reduce of `data_size` elements of `data`, in place.
   *
   * `buffer` is scratch space for the packets received during the
   * scatter-reduce phase. A negative `direction` sends on `socket_right` and
   * receives on `socket_left`; otherwise the roles are swapped.
   */
  template <typename T, typename ReduceOp>
  void all_reduce_impl(
      T* buffer,
      T* data,
      size_t data_size,
      int socket_right,
      int socket_left,
      int direction,
      ReduceOp reduce_op) {
    // Choose which socket we send to and recv from
    int socket_send = (direction < 0) ? socket_right : socket_left;
    int socket_recv = (direction < 0) ? socket_left : socket_right;

    // We split the data into `size_` segments of size `segment_size` and each
    // of these in smaller segments of ALL_SUM_SIZE which we'll call packets.
    size_t segment_size = ceildiv(data_size, size_);
    size_t BUFFER_SIZE = std::max(
        size_t(32768), std::min(ALL_SUM_SIZE / sizeof(T), segment_size / 2));
    size_t n_packets = ceildiv(segment_size, BUFFER_SIZE);

    // Initial segments
    int send_segment = rank_;
    int recv_segment = (rank_ + direction + size_) % size_;

    // Plan the whole reduce in terms of sends and recvs as indices in data.
    // It makes the actual async send and recv a bit simpler to follow when
    // there are less offset calculations around.
    std::vector<std::pair<size_t, size_t>> send_plan;
    std::vector<std::pair<size_t, size_t>> recv_plan;

    // Two times the same send/recv operations, first scatter reduce and then
    // gather.
    for (int k = 0; k < 2; k++) {
      for (int i = 0; i < size_ - 1; i++) {
        size_t send_start = send_segment * segment_size;
        size_t send_stop =
            std::min((send_segment + 1) * segment_size, data_size);
        size_t recv_start = recv_segment * segment_size;
        size_t recv_stop =
            std::min((recv_segment + 1) * segment_size, data_size);

        // Ranges are clamped to the segment end, so trailing packets of a
        // short segment are empty.
        for (size_t j = 0; j < n_packets; j++) {
          send_plan.emplace_back(
              std::min(send_start + j * BUFFER_SIZE, send_stop),
              std::min(send_start + (j + 1) * BUFFER_SIZE, send_stop));
          recv_plan.emplace_back(
              std::min(recv_start + j * BUFFER_SIZE, recv_stop),
              std::min(recv_start + (j + 1) * BUFFER_SIZE, recv_stop));
        }

        send_segment = (send_segment + size_ + direction) % size_;
        recv_segment = (recv_segment + size_ + direction) % size_;
      }
    }

    // Running the plan is fairly simple, we keep a send and a recv in flight
    // while doing the summation.
    T* recv_buffers[ALL_SUM_BUFFERS];
    for (int i = 0; i < ALL_SUM_BUFFERS; i++) {
      recv_buffers[i] = buffer + i * BUFFER_SIZE;
    }
    std::future<void> sends[2], recvs[2];
    int a = 0;
    int b = (n_packets > 1) ? 1 : 0;
    // `i` is the step being issued and `j` the step being completed; `j`
    // trails `i` by one step when there is more than one packet per segment.
    for (int i = 0, j = -b; i < send_plan.size(); j++, i++) {
      sends[a] = comm_.send(
          socket_send,
          data + send_plan[i].first,
          send_plan[i].second - send_plan[i].first);
      // The first half of the plan is the scatter reduce, which receives into
      // scratch buffers; the second half is the gather, which receives
      // straight into `data`.
      if (2 * i < send_plan.size()) {
        recvs[a] = comm_.recv(
            socket_recv,
            recv_buffers[i % ALL_SUM_BUFFERS],
            recv_plan[i].second - recv_plan[i].first);
      } else {
        recvs[a] = comm_.recv(
            socket_recv,
            data + recv_plan[i].first,
            recv_plan[i].second - recv_plan[i].first);
      }

      if (j >= 0) {
        sends[b].wait();
        recvs[b].wait();
        if (2 * j < send_plan.size()) {
          reduce_op(
              recv_buffers[j % ALL_SUM_BUFFERS],
              data + recv_plan[j].first,
              recv_plan[j].second - recv_plan[j].first);
        }
      }

      std::swap(a, b);
    }
    sends[b].wait();
    recvs[b].wait();
  }

  /**
   * Ring all gather of one byte range.
   *
   * `input` points to the local `data_size` bytes and `output` to the same
   * range inside the segment of rank 0. `input_size` is the distance in bytes
   * between the segments of consecutive ranks in the output. A negative
   * `direction` sends on `socket_right` and receives on `socket_left`;
   * otherwise the roles are swapped.
   */
  void all_gather_impl(
      const char* input,
      char* output,
      size_t input_size,
      size_t data_size,
      int socket_right,
      int socket_left,
      int direction) {
    // Choose which socket we send to and recv from
    int socket_send = (direction < 0) ? socket_right : socket_left;
    int socket_recv = (direction < 0) ? socket_left : socket_right;

    // Initial segments
    int send_segment = rank_;
    int recv_segment = (rank_ + direction + size_) % size_;

    // Copy our own segment in the output
    std::memcpy(output + rank_ * input_size, input, data_size);

    // Simple send/recv all gather. Possible performance improvement by
    // splitting to multiple chunks and allowing send/recv to run a bit ahead.
    // See all_reduce_impl for an example.
    for (int i = 0; i < size_ - 1; i++) {
      auto sent = comm_.send(
          socket_send, output + send_segment * input_size, data_size);
      auto recvd = comm_.recv(
          socket_recv, output + recv_segment * input_size, data_size);

      send_segment = (send_segment + size_ + direction) % size_;
      recv_segment = (recv_segment + size_ + direction) % size_;

      sent.wait();
      recvd.wait();
    }
  }

  /**
   * Send `data_size` bytes split over `sockets` in contiguous segments of at
   * least 1024 bytes (fewer sockets are used for small payloads), then wait
   * for all of them to complete.
   */
  void
  send(const std::vector<int>& sockets, const char* data, size_t data_size) {
    size_t segment_size =
        std::max(size_t(1024), ceildiv(data_size, sockets.size()));
    std::vector<std::future<void>> sends;
    for (int i = 0; i < sockets.size(); i++) {
      if (i * segment_size >= data_size) {
        break;
      }
      sends.emplace_back(comm_.send(
          sockets[i],
          data + i * segment_size,
          std::min(data_size, (i + 1) * segment_size) - i * segment_size));
    }
    for (auto& f : sends) {
      f.wait();
    }
  }

  /**
   * Receive `data_size` bytes split over `sockets` with the same segmentation
   * as the matching `send`, then wait for all of them to complete.
   */
  void recv(const std::vector<int>& sockets, char* data, size_t data_size) {
    size_t segment_size =
        std::max(size_t(1024), ceildiv(data_size, sockets.size()));
    std::vector<std::future<void>> recvs;
    for (int i = 0; i < sockets.size(); i++) {
      if (i * segment_size >= data_size) {
        break;
      }
      recvs.emplace_back(comm_.recv(
          sockets[i],
          data + i * segment_size,
          std::min(data_size, (i + 1) * segment_size) - i * segment_size));
    }
    for (auto& f : recvs) {
      f.wait();
    }
  }

  int rank_;
  int size_;

  bool verbose_;

  // Runs the per-chunk all reduce / all gather tasks.
  ThreadPool pool_;
  // One send/recv worker per socket.
  CommunicationThreads comm_;

  // Connections made to the right neighbor and accepted from the left one.
  std::vector<int> sockets_right_;
  std::vector<int> sockets_left_;

  // Scratch space for the all reduce: ALL_SUM_BUFFERS buffers of ALL_SUM_SIZE
  // bytes per socket.
  std::vector<char> buffers_;
};

// Reports whether the ring backend can be used. This implementation
// unconditionally returns true.
bool is_available() {
  return true;
}

/**
 * Create the ring group described by the environment.
 *
 * Reads the hostfile path from MLX_HOSTFILE and this process' rank from
 * MLX_RANK. Verbose logging is enabled when MLX_RING_VERBOSE is set,
 * regardless of its value.
 *
 * When either MLX_HOSTFILE or MLX_RANK is missing, returns nullptr, or throws
 * std::runtime_error if `strict` is true. Errors raised while loading the
 * hostfile or connecting the ring propagate to the caller.
 *
 * NOTE: the rank is parsed with std::atoi, so a non-numeric MLX_RANK is read
 * as 0.
 */
std::shared_ptr<GroupImpl> init(bool strict /* = false */) {
  const char* hostfile = std::getenv("MLX_HOSTFILE");
  const char* rank_str = std::getenv("MLX_RANK");
  const char* ring_verbose = std::getenv("MLX_RING_VERBOSE");

  if (!hostfile || !rank_str) {
    if (strict) {
      std::ostringstream msg;
      msg << "[ring] You need to provide via environment variables both a rank (MLX_RANK) "
          << "and a hostfile (MLX_HOSTFILE) but provided MLX_RANK=\""
          << ((rank_str) ? rank_str : "") << "\" and MLX_HOSTFILE=\""
          << ((hostfile) ? hostfile : "") << "\"";
      throw std::runtime_error(msg.str());
    }
    return nullptr;
  }

  auto nodes = load_nodes(hostfile);
  int rank = std::atoi(rank_str);

  // The constructor takes the node list by value; move it to avoid copying
  // every address list.
  return std::make_shared<RingGroup>(
      rank, std::move(nodes), ring_verbose != nullptr);
}

} // namespace mlx::core::distributed::ring


================================================
FILE: mlx/distributed/ring/ring.h
================================================
// Copyright © 2024 Apple Inc.

// Guard against multiple inclusion: a second inclusion would redeclare `init`
// with its default argument, which is ill-formed.
#pragma once

#include "mlx/distributed/distributed.h"

namespace mlx::core::distributed::ring {

using GroupImpl = mlx::core::distributed::detail::GroupImpl;

/** Whether the ring backend can be used. */
bool is_available();

/**
 * Create the ring group configured through the MLX_HOSTFILE and MLX_RANK
 * environment variables. When one of them is missing this returns nullptr,
 * or throws if `strict` is true.
 */
std::shared_ptr<GroupImpl> init(bool strict = false);

} // namespace mlx::core::distributed::ring


================================================
FILE: mlx/distributed/utils.cpp
================================================
// Copyright © 2025 Apple Inc.

#include <netdb.h>
#include <unistd.h>
#include <cstring>
#include <sstream>
#include <thread>

#include "mlx/distributed/utils.h"

namespace mlx::core::distributed::detail {

/**
 * Resolve an ip and a port, both provided as strings, into a socket address.
 *
 * The lookup is done with getaddrinfo for stream sockets of any address
 * family and the first result is returned. Throws std::runtime_error when the
 * lookup fails.
 */
address_t parse_address(const std::string& ip, const std::string& port) {
  struct addrinfo hints;
  std::memset(&hints, 0, sizeof(hints));
  hints.ai_family = AF_UNSPEC;
  hints.ai_socktype = SOCK_STREAM;

  struct addrinfo* found = nullptr;
  if (getaddrinfo(ip.c_str(), port.c_str(), &hints, &found) != 0) {
    std::ostringstream msg;
    msg << "Can't parse address " << ip << ":" << port;
    throw std::runtime_error(msg.str());
  }

  // Copy the first result out before the list is released.
  address_t parsed;
  parsed.len = found->ai_addrlen;
  std::memcpy(&parsed.addr, found->ai_addr, found->ai_addrlen);
  freeaddrinfo(found);

  return parsed;
}

/**
 * Resolve a socket address given as a single "<ip>:<port>" string.
 *
 * The string is split at its first colon. Throws std::runtime_error when it
 * contains no colon.
 */
address_t parse_address(const std::string& ip_port) {
  const auto separator = ip_port.find(":");
  if (separator == std::string::npos) {
    std::ostringstream msg;
    msg << "Can't parse address " << ip_port;
    throw std::runtime_error(msg.str());
  }

  const std::string ip = ip_port.substr(0, separator);
  const std::string port = ip_port.substr(separator + 1);
  return parse_address(ip, port);
}

// Create a new IPv4 stream socket. `tag` prefixes the error message of the
// std::runtime_error thrown when the socket cannot be created.
TCPSocket::TCPSocket(const char* tag)
    : sock_(socket(AF_INET, SOCK_STREAM, 0)) {
  if (sock_ >= 0) {
    return;
  }

  std::ostringstream msg;
  msg << tag << " Couldn't create socket (error: " << errno << ")";
  throw std::runtime_error(msg.str());
}

// Take over the descriptor of `s`, leaving `s` without one.
TCPSocket::TCPSocket(TCPSocket&& s) : sock_(s.sock_) {
  s.sock_ = -1;
}

// Take over the descriptor of `s`, leaving `s` without one. A descriptor
// already owned by this object is shut down and closed first so that it is
// not leaked.
TCPSocket& TCPSocket::operator=(TCPSocket&& s) {
  if (this != &s) {
    if (sock_ >= 0) {
      shutdown(sock_, SHUT_RDWR);
      close(sock_);
    }
    sock_ = s.sock_;
    s.sock_ = -1;
  }
  return *this;
}

TCPSocket::TCPSocket(int s) : sock_(s) {}

// Shut down and close the descriptor if one is still owned. Ownership is
// given up by setting sock_ to -1 (see detach and the move operations), so
// every non-negative value, including descriptor 0, has to be closed.
TCPSocket::~TCPSocket() {
  if (sock_ >= 0) {
    shutdown(sock_, SHUT_RDWR);
    close(sock_);
  }
}

// Give up ownership of the descriptor and return it. Afterwards the object
// holds -1 and the caller is responsible for closing the descriptor.
int TCPSocket::detach() {
  const int released = sock_;
  sock_ = -1;
  return released;
}

// Bind the socket to `addr` and start listening on it. Every failing step
// throws std::runtime_error with a message prefixed by `tag` that includes
// the current errno.
void TCPSocket::listen(const char* tag, const address_t& addr) {
  // Throw the error for a failed step. `what` is the message text up to and
  // including "(error: ".
  auto fail = [tag](const char* what) {
    std::ostringstream msg;
    msg << tag << what << errno << ")";
    throw std::runtime_error(msg.str());
  };

  // Make sure we can launch immediately after shutdown by setting the
  // reuseaddr option so that we don't get address already in use errors
  int enable = 1;
  if (setsockopt(sock_, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)) < 0) {
    fail(" Couldn't enable reuseaddr (error: ");
  }
  if (setsockopt(sock_, SOL_SOCKET, SO_REUSEPORT, &enable, sizeof(int)) < 0) {
    fail(" Couldn't enable reuseport (error: ");
  }

  // Bind the socket to the address and port
  if (bind(sock_, addr.get(), addr.len) < 0) {
    fail(" Couldn't bind socket (error: ");
  }

  // Prepare waiting for connections
  if (::listen(sock_, 0) < 0) {
    fail(" Couldn't listen (error: ");
  }
}

// Accept one connection on this listening socket and return it as a new
// TCPSocket. Throws std::runtime_error, prefixed with `tag`, on failure.
TCPSocket TCPSocket::accept(const char* tag) {
  const int peer = ::accept(sock_, nullptr, nullptr);
  if (peer >= 0) {
    return TCPSocket(peer);
  }

  std::ostringstream msg;
  msg << tag << " Accept failed (error: " << errno << ")";
  throw std::runtime_error(msg.str());
}

// Send exactly `len` bytes starting at `data`, repeating the underlying call
// until everything is written. Throws std::runtime_error, prefixed with
// `tag`, as soon as a call does not make progress.
void TCPSocket::send(const char* tag, const void* data, size_t len) {
  const char* cursor = static_cast<const char*>(data);
  size_t remaining = len;
  while (remaining > 0) {
    const auto sent = ::send(sock_, cursor, remaining, 0);
    if (sent <= 0) {
      std::ostringstream msg;
      msg << tag << " Send failed with errno=" << errno;
      throw std::runtime_error(msg.str());
    }
    cursor += sent;
    remaining -= sent;
  }
}

// Receive exactly `len` bytes into `data`, repeating the underlying call
// until everything has arrived. Throws std::runtime_error, prefixed with
// `tag`, as soon as a call does not make progress.
void TCPSocket::recv(const char* tag, void* data, size_t len) {
  char* cursor = static_cast<char*>(data);
  size_t remaining = len;
  while (remaining > 0) {
    const auto received = ::recv(sock_, cursor, remaining, 0);
    if (received <= 0) {
      std::ostringstream msg;
      msg << tag << " Recv failed with errno=" << errno;
      throw std::runtime_error(msg.str());
    }
    cursor += received;
    remaining -= received;
  }
}

/**
 * Connect to `addr` and return the connected socket.
 *
 * Up to `num_retries` attempts are made. After every failed attempt `cb`, if
 * provided, is called with the attempt index and the current wait, then the
 * calling thread sleeps for `wait` milliseconds and the wait is doubled. The
 * errno of the failed attempt is still set when `cb` runs.
 *
 * Throws std::runtime_error, prefixed with `tag`, when a socket cannot be
 * created or when no attempt succeeds (including `num_retries <= 0`).
 */
TCPSocket TCPSocket::connect(
    const char* tag,
    const address_t& addr,
    int num_retries,
    int wait,
    std::function<void(int, int)> cb) {
  // errno of the most recent failed attempt, reported if all attempts fail.
  int last_error = 0;

  // Attempt to connect `num_retries` times with exponential backoff.
  for (int attempt = 0; attempt < num_retries; attempt++) {
    // Create the socket
    int sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock < 0) {
      std::ostringstream msg;
      msg << tag << " Couldn't create socket to connect (error: " << errno
          << ")";
      throw std::runtime_error(msg.str());
    }

    if (::connect(sock, addr.get(), addr.len) == 0) {
      return TCPSocket(sock);
    }

    // Release the socket of the failed attempt; the next attempt creates a
    // fresh one. close() may change errno, so restore it for the callback.
    last_error = errno;
    ::close(sock);
    errno = last_error;

    if (cb != nullptr) {
      cb(attempt, wait);
    }
    if (wait > 0) {
      std::this_thread::sleep_for(std::chrono::milliseconds(wait));
    }

    wait <<= 1;
  }

  std::ostringstream msg;
  msg << tag << " Couldn't connect (error: " << last_error << ")";
  throw std::runtime_error(msg.str());
}

} // namespace mlx::core::distributed::detail


================================================
FILE: mlx/distributed/utils.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <sys/socket.h>
#include <functional>
#include <string>

namespace mlx::core::distributed::detail {

// A socket address together with the number of bytes of `addr` in use.
struct address_t {
  sockaddr_storage addr;
  socklen_t len;

  // View the stored address through the generic `sockaddr` type expected by
  // the socket calls.
  const sockaddr* get() const {
    return reinterpret_cast<const sockaddr*>(&addr);
  }
};

/**
 * Parse a sockaddr from an ip and port provided as strings.
 */
address_t parse_address(const std::string& ip, const std::string& port);

/**
 * Parse a sockaddr provided as an <ip>:<port> string.
 */
address_t parse_address(const std::string& ip_port);

/**
 * Small wrapper over a TCP socket to simplify initiating connections.
 *
 * The object owns its descriptor: it can be moved but not copied, and the
 * destructor shuts down and closes a descriptor that is still owned. The
 * `tag` argument taken by most members is prepended to the message of the
 * std::runtime_error they throw on failure.
 */
class TCPSocket {
 public:
  // Create a new AF_INET stream socket.
  TCPSocket(const char* tag);
  TCPSocket(const TCPSocket&) = delete;
  TCPSocket& operator=(const TCPSocket&) = delete;
  // Moving transfers the descriptor and leaves the source holding -1.
  TCPSocket(TCPSocket&& s);
  TCPSocket& operator=(TCPSocket&&);
  ~TCPSocket();

  // Enable SO_REUSEADDR and SO_REUSEPORT, bind to `addr` and start listening.
  void listen(const char* tag, const address_t& addr);
  // Accept one connection and return it as a new TCPSocket.
  TCPSocket accept(const char* tag);

  // Transfer exactly `len` bytes, looping over the underlying call; throw as
  // soon as a call returns a value <= 0.
  void send(const char* tag, const void* data, size_t len);
  void recv(const char* tag, void* data, size_t len);

  // Return the descriptor and give up ownership of it (the object keeps -1).
  int detach();

  // The raw descriptor; ownership stays with the object.
  operator int() const {
    return sock_;
  }

  // Connect to `addr`, trying up to `num_retries` times. After a failed
  // attempt `cb(attempt, wait)` is called if provided, the thread sleeps
  // `wait` milliseconds and `wait` is doubled.
  static TCPSocket connect(
      const char* tag,
      const address_t& addr,
      int num_retries = 1,
      int wait = 0,
      std::function<void(int, int)> cb = nullptr);

 private:
  // Wrap an already open descriptor (used by accept and connect).
  TCPSocket(int sock);

  int sock_;
};

} // namespace mlx::core::distributed::detail


================================================
FILE: mlx/dtype.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cstdint>

#include "mlx/dtype.h"

namespace mlx::core {

namespace {

// Number of entries in Dtype::Val and in Dtype::Category respectively. They
// size the lookup tables in this file.
constexpr int num_types = 14;
constexpr int num_cats = 8;

// Kind of every dtype, indexed by the integer value of Dtype::Val. The
// trailing comment on each row names the dtype the row belongs to.
constexpr Dtype::Kind type_kinds[num_types] = {
    Dtype::Kind::b, // bool_,
    Dtype::Kind::u, // uint8,
    Dtype::Kind::u, // uint16,
    Dtype::Kind::u, // uint32,
    Dtype::Kind::u, // uint64,
    Dtype::Kind::i, // int8,
    Dtype::Kind::i, // int16,
    Dtype::Kind::i, // int32,
    Dtype::Kind::i, // int64,
    Dtype::Kind::f, // float16,
    Dtype::Kind::f, // float32,
    Dtype::Kind::f, // float64,
    Dtype::Kind::V, // bfloat16,
    Dtype::Kind::c // complex64,
};

// Promotion table: type_rules[t1][t2] is the dtype that results from
// promoting t1 with t2. Both indices are the integer value of Dtype::Val; the
// header row and the trailing comment of each row name the dtypes.
//
// Following Jax type promotion rules:
// https://jax.readthedocs.io/en/latest/type_promotion.html
// clang-format off
constexpr Dtype type_rules[num_types][num_types] = {
// bool       uint8      uint16     uint32     uint64     int8       int16      int32      int64      float16    float32   float64    bfloat16   complex64
  {bool_,     uint8,     uint16,    uint32,    uint64,    int8,      int16,     int32,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // bool
  {uint8,     uint8,     uint16,    uint32,    uint64,    int16,     int16,     int32,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // uint8
  {uint16,    uint16,    uint16,    uint32,    uint64,    int32,     int32,     int32,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // uint16
  {uint32,    uint32,    uint32,    uint32,    uint64,    int64,     int64,     int64,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // uint32
  {uint64,    uint64,    uint64,    uint64,    uint64,    float32,   float32,   float32,   float32,   float16,   float32,  float64,   bfloat16,  complex64}, // uint64
  {int8,      int16,     int32,     int64,     float32,   int8,      int16,     int32,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // int8
  {int16,     int16,     int32,     int64,     float32,   int16,     int16,     int32,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // int16
  {int32,     int32,     int32,     int64,     float32,   int32,     int32,     int32,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // int32
  {int64,     int64,     int64,     int64,     float32,   int64,     int64,     int64,     int64,     float16,   float32,  float64,   bfloat16,  complex64}, // int64
  {float16,   float16,   float16,   float16,   float16,   float16,   float16,   float16,   float16,   float16,   float32,  float64,   float32,   complex64}, // float16
  {float32,   float32,   float32,   float32,   float32,   float32,   float32,   float32,   float32,   float32,   float32,  float64,   float32,   complex64}, // float32
  {float64,   float64,   float64,   float64,   float64,   float64,   float64,   float64,   float64,   float64,   float64,  float64,   float64,   complex64}, // float64
  {bfloat16,  bfloat16,  bfloat16,  bfloat16,  bfloat16,  bfloat16,  bfloat16,  bfloat16,  bfloat16,  float32,   float32,  float64,   bfloat16,  complex64}, // bfloat16
  {complex64, complex64, complex64, complex64, complex64, complex64, complex64, complex64, complex64, complex64, complex64,complex64, complex64, complex64}, // complex64
};


// Category hierarchy: subcategory_to_category[a][b] is true when category `a`
// is the same as, or a sub-category of, category `b`. Both indices are the
// integer value of Dtype::Category (row = a, column = b).
constexpr bool subcategory_to_category[num_cats][num_cats] = {
// complexfloating floating inexact signedinteger unsignedinteger integer number generic
  {true,           false,   true,   false,        false,          false,  true,  true}, // complexfloating
  {false,          true,    true,   false,        false,          false,  true,  true}, // floating
  {false,          false,   true,   false,        false,          false,  true,  true}, // inexact
  {false,          false,   false,  true,         false,          true,   true,  true}, // signedinteger
  {false,          false,   false,  false,        true,           true,   true,  true}, // unsignedinteger
  {false,          false,   false,  false,        false,          true,   true,  true}, // integer
  {false,          false,   false,  false,        false,          false,  true,  true}, // number
  {false,          false,   false,  false,        false,          false,  false, true}, // generic
};

// Category assigned to every dtype, indexed by the integer value of
// Dtype::Val. Note that bool_ maps straight to `generic`, so it is not a
// member of `number` or `integer`.
constexpr Dtype::Category type_to_category[num_types] = {
    Dtype::Category::generic, // bool_,
    Dtype::Category::unsignedinteger, // uint8,
    Dtype::Category::unsignedinteger, // uint16,
    Dtype::Category::unsignedinteger, // uint32,
    Dtype::Category::unsignedinteger, // uint64,
    Dtype::Category::signedinteger, // int8,
    Dtype::Category::signedinteger, // int16,
    Dtype::Category::signedinteger, // int32,
    Dtype::Category::signedinteger, // int64,
    Dtype::Category::floating, // float16,
    Dtype::Category::floating, // float32,
    Dtype::Category::floating, // float64,
    Dtype::Category::floating, // bfloat16,
    Dtype::Category::complexfloating, // complex64,
};

// clang-format on

} // namespace

// Look up the dtype that results from combining `t1` and `t2` in the
// promotion table.
Dtype promote_types(const Dtype& t1, const Dtype& t2) {
  const auto row = static_cast<int>(t1.val());
  const auto col = static_cast<int>(t2.val());
  return Dtype(type_rules[row][col]);
}

// Return the kind (bool, unsigned, signed, float, complex, ...) of `t`.
Dtype::Kind kindof(const Dtype& t) {
  const auto index = static_cast<int>(t.val());
  return type_kinds[index];
}

// Explicit instantiations of TypeToDtype, one per C++ element type that has a
// conversion operator specialized further down in this file.
// NOTE(review): MLX_API presumably marks the instantiations for export from
// the shared library -- confirm against mlx/api.h.
template class MLX_API TypeToDtype<bool>;
template class MLX_API TypeToDtype<uint8_t>;
template class MLX_API TypeToDtype<uint16_t>;
template class MLX_API TypeToDtype<uint32_t>;
template class MLX_API TypeToDtype<uint64_t>;
template class MLX_API TypeToDtype<int8_t>;
template class MLX_API TypeToDtype<int16_t>;
template class MLX_API TypeToDtype<int32_t>;
template class MLX_API TypeToDtype<int64_t>;
template class MLX_API TypeToDtype<float16_t>;
template class MLX_API TypeToDtype<float>;
template class MLX_API TypeToDtype<double>;
template class MLX_API TypeToDtype<bfloat16_t>;
template class MLX_API TypeToDtype<complex64_t>;

// Conversion operator specializations: each one maps a C++ element type to
// the dtype constant that represents it.
template <>
TypeToDtype<bool>::operator Dtype() {
  return bool_;
}

template <>
TypeToDtype<uint8_t>::operator Dtype() {
  return uint8;
}

template <>
TypeToDtype<uint16_t>::operator Dtype() {
  return uint16;
}

template <>
TypeToDtype<uint32_t>::operator Dtype() {
  return uint32;
}

template <>
TypeToDtype<uint64_t>::operator Dtype() {
  return uint64;
}

template <>
TypeToDtype<int8_t>::operator Dtype() {
  return int8;
}

template <>
TypeToDtype<int16_t>::operator Dtype() {
  return int16;
}

template <>
TypeToDtype<int32_t>::operator Dtype() {
  return int32;
}

template <>
TypeToDtype<int64_t>::operator Dtype() {
  return int64;
}

template <>
TypeToDtype<float16_t>::operator Dtype() {
  return float16;
}

template <>
TypeToDtype<float>::operator Dtype() {
  return float32;
}

// NOTE(review): `double` maps to float32, not float64. This looks deliberate
// (presumably so that values built from double literals get the default
// floating point dtype) -- confirm before changing it.
template <>
TypeToDtype<double>::operator Dtype() {
  return float32;
}

template <>
TypeToDtype<bfloat16_t>::operator Dtype() {
  return bfloat16;
}

template <>
TypeToDtype<complex64_t>::operator Dtype() {
  return complex64;
}

bool issubdtype(const Dtype& a, const Dtype& b) {
  return a == b;
}

// A category is never a sub-dtype of a concrete dtype, so this overload
// answers false for every argument pair.
bool issubdtype(const Dtype::Category& cat, const Dtype& type) {
  static_cast<void>(cat);
  static_cast<void>(type);
  return false;
}

// A concrete dtype belongs to `cat` when the category assigned to it in
// type_to_category is `cat` or one of its sub-categories.
bool issubdtype(const Dtype& type, const Dtype::Category& cat) {
  const auto type_index = static_cast<uint32_t>(type.val());
  return issubdtype(type_to_category[type_index], cat);
}

// Category `a` is a sub-dtype of category `b` when the hierarchy table says
// so (row = a, column = b).
bool issubdtype(const Dtype::Category& a, const Dtype::Category& b) {
  const auto row = static_cast<uint32_t>(a);
  const auto col = static_cast<uint32_t>(b);
  return subcategory_to_category[row][col];
}

} // namespace mlx::core


================================================
FILE: mlx/dtype.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <complex>
#include <cstdint>

#include "mlx/api.h"
#include "mlx/types/complex.h"
#include "mlx/types/half_types.h"

namespace mlx::core {

// Describes an element type: a type tag plus the size in bytes of one
// element.
struct Dtype {
  // The concrete element types. The order of the enumerators matters: their
  // integer values are used as indices into the lookup tables in dtype.cpp.
  enum class Val {
    bool_,
    uint8,
    uint16,
    uint32,
    uint64,
    int8,
    int16,
    int32,
    int64,
    float16,
    float32,
    float64,
    bfloat16,
    complex64,
  };

  // Single letter classification of a dtype.
  enum class Kind {
    b, // bool
    u, // unsigned int
    i, // signed int
    f, // float
    c, // complex
    V, // void - used for brain float
  };

  // Abstract groupings of dtypes. The order of the enumerators matters here
  // as well, it indexes the category tables in dtype.cpp.
  enum class Category {
    complexfloating,
    floating,
    inexact,
    signedinteger,
    unsignedinteger,
    integer,
    number,
    generic,
  };

  // Build a dtype from its tag and its element size in bytes.
  constexpr explicit Dtype(Val val, uint8_t size) : val_{val}, size_{size} {}

  // The type tag.
  constexpr Val val() const {
    return val_;
  }

  // Implicit conversion to the tag, which lets a Dtype be compared with `==`
  // and used directly as a `switch` condition or `case` label.
  constexpr operator Val() const {
    return val();
  }

  // Size in bytes of one element.
  constexpr uint8_t size() const {
    return size_;
  }

 private:
  Val val_;
  uint8_t size_;
};

// One constant per dtype. The second argument is the size in bytes of a
// single element.
inline constexpr Dtype bool_{Dtype::Val::bool_, sizeof(bool)};

inline constexpr Dtype uint8{Dtype::Val::uint8, sizeof(uint8_t)};
inline constexpr Dtype uint16{Dtype::Val::uint16, sizeof(uint16_t)};
inline constexpr Dtype uint32{Dtype::Val::uint32, sizeof(uint32_t)};
inline constexpr Dtype uint64{Dtype::Val::uint64, sizeof(uint64_t)};

inline constexpr Dtype int8{Dtype::Val::int8, sizeof(int8_t)};
inline constexpr Dtype int16{Dtype::Val::int16, sizeof(int16_t)};
inline constexpr Dtype int32{Dtype::Val::int32, sizeof(int32_t)};
inline constexpr Dtype int64{Dtype::Val::int64, sizeof(int64_t)};

// The two half precision types take their size from uint16_t rather than
// from the half types themselves.
inline constexpr Dtype float16{Dtype::Val::float16, sizeof(uint16_t)};
inline constexpr Dtype float32{Dtype::Val::float32, sizeof(float)};
inline constexpr Dtype float64{Dtype::Val::float64, sizeof(double)};
inline constexpr Dtype bfloat16{Dtype::Val::bfloat16, sizeof(uint16_t)};
inline constexpr Dtype complex64{Dtype::Val::complex64, sizeof(complex64_t)};

// Namespace level aliases for the Dtype::Category enumerators so that they
// can be written without the `Dtype::Category::` prefix.
inline constexpr Dtype::Category complexfloating =
    Dtype::Category::complexfloating;
inline constexpr Dtype::Category floating = Dtype::Category::floating;
inline constexpr Dtype::Category inexact = Dtype::Category::inexact;
inline constexpr Dtype::Category signedinteger = Dtype::Category::signedinteger;
inline constexpr Dtype::Category unsignedinteger =
    Dtype::Category::unsignedinteger;
inline constexpr Dtype::Category integer = Dtype::Category::integer;
inline constexpr Dtype::Category number = Dtype::Category::number;
inline constexpr Dtype::Category generic = Dtype::Category::generic;

MLX_API bool issubdtype(const Dtype& a, const Dtype& b);
MLX_API bool issubdtype(const Dtype::Category& a, const Dtype& b);
MLX_API bool issubdtype(const Dtype& a, const Dtype::Category& b);
MLX_API bool issubdtype(const Dtype::Category& a, const Dtype::Category& b);

MLX_API Dtype promote_types(const Dtype& t1, const Dtype& t2);

// Size in bytes of a single element of dtype `t`.
inline uint8_t size_of(const Dtype& t) {
  const uint8_t num_bytes = t.size();
  return num_bytes;
}

MLX_API Dtype::Kind kindof(const Dtype& t);

// Maps a C++ element type `T` to its Dtype through an implicit conversion.
// Only the declaration lives here; the conversion operator is specialized
// per supported type in dtype.cpp.
template <typename T>
struct MLX_API TypeToDtype {
  operator Dtype();
};

} // namespace mlx::core


================================================
FILE: mlx/dtype_utils.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/dtype_utils.h"

namespace mlx::core {

// Name of a dtype as a string literal, or "unknown" when the tag is not one
// of the known dtypes.
const char* dtype_to_string(Dtype arg) {
  using Val = Dtype::Val;
  const char* name = "unknown";
  switch (arg.val()) {
    case Val::bool_:
      name = "bool";
      break;
    case Val::uint8:
      name = "uint8";
      break;
    case Val::uint16:
      name = "uint16";
      break;
    case Val::uint32:
      name = "uint32";
      break;
    case Val::uint64:
      name = "uint64";
      break;
    case Val::int8:
      name = "int8";
      break;
    case Val::int16:
      name = "int16";
      break;
    case Val::int32:
      name = "int32";
      break;
    case Val::int64:
      name = "int64";
      break;
    case Val::float16:
      name = "float16";
      break;
    case Val::float32:
      name = "float32";
      break;
    case Val::float64:
      name = "float64";
      break;
    case Val::bfloat16:
      name = "bfloat16";
      break;
    case Val::complex64:
      name = "complex64";
      break;
  }
  return name;
}

} // namespace mlx::core


================================================
FILE: mlx/dtype_utils.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <sstream>

#include "mlx/dtype.h"
#include "mlx/utils.h"

namespace mlx::core {

// Return string representation of dtype.
const char* dtype_to_string(Dtype arg);

// Building blocks for the dispatch_* functions below. They expand to `case`
// labels and are therefore only valid inside a `switch` over a dtype, in a
// scope where a callable named `f` is visible.

// One case: when the dtype is DTYPE, call `f` with a type_identity<TYPE> tag.
#define MLX_INTERNAL_DTYPE_SWITCH_CASE(DTYPE, TYPE) \
  case DTYPE:                                       \
    f(type_identity<TYPE>{});                       \
    break

// Cases for the eight signed and unsigned integer dtypes (bool_ is not
// included).
#define MLX_INTERNAL_DTYPE_SWITCH_INTS()            \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(int8, int8_t);     \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(int16, int16_t);   \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(int32, int32_t);   \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(int64, int64_t);   \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint8, uint8_t);   \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint16, uint16_t); \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint32, uint32_t); \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(uint64, uint64_t)

// Cases for the four real floating point dtypes.
#define MLX_INTERNAL_DTYPE_SWITCH_FLOATS()              \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(float16, float16_t);   \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(bfloat16, bfloat16_t); \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(float32, float);       \
  MLX_INTERNAL_DTYPE_SWITCH_CASE(float64, double)

// Empty tag type that carries a C++ type `T` as the nested alias `type`. The
// dispatch functions pass an instance of it to the callback so that the
// callback can recover the element type.
//
// This already exists in C++20 but in C++20 we can also just use templated
// lambdas which will make this so much nicer.
template <typename T>
struct type_identity {
  using type = T;
};

// Helpers for use inside a dispatch callback: MLX_GET_TYPE(x) names the
// nested `type` of the tag `x`, MLX_GET_VALUE(x) names its nested `value`.
#define MLX_GET_TYPE(x) typename decltype(x)::type
#define MLX_GET_VALUE(x) decltype(x)::value

// Call `f` with a type_identity<T> tag where T is the C++ type matching `dt`.
// Every dtype (bool, integers, floats and complex64) has a case; a tag value
// outside of the known dtypes results in no call at all.
template <typename F>
void dispatch_all_types(Dtype dt, F&& f) {
  switch (dt.val()) {
    MLX_INTERNAL_DTYPE_SWITCH_CASE(bool_, bool);
    MLX_INTERNAL_DTYPE_SWITCH_INTS();
    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
    MLX_INTERNAL_DTYPE_SWITCH_CASE(complex64, complex64_t);
  }
}

// Call `f` with a type_identity<T> tag where T is the integer type matching
// `dt`. Any other dtype (bool included) raises std::invalid_argument with a
// message that starts with `tag`.
template <typename F>
void dispatch_int_types(Dtype dt, std::string_view tag, F&& f) {
  switch (dt.val()) {
    MLX_INTERNAL_DTYPE_SWITCH_INTS();
    default: {
      std::ostringstream err;
      err << tag << " Only integer types supported but " << dt
          << " was provided";
      throw std::invalid_argument(err.str());
    }
  }
}

// Call `f` with a type_identity<T> tag where T is the real floating point
// type matching `dt`. Any other dtype raises std::invalid_argument with a
// message that starts with `tag`.
template <typename F>
void dispatch_float_types(Dtype dt, std::string_view tag, F&& f) {
  switch (dt.val()) {
    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
    default: {
      std::ostringstream err;
      err << tag << " Only float types supported but " << dt << " was provided";
      throw std::invalid_argument(err.str());
    }
  }
}

// Call `f` with a type_identity<T> tag where T is the floating point or
// complex type matching `dt`. Any other dtype raises std::invalid_argument
// with a message that starts with `tag`.
template <typename F>
void dispatch_inexact_types(Dtype dt, std::string_view tag, F&& f) {
  switch (dt.val()) {
    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
    MLX_INTERNAL_DTYPE_SWITCH_CASE(complex64, complex64_t);
    default: {
      std::ostringstream err;
      err << tag << " Only inexact (float/complex) types supported but " << dt
          << " was provided";
      throw std::invalid_argument(err.str());
    }
  }
}

// Call `f` with a type_identity<T> tag where T is the integer or real
// floating point type matching `dt`. Any other dtype (bool and complex64)
// raises std::invalid_argument with a message that starts with `tag`.
template <typename F>
void dispatch_int_float_types(Dtype dt, std::string_view tag, F&& f) {
  switch (dt.val()) {
    MLX_INTERNAL_DTYPE_SWITCH_INTS();
    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
    default: {
      std::ostringstream err;
      err << tag << " Only integer and float types supported but " << dt
          << " was provided";
      throw std::invalid_argument(err.str());
    }
  }
}

// Call `f` with a type_identity<T> tag where T is the bool, integer or real
// floating point type matching `dt`. Any other dtype (complex64) raises
// std::invalid_argument with a message that starts with `tag`.
template <typename F>
void dispatch_real_types(Dtype dt, std::string_view tag, F&& f) {
  switch (dt.val()) {
    MLX_INTERNAL_DTYPE_SWITCH_CASE(bool_, bool);
    MLX_INTERNAL_DTYPE_SWITCH_INTS();
    MLX_INTERNAL_DTYPE_SWITCH_FLOATS();
    default: {
      std::ostringstream err;
      err << tag << " Only real numbers supported but " << dt
          << " was provided";
      throw std::invalid_argument(err.str());
    }
  }
}

} // namespace mlx::core


================================================
FILE: mlx/einsum.cpp
================================================
// Copyright © 2024 Apple Inc.
#include <numeric>
#include <sstream>
#include <unordered_map>
#include <unordered_set>

#include "mlx/einsum.h"
#include "mlx/ops.h"

namespace mlx::core {

namespace {

// The MLX einsum implementation is based on NumPy (which is based on
// opt_einsum):
// https://github.com/numpy/numpy/blob/1d49c7f7ff527c696fc26ab2278ad51632a66660/numpy/_core/einsumfunc.py#L743
// https://github.com/dgasmith/opt_einsum

using CharSet = std::unordered_set<char>;

// A helper struct to hold the string and set
// representation of a subscript to avoid needing
// to recompute the set
struct Subscript {
  Subscript(std::string str, CharSet set)
      : str(std::move(str)), set(std::move(set)) {};
  std::string str;
  CharSet set;
};

// Plain aggregate of cost figures for a contraction path.
// NOTE(review): the meaning of each field is inferred from its name; the code
// that fills the struct in is outside of this part of the file -- confirm.
struct PathInfo {
  size_t naive_cost;
  size_t naive_scaling;
  size_t optimized_cost;
  size_t optimized_scaling;
  size_t largest_term;
};

// One step of a contraction path: the input subscripts of the step, the
// subscript of its result and the positions of the inputs in the operand
// list at the time the step is taken.
struct PathNode {
  std::vector<Subscript> inputs;
  Subscript output;

  std::vector<int> positions;

  PathNode(
      std::vector<Subscript> inputs,
      Subscript output,
      std::vector<int> positions)
      : inputs(std::move(inputs)),
        output(std::move(output)),
        positions(std::move(positions)) {}
};

// Split an einsum subscript string into the list of per-operand subscripts
// and the output subscript. When there is no "->" the output is inferred.
//
// For example:
//  "ij,jk -> ik" gives {{"ij", "jk"}, "ik"}
//  "ij,jk" gives {{"ij", "jk"}, "ik"}
std::pair<std::vector<std::string>, std::string> parse(std::string subscripts) {
  // Spaces carry no meaning, drop them before doing anything else
  subscripts.erase(
      std::remove(subscripts.begin(), subscripts.end(), ' '), subscripts.end());

  std::string operand_part;
  std::string result;

  const auto arrow = subscripts.find("->");
  if (arrow != std::string::npos) {
    // Explicit mode: everything after the arrow is the output
    operand_part = subscripts.substr(0, arrow);
    result = subscripts.substr(arrow + 2);
  } else {
    // Implicit mode:
    // - characters that occur more than once are summed over
    // - an ellipsis goes to the front of the output
    // - the characters that occur exactly once follow in sorted order
    operand_part = subscripts;
    std::unordered_map<char, int> occurrences;
    for (char ch : subscripts) {
      if (ch == ',') {
        continue;
      }
      // Only the very first dot adds the ellipsis to the output. Later dots
      // are counted like any other character; a well formed "..." leaves at
      // least two of them, so they never count as occurring once.
      if (ch == '.' && result.empty()) {
        result += "...";
        continue;
      }
      ++occurrences[ch];
    }
    for (const auto& [ch, count] : occurrences) {
      if (count == 1) {
        result += ch;
      }
    }
    // '.' orders before letters, which keeps the ellipsis in front
    std::sort(result.begin(), result.end());
  }

  // Split the operand part on commas
  std::vector<std::string> input_list;
  std::istringstream stream(operand_part);
  for (std::string token; std::getline(stream, token, ',');) {
    input_list.push_back(token);
  }
  return {input_list, result};
}

// Check if two sets are disjoint
bool disjoint(const CharSet& x, const CharSet& y) {
  for (auto& c : x) {
    if (y.find(c) != y.end()) {
      return false;
    }
  }
  return true;
}

template <typename T>
size_t term_size(const T& term, std::unordered_map<char, ShapeElem> dict) {
  size_t size = 1;
  for (auto c : term) {
    size *= dict[c];
  }
  return size;
}

// Estimate of the cost of a contraction over the subscripts in `term`: the
// number of elements spanned by `term` times an operation factor. The factor
// is max(1, num_terms - 1), plus one when `inner` is set.
//
// The dimension map is taken by const reference. It used to be taken by
// value, which copied the whole map on every call.
size_t flop_count(
    const CharSet& term,
    bool inner,
    int num_terms,
    const std::unordered_map<char, ShapeElem>& dict) {
  size_t size = term_size(term, dict);
  int op_factor = std::max(1, num_terms - 1);
  if (inner) {
    op_factor += 1;
  }
  return size * op_factor;
}

// Cost and scaling of contracting all of `inputs` into `output` in a single
// step. Returns {cost, scaling} where the cost comes from flop_count over the
// union of all input subscripts and the scaling is the number of distinct
// subscript characters in that union.
//
// The dimension map is taken by const reference. It used to be taken by
// value, which copied the whole map on every call.
std::pair<size_t, int> compute_cost_and_scaling(
    const std::vector<Subscript>& inputs,
    const Subscript& output,
    const std::unordered_map<char, ShapeElem>& dim_map) {
  // Union of the subscripts of all inputs
  CharSet contractions;
  for (auto& in : inputs) {
    contractions.insert(in.set.begin(), in.set.end());
  }

  // At least one subscript is summed away when it is missing from the output
  bool inner = std::any_of(
      contractions.begin(), contractions.end(), [&output](char c) {
        return output.set.find(c) == output.set.end();
      });

  auto cost = flop_count(contractions, inner, inputs.size(), dim_map);
  return {cost, contractions.size()};
}

// Build a contraction path greedily, one pairwise contraction at a time.
//
// In every round the candidate pairs are scored and the one that removes the
// most elements is taken (ties go to the cheaper one). Candidates whose
// result is larger than `memory_limit`, or which would push the accumulated
// cost above `cost_limit`, are not considered. When no candidate is left the
// remaining inputs are contracted together in one final step.
//
// Returns {path, total cost of the path, largest number of distinct
// subscripts taking part in a single step}.
std::tuple<std::vector<PathNode>, size_t, int> greedy_path(
    std::vector<Subscript> inputs,
    const Subscript& output,
    std::unordered_map<char, ShapeElem> dim_map,
    size_t cost_limit,
    size_t memory_limit) {
  // Helper struct for building the greedy path
  struct Contraction {
    Contraction(
        size_t size,
        size_t cost,
        CharSet output,
        int dims,
        int x,
        int y)
        : size(size),
          cost(cost),
          output(std::move(output)),
          dims(dims),
          x(x),
          y(y) {};

    int64_t size; // Size difference, can be negative
    size_t cost;
    CharSet output;
    int dims; // Number of dimensions in the contraction
    int x;
    int y;
  };

  // Start by iterating over all possible combinations
  std::vector<std::pair<int, int>> pos_pairs;
  for (int i = 0; i < inputs.size(); ++i) {
    for (int j = i + 1; j < inputs.size(); ++j) {
      pos_pairs.emplace_back(i, j);
    }
  }

  std::vector<PathNode> path;
  std::vector<Contraction> possible_contractions;
  size_t path_cost = 0;
  int path_scaling = 0;
  auto num_in = inputs.size();
  // Each round replaces two inputs by one, so at most num_in - 1 rounds
  for (int i = 0; i < num_in - 1; ++i) {
    // Score the contraction of inputs p1 and p2 and, unless it exceeds one of
    // the limits, append it to possible_contractions
    auto add_contraction = [&](int p1, int p2) {
      CharSet new_term;
      CharSet contractions(inputs[p1].set.begin(), inputs[p1].set.end());
      contractions.insert(inputs[p2].set.begin(), inputs[p2].set.end());
      // The result keeps every subscript of the pair that is still needed,
      // either by another input ...
      for (int i = 0; i < inputs.size(); i++) {
        if (i == p1 || i == p2) {
          continue;
        }
        auto& in = inputs[i].set;
        for (auto c : in) {
          if (contractions.find(c) != contractions.end()) {
            new_term.insert(c);
          }
        }
      }
      // ... or by the final output
      for (auto c : output.set) {
        if (contractions.find(c) != contractions.end()) {
          new_term.insert(c);
        }
      }

      // Ignore if:
      // - The size of the new result is greater than the memory limit
      // - The cost is larger than the naive cost
      auto new_size = term_size(new_term, dim_map);
      if (new_size > memory_limit) {
        return;
      }
      // Elements freed by the step: the two inputs go away, the result is
      // added. Negative when the result is larger than both inputs combined.
      int64_t removed_size = term_size(inputs[p1].set, dim_map) +
          term_size(inputs[p2].set, dim_map) - new_size;

      // Something is summed away when the result has fewer subscripts than
      // the pair taken together
      bool inner = contractions.size() > new_term.size();
      auto cost = flop_count(contractions, inner, 2, dim_map);
      if (path_cost + cost > cost_limit) {
        return;
      }
      possible_contractions.emplace_back(
          removed_size, cost, std::move(new_term), contractions.size(), p1, p2);
    };

    for (auto& [p1, p2] : pos_pairs) {
      // Ignore outer products
      if (!disjoint(inputs[p1].set, inputs[p2].set)) {
        add_contraction(p1, p2);
      }
    }

    // If there's nothing in the contraction list,
    // go over the pairs again without ignoring outer products
    if (possible_contractions.empty()) {
      for (auto& [p1, p2] : pos_pairs) {
        add_contraction(p1, p2);
      }
    }

    if (possible_contractions.empty()) {
      // Default to naive einsum for the remaining inputs
      std::vector<int> positions(inputs.size());
      std::iota(positions.begin(), positions.end(), 0);
      auto [cost, scale] = compute_cost_and_scaling(inputs, output, dim_map);
      path.emplace_back(std::move(inputs), output, std::move(positions));

      path_cost += cost;
      path_scaling = std::max(scale, path_scaling);
      break;
    }

    // Find the best contraction
    // (the comparator orders by largest size first and then by lowest cost,
    // so the "minimum" element is the candidate that removes the most)
    auto& best = *std::min_element(
        possible_contractions.begin(),
        possible_contractions.end(),
        [](const auto& x, const auto& y) {
          return x.size > y.size || (x.size == y.size && x.cost < y.cost);
        });
    path_scaling = std::max(best.dims, path_scaling);

    // Construct the output subscripts
    std::string out_str(best.output.begin(), best.output.end());
    // TODO, sorting by dimension size seems suboptimal?
    std::sort(out_str.begin(), out_str.end(), [&dim_map](auto x, auto y) {
      return dim_map[x] < dim_map[y];
    });
    Subscript new_output(std::move(out_str), std::move(best.output));

    // Add the chosen contraction to the path
    {
      std::vector<Subscript> in_terms;
      in_terms.push_back(std::move(inputs[best.x]));
      in_terms.push_back(std::move(inputs[best.y]));
      path.emplace_back(
          std::move(in_terms), new_output, std::vector<int>{best.x, best.y});
    }
    // Remove used terms
    // (best.x < best.y, so erasing y first leaves the index x valid)
    inputs.erase(inputs.begin() + best.y);
    inputs.erase(inputs.begin() + best.x);

    // Add the new result
    inputs.push_back(std::move(new_output));

    // Update the existing contractions based on the selected one
    std::vector<Contraction> updated_contractions;
    for (auto& contraction : possible_contractions) {
      // Drop contractions which contain either selected term
      if (contraction.x == best.x || contraction.x == best.y ||
          contraction.y == best.x || contraction.y == best.y) {
        continue;
      }

      // Update the positions of other contractions
      // (every index moves down by one for each erased input in front of it)
      int x =
          contraction.x - (contraction.x > best.x) - (contraction.x > best.y);
      int y =
          contraction.y - (contraction.y > best.x) - (contraction.y > best.y);
      contraction.x = x;
      contraction.y = y;
      updated_contractions.push_back(std::move(contraction));
    }

    // Only pairs with the newly added term (the last input) are scored in the
    // next round, the surviving candidates are carried over
    pos_pairs.clear();
    for (int i = 0; i < inputs.size() - 1; ++i) {
      pos_pairs.emplace_back(i, inputs.size() - 1);
    }
    path_cost += best.cost;

    possible_contractions = std::move(updated_contractions);
  }
  return {path, path_cost, path_scaling};
}

// Decide whether a step can be computed with the batched tensordot: it must
// have exactly two inputs and at least one subscript of the first input must
// be absent from the output, i.e. something is being contracted.
//
// Assumes that repeated subscripts and single axis sums have already been
// collapsed in the inputs.
bool can_dot(const std::vector<Subscript>& inputs, const Subscript& output) {
  if (inputs.size() != 2) {
    return false;
  }

  const auto& first = inputs[0].set;
  return std::any_of(first.begin(), first.end(), [&output](char c) {
    return output.set.find(c) == output.set.end();
  });
}

// Contract `a` with `b` through a single batched matmul.
//
// The axes of each operand come in three groups: the batch axes, the axes
// being contracted and the remaining ("concat") axes. Both operands are
// transposed and reshaped to batch dimensions followed by two flattened
// dimensions, multiplied, and the product is reshaped to
// (batch dims of a, concat dims of a, concat dims of b).
array batch_tensordot(
    array a,
    array b,
    std::vector<int> a_contract,
    std::vector<int> a_batch,
    std::vector<int> a_concat,
    std::vector<int> b_contract,
    std::vector<int> b_batch,
    std::vector<int> b_concat,
    StreamOrDevice s) {
  // Bring the paired contracting dimensions to a common size by broadcasting
  {
    auto lhs_shape = a.shape();
    auto rhs_shape = b.shape();
    for (int n = 0; n < a_contract.size(); ++n) {
      const auto lhs_ax = a_contract[n];
      const auto rhs_ax = b_contract[n];
      auto dim = std::max(a.shape(lhs_ax), b.shape(rhs_ax));
      lhs_shape[lhs_ax] = dim;
      rhs_shape[rhs_ax] = dim;
    }
    a = broadcast_to(a, lhs_shape, s);
    b = broadcast_to(b, rhs_shape, s);
  }

  // Shape of the final result, taken before the operands are flattened
  Shape out_shape;
  for (auto ax : a_batch) {
    out_shape.push_back(a.shape(ax));
  }
  for (auto ax : a_concat) {
    out_shape.push_back(a.shape(ax));
  }
  for (auto ax : b_concat) {
    out_shape.push_back(b.shape(ax));
  }

  // Reorder the axes of x to (lead..., rows..., cols...) and flatten the
  // `rows` and the `cols` groups into one dimension each
  auto to_batched_matrix = [&s](
                               const array& x,
                               const std::vector<int>& lead,
                               const std::vector<int>& rows,
                               const std::vector<int>& cols) {
    std::vector<int> perm(lead.begin(), lead.end());
    perm.insert(perm.end(), rows.begin(), rows.end());
    perm.insert(perm.end(), cols.begin(), cols.end());

    auto flat_size = [&x](const std::vector<int>& axes) {
      int total = 1;
      for (auto ax : axes) {
        total *= x.shape(ax);
      }
      return total;
    };
    int num_rows = flat_size(rows);
    int num_cols = flat_size(cols);

    Shape shape;
    for (auto ax : lead) {
      shape.push_back(x.shape(ax));
    }
    shape.push_back(num_rows);
    shape.push_back(num_cols);

    return reshape(transpose(x, perm, s), std::move(shape), s);
  };

  a = to_batched_matrix(a, a_batch, a_concat, a_contract);
  b = to_batched_matrix(b, b_batch, b_contract, b_concat);

  return reshape(matmul(a, b, s), std::move(out_shape), s);
}

// Collapse repeated subscripts and return the resulting array. The subscript
// is also updated in place. For example:
// - Given an input with shape (4, 4) and subscript "ii", returns
//   the diagonal of shape (4,) and updates the subscript to "i".
// - Given an input with shape (4, 2, 4, 2) and subscript "ijij",
//   returns an output with shape (4, 2) and updates the subscript
//   to "ij".
//
// Only `subscript.str` is rewritten, `subscript.set` is left as it is. In the
// new string the repeated characters come first, followed by the characters
// that were not repeated in their original order.
array collapse_repeats(array in, Subscript& subscript, StreamOrDevice s) {
  // Build a list of (repeat chars, num repeats)
  auto& str = subscript.str;
  std::vector<std::pair<char, int>> repeats;
  std::string new_str;
  {
    std::string repeat_str;
    std::string no_repeat_str;
    // Number of occurrences of every character of the subscript
    std::unordered_map<char, int> counts;
    for (int i = 0; i < str.size(); ++i) {
      auto [it, _] = counts.insert({str[i], 0});
      it->second++;
    }

    // The repeated characters are collected in the iteration order of the
    // map; `repeats` and `repeat_str` share that order
    for (auto& v : counts) {
      if (v.second > 1) {
        repeats.emplace_back(v.first, v.second);
        repeat_str += v.first;
      }
    }
    for (auto& c : str) {
      if (counts[c] == 1) {
        no_repeat_str += c;
      }
    }
    new_str = repeat_str + no_repeat_str;
  }

  // Build the inputs for gather
  auto slice_sizes = in.shape();
  std::vector<int> axes;
  std::vector<array> indices;
  int n_expand = repeats.size();
  for (auto [c, v] : repeats) {
    // Every axis labelled with the repeated character is indexed, taking a
    // slice of size one along it
    for (int i = 0; i < str.size(); ++i) {
      if (str[i] == c) {
        slice_sizes[i] = 1;
        axes.push_back(i);
      }
    }
    // Index array 0..dim-1 with trailing singleton dimensions. Each
    // successive repeated character gets one dimension fewer.
    Shape idx_shape(n_expand--, 1);
    idx_shape[0] = in.shape(axes.back());
    auto idx = reshape(
        arange(static_cast<ShapeElem>(in.shape(axes.back())), s), idx_shape, s);
    // The same index array is used for all axes of this character, which
    // selects the diagonal across them
    for (int i = 0; i < v; ++i) {
      indices.push_back(idx);
    }
  }

  in = gather(in, indices, axes, slice_sizes, s);

  // Update subscript string with removed dups
  str = new_str;

  // Squeeze singleton dimensions left over from the gather
  // (the gathered axes are offset by the number of dimensions of the index
  // arrays)
  for (auto& ax : axes) {
    ax += indices[0].ndim();
  }

  return squeeze(in, axes, s);
}

// Collapse repeat indices and sum single dimensions.
// For example:
// - "aa" becomes "a"
// - "ij,jk->k" becoms "j,jk->k"
void preprocess_einsum_inputs(
    std::vector<Subscript>& inputs,
    const Subscript& output,
    const std::vector<int>& positions,
    std::vector<array>& operands,
    StreamOrDevice s) {
  // Collapse repeat indices
  for (int i = 0; i < inputs.size(); ++i) {
    auto& in = inputs[i];
    if (in.set.size() < in.str.size()) {
      operands[positions[i]] = collapse_repeats(operands[positions[i]], in, s);
    }
  }

  // Sum indices that are only in a single input
  {
    std::unordered_map<char, int> counts;
    for (auto& in : inputs) {
      for (auto c : in.set) {
        auto inserted = counts.insert({c, 0});
        inserted.first->second++;
      }
    }
    for (auto c : output.set) {
      auto inserted = counts.insert({c, 0});
      inserted.first->second++;
    }
    for (int i = 0; i < inputs.size(); ++i) {
      auto& in = inputs[i];
      std::vector<int> sum_axes;
      for (int ax = 0; ax < in.str.size(); ++ax) {
        if (counts[in.str[ax]] == 1) {
          sum_axes.push_back(ax);
        }
      }
      if (!sum_axes.empty()) {
        operands[positions[i]] =
            sum(operands[positions[i]], sum_axes, false, s);
      }
      for (auto it = sum_axes.rbegin(); it != sum_axes.rend(); ++it) {
        in.set.erase(in.str[*it]);
        in.str.erase(in.str.begin() + *it);
      }
    }
  }
}

array einsum_naive(
    std::vector<Subscript> inputs,
    const Subscript& output,
    const std::vector<int>& positions,
    std::vector<array> operands,
    StreamOrDevice s) {
  // Give every distinct subscript character a canonical axis, numbered in
  // order of first appearance across the inputs.
  std::unordered_map<char, int> axis_of;
  for (auto& sub : inputs) {
    for (auto c : sub.str) {
      axis_of.insert({c, axis_of.size()});
    }
  }
  const auto total_axes = axis_of.size();

  // Orders (char, position) pairs by the canonical axis of their character.
  auto by_canonical_axis = [&axis_of](const auto& lhs, const auto& rhs) {
    return axis_of[lhs.first] < axis_of[rhs.first];
  };
  // Orders (char, position) pairs by their current position.
  auto by_position = [](const auto& lhs, const auto& rhs) {
    return lhs.second < rhs.second;
  };

  // Bring every operand to the full rank with its axes in canonical order.
  for (int i = 0; i < inputs.size(); ++i) {
    auto& sub = inputs[i];
    auto& operand = operands[positions[i]];

    // Append singleton dimensions for the characters this input lacks.
    if (operand.ndim() != total_axes) {
      auto padded = operand.shape();
      padded.insert(padded.end(), total_axes - padded.size(), 1);
      operand = reshape(operand, std::move(padded), s);
    }

    // Label each current axis with its character: first the input's own
    // characters, then the missing ones on the appended singleton axes.
    std::vector<std::pair<char, int>> labelled;
    for (auto c : sub.str) {
      labelled.emplace_back(c, labelled.size());
    }
    for (auto& entry : axis_of) {
      if (sub.set.find(entry.first) == sub.set.end()) {
        labelled.emplace_back(entry.first, labelled.size());
      }
    }
    std::sort(labelled.begin(), labelled.end(), by_canonical_axis);

    // Only transpose when the canonical order differs from the current one.
    const bool already_ordered =
        std::is_sorted(labelled.begin(), labelled.end(), by_position);
    if (!already_ordered) {
      std::vector<int> perm;
      for (auto& entry : labelled) {
        perm.push_back(entry.second);
      }
      operand = transpose(operand, perm, s);
    }
  }

  // Multiply all operands together elementwise.
  auto out = operands[positions[0]];
  for (int i = 1; i < positions.size(); ++i) {
    out = multiply(out, operands[positions[i]], s);
  }

  // Sum over every axis whose character is absent from the output.
  std::vector<int> sum_axes;
  for (auto& entry : axis_of) {
    if (output.set.find(entry.first) == output.set.end()) {
      sum_axes.push_back(entry.second);
    }
  }
  if (!sum_axes.empty()) {
    out = sum(out, sum_axes, false, s);
  }

  // Put the surviving axes in the order requested by the output subscript.
  // Each canonical axis index shifts down by the number of summed axes that
  // preceded it.
  std::vector<int> reorder;
  for (auto c : output.str) {
    reorder.push_back(axis_of[c]);
  }
  for (auto& r : reorder) {
    const auto removed_before = std::count_if(
        sum_axes.begin(), sum_axes.end(), [r](int summed) {
          return r > summed;
        });
    r -= static_cast<int>(removed_before);
  }
  return transpose(out, reorder, s);
}

std::pair<std::vector<PathNode>, PathInfo> einsum_path_helper(
    const std::string& subscripts,
    const std::vector<array>& operands,
    const std::string& fn_name) {
  if (operands.size() == 0) {
    std::ostringstream msg;
    msg << "[" << fn_name << "] At least one operand is required.";
    throw std::invalid_argument(msg.str());
  }

  auto [in_subscripts, out_subscript] = parse(subscripts);

  if (operands.size() != in_subscripts.size()) {
    std::ostringstream msg;
    msg << "[" << fn_name << "] Number of operands, " << operands.size()
        << ", does not match number of input subscripts, "
        << in_subscripts.size();
    throw std::invalid_argument(msg.str());
  }

  // Expand ellipses
  // 1. Collect all the characters we can use for the missing axes.
  // 2. Go over each subscript and check if all the characters are either
  //    alphanumeric or an ellipsis.
  // 3. Expand the ellipsis with as many characters from the unused ones as
  //    necessary. We use the last N characters effectively prepending with
  //    singleton dims for inputs with fewer dimensions.
  // 4. For the output use the maximum size of ellipsis that we encountered in
  //    the input.
  CharSet used_chars(subscripts.begin(), subscripts.end());
  std::string remaining_chars;
  remaining_chars.reserve(52 - used_chars.size());
  for (char c = 'a'; c <= 'z'; c++) {
    if (used_chars.find(c) == used_chars.end()) {
      remaining_chars += c;
    }
  }
  for (char c = 'A'; c <= 'Z'; c++) {
    if (used_chars.find(c) == used_chars.end()) {
      remaining_chars += c;
    }
  }
  int max_ellipsis_length = 0;
  auto check_letters_and_expand_ellipsis = [&](auto& subscript,
                                               const array* operand,
                                               int operand_idx) {
    bool have_ellipsis = false;
    int cnt_before = 0, cnt_after = 0;
    for (int i = 0; i < subscript.size(); i++) {
      if (!isalpha(subscript[i])) {
        if (i + 2 >= subscript.size() || subscript[i] != '.' ||
            subscript[i + 1] != '.' || subscript[i + 2] != '.') {
          std::ostringstream msg;
          msg << "[" << fn_name << "] Subscripts must be letters, but got '"
              << subscript[i] << "'.";
          throw std::invalid_argument(msg.str());
        }

        if (have_ellipsis) {
          std::ostringstream msg;
          msg << "[" << fn_name
              << "] Only one ellipsis per subscript is allowed but found more in '"
              << subscript << "'.";
          throw std::invalid_argument(msg.str());
        }

        have_ellipsis = true;
        i += 2;
        continue;
      }

      if (have_ellipsis) {
        cnt_after++;
      } else {
        cnt_before++;
      }
    }

    if (have_ellipsis) {
      int ellipsis_length;
      if (operand != nullptr) {
        ellipsis_length = operand->ndim() - cnt_before - cnt_after;
        if (ellipsis_length < 0) {
          std::ostringstream msg;
          msg << "[" << fn_name << "] Operand " << operand_idx << " with shape "
              << operand->shape()
              << " has insufficient dimensions for subscript '" << subscript
              << "'. The ellipsis requires at least "
              << (cnt_before + cnt_after) << " dimensions but the operand has "
              << operand->ndim() << " dimensions.";
          throw std::invalid_argument(msg.str());
        }
        max_ellipsis_length = std::max(ellipsis_length, max_ellipsis_length);
      } else {
        ellipsis_length = max_ellipsis_length;
      }

      subscript.replace(
          subscript.begin() + cnt_before,
          subscript.begin() + cnt_before + 3,
          remaining_chars.end() - ellipsis_length,
          remaining_chars.end());
    }
  };

  for (int i = 0; i < operands.size(); i++) {
    check_letters_and_expand_ellipsis(in_subscripts[i], &operands[i], i);
  }
  check_letters_and_expand_ellipsis(out_subscript, nullptr, -1);

  CharSet out_set(out_subscript.begin(), out_subscript.end());
  if (out_set.size() != out_subscript.size()) {
    std::ostringstream msg;
    msg << "[" << fn_name << "] Repeat indices not allowed in output.";
    throw std::invalid_argument(msg.str());
  }
  Subscript output(out_subscript, std::move(out_set));

  std::unordered_map<char, ShapeElem> dim_map;
  std::vector<Subscript> inputs;
  for (int i = 0; i < in_subscripts.size(); ++i) {
    auto& in = in_subscripts[i];
    CharSet in_set(in.begin(), in.end());
    inputs.emplace_back(in, in_set);

    if (in.size() != operands[i].ndim()) {
      std::ostringstream msg;
      msg << "[" << fn_name << "] Invalid number of subscripts " << in.size()
          << " for input " << i << " with " << operands[i].ndim()
          << " dimensions.";
      throw std::invalid_argument(msg.str());
    }

    // Check repeat subscripts are valid
    if (in_set.size() < in.size()) {
      std::unordered_map<char, ShapeElem> local_dims;
      for (int j = 0; j < in.size(); ++j) {
        auto dim = operands[i].shape(j);
        auto inserted = local_dims.insert({in[j], dim});
        if (!inserted.second) {
          if (inserted.first->second != dim) {
            std::ostringstream msg;
            msg << "[" << fn_name << "] Dimensions of repeated subscripts "
                << "do not have the same size (" << inserted.first->second
                << " != " << dim << ").";
            throw std::invalid_argument(msg.str());
          }
        }
      }
    }

    for (int j = 0; j < in.size(); j++) {
      auto c = in[j];
      auto dim = operands[i].shape(j);
      auto inserted = dim_map.insert({c, dim});
      auto& in_dim = inserted.first->second;
      if (dim != 1 && in_dim != 1 && in_dim != dim) {
        std::ostringstream msg;
        msg << "[" << fn_name << "] Cannot broadcast dimension " << j
            << " of input " << i << " with shape " << operands[i].shape()
            << " to size " << in_dim << ".";
        throw std::invalid_argument(msg.str());
      }
      // Ensure the broadcasted size is used
      in_dim = std::max(in_dim, dim);
    }
  }

  size_t max_size = term_size(out_subscript, dim_map);
  for (auto& in : in_subscripts) {
    max_size = std::max(max_size, term_size(in, dim_map));
  }

  PathInfo path_info{};

  // Get the full naive cost
  std::tie(path_info.naive_cost, path_info.naive_scaling) =
      compute_cost_and_scaling(inputs, output, dim_map);

  // Calculate the path
  std::vector<PathNode> path;
  if (inputs.size() <= 2) {
    std::vector<int> positions(in_subscripts.size());
    std::iota(positions.begin(), positions.end(), 0);
    path.emplace_back(
        std::move(inputs), std::move(output), std::move(positions));
    path_info.optimized_cost = path_info.naive_cost;
    path_info.optimized_scaling = path_info.naive_scaling;
  } else {
    std::tie(path, path_info.optimized_cost, path_info.optimized_scaling) =
        greedy_path(inputs, output, dim_map, path_info.naive_cost, max_size);
    // Set the final output subscript to the actual output
    path.back().output = std::move(output);
  }
  return {path, path_info};
}

} // namespace

std::pair<std::vector<std::vector<int>>, std::string> einsum_path(
    const std::string& subscripts,
    const std::vector<array>& operands) {
  auto [nodes, info] = einsum_path_helper(subscripts, operands, "einsum_path");

  // Keep only the operand positions of every contraction step.
  std::vector<std::vector<int>> pos_path;
  for (size_t step = 0; step < nodes.size(); ++step) {
    pos_path.push_back(nodes[step].positions);
  }

  // Human-readable summary of the naive and optimized costs.
  std::ostringstream report;
  report << "  Complete contraction:  " << subscripts << "\n";
  report << "         Naive scaling:  " << info.naive_scaling << "\n";
  report << "     Optimized scaling:  " << info.optimized_scaling << "\n";
  report << "      Naive FLOP count:  " << info.naive_cost << "\n";
  report << "  Optimized FLOP count:  " << info.optimized_cost << "\n";
  // TODO: extend the summary with more details
  return {pos_path, report.str()};
}

// Evaluate the einsum expression `subscripts` over `operands`.
//
// The contraction path is computed by einsum_path_helper (which also
// validates the expression and throws std::invalid_argument on errors) and
// then executed one node at a time on the stream or device `s`.
array einsum(
    const std::string& subscripts,
    const std::vector<array>& operands,
    StreamOrDevice s /* = {} */) {
  auto [path, path_info] = einsum_path_helper(subscripts, operands, "einsum");
  // Working list of arrays: every node appends its result at the back and
  // then removes the operands it consumed.
  auto inputs = operands;
  for (auto& node : path) {
    // Collapse repeated indices and sum out single-use indices first; this
    // updates both the node's subscripts and the corresponding arrays.
    preprocess_einsum_inputs(
        node.inputs, node.output, node.positions, inputs, s);

    if (can_dot(node.inputs, node.output)) {
      auto& in_a = node.inputs[0];
      auto& in_b = node.inputs[1];
      auto& out = node.output;

      // Classify every axis of the first input.
      std::vector<int> a_contract;
      std::vector<int> a_batch;
      std::vector<int> a_concat;
      for (int i = 0; i < in_a.str.size(); ++i) {
        auto c = in_a.str[i];
        if (out.set.find(c) == out.set.end()) {
          // Not in the output, contraction
          a_contract.push_back(i);
        } else if (in_b.set.find(c) != in_b.set.end()) {
          // Not a contraction but in both inputs, batch dim
          a_batch.push_back(i);
        } else {
          // Not a batch dim or contract dim, so concat dim
          a_concat.push_back(i);
        }
      }

      // The contraction and batch axes of the second input are listed in the
      // same character order as those of the first input.
      std::vector<int> b_contract;
      std::vector<int> b_batch;
      std::vector<int> b_concat;
      for (auto a_i : a_contract) {
        b_contract.push_back(in_b.str.find(in_a.str[a_i]));
      }
      for (auto a_i : a_batch) {
        b_batch.push_back(in_b.str.find(in_a.str[a_i]));
      }
      // Axes of the second input that are in the output but not in the first
      // input.
      for (int i = 0; i < in_b.str.size(); ++i) {
        auto c = in_b.str[i];
        if (out.set.find(c) != out.set.end() &&
            in_a.set.find(c) == in_a.set.end()) {
          b_concat.push_back(i);
        }
      }

      auto& a = inputs[node.positions[0]];
      auto& b = inputs[node.positions[1]];

      // Number the characters in the order batch, a's concat, b's concat;
      // this is taken to be the axis order of the batch_tensordot result
      // (confirm against batch_tensordot).
      std::unordered_map<char, int> char_map;
      for (auto i : a_batch) {
        char_map.insert({in_a.str[i], char_map.size()});
      }
      for (auto i : a_concat) {
        char_map.insert({in_a.str[i], char_map.size()});
      }
      for (auto i : b_concat) {
        char_map.insert({in_b.str[i], char_map.size()});
      }
      inputs.emplace_back(batch_tensordot(
          a,
          b,
          std::move(a_contract),
          std::move(a_batch),
          std::move(a_concat),
          std::move(b_contract),
          std::move(b_batch),
          std::move(b_concat),
          s));

      // Permute the result into the order given by the node's output
      // subscript.
      std::vector<int> reorder;
      for (auto c : node.output.str) {
        reorder.push_back(char_map[c]);
      }
      inputs.back() = transpose(inputs.back(), reorder, s);

    } else {
      // Fall back to the broadcast-multiply-and-sum implementation.
      inputs.emplace_back(
          einsum_naive(node.inputs, node.output, node.positions, inputs, s));
    }

    // Positions are always sorted increasing, so start from the back
    for (auto it = node.positions.rbegin(); it != node.positions.rend(); ++it) {
      inputs.erase(inputs.begin() + *it);
    }
  }
  // The result of the last node; presumably the only array left at this
  // point.
  return inputs.front();
}

} // namespace mlx::core


================================================
FILE: mlx/einsum.h
================================================
// Copyright © 2024 Apple Inc.
#pragma once

#include <string>
#include <tuple>
#include <vector>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/utils.h"

namespace mlx::core {

/**
 * Compute the contraction path for an einsum expression.
 *
 * Returns the operand positions used by each contraction step together with
 * a printable summary of the naive and optimized scaling and FLOP counts.
 */
MLX_API std::pair<std::vector<std::vector<int>>, std::string> einsum_path(
    const std::string& subscripts,
    const std::vector<array>& operands);

/**
 * Evaluate the einsum expression `subscripts` over `operands` on the given
 * stream or device and return the resulting array.
 */
MLX_API array einsum(
    const std::string& subscripts,
    const std::vector<array>& operands,
    StreamOrDevice s = {});

} // namespace mlx::core


================================================
FILE: mlx/event.h
================================================
// Copyright © 2024 Apple Inc.
#pragma once

#include <cstdint>
#include <memory>
#include <stdexcept>

#include "mlx/stream.h"

namespace mlx::core {

// Pairs an opaque event object with a stream and a 64-bit value. The wait,
// signal and query operations are all expressed relative to the event's
// current value.
class Event {
 public:
  // Creates an invalid event: no underlying event object is attached, so
  // valid() returns false.
  Event() {};
  // Creates an event for the given stream (defined out of line).
  explicit Event(Stream stream);

  // Wait for the event to be signaled at its current value
  void wait();

  // Wait in the given stream for the event to be signaled at its current value
  void wait(Stream stream);

  // Signal the event at its current value in the given stream
  void signal(Stream stream);

  // Check if the event has been signaled at its current value
  bool is_signaled() const;

  // Check if the event is valid
  bool valid() const {
    return event_ != nullptr;
  }

  // The value that wait, signal and is_signaled currently refer to.
  uint64_t value() const {
    return value_;
  }

  // Change the value that subsequent operations refer to.
  void set_value(uint64_t v) {
    value_ = v;
  }

  // The stream stored in the event. Throws std::runtime_error if the event is
  // invalid, since the stored stream is then only a placeholder.
  const Stream& stream() const {
    if (!valid()) {
      throw std::runtime_error(
          "[Event::stream] Cannot access stream on invalid event.");
    }
    return stream_;
  }

 private:
  // Default constructed stream should never be used
  // since the event is not yet valid
  Stream stream_{0, Device::cpu};
  // Type-erased handle to the underlying event object; null when invalid.
  std::shared_ptr<void> event_{nullptr};
  uint64_t value_{0};
};

} // namespace mlx::core


================================================
FILE: mlx/export.cpp
================================================
// Copyright © 2024 Apple Inc.
#include "mlx/export.h"
#include <map>
#include "mlx/compile_impl.h"
#include "mlx/fast_primitives.h"
#include "mlx/graph_utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"
#include "mlx/version.h"

// clang-format off
// Expands to one {name, serializer} entry of the primitive factory table:
// the stringified primitive type name mapped to its serialize, deserialize
// and state-extraction functions. The optional extra arguments are additional
// names that should resolve to the same entry.
#define SERIALIZE_PRIMITIVE(primitive, ...)  \
  {                                          \
    #primitive, {                            \
      serialize_primitive<primitive>,        \
      deserialize_primitive<primitive>,      \
      primitive_state<primitive>,            \
      {__VA_ARGS__}                          \
    }                                        \
  }
// clang-format on

// Returns true when the host stores the most significant byte first.
bool is_big_endian() {
  // Look at the lowest-addressed byte of the integer 1: it holds the 1 on a
  // little-endian host and 0 on a big-endian one.
  int probe = 1;
  const char* lowest_byte = reinterpret_cast<char*>(&probe);
  return lowest_byte[0] != 1;
}

namespace mlx::core {

using namespace mlx::core::fast;

using Reader = io::ParallelFileReader;
using Writer = io::FileWriter;

// Bundles the type-erased callbacks needed to export and import one primitive
// type, plus the additional names that resolve to this entry.
struct PrimitiveSerializer {
  // Writes the state of a primitive to the writer.
  using Serializer = std::function<void(Writer&, const Primitive&)>;
  // Reads the state from the reader and builds a primitive on the stream.
  using Deserializer =
      std::function<std::shared_ptr<Primitive>(Reader&, Stream s)>;
  // Returns the state of a primitive as a flat list of values.
  using StateExtractor = std::function<std::vector<StateT>(const Primitive&)>;

  PrimitiveSerializer(
      Serializer serialize,
      Deserializer deserialize,
      StateExtractor extract_state,
      std::vector<std::string> keys = {})
      : serialize(std::move(serialize)),
        deserialize(std::move(deserialize)),
        extract_state(std::move(extract_state)),
        keys(std::move(keys)) {};
  Serializer serialize;
  Deserializer deserialize;
  StateExtractor extract_state;
  // Alternative primitive names that map to this serializer.
  std::vector<std::string> keys;
};

// True for types that have begin() and end() member functions.
template <typename, typename = void>
constexpr bool is_iterable = false;

template <typename T>
constexpr bool is_iterable<
    T,
    std::void_t<
        decltype(std::declval<T>().begin()),
        decltype(std::declval<T>().end())>> = true;

// True when U is an instantiation of the class template T.
template <template <typename...> class T, typename U>
constexpr bool is_specialization_of = false;

template <template <typename...> class T, typename... Us>
constexpr bool is_specialization_of<T, T<Us...>> = true;

// Shorthands for the standard templates the serializer treats specially.
// cv-qualifiers and references are stripped before the check.
template <typename T>
constexpr bool is_pair = is_specialization_of<std::pair, std::decay_t<T>>;

template <typename T>
constexpr bool is_tuple = is_specialization_of<std::tuple, std::decay_t<T>>;

template <typename T>
inline constexpr bool is_optional =
    is_specialization_of<std::optional, std::decay_t<T>>;

template <typename T>
inline constexpr bool is_variant =
    is_specialization_of<std::variant, std::decay_t<T>>;

// Always false, but dependent on the template parameter so that a
// static_assert using it only fires when the enclosing template is
// instantiated.
template <typename>
constexpr bool dependent_false = false;

// Instantiating this type produces a compile error naming the type that
// cannot be serialized.
template <typename T>
struct NotSerializable {
  static_assert(dependent_false<T>, "Type is not serializable.");
};

// Instantiating this type produces a compile error naming the type that
// cannot be deserialized.
template <typename T>
struct NotDeserializable {
  static_assert(dependent_false<T>, "Type is not deserializable.");
};

// Reverses the byte order of `data` in place.
template <typename T>
void reverse_bytes(T& data) {
  auto* base = reinterpret_cast<uint8_t*>(&data);
  // Swap the outermost pair of bytes and walk inwards until the two cursors
  // meet; a middle byte (odd sizes) stays where it is.
  for (uint8_t *front = base, *back = base + sizeof(T) - 1; front < back;
       ++front, --back) {
    std::swap(*front, *back);
  }
}

template <typename T>
void serialize_variant(Writer& os, T v);

template <typename T>
T deserialize_variant(Reader& is);

// Generic serializer. The representation is chosen at compile time from the
// type of `v`; types that match none of the cases fail to compile.
template <typename T>
void serialize(Writer& os, T v) {
  if constexpr (std::is_arithmetic_v<T>) {
    // Arithmetic values are written in little-endian byte order: on a
    // big-endian host the bytes are reversed first.
    if (is_big_endian()) {
      reverse_bytes(v);
    }
    os.write(reinterpret_cast<const char*>(&v), sizeof(T));
  } else if constexpr (std::is_enum_v<T>) {
    // Enums are written as int.
    serialize(os, static_cast<int>(v));
  } else if constexpr (std::is_same_v<T, std::nullptr_t>) {
    // nullptr carries no data, nothing is written.
  } else if constexpr (is_iterable<T>) {
    // Containers are written as a 64-bit element count followed by the
    // elements.
    serialize(os, static_cast<uint64_t>(v.size()));
    for (const auto& t : v) {
      serialize(os, t);
    }
  } else if constexpr (is_pair<T> || is_tuple<T>) {
    // Pairs and tuples are written element by element, in order.
    std::apply([&os](auto&... x) { (..., serialize(os, x)); }, v);
  } else if constexpr (is_variant<T>) {
    serialize_variant(os, v);
  } else if constexpr (is_optional<T>) {
    // Optionals are written as a presence flag followed by the value, if any.
    serialize(os, v.has_value());
    if (v.has_value()) {
      serialize(os, *v);
    }
  } else {
    NotSerializable<T>();
  }
}

template <typename T, std::size_t... I>
decltype(auto) deserialize_tuple(Reader& is, std::index_sequence<I...>);

// Generic deserializer, the inverse of the generic serialize. The expected
// representation is chosen at compile time from `T`; types that match none of
// the cases fail to compile.
template <typename T>
T deserialize(Reader& is) {
  if constexpr (std::is_arithmetic_v<T>) {
    // Arithmetic values are stored in little-endian byte order: on a
    // big-endian host the bytes are reversed after reading.
    T v;
    is.read(reinterpret_cast<char*>(&v), sizeof(T));
    if (is_big_endian()) {
      reverse_bytes(v);
    }
    return v;
  } else if constexpr (std::is_enum_v<T>) {
    // Enums are stored as int.
    return static_cast<T>(deserialize<int>(is));
  } else if constexpr (std::is_same_v<T, std::nullptr_t>) {
    // nullptr carries no data, nothing is read.
    return nullptr;
  } else if constexpr (is_iterable<T>) {
    // Containers are stored as a 64-bit element count followed by the
    // elements.
    T v;
    auto size = deserialize<uint64_t>(is);
    v.reserve(size);
    // The counter has the same type as the stored size so that it cannot
    // overflow before reaching it.
    for (uint64_t i = 0; i < size; ++i) {
      v.push_back(deserialize<typename T::value_type>(is));
    }
    return v;
  } else if constexpr (is_pair<T> || is_tuple<T>) {
    return deserialize_tuple<T>(
        is, std::make_index_sequence<std::tuple_size_v<std::decay_t<T>>>{});
  } else if constexpr (is_optional<T>) {
    // Optionals are stored as a presence flag followed by the value, if any.
    auto has_value = deserialize<bool>(is);
    if (has_value) {
      return T{deserialize<typename T::value_type>(is)};
    } else {
      return std::nullopt;
    }
  } else if constexpr (is_variant<T>) {
    return deserialize_variant<T>(is);
  } else {
    NotDeserializable<T>();
  }
}

// Tag written in front of a serialized variant to identify the alternative
// that follows.
enum class VariantType { Int = 0, Float = 1, Bool = 2 };

// Writes the active alternative of a variant as a VariantType tag followed by
// the value. Only int, float and bool alternatives are supported; any other
// alternative type is a compile error.
template <typename T>
void serialize_variant(Writer& os, T v) {
  std::visit(
      [&](auto&& x) {
        using ElemT = std::decay_t<decltype(x)>;
        if constexpr (std::is_same_v<ElemT, int>) {
          serialize(os, VariantType::Int);
        } else if constexpr (std::is_same_v<ElemT, float>) {
          serialize(os, VariantType::Float);
        } else if constexpr (std::is_same_v<ElemT, bool>) {
          serialize(os, VariantType::Bool);
        } else {
          // The condition depends on ElemT so the assertion only fires when
          // this branch is instantiated.
          static_assert(
              std::is_same_v<ElemT, void>, "Can't serialize variant type.");
        }
        serialize(os, x);
      },
      v);
}

// Reads a variant written by serialize_variant: a VariantType tag followed by
// the value of the tagged alternative. Throws std::runtime_error if the tag
// is not one of the known values.
template <typename T>
T deserialize_variant(Reader& is) {
  auto vt = deserialize<VariantType>(is);
  switch (vt) {
    case VariantType::Int:
      return deserialize<int>(is);
    case VariantType::Float:
      return deserialize<float>(is);
    case VariantType::Bool:
      return deserialize<bool>(is);
    default:
      throw std::runtime_error(
          "[deserialize_variant] Unknown variant type tag.");
  }
}

// Reads the elements of a pair or tuple type T one after the other. The
// elements are read inside a braced initializer list, which guarantees that
// they are evaluated (and therefore read) left to right.
template <typename T, std::size_t... I>
decltype(auto) deserialize_tuple(Reader& is, std::index_sequence<I...>) {
  return T{deserialize<std::tuple_element_t<I, T>>(is)...};
};

// Writes a stream as: stream index, device type, device index.
void serialize(Writer& os, const Stream& s) {
  const auto& dev = s.device;
  serialize(os, s.index);
  serialize(os, dev.type);
  serialize(os, dev.index);
}
// Reads a stream in the order it was written: stream index, device type,
// device index.
template <>
Stream deserialize(Reader& is) {
  // Each field is read in its own statement to keep the read order fixed.
  const auto index = deserialize<int>(is);
  const auto dev_type = deserialize<Device::DeviceType>(is);
  const auto dev_index = deserialize<int>(is);
  return Stream(index, Device(dev_type, dev_index));
}

// Writes a dtype as its value tag followed by its size.
void serialize(Writer& os, const Dtype& t) {
  const auto kind = t.val();
  const auto width = t.size();
  serialize(os, kind);
  serialize(os, width);
}

// Reads a dtype in the order it was written: value tag, then size.
template <>
Dtype deserialize(Reader& is) {
  // Each field is read in its own statement to keep the read order fixed.
  const auto kind = deserialize<Dtype::Val>(is);
  const auto width = deserialize<uint8_t>(is);
  return Dtype(kind, width);
}

// Writes only the metadata of an array: its shape followed by its dtype.
void serialize(Writer& os, const array& arr) {
  const auto& shape = arr.shape();
  serialize(os, shape);
  const auto& dtype = arr.dtype();
  serialize(os, dtype);
}
// Reads a shape and a dtype and builds an array from them with a null
// primitive and no inputs.
template <>
array deserialize(Reader& is) {
  auto dims = deserialize<Shape>(is);
  const auto dtype = deserialize<Dtype>(is);
  std::vector<array> no_inputs{};
  return array(std::move(dims), dtype, nullptr, std::move(no_inputs));
}

// True for types that provide a state() member function.
template <typename, typename = void>
constexpr bool has_state = false;

template <typename T>
constexpr bool has_state<T, std::void_t<decltype(std::declval<T>().state())>> =
    true;

// Writes the state of a primitive of concrete type T. Primitives without a
// state() member write nothing.
template <typename T>
void serialize_primitive(Writer& os, const Primitive& p) {
  if constexpr (has_state<T>) {
    const auto& concrete = static_cast<const T&>(p);
    serialize(os, concrete.state());
  }
}

// Appends `state` to `unpacked_state` as one or more StateT values.
// Pairs and tuples are flattened recursively; types that match none of the
// cases are silently skipped.
template <typename T>
void extract_state(const T state, std::vector<StateT>& unpacked_state) {
  if constexpr (std::is_arithmetic_v<T>) {
    unpacked_state.push_back(state);
  } else if constexpr (std::is_enum_v<T>) {
    // Enums are stored as int.
    unpacked_state.push_back(static_cast<int>(state));
  } else if constexpr (std::is_same_v<T, Dtype>) {
    unpacked_state.push_back(state);
  } else if constexpr (is_iterable<T>) {
    // Containers are stored as a single value.
    unpacked_state.push_back(state);
  } else if constexpr (is_pair<T> || is_tuple<T>) {
    std::apply(
        [&unpacked_state](auto&... x) {
          (..., extract_state(x, unpacked_state));
        },
        state);
  }
}

// Returns the state of a primitive of concrete type T as a flat list of
// values; the list is empty for primitives without a state() member.
template <typename T>
std::vector<StateT> primitive_state(const Primitive& p) {
  std::vector<StateT> unpacked;
  if constexpr (has_state<T>) {
    const auto& concrete = static_cast<const T&>(p);
    extract_state(concrete.state(), unpacked);
  }
  return unpacked;
}

// Builds a primitive of concrete type T on stream `s`. If T has a state()
// member, the state is read from `is` and passed to the constructor after the
// stream; a pair or tuple state is expanded into separate arguments.
template <typename T>
std::shared_ptr<T> deserialize_primitive(Reader& is, Stream s) {
  if constexpr (!has_state<T>) {
    return std::make_shared<T>(s);
  } else {
    using StateType = decltype(std::declval<T>().state());
    auto state = deserialize<StateType>(is);
    if constexpr (is_pair<decltype(state)> || is_tuple<decltype(state)>) {
      auto construct = [s](auto&&... fields) {
        return std::make_shared<T>(s, std::move(fields)...);
      };
      return std::apply(construct, std::move(state));
    } else {
      return std::make_shared<T>(s, std::move(state));
    }
  }
}

// Registry of all primitives that can be exported and imported. It maps the
// primitive name to the callbacks that serialize, deserialize and extract the
// state of that primitive.
struct PrimitiveFactory {
  // One entry per primitive type. The extra names in an entry are alternative
  // names that are remapped to the entry (see `name_remap`).
  std::unordered_map<std::string, PrimitiveSerializer> factory = {
      SERIALIZE_PRIMITIVE(Abs),
      SERIALIZE_PRIMITIVE(Add),
      SERIALIZE_PRIMITIVE(AddMM),
      SERIALIZE_PRIMITIVE(Arange),
      SERIALIZE_PRIMITIVE(ArcCos),
      SERIALIZE_PRIMITIVE(ArcCosh),
      SERIALIZE_PRIMITIVE(ArcSin),
      SERIALIZE_PRIMITIVE(ArcSinh),
      SERIALIZE_PRIMITIVE(ArcTan),
      SERIALIZE_PRIMITIVE(ArcTan2),
      SERIALIZE_PRIMITIVE(ArcTanh),
      SERIALIZE_PRIMITIVE(ArgPartition),
      SERIALIZE_PRIMITIVE(ArgReduce),
      SERIALIZE_PRIMITIVE(ArgSort),
      SERIALIZE_PRIMITIVE(AsType),
      SERIALIZE_PRIMITIVE(AsStrided),
      SERIALIZE_PRIMITIVE(
          BitwiseBinary,
          "BitwiseAnd",
          "BitwiseOr",
          "BitwiseXor",
          "LeftShift",
          "RightShift"),
      SERIALIZE_PRIMITIVE(BlockMaskedMM),
      SERIALIZE_PRIMITIVE(Broadcast),
      SERIALIZE_PRIMITIVE(BroadcastAxes),
      SERIALIZE_PRIMITIVE(Ceil),
      SERIALIZE_PRIMITIVE(Concatenate),
      SERIALIZE_PRIMITIVE(Conjugate),
      SERIALIZE_PRIMITIVE(Convolution),
      SERIALIZE_PRIMITIVE(Copy),
      SERIALIZE_PRIMITIVE(Cos),
      SERIALIZE_PRIMITIVE(Cosh),
      SERIALIZE_PRIMITIVE(Depends),
      SERIALIZE_PRIMITIVE(Divide),
      SERIALIZE_PRIMITIVE(DivMod),
      SERIALIZE_PRIMITIVE(DynamicSlice),
      SERIALIZE_PRIMITIVE(DynamicSliceUpdate),
      SERIALIZE_PRIMITIVE(Equal, "NaNEqual"),
      SERIALIZE_PRIMITIVE(Erf),
      SERIALIZE_PRIMITIVE(ErfInv),
      SERIALIZE_PRIMITIVE(Exp),
      SERIALIZE_PRIMITIVE(Expm1),
      SERIALIZE_PRIMITIVE(ExpandDims),
      SERIALIZE_PRIMITIVE(FFT),
      SERIALIZE_PRIMITIVE(Flatten),
      SERIALIZE_PRIMITIVE(Floor),
      SERIALIZE_PRIMITIVE(Full),
      SERIALIZE_PRIMITIVE(Gather),
      SERIALIZE_PRIMITIVE(GatherAxis),
      SERIALIZE_PRIMITIVE(GatherMM),
      SERIALIZE_PRIMITIVE(Greater),
      SERIALIZE_PRIMITIVE(GreaterEqual),
      SERIALIZE_PRIMITIVE(Hadamard),
      SERIALIZE_PRIMITIVE(Imag),
      SERIALIZE_PRIMITIVE(Less),
      SERIALIZE_PRIMITIVE(LessEqual),
      SERIALIZE_PRIMITIVE(Log, "Log2", "Log10"),
      SERIALIZE_PRIMITIVE(Log1p),
      SERIALIZE_PRIMITIVE(LogicalNot),
      SERIALIZE_PRIMITIVE(LogicalAnd),
      SERIALIZE_PRIMITIVE(LogicalOr),
      SERIALIZE_PRIMITIVE(LogAddExp),
      SERIALIZE_PRIMITIVE(LogSumExp),
      SERIALIZE_PRIMITIVE(MaskedScatter),
      SERIALIZE_PRIMITIVE(Matmul),
      SERIALIZE_PRIMITIVE(Maximum),
      SERIALIZE_PRIMITIVE(Minimum),
      SERIALIZE_PRIMITIVE(Multiply),
      SERIALIZE_PRIMITIVE(Negative),
      SERIALIZE_PRIMITIVE(NotEqual),
      SERIALIZE_PRIMITIVE(NumberOfElements),
      SERIALIZE_PRIMITIVE(Pad),
      SERIALIZE_PRIMITIVE(Partition),
      SERIALIZE_PRIMITIVE(Power),
      SERIALIZE_PRIMITIVE(QuantizedMatmul),
      SERIALIZE_PRIMITIVE(GatherQMM),
      SERIALIZE_PRIMITIVE(RandomBits),
      SERIALIZE_PRIMITIVE(Real),
      SERIALIZE_PRIMITIVE(Remainder),
      SERIALIZE_PRIMITIVE(Reshape),
      SERIALIZE_PRIMITIVE(Reduce, "And", "Or", "Sum", "Prod", "Min", "Max"),
      SERIALIZE_PRIMITIVE(Round),
      SERIALIZE_PRIMITIVE(
          Scan,
          "CumSum",
          "CumProd",
          "CumMin",
          "CumMax",
          "CumLogaddexp"),
      SERIALIZE_PRIMITIVE(Scatter),
      SERIALIZE_PRIMITIVE(ScatterAxis),
      SERIALIZE_PRIMITIVE(Select),
      SERIALIZE_PRIMITIVE(Sigmoid),
      SERIALIZE_PRIMITIVE(Sign),
      SERIALIZE_PRIMITIVE(Sin),
      SERIALIZE_PRIMITIVE(Sinh),
      SERIALIZE_PRIMITIVE(Slice),
      SERIALIZE_PRIMITIVE(SliceUpdate),
      SERIALIZE_PRIMITIVE(Softmax),
      SERIALIZE_PRIMITIVE(Sort),
      SERIALIZE_PRIMITIVE(Split),
      SERIALIZE_PRIMITIVE(Square),
      SERIALIZE_PRIMITIVE(Squeeze),
      SERIALIZE_PRIMITIVE(Sqrt, "Rsqrt", "Sqrt"),
      SERIALIZE_PRIMITIVE(StopGradient),
      SERIALIZE_PRIMITIVE(Subtract),
      SERIALIZE_PRIMITIVE(Tan),
      SERIALIZE_PRIMITIVE(Tanh),
      SERIALIZE_PRIMITIVE(View),
      SERIALIZE_PRIMITIVE(Transpose),
      SERIALIZE_PRIMITIVE(Unflatten),
      SERIALIZE_PRIMITIVE(QRF),
      SERIALIZE_PRIMITIVE(SVD),
      SERIALIZE_PRIMITIVE(Inverse),
      SERIALIZE_PRIMITIVE(Cholesky),
      SERIALIZE_PRIMITIVE(Eig),
      SERIALIZE_PRIMITIVE(Eigh),
      SERIALIZE_PRIMITIVE(Quantize),
      SERIALIZE_PRIMITIVE(RMSNorm),
      SERIALIZE_PRIMITIVE(RMSNormVJP),
      SERIALIZE_PRIMITIVE(LayerNorm),
      SERIALIZE_PRIMITIVE(LayerNormVJP),
      SERIALIZE_PRIMITIVE(RoPE),
      SERIALIZE_PRIMITIVE(ScaledDotProductAttention),
      SERIALIZE_PRIMITIVE(CustomKernel)};
  // Maps every alternative name to the name of its factory entry.
  std::unordered_map<std::string, std::string> name_remap;
  // Maps the index of a serialized stream to the local stream used for it.
  std::unordered_map<int, Stream> stream_map;

  // Builds `name_remap` from the alternative names of all factory entries.
  PrimitiveFactory() {
    for (auto& [n, f] : factory) {
      for (auto& k : f.keys) {
        name_remap[k] = n;
      }
    }
  }

  // Writes the stream, the (remapped) name and the state of a primitive.
  // Throws std::invalid_argument if the primitive is not in the factory.
  void save(Writer& os, const std::shared_ptr<Primitive>& p) {
    serialize(os, p->stream());
    std::string name = p->name();
    // Only the part of the name before the first space is used.
    name = name.substr(0, name.find(' '));
    if (auto it = name_remap.find(name); it != name_remap.end()) {
      name = it->second;
    }
    serialize(os, name);
    if (auto it = factory.find(name); it != factory.end()) {
      it->second.serialize(os, *p);
    } else {
      throw std::invalid_argument(
          "[export_function] Unable to serialize primitive " + name);
    }
  }

  // Returns the local stream to use for a deserialized stream. The choice is
  // cached per serialized stream index.
  Stream resolve_stream(const Stream& stream) {
    if (auto it = stream_map.find(stream.index); it != stream_map.end()) {
      return it->second;
    }
    // Try to find an existing stream on the same device
    for (auto& s : get_streams()) {
      if (s.device == stream.device) {
        stream_map.emplace(stream.index, s);
        return s;
      }
    }
    // No stream on that device, make a new one
    Stream s = new_stream(stream.device);
    stream_map.emplace(stream.index, s);
    return s;
  }

  // Reads a primitive written by `save`. Throws std::invalid_argument if the
  // stored name is not in the factory.
  std::shared_ptr<Primitive> load(Reader& is) {
    auto stream = resolve_stream(deserialize<Stream>(is));
    auto name = deserialize<std::string>(is);
    if (auto it = factory.find(name); it != factory.end()) {
      return it->second.deserialize(is, stream);
    } else {
      throw std::invalid_argument(
          "[import_function] Unable to deserialize primitive " + name);
    }
  }

  // Returns the (remapped) name of a primitive together with its state as a
  // flat list of values. Throws std::invalid_argument if the primitive is not
  // in the factory.
  std::pair<std::string, std::vector<StateT>> extract_state(
      const std::shared_ptr<Primitive>& p) {
    std::string name = p->name();
    // Only the part of the name before the first space is used.
    name = name.substr(0, name.find(' '));
    if (auto it = name_remap.find(name); it != name_remap.end()) {
      name = it->second;
    }

    if (auto it = factory.find(name); it != factory.end()) {
      auto state = it->second.extract_state(*p);
      return {std::move(name), std::move(state)};
    } else {
      throw std::invalid_argument(
          "[export_function] Unable to get state for primitive " + name);
    }
  }
};

// Serialize the export header to `os`: the library version string, the number
// of exported functions, and the shapeless flag, in that order.
void write_header(Writer& os, int count, bool shapeless) {
  std::string library_version(version());
  serialize(os, library_version);
  serialize(os, count);
  serialize(os, shapeless);
}

// Holds the graphs that are exported / imported and retrieves them by call
// signature. Entries are grouped in `table` by their total number of inputs.
struct FunctionTable {
  FunctionTable(bool shapeless = false) : shapeless(shapeless) {}

  // One traced function. `inputs` lists the positional inputs first, followed
  // by the keyword inputs in the order given by `kwarg_keys`. Movable, not
  // copyable.
  struct Function {
    Function() = default;
    Function(
        std::vector<std::string> kwarg_keys,
        std::vector<array> inputs,
        std::vector<array> outputs,
        std::vector<array> tape)
        : kwarg_keys(std::move(kwarg_keys)),
          inputs(std::move(inputs)),
          outputs(std::move(outputs)),
          tape(std::move(tape)) {}
    Function(Function&&) = default;
    Function(const Function&) = delete;
    Function& operator=(const Function&) = delete;

    std::vector<std::string> kwarg_keys;
    std::vector<array> inputs;
    std::vector<array> outputs;
    std::vector<array> tape;
  };

  // When true, input shapes are ignored while matching (only dtype and rank
  // are compared).
  bool shapeless;
  // Total input count -> functions with that many inputs.
  std::unordered_map<int, std::vector<Function>> table;

  // Return the first stored function matching the call, or nullptr.
  Function* find(const Args& args, const std::map<std::string, array>& kwargs);

  // Return the matching function and `false`, or a newly appended empty
  // function and `true` when nothing matches.
  std::pair<Function&, bool> emplace(
      const Args& args,
      const std::map<std::string, array>& kwargs);

  // Append a fully specified function to the bucket for its input count.
  void insert(
      std::vector<std::string> kwarg_keys,
      std::vector<array> inputs,
      std::vector<array> outputs,
      std::vector<array> tape) {
    auto& bucket = table[static_cast<int>(inputs.size())];
    bucket.emplace_back(
        std::move(kwarg_keys),
        std::move(inputs),
        std::move(outputs),
        std::move(tape));
  }

  // Write a numbered, human-readable list of every stored signature to `os`.
  void print_functions(std::ostream& os) {
    int ordinal = 1;
    for (auto& bucket : table) {
      for (auto& fun : bucket.second) {
        const auto n_keyword = fun.kwarg_keys.size();
        const auto n_positional = fun.inputs.size() - n_keyword;
        os << " " << ordinal << ". Function with " << n_positional
           << " positional inputs and " << n_keyword << " keyword inputs:\n";
        ++ordinal;

        for (size_t idx = 0; idx < fun.inputs.size(); ++idx) {
          // Positional inputs are labelled by 1-based position, keyword
          // inputs by their quoted name.
          if (idx < n_positional) {
            os << "   " << idx + 1 << ": ";
          } else {
            os << "   \"" << fun.kwarg_keys[idx - n_positional] << "\": ";
          }
          auto& in = fun.inputs[idx];
          os << in.shape() << " " << in.dtype() << "\n";
        }
      }
    }
  }

 private:
  // True when the call (args, kwargs) is compatible with `fun`.
  bool match(
      const Args& args,
      const std::map<std::string, array>& kwargs,
      const Function& fun);
};

bool FunctionTable::match(
    const Args& args,
    const std::map<std::string, array>& kwargs,
    const Function& fun) {
  for (auto& k : fun.kwarg_keys) {
    if (kwargs.find(k) == kwargs.end()) {
      return false;
    }
  }

  auto match_inputs = [shapeless = this->shapeless](
                          const array& x, const array& y) {
    if (x.dtype() != y.dtype()) {
      return false;
    }
    if (x.ndim() != y.ndim()) {
      return false;
    }
    if (!shapeless && x.shape() != y.shape()) {
      return false;
    }
    return true;
  };

  int i = 0;
  for (; i < args.size(); ++i) {
    if (!match_inputs(args[i], fun.inputs[i])) {
      return false;
    }
  }
  for (auto& [_, in] : kwargs) {
    if (!match_inputs(in, fun.inputs[i++])) {
      return false;
    }
  }

  return true;
}

// Look up a function matching the call; if none exists, append an empty
// Function to the bucket for this input count.
//
// Returns the function together with a flag that is `true` only when a new
// (empty) entry was appended.
std::pair<FunctionTable::Function&, bool> FunctionTable::emplace(
    const Args& args,
    const std::map<std::string, array>& kwargs) {
  const auto n_inputs = args.size() + kwargs.size();
  auto& bucket = table[static_cast<int>(n_inputs)];

  for (auto it = bucket.begin(); it != bucket.end(); ++it) {
    if (match(args, kwargs, *it)) {
      return {*it, false};
    }
  }

  return {bucket.emplace_back(), true};
}

// Return a pointer to the first stored function that matches the call, or
// nullptr when the bucket for this input count is missing or has no match.
FunctionTable::Function* FunctionTable::find(
    const Args& args,
    const std::map<std::string, array>& kwargs) {
  const auto n_inputs = args.size() + kwargs.size();
  const auto bucket = table.find(n_inputs);
  if (bucket != table.end()) {
    for (auto& candidate : bucket->second) {
      if (match(args, kwargs, candidate)) {
        return &candidate;
      }
    }
  }
  return nullptr;
}

// File-backed exporter: opens `file` for writing and immediately writes the
// header with the current function count.
// Throws std::runtime_error if the file could not be opened.
FunctionExporter::FunctionExporter(
    const std::string& file,
    std::function<std::vector<array>(const Args&, const Kwargs&)> fun,
    bool shapeless)
    : os(file),
      fun(std::move(fun)),
      ftable(std::make_shared<FunctionTable>(shapeless)) {
  const bool opened = os.is_open();
  if (opened) {
    write_header(os, count, shapeless);
    return;
  }
  throw std::runtime_error("[export_function] Failed to open " + file);
}

// Callback-backed exporter: stores the callback and the function to trace and
// creates an empty function table. Nothing is written to a file; `os` is left
// default-constructed.
FunctionExporter::FunctionExporter(
    const ExportCallback& callback,
    std::function<std::vector<array>(const Args&, const Kwargs&)> fun,
    bool shapeless)
    : callback(callback),
      fun(std::move(fun)),
      ftable(std::make_shared<FunctionTable>(shapeless)) {}

void FunctionExporter::close() {
  closed = true;
};

void FunctionExporter::export_with_callback(
    const std::vector<array>& inputs,
    const std::vector<array>& outputs,
    const std::vector<array>& tape,
    const std::vector<std::string>& kwarg_keys) {
  NodeNamer namer{};
  auto to_vector_data = [&namer](const auto& arrays) {
    std::vector<std::tuple<std::string, Shape, Dtype>> data;
    for (auto& a : arrays) {
      data.emplace_back(namer.get_name(a), a.shape(), a.dtype());
    }
    return data;
  };

  // Callback on the inputs
  callback({{"type", "inputs"}, {"inputs", to_vector_data(inputs)}});
  std::vector<std::pair<std::string, std::string>> keyword_inputs;
  for (int i = inputs.size() - kwarg_keys.size(), j = 0; i < inputs.size();
       ++i, ++j) {
    keyword_inputs.emplace_back(kwarg_keys[j], namer.get_name(inputs[i]));
  }
  callback({{"type", "keyword_inputs"}, {"keywords", keyword_inputs}});

  // Callback on the outputs
  callback({{"type", "outputs"}, {"outputs", to_vector_data(outputs)}});

  // Callback on the constants
  {
    std::unordered_set<std::uintptr_t> input_set;
    for (auto& in : inputs) {
      input_set.insert(in.id());
    }
    std::vector<std::pair<std::string, array>> new_constants;
    for (auto& arr : tape) {
      if (arr.has_primitive() || input_set.find(arr.id()) != input_set.end()) {
        continue;
      }
      if (constants.insert({arr.id(), arr}).second) {
        new_constants.emplace_back(namer.get_name(arr), arr);
      }
    }
    callback({{"type", "constants"}, {"constants", new_constants}});
  }
  auto factory = PrimitiveFactory();

  // Callback for each primitive in the tape
  for (auto& arr : tape) {
    if (!arr.has_primitive()) {
      continue;
    }
    auto [name, state] = factory.extract_state(arr.primitive_ptr());
    callback(
        {{"type", "primitive"},
         {"inputs", to_vector_data(arr.inputs())},
         {"outputs", to_vector_data(arr.outputs())},
         {"name", name},
         {"arguments", state}});
  }
}

void FunctionExporter::export_function(const Args& args, const Kwargs& kwargs) {
  if (closed) {
    throw std::runtime_error(
        "[export_function] Attempting to write after exporting is closed.");
  }
  auto sorted_kwargs =
      std::map<std::string, array>(kwargs.begin(), kwargs.end());
  auto [fentry, inserted] = ftable->emplace(args, sorted_kwargs);
  if (!inserted) {
    throw std::runtime_error(
        "[export_function] Attempting to export a function twice with "
        "the same signature is not allowed.");
  }

  // Flatten the inputs to the function for tracing
  std::vector<std::string> kwarg_keys;
  auto inputs = args;
  for (auto& [k, v] : sorted_kwargs) {
    kwarg_keys.push_back(k);
    inputs.push_back(v);
  }

  auto flat_fun = [this, &kwarg_keys](const Args& flat_args) {
    auto args = Args(flat_args.begin(), flat_args.end() - kwarg_keys.size());
    Kwargs kwargs;
    auto it = flat_args.end() - kwarg_keys.size();
    ;
    for (auto& k : kwarg_keys) {
      kwargs.insert({k, *it++});
    }
    return detail::ArraysAndExtra{fun(args, kwargs), nullptr};
  };

  // Trace to build the graph
  auto [trace_inputs, trace_outputs, extra] =
      detail::compile_trace(flat_fun, inputs, ftable->shapeless);

  // DFS the graph and get the tape
  auto [tape, parents_map] =
      detail::compile_dfs(trace_inputs, trace_outputs, inputs);

  detail::compile_simplify(tape, parents_map, trace_outputs, /* passes */ 3);

  // Update the table entry
  fentry.kwarg_keys = kwarg_keys;
  fentry.inputs = trace_inputs;

  count++;

  if (callback) {
    export_with_callback(trace_inputs, trace_outputs, tape, kwarg_keys);
    return;
  }

  // Update the header
  auto pos = os.tell();
  os.seek(0);
  write_header(os, count, ftable->shapeless);
  os.seek(pos);
  serialize(os, kwarg_keys);

  auto arrays_to_ids = [](const std::vector<array>& arrs) {
    std::vector<uint64_t> ids;
    for (auto& arr : arrs) {
      ids.push_back(arr.id());
    }
    return ids;
  };

  // Inputs and outputs
  auto trace_input_ids = arrays_to_ids(trace_inputs);
  serialize(os, trace_input_ids);
  serialize(os, trace_inputs);
  serialize(os, arrays_to_ids(trace_outputs));

  std::unordered_set<std::uintptr_t> input_set(
      trace_input_ids.begin(), trace_input_ids.end());

  // Tape
  auto factory = PrimitiveFactory();
  serialize(os, static_cast<uint64_t>(tape.size()));
  for (auto& arr : tape) {
    serialize(os, static_cast<uint64_t>(arr.id()));
    if (arr.has_primitive()) {
      serialize(os, true);
      serialize(os, arrays_to_ids(arr.inputs()));
      factory.save(os, arr.primitive_ptr());
      serialize(os, static_cast<uint64_t>(arr.siblings().size()));
      if (arr.siblings().empty()) {
        serialize(os, arr.shape());
        serialize(os, arr.dtype());
      } else {
        auto outputs = arr.outputs();
        serialize(os, arrays_to_ids(outputs));

        std::vector<Shape> shapes;
        std::vector<Dtype> dtypes;
        for (auto& o : outputs) {
          shapes.push_back(o.shape());
          dtypes.push_back(o.dtype());
        }
        serialize(os, shapes);
        serialize(os, dtypes);
      }
    } else {
      serialize(os, false);
      if (input_set.find(arr.id()) == input_set.end()) {
        serialize(os, true);
        // Save constant data if not already saved
        if (constants.insert({arr.id(), arr}).second) {
          serialize(os, arr.shape());
          serialize(os, arr.dtype());
          os.write(arr.data<char>(), arr.nbytes());
        }
      } else {
        serialize(os, false);
      }
    }
  }
}

// Export a trace taken with positional arguments only.
void FunctionExporter::operator()(const Args& args) {
  export_function(args, Kwargs{});
}

// Export a trace taken with keyword arguments only.
void FunctionExporter::operator()(const Kwargs& kwargs) {
  export_function(Args{}, kwargs);
}

// Export a trace taken with both positional and keyword arguments.
void FunctionExporter::operator()(const Args& args, const Kwargs& kwargs) {
  this->export_function(args, kwargs);
}

// File-backed exporter for a function of positional arguments. The function
// is wrapped so that any keyword arguments are ignored.
FunctionExporter exporter(
    const std::string& file,
    const std::function<std::vector<array>(const Args&)>& fun,
    bool shapeless /* = false */) {
  auto positional_only = [fun](const Args& args, const Kwargs&) {
    return fun(args);
  };
  return FunctionExporter{file, positional_only, shapeless};
}

// File-backed exporter for a function of keyword arguments. The function is
// wrapped so that any positional arguments are ignored, then forwarded to the
// (Args, Kwargs) overload.
FunctionExporter exporter(
    const std::string& file,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    bool shapeless /* = false */) {
  // Take the keyword map by reference: taking it by value would copy the
  // whole map on every invocation of the wrapper.
  return exporter(
      file,
      [fun](const Args&, const Kwargs& kwargs) { return fun(kwargs); },
      shapeless);
}

// File-backed exporter for a function of positional and keyword arguments.
FunctionExporter exporter(
    const std::string& file,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    bool shapeless /* = false */) {
  return FunctionExporter(file, fun, shapeless);
}

// Export a single trace of `fun`, taken with positional `args`, to `file`.
void export_function(
    const std::string& file,
    const std::function<std::vector<array>(const Args&)>& fun,
    const Args& args,
    bool shapeless /* = false */) {
  auto one_shot = exporter(file, fun, shapeless);
  one_shot(args);
}

// Export a single trace of `fun`, taken with keyword `kwargs`, to `file`.
void export_function(
    const std::string& file,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    const Kwargs& kwargs,
    bool shapeless /* = false */) {
  auto one_shot = exporter(file, fun, shapeless);
  one_shot(kwargs);
}

// Export a single trace of `fun`, taken with `args` and `kwargs`, to `file`.
void export_function(
    const std::string& file,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    const Args& args,
    const Kwargs& kwargs,
    bool shapeless /* = false */) {
  auto one_shot = exporter(file, fun, shapeless);
  one_shot(args, kwargs);
}

// Callback-backed exporter for a function of positional arguments. The
// function is wrapped so that any keyword arguments are ignored.
FunctionExporter exporter(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&)>& fun,
    bool shapeless /* = false */) {
  auto positional_only = [fun](const Args& args, const Kwargs&) {
    return fun(args);
  };
  return FunctionExporter{callback, positional_only, shapeless};
}

// Callback-backed exporter for a function of keyword arguments. The function
// is wrapped so that any positional arguments are ignored, then forwarded to
// the (Args, Kwargs) overload.
FunctionExporter exporter(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    bool shapeless /* = false */) {
  // Take the keyword map by reference: taking it by value would copy the
  // whole map on every invocation of the wrapper.
  return exporter(
      callback,
      [fun](const Args&, const Kwargs& kwargs) { return fun(kwargs); },
      shapeless);
}

// Callback-backed exporter for a function of positional and keyword
// arguments.
FunctionExporter exporter(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    bool shapeless /* = false */) {
  return FunctionExporter(callback, fun, shapeless);
}

// Report a single trace of `fun`, taken with positional `args`, to `callback`.
void export_function(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&)>& fun,
    const Args& args,
    bool shapeless /* = false */) {
  auto one_shot = exporter(callback, fun, shapeless);
  one_shot(args);
}

// Report a single trace of `fun`, taken with keyword `kwargs`, to `callback`.
void export_function(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    const Kwargs& kwargs,
    bool shapeless /* = false */) {
  auto one_shot = exporter(callback, fun, shapeless);
  one_shot(kwargs);
}

// Report a single trace of `fun`, taken with `args` and `kwargs`, to
// `callback`.
void export_function(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    const Args& args,
    const Kwargs& kwargs,
    bool shapeless /* = false */) {
  auto one_shot = exporter(callback, fun, shapeless);
  one_shot(args, kwargs);
}

// Call the imported function with keyword arguments only.
std::vector<array> ImportedFunction::operator()(const Kwargs& kwargs) const {
  return (*this)(Args{}, kwargs);
}

// Call the imported function with positional arguments only.
std::vector<array> ImportedFunction::operator()(const Args& args) const {
  return (*this)(args, Kwargs{});
}

// Call the imported function.
//
// The stored function whose signature matches `args` and `kwargs` is looked
// up, the inputs are flattened (positional first, then keyword arguments in
// sorted key order), and the stored graph is replayed on them.
//
// Throws std::invalid_argument, listing the available signatures and the
// signature of this call, when no stored function matches.
std::vector<array> ImportedFunction::operator()(
    const Args& args,
    const Kwargs& kwargs) const {
  const std::map<std::string, array> sorted_kwargs(
      kwargs.begin(), kwargs.end());

  if (auto* matched = ftable->find(args, sorted_kwargs)) {
    auto flat_inputs = args;
    for (auto& entry : sorted_kwargs) {
      flat_inputs.push_back(entry.second);
    }
    return detail::compile_replace(
        matched->tape,
        matched->inputs,
        matched->outputs,
        flat_inputs,
        ftable->shapeless);
  }

  // No match: describe what is available and what was passed.
  std::ostringstream msg;
  msg << "[import_function::call] No imported function found which matches "
      << "the given positional and keyword arguments. Possible functions include:\n";
  ftable->print_functions(msg);
  msg << "\nCalled with " << args.size() << " positional inputs and "
      << kwargs.size() << " keyword inputs:\n";
  int position = 1;
  for (auto& in : args) {
    msg << "  " << position << ": " << in.shape() << " " << in.dtype() << "\n";
    ++position;
  }
  for (auto& [k, in] : kwargs) {
    msg << "  \"" << k << "\": " << in.shape() << " " << in.dtype() << "\n";
  }
  throw std::invalid_argument(msg.str());
}

// Load every function stored in `file` and return a callable wrapper.
ImportedFunction import_function(const std::string& file) {
  return ImportedFunction(file);
}

// Read every exported function from `file` into the function table.
//
// The header (version string, function count, shapeless flag) is read first,
// followed by `function_count` function records in the layout written by
// FunctionExporter::export_function. Constant data is not read eagerly: each
// constant becomes an array backed by a Load primitive that refers to the
// reader and the constant's byte offset. Constants are shared across functions
// by id.
//
// Throws std::runtime_error if the file cannot be opened or a record is
// internally inconsistent. Unknown ids surface as std::out_of_range from the
// id lookups, and unknown primitives as std::invalid_argument from the
// factory.
ImportedFunction::ImportedFunction(const std::string& file)
    : ftable(std::make_shared<FunctionTable>()) {
  auto is_ptr = std::make_shared<Reader>(file);
  auto& is = *is_ptr;
  if (!is.is_open()) {
    throw std::runtime_error("[import_function] Failed to open " + file);
  }

  // Parse header. The version string is read to advance the reader; it is not
  // otherwise used here.
  [[maybe_unused]] auto mlx_version = deserialize<std::string>(is);
  auto function_count = deserialize<int>(is);
  ftable->shapeless = deserialize<bool>(is);
  std::unordered_map<std::uintptr_t, array> constants;

  auto import_one = [&]() {
    auto kwarg_keys = deserialize<std::vector<std::string>>(is);

    std::unordered_map<uint64_t, array> array_map;
    auto trace_input_ids = deserialize<std::vector<uint64_t>>(is);
    auto trace_inputs = deserialize<std::vector<array>>(is);
    // Guard against truncated or corrupted records before indexing.
    if (trace_input_ids.size() != trace_inputs.size() ||
        kwarg_keys.size() > trace_inputs.size()) {
      throw std::runtime_error(
          "[import_function] Malformed function inputs in " + file);
    }
    for (size_t k = 0; k < trace_inputs.size(); ++k) {
      array_map.emplace(trace_input_ids[k], trace_inputs[k]);
    }
    auto trace_output_ids = deserialize<std::vector<uint64_t>>(is);

    std::vector<array> tape;
    auto tape_size = deserialize<uint64_t>(is);
    tape.reserve(tape_size);

    auto factory = PrimitiveFactory();
    for (size_t i = 0; i < tape_size; ++i) {
      auto id = deserialize<uint64_t>(is);
      if (deserialize<bool>(is)) {
        // Array produced by a primitive
        auto input_ids = deserialize<std::vector<uint64_t>>(is);
        std::vector<array> inputs;
        inputs.reserve(input_ids.size());
        for (auto input_id : input_ids) {
          inputs.push_back(array_map.at(input_id));
        }
        std::shared_ptr<Primitive> prim = factory.load(is);
        auto num_siblings = deserialize<uint64_t>(is);
        if (num_siblings == 0) {
          auto shape = deserialize<Shape>(is);
          auto type = deserialize<Dtype>(is);
          tape.emplace_back(
              std::move(shape), type, std::move(prim), std::move(inputs));
          array_map.emplace(id, tape.back());
        } else {
          // Multi-output primitive: rebuild all outputs, register each under
          // its id, and put the one with this record's id on the tape.
          auto ids = deserialize<std::vector<uint64_t>>(is);
          auto shapes = deserialize<std::vector<Shape>>(is);
          auto types = deserialize<std::vector<Dtype>>(is);
          auto arrays = array::make_arrays(
              std::move(shapes),
              std::move(types),
              std::move(prim),
              std::move(inputs));
          if (ids.size() != arrays.size()) {
            throw std::runtime_error(
                "[import_function] Malformed primitive outputs in " + file);
          }
          for (size_t k = 0; k < arrays.size(); ++k) {
            auto sid = ids[k];
            if (sid == id) {
              tape.push_back(arrays[k]);
            }
            array_map.emplace(sid, arrays[k]);
          }
        }
      } else {
        if (deserialize<bool>(is)) {
          // Load constant
          if (auto it = constants.find(id); it != constants.end()) {
            tape.push_back(it->second);
          } else {
            auto shape = deserialize<Shape>(is);
            auto type = deserialize<Dtype>(is);
            size_t offset = is.tell();
            tape.push_back(array(
                std::move(shape),
                type,
                std::make_shared<Load>(
                    default_stream(Device::cpu), is_ptr, offset),
                {}));
            // Skip over the constant's bytes; they are read on demand.
            is.seek(offset + tape.back().nbytes());
            constants.insert({id, tape.back()});
          }
          array_map.emplace(id, tape.back());
        } else {
          // Function inputs are in the map
          tape.push_back(array_map.at(id));
        }
      }
    }

    std::vector<array> trace_outputs;
    trace_outputs.reserve(trace_output_ids.size());
    for (auto output_id : trace_output_ids) {
      trace_outputs.push_back(array_map.at(output_id));
    }
    ftable->insert(
        std::move(kwarg_keys),
        std::move(trace_inputs),
        std::move(trace_outputs),
        std::move(tape));
  };

  for (int i = 0; i < function_count; ++i) {
    import_one();
  }
}

} // namespace mlx::core


================================================
FILE: mlx/export.h
================================================
// Copyright © 2024 Apple Inc.

#pragma once

#include <optional>
#include <set>
#include <unordered_map>
#include <variant>
#include "mlx/api.h"
#include "mlx/array.h"

namespace mlx::core {

// Positional arguments of an exported / imported function.
using Args = std::vector<array>;
// Keyword arguments of an exported / imported function, keyed by name.
using Kwargs = std::unordered_map<std::string, array>;

// Possible types for a Primitive's state
using StateT = std::variant<
    bool,
    int,
    size_t,
    float,
    double,
    Dtype,
    Shape,
    Strides,
    std::vector<int>,
    std::vector<size_t>,
    std::vector<std::tuple<bool, bool, bool>>,
    std::vector<std::variant<bool, int, float>>,
    std::optional<float>,
    std::string>;

// One event passed to an export callback: a map from field name to value.
// The variant alternatives are, in order: (name, shape, dtype) descriptions
// of arrays, (name, array) pairs, (string, string) pairs, a list of primitive
// state values, and a plain string.
using ExportCallbackInput = std::unordered_map<
    std::string,
    std::variant<
        std::vector<std::tuple<std::string, Shape, Dtype>>,
        std::vector<std::pair<std::string, array>>,
        std::vector<std::pair<std::string, std::string>>,
        std::vector<StateT>,
        std::string>>;
// Callable invoked once per export event.
using ExportCallback = std::function<void(const ExportCallbackInput&)>;

struct FunctionExporter;

/**
 * Make an exporter to save multiple traces of a given function to
 * the same file.
 *
 * The overloads differ only in the signature of `fun`: positional arguments,
 * keyword arguments, or both. The file is opened and the header is written
 * when the exporter is created; std::runtime_error is thrown if the file
 * cannot be opened.
 *
 * @param file Path of the file to write.
 * @param fun The function to trace.
 * @param shapeless If true, input shapes are ignored when matching call
 *     signatures (only dtype and number of dimensions are compared).
 */
MLX_API FunctionExporter exporter(
    const std::string& file,
    const std::function<std::vector<array>(const Args&)>& fun,
    bool shapeless = false);

MLX_API FunctionExporter exporter(
    const std::string& file,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    bool shapeless = false);

MLX_API FunctionExporter exporter(
    const std::string& file,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    bool shapeless = false);

/**
 * Export a function to a file.
 *
 * Each overload creates an exporter for `file` and exports a single trace of
 * `fun` taken with the given arguments.
 */
MLX_API void export_function(
    const std::string& file,
    const std::function<std::vector<array>(const Args&)>& fun,
    const Args& args,
    bool shapeless = false);

MLX_API void export_function(
    const std::string& file,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    const Kwargs& kwargs,
    bool shapeless = false);

MLX_API void export_function(
    const std::string& file,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    const Args& args,
    const Kwargs& kwargs,
    bool shapeless = false);

struct ImportedFunction;

/**
 * Import a function from a file.
 *
 * All functions stored in `file` are loaded; the returned object selects
 * among them by the arguments it is called with. Throws std::runtime_error if
 * the file cannot be opened.
 */
MLX_API ImportedFunction import_function(const std::string& file);

/**
 * Make an exporter to export multiple traces of a given function with the same
 * callback.
 *
 * Instead of writing to a file, each export reports the traced graph to
 * `callback` as a sequence of events (inputs, keyword inputs, outputs,
 * constants, and one event per primitive).
 */
MLX_API FunctionExporter exporter(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&)>& fun,
    bool shapeless = false);

MLX_API FunctionExporter exporter(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    bool shapeless = false);

MLX_API FunctionExporter exporter(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    bool shapeless = false);

/**
 * Export a function with a callback.
 *
 * Each overload creates a callback-backed exporter and exports a single trace
 * of `fun` taken with the given arguments.
 */
MLX_API void export_function(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&)>& fun,
    const Args& args,
    bool shapeless = false);

MLX_API void export_function(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Kwargs&)>& fun,
    const Kwargs& kwargs,
    bool shapeless = false);

MLX_API void export_function(
    const ExportCallback& callback,
    const std::function<std::vector<array>(const Args&, const Kwargs&)>& fun,
    const Args& args,
    const Kwargs& kwargs,
    bool shapeless = false);

} // namespace mlx::core

#include "mlx/export_impl.h"


================================================
FILE: mlx/export_impl.h
================================================
// Copyright © 2024 Apple Inc.

#include "mlx/api.h"
#include "mlx/io/load.h"

#pragma once

namespace mlx::core {

struct FunctionTable;

// Exports traces of a single function, either to a file or to a callback.
// Instances are created through the `exporter` factory functions (the
// constructors are private) and can be moved but not copied.
struct MLX_API FunctionExporter {
  // Export a trace taken with the given positional arguments.
  void operator()(const std::initializer_list<array>& args) {
    this->operator()(Args(args));
  }
  void operator()(const Args& args);
  // Export a trace taken with the given keyword arguments.
  void operator()(const Kwargs& kwargs);
  // Export a trace taken with positional and keyword arguments.
  void operator()(const Args& args, const Kwargs& kwargs);

  // Mark the exporter as closed; exporting afterwards throws
  // std::runtime_error.
  void close();

  FunctionExporter(const FunctionExporter&) = delete;
  FunctionExporter& operator=(const FunctionExporter&) = delete;
  FunctionExporter(FunctionExporter&& other) = default;

 private:
  // The factory functions below need access to the private constructors.
  friend MLX_API FunctionExporter exporter(
      const std::string&,
      const std::function<std::vector<array>(const Args&)>&,
      bool shapeless);

  friend MLX_API FunctionExporter exporter(
      const std::string&,
      const std::function<std::vector<array>(const Kwargs&)>&,
      bool shapeless);

  friend MLX_API FunctionExporter exporter(
      const std::string&,
      const std::function<std::vector<array>(const Args&, const Kwargs&)>&,
      bool shapeless);

  friend MLX_API FunctionExporter exporter(
      const ExportCallback&,
      const std::function<std::vector<array>(const Args&)>&,
      bool shapeless);

  friend MLX_API FunctionExporter exporter(
      const ExportCallback&,
      const std::function<std::vector<array>(const Kwargs&)>&,
      bool shapeless);

  friend MLX_API FunctionExporter exporter(
      const ExportCallback&,
      const std::function<std::vector<array>(const Args&, const Kwargs&)>&,
      bool shapeless);

  // File-backed exporter: opens `file` and writes the header.
  FunctionExporter(
      const std::string& file,
      std::function<std::vector<array>(const Args&, const Kwargs&)> fun,
      bool shapeless);

  // Callback-backed exporter: nothing is written to a file.
  FunctionExporter(
      const ExportCallback& callback,
      std::function<std::vector<array>(const Args&, const Kwargs&)> fun,
      bool shapeless);

  // Output file (used only when no callback is set).
  io::FileWriter os;
  // When set, traced graphs are reported here instead of written to `os`.
  ExportCallback callback;
  // The function being traced.
  std::function<std::vector<array>(const Args&, const Kwargs& kwargs)> fun;
  // Trace `fun` with the given arguments and export the resulting graph.
  void export_function(const Args& args, const Kwargs& kwargs);
  // Report one traced graph to `callback`.
  void export_with_callback(
      const std::vector<array>& inputs,
      const std::vector<array>& outputs,
      const std::vector<array>& tape,
      const std::vector<std::string>& kwarg_keys);
  // Constants already exported, keyed by array id, so that each constant's
  // data is emitted only once.
  std::unordered_map<std::uintptr_t, array> constants;
  // Number of functions exported so far; written to the file header.
  int count{0};
  // Set by close().
  bool closed{false};
  // Signatures exported so far; used to reject duplicate exports.
  std::shared_ptr<FunctionTable> ftable;
};

// Callable wrapper around the functions loaded from an exported file.
// Created through import_function (the constructors are private).
//
// Each call selects the stored function whose signature matches the given
// arguments and throws std::invalid_argument when none matches.
struct MLX_API ImportedFunction {
  // Call with positional arguments only.
  std::vector<array> operator()(
      const std::initializer_list<array>& args) const {
    return this->operator()(Args(args));
  }
  std::vector<array> operator()(const Args& args) const;
  // Call with keyword arguments only.
  std::vector<array> operator()(const Kwargs& kwargs) const;
  // Call with positional and keyword arguments.
  std::vector<array> operator()(const Args& args, const Kwargs& kwargs) const;

 private:
  // Loads every function stored in `file`.
  ImportedFunction(const std::string& file);
  friend MLX_API ImportedFunction import_function(const std::string&);
  // NOTE(review): declared here; no definition is visible in this part of the
  // sources.
  ImportedFunction();

  // The loaded functions, indexed by call signature.
  std::shared_ptr<FunctionTable> ftable;
};

} // namespace mlx::core


================================================
FILE: mlx/fast.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <cassert>
#include <numeric>

#include "mlx/fast.h"
#include "mlx/fast_primitives.h"
#include "mlx/ops.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"

namespace mlx::core::fast {

std::vector<array> Custom::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto [_, vjps] = mlx::core::vjp(fallback_, primals, cotangents);
  std::vector<array> vjp_outs;
  for (int i = 0, j = 0; i < vjps.size(); ++i) {
    if (j < argnums.size() && i == argnums[j]) {
      vjp_outs.push_back(vjps[i]);
      j++;
    }
  }
  return vjp_outs;
}

std::vector<array> Custom::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  std::vector<array> all_tangents;
  for (int i = 0, j = 0; i < primals.size(); i++) {
    if (j < argnums.size() && i == argnums[j]) {
      all_tangents.emplace_back(tangents[j++]);
    } else {
      all_tangents.emplace_back(zeros_like(primals[i]));
    }
  }
  auto [_, jvps] = mlx::core::jvp(fallback_, primals, all_tangents);
  return jvps;
}

// Vectorize by applying vmap to the fallback function. Every output is
// reported as being mapped along axis 0.
std::pair<std::vector<array>, std::vector<int>> Custom::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto vmapped_fallback = mlx::core::vmap(fallback_, axes);
  auto outputs = vmapped_fallback(inputs);
  std::vector<int> out_axes(outputs.size(), 0);
  return {outputs, out_axes};
}

// Root-mean-square normalization over the last axis of `x`, optionally scaled
// by `weight`.
//
// Validation (each failure throws std::invalid_argument):
//   * `x` must have at least one dimension,
//   * `weight`, when given, must be one-dimensional with as many elements as
//     the last dimension of `x`,
//   * the result type (that of `x`, or of `x` and `weight` combined) must be
//     floating point.
//
// The fallback computes x * rsqrt(mean(x^2, -1) + eps) in float32, casts to
// the output type and then applies the weight. When RMSNorm::use_fallback(s)
// is false an RMSNorm primitive is created instead, with the fallback attached
// to it. When no weight is given a scalar 1 is passed in its place.
array rms_norm(
    const array& x,
    const std::optional<array>& weight,
    float eps,
    StreamOrDevice s_ /* = {} */) {
  bool has_weight = weight.has_value();

  if (x.ndim() == 0) {
    std::ostringstream msg;
    msg << "[rms_norm] Input must have at least 1 dimension but got input with "
           "0 dimensions.";
    throw std::invalid_argument(msg.str());
  }
  if (has_weight) {
    if (weight->ndim() != 1) {
      std::ostringstream msg;
      msg << "[rms_norm] weight must have 1 dimension but has "
          << weight->ndim() << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    if (weight->size() != x.shape(-1)) {
      std::ostringstream msg;
      msg << "[rms_norm] weight must have the same size as the last dimension of"
             " x but has "
          << weight->size() << " elements.";
      throw std::invalid_argument(msg.str());
    }
  }

  auto out_type = (has_weight) ? result_type(x, *weight) : x.dtype();
  if (!issubdtype(out_type, floating)) {
    std::ostringstream msg;
    msg << "[rms_norm] Received unsupported type " << out_type << ".";
    throw std::invalid_argument(msg.str());
  }

  auto s = to_stream(s_);
  auto fallback =
      [has_weight, eps, out_type, s](const std::vector<array>& inputs) {
        // Normalize in float32, then cast back to the output type.
        auto x = astype(inputs[0], float32, s);
        x = multiply(
            x,
            rsqrt(
                add(mean(square(x, s), -1, /* keepdims */ true, s),
                    array(eps, float32),
                    s),
                s),
            s);
        x = astype(x, out_type, s);

        if (has_weight) {
          x = multiply(x, inputs[1], s);
        }

        return std::vector<array>{x};
      };

  auto passed_weight =
      (has_weight) ? astype(*weight, out_type, s) : array(1, out_type);

  if (!RMSNorm::use_fallback(s)) {
    return array(
        x.shape(),
        out_type,
        std::make_shared<RMSNorm>(s, fallback, eps),
        {astype(x, out_type, s), passed_weight});
  }
  return fallback({x, passed_weight})[0];
}

std::vector<array> RMSNorm::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() == 2);
  assert(outputs.size() == 1);
  assert(cotangents.size() == 1);

  auto s = stream();
  auto fallback = [eps = eps_, s](const std::vector<array>& inputs) {
    auto& x = inputs[0];
    auto& w = inputs[1];
    auto& g = inputs[2];

    std::vector<array> vjps;

    auto n = rsqrt(
        add(mean(square(x, s), /* axis= */ -1, /* keepdims= */ true, s),
            array(eps, x.dtype()),
            s),
        s);
    auto n3 = power(n, array(3, x.dtype()), s);

    // df/dx
    auto gw = multiply(g, w, s);
    auto t = mean(multiply(gw, x, s), /* axis= */ -1, /* keepdims= */ true, s);
    t = multiply(multiply(x, t, s), n3, s);
    vjps.push_back(subtract(multiply(gw, n, s), t, s));

    // df/dw
    std::vector<int> axes(g.ndim() - 1);
    std::iota(axes.begin(), axes.end(), 0);
    if (w.ndim() == 0) {
      vjps.push_back(zeros_like(w, s));
    } else {
      vjps.push_back(sum(
          multiply(g, multiply(x, n, s), s), axes, /* keepdims= */ false, s));
    }

    return vjps;
  };

  auto vjps = array::make_arrays(
      {primals[0].shape(), primals[1].shape()},
      {primals[0].dtype(), primals[1].dtype()},
      std::make_shared<RMSNormVJP>(s, fallback, eps_),
      {primals[0], primals[1], cotangents[0]});

  std::vector<array> returned_vjps;
  for (auto& arg : argnums) {
    returned_vjps.push_back(std::move(vjps[arg]));
  }

  return returned_vjps;
}

bool RMSNorm::is_equivalent(const Primitive& other) const {
  const RMSNorm& a_other = static_cast<const RMSNorm&>(other);
  return eps_ == a_other.eps_;
}

bool RMSNormVJP::is_equivalent(const Primitive& other) const {
  const RMSNormVJP& a_other = static_cast<const RMSNormVJP&>(other);
  return eps_ == a_other.eps_;
}

/**
 * Layer normalization over the last axis of `x`, optionally followed by an
 * elementwise affine transform.
 *
 * @param x      Input with at least one dimension.
 * @param weight Optional 1-D scale whose size equals x.shape(-1).
 * @param bias   Optional 1-D shift whose size equals x.shape(-1).
 * @param eps    Constant added to the variance before the reciprocal sqrt.
 * @param s_     Stream or device on which the ops are scheduled.
 * @return Array with the shape of `x`. Its dtype is the promoted type of
 *         `x`, `weight` (and `bias`) when a weight is given, otherwise the
 *         dtype of `x`.
 * @throws std::invalid_argument if `x` is 0-d, if `weight`/`bias` are not 1-D
 *         vectors matching the last dimension of `x`, or if the output type
 *         is not floating point.
 */
array layer_norm(
    const array& x,
    const std::optional<array>& weight,
    const std::optional<array>& bias,
    float eps,
    StreamOrDevice s_ /* = {} */) {
  bool has_weight = weight.has_value();
  bool has_bias = bias.has_value();

  if (x.ndim() == 0) {
    std::ostringstream msg;
    msg << "[layer_norm] Input must have at least 1 dimension but got input with "
           "0 dimensions.";
    throw std::invalid_argument(msg.str());
  }
  if (has_weight) {
    if ((*weight).ndim() != 1) {
      std::ostringstream msg;
      msg << "[layer_norm] weight must have 1 dimension but has "
          << (*weight).ndim() << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    if ((*weight).size() != x.shape(-1)) {
      std::ostringstream msg;
      msg << "[layer_norm] weight must have the same size as the last dimension of"
             " x but has "
          << (*weight).size() << " elements.";
      throw std::invalid_argument(msg.str());
    }
  }
  if (has_bias) {
    if ((*bias).ndim() != 1) {
      std::ostringstream msg;
      msg << "[layer_norm] bias must have 1 dimension but has "
          << (*bias).ndim() << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    if ((*bias).size() != x.shape(-1)) {
      std::ostringstream msg;
      msg << "[layer_norm] bias must have the same size as the last dimension of"
             " x but has "
          << (*bias).size() << " elements.";
      throw std::invalid_argument(msg.str());
    }
  }

  // Note: the bias dtype only participates in promotion when a weight is
  // also present.
  auto out_type = (has_weight)
      ? ((has_bias) ? result_type(x, *weight, *bias) : result_type(x, *weight))
      : x.dtype();
  if (!issubdtype(out_type, floating)) {
    std::ostringstream msg;
    msg << "[layer_norm] Received unsupported type " << out_type << ".";
    throw std::invalid_argument(msg.str());
  }

  auto s = to_stream(s_);

  // Reference implementation built from regular ops. It is used directly
  // when the primitive is unavailable and is handed to the primitive for
  // transformations. inputs = {x, weight, bias}.
  auto fallback = [has_weight, has_bias, eps, out_type, s](
                      const std::vector<array>& inputs) {
    // Statistics are computed in float32 regardless of the input type.
    auto x = astype(inputs[0], float32, s);

    auto mu = mean(x, /* axis= */ -1, /* keepdims= */ true, s);
    auto xc = subtract(x, mu, s);
    auto v = mean(square(xc, s), /* axis= */ -1, /* keepdims= */ true, s);

    // Every op must be given `s`; omitting it would schedule the op on the
    // default stream rather than the one requested by the caller.
    x = multiply(xc, rsqrt(add(v, array(eps, float32), s), s), s);
    x = astype(x, out_type, s);

    // If the LN is affine then transform x according to the weight and bias
    if (has_weight) {
      x = multiply(x, inputs[1], s);
    }
    if (has_bias) {
      x = add(x, inputs[2], s);
    }

    return std::vector<array>{x};
  };

  // Missing weight/bias are replaced by scalar placeholders (1 and 0) so the
  // primitive always receives three inputs.
  auto passed_weight =
      (has_weight) ? astype(*weight, out_type, s) : array(1, out_type);
  auto passed_bias =
      (has_bias) ? astype(*bias, out_type, s) : array(0, out_type);

  if (!LayerNorm::use_fallback(s)) {
    return array(
        x.shape(),
        out_type,
        std::make_shared<LayerNorm>(s, fallback, eps),
        {astype(x, out_type, s), passed_weight, passed_bias});
  }
  return fallback({x, passed_weight, passed_bias})[0];
}

/**
 * Vector-Jacobian product of layer normalization.
 *
 * `primals` is {x, weight, bias} and `cotangents` holds the single output
 * cotangent. A LayerNormVJP node producing all three gradients is created,
 * and the entries selected by `argnums` are returned in that order.
 */
std::vector<array> LayerNorm::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() == 3);
  assert(outputs.size() == 1);
  assert(cotangents.size() == 1);

  auto s = stream();

  // Gradient expressed with regular ops; inputs are
  // {x, weight, bias, cotangent}.
  auto fallback = [eps = eps_, s](const std::vector<array>& inputs) {
    auto& x = inputs[0];
    auto& w = inputs[1];
    auto& b = inputs[2];
    auto& g = inputs[3];

    std::vector<array> vjps;

    // `norm` multiplies the sums below to turn them into means over the last
    // axis (presumably the `true` argument requests the inverse element
    // count; confirm against number_of_elements).
    auto norm = number_of_elements(x, {-1}, true, x.dtype(), s);
    auto sumx = sum(x, /* axis= */ -1, /* keepdims= */ true, s);
    auto sumx2 = sum(square(x, s), /* axis= */ -1, /* keepdims= */ true, s);
    auto mu = multiply(sumx, norm, s);
    auto mu2 = multiply(sumx2, norm, s);
    auto var = subtract(mu2, square(mu, s), s);
    // Pass `s` to rsqrt as well so it is scheduled on this primitive's
    // stream instead of the default one.
    auto n = rsqrt(add(var, array(eps, x.dtype()), s), s);
    auto n3 = power(n, array(3, x.dtype()), s);
    auto x_c = subtract(x, mu, s);

    // df/dx
    auto wg = multiply(w, g, s);
    auto sumwg =
        multiply(sum(wg, /* axis= */ -1, /* keepdims= */ true, s), norm, s);
    auto sumwgxc = multiply(
        sum(multiply(wg, x_c, s), /* axis= */ -1, /* keepdims= */ true, s),
        norm,
        s);
    auto t1 = multiply(multiply(x_c, sumwgxc, s), n3, s);
    auto t2 = multiply(subtract(wg, sumwg, s), n, s);
    vjps.push_back(subtract(t2, t1, s));

    // df/dw: reduce over every axis except the last. A 0-d weight is the
    // placeholder used when no weight was given, so its gradient is zero.
    std::vector<int> axes(g.ndim() - 1);
    std::iota(axes.begin(), axes.end(), 0);
    if (w.ndim() == 0) {
      vjps.push_back(zeros_like(w, s));
    } else {
      vjps.push_back(sum(
          multiply(g, multiply(x_c, n, s), s), axes, /* keepdims= */ false, s));
    }

    // df/db: same placeholder convention as the weight.
    if (b.ndim() == 0) {
      vjps.push_back(zeros_like(b, s));
    } else {
      vjps.push_back(sum(g, axes, /* keepdims= */ false, s));
    }

    return vjps;
  };

  auto vjps = array::make_arrays(
      {primals[0].shape(), primals[1].shape(), primals[2].shape()},
      {primals[0].dtype(), primals[1].dtype(), primals[2].dtype()},
      std::make_shared<LayerNormVJP>(s, fallback, eps_),
      {primals[0], primals[1], primals[2], cotangents[0]});

  // Hand back only the gradients that were requested.
  std::vector<array> returned_vjps;
  returned_vjps.reserve(argnums.size());
  for (int arg : argnums) {
    returned_vjps.push_back(std::move(vjps[arg]));
  }

  return returned_vjps;
}

bool LayerNorm::is_equivalent(const Primitive& other) const {
  const LayerNorm& a_other = static_cast<const LayerNorm&>(other);
  return eps_ == a_other.eps_;
}

bool LayerNormVJP::is_equivalent(const Primitive& other) const {
  const LayerNormVJP& a_other = static_cast<const LayerNormVJP&>(other);
  return eps_ == a_other.eps_;
}

/**
 * Shared implementation of rotary position embedding used by the public
 * `rope` overloads and by RoPE::vjp.
 *
 * @param inputs      {x, offset} or {x, offset, freqs}.
 * @param dims        Number of trailing features that are rotated; must be
 *                    positive, even and at most x.shape(-1).
 * @param traditional If true, consecutive (even, odd) features form a pair;
 *                    otherwise feature i is paired with feature i + dims / 2.
 * @param base        Base of the default frequencies (used when no freqs
 *                    input is present).
 * @param scale       Multiplier applied to the positions.
 * @param forward     If true apply the rotation, otherwise the opposite
 *                    rotation (as used for the gradient).
 * @param s           Stream or device on which the ops are scheduled.
 * @return Array with the shape and dtype of x.
 * @throws std::invalid_argument when any of the checks below fails.
 */
array rope(
    std::vector<array> inputs,
    int dims,
    bool traditional,
    float base,
    float scale,
    bool forward,
    StreamOrDevice s) {
  auto& x = inputs[0];
  auto& offset = inputs[1];
  if (x.ndim() < 3) {
    std::ostringstream msg;
    msg << "[rope] Input must have at least 3 dimensions but got input with "
        << x.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  if (!issubdtype(x.dtype(), floating)) {
    std::ostringstream msg;
    msg << "[rope] Input must be a floating type but got " << x.dtype() << ".";
    throw std::invalid_argument(msg.str());
  }
  if (offset.ndim() > 1) {
    std::ostringstream msg;
    msg << "[rope] offset must have at most one dimension but has shape "
        << offset.shape() << ".";
    throw std::invalid_argument(msg.str());
  }
  // The offset is either a single value or one value per leading (batch)
  // entry of x.
  if (offset.size() != 1 && offset.size() != x.shape(0)) {
    std::ostringstream msg;
    msg << "[rope] offset must be a scalar or vector with " << x.shape(0)
        << " elements but has shape " << offset.shape() << ".";
    throw std::invalid_argument(msg.str());
  }
  if (!issubdtype(offset.dtype(), integer)) {
    std::ostringstream msg;
    msg << "[rope] offset must be an integer but got type " << offset.dtype()
        << ".";
    throw std::invalid_argument(msg.str());
  }
  // Integer offsets whose element size is not 4 bytes are converted to int32.
  if (offset.dtype().size() != 4) {
    inputs[1] = astype(offset, int32, s);
  }
  if (dims <= 0) {
    std::ostringstream msg;
    msg << "[rope] dims must be positive but got " << dims << ".";
    throw std::invalid_argument(msg.str());
  }
  if (dims % 2 != 0) {
    std::ostringstream msg;
    msg << "[rope] dims must be even but got " << dims << ".";
    throw std::invalid_argument(msg.str());
  }
  if (dims > x.shape(-1)) {
    std::ostringstream msg;
    msg << "[rope] dims must not exceed the input's last dimension ("
        << x.shape(-1) << ") but got " << dims << ".";
    throw std::invalid_argument(msg.str());
  }

  // Optional custom frequencies: one value per rotated pair.
  if (inputs.size() == 3 &&
      (inputs[2].ndim() != 1 || inputs[2].shape(0) != dims / 2)) {
    std::ostringstream msg;
    msg << "[rope] freqs must be one dimensional with size " << dims / 2
        << " but got shape " << inputs[2].shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Reference implementation built from regular ops. It is used directly
  // when the primitive is unavailable and is handed to the primitive for
  // transformations.
  auto fallback = [dims, traditional, base, scale, forward, s](
                      std::vector<array> inputs) {
    auto x = inputs[0];
    // Remember the caller's shape; x is brought to 4 dimensions below and the
    // result is reshaped back at the end.
    auto shape = x.shape();
    if (x.ndim() == 3) {
      x = expand_dims(x, 1, s);
    } else if (x.ndim() > 4) {
      x = flatten(x, 1, 1 + (x.ndim() - 4), s);
    }

    auto B = x.shape(0);
    auto N = x.shape(1);
    auto T = x.shape(2);
    auto t = x.dtype();
    // Compute sines and cosines
    auto half_dims = dims / 2;
    auto offset = inputs[1];
    // A vector offset gets two trailing singleton axes so that it broadcasts
    // against the position axis.
    if (offset.size() > 1) {
      offset = expand_dims(offset, {-1, -2}, s);
    }
    // positions = (arange(T) + offset) * scale, in float32.
    auto positions = multiply(
        add(arange(x.shape(2), float32, s), offset, s),
        array(scale, float32),
        s);

    // exp(-i * log(base) / half_dims) for i = 0 .. half_dims - 1.
    auto default_inv_freqs = [&s, base, half_dims]() {
      return exp(
          multiply(
              arange(0, -half_dims, -1, float32, s),
              array(std::log(base) / half_dims, float32),
              s),
          s);
    };

    // User-provided freqs are inverted before use.
    auto inv_freqs =
        inputs.size() == 3 ? reciprocal(inputs[2], s) : default_inv_freqs();
    auto theta = multiply(expand_dims(positions, -1, s), inv_freqs, s);
    // Angles are computed in float32 and cast to x's dtype afterwards.
    auto coss = astype(cos(theta, s), t, s);
    auto sins = astype(sin(theta, s), t, s);

    // Rotates each (x1, x2) pair by theta when `forward` is true and by
    // -theta otherwise.
    auto apply_rope = [forward, s](
                          const array& x1,
                          const array& x2,
                          const array& coss,
                          const array& sins) {
      std::vector<array> outs;
      if (forward) {
        outs.push_back(
            subtract(multiply(x1, coss, s), multiply(x2, sins, s), s));
        outs.push_back(add(multiply(x1, sins, s), multiply(x2, coss, s), s));
      } else {
        outs.push_back(add(multiply(x2, sins, s), multiply(x1, coss, s), s));
        outs.push_back(
            subtract(multiply(x2, coss, s), multiply(x1, sins, s), s));
      }
      return outs;
    };

    if (traditional) {
      // Pairs are interleaved: x1 takes the even features and x2 the odd
      // features of the first `dims` entries.
      auto x1 = slice(x, {0, 0, 0, 0}, {B, N, T, dims}, {1, 1, 1, 2}, s);
      auto x2 = slice(x, {0, 0, 0, 1}, {B, N, T, dims}, {1, 1, 1, 2}, s);
      auto outs = apply_rope(x1, x2, coss, sins);
      // Re-interleave the two halves via a trailing axis and a reshape.
      for (auto& o : outs) {
        o = expand_dims(o, -1, s);
      }
      auto out = reshape(concatenate(outs, -1, s), {B, N, T, dims}, s);
      // Features beyond `dims` are passed through unchanged.
      if (dims < x.shape(-1)) {
        out =
            concatenate({out, slice(x, {0, 0, 0, dims}, x.shape(), s)}, -1, s);
      }
      return std::vector<array>{reshape(out, shape, s)};
    } else {
      // Pairs are split in halves: x1 is [0, half_dims) and x2 is
      // [half_dims, dims) of the last axis.
      auto out_s = x.shape();
      out_s.back() = half_dims;
      auto x1 = slice(x, {0, 0, 0, 0}, out_s, s);
      out_s.back() = dims;
      auto x2 = slice(x, {0, 0, 0, half_dims}, out_s, s);

      auto outs = apply_rope(x1, x2, coss, sins);
      // Features beyond `dims` are passed through unchanged.
      if (dims < x.shape(-1)) {
        outs.push_back(slice(x, {0, 0, 0, dims}, x.shape(), s));
      }
      return std::vector<array>{reshape(concatenate(outs, -1, s), shape, s)};
    }
  };
  auto stream = to_stream(s);
  if (!RoPE::use_fallback(stream)) {
    return array(
        x.shape(),
        x.dtype(),
        std::make_shared<RoPE>(
            stream, fallback, dims, traditional, base, scale, forward),
        std::move(inputs));
  }
  return fallback(std::move(inputs))[0];
}

/**
 * Rotary position embedding with an array-valued offset.
 *
 * Exactly one of `base` and `freqs` must be provided; `freqs` is converted
 * to float32 before being forwarded. Throws std::invalid_argument when both
 * or neither are given.
 */
array rope(
    const array& x,
    int dims,
    bool traditional,
    std::optional<float> base,
    float scale,
    const array& offset,
    const std::optional<array>& freqs /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  const bool has_freqs = freqs.has_value();
  const bool has_base = base.has_value();

  std::vector<array> inputs;
  inputs.reserve(has_freqs ? 3 : 2);
  inputs.push_back(x);
  inputs.push_back(offset);
  if (has_freqs) {
    inputs.push_back(astype(*freqs, float32, s));
  }

  if (has_freqs && has_base) {
    throw std::invalid_argument(
        "[rope] Only one of base or freqs can have a value.");
  }
  if (!has_freqs && !has_base) {
    throw std::invalid_argument("[rope] Neither base nor freqs has a value.");
  }

  // The base is ignored by the implementation when freqs are supplied; 1 is
  // only a placeholder in that case.
  const float effective_base = has_base ? *base : 1.0f;
  return rope(
      std::move(inputs),
      dims,
      traditional,
      effective_base,
      scale,
      /* forward= */ true,
      s);
}

/**
 * Rotary position embedding with a scalar integer offset. The offset is
 * wrapped in an int32 array and forwarded to the array-offset overload.
 */
array rope(
    const array& x,
    int dims,
    bool traditional,
    std::optional<float> base,
    float scale,
    int offset,
    const std::optional<array>& freqs /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  const array offset_arr(offset, int32);
  return rope(x, dims, traditional, base, scale, offset_arr, freqs, s);
}

/**
 * Vector-Jacobian product of RoPE with respect to its first input.
 *
 * The gradient is another RoPE node with the `forward` flag flipped, applied
 * to the cotangent with the same offset (and frequencies, if present).
 * Gradients with respect to the offset or frequencies are not supported and
 * raise std::invalid_argument.
 */
std::vector<array> RoPE::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto s = stream();

  const bool wrt_x_only = argnums.size() <= 1 && argnums[0] == 0;
  if (!wrt_x_only) {
    throw std::invalid_argument(
        "[RoPE::vjp] vjp for offset or frequencies not supported");
  }

  // Fallback for the gradient node: the same op with the direction reversed.
  auto fallback = [dims = dims_,
                   traditional = traditional_,
                   base = base_,
                   scale = scale_,
                   forward = forward_,
                   s](std::vector<array> inputs) {
    auto grad =
        rope(std::move(inputs), dims, traditional, base, scale, !forward, s);
    return std::vector<array>{grad};
  };

  // Inputs mirror the forward op with the cotangent taking the place of x.
  std::vector<array> grad_inputs{cotangents[0], primals[1]};
  if (primals.size() == 3) {
    grad_inputs.push_back(primals[2]);
  }

  auto primitive = std::make_shared<RoPE>(
      s, fallback, dims_, traditional_, base_, scale_, !forward_);
  return {array(
      cotangents[0].shape(),
      cotangents[0].dtype(),
      primitive,
      std::move(grad_inputs))};
}

bool RoPE::is_equivalent(const Primitive& other) const {
  const RoPE& a_other = static_cast<const RoPE&>(other);
  return (
      dims_ == a_other.dims_ && base_ == a_other.base_ &&
      scale_ == a_other.scale_ && traditional_ == a_other.traditional_ &&
      forward_ == a_other.forward_);
}

/**
 * Computes O = softmax(scale * Q @ K.T + mask) @ V.
 *
 * @param queries   Rank-4 array; axis -3 is the number of query heads.
 * @param keys      Rank-4 array; axis -3 is the number of key/value heads,
 *                  which must divide the number of query heads.
 * @param values    Rank-4 array with the same number of heads as `keys`.
 * @param scale     Factor applied to the queries before the matmul.
 * @param mask_mode One of "", "causal" or "array".
 * @param mask_arr  Optional mask with at most 4 dimensions. A boolean mask
 *                  keeps the scores where it is true; any other mask is added
 *                  to the scores. Must not be given with mask_mode "causal".
 * @param sinks     Optional 1-D array with one entry per query head. Each
 *                  entry acts as an extra logit that takes part in the
 *                  softmax normalization and is then discarded.
 * @param s         Stream or device on which the ops are scheduled.
 * @return Array of shape {B, n_q_heads, L_q, values.shape(-1)} whose dtype is
 *         the promoted type of queries, keys and values.
 * @throws std::invalid_argument on invalid shapes, dtypes or mask settings.
 */
array scaled_dot_product_attention(
    const array& queries,
    const array& keys,
    const array& values,
    const float scale,
    const std::string& mask_mode /* = "" */,
    std::optional<array> mask_arr /* = {} */,
    const std::optional<array>& sinks /* = {} */,
    StreamOrDevice s /* = {}*/) {
  for (const auto& tensor : {queries, keys, values}) {
    if (tensor.ndim() != 4) {
      std::ostringstream msg;
      msg << "[scaled_dot_product_attention] input with shape "
          << tensor.shape() << " expected to be rank 4";
      throw std::invalid_argument(msg.str());
    }
  }
  // Check valid mask
  if (mask_mode != "" && mask_mode != "causal" && mask_mode != "array") {
    std::ostringstream msg;
    msg << "[scaled_dot_product_attention] Invalid mask_mode " << mask_mode
        << ". mask_mode must be 'causal', 'array' or ''.";
    throw std::invalid_argument(msg.str());
  }

  bool do_causal = false;
  bool has_mask = false;
  bool has_arr_mask = false;
  bool has_bool_mask = false;

  if (mask_mode == "causal") {
    has_mask = true;
    do_causal = true;

    if (mask_arr) {
      std::ostringstream msg;
      msg << "[scaled_dot_product_attention] Invalid mask_arr for mask_mode "
          << "'causal'. No array mask should be passed.";
      throw std::invalid_argument(msg.str());
    }
  } else if (mask_arr) {
    has_mask = true;
    has_arr_mask = true;
    has_bool_mask = mask_arr->dtype() == bool_;
  }

  if (has_arr_mask && mask_arr->ndim() > 4) {
    std::ostringstream msg;
    msg << "[scaled_dot_product_attention] the mask with shape "
        << mask_arr->shape() << " expected to have at most rank 4.";
    throw std::invalid_argument(msg.str());
  }

  // Keep the shape's own element type to avoid a signed/unsigned comparison.
  const auto batch_dim = queries.shape(0);
  for (const auto& tensor : {keys, values}) {
    if (tensor.shape(0) != batch_dim) {
      std::ostringstream msg;
      msg << "[scaled_dot_product_attention] mismatching batch dimension for input with shape "
          << tensor.shape() << ".";
      throw std::invalid_argument(msg.str());
    }
  }

  // Q, K must have matching last dims (d_k aka 'head_dim');
  if (queries.shape(-1) != keys.shape(-1)) {
    std::ostringstream msg;
    msg << "[scaled_dot_product_attention] query, keys expected to have matching last dimension; found query shape "
        << queries.shape() << " for keys shape " << keys.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  // K, V must have matching number of heads (n_kv_heads);
  auto n_q_heads = queries.shape(-3);
  auto n_kv_heads = keys.shape(-3);

  if (keys.shape(-3) != values.shape(-3)) {
    std::ostringstream msg;
    msg << "[scaled_dot_product_attention] keys, values expected to have matching n_kv_heads; found keys with n_heads "
        << keys.shape(-3) << " for values with n_heads " << values.shape(-3)
        << ".";
    throw std::invalid_argument(msg.str());
  }

  // n_heads % n_kv_heads == 0; n_heads >= 1, n_kv_heads >= 1.
  if (n_q_heads % n_kv_heads != 0) {
    std::ostringstream msg;
    msg << "[scaled_dot_product_attention] n_heads must be a multiple of n_kv_heads, found n_heads "
        << n_q_heads << " for n_kv_heads " << n_kv_heads << ".";
    throw std::invalid_argument(msg.str());
  }

  auto final_type = result_type(queries, keys, values);
  if (!issubdtype(final_type, floating)) {
    std::ostringstream msg;
    msg << "[scaled_dot_product_attention] Received unsupported type "
        << final_type << ".";
    throw std::invalid_argument(msg.str());
  }
  bool has_sinks = sinks.has_value();

  auto q = astype(queries, final_type, s);
  auto k = astype(keys, final_type, s);
  auto v = astype(values, final_type, s);

  // Reference implementation built from regular ops. It is used directly
  // when the primitive is unavailable and is handed to the primitive for
  // transformations. inputs = {q, k, v, [mask], [sinks]}.
  auto fallback = [scale,
                   n_q_heads,
                   n_kv_heads,
                   do_causal,
                   has_sinks,
                   has_arr_mask,
                   s](const std::vector<array>& inputs) {
    auto q = multiply(array(scale, inputs[0].dtype()), inputs[0], s);
    int n_repeats = n_q_heads / n_kv_heads;
    auto k = inputs[1];
    auto v = inputs[2];
    // With grouped heads, split the query heads into {n_kv_heads, n_repeats}
    // and give k/v a singleton axis so they broadcast over the repeats.
    if (n_repeats > 1) {
      q = unflatten(q, 1, {n_kv_heads, n_repeats}, s);
      k = expand_dims(k, 2, s);
      v = expand_dims(v, 2, s);
    }
    auto scores = matmul(q, swapaxes(k, -1, -2, s), s);
    if (has_arr_mask || do_causal) {
      // Mask must be broadcast-compatible with [B, n_q_heads, L_q, L_kv]
      auto make_or_fetch_mask = [&]() {
        if (do_causal) {
          // Query i (aligned to the end of the keys) may attend to key j
          // when i + (kL - qL) >= j.
          int kL = k.shape(-2);
          int qL = q.shape(-2);
          int offset = kL - qL;
          auto q_idx = arange(offset, qL + offset, s);
          auto k_idx = arange(0, kL, s);
          q_idx = expand_dims(q_idx, 1, s);
          k_idx = expand_dims(k_idx, 0, s);
          return greater_equal(q_idx, k_idx, s);
        }
        return inputs[3];
      };
      auto mask = make_or_fetch_mask();

      // Give the mask the same split head axes as the scores.
      if (n_repeats > 1 && mask.ndim() >= 3) {
        if (mask.shape(-3) == 1) {
          mask = expand_dims(mask, -3, s);
        } else {
          mask = unflatten(mask, -3, {n_kv_heads, n_repeats}, s);
        }
      }
      if (mask.dtype() == bool_) {
        scores = where(
            mask, scores, array(finfo(scores.dtype()).min, scores.dtype()), s);
      } else {
        scores = add(scores, mask, s);
      }
    }
    if (has_sinks) {
      auto sinks = inputs.back();
      // scores has shape B N_q N_k L_q L_k
      sinks = expand_dims(sinks, {0, 2, 3}, s);
      if (scores.ndim() == 5) {
        sinks = unflatten(sinks, 1, {n_kv_heads, n_repeats}, s);
      }
      // Prepend the sink as an extra logit column.
      auto bsx_shape = scores.shape();
      bsx_shape.back() = 1;
      scores = concatenate({broadcast_to(sinks, bsx_shape, s), scores}, -1, s);
    }
    scores = softmax(scores, std::vector<int>{-1}, true, s);
    if (has_sinks) {
      // Drop the sink column again after normalization.
      auto start = Shape(scores.ndim(), 0);
      start.back() = 1;
      auto stop = scores.shape();
      scores = slice(scores, std::move(start), std::move(stop), s);
    }
    auto out = matmul(scores, v, s);
    if (n_repeats > 1) {
      out = flatten(out, 1, 2, s);
    }
    return std::vector<array>{out};
  };

  auto stream = to_stream(s);
  std::vector<array> inputs = {q, k, v};
  if (has_arr_mask) {
    // Check type (has_bool_mask was already derived from the mask dtype).
    if (promote_types(mask_arr->dtype(), final_type) != final_type) {
      std::ostringstream msg;
      msg << "[scaled_dot_product_attention] Mask type must promote to output type "
          << final_type << ".";
      throw std::invalid_argument(msg.str());
    } else if (!has_bool_mask) {
      mask_arr = astype(*mask_arr, final_type, stream);
    }
    // Broadcast mask
    auto mask_shape = queries.shape();
    mask_shape.back() = keys.shape(-2);
    inputs.push_back(broadcast_to(*mask_arr, mask_shape, stream));
  }
  if (has_sinks) {
    if (promote_types(sinks->dtype(), final_type) != final_type) {
      std::ostringstream msg;
      msg << "[scaled_dot_product_attention] Type of sinks must promote to output type "
          << final_type << ".";
      throw std::invalid_argument(msg.str());
    }
    if (sinks->ndim() != 1 || sinks->shape(0) != n_q_heads) {
      std::ostringstream msg;
      msg << "[scaled_dot_product_attention] Received invalid shape for sinks "
          << sinks->shape() << ".";
      throw std::invalid_argument(msg.str());
    }
    inputs.push_back(astype(*sinks, final_type, stream));
  }

  // The logsumexp is only requested as a second output when gradients are
  // being traced and the dedicated VJP primitive can consume it.
  bool is_training = detail::in_grad_tracing();
  bool has_fast_vjp = !ScaledDotProductAttentionVJP::use_fallback(q, stream);
  bool output_logsumexp = is_training && has_fast_vjp;
  if (!ScaledDotProductAttention::use_fallback(
          q,
          k,
          v,
          has_mask,
          has_arr_mask,
          do_causal,
          is_training,
          output_logsumexp,
          stream)) {
    if (has_bool_mask && !ScaledDotProductAttention::supports_bool_mask()) {
      // Convert bool mask to additive mask. All ops are scheduled on
      // `stream`, like the rest of the primitive's inputs.
      float inf = std::numeric_limits<float>::infinity();
      array& mask = inputs[3];
      mask = where(
          mask,
          full_like(mask, 0, final_type, stream),
          full_like(mask, -inf, final_type, stream),
          stream);
    }
    Shape out_shape{q.shape(0), q.shape(1), q.shape(2), v.shape(-1)};
    auto primitive = std::make_shared<ScaledDotProductAttention>(
        stream, fallback, scale, do_causal, has_sinks, output_logsumexp);
    if (output_logsumexp) {
      return array::make_arrays(
          {std::move(out_shape), Shape{q.shape(0), q.shape(1), q.shape(2), 1}},
          {final_type, float32},
          primitive,
          std::move(inputs))[0];
    } else {
      return array(
          std::move(out_shape), final_type, primitive, std::move(inputs));
    }
  }
  return fallback(std::move(inputs))[0];
}

std::vector<array> ScaledDotProductAttention::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() >= 3);
  assert(cotangents.size() == outputs.size());

  auto s = stream();
  if (ScaledDotProductAttentionVJP::use_fallback(primals[0], s)) {
    assert(outputs.size() == 1);
    return Custom::vjp(primals, cotangents, argnums, outputs);
  }

  auto fallback = [sdpa = fallback_, s](const std::vector<array>& inputs) {
    std::vector<array> primals(inputs.begin(), std::prev(inputs.end()));
    auto [_, vjps] = mlx::core::vjp(sdpa, primals, {inputs.back()});
    return vjps;
  };

  std::vector<Shape> shapes;
  std::vector<Dtype> dtypes;
  for (int i = 0; i < /* outputs size */ 3; ++i) {
    shapes.push_back(primals[i].shape());
    dtypes.push_back(primals[i].dtype());
  }
  auto primitive = std::make_shared<ScaledDotProductAttentionVJP>(
      s, fallback, scale_, do_causal_, has_sinks_);
  std::vector<array> inputs = primals;
  inputs.push_back(outputs[0]);
  inputs.push_back(outputs[1]);
  inputs.push_back(cotangents[0]);
  auto vjps = array::make_arrays(std::move(shapes), dtypes, primitive, inputs);

  std::vector<array> returned_vjps;
  for (int arg : argnums) {
    if (arg >= 3) {
      throw std::invalid_argument(
          "[scale_dot_product_attention] Does not support VJP with respect "
          " to mask or attention sinks.");
    }
    returned_vjps.push_back(std::move(vjps[arg]));
  }
  return returned_vjps;
}

bool ScaledDotProductAttention::is_equivalent(const Primitive& other) const {
  const ScaledDotProductAttention& a_other =
      static_cast<const ScaledDotProductAttention&>(other);
  return scale_ == a_other.scale_ && do_causal_ == a_other.do_causal_ &&
      has_sinks_ == a_other.has_sinks_ &&
      output_logsumexp_ == a_other.output_logsumexp_;
}

bool ScaledDotProductAttentionVJP::is_equivalent(const Primitive& other) const {
  const ScaledDotProductAttentionVJP& a_other =
      static_cast<const ScaledDotProductAttentionVJP&>(other);
  return scale_ == a_other.scale_ && do_causal_ == a_other.do_causal_ &&
      has_sinks_ == a_other.has_sinks_;
}

bool Quantize::is_equivalent(const Primitive& other) const {
  const Quantize& p_other = static_cast<const Quantize&>(other);
  return (
      p_other.group_size_ == group_size_ && p_other.bits_ == bits_ &&
      p_other.mode_ == mode_ && p_other.dequantize_ == dequantize_);
}

std::vector<Shape> Quantize::output_shapes(const std::vector<array>& inputs) {
  auto& w = inputs[0];
  if (dequantize_) {
    auto out_size = w.shape(-1) * 32 / bits_;
    auto out_shape = w.shape();
    out_shape.back() = out_size;
    return {std::move(out_shape)};
  } else {
    auto wq_shape = w.shape();
    wq_shape.back() = w.shape(-1) * bits_ / 32;
    auto sshape = w.shape();
    sshape.back() = w.shape(-1) / group_size_;
    if (inputs.size() == 2) {
      return {std::move(wq_shape), std::move(sshape)};
    } else {
      auto bshape = sshape;
      return {std::move(wq_shape), std::move(sshape), std::move(bshape)};
    }
  }
}

bool ConvertFP8::is_equivalent(const Primitive& other) const {
  const ConvertFP8& a_other = static_cast<const ConvertFP8&>(other);
  return to_fp8_ == a_other.to_fp8_;
}

} // namespace mlx::core::fast


================================================
FILE: mlx/fast.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <optional>
#include <variant>

#include "mlx/api.h"
#include "mlx/utils.h"

namespace mlx::core::fast {

/**
 * Root-mean-square normalization of `x` with an optional `weight`.
 * `eps` is the constant added to the mean of squares before the reciprocal
 * square root. The result has the shape of `x`.
 */
MLX_API array rms_norm(
    const array& x,
    const std::optional<array>& weight,
    float eps,
    StreamOrDevice s = {});

/**
 * Layer normalization over the last axis of `x`, optionally scaled by
 * `weight` and shifted by `bias` (both 1-D with x.shape(-1) elements).
 * `eps` is added to the variance before the reciprocal square root.
 * Throws std::invalid_argument on invalid shapes or a non-floating result
 * type.
 */
MLX_API array layer_norm(
    const array& x,
    const std::optional<array>& weight,
    const std::optional<array>& bias,
    float eps,
    StreamOrDevice s = {});

/**
 * Rotary position embedding applied to the first `dims` features of the last
 * axis of `x` (at least 3 dimensions, floating type), with a scalar integer
 * position offset.
 *
 * Exactly one of `base` and `freqs` must be given; `freqs` must be 1-D with
 * dims / 2 elements. `traditional` selects interleaved (even, odd) pairs
 * instead of pairing the two halves. Positions are multiplied by `scale`.
 */
MLX_API array rope(
    const array& x,
    int dims,
    bool traditional,
    std::optional<float> base,
    float scale,
    int offset,
    const std::optional<array>& freqs = std::nullopt,
    StreamOrDevice s = {});

/**
 * Same as the overload above, but `offset` is an integer array that is
 * either a single value or a vector with x.shape(0) elements.
 */
MLX_API array rope(
    const array& x,
    int dims,
    bool traditional,
    std::optional<float> base,
    float scale,
    const array& offset,
    const std::optional<array>& freqs = std::nullopt,
    StreamOrDevice s = {});

/**
 * Computes: O = softmax(scale * Q @ K.T + mask) @ V
 *
 * `queries`, `keys` and `values` must be rank 4. `mask_mode` is "", "causal"
 * or "array"; `mask_arr` must not be passed together with "causal". `sinks`,
 * if given, is a 1-D array with one entry per query head.
 **/
MLX_API array scaled_dot_product_attention(
    const array& queries,
    const array& keys,
    const array& values,
    const float scale,
    const std::string& mask_mode = "",
    std::optional<array> mask_arr = {},
    const std::optional<array>& sinks = {},
    StreamOrDevice s = {});

// Value of a named template argument passed to a custom kernel.
using TemplateArg = std::variant<int, bool, Dtype>;
// Scalar argument passed to a precompiled kernel.
using ScalarArg = std::variant<bool, int, float>;

// Callable returned by metal_kernel / cuda_kernel. It takes, in order: the
// input arrays, the output shapes, the output dtypes, two integer triples,
// a list of named template arguments, an optional float, a bool and the
// stream or device; it returns the output arrays.
// NOTE(review): by analogy with precompiled_cuda_kernel below, the triples
// are presumably (grid, threadgroup), the optional float an init value and
// the bool a verbosity flag; confirm against the kernel implementations.
using CustomKernelFunction = std::function<std::vector<array>(
    const std::vector<array>&,
    const std::vector<Shape>&,
    const std::vector<Dtype>&,
    std::tuple<int, int, int>,
    std::tuple<int, int, int>,
    std::vector<std::pair<std::string, TemplateArg>>,
    std::optional<float>,
    bool,
    StreamOrDevice)>;

// Builds a CustomKernelFunction from kernel source text and the names of its
// inputs and outputs. Implemented elsewhere; presumably targets the Metal
// backend, as the name suggests.
MLX_API CustomKernelFunction metal_kernel(
    const std::string& name,
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::string& source,
    const std::string& header = "",
    bool ensure_row_contiguous = true,
    bool atomic_outputs = false);

// Builds a CustomKernelFunction from kernel source text and the names of its
// inputs and outputs. Implemented elsewhere; presumably targets the CUDA
// backend, as the name suggests.
MLX_API CustomKernelFunction cuda_kernel(
    const std::string& name,
    const std::vector<std::string>& input_names,
    const std::vector<std::string>& output_names,
    const std::string& source,
    const std::string& header = "",
    bool ensure_row_contiguous = true,
    int shared_memory = 0);

// Runs an already compiled kernel on `inputs` and returns arrays with the
// given output shapes and dtypes. Implemented elsewhere; parameter meanings
// follow their names.
MLX_API std::vector<array> precompiled_cuda_kernel(
    const std::string& name,
    const std::string& compiled_source,
    const std::vector<array>& inputs,
    const std::vector<Shape>& output_shapes,
    const std::vector<Dtype>& output_dtypes,
    const std::vector<ScalarArg>& scalars,
    std::tuple<int, int, int> grid,
    std::tuple<int, int, int> threadgroup,
    int shared_memory = 0,
    std::optional<float> init_value = std::nullopt,
    bool ensure_row_contiguous = false,
    StreamOrDevice s = {});

} // namespace mlx::core::fast


================================================
FILE: mlx/fast_primitives.h
================================================
// Copyright © 2024 Apple Inc.

#include <optional>
#include <variant>

#include "mlx/primitives.h"

namespace mlx::core::fast {

// Custom primitive accepts a fallback function which it uses for
// transformations. Transformations are virtual so that derived classes may
// override the default behavior.
//
// The fallback maps the primitive's inputs to its outputs using regular ops.
class Custom : public Primitive {
 public:
  // `stream` is forwarded to Primitive; `fallback` is stored for use by the
  // transformations below.
  explicit Custom(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback)
      : Primitive(stream), fallback_(std::move(fallback)) {}

  // Vectorizing transformation; defined out of line.
  virtual std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes) override;

  // Forward-mode derivative; defined out of line.
  virtual std::vector<array> jvp(
      const std::vector<array>& primals,
      const std::vector<array>& tangents,
      const std::vector<int>& argnums) override;

  // Reverse-mode derivative; defined out of line.
  virtual std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

 protected:
  // Reference implementation of the primitive, available to derived classes.
  std::function<std::vector<array>(std::vector<array>)> fallback_;
};

// Primitive node for RMS normalization. It stores the epsilon of the
// normalization and, through Custom, the fallback used for transformations.
class RMSNorm : public Custom {
 public:
  RMSNorm(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      float eps)
      : Custom(stream, std::move(fallback)), eps_(eps) {}

  // When this returns true for `stream`, rms_norm evaluates the fallback
  // directly instead of creating this primitive. Defined elsewhere.
  static bool use_fallback(Stream stream);

  // There is no CPU implementation: evaluating on the CPU always throws.
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  // Custom gradient that creates an RMSNormVJP node.
  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  // DEFINE_NAME / DEFINE_INPUT_OUTPUT_SHAPE are macros defined elsewhere
  // (presumably in mlx/primitives.h).
  DEFINE_NAME(RMSNorm)
  // Equivalent when the eps values are equal.
  bool is_equivalent(const Primitive& other) const override;
  DEFINE_INPUT_OUTPUT_SHAPE()

  // State of the primitive: a null placeholder paired with eps.
  auto state() const {
    return std::make_pair(nullptr, eps_);
  }

 private:
  // Constant added to the mean of squares before the reciprocal sqrt.
  float eps_;
};

// Primitive node for the gradient of RMS normalization. RMSNorm::vjp creates
// it with inputs {x, weight, cotangent} and two outputs (dx, dweight).
class RMSNormVJP : public Custom {
 public:
  RMSNormVJP(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      float eps)
      : Custom(stream, std::move(fallback)), eps_(eps) {}

  // There is no CPU implementation: evaluating on the CPU always throws.
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  // DEFINE_NAME is a macro defined elsewhere (presumably in
  // mlx/primitives.h).
  DEFINE_NAME(RMSNormVJP)
  // Equivalent when the eps values are equal.
  bool is_equivalent(const Primitive& other) const override;
  // State of the primitive: a null placeholder paired with eps.
  auto state() const {
    return std::make_pair(nullptr, eps_);
  }

 private:
  // Epsilon of the forward normalization.
  float eps_;
};

class LayerNorm : public Custom {
 public:
  LayerNorm(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      float eps)
      : Custom(stream, std::move(fallback)), eps_(eps) {}

  static bool use_fallback(Stream s);

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  DEFINE_NAME(LayerNorm)
  bool is_equivalent(const Primitive& other) const override;
  DEFINE_INPUT_OUTPUT_SHAPE()
  auto state() const {
    return std::make_pair(nullptr, eps_);
  }

 private:
  float eps_;
};

class LayerNormVJP : public Custom {
 public:
  LayerNormVJP(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      float eps)
      : Custom(stream, std::move(fallback)), eps_(eps) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_NAME(LayerNormVJP)
  bool is_equivalent(const Primitive& other) const override;
  auto state() const {
    return std::make_pair(nullptr, eps_);
  }

 private:
  float eps_;
};

class RoPE : public Custom {
 public:
  RoPE(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      int dims,
      bool traditional,
      float base,
      float scale,
      bool forward)
      : Custom(stream, std::move(fallback)),
        dims_(dims),
        traditional_(traditional),
        base_(base),
        scale_(scale),
        forward_(forward) {}

  static bool use_fallback(Stream s);

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  DEFINE_NAME(RoPE)
  bool is_equivalent(const Primitive& other) const override;
  DEFINE_INPUT_OUTPUT_SHAPE()
  auto state() const {
    return std::make_tuple(
        nullptr, dims_, traditional_, base_, scale_, forward_);
  }

 private:
  int dims_;
  bool traditional_;
  float base_;
  float scale_;
  bool forward_;
};

class ScaledDotProductAttention : public Custom {
 public:
  ScaledDotProductAttention(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      float scale,
      bool do_causal,
      bool has_sinks,
      bool output_logsumexp)
      : Custom(stream, std::move(fallback)),
        scale_(scale),
        do_causal_(do_causal),
        has_sinks_(has_sinks),
        output_logsumexp_(output_logsumexp) {}

  static bool use_fallback(
      const array& q,
      const array& k,
      const array& v,
      bool has_mask,
      bool has_arr_mask,
      bool do_causal,
      bool is_training,
      bool output_logsumexp,
      Stream s);
  static bool supports_bool_mask();

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }

  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  bool is_equivalent(const Primitive& other) const override;

  DEFINE_NAME(ScaledDotProductAttention);
  DEFINE_INPUT_OUTPUT_SHAPE()
  auto state() const {
    return std::make_tuple(
        nullptr, scale_, do_causal_, has_sinks_, output_logsumexp_);
  }

 private:
  float scale_;
  bool do_causal_;
  bool has_sinks_;
  bool output_logsumexp_;
};

class ScaledDotProductAttentionVJP : public Custom {
 public:
  ScaledDotProductAttentionVJP(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      float scale,
      bool do_causal,
      bool has_sinks)
      : Custom(stream, std::move(fallback)),
        scale_(scale),
        do_causal_(do_causal),
        has_sinks_(has_sinks) {}

  static bool use_fallback(const array& q, Stream s);

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("NYI");
  }

  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_NAME(ScaledDotProductAttentionVJP);
  bool is_equivalent(const Primitive& other) const override;
  auto state() const {
    return std::make_tuple(nullptr, scale_, do_causal_, has_sinks_);
  }

 private:
  float scale_;
  bool do_causal_;
  bool has_sinks_;
};

class ConvertFP8 : public Primitive {
 public:
  explicit ConvertFP8(Stream stream, bool to_fp8)
      : Primitive(stream), to_fp8_(to_fp8) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  const char* name() const override {
    if (to_fp8_) {
      return "ToFP8";
    } else {
      return "FromFP8";
    }
  }
  bool state() const {
    return to_fp8_;
  };

  bool is_equivalent(const Primitive& other) const override;
  DEFINE_INPUT_OUTPUT_SHAPE();

 private:
  bool to_fp8_;
};

class Quantize : public Custom {
 public:
  explicit Quantize(
      Stream stream,
      std::function<std::vector<array>(std::vector<array>)> fallback,
      int group_size,
      int bits,
      QuantizationMode mode,
      bool dequantize)
      : Custom(stream, std::move(fallback)),
        group_size_(group_size),
        bits_(bits),
        mode_(mode),
        dequantize_(dequantize) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_NAME(Quantize);

  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  auto state() const {
    return std::make_tuple(nullptr, group_size_, bits_, mode_, dequantize_);
  }

 private:
  int group_size_;
  int bits_;
  QuantizationMode mode_;
  bool dequantize_;
};

using ScalarArg = std::variant<bool, int, float>;

class CustomKernel : public Primitive {
 public:
  CustomKernel(
      Stream stream,
      std::string name,
      std::string source,
      std::tuple<int, int, int> grid,
      std::tuple<int, int, int> threadgroup,
      std::vector<std::tuple<bool, bool, bool>> shape_infos,
      bool ensure_row_contiguous,
      std::optional<float> init_value,
      std::vector<ScalarArg> scalar_arguments,
      bool is_precompiled,
      int shared_memory)
      : Primitive(stream),
        name_(std::move(name)),
        source_(std::move(source)),
        grid_(grid),
        threadgroup_(threadgroup),
        shape_infos_(std::move(shape_infos)),
        ensure_row_contiguous_(ensure_row_contiguous),
        init_value_(init_value),
        scalar_arguments_(std::move(scalar_arguments)),
        is_precompiled_(is_precompiled),
        shared_memory_(shared_memory) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override {
    throw std::runtime_error("Custom kernels only run on GPU.");
  }

  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_NAME(CustomKernel);
  auto state() const {
    return std::make_tuple(
        name_,
        source_,
        grid_,
        threadgroup_,
        shape_infos_,
        ensure_row_contiguous_,
        init_value_,
        scalar_arguments_,
        is_precompiled_,
        shared_memory_);
  }

 private:
  std::string name_;
  std::string source_;
  std::tuple<int, int, int> grid_;
  std::tuple<int, int, int> threadgroup_;
  std::vector<std::tuple<bool, bool, bool>> shape_infos_;
  bool ensure_row_contiguous_;
  std::optional<float> init_value_;
  std::vector<ScalarArg> scalar_arguments_;
  bool is_precompiled_;
  int shared_memory_;
};

} // namespace mlx::core::fast


================================================
FILE: mlx/fence.h
================================================
// Copyright © 2024 Apple Inc.

#include <vector>

#include "mlx/array.h"

namespace mlx::core {

/* A fence to be used for synchronizing work between streams.
 *
 * Calls to `wait` wait in the given stream until all previous calls to update
 * are complete on their given stream.
 *
 * The array passed to `update` is computed and visible after the call to
 * `wait` returns. The array passed to `wait` will not be read until all
 * previous calls to `update` have completed.
 *
 * Note, calls to `update` should always be from the same thread or explicitly
 * synchronized so that they occur in sequence. Calls to `wait` can be on any
 * thread.
 *
 * For the Metal back-end the fence supports slow (default) and fast mode.
 * Fast mode requires setting the environment variable
 * `MLX_METAL_FAST_SYNCH=1`. Fast mode also requires Metal 3.2+ (macOS 15+,
 * iOS 18+).
 */
class Fence {
 public:
  // Default construction leaves the internal handle null.
  Fence() = default;
  explicit Fence(Stream stream);

  // Both operations are defined out of line (per back-end).
  void update(Stream stream, const array& x, bool cross_device);
  void wait(Stream stream, const array& x);

 private:
  // Type-erased handle to the back-end fence object; null until a stream is
  // supplied through the explicit constructor.
  std::shared_ptr<void> fence_{nullptr};
};

} // namespace mlx::core


================================================
FILE: mlx/fft.cpp
================================================
// Copyright © 2023 Apple Inc.
#include <numeric>
#include <set>

#include "mlx/fft.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core::fft {

/**
 * Core routine shared by fftn / ifftn / rfftn / irfftn.
 *
 * @param a       Input array; must have at least one dimension.
 * @param n       Requested size for each entry of `axes` (same length as
 *                `axes`, every entry > 0). When `real && inverse` the last
 *                entry is the output size and the input is taken to have
 *                n.back() / 2 + 1 elements along the last axis.
 * @param axes    Axes to transform; negative values count from the end.
 * @param real    Selects the real-input (forward) / real-output (inverse)
 *                variants.
 * @param inverse Selects the inverse transform.
 * @param s       Stream or device on which every generated op is scheduled.
 * @return `a` itself when `axes` is empty, otherwise a new array backed by
 *         an `FFT` primitive.
 * @throws std::invalid_argument for a 0-d input, mismatched `n` / `axes`
 *         lengths, duplicated or out-of-range axes, or non-positive sizes.
 */
array fft_impl(
    const array& a,
    Shape n,
    const std::vector<int>& axes,
    bool real,
    bool inverse,
    StreamOrDevice s) {
  if (a.ndim() < 1) {
    throw std::invalid_argument(
        "[fftn] Requires array with at least one dimension.");
  }
  if (n.size() != axes.size()) {
    throw std::invalid_argument("[fftn] Shape and axes have different sizes.");
  }
  if (axes.empty()) {
    return a;
  }

  // Normalize negative axes. Out-of-range negatives wrap around here and are
  // rejected by the range check on the deduplicated set below.
  std::vector<size_t> valid_axes;
  for (int ax : axes) {
    valid_axes.push_back(ax < 0 ? ax + a.ndim() : ax);
  }
  std::set<int> unique_axes(valid_axes.begin(), valid_axes.end());
  if (unique_axes.size() != axes.size()) {
    std::ostringstream msg;
    msg << "[fftn] Duplicated axis received " << axes;
    throw std::invalid_argument(msg.str());
  }
  if (*unique_axes.begin() < 0 || *unique_axes.rbegin() >= a.ndim()) {
    std::ostringstream msg;
    msg << "[fftn] Invalid axis received for array with " << a.ndim()
        << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // In the following shape manipulations there are three cases to consider:
  // 1. In a complex to complex transform (fftn / ifftn) the output
  //    and input shapes are the same.
  // 2. In a real to complex transform (rfftn) n specifies the input dims
  //    and the output dims are n[i] / 2 + 1
  // 3. In a complex to real transform (irfftn) n specifies the output dims
  //    and the input dims are n[i] / 2 + 1

  if (std::any_of(n.begin(), n.end(), [](auto i) { return i <= 0; })) {
    std::ostringstream msg;
    msg << "[fftn] Invalid FFT output size requested " << n;
    throw std::invalid_argument(msg.str());
  }

  // Shape the input must have before the transform is applied.
  auto in_shape = a.shape();
  for (size_t i = 0; i < valid_axes.size(); ++i) {
    in_shape[valid_axes[i]] = n[i];
  }
  if (real && inverse) {
    in_shape[valid_axes.back()] = n.back() / 2 + 1;
  }

  // Decide whether the input must be cropped and / or zero-padded.
  bool any_greater = false;
  bool any_less = false;
  for (size_t i = 0; i < in_shape.size(); ++i) {
    any_greater |= in_shape[i] > a.shape()[i];
    any_less |= in_shape[i] < a.shape()[i];
  }

  auto in = a;
  if (any_less) {
    in = slice(in, Shape(in.ndim(), 0), in_shape, s);
  }
  if (any_greater) {
    // Pad with zeros. The update runs on `s` like every other op built here;
    // without it the padding would be scheduled on the default stream.
    auto tmp = zeros(in_shape, a.dtype(), s);
    in = slice_update(tmp, in, Shape(in.ndim(), 0), in.shape(), s);
  }

  auto out_shape = in_shape;
  if (real) {
    auto ax = valid_axes.back();
    out_shape[ax] = inverse ? n.back() : out_shape[ax] / 2 + 1;
  }

  // rfftn consumes float32 and irfftn produces float32; everything else is
  // complex64.
  auto in_type = real && !inverse ? float32 : complex64;
  auto out_type = real && inverse ? float32 : complex64;
  return array(
      out_shape,
      out_type,
      std::make_shared<FFT>(to_stream(s), valid_axes, inverse, real),
      {astype(in, in_type, s)});
}

/**
 * Overload without explicit sizes: derives `n` from the input's extent along
 * each requested axis and forwards to the sized overload.
 *
 * For the complex-to-real case (`real && inverse`) the last size becomes
 * (extent - 1) * 2.
 */
array fft_impl(
    const array& a,
    const std::vector<int>& axes,
    bool real,
    bool inverse,
    StreamOrDevice s) {
  Shape n;
  for (auto ax : axes) {
    n.push_back(a.shape(ax));
  }
  // `n` is empty when no axes were requested, in which case there is no last
  // entry to adjust (calling back() on an empty container is undefined).
  if (real && inverse && !n.empty()) {
    n.back() = (n.back() - 1) * 2;
  }
  return fft_impl(a, n, axes, real, inverse, s);
}

// Overload without axes: transforms over every axis 0 .. a.ndim() - 1.
array fft_impl(const array& a, bool real, bool inverse, StreamOrDevice s) {
  const int ndim = static_cast<int>(a.ndim());
  std::vector<int> all_axes;
  all_axes.reserve(a.ndim());
  for (int ax = 0; ax < ndim; ++ax) {
    all_axes.push_back(ax);
  }
  return fft_impl(a, all_axes, real, inverse, s);
}

// fftn: every overload forwards to fft_impl with real = false and
// inverse = false.
array fftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = false;
  constexpr bool inverse = false;
  return fft_impl(a, n, axes, real, inverse, s);
}
array fftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = false;
  constexpr bool inverse = false;
  return fft_impl(a, axes, real, inverse, s);
}
array fftn(const array& a, StreamOrDevice s /* = {} */) {
  constexpr bool real = false;
  constexpr bool inverse = false;
  return fft_impl(a, real, inverse, s);
}

// ifftn: every overload forwards to fft_impl with real = false and
// inverse = true.
array ifftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = false;
  constexpr bool inverse = true;
  return fft_impl(a, n, axes, real, inverse, s);
}
array ifftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = false;
  constexpr bool inverse = true;
  return fft_impl(a, axes, real, inverse, s);
}
array ifftn(const array& a, StreamOrDevice s /* = {} */) {
  constexpr bool real = false;
  constexpr bool inverse = true;
  return fft_impl(a, real, inverse, s);
}

// rfftn: every overload forwards to fft_impl with real = true and
// inverse = false.
array rfftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = true;
  constexpr bool inverse = false;
  return fft_impl(a, n, axes, real, inverse, s);
}
array rfftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = true;
  constexpr bool inverse = false;
  return fft_impl(a, axes, real, inverse, s);
}
array rfftn(const array& a, StreamOrDevice s /* = {} */) {
  constexpr bool real = true;
  constexpr bool inverse = false;
  return fft_impl(a, real, inverse, s);
}

// irfftn: every overload forwards to fft_impl with real = true and
// inverse = true.
array irfftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = true;
  constexpr bool inverse = true;
  return fft_impl(a, n, axes, real, inverse, s);
}
array irfftn(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  constexpr bool real = true;
  constexpr bool inverse = true;
  return fft_impl(a, axes, real, inverse, s);
}

array irfftn(const array& a, StreamOrDevice s /* = {} */) {
  constexpr bool real = true;
  constexpr bool inverse = true;
  return fft_impl(a, real, inverse, s);
}

// Rolls `a` by shape / 2 along each of `axes`. An empty `axes` returns `a`
// unchanged; an axis outside [-ndim, ndim) throws std::invalid_argument.
array fftshift(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  if (axes.empty()) {
    return a;
  }

  const int ndim = static_cast<int>(a.ndim());
  Shape shifts;
  for (int ax : axes) {
    // Negative axes count from the end.
    const int axis = (ax < 0) ? ax + ndim : ax;
    const bool in_range = axis >= 0 && axis < ndim;
    if (!in_range) {
      std::ostringstream msg;
      msg << "[fftshift] Invalid axis " << ax << " for array with " << a.ndim()
          << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    // Same shift as NumPy: half the extent, rounded down.
    shifts.push_back(a.shape(axis) / 2);
  }

  return roll(a, shifts, axes, s);
}

// Rolls `a` by -(shape / 2) along each of `axes`. An empty `axes` returns
// `a` unchanged; an axis outside [-ndim, ndim) throws std::invalid_argument.
array ifftshift(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  if (axes.empty()) {
    return a;
  }

  const int ndim = static_cast<int>(a.ndim());
  Shape shifts;
  for (int ax : axes) {
    // Negative axes count from the end.
    const int axis = (ax < 0) ? ax + ndim : ax;
    const bool in_range = axis >= 0 && axis < ndim;
    if (!in_range) {
      std::ostringstream msg;
      msg << "[ifftshift] Invalid axis " << ax << " for array with " << a.ndim()
          << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    // Same shift as NumPy: minus half the extent (rounded down).
    const int extent = a.shape(axis);
    shifts.push_back(-(extent / 2));
  }

  return roll(a, shifts, axes, s);
}

// Default versions that operate on all axes
// Shifts along every axis; a 0-d array is returned as is.
array fftshift(const array& a, StreamOrDevice s /* = {} */) {
  const int ndim = static_cast<int>(a.ndim());
  if (ndim < 1) {
    return a;
  }
  std::vector<int> all_axes;
  all_axes.reserve(a.ndim());
  for (int ax = 0; ax < ndim; ++ax) {
    all_axes.push_back(ax);
  }
  return fftshift(a, all_axes, s);
}

// Inverse shift along every axis; a 0-d array is returned as is.
array ifftshift(const array& a, StreamOrDevice s /* = {} */) {
  const int ndim = static_cast<int>(a.ndim());
  if (ndim < 1) {
    return a;
  }
  std::vector<int> all_axes;
  all_axes.reserve(a.ndim());
  for (int ax = 0; ax < ndim; ++ax) {
    all_axes.push_back(ax);
  }
  return ifftshift(a, all_axes, s);
}

} // namespace mlx::core::fft


================================================
FILE: mlx/fft.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <variant>

#include "array.h"
#include "device.h"
#include "mlx/api.h"
#include "utils.h"

namespace mlx::core::fft {

/** Compute the n-dimensional Fourier Transform. */
MLX_API array fftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
MLX_API array
fftn(const array& a, const std::vector<int>& axes, StreamOrDevice s = {});
MLX_API array fftn(const array& a, StreamOrDevice s = {});

/** Compute the n-dimensional inverse Fourier Transform. */
MLX_API array ifftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
MLX_API array
ifftn(const array& a, const std::vector<int>& axes, StreamOrDevice s = {});
MLX_API array ifftn(const array& a, StreamOrDevice s = {});

/**
 * Compute the one-dimensional Fourier Transform.
 *
 * Both overloads delegate to `fftn` with a single-element axis list (and,
 * for the sized overload, a single-element size list).
 */
inline array fft(const array& a, int n, int axis, StreamOrDevice s = {}) {
  const Shape sizes{n};
  const std::vector<int> axes{axis};
  return fftn(a, sizes, axes, s);
}
inline array fft(const array& a, int axis = -1, StreamOrDevice s = {}) {
  const std::vector<int> axes{axis};
  return fftn(a, axes, s);
}

/**
 * Compute the one-dimensional inverse Fourier Transform.
 *
 * Both overloads delegate to `ifftn` with a single-element axis list (and,
 * for the sized overload, a single-element size list).
 */
inline array ifft(const array& a, int n, int axis, StreamOrDevice s = {}) {
  const Shape sizes{n};
  const std::vector<int> axes{axis};
  return ifftn(a, sizes, axes, s);
}
inline array ifft(const array& a, int axis = -1, StreamOrDevice s = {}) {
  const std::vector<int> axes{axis};
  return ifftn(a, axes, s);
}

/**
 * Compute the two-dimensional Fourier Transform.
 *
 * Thin aliases for `fftn`; the unsized overload defaults to the last two
 * axes.
 */
inline array fft2(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {}) {
  array result = fftn(a, n, axes, s);
  return result;
}
inline array fft2(
    const array& a,
    const std::vector<int>& axes = {-2, -1},
    StreamOrDevice s = {}) {
  array result = fftn(a, axes, s);
  return result;
}

/**
 * Compute the two-dimensional inverse Fourier Transform.
 *
 * Thin aliases for `ifftn`; the unsized overload defaults to the last two
 * axes.
 */
inline array ifft2(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {}) {
  array result = ifftn(a, n, axes, s);
  return result;
}
inline array ifft2(
    const array& a,
    const std::vector<int>& axes = {-2, -1},
    StreamOrDevice s = {}) {
  array result = ifftn(a, axes, s);
  return result;
}

/** Compute the n-dimensional Fourier Transform on a real input. */
MLX_API array rfftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
MLX_API array
rfftn(const array& a, const std::vector<int>& axes, StreamOrDevice s = {});
MLX_API array rfftn(const array& a, StreamOrDevice s = {});

/** Compute the n-dimensional inverse of `rfftn`. */
MLX_API array irfftn(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
MLX_API array
irfftn(const array& a, const std::vector<int>& axes, StreamOrDevice s = {});
MLX_API array irfftn(const array& a, StreamOrDevice s = {});

/**
 * Compute the one-dimensional Fourier Transform on a real input.
 *
 * Both overloads delegate to `rfftn` with a single-element axis list (and,
 * for the sized overload, a single-element size list).
 */
inline array rfft(const array& a, int n, int axis, StreamOrDevice s = {}) {
  const Shape sizes{n};
  const std::vector<int> axes{axis};
  return rfftn(a, sizes, axes, s);
}
inline array rfft(const array& a, int axis = -1, StreamOrDevice s = {}) {
  const std::vector<int> axes{axis};
  return rfftn(a, axes, s);
}
/**
 * Compute the one-dimensional inverse of `rfft`.
 *
 * Both overloads delegate to `irfftn` with a single-element axis list (and,
 * for the sized overload, a single-element size list).
 */
inline array irfft(const array& a, int n, int axis, StreamOrDevice s = {}) {
  const Shape sizes{n};
  const std::vector<int> axes{axis};
  return irfftn(a, sizes, axes, s);
}
inline array irfft(const array& a, int axis = -1, StreamOrDevice s = {}) {
  const std::vector<int> axes{axis};
  return irfftn(a, axes, s);
}

/**
 * Compute the two-dimensional Fourier Transform on a real input.
 *
 * Thin aliases for `rfftn`; the unsized overload defaults to the last two
 * axes.
 */
inline array rfft2(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {}) {
  array result = rfftn(a, n, axes, s);
  return result;
}
inline array rfft2(
    const array& a,
    const std::vector<int>& axes = {-2, -1},
    StreamOrDevice s = {}) {
  array result = rfftn(a, axes, s);
  return result;
}

/**
 * Compute the two-dimensional inverse of `rfft2`.
 *
 * Thin aliases for `irfftn`; the unsized overload defaults to the last two
 * axes.
 */
inline array irfft2(
    const array& a,
    const Shape& n,
    const std::vector<int>& axes,
    StreamOrDevice s = {}) {
  array result = irfftn(a, n, axes, s);
  return result;
}
inline array irfft2(
    const array& a,
    const std::vector<int>& axes = {-2, -1},
    StreamOrDevice s = {}) {
  array result = irfftn(a, axes, s);
  return result;
}
/** Shift the zero-frequency component to the center of the spectrum. */
MLX_API array fftshift(const array& a, StreamOrDevice s = {});

/** Shift the zero-frequency component to the center of the spectrum along
 * specified axes. */
MLX_API array
fftshift(const array& a, const std::vector<int>& axes, StreamOrDevice s = {});

/** The inverse of fftshift. */
MLX_API array ifftshift(const array& a, StreamOrDevice s = {});

/** The inverse of fftshift along specified axes. */
MLX_API array
ifftshift(const array& a, const std::vector<int>& axes, StreamOrDevice s = {});

} // namespace mlx::core::fft


================================================
FILE: mlx/graph_utils.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <functional>
#include <optional>
#include <sstream>
#include <unordered_map>
#include <unordered_set>

#include "mlx/graph_utils.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

// Returns the name registered for `x`, creating one on first use.
//
// Fresh names follow the sequence A, B, ..., Z, AA, AB, ... and are chosen
// from the number of names already stored. The returned reference points
// into `names`.
const std::string& NodeNamer::get_name(const array& x) {
  const auto key = x.id();
  if (auto found = names.find(key); found != names.end()) {
    return found->second;
  }

  // Bijective base-26 encoding of (count + 1); digits come out least
  // significant first, so each one is prepended.
  std::string label;
  for (auto remaining = names.size() + 1; remaining > 0;
       remaining = (remaining - 1) / 26) {
    label.insert(label.begin(), static_cast<char>('A' + (remaining - 1) % 26));
  }
  return names.emplace(key, std::move(label)).first->second;
}

// Registers `n` as the name of `x`, replacing any name already stored.
void NodeNamer::set_name(const array& x, std::string n) {
  names.insert_or_assign(x.id(), std::move(n));
}

// Walks the graph reachable from `outputs` through `inputs()` and invokes
// `callback` on each array after all of its inputs (post-order).
//
// An array is visited at most once. Visiting an array also marks its
// siblings as seen, so the callback is not invoked for them.
void depth_first_traversal(
    std::function<void(array)> callback,
    const std::vector<array>& outputs) {
  std::unordered_set<std::uintptr_t> visited;

  std::function<void(const array&)> visit = [&](const array& node) {
    // insert() reports whether the id was new; bail out on repeats.
    if (!visited.insert(node.id()).second) {
      return;
    }
    for (const auto& sibling : node.siblings()) {
      visited.insert(sibling.id());
    }
    for (const auto& input : node.inputs()) {
      visit(input);
    }
    callback(node);
  };

  for (const auto& out : outputs) {
    visit(out);
  }
}

void print_graph(
    std::ostream& os,
    NodeNamer namer,
    const std::vector<array>& outputs) {
  std::vector<array> tape;
  std::vector<array> inputs;

  depth_first_traversal(
      [&](const array& x) {
        if (x.has_primitive()) {
          tape.push_back(x);
        } else {
          inputs.push_back(x);
        }
      },
      outputs);

  auto print_arrs = [&namer, &os](std::vector<array> arrs) {
    for (auto& arr : arrs) {
      os << namer.get_name(arr);
      os << " [" << arr.shape() << ", " << arr.dtype() << "]";
      if (&arr != &arrs.back()) {
        os << ", ";
      }
    }
  };

  os << "Inputs: ";
  print_arrs(inputs);
  os << "\nOutputs: ";
  print_arrs(outputs);
  os << "\n";

  for (auto& arr : tape) {
    os << arr.primitive().name();
    os << " ";
    print_arrs(arr.inputs());
    os << " -> ";
    print_arrs(arr.outputs());
    os << "\n";
  }
}

void export_to_dot(
    std::ostream& os,
    NodeNamer namer,
    const std::vector<array>& nodes) {
  // Perform one DFS to mark arrays as intermediate if they are used as inputs
  // to other arrays.
  std::unordered_set<std::uintptr_t> intermediate_set;
  depth_first_traversal(
      [&](const array& x) {
        // No primitive so it is an input
        if (!x.has_primitive()) {
          return;
        }

        for (auto& a : x.inputs()) {
          intermediate_set.insert(a.id());
        }
      },
      nodes);

  // Now we got everything we need to make the graph. Arrays can be one of 3
  // things:
  //  1. Inputs, when they have no primitive ie are evaluated
  //  2. Intermediates, when they are the intermediate set
  //  3. Outputs, if they are not inputs and not intermediates

  os << "digraph {" << std::endl;

  depth_first_traversal(
      [&](const array& x) {
        if (!x.has_primitive()) {
          os << "{ rank=source; \"" << namer.get_name(x) << "\"; }"
             << std::endl;
          return;
        }

        // Node for primitive
        if (x.has_primitive()) {
          os << "{ ";
          os << x.primitive_id();
          os << " [label =\"";
          os << x.primitive().name();
          os << "\", shape=rectangle]";
          os << "; }" << std::endl;
          // Arrows to primitive's inputs
          for (auto& a : x.inputs()) {
            os << '"' << namer.get_name(a) << "\" -> " << x.primitive_id()
               << std::endl;
          }
        }

        // Point outputs to their primitive
        for (auto& a : x.outputs()) {
          os << "{ ";
          if (intermediate_set.find(a.id()) == intermediate_set.end()) {
            os << "rank=sink; ";
          }
          os << '"' << namer.get_name(a);
          os << "\"; }" << std::endl;
          if (x.has_primitive()) {
            os << x.primitive_id() << " -> \"" << namer.get_name(a) << '"'
               << std::endl;
          }
        }
      },
      nodes);

  os << "}";
}

} // namespace mlx::core


================================================
FILE: mlx/graph_utils.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <unordered_map>

#include "mlx/api.h"
#include "mlx/array.h"

namespace mlx::core {

struct MLX_API NodeNamer {
  std::unordered_map<std::uintptr_t, std::string> names;

  const std::string& get_name(const array& x);
  void set_name(const array& x, std::string n);
};

MLX_API void print_graph(
    std::ostream& os,
    NodeNamer namer,
    const std::vector<array>& outputs);

inline void print_graph(std::ostream& os, const std::vector<array>& outputs) {
  print_graph(os, NodeNamer{}, outputs);
}

template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
inline void print_graph(std::ostream& os, Arrays&&... outputs) {
  print_graph(
      os, NodeNamer{}, std::vector<array>{std::forward<Arrays>(outputs)...});
}

template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
inline void
print_graph(std::ostream& os, NodeNamer namer, Arrays&&... outputs) {
  print_graph(
      os,
      std::move(namer),
      std::vector<array>{std::forward<Arrays>(outputs)...});
}

MLX_API void export_to_dot(
    std::ostream& os,
    NodeNamer namer,
    const std::vector<array>& outputs);

inline void export_to_dot(std::ostream& os, const std::vector<array>& outputs) {
  export_to_dot(os, NodeNamer{}, outputs);
}

template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
inline void export_to_dot(std::ostream& os, Arrays&&... outputs) {
  export_to_dot(
      os, NodeNamer{}, std::vector<array>{std::forward<Arrays>(outputs)...});
}

template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
inline void
export_to_dot(std::ostream& os, NodeNamer namer, Arrays&&... outputs) {
  export_to_dot(
      os,
      std::move(namer),
      std::vector<array>{std::forward<Arrays>(outputs)...});
}

} // namespace mlx::core


================================================
FILE: mlx/io/CMakeLists.txt
================================================
# Sources for the mlx I/O layer. load.cpp is always built; the safetensors
# and GGUF back-ends are each swapped for a no_* source when their option is
# off.
target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp)

# Safetensors: real implementation or the no_safetensors source.
if(MLX_BUILD_SAFETENSORS)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/safetensors.cpp)
else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_safetensors.cpp)
endif()

# GGUF: fetch gguflib at a pinned commit, build it as a static library from
# its two C sources, and link it privately into mlx.
if(MLX_BUILD_GGUF)
  message(STATUS "Downloading gguflib")
  FetchContent_Declare(
    gguflib
    GIT_REPOSITORY https://github.com/antirez/gguf-tools/
    GIT_TAG 8fa6eb65236618e28fd7710a0fba565f7faa1848)
  FetchContent_MakeAvailable(gguflib)
  # gguflib headers are only needed while building mlx (BUILD_INTERFACE).
  target_include_directories(mlx
                             PRIVATE $<BUILD_INTERFACE:${gguflib_SOURCE_DIR}>)
  add_library(gguflib STATIC ${gguflib_SOURCE_DIR}/fp16.c
                             ${gguflib_SOURCE_DIR}/gguflib.c)
  target_link_libraries(mlx PRIVATE $<BUILD_INTERFACE:gguflib>)
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/gguf.cpp
                             ${CMAKE_CURRENT_SOURCE_DIR}/gguf_quants.cpp)
else()
  target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/no_gguf.cpp)
endif()


================================================
FILE: mlx/io/gguf.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cstdint>
#include <cstring>
#include <fstream>
#include <numeric>

#include "mlx/io/gguf.h"
#include "mlx/ops.h"

namespace mlx::core {

// https://github.com/antirez/gguf-tools/blob/af7d88d808a7608a33723fba067036202910acb3/gguflib.h#L102-L108
constexpr int gguf_array_header_size = 12;

// Maps an MLX dtype to the matching GGUF tensor type. Only float32, float16,
// int8, int16 and int32 have a mapping; any other dtype yields an empty
// optional.
std::optional<uint32_t> dtype_to_gguf_tensor_type(const Dtype& dtype) {
  std::optional<uint32_t> gguf_type;
  switch (dtype) {
    case float32:
      gguf_type = GGUF_TYPE_F32;
      break;
    case float16:
      gguf_type = GGUF_TYPE_F16;
      break;
    case int8:
      gguf_type = GGUF_TYPE_I8;
      break;
    case int16:
      gguf_type = GGUF_TYPE_I16;
      break;
    case int32:
      gguf_type = GGUF_TYPE_I32;
      break;
    default:
      break;
  }
  return gguf_type;
}

// Maps a GGUF tensor type to the MLX dtype with the same representation.
// Only F32, F16, I8, I16 and I32 have a mapping; any other type yields an
// empty optional.
std::optional<Dtype> gguf_type_to_dtype(const uint32_t& gguf_type) {
  std::optional<Dtype> dtype;
  switch (gguf_type) {
    case GGUF_TYPE_F32:
      dtype = float32;
      break;
    case GGUF_TYPE_F16:
      dtype = float16;
      break;
    case GGUF_TYPE_I8:
      dtype = int8;
      break;
    case GGUF_TYPE_I16:
      dtype = int16;
      break;
    case GGUF_TYPE_I32:
      dtype = int32;
      break;
    default:
      break;
  }
  return dtype;
}

// Builds the MLX shape of a GGUF tensor. GGML stores dimensions in the
// reverse of the order MLX uses, so `dim` is read from last to first.
Shape get_shape(const gguf_tensor& tensor) {
  Shape dims;
  for (int remaining = tensor.ndim; remaining > 0; --remaining) {
    dims.push_back(tensor.dim[remaining - 1]);
  }
  return dims;
}

// Copies the weights of `tensor` into a freshly allocated buffer and reports
// the dtype of the copied data.
//
// Tensor types with a direct MLX equivalent are copied byte for byte; every
// other type is first converted to float16 via gguf_tensor_to_f16.
//
// Throws std::invalid_argument when `tensor` is null, and std::runtime_error
// when the weight pointer is null or the float16 conversion fails.
std::tuple<allocator::Buffer, Dtype> extract_tensor_data(gguf_tensor* tensor) {
  if (!tensor) {
    throw std::invalid_argument(
        "[extract_tensor_data] Input tensor pointer is null.");
  }

  std::optional<Dtype> direct_dtype = gguf_type_to_dtype(tensor->type);
  if (!direct_dtype.has_value()) {
    // No equivalent type: convert to float16.
    // TODO: Add other dequantization options.
    int16_t* converted = gguf_tensor_to_f16(tensor);
    if (converted == nullptr) {
      throw std::runtime_error("[load_gguf] gguf_tensor_to_f16 failed");
    }
    const size_t nbytes = tensor->num_weights * sizeof(int16_t);
    allocator::Buffer out = allocator::malloc(nbytes);
    memcpy(out.raw_ptr(), converted, nbytes);
    // The conversion result is released with free() once it has been copied.
    free(converted);
    return {out, float16};
  }

  // Equivalent type available: a plain copy is enough.
  if (tensor->weights_data == nullptr) {
    throw std::runtime_error("[load_gguf] NULL tensor data pointer");
  }
  allocator::Buffer out = allocator::malloc(tensor->bsize);
  memcpy(
      out.raw_ptr(),
      tensor->weights_data,
      tensor->num_weights * direct_dtype.value().size());
  return {out, direct_dtype.value()};
}

// Decodes the metadata value `val` of GGUF value type `type` into `value` and
// advances `ctx->off` past the bytes the value occupies in the file.
//
// Scalars become 0-d arrays, strings become std::string, numeric arrays
// become 1-d arrays and string arrays become std::vector<std::string>.
// Float64 data is converted to float32.
//
// Throws std::invalid_argument for an array of arrays and std::runtime_error
// for any other unsupported type.
void set_mx_value_from_gguf(
    gguf_ctx* ctx,
    uint32_t type,
    gguf_value* val,
    GGUFMetaData& value) {
  switch (type) {
    case GGUF_VALUE_TYPE_UINT8:
      value = array(val->uint8, uint8);
      break;
    case GGUF_VALUE_TYPE_INT8:
      value = array(val->int8, int8);
      break;
    case GGUF_VALUE_TYPE_UINT16:
      value = array(val->uint16, uint16);
      break;
    case GGUF_VALUE_TYPE_INT16:
      value = array(val->int16, int16);
      break;
    case GGUF_VALUE_TYPE_UINT32:
      value = array(val->uint32, uint32);
      break;
    case GGUF_VALUE_TYPE_INT32:
      value = array(val->int32, int32);
      break;
    case GGUF_VALUE_TYPE_UINT64:
      value = array(val->uint64, uint64);
      break;
    case GGUF_VALUE_TYPE_INT64:
      value = array(val->int64, int64);
      break;
    case GGUF_VALUE_TYPE_FLOAT32:
      value = array(val->float32, float32);
      break;
    case GGUF_VALUE_TYPE_BOOL:
      value = array(val->boolval, bool_);
      break;
    case GGUF_VALUE_TYPE_STRING:
      value =
          std::string(val->string.string, static_cast<int>(val->string.len));
      break;
    case GGUF_VALUE_TYPE_FLOAT64:
      value = array(val->float64, float32);
      break;
    case GGUF_VALUE_TYPE_ARRAY: {
      ctx->off += gguf_array_header_size; // Skip header
      char* data = reinterpret_cast<char*>(val) + gguf_array_header_size;
      auto size = static_cast<int>(val->array.len);
      if (val->array.type == GGUF_VALUE_TYPE_ARRAY) {
        throw std::invalid_argument(
            "[load_gguf] Only supports loading 1-layer of nested arrays.");
      }
      switch (val->array.type) {
        case GGUF_VALUE_TYPE_UINT8:
          value = array(reinterpret_cast<uint8_t*>(data), {size}, uint8);
          break;
        case GGUF_VALUE_TYPE_INT8:
          value = array(reinterpret_cast<int8_t*>(data), {size}, int8);
          break;
        case GGUF_VALUE_TYPE_UINT16:
          value = array(reinterpret_cast<uint16_t*>(data), {size}, uint16);
          break;
        case GGUF_VALUE_TYPE_INT16:
          value = array(reinterpret_cast<int16_t*>(data), {size}, int16);
          break;
        case GGUF_VALUE_TYPE_UINT32:
          value = array(reinterpret_cast<uint32_t*>(data), {size}, uint32);
          break;
        case GGUF_VALUE_TYPE_INT32:
          value = array(reinterpret_cast<int32_t*>(data), {size}, int32);
          break;
        case GGUF_VALUE_TYPE_UINT64:
          value = array(reinterpret_cast<uint64_t*>(data), {size}, uint64);
          break;
        case GGUF_VALUE_TYPE_INT64:
          // Read the elements as signed so the source type matches the dtype.
          value = array(reinterpret_cast<int64_t*>(data), {size}, int64);
          break;
        case GGUF_VALUE_TYPE_FLOAT32:
          value = array(reinterpret_cast<float*>(data), {size}, float32);
          break;
        case GGUF_VALUE_TYPE_BOOL:
          value = array(reinterpret_cast<bool*>(data), {size}, bool_);
          break;
        case GGUF_VALUE_TYPE_STRING: {
          std::vector<std::string> strs(size);
          for (auto& str : strs) {
            // Each element is a length-prefixed gguf_string; step over the
            // prefix and the characters, both in `data` and in the context.
            auto str_val = reinterpret_cast<gguf_string*>(data);
            data += (str_val->len + sizeof(gguf_string));
            str = std::string(str_val->string, static_cast<int>(str_val->len));
            ctx->off += (str_val->len + sizeof(gguf_string));
          }
          value = std::move(strs);
          break;
        }
        case GGUF_VALUE_TYPE_FLOAT64:
          value = array(reinterpret_cast<double*>(data), {size}, float32);
          break;
        default:
          throw std::runtime_error(
              "[load_gguf] Multiple levels of nested arrays are not supported.");
      }
      break;
    }
    default:
      throw std::runtime_error("[load_gguf] Received unexpected type.");
      break;
  }
  if (type == GGUF_VALUE_TYPE_STRING) {
    ctx->off += (sizeof(gguf_string) + std::get<std::string>(value).size());
  } else if (auto pv = std::get_if<array>(&value); pv) {
    // Float64 values take 8 bytes per element in the file but are held as
    // float32 in memory, so the in-memory byte count would under-advance the
    // offset. Every other type has the same size in both places.
    const bool stored_as_f64 = type == GGUF_VALUE_TYPE_FLOAT64 ||
        (type == GGUF_VALUE_TYPE_ARRAY &&
         val->array.type == GGUF_VALUE_TYPE_FLOAT64);
    ctx->off += stored_as_f64 ? pv->size() * sizeof(double) : pv->nbytes();
  }
}

// Reads every key/value pair from `ctx` until gguf_get_key reports that none
// are left and returns them keyed by name.
std::unordered_map<std::string, GGUFMetaData> load_metadata(gguf_ctx* ctx) {
  std::unordered_map<std::string, GGUFMetaData> metadata;
  for (gguf_key key; gguf_get_key(ctx, &key);) {
    // Create (or find) the entry first, then decode directly into it.
    auto entry =
        metadata.insert({std::string(key.name, key.namelen), GGUFMetaData{}})
            .first;
    set_mx_value_from_gguf(ctx, key.type, key.val, entry->second);
  }
  return metadata;
}

// Reads every tensor from `ctx` and returns them keyed by name.
//
// Q4_0, Q4_1 and Q8_0 tensors are handed to gguf_load_quantized; all other
// tensors go through extract_tensor_data.
//
// Throws std::runtime_error if a non-quantized tensor name is already present
// in the map.
std::unordered_map<std::string, array> load_arrays(gguf_ctx* ctx) {
  std::unordered_map<std::string, array> array_map;
  gguf_tensor tensor;

  auto check_insert = [](const auto& inserted) {
    if (!inserted.second) {
      std::ostringstream msg;
      // Report the clashing key; streaming the mapped array would print its
      // contents instead of the name.
      msg << "[load_gguf] Duplicate parameter name " << inserted.first->first
          << " this can happend when loading quantized tensors.";
      throw std::runtime_error(msg.str());
    }
  };

  while (gguf_get_tensor(ctx, &tensor)) {
    if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1 ||
        tensor.type == GGUF_TYPE_Q8_0) {
      gguf_load_quantized(array_map, tensor);
    } else {
      std::string name(tensor.name, tensor.namelen);
      const auto& [data, dtype] = extract_tensor_data(&tensor);
      array loaded_array = array(data, get_shape(tensor), dtype);
      check_insert(array_map.insert({name, loaded_array}));
    }
  }
  return array_map;
}

// Loads the metadata and tensors of the GGUF file at `file`.
//
// Throws std::invalid_argument when the file cannot be opened for reading and
// std::runtime_error when gguf_open fails.
GGUFLoad load_gguf(const std::string& file, StreamOrDevice s) {
  // Probe the path with a short-lived stream so an unreadable file gets its
  // own error before gguflib is involved.
  if (!std::ifstream(file.c_str()).good()) {
    throw std::invalid_argument("[load_gguf] Failed to open " + file);
  }

  using ContextPtr = std::unique_ptr<gguf_ctx, decltype(&gguf_close)>;
  ContextPtr ctx(gguf_open(file.data()), gguf_close);
  if (ctx == nullptr) {
    throw std::runtime_error("[load_gguf] gguf_init failed");
  }

  // Key/value pairs are read before the tensors, through the same context.
  auto metadata = load_metadata(ctx.get());
  auto arrays = load_arrays(ctx.get());
  return {arrays, metadata};
}

// Appends `val` to the key/value section of `ctx` under `key`.
//
// A 1-d array is written as a GGUF array whose element type is `gguf_type`;
// anything else is written as a plain value of type `gguf_type`.
void append_kv_array(
    gguf_ctx* ctx,
    const std::string& key,
    array& val,
    uint32_t gguf_type) {
  if (val.ndim() != 1) {
    // Not a 1-d array: hand the raw bytes over directly.
    gguf_append_kv(
        ctx,
        key.c_str(),
        key.length(),
        gguf_type,
        reinterpret_cast<void*>(val.data<char>()),
        val.nbytes());
    return;
  }

  // Build "array header + payload" in one scratch buffer.
  const size_t total_bytes = val.nbytes() + gguf_array_header_size;
  std::vector<char> packed(total_bytes);
  auto* header = reinterpret_cast<gguf_value*>(packed.data());
  header->array.type = gguf_type;
  header->array.len = val.size();
  memcpy(
      packed.data() + gguf_array_header_size, val.data<char>(), val.nbytes());
  gguf_append_kv(
      ctx,
      key.c_str(),
      key.length(),
      GGUF_VALUE_TYPE_ARRAY,
      reinterpret_cast<void*>(packed.data()),
      total_bytes);
}

// Writes `array_map` and `metadata` to a GGUF file.
//
// `file` gets a ".gguf" suffix when it does not already end with one, and the
// file is created with GGUF_OVERWRITE. Metadata values may be strings, lists
// of strings, or arrays with at most one dimension. Tensors are evaluated and
// made row contiguous before being written.
//
// Throws std::runtime_error when the file cannot be created, when a metadata
// array is empty, multi-dimensional or non contiguous, when a tensor dtype has
// no GGUF equivalent, or when gguflib fails to append tensor info or data.
// Throws std::invalid_argument for a metadata array dtype that cannot be
// stored and for a tensor that cannot be made row-major.
void save_gguf(
    std::string file,
    std::unordered_map<std::string, array> array_map,
    std::unordered_map<std::string, GGUFMetaData> metadata /* = {} */) {
  // Add .gguf to file name if it is not there
  if (file.length() < 5 || file.substr(file.length() - 5, 5) != ".gguf") {
    file += ".gguf";
  }

  std::unique_ptr<gguf_ctx, decltype(&gguf_close)> ctx(
      gguf_create(file.c_str(), GGUF_OVERWRITE), gguf_close);
  if (!ctx) {
    throw std::runtime_error("[save_gguf] gguf_create failed");
  }

  // Serializes `src` at `dst` as a length-prefixed gguf_string. `dst` must
  // have room for sizeof(gguf_string) + src.length() bytes.
  auto string_to_gguf = [](char* dst, const std::string& src) {
    gguf_string* val = reinterpret_cast<gguf_string*>(dst);
    val->len = src.length();
    memcpy(val->string, src.c_str(), src.length());
  };

  // Save any meta data
  for (auto& [key, value] : metadata) {
    if (auto pv = std::get_if<std::string>(&value); pv) {
      const std::string& str = *pv;
      size_t size = sizeof(gguf_string) + str.length();
      std::vector<char> val_vec(size);
      string_to_gguf(val_vec.data(), str);
      gguf_append_kv(
          ctx.get(),
          key.c_str(),
          key.length(),
          GGUF_VALUE_TYPE_STRING,
          static_cast<void*>(val_vec.data()),
          size);
    } else if (auto pv = std::get_if<std::vector<std::string>>(&value); pv) {
      const auto& str_vec = *pv;
      // Seed the sum with size_t: an int seed would make std::accumulate
      // carry the running total as int.
      size_t mem_size = std::accumulate(
          str_vec.begin(),
          str_vec.end(),
          size_t{0},
          [](size_t accum, const auto& s) { return accum + s.size(); });
      mem_size += str_vec.size() * sizeof(gguf_string) + gguf_array_header_size;
      std::vector<char> val_vec(mem_size);
      gguf_value* val = reinterpret_cast<gguf_value*>(val_vec.data());
      val->array.type = GGUF_VALUE_TYPE_STRING;
      val->array.len = str_vec.size();
      auto str_ptr = val_vec.data() + gguf_array_header_size;
      for (auto& str : str_vec) {
        string_to_gguf(str_ptr, str);
        str_ptr += str.length() + sizeof(gguf_string);
      }
      gguf_append_kv(
          ctx.get(),
          key.c_str(),
          key.length(),
          GGUF_VALUE_TYPE_ARRAY,
          static_cast<void*>(val),
          mem_size);
    } else if (auto pv = std::get_if<array>(&value); pv) {
      array v = *pv;
      if (v.ndim() > 1) {
        throw std::runtime_error(
            "[save_gguf] Cannot save arrays with more than one dimension.");
      }
      if (v.size() == 0) {
        throw std::runtime_error("[save_gguf] Cannot save empty arrays.");
      }

      eval(v);
      if (!v.flags().row_contiguous) {
        v = reshape(flatten(v), v.shape());
        // Evaluate the new array before its flags and data are read, as is
        // done for tensors further down.
        eval(v);
      }
      if (!v.flags().row_contiguous) {
        throw std::runtime_error(
            "[save_gguf] Cannot save non contiguous arrays.");
      }
      switch (v.dtype()) {
        case float32:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_FLOAT32);
          break;
        case int64:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_INT64);
          break;
        case int32:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_INT32);
          break;
        case int16:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_INT16);
          break;
        case int8:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_INT8);
          break;
        case uint64:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_UINT64);
          break;
        case uint32:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_UINT32);
          break;
        case uint16:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_UINT16);
          break;
        case uint8:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_UINT8);
          break;
        case bool_:
          append_kv_array(ctx.get(), key, v, GGUF_VALUE_TYPE_BOOL);
          break;
        default:
          std::ostringstream msg;
          msg << "[save_gguf] array type " << v.dtype()
              << " not support for metadata.";
          throw std::invalid_argument(msg.str());
      }
    } else {
      throw std::runtime_error(
          "[save_gguf] Received unexpected type in metadata");
    }
  }

  // Tensor offsets are relative to data section, so we start at offset 0.
  uint64_t tensor_offset = 0;

  // First, append the tensor info
  for (auto& [key, arr] : array_map) {
    arr.eval();

    // Try to make it row contiguous
    if (!arr.flags().row_contiguous) {
      arr = reshape(flatten(arr), arr.shape());
      arr.eval();
    }

    // Has to be row-major now but, check one more time in case
    // any of the above change in the future
    if (!arr.flags().row_contiguous) {
      throw std::invalid_argument(
          "[save_gguf] can only serialize row-major arrays");
    }

    tensor_offset += gguf_get_alignment_padding(ctx->alignment, tensor_offset);
    const std::optional<uint32_t> gguf_type =
        dtype_to_gguf_tensor_type(arr.dtype());
    if (!gguf_type.has_value()) {
      std::ostringstream msg;
      msg << "[save_gguf] dtype " << arr.dtype() << " is not supported";
      throw std::runtime_error(msg.str());
    }
    const char* tensorname = key.c_str();
    const uint64_t namelen = key.length();
    const uint32_t num_dim = arr.ndim();
    // The dimension order in GGUF is the reverse of the MLX shape.
    std::vector<uint64_t> dim(num_dim);
    for (uint32_t i = 0; i < num_dim; i++) {
      dim[i] = arr.shape()[num_dim - 1 - i];
    }
    if (!gguf_append_tensor_info(
            ctx.get(),
            tensorname,
            namelen,
            num_dim,
            dim.data(),
            gguf_type.value(),
            tensor_offset)) {
      throw std::runtime_error("[save_gguf] gguf_append_tensor_info failed");
    }
    tensor_offset += arr.nbytes();
  }

  // Then, append the tensor weights
  for (const auto& [key, arr] : array_map) {
    if (!gguf_append_tensor_data(
            ctx.get(), (void*)arr.data<void>(), arr.nbytes())) {
      throw std::runtime_error("[save_gguf] gguf_append_tensor_data failed");
    }
  }
}

} // namespace mlx::core


================================================
FILE: mlx/io/gguf.h
================================================
// Copyright © 2023-2024 Apple Inc.
#pragma once

#include "mlx/io.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"
#include "mlx/utils.h"

extern "C" {
#include <gguflib.h>
}

namespace mlx::core {

// Returns the MLX shape of `tensor`: its GGUF dimensions in reverse order.
Shape get_shape(const gguf_tensor& tensor);
// Unpacks a Q4_0, Q4_1 or Q8_0 tensor and adds its packed weights to `a`
// under the tensor name, plus matching ".scales" and ".biases" entries.
void gguf_load_quantized(
    std::unordered_map<std::string, array>& a,
    const gguf_tensor& tensor);

} // namespace mlx::core


================================================
FILE: mlx/io/gguf_quants.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cstdint>
#include <cstring>
#include <numeric>

#include "mlx/io/gguf.h"

namespace mlx::core {

// Repacks the 32 four-bit weights of one block into 16 bytes at `dst`.
//
// `data` points at the start of the block; its first two bytes are skipped.
// The low nibbles of the 16 following bytes are written to dst[0..7] and the
// high nibbles to dst[8..15]. Within each output byte the nibble from the
// even source byte is the low half and the one from the odd source byte the
// high half.
void unpack_32_4(uint8_t* data, int8_t* dst) {
  const uint8_t* packed = data + 2; // skip the two leading bytes
  for (int k = 0; k < 8; ++k) {
    const uint8_t even = packed[2 * k];
    const uint8_t odd = packed[2 * k + 1];
    // First 16 weights are in the lower bits
    dst[k] = static_cast<int8_t>((even & 0x0F) | ((odd & 0x0F) << 4));
    // Last 16 weights are in the higher bits
    dst[8 + k] = static_cast<int8_t>((even >> 4) | (odd & 0xF0));
  }
}

// Extracts (weight, scales, biases) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
void extract_q4_0_data(
    const gguf_tensor& tensor,
    array& weights_arr,
    array& scales_arr,
    array& biases_arr) {
  const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights
  auto data = static_cast<uint8_t*>(tensor.weights_data);
  auto weights = weights_arr.data<int8_t>();
  auto scales = scales_arr.data<float16_t>();
  auto biases = biases_arr.data<float16_t>();
  for (int64_t i = 0; i < scales_arr.size(); i++) {
    scales[i] = *((float16_t*)data);
    biases[i] = -8 * scales[i];
    unpack_32_4(data, weights);
    weights += 16;
    data += bytes_per_block;
  }
}

// Extracts (weight, scales, biases) from Q4_1 tensors.
// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
void extract_q4_1_data(
    const gguf_tensor& tensor,
    array& weights_arr,
    array& scales_arr,
    array& biases_arr) {
  const uint64_t bytes_per_block =
      20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
  auto data = static_cast<uint8_t*>(tensor.weights_data);
  auto weights = weights_arr.data<int8_t>();
  auto scales = scales_arr.data<float16_t>();
  auto biases = biases_arr.data<float16_t>();
  for (int64_t i = 0; i < scales_arr.size(); i++) {
    scales[i] = *((float16_t*)data);
    biases[i] = *((float16_t*)(data) + 1);
    unpack_32_4(data, weights);
    weights += 16;
    data += bytes_per_block;
  }
}

// Extracts (weight, scales, biases) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
void extract_q8_0_data(
    const gguf_tensor& tensor,
    array& weights_arr,
    array& scales_arr,
    array& biases_arr) {
  const uint64_t weights_per_block = 32;
  const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
  auto data = static_cast<uint8_t*>(tensor.weights_data);
  auto weights = weights_arr.data<int8_t>();
  auto scales = scales_arr.data<float16_t>();
  auto biases = biases_arr.data<float16_t>();
  for (int64_t i = 0; i < scales_arr.size(); i++) {
    uint8_t* block_data = data + i * bytes_per_block;
    scales[i] = *((float16_t*)block_data);
    biases[i] = -128 * scales[i];
    for (int64_t j = 0; j < weights_per_block; ++j) {
      uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
      // Original data is in int8_t, so we add a bias of -128 and invert the
      // first bit.
      x ^= 1 << 7;
      weights[i * weights_per_block + j] = x;
    }
  }
}

// Unpacks a Q4_0, Q4_1 or Q8_0 tensor into `a`.
//
// Three entries are added: the packed weights (dtype uint32) under the tensor
// name, and float16 scales and biases under the name with its last seven
// characters replaced by ".scales" and ".biases".
//
// Throws std::runtime_error when the tensor has no dimensions, when its last
// dimension is not a multiple of 32, or when any of the three names is
// already present in `a`.
void gguf_load_quantized(
    std::unordered_map<std::string, array>& a,
    const gguf_tensor& tensor) {
  uint64_t weights_per_byte;
  if (tensor.type == GGUF_TYPE_Q4_0 || tensor.type == GGUF_TYPE_Q4_1) {
    weights_per_byte = 2;
  } else { // tensor.type == GGUF_TYPE_Q8_0
    weights_per_byte = 1;
  }

  std::string name(tensor.name, tensor.namelen);

  auto check_insert = [](const auto& inserted) {
    if (!inserted.second) {
      std::ostringstream msg;
      // Report the clashing key; streaming the mapped array would print its
      // contents instead of the name.
      msg << "[load_gguf] Duplicate parameter name " << inserted.first->first
          << " this can happend when loading quantized tensors.";
      throw std::runtime_error(msg.str());
    }
  };

  auto shape = get_shape(tensor);
  if (shape.empty()) {
    // The code below indexes the last dimension.
    std::ostringstream msg;
    msg << "[load_gguf] tensor " << name << " has no dimensions";
    throw std::runtime_error(msg.str());
  }
  const uint64_t weights_per_block = 32;
  if (shape[shape.size() - 1] % weights_per_block != 0) {
    std::ostringstream msg;
    msg << "[load_gguf] tensor " << name
        << " has incompatible last dim shape: " << shape[shape.size() - 1];
    throw std::runtime_error(msg.str());
  }

  // Weights are packed into 32-bit words: 4 bytes per word.
  auto weights_shape = shape;
  weights_shape.back() /= (weights_per_byte * 4);
  // Seed with size_t so the element count is not accumulated as int.
  auto w_nbytes = uint32.size() *
      std::accumulate(weights_shape.begin(),
                      weights_shape.end(),
                      size_t{1},
                      std::multiplies<size_t>());

  array weights(allocator::malloc(w_nbytes), std::move(weights_shape), uint32);

  // For scales and bias
  shape[shape.size() - 1] = shape[shape.size() - 1] / weights_per_block;
  auto sb_nbytes = float16.size() *
      std::accumulate(
          shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());

  array scales(allocator::malloc(sb_nbytes), shape, float16);
  array biases(allocator::malloc(sb_nbytes), std::move(shape), float16);
  if (tensor.type == GGUF_TYPE_Q4_0) {
    extract_q4_0_data(tensor, weights, scales, biases);
  } else if (tensor.type == GGUF_TYPE_Q4_1) {
    extract_q4_1_data(tensor, weights, scales, biases);
  } else if (tensor.type == GGUF_TYPE_Q8_0) {
    extract_q8_0_data(tensor, weights, scales, biases);
  }

  // A duplicate weight name is rejected like a duplicate scale or bias
  // instead of being dropped silently.
  check_insert(a.emplace(name, std::move(weights)));

  // NOTE(review): this assumes the tensor name ends in ".weight"; the suffix
  // itself is not checked, only its length is removed.
  constexpr std::string_view weight_suffix = ".weight";
  const std::string name_prefix =
      name.substr(0, name.length() - weight_suffix.length());
  check_insert(a.emplace(name_prefix + ".scales", std::move(scales)));
  check_insert(a.emplace(name_prefix + ".biases", std::move(biases)));
}

} // namespace mlx::core


================================================
FILE: mlx/io/load.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cstring>
#include <fstream>
#include <limits>
#include <sstream>

// Used by pread implementation.
#ifdef _WIN32
#include <windows.h>
#endif // _WIN32

#include "mlx/backend/cuda/cuda.h"
#include "mlx/io.h"
#include "mlx/io/load.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

// Adapted from
// https://github.com/angeloskath/supervised-lda/blob/master/include/ldaplusplus/NumpyFormat.hpp

namespace mlx::core {

namespace {

constexpr uint8_t MAGIC[] = {
    0x93,
    0x4e,
    0x55,
    0x4d,
    0x50,
    0x59,
};

// Returns true when the host stores the most significant byte of an integer
// at the lowest address.
inline bool is_big_endian() {
  // Copy out the first byte of a known pattern. Reading it through memcpy is
  // well defined, unlike reading the inactive member of a union.
  const int32_t probe = 0x01234567;
  uint8_t first_byte;
  std::memcpy(&first_byte, &probe, sizeof(first_byte));
  return first_byte == 0x01;
}

// Builds the array protocol typestring of a Dtype: a byte-order character
// ('|' for sizes up to one byte, otherwise '<' or '>' for the host order),
// followed by the kind and the size in bytes.
std::string dtype_to_array_protocol(const Dtype& t) {
  const auto itemsize = size_of(t);
  std::ostringstream typestr;
  if (itemsize <= 1) {
    typestr << "|";
  } else if (is_big_endian()) {
    typestr << ">";
  } else {
    typestr << "<";
  }
  typestr << kindof(t) << static_cast<int>(itemsize);
  return typestr.str();
}

// Parses an array protocol type string into a Dtype.
//
// Accepts a two character kind+size code, optionally preceded by a single
// byte-order character that is ignored here. Throws std::invalid_argument for
// anything that does not name a supported dtype.
Dtype dtype_from_array_protocol(std::string_view t) {
  const auto len = t.length();
  if (len == 2 || len == 3) {
    // Drop the byte-order prefix when there is one.
    const std::string_view code = (len == 3) ? t.substr(1, 2) : t;

    if (code == "V2") {
      return bfloat16;
    }

    const char kind = code[0];
    const uint8_t width = code[1] - '0';

    if (kind == 'b') {
      if (width == 1) {
        return bool_;
      }
    } else if (kind == 'i') {
      switch (width) {
        case 1:
          return int8;
        case 2:
          return int16;
        case 4:
          return int32;
        case 8:
          return int64;
      }
    } else if (kind == 'u') {
      switch (width) {
        case 1:
          return uint8;
        case 2:
          return uint16;
        case 4:
          return uint32;
        case 8:
          return uint64;
      }
    } else if (kind == 'f') {
      switch (width) {
        case 2:
          return float16;
        case 4:
          return float32;
        case 8:
          return float64;
      }
    } else if (kind == 'c') {
      if (width == 8) {
        return complex64;
      }
    }
  }

  throw std::invalid_argument(
      "[from_str] Unsupported array protocol type-string: " + std::string(t));
}

#ifdef _WIN32
// There is no pread on Windows, emulate it with ReadFile.
int64_t pread(int fd, void* buf, uint64_t size, uint64_t offset) {
  HANDLE file = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
  if (file == INVALID_HANDLE_VALUE) {
    return -1;
  }

  OVERLAPPED overlapped = {0};
  overlapped.Offset = offset & 0xFFFFFFFF;
  overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF;

  DWORD bytes_read;
  if (!ReadFile(file, buf, size, &bytes_read, &overlapped)) {
    if (GetLastError() != ERROR_HANDLE_EOF) {
      return -1;
    }
  }

  return bytes_read;
}
#endif

} // namespace

/**
 * Save array to out stream in .npy format.
 *
 * The array is made contiguous (column-major layout is allowed and recorded
 * as fortran_order) and evaluated before it is written. A version 1 header is
 * written when its length fits in 16 bits, otherwise version 2.
 *
 * Throws std::invalid_argument for an empty array and std::runtime_error if
 * the stream is not open.
 */
void save(std::shared_ptr<io::Writer> out_stream, array a) {
  ////////////////////////////////////////////////////////
  // Check array

  a = contiguous(a, true);
  a.eval();

  if (a.nbytes() == 0) {
    throw std::invalid_argument("[save] cannot serialize an empty array");
  }

  ////////////////////////////////////////////////////////
  // Check file
  if (!out_stream->good() || !out_stream->is_open()) {
    throw std::runtime_error("[save] Failed to open " + out_stream->label());
  }

  ////////////////////////////////////////////////////////
  // Prepare header
  std::ostringstream magic_ver_len;
  magic_ver_len.write(reinterpret_cast<const char*>(MAGIC), 6);

  std::string fortran_order = a.flags().col_contiguous ? "True" : "False";
  std::ostringstream header;
  header << "{'descr': '" << dtype_to_array_protocol(a.dtype()) << "',"
         << " 'fortran_order': " << fortran_order << "," << " 'shape': (";
  for (auto i : a.shape()) {
    header << i << ", ";
  }
  header << ")}";

  size_t header_len = static_cast<size_t>(header.tellp());
  bool is_v1 = header_len + 15 < std::numeric_limits<uint16_t>::max();

  // Pad out magic + version + header_len + header + \n to be divisible by 16.
  // The length field is 2 bytes in version 1 and 4 bytes in version 2; the
  // padding is whatever is missing to reach the next multiple of 16.
  const size_t len_field_size = is_v1 ? 2 : 4;
  const size_t unpadded_size = 6 + 2 + len_field_size + header_len + 1;
  const size_t padding = (16 - unpadded_size % 16) % 16;

  header << std::string(padding, ' ') << '\n';

  // The header length is stored little-endian, so swap on big-endian hosts.
  if (is_v1) {
    magic_ver_len << (char)0x01 << (char)0x00;

    uint16_t v1_header_len = header.tellp();
    const char* len_bytes = reinterpret_cast<const char*>(&v1_header_len);

    if (!is_big_endian()) {
      magic_ver_len.write(len_bytes, 2);
    } else {
      magic_ver_len.write(len_bytes + 1, 1);
      magic_ver_len.write(len_bytes, 1);
    }
  } else {
    magic_ver_len << (char)0x02 << (char)0x00;

    uint32_t v2_header_len = header.tellp();
    const char* len_bytes = reinterpret_cast<const char*>(&v2_header_len);

    if (!is_big_endian()) {
      magic_ver_len.write(len_bytes, 4);
    } else {
      magic_ver_len.write(len_bytes + 3, 1);
      magic_ver_len.write(len_bytes + 2, 1);
      magic_ver_len.write(len_bytes + 1, 1);
      magic_ver_len.write(len_bytes, 1);
    }
  }
  ////////////////////////////////////////////////////////
  // Serialize array

  const std::string prefix = magic_ver_len.str();
  const std::string header_str = header.str();
  out_stream->write(prefix.c_str(), prefix.length());
  out_stream->write(header_str.c_str(), header_str.length());
  out_stream->write(a.data<char>(), a.nbytes());
}

/** Save array to file in .npy format */
void save(std::string file, array a) {
  // Add .npy to file name if it is not there
  if (file.length() < 4 || file.substr(file.length() - 4, 4) != ".npy")
    file += ".npy";

  // Serialize array
  save(std::make_shared<io::FileWriter>(std::move(file)), a);
}

/**
 * Load array from reader in .npy format.
 *
 * Only the header is read here; the returned array is backed by a Load
 * primitive that is given the reader, the offset of the data and whether the
 * byte order has to be swapped.
 *
 * Throws std::runtime_error if the reader is not open, if the magic bytes or
 * the format version are not recognised, or if the header is malformed.
 */
array load(std::shared_ptr<io::Reader> in_stream, StreamOrDevice s) {
  ////////////////////////////////////////////////////////
  // Open and check file
  if (!in_stream->good() || !in_stream->is_open()) {
    throw std::runtime_error("[load] Failed to open " + in_stream->label());
  }

  auto stream = cu::is_available() ? to_stream(s) : to_stream(s, Device::cpu);

  ////////////////////////////////////////////////////////
  // Read header and prepare array details

  // Read and check magic
  char read_magic_and_ver[8];
  in_stream->read(read_magic_and_ver, 8);
  if (std::memcmp(read_magic_and_ver, MAGIC, 6) != 0) {
    throw std::runtime_error("[load] Invalid header in " + in_stream->label());
  }

  // Read and check version
  if (read_magic_and_ver[6] != 1 && read_magic_and_ver[6] != 2) {
    throw std::runtime_error(
        "[load] Unsupported npy format version in " + in_stream->label());
  }

  // Read header len and header
  int header_len_size = read_magic_and_ver[6] == 1 ? 2 : 4;
  size_t header_len;

  if (header_len_size == 2) {
    uint16_t v1_header_len;
    in_stream->read(reinterpret_cast<char*>(&v1_header_len), header_len_size);
    header_len = v1_header_len;
  } else {
    uint32_t v2_header_len;
    in_stream->read(reinterpret_cast<char*>(&v2_header_len), header_len_size);
    header_len = v2_header_len;
  }

  // Read the header
  std::string header(header_len, '\0');
  in_stream->read(header.data(), header_len);

  // The fields below are read from fixed positions: the dtype string at
  // [11, 14) and the first letter of fortran_order at 34. Reject headers that
  // are too short for that.
  if (header.size() < 35) {
    throw std::runtime_error("[load] Invalid header in " + in_stream->label());
  }

  // Read data type from header
  std::string dtype_str = header.substr(11, 3);
  bool read_is_big_endian = dtype_str[0] == '>';
  Dtype dtype = dtype_from_array_protocol(dtype_str);

  // Read contiguity order
  bool col_contiguous = header.at(34) == 'T';

  // Read array shape from header
  Shape shape;

  const size_t open_paren = header.find_last_of('(');
  const size_t close_paren = header.find_last_of(')');
  if (open_paren == std::string::npos || close_paren == std::string::npos ||
      close_paren < open_paren) {
    // No shape tuple to parse.
    throw std::runtime_error(
        "[load] Unknown error while parsing header in " + in_stream->label());
  }
  size_t st = open_paren + 1;
  size_t ed = close_paren;
  std::string shape_str = header.substr(st, ed - st);

  while (!shape_str.empty()) {
    // Read current number and get position of comma
    size_t pos;
    int dim = std::stoi(shape_str, &pos);
    shape.push_back(dim);

    // Skip the comma and space and read the next number
    if (pos + 2 <= shape_str.length())
      shape_str = shape_str.substr(pos + 2);
    else {
      // Only a trailing separator may follow the last number
      shape_str = shape_str.substr(pos);
      if (!shape_str.empty() && shape_str != " " && shape_str != ",") {
        throw std::runtime_error(
            "[load] Unknown error while parsing header in " +
            in_stream->label());
      }
      shape_str = "";
    }
  }

  ////////////////////////////////////////////////////////
  // Build primitive

  size_t offset = 8 + header_len_size + header.length();
  bool swap_endianness = read_is_big_endian != is_big_endian();

  // Column-major data is loaded with the reversed shape and transposed
  // afterwards.
  if (col_contiguous) {
    std::reverse(shape.begin(), shape.end());
  }
  auto loaded_array = array(
      shape,
      dtype,
      std::make_shared<Load>(stream, in_stream, offset, swap_endianness),
      std::vector<array>{});
  if (col_contiguous) {
    loaded_array = transpose(loaded_array, s);
  }

  return loaded_array;
}

/** Load array from file in .npy format, reading it through a
 * ParallelFileReader. */
array load(std::string file, StreamOrDevice s) {
  std::shared_ptr<io::Reader> reader =
      std::make_shared<io::ParallelFileReader>(std::move(file));
  return load(std::move(reader), s);
}

namespace io {

// Returns the pool shared through io::thread_pool(). It is a function-local
// static, created on first use with an argument of 4.
ThreadPool& thread_pool() {
  static ThreadPool shared_pool{4};
  return shared_pool;
}

// Returns the pool used for batched reads. It is a separate function-local
// static from io::thread_pool(), also created with an argument of 4.
ThreadPool& ParallelFileReader::thread_pool() {
  static ThreadPool reader_pool{4};
  return reader_pool;
}

// Reads exactly `n` bytes from the current file position into `data`, in
// chunks of at most INT32_MAX bytes. Throws std::runtime_error if a read
// fails or returns no data before `n` bytes have arrived.
void ParallelFileReader::read(char* data, size_t n) {
  const size_t max_chunk = static_cast<size_t>(INT32_MAX);
  for (size_t remaining = n; remaining != 0;) {
    const auto got = ::read(fd_, data, std::min(remaining, max_chunk));
    if (got <= 0) {
      std::ostringstream msg;
      msg << "[read] Unable to read " << remaining << " bytes from file.";
      throw std::runtime_error(msg.str());
    }
    data += got;
    remaining -= got;
  }
}

// Reads exactly `n` bytes starting at `offset` into `data` using pread.
//
// The range is split into batches of `batch_size_` bytes that are queued on
// the reader's thread pool; a final piece smaller than one batch is read on
// the calling thread. Throws std::runtime_error if any part cannot be read.
void ParallelFileReader::read(char* data, size_t n, size_t offset) {
  auto readfn = [fd = fd_](size_t offset, size_t size, char* buffer) -> bool {
    while (size != 0) {
      auto m = pread(fd, buffer, size, offset);
      if (m <= 0) {
        return false;
      }
      // After a short read, continue from where it stopped: the file offset
      // has to move together with the buffer.
      buffer += m;
      size -= m;
      offset += m;
    }
    return true;
  };
  std::vector<std::future<bool>> futs;
  bool ok = true;
  while (n != 0) {
    if (n < batch_size_) {
      ok = readfn(offset, n, data);
      break;
    } else {
      size_t m = batch_size_;
      futs.emplace_back(
          ParallelFileReader::thread_pool().enqueue(readfn, offset, m, data));
      data += m;
      n -= m;
      offset += m;
    }
  }
  // Wait for every queued batch before reporting a failure, so that no worker
  // is still writing into the caller's buffer when the exception propagates.
  for (auto& f : futs) {
    ok = f.get() && ok;
  }
  if (!ok) {
    throw std::runtime_error("[read] Unable to read from file.");
  }
}

} // namespace io

} // namespace mlx::core


================================================
FILE: mlx/io/load.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <memory>
#include <sstream>

#include <fcntl.h>
#ifdef _MSC_VER
#include <io.h>
#else
#include <sys/stat.h>
#include <unistd.h>
#endif

#include "mlx/threadpool.h"

// Strictly we need to operate on files in binary mode (to avoid \r getting
// automatically inserted), but every modern system except for Windows no
// longer differentiates between binary and text files and for them define
// the flag as no-op.
#ifndef O_BINARY
#define O_BINARY 0
#endif

namespace mlx::core {

namespace io {

ThreadPool& thread_pool();

// Abstract source of bytes used when loading arrays. ParallelFileReader in
// this header is a file-backed implementation.
class Reader {
 public:
  // Whether the underlying source was opened successfully.
  virtual bool is_open() const = 0;
  // Whether the source is in a usable state.
  virtual bool good() const = 0;
  // Current position in the source.
  virtual size_t tell() = 0; // tellp is non-const in iostream
  // Moves the current position by `off`, measured from `way`.
  virtual void seek(
      int64_t off,
      std::ios_base::seekdir way = std::ios_base::beg) = 0;
  // Reads `n` bytes into `data` from the current position.
  virtual void read(char* data, size_t n) = 0;
  // Reads `n` bytes into `data` starting at the explicit `offset`.
  virtual void read(char* data, size_t n, size_t offset) = 0;
  // Name of the source, used in error messages.
  virtual std::string label() const = 0;
  virtual ~Reader() = default;
};

// Abstract sink of bytes used when saving arrays. FileWriter in this header
// is a file-backed implementation.
class Writer {
 public:
  // Whether the underlying sink was opened successfully.
  virtual bool is_open() const = 0;
  // Whether the sink is in a usable state.
  virtual bool good() const = 0;
  // Current position in the sink.
  virtual size_t tell() = 0;
  // Moves the current position by `off`, measured from `way`.
  virtual void seek(
      int64_t off,
      std::ios_base::seekdir way = std::ios_base::beg) = 0;
  // Writes `n` bytes from `data` at the current position.
  virtual void write(const char* data, size_t n) = 0;
  // Name of the sink, used in error messages.
  virtual std::string label() const = 0;
  virtual ~Writer() = default;
};

// Reader backed by a file descriptor opened read-only in binary mode.
//
// The positional read(data, n, offset) does not use the descriptor's file
// position; tell(), seek() and read(data, n) do.
class ParallelFileReader : public Reader {
 public:
  explicit ParallelFileReader(std::string file_path)
      : fd_(open(file_path.c_str(), O_RDONLY | O_BINARY)),
        label_(std::move(file_path)) {}

  ~ParallelFileReader() override {
    // A failed open() leaves a negative descriptor; there is nothing to
    // close in that case.
    if (fd_ >= 0) {
      close(fd_);
    }
  }

  bool is_open() const override {
    // Every non-negative descriptor is valid, including 0.
    return fd_ >= 0;
  }

  bool good() const override {
    return is_open();
  }

  size_t tell() override {
    return lseek(fd_, 0, SEEK_CUR);
  }

  // Warning: do not use this function from multiple threads as
  // it advances the file descriptor
  void seek(int64_t off, std::ios_base::seekdir way = std::ios_base::beg)
      override {
    // Any `way` other than beg is treated as relative to the current position.
    if (way == std::ios_base::beg) {
      lseek(fd_, off, SEEK_SET);
    } else {
      lseek(fd_, off, SEEK_CUR);
    }
  }

  // Warning: do not use this function from multiple threads as
  // it advances the file descriptor
  void read(char* data, size_t n) override;

  void read(char* data, size_t n, size_t offset) override;

  std::string label() const override {
    return "file " + label_;
  }

 private:
  // Number of bytes handed to the thread pool per task (32 MiB).
  static constexpr size_t batch_size_ = 1 << 25;
  static ThreadPool& thread_pool();
  int fd_;
  std::string label_;
};

// Writer backed by a file descriptor obtained from open().
//
// The object owns the descriptor and closes it on destruction. A
// default-constructed (or moved-from) writer holds no descriptor and
// reports is_open() == false.
class FileWriter : public Writer {
 public:
  explicit FileWriter() {}

  // Creates/truncates `file_path` for writing (mode 0644). Failure is not
  // reported here; callers are expected to check is_open() / good().
  explicit FileWriter(std::string file_path)
      : fd_(open(
            file_path.c_str(),
            O_CREAT | O_WRONLY | O_TRUNC | O_BINARY,
            0644)),
        label_(std::move(file_path)) {}

  FileWriter(const FileWriter&) = delete;
  FileWriter& operator=(const FileWriter&) = delete;

  // Takes over the descriptor and label of `other`, leaving it closed.
  FileWriter(FileWriter&& other) noexcept {
    std::swap(fd_, other.fd_);
    std::swap(label_, other.label_);
  }

  ~FileWriter() override {
    // -1 marks "no descriptor" (never opened, open failed, or moved from).
    if (fd_ >= 0) {
      close(fd_);
    }
  }

  bool is_open() const override {
    return fd_ >= 0;
  }

  bool good() const override {
    return is_open();
  }

  size_t tell() override {
    return lseek(fd_, 0, SEEK_CUR);
  }

  void seek(int64_t off, std::ios_base::seekdir way = std::ios_base::beg)
      override {
    // Map the iostream seek direction onto the matching lseek origin.
    int whence = SEEK_CUR;
    if (way == std::ios_base::beg) {
      whence = SEEK_SET;
    } else if (way == std::ios_base::end) {
      whence = SEEK_END;
    }
    lseek(fd_, off, whence);
  }

  // Writes all `n` bytes, looping over partial writes. Each call to
  // ::write is capped at INT32_MAX bytes. Throws std::runtime_error if a
  // write makes no progress or fails.
  void write(const char* data, size_t n) override {
    while (n != 0) {
      auto m = ::write(fd_, data, std::min(n, static_cast<size_t>(INT32_MAX)));
      if (m <= 0) {
        std::ostringstream msg;
        msg << "[write] Unable to write " << n << " bytes to file.";
        throw std::runtime_error(msg.str());
      }
      data += m;
      n -= m;
    }
  }

  std::string label() const override {
    return "file " + label_;
  }

 private:
  // -1 is the "no descriptor" sentinel; 0 is a valid descriptor value.
  int fd_{-1};
  std::string label_;
};

} // namespace io
} // namespace mlx::core


================================================
FILE: mlx/io/no_gguf.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/io.h"

namespace mlx::core {

/** Stub used when GGUF support is compiled out: always throws. */
GGUFLoad load_gguf(const std::string& /* file */, StreamOrDevice /* s */) {
  const std::string message =
      "[load_gguf] Compile with MLX_BUILD_GGUF=ON to enable GGUF support.";
  throw std::runtime_error(message);
}

/** Stub used when GGUF support is compiled out: always throws. */
void save_gguf(
    std::string /* file */,
    std::unordered_map<std::string, array> /* array_map */,
    std::unordered_map<std::string, GGUFMetaData> /* meta_data */) {
  const std::string message =
      "[save_gguf] Compile with MLX_BUILD_GGUF=ON to enable GGUF support.";
  throw std::runtime_error(message);
}

} // namespace mlx::core


================================================
FILE: mlx/io/no_safetensors.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "mlx/io.h"

namespace mlx::core {

/** Stub used when safetensors support is compiled out: always throws. */
SafetensorsLoad load_safetensors(
    std::shared_ptr<io::Reader> /* in_stream */,
    StreamOrDevice /* s */) {
  const std::string message =
      "[load_safetensors] Compile with MLX_BUILD_SAFETENSORS=ON "
      "to enable safetensors support.";
  throw std::runtime_error(message);
}

/** Stub used when safetensors support is compiled out: always throws. */
SafetensorsLoad load_safetensors(
    const std::string& /* file */,
    StreamOrDevice /* s */) {
  const std::string message =
      "[load_safetensors] Compile with MLX_BUILD_SAFETENSORS=ON "
      "to enable safetensors support.";
  throw std::runtime_error(message);
}

/** Stub used when safetensors support is compiled out: always throws. */
void save_safetensors(
    std::shared_ptr<io::Writer> /* out_stream */,
    std::unordered_map<std::string, array> /* arrays */,
    std::unordered_map<std::string, std::string> /* metadata */) {
  const std::string message =
      "[save_safetensors] Compile with MLX_BUILD_SAFETENSORS=ON "
      "to enable safetensors support.";
  throw std::runtime_error(message);
}

/** Stub used when safetensors support is compiled out: always throws. */
void save_safetensors(
    std::string /* file */,
    std::unordered_map<std::string, array> /* arrays */,
    std::unordered_map<std::string, std::string> /* metadata */) {
  const std::string message =
      "[save_safetensors] Compile with MLX_BUILD_SAFETENSORS=ON "
      "to enable safetensors support.";
  throw std::runtime_error(message);
}

} // namespace mlx::core


================================================
FILE: mlx/io/safetensors.cpp
================================================
// Copyright © 2023 Apple Inc.
//
#include <json.hpp>
#include <memory>
#include <stack>

#include "mlx/backend/cuda/cuda.h"
#include "mlx/io.h"
#include "mlx/io/load.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"

using json = nlohmann::json;

// Dtype tags as they appear in the "dtype" field of a safetensors header.
// dtype_to_safetensor_str / dtype_from_safetensor_str translate between
// these strings and MLX dtypes.

// Floating point
#define ST_F16 "F16"
#define ST_BF16 "BF16"
#define ST_F32 "F32"

// Boolean and integers
#define ST_BOOL "BOOL"
#define ST_I8 "I8"
#define ST_I16 "I16"
#define ST_I32 "I32"
#define ST_I64 "I64"
#define ST_U8 "U8"
#define ST_U16 "U16"
#define ST_U32 "U32"
#define ST_U64 "U64"
// Only recognized when loading, where it is mapped to uint8; never emitted
// by dtype_to_safetensor_str.
#define ST_F8_E4M3 "F8_E4M3"

// Note: Complex numbers aren't in the spec yet so this could change -
// https://github.com/huggingface/safetensors/issues/389
#define ST_C64 "C64"

namespace mlx::core {

/**
 * Map an MLX dtype to its safetensors tag.
 *
 * Throws std::runtime_error for dtypes that have no tag.
 */
std::string dtype_to_safetensor_str(Dtype t) {
  const char* tag = nullptr;
  switch (t) {
    case float32:
      tag = ST_F32;
      break;
    case bfloat16:
      tag = ST_BF16;
      break;
    case float16:
      tag = ST_F16;
      break;
    case int64:
      tag = ST_I64;
      break;
    case int32:
      tag = ST_I32;
      break;
    case int16:
      tag = ST_I16;
      break;
    case int8:
      tag = ST_I8;
      break;
    case uint64:
      tag = ST_U64;
      break;
    case uint32:
      tag = ST_U32;
      break;
    case uint16:
      tag = ST_U16;
      break;
    case uint8:
      tag = ST_U8;
      break;
    case bool_:
      tag = ST_BOOL;
      break;
    case complex64:
      tag = ST_C64;
      break;
    default:
      throw std::runtime_error("[save_safetensors] received invalid dtype.");
  }
  return tag;
}

/**
 * Map a safetensors dtype tag to an MLX dtype.
 *
 * "F8_E4M3" is loaded as uint8. Unknown tags throw std::runtime_error.
 */
Dtype dtype_from_safetensor_str(std::string_view str) {
  // Floating point
  if (str == ST_F32) {
    return float32;
  }
  if (str == ST_F16) {
    return float16;
  }
  if (str == ST_BF16) {
    return bfloat16;
  }
  // Signed integers
  if (str == ST_I64) {
    return int64;
  }
  if (str == ST_I32) {
    return int32;
  }
  if (str == ST_I16) {
    return int16;
  }
  if (str == ST_I8) {
    return int8;
  }
  // Unsigned integers
  if (str == ST_U64) {
    return uint64;
  }
  if (str == ST_U32) {
    return uint32;
  }
  if (str == ST_U16) {
    return uint16;
  }
  if (str == ST_U8) {
    return uint8;
  }
  // Everything else
  if (str == ST_BOOL) {
    return bool_;
  }
  if (str == ST_C64) {
    return complex64;
  }
  if (str == ST_F8_E4M3) {
    return uint8;
  }
  throw std::runtime_error(
      "[safetensor] unsupported dtype " + std::string(str));
}

/** Load array from reader in safetensor format */
/**
 * Load array from reader in safetensor format
 *
 * Reads the 8-byte header length and the JSON header eagerly; each tensor
 * is returned as an array backed by a Load primitive that records the
 * reader and the tensor's byte offset.
 *
 * Returns the (name -> array) map together with the string entries found
 * under "__metadata__".
 *
 * Throws std::runtime_error if the reader is not usable, the header length
 * is zero or too large, the top-level JSON value is not an object, or a
 * dtype tag is unsupported. Errors raised by the JSON library propagate.
 */
SafetensorsLoad load_safetensors(
    std::shared_ptr<io::Reader> in_stream,
    StreamOrDevice s) {
  ////////////////////////////////////////////////////////
  // Open and check file
  if (!in_stream->good() || !in_stream->is_open()) {
    throw std::runtime_error(
        "[load_safetensors] Failed to open " + in_stream->label());
  }

  auto stream = cu::is_available() ? to_stream(s) : to_stream(s, Device::cpu);

  uint64_t jsonHeaderLength = 0;
  // This is the same limit as in the original Rust Safetensors code.
  constexpr uint64_t kMaxJsonHeaderLength = 100000000;
  in_stream->read(reinterpret_cast<char*>(&jsonHeaderLength), 8);
  // The length is unsigned, so only zero needs rejecting on the low side.
  if (jsonHeaderLength == 0 || jsonHeaderLength >= kMaxJsonHeaderLength) {
    throw std::runtime_error(
        "[load_safetensors] Invalid json header length " + in_stream->label());
  }
  // Load the json metadata
  auto rawJson = std::make_unique<char[]>(jsonHeaderLength);
  in_stream->read(rawJson.get(), jsonHeaderLength);
  auto metadata = json::parse(rawJson.get(), rawJson.get() + jsonHeaderLength);
  // Should always be an object on the top-level
  if (!metadata.is_object()) {
    throw std::runtime_error(
        "[load_safetensors] Invalid json metadata " + in_stream->label());
  }
  // Tensor offsets in the header are relative to the end of the header.
  size_t offset = jsonHeaderLength + 8;
  // Load the arrays using metadata
  std::unordered_map<std::string, array> res;
  std::unordered_map<std::string, std::string> metadata_map;
  for (const auto& item : metadata.items()) {
    if (item.key() == "__metadata__") {
      for (const auto& meta_item : item.value().items()) {
        metadata_map.insert({meta_item.key(), meta_item.value()});
      }
      continue;
    }
    const std::string& dtype = item.value().at("dtype");
    const Shape& shape = item.value().at("shape");
    const std::vector<size_t>& data_offsets = item.value().at("data_offsets");
    Dtype type = dtype_from_safetensor_str(dtype);
    res.insert(
        {item.key(),
         array(
             shape,
             type,
             std::make_shared<Load>(
                 stream, in_stream, offset + data_offsets.at(0), false),
             std::vector<array>{})});
  }
  // Hand the maps over instead of copying them into the returned pair.
  return {std::move(res), std::move(metadata_map)};
}

/** Load a safetensors file from `file` through a ParallelFileReader. */
SafetensorsLoad load_safetensors(const std::string& file, StreamOrDevice s) {
  auto reader = std::make_shared<io::ParallelFileReader>(file);
  return load_safetensors(std::move(reader), s);
}

/**
 * Serialize `a` (plus optional string `metadata`) to `out_stream` in
 * safetensors format: 8-byte header length, JSON header, then the raw
 * bytes of every array in the order they were listed in the header.
 *
 * Throws std::runtime_error if the writer is unusable and
 * std::invalid_argument if any array has zero bytes.
 */
void save_safetensors(
    std::shared_ptr<io::Writer> out_stream,
    std::unordered_map<std::string, array> a,
    std::unordered_map<std::string, std::string> metadata /* = {} */) {
  // The writer must be usable before anything is evaluated.
  const bool writable = out_stream->good() && out_stream->is_open();
  if (!writable) {
    throw std::runtime_error(
        "[save_safetensors] Failed to open " + out_stream->label());
  }

  // User metadata goes under the reserved "__metadata__" key.
  json user_meta;
  for (const auto& entry : metadata) {
    user_meta[entry.first] = entry.second;
  }
  json header_json;
  header_json["__metadata__"] = user_meta;

  // Make every array contiguous and evaluate them all in one go.
  {
    std::vector<array> pending;
    pending.reserve(a.size());
    for (auto& entry : a) {
      entry.second = contiguous(entry.second);
      pending.push_back(entry.second);
    }
    eval(std::move(pending));
  }

  // Describe each tensor; offsets are cumulative in map iteration order.
  size_t cursor = 0;
  for (auto& entry : a) {
    const auto& name = entry.first;
    auto& tensor = entry.second;
    const size_t nbytes = tensor.nbytes();
    if (nbytes == 0) {
      throw std::invalid_argument(
          "[save_safetensors] cannot serialize an empty array key: " + name);
    }

    json description;
    description["dtype"] = dtype_to_safetensor_str(tensor.dtype());
    description["shape"] = tensor.shape();
    description["data_offsets"] = std::vector<size_t>{cursor, cursor + nbytes};
    header_json[name] = description;
    cursor += nbytes;
  }

  // Emit the length prefix, the header, then the payloads.
  const std::string header_text = header_json.dump();
  uint64_t header_len = header_text.length();
  out_stream->write(reinterpret_cast<char*>(&header_len), 8);
  out_stream->write(header_text.c_str(), header_len);
  for (auto& entry : a) {
    out_stream->write(entry.second.data<char>(), entry.second.nbytes());
  }
}

/**
 * Save `a` and `metadata` to `file`, appending ".safetensors" to the name
 * when it does not already end with it.
 */
void save_safetensors(
    std::string file,
    std::unordered_map<std::string, array> a,
    std::unordered_map<std::string, std::string> metadata /* = {} */) {
  // Make sure the file name carries the expected extension.
  const std::string suffix = ".safetensors";
  const bool has_suffix = file.length() >= suffix.length() &&
      file.compare(
          file.length() - suffix.length(), suffix.length(), suffix) == 0;
  if (!has_suffix) {
    file += suffix;
  }

  // Delegate to the writer-based overload.
  auto writer = std::make_shared<io::FileWriter>(std::move(file));
  save_safetensors(std::move(writer), a, metadata);
}

} // namespace mlx::core


================================================
FILE: mlx/io.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <unordered_map>
#include <variant>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/io/load.h"
#include "mlx/stream.h"
#include "mlx/utils.h"

namespace mlx::core {
/** Value type of a single GGUF metadata entry. */
using GGUFMetaData =
    std::variant<std::monostate, array, std::string, std::vector<std::string>>;
/** Result of load_gguf: (name -> array, name -> metadata entry). */
using GGUFLoad = std::pair<
    std::unordered_map<std::string, array>,
    std::unordered_map<std::string, GGUFMetaData>>;
/** Result of load_safetensors: (name -> array, name -> metadata string). */
using SafetensorsLoad = std::pair<
    std::unordered_map<std::string, array>,
    std::unordered_map<std::string, std::string>>;

/** Save array to out stream in .npy format */
MLX_API void save(std::shared_ptr<io::Writer> out_stream, array a);

/** Save array to file in .npy format */
MLX_API void save(std::string file, array a);

/** Load array from reader in .npy format */
MLX_API array
load(std::shared_ptr<io::Reader> in_stream, StreamOrDevice s = {});

/** Load array from file in .npy format */
MLX_API array load(std::string file, StreamOrDevice s = {});

/** Load array map from .safetensors file format */
MLX_API SafetensorsLoad
load_safetensors(std::shared_ptr<io::Reader> in_stream, StreamOrDevice s = {});
MLX_API SafetensorsLoad
load_safetensors(const std::string& file, StreamOrDevice s = {});

/**
 * Save array map (and optional string metadata) in .safetensors format.
 *
 * NOTE(review): the writer parameter of the first overload is named
 * `in_stream` here although it is the output destination.
 */
MLX_API void save_safetensors(
    std::shared_ptr<io::Writer> in_stream,
    std::unordered_map<std::string, array>,
    std::unordered_map<std::string, std::string> metadata = {});
MLX_API void save_safetensors(
    std::string file,
    std::unordered_map<std::string, array>,
    std::unordered_map<std::string, std::string> metadata = {});

/** Load array map and metadata from .gguf file format */

MLX_API GGUFLoad load_gguf(const std::string& file, StreamOrDevice s = {});

/** Save array map and metadata to file in .gguf format */
MLX_API void save_gguf(
    std::string file,
    std::unordered_map<std::string, array> array_map,
    std::unordered_map<std::string, GGUFMetaData> meta_data = {});

} // namespace mlx::core


================================================
FILE: mlx/linalg.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <numeric>
#include <ostream>
#include <vector>

#include "mlx/linalg.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core::linalg {

// Throws std::invalid_argument (prefixed with `prefix`) when `s` resolves
// to a GPU stream.
void check_cpu_stream(const StreamOrDevice& s, const std::string& prefix) {
  const bool on_gpu = to_stream(s).device == Device::gpu;
  if (!on_gpu) {
    return;
  }
  throw std::invalid_argument(
      prefix +
      " This op is not yet supported on the GPU. "
      "Explicitly pass a CPU stream to run it.");
}
// Throws std::invalid_argument (prefixed with `prefix`) unless `dtype` is
// float32 or float64.
void check_float(Dtype dtype, const std::string& prefix) {
  if (dtype == float32 || dtype == float64) {
    return;
  }
  std::ostringstream err;
  err << prefix << " Arrays must have type float32 or float64. "
      << "Received array with type " << dtype << ".";
  throw std::invalid_argument(err.str());
}

// Throws std::invalid_argument (prefixed with `prefix`) unless `dtype` is
// float32, float64 or complex64.
void check_float_or_complex(Dtype dtype, const std::string& prefix) {
  if (dtype == float32 || dtype == float64 || dtype == complex64) {
    return;
  }
  std::ostringstream err;
  err << prefix << " Arrays must have type float32, float64 or complex64. "
      << "Received array with type " << dtype << ".";
  throw std::invalid_argument(err.str());
}

// Inexact dtypes are returned unchanged; everything else is promoted
// against float32.
Dtype at_least_float(const Dtype& d) {
  if (issubdtype(d, inexact)) {
    return d;
  }
  return promote_types(d, float32);
}

// Square root of the sum of squared magnitudes of `a` over `axis`.
//
// For complex inputs the magnitude is taken first. Every op is issued on
// `s`; in particular the product uses multiply(..., s) rather than
// operator*, which has no way to receive the stream.
inline array l2_norm(
    const array& a,
    const std::vector<int>& axis,
    bool keepdims,
    StreamOrDevice s) {
  if (issubdtype(a.dtype(), complexfloating)) {
    // Build |a| once and reuse it for both factors.
    auto magnitude = abs(a, s);
    return sqrt(
        sum(multiply(magnitude, magnitude, s), axis, keepdims, s), s);
  }
  return sqrt(sum(square(a, s), axis, keepdims, s), s);
}

// Vector norm of order `ord` over `axis`.
//
// 0 counts non-zero entries, 1 sums magnitudes, 2 is the L2 norm, +/-inf
// take the max/min magnitude; any other order uses the general
// (sum |a|^ord)^(1/ord) form.
inline array vector_norm(
    const array& a,
    const double ord,
    const std::vector<int>& axis,
    bool keepdims,
    StreamOrDevice s) {
  constexpr double kInf = std::numeric_limits<double>::infinity();
  const auto out_type = at_least_float(a.dtype());

  if (ord == 2.0) {
    return l2_norm(a, axis, keepdims, s);
  }
  if (ord == 0.0) {
    auto nonzero = not_equal(a, array(0), s);
    return astype(sum(nonzero, axis, keepdims, s), out_type, s);
  }
  if (ord == 1.0) {
    return astype(sum(abs(a, s), axis, keepdims, s), out_type, s);
  }
  if (ord == kInf) {
    return astype(max(abs(a, s), axis, keepdims, s), out_type, s);
  }
  if (ord == -kInf) {
    return astype(min(abs(a, s), axis, keepdims, s), out_type, s);
  }

  // General p-norm.
  auto raised = power(abs(a, s), array(ord, out_type), s);
  auto total = sum(raised, axis, keepdims, s);
  return power(total, array(1.0 / ord, out_type), s);
}

// Matrix norm of numeric order `ord` over the two axes in `axis`
// (axis[0] is the row axis, axis[1] the column axis).
//
// Supported orders: +/-1 (max/min absolute column sum), +/-inf (max/min
// absolute row sum) and +/-2 (largest/smallest singular value).
//
// Both axes are normalized to non-negative indices up front. The second
// reduction has to account for the axis removed by the first one when
// keepdims is false, and that adjustment is only correct when both axes
// are expressed in the same (non-negative) form.
//
// Throws std::invalid_argument for an out-of-range axis or an unsupported
// order.
inline array matrix_norm(
    const array& a,
    const double ord,
    const std::vector<int>& axis,
    bool keepdims,
    StreamOrDevice s) {
  auto dtype = at_least_float(a.dtype());

  const int ndim = static_cast<int>(a.ndim());
  auto normalize_axis = [ndim](int ax) {
    const int wrapped = ax < 0 ? ax + ndim : ax;
    if (wrapped < 0 || wrapped >= ndim) {
      std::ostringstream msg;
      msg << "[linalg::norm] Invalid axis " << ax << " for array with "
          << ndim << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    return wrapped;
  };
  const int row_axis = normalize_axis(axis[0]);
  const int col_axis = normalize_axis(axis[1]);

  // Index of `other` after `reduced` has been summed away: it shifts down
  // by one only if the reduced axis was dropped and sat in front of it.
  auto remaining_axis = [keepdims](int reduced, int other) {
    return (!keepdims && other > reduced) ? other - 1 : other;
  };

  if (ord == -1.0) {
    return astype(
        min(sum(abs(a, s), row_axis, keepdims, s),
            remaining_axis(row_axis, col_axis),
            keepdims,
            s),
        dtype,
        s);
  } else if (ord == 1.0) {
    return astype(
        max(sum(abs(a, s), row_axis, keepdims, s),
            remaining_axis(row_axis, col_axis),
            keepdims,
            s),
        dtype,
        s);
  } else if (ord == std::numeric_limits<double>::infinity()) {
    return astype(
        max(sum(abs(a, s), col_axis, keepdims, s),
            remaining_axis(col_axis, row_axis),
            keepdims,
            s),
        dtype,
        s);
  } else if (ord == -std::numeric_limits<double>::infinity()) {
    return astype(
        min(sum(abs(a, s), col_axis, keepdims, s),
            remaining_axis(col_axis, row_axis),
            keepdims,
            s),
        dtype,
        s);
  } else if (ord == 2.0 || ord == -2.0) {
    // Move the matrix axes to the end, then reduce the singular values.
    auto a_matrix = (row_axis > col_axis)
        ? moveaxis(moveaxis(a, row_axis, -1, s), col_axis, -1, s)
        : moveaxis(moveaxis(a, col_axis, -1, s), row_axis, -2, s);
    a_matrix = svd(a_matrix, false, s).at(0);
    a_matrix = (ord == 2.0) ? max(a_matrix, -1, false, s)
                            : min(a_matrix, -1, false, s);
    if (keepdims) {
      std::vector<int> sorted_axes = (row_axis < col_axis)
          ? std::vector<int>{row_axis, col_axis}
          : std::vector<int>{col_axis, row_axis};
      a_matrix = expand_dims(a_matrix, sorted_axes, s);
    }
    return astype(a_matrix, dtype, s);
  } else {
    std::ostringstream msg;
    msg << "[linalg::norm] Invalid ord " << ord << " for matrix norm.";
    throw std::invalid_argument(msg.str());
  }
}

// Matrix norm selected by name: "f"/"fro" (Frobenius) or "nuc" (nuclear,
// the sum of singular values). Any other name throws
// std::invalid_argument.
inline array matrix_norm(
    const array& a,
    const std::string& ord,
    const std::vector<int>& axis,
    bool keepdims,
    StreamOrDevice s) {
  if (ord == "f" || ord == "fro") {
    return l2_norm(a, axis, keepdims, s);
  }
  if (ord != "nuc") {
    std::ostringstream err;
    err << "[linalg::norm] Invalid ord value '" << ord << "' for matrix norm.";
    throw std::invalid_argument(err.str());
  }

  // Nuclear norm: bring the matrix axes to the end and sum singular values.
  const int rows = (axis[0] < 0) ? axis[0] + a.ndim() : axis[0];
  const int cols = (axis[1] < 0) ? axis[1] + a.ndim() : axis[1];
  array matrix = (rows > cols)
      ? moveaxis(moveaxis(a, rows, -1, s), cols, -1, s)
      : moveaxis(moveaxis(a, cols, -1, s), rows, -2, s);
  array result = sum(svd(matrix, false, s).at(0), -1, false, s);
  if (!keepdims) {
    return result;
  }
  // Re-insert the reduced axes in ascending order.
  std::vector<int> restored{std::min(rows, cols), std::max(rows, cols)};
  return expand_dims(result, restored, s);
}

// Default (L2 / Frobenius) norm. Without `axis` the input is flattened and
// reduced over its single axis; otherwise at most two axes are accepted.
array norm(
    const array& a,
    const std::optional<std::vector<int>>& axis /* = std::nullopt */,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  if (axis.has_value()) {
    if (axis->size() > 2) {
      throw std::invalid_argument(
          "[linalg::norm] Received too many axes for norm.");
    }
    return l2_norm(a, *axis, keepdims, s);
  }
  return norm(flatten(a, s), std::vector<int>{0}, keepdims, s);
}

// Norm of numeric order `ord`. One axis selects a vector norm, two axes a
// matrix norm; without `axis` every axis of `a` is used. Any other axis
// count throws std::invalid_argument.
array norm(
    const array& a,
    const double ord,
    const std::optional<std::vector<int>>& axis /* = std::nullopt */,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  std::vector<int> axes;
  if (axis.has_value()) {
    axes = *axis;
  } else {
    axes.resize(a.ndim());
    std::iota(axes.begin(), axes.end(), 0);
  }

  switch (axes.size()) {
    case 1:
      return vector_norm(a, ord, axes, keepdims, s);
    case 2:
      return matrix_norm(a, ord, axes, keepdims, s);
    default:
      throw std::invalid_argument(
          "[linalg::norm] Received too many axes for norm.");
  }
}

// Named matrix norm. Exactly two axes are required; without `axis` every
// axis of `a` is used, so `a` must then be 2-D.
array norm(
    const array& a,
    const std::string& ord,
    const std::optional<std::vector<int>>& axis /* = std::nullopt */,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  std::vector<int> axes;
  if (axis.has_value()) {
    axes = *axis;
  } else {
    axes.resize(a.ndim());
    std::iota(axes.begin(), axes.end(), 0);
  }

  if (axes.size() == 2) {
    return matrix_norm(a, ord, axes, keepdims, s);
  }
  std::ostringstream err;
  err << "[linalg::norm] Norm '" << ord << "' only supported for matrices,"
      << " but received " << axes.size() << " axis/axes.";
  throw std::invalid_argument(err.str());
}

// QR factorization via the QRF primitive. For an input whose last two
// dimensions are (m, n), Q is (..., m, k) and R is (..., k, n) with
// k = min(m, n). CPU streams and float32/float64 only.
std::pair<array, array> qr(const array& a, StreamOrDevice s /* = {} */) {
  check_cpu_stream(s, "[linalg::qr]");
  check_float(a.dtype(), "[linalg::qr]");

  if (a.ndim() < 2) {
    std::ostringstream err;
    err << "[linalg::qr] Arrays must have >= 2 dimensions. Received array "
           "with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const int inner = std::min(a.shape(-2), a.shape(-1));

  auto shape_q = a.shape();
  shape_q.back() = inner;

  auto shape_r = a.shape();
  shape_r[shape_r.size() - 2] = inner;

  auto factors = array::make_arrays(
      {std::move(shape_q), std::move(shape_r)},
      {a.dtype(), a.dtype()},
      std::make_shared<QRF>(to_stream(s)),
      {astype(a, a.dtype(), s)});
  return std::make_pair(factors[0], factors[1]);
}

std::vector<array>
svd(const array& a, bool compute_uv, StreamOrDevice s /* = {} */) {
  check_cpu_stream(s, "[linalg::svd]");
  check_float_or_complex(a.dtype(), "[linalg::svd]");

  if (a.ndim() < 2) {
    std::ostringstream msg;
    msg << "[linalg::svd] Input array must have >= 2 dimensions. Received array "
           "with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  const auto m = a.shape(-2);
  const auto n = a.shape(-1);
  const auto rank = a.ndim();

  auto s_shape = a.shape();
  s_shape.pop_back();
  s_shape[rank - 2] = std::min(m, n);

  auto s_dtype = a.dtype() == complex64 ? float32 : a.dtype();

  if (!compute_uv) {
    return {array(
        std::move(s_shape),
        s_dtype,
        std::make_shared<SVD>(to_stream(s), compute_uv),
        {a})};
  }

  auto u_shape = a.shape();
  u_shape[rank - 2] = m;
  u_shape[rank - 1] = m;

  auto vt_shape = a.shape();
  vt_shape[rank - 2] = n;
  vt_shape[rank - 1] = n;

  return array::make_arrays(
      {u_shape, s_shape, vt_shape},
      {a.dtype(), s_dtype, a.dtype()},
      std::make_shared<SVD>(to_stream(s), compute_uv),
      {a});
}

// Shared implementation of inv / tri_inv: validates the input and builds
// an Inverse primitive. `tri` and `upper` are forwarded to the primitive.
array inv_impl(const array& a, bool tri, bool upper, StreamOrDevice s) {
  check_cpu_stream(s, "[linalg::inv]");
  check_float(a.dtype(), "[linalg::inv]");

  if (a.ndim() < 2) {
    std::ostringstream err;
    err << "[linalg::inv] Arrays must have >= 2 dimensions. Received array "
           "with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const bool is_square = a.shape(-1) == a.shape(-2);
  if (!is_square) {
    throw std::invalid_argument(
        "[linalg::inv] Inverses are only defined for square matrices.");
  }

  auto primitive = std::make_shared<Inverse>(to_stream(s), tri, upper);
  return array(a.shape(), a.dtype(), std::move(primitive), {a});
}

// General matrix inverse (non-triangular path of inv_impl).
array inv(const array& a, StreamOrDevice s /* = {} */) {
  constexpr bool kTriangular = false;
  constexpr bool kUpper = true;
  return inv_impl(a, kTriangular, kUpper, s);
}

// Inverse of a triangular matrix; `upper` selects which triangle is used.
array tri_inv(
    const array& a,
    bool upper /* = false */,
    StreamOrDevice s /* = {} */) {
  constexpr bool kTriangular = true;
  return inv_impl(a, kTriangular, upper, s);
}

// Cholesky factorization via the Cholesky primitive. Requires a CPU
// stream, float32/float64 input and square trailing dimensions.
array cholesky(
    const array& a,
    bool upper /* = false */,
    StreamOrDevice s /* = {} */) {
  check_cpu_stream(s, "[linalg::cholesky]");
  check_float(a.dtype(), "[linalg::cholesky]");

  if (a.ndim() < 2) {
    std::ostringstream err;
    err << "[linalg::cholesky] Arrays must have >= 2 dimensions. Received array "
           "with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const bool is_square = a.shape(-1) == a.shape(-2);
  if (!is_square) {
    throw std::invalid_argument(
        "[linalg::cholesky] Cholesky decomposition is only defined for square "
        "matrices.");
  }

  auto primitive = std::make_shared<Cholesky>(to_stream(s), upper);
  return array(a.shape(), a.dtype(), std::move(primitive), {a});
}

// Pseudo-inverse computed from the SVD: singular values at or below
// 10 * max(m, n) * eps * max(S) are treated as zero, the rest are
// inverted, and the factors are recombined.
array pinv(const array& a, StreamOrDevice s /* = {} */) {
  check_cpu_stream(s, "[linalg::pinv]");
  check_float(a.dtype(), "[linalg::pinv]");

  if (a.ndim() < 2) {
    std::ostringstream err;
    err << "[linalg::pinv] Arrays must have >= 2 dimensions. Received array "
        << "with " << a.ndim() << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const int rows = a.shape(-2);
  const int cols = a.shape(-1);
  const int inner = std::min(rows, cols);

  auto factors = linalg::svd(a, true, s);
  array left = factors[0];
  array sigma = factors[1];
  array right = factors[2];

  const int row_dim = a.ndim() - 2;
  const int col_dim = a.ndim() - 1;
  Shape begin(a.ndim(), 0);
  auto end = a.shape();

  // Keep the first `inner` columns of the left factor, then transpose.
  end[row_dim] = rows;
  end[col_dim] = inner;
  left = swapaxes(slice(left, begin, end, s), -1, -2, s);

  // Keep the first `inner` rows of the right factor, then transpose.
  end[row_dim] = inner;
  end[col_dim] = cols;
  right = swapaxes(slice(right, begin, end, s), -1, -2, s);

  // Give the singular values a broadcastable row shape.
  sigma = expand_dims(sigma, -2, s);

  auto rcond = 10. * std::max(rows, cols) * finfo(a.dtype()).eps;
  auto threshold =
      multiply(array(rcond, a.dtype()), max(sigma, -1, true, s), s);
  auto inv_sigma = where(
      greater(sigma, threshold, s),
      reciprocal(sigma, s),
      array(0.0f, a.dtype()),
      s);

  return matmul(multiply(right, inv_sigma, s), left, s);
}

// Inverse of the matrix whose Cholesky factor is `L`, built from the
// triangular inverse of `L` and its transpose.
array cholesky_inv(
    const array& L,
    bool upper /* = false */,
    StreamOrDevice s /* = {} */) {
  check_cpu_stream(s, "[linalg::cholesky_inv]");
  check_float(L.dtype(), "[linalg::cholesky_inv]");

  if (L.ndim() < 2) {
    std::ostringstream err;
    err << "[linalg::cholesky_inv] Arrays must have >= 2 dimensions. Received array "
           "with "
        << L.ndim() << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const bool is_square = L.shape(-1) == L.shape(-2);
  if (!is_square) {
    throw std::invalid_argument(
        "[linalg::cholesky_inv] Cholesky inverse is only defined for square "
        "matrices.");
  }

  array factor_inv = tri_inv(L, upper, s);
  array factor_inv_t = swapaxes(factor_inv, -1, -2, s);
  return upper ? matmul(factor_inv, factor_inv_t, s)
               : matmul(factor_inv_t, factor_inv, s);
}

// Cross product of `a` and `b` along `axis`.
//
// The chosen axis must have size 2 or 3 in both inputs. When both inputs
// have size 2 along the axis, the first two output components are zeros.
// Inputs are promoted to a common dtype and broadcast against each other
// on the other axes.
//
// Throws std::invalid_argument if `axis` is out of range for either input
// or the axis size is not 2 or 3.
array cross(
    const array& a,
    const array& b,
    int axis /* = -1 */,
    StreamOrDevice s /* = {} */) {
  auto check_ax = [axis](const array& arr) {
    // Compare in signed arithmetic: adding `axis` to the unsigned ndim()
    // wraps around and can never test negative.
    const int ndim = static_cast<int>(arr.ndim());
    if (axis >= ndim || axis < -ndim) {
      std::ostringstream msg;
      msg << "[linalg::cross] axis " << axis << " invalid for array with "
          << arr.ndim() << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    if (arr.shape(axis) < 2 || arr.shape(axis) > 3) {
      throw std::invalid_argument(
          "[linalg::cross] The specified axis must have size 2 or 3.");
    }
  };
  check_ax(a);
  check_ax(b);

  bool a_2d = a.shape(axis) == 2;
  bool b_2d = b.shape(axis) == 2;

  auto out_type = promote_types(a.dtype(), b.dtype());
  auto ashape = a.shape();
  auto bshape = b.shape();

  // Pretend both inputs have 3 components so the other axes broadcast.
  ashape[axis < 0 ? axis + a.ndim() : axis] = 3;
  bshape[axis < 0 ? axis + b.ndim() : axis] = 3;
  auto out_shape = broadcast_shapes(ashape, bshape);

  if (axis < 0) {
    axis += out_shape.size();
  }

  out_shape[axis] = a_2d ? 2 : 3;
  auto a_ = broadcast_to(astype(a, out_type, s), out_shape, s);

  out_shape[axis] = b_2d ? 2 : 3;
  auto b_ = broadcast_to(astype(b, out_type, s), out_shape, s);

  // Split into per-component arrays on the requested stream.
  auto a_splits = split(a_, a_2d ? 2 : 3, axis, s);
  auto b_splits = split(b_, b_2d ? 2 : 3, axis, s);

  std::vector<array> outputs;
  if (a_2d && b_2d) {
    auto z = zeros_like(a_splits[0], s);
    outputs.push_back(z);
    outputs.push_back(z);
  } else if (b_2d) {
    outputs.push_back(negative(multiply(a_splits[2], b_splits[1], s), s));
    outputs.push_back(multiply(a_splits[2], b_splits[0], s));
  } else if (a_2d) {
    outputs.push_back(multiply(a_splits[1], b_splits[2], s));
    outputs.push_back(negative(multiply(a_splits[0], b_splits[2], s), s));
  } else {
    outputs.push_back(subtract(
        multiply(a_splits[1], b_splits[2], s),
        multiply(a_splits[2], b_splits[1], s),
        s));
    outputs.push_back(subtract(
        multiply(a_splits[2], b_splits[0], s),
        multiply(a_splits[0], b_splits[2], s),
        s));
  }
  // The last component only involves the first two entries of each input.
  outputs.push_back(subtract(
      multiply(a_splits[0], b_splits[1], s),
      multiply(a_splits[1], b_splits[0], s),
      s));
  return concatenate(outputs, axis, s);
}

// Shared argument checks for the eigen routines: CPU stream, float or
// complex dtype, at least 2 dimensions and square trailing dimensions.
// Errors are prefixed with `fname`.
void validate_eig(
    const array& a,
    const StreamOrDevice& stream,
    const std::string& fname) {
  check_cpu_stream(stream, fname);
  check_float_or_complex(a.dtype(), fname);

  if (a.ndim() < 2) {
    std::ostringstream err;
    err << fname << " Arrays must have >= 2 dimensions. Received array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const bool is_square = a.shape(-1) == a.shape(-2);
  if (!is_square) {
    throw std::invalid_argument(fname + " Only defined for square matrices.");
  }
}

// Eigenvalues only, via the Eigh primitive. The output drops the last
// axis of `a`; complex64 inputs yield float32 eigenvalues.
array eigvalsh(
    const array& a,
    std::string UPLO /* = "L" */,
    StreamOrDevice s /* = {} */) {
  validate_eig(a, s, "[linalg::eigvalsh]");

  const auto& dims = a.shape();
  Shape values_shape(dims.begin(), dims.end() - 1);
  const Dtype values_type = a.dtype() == complex64 ? float32 : a.dtype();
  auto primitive = std::make_shared<Eigh>(to_stream(s), UPLO, false);
  return array(std::move(values_shape), values_type, std::move(primitive), {a});
}

std::pair<array, array> eigh(
    const array& a,
    std::string UPLO /* = "L" */,
    StreamOrDevice s /* = {} */) {
  validate_eig(a, s, "[linalg::eigh]");
  Dtype eigval_type = a.dtype() == complex64 ? float32 : a.dtype();
  auto out = array::make_arrays(
      {Shape(a.shape().begin(), a.shape().end() - 1), a.shape()},
      {eigval_type, a.dtype()},
      std::make_shared<Eigh>(to_stream(s), UPLO, true),
      {a});
  return std::make_pair(out[0], out[1]);
}

// Eigenvalues only, via the Eig primitive. The output is always complex64
// and drops the last axis of `a`.
array eigvals(const array& a, StreamOrDevice s /* = {} */) {
  validate_eig(a, s, "[linalg::eigvals]");

  const auto& dims = a.shape();
  Shape values_shape(dims.begin(), dims.end() - 1);
  auto primitive = std::make_shared<Eig>(to_stream(s), false);
  return array(std::move(values_shape), complex64, std::move(primitive), {a});
}

// Eigenvalues and eigenvectors via the Eig primitive; both outputs are
// complex64. Eigenvalues drop the last axis of `a`, eigenvectors keep its
// shape.
std::pair<array, array> eig(const array& a, StreamOrDevice s /* = {} */) {
  validate_eig(a, s, "[linalg::eig]");

  const auto& dims = a.shape();
  auto results = array::make_arrays(
      {Shape(dims.begin(), dims.end() - 1), dims},
      {complex64, complex64},
      std::make_shared<Eig>(to_stream(s), true),
      {a});
  return std::make_pair(results[0], results[1]);
}

// Shared argument checks for the LU routines: CPU stream, float32/float64
// dtype and at least 2 dimensions. Errors are prefixed with `fname`.
void validate_lu(
    const array& a,
    const StreamOrDevice& stream,
    const std::string& fname) {
  check_cpu_stream(stream, fname);
  check_float(a.dtype(), fname);

  if (a.ndim() >= 2) {
    return;
  }
  std::ostringstream err;
  err << fname
      << " Arrays must have >= 2 dimensions. Received array "
         "with "
      << a.ndim() << " dimensions.";
  throw std::invalid_argument(err.str());
}

std::vector<array> lu_helper(const array& a, StreamOrDevice s /* = {} */) {
  int m = a.shape()[a.shape().size() - 2];
  int n = a.shape()[a.shape().size() - 1];

  Shape pivots_shape(a.shape().begin(), a.shape().end() - 2);
  pivots_shape.push_back(std::min(m, n));

  Shape row_idx_shape(a.shape().begin(), a.shape().end() - 1);

  return array::make_arrays(
      {a.shape(), pivots_shape, row_idx_shape},
      {a.dtype(), uint32, uint32},
      std::make_shared<LUF>(to_stream(s)),
      {astype(a, a.dtype(), s)});
}

std::vector<array> lu(const array& a, StreamOrDevice s /* = {} */) {
  validate_lu(a, s, "[linalg::lu]");

  auto out = lu_helper(a, s);
  auto& LU = out[0];
  auto& row_pivots = out[2];
  auto L = tril(LU, /* k = */ -1, s);
  auto U = triu(LU, /* k = */ 0, s);

  int M = a.shape(-2);
  int N = a.shape(-1);
  int K = std::min(M, N);
  if (N != K) {
    auto start = Shape(L.ndim(), 0);
    auto stop = L.shape();
    stop.back() = K;
    L = slice(L, std::move(start), std::move(stop), s);
  } else if (M != K) {
    auto start = Shape(U.ndim(), 0);
    auto stop = U.shape();
    stop[U.ndim() - 2] = K;
    U = slice(U, std::move(start), std::move(stop), s);
  }
  L = add(L, eye(M, K, s), s);
  return {row_pivots, L, U};
}

std::pair<array, array> lu_factor(const array& a, StreamOrDevice s /* = {} */) {
  validate_lu(a, s, "[linalg::lu_factor]");
  auto out = lu_helper(a, s);
  return std::make_pair(out[0], out[1]);
}

// Validate the operands of a linear solve `a x = b`.
//
// Requirements (each violation throws std::invalid_argument prefixed with
// `fname`):
//   - `a` has at least 2 dims and its trailing two dims are equal,
//   - `b` has at least 1 dim,
//   - the last dim of `a` matches the second-to-last dim of `b` (or the only
//     dim when `b` is 1-D),
//   - the promoted dtype of `a` and `b` is float32 or float64.
// The stream is additionally checked with check_cpu_stream.
void validate_solve(
    const array& a,
    const array& b,
    const StreamOrDevice& stream,
    const std::string& fname) {
  check_cpu_stream(stream, fname);

  const auto a_rank = a.ndim();
  if (a_rank < 2) {
    std::ostringstream err;
    err << fname << " First input must have >= 2 dimensions. "
        << "Received array with " << a_rank << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const auto b_rank = b.ndim();
  if (b_rank < 1) {
    std::ostringstream err;
    err << fname << " Second input must have >= 1 dimensions. "
        << "Received array with " << b_rank << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  const bool a_is_square = a.shape(-1) == a.shape(-2);
  if (!a_is_square) {
    std::ostringstream err;
    err << fname << " First input must be a square matrix. "
        << "Received array with shape " << a.shape() << ".";
    throw std::invalid_argument(err.str());
  }

  // For a vector right-hand side the only axis is the one to match.
  const int b_match_axis = (b_rank > 1) ? -2 : -1;
  if (a.shape(-1) != b.shape(b_match_axis)) {
    std::ostringstream err;
    err << fname << " Last dimension of first input with shape " << a.shape()
        << " must match second to last dimension of"
        << " second input with shape " << b.shape() << ".";
    throw std::invalid_argument(err.str());
  }

  const auto promoted = promote_types(a.dtype(), b.dtype());
  const bool supported = (promoted == float32) || (promoted == float64);
  if (!supported) {
    std::ostringstream err;
    err << fname
        << " Input arrays must promote to float32 or float64. "
           " Received arrays with type "
        << a.dtype() << " and " << b.dtype() << ".";
    throw std::invalid_argument(err.str());
  }
}

// Solve `a x = b` through an LU factorization of `a`.
//
// lu() yields {row pivots, L, U}. The rows of `b` are gathered with the
// argsort of the pivots, then the lower- and upper-triangular systems are
// solved in turn.
array solve(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  validate_solve(a, b, s, "[linalg::solve]");

  const auto factors = lu(a, s);
  const bool rhs_is_matrix = b.ndim() >= 2;

  auto perm = argsort(factors[0], -1, s);
  if (rhs_is_matrix) {
    // Add a trailing axis so the permutation lines up with b's row axis.
    perm = expand_dims(perm, -1, s);
  }
  const int row_axis = rhs_is_matrix ? -2 : -1;

  auto permuted_b = take_along_axis(b, perm, row_axis, s);
  auto y = solve_triangular(factors[1], permuted_b, /* upper = */ false, s);
  return solve_triangular(factors[2], y, /* upper = */ true, s);
}

// Solve a triangular system `a x = b` by multiplying `b` with the triangular
// inverse of `a`. `upper` is forwarded to tri_inv to select which triangle of
// `a` is used.
array solve_triangular(
    const array& a,
    const array& b,
    bool upper /* = false */,
    StreamOrDevice s /* = {} */) {
  validate_solve(a, b, s, "[linalg::solve_triangular]");
  return matmul(tri_inv(a, upper, s), b, s);
}

} // namespace mlx::core::linalg

================================================
FILE: mlx/linalg.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <optional>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/device.h"
#include "mlx/ops.h"
#include "mlx/stream.h"

namespace mlx::core::linalg {

/**
 * Compute vector or matrix norms.
 *
 * - If axis and ord are both unspecified, computes the 2-norm of flatten(x).
 * - If axis is not provided but ord is, then x must be either 1D or 2D.
 * - If axis is provided, but ord is not, then the 2-norm (or Frobenius norm
 *   for matrices) is computed along the given axes. At most 2 axes can be
 *   specified.
 * - If both axis and ord are provided, then the corresponding matrix or vector
 *   norm is computed. At most 2 axes can be specified.
 */
MLX_API array norm(
    const array& a,
    const double ord,
    const std::optional<std::vector<int>>& axis = std::nullopt,
    bool keepdims = false,
    StreamOrDevice s = {});
/** Single-axis convenience overload: forwards `axis` as a one-element list. */
inline array norm(
    const array& a,
    const double ord,
    int axis,
    bool keepdims = false,
    StreamOrDevice s = {}) {
  return norm(a, ord, std::vector<int>{axis}, keepdims, s);
}
/** Same as above, with the order of the norm given by name (string). */
MLX_API array norm(
    const array& a,
    const std::string& ord,
    const std::optional<std::vector<int>>& axis = std::nullopt,
    bool keepdims = false,
    StreamOrDevice s = {});
/** Single-axis convenience overload: forwards `axis` as a one-element list. */
inline array norm(
    const array& a,
    const std::string& ord,
    int axis,
    bool keepdims = false,
    StreamOrDevice s = {}) {
  return norm(a, ord, std::vector<int>{axis}, keepdims, s);
}
/** Norm with no explicit order (see the rules listed above). */
MLX_API array norm(
    const array& a,
    const std::optional<std::vector<int>>& axis = std::nullopt,
    bool keepdims = false,
    StreamOrDevice s = {});
/** Single-axis convenience overload: forwards `axis` as a one-element list. */
inline array
norm(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {}) {
  return norm(a, std::vector<int>{axis}, keepdims, s);
}

MLX_API std::pair<array, array> qr(const array& a, StreamOrDevice s = {});

MLX_API std::vector<array>
svd(const array& a, bool compute_uv, StreamOrDevice s /* = {} */);
/** Convenience overload: calls svd with compute_uv = true. */
inline std::vector<array> svd(const array& a, StreamOrDevice s = {}) {
  return svd(a, true, s);
}

MLX_API array inv(const array& a, StreamOrDevice s = {});

MLX_API array
tri_inv(const array& a, bool upper = false, StreamOrDevice s = {});

MLX_API array
cholesky(const array& a, bool upper = false, StreamOrDevice s = {});

MLX_API array pinv(const array& a, StreamOrDevice s = {});

MLX_API array
cholesky_inv(const array& a, bool upper = false, StreamOrDevice s = {});

/**
 * LU factorization. The implementation returns three arrays in the order
 * {row pivots, L, U}; the input must have at least 2 dimensions.
 */
MLX_API std::vector<array> lu(const array& a, StreamOrDevice s = {});

/**
 * LU factorization in packed form. The implementation returns the pair
 * (array with the shape of `a`, uint32 pivots); the input must have at least
 * 2 dimensions.
 */
MLX_API std::pair<array, array> lu_factor(
    const array& a,
    StreamOrDevice s = {});

/**
 * Solve the linear system `a x = b`. The implementation requires the trailing
 * two dimensions of `a` to be square and the inputs to promote to float32 or
 * float64, and solves via an LU factorization of `a`.
 */
MLX_API array solve(const array& a, const array& b, StreamOrDevice s = {});

/**
 * Solve the triangular system `a x = b`. `upper` is forwarded to tri_inv and
 * the result is computed as tri_inv(a, upper) matmul b.
 */
MLX_API array solve_triangular(
    const array& a,
    const array& b,
    bool upper = false,
    StreamOrDevice s = {});

/**
 * Compute the cross product of two arrays along the given axis.
 */
MLX_API array
cross(const array& a, const array& b, int axis = -1, StreamOrDevice s = {});

MLX_API std::pair<array, array> eig(const array& a, StreamOrDevice s = {});

MLX_API array eigvals(const array& a, StreamOrDevice s = {});

MLX_API array
eigvalsh(const array& a, std::string UPLO = "L", StreamOrDevice s = {});

MLX_API std::pair<array, array>
eigh(const array& a, std::string UPLO = "L", StreamOrDevice s = {});

} // namespace mlx::core::linalg


================================================
FILE: mlx/memory.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <cstdlib>

#include "mlx/api.h"

namespace mlx::core {

/* Get the actively used memory in bytes.
 *
 * Note, this will not always match the memory use reported by the system
 * because it does not include cached memory buffers.
 * */
MLX_API size_t get_active_memory();

/* Get the peak amount of used memory in bytes.
 *
 * The maximum memory used, recorded from the beginning of the program
 * execution or since the last call to reset_peak_memory.
 * */
MLX_API size_t get_peak_memory();

/* Reset the peak memory to zero.
 * */
MLX_API void reset_peak_memory();

/* Get the cache size in bytes.
 *
 * The cache includes memory not currently used that has not been returned
 * to the system allocator.
 * */
MLX_API size_t get_cache_memory();

/* Set the memory limit.
 * The memory limit is a guideline for the maximum amount of memory to use
 * during graph evaluation. If the memory limit is exceeded and there is no
 * more RAM (including swap when available), allocations will result in an
 * exception.
 *
 * When Metal is available, the memory limit defaults to 1.5 times the maximum
 * recommended working set size reported by the device.
 *
 * Returns the previous memory limit.
 * */
MLX_API size_t set_memory_limit(size_t limit);

/* Get the current memory limit. */
MLX_API size_t get_memory_limit();

/* Set the cache limit.
 * If using more than the given limit, free memory will be reclaimed
 * from the cache on the next allocation. To disable the cache,
 * set the limit to 0.
 *
 * The cache limit defaults to the memory limit.
 *
 * Returns the previous cache limit.
 * */
MLX_API size_t set_cache_limit(size_t limit);

/* Clear the memory cache. */
MLX_API void clear_cache();

/* Set the wired size limit.
 *
 * Note, this function is only useful when using the Metal backend with
 * macOS 15.0 or higher.
 *
 * The wired limit is the total size in bytes of memory that will be kept
 * resident. The default value is ``0``.
 *
 * Setting a wired limit larger than the system wired limit is an error.
 *
 * Returns the previous wired limit.
 * */
MLX_API size_t set_wired_limit(size_t limit);

} // namespace mlx::core


================================================
FILE: mlx/mlx.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include "mlx/array.h"
#include "mlx/backend/cuda/cuda.h"
#include "mlx/backend/gpu/device_info.h"
#include "mlx/backend/metal/metal.h"
#include "mlx/compile.h"
#include "mlx/device.h"
#include "mlx/distributed/distributed.h"
#include "mlx/distributed/ops.h"
#include "mlx/einsum.h"
#include "mlx/export.h"
#include "mlx/fast.h"
#include "mlx/fft.h"
#include "mlx/io.h"
#include "mlx/linalg.h"
#include "mlx/memory.h"
#include "mlx/ops.h"
#include "mlx/random.h"
#include "mlx/stream.h"
#include "mlx/transforms.h"
#include "mlx/utils.h"
#include "mlx/version.h"


================================================
FILE: mlx/ops.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

// Required for using M_PI in MSVC.
#define _USE_MATH_DEFINES
#include <algorithm>
#include <climits>
#include <cmath>
#include <numeric>
#include <set>
#include <sstream>

#include "mlx/backend/cuda/cuda.h"
#include "mlx/backend/metal/metal.h"
#include "mlx/fast_primitives.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

std::tuple<Shape, std::vector<int>, bool> compute_reduce_shape(
    const std::vector<int>& axes,
    const Shape& shape) {
  bool is_noop = true;
  std::set<int> axes_set;
  auto ndim = shape.size();
  for (auto ax : axes) {
    int ax_ = (ax < 0) ? ax + ndim : ax;
    if (ax_ < 0 || ax_ >= ndim) {
      std::ostringstream msg;
      msg << "Invalid axis " << ax << " for array with " << ndim
          << " dimensions.";
      throw std::out_of_range(msg.str());
    }
    axes_set.insert(ax_);
  }
  if (axes_set.size() != axes.size()) {
    throw std::invalid_argument("Duplicate axes detected in reduction.");
  }
  Shape out_shape;
  for (int i = 0; i < ndim; ++i) {
    if (axes_set.count(i) == 0) {
      out_shape.push_back(shape[i]);
    } else {
      out_shape.push_back(1);
    }
    is_noop &= (out_shape.back() == shape[i]);
  }
  std::vector<int> sorted_axes(axes_set.begin(), axes_set.end());
  return {out_shape, sorted_axes, is_noop};
}

// Return `d` unchanged when it is an inexact dtype; otherwise return the
// promotion of `d` with float32.
Dtype at_least_float(const Dtype& d) {
  if (issubdtype(d, inexact)) {
    return d;
  }
  return promote_types(d, float32);
}

// Return `indices` when provided. Otherwise build default indices for `x`:
// a uint32 arange over the product of x's leading (all but the last two)
// dimensions, reshaped to those leading dimensions.
array indices_or_default(
    std::optional<array> indices,
    const array& x,
    StreamOrDevice s) {
  if (indices) {
    return *indices;
  }

  const auto& x_shape = x.shape();
  Shape batch_shape(x_shape.begin(), x_shape.end() - 2);

  int total = 1;
  for (const auto dim : batch_shape) {
    total *= dim;
  }
  return reshape(arange(total, uint32, s), std::move(batch_shape), s);
}

void validate_quantized_input(
    std::string_view tag,
    const array& w,
    const array& scales,
    int group_size,
    int bits,
    const std::optional<array>& biases = std::nullopt) {
  if (w.dtype() != uint32) {
    std::ostringstream msg;
    msg << "[" << tag << "] The weight matrix should be uint32 "
        << "but received " << w.dtype();
    throw std::invalid_argument(msg.str());
  }

  if (biases && scales.shape() != biases->shape()) {
    std::ostringstream msg;
    msg << "[" << tag << "] Scales and biases should have the same shape. "
        << "Received scales with shape " << scales.shape()
        << " and biases with " << biases->shape();
    throw std::invalid_argument(msg.str());
  }

  if (!std::equal(
          w.shape().begin(), w.shape().end() - 2, scales.shape().begin())) {
    std::ostringstream msg;
    msg << "[" << tag
        << "] Weight and scales should have the same batch shape. "
        << "Received weight with shape " << w.shape() << ", scales with "
        << scales.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  if (w.shape(-1) * 32 / bits != scales.shape(-1) * group_size) {
    std::ostringstream msg;
    msg << "[" << tag << "] The shapes of the weight and scales are "
        << "incompatible based on bits and group_size. w.shape() == "
        << w.shape() << " and scales.shape() == " << scales.shape()
        << " with group_size=" << group_size << " and bits=" << bits;
    throw std::invalid_argument(msg.str());
  }
}

// Validate the quantized operands and return the (inner, outer) dimensions of
// the expanded (de-quantized) weight matrix.
//
// The packed last dimension of `w` expands to `w.shape(-1) * 32 / bits`
// elements. With `transpose` set that expanded dimension is the inner
// (contracted) one; otherwise it is the outer one. Throws
// std::invalid_argument when the inner dimension does not match the last
// dimension of `x`.
std::pair<int, int> extract_quantized_matmul_dims(
    std::string_view tag,
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases,
    bool transpose,
    int group_size,
    int bits) {
  validate_quantized_input(tag, w, scales, group_size, bits, biases);

  const int x_inner_dims = x.shape(-1);

  // Size of w's last axis once unpacked, and its second-to-last axis.
  const int unpacked_last = w.shape(-1) * 32 / bits;
  const int second_to_last = w.shape(-2);

  int w_inner_dims = second_to_last;
  int w_outer_dims = unpacked_last;
  if (transpose) {
    std::swap(w_inner_dims, w_outer_dims);
  }

  if (w_inner_dims == x_inner_dims) {
    return {w_inner_dims, w_outer_dims};
  }

  std::ostringstream err;
  err << "[" << tag << "] Last dimension of first input with "
      << "shape (..., " << x_inner_dims << ") does not match "
      << "the expanded quantized matrix (" << w_inner_dims << ", "
      << w_outer_dims << ") computed from shape " << w.shape()
      << " with group_size=" << group_size << ", bits=" << bits
      << " and transpose=" << std::boolalpha << transpose;
  throw std::invalid_argument(err.str());
}

} // namespace

// Create a 1-D array of evenly spaced values in [start, stop) with the given
// step and dtype.
//
// Throws std::invalid_argument when:
//   - dtype is bool_,
//   - start, stop or step is NaN,
//   - start or stop is infinite,
//   - the computed length exceeds INT_MAX.
// An infinite step pointing from start towards stop yields the single value
// {start}. Any other combination that does not describe a forward range
// (including a non-positive or non-finite computed length) yields an empty
// array.
array arange(
    double start,
    double stop,
    double step,
    Dtype dtype,
    StreamOrDevice s /* = {} */) {
  if (dtype == bool_) {
    std::ostringstream msg;
    msg << bool_ << " not supported for arange.";
    throw std::invalid_argument(msg.str());
  }
  if (std::isnan(start) || std::isnan(step) || std::isnan(stop)) {
    throw std::invalid_argument("[arange] Cannot compute length.");
  }

  if (std::isinf(start) || std::isinf(stop)) {
    throw std::invalid_argument("[arange] Cannot compute length.");
  }

  // An infinite step in the direction of stop covers exactly one element.
  if (std::isinf(step) &&
      ((step > 0 && start < stop) || (step < 0 && start > stop))) {
    return array({start}, dtype);
  }

  const double real_size = std::ceil((stop - start) / step);

  if (real_size > INT_MAX) {
    throw std::invalid_argument("[arange] Maximum size exceeded.");
  }

  // Clamp in floating point BEFORE converting: casting NaN, -inf or a value
  // below INT_MIN to int is undefined behaviour (reachable e.g. with
  // step == 0 or a huge negative range). Anything not strictly positive,
  // NaN included, maps to an empty array.
  const int size = real_size > 0 ? static_cast<int>(real_size) : 0;
  return array(
      {size},
      dtype,
      std::make_shared<Arange>(to_stream(s), start, stop, step),
      {});
}
// Convenience overloads of arange. Each one fills in the missing pieces
// (start = 0, step = 1, and float32 for floating-point bounds or int32 for
// integer bounds) and forwards to the full
// arange(start, stop, step, dtype, stream) overload.

array arange(
    double start,
    double stop,
    double step,
    StreamOrDevice s /* = {} */) {
  auto stream = to_stream(s);
  return arange(start, stop, step, float32, stream);
}

array arange(
    double start,
    double stop,
    Dtype dtype,
    StreamOrDevice s /* = {} */) {
  const double unit_step = 1.0;
  auto stream = to_stream(s);
  return arange(start, stop, unit_step, dtype, stream);
}

array arange(double start, double stop, StreamOrDevice s /* = {} */) {
  const double unit_step = 1.0;
  auto stream = to_stream(s);
  return arange(start, stop, unit_step, float32, stream);
}

array arange(double stop, Dtype dtype, StreamOrDevice s /* = {} */) {
  const double origin = 0.0;
  const double unit_step = 1.0;
  auto stream = to_stream(s);
  return arange(origin, stop, unit_step, dtype, stream);
}

array arange(double stop, StreamOrDevice s /* = {} */) {
  const double origin = 0.0;
  const double unit_step = 1.0;
  auto stream = to_stream(s);
  return arange(origin, stop, unit_step, float32, stream);
}

array arange(int start, int stop, int step, StreamOrDevice s /* = {} */) {
  const double first = start;
  const double last = stop;
  const double stride = step;
  auto stream = to_stream(s);
  return arange(first, last, stride, int32, stream);
}

array arange(int start, int stop, StreamOrDevice s /* = {} */) {
  const double first = start;
  const double last = stop;
  const double unit_step = 1.0;
  auto stream = to_stream(s);
  return arange(first, last, unit_step, int32, stream);
}

array arange(int stop, StreamOrDevice s /* = {} */) {
  const double origin = 0.0;
  const double last = stop;
  const double unit_step = 1.0;
  auto stream = to_stream(s);
  return arange(origin, last, unit_step, int32, stream);
}

// Generate `num` evenly spaced values from `start` to `stop` (inclusive).
//
// The interpolation weights t = i / (num - 1) are computed in float64 when
// `dtype` is float64 and in float32 otherwise; the result
// (1 - t) * start + t * stop is then cast to `dtype`.
// Throws std::invalid_argument for a negative `num`. With num == 1 the result
// is just {start}.
array linspace(
    double start,
    double stop,
    int num /* = 50 */,
    Dtype dtype /* = float32 */,
    StreamOrDevice s /* = {} */) {
  if (num < 0) {
    std::ostringstream err;
    err << "[linspace] number of samples, " << num << ", must be non-negative.";
    throw std::invalid_argument(err.str());
  }
  if (num == 1) {
    return astype(array({start}), dtype, s);
  }

  auto inner_type = dtype == float64 ? float64 : float32;

  // t runs over 0, 1/(num-1), ..., 1.
  array t =
      divide(arange(0, num, inner_type, s), array(num - 1, inner_type), s);
  array one_minus_t = subtract(array(1, inner_type), t, s);

  array start_part = multiply(one_minus_t, array(start, inner_type), s);
  array stop_part = multiply(t, array(stop, inner_type), s);
  array blended = add(start_part, stop_part, s);
  return astype(blended, dtype, s);
}

// Cast `a` to `dtype`. Returns `a` itself when it already has that dtype;
// otherwise creates an array backed by an AsType primitive.
array astype(array a, Dtype dtype, StreamOrDevice s /* = {} */) {
  const bool already_typed = (dtype == a.dtype());
  if (already_typed) {
    return a;
  }

  // Take the shape before `a` is moved into the input list.
  auto out_shape = a.shape();
  auto primitive = std::make_shared<AsType>(to_stream(s), dtype);
  return array(
      std::move(out_shape), dtype, std::move(primitive), {std::move(a)});
}

// Create a view of `a` with the given shape, strides and element offset via
// the AsStrided primitive. The input is flattened first so the primitive
// receives a contiguous array.
array as_strided(
    array a,
    Shape shape,
    Strides strides,
    size_t offset,
    StreamOrDevice s /* = {} */) {
  // Read everything we need before the arguments are moved away.
  auto dtype = a.dtype();
  auto out_shape = shape;

  auto primitive = std::make_shared<AsStrided>(
      to_stream(s), std::move(shape), std::move(strides), offset);

  // Force the input array to be contiguous.
  auto flat_input = flatten(std::move(a), s);

  return array(
      std::move(out_shape),
      dtype,
      std::move(primitive),
      {std::move(flat_input)});
}

// Create an array with the same shape and dtype as `a`, produced by the Copy
// primitive with `a` as its input.
array copy(array a, StreamOrDevice s /* = {} */) {
  // Capture shape and dtype before `a` is moved into the input list.
  auto out_shape = a.shape();
  auto out_dtype = a.dtype();
  auto primitive = std::make_shared<Copy>(to_stream(s));
  return array(
      std::move(out_shape), out_dtype, std::move(primitive), {std::move(a)});
}

// Build a Full-primitive array with the shape of `vals` and the requested
// dtype; `vals` is cast to `dtype` before being used as the input.
array full_impl(array vals, Dtype dtype, StreamOrDevice s /* = {} */) {
  auto out_shape = vals.shape();
  auto typed_vals = astype(vals, dtype, s);
  auto primitive = std::make_shared<Full>(to_stream(s));
  return array(
      std::move(out_shape),
      dtype,
      std::move(primitive),
      {std::move(typed_vals)});
}

// Fill an array of the given shape with `vals` (broadcast to `shape`) using
// the requested dtype. Throws std::invalid_argument if any dimension is
// negative.
array full(Shape shape, array vals, Dtype dtype, StreamOrDevice s /* = {} */) {
  for (const auto dim : shape) {
    if (dim < 0) {
      throw std::invalid_argument("[full] Negative dimensions not allowed.");
    }
  }
  auto broadcasted = broadcast_to(vals, std::move(shape), s);
  return full_impl(broadcasted, dtype, s);
}

// Overload of full() that keeps the dtype of `vals`.
array full(Shape shape, array vals, StreamOrDevice s /* = {} */) {
  // Read the dtype first: `vals` is moved in the call below.
  const auto vals_dtype = vals.dtype();
  auto stream = to_stream(s);
  return full(std::move(shape), std::move(vals), vals_dtype, stream);
}

// Create an array filled with `vals`, where `vals` is first broadcast
// together with `a` (via broadcast_arrays), using the requested dtype.
array full_like(
    const array& a,
    array vals,
    Dtype dtype,
    StreamOrDevice s /* = {} */) {
  auto broadcasted = broadcast_arrays({a, std::move(vals)}, s);
  // Index 1 is `vals` after broadcasting against `a`.
  auto& fill_values = broadcasted[1];
  return full_impl(std::move(fill_values), dtype, s);
}

// Overload of full_like() that uses the dtype of `a`.
array full_like(const array& a, array vals, StreamOrDevice s /* = {} */) {
  const auto like_dtype = a.dtype();
  return full_like(a, std::move(vals), like_dtype, to_stream(s));
}

// Array of the given shape and dtype filled with 0.
array zeros(const Shape& shape, Dtype dtype, StreamOrDevice s /* = {} */) {
  array fill_value(0, dtype);
  return full(shape, fill_value, to_stream(s));
}

// Array filled with 0 that takes its shape and dtype from `a`.
array zeros_like(const array& a, StreamOrDevice s /* = {} */) {
  const auto like_dtype = a.dtype();
  return full_like(a, 0, like_dtype, to_stream(s));
}

// Array of the given shape and dtype filled with 1.
array ones(const Shape& shape, Dtype dtype, StreamOrDevice s /* = {} */) {
  array fill_value(1, dtype);
  return full(shape, fill_value, to_stream(s));
}

// Array filled with 1 that takes its shape and dtype from `a`.
array ones_like(const array& a, StreamOrDevice s /* = {} */) {
  const auto like_dtype = a.dtype();
  return full_like(a, 1, like_dtype, to_stream(s));
}

// Create an n x m array with ones on the k-th diagonal and zeros elsewhere
// (k > 0 is above the main diagonal, k < 0 below).
//
// Throws std::invalid_argument unless both n and m are positive. When the
// requested diagonal lies entirely outside the matrix the result is all
// zeros.
array eye(int n, int m, int k, Dtype dtype, StreamOrDevice s /* = {} */) {
  if (n <= 0 || m <= 0) {
    throw std::invalid_argument("[eye] N and M must be positive integers.");
  }

  array result = zeros({n, m}, dtype, s);
  const bool diagonal_outside = (k >= m) || (-k >= n);
  if (diagonal_outside) {
    return result;
  }

  // First row / column touched by the diagonal and the number of entries.
  const auto first_row = std::max(0, -k);
  const auto first_col = std::max(0, k);
  int diagonal_length = std::min(n, m - k);
  if (k < 0) {
    diagonal_length = std::min(n + k, m);
  }

  std::vector<array> indices;
  indices.push_back(arange(first_row, diagonal_length + first_row, int32, s));
  indices.push_back(arange(first_col, diagonal_length + first_col, int32, s));

  // Scatter one value per diagonal entry into the zero matrix.
  array ones_array = ones({diagonal_length, 1, 1}, dtype, s);
  return scatter(result, indices, ones_array, {0, 1}, s);
}

// n x n identity matrix: eye() on the main diagonal.
array identity(int n, Dtype dtype, StreamOrDevice s /* = {} */) {
  const int main_diagonal = 0;
  return eye(n, n, main_diagonal, dtype, s);
}

// Build an n x m array of the given type whose entry (i, j) is the result of
// `i >= j - k`, i.e. set on and below the k-th diagonal.
array tri(int n, int m, int k, Dtype type, StreamOrDevice s /* = {} */) {
  // Column vector of row indices and row vector of shifted column indices.
  auto row_ids = expand_dims(arange(n, s), 1, s);
  auto col_ids = expand_dims(arange(-k, m - k, s), 0, s);
  auto mask = greater_equal(row_ids, col_ids, s);
  return astype(mask, type, s);
}

// Keep the entries of `x` on and below the k-th diagonal of its last two
// dimensions and zero the rest. Throws std::invalid_argument for inputs with
// fewer than two dimensions.
array tril(array x, int k /* = 0 */, StreamOrDevice s /* = {} */) {
  if (x.ndim() < 2) {
    throw std::invalid_argument("[tril] array must be at least 2-D");
  }
  const auto rows = x.shape(-2);
  const auto cols = x.shape(-1);
  auto keep = tri(rows, cols, k, bool_, s);
  array zero(0, x.dtype());
  return where(keep, x, zero, s);
}

// Keep the entries of `x` on and above the k-th diagonal of its last two
// dimensions and zero the rest. Throws std::invalid_argument for inputs with
// fewer than two dimensions.
array triu(array x, int k /* = 0 */, StreamOrDevice s /* = {} */) {
  if (x.ndim() < 2) {
    throw std::invalid_argument("[triu] array must be at least 2-D");
  }
  const auto rows = x.shape(-2);
  const auto cols = x.shape(-1);
  // Mask of everything strictly below the k-th diagonal; those get zeroed.
  auto drop = tri(rows, cols, k - 1, bool_, s);
  array zero(0, x.dtype());
  return where(drop, zero, x, s);
}

// Reshape `a` to `shape`. Returns `a` unchanged when the shape already
// matches; otherwise the output shape is resolved by Reshape::output_shape
// and a Reshape primitive is attached.
array reshape(const array& a, Shape shape, StreamOrDevice s /* = {} */) {
  const bool same_shape = (a.shape() == shape);
  if (same_shape) {
    return a;
  }

  // Resolve the final shape before `shape` is moved into the primitive.
  auto resolved_shape = Reshape::output_shape(a, shape);
  auto primitive = std::make_shared<Reshape>(to_stream(s), std::move(shape));
  return array(
      std::move(resolved_shape), a.dtype(), std::move(primitive), {a});
}

// Split axis `axis` of `a` into the dimensions given by `shape`.
//
// At most one entry of `shape` may be -1, in which case it is inferred from
// the size of the axis. Throws std::invalid_argument when:
//   - `shape` is empty,
//   - `axis` is out of range for `a`,
//   - more than one entry is -1, or an entry is negative but not -1,
//   - a dimension has to be inferred while another entry is 0,
//   - the product of `shape` does not equal the size of the axis.
// A single-entry `shape` that matches the axis returns `a` unchanged.
array unflatten(
    const array& a,
    int axis,
    Shape shape,
    StreamOrDevice s /* = {} */) {
  if (shape.empty()) {
    throw std::invalid_argument(
        "[unflatten] Shape to unflatten to cannot be empty.");
  }
  auto ndim = static_cast<int>(a.ndim());
  auto ax = axis < 0 ? axis + ndim : axis;
  if (ax < 0 || ax >= ndim) {
    std::ostringstream msg;
    // Report the axis as the caller passed it, not the normalized value.
    msg << "[unflatten] Invalid axes " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  size_t size = 1;
  int infer_idx = -1;
  for (int i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      if (infer_idx >= 0) {
        throw std::invalid_argument(
            "[Unflatten] Can only infer one dimension.");
      }
      infer_idx = i;
    } else if (shape[i] < 0) {
      // Other negative sizes would wrap around in the unsigned product below
      // and could accidentally satisfy the size check.
      std::ostringstream msg;
      msg << "[Unflatten] Invalid dimension " << shape[i] << " in shape "
          << shape << ".";
      throw std::invalid_argument(msg.str());
    } else {
      size *= shape[i];
    }
  }
  if (infer_idx >= 0) {
    if (size == 0) {
      // The inferred size would require dividing by zero.
      std::ostringstream msg;
      msg << "[Unflatten] Cannot infer a dimension of shape " << shape
          << " because another dimension is 0.";
      throw std::invalid_argument(msg.str());
    }
    shape[infer_idx] = a.shape(ax) / size;
    size *= shape[infer_idx];
  }
  if (size != a.shape(ax)) {
    std::ostringstream msg;
    msg << "[Unflatten] Cannot unflatten axis " << axis << " with size "
        << a.shape(ax) << " into shape " << shape << ".";
    throw std::invalid_argument(msg.str());
  }
  // Unflattening into a single dimension of the same size is a no-op.
  if (shape.size() == 1) {
    return a;
  }

  auto out_shape = Unflatten::output_shape(a, ax, shape);
  return array(
      std::move(out_shape),
      a.dtype(),
      std::make_shared<Unflatten>(to_stream(s), ax, std::move(shape)),
      {a});
}

// Flatten the axes of `a` from `start_axis` to `end_axis` (inclusive) into a
// single axis.
//
// Negative axes count from the end. A `start_axis` below the first axis is
// clamped to 0 and an `end_axis` beyond the last axis is clamped to
// ndim - 1. A zero-dimensional input is reshaped to {1}. Throws
// std::invalid_argument when `start_axis` lies past the last axis, when
// `end_axis` lies before the first axis, or when the (clamped) start is
// greater than the end. When start and end coincide `a` is returned as is.
array flatten(
    const array& a,
    int start_axis,
    int end_axis /* = -1 */,
    StreamOrDevice s /* = {} */) {
  auto ndim = static_cast<int>(a.ndim());
  if (ndim == 0) {
    return reshape(a, {1}, s);
  }

  auto start_ax = start_axis + (start_axis < 0 ? ndim : 0);
  auto end_ax = end_axis + (end_axis < 0 ? ndim : 0);

  // Range checks must run before clamping: afterwards an out-of-range axis
  // can only show up as "end < start" and these diagnostics would never be
  // reached.
  if (start_ax >= ndim) {
    std::ostringstream msg;
    msg << "[flatten] Invalid start_axis " << start_axis << " for array with "
        << ndim << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  if (end_ax < 0) {
    std::ostringstream msg;
    msg << "[flatten] Invalid end_axis " << end_axis << " for array with "
        << ndim << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  start_ax = std::max(0, start_ax);
  end_ax = std::min(ndim - 1, end_ax);
  if (end_ax < start_ax) {
    throw std::invalid_argument(
        "[flatten] start_axis must be less than or equal to end_axis");
  }
  if (start_ax == end_ax) {
    return a;
  }
  return array(
      Flatten::output_shape(a, start_ax, end_ax),
      a.dtype(),
      std::make_shared<Flatten>(to_stream(s), start_ax, end_ax),
      {a});
}

// Flatten every axis of `a`, from the first to the last.
array flatten(const array& a, StreamOrDevice s /* = {} */) {
  const int first_axis = 0;
  // For a zero-dimensional array this evaluates to -1.
  const int last_axis = static_cast<int>(a.ndim()) - 1;
  return flatten(a, first_axis, last_axis, s);
}

// Apply a Hadamard transform along the last axis of `a`, multiplied by
// `scale_` (default 1/sqrt(N), N being the size of the last axis).
//
// The output dtype is the dtype of `a` when it is floating point and float32
// otherwise. Throws std::invalid_argument for empty arrays. When the last
// axis has size 1 (or `a` is a scalar) no primitive is needed and the input
// is just scaled.
array hadamard_transform(
    const array& a,
    std::optional<float> scale_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  if (a.size() == 0) {
    throw std::invalid_argument(
        "[hadamard_transform] Does not support empty arrays.");
  }
  // Default to an orthonormal Hadamard matrix scaled by 1/sqrt(N)
  int n = a.ndim() > 0 ? a.shape(-1) : 1;
  float scale = scale_.has_value() ? *scale_ : 1.0f / std::sqrt(n);
  auto dtype = issubdtype(a.dtype(), floating) ? a.dtype() : float32;

  // Nothing to do for a scalar
  if (n == 1) {
    if (scale == 1) {
      // Still cast so the output dtype matches every other code path; for a
      // floating-point input astype returns `a` untouched.
      return astype(a, dtype, s);
    }

    return multiply(a, array(scale, dtype), s);
  }

  return array(
      a.shape(),
      dtype,
      std::make_shared<Hadamard>(to_stream(s), scale),
      {astype(a, dtype, s)});
}

// Remove the given size-1 axes from `a`.
//
// Each entry of `axes` is normalized in place to a non-negative index. Throws
// std::invalid_argument when an axis is out of range or does not have size 1.
array squeeze_impl(
    const array& a,
    std::vector<int> axes,
    StreamOrDevice s /* = {} */) {
  const int ndim = static_cast<int>(a.ndim());
  for (auto& ax : axes) {
    const int resolved = (ax < 0) ? ax + ndim : ax;
    const bool in_range = (resolved >= 0) && (resolved < ndim);
    if (!in_range) {
      std::ostringstream err;
      err << "[squeeze] Invalid axes " << ax << " for array with " << a.ndim()
          << " dimensions.";
      throw std::invalid_argument(err.str());
    }
    if (a.shape(resolved) != 1) {
      std::ostringstream err;
      err << "[squeeze] Cannot squeeze axis " << ax << " with size "
          << a.shape(ax) << " which is not equal to 1.";
      throw std::invalid_argument(err.str());
    }
    ax = resolved;
  }

  // The output shape must be computed before `axes` is moved away.
  auto out_shape = Squeeze::output_shape(a, axes);
  auto primitive = std::make_shared<Squeeze>(to_stream(s), std::move(axes));
  return array(std::move(out_shape), a.dtype(), std::move(primitive), {a});
}

// Remove the listed size-1 axes from `a`. An empty list returns `a`
// unchanged. Axes are normalized and de-duplicated; naming the same axis
// twice throws std::invalid_argument. Remaining validation happens in
// squeeze_impl.
array squeeze(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  if (axes.empty()) {
    return a;
  }

  const int ndim = static_cast<int>(a.ndim());
  std::set<int> unique_axes;
  for (const int ax : axes) {
    const int resolved = (ax < 0) ? ax + ndim : ax;
    unique_axes.insert(resolved);
  }
  if (unique_axes.size() != axes.size()) {
    throw std::invalid_argument("[squeeze] Received duplicate axes.");
  }

  // std::set iteration gives the axes in ascending order.
  return squeeze_impl(
      a, std::vector<int>(unique_axes.begin(), unique_axes.end()), s);
}

// Remove a single size-1 axis from `a`.
array squeeze(const array& a, int axis, StreamOrDevice s /* = {} */) {
  std::vector<int> single_axis{axis};
  return squeeze_impl(a, std::move(single_axis), s);
}

// Remove every axis of `a` whose size is 1.
array squeeze(const array& a, StreamOrDevice s /* = {} */) {
  const auto& dims = a.shape();
  std::vector<int> unit_axes;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] == 1) {
      unit_axes.push_back(static_cast<int>(i));
    }
  }
  return squeeze_impl(a, std::move(unit_axes), s);
}

// Insert size-1 axes into `a` at the given positions of the OUTPUT array.
//
// The output has a.ndim() + axes.size() dimensions; each entry of `axes` is
// normalized in place against that output rank. Throws std::invalid_argument
// when an axis falls outside the output array.
array expand_dims_impl(
    const array& a,
    std::vector<int> axes,
    StreamOrDevice s /* = {} */) {
  const int out_ndim = static_cast<int>(a.ndim() + axes.size());
  for (auto& ax : axes) {
    const int new_ax = ax < 0 ? ax + out_ndim : ax;
    if (new_ax < 0 || new_ax >= out_ndim) {
      std::ostringstream msg;
      // Axes are relative to the output, so report the output rank (the
      // input rank used previously contradicted the message text).
      msg << "[expand_dims] Invalid axis " << ax << " for output array with "
          << out_ndim << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    ax = new_ax;
  }
  auto shape = ExpandDims::output_shape(a, axes);
  return array(
      std::move(shape),
      a.dtype(),
      std::make_shared<ExpandDims>(to_stream(s), std::move(axes)),
      {a});
}

// Insert a single size-1 axis into `a` at position `axis` of the output.
array expand_dims(const array& a, int axis, StreamOrDevice s /* = {} */) {
  std::vector<int> single_axis{axis};
  return expand_dims_impl(a, std::move(single_axis), s);
}

// Insert size-1 axes into `a` at the given output positions. An empty list
// returns `a` unchanged. Axes are normalized against the output rank and
// sorted; two entries naming the same output axis throw
// std::invalid_argument. Range validation happens in expand_dims_impl.
array expand_dims(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  if (axes.empty()) {
    return a;
  }

  // Duplicates are detected on the normalized axes. Identical raw values
  // normalize to identical indices, so this single check also covers
  // literal repeats in the input.
  const int out_ndim = static_cast<int>(a.ndim() + axes.size());
  std::set<int> unique_axes;
  for (const int ax : axes) {
    const int resolved = (ax < 0) ? ax + out_ndim : ax;
    unique_axes.insert(resolved);
  }
  if (unique_axes.size() != axes.size()) {
    throw std::invalid_argument("[expand_dims] Received duplicate axes.");
  }

  return expand_dims_impl(
      a, std::vector<int>(unique_axes.begin(), unique_axes.end()), s);
}

// Slice helper
namespace {

// Normalize slice parameters against `shape` and compute the sliced shape.
//
// `start` and `strides` are updated in place (see the notes below); `stop` is
// taken by value and only read. Returns a pair of
// (whether any stride is negative, output shape).
//
// NOTE(review): a stride of 0 would divide by zero in the size computation
// below; presumably the caller rejects it before calling -- confirm.
inline auto
normalize_slice(const Shape& shape, Shape& start, Shape stop, Shape& strides) {
  // - Start indices are normalized
  // - End indices are unchanged as -1 means something different
  //   pre-normalization (the end of the axis) versus post normalization (the
  //   position left of 0).
  // - Any strides corresponding to singleton dimension are set to 1

  Shape out_shape(shape.size());
  bool has_neg_strides = false;

  for (int i = 0; i < shape.size(); ++i) {
    // Following numpy docs
    //  Negative i and j are interpreted as n + i and n + j where n is
    //  the number of elements in the corresponding dimension. Negative
    //  k makes stepping go towards smaller indices

    auto n = shape[i];
    auto s = start[i];
    s = s < 0 ? s + n : s;
    auto e = stop[i];
    e = e < 0 ? e + n : e;

    // Note: -ve strides require start >= stop
    if (strides[i] < 0) {
      has_neg_strides = true;

      // Clamp to bounds
      // Start may not exceed the last valid index; end may not go below -1
      // (one position left of index 0).
      auto st = std::min(s, n - 1);
      auto ed = e > -1 ? e : -1;

      start[i] = st;
      // An end beyond the start yields an empty range.
      ed = ed > st ? st : ed;

      // Ceiling division of the covered span by the stride magnitude.
      auto str = -strides[i];
      out_shape[i] = (start[i] - ed + str - 1) / str;

    } else {
      // Clamp to bounds
      // Both start and end are clamped into [0, n].
      auto st = std::max(static_cast<ShapeElem>(0), std::min(s, n));
      auto ed = std::max(static_cast<ShapeElem>(0), std::min(e, n));

      start[i] = st;
      // An end before the start yields an empty range.
      ed = ed < st ? st : ed;

      // Ceiling division of the covered span by the stride.
      out_shape[i] = (ed - start[i] + strides[i] - 1) / strides[i];
    }
    // Simplify the stride if it's unused
    if (out_shape[i] == 1) {
      strides[i] = 1;
    }
  }

  return std::make_pair(has_neg_strides, out_shape);
}

// Validate the arguments of a dynamic slice on `a` and rewrite `axes` in place
// so that every entry is non-negative.
//
// Throws std::invalid_argument (message prefixed with `prefix`) when:
// - `start` has more elements than `a` has dimensions,
// - `start` has more than one dimension,
// - the number of start indices differs from the number of axes,
// - `start` is not of an integer type,
// - an axis is out of range for `a`, or
// - an axis is repeated.
void normalize_dynamic_slice_inputs(
    const array& a,
    const array& start,
    std::vector<int>& axes,
    std::string_view prefix) {
  const auto ndim = a.ndim();

  if (start.size() > ndim) {
    std::ostringstream err;
    err << prefix << " Invalid number of starting positions for "
        << "array with dimension " << ndim << ".";
    throw std::invalid_argument(err.str());
  }

  if (start.ndim() > 1) {
    std::ostringstream err;
    err << prefix << " array of starting indices "
        << "must be zero or one dimensional but has dimension " << start.ndim()
        << ".";
    throw std::invalid_argument(err.str());
  }

  if (start.size() != axes.size()) {
    std::ostringstream err;
    err << prefix << " Number of starting indices " << start.size()
        << " does not match number of axes " << axes.size() << ".";
    throw std::invalid_argument(err.str());
  }

  if (!issubdtype(start.dtype(), integer)) {
    std::ostringstream err;
    err << prefix << " Start indices must be integers, got type "
        << start.dtype() << ".";
    throw std::invalid_argument(err.str());
  }

  // Wrap negative axes and reject anything outside [0, ndim)
  for (size_t k = 0; k < axes.size(); ++k) {
    const auto wrapped = axes[k] < 0 ? axes[k] + ndim : axes[k];
    if (wrapped < 0 || wrapped >= ndim) {
      std::ostringstream err;
      err << prefix << " Invalid axis " << axes[k]
          << " for array with dimension " << ndim << ".";
      throw std::invalid_argument(err.str());
    }
    axes[k] = wrapped;
  }

  // After wrapping, every axis may appear at most once
  const std::set<int> distinct_axes(axes.begin(), axes.end());
  if (distinct_axes.size() != axes.size()) {
    std::ostringstream err;
    err << prefix << " Repeat axes not allowed.";
    throw std::invalid_argument(err.str());
  }
}

} // namespace

/**
 * Slice `a` with per-dimension `start`, `stop` and `strides`.
 *
 * Each of the three index vectors must have exactly `a.ndim()` entries,
 * otherwise std::invalid_argument is thrown. When no stride is negative and
 * the resulting shape equals the shape of `a`, `a` itself is returned.
 */
array slice(
    const array& a,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s /* = {} */) {
  const auto ndim = a.ndim();
  const bool sizes_ok =
      start.size() == ndim && stop.size() == ndim && strides.size() == ndim;
  if (!sizes_ok) {
    std::ostringstream err;
    err << "[slice] Invalid number of indices or strides for "
        << "array with dimension " << ndim << ".";
    throw std::invalid_argument(err.str());
  }

  // Rewrites `start` and `strides` in place
  auto normalized = normalize_slice(a.shape(), start, stop, strides);
  const bool reversed = normalized.first;
  auto& result_shape = normalized.second;

  // The slice covers the whole array in its original order
  if (result_shape == a.shape() && !reversed) {
    return a;
  }

  auto primitive = std::make_shared<Slice>(
      to_stream(s), std::move(start), std::move(stop), std::move(strides));
  return array(
      std::move(result_shape), a.dtype(), std::move(primitive), {a});
}

/** Slice `a` between `start` and `stop` with a stride of 1 in every dimension. */
array slice(
    const array& a,
    Shape start,
    Shape stop,
    StreamOrDevice s /* = {} */) {
  Shape unit_strides(a.ndim(), 1);
  return slice(
      a,
      std::move(start),
      std::move(stop),
      std::move(unit_strides),
      to_stream(s));
}

/**
 * Slice `a` at a position given by the array `start` along `axes`, producing
 * an output of shape `slice_size`.
 *
 * Throws std::invalid_argument if the start/axes inputs are invalid, if
 * `slice_size` does not have one entry per dimension of `a`, or if any entry
 * of `slice_size` exceeds the corresponding dimension of `a`.
 */
array slice(
    const array& a,
    const array& start,
    std::vector<int> axes,
    Shape slice_size,
    StreamOrDevice s /* = {} */) {
  normalize_dynamic_slice_inputs(a, start, axes, "[slice]");

  // Check the slice_size
  const auto ndim = a.ndim();
  if (slice_size.size() != ndim) {
    std::ostringstream err;
    err << "[slice] Invalid slice size for array with " << ndim
        << " dimensions.";
    throw std::invalid_argument(err.str());
  }
  for (int dim = 0; dim < ndim; ++dim) {
    if (slice_size[dim] <= a.shape(dim)) {
      continue;
    }
    std::ostringstream err;
    err << "[slice] Invalid slice size " << slice_size
        << " for array with shape " << a.shape() << ".";
    throw std::invalid_argument(err.str());
  }

  // Copy the shape first since `slice_size` is moved into the primitive
  Shape result_shape = slice_size;
  auto primitive = std::make_shared<DynamicSlice>(
      to_stream(s), std::move(axes), std::move(slice_size));
  return array(
      std::move(result_shape), a.dtype(), std::move(primitive), {a, start});
}

/**
 * Return `src` with the strided slice [start, stop) replaced by `update`.
 *
 * `update` is cast to the dtype of `src` and broadcast to the shape of the
 * slice. Throws std::invalid_argument if `start`, `stop` or `strides` do not
 * have exactly `src.ndim()` entries. When no stride is negative and the slice
 * has the shape of `src`, the cast and broadcast update is returned directly.
 */
array slice_update(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s /* = {} */) {
  // Check dimensions
  const auto ndim = src.ndim();
  const bool sizes_ok =
      start.size() == ndim && stop.size() == ndim && strides.size() == ndim;
  if (!sizes_ok) {
    std::ostringstream err;
    err << "[slice_update] Invalid number of indices or strides for "
        << "array with dimension " << ndim << ".";
    throw std::invalid_argument(err.str());
  }

  // Process slice dimensions (rewrites `start` and `strides` in place)
  auto normalized = normalize_slice(src.shape(), start, stop, strides);
  const bool reversed = normalized.first;
  const auto& slice_shape = normalized.second;

  // Cast update to src type and broadcast update shape to slice shape
  auto casted = astype(update, src.dtype(), s);
  auto upd = broadcast_to(casted, slice_shape, s);

  // If the entire src is the slice, just return the update
  if (slice_shape == src.shape() && !reversed) {
    return upd;
  }

  auto primitive = std::make_shared<SliceUpdate>(
      to_stream(s),
      SliceUpdate::None,
      std::move(start),
      std::move(stop),
      std::move(strides));
  return array(src.shape(), src.dtype(), std::move(primitive), {src, upd});
}

/** Update the slice [start, stop) of `src` using a stride of 1 in each dimension */
array slice_update(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s /* = {} */) {
  Shape unit_strides(src.ndim(), 1);
  return slice_update(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(unit_strides),
      s);
}

/**
 * Update a slice of `src` whose position is given by the array `start`
 * along `axes`.
 *
 * `update` is cast to the dtype of `src`. If it has fewer dimensions than
 * `src`, its shape is extended on the left with the leading dimensions of
 * `src` (using 1 for any of those dimensions listed in `axes`) and it is
 * broadcast to that shape.
 *
 * Throws std::invalid_argument if the start/axes inputs are invalid or if
 * `update` has more dimensions than `src`.
 */
array slice_update(
    const array& src,
    const array& update,
    const array& start,
    std::vector<int> axes,
    StreamOrDevice s /* = {} */) {
  normalize_dynamic_slice_inputs(src, start, axes, "[slice_update]");

  // The dimension counts are unsigned, so compare before subtracting: a
  // wrapped difference would otherwise be used as an iterator offset below.
  if (update.ndim() > src.ndim()) {
    std::ostringstream msg;
    msg << "[slice_update] Update with " << update.ndim()
        << " dimensions cannot be used to update an array with "
        << src.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // Broadcast update with unspecified axes
  const auto src_ndim = static_cast<int>(src.ndim());
  const auto dim_diff = src_ndim - static_cast<int>(update.ndim());
  auto up_shape = update.shape();
  up_shape.insert(
      up_shape.begin(), src.shape().begin(), src.shape().begin() + dim_diff);
  // The dimensions provided by the update may not exceed those of the source
  for (int d = dim_diff; d < src_ndim; ++d) {
    up_shape[d] = std::min(up_shape[d], src.shape(d));
  }
  // Prepended dimensions that are sliced dynamically have extent 1
  for (auto ax : axes) {
    if (ax < dim_diff) {
      up_shape[ax] = 1;
    }
  }
  auto upd = broadcast_to(astype(update, src.dtype(), s), up_shape, s);
  return array(
      src.shape(),
      src.dtype(),
      std::make_shared<DynamicSliceUpdate>(to_stream(s), std::move(axes)),
      {src, upd, start});
}

/**
 * Combine `update` into the strided slice [start, stop) of `src` using the
 * reduction `mode`.
 *
 * `update` is cast to the dtype of `src` and broadcast to the slice shape.
 * Throws std::invalid_argument if `start`, `stop` or `strides` do not have
 * exactly `src.ndim()` entries. When no stride is negative and the slice has
 * the shape of `src`, the result is computed with the matching element-wise
 * op instead of a SliceUpdate primitive.
 */
array slice_update(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    SliceUpdate::ReduceType mode,
    StreamOrDevice s) {
  const auto ndim = src.ndim();
  const bool sizes_ok =
      start.size() == ndim && stop.size() == ndim && strides.size() == ndim;
  if (!sizes_ok) {
    std::ostringstream err;
    err << "[slice_update] Invalid number of indices or strides for "
        << "array with dimension " << ndim << ".";
    throw std::invalid_argument(err.str());
  }

  // Rewrites `start` and `strides` in place
  auto normalized = normalize_slice(src.shape(), start, stop, strides);
  const bool reversed = normalized.first;
  const auto& slice_shape = normalized.second;

  auto casted = astype(update, src.dtype(), s);
  auto upd = broadcast_to(casted, slice_shape, s);

  // The slice is the whole array: use the equivalent element-wise op
  const bool covers_src = !reversed && slice_shape == src.shape();
  if (covers_src) {
    switch (mode) {
      case SliceUpdate::None:
        return upd;
      case SliceUpdate::Sum:
        return add(src, upd, s);
      case SliceUpdate::Prod:
        return multiply(src, upd, s);
      case SliceUpdate::Max:
        return maximum(src, upd, s);
      case SliceUpdate::Min:
        return minimum(src, upd, s);
    }
  }

  auto primitive = std::make_shared<SliceUpdate>(
      to_stream(s),
      mode,
      std::move(start),
      std::move(stop),
      std::move(strides));
  return array(src.shape(), src.dtype(), std::move(primitive), {src, upd});
}

/** Add `update` into the strided slice [start, stop) of `src`. */
array slice_update_add(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s /*= {}*/) {
  const auto reduce_mode = SliceUpdate::Sum;
  return slice_update(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(strides),
      reduce_mode,
      s);
}

/** Add `update` into the slice [start, stop) of `src` with stride 1 in each dimension. */
array slice_update_add(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s /*= {}*/) {
  Shape unit_strides(src.ndim(), 1);
  return slice_update_add(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(unit_strides),
      s);
}

/** Multiply the strided slice [start, stop) of `src` by `update`. */
array slice_update_prod(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s /*= {}*/) {
  const auto reduce_mode = SliceUpdate::Prod;
  return slice_update(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(strides),
      reduce_mode,
      s);
}

/** Multiply the slice [start, stop) of `src` by `update` with stride 1 in each dimension. */
array slice_update_prod(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s /*= {}*/) {
  Shape unit_strides(src.ndim(), 1);
  return slice_update_prod(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(unit_strides),
      s);
}

/** Take the maximum of the strided slice [start, stop) of `src` and `update`. */
array slice_update_max(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s /*= {}*/) {
  const auto reduce_mode = SliceUpdate::Max;
  return slice_update(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(strides),
      reduce_mode,
      s);
}

/** Take the maximum of the slice [start, stop) of `src` and `update` with stride 1 in each dimension. */
array slice_update_max(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s /*= {}*/) {
  Shape unit_strides(src.ndim(), 1);
  return slice_update_max(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(unit_strides),
      s);
}

/** Take the minimum of the strided slice [start, stop) of `src` and `update`. */
array slice_update_min(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s /*= {}*/) {
  const auto reduce_mode = SliceUpdate::Min;
  return slice_update(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(strides),
      reduce_mode,
      s);
}

/** Take the minimum of the slice [start, stop) of `src` and `update` with stride 1 in each dimension. */
array slice_update_min(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s /*= {}*/) {
  Shape unit_strides(src.ndim(), 1);
  return slice_update_min(
      src,
      update,
      std::move(start),
      std::move(stop),
      std::move(unit_strides),
      s);
}

std::vector<array> split(
    const array& a,
    const Shape& indices,
    int axis,
    StreamOrDevice s /* = {} */) {
  auto ax = axis < 0 ? axis + a.ndim() : axis;
  if (ax < 0 || ax >= a.ndim()) {
    std::ostringstream msg;
    msg << "Invalid axis (" << axis << ") passed to split"
        << " for array with shape " << a.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  if (indices.empty()) {
    return {a};
  }

  if (indices.size() < 10 &&
      std::is_sorted(indices.begin(), indices.end(), std::less<>{}) &&
      indices[0] > 0 && indices.back() < a.shape(ax)) {
    std::vector<Dtype> dtypes(indices.size() + 1, a.dtype());
    std::vector<Shape> shapes(indices.size() + 1, a.shape());
    shapes[0][ax] = indices[0];
    for (int i = 1; i < indices.size(); i++) {
      shapes[i][ax] = indices[i] - indices[i - 1];
    }
    shapes.back()[ax] = a.shape(ax) - indices.back();

    return array::make_arrays(
        std::move(shapes),
        dtypes,
        std::make_shared<Split>(to_stream(s), indices, ax),
        {a});
  }

  std::vector<array> res;
  auto start_indices = Shape(a.ndim(), 0);
  auto stop_indices = a.shape();
  for (int i = 0; i < indices.size() + 1; ++i) {
    stop_indices[ax] = i < indices.size() ? indices[i] : a.shape(ax);
    res.push_back(slice(a, start_indices, stop_indices, to_stream(s)));
    start_indices[ax] = stop_indices[ax];
  }
  return res;
}

/** Split `a` along its first axis at the positions given by `indices`. */
std::vector<array>
split(const array& a, const Shape& indices, StreamOrDevice s /* = {} */) {
  const int first_axis = 0;
  return split(a, indices, first_axis, s);
}

std::vector<array>
split(const array& a, int num_splits, int axis, StreamOrDevice s /* = {} */) {
  auto ax = axis < 0 ? axis + a.ndim() : axis;
  if (ax < 0 || ax >= a.ndim()) {
    std::ostringstream msg;
    msg << "Invalid axis " << axis << " passed to split"
        << " for array with shape " << a.shape() << ".";
    throw std::invalid_argument(msg.str());
  }
  if (num_splits <= 0) {
    std::ostringstream msg;
    msg << "[split] num_splits must be positive and non-zero but got "
        << num_splits << ".";
    throw std::invalid_argument(msg.str());
  }
  auto q_and_r = std::ldiv(a.shape(axis), num_splits);
  if (q_and_r.rem) {
    std::ostringstream msg;
    msg << "Array split does not result in sub arrays with equal size:"
        << " attempting " << num_splits << " splits along axis " << axis
        << " for shape " << a.shape() << ".";
    throw std::invalid_argument(msg.str());
  }
  auto split_size = q_and_r.quot;
  Shape indices(num_splits - 1);
  for (int i = 0; i < indices.size(); ++i) {
    indices[i] = (i + 1) * split_size;
  }
  return split(a, indices, axis, s);
}

/** Split `a` along its first axis into `num_splits` pieces of equal size. */
std::vector<array>
split(const array& a, int num_splits, StreamOrDevice s /* = {} */) {
  const int first_axis = 0;
  return split(a, num_splits, first_axis, to_stream(s));
}

std::vector<array> meshgrid(
    const std::vector<array>& arrays,
    bool sparse /* = false */,
    const std::string& indexing /* = "xy" */,
    StreamOrDevice s /* = {} */) {
  if (indexing != "xy" && indexing != "ij") {
    throw std::invalid_argument(
        "[meshgrid] Invalid indexing value. Valid values are 'xy' and 'ij'.");
  }

  auto ndim = arrays.size();
  std::vector<array> outputs;
  for (int i = 0; i < ndim; ++i) {
    Shape shape(ndim, 1);
    shape[i] = -1;
    outputs.push_back(reshape(arrays[i], std::move(shape), s));
  }

  if (indexing == "xy" && ndim > 1) {
    Shape shape(ndim, 1);

    shape[1] = arrays[0].size();
    outputs[0] = reshape(arrays[0], shape, s);
    shape[1] = 1;
    shape[0] = arrays[1].size();
    outputs[1] = reshape(arrays[1], std::move(shape), s);
  }

  if (!sparse) {
    outputs = broadcast_arrays(outputs, s);
  }

  return outputs;
}

/**
 * Clip the values of `a` to lie at or above `a_min` and at or below `a_max`.
 *
 * Either bound may be omitted, but not both; std::invalid_argument is thrown
 * when neither is provided. The lower bound is applied before the upper one.
 */
array clip(
    const array& a,
    const std::optional<array>& a_min,
    const std::optional<array>& a_max,
    StreamOrDevice s /* = {} */) {
  if (!(a_min.has_value() || a_max.has_value())) {
    throw std::invalid_argument("At most one of a_min and a_max may be None");
  }

  array clipped = a;
  if (a_min) {
    clipped = maximum(clipped, *a_min, s);
  }
  if (a_max) {
    clipped = minimum(clipped, *a_max, s);
  }
  return clipped;
}

/**
 * Concatenate `arrays` along `axis`.
 *
 * A single input is returned unchanged. Otherwise all inputs must have the
 * same number of dimensions and identical sizes on every axis other than the
 * concatenation axis; they are cast to their common result type.
 *
 * Throws std::invalid_argument if no arrays are given or if the shapes are
 * incompatible.
 */
array concatenate(
    std::vector<array> arrays,
    int axis,
    StreamOrDevice s /* = {} */) {
  if (arrays.empty()) {
    throw std::invalid_argument(
        "[concatenate] No arrays provided for concatenation");
  }
  if (arrays.size() == 1) {
    return arrays[0];
  }

  auto ax = normalize_axis_index(axis, arrays[0].ndim(), "[concatenate] ");

  // Start from the first array's shape and accumulate the size along `ax`
  auto out_shape = arrays[0].shape();
  out_shape[ax] = 0;

  for (const auto& arr : arrays) {
    if (arr.ndim() != out_shape.size()) {
      std::ostringstream err;
      err << "[concatenate] All the input arrays must have the same number of "
          << "dimensions. However, got arrays with dimensions "
          << out_shape.size() << " and " << arr.ndim() << ".";
      throw std::invalid_argument(err.str());
    }

    // Every axis other than the concatenation axis has to match exactly
    for (int d = 0; d < arr.ndim(); d++) {
      if (d == ax || arr.shape(d) == out_shape[d]) {
        continue;
      }
      std::ostringstream err;
      err << "[concatenate] All the input array dimensions must match exactly "
          << "except for the concatenation axis. However, the provided shapes are ";
      for (auto& other : arrays) {
        err << other.shape() << ", ";
      }
      err << "and the concatenation axis is " << axis << ".";
      throw std::invalid_argument(err.str());
    }

    out_shape[ax] += arr.shape(ax);
  }

  // Promote all the arrays to the same type
  const auto out_dtype = result_type(arrays);
  for (size_t k = 0; k < arrays.size(); ++k) {
    arrays[k] = astype(arrays[k], out_dtype, s);
  }

  auto primitive = std::make_shared<Concatenate>(to_stream(s), ax);
  return array(
      std::move(out_shape),
      out_dtype,
      std::move(primitive),
      std::move(arrays));
}

/** Flatten every input and concatenate the results along axis 0. */
array concatenate(std::vector<array> arrays, StreamOrDevice s /* = {} */) {
  for (size_t k = 0; k < arrays.size(); ++k) {
    arrays[k] = flatten(arrays[k], s);
  }
  const int first_axis = 0;
  return concatenate(std::move(arrays), first_axis, s);
}

/**
 * Stack arrays along a new axis.
 *
 * All inputs must share the same shape. Each one gets a new dimension at
 * `axis` and the expanded arrays are concatenated along it.
 *
 * Throws std::invalid_argument if `arrays` is empty or the shapes differ.
 */
array stack(
    const std::vector<array>& arrays,
    int axis,
    StreamOrDevice s /* = {} */) {
  if (arrays.empty()) {
    throw std::invalid_argument("[stack] No arrays provided for stacking");
  }

  const auto& reference_shape = arrays[0].shape();
  for (const auto& arr : arrays) {
    if (!(reference_shape == arr.shape())) {
      throw std::invalid_argument(
          "[stack] All arrays must have the same shape");
    }
  }

  // The new axis is validated against the rank of the stacked result
  auto new_axis = normalize_axis_index(axis, arrays[0].ndim() + 1, "[stack] ");

  std::vector<array> expanded;
  expanded.reserve(arrays.size());
  for (const auto& arr : arrays) {
    expanded.emplace_back(expand_dims(arr, new_axis, s));
  }
  return concatenate(expanded, axis, s);
}

/** Stack arrays along a new leading axis. */
array stack(const std::vector<array>& arrays, StreamOrDevice s /* = {} */) {
  const int leading_axis = 0;
  return stack(arrays, leading_axis, s);
}

/**
 * Repeat every element of `arr` `repeats` times along `axis`.
 *
 * Throws std::invalid_argument if `repeats` is negative. Zero repeats yields
 * an empty array of the same dtype and a single repeat returns `arr` itself.
 */
array repeat(const array& arr, int repeats, int axis, StreamOrDevice s) {
  axis = normalize_axis_index(axis, arr.ndim(), "[repeat] ");

  if (repeats < 0) {
    throw std::invalid_argument(
        "[repeat] Number of repeats cannot be negative");
  }

  switch (repeats) {
    case 0:
      return array({}, arr.dtype());
    case 1:
      return arr;
    default:
      break;
  }

  // Broadcast to (S_1, S_2, ..., S_axis, repeats, S_axis+1, ...)
  auto broadcast_shape = arr.shape();
  broadcast_shape.insert(broadcast_shape.begin() + axis + 1, repeats);
  array expanded = expand_dims(arr, axis + 1, s);
  array repeated = broadcast_to(expanded, broadcast_shape, s);

  // Reshape back into a contiguous array where S_axis is now S_axis * repeats
  auto merged_shape = arr.shape();
  merged_shape[axis] *= repeats;
  return reshape(repeated, merged_shape, s);
}

/** Flatten `arr` and repeat each of its elements `repeats` times. */
array repeat(const array& arr, int repeats, StreamOrDevice s) {
  auto flat = flatten(arr, s);
  return repeat(flat, repeats, 0, s);
}

/**
 * Construct an array by repeating `arr` the number of times given by `reps`.
 *
 * Whichever of `reps` and the shape of `arr` is shorter is left-padded with
 * ones. The result is built by reshaping, broadcasting a new axis in front of
 * every dimension that is repeated, and reshaping to the final shape.
 */
array tile(
    const array& arr,
    std::vector<int> reps,
    StreamOrDevice s /* = {} */) {
  auto in_shape = arr.shape();

  // Align the lengths of `reps` and the shape by prepending ones
  if (reps.size() < in_shape.size()) {
    reps.insert(reps.begin(), in_shape.size() - reps.size(), 1);
  }
  if (reps.size() > in_shape.size()) {
    in_shape.insert(in_shape.begin(), reps.size() - in_shape.size(), 1);
  }

  Shape with_unit_axes;
  Shape with_rep_axes;
  Shape tiled_shape;
  for (size_t d = 0; d < in_shape.size(); d++) {
    const auto rep = reps[d];
    const auto extent = in_shape[d];
    // Only dimensions that are actually repeated get an extra axis
    if (rep != 1) {
      with_unit_axes.push_back(1);
      with_rep_axes.push_back(rep);
    }
    with_unit_axes.push_back(extent);
    with_rep_axes.push_back(extent);
    tiled_shape.push_back(rep * extent);
  }

  auto expanded = reshape(arr, std::move(with_unit_axes), s);
  auto repeated = broadcast_to(expanded, std::move(with_rep_axes), s);
  return reshape(repeated, std::move(tiled_shape), s);
}

/**
 * Pad `a` by replicating its edge values.
 *
 * `low_pad_size[i]` and `high_pad_size[i]` give the padding before and after
 * the dimension `axes[i]` (negative axes count from the end). Dimensions not
 * listed in `axes` are not padded. `out_shape` is the shape of the padded
 * result.
 *
 * The axes are assumed to have been validated by the caller.
 */
array edge_pad(
    const array& a,
    const std::vector<int>& axes,
    const Shape& low_pad_size,
    const Shape& high_pad_size,
    const Shape& out_shape,
    StreamOrDevice s /* = {}*/) {
  // The pad sizes are parallel to `axes`, not to the dimensions of `a`.
  // Expand them to one entry per dimension so they can be indexed by axis.
  const auto ndim = static_cast<int>(a.ndim());
  Shape lows(ndim, 0);
  Shape highs(ndim, 0);
  for (size_t i = 0; i < axes.size(); ++i) {
    const auto ax = axes[i] < 0 ? axes[i] + ndim : axes[i];
    lows[ax] += low_pad_size[i];
    highs[ax] += high_pad_size[i];
  }

  array out = zeros(out_shape, a.dtype(), s);
  auto interior_stops = a.shape();
  for (int i = 0; i < ndim; i++) {
    interior_stops[i] += lows[i];
  }
  // Copy over values from the unpadded array
  array padded = slice_update(out, a, lows, interior_stops, s);

  for (int axis = 0; axis < ndim; axis++) {
    if (lows[axis] > 0) {
      Shape starts(ndim, 0);
      starts[axis] = lows[axis];
      auto stops = out.shape();
      stops[axis] = lows[axis] + 1;
      // Fetch edge values
      array edge_value = slice(padded, starts, stops, s);

      starts[axis] = 0;
      stops[axis] = lows[axis];
      // Update edge values in the padded array
      padded = slice_update(padded, edge_value, starts, stops, s);
    }

    if (highs[axis] > 0) {
      Shape starts(ndim, 0);
      // Negative indices address the last unpadded element from the end
      starts[axis] = -highs[axis] - 1;
      auto stops = out.shape();
      stops[axis] = -highs[axis];
      array edge_value = slice(padded, starts, stops, s);

      starts[axis] = -highs[axis];
      stops[axis] = out.shape(axis);
      padded = slice_update(padded, edge_value, starts, stops, s);
    }
  }
  return padded;
}

/**
 * Pad an array along the given axes.
 *
 * `low_pad_size[i]` and `high_pad_size[i]` are the number of elements added
 * before and after the dimension `axes[i]`. In "constant" mode the padding is
 * filled with `pad_value` (cast to the dtype of `a`); in "edge" mode the edge
 * values of `a` are replicated.
 *
 * Throws std::invalid_argument if the number of pad sizes does not match the
 * number of axes, if a pad size is negative, or if `mode` is not recognized.
 * Each axis is validated with `normalize_axis_index`, so an out-of-range axis
 * is reported by that helper instead of indexing past the end of the shape.
 */
array pad(
    const array& a,
    const std::vector<int>& axes,
    const Shape& low_pad_size,
    const Shape& high_pad_size,
    const array& pad_value /*= array(0)*/,
    const std::string& mode /*= "constant"*/,
    StreamOrDevice s /* = {}*/) {
  if (axes.size() != low_pad_size.size() ||
      axes.size() != high_pad_size.size()) {
    std::ostringstream msg;
    msg << "Invalid number of padding sizes passed to pad "
        << "with axes of size " << axes.size();
    throw std::invalid_argument(msg.str());
  }

  auto out_shape = a.shape();

  for (int i = 0; i < axes.size(); i++) {
    if (low_pad_size[i] < 0) {
      std::ostringstream msg;
      msg << "Invalid low padding size (" << low_pad_size[i]
          << ") passed to pad for axis " << i
          << ". Padding sizes must be non-negative";
      throw std::invalid_argument(msg.str());
    }
    if (high_pad_size[i] < 0) {
      std::ostringstream msg;
      msg << "Invalid high padding size (" << high_pad_size[i]
          << ") passed to pad for axis " << i
          << ". Padding sizes must be non-negative";
      throw std::invalid_argument(msg.str());
    }

    // Reject out-of-range axes before they are used to index the shape
    auto ax = normalize_axis_index(axes[i], a.ndim(), "[pad] ");
    out_shape[ax] += low_pad_size[i] + high_pad_size[i];
  }

  if (mode == "constant") {
    return array(
        std::move(out_shape),
        a.dtype(),
        std::make_shared<Pad>(to_stream(s), axes, low_pad_size, high_pad_size),
        {a, astype(pad_value, a.dtype(), s)});
  } else if (mode == "edge") {
    return edge_pad(a, axes, low_pad_size, high_pad_size, out_shape, s);
  } else {
    std::ostringstream msg;
    msg << "Invalid padding mode (" << mode << ") passed to pad";
    throw std::invalid_argument(msg.str());
  }
}

/**
 * Pad an array along all axes, where `pad_width[i]` holds the (low, high)
 * padding of dimension `i`.
 */
array pad(
    const array& a,
    const std::vector<std::pair<int, int>>& pad_width,
    const array& pad_value /*= array(0)*/,
    const std::string& mode /*= "constant"*/,
    StreamOrDevice s /*= {}*/) {
  // Every dimension of `a` is padded: axes are 0, 1, ..., ndim - 1
  std::vector<int> all_axes(a.ndim(), 0);
  std::iota(all_axes.begin(), all_axes.end(), 0);

  // Split the (low, high) pairs into two parallel lists
  Shape low_sizes;
  Shape high_sizes;
  for (const auto& [low, high] : pad_width) {
    low_sizes.push_back(low);
    high_sizes.push_back(high);
  }

  return pad(a, all_axes, low_sizes, high_sizes, pad_value, mode, s);
}

/** Pad every dimension of `a` with the same (low, high) `pad_width`. */
array pad(
    const array& a,
    const std::pair<int, int>& pad_width,
    const array& pad_value /*= array(0)*/,
    const std::string& mode /*= "constant"*/,
    StreamOrDevice s /*= {}*/) {
  const std::vector<std::pair<int, int>> per_dim_widths(a.ndim(), pad_width);
  return pad(a, per_dim_widths, pad_value, mode, s);
}

/** Pad every dimension of `a` with `pad_width` elements on both sides. */
array pad(
    const array& a,
    int pad_width,
    const array& pad_value /*= array(0)*/,
    const std::string& mode /*= "constant"*/,
    StreamOrDevice s /*= {}*/) {
  const std::pair<int, int> symmetric{pad_width, pad_width};
  const std::vector<std::pair<int, int>> per_dim_widths(a.ndim(), symmetric);
  return pad(a, per_dim_widths, pad_value, mode, s);
}

/**
 * Move the axis `source` of `a` to position `destination`, keeping the other
 * axes in their original relative order.
 *
 * Negative axes count from the end. Throws std::out_of_range if either axis
 * is invalid. If both refer to the same axis, `a` is returned unchanged.
 */
array moveaxis(
    const array& a,
    int source,
    int destination,
    StreamOrDevice s /* = {} */) {
  const auto ndim = static_cast<int>(a.ndim());

  // Validate an axis and wrap it into [0, ndim)
  auto resolve = [ndim](int ax) {
    if (ax < -ndim || ax >= ndim) {
      std::ostringstream err;
      err << "[moveaxis] Invalid axis " << ax << " for array with " << ndim
          << " dimensions.";
      throw std::out_of_range(err.str());
    }
    return ax < 0 ? ax + ndim : ax;
  };

  source = resolve(source);
  destination = resolve(destination);
  if (source == destination) {
    return a;
  }

  // Collect every axis but `source`, then put `source` at its new position
  std::vector<int> order;
  order.reserve(ndim);
  for (int ax = 0; ax < ndim; ++ax) {
    if (ax != source) {
      order.push_back(ax);
    }
  }
  order.insert(order.begin() + destination, source);
  return transpose(a, order, s);
}

/**
 * Exchange the axes `axis1` and `axis2` of `a`.
 *
 * Negative axes count from the end. Throws std::out_of_range if either axis
 * is invalid.
 */
array swapaxes(
    const array& a,
    int axis1,
    int axis2,
    StreamOrDevice s /* = {} */) {
  const auto ndim = static_cast<int>(a.ndim());

  // Validate an axis and wrap it into [0, ndim)
  auto resolve = [ndim](int ax) {
    if (ax < -ndim || ax >= ndim) {
      std::ostringstream err;
      err << "[swapaxes] Invalid axis " << ax << " for array with " << ndim
          << " dimensions.";
      throw std::out_of_range(err.str());
    }
    return ax < 0 ? ax + ndim : ax;
  };

  axis1 = resolve(axis1);
  axis2 = resolve(axis2);

  // Identity permutation with the two requested entries exchanged
  std::vector<int> order(a.ndim());
  std::iota(order.begin(), order.end(), 0);
  order[axis1] = axis2;
  order[axis2] = axis1;
  return transpose(a, std::move(order), s);
}

/**
 * Permute the dimensions of `a` so that output dimension `i` is input
 * dimension `axes[i]`.
 *
 * Negative axes count from the end. Throws std::invalid_argument if the
 * number of axes differs from `a.ndim()`, if an axis is out of range, or if
 * an axis is repeated.
 */
array transpose(
    const array& a,
    std::vector<int> axes,
    StreamOrDevice s /* = {} */) {
  for (auto& ax : axes) {
    ax = ax < 0 ? ax + a.ndim() : ax;
  }
  if (axes.size() != a.ndim()) {
    std::ostringstream msg;
    msg << "[transpose] Received " << axes.size() << " axes for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // Check in bounds and for duplicates
  // `shape` doubles as a "seen" marker here: a non-zero entry means the axis
  // was already used. It is overwritten with the real sizes below.
  Shape shape(axes.size(), 0);
  for (auto& ax : axes) {
    if (ax < 0 || ax >= a.ndim()) {
      std::ostringstream msg;
      msg << "[transpose] Invalid axis (" << ax << ") for array with "
          << a.ndim() << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    if (shape[ax] != 0) {
      throw std::invalid_argument("[transpose] Repeat axes not allowed.");
    }
    shape[ax] = 1;
  }

  for (int i = 0; i < axes.size(); ++i) {
    shape[i] = a.shape()[axes[i]];
  }
  return array(
      std::move(shape),
      a.dtype(),
      std::make_shared<Transpose>(to_stream(s), std::move(axes)),
      {a});
}

/** Reverse the order of the dimensions of `a`. */
array transpose(const array& a, StreamOrDevice s /* = {} */) {
  const auto ndim = static_cast<int>(a.ndim());
  std::vector<int> reversed(a.ndim());
  for (int i = 0; i < ndim; ++i) {
    reversed[i] = ndim - 1 - i;
  }
  return transpose(a, std::move(reversed), to_stream(s));
}

/**
 * Broadcast `a` to `shape`.
 *
 * Returns `a` itself if it already has that shape. Throws
 * std::invalid_argument if broadcasting the shape of `a` against `shape`
 * does not yield `shape`.
 */
array broadcast_to(
    const array& a,
    const Shape& shape,
    StreamOrDevice s /* = {} */) {
  const auto& in_shape = a.shape();
  if (in_shape == shape) {
    return a;
  }

  // Make sure the shapes are broadcastable
  auto result_shape = broadcast_shapes(in_shape, shape);
  const bool fits = result_shape == shape;
  if (!fits) {
    std::ostringstream err;
    err << "Cannot broadcast array of shape " << in_shape << " into shape "
        << shape << ".";
    throw std::invalid_argument(err.str());
  }

  auto primitive = std::make_shared<Broadcast>(to_stream(s), shape);
  return array(
      std::move(result_shape), a.dtype(), std::move(primitive), {a});
}

/** Broadcast the input arrays against one another while ignoring the
 * axes specified in `ignore_axes`. Note, this API is internal only.
 * The `ignore_axes` should be:
 * - negative values indicating axes from the end
 * - sorted in increasing order
 *
 * Zero or one input is returned unchanged. Throws std::invalid_argument if an
 * entry of `ignore_axes` is not negative, is out of range for one of the
 * inputs, or is not strictly greater than the previous entry.
 */
std::vector<array> broadcast_arrays(
    const std::vector<array>& inputs,
    std::vector<int> ignore_axes,
    StreamOrDevice s) {
  if (inputs.size() <= 1) {
    return inputs;
  }

  std::vector<array> outputs;
  auto shape = BroadcastAxes::output_shape(inputs, ignore_axes);
  // The output shape for `in`: the broadcast shape, with the ignored axes
  // keeping the sizes they have in `in`.
  auto check_and_get_shape = [&shape, &ignore_axes](const array& in) {
    auto out_shape = shape;
    const auto ndim = static_cast<int>(in.ndim());
    for (size_t i = 0; i < ignore_axes.size(); ++i) {
      auto ax = ignore_axes[i];
      // Use signed arithmetic so that a non-negative axis (which would index
      // one past the end of `out_shape` below) is rejected as well.
      if (ax >= 0 || ax < -ndim || (i > 0 && ax <= ignore_axes[i - 1])) {
        throw std::invalid_argument(
            "[broadcast_arrays] Received invalid axes to ignore.");
      }
      out_shape[out_shape.size() + ax] = in.shape(ax);
    }
    return out_shape;
  };

  if (!detail::in_dynamic_tracing()) {
    for (auto& in : inputs) {
      auto out_shape = check_and_get_shape(in);
      if (in.shape() == out_shape) {
        outputs.push_back(in);
      } else {
        // Build the primitive before moving `out_shape`: the evaluation order
        // of function arguments is unspecified, so reading and moving the
        // shape in the same call could hand the primitive a moved-from shape.
        auto primitive = std::make_shared<Broadcast>(to_stream(s), out_shape);
        outputs.push_back(array(
            std::move(out_shape), in.dtype(), std::move(primitive), {in}));
      }
    }
    return outputs;
  }

  std::vector<array> stop_grad_inputs;
  for (auto& in : inputs) {
    stop_grad_inputs.push_back(stop_gradient(in, s));
  }

  for (int i = 0; i < inputs.size(); ++i) {
    auto& in = inputs[i];
    auto out_shape = check_and_get_shape(in);
    if (in.shape() == out_shape) {
      outputs.push_back(in);
    } else {
      // broadcasted array goes first followed by other stopgrad inputs
      std::vector<array> p_inputs = {in};
      for (int j = 0; j < inputs.size(); ++j) {
        if (j == i) {
          continue;
        }
        p_inputs.push_back(stop_grad_inputs[j]);
      }
      outputs.push_back(array(
          std::move(out_shape),
          in.dtype(),
          std::make_shared<BroadcastAxes>(to_stream(s), ignore_axes),
          std::move(p_inputs)));
    }
  }
  return outputs;
}

std::vector<array> broadcast_arrays(
    const std::vector<array>& inputs,
    StreamOrDevice s /* = {} */) {
  if (inputs.size() <= 1) {
    return inputs;
  }
  auto shape = Broadcast::output_shape(inputs);
  std::vector<array> outputs;

  if (!detail::in_dynamic_tracing()) {
    for (auto& in : inputs) {
      if (in.shape() == shape) {
        outputs.push_back(in);
      } else {
        outputs.push_back(array(
            shape,
            in.dtype(),
            std::make_shared<Broadcast>(to_stream(s), shape),
            {in}));
      }
    }
    return outputs;
  }

  std::vector<array> stop_grad_inputs;
  for (auto& in : inputs) {
    stop_grad_inputs.push_back(stop_gradient(in, s));
  }
  for (int i = 0; i < inputs.size(); ++i) {
    auto& in = inputs[i];
    if (in.shape() == shape) {
      outputs.push_back(in);
    } else {
      // broadcasted array goes first followed by other stopgrad inputs
      std::vector<array> p_inputs = {in};
      for (int j = 0; j < inputs.size(); ++j) {
        if (j == i) {
          continue;
        }
        p_inputs.push_back(stop_grad_inputs[j]);
      }
      outputs.push_back(array(
          shape,
          in.dtype(),
          std::make_shared<Broadcast>(to_stream(s), shape),
          std::move(p_inputs)));
    }
  }
  return outputs;
}

/** Broadcast `a` and `b` against each other and return them as a pair. */
std::pair<array, array>
broadcast_arrays(const array& a, const array& b, StreamOrDevice s) {
  auto both = broadcast_arrays({a, b}, s);
  return std::pair<array, array>(both[0], both[1]);
}

/**
 * Broadcast `a` and `b` against each other while ignoring `ignore_axes`,
 * and return them as a pair.
 */
std::pair<array, array> broadcast_arrays(
    const array& a,
    const array& b,
    std::vector<int> ignore_axes,
    StreamOrDevice s) {
  auto both = broadcast_arrays({a, b}, std::move(ignore_axes), s);
  return std::pair<array, array>(both[0], both[1]);
}

/**
 * Element-wise `a == b`. The inputs are cast to their promoted type and
 * broadcast against each other; the result has dtype `bool_`.
 */
array equal(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common = promote_types(a.dtype(), b.dtype());
  auto operands =
      broadcast_arrays({astype(a, common, s), astype(b, common, s)}, s);
  Shape out_shape = operands[0].shape();
  auto primitive = std::make_shared<Equal>(to_stream(s));
  return array(
      std::move(out_shape), bool_, std::move(primitive), std::move(operands));
}

/**
 * Element-wise `a != b`. The inputs are cast to their promoted type and
 * broadcast against each other; the result has dtype `bool_`.
 */
array not_equal(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common = promote_types(a.dtype(), b.dtype());
  auto operands =
      broadcast_arrays({astype(a, common, s), astype(b, common, s)}, s);
  Shape out_shape = operands[0].shape();
  auto primitive = std::make_shared<NotEqual>(to_stream(s));
  return array(
      std::move(out_shape), bool_, std::move(primitive), std::move(operands));
}

/**
 * Element-wise `a > b`. The inputs are cast to their promoted type and
 * broadcast against each other; the result has dtype `bool_`.
 */
array greater(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common = promote_types(a.dtype(), b.dtype());
  auto operands =
      broadcast_arrays({astype(a, common, s), astype(b, common, s)}, s);
  Shape out_shape = operands[0].shape();
  auto primitive = std::make_shared<Greater>(to_stream(s));
  return array(
      std::move(out_shape), bool_, std::move(primitive), std::move(operands));
}

/**
 * Element-wise `a >= b`. The inputs are cast to their promoted type and
 * broadcast against each other; the result has dtype `bool_`.
 */
array greater_equal(
    const array& a,
    const array& b,
    StreamOrDevice s /* = {} */) {
  const auto common = promote_types(a.dtype(), b.dtype());
  auto operands =
      broadcast_arrays({astype(a, common, s), astype(b, common, s)}, s);
  Shape out_shape = operands[0].shape();
  auto primitive = std::make_shared<GreaterEqual>(to_stream(s));
  return array(
      std::move(out_shape), bool_, std::move(primitive), std::move(operands));
}

/**
 * Element-wise `a < b`. The inputs are cast to their promoted type and
 * broadcast against each other; the result has dtype `bool_`.
 */
array less(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common = promote_types(a.dtype(), b.dtype());
  auto operands =
      broadcast_arrays({astype(a, common, s), astype(b, common, s)}, s);
  Shape out_shape = operands[0].shape();
  auto primitive = std::make_shared<Less>(to_stream(s));
  return array(
      std::move(out_shape), bool_, std::move(primitive), std::move(operands));
}

/**
 * Element-wise `a <= b`. The inputs are cast to their promoted type and
 * broadcast against each other; the result has dtype `bool_`.
 */
array less_equal(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common = promote_types(a.dtype(), b.dtype());
  auto operands =
      broadcast_arrays({astype(a, common, s), astype(b, common, s)}, s);
  Shape out_shape = operands[0].shape();
  auto primitive = std::make_shared<LessEqual>(to_stream(s));
  return array(
      std::move(out_shape), bool_, std::move(primitive), std::move(operands));
}

/**
 * Whole-array equality: a scalar bool_ array that is true only when `a` and
 * `b` have the same shape and all elements compare equal.
 *
 * No broadcasting is performed. `equal_nan` is only honoured when the
 * promoted dtype is inexact.
 */
array array_equal(
    const array& a,
    const array& b,
    bool equal_nan,
    StreamOrDevice s /* = {} */) {
  // Differently shaped arrays are never equal.
  if (a.shape() != b.shape()) {
    return array(false);
  }

  const auto common = promote_types(a.dtype(), b.dtype());
  equal_nan = equal_nan && issubdtype(common, inexact);

  auto elementwise = array(
      a.shape(),
      bool_,
      std::make_shared<Equal>(to_stream(s), equal_nan),
      {astype(a, common, s), astype(b, common, s)});
  return all(elementwise, false, s);
}

/**
 * Element-wise NaN test returning a bool_ array.
 *
 * Integer and bool inputs yield an all-false array; other dtypes are tested
 * by comparing the input against itself for inequality.
 */
array isnan(const array& a, StreamOrDevice s /* = {} */) {
  const bool never_nan =
      issubdtype(a.dtype(), integer) || a.dtype() == bool_;
  if (!never_nan) {
    return not_equal(a, a, s);
  }
  return full(a.shape(), false, bool_, s);
}

/**
 * Element-wise test for positive or negative infinity (bool_ result).
 *
 * Integer and bool inputs yield an all-false array.
 */
array isinf(const array& a, StreamOrDevice s /* = {} */) {
  const bool never_inf =
      issubdtype(a.dtype(), integer) || a.dtype() == bool_;
  if (!never_inf) {
    return logical_or(isposinf(a, s), isneginf(a, s), s);
  }
  return full(a.shape(), false, bool_, s);
}

/**
 * Element-wise finiteness test (bool_ result): true where the value is
 * neither infinite nor NaN.
 *
 * Integer and bool inputs yield an all-true array.
 */
array isfinite(const array& a, StreamOrDevice s /* = {} */) {
  const bool always_finite =
      issubdtype(a.dtype(), integer) || a.dtype() == bool_;
  if (!always_finite) {
    auto not_finite = logical_or(isinf(a, s), isnan(a, s), s);
    return logical_not(not_finite, s);
  }
  return full(a.shape(), true, bool_, s);
}

/**
 * Element-wise test for +infinity (bool_ result).
 *
 * Integer and bool inputs yield an all-false array; otherwise the input is
 * compared against +inf expressed in the input's dtype.
 */
array isposinf(const array& a, StreamOrDevice s /* = {} */) {
  const bool never_inf =
      issubdtype(a.dtype(), integer) || a.dtype() == bool_;
  if (!never_inf) {
    auto pos_inf = array(std::numeric_limits<float>::infinity(), a.dtype());
    return equal(a, pos_inf, s);
  }
  return full(a.shape(), false, bool_, s);
}

/**
 * Element-wise test for -infinity (bool_ result).
 *
 * Integer and bool inputs yield an all-false array; otherwise the input is
 * compared against -inf expressed in the input's dtype.
 */
array isneginf(const array& a, StreamOrDevice s /* = {} */) {
  const bool never_inf =
      issubdtype(a.dtype(), integer) || a.dtype() == bool_;
  if (!never_inf) {
    auto neg_inf = array(-std::numeric_limits<float>::infinity(), a.dtype());
    return equal(a, neg_inf, s);
  }
  return full(a.shape(), false, bool_, s);
}

/**
 * Element-wise selection between `b` and `c` driven by condition `a`.
 *
 * The condition is cast to bool_, `b` and `c` are cast to their promoted
 * dtype, and all three are broadcast together. The output has the broadcast
 * shape and the promoted dtype.
 */
array where(
    const array& a,
    const array& b,
    const array& c,
    StreamOrDevice s /* = {} */) {
  auto mask = astype(a, bool_, s);
  Dtype result_dtype = promote_types(b.dtype(), c.dtype());
  auto operands = broadcast_arrays(
      {mask, astype(b, result_dtype, s), astype(c, result_dtype, s)}, s);

  auto op = std::make_shared<Select>(to_stream(s));
  return array(operands[0].shape(), result_dtype, std::move(op), operands);
}

/**
 * Replace NaN and infinite entries of `a` with finite numbers.
 *
 * @param a       Input array; returned unchanged when its dtype is not inexact.
 * @param nan     Replacement for NaN entries.
 * @param posinf_ Replacement for +inf; when unset, the largest finite value
 *                of the dtype is used.
 * @param neginf_ Replacement for -inf; when unset, the negated largest finite
 *                value of the dtype is used.
 * @throws std::invalid_argument if a default replacement is needed for a
 *         dtype other than float32, bfloat16 or float16.
 */
array nan_to_num(
    const array& a,
    float nan /* = 0.0f */,
    const std::optional<float> posinf_ /* = std::nullopt */,
    const std::optional<float> neginf_ /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  Dtype dtype = a.dtype();
  if (!issubdtype(dtype, inexact)) {
    return a;
  }

  // Largest finite value of the supported floating dtypes.
  auto largest_finite = [](const auto& t) -> float {
    if (t == float32) {
      return std::numeric_limits<float>::max();
    }
    if (t == bfloat16) {
      return std::numeric_limits<bfloat16_t>::max();
    }
    if (t == float16) {
      return std::numeric_limits<float16_t>::max();
    }
    std::ostringstream msg;
    msg << "[nan_to_num] Does not yet support given type: " << t << ".";
    throw std::invalid_argument(msg.str());
  };

  // The defaults are only computed (and can only throw) when actually needed.
  float posinf = posinf_.has_value() ? posinf_.value() : largest_finite(dtype);
  float neginf = neginf_.has_value() ? neginf_.value() : -largest_finite(dtype);

  auto no_nan = where(isnan(a, s), array(nan, dtype), a, s);
  auto no_posinf = where(isposinf(a, s), array(posinf, dtype), no_nan, s);
  return where(isneginf(a, s), array(neginf, dtype), no_posinf, s);
}

/**
 * True (as a reduced bool_ array) when every element pair of `a` and `b` is
 * close according to isclose() with the given tolerances.
 */
array allclose(
    const array& a,
    const array& b,
    double rtol /* = 1e-5 */,
    double atol /* = 1e-8 */,
    bool equal_nan /* = false */,
    StreamOrDevice s /* = {}*/) {
  auto close_mask = isclose(a, b, rtol, atol, equal_nan, s);
  return all(close_mask, s);
}

/**
 * Element-wise closeness test: `|a - b| <= atol + rtol * |b|`.
 *
 * Infinite values are handled separately: a pair is close only when both are
 * infinite with the same sign. With `equal_nan`, two NaNs also count as
 * close.
 */
array isclose(
    const array& a,
    const array& b,
    double rtol /* = 1e-5 */,
    double atol /* = 1e-8 */,
    bool equal_nan /* = false */,
    StreamOrDevice s /* = {}*/) {
  // Plain tolerance comparison.
  auto tolerance = add(array(atol), multiply(array(rtol), abs(b, s), s), s);
  auto distance = abs(subtract(a, b, s), s);
  auto result = less_equal(distance, tolerance, s);

  // Masks describing where each operand is +inf / -inf.
  auto pos_a = isposinf(a, s);
  auto pos_b = isposinf(b, s);
  auto neg_a = isneginf(a, s);
  auto neg_b = isneginf(b, s);

  auto a_is_inf = logical_or(pos_a, neg_a, s);
  auto b_is_inf = logical_or(pos_b, neg_b, s);
  auto either_inf = logical_or(a_is_inf, b_is_inf, s);

  auto both_pos = logical_and(pos_a, pos_b, s);
  auto both_neg = logical_and(neg_a, neg_b, s);
  auto matching_inf = logical_or(both_pos, both_neg, s);

  // Any pair involving an infinity is first forced to false ...
  result = logical_and(result, logical_not(either_inf, s), s);

  // ... then pairs of same-signed infinities are forced to true.
  result = logical_or(result, matching_inf, s);

  if (equal_nan) {
    auto nan_pair = logical_and(isnan(a, s), isnan(b, s), s);
    result = logical_or(result, nan_pair, s);
  }

  return result;
}

/** Logical-AND reduction over every axis of `a`. */
array all(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return all(a, every_axis, keepdims, s);
}

/**
 * Logical-AND reduction of `a` over `axes`, producing a bool_ array.
 *
 * Reduced axes are kept as size-one dimensions when `keepdims` is true and
 * squeezed away otherwise.
 */
array all(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  auto [reduced_shape, reduce_axes, nothing_to_reduce] =
      compute_reduce_shape(axes, a.shape());
  // When compute_reduce_shape flags the reduction as a no-op, a cast to bool_
  // replaces the Reduce primitive.
  array out = nothing_to_reduce
      ? astype(a, bool_, s)
      : array(
            std::move(reduced_shape),
            bool_,
            std::make_shared<Reduce>(to_stream(s), Reduce::And, reduce_axes),
            {a});
  if (keepdims) {
    return out;
  }
  return squeeze(out, reduce_axes, s);
}

/** Logical-AND reduction of `a` along a single `axis`. */
array all(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return all(a, single_axis, keepdims, s);
}

/** Logical-OR reduction over every axis of `a`. */
array any(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return any(a, every_axis, keepdims, s);
}

/**
 * Logical-OR reduction of `a` over `axes`, producing a bool_ array.
 *
 * Reduced axes are kept as size-one dimensions when `keepdims` is true and
 * squeezed away otherwise.
 */
array any(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  auto [reduced_shape, reduce_axes, nothing_to_reduce] =
      compute_reduce_shape(axes, a.shape());
  // When compute_reduce_shape flags the reduction as a no-op, a cast to bool_
  // replaces the Reduce primitive.
  array out = nothing_to_reduce
      ? astype(a, bool_, s)
      : array(
            std::move(reduced_shape),
            bool_,
            std::make_shared<Reduce>(to_stream(s), Reduce::Or, reduce_axes),
            {a});
  if (keepdims) {
    return out;
  }
  return squeeze(out, reduce_axes, s);
}

/** Logical-OR reduction of `a` along a single `axis`. */
array any(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return any(a, single_axis, keepdims, s);
}

/** Sum reduction over every axis of `a`. */
array sum(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return sum(a, every_axis, keepdims, s);
}

/**
 * Sum reduction of `a` over `axes`.
 *
 * An empty `axes` list returns `a` untouched. Otherwise the accumulation
 * dtype is widened: signed integers go to int32 (or int64 when wider than
 * 4 bytes), unsigned integers to uint32 / uint64, and bool_ to int32; other
 * dtypes are kept.
 */
array sum(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  if (axes.empty()) {
    return a;
  }
  auto [reduced_shape, reduce_axes, nothing_to_reduce] =
      compute_reduce_shape(axes, a.shape());

  // Pick the result dtype.
  const bool fits_32_bits = a.dtype().size() <= 4;
  Dtype out_type = a.dtype();
  if (a.dtype() == bool_) {
    out_type = int32;
  } else if (issubdtype(a.dtype(), signedinteger)) {
    out_type = fits_32_bits ? int32 : int64;
  } else if (issubdtype(a.dtype(), unsignedinteger)) {
    out_type = fits_32_bits ? uint32 : uint64;
  }

  array out = nothing_to_reduce
      ? astype(a, out_type, s)
      : array(
            std::move(reduced_shape),
            out_type,
            std::make_shared<Reduce>(to_stream(s), Reduce::Sum, reduce_axes),
            {a});
  if (keepdims) {
    return out;
  }
  return squeeze(out, reduce_axes, s);
}

/** Sum reduction of `a` along a single `axis`. */
array sum(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return sum(a, single_axis, keepdims, s);
}

/** Arithmetic mean over every axis of `a`. */
array mean(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return mean(a, every_axis, keepdims, to_stream(s));
}

/**
 * Arithmetic mean of `a` over `axes`.
 *
 * Computed as the sum over `axes` multiplied by a normalizer obtained from
 * number_of_elements() in an at-least-float dtype.
 *
 * @throws std::invalid_argument if any axis lies outside [-ndim, ndim).
 */
array mean(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  const int ndim = a.ndim();
  for (const int axis : axes) {
    const bool in_range = axis >= -ndim && axis < ndim;
    if (!in_range) {
      std::ostringstream msg;
      msg << "[mean] axis " << axis << " is out of bounds for array with "
          << ndim << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
  }
  const auto float_type = at_least_float(a.dtype());
  // NOTE(review): the `true` flag presumably requests the reciprocal of the
  // element count, which is why the sum is multiplied rather than divided.
  auto scale = number_of_elements(a, axes, true, float_type, s);
  auto total = sum(a, axes, keepdims, s);
  return multiply(total, scale, s);
}

/** Arithmetic mean of `a` along a single `axis`. */
array mean(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return mean(a, single_axis, keepdims, to_stream(s));
}

/** Median over every axis of `a`. */
array median(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return median(a, every_axis, keepdims, to_stream(s));
}

/**
 * Median of `a` over `axes`.
 *
 * The reduced axes are moved to the back and flattened into one trailing
 * axis, which is sorted. The middle element (or the average of the two
 * middle elements for an even length) is taken, in an at-least-float dtype.
 *
 * @throws std::invalid_argument on an out-of-range axis, a duplicated axis,
 *         or when the flattened median axis is empty.
 */
array median(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  const int ndim = a.ndim();

  // Validate and normalize the axes into a sorted, duplicate-free set.
  std::set<int> unique_axes;
  for (const int axis : axes) {
    if (axis < -ndim || axis >= ndim) {
      std::ostringstream msg;
      msg << "[median] axis " << axis << " is out of bounds for array with "
          << ndim << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    unique_axes.insert(axis < 0 ? axis + ndim : axis);
  }
  if (unique_axes.size() != axes.size()) {
    throw std::invalid_argument("[median] Received duplicate axis.");
  }
  std::vector<int> median_axes(unique_axes.begin(), unique_axes.end());
  auto out_dtype = at_least_float(a.dtype());

  // Permutation: untouched axes first (in order), median axes last.
  std::vector<int> permutation;
  for (int i = 0; i < ndim; ++i) {
    if (unique_axes.count(i) == 0) {
      permutation.push_back(i);
    }
  }
  int first_flat_axis = permutation.size();
  permutation.insert(
      permutation.end(), median_axes.begin(), median_axes.end());

  // Move all the median axes to the back and flatten them into one axis.
  auto moved = transpose(a, permutation, s);
  auto flat = flatten(moved, first_flat_axis, a.ndim(), s);
  int count = flat.shape(-1);
  if (count == 0) {
    throw std::invalid_argument(
        "[median] Cannot take median along empty axis.");
  }

  // Sort along the flattened axis.
  auto ordered = sort(flat, -1, s);

  // Slice out the midpoint element.
  auto mid = count / 2;
  auto lo = Shape(ordered.ndim(), 0);
  auto hi = ordered.shape();
  lo.back() = mid;
  hi.back() = mid + 1;
  auto result = astype(slice(ordered, lo, hi, s), out_dtype, s);

  // Even length: average with the element just below the midpoint.
  if (count % 2 == 0) {
    lo.back() = mid - 1;
    hi.back() = mid;
    auto lower = astype(slice(ordered, lo, hi, s), out_dtype, s);
    result = multiply(add(result, lower, s), array(0.5, out_dtype), s);
  }

  result = squeeze(result, -1, s);
  if (keepdims) {
    result = expand_dims(result, median_axes, s);
  }
  return result;
}

/** Median of `a` along a single `axis`. */
array median(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return median(a, single_axis, keepdims, to_stream(s));
}

/** Variance over every axis of `a`, with `ddof` delta degrees of freedom. */
array var(
    const array& a,
    bool keepdims,
    int ddof /* = 0*/,
    StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return var(a, every_axis, keepdims, ddof, to_stream(s));
}

/**
 * Variance of `a` over `axes`.
 *
 * The squared deviations from the mean are summed over `axes` and then
 * normalized. With `ddof != 0` the sum is divided by `max(N - ddof, 0)`;
 * with `ddof == 0` it is multiplied by the normalizer returned by
 * number_of_elements().
 */
array var(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    int ddof /* = 0*/,
    StreamOrDevice s /* = {}*/) {
  const auto float_type = at_least_float(a.dtype());
  // The mean keeps its reduced dims so it broadcasts against `a`.
  auto centre = mean(a, axes, /* keepdims= */ true, s);
  auto sq_dev_sum = sum(square(subtract(a, centre, s), s), axes, keepdims, s);

  if (ddof == 0) {
    // NOTE(review): the `true` flag presumably yields the reciprocal count.
    auto scale = number_of_elements(a, axes, true, float_type, s);
    return multiply(sq_dev_sum, scale, s);
  }

  auto count = number_of_elements(a, axes, false, float_type, s);
  auto corrected = subtract(count, array(ddof, float_type), s);
  auto denom = maximum(corrected, array(0, float_type), s);
  return divide(sq_dev_sum, denom, s);
}

/** Variance of `a` along a single `axis`. */
array var(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    int ddof /* = 0*/,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return var(a, single_axis, keepdims, ddof, to_stream(s));
}

/** Standard deviation over every axis of `a`. */
array std(
    const array& a,
    bool keepdims,
    int ddof /* = 0*/,
    StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return std(a, every_axis, keepdims, ddof, to_stream(s));
}

/** Standard deviation of `a` over `axes`: the square root of var(). */
array std(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    int ddof /* = 0*/,
    StreamOrDevice s /* = {}*/) {
  auto variance = var(a, axes, keepdims, ddof, s);
  return sqrt(variance, s);
}

/** Standard deviation of `a` along a single `axis`. */
array std(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    int ddof /* = 0*/,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return std(a, single_axis, keepdims, ddof, to_stream(s));
}

/** Product reduction over every axis of `a`. */
array prod(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return prod(a, every_axis, keepdims, s);
}

/**
 * Product reduction of `a` over `axes`.
 *
 * An empty `axes` list returns `a` untouched. Otherwise the result dtype is
 * widened: signed integers go to int32 (or int64 when wider than 4 bytes),
 * unsigned integers to uint32 / uint64, and bool_ to int32; other dtypes are
 * kept.
 *
 * Reduced axes are kept as size-one dimensions when `keepdims` is true and
 * squeezed away otherwise.
 */
array prod(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  if (axes.empty()) {
    return a;
  }
  auto [out_shape, sorted_axes, is_noop] =
      compute_reduce_shape(axes, a.shape());
  Dtype out_type = a.dtype();
  if (issubdtype(a.dtype(), signedinteger)) {
    out_type = a.dtype().size() <= 4 ? int32 : int64;
  } else if (issubdtype(a.dtype(), unsignedinteger)) {
    out_type = a.dtype().size() <= 4 ? uint32 : uint64;
  } else if (a.dtype() == bool_) {
    out_type = int32;
  }
  // Cast on the no-op path as well (as sum() does) so the result dtype does
  // not depend on whether the reduced axes happen to have size one.
  auto out = (is_noop)
      ? astype(a, out_type, s)
      : array(
            std::move(out_shape),
            out_type,
            std::make_shared<Reduce>(to_stream(s), Reduce::Prod, sorted_axes),
            {a});
  if (!keepdims) {
    out = squeeze(out, sorted_axes, s);
  }
  return out;
}

/** Product reduction of `a` along a single `axis`. */
array prod(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return prod(a, single_axis, keepdims, s);
}

/** Maximum reduction over every axis of `a`. */
array max(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return max(a, every_axis, keepdims, s);
}

/**
 * Maximum reduction of `a` over `axes`; the result keeps the dtype of `a`.
 *
 * An empty `axes` list returns `a` untouched, matching min(). Reduced axes
 * are kept as size-one dimensions when `keepdims` is true and squeezed away
 * otherwise.
 *
 * @throws std::invalid_argument if `a` has zero size.
 */
array max(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  if (a.size() == 0) {
    throw std::invalid_argument("[max] Cannot max reduce zero size array.");
  }
  // Nothing to reduce: skip building any graph node, as min() does.
  if (axes.empty()) {
    return a;
  }
  auto [out_shape, sorted_axes, is_noop] =
      compute_reduce_shape(axes, a.shape());
  auto out = (is_noop)
      ? a
      : array(
            std::move(out_shape),
            a.dtype(),
            std::make_shared<Reduce>(to_stream(s), Reduce::Max, sorted_axes),
            {a});
  if (!keepdims) {
    out = squeeze(out, sorted_axes, s);
  }
  return out;
}

/** Maximum reduction of `a` along a single `axis`. */
array max(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return max(a, single_axis, keepdims, s);
}

/** Minimum reduction over every axis of `a`. */
array min(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return min(a, every_axis, keepdims, s);
}

/**
 * Minimum reduction of `a` over `axes`; the result keeps the dtype of `a`.
 *
 * An empty `axes` list returns `a` untouched. Reduced axes are kept as
 * size-one dimensions when `keepdims` is true and squeezed away otherwise.
 *
 * @throws std::invalid_argument if `a` has zero size.
 */
array min(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  if (a.size() == 0) {
    throw std::invalid_argument("[min] Cannot min reduce zero size array.");
  }
  if (axes.empty()) {
    return a;
  }
  auto [reduced_shape, reduce_axes, nothing_to_reduce] =
      compute_reduce_shape(axes, a.shape());
  array out = nothing_to_reduce
      ? a
      : array(
            std::move(reduced_shape),
            a.dtype(),
            std::make_shared<Reduce>(to_stream(s), Reduce::Min, reduce_axes),
            {a});
  if (keepdims) {
    return out;
  }
  return squeeze(out, reduce_axes, s);
}

/** Minimum reduction of `a` along a single `axis`. */
array min(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return min(a, single_axis, keepdims, s);
}

/**
 * Index of the minimum element of the flattened array.
 *
 * With `keepdims`, the result is expanded so it has as many dimensions as
 * `a`; otherwise it is squeezed.
 */
array argmin(const array& a, bool keepdims, StreamOrDevice s /* = {} */) {
  auto result = argmin(flatten(a, s), 0, true, s);
  // `a.ndim()` is unsigned, so `a.ndim() - 1` would wrap around for a 0-d
  // input and be used as a vector size. A 0-d input has no dims to keep, so
  // it takes the squeeze path.
  if (keepdims && a.ndim() > 0) {
    std::vector<int> axes(a.ndim() - 1);
    std::iota(axes.begin(), axes.end(), 0);
    result = expand_dims(result, axes, s);
  } else {
    result = squeeze(result, s);
  }
  return result;
}

/**
 * Indices (uint32) of the minimum values of `a` along `axis`.
 *
 * When compute_reduce_shape reports the reduction as a no-op, an all-zero
 * index array is returned instead of running the ArgReduce primitive.
 *
 * @throws std::invalid_argument if `a` has zero size.
 */
array argmin(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  if (a.size() == 0) {
    throw std::invalid_argument(
        "[argmin] Cannot argmin reduce zero size array.");
  }
  auto [reduced_shape, reduce_axes, nothing_to_reduce] =
      compute_reduce_shape({axis}, a.shape());
  array out = nothing_to_reduce
      ? zeros(reduced_shape, uint32, s)
      : array(
            std::move(reduced_shape),
            uint32,
            std::make_shared<ArgReduce>(
                to_stream(s), ArgReduce::ArgMin, reduce_axes[0]),
            {a});
  if (keepdims) {
    return out;
  }
  return squeeze(out, reduce_axes[0], s);
}

/**
 * Index of the maximum element of the flattened array.
 *
 * With `keepdims`, the result is expanded so it has as many dimensions as
 * `a`; otherwise it is squeezed.
 */
array argmax(const array& a, bool keepdims, StreamOrDevice s /* = {} */) {
  auto result = argmax(flatten(a, s), 0, true, s);
  // `a.ndim()` is unsigned, so `a.ndim() - 1` would wrap around for a 0-d
  // input and be used as a vector size. A 0-d input has no dims to keep, so
  // it takes the squeeze path.
  if (keepdims && a.ndim() > 0) {
    std::vector<int> axes(a.ndim() - 1);
    std::iota(axes.begin(), axes.end(), 0);
    result = expand_dims(result, axes, s);
  } else {
    result = squeeze(result, s);
  }
  return result;
}

/**
 * Indices (uint32) of the maximum values of `a` along `axis`.
 *
 * When compute_reduce_shape reports the reduction as a no-op, an all-zero
 * index array is returned instead of running the ArgReduce primitive.
 *
 * @throws std::invalid_argument if `a` has zero size.
 */
array argmax(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  if (a.size() == 0) {
    throw std::invalid_argument(
        "[argmax] Cannot argmax reduce zero size array.");
  }
  auto [reduced_shape, reduce_axes, nothing_to_reduce] =
      compute_reduce_shape({axis}, a.shape());
  array out = nothing_to_reduce
      ? zeros(reduced_shape, uint32, s)
      : array(
            std::move(reduced_shape),
            uint32,
            std::make_shared<ArgReduce>(
                to_stream(s), ArgReduce::ArgMax, reduce_axes[0]),
            {a});
  if (keepdims) {
    return out;
  }
  return squeeze(out, reduce_axes[0], s);
}

/**
 * Bartlett (triangular) window of length `M` in float32:
 * `w[n] = 1 - |2n / (M - 1) - 1|`.
 *
 * `M < 1` gives an empty array and `M == 1` a single one.
 */
array bartlett(int M, StreamOrDevice s /* = {} */) {
  if (M < 1) {
    return array({});
  }
  if (M == 1) {
    return ones({1}, float32, s);
  }

  float slope = 2.0f / (M - 1);
  auto idx = arange(0, M, float32, s);
  auto ramp = multiply(array(slope, float32), idx, s);
  auto centred = subtract(ramp, array(1.0f, float32), s);
  return subtract(array(1.0f, float32), abs(centred, s), s);
}

/**
 * Hanning window of length `M` in float32:
 * `w[n] = sin^2(pi * n / (M - 1))`.
 *
 * `M < 1` gives an empty array and `M == 1` a single one.
 */
array hanning(int M, StreamOrDevice s /* = {} */) {
  if (M < 1) {
    return array({});
  }
  if (M == 1) {
    return ones({1}, float32, s);
  }

  auto idx = arange(0, M, float32, s);
  array step(M_PI / (M - 1), float32);
  auto phase = multiply(step, idx, s);
  return square(sin(phase, s), s);
}

/**
 * Hamming window of length `M` in float32:
 * `w[n] = 0.54 - 0.46 * cos(2 * pi * n / (M - 1))`.
 *
 * `M < 1` gives an empty array and `M == 1` a single one.
 */
array hamming(int M, StreamOrDevice s /* = {} */) {
  if (M < 1) {
    return array({});
  }
  if (M == 1) {
    return ones({1}, float32, s);
  }

  auto idx = arange(0, M, float32, s);
  float step = (2.0 * M_PI) / (M - 1);
  auto phase = multiply(array(step, float32), idx, s);
  auto cosine = cos(phase, s);

  auto offset = array(0.54f, float32);
  auto amplitude = array(0.46f, float32);
  return subtract(offset, multiply(amplitude, cosine, s), s);
}

/**
 * Blackman window of length `M` in float32, evaluated as
 * `0.34 - 0.5 * cos(x) + 0.16 * cos(x)^2` with `x = 2 * pi * n / (M - 1)`.
 *
 * `M < 1` gives an empty array and `M == 1` a single one.
 */
array blackman(int M, StreamOrDevice s /* = {} */) {
  if (M < 1) {
    return array({});
  }
  if (M == 1) {
    return ones({1}, float32, s);
  }

  auto idx = arange(0, M, float32, s);

  float step = (2.0 * M_PI) / (M - 1);
  auto phase = multiply(array(step, float32), idx, s);
  auto cosine = cos(phase, s);

  auto c0 = array(0.34f, float32);
  auto c1 = array(0.5f, float32);
  auto c2 = array(0.16f, float32);

  auto linear_term = multiply(c1, cosine, s);
  auto quadratic_term = multiply(c2, square(cosine, s), s);

  return add(subtract(c0, linear_term, s), quadratic_term, s);
}

/** Sorts all elements of `a`: the array is reshaped to 1-D, then sorted. */
array sort(const array& a, StreamOrDevice s /* = {} */) {
  int total = a.size();
  auto flat = reshape(a, {total}, s);
  return sort(flat, 0, s);
}

/**
 * Returns a copy of `a` sorted along `axis`.
 *
 * @throws std::invalid_argument if `axis` lies outside [-ndim, ndim).
 */
array sort(const array& a, int axis, StreamOrDevice s /* = {} */) {
  const int ndim = static_cast<int>(a.ndim());
  const bool axis_ok = axis + ndim >= 0 && axis < ndim;
  if (!axis_ok) {
    std::ostringstream msg;
    msg << "[sort] Received invalid axis " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  auto op = std::make_shared<Sort>(to_stream(s), axis);
  return array(a.shape(), a.dtype(), std::move(op), {a});
}

/** Indices that sort all elements of `a` after reshaping it to 1-D. */
array argsort(const array& a, StreamOrDevice s /* = {} */) {
  int total = a.size();
  auto flat = reshape(a, {total}, s);
  return argsort(flat, 0, s);
}

/**
 * Returns the uint32 indices that sort `a` along `axis`.
 *
 * @throws std::invalid_argument if `axis` lies outside [-ndim, ndim).
 */
array argsort(const array& a, int axis, StreamOrDevice s /* = {} */) {
  const int ndim = static_cast<int>(a.ndim());
  const bool axis_ok = axis + ndim >= 0 && axis < ndim;
  if (!axis_ok) {
    std::ostringstream msg;
    msg << "[argsort] Received invalid axis " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  auto op = std::make_shared<ArgSort>(to_stream(s), axis);
  return array(a.shape(), uint32, std::move(op), {a});
}

/**
 * Partitions all elements of `a` around position `kth`: the array is
 * reshaped to 1-D and partitioned along that single axis, so that the
 * smaller kth elements come first.
 **/
array partition(const array& a, int kth, StreamOrDevice s /* = {} */) {
  int total = a.size();
  auto flat = reshape(a, {total}, s);
  return partition(flat, kth, 0, s);
}

/**
 * Returns a partitioned copy of the array along a given axis
 * such that the smaller kth elements are first.
 *
 * Negative `axis` and `kth` values count from the end of the dimensions and
 * of the chosen axis respectively.
 *
 * @throws std::invalid_argument if `axis` lies outside [-ndim, ndim) or the
 *         normalized `kth` lies outside [0, a.shape(axis)).
 **/
array partition(
    const array& a,
    int kth,
    int axis,
    StreamOrDevice s /* = {} */) {
  // Check for valid axis
  if (axis + static_cast<int>(a.ndim()) < 0 ||
      axis >= static_cast<int>(a.ndim())) {
    std::ostringstream msg;
    msg << "[partition] Received invalid axis " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  int axis_ = axis < 0 ? axis + a.ndim() : axis;
  // Use the normalized axis for both the wrap-around and the bounds check.
  const auto axis_size = a.shape(axis_);
  int kth_ = kth < 0 ? kth + axis_size : kth;
  if (kth_ < 0 || kth_ >= axis_size) {
    std::ostringstream msg;
    msg << "[partition] Received invalid kth " << kth << " along axis " << axis
        << " for array with shape: " << a.shape();
    throw std::invalid_argument(msg.str());
  }
  return array(
      a.shape(),
      a.dtype(),
      std::make_shared<Partition>(to_stream(s), kth_, axis_),
      {a});
}

/**
 * Indices that partition all elements of `a` around position `kth`: the
 * array is reshaped to 1-D and arg-partitioned along that single axis, so
 * that the smaller kth elements come first.
 **/
array argpartition(const array& a, int kth, StreamOrDevice s /* = {} */) {
  int total = a.size();
  auto flat = reshape(a, {total}, s);
  return argpartition(flat, kth, 0, s);
}

/**
 * Returns the uint32 indices that partition `a` along `axis` such that the
 * smaller kth elements come first.
 *
 * Negative `axis` and `kth` values count from the end.
 *
 * @throws std::invalid_argument if `axis` lies outside [-ndim, ndim) or the
 *         normalized `kth` lies outside [0, a.shape(axis)).
 **/
array argpartition(
    const array& a,
    int kth,
    int axis,
    StreamOrDevice s /* = {} */) {
  // Reject axes outside [-ndim, ndim).
  const int ndim = static_cast<int>(a.ndim());
  const bool axis_ok = axis + ndim >= 0 && axis < ndim;
  if (!axis_ok) {
    std::ostringstream msg;
    msg << "[argpartition] Received invalid axis " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  int axis_ = axis < 0 ? axis + a.ndim() : axis;
  int kth_ = kth;
  if (kth < 0) {
    kth_ = kth + a.shape(axis);
  }
  const bool kth_ok = kth_ >= 0 && kth_ < a.shape(axis_);
  if (!kth_ok) {
    std::ostringstream msg;
    msg << "[argpartition] Received invalid kth " << kth << " along axis "
        << axis << " for array with shape: " << a.shape();
    throw std::invalid_argument(msg.str());
  }

  auto op = std::make_shared<ArgPartition>(to_stream(s), kth_, axis_);
  return array(a.shape(), uint32, std::move(op), {a});
}

/** Top-`k` elements over all of `a`, which is first reshaped to 1-D. */
array topk(const array& a, int k, StreamOrDevice s /* = {}*/) {
  int total = a.size();
  auto flat = reshape(a, {total}, s);
  return topk(flat, k, 0, s);
}

/**
 * Returns the `k` largest elements of `a` along `axis`.
 *
 * The array is partitioned at position `-k` and the trailing `k` entries of
 * the axis are sliced out. When `k` equals the axis length the input is
 * returned unchanged.
 *
 * @throws std::invalid_argument if `axis` is out of range or `k` lies
 *         outside [0, a.shape(axis)].
 */
array topk(const array& a, int k, int axis, StreamOrDevice s /* = {}*/) {
  // Normalize, then validate, the axis.
  int axis_ = axis < 0 ? axis + a.ndim() : axis;
  const bool axis_ok = axis_ >= 0 && axis_ < static_cast<int>(a.ndim());
  if (!axis_ok) {
    std::ostringstream msg;
    msg << "[topk] Received invalid axis " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  const bool k_ok = k >= 0 && k <= a.shape(axis_);
  if (!k_ok) {
    std::ostringstream msg;
    msg << "[topk] Received invalid k=" << k << " along axis " << axis
        << " for array with shape: " << a.shape();
    throw std::invalid_argument(msg.str());
  }

  // The whole axis was requested: nothing to select.
  if (k == a.shape(axis_)) {
    return a;
  }

  // After partitioning at -k, the last k entries along the axis are the
  // largest ones; slice them out.
  array partitioned = partition(a, -k, axis_, s);
  Shape begin(a.ndim(), 0);
  auto end = a.shape();
  begin[axis_] = a.shape(axis_) - k;
  return slice(partitioned, begin, end, s);
}

/** log(sum(exp(a))) over every axis of `a`. */
array logsumexp(const array& a, bool keepdims, StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> every_axis;
  every_axis.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    every_axis.push_back(ax);
  }
  return logsumexp(a, every_axis, keepdims, s);
}

/**
 * log(sum(exp(a))) over `axes`.
 *
 * A dedicated LogSumExp primitive is used for non-complex inputs when the
 * reduction ends on the last dimension and every other listed axis is
 * consecutive and has size one. Otherwise the result is composed from
 * max / exp / sum / log, with the maximum subtracted before exponentiation.
 *
 * @throws std::invalid_argument for an empty array, or for non-empty `axes`
 *         on a 0-dimensional array.
 */
array logsumexp(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {}*/) {
  if (a.size() == 0) {
    throw std::invalid_argument("[logsumexp] Received empty array.");
  }
  if (a.ndim() == 0 && !axes.empty()) {
    throw std::invalid_argument(
        "[logsumexp] Received non-empty axes for array with 0 dimensions.");
  }

  // Candidate for the dedicated primitive: the last listed axis is the last
  // dimension ...
  bool last_dim_only =
      !axes.empty() && (axes.back() == a.ndim() - 1 || axes.back() == -1);
  if (last_dim_only) {
    // ... and every earlier listed axis is consecutive with size one, i.e.
    // axes is [0, 1, ..., NDIM - 1] on a shape like [1, 1, ..., N].
    for (int i = axes.size() - 2; i >= 0; --i) {
      const bool consecutive = axes[i] + 1 == axes[i + 1];
      if (!consecutive || a.shape(axes[i]) != 1) {
        last_dim_only = false;
        break;
      }
    }
  }

  const bool complex_input = issubdtype(a.dtype(), complexfloating);
  if (last_dim_only && !complex_input) {
    auto float_type = at_least_float(a.dtype());
    auto reduced_shape = a.shape();
    reduced_shape.back() = 1;
    auto out = array(
        std::move(reduced_shape),
        float_type,
        std::make_shared<LogSumExp>(to_stream(s)),
        {astype(a, float_type, s)});
    if (keepdims) {
      return out;
    }
    return squeeze(out, -1, s);
  }

  // Generic path: log(sum(exp(a - max))) + max, with no gradient through max.
  auto peak = stop_gradient(max(a, axes, true, s), s);
  auto shifted = exp(subtract(a, peak, s), s);
  auto out = log(sum(shifted, axes, keepdims, s), s);
  out = add(out, reshape(peak, out.shape(), s), s);
  if (!keepdims) {
    peak = squeeze(peak, axes, s);
  }
  // Where the maximum itself is infinite, it is returned directly.
  return where(isinf(peak, s), peak, out, s);
}

/** log(sum(exp(a))) along a single `axis`. */
array logsumexp(
    const array& a,
    int axis,
    bool keepdims /* = false */,
    StreamOrDevice s /* = {} */) {
  const std::vector<int> single_axis{axis};
  return logsumexp(a, single_axis, keepdims, s);
}

/**
 * Element-wise absolute value.
 *
 * The Abs primitive is built with the input dtype; for complex64 inputs the
 * result is additionally cast to float32.
 */
array abs(const array& a, StreamOrDevice s /* = {} */) {
  auto op = std::make_shared<Abs>(to_stream(s));
  auto magnitude = array(a.shape(), a.dtype(), std::move(op), {a});
  if (a.dtype() != complex64) {
    return magnitude;
  }
  return astype(magnitude, float32, s);
}

/**
 * Element-wise numerical negation.
 *
 * @throws std::invalid_argument for bool_ inputs.
 */
array negative(const array& a, StreamOrDevice s /* = {} */) {
  if (a.dtype() == bool_) {
    throw std::invalid_argument(
        "[negative] Not supported for bool, use logical_not instead.");
  }
  auto op = std::make_shared<Negative>(to_stream(s));
  return array(a.shape(), a.dtype(), std::move(op), {a});
}
/** Unary minus; forwards to negative() with the default stream argument. */
array operator-(const array& a) {
  auto negated = negative(a);
  return negated;
}

/** Element-wise sign; shape and dtype of the input are preserved. */
array sign(const array& a, StreamOrDevice s /* = {} */) {
  auto op = std::make_shared<Sign>(to_stream(s));
  return array(a.shape(), a.dtype(), std::move(op), {a});
}

/** Element-wise logical NOT; the input is cast to bool_ first. */
array logical_not(const array& a, StreamOrDevice s /* = {} */) {
  auto as_bool = astype(a, bool_, s);
  auto op = std::make_shared<LogicalNot>(to_stream(s));
  return array(a.shape(), bool_, std::move(op), {as_bool});
}

/**
 * Element-wise logical AND.
 *
 * Both operands are cast to bool_ and broadcast to a common shape.
 */
array logical_and(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  auto operands =
      broadcast_arrays({astype(a, bool_, s), astype(b, bool_, s)}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<LogicalAnd>(to_stream(s));
  return array(result_shape, bool_, std::move(op), std::move(operands));
}
/** `a && b`; forwards to logical_and() with the default stream argument. */
array operator&&(const array& a, const array& b) {
  auto conjunction = logical_and(a, b);
  return conjunction;
}

/**
 * Element-wise logical OR.
 *
 * Both operands are cast to bool_ and broadcast to a common shape.
 */
array logical_or(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  auto operands =
      broadcast_arrays({astype(a, bool_, s), astype(b, bool_, s)}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<LogicalOr>(to_stream(s));
  return array(result_shape, bool_, std::move(op), std::move(operands));
}
/** `a || b`; forwards to logical_or() with the default stream argument. */
array operator||(const array& a, const array& b) {
  auto disjunction = logical_or(a, b);
  return disjunction;
}

/** Element-wise `1 / a`, with the numerator in an at-least-float dtype. */
array reciprocal(const array& a, StreamOrDevice s /* = {} */) {
  const auto float_type = at_least_float(a.dtype());
  auto one = array(1.0f, float_type);
  return divide(one, a, to_stream(s));
}

/**
 * Element-wise addition.
 *
 * Operands are cast to their promoted dtype and broadcast; the result has
 * the promoted dtype and the broadcast shape.
 */
array add(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto result_type = promote_types(a.dtype(), b.dtype());
  auto operands = broadcast_arrays(
      {astype(a, result_type, s), astype(b, result_type, s)}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<Add>(to_stream(s));
  return array(result_shape, result_type, std::move(op), std::move(operands));
}

/** `a + b`; forwards to add() with the default stream argument. */
array operator+(const array& a, const array& b) {
  auto total = add(a, b);
  return total;
}

/**
 * Element-wise subtraction `a - b`.
 *
 * Operands are cast to their promoted dtype and broadcast; the result has
 * the promoted dtype and the broadcast shape.
 */
array subtract(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto result_type = promote_types(a.dtype(), b.dtype());
  auto operands = broadcast_arrays(
      {astype(a, result_type, s), astype(b, result_type, s)}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<Subtract>(to_stream(s));
  return array(result_shape, result_type, std::move(op), std::move(operands));
}

/** `a - b`; forwards to subtract() with the default stream argument. */
array operator-(const array& a, const array& b) {
  auto difference = subtract(a, b);
  return difference;
}

/**
 * Element-wise multiplication.
 *
 * Operands are cast to their promoted dtype and broadcast; the result has
 * the promoted dtype and the broadcast shape.
 */
array multiply(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto result_type = promote_types(a.dtype(), b.dtype());
  auto operands = broadcast_arrays(
      {astype(a, result_type, s), astype(b, result_type, s)}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<Multiply>(to_stream(s));
  return array(result_shape, result_type, std::move(op), std::move(operands));
}

/** `a * b`; forwards to multiply() with the default stream argument. */
array operator*(const array& a, const array& b) {
  auto product = multiply(a, b);
  return product;
}

/**
 * Element-wise true division `a / b`.
 *
 * Operands are cast to the at-least-float version of their promoted dtype
 * and broadcast; the result has that dtype and the broadcast shape.
 */
array divide(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto result_type =
      at_least_float(promote_types(a.dtype(), b.dtype()));
  auto operands = broadcast_arrays(
      {astype(a, result_type, s), astype(b, result_type, to_stream(s))}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<Divide>(to_stream(s));
  return array(result_shape, result_type, std::move(op), std::move(operands));
}
/** `a / b`; forwards to divide() with the default stream argument. */
array operator/(const array& a, const array& b) {
  auto quotient = divide(a, b);
  return quotient;
}
/** `scalar / array`; the scalar is wrapped in an array before dividing. */
array operator/(double a, const array& b) {
  auto numerator = array(a);
  return divide(numerator, b);
}
/** `array / scalar`; the scalar is wrapped in an array before dividing. */
array operator/(const array& a, double b) {
  auto denominator = array(b);
  return divide(a, denominator);
}

/**
 * Element-wise floor division.
 *
 * For inexact promoted dtypes this is `floor(divide(a, b))`. Otherwise the
 * operands are cast to the promoted dtype, broadcast, and passed to the
 * Divide primitive with that (non-float) dtype.
 */
array floor_divide(
    const array& a,
    const array& b,
    StreamOrDevice s /* = {} */) {
  const auto result_type = promote_types(a.dtype(), b.dtype());
  const bool is_inexact = issubdtype(result_type, inexact);
  if (is_inexact) {
    auto quotient = divide(a, b, s);
    return floor(quotient, s);
  }

  auto operands = broadcast_arrays(
      {astype(a, result_type, s), astype(b, result_type, s)}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<Divide>(to_stream(s));
  return array(result_shape, result_type, std::move(op), std::move(operands));
}

/**
 * Element-wise remainder of `a` divided by `b`.
 *
 * Operands are cast to their promoted dtype and broadcast; the result has
 * the promoted dtype and the broadcast shape.
 */
array remainder(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto result_type = promote_types(a.dtype(), b.dtype());
  auto operands = broadcast_arrays(
      {astype(a, result_type, s), astype(b, result_type, to_stream(s))}, s);
  const auto& result_shape = operands.front().shape();
  auto op = std::make_shared<Remainder>(to_stream(s));
  return array(result_shape, result_type, std::move(op), std::move(operands));
}
/** `a % b`; forwards to remainder() with the default stream argument. */
array operator%(const array& a, const array& b) {
  auto rem = remainder(a, b);
  return rem;
}

/**
 * Element-wise quotient and remainder, produced together by one DivMod
 * primitive as two arrays.
 *
 * Operands are cast to their promoted dtype and broadcast; both outputs
 * share the broadcast shape and the promoted dtype.
 *
 * @throws std::invalid_argument if the promoted dtype is complex.
 */
std::vector<array>
divmod(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common = promote_types(a.dtype(), b.dtype());
  const bool is_complex = issubdtype(common, complexfloating);
  if (is_complex) {
    throw std::invalid_argument("[divmod] Complex type not supported.");
  }
  auto operands = broadcast_arrays(
      {astype(a, common, s), astype(b, common, to_stream(s))}, s);
  auto op = std::make_shared<DivMod>(to_stream(s));
  return array::make_arrays(
      {operands[0].shape(), operands[0].shape()},
      {operands[0].dtype(), operands[0].dtype()},
      std::move(op),
      operands);
}

/**
 * Maximum of a and b.
 *
 * Operands are cast to promote_types(a, b) and broadcast against each other;
 * the result carries the Maximum primitive on stream s.
 */
array maximum(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common_type = promote_types(a.dtype(), b.dtype());
  auto lhs = astype(a, common_type, s);
  auto rhs = astype(b, common_type, s);
  auto operands = broadcast_arrays({std::move(lhs), std::move(rhs)}, s);
  auto out_shape = operands[0].shape();
  return array(
      std::move(out_shape),
      common_type,
      std::make_shared<Maximum>(to_stream(s)),
      std::move(operands));
}

/**
 * Minimum of a and b.
 *
 * Operands are cast to promote_types(a, b) and broadcast against each other;
 * the result carries the Minimum primitive on stream s.
 */
array minimum(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto common_type = promote_types(a.dtype(), b.dtype());
  auto lhs = astype(a, common_type, s);
  auto rhs = astype(b, common_type, s);
  auto operands = broadcast_arrays({std::move(lhs), std::move(rhs)}, s);
  auto out_shape = operands[0].shape();
  return array(
      std::move(out_shape),
      common_type,
      std::make_shared<Minimum>(to_stream(s)),
      std::move(operands));
}

/**
 * Attach the Floor primitive to a; shape and dtype are unchanged.
 *
 * Throws std::invalid_argument for complex64 input.
 */
array floor(const array& a, StreamOrDevice s /* = {} */) {
  const bool is_complex = (a.dtype() == complex64);
  if (is_complex) {
    throw std::invalid_argument("[floor] Not supported for complex64.");
  }
  auto primitive = std::make_shared<Floor>(to_stream(s));
  return array(a.shape(), a.dtype(), std::move(primitive), {a});
}

/**
 * Attach the Ceil primitive to a; shape and dtype are unchanged.
 *
 * Throws std::invalid_argument for complex64 input.
 */
array ceil(const array& a, StreamOrDevice s /* = {} */) {
  if (a.dtype() == complex64) {
    // The message previously carried the "[floor]" tag, which pointed callers
    // at the wrong op.
    throw std::invalid_argument("[ceil] Not supported for complex64.");
  }
  return array(a.shape(), a.dtype(), std::make_shared<Ceil>(to_stream(s)), {a});
}

/** Attach the Square primitive to a; shape and dtype are unchanged. */
array square(const array& a, StreamOrDevice s /* = {} */) {
  auto primitive = std::make_shared<Square>(to_stream(s));
  return array(a.shape(), a.dtype(), std::move(primitive), {a});
}

/** Exp primitive applied to a after casting it to at_least_float(a.dtype()). */
array exp(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Exp>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** Expm1 primitive applied to a after casting it to at_least_float(a.dtype()). */
array expm1(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Expm1>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** Sin primitive applied to a after casting it to at_least_float(a.dtype()). */
array sin(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Sin>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** Cos primitive applied to a after casting it to at_least_float(a.dtype()). */
array cos(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Cos>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** Tan primitive applied to a after casting it to at_least_float(a.dtype()). */
array tan(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Tan>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** ArcSin primitive applied to a after casting it to at_least_float(a.dtype()). */
array arcsin(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<ArcSin>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** ArcCos primitive applied to a after casting it to at_least_float(a.dtype()). */
array arccos(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<ArcCos>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** ArcTan primitive applied to a after casting it to at_least_float(a.dtype()). */
array arctan(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<ArcTan>(to_stream(s)),
      {astype(a, out_type, s)});
}

/**
 * ArcTan2 primitive over (a, b).
 *
 * Operands are cast to at_least_float(promote_types(a, b)) and broadcast
 * against each other before the primitive is attached.
 */
array arctan2(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(promote_types(a.dtype(), b.dtype()));
  auto lhs = astype(a, out_type, s);
  auto rhs = astype(b, out_type, s);
  auto operands = broadcast_arrays({std::move(lhs), std::move(rhs)}, s);
  auto out_shape = operands[0].shape();
  return array(
      std::move(out_shape),
      out_type,
      std::make_shared<ArcTan2>(to_stream(s)),
      std::move(operands));
}

/** Sinh primitive applied to a after casting it to at_least_float(a.dtype()). */
array sinh(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Sinh>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** Cosh primitive applied to a after casting it to at_least_float(a.dtype()). */
array cosh(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Cosh>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** Tanh primitive applied to a after casting it to at_least_float(a.dtype()). */
array tanh(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Tanh>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** ArcSinh primitive applied to a after casting it to at_least_float(a.dtype()). */
array arcsinh(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<ArcSinh>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** ArcCosh primitive applied to a after casting it to at_least_float(a.dtype()). */
array arccosh(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<ArcCosh>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** ArcTanh primitive applied to a after casting it to at_least_float(a.dtype()). */
array arctanh(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<ArcTanh>(to_stream(s)),
      {astype(a, out_type, s)});
}

/**
 * Multiply a by the constant 180 / pi, held in a scalar array of dtype
 * at_least_float(a.dtype()).
 */
array degrees(const array& a, StreamOrDevice s /* = {} */) {
  const auto factor_type = at_least_float(a.dtype());
  array factor(180.0 / M_PI, factor_type);
  return multiply(a, factor, s);
}

/**
 * Multiply a by the constant pi / 180, held in a scalar array of dtype
 * at_least_float(a.dtype()).
 */
array radians(const array& a, StreamOrDevice s /* = {} */) {
  const auto factor_type = at_least_float(a.dtype());
  array factor(M_PI / 180.0, factor_type);
  return multiply(a, factor, s);
}

/**
 * Log primitive with Log::Base::e applied to a after casting it to
 * at_least_float(a.dtype()).
 */
array log(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  auto primitive = std::make_shared<Log>(to_stream(s), Log::Base::e);
  return array(
      a.shape(), out_type, std::move(primitive), {astype(a, out_type, s)});
}

/**
 * Log primitive with Log::Base::two applied to a after casting it to
 * at_least_float(a.dtype()).
 */
array log2(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  auto primitive = std::make_shared<Log>(to_stream(s), Log::Base::two);
  return array(
      a.shape(), out_type, std::move(primitive), {astype(a, out_type, s)});
}

/**
 * Log primitive with Log::Base::ten applied to a after casting it to
 * at_least_float(a.dtype()).
 */
array log10(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  auto primitive = std::make_shared<Log>(to_stream(s), Log::Base::ten);
  return array(
      a.shape(), out_type, std::move(primitive), {astype(a, out_type, s)});
}

/** Log1p primitive applied to a after casting it to at_least_float(a.dtype()). */
array log1p(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Log1p>(to_stream(s)),
      {astype(a, out_type, s)});
}

/**
 * LogAddExp primitive over (a, b).
 *
 * The output dtype is at_least_float(promote_types(a, b)) so that it is a
 * floating point type; operands are cast to it and broadcast.
 */
array logaddexp(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto float_type = at_least_float(promote_types(a.dtype(), b.dtype()));
  auto lhs = astype(a, float_type, s);
  auto rhs = astype(b, float_type, s);
  auto operands = broadcast_arrays({std::move(lhs), std::move(rhs)}, s);
  auto out_shape = operands[0].shape();
  return array(
      std::move(out_shape),
      float_type,
      std::make_shared<LogAddExp>(to_stream(s)),
      std::move(operands));
}

/** Sigmoid primitive applied to a after casting it to at_least_float(a.dtype()). */
array sigmoid(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  return array(
      a.shape(),
      out_type,
      std::make_shared<Sigmoid>(to_stream(s)),
      {astype(a, out_type, s)});
}

/** Erf primitive applied to a after casting it to at_least_float(a.dtype()). */
array erf(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  auto input = astype(a, out_type, s);
  return array(
      a.shape(), out_type, std::make_shared<Erf>(to_stream(s)), {input});
}

/** ErfInv primitive applied to a after casting it to at_least_float(a.dtype()). */
array erfinv(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  auto input = astype(a, out_type, s);
  return array(
      a.shape(), out_type, std::make_shared<ErfInv>(to_stream(s)), {input});
}

/** Attach the StopGradient primitive to a; shape and dtype are unchanged. */
array stop_gradient(const array& a, StreamOrDevice s /* = {} */) {
  auto primitive = std::make_shared<StopGradient>(to_stream(s));
  return array(a.shape(), a.dtype(), std::move(primitive), {a});
}

/**
 * Round a to the given number of decimals.
 *
 * With decimals == 0 the Round primitive is attached directly. Otherwise the
 * input is multiplied by 10^decimals, rounded with decimals == 0, multiplied
 * by the reciprocal scale, and cast back to a's dtype.
 */
array round(const array& a, int decimals, StreamOrDevice s /* = {} */) {
  if (decimals != 0) {
    const auto work_type = at_least_float(a.dtype());
    float scale = std::pow(10, decimals);
    auto scaled = multiply(a, array(scale, work_type), s);
    auto rounded = round(scaled, 0, s);
    auto unscaled = multiply(rounded, array(1 / scale, work_type), s);
    return astype(unscaled, a.dtype(), s);
  }

  return array(
      a.shape(), a.dtype(), std::make_shared<Round>(to_stream(s)), {a});
}

/**
 * Matrix product of in_a and in_b via the Matmul primitive.
 *
 * 1-D inputs are temporarily expanded with a singleton dimension (leading for
 * in_a, trailing for in_b) and that dimension is squeezed from the result.
 * Inputs are cast to promote_types(a, b), which must be a sub-dtype of
 * `inexact`.
 *
 * Throws std::invalid_argument for 0-dimensional inputs, for mismatched inner
 * dimensions, and for a non-inexact promoted dtype.
 */
array matmul(
    const array& in_a,
    const array& in_b,
    StreamOrDevice s /* = {} */) {
  // Work on copies; in_a / in_b are kept to consult the original ranks later.
  auto a = in_a;
  auto b = in_b;
  if (a.ndim() == 0 || b.ndim() == 0) {
    throw std::invalid_argument(
        "[matmul] Got 0 dimension input. Inputs must "
        "have at least one dimension.");
  }

  if (a.ndim() == 1) {
    // Insert a singleton dim in the beginning
    a = expand_dims(a, 0, s);
  }
  if (b.ndim() == 1) {
    // Insert a singleton dim at the end
    b = expand_dims(b, 1, s);
  }
  // Both operands now have at least two dimensions, so -1 / -2 are valid.
  if (a.shape(-1) != b.shape(-2)) {
    std::ostringstream msg;
    msg << "[matmul] Last dimension of first input with shape " << a.shape()
        << " must match second to last dimension of"
        << " second input with shape " << b.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Type promotion
  auto out_type = promote_types(a.dtype(), b.dtype());

  if (!issubdtype(out_type, inexact)) {
    std::ostringstream msg;
    msg << "[matmul] Only inexact types are supported but " << a.dtype()
        << " and " << b.dtype() << " were provided which results"
        << " in " << out_type << ", which is not a floating point type.";
    throw std::invalid_argument(msg.str());
  }
  // Only cast the operand(s) whose dtype differs from the promoted one.
  if (a.dtype() != out_type) {
    a = astype(a, out_type, s);
  }
  if (b.dtype() != out_type) {
    b = astype(b, out_type, s);
  }

  // We can batch the multiplication by reshaping a
  if (in_a.ndim() > 2 && in_b.ndim() <= 2) {
    // Collapse every dimension of a except the last into one.
    a = flatten(a, 0, -2, s);
  } else if (in_b.ndim() > 2) {
    // Broadcast a and b against each other, passing the last two axes
    // {-2, -1} as the axes to leave out of the broadcast.
    std::tie(a, b) = broadcast_arrays(a, b, {-2, -1}, s);
  }

  // Output shape is a's shape with the last dimension replaced by b's last.
  auto out_shape = a.shape();
  out_shape.back() = b.shape(-1);

  auto out = array(
      std::move(out_shape),
      out_type,
      std::make_shared<Matmul>(to_stream(s)),
      {a, b});
  // Undo the flatten above: restore in_a's leading dimensions on axis 0.
  if (in_a.ndim() > 2 && in_b.ndim() <= 2) {
    auto orig_shape = in_a.shape();
    orig_shape.pop_back();
    out = unflatten(out, 0, std::move(orig_shape), s);
  }

  // Remove the possibly inserted singleton dimensions
  std::vector<int> axes;
  if (in_a.ndim() == 1) {
    axes.push_back(out.ndim() - 2);
  }
  if (in_b.ndim() == 1) {
    axes.push_back(out.ndim() - 1);
  }
  return axes.empty() ? out : squeeze(out, axes, s);
}

/**
 * Gather slices of a at the positions given by the index arrays.
 *
 * indices[k] indexes axis axes[k] of a. The index arrays are broadcast
 * against each other and cast to their common result_type. The output shape
 * is the broadcast index shape followed by slice_sizes, and the output dtype
 * is a.dtype().
 *
 * Throws std::invalid_argument when:
 *  - there are more index arrays than dimensions of a,
 *  - axes contains repeats or values outside [0, a.ndim()),
 *  - the number of index arrays differs from the number of axes,
 *  - any index array is boolean or the common index dtype is inexact,
 *  - slice_sizes does not have one entry per dimension of a,
 *  - a is empty but both the indices and the total slice are non-empty,
 *  - a is non-empty and a slice size is outside [0, a.shape(i)].
 */
array gather(
    const array& a,
    const std::vector<array>& indices,
    const std::vector<int>& axes,
    const Shape& slice_sizes,
    StreamOrDevice s /* = {} */) {
  // Checks that indices, dimensions, and slice_sizes are all valid
  if (indices.size() > a.ndim()) {
    std::ostringstream msg;
    msg << "[gather] Too many index arrays. Got " << indices.size()
        << " index arrays for input with " << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // A set both de-duplicates and orders the axes, so the smallest and largest
  // values can be range-checked directly.
  std::set dims(axes.begin(), axes.end());
  if (dims.size() != axes.size()) {
    throw std::invalid_argument("[gather] Repeat axes not allowed in gather.");
  }
  if (!dims.empty() && (*dims.begin() < 0 || *dims.rbegin() >= a.ndim())) {
    throw std::invalid_argument("[gather] Axes don't match array dimensions.");
  }
  if (indices.size() != axes.size()) {
    throw std::invalid_argument(
        "[gather] Number of index arrays does not match number of axes.");
  }
  for (auto& x : indices) {
    if (x.dtype() == bool_) {
      throw std::invalid_argument("[Gather] Boolean indices not supported.");
    }
  }

  if (slice_sizes.size() != a.ndim()) {
    std::ostringstream msg;
    msg << "[gather] Got slice_sizes with size " << slice_sizes.size()
        << " for array with " << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  // Promote indices to the same type
  auto dtype = result_type(indices);
  if (issubdtype(dtype, inexact)) {
    throw std::invalid_argument(
        "[gather] Got indices with invalid dtype. Indices must be integral.");
  }

  // Broadcast and cast indices if necessary. The broadcast is issued on the
  // requested stream like every other op created here.
  auto inputs = broadcast_arrays(indices, s);
  for (auto& idx : inputs) {
    idx = astype(idx, dtype, s);
  }

  if (a.size() == 0) {
    // Empty input, either the total slice size is 0 or the indices are empty.
    // The accumulator is seeded with an int64_t so the product is carried in
    // 64 bits; an int seed would truncate every partial product to int.
    auto total_slice = std::accumulate(
        slice_sizes.begin(),
        slice_sizes.end(),
        int64_t{1},
        std::multiplies<int64_t>{});
    auto idx_size = !inputs.empty() ? inputs[0].size() : 1;
    if (idx_size != 0 && total_slice != 0) {
      std::ostringstream msg;
      msg << "[gather] If the input is empty, either the indices must be"
          << " empty or the total slice size must be 0.";
      throw std::invalid_argument(msg.str());
    }
  } else {
    // Non-empty input, check slice sizes are valid
    for (int i = 0; i < a.ndim(); ++i) {
      if (slice_sizes[i] < 0 || slice_sizes[i] > a.shape(i)) {
        std::ostringstream msg;
        msg << "[gather] Slice sizes must be in [0, a.shape(i)]. Got "
            << slice_sizes << " for array with shape " << a.shape() << ".";
        throw std::invalid_argument(msg.str());
      }
    }
  }

  // Output shape: broadcast index shape followed by the slice sizes.
  Shape out_shape;
  if (!inputs.empty()) {
    out_shape = inputs[0].shape();
  }
  out_shape.insert(out_shape.end(), slice_sizes.begin(), slice_sizes.end());

  // The source array goes first, followed by the index arrays.
  inputs.insert(inputs.begin(), a);
  // axes and slice_sizes are const references, so they are copied into the
  // primitive (std::move on them would not move anything).
  return array(
      std::move(out_shape),
      a.dtype(),
      std::make_shared<Gather>(to_stream(s), axes, slice_sizes),
      inputs);
}

/**
 * Kronecker product of a and b, computed as reshape -> multiply -> reshape.
 *
 * a's dimensions are placed on the even positions and b's on the odd
 * positions of two shapes of length 2 * max(a.ndim(), b.ndim()) (padded with
 * ones), the reshaped operands are multiplied, and the product is reshaped so
 * that each output dimension is the product of the aligned a and b dimensions.
 *
 * Throws std::invalid_argument when either input has zero elements.
 */
array kron(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  if (a.size() == 0 || b.size() == 0) {
    throw std::invalid_argument("[kron] Input arrays cannot be empty.");
  }

  const int rank = std::max(a.ndim(), b.ndim());
  const int a_rank = a.ndim();
  const int b_rank = b.ndim();

  Shape a_interleaved(2 * rank, 1);
  Shape b_interleaved(2 * rank, 1);
  Shape result_shape(rank, 1);

  // Dimensions are right-aligned: dimension j of an operand with fewer
  // dimensions lands at position (rank - operand_rank + j).
  for (int j = 0; j < a_rank; ++j) {
    const int i = rank - a_rank + j;
    a_interleaved[2 * i] = a.shape(j);
    result_shape[i] *= a.shape(j);
  }
  for (int j = 0; j < b_rank; ++j) {
    const int i = rank - b_rank + j;
    b_interleaved[2 * i + 1] = b.shape(j);
    result_shape[i] *= b.shape(j);
  }

  auto lhs = reshape(a, std::move(a_interleaved), s);
  auto rhs = reshape(b, std::move(b_interleaved), s);
  auto product = multiply(lhs, rhs, s);
  return reshape(product, std::move(result_shape), s);
}

/**
 * Take entries of a along `axis` at the positions given by `indices`.
 *
 * Implemented with gather using slices of size 1 along `axis`, followed by a
 * transpose that moves the index dimensions to the position of `axis` and a
 * squeeze of the size-1 slice dimension.
 *
 * Throws std::invalid_argument for an out-of-range axis, or when the axis has
 * size 0 while indices is non-empty.
 */
array take(
    const array& a,
    const array& indices,
    int axis,
    StreamOrDevice s /* = {} */) {
  // Check for valid axis
  const int rank = static_cast<int>(a.ndim());
  if (axis < -rank || axis >= rank) {
    std::ostringstream msg;
    msg << "[take] Received invalid axis " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // Handle negative axis
  if (axis < 0) {
    axis += rank;
  }

  // Check for valid take
  if (a.shape(axis) == 0 && indices.size() != 0) {
    throw std::invalid_argument(
        "[take] Cannot do a non-empty take from an empty axis.");
  }

  // Make slice sizes to pass to gather
  Shape slice_sizes = a.shape();
  slice_sizes[axis] = 1;

  auto gathered = gather(a, indices, axis, slice_sizes, s);

  const int idx_rank = static_cast<int>(indices.ndim());

  // Transpose indices dimensions to axis dimension
  if (axis != 0) {
    const int out_rank = static_cast<int>(gathered.ndim());
    std::vector<int> perm;
    perm.reserve(out_rank);
    // Source dimensions that precede `axis` come first ...
    for (int i = 0; i < axis; ++i) {
      perm.push_back(idx_rank + i);
    }
    // ... then the index dimensions ...
    for (int i = 0; i < idx_rank; ++i) {
      perm.push_back(i);
    }
    // ... and the remaining dimensions stay where they are.
    for (int i = axis + idx_rank; i < out_rank; ++i) {
      perm.push_back(i);
    }
    gathered = transpose(gathered, perm, s);
  }

  // Squeeze the axis we take over
  return squeeze(gathered, idx_rank + axis, s);
}

/** Axis-less take: flattens a and takes along axis 0 of the result. */
array take(const array& a, const array& indices, StreamOrDevice s /* = {} */) {
  auto flat = flatten(a, s);
  return take(flat, indices, 0, s);
}

/**
 * Take the single entry `index` along `axis` of a.
 *
 * Implemented as a slice [index, index + 1) on `axis` (full range on every
 * other axis) followed by a squeeze of `axis`.
 *
 * Throws std::invalid_argument for an out-of-range axis or when a has zero
 * elements.
 */
array take(const array& a, int index, int axis, StreamOrDevice s /* = {} */) {
  // Check for valid axis
  const int rank = static_cast<int>(a.ndim());
  if (axis < -rank || axis >= rank) {
    std::ostringstream msg;
    msg << "[take] Received invalid axis " << axis << " for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // Check for valid take
  if (a.size() == 0) {
    throw std::invalid_argument(
        "[take] Cannot do a non-empty take from an array with zero elements.");
  }

  // Handle negative axis
  if (axis < 0) {
    axis += rank;
  }

  Shape lower(a.ndim(), 0);
  Shape upper = a.shape();
  lower[axis] = index;
  upper[axis] = index + 1;
  auto sliced = slice(a, std::move(lower), std::move(upper), s);
  return squeeze(sliced, axis, s);
}

/** Axis-less scalar take: flattens a and takes `index` along axis 0. */
array take(const array& a, int index, StreamOrDevice s /* = {} */) {
  auto flat = flatten(a, s);
  return take(flat, index, 0, s);
}

/**
 * Take values from a along `axis` using an index array of the same rank.
 *
 * a and indices are broadcast against each other on every axis except
 * `axis`; the output has the broadcast shape of indices and the dtype of a,
 * and carries the GatherAxis primitive.
 *
 * Throws std::invalid_argument for an out-of-range axis or when indices and a
 * have a different number of dimensions.
 */
array take_along_axis(
    const array& a,
    const array& indices,
    int axis,
    StreamOrDevice s /* = {} */) {
  // Do the range check in signed arithmetic, as take() does. If ndim() is
  // unsigned (as the cast in take() suggests), `axis + a.ndim()` is unsigned
  // too and can never be negative, so an axis below -ndim slipped through.
  const int ndim = static_cast<int>(a.ndim());
  if (axis + ndim < 0 || axis >= ndim) {
    std::ostringstream msg;
    msg << "[take_along_axis] Received invalid axis for array with " << a.ndim()
        << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  if (indices.ndim() != a.ndim()) {
    std::ostringstream msg;
    msg << "[take_along_axis] Indices of dimension " << indices.ndim()
        << " does not match array of dimension " << a.ndim() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Allow negative axis
  axis = axis < 0 ? ndim + axis : axis;

  // Broadcast indices and input ignoring the take axis (the ignored axis is
  // passed as a negative offset from the end)
  auto inputs =
      broadcast_arrays({a, indices}, std::vector<int>{axis - ndim}, s);

  auto out_shape = inputs[1].shape();
  return array(
      std::move(out_shape),
      a.dtype(),
      std::make_shared<GatherAxis>(to_stream(s), axis),
      std::move(inputs));
}

/**
 * Shared implementation of put_along_axis (mode None) and scatter_add_axis:
 * scatter `values` into a along `axis` at the positions given by `indices`.
 *
 * values is cast to a's dtype; extra leading dimensions of values beyond the
 * rank of indices are squeezed. indices and values are broadcast against each
 * other, then a, indices and values are broadcast together on every axis
 * except `axis`. An empty a is returned unchanged.
 *
 * Throws std::invalid_argument for an out-of-range axis or when indices and a
 * have a different number of dimensions.
 */
array scatter_axis(
    const array& a,
    const array& indices,
    const array& values,
    int axis,
    ScatterAxis::ReduceType mode,
    StreamOrDevice s) {
  std::string prefix =
      (mode == ScatterAxis::None) ? "[put_along_axis]" : "[scatter_add_axis]";
  // Do the range check in signed arithmetic. If ndim() is unsigned (as the
  // existing cast suggests), `axis + a.ndim()` is unsigned too and can never
  // be negative, so an axis below -ndim slipped through.
  const int ndim = static_cast<int>(a.ndim());
  if (axis + ndim < 0 || axis >= ndim) {
    std::ostringstream msg;
    msg << prefix << " Received invalid axis for array with " << a.ndim()
        << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  if (indices.ndim() != a.ndim()) {
    std::ostringstream msg;
    msg << prefix << " Indices of dimension " << indices.ndim()
        << " does not match array of dimension " << a.ndim() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Nothing to scatter into
  if (a.size() == 0) {
    return a;
  }

  auto upd = astype(values, a.dtype(), s);

  // Squeeze leading singletons out of update
  if (upd.ndim() > indices.ndim()) {
    std::vector<int> sq_ax(upd.ndim() - indices.ndim());
    std::iota(sq_ax.begin(), sq_ax.end(), 0);
    upd = squeeze(upd, sq_ax, s);
  }

  auto inputs = broadcast_arrays({indices, upd}, s);
  inputs.insert(inputs.begin(), a);

  // Allow negative axis
  axis = axis < 0 ? ndim + axis : axis;

  // Broadcast src, indices, values while ignoring the take axis
  inputs = broadcast_arrays(inputs, {axis - ndim}, s);

  auto out_shape = inputs[0].shape();
  return array(
      std::move(out_shape),
      a.dtype(),
      std::make_shared<ScatterAxis>(to_stream(s), mode, axis),
      std::move(inputs));
}

/** scatter_axis with ScatterAxis::None as the reduce mode. */
array put_along_axis(
    const array& a,
    const array& indices,
    const array& values,
    int axis,
    StreamOrDevice s /* = {} */) {
  const auto mode = ScatterAxis::None;
  return scatter_axis(a, indices, values, axis, mode, s);
}

/** scatter_axis with ScatterAxis::Sum as the reduce mode. */
array scatter_add_axis(
    const array& a,
    const array& indices,
    const array& values,
    int axis,
    StreamOrDevice s /* = {} */) {
  const auto mode = ScatterAxis::Sum;
  return scatter_axis(a, indices, values, axis, mode, s);
}

/**
 * Scatter updates to given indices.
 *
 * indices[k] indexes axis axes[k] of a. The index arrays are broadcast
 * against each other and cast to their common result_type; updates is cast
 * to a's dtype. The output has a's shape and dtype and carries the Scatter
 * primitive with the given reduce mode.
 *
 * updates must have a.ndim() + (broadcast index rank) dimensions: the leading
 * ones must equal the broadcast index shape and the trailing ones must not
 * exceed the corresponding dimensions of a.
 *
 * Throws std::invalid_argument when any of the above is violated, when an
 * index array is boolean or the common index dtype is inexact, when axes
 * contains repeats or out-of-range values, when the number of index arrays
 * differs from the number of axes, or for 8-byte dtypes on the GPU.
 */
array scatter(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    Scatter::ReduceType mode,
    StreamOrDevice s) {
  // Checks that indices, dimensions, and slice_sizes are all valid
  if (indices.size() > a.ndim()) {
    std::ostringstream msg;
    msg << "[scatter] Too many index arrays. Got " << indices.size()
        << " index arrays for input with " << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  for (auto& x : indices) {
    if (x.dtype() == bool_) {
      // Throw a real exception type; a bare `throw("...")` throws a
      // const char* that handlers for std::exception do not catch.
      throw std::invalid_argument("[scatter] Boolean indices not supported.");
    }
  }

  // A set both de-duplicates and orders the axes, so the smallest and largest
  // values can be range-checked directly.
  std::set dims(axes.begin(), axes.end());
  if (dims.size() != axes.size()) {
    throw std::invalid_argument(
        "[scatter] Repeat axes not allowed in scatter.");
  }
  if (!dims.empty() && (*dims.begin() < 0 || *dims.rbegin() >= a.ndim())) {
    throw std::invalid_argument("[scatter] Axes don't match array dimensions.");
  }
  if (indices.size() != axes.size()) {
    throw std::invalid_argument(
        "[scatter] Number of index arrays does not match number of axes.");
  }

  // Broadcast and cast indices if necessary. The broadcast is issued on the
  // requested stream like every other op created here.
  auto inputs = broadcast_arrays(indices, s);

  Shape idx_shape;
  if (!inputs.empty()) {
    idx_shape = inputs[0].shape();
  }

  if (updates.ndim() != (a.ndim() + idx_shape.size())) {
    std::ostringstream msg;
    msg << "[scatter] Updates with " << updates.ndim()
        << " dimensions does not match the sum of the array (" << a.ndim()
        << ") and indices (" << idx_shape.size() << ") dimensions.";
    throw std::invalid_argument(msg.str());
  }
  // Leading update dimensions must equal the broadcast index shape.
  for (int i = 0; i < idx_shape.size(); ++i) {
    if (updates.shape(i) != idx_shape[i]) {
      std::ostringstream msg;
      msg << "[scatter] Update shape " << updates.shape()
          << " is not valid for broadcasted index shape " << idx_shape << ".";
      throw std::invalid_argument(msg.str());
    }
  }
  // Trailing update dimensions must fit inside the target array.
  for (int i = 0; i < a.ndim(); ++i) {
    auto up_shape = updates.shape(i + idx_shape.size());
    if (up_shape > a.shape(i)) {
      std::ostringstream msg;
      msg << "[scatter] Updates with shape " << updates.shape()
          << " are too large for array with shape " << a.shape() << ".";
      throw std::invalid_argument(msg.str());
    }
  }

  // Promote indices to the same type
  auto dtype = result_type(indices);
  if (issubdtype(dtype, inexact)) {
    throw std::invalid_argument(
        "[scatter] Got indices with invalid dtype. Indices must be integral.");
  }
  for (auto& idx : inputs) {
    idx = astype(idx, dtype, s);
  }

  // TODO, remove when scatter supports 64-bit outputs
  if (to_stream(s).device == Device::gpu && size_of(a.dtype()) == 8) {
    std::ostringstream msg;
    msg << "[scatter] GPU scatter does not yet support " << a.dtype()
        << " for the input or updates.";
    throw std::invalid_argument(msg.str());
  }

  // Primitive inputs are ordered: target array, index arrays, updates.
  inputs.insert(inputs.begin(), a);
  inputs.push_back(astype(updates, a.dtype(), s));

  return array(
      a.shape(),
      a.dtype(),
      std::make_shared<Scatter>(to_stream(s), mode, axes),
      std::move(inputs));
}

/** scatter with Scatter::None as the reduce mode. */
array scatter(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s /*= {}*/) {
  const auto mode = Scatter::None;
  return scatter(a, indices, updates, axes, mode, s);
}

/** scatter with Scatter::Sum as the reduce mode. */
array scatter_add(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s /*= {}*/) {
  const auto mode = Scatter::Sum;
  return scatter(a, indices, updates, axes, mode, s);
}

/** scatter with Scatter::Prod as the reduce mode. */
array scatter_prod(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s /*= {}*/) {
  const auto mode = Scatter::Prod;
  return scatter(a, indices, updates, axes, mode, s);
}

/** scatter with Scatter::Max as the reduce mode. */
array scatter_max(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s /*= {}*/) {
  const auto mode = Scatter::Max;
  return scatter(a, indices, updates, axes, mode, s);
}

/** scatter with Scatter::Min as the reduce mode. */
array scatter_min(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s /*= {}*/) {
  const auto mode = Scatter::Min;
  return scatter(a, indices, updates, axes, mode, s);
}

/**
 * Build a MaskedScatter node that scatters `value` into `a` under a boolean
 * `mask`.
 *
 * The mask shape must equal the leading dimensions of a. The mask is
 * broadcast to a's full shape, and value (cast to a's dtype) is broadcast so
 * that its trailing dimensions match a's unmasked trailing dimensions. All
 * three arrays get a leading singleton dimension before the primitive is
 * attached, and that dimension is squeezed from the result.
 *
 * Throws std::invalid_argument when the mask is not boolean, has more
 * dimensions than a, does not match the leading shape of a, or when value has
 * more than (a.ndim() - mask.ndim()) + 1 dimensions.
 */
array masked_scatter(
    const array& a,
    const array& mask,
    const array& value,
    StreamOrDevice s /* =  {} */) {
  if (mask.dtype() != bool_) {
    throw std::invalid_argument("[masked_scatter] The mask has to be boolean.");
  }

  if (mask.ndim() > a.ndim()) {
    throw std::invalid_argument(
        "[masked_scatter] The mask cannot have more dimensions than the target.");
  }

  // Number of trailing dimensions of a that the mask does not cover.
  int unmasked_dims = a.ndim() - mask.ndim();

  if (value.ndim() > unmasked_dims + 1) {
    std::ostringstream msg;
    msg << "[masked_scatter] Value array shape must be broadcastable with the last "
        << unmasked_dims << " dimensions of the input.";
    throw std::invalid_argument(msg.str());
  }

  // Check if the start of the mask is compatible
  if (!std::equal(
          mask.shape().begin(), mask.shape().end(), a.shape().begin())) {
    std::ostringstream msg;
    msg << "[masked_scatter] The boolean mask should have the same shape as the "
        << "beginning of the indexed array but the mask has shape "
        << mask.shape() << " and the array has shape " << a.shape();
    throw std::invalid_argument(msg.str());
  }

  array expanded_mask = mask;
  array expanded_value = astype(value, a.dtype(), s);

  // Broadcast both the mask with the last unmasked_dims of a
  if (unmasked_dims > 0) {
    // Pad the mask shape with trailing ones up to a's rank, then broadcast.
    auto mask_shape = mask.shape();
    while (mask_shape.size() < a.ndim()) {
      mask_shape.push_back(1);
    }
    expanded_mask = broadcast_to(reshape(mask, mask_shape, s), a.shape(), s);
  }

  // Broadcast the value with the unmasked dims plus one extra dimension of
  // size mask.size(). If that dim is already provided leave it as is.
  if (value.ndim() < unmasked_dims + 1) {
    // Prepend ones so value has exactly unmasked_dims + 1 dimensions.
    Shape value_shape(unmasked_dims + 1 - value.ndim(), 1);
    value_shape.insert(
        value_shape.end(), value.shape().begin(), value.shape().end());
    expanded_value = reshape(expanded_value, value_shape, s);

    // Target shape: [mask.size(), <trailing unmasked dimensions of a>].
    value_shape[0] = mask.size();
    for (int i = 1; i < unmasked_dims + 1; i++) {
      // Negative index: counts from the end of a's shape.
      value_shape[i] = a.shape(i - unmasked_dims - 1);
    }
    expanded_value = broadcast_to(expanded_value, value_shape, s);
  } else if (!std::equal(
                 value.shape().begin() + 1,
                 value.shape().end(),
                 a.shape().end() - unmasked_dims)) {
    // value already has the leading dimension; only its trailing dimensions
    // need broadcasting to a's unmasked trailing dimensions.
    auto value_shape = value.shape();
    for (int i = 1; i < unmasked_dims + 1; i++) {
      value_shape[i] = a.shape(i - unmasked_dims - 1);
    }
    expanded_value = broadcast_to(expanded_value, value_shape, s);
  }

  // Add a leading singleton dimension to every input of the primitive.
  array expanded_a = expand_dims(a, 0, s);
  expanded_mask = expand_dims(expanded_mask, 0, s);
  expanded_value = expand_dims(expanded_value, 0, s);

  // Drop the leading singleton dimension again on the way out.
  return squeeze(
      array(
          expanded_a.shape(),
          expanded_a.dtype(),
          std::make_shared<MaskedScatter>(to_stream(s)),
          {expanded_a, expanded_mask, expanded_value}),
      0,
      s);
}

/** Sqrt primitive applied to a after casting it to at_least_float(a.dtype()). */
array sqrt(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  auto input = astype(a, out_type, s);
  return array(
      a.shape(), out_type, std::make_shared<Sqrt>(to_stream(s)), {input});
}

/**
 * Sqrt primitive constructed with its second argument set to true, applied to
 * a after casting it to at_least_float(a.dtype()).
 */
array rsqrt(const array& a, StreamOrDevice s /* = {} */) {
  const auto out_type = at_least_float(a.dtype());
  auto input = astype(a, out_type, s);
  return array(
      a.shape(), out_type, std::make_shared<Sqrt>(to_stream(s), true), {input});
}

/**
 * Softmax of a over the given axes.
 *
 * An empty array is returned unchanged. When the reduction is effectively
 * over the last dimension only and the input is not complex, the Softmax
 * primitive is used; otherwise the result is composed from max, subtract,
 * exp, sum and divide, and cast back to a's dtype.
 *
 * Throws std::invalid_argument for a 0-dimensional input with non-empty axes.
 */
array softmax(
    const array& a,
    const std::vector<int>& axes,
    bool precise /* = false */,
    StreamOrDevice s /* = {}*/) {
  if (a.size() == 0) {
    return a;
  }
  if (a.ndim() == 0 && !axes.empty()) {
    throw std::invalid_argument(
        "[softmax] Received non-empty axes for array with 0 dimensions.");
  }
  // Candidate for the primitive path: the last listed axis is the last
  // dimension (given either as ndim - 1 or as -1).
  bool reduce_last_dim =
      !axes.empty() && (axes.back() == a.ndim() - 1 || axes.back() == -1);
  if (reduce_last_dim) {
    // For more than 2 axes check if axes is [0, 1, ..., NDIM - 1] and shape
    // is [1, 1, ..., N].
    for (int i = axes.size() - 2; i >= 0; --i) {
      if ((axes[i] + 1 != axes[i + 1]) || (a.shape(axes[i]) != 1)) {
        reduce_last_dim = false;
        break;
      }
    }
  }
  bool is_complex = issubdtype(a.dtype(), complexfloating);
  if (!is_complex && reduce_last_dim) {
    auto dtype = at_least_float(a.dtype());
    return array(
        a.shape(),
        dtype,
        std::make_shared<Softmax>(to_stream(s), precise),
        {astype(a, dtype, s)});
  } else {
    auto in = a;
    // In precise mode the composed path computes in float32 (non-complex
    // inputs only).
    if (precise && !is_complex) {
      in = astype(a, float32, s);
    }
    // The max is subtracted before exponentiating and is wrapped in
    // stop_gradient.
    auto a_max = stop_gradient(max(in, axes, /*keepdims = */ true, s), s);
    auto ex = exp(subtract(in, a_max, s), s);
    return astype(
        divide(ex, sum(ex, axes, /*keepdims = */ true, s), s), a.dtype(), s);
  }
}

/** Softmax over every axis of a: forwards axes [0, ..., a.ndim() - 1]. */
array softmax(
    const array& a,
    bool precise /* = false */,
    StreamOrDevice s /* = {}*/) {
  const int rank = static_cast<int>(a.ndim());
  std::vector<int> all_axes;
  all_axes.reserve(rank);
  for (int ax = 0; ax < rank; ++ax) {
    all_axes.push_back(ax);
  }
  return softmax(a, all_axes, precise, s);
}

/**
 * Power primitive over (a, b).
 *
 * Operands are cast to promote_types(a, b); they are broadcast against each
 * other only when their shapes differ.
 */
array power(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto out_type = promote_types(a.dtype(), b.dtype());
  std::vector<array> operands = {
      astype(a, out_type, s), astype(b, out_type, s)};
  const bool same_shape = (a.shape() == b.shape());
  if (!same_shape) {
    operands = broadcast_arrays(operands, s);
  }
  return array(
      operands[0].shape(),
      out_type,
      std::make_shared<Power>(to_stream(s)),
      operands);
}

/**
 * Scan primitive with ReduceType::Sum along `axis`.
 *
 * Negative axes count from the end. Boolean inputs produce an int32 output;
 * every other dtype is preserved.
 *
 * Throws std::invalid_argument when axis is outside [-ndim, ndim).
 */
array cumsum(
    const array& a,
    int axis,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  const int rank = a.ndim();
  const bool in_range = (axis >= -rank) && (axis < rank);
  if (!in_range) {
    std::ostringstream msg;
    msg << "[cumsum] Axis " << axis << " is out of bounds for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  // Map a negative axis onto its non-negative equivalent.
  if (axis < 0) {
    axis += rank;
  }
  const auto out_type = a.dtype() == bool_ ? int32 : a.dtype();
  auto scan = std::make_shared<Scan>(
      to_stream(s), Scan::ReduceType::Sum, axis, reverse, inclusive);
  return array(a.shape(), out_type, std::move(scan), {a});
}

/** Axis-less cumsum: flattens a and scans along axis 0 of the result. */
array cumsum(
    const array& a,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  const auto stream = to_stream(s);
  auto flat = flatten(a, stream);
  return cumsum(flat, 0, reverse, inclusive, stream);
}

/**
 * Scan primitive with ReduceType::Prod along `axis`; shape and dtype of a are
 * preserved. Negative axes count from the end.
 *
 * Throws std::invalid_argument when axis is outside [-ndim, ndim).
 */
array cumprod(
    const array& a,
    int axis,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  const int rank = a.ndim();
  const bool in_range = (axis >= -rank) && (axis < rank);
  if (!in_range) {
    std::ostringstream msg;
    msg << "[cumprod] Axis " << axis << " is out of bounds for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  // Map a negative axis onto its non-negative equivalent.
  if (axis < 0) {
    axis += rank;
  }
  auto scan = std::make_shared<Scan>(
      to_stream(s), Scan::ReduceType::Prod, axis, reverse, inclusive);
  return array(a.shape(), a.dtype(), std::move(scan), {a});
}

/** Axis-less cumprod: flattens a and scans along axis 0 of the result. */
array cumprod(
    const array& a,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  auto flat = flatten(a, s);
  return cumprod(flat, 0, reverse, inclusive, s);
}

/**
 * Scan primitive with ReduceType::Max along `axis`; shape and dtype of a are
 * preserved. Negative axes count from the end.
 *
 * Throws std::invalid_argument when axis is outside [-ndim, ndim).
 */
array cummax(
    const array& a,
    int axis,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  const int rank = a.ndim();
  const bool in_range = (axis >= -rank) && (axis < rank);
  if (!in_range) {
    std::ostringstream msg;
    msg << "[cummax] Axis " << axis << " is out of bounds for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  // Map a negative axis onto its non-negative equivalent.
  if (axis < 0) {
    axis += rank;
  }
  auto scan = std::make_shared<Scan>(
      to_stream(s), Scan::ReduceType::Max, axis, reverse, inclusive);
  return array(a.shape(), a.dtype(), std::move(scan), {a});
}

/** Axis-less cummax: flattens a and scans along axis 0 of the result. */
array cummax(
    const array& a,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  auto flat = flatten(a, s);
  return cummax(flat, 0, reverse, inclusive, s);
}

/**
 * Scan primitive with ReduceType::Min along `axis`; shape and dtype of a are
 * preserved. Negative axes count from the end.
 *
 * Throws std::invalid_argument when axis is outside [-ndim, ndim).
 */
array cummin(
    const array& a,
    int axis,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  const int rank = a.ndim();
  const bool in_range = (axis >= -rank) && (axis < rank);
  if (!in_range) {
    std::ostringstream msg;
    msg << "[cummin] Axis " << axis << " is out of bounds for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  // Map a negative axis onto its non-negative equivalent.
  if (axis < 0) {
    axis += rank;
  }
  auto scan = std::make_shared<Scan>(
      to_stream(s), Scan::ReduceType::Min, axis, reverse, inclusive);
  return array(a.shape(), a.dtype(), std::move(scan), {a});
}

/** Axis-less cummin: flattens a and scans along axis 0 of the result. */
array cummin(
    const array& a,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  auto flat = flatten(a, s);
  return cummin(flat, 0, reverse, inclusive, s);
}

/**
 * Scan primitive with ReduceType::LogAddExp along `axis`; shape and dtype of
 * a are preserved. Negative axes count from the end.
 *
 * Throws std::invalid_argument when axis is outside [-ndim, ndim).
 */
array logcumsumexp(
    const array& a,
    int axis,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  const int rank = a.ndim();
  const bool in_range = (axis >= -rank) && (axis < rank);
  if (!in_range) {
    std::ostringstream msg;
    msg << "[logcumsumexp] Axis " << axis << " is out of bounds for array with "
        << a.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }
  // Map a negative axis onto its non-negative equivalent.
  if (axis < 0) {
    axis += rank;
  }
  auto scan = std::make_shared<Scan>(
      to_stream(s), Scan::ReduceType::LogAddExp, axis, reverse, inclusive);
  return array(a.shape(), a.dtype(), std::move(scan), {a});
}

/** Axis-less logcumsumexp: flattens a and scans along axis 0 of the result. */
array logcumsumexp(
    const array& a,
    bool reverse /* = false*/,
    bool inclusive /* = true*/,
    StreamOrDevice s /* = {}*/) {
  const auto stream = to_stream(s);
  auto flat = flatten(a, stream);
  return logcumsumexp(flat, 0, reverse, inclusive, stream);
}

/** Convolution operations */

namespace {

// Validates the input/weight pair of an `n_dim`-dimensional convolution.
//
// Checks, in order:
//   1. `in` has a floating point dtype;
//   2. `in` and `wt` both have `n_dim + 2` dimensions;
//   3. `groups` is a positive integer;
//   4. the input channels (last axis of `in`) are divisible by `groups`;
//   5. for `groups > 1`, the output channels (first axis of `wt`) are
//      divisible by `groups`;
//   6. the input channels equal `groups * wt.shape(n_dim + 1)`.
//
// Throws std::invalid_argument describing the first violated condition.
inline void
run_conv_checks(const array& in, const array& wt, int n_dim, int groups) {
  if (!issubdtype(in.dtype(), floating)) {
    std::ostringstream msg;
    msg << "[conv] Invalid input array with type " << in.dtype() << "."
        << " Convolution currently only supports floating point types";
    throw std::invalid_argument(msg.str());
  }

  if (in.ndim() != n_dim + 2) {
    std::ostringstream msg;
    msg << "[conv] Invalid input array with " << in.ndim() << " dimensions for "
        << n_dim << "D convolution. Expected an array with " << n_dim + 2
        << " dimensions following the format [N, ..., C_in].";
    throw std::invalid_argument(msg.str());
  }

  if (wt.ndim() != n_dim + 2) {
    std::ostringstream msg;
    msg << "[conv] Invalid weight array with " << wt.ndim()
        << " dimensions for " << n_dim << "D convolution."
        << " Expected an array with " << n_dim + 2
        << " dimensions following the format [C_out, ..., C_in].";
    throw std::invalid_argument(msg.str());
  }

  // `groups` is used as a divisor below, so reject non-positive values up
  // front: `groups == 0` would otherwise be an integer division by zero.
  if (groups < 1) {
    std::ostringstream msg;
    msg << "[conv] The number of groups must be a positive integer but got "
        << groups << ".";
    throw std::invalid_argument(msg.str());
  }

  if (in.shape(n_dim + 1) % groups != 0) {
    std::ostringstream msg;
    msg << "[conv] The input channels must be divisible by the number"
        << " of groups. Got input with shape " << in.shape() << " and "
        << groups << " groups.";
    throw std::invalid_argument(msg.str());
  }

  if (groups > 1 && wt.shape(0) % groups != 0) {
    std::ostringstream msg;
    msg << "[conv] If groups > 1, the output channels must be divisible by the number"
        << " of groups. Got " << wt.shape(0) << " output channels and "
        << groups << " groups.";
    throw std::invalid_argument(msg.str());
  }

  // Each group sees `wt.shape(n_dim + 1)` input channels, so the input must
  // provide exactly `groups` times that many.
  if (in.shape(n_dim + 1) != (groups * wt.shape(n_dim + 1))) {
    std::ostringstream msg;
    if (groups == 1) {
      msg << "[conv] Expect the input channels in the input"
          << " and weight array to match but got shapes -"
          << " input: " << in.shape() << " and weight: " << wt.shape();

    } else {
      msg << "Given groups=" << groups << " and weights of shape " << wt.shape()
          << ", expected to have " << (groups * wt.shape(n_dim + 1))
          << " input channels but got " << in.shape(n_dim + 1)
          << " input channels instead.";
    }
    throw std::invalid_argument(msg.str());
  }
}

} // namespace

/**
 * 1D convolution with a filter.
 *
 * Forwards to `conv_general` with single-element parameter vectors, an input
 * dilation of 1 and no kernel flipping.
 */
array conv1d(
    const array& in_,
    const array& wt_,
    int stride /* = 1 */,
    int padding /* = 0 */,
    int dilation /* = 1 */,
    int groups /* = 1 */,
    StreamOrDevice s /* = {} */) {
  std::vector<int> strides = {stride};
  std::vector<int> paddings = {padding};
  std::vector<int> kernel_dilation = {dilation};
  std::vector<int> input_dilation = {1};
  return conv_general(
      in_,
      wt_,
      std::move(strides),
      std::move(paddings),
      std::move(kernel_dilation),
      std::move(input_dilation),
      groups,
      /* flip = */ false,
      s);
}

/**
 * 2D convolution with a filter.
 *
 * Unpacks the (first, second) pairs into two-element parameter vectors and
 * forwards to `conv_general` with an input dilation of 1 along both axes and
 * no kernel flipping.
 */
array conv2d(
    const array& in_,
    const array& wt_,
    const std::pair<int, int>& stride /* = {1, 1} */,
    const std::pair<int, int>& padding /* = {0, 0} */,
    const std::pair<int, int>& dilation /* = {1, 1} */,
    int groups /* = 1 */,
    StreamOrDevice s /* = {} */) {
  std::vector<int> strides = {stride.first, stride.second};
  std::vector<int> paddings = {padding.first, padding.second};
  std::vector<int> kernel_dilation = {dilation.first, dilation.second};
  std::vector<int> input_dilation = {1, 1};
  return conv_general(
      in_,
      wt_,
      std::move(strides),
      std::move(paddings),
      std::move(kernel_dilation),
      std::move(input_dilation),
      groups,
      /* flip = */ false,
      s);
}

/**
 * 3D convolution with a filter.
 *
 * Unpacks the three-element tuples into parameter vectors and forwards to
 * `conv_general` with an input dilation of 1 along every axis and no kernel
 * flipping.
 */
array conv3d(
    const array& in_,
    const array& wt_,
    const std::tuple<int, int, int>& stride /* = {1, 1, 1} */,
    const std::tuple<int, int, int>& padding /* = {0, 0, 0} */,
    const std::tuple<int, int, int>& dilation /* = {1, 1, 1} */,
    int groups /* = 1 */,
    StreamOrDevice s /* = {} */) {
  const auto& [stride0, stride1, stride2] = stride;
  const auto& [pad0, pad1, pad2] = padding;
  const auto& [dil0, dil1, dil2] = dilation;

  std::vector<int> strides = {stride0, stride1, stride2};
  std::vector<int> paddings = {pad0, pad1, pad2};
  std::vector<int> kernel_dilation = {dil0, dil1, dil2};
  std::vector<int> input_dilation = {1, 1, 1};
  return conv_general(
      in_,
      wt_,
      std::move(strides),
      std::move(paddings),
      std::move(kernel_dilation),
      std::move(input_dilation),
      groups,
      /* flip = */ false,
      s);
}

// Helper function for transposed convolutions.
//
// A transposed convolution is expressed as a regular `conv_general` call with
// unit stride, the requested stride used as the input dilation, a flipped
// kernel, and explicit low/high paddings derived below.
//
// `stride`, `dilation` and `output_padding` are indexed with the indices of
// `padding`, so all four vectors are expected to have the same length (one
// entry per spatial dimension).
array conv_transpose_general(
    const array& input,
    const array& weight,
    std::vector<int> stride,
    std::vector<int> padding,
    std::vector<int> dilation,
    std::vector<int> output_padding,
    int groups,
    StreamOrDevice s) {
  const int n_spatial = static_cast<int>(padding.size());
  std::vector<int> padding_lo(padding.size());
  std::vector<int> padding_hi(padding.size());
  for (int i = 0; i < n_spatial; ++i) {
    // Extent of the dilated kernel along this spatial axis.
    int wt_size = 1 + dilation[i] * (weight.shape(1 + i) - 1);
    padding_lo[i] = wt_size - padding[i] - 1;

    // Output size of the transposed convolution before output_padding.
    int conv_output_shape = (input.shape(i + 1) - 1) * stride[i] -
        2 * padding[i] + dilation[i] * (weight.shape(i + 1) - 1) + 1;

    int in_size = 1 + (conv_output_shape - 1);
    // Extent of the input once dilated by the stride.
    int out_size = 1 + stride[i] * (input.shape(1 + i) - 1);
    padding_hi[i] = in_size - out_size + padding[i] +
        output_padding[i]; // Adjust with output_padding
  }

  // Build the unit stride before the call: `stride` is moved into the
  // input_dilation argument below, and the evaluation order of function
  // arguments is unspecified, so reading `stride.size()` inside the call
  // could observe a moved-from vector.
  std::vector<int> unit_stride(stride.size(), 1);

  return conv_general(
      /* const array& input = */ input,
      /* const array& weight = */ weight,
      /* std::vector<int> stride = */ std::move(unit_stride),
      /* std::vector<int> padding_lo = */ std::move(padding_lo),
      /* std::vector<int> padding_hi = */ std::move(padding_hi),
      /* std::vector<int> kernel_dilation = */ std::move(dilation),
      /* std::vector<int> input_dilation = */ std::move(stride),
      /* int groups = */ groups,
      /* bool flip = */ true,
      s);
}

/**
 * 1D transposed convolution with a filter.
 *
 * Wraps each scalar parameter in a single-element vector and forwards to
 * `conv_transpose_general`.
 */
array conv_transpose1d(
    const array& in_,
    const array& wt_,
    int stride /* = 1 */,
    int padding /* = 0 */,
    int dilation /* = 1 */,
    int output_padding /* = 0 */,
    int groups /* = 1 */,
    StreamOrDevice s /* = {} */) {
  std::vector<int> strides = {stride};
  std::vector<int> paddings = {padding};
  std::vector<int> dilations = {dilation};
  std::vector<int> output_paddings = {output_padding};
  return conv_transpose_general(
      in_,
      wt_,
      std::move(strides),
      std::move(paddings),
      std::move(dilations),
      std::move(output_paddings),
      groups,
      s);
}

/**
 * 2D transposed convolution with a filter.
 *
 * Unpacks the (first, second) pairs into two-element vectors and forwards to
 * `conv_transpose_general`.
 */
array conv_transpose2d(
    const array& in_,
    const array& wt_,
    const std::pair<int, int>& stride /* = {1, 1} */,
    const std::pair<int, int>& padding /* = {0, 0} */,
    const std::pair<int, int>& dilation /* = {1, 1} */,
    const std::pair<int, int>& output_padding /* = {0, 0} */,
    int groups /* = 1 */,
    StreamOrDevice s /* = {} */) {
  std::vector<int> strides = {stride.first, stride.second};
  std::vector<int> paddings = {padding.first, padding.second};
  std::vector<int> dilations = {dilation.first, dilation.second};
  std::vector<int> output_paddings = {
      output_padding.first, output_padding.second};
  return conv_transpose_general(
      in_,
      wt_,
      std::move(strides),
      std::move(paddings),
      std::move(dilations),
      std::move(output_paddings),
      groups,
      s);
}

/**
 * 3D transposed convolution with a filter.
 *
 * Unpacks the three-element tuples into vectors and forwards to
 * `conv_transpose_general`.
 */
array conv_transpose3d(
    const array& in_,
    const array& wt_,
    const std::tuple<int, int, int>& stride /* = {1, 1, 1} */,
    const std::tuple<int, int, int>& padding /* = {0, 0, 0} */,
    const std::tuple<int, int, int>& dilation /* = {1, 1, 1} */,
    const std::tuple<int, int, int>& output_padding /* = {0, 0, 0} */,
    int groups /* = 1 */,
    StreamOrDevice s /* = {} */) {
  const auto& [stride0, stride1, stride2] = stride;
  const auto& [pad0, pad1, pad2] = padding;
  const auto& [dil0, dil1, dil2] = dilation;
  const auto& [out_pad0, out_pad1, out_pad2] = output_padding;

  std::vector<int> strides = {stride0, stride1, stride2};
  std::vector<int> paddings = {pad0, pad1, pad2};
  std::vector<int> dilations = {dil0, dil1, dil2};
  std::vector<int> output_paddings = {out_pad0, out_pad1, out_pad2};
  return conv_transpose_general(
      in_,
      wt_,
      std::move(strides),
      std::move(paddings),
      std::move(dilations),
      std::move(output_paddings),
      groups,
      s);
}

/**
 * General convolution with a filter.
 *
 * Steps:
 *   - validate the input/weight pair (`run_conv_checks`);
 *   - cast both operands to their promoted dtype;
 *   - expand every per-dimension parameter given with zero or one entry to
 *     one entry per spatial dimension (missing values default to 1 for
 *     strides and dilations and to 0 for paddings);
 *   - turn negative paddings into a slice of the input;
 *   - build the `Convolution` primitive with the resulting parameters.
 */
array conv_general(
    array in,
    array wt,
    std::vector<int> stride /* = {} */,
    std::vector<int> padding_lo /* = {} */,
    std::vector<int> padding_hi /* = {} */,
    std::vector<int> kernel_dilation /* = {} */,
    std::vector<int> input_dilation /* = {} */,
    int groups /* = 1 */,
    bool flip /* = false */,
    StreamOrDevice s /* = {} */) {
  // Grouped convolutions are restricted to 3D and 4D inputs.
  const bool is_grouped = groups != 1;
  const bool is_1d_or_2d_input = in.ndim() == 3 || in.ndim() == 4;
  if (is_grouped && !is_1d_or_2d_input) {
    throw std::invalid_argument(
        "[conv] Can only handle groups != 1 in 1D or 2D convolutions.");
  }

  int spatial_dims = in.ndim() - 2;

  if (spatial_dims < 1 || spatial_dims > 3) {
    throw std::invalid_argument(
        "[conv] Only works for inputs with 1-3 spatial dimensions."
        " The inputs must be in the format [N, ..., C_in]");
  }

  // Shape / dtype / group consistency checks
  run_conv_checks(in, wt, spatial_dims, groups);

  // Type promotion
  auto out_type = promote_types(in.dtype(), wt.dtype());
  in = astype(in, out_type, s);
  wt = astype(wt, out_type, s);

  // Expand a parameter with at most one entry to one entry per spatial
  // dimension. Parameters with more than one entry are left untouched.
  auto expand_to_spatial = [spatial_dims](
                               std::vector<int>& values, int default_value) {
    if (values.size() > 1) {
      return;
    }
    const int fill = values.empty() ? default_value : values[0];
    values = std::vector<int>(spatial_dims, fill);
  };
  expand_to_spatial(stride, 1);
  expand_to_spatial(padding_lo, 0);
  expand_to_spatial(padding_hi, 0);
  expand_to_spatial(kernel_dilation, 1);
  expand_to_spatial(input_dilation, 1);

  auto has_negative = [](const std::vector<int>& values) {
    for (int v : values) {
      if (v < 0) {
        return true;
      }
    }
    return false;
  };

  // Negative padding is implemented by cropping the input along the
  // corresponding spatial axis and resetting that padding to zero.
  if (has_negative(padding_lo) || has_negative(padding_hi)) {
    Shape starts(in.ndim(), 0);
    auto stops = in.shape();

    for (int d = 0; d < spatial_dims; ++d) {
      const int axis = d + 1; // axis 0 is skipped, spatial axes follow it
      if (padding_lo[d] < 0) {
        starts[axis] -= padding_lo[d];
        padding_lo[d] = 0;
      }

      if (padding_hi[d] < 0) {
        stops[axis] += padding_hi[d];
        padding_hi[d] = 0;
      }
    }

    in = slice(in, std::move(starts), std::move(stops), s);
  }

  // Get output shapes
  auto out_shape = Convolution::conv_out_shape(
      in.shape(),
      wt.shape(),
      stride,
      padding_lo,
      padding_hi,
      kernel_dilation,
      input_dilation);

  return array(
      std::move(out_shape),
      in.dtype(),
      std::make_shared<Convolution>(
          to_stream(s),
          stride,
          padding_lo,
          padding_hi,
          kernel_dilation,
          input_dilation,
          groups,
          flip),
      {in, wt});
}

// Resolves the (group_size, bits) pair for a quantization mode.
//
// Explicitly provided values are returned unchanged; a missing value falls
// back to the per-mode default:
//   Affine -> (64, 4), Nvfp4 -> (16, 4), Mxfp4 -> (32, 4), Mxfp8 -> (32, 8).
std::pair<int, int> quantization_params_from_mode(
    QuantizationMode mode,
    std::optional<int> group_size_,
    std::optional<int> bits_) {
  int fallback_group_size;
  int fallback_bits;
  switch (mode) {
    case QuantizationMode::Affine:
      fallback_group_size = 64;
      fallback_bits = 4;
      break;
    case QuantizationMode::Nvfp4:
      fallback_group_size = 16;
      fallback_bits = 4;
      break;
    case QuantizationMode::Mxfp4:
      fallback_group_size = 32;
      fallback_bits = 4;
      break;
    case QuantizationMode::Mxfp8:
      fallback_group_size = 32;
      fallback_bits = 8;
      break;
  }
  const int group_size = group_size_ ? *group_size_ : fallback_group_size;
  const int bits = bits_ ? *bits_ : fallback_bits;
  return {group_size, bits};
}

// Parses `mode` and validates the scales/biases/output dtype against it.
//
// Returns the dtype to use together with the parsed mode:
//   - affine: `out_type` if given, otherwise the result type of the scales
//     and biases (which must be floating point; biases are required);
//   - other modes: `out_type` if given, otherwise bfloat16. Scales must be
//     uint8 and biases must not be provided.
//
// Throws std::invalid_argument (prefixed with "[tag]") on any violation.
std::pair<Dtype, QuantizationMode> validate_mode_with_type(
    std::string_view tag,
    const array& scales,
    const std::optional<array>& biases,
    const std::optional<Dtype> out_type,
    const std::string& mode) {
  auto qmode = string_to_quantization_mode(mode, tag);

  const bool has_out_type = out_type.has_value();
  if (has_out_type && !issubdtype(*out_type, floating)) {
    std::ostringstream msg;
    msg << "[" << tag << "] Only real floating types are supported but "
        << "output dtype == " << *out_type << ".";
    throw std::invalid_argument(msg.str());
  }

  if (qmode == QuantizationMode::Affine) {
    if (!biases) {
      std::ostringstream msg;
      msg << "[" << tag << "] Biases must be provided for affine quantization.";
      throw std::invalid_argument(msg.str());
    }
    auto dtype = result_type(scales, *biases);
    if (!issubdtype(dtype, floating)) {
      std::ostringstream msg;
      msg << "[" << tag << "] Only real floating types are supported but "
          << "scales.dtype() == " << scales.dtype()
          << " and biases.dtype() == " << biases->dtype() << ".";
      throw std::invalid_argument(msg.str());
    }
    if (has_out_type) {
      return {*out_type, qmode};
    }
    return {dtype, qmode};
  }

  // Non-affine modes: scale dtype is checked before the presence of biases.
  if (scales.dtype() != uint8) {
    std::ostringstream msg;
    msg << "[" << tag << "] Scale type must be uint8 but received type "
        << scales.dtype() << ".";
    throw std::invalid_argument(msg.str());
  }
  if (biases) {
    std::ostringstream msg;
    msg << "[" << tag << "] Biases must be null for quantization mode '" << mode
        << "'.";
    throw std::invalid_argument(msg.str());
  }
  if (has_out_type) {
    return {*out_type, qmode};
  }
  return {bfloat16, qmode};
}

// Validates an optional global scale.
//
// Nothing is checked when no global scale is given. Otherwise the mode must
// be nvfp4, and the scale must hold exactly one element of dtype float32.
// Throws std::invalid_argument (prefixed with "[tag]") on any violation.
void validate_global_scale(
    std::string_view tag,
    QuantizationMode qmode,
    const std::optional<array>& global_scale) {
  if (!global_scale.has_value()) {
    return;
  }

  if (qmode != QuantizationMode::Nvfp4) {
    std::ostringstream msg;
    msg << "[" << tag << "] Global scale is only supported for 'nvfp4' "
        << "quantization mode.";
    throw std::invalid_argument(msg.str());
  }

  if (global_scale->size() != 1) {
    std::ostringstream msg;
    msg << "[" << tag << "] Global scale must be a scalar but got shape "
        << global_scale->shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  // TODO: not sure if type should be restricted to float32
  if (global_scale->dtype() != float32) {
    std::ostringstream msg;
    msg << "[" << tag << "] Global scale must have dtype float32 but got "
        << global_scale->dtype() << ".";
    throw std::invalid_argument(msg.str());
  }
}

// Matrix multiplication of `x` with a quantized matrix `w`.
//
// The mode string, scales and biases are validated by
// `validate_mode_with_type`, missing `group_size_` / `bits_` are filled from
// the mode defaults, and the shapes are checked by
// `extract_quantized_matmul_dims`.
//
// Output dtype:
//   - affine: promotion of `x.dtype()` with the scales/biases result type;
//     `x`, `scales` and `biases` are cast to it;
//   - other modes: `x.dtype()`; inputs are passed through unchanged.
//
// Throws std::invalid_argument if the resulting dtype is not floating point
// (or if any of the validation helpers rejects the inputs).
array quantized_matmul(
    array x,
    array w,
    array scales,
    std::optional<array> biases /* = std::nullopt */,
    bool transpose /* = true */,
    std::optional<int> group_size_ /* = std::nullopt */,
    std::optional<int> bits_ /* = std::nullopt */,
    const std::string& mode /* = "affine" */,
    StreamOrDevice s /* = {} */) {
  auto [dtype, qmode] = validate_mode_with_type(
      "quantized_matmul", scales, biases, std::nullopt, mode);

  auto [group_size, bits] =
      quantization_params_from_mode(qmode, group_size_, bits_);
  // Check and extract the quantized matrix shape against x
  auto [w_inner_dims, w_outer_dims] = extract_quantized_matmul_dims(
      "quantized_matmul", x, w, scales, biases, transpose, group_size, bits);

  if (qmode == QuantizationMode::Affine) {
    dtype = promote_types(x.dtype(), dtype);
  } else {
    dtype = x.dtype();
  }

  if (!issubdtype(dtype, floating)) {
    std::ostringstream msg;
    msg << "[quantized_matmul] Only real floating types are supported but "
        << "x.dtype() == " << x.dtype() << ".";
    throw std::invalid_argument(msg.str());
  }
  std::vector<array> inputs;
  if (qmode == QuantizationMode::Affine) {
    // Pass `s` to the casts so they run on the requested stream/device like
    // the matmul itself, rather than on the default stream.
    inputs = {
        astype(x, dtype, s),
        w,
        astype(scales, dtype, s),
        astype(*biases, dtype, s)};
  } else {
    inputs = {x, w, scales};
  }

  // Broadcast the leading dimensions only when both operands carry them.
  if (x.ndim() > 2 && w.ndim() > 2) {
    inputs = broadcast_arrays(inputs, {-2, -1}, s);
  }
  auto out_shape = inputs[0].shape();
  out_shape.back() = w_outer_dims;
  return array(
      std::move(out_shape),
      dtype,
      std::make_shared<QuantizedMatmul>(
          to_stream(s), group_size, bits, qmode, transpose),
      std::move(inputs));
}

// Validates the operands of `qqmm`.
//
// Checks, in order:
//   - neither `x` nor `w` has more than 2 dimensions;
//   - if `w` is uint32 (treated as already quantized), `scales_w` must be
//     provided and is checked against `w` by `validate_quantized_input`;
//     otherwise `w` must be a floating type other than float64;
//   - `x` must be a floating type other than float64;
//   - each global scale is valid for `qmode` (`validate_global_scale`);
//   - in nvfp4 mode the two global scales are given together or not at all.
//
// Throws std::invalid_argument describing the first violated condition.
void validate_qqmm_inputs(
    array x,
    array w,
    std::optional<array> scales_w,
    int group_size,
    int bits,
    std::optional<array> global_scale_x,
    std::optional<array> global_scale_w,
    QuantizationMode qmode) {
  // check 2D (for now)
  if (x.ndim() > 2 || w.ndim() > 2) {
    std::ostringstream msg;
    msg << "[qqmm] Only 2D inputs are supported but "
        << "x.ndim() == " << x.ndim() << " and "
        << "w.ndim() == " << w.ndim() << ".";
    throw std::invalid_argument(msg.str());
  }
  if (w.dtype() == uint32) {
    // if w is quantized, scales are provided
    if (!scales_w.has_value()) {
      throw std::invalid_argument(
          "[qqmm] Scales must be provided if second argument is quantized.");
    }
    // scales are provided, check compatibility with quantized w
    validate_quantized_input("qqmm", w, *scales_w, group_size, bits);
  }
  // if w is not quantized, dtype must be in {f16, bf16, fp32}
  else {
    if (!issubdtype(w.dtype(), floating) || w.dtype() == float64) {
      std::ostringstream msg;
      msg << "[qqmm] Only real floating types except float64 are supported but "
          << "second argument dtype == " << w.dtype() << ".";
      throw std::invalid_argument(msg.str());
    }
  }
  // x dtype must be in {f16, bf16, fp32}
  if (!issubdtype(x.dtype(), floating) || x.dtype() == float64) {
    std::ostringstream msg;
    msg << "[qqmm] Only real floating types except float64 are supported but "
        << "first argument dtype == " << x.dtype() << ".";
    throw std::invalid_argument(msg.str());
  }
  // validate global scales
  validate_global_scale("qqmm", qmode, global_scale_x);
  validate_global_scale("qqmm", qmode, global_scale_w);
  // For nvfp4 mode, both global scales must be provided together or neither
  if (qmode == QuantizationMode::Nvfp4) {
    bool has_x = global_scale_x.has_value();
    bool has_w = global_scale_w.has_value();
    if (has_x != has_w) {
      throw std::invalid_argument(
          "[qqmm] For nvfp4 mode, either both global_scale_x and "
          "global_scale_w must be provided, or neither.");
    }
  }
}

// Returns the (inner, outer) dimensions of `w` for `qqmm`.
//
// A uint32 `w` is treated as already quantized and the dimensions are
// obtained from `extract_quantized_matmul_dims` (transposed layout, no
// biases). Otherwise the last dimensions of `x` and `w` must match and the
// result is (w.shape(-1), w.shape(-2)).
std::pair<int, int> extract_qqmm_dims(
    array x,
    array w,
    std::optional<array> scales_w,
    int group_size,
    int bits) {
  const bool w_is_quantized = !(w.dtype() != uint32);
  if (w_is_quantized) {
    // if w is quantized, extract dims from quantized w
    return extract_quantized_matmul_dims(
        "qqmm",
        x,
        w,
        *scales_w,
        std::nullopt,
        /* transpose = */ true,
        group_size,
        bits);
  }

  // if w is not quantized, check that last dims match
  if (x.shape(-1) != w.shape(-1)) {
    std::ostringstream msg;
    msg << "[qqmm] Last dimension of first input with shape " << x.shape()
        << " must match last dimension of"
        << " second input with shape " << w.shape() << ".";
    throw std::invalid_argument(msg.str());
  }
  return std::make_pair(w.shape(-1), w.shape(-2));
}

// Matrix multiplication with `w` either already quantized (scales given) or
// not (no scales given). Only the 'nvfp4' and 'mxfp8' modes are accepted.
//
// A 1D `in_x` gets a leading singleton dimension that is squeezed out of the
// result again. An `in_x` with more than 2 dimensions is flattened over its
// leading dimensions when `w` is 2D, and the result is unflattened back to
// the original leading shape. The output has the dtype of `x`.
array qqmm(
    array in_x,
    array w,
    std::optional<array> scales_w,
    std::optional<int> group_size_ /* = std::nullopt */,
    std::optional<int> bits_ /* = std::nullopt */,
    const std::string& mode /* = "nvfp4" */,
    const std::optional<array> global_scale_x /* = std::nullopt */,
    const std::optional<array> global_scale_w /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto stream = to_stream(s);
  auto qmode = string_to_quantization_mode(mode, "qqmm");

  // cuBLAS block scaled matmul only supports nvfp4 and mxfp8
  const bool mode_supported =
      qmode == QuantizationMode::Nvfp4 || qmode == QuantizationMode::Mxfp8;
  if (!mode_supported) {
    std::ostringstream msg;
    msg << "[qqmm] Only 'nvfp4' and 'mxfp8' quantization modes are supported but '"
        << mode << "' was provided.";
    throw std::invalid_argument(msg.str());
  }

  // Two cases are handled below:
  // 1. w is quantized, scales is provided
  // 2. w is not quantized, scales is not provided
  auto [group_size, bits] =
      quantization_params_from_mode(qmode, group_size_, bits_);

  // Allow gemv and batched x by bringing x to two dimensions.
  const auto in_ndim = in_x.ndim();
  auto x = in_x;
  if (in_ndim == 1) {
    // Insert a singleton dim in the beginning
    x = expand_dims(x, 0, s);
  } else if (w.ndim() == 2 && in_ndim > 2) {
    x = flatten(x, 0, -2, s);
  }

  // validate inputs
  validate_qqmm_inputs(
      x, w, scales_w, group_size, bits, global_scale_x, global_scale_w, qmode);
  // validate and extract shapes
  auto [inner_dims, outer_dims] =
      extract_qqmm_dims(x, w, scales_w, group_size, bits);

  // Primitive inputs: x, w, then the optional scales and global scales.
  std::vector<array> inputs = {x, w};
  if (scales_w.has_value()) {
    inputs.push_back(*scales_w);
  }
  if (global_scale_x.has_value() && global_scale_w.has_value()) {
    inputs.push_back(*global_scale_x);
    inputs.push_back(*global_scale_w);
  }

  auto out_shape = inputs[0].shape();
  out_shape.back() = outer_dims;
  auto out = array(
      std::move(out_shape),
      x.dtype(), // output dtype is the same as x dtype
      std::make_shared<QQMatmul>(stream, group_size, bits, qmode),
      std::move(inputs));

  // Undo the reshaping applied to x above.
  if (in_ndim > 2) {
    auto leading_shape = in_x.shape();
    leading_shape.pop_back();
    out = unflatten(out, 0, std::move(leading_shape), s);
  } else if (in_ndim == 1) {
    out = squeeze(out, 0, s);
  }
  return out;
}

// Quantizes `packed_w` with the given scales/biases and packs the resulting
// codes into uint32 words.
//
// The values are mapped to round((w - biases) / scales), clipped to
// [0, 2**bits - 1] and cast to uint32. `packed_w` is updated in place and
// also returned.
array pack_and_quantize(
    array& packed_w,
    const array& scales,
    const array& biases,
    int bits,
    const Stream& s) {
  int codes_per_word = 32 / bits;
  array lowest(0, packed_w.dtype());
  array highest((1 << bits) - 1, packed_w.dtype()); // 2**bits - 1

  auto normalized = divide(subtract(packed_w, biases, s), scales, s);
  auto codes = clip(round(normalized, s), lowest, highest, s);
  packed_w = astype(codes, uint32, s);

  if (!is_power_of_2(bits)) {
    // This is slow but we have fast GPU/CPU versions of this function so we
    // shouldn't be here often.
    //
    // Split every code into its `bits` individual bits, regroup the bit
    // stream in chunks of 32 and reassemble each chunk into one word.
    packed_w = expand_dims(packed_w, /* axis= */ -1, s);
    auto bit_positions = arange(bits, uint32, s);
    packed_w = bitwise_and(
        right_shift(packed_w, bit_positions, s), array({1}, uint32), s);
    auto bit_shape = packed_w.shape();
    bit_shape[bit_shape.size() - 2] = -1;
    bit_shape.back() = 32;
    packed_w = reshape(packed_w, bit_shape, s);
    array word_shifts = arange(32, uint32, s);
    packed_w =
        sum(left_shift(packed_w, word_shifts, s),
            /* axis= */ -1,
            /* keepdims= */ false,
            s);
    return packed_w;
  }

  // Power-of-two widths: each word holds `codes_per_word` codes, placed by
  // multiplying with 2**(0, bits, 2*bits, ...) and summing.
  array place_values =
      power(array(2, uint32), arange(0, 32, bits, uint32, s), s);
  packed_w = reshape(packed_w, {packed_w.shape(0), -1, codes_per_word}, s);
  packed_w =
      sum(multiply(packed_w, place_values, s),
          /* axis= */ 2,
          /* keepdims= */ false,
          s);
  return packed_w;
}

// Affine quantization of `w` in groups of `group_size` along the last axis.
//
// Returns {packed weights (uint32), scales, biases}; scales and biases have
// the dtype of `w`. The packed last dimension is `w.shape(-1) * bits / 32`
// and the scales/biases last dimension is `w.shape(-1) / group_size`.
//
// Throws std::invalid_argument unless `group_size` is 32, 64 or 128 and
// `bits` is one of 2, 3, 4, 5, 6, 8.
//
// The result is produced by a `fast::Quantize` primitive which is given the
// op-based `fallback` below.
std::vector<array>
affine_quantize(const array& w, int group_size, int bits, StreamOrDevice s_) {
  auto s = to_stream(s_);
  if (group_size != 32 && group_size != 64 && group_size != 128) {
    std::ostringstream msg;
    msg << "[quantize] The requested group size " << group_size
        << " is not supported. The supported group sizes are 32, 64, and 128.";
    throw std::invalid_argument(msg.str());
  }

  if (bits < 2 || bits > 8 || bits == 7) {
    std::ostringstream msg;
    msg << "[quantize] The requested number of bits " << bits
        << " is not supported. The supported bits are 2, 3, 4, 5, 6 and 8.";
    throw std::invalid_argument(msg.str());
  }

  auto fallback = [group_size, bits, s](
                      const std::vector<array>& inputs) -> std::vector<array> {
    auto& w = inputs[0];
    auto wshape = w.shape();
    wshape.back() = -1;

    array zero(0, float32);
    array n_bins((1 << bits) - 1, float32); // 2**bits - 1
    array eps(1e-7, float32);

    // One row per quantization group.
    array packed_w = reshape(w, {-1, w.shape(-1) / group_size, group_size}, s);

    array w_max = max(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
    array w_min = min(packed_w, /* axis= */ -1, /* keepdims= */ true, s);
    w_max = astype(w_max, float32, s);
    w_min = astype(w_min, float32, s);

    // `edge` is whichever of min/max has the larger magnitude; the scale is
    // signed accordingly and floored at `eps` in magnitude.
    array mask = greater(abs(w_min, s), abs(w_max, s), s);
    array scales =
        maximum(divide(subtract(w_max, w_min, s), n_bins, s), eps, s);
    scales = where(mask, scales, negative(scales, s), s);
    array edge = where(mask, w_min, w_max, s);
    // Re-derive the scale so that `edge` lands exactly on the integer level
    // q0 (when q0 is non-zero). The stream is passed explicitly so this op
    // runs on `s` like every other op of the fallback.
    array q0 = round(divide(edge, scales, s), s);
    scales = where(not_equal(q0, zero, s), divide(edge, q0, s), scales, s);
    array biases = where(equal(q0, zero, s), zero, edge, s);

    packed_w = pack_and_quantize(packed_w, scales, biases, bits, s);

    scales = astype(scales, w.dtype(), s);
    biases = astype(biases, w.dtype(), s);
    return {
        reshape(packed_w, wshape, s),
        reshape(scales, wshape, s),
        reshape(biases, wshape, s),
    };
  };

  auto wq_shape = w.shape();
  wq_shape.back() = w.shape(-1) * bits / 32;
  auto sshape = w.shape();
  sshape.back() = w.shape(-1) / group_size;
  return array::make_arrays(
      {std::move(wq_shape), sshape, sshape},
      {uint32, w.dtype(), w.dtype()},
      std::make_shared<fast::Quantize>(
          s, fallback, group_size, bits, QuantizationMode::Affine, false),
      {w});
}

// Floating point quantization (nvfp4 / mxfp4 / mxfp8) of `w`.
//
// The group size must be 16 for nvfp4 and 32 otherwise, and `bits` must be 8
// for mxfp8 and 4 otherwise; std::invalid_argument is thrown on a mismatch.
//
// Returns {quantized weights, scales}. On a GPU stream the result is produced
// by a `fast::Quantize` primitive (outputs declared as uint32 and uint8) that
// carries the op-based `fallback`; on any other device the fallback is
// evaluated directly.
std::vector<array> fp_quantize(
    const array& w,
    int group_size,
    int bits,
    QuantizationMode mode,
    const std::optional<array>& global_scale /* = std::nullopt */,
    Stream s) {
  int expected_gs = mode == QuantizationMode::Nvfp4 ? 16 : 32;
  int expected_bits = mode == QuantizationMode::Mxfp8 ? 8 : 4;
  if (group_size != expected_gs) {
    std::ostringstream msg;
    msg << "[quantize] " << quantization_mode_to_string(mode)
        << " quantization requires group size " << expected_gs << " but got "
        << group_size << ".";
    throw std::invalid_argument(msg.str());
  }
  if (bits != expected_bits) {
    std::ostringstream msg;
    msg << "[quantize] " << quantization_mode_to_string(mode)
        << " quantization requires bits to be " << expected_bits << " but got "
        << bits << ".";
    throw std::invalid_argument(msg.str());
  }

  // inputs[0] is the matrix; inputs[1], when present, is the global scale.
  auto inputs = std::vector<array>{w};
  if (global_scale.has_value()) {
    inputs.push_back(global_scale.value());
  }

  auto fallback = [bits = bits, group_size = group_size, s](
                      const std::vector<array>& inputs) -> std::vector<array> {
    auto& w = inputs[0];
    // 6 is the largest magnitude in the 4-bit table below; 448 is presumably
    // the largest finite fp8 (e4m3) value -- TODO confirm.
    float maxval = (bits == 4) ? 6.0f : 448.0f;
    auto new_shape = w.shape();
    new_shape.back() = -1;
    // One row per quantization group.
    auto wq = reshape(w, {-1, group_size}, s);
    // Per-group scale: max |w| of the group divided by `maxval`.
    auto scales =
        divide(max(abs(wq, s), -1, true, s), array(maxval, w.dtype()), s);
    if (group_size == 16) {
      // Group size 16 (nvfp4): scales are stored as fp8 via `to_fp8`. With a
      // global scale, the scales are first multiplied by 448 * 6 / global
      // scale; without one the factor is 1.
      auto scale_encode = inputs.size() > 1
          ? divide(array(448.0f * 6.0f, float32), inputs[1], s)
          : array(1.0f, float32);
      scales = multiply(scales, scale_encode, s);
      scales = to_fp8(scales, s);
      // Normalize with the scale as it will be decoded, not the exact one.
      wq = multiply(
          divide(wq, from_fp8(scales, w.dtype(), s), s), scale_encode, s);
    } else {
      // Group size 32 (mx modes): scales become a rounded base-2 exponent,
      // stored with an offset of 127 as uint8. Zero scales are kept at 0
      // instead of taking log2(0).
      auto z = array(0, scales.dtype());
      scales = where(
          equal(scales, z, s),
          z,
          astype(round(log2(scales, s), s), int32, s),
          s);

      wq = divide(wq, power(array(2.0f, w.dtype()), scales, s), s);
      scales = astype(add(scales, array(127, int32), s), uint8, s);
    }
    if (bits == 4) {
      // The 16 values a 4-bit code can take; the position in the table is
      // the code.
      auto lut = array({
          +0.0f,
          +0.5f,
          +1.0f,
          +1.5f,
          +2.0f,
          +3.0f,
          +4.0f,
          +6.0f,
          -0.0f,
          -0.5f,
          -1.0f,
          -1.5f,
          -2.0f,
          -3.0f,
          -4.0f,
          -6.0f,
      });
      lut = astype(lut, w.dtype(), s);
      // Code = index of the table entry closest to the scaled value.
      wq = argmin(
          abs(subtract(expand_dims(wq, -1, s), lut, s), s), -1, false, s);
      // Pack 8 codes per word: multiply by 2**(0, 4, ..., 28) and sum.
      auto shifts = power(array(2, uint32), arange(0, 32, 4, uint32, s), s);
      wq = reshape(wq, {-1, 4, 8}, s);
      wq = sum(multiply(wq, shifts, s), -1, false, s);
    } else {
      // 8-bit: convert to fp8 and reinterpret the result as uint32.
      wq = view(to_fp8(wq, s), uint32, s);
    }
    wq = reshape(wq, new_shape, s);
    scales = reshape(scales, new_shape, s);
    return {std::move(wq), std::move(scales)};
  };

  if (s.device == Device::gpu) {
    // Output shapes of the primitive: the packed last dimension shrinks by
    // bits / 32, the scales have one entry per group.
    auto wq_shape = w.shape();
    wq_shape.back() = w.shape(-1) * bits / 32;
    auto sshape = w.shape();
    sshape.back() = w.shape(-1) / group_size;
    return array::make_arrays(
        {std::move(wq_shape), std::move(sshape)},
        {uint32, uint8},
        std::make_shared<fast::Quantize>(
            s, fallback, group_size, bits, mode, false),
        inputs);
  }
  return fallback(inputs);
}

std::vector<array> quantize(
    const array& w,
    std::optional<int> group_size_ /* = std::nullopt */,
    std::optional<int> bits_ /* = std::nullopt */,
    const std::string& mode /* = "affine" */,
    const std::optional<array>& global_scale /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto qmode = string_to_quantization_mode(mode, "quantize");
  auto [group_size, bits] =
      quantization_params_from_mode(qmode, group_size_, bits_);
  if (!issubdtype(w.dtype(), floating)) {
    std::ostringstream msg;
    msg << "[quantize] Only real floating types can be quantized "
        << "but w has type " << w.dtype() << ".";
    throw std::invalid_argument(msg.str());
  }

  if (w.ndim() < 2) {
    std::ostringstream msg;
    msg << "[quantize] The matrix to be quantized must have at least 2 dimension "
        << "but it has only " << w.ndim() << ".";
    throw std::invalid_argument(msg.str());
  }

  if ((w.shape(-1) % group_size) != 0) {
    std::ostringstream msg;
    msg << "[quantize] The last dimension of the matrix needs to be divisible by "
        << "the quantization group size " << group_size
        << ". However the provided "
        << " matrix has shape " << w.shape();
    throw std::invalid_argument(msg.str());
  }
  if (to_stream(s).device == Device::gpu && metal::is_available() &&
      global_scale.has_value()) {
    std::ostringstream msg;
    msg << "[quantize] Global scale is not supported on the Metal backend.";
    throw std::invalid_argument(msg.str());
  }
  validate_global_scale("quantize", qmode, global_scale);
  if (qmode == QuantizationMode::Affine) {
    return affine_quantize(w, group_size, bits, s);
  } else {
    return fp_quantize(w, group_size, bits, qmode, global_scale, to_stream(s));
  }
}

// Reverses affine quantization: unpacks the codes stored in the uint32
// words of `w` and maps them back through `code * scale + bias` per group.
//
// `w`, `scales` and `biases` must have the same number of dimensions and the
// same shape except for the last axis, and the unpacked last dimension
// (`w.shape(-1) * 32 / bits`) must equal `scales.shape(-1) * group_size`;
// otherwise std::invalid_argument is thrown.
//
// On a GPU stream the result is produced by a `fast::Quantize` primitive
// (output dtype `scales.dtype()`) that carries the op-based `fallback`; on
// any other device the fallback is evaluated directly.
array affine_dequantize(
    const array& w,
    const array& scales,
    const array& biases,
    int group_size,
    int bits,
    StreamOrDevice s_) {
  auto wshape = w.shape();
  auto sshape = scales.shape();
  auto bshape = biases.shape();
  if (wshape.size() != sshape.size() || wshape.size() != bshape.size()) {
    throw std::invalid_argument(
        "[dequantize] Shape of scales and biases does not match the matrix");
  }
  // Compare everything but the last axis by wildcarding it.
  wshape.back() = -1;
  sshape.back() = -1;
  bshape.back() = -1;

  if (wshape != sshape || wshape != bshape) {
    throw std::invalid_argument(
        "[dequantize] Shape of scales and biases does not match the matrix");
  }

  // Packing into uint32
  int out_size = w.shape(-1) * 32 / bits;

  if (out_size != scales.shape(-1) * group_size) {
    std::ostringstream msg;
    msg << "[dequantize] Shape of scales and biases does not match the matrix "
        << "given the quantization parameters. Provided matrix of shape "
        << w.shape() << " and scales/biases of shape " << scales.shape()
        << " with group_size=" << group_size << " and bits=" << bits << ".";
    throw std::invalid_argument(msg.str());
  }

  auto s = to_stream(s_);

  // The captured shapes are only read, never modified, so the fallback gives
  // the same result no matter how many times the stored callable is invoked.
  auto fallback =
      [wshape = std::move(wshape),
       sshape = std::move(sshape),
       group_size,
       bits,
       s](const std::vector<array>& inputs) -> std::vector<array> {
    auto w = inputs[0];
    auto& scales = inputs[1];
    auto& biases = inputs[2];
    if (is_power_of_2(bits)) {
      // Extract each `bits`-wide field by shifting it to the top of the word
      // and back down to the bottom.
      std::vector<array> parts;
      for (int start = 0; start < 32; start += bits) {
        parts.push_back(expand_dims(
            right_shift(
                left_shift(w, array(32 - (start + bits), uint32), s),
                array(32 - bits, uint32),
                s),
            -1,
            s));
      }
      w = concatenate(parts, -1, s);
    } else {
      // Split every word into its 32 bits, regroup the bit stream in chunks
      // of `bits` and reassemble each chunk into one code.
      w = expand_dims(w, /* axis= */ -1, s);
      w = bitwise_and(
          right_shift(w, arange(32, uint32, s), s), array({1}, uint32), s);
      auto new_shape = w.shape();
      new_shape[new_shape.size() - 2] = -1;
      new_shape.back() = bits;
      w = reshape(w, new_shape, s);
      array shifts = arange(bits, uint32, s);
      w = sum(
          left_shift(w, shifts, s), /* axis= */ -1, /* keepdims= */ false, s);
    }

    // Dequantize: view the codes as [..., n_groups, group_size] so that the
    // per-group scales and biases broadcast over the last axis. The grouped
    // shape is built on a per-call copy of the captured shape.
    auto grouped_shape = wshape;
    grouped_shape.push_back(group_size);
    w = reshape(w, std::move(grouped_shape), s);
    w = multiply(w, expand_dims(scales, -1, s), s);
    w = add(w, expand_dims(biases, -1, s), s);
    w = reshape(w, sshape, s);

    return {w};
  };

  if (s.device == Device::gpu) {
    auto out_shape = w.shape();
    out_shape.back() = out_size;
    return array(
        std::move(out_shape),
        scales.dtype(),
        std::make_shared<fast::Quantize>(
            s, fallback, group_size, bits, QuantizationMode::Affine, true),
        {w, scales, biases});
  }
  return fallback({w, scales, biases})[0];
}

/**
 * Dequantize a matrix stored in one of the floating-point quantization
 * modes.
 *
 * The mode fixes the only accepted (group_size, bits) pair: Nvfp4 requires a
 * group size of 16 (32 otherwise) and Mxfp8 requires 8 bits (4 otherwise).
 * `w` and `scales` must agree in rank and in every dimension but the last,
 * and `w.shape(-1) * 32 / bits` must equal `scales.shape(-1) * group_size`;
 * any violation throws std::invalid_argument.
 *
 * When `global_scale` is given it is appended as a third input. On a GPU
 * stream the result is a fast::Quantize node carrying the fallback; on any
 * other stream the fallback graph is built directly.
 */
array fp_dequantize(
    const array& w,
    const array& scales,
    int group_size,
    int bits,
    Dtype out_type,
    QuantizationMode mode,
    const std::optional<array>& global_scale /* = std::nullopt */,
    Stream s) {
  const int required_group_size = (mode == QuantizationMode::Nvfp4) ? 16 : 32;
  const int required_bits = (mode == QuantizationMode::Mxfp8) ? 8 : 4;
  if (group_size != required_group_size) {
    std::ostringstream msg;
    msg << "[dequantize] " << quantization_mode_to_string(mode)
        << " quantization requires group size " << required_group_size
        << " but got " << group_size << ".";
    throw std::invalid_argument(msg.str());
  }
  if (bits != required_bits) {
    std::ostringstream msg;
    msg << "[dequantize] " << quantization_mode_to_string(mode)
        << " quantization requires bits to be " << required_bits
        << " but got " << bits << ".";
    throw std::invalid_argument(msg.str());
  }

  auto w_lead_shape = w.shape();
  auto scales_lead_shape = scales.shape();
  if (w_lead_shape.size() != scales_lead_shape.size()) {
    throw std::invalid_argument(
        "[dequantize] Shape of scales does not match the matrix");
  }

  // Wildcard the last axis so only the leading dimensions are compared.
  w_lead_shape.back() = -1;
  scales_lead_shape.back() = -1;

  if (w_lead_shape != scales_lead_shape) {
    throw std::invalid_argument(
        "[dequantize] Shape of scales does not match the matrix");
  }

  // Number of unpacked values along the last axis of the uint32-packed input.
  int out_size = w.shape(-1) * 32 / bits;

  if (out_size != scales.shape(-1) * group_size) {
    std::ostringstream msg;
    msg << "[dequantize] Shape of scales does not match the matrix "
        << "given the quantization parameters. Provided matrix of shape "
        << w.shape() << " and scales of shape " << scales.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  std::vector<array> operands{w, scales};
  if (global_scale) {
    operands.push_back(*global_scale);
  }

  auto fallback =
      [target_shape = std::move(w_lead_shape),
       group_size,
       bits,
       out_type,
       s](const std::vector<array>& inputs) -> std::vector<array> {
    auto values = inputs[0];
    auto group_scales = inputs[1];
    if (bits != 4) {
      values = from_fp8(view(values, uint8, s), out_type, s);
    } else {
      // Table mapping each 4-bit code to the value it encodes.
      auto lut = array(
          {
              +0.0f,
              +0.5f,
              +1.0f,
              +1.5f,
              +2.0f,
              +3.0f,
              +4.0f,
              +6.0f,
              -0.0f,
              -0.5f,
              -1.0f,
              -1.5f,
              -2.0f,
              -3.0f,
              -4.0f,
              -6.0f,
          },
          out_type);
      // Reinterpret the packed words as bytes; each byte holds two codes.
      values = view(reshape(values, {-1, 4}, s), int8, s);
      auto low_codes = bitwise_and(values, array(0x0F, int8), s);
      auto high_codes = right_shift(values, array(4, int8), s);
      auto low_values = gather(lut, low_codes, 0, {1}, s);
      auto high_values = gather(lut, high_codes, 0, {1}, s);
      values = concatenate({low_values, high_values}, -1, s);
    }
    values = reshape(values, {-1, group_size}, s);
    group_scales = reshape(group_scales, {-1, 1}, s);
    if (group_size != 16) {
      // Scale is 2 raised to (stored value - 127).
      group_scales =
          subtract(astype(group_scales, out_type, s), array(127, out_type), s);
      group_scales = power(array(2.0f, out_type), group_scales, s);
    } else {
      // Scales are fp8-encoded; the optional third input is divided by
      // 448 * 6 and applied on top of them.
      array inv_scale_enc = inputs.size() > 2
          ? divide(inputs[2], array(448.0f * 6.0f, out_type), s)
          : array(1.0f, out_type);
      group_scales =
          multiply(from_fp8(group_scales, out_type, s), inv_scale_enc, s);
    }
    return {reshape(multiply(values, group_scales, s), target_shape, s)};
  };

  if (s.device == Device::gpu) {
    auto result_shape = w.shape();
    result_shape.back() = out_size;
    return array(
        std::move(result_shape),
        out_type,
        std::make_shared<fast::Quantize>(
            s, fallback, group_size, bits, mode, true),
        operands);
  }
  return fallback(operands)[0];
}

/**
 * Public entry point for dequantization.
 *
 * Resolves the quantization mode and its (group_size, bits) parameters,
 * validates the arguments, then dispatches to affine_dequantize (result cast
 * to the resolved output type) or to fp_dequantize.
 *
 * Throws std::invalid_argument for non-positive `bits` / `group_size`, a
 * matrix that is not uint32 or has fewer than two dimensions, and for a
 * global scale on a GPU stream when Metal is available. Further checks are
 * delegated to the validate_* helpers.
 */
array dequantize(
    const array& w,
    const array& scales,
    const std::optional<array>& biases /* = std::nullopt */,
    std::optional<int> group_size_ /* = std::nullopt */,
    std::optional<int> bits_ /* = std::nullopt */,
    const std::string& mode /* = "affine" */,
    const std::optional<array>& global_scale /* = std::nullopt */,
    std::optional<Dtype> dtype /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto [result_type_, quant_mode] =
      validate_mode_with_type("dequantize", scales, biases, dtype, mode);
  auto [group_size, bits] =
      quantization_params_from_mode(quant_mode, group_size_, bits_);

  if (bits <= 0) {
    throw std::invalid_argument(
        "[dequantize] Invalid value for bits: " + std::to_string(bits));
  }
  if (group_size <= 0) {
    throw std::invalid_argument(
        "[dequantize] Invalid value for group_size: " +
        std::to_string(group_size));
  }
  if (w.dtype() != uint32) {
    throw std::invalid_argument(
        "[dequantize] The matrix should be given as a uint32");
  }
  if (w.ndim() < 2) {
    std::ostringstream msg;
    msg << "[dequantize] The matrix to be dequantized must have at least 2 dimension "
        << "but it has only " << w.ndim() << ".";
    throw std::invalid_argument(msg.str());
  }
  if (global_scale.has_value() && to_stream(s).device == Device::gpu &&
      metal::is_available()) {
    std::ostringstream msg;
    msg << "[dequantize] Global scale is not supported on the Metal backend.";
    throw std::invalid_argument(msg.str());
  }
  validate_global_scale("dequantize", quant_mode, global_scale);

  if (quant_mode != QuantizationMode::Affine) {
    return fp_dequantize(
        w,
        scales,
        group_size,
        bits,
        result_type_,
        quant_mode,
        global_scale,
        to_stream(s));
  }

  auto dequantized = affine_dequantize(w, scales, *biases, group_size, bits, s);
  return astype(dequantized, result_type_, s);
}

/**
 * Build a ConvertFP8 node that turns a uint8 array into the floating type
 * `dtype`; the output keeps the shape of `x`.
 *
 * Throws std::invalid_argument if `x` is not uint8 or `dtype` is not a
 * floating type.
 */
array from_fp8(array x, Dtype dtype, StreamOrDevice s) {
  const auto in_dtype = x.dtype();
  if (in_dtype != uint8) {
    std::ostringstream err;
    err << "[from_fp8] Input must have type uint8 but "
        << "x.dtype() == " << in_dtype << ".";
    throw std::invalid_argument(err.str());
  }

  const bool target_is_floating = issubdtype(dtype, floating);
  if (!target_is_floating) {
    std::ostringstream err;
    err << "[from_fp8] Only real floating types are supported but "
        << "dtype == " << dtype << ".";
    throw std::invalid_argument(err.str());
  }

  // The second constructor argument is `false` here and `true` in to_fp8.
  auto primitive = std::make_shared<fast::ConvertFP8>(to_stream(s), false);
  return array(x.shape(), dtype, std::move(primitive), {x});
}

/**
 * Build a ConvertFP8 node that turns a floating-point array into a uint8
 * array of the same shape.
 *
 * Throws std::invalid_argument if `x` is not of a floating type.
 */
array to_fp8(array x, StreamOrDevice s) {
  const auto in_dtype = x.dtype();
  if (!issubdtype(in_dtype, floating)) {
    std::ostringstream err;
    err << "[to_fp8] Only real floating types are supported but "
        << "x.dtype() == " << in_dtype << ".";
    throw std::invalid_argument(err.str());
  }

  // The second constructor argument is `true` here and `false` in from_fp8.
  auto primitive = std::make_shared<fast::ConvertFP8>(to_stream(s), true);
  return array(x.shape(), uint8, std::move(primitive), {x});
}

/**
 * Quantized matrix multiplication where the operand matrices are selected
 * by index arrays.
 *
 * When neither `lhs_indices_` nor `rhs_indices_` is given this is exactly
 * quantized_matmul. Otherwise the two index arrays (each defaulted through
 * indices_or_default when missing) are broadcast against each other and the
 * output has shape `indices.shape + {x.shape(-2), w_outer_dims}`.
 *
 * Output dtype: in affine mode, the promotion of `x.dtype()` with the type
 * returned by validate_mode_with_type; in every other mode, `x.dtype()`.
 *
 * Throws std::invalid_argument if the output type is not floating, if `x`
 * has fewer than two dimensions, or if either index array is not integral.
 * Further validation is delegated to the helper functions.
 */
array gather_qmm(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases /* = std::nullopt */,
    std::optional<array> lhs_indices_ /* = std::nullopt */,
    std::optional<array> rhs_indices_ /* = std::nullopt */,
    bool transpose /* = true */,
    std::optional<int> group_size_ /* = std::nullopt */,
    std::optional<int> bits_ /* = std::nullopt */,
    const std::string& mode /* = "affine" */,
    bool sorted_indices /* = false */,
    StreamOrDevice s /* = {} */) {
  if (!lhs_indices_ && !rhs_indices_) {
    return quantized_matmul(
        x, w, scales, biases, transpose, group_size_, bits_, mode, s);
  }

  auto [out_type, qmode] =
      validate_mode_with_type("gather_qmm", scales, biases, std::nullopt, mode);
  auto [group_size, bits] =
      quantization_params_from_mode(qmode, group_size_, bits_);
  auto [w_inner_dims, w_outer_dims] = extract_quantized_matmul_dims(
      "gather_qmm", x, w, scales, biases, transpose, group_size, bits);
  if (qmode == QuantizationMode::Affine) {
    out_type = promote_types(x.dtype(), out_type);
  } else {
    out_type = x.dtype();
  }

  if (!issubdtype(out_type, floating)) {
    std::ostringstream msg;
    msg << "[gather_qmm] Only real floating types are supported but "
        << "x.dtype() == " << x.dtype() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Check the rank of x BEFORE it is handed to indices_or_default.
  // NOTE(review): that helper is not visible here; it presumably derives the
  // default indices from the batch dimensions of x, which needs >= 2 dims.
  if (x.ndim() < 2) {
    std::ostringstream msg;
    msg << "[gather_qmm] Non-quantized input must have at least two"
        << " dimensions but got input with shape " << x.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Extract indices and broadcast them
  array lhs_indices = indices_or_default(lhs_indices_, x, s);
  array rhs_indices = indices_or_default(rhs_indices_, w, s);
  std::tie(lhs_indices, rhs_indices) =
      broadcast_arrays(lhs_indices, rhs_indices, s);

  if (!issubdtype(lhs_indices.dtype(), integer)) {
    throw std::invalid_argument(
        "[gather_qmm] Got lhs_indices with invalid dtype. Indices must be integral.");
  }

  if (!issubdtype(rhs_indices.dtype(), integer)) {
    throw std::invalid_argument(
        "[gather_qmm] Got rhs_indices with invalid dtype. Indices must be integral.");
  }

  lhs_indices = astype(lhs_indices, uint32, s);
  rhs_indices = astype(rhs_indices, uint32, s);

  // Compute the full output shape
  auto out_shape = lhs_indices.shape();
  out_shape.push_back(x.shape(-2));
  out_shape.push_back(w_outer_dims);

  // `w` and `scales` are const references, so they are copied as-is.
  // Affine mode additionally passes the biases and casts scales/biases to
  // the output type.
  std::vector<array> inputs;
  if (qmode == QuantizationMode::Affine) {
    inputs = {
        astype(x, out_type, s),
        w,
        astype(scales, out_type, s),
        astype(*biases, out_type, s),
        std::move(lhs_indices),
        std::move(rhs_indices)};
  } else {
    inputs = {
        astype(x, out_type, s),
        w,
        scales,
        std::move(lhs_indices),
        std::move(rhs_indices)};
  }
  // The sortedness flag is only forwarded for the side whose counterpart
  // index array was not supplied by the caller.
  return array(
      std::move(out_shape),
      out_type,
      std::make_shared<GatherQMM>(
          to_stream(s),
          group_size,
          bits,
          qmode,
          transpose,
          sorted_indices && !rhs_indices_,
          sorted_indices && !lhs_indices_),
      std::move(inputs));
}

/**
 * Tensor contraction over the last `axis` dimensions of `a` and the first
 * `axis` dimensions of `b`.
 *
 * Throws std::invalid_argument if `axis` is negative or larger than the
 * smaller of the two ranks.
 */
array tensordot(
    const array& a,
    const array& b,
    const int axis /* = 2 */,
    StreamOrDevice s /* = {} */) {
  if (axis < 0) {
    throw std::invalid_argument(
        "[tensordot] axis must be greater or equal to 0.");
  }
  if (axis > std::min(a.ndim(), b.ndim())) {
    throw std::invalid_argument(
        "[tensordot] axis must be less than the number of dimensions of a and b.");
  }

  // a contributes axes -axis..-1, b contributes axes 0..axis-1.
  std::vector<int> a_axes;
  std::vector<int> b_axes;
  for (int from_end = -axis; from_end < 0; ++from_end) {
    a_axes.push_back(from_end);
    b_axes.push_back(from_end + axis);
  }
  return tensordot(a, b, a_axes, b_axes, s);
}

/**
 * Tensor contraction of `a` and `b` over explicitly listed axis pairs.
 *
 * Both operands are transposed so that the contracted axes are adjacent,
 * flattened into 2-D matrices, multiplied, and the product is reshaped to
 * the free dimensions of `a` followed by the free dimensions of `b`.
 *
 * Throws std::invalid_argument if the axis lists differ in length or a pair
 * of contracted axes differs in size.
 */
array tensordot(
    const array& a,
    const array& b,
    const std::vector<int>& axes_a,
    const std::vector<int>& axes_b,
    StreamOrDevice s /* = {} */) {
  if (axes_a.size() != axes_b.size()) {
    throw std::invalid_argument("[tensordot] axes must have the same size.");
  }

  // Total number of elements along the contracted axes.
  int contracted = 1;
  for (int i = 0; i < axes_a.size(); i++) {
    if (a.shape(axes_a.at(i)) != b.shape(axes_b.at(i))) {
      throw std::invalid_argument(
          "[tensordot] a and b must have the same shape on the contracted axes.");
    }
    contracted *= a.shape(axes_a.at(i));
  }

  // Flag the contracted axes of each operand (negative axes wrap around).
  std::vector<bool> a_is_contracted(a.ndim(), false);
  std::vector<bool> b_is_contracted(b.ndim(), false);
  for (const auto axis : axes_a) {
    int wrapped = (axis < 0) ? axis + a.ndim() : axis;
    a_is_contracted[wrapped] = true;
  }
  for (const auto axis : axes_b) {
    int wrapped = (axis < 0) ? axis + b.ndim() : axis;
    b_is_contracted[wrapped] = true;
  }

  Shape result_shape;

  // a: free axes first, contracted axes last.
  std::vector<int> a_perm;
  int a_free = 1;
  for (int i = 0; i < a.ndim(); i++) {
    if (a_is_contracted[i]) {
      continue;
    }
    a_perm.emplace_back(i);
    a_free *= a.shape(i);
    result_shape.emplace_back(a.shape(i));
  }
  a_perm.insert(a_perm.end(), axes_a.begin(), axes_a.end());

  // b: contracted axes first, free axes last.
  std::vector<int> b_perm(axes_b.begin(), axes_b.end());
  int b_free = 1;
  for (int i = 0; i < b.ndim(); i++) {
    if (b_is_contracted[i]) {
      continue;
    }
    b_perm.emplace_back(i);
    b_free *= b.shape(i);
    result_shape.emplace_back(b.shape(i));
  }

  auto lhs = reshape(transpose(a, a_perm, s), {a_free, contracted}, s);
  auto rhs = reshape(transpose(b, b_perm, s), {contracted, b_free}, s);
  return reshape(matmul(lhs, rhs, s), result_shape, s);
}

/**
 * Outer product: `a` is reshaped to a column of `a.size()` rows and
 * multiplied (with broadcasting) by the flattened `b`.
 */
array outer(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const int rows = static_cast<int>(a.size());
  auto column = reshape(a, {rows, 1}, s);
  auto row = flatten(b, s);
  return multiply(column, row, s);
}

/**
 * Inner product over the last axis of `a` and `b`. If either operand is
 * 0-dimensional the result is their element-wise product.
 *
 * Throws std::invalid_argument if the last dimensions differ.
 */
array inner(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const bool has_scalar_operand = a.ndim() == 0 || b.ndim() == 0;
  if (has_scalar_operand) {
    return multiply(a, b, s);
  }

  if (a.shape(-1) == b.shape(-1)) {
    return tensordot(a, b, {-1}, {-1}, s);
  }
  throw std::invalid_argument(
      "[inner] a and b must have the same last dimension.");
}

/**
 * Compute D = beta * C + alpha * (A @ B)
 *
 * 1-D `a` / `b` are temporarily promoted to matrices (a leading singleton
 * for `a`, a trailing singleton for `b`) and the inserted axes are squeezed
 * out of the result again.
 *
 * When the promoted type of the three inputs is complex64 the result is
 * composed from matmul / multiply / add instead of the AddMM primitive.
 *
 * Throws std::invalid_argument if `a` or `b` is 0-dimensional, if the inner
 * dimensions of `a` and `b` disagree, if the promoted type is not a floating
 * type, or if `c` does not end up with the output shape after broadcasting.
 */
array addmm(
    array c,
    array a,
    array b,
    const float& alpha /* = 1.f */,
    const float& beta /* = 1.f */,
    StreamOrDevice s /* = {} */) {
  // Remember the original ranks: a and b may be expanded below.
  int in_a_ndim = a.ndim();
  int in_b_ndim = b.ndim();

  if (a.ndim() == 0 || b.ndim() == 0) {
    throw std::invalid_argument(
        "[addmm] Got 0 dimension input. Inputs must "
        "have at least one dimension.");
  }

  // Type promotion
  auto out_type = result_type(a, b, c);

  // Complex inputs bypass the AddMM primitive entirely.
  if (out_type == complex64) {
    return add(
        multiply(matmul(a, b, s), array(alpha), s),
        multiply(array(beta), c, s),
        s);
  }

  if (a.ndim() == 1) {
    // Insert a singleton dim in the beginning
    a = expand_dims(a, 0, s);
  }
  if (b.ndim() == 1) {
    // Insert a singleton dim at the end
    b = expand_dims(b, 1, s);
  }

  if (a.shape(-1) != b.shape(-2)) {
    std::ostringstream msg;
    msg << "[addmm] Last dimension of first input with shape " << a.shape()
        << " must match second to last dimension of"
        << " second input with shape " << b.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  if (!issubdtype(out_type, floating)) {
    std::ostringstream msg;
    msg << "[addmm] Only real floating point types are supported but "
        << c.dtype() << ", " << a.dtype() << " and " << b.dtype()
        << " were provided which results in " << out_type
        << ", which is not a real floating point type.";
    throw std::invalid_argument(msg.str());
  }

  a = astype(a, out_type, s);
  b = astype(b, out_type, s);
  c = astype(c, out_type, s);

  // We can batch the multiplication by reshaping a
  // (only taken when b is a plain matrix and c has at most one dimension):
  // all leading axes of a are folded into its row axis, a single 2-D AddMM is
  // issued, and the result is reshaped back.
  if (a.ndim() > 2 && b.ndim() == 2 && c.ndim() <= 1) {
    auto out_shape = a.shape();
    a = reshape(a, {-1, out_shape.back()}, s);
    out_shape.back() = b.shape(-1);

    // b was originally a vector: drop the column axis that was added for it.
    if (in_b_ndim == 1) {
      out_shape.pop_back();
    }

    c = broadcast_to(c, {a.shape(0), b.shape(1)}, s);

    auto out = array(
        {a.shape(0), b.shape(1)},
        out_type,
        std::make_shared<AddMM>(to_stream(s), alpha, beta),
        {a, b, c});
    return reshape(out, out_shape, s);
  }

  // General batched case: broadcast the batch dimensions of a and b against
  // each other while keeping each operand's own matrix dimensions.
  if (a.ndim() > 2 || b.ndim() > 2) {
    Shape bsx_a(a.shape().begin(), a.shape().end() - 2);
    Shape bsx_b(b.shape().begin(), b.shape().end() - 2);
    auto inner_shape = broadcast_shapes(bsx_a, bsx_b);

    // Broadcast a
    inner_shape.push_back(a.shape(-2));
    inner_shape.push_back(a.shape(-1));
    a = broadcast_to(a, inner_shape, s);

    // Broadcast b
    *(inner_shape.end() - 2) = b.shape(-2);
    *(inner_shape.end() - 1) = b.shape(-1);
    b = broadcast_to(b, inner_shape, s);
  }

  // Shape of the primitive's output (with any inserted singleton axes).
  auto out_shape = a.shape();
  out_shape.back() = b.shape(-1);

  // Output shape as the caller sees it, i.e. without the singleton axes that
  // were inserted for 1-D inputs; c is broadcast against this shape.
  auto out_shape_adjusted = out_shape;

  if (in_a_ndim == 1 || in_b_ndim == 1) {
    out_shape_adjusted.erase(
        out_shape_adjusted.end() - ((in_a_ndim == 1) ? 2 : 1),
        out_shape_adjusted.end() - ((in_b_ndim == 1) ? 0 : 1));
  }

  auto c_broadcast_shape = broadcast_shapes(c.shape(), out_shape_adjusted);
  c = broadcast_to(c, c_broadcast_shape, s);

  // Re-insert into c the singleton axes that a and/or b received, so that c
  // can be compared against the primitive's output shape.
  if (in_a_ndim == 1 || in_b_ndim == 1) {
    auto c_reshape = c.shape();
    if (in_b_ndim == 1) {
      c_reshape.push_back(1);
    }

    if (in_a_ndim == 1) {
      // Duplicate the last entry, then overwrite the second to last with 1:
      // this inserts a singleton axis just before the last axis.
      c_reshape.push_back(c_reshape.back());
      c_reshape[c_reshape.size() - 2] = 1;
    }

    c = reshape(c, c_reshape, s);
  }
  if (c.shape() != out_shape) {
    throw std::invalid_argument(
        "[addmm] input c must broadcast to the output shape");
  }

  auto out = array(
      std::move(out_shape),
      out_type,
      std::make_shared<AddMM>(to_stream(s), alpha, beta),
      {a, b, c});

  // Remove the possibly inserted singleton dimensions
  std::vector<int> axes;
  if (in_a_ndim == 1) {
    axes.push_back(out.ndim() - 2);
  }
  if (in_b_ndim == 1) {
    axes.push_back(out.ndim() - 1);
  }
  return axes.empty() ? out : squeeze(out, axes, s);
}

/**
 * Compute matrix product with tile-level masking
 *
 * Masks are given per `block_size` x `block_size` tile: the output mask is
 * broadcast to (ceil(M/bs), ceil(N/bs)), the lhs mask to (ceil(M/bs),
 * ceil(K/bs)) and the rhs mask to (ceil(K/bs), ceil(N/bs)), each preceded by
 * the broadcast batch dimensions of `a` and `b`.
 *
 * With no mask at all this is a plain matmul. 1-D `a` / `b` are temporarily
 * promoted to matrices and the inserted axes are squeezed out of the result.
 *
 * Throws std::invalid_argument if `block_size` is neither 32 nor 64, if `a`
 * or `b` is 0-dimensional, if their inner dimensions disagree, or if their
 * promoted type is not a floating type.
 */
array block_masked_mm(
    array a,
    array b,
    int block_size,
    std::optional<array> mask_out /* = std::nullopt */,
    std::optional<array> mask_lhs /* = std::nullopt */,
    std::optional<array> mask_rhs /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  // If no masks, just perform regular matmul
  if (!mask_out && !mask_lhs && !mask_rhs) {
    return matmul(a, b, s);
  }

  bool has_out_mask = mask_out.has_value();
  bool has_operand_mask = mask_lhs.has_value() || mask_rhs.has_value();

  // Check valid tile sizes
  // TODO: Add support for 16x16 tile
  if (block_size != 32 && block_size != 64) {
    std::ostringstream msg;
    // Note the leading space: the two literals are streamed back to back.
    msg << "[block_masked_mm] Only block_sizes 32, 64 are supported."
        << " Got block size " << block_size << ".";
    throw std::invalid_argument(msg.str());
  }

  // Do shape checks for operands
  int in_a_ndim = a.ndim();
  int in_b_ndim = b.ndim();

  if (a.ndim() == 0 || b.ndim() == 0) {
    throw std::invalid_argument(
        "[block_masked_mm] Got 0 dimension input. Inputs must "
        "have at least one dimension.");
  }

  if (a.ndim() == 1) {
    // Insert a singleton dim in the beginning
    a = expand_dims(a, 0, s);
  }
  if (b.ndim() == 1) {
    // Insert a singleton dim at the end
    b = expand_dims(b, 1, s);
  }

  if (a.shape(-1) != b.shape(-2)) {
    std::ostringstream msg;
    msg << "[block_masked_mm] Last dimension of first input with shape "
        << a.shape() << " must match second to last dimension of"
        << " second input with shape " << b.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Type promotion
  auto out_type = result_type(a, b);
  if (!issubdtype(out_type, floating)) {
    std::ostringstream msg;
    msg << "[block_masked_mm] Only real floating point types are supported but "
        << a.dtype() << " and " << b.dtype()
        << " were provided which results in " << out_type
        << ", which is not a real floating point type.";
    throw std::invalid_argument(msg.str());
  }

  a = astype(a, out_type, s);
  b = astype(b, out_type, s);

  // Handle broadcasting
  Shape bsx_a(a.shape().begin(), a.shape().end() - 2);
  Shape bsx_b(b.shape().begin(), b.shape().end() - 2);

  auto bsx_shape = broadcast_shapes(bsx_a, bsx_b);

  // Two placeholder entries for the matrix dimensions; they are overwritten
  // for every operand / mask that is broadcast below.
  bsx_shape.push_back(1);
  bsx_shape.push_back(1);
  int nd = bsx_shape.size();

  int M = a.shape(-2);
  int N = b.shape(-1);
  int K = a.shape(-1);

  // Prepare A
  bsx_shape[nd - 2] = M;
  bsx_shape[nd - 1] = K;
  a = broadcast_to(a, bsx_shape, s);

  // Prepare B
  bsx_shape[nd - 2] = K;
  bsx_shape[nd - 1] = N;
  b = broadcast_to(b, bsx_shape, s);

  // Get output shape
  auto out_shape = bsx_shape;
  out_shape[nd - 2] = M;
  out_shape[nd - 1] = N;

  // Determine mask shape requirements (number of tiles along M, N and K)
  int tm = (M + block_size - 1) / block_size;
  int tn = (N + block_size - 1) / block_size;
  int tk = (K + block_size - 1) / block_size;

  std::vector<array> inputs = {a, b};

  // Broadcast and astype mask. `bs_shape` is taken by reference and its last
  // two entries are overwritten with (y, x).
  auto broadcast_mask = [](array mask,
                           Shape& bs_shape,
                           int y,
                           int x,
                           Dtype mask_dtype,
                           StreamOrDevice s) {
    int nd_bsx = bs_shape.size();
    bs_shape[nd_bsx - 2] = y;
    bs_shape[nd_bsx - 1] = x;
    mask = astype(mask, mask_dtype, s);
    return broadcast_to(mask, bs_shape, s);
  };

  // Out mask
  if (has_out_mask) {
    // The optional is known to hold a value in this branch.
    array mask_out_p = *mask_out;
    if (in_a_ndim == 1 || in_b_ndim == 1) {
      // Mirror the singleton axes that were inserted into a and/or b.
      std::vector<int> ex_dims;
      if (in_a_ndim == 1)
        ex_dims.push_back(-2);
      if (in_b_ndim == 1)
        ex_dims.push_back(-1);
      mask_out_p = expand_dims(mask_out_p, ex_dims, s);
    }
    // Boolean masks stay boolean; any other mask is cast to the output type.
    auto maskout_dtype = mask_out_p.dtype() == bool_ ? bool_ : out_type;
    mask_out_p =
        broadcast_mask(mask_out_p, bsx_shape, tm, tn, maskout_dtype, s);

    inputs.push_back(mask_out_p);
  }

  // Operand masks
  if (has_operand_mask) {
    // Pull masks; a missing operand mask defaults to a single `true`.
    array mask_lhs_p = mask_lhs.value_or(array({true}));
    array mask_rhs_p = mask_rhs.value_or(array({true}));
    // Both masks share one dtype: bool only if both are bool.
    auto mask_dtype =
        (mask_lhs_p.dtype() == bool_ && mask_rhs_p.dtype() == bool_) ? bool_
                                                                     : out_type;

    // LHS mask
    if (in_a_ndim == 1) {
      mask_lhs_p = expand_dims(mask_lhs_p, -2, s);
    }
    mask_lhs_p = broadcast_mask(mask_lhs_p, bsx_shape, tm, tk, mask_dtype, s);

    // RHS mask
    if (in_b_ndim == 1) {
      mask_rhs_p = expand_dims(mask_rhs_p, -1, s);
    }
    mask_rhs_p = broadcast_mask(mask_rhs_p, bsx_shape, tk, tn, mask_dtype, s);

    inputs.push_back(mask_lhs_p);
    inputs.push_back(mask_rhs_p);
  }

  // Calculate array
  auto out = array(
      std::move(out_shape),
      out_type,
      std::make_shared<BlockMaskedMM>(to_stream(s), block_size),
      std::move(inputs));
  // Remove the possibly inserted singleton dimensions
  std::vector<int> axes;
  if (in_a_ndim == 1) {
    axes.push_back(out.ndim() - 2);
  }
  if (in_b_ndim == 1) {
    axes.push_back(out.ndim() - 1);
  }
  return axes.empty() ? out : squeeze(out, axes, s);
}

/**
 * Matrix product in which the operand matrices are picked by index arrays.
 *
 * With no index array at all this is a plain matmul. Otherwise the two index
 * arrays (each defaulted through indices_or_default when missing) are
 * broadcast against each other and the output has shape
 * `indices.shape + {M, N}`. 1-D `a` / `b` are temporarily promoted to
 * matrices and the inserted axes are squeezed out of the result.
 *
 * Throws std::invalid_argument if `a` or `b` is 0-dimensional, if their
 * inner dimensions disagree, if their promoted type is not a floating type,
 * or if an index array is not integral.
 */
array gather_mm(
    array a,
    array b,
    std::optional<array> lhs_indices_ /* = std::nullopt */,
    std::optional<array> rhs_indices_ /* = std::nullopt */,
    bool sorted_indices /* = false */,
    StreamOrDevice s /* = {} */) {
  const bool lhs_given = lhs_indices_.has_value();
  const bool rhs_given = rhs_indices_.has_value();

  // Without any indices this degenerates to a regular matmul.
  if (!lhs_given && !rhs_given) {
    return matmul(a, b, s);
  }

  // Record which operands arrived as vectors before they get expanded.
  const bool a_was_vector = a.ndim() == 1;
  const bool b_was_vector = b.ndim() == 1;

  if (a.ndim() == 0 || b.ndim() == 0) {
    throw std::invalid_argument(
        "[gather_mm] Got 0 dimension input. Inputs must "
        "have at least one dimension.");
  }

  // Promote vectors to matrices: a row for a, a column for b.
  if (a_was_vector) {
    a = expand_dims(a, 0, s);
  }
  if (b_was_vector) {
    b = expand_dims(b, 1, s);
  }

  if (a.shape(-1) != b.shape(-2)) {
    std::ostringstream err;
    err << "[gather_mm] Last dimension of first input with shape " << a.shape()
        << " must match second to last dimension of"
        << " second input with shape " << b.shape() << ".";
    throw std::invalid_argument(err.str());
  }

  // Both operands are computed in their promoted type.
  auto out_type = result_type(a, b);
  if (!issubdtype(out_type, floating)) {
    std::ostringstream err;
    err << "[gather_mm] Only real floating point types are supported but "
        << a.dtype() << " and " << b.dtype()
        << " were provided which results in " << out_type
        << ", which is not a real floating point type.";
    throw std::invalid_argument(err.str());
  }

  a = astype(a, out_type, s);
  b = astype(b, out_type, s);

  // Resolve the index arrays and make sure they are integral.
  array lhs_indices = indices_or_default(lhs_indices_, a, s);
  array rhs_indices = indices_or_default(rhs_indices_, b, s);

  if (!issubdtype(lhs_indices.dtype(), integer)) {
    throw std::invalid_argument(
        "[gather_mm] Got lhs_indices with invalid dtype. Indices must be integral.");
  }

  if (!issubdtype(rhs_indices.dtype(), integer)) {
    throw std::invalid_argument(
        "[gather_mm] Got rhs_indices with invalid dtype. Indices must be integral.");
  }

  lhs_indices = astype(lhs_indices, uint32, s);
  rhs_indices = astype(rhs_indices, uint32, s);

  std::tie(lhs_indices, rhs_indices) =
      broadcast_arrays(lhs_indices, rhs_indices, s);

  // Output: broadcast index shape followed by the matrix dimensions.
  auto result_shape = lhs_indices.shape();
  result_shape.push_back(a.shape(-2));
  result_shape.push_back(b.shape(-1));

  // The sortedness flag is only forwarded for the side whose counterpart
  // index array was not supplied by the caller.
  auto primitive = std::make_shared<GatherMM>(
      to_stream(s), sorted_indices && !rhs_given, sorted_indices && !lhs_given);

  auto out = array(
      std::move(result_shape),
      out_type,
      std::move(primitive),
      {std::move(a),
       std::move(b),
       std::move(lhs_indices),
       std::move(rhs_indices)});

  // Drop whatever singleton axes were inserted for vector operands.
  std::vector<int> inserted_axes;
  if (a_was_vector) {
    inserted_axes.push_back(out.ndim() - 2);
  }
  if (b_was_vector) {
    inserted_axes.push_back(out.ndim() - 1);
  }
  if (inserted_axes.empty()) {
    return out;
  }
  return squeeze(out, inserted_axes, s);
}

/**
 * Build a SegmentedMM node for two 2-D matrices and an array of segments
 * whose last dimension has size 2.
 *
 * The output shape is `segments.shape[:-1] + {a.shape(0), b.shape(1)}`.
 * `a` and `b` are cast to their promoted type and `segments` to uint32.
 *
 * Throws std::invalid_argument if `a` or `b` is not 2-D, if `segments` does
 * not have shape (..., 2), if the promoted type is not a floating type, or
 * if `segments` is not integral.
 */
array segmented_mm(
    array a,
    array b,
    array segments,
    StreamOrDevice s /* = {} */) {
  const bool both_matrices = a.ndim() == 2 && b.ndim() == 2;
  if (!both_matrices) {
    throw std::invalid_argument("[segmented_mm] Batched matmul not supported");
  }

  if (segments.ndim() < 1 || segments.shape().back() != 2) {
    std::ostringstream err;
    err << "[segmented_mm] The segments should have shape (..., 2) but "
        << segments.shape() << " was provided.";
    throw std::invalid_argument(err.str());
  }

  // Both operands are computed in their promoted type.
  auto out_type = result_type(a, b);
  if (!issubdtype(out_type, floating)) {
    std::ostringstream err;
    err << "[segmented_mm] Only real floating point types are supported but "
        << a.dtype() << " and " << b.dtype()
        << " were provided which results in " << out_type
        << ", which is not a real floating point type.";
    throw std::invalid_argument(err.str());
  }

  if (!issubdtype(segments.dtype(), integer)) {
    throw std::invalid_argument(
        "[segmented_mm] Got segments with invalid dtype. Segments must be integral.");
  }

  a = astype(a, out_type, s);
  b = astype(b, out_type, s);
  segments = astype(segments, uint32, s);

  // Replace the trailing "2" of the segments shape by the matrix dimensions.
  Shape result_shape = segments.shape();
  result_shape.back() = a.shape(0);
  result_shape.push_back(b.shape(1));

  auto primitive = std::make_shared<SegmentedMM>(to_stream(s));
  return array(
      std::move(result_shape),
      out_type,
      std::move(primitive),
      {std::move(a), std::move(b), std::move(segments)});
}

/**
 * Extract the diagonal, shifted by `offset`, of the 2-D planes spanned by
 * `axis1` and `axis2`. The diagonal becomes the last axis of the result.
 *
 * Throws std::invalid_argument if `a` has fewer than two dimensions or both
 * axes resolve to the same axis, and std::out_of_range if an axis is outside
 * [-ndim, ndim).
 */
array diagonal(
    const array& a,
    int offset /* = 0 */,
    int axis1 /* = 0 */,
    int axis2 /* = 1 */,
    StreamOrDevice s /* = {} */) {
  int ndim = a.ndim();
  if (ndim < 2) {
    std::ostringstream err;
    err << "[diagonal] Array must have at least two dimensions, but got "
        << ndim << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  // Wrap a possibly negative axis and reject anything out of range.
  auto resolve_axis = [ndim](int axis, const char* label) {
    const int wrapped = (axis < 0) ? axis + ndim : axis;
    if (wrapped < 0 || wrapped >= ndim) {
      std::ostringstream err;
      err << "[diagonal] Invalid " << label << " " << axis
          << " for array with " << ndim << " dimensions.";
      throw std::out_of_range(err.str());
    }
    return wrapped;
  };
  const int ax1 = resolve_axis(axis1, "axis1");
  const int ax2 = resolve_axis(axis2, "axis2");

  if (ax1 == ax2) {
    throw std::invalid_argument(
        "[diagonal] axis1 and axis2 cannot be the same axis");
  }

  // A negative offset starts further down axis1, a positive one further
  // along axis2.
  ShapeElem start1 = std::max(-offset, 0);
  ShapeElem start2 = std::max(offset, 0);

  // Length of the diagonal, clamped at zero when the offset leaves the plane.
  auto length = std::min(a.shape(ax1) - start1, a.shape(ax2) - start2);
  if (length < 0) {
    length = 0;
  }

  std::vector<array> indices = {
      arange(start1, start1 + length, s), arange(start2, start2 + length, s)};

  // Gather single elements along the two axes, full slices everywhere else.
  Shape slice_sizes = a.shape();
  slice_sizes[ax1] = 1;
  slice_sizes[ax2] = 1;

  auto gathered = gather(a, indices, {ax1, ax2}, slice_sizes, s);
  // The gathered index axis comes first, so the two singleton axes sit one
  // position later than in `a`.
  auto squeezed = squeeze(gathered, {ax1 + 1, ax2 + 1}, s);
  return moveaxis(squeezed, 0, -1, s);
}

/**
 * For a 1-D input, build a square matrix of zeros with the input placed on
 * the k-th diagonal; for a 2-D input, extract the k-th diagonal.
 *
 * Throws std::invalid_argument for any other rank.
 */
array diag(const array& a, int k /* = 0 */, StreamOrDevice s /* = {} */) {
  if (a.ndim() == 2) {
    return diagonal(a, k, 0, 1, s);
  }

  if (a.ndim() != 1) {
    std::ostringstream err;
    err << "[diag] array must be 1-D or 2-D, got array with " << a.ndim()
        << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  // 1-D input: scatter the values onto the k-th diagonal of a zero matrix
  // that is large enough to hold the shifted diagonal.
  int count = a.size();
  int side = count + std::abs(k);
  auto canvas = zeros({side, side}, a.dtype(), s);

  // A negative k shifts the start down the rows, a positive k along the
  // columns.
  auto row_start = std::max(0, -k);
  auto col_start = std::max(0, k);
  std::vector<array> indices;
  indices.push_back(arange(row_start, count + row_start, uint32, s));
  indices.push_back(arange(col_start, count + col_start, uint32, s));

  auto updates = reshape(a, {count, 1, 1}, s);
  return scatter(canvas, indices, updates, {0, 1}, s);
}

/**
 * Sum of the diagonal (shifted by `offset`) of the planes spanned by `axis1`
 * and `axis2`; the diagonal is cast to `dtype` before it is summed.
 *
 * Throws std::invalid_argument if `a` has fewer than two dimensions or both
 * axes resolve to the same axis, and std::out_of_range if an axis is outside
 * [-ndim, ndim).
 */
array trace(
    const array& a,
    int offset,
    int axis1,
    int axis2,
    Dtype dtype,
    StreamOrDevice s /* = {} */) {
  int ndim = a.ndim();
  if (ndim < 2) {
    std::ostringstream err;
    err << "[trace] Array must have at least two dimensions, but got " << ndim
        << " dimensions.";
    throw std::invalid_argument(err.str());
  }

  // Wrap a possibly negative axis and reject anything out of range.
  auto resolve_axis = [ndim](int axis, const char* label) {
    const int wrapped = (axis < 0) ? axis + ndim : axis;
    if (wrapped < 0 || wrapped >= ndim) {
      std::ostringstream err;
      err << "[trace] Invalid " << label << " " << axis << " for array with "
          << ndim << " dimensions.";
      throw std::out_of_range(err.str());
    }
    return wrapped;
  };
  const int ax1 = resolve_axis(axis1, "axis1");
  const int ax2 = resolve_axis(axis2, "axis2");

  if (ax1 == ax2) {
    throw std::invalid_argument(
        "[trace] axis1 and axis2 cannot be the same axis");
  }

  // The original (unwrapped) axes are forwarded to diagonal().
  auto diag_values = astype(diagonal(a, offset, axis1, axis2, s), dtype, s);
  return sum(diag_values, /* axis = */ -1, /* keepdims = */ false, s);
}
/** Trace accumulated in the dtype of the input array. */
array trace(
    const array& a,
    int offset,
    int axis1,
    int axis2,
    StreamOrDevice s /* = {} */) {
  return trace(a, offset, axis1, axis2, a.dtype(), s);
}
/** Trace of the main diagonal over axes 0 and 1, in the input's dtype. */
array trace(const array& a, StreamOrDevice s /* = {} */) {
  return trace(a, /* offset = */ 0, /* axis1 = */ 0, /* axis2 = */ 1, a.dtype(), s);
}

std::vector<array> depends(
    const std::vector<array>& inputs,
    const std::vector<array>& dependencies) {
  std::vector<array> all_inputs = inputs;
  all_inputs.insert(all_inputs.end(), dependencies.begin(), dependencies.end());

  // Compute the stream. Maybe do it in a smarter way at some point in the
  // future.
  Stream s = (inputs[0].has_primitive()) ? inputs[0].primitive().stream()
                                         : to_stream({});
  // Make the output info
  std::vector<Shape> shapes;
  std::vector<Dtype> dtypes;
  for (const auto& in : inputs) {
    shapes.emplace_back(in.shape());
    dtypes.emplace_back(in.dtype());
  }

  return array::make_arrays(
      std::move(shapes),
      dtypes,
      std::make_shared<Depends>(to_stream(s)),
      all_inputs);
}

// Return `a` unchanged unless it is 0-dimensional, in which case it is
// reshaped to shape {1}.
array atleast_1d(const array& a, StreamOrDevice s /* = {} */) {
  if (a.ndim() > 0) {
    return a;
  }
  return reshape(a, {1}, s);
}

std::vector<array> atleast_1d(
    const std::vector<array>& arrays,
    StreamOrDevice s /* = {} */) {
  std::vector<array> out;
  out.reserve(arrays.size());
  for (const auto& a : arrays) {
    out.push_back(atleast_1d(a, s));
  }
  return out;
}

// Return `a` with at least two dimensions: a 0-d input becomes {1, 1}, a 1-d
// input of length n becomes {1, n}, anything else is returned as is.
array atleast_2d(const array& a, StreamOrDevice s /* = {} */) {
  const auto rank = a.ndim();
  if (rank == 0) {
    return reshape(a, {1, 1}, s);
  }
  if (rank == 1) {
    return reshape(a, {1, a.shape(0)}, s);
  }
  return a;
}

std::vector<array> atleast_2d(
    const std::vector<array>& arrays,
    StreamOrDevice s /* = {} */) {
  std::vector<array> out;
  out.reserve(arrays.size());
  for (const auto& a : arrays) {
    out.push_back(atleast_2d(a, s));
  }
  return out;
}

// Return `a` with at least three dimensions: 0-d becomes {1, 1, 1}, 1-d of
// length n becomes {1, n, 1}, 2-d of shape {n, m} becomes {n, m, 1}, and
// anything else is returned as is.
array atleast_3d(const array& a, StreamOrDevice s /* = {} */) {
  const auto rank = a.ndim();
  if (rank == 0) {
    return reshape(a, {1, 1, 1}, s);
  }
  if (rank == 1) {
    return reshape(a, {1, a.shape(0), 1}, s);
  }
  if (rank == 2) {
    return reshape(a, {a.shape(0), a.shape(1), 1}, s);
  }
  return a;
}

std::vector<array> atleast_3d(
    const std::vector<array>& arrays,
    StreamOrDevice s /* = {} */) {
  std::vector<array> out;
  out.reserve(arrays.size());
  for (const auto& a : arrays) {
    out.push_back(atleast_3d(a, s));
  }
  return out;
}

// Number of elements of `a` spanned by `axes`, as a scalar array of type
// `dtype`; the reciprocal of that count when `inverted` is true.
//
// Negative axes count from the last dimension. Throws std::invalid_argument
// for any axis outside [-ndim, ndim).
//
// Outside of dynamic tracing the count is computed immediately from the
// shape of `a`. Otherwise a NumberOfElements primitive is emitted on stream
// `s` and wrapped in stop_gradient.
array number_of_elements(
    const array& a,
    std::vector<int> axes,
    bool inverted,
    Dtype dtype /* = int32 */,
    StreamOrDevice s /* = {} */) {
  // Normalize without a modulo: a modulo folds out-of-range axes back into
  // range (hiding the error) and divides by zero for a 0-d input.
  const int ndim = a.ndim();
  for (auto& ax : axes) {
    const int normal_axis = (ax < 0) ? ax + ndim : ax;
    if (normal_axis < 0 || normal_axis >= ndim) {
      std::ostringstream msg;
      msg << "[number_of_elements] Can't get the shape for axis " << ax
          << " from an array with " << ndim << " dimensions.";
      throw std::invalid_argument(msg.str());
    }
    ax = normal_axis;
  }

  if (!detail::in_dynamic_tracing()) {
    double numel = 1;
    for (auto ax : axes) {
      numel *= a.shape(ax);
    }
    return array(inverted ? 1.0 / numel : numel, dtype);
  }
  return stop_gradient(array(
      Shape{},
      dtype,
      std::make_shared<NumberOfElements>(
          to_stream(s), std::move(axes), inverted, dtype),
      {a}));
}

// Conjugate of `a`. Only complex64 input produces a Conjugate primitive;
// every other dtype is returned untouched.
array conjugate(const array& a, StreamOrDevice s /* = {} */) {
  if (a.dtype() == complex64) {
    auto primitive = std::make_shared<Conjugate>(to_stream(s));
    return array(a.shape(), a.dtype(), std::move(primitive), {a});
  }
  // Mirror NumPy's behaviour for real input
  return a;
}

// Shared implementation of the binary bitwise operations.
//
// The output type is `out_type_` when given, otherwise the promotion of the
// two input dtypes. Both inputs are cast to that type and broadcast against
// each other before the BitwiseBinary primitive for `op` is created.
// `op_name` is only used to label the error message.
//
// Throws std::runtime_error if the output type is neither an integer type
// nor bool.
array bitwise_impl(
    const array& a,
    const array& b,
    BitwiseBinary::Op op,
    const std::string& op_name,
    const StreamOrDevice& s,
    std::optional<Dtype> out_type_ = std::nullopt) {
  const Dtype out_type = out_type_.has_value()
      ? out_type_.value()
      : promote_types(a.dtype(), b.dtype());

  const bool is_integer = issubdtype(out_type, integer);
  const bool is_bool = (out_type == bool_);
  if (!is_integer && !is_bool) {
    std::ostringstream msg;
    msg << "[" << op_name
        << "] Only allowed on integer or boolean types but got types "
        << a.dtype() << " and " << b.dtype() << ".";
    throw std::runtime_error(msg.str());
  }

  auto operands =
      broadcast_arrays({astype(a, out_type, s), astype(b, out_type, s)}, s);
  // Copy the shape out before the operands are handed over to the new array.
  Shape out_shape = operands[0].shape();
  return array(
      std::move(out_shape),
      out_type,
      std::make_shared<BitwiseBinary>(to_stream(s), op),
      std::move(operands));
}

// Element-wise bitwise AND, delegating to bitwise_impl.
array bitwise_and(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto op = BitwiseBinary::Op::And;
  return bitwise_impl(a, b, op, "bitwise_and", s);
}
// Operator form of bitwise_and on the default stream/device.
array operator&(const array& a, const array& b) {
  array result = bitwise_and(a, b);
  return result;
}

// Element-wise bitwise OR, delegating to bitwise_impl.
array bitwise_or(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto op = BitwiseBinary::Op::Or;
  return bitwise_impl(a, b, op, "bitwise_or", s);
}
// Operator form of bitwise_or on the default stream/device.
array operator|(const array& a, const array& b) {
  array result = bitwise_or(a, b);
  return result;
}

// Element-wise bitwise XOR, delegating to bitwise_impl.
array bitwise_xor(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto op = BitwiseBinary::Op::Xor;
  return bitwise_impl(a, b, op, "bitwise_xor", s);
}
// Operator form of bitwise_xor on the default stream/device.
array operator^(const array& a, const array& b) {
  array result = bitwise_xor(a, b);
  return result;
}

// Element-wise left shift of `a` by `b`. The output type is the result type
// of the two inputs, except that bool is replaced by uint8.
array left_shift(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  const auto promoted = result_type(a, b);
  const auto out_type = (promoted == bool_) ? uint8 : promoted;
  return bitwise_impl(
      a, b, BitwiseBinary::Op::LeftShift, "left_shift", s, out_type);
}
// Operator form of left_shift on the default stream/device.
array operator<<(const array& a, const array& b) {
  array result = left_shift(a, b);
  return result;
}

// Element-wise right shift of `a` by `b`. The output type is the result type
// of the two inputs, except that bool is replaced by uint8.
//
// The inputs are handed to bitwise_impl as they are: it already casts both
// operands to the explicit output type, and it needs the original dtypes to
// report them accurately when the type check fails.
array right_shift(const array& a, const array& b, StreamOrDevice s /* = {} */) {
  auto t = result_type(a, b);
  if (t == bool_) {
    t = uint8;
  }
  return bitwise_impl(a, b, BitwiseBinary::Op::RightShift, "right_shift", s, t);
}
// Operator form of right_shift on the default stream/device.
array operator>>(const array& a, const array& b) {
  array result = right_shift(a, b);
  return result;
}

// Element-wise bitwise inversion of `a`.
//
// Bool input is routed to logical_not; inexact dtypes are rejected with
// std::invalid_argument; everything else gets a BitwiseInvert primitive.
array bitwise_invert(const array& a, StreamOrDevice s /* = {} */) {
  const auto dtype = a.dtype();
  if (issubdtype(dtype, inexact)) {
    throw std::invalid_argument(
        "[bitwise_invert] Bitwise inverse only allowed on integer types.");
  }
  if (dtype == bool_) {
    return logical_not(a, s);
  }
  auto primitive = std::make_shared<BitwiseInvert>(to_stream(s));
  return array(a.shape(), dtype, std::move(primitive), {a});
}

// Operator form of bitwise_invert on the default stream/device.
array operator~(const array& a) {
  array result = bitwise_invert(a);
  return result;
}

// Reinterpret the contents of `a` as `dtype`.
//
// When the two element sizes differ, the last axis is scaled by their ratio
// so that its size in bytes stays the same. Throws std::invalid_argument if
// a scalar is viewed as a type of a different size, or if the last axis
// cannot be evenly regrouped into the larger type.
array view(const array& a, const Dtype& dtype, StreamOrDevice s /* = {} */) {
  if (a.dtype() == dtype) {
    return a;
  }

  const auto in_size = size_of(a.dtype());
  const auto out_size = size_of(dtype);
  if (a.ndim() == 0 && in_size != out_size) {
    throw std::invalid_argument(
        "[view] Changing the type of a scalar is only allowed"
        " for types with the same size.");
  }

  auto out_shape = a.shape();
  if (in_size < out_size) {
    // Several input elements are packed into each output element.
    const auto ratio = out_size / in_size;
    if (out_shape.back() % ratio != 0) {
      throw std::invalid_argument(
          "[view] When viewing as a larger dtype, the size in bytes of the last"
          " axis must be a multiple of the requested type size.");
    }
    out_shape.back() /= ratio;
  } else if (in_size > out_size) {
    // Type size ratios are always integers
    out_shape.back() *= (in_size / out_size);
  }

  return array(
      out_shape, dtype, std::make_shared<View>(to_stream(s), dtype), {a});
}

// Roll `a` along each of `axes` by the matching entry of `shift`.
//
// Each axis is rolled by splitting the running result at the wrap-around
// point and concatenating the two pieces in swapped order. Axes of size zero
// are skipped, and an empty `axes` returns `a` unchanged.
//
// Throws std::invalid_argument if fewer shifts than axes are given or if an
// axis is out of range for `a`.
array roll(
    const array& a,
    const Shape& shift,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  if (axes.empty()) {
    return a;
  }

  if (shift.size() < axes.size()) {
    std::ostringstream msg;
    msg << "[roll] At least one shift value per axis is required, "
        << shift.size() << " provided for " << axes.size() << " axes.";
    throw std::invalid_argument(msg.str());
  }

  const int ndim = a.ndim();
  array rolled = a;
  for (size_t idx = 0; idx < axes.size(); ++idx) {
    const int requested = axes[idx];
    const int ax = (requested < 0) ? requested + ndim : requested;
    if (ax < 0 || ax >= ndim) {
      std::ostringstream msg;
      msg << "[roll] Invalid axis " << requested << " for array with " << ndim
          << " dimensions.";
      throw std::invalid_argument(msg.str());
    }

    const auto extent = a.shape(ax);
    if (extent == 0) {
      continue; // skip rolling this axis if it has size 0
    }

    // Index at which the axis is cut; the tail piece moves to the front.
    const auto amount = shift[idx];
    auto split_index = extent;
    if (amount < 0) {
      split_index = (-amount) % extent;
    } else {
      split_index = extent - amount % extent;
    }

    auto parts = split(rolled, Shape{split_index}, ax, s);
    rolled = concatenate({parts[1], parts[0]}, ax, s);
  }

  return rolled;
}

// Roll the flattened array by `shift`, then restore the original shape.
array roll(const array& a, int shift, StreamOrDevice s /* = {} */) {
  const auto flat = flatten(a, s);
  const auto rolled = roll(flat, Shape{shift}, std::vector<int>{0}, s);
  return reshape(rolled, a.shape(), s);
}

// Roll the flattened array by the sum of all entries of `shift`.
array roll(const array& a, const Shape& shift, StreamOrDevice s /* = {} */) {
  int total_shift = 0;
  for (const auto amount : shift) {
    total_shift += amount;
  }
  return roll(a, total_shift, s);
}

// Roll along a single axis by a single shift.
array roll(const array& a, int shift, int axis, StreamOrDevice s /* = {} */) {
  const Shape shifts{shift};
  const std::vector<int> axes{axis};
  return roll(a, shifts, axes, s);
}

// Roll every axis in `axes` by the same `shift`.
array roll(
    const array& a,
    int shift,
    const std::vector<int>& axes,
    StreamOrDevice s /* = {} */) {
  // One copy of `shift` per requested axis.
  Shape per_axis_shift(axes.size(), shift);
  return roll(a, per_axis_shift, axes, s);
}

// Roll along a single axis by the sum of all entries of `shift`.
array roll(
    const array& a,
    const Shape& shift,
    int axis,
    StreamOrDevice s /* = {} */) {
  int total_shift = 0;
  for (const auto amount : shift) {
    total_shift += amount;
  }
  const Shape shifts{total_shift};
  const std::vector<int> axes{axis};
  return roll(a, shifts, axes, s);
}

// Real part of `a`. Complex input yields a float32 array produced by the
// Real primitive; any other input is returned unchanged.
array real(const array& a, StreamOrDevice s /* = {} */) {
  const bool is_complex = issubdtype(a.dtype(), complexfloating);
  if (is_complex) {
    auto primitive = std::make_shared<Real>(to_stream(s));
    return array(a.shape(), float32, std::move(primitive), {a});
  }
  return a;
}

// Imaginary part of `a`. Complex input yields a float32 array produced by
// the Imag primitive; any other input yields zeros_like(a).
array imag(const array& a, StreamOrDevice s /* = {} */) {
  if (!issubdtype(a.dtype(), complexfloating)) {
    // Forward `s` so the zeros are created on the stream/device the caller
    // asked for rather than on the default one.
    return zeros_like(a, s);
  }
  return array(a.shape(), float32, std::make_shared<Imag>(to_stream(s)), {a});
}

// Wrap `a` in a Contiguous primitive with the same shape and dtype.
// `allow_col_major` is passed through to the primitive unchanged.
array contiguous(
    const array& a,
    bool allow_col_major /* = false */,
    StreamOrDevice s /* = {} */) {
  auto primitive = std::make_shared<Contiguous>(to_stream(s), allow_col_major);
  return array(a.shape(), a.dtype(), std::move(primitive), {a});
}

} // namespace mlx::core


================================================
FILE: mlx/ops.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <optional>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/device.h"
#include "mlx/stream.h"
#include "mlx/utils.h"

namespace mlx::core {

/**
 * \defgroup ops Core array operations
 * @{
 */

/**
 * A 1D array of numbers starting at `start`, stopping at `stop` and stepping
 * by `step`.
 *
 * `start`, `step` and `dtype` are optional: overloads are provided that omit
 * them. The values used in their place are chosen by the implementation and
 * are not visible in this header. */
MLX_API array arange(
    double start,
    double stop,
    double step,
    Dtype dtype,
    StreamOrDevice s = {});
MLX_API array
arange(double start, double stop, double step, StreamOrDevice s = {});
MLX_API array
arange(double start, double stop, Dtype dtype, StreamOrDevice s = {});
MLX_API array arange(double start, double stop, StreamOrDevice s = {});
MLX_API array arange(double stop, Dtype dtype, StreamOrDevice s = {});
MLX_API array arange(double stop, StreamOrDevice s = {});

/** Integer-argument overloads of `arange`; none of them takes a `dtype`. */
MLX_API array arange(int start, int stop, int step, StreamOrDevice s = {});
MLX_API array arange(int start, int stop, StreamOrDevice s = {});
MLX_API array arange(int stop, StreamOrDevice s = {});

/**
 * A 1D array of `num` evenly spaced numbers in the range `[start, stop]`.
 *
 * `num` defaults to 50 and `dtype` to `float32`. */
MLX_API array linspace(
    double start,
    double stop,
    int num = 50,
    Dtype dtype = float32,
    StreamOrDevice s = {});

/** Convert the array `a` to the data type `dtype`. */
MLX_API array astype(array a, Dtype dtype, StreamOrDevice s = {});

/**
 * Create a view of the array `a` with the given `shape` and `strides`.
 *
 * NOTE(review): `offset` is presumably the starting position of the view
 * within `a`; its unit (elements vs. bytes) is not visible here - confirm
 * against the implementation. */
MLX_API array as_strided(
    array a,
    Shape shape,
    Strides strides,
    size_t offset,
    StreamOrDevice s = {});

/** Return a copy of the array `a`. */
MLX_API array copy(array a, StreamOrDevice s = {});

/**
 * An array of the given shape filled with the given value(s).
 *
 * The fill value is passed either as an array (`vals`) or as a plain value
 * (`val`) that is first wrapped in an array. */
MLX_API array full(Shape shape, array vals, Dtype dtype, StreamOrDevice s = {});
MLX_API array full(Shape shape, array vals, StreamOrDevice s = {});
/** Wrap `val` in an array of type `dtype` and forward to the array overload. */
template <typename T>
array full(Shape shape, T val, Dtype dtype, StreamOrDevice s = {}) {
  auto fill_value = array(val, dtype);
  return full(std::move(shape), std::move(fill_value), to_stream(s));
}
/** Wrap `val` in an array and forward to the array overload. */
template <typename T>
array full(Shape shape, T val, StreamOrDevice s = {}) {
  auto fill_value = array(val);
  return full(std::move(shape), std::move(fill_value), to_stream(s));
}

/**
 * The `full` counterpart that takes an existing array `a` in place of a
 * shape. The fill value is passed either as an array (`vals`) or as a plain
 * value (`val`) that is first wrapped in an array. */
MLX_API array
full_like(const array& a, array vals, Dtype dtype, StreamOrDevice s = {});
MLX_API array full_like(const array& a, array vals, StreamOrDevice s = {});
/** Wrap `val` in an array of type `dtype` and forward to the array overload. */
template <typename T>
array full_like(const array& a, T val, Dtype dtype, StreamOrDevice s = {}) {
  auto fill_value = array(val, dtype);
  return full_like(a, std::move(fill_value), dtype, to_stream(s));
}
/** Wrap `val` in an array with the dtype of `a` and forward to the array
 * overload. */
template <typename T>
array full_like(const array& a, T val, StreamOrDevice s = {}) {
  auto fill_value = array(val, a.dtype());
  return full_like(a, std::move(fill_value), to_stream(s));
}

/** An array of the given shape and dtype filled with zeros. */
MLX_API array zeros(const Shape& shape, Dtype dtype, StreamOrDevice s = {});
/** Overload of `zeros` that uses `float32` as the dtype. */
inline array zeros(const Shape& shape, StreamOrDevice s = {}) {
  return zeros(shape, /* dtype = */ float32, s);
}
/** The `zeros` counterpart that takes an existing array `a` in place of a
 * shape and dtype. */
MLX_API array zeros_like(const array& a, StreamOrDevice s = {});

/** An array of the given shape and dtype filled with ones. */
MLX_API array ones(const Shape& shape, Dtype dtype, StreamOrDevice s = {});
/** Overload of `ones` that uses `float32` as the dtype. */
inline array ones(const Shape& shape, StreamOrDevice s = {}) {
  return ones(shape, /* dtype = */ float32, s);
}
/** The `ones` counterpart that takes an existing array `a` in place of a
 * shape and dtype. */
MLX_API array ones_like(const array& a, StreamOrDevice s = {});

/**
 * An array of shape (n, m) with ones on the diagonal `k` and zeros
 * everywhere else. */
MLX_API array eye(int n, int m, int k, Dtype dtype, StreamOrDevice s = {});
/** Square (n, n) `eye` on diagonal 0 with the given dtype. */
inline array eye(int n, Dtype dtype, StreamOrDevice s = {}) {
  return eye(n, /* m = */ n, /* k = */ 0, dtype, s);
}
/** (n, m) `eye` on diagonal 0 with dtype `float32`. */
inline array eye(int n, int m, StreamOrDevice s = {}) {
  return eye(n, m, /* k = */ 0, /* dtype = */ float32, s);
}
/** (n, m) `eye` on diagonal `k` with dtype `float32`. */
inline array eye(int n, int m, int k, StreamOrDevice s = {}) {
  return eye(n, m, k, /* dtype = */ float32, s);
}
/** Square (n, n) `eye` on diagonal 0 with dtype `float32`. */
inline array eye(int n, StreamOrDevice s = {}) {
  return eye(n, /* m = */ n, /* k = */ 0, /* dtype = */ float32, s);
}

/**
 * A square matrix of shape (n, n) with ones on the major diagonal and zeros
 * elsewhere. */
MLX_API array identity(int n, Dtype dtype, StreamOrDevice s = {});
/** Overload of `identity` that uses `float32` as the dtype. */
inline array identity(int n, StreamOrDevice s = {}) {
  return identity(n, /* dtype = */ float32, s);
}

/**
 * NOTE(review): undocumented in this header; judging by the name and the
 * parameters this builds an (n, m) triangular matrix relative to diagonal
 * `k` - confirm against the implementation. */
MLX_API array tri(int n, int m, int k, Dtype type, StreamOrDevice s = {});
/** Square overload: forwards with `m = n` and `k = 0`. */
inline array tri(int n, Dtype type, StreamOrDevice s = {}) {
  return tri(n, /* m = */ n, /* k = */ 0, type, s);
}

/**
 * NOTE(review): undocumented in this header; presumably the lower (`tril`)
 * and upper (`triu`) triangular parts of `x` relative to diagonal `k`
 * (default 0) - confirm against the implementation. */
MLX_API array tril(array x, int k = 0, StreamOrDevice s = {});
MLX_API array triu(array x, int k = 0, StreamOrDevice s = {});

/** Reshape the array `a` to the given `shape`. */
MLX_API array reshape(const array& a, Shape shape, StreamOrDevice s = {});

/** Unflatten the axis `axis` of `a` into the given `shape`. */
MLX_API array
unflatten(const array& a, int axis, Shape shape, StreamOrDevice s = {});

/**
 * Flatten the dimensions of `a` in the range `[start_axis, end_axis]`.
 * `end_axis` defaults to -1. */
MLX_API array flatten(
    const array& a,
    int start_axis,
    int end_axis = -1,
    StreamOrDevice s = {});

/** Flatten the whole array `a` to 1D. */
MLX_API array flatten(const array& a, StreamOrDevice s = {});

/**
 * Multiply the array by the Hadamard matrix of corresponding size.
 *
 * `scale` is optional (defaults to `std::nullopt`); the value used when it is
 * absent is decided by the implementation. */
MLX_API array hadamard_transform(
    const array& a,
    std::optional<float> scale = std::nullopt,
    StreamOrDevice s = {});

/** Remove the singleton dimensions of `a` at the given `axes`. */
MLX_API array
squeeze(const array& a, const std::vector<int>& axes, StreamOrDevice s = {});

/** Remove the singleton dimension of `a` at the given `axis`. */
MLX_API array squeeze(const array& a, int axis, StreamOrDevice s = {});

/** Remove all singleton dimensions of `a`. */
MLX_API array squeeze(const array& a, StreamOrDevice s = {});

/** Add singleton dimensions to `a` at the given `axes`. */
MLX_API array expand_dims(
    const array& a,
    const std::vector<int>& axes,
    StreamOrDevice s = {});

/** Add a singleton dimension to `a` at the given `axis`. */
MLX_API array expand_dims(const array& a, int axis, StreamOrDevice s = {});

/** Slice the array `a` using per-dimension `start`, `stop` and `strides`. */
MLX_API array slice(
    const array& a,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s = {});
/** Convenience overload accepting `start` as a braced list of ints. */
inline array slice(
    const array& a,
    std::initializer_list<int> start,
    Shape stop,
    Shape strides,
    StreamOrDevice s = {}) {
  auto start_indices = Shape(start);
  return slice(
      a, std::move(start_indices), std::move(stop), std::move(strides), s);
}

/** Slice the array `a` with a stride of 1 in each dimension. */
MLX_API array
slice(const array& a, Shape start, Shape stop, StreamOrDevice s = {});

/**
 * Slice the array `a` with dynamic starting indices: `start` is itself an
 * array, given together with the `axes` it applies to and the `slice_size`
 * of the result. */
MLX_API array slice(
    const array& a,
    const array& start,
    std::vector<int> axes,
    Shape slice_size,
    StreamOrDevice s = {});

/**
 * Update a slice of the source array `src` with `update`. The slice is
 * described by `start`, `stop` and `strides`. */
MLX_API array slice_update(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s = {});

/** Update a slice of the source array with stride 1 in each dimension. */
MLX_API array slice_update(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s = {});

/**
 * Update a slice of the source array with dynamic starting indices: `start`
 * is itself an array, given together with the `axes` it applies to. */
MLX_API array slice_update(
    const array& src,
    const array& update,
    const array& start,
    std::vector<int> axes,
    StreamOrDevice s = {});

/**
 * Slice update that adds `update` to the given slice of `src`. The slice is
 * described by `start`, `stop` and `strides`. */
MLX_API array slice_update_add(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s = {});

/** Slice update that adds `update` to the given slice, with stride 1 in each
 * dimension. */
MLX_API array slice_update_add(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s = {});

/**
 * Slice update that multiplies the given slice of `src` by `update`. The
 * slice is described by `start`, `stop` and `strides`. */
MLX_API array slice_update_prod(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s = {});

/** Slice update that multiplies the given slice by `update`, with stride 1 in
 * each dimension. */
MLX_API array slice_update_prod(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s = {});

/**
 * Slice update that combines the given slice of `src` with `update` by
 * taking the maximum. The slice is described by `start`, `stop` and
 * `strides`. */
MLX_API array slice_update_max(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s = {});

/** Slice update that takes the maximum with `update` over the given slice,
 * with stride 1 in each dimension. */
MLX_API array slice_update_max(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s = {});

/**
 * Slice update that combines the given slice of `src` with `update` by
 * taking the minimum. The slice is described by `start`, `stop` and
 * `strides`. */
MLX_API array slice_update_min(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    Shape strides,
    StreamOrDevice s = {});

/** Slice update that takes the minimum with `update` over the given slice,
 * with stride 1 in each dimension. */
MLX_API array slice_update_min(
    const array& src,
    const array& update,
    Shape start,
    Shape stop,
    StreamOrDevice s = {});

/**
 * Split an array into sub-arrays along a given axis.
 *
 * The split is specified either as a number of pieces (`num_splits`) or as a
 * list of `indices`. Overloads without `axis` let the implementation choose
 * the axis (not visible in this header). */
MLX_API std::vector<array>
split(const array& a, int num_splits, int axis, StreamOrDevice s = {});
MLX_API std::vector<array>
split(const array& a, int num_splits, StreamOrDevice s = {});
MLX_API std::vector<array>
split(const array& a, const Shape& indices, int axis, StreamOrDevice s = {});
MLX_API std::vector<array>
split(const array& a, const Shape& indices, StreamOrDevice s = {});

/**
 * A vector of coordinate arrays from coordinate vectors.
 *
 * `sparse` defaults to false and `indexing` to "xy". */
MLX_API std::vector<array> meshgrid(
    const std::vector<array>& arrays,
    bool sparse = false,
    const std::string& indexing = "xy",
    StreamOrDevice s = {});

/**
 * Clip (limit) the values in an array.
 *
 * Both bounds are optional and default to `std::nullopt`.
 */
MLX_API array clip(
    const array& a,
    const std::optional<array>& a_min = std::nullopt,
    const std::optional<array>& a_max = std::nullopt,
    StreamOrDevice s = {});

/**
 * Concatenate arrays along a given axis. The overload without `axis` lets
 * the implementation choose how the arrays are joined (not visible here). */
MLX_API array
concatenate(std::vector<array> arrays, int axis, StreamOrDevice s = {});
MLX_API array concatenate(std::vector<array> arrays, StreamOrDevice s = {});

/**
 * Stack arrays along a new axis. The overload without `axis` lets the
 * implementation choose where the new axis goes (not visible here). */
MLX_API array
stack(const std::vector<array>& arrays, int axis, StreamOrDevice s = {});
MLX_API array stack(const std::vector<array>& arrays, StreamOrDevice s = {});

/**
 * Repeat an array `repeats` times along an axis. The overload without `axis`
 * lets the implementation choose the axis (not visible here). */
MLX_API array
repeat(const array& arr, int repeats, int axis, StreamOrDevice s = {});
MLX_API array repeat(const array& arr, int repeats, StreamOrDevice s = {});

/**
 * NOTE(review): undocumented in this header; presumably builds an array by
 * repeating `arr` according to the per-dimension counts in `reps` - confirm
 * against the implementation. */
MLX_API array
tile(const array& arr, std::vector<int> reps, StreamOrDevice s = {});

/** Permute the dimensions of `a` according to the given `axes`. */
MLX_API array
transpose(const array& a, std::vector<int> axes, StreamOrDevice s = {});
/** Convenience overload accepting `axes` as a braced list of ints. */
inline array transpose(
    const array& a,
    std::initializer_list<int> axes,
    StreamOrDevice s = {}) {
  auto permutation = std::vector<int>(axes);
  return transpose(a, std::move(permutation), s);
}

/** Swap the axes `axis1` and `axis2` of the array `a`. */
MLX_API array
swapaxes(const array& a, int axis1, int axis2, StreamOrDevice s = {});

/** Move the axis `source` of the array `a` to position `destination`. */
MLX_API array
moveaxis(const array& a, int source, int destination, StreamOrDevice s = {});

/**
 * Pad an array along the given `axes`.
 *
 * `low_pad_size` and `high_pad_size` give the padding amounts for those
 * axes. `pad_value` defaults to `array(0)` and `mode` to "constant"; the set
 * of accepted modes is defined by the implementation. */
MLX_API array
pad(const array& a,
    const std::vector<int>& axes,
    const Shape& low_pad_size,
    const Shape& high_pad_size,
    const array& pad_value = array(0),
    const std::string& mode = "constant",
    StreamOrDevice s = {});

/**
 * Pad an array along all axes. The padding is given as a list of pairs, a
 * single pair, or a single int. `pad_value` defaults to `array(0)` and `mode`
 * to "constant". */
MLX_API array
pad(const array& a,
    const std::vector<std::pair<int, int>>& pad_width,
    const array& pad_value = array(0),
    const std::string& mode = "constant",
    StreamOrDevice s = {});
MLX_API array
pad(const array& a,
    const std::pair<int, int>& pad_width,
    const array& pad_value = array(0),
    const std::string& mode = "constant",
    StreamOrDevice s = {});
MLX_API array
pad(const array& a,
    int pad_width,
    const array& pad_value = array(0),
    const std::string& mode = "constant",
    StreamOrDevice s = {});

/** Permute the dimensions of `a` in reverse order. */
MLX_API array transpose(const array& a, StreamOrDevice s = {});

/** Broadcast the array `a` to the given `shape`. */
MLX_API array
broadcast_to(const array& a, const Shape& shape, StreamOrDevice s = {});

/** Broadcast a vector of arrays against one another and return the results. */
MLX_API std::vector<array> broadcast_arrays(
    const std::vector<array>& inputs,
    StreamOrDevice s = {});

/** Returns the bool array with (a == b) element-wise. */
MLX_API array equal(const array& a, const array& b, StreamOrDevice s = {});
/** Operator form of `equal` on the default stream/device. */
inline array operator==(const array& a, const array& b) {
  array result = equal(a, b);
  return result;
}
/** `value == array`: the left operand is wrapped in an array first. */
template <typename T>
array operator==(T a, const array& b) {
  auto lhs = array(a);
  return equal(lhs, b);
}
/** `array == value`: the right operand is wrapped in an array first. */
template <typename T>
array operator==(const array& a, T b) {
  auto rhs = array(b);
  return equal(a, rhs);
}

/** Returns the bool array with (a != b) element-wise. */
MLX_API array not_equal(const array& a, const array& b, StreamOrDevice s = {});
/** Operator form of `not_equal` on the default stream/device. */
inline array operator!=(const array& a, const array& b) {
  array result = not_equal(a, b);
  return result;
}
/** `value != array`: the left operand is wrapped in an array first. */
template <typename T>
array operator!=(T a, const array& b) {
  auto lhs = array(a);
  return not_equal(lhs, b);
}
/** `array != value`: the right operand is wrapped in an array first. */
template <typename T>
array operator!=(const array& a, T b) {
  auto rhs = array(b);
  return not_equal(a, rhs);
}

/** Returns the bool array with (a > b) element-wise. */
MLX_API array greater(const array& a, const array& b, StreamOrDevice s = {});
/** Operator form of `greater` on the default stream/device. */
inline array operator>(const array& a, const array& b) {
  array result = greater(a, b);
  return result;
}
/** `value > array`: the left operand is wrapped in an array first. */
template <typename T>
array operator>(T a, const array& b) {
  auto lhs = array(a);
  return greater(lhs, b);
}
/** `array > value`: the right operand is wrapped in an array first. */
template <typename T>
array operator>(const array& a, T b) {
  auto rhs = array(b);
  return greater(a, rhs);
}

/** Returns the bool array with (a >= b) element-wise. */
MLX_API array
greater_equal(const array& a, const array& b, StreamOrDevice s = {});
/** Operator form of `greater_equal` on the default stream/device. */
inline array operator>=(const array& a, const array& b) {
  array result = greater_equal(a, b);
  return result;
}
/** `value >= array`: the left operand is wrapped in an array first. */
template <typename T>
array operator>=(T a, const array& b) {
  auto lhs = array(a);
  return greater_equal(lhs, b);
}
/** `array >= value`: the right operand is wrapped in an array first. */
template <typename T>
array operator>=(const array& a, T b) {
  auto rhs = array(b);
  return greater_equal(a, rhs);
}

/** Returns the bool array with (a < b) element-wise. */
MLX_API array less(const array& a, const array& b, StreamOrDevice s = {});
/** Operator form of `less` on the default stream/device. */
inline array operator<(const array& a, const array& b) {
  array result = less(a, b);
  return result;
}
/** `value < array`: the left operand is wrapped in an array first. */
template <typename T>
array operator<(T a, const array& b) {
  auto lhs = array(a);
  return less(lhs, b);
}
/** `array < value`: the right operand is wrapped in an array first. */
template <typename T>
array operator<(const array& a, T b) {
  auto rhs = array(b);
  return less(a, rhs);
}

/** Returns the bool array with (a <= b) element-wise. */
MLX_API array less_equal(const array& a, const array& b, StreamOrDevice s = {});
/** Operator form of `less_equal` on the default stream/device. */
inline array operator<=(const array& a, const array& b) {
  array result = less_equal(a, b);
  return result;
}
/** `value <= array`: the left operand is wrapped in an array first. */
template <typename T>
array operator<=(T a, const array& b) {
  auto lhs = array(a);
  return less_equal(lhs, b);
}
/** `array <= value`: the right operand is wrapped in an array first. */
template <typename T>
array operator<=(const array& a, T b) {
  auto rhs = array(b);
  return less_equal(a, rhs);
}

/**
 * True if two arrays have the same shape and elements. `equal_nan` is
 * presumably whether NaNs compare equal to each other - confirm against the
 * implementation. */
MLX_API array array_equal(
    const array& a,
    const array& b,
    bool equal_nan,
    StreamOrDevice s = {});
/** Overload of `array_equal` with `equal_nan = false`. */
inline array
array_equal(const array& a, const array& b, StreamOrDevice s = {}) {
  return array_equal(a, b, /* equal_nan = */ false, s);
}

/**
 * Element classification predicates. These are undocumented in this header;
 * going by their names they presumably test each element of `a` for NaN,
 * any infinity, finiteness, +infinity and -infinity respectively - confirm
 * against the implementation. */
MLX_API array isnan(const array& a, StreamOrDevice s = {});

MLX_API array isinf(const array& a, StreamOrDevice s = {});

MLX_API array isfinite(const array& a, StreamOrDevice s = {});

MLX_API array isposinf(const array& a, StreamOrDevice s = {});

MLX_API array isneginf(const array& a, StreamOrDevice s = {});

/** Select from `x` or `y` depending on `condition`. */
MLX_API array where(
    const array& condition,
    const array& x,
    const array& y,
    StreamOrDevice s = {});

/**
 * Replace NaN and infinities with finite numbers.
 *
 * `nan` (default 0.0f) is the replacement for NaN. `posinf` and `neginf` are
 * optional replacements for the infinities; the values used when they are
 * absent are decided by the implementation. */
MLX_API array nan_to_num(
    const array& a,
    float nan = 0.0f,
    const std::optional<float> posinf = std::nullopt,
    const std::optional<float> neginf = std::nullopt,
    StreamOrDevice s = {});

/** True if all elements in the array are true (or non-zero). */
MLX_API array all(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `all` with `keepdims = false`. */
inline array all(const array& a, StreamOrDevice s = {}) {
  return all(a, /* keepdims = */ false, to_stream(s));
}

/**
 * True if the two arrays are equal within the specified tolerance.
 *
 * Defaults: `rtol = 1e-5`, `atol = 1e-8`, `equal_nan = false`. */
MLX_API array allclose(
    const array& a,
    const array& b,
    double rtol = 1e-5,
    double atol = 1e-8,
    bool equal_nan = false,
    StreamOrDevice s = {});

/**
 * Returns a boolean array where two arrays are element-wise equal within the
 * specified tolerance.
 *
 * Defaults: `rtol = 1e-5`, `atol = 1e-8`, `equal_nan = false`. */
MLX_API array isclose(
    const array& a,
    const array& b,
    double rtol = 1e-5,
    double atol = 1e-8,
    bool equal_nan = false,
    StreamOrDevice s = {});

/**
 *  Reduces the input along the given axes. An output value is true
 *  if all the corresponding inputs are true. `keepdims` defaults to false.
 **/
MLX_API array
all(const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/**
 *  Reduces the input along the given axis. An output value is true
 *  if all the corresponding inputs are true. `keepdims` defaults to false.
 **/
MLX_API array
all(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** True if any elements in the array are true (or non-zero). */
MLX_API array any(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `any` with `keepdims = false`. */
inline array any(const array& a, StreamOrDevice s = {}) {
  return any(a, /* keepdims = */ false, to_stream(s));
}

/**
 *  Reduces the input along the given axes. An output value is true
 *  if any of the corresponding inputs are true. `keepdims` defaults to false.
 **/
MLX_API array
any(const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/**
 *  Reduces the input along the given axis. An output value is true
 *  if any of the corresponding inputs are true. `keepdims` defaults to false.
 **/
MLX_API array
any(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Sum of all elements of an array. */
MLX_API array sum(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `sum` with `keepdims = false`. */
inline array sum(const array& a, StreamOrDevice s = {}) {
  return sum(a, /* keepdims = */ false, to_stream(s));
}

/** Sum of the elements of an array along the given axes. `keepdims` defaults
 * to false. */
MLX_API array
sum(const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/** Sum of the elements of an array along the given axis. `keepdims` defaults
 * to false. */
MLX_API array
sum(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Mean of all elements of an array. */
MLX_API array mean(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `mean` with `keepdims = false`. */
inline array mean(const array& a, StreamOrDevice s = {}) {
  return mean(a, /* keepdims = */ false, to_stream(s));
}

/** Mean of the elements of an array along the given axes. `keepdims` defaults
 * to false. */
MLX_API array mean(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/** Mean of the elements of an array along the given axis. `keepdims` defaults
 * to false. */
MLX_API array
mean(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Median of all elements of an array. */
MLX_API array median(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `median` with `keepdims = false`. */
inline array median(const array& a, StreamOrDevice s = {}) {
  return median(a, /* keepdims = */ false, to_stream(s));
}

/** Median of the elements of an array along the given axes. `keepdims`
 * defaults to false. */
MLX_API array median(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/** Median of the elements of an array along the given axis. `keepdims`
 * defaults to false. */
MLX_API array
median(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Variance of all elements of an array. `ddof` defaults to 0. */
MLX_API array
var(const array& a, bool keepdims, int ddof = 0, StreamOrDevice s = {});
/** Overload of the full-array `var` with `keepdims = false` and `ddof = 0`. */
inline array var(const array& a, StreamOrDevice s = {}) {
  return var(a, /* keepdims = */ false, /* ddof = */ 0, to_stream(s));
}

/** Variance of the elements of an array along the given axes. `keepdims`
 * defaults to false and `ddof` to 0. */
MLX_API array
var(const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    int ddof = 0,
    StreamOrDevice s = {});

/** Variance of the elements of an array along the given axis. `keepdims`
 * defaults to false and `ddof` to 0. */
MLX_API array
var(const array& a,
    int axis,
    bool keepdims = false,
    int ddof = 0,
    StreamOrDevice s = {});

/** Standard deviation of all elements of an array. `ddof` defaults to 0. */
MLX_API array
std(const array& a, bool keepdims, int ddof = 0, StreamOrDevice s = {});
/** Overload of the full-array `std` with `keepdims = false` and `ddof = 0`. */
inline array std(const array& a, StreamOrDevice s = {}) {
  return std(a, /* keepdims = */ false, /* ddof = */ 0, to_stream(s));
}

/** Standard deviation of the elements of an array along the given axes.
 * `keepdims` defaults to false and `ddof` to 0. */
MLX_API array
std(const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    int ddof = 0,
    StreamOrDevice s = {});

/** Standard deviation of the elements of an array along the given axis.
 * `keepdims` defaults to false and `ddof` to 0. */
MLX_API array
std(const array& a,
    int axis,
    bool keepdims = false,
    int ddof = 0,
    StreamOrDevice s = {});

/** Product of all elements of the array. */
MLX_API array prod(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `prod` with `keepdims = false`. */
inline array prod(const array& a, StreamOrDevice s = {}) {
  return prod(a, /* keepdims = */ false, to_stream(s));
}

/** Product of the elements of an array along the given axes. `keepdims`
 * defaults to false. */
MLX_API array prod(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/** Product of the elements of an array along the given axis. `keepdims`
 * defaults to false. */
MLX_API array
prod(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Maximum of all elements of the array. */
MLX_API array max(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `max` with `keepdims = false`. */
inline array max(const array& a, StreamOrDevice s = {}) {
  return max(a, /* keepdims = */ false, to_stream(s));
}

/** Maximum of the elements of an array along the given axes. `keepdims`
 * defaults to false. */
MLX_API array
max(const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/** Maximum of the elements of an array along the given axis. `keepdims`
 * defaults to false. */
MLX_API array
max(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Minimum of all elements of the array. */
MLX_API array min(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload of the full-array `min` with `keepdims = false`. */
inline array min(const array& a, StreamOrDevice s = {}) {
  return min(a, /* keepdims = */ false, to_stream(s));
}

/** Minimum of the elements of an array along the given axes. `keepdims`
 * defaults to false. */
MLX_API array
min(const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/** Minimum of the elements of an array along the given axis. `keepdims`
 * defaults to false. */
MLX_API array
min(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Returns the Hanning window of size M. */
MLX_API array hanning(int M, StreamOrDevice s = {});

/** Returns the Hamming window of size M. */
MLX_API array hamming(int M, StreamOrDevice s = {});

/** Returns the Bartlett window of size M. */
MLX_API array bartlett(int M, StreamOrDevice s = {});

/** Returns the Blackman window of size M. */
MLX_API array blackman(int M, StreamOrDevice s = {});

/** Returns the index of the minimum value in the array. */
MLX_API array argmin(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload without `keepdims`: forwards to the overload above with
 * `keepdims = false`. */
inline array argmin(const array& a, StreamOrDevice s = {}) {
  constexpr bool keepdims = false;
  return argmin(a, keepdims, s);
}

/** Returns the indices of the minimum values along a given axis. */
MLX_API array
argmin(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Returns the index of the maximum value in the array. */
MLX_API array argmax(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload without `keepdims`: forwards to the overload above with
 * `keepdims = false`. */
inline array argmax(const array& a, StreamOrDevice s = {}) {
  constexpr bool keepdims = false;
  return argmax(a, keepdims, s);
}

/** Returns the indices of the maximum values along a given axis. */
MLX_API array
argmax(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});

/** Returns a sorted copy of the flattened array. */
MLX_API array sort(const array& a, StreamOrDevice s = {});

/** Returns a sorted copy of the array along a given axis. */
MLX_API array sort(const array& a, int axis, StreamOrDevice s = {});

/** Returns indices that sort the flattened array. */
MLX_API array argsort(const array& a, StreamOrDevice s = {});

/** Returns indices that sort the array along a given axis. */
MLX_API array argsort(const array& a, int axis, StreamOrDevice s = {});

/**
 * Returns a partitioned copy of the flattened array
 * such that the smaller kth elements are first.
 **/
MLX_API array partition(const array& a, int kth, StreamOrDevice s = {});

/**
 * Returns a partitioned copy of the array along a given axis
 * such that the smaller kth elements are first.
 **/
MLX_API array
partition(const array& a, int kth, int axis, StreamOrDevice s = {});

/**
 * Returns indices that partition the flattened array
 * such that the smaller kth elements are first.
 **/
MLX_API array argpartition(const array& a, int kth, StreamOrDevice s = {});

/**
 * Returns indices that partition the array along a given axis
 * such that the smaller kth elements are first.
 **/
MLX_API array
argpartition(const array& a, int kth, int axis, StreamOrDevice s = {});

/** Returns topk elements of the flattened array. */
MLX_API array topk(const array& a, int k, StreamOrDevice s = {});

/** Returns topk elements of the array along a given axis. */
MLX_API array topk(const array& a, int k, int axis, StreamOrDevice s = {});

/** Cumulative logsumexp of an array. */
MLX_API array logcumsumexp(
    const array& a,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative logsumexp of an array along the given axis. */
MLX_API array logcumsumexp(
    const array& a,
    int axis,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** The logsumexp of all elements of the array. */
MLX_API array logsumexp(const array& a, bool keepdims, StreamOrDevice s = {});
/** Overload without `keepdims`: forwards to the overload above with
 * `keepdims = false`. */
inline array logsumexp(const array& a, StreamOrDevice s = {}) {
  constexpr bool keepdims = false;
  return logsumexp(a, keepdims, to_stream(s));
}

/** The logsumexp of the elements of an array along the given axes. */
MLX_API array logsumexp(
    const array& a,
    const std::vector<int>& axes,
    bool keepdims = false,
    StreamOrDevice s = {});

/** The logsumexp of the elements of an array along the given axis. */
MLX_API array logsumexp(
    const array& a,
    int axis,
    bool keepdims = false,
    StreamOrDevice s = {});

/** Absolute value of elements in an array. */
MLX_API array abs(const array& a, StreamOrDevice s = {});

/** Negate an array. */
MLX_API array negative(const array& a, StreamOrDevice s = {});
MLX_API array operator-(const array& a);

/** The sign of the elements in an array. */
MLX_API array sign(const array& a, StreamOrDevice s = {});

/** Logical not of an array */
MLX_API array logical_not(const array& a, StreamOrDevice s = {});

/** Logical and of two arrays */
MLX_API array
logical_and(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator&&(const array& a, const array& b);

/** Logical or of two arrays */
MLX_API array logical_or(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator||(const array& a, const array& b);

/** The reciprocal (1/x) of the elements in an array. */
MLX_API array reciprocal(const array& a, StreamOrDevice s = {});

/** Add two arrays. */
MLX_API array add(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator+(const array& a, const array& b);
/** `value + array`: the left operand is first wrapped in an `array`. */
template <typename T>
array operator+(T a, const array& b) {
  const array lhs(a);
  return add(lhs, b);
}
/** `array + value`: the right operand is first wrapped in an `array`. */
template <typename T>
array operator+(const array& a, T b) {
  const array rhs(b);
  return add(a, rhs);
}

/** Subtract two arrays. */
MLX_API array subtract(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator-(const array& a, const array& b);
/** `value - array`: the left operand is first wrapped in an `array`. */
template <typename T>
array operator-(T a, const array& b) {
  const array lhs(a);
  return subtract(lhs, b);
}
/** `array - value`: the right operand is first wrapped in an `array`. */
template <typename T>
array operator-(const array& a, T b) {
  const array rhs(b);
  return subtract(a, rhs);
}

/** Multiply two arrays. */
MLX_API array multiply(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator*(const array& a, const array& b);
/** `value * array`: the left operand is first wrapped in an `array`. */
template <typename T>
array operator*(T a, const array& b) {
  const array lhs(a);
  return multiply(lhs, b);
}
/** `array * value`: the right operand is first wrapped in an `array`. */
template <typename T>
array operator*(const array& a, T b) {
  const array rhs(b);
  return multiply(a, rhs);
}

/** Divide two arrays. */
MLX_API array divide(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator/(const array& a, const array& b);
MLX_API array operator/(double a, const array& b);
MLX_API array operator/(const array& a, double b);

/** Compute the element-wise quotient and remainder. */
MLX_API std::vector<array>
divmod(const array& a, const array& b, StreamOrDevice s = {});

/** Compute integer division. Equivalent to doing floor(a / b). */
MLX_API array
floor_divide(const array& a, const array& b, StreamOrDevice s = {});

/** Compute the element-wise remainder of division */
MLX_API array remainder(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator%(const array& a, const array& b);
/** `value % array`: the left operand is first wrapped in an `array`. */
template <typename T>
array operator%(T a, const array& b) {
  const array lhs(a);
  return remainder(lhs, b);
}
/** `array % value`: the right operand is first wrapped in an `array`. */
template <typename T>
array operator%(const array& a, T b) {
  const array rhs(b);
  return remainder(a, rhs);
}

/** Element-wise maximum between two arrays. */
MLX_API array maximum(const array& a, const array& b, StreamOrDevice s = {});

/** Element-wise minimum between two arrays. */
MLX_API array minimum(const array& a, const array& b, StreamOrDevice s = {});

/** Floor the elements of an array. **/
MLX_API array floor(const array& a, StreamOrDevice s = {});

/** Ceil the elements of an array. **/
MLX_API array ceil(const array& a, StreamOrDevice s = {});

/** Square the elements of an array. */
MLX_API array square(const array& a, StreamOrDevice s = {});

/** Exponential of the elements of an array. */
MLX_API array exp(const array& a, StreamOrDevice s = {});

/** Sine of the elements of an array */
MLX_API array sin(const array& a, StreamOrDevice s = {});

/** Cosine of the elements of an array */
MLX_API array cos(const array& a, StreamOrDevice s = {});

/** Tangent of the elements of an array */
MLX_API array tan(const array& a, StreamOrDevice s = {});

/** Arc Sine of the elements of an array */
MLX_API array arcsin(const array& a, StreamOrDevice s = {});

/** Arc Cosine of the elements of an array */
MLX_API array arccos(const array& a, StreamOrDevice s = {});

/** Arc Tangent of the elements of an array */
MLX_API array arctan(const array& a, StreamOrDevice s = {});

/** Inverse tangent of the ratio of two arrays */
MLX_API array arctan2(const array& a, const array& b, StreamOrDevice s = {});

/** Hyperbolic Sine of the elements of an array */
MLX_API array sinh(const array& a, StreamOrDevice s = {});

/** Hyperbolic Cosine of the elements of an array */
MLX_API array cosh(const array& a, StreamOrDevice s = {});

/** Hyperbolic Tangent of the elements of an array */
MLX_API array tanh(const array& a, StreamOrDevice s = {});

/** Inverse Hyperbolic Sine of the elements of an array */
MLX_API array arcsinh(const array& a, StreamOrDevice s = {});

/** Inverse Hyperbolic Cosine of the elements of an array */
MLX_API array arccosh(const array& a, StreamOrDevice s = {});

/** Inverse Hyperbolic Tangent of the elements of an array */
MLX_API array arctanh(const array& a, StreamOrDevice s = {});

/** Convert the elements of an array from Radians to Degrees **/
MLX_API array degrees(const array& a, StreamOrDevice s = {});

/** Convert the elements of an array from Degrees to Radians **/
MLX_API array radians(const array& a, StreamOrDevice s = {});

/** Natural logarithm of the elements of an array. */
MLX_API array log(const array& a, StreamOrDevice s = {});

/** Log base 2 of the elements of an array. */
MLX_API array log2(const array& a, StreamOrDevice s = {});

/** Log base 10 of the elements of an array. */
MLX_API array log10(const array& a, StreamOrDevice s = {});

/** Natural logarithm of one plus elements in the array: `log(1 + a)`. */
MLX_API array log1p(const array& a, StreamOrDevice s = {});

/** Log-add-exp of the elements of two arrays: `log(exp(a) + exp(b))`. */
MLX_API array logaddexp(const array& a, const array& b, StreamOrDevice s = {});

/** Element-wise logistic sigmoid of the array: `1 / (1 + exp(-a))`. */
MLX_API array sigmoid(const array& a, StreamOrDevice s = {});

/** Computes the error function of the elements of an array. */
MLX_API array erf(const array& a, StreamOrDevice s = {});

/** Computes the inverse error function of the elements of an array. */
MLX_API array erfinv(const array& a, StreamOrDevice s = {});

/** Computes the expm1 function of the elements of an array. */
MLX_API array expm1(const array& a, StreamOrDevice s = {});

/** Stop the flow of gradients. */
MLX_API array stop_gradient(const array& a, StreamOrDevice s = {});

/** Round a floating point number */
MLX_API array round(const array& a, int decimals, StreamOrDevice s = {});
/** Overload without `decimals`: forwards to the overload above with
 * `decimals = 0`. */
inline array round(const array& a, StreamOrDevice s = {}) {
  constexpr int decimals = 0;
  return round(a, decimals, s);
}

/** Matrix-matrix multiplication. */
MLX_API array matmul(const array& a, const array& b, StreamOrDevice s = {});

/** Gather array entries given indices and slices */
MLX_API array gather(
    const array& a,
    const std::vector<array>& indices,
    const std::vector<int>& axes,
    const Shape& slice_sizes,
    StreamOrDevice s = {});
/** Single index array / single axis convenience overload of gather. */
inline array gather(
    const array& a,
    const array& indices,
    int axis,
    const Shape& slice_sizes,
    StreamOrDevice s = {}) {
  // Wrap the lone index array and axis in one-element vectors and defer to
  // the general overload.
  const std::vector<array> index_arrays{indices};
  const std::vector<int> index_axes{axis};
  return gather(a, index_arrays, index_axes, slice_sizes, s);
}

/**  Compute the Kronecker product of two arrays. */
MLX_API array kron(const array& a, const array& b, StreamOrDevice s = {});

/** Take array slices at the given indices of the specified axis. */
MLX_API array
take(const array& a, const array& indices, int axis, StreamOrDevice s = {});
MLX_API array take(const array& a, int index, int axis, StreamOrDevice s = {});

/** Take array entries at the given indices treating the array as flattened. */
MLX_API array take(const array& a, const array& indices, StreamOrDevice s = {});
MLX_API array take(const array& a, int index, StreamOrDevice s = {});

/** Take array entries given indices along the axis */
MLX_API array take_along_axis(
    const array& a,
    const array& indices,
    int axis,
    StreamOrDevice s = {});

/** Put the values into the array at the given indices along the axis */
MLX_API array put_along_axis(
    const array& a,
    const array& indices,
    const array& values,
    int axis,
    StreamOrDevice s = {});

/** Add the values into the array at the given indices along the axis */
MLX_API array scatter_add_axis(
    const array& a,
    const array& indices,
    const array& values,
    int axis,
    StreamOrDevice s = {});

/** Scatter updates to the given indices.
 *
 * The parameters ``indices`` and ``axes`` determine the locations of ``a``
 * that are updated with the values in ``updates``. Assuming 1-d ``indices``
 * for simplicity, ``indices[i]`` are the indices on axis ``axes[i]`` to which
 * the values in ``updates`` will be applied. Note each array in
 * ``indices`` is assigned to a corresponding axis and hence ``indices.size() ==
 * axes.size()``. If an index/axis pair is not provided then indices along that
 * axis are assumed to be zero.
 *
 * Note the rank of ``updates`` must be equal to the sum of the rank of the
 * broadcasted ``indices`` and the rank of ``a``. In other words, assuming the
 * arrays in ``indices`` have the same shape, ``updates.ndim() ==
 * indices[0].ndim() + a.ndim()``. The leading dimensions of ``updates``
 * correspond to the indices, and the remaining ``a.ndim()`` dimensions are the
 * values that will be applied to the given location in ``a``.
 *
 * For example:
 *
 * @code
 * auto in = zeros({4, 4}, float32);
 * auto indices = array({2});
 * auto updates = reshape(arange(1, 3, float32), {1, 1, 2});
 * std::vector<int> axes{0};
 *
 * auto out = scatter(in, {indices}, updates, axes);
 * @endcode
 *
 * will produce:
 *
 * @code
 * array([[0, 0, 0, 0],
 *        [0, 0, 0, 0],
 *        [1, 2, 0, 0],
 *        [0, 0, 0, 0]], dtype=float32)
 * @endcode
 *
 * This scatters the two-element row vector ``[1, 2]`` starting at the ``(2,
 * 0)`` position of ``a``.
 *
 * Adding another element to ``indices`` will scatter into another location of
 * ``a``. We also have to add another update for the new index:
 *
 * @code
 * auto in = zeros({4, 4}, float32);
 * auto indices = array({2, 0});
 * auto updates = reshape(arange(1, 5, float32), {2, 1, 2});
 * std::vector<int> axes{0};
 *
 * auto out = scatter(in, {indices}, updates, axes);
 * @endcode
 *
 * will produce:
 *
 * @code
 * array([[3, 4, 0, 0],
 *        [0, 0, 0, 0],
 *        [1, 2, 0, 0],
 *        [0, 0, 0, 0]], dtype=float32)
 * @endcode
 *
 * To control the scatter location on an additional axis, add another index
 * array to ``indices`` and another axis to ``axes``:
 *
 * @code
 * auto in = zeros({4, 4}, float32);
 * auto indices = std::vector{array({2, 0}), array({1, 2})};
 * auto updates = reshape(arange(1, 5, float32), {2, 1, 2});
 * std::vector<int> axes{0, 1};
 *
 * auto out = scatter(in, indices, updates, axes);
 * @endcode
 *
 * will produce:
 *
 * @code
 * array([[0, 0, 3, 4],
 *        [0, 0, 0, 0],
 *        [0, 1, 2, 0],
 *        [0, 0, 0, 0]], dtype=float32)
 * @endcode
 *
 * Items in indices are broadcasted together. This means:
 *
 * @code
 * auto indices = std::vector{array({2, 0}), array({1})};
 * @endcode
 *
 * is equivalent to:
 *
 * @code
 * auto indices = std::vector{array({2, 0}), array({1, 1})};
 * @endcode
 *
 * Note, ``scatter`` does not perform bounds checking on the indices and
 * updates.  Out-of-bounds accesses on ``a`` are undefined and typically result
 * in unintended or invalid memory writes.
 */
MLX_API array scatter(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
/** Single index array / single axis convenience overload of scatter. */
inline array scatter(
    const array& a,
    const array& indices,
    const array& updates,
    int axis,
    StreamOrDevice s = {}) {
  // Wrap the lone index array and axis in one-element vectors and defer to
  // the general overload.
  const std::vector<array> index_arrays{indices};
  const std::vector<int> index_axes{axis};
  return scatter(a, index_arrays, updates, index_axes, s);
}

/** Scatter and add updates to given indices */
MLX_API array scatter_add(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
/** Single index array / single axis convenience overload of scatter_add. */
inline array scatter_add(
    const array& a,
    const array& indices,
    const array& updates,
    int axis,
    StreamOrDevice s = {}) {
  // Wrap the lone index array and axis in one-element vectors and defer to
  // the general overload.
  const std::vector<array> index_arrays{indices};
  const std::vector<int> index_axes{axis};
  return scatter_add(a, index_arrays, updates, index_axes, s);
}

/** Scatter and prod updates to given indices */
MLX_API array scatter_prod(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
/** Single index array / single axis convenience overload of scatter_prod. */
inline array scatter_prod(
    const array& a,
    const array& indices,
    const array& updates,
    int axis,
    StreamOrDevice s = {}) {
  // Wrap the lone index array and axis in one-element vectors and defer to
  // the general overload.
  const std::vector<array> index_arrays{indices};
  const std::vector<int> index_axes{axis};
  return scatter_prod(a, index_arrays, updates, index_axes, s);
}

/** Scatter and max updates to given linear indices */
MLX_API array scatter_max(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
/** Single index array / single axis convenience overload of scatter_max. */
inline array scatter_max(
    const array& a,
    const array& indices,
    const array& updates,
    int axis,
    StreamOrDevice s = {}) {
  // Wrap the lone index array and axis in one-element vectors and defer to
  // the general overload.
  const std::vector<array> index_arrays{indices};
  const std::vector<int> index_axes{axis};
  return scatter_max(a, index_arrays, updates, index_axes, s);
}
/** Scatter and min updates to given linear indices */
MLX_API array scatter_min(
    const array& a,
    const std::vector<array>& indices,
    const array& updates,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
/** Single index array / single axis convenience overload of scatter_min. */
inline array scatter_min(
    const array& a,
    const array& indices,
    const array& updates,
    int axis,
    StreamOrDevice s = {}) {
  // Wrap the lone index array and axis in one-element vectors and defer to
  // the general overload.
  const std::vector<array> index_arrays{indices};
  const std::vector<int> index_axes{axis};
  return scatter_min(a, index_arrays, updates, index_axes, s);
}

MLX_API array masked_scatter(
    const array& a,
    const array& mask,
    const array& src,
    StreamOrDevice s = {});

/** Square root the elements of an array. */
MLX_API array sqrt(const array& a, StreamOrDevice s = {});

/** Reciprocal of the square root of the elements of an array. */
MLX_API array rsqrt(const array& a, StreamOrDevice s = {});

/** Softmax of an array. */
MLX_API array softmax(
    const array& a,
    const std::vector<int>& axes,
    bool precise = false,
    StreamOrDevice s = {});

/** Softmax of an array. */
MLX_API array
softmax(const array& a, bool precise = false, StreamOrDevice s = {});

/** Softmax of an array along a single axis. Forwards to the multi-axis
 * overload with a one-element axes vector. */
inline array
softmax(const array& a, int axis, bool precise = false, StreamOrDevice s = {}) {
  const std::vector<int> axes{axis};
  return softmax(a, axes, precise, s);
}

/** Raise elements of a to the power of b element-wise */
MLX_API array power(const array& a, const array& b, StreamOrDevice s = {});

/** Cumulative sum of an array. */
MLX_API array cumsum(
    const array& a,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative sum of an array along the given axis. */
MLX_API array cumsum(
    const array& a,
    int axis,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative product of an array. */
MLX_API array cumprod(
    const array& a,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative product of an array along the given axis. */
MLX_API array cumprod(
    const array& a,
    int axis,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative max of an array. */
MLX_API array cummax(
    const array& a,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative max of an array along the given axis. */
MLX_API array cummax(
    const array& a,
    int axis,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative min of an array. */
MLX_API array cummin(
    const array& a,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** Cumulative min of an array along the given axis. */
MLX_API array cummin(
    const array& a,
    int axis,
    bool reverse = false,
    bool inclusive = true,
    StreamOrDevice s = {});

/** General convolution with a filter */
MLX_API array conv_general(
    array input,
    array weight,
    std::vector<int> stride = {},
    std::vector<int> padding_lo = {},
    std::vector<int> padding_hi = {},
    std::vector<int> kernel_dilation = {},
    std::vector<int> input_dilation = {},
    int groups = 1,
    bool flip = false,
    StreamOrDevice s = {});

/** General convolution with a filter, taking one `padding` vector that is
 * forwarded as both `padding_lo` and `padding_hi` of the overload above. */
inline array conv_general(
    const array& input,
    const array& weight,
    std::vector<int> stride = {},
    std::vector<int> padding = {},
    std::vector<int> kernel_dilation = {},
    std::vector<int> input_dilation = {},
    int groups = 1,
    bool flip = false,
    StreamOrDevice s = {}) {
  // Both sides use the same padding values.
  const std::vector<int>& padding_lo = padding;
  const std::vector<int>& padding_hi = padding;
  return conv_general(
      input,
      weight,
      stride,
      padding_lo,
      padding_hi,
      kernel_dilation,
      input_dilation,
      groups,
      flip,
      s);
}

/** 1D convolution with a filter */
MLX_API array conv1d(
    const array& input,
    const array& weight,
    int stride = 1,
    int padding = 0,
    int dilation = 1,
    int groups = 1,
    StreamOrDevice s = {});

/** 2D convolution with a filter */
MLX_API array conv2d(
    const array& input,
    const array& weight,
    const std::pair<int, int>& stride = {1, 1},
    const std::pair<int, int>& padding = {0, 0},
    const std::pair<int, int>& dilation = {1, 1},
    int groups = 1,
    StreamOrDevice s = {});

/** 3D convolution with a filter */
MLX_API array conv3d(
    const array& input,
    const array& weight,
    const std::tuple<int, int, int>& stride = {1, 1, 1},
    const std::tuple<int, int, int>& padding = {0, 0, 0},
    const std::tuple<int, int, int>& dilation = {1, 1, 1},
    int groups = 1,
    StreamOrDevice s = {});

/** 1D transposed convolution with a filter */
MLX_API array conv_transpose1d(
    const array& input,
    const array& weight,
    int stride = 1,
    int padding = 0,
    int dilation = 1,
    int output_padding = 0,
    int groups = 1,
    StreamOrDevice s = {});

/** 2D transposed convolution with a filter */
MLX_API array conv_transpose2d(
    const array& input,
    const array& weight,
    const std::pair<int, int>& stride = {1, 1},
    const std::pair<int, int>& padding = {0, 0},
    const std::pair<int, int>& dilation = {1, 1},
    const std::pair<int, int>& output_padding = {0, 0},
    int groups = 1,
    StreamOrDevice s = {});

/** 3D transposed convolution with a filter */
MLX_API array conv_transpose3d(
    const array& input,
    const array& weight,
    const std::tuple<int, int, int>& stride = {1, 1, 1},
    const std::tuple<int, int, int>& padding = {0, 0, 0},
    const std::tuple<int, int, int>& dilation = {1, 1, 1},
    const std::tuple<int, int, int>& output_padding = {0, 0, 0},
    int groups = 1,
    StreamOrDevice s = {});

/** Quantized matmul multiplies x with a quantized matrix w. */
MLX_API array quantized_matmul(
    array x,
    array w,
    array scales,
    std::optional<array> biases = std::nullopt,
    bool transpose = true,
    std::optional<int> group_size = std::nullopt,
    std::optional<int> bits = std::nullopt,
    const std::string& mode = "affine",
    StreamOrDevice s = {});

/** Quantize a matrix along its last axis */
MLX_API std::vector<array> quantize(
    const array& w,
    std::optional<int> group_size = std::nullopt,
    std::optional<int> bits = std::nullopt,
    const std::string& mode = "affine",
    const std::optional<array>& global_scale = std::nullopt,
    StreamOrDevice s = {});

/** Dequantize a matrix produced by quantize() */
MLX_API array dequantize(
    const array& w,
    const array& scales,
    const std::optional<array>& biases = std::nullopt,
    std::optional<int> group_size = std::nullopt,
    std::optional<int> bits = std::nullopt,
    const std::string& mode = "affine",
    const std::optional<array>& global_scale = std::nullopt,
    std::optional<Dtype> dtype = std::nullopt,
    StreamOrDevice s = {});

MLX_API array qqmm(
    array x, // input activations
    array w, // maybe quantized weights
    const std::optional<array> w_scales = std::nullopt, // optional scales if w
                                                        // is quantized
    std::optional<int> group_size = std::nullopt,
    std::optional<int> bits = std::nullopt,
    const std::string& mode = "nvfp4",
    const std::optional<array> global_scale_x = std::nullopt,
    const std::optional<array> global_scale_w = std::nullopt,
    StreamOrDevice s = {});

/** Convert an E4M3 float8 to the given floating point dtype. */
MLX_API array from_fp8(array x, Dtype dtype, StreamOrDevice s = {});

/** Convert a floating point matrix to E4M3 float8. */
MLX_API array to_fp8(array x, StreamOrDevice s = {});

/** Compute matrix products with matrix-level gather. */
MLX_API array gather_qmm(
    const array& x,
    const array& w,
    const array& scales,
    const std::optional<array>& biases = std::nullopt,
    std::optional<array> lhs_indices = std::nullopt,
    std::optional<array> rhs_indices = std::nullopt,
    bool transpose = true,
    std::optional<int> group_size = std::nullopt,
    std::optional<int> bits = std::nullopt,
    const std::string& mode = "affine",
    bool sorted_indices = false,
    StreamOrDevice s = {});

/** Returns a contraction of a and b over multiple dimensions. */
MLX_API array tensordot(
    const array& a,
    const array& b,
    const int axis = 2,
    StreamOrDevice s = {});

MLX_API array tensordot(
    const array& a,
    const array& b,
    const std::vector<int>& axes_a,
    const std::vector<int>& axes_b,
    StreamOrDevice s = {});

/** Compute the outer product of two vectors. */
MLX_API array outer(const array& a, const array& b, StreamOrDevice s = {});

/** Compute the inner product of two vectors. */
MLX_API array inner(const array& a, const array& b, StreamOrDevice s = {});

/** Compute D = beta * C + alpha * (A @ B) */
MLX_API array addmm(
    array c,
    array a,
    array b,
    const float& alpha = 1.f,
    const float& beta = 1.f,
    StreamOrDevice s = {});

/** Compute matrix product with block masking */
MLX_API array block_masked_mm(
    array a,
    array b,
    int block_size,
    std::optional<array> mask_out = std::nullopt,
    std::optional<array> mask_lhs = std::nullopt,
    std::optional<array> mask_rhs = std::nullopt,
    StreamOrDevice s = {});

/** Compute matrix product with matrix-level gather */
MLX_API array gather_mm(
    array a,
    array b,
    std::optional<array> lhs_indices = std::nullopt,
    std::optional<array> rhs_indices = std::nullopt,
    bool sorted_indices = false,
    StreamOrDevice s = {});

/**
 * Compute a matrix product but segment the inner dimension and write the
 * result separately for each segment.
 */
MLX_API array
segmented_mm(array a, array b, array segments, StreamOrDevice s = {});

/** Extract a diagonal or construct a diagonal array */
MLX_API array diagonal(
    const array& a,
    int offset = 0,
    int axis1 = 0,
    int axis2 = 1,
    StreamOrDevice s = {});

/** Extract diagonal from a 2d array or create a diagonal matrix. */
MLX_API array diag(const array& a, int k = 0, StreamOrDevice s = {});

/** Return the sum along a specified diagonal in the given array. */
MLX_API array trace(
    const array& a,
    int offset,
    int axis1,
    int axis2,
    Dtype dtype,
    StreamOrDevice s = {});
MLX_API array
trace(const array& a, int offset, int axis1, int axis2, StreamOrDevice s = {});
MLX_API array trace(const array& a, StreamOrDevice s = {});

/**
 * Implements the identity function but allows injecting dependencies to other
 * arrays. This ensures that these other arrays will have been computed
 * when the outputs of this function are computed.
 */
MLX_API std::vector<array> depends(
    const std::vector<array>& inputs,
    const std::vector<array>& dependencies);

/** Convert an array (or each array in a list) to one with at least 1, 2, or 3
 * dimensions. */
MLX_API array atleast_1d(const array& a, StreamOrDevice s = {});
MLX_API std::vector<array> atleast_1d(
    const std::vector<array>& a,
    StreamOrDevice s = {});
MLX_API array atleast_2d(const array& a, StreamOrDevice s = {});
MLX_API std::vector<array> atleast_2d(
    const std::vector<array>& a,
    StreamOrDevice s = {});
MLX_API array atleast_3d(const array& a, StreamOrDevice s = {});
MLX_API std::vector<array> atleast_3d(
    const std::vector<array>& a,
    StreamOrDevice s = {});

/**
 * Extract the number of elements along some axes as a scalar array. Used to
 * allow shape dependent shapeless compilation (pun intended).
 */
MLX_API array number_of_elements(
    const array& a,
    std::vector<int> axes,
    bool inverted,
    Dtype dtype = int32,
    StreamOrDevice s = {});

MLX_API array conjugate(const array& a, StreamOrDevice s = {});

/** Bitwise and. */
MLX_API array
bitwise_and(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator&(const array& a, const array& b);

/** Bitwise inclusive or. */
MLX_API array bitwise_or(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator|(const array& a, const array& b);

/** Bitwise exclusive or. */
MLX_API array
bitwise_xor(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator^(const array& a, const array& b);

/** Shift bits to the left. */
MLX_API array left_shift(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator<<(const array& a, const array& b);

/** Shift bits to the right. */
MLX_API array
right_shift(const array& a, const array& b, StreamOrDevice s = {});
MLX_API array operator>>(const array& a, const array& b);

/** Invert the bits. */
MLX_API array bitwise_invert(const array& a, StreamOrDevice s = {});
MLX_API array operator~(const array& a);

MLX_API array view(const array& a, const Dtype& dtype, StreamOrDevice s = {});

/** Roll elements along an axis and introduce them on the other side */
MLX_API array roll(const array& a, int shift, StreamOrDevice s = {});
MLX_API array roll(const array& a, const Shape& shift, StreamOrDevice s = {});
MLX_API array roll(const array& a, int shift, int axis, StreamOrDevice s = {});
MLX_API array roll(
    const array& a,
    int shift,
    const std::vector<int>& axes,
    StreamOrDevice s = {});
MLX_API array
roll(const array& a, const Shape& shift, int axis, StreamOrDevice s = {});
MLX_API array roll(
    const array& a,
    const Shape& shift,
    const std::vector<int>& axes,
    StreamOrDevice s = {});

/** The real part of a complex array. */
MLX_API array real(const array& a, StreamOrDevice s = {});

/** The imaginary part of a complex array. */
MLX_API array imag(const array& a, StreamOrDevice s = {});

/** Ensure the array's underlying memory is contiguous. */
MLX_API array
contiguous(const array& a, bool allow_col_major = false, StreamOrDevice s = {});

/** @} */

} // namespace mlx::core


================================================
FILE: mlx/primitives.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

// Required for using M_2_SQRTPI in MSVC.
#define _USE_MATH_DEFINES

#include <algorithm>
#include <cassert>
#include <cmath>
#include <numeric>
#include <sstream>
#include <stdexcept>

#include "mlx/backend/common/utils.h"
#include "mlx/fft.h"
#include "mlx/linalg.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/utils.h"

namespace mlx::core {

namespace {

std::tuple<array, array, int> vmap_binary_op(
    const std::vector<array>& inputs,
    const std::vector<int>& axes,
    const Stream& stream) {
  assert(inputs.size() == 2);
  assert(axes.size() == 2);

  if (axes[0] == -1 && axes[1] == -1) {
    return {inputs[0], inputs[1], -1};
  }

  auto a = inputs[0];
  auto b = inputs[1];
  int ndim = std::max(a.ndim() + (axes[0] == -1), b.ndim() + (axes[1] == -1));

  auto expand_dims = [stream, ndim](auto in) {
    auto shape = in.shape();
    shape.insert(shape.begin(), ndim - shape.size(), 1);
    return reshape(in, shape, stream);
  };

  int to_ax = (ndim - a.ndim()) + axes[0];
  int from_ax = (ndim - b.ndim()) + axes[1];
  a = expand_dims(a);
  b = expand_dims(b);

  if (from_ax != to_ax) {
    std::vector<int> tdims(b.ndim());
    std::iota(tdims.begin(), tdims.end(), 0);
    tdims.erase(tdims.begin() + from_ax);
    tdims.insert(tdims.begin() + to_ax, from_ax);
    b = transpose(b, tdims, stream);
  }
  return {a, b, to_ax};
}

std::tuple<array, array, array, int> vmap_ternary_op(
    const std::vector<array>& inputs,
    const std::vector<int>& axes,
    const Stream& stream) {
  assert(inputs.size() == 3);
  assert(axes.size() == 3);

  if (axes[0] == -1 && axes[1] == -1 && axes[2] == -1) {
    return {inputs[0], inputs[1], inputs[2], -1};
  }

  auto a = inputs[0];
  auto b = inputs[1];
  auto c = inputs[2];
  int ndim = std::max(
      {a.ndim() + (axes[0] == -1),
       b.ndim() + (axes[1] == -1),
       c.ndim() + (axes[2] == -1)});

  auto expand_dims = [stream, ndim](auto in) {
    auto shape = in.shape();
    shape.insert(shape.begin(), ndim - shape.size(), 1);
    return reshape(in, shape, stream);
  };

  int to_ax = (ndim - a.ndim()) + axes[0];
  int from_ax1 = (ndim - b.ndim()) + axes[1];
  int from_ax2 = (ndim - c.ndim()) + axes[2];
  a = expand_dims(a);
  b = expand_dims(b);
  c = expand_dims(c);

  auto find_tdims = [](auto x, int to_ax, int from_ax) {
    std::vector<int> tdims(x.ndim());
    std::iota(tdims.begin(), tdims.end(), 0);
    tdims.erase(tdims.begin() + from_ax);
    tdims.insert(tdims.begin() + to_ax, from_ax);
    return tdims;
  };

  if (to_ax != from_ax1) {
    std::vector<int> tdims = find_tdims(b, to_ax, from_ax1);
    b = transpose(b, tdims, stream);
  }

  if (to_ax != from_ax2) {
    std::vector<int> tdims = find_tdims(c, to_ax, from_ax2);
    c = transpose(c, tdims, stream);
  }
  return {a, b, c, to_ax};
}

// Calculate the gradient with respect to the weights of the following
// calculation
//
// y = gather_mm(x, w.T, lhs_indices, rhs_indices, sorted)
//
// Note the transpose above. This function returns the gradient for w.T so if w
// was used instead then one needs to transpose the returned gradient.
//
// We define it as a separate function to reuse it for gather_mm and
// gather_qmm.
//
// `batch_shape` is taken by value because it is extended in place with the
// trailing (N, K) dimensions and then used as the shape of the result. The
// product of its original entries is the number of weight matrices
// ("segments") that receive a gradient.
array gather_mm_grad(
    const array& x,
    const array& dy,
    const array& lhs_indices,
    const array& rhs_indices,
    bool sorted,
    Shape batch_shape,
    const Stream& s) {
  // M and K are the last two dimensions of x, N is the last dimension of dy.
  int M = x.shape(-2);
  int K = x.shape(-1);
  int N = dy.shape(-1);
  int num_segments = std::accumulate(
      batch_shape.begin(), batch_shape.end(), 1, std::multiplies<int>());
  batch_shape.push_back(N);
  batch_shape.push_back(K);

  // If the indices are sorted then it means that we can do the whole gradient
  // computation via a segmented matmul. We just need to calculate the segments
  // using the indices.
  if (sorted) {
    // Accumulate M into the slot named by each entry of rhs_indices, take the
    // running sum, prepend a zero, and then view the result as overlapping
    // pairs of consecutive entries (strides {1, 1}), one pair per segment.
    auto segments = zeros({num_segments}, uint32, s);
    segments = scatter_add_axis(segments, rhs_indices, array(M, uint32), 0, s);
    segments = cumsum(segments, 0, false, true, s);
    segments = concatenate({array({0}, {1}, uint32), segments}, 0, s);
    segments = as_strided(segments, {num_segments, 2}, {1, 1}, 0, s);

    // dy and x are flattened to 2D before the segmented matmul; dy is
    // additionally transposed so the result has N rows and K columns.
    return reshape(
        segmented_mm(
            swapaxes(flatten(dy, 0, -2, s), 0, 1, s),
            flatten(x, 0, -2, s),
            segments,
            s),
        std::move(batch_shape),
        s);
  }

  // Otherwise we need to gather matmul the dy and then scatter add it to the
  // correct locations.
  else {
    // TODO: If the lhs indices wasn't provided, this is always a sorted matmul
    //       so we should add that check.
    auto dw = gather_mm(
        swapaxes(dy, -1, -2, s), x, std::nullopt, lhs_indices, false, s);
    // Sum the per-index products into a zero-initialized
    // (num_segments, N, K) buffer at the positions given by rhs_indices.
    return reshape(
        scatter_add(
            zeros({num_segments, N, K}, dw.dtype(), s),
            rhs_indices,
            expand_dims(dw, -3, s),
            0,
            s),
        std::move(batch_shape),
        s);
  }
}

} // namespace

// Default JVP: primitives that do not override it cannot be differentiated in
// forward mode, so report which primitive was hit.
std::vector<array> Primitive::jvp(
    const std::vector<array>&,
    const std::vector<array>&,
    const std::vector<int>&) {
  std::ostringstream err;
  err << "[Primitive::jvp] Not implemented for " << name() << ".";
  throw std::invalid_argument(err.str());
}

// Default VJP: primitives that do not override it cannot be differentiated in
// reverse mode, so report which primitive was hit.
std::vector<array> Primitive::vjp(
    const std::vector<array>&,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  std::ostringstream err;
  err << "[Primitive::vjp] Not implemented for " << name() << ".";
  throw std::invalid_argument(err.str());
}

// Default vmap: primitives that do not override it cannot be vectorized, so
// report which primitive was hit.
std::pair<std::vector<array>, std::vector<int>> Primitive::vmap(
    const std::vector<array>&,
    const std::vector<int>&) {
  std::ostringstream err;
  err << "[Primitive::vmap] Not implemented for " << name() << ".";
  throw std::invalid_argument(err.str());
}

// Default shape inference: primitives that do not override it cannot infer
// their output shapes, so report which primitive was hit.
std::vector<Shape> Primitive::output_shapes(const std::vector<array>&) {
  std::ostringstream err;
  err << "[Primitive::output_shapes] " << name()
      << " cannot infer output shapes.";
  throw std::invalid_argument(err.str());
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> Abs::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> Abs::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(tangents[0], sign(primals[0], stream()), stream())};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> Abs::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = abs(inputs[0], stream());
  return {{out}, axes};
}

// The tangent of a sum is the sum of the tangents that were supplied.
std::vector<array> Add::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  if (tangents.size() > 1) {
    return {add(tangents[0], tangents[1], stream())};
  }
  return {tangents[0]};
}

// Addition passes the cotangent through unchanged to every requested input.
std::vector<array> Add::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  if (argnums.size() != 1) {
    const auto& cotan = cotangents[0];
    return {cotan, cotan};
  }
  return cotangents;
}

// Align the operands on a common mapped axis, then add them.
std::pair<std::vector<array>, std::vector<int>> Add::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_ax] = vmap_binary_op(inputs, axes, stream());
  auto sum = add(lhs, rhs, stream());
  return {{sum}, {out_ax}};
}

std::vector<array> AddMM::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  auto& cotan = cotangents[0];
  std::vector<int> reorder(cotan.ndim());
  std::iota(reorder.begin(), reorder.end(), 0);
  std::iter_swap(reorder.end() - 1, reorder.end() - 2);
  for (auto arg : argnums) {
    if (arg == 0) {
      // M X N * (K X N).T -> M X K
      auto cotan_scaled = cotan;
      if (alpha_ != 1.) {
        auto alpha_arr = array(alpha_, cotan.dtype());
        cotan_scaled = (multiply(alpha_arr, cotan_scaled, stream()));
      }
      vjps.push_back(matmul(
          cotan_scaled, transpose(primals[1], reorder, stream()), stream()));
    } else if (arg == 1) {
      // (M X K).T * M X N -> K X N
      auto cotan_scaled = cotan;
      if (alpha_ != 1.) {
        auto alpha_arr = array(alpha_, cotan.dtype());
        cotan_scaled = (multiply(alpha_arr, cotan_scaled, stream()));
      }
      vjps.push_back(matmul(
          transpose(primals[0], reorder, stream()), cotan_scaled, stream()));
    } else {
      auto cotan_scaled = cotan;
      if (beta_ != 1.) {
        auto beta_arr = array(beta_, cotan.dtype());
        cotan_scaled = (multiply(beta_arr, cotan_scaled, stream()));
      }
      vjps.push_back(cotan_scaled);
    }
  }
  return vjps;
}

std::vector<array> AddMM::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  std::vector<array> jvp;
  for (int i = 0; i < argnums.size(); ++i) {
    auto arg = argnums[i];
    if (arg == 0) {
      if (jvp.empty()) {
        jvp.push_back(matmul(tangents[i], primals[1], stream()));
      } else {
        jvp[0] = addmm(jvp[0], tangents[i], primals[1], 1.0f, 1.0f, stream());
      }
    } else if (arg == 1) {
      if (jvp.empty()) {
        jvp.push_back(matmul(primals[0], tangents[i], stream()));
      } else {
        jvp[0] = addmm(jvp[0], primals[0], tangents[i], 1.0f, 1.0f, stream());
      }
    } else {
      if (jvp.empty()) {
        jvp.push_back(tangents[i]);
      } else {
        jvp[0] = add(jvp[0], tangents[i], stream());
      }
    }
  }
  return jvp;
}

bool AddMM::is_equivalent(const Primitive& other) const {
  const AddMM& a_other = static_cast<const AddMM&>(other);
  return (alpha_ == a_other.alpha_ && beta_ == a_other.beta_);
}

std::pair<std::vector<array>, std::vector<int>> AddMM::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto maybe_move_ax = [this](auto& arr, auto ax) {
    return ax > 0 ? moveaxis(arr, ax, 0, stream()) : arr;
  };
  auto a = maybe_move_ax(inputs[0], axes[0]);
  auto b = maybe_move_ax(inputs[1], axes[1]);
  auto c = maybe_move_ax(inputs[2], axes[2]);
  return {{addmm(c, a, b, alpha_, beta_, stream())}, {0}};
}

bool Arange::is_equivalent(const Primitive& other) const {
  const Arange& a_other = static_cast<const Arange&>(other);
  return (
      start_ == a_other.start_ && stop_ == a_other.stop_ &&
      step_ == a_other.step_);
}

std::vector<Shape> Arange::output_shapes(const std::vector<array>&) {
  auto real_size = std::ceil((stop_ - start_) / step_);
  return {{std::max(static_cast<int>(real_size), 0)}};
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> ArcCos::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> ArcCos::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(one, square(primals[0], stream()), stream());
  array denom = negative(rsqrt(t, stream()), stream());
  return {multiply(tangents[0], denom, stream())};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> ArcCos::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = arccos(inputs[0], stream());
  return {{out}, axes};
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> ArcCosh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> ArcCosh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(square(primals[0], stream()), one, stream());
  return {multiply(tangents[0], rsqrt(t, stream()), stream())};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> ArcCosh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = arccosh(inputs[0], stream());
  return {{out}, axes};
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> ArcSin::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> ArcSin::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(one, square(primals[0], stream()), stream());
  return {multiply(tangents[0], rsqrt(t, stream()), stream())};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> ArcSin::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = arcsin(inputs[0], stream());
  return {{out}, axes};
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> ArcSinh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> ArcSinh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = add(square(primals[0], stream()), one, stream());
  return {multiply(tangents[0], rsqrt(t, stream()), stream())};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> ArcSinh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = arcsinh(inputs[0], stream());
  return {{out}, axes};
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> ArcTan::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> ArcTan::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = add(one, square(primals[0], stream()), stream());
  return {divide(tangents[0], t, stream())};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> ArcTan::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = arctan(inputs[0], stream());
  return {{out}, axes};
}

std::vector<array> ArcTan2::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 2);
  assert(argnums.size() == 2);

  const auto& s = stream();
  const array& x1 = primals[0];
  const array& x2 = primals[1];
  const array& dy = cotangents[0];

  std::vector<array> grads;
  array dy_over_x1_x2_squared =
      divide(dy, add(square(x1, s), square(x2, s)), s);

  for (auto arg : argnums) {
    if (arg == 0) {
      grads.emplace_back(multiply(x2, dy_over_x1_x2_squared, s));
    } else {
      grads.emplace_back(multiply(negative(x1, s), dy_over_x1_x2_squared, s));
    }
  }

  return grads;
}

std::vector<array> ArcTan2::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 2);
  assert(argnums.size() == 2);

  const auto& s = stream();
  const array& x1 = primals[0];
  const array& x2 = primals[1];
  const array& dx1 = tangents[0];
  const array& dx2 = tangents[1];

  return {divide(
      subtract(multiply(x2, dx1, s), multiply(x1, dx2, s), s),
      add(square(x1, s), square(x2, s), s),
      s)};
}

// Align the operands on a common mapped axis, then apply arctan2.
std::pair<std::vector<array>, std::vector<int>> ArcTan2::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 2);
  assert(axes.size() == 2);
  auto [lhs, rhs, out_ax] = vmap_binary_op(inputs, axes, stream());
  auto out = arctan2(lhs, rhs, stream());
  return {{out}, {out_ax}};
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> ArcTanh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> ArcTanh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array one = array(1., primals[0].dtype());
  array t = subtract(one, square(primals[0], stream()), stream());
  return {divide(tangents[0], t, stream())};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> ArcTanh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = arctanh(inputs[0], stream());
  return {{out}, axes};
}

std::pair<std::vector<array>, std::vector<int>> ArgPartition::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  int axis_left = axes[0] >= 0 && axes[0] <= axis_;
  return {{argpartition(inputs[0], axis_ + axis_left, stream())}, axes};
}

// The gradient is all zeros with the shape and type of the input.
std::vector<array> ArgPartition::vjp(
    const std::vector<array>& primals,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto grad = zeros_like(primals[0], stream());
  return {grad};
}

// The output tangent is all zeros, shaped and typed like the input tangent.
std::vector<array> ArgPartition::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  auto tangent = zeros_like(tangents[0], stream());
  return {tangent};
}

bool ArgPartition::is_equivalent(const Primitive& other) const {
  const ArgPartition& r_other = static_cast<const ArgPartition&>(other);
  return axis_ == r_other.axis_ && kth_ == r_other.kth_;
}

bool ArgReduce::is_equivalent(const Primitive& other) const {
  const ArgReduce& r_other = static_cast<const ArgReduce&>(other);
  return reduce_type_ == r_other.reduce_type_ && axis_ == r_other.axis_;
}

// Applies argmin / argmax to the batched input, keeping the reduced
// dimension so the mapped axis stays where it was.
std::pair<std::vector<array>, std::vector<int>> ArgReduce::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  const int batch_ax = axes[0];

  // The reduction axis shifts right by one when the mapped axis sits at or
  // before it.
  int reduce_ax = axis_;
  if (batch_ax >= 0 && axis_ >= batch_ax) {
    ++reduce_ax;
  }

  const auto& in = inputs[0];
  auto result = (reduce_type_ == ArgReduce::ArgMin)
      ? argmin(in, reduce_ax, true, stream())
      : argmax(in, reduce_ax, true, stream());
  return {{result}, axes};
}

// The gradient is all zeros with the shape and type of the input.
std::vector<array> ArgReduce::vjp(
    const std::vector<array>& primals,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto grad = zeros_like(primals[0], stream());
  return {grad};
}

std::vector<array> ArgReduce::jvp(
    const std::vector<array>& primals,
    const std::vector<array>&,
    const std::vector<int>&) {
  auto shape = output_shapes(primals)[0];
  return {zeros(shape, uint32, stream())};
}

// Applies argsort to the batched input. The sort axis shifts right by one
// when the mapped axis sits at or before it.
std::pair<std::vector<array>, std::vector<int>> ArgSort::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  const int batch_ax = axes[0];
  int sort_ax = axis_;
  if (batch_ax >= 0 && batch_ax <= axis_) {
    ++sort_ax;
  }
  auto out = argsort(inputs[0], sort_ax, stream());
  return {{out}, axes};
}

std::vector<Shape> ArgReduce::output_shapes(const std::vector<array>& inputs) {
  auto out_shape = inputs[0].shape();
  out_shape[axis_] = 1;
  return {std::move(out_shape)};
}

bool ArgSort::is_equivalent(const Primitive& other) const {
  const ArgSort& r_other = static_cast<const ArgSort&>(other);
  return axis_ == r_other.axis_;
}

// The gradient is all zeros with the shape and type of the input.
std::vector<array> ArgSort::vjp(
    const std::vector<array>& primals,
    const std::vector<array>&,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto grad = zeros_like(primals[0], stream());
  return {grad};
}

// The output tangent is a uint32 array of zeros with the input's shape.
std::vector<array> ArgSort::jvp(
    const std::vector<array>& primals,
    const std::vector<array>&,
    const std::vector<int>&) {
  const auto& in_shape = primals[0].shape();
  return {zeros(in_shape, uint32, stream())};
}

// Casts the cotangent back to the dtype of the primal input.
//
// Throws std::invalid_argument if the cotangent's dtype is not the dtype this
// primitive casts to.
std::vector<array> AsType::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  const auto& cotan = cotangents[0];
  const bool dtype_matches = !(cotan.dtype() != dtype_);
  if (!dtype_matches) {
    throw std::invalid_argument(
        "[astype] Type of cotangents does not match primal output type.");
  }
  const auto in_dtype = primals[0].dtype();
  return {astype(cotan, in_dtype, stream())};
}

// Casts the tangent to the target dtype.
std::vector<array> AsType::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto cast_tangent = astype(tangents[0], dtype_, stream());
  return {cast_tangent};
}

// Casts the batched input and keeps the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> AsType::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto out = astype(inputs[0], dtype_, stream());
  return {{out}, axes};
}

bool AsType::is_equivalent(const Primitive& other) const {
  const AsType& a_other = static_cast<const AsType&>(other);
  return dtype_ == a_other.dtype_;
}

std::vector<array> AsStrided::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(argnums.size() == 1);

  // Extract the sizes and cast them to ints
  int grad_size = primals[0].size();
  int cotangents_size = cotangents[0].size();

  // Make a flat container to hold the gradients
  auto grad = zeros_like(primals[0], stream());
  grad = reshape(grad, {grad_size}, stream());

  // Create the indices that map output to input
  auto idx = arange(grad_size, stream());
  idx = as_strided(idx, shape_, strides_, offset_, stream());
  idx = reshape(idx, {cotangents_size}, stream());

  // Reshape the cotangentsgent for use with scatter
  auto flat_cotangents = reshape(cotangents[0], {cotangents_size, 1}, stream());

  // Finally accumulate the gradients and reshape them to look like the input
  grad = scatter_add(grad, idx, flat_cotangents, 0, stream());
  grad = reshape(grad, primals[0].shape(), stream());

  return {grad};
}

// Applies the same strided view to the tangent.
std::vector<array> AsStrided::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);

  auto view = as_strided(tangents[0], shape_, strides_, offset_, stream());
  return {view};
}

bool AsStrided::is_equivalent(const Primitive& other) const {
  const AsStrided& a_other = static_cast<const AsStrided&>(other);
  return shape_ == a_other.shape_ && strides_ == a_other.strides_ &&
      offset_ == a_other.offset_;
}

bool BitwiseBinary::is_equivalent(const Primitive& other) const {
  const BitwiseBinary& a_other = static_cast<const BitwiseBinary&>(other);
  return op_ == a_other.op_;
}

std::pair<std::vector<array>, std::vector<int>> BitwiseBinary::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {
      {array(
          a.shape(),
          a.dtype(),
          std::make_shared<BitwiseBinary>(stream(), op_),
          {a, b})},
      {to_ax}};
}

std::vector<array> BitwiseBinary::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 2);
  std::vector<array> vjps = {zeros_like(tangents[0], stream())};
  if (argnums.size() > 1) {
    vjps.push_back(vjps.back());
  }
  return vjps;
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> BitwiseBinary::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Reduces a cotangent back to the shape of the primal that was broadcast.
//
// Leading axes that the primal does not have are summed and then squeezed
// away. Axes where the primal's size differs from the cotangent's are summed
// with the dimension kept.
std::vector<array>
broadcast_vjp(const array& primal, const array& cotan, const Stream& s) {
  auto& primal_shape = primal.shape();
  int extra = cotan.ndim() - primal_shape.size();

  // Leading axes introduced by the broadcast
  std::vector<int> leading(extra);
  std::iota(leading.begin(), leading.end(), 0);

  // Leading axes plus every axis whose size was expanded
  std::vector<int> to_sum(leading);
  for (int j = 0; j + extra < cotan.ndim(); ++j) {
    const int i = j + extra;
    if (primal_shape[j] != cotan.shape(i)) {
      to_sum.push_back(i);
    }
  }

  auto summed = sum(cotan, to_sum, true, s);
  return {squeeze(summed, leading, s)};
}

std::vector<array> Broadcast::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  return broadcast_vjp(primals[0], cotangents[0], stream());
}

std::vector<array> Broadcast::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return {array(
      shape_,
      tangents[0].dtype(),
      std::make_shared<Broadcast>(stream(), shape_),
      tangents)};
}

// Broadcasts the batched input to the target shape extended with the mapped
// dimension.
//
// The batched input has one more dimension than the unbatched one, so the
// target shape gains the size of the mapped axis at the matching position.
// The extended shape is built in a local copy: the primitive's own `shape_`
// is left untouched so that calling vmap does not change what this primitive
// represents.
std::pair<std::vector<array>, std::vector<int>> Broadcast::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto& in = inputs[0];
  auto out_shape = shape_;
  if (ax >= 0) {
    // Number of leading dimensions the broadcast adds in front of the
    // (unbatched) input.
    int diff = out_shape.size() - in.ndim() + 1;
    assert(diff >= 0);
    out_shape.insert(out_shape.begin() + ax + diff, in.shape(ax));
    ax += diff;
  }
  return {{broadcast_to(in, out_shape, stream())}, {ax}};
}

bool Broadcast::is_equivalent(const Primitive& other) const {
  const Broadcast& b_other = static_cast<const Broadcast&>(other);
  return shape_ == b_other.shape_;
}

// Folds broadcast_shapes over the shapes of all inputs, left to right.
Shape Broadcast::output_shape(const std::vector<array>& inputs) {
  auto result = inputs[0].shape();
  for (auto it = inputs.begin() + 1; it != inputs.end(); ++it) {
    result = broadcast_shapes(result, it->shape());
  }
  return result;
}

// With two or more inputs the output shape is the broadcast of all input
// shapes. With a single input it is the stored target shape, provided the
// input still broadcasts to it; otherwise std::invalid_argument is thrown.
std::vector<Shape> Broadcast::output_shapes(const std::vector<array>& inputs) {
  if (inputs.size() >= 2) {
    return {output_shape(inputs)};
  }
  if (broadcast_shapes(inputs[0].shape(), shape_) != shape_) {
    throw std::invalid_argument("[Broadcast] Unable to infer broadcast shape");
  }
  return {shape_};
}

std::vector<array> BroadcastAxes::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  return broadcast_vjp(primals[0], cotangents[0], stream());
}

std::vector<array> BroadcastAxes::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return {array(
      output_shape(primals, ignore_axes_),
      tangents[0].dtype(),
      std::make_shared<BroadcastAxes>(stream(), ignore_axes_),
      tangents)};
}

// vmap is not implemented for BroadcastAxes yet; always throws
// std::invalid_argument.
std::pair<std::vector<array>, std::vector<int>> BroadcastAxes::vmap(
    const std::vector<array>& /* inputs */,
    const std::vector<int>& /* axes */) {
  throw std::invalid_argument("[BroadcastAxes] VMAP NYI");
}

// Two BroadcastAxes primitives are equivalent when they ignore the same axes.
bool BroadcastAxes::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const BroadcastAxes&>(other);
  const bool same_axes = (ignore_axes_ == rhs.ignore_axes_);
  return same_axes;
}

// Computes the broadcast shape of the inputs with some axes left out.
//
// The axes in `ignore_axes` are added to each input's rank to form an index,
// i.e. they are offsets counted from the end of the shape. They are removed
// from every input shape, the remaining shapes are broadcast together, and the
// ignored dimensions are then re-inserted using the sizes of the first input.
Shape BroadcastAxes::output_shape(
    const std::vector<array>& inputs,
    const std::vector<int>& ignore_axes) {
  Shape result{};
  for (const auto& in : inputs) {
    auto kept = in.shape();
    // Erase in reverse order of `ignore_axes`.
    for (int k = static_cast<int>(ignore_axes.size()) - 1; k >= 0; --k) {
      kept.erase(kept.begin() + in.ndim() + ignore_axes[k]);
    }
    result = broadcast_shapes(result, kept);
  }

  // Rank of the final shape once the ignored axes are back in place.
  int dims = ignore_axes.size() + result.size();
  for (std::size_t k = 0; k < ignore_axes.size(); ++k) {
    const auto ax = ignore_axes[k];
    result.insert(result.begin() + dims + ax, inputs[0].shape(ax));
  }
  return result;
}

// Single output whose shape is inferred from the inputs and the ignored axes.
std::vector<Shape> BroadcastAxes::output_shapes(
    const std::vector<array>& inputs) {
  auto out_shape = output_shape(inputs, ignore_axes_);
  return {out_shape};
}

// Reuses the JVP rule with the cotangents standing in for the tangents.
std::vector<array> Ceil::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// The tangent is all zeros with the shape and type of the input.
std::vector<array> Ceil::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto zero_tangent = zeros_like(primals[0], stream());
  return {zero_tangent};
}

// Unary op: apply it to the batched input and keep the mapped axis unchanged.
std::pair<std::vector<array>, std::vector<int>> Ceil::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = ceil(inputs[0], stream());
  return {{out}, axes};
}

std::pair<std::vector<array>, std::vector<int>> Cholesky::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0] >= 0 ? 0 : -1;
  auto a = axes[0] > 0 ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
  return {{linalg::cholesky(a, upper_, stream())}, {ax}};
}

std::pair<std::vector<array>, std::vector<int>> Eig::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  bool needs_move = axes[0] >= (inputs[0].ndim() - 2);
  auto a = needs_move ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
  auto ax = needs_move ? 0 : axes[0];

  std::vector<array> outputs;
  if (compute_eigenvectors_) {
    auto [values, vectors] = linalg::eig(a, stream());
    outputs = {values, vectors};
  } else {
    outputs = {linalg::eigvals(a, stream())};
  }

  return {outputs, std::vector<int>(outputs.size(), ax)};
}

std::vector<Shape> Eig::output_shapes(const std::vector<array>& inputs) {
  auto shape = inputs[0].shape();
  shape.pop_back(); // Remove last dimension for eigenvalues
  if (compute_eigenvectors_) {
    return {
        std::move(shape), inputs[0].shape()}; // Eigenvalues and eigenvectors
  } else {
    return {std::move(shape)}; // Only eigenvalues
  }
}

// Two Eig primitives are equivalent when both do, or both do not, compute
// eigenvectors.
bool Eig::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const Eig&>(other);
  const bool same_mode = (compute_eigenvectors_ == rhs.compute_eigenvectors_);
  return same_mode;
}

std::pair<std::vector<array>, std::vector<int>> Eigh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  bool needs_move = axes[0] >= (inputs[0].ndim() - 2);
  auto a = needs_move ? moveaxis(inputs[0], axes[0], 0, stream()) : inputs[0];
  auto ax = needs_move ? 0 : axes[0];

  std::vector<array> outputs;
  if (compute_eigenvectors_) {
    auto [values, vectors] = linalg::eigh(a, uplo_, stream());
    outputs = {values, vectors};
  } else {
    outputs = {linalg::eigvalsh(a, uplo_, stream())};
  }

  return {outputs, std::vector<int>(outputs.size(), ax)};
}

std::vector<Shape> Eigh::output_shapes(const std::vector<array>& inputs) {
  auto shape = inputs[0].shape();
  shape.pop_back(); // Remove last dimension for eigenvalues
  if (compute_eigenvectors_) {
    return {
        std::move(shape), inputs[0].shape()}; // Eigenvalues and eigenvectors
  } else {
    return {std::move(shape)}; // Only eigenvalues
  }
}

// Two Eigh primitives are equivalent when both uplo_ and the
// compute_eigenvectors_ flag match. The cast is an unchecked static_cast.
bool Eigh::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const Eigh&>(other);
  if (!(uplo_ == rhs.uplo_)) {
    return false;
  }
  return compute_eigenvectors_ == rhs.compute_eigenvectors_;
}

std::vector<array> Concatenate::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto& cotan = cotangents[0];
  Shape start(cotan.ndim(), 0);
  Shape stop = cotan.shape();

  Shape sizes;
  sizes.push_back(0);
  for (auto& p : primals) {
    sizes.push_back(p.shape(axis_));
  }
  std::partial_sum(sizes.cbegin(), sizes.cend(), sizes.begin());

  std::vector<array> grads;
  for (auto i : argnums) {
    start[axis_] = sizes[i];
    stop[axis_] = sizes[i + 1];
    grads.push_back(slice(cotan, start, stop, stream()));
  }
  return grads;
}

std::vector<array> Concatenate::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  std::vector<int> argidx(argnums.size());
  std::iota(argidx.begin(), argidx.end(), 0);
  std::sort(argidx.begin(), argidx.end(), [&argnums](int a, int b) {
    return argnums[a] < argnums[b];
  });

  std::vector<array> vals;
  for (int i = 0, j = 0; i < primals.size(); ++i) {
    if (j < argnums.size() && argnums[argidx[j]] == i) {
      vals.push_back(tangents[argidx[j++]]);
    } else {
      vals.push_back(zeros_like(primals[i], stream()));
    }
  }
  return {concatenate(vals, axis_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Concatenate::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  int out_ax = -1;
  int first_vmap = -1;

  // Find the first vmapped input
  for (int i = 0; i < axes.size(); i++) {
    if (axes[i] >= 0) {
      out_ax = axes[i];
      first_vmap = i;
      break;
    }
  }

  // No vmap, should we even be in here?
  if (out_ax < 0) {
    return {{concatenate(inputs, axis_, stream())}, {out_ax}};
  }

  // Make sure vmapped arrays have all vmapped axes in the same location and
  // expand non-vmapped arrays to be compatible with the vmapped ones.
  std::vector<array> t_inputs;
  int axis = axis_ + (axis_ >= out_ax);
  auto cat_shape = inputs[first_vmap].shape();
  for (int i = 0; i < axes.size(); i++) {
    if (axes[i] >= 0) {
      if (out_ax != axes[i]) {
        t_inputs.push_back(moveaxis(inputs[i], axes[i], out_ax, stream()));
      } else {
        t_inputs.push_back(inputs[i]);
      }
    } else {
      cat_shape[axis] = inputs[i].shape(axis_);
      t_inputs.push_back(broadcast_to(
          expand_dims(inputs[i], out_ax, stream()), cat_shape, stream()));
    }
  }

  return {{concatenate(t_inputs, axis, stream())}, {out_ax}};
}

bool Concatenate::is_equivalent(const Primitive& other) const {
  const Concatenate& c_other = static_cast<const Concatenate&>(other);
  return axis_ == c_other.axis_;
}

std::vector<Shape> Concatenate::output_shapes(
    const std::vector<array>& inputs) {
  auto shape = inputs[0].shape();
  for (int i = 1; i < inputs.size(); ++i) {
    shape[axis_] += inputs[i].shape(axis_);
  }
  return {std::move(shape)};
}

// vmap rule for Conjugate: applies conjugate to the batched input and leaves
// the batch axes unchanged.
std::pair<std::vector<array>, std::vector<int>> Conjugate::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = conjugate(inputs[0], stream());
  return {{out}, axes};
}

std::vector<array> Contiguous::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  return {cotangents};
}

std::vector<array> Contiguous::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {tangents};
}

// vmap rule for Contiguous: re-applies contiguous (with the same
// allow_col_major_ setting) to the batched input; batch axes are unchanged.
std::pair<std::vector<array>, std::vector<int>> Contiguous::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto out = contiguous(inputs[0], allow_col_major_, stream());
  return {{out}, axes};
}

bool Contiguous::is_equivalent(const Primitive& other) const {
  const Contiguous& c_other = static_cast<const Contiguous&>(other);
  return allow_col_major_ == c_other.allow_col_major_;
}

// Computes the gradient of a convolution with respect to its weight by
// viewing the zero-padded input as strided patches and multiplying them with
// the flattened cotangent.
//
// in             : convolution input; axes 1 .. ndim-2 are padded as spatial
//                  axes
// wt             : convolution weight; only its shape is used
// cotan          : cotangent of the convolution output
// kernel_strides : per-spatial-axis stride of the forward convolution
// padding_lo/hi  : per-spatial-axis padding of the forward convolution
// s              : stream or device the ops are issued on
//
// Returns an array with the same shape as wt.
array conv_weight_backward_patches(
    const array& in,
    const array& wt,
    const array& cotan,
    const std::vector<int>& kernel_strides,
    const std::vector<int>& padding_lo,
    const std::vector<int>& padding_hi,
    StreamOrDevice s) {
  // Resolve padded input shape: every spatial axis grows by its low and high
  // padding
  auto in_padded_shape = in.shape();
  for (int i = 1; i < in.ndim() - 1; i++) {
    in_padded_shape[i] += padding_lo[i - 1] + padding_hi[i - 1];
  }

  // padded strides (contiguous)
  Strides in_padded_strides(in.ndim(), 1);
  for (int i = in.ndim() - 2; i >= 0; --i) {
    in_padded_strides[i] = in_padded_strides[i + 1] * in_padded_shape[i + 1];
  }

  // Pad input
  std::vector<int> padded_axes(in.ndim() - 2, 0);
  std::iota(padded_axes.begin(), padded_axes.end(), 1);
  auto in_padded =
      pad(in,
          padded_axes,
          Shape(padding_lo.begin(), padding_lo.end()),
          Shape(padding_hi.begin(), padding_hi.end()),
          array(0, in.dtype()),
          "constant",
          s);

  // Resolve strided patches

  // patches are shaped as
  // (batch_dim, out_spatial_dims, weight_spatial_dims, in_channels)
  Shape patches_shape{cotan.shape().begin(), cotan.shape().end() - 1};
  patches_shape.insert(
      patches_shape.end(), wt.shape().begin() + 1, wt.shape().end());

  // Resolve patch strides: stepping along an output spatial axis advances by
  // the kernel stride in the padded input, while the trailing (weight
  // spatial, channel) axes reuse the padded input strides directly.
  int n_spatial_dim = in.ndim() - 2;
  Strides patches_strides(patches_shape.size(), 1);
  patches_strides[0] = in_padded_strides[0];
  for (int i = 1; i < n_spatial_dim + 1; i++) {
    patches_strides[i] = in_padded_strides[i] * kernel_strides[i - 1];
  }
  for (int i = 1; i < in.ndim(); i++) {
    patches_strides[n_spatial_dim + i] = in_padded_strides[i];
  }

  // Make patches from in
  auto in_patches = as_strided(in_padded, patches_shape, patches_strides, 0, s);

  // Prepare for matmul
  int O = wt.shape(0);
  auto cotan_mat = reshape(cotan, {-1, O}, s);
  in_patches = reshape(in_patches, {cotan_mat.shape(0), -1}, s);

  auto grad = matmul(transpose(cotan_mat, {1, 0}, s), in_patches, s);
  grad = reshape(grad, wt.shape(), s);
  return grad;
}

namespace {

// Output extent of one convolution axis: the number of stride-sized steps
// that fit in (in_dim + padding - wt_dim), plus one. `padding` is the total
// padding applied to the axis. The division is plain integer division.
inline int conv_out_axis_size(int in_dim, int wt_dim, int stride, int padding) {
  const int span = in_dim + padding - wt_dim;
  return span / stride + 1;
}

// Extent of an axis of size `dim` after dilation by `dil`: (dil - 1) gaps are
// inserted between each pair of neighbouring elements.
inline int dilate_size(int dim, int dil) {
  const int gaps = dim - 1;
  return dil * gaps + 1;
}

} // namespace

// Validates convolution parameters and computes the output shape.
//
// The output has the same rank as in_shape: its first entry is in_shape[0],
// its last entry is wt_shape[0], and every axis in between is derived from
// the dilated input and kernel sizes, the stride and the total padding.
//
// Throws std::invalid_argument when a parameter list does not have one entry
// per spatial dimension, when a dilation or stride is not positive, when a
// padding is negative, or when a computed output extent is not positive.
Shape Convolution::conv_out_shape(
    const Shape& in_shape,
    const Shape& wt_shape,
    const std::vector<int>& strides,
    const std::vector<int>& pads_lo,
    const std::vector<int>& pads_hi,
    const std::vector<int>& kernel_dilation,
    const std::vector<int>& input_dilation) {
  const int batch = in_shape[0];
  const int out_channels = wt_shape[0];

  Shape out_shape(in_shape.size());
  out_shape[0] = batch;

  int spatial_dims = in_shape.size() - 2;

  // Each per-axis parameter list needs exactly one entry per spatial axis.
  if (strides.size() != spatial_dims) {
    std::ostringstream msg;
    msg << "[conv] Invalid strides " << strides << " for " << spatial_dims
        << "D convolution.";
    throw std::invalid_argument(msg.str());
  }

  if (pads_lo.size() != spatial_dims || pads_hi.size() != spatial_dims) {
    std::ostringstream msg;
    msg << "[conv] Invalid padding " << pads_lo << " | " << pads_hi << " for "
        << spatial_dims << "D convolution.";
    throw std::invalid_argument(msg.str());
  }

  if (kernel_dilation.size() != spatial_dims) {
    std::ostringstream msg;
    msg << "[conv] Invalid kernel dilation " << kernel_dilation << " for "
        << spatial_dims << "D convolution.";
    throw std::invalid_argument(msg.str());
  }

  if (input_dilation.size() != spatial_dims) {
    std::ostringstream msg;
    msg << "[conv] Invalid input dilation " << input_dilation << " for "
        << spatial_dims << "D convolution.";
    throw std::invalid_argument(msg.str());
  }

  // Walk the spatial axes; `ax` indexes the shapes, `d` the per-axis
  // parameter lists.
  int ax = 1;
  while (ax < in_shape.size() - 1) {
    const int d = ax - 1;

    if (kernel_dilation[d] <= 0) {
      std::ostringstream msg;
      msg << "[conv] Kernel dilation sizes must be positive."
          << " Got kernel dilation " << kernel_dilation << ".";
      throw std::invalid_argument(msg.str());
    }

    if (input_dilation[d] <= 0) {
      std::ostringstream msg;
      msg << "[conv] Input dilation sizes must be positive."
          << " Got input dilation " << input_dilation << ".";
      throw std::invalid_argument(msg.str());
    }

    if (pads_lo[d] < 0 || pads_hi[d] < 0) {
      std::ostringstream msg;
      msg << "[conv] Padding sizes must be non-negative. Got padding "
          << pads_lo << " | " << pads_hi << ".";
      throw std::invalid_argument(msg.str());
    }

    if (strides[d] <= 0) {
      std::ostringstream msg;
      msg << "[conv] Stride sizes must be positive."
          << " Got strides " << strides << ".";
      throw std::invalid_argument(msg.str());
    }

    const int dilated_kernel = dilate_size(wt_shape[ax], kernel_dilation[d]);
    const int dilated_input = dilate_size(in_shape[ax], input_dilation[d]);
    const int total_padding = pads_lo[d] + pads_hi[d];

    out_shape[ax] = conv_out_axis_size(
        dilated_input, dilated_kernel, strides[d], total_padding);

    if (out_shape[ax] <= 0) {
      std::ostringstream msg;
      msg << "[conv] Spatial dimensions of input after padding"
          << " cannot be smaller than weight spatial dimensions."
          << " Got error at axis " << ax << " for input with shape " << in_shape
          << ", padding low " << pads_lo << ", padding high " << pads_hi
          << ", and weight of shape " << wt_shape << ".";
      throw std::invalid_argument(msg.str());
    }

    ++ax;
  }
  out_shape[ax] = out_channels;

  return out_shape;
}

std::vector<array> Convolution::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 2);
  std::vector<array> grads;

  // Collect info
  auto& in = primals[0];
  auto& wt = primals[1];
  auto& cotan = cotangents[0];

  auto group_transpose =
      [this](const array& x, int group_dim, int ax_a, int ax_b) {
        if (groups_ > 1) {
          auto shape = x.shape();
          if (group_dim < 0) {
            group_dim += shape.size();
          }
          shape.insert(shape.begin() + group_dim, groups_);
          shape[group_dim + 1] = shape[group_dim + 1] / groups_;
          auto x_trans = swapaxes(
              reshape(x, std::move(shape), stream()), ax_a, ax_b, stream());
          return flatten(x_trans, group_dim, group_dim + 1, stream());
        } else {
          return swapaxes(x, 0, -1, stream());
        }
      };

  for (int a : argnums) {
    // Grads for input
    if (a == 0) {
      std::vector<int> padding_lo = padding_lo_;
      std::vector<int> padding_hi = padding_hi_;

      for (int i = 0; i < padding_lo.size(); ++i) {
        int wt_size = 1 + kernel_dilation_[i] * (wt.shape(1 + i) - 1);
        padding_lo[i] = wt_size - padding_lo_[i] - 1;

        int in_size = 1 + input_dilation_[i] * (in.shape(1 + i) - 1);
        int out_size = 1 + kernel_strides_[i] * (cotan.shape(1 + i) - 1);
        padding_hi[i] = in_size - out_size + padding_hi_[i];
      }

      // Check for negative padding
      bool has_neg_padding = false;
      for (auto& pd : padding_lo) {
        has_neg_padding |= (pd < 0);
      }
      for (auto& pd : padding_hi) {
        has_neg_padding |= (pd < 0);
      }

      auto wt_trans = group_transpose(wt, 0, 1, -1);
      auto grad = conv_general(
          /* const array& input = */ cotan,
          /* const array& weight = */ wt_trans,
          /* std::vector<int> stride = */ input_dilation_,
          /* std::vector<int> padding_lo = */ padding_lo,
          /* std::vector<int> padding_hi = */ padding_hi,
          /* std::vector<int> kernel_dilation = */ kernel_dilation_,
          /* std::vector<int> input_dilation = */ kernel_strides_,
          /* int groups = */ groups_,
          /* bool flip = */ !flip_,
          stream());

      // Handle negative padding
      if (has_neg_padding) {
        Shape starts(grad.ndim(), 0);
        auto stops = grad.shape();

        for (int i = 0; i < grad.ndim() - 2; i++) {
          if (padding_lo[i] < 0) {
            starts[i + 1] -= padding_lo[i];
          }
          if (padding_hi[i] < 0) {
            stops[i + 1] += padding_hi[i];
          }
        }

        grad = slice(grad, std::move(starts), std::move(stops), stream());
      }

      grads.push_back(grad);
    }
    // Grads for weight
    else if (a == 1) {
      bool no_dilation = true;

      for (int i = 0; i < input_dilation_.size(); i++) {
        no_dilation &= (input_dilation_[i] == 1) && (kernel_dilation_[i] == 1);
      }

      if (no_dilation && !flip_ && groups_ == 1) {
        auto grad = conv_weight_backward_patches(
            in, wt, cotan, kernel_strides_, padding_lo_, padding_hi_, stream());
        grads.push_back(grad);
      } else {
        auto padding_hi = padding_lo_;

        for (int i = 0; i < padding_hi.size(); ++i) {
          int in_size = 1 + input_dilation_[i] * (in.shape(1 + i) - 1);
          int out_size = 1 + kernel_strides_[i] * (cotan.shape(1 + i) - 1);
          int wt_size = 1 + kernel_dilation_[i] * (wt.shape(1 + i) - 1);
          padding_hi[i] = out_size - in_size + wt_size - padding_hi[i] - 1;
        }

        auto cotan_trans = swapaxes(cotan, 0, -1, stream());
        auto in_trans = group_transpose(in, -1, 0, -1);

        auto grad_trans = conv_general(
            /* const array& input = */ in_trans,
            /* const array& weight = */ cotan_trans,
            /* std::vector<int> stride = */ kernel_dilation_,
            /* std::vector<int> padding_lo = */ padding_lo_,
            /* std::vector<int> padding_hi = */ padding_hi,
            /* std::vector<int> kernel_dilation = */ kernel_strides_,
            /* std::vector<int> input_dilation = */ input_dilation_,
            /* int groups = */ groups_,
            /* bool flip = */ false,
            stream());
        if (flip_) {
          auto start = Shape(grad_trans.ndim(), 0);
          auto stop = Shape(grad_trans.ndim(), 0);
          auto strides = Shape(grad_trans.ndim(), 1);
          for (int i = 0; i < stop.size(); ++i) {
            if (i >= 1 && i < stop.size() - 1) {
              start[i] = grad_trans.shape(i);
              stop[i] = -start[i] - 1;
              strides[i] = -1;
            } else {
              stop[i] = grad_trans.shape(i);
            }
          }
          grad_trans = slice(grad_trans, start, stop, strides, stream());
        }
        grads.push_back(swapaxes(grad_trans, 0, -1, stream()));
      }
    }
  }

  return grads;
}

// vmap rule for Convolution. Depending on which of {input, weight} carries a
// batch axis, the batch is folded into the input batch dimension, into the
// weight's output channels, or into extra groups, and unfolded again on the
// output.
std::pair<std::vector<array>, std::vector<int>> Convolution::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  // Applies this primitive's convolution settings with the given group count.
  auto run_conv = [&](const array& x, const array& weight, int groups) {
    return conv_general(
        x,
        weight,
        kernel_strides_,
        padding_lo_,
        padding_hi_,
        kernel_dilation_,
        input_dilation_,
        groups,
        flip_,
        stream());
  };

  const int in_ax = axes[0];
  const int w_ax = axes[1];
  const bool in_vmap = in_ax >= 0;
  const bool w_vmap = w_ax >= 0;
  auto in = inputs[0];
  auto w = inputs[1];

  // Nothing is vmapped: plain convolution.
  if (!in_vmap && !w_vmap) {
    return {{run_conv(in, w, groups_)}, {-1}};
  }

  // use a group convolution when both inputs are vmapped
  if (in_vmap && w_vmap) {
    auto b = in.shape(in_ax);
    in = moveaxis(in, in_ax, -2, stream());
    in = flatten(in, -2, -1, stream());
    if (w_ax > 0) {
      w = moveaxis(w, w_ax, 0, stream());
    }
    auto c_out = w.shape(1);
    w = flatten(w, 0, 1, stream());
    auto out = run_conv(in, w, groups_ * b);
    out = unflatten(out, -1, {b, c_out}, stream());
    return {{out}, {static_cast<int>(out.ndim() - 2)}};
  }

  // Only the input is vmapped: flatten / unflatten the batch dimension of the
  // input / output
  if (in_vmap) {
    if (in_ax > 0) {
      in = moveaxis(in, in_ax, 0, stream());
    }
    auto out = run_conv(flatten(in, 0, 1, stream()), w, groups_);
    out = unflatten(out, 0, {in.shape(0), in.shape(1)}, stream());
    return {{out}, {0}};
  }

  // Only the weight is vmapped: flatten into the output channels of w and
  // unflatten the channels of the output
  if (w_ax > 0) {
    w = moveaxis(w, w_ax, 0, stream());
  }
  auto out = run_conv(in, flatten(w, 0, 1, stream()), groups_);
  out = unflatten(out, -1, {w.shape(0), w.shape(1)}, stream());
  return {{out}, {static_cast<int>(out.ndim() - 2)}};
}

bool Convolution::is_equivalent(const Primitive& other) const {
  const Convolution& c_other = static_cast<const Convolution&>(other);
  return padding_lo_ == c_other.padding_lo_ &&
      padding_hi_ == c_other.padding_hi_ &&
      kernel_strides_ == c_other.kernel_strides_ &&
      kernel_dilation_ == c_other.kernel_dilation_ &&
      input_dilation_ == c_other.input_dilation_ &&
      groups_ == c_other.groups_ && flip_ == c_other.flip_;
}

// Output shape for Convolution: delegates to conv_out_shape with the shapes
// of inputs[0] (input) and inputs[1] (weight) and this primitive's settings.
std::vector<Shape> Convolution::output_shapes(
    const std::vector<array>& inputs) {
  const auto& in_shape = inputs[0].shape();
  const auto& wt_shape = inputs[1].shape();
  return {conv_out_shape(
      in_shape,
      wt_shape,
      kernel_strides_,
      padding_lo_,
      padding_hi_,
      kernel_dilation_,
      input_dilation_)};
}

std::vector<array> Copy::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return cotangents;
}

std::vector<array> Copy::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return tangents;
}

// vmap rule for Copy: copies the batched input; batch axes are unchanged.
std::pair<std::vector<array>, std::vector<int>> Copy::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = copy(inputs[0], stream());
  return {{out}, axes};
}

// VJP for Cos: delegates to jvp with the cotangents in place of tangents.
std::vector<array> Cos::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> grads = jvp(primals, cotangents, argnums);
  return grads;
}

// JVP for Cos: tangent * (-sin(x)).
std::vector<array> Cos::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto neg_sin = negative(sin(primals[0], stream()), stream());
  return {multiply(tangents[0], neg_sin, stream())};
}

// vmap rule for Cos: applies cos to the batched input; batch axes are
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Cos::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = cos(inputs[0], stream());
  return {{out}, axes};
}

// VJP for Cosh: delegates to jvp with the cotangents in place of tangents.
std::vector<array> Cosh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> grads = jvp(primals, cotangents, argnums);
  return grads;
}

// JVP for Cosh: tangent * sinh(x).
std::vector<array> Cosh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto derivative = sinh(primals[0], stream());
  return {multiply(tangents[0], derivative, stream())};
}

// vmap rule for Cosh: applies cosh to the batched input; batch axes are
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Cosh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = cosh(inputs[0], stream());
  return {{out}, axes};
}

std::vector<array> CustomTransforms::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  // Extract the inputs to the VJP function
  std::vector<array> inputs(primals.begin(), primals.end() - num_outputs_);

  // Compute all the vjps
  auto all_vjps = vjp_fun_(inputs, cotangents, outputs);
  for (const auto& cot : cotangents) {
    all_vjps.emplace_back(cot);
  }

  // Select the vjps requested
  std::vector<array> vjps;
  vjps.reserve(argnums.size());
  for (auto arg : argnums) {
    vjps.push_back(all_vjps[arg]);
  }

  return vjps;
}

// JVP for CustomTransforms: forwards to the user-supplied jvp_fun_ after
// dropping the trailing num_outputs_ primals.
std::vector<array> CustomTransforms::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const auto inputs_end = primals.end() - num_outputs_;
  std::vector<array> inputs(primals.begin(), inputs_end);
  return jvp_fun_(inputs, tangents, argnums);
}

// vmap rule for CustomTransforms: forwards to the user-supplied vmap_fun_
// after dropping the trailing num_outputs_ entries of both inputs and axes.
std::pair<std::vector<array>, std::vector<int>> CustomTransforms::vmap(
    const std::vector<array>& inputs_,
    const std::vector<int>& axes_) {
  const auto inputs_end = inputs_.end() - num_outputs_;
  const auto axes_end = axes_.end() - num_outputs_;
  std::vector<array> inputs(inputs_.begin(), inputs_end);
  std::vector<int> axes(axes_.begin(), axes_end);
  return vmap_fun_(inputs, axes);
}

std::vector<array> Depends::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  std::vector<array> vjps;

  for (auto arg : argnums) {
    if (arg < cotangents.size()) {
      vjps.push_back(cotangents[arg]);
    } else {
      vjps.push_back(zeros_like(primals[arg]));
    }
  }
  return vjps;
}

// VJP for Divide (a / b), using conjugated primals:
//   d/da -> cotan / conj(b)
//   d/db -> -(cotan * conj(a)) / conj(b)^2
std::vector<array> Divide::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  array denominator_bar = conjugate(primals[1], stream());
  for (auto arg : argnums) {
    if (arg != 0) {
      auto numerator = multiply(
          cotangents[0], conjugate(primals[0], stream()), stream());
      auto quotient =
          divide(numerator, square(denominator_bar, stream()), stream());
      vjps.push_back(negative(quotient, stream()));
    } else {
      vjps.push_back(divide(cotangents[0], denominator_bar, stream()));
    }
  }
  return vjps;
}

std::vector<array> DivMod::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

// JVP for DivMod: returns a single array of zeros shaped like the first
// primal.
std::vector<array> DivMod::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto zero_tangent = zeros_like(primals[0], stream());
  return {zero_tangent};
}

// vmap rule for DivMod: aligns the two operands with vmap_binary_op, applies
// divmod, and reports the aligned batch axis.
std::pair<std::vector<array>, std::vector<int>> DivMod::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto results = divmod(lhs, rhs, stream());
  return {results, {out_axis}};
}

std::vector<array> Divide::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    if (arg == 0) {
      return divide(tangents[i], primals[1], stream());
    } else {
      return negative(
          divide(
              multiply(tangents[i], primals[0], stream()),
              square(primals[1], stream()),
              stream()),
          stream());
    }
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

// vmap rule for Divide: aligns the two operands with vmap_binary_op, divides
// them, and reports the aligned batch axis.
std::pair<std::vector<array>, std::vector<int>> Divide::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto out = divide(lhs, rhs, stream());
  return {{out}, {out_axis}};
}

std::vector<array> Remainder::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    if (arg == 0) {
      vjps.push_back(cotangents[0]);
    } else {
      auto x_over_y = divide(primals[0], primals[1], stream());
      x_over_y = floor(x_over_y, stream());
      vjps.push_back(
          negative(multiply(x_over_y, cotangents[0], stream()), stream()));
    }
  }
  return vjps;
}

std::vector<array> Remainder::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    if (arg == 0) {
      return tangents[i];
    } else {
      auto x_over_y = divide(primals[0], primals[1], stream());
      x_over_y = floor(x_over_y, stream());
      return negative(multiply(x_over_y, tangents[i], stream()), stream());
    }
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

// vmap rule for Remainder: aligns the two operands with vmap_binary_op,
// applies remainder, and reports the aligned batch axis.
std::pair<std::vector<array>, std::vector<int>> Remainder::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto out = remainder(lhs, rhs, stream());
  return {{out}, {out_axis}};
}

// vmap rule for Equal: aligns the two operands with vmap_binary_op, compares
// them, and reports the aligned batch axis.
std::pair<std::vector<array>, std::vector<int>> Equal::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto out = equal(lhs, rhs, stream());
  return {{out}, {out_axis}};
}

std::vector<array> Equal::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

// JVP for Equal: zeros with the broadcast shape of the two primals and the
// dtype of the first tangent.
std::vector<array> Equal::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const auto& lhs_shape = primals[0].shape();
  const auto& rhs_shape = primals[1].shape();
  auto out_shape = broadcast_shapes(lhs_shape, rhs_shape);
  return {zeros(out_shape, tangents[0].dtype(), stream())};
}

// VJP for Erf: delegates to jvp with the cotangents in place of tangents.
std::vector<array> Erf::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> grads = jvp(primals, cotangents, argnums);
  return grads;
}

// JVP for Erf: tangent * (2 / sqrt(pi)) * exp(-x^2).
std::vector<array> Erf::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto dtype = primals[0].dtype();
  // M_2_SQRTPI is 2 / sqrt(pi).
  auto scale = multiply(array(M_2_SQRTPI, dtype), tangents[0], stream());
  auto neg_sq = negative(square(primals[0], stream()), stream());
  auto gaussian = exp(neg_sq, stream());
  return {multiply(scale, gaussian, stream())};
}

// vmap rule for Erf: applies erf to the batched input; batch axes are
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Erf::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = erf(inputs[0], stream());
  return {{out}, axes};
}

// VJP for ErfInv: cotan * (sqrt(pi) / 2) * exp(y^2), where y is the primitive
// output taken from outputs[0] (so erfinv is not recomputed).
std::vector<array> ErfInv::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto dtype = primals[0].dtype();
  // 1 / M_2_SQRTPI is sqrt(pi) / 2.
  auto scale =
      multiply(array(1.0 / M_2_SQRTPI, dtype), cotangents[0], stream());
  auto growth = exp(square(outputs[0], stream()), stream());
  return {multiply(scale, growth, stream())};
}

// JVP for ErfInv: tangent * (sqrt(pi) / 2) * exp(erfinv(x)^2).
std::vector<array> ErfInv::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto dtype = primals[0].dtype();
  // 1 / M_2_SQRTPI is sqrt(pi) / 2.
  auto scale = multiply(array(1.0 / M_2_SQRTPI, dtype), tangents[0], stream());
  auto y = erfinv(primals[0], stream());
  auto growth = exp(square(y, stream()), stream());
  return {multiply(scale, growth, stream())};
}

// vmap rule for ErfInv: applies erfinv to the batched input; batch axes are
// unchanged.
std::pair<std::vector<array>, std::vector<int>> ErfInv::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = erfinv(inputs[0], stream());
  return {{out}, axes};
}

// VJP for Exp: cotan * exp(x), reusing the primitive output from outputs[0].
std::vector<array> Exp::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto grad = multiply(cotangents[0], outputs[0], stream());
  return {grad};
}

// JVP for Exp: tangent * exp(x).
std::vector<array> Exp::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto derivative = exp(primals[0], stream());
  return {multiply(tangents[0], derivative, stream())};
}

// vmap rule for Exp: applies exp to the batched input; batch axes are
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Exp::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = exp(inputs[0], stream());
  return {{out}, axes};
}

// VJP for Expm1: cotan * (expm1(x) + 1), reusing the primitive output from
// outputs[0].
std::vector<array> Expm1::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  const auto& out = outputs[0];
  auto derivative = add(out, array(1.0f, out.dtype()), stream());
  return {multiply(cotangents[0], derivative, stream())};
}

// JVP for Expm1: tangent * exp(x).
std::vector<array> Expm1::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto derivative = exp(primals[0], stream());
  return {multiply(tangents[0], derivative, stream())};
}

// vmap rule for Expm1: applies expm1 to the batched input; batch axes are
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Expm1::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto out = expm1(inputs[0], stream());
  return {{out}, axes};
}

// VJP for ExpandDims: squeezes the cotangent along the expanded axes_.
std::vector<array> ExpandDims::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto grad = squeeze(cotangents[0], axes_, stream());
  return {grad};
}

// JVP for ExpandDims: expands the tangent along the same axes_.
std::vector<array> ExpandDims::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  auto out = expand_dims(tangents[0], axes_, stream());
  return {out};
}

// vmap rule for ExpandDims. Expansion axes at or after the batch axis are
// shifted up by one; each expansion axis before the batch axis pushes the
// reported batch axis up by one instead.
std::pair<std::vector<array>, std::vector<int>> ExpandDims::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  const int batch_ax = axes[0];
  int out_ax = batch_ax;
  auto expand_axes = axes_;
  for (size_t k = 0; k < expand_axes.size(); ++k) {
    if (expand_axes[k] < batch_ax) {
      ++out_ax;
    } else {
      ++expand_axes[k];
    }
  }
  return {
      {expand_dims(inputs[0], std::move(expand_axes), stream())}, {out_ax}};
}

bool ExpandDims::is_equivalent(const Primitive& other) const {
  const ExpandDims& a_other = static_cast<const ExpandDims&>(other);
  return (axes_ == a_other.axes_);
}

// Shape of `input` after inserting a size-1 dimension at each position in
// `axes`. The insertions are applied one after another in the order given.
Shape ExpandDims::output_shape(
    const array& input,
    const std::vector<int>& axes) {
  Shape expanded = input.shape();
  for (size_t k = 0; k < axes.size(); ++k) {
    expanded.insert(expanded.begin() + axes[k], 1);
  }
  return expanded;
}

// Output shape for ExpandDims: the static output_shape helper applied to the
// first input with this primitive's axes_.
std::vector<Shape> ExpandDims::output_shapes(const std::vector<array>& inputs) {
  const auto& input = inputs[0];
  return {ExpandDims::output_shape(input, axes_)};
}

std::vector<array> Flatten::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto& in = primals[0];
  Shape unflatten_shape(
      in.shape().begin() + start_axis_, in.shape().begin() + end_axis_ + 1);
  return {unflatten(
      cotangents[0], start_axis_, std::move(unflatten_shape), stream())};
}

// JVP for Flatten: flattens the tangent over the same axis range.
std::vector<array> Flatten::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  auto out = flatten(tangents[0], start_axis_, end_axis_, stream());
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> Flatten::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto start_axis = start_axis_;
  auto end_axis = end_axis_;
  auto in = inputs[0];
  if (ax < start_axis) {
    start_axis++;
    end_axis++;
  } else if (ax <= end_axis_) {
    start_axis++;
    end_axis++;
    in = moveaxis(in, ax, 0, stream());
    ax = 0;
  } else {
    ax -= (end_axis - start_axis);
  }
  return {{flatten(in, start_axis, end_axis, stream())}, {ax}};
}

bool Flatten::is_equivalent(const Primitive& other) const {
  const Flatten& a_other = static_cast<const Flatten&>(other);
  return start_axis_ == a_other.start_axis_ && end_axis_ == a_other.end_axis_;
}

// Computes the shape after collapsing dimensions [start_axis, end_axis] of
// `input` into a single dimension holding their product.
Shape Flatten::output_shape(const array& input, int start_axis, int end_axis) {
  auto collapsed = input.shape(start_axis);
  int ax = start_axis;
  while (++ax <= end_axis) {
    collapsed *= input.shape(ax);
  }
  Shape out = input.shape();
  out[start_axis] = collapsed;
  // Drop the trailing dimensions that were merged into `start_axis`.
  out.erase(out.begin() + start_axis + 1, out.begin() + end_axis + 1);
  return out;
}

std::vector<Shape> Flatten::output_shapes(const std::vector<array>& inputs) {
  return {Flatten::output_shape(inputs[0], start_axis_, end_axis_)};
}

bool FFT::is_equivalent(const Primitive& other) const {
  const FFT& r_other = static_cast<const FFT&>(other);
  return axes_ == r_other.axes_ && inverse_ == r_other.inverse_ &&
      real_ == r_other.real_;
}

// Reverse-mode rule for Unflatten: collapse the expanded dimensions of the
// cotangent back into the single axis they came from.
std::vector<array> Unflatten::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  // Last of the shape_.size() axes that the forward pass produced at axis_.
  const int last_axis = axis_ + shape_.size() - 1;
  auto grad = flatten(cotangents[0], axis_, last_axis, stream());
  return {grad};
}

// Forward-mode rule for Unflatten: the op is linear, so the tangent is
// unflattened with the same axis and target shape.
std::vector<array> Unflatten::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  const auto& tangent = tangents[0];
  auto out_tangent = unflatten(tangent, axis_, shape_, stream());
  return {out_tangent};
}

std::pair<std::vector<array>, std::vector<int>> Unflatten::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto axis = axis_;
  if (ax <= axis_) {
    axis++;
  } else {
    ax += (shape_.size() - 1);
  }
  return {{unflatten(inputs[0], axis, shape_, stream())}, {ax}};
}

// Two Unflatten primitives are interchangeable when they expand the same axis
// into the same shape. The caller is expected to pass another Unflatten.
bool Unflatten::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const Unflatten&>(other);
  if (axis_ != rhs.axis_) {
    return false;
  }
  return shape_ == rhs.shape_;
}

// Computes the shape after replacing dimension `axis` of `input` with the
// dimensions listed in `shape`.
Shape Unflatten::output_shape(
    const array& input,
    int axis,
    const Shape& shape) {
  Shape result = input.shape();
  // Insert every new dimension but the first right after `axis` ...
  const auto tail_begin = shape.begin() + 1;
  result.insert(result.begin() + axis + 1, tail_begin, shape.end());
  // ... and overwrite `axis` itself with the first new dimension.
  result[axis] = shape[0];
  return result;
}

std::vector<Shape> Unflatten::output_shapes(const std::vector<array>& inputs) {
  return {Unflatten::output_shape(inputs[0], axis_, shape_)};
}

// Batching rule for FFT: shift the transformed axes past the batch axis and
// rebuild the primitive with the matching output shape.
//
// For real transforms only the last transformed axis changes size (the same
// axis FFT::vjp treats as the resized one), so the size adjustment is applied
// to that axis alone, and it is applied whether or not the input is batched.
// NOTE(review): for the inverse real transform the original (possibly odd)
// output length is not stored on the primitive, so 2 * (n - 1) is assumed.
std::pair<std::vector<array>, std::vector<int>> FFT::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& in = inputs[0];
  int ax = axes[0];
  auto fft_axes = axes_;
  auto out_shape = in.shape();
  if (ax >= 0) {
    for (auto& fft_ax : fft_axes) {
      if (fft_ax >= ax) {
        fft_ax++;
      }
    }
  }
  if (real_ && !fft_axes.empty()) {
    auto last_ax = fft_axes.back();
    auto n = out_shape[last_ax];
    out_shape[last_ax] = inverse_ ? 2 * (n - 1) : n / 2 + 1;
  }
  return {
      {array(
          out_shape,
          real_ && inverse_ ? float32 : complex64,
          std::make_shared<FFT>(stream(), fft_axes, inverse_, real_),
          {in})},
      {ax}};
}

// Reverse-mode rule for FFT. Each of the four variants (real inverse, real
// forward, complex inverse, complex forward) applies the opposite-direction
// transform to the cotangent and rescales by the number of transformed
// elements. The real variants additionally weight the non-DC/non-Nyquist
// frequency bins with a mask.
std::vector<array> FFT::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto& in = primals[0];
  // Copy of the transformed axes as ints for the fft:: helpers.
  std::vector<int> axes(axes_.begin(), axes_.end());

  // TODO: Add it as an option to do an unnormalized or scaled fft so that this
  //       isn't part of the graph.
  // Product of the transformed extents, taken from the cotangent for inverse
  // transforms and from the primal otherwise.
  double n_elements = 1;
  for (auto ax : axes) {
    n_elements *= inverse_ ? cotangents[0].shape(ax) : primals[0].shape(ax);
  }

  if (real_ && inverse_) {
    // Make a mask to account for the double use in the forward pass.
    // Everything except the DC and nyquist frequencies gets doubled.
    int N = in.shape(axes_.back());
    bool odd = cotangents[0].shape(axes_.back()) % 2;
    // Shape of all ones except along the last transformed axis, so the index
    // array broadcasts against the transformed cotangent.
    Shape c(in.ndim(), 1);
    c[axes_.back()] = N;
    array indices = reshape(arange(N, stream()), std::move(c), stream());
    array first(0, indices.dtype());
    // With an odd output length the upper bound moves up by one, so index
    // N - 1 is also selected by the mask.
    array last(N - 1 + odd, indices.dtype());
    // The 1 / n_elements normalization is folded into the mask values.
    array one(1 / n_elements, in.dtype());
    array two(2 / n_elements, in.dtype());
    array mask = where(
        logical_and(
            greater(indices, first, stream()),
            less(indices, last, stream()),
            stream()),
        two,
        one,
        stream());
    return {
        multiply(fft::rfftn(cotangents[0], axes, stream()), mask, stream())};
  } else if (real_) {
    // Sizes of the transformed axes of the real input, passed to irfftn.
    Shape n;
    for (auto ax : axes_) {
      n.push_back(in.shape(ax));
    }
    // Make a mask to account for the double use in the forward pass.
    // Everything except the DC and nyquist frequencies gets halved.
    int N = cotangents[0].shape(axes_.back());
    bool odd = in.shape(axes_.back()) % 2;
    Shape c(in.ndim(), 1);
    c[axes_.back()] = N;
    array indices = reshape(arange(N, stream()), std::move(c), stream());
    array first(0, indices.dtype());
    array last(N - 1 + odd, indices.dtype());
    array one(1, complex64);
    array half(0.5, complex64);
    array mask = where(
        logical_and(
            greater(indices, first, stream()),
            less(indices, last, stream()),
            stream()),
        half,
        one,
        stream());
    // The masked cotangent is inverse-transformed and scaled by n_elements.
    return {multiply(
        fft::irfftn(multiply(cotangents[0], mask, stream()), n, axes, stream()),
        array(n_elements, in.dtype()),
        stream())};
  } else if (inverse_) {
    // Complex inverse: forward transform scaled by 1 / n_elements.
    return {multiply(
        fft::fftn(cotangents[0], axes, stream()),
        array(1 / n_elements, complex64),
        stream())};
  } else {
    // Complex forward: inverse transform scaled by n_elements.
    return {multiply(
        fft::ifftn(cotangents[0], axes, stream()),
        array(n_elements, complex64),
        stream())};
  }
}

std::vector<array> FFT::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto& tan = tangents[0];
  if (real_ & inverse_) {
    return {fft::irfftn(tan, stream())};
  } else if (real_) {
    return {fft::rfftn(tan, stream())};
  } else if (inverse_) {
    return {fft::ifftn(tan, stream())};
  } else {
    return {fft::fftn(tan, stream())};
  }
}

// Reverse-mode rule for Floor: delegates to jvp, which yields zeros.
std::vector<array> Floor::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Forward-mode rule for Floor: the tangent is zero, shaped like the input.
std::vector<array> Floor::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  const auto& input = primals[0];
  auto zero_tangent = zeros_like(input, stream());
  return {zero_tangent};
}

std::pair<std::vector<array>, std::vector<int>> Floor::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{floor(inputs[0], stream())}, axes};
}

// Reverse-mode rule for Full: the cotangent multiplied by the primal input.
std::vector<array> Full::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  const auto& cotan = cotangents[0];
  const auto& value = primals[0];
  auto grad = multiply(cotan, value, stream());
  return {grad};
}

// Forward-mode rule for Full: the tangents are passed through unchanged.
std::vector<array> Full::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  std::vector<array> out_tangents(tangents);
  return out_tangents;
}

// Batching rule for Full: re-create the primitive on the batched input,
// keeping its shape, dtype and batch axis.
std::pair<std::vector<array>, std::vector<int>> Full::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  const auto& value = inputs[0];
  auto primitive = std::make_shared<Full>(stream());
  auto filled =
      array(value.shape(), value.dtype(), std::move(primitive), {value});
  return {{filled}, axes};
}

// Batching rule for Gather. inputs[0] is the source and the remaining inputs
// are index arrays; `axes` holds the batch axis of each input (negative when
// that input is not batched). Three situations are handled: only indices
// batched, only the source batched, or both.
std::pair<std::vector<array>, std::vector<int>> Gather::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& src = inputs[0];
  std::vector<array> indices(inputs.begin() + 1, inputs.end());
  auto gather_axes = axes_;
  auto slice_sizes = slice_sizes_;
  auto src_vmapped = axes[0] >= 0;
  // First index input that carries a batch axis, if any.
  auto ind_vmap_ax_ptr =
      std::find_if(axes.begin() + 1, axes.end(), [](int a) { return a >= 0; });
  int out_ax = -1;
  bool indices_vmapped = (ind_vmap_ax_ptr != axes.end());
  // The output batch axis follows the indices when they are batched and the
  // source otherwise; it stays -1 when nothing is batched.
  if (indices_vmapped) {
    out_ax = *ind_vmap_ax_ptr;
  } else if (src_vmapped) {
    out_ax = axes[0];
  }

  // Reorder all the index arrays so the vmap axis is in the same spot.
  if (indices_vmapped) {
    for (int i = 1; i < axes.size(); ++i) {
      if (out_ax != axes[i] && axes[i] >= 0) {
        indices[i - 1] = moveaxis(indices[i - 1], axes[i], out_ax, stream());
      } else if (axes[i] < 0) {
        // Unbatched index arrays get a new axis at the batch position.
        indices[i - 1] = expand_dims(indices[i - 1], out_ax, stream());
      }
    }
  }

  // Number of index dimensions (after the reordering above).
  int idx_dims = indices.empty() ? 0 : indices[0].ndim();

  if (src_vmapped) {
    // Gathered axes at or after the source batch axis shift right by one.
    for (auto& ax : gather_axes) {
      if (ax >= axes[0]) {
        ax++;
      }
    }
    if (indices_vmapped) {
      // Make a new index array for the vmapped dimension
      auto vmap_inds =
          arange(static_cast<ShapeElem>(0), src.shape(axes[0]), stream());
      // Reshape it so it broadcasts with other index arrays
      {
        auto shape = Shape(idx_dims, 1);
        shape[out_ax] = vmap_inds.size();
        vmap_inds = reshape(vmap_inds, std::move(shape), stream());
      }
      // Update gather axes and slice sizes accordingly
      slice_sizes.insert(slice_sizes.begin() + axes[0], 1);
      gather_axes.push_back(axes[0]);
      indices.push_back(vmap_inds);
    } else {
      // Only the source is batched: take the whole batch axis in every slice;
      // the output batch axis then sits after the index dimensions.
      slice_sizes.insert(slice_sizes.begin() + out_ax, src.shape(out_ax));
      out_ax += idx_dims;
    }
  }
  auto out = gather(src, indices, gather_axes, slice_sizes, stream());
  if (src_vmapped && indices_vmapped) {
    // Drop the size-1 slice dimension introduced for the source batch axis.
    out = squeeze(out, idx_dims + axes[0], stream());
  }
  return {{out}, {out_ax}};
}

std::vector<array> Gather::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (int argnum : argnums) {
    if (argnum > 0) {
      // Grads w.r.t. indices are zero
      vjps.push_back(
          zeros(primals[argnum].shape(), primals[argnum].dtype(), stream()));
    } else {
      auto src = zeros_like(primals[0], stream());
      std::vector<array> inds(primals.begin() + 1, primals.end());
      vjps.push_back(scatter_add(src, inds, cotangents[0], axes_, stream()));
    }
  }
  return vjps;
}

// Forward-mode rule for Gather: gather the source tangent with the same
// indices. Differentiating with respect to the indices is rejected.
std::vector<array> Gather::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const bool source_only = argnums.size() <= 1 && argnums[0] == 0;
  if (!source_only) {
    throw std::invalid_argument(
        "[gather] Cannot calculate JVP with respect to indices.");
  }
  std::vector<array> index_arrays(primals.begin() + 1, primals.end());
  auto out_tangent =
      gather(tangents[0], index_arrays, axes_, slice_sizes_, stream());
  return {out_tangent};
}

bool Gather::is_equivalent(const Primitive& other) const {
  const Gather& g_other = static_cast<const Gather&>(other);
  return axes_ == g_other.axes_ && slice_sizes_ == g_other.slice_sizes_;
}

// Batching rule for GatherAxis: bring the batch axis of the input and the
// indices to the same position (creating it where missing), then shift the
// gather axis if it sits at or after the batch axis.
std::pair<std::vector<array>, std::vector<int>> GatherAxis::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  const int in_ax = axes[0];
  const int idx_ax = axes[1];

  auto src = inputs[0];
  auto indices = inputs[1];
  int out_ax = -1;
  if (in_ax >= 0) {
    // Align the indices with the input's batch axis: move their own batch
    // axis there, or insert a new dimension if they are not batched.
    indices = (idx_ax >= 0) ? moveaxis(indices, idx_ax, in_ax, stream())
                            : expand_dims(indices, in_ax, stream());
    out_ax = in_ax;
  } else if (idx_ax >= 0) {
    // Only the indices are batched: give the input a matching dimension.
    src = expand_dims(src, idx_ax, stream());
    out_ax = idx_ax;
  }

  int axis = axis_;
  if (out_ax >= 0 && axis_ >= out_ax) {
    ++axis;
  }
  return {{take_along_axis(src, indices, axis, stream())}, {out_ax}};
}

std::vector<array> GatherAxis::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (int argnum : argnums) {
    if (argnum > 0) {
      // Grads w.r.t. indices are zero
      vjps.push_back(
          zeros(primals[argnum].shape(), primals[argnum].dtype(), stream()));
    } else {
      auto src = zeros_like(primals[0], stream());
      vjps.push_back(array(
          src.shape(),
          src.dtype(),
          std::make_shared<ScatterAxis>(stream(), ScatterAxis::Sum, axis_),
          {src, primals[1], cotangents[0]}));
    }
  }
  return vjps;
}

// Forward-mode rule for GatherAxis: take the input tangent along the same
// axis with the same indices. Differentiating w.r.t. indices is rejected.
std::vector<array> GatherAxis::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const bool input_only = argnums.size() <= 1 && argnums[0] == 0;
  if (!input_only) {
    throw std::invalid_argument(
        "[gather_axis] Cannot calculate JVP with respect to indices.");
  }
  const auto& indices = primals[1];
  auto out_tangent = take_along_axis(tangents[0], indices, axis_, stream());
  return {out_tangent};
}

std::vector<Shape> GatherAxis::output_shapes(const std::vector<array>& inputs) {
  return {inputs[1].shape()};
}

// Two GatherAxis primitives are interchangeable when they gather along the
// same axis. The caller is expected to pass another GatherAxis.
bool GatherAxis::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const GatherAxis&>(other);
  return rhs.axis_ == axis_;
}

std::vector<Shape> Gather::output_shapes(const std::vector<array>& inputs) {
  Shape out_shape;
  if (inputs.size() > 1) {
    out_shape = inputs[1].shape();
  }
  out_shape.insert(out_shape.end(), slice_sizes_.begin(), slice_sizes_.end());
  return {std::move(out_shape)};
}

// Batching rule for Greater: align both operands on a common batch axis and
// compare them elementwise.
std::pair<std::vector<array>, std::vector<int>> Greater::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = greater(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

std::vector<array> Greater::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

// Forward-mode rule for Greater: a zero tangent with the broadcast shape of
// the operands and the dtype of the first tangent.
std::vector<array> Greater::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const auto& lhs_shape = primals[0].shape();
  const auto& rhs_shape = primals[1].shape();
  auto out_shape = broadcast_shapes(lhs_shape, rhs_shape);
  auto zero_tangent = zeros(out_shape, tangents[0].dtype(), stream());
  return {zero_tangent};
}

// Batching rule for GreaterEqual: align both operands on a common batch axis
// and compare them elementwise.
std::pair<std::vector<array>, std::vector<int>> GreaterEqual::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = greater_equal(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

std::vector<array> GreaterEqual::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

// Forward-mode rule for GreaterEqual: a zero tangent with the broadcast shape
// of the operands and the dtype of the first tangent.
std::vector<array> GreaterEqual::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const auto& lhs_shape = primals[0].shape();
  const auto& rhs_shape = primals[1].shape();
  auto out_shape = broadcast_shapes(lhs_shape, rhs_shape);
  auto zero_tangent = zeros(out_shape, tangents[0].dtype(), stream());
  return {zero_tangent};
}

std::vector<array> Imag::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {multiply(
      array(complex64_t{0.0f, 1.0f}, primals[0].dtype()),
      cotangents[0],
      stream())};
}

// Forward-mode rule for Imag: the op is linear, so take the imaginary part of
// the tangent.
std::vector<array> Imag::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto out_tangent = imag(tangents[0], stream());
  return {out_tangent};
}

// Batching rule for Imag: elementwise, so the batch axis is unchanged.
std::pair<std::vector<array>, std::vector<int>> Imag::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto imag_part = imag(inputs[0], stream());
  return {{imag_part}, axes};
}

// Batching rule for Less: align both operands on a common batch axis and
// compare them elementwise.
std::pair<std::vector<array>, std::vector<int>> Less::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = less(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

std::vector<array> Less::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

// Forward-mode rule for Less: a zero tangent with the broadcast shape of the
// operands and the dtype of the first tangent.
std::vector<array> Less::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const auto& lhs_shape = primals[0].shape();
  const auto& rhs_shape = primals[1].shape();
  auto out_shape = broadcast_shapes(lhs_shape, rhs_shape);
  auto zero_tangent = zeros(out_shape, tangents[0].dtype(), stream());
  return {zero_tangent};
}

// Batching rule for LessEqual: align both operands on a common batch axis and
// compare them elementwise.
std::pair<std::vector<array>, std::vector<int>> LessEqual::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = less_equal(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

std::vector<array> LessEqual::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

// Forward-mode rule for LessEqual: a zero tangent with the broadcast shape of
// the operands and the dtype of the first tangent.
std::vector<array> LessEqual::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const auto& lhs_shape = primals[0].shape();
  const auto& rhs_shape = primals[1].shape();
  auto out_shape = broadcast_shapes(lhs_shape, rhs_shape);
  auto zero_tangent = zeros(out_shape, tangents[0].dtype(), stream());
  return {zero_tangent};
}

// Reverse-mode rule for Log: elementwise, so it reuses the jvp formula with
// the cotangent in place of the tangent.
std::vector<array> Log::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Forward-mode rule for Log: tangent / x, additionally scaled by 1 / ln(base)
// for the base-2 and base-10 variants.
std::vector<array> Log::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto grad = divide(tangents[0], primals[0], stream());
  if (base_ == Base::e) {
    return {grad};
  }
  const float base_value = (base_ == Base::ten) ? 10.0f : 2.0f;
  const auto inv_ln_base = 1 / std::log(base_value);
  return {multiply(array(inv_ln_base, grad.dtype()), grad, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Log::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto& in = inputs[0];
  return {
      {array(
          in.shape(),
          in.dtype(),
          std::make_shared<Log>(stream(), base_),
          {in})},
      axes};
}

// Reverse-mode rule for Log1p: elementwise, so it reuses the jvp formula with
// the cotangent in place of the tangent.
std::vector<array> Log1p::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Forward-mode rule for Log1p: tangent / (1 + x).
std::vector<array> Log1p::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  const auto& x = primals[0];
  auto one_plus_x = add(array(1.0f, x.dtype()), x, stream());
  auto out_tangent = divide(tangents[0], one_plus_x, stream());
  return {out_tangent};
}

// Batching rule for Log1p: elementwise, so the batch axis is unchanged.
std::pair<std::vector<array>, std::vector<int>> Log1p::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto result = log1p(inputs[0], stream());
  return {{result}, axes};
}

// Reverse-mode rule for LogicalNot: delegates to jvp, which yields zeros.
std::vector<array> LogicalNot::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Forward-mode rule for LogicalNot: a zero tangent shaped like the incoming
// tangent.
std::vector<array> LogicalNot::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  const auto& tangent = tangents[0];
  auto zero_tangent = zeros_like(tangent, stream());
  return {zero_tangent};
}

// Batching rule for LogicalNot: elementwise, so the batch axis is unchanged.
std::pair<std::vector<array>, std::vector<int>> LogicalNot::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto negated = logical_not(inputs[0], stream());
  return {{negated}, axes};
}

std::vector<array> LogicalAnd::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 2);
  std::vector<array> vjps = {zeros_like(cotangents[0], stream())};
  if (argnums.size() > 1) {
    vjps.push_back(vjps.back());
  }
  return vjps;
}

// Forward-mode rule for LogicalAnd: a zero tangent shaped like the first
// primal.
std::vector<array> LogicalAnd::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 2);
  assert(argnums.size() <= 2);
  const auto& lhs = primals[0];
  auto zero_tangent = zeros_like(lhs, stream());
  return {zero_tangent};
}

// Batching rule for LogicalAnd: align both operands on a common batch axis and
// combine them elementwise.
std::pair<std::vector<array>, std::vector<int>> LogicalAnd::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 2);
  assert(axes.size() == 2);

  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = logical_and(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

std::vector<array> LogicalOr::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 2);
  std::vector<array> vjps = {zeros_like(cotangents[0], stream())};
  if (argnums.size() > 1) {
    vjps.push_back(vjps.back());
  }
  return vjps;
}

// Forward-mode rule for LogicalOr: a zero tangent shaped like the first
// primal.
std::vector<array> LogicalOr::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 2);
  assert(argnums.size() <= 2);

  const auto& lhs = primals[0];
  auto zero_tangent = zeros_like(lhs, stream());
  return {zero_tangent};
}

// Batching rule for LogicalOr: align both operands on a common batch axis and
// combine them elementwise.
std::pair<std::vector<array>, std::vector<int>> LogicalOr::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 2);
  assert(axes.size() == 2);

  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = logical_or(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

std::vector<array> LogAddExp::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto a = primals[0];
  auto b = primals[1];
  auto s = sigmoid(subtract(a, b, stream()), stream());
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(multiply(
        cotangents[0],
        arg == 0 ? s : subtract(array(1.0f, s.dtype()), s, stream()),
        stream()));
  }
  return vjps;
}

// Forward-mode rule for LogAddExp: with s = sigmoid(a - b), each tangent is
// weighted by s (first argument) or 1 - s (second argument) and summed.
std::vector<array> LogAddExp::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto lhs = primals[0];
  auto rhs = primals[1];
  auto sig = sigmoid(subtract(lhs, rhs, stream()), stream());
  // Weight applied to the tangent of the given argument.
  auto weight_for = [&](int arg) {
    return arg == 0 ? sig
                    : subtract(array(1.0f, sig.dtype()), sig, stream());
  };
  auto total = multiply(tangents[0], weight_for(argnums[0]), stream());
  if (argnums.size() > 1) {
    auto second = multiply(tangents[1], weight_for(argnums[1]), stream());
    total = add(total, second, stream());
  }
  return {total};
}

// Batching rule for LogAddExp: align both operands on a common batch axis and
// apply the op elementwise.
std::pair<std::vector<array>, std::vector<int>> LogAddExp::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = logaddexp(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

// Batching rule for LogSumExp. The reduction runs over the last axis, so a
// batch axis sitting there is first swapped with the second-to-last axis.
std::pair<std::vector<array>, std::vector<int>> LogSumExp::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto batch_axis = axes[0];
  auto batched = inputs[0];
  const bool batch_is_last = (batch_axis == (batched.ndim() - 1));
  if (batch_is_last) {
    batched = swapaxes(batched, -1, -2, stream());
    batch_axis = batched.ndim() - 2;
  }
  auto reduced = logsumexp(batched, -1, true, stream());
  return {{reduced}, {batch_axis}};
}

// Reverse-mode rule for LogSumExp: the cotangent multiplied by the softmax of
// the input over the last axis.
std::vector<array> LogSumExp::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(cotangents.size() == 1);
  auto weights = softmax(primals[0], std::vector<int>{-1}, true, stream());
  auto grad = multiply(cotangents[0], weights, stream());
  return {grad};
}

std::vector<array> LogSumExp::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  return {multiply(
      tangents[0],
      softmax(primals[0], std::vector<int>{-1}, true, stream()),
      stream())};
}

std::vector<Shape> LogSumExp::output_shapes(const std::vector<array>& inputs) {
  auto s = inputs[0].shape();
  s.back() = 1;
  return {s};
}

std::vector<array> Matmul::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  auto& cotan = cotangents[0];
  std::vector<int> reorder(cotan.ndim());
  std::iota(reorder.begin(), reorder.end(), 0);
  std::iter_swap(reorder.end() - 1, reorder.end() - 2);
  auto& s = stream();

  auto complex_transpose = [&](const array& x) {
    return transpose(conjugate(x, s), reorder, s);
  };

  for (auto arg : argnums) {
    if (arg == 0) {
      // M X N * (K X N).T -> M X K
      vjps.push_back(matmul(cotan, complex_transpose(primals[1]), s));
    } else {
      // (M X K).T * M X N -> K X N
      vjps.push_back(matmul(complex_transpose(primals[0]), cotan, s));
    }
  }
  return vjps;
}

// Forward-mode rule for Matmul (product rule): the first tangent produces a
// matmul with the other primal, and a second tangent is accumulated via addmm.
std::vector<array> Matmul::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  std::vector<array> jvp;
  for (size_t i = 0; i < argnums.size(); ++i) {
    const bool wrt_lhs = argnums[i] == 0;
    if (i == 0) {
      jvp.push_back(
          wrt_lhs ? matmul(tangents[0], primals[1], stream())
                  : matmul(primals[0], tangents[0], stream()));
    } else if (i == 1) {
      jvp[0] = wrt_lhs
          ? addmm(jvp[0], tangents[1], primals[1], 1.0f, 1.0f, stream())
          : addmm(jvp[0], primals[0], tangents[1], 1.0f, 1.0f, stream());
    }
  }
  return jvp;
}

std::pair<std::vector<array>, std::vector<int>> Matmul::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto maybe_move_ax = [this](auto& arr, auto ax) {
    return ax > 0 ? moveaxis(arr, ax, 0, stream()) : arr;
  };
  auto a = maybe_move_ax(inputs[0], axes[0]);
  auto b = maybe_move_ax(inputs[1], axes[1]);
  return {{matmul(a, b, stream())}, {0}};
}

std::vector<Shape> Matmul::output_shapes(const std::vector<array>& inputs) {
  auto out_shape = inputs[0].shape();
  out_shape.back() = inputs[1].shape(-1);
  return {std::move(out_shape)};
}

// Reverse-mode rule for Maximum: the cotangent flows to the first argument
// where a > b and to the second argument where a <= b.
std::vector<array> Maximum::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto& a = primals[0];
  auto& b = primals[1];
  std::vector<array> vjps;
  for (auto arg : argnums) {
    auto mask =
        (arg == 0) ? greater(a, b, stream()) : less_equal(a, b, stream());
    vjps.push_back(multiply(cotangents[0], mask, stream()));
  }
  // Return the vector directly (as Minimum::vjp does); wrapping it in braces
  // forces an extra copy.
  return vjps;
}

std::vector<array> Maximum::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto& a = primals[0];
  auto& b = primals[1];
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    auto mask =
        (arg == 0) ? greater(a, b, stream()) : less_equal(a, b, stream());
    return multiply(tangents[i], mask, stream());
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

// Batching rule for Maximum: align both operands on a common batch axis and
// apply the op elementwise.
std::pair<std::vector<array>, std::vector<int>> Maximum::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = maximum(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

std::vector<array> Minimum::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto& a = primals[0];
  auto& b = primals[1];
  std::vector<array> vjps;
  for (auto arg : argnums) {
    auto mask =
        (arg == 0) ? less(a, b, stream()) : greater_equal(a, b, stream());
    vjps.push_back(multiply(cotangents[0], mask, stream()));
  }
  return vjps;
}

std::vector<array> Minimum::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto& a = primals[0];
  auto& b = primals[1];
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    auto mask =
        (arg == 0) ? less(a, b, stream()) : greater_equal(a, b, stream());
    return multiply(tangents[i], mask, stream());
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

// Batching rule for Minimum: align both operands on a common batch axis and
// apply the op elementwise.
std::pair<std::vector<array>, std::vector<int>> Minimum::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto result = minimum(lhs, rhs, stream());
  return {{result}, {out_axis}};
}

// Forward-mode rule for Multiply (product rule): each tangent is multiplied by
// the other operand, and the contributions are summed.
std::vector<array> Multiply::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  // Contribution of the idx-th differentiated argument.
  auto term = [&](int idx) {
    const auto& other = primals[1 - argnums[idx]];
    return multiply(tangents[idx], other, stream());
  };
  auto total = term(0);
  if (argnums.size() > 1) {
    total = add(total, term(1), stream());
  }
  return {total};
}

std::vector<array> Multiply::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(multiply(
        conjugate(primals[1 - arg], stream()), cotangents[0], stream()));
  }
  return vjps;
}

// Batching rule for Multiply: align both operands on a common batch axis and
// multiply them elementwise.
std::pair<std::vector<array>, std::vector<int>> Multiply::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto product = multiply(lhs, rhs, stream());
  return {{product}, {out_axis}};
}

// Forward-mode rule for Select (where): the condition contributes a zero
// tangent; the two branches contribute their tangent masked by the condition
// and by its negation respectively. Contributions are summed.
//
// `tangents` is parallel to `argnums` (as in the other jvp rules in this
// file), so the helper takes a position into both and never indexes
// `tangents` by argument number.
std::vector<array> Select::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 3);
  assert(tangents.size() == argnums.size());

  // Contribution of the i-th differentiated argument.
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];

    if (arg == 0) {
      return zeros_like(primals[0], stream());
    } else if (arg == 1) {
      return multiply(
          astype(primals[0], tangents[i].dtype(), stream()),
          tangents[i],
          stream());
    } else {
      return multiply(
          astype(
              logical_not(primals[0], stream()), tangents[i].dtype(), stream()),
          tangents[i],
          stream());
    }
  };

  array jvp = jvp_fun(0);
  for (int i = 1; i < static_cast<int>(argnums.size()); i++) {
    jvp = add(jvp, jvp_fun(i), stream());
  }
  return {jvp};
}

std::vector<array> Select::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 3);
  assert(cotangents.size() == 1);

  std::vector<array> vjps;
  for (auto arg : argnums) {
    if (arg == 0) {
      vjps.push_back(zeros_like(primals[0], stream()));
    } else if (arg == 1) {
      vjps.push_back(multiply(
          astype(primals[0], cotangents[0].dtype(), stream()),
          cotangents[0],
          stream()));
    } else if (arg == 2) {
      vjps.push_back(multiply(
          astype(
              logical_not(primals[0], stream()),
              cotangents[0].dtype(),
              stream()),
          cotangents[0],
          stream()));
    }
  }
  return vjps;
}

// Batching rule for Select: align the three operands on a common batch axis
// and apply `where` elementwise.
std::pair<std::vector<array>, std::vector<int>> Select::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [condition, on_true, on_false, out_axis] =
      vmap_ternary_op(inputs, axes, stream());
  auto selected = where(condition, on_true, on_false, stream());
  return {{selected}, {out_axis}};
}

// Reverse-mode rule for Negative: linear, so it reuses the jvp formula with
// the cotangent in place of the tangent.
std::vector<array> Negative::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Forward-mode rule for Negative: the negated tangent.
std::vector<array> Negative::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto out_tangent = negative(tangents[0], stream());
  return {out_tangent};
}

// Batching rule for Negative: elementwise, so the batch axis is unchanged.
std::pair<std::vector<array>, std::vector<int>> Negative::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto negated = negative(inputs[0], stream());
  return {{negated}, axes};
}

// Batching rule for NotEqual: align both operands on a common batch axis and
// compare them elementwise.
//
// The single output's batch axis is the aligned axis `to_ax`, as for every
// other binary op in this file; the two-element input `axes` is not returned.
std::pair<std::vector<array>, std::vector<int>> NotEqual::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [a, b, to_ax] = vmap_binary_op(inputs, axes, stream());
  return {{not_equal(a, b, stream())}, {to_ax}};
}

std::vector<array> NotEqual::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    vjps.push_back(zeros_like(primals[arg], stream()));
  }
  return vjps;
}

// Forward-mode rule for NotEqual: a zero tangent with the broadcast shape of
// the operands and the dtype of the first tangent.
std::vector<array> NotEqual::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const auto& lhs_shape = primals[0].shape();
  const auto& rhs_shape = primals[1].shape();
  auto out_shape = broadcast_shapes(lhs_shape, rhs_shape);
  auto zero_tangent = zeros(out_shape, tangents[0].dtype(), stream());
  return {zero_tangent};
}

// Reverse-mode rule for Pad: the gradient w.r.t. the padded input is the
// cotangent with the padding sliced off.
//
// low_pad_size_ and high_pad_size_ are handed to pad() alongside axes_ in
// Pad::jvp, i.e. entry i describes axis axes_[i]. They are therefore indexed
// by position in axes_, not by axis value. Negative axes are normalized, and
// widths accumulate if an axis is listed more than once.
// NOTE(review): confirm against Pad's eval implementation that the pad sizes
// are position-indexed and that axes_ may hold negative values.
std::vector<array> Pad::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(argnums.size() == 1 && argnums[0] == 0);

  auto& cotan = cotangents[0];
  Shape start(cotan.ndim(), 0);
  auto stop = cotan.shape();

  const int ndim = static_cast<int>(cotan.ndim());
  for (size_t i = 0; i < axes_.size(); ++i) {
    const int ax = axes_[i] < 0 ? axes_[i] + ndim : axes_[i];
    start[ax] += low_pad_size_[i];
    stop[ax] -= high_pad_size_[i];
  }

  auto out = slice(cotan, start, stop, stream());

  return {out};
}

std::vector<array> Pad::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(argnums.size() == 1 && argnums[0] == 0);

  return {
      pad(tangents[0],
          axes_,
          low_pad_size_,
          high_pad_size_,
          array(0, tangents[0].dtype()),
          "constant",
          stream())};
}

// Batching rule for Pad: not yet implemented; always throws
// std::runtime_error.
std::pair<std::vector<array>, std::vector<int>> Pad::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  throw std::runtime_error("Pad vmap is NYI.");
}

bool Pad::is_equivalent(const Primitive& other) const {
  const Pad& p_other = static_cast<const Pad&>(other);
  return (
      p_other.axes_ == axes_ && p_other.low_pad_size_ == low_pad_size_ &&
      p_other.high_pad_size_ == high_pad_size_);
}

// VJP of Partition: recompute the partition indices of the input and write
// the cotangent into a zero array at those indices along the partition axis.
std::vector<array> Partition::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  const auto& x = primals[0];
  auto perm = argpartition(x, kth_, axis_, stream());
  auto grad_base = zeros_like(x, stream());
  return {put_along_axis(grad_base, perm, cotangents[0], axis_, stream())};
}

// JVP of Partition: gather the tangent with the same indices that partition
// the primal along the partition axis.
std::vector<array> Partition::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  auto perm = argpartition(primals[0], kth_, axis_, stream());
  return {take_along_axis(tangents[0], perm, axis_, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Partition::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  int axis_left = axes[0] >= 0 && axes[0] <= axis_;
  return {{partition(inputs[0], axis_ + axis_left, stream())}, axes};
}

bool Partition::is_equivalent(const Primitive& other) const {
  const Partition& r_other = static_cast<const Partition&>(other);
  return axis_ == r_other.axis_ && kth_ == r_other.kth_;
}

// VJP of a^b (primals[0] = a, primals[1] = b, outputs[0] = a^b).
//
// For argnum 0 the partial is b * a^(b - 1); for any other argnum it is
// log(a) * a^b, forced to 0 wherever a^b is 0. Each partial is multiplied by
// cotangents[0].
std::vector<array> Power::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    if (arg == 0) {
      auto one = array(1, primals[0].dtype());
      auto reduced_exponent = subtract(primals[1], one, stream());
      auto base_pow = power(primals[0], reduced_exponent, stream());
      vjps.push_back(multiply(base_pow, primals[1], stream()));
    } else {
      const auto& result = outputs[0];
      auto log_term =
          multiply(log(primals[0], stream()), outputs[0], stream());
      // 0 * log 0 -> 0
      auto fallback = array(0.0f, result.dtype());
      vjps.push_back(where(result, log_term, fallback, stream()));
    }
    vjps.back() = multiply(cotangents[0], vjps.back(), stream());
  }
  return vjps;
}

std::vector<array> Power::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto output = power(primals[0], primals[1], stream());
  auto grads = vjp(primals, tangents, argnums, {output});
  if (argnums.size() > 1) {
    return {add(grads[0], grads[1], stream())};
  } else {
    return grads;
  }
}

// Batched power: align both operands with vmap_binary_op, apply power, and
// report the axis returned by the alignment helper for the single output.
std::pair<std::vector<array>, std::vector<int>> Power::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [base, exponent, batch_ax] = vmap_binary_op(inputs, axes, stream());
  auto result = power(base, exponent, stream());
  return {{result}, {batch_ax}};
}

// Maps a QuantizationMode to its lowercase string name. Nvfp4 and any value
// not listed explicitly both map to "nvfp4".
std::string quantization_mode_to_string(QuantizationMode mode) {
  if (mode == QuantizationMode::Affine) {
    return "affine";
  }
  if (mode == QuantizationMode::Mxfp4) {
    return "mxfp4";
  }
  if (mode == QuantizationMode::Mxfp8) {
    return "mxfp8";
  }
  return "nvfp4";
}

// Parses a quantization mode name ("affine", "mxfp4", "mxfp8" or "nvfp4").
//
// Throws std::invalid_argument for any other string. When `tag` is not empty
// the error message is prefixed with "[tag]".
QuantizationMode string_to_quantization_mode(
    const std::string& mode,
    std::string_view tag /* = "" */) {
  if (mode == "affine") {
    return QuantizationMode::Affine;
  }
  if (mode == "mxfp4") {
    return QuantizationMode::Mxfp4;
  }
  if (mode == "mxfp8") {
    return QuantizationMode::Mxfp8;
  }
  if (mode == "nvfp4") {
    return QuantizationMode::Nvfp4;
  }

  std::string msg = tag.empty() ? std::string() : "[" + std::string(tag) + "]";
  msg.append(" Invalid quantization mode '").append(mode).append("'.");
  throw std::invalid_argument(msg);
}

// Batching is not implemented for QuantizedMatmul; this always throws
// std::runtime_error.
std::pair<std::vector<array>, std::vector<int>> QuantizedMatmul::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  throw std::runtime_error("[QuantizedMatmul::vmap] NYI");
}

// Vector-Jacobian product of the quantized matmul.
//
// The primals are read as: [0] x, [1] quantized weights, [2] scales and, in
// Affine mode, [3] biases. Behavior per requested argnum:
//   0     -> gradient wrt x: a quantized matmul of the cotangent with the
//            transpose flag inverted.
//   1     -> throws std::runtime_error (no gradient wrt quantized weights).
//   other -> Affine mode only (otherwise throws std::invalid_argument):
//            argnum 3 yields the bias gradient, anything else the scale
//            gradient.
std::vector<array> QuantizedMatmul::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;

  // We rely on the fact that w is always 2D so transpose is simple
  // `dsb` caches the per-group dense weight gradient so it is computed at
  // most once when both scales and biases are requested.
  std::optional<array> dsb = std::nullopt;
  for (auto arg : argnums) {
    // gradient wrt to x
    if (arg == 0) {
      vjps.push_back(quantized_matmul(
          cotangents[0],
          primals[1],
          primals[2],
          // Biases are only passed in Affine mode.
          mode_ == QuantizationMode::Affine ? std::optional<array>(primals[3])
                                            : std::nullopt,
          !transpose_,
          group_size_,
          bits_,
          quantization_mode_to_string(mode_),
          stream()));
    }

    // gradient wrt to w_q, scales or biases
    else if (arg == 1) {
      throw std::runtime_error(
          "[QuantizedMatmul::vjp] no gradient wrt the quantized weights.");
    } else {
      if (mode_ != QuantizationMode::Affine) {
        std::ostringstream msg;
        msg << "[QuantizedMatmul::vjp] no gradient wrt scales in "
            << quantization_mode_to_string(mode_) << " quantization.";
        throw std::invalid_argument(msg.str());
      }
      if (!dsb) {
        // Collapse the leading dimensions of the cotangent and x, form the
        // dense weight gradient with a single matmul (operand order depends
        // on transpose_), then split its last axis into groups of
        // group_size_ elements.
        int ndim = primals[1].ndim();
        auto fc = flatten(cotangents[0], 0, -ndim, stream());
        auto fx = flatten(primals[0], 0, -ndim, stream());
        auto dw = transpose_
            ? matmul(swapaxes(fc, -1, -2, stream()), fx, stream())
            : matmul(swapaxes(fx, -1, -2, stream()), fc, stream());
        dsb = unflatten(dw, -1, {-1, group_size_}, stream());
      }
      if (arg == 3) {
        // biases
        // Sum of the dense gradient over each group.
        vjps.push_back(sum(*dsb, -1, false, stream()));
      } else {
        // scales
        // Dequantize with unit scales and zero biases, then reduce the
        // product with the dense gradient over each group.
        auto wq = dequantize(
            primals[1],
            ones_like(primals[2], stream()),
            zeros_like(primals[3], stream()),
            group_size_,
            bits_,
            quantization_mode_to_string(mode_),
            {}, // placeholder for amax
            std::nullopt,
            stream());
        wq = unflatten(wq, -1, {-1, group_size_}, stream());
        vjps.push_back(sum(multiply(*dsb, wq, stream()), -1, false, stream()));
      }
    }
  }
  return vjps;
}

// JVP of the quantized matmul. Only the tangent wrt x (argnum 0) is
// supported; any other request throws std::runtime_error. The tangent is
// pushed through the same quantized matmul as the primal.
std::vector<array> QuantizedMatmul::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  const bool wrt_x_only = argnums.size() <= 1 && argnums[0] == 0;
  if (!wrt_x_only) {
    throw std::runtime_error(
        "[QuantizedMatmul::jvp] No JVP wrt the quantized matrix yet.");
  }

  // Biases are only passed in Affine mode.
  auto biases = (mode_ == QuantizationMode::Affine)
      ? std::optional<array>(primals[3])
      : std::nullopt;
  auto mode_name = quantization_mode_to_string(mode_);
  return {quantized_matmul(
      tangents[0],
      primals[1],
      primals[2],
      biases,
      transpose_,
      group_size_,
      bits_,
      mode_name,
      stream())};
}

bool QuantizedMatmul::is_equivalent(const Primitive& other) const {
  const QuantizedMatmul& qm_other = static_cast<const QuantizedMatmul&>(other);
  return group_size_ == qm_other.group_size_ && bits_ == qm_other.bits_ &&
      mode_ == qm_other.mode_ && transpose_ == qm_other.transpose_;
}

// Output shape: the shape of inputs[0] with its last dimension replaced by
// the outer dimension of the weights. With transpose_ set this is w.shape(-2);
// otherwise it is w.shape(-1) * 32 / bits_.
std::vector<Shape> QuantizedMatmul::output_shapes(
    const std::vector<array>& inputs) {
  const auto& weights = inputs[1];
  int outer_dim;
  if (transpose_) {
    outer_dim = weights.shape(-2);
  } else {
    outer_dim = weights.shape(-1) * 32 / bits_;
  }
  auto result = inputs[0].shape();
  result.back() = outer_dim;
  return {std::move(result)};
}

bool QQMatmul::is_equivalent(const Primitive& other) const {
  const QQMatmul& qm_other = static_cast<const QQMatmul&>(other);
  return group_size_ == qm_other.group_size_ && bits_ == qm_other.bits_ &&
      mode_ == qm_other.mode_;
}

// Output shape: the shape of inputs[0] with its last dimension replaced by
// the second-to-last dimension of inputs[1].
std::vector<Shape> QQMatmul::output_shapes(const std::vector<array>& inputs) {
  int outer_dim = inputs[1].shape(-2);
  auto result = inputs[0].shape();
  result.back() = outer_dim;
  return {std::move(result)};
}

// Vector-Jacobian product of QQMatmul.
//
// Gradients wrt x (argnum 0) and wrt w (argnum 1) are each computed with
// another qqmm call on the cotangent; any other argnum receives zeros shaped
// like the corresponding primal. When four primals are given, the last two
// are forwarded as global scales and an amax of the cotangent is computed and
// forwarded as well.
std::vector<array> QQMatmul::vjp(
    const std::vector<array>& primals, // non quantized x, non quantized w, if
                                       // nvfp4 global_scale_x, global_scale_w
    const std::vector<array>& cotangents, // non quantized upstream grads
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  bool is_nvfp4 = mode_ == QuantizationMode::Nvfp4;
  // Either (x, w), or (x, w, global_scale_x, global_scale_w) in nvfp4 mode.
  assert(primals.size() == 2 || (is_nvfp4 && primals.size() == 4));

  std::vector<array> vjps;
  auto& cotan = cotangents[0];
  auto& s = stream();
  // primal[1] -- non quantized w (N, K)
  // primal[0] -- non quantized activations (M, K)
  // cotan -- non quantized grads (M, N)
  auto qmode = quantization_mode_to_string(mode_);
  // max(|cotan|) as float32, only when global scales are present.
  std::optional<array> cotan_amax = (primals.size() == 4)
      ? std::make_optional(astype(max(abs(cotan, s), s), float32, s))
      : std::nullopt;

  // Returns primals[idx] when global scales are present, nullopt otherwise.
  auto get_primal_scale = [&](int idx) {
    return (primals.size() == 4) ? std::make_optional(primals[idx])
                                 : std::nullopt;
  };

  for (auto arg : argnums) {
    if (arg == 0) { // gradient wrt to x
      // We transpose weights -> quantize along N
      vjps.push_back(qqmm(
          cotan, //  M X N
          swapaxes(primals[1], -1, -2, s), // assuming that w is 2D
          {},
          group_size_,
          bits_,
          qmode,
          cotan_amax,
          get_primal_scale(3), // global_scale_w (for w.T)
          s));
    } else if (arg == 1) { // gradient wrt to weights
      vjps.push_back(qqmm(
          swapaxes(cotan, -1, -2, s), // (N, M)
          swapaxes(primals[0], -1, -2, s), // (K, M)
          {},
          group_size_,
          bits_,
          qmode,
          cotan_amax,
          get_primal_scale(2), // global_scale_x (for x.T)
          s));
    } else {
      // Remaining primals (the global scales) get a zero gradient.
      vjps.push_back(zeros_like(primals[arg], s));
    }
  }
  return vjps;
}

// JVP is not implemented for QQMatmul; this always throws
// std::runtime_error.
std::vector<array> QQMatmul::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  throw std::runtime_error("QQMM::jvp NYI");
}

// Batching is not implemented for GatherQMM; this always throws
// std::runtime_error.
std::pair<std::vector<array>, std::vector<int>> GatherQMM::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  throw std::runtime_error("GatherQMM::vmap NYI");
}

// Vector-Jacobian product of the gathered quantized matmul.
//
// The primals are read as: [0] x, [1] quantized weights, [2] scales, in
// Affine mode [3] biases, and the last two entries are the lhs and rhs
// indices. Behavior per requested argnum:
//   0     -> gradient wrt x via gather_qmm on the cotangent with the
//            transpose flag inverted, scattered back to x's layout unless the
//            indices are sorted and x is not broadcast.
//   > 3   -> throws std::runtime_error (indices have no gradient).
//   1     -> throws std::runtime_error (no gradient wrt quantized weights).
//   other -> Affine mode only (otherwise throws std::invalid_argument):
//            argnum 3 yields the bias gradient, anything else the scale
//            gradient.
std::vector<array> GatherQMM::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;

  auto& cotan = cotangents[0];

  auto& x = primals[0];
  auto& w = primals[1];
  auto& scales = primals[2];
  // The indices are always the last two primals, whatever the mode.
  auto& lhs_indices = primals[primals.size() - 2];
  auto& rhs_indices = primals[primals.size() - 1];
  auto biases = (mode_ == QuantizationMode::Affine)
      ? std::optional<array>(primals[3])
      : std::nullopt;

  int M = cotan.shape(-2);
  int K = x.shape(-1);

  bool sorted = left_sorted_ || right_sorted_;
  // True when x holds exactly one (M, K) matrix per rhs index.
  bool no_broadcast = rhs_indices.size() * M * K == x.size();
  // Cached per-group dense weight gradient, shared by scales and biases.
  std::optional<array> dsb = std::nullopt;

  for (auto arg : argnums) {
    // gradient wrt to x
    if (arg == 0) {
      auto g = gather_qmm(
          cotan,
          w,
          scales,
          biases,
          std::nullopt,
          rhs_indices,
          !transpose_,
          group_size_,
          bits_,
          quantization_mode_to_string(mode_),
          sorted,
          stream());
      if (sorted && no_broadcast) {
        vjps.push_back(g);
      } else {
        // Accumulate the per-index gradients into x's matrices (flattened to
        // a single leading dimension) and restore x's shape.
        vjps.push_back(reshape(
            scatter_add(
                flatten(zeros_like(x, stream()), 0, -3, stream()),
                lhs_indices,
                expand_dims(g, -3, stream()),
                0,
                stream()),
            x.shape(),
            stream()));
      }
    }

    // gradient wrt to the indices is undefined
    else if (arg > 3) {
      throw std::runtime_error(
          "[GatherQMM::vjp] cannot compute the gradient wrt the indices.");
    }

    // gradient wrt to w_q, scales or biases
    else if (arg == 1) {
      throw std::runtime_error(
          "[GatherQMM::vjp] no gradient wrt the quantized weights.");
    } else {
      if (mode_ != QuantizationMode::Affine) {
        std::ostringstream msg;
        msg << "[GatherQMM::vjp] no gradient wrt scales in "
            << quantization_mode_to_string(mode_) << " quantization.";
        throw std::invalid_argument(msg.str());
      }

      if (!dsb) {
        // Batch shape of w: its shape without the last two dimensions.
        auto shape = w.shape();
        shape.pop_back();
        shape.pop_back();
        // Dense weight gradient with its last axis split into groups of
        // group_size_ elements.
        dsb = unflatten(
            gather_mm_grad(
                x,
                cotan,
                lhs_indices,
                rhs_indices,
                sorted,
                std::move(shape),
                stream()),
            -1,
            {-1, group_size_},
            stream());
      }
      if (arg == 3) {
        // biases: sum of the dense gradient over each group
        vjps.push_back(sum(*dsb, -1, false, stream()));
      } else {
        // scales: dequantize with unit scales and zero biases, then reduce
        // the product with the dense gradient over each group
        vjps.push_back(
            sum(multiply(
                    *dsb,
                    unflatten(
                        dequantize(
                            w,
                            ones_like(scales, stream()),
                            zeros_like(*biases, stream()),
                            group_size_,
                            bits_,
                            quantization_mode_to_string(mode_),
                            std::nullopt,
                            std::nullopt, // amax placeholder
                            stream()),
                        -1,
                        {-1, group_size_},
                        stream()),
                    stream()),
                -1,
                false,
                stream()));
      }
    }
  }
  return vjps;
}

// JVP is not implemented for GatherQMM; this always throws
// std::runtime_error.
std::vector<array> GatherQMM::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  throw std::runtime_error("GatherQMM::jvp NYI");
}

bool GatherQMM::is_equivalent(const Primitive& other) const {
  const GatherQMM& qm_other = static_cast<const GatherQMM&>(other);
  return group_size_ == qm_other.group_size_ && bits_ == qm_other.bits_ &&
      mode_ == qm_other.mode_ && transpose_ == qm_other.transpose_;
}

std::pair<std::vector<array>, std::vector<int>> RandomBits::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  // The last dimension of the key is always a key pair
  auto key = inputs[0];
  auto kax = axes[0];
  if (kax == key.ndim() - 1) {
    std::vector<int> reorder(key.ndim());
    std::iota(reorder.begin(), reorder.end(), 0);
    std::swap(reorder[kax], reorder[kax - 1]);
    key = transpose(key, reorder, stream());
    kax--;
  }

  auto shape = shape_;
  if (kax >= 0) {
    shape.insert(shape.begin() + kax, key.shape()[kax]);
  }

  auto get_dtype = [width = width_]() {
    switch (width) {
      case 1:
        return uint8;
      case 2:
        return uint16;
      default:
        return uint32;
    }
  };

  auto out = array(
      shape,
      get_dtype(),
      std::make_shared<RandomBits>(stream(), shape, width_),
      {key});
  return {{out}, {kax}};
}

bool RandomBits::is_equivalent(const Primitive& other) const {
  const RandomBits& r_other = static_cast<const RandomBits&>(other);
  return shape_ == r_other.shape_ && width_ == r_other.width_;
}

// VJP of Real: the cotangent cast to the dtype of the primal input.
std::vector<array> Real::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  const auto input_dtype = primals[0].dtype();
  auto grad = astype(cotangents[0], input_dtype, stream());
  return {grad};
}

// JVP of Real: the real part of the tangent.
std::vector<array> Real::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto tangent_out = real(tangents[0], stream());
  return {tangent_out};
}

// Batched Real: applied elementwise, so the batch axis is passed through
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Real::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto result = real(inputs[0], stream());
  return {{result}, axes};
}

// Batched Reshape.
//
// With a batch axis (axes[0] >= 0) the input is transposed so the batch
// dimension comes first and is then reshaped to [batch, shape_...]; the
// output is batched along axis 0. Without a batch axis the input is reshaped
// to shape_ and the axis is passed through.
//
// The target shape is built in a local copy so that the primitive's own
// shape_ is left untouched: jvp, is_equivalent and output_shapes all read
// shape_ and must keep seeing the unbatched shape after vmap has run.
std::pair<std::vector<array>, std::vector<int>> Reshape::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& in = inputs[0];
  auto ax = axes[0];
  if (ax < 0) {
    return {{reshape(in, shape_, stream())}, {ax}};
  }

  // Transpose the input so that the vmap dim is first.
  std::vector<int> reorder(in.ndim());
  std::iota(reorder.begin(), reorder.end(), 0);
  reorder.erase(reorder.begin() + ax);
  reorder.insert(reorder.begin(), ax);
  auto out = transpose(in, reorder, stream());

  // Prepend the vmap dim to a copy of the target shape.
  auto batched_shape = shape_;
  batched_shape.insert(batched_shape.begin(), in.shape()[ax]);

  // Reshape the transposed input to the new shape.
  return {{reshape(out, batched_shape, stream())}, {0}};
}

// VJP of Reshape: the cotangent reshaped back to the shape of the primal.
std::vector<array> Reshape::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  assert(argnums[0] == 0);
  const auto& input_shape = primals[0].shape();
  auto grad = reshape(cotangents[0], input_shape, stream());
  return {grad};
}

// JVP of Reshape: the tangent reshaped to the primitive's target shape.
std::vector<array> Reshape::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  assert(argnums[0] == 0);
  auto tangent_out = reshape(tangents[0], shape_, stream());
  return {tangent_out};
}

bool Reshape::is_equivalent(const Primitive& other) const {
  const Reshape& r_other = static_cast<const Reshape&>(other);
  return shape_ == r_other.shape_;
}

// Resolves a requested reshape target against `input`.
//
// At most one entry of `shape` may be -1; it is replaced by the value that
// makes the total size match input.size(). Throws std::invalid_argument when
// more than one -1 is given, when a -1 has to be inferred while the product
// of the other dimensions is 0, or when the resolved shape does not have
// input.size() elements.
Shape Reshape::output_shape(const array& input, Shape shape) {
  size_t known_size = 1;
  int wildcard = -1;
  for (int i = 0; i < shape.size(); ++i) {
    if (shape[i] != -1) {
      known_size *= shape[i];
      continue;
    }
    if (wildcard >= 0) {
      throw std::invalid_argument(
          "[reshape] Reshape can only infer one dimension.");
    }
    wildcard = i;
  }

  // Infer the shape
  if (wildcard >= 0) {
    if (known_size == 0) {
      throw std::invalid_argument(
          "[reshape] Cannot infer the shape of an empty array");
    }
    shape[wildcard] = input.size() / known_size;
    known_size *= shape[wildcard];
  }

  // Check that the reshaping is valid
  if (input.size() == known_size) {
    return shape;
  }
  std::ostringstream msg;
  msg << "[reshape] Cannot reshape array of size " << input.size()
      << " into shape " << shape << ".";
  throw std::invalid_argument(msg.str());
}

// Single output whose shape is shape_ resolved against inputs[0].
std::vector<Shape> Reshape::output_shapes(const std::vector<array>& inputs) {
  auto resolved = output_shape(inputs[0], shape_);
  return {resolved};
}

// Vector-Jacobian product of a reduction over axes_.
//
//   Sum      -> the cotangent broadcast against the input.
//   Prod     -> cotangent times the product of all *other* elements along the
//               reduced axes, computed from two exclusive cumulative
//               products.
//   Min/Max  -> the cotangent divided evenly among the elements equal to the
//               reduced value, zero elsewhere.
//   other    -> zeros shaped like the input.
std::vector<array> Reduce::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto in = primals[0];

  auto& cotan = cotangents[0];
  if (reduce_type_ == Reduce::Sum) {
    return {broadcast_arrays({cotan, in}, stream())[0]};
  } else if (reduce_type_ == Reduce::Prod) {
    auto s = stream();
    // Gradient of a product over one axis: forward exclusive cumprod times
    // reverse exclusive cumprod gives the product of every element except
    // the current one.
    auto prod_grad_single_axis =
        [&s](const array& x, const array& cotan, int axis) {
          auto p1 = cumprod(x, axis, /*reverse=*/false, /*inclusive=*/false, s);
          auto p2 = cumprod(x, axis, /*reverse=*/true, /*inclusive=*/false, s);
          auto exclusive_prod = multiply(p1, p2, s);
          return multiply(exclusive_prod, cotan, s);
        };

    // To compute a numerically stable gradient for prod we need an exclusive
    // product of all elements in axes_ . To achieve that we move axes_ to the
    // last dim and perform two exclusive cumprods. Afterwards we move
    // everything back to the original axes.
    if (axes_.size() > 1) {
      std::vector<int> transpose_to;
      std::vector<int> transpose_back;
      Shape shape_flat;
      {
        // Find the transpose needed to move axes_ to the back and the shape
        // except the reduced over axes.
        // The `axes_[j] == i` walk visits axes_ in increasing order of i.
        int j = 0;
        for (int i = 0; i < in.ndim(); i++) {
          if (j < axes_.size() && axes_[j] == i) {
            j++;
          } else {
            transpose_to.push_back(i);
            shape_flat.push_back(in.shape(i));
          }
        }
        for (auto ax : axes_) {
          transpose_to.push_back(ax);
        }
        // All reduced axes collapse into one trailing, inferred dimension.
        shape_flat.push_back(-1);
        // Inverse permutation of transpose_to.
        transpose_back.resize(transpose_to.size());
        for (int i = 0; i < transpose_to.size(); i++) {
          transpose_back[transpose_to[i]] = i;
        }
      }

      // Move axes to the back
      auto x = transpose(in, transpose_to, s);
      // Keep the shape in order to reshape back to the original
      auto shape_to = x.shape();

      // Flatten and compute the gradient
      x = reshape(x, shape_flat, stream());
      auto grad = prod_grad_single_axis(x, reshape(cotan, shape_flat, s), -1);

      // Reshape and transpose to the original shape
      grad = reshape(grad, shape_to, s);
      grad = transpose(grad, transpose_back, s);

      return {grad};
    } else {
      return {prod_grad_single_axis(in, cotan, axes_[0])};
    }

  } else if (reduce_type_ == Reduce::Min || reduce_type_ == Reduce::Max) {
    auto out = outputs[0];
    // Restore the reduced axes if the output does not have them.
    if (out.ndim() != in.ndim()) {
      out = expand_dims(out, axes_, stream());
    }
    // mask selects the elements that attain the reduced value; normalizer
    // counts them so ties share the cotangent equally.
    auto mask = equal(in, out, stream());
    auto normalizer = sum(mask, axes_, true, stream());
    return {multiply(divide(cotan, normalizer, stream()), mask, stream())};
  }

  else {
    return {zeros_like(in, stream())};
  }
}

// Jacobian-vector product of a reduction over axes_.
//
//   Sum      -> sum of the tangent over axes_ (reduced dims kept).
//   Min/Max  -> the tangent element at the arg-min/arg-max position.
//   other    -> sum over the axis of tangent times the product of all other
//               elements (the product rule).
// NOTE(review): every type other than Sum/Min/Max takes the product branch;
// presumably only Prod is expected to reach it.
std::vector<array> Reduce::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto in = primals[0];
  auto s = stream();

  // Single-axis JVP for the non-Sum reduction types; keeps the reduced axis
  // as a size-1 dimension.
  auto grad_op = [&s, reduce_type = reduce_type_](
                     const array& x, const array& tan, int axis) {
    if (reduce_type == Reduce::Min) {
      auto idx = argmin(x, axis, true, s);
      return take_along_axis(tan, idx, axis, s);
    } else if (reduce_type == Reduce::Max) {
      auto idx = argmax(x, axis, true, s);
      return take_along_axis(tan, idx, axis, s);
    } else {
      // Forward times reverse exclusive cumprod: product of all elements
      // except the current one.
      auto p1 = cumprod(x, axis, /*reverse=*/false, /*inclusive=*/false, s);
      auto p2 = cumprod(x, axis, /*reverse=*/true, /*inclusive=*/false, s);
      auto out = multiply(multiply(p1, p2, s), tan, s);
      return sum(out, axis, true, s);
    }
  };

  auto tan = tangents[0];
  if (reduce_type_ == Reduce::Sum) {
    return {sum(tan, axes_, true, s)};
  } else {
    if (axes_.size() > 1) {
      std::vector<int> transpose_to;
      {
        // Find the transpose needed to move axes_ to the back.
        int j = 0;
        for (int i = 0; i < in.ndim(); i++) {
          if (j < axes_.size() && axes_[j] == i) {
            j++;
          } else {
            transpose_to.push_back(i);
          }
        }
        for (auto ax : axes_) {
          transpose_to.push_back(ax);
        }
      }

      // Collapse the (now trailing) reduced axes into a single last axis for
      // both the primal and the tangent.
      int start_ax = in.ndim() - axes_.size();
      in = flatten(transpose(in, transpose_to, s), start_ax, -1, s);
      tan = flatten(transpose(tan, transpose_to, s), start_ax, -1, s);

      // Drop the kept size-1 axis, then re-insert size-1 dims at axes_.
      auto grad = squeeze(grad_op(in, tan, -1), -1, s);
      return {expand_dims(grad, axes_, s)};
    } else {
      return {grad_op(in, tan, axes_[0])};
    }
  }
}

std::pair<std::vector<array>, std::vector<int>> Reduce::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto ax = axes[0];
  auto reduce_axes = axes_;
  if (ax >= 0) {
    for (auto& rax : reduce_axes) {
      if (rax >= ax) {
        rax++;
      }
    }
  }
  auto& in = inputs[0];
  std::vector<array> out;
  switch (reduce_type_) {
    case Reduce::And:
      out.push_back(all(in, reduce_axes, true, stream()));
      break;
    case Reduce::Or:
      out.push_back(any(in, reduce_axes, true, stream()));
      break;
    case Reduce::Sum:
      out.push_back(sum(in, reduce_axes, true, stream()));
      break;
    case Reduce::Prod:
      out.push_back(prod(in, reduce_axes, true, stream()));
      break;
    case Reduce::Min:
      out.push_back(min(in, reduce_axes, true, stream()));
      break;
    case Reduce::Max:
      out.push_back(max(in, reduce_axes, true, stream()));
      break;
  }
  return {out, axes};
}

bool Reduce::is_equivalent(const Primitive& other) const {
  const Reduce& r_other = static_cast<const Reduce&>(other);
  return reduce_type_ == r_other.reduce_type_ && axes_ == r_other.axes_;
}

// Output shape: the input shape with every reduced axis set to size 1.
std::vector<Shape> Reduce::output_shapes(const std::vector<array>& inputs) {
  auto reduced = inputs[0].shape();
  for (size_t k = 0; k < axes_.size(); ++k) {
    reduced[axes_[k]] = 1;
  }
  return {std::move(reduced)};
}

// VJP of Round: delegates to jvp with the cotangents in place of the
// tangents.
std::vector<array> Round::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// JVP of Round: always zeros shaped like the primal; the tangents are not
// used.
std::vector<array> Round::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto zero_tangent = zeros_like(primals[0], stream());
  return {zero_tangent};
}

std::pair<std::vector<array>, std::vector<int>> Round::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{round(inputs[0], stream())}, axes};
}

// Batched Scan: builds a new Scan over the batched input with the scan axis
// shifted right by one when the batch axis sits at or before it. The output
// keeps the input shape; a boolean input scanned with Sum produces int32,
// every other combination keeps the input dtype.
std::pair<std::vector<array>, std::vector<int>> Scan::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& x = inputs[0];

  auto result_dtype = x.dtype();
  if (x.dtype() == bool_ && reduce_type_ == Scan::Sum) {
    result_dtype = int32;
  }

  // 1 when the batch axis was inserted at or to the left of axis_.
  int shift = axes[0] >= 0 && axes[0] <= axis_;

  auto scanned = array(
      x.shape(),
      result_dtype,
      std::make_shared<Scan>(
          stream(), reduce_type_, axis_ + shift, reverse_, inclusive_),
      {x});
  return {{scanned}, axes};
}

// Vector-Jacobian product of a cumulative scan along axis_.
//
//   Sum       -> cumsum of the cotangent in the opposite direction.
//   LogAddExp -> computed in log space, handling the positive and negative
//                parts of the cotangent separately.
//   Prod      -> cumulative-sum based gradient with special handling of the
//                first zero along the axis (see comments below).
//   other     -> throws std::runtime_error (cumulative min/max).
std::vector<array> Scan::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() == 1);
  assert(argnums[0] == 0);

  if (reduce_type_ == Scan::Sum) {
    return {cumsum(cotangents[0], axis_, !reverse_, inclusive_, stream())};
  } else if (reduce_type_ == Scan::LogAddExp) {
    // Ref:
    // https://github.com/tensorflow/tensorflow/blob/2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863

    auto x = primals[0];
    auto grad = cotangents[0];
    auto results = outputs[0];

    auto zero = zeros({1}, grad.dtype(), stream());
    // Stand-in for log(0): the lowest finite value of the gradient dtype.
    auto grad_min = array(finfo(grad.dtype()).min, grad.dtype());

    // Split the incoming gradient into positive and negative part
    // in order to take logs. This is required for stable results.
    auto log_abs_grad = log(abs(grad, stream()), stream());
    auto log_grad_positive =
        where(greater(grad, zero, stream()), log_abs_grad, grad_min, stream());
    auto log_grad_negative =
        where(less(grad, zero, stream()), log_abs_grad, grad_min, stream());

    // exp(reverse-direction logcumsumexp(log|grad| - results) + x) for each
    // sign separately.
    auto output_pos = exp(
        add(logcumsumexp(
                subtract(log_grad_positive, results, stream()),
                axis_,
                !reverse_,
                inclusive_,
                stream()),
            x,
            stream()));
    auto output_neg = exp(
        add(logcumsumexp(
                subtract(log_grad_negative, results, stream()),
                axis_,
                !reverse_,
                inclusive_,
                stream()),
            x,
            stream()));

    return {subtract(output_pos, output_neg, stream())};
  } else if (reduce_type_ == Scan::Prod) {
    auto in = primals[0];
    // Find the location of the first 0 and set it to 1:
    // - A: Exclusive cumprod
    // - B: Inclusive cumprod
    // - Find the location that is 0 in A and not zero B
    // Compute the gradient by:
    // - Compute the regular gradient for everything before the first zero
    // - Set the first zero to 1 and redo the computation, use this for the
    //   gradient of the first zero
    // - Everything after the first zero has a gradient of 0

    // Get inclusive and exclusive cum prods
    // outputs[0] was computed with inclusive_; the other variant is computed
    // here with !inclusive_, and the two are swapped when outputs[0] is the
    // exclusive one.
    auto cprod_exclusive = cumprod(in, axis_, reverse_, !inclusive_, stream());
    auto cprod_inclusive = outputs[0];
    if (!inclusive_) {
      std::swap(cprod_exclusive, cprod_inclusive);
    }

    // Make the mask for the first zero
    auto z = array(0, in.dtype());
    auto eq_zero = equal(cprod_inclusive, z, stream());
    auto first_zero =
        logical_and(eq_zero, not_equal(cprod_exclusive, z, stream()), stream());

    // Reverse-direction cumsum of (arr * cotangent) along axis_.
    auto to_partial_grad = [this, &cotangents](const array& arr) {
      return cumsum(
          multiply(arr, cotangents[0], stream()),
          axis_,
          !reverse_,
          inclusive_,
          stream());
    };

    // Cumprod of the input with the first zero replaced by 1.
    auto cprod_with_one = cumprod(
        where(first_zero, array(1, in.dtype()), in, stream()),
        axis_,
        reverse_,
        inclusive_,
        stream());
    auto grad_with_one = to_partial_grad(cprod_with_one);
    auto grad = divide(to_partial_grad(outputs[0]), in, stream());
    // first zero -> grad_with_one; other positions where the inclusive
    // cumprod is zero -> 0; everything else -> the regular gradient.
    return {where(
        first_zero,
        grad_with_one,
        where(eq_zero, z, grad, stream()),
        stream())};
  } else {
    // Can probably be implemented by equals and then cummax to make the mask
    throw std::runtime_error("VJP is not implemented for cumulative min/max");
  }
}

// JVP of a cumulative scan. Only Sum is supported: the tangent is scanned
// with the same axis, direction and inclusivity as the primal. Every other
// scan type throws std::runtime_error.
std::vector<array> Scan::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(tangents.size() == 1);
  assert(argnums[0] == 0);

  if (reduce_type_ != Scan::Sum) {
    throw std::runtime_error(
        "JVP is not implemented for cumulative prod/min/max");
  }
  auto tangent_out = cumsum(tangents[0], axis_, reverse_, inclusive_, stream());
  return {tangent_out};
}

bool Scan::is_equivalent(const Primitive& other) const {
  const Scan& s_other = static_cast<const Scan&>(other);
  return (
      reduce_type_ == s_other.reduce_type_ && axis_ == s_other.axis_ &&
      reverse_ == s_other.reverse_ && inclusive_ == s_other.inclusive_);
}

bool Scatter::is_equivalent(const Primitive& other) const {
  const Scatter& s_other = static_cast<const Scatter&>(other);
  return reduce_type_ == s_other.reduce_type_ && axes_ == s_other.axes_;
}

std::vector<array> Scatter::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  switch (reduce_type_) {
    case Scatter::None:
    case Scatter::Sum:
    case Scatter::Max:
    case Scatter::Min:
      break;
    default:
      throw std::runtime_error(
          "[scatter] VJP not implemented for scatter_prod");
  }

  const array& result = outputs[0];
  const array& values = primals[0];
  const array& updates = primals.back();
  const std::vector<array> indices(primals.begin() + 1, primals.end() - 1);

  std::vector<array> vjps;
  for (auto num : argnums) {
    // Gradient wrt to the input array
    if (num == 0) {
      switch (reduce_type_) {
        case Scatter::None:
          // Scatter 0s to the locations that were updated with the updates
          vjps.push_back(scatter(
              cotangents[0],
              indices,
              zeros_like(updates, stream()),
              axes_,
              stream()));
          break;
        case Scatter::Sum:
          // The input array values are kept so they all get gradients
          vjps.push_back(cotangents[0]);
          break;
        case Scatter::Max:
        case Scatter::Min: {
          vjps.push_back(where(
              equal(result, values, stream()),
              cotangents[0],
              array(0, cotangents[0].dtype()),
              stream()));
          break;
        }
        default:
          // Should never reach here
          throw std::invalid_argument("");
      }
    } else if (num == primals.size() - 1) {
      switch (reduce_type_) {
        case Scatter::None:
        case Scatter::Sum: {
          // Gather the values from the cotangent
          auto slice_sizes = cotangents[0].shape();
          for (auto ax : axes_) {
            slice_sizes[ax] = 1;
          }
          vjps.push_back(
              gather(cotangents[0], indices, axes_, slice_sizes, stream()));
          break;
        }
        case Scatter::Max:
        case Scatter::Min: {
          auto slice_sizes = cotangents[0].shape();
          for (auto ax : axes_) {
            slice_sizes[ax] = 1;
          }
          auto gathered_cotan =
              gather(cotangents[0], indices, axes_, slice_sizes, stream());
          auto gathered_result =
              gather(result, indices, axes_, slice_sizes, stream());
          vjps.push_back(
              multiply(gathered_cotan, gathered_result == updates, stream()));
          break;
        }
        default: {
          // Should never reach here
          throw std::invalid_argument("");
        }
      }
    } else {
      throw std::invalid_argument(
          "[scatter] Cannot calculate VJP with respect to indices.");
    }
  }
  return vjps;
}

// Forward-mode differentiation is not available for scatter; this always
// throws std::runtime_error regardless of the arguments.
std::vector<array> Scatter::jvp(
    const std::vector<array>& /* primals */,
    const std::vector<array>& /* tangents */,
    const std::vector<int>& /* argnums */) {
  throw std::runtime_error("[scatter] JVP not yet implemented");
}

// Vectorizes a scatter over one extra (vmapped) axis.
//
// inputs_ is laid out as {source, indices..., updates}. When at least one
// input is vmapped, the source is given a batch axis (if it lacks one), every
// index array gets the batch axis in front, and an extra index array that
// enumerates the batch is appended so each batch element scatters into its
// own slice of the source. When no input is vmapped the inputs are forwarded
// unchanged and the output axis is -1.
std::pair<std::vector<array>, std::vector<int>> Scatter::vmap(
    const std::vector<array>& inputs_,
    const std::vector<int>& vmap_axes) {
  assert(inputs_.size() >= 2);
  assert(inputs_.size() == vmap_axes.size());

  auto inputs = inputs_;

  auto scatter_axes = axes_;
  int src_ax = vmap_axes[0];

  auto vmap_ax_it = std::find_if(
      vmap_axes.begin(), vmap_axes.end(), [](int a) { return a >= 0; });
  // Only dereference the iterator when a vmapped input was actually found;
  // with no vmapped input the scatter is rebuilt as is.
  if (vmap_ax_it != vmap_axes.end()) {
    auto vmap_ax = *vmap_ax_it;
    auto vmap_size = inputs[vmap_ax_it - vmap_axes.begin()].shape(vmap_ax);
    if (src_ax < 0) {
      src_ax = 0;
      inputs[0] =
          repeat(expand_dims(inputs[0], 0, stream()), vmap_size, 0, stream());
    }
    const int updates_pos = static_cast<int>(vmap_axes.size()) - 1;
    for (int i = 1; i < updates_pos; ++i) {
      // vmap axis for indices goes to 0
      if (vmap_axes[i] >= 0) {
        inputs[i] = moveaxis(inputs[i], vmap_axes[i], 0, stream());
      }
      // insert a vmap axis and repeat
      if (vmap_axes[i] < 0) {
        inputs[i] =
            repeat(expand_dims(inputs[i], 0, stream()), vmap_size, 0, stream());
      }
      // Adjust non-vmapped index axes to account for the extra vmap dimension.
      if (scatter_axes[i - 1] >= src_ax) {
        scatter_axes[i - 1]++;
      }
    }

    // Extra index array selecting the batch element along the source's vmap
    // axis, broadcast to the shape of the first index array.
    auto vmap_inds = arange(vmap_size, inputs[1].dtype(), stream());
    auto vmap_inds_shape = Shape(inputs[1].ndim(), 1);
    vmap_inds_shape[0] = vmap_inds.size();
    vmap_inds = reshape(vmap_inds, std::move(vmap_inds_shape), stream());
    inputs.insert(
        inputs.end() - 1, broadcast_to(vmap_inds, inputs[1].shape(), stream()));
    scatter_axes.push_back(src_ax);

    // Clone updates along the vmap dimension so they can be applied to each
    // source tensor in the vmap.
    auto& updates = inputs.back();
    if (vmap_axes.back() < 0) {
      updates = expand_dims(
          updates, {0, static_cast<int>(inputs[1].ndim())}, stream());
      updates = repeat(updates, vmap_size, 0, stream());
    } else {
      updates =
          expand_dims(updates, static_cast<int>(inputs[1].ndim()), stream());
      updates = moveaxis(updates, vmap_axes.back(), 0, stream());
    }
  }

  auto& shape = inputs[0].shape();
  auto dtype = inputs[0].dtype();
  auto out = array(
      shape,
      dtype,
      std::make_shared<Scatter>(stream(), reduce_type_, scatter_axes),
      std::move(inputs));

  return {{out}, {src_ax}};
}

std::vector<array> ScatterAxis::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  const auto& indices = primals[1];
  const auto& updates = primals[2];

  std::vector<array> vjps;
  for (auto num : argnums) {
    // Gradient wrt to the input array
    if (num == 0) {
      if (reduce_type_ == ScatterAxis::None) {
        // Scatter 0s to the locations that were updated with the updates
        vjps.push_back(put_along_axis(
            cotangents[0],
            indices,
            zeros_like(updates, stream()),
            axis_,
            stream()));
      } else {
        // The input array values are kept so they all get gradients
        vjps.push_back(cotangents[0]);
      }
    } else if (num == 2) {
      vjps.push_back(take_along_axis(cotangents[0], indices, axis_, stream()));
    } else {
      throw std::invalid_argument(
          "[scatter_axis] Cannot calculate VJP with respect to indices.");
    }
  }
  return vjps;
}

// Forward-mode derivative of scatter-along-axis.
//
// The same scatter is applied to the tangents of the source and the updates;
// a tangent that was not supplied is replaced by zeros shaped like its
// primal. Requesting the derivative with respect to the indices (argnum 1)
// throws std::invalid_argument.
std::vector<array> ScatterAxis::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  for (int arg : argnums) {
    if (arg == 1) {
      throw std::invalid_argument(
          "[scatter_axis] Cannot calculate JVP with respect to indices.");
    }
  }

  // With two argnums, tangents[0] and tangents[1] are used for the source
  // and the updates respectively.
  const bool has_both = argnums.size() == 2;

  auto src_tangent = (has_both || argnums[0] == 0)
      ? tangents[0]
      : zeros_like(primals[0], stream());
  auto upd_tangent = has_both
      ? tangents[1]
      : (argnums[0] == 2 ? tangents[0] : zeros_like(primals[2], stream()));

  return {array(
      primals[0].shape(),
      primals[0].dtype(),
      std::make_shared<ScatterAxis>(stream(), reduce_type_, axis_),
      {src_tangent, primals[1], upd_tangent})};
}

std::pair<std::vector<array>, std::vector<int>> ScatterAxis::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  // Find the first vmap axis
  int out_ax = -1;
  for (auto ax : axes) {
    if (ax >= 0) {
      out_ax = ax;
      break;
    }
  }

  if (out_ax < 0) {
    return {
        {array(
            inputs[0].shape(),
            inputs[0].dtype(),
            std::make_shared<ScatterAxis>(stream(), reduce_type_, axis_),
            inputs)},
        {-1}};
  }

  auto v_in = inputs;
  for (int i = 0; i < axes.size(); ++i) {
    if (axes[i] >= 0) {
      // if out_ax >= 0 move axis o/w set out_ax
      if (out_ax != axes[i]) {
        v_in[i] = moveaxis(v_in[i], axes[i], out_ax, stream());
      }
    } else {
      v_in[i] = expand_dims(v_in[i], out_ax, stream());
    }
  }
  int axis = axis_ >= out_ax ? axis_ + 1 : axis_;
  auto fn = reduce_type_ == Sum ? scatter_add_axis : put_along_axis;
  return {{fn(v_in[0], v_in[1], v_in[2], axis, stream())}, {out_ax}};
}

// The single output has the same shape as the first input.
std::vector<Shape> ScatterAxis::output_shapes(
    const std::vector<array>& inputs) {
  const auto& first_input = inputs[0];
  return {first_input.shape()};
}

// Equivalent when both the reduce type and the scatter axis match. `other`
// is cast to ScatterAxis without a runtime check.
bool ScatterAxis::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const ScatterAxis&>(other);
  if (reduce_type_ != rhs.reduce_type_) {
    return false;
  }
  return axis_ == rhs.axis_;
}

std::vector<array> MaskedScatter::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto& s = stream();
  const array& dst = primals[0];
  const array& mask = primals[1];
  const array& src = primals[2];
  const array mask_b = broadcast_to(mask, dst.shape(), s);
  const array& cotan = cotangents[0];

  std::vector<array> vjps;
  vjps.reserve(argnums.size());

  for (int arg : argnums) {
    if (arg == 0) {
      vjps.push_back(where(mask_b, zeros_like(cotan, s), cotan, s));
    } else if (arg == 2) {
      const array mask_flat = flatten(mask_b, s);
      const array cotan_flat = flatten(cotan, s);

      const array idx_src =
          cumsum(astype(mask_flat, int32, s), 0, false, false, s);
      const array cotan_src =
          where(mask_flat, cotan_flat, array(0, cotan_flat.dtype()), s);

      array gsrc_flat =
          zeros({static_cast<int>(src.size())}, cotan_src.dtype(), s);
      if (src.size() > 0) {
        const array cotan_updates =
            reshape(cotan_src, {static_cast<int>(idx_src.size()), 1}, s);
        gsrc_flat = scatter_add(gsrc_flat, idx_src, cotan_updates, 0, s);
      }

      vjps.push_back(reshape(gsrc_flat, src.shape(), s));
    } else {
      throw std::invalid_argument(
          "[masked_scatter] Cannot calculate VJP with respect to mask.");
    }
  }
  return vjps;
}

std::vector<array> MaskedScatter::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto& s = stream();
  const array& dst = primals[0];
  const array& mask = primals[1];
  array mask_b = mask;
  if (mask_b.ndim() < dst.ndim()) {
    std::vector<int> axes(dst.ndim() - mask_b.ndim(), 0);
    std::iota(axes.begin(), axes.end(), mask_b.ndim());
    mask_b = expand_dims(mask_b, axes, s);
  }

  array out = zeros_like(dst, s);
  for (int arg : argnums) {
    if (arg == 0) {
      out = where(mask_b, out, tangents[0], s);
    } else if (arg == 2) {
      out = array(
          out.shape(),
          out.dtype(),
          std::make_shared<MaskedScatter>(s),
          {out, mask, tangents[1]});
    } else {
      throw std::invalid_argument("[masked_scatter] invalid arg index in JVP");
    }
  }
  return {out};
}

std::pair<std::vector<array>, std::vector<int>> MaskedScatter::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& s = stream();

  // The inputs all had batching in the 0-th dim. So vectorization amounts to
  //  - Move the vectorized axis first
  //  - Expand and broadcast the unvectorized inputs
  //  - Flatten the first two dims (the new and old batch axes)
  //  - Masked scatter
  //  - Unflatten the vectorized axis again

  // Find the batch dim if any
  int batch_dim = -1;
  for (int i = 0; i < axes.size(); i++) {
    if (axes[i] >= 0) {
      batch_dim = inputs[i].shape(axes[i]);
    }
  }

  // Early exit if it's not vmapped
  if (batch_dim < 0) {
    return {
        {array(
            inputs[0].shape(),
            inputs[0].dtype(),
            std::make_shared<MaskedScatter>(to_stream(s)),
            inputs)},
        {-1}};
  }

  // Move vmapped axis to 0-th dim and broadcast the non-vectorized ones
  auto v_in = inputs;
  for (int i = 0; i < axes.size(); i++) {
    if (axes[i] > 0) {
      v_in[i] = moveaxis(v_in[i], axes[i], 0, s);
    } else if (axes[i] < 0) {
      v_in[i] = expand_dims(v_in[i], 0, s);
      auto in_shape = v_in[i].shape();
      in_shape[0] = batch_dim;
      v_in[i] = broadcast_to(v_in[i], in_shape, s);
    }
  }

  // Flatten the first 2 dims
  for (int i = 0; i < 3; i++) {
    v_in[i] = flatten(v_in[i], 0, 1, s);
  }

  // Masked scatter
  const auto result_shape = v_in[0].shape();
  const auto result_dtype = v_in[0].dtype();
  array result(
      result_shape,
      result_dtype,
      std::make_shared<MaskedScatter>(to_stream(s)),
      std::move(v_in));

  // Now unflatten so the vectorized axis is nice and separate
  result = unflatten(result, 0, {batch_dim, -1}, s);

  return {{result}, {0}};
}

std::vector<array> Sigmoid::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  auto& s = outputs[0];
  auto sprime =
      multiply(s, subtract(array(1.0f, s.dtype()), s, stream()), stream());
  return {multiply(cotangents[0], sprime, stream())};
}

// Forward-mode derivative of the sigmoid. The sigmoid of the primal is
// recomputed and the tangent is scaled by s * (1 - s).
std::vector<array> Sigmoid::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto sig = sigmoid(primals[0], stream());
  auto one_minus_sig = subtract(array(1.0f, sig.dtype()), sig, stream());
  auto dsig = multiply(sig, one_minus_sig, stream());
  return {multiply(tangents[0], dsig, stream())};
}

// Applies sigmoid to the batched input and passes the vmap axes through
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Sigmoid::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto batched_out = sigmoid(inputs[0], stream());
  return {{batched_out}, axes};
}

// The VJP of sign delegates to the JVP, with the cotangents passed as the
// tangents.
std::vector<array> Sign::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> Sign::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  return {zeros(primals[0].shape(), primals[0].dtype(), stream())};
}

// Applies sign to the batched input and passes the vmap axes through
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Sign::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto batched_out = sign(inputs[0], stream());
  return {{batched_out}, axes};
}

// The VJP of sin delegates to the JVP, with the cotangents passed as the
// tangents.
std::vector<array> Sin::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Forward-mode derivative of sin: tangent * cos(x).
std::vector<array> Sin::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto dsin = cos(primals[0], stream());
  return {multiply(tangents[0], dsin, stream())};
}

// Applies sin to the batched input and passes the vmap axes through
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Sin::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto batched_out = sin(inputs[0], stream());
  return {{batched_out}, axes};
}

// The VJP of sinh delegates to the JVP, with the cotangents passed as the
// tangents.
std::vector<array> Sinh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Forward-mode derivative of sinh: tangent * cosh(x).
std::vector<array> Sinh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto dsinh = cosh(primals[0], stream());
  return {multiply(tangents[0], dsinh, stream())};
}

// Applies sinh to the batched input and passes the vmap axes through
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Sinh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto batched_out = sinh(inputs[0], stream());
  return {{batched_out}, axes};
}

std::pair<std::vector<array>, std::vector<int>> Slice::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto start = start_indices_;
  auto stop = end_indices_;
  auto strides = strides_;
  auto ax = axes[0];
  auto& input = inputs[0];
  if (ax >= 0) {
    start.insert(start.begin() + ax, 0);
    stop.insert(stop.begin() + ax, input.shape(ax));
    strides.insert(strides.begin() + ax, 1);
  }
  return {{slice(input, start, stop, strides, stream())}, {ax}};
}

// Reverse-mode derivative of a static slice: the cotangent is written into
// the sliced region of a zero array shaped like the input.
std::vector<array> Slice::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  // Check inputs
  assert(primals.size() == 1);

  auto zero_grad = zeros_like(primals[0], stream());
  auto grad = slice_update(
      zero_grad,
      cotangents[0],
      start_indices_,
      end_indices_,
      strides_,
      stream());
  return {grad};
}

// Forward-mode derivative of a static slice: the tangent is sliced with the
// same start, end and stride parameters.
std::vector<array> Slice::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  // Check inputs
  assert(primals.size() == 1);

  auto sliced_tangent =
      slice(tangents[0], start_indices_, end_indices_, strides_, stream());
  return {sliced_tangent};
}

bool Slice::is_equivalent(const Primitive& other) const {
  const Slice& s_other = static_cast<const Slice&>(other);
  return (
      start_indices_ == s_other.start_indices_ &&
      end_indices_ == s_other.end_indices_ && strides_ == s_other.strides_);
}

std::pair<std::vector<array>, std::vector<int>> SliceUpdate::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 2);
  assert(axes.size() == 2);

  auto start = start_indices_;
  auto stop = end_indices_;
  auto strides = strides_;

  auto src = inputs[0];
  auto upd = inputs[1];

  auto src_ax = axes[0];
  auto upd_ax = axes[1];

  // No vmapping needed
  if (src_ax == -1 && upd_ax == -1) {
    return {
        {array(
            src.shape(),
            src.dtype(),
            std::make_shared<SliceUpdate>(
                stream(), reduce_type_, start, stop, strides),
            {src, upd})},
        {-1}};
  }

  // Broadcast Src
  if (src_ax == -1) {
    src = expand_dims(src, upd_ax, stream());
    auto shape = src.shape();
    shape[upd_ax] = upd.shape(upd_ax);
    src = broadcast_to(src, shape, stream());
    src_ax = upd_ax;
  }

  // Broadcast upd
  if (upd_ax == -1) {
    upd = expand_dims(upd, src_ax, stream());
    upd_ax = src_ax;
  }

  if (src_ax != upd_ax) {
    upd = moveaxis(upd, upd_ax, src_ax, stream());
  }

  start.insert(start.begin() + src_ax, 0);
  stop.insert(stop.begin() + src_ax, src.shape(src_ax));
  strides.insert(strides.begin() + src_ax, 1);

  return {
      {array(
          src.shape(),
          src.dtype(),
          std::make_shared<SliceUpdate>(
              stream(), reduce_type_, start, stop, strides),
          {src, upd})},
      {src_ax}};
}

std::vector<array> SliceUpdate::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  // Check inputs
  assert(primals.size() == 2);

  const array& result = outputs[0];
  const array& values = primals[0];
  const array& updates = primals.back();
  const array& cotan = cotangents[0];

  std::vector<array> vjps;

  for (int num : argnums) {
    // Vjp for source
    if (num == 0) {
      switch (reduce_type_) {
        case SliceUpdate::None:
          vjps.push_back(array(
              cotan.shape(),
              cotan.dtype(),
              std::make_shared<SliceUpdate>(
                  stream(),
                  reduce_type_,
                  start_indices_,
                  end_indices_,
                  strides_),
              {cotan, zeros_like(updates, stream())}));
          break;
        case SliceUpdate::Sum:
          vjps.push_back(cotan);
          break;
        case SliceUpdate::Max:
        case SliceUpdate::Min:
          vjps.push_back(where(
              equal(result, values, stream()),
              cotan,
              array(0, cotan.dtype()),
              stream()));
          break;
        case SliceUpdate::Prod:
          vjps.push_back(array(
              cotan.shape(),
              cotan.dtype(),
              std::make_shared<SliceUpdate>(
                  stream(),
                  reduce_type_,
                  start_indices_,
                  end_indices_,
                  strides_),
              {cotan, updates}));
          break;
      }
    }
    // Vjp fpr updates
    else {
      auto sliced_cotan =
          slice(cotan, start_indices_, end_indices_, strides_, stream());
      switch (reduce_type_) {
        case SliceUpdate::None:
        case SliceUpdate::Sum:
          vjps.emplace_back(std::move(sliced_cotan));
          break;
        case SliceUpdate::Max:
        case SliceUpdate::Min: {
          auto sliced_result =
              slice(result, start_indices_, end_indices_, strides_, stream());
          vjps.push_back(where(
              equal(sliced_result, updates, stream()),
              sliced_cotan,
              array(0, cotan.dtype()),
              stream()));
          break;
        }
        case SliceUpdate::Prod: {
          auto sliced_values =
              slice(values, start_indices_, end_indices_, strides_, stream());
          vjps.push_back(multiply(sliced_cotan, sliced_values, stream()));
          break;
        }
      }
    }
  }

  return vjps;
}

// Forward-mode derivative of a static slice update.
//
// Implemented only when both the source and the updates are differentiated
// (argnums.size() == 2) and the reduce type is None or Sum; the same slice
// update is then applied to the two tangents. All other cases throw
// std::runtime_error.
std::vector<array> SliceUpdate::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  // Check inputs
  assert(primals.size() == 2);

  if (argnums.size() != 2) {
    throw std::runtime_error(
        "[SliceUpdate] JVP for one argument not implemented yet.");
  }

  auto source_tangent = tangents[0];

  switch (reduce_type_) {
    // None and Sum share the same rule.
    case SliceUpdate::None:
    case SliceUpdate::Sum: {
      auto primitive = std::make_shared<SliceUpdate>(
          stream(), reduce_type_, start_indices_, end_indices_, strides_);
      return {array(
          source_tangent.shape(),
          source_tangent.dtype(),
          std::move(primitive),
          {source_tangent, tangents[1]})};
    }
    case SliceUpdate::Prod:
    case SliceUpdate::Max:
    case SliceUpdate::Min: {
      throw std::runtime_error(
          "[SliceUpdate] JVP for product, minimum and maximum not implemented.");
    }
  }

  // Appease gcc (although no path reaches here).
  return {};
}

// Equivalent when the reduce type and all slice parameters match. `other` is
// cast to SliceUpdate without a runtime check.
bool SliceUpdate::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const SliceUpdate&>(other);
  if (reduce_type_ != rhs.reduce_type_) {
    return false;
  }
  if (start_indices_ != rhs.start_indices_) {
    return false;
  }
  if (end_indices_ != rhs.end_indices_) {
    return false;
  }
  return strides_ == rhs.strides_;
}

std::pair<std::vector<array>, std::vector<int>> DynamicSlice::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto& in = inputs[0];
  auto& start = inputs[1];
  auto vax = axes[0];
  if (axes[1] >= 0) {
    throw std::invalid_argument(
        "[DynamicSlice::vmap] vmap over start indices not yet supported.");
  }
  auto slice_size = slice_size_;
  auto slice_axes = axes_;
  if (vax >= 0) {
    for (auto& ax : slice_axes) {
      if (ax >= vax) {
        ax++;
      }
    }
    slice_size.insert(slice_size.begin() + vax, in.shape(vax));
  }
  return {
      {slice(
          in, start, std::move(slice_axes), std::move(slice_size), stream())},
      {vax}};
}

std::vector<array> DynamicSlice::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  if (argnums[0] == 1 || argnums.size() > 1) {
    throw std::invalid_argument(
        "[DynamicSlice::vjp] Not supported for start indices.");
  }
  auto out = zeros_like(primals[0], stream());
  return {slice_update(out, cotangents[0], primals[1], axes_, stream())};
}

std::vector<array> DynamicSlice::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {slice(tangents[0], primals[1], axes_, slice_size_, stream())};
}

// Equivalent when the sliced axes and the slice size match. `other` is cast
// to DynamicSlice without a runtime check.
bool DynamicSlice::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const DynamicSlice&>(other);
  if (axes_ != rhs.axes_) {
    return false;
  }
  return slice_size_ == rhs.slice_size_;
}

// The output shape is the stored slice size; the inputs are not inspected.
std::vector<Shape> DynamicSlice::output_shapes(const std::vector<array>&) {
  std::vector<Shape> shapes{slice_size_};
  return shapes;
}

std::pair<std::vector<array>, std::vector<int>> DynamicSliceUpdate::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto src = inputs[0];
  auto upd = inputs[1];
  auto& start = inputs[2];
  auto src_ax = axes[0];
  auto upd_ax = axes[1];
  if (axes[2] >= 0) {
    throw std::runtime_error(
        "[DynamicSliceUpdate::vmap] vmap over start indices not yet supported.");
  }
  // No vmapping needed
  if (src_ax == -1 && upd_ax == -1) {
    return {{slice_update(src, upd, start, axes_, stream())}, {-1}};
  }

  // Broadcast src
  if (src_ax == -1) {
    src = expand_dims(src, upd_ax, stream());
    auto shape = src.shape();
    shape[upd_ax] = upd.shape(upd_ax);
    src = broadcast_to(src, shape, stream());
    src_ax = upd_ax;
  }

  // Broadcast upd
  if (upd_ax == -1) {
    upd = expand_dims(upd, src_ax, stream());
    upd_ax = src_ax;
  }

  if (src_ax != upd_ax) {
    upd = moveaxis(upd, upd_ax, src_ax, stream());
  }

  auto slice_axes = axes_;
  for (auto& ax : slice_axes) {
    if (ax >= src_ax) {
      ax++;
    }
  }
  return {
      {slice_update(src, upd, start, std::move(slice_axes), stream())},
      {src_ax}};
}

std::vector<array> DynamicSliceUpdate::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto& cotan = cotangents[0];
  auto& upd = primals[1];
  auto& start = primals[2];

  std::vector<array> vjps;

  for (int num : argnums) {
    if (num == 0) {
      // Vjp for source
      vjps.push_back(slice_update(
          cotan, zeros_like(upd, stream()), start, axes_, stream()));
    } else if (num == 1) {
      // Vjp fpr updates
      vjps.push_back(slice(cotan, start, axes_, upd.shape(), stream()));
    } else {
      throw std::invalid_argument(
          "[DynamicSliceUpdate::vjp] Not supported for start indices");
    }
  }
  return vjps;
}

std::vector<array> DynamicSliceUpdate::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {slice_update(tangents[0], tangents[1], primals[2], axes_, stream())};
}

// Equivalent when the updated axes match. `other` is cast to
// DynamicSliceUpdate without a runtime check.
bool DynamicSliceUpdate::is_equivalent(const Primitive& other) const {
  const auto& rhs = static_cast<const DynamicSliceUpdate&>(other);
  const bool same_axes = axes_ == rhs.axes_;
  return same_axes;
}

// Vectorizes softmax.
//
// The VJP/JVP of this primitive reduce over the last axis (-1). If the vmap
// axis is the last axis of the batched input, the softmax axis sits one
// before it (-2); in every other case, including an input that is not
// vmapped at all (axes[0] < 0), the softmax axis stays at -1.
std::pair<std::vector<array>, std::vector<int>> Softmax::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  const int vmap_ax = axes[0];
  const int ndim = static_cast<int>(inputs[0].ndim());

  // Only a real (non-negative) vmap axis in the last position displaces the
  // softmax axis. An unvmapped input must keep the softmax axis at -1.
  const bool vmap_is_last = vmap_ax >= 0 && vmap_ax >= ndim - 1;
  const std::vector<int> softmax_axes{vmap_is_last ? -2 : -1};

  return {{softmax(inputs[0], softmax_axes, precise_, stream())}, axes};
}

// Reverse-mode derivative of softmax over the last axis, computed from the
// forward output s and the cotangent g:
//   s * g - s * sum(s * g, axis=-1, keepdims=true)
// Every op, including the final subtraction, is issued on this primitive's
// stream.
std::vector<array> Softmax::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() == 1);
  assert(cotangents.size() == 1);
  auto& s = outputs[0];
  auto sv = multiply(s, cotangents[0], stream());
  return {subtract(
      sv,
      multiply(s, sum(sv, std::vector<int>{-1}, true, stream()), stream()),
      stream())};
}

// Forward-mode derivative of softmax over the last axis. The softmax of the
// primal is recomputed as s, and for a tangent t the result is:
//   s * t - s * sum(s * t, axis=-1, keepdims=true)
// Every op, including the final subtraction, is issued on this primitive's
// stream.
std::vector<array> Softmax::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  auto s = softmax(primals[0], std::vector<int>{-1}, precise_, stream());
  auto sv = multiply(s, tangents[0], stream());
  return {subtract(
      sv,
      multiply(s, sum(sv, std::vector<int>{-1}, true, stream()), stream()),
      stream())};
}

bool Softmax::is_equivalent(const Primitive& other) const {
  const Softmax& s_other = static_cast<const Softmax&>(other);
  return precise_ == s_other.precise_;
}

// Vectorizes sort. The sort axis moves one to the right when the vmap axis is
// at or before it; the output keeps the input's vmap axis.
std::pair<std::vector<array>, std::vector<int>> Sort::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  const int batch_ax = axes[0];
  int sort_axis = axis_;
  if (batch_ax >= 0 && batch_ax <= axis_) {
    sort_axis += 1;
  }
  return {{sort(inputs[0], sort_axis, stream())}, axes};
}

std::vector<array> Sort::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  return jvp(primals, cotangents, argnums);
}

// Forward-mode derivative of sort: the tangent is reordered with the same
// permutation that sorts the primal along axis_.
std::vector<array> Sort::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  auto order = argsort(primals[0], axis_, stream());
  return {take_along_axis(tangents[0], order, axis_, stream())};
}

bool Sort::is_equivalent(const Primitive& other) const {
  const Sort& r_other = static_cast<const Sort&>(other);
  return axis_ == r_other.axis_;
}

// Vectorizes split. The split axis moves one to the right when the vmap axis
// is at or before it; every output piece is vmapped along the input's axis.
std::pair<std::vector<array>, std::vector<int>> Split::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  const int batch_ax = axes[0];
  int split_axis = axis_;
  if (batch_ax >= 0 && batch_ax <= axis_) {
    split_axis += 1;
  }

  auto pieces = split(inputs[0], indices_, split_axis, stream());
  std::vector<int> piece_axes(pieces.size(), batch_ax);
  return {std::move(pieces), std::move(piece_axes)};
}

// Reverse-mode derivative of split: the per-piece cotangents are concatenated
// back together along the split axis.
std::vector<array> Split::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto joined = concatenate(cotangents, axis_, stream());
  return {joined};
}

std::vector<array> Split::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  return split(tangents[0], indices_, axis_, stream());
}

bool Split::is_equivalent(const Primitive& other) const {
  const Split& s_other = static_cast<const Split&>(other);
  return axis_ == s_other.axis_ && indices_ == s_other.indices_;
}

// Vector-Jacobian product of square. The Jacobian is diagonal, so the JVP
// rule is reused with the cotangents in place of the tangents.
std::vector<array> Square::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Jacobian-vector product of square: d(x^2) = x * (2 * dx).
std::vector<array> Square::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  const auto& x = primals[0];
  // The constant 2 is created in the primal's dtype.
  auto doubled_tangent = multiply(array(2, x.dtype()), tangents[0], stream());
  return {multiply(x, doubled_tangent, stream())};
}

std::pair<std::vector<array>, std::vector<int>> Square::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{square(inputs[0], stream())}, axes};
}

// Vector-Jacobian product shared by sqrt and rsqrt (selected by recip_).
//   sqrt : grad = (0.5 * cotan) / out
//   rsqrt: grad = (-0.5 * cotan) * (out / x)
// `outputs[0]` is the forward result for primals[0].
std::vector<array> Sqrt::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>& outputs) {
  assert(primals.size() == 1);
  assert(cotangents.size() == 1);
  const auto dtype = primals[0].dtype();

  if (!recip_) {
    auto half_cotan = multiply(array(0.5, dtype), cotangents[0], stream());
    return {divide(half_cotan, outputs[0], stream())};
  }

  auto one_over_x_root_x = divide(outputs[0], primals[0], stream());
  auto scaled_cotan = multiply(array(-0.5, dtype), cotangents[0], stream());
  return {multiply(scaled_cotan, one_over_x_root_x, stream())};
}

// Jacobian-vector product of sqrt / rsqrt. The forward value is recomputed
// and handed to the VJP rule, with the tangents in place of the cotangents.
std::vector<array> Sqrt::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto forward =
      recip_ ? rsqrt(primals[0], stream()) : sqrt(primals[0], stream());
  return vjp(primals, tangents, argnums, {forward});
}

// Vectorized sqrt / rsqrt: apply the op selected by recip_ to the batched
// input and keep the batch axis unchanged.
std::pair<std::vector<array>, std::vector<int>> Sqrt::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto result =
      recip_ ? rsqrt(inputs[0], stream()) : sqrt(inputs[0], stream());
  return {{result}, axes};
}

bool Sqrt::is_equivalent(const Primitive& other) const {
  const Sqrt& s_other = static_cast<const Sqrt&>(other);
  return recip_ == s_other.recip_;
}

// Vectorized stop_gradient: re-apply the op to the batched input; the batch
// axis is passed through untouched.
std::pair<std::vector<array>, std::vector<int>> StopGradient::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto stopped = stop_gradient(inputs[0], stream());
  return {{stopped}, axes};
}

std::vector<array> Subtract::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  for (auto arg : argnums) {
    auto vjp = cotangents[0];
    if (arg == 1) {
      vjp = negative(vjp, stream());
    }
    vjps.push_back(vjp);
  }
  return vjps;
}

std::vector<array> Subtract::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  auto jvp_fun = [&](int i) {
    int arg = argnums[i];
    return arg == 1 ? negative(tangents[i], stream()) : tangents[i];
  };
  auto out = jvp_fun(0);
  if (argnums.size() > 1) {
    out = add(out, jvp_fun(1), stream());
  }
  return {out};
}

// Vectorized subtract: vmap_binary_op prepares both operands and reports the
// output batch axis; the subtraction is then applied to the prepared pair.
std::pair<std::vector<array>, std::vector<int>> Subtract::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  auto [lhs, rhs, out_axis] = vmap_binary_op(inputs, axes, stream());
  auto difference = subtract(lhs, rhs, stream());
  return {{difference}, {out_axis}};
}

// Vector-Jacobian product of squeeze: re-insert the removed axes into the
// cotangent.
std::vector<array> Squeeze::vjp(
    const std::vector<array>&,
    const std::vector<array>& cotangents,
    const std::vector<int>&,
    const std::vector<array>&) {
  auto restored = expand_dims(cotangents[0], axes_, stream());
  return {restored};
}

std::vector<array> Squeeze::jvp(
    const std::vector<array>&,
    const std::vector<array>& tangents,
    const std::vector<int>&) {
  return {squeeze(tangents[0], axes_, stream())};
}

// Vectorized squeeze. Each squeezed axis at or after the batch axis is
// shifted right by one to make room for the batch dimension; each squeezed
// axis before the batch axis moves the output batch axis one to the left,
// since that dimension disappears from the output.
std::pair<std::vector<array>, std::vector<int>> Squeeze::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  const auto batch_axis = axes[0];
  auto out_axis = batch_axis;
  auto shifted_axes = axes_;
  for (auto& sq : shifted_axes) {
    if (sq < batch_axis) {
      --out_axis;
      continue;
    }
    ++sq;
  }
  return {
      {squeeze(inputs[0], std::move(shifted_axes), stream())}, {out_axis}};
}

bool Squeeze::is_equivalent(const Primitive& other) const {
  const Squeeze& a_other = static_cast<const Squeeze&>(other);
  return (axes_ == a_other.axes_);
}

// Shape of `input` after dropping the dimensions listed in `axes`.
// `axes` is consumed front to back while walking the input dimensions in
// increasing order, so it is expected to be sorted ascending.
Shape Squeeze::output_shape(const array& input, const std::vector<int>& axes) {
  Shape kept;
  auto next_removed = axes.begin();
  for (int dim = 0; dim < input.ndim(); ++dim) {
    const bool removed = next_removed != axes.end() && dim == *next_removed;
    if (removed) {
      ++next_removed;
      continue;
    }
    kept.push_back(input.shape(dim));
  }
  return kept;
}

// Single-output shape inference: the first input's shape with axes_ removed.
std::vector<Shape> Squeeze::output_shapes(const std::vector<array>& inputs) {
  auto squeezed_shape = Squeeze::output_shape(inputs[0], axes_);
  return {squeezed_shape};
}

// Vector-Jacobian product of tan. The Jacobian is diagonal, so the JVP rule
// is reused with the cotangents in place of the tangents.
std::vector<array> Tan::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> Tan::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array cos_sq = square(cos(primals[0], stream()), stream());
  return {divide(tangents[0], cos_sq, stream())};
}

// Vectorized tan: apply tan to the batched input and keep the batch axis
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Tan::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto result = tan(inputs[0], stream());
  return {{result}, axes};
}

// Vector-Jacobian product of tanh. The Jacobian is diagonal, so the JVP rule
// is reused with the cotangents in place of the tangents.
std::vector<array> Tanh::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

std::vector<array> Tanh::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  array cosh_sq = square(cosh(primals[0], stream()), stream());
  return {divide(tangents[0], cosh_sq, stream())};
}

// Vectorized tanh: apply tanh to the batched input and keep the batch axis
// unchanged.
std::pair<std::vector<array>, std::vector<int>> Tanh::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto result = tanh(inputs[0], stream());
  return {{result}, axes};
}

// Vectorized bitwise_invert: apply the op to the batched input and keep the
// batch axis unchanged.
std::pair<std::vector<array>, std::vector<int>> BitwiseInvert::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto inverted = bitwise_invert(inputs[0], stream());
  return {{inverted}, axes};
}

// Vector-Jacobian product of the block-masked matrix multiplication.
//
// Primal layout (as decoded below from primals.size()):
//   [A, B]                                   (2 primals)
//   [A, B, out_mask]                         (3 primals)
//   [A, B, lhs_mask, rhs_mask]               (4 primals)
//   [A, B, out_mask, lhs_mask, rhs_mask]     (5 primals)
//
// One gradient is produced per entry of `argnums`, in the same order.
// Throws std::invalid_argument when a gradient with respect to a boolean
// operand mask is requested, or when an argnum does not name a
// differentiable primal.
std::vector<array> BlockMaskedMM::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  /////////////////////////////////////////////////////////////////////////////
  // The operation that is done w/o intermediates by the primitive is
  //    - tm = (M + block_size - 1) // block_size; MP = tm * block_size;
  //    - tn = (N + block_size - 1) // block_size; NP = tn * block_size;
  //    - tk = (K + block_size - 1) // block_size; KP = tk * block_size;
  //    - mask_b <- mask broadcasted to block sizes
  //    - A_m = A [..., M, K] * mask_b_lhs [..., MP, KP]
  //    - B_m = B [..., K, N] * mask_b_rhs [..., KP, NP]
  //    - C = A_m [..., M, K]  @ B_m [..., K, N]
  //    - C_m = C [..., M, N] * mask_b_out [..., MP, NP]
  //
  // The grads are therefore
  //    - dC_m = cotan [..., M, N]
  //    - dmask_b_out = cotan [..., M, N] * C [..., M, N]
  //    - dC = cotan [..., M, N] * mask_b_out [..., MP, NP]
  //    - dA_m = dC [..., M, N] @ B_m.T [..., N, K]
  //    - dB_m = A_m.T [..., K, M] @ dC [..., M, N]
  //    - dA = dA_m * mask_b_lhs [..., MP, KP]
  //    - dB = dB_m * mask_b_rhs [..., KP, NP]
  //    - dmask_b_lhs = dA_m [..., M, K] * A [..., M, K] // need [..., MP, KP]
  //    - dmask_b_rhs = dB_m [..., K, N] * B [..., K, N] // need [..., KP, NP]
  //
  // Observations:
  //  * If dmask_b_lhs is not needed, then dA can be calculated in one go as a
  //    block_masked_mm with mask_b_lhs as the out_mask without needing to
  //    materialize the intermediate dA_m. Similar for dB.
  //  * If dmask_b_lhs is needed, we need to materialize dA_m directly and
  //    then point-wise multiply with A. But the output needs to be padded.

  std::vector<array> vjps;
  auto& cotan = cotangents[0];

  // Permutation that swaps the last two axes (matrix transpose).
  std::vector<int> reorder(cotan.ndim());
  std::iota(reorder.begin(), reorder.end(), 0);
  std::iter_swap(reorder.end() - 1, reorder.end() - 2);

  bool has_op_mask = primals.size() > 3;
  bool has_out_mask = primals.size() == 3 || primals.size() == 5;

  const int op_mask_idx = has_out_mask ? 3 : 2;

  // A mask gradient is needed if its index appears ANYWHERE in argnums.
  // The flags are accumulated with |= so that an earlier request is not
  // overwritten by a later, unrelated argnum (which would also let the
  // boolean-mask check below be skipped).
  bool needs_lhs_mask_vjp = false;
  bool needs_rhs_mask_vjp = false;
  if (has_op_mask) {
    for (auto arg : argnums) {
      needs_lhs_mask_vjp |= (arg == op_mask_idx);
      needs_rhs_mask_vjp |= (arg == op_mask_idx + 1);
    }
  }

  if ((needs_lhs_mask_vjp && primals[op_mask_idx].dtype() == bool_) ||
      (needs_rhs_mask_vjp && primals[op_mask_idx + 1].dtype() == bool_)) {
    throw std::invalid_argument(
        "[BlockMaskedMM] Cannot calculate VJP with respect to boolean masks.");
  }

  // Expand a per-block mask [..., ty, tx] to a per-element mask [..., Y, X].
  auto expand_mask = [&](array mask, int Y, int X) {
    // Expand mask
    auto mask_reshape = mask.shape();
    mask = expand_dims(mask, {-3, -1}, stream());
    auto mask_shape = mask.shape();
    int mask_ndim = mask_shape.size();

    // Broadcast mask
    mask_shape[mask_ndim - 1] = block_size_;
    mask_shape[mask_ndim - 3] = block_size_;
    mask = broadcast_to(mask, mask_shape, stream());

    // Reshape mask to squeeze in broadcasted dims
    mask_ndim = mask_reshape.size();
    mask_reshape[mask_ndim - 2] *= block_size_;
    mask_reshape[mask_ndim - 1] *= block_size_;
    mask = reshape(mask, mask_reshape, stream());

    // Slice mask down to the unpadded matrix size
    mask_reshape[mask_ndim - 2] = Y;
    mask_reshape[mask_ndim - 1] = X;
    mask = slice(mask, Shape(mask_ndim, 0), mask_reshape, stream());

    return mask;
  };

  array zero = array(0, cotan.dtype());

  // Element-wise product of p and q, zero-padded up to a multiple of the
  // block size and then summed within each block, giving one value per block.
  auto multiply_pad_reduce = [&](array p, array q, int align_Y, int align_X) {
    // Multiply with cotan
    auto r = multiply(p, q, stream());

    // Pad if needed
    if ((align_Y != 0) || (align_X != 0)) {
      r = pad(
          r, {-2, -1}, {0, 0}, {align_Y, align_X}, zero, "constant", stream());
    }

    // Reshape
    Shape r_reshape(r.shape().begin(), r.shape().end() - 2);
    r_reshape.push_back(r.shape(-2) / block_size_);
    r_reshape.push_back(block_size_);
    r_reshape.push_back(r.shape(-1) / block_size_);
    r_reshape.push_back(block_size_);
    r = reshape(r, r_reshape, stream());

    // Reduce
    return sum(r, {-3, -1}, false, stream());
  };

  // Prepare for padding if needed
  const int M = cotan.shape(-2);
  const int N = cotan.shape(-1);
  const int K = primals[0].shape(-1);
  const int tm = (M + block_size_ - 1) / block_size_;
  const int tn = (N + block_size_ - 1) / block_size_;
  const int tk = (K + block_size_ - 1) / block_size_;
  const int align_M = tm * block_size_ - M;
  const int align_N = tn * block_size_ - N;
  const int align_K = tk * block_size_ - K;

  // Potential intermediates (initialized with placeholders; only meaningful
  // once the matching *_calculated flag is set)
  array unmasked_lhs_grad = primals[0];
  array unmasked_rhs_grad = primals[1];

  bool unmasked_lhs_grad_calculated = false;
  bool unmasked_rhs_grad_calculated = false;

  for (auto arg : argnums) {
    if (arg == 0) {
      // M X N * (K X N).T -> M X K
      auto b_t = transpose(primals[1], reorder, stream());
      auto out_mask =
          has_out_mask ? std::make_optional<array>(primals[2]) : std::nullopt;
      // When the lhs mask gradient is also needed, keep the result unmasked
      // so it can be reused, and apply the lhs mask explicitly afterwards.
      auto lhs_mask = has_op_mask && !needs_lhs_mask_vjp
          ? std::make_optional<array>(primals[op_mask_idx])
          : std::nullopt;
      auto rhs_mask_t = has_op_mask
          ? std::make_optional<array>(
                transpose(primals[op_mask_idx + 1], reorder, stream()))
          : std::nullopt;

      auto grad = block_masked_mm(
          cotan, b_t, block_size_, lhs_mask, out_mask, rhs_mask_t, stream());

      if (needs_lhs_mask_vjp) {
        unmasked_lhs_grad = grad;
        unmasked_lhs_grad_calculated = true;
        auto exp_mask = expand_mask(primals[op_mask_idx], M, K);
        grad = multiply(grad, exp_mask, stream());
      }

      vjps.push_back(grad);

    } else if (arg == 1) {
      // (M X K).T * M X N -> K X N
      auto a_t = transpose(primals[0], reorder, stream());
      auto out_mask =
          has_out_mask ? std::make_optional<array>(primals[2]) : std::nullopt;
      auto lhs_mask_t = has_op_mask
          ? std::make_optional<array>(
                transpose(primals[op_mask_idx], reorder, stream()))
          : std::nullopt;
      // Same idea as for the lhs: stay unmasked if the rhs mask gradient is
      // requested as well.
      auto rhs_mask = has_op_mask && !needs_rhs_mask_vjp
          ? std::make_optional<array>(primals[op_mask_idx + 1])
          : std::nullopt;

      auto grad = block_masked_mm(
          a_t, cotan, block_size_, rhs_mask, lhs_mask_t, out_mask, stream());

      if (needs_rhs_mask_vjp) {
        unmasked_rhs_grad = grad;
        unmasked_rhs_grad_calculated = true;
        auto exp_mask = expand_mask(primals[op_mask_idx + 1], K, N);
        grad = multiply(grad, exp_mask, stream());
      }

      vjps.push_back(grad);

    } else if (arg == 2 && has_out_mask) {
      // Produce the forward result
      auto lhs_mask = has_op_mask
          ? std::make_optional<array>(primals[op_mask_idx])
          : std::nullopt;
      auto rhs_mask = has_op_mask
          ? std::make_optional<array>(primals[op_mask_idx + 1])
          : std::nullopt;

      auto C = block_masked_mm(
          primals[0],
          primals[1],
          block_size_,
          primals[2],
          lhs_mask,
          rhs_mask,
          stream());

      // Multiply, Pad and Reduce if needed
      auto grad = multiply_pad_reduce(cotan, C, align_M, align_N);
      vjps.push_back(grad);

    } else if (arg == op_mask_idx && has_op_mask) {
      if (!unmasked_lhs_grad_calculated) {
        // M X N * (K X N).T -> M X K
        auto b_t = transpose(primals[1], reorder, stream());
        auto out_mask =
            has_out_mask ? std::make_optional<array>(primals[2]) : std::nullopt;
        auto rhs_mask_t =
            transpose(primals[op_mask_idx + 1], reorder, stream());

        unmasked_lhs_grad = block_masked_mm(
            cotan,
            b_t,
            block_size_,
            std::nullopt,
            out_mask,
            rhs_mask_t,
            stream());

        unmasked_lhs_grad_calculated = true;
      }

      // Multiply, Pad and Reduce if needed
      auto grad =
          multiply_pad_reduce(primals[0], unmasked_lhs_grad, align_M, align_K);
      vjps.push_back(grad);

    } else if (arg == op_mask_idx + 1 && has_op_mask) {
      if (!unmasked_rhs_grad_calculated) {
        // (M X K).T * M X N -> K X N
        auto a_t = transpose(primals[0], reorder, stream());
        auto out_mask =
            has_out_mask ? std::make_optional<array>(primals[2]) : std::nullopt;
        auto lhs_mask_t = transpose(primals[op_mask_idx], reorder, stream());

        unmasked_rhs_grad = block_masked_mm(
            a_t,
            cotan,
            block_size_,
            std::nullopt,
            lhs_mask_t,
            out_mask,
            stream());

        unmasked_rhs_grad_calculated = true;
      }

      // Multiply, Pad and Reduce if needed
      auto grad =
          multiply_pad_reduce(primals[1], unmasked_rhs_grad, align_K, align_N);
      vjps.push_back(grad);

    } else {
      throw std::invalid_argument(
          "[BlockMaskedMM] Cannot calculate VJP with respect to masks.");
    }
  }
  return vjps;
}

std::vector<array> GatherMM::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  std::vector<array> vjps;
  auto& cotan = cotangents[0];

  auto& a = primals[0];
  auto& b = primals[1];
  auto& lhs_indices = primals[2];
  auto& rhs_indices = primals[3];

  int M = cotan.shape(-2);
  int K = primals[0].shape(-1);

  bool sorted = left_sorted_ || right_sorted_;
  bool no_broadcast = rhs_indices.size() * M * K == primals[0].size();

  for (auto arg : argnums) {
    if (arg == 0) {
      auto g = gather_mm(
          cotan,
          swapaxes(b, -1, -2, stream()),
          std::nullopt,
          rhs_indices,
          sorted,
          stream());
      if (sorted && no_broadcast) {
        vjps.push_back(g);
      } else {
        vjps.push_back(reshape(
            scatter_add(
                flatten(zeros_like(a, stream()), 0, -3, stream()),
                lhs_indices,
                expand_dims(g, -3, stream()),
                0,
                stream()),
            a.shape(),
            stream()));
      }
    } else if (arg == 1) {
      auto shape = b.shape();
      shape.pop_back();
      shape.pop_back();
      vjps.push_back(swapaxes(
          gather_mm_grad(
              a,
              cotan,
              lhs_indices,
              rhs_indices,
              sorted,
              std::move(shape),
              stream()),
          -1,
          -2,
          stream()));
    } else {
      throw std::invalid_argument(
          "[GatherMM] Cannot calculate VJP with respect to indices.");
    }
  }
  return vjps;
}

bool GatherMM::is_equivalent(const Primitive& other) const {
  const GatherMM& g_other = static_cast<const GatherMM&>(other);
  return left_sorted_ == g_other.left_sorted_ &&
      right_sorted_ == g_other.right_sorted_;
}

bool BlockMaskedMM::is_equivalent(const Primitive& other) const {
  const BlockMaskedMM& a_other = static_cast<const BlockMaskedMM&>(other);
  return (block_size_ == a_other.block_size_);
}

// Vector-Jacobian product of transpose: transpose the cotangent with the
// inverse of the forward permutation.
std::vector<array> Transpose::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  // Build the inverse: if forward output axis `dst` reads input axis `src`,
  // then inverse[src] = dst.
  std::vector<int> inverse(axes_.size());
  int dst = 0;
  for (int src : axes_) {
    inverse[src] = dst;
    ++dst;
  }
  return {transpose(cotangents[0], inverse, stream())};
}

// Jacobian-vector product of transpose: apply the same permutation to the
// tangent.
std::vector<array> Transpose::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(tangents.size() == 1);
  auto permuted = transpose(tangents[0], axes_, stream());
  return {permuted};
}

// Vectorized transpose.
//
// When the input carries a batch axis `vdim`, every permutation entry at or
// after `vdim` is shifted right by one and `vdim` itself is inserted at
// position `vdim`, so the batch dimension stays in place in the output.
//
// The adjusted permutation is built in a local copy: vmap must not modify the
// primitive's own axes_, otherwise later calls on the same primitive (vjp,
// jvp, is_equivalent, output_shapes, or another vmap) would see a corrupted
// permutation.
std::pair<std::vector<array>, std::vector<int>> Transpose::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto vdim = axes[0];
  std::vector<int> new_axes = axes_;
  if (vdim >= 0) {
    for (auto& dim : new_axes) {
      if (dim >= vdim) {
        dim++;
      }
    }
    new_axes.insert(new_axes.begin() + vdim, vdim);
  }
  return {{transpose(inputs[0], std::move(new_axes), stream())}, {vdim}};
}

bool Transpose::is_equivalent(const Primitive& other) const {
  const Transpose& t_other = static_cast<const Transpose&>(other);
  return axes_ == t_other.axes_;
}

std::vector<Shape> Transpose::output_shapes(const std::vector<array>& inputs) {
  auto& in = inputs[0];
  Shape shape(in.ndim(), 0);
  for (int i = 0; i < axes_.size(); ++i) {
    shape[i] = in.shape()[axes_[i]];
  }
  return {std::move(shape)};
}

// Vectorized NumberOfElements. The counted axes are shifted past the batch
// axis and a new scalar primitive is built on the batched input. The result
// is reported with batch axis -1, i.e. it is not batched.
std::pair<std::vector<array>, std::vector<int>> NumberOfElements::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);

  const auto vdim = axes[0];
  std::vector<int> shifted_axes = axes_;
  if (vdim >= 0) {
    for (auto& counted : shifted_axes) {
      // Axes at or after the batch axis move right by one.
      counted += (counted >= vdim) ? 1 : 0;
    }
  }

  auto primitive = std::make_shared<NumberOfElements>(
      stream(), shifted_axes, inverted_, dtype_);
  array out = array({}, dtype_, std::move(primitive), inputs);

  return {{out}, {-1}};
}

bool NumberOfElements::is_equivalent(const Primitive& other) const {
  const NumberOfElements& n_other = static_cast<const NumberOfElements&>(other);
  return axes_ == n_other.axes_ && inverted_ == n_other.inverted_ &&
      dtype_ == n_other.dtype_;
}

// Vectorized SVD: move the batch axis (if any) to the front and run
// linalg::svd on the result. Every output (three when compute_uv_ is set,
// otherwise one) is batched along axis 0, or reported as -1 when the input
// was not batched.
std::pair<std::vector<array>, std::vector<int>> SVD::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  const auto batch_axis = axes[0];
  auto out_axis = batch_axis >= 0 ? 0 : -1;
  // No move is needed when the batch axis is already leading or absent.
  auto a = batch_axis > 0 ? moveaxis(inputs[0], batch_axis, 0, stream())
                          : inputs[0];
  std::vector<int> out_axes(compute_uv_ ? 3 : 1, out_axis);
  return {linalg::svd(a, compute_uv_, stream()), std::move(out_axes)};
}

// Vectorized matrix inverse: move the batch axis (if any) to the front and
// run linalg::inv on the result. The output is batched along axis 0, or
// reported as -1 when the input was not batched.
std::pair<std::vector<array>, std::vector<int>> Inverse::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  const auto batch_axis = axes[0];
  auto out_axis = batch_axis >= 0 ? 0 : -1;
  // No move is needed when the batch axis is already leading or absent.
  auto a = batch_axis > 0 ? moveaxis(inputs[0], batch_axis, 0, stream())
                          : inputs[0];
  auto inverted = linalg::inv(a, stream());
  return {{inverted}, {out_axis}};
}

std::pair<std::vector<array>, std::vector<int>> View::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  return {{view(inputs[0], dtype_, stream())}, axes};
}

// Name of the primitive, "View <dtype>". The string is built on first use and
// cached in name_; the returned pointer refers to that cached string.
const char* View::name() const {
  if (!name_.empty()) {
    return name_.c_str();
  }
  std::ostringstream label;
  label << "View " << dtype_;
  name_ = label.str();
  return name_.c_str();
}

bool View::is_equivalent(const Primitive& other) const {
  const View& a_other = static_cast<const View&>(other);
  return (dtype_ == a_other.dtype_);
}

std::pair<std::vector<array>, std::vector<int>> Hadamard::vmap(
    const std::vector<array>& inputs,
    const std::vector<int>& axes) {
  assert(inputs.size() == 1);
  assert(axes.size() == 1);
  auto& s = stream();
  if (axes[0] == inputs[0].ndim() - 1) {
    auto a = moveaxis(inputs[0], axes[0], 0, s);
    auto b = hadamard_transform(a, scale_, s);
    return {{b}, {0}};
  }
  return {{hadamard_transform(inputs[0], scale_, s)}, axes};
}

// Vector-Jacobian product of the Hadamard transform: the JVP rule is reused
// with the cotangents in place of the tangents.
std::vector<array> Hadamard::vjp(
    const std::vector<array>& primals,
    const std::vector<array>& cotangents,
    const std::vector<int>& argnums,
    const std::vector<array>&) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto grads = jvp(primals, cotangents, argnums);
  return grads;
}

// Jacobian-vector product of the Hadamard transform: apply the same
// transform, with the same scale, to the tangent.
std::vector<array> Hadamard::jvp(
    const std::vector<array>& primals,
    const std::vector<array>& tangents,
    const std::vector<int>& argnums) {
  assert(primals.size() == 1);
  assert(argnums.size() == 1);
  auto transformed = hadamard_transform(tangents[0], scale_, stream());
  return {transformed};
}

bool Hadamard::is_equivalent(const Primitive& other) const {
  const Hadamard& h_other = static_cast<const Hadamard&>(other);
  return scale_ == h_other.scale_;
}

} // namespace mlx::core


================================================
FILE: mlx/primitives.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <unordered_set>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/device.h"
#include "mlx/io/load.h"
#include "mlx/stream.h"

// Declares the `vmap` override inside a primitive class. The definition is
// expected to be provided out of line.
#define DEFINE_VMAP()                                                 \
  virtual std::pair<std::vector<array>, std::vector<int>> vmap(       \
      const std::vector<array>& inputs, const std::vector<int>& axes) \
      override;

// Declares the `jvp` and `vjp` overrides inside a primitive class. The
// definitions are expected to be provided out of line.
#define DEFINE_GRADS()                           \
  std::vector<array> jvp(                        \
      const std::vector<array>& primals,         \
      const std::vector<array>& tangents,        \
      const std::vector<int>& argnums) override; \
                                                 \
  std::vector<array> vjp(                        \
      const std::vector<array>& primals,         \
      const std::vector<array>& cotangents,      \
      const std::vector<int>& argnums,           \
      const std::vector<array>& outputs) override;

// Defines `name()` to return the stringified macro argument, e.g.
// DEFINE_NAME(Abs) yields "Abs".
#define DEFINE_NAME(PRIMITIVE)        \
  const char* name() const override { \
    return #PRIMITIVE;                \
  }

// Defines `is_equivalent` to always return true, regardless of `other`.
// Suitable for primitives that carry no state of their own.
#define DEFINE_DEFAULT_IS_EQUIVALENT()                        \
  bool is_equivalent(const Primitive& other) const override { \
    return true;                                              \
  }

// Defines `output_shapes` to return a single shape equal to the shape of the
// first input.
#define DEFINE_INPUT_OUTPUT_SHAPE()                                  \
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) \
      override {                                                     \
    return {inputs[0].shape()};                                      \
  }

namespace mlx::core {

// Abstract base class
// Base class of all primitives. A primitive stores the stream it runs on,
// must implement CPU and GPU evaluation and `name()`, and may override the
// function transformations (jvp, vjp, vmap), the equivalence check and shape
// inference. Primitives are neither copyable nor movable.
class MLX_API Primitive {
 public:
  explicit Primitive(Stream stream) : stream_(stream) {}

  /** The device the primitive will run on. */
  const Device& device() {
    return stream().device;
  }

  /** The stream the primitive will run on. */
  const Stream& stream() {
    return stream_;
  }

  /**
   * A primitive must know how to evaluate itself on
   * the CPU/GPU for the given inputs and populate the output arrays.
   *
   * To avoid unnecessary allocations, the evaluation function
   * is responsible for allocating space for the array.
   */
  virtual void eval_cpu(
      const std::vector<array>& inputs,
      std::vector<array>& outputs) = 0;
  virtual void eval_gpu(
      const std::vector<array>& inputs,
      std::vector<array>& outputs) = 0;

  /**
   * The Jacobian-vector product.
   */
  virtual std::vector<array> jvp(
      const std::vector<array>& primals,
      const std::vector<array>& tangents,
      const std::vector<int>& argnums);

  /**
   * The vector-Jacobian product.
   */
  virtual std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs);

  /**
   * The primitive must know how to vectorize itself across
   * the given axes. The output is a pair containing the output arrays
   * representing the vectorized computation and the axes which
   * corresponds to the vectorized dimensions of each output.
   */
  virtual std::pair<std::vector<array>, std::vector<int>> vmap(
      const std::vector<array>& inputs,
      const std::vector<int>& axes);

  /** Get the name of primitive. */
  virtual const char* name() const = 0;

  /** Equivalence check defaults to false unless overridden by the primitive */
  virtual bool is_equivalent(const Primitive& other) const {
    return false;
  }

  /** Get the output shapes of the primitive. This is not required to be
   * implemented by derived classes, in which case it will throw. */
  virtual std::vector<Shape> output_shapes(const std::vector<array>& inputs);

  virtual ~Primitive() = default;
  // Copy and move are disabled for all primitives.
  Primitive(const Primitive& other) = delete;
  Primitive(Primitive&& other) = delete;
  Primitive& operator=(const Primitive& other) = delete;
  Primitive& operator=(Primitive&& other) = delete;

 private:
  // Every primitive stores the stream it should run in
  Stream stream_;
};

class MLX_API UnaryPrimitive : public Primitive {
  /**
   * An abstract base class for a primitive with a single output.
   */
 public:
  explicit UnaryPrimitive(Stream stream) : Primitive(stream) {}

  // Single-output evaluation hooks that derived classes must implement.
  virtual void eval_cpu(const std::vector<array>& inputs, array& output) = 0;
  virtual void eval_gpu(const std::vector<array>& inputs, array& output) = 0;

  // Adapters from the multi-output interface of Primitive: forward to the
  // single-output overloads using outputs[0].
  inline void eval_cpu(
      const std::vector<array>& inputs,
      std::vector<array>& outputs) override {
    eval_cpu(inputs, outputs[0]);
  }
  inline void eval_gpu(
      const std::vector<array>& inputs,
      std::vector<array>& outputs) override {
    eval_gpu(inputs, outputs[0]);
  }

  virtual ~UnaryPrimitive() = default;
  // Copy and move are disabled, as for Primitive.
  UnaryPrimitive(const UnaryPrimitive& other) = delete;
  UnaryPrimitive(UnaryPrimitive&& other) = delete;
  UnaryPrimitive& operator=(const UnaryPrimitive& other) = delete;
  UnaryPrimitive& operator=(UnaryPrimitive&& other) = delete;
};

// Supported quantization modes.
enum class QuantizationMode { Affine, Mxfp4, Mxfp8, Nvfp4 };

// Conversions between QuantizationMode and its string form (defined
// elsewhere). NOTE(review): `error_tag` is presumably a prefix used in the
// error raised for an unrecognized mode string; confirm in the definition.
std::string quantization_mode_to_string(QuantizationMode mode);
QuantizationMode string_to_quantization_mode(
    const std::string& mode,
    std::string_view error_tag = "");

// Single-output primitive named "Abs". It carries no state: is_equivalent
// always returns true and the output shape equals the first input's shape.
class Abs : public UnaryPrimitive {
 public:
  explicit Abs(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Abs)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive named "Add", exported with MLX_API. It carries no
// state: is_equivalent always returns true. The reported output shape is the
// shape of the first input.
class MLX_API Add : public UnaryPrimitive {
 public:
  explicit Add(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Add)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive named "AddMM", parameterized by two float
// coefficients fixed at construction.
// NOTE(review): alpha/beta presumably scale the matrix product and the added
// term respectively; confirm against the eval implementation.
class AddMM : public UnaryPrimitive {
 public:
  explicit AddMM(Stream stream, float alpha, float beta)
      : UnaryPrimitive(stream), alpha_(alpha), beta_(beta) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_GRADS()
  DEFINE_VMAP()
  DEFINE_NAME(AddMM)

  bool is_equivalent(const Primitive& other) const override;
  // Returns the primitive's parameters as {alpha, beta}.
  std::pair<float, float> state() const {
    return {alpha_, beta_};
  };

 private:
  const float alpha_;
  const float beta_;
};

// Single-output primitive named "Arange", parameterized by start, stop and
// step. It declares no vmap/jvp/vjp overrides, so those fall back to the
// Primitive base implementations.
class Arange : public UnaryPrimitive {
 public:
  explicit Arange(Stream stream, double start, double stop, double step)
      : UnaryPrimitive(stream), start_(start), stop_(stop), step_(step) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_NAME(Arange)
  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  // Returns the primitive's parameters as {start, stop, step}.
  std::tuple<double, double, double> state() const {
    return {start_, stop_, step_};
  };

 private:
  double start_;
  double stop_;
  double step_;
};

// Single-output primitive named "ArcCos". It carries no state: is_equivalent
// always returns true and the output shape equals the first input's shape.
class ArcCos : public UnaryPrimitive {
 public:
  explicit ArcCos(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(ArcCos)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive named "ArcCosh". It carries no state:
// is_equivalent always returns true and the output shape equals the first
// input's shape.
class ArcCosh : public UnaryPrimitive {
 public:
  explicit ArcCosh(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(ArcCosh)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive named "ArcSin". It carries no state: is_equivalent
// always returns true and the output shape equals the first input's shape.
class ArcSin : public UnaryPrimitive {
 public:
  explicit ArcSin(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(ArcSin)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive named "ArcSinh". It carries no state:
// is_equivalent always returns true and the output shape equals the first
// input's shape.
class ArcSinh : public UnaryPrimitive {
 public:
  explicit ArcSinh(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(ArcSinh)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive named "ArcTan". It carries no state: is_equivalent
// always returns true and the output shape equals the first input's shape.
class ArcTan : public UnaryPrimitive {
 public:
  explicit ArcTan(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(ArcTan)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive registered under the name "ArcTan2" (presumably the
// two-argument inverse tangent; the operands arrive through `inputs`).
// Carries no state beyond the stream handed to UnaryPrimitive.
class ArcTan2 : public UnaryPrimitive {
 public:
  explicit ArcTan2(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(ArcTan2)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "ArcTanh" (presumably the
// element-wise inverse hyperbolic tangent). Carries no state beyond the stream
// handed to UnaryPrimitive.
class ArcTanh : public UnaryPrimitive {
 public:
  explicit ArcTanh(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(ArcTanh)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive named "ArgPartition", parameterised by the
// partition index `kth` and the `axis` it applies to.
class ArgPartition : public UnaryPrimitive {
 public:
  explicit ArgPartition(Stream stream, int kth, int axis)
      : UnaryPrimitive(stream), kth_(kth), axis_(axis) {}

  DEFINE_NAME(ArgPartition)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the construction parameters as (kth, axis).
  auto state() const {
    return std::make_pair(kth_, axis_);
  }

 private:
  int kth_;
  int axis_;
};

// Single-output primitive named "ArgReduce". The reduction flavour (ArgMin or
// ArgMax) and the axis are fixed at construction.
class MLX_API ArgReduce : public UnaryPrimitive {
 public:
  // Selects which reduction this instance stands for.
  enum ReduceType {
    ArgMin,
    ArgMax,
  };

  explicit ArgReduce(Stream stream, ReduceType reduce_type, int axis)
      : UnaryPrimitive(stream), reduce_type_(reduce_type), axis_(axis) {}

  DEFINE_NAME(ArgReduce)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  bool is_equivalent(const Primitive& other) const override;

  // Returns the construction parameters as (reduce_type, axis).
  auto state() const {
    return std::make_pair(reduce_type_, axis_);
  }

 private:
  ReduceType reduce_type_;
  int axis_;
};

// Single-output primitive named "ArgSort", parameterised by the axis it
// operates along.
class ArgSort : public UnaryPrimitive {
 public:
  explicit ArgSort(Stream stream, int axis)
      : UnaryPrimitive(stream), axis_(axis) {}

  DEFINE_NAME(ArgSort)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the axis supplied at construction.
  auto state() const {
    return axis_;
  }

 private:
  int axis_;
};

class AsType : public UnaryPrimitive {
 public:
  explicit AsType(Stream stream, Dtype dtype)
      : UnaryPrimitive(stream), dtype_(dtype) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(AsType)
  DEFINE_INPUT_OUTPUT_SHAPE()
  bool is_equivalent(const Primitive& other) const override;
  Dtype state() const {
    return dtype_;
  };

 private:
  Dtype dtype_;
};

class AsStrided : public UnaryPrimitive {
 public:
  explicit AsStrided(Stream stream, Shape shape, Strides strides, size_t offset)
      : UnaryPrimitive(stream),
        shape_(std::move(shape)),
        strides_(std::move(strides)),
        offset_(offset) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_GRADS()
  DEFINE_NAME(AsStrided)
  bool is_equivalent(const Primitive& other) const override;
  auto state() const {
    return std::make_tuple(shape_, strides_, offset_);
  }

 private:
  Shape shape_;
  Strides strides_;
  size_t offset_;

  void eval(const std::vector<array>& inputs, array& out);
};

// Single-output primitive covering the bitwise operations listed in `Op`.
// The operation is chosen at construction and determines the reported name.
class BitwiseBinary : public UnaryPrimitive {
 public:
  enum Op { And, Or, Xor, LeftShift, RightShift };

  explicit BitwiseBinary(Stream stream, Op op)
      : UnaryPrimitive(stream), op_(op) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()

  // Maps the stored operation to its display name.
  const char* name() const override {
    switch (op_) {
      case BitwiseBinary::And:
        return "BitwiseAnd";
      case BitwiseBinary::Or:
        return "BitwiseOr";
      case BitwiseBinary::Xor:
        return "BitwiseXor";
      case BitwiseBinary::LeftShift:
        return "LeftShift";
      case BitwiseBinary::RightShift:
        return "RightShift";
    }
    // Only reached when op_ holds a value outside the enumerators above.
    return "<unknown BitwiseBinary>";
  }

  bool is_equivalent(const Primitive& other) const override;
  DEFINE_INPUT_OUTPUT_SHAPE()

  // Returns the operation supplied at construction.
  auto state() const {
    return op_;
  }

 private:
  Op op_;
};

// Single-output primitive registered under the name "BitwiseInvert".
// Stateless; declares vmap support but no gradient hooks.
class BitwiseInvert : public UnaryPrimitive {
 public:
  explicit BitwiseInvert(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(BitwiseInvert)
  DEFINE_VMAP()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive named "BlockMaskedMM", parameterised by a block
// size. Only the reverse-mode gradient (vjp) is declared here.
class BlockMaskedMM : public UnaryPrimitive {
 public:
  explicit BlockMaskedMM(Stream stream, int block_size)
      : UnaryPrimitive(stream), block_size_(block_size) {}

  DEFINE_NAME(BlockMaskedMM)

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the block size supplied at construction.
  int state() const {
    return block_size_;
  }

 private:
  int block_size_;
};

// Single-output primitive named "GatherMM". Two flags record whether the left
// and right operands were declared sorted; both default to false.
class GatherMM : public UnaryPrimitive {
 public:
  explicit GatherMM(
      Stream stream,
      bool left_sorted = false,
      bool right_sorted = false)
      : UnaryPrimitive(stream),
        left_sorted_(left_sorted),
        right_sorted_(right_sorted) {}

  DEFINE_NAME(GatherMM)

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the flags as (left_sorted, right_sorted).
  std::pair<bool, bool> state() const {
    return {left_sorted_, right_sorted_};
  }

 private:
  bool left_sorted_;
  bool right_sorted_;
};

// Single-output primitive registered under the name "SegmentedMM". Stateless;
// this declaration provides evaluation only (no vmap, grads or equivalence).
class SegmentedMM : public UnaryPrimitive {
 public:
  explicit SegmentedMM(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(SegmentedMM)

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive named "BroadcastAxes". Keeps a list of axes to
// ignore (empty by default), which is also passed to the static shape helper.
class BroadcastAxes : public UnaryPrimitive {
 public:
  explicit BroadcastAxes(Stream stream, std::vector<int> ignore_axes = {})
      : UnaryPrimitive(stream), ignore_axes_(std::move(ignore_axes)) {}

  DEFINE_NAME(BroadcastAxes)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  // Shape computation usable without an instance.
  static Shape output_shape(
      const std::vector<array>& inputs,
      const std::vector<int>& ignore_axes);
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns a copy of the ignored axes.
  std::vector<int> state() const {
    return ignore_axes_;
  }

 private:
  void eval(const std::vector<array>& inputs, array& out);

  std::vector<int> ignore_axes_;
};

class Broadcast : public UnaryPrimitive {
 public:
  explicit Broadcast(Stream stream, const Shape& shape)
      : UnaryPrimitive(stream), shape_(shape) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Broadcast)
  static Shape output_shape(const std::vector<array>& inputs);
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  bool is_equivalent(const Primitive& other) const override;
  Shape state() const {
    return shape_;
  };

 private:
  Shape shape_;

  void eval(const std::vector<array>& inputs, array& out);
};

// Single-output primitive registered under the name "Ceil" (presumably
// element-wise rounding up). Stateless beyond the stream.
class Ceil : public UnaryPrimitive {
 public:
  explicit Ceil(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Ceil)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Multi-output primitive that stands for a compiled sub-graph described by
// its inputs, outputs and tape.
class MLX_API Compiled : public Primitive {
 public:
  /*
   * Every array in inputs, outputs and tape is either a tracer or a constant.
   * - Inputs must not appear in the tape; outputs must.
   * - A multi-output primitive contributes exactly one array to the tape.
   * - constant_ids holds the ids of input arrays that may safely be treated
   *   as scalar constants.
   */
  explicit Compiled(
      Stream stream,
      std::vector<array> inputs,
      std::vector<array> outputs,
      std::vector<array> tape,
      std::unordered_set<uintptr_t> constant_ids);

  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  const char* name() const override;
  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  // Returns a copy of the kernel library name.
  std::string lib_name() const {
    return kernel_lib_;
  }

 private:
  const std::vector<array> inputs_;
  const std::vector<array> outputs_;
  const std::vector<array> tape_;
  const std::unordered_set<uintptr_t> constant_ids_;
  const std::function<bool(size_t)> is_constant_;

  // Mutable so that the const name() accessor can modify it (presumably to
  // build the name lazily — confirm in the implementation).
  mutable std::string name_;
  std::string kernel_lib_;
};

// Single-output primitive named "Concatenate", parameterised by the axis it
// operates along.
class Concatenate : public UnaryPrimitive {
 public:
  explicit Concatenate(Stream stream, int axis)
      : UnaryPrimitive(stream), axis_(axis) {}

  DEFINE_NAME(Concatenate)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  bool is_equivalent(const Primitive& other) const override;

  // Returns the axis supplied at construction.
  int state() const {
    return axis_;
  }

 private:
  int axis_;
};

// Single-output primitive registered under the name "Conjugate". Stateless;
// declares vmap support but no gradient hooks.
class Conjugate : public UnaryPrimitive {
 public:
  explicit Conjugate(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Conjugate)
  DEFINE_VMAP()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive named "Contiguous". The `allow_col_major` flag is
// stored at construction; no state() accessor is exposed for it.
class Contiguous : public UnaryPrimitive {
 public:
  explicit Contiguous(Stream stream, bool allow_col_major)
      : UnaryPrimitive(stream), allow_col_major_(allow_col_major) {}

  DEFINE_NAME(Contiguous)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  bool is_equivalent(const Primitive& other) const override;

 private:
  bool allow_col_major_;
};

// Single-output primitive named "Convolution". Holds per-dimension strides,
// low/high padding and kernel/input dilation, plus a group count and a flip
// flag. Only the reverse-mode gradient (vjp) is declared here.
class Convolution : public UnaryPrimitive {
 public:
  explicit Convolution(
      Stream stream,
      const std::vector<int>& kernel_strides,
      const std::vector<int>& padding_lo,
      const std::vector<int>& padding_hi,
      const std::vector<int>& kernel_dilation,
      const std::vector<int>& input_dilation,
      const int groups = 1,
      const bool flip = false)
      : UnaryPrimitive(stream),
        padding_lo_(padding_lo),
        padding_hi_(padding_hi),
        kernel_strides_(kernel_strides),
        kernel_dilation_(kernel_dilation),
        input_dilation_(input_dilation),
        groups_(groups),
        flip_(flip) {}

  DEFINE_NAME(Convolution)
  DEFINE_VMAP()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotangents,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  // Output shape helper usable without an instance.
  static Shape conv_out_shape(
      const Shape& in_shape,
      const Shape& wt_shape,
      const std::vector<int>& strides,
      const std::vector<int>& pads_lo,
      const std::vector<int>& pads_hi,
      const std::vector<int>& kernel_dilation,
      const std::vector<int>& input_dilation);
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the parameters in constructor order: (kernel_strides, padding_lo,
  // padding_hi, kernel_dilation, input_dilation, groups, flip).
  auto state() const {
    return std::make_tuple(
        kernel_strides_,
        padding_lo_,
        padding_hi_,
        kernel_dilation_,
        input_dilation_,
        groups_,
        flip_);
  }

 private:
  std::vector<int> padding_lo_;
  std::vector<int> padding_hi_;
  std::vector<int> kernel_strides_;
  std::vector<int> kernel_dilation_;
  std::vector<int> input_dilation_;
  int groups_;
  bool flip_;
};

// Single-output primitive registered under the name "Copy". Stateless beyond
// the stream; a private eval helper is declared alongside the backend entry
// points.
class Copy : public UnaryPrimitive {
 public:
  explicit Copy(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Copy)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

 private:
  void eval(const std::vector<array>& inputs, array& out);
};

// Single-output primitive registered under the name "Cos" (presumably the
// element-wise cosine). Stateless beyond the stream.
class Cos : public UnaryPrimitive {
 public:
  explicit Cos(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Cos)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "Cosh" (presumably the
// element-wise hyperbolic cosine). Stateless beyond the stream.
class Cosh : public UnaryPrimitive {
 public:
  explicit Cosh(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Cosh)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

class CustomTransforms : public Primitive {
 public:
  explicit CustomTransforms(
      Stream stream,
      int num_outputs,
      std::function<std::vector<array>(
          const std::vector<array>&,
          const std::vector<array>&,
          const std::vector<array>&)> vjp,
      std::function<std::vector<array>(
          const std::vector<array>&,
          const std::vector<array>&,
          const std::vector<int>&)> jvp,
      std::function<std::pair<std::vector<array>, std::vector<int>>(
          const std::vector<array>&,
          const std::vector<int>&)> vmap)
      : Primitive(stream),
        num_outputs_(num_outputs),
        vjp_fun_(std::move(vjp)),
        jvp_fun_(std::move(jvp)),
        vmap_fun_(std::move(vmap)) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_GRADS();
  DEFINE_VMAP();
  DEFINE_NAME(CustomTransforms);

 private:
  void eval(const std::vector<array>& inputs, std::vector<array>& outputs);

  int num_outputs_;

  std::function<std::vector<array>(
      const std::vector<array>&,
      const std::vector<array>&,
      const std::vector<array>&)>
      vjp_fun_;
  std::function<std::vector<array>(
      const std::vector<array>&,
      const std::vector<array>&,
      const std::vector<int>&)>
      jvp_fun_;
  std::function<std::pair<std::vector<array>, std::vector<int>>(
      const std::vector<array>&,
      const std::vector<int>&)>
      vmap_fun_;
};

class Depends : public Primitive {
 public:
  explicit Depends(Stream stream) : Primitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  std::vector<array> vjp(
      const std::vector<array>& primals,
      const std::vector<array>& cotan,
      const std::vector<int>& argnums,
      const std::vector<array>& outputs) override;

  DEFINE_NAME(Depends);

 private:
  void eval(const std::vector<array>& inputs, std::vector<array>& outputs);
};

// Single-output primitive registered under the name "Divide" (presumably
// element-wise division of its inputs). Stateless beyond the stream.
class Divide : public UnaryPrimitive {
 public:
  explicit Divide(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Divide)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Multi-output primitive registered under the name "DivMod". Stateless; it
// reports two outputs, each shaped like the first input.
class DivMod : public Primitive {
 public:
  explicit DivMod(Stream stream) : Primitive(stream) {}

  DEFINE_NAME(DivMod)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  // Both outputs take the shape of inputs[0].
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override {
    const auto& first_shape = inputs[0].shape();
    return std::vector{first_shape, first_shape};
  }
};

// Single-output primitive registered under the name "Select". Stateless
// beyond the stream; operands arrive through `inputs`.
class Select : public UnaryPrimitive {
 public:
  explicit Select(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Select)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "Remainder" (presumably
// the element-wise remainder of its inputs). Stateless beyond the stream.
class Remainder : public UnaryPrimitive {
 public:
  explicit Remainder(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Remainder)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

class Equal : public UnaryPrimitive {
 public:
  explicit Equal(Stream stream, bool equal_nan = false)
      : UnaryPrimitive(stream), equal_nan_(equal_nan) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()

  const char* name() const override {
    if (equal_nan_) {
      return "NaNEqual";
    } else {
      return "Equal";
    }
  }
  auto state() const {
    return equal_nan_;
  };

 private:
  bool equal_nan_;
};

// Single-output primitive registered under the name "Erf" (presumably the
// element-wise error function). Stateless beyond the stream.
class Erf : public UnaryPrimitive {
 public:
  explicit Erf(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Erf)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "ErfInv" (presumably the
// element-wise inverse error function). Stateless beyond the stream.
class ErfInv : public UnaryPrimitive {
 public:
  explicit ErfInv(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(ErfInv)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "Exp" (presumably the
// element-wise exponential). Stateless beyond the stream; exported via
// MLX_API.
class MLX_API Exp : public UnaryPrimitive {
 public:
  explicit Exp(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Exp)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

class Expm1 : public UnaryPrimitive {
 public:
  explicit Expm1(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Expm1)
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive named "ExpandDims" that stores the list of axes it
// was constructed with; the same list type feeds the static shape helper.
class ExpandDims : public UnaryPrimitive {
 public:
  explicit ExpandDims(Stream stream, std::vector<int> axes)
      : UnaryPrimitive(stream), axes_(std::move(axes)) {}

  DEFINE_NAME(ExpandDims)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  // Shape computation usable without an instance.
  static Shape output_shape(const array& input, const std::vector<int>& axes);
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns a copy of the stored axes.
  std::vector<int> state() const {
    return axes_;
  }

 private:
  void eval(const std::vector<array>& inputs, array& out);

  std::vector<int> axes_;
};

// Single-output primitive named "FFT". Configured by the axes to operate on
// and two flags, `inverse` and `real`.
class FFT : public UnaryPrimitive {
 public:
  explicit FFT(
      Stream stream,
      const std::vector<size_t>& axes,
      bool inverse,
      bool real)
      : UnaryPrimitive(stream), axes_(axes), inverse_(inverse), real_(real) {}

  DEFINE_NAME(FFT)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the configuration as (axes, inverse, real).
  std::tuple<std::vector<size_t>, bool, bool> state() const {
    return std::make_tuple(axes_, inverse_, real_);
  }

 private:
  std::vector<size_t> axes_;
  bool inverse_;
  bool real_;
};

// Single-output primitive named "Flatten", parameterised by a start axis and
// an end axis.
class Flatten : public UnaryPrimitive {
 public:
  explicit Flatten(Stream stream, int start_axis, int end_axis)
      : UnaryPrimitive(stream), start_axis_(start_axis), end_axis_(end_axis) {}

  DEFINE_NAME(Flatten)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  // Shape computation usable without an instance.
  static Shape output_shape(const array& input, int start_axis, int end_axis);
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the axis range as (start_axis, end_axis).
  std::pair<int, int> state() const {
    return {start_axis_, end_axis_};
  }

 private:
  void eval(const std::vector<array>& inputs, array& out);

  int start_axis_;
  int end_axis_;
};

// Single-output primitive registered under the name "Floor" (presumably
// element-wise rounding down). Stateless beyond the stream.
class Floor : public UnaryPrimitive {
 public:
  explicit Floor(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Floor)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "Full". Stateless beyond
// the stream; its output shape is tied to the input via
// DEFINE_INPUT_OUTPUT_SHAPE.
class Full : public UnaryPrimitive {
 public:
  explicit Full(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Full)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive named "Gather". Stores the axes and the slice sizes
// it was constructed with.
class Gather : public UnaryPrimitive {
 public:
  explicit Gather(Stream stream, std::vector<int> axes, Shape slice_sizes)
      : UnaryPrimitive(stream),
        axes_(std::move(axes)),
        slice_sizes_(std::move(slice_sizes)) {}

  DEFINE_NAME(Gather)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  bool is_equivalent(const Primitive& other) const override;

  // Returns copies of the parameters as (axes, slice_sizes).
  auto state() const {
    return std::make_pair(axes_, slice_sizes_);
  }

 private:
  std::vector<int> axes_;
  Shape slice_sizes_;
};

// Single-output primitive named "GatherAxis", parameterised by the axis it
// operates along.
class GatherAxis : public UnaryPrimitive {
 public:
  explicit GatherAxis(Stream stream, int axis)
      : UnaryPrimitive(stream), axis_(axis) {}

  DEFINE_NAME(GatherAxis)
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  bool is_equivalent(const Primitive& other) const override;

  // Returns the axis supplied at construction.
  int state() const {
    return axis_;
  }

 private:
  int axis_;
};

// Single-output primitive registered under the name "Greater" (presumably an
// element-wise `>` comparison). Stateless beyond the stream.
class Greater : public UnaryPrimitive {
 public:
  explicit Greater(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Greater)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "GreaterEqual"
// (presumably an element-wise `>=` comparison). Stateless beyond the stream.
class GreaterEqual : public UnaryPrimitive {
 public:
  explicit GreaterEqual(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(GreaterEqual)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive named "Hadamard", parameterised by a float scale.
class Hadamard : public UnaryPrimitive {
 public:
  explicit Hadamard(Stream stream, float scale)
      : UnaryPrimitive(stream), scale_(scale) {}

  DEFINE_NAME(Hadamard)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  bool is_equivalent(const Primitive& other) const override;

  // Returns the scale supplied at construction.
  float state() const {
    return scale_;
  }

 private:
  float scale_;
};

// Single-output primitive registered under the name "Imag" (presumably the
// imaginary part of its input). Stateless beyond the stream.
class Imag : public UnaryPrimitive {
 public:
  explicit Imag(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Imag)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "Less" (presumably an
// element-wise `<` comparison). Stateless beyond the stream.
class Less : public UnaryPrimitive {
 public:
  explicit Less(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Less)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "LessEqual" (presumably
// an element-wise `<=` comparison). Stateless beyond the stream.
class LessEqual : public UnaryPrimitive {
 public:
  explicit LessEqual(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(LessEqual)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive named "Load". Holds a shared io::Reader, an offset
// and an endianness-swap flag (false by default). No vmap, grads or
// equivalence are declared.
class Load : public UnaryPrimitive {
 public:
  explicit Load(
      Stream stream,
      std::shared_ptr<io::Reader> reader,
      size_t offset,
      bool swap_endianness = false)
      : UnaryPrimitive(stream),
        reader_(std::move(reader)),
        offset_(offset),
        swap_endianness_(swap_endianness) {}

  DEFINE_NAME(Load)

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

 private:
  std::shared_ptr<io::Reader> reader_;
  size_t offset_;
  bool swap_endianness_;
};

class Log : public UnaryPrimitive {
 public:
  enum Base { two, ten, e };

  explicit Log(Stream stream, Base base)
      : UnaryPrimitive(stream), base_(base) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()

  Base state() const {
    return base_;
  };

  const char* name() const override {
    switch (base_) {
      case e:
        return "Log";
      case two:
        return "Log2";
      case ten:
        return "Log10";
    }
    return "<unknwon Log>";
  }

 private:
  Base base_;
};

class Log1p : public UnaryPrimitive {
 public:
  explicit Log1p(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Log1p)
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Single-output primitive registered under the name "LogicalNot" (presumably
// element-wise logical negation). Stateless beyond the stream.
class LogicalNot : public UnaryPrimitive {
 public:
  explicit LogicalNot(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(LogicalNot)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "LogicalAnd" (presumably
// element-wise logical conjunction). Stateless beyond the stream.
class LogicalAnd : public UnaryPrimitive {
 public:
  explicit LogicalAnd(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(LogicalAnd)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "LogicalOr" (presumably
// element-wise logical disjunction). Stateless beyond the stream.
class LogicalOr : public UnaryPrimitive {
 public:
  explicit LogicalOr(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(LogicalOr)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "LogAddExp" (presumably
// log(exp(a) + exp(b)) element-wise). Stateless beyond the stream.
class LogAddExp : public UnaryPrimitive {
 public:
  explicit LogAddExp(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(LogAddExp)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Single-output primitive registered under the name "LogSumExp". Stateless
// beyond the stream; its output shape is computed by a dedicated
// output_shapes override rather than copied from the input.
class LogSumExp : public UnaryPrimitive {
 public:
  explicit LogSumExp(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(LogSumExp)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
};

// Single-output primitive registered under the name "Matmul". Stateless
// beyond the stream; its output shape is computed by a dedicated
// output_shapes override.
class Matmul : public UnaryPrimitive {
 public:
  explicit Matmul(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Matmul)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
};

// Single-output primitive registered under the name "Maximum" (presumably the
// element-wise maximum of its inputs). Stateless beyond the stream.
class Maximum : public UnaryPrimitive {
 public:
  explicit Maximum(Stream stream) : UnaryPrimitive(stream) {}

  DEFINE_NAME(Maximum)
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  DEFINE_DEFAULT_IS_EQUIVALENT()

  // Backend-specific evaluation entry points.
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  void eval_cpu(const std::vector<array>& inputs, array& out) override;
};

// Primitive for the Minimum operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives the operands as a vector of input arrays and writes one
// output array; the CPU and GPU implementations are defined out of line.
class Minimum : public UnaryPrimitive {
 public:
  explicit Minimum(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Minimum)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Multiply operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives the operands as a vector of input arrays and writes one
// output array; the CPU and GPU implementations are defined out of line.
class Multiply : public UnaryPrimitive {
 public:
  explicit Multiply(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Multiply)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Negative operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Negative : public UnaryPrimitive {
 public:
  explicit Negative(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Negative)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the NotEqual operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives the operands as a vector of input arrays and writes one
// output array; the CPU and GPU implementations are defined out of line.
class NotEqual : public UnaryPrimitive {
 public:
  explicit NotEqual(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(NotEqual)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the NumberOfElements operation.
//
// Configured with a list of axes, an `inverted` flag and a dtype, all of
// which are stored verbatim and exposed through state().
// NOTE(review): presumably counts the elements along `axes` (or its
// reciprocal when `inverted`) and emits it as `dtype` -- confirm against the
// out-of-line eval().
class NumberOfElements : public UnaryPrimitive {
 public:
  // `axes` is taken by value and moved into the member.
  explicit NumberOfElements(
      Stream stream,
      std::vector<int> axes,
      bool inverted,
      Dtype dtype)
      : UnaryPrimitive(stream),
        axes_(std::move(axes)),
        inverted_(inverted),
        dtype_(dtype) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_NAME(NumberOfElements)
  bool is_equivalent(const Primitive& other) const override;
  // Always reports exactly one output with an empty (rank-0) shape; the
  // inputs are not inspected.
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override {
    return {{}};
  }
  // Returns copies of (axes, inverted, dtype).
  std::tuple<std::vector<int>, bool, Dtype> state() const {
    return {axes_, inverted_, dtype_};
  }

 private:
  std::vector<int> axes_;
  bool inverted_;
  Dtype dtype_;

  // Private evaluation helper; defined out of line.
  void eval(const std::vector<array>& inputs, array& out);
};

// Primitive for the Pad operation.
//
// Stores copies of the axes to pad together with the low and high pad sizes
// and exposes them through state().
// NOTE(review): presumably low_pad_size/high_pad_size hold one entry per
// entry of `axes` -- confirm against the caller.
class Pad : public UnaryPrimitive {
 public:
  explicit Pad(
      Stream stream,
      const std::vector<int>& axes,
      const Shape& low_pad_size,
      const Shape& high_pad_size)
      : UnaryPrimitive(stream),
        axes_(axes),
        low_pad_size_(low_pad_size),
        high_pad_size_(high_pad_size) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Pad)
  bool is_equivalent(const Primitive& other) const override;
  // Returns a tuple of copies: (axes, low_pad_size, high_pad_size).
  auto state() const {
    return std::make_tuple(axes_, low_pad_size_, high_pad_size_);
  }

 private:
  std::vector<int> axes_;
  Shape low_pad_size_;
  Shape high_pad_size_;
};

// Primitive for the Partition operation.
//
// Remembers the `kth` index and the `axis` it was constructed with and
// reports them through state(). The evaluation routines and is_equivalent()
// are defined out of line.
class Partition : public UnaryPrimitive {
 public:
  explicit Partition(Stream stream, int kth, int axis)
      : UnaryPrimitive(stream), kth_(kth), axis_(axis) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Partition)
  DEFINE_INPUT_OUTPUT_SHAPE()
  bool is_equivalent(const Primitive& other) const override;

  // Returns the pair (kth, axis).
  auto state() const {
    return std::make_pair(kth_, axis_);
  }

 private:
  int kth_;
  int axis_;
};

// Primitive for the Power operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives the operands as a vector of input arrays and writes one
// output array; the CPU and GPU implementations are defined out of line.
class Power : public UnaryPrimitive {
 public:
  explicit Power(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Power)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the QuantizedMatmul operation.
//
// Stores the quantization parameters it is constructed with (group size, bit
// width, quantization mode) plus a `transpose` flag, and exposes all four
// through state().
// NOTE(review): which operand `transpose` refers to is not visible here --
// confirm against the op that builds this primitive.
class QuantizedMatmul : public UnaryPrimitive {
 public:
  explicit QuantizedMatmul(
      Stream stream,
      int group_size,
      int bits,
      QuantizationMode mode,
      bool transpose)
      : UnaryPrimitive(stream),
        group_size_(group_size),
        bits_(bits),
        mode_(mode),
        transpose_(transpose) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(QuantizedMatmul)
  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  // Returns the tuple (group_size, bits, mode, transpose).
  auto state() const {
    return std::make_tuple(group_size_, bits_, mode_, transpose_);
  }

 private:
  int group_size_;
  int bits_;
  QuantizationMode mode_;
  bool transpose_;
};

// Primitive for the QQMatmul operation.
//
// Stores the group size, bit width and quantization mode it is constructed
// with and exposes them through state(). Unlike QuantizedMatmul it takes no
// transpose flag, and its DEFINE_VMAP() line is commented out, so no vmap
// declaration is emitted for this class here.
class QQMatmul : public UnaryPrimitive {
 public:
  explicit QQMatmul(
      Stream stream,
      int group_size,
      int bits,
      QuantizationMode mode)
      : UnaryPrimitive(stream),
        group_size_(group_size),
        bits_(bits),
        mode_(mode) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(QQMatmul)
  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  // Returns the tuple (group_size, bits, mode).
  auto state() const {
    return std::make_tuple(group_size_, bits_, mode_);
  }

 private:
  int group_size_;
  int bits_;
  QuantizationMode mode_;
};

// Primitive for the GatherQMM operation.
//
// Stores the quantization parameters (group size, bit width, mode), a
// `transpose` flag and two sortedness flags that both default to false. All
// six values are exposed through state().
// NOTE(review): presumably left_sorted/right_sorted tell the kernels that the
// corresponding gather indices are already sorted -- confirm against the
// out-of-line eval implementations.
class GatherQMM : public UnaryPrimitive {
 public:
  explicit GatherQMM(
      Stream stream,
      int group_size,
      int bits,
      QuantizationMode mode,
      bool transpose,
      bool left_sorted = false,
      bool right_sorted = false)
      : UnaryPrimitive(stream),
        group_size_(group_size),
        bits_(bits),
        mode_(mode),
        transpose_(transpose),
        left_sorted_(left_sorted),
        right_sorted_(right_sorted) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(GatherQMM)
  bool is_equivalent(const Primitive& other) const override;
  // Returns the tuple
  // (group_size, bits, mode, transpose, left_sorted, right_sorted).
  auto state() const {
    return std::make_tuple(
        group_size_, bits_, mode_, transpose_, left_sorted_, right_sorted_);
  }

 private:
  int group_size_;
  int bits_;
  QuantizationMode mode_;
  bool transpose_;
  bool left_sorted_;
  bool right_sorted_;
};

// Primitive for the RandomBits operation.
//
// Keeps a copy of the requested output shape and the `width` it was
// constructed with; both are reported by state(). The evaluation routines and
// is_equivalent() are defined out of line.
class RandomBits : public UnaryPrimitive {
 public:
  explicit RandomBits(Stream stream, const Shape& shape, int width)
      : UnaryPrimitive(stream), shape_(shape), width_(width) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_NAME(RandomBits)
  bool is_equivalent(const Primitive& other) const override;

  // Returns copies of (shape, width).
  std::pair<Shape, int> state() const {
    return {shape_, width_};
  }

 private:
  Shape shape_;
  int width_;
};

// Primitive for the Real operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Real : public UnaryPrimitive {
 public:
  explicit Real(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Real)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Reshape operation.
//
// Keeps a copy of the target shape and returns it from state(). The static
// output_shape() helper and the output_shapes() override are declared here
// and defined out of line.
class Reshape : public UnaryPrimitive {
 public:
  explicit Reshape(Stream stream, const Shape& shape)
      : UnaryPrimitive(stream), shape_(shape) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Reshape)
  bool is_equivalent(const Primitive& other) const override;

  // Returns a copy of the stored target shape.
  Shape state() const {
    return shape_;
  }

  // Static helper taking the input array and a requested shape by value.
  static Shape output_shape(const array& input, Shape shape);
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

 private:
  Shape shape_;
};

// Primitive for reductions.
//
// A single class covers every reduction kind listed in ReduceType; the kind
// and the axes are fixed at construction and exposed through state(). The
// name reported by name() depends on the reduction kind.
class MLX_API Reduce : public UnaryPrimitive {
 public:
  enum ReduceType { And, Or, Sum, Prod, Min, Max };

  explicit Reduce(
      Stream stream,
      ReduceType reduce_type,
      const std::vector<int>& axes)
      : UnaryPrimitive(stream), reduce_type_(reduce_type), axes_(axes) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  // Maps the reduction kind to its display name. The switch intentionally has
  // no default label so the fallback below is only reached when reduce_type_
  // holds a value outside the enumerators.
  const char* name() const override {
    switch (reduce_type_) {
      case And:
        return "And";
      case Or:
        return "Or";
      case Sum:
        return "Sum";
      case Prod:
        return "Prod";
      case Min:
        return "Min";
      case Max:
        return "Max";
    }
    return "<unknown Reduce>";
  }

  bool is_equivalent(const Primitive& other) const override;

  // Returns copies of (reduce_type, axes).
  std::pair<ReduceType, std::vector<int>> state() const {
    return {reduce_type_, axes_};
  }

 private:
  ReduceType reduce_type_;
  std::vector<int> axes_;
};

// Primitive for the Round operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Round : public UnaryPrimitive {
 public:
  explicit Round(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Round)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

class Scan : public UnaryPrimitive {
 public:
  enum ReduceType { Max, Min, Sum, Prod, LogAddExp };

  explicit Scan(
      Stream stream,
      ReduceType reduce_type,
      int axis,
      bool reverse,
      bool inclusive)
      : UnaryPrimitive(stream),
        reduce_type_(reduce_type),
        axis_(axis),
        reverse_(reverse),
        inclusive_(inclusive) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS();

  const char* name() const override {
    switch (reduce_type_) {
      case Sum:
        return "CumSum";
      case Prod:
        return "CumProd";
      case Min:
        return "CumMin";
      case Max:
        return "CumMax";
      case LogAddExp:
        return "CumLogAddExp";
    }
    return "<unknwon Scan>";
  }

  bool is_equivalent(const Primitive& other) const override;
  auto state() const {
    return std::make_tuple(reduce_type_, axis_, reverse_, inclusive_);
  }

 private:
  ReduceType reduce_type_;
  int axis_;
  bool reverse_;
  bool inclusive_;
};

class Scatter : public UnaryPrimitive {
 public:
  enum ReduceType { Max, Min, Sum, Prod, None };

  explicit Scatter(
      Stream stream,
      ReduceType reduce_type,
      const std::vector<int>& axes)
      : UnaryPrimitive(stream), reduce_type_(reduce_type), axes_(axes) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP();
  DEFINE_GRADS();

  const char* name() const override {
    switch (reduce_type_) {
      case Sum:
        return "Scatter Sum";
      case Prod:
        return "Scatter Prod";
      case Min:
        return "Scatter Min";
      case Max:
        return "Scatter Max";
      case None:
        return "Scatter";
    }
    return "<unknwon Scatter>";
  }

  bool is_equivalent(const Primitive& other) const override;
  std::pair<ReduceType, std::vector<int>> state() const {
    return {reduce_type_, axes_};
  };

 private:
  ReduceType reduce_type_;
  std::vector<int> axes_;
};

// Primitive for scatter along a single axis.
//
// Covers the two kinds listed in ReduceType (`None` yields the plain
// "ScatterAxis" name). The kind and the axis are fixed at construction and
// exposed through state().
class ScatterAxis : public UnaryPrimitive {
 public:
  enum ReduceType { Sum, None };

  explicit ScatterAxis(Stream stream, ReduceType reduce_type, int axis)
      : UnaryPrimitive(stream), reduce_type_(reduce_type), axis_(axis) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()

  // Maps the scatter kind to its display name. The switch intentionally has
  // no default label so the fallback below is only reached when reduce_type_
  // holds a value outside the enumerators.
  const char* name() const override {
    switch (reduce_type_) {
      case Sum:
        return "ScatterAxis Sum";
      case None:
        return "ScatterAxis";
    }
    return "<unknown ScatterAxis>";
  }

  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  // Returns the pair (reduce_type, axis).
  std::pair<ReduceType, int> state() const {
    return {reduce_type_, axis_};
  }

 private:
  ReduceType reduce_type_;
  int axis_;
};

class MaskedScatter : public UnaryPrimitive {
 public:
  explicit MaskedScatter(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP();
  DEFINE_GRADS();
  DEFINE_NAME(MaskedScatter);
  DEFINE_DEFAULT_IS_EQUIVALENT();
  DEFINE_INPUT_OUTPUT_SHAPE();
};

// Primitive for the Sigmoid operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Sigmoid : public UnaryPrimitive {
 public:
  explicit Sigmoid(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Sigmoid)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Sign operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Sign : public UnaryPrimitive {
 public:
  explicit Sign(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Sign)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Sin operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Sin : public UnaryPrimitive {
 public:
  explicit Sin(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Sin)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Sinh operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Sinh : public UnaryPrimitive {
 public:
  explicit Sinh(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Sinh)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Slice operation.
//
// Stores copies of the start indices, end indices and strides it is
// constructed with and exposes them through state().
// NOTE(review): presumably each of the three holds one entry per input
// dimension -- confirm against the op that builds this primitive.
class Slice : public UnaryPrimitive {
 public:
  explicit Slice(
      Stream stream,
      const Shape& start_indices,
      const Shape& end_indices,
      const Shape& strides)
      : UnaryPrimitive(stream),
        start_indices_(start_indices),
        end_indices_(end_indices),
        strides_(strides) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Slice)
  bool is_equivalent(const Primitive& other) const override;
  // Returns a tuple of copies: (start_indices, end_indices, strides).
  auto state() const {
    return std::make_tuple(start_indices_, end_indices_, strides_);
  }

 private:
  Shape start_indices_;
  Shape end_indices_;
  Shape strides_;
};

// Primitive for the SliceUpdate operation.
//
// One class covers every update kind listed in ReduceType (`None` yields the
// plain "SliceUpdate" name). The kind plus the start indices, end indices and
// strides are fixed at construction and exposed through state().
class SliceUpdate : public UnaryPrimitive {
 public:
  enum ReduceType { Max, Min, Sum, Prod, None };

  explicit SliceUpdate(
      Stream stream,
      ReduceType reduce_type,
      const Shape& start_indices,
      const Shape& end_indices,
      const Shape& strides)
      : UnaryPrimitive(stream),
        reduce_type_(reduce_type),
        start_indices_(start_indices),
        end_indices_(end_indices),
        strides_(strides) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()

  // Maps the update kind to its display name. The switch has no default
  // label, so the fallback below is only reached when reduce_type_ holds a
  // value outside the enumerators.
  const char* name() const override {
    switch (reduce_type_) {
      case Sum:
        return "SliceUpdate Sum";
      case Prod:
        return "SliceUpdate Prod";
      case Min:
        return "SliceUpdate Min";
      case Max:
        return "SliceUpdate Max";
      case None:
        return "SliceUpdate";
    }
    return "<unknown SliceUpdate>";
  }

  bool is_equivalent(const Primitive& other) const override;
  DEFINE_INPUT_OUTPUT_SHAPE()
  // Returns the tuple (reduce_type, start_indices, end_indices, strides).
  auto state() const {
    return std::make_tuple(
        reduce_type_, start_indices_, end_indices_, strides_);
  }

 private:
  ReduceType reduce_type_;
  Shape start_indices_;
  Shape end_indices_;
  Shape strides_;
};

// Primitive for the DynamicSlice operation.
//
// Stores the axes and the slice size it is constructed with (both taken by
// value and moved into the members) and exposes them through state().
// NOTE(review): presumably the slice start is supplied at run time as one of
// the input arrays, which is what distinguishes this from Slice -- confirm
// against the out-of-line eval implementations.
class DynamicSlice : public UnaryPrimitive {
 public:
  explicit DynamicSlice(Stream stream, std::vector<int> axes, Shape slice_size)
      : UnaryPrimitive(stream),
        axes_(std::move(axes)),
        slice_size_(std::move(slice_size)) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(DynamicSlice)
  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  // Returns a pair of copies: (axes, slice_size).
  auto state() const {
    return std::make_pair(axes_, slice_size_);
  }

 private:
  std::vector<int> axes_;
  Shape slice_size_;
};

// Primitive for the DynamicSliceUpdate operation.
//
// Stores the axes it is constructed with (taken by value and moved into the
// member) and returns a copy of them from state().
class DynamicSliceUpdate : public UnaryPrimitive {
 public:
  explicit DynamicSliceUpdate(Stream stream, std::vector<int> axes)
      : UnaryPrimitive(stream), axes_(std::move(axes)) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(DynamicSliceUpdate)
  bool is_equivalent(const Primitive& other) const override;
  DEFINE_INPUT_OUTPUT_SHAPE()
  // `auto` deduces std::vector<int>, so this returns a copy of the axes.
  auto state() const {
    return axes_;
  }

 private:
  std::vector<int> axes_;
};

// Primitive for the Softmax operation.
//
// Remembers the `precise` flag it was constructed with and reports it through
// state(). The evaluation routines and is_equivalent() are defined out of
// line.
class Softmax : public UnaryPrimitive {
 public:
  explicit Softmax(Stream stream, bool precise)
      : UnaryPrimitive(stream), precise_(precise) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Softmax)
  DEFINE_INPUT_OUTPUT_SHAPE()

  bool is_equivalent(const Primitive& other) const override;

  // Returns the stored `precise` flag.
  auto state() const {
    return precise_;
  }

 private:
  bool precise_;
};

// Primitive for the Sort operation.
//
// Remembers the axis it was constructed with and reports it through state().
// The evaluation routines and is_equivalent() are defined out of line.
class Sort : public UnaryPrimitive {
 public:
  explicit Sort(Stream stream, int axis)
      : UnaryPrimitive(stream), axis_(axis) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Sort)
  DEFINE_INPUT_OUTPUT_SHAPE()
  bool is_equivalent(const Primitive& other) const override;
  // Returns the stored axis.
  auto state() const {
    return axis_;
  }

 private:
  int axis_;
};

// Primitive for the Split operation.
//
// Derives from Primitive rather than UnaryPrimitive: its evaluation routines
// write into a vector of output arrays. It keeps a copy of the split indices
// and the axis, both of which are reported by state().
class Split : public Primitive {
 public:
  explicit Split(Stream stream, const Shape& indices, int axis)
      : Primitive(stream), indices_(indices), axis_(axis) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Split)
  bool is_equivalent(const Primitive& other) const override;

  // Returns copies of (indices, axis).
  std::pair<Shape, int> state() const {
    return {indices_, axis_};
  }

 private:
  // Private evaluation helper; defined out of line.
  void eval(const std::vector<array>& inputs, std::vector<array>& outputs);

  Shape indices_;
  int axis_;
};

// Primitive for the Square operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Square : public UnaryPrimitive {
 public:
  explicit Square(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Square)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive shared by the "Sqrt" and "Rsqrt" operations.
//
// The `recip` flag (false by default) selects which of the two names is
// reported by name() and is exposed through state(). The evaluation routines
// and is_equivalent() are defined out of line.
class Sqrt : public UnaryPrimitive {
 public:
  explicit Sqrt(Stream stream, bool recip = false)
      : UnaryPrimitive(stream), recip_(recip) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_INPUT_OUTPUT_SHAPE()
  bool is_equivalent(const Primitive& other) const override;

  // Returns the stored `recip` flag.
  auto state() const {
    return recip_;
  }

  // "Rsqrt" when constructed with recip == true, "Sqrt" otherwise.
  const char* name() const override {
    return recip_ ? "Rsqrt" : "Sqrt";
  }

 private:
  bool recip_;
};

// Primitive for the StopGradient operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Unlike most neighbouring primitives it does not use DEFINE_GRADS(). A
// private eval() helper is declared alongside the CPU and GPU entry points;
// all three are defined out of line.
class StopGradient : public UnaryPrimitive {
 public:
  explicit StopGradient(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_NAME(StopGradient)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()

 private:
  // Private evaluation helper; defined out of line.
  void eval(const std::vector<array>& inputs, array& out);
};

// Primitive for the Subtract operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives the operands as a vector of input arrays and writes one
// output array; the CPU and GPU implementations are defined out of line.
class Subtract : public UnaryPrimitive {
 public:
  explicit Subtract(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Subtract)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Squeeze operation.
//
// Stores the axes it is constructed with (taken by value and moved into the
// member) and returns a copy of them from state(). The static output_shape()
// helper, output_shapes() and is_equivalent() are defined out of line.
class Squeeze : public UnaryPrimitive {
 public:
  explicit Squeeze(Stream stream, std::vector<int> axes)
      : UnaryPrimitive(stream), axes_(std::move(axes)) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Squeeze)

  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  bool is_equivalent(const Primitive& other) const override;

  // Static helper taking the input array and the axes.
  static Shape output_shape(const array& input, const std::vector<int>& axes);

  // `auto` deduces std::vector<int>, so this returns a copy of the axes.
  auto state() const {
    return axes_;
  }

 private:
  // Private evaluation helper; defined out of line.
  void eval(const std::vector<array>& inputs, array& out);
  std::vector<int> axes_;
};

// Primitive for the Tan operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Tan : public UnaryPrimitive {
 public:
  explicit Tan(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Tan)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Tanh operation.
//
// Holds no state of its own beyond the stream forwarded to UnaryPrimitive.
// Evaluation receives its operand(s) as a vector of input arrays and writes
// one output array; the CPU and GPU implementations are defined out of line.
class Tanh : public UnaryPrimitive {
 public:
  explicit Tanh(Stream stream) : UnaryPrimitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Tanh)
  DEFINE_DEFAULT_IS_EQUIVALENT()
  DEFINE_INPUT_OUTPUT_SHAPE()
};

// Primitive for the Unflatten operation.
//
// Stores an axis and a shape (the shape is taken by value and moved into the
// member) and exposes both through state().
// NOTE(review): presumably `axis` is the dimension that gets expanded into
// `shape` -- confirm against the out-of-line output_shape().
class Unflatten : public UnaryPrimitive {
 public:
  explicit Unflatten(Stream stream, int axis, Shape shape)
      : UnaryPrimitive(stream), axis_(axis), shape_(std::move(shape)) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Unflatten)
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;
  bool is_equivalent(const Primitive& other) const override;

  // Static helper taking the input array, an axis and a shape; defined out of
  // line.
  static Shape output_shape(const array& input, int axis, const Shape& shape);
  // Returns the pair (axis, copy of shape).
  auto state() const {
    return std::make_pair(axis_, shape_);
  }

 private:
  int axis_;
  Shape shape_;
  // Private evaluation helper; defined out of line.
  void eval(const std::vector<array>& inputs, array& out);
};

// Primitive for the View operation.
//
// Stores the dtype it is constructed with and reports it through state().
// name() is declared here but defined out of line rather than generated by
// DEFINE_NAME.
class View : public UnaryPrimitive {
 public:
  explicit View(Stream stream, Dtype dtype)
      : UnaryPrimitive(stream), dtype_(dtype) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  DEFINE_VMAP()
  const char* name() const override;
  bool is_equivalent(const Primitive& other) const override;
  // Returns the stored dtype.
  auto state() const {
    return dtype_;
  }

 private:
  Dtype dtype_;
  // Mutable so that it can be written from const member functions.
  // NOTE(review): presumably a cache backing the pointer returned by name()
  // -- confirm in the out-of-line definition.
  mutable std::string name_;
};

// Primitive for the Transpose operation.
//
// Keeps a copy of the axes it is constructed with and returns a copy of them
// from state(). output_shapes(), is_equivalent() and the evaluation routines
// are defined out of line.
class Transpose : public UnaryPrimitive {
 public:
  explicit Transpose(Stream stream, const std::vector<int>& axes)
      : UnaryPrimitive(stream), axes_(axes) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;

  // Boilerplate declarations supplied by the DEFINE_* helper macros.
  DEFINE_VMAP()
  DEFINE_GRADS()
  DEFINE_NAME(Transpose)
  bool is_equivalent(const Primitive& other) const override;
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  // Returns a copy of the stored axes.
  std::vector<int> state() const {
    return axes_;
  }

 private:
  std::vector<int> axes_;

  // Private evaluation helper; defined out of line.
  void eval(const std::vector<array>& inputs, array& out);
};

/* QR Factorization primitive.
 *
 * Derives from Primitive rather than UnaryPrimitive: its evaluation routines
 * write into a vector of output arrays. It holds no state beyond the stream
 * and only uses DEFINE_NAME (no vmap or gradient macros). */
class QRF : public Primitive {
 public:
  explicit QRF(Stream stream) : Primitive(stream) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_NAME(QRF)
};

/* SVD primitive.
 *
 * Derives from Primitive rather than UnaryPrimitive: its evaluation routines
 * write into a vector of output arrays. The `compute_uv` flag is stored at
 * construction and reported by state().
 * NOTE(review): presumably compute_uv selects whether the U and V factors are
 * produced in addition to the singular values -- confirm in the eval code. */
class SVD : public Primitive {
 public:
  explicit SVD(Stream stream, bool compute_uv)
      : Primitive(stream), compute_uv_(compute_uv) {}

  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_VMAP()
  DEFINE_NAME(SVD)
  // Returns the stored compute_uv flag.
  auto state() const {
    return compute_uv_;
  }

 private:
  bool compute_uv_;
};

/* Matrix inversion primitive.
 *
 * Stores the `tri` and `upper` flags it is constructed with and reports them
 * through state().
 * NOTE(review): presumably `tri` marks the input as triangular and `upper`
 * selects the upper triangle -- confirm in the eval code. */
class Inverse : public UnaryPrimitive {
 public:
  explicit Inverse(Stream stream, bool tri, bool upper)
      : UnaryPrimitive(stream), tri_(tri), upper_(upper) {}

  void eval_cpu(const std::vector<array>& inputs, array& output) override;
  void eval_gpu(const std::vector<array>& inputs, array& output) override;

  DEFINE_VMAP()
  DEFINE_NAME(Inverse)
  // Returns the pair (tri, upper).
  auto state() const {
    return std::make_pair(tri_, upper_);
  }

 private:
  bool tri_;
  bool upper_;
};

// Primitive for the Cholesky operation.
//
// Stores the `upper` flag it is constructed with and reports it through
// state().
// NOTE(review): presumably `upper` selects the upper- instead of the
// lower-triangular factor -- confirm in the eval code.
class Cholesky : public UnaryPrimitive {
 public:
  explicit Cholesky(Stream stream, bool upper)
      : UnaryPrimitive(stream), upper_(upper) {}

  void eval_cpu(const std::vector<array>& inputs, array& out) override;
  void eval_gpu(const std::vector<array>& inputs, array& out) override;
  // Returns the stored `upper` flag.
  auto state() const {
    return upper_;
  }

  DEFINE_VMAP()
  DEFINE_NAME(Cholesky)

 private:
  bool upper_;
};

// Primitive for the Eig operation.
//
// Derives from Primitive rather than UnaryPrimitive: its evaluation routines
// write into a vector of output arrays. The `compute_eigenvectors` flag is
// stored at construction and reported by state().
class Eig : public Primitive {
 public:
  explicit Eig(Stream stream, bool compute_eigenvectors)
      : Primitive(stream), compute_eigenvectors_(compute_eigenvectors) {}
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_VMAP()
  DEFINE_NAME(Eig)

  // Output shapes computed from the inputs; defined out of line.
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  bool is_equivalent(const Primitive& other) const override;
  // Returns the stored compute_eigenvectors flag.
  auto state() const {
    return compute_eigenvectors_;
  }

 private:
  bool compute_eigenvectors_;
};

// Primitive for the Eigh operation.
//
// Derives from Primitive rather than UnaryPrimitive: its evaluation routines
// write into a vector of output arrays. It stores a `uplo` string (taken by
// value and moved into the member) and the `compute_eigenvectors` flag; both
// are reported by state().
// NOTE(review): the set of accepted `uplo` values is not visible here --
// confirm against the op that builds this primitive.
class Eigh : public Primitive {
 public:
  explicit Eigh(Stream stream, std::string uplo, bool compute_eigenvectors)
      : Primitive(stream),
        uplo_(std::move(uplo)),
        compute_eigenvectors_(compute_eigenvectors) {}
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_VMAP()
  DEFINE_NAME(Eigh)

  // Output shapes computed from the inputs; defined out of line.
  std::vector<Shape> output_shapes(const std::vector<array>& inputs) override;

  bool is_equivalent(const Primitive& other) const override;
  // Returns the pair (copy of uplo, compute_eigenvectors).
  auto state() const {
    return std::make_pair(uplo_, compute_eigenvectors_);
  }

 private:
  std::string uplo_;
  bool compute_eigenvectors_;
};

/* LU Factorization primitive.
 *
 * Derives from Primitive rather than UnaryPrimitive: its evaluation routines
 * write into a vector of output arrays. It holds no state beyond the stream
 * and only uses DEFINE_NAME (no vmap or gradient macros). */
class LUF : public Primitive {
 public:
  explicit LUF(Stream stream) : Primitive(stream) {}
  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;
  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
      override;

  DEFINE_NAME(LUF)
};

} // namespace mlx::core


================================================
FILE: mlx/random.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <cmath>
#include <cstring>
#include <sstream>

#include "mlx/linalg.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/random.h"
#include "mlx/utils.h"

namespace mlx::core::random {

// Builds the sequence's initial key from `seed` via key().
KeySequence::KeySequence(uint64_t seed) : key_(key(seed)) {}

// Re-seeds the sequence: the current key is replaced by the key derived from
// `seed`.
void KeySequence::seed(uint64_t seed) {
  key_ = key(seed);
}

// Advances the sequence: splits the current key in two, keeps the first half
// as the new internal key and hands the second half to the caller.
array KeySequence::next() {
  auto [carried, fresh] = split(key_);
  key_ = carried;
  return fresh;
}

// Re-seeds the default key sequence returned by KeySequence::default_().
void seed(uint64_t seed) {
  auto&& default_sequence = KeySequence::default_();
  default_sequence.seed(seed);
}

// Turns a 64-bit seed into a two-element uint32 key: the upper 32 bits come
// first, the lower 32 bits second.
array key(uint64_t seed) {
  const auto upper_half = static_cast<uint32_t>(seed >> 32);
  const auto lower_half = static_cast<uint32_t>(seed);
  return array({upper_half, lower_half});
}

// Builds a lazy array of random bits with the given shape.
//
// `width` is the size in bytes of each generated element and selects the
// output dtype (4 -> uint32, 2 -> uint16, 1 -> uint8). When no key is passed
// the next key of the default KeySequence is consumed.
//
// Throws std::invalid_argument if the key is not a uint32 array of shape (2)
// or if `width` is not one of {1, 2, 4}.
array bits(
    const Shape& shape,
    int width /* 4 */,
    const std::optional<array>& key_ /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  // Resolve the key first so the default sequence advances even when a later
  // validation step rejects the arguments.
  auto key = key_.has_value() ? *key_ : KeySequence::default_().next();

  const bool wrong_dtype = key.dtype() != uint32;
  if (wrong_dtype) {
    std::ostringstream msg;
    msg << "[bits] Expected key type uint32 but received " << key.dtype()
        << ".";
    throw std::invalid_argument(msg.str());
  }

  const bool wrong_shape = key.shape() != Shape{2};
  if (wrong_shape) {
    std::ostringstream msg;
    msg << "[bits] Expected key shape (2) but received " << key.shape() << ".";
    throw std::invalid_argument(msg.str());
  }

  // Map the byte width onto the matching unsigned integer dtype.
  auto out_dtype = [width]() {
    switch (width) {
      case 1:
        return uint8;
      case 2:
        return uint16;
      case 4:
        return uint32;
      default:
        std::ostringstream msg;
        msg << "[bits] Bit width must be in {1, 2, 4} but got " << width << ".";
        throw std::invalid_argument(msg.str());
    }
  }();

  auto primitive = std::make_shared<RandomBits>(to_stream(s), shape, width);
  return array(shape, out_dtype, std::move(primitive), {key});
}

// Splits `key` into two new keys.
//
// Two keys are generated at once with the `num` overload, separated with
// mlx::core::split, and each one is reshaped to shape {2}.
std::pair<array, array> split(const array& key, StreamOrDevice s /* = {} */) {
  auto stream = to_stream(s);
  auto both_keys = random::split(key, 2, stream);
  auto halves = mlx::core::split(both_keys, 2, stream);
  return {reshape(halves[0], {2}, stream), reshape(halves[1], {2}, stream)};
}

// Derives `num` new keys from `key`: the result is a (num, 2) array of 4-byte
// random words produced by bits().
array split(const array& key, int num, StreamOrDevice s /* = {} */) {
  constexpr int kWordBytes = 4;
  return bits({num, 2}, kWordBytes, key, s);
}

// Get the next representable value below 1.0 for half precision
// floating point types (fp16, bf16).
//
// The value is obtained by decrementing the 16-bit pattern of T(1.0) by one.
// The bits are moved in and out of an integer with std::memcpy instead of
// dereferencing a casted pointer, which keeps the code free of strict
// aliasing violations when T is a native half type.
template <typename T>
T below_one() {
  static_assert(
      sizeof(T) == sizeof(uint16_t),
      "below_one only supports 16-bit floating point types");
  T f = T(1.0);
  uint16_t pattern;
  std::memcpy(&pattern, &f, sizeof(pattern));
  pattern -= 1;
  std::memcpy(&f, &pattern, sizeof(pattern));
  return f;
}

// Generate uniform samples of type `dtype` between `low` and `high`.
//
// `low` and `high` are cast to `dtype` and must broadcast to `shape`.
// Throws std::invalid_argument for a non floating point `dtype` or when the
// broadcasted shape differs from `shape`, and std::runtime_error for a
// floating point type other than float32, float16 and bfloat16.
array uniform(
    const array& low,
    const array& high,
    const Shape& shape,
    Dtype dtype /* = float32 */,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  if (!issubdtype(dtype, floating)) {
    throw std::invalid_argument(
        "[uniform] Can only generate uniform numbers with real "
        "floating point type.");
  }

  auto stream = to_stream(s);
  auto lo = astype(low, dtype, stream);
  auto span = subtract(astype(high, dtype, stream), lo, stream);
  if (auto broadcasted = broadcast_shapes(shape, span.shape());
      broadcasted != shape) {
    std::ostringstream err;
    err << "[uniform] Cannot generate random values of shape " << shape
        << " from broadcasted shape " << broadcasted << ".";
    throw std::invalid_argument(err.str());
  }

  // Largest value strictly below 1.0 that `dtype` can represent. The unit
  // samples are capped at it since samples must be in [low, high).
  const auto largest_below_one = [dtype]() {
    switch (dtype) {
      case float32:
        return array(std::nextafter(1.0f, 0.0f), float32);
      case float16:
        return array(below_one<float16_t>(), float32);
      case bfloat16:
        return array(below_one<bfloat16_t>(), float32);
      default:
        throw std::runtime_error("[uniform] Unsupported type.");
    }
  };
  auto cap = largest_below_one();

  // Scale 32 random bits down to the unit interval.
  auto u32_max = array(std::numeric_limits<uint32_t>::max(), float32);
  auto unit = bits(shape, size_of(float32), key, stream);
  unit = divide(unit, u32_max, stream);
  unit = astype(minimum(unit, cap, stream), dtype, stream);

  // Map the unit interval onto the requested range.
  auto scaled = multiply(span, unit, stream);
  return add(scaled, lo, stream);
}

// Generate uniform samples between 0 and 1 with the given shape and dtype.
array uniform(
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  auto stream = to_stream(s);
  return uniform(array(0.0, dtype), array(1.0, dtype), shape, dtype, key, stream);
}

// Generate complex64 normal samples.
//
// A trailing axis of size two is appended to `shape` so that each pair of
// float32 values can be reinterpreted as one complex number. When provided,
// the samples are multiplied by `scale` and then shifted by `loc`.
inline array complex_normal(
    Shape shape,
    const std::optional<array>& loc,
    const std::optional<array>& scale,
    const std::optional<array>& key,
    StreamOrDevice s) {
  auto stream = to_stream(s);
  shape.push_back(2);

  auto lo = array(std::nextafter(-1.0f, 0.0f), float32);
  auto hi = array(1.0f, float32);
  auto parts = erfinv(uniform(lo, hi, shape, float32, key, stream), stream);

  // View the (..., 2) float32 pairs as (..., 1) complex64 and drop that axis.
  auto result = squeeze(view(parts, complex64, stream), -1, stream);
  if (scale) {
    result = multiply(*scale, result, stream);
  }
  if (loc) {
    result = add(*loc, result, stream);
  }
  return result;
}

// Generate samples from a normal distribution.
//
// complex64 requests are forwarded to complex_normal. For floating point
// types, uniform samples in the open interval around (-1, 1) are mapped
// through sqrt(2) * erfinv, multiplied by `scale` (when given) and shifted by
// `loc` (when given); `loc` and `scale` are cast to `dtype`.
//
// Throws std::invalid_argument when `dtype` is neither complex64 nor a
// floating point type.
array normal(
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& loc,
    const std::optional<array>& scale,
    const std::optional<array>& key,
    StreamOrDevice s /* = {} */) {
  if (dtype == complex64) {
    return complex_normal(shape, loc, scale, key, s);
  } else if (!issubdtype(dtype, floating)) {
    throw std::invalid_argument(
        "[normal] Can only generate normal numbers with floating point or "
        "complex64 type.");
  }

  auto stream = to_stream(s);
  // The lower bound is nudged above -1 so erfinv never sees exactly -1.
  auto low = array(std::nextafter(-1.0f, 0.0f), float32);
  auto high = array(1.0f, float32);
  auto samples = uniform(low, high, shape, float32, key, stream);
  auto applied_scale = array(std::sqrt(2.0), dtype);
  if (scale.has_value()) {
    applied_scale =
        multiply(applied_scale, astype(*scale, dtype, stream), stream);
  }
  samples = astype(erfinv(samples, stream), dtype, stream);
  samples = multiply(applied_scale, samples, stream);
  if (loc.has_value()) {
    samples = add(astype(*loc, dtype, stream), samples, stream);
  }
  return samples;
}

// Generate samples from a multivariate normal distribution with the given
// `mean` (..., n) and covariance `cov` (..., n, n).
//
// The output shape is the broadcast of the batch dimensions of `cov`, the
// batch dimensions of `mean` and `shape`, followed by n.
//
// Throws std::invalid_argument if `dtype` is not float32 or if the shapes of
// `mean` and `cov` are not compatible.
array multivariate_normal(
    const array& mean,
    const array& cov,
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& key /* = nullopt */,
    StreamOrDevice s) {
  auto stream = to_stream(s);

  if (dtype != float32) {
    throw std::invalid_argument("[multivariate_normal] dtype must be float32.");
  }
  if (mean.ndim() < 1) {
    throw std::invalid_argument(
        "[multivariate_normal] mean must have at least one dimension.");
  }
  if (cov.ndim() < 2) {
    throw std::invalid_argument(
        "[multivariate_normal] cov must have at least two dimensions.");
  }

  // The covariance must be square and match the size of the mean.
  const auto n = mean.shape(-1);
  if (cov.shape(-1) != cov.shape(-2)) {
    throw std::invalid_argument(
        "[multivariate_normal] last two dimensions of cov must be equal.");
  }
  if (n != cov.shape(-1)) {
    throw std::invalid_argument(
        "[multivariate_normal] mean and cov must have compatible shapes.");
  }

  // Batch dimensions are everything but the trailing event dimension(s).
  const auto& mean_dims = mean.shape();
  const auto& cov_dims = cov.shape();
  Shape mean_batch(mean_dims.begin(), mean_dims.end() - 1);
  Shape cov_batch(cov_dims.begin(), cov_dims.end() - 2);
  auto output_shape = broadcast_shapes(cov_batch, mean_batch);
  output_shape = broadcast_shapes(output_shape, shape);
  output_shape.push_back(n);

  // Square root of the covariance matrix via the SVD: U * sqrt(S) * Vt.
  auto covariance = astype(cov, float32, stream);
  auto usv = linalg::svd(covariance, true, stream);
  auto root_singular = expand_dims(sqrt(usv[1], stream), -2, stream);
  auto scaled_u = multiply(usv[0], root_singular, stream);
  auto cov_sqrt = astype(matmul(scaled_u, usv[2], stream), dtype, stream);

  // Draw standard normal samples and correlate them with the square root.
  auto standard_normal = normal(output_shape, dtype, 0.0, 1.0, key, stream);
  auto as_rows = expand_dims(standard_normal, -2, stream);
  auto correlated = squeeze(matmul(as_rows, cov_sqrt, stream), -2, stream);
  return add(mean, correlated, stream);
}

// Generate integer samples between `low` and `high`.
//
// Samples are drawn as float32 uniform values, clamped from below by `low`
// and then cast to `dtype`. Throws std::invalid_argument when `dtype` is an
// inexact type.
array randint(
    const array& low,
    const array& high,
    const Shape& shape,
    Dtype dtype /* = int32 */,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  if (issubdtype(dtype, inexact)) {
    throw std::invalid_argument(
        "[randint] randint only accepts integer dtypes and bool.");
  }
  auto samples = uniform(low, high, shape, float32, key, s);
  auto clamped = maximum(samples, low, s);
  return astype(clamped, dtype, s);
}

// Generate boolean samples that are true with probability `p`.
//
// Throws std::invalid_argument if `p` is not a floating point array or if
// the result of comparing against `p` does not have shape `shape`.
array bernoulli(
    const array& p,
    const Shape& shape,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  if (!issubdtype(p.dtype(), floating)) {
    throw std::invalid_argument(
        "[bernoulli] bernoulli probability `p` must be a float type.");
  }

  // Place p on the scale [0, nexthigher(UINT32_MAX)] so that if p >= 1.0 we
  // get all true and if p <= 0.0 we get all false
  const float u32_max =
      static_cast<float>(std::numeric_limits<uint32_t>::max());
  auto upper = array(
      std::nextafter(u32_max, std::numeric_limits<float>::max()), float32);

  auto draws = bits(shape, key, s);
  auto threshold = multiply(p, upper, s);
  auto res = less(draws, threshold, s);
  if (res.shape() != shape) {
    throw std::invalid_argument(
        "[bernoulli] shape of `p` is incompatible with argument `shape`.");
  }
  return res;
}

// Generate one boolean sample per element of `p`.
array bernoulli(
    const array& p,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  const auto& sample_shape = p.shape();
  return bernoulli(p, sample_shape, key, s);
}

// Generate a single boolean sample that is true with probability 0.5.
array bernoulli(
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  constexpr float kFairCoin = 0.5f;
  return bernoulli(array(kFairCoin), key, s);
}

// Generate samples from a standard normal distribution truncated to
// [lower, upper].
//
// Same approach as
// https://jax.readthedocs.io/en/latest/_modules/jax/_src/random.html#truncated_normal
//
// Throws std::invalid_argument when `dtype` is not a floating point type.
array truncated_normal(
    const array& lower,
    const array& upper,
    const Shape& shape,
    Dtype dtype /* = float32 */,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  if (!issubdtype(dtype, floating)) {
    throw std::invalid_argument(
        "[trunc_normal] trunc_normal only accepts floating point dtypes.");
  }

  auto sqrt2 = array(std::sqrt(2.0), dtype);
  auto lo = astype(lower, dtype, s);
  auto hi = astype(upper, dtype, s);

  // Map the bounds through erf, sample uniformly between them and map the
  // samples back with erfinv.
  auto erf_lo = erf(divide(lo, sqrt2, s), s);
  auto erf_hi = erf(divide(hi, sqrt2, s), s);
  auto u = uniform(erf_lo, erf_hi, shape, dtype, key, s);
  auto samples = multiply(sqrt2, erfinv(u, s), s);

  // Clip in bounds
  auto capped = minimum(hi, samples, s);
  return maximum(capped, lo, s);
}

// Truncated normal samples whose shape is the broadcast of the shapes of
// `lower` and `upper`.
array truncated_normal(
    const array& lower,
    const array& upper,
    Dtype dtype /* = float32 */,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  const auto bounds_shape = broadcast_shapes(lower.shape(), upper.shape());
  return truncated_normal(lower, upper, bounds_shape, dtype, key, s);
}

// Generate Gumbel samples as -log(-log(u)) with u = uniform(shape).
array gumbel(
    const Shape& shape,
    Dtype dtype /* = float32 */,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  auto u = uniform(shape, dtype, key, s);
  auto neg_log_u = negative(log(u, s), s);
  return negative(log(neg_log_u, s), s);
}

// Normalize a possibly negative `axis` to the range [0, ndim).
// Throws std::invalid_argument when the axis is out of range.
int get_valid_axis(int axis, int ndim) {
  const int normalized = (axis >= 0) ? axis : axis + ndim;
  if (normalized >= 0 && normalized < ndim) {
    return normalized;
  }
  std::ostringstream err;
  err << "[categorical] Invalid axis " << axis << " for logits with " << ndim
      << " dimensions.";
  throw std::invalid_argument(err.str());
}

// Sample categories by adding Gumbel noise to `logits` and taking the argmax
// over the category axis.
//
// `axis` must already be normalized. The category axis is inserted into
// `shape` at the position matching `axis` once `shape` and the logits are
// aligned on their trailing dimensions.
array categorical_impl(
    const array& logits,
    int axis,
    const Shape& shape,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s) {
  auto offset = axis + shape.size() - logits.ndim() + 1;

  auto noise_shape = shape;
  noise_shape.insert(noise_shape.begin() + offset, logits.shape(axis));

  auto noise = gumbel(noise_shape, float32, key, s);
  auto perturbed = add(noise, logits, s);
  return argmax(perturbed, offset, false, s);
}

// Sample from the categorical distribution given by `logits` along `axis`,
// producing an output of shape `shape`.
//
// Throws std::invalid_argument for an invalid axis or when `shape` is not
// broadcast compatible with the logits shape with `axis` removed.
array categorical(
    const array& logits,
    int axis,
    const Shape& shape,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  axis = get_valid_axis(axis, logits.ndim());

  // Shape of the logits once the category axis has been reduced away.
  auto reduced_shape = logits.shape();
  reduced_shape.erase(reduced_shape.begin() + axis);

  auto broadcasted = broadcast_shapes(shape, reduced_shape);
  if (broadcasted != shape) {
    std::ostringstream err;
    err << "[categorical] Requested shape " << shape
        << " is not broadcast compatible with reduced logits shape"
        << reduced_shape << ".";
    throw std::invalid_argument(err.str());
  }

  return categorical_impl(logits, axis, shape, key, s);
}

// Draw `num_samples` samples from the categorical distribution given by
// `logits_` along `axis`.
//
// A trailing singleton axis is appended to the logits so that it broadcasts
// against the trailing `num_samples` dimension of the output. Throws
// std::invalid_argument for an invalid axis.
array categorical(
    const array& logits_,
    int axis,
    int num_samples,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  axis = get_valid_axis(axis, logits_.ndim());
  // Run the expansion on the requested stream like every other op here.
  auto logits = expand_dims(logits_, -1, s);
  auto shape = logits.shape();
  shape.erase(shape.begin() + axis);
  shape.back() = num_samples;
  return categorical_impl(logits, axis, shape, key, s);
}

// Draw one sample per distribution in `logits`; the output has the shape of
// the logits with `axis` removed. Throws std::invalid_argument for an invalid
// axis.
array categorical(
    const array& logits,
    int axis /* = -1 */,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  axis = get_valid_axis(axis, logits.ndim());
  auto sample_shape = logits.shape();
  sample_shape.erase(sample_shape.begin() + axis);
  return categorical_impl(logits, axis, sample_shape, key, s);
}

// Generate samples from the Laplace distribution with location `loc` and
// scale `scale`.
//
// Float32 uniform samples u around (-1, 1) are mapped through
// sign(u) * log1p(-|u|), cast to `dtype`, then scaled and shifted. The scale
// and shift are skipped when they are the identity.
//
// Throws std::invalid_argument when `dtype` is not a floating point type.
array laplace(
    const Shape& shape,
    Dtype dtype,
    const float loc /* = 0.0 */,
    const float scale /* = 1.0 */,
    const std::optional<array>& key /*= nullopt */,
    StreamOrDevice s /* = {} */) {
  if (!issubdtype(dtype, floating)) {
    throw std::invalid_argument(
        "[laplace] Can only generate laplace numbers with real "
        "floating point type.");
  }

  auto stream = to_stream(s);
  // The lower bound is nudged above -1 so that log1p never sees exactly -1.
  auto low = array(std::nextafter(-1.0f, 0.0f), float32);
  auto high = array(1.0f, float32);
  auto samples = uniform(low, high, shape, float32, key, stream);
  // Use inverse CDF to generate Laplacian noise
  samples = multiply(
      sign(samples, stream),
      log1p(
          multiply(array(-1.0f, dtype), abs(samples, stream), stream), stream),
      stream);
  samples = astype(samples, dtype, stream);

  if (scale != 1.0) {
    samples = multiply(array(scale, dtype), samples, stream);
  }
  if (loc != 0.0) {
    samples = add(array(loc, dtype), samples, stream);
  }
  return samples;
}

// Randomly permute the entries of `x` along `axis`.
array permutation(
    const array& x,
    int axis /* = 0 */,
    const std::optional<array>& key /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  auto order = permutation(x.shape(axis), key, s);
  return take(x, order, axis, s);
}

// Random ordering of the indices [0, x), obtained by sorting `x` random
// values and returning the sorting indices.
array permutation(
    int x,
    const std::optional<array>& key /* = std::nullopt */,
    StreamOrDevice s /* = {} */) {
  const Shape count{x};
  auto noise = bits(count, key, s);
  return argsort(noise, s);
}

} // namespace mlx::core::random


================================================
FILE: mlx/random.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <chrono>
#include <optional>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/stream.h"
#include "mlx/utils.h"

namespace mlx::core::random {

/** A stateful source of PRNG keys that starts from a seed. */
class MLX_API KeySequence {
 public:
  explicit KeySequence(uint64_t seed);

  /** Restart the sequence from the key derived from `seed`. */
  void seed(uint64_t seed);

  /** Advance the sequence and return a new key. */
  array next();

  // Each thread has its own random key to avoid race condition.
  static KeySequence& default_() {
    // Milliseconds since the system clock's epoch. The value is computed once
    // and used as the initial seed of every thread's sequence.
    static auto time_seed = []() {
      using std::chrono::milliseconds;
      const auto since_epoch =
          std::chrono::system_clock::now().time_since_epoch();
      return std::chrono::duration_cast<milliseconds>(since_epoch).count();
    }();
    static thread_local KeySequence ks(time_seed);
    return ks;
  }

 private:
  array key_;
};

/**
 * Get a PRNG key from a seed.
 *
 * The key holds the upper 32 bits of `seed` followed by its lower 32 bits.
 */
MLX_API array key(uint64_t seed);

/**
 * Seed the default PRNG key.
 *
 * This re-seeds the sequence returned by KeySequence::default_(), which is
 * thread local, so only the calling thread's default sequence is affected.
 */
MLX_API void seed(uint64_t seed);

/**
 * Generate an array filled with random bits.
 *
 * `width` is the element size in bytes and selects the output type:
 * 4 -> uint32, 2 -> uint16, 1 -> uint8. Any other width throws
 * std::invalid_argument.
 *
 * `key` must be a uint32 array of shape (2), otherwise std::invalid_argument
 * is thrown. When no key is given, the next key of the default KeySequence is
 * used.
 */
MLX_API array bits(
    const Shape& shape,
    int width,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
/** Generate an array of random bits using the default width of 4 bytes. */
inline array bits(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  constexpr int kDefaultWidth = 4;
  return bits(shape, kDefaultWidth, key, s);
}

/**
 * Split the rng key into a pair of keys.
 *
 * Each returned key has shape (2).
 */
MLX_API std::pair<array, array> split(const array& key, StreamOrDevice s = {});

/**
 * Split the rng key into `num` keys.
 *
 * The keys are returned as a single array of shape (num, 2).
 */
MLX_API array split(const array& key, int num, StreamOrDevice s = {});

/**
 * Generate uniform random numbers between low and high.
 *
 * `low` and `high` are cast to `dtype`, which must be a floating point type,
 * and must broadcast to `shape`; std::invalid_argument is thrown otherwise.
 * The implementation caps the unit samples just below 1.0 with the intent
 * that results fall in [low, high).
 */
MLX_API array uniform(
    const array& low,
    const array& high,
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Generate uniform random numbers between low and high, where the bounds are
 * given as values that an array can be constructed from.
 */
template <typename T, typename U>
array uniform(
    T low,
    U high,
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  auto stream = to_stream(s);
  return uniform(array(low), array(high), shape, dtype, key, stream);
}

/**
 * Generate uniform random numbers between 0 and 1.
 *
 * `dtype` must be a floating point type.
 */
MLX_API array uniform(
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
/** Generate float32 uniform random numbers between 0 and 1. */
inline array uniform(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  const Dtype default_dtype = float32;
  return uniform(shape, default_dtype, key, s);
}

/**
 * Generate samples from a normal distribution.
 *
 * Standard normal samples are multiplied by `scale` and shifted by `loc` when
 * those are provided. `dtype` must be a floating point type or complex64,
 * otherwise std::invalid_argument is thrown.
 */
MLX_API array normal(
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& loc,
    const std::optional<array>& scale,
    const std::optional<array>& key,
    StreamOrDevice s = {});
/**
 * Generate normal samples with scalar `loc` and `scale`.
 *
 * A `loc` of 0 or a `scale` of 1 is passed on as "not provided" so that the
 * corresponding operation is skipped.
 */
inline array normal(
    const Shape& shape,
    Dtype dtype,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  std::optional<array> loc_;
  if (!(loc == 0)) {
    loc_.emplace(loc, dtype);
  }
  std::optional<array> scale_;
  if (!(scale == 1)) {
    scale_.emplace(scale, dtype);
  }
  return normal(shape, dtype, loc_, scale_, key, s);
}
/** Generate float32 normal samples with scalar `loc` and `scale`. */
inline array normal(
    const Shape& shape,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  const Dtype default_dtype = float32;
  return normal(shape, default_dtype, loc, scale, key, s);
}
/** Generate standard normal samples of the given type. */
inline array normal(
    const Shape& shape,
    const Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  // No location or scale: leave both unset.
  const std::optional<array> unset;
  return normal(shape, dtype, unset, unset, key, s);
}
/** Generate float32 standard normal samples. */
inline array normal(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  // No location or scale: leave both unset.
  const std::optional<array> unset;
  return normal(shape, float32, unset, unset, key, s);
}

/**
 * Generate samples from a multivariate normal distribution.
 *
 * `mean` must have at least one dimension (..., n) and `cov` at least two
 * (..., n, n). The output shape is the broadcast of the batch dimensions of
 * `cov` and `mean` with `shape`, followed by n. `dtype` must be float32.
 * std::invalid_argument is thrown when these requirements are not met.
 **/
MLX_API array multivariate_normal(
    const array& mean,
    const array& cov,
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Generate integer samples uniformly at random.
 *
 * Samples are drawn as float32 uniform values between `low` and `high` and
 * then cast to `dtype`. `dtype` must not be an inexact type, otherwise
 * std::invalid_argument is thrown.
 */
MLX_API array randint(
    const array& low,
    const array& high,
    const Shape& shape,
    Dtype dtype = int32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Generate integer samples uniformly at random, where the bounds are given as
 * values that an array can be constructed from.
 */
template <typename T, typename U>
array randint(
    T low,
    U high,
    const Shape& shape,
    Dtype dtype = int32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  auto stream = to_stream(s);
  return randint(array(low), array(high), shape, dtype, key, stream);
}

/**
 * Generate binary variables with probability to be true equal to p.
 *
 * `p` must have a floating point type and be compatible with `shape`,
 * otherwise std::invalid_argument is thrown.
 */
MLX_API array bernoulli(
    const array& p,
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
/** Same as above with the output shape taken from the shape of `p`. */
MLX_API array bernoulli(
    const array& p,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/** Generate binary variables from a probability given as a plain value. */
template <typename T>
array bernoulli(
    T p,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  const array probability(p);
  return bernoulli(probability, key, s);
}

/**
 * Generate binary variables of the given shape from a probability given as a
 * plain value.
 */
template <typename T>
array bernoulli(
    T p,
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  const array probability(p);
  return bernoulli(probability, shape, key, s);
}

/** Generate a single binary variable that is true with probability 0.5. */
MLX_API array bernoulli(
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Generate samples from a standard normal distribution truncated to
 * [lower, upper].
 *
 * `lower` and `upper` are cast to `dtype`, which must be a floating point
 * type; std::invalid_argument is thrown otherwise. Results are clipped to the
 * bounds.
 */
MLX_API array truncated_normal(
    const array& lower,
    const array& upper,
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Same as above with the output shape given by broadcasting the shapes of
 * `lower` and `upper`.
 */
MLX_API array truncated_normal(
    const array& lower,
    const array& upper,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Generate samples from the Gumbel distribution, computed as
 * -log(-log(u)) for uniform samples u.
 */
MLX_API array gumbel(
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Sample from the categorical distribution given by `logits` along `axis`.
 *
 * The output has shape `shape`, which must be broadcast compatible with the
 * shape of the logits with `axis` removed. Throws std::invalid_argument for
 * an invalid axis or an incompatible shape.
 */
MLX_API array categorical(
    const array& logits,
    int axis,
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Draw `num_samples` samples from each categorical distribution in
 * `logits_`.
 *
 * The output has the shape of the logits with `axis` removed and a trailing
 * dimension of size `num_samples`.
 */
MLX_API array categorical(
    const array& logits_,
    int axis,
    int num_samples,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Draw one sample from each categorical distribution in `logits`.
 *
 * The output has the shape of the logits with `axis` removed.
 */
MLX_API array categorical(
    const array& logits,
    int axis = -1,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * Generate samples from the laplace distribution.
 *
 * Samples are multiplied by `scale` and shifted by `loc`. `dtype` must be a
 * floating point type, otherwise std::invalid_argument is thrown.
 */
MLX_API array laplace(
    const Shape& shape,
    Dtype dtype,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
/** Generate float32 laplace samples with the given `loc` and `scale`. */
inline array laplace(
    const Shape& shape,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  const Dtype default_dtype = float32;
  return laplace(shape, default_dtype, loc, scale, key, s);
}
/** Generate laplace samples of the given type with loc 0 and scale 1. */
inline array laplace(
    const Shape& shape,
    const Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  constexpr float kLoc = 0.0f;
  constexpr float kScale = 1.0f;
  return laplace(shape, dtype, kLoc, kScale, key, s);
}
/** Generate float32 laplace samples with loc 0 and scale 1. */
inline array laplace(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  constexpr float kLoc = 0.0f;
  constexpr float kScale = 1.0f;
  return laplace(shape, float32, kLoc, kScale, key, s);
}

/**
 * Randomly permute the elements of x along the given axis.
 *
 * The same random ordering of indices is applied to every slice along `axis`.
 */
MLX_API array permutation(
    const array& x,
    int axis = 0,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/**
 * A random permutation of `arange(x)`.
 *
 * Obtained by sorting `x` random values and returning the sorting indices.
 */
MLX_API array permutation(
    int x,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

} // namespace mlx::core::random


================================================
FILE: mlx/scheduler.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "mlx/scheduler.h"
#include "mlx/backend/gpu/device_info.h"
#include "mlx/backend/gpu/eval.h"

namespace mlx::core {

// Return the default stream of device `d`.
// Throws std::invalid_argument when a gpu stream is requested but no gpu
// backend is available.
Stream default_stream(Device d) {
  const bool gpu_unavailable = !gpu::is_available();
  if (gpu_unavailable && d == Device::gpu) {
    throw std::invalid_argument(
        "[default_stream] Cannot get gpu stream without gpu backend.");
  }
  auto& sched = scheduler::scheduler();
  return sched.get_default_stream(d);
}

// Make `s` the default stream of its device.
// Throws std::invalid_argument when `s` is a gpu stream but no gpu backend is
// available.
void set_default_stream(Stream s) {
  const bool gpu_unavailable = !gpu::is_available();
  if (gpu_unavailable && s.device == Device::gpu) {
    throw std::invalid_argument(
        "[set_default_stream] Cannot set gpu stream without gpu backend.");
  }
  scheduler::scheduler().set_default_stream(s);
}

// Look up the stream registered under `index` in the scheduler.
Stream get_stream(int index) {
  auto& sched = scheduler::scheduler();
  return sched.get_stream(index);
}

// Return a copy of all streams known to the scheduler.
std::vector<Stream> get_streams() {
  auto& sched = scheduler::scheduler();
  return sched.get_streams();
}

// Create a new stream on device `d`.
// Throws std::invalid_argument when a gpu stream is requested but no gpu
// backend is available.
Stream new_stream(Device d) {
  const bool gpu_unavailable = !gpu::is_available();
  if (gpu_unavailable && d == Device::gpu) {
    throw std::invalid_argument(
        "[new_stream] Cannot make gpu stream without gpu backend.");
  }
  auto& sched = scheduler::scheduler();
  return sched.new_stream(d);
}

// Create a new stream on the default device.
Stream new_stream() {
  auto& sched = scheduler::scheduler();
  return sched.new_stream(default_device());
}

// Block until the work submitted to stream `s` so far has run.
void synchronize(Stream s) {
  if (!(s.device == mlx::core::Device::cpu)) {
    gpu::synchronize(s);
    return;
  }

  // CPU stream: enqueue a marker task that fulfils a promise and wait for it.
  // The promise is shared with the task so it outlives this scope if needed.
  auto done = std::make_shared<std::promise<void>>();
  std::future<void> done_future = done->get_future();
  scheduler::enqueue(s, [done = std::move(done)]() { done->set_value(); });
  done_future.wait();
}

// Synchronize with the default stream of the default device.
void synchronize() {
  auto stream = default_stream(default_device());
  synchronize(stream);
}

namespace scheduler {

/** A singleton scheduler to manage devices, streams, and task execution. */
Scheduler& scheduler() {
  // The instance is intentionally never deleted. This avoids the "static
  // destruction order fiasco": background threads (e.g. command buffer
  // completion handlers) may still reference the singleton after other static
  // objects have been destroyed during process teardown.
  static Scheduler* const instance = new Scheduler();
  return *instance;
}

} // namespace scheduler
} // namespace mlx::core


================================================
FILE: mlx/scheduler.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <atomic>
#include <future>
#include <queue>
#include <thread>
#include <unordered_map>

#include "mlx/api.h"
#include "mlx/backend/gpu/eval.h"
#include "mlx/device.h"
#include "mlx/stream.h"

namespace mlx::core::scheduler {

// A worker thread that runs queued tasks one at a time in FIFO order.
//
// The destructor requests a stop and joins the worker; tasks still in the
// queue at that point are run before the worker exits.
struct StreamThread {
  std::mutex mtx;
  std::queue<std::function<void()>> q;
  std::condition_variable cond;
  bool stop;
  // Declared last so the members above are constructed before the worker
  // thread starts using them.
  std::thread thread;

  StreamThread() : stop(false), thread([this] { thread_fn(); }) {}

  ~StreamThread() {
    {
      std::unique_lock<std::mutex> guard(mtx);
      stop = true;
    }
    cond.notify_one();
    thread.join();
  }

  // Worker loop: wait for a task or a stop request, then run the task outside
  // of the lock.
  void thread_fn() {
    for (;;) {
      std::function<void()> job;
      {
        std::unique_lock<std::mutex> guard(mtx);
        while (q.empty() && !stop) {
          cond.wait(guard);
        }
        // An empty queue here means a stop was requested and all queued work
        // has been drained.
        if (q.empty()) {
          return;
        }
        job = std::move(q.front());
        q.pop();
      }

      job();
    }
  }

  // Queue `f` for execution on the worker thread.
  // Throws std::runtime_error if the stream has already been stopped.
  template <typename F>
  void enqueue(F&& f) {
    {
      std::unique_lock<std::mutex> guard(mtx);
      if (stop) {
        throw std::runtime_error(
            "Cannot enqueue work after stream is stopped.");
      }
      q.emplace(std::forward<F>(f));
    }
    cond.notify_one();
  }
};

// Owns the streams and the CPU worker threads and tracks the number of
// active tasks.
//
// Streams are identified by their index into `streams_`; `threads_` holds
// the worker of each CPU stream at the same index and nullptr for GPU
// streams.
class Scheduler {
 public:
  // Create the default streams: one for the gpu when it is available and one
  // for the cpu.
  Scheduler() : n_active_tasks_(0) {
    if (is_available(Device::gpu)) {
      default_streams_.insert({Device::gpu, new_stream(Device::gpu)});
    }
    default_streams_.insert({Device::cpu, new_stream(Device::cpu)});
  }

  // Not copyable or moveable
  Scheduler(const Scheduler&) = delete;
  Scheduler(Scheduler&&) = delete;
  Scheduler& operator=(const Scheduler&) = delete;
  Scheduler& operator=(Scheduler&&) = delete;

  // Register a new stream on device `d`. CPU streams get a dedicated worker
  // thread; GPU streams are handed to the gpu backend.
  Stream new_stream(const Device& d) {
    streams_.emplace_back(streams_.size(), d);
    if (d == Device::gpu) {
      threads_.push_back(nullptr);
      gpu::new_stream(streams_.back());
    } else {
      threads_.push_back(new StreamThread{});
    }
    return streams_.back();
  }

  // Queue `f` on the worker thread of `stream`.
  template <typename F>
  void enqueue(const Stream& stream, F&& f);

  Stream get_default_stream(const Device& d) const {
    return default_streams_.at(d.type);
  }
  Stream get_stream(int index) const {
    return streams_.at(index);
  }
  std::vector<Stream> get_streams() const {
    return streams_;
  }

  void set_default_stream(const Stream& s) {
    default_streams_.at(s.device.type) = s;
  }

  // Record that a task was started and wake up any waiter.
  void notify_new_task(const Stream& stream) {
    {
      std::lock_guard<std::mutex> lk(mtx);
      n_active_tasks_++;
    }
    completion_cv.notify_all();
  }

  // Record that a task finished and wake up any waiter.
  void notify_task_completion(const Stream& stream) {
    {
      std::lock_guard<std::mutex> lk(mtx);
      n_active_tasks_--;
    }
    completion_cv.notify_all();
  }

  // Number of active tasks. The counter is atomic so that this read does not
  // race with the notify_* functions, which modify it from other threads.
  int n_active_tasks() const {
    return n_active_tasks_.load();
  }

  // When more than one task is active, block until the number of active
  // tasks drops below its value at the time of the call.
  void wait_for_one() {
    std::unique_lock<std::mutex> lk(mtx);
    int n_tasks_old = n_active_tasks();
    if (n_tasks_old > 1) {
      completion_cv.wait(lk, [this, n_tasks_old] {
        return this->n_active_tasks() < n_tasks_old;
      });
    }
  }

  // Wait for every stream to finish, then stop and delete the workers.
  ~Scheduler() {
    for (auto s : streams_) {
      try {
        synchronize(s);
      } catch (const std::runtime_error&) {
        // ignore errors if synch fails
      }
    }
    for (auto t : threads_) {
      if (t != nullptr) {
        delete t;
      }
    }
  }

 private:
  // Modified under `mtx` so that waiters on `completion_cv` cannot miss an
  // update; atomic so that it can also be read without taking the lock.
  std::atomic<int> n_active_tasks_;
  std::vector<StreamThread*> threads_;
  std::vector<Stream> streams_;
  std::unordered_map<Device::DeviceType, Stream> default_streams_;
  std::condition_variable completion_cv;
  std::mutex mtx;
};

// Forward `f` to the worker thread stored at the index of `stream`.
template <typename F>
void Scheduler::enqueue(const Stream& stream, F&& f) {
  auto* worker = threads_[stream.index];
  worker->enqueue(std::forward<F>(f));
}

/** Access the singleton Scheduler instance. */
MLX_API Scheduler& scheduler();

/** Queue `f` on `stream` through the singleton scheduler. */
template <typename F>
void enqueue(const Stream& stream, F&& f) {
  auto& sched = scheduler();
  sched.enqueue(stream, std::forward<F>(f));
}

/** Number of active tasks tracked by the singleton scheduler. */
inline int n_active_tasks() {
  auto& sched = scheduler();
  return sched.n_active_tasks();
}

/** Tell the singleton scheduler that a task was started on `stream`. */
inline void notify_new_task(const Stream& stream) {
  auto& sched = scheduler();
  sched.notify_new_task(stream);
}

/** Tell the singleton scheduler that a task on `stream` has finished. */
inline void notify_task_completion(const Stream& stream) {
  auto& sched = scheduler();
  sched.notify_task_completion(stream);
}

/** Call wait_for_one() on the singleton scheduler. */
inline void wait_for_one() {
  auto& sched = scheduler();
  sched.wait_for_one();
}

} // namespace mlx::core::scheduler


================================================
FILE: mlx/small_vector.h
================================================
// Copyright © 2025 Apple Inc.
// Copyright © 2018 the V8 project authors.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include <algorithm>
#include <cassert>
#include <type_traits>
#include <utility>

namespace mlx::core {

// MLX_HAS_BUILTIN(x): forwards to __has_builtin(x) when the compiler provides
// it and evaluates to 0 otherwise.
#if defined(__has_builtin)
#define MLX_HAS_BUILTIN(x) __has_builtin(x)
#else
#define MLX_HAS_BUILTIN(x) 0
#endif

// MLX_HAS_ATTRIBUTE(x): forwards to __has_attribute(x) when the compiler
// provides it and evaluates to 0 otherwise.
#if defined(__has_attribute)
#define MLX_HAS_ATTRIBUTE(x) __has_attribute(x)
#else
#define MLX_HAS_ATTRIBUTE(x) 0
#endif

// MLX_LIKELY / MLX_UNLIKELY: branch prediction hints built on
// __builtin_expect. Without the builtin they expand to the plain condition.
#if MLX_HAS_BUILTIN(__builtin_expect)
#define MLX_LIKELY(condition) (__builtin_expect(!!(condition), 1))
#define MLX_UNLIKELY(condition) (__builtin_expect(!!(condition), 0))
#else
#define MLX_LIKELY(condition) (condition)
#define MLX_UNLIKELY(condition) (condition)
#endif

// MLX_NOINLINE: expands to the noinline attribute when it is supported and to
// nothing otherwise.
#if MLX_HAS_ATTRIBUTE(noinline)
#define MLX_NOINLINE __attribute__((noinline))
#else
#define MLX_NOINLINE
#endif

// Trait that detects iterator types: it is true when std::iterator_traits of
// the candidate type provides all five standard member types, and false
// otherwise.
template <typename Candidate, typename Enable = void>
struct is_iterator : std::false_type {};

template <typename Candidate>
struct is_iterator<
    Candidate,
    std::void_t<
        typename std::iterator_traits<Candidate>::value_type,
        typename std::iterator_traits<Candidate>::reference,
        typename std::iterator_traits<Candidate>::pointer,
        typename std::iterator_traits<Candidate>::difference_type,
        typename std::iterator_traits<Candidate>::iterator_category>>
    : std::true_type {};

// Convenience variable template for is_iterator<Candidate>::value.
template <typename Candidate>
constexpr bool is_iterator_v = is_iterator<Candidate>::value;

// Minimal SmallVector implementation. Uses inline storage first, switches to
// dynamic storage when it overflows.
//
// Notes:
// * The default inline storage size is MAX_NDIM, as it is mainly used for
//   shapes and strides; users should choose a better size for other cases.
// * data() returns a real address even for an empty vector.
// * The pointer returned by data() will change after moving the vector as it
//   points to the inline storage.
// * For trivial elements the storage will not be default constructed,
//   i.e. SmallVector<int>(10) will not be filled with 0 by default.
template <typename T, size_t kSize = 10, typename Allocator = std::allocator<T>>
class SmallVector {
 public:
  using value_type = T;
  using reference = T&;
  using const_reference = const T&;
  using iterator = T*;
  using const_iterator = const T*;
  using difference_type = std::ptrdiff_t;
  using size_type = std::size_t;

  SmallVector() = default;

  explicit SmallVector(const Allocator& allocator) : allocator_(allocator) {}

  explicit SmallVector(size_t size, const Allocator& allocator = Allocator())
      : allocator_(allocator) {
    resize(size);
  }

  SmallVector(
      size_t size,
      const T& initial_value,
      const Allocator& allocator = Allocator())
      : allocator_(allocator) {
    resize(size, initial_value);
  }

  SmallVector(
      std::initializer_list<T> init,
      const Allocator& allocator = Allocator())
      : allocator_(allocator) {
    if (init.size() > capacity()) {
      grow(init.size());
    }
    assert(capacity() >= init.size()); // sanity check
    std::uninitialized_move(init.begin(), init.end(), begin_);
    end_ = begin_ + init.size();
  }

  template <typename Iter, typename = std::enable_if_t<is_iterator_v<Iter>>>
  SmallVector(Iter begin, Iter end, const Allocator& allocator = Allocator())
      : allocator_(allocator) {
    size_t size = std::distance(begin, end);
    if (size > capacity()) {
      grow(size);
    }
    assert(capacity() >= size); // sanity check
    std::uninitialized_copy(begin, end, begin_);
    end_ = begin_ + size;
  }

  SmallVector(const SmallVector& other) : allocator_(other.allocator_) {
    *this = other;
  }
  SmallVector(const SmallVector& other, const Allocator& allocator)
      : allocator_(allocator) {
    *this = other;
  }
  SmallVector(SmallVector&& other) : allocator_(std::move(other.allocator_)) {
    *this = std::move(other);
  }
  SmallVector(SmallVector&& other, const Allocator& allocator)
      : allocator_(allocator) {
    *this = std::move(other);
  }

  ~SmallVector() {
    free_storage();
  }

  SmallVector& operator=(const SmallVector& other) {
    if (this == &other) {
      return *this;
    }
    size_t other_size = other.size();
    if (capacity() < other_size) {
      // Create large-enough heap-allocated storage.
      free_storage();
      begin_ = allocator_.allocate(other_size);
      end_of_storage_ = begin_ + other_size;
      std::uninitialized_copy(other.begin_, other.end_, begin_);
    } else if constexpr (kHasTrivialElement) {
      std::copy(other.begin_, other.end_, begin_);
    } else {
      ptrdiff_t to_copy =
          std::min(static_cast<ptrdiff_t>(other_size), end_ - begin_);
      std::copy(other.begin_, other.begin_ + to_copy, begin_);
      if (other.begin_ + to_copy < other.end_) {
        std::uninitialized_copy(
            other.begin_ + to_copy, other.end_, begin_ + to_copy);
      } else {
        std::destroy_n(begin_ + to_copy, size() - to_copy);
      }
    }
    end_ = begin_ + other_size;
    return *this;
  }

  SmallVector& operator=(SmallVector&& other) {
    if (this == &other) {
      return *this;
    }
    if (other.is_big()) {
      free_storage();
      begin_ = other.begin_;
      end_ = other.end_;
      end_of_storage_ = other.end_of_storage_;
    } else {
      assert(capacity() >= other.size()); // sanity check
      size_t other_size = other.size();
      if constexpr (kHasTrivialElement) {
        std::move(other.begin_, other.end_, begin_);
      } else {
        ptrdiff_t to_move =
            std::min(static_cast<ptrdiff_t>(other_size), end_ - begin_);
        std::move(other.begin_, other.begin_ + to_move, begin_);
        if (other.begin_ + to_move < other.end_) {
          std::uninitialized_move(
              other.begin_ + to_move, other.end_, begin_ + to_move);
        } else {
          std::destroy_n(begin_ + to_move, size() - to_move);
        }
      }
      end_ = begin_ + other_size;
    }
    other.reset_to_inline_storage();
    return *this;
  }

  bool operator==(const SmallVector& other) const {
    if (size() != other.size()) {
      return false;
    }
    return std::equal(begin_, end_, other.begin_);
  }

  bool operator!=(const SmallVector& other) const {
    return !(*this == other);
  }

  T* data() {
    return begin_;
  }
  const T* data() const {
    return begin_;
  }

  iterator begin() {
    return begin_;
  }
  const_iterator begin() const {
    return begin_;
  }

  iterator end() {
    return end_;
  }
  const_iterator end() const {
    return end_;
  }

  const_iterator cbegin() const {
    return begin_;
  }

  const_iterator cend() const {
    return end_;
  }

  auto rbegin() {
    return std::make_reverse_iterator(end_);
  }
  auto rbegin() const {
    return std::make_reverse_iterator(end_);
  }

  auto rend() {
    return std::make_reverse_iterator(begin_);
  }
  auto rend() const {
    return std::make_reverse_iterator(begin_);
  }

  size_t size() const {
    return end_ - begin_;
  }
  bool empty() const {
    return end_ == begin_;
  }
  size_t capacity() const {
    return end_of_storage_ - begin_;
  }

  T& front() {
    assert(size() != 0);
    return begin_[0];
  }
  const T& front() const {
    assert(size() != 0);
    return begin_[0];
  }

  T& back() {
    assert(size() != 0);
    return end_[-1];
  }
  const T& back() const {
    assert(size() != 0);
    return end_[-1];
  }

  T& at(size_t index) {
    if (index >= size()) {
      throw std::out_of_range("SmallVector out of range.");
    }
    return begin_[index];
  }
  const T& at(size_t index) const {
    return const_cast<SmallVector*>(this)->at(index);
  }

  T& operator[](size_t index) {
    assert(size() > index);
    return begin_[index];
  }
  const T& operator[](size_t index) const {
    return const_cast<SmallVector*>(this)->operator[](index);
  }

  template <typename... Args>
  void emplace_back(Args&&... args) {
    if (MLX_UNLIKELY(end_ == end_of_storage_)) {
      grow();
    }
    void* storage = end_;
    end_ += 1;
    new (storage) T(std::forward<Args>(args)...);
  }

  void push_back(T x) {
    emplace_back(std::move(x));
  }

  void pop_back(size_t count = 1) {
    assert(size() >= count);
    end_ -= count;
    std::destroy_n(end_, count);
  }

  iterator insert(iterator pos, T value) {
    return insert(pos, static_cast<size_t>(1), std::move(value));
  }

  iterator insert(iterator pos, size_t count, T value) {
    assert(pos <= end_);
    size_t offset = pos - begin_;
    size_t old_size = size();
    resize(old_size + count);
    pos = begin_ + offset;
    iterator old_end = begin_ + old_size;
    assert(old_end <= end_);
    std::move_backward(pos, old_end, end_);
    if constexpr (kHasTrivialElement) {
      std::fill_n(pos, count, value);
    } else {
      std::fill_n(pos + 1, count - 1, value);
      *pos = std::move(value);
    }
    return pos;
  }

  template <typename Iter, typename = std::enable_if_t<is_iterator_v<Iter>>>
  iterator insert(iterator pos, Iter begin, Iter end) {
    if constexpr (std::is_same_v<std::decay_t<Iter>, iterator>) {
      // The implementation can not take overlapping range.
      assert(!(begin >= pos && begin < pos + std::distance(begin, end)));
      assert(!(end > pos && end <= pos + std::distance(begin, end)));
    }

    assert(pos <= end_);
    size_t offset = pos - begin_;
    size_t count = std::distance(begin, end);
    size_t old_size = size();
    resize(old_size + count);
    pos = begin_ + offset;
    iterator old_end = begin_ + old_size;
    assert(old_end <= end_);
    std::move_backward(pos, old_end, end_);
    std::copy(begin, end, pos);
    return pos;
  }

  iterator insert(iterator pos, std::initializer_list<const T> values) {
    return insert(pos, values.begin(), values.end());
  }

  iterator erase(iterator erase_start, iterator erase_end) {
    assert(erase_start >= begin_);
    assert(erase_start <= erase_end);
    assert(erase_end <= end_);
    iterator new_end = std::move(erase_end, end_, erase_start);
    std::destroy_n(new_end, std::distance(new_end, end_));
    end_ = new_end;
    return erase_start;
  }

  iterator erase(iterator pos) {
    return erase(pos, pos + 1);
  }

  void resize(size_t new_size) {
    if (new_size > capacity()) {
      grow(new_size);
    }
    T* new_end = begin_ + new_size;
    if constexpr (!kHasTrivialElement) {
      if (new_end > end_) {
        std::uninitialized_default_construct(end_, new_end);
      } else {
        std::destroy_n(new_end, end_ - new_end);
      }
    }
    end_ = new_end;
  }

  void resize(size_t new_size, const T& initial_value) {
    if (new_size > capacity()) {
      grow(new_size);
    }
    T* new_end = begin_ + new_size;
    if (new_end > end_) {
      std::uninitialized_fill(end_, new_end, initial_value);
    } else {
      std::destroy_n(new_end, end_ - new_end);
    }
    end_ = new_end;
  }

  void reserve(size_t new_capacity) {
    if (new_capacity > capacity()) {
      grow(new_capacity);
    }
  }

  // Clear without reverting back to inline storage.
  void clear() {
    std::destroy_n(begin_, end_ - begin_);
    end_ = begin_;
  }

 private:
  // Grows the backing store by a factor of two, and at least to {min_capacity}.
  // TODO: Move to private after removing external code using this method.
  MLX_NOINLINE void grow(size_t min_capacity = 0) {
    size_t new_capacity = std::max(min_capacity, 2 * capacity());
    // Round up to power of 2.
    new_capacity--;
    new_capacity |= new_capacity >> 1;
    new_capacity |= new_capacity >> 2;
    new_capacity |= new_capacity >> 4;
    new_capacity |= new_capacity >> 8;
    new_capacity |= new_capacity >> 16;
    if constexpr (sizeof(size_t) == sizeof(uint64_t)) {
      new_capacity |= new_capacity >> 32;
    }
    new_capacity++;

    T* new_storage = allocator_.allocate(new_capacity);
    if (new_storage == nullptr) {
      throw std::bad_alloc();
    }

    size_t in_use = end_ - begin_;
    std::uninitialized_move(begin_, end_, new_storage);
    free_storage();
    begin_ = new_storage;
    end_ = new_storage + in_use;
    end_of_storage_ = new_storage + new_capacity;
  }

  MLX_NOINLINE void free_storage() {
    std::destroy_n(begin_, end_ - begin_);
    if (is_big()) {
      allocator_.deallocate(begin_, end_of_storage_ - begin_);
    }
  }

  // Clear and go back to inline storage. Dynamic storage is *not* freed. For
  // internal use only.
  void reset_to_inline_storage() {
    if constexpr (!kHasTrivialElement) {
      if (!is_big())
        std::destroy_n(begin_, end_ - begin_);
    }
    begin_ = inline_storage_begin();
    end_ = begin_;
    end_of_storage_ = begin_ + kSize;
  }

  bool is_big() const {
    return begin_ != inline_storage_begin();
  }

  T* inline_storage_begin() {
    return reinterpret_cast<T*>(inline_storage_);
  }
  const T* inline_storage_begin() const {
    return reinterpret_cast<const T*>(inline_storage_);
  }

  Allocator allocator_;

  // Invariants:
  // 1. The elements in the range between `begin_` (included) and `end_` (not
  //    included) will be initialized at all times.
  // 2. All other elements outside the range, both in the inline storage and in
  //    the dynamic storage (if it exists), will be uninitialized at all times.

  T* begin_ = inline_storage_begin();
  T* end_ = begin_;
  T* end_of_storage_ = begin_ + kSize;

  alignas(T) char inline_storage_[sizeof(T) * kSize];

  static constexpr bool kHasTrivialElement =
      std::is_trivially_copyable<T>::value &&
      std::is_trivially_destructible<T>::value;
};

// Type trait: true for any SmallVector or std::vector instantiation, false
// for every other type.
template <typename>
struct is_vector : std::false_type {};

template <typename Elem, size_t InlineSize, typename Alloc>
struct is_vector<SmallVector<Elem, InlineSize, Alloc>> : std::true_type {};

template <typename Elem, typename Alloc>
struct is_vector<std::vector<Elem, Alloc>> : std::true_type {};

// Convenience variable template for is_vector<Vec>::value.
template <typename Vec>
inline constexpr bool is_vector_v = is_vector<Vec>::value;

#undef MLX_HAS_BUILTIN
#undef MLX_HAS_ATTRIBUTE
#undef MLX_LIKELY
#undef MLX_UNLIKELY
#undef MLX_NOINLINE

} // namespace mlx::core


================================================
FILE: mlx/stream.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <vector>

#include "mlx/api.h"
#include "mlx/device.h"

namespace mlx::core {

// A stream is described by an integer index and a device. Note that the
// equality operators declared further down in this header compare only
// `index`, not `device`.
struct MLX_API Stream {
  // Integer index of the stream; this is the value get_stream() takes.
  int index;
  // Device associated with the stream.
  Device device;
  // Explicit so that a bare int is never silently converted into a Stream.
  explicit Stream(int index, Device device) : index(index), device(device) {}
};

// Stream query / creation API. These are declarations only; the definitions
// are not part of this header.

/** Get the default stream for the given device. */
MLX_API Stream default_stream(Device d);

/** Make the stream the default for its device. */
MLX_API void set_default_stream(Stream s);

/** Make a new stream on the given device. */
MLX_API Stream new_stream(Device d);

/** Get the stream with the given index. */
MLX_API Stream get_stream(int index);

/** Get all available streams. */
MLX_API std::vector<Stream> get_streams();

// Two streams are considered equal when their indices match; the device
// member does not take part in the comparison.
inline bool operator==(const Stream& lhs, const Stream& rhs) {
  return rhs.index == lhs.index;
}

// Exact negation of operator== above.
inline bool operator!=(const Stream& lhs, const Stream& rhs) {
  return lhs.index != rhs.index;
}

/** Synchronize with the default stream. */
MLX_API void synchronize();

/** Synchronize with the provided stream. */
MLX_API void synchronize(Stream);

} // namespace mlx::core


================================================
FILE: mlx/threadpool.h
================================================
// This code was modified from https://github.com/progschj/ThreadPool
// The original License is copied below:
//
// Copyright (c) 2012 Jakob Progsch, Václav Zeman
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
//    1. The origin of this software must not be misrepresented; you must not
//    claim that you wrote the original software. If you use this software
//    in a product, an acknowledgment in the product documentation would be
//    appreciated but is not required.
//
//    2. Altered source versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.
//
//    3. This notice may not be removed or altered from any source
//    distribution.
#pragma once

#include <condition_variable>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <thread>
#include <vector>

// Fixed-size pool of worker threads that execute queued tasks in FIFO order.
//
// `tasks` and `stop` are guarded by `queue_mutex`. `workers` is only touched
// by the constructor, resize() and the destructor, which are not synchronized
// against each other.
class ThreadPool {
 public:
  // Starts the given number of worker threads.
  ThreadPool(size_t);
  // Queues f(args...) for execution and returns a future for its result.
  // Throws std::runtime_error if the pool is currently stopping.
  template <class F, class... Args>
  auto enqueue(F&& f, Args&&... args)
      -> std::future<typename std::invoke_result_t<F, Args...>>;
  // Changes the number of worker threads. Shrinking stops all workers (after
  // the queue has drained) and starts the requested number afresh.
  void resize(size_t);
  // Drains the queue and joins every worker.
  ~ThreadPool();

 private:
  void stop_and_wait();
  void start_threads(size_t);

  std::vector<std::thread> workers;
  std::queue<std::function<void()>> tasks;
  std::mutex queue_mutex;
  std::condition_variable condition;
  bool stop;
};

inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
  start_threads(threads);
}

template <class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args)
    -> std::future<typename std::invoke_result_t<F, Args...>> {
  using return_type = typename std::invoke_result_t<F, Args...>;

  // std::function requires a copyable callable while packaged_task is
  // move-only, hence the shared_ptr indirection.
  auto task = std::make_shared<std::packaged_task<return_type()>>(
      std::bind(std::forward<F>(f), std::forward<Args>(args)...));

  std::future<return_type> res = task->get_future();
  {
    std::unique_lock<std::mutex> lock(queue_mutex);

    if (stop) {
      throw std::runtime_error(
          "[ThreadPool::enqueue] Not allowed on stopped ThreadPool");
    }

    tasks.emplace([task]() { (*task)(); });
  }
  condition.notify_one();
  return res;
}

inline void ThreadPool::resize(size_t threads) {
  if (workers.size() == threads) {
    return;
  }

  if (workers.size() > threads) {
    // Individual workers cannot be retired, so stop them all; afterwards
    // workers.size() is 0 and the full requested count is started below.
    stop_and_wait();
  }
  start_threads(threads - workers.size());
}

inline ThreadPool::~ThreadPool() {
  stop_and_wait();
}

inline void ThreadPool::stop_and_wait() {
  // Stop the current threads and wait until they finish
  {
    std::unique_lock<std::mutex> lock(queue_mutex);
    stop = true;
  }
  condition.notify_all();
  for (std::thread& worker : workers) {
    worker.join();
  }
  workers.clear();

  // Reset the flag so that the threadpool is reusable. `stop` is read under
  // `queue_mutex` by enqueue(), so it must also be written under the lock to
  // avoid a data race with a concurrent enqueue().
  {
    std::unique_lock<std::mutex> lock(queue_mutex);
    stop = false;
  }
}

inline void ThreadPool::start_threads(size_t threads) {
  for (size_t i = 0; i < threads; ++i) {
    workers.emplace_back([this] {
      for (;;) {
        std::function<void()> task;

        {
          std::unique_lock<std::mutex> lock(this->queue_mutex);
          this->condition.wait(
              lock, [this] { return this->stop || !this->tasks.empty(); });
          // Only exit once the queue is drained, so tasks queued before the
          // stop request still run.
          if (this->stop && this->tasks.empty())
            return;
          task = std::move(this->tasks.front());
          this->tasks.pop();
        }

        // Run the task outside the lock.
        task();
      }
    });
  }
}


================================================
FILE: mlx/transforms.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <deque>
#include <future>
#include <numeric>
#include <set>
#include <sstream>
#include <stack>
#include <unordered_map>
#include <unordered_set>

#include "mlx/backend/cpu/eval.h"
#include "mlx/backend/gpu/eval.h"
#include "mlx/fence.h"
#include "mlx/memory.h"
#include "mlx/ops.h"
#include "mlx/primitives.h"
#include "mlx/scheduler.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"
#include "mlx/utils.h"

namespace mlx::core {

// Threshold on scheduler::n_active_tasks(): once eval_impl sees more active
// tasks than this, it finalizes the open GPU streams and waits for a task to
// finish before scheduling more work.
static constexpr int MAX_ACTIVE_TASKS = 10;

/* This class is only meant to be used in eval
 * for synchronizing with the main thread.
 *
 * eval_impl builds one array with this primitive whose inputs are the arrays
 * being evaluated. Both evaluation hooks are intentionally empty: the
 * primitive performs no computation of its own. */
class Synchronizer : public Primitive {
 public:
  explicit Synchronizer(Stream stream) : Primitive(stream) {}

  // No-ops: there is nothing to compute on either backend.
  void eval_cpu(const std::vector<array>&, std::vector<array>&) override {}
  void eval_gpu(const std::vector<array>&, std::vector<array>&) override {}

  // NOTE(review): the name passed here is "Synchronize", not "Synchronizer";
  // presumably this is the primitive's reported name -- confirm against the
  // DEFINE_NAME macro.
  DEFINE_NAME(Synchronize);
};

// Initialize the static tracing members from transforms_impl.h
//
// These are used to implement the in_tracing() function, which returns true if
// we are currently under a function transformation, and the retain_graph()
// function, which returns true if we are forced to retain the graph during
// evaluation.
// Accessor for the tracing stack. The storage is a function-local static, so
// it is created the first time this function is called.
std::vector<std::pair<char, char>>& detail::InTracing::trace_stack() {
  static std::vector<std::pair<char, char>> stack;
  return stack;
}
// Out-of-class definitions of the static counters; both start at zero.
int detail::InTracing::grad_counter = 0;
int detail::RetainGraph::tracing_counter = 0;

// Schedules the evaluation of `outputs` and everything they depend on.
//
// The function works in three phases:
//   1. A DFS from a Synchronizer array (whose inputs are `outputs`) counts
//      how often each array is reached as an input and records which inputs
//      are consumed on a different stream than the one they are computed on.
//   2. A width-limited BFS turns the graph into a "tape" of unscheduled
//      arrays.
//   3. The tape is consumed from the back: each array is handed to
//      gpu::eval or cpu::eval, with fences / events inserted where an input
//      comes from another stream.
//
// Parameters:
//   outputs - arrays to evaluate; they become the inputs of the synchronizer.
//   async   - when true, an event is attached to every scheduled array (and
//             its siblings), and unscheduled tracer arrays are rejected.
//
// Returns the synchronizer array; eval() calls wait() on it.
//
// Throws std::invalid_argument when an unscheduled tracer is encountered in
// async mode or a primitive-less tracer is encountered, and
// std::runtime_error for any other unscheduled array without a primitive.
array eval_impl(std::vector<array> outputs, bool async) {
  std::deque<array> tape;

  // Make an effort to choose a good output stream
  Stream stream = default_stream(default_device());
  for (auto& o : outputs) {
    if (o.status() == array::Status::unscheduled && o.has_primitive()) {
      stream = o.primitive().stream();
      break;
    }
  }

  // Map of array id that needs fence and stream it's computed on
  // (the bool records whether any consumer is on a different device).
  std::unordered_map<uintptr_t, std::pair<uint32_t, bool>> needs_fence;

  auto synchronizer = array(
      {}, bool_, std::make_shared<Synchronizer>(stream), std::move(outputs));

  // Stream fences for inter-stream synchronization
  std::unordered_map<uint32_t, Fence> fences;

  // Stream events for synchronization after eval
  std::unordered_map<uint32_t, Event> events;
  {
    auto e = Event{stream};
    e.set_value(1);
    synchronizer.attach_event(e);
    events.emplace(stream.index, std::move(e));
  }

  {
    // Record the degree of each input
    std::unordered_map<std::uintptr_t, int> cache;

    // Iterative DFS; each stack entry is (array, index of next input to
    // visit).
    std::stack<std::pair<std::reference_wrapper<array>, int>> dfs;
    dfs.emplace(synchronizer, 0);
    while (!dfs.empty()) {
      auto& [a_ref, idx] = dfs.top();
      auto& a = a_ref.get();

      if (idx < a.inputs().size()) {
        // Add an input, and continue
        auto& in = a.inputs()[idx++];

        if (in.status() == array::Status::unscheduled) {
          if (async && in.is_tracer()) {
            throw std::invalid_argument(
                "[async_eval] Not allowed inside a graph transformation.");
          }
          if (!in.has_primitive()) {
            if (in.is_tracer()) {
              throw std::invalid_argument(
                  "[eval] Attempting to eval an array during function"
                  " transformations like compile or vmap is not allowed.");
            }
            throw std::runtime_error(
                "[eval] Attempting to eval an array without a primitive.\n"
                "If you are compiling a function, make sure all the inputs "
                "and outputs are captured:\n"
                "https://ml-explore.github.io/mlx/build/html/usage/compile.html#pure-functions.\n"
                "If you are not using compile, this may be a bug. "
                "Please file an issue here:\n"
                "https://github.com/ml-explore/mlx/issues.");
          }
          // The consumer runs on a different stream than its input: remember
          // that the input needs a fence.
          if (a.primitive().stream() != in.primitive().stream()) {
            bool device_switch =
                a.primitive().stream().device != in.primitive().stream().device;
            auto [it, inserted] = needs_fence.emplace(
                in.id(),
                std::make_pair(in.primitive().stream().index, device_switch));
            if (!inserted) {
              it->second.second |= device_switch;
            }
          }
        }

        // All siblings have the same degree
        auto cache_it = cache.find(in.id());
        if (cache_it == cache.end()) {
          // First visit: descend into this input.
          dfs.emplace(in, 0);
          cache.insert({in.id(), 1});
          for (auto& s : in.siblings()) {
            cache.insert({s.id(), 1});
          }
        } else {
          // Already visited: only bump the counts.
          cache_it->second++;
          for (auto& s : in.siblings()) {
            cache[s.id()]++;
          }
        }
        continue;
      }
      if ((a.status() != array::Status::unscheduled) && !a.is_tracer() &&
          a.has_primitive()) {
        // If the array is evaluated and is no longer a tracer, detach it
        a.detach();
      }
      dfs.pop();
    }

    // Build the tape in BFS order with a width limit
    int max_width = env::bfs_max_width();
    dfs = std::stack<std::pair<std::reference_wrapper<array>, int>>();
    tape.push_back(synchronizer);
    // `i` indexes the next tape entry to expand; once the tape is exhausted,
    // arrays parked on `dfs` are resumed from their saved input index.
    for (int i = 0; !cache.empty() && (i < tape.size() || !dfs.empty());) {
      auto& a = (i >= tape.size()) ? dfs.top().first.get() : tape[i];
      int j = 0;
      if (i >= tape.size()) {
        j = dfs.top().second;
        dfs.pop();
      } else {
        i++;
      }
      for (; j < a.inputs().size(); ++j) {
        auto& in = a.inputs()[j];
        if (in.status() != array::Status::unscheduled) {
          continue;
        }

        // If the width limit is exceeded, push the array on the stack
        // and go down a level
        if ((tape.size() - i) >= max_width) {
          dfs.emplace(a, j);
          break;
        }

        auto it = cache.find(in.id());
        it->second -= 1;

        // The input still has consumers that were not expanded yet; it is
        // added to the tape only when the last one is reached.
        if (it->second != 0) {
          for (auto& s : in.siblings()) {
            cache[s.id()] -= 1;
          }
          continue;
        }

        // Remove input and siblings from cache
        cache.erase(it);
        for (auto& s : in.siblings()) {
          cache.erase(s.id());
        }

        tape.push_back(in);
      }
    }
  }

  // Indices of the streams that received work during this call.
  std::unordered_set<int> open_streams;
  // Consume the tape from the back, so arrays pushed last are handled first.
  while (!tape.empty()) {
    auto arr = std::move(tape.back());
    tape.pop_back();

    // Note: shadows the outer `stream` chosen for the synchronizer.
    auto stream = arr.primitive().stream();
    open_streams.insert(stream.index);

    if (async) {
      // Lookup corresponding event
      auto e = events.find(stream.index);
      if (e == events.end()) {
        e = events.emplace(stream.index, Event{stream}).first;
      }
      e->second.set_value(1);
      arr.attach_event(e->second);
      for (auto& s : arr.siblings()) {
        s.attach_event(e->second);
      }
    }

    for (auto& in : arr.inputs()) {
      if (auto it = needs_fence.find(in.id()); it != needs_fence.end()) {
        // Use fence to wait within a single eval
        // Get the input array's stream fence and wait on the
        // output arrays stream
        fences[it->second.first].wait(stream, in);
      } else if (in.event().valid()) {
        if (in.event().is_signaled()) {
          in.detach_event();
        } else if (in.event().stream() != stream) {
          // Use event to wait across async eval
          in.event().wait(stream);
        }
      }
    }

    // Dispatch to the backend selected by the primitive's device.
    if (arr.primitive().device() == Device::gpu) {
      gpu::eval(arr);
    } else {
      cpu::eval(arr);
    }

    // Throttle: too many active tasks, or memory use above the limit while
    // tasks are still active.
    if (scheduler::n_active_tasks() > MAX_ACTIVE_TASKS ||
        (get_active_memory() > get_memory_limit() &&
         scheduler::n_active_tasks() > 0)) {
      // Commit any open streams
      for (auto i : open_streams) {
        auto s = get_stream(i);
        if (s.device == Device::gpu) {
          gpu::finalize(s);
        }
      }
      scheduler::wait_for_one();
      while (get_active_memory() > get_memory_limit() &&
             scheduler::n_active_tasks() > 0) {
        scheduler::wait_for_one();
      }
    }

    // Updates this stream's fence (creating it on first use) for arrays that
    // a consumer on another stream waits on.
    auto maybe_update_fence = [&fences, &needs_fence, stream](const array& a) {
      if (auto nf = needs_fence.find(a.id()); nf != needs_fence.end()) {
        auto it = fences.find(stream.index);
        if (it == fences.end()) {
          it = fences.emplace(stream.index, Fence{stream}).first;
        }
        it->second.update(stream, a, nf->second.second);
      }
    };

    arr.set_status(array::Status::evaluated);
    // TODO Maybe always want the fence coherent kernel in the same cbuf
    // as the other kernels?
    maybe_update_fence(arr);
    for (auto& sib : arr.siblings()) {
      sib.set_status(array::Status::evaluated);
      maybe_update_fence(sib);
    }
    if (!arr.is_tracer()) {
      arr.detach();
    }
  }

  // Signal the event in its stream
  for (auto i : open_streams) {
    auto s = get_stream(i);
    if (auto e = events.find(i); e != events.end()) {
      e->second.signal(s);
    }
    if (s.device == Device::gpu) {
      gpu::finalize(s);
    }
  }

  return synchronizer;
}

// Schedules evaluation of `outputs` without waiting for completion. Returns
// immediately when there are no outputs or none of them is still unscheduled.
void async_eval(std::vector<array> outputs) {
  auto is_unscheduled = [](array& x) {
    return x.status() == array::Status::unscheduled;
  };

  // any_of is false for an empty vector, which covers the "no outputs" case.
  const bool has_pending =
      std::any_of(outputs.begin(), outputs.end(), is_unscheduled);
  if (!has_pending) {
    return;
  }

  eval_impl(std::move(outputs), true);
}

void eval(std::vector<array> outputs) {
  if (outputs.empty()) {
    return;
  }

  if (std::none_of(outputs.begin(), outputs.end(), [](array& x) {
        return x.status() == array::Status::unscheduled;
      })) {
    for (auto& x : outputs) {
      x.wait();
    }
    return;
  }

  eval_impl(std::move(outputs), false).wait();
}

// Computes the outputs of `fun` at `primals` together with the
// vector-Jacobian products of `cotans` with respect to the primals selected
// by `argnums`.
//
// Returns {outputs, vjps}, where `vjps` holds one array per entry of
// `argnums`. A selected primal that never received a cotangent gets
// `zeros_like(primal)`.
//
// Throws std::invalid_argument if there are fewer cotangents than outputs
// that need one, or if a cotangent's shape differs from its output's shape.
//
// The scan over `argnums` below only advances when `argnums[j] == i`, so
// `argnums` has to be sorted in ascending order for every entry to be picked
// up when deciding which primals require a gradient.
std::pair<std::vector<array>, std::vector<array>> vjp(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& primals,
    const std::vector<array>& cotans,
    const std::vector<int>& argnums) {
  // Set the global tracing flag.
  detail::InTracing in_tracing{false, true};

  // Make tracers from given primals
  std::vector<array> primals_;
  for (auto& p : primals) {
    auto s = p.has_primitive() ? p.primitive().stream()
                               : default_stream(default_device());
    primals_.push_back(copy(p, s)); // Does not do a deep copy
    primals_.back().set_tracer(true);
  }

  // Pass tracer primals through the function
  // Any variables that depend on the primals are marked as tracers
  auto outputs = fun(primals_);

  // Map outputs to passed cotans while ignoring the outputs
  // that have stop_gradient called on them
  // NOTE(review): only the "too few cotangents" case is detected here; surplus
  // entries in `cotans` are never looked at. Confirm that this is intended.
  int cotan_index = 0;
  std::vector<std::pair<int, int>> output_cotan_pairs;
  for (int i = 0; i < outputs.size(); ++i) {
    auto& out = outputs[i];
    if (out.has_primitive()) {
      if (auto& p = out.primitive(); typeid(p) == typeid(StopGradient)) {
        continue;
      }
    }
    if (cotan_index >= cotans.size()) {
      std::ostringstream msg;
      msg << "[vjp] Number of outputs to compute gradients for ("
          << outputs.size() << ") does not match number of cotangents ("
          << cotans.size() << ").";
      throw std::invalid_argument(msg.str());
    }
    if (out.shape() != cotans[cotan_index].shape()) {
      std::ostringstream msg;
      msg << "[vjp] Output shape " << out.shape()
          << " does not match cotangent shape " << cotans[cotan_index].shape()
          << ".";
      if (outputs.size() == 1 && out.size() == 1) {
        msg << " If you are using grad your function must return a scalar.";
      }
      throw std::invalid_argument(msg.str());
    }
    output_cotan_pairs.emplace_back(i, cotan_index++);
  }

  // Topologically sort the compute graph, add graph nodes
  // to the tape which need a gradient.
  // `cache` holds the ids of visited arrays; seeding it with the primals makes
  // the traversal below stop at them. `calc_grad` holds the ids of arrays that
  // depend on a primal selected by `argnums`.
  std::unordered_set<std::uintptr_t> cache;
  std::unordered_set<std::uintptr_t> calc_grad;
  for (int i = 0, j = 0; i < primals_.size(); ++i) {
    auto& primal = primals_[i];
    primal.set_tracer(false);
    cache.insert(primal.id());
    if (j < argnums.size() && argnums[j] == i) {
      j++;
      calc_grad.insert(primal.id());
    }
  }

  std::vector<array> tape;

  // Depth-first traversal: inputs are visited before the array itself, so the
  // tape ends up with producers ahead of their consumers.
  std::function<void(array&)> recurse;
  recurse = [&](auto& a) {
    // Check if visited and add to cache if not
    if (auto inserted = cache.insert(a.id()); !inserted.second) {
      return;
    }
    a.set_tracer(false);
    for (auto& s : a.siblings()) {
      s.set_tracer(false);
      cache.insert(s.id());
    }

    for (auto& input : a.inputs()) {
      recurse(input);
    }

    // Stop grad
    if (a.has_primitive()) {
      if (auto& p = a.primitive(); typeid(p) == typeid(StopGradient)) {
        return;
      }
    }

    // Calculate gradient if any inputs require gradient
    for (auto& input : a.inputs()) {
      if (calc_grad.find(input.id()) != calc_grad.end()) {
        tape.push_back(a);
        calc_grad.insert(a.id());
        for (auto& s : a.siblings()) {
          calc_grad.insert(s.id());
        }
        break;
      }
    }
  };

  for (auto out : outputs) {
    recurse(out);
  }

  // Run the tape backwards, computing vector-jacobian
  // products for each primitive
  // Seed the map with the user-provided cotangents, cast to the dtype of the
  // output they belong to.
  // NOTE(review): insert() keeps an existing entry, so if two outputs share an
  // id only the first cotangent is recorded. Confirm this is intended.
  std::unordered_map<std::uintptr_t, array> cotan_map;
  for (auto [out_idx, cotan_idx] : output_cotan_pairs) {
    auto& o = outputs[out_idx];
    auto s = o.has_primitive() ? o.primitive().stream()
                               : default_stream(default_device());
    cotan_map.insert({o.id(), astype(cotans[cotan_idx], o.dtype(), s)});
  }
  for (auto it = tape.rbegin(); it != tape.rend(); ++it) {
    auto& a = *it;

    // Get the arguments whose gradients are needed
    // (this local `argnums` shadows the function parameter of the same name)
    std::vector<int> argnums;
    for (int i = 0; i < a.inputs().size(); ++i) {
      if (calc_grad.find(a.inputs()[i].id()) != calc_grad.end()) {
        argnums.push_back(i);
      }
    }

    // Check if any of the array or its siblings have cotangents,
    // if not, we can skip this primitive
    // (this local `outputs` shadows the outputs of `fun`)
    auto outputs = a.outputs();
    bool has_cotans =
        std::any_of(outputs.cbegin(), outputs.cend(), [&cotan_map](auto& s) {
          return cotan_map.find(s.id()) != cotan_map.end();
        });
    if (!has_cotans) {
      continue;
    }

    // Collect one cotangent per output of the primitive. Cotangents are
    // removed from the map as they are consumed; outputs without one get zeros.
    auto s = a.primitive().stream();
    std::vector<array> cotangents{};
    for (auto& o : outputs) {
      if (auto cotan_it = cotan_map.find(o.id()); cotan_it != cotan_map.end()) {
        cotangents.push_back(cotan_map.extract(cotan_it).mapped());
      } else {
        cotangents.push_back(zeros_like(o, s));
      }
    }

    std::vector<array> vjps;
    {
      // The RetainGraph guard is alive only while the primitive's vjp runs.
      detail::RetainGraph retain;
      vjps = a.primitive().vjp(a.inputs(), cotangents, argnums, outputs);
    }
    // Accumulate the vector-jacobian products for each input
    for (int i = 0; i < argnums.size(); ++i) {
      auto in_id = a.inputs()[argnums[i]].id();
      if (auto cotan_it = cotan_map.find(in_id); cotan_it != cotan_map.end()) {
        cotan_it->second = add(cotan_it->second, vjps[i], s);
      } else {
        cotan_map.insert({in_id, vjps[i]});
      }
    }
  }
  // Gather the accumulated cotangents of the requested primals, in the order
  // given by `argnums`.
  std::vector<array> vjps;
  for (auto arg : argnums) {
    auto& primal = primals_[arg];
    if (auto cotan_it = cotan_map.find(primal.id());
        cotan_it != cotan_map.end()) {
      vjps.push_back(cotan_it->second);
    } else {
      auto s = primal.has_primitive() ? primal.primitive().stream()
                                      : default_stream(default_device());
      vjps.push_back(zeros_like(primal, s));
    }
  }
  return {outputs, vjps};
}

// Overload of `vjp` that differentiates with respect to every primal: it
// forwards to the `argnums` overload with the indices 0 .. primals.size() - 1.
std::pair<std::vector<array>, std::vector<array>> vjp(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& primals,
    const std::vector<array>& cotans) {
  std::vector<int> all_argnums;
  all_argnums.reserve(primals.size());
  for (int idx = 0; idx < static_cast<int>(primals.size()); ++idx) {
    all_argnums.push_back(idx);
  }
  return vjp(fun, primals, cotans, all_argnums);
}

// Unary convenience overload of `vjp`: wraps `fun`, `primal` and `cotan` into
// the vector-based interface and unwraps the single output and single VJP.
std::pair<array, array> vjp(
    const std::function<array(const array&)>& fun,
    const array& primal,
    const array& cotan) {
  // Adapt the unary function to the vector-in / vector-out signature.
  auto as_vector_fun = [fun](const std::vector<array>& args) {
    return std::vector<array>{fun(args[0])};
  };
  auto result = vjp(as_vector_fun, {primal}, {cotan});
  return {result.first[0], result.second[0]};
}

// Computes the outputs of `fun` at `primals` together with the
// Jacobian-vector products obtained by pushing `tangents` forward through the
// traced graph.
//
// Returns {outputs, jvps} with one JVP per output. An output for which no
// tangent was propagated gets `zeros_like(output)`.
//
// Throws std::invalid_argument if the number of tangents differs from the
// number of primals, or if a tangent's shape differs from its primal's shape.
std::pair<std::vector<array>, std::vector<array>> jvp(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& primals,
    const std::vector<array>& tangents) {
  // Set the global tracing flag.
  detail::InTracing in_tracing{false, true};

  if (primals.size() != tangents.size()) {
    throw std::invalid_argument(
        "[jvp] Number of inputs does not match number of tangents.");
  }
  for (int i = 0; i < primals.size(); ++i) {
    if (primals[i].shape() != tangents[i].shape()) {
      throw std::invalid_argument(
          "[jvp] Input shape does not match shape of tangent.");
    }
  }

  // Make tracers from the given primals and run the function on them.
  std::vector<array> primals_;
  for (auto& p : primals) {
    auto s = p.has_primitive() ? p.primitive().stream()
                               : default_stream(default_device());
    primals_.push_back(copy(p, s)); // Does not do a deep copy
    primals_.back().set_tracer(true);
  }
  auto outputs = fun(primals_);

  // Topologically sort the compute graph, record outputs
  // in the tape if a gradient is needed.
  // `cache` holds the ids of visited arrays (seeded with the primals so the
  // traversal stops there); `calc_grad` holds the ids of arrays that depend on
  // a primal. Every primal is treated as requiring a tangent.
  std::unordered_set<std::uintptr_t> cache;
  std::unordered_set<std::uintptr_t> calc_grad;
  for (auto& primal : primals_) {
    primal.set_tracer(false);
    calc_grad.insert(primal.id());
    cache.insert(primal.id());
  }

  std::vector<array> tape;

  // Depth-first traversal: inputs are visited before the array itself, so the
  // tape ends up with producers ahead of their consumers.
  std::function<void(array&)> recurse;
  recurse = [&](auto& a) {
    // Check if visited and add to cache if not
    if (auto inserted = cache.insert(a.id()); !inserted.second) {
      return;
    }
    a.set_tracer(false);
    for (auto& s : a.siblings()) {
      s.set_tracer(false);
      cache.insert(s.id());
    }

    for (auto input : a.inputs()) {
      recurse(input);
    }

    // Stop grad
    if (a.has_primitive()) {
      if (auto& p = a.primitive(); typeid(p) == typeid(StopGradient)) {
        return;
      }
    }

    // Calculate gradient if any inputs require gradient
    for (auto& input : a.inputs()) {
      if (calc_grad.find(input.id()) != calc_grad.end()) {
        tape.push_back(a);
        calc_grad.insert(a.id());
        for (auto& s : a.siblings()) {
          calc_grad.insert(s.id());
        }
        break;
      }
    }
  };

  for (auto out : outputs) {
    recurse(out);
  }

  // Seed the tangent map with the user-provided tangents of the primals.
  std::unordered_map<std::uintptr_t, array> tan_map;
  for (int i = 0; i < primals_.size(); ++i) {
    tan_map.insert({primals_[i].id(), tangents[i]});
  }

  // Walk the tape in forward order, propagating tangents through each
  // primitive.
  for (auto& a : tape) {
    // Get the arguments used in the jvp
    // (this local `tangents` shadows the function parameter of the same name)
    std::vector<int> argnums;
    std::vector<array> tangents;
    for (int i = 0; i < a.inputs().size(); ++i) {
      if (auto it = tan_map.find(a.inputs()[i].id()); it != tan_map.end()) {
        argnums.push_back(i);
        tangents.push_back(it->second);
      }
    }

    // Record the tangent of every output of the primitive
    // (this local `outputs` shadows the outputs of `fun`).
    auto jvps = a.primitive().jvp(a.inputs(), tangents, argnums);
    auto outputs = a.outputs();
    for (int i = 0; i < jvps.size(); ++i) {
      tan_map.insert({outputs[i].id(), jvps[i]});
    }
  }

  // Collect the tangent of each function output, falling back to zeros when
  // none was propagated to it.
  std::vector<array> jvps;
  for (auto& out : outputs) {
    if (auto it = tan_map.find(out.id()); it != tan_map.end()) {
      jvps.push_back(it->second);
    } else {
      auto s = out.has_primitive() ? out.primitive().stream()
                                   : default_stream(default_device());
      jvps.push_back(zeros_like(out, s));
    }
  }
  return {outputs, jvps};
}

// Unary convenience overload of `jvp`: wraps `fun`, `primal` and `tangent`
// into the vector-based interface and unwraps the single output and JVP.
std::pair<array, array> jvp(
    const std::function<array(const array&)>& fun,
    const array& primal,
    const array& tangent) {
  // Adapt the unary function to the vector-in / vector-out signature.
  auto as_vector_fun = [fun](const std::vector<array>& args) {
    return std::vector<array>{fun(args[0])};
  };
  auto result = jvp(as_vector_fun, {primal}, {tangent});
  return {result.first[0], result.second[0]};
}

// Builds a function that returns the outputs of `fun` together with the
// gradients of its first output with respect to the inputs selected by
// `argnums`.
//
// Throws std::invalid_argument immediately if `argnums` is empty. The returned
// function throws std::invalid_argument if, after adding the number of inputs
// to negative entries, `argnums` contains duplicates or an index outside
// [0, inputs.size()).
//
// Gradients are returned in ascending order of the normalized indices. All
// outputs of `fun` except the first are wrapped in `stop_gradient`.
ValueAndGradFn value_and_grad(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<int>& argnums) {
  if (argnums.empty()) {
    throw std::invalid_argument("[grad] Must specify at least one argument.");
  }
  return [fun, argnums](const std::vector<array>& inputs) {
    // A std::set both removes duplicates and sorts the normalized indices.
    std::set<int> normalized;
    for (auto& requested : argnums) {
      normalized.insert(requested < 0 ? requested + inputs.size() : requested);
    }
    if (normalized.size() != argnums.size()) {
      throw std::invalid_argument(
          "[grad] Repeat argument number not allowed in grad.");
    }
    // The set is ordered, so checking its two ends covers every index.
    if (*normalized.begin() < 0 || *normalized.rbegin() >= inputs.size()) {
      std::ostringstream msg;
      msg << "[grad] Invalid argument number for function with "
          << inputs.size() << " inputs.";
      throw std::invalid_argument(msg.str());
    }
    std::vector<int> sorted_argnums(normalized.begin(), normalized.end());

    // Only the first output is differentiated; the remaining outputs are
    // passed through stop_gradient so that vjp ignores them.
    auto gfun = [&fun](const std::vector<array>& fun_inputs) {
      auto results = fun(fun_inputs);
      for (int k = 1; k < results.size(); k++) {
        auto& extra = results[k];
        auto s = extra.has_primitive() ? extra.primitive().stream()
                                       : default_stream(default_device());
        results[k] = stop_gradient(extra, s);
      }
      return results;
    };

    // Set the incoming gradient to float32, vjp will cast it to the output type
    return vjp(gfun, inputs, {array(1.0f)}, sorted_argnums);
  };
}

namespace detail {

// Traces `fun` on placeholder inputs for vmap.
//
// For every input whose entry in `in_axes` is not -1, a placeholder array with
// the mapped axis removed is created and marked as a tracer; inputs with an
// axis of -1 are passed through unchanged. Returns {placeholder inputs,
// outputs of `fun` evaluated on them}.
//
// Throws std::invalid_argument if the number of axes does not match the number
// of inputs, if a mapped input has zero dimensions, if a mapped axis is not a
// valid axis of its input, or if the mapped axes do not all have the same
// size.
std::pair<std::vector<array>, std::vector<array>> vmap_trace(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& inputs,
    const std::vector<int>& in_axes) {
  // Set the global tracing flag.
  detail::InTracing in_tracing;

  if (in_axes.size() != inputs.size()) {
    std::stringstream ss;
    ss << "[vmap] The number of in axes (" << in_axes.size()
       << ") must match the number of inputs (" << inputs.size() << ").";
    throw std::invalid_argument(ss.str());
  }

  // Some error checking and get the vmap axis size
  size_t vmap_ax_size = 0;
  for (int i = 0; i < inputs.size(); ++i) {
    if (in_axes[i] != -1) {
      if (inputs[i].ndim() == 0) {
        throw std::invalid_argument(
            "[vmap] Cannot vmap an input with zero dimensions.");
      }
      // Valid axes are 0 .. ndim() - 1, so an axis equal to ndim() has to be
      // rejected here as well instead of failing later in shape().
      if (in_axes[i] >= inputs[i].ndim()) {
        std::ostringstream msg;
        msg << "[vmap] Axis " << in_axes[i] << " invalid for input with "
            << inputs[i].ndim() << " dimensions.";
        throw std::invalid_argument(msg.str());
      }
      vmap_ax_size = inputs[i].shape(in_axes[i]);
    }
  }
  // Check that all vmapped axes have the same size
  for (int i = 0; i < inputs.size(); ++i) {
    if (in_axes[i] != -1) {
      if (size_t in_ax = inputs[i].shape(in_axes[i]); vmap_ax_size != in_ax) {
        std::ostringstream msg;
        msg << "[vmap] Inconsistent axis sizes: " << in_ax << " and "
            << vmap_ax_size << ".";
        throw std::invalid_argument(msg.str());
      }
    }
  }

  // Run the function on placeholder inputs
  // to get the original graph
  std::vector<array> s_inputs;
  for (int i = 0; i < inputs.size(); ++i) {
    if (in_axes[i] != -1) {
      // The placeholder has the shape of a single slice along the mapped axis.
      auto shape = inputs[i].shape();
      shape.erase(shape.begin() + in_axes[i]);
      array in(shape, inputs[i].dtype(), nullptr, {});
      s_inputs.push_back(in);
      s_inputs.back().set_tracer(true);
    } else {
      s_inputs.push_back(inputs[i]);
    }
  }
  return {s_inputs, fun(s_inputs)};
}

// Rebuilds the graph traced by `vmap_trace` on the real (batched) inputs.
//
// `s_inputs` / `s_outputs` are the placeholder inputs and traced outputs;
// `inputs` are the actual arrays with `in_axes` giving the mapped axis of each
// (-1 for inputs that are not mapped). Every primitive that depends on a
// mapped input is replaced by the result of its `vmap` implementation, and
// each output is returned with its mapped axis moved to `out_axes[i]`.
//
// Throws std::invalid_argument if the number of out axes does not match the
// number of outputs, if no entry of `in_axes` is >= 0, or if an output axis is
// not valid for the corresponding vmapped output.
std::vector<array> vmap_replace(
    const std::vector<array>& inputs,
    const std::vector<array>& s_inputs,
    const std::vector<array>& s_outputs,
    const std::vector<int>& in_axes,
    const std::vector<int>& out_axes) {
  if (out_axes.size() != s_outputs.size()) {
    std::stringstream msg;
    msg << "[vmap] The number of out axes (" << out_axes.size()
        << ") must match the number of outputs (" << s_outputs.size() << ").";
    throw std::invalid_argument(msg.str());
  }

  // Size of the mapped axis, taken from the first input with an axis >= 0.
  // NOTE(review): this test is `>= 0` while the rest of the function treats
  // every axis `!= -1` as mapped; confirm the difference is intentional.
  int vmap_size = -1;
  for (int i = 0; i < inputs.size(); ++i) {
    if (in_axes[i] >= 0) {
      vmap_size = inputs[i].shape(in_axes[i]);
      break;
    }
  }
  if (vmap_size == -1) {
    throw std::invalid_argument("At least one of in_axes must be non-None.");
  }

  // tmap: id of a traced array -> (its vmapped replacement, mapped axis).
  // needs_vmap: ids of traced arrays that depend on a mapped input.
  // cache: ids of arrays already visited by the traversal below.
  std::unordered_map<std::uintptr_t, std::pair<array, int>> tmap;
  std::unordered_set<std::uintptr_t> needs_vmap;
  std::unordered_set<std::uintptr_t> cache;
  for (int i = 0; i < s_inputs.size(); ++i) {
    auto in = s_inputs[i];
    if (in_axes[i] != -1) {
      tmap.insert({in.id(), {inputs[i], in_axes[i]}});
      needs_vmap.insert(in.id());
      in.set_tracer(false);
    }
    cache.insert(in.id());
  }

  // Topologically sort the graph
  std::vector<array> tape;

  std::function<void(const array&)> recurse;

  // Depth-first traversal: inputs are visited before the array itself, so the
  // tape ends up with producers ahead of their consumers.
  recurse = [&](const array& a) {
    auto id = a.id();
    if (cache.find(id) != cache.end()) {
      return;
    }
    cache.insert(id);
    for (auto& s : a.siblings()) {
      cache.insert(s.id());
    }

    // Recurse on inputs
    for (auto& input : a.inputs()) {
      recurse(input);
    }
    // If any input needs a vmap, then the outputs also need
    // a vmap
    for (auto& input : a.inputs()) {
      if (needs_vmap.find(input.id()) != needs_vmap.end()) {
        tape.push_back(a);
        tape.back().set_tracer(false);
        needs_vmap.insert(a.id());
        for (auto s : a.siblings()) {
          needs_vmap.insert(s.id());
          s.set_tracer(false);
        }
        break;
      }
    }
  };

  // Outputs without a primitive are not traversed.
  for (auto& out : s_outputs) {
    if (out.has_primitive()) {
      recurse(out);
    }
  }

  // Transform each primitive in the graph with
  // its vmap implementation
  for (auto& a : tape) {
    // Use the vmapped replacement of an input when one exists; otherwise pass
    // the input through with axis -1.
    std::vector<array> v_inputs;
    std::vector<int> v_axes;
    for (auto& in : a.inputs()) {
      auto map_it = tmap.find(in.id());
      if (map_it != tmap.end()) {
        v_inputs.push_back(map_it->second.first);
        v_axes.push_back(map_it->second.second);
      } else {
        v_inputs.push_back(in);
        v_axes.push_back(-1);
      }
    }

    auto [v_outputs, v_out_axes] = a.primitive().vmap(v_inputs, v_axes);

    // For each primitive's outputs add its id, the vout id and the vax
    auto outputs = a.outputs();
    for (int i = 0; i < v_outputs.size(); ++i) {
      tmap.insert({outputs[i].id(), {v_outputs[i], v_out_axes[i]}});
    }
  }

  // Populate the outputs and make sure all the output axes are
  // in the right place
  std::vector<array> outputs;
  for (int i = 0; i < s_outputs.size(); ++i) {
    if (auto map_it = tmap.find(s_outputs[i].id()); map_it != tmap.end()) {
      auto& [out, vdim] = map_it->second;
      if (vdim != out_axes[i]) {
        if (out_axes[i] >= out.ndim()) {
          std::ostringstream msg;
          msg << "[vmap] Axis " << out_axes[i] << " invalid for output with "
              << out.ndim() << " dimensions.";
          throw std::invalid_argument(msg.str());
        }
        out = moveaxis(out, vdim, out_axes[i]);
      }
      outputs.push_back(out);
    } else {
      // When the output has no input dependencies
      // use the size of the vmapped axis in the inputs to expand the output
      array output = expand_dims(s_outputs[i], out_axes[i]);
      output = repeat(output, vmap_size, out_axes[i]);
      outputs.push_back(output);
    }
  }
  return outputs;
}

} // namespace detail

// Returns a vectorized version of `fun`.
//
// `in_axes` / `out_axes` give the mapped axis of each input / output. When a
// vector is empty, axis 0 is used for every input (respectively every output)
// of the call being made.
//
// Throws std::invalid_argument when exactly one of `in_axes` and `out_axes` is
// non-empty with only negative entries. Errors detected while tracing and
// replacing the graph are raised when the returned function is called.
std::function<std::vector<array>(const std::vector<array>&)> vmap(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<int>& in_axes /* = {} */,
    const std::vector<int>& out_axes /* = {} */) {
  auto infer_axes = [](auto axes) {
    return !axes.empty() &&
        std::all_of(axes.begin(), axes.end(), [](int ax) { return ax < 0; });
  };
  if (infer_axes(in_axes) != infer_axes(out_axes)) {
    throw std::invalid_argument(
        "[vmap] Input (or output) axes must be "
        "specified if output (or input) axes are.");
  }
  auto vfun = [fun, in_axes = in_axes, out_axes = out_axes](
                  const std::vector<array>& inputs) {
    // Resolve the default axes into per-call copies. Writing them back into
    // the captured vectors would make the defaults computed for the first call
    // stick, breaking later calls with a different number of inputs/outputs.
    auto call_in_axes = in_axes;
    if (call_in_axes.empty()) {
      call_in_axes.resize(inputs.size(), 0);
    }

    auto [trace_inputs, trace_outputs] =
        detail::vmap_trace(fun, inputs, call_in_axes);

    auto call_out_axes = out_axes;
    if (call_out_axes.empty()) {
      call_out_axes.resize(trace_outputs.size(), 0);
    }

    return detail::vmap_replace(
        inputs, trace_inputs, trace_outputs, call_in_axes, call_out_axes);
  };

  return vfun;
}

// Binary convenience overload of `vmap`: adapts `fun` to the vector-based
// interface, vectorizes it over the given axes and unwraps the single result.
std::function<array(const array&, const array&)> vmap(
    const std::function<array(const array&, const array&)>& fun,
    int in_axis_a /* = 0 */,
    int in_axis_b /* = 0 */,
    int out_axis /* = 0 */) {
  // Vector-in / vector-out adapter around the binary function.
  auto as_vector_fun = [fun](const std::vector<array>& args) {
    return std::vector<array>{fun(args[0], args[1])};
  };
  auto vectorized = vmap(
      as_vector_fun,
      std::vector<int>{in_axis_a, in_axis_b},
      std::vector<int>{out_axis});

  return [vectorized](const array& a, const array& b) {
    auto results = vectorized({a, b});
    return results[0];
  };
}

// Unary convenience overload of `vmap`: adapts `fun` to the vector-based
// interface, vectorizes it over the given axes and unwraps the single result.
std::function<array(const array&)> vmap(
    const std::function<array(const array&)>& fun,
    int in_axis /* = 0 */,
    int out_axis /* = 0 */) {
  // Vector-in / vector-out adapter around the unary function.
  auto as_vector_fun = [fun](const std::vector<array>& args) {
    return std::vector<array>{fun(args[0])};
  };
  auto vectorized = vmap(
      as_vector_fun, std::vector<int>{in_axis}, std::vector<int>{out_axis});

  return [vectorized](const array& a) {
    auto results = vectorized({a});
    return results[0];
  };
}

// Wraps `fun` so that its vjp, jvp and vmap transformations can be overridden.
//
// If none of `fun_vjp`, `fun_jvp` and `fun_vmap` is provided, `fun` is
// returned unchanged. Otherwise the returned function evaluates `fun`, wraps
// each output in `stop_gradient` and creates the result arrays through a
// `CustomTransforms` primitive whose inputs are the call arguments followed by
// those outputs. Any transformation that was not provided falls back to
// applying `vjp`, `jvp` or `vmap` to `fun` itself.
std::function<std::vector<array>(const std::vector<array>&)> custom_function(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    std::optional<std::function<std::vector<array>(
        const std::vector<array>&,
        const std::vector<array>&,
        const std::vector<array>&)>> fun_vjp /* = std::nullopt */,
    std::optional<std::function<std::vector<array>(
        const std::vector<array>&,
        const std::vector<array>&,
        const std::vector<int>&)>> fun_jvp /* = std::nullopt */,
    std::optional<std::function<std::pair<std::vector<array>, std::vector<int>>(
        const std::vector<array>&,
        const std::vector<int>&)>> fun_vmap /* = std::nullopt */) {
  // Nothing to customize: hand back the function as is.
  if (!(fun_vjp.has_value() || fun_jvp.has_value() || fun_vmap.has_value())) {
    return fun;
  }

  return [fun = std::move(fun),
          fun_vjp = std::move(fun_vjp),
          fun_jvp = std::move(fun_jvp),
          fun_vmap = std::move(fun_vmap)](const std::vector<array>& args) {
    // Compute the outputs
    auto outputs = fun(args);
    for (auto& result : outputs) {
      result = stop_gradient(result);
    }

    // Prepare the inputs to the primitive: the arguments followed by the
    // outputs, so that the primitive can "run" the forward pass.
    std::vector<array> inputs;
    inputs.reserve(args.size() + outputs.size());
    inputs.insert(inputs.end(), args.begin(), args.end());
    inputs.insert(inputs.end(), outputs.begin(), outputs.end());

    // Compute the stream. Maybe do it in a smarter way at some point in the
    // future.
    Stream s = (outputs[0].has_primitive()) ? outputs[0].primitive().stream()
                                            : default_stream(default_device());

    // Make the output info
    std::vector<Shape> shapes;
    std::vector<Dtype> dtypes;
    for (const auto& result : outputs) {
      shapes.push_back(result.shape());
      dtypes.push_back(result.dtype());
    }

    // Fallback vjp: differentiate `fun` directly from the primals and the
    // passed cotangents. This may be less efficient than using `fun` without
    // the wrapper because the outputs of the forward pass may not be fully
    // reused.
    auto default_vjp = [fun](
                           const std::vector<array>& primals,
                           const std::vector<array>& cotangents,
                           const std::vector<array>& /* outputs */) {
      return vjp(fun, primals, cotangents).second;
    };

    // Fallback jvp: expand the tangents to one per primal (zeros for the
    // primals not listed in argnums) and push them through `fun`. This cannot
    // take full advantage of argnums, so it is best to use `fun` directly if
    // no custom transform is needed.
    //
    // TODO: Use stop_gradient to make full use of argnums and not
    //       waste computation.
    auto default_jvp = [fun](
                           const std::vector<array>& primals,
                           const std::vector<array>& tangents,
                           const std::vector<int>& argnums) {
      std::vector<array> all_tangents;
      size_t next = 0;
      for (size_t pos = 0; pos < primals.size(); pos++) {
        if (next < argnums.size() && static_cast<int>(pos) == argnums[next]) {
          all_tangents.emplace_back(tangents[next++]);
        } else {
          all_tangents.emplace_back(zeros_like(primals[pos]));
        }
      }
      return jvp(fun, primals, all_tangents).second;
    };

    // Fallback vmap: vectorize `fun` with every output axis set to 0, which
    // may be suboptimal but is the only choice without any information about
    // `fun`.
    auto default_vmap =
        [fun, out_size = outputs.size()](
            const std::vector<array>& vmap_inputs,
            const std::vector<int>& in_axes)
        -> std::pair<std::vector<array>, std::vector<int>> {
      std::vector<int> out_axes(out_size, 0);
      return {vmap(fun, in_axes, out_axes)(vmap_inputs), out_axes};
    };

    return array::make_arrays(
        std::move(shapes),
        dtypes,
        std::make_shared<CustomTransforms>(
            to_stream(s),
            outputs.size(),
            fun_vjp.value_or(default_vjp),
            fun_jvp.value_or(default_jvp),
            fun_vmap.value_or(default_vmap)),
        inputs);
  };
}

// Returns `fun` wrapped by `custom_function` with only the vjp overridden by
// `fun_vjp`; no custom jvp or vmap is supplied.
std::function<std::vector<array>(const std::vector<array>&)> custom_vjp(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    std::function<std::vector<array>(
        const std::vector<array>&,
        const std::vector<array>&,
        const std::vector<array>&)> fun_vjp) {
  return custom_function(
      std::move(fun), std::move(fun_vjp), std::nullopt, std::nullopt);
}

// Returns `fun` with a custom vjp that re-applies `vjp` to `fun` when the
// gradient is requested. The primals passed to that vjp are first tied to the
// forward outputs through `depends(primals, outputs)`.
std::function<std::vector<array>(const std::vector<array>&)> checkpoint(
    std::function<std::vector<array>(const std::vector<array>&)> fun) {
  auto recompute_vjp =
      [fun](
          const std::vector<array>& primals,
          const std::vector<array>& cotangents,
          const std::vector<array>& outputs) -> std::vector<array> {
    auto dependent_primals = depends(primals, outputs);
    return vjp(fun, dependent_primals, cotangents).second;
  };

  return custom_vjp(fun, recompute_vjp);
}

} // namespace mlx::core


================================================
FILE: mlx/transforms.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <optional>

#include "mlx/api.h"
#include "mlx/array.h"

namespace mlx::core {

/**
 *  Schedules the evaluation of `outputs` without waiting for it to complete.
 *
 *  Does nothing if `outputs` is empty or if none of the arrays is still
 *  unscheduled.
 **/
MLX_API void async_eval(std::vector<array> outputs);

template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
void async_eval(Arrays&&... outputs) {
  async_eval(std::vector<array>{std::forward<Arrays>(outputs)...});
}

/**
 *  Evaluates `outputs` and waits for the evaluation to complete.
 *
 *  Does nothing if `outputs` is empty. If every array has already been
 *  scheduled, each array is simply waited on.
 **/
MLX_API void eval(std::vector<array> outputs);

template <typename... Arrays, typename = enable_for_arrays_t<Arrays...>>
void eval(Arrays&&... outputs) {
  eval(std::vector<array>{std::forward<Arrays>(outputs)...});
}

/**
 *  Computes the output and vector-Jacobian product (VJP) of a function.
 *
 *  Computes the vector-Jacobian product of the vector of cotangents with the
 *  Jacobian of the function evaluated at the primals. Returns a pair of
 *  vectors of output arrays and VJP arrays, with one VJP per primal.
 *
 *  Outputs produced by `stop_gradient` are not matched with a cotangent.
 *  Throws `std::invalid_argument` if there are fewer cotangents than outputs
 *  that need one, or if a cotangent's shape differs from the shape of its
 *  output.
 **/
MLX_API std::pair<std::vector<array>, std::vector<array>> vjp(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& primals,
    const std::vector<array>& cotangents);

/**
 *  Computes the output and vector-Jacobian product (VJP) of a unary function.
 *
 *  Returns the pair (output, VJP). The cotangent must have the same shape as
 *  the output, otherwise `std::invalid_argument` is thrown.
 */
MLX_API std::pair<array, array> vjp(
    const std::function<array(const array&)>& fun,
    const array& primal,
    const array& cotangent);

/**
 *  Computes the output and Jacobian-vector product (JVP) of a function.
 *
 *  Computes the Jacobian-vector product of the Jacobian of the function
 *  evaluated at the primals with the vector of tangents. Returns a pair of
 *  vectors of output arrays and JVP arrays, with one JVP per output.
 *
 *  Throws `std::invalid_argument` if the number of tangents differs from the
 *  number of primals, or if a tangent's shape differs from the shape of its
 *  primal.
 **/
MLX_API std::pair<std::vector<array>, std::vector<array>> jvp(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& primals,
    const std::vector<array>& tangents);

/**
 *  Computes the output and Jacobian-vector product (JVP) of a unary function.
 *
 *  Returns the pair (output, JVP). The tangent must have the same shape as
 *  the primal, otherwise `std::invalid_argument` is thrown.
 */
MLX_API std::pair<array, array> jvp(
    const std::function<array(const array&)>& fun,
    const array& primal,
    const array& tangent);

// Return type of the general value_and_grad: a function which takes an input
// vector of arrays and returns a pair of vectors of arrays, one for the
// values and one for the gradients with respect to the first value.
using ValueAndGradFn =
    std::function<std::pair<std::vector<array>, std::vector<array>>(
        const std::vector<array>&)>;
// Return type of value_and_grad for functions that return a single array: the
// value is a single array instead of a vector of arrays.
using SimpleValueAndGradFn = std::function<std::pair<array, std::vector<array>>(
    const std::vector<array>&)>;

/**
 *  Returns a function which computes the value and gradient of the input
 *  function with respect to a vector of input arrays.
 *
 *  `argnums` selects the inputs to differentiate with respect to; negative
 *  entries are offset by the number of inputs. Throws `std::invalid_argument`
 *  if `argnums` is empty. The returned function throws
 *  `std::invalid_argument` if `argnums` contains a repeated or out-of-range
 *  entry. Gradients are returned in ascending order of the resolved indices.
 **/
MLX_API ValueAndGradFn value_and_grad(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<int>& argnums);

/**
 *  Convenience overload of `value_and_grad` that differentiates with respect
 *  to the single input selected by `argnum` (the first input by default).
 **/
ValueAndGradFn inline value_and_grad(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    int argnum = 0) {
  const std::vector<int> argnums{argnum};
  return value_and_grad(fun, argnums);
}

/**
 *  Returns a function which computes the value and gradient of the unary
 *  input function. The gradient is obtained from `vjp` with a float32
 *  cotangent of 1.
 **/
std::function<std::pair<array, array>(const array&)> inline value_and_grad(
    const std::function<array(const array&)>& fun) {
  return [fun](const array& primal) {
    return vjp(fun, primal, array(1.0f));
  };
}

/**
 *  Returns a function which computes the value and gradient of a function
 *  returning a single array, with respect to the inputs selected by
 *  `argnums`. The general `value_and_grad` is built on every call of the
 *  returned function, so invalid `argnums` are only reported at call time.
 **/
SimpleValueAndGradFn inline value_and_grad(
    const std::function<array(const std::vector<array>&)>& fun,
    const std::vector<int>& argnums) {
  return [fun, argnums](const std::vector<array>& inputs) {
    // Adapt the single-output function to the vector-out signature.
    std::function<std::vector<array>(const std::vector<array>&)> as_vector_fun =
        [fun](const std::vector<array>& args) {
          return std::vector<array>{fun(args)};
        };
    auto [values, grads] = value_and_grad(as_vector_fun, argnums)(inputs);

    return std::make_pair(values[0], grads);
  };
}

/**
 *  Convenience overload of `value_and_grad` for functions returning a single
 *  array, differentiating with respect to the single input selected by
 *  `argnum` (the first input by default).
 **/
SimpleValueAndGradFn inline value_and_grad(
    const std::function<array(const std::vector<array>&)>& fun,
    int argnum = 0) {
  const std::vector<int> argnums{argnum};
  return value_and_grad(fun, argnums);
}

/**
 *  Returns a function which computes the gradient of the input function with
 *  respect to a vector of input arrays.
 *
 *  The function being differentiated takes a vector of arrays and returns an
 *  array. The vector of `argnums` specifies which arguments to compute the
 *  gradient with respect to. At least one argument must be specified.
 **/
std::function<std::vector<array>(const std::vector<array>&)> inline grad(
    const std::function<array(const std::vector<array>&)>& fun,
    const std::vector<int>& argnums) {
  // Keep only the gradients of the (value, gradients) pair.
  return [value_and_grad_fn = value_and_grad(fun, argnums)](
             const std::vector<array>& inputs) {
    return value_and_grad_fn(inputs).second;
  };
}

/**
 *  Returns a function which computes the gradient of the input function with
 *  respect to a single input array.
 *
 *  The function being differentiated takes a vector of arrays and returns an
 *  array. The optional `argnum` index specifies which argument to compute the
 *  gradient with respect to and defaults to 0.
 **/
std::function<std::vector<array>(const std::vector<array>&)> inline grad(
    const std::function<array(const std::vector<array>&)>& fun,
    int argnum = 0) {
  const std::vector<int> argnums{argnum};
  return grad(fun, argnums);
}

/**
 *  Returns a function which computes the gradient of the unary input function
 *  by keeping only the gradient part of `value_and_grad(fun)`.
 **/
std::function<array(const array&)> inline grad(
    const std::function<array(const array&)>& fun) {
  return [value_and_grad_fn = value_and_grad(fun)](const array& input) {
    return value_and_grad_fn(input).second;
  };
}

/**
 * Automatically vectorize a unary function over the requested axes.
 *
 * `in_axis` is the axis of the input to map over and `out_axis` is the axis
 * of the output at which the mapped dimension is placed.
 */
MLX_API std::function<array(const array&)> vmap(
    const std::function<array(const array&)>& fun,
    int in_axis = 0,
    int out_axis = 0);

/**
 * Automatically vectorize a binary function over the requested axes.
 *
 * `in_axis_a` and `in_axis_b` are the axes of the first and second input to
 * map over; `out_axis` is the axis of the output at which the mapped
 * dimension is placed. An input axis of -1 leaves that input unmapped.
 */
MLX_API std::function<array(const array&, const array&)> vmap(
    const std::function<array(const array&, const array&)>& fun,
    int in_axis_a = 0,
    int in_axis_b = 0,
    int out_axis = 0);

/**
 * Automatically vectorize a function over the requested axes.
 *
 * The input function to `vmap` takes as an argument a vector of arrays and
 * returns a vector of arrays. Optionally specify the axes to vectorize over
 * with `in_axes` and `out_axes`, otherwise a default of 0 is used.
 * An entry of -1 in `in_axes` leaves the corresponding input unmapped.
 * Returns a vectorized function with the same signature as the input
 * function.
 *
 * When the returned function is called, `std::invalid_argument` is thrown if
 * the number of axes does not match the number of inputs or outputs, if an
 * axis is invalid for its array, or if the mapped axes differ in size.
 */
MLX_API std::function<std::vector<array>(const std::vector<array>&)> vmap(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<int>& in_axes = {},
    const std::vector<int>& out_axes = {});

/**
 * Redefine the transformations of `fun` according to the provided functions.
 *
 * Namely, when the vjp of `fun` is computed `fun_vjp` is called, `fun_jvp` is
 * called for the jvp and `fun_vmap` for vmap.
 *
 * `fun_vjp` receives the primals, the cotangents and the outputs; `fun_jvp`
 * receives the primals, the tangents and the argument indices the tangents
 * belong to; `fun_vmap` receives the inputs and their mapped axes and returns
 * the outputs together with their mapped axes.
 *
 * If a transformation is not provided, a default one is created by calling
 * `vjp`, `jvp` or `vmap` on the function directly. If no transformation is
 * provided at all, `fun` is returned unchanged.
 */
MLX_API std::function<std::vector<array>(const std::vector<array>&)>
custom_function(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    std::optional<std::function<std::vector<array>(
        const std::vector<array>&,
        const std::vector<array>&,
        const std::vector<array>&)>> fun_vjp = std::nullopt,
    std::optional<std::function<std::vector<array>(
        const std::vector<array>&,
        const std::vector<array>&,
        const std::vector<int>&)>> fun_jvp = std::nullopt,
    std::optional<std::function<std::pair<std::vector<array>, std::vector<int>>(
        const std::vector<array>&,
        const std::vector<int>&)>> fun_vmap = std::nullopt);

/**
 * Return a function that behaves exactly like `fun`, except that when the vjp
 * of its results is computed `fun_vjp` is used instead of `vjp(fun, ...)`.
 *
 * `fun_vjp` receives the primals, the cotangents and the outputs, in that
 * order.
 */
MLX_API std::function<std::vector<array>(const std::vector<array>&)> custom_vjp(
    std::function<std::vector<array>(const std::vector<array>&)> fun,
    std::function<std::vector<array>(
        const std::vector<array>&,
        const std::vector<array>&,
        const std::vector<array>&)> fun_vjp);

/**
 * Checkpoint the gradient of a function. Namely, discard all intermediate
 * state and recalculate it when we need to compute the gradient.
 *
 * Implemented as a `custom_vjp` of `fun` whose vjp calls `vjp(fun, ...)` again
 * on the primals.
 */
MLX_API std::function<std::vector<array>(const std::vector<array>&)> checkpoint(
    std::function<std::vector<array>(const std::vector<array>&)> fun);

} // namespace mlx::core


================================================
FILE: mlx/transforms_impl.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include "mlx/api.h"

namespace mlx::core::detail {

MLX_API std::pair<std::vector<array>, std::vector<array>> vmap_trace(
    const std::function<std::vector<array>(const std::vector<array>&)>& fun,
    const std::vector<array>& inputs,
    const std::vector<int>& in_axes);

MLX_API std::vector<array> vmap_replace(
    const std::vector<array>& inputs,
    const std::vector<array>& s_inputs,
    const std::vector<array>& s_outputs,
    const std::vector<int>& in_axes,
    const std::vector<int>& out_axes);

// Create an InTracing object during tracing operations to signal to the rest
// of the codebase that a trace is in progress, so evals should not throw away
// the graph.
struct InTracing {
  // Open a trace scope: record (dynamic, grad) on the trace stack and, for
  // gradient traces, bump the gradient counter.
  explicit InTracing(bool dynamic = false, bool grad = false) {
    if (grad) {
      ++grad_counter;
    }
    trace_stack().emplace_back(dynamic, grad);
  }

  // Close the innermost trace scope and undo the constructor's bookkeeping.
  // NOTE(review): copy operations are implicitly generated for this type;
  // copying a guard would make the stack be popped twice.
  ~InTracing() {
    auto& stack = trace_stack();
    grad_counter -= stack.back().second;
    stack.pop_back();
  }

  // True while at least one InTracing object is alive.
  static bool in_tracing() {
    return !trace_stack().empty();
  }

  // True when the outer-most live scope was opened with `dynamic == true`.
  static bool in_dynamic_tracing() {
    // compile is always and only the outer-most transform
    const auto& stack = trace_stack();
    return !stack.empty() && stack.front().first;
  }

  // True while at least one live scope was opened with `grad == true`.
  static bool in_grad_tracing() {
    return grad_counter > 0;
  }

 private:
  // Number of live scopes opened with `grad == true`.
  static int grad_counter;
  // Stack of (dynamic, grad) flags, one entry per live scope.
  static std::vector<std::pair<char, char>>& trace_stack();
};

struct RetainGraph {
  // Each live RetainGraph object contributes one to the shared counter.
  RetainGraph() {
    ++tracing_counter;
  }
  ~RetainGraph() {
    --tracing_counter;
  }

  // True while at least one RetainGraph object is alive.
  static bool retain_graph() {
    const bool any_alive = tracing_counter > 0;
    return any_alive;
  }

 private:
  // Number of currently live RetainGraph objects.
  static int tracing_counter;
};

/** Return true if we are currently performing a function transformation in
 * order to keep the graph when evaluating tracer arrays. Forwards to
 * InTracing::in_tracing(), i.e. true while any InTracing object is alive. */
inline bool in_tracing() {
  return detail::InTracing::in_tracing();
}

/** Return true if we are in a dynamic (shapeless) trace used for compiling or
 * exporting graphs with dynamic shapes. Forwards to
 * InTracing::in_dynamic_tracing().  */
inline bool in_dynamic_tracing() {
  return detail::InTracing::in_dynamic_tracing();
}

/** Return true if we are in a gradient trace (vjp, jvp, etc). Forwards to
 * InTracing::in_grad_tracing().  */
inline bool in_grad_tracing() {
  return detail::InTracing::in_grad_tracing();
}

/** Return true while at least one RetainGraph object is alive. Forwards to
 * RetainGraph::retain_graph(). */
inline bool retain_graph() {
  return detail::RetainGraph::retain_graph();
}

} // namespace mlx::core::detail


================================================
FILE: mlx/types/bf16.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

#define __MLX_BFLOAT_NAN__ 0x7FC0
#define __MLX_BFLOAT_ONE__ 0x3F80

namespace mlx::core {

namespace {
// Overlays a float with a 32-bit unsigned integer so the bfloat16 conversions
// can read and write the raw bit pattern of a float.
// NOTE(review): this lives in an anonymous namespace inside a header, so each
// translation unit including it gets its own copy of the type.
union float_bits_bf16 {
  float f;
  uint32_t u;
};
} // namespace

struct _MLX_BFloat16 {
  uint16_t bits_;

  // Default constructor
  _MLX_BFloat16() = default;

  // Default copy constructor
  _MLX_BFloat16(_MLX_BFloat16 const&) = default;

  // Appease std::vector<bool> for being special
  _MLX_BFloat16& operator=(std::vector<bool>::reference x) {
    bits_ = (x) ? __MLX_BFLOAT_ONE__ : 0;
    return (*this);
  }

  _MLX_BFloat16& operator=(const float& x) {
    return (*this = _MLX_BFloat16(x));
  }

  // From float32
  _MLX_BFloat16(const float& x) {
    if (std::isnan(x)) {
      bits_ = __MLX_BFLOAT_NAN__;
    } else {
      // Union
      float_bits_bf16 in;

      // Take bits
      in.f = x;

      // Round to nearest even
      in.u += (in.u >> 16 & 1) + uint32_t(0x7FFF);

      // Take upper 16 bits
      bits_ = in.u >> 16;
    }
  }

  // To float32
  operator float() const {
    // Union
    float_bits_bf16 out;

    // Upper 16 bits are the data and lower 16 bits are 0s
    out.u = ((uint32_t)bits_) << 16;

    return out.f;
  }
};

// Defines one binary operator `__operator__(atype, btype) -> otype` that casts
// both operands to `ctype`, applies `__op__`, and lets the result convert
// implicitly to `otype`.
// NOTE(review): this macro is not #undef'd later in this header.
#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype) \
  inline otype __operator__(atype lhs, btype rhs) {                         \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);          \
  }

// Defines the two mixed-type overloads of a binary operator between
// _MLX_BFloat16 and `itype` (bfloat on the left, and bfloat on the right).
// Both operands are cast to `ctype` before `__op__` is applied; the result is
// returned as `otype`.
// NOTE(review): this macro is not #undef'd later in this header.
#define bfloat_binop_helper(__op__, __operator__, otype, itype, ctype) \
  inline otype __operator__(_MLX_BFloat16 lhs, itype rhs) {            \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);     \
  }                                                                    \
  inline otype __operator__(itype lhs, _MLX_BFloat16 rhs) {            \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);     \
  }

// Operators
// Arithmetic operators for _MLX_BFloat16. For each operator this generates:
//  - bfloat16 (op) bfloat16 -> bfloat16, computed in float
//  - bfloat16 (op) float    -> float   (both operand orders)
//  - bfloat16 (op) double   -> double  (both operand orders)
//  - bfloat16 (op) bool / int32_t / uint32_t / int64_t / uint64_t
//                           -> bfloat16, computed in float (both orders)
#define bfloat_binop(_op_, _operator_)                                       \
  bfloat_binop_base(                                                         \
      _op_, _operator_, _MLX_BFloat16, _MLX_BFloat16, _MLX_BFloat16, float); \
  bfloat_binop_helper(_op_, _operator_, float, float, float);                \
  bfloat_binop_helper(_op_, _operator_, double, double, double);             \
  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, bool, float);         \
  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int32_t, float);      \
  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint32_t, float);     \
  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int64_t, float);      \
  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);

bfloat_binop(+, operator+);
bfloat_binop(-, operator-);
bfloat_binop(*, operator*);
bfloat_binop(/, operator/);

#undef bfloat_binop

// Comparison ops
// Comparison operators for _MLX_BFloat16, all returning bool. Operands are
// compared as float, except comparisons against double which are done in
// double. Unlike the arithmetic operators above, no overloads against bool
// are generated here.
#define bfloat_compop(__op__, __operator__)                             \
  bfloat_binop_base(                                                    \
      __op__, __operator__, bool, _MLX_BFloat16, _MLX_BFloat16, float); \
  bfloat_binop_helper(__op__, __operator__, bool, float, float);        \
  bfloat_binop_helper(__op__, __operator__, bool, double, double);      \
  bfloat_binop_helper(__op__, __operator__, bool, int32_t, float);      \
  bfloat_binop_helper(__op__, __operator__, bool, uint32_t, float);     \
  bfloat_binop_helper(__op__, __operator__, bool, int64_t, float);      \
  bfloat_binop_helper(__op__, __operator__, bool, uint64_t, float);

bfloat_compop(>, operator>);
bfloat_compop(<, operator<);
bfloat_compop(>=, operator>=);
bfloat_compop(<=, operator<=);
bfloat_compop(==, operator==);
bfloat_compop(!=, operator!=);

#undef bfloat_compop

// Unary minus: negate in float32 and convert the result back to bfloat16.
inline _MLX_BFloat16 operator-(_MLX_BFloat16 lhs) {
  const float as_float = static_cast<float>(lhs);
  return _MLX_BFloat16(-as_float);
}

// Inplace ops
// Compound assignment for _MLX_BFloat16. Two overloads are generated per
// operator:
//  - bfloat16 (op)= float : evaluates `lhs op rhs` and stores it back into the
//    bfloat16 (converting the float result).
//  - float (op)= bfloat16 : evaluates `lhs op rhs` and stores it in the float.
#define bfloat_inplace_op(__op__, __operator__)                              \
  inline _MLX_BFloat16& __operator__(_MLX_BFloat16& lhs, const float& rhs) { \
    lhs = lhs __op__ rhs;                                                    \
    return lhs;                                                              \
  }                                                                          \
  inline float& __operator__(float& lhs, _MLX_BFloat16 rhs) {                \
    lhs = lhs __op__ rhs;                                                    \
    return lhs;                                                              \
  }

bfloat_inplace_op(+, operator+=);
bfloat_inplace_op(-, operator-=);
bfloat_inplace_op(*, operator*=);
bfloat_inplace_op(/, operator/=);

#undef bfloat_inplace_op

// Bitwise ops
// These act directly on the raw 16-bit storage (`bits_`), not on the numeric
// value. Overloads: bfloat16/bfloat16, bfloat16/uint16_t and
// uint16_t/bfloat16; all return a bfloat16 holding the combined bits.

#define bfloat_bitop(__op__, __operator__)                                  \
  inline _MLX_BFloat16 __operator__(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs) { \
    _MLX_BFloat16 out;                                                      \
    out.bits_ = lhs.bits_ __op__ rhs.bits_;                                 \
    return out;                                                             \
  }                                                                         \
  inline _MLX_BFloat16 __operator__(_MLX_BFloat16 lhs, uint16_t rhs) {      \
    _MLX_BFloat16 out;                                                      \
    out.bits_ = lhs.bits_ __op__ rhs;                                       \
    return out;                                                             \
  }                                                                         \
  inline _MLX_BFloat16 __operator__(uint16_t lhs, _MLX_BFloat16 rhs) {      \
    _MLX_BFloat16 out;                                                      \
    out.bits_ = lhs __op__ rhs.bits_;                                       \
    return out;                                                             \
  }

bfloat_bitop(|, operator|);
bfloat_bitop(&, operator&);
bfloat_bitop(^, operator^);

#undef bfloat_bitop

// Compound bitwise assignment on the raw 16-bit storage of a bfloat16. The
// right-hand side is either another bfloat16 (its `bits_` are used) or a
// uint16_t bit mask.
#define bfloat_inplace_bitop(__op__, __operator__)                            \
  inline _MLX_BFloat16& __operator__(_MLX_BFloat16& lhs, _MLX_BFloat16 rhs) { \
    lhs.bits_ = lhs.bits_ __op__ rhs.bits_;                                   \
    return lhs;                                                               \
  }                                                                           \
  inline _MLX_BFloat16& __operator__(_MLX_BFloat16& lhs, uint16_t rhs) {      \
    lhs.bits_ = lhs.bits_ __op__ rhs;                                         \
    return lhs;                                                               \
  }

bfloat_inplace_bitop(|, operator|=);
bfloat_inplace_bitop(&, operator&=);
bfloat_inplace_bitop(^, operator^=);

#undef bfloat_inplace_bitop

} // namespace mlx::core


================================================
FILE: mlx/types/complex.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once
#include <complex>
#include "mlx/types/half_types.h"

namespace mlx::core {

struct complex64_t;
struct complex128_t;

// True for every type other than complex128_t itself that is implicitly
// convertible to double. Used to constrain the converting constructor of
// complex128_t.
template <typename T>
inline constexpr bool can_convert_to_complex128 =
    !std::is_same_v<T, complex128_t> && std::is_convertible_v<T, double>;

struct complex128_t : public std::complex<double> {
  complex128_t() : std::complex<double>() {};
  complex128_t(double v, double u) : std::complex<double>(v, u) {};
  complex128_t(std::complex<double> v) : std::complex<double>(v) {};

  template <
      typename T,
      typename = typename std::enable_if<can_convert_to_complex128<T>>::type>
  complex128_t(T x) : std::complex<double>(x){};

  operator float() const {
    return real();
  };
};

// True for every type other than complex64_t itself that is implicitly
// convertible to float. Used to constrain the converting constructor of
// complex64_t.
template <typename T>
inline constexpr bool can_convert_to_complex64 =
    !std::is_same_v<T, complex64_t> && std::is_convertible_v<T, float>;

struct complex64_t : public std::complex<float> {
  complex64_t() : std::complex<float>() {};
  complex64_t(float v, float u) : std::complex<float>(v, u) {};
  complex64_t(std::complex<float> v) : std::complex<float>(v) {};

  template <
      typename T,
      typename = typename std::enable_if<can_convert_to_complex64<T>>::type>
  complex64_t(T x) : std::complex<float>(x){};

  operator float() const {
    return real();
  };
};

// Lexicographic ordering on (real, imag): the real parts decide unless they
// compare equal, in which case the imaginary parts are compared.
inline bool operator>=(const complex64_t& a, const complex64_t& b) {
  if (a.real() > b.real()) {
    return true;
  }
  if (a.real() == b.real()) {
    return a.imag() >= b.imag();
  }
  return false;
}

// Strict lexicographic ordering on (real, imag).
inline bool operator>(const complex64_t& a, const complex64_t& b) {
  if (a.real() > b.real()) {
    return true;
  }
  if (a.real() == b.real()) {
    return a.imag() > b.imag();
  }
  return false;
}

// Component-wise remainder: the real and imaginary parts are reduced
// independently. Each component's remainder is computed with a truncated
// quotient and then shifted by the divisor when it is non-zero and its sign
// differs from the divisor's sign.
inline complex64_t operator%(complex64_t a, complex64_t b) {
  auto component_mod = [](float num, float den) {
    auto rem = num - (den * static_cast<int64_t>(num / den));
    if (rem != 0 && ((rem < 0) != (den < 0))) {
      rem += den;
    }
    return rem;
  };
  auto re = component_mod(a.real(), b.real());
  auto im = component_mod(a.imag(), b.imag());
  return {re, im};
}

// a <= b is defined as b >= a (lexicographic ordering on (real, imag)).
inline bool operator<=(const complex64_t& a, const complex64_t& b) {
  const bool b_not_less = operator>=(b, a);
  return b_not_less;
}

// a < b is defined as b > a (strict lexicographic ordering on (real, imag)).
inline bool operator<(const complex64_t& a, const complex64_t& b) {
  const bool b_greater = operator>(b, a);
  return b_greater;
}

// Unary minus: negate through the std::complex<float> base so this overload
// does not call itself.
inline complex64_t operator-(const complex64_t& v) {
  const auto base_value = static_cast<std::complex<float>>(v);
  return -base_value;
}

// clang-format off
// complex_binop_helper: defines `itype (op) complex64_t` and
// `complex64_t (op) itype` by first converting the scalar operand to
// complex64_t.
#define complex_binop_helper(_op_, _operator_, itype)            \
  inline complex64_t _operator_(itype x, const complex64_t& y) { \
    return static_cast<complex64_t>(x) _op_ y;           \
  }                                                              \
  inline complex64_t _operator_(const complex64_t& x, itype y) { \
    return x _op_ static_cast<complex64_t>(y);           \
  }

// complex_binop: defines the operator between complex64_t and
// std::complex<float> (both orders) and between two complex64_t values by
// delegating to the std::complex<float> operator, then adds the scalar
// overloads for bool, the 32/64-bit integer types, float16_t, bfloat16_t and
// float via complex_binop_helper.
#define complex_binop(_op_, _operator_)                                               \
  inline complex64_t _operator_(const std::complex<float>& x, const complex64_t& y) { \
    return x _op_ static_cast<std::complex<float>>(y);                                \
  }                                                                                   \
  inline complex64_t _operator_(const complex64_t& x, const std::complex<float>& y) { \
    return static_cast<std::complex<float>>(x) _op_ y;                                \
  }                                                                                   \
  inline complex64_t _operator_(const complex64_t& x, const complex64_t& y) {         \
    return static_cast<std::complex<float>>(x)                                        \
        _op_ static_cast<std::complex<float>>(y);                                     \
  }                                                                                   \
  complex_binop_helper(_op_, _operator_, bool)                                        \
  complex_binop_helper(_op_, _operator_, uint32_t)                                    \
  complex_binop_helper(_op_, _operator_, uint64_t)                                    \
  complex_binop_helper(_op_, _operator_, int32_t)                                     \
  complex_binop_helper(_op_, _operator_, int64_t)                                     \
  complex_binop_helper(_op_, _operator_, float16_t)                                   \
  complex_binop_helper(_op_, _operator_, bfloat16_t)                                  \
  complex_binop_helper(_op_, _operator_, float)
// clang-format on

// Only operator+ is instantiated from the macro in this header.
complex_binop(+, operator+)

} // namespace mlx::core


================================================
FILE: mlx/types/fp16.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

#define __MLX_HALF_NAN__ 0x7D00
#define __MLX_HALF_ONE__ 0x3C00

namespace mlx::core {

namespace {
// Overlays a float with a 32-bit unsigned integer so the float16 conversions
// can read and write the raw bit pattern of a float.
// NOTE(review): this lives in an anonymous namespace inside a header, so each
// translation unit including it gets its own copy of the type.
union float_bits_fp16 {
  float f;
  uint32_t u;
};
} // namespace

// Software half-precision float: 16 bits of raw storage plus conversions to
// and from float32.
struct _MLX_Float16 {
  // Raw 16-bit pattern (sign in the top bit, as used by the conversions
  // below).
  uint16_t bits_;

  // Default constructor
  _MLX_Float16() = default;

  // Default copy constructor
  _MLX_Float16(_MLX_Float16 const&) = default;

  // Appease std::vector<bool> for being special
  // (true maps to the bit pattern of 1.0, false to 0).
  _MLX_Float16& operator=(std::vector<bool>::reference x) {
    bits_ = (x) ? __MLX_HALF_ONE__ : 0;
    return (*this);
  }

  // Assign from float32 by converting through the float constructor.
  _MLX_Float16& operator=(const float& x) {
    return (*this = _MLX_Float16(x));
  }

  // From float32
  _MLX_Float16(const float& x) : bits_(0) {
    // Conversion following
    // https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h

    // Union
    float_bits_fp16 in;

    // Take fp32 bits
    in.f = x;

    // Find and take sign bit
    uint32_t x_sign_32 = in.u & uint32_t(0x80000000);
    uint16_t x_sign_16 = (x_sign_32 >> 16);

    if (std::isnan(x)) {
      // Every NaN maps to the same payload; only the sign is preserved.
      bits_ = x_sign_16 | uint16_t(__MLX_HALF_NAN__);
    } else {
      // Union
      float_bits_fp16 inf_scale, zero_scale, magic_bits;

      // Isolate the fp32 exponent field and clamp it from below to
      // 0x38800000, then add 15 to the exponent.
      uint32_t x_expo_32 = in.u & uint32_t(0x7f800000);
      uint32_t max_expo_32 = uint32_t(0x38800000);
      x_expo_32 = x_expo_32 < max_expo_32 ? max_expo_32 : x_expo_32;
      x_expo_32 += uint32_t(15) << 23;

      // Handle scaling to inf as needed
      // (0x77800000 is 2^112 and 0x08800000 is 2^-110 as float bit patterns).
      inf_scale.u = uint32_t(0x77800000);
      zero_scale.u = uint32_t(0x08800000);

      // Combine with magic and let addition do rounding
      magic_bits.u = x_expo_32;
      magic_bits.f += (std::abs(x) * inf_scale.f) * zero_scale.f;

      // Take the lower 5 bits of the exponent
      uint32_t x_expo_16 = ((magic_bits.u >> 13) & uint32_t(0x7c00));

      // Collect the lower 12 bits which have the mantissa
      uint32_t x_mant_16 = magic_bits.u & uint32_t(0x0fff);

      // Combine sign, exp and mantissa
      bits_ = (x_sign_16 | uint16_t(x_expo_16 + x_mant_16));
    }
  }

  // To float32
  operator float() const {
    // Conversion following
    // https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h

    // Union
    float_bits_fp16 out;

    // NOTE(review): `bits_` is promoted to int before these shifts; the result
    // is then converted to uint32_t.
    uint32_t x_sign_32 = (bits_ << 16) & uint32_t(0x80000000);
    uint32_t base = (bits_ << 16);
    // Doubling shifts the sign bit out, leaving exponent and mantissa.
    uint32_t two_base = base + base;

    // Below this threshold the half exponent field is zero (zero/subnormal).
    uint32_t denorm_max = 1u << 27;
    if (two_base < denorm_max) {
      out.u = uint32_t(126) << 23; // magic mask
      out.u |= (two_base >> 17); // Bits from fp16
      out.f -= 0.5f; // magic bias
    } else {
      out.u = uint32_t(0xE0) << 23; // exponent offset
      out.u += (two_base >> 4); // Bits from fp16
      float out_unscaled = out.f; // Store value
      out.u = uint32_t(0x7800000); // exponent scale
      out.f *= out_unscaled;
    }

    // Add sign
    out.u |= x_sign_32;

    return out.f;
  }
};

// Defines one binary operator `__operator__(atype, btype) -> otype` that casts
// both operands to `ctype`, applies `__op__`, and lets the result convert
// implicitly to `otype`.
// NOTE(review): this macro is not #undef'd later in this header.
#define half_binop_base(__op__, __operator__, otype, atype, btype, ctype) \
  inline otype __operator__(atype lhs, btype rhs) {                       \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);        \
  }

// Defines the two mixed-type overloads of a binary operator between
// _MLX_Float16 and `itype` (half on the left, and half on the right). Both
// operands are cast to `ctype` before `__op__` is applied; the result is
// returned as `otype`.
// NOTE(review): this macro is not #undef'd later in this header.
#define half_binop_helper(__op__, __operator__, otype, itype, ctype) \
  inline otype __operator__(_MLX_Float16 lhs, itype rhs) {           \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);   \
  }                                                                  \
  inline otype __operator__(itype lhs, _MLX_Float16 rhs) {           \
    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);   \
  }

// Operators
// Arithmetic operators for _MLX_Float16. For each operator this generates:
//  - half (op) half   -> half, computed in float
//  - half (op) float  -> float  (both operand orders)
//  - half (op) double -> double (both operand orders)
//  - half (op) bool / int32_t / uint32_t / int64_t / uint64_t
//                     -> half, computed in float (both orders)
#define half_binop(__op__, __operator__)                                      \
  half_binop_base(                                                            \
      __op__, __operator__, _MLX_Float16, _MLX_Float16, _MLX_Float16, float); \
  half_binop_helper(__op__, __operator__, float, float, float);               \
  half_binop_helper(__op__, __operator__, double, double, double);            \
  half_binop_helper(__op__, __operator__, _MLX_Float16, bool, float);         \
  half_binop_helper(__op__, __operator__, _MLX_Float16, int32_t, float);      \
  half_binop_helper(__op__, __operator__, _MLX_Float16, uint32_t, float);     \
  half_binop_helper(__op__, __operator__, _MLX_Float16, int64_t, float);      \
  half_binop_helper(__op__, __operator__, _MLX_Float16, uint64_t, float);

half_binop(+, operator+);
half_binop(-, operator-);
half_binop(*, operator*);
half_binop(/, operator/);

#undef half_binop

// Comparison ops
// Comparison operators for _MLX_Float16, all returning bool. Operands are
// compared as float, except comparisons against double which are done in
// double. Unlike the arithmetic operators above, no overloads against bool
// are generated here.
#define half_compop(__op__, __operator__)                             \
  half_binop_base(                                                    \
      __op__, __operator__, bool, _MLX_Float16, _MLX_Float16, float); \
  half_binop_helper(__op__, __operator__, bool, float, float);        \
  half_binop_helper(__op__, __operator__, bool, double, double);      \
  half_binop_helper(__op__, __operator__, bool, int32_t, float);      \
  half_binop_helper(__op__, __operator__, bool, uint32_t, float);     \
  half_binop_helper(__op__, __operator__, bool, int64_t, float);      \
  half_binop_helper(__op__, __operator__, bool, uint64_t, float);

half_compop(>, operator>);
half_compop(<, operator<);
half_compop(>=, operator>=);
half_compop(<=, operator<=);
half_compop(==, operator==);
half_compop(!=, operator!=);

#undef half_compop

// Unary minus: negate in float32 and convert the result back to float16.
inline _MLX_Float16 operator-(_MLX_Float16 lhs) {
  const float as_float = static_cast<float>(lhs);
  return _MLX_Float16(-as_float);
}

// Inplace ops
// Compound assignment for _MLX_Float16. Two overloads are generated per
// operator:
//  - half (op)= float : evaluates `lhs op rhs` and stores it back into the
//    half (converting the float result).
//  - float (op)= half : evaluates `lhs op rhs` and stores it in the float.
#define half_inplace_op(__op__, __operator__)                              \
  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, const float& rhs) { \
    lhs = lhs __op__ rhs;                                                  \
    return lhs;                                                            \
  }                                                                        \
  inline float& __operator__(float& lhs, _MLX_Float16 rhs) {               \
    lhs = lhs __op__ rhs;                                                  \
    return lhs;                                                            \
  }

half_inplace_op(+, operator+=);
half_inplace_op(-, operator-=);
half_inplace_op(*, operator*=);
half_inplace_op(/, operator/=);

#undef half_inplace_op

// Bitwise ops
// These act directly on the raw 16-bit storage (`bits_`), not on the numeric
// value. Overloads: half/half, half/uint16_t and uint16_t/half; all return a
// half holding the combined bits.

#define half_bitop(__op__, __operator__)                                 \
  inline _MLX_Float16 __operator__(_MLX_Float16 lhs, _MLX_Float16 rhs) { \
    _MLX_Float16 out;                                                    \
    out.bits_ = lhs.bits_ __op__ rhs.bits_;                              \
    return out;                                                          \
  }                                                                      \
  inline _MLX_Float16 __operator__(_MLX_Float16 lhs, uint16_t rhs) {     \
    _MLX_Float16 out;                                                    \
    out.bits_ = lhs.bits_ __op__ rhs;                                    \
    return out;                                                          \
  }                                                                      \
  inline _MLX_Float16 __operator__(uint16_t lhs, _MLX_Float16 rhs) {     \
    _MLX_Float16 out;                                                    \
    out.bits_ = lhs __op__ rhs.bits_;                                    \
    return out;                                                          \
  }

half_bitop(|, operator|);
half_bitop(&, operator&);
half_bitop(^, operator^);

#undef half_bitop

// Compound bitwise assignment on the raw 16-bit storage of a half. The
// right-hand side is either another half (its `bits_` are used) or a uint16_t
// bit mask.
#define half_inplace_bitop(__op__, __operator__)                           \
  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, _MLX_Float16 rhs) { \
    lhs.bits_ = lhs.bits_ __op__ rhs.bits_;                                \
    return lhs;                                                            \
  }                                                                        \
  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, uint16_t rhs) {     \
    lhs.bits_ = lhs.bits_ __op__ rhs;                                      \
    return lhs;                                                            \
  }

half_inplace_bitop(|, operator|=);
half_inplace_bitop(&, operator&=);
half_inplace_bitop(^, operator^=);

#undef half_inplace_bitop

} // namespace mlx::core


================================================
FILE: mlx/types/half_types.h
================================================
// Copyright © 2023 Apple Inc.

#pragma once

// Select the float16_t implementation: the compiler-provided type when the
// target advertises scalar fp16 arithmetic, otherwise the software
// _MLX_Float16 from fp16.h (which also requests the mixed half operators
// below through ADD_HALF_BINOPS).
#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)

#include <arm_fp16.h>
namespace mlx::core {
using ::float16_t;
} // namespace mlx::core

#else

#define ADD_HALF_BINOPS
#include "mlx/types/fp16.h"
namespace mlx::core {
using float16_t = _MLX_Float16;
} // namespace mlx::core

#endif // __ARM_FEATURE_FP16_SCALAR_ARITHMETIC

// Select the bfloat16_t implementation: the compiler-provided type when the
// target advertises bf16 support, otherwise the software _MLX_BFloat16 from
// bf16.h (which also requests the mixed half operators below through
// ADD_HALF_BINOPS).
#if defined(__ARM_FEATURE_BF16)

#include <arm_bf16.h>
namespace mlx::core {
using ::bfloat16_t;
} // namespace mlx::core

#else

#define ADD_HALF_BINOPS
#include "mlx/types/bf16.h"
namespace mlx::core {
using bfloat16_t = _MLX_BFloat16;
} // namespace mlx::core

#endif // __ARM_FEATURE_BF16

// Mixed float16/bfloat16 arithmetic. Compiled only when at least one of the
// two half types above fell back to the software implementation (that is what
// defines ADD_HALF_BINOPS). Both operands are converted to float and the
// result is returned as float.
// NOTE(review): fp16_bf16_binop_helper is not #undef'd after use.
#ifdef ADD_HALF_BINOPS
namespace mlx::core {

// clang-format off
#define fp16_bf16_binop_helper(__op__, __operator__)               \
  inline float __operator__(float16_t lhs, bfloat16_t rhs) {       \
    return static_cast<float>(lhs) __op__ static_cast<float>(rhs); \
  }                                                                \
  inline float __operator__(bfloat16_t lhs, float16_t rhs) {       \
    return static_cast<float>(lhs) __op__ static_cast<float>(rhs); \
  }

fp16_bf16_binop_helper(+, operator+)
fp16_bf16_binop_helper(-, operator-)
fp16_bf16_binop_helper(*, operator*)
fp16_bf16_binop_helper(/, operator/)
// clang-format on

} // namespace mlx::core
#endif


================================================
FILE: mlx/types/limits.h
================================================
// Copyright © 2024 Apple Inc.
#pragma once

#include <limits>
#include "mlx/types/half_types.h"

namespace mlx::core {

// mlx::core::numeric_limits: declared for all T but only defined through the
// explicit specializations in this header.
template <typename T>
struct numeric_limits;

// float and double simply reuse the standard library limits.
template <>
struct numeric_limits<float> : public std::numeric_limits<float> {};

template <>
struct numeric_limits<double> : public std::numeric_limits<double> {};

// Limits for float16_t, expressed as raw 16-bit patterns.
template <>
struct numeric_limits<float16_t> {
 private:
  // Overlay used to turn a 16-bit pattern into a float16_t value.
  union raw_half {
    uint16_t bits;
    float16_t value;
  };
  static constexpr float16_t from_bits(uint16_t pattern) {
    return raw_half{pattern}.value;
  }

 public:
  // Most negative finite value (sign bit set on the max() pattern).
  static constexpr float16_t lowest() {
    return from_bits(0xFBFF);
  }
  // Largest finite value.
  static constexpr float16_t max() {
    return from_bits(0x7BFF);
  }
  // Pattern 0x1400 (2^-10 in IEEE half encoding).
  static constexpr float16_t epsilon() {
    return from_bits(0x1400);
  }
  // Positive infinity.
  static constexpr float16_t infinity() {
    return from_bits(0x7C00);
  }
};

// Limits for bfloat16_t, expressed as raw 16-bit patterns.
template <>
struct numeric_limits<bfloat16_t> {
 private:
  // Overlay used to turn a 16-bit pattern into a bfloat16_t value.
  union raw_bfloat {
    uint16_t bits;
    bfloat16_t value;
  };
  static constexpr bfloat16_t from_bits(uint16_t pattern) {
    return raw_bfloat{pattern}.value;
  }

 public:
  // Most negative finite value (sign bit set on the max() pattern).
  static constexpr bfloat16_t lowest() {
    return from_bits(0xFF7F);
  }
  // Largest finite value.
  static constexpr bfloat16_t max() {
    return from_bits(0x7F7F);
  }
  // Pattern 0x3C00 (2^-7 in bfloat16 encoding).
  static constexpr bfloat16_t epsilon() {
    return from_bits(0x3C00);
  }
  // Positive infinity.
  static constexpr bfloat16_t infinity() {
    return from_bits(0x7F80);
  }
};

} // namespace mlx::core


================================================
FILE: mlx/utils.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <cstdlib>
#include <iostream>
#include <sstream>
#include <vector>

#include "mlx/dtype_utils.h"
#include "mlx/types/limits.h"
#include "mlx/utils.h"

namespace mlx::core {

// Resolve a StreamOrDevice to a concrete Stream:
//  - empty (monostate): the default stream of the default device
//  - Device: the default stream of that device
//  - Stream: the stream itself
Stream to_stream(StreamOrDevice s) {
  if (std::get_if<std::monostate>(&s) != nullptr) {
    return default_stream(default_device());
  }
  if (const auto* device = std::get_if<Device>(&s)) {
    return default_stream(*device);
  }
  return std::get<Stream>(s);
}

// Resolve a StreamOrDevice to a concrete Stream, using `default_` instead of
// the global default device when `s` is empty:
//  - empty (monostate): the default stream of `default_`
//  - Device: the default stream of that device
//  - Stream: the stream itself
Stream to_stream(StreamOrDevice s, Device default_) {
  if (std::get_if<std::monostate>(&s) != nullptr) {
    return default_stream(default_);
  }
  if (const auto* device = std::get_if<Device>(&s)) {
    return default_stream(*device);
  }
  return std::get<Stream>(s);
}

// Print a bool. With `capitalize_bool` set the literal text "True"/"False" is
// written; otherwise the value is streamed as-is, so the result depends on the
// stream's boolalpha setting.
void PrintFormatter::print(std::ostream& os, bool val) {
  if (!capitalize_bool) {
    os << val;
    return;
  }
  if (val) {
    os << "True";
  } else {
    os << "False";
  }
}
// Scalar overloads of PrintFormatter::print for the integer and real
// floating-point element types. Each one streams the value with the
// `operator<<` that applies to its type, without extra formatting.
inline void PrintFormatter::print(std::ostream& os, int16_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, uint16_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, int32_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, uint32_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, int64_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, uint64_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, float16_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, bfloat16_t val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, float val) {
  os << val;
}
inline void PrintFormatter::print(std::ostream& os, double val) {
  os << val;
}
// Print a complex value as `<real>+<imag>j` or `<real>-<|imag|>j`. A NaN
// imaginary part is always printed after a "+".
inline void PrintFormatter::print(std::ostream& os, complex64_t val) {
  const auto re = val.real();
  const auto im = val.imag();
  const bool use_plus = im >= 0 || std::isnan(im);
  os << re;
  os << (use_plus ? "+" : "-") << (use_plus ? im : -im) << "j";
}

// Return the process-wide PrintFormatter. The instance is a function-local
// static, so it is created on first use and the same object is returned on
// every call.
PrintFormatter& get_global_formatter() {
  static PrintFormatter formatter;
  return formatter;
}

// Report `error` on stderr and terminate the process with std::abort().
// The message is assembled first so it reaches the stream as a single string.
void abort_with_exception(const std::exception& error) {
  std::string text = "Terminating due to uncaught exception: ";
  text += error.what();
  std::cerr << text << std::endl;
  std::abort();
}

// Fold promote_types over the dtypes of `arrays`, starting from bool_. An
// empty input therefore yields bool_.
Dtype result_type(const std::vector<array>& arrays) {
  Dtype promoted = bool_;
  for (auto it = arrays.begin(); it != arrays.end(); ++it) {
    promoted = promote_types(promoted, it->dtype());
  }
  return promoted;
}

// Compute the broadcast of two shapes using numpy's rules
// (https://numpy.org/doc/1.20/user/theory.broadcasting.html): shapes are
// aligned on their trailing axes, and each aligned pair of sizes must either
// match or contain a 1. Throws std::invalid_argument otherwise.
Shape broadcast_shapes(const Shape& s1, const Shape& s2) {
  const int rank1 = s1.size();
  const int rank2 = s2.size();
  const bool first_is_longer = rank1 > rank2;
  const auto& longer = first_is_longer ? s1 : s2;
  const auto& shorter = first_is_longer ? s2 : s1;
  const int out_rank = std::max(rank1, rank2);
  const int offset = std::abs(rank1 - rank2);

  Shape result(out_rank);

  // Axes that only exist in the longer shape are taken unchanged.
  for (int i = 0; i < offset; ++i) {
    result[i] = longer[i];
  }

  // Axes present in both shapes must be compatible.
  for (int i = offset; i < out_rank; ++i) {
    auto dim_long = longer[i];
    auto dim_short = shorter[i - offset];
    const bool same = dim_short == dim_long;
    if (!same && dim_long != 1 && dim_short != 1) {
      std::ostringstream msg;
      msg << "[broadcast_shapes] Shapes " << s1 << " and " << s2
          << " cannot be broadcast.";
      throw std::invalid_argument(msg.str());
    }
    // When one side is 1 the product is the other side: 0 if either is 0,
    // otherwise max(dim_long, dim_short).
    result[i] = same ? dim_long : dim_long * dim_short;
  }
  return result;
}

// Map a possibly negative axis onto [0, ndim). Negative values count from the
// end (-1 is the last axis). Throws std::invalid_argument, with `msg_prefix`
// prepended to the message, when `axis` is outside [-ndim, ndim).
int normalize_axis_index(
    int axis,
    int ndim,
    const std::string& msg_prefix /* = "" */) {
  const bool in_range = axis >= -ndim && axis < ndim;
  if (in_range) {
    return axis >= 0 ? axis : axis + ndim;
  }
  std::ostringstream msg;
  msg << msg_prefix << "Axis " << axis << " is out of bounds for array with "
      << ndim << " dimensions.";
  throw std::invalid_argument(msg.str());
}

// Print a device as `Device(<type>, <index>)`, where <type> is "cpu" or
// "gpu". A type value matching neither enumerator prints an empty type.
std::ostream& operator<<(std::ostream& os, const Device& d) {
  const char* type_name = nullptr;
  switch (d.type) {
    case Device::cpu:
      type_name = "cpu";
      break;
    case Device::gpu:
      type_name = "gpu";
      break;
  }
  os << "Device(";
  if (type_name != nullptr) {
    os << type_name;
  }
  return os << ", " << d.index << ")";
}

// Print a stream as `Stream(<device>, <index>)`.
std::ostream& operator<<(std::ostream& os, const Stream& s) {
  os << "Stream(" << s.device;
  return os << ", " << s.index << ")";
}

// Print an int8_t as a number rather than as a character.
std::ostream& operator<<(std::ostream& os, int8_t x) {
  const int widened = static_cast<int>(x);
  return os << widened;
}

// Print a uint8_t as a number rather than as a character.
std::ostream& operator<<(std::ostream& os, uint8_t x) {
  const unsigned int widened = static_cast<unsigned int>(x);
  return os << widened;
}

namespace {

// Recursively print the sub-array of `a` along dimension `dim`, starting at
// element offset `index` into `a.data<T>()`. Along each dimension at most the
// first and last `num_print` entries are printed; when the dimension has more
// than 2 * num_print entries the middle is replaced by "...".
template <typename T>
void print_subarray(std::ostream& os, const array& a, size_t index, int dim) {
  int num_print = 3;
  int n = a.shape(dim);
  size_t s = a.strides()[dim];
  bool is_last = dim == a.ndim() - 1;
  // Rows of non-innermost dimensions go on their own line, indented by
  // 7 + dim spaces (presumably to line up under the "array(" prefix and the
  // opening brackets -- confirm against print_array).
  auto prefix = is_last ? "" : std::string(7 + dim, ' ');
  auto postfix = is_last ? ", " : ",\n";
  os << "[";
  for (int i = 0; i < n; ++i) {
    os << (i == 0 ? "" : prefix);
    if (i == num_print && n > 2 * num_print) {
      os << "...";
      // Jump so that, after the ++i and `index += s` at the end of this
      // iteration, the loop resumes at entry n - num_print.
      i = n - num_print - 1;
      index += s * (n - 2 * num_print - 1);
    } else if (is_last) {
      get_global_formatter().print(os, a.data<T>()[index]);
    } else {
      print_subarray<T>(os, a, index, dim + 1);
    }
    os << (i == n - 1 ? "" : postfix);
    // Advance to the next entry along this dimension.
    index += s;
  }
  os << "]";
}

// Print `a` as `array(<values>, dtype=<dtype>)`. Scalars (ndim == 0) print
// their single element; everything else goes through print_subarray.
//
// boolalpha is enabled while printing so that bools streamed as-is render as
// text. The caller's boolalpha setting is restored afterwards: previously the
// flag was cleared unconditionally, which silently changed the formatting
// state of streams that had boolalpha enabled before the call.
template <typename T>
void print_array(std::ostream& os, const array& a) {
  const bool caller_boolalpha =
      (os.flags() & std::ios_base::boolalpha) == std::ios_base::boolalpha;
  os << std::boolalpha;
  os << "array(";
  if (a.ndim() == 0) {
    auto data = a.data<T>();
    get_global_formatter().print(os, data[0]);
  } else {
    print_subarray<T>(os, a, 0, 0);
  }
  os << ", dtype=" << a.dtype() << ")";
  // Only undo what this function turned on.
  if (!caller_boolalpha) {
    os << std::noboolalpha;
  }
}

} // namespace

// Print a dtype using the text returned by dtype_to_string.
std::ostream& operator<<(std::ostream& os, const Dtype& dtype) {
  os << dtype_to_string(dtype);
  return os;
}

// Print the single-letter code of a dtype kind ("b", "i", "u", "f", "c" or
// "V"). A value matching none of the enumerators prints nothing.
std::ostream& operator<<(std::ostream& os, const Dtype::Kind& k) {
  const char* code = nullptr;
  switch (k) {
    case Dtype::Kind::b:
      code = "b";
      break;
    case Dtype::Kind::i:
      code = "i";
      break;
    case Dtype::Kind::u:
      code = "u";
      break;
    case Dtype::Kind::f:
      code = "f";
      break;
    case Dtype::Kind::c:
      code = "c";
      break;
    case Dtype::Kind::V:
      code = "V";
      break;
  }
  if (code != nullptr) {
    os << code;
  }
  return os;
}

std::ostream& operator<<(std::ostream& os, array a) {
  a.eval();
  dispatch_all_types(a.dtype(), [&](auto type_tag) {
    print_array<MLX_GET_TYPE(type_tag)>(os, a);
  });
  return os;
}

namespace env {

// Read the environment variable `name` as a base-10 integer.
//
// Returns `default_value` when the variable is not set. When it is set, the
// leading integer is parsed the way atoi would (leading whitespace skipped,
// trailing garbage ignored, 0 when no digits are found). Values that do not
// fit in an int are clamped to the int range; with atoi that case was
// undefined behaviour.
int get_var(const char* name, int default_value) {
  const char* buff_str = std::getenv(name);
  if (buff_str == nullptr) {
    return default_value;
  }
  const long parsed = std::strtol(buff_str, nullptr, 10);
  constexpr long int_max = std::numeric_limits<int>::max();
  constexpr long int_min = std::numeric_limits<int>::min();
  if (parsed > int_max) {
    return std::numeric_limits<int>::max();
  }
  if (parsed < int_min) {
    return std::numeric_limits<int>::min();
  }
  return static_cast<int>(parsed);
}

// Read the environment variable `name` as a string, returning a copy of
// `default_value` when the variable is not set.
std::string get_var(const char* name, const char* default_value) {
  if (const char* buff_str = std::getenv(name)) {
    return buff_str;
  } else {
    return default_value;
  }
}

} // namespace env

template <typename T>
void set_finfo_limits(double& min, double& max, double& eps) {
  min = numeric_limits<T>::lowest();
  max = numeric_limits<T>::max();
  eps = numeric_limits<T>::epsilon();
}

// Populate min/max/eps for an inexact dtype.
//
// Throws std::invalid_argument when `dtype` is not a sub-dtype of `inexact`.
// For complex64 the stored dtype is switched to float32 and the float limits
// are reported.
finfo::finfo(Dtype dtype) : dtype(dtype) {
  if (!issubdtype(dtype, inexact)) {
    std::ostringstream msg;
    msg << "[finfo] dtype " << dtype << " is not inexact.";
    throw std::invalid_argument(msg.str());
  }

  if (dtype == complex64) {
    this->dtype = float32;
    set_finfo_limits<float>(min, max, eps);
    return;
  }

  if (dtype == float64) {
    set_finfo_limits<double>(min, max, eps);
  } else if (dtype == bfloat16) {
    set_finfo_limits<bfloat16_t>(min, max, eps);
  } else if (dtype == float16) {
    set_finfo_limits<float16_t>(min, max, eps);
  } else if (dtype == float32) {
    set_finfo_limits<float>(min, max, eps);
  }
}

// Fill `min` and `max` with the smallest and largest values of integer type T.
template <typename T>
void set_iinfo_limits(int64_t& min, uint64_t& max) {
  using limits = std::numeric_limits<T>;
  max = limits::max();
  min = limits::min();
}

// Populate min/max for an integer dtype by dispatching on `dtype` to the
// matching set_iinfo_limits instantiation. The "[iinfo]" tag is handed to
// dispatch_int_types (presumably for its error message -- confirm).
iinfo::iinfo(Dtype dtype) : dtype(dtype) {
  auto fill_limits = [&](auto type_tag) {
    using T = MLX_GET_TYPE(type_tag);
    set_iinfo_limits<T>(min, max);
  };
  dispatch_int_types(dtype, "[iinfo]", fill_limits);
}

} // namespace mlx::core


================================================
FILE: mlx/utils.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <exception>
#include <variant>

#include "mlx/api.h"
#include "mlx/array.h"
#include "mlx/device.h"
#include "mlx/dtype.h"
#include "mlx/stream.h"

namespace mlx::core {

using StreamOrDevice = std::variant<std::monostate, Stream, Device>;
MLX_API Stream to_stream(StreamOrDevice s);
MLX_API Stream to_stream(StreamOrDevice s, Device default_);

// Scoped override of the default device and stream.
//
// The constructor records the current default stream of the current default
// device, then makes the requested stream (and its device) the default. The
// destructor puts the recorded stream and its device back.
struct StreamContext {
 public:
  // Throws std::runtime_error when `s` holds neither a Stream nor a Device.
  StreamContext(StreamOrDevice s) : _stream(default_stream(default_device())) {
    const bool unspecified = std::holds_alternative<std::monostate>(s);
    if (unspecified) {
      throw std::runtime_error(
          "[StreamContext] Invalid argument, please specify a stream or device.");
    }
    const auto target = to_stream(s);
    set_default_device(target.device);
    set_default_stream(target);
  }

  ~StreamContext() {
    set_default_device(_stream.device);
    set_default_stream(_stream);
  }

 private:
  // Default stream that was active when this context was created.
  Stream _stream;
};

// Formatting hooks used when array elements are printed: one `print` overload
// per supported element type. The overloads are only declared here; their
// definitions are outside this section.
struct PrintFormatter {
  inline void print(std::ostream& os, bool val);
  inline void print(std::ostream& os, int16_t val);
  inline void print(std::ostream& os, uint16_t val);
  inline void print(std::ostream& os, int32_t val);
  inline void print(std::ostream& os, uint32_t val);
  inline void print(std::ostream& os, int64_t val);
  inline void print(std::ostream& os, uint64_t val);
  inline void print(std::ostream& os, float16_t val);
  inline void print(std::ostream& os, bfloat16_t val);
  inline void print(std::ostream& os, float val);
  inline void print(std::ostream& os, double val);
  inline void print(std::ostream& os, complex64_t val);

  // NOTE(review): presumably selects "True"/"False" over "true"/"false" when
  // printing booleans -- confirm against the print(bool) definition.
  bool capitalize_bool{false};
};

MLX_API PrintFormatter& get_global_formatter();

/** Print the exception and then abort. */
MLX_API void abort_with_exception(const std::exception& error);

/**
 * Holds information about floating-point types.
 *
 * The constructor is declared here and defined in the implementation file.
 * Limits are stored as doubles regardless of the described dtype.
 */
struct MLX_API finfo {
  explicit finfo(Dtype dtype);
  Dtype dtype;
  double min;
  double max;
  double eps;
};

/**
 * Holds information about integral types.
 *
 * `min` is stored signed and `max` unsigned so that one pair of fields can
 * describe both signed and unsigned 64-bit ranges.
 */
struct MLX_API iinfo {
  explicit iinfo(Dtype dtype);
  Dtype dtype;
  int64_t min;
  uint64_t max;
};

/** The dtype obtained by promoting the dtypes of two arrays. */
inline Dtype result_type(const array& a, const array& b) {
  const Dtype lhs = a.dtype();
  const Dtype rhs = b.dtype();
  return promote_types(lhs, rhs);
}
/** The dtype obtained by promoting a with b, then that result with c. */
inline Dtype result_type(const array& a, const array& b, const array& c) {
  const Dtype first_two = promote_types(a.dtype(), b.dtype());
  return promote_types(first_two, c.dtype());
}
MLX_API Dtype result_type(const std::vector<array>& arrays);

MLX_API Shape broadcast_shapes(const Shape& s1, const Shape& s2);

/**
 * Returns the axis normalized to be in the range [0, ndim).
 */
MLX_API int
normalize_axis_index(int axis, int ndim, const std::string& msg_prefix = "");

MLX_API std::ostream& operator<<(std::ostream& os, const Device& d);
MLX_API std::ostream& operator<<(std::ostream& os, const Stream& s);
MLX_API std::ostream& operator<<(std::ostream& os, const Dtype& d);
MLX_API std::ostream& operator<<(std::ostream& os, const Dtype::Kind& k);
MLX_API std::ostream& operator<<(std::ostream& os, array a);
// Stream a complex value as "<real><sign><imag>j". An explicit "+" is written
// only when the imaginary part compares >= 0.
inline std::ostream& operator<<(std::ostream& os, const complex64_t& v) {
  os << v.real();
  if (v.imag() >= 0) {
    os << "+";
  } else {
    os << "";
  }
  os << v.imag() << "j";
  return os;
}
// Stream a float16_t by converting it to float first.
inline std::ostream& operator<<(std::ostream& os, const float16_t& v) {
  const float widened = static_cast<float>(v);
  return os << widened;
}
// Stream a bfloat16_t by converting it to float first.
inline std::ostream& operator<<(std::ostream& os, const bfloat16_t& v) {
  const float widened = static_cast<float>(v);
  return os << widened;
}

// Stream a vector-like container as "(e0,e1,...)"; an empty container prints
// as "()". Enabled only for types accepted by is_vector_v.
template <typename Vec, typename = std::enable_if_t<is_vector_v<Vec>>>
inline std::ostream& operator<<(std::ostream& os, const Vec& v) {
  os << "(";
  const auto last = v.end();
  for (auto it = v.begin(); it != last;) {
    os << *it;
    ++it;
    // A comma follows every element except the final one.
    if (it != last) {
      os << ",";
    }
  }
  os << ")";
  return os;
}

// True when `n` is a positive power of two (1, 2, 4, ...).
//
// The sign check runs first so that `n - 1` is never evaluated for
// non-positive values; in particular INT_MIN would otherwise overflow.
inline bool is_power_of_2(int n) {
  return n > 0 && (n & (n - 1)) == 0;
}

// Smallest power of two that is >= n, for positive n.
//
// Computed with integer doubling rather than floating-point pow/log2, so no
// <cmath> dependency and no rounding. Returns 0 for n == 0 (as the previous
// floating-point version did) and for negative n, where that version produced
// NaN and an undefined conversion to int.
// NOTE: the result must fit in an int, so callers should keep n within the
// largest power of two an int can represent.
inline int next_power_of_2(int n) {
  if (n <= 0) {
    return 0;
  }
  const unsigned int target = static_cast<unsigned int>(n);
  unsigned int power = 1;
  while (power < target) {
    power <<= 1;
  }
  return static_cast<int>(power);
}

// Accessors for MLX_* environment variables. Each accessor reads its variable
// once, on first call, and returns the cached value afterwards.
namespace env {

int get_var(const char* name, int default_value);
std::string get_var(const char* name, const char* default_value);

// MLX_BFS_MAX_WIDTH, default 20.
inline int bfs_max_width() {
  static const int value = get_var("MLX_BFS_MAX_WIDTH", 20);
  return value;
}

// MLX_MAX_OPS_PER_BUFFER; `default_value` only matters on the first call.
inline int max_ops_per_buffer(int default_value) {
  static const int value = get_var("MLX_MAX_OPS_PER_BUFFER", default_value);
  return value;
}

// MLX_MAX_MB_PER_BUFFER; `default_value` only matters on the first call.
inline int max_mb_per_buffer(int default_value) {
  static const int value = get_var("MLX_MAX_MB_PER_BUFFER", default_value);
  return value;
}

// MLX_METAL_FAST_SYNCH as a boolean, default off.
inline bool metal_fast_synch() {
  static const bool value = get_var("MLX_METAL_FAST_SYNCH", 0);
  return value;
}

// MLX_ENABLE_TF32 as a boolean, default on.
inline bool enable_tf32() {
  static const bool value = get_var("MLX_ENABLE_TF32", 1);
  return value;
}

// MLX_NCCL_TIMEOUT; `default_value` only matters on the first call.
inline int nccl_timeout(int default_value) {
  static const int value = get_var("MLX_NCCL_TIMEOUT", default_value);
  return value;
}

// MLX_METAL_GPU_ARCH, default empty string.
inline const std::string& metal_gpu_arch() {
  static const std::string value = get_var("MLX_METAL_GPU_ARCH", "");
  return value;
}

} // namespace env

} // namespace mlx::core


================================================
FILE: mlx/version.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/version.h"

namespace mlx::core {

// Return the MLX version string held in the MLX_VERSION macro.
// NOTE(review): MLX_VERSION is not defined in the visible part of version.h;
// presumably it is injected by the build system -- confirm.
const char* version() {
  return MLX_VERSION;
}

} // namespace mlx::core


================================================
FILE: mlx/version.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/api.h"

// Version components, plus a single integer that encodes them as
// major * 100000 + minor * 1000 + patch (0.31.2 becomes 31002).
#define MLX_VERSION_MAJOR 0
#define MLX_VERSION_MINOR 31
#define MLX_VERSION_PATCH 2
#define MLX_VERSION_NUMERIC \
  (100000 * MLX_VERSION_MAJOR + 1000 * MLX_VERSION_MINOR + MLX_VERSION_PATCH)

namespace mlx::core {

/* A string representation of the MLX version in the format
 * "major.minor.patch".
 *
 * For dev builds, the version will include the suffix ".devYYYYMMDD+hash"
 */
MLX_API const char* version();

} // namespace mlx::core


================================================
FILE: mlx.pc.in
================================================
# Find MLX
#
# Defines the following variables:
#
#   MLX_FOUND            : True if MLX is found
#   MLX_INCLUDE_DIRS     : Include directories
#   MLX_LIBRARY_DIRS     : Library directory
#   MLX_LIBRARY          : Path to the mlx library found in MLX_LIBRARY_DIRS
#   MLX_LIBRARIES        : Libraries to link against
#   MLX_CXX_FLAGS        : Additional compiler flags
#   MLX_BUILD_ACCELERATE : True if MLX was built with Accelerate
#   MLX_BUILD_METAL      : True if MLX was built with Metal

@PACKAGE_INIT@

# Pull in the exported targets and the helper module shipped with MLX.
include(@PACKAGE_MLX_CMAKE_INSTALL_MODULE_DIR@/MLXTargets.cmake)
include(@PACKAGE_MLX_CMAKE_INSTALL_MODULE_DIR@/extension.cmake)

# Install locations; set_and_check fails if a path does not exist.
set_and_check(MLX_LIBRARY_DIRS @PACKAGE_CMAKE_INSTALL_LIBDIR@)
set_and_check(MLX_INCLUDE_DIRS @PACKAGE_CMAKE_INSTALL_INCLUDEDIR@)
set(MLX_LIBRARIES mlx)

find_library(MLX_LIBRARY mlx PATHS ${MLX_LIBRARY_DIRS})

# Compile definition added for consumers when MLX was built with Accelerate.
if (@MLX_BUILD_ACCELERATE@)
    set(MLX_BUILD_ACCELERATE @MLX_BUILD_ACCELERATE@)
    set(MLX_CXX_FLAGS ${MLX_CXX_FLAGS} -DACCELERATE_NEW_LAPACK)
endif()

# Metal builds also expose the metal_cpp headers and the kernel headers that
# match the Metal version MLX was built against.
if (@MLX_BUILD_METAL@)
    set(MLX_BUILD_METAL @MLX_BUILD_METAL@)
    set(MLX_CXX_FLAGS ${MLX_CXX_FLAGS} -D_METAL_)
    set(MLX_INCLUDE_DIRS 
        "${MLX_INCLUDE_DIRS};"
        @PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/metal_cpp
    )
    if(@MLX_METAL_VERSION@ GREATER_EQUAL 310)
      set(MLX_INCLUDE_DIRS
        "${MLX_INCLUDE_DIRS};"
        @PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/mlx/backend/metal/kernels/metal_3_1)
    else()
      set(MLX_INCLUDE_DIRS
        "${MLX_INCLUDE_DIRS};"
        @PACKAGE_CMAKE_INSTALL_INCLUDEDIR@/mlx/backend/metal/kernels/metal_3_0)
    endif()
endif()

# Propagate the collected flags to anything that links against the mlx target.
set_target_properties(mlx PROPERTIES
    CXX_STANDARD 17
    INTERFACE_COMPILE_OPTIONS "${MLX_CXX_FLAGS}"
)

# Sets MLX_FOUND once both the library and the include directories resolved.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MLX DEFAULT_MSG MLX_LIBRARY MLX_INCLUDE_DIRS)


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = [
  "setuptools>=80",
  "cmake>=3.25",
  "typing_extensions",
]
build-backend = "setuptools.build_meta"


================================================
FILE: python/mlx/__main__.py
================================================
import argparse


def main() -> None:
    """Command-line entry point for ``python -m mlx``.

    Supports ``--version`` (prints the MLX version and exits) and
    ``--cmake-dir`` (prints the directory that contains this file).
    """
    from mlx.core import __version__

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--version",
        action="version",
        version=__version__,
        help="Print the version number.",
    )
    cli.add_argument(
        "--cmake-dir",
        action="store_true",
        help="Print the path to the MLX CMake module directory.",
    )
    options = cli.parse_args()
    if not options.cmake_dir:
        return

    # Imported lazily: only this branch needs pathlib.
    from pathlib import Path

    print(Path(__file__).parent)


if __name__ == "__main__":
    main()


================================================
FILE: python/mlx/_distributed_utils/common.py
================================================
# Copyright © 2025 Apple Inc.

import argparse
import ipaddress
import json
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, Union


@dataclass
class Host:
    """One machine taking part in a distributed job."""

    # Index of the host within the hostfile / host list.
    rank: int
    # Name or address handed to `ssh` to reach the host (the "ssh" hostfile key).
    ssh_hostname: str
    # IP addresses the host can be reached on (the "ips" hostfile key).
    ips: list[str]
    # One entry per peer (the "rdma" hostfile key): None, a device name, or a
    # list of device names.
    rdma: list[Optional[Union[str, list[str]]]]


@dataclass
class Hostfile:
    """The set of hosts for a distributed job plus optional launch settings."""

    hosts: list[Host]
    backend: str = ""
    envs: list[str] = field(default_factory=list)

    def to_json(self):
        """Return the JSON-serializable dict form accepted by ``from_file``."""
        return {
            "backend": self.backend,
            "envs": self.envs,
            "hosts": [
                {"ssh": h.ssh_hostname, "ips": h.ips, "rdma": h.rdma}
                for h in self.hosts
            ],
        }

    @classmethod
    def from_file(cls, hostfile):
        """Parse the json hostfile that contains both the hostnames to ssh into and
        the ips to communicate over when using the ring backend. It can also
        contain the backend to be used and environment variables to set when
        launching a distributed job.

        The file is either a dict with a "hosts" list (and optionally "backend"
        and "envs") or a bare list of hosts.

        Example:

            {
                "backend": "jaccl",
                "envs": [
                    "MLX_METAL_FAST_SYNCH=1"
                ],
                "hosts": [
                    {"ssh": "hostname1", "ips": ["123.123.123.1"], "rdma": [null, "rdma_en2", "rdma_en3"]},
                    {"ssh": "hostname2", "ips": ["123.123.123.2"], "rdma": ["rdma_en2", null, "rdma_en3"]},
                    ...
                    {"ssh": "hostnameN", "ips": ["123.123.123.N"], "rdma": ["rdma_en2", "rdma_en3", null]},
                ]
            }

        Args:
            hostfile (str): The path to the json file containing the host
                information

        Raises:
            ValueError: If the file does not exist or cannot be parsed.
        """
        hostfile = Path(hostfile)
        if not hostfile.exists():
            raise ValueError(f"Hostfile {str(hostfile)} doesn't exist")

        try:
            # Context manager so the file handle is closed even if parsing fails.
            with open(hostfile) as f:
                data = json.load(f)
            backend = ""
            envs = []
            hosts = []
            if isinstance(data, dict):
                # "backend" and "envs" are optional; only "hosts" is required.
                backend = data.get("backend", "")
                envs = data.get("envs", [])
                hosts = data["hosts"]
            elif isinstance(data, list):
                hosts = data

            hosts = [
                Host(i, h["ssh"], h.get("ips", []), h.get("rdma", []))
                for i, h in enumerate(hosts)
            ]

            return cls(hosts, backend, envs)

        except Exception as e:
            raise ValueError(
                f"Failed to parse hostfile {str(hostfile)} ({str(e)})"
            ) from e

    @classmethod
    def from_list(cls, hostlist, repeats=1):
        """Build a hostfile from a comma separated list of hostnames or IPs.

        Each entry is repeated ``repeats`` times. Ranks are assigned
        sequentially over the resulting hosts, and every host gets its own
        ``ips`` list so that later in-place updates do not leak between
        repeated entries.

        Raises:
            ValueError: If any entry of the list is empty.
        """
        hosts = []
        for h in hostlist.split(","):
            if h == "":
                raise ValueError("Hostname cannot be empty")
            try:
                ipaddress.ip_address(h)
                ips = [h]
            except ValueError:
                # Not an IP literal: treat it as a hostname with no known ips.
                ips = []
            for _ in range(repeats):
                hosts.append(Host(len(hosts), h, list(ips), []))
        return cls(hosts)


class OptionalBoolAction(argparse.Action):
    """argparse action for paired ``--flag`` / ``--no-flag`` options.

    Stores ``False`` when the option used starts with ``--no-`` and ``True``
    otherwise.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        negated = option_string.startswith("--no-")
        setattr(namespace, self.dest, not negated)


def positive_number(x):
    """Convert ``x`` to an int and require it to be strictly positive.

    Raises:
        ValueError: If ``x`` is not a valid integer or is <= 0.
    """
    value = int(x)
    if value > 0:
        return value
    raise ValueError("Number should be positive")


def log(verbose, *args, **kwargs):
    """Print a green ``[INFO]`` message to stderr, but only when ``verbose``.

    Extra keyword arguments are forwarded to ``print``; any ``file`` argument
    is overridden with ``sys.stderr``.
    """
    if verbose:
        kwargs["file"] = sys.stderr
        print("\033[32m[INFO]", *args, "\033[0m", **kwargs)


def log_warning(*args, **kwargs):
    """Print a yellow ``[WARN]`` message to stderr.

    Extra keyword arguments are forwarded to ``print``; any ``file`` argument
    is overridden with ``sys.stderr``.
    """
    print_options = {**kwargs, "file": sys.stderr}
    print("\033[33m[WARN]", *args, "\033[0m", **print_options)


def log_error(*args, **kwargs):
    """Print a red ``[ERROR]`` message to stderr.

    Extra keyword arguments are forwarded to ``print``; any ``file`` argument
    is overridden with ``sys.stderr``.
    """
    print_options = {**kwargs, "file": sys.stderr}
    print("\033[31m[ERROR]", *args, "\033[0m", **print_options)


================================================
FILE: python/mlx/_distributed_utils/config.py
================================================
# Copyright © 2025 Apple Inc.

import argparse
import json
import shlex
import sys
import threading
from collections import defaultdict
from dataclasses import dataclass
from subprocess import DEVNULL, run
from typing import Optional

import mlx.core as mx

from .common import (
    Host,
    Hostfile,
    OptionalBoolAction,
    log,
    log_error,
    log_warning,
)


@dataclass
class SSHInfo:
    """Result of probing a single host over ssh."""

    # True when a non-interactive ssh command on the host exited with 0.
    can_ssh: bool
    # True when a non-interactive `sudo` command on the host exited with 0.
    has_sudo: bool

    def __bool__(self):
        # Truthy exactly when the host is reachable over ssh, so a list of
        # SSHInfo objects can be filtered or passed to all() directly.
        return self.can_ssh


@dataclass
class ThunderboltPort:
    """One thunderbolt port of a host."""

    # Network interface name that backs the port.
    iface: str
    # Domain uuid of this port.
    uuid: str
    # Domain uuid of the port on the other end of the cable, or None when
    # nothing is connected.
    connected_to: Optional[str]


@dataclass
class ThunderboltHost:
    """A host together with its thunderbolt ports."""

    # Device name reported for the host's thunderbolt controller.
    name: str
    # Ports of the host.
    ports: list[ThunderboltPort]


def add_ips(hosts, verbose=False):
    """Append one IP address to ``h.ips`` for every host, queried over ssh.

    Interface ``en0`` is tried first and ``en1`` second; the first non-empty
    answer wins. A warning is logged for hosts where neither returns an
    address.
    """
    for host in hosts:
        log(verbose, "Getting the ip from", host.ssh_hostname)
        for interface in ("en0", "en1"):
            answer = run(
                ["ssh", host.ssh_hostname, "ipconfig", "getifaddr", interface],
                capture_output=True,
                text=True,
            )
            address = answer.stdout.strip()
            if address != "":
                host.ips.append(address)
                break
        else:
            # Neither interface produced an address.
            log_warning("Could not extract ip for", host.ssh_hostname)


def save_hostfile(args, hostfile):
    """Write the hostfile as indented JSON.

    Goes to ``args.output_hostfile`` when that is set; otherwise it is printed
    to stdout under a "Hostfile" heading.
    """
    if not args.output_hostfile:
        print("Hostfile")
        print("========")
        print(json.dumps(hostfile.to_json(), indent=4))
        return

    with open(args.output_hostfile, "w") as out:
        json.dump(hostfile.to_json(), out, indent=4)


def check_rdma(hosts, verbose=False, strict=True):
    """Check over ssh that every host lists at least one ``rdma_*`` device.

    Runs ``ibv_devices`` on each host. Problems are reported as errors when
    ``strict`` and as warnings otherwise; in strict mode the process exits
    with status 1 on failure.

    Returns:
        bool: True when all hosts expose an RDMA device.
    """
    report = log_error if strict else log_warning
    failed = False
    for host in hosts:
        log(verbose, "Checking that", host.ssh_hostname, "supports RDMA")
        listing = run(
            ["ssh", host.ssh_hostname, "ibv_devices"],
            capture_output=True,
            text=True,
        ).stdout
        devices = [d for d in listing.strip().split() if d.startswith("rdma_")]
        if len(devices) == 0:
            report(host.ssh_hostname, "does not seem to have RDMA enabled")
            failed = True

    if failed:
        report()
        report("Some of the hosts don't have RDMA enabled or they don't support RDMA.")
        report()
        report("See https://ml-explore.github.io/mlx/build/html/usage/distributed.html")
        report("for instructions on how to enable RDMA.")
        if strict:
            sys.exit(1)

    return not failed


def can_auto_setup(hosts, sshinfo, auto_setup=False):
    """Return True when every host has passwordless sudo.

    When automatic setup was requested but is not possible, the hosts lacking
    sudo are listed in a warning.
    """
    everyone_has_sudo = all(info.has_sudo for info in sshinfo)
    if auto_setup and not everyone_has_sudo:
        log_warning(
            "Automatic setup requested but the following hosts do not have passwordless sudo"
        )
        for host, info in zip(hosts, sshinfo):
            if info.has_sudo:
                continue
            log_warning(" - ", host.ssh_hostname)
    return everyone_has_sudo


class IPConfigurator:
    """Assigns a private pair of IPs to every thunderbolt link between hosts.

    After construction ``self.ips[a, b]`` is a list of ``(iface, ip)`` tuples,
    one per link from node ``a`` to node ``b``, describing ``a``'s end of the
    link; the matching entry of ``self.ips[b, a]`` describes the other end.
    """

    def __init__(self, hosts, tb_hosts, uuid_reverse_index):
        # (node, port) pairs that already got an address, so a link is not
        # configured a second time when it is reached from its other end.
        assigned = set()
        ips = defaultdict(list)
        # Third and fourth octet of the next free 192.168.x.y block.
        ip0 = 0
        ip1 = 0
        for src_node, h in enumerate(tb_hosts):
            for src_port, p in enumerate(h.ports):
                # Skip ports with nothing plugged in or plugged into a machine
                # that is not one of our hosts.
                if not p.connected_to:
                    continue
                if p.connected_to not in uuid_reverse_index:
                    continue
                if (src_node, src_port) in assigned:
                    continue

                dst_node, dst_port = uuid_reverse_index[p.connected_to]

                ip_src = f"192.168.{ip0}.{ip1 + 1}"
                ip_dst = f"192.168.{ip0}.{ip1 + 2}"
                iface_src = p.iface
                iface_dst = tb_hosts[dst_node].ports[dst_port].iface

                ips[src_node, dst_node].append((iface_src, ip_src))
                ips[dst_node, src_node].append((iface_dst, ip_dst))

                assigned.add((src_node, src_port))
                assigned.add((dst_node, dst_port))

                # Every link gets its own block of 4 addresses, matching the
                # 255.255.255.252 netmask applied in setup().
                ip1 += 4
                if ip1 > 255:
                    ip0 += 1
                    ip1 = 0
                if ip0 > 255:
                    raise ValueError("Ran out of available local IPs")

        self.ips = ips
        self.hosts = hosts
        self.tb_hosts = tb_hosts

    def setup(self, verbose=False, auto_setup=False):
        """Apply the assigned addresses on every host.

        For each host a shell script is built that brings ``bridge0`` down,
        assigns each link's address and routes the peer's address through the
        link's interface. With ``auto_setup`` the script is executed over ssh;
        otherwise it is printed and the user is asked to press Enter.
        """
        netmask = "255.255.255.252"
        for i, (h, th) in enumerate(zip(self.hosts, self.tb_hosts)):
            command = ""
            command += "sudo ifconfig bridge0 down\n"
            for j in range(len(self.hosts)):
                if i == j or (i, j) not in self.ips:
                    continue
                # Pair each of our link ends with the peer's end of that link.
                for (iface, ip), (_, peer) in zip(self.ips[i, j], self.ips[j, i]):
                    command += f"sudo ifconfig {iface} inet {ip} netmask {netmask}\n"
                    command += f"sudo route change {peer} -interface {iface}\n"
            if auto_setup:
                print(f"Running auto setup for {h.ssh_hostname}")
                # Collapse the script into a single line for the remote shell.
                command = command.strip().replace("\n", " ; ")
                command = ["ssh", h.ssh_hostname, command]
                log(verbose, shlex.join(command))
                run(command)
            else:
                msg = f"Setup for {h.ssh_hostname}"
                print(msg)
                print("=" * len(msg))
                print(command)
                input("Enter to continue")
            print()


def parse_hardware_ports(ports_string):
    """Map hardware port names to device names.

    ``ports_string`` is the raw (bytes) listing in which each port is described
    by a ``Hardware Port: <name>`` line followed by a ``Device: <device>`` line.
    """
    port_prefix = "Hardware Port: "
    device_prefix = "Device: "

    mapping = {}
    current_port = None
    for line in ports_string.decode("utf-8").split("\n"):
        if line.startswith("Hardware Port:"):
            current_port = line.strip()[len(port_prefix) :]
        elif line.startswith("Device:"):
            mapping[current_port] = line.strip()[len(device_prefix) :]
            current_port = None
    return mapping


def extract_connectivity(hosts, verbose):
    """Query every host over ssh and describe its thunderbolt ports.

    Returns:
        tuple: ``(tb_hosts, uuid_reverse_index)`` where ``tb_hosts`` has one
        ``ThunderboltHost`` per entry of ``hosts`` (ports sorted by interface
        name) and ``uuid_reverse_index`` maps a port uuid to its
        ``(host index, port index)``.
    """
    # Extract the current connectivity from the remote hosts
    thunderbolt_connections = []
    for h in hosts:
        log(verbose, "Getting connectivity from", h.ssh_hostname)
        thunderbolt_connections.append(
            json.loads(
                run(
                    [
                        "ssh",
                        h.ssh_hostname,
                        "system_profiler",
                        "SPThunderboltDataType",
                        "-json",
                    ],
                    capture_output=True,
                ).stdout
            )
        )
    # Map "Hardware Port" names to network interface names on every host.
    interface_maps = []
    for h in hosts:
        log(verbose, "Getting interface names from", h.ssh_hostname)
        interface_maps.append(
            parse_hardware_ports(
                run(
                    [
                        "ssh",
                        h.ssh_hostname,
                        "networksetup",
                        "-listallhardwareports",
                    ],
                    capture_output=True,
                ).stdout
            )
        )

    # Parse the connectivity into some simple dataclasses
    tb_hosts = []
    for c, iface_map in zip(thunderbolt_connections, interface_maps):
        name = ""
        ports = []
        for t in c["SPThunderboltDataType"]:
            uuid = t.get("domain_uuid_key")
            # Entries without a domain uuid are ignored.
            if uuid is None:
                continue
            name = t["device_name_key"]
            tag = t["receptacle_1_tag"]["receptacle_id_key"]
            items = t.get("_items", [])
            # The first attached item carrying a domain uuid is taken as the
            # peer at the other end of the cable.
            connected_items = [item for item in items if "domain_uuid_key" in item]
            connected_to = (
                connected_items[0]["domain_uuid_key"] if connected_items else None
            )
            iface = iface_map[f"Thunderbolt {tag}"]
            ports.append(ThunderboltPort(iface, uuid, connected_to))
        tb_hosts.append(ThunderboltHost(name, sorted(ports, key=lambda x: x.iface)))

    # Create a reverse index to be able to map uuids to (host, port) quickly
    uuid_reverse_index = {}
    for i, h in enumerate(tb_hosts):
        for j, p in enumerate(h.ports):
            uuid_reverse_index[p.uuid] = (i, j)

    return tb_hosts, uuid_reverse_index


def make_connectivity_matrix(tb_hosts, uuid_reverse_index):
    """Count the thunderbolt links between every pair of hosts.

    Returns:
        list[list[int]]: ``matrix[i][j]`` is the number of ports of host ``i``
        that are connected to a port of host ``j``.
    """
    size = len(tb_hosts)
    matrix = []
    for host in tb_hosts:
        row = [0] * size
        for port in host.ports:
            if port.connected_to not in uuid_reverse_index:
                continue
            peer_node = uuid_reverse_index[port.connected_to][0]
            row[peer_node] += 1
        matrix.append(row)
    return matrix


def tb_connectivity_to_dot(hosts, tb_hosts, uuid_reverse_index):
    """Print the thunderbolt topology as an undirected DOT graph.

    Every host becomes a node labelled with its ssh hostname and every link
    becomes an edge labelled with the two interface names. A link is emitted
    only from its lower-indexed end so that it is not printed twice.
    """

    def node_id(index):
        # Short lowercase identifier: a, b, ..., z, ab, bb, ...
        letters = chr(97 + index % 26)
        index //= 26
        while index != 0:
            letters += chr(97 + index % 26)
            index //= 26
        return letters

    names = [node_id(i) for i in range(len(tb_hosts))]

    print("graph G {")
    print("  node [shape=rectangle];")
    for i, h in enumerate(hosts):
        print(f'  {names[i]} [label="{h.ssh_hostname}"];')
    for src, tb_host in enumerate(tb_hosts):
        for port in tb_host.ports:
            if not port.connected_to:
                continue
            if port.connected_to not in uuid_reverse_index:
                continue
            dst_node, dst_port = uuid_reverse_index[port.connected_to]
            if dst_node < src:
                continue
            print(f"  {names[src]} -- {names[dst_node]}", end="")
            peer_iface = tb_hosts[dst_node].ports[dst_port].iface
            print(f' [label="{port.iface}/{peer_iface}"]')
    print("}")


def extract_rings(connectivity):
    """Find the rings (cycles) in a connectivity matrix.

    Returns:
        list[tuple[list[int], int]]: ``(ring, count)`` pairs sorted by
        decreasing ring length. ``ring`` is the node order and ``count`` is the
        smallest number of links on any hop of the ring. Only the first ring
        found for a given set of nodes is kept.
    """
    num_nodes = len(connectivity)

    def cycles_from(origin, current, trail, seen):
        # Depth-first walk that yields a copy of the trail each time an edge
        # leads back to `origin`.
        trail.append(current)
        seen.add(current)
        for neighbour in range(num_nodes):
            if connectivity[current][neighbour] <= 0:
                continue
            if neighbour == origin:
                yield trail[:]
            if neighbour not in seen:
                yield from cycles_from(origin, neighbour, trail, seen)
        trail.pop()
        seen.remove(current)

    # Keyed by the sorted node set; dicts keep insertion order so the first
    # ring seen for a node set is also the one that is reported.
    unique = {}
    for origin in range(num_nodes):
        for ring in cycles_from(origin, origin, [], set()):
            key = tuple(sorted(ring))
            if key in unique:
                continue
            hops = zip(ring, ring[1:] + ring[:1])
            unique[key] = (ring, min(connectivity[a][b] for a, b in hops))

    rings = list(unique.values())
    rings.sort(key=lambda entry: -len(entry[0]))
    return rings


def check_valid_mesh(hosts, connectivity, strict=True):
    """Check that every host is directly connected to every other host.

    In strict mode the first missing connection is reported and the process
    exits with status 1; otherwise False is returned.
    """
    size = len(connectivity)
    for src in range(size):
        for dst in range(size):
            if src == dst or connectivity[src][dst] > 0:
                continue
            if not strict:
                return False
            log_error(
                f"Incomplete mesh, {hosts[src].ssh_hostname} is not connected to {hosts[dst].ssh_hostname}"
            )
            log_error()
            log_error("Try passing --dot to visualize the connectivity")
            sys.exit(1)
    return True


def check_valid_ring(hosts, rings, strict=True):
    """Check that the longest ring found spans all hosts.

    Args:
        hosts: The hosts that should form the ring.
        rings: ``(ring, count)`` pairs sorted by decreasing ring length, where
            ``ring`` is a list of indices into ``hosts``.
        strict: When True, report the problem (including the partial rings that
            were found) and exit with status 1 instead of returning False.

    Returns:
        bool: True when ``rings[0]`` contains as many nodes as there are hosts.
    """
    has_ring = len(rings) > 0 and len(rings[0][0]) == len(hosts)
    if strict and not has_ring:
        log_error("Could not find a full ring.")
        log_error()
        log_error("Try passing --dot to visualize the connectivity")
        if len(rings) > 0:
            log_error("Rings found:")
            # Each entry is a (ring, count) pair; only the node list is shown.
            for ring, _ in rings:
                log_error(f" - {','.join(hosts[i].ssh_hostname for i in ring)}")
        sys.exit(1)
    return has_ring


def check_ssh_connections(hosts, ignore_unreachable=False):
    """Probe every host for passwordless ssh and passwordless sudo.

    The hosts are probed concurrently, one thread per host. Unless
    ``ignore_unreachable`` is set, the process exits with status 1 when any
    host cannot be reached over ssh.

    Returns:
        list[SSHInfo]: One entry per host, in the order of ``hosts``.
    """
    results = [None] * len(hosts)

    def _check(hostname, i):
        # Publish the result object first so the slot is filled even when we
        # return early below.
        info = SSHInfo(False, False)
        results[i] = info

        # Check for ssh
        result = run(
            [
                "ssh",
                "-o",
                "BatchMode=yes",
                "-o",
                "ConnectTimeout=5",
                hostname,
                "echo",
                "success",
            ],
            stdout=DEVNULL,
            stderr=DEVNULL,
        )
        info.can_ssh = result.returncode == 0
        if not info.can_ssh:
            return

        # Check for sudo
        result = run(
            [
                "ssh",
                "-o",
                "BatchMode=yes",
                "-o",
                "ConnectTimeout=5",
                hostname,
                "sudo",
                "ls",
            ],
            stdout=DEVNULL,
            stderr=DEVNULL,
        )
        info.has_sudo = result.returncode == 0

    threads = [
        threading.Thread(target=_check, args=(h.ssh_hostname, i))
        for i, h in enumerate(hosts)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # SSHInfo is truthy only when ssh worked, so all() detects unreachable hosts.
    if not all(results) and not ignore_unreachable:
        log_error("Could not ssh to the following hosts:")
        for i, h in enumerate(hosts):
            if not results[i]:
                log_error("  - ", h.ssh_hostname)
        log_error()
        log_error("Maybe they are not set-up for password-less ssh?")
        sys.exit(1)

    return results


def prepare_ethernet_hostfile(args, hosts):
    """Look up each host's IP and save a hostfile with no backend and no RDMA."""
    log(args.verbose, "Preparing an ethernet hostfile")
    add_ips(hosts, args.verbose)

    ethernet_hosts = []
    for rank, host in enumerate(hosts):
        ethernet_hosts.append(Host(rank, host.ssh_hostname, host.ips, []))

    save_hostfile(args, Hostfile(ethernet_hosts, "", args.env))


def configure_ring(args, hosts, ips, ring, sshinfo):
    """Configure the hosts for the "ring" backend and save the hostfile.

    ``ring`` is a ``(nodes, count)`` pair. Each host in the ring is listed with
    the ``count`` addresses of its links towards the previous node of the ring.
    """
    log(args.verbose, "Prepare a ring hostfile")
    nodes, count = ring

    ring_hosts = []
    for position, node in enumerate(nodes):
        # Index -1 wraps around, so the first node's predecessor is the last.
        previous = nodes[position - 1]
        addresses = [ips.ips[node, previous][c][1] for c in range(count)]
        ring_hosts.append(Host(position, hosts[node].ssh_hostname, addresses, []))
    hostfile = Hostfile(ring_hosts, "ring", args.env)

    has_sudo = can_auto_setup(hosts, sshinfo, args.auto_setup)
    ips.setup(verbose=args.verbose, auto_setup=args.auto_setup and has_sudo)

    save_hostfile(args, hostfile)


def configure_jaccl(args, hosts, ips, sshinfo):
    """Configure the hosts for the "jaccl" backend and save the hostfile.

    Every host lists, per peer, the RDMA device of its first link to that peer
    (``None`` for itself).
    """
    log(args.verbose, "Prepare a jaccl hostfile")
    add_ips(hosts, args.verbose)

    num_hosts = len(hosts)
    jaccl_hosts = []
    for i, host in enumerate(hosts):
        rdma = [
            None if i == j else f"rdma_{ips.ips[i, j][0][0]}"
            for j in range(num_hosts)
        ]
        jaccl_hosts.append(Host(i, host.ssh_hostname, host.ips, rdma))
    hostfile = Hostfile(jaccl_hosts, "jaccl", args.env)

    has_sudo = can_auto_setup(hosts, sshinfo, args.auto_setup)
    ips.setup(verbose=args.verbose, auto_setup=args.auto_setup and has_sudo)

    save_hostfile(args, hostfile)


def configure_jaccl_ring(args, hosts, ips, ring, sshinfo):
    """Configure the hosts for the "jaccl-ring" backend and save the hostfile.

    ``ring`` is a ``(nodes, count)`` pair. Every host lists RDMA devices only
    for its left and right neighbours in the ring: a single device name when
    ``count`` is 1, otherwise a list of ``count`` names. All other peers are
    ``None``.
    """
    log(args.verbose, "Prepare a jaccl-ring hostfile")
    add_ips(hosts, args.verbose)

    num_nodes = len(hosts)
    nodes, count = ring

    jaccl_hosts = []
    for position, node in enumerate(nodes):
        host = hosts[node]
        neighbours = (nodes[position - 1], nodes[(position + 1) % num_nodes])
        rdmas = []
        for other in nodes:
            if other not in neighbours:
                rdmas.append(None)
                continue
            devices = [f"rdma_{ips.ips[node, other][c][0]}" for c in range(count)]
            rdmas.append(devices[0] if count == 1 else devices)
        jaccl_hosts.append(Host(position, host.ssh_hostname, host.ips, rdmas))
    hostfile = Hostfile(jaccl_hosts, "jaccl-ring", args.env)

    has_sudo = can_auto_setup(hosts, sshinfo, args.auto_setup)
    ips.setup(verbose=args.verbose, auto_setup=args.auto_setup and has_sudo)

    save_hostfile(args, hostfile)


def prepare_tb_hostfile(args, hosts, sshinfo):
    """Discover the thunderbolt topology and configure a matching backend.

    With ``args.dot`` the topology is only printed in DOT format. Otherwise
    the backend named by ``args.backend`` is validated and configured. When no
    backend was requested, one is picked from what the topology supports:
    jaccl (mesh + RDMA), then jaccl-ring (ring + RDMA), then ring.
    """
    log(args.verbose, f"Preparing for communication over thunderbolt")
    tb_hosts, uuid_reverse_index = extract_connectivity(hosts, args.verbose)

    if args.dot:
        tb_connectivity_to_dot(hosts, tb_hosts, uuid_reverse_index)
        return

    ips = IPConfigurator(hosts, tb_hosts, uuid_reverse_index)
    connectivity = make_connectivity_matrix(tb_hosts, uuid_reverse_index)

    if args.backend is None:
        # Probe non-strictly so that each capability only yields a boolean.
        rings = extract_rings(connectivity)
        has_mesh = check_valid_mesh(hosts, connectivity, False)
        has_ring = check_valid_ring(hosts, rings, False)
        has_rdma = check_rdma(hosts, args.verbose, False)

        if not has_ring and not has_mesh:
            log_error("Neither thunderbolt mesh nor ring found.")
            log_error("Perhaps run with --dot to generate a plot of the connectivity.")
            sys.exit(1)

        elif has_rdma and has_mesh:
            configure_jaccl(args, hosts, ips, sshinfo)

        elif has_rdma and has_ring:
            configure_jaccl_ring(args, hosts, ips, rings[0], sshinfo)

        elif has_ring:
            configure_ring(args, hosts, ips, rings[0], sshinfo)

        else:
            # Remaining case: a mesh without RDMA and without a full ring.
            log_error("RDMA is not available and ring is not found.")
            log_error("Perhaps run with --dot to generate a plot of the connectivity.")
            sys.exit(1)

    # An explicit backend is validated strictly: the checks exit on failure.
    elif args.backend == "ring":
        rings = extract_rings(connectivity)
        check_valid_ring(hosts, rings)
        configure_ring(args, hosts, ips, rings[0], sshinfo)

    elif args.backend == "jaccl":
        check_valid_mesh(hosts, connectivity)
        check_rdma(hosts, args.verbose)
        configure_jaccl(args, hosts, ips, sshinfo)

    elif args.backend == "jaccl-ring":
        rings = extract_rings(connectivity)
        check_valid_ring(hosts, rings)
        check_rdma(hosts, args.verbose)
        configure_jaccl_ring(args, hosts, ips, rings[0], sshinfo)


def main():
    """Command line entry point for configuring hosts for MLX distributed.

    Parses the command line, resolves the list of hosts (from a hostfile or
    a comma separated list), checks ssh access and then prepares either an
    ethernet or a thunderbolt configuration depending on ``--over``.
    """
    parser = argparse.ArgumentParser(
        description="Configure remote machines for use with MLX distributed"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print debug messages in stdout",
    )
    parser.add_argument(
        "--hosts",
        default="127.0.0.1",
        help="A comma separated list of hosts",
    )
    parser.add_argument(
        "--ignore-unreachable",
        action="store_true",
        help="Ignore hosts that are not reachable via ssh",
    )
    parser.add_argument("--hostfile", help="The file containing the hosts")
    # NOTE(review): the option is required, so the default below is never
    # actually applied by argparse.
    parser.add_argument(
        "--over",
        choices=["thunderbolt", "ethernet"],
        default="thunderbolt",
        help="What type of connectivity to configure",
        required=True,
    )
    parser.add_argument(
        "--output-hostfile",
        help="If provided, save the hostfile to this path",
    )
    parser.add_argument(
        "--auto-setup",
        "--no-auto-setup",
        action=OptionalBoolAction,
        nargs=0,
        dest="auto_setup",
        default=None,
    )
    parser.add_argument(
        "--dot",
        action="store_true",
        help="Output the topology in DOT format and exit",
    )
    parser.add_argument(
        "--backend",
        choices=["ring", "jaccl", "jaccl-ring"],
        default=None,
        help="Which distributed backend to configure",
    )
    parser.add_argument(
        "--env",
        action="append",
        default=[],
        help="Set environment variables for the jobs",
    )
    args = parser.parse_args()

    # A hostfile takes precedence over the --hosts list
    hostfile = (
        Hostfile.from_file(args.hostfile)
        if args.hostfile is not None
        else Hostfile.from_list(args.hosts)
    )
    hosts = hostfile.hosts

    # Check that we can ssh and keep only the hosts (and their ssh info) for
    # which the check returned something truthy
    hostnames = ", ".join(h.ssh_hostname for h in hosts)
    log(args.verbose, f"Checking for ssh access for {hostnames}")
    sshinfo = check_ssh_connections(hosts, args.ignore_unreachable)
    hosts = [host for info, host in zip(sshinfo, hosts) if info]
    sshinfo = [info for info in sshinfo if info]

    if args.over != "ethernet":
        # Configure the macs for communication over thunderbolt, both via
        # RDMA and IP
        prepare_tb_hostfile(args, hosts, sshinfo)
    else:
        # Prepare a hostfile for communication over ethernet using the ips
        # of the provided hostnames
        prepare_ethernet_hostfile(args, hosts)


================================================
FILE: python/mlx/_distributed_utils/launch.py
================================================
# Copyright © 2025 Apple Inc.

import argparse
import base64
import json
import os
import shlex
import shutil
import sys
import tempfile
import threading
from collections import Counter
from itertools import chain
from pathlib import Path
from queue import Empty as QueueEmpty
from queue import Queue
from select import select
from subprocess import PIPE, Popen, run

import mlx.core as mx

from .common import Hostfile, log, log_warning, positive_number


class CommandProcess:
    """Interface for a launched command.

    Every member raises ``NotImplementedError``; subclasses are expected to
    override all of them.
    """

    @property
    def process(self):
        """The Popen object that refers to the current command."""
        raise NotImplementedError

    @property
    def exit_status(self):
        """A tuple ``(returncode, killed)`` for the command.

        It should be ``(None, None)`` while the command is running normally.
        """
        raise NotImplementedError

    def preprocess_output(self, data: str, is_stdout=False):
        """Preprocess a chunk of the command's output.

        This allows extra data to be captured or the format to be changed on
        the fly before the output is forwarded.
        """
        raise NotImplementedError

    def terminate(self):
        """Terminate the command or return the exit code."""
        raise NotImplementedError


class RemoteProcess(CommandProcess):
    """Run a command through bash, locally or on another host via ssh.

    The command is wrapped in a launch script that first prints the path of
    a temporary file holding the PID of the launched shell. That first line
    of output is consumed by :meth:`preprocess_output` and is later used by
    :meth:`terminate` to kill the program on the host where it runs.

    Args:
        rank: The rank exported to the command as ``MLX_RANK``.
        host: Where to run. ``"127.0.0.1"`` runs locally without ssh.
        python: Unused by this class.
        cwd: Working directory for the command or ``None``.
        files: Mapping of environment variable name to file content. Each
            content is written to a temporary file whose path is exported
            under the given name.
        env: List of ``KEY=VALUE`` strings to export.
        command: The command to execute as a list of arguments.
    """

    def __init__(self, rank, host, python, cwd, files, env, command):
        is_local = host == "127.0.0.1"
        cmd = RemoteProcess.make_launch_script(rank, cwd, files, env, command, is_local)
        if not is_local:
            cmd = f"ssh -tt -o LogLevel=QUIET {host} {shlex.quote(cmd)}"

        self._host = host
        self._pidfile = None
        self._is_local = is_local
        self._process = Popen(
            cmd,
            shell=True,
            executable="/bin/bash",
            stdin=PIPE,
            stdout=PIPE,
            stderr=PIPE,
        )

        self._killed = False

    @property
    def process(self):
        """The Popen object of the local bash (or ssh) process."""
        return self._process

    @property
    def exit_status(self):
        """A tuple ``(returncode, killed)``; returncode is None while running."""
        return self._process.poll(), self._killed

    def preprocess_output(self, data, is_stdout=False):
        """Strip the pidfile line from the first chunk of output.

        The first line of the first chunk is stored as the pidfile path and
        the remainder of the chunk is returned. Later chunks are returned
        unchanged.
        """
        if self._pidfile is None:
            pidfile, *rest = data.split("\n", maxsplit=1)
            # Strip surrounding whitespace so that a trailing carriage return
            # (e.g. from the tty that `ssh -tt` allocates) does not end up in
            # the path used by the kill script.
            self._pidfile = pidfile.strip()
            return rest[0] if rest else ""

        return data

    def terminate(self):
        """Terminate the local process and try to kill the launched program.

        Killing the launched program is best effort: it is skipped when no
        pidfile was received and a failure of the kill script is not raised.
        """
        if self._killed:
            return

        self._process.terminate()
        self._process.wait()

        # Without a pidfile there is nothing we can kill
        if not self._pidfile:
            return

        # Kill the remote program if possible
        cmd = RemoteProcess.make_kill_script(self._pidfile)
        if not self._is_local:
            cmd = f"ssh {self._host} {shlex.quote(cmd)}"
        c = run(
            cmd,
            check=False,
            shell=True,
            executable="/bin/bash",
            capture_output=True,
            text=True,
        )

        self._killed = c.stdout.strip() == "1"

    @staticmethod
    def make_launch_script(rank, cwd, files, env, command, is_local):
        """Build the bash script that sets up the environment and runs command.

        Returns:
            str: A one line bash script that prints the pidfile path, changes
            directory, exports the environment and finally ``exec``s the
            command.
        """
        script = ""

        # Disable echo
        if not is_local:
            script = "stty -echo; "

        # Write the PID to a file so we can kill the process if needed
        script += "pidfile=$(mktemp); "
        script += "echo $$ > $pidfile; "
        script += 'printf "%s\\n" $pidfile; '

        # Change the working directory if one was requested. Otherwise attempt to
        # change to the current one but don't fail if it wasn't possible.
        d = shlex.quote(cwd or os.getcwd())
        script += f"if [[ -d {d} ]]; then "
        script += f"  cd {d}; "
        if cwd is not None:
            script += "else "
            # Report on stderr (">&2"), not into a file named "2"
            script += f" echo 'Failed to change directory to' {d} >&2; "
        script += "fi; "

        # Add the environment variables that were requested
        for e in env:
            key, *value = e.split("=", maxsplit=1)
            if not key or not all(c.isalnum() or c == "_" for c in key):
                log_warning(
                    f"'{e}' is an invalid environment variable so it is ignored"
                )
                continue
            value = shlex.quote(value[0]) if len(value) > 0 else ""
            script += f"export {key}={value}; "

        # Make the temporary files
        for env_name, content in files.items():
            script += "fname=$(mktemp); "
            script += f"echo {shlex.quote(content)} >$fname; "
            script += f"export {env_name}=$fname; "

        # Finally add the rank
        script += f"export MLX_RANK={rank}; "

        # Replace the process with the script
        script += f"cmd=({' '.join(map(shlex.quote, command))}); "
        script += 'exec "${cmd[@]}"'

        return script

    @staticmethod
    def make_kill_script(pidfile):
        """Build the bash script that kills the PID stored in ``pidfile``.

        The script prints ``1`` if the process was found (and sent a kill)
        and ``0`` otherwise, then removes the pidfile.
        """
        # The path was parsed from the program output so quote it before
        # splicing it into a shell command
        quoted = shlex.quote(pidfile)

        script = ""
        script += f"pid=$(cat {quoted}); "
        script += "if ps -p $pid >/dev/null; then "
        script += "    kill $pid; "
        script += "    echo 1; "
        script += "else "
        script += "    echo 0; "
        script += "fi; "
        script += f"rm {quoted}"

        return script


def _launch_with_io(command_class, arguments, verbose):
    """Launch one command per rank and multiplex their standard streams.

    One thread per entry of ``arguments`` constructs ``command_class`` and
    shuttles data between the command's pipes and per-rank queues. The
    calling thread broadcasts its stdin to every command and writes the
    gathered output to its own stdout/stderr. When a command exits with a
    non-zero code the remaining ones are terminated.

    Args:
        command_class: A callable returning a ``CommandProcess``-like object.
        arguments: A list of ``(args, kwargs)`` tuples, one per command. They
            are forwarded to the worker thread, whose first positional
            argument is used as the rank.
        verbose: Forwarded to ``log`` for the completion messages.
    """
    stop = False
    exit_codes = [(None, None)] * len(arguments)

    def _thread_fn(rank, *args, **kwargs):
        stdin_queue = kwargs.pop("stdin_queue")
        stdout_queue = kwargs.pop("stdout_queue")
        stderr_queue = kwargs.pop("stderr_queue")

        command = command_class(rank, *args, **kwargs)
        p = command.process
        os.set_blocking(p.stdout.fileno(), False)
        os.set_blocking(p.stderr.fileno(), False)
        os.set_blocking(p.stdin.fileno(), False)

        stdout_fd = p.stdout.fileno()
        to_read = [stdout_fd, p.stderr.fileno()]
        to_write = [p.stdin.fileno()]

        def _forward(fd, chunk):
            # Decode, let the command post-process and queue for the main
            # thread
            is_stdout = fd == stdout_fd
            msg = chunk.decode(errors="ignore")
            msg = command.preprocess_output(msg, is_stdout)
            if is_stdout:
                stdout_queue.put(msg.encode())
            else:
                stderr_queue.put(msg.encode())

        stdin_buffer = b""
        while p.poll() is None:
            try:
                stdin_buffer += stdin_queue.get_nowait()
            except QueueEmpty:
                pass
            rlist, wlist, _ = select(to_read, to_write, [], 1.0)
            for fd in rlist:
                _forward(fd, os.read(fd, 8192))
            for fd in wlist:
                if len(stdin_buffer) > 0:
                    n = os.write(fd, stdin_buffer)
                    stdin_buffer = stdin_buffer[n:]
            if stop:
                command.terminate()
                break

        # The process may have written output right before exiting, i.e.
        # after the last read above. Drain what is left in the pipes so that
        # the tail of the output is not lost. The descriptors are
        # non-blocking so a read either returns data, returns b"" at end of
        # file or raises when nothing is available.
        for fd in to_read:
            while True:
                try:
                    chunk = os.read(fd, 8192)
                except OSError:
                    break
                if not chunk:
                    break
                _forward(fd, chunk)

        exit_codes[rank] = command.exit_status

        if exit_codes[rank][1]:
            log_warning(f"Node with rank {rank} was killed")
        elif exit_codes[rank][0] != 0:
            log_warning(f"Node with rank {rank} exited with code {exit_codes[rank][0]}")
        else:
            log(verbose, f"Node with rank {rank} completed")

    stdin_queues = []
    stdout_queues = []
    stderr_queues = []
    threads = []
    for i, (args, kwargs) in enumerate(arguments):
        stdin_queues.append(Queue())
        stdout_queues.append(Queue())
        stderr_queues.append(Queue())
        t = threading.Thread(
            target=_thread_fn,
            args=args,
            kwargs=kwargs
            | {
                "stdin_queue": stdin_queues[-1],
                "stdout_queue": stdout_queues[-1],
                "stderr_queue": stderr_queues[-1],
            },
        )
        t.start()
        threads.append(t)

    os.set_blocking(sys.stdin.fileno(), False)
    os.set_blocking(sys.stdout.fileno(), True)
    os.set_blocking(sys.stderr.fileno(), True)
    while not stop or any(not q.empty() for q in chain(stdout_queues, stderr_queues)):
        # Broadcast user input to the jobs
        rlist, _, _ = select([sys.stdin.fileno()], [], [], 0.1)
        for fd in rlist:
            stdin_buffer = os.read(fd, 8192)
            for q in stdin_queues:
                q.put(stdin_buffer)

        # Gather job output
        for q in stdout_queues:
            try:
                while not q.empty():
                    sys.stdout.buffer.write(q.get_nowait())
            except QueueEmpty:
                pass
        for q in stderr_queues:
            try:
                while not q.empty():
                    sys.stderr.buffer.write(q.get_nowait())
            except QueueEmpty:
                pass
        sys.stdout.buffer.flush()
        sys.stderr.buffer.flush()

        # Check if all are running and terminate otherwise
        if any(t.is_alive() for t in threads):
            for i, t in enumerate(threads):
                if not t.is_alive():
                    # A finished thread with a non-zero exit code asks the
                    # remaining threads to terminate their commands
                    if exit_codes[i][0] != 0:
                        stop = True
                        break
        else:
            break

    # Wait for the jobs to finish
    for t in threads:
        t.join()

    # Process any remaining outputs
    for q in stdout_queues:
        while not q.empty():
            sys.stdout.buffer.write(q.get())
    for q in stderr_queues:
        while not q.empty():
            sys.stderr.buffer.write(q.get())
    sys.stdout.buffer.flush()
    sys.stderr.buffer.flush()


def launch_ring(parser, hosts, args, command):
    """Launch ``command`` on every host using the ring backend.

    Every IP of every host is assigned ``args.connections_per_ip``
    consecutive ports starting from ``args.starting_port``. The resulting
    list of ``ip:port`` addresses per host is JSON encoded and handed to the
    jobs as the ``MLX_HOSTFILE`` file (empty content for a single host).
    """
    if not all(len(host.ips) > 0 for host in hosts):
        parser.error(
            "The ring backend requires IPs to be provided instead of hostnames"
        )

    # Ports are handed out sequentially across all hosts and IPs
    next_port = args.starting_port
    ring_hosts = []
    for host in hosts:
        addresses = []
        for ip in host.ips:
            new = [f"{ip}:{next_port + k}" for k in range(args.connections_per_ip)]
            addresses.extend(new)
            next_port += len(new)
        ring_hosts.append(addresses)

    hostfile = ""
    if len(ring_hosts) > 1:
        hostfile = json.dumps(ring_hosts)
    files = {"MLX_HOSTFILE": hostfile}

    env = args.env
    if args.verbose:
        env.append("MLX_RING_VERBOSE=1")
    cwd = args.cwd

    log(args.verbose, "Running", shlex.join(command))

    jobs = []
    for rank, host in enumerate(hosts):
        positional = (rank, host.ssh_hostname, args.python, cwd, files, env, command)
        jobs.append((positional, {}))
    _launch_with_io(RemoteProcess, jobs, args.verbose)


def launch_nccl(parser, hosts, args, command):
    """Launch ``command`` on every host using the NCCL backend.

    The first IP of rank 0 together with ``args.nccl_port`` is exported to
    all jobs as the rendezvous address. Each job additionally gets
    ``CUDA_VISIBLE_DEVICES`` set to ``rank % args.repeat_hosts``.

    Raises:
        ValueError: If the first host has no IP.
    """
    rank0 = hosts[0]
    if not rank0.ips:
        raise ValueError("Rank 0 should have an IP reachable from all other ranks")

    master_host = rank0.ips[0]
    master_port = args.nccl_port
    world_size = len(hosts)

    # Environment shared by all ranks
    env = args.env
    cwd = args.cwd
    if args.verbose:
        env.append("NCCL_DEBUG=INFO")
    env.extend(
        [
            f"NCCL_HOST_IP={master_host}",
            f"NCCL_PORT={master_port}",
            f"MLX_WORLD_SIZE={world_size}",
        ]
    )

    log(args.verbose, "Running", shlex.join(command))

    jobs = []
    for rank, host in enumerate(hosts):
        # The device selection differs per rank so every job gets its own
        # copy of the environment
        rank_env = env + [f"CUDA_VISIBLE_DEVICES={rank % args.repeat_hosts}"]
        positional = (rank, host.ssh_hostname, args.python, cwd, {}, rank_env, command)
        jobs.append((positional, {}))

    _launch_with_io(RemoteProcess, jobs, args.verbose)


def launch_jaccl(parser, hosts, args, command):
    """Launch ``command`` on every host using the jaccl backend.

    The first IP of rank 0 and ``args.starting_port`` are exported as the
    coordinator address in ``MLX_JACCL_COORDINATOR``. ``MLX_JACCL_RING=1`` is
    exported as well when the backend is ``jaccl-ring``. The RDMA devices of
    all hosts are JSON encoded and handed to the jobs as the
    ``MLX_IBV_DEVICES`` file.

    Raises:
        ValueError: If the first host has no IP or if the RDMA information
            is malformed, i.e. a host does not list exactly one entry per
            host or the entry of a host for itself is not ``None``.
    """
    if not hosts[0].ips:
        raise ValueError("Rank 0 should have an IP reachable from all other ranks")

    jaccl_ring = args.backend == "jaccl-ring"
    have_rdmas = all(len(h.rdma) == len(hosts) for h in hosts)
    # Only index into the rdma lists after their length was verified,
    # otherwise a short list raises IndexError instead of the error below
    have_nulls = have_rdmas and all(h.rdma[i] is None for i, h in enumerate(hosts))
    if not (have_rdmas and have_nulls):
        raise ValueError("Malformed hostfile for jaccl backend")

    coordinator = hosts[0].ips[0]
    env = args.env
    cwd = args.cwd
    env.append(f"MLX_JACCL_COORDINATOR={coordinator}:{args.starting_port}")
    if jaccl_ring:
        env.append("MLX_JACCL_RING=1")
    files = {"MLX_IBV_DEVICES": json.dumps([h.rdma for h in hosts])}

    log(args.verbose, "Running", shlex.join(command))

    _launch_with_io(
        RemoteProcess,
        [
            ((rank, h.ssh_hostname, args.python, cwd, files, env, command), {})
            for rank, h in enumerate(hosts)
        ],
        args.verbose,
    )


def get_mpi_libname():
    """Find the name of the MPI library that ``ompi_info`` links against.

    The ``ompi_info`` executable is located with ``which`` and its linked
    libraries are listed with ``otool -L`` on macOS or ``ldd`` otherwise.

    Returns:
        Optional[str]: The first token of the first line mentioning
        ``libmpi`` with a leading ``@rpath/`` removed, or ``None`` if it
        could not be determined.
    """
    # Imported here because the module does not import platform at the top
    import platform

    try:
        ompi_info = run(["which", "ompi_info"], check=True, capture_output=True)
        ompi_info = ompi_info.stdout.strip().decode()

        if platform.system() == "Darwin":
            otool_output = run(
                ["otool", "-L", ompi_info], check=True, capture_output=True
            )
        else:
            otool_output = run(["ldd", ompi_info], check=True, capture_output=True)
        otool_output = otool_output.stdout.decode()

        # StopIteration if not found
        libmpi_line = next(
            filter(lambda line: "libmpi" in line, otool_output.splitlines())
        )
        return libmpi_line.strip().split()[0].removeprefix("@rpath/")
    except Exception:
        # Best effort: any failure to locate or inspect ompi_info means the
        # library name is unknown. KeyboardInterrupt and SystemExit are
        # deliberately not swallowed.
        return None


def launch_mpi(parser, hosts, args, command):
    """Launch ``command`` through ``mpirun``.

    A temporary MPI hostfile is written with one line per distinct ssh
    hostname and as many slots as the number of times it appears in
    ``hosts``. Environment variables in ``args.env`` are forwarded with
    ``-x`` and every entry of ``args.mpi_arg`` is shell-split and passed
    through to ``mpirun``.
    """
    which = run(["which", "mpirun"], check=True, capture_output=True)
    mpirun = which.stdout.strip().decode()

    # Compatibility with homebrew and pip installs
    mpi_libname = get_mpi_libname()
    if mpi_libname is not None:
        dyld = Path(mpirun).parent.parent / "lib"
        library_env = [
            f"DYLD_LIBRARY_PATH={str(dyld)}",
            f"MLX_MPI_LIBNAME={mpi_libname}",
        ]
        args.env = library_env + args.env

    log(args.verbose, f"Using '{mpirun}'")
    with tempfile.NamedTemporaryFile(mode="w") as f:
        # One line per host with the number of processes to run on it
        slots = Counter(h.ssh_hostname for h in hosts)
        for hostname, count in slots.items():
            f.write(f"{hostname} slots={count}\n")
        f.flush()

        cmd = [mpirun]
        # do not line buffer output
        cmd += ["--output", ":raw"]
        cmd += ["--hostfile", f.name]
        if args.cwd:
            cmd += ["-cwd", args.cwd]
        cmd.extend(chain.from_iterable(("-x", e) for e in args.env))
        cmd.extend(chain.from_iterable(shlex.split(arg) for arg in args.mpi_arg))
        cmd.append("--")
        cmd.extend(command)

        log(args.verbose, "Running", " ".join(cmd))
        try:
            run(cmd)
        except KeyboardInterrupt:
            pass


def main():
    """Command line entry point for launching an MLX distributed program.

    Everything that is not a known option is treated as the command to run.
    The hosts come from ``--hostfile`` or ``--hosts`` and the backend from
    ``--backend``, then from the hostfile, and otherwise ``nccl`` when CUDA
    is available and ``ring`` when it is not. If the first word of the
    command is an existing file it is run with the selected python,
    otherwise it is looked up on the ``PATH``.

    Raises:
        ValueError: If the command is neither a file nor an executable on
            the ``PATH`` and script verification is enabled.
    """
    parser = argparse.ArgumentParser(description="Launch an MLX distributed program")
    parser.add_argument(
        "--print-python",
        action="store_true",
        help="Print the path to the current python executable and exit",
    )
    parser.add_argument(
        "--verbose", action="store_true", help="Print debug messages in stdout"
    )
    parser.add_argument(
        "--hosts", default="127.0.0.1", help="A comma separated list of hosts"
    )
    parser.add_argument(
        "--repeat-hosts",
        "-n",
        type=positive_number,
        default=1,
        help="Repeat each host a given number of times",
    )
    parser.add_argument("--hostfile", help="The file containing the hosts")
    parser.add_argument(
        "--backend",
        help="Which distributed backend to launch",
    )
    parser.add_argument(
        "--env",
        action="append",
        default=[],
        help="Set environment variables for the jobs",
    )
    parser.add_argument(
        "--mpi-arg",
        action="append",
        default=[],
        help="Arguments to pass directly to mpirun",
    )
    parser.add_argument(
        "--connections-per-ip",
        default=1,
        type=int,
        help="How many connections per ip to use for the ring backend",
    )
    parser.add_argument(
        "--starting-port",
        "-p",
        type=int,
        default=32323,
        help="For the ring backend listen on this port increasing by 1 per rank and IP",
    )
    parser.add_argument(
        "--cwd", help="Set the working directory on each node to the provided one"
    )
    parser.add_argument(
        "--nccl-port",
        type=int,
        default=12345,
        help="The port to use for the NCCL communication (only for nccl backend)",
    )
    parser.add_argument(
        "--no-verify-script",
        action="store_false",
        dest="verify_script",
        help="Do not verify that the script exists",
    )
    parser.add_argument(
        "--python", default=sys.executable, help="Use this python on the remote hosts"
    )

    args, rest = parser.parse_known_args()

    if args.print_python:
        print(args.python)
        return

    # Drop a leading "--" separator first so that a command line consisting
    # only of the separator is reported as a missing script instead of
    # failing later with an IndexError.
    if rest and rest[0] == "--":
        rest.pop(0)
    if len(rest) == 0:
        parser.error("No script is provided")

    # Try to extract a list of hosts and corresponding ips
    if args.hostfile is not None:
        hostfile = Hostfile.from_file(args.hostfile)
    else:
        hostfile = Hostfile.from_list(args.hosts, args.repeat_hosts)

    # Extract extra arguments from the hostfile
    if hostfile.backend != "" and args.backend is None:
        args.backend = hostfile.backend
    if args.backend is None:
        args.backend = "nccl" if mx.cuda.is_available() else "ring"
    # Hostfile variables come first, the command line ones follow
    args.env = hostfile.envs + args.env

    # Check if the script is a file and convert it to a full path
    if (script := Path(rest[0])).exists() and script.is_file():
        rest[0:1] = [args.python, str(script.resolve())]
    elif (command := shutil.which(rest[0])) is not None:
        rest[0] = command
    elif args.verify_script:
        raise ValueError(f"Invalid script or command {rest[0]}")

    # Launch
    if args.backend == "ring":
        launch_ring(parser, hostfile.hosts, args, rest)
    elif args.backend == "mpi":
        launch_mpi(parser, hostfile.hosts, args, rest)
    elif args.backend == "nccl":
        launch_nccl(parser, hostfile.hosts, args, rest)
    elif args.backend == "jaccl" or args.backend == "jaccl-ring":
        launch_jaccl(parser, hostfile.hosts, args, rest)
    else:
        parser.error(
            "The backend should be one of {'ring', 'mpi', 'nccl', 'jaccl', 'jaccl-ring'}"
        )


================================================
FILE: python/mlx/_reprlib_fix.py
================================================
# Copyright © 2023 Apple Inc.

import array
import reprlib

# Keep a handle on the stock implementation so it can still be used for
# real ``array.array`` objects.
_old_repr_array = reprlib.Repr.repr_array


def repr_array(self, x, maxlevel):
    """Replacement for ``reprlib.Repr.repr_array``.

    Objects that are not instances of ``array.array`` are formatted with
    ``repr_instance``; ``array.array`` instances are delegated to the
    original implementation.
    """
    if not isinstance(x, array.array):
        return self.repr_instance(x, maxlevel)
    return _old_repr_array(self, x, maxlevel)


# Install the replacement on the class so every Repr instance picks it up
reprlib.Repr.repr_array = repr_array


================================================
FILE: python/mlx/_stub_patterns.txt
================================================
mlx.core.__prefix__:
  from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, ParamSpec, TypeVar
  import sys
  if sys.version_info >= (3, 10):
    from typing import TypeAlias
  else:
    from typing_extensions import TypeAlias
  P = ParamSpec("P")
  R = TypeVar("R")

mlx.core.__suffix__:
  from typing import Union
  scalar: TypeAlias = Union[int, float, bool]
  list_or_scalar: TypeAlias = Union[scalar, list["list_or_scalar"]]
  bool_: Dtype = ...

mlx.core.distributed.__prefix__:
  from mlx.core import array, Dtype, Device, Stream, scalar
  from mlx.core.distributed import Group
  from typing import Sequence, Optional, Union

mlx.core.fast.__prefix__:
  from mlx.core import array, Dtype, Device, Stream, scalar
  from typing import Sequence, Optional, Union

mlx.core.linalg.__prefix__:
  from mlx.core import array, Dtype, Device, Stream, scalar
  from typing import Sequence, Optional, Tuple, Union

mlx.core.metal.__prefix__:
  from mlx.core import array, Dtype, Device, Stream, scalar
  from typing import Sequence, Optional, Union

mlx.core.random.__prefix__:
  from mlx.core import array, Dtype, Device, Stream, scalar, float32, int32
  from typing import Sequence, Optional, Union


================================================
FILE: python/mlx/extension.py
================================================
# Copyright © 2023 Apple Inc.

import os
import re
import subprocess
import sys
from pathlib import Path

from setuptools import Extension
from setuptools.command.build_ext import build_ext

import mlx

_MLX_PATH = str(mlx.__path__[0])


# A CMakeExtension needs a sourcedir instead of a file list.
class CMakeExtension(Extension):
    """An extension described by a CMake source directory.

    No source files are registered with setuptools; the resolved source
    directory is stored in ``sourcedir`` instead.
    """

    def __init__(self, name: str, sourcedir: str = "") -> None:
        super().__init__(name, sources=[])
        resolved = Path(sourcedir).resolve()
        self.sourcedir = os.fspath(resolved)


class CMakeBuild(build_ext):
    """A ``build_ext`` command that configures and builds with CMake."""

    def build_extension(self, ext: CMakeExtension) -> None:
        """Configure and build ``ext`` with CMake.

        The build type is ``Debug`` when the command's ``debug`` flag (or,
        if that is ``None``, the ``DEBUG`` environment variable) is truthy
        and ``Release`` otherwise. Extra configure arguments are read from
        the ``CMAKE_ARGS`` environment variable.
        """
        # Must be in this form due to bug in .resolve() only fixed in Python 3.10+
        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)  # type: ignore[no-untyped-call]
        extdir = ext_fullpath.parent.resolve()

        if self.debug is None:
            debug = int(os.environ.get("DEBUG", 0))
        else:
            debug = self.debug
        cfg = "Debug" if debug else "Release"

        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
            f"-DCMAKE_BUILD_TYPE={cfg}",
            "-DBUILD_SHARED_LIBS=ON",
        ]

        # Adding CMake arguments set as environment variable
        # (needed e.g. to build for ARM OSx on conda-forge)
        extra_args = os.environ.get("CMAKE_ARGS", "")
        cmake_args.extend(item for item in extra_args.split(" ") if item)

        if sys.platform.startswith("darwin"):
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args.append(f"-DCMAKE_OSX_ARCHITECTURES={';'.join(archs)}")

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators. Without it use all the available cores.
        build_args = []
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            build_args.append(f"-j{os.cpu_count()}")

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
            build_temp.mkdir(parents=True)

        # Make sure cmake can find MLX
        os.environ["MLX_DIR"] = _MLX_PATH

        configure_cmd = ["cmake", ext.sourcedir, *cmake_args]
        build_cmd = ["cmake", "--build", ".", *build_args]
        subprocess.run(configure_cmd, cwd=build_temp, check=True)
        subprocess.run(build_cmd, cwd=build_temp, check=True)

    def run(self) -> None:
        """Build the extensions and copy the results for inplace builds.

        For an inplace build the directory holding each built
        ``CMakeExtension`` is copied over its inplace equivalent.
        """
        super().run()

        # Based on https://github.com/pypa/setuptools/blob/main/setuptools/command/build_ext.py#L102
        if not self.inplace:
            return

        for ext in self.extensions:
            if not isinstance(ext, CMakeExtension):
                continue

            # Resolve inplace package dir
            build_py = self.get_finalized_command("build_py")
            inplace_file, regular_file = self._get_inplace_equivalent(build_py, ext)

            source_dir = str(Path(regular_file).parent.resolve())
            target_dir = str(Path(inplace_file).parent.resolve())

            self.copy_tree(source_dir, target_dir)


================================================
FILE: python/mlx/nn/__init__.py
================================================
# Copyright © 2023 Apple Inc.

from mlx.nn import init, losses
from mlx.nn.layers import *
from mlx.nn.utils import (
    average_gradients,
    fsdp_apply_gradients,
    value_and_grad,
)


================================================
FILE: python/mlx/nn/init.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
from typing import Callable, Literal

import mlx.core as mx


def constant(
    value: float, dtype: mx.Dtype = mx.float32
) -> Callable[[mx.array], mx.array]:
    r"""Build an initializer that fills an array with a fixed ``value``.

    Args:
        value (float): The value every element is set to.
        dtype (Dtype, optional): The data type of the returned array.
          Default: ``float32``.

    Returns:
        Callable[[array], array]: An initializer that takes an array and
        returns a new one of the same shape with every element equal to
        ``value``.

    Example:

        >>> init_fn = nn.init.constant(0.5)
        >>> init_fn(mx.zeros((2, 2)))
        array([[0.5, 0.5],
               [0.5, 0.5]], dtype=float32)
    """

    def initializer(a: mx.array) -> mx.array:
        # Only the shape of the input is used
        shape = a.shape
        return mx.full(shape, value, dtype=dtype)

    return initializer


def normal(
    mean: float = 0.0, std: float = 1.0, dtype: mx.Dtype = mx.float32
) -> Callable[[mx.array], mx.array]:
    r"""Build an initializer that samples from a normal distribution.

    Args:
        mean (float, optional): The mean of the distribution. Default:
          ``0.0``.
        std (float, optional): The standard deviation of the distribution.
          Default: ``1.0``.
        dtype (Dtype, optional): The data type of the returned array.
          Default: ``float32``.

    Returns:
        Callable[[array], array]: An initializer that takes an array and
        returns a new one of the same shape filled with samples from the
        normal distribution.

    Example:

        >>> init_fn = nn.init.normal()
        >>> init_fn(mx.zeros((2, 2)))
        array([[-0.982273, -0.534422],
               [0.380709, 0.0645099]], dtype=float32)
    """

    def initializer(a: mx.array) -> mx.array:
        # Only the shape of the input is used
        shape = a.shape
        return mx.random.normal(shape=shape, dtype=dtype, loc=mean, scale=std)

    return initializer


def uniform(
    low: float = 0.0, high: float = 1.0, dtype: mx.Dtype = mx.float32
) -> Callable[[mx.array], mx.array]:
    r"""Build an initializer that samples from a uniform distribution.

    Args:
        low (float, optional): The lower bound of the distribution.
          Default: ``0.0``.
        high (float, optional): The upper bound of the distribution.
          Default: ``1.0``.
        dtype (Dtype, optional): The data type of the returned array.
          Default: ``float32``.

    Returns:
        Callable[[array], array]: An initializer that takes an array and
        returns a new one of the same shape filled with samples from the
        uniform distribution.

    Example:

        >>> init_fn = nn.init.uniform(low=0, high=1)
        >>> init_fn(mx.zeros((2, 2)))
        array([[0.883935, 0.863726],
               [0.617261, 0.417497]], dtype=float32)
    """

    def initializer(a: mx.array) -> mx.array:
        # Only the shape of the input is used
        shape = a.shape
        return mx.random.uniform(low, high, shape, dtype=dtype)

    return initializer


def identity(dtype: mx.Dtype = mx.float32) -> Callable[[mx.array], mx.array]:
    r"""Build an initializer that returns an identity matrix.

    Args:
        dtype (Dtype, optional): The data type of the returned array.
          Default: ``float32``.

    Returns:
        Callable[[array], array]: An initializer that takes a square matrix
        and returns the identity matrix of the same shape. It raises a
        ``ValueError`` for any input that is not a square matrix.

    Example:

        >>> init_fn = nn.init.identity()
        >>> init_fn(mx.zeros((2, 2)))
        array([[1, 0],
               [0, 1]], dtype=float32)
    """

    def initializer(arr: mx.array) -> mx.array:
        is_square = arr.ndim == 2 and arr.shape[0] == arr.shape[1]
        if not is_square:
            raise ValueError(
                f"The input array must be a square matrix but got shape {arr.shape}."
            )
        size = arr.shape[0]
        return mx.eye(n=size, dtype=dtype)

    return initializer


def _calculate_fan_in_fan_out(x):
    if x.ndim < 2:
        raise ValueError(
            "Glorot / He initialization requires at least 2 dimensional input"
            f" but input with {x.ndim} dimensions."
        )

    fan_in = x.shape[-1]
    fan_out = x.shape[0]

    if x.ndim > 2:
        receptive_field = 1
        for d in x.shape[1:-1]:
            receptive_field *= d

        fan_in = fan_in * receptive_field
        fan_out = fan_out * receptive_field

    return fan_in, fan_out


def glorot_normal(
    dtype: mx.Dtype = mx.float32,
) -> Callable[[mx.array, float], mx.array]:
    r"""Build a Glorot normal initializer.

    The samples are drawn from a normal distribution whose standard
    deviation depends on the number of input (``fan_in``) and output
    (``fan_out``) units of the array being initialized:

    .. math::
        \sigma = \gamma \sqrt{\frac{2.0}{\text{fan\_in} + \text{fan\_out}}}

    where :math:`\gamma` is the ``gain`` passed to the initializer.

    For more details see the original reference: `Understanding the difficulty
    of training deep feedforward neural networks
    <https://proceedings.mlr.press/v9/glorot10a.html>`_

    Args:
        dtype (Dtype, optional): The data type of the returned array.
          Default: ``float32``.

    Returns:
        Callable[[array, float], array]: An initializer that takes an array
        and an optional ``gain`` and returns a new array of the same shape
        filled with samples from the Glorot normal distribution.

    Example:

        >>> init_fn = nn.init.glorot_normal()
        >>> init_fn(mx.zeros((2, 2)))
        array([[0.191107, 1.61278],
               [-0.150594, -0.363207]], dtype=float32)
        >>> init_fn(mx.zeros((2, 2)), gain=4.0)
        array([[1.89613, -4.53947],
               [4.48095, 0.995016]], dtype=float32)
    """

    def initializer(a: mx.array, gain: float = 1.0) -> mx.array:
        total_fan = sum(_calculate_fan_in_fan_out(a))
        std = gain * math.sqrt(2.0 / total_fan)
        return mx.random.normal(shape=a.shape, dtype=dtype, scale=std)

    return initializer


def glorot_uniform(
    dtype: mx.Dtype = mx.float32,
) -> Callable[[mx.array, float], mx.array]:
    r"""Build a Glorot uniform initializer.

    The returned initializer draws samples from a uniform distribution on
    :math:`[-\text{limit}, \text{limit}]` where the limit is derived from the
    number of input (``fan_in``) and output (``fan_out``) units of the array
    being initialized:

    .. math::
        \text{limit} = \gamma \sqrt{\frac{6.0}{\text{fan\_in} + \text{fan\_out}}}

    where :math:`\gamma` is the ``gain`` passed to the initializer.

    For more details see the original reference: `Understanding the difficulty
    of training deep feedforward neural networks
    <https://proceedings.mlr.press/v9/glorot10a.html>`_

    Args:
        dtype (Dtype, optional): The data type of the array. Default: ``float32``.

    Returns:
        Callable[[array, float], array]: An initializer that returns an array
        with the same shape as the input, filled with samples from the Glorot
        uniform distribution. The input must have at least two dimensions.

    Example:

        >>> init_fn = nn.init.glorot_uniform()
        >>> init_fn(mx.zeros((2, 2)))
        array([[0.223404, -0.890597],
               [-0.379159, -0.776856]], dtype=float32)
        >>> init_fn(mx.zeros((2, 2)), gain=4.0)
        array([[-1.90041, 3.02264],
               [-0.912766, 4.12451]], dtype=float32)
    """

    def initializer(a: mx.array, gain: float = 1.0) -> mx.array:
        n_in, n_out = _calculate_fan_in_fan_out(a)
        total_fan = n_in + n_out
        bound = gain * math.sqrt(6.0 / total_fan)
        # Only the shape of ``a`` is used; its values are ignored.
        return mx.random.uniform(-bound, bound, a.shape, dtype=dtype)

    return initializer


def he_normal(
    dtype: mx.Dtype = mx.float32,
) -> Callable[[mx.array, Literal["fan_in", "fan_out"], float], mx.array]:
    r"""Build a He normal initializer.

    The returned initializer draws samples from a normal distribution whose
    standard deviation is derived from either the number of input
    (``fan_in``) or output (``fan_out``) units:

    .. math::
        \sigma = \gamma \frac{1}{\sqrt{\text{fan}}}

    where :math:`\gamma` is the ``gain`` and :math:`\text{fan}` is the number
    of input units when ``mode`` is ``"fan_in"`` or the number of output units
    when ``mode`` is ``"fan_out"``.

    For more details see the original reference: `Delving Deep into Rectifiers:
    Surpassing Human-Level Performance on ImageNet Classification
    <https://arxiv.org/abs/1502.01852>`_

    Args:
        dtype (Dtype, optional): The data type of the array. Default: ``float32``.

    Returns:
        Callable[[array, str, float], array]: An initializer that returns an
        array with the same shape as the input, filled with samples from the He
        normal distribution. The initializer raises ``ValueError`` when
        ``mode`` is neither ``"fan_in"`` nor ``"fan_out"``.

    Example:

        >>> init_fn = nn.init.he_normal()
        >>> init_fn(mx.zeros((2, 2)))  # uses fan_in
        array([[-1.25211, 0.458835],
               [-0.177208, -0.0137595]], dtype=float32)
        >>> init_fn(mx.zeros((2, 2)), mode="fan_out", gain=5)
        array([[5.6967, 4.02765],
               [-4.15268, -2.75787]], dtype=float32)
    """

    def initializer(
        a: mx.array,
        mode: Literal["fan_in", "fan_out"] = "fan_in",
        gain: float = 1.0,
    ) -> mx.array:
        # The shape is validated first so that a bad shape is reported before
        # a bad mode.
        n_in, n_out = _calculate_fan_in_fan_out(a)

        if mode not in ("fan_in", "fan_out"):
            raise ValueError(f"Invalid mode: {mode}. Valid modes are: fan_in, fan_out")
        units = n_in if mode == "fan_in" else n_out

        sigma = gain / math.sqrt(units)
        return mx.random.normal(shape=a.shape, scale=sigma, dtype=dtype)

    return initializer


def he_uniform(
    dtype: mx.Dtype = mx.float32,
) -> Callable[[mx.array, Literal["fan_in", "fan_out"], float], mx.array]:
    r"""Build a He uniform (Kaiming uniform) initializer.

    The returned initializer draws samples from a uniform distribution on
    :math:`[-\text{limit}, \text{limit}]` where the limit is derived from
    either the number of input (``fan_in``) or output (``fan_out``) units:

    .. math::

        \text{limit} = \gamma \sqrt{\frac{3.0}{\text{fan}}}

    where :math:`\gamma` is the ``gain`` and :math:`\text{fan}` is the number
    of input units when ``mode`` is ``"fan_in"`` or the number of output units
    when ``mode`` is ``"fan_out"``.

    For more details see the original reference: `Delving Deep into Rectifiers:
    Surpassing Human-Level Performance on ImageNet Classification
    <https://arxiv.org/abs/1502.01852>`_

    Args:
        dtype (Dtype, optional): The data type of the array. Default: ``float32``.

    Returns:
        Callable[[array, str, float], array]: An initializer that returns an
        array with the same shape as the input, filled with samples from the
        He uniform distribution. The initializer raises ``ValueError`` when
        ``mode`` is neither ``"fan_in"`` nor ``"fan_out"``.

    Example:

        >>> init_fn = nn.init.he_uniform()
        >>> init_fn(mx.zeros((2, 2)))  # uses fan_in
        array([[0.0300242, -0.0184009],
               [0.793615, 0.666329]], dtype=float32)
        >>> init_fn(mx.zeros((2, 2)), mode="fan_out", gain=5)
        array([[-1.64331, -2.16506],
               [1.08619, 5.79854]], dtype=float32)
    """

    def initializer(
        a: mx.array,
        mode: Literal["fan_in", "fan_out"] = "fan_in",
        gain: float = 1.0,
    ) -> mx.array:
        # The shape is validated first so that a bad shape is reported before
        # a bad mode.
        n_in, n_out = _calculate_fan_in_fan_out(a)

        if mode not in ("fan_in", "fan_out"):
            raise ValueError(f"Invalid mode: {mode}. Valid modes are: fan_in, fan_out")
        units = n_in if mode == "fan_in" else n_out

        bound = gain * math.sqrt(3.0 / units)
        return mx.random.uniform(-bound, bound, a.shape, dtype=dtype)

    return initializer


def sparse(
    sparsity: float,
    mean: float = 0.0,
    std: float = 1.0,
    dtype: mx.Dtype = mx.float32,
) -> Callable[[mx.array], mx.array]:
    r"""Build an initializer that returns a sparse matrix.

    The returned initializer fills a 2D array with samples from a normal
    distribution and then sets ``ceil(sparsity * cols)`` randomly chosen
    entries of every row to zero, where ``cols`` is the size of the second
    axis.

    Args:
        sparsity (float): The fraction of the entries in each row to be set
          to zero.
        mean (float, optional): Mean of the normal distribution. Default:
          ``0.0``.
        std (float, optional): Standard deviation of the normal distribution.
          Default: ``1.0``.
        dtype (Dtype, optional): The data type of the array. Default:
          ``float32``.

    Returns:
        Callable[[array], array]: An initializer that returns an array with the
        same shape as the input, filled with samples from a normal distribution
        and partially zeroed. The initializer raises ``ValueError`` if the
        input is not 2D.

    Example:

        >>> init_fn = nn.init.sparse(sparsity=0.5)
        >>> init_fn(mx.zeros((2, 2)))  # values are random, one zero per row
        array([[-1.91187, 0],
               [0, -0.117483]], dtype=float32)
    """

    def initializer(a: mx.array) -> mx.array:
        if a.ndim != 2:
            raise ValueError("Only tensors with 2 dimensions are supported")

        n_rows, n_cols = a.shape
        zeros_per_row = int(math.ceil(sparsity * n_cols))

        # Sorting uniform noise along axis 1 yields, for every row, a random
        # ordering of the column indices.
        col_order = mx.argsort(mx.random.uniform(shape=a.shape), axis=1)
        values = mx.random.normal(shape=a.shape, scale=std, loc=mean, dtype=dtype)

        # Zero the first ``zeros_per_row`` columns of each row's ordering.
        row_index = mx.arange(n_rows).reshape(n_rows, 1)
        values[row_index, col_order[:, :zeros_per_row]] = 0

        return values

    return initializer


def orthogonal(
    gain: float = 1.0, dtype: mx.Dtype = mx.float32
) -> Callable[[mx.array], mx.array]:
    r"""An initializer that returns an orthogonal matrix.

    The returned initializer samples a square normal matrix of size
    ``max(rows, cols)``, takes the ``Q`` factor of its QR decomposition,
    flips column signs using the sign of the diagonal of ``R``, slices the
    result to ``(rows, cols)`` and scales it by ``gain``.

    Args:
        gain (float, optional): Scaling factor for the orthogonal matrix.
            Default: ``1.0``.
        dtype (Dtype, optional): Data type of the array. Default: ``float32``.

    Returns:
        Callable[[array], array]: An initializer that returns
        an orthogonal matrix with the same shape as the input. The
        initializer raises ``ValueError`` if the input is not 2D.
    """

    def initializer(a: mx.array) -> mx.array:
        if a.ndim != 2:
            # The ``f`` prefix must be on the literal that contains the
            # placeholder; adjacent literals are prefixed independently.
            raise ValueError(
                "Orthogonal initialization requires a 2D array but got"
                f" a {a.ndim}D array."
            )

        rows, cols = a.shape
        n = max(rows, cols)

        rmat = mx.random.normal(shape=(n, n))

        # Perform QR decomposition on CPU
        q, r = mx.linalg.qr(rmat, stream=mx.cpu)

        # Adjust the sign of Q using the diagonal of R
        d = mx.diag(r)
        q = q * mx.sign(d)

        # Slice Q to the desired shape
        q = q[:rows, :cols]

        # Scale Q by gain
        q = q * gain
        return q.astype(dtype)

    return initializer


================================================
FILE: python/mlx/nn/layers/__init__.py
================================================
# Copyright © 2023 Apple Inc.

from mlx.nn.layers.activations import (
    CELU,
    ELU,
    GELU,
    GLU,
    SELU,
    HardShrink,
    Hardswish,
    HardTanh,
    LeakyReLU,
    LogSigmoid,
    LogSoftmax,
    Mish,
    PReLU,
    ReLU,
    ReLU2,
    ReLU6,
    Sigmoid,
    SiLU,
    Softmax,
    Softmin,
    Softplus,
    Softshrink,
    Softsign,
    Step,
    Tanh,
    celu,
    elu,
    gelu,
    gelu_approx,
    gelu_fast_approx,
    glu,
    hard_shrink,
    hard_tanh,
    hardswish,
    leaky_relu,
    log_sigmoid,
    log_softmax,
    mish,
    prelu,
    relu,
    relu2,
    relu6,
    selu,
    sigmoid,
    silu,
    softmax,
    softmin,
    softplus,
    softshrink,
    softsign,
    step,
    tanh,
)
from mlx.nn.layers.base import Module
from mlx.nn.layers.containers import Sequential
from mlx.nn.layers.convolution import Conv1d, Conv2d, Conv3d
from mlx.nn.layers.convolution_transpose import (
    ConvTranspose1d,
    ConvTranspose2d,
    ConvTranspose3d,
)
from mlx.nn.layers.distributed import (
    AllToShardedLinear,
    QuantizedAllToShardedLinear,
    QuantizedShardedToAllLinear,
    ShardedToAllLinear,
)
from mlx.nn.layers.dropout import Dropout, Dropout2d, Dropout3d
from mlx.nn.layers.embedding import Embedding
from mlx.nn.layers.linear import Bilinear, Identity, Linear
from mlx.nn.layers.normalization import (
    BatchNorm,
    GroupNorm,
    InstanceNorm,
    LayerNorm,
    RMSNorm,
)
from mlx.nn.layers.pooling import (
    AvgPool1d,
    AvgPool2d,
    AvgPool3d,
    MaxPool1d,
    MaxPool2d,
    MaxPool3d,
)
from mlx.nn.layers.positional_encoding import ALiBi, RoPE, SinusoidalPositionalEncoding
from mlx.nn.layers.quantized import (
    QQLinear,
    QuantizedEmbedding,
    QuantizedLinear,
    quantize,
)
from mlx.nn.layers.recurrent import GRU, LSTM, RNN
from mlx.nn.layers.transformer import (
    MultiHeadAttention,
    Transformer,
    TransformerDecoder,
    TransformerDecoderLayer,
    TransformerEncoder,
    TransformerEncoderLayer,
)
from mlx.nn.layers.upsample import Upsample


================================================
FILE: python/mlx/nn/layers/activations.py
================================================
# Copyright © 2023 Apple Inc.

import math
from functools import partial
from typing import Any

import mlx.core as mx
from mlx.nn.layers.base import Module


def _make_activation_module(f):
    def decorator(klass):
        klass.__call__ = lambda _, x: f(x)
        return klass

    return decorator


@partial(mx.compile, shapeless=True)
def sigmoid(x):
    r"""Applies the sigmoid function.

    .. math::
        \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}

    Args:
        x: The input array.

    Returns:
        The result of ``mx.sigmoid(x)``.
    """
    return mx.sigmoid(x)


@partial(mx.compile, shapeless=True)
def relu(x):
    r"""Applies the Rectified Linear Unit.

    Simply ``mx.maximum(x, 0)``.

    Args:
        x: The input array.

    Returns:
        The element-wise maximum of ``x`` and ``0``.
    """
    return mx.maximum(x, 0)


@partial(mx.compile, shapeless=True)
def relu2(x):
    r"""Applies the ReLU² activation function.

    Computes :math:`\max(0, x)^2` element wise.

    Args:
        x: The input array.

    Returns:
        The square of ``mx.maximum(x, 0)``.
    """
    rectified = mx.maximum(x, 0)
    return mx.square(rectified)


@partial(mx.compile, shapeless=True)
def relu6(x):
    r"""Applies the Rectified Linear Unit 6.

    Computes :math:`\min(\max(x, 0), 6)` element wise.

    Args:
        x: The input array.

    Returns:
        ``x`` clamped from below at ``0`` and from above at ``6``.
    """
    rectified = mx.maximum(x, 0)
    return mx.minimum(rectified, 6.0)


@partial(mx.compile, shapeless=True)
def leaky_relu(x, negative_slope=0.01):
    r"""Applies the Leaky Rectified Linear Unit.

    Computes ``mx.maximum(negative_slope * x, x)``.

    Args:
        x: The input array.
        negative_slope: Factor applied to ``x`` to form the second candidate
            of the maximum. Default: ``0.01``
    """
    scaled = negative_slope * x
    return mx.maximum(scaled, x)


@partial(mx.compile, shapeless=True)
def log_softmax(x, axis=-1):
    r"""Applies the Log Softmax function.

    Applies :math:`x_i - \log \sum_j e^{x_j}` element wise, where the sum runs
    over ``axis``.

    Args:
        x: The input array.
        axis: The axis along which the log-sum-exp is taken. Default: ``-1``
    """
    # keepdims=True keeps the reduced axis so the result broadcasts against x.
    return x - mx.logsumexp(x, axis=axis, keepdims=True)


@partial(mx.compile, shapeless=True)
def elu(x, alpha=1.0):
    r"""Applies the Exponential Linear Unit.

    Computes ``mx.where(x > 0, x, alpha * (mx.exp(x) - 1))``.

    Args:
        x: The input array.
        alpha: Scale of the exponential branch. Default: ``1.0``
    """
    is_positive = x > 0
    exp_branch = alpha * (mx.exp(x) - 1)
    return mx.where(is_positive, x, exp_branch)


@partial(mx.compile, shapeless=True)
def softmax(x, axis=-1):
    r"""Applies the Softmax function.

    Applies :math:`\frac{e^{x_i}}{\sum_j e^{x_j}}` element wise.

    Args:
        x: The input array.
        axis: The axis passed to ``mx.softmax``. Default: ``-1``
    """
    return mx.softmax(x, axis=axis)


@partial(mx.compile, shapeless=True)
def softplus(x):
    r"""Applies the Softplus function.

    Applies :math:`\log(1 + \exp(x))` element wise.

    Args:
        x: The input array.
    """
    # log(exp(x) + exp(0)) == log(1 + exp(x))
    return mx.logaddexp(x, 0)


@partial(mx.compile, shapeless=True)
def softsign(x):
    r"""Applies the Softsign function.

    Computes :math:`\frac{x}{1 + |x|}` element wise.

    Args:
        x: The input array.
    """
    denominator = 1 + mx.abs(x)
    return mx.divide(x, denominator)


@partial(mx.compile, shapeless=True)
def softshrink(x, lambd: float = 0.5):
    r"""Applies the Softshrink activation function.

    .. math::
        \text{softshrink}(x) = \begin{cases}
        x - \lambda & \text{if } x > \lambda \\
        x + \lambda & \text{if } x < -\lambda \\
        0 & \text{otherwise}
        \end{cases}

    Args:
        x: The input array.
        lambd: The :math:`\lambda` threshold. Default: ``0.5``
    """
    outside_band = mx.abs(x) > lambd
    # Subtracting sign(x) * lambd moves the value towards zero on both sides.
    shrunk = x - mx.sign(x) * lambd
    return mx.where(outside_band, shrunk, 0)


@partial(mx.compile, shapeless=True)
def celu(x, alpha=1.0):
    r"""Applies the Continuously Differentiable Exponential Linear Unit.

    Computes :math:`\max(0, x) + \min(0, \alpha * (\exp(x / \alpha) - 1))`
    element wise.

    Args:
        x: The input array.
        alpha: The :math:`\alpha` value of the formulation. Default: ``1.0``
    """
    positive_part = mx.maximum(x, 0.0)
    # The exponential only ever sees the non-positive part of ``x``.
    negative_part = alpha * (mx.exp(mx.minimum(x, 0.0) / alpha) - 1)
    return positive_part + negative_part


@partial(mx.compile, shapeless=True)
def silu(x):
    r"""Applies the Sigmoid Linear Unit. Also known as Swish.

    Computes :math:`x \sigma(x)` element wise, where :math:`\sigma(\cdot)` is
    the logistic sigmoid.

    Args:
        x: The input array.
    """
    gate = mx.sigmoid(x)
    return x * gate


@partial(mx.compile, shapeless=True)
def log_sigmoid(x):
    r"""Applies the Log Sigmoid function.

    Applies :math:`\log(\sigma(x)) = -\log(1 + e^{-x})` element wise.

    Args:
        x: The input array.
    """
    # softplus(-x) == log(1 + exp(-x)), so its negation is log(sigmoid(x)).
    return -softplus(-x)


@partial(mx.compile, shapeless=True)
def gelu(x) -> mx.array:
    r"""Applies the Gaussian Error Linear Units function.

    .. math::
        \textrm{GELU}(x) = x * \Phi(x)

    where :math:`\Phi(x)` is the Gaussian CDF.

    See also :func:`gelu_approx` and :func:`gelu_fast_approx` for faster
    approximations.

    Args:
        x: The input array.
    """
    # Phi(x) = (1 + erf(x / sqrt(2))) / 2; the division by 2 is applied last.
    one_plus_erf = 1 + mx.erf(x / math.sqrt(2))
    return x * one_plus_erf / 2


@partial(mx.compile, shapeless=True)
def gelu_approx(x):
    r"""An approximation to Gaussian Error Linear Unit.

    See :func:`gelu` for the exact computation.

    This function approximates ``gelu`` with a maximum absolute error :math:`<
    0.0005` in the range :math:`[-6, 6]` using the following

    .. math::

        x = 0.5 * x * \left(1 + \text{Tanh}\left(\sqrt{2 / \pi} * \left(x + 0.044715 * x^3\right)\right)\right)

    Args:
        x: The input array.
    """
    tanh_arg = math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)
    return 0.5 * x * (1 + mx.tanh(tanh_arg))


@partial(mx.compile, shapeless=True)
def gelu_fast_approx(x):
    r"""A fast approximation to Gaussian Error Linear Unit.

    See :func:`gelu` for the exact computation.

    This function approximates ``gelu`` with a maximum absolute error :math:`<
    0.015` in the range :math:`[-6, 6]` using the following

    .. math::

        x = x \sigma\left(1.702 x\right)

    where :math:`\sigma(\cdot)` is the logistic sigmoid.

    Args:
        x: The input array.

    References:
    - https://github.com/hendrycks/GELUs
    - https://arxiv.org/abs/1606.08415
    """
    gate = mx.sigmoid(1.702 * x)
    return x * gate


def glu(x: mx.array, axis: int = -1) -> mx.array:
    r"""Applies the gated linear unit function.

    The ``axis`` dimension of the input is split into two halves
    (:math:`a` and :math:`b`) and the result is :math:`a * \sigma(b)`.

    .. math::
        \textrm{GLU}(x) = a * \sigma(b)

    Args:
        x (array): The input array.
        axis (int): The dimension to split along. Default: ``-1``

    Returns:
        array: The first half multiplied by the sigmoid of the second half.
    """
    halves = mx.split(x, indices_or_sections=2, axis=axis)
    values, gates = halves
    return values * mx.sigmoid(gates)


@partial(mx.compile, shapeless=True)
def step(x: mx.array, threshold: float = 0.0):
    r"""Applies the Step Activation Function.

    This function implements a binary step activation, where the output is set
    to 1 if the input is strictly greater than a specified threshold, and 0
    otherwise.

    .. math::
        \text{step}(x) = \begin{cases}
        0 & \text{if } x \leq \text{threshold} \\
        1 & \text{if } x > \text{threshold}
        \end{cases}

    Args:
        x: The input array.
        threshold: The value to threshold at. Default: ``0.0``
    """
    above_threshold = x > threshold
    return mx.where(above_threshold, 1, 0)


@partial(mx.compile, shapeless=True)
def selu(x):
    r"""Applies the Scaled Exponential Linear Unit.

    .. math::
        \text{selu}(x) = \begin{cases}
        \lambda x & \text{if } x > 0 \\
        \lambda \alpha (\exp(x) - 1) & \text{if } x \leq 0
        \end{cases}

    where :math:`\lambda = 1.0507` and :math:`\alpha = 1.67326`.

    See also :func:`elu`.

    Args:
        x: The input array.
    """
    alpha = 1.67326
    scale = 1.0507  # the lambda of the formula above
    return elu(x, alpha) * scale


@partial(mx.compile, shapeless=True)
def prelu(x: mx.array, alpha: mx.array) -> mx.array:
    r"""Applies the element-wise parametric ReLU.

    .. math::
        \text{PReLU}(x) = \max(0,x) + a * \min(0,x)

    where :math:`a` is an array.

    Args:
        x (array): The input array.
        alpha (array): The multiplier :math:`a` applied to the negative part.
    """
    positive_part = mx.maximum(0, x)
    negative_part = mx.minimum(0, x)
    return positive_part + alpha * negative_part


@partial(mx.compile, shapeless=True)
def mish(x: mx.array) -> mx.array:
    r"""Applies the Mish function, element-wise.

    Mish: A Self Regularized Non-Monotonic Neural Activation Function.

    Reference: https://arxiv.org/abs/1908.08681

    .. math::
        \text{Mish}(x) = x * \text{Tanh}(\text{Softplus}(x))

    Args:
        x (array): The input array.
    """
    gate = mx.tanh(softplus(x))
    return x * gate


@partial(mx.compile, shapeless=True)
def hardswish(x):
    r"""Applies the hardswish function, element-wise.

    .. math::
        \text{Hardswish}(x) = x * \min(\max(x + 3, 0), 6) / 6

    Args:
        x: The input array.
    """
    # ``x + 3`` clamped to the range [0, 6].
    clamped = mx.minimum(mx.maximum(x + 3, 0), 6)
    return x * clamped / 6


@partial(mx.compile, shapeless=True)
def hard_tanh(x, min_val=-1.0, max_val=1.0):
    r"""Applies the HardTanh function.

    Computes :math:`\min(\max(x, \mathrm{min\_val}), \mathrm{max\_val})`
    element-wise.

    Args:
        x: The input array.
        min_val: The lower clamp value. Default: ``-1.0``
        max_val: The upper clamp value. Default: ``1.0``
    """
    # The lower bound is applied first, then the upper bound.
    floored = mx.maximum(x, min_val)
    return mx.minimum(floored, max_val)


@partial(mx.compile, shapeless=True)
def hard_shrink(x, lambd=0.5):
    r"""Applies the HardShrink activation function.

    .. math::
        \text{hardshrink}(x) = \begin{cases}
        x & \text{if } x > \lambda \\
        x & \text{if } x < -\lambda \\
        0 & \text{otherwise}
        \end{cases}

    Args:
        x: The input array.
        lambd: The :math:`\lambda` threshold. Default: ``0.5``
    """
    outside_band = mx.abs(x) > lambd
    return mx.where(outside_band, x, 0)


@partial(mx.compile, shapeless=True)
def softmin(x, axis=-1):
    r"""Applies the Softmin function.

    Applies :math:`\frac{e^{-x_i}}{\sum_j e^{-x_j}}` element-wise.

    Args:
        x: The input array.
        axis: The axis passed to ``mx.softmax``. Default: ``-1``
    """
    # Softmin is the softmax of the negated input.
    return mx.softmax(-x, axis=axis)


def tanh(x):
    """Applies the hyperbolic tangent function.

    Simply ``mx.tanh(x)``.

    Args:
        x: The input array.
    """
    return mx.tanh(x)


class GLU(Module):
    r"""Applies the gated linear unit function.

    The ``axis`` dimension of the input is split into two halves
    (:math:`a` and :math:`b`) and the result is :math:`a * \sigma(b)`.

    .. math::
        \textrm{GLU}(x) = a * \sigma(b)

    See :func:`glu` for the functional equivalent.

    Args:
        axis (int): The dimension to split along. Default: ``-1``
    """

    def __init__(self, axis: int = -1):
        super().__init__()
        self.axis = axis

    def __call__(self, x) -> Any:
        split_axis = self.axis
        return glu(x, split_axis)


@_make_activation_module(sigmoid)
class Sigmoid(Module):
    r"""Applies the sigmoid function, element-wise.

    .. math::
        \text{Sigmoid}(x) = \sigma(x) = \frac{1}{1 + \exp(-x)}

    See :func:`sigmoid` for the functional equivalent.
    """


@_make_activation_module(mish)
class Mish(Module):
    r"""Applies the Mish function, element-wise.

    Reference: https://arxiv.org/abs/1908.08681

    .. math::
        \text{Mish}(x) = x * \text{Tanh}(\text{Softplus}(x))

    See :func:`mish` for the functional equivalent.
    """


@_make_activation_module(relu)
class ReLU(Module):
    r"""Applies the Rectified Linear Unit.

    Simply ``mx.maximum(x, 0)``.

    See :func:`relu` for the functional equivalent.
    """


@_make_activation_module(relu2)
class ReLU2(Module):
    r"""Applies the ReLU² activation function.

    Applies :math:`\max(0, x)^2` element wise.

    See :func:`relu2` for the functional equivalent.
    """


@_make_activation_module(relu6)
class ReLU6(Module):
    r"""Applies the Rectified Linear Unit 6.

    Applies :math:`\min(\max(x, 0), 6)` element wise.

    See :func:`relu6` for the functional equivalent.
    """


class LeakyReLU(Module):
    r"""Applies the Leaky Rectified Linear Unit.

    Simply ``mx.maximum(negative_slope * x, x)``.

    See :func:`leaky_relu` for the functional equivalent.

    Args:
        negative_slope: Controls the angle of the negative slope. Default: ``1e-2``
    """

    def __init__(self, negative_slope=1e-2):
        super().__init__()
        # Kept on the instance and forwarded to leaky_relu on every call.
        self._negative_slope = negative_slope

    def __call__(self, x):
        return leaky_relu(x, self._negative_slope)


class ELU(Module):
    r"""Applies the Exponential Linear Unit.

    Simply ``mx.where(x > 0, x, alpha * (mx.exp(x) - 1))``.

    See :func:`elu` for the functional equivalent.

    Args:
        alpha: the :math:`\alpha` value for the ELU formulation. Default: ``1.0``
    """

    def __init__(self, alpha=1.0):
        super().__init__()
        # Kept on the instance and forwarded to elu on every call.
        self._alpha = alpha

    def __call__(self, x):
        return elu(x, self._alpha)


@_make_activation_module(softmax)
class Softmax(Module):
    r"""Applies the Softmax function.

    The module calls :func:`softmax` with the input only, so the default
    ``axis=-1`` is always used.

    See :func:`softmax` for the functional equivalent.
    """


@_make_activation_module(softplus)
class Softplus(Module):
    r"""Applies the Softplus function.

    Applies :math:`\log(1 + \exp(x))` element wise.

    See :func:`softplus` for the functional equivalent.
    """


@_make_activation_module(softsign)
class Softsign(Module):
    r"""Applies the Softsign function.

    Applies :math:`\frac{x}{1 + |x|}` element wise.

    See :func:`softsign` for the functional equivalent.
    """


class Softshrink(Module):
    r"""Applies the Softshrink function.

    See :func:`softshrink` for the functional equivalent.

    Args:
        lambd: the :math:`\lambda` value for Softshrink. Default: ``0.5``
    """

    def __init__(self, lambd=0.5):
        super().__init__()
        # Kept on the instance and forwarded to softshrink on every call.
        self.lambd = lambd

    def __call__(self, x):
        return softshrink(x, self.lambd)


class CELU(Module):
    r"""Applies the Continuously Differentiable Exponential Linear Unit.

    Applies :math:`\max(0, x) + \min(0, \alpha * (\exp(x / \alpha) - 1))`
    element wise.

    See :func:`celu` for the functional equivalent.

    Args:
        alpha: the :math:`\alpha` value for the CELU formulation. Default: ``1.0``
    """

    def __init__(self, alpha=1.0):
        super().__init__()
        # Kept on the instance and forwarded to celu on every call.
        self._alpha = alpha

    def __call__(self, x):
        return celu(x, self._alpha)


@_make_activation_module(silu)
class SiLU(Module):
    r"""Applies the Sigmoid Linear Unit. Also known as Swish.

    Applies :math:`x \sigma(x)` element wise, where :math:`\sigma(\cdot)` is
    the logistic sigmoid.

    See :func:`silu` for the functional equivalent.
    """


@_make_activation_module(log_softmax)
class LogSoftmax(Module):
    r"""Applies the Log Softmax function.

    The module calls :func:`log_softmax` with the input only, so the default
    ``axis=-1`` is always used.

    See :func:`log_softmax` for the functional equivalent.
    """


@_make_activation_module(log_sigmoid)
class LogSigmoid(Module):
    r"""Applies the Log Sigmoid function.

    Applies :math:`\log(\sigma(x)) = -\log(1 + e^{-x})` element wise.

    See :func:`log_sigmoid` for the functional equivalent.
    """


class PReLU(Module):
    r"""Applies the element-wise parametric ReLU.

    Applies :math:`\max(0, x) + a * \min(0, x)` element wise, where :math:`a`
    is an array.

    See :func:`prelu` for the functional equivalent.

    Args:
        num_parameters: number of :math:`a` to learn. Default: ``1``
        init: the initial value of :math:`a`. Default: ``0.25``
    """

    def __init__(self, num_parameters=1, init=0.25):
        super().__init__()
        # One-dimensional array of length ``num_parameters`` filled with ``init``.
        self.weight = mx.full([num_parameters], init)

    def __call__(self, x: mx.array):
        return prelu(x, self.weight)


class GELU(Module):
    r"""Applies the Gaussian Error Linear Units.

    .. math::
        \textrm{GELU}(x) = x * \Phi(x)

    where :math:`\Phi(x)` is the Gaussian CDF.

    However, if ``approx`` is set to 'precise' or 'fast' it applies

    .. math::
        \textrm{GELUApprox}(x) &= 0.5 * x * \left(1 + \text{Tanh}\left(\sqrt{2 / \pi} * \left(x + 0.044715 * x^3\right)\right)\right) \\
        \textrm{GELUFast}(x) &= x * \sigma\left(1.702 * x\right)

    respectively.

    .. note::
       For compatibility with the PyTorch API, 'tanh' can be used as an alias
       for 'precise'.

    See :func:`gelu`, :func:`gelu_approx` and :func:`gelu_fast_approx` for the
    functional equivalents and information regarding error bounds.


    Args:
        approx ('none' | 'precise' | 'tanh' | 'fast'): Which approximation to
            gelu to use if any. Default: ``'none'``

    Raises:
        ValueError: If ``approx`` is not one of the accepted values.
    """

    def __init__(self, approx="none"):
        super().__init__()
        self._approx = approx
        accepted = ["none", "precise", "tanh", "fast"]
        if approx in accepted:
            return
        raise ValueError(
            f"The approximation should be in {accepted} but '{approx}' was given"
        )

    def __call__(self, x):
        mode = self._approx
        if mode == "none":
            return gelu(x)
        if mode in ["precise", "tanh"]:
            return gelu_approx(x)
        # The constructor only admits one remaining value: "fast".
        return gelu_fast_approx(x)


@_make_activation_module(tanh)
class Tanh(Module):
    r"""Applies the hyperbolic tangent function.

    Simply ``mx.tanh(x)``.

    See :func:`tanh` for the functional equivalent.
    """


@_make_activation_module(hardswish)
class Hardswish(Module):
    r"""Applies the hardswish function, element-wise.

    .. math::
        \text{Hardswish}(x) = x * \min(\max(x + 3, 0), 6) / 6

    See :func:`hardswish` for the functional equivalent.
    """


class Step(Module):
    r"""Applies the Step Activation Function.

    This function implements a binary step activation, where the output is set
    to 1 if the input is strictly greater than a specified threshold, and 0
    otherwise.

    .. math::
        \text{step}(x) = \begin{cases}
        0 & \text{if } x \leq \text{threshold} \\
        1 & \text{if } x > \text{threshold}
        \end{cases}

    See :func:`step` for the functional equivalent.

    Args:
        threshold: The value to threshold at. Default: ``0.0``
    """

    def __init__(self, threshold: float = 0.0):
        super().__init__()
        # Kept on the instance and forwarded to step on every call.
        self.threshold = threshold

    def __call__(self, x: mx.array):
        return step(x, self.threshold)


@_make_activation_module(selu)
class SELU(Module):
    r"""Applies the Scaled Exponential Linear Unit.

    Equivalent to ``elu(x, 1.67326) * 1.0507``.

    See :func:`selu` for the functional equivalent.
    """


@_make_activation_module(hard_tanh)
class HardTanh(Module):
    r"""Applies the HardTanh function.

    The module calls :func:`hard_tanh` with the input only, so the default
    bounds ``min_val=-1.0`` and ``max_val=1.0`` are always used.

    See :func:`hard_tanh` for the functional equivalent.
    """


class HardShrink(Module):
    r"""Applies the HardShrink function.

    See :func:`hard_shrink` for the functional equivalent.

    Args:
        lambd: the :math:`\lambda` value for Hardshrink. Default: ``0.5``
    """

    def __init__(self, lambd=0.5):
        super().__init__()
        # Stored so the documented ``lambd`` argument is actually honoured;
        # the default keeps ``HardShrink()`` behaving as before.
        self.lambd = lambd

    def __call__(self, x):
        return hard_shrink(x, self.lambd)


@_make_activation_module(softmin)
class Softmin(Module):
    r"""Applies the Softmin function.

    The module calls :func:`softmin` with the input only, so the default
    ``axis=-1`` is always used.

    See :func:`softmin` for the functional equivalent.
    """


================================================
FILE: python/mlx/nn/layers/base.py
================================================
# Copyright © 2023 Apple Inc.

from __future__ import annotations

import textwrap
from typing import Any, Callable, List, Optional, Tuple, Union

import mlx.core as mx
from mlx.utils import tree_flatten, tree_unflatten


class Module(dict):
    """Base class for building neural networks with MLX.

    All the layers provided in :mod:`mlx.nn.layers` subclass this class and
    your models should do the same.

    A ``Module`` can contain other ``Module`` instances or :class:`mlx.core.array`
    instances in arbitrary nesting of python lists or dicts. The ``Module``
    then allows recursively extracting all the :class:`mlx.core.array` instances
    using :meth:`mlx.nn.Module.parameters`.

    In addition, the ``Module`` has the concept of trainable and non trainable
    parameters (called "frozen"). When using :func:`mlx.nn.value_and_grad`
    the gradients are returned only with respect to the trainable parameters.
    All arrays in a module are trainable unless they are added in the "frozen"
    set by calling :meth:`freeze`.

    .. code-block:: python

        import mlx.core as mx
        import mlx.nn as nn

        class MyMLP(nn.Module):
            def __init__(self, in_dims: int, out_dims: int, hidden_dims: int = 16):
                super().__init__()

                self.in_proj = nn.Linear(in_dims, hidden_dims)
                self.out_proj = nn.Linear(hidden_dims, out_dims)

            def __call__(self, x):
                x = self.in_proj(x)
                x = mx.maximum(x, 0)
                return self.out_proj(x)

        model = MyMLP(2, 1)

        # All the model parameters are created but since MLX is lazy by
        # default, they are not evaluated yet. Calling `mx.eval` actually
        # allocates memory and initializes the parameters.
        mx.eval(model.parameters())

        # Setting a parameter to a new value is as simple as accessing that
        # parameter and assigning a new array to it.
        model.in_proj.weight = model.in_proj.weight * 2
        mx.eval(model.parameters())
    """

    __call__: Callable

    def __init__(self):
        """Set up the module's bookkeeping state.

        Should be called by the subclasses of ``Module``.
        """
        # Keys that ``trainable_parameter_filter`` excludes; ``freeze`` adds
        # to this set and ``unfreeze`` removes from it.
        self._no_grad = set()
        # Flag reported by the ``training`` property; modules start in
        # training mode.
        self._training = True

    @property
    def training(self) -> bool:
        """Boolean indicating if the model is in training mode.

        The flag is initialised to ``True`` and is changed through
        :meth:`train` / :meth:`eval`.
        """
        return self._training

    @property
    def state(self):
        """The module's state dictionary.

        The state dictionary is the module itself (a ``dict`` subclass). It
        holds the attributes that ``__setattr__`` routes into the dictionary,
        i.e. arrays and ``dict``/``list``/``tuple`` containers, which includes
        the parameters returned by :meth:`Module.parameters`.

        Unlike :meth:`Module.parameters`, the :attr:`Module.state` property is
        a reference to the module's state. Updates to it will be reflected in
        the original module.
        """
        # No copy is made: the module object is the backing dictionary.
        return self

    def _extra_repr(self) -> str:
        """Return the text ``__repr__`` places right after ``ClassName(``.

        The base implementation contributes nothing; subclasses can override
        it to describe their configuration.
        """
        return ""

    def __repr__(self):
        """Render the module as ``Name(extra`` followed by one indented
        ``(path): repr`` line per child module and a closing parenthesis."""
        submodules = tree_flatten(self.children(), is_leaf=self.is_module)

        pieces = [f"{type(self).__name__}({self._extra_repr()}"]
        for name, child in submodules:
            pieces.append("\n")
            pieces.append(textwrap.indent(f"({name}): {repr(child)}", prefix="  "))
        # The closing parenthesis only moves to its own line when there was
        # at least one child line above it.
        if submodules:
            pieces.append("\n")
        pieces.append(")")

        return "".join(pieces)

    def __getattr__(self, key: str):
        if (value := self.get(key, None)) is not None:
            return value
        else:
            super(Module, self).__getattribute__(key)

    def __setattr__(self, key: str, val: Any):
        """Store ``val`` either as a dict entry or as a plain attribute.

        Arrays and ``dict``/``list``/``tuple`` containers become dictionary
        entries of the module; every other value is set as a regular python
        attribute and any dict entry with the same name is dropped.
        """
        if not isinstance(val, (mx.array, dict, list, tuple)):
            super(Module, self).__setattr__(key, val)
            # Make sure a stale dict entry cannot shadow the new attribute.
            self.pop(key, None)
            return

        # The name may currently exist as a plain attribute (not a dict
        # entry). Remove it so later lookups fall through to ``__getattr__``
        # and find the dict entry instead.
        if hasattr(self, key) and key not in self:
            delattr(self, key)
        self[key] = val

    def __delattr__(self, name):
        if (val := self.get(name, None)) is not None:
            del self[name]
        else:
            super().__delattr__(name)

    def load_weights(
        self,
        file_or_weights: Union[str, List[Tuple[str, mx.array]]],
        strict: bool = True,
    ) -> Module:
        """
        Load weights into the model from a file or from a list of pairs.

        Args:
            file_or_weights (str or list(tuple(str, mx.array))): Either the
                path of a weights file (``.npz`` or ``.safetensors``) or a
                list of ``(parameter name, array)`` pairs.
            strict (bool, optional): If ``True`` the provided weights must
              match the parameters of the model exactly: no extra names, no
              missing names, every value an ``mx.array`` and every shape
              equal. If ``False`` only the weights that exist in the model
              are loaded and shapes are not checked. Default: ``True``.

        Returns:
            The module instance after updating the weights.

        Raises:
            ValueError: In strict mode, when the provided weights do not
                match the model's parameters.

        Example:

            .. code-block:: python

                import mlx.core as mx
                import mlx.nn as nn
                model = nn.Linear(10, 10)

                # Load from file
                model.load_weights("weights.npz")

                # Load from .safetensors file
                model.load_weights("weights.safetensors")

                # Load from list
                weights = [
                    ("weight", mx.random.uniform(shape=(10, 10))),
                    ("bias",  mx.zeros((10,))),
                ]
                model.load_weights(weights)

                # Missing weight
                weights = [
                    ("weight", mx.random.uniform(shape=(10, 10))),
                ]

                # Raises a ValueError exception
                model.load_weights(weights)

                # Ok, only updates the weight but not the bias
                model.load_weights(weights, strict=False)
        """
        if isinstance(file_or_weights, str):
            weights = list(mx.load(file_or_weights).items())
        else:
            weights = file_or_weights

        if strict:
            new_weights = dict(weights)
            curr_weights = tree_flatten(self.parameters(), destination={})

            # Names that were provided but that the model does not have.
            extras = new_weights.keys() - curr_weights.keys()
            if extras:
                num_extra = len(extras)
                extras = ",\n".join(sorted(extras))
                raise ValueError(
                    f"Received {num_extra} parameters not in model: \n{extras}."
                )

            # Names the model has but that were not provided.
            missing = curr_weights.keys() - new_weights.keys()
            if missing:
                num_missing = len(missing)
                missing = ",\n".join(sorted(missing))
                raise ValueError(f"Missing {num_missing} parameters: \n{missing}.")

            # The name sets agree; now compare each value's type and shape.
            for k, v in curr_weights.items():
                v_new = new_weights[k]
                if not isinstance(v_new, mx.array):
                    raise ValueError(
                        "Expected mx.array but received "
                        f"{type(v_new)} for parameter {k}"
                    )
                if v_new.shape != v.shape:
                    raise ValueError(
                        f"Expected shape {v.shape} but received "
                        f"shape {v_new.shape} for parameter {k}"
                    )

        if len(weights) > 0:
            nested = tree_unflatten(weights)
            self.update(nested, strict=False)
        return self

    def save_weights(self, file: str):
        """
        Save the model's weights to a file.

        The saving method is determined by the file extension:

        - ``.npz`` will use :func:`mx.savez`
        - ``.safetensors`` will use :func:`mx.save_safetensors`

        Args:
            file (str): Destination path; must end in ``.npz`` or
                ``.safetensors``.

        Raises:
            ValueError: If ``file`` has any other extension.
        """
        flat_params = tree_flatten(self.parameters(), destination={})

        if file.endswith(".npz"):
            mx.savez(file, **flat_params)
            return
        if file.endswith(".safetensors"):
            mx.save_safetensors(file, flat_params)
            return

        raise ValueError(
            f"Unsupported file extension for {file}. Use '.npz' or '.safetensors'."
        )

    @staticmethod
    def is_module(value):
        """Return ``True`` if ``value`` is a :class:`Module` instance."""
        return isinstance(value, Module)

    @staticmethod
    def valid_child_filter(module, key, value):
        """Filter used when collecting child modules.

        Keeps ``dict`` and ``list`` values (a :class:`Module` is itself a
        ``dict``); ``module`` and ``key`` are accepted for the filter
        signature but are not inspected.
        """
        return isinstance(value, (dict, list))

    @staticmethod
    def valid_parameter_filter(module, key, value):
        """Filter used when collecting parameters.

        Keeps arrays and ``dict``/``list`` containers whose key does not
        start with an underscore; ``module`` is not inspected.
        """
        return isinstance(value, (dict, list, mx.array)) and not key.startswith("_")

    @staticmethod
    def trainable_parameter_filter(module, key, value):
        """Filter used when collecting trainable parameters.

        Keeps what :meth:`valid_parameter_filter` keeps, minus the keys
        recorded in ``module._no_grad`` (i.e. the frozen ones).
        """
        return (
            Module.valid_parameter_filter(module, key, value)
            and key not in module._no_grad
        )

    def filter_and_map(
        self,
        filter_fn: Callable[[Module, str, Any], bool],
        map_fn: Optional[Callable] = None,
        is_leaf_fn: Optional[Callable[[Module, str, Any], bool]] = None,
    ):
        """Recursively filter the contents of the module using ``filter_fn``,
        namely only select keys and values where ``filter_fn`` returns true.

        This is used to implement :meth:`parameters` and :meth:`trainable_parameters`
        but it can also be used to extract any subset of the module's parameters.

        Args:
            filter_fn (Callable): Given a value, the key in which it is found
                and the containing module, decide whether to keep the value or
                drop it.
            map_fn (Callable, optional): Optionally transform the value before
                returning it.
            is_leaf_fn (Callable, optional): Given a value, the key in which it
                is found and the containing module decide if it is a leaf.

        Returns:
            A dictionary containing the contents of the module recursively filtered
        """

        map_fn = map_fn or (lambda x: x)
        is_leaf_fn = is_leaf_fn or (
            lambda m, k, v: not isinstance(v, (Module, dict, list))
        )
        return {
            k: _unwrap(self, k, v, filter_fn, map_fn, is_leaf_fn)
            for k, v in self.items()
            if filter_fn(self, k, v)
        }

    def parameters(self):
        """Recursively return all the :class:`mlx.core.array` members of this Module
        as a dict of dicts and lists."""
        # Selection is delegated to ``valid_parameter_filter``.
        return self.filter_and_map(self.valid_parameter_filter)

    def trainable_parameters(self):
        """Recursively return all the non frozen :class:`mlx.core.array` members of
        this Module as a dict of dicts and lists."""
        # Same traversal as ``parameters`` but additionally skipping the keys
        # listed in each module's ``_no_grad`` set.
        return self.filter_and_map(self.trainable_parameter_filter)

    def children(self):
        """Return the direct descendants of this Module instance."""
        # Every ``Module`` encountered is treated as a leaf, so the traversal
        # stops at the first module on each path instead of descending into it.
        return self.filter_and_map(
            self.valid_child_filter, is_leaf_fn=lambda m, k, v: isinstance(v, Module)
        )

    def leaf_modules(self):
        """Return the submodules that do not contain other modules."""

        def _has_no_submodules(m, k, v):
            # Only modules qualify; a module is a leaf when flattening its
            # children produces nothing.
            if not isinstance(v, Module):
                return False
            return len(tree_flatten(v.children())) == 0

        return self.filter_and_map(
            self.valid_child_filter, is_leaf_fn=_has_no_submodules
        )

    def update(self, parameters: dict, strict: bool = True) -> Module:
        """Replace the parameters of this Module with the provided ones in the
        dict of dicts and lists.

        Commonly used by the optimizer to change the model to the updated
        (optimized) parameters. Also used by the :meth:`mlx.nn.value_and_grad` to set the
        tracers in the model in order to compute gradients.

        The passed in parameters dictionary need not be a full dictionary
        similar to :meth:`parameters`. Only the provided locations will be
        updated.

        Args:
            parameters (dict): A complete or partial dictionary of the modules
                parameters.
            strict (bool): If ``True`` checks that ``parameters`` is a
                subset of the module's parameters. Default: ``True``.
        Returns:
            The module instance after updating the parameters.

        Raises:
            ValueError: In strict mode, when a key or list index does not
                exist in the module, or when a value has an unexpected type.
        """

        # Recursive worker: walks ``parameters`` alongside the matching
        # location ``dst`` in the module and overwrites arrays in place.
        def apply(dst, parameters):
            if isinstance(parameters, dict):
                for k in parameters:
                    if k in dst:
                        current_value = dst[k]
                        new_value = parameters[k]
                        if isinstance(current_value, mx.array):
                            # An array slot is replaced; strict mode insists
                            # that the replacement is an array as well.
                            if strict and not isinstance(new_value, mx.array):
                                raise ValueError(
                                    f"Received invalid type: {type(new_value).__name__}."
                                )
                            dst[k] = new_value
                        else:
                            # Anything else is descended into.
                            apply(current_value, new_value)
                    elif strict:
                        raise ValueError(f'Module does not have parameter named "{k}".')
            elif isinstance(parameters, list):
                for i in range(len(parameters)):
                    # Entries past the end of the destination are an error in
                    # strict mode and are skipped otherwise.
                    if i >= len(dst):
                        if strict:
                            raise ValueError(
                                f"List index {i} is out of bounds for "
                                f"destination of length {len(dst)}."
                            )
                        continue
                    current_value = dst[i]
                    new_value = parameters[i]
                    if isinstance(current_value, mx.array):
                        if strict and not isinstance(new_value, mx.array):
                            raise ValueError(
                                f"Received invalid type: {type(new_value).__name__}."
                            )
                        dst[i] = new_value
                    else:
                        apply(current_value, new_value)
            elif strict:
                # Neither a dict nor a list: nothing can be matched against it.
                raise ValueError(f"Received invalid type: {type(parameters).__name__}.")

        apply(self, parameters)
        return self

    def apply(
        self,
        map_fn: Callable[[mx.array], mx.array],
        filter_fn: Optional[Callable[[Module, str, Any], bool]] = None,
    ) -> Module:
        """Map all the parameters using the provided ``map_fn`` and immediately
        update the module with the mapped parameters.

        For instance running ``model.apply(lambda x: x.astype(mx.float16))``
        casts all parameters to 16 bit floats.

        Args:
            map_fn (Callable): Maps an array to another array
            filter_fn (Callable, optional): Filter to select which arrays to
                map (default: :meth:`Module.valid_parameter_filter`).

        Returns:
            The module instance after updating the parameters.
        """
        filter_fn = filter_fn or Module.valid_parameter_filter
        self.update(self.filter_and_map(filter_fn, map_fn))
        return self

    def update_modules(self, modules: dict, strict: bool = True) -> Module:
        """Replace the child modules of this :class:`Module` instance with the
        provided ones in the dict of dicts and lists.

        It is the equivalent of :meth:`Module.update` but for modules instead
        of parameters and allows us to flexibly edit complex architectures by
        programmatically swapping layers.

        The passed in modules dictionary need not be a full dictionary
        similar to :meth:`children`. Only the provided locations will be
        updated.

        Args:
            modules (dict): A complete or partial dictionary of the module's
                submodules.
            strict (bool): If ``True`` checks that ``modules`` is a
                subset of the child modules of this instance. Default: ``True``.
        Returns:
            The module instance after updating the submodules.
        """
        # The recursive work is done by the module-level ``_update_modules``.
        _update_modules(self, modules, strict)
        return self

    def apply_to_modules(self, apply_fn: Callable[[str, Module], Any]) -> Module:
        """Call ``apply_fn`` on this module and on every module nested in it.

        Args:
            apply_fn (Callable): The function to apply to the modules which
                takes two parameters. The first parameter is the string path of
                the module (e.g. ``"model.layers.0.linear"``). The second
                parameter is the module object.

        Returns:
            The module instance after updating submodules.
        """
        # Explicit stack instead of recursion; the root has an empty path.
        pending = [("", self)]
        while pending:
            path, module = pending.pop()
            apply_fn(path, module)
            # Children are flattened under this module's path; the root keeps
            # an empty prefix.
            child_prefix = f".{path}" if path else ""
            pending.extend(
                tree_flatten(
                    module.children(), prefix=child_prefix, is_leaf=self.is_module
                )
            )
        return self

    def modules(self):
        """Collect every module of this instance (itself included).

        Returns:
            A list of :class:`mlx.nn.Module` instances.
        """
        collected = []

        def _collect(_, module):
            collected.append(module)

        self.apply_to_modules(_collect)
        return collected

    def named_modules(self):
        """Collect every module of this instance (itself included) together
        with its dot-separated path.

        Returns:
            A list of tuples (str, :class:`mlx.nn.Module`).
        """
        collected = []

        def _collect(path, module):
            collected.append((path, module))

        self.apply_to_modules(_collect)
        return collected

    def _validate_keys(self, keys, strict):
        keys = keys if isinstance(keys, list) else [keys]
        if strict:
            for k in keys:
                if k not in self:
                    raise KeyError(f"Module doesn't contain member {k}.")
        return keys

    def freeze(
        self,
        *,
        recurse: bool = True,
        keys: Optional[Union[str, List[str]]] = None,
        strict: bool = False,
    ) -> Module:
        """Freeze the Module's parameters or some of them. Freezing a parameter means not
        computing gradients for it.

        This function is idempotent i.e. freezing a frozen model is a no-op.

        Example:
            For instance to only train the attention parameters from a Transformer:

            .. code-block:: python

                model = nn.Transformer()
                model.freeze()
                model.apply_to_modules(lambda k, v: v.unfreeze() if k.endswith("attention") else None)

        Args:
            recurse (bool, optional): If True then freeze the parameters of the
                submodules as well. Default: ``True``.
            keys (str or list[str], optional): If provided then only these
                parameters will be frozen otherwise all the parameters of a
                module. For instance freeze all biases by calling
                ``module.freeze(keys="bias")``.
            strict (bool, optional): If set to ``True`` validate that the passed keys exist.
                Default: ``False``.

        Returns:
            The module instance after freezing the parameters.
        """

        def _freeze_impl(_, m):
            local_keys = keys
            if local_keys is None:
                local_keys = tree_flatten(
                    m.filter_and_map(
                        lambda m, k, v: (not isinstance(v, Module))
                        and m.valid_parameter_filter(m, k, v)
                    )
                )
                local_keys = [k for (k, v) in local_keys]

            local_keys = m._validate_keys(local_keys, strict)
            m._no_grad.update(local_keys)

        if recurse:
            self.apply_to_modules(_freeze_impl)
        else:
            _freeze_impl("", self)
        return self

    def unfreeze(
        self,
        *,
        recurse: bool = True,
        keys: Optional[Union[str, List[str]]] = None,
        strict: bool = False,
    ) -> Module:
        """Unfreeze the Module's parameters or some of them.

        This function is idempotent ie unfreezing a model that is not frozen is
        a noop.

        Example:

            For instance to only train the biases of a Transformer one can do:

            .. code-block:: python

                model = nn.Transformer()
                model.freeze()
                model.unfreeze(keys="bias")

        Args:
            recurse (bool, optional): If True then unfreeze the parameters of the
                submodules as well. Default: ``True``.
            keys (str or list[str], optional): If provided then only these
                parameters will be unfrozen otherwise all the parameters of a
                module. For instance unfreeze all biases by calling
                ``module.unfreeze(keys="bias")``.
            strict (bool, optional): If set to ``True`` validate that the passed keys exist.
                Default: ``False``.

        Returns:
            The module instance after unfreezing the parameters.
        """

        def _unfreeze_impl(_, m):
            if keys is None:
                m._no_grad.clear()

            else:
                local_keys = m._validate_keys(keys, strict)
                m._no_grad.difference_update(local_keys)

        if recurse:
            self.apply_to_modules(_unfreeze_impl)
        else:
            _unfreeze_impl("", self)
        return self

    def _set_training_mode(self, mode: bool) -> None:
        """Set the training flag of this module only (no recursion);
        :meth:`train` invokes it on every module."""
        self._training = mode

    def train(self, mode: bool = True) -> Module:
        """Set the model in or out of training mode.

        Training mode only applies to certain layers. For example
        :obj:`Dropout` applies a random mask in training mode, but is the
        identity in evaluation mode.

        Args:
            mode (bool): Indicate if the model should be in training or
                evaluation mode. Default: ``True``.
        Returns:
            The module instance after updating the training mode.
        """

        # Propagate the flag to this module and every nested module.
        self.apply_to_modules(lambda _, m: m._set_training_mode(mode))

        return self

    def eval(self) -> Module:
        """Set the model to evaluation mode.

        Equivalent to ``train(False)``. See :func:`train`.

        Returns:
            The module instance after updating the training mode.
        """
        return self.train(False)

    def set_dtype(
        self,
        dtype: mx.Dtype,
        predicate: Optional[Callable[[mx.Dtype], bool]] = lambda x: mx.issubdtype(
            x, mx.floating
        ),
    ):
        """Set the dtype of the module's parameters.

        Args:
            dtype (Dtype): The new dtype.
            predicate (typing.Callable, optional): A predicate to select
              parameters to cast. By default, only parameters of type
              :attr:`floating` will be updated to avoid casting integer
              parameters to the new dtype.
        """
        if predicate is None:
            predicate = lambda _: True

        self.apply(lambda x: x.astype(dtype) if predicate(x.dtype) else x)


def _update_modules(dst, modules, strict):
    if isinstance(modules, dict):
        for k in modules:
            if k in dst:
                current_value = dst[k]
                new_value = modules[k]
                if Module.is_module(current_value) and Module.is_module(new_value):
                    dst[k] = new_value
                elif isinstance(current_value, (dict, list)):
                    _update_modules(current_value, new_value, strict)
                elif strict and new_value != {}:
                    raise ValueError(
                        f"Received invalid type: {type(new_value).__name__}."
                    )
            elif strict:
                raise ValueError(f'Module does not have sub-module named "{k}".')
    elif isinstance(modules, list):
        for i in range(len(modules)):
            current_value = dst[i]
            new_value = modules[i]
            if Module.is_module(current_value) and Module.is_module(new_value):
                dst[i] = new_value
            elif isinstance(current_value, (dict, list)):
                _update_modules(current_value, new_value, strict)
            elif strict and new_value != {}:
                raise ValueError(f"Received invalid type: {type(new_value).__name__}.")
    elif strict:
        raise ValueError(f"Received invalid type: {type(modules).__name__}.")


def _unwrap(model, value_key, value, filter_fn, map_fn, is_leaf_fn):
    if is_leaf_fn(model, value_key, value):
        return map_fn(value)

    elif isinstance(value, Module):
        return {
            k: _unwrap(value, k, v, filter_fn, map_fn, is_leaf_fn)
            for k, v in value.items()
            if filter_fn(value, k, v)
        }

    elif isinstance(value, dict):
        nd = {}
        for k, v in value.items():
            tk = f"{value_key}.{k}"
            nd[k] = (
                _unwrap(model, tk, v, filter_fn, map_fn, is_leaf_fn)
                if filter_fn(model, tk, v)
                else {}
            )
        return nd

    elif isinstance(value, list):
        nl = []
        for i, vi in enumerate(value):
            tk = f"{value_key}.{i}"
            nl.append(
                _unwrap(model, tk, vi, filter_fn, map_fn, is_leaf_fn)
                if filter_fn(model, tk, vi)
                else {}
            )
        return nl

    raise RuntimeError("Unexpected leaf found while traversing the module")


================================================
FILE: python/mlx/nn/layers/containers.py
================================================
# Copyright © 2023 Apple Inc.

from mlx.nn.layers.base import Module


class Sequential(Module):
    """A container layer that chains callables, feeding the output of each
    one into the next.

    Both modules and plain callables may be passed. Functions that have
    learnable parameters should be implemented as ``nn.Module`` instances.

    Args:
        modules (tuple of Callables): The modules to call in order
    """

    def __init__(self, *modules):
        super().__init__()
        self.layers = [*modules]

    def __call__(self, x):
        out = x
        for layer in self.layers:
            out = layer(out)
        return out


================================================
FILE: python/mlx/nn/layers/convolution.py
================================================
# Copyright © 2023 Apple Inc.

import math
from typing import Union

import mlx.core as mx
from mlx.nn.layers.base import Module


class Conv1d(Module):
    """Applies a 1-dimensional convolution over the multi-channel input sequence.

    The channels are expected to be last i.e. the input shape should be ``NLC`` where:

    * ``N`` is the batch dimension
    * ``L`` is the sequence length
    * ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels
        out_channels (int): The number of output channels
        kernel_size (int): The size of the convolution filters
        stride (int, optional): The stride when applying the filter.
            Default: ``1``.
        padding (int, optional): How many positions to 0-pad the input with.
            Default: ``0``.
        dilation (int, optional): The dilation of the convolution.
            Default: ``1``.
        groups (int, optional): The number of groups for the convolution.
            Default: ``1``.
        bias (bool, optional): If ``True`` add a learnable bias to the output.
            Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
    ):
        super().__init__()

        if in_channels % groups != 0:
            raise ValueError(
                f"The number of input channels ({in_channels}) must be "
                f"divisible by the number of groups ({groups})"
            )

        # Weights are drawn uniformly from [-bound, bound].
        bound = math.sqrt(1 / (in_channels * kernel_size))
        weight_shape = (out_channels, kernel_size, in_channels // groups)
        self.weight = mx.random.uniform(low=-bound, high=bound, shape=weight_shape)
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.dilation = dilation
        self.stride = stride
        self.groups = groups

    def _extra_repr(self):
        # The channel counts and kernel size are read back from the weight's
        # shape, not from the constructor arguments.
        w_shape = self.weight.shape
        in_channels = w_shape[-1] * self.groups
        out_channels = w_shape[0]
        kernel_size = w_shape[1]
        has_bias = "bias" in self
        return (
            f"{in_channels}, {out_channels}, "
            f"kernel_size={kernel_size}, stride={self.stride}, "
            f"padding={self.padding}, dilation={self.dilation}, "
            f"groups={self.groups}, "
            f"bias={has_bias}"
        )

    def __call__(self, x):
        out = mx.conv1d(
            x, self.weight, self.stride, self.padding, self.dilation, self.groups
        )
        # The bias entry only exists when the layer was built with bias=True.
        if "bias" not in self:
            return out
        return out + self.bias


class Conv2d(Module):
    """Applies a 2-dimensional convolution over the multi-channel input image.

    The channels are expected to be last i.e. the input shape should be ``NHWC`` where:

    * ``N`` is the batch dimension
    * ``H`` is the input image height
    * ``W`` is the input image width
    * ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int or tuple): The size of the convolution filters.
        stride (int or tuple, optional): The size of the stride when
            applying the filter. Default: ``1``.
        padding (int or tuple, optional): How many positions to 0-pad
            the input with. Default: ``0``.
        dilation (int or tuple, optional): The dilation of the convolution.
            Default: ``1``.
        groups (int, optional): The number of groups for the convolution.
            Default: ``1``.
        bias (bool, optional): If ``True`` add a learnable bias to the
            output. Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple],
        stride: Union[int, tuple] = 1,
        padding: Union[int, tuple] = 0,
        dilation: Union[int, tuple] = 1,
        groups: int = 1,
        bias: bool = True,
    ):
        super().__init__()

        if in_channels % groups != 0:
            raise ValueError(
                f"The number of input channels ({in_channels}) must be "
                f"divisible by the number of groups ({groups})"
            )

        # Integer arguments are expanded to one value per spatial dimension;
        # ``dilation`` is stored exactly as given.
        kernel_size, stride, padding = [
            (v, v) if isinstance(v, int) else v
            for v in (kernel_size, stride, padding)
        ]

        # Weights are drawn uniformly from [-bound, bound].
        bound = math.sqrt(1 / (in_channels * kernel_size[0] * kernel_size[1]))
        weight_shape = (out_channels, *kernel_size, in_channels // groups)
        self.weight = mx.random.uniform(low=-bound, high=bound, shape=weight_shape)
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.groups = groups

    def _extra_repr(self):
        # The channel counts and kernel size are read back from the weight's
        # shape, not from the constructor arguments.
        w_shape = self.weight.shape
        in_channels = w_shape[-1] * self.groups
        out_channels = w_shape[0]
        kernel_size = w_shape[1:3]
        has_bias = "bias" in self
        return (
            f"{in_channels}, {out_channels}, "
            f"kernel_size={kernel_size}, stride={self.stride}, "
            f"padding={self.padding}, dilation={self.dilation}, "
            f"groups={self.groups}, "
            f"bias={has_bias}"
        )

    def __call__(self, x):
        out = mx.conv2d(
            x, self.weight, self.stride, self.padding, self.dilation, self.groups
        )
        # The bias entry only exists when the layer was built with bias=True.
        if "bias" not in self:
            return out
        return out + self.bias


class Conv3d(Module):
    """Applies a 3-dimensional convolution over the multi-channel input image.

    The channels are expected to be last i.e. the input shape should be ``NDHWC`` where:

    * ``N`` is the batch dimension
    * ``D`` is the input image depth
    * ``H`` is the input image height
    * ``W`` is the input image width
    * ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int or tuple): The size of the convolution filters.
        stride (int or tuple, optional): The size of the stride when
            applying the filter. Default: ``1``.
        padding (int or tuple, optional): How many positions to 0-pad
            the input with. Default: ``0``.
        dilation (int or tuple, optional): The dilation of the convolution.
            Default: ``1``.
        bias (bool, optional): If ``True`` add a learnable bias to the
            output. Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple],
        stride: Union[int, tuple] = 1,
        padding: Union[int, tuple] = 0,
        dilation: Union[int, tuple] = 1,
        bias: bool = True,
    ):
        super().__init__()

        # A single int applies to all three spatial axes.
        kernel_size, stride, padding = map(
            lambda x: (x, x, x) if isinstance(x, int) else x,
            (kernel_size, stride, padding),
        )
        # Uniform init in [-scale, scale] with
        # scale = sqrt(1 / (in_channels * kd * kh * kw)).
        scale = math.sqrt(
            1 / (in_channels * kernel_size[0] * kernel_size[1] * kernel_size[2])
        )
        self.weight = mx.random.uniform(
            low=-scale,
            high=scale,
            shape=(out_channels, *kernel_size, in_channels),
        )
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.stride = stride
        self.dilation = dilation

    def _extra_repr(self):
        # This layer has no ``groups`` attribute, so the input channel count
        # is exactly the last axis of the weight.
        return (
            f"{self.weight.shape[-1]}, {self.weight.shape[0]}, "
            f"kernel_size={self.weight.shape[1:4]}, stride={self.stride}, "
            f"padding={self.padding}, dilation={self.dilation}, "
            f"bias={'bias' in self}"
        )

    def __call__(self, x):
        y = mx.conv3d(x, self.weight, self.stride, self.padding, self.dilation)
        if "bias" in self:
            y = y + self.bias
        return y


================================================
FILE: python/mlx/nn/layers/convolution_transpose.py
================================================
# Copyright © 2023 Apple Inc.

import math
from typing import Union

import mlx.core as mx
from mlx.nn.layers.base import Module


class ConvTranspose1d(Module):
    """Applies a 1-dimensional transposed convolution over the multi-channel input sequence.

    The input is expected in channels-last layout, i.e. with shape ``NLC`` where:

    * ``N`` is the batch dimension
    * ``L`` is the sequence length
    * ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels
        out_channels (int): The number of output channels
        kernel_size (int): The size of the convolution filters
        stride (int, optional): The stride when applying the filter.
            Default: ``1``.
        padding (int, optional): How many positions to 0-pad the input with.
            Default: ``0``.
        dilation (int, optional): The dilation of the convolution.
        output_padding(int, optional): Additional size added to one side of the
            output shape. Default: ``0``.
        bias (bool, optional): If ``True`` add a learnable bias to the output.
            Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        output_padding: int = 0,
        bias: bool = True,
    ):
        super().__init__()

        # Uniform init in [-bound, bound] with
        # bound = sqrt(1 / (in_channels * kernel_size)).
        bound = math.sqrt(1 / (in_channels * kernel_size))
        weight_shape = (out_channels, kernel_size, in_channels)
        self.weight = mx.random.uniform(low=-bound, high=bound, shape=weight_shape)
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.dilation = dilation
        self.stride = stride
        self.output_padding = output_padding

    def _extra_repr(self):
        w_shape = self.weight.shape
        fields = [
            f"{w_shape[-1]}",
            f"{w_shape[0]}",
            f"kernel_size={w_shape[1]}",
            f"stride={self.stride}",
            f"padding={self.padding}",
            f"dilation={self.dilation}",
            f"output_padding={self.output_padding}",
            f"bias={'bias' in self}",
        ]
        return ", ".join(fields)

    def __call__(self, x):
        out = mx.conv_transpose1d(
            x,
            self.weight,
            self.stride,
            self.padding,
            self.dilation,
            self.output_padding,
        )
        return out + self.bias if "bias" in self else out


class ConvTranspose2d(Module):
    """Applies a 2-dimensional transposed convolution over the multi-channel input image.

    The channels are expected to be last i.e. the input shape should be ``NHWC`` where:

    * ``N`` is the batch dimension
    * ``H`` is the input image height
    * ``W`` is the input image width
    * ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int or tuple): The size of the convolution filters.
        stride (int or tuple, optional): The size of the stride when
            applying the filter. Default: ``1``.
        padding (int or tuple, optional): How many positions to 0-pad
            the input with. Default: ``0``.
        dilation (int or tuple, optional): The dilation of the convolution.
            Default: ``1``.
        output_padding(int or tuple, optional): Additional size added to one
            side of the output shape. Default: ``0``.
        bias (bool, optional): If ``True`` add a learnable bias to the
            output. Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple],
        stride: Union[int, tuple] = 1,
        padding: Union[int, tuple] = 0,
        dilation: Union[int, tuple] = 1,
        output_padding: Union[int, tuple] = 0,
        bias: bool = True,
    ):
        super().__init__()

        # A single int applies to both spatial axes.
        kernel_size, stride, padding, output_padding = map(
            lambda x: (x, x) if isinstance(x, int) else x,
            (kernel_size, stride, padding, output_padding),
        )
        # Uniform init in [-scale, scale] with
        # scale = sqrt(1 / (in_channels * kh * kw)).
        scale = math.sqrt(1 / (in_channels * kernel_size[0] * kernel_size[1]))
        self.weight = mx.random.uniform(
            low=-scale,
            high=scale,
            shape=(out_channels, *kernel_size, in_channels),
        )
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.output_padding = output_padding

    def _extra_repr(self):
        # The weight is (out_channels, kh, kw, in_channels), so the kernel
        # size occupies axes 1 and 2.
        return (
            f"{self.weight.shape[-1]}, {self.weight.shape[0]}, "
            f"kernel_size={self.weight.shape[1:3]}, stride={self.stride}, "
            f"padding={self.padding}, dilation={self.dilation}, "
            f"output_padding={self.output_padding}, "
            f"bias={'bias' in self}"
        )

    def __call__(self, x):
        y = mx.conv_transpose2d(
            x,
            self.weight,
            self.stride,
            self.padding,
            self.dilation,
            self.output_padding,
        )
        if "bias" in self:
            y = y + self.bias
        return y


class ConvTranspose3d(Module):
    """Applies a 3-dimensional transposed convolution over the multi-channel input image.

    The channels are expected to be last i.e. the input shape should be ``NDHWC`` where:

    * ``N`` is the batch dimension
    * ``D`` is the input image depth
    * ``H`` is the input image height
    * ``W`` is the input image width
    * ``C`` is the number of input channels

    Args:
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        kernel_size (int or tuple): The size of the convolution filters.
        stride (int or tuple, optional): The size of the stride when
            applying the filter. Default: ``1``.
        padding (int or tuple, optional): How many positions to 0-pad
            the input with. Default: ``0``.
        dilation (int or tuple, optional): The dilation of the convolution.
            Default: ``1``.
        output_padding(int or tuple, optional): Additional size added to one
            side of the output shape. Default: ``0``.
        bias (bool, optional): If ``True`` add a learnable bias to the
            output. Default: ``True``
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple],
        stride: Union[int, tuple] = 1,
        padding: Union[int, tuple] = 0,
        dilation: Union[int, tuple] = 1,
        output_padding: Union[int, tuple] = 0,
        bias: bool = True,
    ):
        super().__init__()

        # A single int applies to all three spatial axes.
        kernel_size, stride, padding, output_padding = map(
            lambda x: (x, x, x) if isinstance(x, int) else x,
            (kernel_size, stride, padding, output_padding),
        )
        # Uniform init in [-scale, scale] with
        # scale = sqrt(1 / (in_channels * kd * kh * kw)).
        scale = math.sqrt(
            1 / (in_channels * kernel_size[0] * kernel_size[1] * kernel_size[2])
        )
        self.weight = mx.random.uniform(
            low=-scale,
            high=scale,
            shape=(out_channels, *kernel_size, in_channels),
        )
        if bias:
            self.bias = mx.zeros((out_channels,))

        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.output_padding = output_padding

    def _extra_repr(self):
        # The weight is (out_channels, kd, kh, kw, in_channels), so the
        # kernel size occupies axes 1 through 3.
        return (
            f"{self.weight.shape[-1]}, {self.weight.shape[0]}, "
            f"kernel_size={self.weight.shape[1:4]}, stride={self.stride}, "
            f"padding={self.padding}, dilation={self.dilation}, "
            f"output_padding={self.output_padding}, "
            f"bias={'bias' in self}"
        )

    def __call__(self, x):
        y = mx.conv_transpose3d(
            x,
            self.weight,
            self.stride,
            self.padding,
            self.dilation,
            self.output_padding,
        )
        if "bias" in self:
            y = y + self.bias
        return y


================================================
FILE: python/mlx/nn/layers/distributed.py
================================================
# Copyright © 2024 Apple Inc.

import math
from functools import lru_cache
from typing import Callable, Optional, Union

import mlx.core as mx
from mlx.nn.layers.base import Module
from mlx.nn.layers.linear import Linear
from mlx.nn.layers.quantized import QuantizedLinear
from mlx.utils import tree_map_with_path


@lru_cache
def sum_gradients(group):
    """Return a function that passes its input through unchanged and whose
    gradient is summed across ``group``.

    The returned function is memoized per ``group`` by ``lru_cache``. When the
    group has a single member a plain identity lambda is returned and no
    communication is set up.

    Args:
        group: The distributed group over which gradients are summed.

    Returns:
        A callable taking and returning a single array.
    """
    if group.size() == 1:
        return lambda x: x

    # Forward pass: identity.
    @mx.custom_function
    def f(x):
        return x

    # Backward pass: the incoming gradient ``dx`` is replaced by its sum over
    # the group. The first and last arguments are unused here.
    @f.vjp
    def f(x, dx, _):
        return mx.distributed.all_sum(dx, group=group)

    return f


def _split(weight, segments, axis):
    """Split ``weight`` along ``axis`` like ``mx.split``, additionally
    accepting ``segments`` given as fractions of the axis length.

    An int, or a sequence whose first element is an int, is forwarded to
    ``mx.split`` untouched. Otherwise each entry is scaled by the axis length
    and truncated to an int split index.
    """
    is_fractional = not (isinstance(segments, int) or isinstance(segments[0], int))
    if is_fractional:
        extent = weight.shape[axis]
        cut_points = [int(fraction * extent) for fraction in segments]
        return mx.split(weight, cut_points, axis=axis)

    return mx.split(weight, segments, axis=axis)


def _shard(
    parameters: dict,
    sharding_predicate: Callable,
    group: Optional[mx.distributed.Group] = None,
):
    """Build a new parameter tree holding only this rank's shard of each
    weight selected by ``sharding_predicate``.

    For every array in ``parameters`` the predicate is called with the
    parameter path and the array. It may return ``None`` (leave the weight
    as is), an int (the axis to shard) or a tuple ``(axis, segments)``.

    Raises:
        ValueError: If the predicate returns anything else.
    """
    group = group or mx.distributed.init()
    world_size = group.size()
    rank = group.rank()

    def _shard_fn(path, weight):
        # Non-array leaves are passed through untouched.
        if not isinstance(weight, mx.array):
            return weight

        spec = sharding_predicate(path, weight)
        if spec is None:
            return weight

        if isinstance(spec, int):
            axis, segments = spec, 1
        elif isinstance(spec, tuple):
            axis, segments = spec
        else:
            raise ValueError(
                "The sharding function should return int or tuple[int, list]"
            )

        # Split into segments first, then take this rank's slice of each
        # segment and stitch the slices back together.
        local_parts = []
        for part in _split(weight, segments, axis):
            local_parts.append(_split(part, world_size, axis)[rank])
        return mx.contiguous(mx.concatenate(local_parts, axis=axis))

    return tree_map_with_path(_shard_fn, parameters)


def _all_to_sharded(segments):
    """Simple predicate to shard fully connected layers such that a common
    representation becomes a sharded representation."""

    def _shard_fn(path, weight):
        if path.endswith("bias"):
            return -1, segments
        return max(weight.ndim - 2, 0), segments

    return _shard_fn


def _sharded_to_all(segments):
    """Simple predicate to shard fully connected layers such that a sharded
    representation becomes a common representation."""

    def _shard_fn(path, weight):
        if path.endswith("bias"):
            return None
        return -1, segments

    return _shard_fn


def _check_sharding(sharding):
    if sharding not in ("all-to-sharded", "sharded-to-all"):
        raise ValueError(
            (
                f"Sharding type {sharding=} not supported, "
                "choose one of 'all-to-sharded' or 'sharded-to-all'"
            )
        )


def shard_inplace(
    module: Module,
    sharding: Union[str, Callable],
    *,
    segments: Union[int, list] = 1,
    group: Optional[mx.distributed.Group] = None,
):
    """Replace a module's parameters with their sharded counterparts,
    modifying the module in-place.

    ``sharding`` may be any callable that, given the parameter path and the
    weight, returns the sharding axis and optionally the segments that make
    up the unsharded weight. For example, a fused QKV matrix should use 3
    segments.

    .. note::
        Only the parameters change. For distributed communication to take
        place the module itself must natively support it and have it enabled.

    Args:
        module (mlx.nn.Module): The module whose parameters are sharded
            in-place.
        sharding (str or callable): Either "all-to-sharded" or
            "sharded-to-all", or a callable returning the sharding axis and
            segments.
        segments (int or list): The segments to use when ``sharding`` is a
            string. Default: ``1``.
        group (mlx.core.distributed.Group): The distributed group to shard
            across. If not set, the global group will be used. Default: ``None``.
    """
    if isinstance(sharding, str):
        _check_sharding(sharding)
        # Turn the named sharding type into the matching predicate.
        if sharding == "all-to-sharded":
            sharding = _all_to_sharded(segments)
        else:
            sharding = _sharded_to_all(segments)

    sharded_parameters = _shard(module.parameters(), sharding, group)
    module.update(sharded_parameters)


def shard_linear(
    module: Module,
    sharding: str,
    *,
    segments: Union[int, list] = 1,
    group: Optional[mx.distributed.Group] = None,
):
    """Build a sharded copy of a linear layer that also performs the
    distributed communication needed in the forward or backward pass.

    .. note::
        Unlike ``shard_inplace``, the layer passed in is left unchanged and a
        new layer is returned.

    Args:
        module (mlx.nn.Module): The linear layer to be sharded.
        sharding (str): One of "all-to-sharded" and
            "sharded-to-all" that defines the type of sharding to perform.
        segments (int or list): The segments to use. Default: ``1``.
        group (mlx.core.distributed.Group): The distributed group to shard
            across. If not set, the global group will be used. Default: ``None``.
    """
    _check_sharding(sharding)

    # Anything that is not an ``mlx.nn.Linear`` is routed to the quantized
    # implementations.
    is_plain_linear = isinstance(module, Linear)
    if sharding == "all-to-sharded":
        if is_plain_linear:
            build = AllToShardedLinear.from_linear
        else:
            build = QuantizedAllToShardedLinear.from_quantized_linear
    else:
        if is_plain_linear:
            build = ShardedToAllLinear.from_linear
        else:
            build = QuantizedShardedToAllLinear.from_quantized_linear

    return build(module, segments=segments, group=group)


class AllToShardedLinear(Module):
    """A linear layer whose output features are split across the members of
    a distributed group, so that each member computes its own slice of the
    affine transformation.

    The gradients are automatically aggregated from each member of the group.

    Args:
        input_dims (int): The dimensionality of the input features
        output_dims (int): The dimensionality of the output features
        bias (bool, optional): If set to ``False`` then the layer will not
            use a bias. Default is ``True``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    """

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        bias: bool = True,
        group: Optional[mx.distributed.Group] = None,
    ):
        super().__init__()

        # Parameters are drawn uniformly from [-bound, bound].
        bound = math.sqrt(1.0 / input_dims)
        self.group = group or mx.distributed.init()
        world_size = self.group.size()

        if (output_dims % world_size) != 0:
            raise ValueError(
                f"Cannot shard the output of size {output_dims} across {world_size} devices."
            )

        # Each member only holds its own slice of the output features.
        local_out = output_dims // world_size
        self.weight = mx.random.uniform(
            low=-bound, high=bound, shape=(local_out, input_dims)
        )
        if bias:
            self.bias = mx.random.uniform(low=-bound, high=bound, shape=(local_out,))

    def _extra_repr(self) -> str:
        local_out, in_dims = self.weight.shape
        # Report the full output size, not the per-member slice.
        out_dims = local_out * self.group.size()
        return f"input_dims={in_dims}, output_dims={out_dims}, bias={'bias' in self}"

    def __call__(self, x: mx.array) -> mx.array:
        # Aggregate the gradients coming from each shard
        x = sum_gradients(self.group)(x)

        # Compute the affine projection
        weight_t = self["weight"].T
        if "bias" not in self:
            return x @ weight_t
        return mx.addmm(self["bias"], x, weight_t)

    @classmethod
    def from_linear(
        cls,
        linear_layer: Module,
        *,
        segments: Union[int, list] = 1,
        group: Optional[mx.distributed.Group] = None,
    ):
        """Create a sharded layer from ``linear_layer``, taking this member's
        shard of its parameters."""
        group = group or mx.distributed.init()
        output_dims, input_dims = linear_layer.weight.shape
        has_bias = hasattr(linear_layer, "bias")

        sharded_layer = cls(input_dims, output_dims, has_bias, group)
        local_parameters = _shard(
            linear_layer.parameters(), _all_to_sharded(segments), group
        )
        sharded_layer.update(local_parameters)

        return sharded_layer


class ShardedToAllLinear(Module):
    """A linear layer whose input features are split across the members of a
    distributed group. Each member applies its part of the affine
    transformation and the partial results are then aggregated.

    All nodes will have the same exact result after this layer.

    :class:`ShardedToAllLinear` provides a classmethod :meth:`from_linear` to
    convert linear layers to sharded :obj:`ShardedToAllLinear` layers.

    Args:
        input_dims (int): The dimensionality of the input features
        output_dims (int): The dimensionality of the output features
        bias (bool, optional): If set to ``False`` then the layer will not
            use a bias. Default is ``True``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    """

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        bias: bool = True,
        group: Optional[mx.distributed.Group] = None,
    ):
        super().__init__()

        # Parameters are drawn uniformly from [-bound, bound].
        bound = math.sqrt(1.0 / input_dims)
        self.group = group or mx.distributed.init()
        world_size = self.group.size()

        if (input_dims % world_size) != 0:
            raise ValueError(
                f"The input of size {input_dims} cannot be sharded across {world_size} devices."
            )

        # Each member only holds its own slice of the input features; the
        # bias is kept at full size.
        local_in = input_dims // world_size
        self.weight = mx.random.uniform(
            low=-bound, high=bound, shape=(output_dims, local_in)
        )
        if bias:
            self.bias = mx.random.uniform(
                low=-bound, high=bound, shape=(output_dims,)
            )

    def _extra_repr(self) -> str:
        world_size = self.group.size()
        out_dims, local_in = self.weight.shape
        # Report the full input size, not the per-member slice.
        in_dims = local_in * world_size
        return f"input_dims={in_dims}, output_dims={out_dims}, bias={'bias' in self}"

    def __call__(self, x: mx.array) -> mx.array:
        # Partial product from this member's slice of the input features.
        partial = x @ self["weight"].T

        # Sum the partial products over the group.
        total = mx.distributed.all_sum(partial, group=self.group)

        # The bias is added once, after the sum.
        return total + self["bias"] if "bias" in self else total

    @classmethod
    def from_linear(
        cls,
        linear_layer: Module,
        *,
        segments: Union[int, list] = 1,
        group: Optional[mx.distributed.Group] = None,
    ):
        """Create a sharded layer from ``linear_layer``, taking this member's
        shard of its parameters."""
        group = group or mx.distributed.init()
        output_dims, input_dims = linear_layer.weight.shape
        has_bias = hasattr(linear_layer, "bias")

        sharded_layer = cls(input_dims, output_dims, has_bias, group)
        local_parameters = _shard(
            linear_layer.parameters(), _sharded_to_all(segments), group
        )
        sharded_layer.update(local_parameters)

        return sharded_layer

class QuantizedAllToShardedLinear(Module):
    """A quantized linear layer whose output features are split across the
    members of a distributed group, so that each member computes its own
    slice of the affine transformation with a quantized matrix.

    It is the quantized equivalent of :class:`mlx.nn.AllToShardedLinear`.
    Similar to :class:`mlx.nn.QuantizedLinear` its parameters are frozen and
    will not be included in any gradient computation.

    Args:
        input_dims (int): The dimensionality of the input features.
        output_dims (int): The dimensionality of the output features.
        bias (bool, optional): If set to ``False`` then the layer will not use
            a bias. Default: ``True``.
        group_size (int, optional): The group size to use for the quantized
            weight. See :func:`~mlx.core.quantize`. Default: ``64``.
        bits (int, optional): The bit width to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``4``.
        mode (str, optional): The quantization method to use (see
            :func:`~mlx.core.quantize`). Default: ``"affine"``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    """

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        bias: bool = True,
        group_size: int = 64,
        bits: int = 4,
        mode: str = "affine",
        group: Optional[mx.distributed.Group] = None,
    ):
        super().__init__()

        # Quantization config
        self.group_size = group_size
        self.bits = bits
        self.mode = mode

        # The full precision weight is drawn uniformly from [-bound, bound]
        # and quantized right away.
        bound = math.sqrt(1.0 / input_dims)
        self.group = group or mx.distributed.init()
        world_size = self.group.size()

        if (output_dims % world_size) != 0:
            raise ValueError(
                f"Cannot shard the output of size {output_dims} across {world_size} devices."
            )

        local_out = output_dims // world_size
        dense_weight = mx.random.uniform(
            low=-bound, high=bound, shape=(local_out, input_dims)
        )
        quantized = mx.quantize(dense_weight, group_size, bits, mode=mode)
        # ``mx.quantize`` may or may not return quantization biases after
        # the weight and scales; store ``None`` when it does not.
        self.weight, self.scales, *extra = quantized
        self.biases = extra[0] if extra else None

        # And bias if needed
        if bias:
            self.bias = mx.zeros((local_out,))

        # Freeze this model's parameters
        self.freeze()

    def unfreeze(self, *args, **kwargs):
        """Unfreeze any layers this module might contain while keeping its
        own parameters frozen."""
        super().unfreeze(*args, **kwargs)
        self.freeze(recurse=False)

    def _extra_repr(self) -> str:
        local_out, stored_in = self.weight.shape
        # Recover the unquantized input size from the stored weight width.
        in_dims = (stored_in * 32) // self.bits
        # Report the full output size, not the per-member slice.
        out_dims = local_out * self.group.size()
        return (
            f"input_dims={in_dims}, output_dims={out_dims}, bias={'bias' in self}, "
            f"group_size={self.group_size}, bits={self.bits}, mode={self.mode}"
        )

    def __call__(self, x: mx.array) -> mx.array:
        # Aggregate the gradients coming from each shard
        x = sum_gradients(self.group)(x)

        out = mx.quantized_matmul(
            x,
            self["weight"],
            scales=self["scales"],
            biases=self.get("biases"),
            transpose=True,
            group_size=self.group_size,
            bits=self.bits,
            mode=self.mode,
        )
        return out + self["bias"] if "bias" in self else out

    @classmethod
    def from_quantized_linear(
        cls,
        quantized_linear_layer: Module,
        *,
        segments: Union[int, list] = 1,
        group: Optional[mx.distributed.Group] = None,
    ):
        """Create a sharded layer from ``quantized_linear_layer``, taking
        this member's shard of its parameters."""
        group = group or mx.distributed.init()
        output_dims, stored_in = quantized_linear_layer.weight.shape
        input_dims = (stored_in * 32) // quantized_linear_layer.bits
        has_bias = hasattr(quantized_linear_layer, "bias")

        sharded_layer = cls(
            input_dims,
            output_dims,
            has_bias,
            group_size=quantized_linear_layer.group_size,
            bits=quantized_linear_layer.bits,
            # Layers without a ``mode`` attribute are treated as "affine".
            mode=getattr(quantized_linear_layer, "mode", "affine"),
            group=group,
        )
        local_parameters = _shard(
            quantized_linear_layer.parameters(),
            _all_to_sharded(segments),
            group,
        )
        sharded_layer.update(local_parameters)

        return sharded_layer


class QuantizedShardedToAllLinear(Module):
    """A quantized linear layer whose input features are split across the
    members of a distributed group. Each member applies its part of the
    affine transformation using the quantized matrix and the partial results
    are then aggregated.

    All nodes will have the same exact result after this layer.

    It is the quantized equivalent of :class:`mlx.nn.ShardedToAllLinear`.
    Similar to :class:`mlx.nn.QuantizedLinear` its parameters are frozen and
    will not be included in any gradient computation.

    Args:
        input_dims (int): The dimensionality of the input features.
        output_dims (int): The dimensionality of the output features.
        bias (bool, optional): If set to ``False`` then the layer will not use
            a bias. Default: ``True``.
        group_size (int, optional): The group size to use for the quantized
            weight. See :func:`~mlx.core.quantize`. Default: ``64``.
        bits (int, optional): The bit width to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``4``.
        mode (str, optional): The quantization method to use (see
            :func:`~mlx.core.quantize`). Default: ``"affine"``.
        group (mx.distributed.Group, optional): The sharding will happen across
            this group. If not set then the global group is used. Default is
            ``None``.
    """

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        bias: bool = True,
        group_size: int = 64,
        bits: int = 4,
        mode: str = "affine",
        group: Optional[mx.distributed.Group] = None,
    ):
        super().__init__()

        # Quantization config
        self.group_size = group_size
        self.bits = bits
        self.mode = mode

        # The full precision weight is drawn uniformly from [-bound, bound]
        # and quantized right away.
        bound = math.sqrt(1.0 / input_dims)
        self.group = group or mx.distributed.init()
        world_size = self.group.size()

        if (input_dims % world_size) != 0:
            raise ValueError(
                f"The input of size {input_dims} cannot be sharded across {world_size} devices."
            )

        local_in = input_dims // world_size
        dense_weight = mx.random.uniform(
            low=-bound, high=bound, shape=(output_dims, local_in)
        )
        quantized = mx.quantize(dense_weight, group_size, bits, mode=mode)
        # ``mx.quantize`` may or may not return quantization biases after
        # the weight and scales; store ``None`` when it does not.
        self.weight, self.scales, *extra = quantized
        self.biases = extra[0] if extra else None

        # And bias if needed
        if bias:
            self.bias = mx.zeros((output_dims,))

        # Freeze this model's parameters
        self.freeze()

    def unfreeze(self, *args, **kwargs):
        """Unfreeze any layers this module might contain while keeping its
        own parameters frozen."""
        super().unfreeze(*args, **kwargs)
        self.freeze(recurse=False)

    def _extra_repr(self) -> str:
        out_dims, stored_in = self.weight.shape
        # Recover this member's unquantized input size from the stored
        # weight width, then scale it up to the full input size.
        local_in = (stored_in * 32) // self.bits
        in_dims = local_in * self.group.size()
        return (
            f"input_dims={in_dims}, output_dims={out_dims}, bias={'bias' in self}, "
            f"group_size={self.group_size}, bits={self.bits}, mode={self.mode}"
        )

    def __call__(self, x: mx.array) -> mx.array:
        # Partial product from this member's slice of the input features.
        partial = mx.quantized_matmul(
            x,
            self["weight"],
            scales=self["scales"],
            biases=self.get("biases"),
            transpose=True,
            group_size=self.group_size,
            bits=self.bits,
            mode=self.mode,
        )
        # Sum the partial products over the group; the bias is added once,
        # after the sum.
        total = mx.distributed.all_sum(partial, group=self.group)
        return total + self["bias"] if "bias" in self else total

    @classmethod
    def from_quantized_linear(
        cls,
        quantized_linear_layer: Module,
        *,
        segments: Union[int, list] = 1,
        group: Optional[mx.distributed.Group] = None,
    ):
        """Create a sharded layer from ``quantized_linear_layer``, taking
        this member's shard of its parameters."""
        group = group or mx.distributed.init()
        output_dims, stored_in = quantized_linear_layer.weight.shape
        input_dims = (stored_in * 32) // quantized_linear_layer.bits
        has_bias = hasattr(quantized_linear_layer, "bias")

        sharded_layer = cls(
            input_dims,
            output_dims,
            has_bias,
            group_size=quantized_linear_layer.group_size,
            bits=quantized_linear_layer.bits,
            # Layers without a ``mode`` attribute are treated as "affine".
            mode=getattr(quantized_linear_layer, "mode", "affine"),
            group=group,
        )
        local_parameters = _shard(
            quantized_linear_layer.parameters(),
            _sharded_to_all(segments),
            group,
        )
        sharded_layer.update(local_parameters)

        return sharded_layer


================================================
FILE: python/mlx/nn/layers/dropout.py
================================================
# Copyright © 2023 Apple Inc.

import mlx.core as mx
from mlx.nn.layers.base import Module


class Dropout(Module):
    r"""Randomly zero a portion of the elements during training.

    Elements that are kept are scaled by :math:`\frac{1}{1-p}`, with
    :math:`p` the probability of zeroing an element, so that the expected
    value of each element is unchanged.

    Args:
        p (float): The probability to zero an element
    """

    def __init__(self, p: float = 0.5):
        super().__init__()

        if p < 0 or p >= 1:
            raise ValueError(f"The dropout probability {p} is not in [0, 1)")

        # Stored as the keep probability ``1 - p``.
        self._p_1 = 1 - p

    def _extra_repr(self) -> str:
        drop_prob = 1 - self._p_1
        return f"p={drop_prob}"

    def __call__(self, x: mx.array) -> mx.array:
        keep_prob = self._p_1
        # Nothing to drop when p == 0 or when the module is not training.
        if keep_prob == 1 or not self.training:
            return x

        keep_mask = mx.random.bernoulli(keep_prob, x.shape)

        return (keep_mask * x) * (1 / keep_prob)


class Dropout2d(Module):
    r"""Channel-wise dropout for inputs with two spatial axes.

    During training, whole channels are zeroed independently with probability
    :math:`p`. The layer expects channels last, i.e. an input of shape
    ``NWHC`` or ``WHC`` where ``N`` is the batch dimension, ``H`` is the input
    image height, ``W`` is the input image width, and ``C`` is the number of
    input channels. Both spatial axes are treated identically.

    Channels that are kept are scaled by :math:`\frac{1}{1-p}` so the expected
    value of each element is preserved. In contrast to ordinary dropout, which
    zeros individual entries, an entire channel is dropped at once. This is
    beneficial for early convolution layers where adjacent pixels are
    correlated and element-wise dropout may not regularize the activations
    effectively. For more details, see [1].

    [1]: Thompson, J., Goroshin, R., Jain, A., LeCun, Y. and Bregler C., 2015.
    Efficient Object Localization Using Convolutional Networks. CVPR 2015.

    Args:
        p (float): Probability of zeroing a channel during training. Must lie
            in ``[0, 1)``. Default: ``0.5``.
    """

    def __init__(self, p: float = 0.5):
        super().__init__()

        if p >= 1 or p < 0:
            raise ValueError(f"The dropout probability {p} is not in [0, 1)")

        # Store the keep probability rather than the drop probability.
        self._p_1 = 1 - p

    def _extra_repr(self) -> str:
        drop_prob = 1 - self._p_1
        return f"p={drop_prob}"

    def __call__(self, x: mx.array) -> mx.array:
        if x.ndim != 3 and x.ndim != 4:
            raise ValueError(
                f"Received input with {x.ndim} dimensions. Expected 3 or 4 dimensions."
            )

        keep_prob = self._p_1
        if keep_prob == 1 or not self.training:
            return x

        # One Bernoulli draw per channel, broadcast over both spatial axes:
        #   3D input -> (1, 1, C)
        #   4D input -> (B, 1, 1, C)
        mask_shape = [*x.shape[:-3], 1, 1, x.shape[-1]]
        keep = mx.random.bernoulli(p=keep_prob, shape=mask_shape)

        return (keep * x) * (1 / keep_prob)


class Dropout3d(Module):
    r"""Channel-wise dropout for inputs with three spatial axes.

    During training, whole channels are zeroed independently with probability
    :math:`p`. The layer expects channels last, i.e. an input of shape
    ``NDHWC`` or ``DHWC`` where ``N`` is the batch dimension, ``D`` is the
    depth, ``H`` is the input image height, ``W`` is the input image width,
    and ``C`` is the number of input channels.

    Channels that are kept are scaled by :math:`\frac{1}{1-p}` so the expected
    value of each element is preserved. In contrast to ordinary dropout, which
    zeros individual entries, an entire channel is dropped at once. This is
    often beneficial for convolutional layers processing 3D data, like in
    medical imaging or video processing.

    Args:
        p (float): Probability of zeroing a channel during training. Must lie
            in ``[0, 1)``. Default: ``0.5``.
    """

    def __init__(self, p: float = 0.5):
        super().__init__()

        if p >= 1 or p < 0:
            raise ValueError(f"The dropout probability {p} is not in [0, 1)")

        # Store the keep probability rather than the drop probability.
        self._p_1 = 1 - p

    def _extra_repr(self) -> str:
        drop_prob = 1 - self._p_1
        return f"p={drop_prob}"

    def __call__(self, x: mx.array) -> mx.array:
        if x.ndim != 4 and x.ndim != 5:
            raise ValueError(
                f"Received input with {x.ndim} dimensions. Expected 4 or 5 dimensions."
            )

        keep_prob = self._p_1
        if keep_prob == 1 or not self.training:
            return x

        # One Bernoulli draw per channel, broadcast over all spatial axes:
        #   4D input -> (1, 1, 1, C)
        #   5D input -> (B, 1, 1, 1, C)
        mask_shape = [*x.shape[:-4], 1, 1, 1, x.shape[-1]]
        keep = mx.random.bernoulli(p=keep_prob, shape=mask_shape)

        return (keep * x) * (1 / keep_prob)


================================================
FILE: python/mlx/nn/layers/embedding.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
from typing import Optional

import mlx.core as mx
from mlx.nn.layers.base import Module
from mlx.nn.layers.quantized import QuantizedEmbedding


class Embedding(Module):
    """A lookup table mapping integer indices to dense vectors.

    Commonly used to embed discrete tokens before they are processed by a
    neural network. The table is initialized from a normal distribution with
    scale ``sqrt(1 / dims)``.

    Args:
        num_embeddings (int): The number of distinct tokens that can be
           embedded. Usually called the vocabulary size.
        dims (int): The dimensionality of each embedding vector.
    """

    def __init__(self, num_embeddings: int, dims: int):
        super().__init__()
        table_shape = (num_embeddings, dims)
        self.weight = mx.random.normal(shape=table_shape, scale=math.sqrt(1 / dims))

    def _extra_repr(self):
        table_shape = self.weight.shape
        return f"{table_shape[0]}, {table_shape[1]}"

    def __call__(self, x):
        # Indexing the table with ``x`` gathers one row per index.
        return self.weight[x]

    def as_linear(self, x):
        """
        Apply the embedding table as the weight of a linear projection.

        Useful, for example, when the input embedding and the output
        projection share (tie) their weights.
        """
        projection = self.weight.T
        return x @ projection

    def to_quantized(
        self,
        group_size: Optional[int] = None,
        bits: Optional[int] = None,
        mode: str = "affine",
        quantize_input: bool = False,
    ):
        """Return a :obj:`QuantizedEmbedding` layer that approximates this embedding layer.

        Raises:
            ValueError: If ``quantize_input`` is truthy, since quantized
                inputs are not supported by this layer.
        """
        if not quantize_input:
            return QuantizedEmbedding.from_embedding(self, group_size, bits, mode)
        raise ValueError("Quantized input is not supported.")


================================================
FILE: python/mlx/nn/layers/linear.py
================================================
# Copyright © 2023 Apple Inc.

import math
from typing import Any, Optional

import mlx.core as mx
from mlx.nn.layers.base import Module
from mlx.nn.layers.quantized import QQLinear, QuantizedLinear


class Identity(Module):
    r"""A no-op layer that returns its input unchanged.

    The constructor accepts and ignores arbitrary arguments so the layer can
    stand in for another layer without changing the call site.

    Args:
        args: any argument (unused)
        kwargs: any keyword argument (unused)
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # The arguments are intentionally discarded.
        super().__init__()

    def __call__(self, x: mx.array) -> mx.array:
        # Return the very same object that was passed in.
        passthrough = x
        return passthrough


class Linear(Module):
    r"""Applies an affine transformation to the input.

    Concretely:

    .. math::

        y = x W^\top + b

    where :math:`W` has shape ``[output_dims, input_dims]`` and :math:`b` has
    shape ``[output_dims]``.

    The values are initialized from the uniform distribution :math:`\mathcal{U}(-{k}, {k})`,
    where :math:`k = \frac{1}{\sqrt{D_i}}` and :math:`D_i` is equal to ``input_dims``.

    Args:
        input_dims (int): The dimensionality of the input features
        output_dims (int): The dimensionality of the output features
        bias (bool, optional): If set to ``False`` then the layer will
          not use a bias. Default is ``True``.
    """

    def __init__(self, input_dims: int, output_dims: int, bias: bool = True) -> None:
        super().__init__()
        bound = math.sqrt(1.0 / input_dims)

        # The weight is sampled before the bias.
        self.weight = mx.random.uniform(
            low=-bound, high=bound, shape=(output_dims, input_dims)
        )
        if bias:
            self.bias = mx.random.uniform(
                low=-bound, high=bound, shape=(output_dims,)
            )

    def _extra_repr(self) -> str:
        weight_shape = self.weight.shape
        has_bias = "bias" in self
        return f"input_dims={weight_shape[1]}, output_dims={weight_shape[0]}, bias={has_bias}"

    def __call__(self, x: mx.array) -> mx.array:
        weight_t = self["weight"].T
        if "bias" not in self:
            return x @ weight_t
        # Bias addition and matrix product go through a single ``addmm`` call.
        return mx.addmm(self["bias"], x, weight_t)

    def to_quantized(
        self,
        group_size: Optional[int] = None,
        bits: Optional[int] = None,
        mode: str = "affine",
        quantize_input: bool = False,
    ):
        """Return a quantized approximation of this layer.

        With ``quantize_input=False`` the result is a :obj:`QuantizedLinear`
        (only the weights are quantized). With ``quantize_input=True`` the
        result is a :obj:`QQLinear` (weights and activations are quantized).

        Args:
            group_size (Optional[int]): The quantization group size (see
                :func:`mlx.core.quantize`). Default: ``None``.
            bits (Optional[int]): The number of bits per parameter (see
                :func:`mlx.core.quantize`). Default: ``None``.
            mode (str): The quantization method to use (see
                :func:`mlx.core.quantize`). Default: ``"affine"``.
            quantize_input (bool): Whether to quantize input. Default: ``False``.

        Returns:
            QuantizedLinear or QQLinear: A quantized version of this layer.

        Raises:
            ValueError: If ``quantize_input`` is set and ``mode`` is neither
                ``"nvfp4"`` nor ``"mxfp8"``.

        Notes:
            Quantized input is only supported for ``"nvfp4"`` and ``"mxfp8"``
            modes.
        """
        if not quantize_input:
            return QuantizedLinear.from_linear(self, group_size, bits, mode)

        if mode not in ("nvfp4", "mxfp8"):
            raise ValueError(
                f"Quantized activations are only supported for 'nvfp4' and 'mxfp8' modes, got {mode}."
            )
        return QQLinear.from_linear(self, group_size, bits, mode)


class Bilinear(Module):
    r"""Applies a bilinear transformation to the inputs.

    Concretely:

    .. math::

        y_i = x_1^\top W_i x_2 + b_i

    where :math:`i` indexes the output dimension and :math:`b` has shape
    ``[output_dims]``. The ``weight`` parameter is stored with shape
    ``[output_dims, input2_dims, input1_dims]``, so ``weight[i]`` holds
    :math:`W_i^\top`.

    The values are initialized from the uniform distribution :math:`\mathcal{U}(-{k}, {k})`,
    where :math:`k = \frac{1}{\sqrt{D_1}}` and :math:`D_1` is ``input1_dims``.

    Args:
        input1_dims (int): The dimensionality of the input1 features
        input2_dims (int): The dimensionality of the input2 features
        output_dims (int): The dimensionality of the output features
        bias (bool, optional): If set to ``False`` then the layer will
          not use a bias. Default is ``True``.
    """

    def __init__(
        self, input1_dims: int, input2_dims: int, output_dims: int, bias: bool = True
    ) -> None:
        super().__init__()
        bound = math.sqrt(1.0 / input1_dims)

        # The weight is sampled before the bias.
        self.weight = mx.random.uniform(
            low=-bound, high=bound, shape=(output_dims, input2_dims, input1_dims)
        )
        if bias:
            self.bias = mx.random.uniform(
                low=-bound, high=bound, shape=(output_dims,)
            )

    def _extra_repr(self) -> str:
        out, in2, in1 = self.weight.shape
        has_bias = "bias" in self
        return f"input1_dims={in1}, input2_dims={in2}, output_dims={out}, bias={has_bias}"

    def __call__(self, x1: mx.array, x2: mx.array) -> mx.array:
        out, in2, in1 = self.weight.shape

        # Remember the leading axes of x1, then flatten both inputs to a
        # single batch axis.
        leading_shape = x1.shape[:-1]
        left = x1.reshape(-1, in1)
        right = x2.reshape(-1, 1, in2)

        # Contract x1 against the last weight axis: (batch, out * in2).
        flat_weight = self.weight.reshape(out * in2, in1)
        partial = left @ flat_weight.T
        # Rearrange to (batch, in2, out) so x2 can contract the in2 axis.
        partial = partial.reshape(-1, out, in2).swapaxes(-2, -1)
        y = (right @ partial).squeeze(1)

        # Restore the leading axes.
        y = y.reshape(*leading_shape, out)

        if "bias" in self:
            y = y + self.bias

        return y


================================================
FILE: python/mlx/nn/layers/normalization.py
================================================
# Copyright © 2023 Apple Inc.

from typing import Tuple

import mlx.core as mx
from mlx.nn.layers.base import Module


class InstanceNorm(Module):
    r"""Applies instance normalization [1] on the inputs.

    Computes

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta,

    where the statistics are taken over every axis except the first and the
    last one. :math:`\gamma` and :math:`\beta` are learned per feature
    dimension parameters initialized at 1 and 0 respectively. Both are of
    size :attr:`dims` and only exist if :attr:`affine` is ``True``.

    Args:
        dims (int): The number of features of the input.
        eps (float): A value added to the denominator for numerical stability. Default: ``1e-5``.
        affine (bool): Whether to learn a scale and shift. Default: ``False``.

    Shape:
      - Input: :math:`(..., C)` where :math:`C` is equal to :attr:`dims`.
      - Output: Same shape as the input.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn as nn
        >>> x = mx.random.normal((8, 4, 4, 16))
        >>> inorm = nn.InstanceNorm(dims=16)
        >>> output = inorm(x)

    References:
        [1]: https://arxiv.org/abs/1607.08022
    """

    def __init__(
        self,
        dims: int,
        eps: float = 1e-5,
        affine: bool = False,
    ):
        super().__init__()
        self.dims = dims
        self.eps = eps
        if affine:
            self.weight = mx.ones((dims,))
            self.bias = mx.zeros((dims,))

    def _extra_repr(self):
        affine = "weight" in self
        return f"{self.dims}, eps={self.eps}, affine={affine}"

    def __call__(self, x: mx.array) -> mx.array:
        # All axes strictly between the first and the last one.
        inner_axes = tuple(range(1, x.ndim - 1))

        mean = mx.mean(x, axis=inner_axes, keepdims=True)
        var = mx.var(x, axis=inner_axes, keepdims=True)
        normalized = (x - mean) * mx.rsqrt(var + self.eps)

        if "weight" not in self:
            return normalized
        return self.weight * normalized + self.bias


class LayerNorm(Module):
    r"""Applies layer normalization [1] on the inputs.

    Computes

    .. math::

        y = \frac{x - E[x]}{\sqrt{Var[x]} + \epsilon} \gamma + \beta,

    where :math:`\gamma` and :math:`\beta` are learned per feature dimension
    parameters initialized at 1 and 0 respectively.

    The computation is delegated to ``mx.fast.layer_norm``; the exact
    placement of :math:`\epsilon` follows that function (NOTE(review):
    confirm against its documentation).

    [1]: https://arxiv.org/abs/1607.06450

    Args:
        dims (int): The feature dimension of the input to normalize over
        eps (float): A small additive constant for numerical stability
        affine (bool): If True learn an affine transform to apply after the
            normalization
        bias (bool): If True include a translation to the affine
            transformation. If set to False the transformation is not really affine
            just scaling. Ignored when ``affine`` is False.
    """

    def __init__(
        self, dims: int, eps: float = 1e-5, affine: bool = True, bias: bool = True
    ):
        super().__init__()
        self.dims = dims
        self.eps = eps

        if not affine:
            return

        self.weight = mx.ones((dims,))
        if bias:
            self.bias = mx.zeros((dims,))

    def _extra_repr(self):
        affine = "weight" in self
        return f"{self.dims}, eps={self.eps}, affine={affine}"

    def __call__(self, x):
        # Missing parameters are passed on as ``None``.
        weight = None
        if "weight" in self:
            weight = self.weight

        bias = None
        if "bias" in self:
            bias = self.bias

        return mx.fast.layer_norm(x, weight, bias, self.eps)


class RMSNorm(Module):
    r"""Applies Root Mean Square normalization [1] to the inputs.

    Computes

    ..  math::

        y = \frac{x}{\sqrt{E[x^2] + \epsilon}} \gamma

    where :math:`\gamma` is a learned per feature dimension parameter
    initialized at 1.

    Note the accumulation for the mean is done in 32-bit precision.

    [1]: https://arxiv.org/abs/1910.07467

    Args:
        dims (int): The feature dimension of the input to normalize over
        eps (float): A small additive constant for numerical stability
    """

    def __init__(self, dims: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = mx.ones((dims,))

    def _extra_repr(self):
        feature_dims = self.weight.shape[0]
        return f"{feature_dims}, eps={self.eps}"

    def __call__(self, x):
        # The whole computation is delegated to the fused implementation.
        scale = self["weight"]
        return mx.fast.rms_norm(x, scale, self.eps)


class GroupNorm(Module):
    r"""Applies Group Normalization [1] to the inputs.

    Computes the same kind of normalization as layer norm, namely

    .. math::

        y = \frac{x - E[x]}{\sqrt{Var[x] + \epsilon}} \gamma + \beta,

    where :math:`\gamma` and :math:`\beta` are learned per feature dimension
    parameters initialized at 1 and 0 respectively. However, the mean and
    variance are computed over the spatial dimensions and each group of
    features. In particular, the input is split into num_groups across the
    feature dimension.

    The feature dimension is assumed to be the last dimension and the dimensions
    that precede it (except the first) are considered the spatial dimensions.

    The default path normalizes with ``rsqrt(var + eps)``. The PyTorch
    compatible path delegates to ``mx.fast.layer_norm``.

    [1]: https://arxiv.org/abs/1803.08494

    Args:
        num_groups (int): Number of groups to separate the features into
        dims (int): The feature dimensions of the input to normalize over
        eps (float): A small additive constant for numerical stability
        affine (bool): If True learn an affine transform to apply after the
            normalization.
        pytorch_compatible (bool): If True perform the group normalization in
            the same order/grouping as PyTorch.
    """

    def __init__(
        self,
        num_groups: int,
        dims: int,
        eps: float = 1e-5,
        affine: bool = True,
        pytorch_compatible: bool = False,
    ):
        super().__init__()
        self.num_groups = num_groups
        self.dims = dims
        self.eps = eps
        self.pytorch_compatible = pytorch_compatible
        if affine:
            # The bias is registered before the weight.
            self.bias = mx.zeros((dims,))
            self.weight = mx.ones((dims,))

    def _extra_repr(self):
        affine = "weight" in self
        return (
            f"{self.num_groups}, {self.dims}, eps={self.eps}, "
            f"affine={affine}, pytorch_compatible={self.pytorch_compatible}"
        )

    def _pytorch_compatible_group_norm(self, x):
        """Normalize with groups made of contiguous blocks of features."""
        groups = self.num_groups
        batch, *rest, dims = x.shape
        per_group = dims // groups

        # (batch, spatial, groups, per_group) -> (batch, groups, spatial * per_group)
        split = x.reshape(batch, -1, groups, per_group)
        grouped = split.transpose(0, 2, 1, 3).reshape(batch, groups, -1)

        # Normalize each group over its last axis, without scale or shift.
        normed = mx.fast.layer_norm(grouped, weight=None, bias=None, eps=self.eps)

        # Undo the grouping and restore the original layout.
        unsplit = normed.reshape(batch, groups, -1, per_group)
        return unsplit.transpose(0, 2, 1, 3).reshape(batch, *rest, dims)

    def _group_norm(self, x):
        """Normalize with groups formed by reshaping the last axis to ``num_groups``."""
        batch, *rest, dims = x.shape

        # Everything but the batch axis is folded into (-1, num_groups).
        grouped = x.reshape(batch, -1, self.num_groups)

        means = mx.mean(grouped, axis=1, keepdims=True)
        var = mx.var(grouped, axis=1, keepdims=True)
        normed = (grouped - means) * mx.rsqrt(var + self.eps)

        return normed.reshape(batch, *rest, dims)

    def __call__(self, x):
        if self.pytorch_compatible:
            normed = self._pytorch_compatible_group_norm(x)
        else:
            normed = self._group_norm(x)

        if "weight" not in self:
            return normed
        return self.weight * normed + self.bias


class BatchNorm(Module):
    r"""Applies Batch Normalization over a 2D, 3D or 4D input.

    Computes

    .. math::

        y = \frac{x - E[x]}{\sqrt{Var[x] + \epsilon}} \gamma + \beta,

    where :math:`\gamma` and :math:`\beta` are learned per feature dimension
    parameters initialized at 1 and 0 respectively.

    The input shape is specified as ``NC`` or ``NLC``, where ``N`` is the
    batch, ``C`` is the number of features or channels, and ``L`` is the
    sequence length. The output has the same shape as the input. For
    four-dimensional arrays, the shape is ``NHWC``, where ``H`` and ``W`` are
    the height and width respectively.

    For more information on Batch Normalization, see the original paper `Batch
    Normalization: Accelerating Deep Network Training by Reducing Internal
    Covariate Shift <https://arxiv.org/abs/1502.03167>`_.

    Args:
        num_features (int): The feature dimension to normalize over.
        eps (float, optional): A small additive constant for numerical
            stability. Default: ``1e-5``.
        momentum (float, optional): The momentum for updating the running
            mean and variance. Default: ``0.1``.
        affine (bool, optional): If ``True``, apply a learned affine
            transformation after the normalization. Default: ``True``.
        track_running_stats (bool, optional): If ``True``, track the
            running mean and variance. Default: ``True``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn as nn
        >>> x = mx.random.normal((5, 4))
        >>> bn = nn.BatchNorm(num_features=4, affine=True)
        >>> output = bn(x)
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: float = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
    ):
        super().__init__()

        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.track_running_stats = track_running_stats

        feature_shape = (num_features,)
        if affine:
            self.weight = mx.ones(feature_shape)
            self.bias = mx.zeros(feature_shape)

        if track_running_stats:
            self.running_mean = mx.zeros(feature_shape)
            self.running_var = mx.ones(feature_shape)
            # The running statistics are kept frozen.
            self.freeze(keys=["running_mean", "running_var"], recurse=False)

    def unfreeze(self, *args, **kwargs):
        """Unfreeze as usual, then freeze ``running_mean`` and
        ``running_var`` again so they always stay frozen."""
        super().unfreeze(*args, **kwargs)
        self.freeze(keys=["running_mean", "running_var"], recurse=False)

    def _extra_repr(self):
        affine = "weight" in self
        return (
            f"{self.num_features}, eps={self.eps}, "
            f"momentum={self.momentum}, affine={affine}, "
            f"track_running_stats={self.track_running_stats}"
        )

    def _calc_stats(self, x: mx.array) -> Tuple[mx.array, mx.array]:
        """
        Compute the mean and variance of ``x`` over every axis except the
        last one.

        Args:
            x (array): Input tensor.

        Returns:
            tuple: Tuple containing mean and variance.
        """
        all_but_last = tuple(range(x.ndim - 1))
        return mx.mean(x, axis=all_but_last), mx.var(x, axis=all_but_last)

    def __call__(self, x: mx.array) -> mx.array:
        """
        Forward pass of BatchNorm.

        Args:
            x (array): Input tensor with 2, 3 or 4 dimensions.

        Returns:
            array: Normalized output tensor.

        Raises:
            ValueError: If ``x`` has fewer than 2 or more than 4 dimensions.
        """
        if not 2 <= x.ndim <= 4:
            raise ValueError(
                f"Expected input tensor to have 2, 3 or 4 dimensions, but got {x.ndim}"
            )

        # Batch statistics are used unless running statistics are tracked and
        # the module is in evaluation mode.
        mean, var = self._calc_stats(x)
        if self.track_running_stats:
            if self.training:
                # Update the running statistics with the batch statistics.
                mu = self.momentum
                self.running_mean = (1 - mu) * self.running_mean + mu * mean
                self.running_var = (1 - mu) * self.running_var + mu * var
            else:
                mean, var = self.running_mean, self.running_var

        normalized = (x - mean) * mx.rsqrt(var + self.eps)
        if "weight" not in self:
            return normalized
        return self.weight * normalized + self.bias


================================================
FILE: python/mlx/nn/layers/pooling.py
================================================
# Copyright © 2023-2024 Apple Inc.

import operator
from itertools import accumulate
from typing import Optional, Tuple, Union

import mlx.core as mx
from mlx.nn.layers.base import Module


def _value_or_list(x, n, msg):
    if isinstance(x, (list, tuple)):
        if len(x) != n:
            raise ValueError(msg)
        return list(x)

    if not isinstance(x, int):
        raise ValueError(msg)

    return [x] * n


def _non_overlapping_sliding_windows(x, shape, window_shape):
    # Compute the intermediate shape
    new_shape = [shape[0]]
    for s, w in zip(shape[1:], window_shape):
        new_shape.append(s // w)
        new_shape.append(w)
    new_shape.append(shape[-1])

    last_axis = len(new_shape) - 1
    axis_order = [0, *range(1, last_axis, 2), *range(2, last_axis, 2), last_axis]

    x = x.reshape(new_shape)
    x = x.transpose(axis_order)
    return x


def _sliding_windows(x, window_shape, window_strides):
    if x.ndim < 3:
        raise ValueError(
            f"To extract sliding windows at least 1 spatial dimension "
            f"(3 total) is needed but the input only has {x.ndim} dimensions."
        )

    spatial_dims = x.shape[1:-1]
    if not (len(spatial_dims) == len(window_shape) == len(window_strides)):
        raise ValueError(
            f"To extract sliding windows the window shapes and strides must have "
            f"the same number of spatial dimensions as the signal but the signal "
            f"has {len(spatial_dims)} dims and the window shape has {len(window_shape)} "
            f"and strides have {len(window_strides)}."
        )

    shape = x.shape
    if all(
        window == stride and size % window == 0
        for size, window, stride in zip(spatial_dims, window_shape, window_strides)
    ):
        return _non_overlapping_sliding_windows(x, shape, window_shape)

    strides = list(reversed(list(accumulate(reversed(shape + (1,)), operator.mul))))[1:]

    # Compute the output shape
    final_shape = [shape[0]]
    final_shape += [
        (size - window) // stride + 1
        for size, window, stride in zip(spatial_dims, window_shape, window_strides)
    ]
    final_shape += window_shape
    final_shape += [shape[-1]]

    # Compute the output strides
    final_strides = strides[:1]
    final_strides += [
        og_stride * stride for og_stride, stride in zip(strides[1:-1], window_strides)
    ]
    final_strides += strides[1:-1]
    final_strides += strides[-1:]  # should always be [1]

    return mx.as_strided(x, final_shape, final_strides)


class _Pool(Module):
    """Shared pooling implementation: pad, extract windows, then reduce.

    Args:
        pooling_function: Reduction called as ``pooling_function(x, axes)``
            on the windowed input.
        kernel_size: The window size for each spatial axis.
        stride: The window stride for each spatial axis.
        padding: One ``(before, after)`` pair per spatial axis.
        padding_value: The constant used to fill the padded region.
    """

    def __init__(self, pooling_function, kernel_size, stride, padding, padding_value):
        super().__init__()

        self._pooling_function = pooling_function
        self._kernel_size = kernel_size
        self._stride = stride
        self._padding = padding
        self._padding_value = padding_value
        # The window axes are reduced; they sit just before the last axis.
        num_window_axes = len(self._kernel_size)
        self._axes = tuple(range(-num_window_axes - 1, -1))

    def _extra_repr(self):
        kernel = tuple(self._kernel_size)
        stride = tuple(self._stride)
        # Only the leading amount of each padding pair is reported.
        padding = tuple([pair[0] for pair in self._padding])

        return f"kernel_size={kernel}, stride={stride}, padding={padding}"

    def __call__(self, x):
        needs_padding = any(pair[0] > 0 for pair in self._padding)
        if needs_padding:
            # The first and last axes are never padded.
            pad_width = [(0, 0)] + self._padding + [(0, 0)]
            x = mx.pad(x, pad_width, constant_values=self._padding_value)

        windows = _sliding_windows(x, self._kernel_size, self._stride)
        return self._pooling_function(windows, self._axes)


class _Pool1d(_Pool):
    """Argument validation for pooling over a single spatial axis."""

    def __init__(
        self,
        pooling_function,
        padding_value,
        kernel_size: Union[int, Tuple[int]],
        stride: Optional[Union[int, Tuple[int]]] = None,
        padding: Union[int, Tuple[int]] = 0,
    ):
        owner = type(self).__name__

        def as_list(value, arg_name):
            # Validate one argument and expand it to a 1-element list.
            return _value_or_list(
                value,
                1,
                f"[{owner}] '{arg_name}' must be an integer or a tuple containing 1 integer",
            )

        kernel_size = as_list(kernel_size, "kernel_size")
        # An omitted stride reuses the kernel size list itself.
        stride = kernel_size if stride is None else as_list(stride, "stride")
        # The same amount of padding is applied before and after the axis.
        padding = [(amount, amount) for amount in as_list(padding, "padding")]

        super().__init__(pooling_function, kernel_size, stride, padding, padding_value)


class _Pool2d(_Pool):
    """Argument validation for pooling over two spatial axes."""

    def __init__(
        self,
        pooling_function,
        padding_value,
        kernel_size: Union[int, Tuple[int, int]],
        stride: Optional[Union[int, Tuple[int, int]]] = None,
        padding: Optional[Union[int, Tuple[int, int]]] = 0,
    ):
        owner = type(self).__name__

        def as_list(value, arg_name):
            # Validate one argument and expand it to a 2-element list.
            return _value_or_list(
                value,
                2,
                f"[{owner}] '{arg_name}' must be an integer or a tuple containing 2 integers",
            )

        kernel_size = as_list(kernel_size, "kernel_size")
        # An omitted stride reuses the kernel size list itself.
        stride = kernel_size if stride is None else as_list(stride, "stride")
        # The same amount of padding is applied before and after each axis.
        padding = [(amount, amount) for amount in as_list(padding, "padding")]

        super().__init__(pooling_function, kernel_size, stride, padding, padding_value)


class _Pool3d(_Pool):
    """Argument validation for pooling over three spatial axes."""

    def __init__(
        self,
        pooling_function,
        padding_value,
        kernel_size: Union[int, Tuple[int, int, int]],
        stride: Optional[Union[int, Tuple[int, int, int]]] = None,
        padding: Optional[Union[int, Tuple[int, int, int]]] = 0,
    ):
        owner = type(self).__name__

        def as_list(value, arg_name):
            # Validate one argument and expand it to a 3-element list.
            return _value_or_list(
                value,
                3,
                f"[{owner}] '{arg_name}' must be an integer or a tuple containing 3 integers",
            )

        kernel_size = as_list(kernel_size, "kernel_size")
        # An omitted stride reuses the kernel size list itself.
        stride = kernel_size if stride is None else as_list(stride, "stride")
        # The same amount of padding is applied before and after each axis.
        padding = [(amount, amount) for amount in as_list(padding, "padding")]

        super().__init__(pooling_function, kernel_size, stride, padding, padding_value)


class MaxPool1d(_Pool1d):
    r"""Applies 1-dimensional max pooling.

    The input is spatially downsampled by taking the maximum over a sliding
    window of size ``kernel_size`` that moves with stride ``stride``.

    Args:
        kernel_size (int or tuple(int)): The size of the pooling window kernel.
        stride (int or tuple(int), optional): The stride of the pooling window.
            Default: ``kernel_size``.
        padding (int or tuple(int), optional): How much negative infinity
            padding to apply to the input. The padding amount is applied to
            both sides of the spatial axis. Default: ``0``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn.layers as nn
        >>> x = mx.random.normal(shape=(4, 16, 5))
        >>> pool = nn.MaxPool1d(kernel_size=2, stride=2)
        >>> pool(x)
    """

    def __init__(
        self,
        kernel_size: Union[int, Tuple[int]],
        stride: Optional[Union[int, Tuple[int]]] = None,
        padding: Union[int, Tuple[int]] = 0,
    ):
        # Padding with -inf can never win the maximum.
        pad_value = float("-inf")
        super().__init__(mx.max, pad_value, kernel_size, stride, padding)


class AvgPool1d(_Pool1d):
    r"""Applies 1-dimensional average pooling.

    The input is spatially downsampled by taking the average over a sliding
    window of size ``kernel_size`` that moves with stride ``stride``.

    Args:
        kernel_size (int or tuple(int)): The size of the pooling window kernel.
        stride (int or tuple(int), optional): The stride of the pooling window.
            Default: ``kernel_size``.
        padding (int or tuple(int), optional): How much zero padding to apply to
            the input. The padding amount is applied to both sides of the spatial
            axis. Default: ``0``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn.layers as nn
        >>> x = mx.random.normal(shape=(4, 16, 5))
        >>> pool = nn.AvgPool1d(kernel_size=2, stride=2)
        >>> pool(x)
    """

    def __init__(
        self,
        kernel_size: Union[int, Tuple[int]],
        stride: Optional[Union[int, Tuple[int]]] = None,
        padding: Union[int, Tuple[int]] = 0,
    ):
        # Padded positions are filled with zeros.
        pad_value = 0
        super().__init__(mx.mean, pad_value, kernel_size, stride, padding)


class MaxPool2d(_Pool2d):
    r"""Applies 2-dimensional max pooling.

    Spatially downsamples the input by taking the maximum of a sliding window
    of size ``kernel_size`` and sliding stride ``stride``.

    The parameters ``kernel_size``, ``stride``, and ``padding`` can either be:

    * a single ``int`` -- in which case the same value is used for both the
      height and width axis.
    * a ``tuple`` of two ``int`` s -- in which case, the first ``int`` is
      used for the height axis, the second ``int`` for the width axis.

    Args:
        kernel_size (int or tuple(int, int)): The size of the pooling window.
        stride (int or tuple(int, int), optional): The stride of the pooling
            window. Default: ``kernel_size``.
        padding (int or tuple(int, int), optional): How much negative infinity
            padding to apply to the input. The padding is applied on both sides
            of the height and width axis. Default: ``0``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn.layers as nn
        >>> x = mx.random.normal(shape=(8, 32, 32, 4))
        >>> pool = nn.MaxPool2d(kernel_size=2, stride=2)
        >>> pool(x)
    """

    def __init__(
        self,
        kernel_size: Union[int, Tuple[int, int]],
        stride: Optional[Union[int, Tuple[int, int]]] = None,
        padding: Optional[Union[int, Tuple[int, int]]] = 0,
    ):
        # Max pooling: the window reduction is ``mx.max`` and ``-inf`` is
        # passed alongside it (the negative-infinity padding described in the
        # docstring, so padding cannot beat a real input value).
        # NOTE(review): the positional order follows ``_Pool2d.__init__``,
        # which is defined outside this view -- confirm there.
        super().__init__(mx.max, -float("inf"), kernel_size, stride, padding)


class AvgPool2d(_Pool2d):
    r"""Applies 2-dimensional average pooling.

    Spatially downsamples the input by taking the average of a sliding window
    of size ``kernel_size`` and sliding stride ``stride``.

    The parameters ``kernel_size``, ``stride``, and ``padding`` can either be:

    * a single ``int`` -- in which case the same value is used for both the
      height and width axis.
    * a ``tuple`` of two ``int`` s -- in which case, the first ``int`` is
      used for the height axis, the second ``int`` for the width axis.

    Args:
        kernel_size (int or tuple(int, int)): The size of the pooling window.
        stride (int or tuple(int, int), optional): The stride of the pooling
            window. Default: ``kernel_size``.
        padding (int or tuple(int, int), optional): How much zero
            padding to apply to the input. The padding is applied on both sides
            of the height and width axis. Default: ``0``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn.layers as nn
        >>> x = mx.random.normal(shape=(8, 32, 32, 4))
        >>> pool = nn.AvgPool2d(kernel_size=2, stride=2)
        >>> pool(x)
    """

    def __init__(
        self,
        kernel_size: Union[int, Tuple[int, int]],
        stride: Optional[Union[int, Tuple[int, int]]] = None,
        padding: Optional[Union[int, Tuple[int, int]]] = 0,
    ):
        # Average pooling: the window reduction is ``mx.mean`` and ``0`` is
        # passed alongside it (the zero padding described in the docstring).
        # NOTE(review): the positional order follows ``_Pool2d.__init__``,
        # which is defined outside this view -- confirm there.
        super().__init__(mx.mean, 0, kernel_size, stride, padding)


class MaxPool3d(_Pool3d):
    r"""Applies 3-dimensional max pooling.

    Spatially downsamples the input by taking the maximum of a sliding window
    of size ``kernel_size`` and sliding stride ``stride``.

    The parameters ``kernel_size``, ``stride``, and ``padding`` can either be:

    * a single ``int`` -- in which case the same value is used for the depth,
      height, and width axis.
    * a ``tuple`` of three ``int`` s -- in which case, the first ``int`` is used
      for the depth axis, the second ``int`` for the height axis, and the third
      ``int`` for the width axis.

    Args:
        kernel_size (int or tuple(int, int, int)): The size of the pooling window.
        stride (int or tuple(int, int, int), optional): The stride of the pooling
            window. Default: ``kernel_size``.
        padding (int or tuple(int, int, int), optional): How much negative infinity
            padding to apply to the input. The padding is applied on both sides
            of the depth, height and width axis. Default: ``0``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn.layers as nn
        >>> x = mx.random.normal(shape=(8, 16, 32, 32, 4))
        >>> pool = nn.MaxPool3d(kernel_size=2, stride=2)
        >>> pool(x)
    """

    def __init__(
        self,
        kernel_size: Union[int, Tuple[int, int, int]],
        stride: Optional[Union[int, Tuple[int, int, int]]] = None,
        padding: Optional[Union[int, Tuple[int, int, int]]] = 0,
    ):
        # Max pooling: the window reduction is ``mx.max`` and ``-inf`` is
        # passed alongside it (the negative-infinity padding described in the
        # docstring, so padding cannot beat a real input value).
        # NOTE(review): the positional order follows ``_Pool3d.__init__``,
        # which is defined outside this view -- confirm there.
        super().__init__(mx.max, -float("inf"), kernel_size, stride, padding)


class AvgPool3d(_Pool3d):
    r"""Applies 3-dimensional average pooling.

    Spatially downsamples the input by taking the average of a sliding window
    of size ``kernel_size`` and sliding stride ``stride``.

    The parameters ``kernel_size``, ``stride``, and ``padding`` can either be:

    * a single ``int`` -- in which case the same value is used for the depth,
      height, and width axis.
    * a ``tuple`` of three ``int`` s -- in which case, the first ``int`` is used
      for the depth axis, the second ``int`` for the height axis, and the third
      ``int`` for the width axis.

    Args:
        kernel_size (int or tuple(int, int, int)): The size of the pooling window.
        stride (int or tuple(int, int, int), optional): The stride of the pooling
            window. Default: ``kernel_size``.
        padding (int or tuple(int, int, int), optional): How much zero
            padding to apply to the input. The padding is applied on both sides
            of the depth, height and width axis. Default: ``0``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn.layers as nn
        >>> x = mx.random.normal(shape=(8, 16, 32, 32, 4))
        >>> pool = nn.AvgPool3d(kernel_size=2, stride=2)
        >>> pool(x)
    """

    def __init__(
        self,
        kernel_size: Union[int, Tuple[int, int, int]],
        stride: Optional[Union[int, Tuple[int, int, int]]] = None,
        padding: Optional[Union[int, Tuple[int, int, int]]] = 0,
    ):
        # Average pooling: the window reduction is ``mx.mean`` and ``0`` is
        # passed alongside it (the zero padding described in the docstring).
        # NOTE(review): the positional order follows ``_Pool3d.__init__``,
        # which is defined outside this view -- confirm there.
        super().__init__(mx.mean, 0, kernel_size, stride, padding)


================================================
FILE: python/mlx/nn/layers/positional_encoding.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
from typing import Optional

import mlx.core as mx
from mlx.nn.layers.base import Module


class RoPE(Module):
    """Implements the rotary positional encoding.

    The traditional implementation rotates consecutive pairs of elements in the
    feature dimension while the default implementation rotates pairs with
    stride half the feature dimensions for efficiency.

    For more details see `RoFormer: Enhanced Transformer with Rotary Position
    Embedding <https://arxiv.org/abs/2104.09864>`_.

    Args:
        dims (int): The feature dimensions to be rotated. If the input feature
            is larger than dims then the rest is left unchanged.
        traditional (bool, optional): If set to ``True`` choose the traditional
            implementation which is slightly less efficient. Default: ``False``.
        base (float, optional): The base used to compute angular frequency for
            each dimension in the positional encodings. Default: ``10000``.
        scale (float, optional): The scale used to scale the positions. Default: ``1.0``.
    """

    def __init__(
        self,
        dims: int,
        traditional: bool = False,
        base: float = 10000,
        scale: float = 1.0,
    ):
        super().__init__()
        # Plain configuration values; nothing is precomputed here. All four
        # are forwarded unchanged to ``mx.fast.rope`` on every call.
        self.dims = dims
        self.traditional = traditional
        self.base = base
        self.scale = scale

    def _extra_repr(self):
        # Only ``dims`` and ``traditional`` appear in the summary string;
        # ``base`` and ``scale`` are not included.
        return f"{self.dims}, traditional={self.traditional}"

    def __call__(self, x, offset: int = 0):
        """Apply the rotary positional encoding to ``x``.

        Args:
            x: The input, passed as the first argument of ``mx.fast.rope``.
            offset (int, optional): Forwarded as the ``offset`` argument of
                ``mx.fast.rope``. Default: ``0``.

        Returns:
            The result of ``mx.fast.rope`` for the stored configuration.
        """
        return mx.fast.rope(
            x,
            self.dims,
            traditional=self.traditional,
            base=self.base,
            scale=self.scale,
            offset=offset,
        )


class SinusoidalPositionalEncoding(Module):
    r"""Implements sinusoidal positional encoding.

    For more details see the paper `Attention Is All You Need
    <https://arxiv.org/abs/1706.03762>`_.

    The layer uses ``dims // 2`` frequencies spaced geometrically from
    ``max_freq`` down to ``min_freq``. The output gains one trailing axis of
    size ``2 * (dims // 2)`` holding the sines and cosines of the input
    multiplied by each frequency.

    Args:
        dims (int): The dimensionality of the resulting positional embeddings.
        min_freq (float, optional): The minimum frequency expected. Must be
            positive since its logarithm is taken. Default: ``0.0001``.
        max_freq (float, optional): The maximum frequency expected. Must be
            positive since its logarithm is taken. Default: ``1``.
        scale (float, optional): A multiplicative scale for the embeddings.
            Default: ``sqrt(2/dims)``.
        cos_first (bool, optional): If ``True`` embed using ``[cos(x); sin(x)]``
            instead of the reverse. Default: ``False``.
        full_turns (bool, optional): If ``True`` multiply the frequencies with
            :math:`2\pi`. Default: ``False``.
    """

    def __init__(
        self,
        dims: int,
        min_freq: float = 0.0001,
        max_freq: float = 1,
        scale: Optional[float] = None,
        cos_first: bool = False,
        full_turns: bool = False,
    ):
        super().__init__()

        half_dims = dims // 2
        # Linear ramp from 1 (first frequency) down to 0 (last frequency).
        # With a single frequency (``dims`` of 2 or 3) the natural divisor
        # ``half_dims - 1`` is zero, which would turn the ramp into 0/0 and
        # every embedding into a non-finite value. Clamp the divisor so that
        # the lone frequency sits at ``max_freq``.
        one_zero = 1 - mx.arange(0, half_dims) / max(half_dims - 1, 1)
        min_freq = math.log(min_freq)
        max_freq = math.log(max_freq)

        # Start with underscore so it is not included in the parameters
        # Interpolating in log-space gives geometrically spaced frequencies.
        self._sigmas = mx.exp(one_zero * (max_freq - min_freq) + min_freq)
        if full_turns:
            self._sigmas = self._sigmas * (2 * math.pi)

        # Save some constants that define the implementation
        self.scale = scale or (2 / dims) ** 0.5
        self.cos_first = cos_first

    def __call__(self, x):
        """Embed ``x`` with sines and cosines at the stored frequencies.

        Args:
            x: The values to embed. A new trailing axis is appended and
                broadcast against the ``dims // 2`` frequencies.

        Returns:
            The embeddings, with one more axis than ``x`` whose size is
            ``2 * (dims // 2)``.
        """
        y = x[..., None] * self._sigmas
        cosy = mx.cos(y)
        siny = mx.sin(y)

        if self.cos_first:
            y = mx.concatenate([cosy, siny], axis=-1)
        else:
            y = mx.concatenate([siny, cosy], axis=-1)

        # Skip the multiplication entirely when it would be a no-op.
        if self.scale != 1:
            y = y * self.scale

        return y


class ALiBi(Module):
    """Adds a per-head linear distance penalty to attention scores.

    Each head gets a fixed slope; the bias added to a score is that slope
    times the negated absolute distance between the query and key positions.
    """

    @staticmethod
    def create_alibi_matrix(
        q_sequence_length: int,
        k_sequence_length: int,
        num_heads: int,
        offset: int,
        dtype=mx.float32,
    ):
        """Build the bias added to the attention scores.

        Query positions run over ``[offset, q_sequence_length)`` and key
        positions over ``[0, k_sequence_length)``. The result is the negated
        absolute position difference, with two leading axes added, multiplied
        by the per-head slopes and cast to ``dtype``.
        """
        query_positions = mx.arange(offset, q_sequence_length)
        key_positions = mx.arange(0, k_sequence_length)
        position_delta = query_positions[:, None] - key_positions[None, :]
        negative_distance = -mx.abs(mx.expand_dims(position_delta, axis=(0, 1)))
        head_slopes = ALiBi.create_alibi_slope(num_heads=num_heads, dtype=dtype)
        return (negative_distance * head_slopes).astype(dtype)

    @staticmethod
    def create_alibi_slope(num_heads, dtype):
        """Return the ``num_heads`` slopes with two trailing singleton axes."""

        def get_slopes(n: int):
            log_n = math.log2(n)
            if not log_n.is_integer():
                # Not a power of two: take the slopes of the largest power of
                # two below ``n`` and top them up with every other slope of
                # the next power of two.
                lower_power = 2 ** math.floor(log_n)
                top_up = get_slopes(2 * lower_power)[0::2][: n - lower_power]
                return get_slopes(lower_power) + top_up
            # Power of two: geometric sequence start, start**2, ..., start**n.
            start = 2 ** (-(2 ** -(log_n - 3)))
            return [start * start**i for i in range(n)]

        slope_array = mx.array(get_slopes(num_heads), dtype=dtype)
        return mx.expand_dims(slope_array, axis=(-1, -2))

    def __call__(self, attention_scores, offset=0, mask=None):
        """Return ``attention_scores`` plus the bias (and ``mask`` if given).

        The head count is read from axis 1 of ``attention_scores`` and the
        query/key lengths from its last two axes.
        """
        bias = ALiBi.create_alibi_matrix(
            q_sequence_length=attention_scores.shape[-2] + offset,
            k_sequence_length=attention_scores.shape[-1],
            num_heads=attention_scores.shape[1],
            offset=offset,
            dtype=attention_scores.dtype,
        )
        if mask is None:
            return attention_scores + bias
        return attention_scores + (bias + mask)


================================================
FILE: python/mlx/nn/layers/quantized.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
from typing import Callable, Optional, Union

import mlx.core as mx
from mlx.nn.layers.base import Module
from mlx.utils import tree_map_with_path


def _defaults_for_mode(mode, group_size, bits):
    mode_defaults = {
        "affine": (64, 4),
        "mxfp4": (32, 4),
        "nvfp4": (16, 4),
        "mxfp8": (32, 8),
    }
    default_group_size, default_bits = mode_defaults[mode]
    return group_size or default_group_size, bits or default_bits


def quantize(
    model: Module,
    group_size: Optional[int] = None,
    bits: Optional[int] = None,
    *,
    mode: str = "affine",
    quantize_input: bool = False,
    class_predicate: Optional[Callable[[str, Module], Union[bool, dict]]] = None,
):
    """Quantize the sub-modules of a module according to a predicate.

    By default all layers that define a ``to_quantized()`` method will be
    quantized. Both :obj:`Linear` and :obj:`Embedding` layers will be
    quantized. The module is updated in-place.

    Note:
        ``quantize_input=True`` is only supported for ``"nvfp4"`` and ``"mxfp8"``
        modes and :obj:`Linear` layers.

    Args:
        model (mlx.nn.Module): The model whose leaf modules may be quantized.
        group_size (Optional[int]): The quantization group size (see
           :func:`mlx.core.quantize`). Default: ``None``.
        bits (Optional[int]): The number of bits per parameter (see
           :func:`mlx.core.quantize`). Default: ``None``.
        mode (str): The quantization method to use (see
           :func:`mlx.core.quantize`). Default: ``"affine"``.
        quantize_input (bool): Whether to quantize activations. Default: ``False``.
        class_predicate (Optional[Callable]): A callable which receives the
           :obj:`Module` path and :obj:`Module` itself and returns ``True`` or a
           dict of params for ``to_quantized`` if it should be quantized and
           ``False`` otherwise. If ``None``, then all layers that define a
           ``to_quantized()`` method are quantized. A returned dict is passed
           to ``to_quantized`` as-is (the ``group_size``, ``bits``, ``mode``
           and ``quantize_input`` arguments of this function are not merged
           in) and is never modified. Default: ``None``.

    Raises:
        ValueError: If the predicate selects a module without a
            ``to_quantized()`` method, or returns a truthy value that is
            neither a ``bool`` nor a ``dict``.

    Example:
        Weight only quantization for all layers that define a ``to_quantized()`` method:

        >>> import mlx.nn as nn
        >>> nn.quantize(model, group_size=64, bits=4, mode="affine")

        Weight and input quantization for all linear layers:

        >>> predicate = lambda p, m: isinstance(m, nn.Linear)
        >>> nn.quantize(model, mode="nvfp4", quantize_input=True, class_predicate=predicate)
    """
    class_predicate = class_predicate or (lambda _, m: hasattr(m, "to_quantized"))

    def _maybe_quantize(path, m):
        bool_or_params = class_predicate(path, m)
        if not bool_or_params:
            return m

        if not hasattr(m, "to_quantized"):
            raise ValueError(f"Unable to quantize model of type {type(m)}")

        if isinstance(bool_or_params, bool):
            kwargs = {"group_size": group_size, "bits": bits, "mode": mode}
            # Only forward ``quantize_input`` when it is requested so that
            # ``to_quantized`` implementations without it keep working.
            if quantize_input:
                kwargs["quantize_input"] = quantize_input
            return m.to_quantized(**kwargs)

        if isinstance(bool_or_params, dict):
            # Work on a copy: the dict belongs to the caller's predicate and
            # may be shared between calls, so it must not be mutated here.
            kwargs = dict(bool_or_params)
            if "quantize_input" in kwargs and not kwargs["quantize_input"]:
                del kwargs["quantize_input"]
            return m.to_quantized(**kwargs)

        raise ValueError(
            "``class_predicate`` must return a bool"
            " or a dict of parameters to pass to ``to_quantized``"
        )

    leaves = model.leaf_modules()
    leaves = tree_map_with_path(_maybe_quantize, leaves, is_leaf=Module.is_module)
    model.update_modules(leaves)


class QuantizedEmbedding(Module):
    """The same as :obj:`Embedding` but with a quantized weight matrix.

    :obj:`QuantizedEmbedding` also provides a :meth:`from_embedding`
    classmethod to convert embedding layers to :obj:`QuantizedEmbedding`
    layers.

    Args:
        num_embeddings (int): How many possible discrete tokens can we embed.
           Usually called the vocabulary size.
        dims (int): The dimensionality of the embeddings.
        group_size (Optional[int]): The group size to use for the quantized
            weight. See :func:`~mlx.core.quantize`. Default: ``None``.
        bits (Optional[int]): The bit width to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``None``.
        mode (str): The quantization method to use (see
           :func:`mlx.core.quantize`). Default: ``"affine"``.
    """

    def __init__(
        self,
        num_embeddings: int,
        dims: int,
        group_size: Optional[int] = None,
        bits: Optional[int] = None,
        mode: str = "affine",
    ):
        super().__init__()

        # Quantization config
        self.group_size, self.bits = _defaults_for_mode(mode, group_size, bits)
        self.mode = mode

        # Initialize the quantized weight
        scale = math.sqrt(1 / dims)
        weight = mx.random.normal(shape=(num_embeddings, dims), scale=scale)
        # Quantize with the resolved config, not the raw arguments, so the
        # packed weight always matches the ``group_size``/``bits`` that
        # ``__call__`` and ``as_linear`` later use to decode it.
        self.weight, self.scales, *biases = mx.quantize(
            weight, self.group_size, self.bits, mode=mode
        )
        # ``mx.quantize`` may return only (weight, scales); keep ``biases`` as
        # ``None`` in that case.
        self.biases = biases[0] if biases else None
        self.num_embeddings = num_embeddings
        self.dims = dims

        # Freeze this model's parameters
        self.freeze()

    def __call__(self, x):
        """Look up the rows selected by ``x`` and dequantize them."""
        biases = self.get("biases")
        return mx.dequantize(
            self["weight"][x],
            scales=self["scales"][x],
            biases=biases[x] if biases is not None else None,
            group_size=self.group_size,
            bits=self.bits,
            mode=self.mode,
        )

    def as_linear(self, x):
        """
        Call the quantized embedding layer as a quantized linear layer.

        Use this for example when input embedding and output projection
        weights are tied.
        """
        return mx.quantized_matmul(
            x,
            self["weight"],
            scales=self["scales"],
            biases=self.get("biases"),
            transpose=True,
            group_size=self.group_size,
            bits=self.bits,
            mode=self.mode,
        )

    def _extra_repr(self):
        return (
            f"{self.num_embeddings}, {self.dims}, "
            f"group_size={self.group_size}, bits={self.bits}, mode={self.mode}"
        )

    @classmethod
    def from_embedding(
        cls,
        embedding_layer: Module,
        group_size: Optional[int] = None,
        bits: Optional[int] = None,
        mode: str = "affine",
    ):
        """Create a :obj:`QuantizedEmbedding` layer from an :obj:`Embedding` layer."""
        num_embeddings, dims = embedding_layer.weight.shape
        ql = cls(num_embeddings, dims, group_size, bits, mode=mode)
        # Replace the randomly initialized weight with the quantized source
        # weight, using the config the new layer resolved for itself.
        ql.weight, ql.scales, *biases = mx.quantize(
            embedding_layer.weight,
            ql.group_size,
            ql.bits,
            mode=mode,
        )
        ql.biases = biases[0] if biases else None
        return ql


class QuantizedLinear(Module):
    """Applies an affine transformation to the input using a quantized weight matrix.

    It is the quantized equivalent of :class:`mlx.nn.Linear`. For now its
    parameters are frozen and will not be included in any gradient computation
    but this will probably change in the future.

    :obj:`QuantizedLinear` also provides a classmethod :meth:`from_linear` to
    convert linear layers to :obj:`QuantizedLinear` layers.

    Args:
        input_dims (int): The dimensionality of the input features.
        output_dims (int): The dimensionality of the output features.
        bias (bool, optional): If set to ``False`` then the layer will not use
            a bias. Default: ``True``.
        group_size (Optional[int]): The group size to use for the quantized
            weight. See :func:`~mlx.core.quantize`. Default: ``None``.
        bits (Optional[int]): The bit width to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``None``.
        mode (str): The quantization method to use (see
           :func:`mlx.core.quantize`). Default: ``"affine"``.
    """

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        bias: bool = True,
        group_size: Optional[int] = None,
        bits: Optional[int] = None,
        mode: str = "affine",
    ):
        super().__init__()

        # Quantization config
        self.group_size, self.bits = _defaults_for_mode(mode, group_size, bits)
        self.mode = mode

        # Initialize the quantized weight
        scale = math.sqrt(1 / input_dims)
        weight = mx.random.uniform(
            low=-scale,
            high=scale,
            shape=(output_dims, input_dims),
        )
        # Quantize with the resolved config, not the raw arguments, so the
        # packed weight always matches the ``group_size``/``bits`` that
        # ``__call__`` later passes to ``mx.quantized_matmul``.
        self.weight, self.scales, *biases = mx.quantize(
            weight, self.group_size, self.bits, mode=mode
        )
        # ``mx.quantize`` may return only (weight, scales); keep ``biases`` as
        # ``None`` in that case.
        self.biases = biases[0] if biases else None

        # And bias if needed
        if bias:
            self.bias = mx.zeros((output_dims,))

        # Freeze this model's parameters
        self.freeze()

    def _extra_repr(self):
        out_dims, in_dims = self.weight.shape
        # Recover the unpacked input size from the packed weight width.
        # NOTE(review): assumes 32-bit packed storage elements -- confirm
        # against mx.quantize.
        in_dims = (in_dims * 32) // self.bits
        return (
            f"input_dims={in_dims}, output_dims={out_dims}, bias={'bias' in self}, "
            f"group_size={self.group_size}, bits={self.bits}, mode={self.mode}"
        )

    def __call__(self, x):
        """Multiply ``x`` by the quantized weight and add the bias if present."""
        x = mx.quantized_matmul(
            x,
            self["weight"],
            scales=self["scales"],
            biases=self.get("biases"),
            transpose=True,
            group_size=self.group_size,
            bits=self.bits,
            mode=self.mode,
        )
        if "bias" in self:
            x = x + self["bias"]
        return x

    @classmethod
    def from_linear(
        cls,
        linear_layer: Module,
        group_size: Optional[int] = None,
        bits: Optional[int] = None,
        mode: str = "affine",
    ):
        """Create a :obj:`QuantizedLinear` layer from a :obj:`Linear` layer."""
        output_dims, input_dims = linear_layer.weight.shape
        # Build without a bias; the source layer's bias is attached below.
        ql = cls(input_dims, output_dims, False, group_size, bits, mode=mode)
        # Replace the randomly initialized weight with the quantized source
        # weight, using the config the new layer resolved for itself.
        ql.weight, ql.scales, *biases = mx.quantize(
            linear_layer.weight,
            ql.group_size,
            ql.bits,
            mode=mode,
        )
        ql.biases = biases[0] if biases else None

        if "bias" in linear_layer:
            ql.bias = linear_layer.bias

        return ql


class QQLinear(Module):
    """Quantizes the input and applies an affine transformation using quantized weights.

    The layer operates in one of two states:

    1) **Eval**: ``self.weight`` holds the quantized weight and ``self.scales``
       holds its scales; both are produced by :func:`mlx.core.quantize`.
    2) **Train**: ``self.weight`` is kept in higher precision and no
       ``scales`` are stored, so the weight is quantized on the fly during the
       computation and gradients with respect to it can be computed.

    Use ``layer.eval()`` and ``layer.train()`` to move between the two states.

    Unlike :class:`mlx.nn.QuantizedLinear`, this layer also quantizes its
    input and includes the weight in gradient computations.

    :obj:`QQLinear` also provides the class method :meth:`from_linear` to
    convert :class:`mlx.nn.Linear` layers to :obj:`QQLinear` layers.

    Note: This layer does not support a bias term yet.

    Args:
        input_dims (int): The dimensionality of the input features.
        output_dims (int): The dimensionality of the output features.
        group_size (Optional[int]): The group size to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``None``.
        bits (Optional[int]): The bit width to use for the quantized weight.
            See :func:`~mlx.core.quantize`. Default: ``None``.
        mode (Optional[str]): The quantization method to use (see
            :func:`mlx.core.quantize`). Currently, only ``"nvfp4"`` and ``"mxfp8"``
            are supported. Default: ``"nvfp4"``.
    """

    def __init__(
        self,
        input_dims: int,
        output_dims: int,
        group_size: int = None,
        bits: int = None,
        mode: str = "nvfp4",
    ):
        super().__init__()

        # Resolve the quantization configuration for the chosen mode.
        resolved_group_size, resolved_bits = _defaults_for_mode(
            mode, group_size, bits
        )
        self.group_size = resolved_group_size
        self.bits = resolved_bits
        self.mode = mode

        # Start with a higher-precision weight; it is only packed once the
        # layer is switched out of training mode.
        bound = math.sqrt(1 / input_dims)
        self.weight = mx.random.uniform(
            low=-bound,
            high=bound,
            shape=(output_dims, input_dims),
        )
        self._quantized = False

    def _extra_repr(self):
        out_dims, in_dims = self.weight.shape
        # A uint32 weight is in packed form, so scale its width back up to
        # the number of input features.
        if self.weight.dtype == mx.uint32:
            in_dims = (in_dims * 32) // self.bits
        config = f"group_size={self.group_size}, bits={self.bits}, mode={self.mode}"
        return f"input_dims={in_dims}, output_dims={out_dims}, " + config

    def quantize(self):
        """Replace the weight with its quantized form and store the scales.

        Does nothing if the weight is already quantized.
        """
        if self._quantized:
            return
        packed_weight, weight_scales = mx.quantize(
            self.weight,
            self.group_size,
            self.bits,
            mode=self.mode,
        )
        self.weight = packed_weight
        self.scales = weight_scales
        self._quantized = True

    def dequantize(self):
        """Restore a higher-precision weight and drop the stored scales.

        Does nothing if the weight is not quantized.
        """
        if not self._quantized:
            return
        self.weight = mx.dequantize(
            self.weight,
            scales=self.scales,
            group_size=self.group_size,
            bits=self.bits,
            mode=self.mode,
        )
        self.__delattr__("scales")
        self._quantized = False

    def _set_training_mode(self, mode: bool):
        super()._set_training_mode(mode)

        # Training needs the higher-precision weight; evaluation uses the
        # packed one.
        if not self._training:
            self.quantize()
        else:
            self.dequantize()

    def __call__(self, x):
        """Apply ``mx.qqmm`` to ``x`` with the current weight and scales."""
        return mx.qqmm(
            x,
            self["weight"],
            scales=self.get("scales"),
            group_size=self.group_size,
            bits=self.bits,
            mode=self.mode,
        )

    @classmethod
    def from_linear(
        cls,
        linear_layer: Module,
        group_size: int = None,
        bits: int = None,
        mode: str = "nvfp4",
    ):
        """Create a :obj:`QQLinear` layer from a :obj:`Linear` layer."""
        output_dims, input_dims = linear_layer.weight.shape  # (N,K)
        source_bias = linear_layer.get("bias")
        if source_bias is not None:
            raise NotImplementedError("QQLinear does not support bias yet.")

        converted = cls(input_dims, output_dims, group_size, bits, mode=mode)
        converted.weight = linear_layer.weight
        # Mirror the source layer's mode so the weight ends up in the
        # matching (packed or higher-precision) state.
        converted.train(linear_layer.training)
        return converted


================================================
FILE: python/mlx/nn/layers/recurrent.py
================================================
# Copyright © 2024 Apple Inc.

import math
from typing import Callable, Optional

import mlx.core as mx
from mlx.nn.layers.activations import tanh
from mlx.nn.layers.base import Module


class RNN(Module):
    r"""An Elman recurrent layer.

    The input is a sequence of shape ``NLD`` or ``LD`` where:

    * ``N`` is the optional batch dimension
    * ``L`` is the sequence length
    * ``D`` is the input's feature dimension

    Concretely, for each element along the sequence length axis, this
    layer applies the function:

    .. math::

        h_{t + 1} = \text{tanh} (W_{ih}x_t + W_{hh}h_t + b)

    The hidden state :math:`h` has shape ``NH`` or ``H``, depending on
    whether the input is batched or not. Returns the hidden state at each
    time step, of shape ``NLH`` or ``LH``.

    Args:
        input_size (int): Dimension of the input, ``D``.
        hidden_size (int): Dimension of the hidden state, ``H``.
        bias (bool, optional): Whether to use a bias. Default: ``True``.
        nonlinearity (callable, optional): Non-linearity to use. If ``None``,
            then :func:`tanh` is used. Default: ``None``.

    Raises:
        ValueError: If ``nonlinearity`` is given but is not callable.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
        nonlinearity: Optional[Callable] = None,
    ):
        super().__init__()

        # Fall back to tanh only when nothing was supplied. Testing
        # truthiness instead would silently replace any falsy callable
        # (presumably including a parameter-less module, since modules
        # behave like mappings -- verify) and would hide invalid falsy
        # values such as ``0``.
        activation = tanh if nonlinearity is None else nonlinearity
        if not callable(activation):
            raise ValueError(
                f"Nonlinearity must be callable. Current value: {nonlinearity}."
            )
        self.nonlinearity = activation

        scale = 1.0 / math.sqrt(hidden_size)
        self.hidden_size = hidden_size
        self.Wxh = mx.random.uniform(
            low=-scale, high=scale, shape=(hidden_size, input_size)
        )
        self.Whh = mx.random.uniform(
            low=-scale, high=scale, shape=(hidden_size, hidden_size)
        )
        self.bias = (
            mx.random.uniform(low=-scale, high=scale, shape=(hidden_size,))
            if bias
            else None
        )

    def _extra_repr(self):
        return (
            f"input_dims={self.Wxh.shape[1]}, "
            f"hidden_size={self.hidden_size}, "
            f"nonlinearity={self.nonlinearity}, bias={self.bias is not None}"
        )

    def __call__(self, x, hidden=None):
        """Run the recurrence over the sequence axis of ``x``.

        Args:
            x: The input sequence; its second-to-last axis is iterated over.
            hidden: Optional initial hidden state. When ``None`` the first
                step uses the input projection alone.

        Returns:
            The hidden states of all steps stacked along the second-to-last
            axis.
        """
        # Project every time step at once; only the hidden-to-hidden product
        # has to be computed sequentially.
        if self.bias is not None:
            x = mx.addmm(self.bias, x, self.Wxh.T)
        else:
            x = x @ self.Wxh.T

        all_hidden = []
        for idx in range(x.shape[-2]):
            if hidden is not None:
                hidden = mx.addmm(x[..., idx, :], hidden, self.Whh.T)
            else:
                # No previous state: the recurrent term is skipped.
                hidden = x[..., idx, :]
            hidden = self.nonlinearity(hidden)
            all_hidden.append(hidden)

        return mx.stack(all_hidden, axis=-2)


class GRU(Module):
    r"""A gated recurrent unit (GRU) RNN layer.

    The input has shape ``NLD`` or ``LD`` where:

    * ``N`` is the optional batch dimension
    * ``L`` is the sequence length
    * ``D`` is the input's feature dimension

    Concretely, for each element of the sequence, this layer computes:

    .. math::

        \begin{aligned}
        r_t &= \sigma (W_{xr}x_t + W_{hr}h_t + b_{r}) \\
        z_t &= \sigma (W_{xz}x_t + W_{hz}h_t + b_{z}) \\
        n_t &= \text{tanh}(W_{xn}x_t + b_{n} + r_t \odot (W_{hn}h_t + b_{hn})) \\
        h_{t + 1} &= (1 - z_t) \odot n_t + z_t \odot h_t
        \end{aligned}

    The hidden state :math:`h` has shape ``NH`` or ``H`` depending on
    whether the input is batched or not. Returns the hidden state at each
    time step of shape ``NLH`` or ``LH``.

    Args:
        input_size (int): Dimension of the input, ``D``.
        hidden_size (int): Dimension of the hidden state, ``H``.
        bias (bool): Whether to use biases or not. Default: ``True``.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
    ):
        super().__init__()

        self.hidden_size = hidden_size
        scale = 1.0 / math.sqrt(hidden_size)
        # The r, z and n projections are fused into single matrices with
        # ``3 * hidden_size`` rows; ``__call__`` slices them apart again.
        self.Wx = mx.random.uniform(
            low=-scale, high=scale, shape=(3 * hidden_size, input_size)
        )
        self.Wh = mx.random.uniform(
            low=-scale, high=scale, shape=(3 * hidden_size, hidden_size)
        )
        # ``b`` is added to the fused input projection; ``bhn`` is the extra
        # bias that sits inside the reset-gate product of the n equation.
        self.b = (
            mx.random.uniform(low=-scale, high=scale, shape=(3 * hidden_size,))
            if bias
            else None
        )
        self.bhn = (
            mx.random.uniform(low=-scale, high=scale, shape=(hidden_size,))
            if bias
            else None
        )

    def _extra_repr(self):
        return (
            f"input_dims={self.Wx.shape[1]}, "
            f"hidden_size={self.hidden_size}, bias={self.b is not None}"
        )

    def __call__(self, x, hidden=None):
        """Run the GRU over the sequence axis of ``x``.

        Args:
            x: The input sequence; its second-to-last axis is iterated over.
            hidden: Optional initial hidden state. When ``None`` the first
                step skips every term that involves the previous state.

        Returns:
            The hidden states of all steps stacked along the second-to-last
            axis.
        """
        # Project every time step at once; only the hidden-state terms are
        # computed inside the loop.
        if self.b is not None:
            x = mx.addmm(self.b, x, self.Wx.T)
        else:
            x = x @ self.Wx.T

        # Split the fused projection: everything but the last ``hidden_size``
        # features feeds the r/z gates, the last ``hidden_size`` feed n.
        x_rz = x[..., : -self.hidden_size]
        x_n = x[..., -self.hidden_size :]

        all_hidden = []

        for idx in range(x.shape[-2]):
            rz = x_rz[..., idx, :]
            if hidden is not None:
                # Same split for the projection of the previous hidden state.
                h_proj = hidden @ self.Wh.T
                h_proj_rz = h_proj[..., : -self.hidden_size]
                h_proj_n = h_proj[..., -self.hidden_size :]

                if self.bhn is not None:
                    h_proj_n += self.bhn

                rz = rz + h_proj_rz

            rz = mx.sigmoid(rz)

            # First half is the reset gate r, second half the update gate z.
            r, z = mx.split(rz, 2, axis=-1)

            n = x_n[..., idx, :]

            if hidden is not None:
                n = n + r * h_proj_n
            elif self.bhn is not None:
                # No previous state: only the bias remains inside the
                # reset-gate product.
                n = n + r * self.bhn
            n = mx.tanh(n)

            if hidden is not None:
                hidden = (1 - z) * n + z * hidden
            else:
                # No previous state: the ``z * hidden`` term is dropped.
                hidden = (1 - z) * n

            all_hidden.append(hidden)

        return mx.stack(all_hidden, axis=-2)


class LSTM(Module):
    r"""A long short-term memory (LSTM) recurrent layer.

    The input is shaped ``NLD`` or ``LD``, with:

    * ``N`` the optional batch dimension
    * ``L`` the sequence length
    * ``D`` the feature dimension of the input

    For every element of the sequence the layer evaluates:

    .. math::
        \begin{aligned}
        i_t &= \sigma (W_{xi}x_t + W_{hi}h_t + b_{i}) \\
        f_t &= \sigma (W_{xf}x_t + W_{hf}h_t + b_{f}) \\
        g_t &= \text{tanh} (W_{xg}x_t + W_{hg}h_t + b_{g}) \\
        o_t &= \sigma (W_{xo}x_t + W_{ho}h_t + b_{o}) \\
        c_{t + 1} &= f_t \odot c_t + i_t \odot g_t \\
        h_{t + 1} &= o_t \text{tanh}(c_{t + 1})
        \end{aligned}

    The hidden state :math:`h` and the cell state :math:`c` are shaped ``NH``
    or ``H`` depending on whether the input is batched.

    Two arrays are returned, the hidden state and the cell state at every
    time step, each of shape ``NLH`` or ``LH``.

    Args:
        input_size (int): Dimension of the input, ``D``.
        hidden_size (int): Dimension of the hidden state, ``H``.
        bias (bool): Whether to use biases or not. Default: ``True``.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        bias: bool = True,
    ):
        super().__init__()

        self.hidden_size = hidden_size

        # All four gates share one stacked weight matrix, hence 4 * H rows.
        bound = 1.0 / math.sqrt(hidden_size)
        gate_rows = 4 * hidden_size
        self.Wx = mx.random.uniform(
            low=-bound, high=bound, shape=(gate_rows, input_size)
        )
        self.Wh = mx.random.uniform(
            low=-bound, high=bound, shape=(gate_rows, hidden_size)
        )
        if bias:
            self.bias = mx.random.uniform(low=-bound, high=bound, shape=(gate_rows,))
        else:
            self.bias = None

    def _extra_repr(self):
        """Return the configuration summary shown in the module's repr."""
        input_dims = self.Wx.shape[1]
        has_bias = self.bias is not None
        return (
            f"input_dims={input_dims}, "
            f"hidden_size={self.hidden_size}, bias={has_bias}"
        )

    def __call__(self, x, hidden=None, cell=None):
        """Run the LSTM over the sequence axis (second to last axis) of ``x``.

        Args:
            x (array): The input sequence.
            hidden (array, optional): The initial hidden state. When ``None``
                the recurrent projection is skipped on the first step.
                Default: ``None``.
            cell (array, optional): The initial cell state. When ``None`` the
                forget-gate term is skipped on the first step.
                Default: ``None``.

        Returns:
            tuple(array, array): The hidden states and the cell states of
            every step, each stacked along axis ``-2``.
        """
        # Input projection for the full sequence in a single matmul.
        if self.bias is not None:
            proj = mx.addmm(self.bias, x, self.Wx.T)
        else:
            proj = x @ self.Wx.T

        hidden_states = []
        cell_states = []

        for t in range(proj.shape[-2]):
            gates = proj[..., t, :]
            if hidden is not None:
                gates = mx.addmm(gates, hidden, self.Wh.T)

            # The stacked pre-activations are ordered input, forget,
            # candidate, output.
            i, f, g, o = mx.split(gates, 4, axis=-1)
            i = mx.sigmoid(i)
            f = mx.sigmoid(f)
            g = mx.tanh(g)
            o = mx.sigmoid(o)

            cell = i * g if cell is None else f * cell + i * g
            hidden = o * mx.tanh(cell)

            cell_states.append(cell)
            hidden_states.append(hidden)

        return mx.stack(hidden_states, axis=-2), mx.stack(cell_states, axis=-2)


================================================
FILE: python/mlx/nn/layers/transformer.py
================================================
# Copyright © 2023 Apple Inc.

import math
from typing import Any, Callable, Optional

import mlx.core as mx
from mlx.nn.layers.activations import relu
from mlx.nn.layers.base import Module
from mlx.nn.layers.dropout import Dropout
from mlx.nn.layers.linear import Linear
from mlx.nn.layers.normalization import LayerNorm
from mlx.nn.utils import checkpoint


class MultiHeadAttention(Module):
    """Multi-head scaled dot product attention.

    Given queries, keys and values, ``MultiHeadAttention`` produces new values
    by aggregating the input values according to the similarity between the
    input queries and keys.

    All inputs as well as the output are linearly projected, without biases by
    default.

    An optional additive attention mask may be supplied. It should be
    broadcastable with ``(batch, num_heads, # queries, # keys)`` and hold
    ``-inf`` or very large negative numbers at the positions that should *not*
    be attended to.

    Args:
        dims (int): The model dimensions. This is also the default
            value for the queries, keys, values, and the output.
        num_heads (int): The number of attention heads to use.
        query_input_dims (int, optional): The input dimensions of the queries.
            Default: ``dims``.
        key_input_dims (int, optional): The input dimensions of the keys.
            Default: ``dims``.
        value_input_dims (int, optional): The input dimensions of the values.
            Default: ``key_input_dims``.
        value_dims (int, optional): The dimensions of the values after the
            projection. Default: ``dims``.
        value_output_dims (int, optional): The dimensions the new values will
            be projected to. Default: ``dims``.
        bias (bool, optional): Whether or not to use a bias in the projections.
            Default: ``False``.
    """

    def __init__(
        self,
        dims: int,
        num_heads: int,
        query_input_dims: Optional[int] = None,
        key_input_dims: Optional[int] = None,
        value_input_dims: Optional[int] = None,
        value_dims: Optional[int] = None,
        value_output_dims: Optional[int] = None,
        bias: bool = False,
    ):
        super().__init__()

        if (dims % num_heads) != 0:
            raise ValueError(
                "The input feature dimensions should be divisible by the "
                f"number of heads ({dims} % {num_heads}) != 0"
            )

        # Resolve the optional sizes; values fall back to the key input size,
        # everything else falls back to the model dimension.
        q_in = query_input_dims or dims
        k_in = key_input_dims or dims
        v_in = value_input_dims or k_in
        v_dims = value_dims or dims
        out_dims = value_output_dims or dims

        self.num_heads = num_heads
        self.query_proj = Linear(q_in, dims, bias=bias)
        self.key_proj = Linear(k_in, dims, bias=bias)
        self.value_proj = Linear(v_in, v_dims, bias=bias)
        self.out_proj = Linear(v_dims, out_dims, bias=bias)

    def __call__(self, queries, keys, values, mask=None):
        """Attend from ``queries`` to ``keys`` and aggregate ``values``.

        Args:
            queries (array): The query inputs.
            keys (array): The key inputs.
            values (array): The value inputs.
            mask (array, optional): Mask forwarded to the attention kernel.
                Default: ``None``.

        Returns:
            array: The attended values after the output projection.
        """
        heads = self.num_heads

        def split_heads(t):
            # Split the feature axis into heads, then swap axes 1 and 2 so the
            # head axis comes before the sequence axis.
            return mx.unflatten(t, -1, (heads, -1)).transpose(0, 2, 1, 3)

        q = self.query_proj(queries)
        k = self.key_proj(keys)
        v = self.value_proj(values)

        q = split_heads(q)
        k = split_heads(k)
        v = split_heads(v)

        # 1 / sqrt(head_dim)
        scale = math.sqrt(1 / q.shape[-1])
        attended = mx.fast.scaled_dot_product_attention(
            q, k, v, scale=scale, mask=mask
        )

        # Undo the head split: swap the axes back and merge the last two.
        merged = attended.transpose(0, 2, 1, 3).flatten(-2, -1)
        return self.out_proj(merged)

    @staticmethod
    def create_additive_causal_mask(N: int, dtype: mx.Dtype = mx.float32):
        """Build an ``(N, N)`` additive causal mask.

        Entry ``(i, j)`` is the most negative finite value of ``dtype`` when
        ``i < j`` and zero otherwise.

        Args:
            N (int): The sequence length.
            dtype (Dtype, optional): The dtype of the mask.
                Default: ``float32``.

        Returns:
            array: The additive mask.
        """
        positions = mx.arange(N)
        future = positions[:, None] < positions[None]
        return future.astype(dtype) * mx.finfo(dtype).min


class TransformerEncoderLayer(Module):
    """A single Transformer encoder block: self-attention followed by an MLP.

    Args:
        dims (int): The feature dimension of the inputs and outputs.
        num_heads (int): The number of attention heads.
        mlp_dims (int, optional): The hidden dimension of the MLP. Defaults to
            ``4 * dims`` when not provided. Default: ``None``.
        dropout (float): Dropout probability applied after the attention and
            after the MLP activation. Default: ``0.0``.
        activation (function): The activation of the MLP hidden layer.
            Default: ``relu``.
        norm_first (bool): If ``True`` layer normalization is applied before
            each sub-block, otherwise after the residual sum.
            Default: ``True``.
    """

    def __init__(
        self,
        dims: int,
        num_heads: int,
        mlp_dims: Optional[int] = None,
        dropout: float = 0.0,
        activation: Callable[[Any], Any] = relu,
        norm_first: bool = True,
    ):
        super().__init__()
        hidden_dims = mlp_dims or dims * 4
        self.attention = MultiHeadAttention(dims, num_heads)
        self.ln1 = LayerNorm(dims)
        self.ln2 = LayerNorm(dims)
        self.linear1 = Linear(dims, hidden_dims)
        self.linear2 = Linear(hidden_dims, dims)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.activation = activation
        self.norm_first = norm_first

    def __call__(self, x, mask):
        """Apply the encoder block to ``x`` using the attention ``mask``."""
        if self.norm_first:
            # Pre-norm: normalize the input of each sub-block and add its
            # output back onto the un-normalized residual stream.
            normed = self.ln1(x)
            attended = self.attention(normed, normed, normed, mask)
            x = x + self.dropout1(attended)

            hidden = self.activation(self.linear1(self.ln2(x)))
            return x + self.linear2(self.dropout2(hidden))

        # Post-norm: normalize after each residual sum.
        attended = self.attention(x, x, x, mask)
        x = self.ln1(x + self.dropout1(attended))

        hidden = self.activation(self.linear1(x))
        return self.ln2(x + self.linear2(self.dropout2(hidden)))


class TransformerEncoder(Module):
    """A stack of :class:`TransformerEncoderLayer` with a final layer norm.

    Args:
        num_layers (int): The number of encoder layers.
        dims (int): The feature dimension.
        num_heads (int): The number of attention heads.
        mlp_dims (int, optional): The hidden dimension of each MLP.
            Default: ``None``.
        dropout (float): Dropout probability of each layer. Default: ``0.0``.
        activation (function): The MLP activation. Default: ``relu``.
        norm_first (bool): Forwarded to every layer. Default: ``True``.
        checkpoint (bool): If ``True`` each layer is wrapped with the
            ``checkpoint`` utility when called. Default: ``False``.
    """

    def __init__(
        self,
        num_layers: int,
        dims: int,
        num_heads: int,
        mlp_dims: Optional[int] = None,
        dropout: float = 0.0,
        activation=relu,
        norm_first: bool = True,
        checkpoint: bool = False,
    ):
        super().__init__()
        self.layers = []
        for _ in range(num_layers):
            self.layers.append(
                TransformerEncoderLayer(
                    dims, num_heads, mlp_dims, dropout, activation, norm_first
                )
            )
        self.ln = LayerNorm(dims)
        self.checkpoint = checkpoint

    def __call__(self, x, mask):
        """Pass ``x`` through every layer, then through the final layer norm."""
        for layer in self.layers:
            # ``checkpoint`` here is the module-level utility, not the flag.
            if self.checkpoint:
                layer = checkpoint(layer)
            x = layer(x, mask)
        return self.ln(x)


class TransformerDecoderLayer(Module):
    """A single Transformer decoder block.

    The block applies self-attention over the target sequence, then
    cross-attention from the target sequence to the encoder ``memory``, and
    finally a two-layer MLP. Each of the three sub-blocks has a residual
    connection, a dropout, and a layer normalization.

    Args:
        dims (int): The feature dimension of the inputs and outputs.
        num_heads (int): The number of attention heads.
        mlp_dims (int, optional): The hidden dimension of the MLP. Defaults to
            ``4 * dims`` when not provided. Default: ``None``.
        dropout (float): Dropout probability applied after each attention and
            after the MLP activation. Default: ``0.0``.
        activation (function): The activation of the MLP hidden layer.
            Default: ``relu``.
        norm_first (bool): If ``True`` layer normalization is applied before
            each sub-block, otherwise after the residual sum.
            Default: ``True``.
    """

    def __init__(
        self,
        dims: int,
        num_heads: int,
        mlp_dims: Optional[int] = None,
        dropout: float = 0.0,
        activation: Callable[[Any], Any] = relu,
        norm_first: bool = True,
    ):
        super().__init__()
        mlp_dims = mlp_dims or dims * 4
        self.self_attention = MultiHeadAttention(dims, num_heads)
        self.cross_attention = MultiHeadAttention(dims, num_heads)
        self.ln1 = LayerNorm(dims)
        self.ln2 = LayerNorm(dims)
        self.ln3 = LayerNorm(dims)
        self.linear1 = Linear(dims, mlp_dims)
        self.linear2 = Linear(mlp_dims, dims)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)
        self.activation = activation
        self.norm_first = norm_first

    def __call__(self, x, memory, x_mask, memory_mask):
        """Apply the decoder block.

        Args:
            x (array): The target sequence.
            memory (array): The encoder output attended to by the
                cross-attention.
            x_mask (array): The mask for the self-attention over ``x``.
            memory_mask (array): The mask for the cross-attention over
                ``memory``.

        Returns:
            array: The transformed target sequence.
        """
        if self.norm_first:
            y = self.ln1(x)
            y = self.self_attention(y, y, y, x_mask)
            y = self.dropout1(y)
            x = x + y

            y = self.ln2(x)
            y = self.cross_attention(y, memory, memory, memory_mask)
            y = self.dropout2(y)
            x = x + y

            y = self.ln3(x)
            y = self.linear1(y)
            y = self.activation(y)
            y = self.dropout3(y)
            y = self.linear2(y)
            y = x + y

        else:
            y = self.self_attention(x, x, x, x_mask)
            y = self.dropout1(y)
            x = self.ln1(x + y)

            # The cross-attention queries must come from the normalized
            # residual stream ``x``. Querying with ``y`` (the raw
            # self-attention output) would drop the residual connection and
            # the normalization from the query path, unlike the pre-norm
            # branch above.
            y = self.cross_attention(x, memory, memory, memory_mask)
            y = self.dropout2(y)
            x = self.ln2(x + y)

            y = self.linear1(x)
            y = self.activation(y)
            y = self.dropout3(y)
            y = self.linear2(y)
            y = self.ln3(x + y)

        return y


class TransformerDecoder(Module):
    """A stack of :class:`TransformerDecoderLayer` with a final layer norm.

    Args:
        num_layers (int): The number of decoder layers.
        dims (int): The feature dimension.
        num_heads (int): The number of attention heads.
        mlp_dims (int, optional): The hidden dimension of each MLP.
            Default: ``None``.
        dropout (float): Dropout probability of each layer. Default: ``0.0``.
        activation (function): The MLP activation. Default: ``relu``.
        norm_first (bool): Forwarded to every layer. Default: ``True``.
        checkpoint (bool): If ``True`` each layer is wrapped with the
            ``checkpoint`` utility when called. Default: ``False``.
    """

    def __init__(
        self,
        num_layers: int,
        dims: int,
        num_heads: int,
        mlp_dims: Optional[int] = None,
        dropout: float = 0.0,
        activation=relu,
        norm_first: bool = True,
        checkpoint: bool = False,
    ):
        super().__init__()
        self.layers = []
        for _ in range(num_layers):
            self.layers.append(
                TransformerDecoderLayer(
                    dims, num_heads, mlp_dims, dropout, activation, norm_first
                )
            )
        self.ln = LayerNorm(dims)
        self.checkpoint = checkpoint

    def __call__(self, x, memory, x_mask, memory_mask):
        """Pass ``x`` through every layer, then through the final layer norm."""
        for layer in self.layers:
            # ``checkpoint`` here is the module-level utility, not the flag.
            if self.checkpoint:
                layer = checkpoint(layer)
            x = layer(x, memory, x_mask, memory_mask)
        return self.ln(x)


class Transformer(Module):
    """
    Implements a standard Transformer model.

    The implementation is based on `Attention Is All You Need
    <https://arxiv.org/abs/1706.03762>`_.

    The Transformer model contains an encoder and a decoder. The encoder
    processes the input sequence and the decoder generates the output sequence.
    The interaction between encoder and decoder happens through the attention
    mechanism.

    Args:
        dims (int, optional): The number of expected features in the
            encoder/decoder inputs. Default: ``512``.
        num_heads (int, optional): The number of attention heads. Default:
            ``8``.
        num_encoder_layers (int, optional): The number of encoder layers in the
            Transformer encoder. Default: ``6``.
        num_decoder_layers (int, optional): The number of decoder layers in the
            Transformer decoder. Default: ``6``.
        mlp_dims (int, optional): The hidden dimension of the MLP block in each
            Transformer layer. Defaults to ``4*dims`` if not provided. Default:
            ``None``.
        dropout (float, optional): The dropout value for the Transformer
            encoder and decoder. Dropout is used after each attention layer and
            the activation in the MLP layer. Default: ``0.0``.
        activation (function, optional): the activation function for the MLP
            hidden layer. Default: :func:`mlx.nn.relu`.
        custom_encoder (nn.Module, optional): A custom encoder to replace the
            standard Transformer encoder. Default: ``None``.
        custom_decoder (nn.Module, optional): A custom decoder to replace the
            standard Transformer decoder. Default: ``None``.
        norm_first (bool, optional): if ``True``, encoder and decoder layers
            will perform layer normalization before attention and MLP
            operations, otherwise after. Default: ``True``.
        checkpoint (bool, optional): if ``True`` perform gradient checkpointing
            to reduce the memory usage at the expense of more computation.
            Default: ``False``.
    """

    def __init__(
        self,
        dims: int = 512,
        num_heads: int = 8,
        num_encoder_layers: int = 6,
        num_decoder_layers: int = 6,
        mlp_dims: Optional[int] = None,
        dropout: float = 0.0,
        activation: Callable[[Any], Any] = relu,
        custom_encoder: Optional[Any] = None,
        custom_decoder: Optional[Any] = None,
        norm_first: bool = True,
        checkpoint: bool = False,
    ):
        super().__init__()

        # Compare against ``None`` explicitly instead of relying on
        # truthiness: a user-supplied module that happens to evaluate as
        # falsy (for example a container-like module with no entries) must
        # not be silently replaced by the standard implementation.
        if custom_encoder is not None:
            self.encoder = custom_encoder
        else:
            self.encoder = TransformerEncoder(
                num_encoder_layers,
                dims,
                num_heads,
                mlp_dims,
                dropout,
                activation,
                norm_first,
                checkpoint,
            )

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            self.decoder = TransformerDecoder(
                num_decoder_layers,
                dims,
                num_heads,
                mlp_dims,
                dropout,
                activation,
                norm_first,
                checkpoint,
            )

    def __call__(self, src, tgt, src_mask, tgt_mask, memory_mask):
        """Encode ``src`` and decode ``tgt`` against the encoder output.

        Args:
            src (array): The input to the encoder.
            tgt (array): The input to the decoder.
            src_mask (array): The mask passed to the encoder.
            tgt_mask (array): The mask passed to the decoder for ``tgt``.
            memory_mask (array): The mask passed to the decoder for the
                encoder output.

        Returns:
            array: The decoder output.
        """
        memory = self.encoder(src, src_mask)
        return self.decoder(tgt, memory, tgt_mask, memory_mask)


================================================
FILE: python/mlx/nn/layers/upsample.py
================================================
# Copyright © 2023-2024 Apple Inc.

import operator
from functools import partial, reduce
from itertools import product
from typing import Callable, Literal, Tuple, Union

import mlx.core as mx
from mlx.nn.layers.base import Module


def _scaled_indices(N, scale, align_corners, dim, ndims):
    M = int(scale * N)
    if align_corners:
        indices = mx.arange(M, dtype=mx.float32) * ((N - 1) / (M - 1))
    else:
        step = 1 / scale
        start = ((M - 1) * step - N + 1) / 2
        indices = mx.arange(M, dtype=mx.float32) * step - start

    shape = [1] * ndims
    shape[dim] = -1

    return indices.reshape(shape)


def _nearest_indices(N, scale, dim, ndims):
    M = int(scale * N)
    indices = mx.arange(M, dtype=mx.float32)
    if M > N:
        indices = (indices + 0.5) * (N / M) - 0.5
        indices = indices.round()
    else:
        indices = indices * (N / M)
    shape = [1] * ndims
    shape[dim] = -1
    return indices.astype(mx.uint32).reshape(shape)


def _linear_indices(N, scale, align_corners, dim, ndims):
    """Compute the two neighbors and weights for linear interpolation.

    The source positions from ``_scaled_indices`` are clipped to
    ``[0, N - 1]`` and split into the sample below and the sample above each
    position.

    Returns:
        tuple: Two ``(indices, weight)`` pairs, for the lower and the upper
        neighbor. The indices are ``uint32`` and the weights carry an extra
        trailing axis of size 1.
    """
    positions = mx.clip(
        _scaled_indices(N, scale, align_corners, dim, ndims),
        a_min=0,
        a_max=N - 1,
    )
    lower = mx.floor(positions)
    upper = mx.ceil(positions)

    # Fractional distance from the lower neighbor is the upper one's weight.
    frac = mx.expand_dims(positions - lower, -1)

    lower_term = (lower.astype(mx.uint32), 1 - frac)
    upper_term = (upper.astype(mx.uint32), frac)
    return (lower_term, upper_term)


def _cubic_indices(N, scale, align_corners, dim, ndims):
    """Compute the four neighbors and weights for cubic interpolation.

    For every source position from ``_scaled_indices`` this returns the two
    samples on each side together with their cubic convolution weights.

    Returns:
        tuple: Four ``(indices, weight)`` pairs ordered inner-left,
        inner-right, outer-left, outer-right. The indices are ``uint32``,
        clipped to ``[0, N - 1]``, and the weights carry an extra trailing
        axis of size 1.
    """
    indices = _scaled_indices(N, scale, align_corners, dim, ndims)
    # Inner neighbors: the sample at or below the position and the next one.
    indices_l1 = mx.floor(indices)
    indices_r1 = mx.floor(indices + 1)
    # Outer neighbors: one step further out on each side.
    indices_l2 = indices_l1 - 1
    indices_r2 = indices_r1 + 1

    @partial(mx.compile, shapeless=True)
    def _get_weight(ind, grid, dist):
        # PyTorch uses -0.5 for antialiasing=true (compatibility with PIL)
        # and uses -0.75 for antialiasing=false (compatibility with OpenCV)
        a = -0.75
        x = mx.abs(ind - grid)
        # ``dist`` selects which piece of the kernel polynomial to evaluate:
        # 1 for the inner neighbors, anything else for the outer ones.
        if dist == 1:
            weight = ((a + 2.0) * x - (a + 3.0)) * x * x + 1
        else:
            weight = (((x - 5) * x + 8) * x - 4) * a
        return weight

    # The weights are computed from the unclipped neighbor positions.
    weight_l1 = _get_weight(indices, indices_l1, dist=1)[..., None]
    weight_r1 = _get_weight(indices, indices_r1, dist=1)[..., None]
    weight_l2 = _get_weight(indices, indices_l2, dist=2)[..., None]
    weight_r2 = _get_weight(indices, indices_r2, dist=2)[..., None]

    # padding with border value
    indices_l1 = mx.clip(indices_l1, a_min=0, a_max=N - 1)
    indices_r1 = mx.clip(indices_r1, a_min=0, a_max=N - 1)
    indices_l2 = mx.clip(indices_l2, a_min=0, a_max=N - 1)
    indices_r2 = mx.clip(indices_r2, a_min=0, a_max=N - 1)

    return (
        (indices_l1.astype(mx.uint32), weight_l1),
        (indices_r1.astype(mx.uint32), weight_r1),
        (indices_l2.astype(mx.uint32), weight_l2),
        (indices_r2.astype(mx.uint32), weight_r2),
    )


def upsample_nearest(x: mx.array, scale_factor: Tuple):
    """Resample ``x`` along its spatial axes with nearest neighbor sampling.

    Args:
        x (array): The input; the first axis is the batch, the last axis the
            features and everything in between is spatial.
        scale_factor (tuple): One scale per spatial axis.

    Returns:
        array: The resampled array.

    Raises:
        ValueError: If the number of scales differs from the number of
            spatial axes.
    """
    dims = x.ndim - 2
    if dims != len(scale_factor):
        raise ValueError("A scale needs to be provided for each spatial dimension")

    batch, *spatial, channels = x.shape
    factors = tuple(map(int, scale_factor))

    if factors != scale_factor:
        # Fractional scales: gather the nearest source sample per axis.
        gather = (slice(None),) + tuple(
            _nearest_indices(n, s, axis, dims)
            for axis, (n, s) in enumerate(zip(spatial, scale_factor))
        )
        return x[gather]

    # Integer scale_factors means we can simply expand-broadcast and reshape:
    # add a singleton axis after each spatial axis, broadcast it to the scale
    # and fold it back into the spatial axis.
    singleton_shape = [batch]
    repeated_shape = [batch]
    merged_shape = [batch]
    for size, factor in zip(spatial, factors):
        singleton_shape += [size, 1]
        repeated_shape += [size, factor]
        merged_shape.append(size * factor)
    singleton_shape.append(channels)
    repeated_shape.append(channels)
    merged_shape.append(channels)

    x = x.reshape(singleton_shape)
    x = mx.broadcast_to(x, repeated_shape)
    return x.reshape(merged_shape)


def _interpolate(
    x: mx.array, scale_factor: Tuple, indices_fn: Callable, align_corners: bool = False
):
    dims = x.ndim - 2
    if dims != len(scale_factor):
        raise ValueError("A scale needs to be provided for each spatial dimension")

    B, *N, C = x.shape

    # Compute the sampling grid
    indices = []
    for i, (n, s) in enumerate(zip(N, scale_factor)):
        indices.append(indices_fn(n, s, align_corners, i, dims))

    # Sample and compute the weights
    samples = []
    weights = []
    for idx_weight in product(*indices):
        idx, weight = zip(*idx_weight)
        samples.append(x[(slice(None),) + idx])
        weights.append(reduce(operator.mul, weight))

    # Interpolate
    return sum(wi * xi for wi, xi in zip(weights, samples))


def upsample_linear(x: mx.array, scale_factor: Tuple, align_corners: bool = False):
    """Resample ``x`` with linear interpolation along every spatial axis.

    Args:
        x (array): The input to resample.
        scale_factor (tuple): One scale per spatial axis.
        align_corners (bool): How the corner samples are treated.
            Default: ``False``.

    Returns:
        array: The interpolated array.
    """
    return _interpolate(x, scale_factor, _linear_indices, align_corners)


def upsample_cubic(x: mx.array, scale_factor: Tuple, align_corners: bool = False):
    """Resample ``x`` with cubic interpolation along every spatial axis.

    Args:
        x (array): The input to resample.
        scale_factor (tuple): One scale per spatial axis.
        align_corners (bool): How the corner samples are treated.
            Default: ``False``.

    Returns:
        array: The interpolated array.
    """
    return _interpolate(x, scale_factor, _cubic_indices, align_corners)


class Upsample(Module):
    r"""Upsample the input signal spatially.

    By convention the spatial dimensions are dimensions ``1`` to ``x.ndim -
    2``. The first dimension is the batch dimension and the last one is the
    feature dimension.

    For example, an audio signal would be 3D with 1 spatial dimension, an image
    4D with 2 and so on and so forth.

    Three upsampling algorithms are implemented: nearest neighbor upsampling,
    linear interpolation, and cubic interpolation. All of them can be applied
    to any number of spatial dimensions. Linear interpolation is bilinear,
    trilinear etc. when applied to more than one spatial dimension, and cubic
    interpolation is bicubic when there are 2 spatial dimensions.

    .. note::
       When using one of the linear or cubic interpolation modes the ``align_corners``
       argument changes how the corners are treated in the input image. If
       ``align_corners=True`` then the top and left edge of the input and
       output will be matching as will the bottom right edge.

    Parameters:
        scale_factor (float or tuple): The multiplier for the spatial size.
            If a ``float`` is provided, it is the multiplier for all spatial dimensions.
            Otherwise, the number of scale factors provided must match the
            number of spatial dimensions.
        mode (str, optional): The upsampling algorithm, either ``"nearest"``,
            ``"linear"`` or ``"cubic"``. Default: ``"nearest"``.
        align_corners (bool, optional): Changes the way the corners are treated
            during ``"linear"`` and ``"cubic"`` upsampling.  See the note above and the
            examples below for more details.  Default: ``False``.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn as nn
        >>> x = mx.arange(1, 5).reshape((1, 2, 2, 1))
        >>> x
        array([[[[1],
                 [2]],
                [[3],
                 [4]]]], dtype=int32)
        >>> n = nn.Upsample(scale_factor=2, mode='nearest')
        >>> n(x).squeeze()
        array([[1, 1, 2, 2],
               [1, 1, 2, 2],
               [3, 3, 4, 4],
               [3, 3, 4, 4]], dtype=int32)
        >>> b = nn.Upsample(scale_factor=2, mode='linear')
        >>> b(x).squeeze()
        array([[1, 1.25, 1.75, 2],
               [1.5, 1.75, 2.25, 2.5],
               [2.5, 2.75, 3.25, 3.5],
               [3, 3.25, 3.75, 4]], dtype=float32)
        >>> b = nn.Upsample(scale_factor=2, mode='linear', align_corners=True)
        >>> b(x).squeeze()
        array([[1, 1.33333, 1.66667, 2],
               [1.66667, 2, 2.33333, 2.66667],
               [2.33333, 2.66667, 3, 3.33333],
               [3, 3.33333, 3.66667, 4]], dtype=float32)
    """

    def __init__(
        self,
        scale_factor: Union[float, Tuple],
        mode: Literal["nearest", "linear", "cubic"] = "nearest",
        align_corners: bool = False,
    ):
        super().__init__()
        if mode not in ("nearest", "linear", "cubic"):
            raise ValueError(f"[Upsample] Got unsupported upsampling algorithm: {mode}")

        # Normalize the scale to floats: a tuple when given per-axis scales,
        # a single float otherwise.
        per_axis = isinstance(scale_factor, (list, tuple))
        self.scale_factor = (
            tuple(map(float, scale_factor)) if per_axis else float(scale_factor)
        )
        self.mode = mode
        self.align_corners = align_corners

    def _extra_repr(self) -> str:
        """Return the configuration summary shown in the module's repr."""
        return (
            f"scale_factor={self.scale_factor}, mode={self.mode!r}, "
            f"align_corners={self.align_corners}"
        )

    def __call__(self, x: mx.array) -> mx.array:
        """Upsample ``x`` with the configured algorithm.

        Raises:
            ValueError: If ``x`` has no spatial dimension or if the number of
                configured scales differs from the number of spatial
                dimensions.
        """
        dims = x.ndim - 2
        if dims <= 0:
            raise ValueError(
                f"[Upsample] The input should have at least 1 spatial "
                f"dimension which means it should be at least 3D but "
                f"{x.ndim}D was provided"
            )

        scale_factor = self.scale_factor
        if not isinstance(scale_factor, tuple):
            # A scalar scale applies to every spatial dimension.
            scale_factor = (scale_factor,) * dims
        elif len(scale_factor) != dims:
            raise ValueError(
                f"[Upsample] One scale per spatial dimension is required but "
                f"scale_factor={scale_factor} and the number of spatial "
                f"dimensions were {dims}"
            )

        mode = self.mode
        if mode == "nearest":
            return upsample_nearest(x, scale_factor)
        if mode == "linear":
            return upsample_linear(x, scale_factor, self.align_corners)
        if mode == "cubic":
            return upsample_cubic(x, scale_factor, self.align_corners)
        raise Exception(f"Unknown interpolation mode: {self.mode}")


================================================
FILE: python/mlx/nn/losses.py
================================================
# Copyright © 2023 Apple Inc.

import math
from typing import Literal, Optional, get_args

import mlx.core as mx

# The reduction names accepted by the loss functions; ``_reduce`` validates
# its ``reduction`` argument against the members of this literal type.
Reduction = Literal["none", "mean", "sum"]


def _reduce(loss: mx.array, reduction: Reduction = "none"):
    """Apply the requested reduction to an array of loss values.

    Args:
        loss (array): The unreduced loss.
        reduction (str): One of ``'none'``, ``'mean'`` or ``'sum'``.
            Default: ``'none'``.

    Returns:
        array: ``loss`` itself for ``'none'``, otherwise its mean or sum.

    Raises:
        ValueError: If ``reduction`` is not one of the supported names.
    """
    if reduction not in get_args(Reduction):
        raise ValueError(f"Invalid reduction. Must be one of {get_args(Reduction)}.")

    if reduction == "none":
        return loss
    if reduction == "sum":
        return mx.sum(loss)
    # Only "mean" remains after the validation above.
    return mx.mean(loss)


def cross_entropy(
    logits: mx.array,
    targets: mx.array,
    weights: Optional[mx.array] = None,
    axis: int = -1,
    label_smoothing: float = 0.0,
    reduction: Reduction = "none",
) -> mx.array:
    """
    Computes the cross entropy loss.

    Args:
        logits (array): The unnormalized logits.
        targets (array): The ground truth values, given either as class
            indices or as per-class probabilities. Class indices must have
            the shape of ``logits`` with the ``axis`` dimension removed.
            Probabilities (or one-hot encodings) must have the same shape as
            ``logits``.
        weights (array, optional): Optional weights for each target. Default: ``None``.
        axis (int, optional): The axis over which to compute softmax. Default: ``-1``.
        label_smoothing (float, optional): Label smoothing factor. Default: ``0``.
        reduction (str, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: The computed cross entropy loss.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn as nn
        >>>
        >>> # Class indices as targets
        >>> logits = mx.array([[2.0, -1.0], [-1.0, 2.0]])
        >>> targets = mx.array([0, 1])
        >>> nn.losses.cross_entropy(logits, targets)
        array([0.0485873, 0.0485873], dtype=float32)
        >>>
        >>> # Probabilities (or one-hot vectors) as targets
        >>> logits = mx.array([[2.0, -1.0], [-1.0, 2.0]])
        >>> targets = mx.array([[0.9, 0.1], [0.1, 0.9]])
        >>> nn.losses.cross_entropy(logits, targets)
        array([0.348587, 0.348587], dtype=float32)
    """
    if label_smoothing < 0 or label_smoothing >= 1:
        raise ValueError(f"Label smoothing must in [0, 1), got {label_smoothing}.")

    # Targets with as many dimensions as the logits are probabilities,
    # otherwise they are class indices.
    targets_as_probs = targets.ndim == logits.ndim

    # Work out the shape the targets must have in either case.
    if targets_as_probs:
        expected_shape = logits.shape
    else:
        remaining = list(logits.shape)
        remaining.pop(axis)
        expected_shape = tuple(remaining)

    if targets.shape != expected_shape:
        raise ValueError(
            f"Targets shape {targets.shape} does not match logits shape {logits.shape}."
        )

    # Logit mass assigned to the target distribution.
    if targets_as_probs:
        score = mx.sum(logits * targets, axis=axis)
    else:
        picked = mx.take_along_axis(logits, mx.expand_dims(targets, axis), axis)
        score = picked.squeeze(axis)

    logsumexp_logits = mx.logsumexp(logits, axis=axis)
    if label_smoothing > 0:
        # Scale down the target score and mix in the mean logit over the
        # classes in proportion to the smoothing factor.
        adjusted_score = (1 - label_smoothing) * score
        smoothed_loss = -logits.mean(axis=axis) * label_smoothing
        loss = logsumexp_logits - adjusted_score + smoothed_loss
    else:
        loss = logsumexp_logits - score

    # Weights, when given, must match the unreduced loss elementwise.
    if weights is not None:
        if weights.shape != loss.shape:
            raise ValueError(
                f"Weights with shape {weights.shape} is not the same as "
                f"output loss with shape {loss.shape}."
            )
        loss *= weights

    return _reduce(loss, reduction)


def binary_cross_entropy(
    inputs: mx.array,
    targets: mx.array,
    weights: Optional[mx.array] = None,
    with_logits: bool = True,
    reduction: Reduction = "mean",
) -> mx.array:
    """
    Computes the binary cross entropy loss.

    By default, this function takes the pre-sigmoid logits, which results in a faster
    and more precise loss. For improved numerical stability when ``with_logits=False``,
    the loss calculation clips the input probabilities (in log-space) to a minimum value
    of ``-100``.

    Args:
        inputs (array): The predicted values. If ``with_logits`` is ``True``, then
            ``inputs`` are unnormalized logits. Otherwise, ``inputs`` are probabilities.
        targets (array): The binary target values in {0, 1}.
        weights (array, optional): Optional weights for each target. Default: ``None``.
        with_logits (bool, optional): Whether ``inputs`` are logits. Default: ``True``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'``.

    Returns:
        array: The computed binary cross entropy loss.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn as nn

        >>> logits = mx.array([0.105361, 0.223144, 1.20397, 0.916291])
        >>> targets = mx.array([0, 0, 1, 1])
        >>> loss = nn.losses.binary_cross_entropy(logits, targets, reduction="mean")
        >>> loss
        array(0.539245, dtype=float32)

        >>> probs = mx.array([0.1, 0.1, 0.4, 0.4])
        >>> targets = mx.array([0, 0, 1, 1])
        >>> loss = nn.losses.binary_cross_entropy(probs, targets, with_logits=False, reduction="mean")
        >>> loss
        array(0.510826, dtype=float32)
    """
    if inputs.shape != targets.shape:
        raise ValueError(
            f"Inputs shape {inputs.shape} does not match targets shape {targets.shape}."
        )

    if not with_logits:
        # Probabilities: clip both log terms from below at -100 so that an
        # input of exactly 0 or 1 does not produce an infinite loss.
        log_p = mx.clip(mx.log(inputs), a_min=-100, a_max=None)
        log_not_p = mx.clip(mx.log(1 - inputs), a_min=-100, a_max=None)
        loss = -(targets * log_p + (1 - targets) * log_not_p)
    else:
        # Logits: evaluate log(1 + exp(x)) - x * t directly.
        loss = mx.logaddexp(0.0, inputs) - inputs * targets

    # Weights, when given, must match the unreduced loss elementwise.
    if weights is not None:
        if weights.shape != loss.shape:
            raise ValueError(
                f"Weights with shape {weights.shape} is not the same as "
                f"output loss with shape {loss.shape}."
            )
        loss *= weights

    return _reduce(loss, reduction)


def l1_loss(
    predictions: mx.array, targets: mx.array, reduction: Reduction = "mean"
) -> mx.array:
    """
    Computes the L1 loss, i.e. the element-wise absolute difference between
    the predictions and the targets.

    Args:
        predictions (array): The predicted values.
        targets (array): The target values. Must have the same shape as
          ``predictions``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'``.

    Returns:
        array: The computed L1 loss.

    Raises:
        ValueError: If ``predictions`` and ``targets`` have different shapes.
    """
    shapes_agree = predictions.shape == targets.shape
    if not shapes_agree:
        raise ValueError(
            f"Predictions shape {predictions.shape} does not match "
            f"targets shape {targets.shape}."
        )

    absolute_error = mx.abs(predictions - targets)
    return _reduce(absolute_error, reduction)


def mse_loss(
    predictions: mx.array, targets: mx.array, reduction: Reduction = "mean"
) -> mx.array:
    """
    Computes the mean squared error loss, i.e. the element-wise squared
    difference between the predictions and the targets.

    Args:
        predictions (array): The predicted values.
        targets (array): The target values. Must have the same shape as
          ``predictions``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'``.

    Returns:
        array: The computed mean squared error loss.

    Raises:
        ValueError: If ``predictions`` and ``targets`` have different shapes.
    """
    if not (predictions.shape == targets.shape):
        raise ValueError(
            f"Predictions shape {predictions.shape} does not match "
            f"targets shape {targets.shape}."
        )

    residual = predictions - targets
    return _reduce(mx.square(residual), reduction)


def nll_loss(
    inputs: mx.array, targets: mx.array, axis: int = -1, reduction: Reduction = "none"
) -> mx.array:
    """
    Computes the negative log likelihood loss.

    For every target index the loss is the negated entry of ``inputs``
    selected along ``axis``.

    Args:
        inputs (array): The predicted distribution in log space.
        targets (array): The target values, given as indices into ``axis`` of
          ``inputs``. It should have the shape of ``inputs`` with ``axis``
          removed.
        axis (int, optional): The distribution axis. Default: ``-1``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: The computed NLL loss.
    """
    # The singleton index dimension has to sit at ``axis`` (not necessarily
    # last) so that the indices line up with ``inputs`` for any distribution
    # axis. For the default ``axis=-1`` this is the same as
    # ``targets[..., None]`` followed by ``squeeze(-1)``.
    indices = mx.expand_dims(targets, axis)
    loss = -mx.take_along_axis(inputs, indices, axis).squeeze(axis)

    return _reduce(loss, reduction)


def gaussian_nll_loss(
    inputs: mx.array,
    targets: mx.array,
    vars: mx.array,
    full: bool = False,
    eps: float = 1e-6,
    reduction: Reduction = "mean",
) -> mx.array:
    r"""
    Computes the negative log likelihood loss for a Gaussian distribution.

    The loss is given by:

    .. math::
        \frac{1}{2}\left(\log\left(\max\left(\text{vars},
        \ \epsilon\right)\right) + \frac{\left(\text{inputs} - \text{targets} \right)^2}
        {\max\left(\text{vars}, \ \epsilon \right)}\right) + \text{const.}

    where ``inputs`` are the predicted means and ``vars`` are the
    predicted variances.

    Args:
        inputs (array): The predicted expectation of the Gaussian distribution.
        targets (array): The target values (samples from the Gaussian distribution).
        vars (array): The predicted variance of the Gaussian distribution.
        full (bool, optional): Whether to include the constant term in the loss calculation.
            Default: ``False``.
        eps (float, optional): Small positive constant for numerical stability.
            Default: ``1e-6``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'``.

    Returns:
        array: The Gaussian NLL loss.

    Raises:
        ValueError: If ``targets`` or ``vars`` do not have the same shape as
            ``inputs``.
    """
    expected_shape = inputs.shape

    if targets.shape != expected_shape:
        raise ValueError(
            f"Inputs shape {inputs.shape} does not match targets shape {targets.shape}."
        )

    if vars.shape != expected_shape:
        raise ValueError(
            f"Inputs shape {inputs.shape} does not match vars shape {vars.shape}."
        )

    # Clamp the variance from below so that neither the log nor the division
    # blows up for tiny predicted variances.
    vars = mx.maximum(vars, eps)
    squared_error = mx.square(targets - inputs)
    loss = 0.5 * (mx.log(vars) + squared_error / vars)

    if full:
        # Constant term 0.5 * log(2 * pi) of the Gaussian log density.
        loss += 0.5 * math.log(2 * math.pi)

    return _reduce(loss, reduction)


def kl_div_loss(
    inputs: mx.array, targets: mx.array, axis: int = -1, reduction: Reduction = "none"
) -> mx.array:
    """
    Computes the Kullback-Leibler divergence loss.

    Computes the following when ``reduction == 'none'``:

    .. code-block:: python

        mx.sum(mx.exp(targets) * (targets - inputs), axis)

    Args:
        inputs (array): Log probabilities for the predicted distribution.
        targets (array): Log probabilities for the target distribution.
        axis (int, optional): The distribution axis. Default: ``-1``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: The computed Kullback-Leibler divergence loss.
    """
    # Both arguments are log probabilities, so the target probabilities are
    # recovered with an exp and the log-ratio is a plain difference.
    target_probs = mx.exp(targets)
    pointwise = target_probs * (targets - inputs)

    return _reduce(mx.sum(pointwise, axis), reduction)


def smooth_l1_loss(
    predictions: mx.array,
    targets: mx.array,
    beta: float = 1.0,
    reduction: Reduction = "mean",
) -> mx.array:
    r"""
    Computes the smooth L1 loss.

    The smooth L1 loss is a variant of the L1 loss which replaces the absolute
    difference with a squared difference when the absolute difference is less
    than ``beta``.

    The formula for the smooth L1 Loss is:

    .. math::

      l = \begin{cases}
            0.5 (x - y)^2 / \beta, & \text{if } |x - y| < \beta \\
            |x - y| - 0.5 \beta, & \text{otherwise}
          \end{cases}

    Args:
        predictions (array): Predicted values.
        targets (array): Ground truth values.
        beta (float, optional): The threshold after which the loss changes
          from the squared to the absolute difference. Default: ``1.0``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'``.

    Returns:
        array: The computed smooth L1 loss.

    Raises:
        ValueError: If ``predictions`` and ``targets`` have different shapes.
    """
    if predictions.shape != targets.shape:
        raise ValueError(
            f"Predictions shape {predictions.shape} does not match "
            f"targets shape {targets.shape}."
        )

    # ``diff`` is already an absolute value, so the linear branch can use it
    # directly without taking the absolute value a second time.
    diff = mx.abs(predictions - targets)
    loss = mx.where(diff < beta, 0.5 * mx.square(diff) / beta, diff - 0.5 * beta)

    return _reduce(loss, reduction)


def triplet_loss(
    anchors: mx.array,
    positives: mx.array,
    negatives: mx.array,
    axis: int = -1,
    p: int = 2,
    margin: float = 1.0,
    eps: float = 1e-6,
    reduction: Reduction = "none",
) -> mx.array:
    r"""
    Computes the triplet loss for a set of anchor, positive, and negative samples.
    Margin is represented with alpha in the math section.

    .. math::

       \max\left(\|A - P\|_p - \|A - N\|_p + \alpha, 0\right)

    Args:
        anchors (array): The anchor samples.
        positives (array): The positive samples.
        negatives (array): The negative samples.
        axis (int, optional): The distribution axis. Default: ``-1``.
        p (int, optional): The norm degree for pairwise distance. Must be
          positive. Default: ``2``.
        margin (float, optional): Margin for the triplet loss. Defaults to ``1.0``.
        eps (float, optional): Small positive constant to prevent numerical instability. Defaults to ``1e-6``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: Computed triplet loss. The pairwise distances are reduced over
        ``axis``, so with ``reduction='none'`` the result has the shape of the
        inputs with ``axis`` removed.

    Raises:
        ValueError: If ``p`` is not positive.
    """
    if p <= 0:
        raise ValueError(f"The norm degree p must be positive, got {p}.")

    def pairwise_distance(a, b):
        # ``eps`` is added under the root to keep it away from zero when the
        # two samples coincide.
        if p == 2:
            # Default case, kept exactly as the Euclidean distance.
            return mx.sqrt(mx.power(a - b, p).sum(axis) + eps)
        # General p-norm: (sum |a - b|^p)^(1/p). The absolute value keeps odd
        # degrees from producing negative sums and the root must match ``p``.
        return mx.power(mx.power(mx.abs(a - b), p).sum(axis) + eps, 1.0 / p)

    loss = mx.maximum(
        pairwise_distance(anchors, positives)
        - pairwise_distance(anchors, negatives)
        + margin,
        0,
    )
    return _reduce(loss, reduction)


def hinge_loss(
    inputs: mx.array, targets: mx.array, reduction: Reduction = "none"
) -> mx.array:
    r"""
    Computes the hinge loss between inputs and targets.

    .. math::

       \text{hinge}(y, y_{\text{pred}}) = \max(0, 1 - y \cdot y_{\text{pred}})


    Args:
        inputs (array): The predicted values.
        targets (array): The target values. They should be -1 or 1.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: The computed hinge loss.
    """
    # Positive whenever the prediction is on the wrong side of (or too close
    # to) the decision boundary; clamped at zero otherwise.
    margin_violation = 1 - inputs * targets

    return _reduce(mx.maximum(margin_violation, 0), reduction)


def huber_loss(
    inputs: mx.array,
    targets: mx.array,
    delta: float = 1.0,
    reduction: Reduction = "none",
) -> mx.array:
    r"""
    Computes the Huber loss between inputs and targets.

    .. math::

        l_{\delta}(a) =
        \left\{ \begin{array}{ll}
            \frac{1}{2} a^2 & \text{for } |a| \leq \delta, \\
            \delta \left( |a| - \frac{1}{2} \delta \right) & \text{otherwise.}
        \end{array} \right.

    Args:
        inputs (array): The predicted values.
        targets (array): The target values.
        delta (float, optional): The threshold at which to change between L1 and L2 loss.
          Default: ``1.0``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: The computed Huber loss.
    """
    magnitude = mx.abs(inputs - targets)

    # Split |a| into the part below ``delta`` (penalized quadratically) and
    # the excess above it (penalized linearly); this evaluates both cases of
    # the piecewise definition without branching.
    within_delta = mx.minimum(magnitude, delta)
    beyond_delta = magnitude - within_delta
    loss = 0.5 * within_delta**2 + delta * beyond_delta

    return _reduce(loss, reduction)


def log_cosh_loss(
    inputs: mx.array, targets: mx.array, reduction: Reduction = "none"
) -> mx.array:
    r"""
    Computes the log cosh loss between inputs and targets.

    Logcosh acts like L2 loss for small errors, ensuring stable gradients,
    and like the L1 loss for large errors, reducing sensitivity to outliers. This
    dual behavior offers a balanced, robust approach for regression tasks.

    .. math::

       \text{logcosh}(y_{\text{true}}, y_{\text{pred}}) =
            \frac{1}{n} \sum_{i=1}^{n}
            \log(\cosh(y_{\text{pred}}^{(i)} - y_{\text{true}}^{(i)}))


    Args:
        inputs (array): The predicted values.
        targets (array): The target values.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: The computed log cosh loss.
    """
    residual = inputs - targets

    # log(cosh(x)) = log((e^x + e^-x) / 2) = logaddexp(x, -x) - log(2), which
    # avoids evaluating cosh directly.
    log_two = math.log(2)
    loss = mx.logaddexp(residual, -residual) - log_two

    return _reduce(loss, reduction)


def cosine_similarity_loss(
    x1: mx.array,
    x2: mx.array,
    axis: int = 1,
    eps: float = 1e-8,
    reduction: Reduction = "none",
) -> mx.array:
    r"""
    Computes the cosine similarity between the two inputs.

    The cosine similarity loss is given by

    .. math::

        \frac{x_1 \cdot x_2}{\max(\|x_1\|  \cdot \|x_2\|, \epsilon)}

    Args:
        x1 (mx.array): The first set of inputs.
        x2 (mx.array): The second set of inputs.
        axis (int, optional): The embedding axis. Default: ``1``.
        eps (float, optional): The minimum value of the denominator used for
          numerical stability. Default: ``1e-8``.
        reduction (str, optional): Specifies the reduction to apply to the output:
          ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        mx.array: The computed cosine similarity loss.
    """
    # Product of the two norms along the embedding axis, floored at ``eps``
    # so that the division below is well defined.
    norm_product = mx.linalg.norm(x1, axis=axis) * mx.linalg.norm(x2, axis=axis)
    denominator = mx.maximum(norm_product, eps)

    dot_product = mx.sum(x1 * x2, axis=axis)

    return _reduce(dot_product / denominator, reduction)


def margin_ranking_loss(
    inputs1: mx.array,
    inputs2: mx.array,
    targets: mx.array,
    margin: float = 0.0,
    reduction: Reduction = "none",
) -> mx.array:
    r"""
    Calculate the margin ranking loss given inputs :math:`x_1`, :math:`x_2` and a label
    :math:`y` (containing 1 or -1).

    The loss is given by:

    .. math::
        \text{loss} = \max (0, -y * (x_1 - x_2) + \text{margin})

    Where :math:`y` represents ``targets``, :math:`x_1` represents ``inputs1`` and :math:`x_2`
    represents ``inputs2``.

    Args:
        inputs1 (array): Scores for the first input.
        inputs2 (array): Scores for the second input.
        targets (array): Labels indicating whether samples in ``inputs1`` should be ranked higher
            than samples in ``inputs2``. Values should be 1 or -1.
        margin (float, optional): The margin by which the scores should be separated.
            Default: ``0.0``.
        reduction (str, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'none'``.

    Returns:
        array: The computed margin ranking loss.

    Raises:
        ValueError: If ``inputs1``, ``inputs2`` and ``targets`` do not all
            have the same shape.

    Examples:
        >>> import mlx.core as mx
        >>> import mlx.nn as nn
        >>> targets = mx.array([1, 1, -1])
        >>> inputs1 = mx.array([-0.573409, -0.765166, -0.0638])
        >>> inputs2 = mx.array([0.75596, 0.225763, 0.256995])
        >>> loss = nn.losses.margin_ranking_loss(
        ...     inputs1, inputs2, targets, reduction="mean"
        ... )
        >>> loss
        array(0.773433, dtype=float32)
    """
    if inputs1.shape != inputs2.shape or inputs2.shape != targets.shape:
        raise ValueError(
            f"The shapes of the arguments do not match. The provided shapes are "
            f"inputs1.shape={inputs1.shape}, inputs2.shape={inputs2.shape}, and "
            f"targets.shape={targets.shape}."
        )

    # Positive when the pair is ranked against the label (or within the
    # margin of being so); clamped at zero otherwise.
    ranking_violation = -targets * (inputs1 - inputs2) + margin

    return _reduce(mx.maximum(0, ranking_violation), reduction)


================================================
FILE: python/mlx/nn/utils.py
================================================
# Copyright © 2023-2024 Apple Inc.

from functools import reduce, wraps
from typing import Any, Callable, Optional

import mlx.core as mx

from ..utils import tree_flatten, tree_map, tree_reduce, tree_unflatten
from .layers.base import Module


def value_and_grad(model: Module, fn: Callable):
    """Build a function that evaluates ``fn`` and also returns the gradients
    of ``fn`` with respect to the trainable parameters of ``model``.

    Args:
        model (mlx.nn.Module): The model whose trainable parameters to compute
                               gradients for
        fn (Callable): The scalar function to compute gradients for

    Returns:
        A callable that returns the value of ``fn`` and the gradients wrt the
        trainable parameters of ``model``
    """

    def fn_of_params(params, *args, **kwargs):
        # Load the given parameters into the model so that ``fn``, which
        # reads them through the model, is evaluated as a function of them.
        model.update(params)
        return fn(*args, **kwargs)

    differentiated = mx.value_and_grad(fn_of_params)

    @wraps(fn)
    def wrapped_value_grad_fn(*args, **kwargs):
        # The current trainable parameters are always passed as the first
        # argument; the caller only supplies the arguments of ``fn``.
        trainable = model.trainable_parameters()
        value, grad = differentiated(trainable, *args, **kwargs)
        return value, grad

    return wrapped_value_grad_fn


def checkpoint(module: Module, fn: Optional[Callable] = None):
    """Wrap a callable so that it performs gradient checkpointing with
    respect to the trainable parameters of ``module`` (and the callable's
    inputs).

    Args:
        module (mlx.nn.Module): The module for whose parameters we will be
            performing gradient checkpointing.
        fn (Callable, optional): The function to checkpoint. If not provided it
            defaults to the provided module.

    Returns:
        A callable that saves the inputs and outputs during the forward pass
        and recomputes all intermediate states during the backward pass.
    """
    # Use the module object itself rather than ``module.__call__`` as the
    # default, so that a ``__call__`` monkey-patched later on is still the
    # one that gets invoked.
    fn = module if fn is None else fn

    def fn_of_params(params, *args, **kwargs):
        # Make the parameters an explicit input of the checkpointed function.
        module.update(params)
        return fn(*args, **kwargs)

    checkpointed = mx.checkpoint(fn_of_params)

    @wraps(fn)
    def wrapped_checkpointed_fn(*args, **kwargs):
        trainable = module.trainable_parameters()
        return checkpointed(trainable, *args, **kwargs)

    return wrapped_checkpointed_fn


def _extract_info(flat):
    keys = [k for k, _ in flat]
    shapes = [g.shape for _, g in flat]
    sizes = [g.size for _, g in flat]
    dtypes = [g.dtype for _, g in flat]
    return keys, shapes, sizes, dtypes


def _group_by_size(keys, sizes, itemsize, communication_size):
    grad_groups = []
    grad_group = []
    grad_group_size = 0
    for i in range(len(keys)):
        grad_group.append(i)
        grad_group_size += sizes[i] * itemsize
        if grad_group_size >= communication_size:
            grad_groups.append(grad_group)
            grad_group = []
            grad_group_size = 0
    if grad_group:
        grad_groups.append(grad_group)
        grad_group = []
    return grad_groups


def average_gradients(
    gradients: Any,
    group: Optional[mx.distributed.Group] = None,
    all_reduce_size: int = 32 * 1024**2,
    communication_stream: Optional[mx.Stream] = None,
):
    """Average the gradients across the distributed processes in the passed group.

    This helper enables concatenating several gradients of small arrays to one
    big all reduce call for better networking performance.

    Args:
        gradients (Any): The Python tree containing the gradients (it should
            have the same structure across processes)
        group (Optional[mlx.core.distributed.Group]): The group of processes to
            average the gradients. If set to ``None`` the global group is used.
            Default: ``None``.
        all_reduce_size (int): Group arrays until their size in bytes exceeds
            this number. Perform one communication step per group of arrays. If
            less or equal to 0 array grouping is disabled. Default: ``32MiB``.
        communication_stream (Optional[mlx.core.Stream]): The stream to use
            for the communication. If unspecified the default communication
            stream is used which can vary by back-end. Default: ``None``.

    Returns:
        The averaged gradients. When the group has a single process, or the
        tree holds no arrays, ``gradients`` is returned unchanged.
    """
    group = group or mx.distributed.init()
    N = group.size()

    if N == 1:
        return gradients

    if all_reduce_size <= 0:
        # Grouping disabled: one all-reduce per array.
        return tree_map(
            lambda x: mx.distributed.all_sum(
                x,
                group=group,
                stream=communication_stream,
            )
            / N,
            gradients,
        )

    else:
        flat_grads = tree_flatten(gradients)
        if len(flat_grads) == 0:
            return gradients

        # Extract some info for the gradient
        keys, shapes, sizes, dtypes = _extract_info(flat_grads)

        # We can't group them if they have mixed types. Fall back to the
        # ungrouped path, keeping the caller's communication stream.
        if not all(dt == dtypes[0] for dt in dtypes):
            return average_gradients(
                gradients, group, 0, communication_stream=communication_stream
            )
        # Gather the gradients in groups that are just above or equal to all_reduce_size
        grad_groups = _group_by_size(keys, sizes, dtypes[0].size, all_reduce_size)

        # Concatenate-reduce-split
        new_flat_grads = []
        for grad_group in grad_groups:
            # Running offsets of each array inside the concatenated buffer:
            # [0, size_0, size_0 + size_1, ...].
            indices = reduce(lambda x, y: x + [x[-1] + sizes[y]], grad_group, [0])
            big_grad = mx.concatenate(
                [flat_grads[i][1].reshape(-1) for i in grad_group]
            )
            big_grad = (
                mx.distributed.all_sum(
                    big_grad, stream=communication_stream, group=group
                )
                / N
            )
            # Split at the interior offsets only (drop the leading 0 and the
            # total length) to recover one flat piece per array.
            big_grad = mx.split(big_grad, indices[1:-1])
            new_flat_grads.extend(
                (keys[j], big_grad[i].reshape(shapes[j]))
                for i, j in enumerate(grad_group)
            )

        return tree_unflatten(new_flat_grads)


def _clip_grads_fsdp(grads_slice, max_norm, group=None):
    """Clip a tree of gradient shards by the norm taken over all shards.

    The squared norm of the local shards is summed across ``group`` to get
    the norm over every shard, and all local shards are then scaled by
    ``min(max_norm / (norm + 1e-6), 1)``.

    Args:
        grads_slice: The Python tree holding the local gradient shards.
        max_norm: The maximum allowed norm.
        group: The distributed group passed to ``all_sum``. Default: ``None``.

    Returns:
        A tuple ``(clipped_grads, grad_norm)`` where ``grad_norm`` is the
        norm before clipping.
    """

    def add_squared_norm(total, g):
        return total + g.square().sum()

    local_sq_norm = tree_reduce(add_squared_norm, grads_slice, 0.0)
    grad_norm = mx.sqrt(mx.distributed.all_sum(local_sq_norm, group=group))

    # Never scale up: the factor is capped at 1. The small constant guards
    # against a zero norm.
    scale = mx.minimum(max_norm / (grad_norm + 1e-6), 1.0)
    clipped = tree_map(lambda g: g * scale, grads_slice)

    return clipped, grad_norm


def fsdp_apply_gradients(
    gradients,
    parameters,
    optimizer,
    fsdp_group=None,
    dp_group=None,
    communication_size=32 * 1024**2,
    communication_stream=None,
    max_norm=None,
):
    """Perform a distributed optimizer step by sharding gradients and optimizer states across ranks.

    This helper function performs the following steps:

    1. Reduce-scatter the gradients across ranks so each rank gets a shard of the averaged gradients.
    2. Optionally clip the sharded gradients by global norm.
    3. Apply the optimizer update on the local parameter slice using the sharded gradients.
    4. All-gather the updated parameter slices from all ranks to reconstruct the full parameters tree.

    This is similar to PyTorch's FSDP with `reshard_after_forward=False`.

    Args:
        gradients (Any): The Python tree containing the full gradients (it should
            have the same structure as ``parameters``). Each gradient's first
            dimension must be divisible by ``fsdp_group.size()``.
        parameters (Any): The Python tree containing the full parameters (it should
            have the same structure across processes). Each parameter's first
            dimension must be divisible by ``fsdp_group.size()``.
        optimizer: Optimizer with an ``apply_gradients`` method.
        fsdp_group (Optional[mlx.core.distributed.Group]): The group of processes
            for FSDP sharding. If ``None``, the global group is used.
        dp_group (Optional[mlx.core.distributed.Group]): The group of processes
            for data-parallel gradient averaging. Required when ``fsdp_group`` is
            smaller than the world (e.g. FSDP intra-node, DDP inter-node).
            Default: ``None``.
        communication_size (int): Group arrays until their size in bytes exceeds
            this number. Perform one communication step per group of arrays. If
            less or equal to 0 array grouping is disabled. Default: ``32MiB``.
        communication_stream (Optional[mlx.core.Stream]): The stream to use
            for the communication. If unspecified the default communication
            stream is used which can vary by back-end. Default: ``None``.
        max_norm (Optional[float]): If provided, clip gradients to this
            maximum global norm before applying the optimizer update.
            Default: ``None``.

    Returns:
        If ``max_norm`` is ``None``, returns the updated full-parameter tree.
        Otherwise returns ``(parameters, grad_norm)``, where ``grad_norm`` is
        the global gradient norm before clipping.

    Example:

        >>> optimizer = optim.SGD(learning_rate=0.01)
        >>> # Without gradient clipping
        >>> updated_params = fsdp_apply_gradients(grads, params, optimizer)
        >>> model.update(updated_params)
        >>>
        >>> # With gradient clipping
        >>> updated_params, grad_norm = fsdp_apply_gradients(
        ...     grads, params, optimizer, max_norm=1.0
        ... )
        >>> model.update(updated_params)
    """
    fsdp_group = fsdp_group or mx.distributed.init()
    # Total number of processes the gradients are averaged over.
    N = fsdp_group.size() * (dp_group.size() if dp_group is not None else 1)

    if N == 1:
        # Nothing to shard or average: plain (optionally clipped) update.
        if max_norm is not None:
            # Reduce the norm over the caller's FSDP group rather than the
            # default group, which may span more processes than this step.
            gradients, grad_norm = _clip_grads_fsdp(
                gradients, max_norm, group=fsdp_group
            )
            return optimizer.apply_gradients(gradients, parameters), grad_norm
        return optimizer.apply_gradients(gradients, parameters)

    flat_grads = tree_flatten(gradients)
    flat_params = tree_flatten(parameters)

    keys, shapes, sizes, dtypes = _extract_info(flat_grads)
    # The byte size of the first gradient's dtype is used for all arrays when
    # forming the communication groups.
    itemsize = dtypes[0].size

    groups = _group_by_size(keys, sizes, itemsize, communication_size)

    S = fsdp_group.size()
    fsdp_rank = fsdp_group.rank()
    # reduce-scatter gradients, shard parameters
    grad_slices = {}
    param_slices = {}
    for group_idx, arr_group in enumerate(groups):
        # View every array as S rows and lay the arrays of the group side by
        # side, so that row r holds rank r's shard of each of them.
        big_grad = mx.concatenate(
            [flat_grads[i][1].reshape(S, -1) for i in arr_group], axis=1
        )
        grad_slices[group_idx] = (
            mx.distributed.sum_scatter(
                big_grad, group=fsdp_group, stream=communication_stream
            )
            / N
        )
        if dp_group is not None:
            # The shard was already divided by N above, so summing over the
            # data-parallel group completes the average.
            grad_slices[group_idx] = mx.distributed.all_sum(
                grad_slices[group_idx], group=dp_group, stream=communication_stream
            )
        big_param = mx.concatenate(
            [flat_params[i][1].reshape(S, -1) for i in arr_group], axis=1
        )
        param_slices[group_idx] = big_param[fsdp_rank]

    # clip gradients if needed
    grad_norm = None
    if max_norm is not None:
        grad_slices, grad_norm = _clip_grads_fsdp(
            grad_slices, max_norm, group=fsdp_group
        )

    # optimizer step
    updated_param_slices = optimizer.apply_gradients(grad_slices, param_slices)

    # all-gather and reconstruct
    new_flat = []
    for group_idx, arr_group in enumerate(groups):
        big_gathered = mx.distributed.all_gather(
            updated_param_slices[group_idx],
            group=fsdp_group,
            stream=communication_stream,
        )
        # Column widths of each array inside a row, and their running sums
        # which are the split points along axis 1.
        split_sizes = [sizes[i] // S for i in arr_group]
        split_indices = []
        acc = 0
        for s in split_sizes:
            acc += s
            split_indices.append(acc)

        # The last running sum is the full width and is not a split point.
        parts = mx.split(big_gathered, split_indices[:-1], axis=1)
        for idx_in_group, i in enumerate(arr_group):
            new_flat.append((keys[i], parts[idx_in_group].reshape(shapes[i])))

    result = tree_unflatten(new_flat)
    if max_norm is not None:
        return result, grad_norm
    return result


================================================
FILE: python/mlx/optimizers/__init__.py
================================================
# Copyright © 2023-2024 Apple Inc.

from mlx.optimizers.optimizers import *
from mlx.optimizers.schedulers import *


================================================
FILE: python/mlx/optimizers/optimizers.py
================================================
# Copyright © 2023-2024 Apple Inc.

from typing import Callable, List, Optional, Tuple, Union

import mlx.core as mx
from mlx.nn import Module
from mlx.utils import tree_flatten, tree_map, tree_merge, tree_reduce, tree_unflatten


class Optimizer:
    """The base class for all optimizers.

    Derived classes describe how a single parameter is initialized and
    updated (:meth:`init_single` and :meth:`apply_single`); this class takes
    care of applying that to a whole parameter tree.
    """

    def __init__(self, schedulers=None):
        self._initialized = False
        self._state = {"step": mx.array(0, mx.uint64)}
        # Shallow copy, so the mapping passed by the caller is never modified.
        self._schedulers = dict((schedulers or {}).items())

    def update(self, model: Module, gradients: dict):
        """Apply the gradients to the parameters of the model and update the
        model with the new parameters.

        Args:
            model (mlx.nn.Module): An mlx module to be updated.
            gradients (dict): A Python tree of gradients, most likely computed
                              via :func:`mlx.nn.value_and_grad`.
        """
        new_parameters = self.apply_gradients(gradients, model)
        model.update(new_parameters)

    def init(self, parameters: dict):
        """Initialize the optimizer's state

        This function can be used to initialize optimizers which have state
        (like momentum in :class:`SGD`). Using this method is optional as the
        optimizer will initialize itself if the state is not yet set. However,
        there are some cases where explicit initialization is useful in order
        to have access to the :attr:`Optimizer.state` before the first call to
        :meth:`Optimizer.update`.

        Args:
            parameters (dict): A Python tree of parameters.

        Example:
            >>> optimizer = optim.SGD(learning_rate=1e-1, momentum=0.9)
            >>> model = nn.Linear(2, 2)
            >>> optimizer.init(model.trainable_parameters())
            >>> optimizer.state.keys()
            dict_keys(['step', 'learning_rate', 'weight', 'bias'])
        """

        # Grow the state tree so that it mirrors the parameter tree: entries
        # that already exist are kept, missing ones become empty dicts.
        def sync_state(params, state):
            if isinstance(params, dict):
                for name, value in params.items():
                    if name in state:
                        state[name] = sync_state(value, state[name])
                    else:
                        state[name] = tree_map(lambda _: {}, value)
                return state

            if not isinstance(params, (list, tuple)):
                # Leaf: the existing state is kept as is.
                return state

            merged = list(state)
            for idx in range(len(merged)):
                merged[idx] = sync_state(params[idx], merged[idx])
            if len(merged) != len(params):
                merged.extend(tree_map(lambda _: {}, params[len(merged) :]))
            return type(params)(merged)

        # The top-level state dict is updated in place, so the return value
        # is not needed here.
        sync_state(parameters, self._state)

        # Only parameters whose state is still empty get initialized.
        def init_if_empty(parameter, parameter_state):
            return parameter_state or self.init_single(parameter, parameter_state)

        tree_map(init_if_empty, parameters, self._state)
        self._initialized = True

    def init_single(self, parameter: mx.array, state: dict):
        """To be extended by the children classes to implement each optimizer's
        state initialization.

        Args:
            parameter (mx.array): A single parameter that will be optimized.
            state (dict): The optimizer's state.
        """
        raise NotImplementedError()

    def apply_gradients(self, gradients: dict, parameters: dict):
        """Apply the gradients to the parameters and return the updated parameters.

        Can be used to update a model via
        ``model.update(opt.apply_gradients(grads, model))`` which is precisely
        how :meth:`Optimizer.update` is implemented.

        Args:
            gradients (dict): A Python tree of gradients.
            parameters (dict): A Python tree of parameters. It can be a
              superset of the gradients. In that case the returned python
              tree will be of the same structure as the gradients.
        """
        # Lazily build the state from the structure of the gradients.
        if not self._initialized:
            self.init(gradients)

        # Refresh every scheduled value from the current step ...
        for name, schedule in self._schedulers.items():
            self.state[name] = schedule(self.step)

        # ... and only then advance the step counter.
        self.state["step"] = self.step + 1

        # Apply the update
        return tree_map(self.apply_single, gradients, parameters, self.state)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """To be extended by derived classes to implement the optimizer's update.

        Args:
            gradient (mx.array): The ``parameter`` gradient.
            parameter (mx.array): The ``parameter`` to update.
            state (dict): The optimizer's state.
        """
        raise NotImplementedError()

    @property
    def state(self):
        """The optimizer's state dictionary."""
        return self._state

    @state.setter
    def state(self, state: dict):
        # A replaced state is treated as uninitialized, so the next call to
        # ``apply_gradients`` runs ``init`` against it again.
        self._initialized = False
        self._state = state

    @property
    def step(self):
        """The ``"step"`` entry of the optimizer's state."""
        return self.state["step"]

    @property
    def learning_rate(self):
        """The ``"learning_rate"`` entry of the optimizer's state."""
        return self.state["learning_rate"]

    @learning_rate.setter
    def learning_rate(self, learning_rate: Union[float, mx.array]):
        self.state["learning_rate"] = mx.array(learning_rate)

    def _maybe_schedule(
        self, name: str, param: Union[float, Callable[[mx.array], mx.array]]
    ):
        """
        To be used by derived classes to optionally put a parameter on a schedule.

        A callable ``param`` is registered as the scheduler for ``name`` and
        evaluated at the current step; any other value is stored as an array.
        """
        if isinstance(param, Callable):
            self._schedulers[name] = param
            self.state[name] = param(self.step)
        else:
            self.state[name] = mx.array(param)


class MultiOptimizer(Optimizer):
    """Route different weights to different optimizers.

    Each optimizer except the last one is paired with a predicate. A predicate
    receives the full "path" of a weight together with the weight itself and
    returns ``True`` when that optimizer should handle it. Predicates are
    tried in order and the first match wins. The last optimizer acts as the
    fallback and therefore takes no predicate.

    Args:
        optimizers (list[Optimizer]): The optimizers to delegate to.
        filters (list[Callable[[str, array], bool]]): The predicates; there
            must be exactly one fewer than ``optimizers``.

    Raises:
        ValueError: If the number of filters is not ``len(optimizers) - 1``.
    """

    def __init__(self, optimizers, filters: list = []):
        super().__init__()
        self._state = {}

        num_required = len(optimizers) - 1
        if num_required != len(filters):
            raise ValueError(
                f"Given {len(filters)} filters but {len(optimizers)-1} needed."
            )

        self.optimizers = optimizers
        # The fallback optimizer accepts whatever the other filters rejected.
        self.filters = filters + [lambda *_args, **_kwargs: True]

    def _split_dictionary(self, gradients: dict):
        """Partition ``gradients`` into one tree per optimizer."""
        if len(self.optimizers) == 1:
            return [gradients]

        buckets = [[] for _ in self.optimizers]
        for path, grad in tree_flatten(gradients):
            # Assign the entry to the first optimizer whose filter accepts it.
            for bucket, accepts in zip(buckets, self.filters):
                if accepts(path, grad):
                    bucket.append((path, grad))
                    break

        return [tree_unflatten(bucket) for bucket in buckets]

    def init(self, parameters: dict):
        """Initialize every wrapped optimizer with its share of ``parameters``."""
        per_optimizer = self._split_dictionary(parameters)
        for optimizer, subtree in zip(self.optimizers, per_optimizer):
            optimizer.init(subtree)

    def apply_gradients(self, gradients: dict, parameters: dict):
        """Apply each optimizer to its share of ``gradients`` and merge the
        resulting trees."""
        merged = {}
        per_optimizer = self._split_dictionary(gradients)
        for optimizer, subtree in zip(self.optimizers, per_optimizer):
            updated = optimizer.apply_gradients(subtree, parameters)
            merged = tree_merge(merged, updated)
        return merged

    @property
    def state(self):
        """A dictionary with the states of the wrapped optimizers under the
        ``"states"`` key, in optimizer order."""
        states = [optimizer.state for optimizer in self.optimizers]
        return {"states": states}

    @state.setter
    def state(self, state: dict):
        if "states" not in state or len(state["states"]) != len(self.optimizers):
            raise ValueError("Invalid state provided")

        for optimizer, optimizer_state in zip(self.optimizers, state["states"]):
            optimizer.state = optimizer_state

    @property
    def learning_rate(self):
        """The learning rate of the first wrapped optimizer."""
        first = self.optimizers[0]
        return first.learning_rate

    @learning_rate.setter
    def learning_rate(self, learning_rate: Union[float, mx.array]):
        # The same learning rate is assigned to every wrapped optimizer.
        for optimizer in self.optimizers:
            optimizer.learning_rate = learning_rate


class SGD(Optimizer):
    r"""The stochastic gradient descent optimizer.

    Updates a parameter :math:`w` with a gradient :math:`g` as follows

    .. math::

        v_{t+1} &= \mu v_t + (1 - \tau) g_t \\
        w_{t+1} &= w_t - \lambda v_{t+1}

    When ``weight_decay`` is non-zero, ``weight_decay * w`` is added to the
    gradient before the update above. With ``nesterov=True`` the step
    direction is :math:`g_t + \mu v_{t+1}` instead of :math:`v_{t+1}`.

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        momentum (float, optional): The momentum strength :math:`\mu`. Default: ``0``
        weight_decay (float, optional): The weight decay (L2 penalty). Default: ``0``
        dampening (float, optional): Dampening for momentum :math:`\tau`. Default: ``0``
        nesterov (bool, optional): Enables Nesterov momentum. Default: ``False``

    Raises:
        ValueError: If ``nesterov`` is set without a positive ``momentum`` or
            with a non-zero ``dampening``.
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        momentum: float = 0.0,
        weight_decay: float = 0.0,
        dampening: float = 0.0,
        nesterov: bool = False,
    ):
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError(
                "Nesterov momentum requires a momentum and zero dampening."
            )
        super().__init__()

        self._maybe_schedule("learning_rate", learning_rate)
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.dampening = dampening
        self.nesterov = nesterov

    def init_single(self, parameter: mx.array, state: dict):
        """Initialize optimizer state"""
        state["v"] = mx.zeros_like(parameter)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Performs the SGD parameter update and stores :math:`v` in the
        optimizer state.

        Args:
            gradient (mx.array): The gradient of ``parameter``.
            parameter (mx.array): The parameter to update.
            state (dict): The per-parameter state holding ``"v"``.

        Returns:
            mx.array: The updated parameter.
        """

        if self.weight_decay != 0:
            # Build a new array instead of using ``+=``: the in-place operator
            # would modify the gradient array owned by the caller.
            gradient = gradient + self.weight_decay * parameter

        if self.momentum <= 0:
            # Plain SGD; the velocity in ``state`` is left untouched.
            return parameter - self.learning_rate.astype(gradient.dtype) * gradient

        # ``v`` is a freshly created array, so the in-place updates below do
        # not touch the array stored in ``state``.
        v = self.momentum * state.get("v")
        if self.dampening > 0:
            v += (1 - self.dampening) * gradient
        else:
            v += gradient

        if self.nesterov:
            update = gradient + self.momentum * v
        else:
            update = v

        state["v"] = v
        return parameter - self.learning_rate.astype(gradient.dtype) * update


class RMSprop(Optimizer):
    r"""The RMSprop optimizer [1].

    [1]: Tieleman, T. and Hinton, G. 2012. Lecture 6.5-rmsprop, coursera: Neural networks for machine learning

    .. math::

        v_{t+1} &= \alpha v_t + (1 - \alpha) g_t^2 \\
        w_{t+1} &= w_t - \lambda \frac{g_t}{\sqrt{v_{t+1}} + \epsilon}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        alpha (float, optional): The smoothing constant :math:`\alpha`.
          Default: ``0.99``
        eps (float, optional): The term :math:`\epsilon` added to the denominator
          to improve numerical stability. Default: ``1e-8``

    Raises:
        ValueError: If ``alpha`` or ``eps`` is negative.
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        alpha: float = 0.99,
        eps: float = 1e-8,
    ):
        super().__init__()

        self._maybe_schedule("learning_rate", learning_rate)
        self.alpha = alpha
        self.eps = eps

        if self.alpha < 0.0:
            raise ValueError(
                f"RMSprop alpha should be >=0, {self.alpha} was provided instead"
            )
        if self.eps < 0.0:
            # Zero is accepted, so the message states the bound as ">=0".
            raise ValueError(
                f"RMSprop epsilon should be >=0, {self.eps} was provided instead"
            )

    def init_single(self, parameter: mx.array, state: dict):
        """Initialize optimizer state"""
        state["v"] = mx.zeros_like(parameter)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Performs the RMSprop parameter update and stores :math:`v` in the optimizer state.

        Args:
            gradient (mx.array): The gradient of ``parameter``.
            parameter (mx.array): The parameter to update.
            state (dict): The per-parameter state holding ``"v"``.

        Returns:
            mx.array: The updated parameter.
        """
        lr = self.learning_rate.astype(gradient.dtype)
        alpha = self.alpha
        eps = self.eps

        # Running average of the squared gradient.
        v = state["v"]
        v = alpha * v + (1 - alpha) * mx.square(gradient)
        state["v"] = v

        return parameter - lr * gradient / (mx.sqrt(v) + eps)


class Adagrad(Optimizer):
    r"""The Adagrad optimizer [1].

    Our Adagrad implementation follows the original paper. In detail,

    [1]: Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient methods
    for online learning and stochastic optimization. JMLR 2011.

    .. math::

        v_{t+1} &= v_t + g_t^2 \\
        w_{t+1} &= w_t - \lambda \frac{g_t}{\sqrt{v_{t+1}} + \epsilon}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``

    Raises:
        ValueError: If ``eps`` is negative.
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        eps: float = 1e-8,
    ):
        super().__init__()

        self._maybe_schedule("learning_rate", learning_rate)
        self.eps = eps

        if self.eps < 0.0:
            # Zero is accepted, so the message states the bound as ">=0".
            raise ValueError(
                f"Adagrad epsilon should be >=0, {self.eps} was provided instead"
            )

    def init_single(self, parameter: mx.array, state: dict):
        """Initialize optimizer state"""
        state["v"] = mx.zeros_like(parameter)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Performs the Adagrad parameter update and stores :math:`v` in the
        optimizer state.

        Args:
            gradient (mx.array): The gradient of ``parameter``.
            parameter (mx.array): The parameter to update.
            state (dict): The per-parameter state holding ``"v"``.

        Returns:
            mx.array: The updated parameter.
        """
        lr = self.learning_rate.astype(gradient.dtype)
        eps = self.eps

        # Accumulated sum of squared gradients.
        v = state["v"] + mx.square(gradient)
        state["v"] = v

        return parameter - lr * gradient / (mx.sqrt(v) + eps)


class AdaDelta(Optimizer):
    r"""The AdaDelta optimizer with a learning rate [1].

    Our AdaDelta implementation follows the original paper. In detail,

    [1]: Zeiler, M.D., 2012. ADADELTA: an adaptive learning rate method. arXiv preprint arXiv:1212.5701.

    .. math::

        v_{t+1} &= \rho v_t + (1 - \rho) g_t^2 \\
        \Delta w_{t+1} &= \frac{\sqrt{u_t + \epsilon}}{\sqrt{v_{t+1} + \epsilon}} g_t \\
        u_{t+1} &= \rho u_t + (1 - \rho) \Delta w_{t+1}^2 \\
        w_{t+1} &= w_t - \lambda \Delta w_{t+1}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        rho (float, optional): The coefficient :math:`\rho` used for computing a
            running average of squared gradients. Default: ``0.9``
        eps (float, optional): The term :math:`\epsilon` added to the denominator to improve
          numerical stability. Default: ``1e-6``

    Raises:
        ValueError: If ``rho`` or ``eps`` is negative.
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        rho: float = 0.9,
        eps: float = 1e-6,
    ):
        super().__init__()

        self._maybe_schedule("learning_rate", learning_rate)
        self.rho = rho
        self.eps = eps
        if self.rho < 0.0:
            raise ValueError(
                f"AdaDelta rho should be >=0, {self.rho} was provided instead"
            )
        if self.eps < 0.0:
            # Zero is accepted, so the message states the bound as ">=0".
            raise ValueError(
                f"AdaDelta epsilon should be >=0, {self.eps} was provided instead"
            )

    def init_single(self, parameter: mx.array, state: dict):
        """Initialize optimizer state"""
        state["v"] = mx.zeros_like(parameter)
        state["u"] = mx.zeros_like(parameter)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Performs the AdaDelta parameter update and stores :math:`v` and
        :math:`u` in the optimizer state.

        Args:
            gradient (mx.array): The gradient of ``parameter``.
            parameter (mx.array): The parameter to update.
            state (dict): The per-parameter state holding ``"v"`` and ``"u"``.

        Returns:
            mx.array: The updated parameter.
        """
        lr = self.learning_rate.astype(gradient.dtype)
        rho = self.rho
        eps = self.eps

        v = state["v"]
        u = state["u"]

        # ``d`` uses the previous ``u`` and the freshly updated ``v``.
        v = rho * v + (1 - rho) * mx.square(gradient)
        d = mx.sqrt(u + eps) / mx.sqrt(v + eps) * gradient
        u = rho * u + (1 - rho) * mx.square(d)

        state["v"] = v
        state["u"] = u

        return parameter - lr * d


class Adam(Optimizer):
    r"""The Adam optimizer [1]. In detail,

    [1]: Kingma, D.P. and Ba, J., 2015. Adam: A method for stochastic
    optimization. ICLR 2015.

    .. math::

        m_{t+1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        v_{t+1} &= \beta_2 v_t + (1 - \beta_2) g_t^2 \\
        w_{t+1} &= w_t - \lambda \frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon}

    When ``bias_correction`` is enabled, :math:`m_{t+1}` is divided by
    :math:`1 - \beta_1^t` and :math:`v_{t+1}` by :math:`1 - \beta_2^t` before
    they enter the weight update, where :math:`t` is the optimizer's step.

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing running averages of the
          gradient and its square. Default: ``(0.9, 0.999)``
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``
        bias_correction (bool, optional): If set to ``True``, bias correction
          is applied. Default: ``False``
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        betas: List[float] = [0.9, 0.999],
        eps: float = 1e-8,
        bias_correction: bool = False,
    ):
        super().__init__()

        self._maybe_schedule("learning_rate", learning_rate)
        self.betas = betas
        self.eps = eps
        self.bias_correction = bias_correction

    def init_single(self, parameter: mx.array, state: dict):
        """Create zeroed first (``"m"``) and second (``"v"``) moment buffers."""
        state["m"] = mx.zeros_like(parameter)
        state["v"] = mx.zeros_like(parameter)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Apply one Adam step to ``parameter`` and write the new :math:`m`
        and :math:`v` back into ``state``."""
        step_size = self.learning_rate.astype(gradient.dtype)
        beta1, beta2 = self.betas

        first_moment = beta1 * state["m"] + (1 - beta1) * gradient
        second_moment = beta2 * state["v"] + (1 - beta2) * mx.square(gradient)
        state["m"] = first_moment
        state["v"] = second_moment

        if not self.bias_correction:
            return parameter - step_size * first_moment / (
                mx.sqrt(second_moment) + self.eps
            )

        t = self.step
        # The first-moment correction is folded into the learning rate; the
        # second-moment correction is applied as a reciprocal square root.
        corrected_lr = (step_size / (1 - beta1**t)).astype(gradient.dtype)
        second_correction = mx.rsqrt(1 - beta2**t).astype(gradient.dtype)
        numerator = corrected_lr * first_moment
        denominator = mx.sqrt(second_moment) * second_correction + self.eps
        return parameter - numerator / denominator


class AdamW(Adam):
    r"""The AdamW optimizer [1]. We update the weights with a weight_decay
    (:math:`\lambda`) value:

    [1]: Loshchilov, I. and Hutter, F., 2019. Decoupled weight decay
    regularization. ICLR 2019.

    .. math::

        m_{t+1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        v_{t+1} &= \beta_2 v_t + (1 - \beta_2) g_t^2 \\
        w_{t+1} &= w_t - \alpha (\frac{m_{t+1}}{\sqrt{v_{t+1}} + \epsilon} + \lambda w_t)

    Args:
        learning_rate (float or callable): The learning rate :math:`\alpha`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing running averages of the
          gradient and its square. Default: ``(0.9, 0.999)``
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``
        weight_decay (float, optional): The weight decay :math:`\lambda`.
          Default: ``0.01``.
        bias_correction (bool, optional): If set to ``True``, bias correction
          is applied. Default: ``False``
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        betas: List[float] = [0.9, 0.999],
        eps: float = 1e-8,
        weight_decay: float = 0.01,
        bias_correction: bool = False,
    ):
        super().__init__(
            learning_rate=learning_rate,
            betas=betas,
            eps=eps,
            bias_correction=bias_correction,
        )
        self.weight_decay = weight_decay

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Shrink ``parameter`` by the decoupled weight decay and then hand
        the shrunken parameter to the Adam update."""
        step_size = self.learning_rate.astype(gradient.dtype)
        shrink = 1 - step_size * self.weight_decay
        decayed_parameter = parameter * shrink
        return super().apply_single(gradient, decayed_parameter, state)


class Adamax(Adam):
    r"""The Adamax optimizer, a variant of Adam based on the infinity norm [1].

    Our Adamax implementation follows the original paper and omits the bias
    correction in the first and second moment estimates. In detail,

    [1]: Kingma, D.P. and Ba, J., 2015. Adam: A method for stochastic
    optimization. ICLR 2015.

    .. math::

        m_{t+1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        v_{t+1} &= \max(\beta_2 v_t, |g_t|) \\
        w_{t+1} &= w_t - \lambda \frac{m_{t+1}}{v_{t+1} + \epsilon}

    Args:
        learning_rate (float or callable): The learning rate :math:`\lambda`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing running averages of the
          gradient and its square. Default: ``(0.9, 0.999)``
        eps (float, optional): The term :math:`\epsilon` added to the
          denominator to improve numerical stability. Default: ``1e-8``

    Raises:
        ValueError: If ``eps`` is not greater than or equal to zero.
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        betas: List[float] = [0.9, 0.999],
        eps: float = 1e-8,
    ):
        super().__init__(learning_rate, betas, eps)
        eps_is_valid = 0.0 <= eps
        if not eps_is_valid:
            raise ValueError(
                f"Epsilon value should be >=0, {self.eps} was provided instead"
            )

    def init_single(self, parameter: mx.array, state: dict):
        """Create zeroed ``"m"`` and ``"v"`` buffers shaped like ``parameter``."""
        state["m"] = mx.zeros_like(parameter)
        state["v"] = mx.zeros_like(parameter)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Apply one Adamax step to ``parameter`` and write the new :math:`m`
        and :math:`v` back into ``state``."""
        step_size = self.learning_rate.astype(gradient.dtype)
        beta1, beta2 = self.betas

        previous_m = state["m"]
        previous_v = state["v"]

        first_moment = beta1 * previous_m + (1 - beta1) * gradient
        # Element-wise maximum of the decayed buffer and the gradient magnitude.
        inf_norm = mx.maximum(beta2 * previous_v, mx.abs(gradient))
        state["m"] = first_moment
        state["v"] = inf_norm

        return parameter - step_size * first_moment / (inf_norm + self.eps)


class Lion(Optimizer):
    r"""The Lion optimizer [1].

    Since updates are computed through the sign operation, they tend to
    have larger norm than for other optimizers such as SGD and Adam.
    We recommend a learning rate that is 3-10x smaller than AdamW and a
    weight decay 3-10x larger than AdamW to maintain the strength
    (lr * wd). Our Lion implementation follows the original paper. In
    detail,

    [1]: Chen, X. Symbolic Discovery of Optimization Algorithms. arXiv
    preprint arXiv:2302.06675.

    .. math::

        c_{t + 1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
        m_{t + 1} &= \beta_2 m_t + (1 - \beta_2) g_t \\
        w_{t + 1} &= w_t - \eta (\text{sign}(c_{t + 1}) + \lambda w_t)

    Args:
        learning_rate (float or callable): The learning rate :math:`\eta`.
        betas (Tuple[float, float], optional): The coefficients
          :math:`(\beta_1, \beta_2)` used for computing the gradient
          momentum and update direction. Default: ``(0.9, 0.99)``
        weight_decay (float, optional): The weight decay :math:`\lambda`. Default: ``0.0``
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        betas: List[float] = [0.9, 0.99],
        weight_decay: float = 0.0,
    ):
        super().__init__()

        self._maybe_schedule("learning_rate", learning_rate)
        self.betas = betas
        self.weight_decay = weight_decay

    def init_single(self, parameter: mx.array, state: dict):
        """Create a zeroed momentum buffer ``"m"`` shaped like ``parameter``."""
        state["m"] = mx.zeros_like(parameter)

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Apply one Lion step to ``parameter`` and write the new :math:`m`
        back into ``state``."""
        step_size = self.learning_rate.astype(gradient.dtype)
        beta1, beta2 = self.betas
        decay = self.weight_decay

        momentum = state["m"]
        # The update direction and the stored momentum interpolate the same
        # two quantities with different coefficients.
        direction = beta1 * momentum + (1 - beta1) * gradient
        state["m"] = beta2 * momentum + (1 - beta2) * gradient

        if decay > 0:
            parameter = (1 - step_size * decay) * parameter
        return parameter - step_size * mx.sign(direction)


class Adafactor(Optimizer):
    r"""The Adafactor optimizer.

    Our Adafactor implementation follows the original paper: `Adafactor:
    Adaptive Learning Rates with Sublinear Memory Cost
    <https://arxiv.org/abs/1804.04235>`_

    Args:
        learning_rate (float or callable, optional): The learning rate.
            Default: ``None``.
        eps (tuple(float, float), optional): The first term :math:`\epsilon_1`
            added to the square of the gradients to improve numerical
            stability and the second term :math:`\epsilon_2` is used for
            parameter scaling if ``scale_parameter`` is set to ``True``.
            Default: ``(1e-30, 1e-3)``.
        clip_threshold (float, optional): Clips the unscaled update at
            ``clip_threshold``. Default: ``1.0``.
        decay_rate (float, optional): Coefficient for the running average
            of the squared gradient. Default: ``-0.8``.
        beta_1 (float, optional): If set to a value bigger than zero
            then first moment will be used. Default: ``None``.
        weight_decay (float, optional): The weight decay :math:`\lambda`.
            Default: ``0.0``.
        scale_parameter (bool, optional): If set to ``True`` the learning rate
            will be scaled by :math:`\max(\epsilon_2, \text{RMS}(w_{t-1}))`.
            Default: ``True``.
        relative_step (bool, optional): If set to ``True`` the ``learning_rate``
            will be ignored and relative step size will be computed.
            Default: ``True``.
        warmup_init (bool, optional): If set to ``True`` then the relative
            step size will be calculated by the current step. Default:
            ``False``.
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array], None] = None,
        eps: Tuple[float, float] = (1e-30, 1e-3),
        clip_threshold: float = 1.0,
        decay_rate: float = -0.8,
        beta_1: Optional[float] = None,
        weight_decay: float = 0.0,
        scale_parameter: bool = True,
        relative_step: bool = True,
        warmup_init: bool = False,
    ):
        super().__init__()
        # No "learning_rate" entry is created in the state when none is given.
        if learning_rate is not None:
            self._maybe_schedule("learning_rate", learning_rate)
        self.eps = eps
        self.clip_threshold = clip_threshold
        self.decay_rate = decay_rate
        self.beta_1 = beta_1
        self.weight_decay = weight_decay
        self.scale_parameter = scale_parameter
        self.relative_step = relative_step
        self.warmup_init = warmup_init

    def init_single(self, parameter: mx.array, state: dict):
        """Initialize optimizer state

        Parameters with at least two dimensions get factored row/column
        statistics (the last and second-to-last axes are reduced away,
        respectively); other parameters get a full-size buffer.
        """
        if parameter.ndim >= 2:
            shape = parameter.shape
            dtype = parameter.dtype
            state["exp_avg_sq_row"] = mx.zeros(shape[:-1], dtype=dtype)
            state["exp_avg_sq_col"] = mx.zeros(shape[:-2] + shape[-1:], dtype=dtype)
        else:
            state["exp_avg_sq"] = mx.zeros_like(parameter)

        if self.beta_1 is not None:
            state["exp_avg"] = mx.zeros_like(parameter)

    def _compute_rms(self, inputs):
        """Return the root mean square over all elements of ``inputs``."""
        return mx.sqrt(mx.mean(mx.square(inputs)))

    def _compute_learning_rate(self, step, parameter_rms):
        """Return the step size, optionally scaled by the parameter RMS."""
        if self.relative_step:
            min_step = 1e-6 * step if self.warmup_init else 1e-2
            relative_step_size = mx.minimum(min_step, mx.rsqrt(step))
        else:
            relative_step_size = self.learning_rate

        relative_step_size = relative_step_size.astype(parameter_rms.dtype)
        parameter_scale = 1.0
        if self.scale_parameter:
            parameter_scale = mx.maximum(self.eps[1], parameter_rms)
        return parameter_scale * relative_step_size

    def _approximate_exp_moving_avg(self, exp_avg_sq_row, exp_avg_sq_col):
        """Combine the factored row/column statistics into a full-size
        scaling factor via an outer product over the last two axes."""
        r_factor = mx.rsqrt(
            exp_avg_sq_row / mx.mean(exp_avg_sq_row, axis=-1, keepdims=True)
        )
        c_factor = mx.rsqrt(exp_avg_sq_col)
        # ``r_factor`` becomes (..., rows, 1) and ``c_factor`` (..., 1, cols).
        # Inserting the new axis at -2 (rather than 0) keeps any leading
        # dimensions aligned, so parameters with more than two dimensions
        # work; for 2-D parameters both choices produce the same shape.
        return mx.matmul(
            mx.expand_dims(r_factor, axis=-1), mx.expand_dims(c_factor, axis=-2)
        )

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Performs the Adafactor parameter and state update.

        Args:
            gradient (mx.array): The gradient of ``parameter``.
            parameter (mx.array): The parameter to update.
            state (dict): The per-parameter state created by ``init_single``.

        Returns:
            mx.array: The updated parameter.
        """
        factored = gradient.ndim >= 2

        step = self.step
        use_first_moment = self.beta_1 is not None

        parameter_rms = self._compute_rms(parameter)
        learning_rate = self._compute_learning_rate(step, parameter_rms)
        beta_2 = 1.0 - (step**self.decay_rate).astype(parameter_rms.dtype)
        update = mx.square(gradient) + self.eps[0]

        if factored:
            exp_avg_sq_row = state["exp_avg_sq_row"]
            exp_avg_sq_col = state["exp_avg_sq_col"]
            exp_avg_sq_row = (beta_2 * exp_avg_sq_row) + (
                (1 - beta_2) * mx.mean(update, axis=-1)
            )
            exp_avg_sq_col = (beta_2 * exp_avg_sq_col) + (
                (1 - beta_2) * mx.mean(update, axis=-2)
            )
            state["exp_avg_sq_row"] = exp_avg_sq_row
            state["exp_avg_sq_col"] = exp_avg_sq_col
            update = self._approximate_exp_moving_avg(exp_avg_sq_row, exp_avg_sq_col)
            update = update * gradient
        else:
            exp_avg_sq = state["exp_avg_sq"]
            exp_avg_sq = (beta_2 * exp_avg_sq) + ((1 - beta_2) * update)
            state["exp_avg_sq"] = exp_avg_sq
            update = mx.rsqrt(exp_avg_sq) * gradient

        # Scale the update down when its RMS exceeds ``clip_threshold``.
        update = update / mx.maximum(
            1.0, self._compute_rms(update) / self.clip_threshold
        )
        update = learning_rate * update

        if use_first_moment:
            exp_avg = state["exp_avg"]
            exp_avg = (self.beta_1 * exp_avg) + ((1 - self.beta_1) * update)
            state["exp_avg"] = exp_avg
            update = exp_avg

        if self.weight_decay != 0:
            # Build a new array instead of using ``+=``: the in-place operator
            # would modify the parameter array owned by the caller.
            parameter = parameter + parameter * (-self.weight_decay * learning_rate)
        return parameter - update


class Muon(Optimizer):
    r"""The Muon optimizer.

    Our Muon (MomentUm Orthogonalized by Newton-schulz) optimizer follows the
    original implementation: `Muon: An optimizer for hidden layers in neural
    networks <https://kellerjordan.github.io/posts/muon/>`_

    Note:
        - Muon may be sub-optimal for the embedding layer, the final fully
          connected layer, or any 0D/1D parameters. Those should be optimized
          by a different method (e.g., :class:`AdamW`).
        - For 4D convolutional filters, it works by flattening their last
          dimensions.

    Args:
        learning_rate (float or callable): The learning rate.
        momentum (float, optional): The momentum strength. Default: ``0.95``
        weight_decay (float, optional): The weight decay (L2 penalty).
            Default: ``0.01``
        nesterov (bool, optional): Enables Nesterov momentum. Recommended for
            better performance.  Default: ``True``
        ns_steps (int, optional): Number of Newton-Schulz iteration steps for
            orthogonalization.  Default: ``5``
    """

    def __init__(
        self,
        learning_rate: Union[float, Callable[[mx.array], mx.array]],
        momentum: float = 0.95,
        weight_decay: float = 0.01,
        nesterov: bool = True,
        ns_steps: int = 5,
    ):
        super().__init__()

        self._maybe_schedule("learning_rate", learning_rate)
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.nesterov = nesterov
        self.ns_steps = ns_steps

    def init_single(self, parameter: mx.array, state: dict):
        """Create a zeroed momentum buffer ``"v"`` shaped like ``parameter``."""
        state["v"] = mx.zeros_like(parameter)

    def _zeropower_via_newtonschulz5(self, X, steps: int):
        """Run ``steps`` Newton-Schulz iterations on the 2D array ``X``.

        The input is divided by its norm first. Arrays with more rows than
        columns are processed transposed and transposed back on return.
        """
        assert (
            X.ndim == 2
        ), f"Expected a 2D array for Newton-Schulz iteration, got shape {X.shape} instead."
        coeff_a, coeff_b, coeff_c = (3.4445, -4.7750, 2.0315)

        is_tall = X.shape[-2] > X.shape[-1]
        if is_tall:
            X = X.T

        X = X / (mx.linalg.norm(X, keepdims=True) + 1e-7)

        for _ in range(steps):
            gram = X @ X.T
            # poly = coeff_b * gram + coeff_c * (gram @ gram)
            poly = mx.addmm(coeff_b * gram, gram, gram, beta=1.0, alpha=coeff_c)
            # X <- coeff_a * X + poly @ X
            X = mx.addmm(coeff_a * X, poly, X, beta=1.0, alpha=1.0)

        return X.T if is_tall else X

    def apply_single(self, gradient: mx.array, parameter: mx.array, state: dict):
        """Apply one Muon step to ``parameter`` and write the new momentum
        buffer back into ``state``."""

        if self.weight_decay != 0:
            gradient = gradient + self.weight_decay * parameter

        momentum = self.momentum
        v = momentum * state["v"]
        v = v + (1 - momentum) * gradient
        state["v"] = v

        update = gradient * (1 - momentum) + v * momentum if self.nesterov else v

        lr = self.learning_rate.astype(gradient.dtype)

        if update.ndim < 2:
            # 0D/1D parameters are updated without orthogonalization.
            return parameter - lr * update

        full_shape = update.shape
        if update.ndim > 2:
            # Collapse every trailing axis into one so the iteration sees a
            # matrix, then restore the original shape.
            flattened = mx.reshape(update, (update.shape[0], -1))
            flattened = self._zeropower_via_newtonschulz5(
                flattened, steps=self.ns_steps
            )
            update = mx.reshape(flattened, full_shape)
        else:
            update = self._zeropower_via_newtonschulz5(update, steps=self.ns_steps)

        lr *= max(1, update.shape[-2] / update.shape[-1]) ** 0.5

        return parameter - lr * update


def clip_grad_norm(grads, max_norm):
    """Rescale gradients so that their global norm is at most ``max_norm``.

    The global norm is the square root of the sum of squares of every entry
    of every gradient. When it exceeds ``max_norm`` all gradients are
    multiplied by the same factor; otherwise they are multiplied by one.

    Example:
        >>> grads = {"w1": mx.array([2, 3]), "w2": mx.array([1])}
        >>> clipped_grads, total_norm = clip_grad_norm(grads, max_norm=2.0)
        >>> print(clipped_grads)
        {"w1": mx.array([...]), "w2": mx.array([...])}

    Args:
        grads (dict): A dictionary containing the gradient arrays.
        max_norm (float): The maximum allowed global norm of the gradients.

    Returns:
        (dict, array): The possibly rescaled gradients and the norm of the
        gradients before rescaling.
    """

    def _add_squared_sum(running_total, grad):
        return running_total + grad.square().sum()

    squared_total = tree_reduce(_add_squared_sum, grads, 0.0)
    total_norm = mx.sqrt(squared_total)

    # The small constant keeps the division finite for an all-zero gradient.
    scale = mx.minimum(max_norm / (total_norm + 1e-6), 1.0)

    def _rescale(grad):
        return grad * scale

    return tree_map(_rescale, grads), total_norm


================================================
FILE: python/mlx/optimizers/schedulers.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
from typing import Callable, List

import mlx.core as mx


def exponential_decay(init: float, decay_rate: float) -> Callable:
    r"""Build a schedule that decays a value exponentially with the step.

    The returned callable maps ``step`` to ``init * decay_rate**step``.

    Args:
        init (float): Initial value.
        decay_rate (float): Multiplicative factor to decay by.

    Example:
        >>> lr_schedule = optim.exponential_decay(1e-1, 0.9)
        >>> optimizer = optim.SGD(learning_rate=lr_schedule)
        >>> optimizer.learning_rate
        array(0.1, dtype=float32)
        >>>
        >>> for _ in range(5): optimizer.update({}, {})
        ...
        >>> optimizer.learning_rate
        array(0.06561, dtype=float32)
    """

    def schedule(step):
        decay_factor = decay_rate**step
        return init * decay_factor

    return schedule


def step_decay(init: float, decay_rate: float, step_size: int) -> Callable:
    r"""Build a schedule that decays a value once every ``step_size`` steps.

    The returned callable maps ``step`` to
    ``init * decay_rate ** (step // step_size)``.

    Args:
        init (float): Initial value.
        decay_rate (float): Multiplicative factor to decay by.
        step_size (int): Decay every ``step_size`` steps.

    Example:

        >>> lr_schedule = optim.step_decay(1e-1, 0.9, 10)
        >>> optimizer = optim.SGD(learning_rate=lr_schedule)
        >>> optimizer.learning_rate
        array(0.1, dtype=float32)
        >>>
        >>> for _ in range(21): optimizer.update({}, {})
        ...
        >>> optimizer.learning_rate
        array(0.081, dtype=float32)
    """

    def schedule(step):
        # Number of whole ``step_size`` intervals at ``step``.
        completed_intervals = step // step_size
        return init * (decay_rate**completed_intervals)

    return schedule


def cosine_decay(init: float, decay_steps: int, end: float = 0.0) -> Callable:
    r"""Build a schedule that follows half a cosine period from ``init`` to
    ``end``.

    Args:
        init (float): Initial value.
        decay_steps (int): Number of steps to decay over. The decayed
            value is constant for steps beyond ``decay_steps``.
        end (float, optional): Final value to decay to. Default: ``0``.

    Example:

        >>> lr_schedule = optim.cosine_decay(1e-1, 1000)
        >>> optimizer = optim.SGD(learning_rate=lr_schedule)
        >>> optimizer.learning_rate
        array(0.1, dtype=float32)
        >>>
        >>> for _ in range(5): optimizer.update({}, {})
        ...
        >>> optimizer.learning_rate
        array(0.0999961, dtype=float32)
    """

    def schedule(step):
        # Steps past ``decay_steps`` are clamped, which holds the value at
        # ``end``.
        clamped_step = mx.minimum(step, decay_steps)
        cosine = mx.cos((math.pi / decay_steps) * clamped_step)
        decay = 0.5 * (1.0 + cosine)
        return end + decay * (init - end)

    return schedule


def join_schedules(schedules: List[Callable], boundaries: List[int]) -> Callable:
    r"""Chain several schedules into a single schedule.

    Args:
        schedules (list(Callable)): The schedules to chain. Schedule
          :math:`i+1` is called with the number of steps elapsed since the
          :math:`i`-th boundary.
        boundaries (list(int)): The steps at which to switch to the next
          schedule. Must contain exactly ``len(schedules) - 1`` entries.

    Example:
        >>> linear = optim.linear_schedule(0, 1e-1, steps=10)
        >>> cosine = optim.cosine_decay(1e-1, 200)
        >>> lr_schedule = optim.join_schedules([linear, cosine], [10])
        >>> optimizer = optim.Adam(learning_rate=lr_schedule)
        >>> optimizer.learning_rate
        array(0.0, dtype=float32)
        >>> for _ in range(12): optimizer.update({}, {})
        ...
        >>> optimizer.learning_rate
        array(0.0999938, dtype=float32)
    """
    if not len(schedules):
        raise ValueError("Must provide at least 1 schedule to join.")

    if len(boundaries) + 1 != len(schedules):
        raise ValueError(
            f"Received {len(boundaries)} boundaries but "
            f"expected {len(schedules) - 1}."
        )

    def schedule(step):
        # Start from the first schedule and let every later schedule take
        # over once its boundary has been reached.
        value = schedules[0](step)
        for boundary, next_schedule in zip(boundaries, schedules[1:]):
            before_boundary = step < boundary
            value = mx.where(before_boundary, value, next_schedule(step - boundary))
        return value

    return schedule


def linear_schedule(init: float, end: float, steps: int) -> Callable:
    r"""Make a linear scheduler.

    The returned schedule moves linearly from ``init`` to ``end`` and holds
    ``end`` once ``steps`` steps have elapsed.

    Args:
        init (float): Initial value.
        end (float): Final value.
        steps (int): Number of steps to apply the schedule over. The value is
          ``end`` for any steps beyond ``steps``.

    Example:

        >>> lr_schedule = optim.linear_schedule(0, 1e-1, 100)
        >>> optimizer = optim.Adam(learning_rate=lr_schedule)
        >>> optimizer.learning_rate
        array(0.0, dtype=float32)
        >>> for _ in range(101): optimizer.update({}, {})
        ...
        >>> optimizer.learning_rate
        array(0.1, dtype=float32)
    """
    if steps < 1:
        raise ValueError(f"steps must be greater than 0, but got {steps}.")

    def schedule(step):
        # Clamp the step so the output stays at ``end`` after ``steps``.
        clamped = mx.minimum(step, steps)
        slope = (end - init) / steps
        return clamped * slope + init

    return schedule


================================================
FILE: python/mlx/py.typed
================================================


================================================
FILE: python/mlx/utils.py
================================================
# Copyright © 2023 Apple Inc.
from collections import defaultdict
from itertools import zip_longest
from typing import Any, Callable, Dict, List, Optional, Tuple, Union


def tree_map(
    fn: Callable, tree: Any, *rest: Any, is_leaf: Optional[Callable] = None
) -> Any:
    """Applies ``fn`` to every leaf of the Python tree ``tree`` and returns a
    new tree of the same structure holding the results.

    Every tree passed in ``rest`` is assumed to be a superset of ``tree``;
    its matching leaves are handed to ``fn`` as extra positional arguments.
    In that respect, :meth:`tree_map` is closer to :func:`itertools.starmap`
    than to :func:`map`.

    The keyword argument ``is_leaf`` decides what constitutes a leaf from
    ``tree`` similar to :func:`tree_flatten`.

    .. code-block:: python

        import mlx.nn as nn
        from mlx.utils import tree_map

        model = nn.Linear(10, 10)
        print(model.parameters().keys())
        # dict_keys(['weight', 'bias'])

        # square the parameters
        model.update(tree_map(lambda x: x*x, model.parameters()))

    Args:
        fn (callable): The function that processes the leaves of the tree.
        tree (Any): The main Python tree that will be iterated upon.
        rest (tuple[Any]): Extra trees to be iterated together with ``tree``.
        is_leaf (callable, optional): An optional callable that returns ``True``
           if the passed object is considered a leaf or ``False`` otherwise.

    Returns:
        A Python tree with the new values returned by ``fn``.
    """
    # A user-designated leaf is handed to ``fn`` whatever its type.
    if is_leaf is not None and is_leaf(tree):
        return fn(tree, *rest)

    if isinstance(tree, dict):
        mapped = {}
        for key, node in tree.items():
            companions = (other[key] for other in rest)
            mapped[key] = tree_map(fn, node, *companions, is_leaf=is_leaf)
        return mapped

    if isinstance(tree, (list, tuple)):
        container = type(tree)
        children = (
            tree_map(fn, node, *(other[pos] for other in rest), is_leaf=is_leaf)
            for pos, node in enumerate(tree)
        )
        # Named tuples take their fields as separate positional arguments,
        # plain lists/tuples take a single iterable.
        if hasattr(tree, "_fields"):
            return container(*children)
        return container(children)

    # Anything that is not a known container is a leaf.
    return fn(tree, *rest)


def tree_map_with_path(
    fn: Callable,
    tree: Any,
    *rest: Any,
    is_leaf: Optional[Callable] = None,
    path: Optional[Any] = None,
) -> Any:
    """Applies ``fn`` to the path and leaves of the Python tree ``tree`` and
    returns a new collection with the results.

    This function is the same as :func:`tree_map` but the ``fn`` takes the
    path as the first argument followed by the remaining tree nodes. The path
    of a leaf is the dot-separated sequence of dictionary keys and list/tuple
    indices leading to it.

    Args:
        fn (callable): The function that processes the leaves of the tree.
        tree (Any): The main Python tree that will be iterated upon.
        rest (tuple[Any]): Extra trees to be iterated together with ``tree``.
        is_leaf (Optional[Callable]): An optional callable that returns ``True``
           if the passed object is considered a leaf or ``False`` otherwise.
        path (Optional[Any]): Prefix will be added to the result.

    Returns:
        A Python tree with the new values returned by ``fn``.

    Example:
        >>> from mlx.utils import tree_map_with_path
        >>> tree = {"model": [{"w": 0, "b": 1}, {"w": 0, "b": 1}]}
        >>> new_tree = tree_map_with_path(lambda path, _: print(path), tree)
        model.0.w
        model.0.b
        model.1.w
        model.1.b
    """
    if is_leaf is not None and is_leaf(tree):
        return fn(path, tree, *rest)
    elif isinstance(tree, (list, tuple)):
        prefix = f"{path}." if path else ""
        TreeType = type(tree)
        subtrees = (
            tree_map_with_path(
                fn, child, *(r[i] for r in rest), is_leaf=is_leaf, path=f"{prefix}{i}"
            )
            for i, child in enumerate(tree)
        )
        # Named tuples must be rebuilt from positional arguments; passing the
        # generator as a single argument would raise a TypeError. This mirrors
        # the handling in tree_map.
        return TreeType(*subtrees) if hasattr(tree, "_fields") else TreeType(subtrees)
    elif isinstance(tree, dict):
        prefix = f"{path}." if path else ""
        return {
            k: tree_map_with_path(
                fn, child, *(r[k] for r in rest), is_leaf=is_leaf, path=f"{prefix}{k}"
            )
            for k, child in tree.items()
        }
    else:
        return fn(path, tree, *rest)


def tree_flatten(
    tree: Any,
    prefix: str = "",
    is_leaf: Optional[Callable] = None,
    destination: Optional[Union[List[Tuple[str, Any]], Dict[str, Any]]] = None,
) -> Union[List[Tuple[str, Any]], Dict[str, Any]]:
    """Flattens a Python tree into key, value pairs.

    Each key is the dot-separated path (dictionary keys and list/tuple
    indices) to the corresponding leaf, so trees of arbitrary depth can be
    represented.

    .. code-block:: python

        from mlx.utils import tree_flatten

        print(tree_flatten([[[0]]]))
        # [("0.0.0", 0)]

        print(tree_flatten([[[0]]], prefix=".hello"))
        # [("hello.0.0.0", 0)]

        print(tree_flatten({"a": {"b": 1}}, destination={}))
        # {"a.b": 1}

    .. note::
       Dictionaries should have keys that are valid Python identifiers.

    Args:
        tree (Any): The Python tree to be flattened.
        prefix (str): A prefix to use for the keys. The first character is
            always discarded.
        is_leaf (callable): An optional callable that returns True if the
            passed object is considered a leaf or False otherwise.
        destination (list or dict, optional): A list or dictionary to store the
            flattened tree. If None an empty list will be used. Default: ``None``.

    Returns:
        Union[List[Tuple[str, Any]], Dict[str, Any]]: The flat representation of
            the Python tree.
    """
    if destination is None:
        destination = []

    # ``list.extend`` and ``dict.update`` both accept an iterable of
    # (key, value) pairs, so one callable can feed either destination.
    if isinstance(destination, list):
        store = destination.extend
    elif isinstance(destination, dict):
        store = destination.update
    else:
        raise ValueError("Destination should be either a list or a dictionary or None")

    forced_leaf = is_leaf is not None and is_leaf(tree)

    if not forced_leaf and isinstance(tree, (list, tuple)):
        children = enumerate(tree)
    elif not forced_leaf and isinstance(tree, dict):
        children = tree.items()
    else:
        # Either ``is_leaf`` claimed the node or it is not a container. Every
        # path segment is written as ".<name>", so the prefix always starts
        # with one extra character that is dropped here.
        store([(prefix[1:], tree)])
        return destination

    for name, subtree in children:
        tree_flatten(subtree, f"{prefix}.{name}", is_leaf, destination)

    return destination


def tree_unflatten(tree: Union[List[Tuple[str, Any]], Dict[str, Any]]) -> Any:
    """Rebuild a Python tree from its flat, dot-keyed representation.

    .. code-block:: python

        from mlx.utils import tree_unflatten

        d = tree_unflatten([("hello.world", 42)])
        print(d)
        # {"hello": {"world": 42}}

        d = tree_unflatten({"hello.world": 42})
        print(d)
        # {"hello": {"world": 42}}

    Args:
        tree (list[tuple[str, Any]] or dict[str, Any]): The flat representation of a Python tree.
           For instance as returned by :meth:`tree_flatten`.

    Returns:
        A Python tree.
    """
    flat = tree.items() if isinstance(tree, dict) else tree

    # A single entry with an empty key is a bare leaf rather than a tree.
    if len(flat) == 1:
        only_key, only_value = next(iter(flat))
        if only_key == "":
            return only_value

    # Group the entries by the first path segment; the remainder of each key
    # is kept so the group can be unflattened recursively.
    grouped = defaultdict(list)
    for dotted, leaf in flat:
        head, _, remainder = dotted.partition(".")
        grouped[head].append((remainder, leaf))

    # If every segment parses as an integer the node is a list, otherwise it
    # is a dictionary.
    try:
        ordered = sorted((int(name), name) for name in grouped.keys())
    except ValueError:
        return {name: tree_unflatten(entries) for name, entries in grouped.items()}

    result = []
    for position, name in ordered:
        # Fill index gaps with empty dictionaries; nothing is added when
        # ``position`` does not exceed the current length.
        while len(result) < position:
            result.append({})
        result.append(tree_unflatten(grouped[name]))
    return result


def tree_reduce(fn, tree, initializer=None, is_leaf=None):
    """Folds the leaves of a Python tree into a single value.

    The leaves are visited in order and combined one by one with the running
    result through ``fn``.

    Example:
        >>> from mlx.utils import tree_reduce
        >>> tree = {"a": [1, 2, 3], "b": [4, 5]}
        >>> tree_reduce(lambda acc, x: acc + x, tree, 0)
        15

    Args:
        fn (callable): The reducer function that takes two arguments (accumulator,
            current value) and returns the updated accumulator.
        tree (Any): The Python tree to reduce. It can be any nested combination of
            lists, tuples, or dictionaries.
        initializer (Any, optional): The initial value to start the reduction. If
            not provided, the first leaf value is used.
        is_leaf (callable, optional): A function to determine if an object is a
            leaf, returning ``True`` for leaf nodes and ``False`` otherwise.

    Returns:
        Any: The accumulated value.
    """
    forced_leaf = is_leaf is not None and is_leaf(tree)

    # Leaves (designated by ``is_leaf`` or simply not a container) either seed
    # the reduction or are folded into the running value.
    if forced_leaf or not isinstance(tree, (list, tuple, dict)):
        if initializer is None:
            return tree
        return fn(initializer, tree)

    children = tree.values() if isinstance(tree, dict) else tree

    result = initializer
    for child in children:
        result = tree_reduce(fn, child, result, is_leaf)
    return result


def tree_merge(tree_a, tree_b, merge_fn=None):
    """Merge two Python trees in one containing the values of both. It can be
    thought of as a deep dict.update method.

    Empty containers (``{}``, ``[]``, ``()``) carry no values: merging one
    with a non-empty tree returns the other tree, and merging one with another
    empty container or with ``None`` returns the empty container unchanged.

    Dictionaries are merged key by key; the keys of ``tree_a`` come first,
    followed by the keys that only appear in ``tree_b``. Lists and tuples are
    merged position by position and the result has the type of ``tree_a``.

    Args:
        tree_a (Any): The first Python tree.
        tree_b (Any): The second Python tree.
        merge_fn (callable, optional): A function to merge leaves.

    Returns:
        The Python tree containing the values of both ``tree_a`` and
        ``tree_b``.

    Raises:
        ValueError: If both trees hold a leaf at the same location and no
            ``merge_fn`` was provided.
    """
    a_is_empty = isinstance(tree_a, (dict, list, tuple)) and len(tree_a) == 0
    b_is_empty = isinstance(tree_b, (dict, list, tuple)) and len(tree_b) == 0
    a = None if a_is_empty else tree_a
    b = None if b_is_empty else tree_b

    if a is None and b is not None:
        return b
    if a is not None and b is None:
        return a
    if a is None and b is None:
        # Nothing to merge on either side. If an empty container was given,
        # keep it instead of treating the pair as conflicting leaves.
        if a_is_empty:
            return tree_a
        if b_is_empty:
            return tree_b
        # Both are genuinely None: fall through to the leaf handling below.

    if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)):
        TreeType = type(a)
        merged = [tree_merge(x, y, merge_fn) for x, y in zip_longest(a, b)]
        # Named tuples are rebuilt from positional arguments.
        return TreeType(*merged) if hasattr(a, "_fields") else TreeType(merged)
    elif isinstance(a, dict) and isinstance(b, dict):
        # Keep a deterministic key order: keys of ``a`` first, then the keys
        # only present in ``b``.
        keys = list(a.keys()) + [k for k in b.keys() if k not in a]
        return {k: tree_merge(a.get(k, None), b.get(k, None), merge_fn) for k in keys}
    else:
        if merge_fn is None:
            raise ValueError(
                (
                    "Trees contain elements at the same locations but no merge "
                    "function was provided"
                )
            )
        return merge_fn(a, b)


================================================
FILE: python/src/CMakeLists.txt
================================================
# Build the ``core`` Python extension module with nanobind from the binding
# sources listed below. NB_STATIC, STABLE_ABI, LTO, NOMINSIZE and NB_DOMAIN
# (whose value is ``mlx``) are options of nanobind_add_module; see the nanobind
# CMake documentation for their exact meaning.
nanobind_add_module(
  core
  NB_STATIC
  STABLE_ABI
  LTO
  NOMINSIZE
  NB_DOMAIN
  mlx
  ${CMAKE_CURRENT_SOURCE_DIR}/mlx.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/array.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/convert.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/distributed.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/export.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/fast.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/fft.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/load.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/metal.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/mlx_func.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/ops.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/stream.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/transforms.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/random.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/linalg.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/constants.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/trees.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp)

# When MLX_BUILD_PYTHON_STUBS is set, register the ``core_stub`` target that
# generates the type stubs (.pyi files) for ``mlx.core`` into the python/mlx
# source directory.
if(MLX_BUILD_PYTHON_STUBS)
  nanobind_add_stub(
    core_stub
    # Run stubgen -m mlx.core -i python -p _stub_patterns.txt -o python/mlx
    RECURSIVE
    MODULE
    "mlx.core"
    PYTHON_PATH
    "$<TARGET_FILE_DIR:core>/.."
    "${CMAKE_CURRENT_SOURCE_DIR}/.."
    PATTERN_FILE
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/_stub_patterns.txt"
    OUTPUT_PATH
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx"
    # Note that the list is passed to cmake for dependency management and not
    # used by stubgen.
    OUTPUT
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/__init__.pyi"
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/cuda.pyi"
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/distributed.pyi"
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/fast.pyi"
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/fft.pyi"
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/linalg.pyi"
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/metal.pyi"
    "${CMAKE_CURRENT_SOURCE_DIR}/../mlx/core/random.pyi"
    # Make this an optional installable component.
    EXCLUDE_FROM_ALL
    INSTALL_TIME
    COMPONENT
    core_stub)
endif()

# Pick a default location for the built extension module unless the caller
# already set MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY: use
# CMAKE_LIBRARY_OUTPUT_DIRECTORY when it is defined and fall back to
# PROJECT_BINARY_DIR otherwise.
if(NOT MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY)
  if(NOT CMAKE_LIBRARY_OUTPUT_DIRECTORY)
    set(MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
  else()
    set(MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
  endif()
endif()

# Place the ``core`` module in MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY. The same
# directory is repeated for every build configuration so that all of them
# write to one location.
set_target_properties(
  core
  PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY}
             # Do not append a sub-dir for multi-config generators like MSVC
             # and XCode.
             LIBRARY_OUTPUT_DIRECTORY_RELEASE
             ${MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY}
             LIBRARY_OUTPUT_DIRECTORY_DEBUG
             ${MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY}
             LIBRARY_OUTPUT_DIRECTORY_RELWITHDEBINFO
             ${MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY}
             LIBRARY_OUTPUT_DIRECTORY_MINSIZEREL
             ${MLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY})

# Link the bindings against the mlx library target.
target_link_libraries(core PRIVATE mlx)

# For shared-library builds, point the install rpath of ``core`` at a ``lib``
# directory next to the module: ``@loader_path/lib`` on Darwin and
# ``$ORIGIN/lib`` on other systems.
if(BUILD_SHARED_LIBS)
  if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
    set_target_properties(core PROPERTIES INSTALL_RPATH "@loader_path/lib")
  else()
    set_target_properties(core PROPERTIES INSTALL_RPATH "\$ORIGIN/lib")
  endif()
  # Do not add build dir to rpath.
  set_target_properties(core PROPERTIES BUILD_WITH_INSTALL_RPATH ON)
endif()


================================================
FILE: python/src/array.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <cstdint>
#include <cstring>
#include <sstream>

#include <nanobind/ndarray.h>
#include <nanobind/stl/complex.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>
#include <nanobind/typing.h>

#include "mlx/backend/metal/metal.h"
#include "python/src/buffer.h"
#include "python/src/convert.h"
#include "python/src/indexing.h"
#include "python/src/small_vector.h"
#include "python/src/utils.h"

#include "mlx/mlx.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Helper returned by the Python ``array.at`` property. It keeps the source
// array together with the indices chosen through ``__getitem__`` and forwards
// each update operation to the matching ``mlx_*_item`` function.
class ArrayAt {
 public:
  ArrayAt(mx::array x) : x_(std::move(x)) {}

  // Remember the indices to update and return this object so that an update
  // method can be chained directly, e.g. ``x.at[idx].add(y)``.
  ArrayAt& set_indices(nb::object indices) {
    indices_ = indices;
    initialized_ = true;
    return *this;
  }

  // Throws std::invalid_argument when no indices have been set yet.
  void check_initialized() {
    if (initialized_) {
      return;
    }
    throw std::invalid_argument(
        "Must give indices to array.at (e.g. `x.at[0].add(4)`).");
  }

  // Every update below first verifies that indices were provided and then
  // returns the result of the corresponding item update.
  mx::array add(const ScalarOrArray& v) {
    check_initialized();
    return mlx_add_item(x_, indices_, v);
  }

  mx::array subtract(const ScalarOrArray& v) {
    check_initialized();
    return mlx_subtract_item(x_, indices_, v);
  }

  mx::array multiply(const ScalarOrArray& v) {
    check_initialized();
    return mlx_multiply_item(x_, indices_, v);
  }

  mx::array divide(const ScalarOrArray& v) {
    check_initialized();
    return mlx_divide_item(x_, indices_, v);
  }

  mx::array maximum(const ScalarOrArray& v) {
    check_initialized();
    return mlx_maximum_item(x_, indices_, v);
  }

  mx::array minimum(const ScalarOrArray& v) {
    check_initialized();
    return mlx_minimum_item(x_, indices_, v);
  }

 private:
  // Array the updates are applied to.
  mx::array x_;
  // Set once ``set_indices`` has been called.
  bool initialized_{false};
  // Python object describing the indices to update.
  nb::object indices_;
};

// Iterator object handed to Python by ``array.__iter__``; it walks over the
// first dimension of the array.
class ArrayPythonIterator {
 public:
  ArrayPythonIterator(mx::array x) : idx_(0), x_(std::move(x)) {
    // Arrays with 1 to 9 entries along the first axis are split once up
    // front; longer (or empty) arrays are indexed element by element in
    // next().
    const auto length = x_.shape(0);
    if (length > 0 && length < 10) {
      splits_ = mx::split(x_, length);
    }
  }

  // Return the next slice along the first axis, or signal the end of the
  // iteration by throwing nb::stop_iteration.
  mx::array next() {
    if (idx_ >= x_.shape(0)) {
      throw nb::stop_iteration();
    }

    const bool has_split =
        idx_ >= 0 && static_cast<size_t>(idx_) < splits_.size();
    if (has_split) {
      // Each split keeps a leading axis of size one; drop it.
      return mx::squeeze(splits_[idx_++], 0);
    }

    return *(x_.begin() + idx_++);
  }

 private:
  // Position of the next element to return.
  int idx_;
  // Array being iterated.
  mx::array x_;
  // Pre-computed per-element splits; left empty unless filled in the
  // constructor.
  std::vector<mx::array> splits_;
};

void init_array(nb::module_& m) {
  // Set Python print formatting options
  mx::get_global_formatter().capitalize_bool = true;

  // Types
  nb::class_<mx::Dtype>(
      m,
      "Dtype",
      R"pbdoc(
      An object to hold the type of a :class:`array`.

      See the :ref:`list of types <data_types>` for more details
      on available data types.
      )pbdoc")
      .def_prop_ro(
          "size", &mx::Dtype::size, R"pbdoc(Size of the type in bytes.)pbdoc")
      .def(
          "__repr__",
          [](const mx::Dtype& t) {
            std::ostringstream os;
            os << "mlx.core.";
            os << t;
            return os.str();
          })
      .def(
          "__eq__",
          [](const mx::Dtype& t, const nb::object& other) {
            return nb::isinstance<mx::Dtype>(other) &&
                t == nb::cast<mx::Dtype>(other);
          })
      .def("__hash__", [](const mx::Dtype& t) {
        return static_cast<int64_t>(t.val());
      });

  m.attr("bool_") = nb::cast(mx::bool_);
  m.attr("uint8") = nb::cast(mx::uint8);
  m.attr("uint16") = nb::cast(mx::uint16);
  m.attr("uint32") = nb::cast(mx::uint32);
  m.attr("uint64") = nb::cast(mx::uint64);
  m.attr("int8") = nb::cast(mx::int8);
  m.attr("int16") = nb::cast(mx::int16);
  m.attr("int32") = nb::cast(mx::int32);
  m.attr("int64") = nb::cast(mx::int64);
  m.attr("float16") = nb::cast(mx::float16);
  m.attr("float32") = nb::cast(mx::float32);
  m.attr("float64") = nb::cast(mx::float64);
  m.attr("bfloat16") = nb::cast(mx::bfloat16);
  m.attr("complex64") = nb::cast(mx::complex64);
  nb::enum_<mx::Dtype::Category>(
      m,
      "DtypeCategory",
      R"pbdoc(
      Type to hold categories of :class:`dtypes <Dtype>`.

      * :attr:`~mlx.core.generic`

        * :ref:`bool_ <data_types>`
        * :attr:`~mlx.core.number`

          * :attr:`~mlx.core.integer`

            * :attr:`~mlx.core.unsignedinteger`

              * :ref:`uint8 <data_types>`
              * :ref:`uint16 <data_types>`
              * :ref:`uint32 <data_types>`
              * :ref:`uint64 <data_types>`

            * :attr:`~mlx.core.signedinteger`

              * :ref:`int8 <data_types>`
              * :ref:`int32 <data_types>`
              * :ref:`int64 <data_types>`

          * :attr:`~mlx.core.inexact`

            * :attr:`~mlx.core.floating`

              * :ref:`float16 <data_types>`
              * :ref:`bfloat16 <data_types>`
              * :ref:`float32 <data_types>`
              * :ref:`float64 <data_types>`

            * :attr:`~mlx.core.complexfloating`

              * :ref:`complex64 <data_types>`

      See also :func:`~mlx.core.issubdtype`.
      )pbdoc")
      .value("complexfloating", mx::complexfloating)
      .value("floating", mx::floating)
      .value("inexact", mx::inexact)
      .value("signedinteger", mx::signedinteger)
      .value("unsignedinteger", mx::unsignedinteger)
      .value("integer", mx::integer)
      .value("number", mx::number)
      .value("generic", mx::generic)
      .export_values();

  nb::class_<mx::finfo>(
      m,
      "finfo",
      R"pbdoc(
      Get information on floating-point types.
      )pbdoc")
      .def(nb::init<mx::Dtype>())
      .def_ro(
          "min",
          &mx::finfo::min,
          R"pbdoc(The smallest representable number.)pbdoc")
      .def_ro(
          "max",
          &mx::finfo::max,
          R"pbdoc(The largest representable number.)pbdoc")
      .def_ro(
          "eps",
          &mx::finfo::eps,
          R"pbdoc(
            The difference between 1.0 and the next smallest
            representable number larger than 1.0.
          )pbdoc")
      .def_ro("dtype", &mx::finfo::dtype, R"pbdoc(The :obj:`Dtype`.)pbdoc")
      .def("__repr__", [](const mx::finfo& f) {
        std::ostringstream os;
        os << "finfo("
           << "min=" << f.min << ", max=" << f.max << ", dtype=" << f.dtype
           << ")";
        return os.str();
      });

  nb::class_<mx::iinfo>(
      m,
      "iinfo",
      R"pbdoc(
      Get information on integer types.
      )pbdoc")
      .def(nb::init<mx::Dtype>())
      .def_ro(
          "min",
          &mx::iinfo::min,
          R"pbdoc(The smallest representable number.)pbdoc")
      .def_ro(
          "max",
          &mx::iinfo::max,
          R"pbdoc(The largest representable number.)pbdoc")
      .def_ro("dtype", &mx::iinfo::dtype, R"pbdoc(The :obj:`Dtype`.)pbdoc")
      .def("__repr__", [](const mx::iinfo& i) {
        std::ostringstream os;
        os << "iinfo("
           << "min=" << i.min << ", max=" << i.max << ", dtype=" << i.dtype
           << ")";
        return os.str();
      });

  nb::class_<ArrayAt>(
      m,
      "ArrayAt",
      R"pbdoc(
      A helper object to apply updates at specific indices.
      )pbdoc")
      .def("__getitem__", &ArrayAt::set_indices, "indices"_a.none())
      .def("add", &ArrayAt::add, "value"_a)
      .def("subtract", &ArrayAt::subtract, "value"_a)
      .def("multiply", &ArrayAt::multiply, "value"_a)
      .def("divide", &ArrayAt::divide, "value"_a)
      .def("maximum", &ArrayAt::maximum, "value"_a)
      .def("minimum", &ArrayAt::minimum, "value"_a);

  nb::class_<ArrayLike>(
      m,
      "ArrayLike",
      R"pbdoc(
        Any Python object which has an ``__mlx__array__`` method that
        returns an :obj:`array`.
      )pbdoc")
      .def(nb::init_implicit<nb::object>());

  nb::class_<ArrayPythonIterator>(
      m,
      "ArrayIterator",
      R"pbdoc(
      A helper object to iterate over the 1st dimension of an array.
      )pbdoc")
      .def("__next__", &ArrayPythonIterator::next)
      .def("__iter__", [](const ArrayPythonIterator& it) { return it; });

  // Install buffer protocol functions
  PyType_Slot array_slots[] = {
      {Py_bf_getbuffer, (void*)getbuffer},
      {Py_bf_releasebuffer, (void*)releasebuffer},
      {0, nullptr}};

  nb::class_<mx::array>(
      m,
      "array",
      R"pbdoc(An N-dimensional array object.)pbdoc",
      nb::type_slots(array_slots),
      nb::is_weak_referenceable())
      .def(
          "__init__",
          [](mx::array* aptr, ArrayInitType v, std::optional<mx::Dtype> t) {
            new (aptr) mx::array(create_array(v, t));
          },
          "val"_a,
          "dtype"_a = nb::none(),
          nb::sig(
              "def __init__(self: array, val: Union[scalar, list, tuple, numpy.ndarray, array], dtype: Optional[Dtype] = None)"))
      .def_prop_ro(
          "size",
          &mx::array::size,
          R"pbdoc(Number of elements in the array.)pbdoc")
      .def_prop_ro(
          "ndim", &mx::array::ndim, R"pbdoc(The array's dimension.)pbdoc")
      .def_prop_ro(
          "itemsize",
          &mx::array::itemsize,
          R"pbdoc(The size of the array's datatype in bytes.)pbdoc")
      .def_prop_ro(
          "nbytes",
          &mx::array::nbytes,
          R"pbdoc(The number of bytes in the array.)pbdoc")
      .def_prop_ro(
          "shape",
          [](const mx::array& a) { return nb::cast(a.shape()); },
          nb::sig("def shape(self) -> tuple[int, ...]"),
          R"pbdoc(
          The shape of the array as a Python tuple.

          Returns:
            tuple(int): A tuple containing the sizes of each dimension.
        )pbdoc")
      .def_prop_ro(
          "dtype",
          &mx::array::dtype,
          R"pbdoc(
            The array's :class:`Dtype`.
          )pbdoc")
      .def_prop_ro(
          "real",
          [](const mx::array& a) { return mx::real(a); },
          R"pbdoc(
            The real part of a complex array.
          )pbdoc")
      .def_prop_ro(
          "imag",
          [](const mx::array& a) { return mx::imag(a); },
          R"pbdoc(
            The imaginary part of a complex array.
          )pbdoc")
      .def(
          "item",
          &to_scalar,
          nb::sig("def item(self) -> scalar"),
          R"pbdoc(
            Access the value of a scalar array.

            Returns:
                Standard Python scalar.
          )pbdoc")
      .def(
          "tolist",
          &tolist,
          nb::sig("def tolist(self) -> list_or_scalar"),
          R"pbdoc(
            Convert the array to a Python :class:`list`.

            Returns:
                list: The Python list.

                If the array is a scalar then a standard Python scalar is returned.

                If the array has more than one dimension then the result is a nested
                list of lists.

                The value type of the list corresponding to the last dimension is either
                ``bool``, ``int`` or ``float`` depending on the ``dtype`` of the array.
          )pbdoc")
      .def(
          "astype",
          &mx::astype,
          "dtype"_a,
          "stream"_a = nb::none(),
          R"pbdoc(
            Cast the array to a specified type.

            Args:
                dtype (Dtype): Type to which the array is cast.
                stream (Stream): Stream (or device) for the operation.

            Returns:
                array: The array with type ``dtype``.
          )pbdoc")
      .def(
          "__array_namespace__",
          [](const mx::array& a,
             const std::optional<std::string>& api_version) {
            if (api_version) {
              throw std::invalid_argument(
                  "Explicitly specifying api_version is not yet implemented.");
            }
            return nb::module_::import_("mlx.core");
          },
          "api_version"_a = nb::none(),
          R"pbdoc(
            Returns an object that has all the array API functions on it.

            See the `Python array API <https://data-apis.org/array-api/latest/index.html>`_
            for more information.

            Args:
                api_version (str, optional): String representing the version
                  of the array API spec to return. Default: ``None``.

            Returns:
                out (Any): An object representing the array API namespace.
          )pbdoc")
      .def("__getitem__", mlx_get_item, nb::arg().none())
      .def("__setitem__", mlx_set_item, nb::arg().none(), nb::arg())
      .def_prop_ro(
          "at",
          [](const mx::array& a) { return ArrayAt(a); },
          R"pbdoc(
            Used to apply updates at the given indices.

            .. note::

               Regular in-place updates map to assignment. For instance ``x[idx] += y``
               maps to ``x[idx] = x[idx] + y``. As a result, assigning to the
               same index ignores all but one update. Using ``x.at[idx].add(y)``
               will correctly apply all updates to all indices.

            .. list-table::
               :header-rows: 1

               * - array.at syntax
                 - In-place syntax
               * - ``x = x.at[idx].add(y)``
                 - ``x[idx] += y``
               * - ``x = x.at[idx].subtract(y)``
                 - ``x[idx] -= y``
               * - ``x = x.at[idx].multiply(y)``
                 - ``x[idx] *= y``
               * - ``x = x.at[idx].divide(y)``
                 - ``x[idx] /= y``
               * - ``x = x.at[idx].maximum(y)``
                 - ``x[idx] = mx.maximum(x[idx], y)``
               * - ``x = x.at[idx].minimum(y)``
                 - ``x[idx] = mx.minimum(x[idx], y)``

            Example:
                >>> a = mx.array([0, 0])
                >>> idx = mx.array([0, 1, 0, 1])
                >>> a[idx] += 1
                >>> a
                array([1, 1], dtype=int32)
                >>>
                >>> a = mx.array([0, 0])
                >>> a.at[idx].add(1)
                array([2, 2], dtype=int32)
          )pbdoc")
      .def(
          "__len__",
          [](const mx::array& a) {
            if (a.ndim() == 0) {
              throw nb::type_error("len() 0-dimensional array.");
            }
            return a.shape(0);
          })
      .def(
          "__iter__", [](const mx::array& a) { return ArrayPythonIterator(a); })
      .def(
          "__getstate__",
          [](const mx::array& a) {
            auto nd = (a.dtype() == mx::bfloat16)
                ? mlx_to_np_array(mx::view(a, mx::uint16))
                : mlx_to_np_array(a);
            return nb::make_tuple(nd, static_cast<uint8_t>(a.dtype().val()));
          })
      .def(
          "__setstate__",
          [](mx::array& arr, const nb::tuple& state) {
            if (nb::len(state) != 2) {
              throw std::invalid_argument(
                  "Invalid pickle state: expected (ndarray, Dtype::Val)");
            }
            using ND = nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu>;
            ND nd = nb::cast<ND>(state[0]);
            auto val = static_cast<mx::Dtype::Val>(nb::cast<uint8_t>(state[1]));
            if (val == mx::Dtype::Val::bfloat16) {
              auto owner = nb::handle(state[0].ptr());
              new (&arr) mx::array(nd_array_to_mlx(
                  ND(nd.data(),
                     nd.ndim(),
                     reinterpret_cast<const size_t*>(nd.shape_ptr()),
                     owner,
                     nullptr,
                     nb::bfloat16),
                  mx::bfloat16));
            } else {
              new (&arr) mx::array(nd_array_to_mlx(nd, std::nullopt));
            }
          })
      .def("__dlpack__", [](const mx::array& a) { return mlx_to_dlpack(a); })
      .def(
          "__dlpack_device__",
          [](const mx::array& a) {
            // See
            // https://github.com/dmlc/dlpack/blob/5c210da409e7f1e51ddf445134a4376fdbd70d7d/include/dlpack/dlpack.h#L74
            if (mx::metal::is_available()) {
              return nb::make_tuple(8, 0);
            } else if (mx::cu::is_available()) {
              return nb::make_tuple(13, 0);
            } else {
              // CPU device
              return nb::make_tuple(1, 0);
            }
          })
      .def("__copy__", [](const mx::array& self) { return mx::array(self); })
      .def(
          "__deepcopy__",
          [](const mx::array& self, nb::dict) { return mx::array(self); },
          "memo"_a)
      .def(
          "__add__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("addition", v);
            }
            auto b = to_array(v, a.dtype());
            return mx::add(a, b);
          },
          "other"_a)
      .def(
          "__iadd__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace addition", v);
            }
            a.overwrite_descriptor(mx::add(a, to_array(v, a.dtype())));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__radd__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("addition", v);
            }
            return mx::add(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__sub__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("subtraction", v);
            }
            return mx::subtract(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__isub__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace subtraction", v);
            }
            a.overwrite_descriptor(mx::subtract(a, to_array(v, a.dtype())));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__rsub__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("subtraction", v);
            }
            return mx::subtract(to_array(v, a.dtype()), a);
          },
          "other"_a)
      .def(
          "__mul__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("multiplication", v);
            }
            return mx::multiply(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__imul__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace multiplication", v);
            }
            a.overwrite_descriptor(mx::multiply(a, to_array(v, a.dtype())));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__rmul__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("multiplication", v);
            }
            return mx::multiply(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__truediv__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("division", v);
            }
            return mx::divide(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__itruediv__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace division", v);
            }
            if (!mx::issubdtype(a.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "In place division cannot cast to non-floating point type.");
            }
            a.overwrite_descriptor(divide(a, to_array(v, a.dtype())));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__rtruediv__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("division", v);
            }
            return mx::divide(to_array(v, a.dtype()), a);
          },
          "other"_a)
      .def(
          "__div__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("division", v);
            }
            return mx::divide(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__rdiv__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("division", v);
            }
            return mx::divide(to_array(v, a.dtype()), a);
          },
          "other"_a)
      .def(
          "__floordiv__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("floor division", v);
            }
            return mx::floor_divide(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__ifloordiv__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace floor division", v);
            }
            a.overwrite_descriptor(mx::floor_divide(a, to_array(v, a.dtype())));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__rfloordiv__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("floor division", v);
            }
            auto b = to_array(v, a.dtype());
            return mx::floor_divide(b, a);
          },
          "other"_a)
      .def(
          "__mod__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("modulus", v);
            }
            return mx::remainder(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__imod__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace modulus", v);
            }
            a.overwrite_descriptor(mx::remainder(a, to_array(v, a.dtype())));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__rmod__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("modulus", v);
            }
            return mx::remainder(to_array(v, a.dtype()), a);
          },
          "other"_a)
      .def(
          "__eq__",
          [](const mx::array& a,
             const ScalarOrArray& v) -> std::variant<mx::array, bool> {
            if (!is_comparable_with_array(v)) {
              return false;
            }
            return mx::equal(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__lt__",
          [](const mx::array& a, const ScalarOrArray v) -> mx::array {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("less than", v);
            }
            return mx::less(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__le__",
          [](const mx::array& a, const ScalarOrArray v) -> mx::array {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("less than or equal", v);
            }
            return mx::less_equal(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__gt__",
          [](const mx::array& a, const ScalarOrArray v) -> mx::array {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("greater than", v);
            }
            return mx::greater(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__ge__",
          [](const mx::array& a, const ScalarOrArray v) -> mx::array {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("greater than or equal", v);
            }
            return mx::greater_equal(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__ne__",
          [](const mx::array& a,
             const ScalarOrArray v) -> std::variant<mx::array, bool> {
            if (!is_comparable_with_array(v)) {
              return true;
            }
            return mx::not_equal(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def("__neg__", [](const mx::array& a) { return -a; })
      .def("__bool__", [](mx::array& a) { return nb::bool_(to_scalar(a)); })
      .def(
          "__repr__",
          [](mx::array& a) {
            nb::gil_scoped_release nogil;
            std::ostringstream os;
            os << a;
            return os.str();
          })
      .def(
          "__matmul__",
          [](const mx::array& a, mx::array& other) {
            return mx::matmul(a, other);
          },
          "other"_a)
      .def(
          "__imatmul__",
          [](mx::array& a, mx::array& other) -> mx::array& {
            a.overwrite_descriptor(mx::matmul(a, other));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__pow__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("power", v);
            }
            return mx::power(a, to_array(v, a.dtype()));
          },
          "other"_a)
      .def(
          "__rpow__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("power", v);
            }
            return mx::power(to_array(v, a.dtype()), a);
          },
          "other"_a)
      .def(
          "__ipow__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace power", v);
            }
            a.overwrite_descriptor(mx::power(a, to_array(v, a.dtype())));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__invert__",
          [](const mx::array& a) {
            if (mx::issubdtype(a.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with bitwise inversion.");
            }
            if (a.dtype() == mx::bool_) {
              return mx::logical_not(a);
            }
            return mx::bitwise_invert(a);
          })
      .def(
          "__and__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("bitwise and", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with bitwise and.");
            }
            return mx::bitwise_and(a, b);
          },
          "other"_a)
      .def(
          "__iand__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace bitwise and", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with bitwise and.");
            }
            a.overwrite_descriptor(mx::bitwise_and(a, b));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__or__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("bitwise or", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with bitwise or.");
            }
            return mx::bitwise_or(a, b);
          },
          "other"_a)
      .def(
          "__ior__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace bitwise or", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with bitwise or.");
            }
            a.overwrite_descriptor(mx::bitwise_or(a, b));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__lshift__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("left shift", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with left shift.");
            }
            return mx::left_shift(a, b);
          },
          "other"_a)
      .def(
          "__ilshift__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace left shift", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with left shift.");
            }
            a.overwrite_descriptor(mx::left_shift(a, b));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__rshift__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("right shift", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with right shift.");
            }
            return mx::right_shift(a, b);
          },
          "other"_a)
      .def(
          "__irshift__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace right shift", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with right shift.");
            }
            a.overwrite_descriptor(mx::right_shift(a, b));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def(
          "__xor__",
          [](const mx::array& a, const ScalarOrArray v) {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("bitwise xor", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed with bitwise xor.");
            }
            return mx::bitwise_xor(a, b);
          },
          "other"_a)
      .def(
          "__ixor__",
          [](mx::array& a, const ScalarOrArray v) -> mx::array& {
            if (!is_comparable_with_array(v)) {
              throw_invalid_operation("inplace bitwise xor", v);
            }
            auto b = to_array(v, a.dtype());
            if (mx::issubdtype(a.dtype(), mx::inexact) ||
                mx::issubdtype(b.dtype(), mx::inexact)) {
              throw std::invalid_argument(
                  "Floating point types not allowed bitwise xor.");
            }
            a.overwrite_descriptor(mx::bitwise_xor(a, b));
            return a;
          },
          "other"_a,
          nb::rv_policy::none)
      .def("__int__", [](mx::array& a) { return nb::int_(to_scalar(a)); })
      .def("__float__", [](mx::array& a) { return nb::float_(to_scalar(a)); })
      .def(
          "__format__",
          [](mx::array& a, nb::object format_spec) {
            if (nb::len(nb::str(format_spec)) > 0 && a.ndim() > 0) {
              throw nb::type_error(
                  "unsupported format string passed to mx.array.__format__");
            } else if (a.ndim() == 0) {
              auto obj = to_scalar(a);
              return nb::cast<std::string>(
                  nb::handle(PyObject_Format(obj.ptr(), format_spec.ptr())));
            } else {
              nb::gil_scoped_release nogil;
              std::ostringstream os;
              os << a;
              return os.str();
            }
          })
      .def(
          "flatten",
          [](const mx::array& a,
             int start_axis,
             int end_axis,
             const mx::StreamOrDevice& s) {
            return mx::flatten(a, start_axis, end_axis, s);
          },
          "start_axis"_a = 0,
          "end_axis"_a = -1,
          nb::kw_only(),
          "stream"_a = nb::none(),
          R"pbdoc(
            See :func:`flatten`.
          )pbdoc")
      .def(
          "reshape",
          [](const mx::array& a, nb::args shape_, mx::StreamOrDevice s) {
            mx::Shape shape;
            if (!nb::isinstance<int>(shape_[0])) {
              shape = nb::cast<mx::Shape>(shape_[0]);
            } else {
              shape = nb::cast<mx::Shape>(shape_);
            }
            return mx::reshape(a, std::move(shape), s);
          },
          "shape"_a,
          "stream"_a = nb::none(),
          R"pbdoc(
            Equivalent to :func:`reshape` but the shape can be passed either as a
            :obj:`tuple` or as separate arguments.

            See :func:`reshape` for full documentation.
          )pbdoc")
      .def(
          "squeeze",
          [](const mx::array& a,
             const IntOrVec& v,
             const mx::StreamOrDevice& s) {
            if (std::holds_alternative<std::monostate>(v)) {
              return mx::squeeze(a, s);
            } else if (auto pv = std::get_if<int>(&v); pv) {
              return mx::squeeze(a, *pv, s);
            } else {
              return mx::squeeze(a, std::get<std::vector<int>>(v), s);
            }
          },
          "axis"_a = nb::none(),
          nb::kw_only(),
          "stream"_a = nb::none(),
          R"pbdoc(
            See :func:`squeeze`.
          )pbdoc")
      .def(
          "abs",
          &mx::abs,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`abs`.")
      .def(
          "__abs__",
          [](const mx::array& a) { return mx::abs(a); },
          "See :func:`abs`.")
      .def(
          "square",
          &mx::square,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`square`.")
      .def(
          "sqrt",
          &mx::sqrt,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`sqrt`.")
      .def(
          "rsqrt",
          &mx::rsqrt,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`rsqrt`.")
      .def(
          "reciprocal",
          &mx::reciprocal,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`reciprocal`.")
      .def(
          "exp",
          &mx::exp,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`exp`.")
      .def(
          "log",
          &mx::log,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`log`.")
      .def(
          "log2",
          &mx::log2,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`log2`.")
      .def(
          "log10",
          &mx::log10,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`log10`.")
      .def(
          "sin",
          &mx::sin,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`sin`.")
      .def(
          "cos",
          &mx::cos,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`cos`.")
      .def(
          "log1p",
          &mx::log1p,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`log1p`.")
      .def(
          "all",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::all(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`all`.")
      .def(
          "any",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::any(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`any`.")
      .def(
          "moveaxis",
          &mx::moveaxis,
          "source"_a,
          "destination"_a,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`moveaxis`.")
      .def(
          "swapaxes",
          &mx::swapaxes,
          "axis1"_a,
          "axis2"_a,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`swapaxes`.")
      .def(
          "transpose",
          [](const mx::array& a, nb::args axes_, mx::StreamOrDevice s) {
            if (axes_.size() == 0) {
              return mx::transpose(a, s);
            }
            std::vector<int> axes;
            if (!nb::isinstance<int>(axes_[0])) {
              axes = nb::cast<std::vector<int>>(axes_[0]);
            } else {
              axes = nb::cast<std::vector<int>>(axes_);
            }
            return mx::transpose(a, axes, s);
          },
          "axes"_a,
          "stream"_a = nb::none(),
          R"pbdoc(
            Equivalent to :func:`transpose` but the axes can be passed either as
            a tuple or as separate arguments.

            See :func:`transpose` for full documentation.
          )pbdoc")
      .def_prop_ro(
          "T",
          [](const mx::array& a) { return mx::transpose(a); },
          "Equivalent to calling ``self.transpose()`` with no arguments.")
      .def(
          "sum",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::sum(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`sum`.")
      .def(
          "prod",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::prod(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`prod`.")
      .def(
          "min",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::min(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`min`.")
      .def(
          "max",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::max(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`max`.")
      .def(
          "logcumsumexp",
          [](const mx::array& a,
             std::optional<int> axis,
             bool reverse,
             bool inclusive,
             mx::StreamOrDevice s) {
            if (axis) {
              return mx::logcumsumexp(a, *axis, reverse, inclusive, s);
            } else {
              return mx::logcumsumexp(a, reverse, inclusive, s);
            }
          },
          "axis"_a = nb::none(),
          nb::kw_only(),
          "reverse"_a = false,
          "inclusive"_a = true,
          "stream"_a = nb::none(),
          "See :func:`logcumsumexp`.")
      .def(
          "logsumexp",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::logsumexp(
                a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`logsumexp`.")
      .def(
          "mean",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            return mx::mean(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`mean`.")
      .def(
          "std",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             int ddof,
             mx::StreamOrDevice s) {
            return mx::std(
                a, get_reduce_axes(axis, a.ndim()), keepdims, ddof, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          "ddof"_a = 0,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`std`.")
      .def(
          "var",
          [](const mx::array& a,
             const IntOrVec& axis,
             bool keepdims,
             int ddof,
             mx::StreamOrDevice s) {
            return mx::var(
                a, get_reduce_axes(axis, a.ndim()), keepdims, ddof, s);
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          "ddof"_a = 0,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`var`.")
      .def(
          "split",
          [](const mx::array& a,
             const std::variant<int, mx::Shape>& indices_or_sections,
             int axis,
             mx::StreamOrDevice s) {
            if (auto pv = std::get_if<int>(&indices_or_sections); pv) {
              return mx::split(a, *pv, axis, s);
            } else {
              return mx::split(
                  a, std::get<mx::Shape>(indices_or_sections), axis, s);
            }
          },
          "indices_or_sections"_a,
          "axis"_a = 0,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`split`.")
      .def(
          "argmin",
          [](const mx::array& a,
             std::optional<int> axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            if (axis) {
              return mx::argmin(a, *axis, keepdims, s);
            } else {
              return mx::argmin(a, keepdims, s);
            }
          },
          "axis"_a = std::nullopt,
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`argmin`.")
      .def(
          "argmax",
          [](const mx::array& a,
             std::optional<int> axis,
             bool keepdims,
             mx::StreamOrDevice s) {
            if (axis) {
              return mx::argmax(a, *axis, keepdims, s);
            } else {
              return mx::argmax(a, keepdims, s);
            }
          },
          "axis"_a = nb::none(),
          "keepdims"_a = false,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`argmax`.")
      .def(
          "cumsum",
          [](const mx::array& a,
             std::optional<int> axis,
             bool reverse,
             bool inclusive,
             mx::StreamOrDevice s) {
            if (axis) {
              return mx::cumsum(a, *axis, reverse, inclusive, s);
            } else {
              return mx::cumsum(a, reverse, inclusive, s);
            }
          },
          "axis"_a = nb::none(),
          nb::kw_only(),
          "reverse"_a = false,
          "inclusive"_a = true,
          "stream"_a = nb::none(),
          "See :func:`cumsum`.")
      .def(
          "cumprod",
          [](const mx::array& a,
             std::optional<int> axis,
             bool reverse,
             bool inclusive,
             mx::StreamOrDevice s) {
            if (axis) {
              return mx::cumprod(a, *axis, reverse, inclusive, s);
            } else {
              return mx::cumprod(a, reverse, inclusive, s);
            }
          },
          "axis"_a = nb::none(),
          nb::kw_only(),
          "reverse"_a = false,
          "inclusive"_a = true,
          "stream"_a = nb::none(),
          "See :func:`cumprod`.")
      .def(
          "cummax",
          [](const mx::array& a,
             std::optional<int> axis,
             bool reverse,
             bool inclusive,
             mx::StreamOrDevice s) {
            if (axis) {
              return mx::cummax(a, *axis, reverse, inclusive, s);
            } else {
              return mx::cummax(a, reverse, inclusive, s);
            }
          },
          "axis"_a = nb::none(),
          nb::kw_only(),
          "reverse"_a = false,
          "inclusive"_a = true,
          "stream"_a = nb::none(),
          "See :func:`cummax`.")
      .def(
          "cummin",
          [](const mx::array& a,
             std::optional<int> axis,
             bool reverse,
             bool inclusive,
             mx::StreamOrDevice s) {
            if (axis) {
              return mx::cummin(a, *axis, reverse, inclusive, s);
            } else {
              return mx::cummin(a, reverse, inclusive, s);
            }
          },
          "axis"_a = nb::none(),
          nb::kw_only(),
          "reverse"_a = false,
          "inclusive"_a = true,
          "stream"_a = nb::none(),
          "See :func:`cummin`.")
      .def(
          "round",
          [](const mx::array& a, int decimals, mx::StreamOrDevice s) {
            return mx::round(a, decimals, s);
          },
          "decimals"_a = 0,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`round`.")
      .def(
          "diagonal",
          [](const mx::array& a,
             int offset,
             int axis1,
             int axis2,
             mx::StreamOrDevice s) {
            return mx::diagonal(a, offset, axis1, axis2, s);
          },
          "offset"_a = 0,
          "axis1"_a = 0,
          "axis2"_a = 1,
          "stream"_a = nb::none(),
          "See :func:`diagonal`.")
      .def(
          "diag",
          [](const mx::array& a, int k, mx::StreamOrDevice s) {
            return mx::diag(a, k, s);
          },
          "k"_a = 0,
          nb::kw_only(),
          "stream"_a = nb::none(),
          R"pbdoc(
            Extract a diagonal or construct a diagonal matrix.
        )pbdoc")
      .def(
          "conj",
          [](const mx::array& a, mx::StreamOrDevice s) {
            return mx::conjugate(to_array(a), s);
          },
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`conj`.")
      .def(
          "view",
          [](const ScalarOrArray& a,
             const mx::Dtype& dtype,
             mx::StreamOrDevice s) { return mx::view(to_array(a), dtype, s); },
          "dtype"_a,
          nb::kw_only(),
          "stream"_a = nb::none(),
          "See :func:`view`.");
}


================================================
FILE: python/src/buffer.h
================================================
// Copyright © 2024 Apple Inc.
#pragma once
#include <cstring>
#include <exception>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <nanobind/nanobind.h>

#include "mlx/array.h"
#include "mlx/utils.h"

// Only defined in >= Python 3.9
// https://github.com/python/cpython/blob/f6cdc6b4a191b75027de342aa8b5d344fb31313e/Include/typeslots.h#L2-L3
#ifndef Py_bf_getbuffer
#define Py_bf_getbuffer 1
#define Py_bf_releasebuffer 2
#endif

namespace mx = mlx::core;
namespace nb = nanobind;

// Map an array's dtype to the format string advertised through the Python
// buffer protocol (struct-module style format characters).
//
// Declared `inline` because the definition lives in a header: without it,
// including this header from more than one translation unit produces
// multiple-definition link errors. The other functions in this header are
// already `inline`.
//
// Throws std::runtime_error for a dtype that has no mapping below.
inline std::string buffer_format(const mx::array& a) {
  // https://docs.python.org/3.10/library/struct.html#format-characters
  switch (a.dtype()) {
    case mx::bool_:
      return "?";
    case mx::uint8:
      return "B";
    case mx::uint16:
      return "H";
    case mx::uint32:
      return "I";
    case mx::uint64:
      return "Q";
    case mx::int8:
      return "b";
    case mx::int16:
      return "h";
    case mx::int32:
      return "i";
    case mx::int64:
      return "q";
    case mx::float16:
      return "e";
    case mx::float32:
      return "f";
    case mx::bfloat16:
      // NOTE(review): this is the same code as uint8. The struct module has
      // no bfloat16 character, so consumers cannot tell the two apart from
      // the format alone -- confirm this is the intended fallback.
      return "B";
    case mx::float64:
      return "d";
    case mx::complex64:
      // The std::string is built from a C string, so the explicit trailing
      // NUL is not part of the result: the returned format is "Zf".
      return "Zf\0";
    default: {
      std::ostringstream os;
      os << "bad dtype: " << a.dtype();
      throw std::runtime_error(os.str());
    }
  }
}

// Backing storage for the pointers handed out in a Py_buffer: the format
// string plus the shape and stride arrays. getbuffer() allocates one per
// export and releasebuffer() deletes it.
struct buffer_info {
  std::string format;
  std::vector<Py_ssize_t> shape;
  std::vector<Py_ssize_t> strides;

  // Takes ownership of all three pieces.
  buffer_info(
      std::string fmt,
      std::vector<Py_ssize_t> shape_in,
      std::vector<Py_ssize_t> strides_in)
      : format(std::move(fmt)),
        shape(std::move(shape_in)),
        strides(std::move(strides_in)) {}

  // Move-only: copying is disabled.
  buffer_info(const buffer_info&) = delete;
  buffer_info& operator=(const buffer_info&) = delete;

  buffer_info(buffer_info&& other) noexcept
      : format(std::move(other.format)),
        shape(std::move(other.shape)),
        strides(std::move(other.strides)) {}

  buffer_info& operator=(buffer_info&& rhs) noexcept {
    format = std::move(rhs.format);
    shape = std::move(rhs.shape);
    strides = std::move(rhs.strides);
    return *this;
  }
};

// bf_getbuffer slot: export an mlx array through the Python buffer protocol.
//
// The array is evaluated (with the GIL released) before its data pointer is
// exposed. Shape, strides (converted from elements to bytes) and the format
// string are stored in a heap-allocated buffer_info that is attached to
// `view->internal` and freed by releasebuffer().
//
// Returns 0 on success. On failure a Python exception is set, `view->obj` is
// left NULL and -1 is returned, as the buffer protocol requires. C++
// exceptions must not propagate out of this function because it is called
// directly from the CPython runtime.
extern "C" inline int getbuffer(PyObject* obj, Py_buffer* view, int flags) {
  std::memset(view, 0, sizeof(Py_buffer));

  try {
    auto a = nb::cast<mx::array>(nb::handle(obj));

    {
      // Evaluation can be long-running; do not hold the GIL meanwhile. The
      // GIL is re-acquired when this scope exits, including on exceptions.
      nb::gil_scoped_release nogil;
      a.eval();
    }

    std::vector<Py_ssize_t> shape(a.shape().begin(), a.shape().end());
    std::vector<Py_ssize_t> strides(a.strides().begin(), a.strides().end());
    // The copied strides are scaled by the item size because Py_buffer
    // expects byte strides.
    for (auto& s : strides) {
      s *= a.itemsize();
    }
    buffer_info* info =
        new buffer_info(buffer_format(a), std::move(shape), std::move(strides));

    // Nothing below throws, so `info` cannot leak.
    view->obj = obj;
    view->ndim = a.ndim();
    view->internal = info;
    view->buf = a.data<void>();
    view->itemsize = a.itemsize();
    view->len = a.nbytes();
    view->readonly = false;
    if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
      view->format = const_cast<char*>(info->format.c_str());
    }
    if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
      view->strides = info->strides.data();
      view->shape = info->shape.data();
    }
    // The view holds a new reference to the exporter.
    Py_INCREF(view->obj);
    return 0;
  } catch (nb::python_error& e) {
    // A Python exception captured by nanobind: hand it back unchanged.
    e.restore();
  } catch (const std::exception& e) {
    PyErr_SetString(PyExc_BufferError, e.what());
  } catch (...) {
    PyErr_SetString(
        PyExc_BufferError, "Unknown error while exporting array buffer.");
  }

  view->obj = nullptr;
  return -1;
}

// bf_releasebuffer slot: free the buffer_info that getbuffer() attached to
// the view through `view->internal`.
extern "C" inline void releasebuffer(PyObject*, Py_buffer* view) {
  auto* info = static_cast<buffer_info*>(view->internal);
  delete info;
}


================================================
FILE: python/src/constants.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <limits>

namespace nb = nanobind;

// Register the module-level numeric constants (`e`, `euler_gamma`, `inf`,
// `nan`, `pi`) and the `newaxis` alias for None on module `m`.
void init_constants(nb::module_& m) {
  m.attr("e") = 2.71828182845904523536028747135266249775724709369995;
  m.attr("euler_gamma") = 0.5772156649015328606065120900824024310421;
  m.attr("inf") = std::numeric_limits<double>::infinity();
  // Use <limits> (already included) rather than the NAN macro, which needs
  // <cmath> and was only available through transitive includes.
  m.attr("nan") = std::numeric_limits<double>::quiet_NaN();
  m.attr("newaxis") = nb::none();
  m.attr("pi") = 3.1415926535897932384626433;
}


================================================
FILE: python/src/convert.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <nanobind/stl/complex.h>

#include "python/src/convert.h"
#include "python/src/utils.h"

#include "mlx/utils.h"

// Category of Python scalar found while scanning a nested list/tuple.
// The numeric order matters: validate_shape() combines categories with
// std::max, so a larger value wins (bool < int < float < complex).
enum PyScalarT {
  pybool = 0,
  pyint = 1,
  pyfloat = 2,
  pycomplex = 3,
};

namespace nanobind {
// Describe mx::float16_t to nanobind's ndarray support as a signed,
// non-complex floating-point element type. nb::dtype<mx::float16_t>() is
// used further down in this file; presumably it relies on these traits.
template <>
struct ndarray_traits<mx::float16_t> {
  static constexpr bool is_complex = false;
  static constexpr bool is_float = true;
  static constexpr bool is_bool = false;
  static constexpr bool is_int = false;
  static constexpr bool is_signed = true;
};
}; // namespace nanobind

// Narrow a 64-bit dimension to `int`.
//
// Throws std::invalid_argument when `dim` cannot be represented as an `int`.
// Both ends of the range are checked so that an out-of-range negative value
// is rejected instead of being silently truncated by the cast.
int check_shape_dim(int64_t dim) {
  if (dim > std::numeric_limits<int>::max() ||
      dim < std::numeric_limits<int>::min()) {
    throw std::invalid_argument(
        "Shape dimension falls outside supported `int` range.");
  }
  return static_cast<int>(dim);
}

// Build an mlx array of the given shape and dtype from the contents of a
// contiguous ndarray, reading its buffer as elements of type T.
template <typename T>
mx::array nd_array_to_mlx_contiguous(
    nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu> nd_array,
    const mx::Shape& shape,
    mx::Dtype dtype) {
  // The array constructor makes its own copy of the numpy buffer, so only a
  // typed pointer to the source data is needed here.
  const T* typed_src = static_cast<const T*>(nd_array.data());
  return mx::array(typed_src, shape, dtype);
}

// Convert a read-only, C-contiguous CPU ndarray to an mlx array.
//
// The element type of the source selects how its buffer is read. `dtype`,
// when provided, is the dtype of the result; otherwise a default is chosen
// per source type (note that double maps to float32 and complex<double> to
// complex64).
//
// Throws std::invalid_argument when a dimension does not fit in `int` or the
// source element type is not handled.
mx::array nd_array_to_mlx(
    nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu> nd_array,
    std::optional<mx::Dtype> dtype) {
  // Gather the dimensions.
  const size_t rank = nd_array.ndim();
  mx::Shape shape;
  shape.reserve(rank);
  for (size_t axis = 0; axis < rank; ++axis) {
    shape.push_back(check_shape_dim(nd_array.shape(axis)));
  }

  // Dispatch on the source element type; each branch copies the data.
  const auto src_type = nd_array.dtype();

  if (src_type == nb::dtype<bool>()) {
    return nd_array_to_mlx_contiguous<bool>(
        nd_array, shape, dtype.value_or(mx::bool_));
  }
  if (src_type == nb::dtype<uint8_t>()) {
    return nd_array_to_mlx_contiguous<uint8_t>(
        nd_array, shape, dtype.value_or(mx::uint8));
  }
  if (src_type == nb::dtype<uint16_t>()) {
    return nd_array_to_mlx_contiguous<uint16_t>(
        nd_array, shape, dtype.value_or(mx::uint16));
  }
  if (src_type == nb::dtype<uint32_t>()) {
    return nd_array_to_mlx_contiguous<uint32_t>(
        nd_array, shape, dtype.value_or(mx::uint32));
  }
  if (src_type == nb::dtype<uint64_t>()) {
    return nd_array_to_mlx_contiguous<uint64_t>(
        nd_array, shape, dtype.value_or(mx::uint64));
  }
  if (src_type == nb::dtype<int8_t>()) {
    return nd_array_to_mlx_contiguous<int8_t>(
        nd_array, shape, dtype.value_or(mx::int8));
  }
  if (src_type == nb::dtype<int16_t>()) {
    return nd_array_to_mlx_contiguous<int16_t>(
        nd_array, shape, dtype.value_or(mx::int16));
  }
  if (src_type == nb::dtype<int32_t>()) {
    return nd_array_to_mlx_contiguous<int32_t>(
        nd_array, shape, dtype.value_or(mx::int32));
  }
  if (src_type == nb::dtype<int64_t>()) {
    return nd_array_to_mlx_contiguous<int64_t>(
        nd_array, shape, dtype.value_or(mx::int64));
  }
  if (src_type == nb::dtype<mx::float16_t>()) {
    return nd_array_to_mlx_contiguous<mx::float16_t>(
        nd_array, shape, dtype.value_or(mx::float16));
  }
  if (src_type == nb::bfloat16) {
    return nd_array_to_mlx_contiguous<mx::bfloat16_t>(
        nd_array, shape, dtype.value_or(mx::bfloat16));
  }
  if (src_type == nb::dtype<float>()) {
    return nd_array_to_mlx_contiguous<float>(
        nd_array, shape, dtype.value_or(mx::float32));
  }
  if (src_type == nb::dtype<double>()) {
    return nd_array_to_mlx_contiguous<double>(
        nd_array, shape, dtype.value_or(mx::float32));
  }
  if (src_type == nb::dtype<std::complex<float>>()) {
    return nd_array_to_mlx_contiguous<mx::complex64_t>(
        nd_array, shape, dtype.value_or(mx::complex64));
  }
  if (src_type == nb::dtype<std::complex<double>>()) {
    return nd_array_to_mlx_contiguous<mx::complex128_t>(
        nd_array, shape, dtype.value_or(mx::complex64));
  }

  throw std::invalid_argument("Cannot convert numpy array to mlx array.");
}

// Evaluate `a` and wrap its data in an nb::ndarray with element type T.
// `t`, when given, overrides the dtype reported for the elements; otherwise
// nb::dtype<T>() is used.
template <typename T, typename... NDParams>
nb::ndarray<NDParams...> mlx_to_nd_array_impl(
    mx::array a,
    std::optional<nb::dlpack::dtype> t = {}) {
  {
    // The GIL is released for the duration of the evaluation.
    nb::gil_scoped_release nogil;
    a.eval();
  }

  const auto& dims = a.shape();
  std::vector<size_t> extents(dims.begin(), dims.end());
  const nb::dlpack::dtype element_type = t.value_or(nb::dtype<T>());

  return nb::ndarray<NDParams...>(
      a.data<T>(),
      a.ndim(),
      extents.data(),
      /* owner= */ nb::none(),
      a.strides().data(),
      element_type);
}

// Pick the element type that matches the array's dtype and forward to
// mlx_to_nd_array_impl. Throws nb::type_error for bfloat16 and for any dtype
// without a case below.
template <typename... NDParams>
nb::ndarray<NDParams...> mlx_to_nd_array(const mx::array& a) {
  switch (a.dtype()) {
    // Boolean
    case mx::bool_:
      return mlx_to_nd_array_impl<bool, NDParams...>(a);

    // Unsigned integers
    case mx::uint8:
      return mlx_to_nd_array_impl<uint8_t, NDParams...>(a);
    case mx::uint16:
      return mlx_to_nd_array_impl<uint16_t, NDParams...>(a);
    case mx::uint32:
      return mlx_to_nd_array_impl<uint32_t, NDParams...>(a);
    case mx::uint64:
      return mlx_to_nd_array_impl<uint64_t, NDParams...>(a);

    // Signed integers
    case mx::int8:
      return mlx_to_nd_array_impl<int8_t, NDParams...>(a);
    case mx::int16:
      return mlx_to_nd_array_impl<int16_t, NDParams...>(a);
    case mx::int32:
      return mlx_to_nd_array_impl<int32_t, NDParams...>(a);
    case mx::int64:
      return mlx_to_nd_array_impl<int64_t, NDParams...>(a);

    // Floating point
    case mx::float16:
      return mlx_to_nd_array_impl<mx::float16_t, NDParams...>(a);
    case mx::float32:
      return mlx_to_nd_array_impl<float, NDParams...>(a);
    case mx::float64:
      return mlx_to_nd_array_impl<double, NDParams...>(a);

    // Complex
    case mx::complex64:
      return mlx_to_nd_array_impl<std::complex<float>, NDParams...>(a);

    // Rejected explicitly
    case mx::bfloat16:
      throw nb::type_error("bfloat16 arrays cannot be converted to NumPy.");

    default:
      throw nb::type_error("type cannot be converted to NumPy.");
  }
}

// Convert an mlx array to an nb::ndarray tagged for NumPy. Errors from
// mlx_to_nd_array (e.g. nb::type_error for bfloat16) propagate to the caller.
nb::ndarray<nb::numpy> mlx_to_np_array(const mx::array& a) {
  return mlx_to_nd_array<nb::numpy>(a);
}

// Convert an mlx array to a framework-agnostic nb::ndarray (no framework
// tag). Errors from mlx_to_nd_array propagate to the caller.
nb::ndarray<> mlx_to_dlpack(const mx::array& a) {
  return mlx_to_nd_array<>(a);
}

// Convert a single-element array to a Python scalar object.
//
// Throws std::invalid_argument unless the array has exactly one element, and
// nb::type_error for a dtype without a case below. float16 and bfloat16
// values are widened to float before conversion.
nb::object to_scalar(mx::array& a) {
  if (a.size() != 1) {
    throw std::invalid_argument(
        "[convert] Only length-1 arrays can be converted to Python scalars.");
  }

  {
    // Evaluate without holding the GIL.
    nb::gil_scoped_release nogil;
    a.eval();
  }

  switch (a.dtype()) {
    // Boolean
    case mx::bool_:
      return nb::cast(a.item<bool>());

    // Unsigned integers
    case mx::uint8:
      return nb::cast(a.item<uint8_t>());
    case mx::uint16:
      return nb::cast(a.item<uint16_t>());
    case mx::uint32:
      return nb::cast(a.item<uint32_t>());
    case mx::uint64:
      return nb::cast(a.item<uint64_t>());

    // Signed integers
    case mx::int8:
      return nb::cast(a.item<int8_t>());
    case mx::int16:
      return nb::cast(a.item<int16_t>());
    case mx::int32:
      return nb::cast(a.item<int32_t>());
    case mx::int64:
      return nb::cast(a.item<int64_t>());

    // Half-precision floats are widened to float first
    case mx::float16:
      return nb::cast(static_cast<float>(a.item<mx::float16_t>()));
    case mx::bfloat16:
      return nb::cast(static_cast<float>(a.item<mx::bfloat16_t>()));

    // Native floats
    case mx::float32:
      return nb::cast(a.item<float>());
    case mx::float64:
      return nb::cast(a.item<double>());

    // Complex
    case mx::complex64:
      return nb::cast(a.item<std::complex<float>>());

    default:
      throw nb::type_error("type cannot be converted to Python scalar.");
  }
}

// Recursively build a (nested) Python list from the elements of `a` along
// dimension `dim`, starting at element offset `index`. Elements are read as
// T and appended as U. The array's strides are used to step between
// entries.
template <typename T, typename U = T>
nb::list to_list(mx::array& a, size_t index, int dim) {
  nb::list result;
  const auto step = a.strides()[dim];
  const auto extent = a.shape(dim);
  // The last dimension holds scalars; every other dimension holds sub-lists.
  const bool innermost = (dim == a.ndim() - 1);

  for (int i = 0; i < extent; ++i, index += step) {
    if (innermost) {
      result.append(static_cast<U>(a.data<T>()[index]));
    } else {
      result.append(to_list<T, U>(a, index, dim + 1));
    }
  }
  return result;
}

// Convert an array to a Python object: a scalar for zero-dimensional arrays
// (via to_scalar), otherwise a nested list. float16 and bfloat16 elements are
// emitted as float. Throws nb::type_error for a dtype without a case below.
nb::object tolist(mx::array& a) {
  if (a.ndim() == 0) {
    return to_scalar(a);
  }

  {
    // Evaluate without holding the GIL.
    nb::gil_scoped_release nogil;
    a.eval();
  }

  switch (a.dtype()) {
    // Boolean
    case mx::bool_:
      return to_list<bool>(a, 0, 0);

    // Unsigned integers
    case mx::uint8:
      return to_list<uint8_t>(a, 0, 0);
    case mx::uint16:
      return to_list<uint16_t>(a, 0, 0);
    case mx::uint32:
      return to_list<uint32_t>(a, 0, 0);
    case mx::uint64:
      return to_list<uint64_t>(a, 0, 0);

    // Signed integers
    case mx::int8:
      return to_list<int8_t>(a, 0, 0);
    case mx::int16:
      return to_list<int16_t>(a, 0, 0);
    case mx::int32:
      return to_list<int32_t>(a, 0, 0);
    case mx::int64:
      return to_list<int64_t>(a, 0, 0);

    // Half-precision floats are emitted as float
    case mx::float16:
      return to_list<mx::float16_t, float>(a, 0, 0);
    case mx::bfloat16:
      return to_list<mx::bfloat16_t, float>(a, 0, 0);

    // Native floats
    case mx::float32:
      return to_list<float>(a, 0, 0);
    case mx::float64:
      return to_list<double>(a, 0, 0);

    // Complex
    case mx::complex64:
      return to_list<std::complex<float>>(a, 0, 0);

    default:
      throw nb::type_error("data type cannot be converted to Python list.");
  }
}

// Flatten a nested Python list/tuple into `vals`, visiting the elements
// depth-first in order. Leaves are converted with nb::cast<U>.
template <typename T, typename U>
void fill_vector(T list, std::vector<U>& vals) {
  for (auto l : list) {
    if (nb::isinstance<nb::list>(l)) {
      fill_vector(nb::cast<nb::list>(l), vals);
    } else if (nb::isinstance<nb::tuple>(l)) {
      // Test the current element, not the first one: deciding from
      // `*list.begin()` would try to cast a scalar to a tuple whenever the
      // first element happens to be a tuple, and rebuilt an iterator on
      // every step.
      fill_vector(nb::cast<nb::tuple>(l), vals);
    } else {
      vals.push_back(nb::cast<U>(l));
    }
  }
}

// Check that the nested list/tuple `list` matches `shape` from dimension
// `idx` onward and work out the widest Python scalar category it contains.
//
// `all_python_primitive_elements` is set to false as soon as an mx::array
// element is encountered (it is never set back to true here).
//
// Returns the widest PyScalarT found (pyfloat for an empty dimension).
// Throws std::invalid_argument on extra dimensions, non-uniform lengths or
// unsupported element types.
template <typename T>
PyScalarT validate_shape(
    T list,
    const mx::Shape& shape,
    int idx,
    bool& all_python_primitive_elements) {
  if (idx >= shape.size()) {
    throw std::invalid_argument("Initialization encountered extra dimension.");
  }
  auto s = shape[idx];
  if (nb::len(list) != s) {
    throw std::invalid_argument(
        "Initialization encountered non-uniform length.");
  }

  if (s == 0) {
    return pyfloat;
  }

  PyScalarT type = pybool;
  for (auto l : list) {
    PyScalarT t;
    if (nb::isinstance<nb::list>(l)) {
      t = validate_shape(
          nb::cast<nb::list>(l), shape, idx + 1, all_python_primitive_elements);
    } else if (nb::isinstance<nb::tuple>(l)) {
      // Test the current element rather than the first one. With the first
      // element as the criterion, a scalar following a tuple was cast to a
      // tuple (a cast failure instead of the error below), and a tuple
      // following a scalar was reported as an invalid element type.
      t = validate_shape(
          nb::cast<nb::tuple>(l),
          shape,
          idx + 1,
          all_python_primitive_elements);
    } else if (nb::isinstance<mx::array>(l)) {
      all_python_primitive_elements = false;
      auto arr = nb::cast<mx::array>(l);
      // The array must supply exactly the remaining dimensions of `shape`.
      if (arr.ndim() + idx + 1 == shape.size() &&
          std::equal(
              arr.shape().cbegin(),
              arr.shape().cend(),
              shape.cbegin() + idx + 1)) {
        t = pybool;
      } else {
        throw std::invalid_argument(
            "Initialization encountered non-uniform length.");
      }
    } else {
      if (nb::isinstance<nb::bool_>(l)) {
        t = pybool;
      } else if (nb::isinstance<nb::int_>(l)) {
        t = pyint;
      } else if (nb::isinstance<nb::float_>(l)) {
        t = pyfloat;
      } else if (PyComplex_Check(l.ptr())) {
        t = pycomplex;
      } else {
        std::ostringstream msg;
        msg << "Invalid type " << nb::type_name(l.type()).c_str()
            << " received in array initialization.";
        throw std::invalid_argument(msg.str());
      }

      // A scalar is only valid at the innermost dimension.
      if (idx + 1 != shape.size()) {
        throw std::invalid_argument(
            "Initialization encountered non-uniform length.");
      }
    }
    // Keep the widest category seen so far.
    type = std::max(type, t);
  }
  return type;
}

// Append to `shape` the dimensions implied by a nested list/tuple, following
// only the first element at each level. When the first element is an
// mx::array its dimensions are appended and the descent stops.
template <typename T>
void get_shape(T list, mx::Shape& shape) {
  shape.push_back(check_shape_dim(nb::len(list)));
  if (shape.back() <= 0) {
    // Nothing to descend into.
    return;
  }

  auto first = *list.begin();
  if (nb::isinstance<nb::list>(first)) {
    get_shape(nb::cast<nb::list>(first), shape);
  } else if (nb::isinstance<nb::tuple>(first)) {
    get_shape(nb::cast<nb::tuple>(first), shape);
  } else if (nb::isinstance<mx::array>(first)) {
    auto arr = nb::cast<mx::array>(first);
    for (auto extent : arr.shape()) {
      shape.push_back(extent);
    }
  }
}

// Build an array from a nested list/tuple made only of Python primitives.
//
// `inferred_type` is the scalar category found in the input; it selects the
// intermediate C++ element type the values are collected in.
// `specified_type`, when given, is the dtype of the result; otherwise the
// default for the category is used (bool_, int32, float32, complex64).
// Throws std::runtime_error for an unknown category.
template <typename T>
mx::array array_from_list_impl(
    T pl,
    const PyScalarT& inferred_type,
    std::optional<mx::Dtype> specified_type,
    const mx::Shape& shape) {
  // Flatten `pl` into a vector of the tag's type and build the array from
  // it. Only the type of `tag` is used.
  auto build = [&](auto tag, mx::Dtype out_dtype) {
    static_cast<void>(tag);
    std::vector<decltype(tag)> vals;
    fill_vector(pl, vals);
    return mx::array(vals.begin(), shape, out_dtype);
  };

  switch (inferred_type) {
    case pybool:
      return build(bool{}, specified_type.value_or(mx::bool_));

    case pyint: {
      // Choose an intermediate type wide enough for the requested dtype.
      auto dtype = specified_type.value_or(mx::int32);
      if (dtype == mx::int64) {
        return build(int64_t{}, dtype);
      }
      if (dtype == mx::uint64) {
        return build(uint64_t{}, dtype);
      }
      if (dtype == mx::uint32) {
        return build(uint32_t{}, dtype);
      }
      if (mx::issubdtype(dtype, mx::inexact)) {
        return build(float{}, dtype);
      }
      return build(int{}, dtype);
    }

    case pyfloat: {
      auto out_type = specified_type.value_or(mx::float32);
      if (out_type == mx::float64) {
        return build(double{}, out_type);
      }
      return build(float{}, out_type);
    }

    case pycomplex: {
      // Complex values go through a raw pointer to the complex64 storage.
      std::vector<std::complex<float>> vals;
      fill_vector(pl, vals);
      return mx::array(
          reinterpret_cast<mx::complex64_t*>(vals.data()),
          shape,
          specified_type.value_or(mx::complex64));
    }

    default: {
      std::ostringstream msg;
      msg << "Should not happen, inferred: " << inferred_type
          << " on subarray made of only python primitive types.";
      throw std::runtime_error(msg.str());
    }
  }
}

// Build an array from a Python list/tuple. The shape is inferred, then
// validated; inputs holding only Python primitives are flattened directly,
// while inputs that contain mlx arrays are converted element by element and
// stacked.
template <typename T>
mx::array array_from_list_impl(T pl, std::optional<mx::Dtype> dtype) {
  // Infer the shape from the first element at each nesting level.
  mx::Shape shape;
  get_shape(pl, shape);

  // Validate against that shape and find the scalar category.
  bool only_primitives = true;
  const auto scalar_kind = validate_shape(pl, shape, 0, only_primitives);

  if (!only_primitives) {
    // `pl` contains mlx arrays: convert each top-level entry and stack.
    std::vector<mx::array> parts;
    parts.reserve(nb::len(pl));
    for (auto entry : pl) {
      parts.push_back(create_array(nb::cast<ArrayInitType>(entry), dtype));
    }
    return mx::stack(parts);
  }

  // `pl` does not contain mlx arrays.
  return array_from_list_impl(pl, scalar_kind, dtype, shape);
}

// Build an array from a Python list; `dtype`, when given, is the dtype of
// the result. Forwards to the shared list/tuple implementation.
mx::array array_from_list(nb::list pl, std::optional<mx::Dtype> dtype) {
  return array_from_list_impl(pl, dtype);
}

// Build an array from a Python tuple; `dtype`, when given, is the dtype of
// the result. Forwards to the shared list/tuple implementation.
mx::array array_from_list(nb::tuple pl, std::optional<mx::Dtype> dtype) {
  return array_from_list_impl(pl, dtype);
}

// Create an array from any supported initializer. `t`, when given, is the
// dtype of the result; otherwise each kind of input has its own default:
//   bool -> bool_, int -> int32 (int64 when outside the `int` range),
//   float -> float32, complex -> complex64, arrays keep their dtype.
mx::array create_array(ArrayInitType v, std::optional<mx::Dtype> t) {
  if (auto* as_bool = std::get_if<nb::bool_>(&v)) {
    return mx::array(nb::cast<bool>(*as_bool), t.value_or(mx::bool_));
  }

  if (auto* as_int = std::get_if<nb::int_>(&v)) {
    auto val = nb::cast<int64_t>(*as_int);
    // Widen the default only when the value does not fit in an `int`.
    const bool outside_int_range = val > std::numeric_limits<int>::max() ||
        val < std::numeric_limits<int>::min();
    auto default_type = outside_int_range ? mx::int64 : mx::int32;
    return mx::array(val, t.value_or(default_type));
  }

  if (auto* as_float = std::get_if<nb::float_>(&v)) {
    auto out_type = t.value_or(mx::float32);
    // Read the value as double only when float64 was requested.
    if (out_type == mx::float64) {
      return mx::array(nb::cast<double>(*as_float), out_type);
    }
    return mx::array(nb::cast<float>(*as_float), out_type);
  }

  if (auto* as_complex = std::get_if<std::complex<float>>(&v)) {
    return mx::array(
        static_cast<mx::complex64_t>(*as_complex), t.value_or(mx::complex64));
  }

  if (auto* as_list = std::get_if<nb::list>(&v)) {
    return array_from_list(*as_list, t);
  }

  if (auto* as_tuple = std::get_if<nb::tuple>(&v)) {
    return array_from_list(*as_tuple, t);
  }

  if (auto* as_ndarray = std::get_if<
          nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu>>(&v)) {
    return nd_array_to_mlx(*as_ndarray, t);
  }

  if (auto* as_array = std::get_if<mx::array>(&v)) {
    return mx::astype(*as_array, t.value_or(as_array->dtype()));
  }

  // Remaining alternative: an arbitrary array-like Python object.
  auto arr = to_array_with_accessor(std::get<ArrayLike>(v).obj);
  return mx::astype(arr, t.value_or(arr.dtype()));
}


================================================
FILE: python/src/convert.h
================================================
// Copyright © 2024 Apple Inc.
#pragma once

#include <optional>

#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>

#include "mlx/array.h"
#include "mlx/ops.h"

namespace mx = mlx::core;
namespace nb = nanobind;

namespace nanobind {
// DLPack dtype descriptor for bfloat16: type code 4 (DLPack's bfloat code),
// 16 bits, 1 lane. Compared against ndarray dtypes when importing data.
static constexpr dlpack::dtype bfloat16{4, 16, 1};
}; // namespace nanobind

// Wrapper around an arbitrary Python object that is to be treated as an
// array source. It is the last alternative of ArrayInitType; create_array
// converts the wrapped object with to_array_with_accessor.
struct ArrayLike {
  // Intentionally not `explicit` -- presumably so a Python object can
  // convert implicitly; confirm before changing.
  ArrayLike(nb::object obj) : obj(obj) {};
  nb::object obj;
};

// Every kind of value create_array accepts as an initializer.
// The order of the alternatives is significant (see the notes below);
// presumably candidates are tried in declaration order when converting from
// Python -- confirm against nanobind's variant caster before reordering.
using ArrayInitType = std::variant<
    nb::bool_,
    nb::int_,
    nb::float_,
    // Must be above ndarray
    mx::array,
    // Must be above complex
    nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu>,
    std::complex<float>,
    nb::list,
    nb::tuple,
    ArrayLike>;

// Copy a read-only, C-contiguous CPU ndarray into an mlx array. `dtype`,
// when given, is the dtype of the result.
mx::array nd_array_to_mlx(
    nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu> nd_array,
    std::optional<mx::Dtype> dtype);

// Export an mlx array as a NumPy-tagged / untagged nb::ndarray. Both throw
// nb::type_error for dtypes that cannot be exported (e.g. bfloat16).
nb::ndarray<nb::numpy> mlx_to_np_array(const mx::array& a);
nb::ndarray<> mlx_to_dlpack(const mx::array& a);

// Convert a single-element array to a Python scalar.
nb::object to_scalar(mx::array& a);

// Convert an array to a (nested) Python list, or to a scalar when it has no
// dimensions.
nb::object tolist(mx::array& a);

// Create an array from any supported initializer / from a list or tuple.
// `t` / `dtype`, when given, is the dtype of the result.
mx::array create_array(ArrayInitType v, std::optional<mx::Dtype> t);
mx::array array_from_list(nb::list pl, std::optional<mx::Dtype> dtype);
mx::array array_from_list(nb::tuple pl, std::optional<mx::Dtype> dtype);


================================================
FILE: python/src/cuda.cpp
================================================
// Copyright © 2023-2025 Apple Inc.

#include <nanobind/nanobind.h>

#include "mlx/backend/cuda/cuda.h"

namespace mx = mlx::core;
namespace nb = nanobind;

// Register the `cuda` submodule on `m`. It exposes a single function,
// `is_available`, bound directly to mx::cu::is_available.
void init_cuda(nb::module_& m) {
  nb::module_ cuda = m.def_submodule("cuda", "mlx.cuda");

  cuda.def(
      "is_available",
      &mx::cu::is_available,
      R"pbdoc(
      Check if the CUDA back-end is available.
      )pbdoc");
}


================================================
FILE: python/src/device.cpp
================================================
// Copyright © 2023-2025 Apple Inc.

#include <sstream>

#include <nanobind/nanobind.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/variant.h>

#include "mlx/device.h"
#include "mlx/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Register the device API on module `m`: the `Device` class, the
// `DeviceType` enum, and the module-level functions `default_device`,
// `set_default_device`, `is_available`, `device_count` and `device_info`.
void init_device(nb::module_& m) {
  // The class object is created first; its methods are attached below, after
  // the enum has been registered.
  auto device_class = nb::class_<mx::Device>(
      m, "Device", R"pbdoc(A device to run operations on.)pbdoc");
  nb::enum_<mx::Device::DeviceType>(m, "DeviceType")
      .value("cpu", mx::Device::DeviceType::cpu)
      .value("gpu", mx::Device::DeviceType::gpu)
      .export_values()
      .def(
          "__eq__",
          // Anything that is neither a Device nor a DeviceType compares
          // unequal; otherwise `other` is converted to a Device and compared.
          [](const mx::Device::DeviceType& d, const nb::object& other) {
            if (!nb::isinstance<mx::Device>(other) &&
                !nb::isinstance<mx::Device::DeviceType>(other)) {
              return false;
            }
            return d == nb::cast<mx::Device>(other);
          });

  device_class
      .def(nb::init<mx::Device::DeviceType, int>(), "type"_a, "index"_a = 0)
      .def_ro("type", &mx::Device::type)
      .def(
          "__repr__",
          // Uses the stream output operator of mx::Device.
          [](const mx::Device& d) {
            std::ostringstream os;
            os << d;
            return os.str();
          })
      // Same comparison rule as DeviceType.__eq__ above.
      .def("__eq__", [](const mx::Device& d, const nb::object& other) {
        if (!nb::isinstance<mx::Device>(other) &&
            !nb::isinstance<mx::Device::DeviceType>(other)) {
          return false;
        }
        return d == nb::cast<mx::Device>(other);
      });

  // Allow a DeviceType to be used where a Device is expected.
  nb::implicitly_convertible<mx::Device::DeviceType, mx::Device>();

  m.def(
      "default_device",
      &mx::default_device,
      R"pbdoc(Get the default device.)pbdoc");
  m.def(
      "set_default_device",
      &mx::set_default_device,
      "device"_a,
      R"pbdoc(Set the default device.)pbdoc");
  m.def(
      "is_available",
      &mx::is_available,
      "device"_a,
      R"pbdoc(Check if a back-end is available for the given device.)pbdoc");
  m.def(
      "device_count",
      &mx::device_count,
      "device_type"_a,
      R"pbdoc(
      Get the number of available devices for the given device type.

      Args:
          device_type (DeviceType): The type of device to query (cpu or gpu).

      Returns:
          int: Number of devices.
      )pbdoc");
  m.def(
      "device_info",
      &mx::device_info,
      // NOTE(review): mx::default_device() is called here, while the binding
      // is being registered, so the default argument is whatever the default
      // device is at that moment -- confirm this is intended.
      nb::arg("d") = mx::default_device(),
      R"pbdoc(
      Get information about a device.

      Returns a dictionary with device properties. Available keys depend
      on the backend and device type. Common keys include ``device_name``,
      ``architecture``, and ``total_memory`` (or ``memory_size``).

      Args:
          d (Device): The device to query (defaults to the default device).

      Returns:
          dict: Device information.
      )pbdoc");
}


================================================
FILE: python/src/distributed.cpp
================================================
// Copyright  © 2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include "mlx/distributed/distributed.h"
#include "mlx/distributed/ops.h"
#include "python/src/small_vector.h"
#include "python/src/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

void init_distributed(nb::module_& parent_module) {
  auto m = parent_module.def_submodule(
      "distributed", "mlx.core.distributed: Communication operations");

  nb::class_<mx::distributed::Group>(
      m,
      "Group",
      R"pbcopy(
        An :class:`mlx.core.distributed.Group` represents a group of independent mlx
        processes that can communicate.
      )pbcopy")
      .def(
          "rank", &mx::distributed::Group::rank, "Get the rank of this process")
      .def("size", &mx::distributed::Group::size, "Get the size of the group")
      .def(
          "split",
          &mx::distributed::Group::split,
          "color"_a,
          "key"_a = -1,
          nb::sig("def split(self, color: int, key: int = -1) -> Group"),
          R"pbdoc(
            Split the group to subgroups based on the provided color.

            Processes that use the same color go to the same group. The ``key``
            argument defines the rank in the new group. The smaller the key the
            smaller the rank. If the key is negative then the rank in the
            current group is used.

            Args:
              color (int): A value to group processes into subgroups.
              key (int, optional): A key to optionally change the rank ordering
                of the processes.
          )pbdoc");

  m.def(
      "is_available",
      [](const std::string& backend) {
        return mx::distributed::is_available(backend);
      },
      "backend"_a = "any",
      nb::sig("def is_available(backend: str = 'any') -> bool"),
      R"pbdoc(
      Check if a communication backend is available.

      Note, this function returns whether MLX has the capability of
      instantiating that distributed backend not whether it is possible to
      create a communication group. For that purpose one should use
      ``init(strict=True)``.

      Args:
        backend (str, optional): The name of the backend to check for availability.
          It takes the same values as :func:`init()`. Default: ``"any"``.

      Returns:
        bool: Whether the distributed backend is available.
      )pbdoc");

  m.def(
      "init",
      &mx::distributed::init,
      "strict"_a = false,
      "backend"_a = "any",
      nb::sig("def init(strict: bool = False, backend: str = 'any') -> Group"),
      R"pbdoc(
        Initialize the communication backend and create the global communication group.

        Example:

          .. code:: python

            import mlx.core as mx

            group = mx.distributed.init(backend="ring")

        Args:
          strict (bool, optional): If set to False it returns a singleton group
            in case ``mx.distributed.is_available()`` returns False otherwise
            it throws a runtime error. Default: ``False``
          backend (str, optional): Which distributed backend to initialize.
            Possible values ``mpi``, ``ring``, ``nccl``, ``jaccl``, ``any``. If
            set to ``any`` all available backends are tried and the first one
            that succeeds becomes the global group which will be returned in
            subsequent calls. Default: ``any``

        Returns:
          Group: The group representing all the launched processes.
      )pbdoc");

  m.def(
      "all_sum",
      [](const ScalarOrArray& x,
         std::optional<mx::distributed::Group> group,
         mx::StreamOrDevice s) {
        return mx::distributed::all_sum(to_array(x), group, s);
      },
      "x"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def all_sum(x: array, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        All reduce sum.

        Sum the ``x`` arrays from all processes in the group.

        Args:
          x (array): Input array.
          group (Group): The group of processes that will participate in the
            reduction. If set to ``None`` the global group is used. Default:
            ``None``.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

        Returns:
          array: The sum of all ``x`` arrays.
      )pbdoc");
  m.def(
      "all_max",
      [](const ScalarOrArray& x,
         std::optional<mx::distributed::Group> group,
         mx::StreamOrDevice s) {
        return mx::distributed::all_max(to_array(x), group, s);
      },
      "x"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def all_max(x: array, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        All reduce max.

        Find the maximum of the ``x`` arrays from all processes in the group.

        Args:
          x (array): Input array.
          group (Group): The group of processes that will participate in the
            reduction. If set to ``None`` the global group is used. Default:
            ``None``.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

        Returns:
          array: The maximum of all ``x`` arrays.
      )pbdoc");
  m.def(
      "all_min",
      [](const ScalarOrArray& x,
         std::optional<mx::distributed::Group> group,
         mx::StreamOrDevice s) {
        return mx::distributed::all_min(to_array(x), group, s);
      },
      "x"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def all_min(x: array, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      All reduce min.

      Find the minimum of the ``x`` arrays from all processes in the group.

      Args:
        x (array): Input array.
        group (Group): The group of processes that will participate in the
          reduction. If set to ``None`` the global group is used. Default:
          ``None``.
        stream (Stream, optional): Stream or device. Defaults to ``None``
          in which case the default stream of the default device is used.

      Returns:
        array: The minimum of all ``x`` arrays.
    )pbdoc");
  m.def(
      "all_gather",
      [](const ScalarOrArray& x,
         std::optional<mx::distributed::Group> group,
         mx::StreamOrDevice s) {
        return mx::distributed::all_gather(to_array(x), group, s);
      },
      "x"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def all_gather(x: array, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Gather arrays from all processes.

        Gather the ``x`` arrays from all processes in the group and concatenate
        them along the first axis. The arrays should all have the same shape.

        Args:
          x (array): Input array.
          group (Group): The group of processes that will participate in the
            gather. If set to ``None`` the global group is used. Default:
            ``None``.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

        Returns:
          array: The concatenation of all ``x`` arrays.
      )pbdoc");

  m.def(
      "send",
      [](const ScalarOrArray& x,
         int dst,
         std::optional<mx::distributed::Group> group,
         mx::StreamOrDevice s) {
        return mx::distributed::send(to_array(x), dst, group, s);
      },
      "x"_a,
      "dst"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def send(x: array, dst: int, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Send an array from the current process to the process that has rank
        ``dst`` in the group.

        Args:
          x (array): Input array.
          dst (int): Rank of the destination process in the group.
          group (Group): The group of processes that will participate in the
            send. If set to ``None`` the global group is used. Default:
            ``None``.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

        Returns:
          array: An array identical to ``x`` which when evaluated the send is performed.
      )pbdoc");

  m.def(
      "recv",
      &mx::distributed::recv,
      "shape"_a,
      "dtype"_a,
      "src"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def recv(shape: Sequence[int], dtype: Dtype, src: int, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Recv an array with shape ``shape`` and dtype ``dtype`` from process
        with rank ``src``.

        Args:
          shape (Tuple[int]): The shape of the array we are receiving.
          dtype (Dtype): The data type of the array we are receiving.
          src (int): Rank of the source process in the group.
          group (Group): The group of processes that will participate in the
            recv. If set to ``None`` the global group is used. Default:
            ``None``.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

        Returns:
          array: The array that was received from ``src``.
      )pbdoc");

  m.def(
      "recv_like",
      [](const ScalarOrArray& x,
         int src,
         std::optional<mx::distributed::Group> group,
         mx::StreamOrDevice s) {
        return mx::distributed::recv_like(to_array(x), src, group, s);
      },
      "x"_a,
      "src"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def recv_like(x: array, src: int, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Recv an array with shape and type like ``x`` from process with rank
        ``src``.

        It is equivalent to calling ``mx.distributed.recv(x.shape, x.dtype, src)``.

        Args:
          x (array): An array defining the shape and dtype of the array we are
            receiving.
          src (int): Rank of the source process in the group.
          group (Group): The group of processes that will participate in the
            recv. If set to ``None`` the global group is used. Default:
            ``None``.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

        Returns:
          array: The array that was received from ``src``.
      )pbdoc");

  m.def(
      "sum_scatter",
      [](const ScalarOrArray& x,
         std::optional<mx::distributed::Group> group,
         mx::StreamOrDevice s) {
        return mx::distributed::sum_scatter(to_array(x), group, s);
      },
      "x"_a,
      nb::kw_only(),
      "group"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sum_scatter(x: array, *, group: Optional[Group] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Sum ``x`` across all processes in the group and shard the result along the first axis across ranks.
      ``x.shape[0]`` must be divisible by the group size.

      The result is equivalent to ``all_sum(x)[rank*chunk_size:(rank+1)*chunk_size]``, where ``chunk_size = x.shape[0] // group.size()`` and ``rank`` is the rank of this process in the group.
      Note: ``all_sum`` is mentioned only for illustration; the actual implementation does not perform ``all_sum`` and uses a single reduce-scatter collective instead.
      Currently supported only for the NCCL backend.

      Args:
        x (array): Input array.
        group (Group): The group of processes that will participate in the
          sum scatter. If set to ``None`` the global group is used. Default:
          ``None``.
        stream (Stream, optional): Stream or device. Defaults to ``None``
          in which case the default stream of the default device is used.
      Returns:
        array: The output array with shape ``[x.shape[0] // group.size(), *x.shape[1:]]``.
    )pbdoc");
}


================================================
FILE: python/src/export.cpp
================================================
// Copyright © 2024 Apple Inc.
#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/tuple.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include <fstream>

#include "mlx/array.h"
#include "mlx/export.h"
#include "mlx/graph_utils.h"
#include "python/src/small_vector.h"
#include "python/src/trees.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Convert the Python call arguments into (mx::Args, mx::Kwargs).
//
// Accepted call shapes:
//   - no positional arguments: `kwargs` must cast to keyword arrays;
//   - first positional is an array: `args` are positional arrays and
//     `kwargs` are keyword arrays;
//   - exactly one positional: it is either a sequence of arrays or a dict of
//     string keys to arrays, and `kwargs` must be empty;
//   - exactly two positionals: a sequence of arrays followed by a dict of
//     string keys to arrays, and `kwargs` must be empty.
//
// Anything else throws std::invalid_argument with a message starting with
// `prefix`.
std::pair<mx::Args, mx::Kwargs> validate_and_extract_inputs(
    const nb::args& args,
    const nb::kwargs& kwargs,
    const std::string& prefix) {
  // Every validation failure reports the same message.
  const auto require = [&prefix](bool ok) {
    if (ok) {
      return;
    }
    throw std::invalid_argument(
        prefix +
        " Inputs can either be a variable "
        "number of positional and keyword arrays or a single tuple "
        "and/or dictionary of arrays.");
  };

  mx::Args positional;
  mx::Kwargs keyword;
  const auto num_positional = args.size();

  if (num_positional == 0) {
    // Only keyword arrays were given.
    require(nb::try_cast(kwargs, keyword));
  } else if (nb::isinstance<mx::array>(args[0])) {
    // Plain `*args` / `**kwargs` call with arrays.
    require(nb::try_cast(args, positional));
    require(nb::try_cast(kwargs, keyword));
  } else {
    switch (num_positional) {
      case 1:
        // A single packed argument: a sequence of arrays, or failing that a
        // dict of arrays. Keyword arguments are not allowed in this form.
        require(kwargs.size() == 0);
        if (!nb::try_cast(args[0], positional)) {
          require(nb::try_cast(args[0], keyword));
        }
        break;
      case 2:
        // Packed (sequence, dict) pair. Keyword arguments are not allowed.
        require(kwargs.size() == 0);
        require(nb::try_cast(args[0], positional));
        require(nb::try_cast(args[1], keyword));
        break;
      default:
        require(false);
        break;
    }
  }
  return {positional, keyword};
}

// Forward declaration of the traverse function so that PyFunctionExporter can
// declare it as a friend; the definition follows the class.
int py_function_exporter_tp_traverse(
    PyObject* self,
    visitproc visit,
    void* arg);

// Wrapper around mx::FunctionExporter that is bound to Python as
// `FunctionExporter`. Besides the exporter it stores a handle to a Python
// object (`dep_`) which py_function_exporter_tp_traverse reports to the
// garbage collector.
class PyFunctionExporter {
 public:
  // Takes ownership of `exporter`. `dep` is stored as an nb::handle, i.e.
  // without taking a reference of its own.
  PyFunctionExporter(mx::FunctionExporter exporter, nb::handle dep)
      : exporter_(std::move(exporter)), dep_(dep) {}
  // Acquires the GIL for the duration of the destructor body.
  // NOTE(review): `gil` is a local of the body, so it is released when the
  // body ends, which is before the members (including `exporter_`) are
  // destroyed. Confirm this is the intended scope.
  ~PyFunctionExporter() {
    nb::gil_scoped_acquire gil;
  }
  // Move-construct only: copying and all assignment are disabled.
  PyFunctionExporter(const PyFunctionExporter&) = delete;
  PyFunctionExporter& operator=(const PyFunctionExporter&) = delete;
  PyFunctionExporter& operator=(const PyFunctionExporter&&) = delete;
  PyFunctionExporter(PyFunctionExporter&& other)
      : exporter_(std::move(other.exporter_)), dep_(std::move(other.dep_)) {}

  // Forwards to mx::FunctionExporter::close().
  void close() {
    exporter_.close();
  }
  // Forwards the example inputs to the wrapped exporter.
  void operator()(const mx::Args& args, const mx::Kwargs& kwargs) {
    exporter_(args, kwargs);
  }

  // The traverse function needs access to `dep_`.
  friend int py_function_exporter_tp_traverse(PyObject*, visitproc, void*);

 private:
  mx::FunctionExporter exporter_;
  // Python object visited during GC traversal.
  nb::handle dep_;
};

// Py_tp_traverse implementation for PyFunctionExporter instances: visits the
// instance's type object and, once the C++ instance is ready, the Python
// object stored in `dep_`.
//
// The parameter names `visit` and `arg` are required by the Py_VISIT macro,
// which also returns early from this function on a non-zero visit result.
int py_function_exporter_tp_traverse(
    PyObject* self,
    visitproc visit,
    void* arg) {
  Py_VISIT(Py_TYPE(self));
  // Nothing more to visit if nanobind reports the instance as not ready.
  if (!nb::inst_ready(self)) {
    return 0;
  }
  auto* p = nb::inst_ptr<PyFunctionExporter>(self);
  Py_VISIT(p->dep_.ptr());
  return 0;
}

// Extra type slots passed to nb::type_slots when binding PyFunctionExporter:
// installs the traverse function above. The {0, 0} entry terminates the list.
PyType_Slot py_function_exporter_slots[] = {
    {Py_tp_traverse, (void*)py_function_exporter_tp_traverse},
    {0, 0}};

// Adapt a Python callable into a C++ callable taking (mx::Args, mx::Kwargs)
// and returning a vector of arrays.
//
// The callable is invoked as `fun(*args, **kwargs)`. Its result must be either
// a single array or something that casts to a vector of arrays; otherwise
// std::invalid_argument is thrown.
auto wrap_export_function(nb::callable fun) {
  return [fun = std::move(fun)](
             const mx::Args& args_, const mx::Kwargs& kwargs_) {
    // Build the Python-side keyword dict and positional tuple.
    auto py_kwargs = nb::dict();
    py_kwargs.update(nb::cast(kwargs_));
    auto py_args = nb::tuple(nb::cast(args_));

    auto result = fun(*py_args, **py_kwargs);

    std::vector<mx::array> arrays;
    if (nb::isinstance<mx::array>(result)) {
      // Single array output is wrapped into a one-element vector.
      arrays.push_back(nb::cast<mx::array>(result));
      return arrays;
    }
    if (nb::try_cast(result, arrays)) {
      return arrays;
    }
    throw std::invalid_argument(
        "[export_function] Outputs can be either a single array "
        "a tuple or list of arrays.");
  };
}

// Register the export/import bindings on module `m`:
//   - export_function: export a single trace to a file or a callback
//   - import_function: load an exported function from a file
//   - FunctionExporter / exporter: export several traces of one function
//   - export_to_dot: dump a graph in DOT format
void init_export(nb::module_& m) {
  m.def(
      "export_function",
      [](nb::object& file_or_callback,
         const nb::callable& fun,
         const nb::args& args,
         bool shapeless,
         const nb::kwargs& kwargs) {
        auto [args_, kwargs_] =
            validate_and_extract_inputs(args, kwargs, "[export_function]");
        if (nb::isinstance<nb::str>(file_or_callback)) {
          // A string is treated as the destination file path.
          mx::export_function(
              nb::cast<std::string>(file_or_callback),
              wrap_export_function(fun),
              args_,
              kwargs_,
              shapeless);
        } else {
          // Anything else must be a Python callable; adapt it to the C++
          // callback signature and hand the adapter to the exporter.
          auto callback = nb::cast<nb::callable>(file_or_callback);
          auto wrapped_callback =
              [callback](const mx::ExportCallbackInput& input) {
                return callback(input);
              };
          mx::export_function(
              wrapped_callback,
              wrap_export_function(fun),
              args_,
              kwargs_,
              shapeless);
        }
      },
      nb::arg(),
      "fun"_a,
      "args"_a,
      nb::kw_only(),
      "shapeless"_a = false,
      "kwargs"_a,
      nb::sig(
          "def export_function(file_or_callback: Union[str, Callable], fun: Callable, *args, shapeless: bool = False, **kwargs) -> None"),
      R"pbdoc(
        Export an MLX function.

        Example input arrays must be provided to export a function. The example
        inputs can be variable ``*args`` and ``**kwargs`` or a tuple of arrays
        and/or dictionary of string keys with array values.

        .. warning::

          This is part of an experimental API which is likely to
          change in future versions of MLX. Functions exported with older
          versions of MLX may not be compatible with future versions.

        Args:
            file_or_callback (str or Callable): Either a file path to export
              the function to or a callback.
            fun (Callable): A function which takes as input zero or more
              :class:`array` and returns one or more :class:`array`.
            *args (array): Example array inputs to the function.
            shapeless (bool, optional): Whether or not the function allows
              inputs with variable shapes. Default: ``False``.
            **kwargs (array): Additional example keyword array inputs to the
              function.

        Example:

          .. code-block:: python

            def fun(x, y):
                return x + y

            x = mx.array(1)
            y = mx.array([1, 2, 3])
            mx.export_function("fun.mlxfn", fun, x, y=y)
      )pbdoc");
  m.def(
      "import_function",
      [](const std::string& file) {
        // The imported function is captured by the returned Python callable,
        // which always returns its outputs as a tuple.
        return nb::cpp_function(
            [fn = mx::import_function(file)](
                const nb::args& args, const nb::kwargs& kwargs) {
              auto [args_, kwargs_] = validate_and_extract_inputs(
                  args, kwargs, "[import_function::call]");
              return nb::tuple(nb::cast(fn(args_, kwargs_)));
            });
      },
      "file"_a,
      nb::sig("def import_function(file: str) -> Callable"),
      R"pbdoc(
        Import a function from a file.

        The imported function can be called either with ``*args`` and
        ``**kwargs`` or with a tuple of arrays and/or dictionary of string
        keys with array values. Imported functions always return a tuple of
        arrays.

        .. warning::

          This is part of an experimental API which is likely to
          change in future versions of MLX. Functions exported with older
          versions of MLX may not be compatible with future versions.

        Args:
            file (str): The file path to import the function from.

        Returns:
            Callable: The imported function.

        Example:
          >>> fn = mx.import_function("function.mlxfn")
          >>> out = fn(a, b, x=x, y=y)[0]
          >>>
          >>> out = fn((a, b), {"x": x, "y": y})[0]
      )pbdoc");

  nb::class_<PyFunctionExporter>(
      m,
      "FunctionExporter",
      nb::type_slots(py_function_exporter_slots),
      R"pbdoc(
       A context managing class for exporting multiple traces of the same
       function to a file.

       Make an instance of this class by calling :func:`mx.exporter`.
      )pbdoc")
      .def("close", &PyFunctionExporter::close)
      .def("__enter__", [](PyFunctionExporter& exporter) { return &exporter; })
      .def(
          "__exit__",
          // Leaving the `with` block closes the exporter regardless of
          // whether an exception is being propagated.
          [](PyFunctionExporter& exporter,
             const std::optional<nb::object>&,
             const std::optional<nb::object>&,
             const std::optional<nb::object>&) { exporter.close(); },
          "exc_type"_a = nb::none(),
          "exc_value"_a = nb::none(),
          "traceback"_a = nb::none())
      .def(
          "__call__",
          [](PyFunctionExporter& exporter,
             const nb::args& args,
             const nb::kwargs& kwargs) {
            auto [args_, kwargs_] =
                validate_and_extract_inputs(args, kwargs, "[export_function]");
            exporter(args_, kwargs_);
          });

  m.def(
      "exporter",
      [](const std::string& file, nb::callable fun, bool shapeless) {
        // `fun` is also passed as the dependency tracked by the wrapper.
        return PyFunctionExporter{
            mx::exporter(file, wrap_export_function(fun), shapeless), fun};
      },
      "file"_a,
      "fun"_a,
      nb::kw_only(),
      "shapeless"_a = false,
      R"pbdoc(
        Make a callable object to export multiple traces of a function to a file.

        .. warning::

          This is part of an experimental API which is likely to
          change in future versions of MLX. Functions exported with older
          versions of MLX may not be compatible with future versions.

        Args:
            file (str): File path to export the function to.
            fun (Callable): A function which takes as input zero or more
              :class:`array` and returns one or more :class:`array`.
            shapeless (bool, optional): Whether or not the function allows
              inputs with variable shapes. Default: ``False``.

        Example:

          .. code-block:: python

            def fun(*args):
                return sum(args)

            with mx.exporter("fun.mlxfn", fun) as exporter:
                exporter(mx.array(1))
                exporter(mx.array(1), mx.array(2))
                exporter(mx.array(1), mx.array(2), mx.array(3))
      )pbdoc");
  m.def(
      "export_to_dot",
      [](nb::object file, const nb::args& args, const nb::kwargs& kwargs) {
        // Arrays found in both the positional and keyword arguments are
        // exported; keyword names are additionally used as node names.
        std::vector<mx::array> arrays =
            tree_flatten(nb::make_tuple(args, kwargs));
        mx::NodeNamer namer;
        for (const auto& n : kwargs) {
          namer.set_name(
              nb::cast<mx::array>(n.second), nb::cast<std::string>(n.first));
        }
        if (nb::isinstance<nb::str>(file)) {
          // String: write directly to the named file.
          std::ofstream out(nb::cast<std::string>(file));
          mx::export_to_dot(out, std::move(namer), arrays);
        } else if (nb::hasattr(file, "write")) {
          // File-like object: render to a string, then call its `write`.
          std::ostringstream out;
          mx::export_to_dot(out, std::move(namer), arrays);
          auto write = file.attr("write");
          write(out.str());
        } else {
          throw std::invalid_argument(
              "[export_to_dot] Accepts file-like objects or strings "
              "to be used as filenames.");
        }
      },
      "file"_a,
      "args"_a,
      "kwargs"_a,
      R"pbdoc(
        Export a graph to DOT format for visualization.

        A variable number of output arrays can be provided for exporting.
        The graph exported will recursively include all unevaluated inputs of
        the provided outputs.

        Args:
            file (str or file-like): The file path to export to, or an object
              with a ``write`` method.
            *args (array): The output arrays.
            **kwargs (dict[str, array]): Provide some names for arrays in the
              graph to make the result easier to parse.

        Example:
          >>> a = mx.array(1) + mx.array(2)
          >>> mx.export_to_dot("graph.dot", a)
          >>> x = mx.array(1)
          >>> y = mx.array(2)
          >>> mx.export_to_dot("graph.dot", x + y, x=x, y=y)
      )pbdoc");
}


================================================
FILE: python/src/fast.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/tuple.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include "mlx/fast.h"
#include "mlx/ops.h"
#include "python/src/small_vector.h"
#include "python/src/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

namespace {

// Callable adapter exposed to Python for a custom kernel.
//
// Converts the Python-side inputs and template arguments into the types
// expected by mx::fast::CustomKernelFunction and forwards the call. `tag` is
// prepended to the error message raised for invalid template arguments.
struct PyCustomKernelFunction {
  PyCustomKernelFunction(mx::fast::CustomKernelFunction kernel, const char* tag)
      : kernel_(std::move(kernel)), tag_(tag) {}

  // Run the kernel. Throws std::invalid_argument if a template argument is
  // not a bool, an int or an mx::Dtype.
  std::vector<mx::array> operator()(
      const std::vector<ScalarOrArray>& inputs_,
      const std::vector<mx::Shape>& output_shapes,
      const std::vector<mx::Dtype>& output_dtypes,
      std::tuple<int, int, int> grid,
      std::tuple<int, int, int> threadgroup,
      const std::optional<std::vector<std::pair<std::string, nb::object>>>&
          template_args_ = std::nullopt,
      std::optional<float> init_value = std::nullopt,
      bool verbose = false,
      mx::StreamOrDevice s = {}) const {
    // Scalars and arrays alike are turned into arrays.
    std::vector<mx::array> arrays;
    for (const auto& item : inputs_) {
      arrays.push_back(to_array(item, std::nullopt));
    }

    std::vector<std::pair<std::string, mx::fast::TemplateArg>> resolved;
    if (template_args_.has_value()) {
      for (const auto& entry : *template_args_) {
        const std::string& key = entry.first;
        const nb::object& obj = entry.second;

        // The checks run in the order bool, int, Dtype.
        if (nb::isinstance<bool>(obj)) {
          resolved.emplace_back(key, nb::cast<bool>(obj));
          continue;
        }
        if (nb::isinstance<int>(obj)) {
          resolved.emplace_back(key, nb::cast<int>(obj));
          continue;
        }
        if (nb::isinstance<mx::Dtype>(obj)) {
          resolved.emplace_back(key, nb::cast<mx::Dtype>(obj));
          continue;
        }

        std::ostringstream err;
        err << tag_
            << " Invalid template argument. Must be `mlx.core.Dtype`, `int` or `bool`.";
        throw std::invalid_argument(err.str());
      }
    }

    return kernel_(
        arrays,
        output_shapes,
        output_dtypes,
        grid,
        threadgroup,
        resolved,
        init_value,
        verbose,
        s);
  }

  mx::fast::CustomKernelFunction kernel_;
  const char* tag_;
};

} // namespace

void init_fast(nb::module_& parent_module) {
  auto m =
      parent_module.def_submodule("fast", "mlx.core.fast: fast operations");

  m.def(
      "rms_norm",
      &mx::fast::rms_norm,
      "x"_a,
      "weight"_a.none(),
      "eps"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def rms_norm(x: array, weight: Optional[array], eps: float, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Root Mean Square normalization (RMS norm).

        The normalization is with respect to the last axis of the input ``x``.

        Args:
            x (array): Input array.
            weight (array, optional): A multiplicative weight to scale the result by.
              The ``weight`` should be one-dimensional with the same size
              as the last axis of ``x``. If set to ``None`` then no scaling happens.
            eps (float): A small additive constant for numerical stability.

        Returns:
            array: The output array.
      )pbdoc");

  m.def(
      "layer_norm",
      &mx::fast::layer_norm,
      "x"_a,
      "weight"_a.none(),
      "bias"_a.none(),
      "eps"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def layer_norm(x: array, weight: Optional[array], bias: Optional[array], eps: float, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Layer normalization.

        The normalization is with respect to the last axis of the input ``x``.

        Args:
            x (array): Input array.
            weight (array, optional): A multiplicative weight to scale the result by.
              The ``weight`` should be one-dimensional with the same size
              as the last axis of ``x``. If set to ``None`` then no scaling happens.
            bias (array, optional): An additive offset to be added to the result.
              The ``bias`` should be one-dimensional with the same size
              as the last axis of ``x``. If set to ``None`` then no translation happens.
            eps (float): A small additive constant for numerical stability.

        Returns:
            array: The output array.
      )pbdoc");

  m.def(
      "rope",
      [](const mx::array& a,
         int dims,
         bool traditional,
         std::optional<float> base,
         float scale,
         const ScalarOrArray& offset,
         const std::optional<mx::array>& freqs /* = std::nullopt */,
         mx::StreamOrDevice s /* = {} */) {
        return mx::fast::rope(
            a, dims, traditional, base, scale, to_array(offset), freqs, s);
      },
      "a"_a,
      "dims"_a,
      nb::kw_only(),
      "traditional"_a,
      "base"_a.none(),
      "scale"_a,
      "offset"_a,
      "freqs"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def rope(a: array, dims: int, *, traditional: bool, base: Optional[float], scale: float, offset: Union[int, array], freqs: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Apply rotary positional encoding to the input.

        The input is expected to be at least 3D with shape ``(B, *, T, D)`` where:
          * ``B`` is the batch size.
          * ``T`` is the sequence length.
          * ``D`` is the feature dimension.

        Args:
            a (array): The input array.
            dims (int): The feature dimensions to be rotated. If the input feature
              is larger than dims then the rest is left unchanged.
            traditional (bool): If set to ``True`` choose the traditional
              implementation which rotates consecutive dimensions.
            base (float, optional): The base used to compute angular frequency for
              each dimension in the positional encodings. Exactly one of ``base`` and
              ``freqs`` must be ``None``.
            scale (float): The scale used to scale the positions.
            offset (int or array): The position offset to start at. If an
              :obj:`array` is given it can be a scalar or vector of ``B``
              offsets for each example in the batch.
            freqs (array, optional): Optional frequencies to use with RoPE.
              If set, the ``base`` parameter must be ``None``. Default: ``None``.

        Returns:
            array: The output array.
      )pbdoc");

  m.def(
      "scaled_dot_product_attention",
      [](const mx::array& queries,
         const mx::array& keys,
         const mx::array& values,
         const float scale,
         const std::variant<std::monostate, std::string, mx::array>& mask,
         const std::optional<mx::array>& sinks,
         mx::StreamOrDevice s) {
        bool has_mask = !std::holds_alternative<std::monostate>(mask);
        bool has_str_mask =
            has_mask && std::holds_alternative<std::string>(mask);
        bool has_arr_mask = has_mask && std::holds_alternative<mx::array>(mask);

        if (has_mask) {
          if (has_str_mask) {
            auto mask_str = std::get<std::string>(mask);
            if (mask_str != "causal") {
              std::ostringstream msg;
              msg << "[scaled_dot_product_attention] invalid mask option '"
                  << mask_str << "'. Must be 'causal', or an array.";
              throw std::invalid_argument(msg.str());
            }
            return mx::fast::scaled_dot_product_attention(
                queries, keys, values, scale, mask_str, std::nullopt, sinks, s);
          } else {
            auto mask_arr = std::get<mx::array>(mask);
            return mx::fast::scaled_dot_product_attention(
                queries, keys, values, scale, "", mask_arr, sinks, s);
          }

        } else {
          return mx::fast::scaled_dot_product_attention(
              queries, keys, values, scale, "", {}, sinks, s);
        }
      },
      "q"_a,
      "k"_a,
      "v"_a,
      nb::kw_only(),
      "scale"_a,
      "mask"_a = nb::none(),
      "sinks"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def scaled_dot_product_attention(q: array, k: array, v: array, *, scale: float,  mask: Union[None, str, array] = None, sinks: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        A fast implementation of multi-head attention: ``O = softmax(Q @ K.T, dim=-1) @ V``.

        Supports:

        * `Multi-Head Attention <https://arxiv.org/abs/1706.03762>`_
        * `Grouped Query Attention <https://arxiv.org/abs/2305.13245>`_
        * `Multi-Query Attention <https://arxiv.org/abs/1911.02150>`_

        .. note::

          * The softmax operation is performed in ``float32`` regardless of
            the input precision.
          * For Grouped Query Attention and Multi-Query Attention, the ``k``
            and ``v`` inputs should not be pre-tiled to match ``q``.

        In the following the dimensions are given by:

        * ``B``: The batch size.
        * ``N_q``: The number of query heads.
        * ``N_kv``: The number of key and value heads.
        * ``T_q``: The number of queries per example.
        * ``T_kv``: The number of keys and values per example.
        * ``D``: The per-head dimension.

        Args:
            q (array): Queries with shape ``[B, N_q, T_q, D]``.
            k (array): Keys with shape ``[B, N_kv, T_kv, D]``.
            v (array): Values with shape ``[B, N_kv, T_kv, D]``.
            scale (float): Scale for queries (typically ``1.0 / sqrt(q.shape(-1)``).
            mask (str or array, optional): The mask to apply to the
               query-key scores. The mask can be an array or a string indicating
               the mask type. The only supported string type is ``"causal"``. If
               the mask is an array it can be a boolean or additive mask. The mask
               can have at most 4 dimensions and must be broadcast-compatible with
               the shape ``[B, N, T_q, T_kv]``. If an additive mask is given its
               type must promote to the promoted type of ``q``, ``k``, and ``v``.
               The ``"causal"`` mask uses lower-right alignment where the
               last query aligns with the last key.
            sinks (array, optional): An optional array of attention sinks.
               Default: ``None``.

        Returns:
            array: The output array.

        Example:

          .. code-block:: python

            B = 2
            N_q = N_kv = 32
            T_q = T_kv = 1000
            D = 128

            q = mx.random.normal(shape=(B, N_q, T_q, D))
            k = mx.random.normal(shape=(B, N_kv, T_kv, D))
            v = mx.random.normal(shape=(B, N_kv, T_kv, D))
            scale = D ** -0.5
            out = mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask="causal")
      )pbdoc");

  m.def(
      "metal_kernel",
      [](const std::string& name,
         const std::vector<std::string>& input_names,
         const std::vector<std::string>& output_names,
         const std::string& source,
         const std::string& header,
         bool ensure_row_contiguous,
         bool atomic_outputs) {
        auto kernel = mx::fast::metal_kernel(
            name,
            input_names,
            output_names,
            source,
            header,
            ensure_row_contiguous,
            atomic_outputs);
        return nb::cpp_function(
            PyCustomKernelFunction(std::move(kernel), "[metal_kernel]"),
            nb::kw_only(),
            "inputs"_a,
            "output_shapes"_a,
            "output_dtypes"_a,
            "grid"_a,
            "threadgroup"_a,
            "template"_a = nb::none(),
            "init_value"_a = nb::none(),
            "verbose"_a = false,
            "stream"_a = nb::none(),
            nb::sig(
                "def __call__(self, *, inputs: List[Union[scalar, array]], output_shapes: List[Sequence[int]], output_dtypes: List[Dtype], grid: tuple[int, int, int], threadgroup: tuple[int, int, int], template: Optional[List[Tuple[str, Union[bool, int, Dtype]]]] = None, init_value: Optional[float] = None, verbose: bool = false, stream: Union[None, Stream, Device] = None)"),
            R"pbdoc(
            Run the kernel.

            Args:
              inputs (List[array]): The inputs passed to the Metal kernel.
              output_shapes (List[Sequence[int]]): The list of shapes for each output in ``output_names``.
              output_dtypes (List[Dtype]): The list of data types for each output in ``output_names``.
              grid (tuple[int, int, int]): 3-tuple specifying the grid to launch the kernel with.
                This will be passed to ``MTLComputeCommandEncoder::dispatchThreads``.
              threadgroup (tuple[int, int, int]): 3-tuple specifying the threadgroup size to use.
                This will be passed to ``MTLComputeCommandEncoder::dispatchThreads``.
              template (List[Tuple[str, Union[bool, int, Dtype]]], optional): Template arguments.
                  These will be added as template arguments to the kernel definition. Default: ``None``.
              init_value (float, optional): Optional value to use to initialize all of the output arrays.
                  By default, output arrays are uninitialized. Default: ``None``.
              verbose (bool, optional): Whether to print the full generated source code of the kernel
                  when it is run. Default: ``False``.
              stream (mx.stream, optional): Stream to run the kernel on. Default: ``None``.

            Returns:
              List[array]: The list of output arrays.)pbdoc");
      },
      "name"_a,
      "input_names"_a,
      "output_names"_a,
      "source"_a,
      "header"_a = "",
      "ensure_row_contiguous"_a = true,
      "atomic_outputs"_a = false,
      R"pbdoc(
      A jit-compiled custom Metal kernel defined from a source string.

      Full documentation: :ref:`custom_metal_kernels`.

      Args:
        name (str): Name for the kernel.
        input_names (List[str]): The parameter names of the inputs in the
           function signature.
        output_names (List[str]): The parameter names of the outputs in the
           function signature.
        source (str): Source code. This is the body of a function in Metal,
           the function signature will be automatically generated.
        header (str): Header source code to include before the main function.
           Useful for helper functions or includes that should live outside of
           the main function body.
        ensure_row_contiguous (bool): Whether to ensure the inputs are row contiguous
           before the kernel runs. Default: ``True``.
        atomic_outputs (bool): Whether to use atomic outputs in the function signature
           e.g. ``device atomic<float>``. Default: ``False``.

      Returns:
        Callable ``metal_kernel``.

      Example:

        .. code-block:: python

          def exp_elementwise(a: mx.array):
              source = '''
                  uint elem = thread_position_in_grid.x;
                  T tmp = inp[elem];
                  out[elem] = metal::exp(tmp);
              '''

              kernel = mx.fast.metal_kernel(
                  name="myexp",
                  input_names=["inp"],
                  output_names=["out"],
                  source=source
              )
              outputs = kernel(
                  inputs=[a],
                  template=[("T", mx.float32)],
                  grid=(a.size, 1, 1),
                  threadgroup=(256, 1, 1),
                  output_shapes=[a.shape],
                  output_dtypes=[a.dtype],
                  verbose=True,
              )
              return outputs[0]

          a = mx.random.normal(shape=(4, 16)).astype(mx.float16)
          b = exp_elementwise(a)
          assert mx.allclose(b, mx.exp(a))
     )pbdoc");

  m.def(
      "cuda_kernel",
      [](const std::string& name,
         const std::vector<std::string>& input_names,
         const std::vector<std::string>& output_names,
         const std::string& source,
         const std::string& header,
         bool ensure_row_contiguous,
         int shared_mem) {
        auto kernel = mx::fast::cuda_kernel(
            name,
            input_names,
            output_names,
            source,
            header,
            ensure_row_contiguous,
            shared_mem);
        return nb::cpp_function(
            PyCustomKernelFunction(std::move(kernel), "[cuda_kernel]"),
            nb::kw_only(),
            "inputs"_a,
            "output_shapes"_a,
            "output_dtypes"_a,
            "grid"_a,
            "threadgroup"_a,
            "template"_a = nb::none(),
            "init_value"_a = nb::none(),
            "verbose"_a = false,
            "stream"_a = nb::none(),
            nb::sig(
                "def __call__(self, *, inputs: List[Union[scalar, array]], output_shapes: List[Sequence[int]], output_dtypes: List[Dtype], grid: tuple[int, int, int], threadgroup: tuple[int, int, int], template: Optional[List[Tuple[str, Union[bool, int, Dtype]]]] = None, init_value: Optional[float] = None, verbose: bool = false, stream: Union[None, Stream, Device] = None)"),
            R"pbdoc(
            Run the kernel.

            Args:
              inputs (List[array]): The inputs passed to the CUDA kernel.
              output_shapes (List[Sequence[int]]): The list of shapes for each output in ``output_names``.
              output_dtypes (List[Dtype]): The list of data types for each output in ``output_names``.
              grid (tuple[int, int, int]): 3-tuple specifying the grid to launch the kernel with.
                For compatibility with :func:`metal_kernel` the grid is in threads and not in threadgroups.
              threadgroup (tuple[int, int, int]): 3-tuple specifying the threadgroup size to use.
              template (List[Tuple[str, Union[bool, int, Dtype]]], optional): Template arguments.
                  These will be added as template arguments to the kernel definition. Default: ``None``.
              init_value (float, optional): Optional value to use to initialize all of the output arrays.
                  By default, output arrays are uninitialized. Default: ``None``.
              verbose (bool, optional): Whether to print the full generated source code of the kernel
                  when it is run. Default: ``False``.
              stream (mx.stream, optional): Stream to run the kernel on. Default: ``None``.

            Returns:
              List[array]: The list of output arrays.)pbdoc");
      },
      "name"_a,
      "input_names"_a,
      "output_names"_a,
      "source"_a,
      "header"_a = "",
      "ensure_row_contiguous"_a = true,
      "shared_memory"_a = 0,
      R"pbdoc(
      A jit-compiled custom CUDA kernel defined from a source string.

      This is the CUDA equivalent of :ref:`custom_metal_kernels`.

      Args:
        name (str): Name for the kernel.
        input_names (List[str]): The parameter names of the inputs in the
           function signature.
        output_names (List[str]): The parameter names of the outputs in the
           function signature.
        source (str): Source code. This is the body of a function in CUDA,
           the function signature will be automatically generated.
        header (str): Header source code to include before the main function.
           Useful for helper functions or includes that should live outside of
           the main function body.
        ensure_row_contiguous (bool): Whether to ensure the inputs are row contiguous
           before the kernel runs. Default: ``True``.
        shared_memory (int): The dynamic shared memory to request for the
          kernel. A value of 0 means no dynamic shared memory. Default: ``0``.

      Returns:
        Callable ``cuda_kernel``.

      Example:

        .. code-block:: python

          def exp_elementwise(a: mx.array):
              source = '''
                  auto elem = cooperative_groups::this_grid().thread_rank();
                  T tmp = inp[elem];
                  out[elem] = exp(tmp);
              '''

              kernel = mx.fast.cuda_kernel(
                  name="myexp",
                  input_names=["inp"],
                  output_names=["out"],
                  source=source
              )
              outputs = kernel(
                  inputs=[a],
                  template=[("T", mx.float32)],
                  grid=(a.size, 1, 1),
                  threadgroup=(256, 1, 1),
                  output_shapes=[a.shape],
                  output_dtypes=[a.dtype],
                  verbose=True,
              )
              return outputs[0]

          a = mx.random.normal(shape=(16, 16)).astype(mx.float16)
          b = exp_elementwise(a)
          assert mx.allclose(b, mx.exp(a))
     )pbdoc");

  m.def(
      "precompiled_cuda_kernel",
      [](const std::string& name,
         const nb::bytes compiled_source,
         const std::vector<ScalarOrArray>& inputs_,
         const std::vector<mx::Shape>& output_shapes,
         const std::vector<mx::Dtype>& output_dtypes,
         const std::vector<nb::object>& scalars_,
         std::tuple<int, int, int> grid,
         std::tuple<int, int, int> threadgroup,
         int shared_memory,
         std::optional<float> init_value = std::nullopt,
         bool ensure_row_contiguous = false,
         mx::StreamOrDevice s = {}) {
        // Collect the inputs and cast them to array
        std::vector<mx::array> inputs;
        for (const auto& value : inputs_) {
          inputs.push_back(to_array(value, std::nullopt));
        }

        // Collect the scalar inputs
        std::vector<mx::fast::ScalarArg> scalars;
        scalars.reserve(scalars_.size());
        for (const auto& v : scalars_) {
          if (nb::isinstance<bool>(v)) {
            scalars.push_back(nb::cast<bool>(v));
          } else if (nb::isinstance<int>(v)) {
            scalars.push_back(nb::cast<int>(v));
          } else if (nb::isinstance<float>(v)) {
            scalars.push_back(nb::cast<float>(v));
          } else {
            nb::object vtype = v.attr("__class__");
            std::string vtype_name =
                nb::cast<std::string>(vtype.attr("__name__"));
            std::ostringstream msg;
            msg << "[precompiled_cuda_kernel] Invalid scalar argument type. "
                << "Received " << vtype_name
                << " but must be one of bool, int or float";
            throw std::invalid_argument(msg.str());
          }
        }

        return mx::fast::precompiled_cuda_kernel(
            name,
            std::string(
                static_cast<const char*>(compiled_source.data()),
                compiled_source.size()),
            inputs,
            output_shapes,
            output_dtypes,
            scalars,
            grid,
            threadgroup,
            shared_memory,
            init_value,
            ensure_row_contiguous,
            s);
      },
      nb::kw_only(),
      "name"_a,
      "compiled_source"_a,
      "inputs"_a,
      "output_shapes"_a,
      "output_dtypes"_a,
      "scalars"_a,
      "grid"_a,
      "threadgroup"_a,
      "shared_memory"_a = 0,
      "init_value"_a = nb::none(),
      "ensure_row_contiguous"_a = false,
      "stream"_a = nb::none(),
      R"pbdoc(
      Run a precompiled CUDA kernel defined from PTX or cubin.

      This op is still experimental and various parts of the API may change.

      Args:
        name (str): Name for the kernel
        compiled_source (bytes): The precompiled kernel in raw bytes.
        inputs (List[array]): The inputs passed to the CUDA kernel.
        output_shapes (List[Sequence[int]]): The list of shapes for each output.
        output_dtypes (List[Dtype]): The list of data types for each output.
        scalars (List[Union[bool, int, float]]): A list of scalar arguments to
          pass to the kernel.
        grid (tuple[int, int, int]): 3-tuple specifying the grid to launch the kernel with.
          For compatibility with :func:`metal_kernel` the grid is in threads and not in threadblocks.
        threadgroup (tuple[int, int, int]): 3-tuple specifying the threadgroup size to use.
        shared_memory (int): The dynamic shared memory to request for the
          kernel. A value of 0 means no dynamic shared memory. Default: ``0``.
        init_value (float, optional): Optional value to use to initialize all of the output arrays.
            By default, output arrays are uninitialized. Default: ``None``.
        ensure_row_contiguous (bool): Whether to ensure the inputs are row contiguous
           before the kernel runs. Default: ``False``.
        stream (mx.stream, optional): Stream to run the kernel on. Default: ``None``.
      )pbdoc");
}


================================================
FILE: python/src/fft.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>
#include <numeric>

#include "mlx/fft.h"
#include "mlx/ops.h"
#include "python/src/small_vector.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Registers the ``fft`` submodule on ``parent_module`` and binds the FFT entry
// points from ``mx::fft`` to Python.
//
// Conventions shared by every binding below:
//   - The lambda parameter ``n`` holds the Python-facing size argument (named
//     ``n`` for the 1-D transforms and ``s`` for the 2-D / n-D transforms).
//   - The lambda parameter ``s`` is the stream or device, which Python callers
//     pass as the ``stream`` keyword. It is unrelated to the Python ``s``
//     argument mentioned in the docstrings and error messages.
//   - An optional argument left as ``None`` selects the ``mx::fft`` overload
//     that omits that argument.
//   - For the 2-D / n-D transforms, passing sizes while ``axes`` is ``None``
//     throws ``std::invalid_argument``.
void init_fft(nb::module_& parent_module) {
  auto m = parent_module.def_submodule(
      "fft", "mlx.core.fft: Fast Fourier Transforms.");

  // ---- 1-D complex transforms ----
  m.def(
      "fft",
      [](const mx::array& a,
         const std::optional<int>& n,
         int axis,
         mx::StreamOrDevice s) {
        if (n.has_value()) {
          return mx::fft::fft(a, n.value(), axis, s);
        } else {
          return mx::fft::fft(a, axis, s);
        }
      },
      "a"_a,
      "n"_a = nb::none(),
      "axis"_a = -1,
      "stream"_a = nb::none(),
      R"pbdoc(
        One dimensional discrete Fourier Transform.

        Args:
            a (array): The input array.
            n (int, optional): Size of the transformed axis. The
               corresponding axis in the input is truncated or padded with
               zeros to match ``n``. The default value is ``a.shape[axis]``.
            axis (int, optional): Axis along which to perform the FFT. The
               default is ``-1``.

        Returns:
            array: The DFT of the input along the given axis.
      )pbdoc");
  m.def(
      "ifft",
      [](const mx::array& a,
         const std::optional<int>& n,
         int axis,
         mx::StreamOrDevice s) {
        if (n.has_value()) {
          return mx::fft::ifft(a, n.value(), axis, s);
        } else {
          return mx::fft::ifft(a, axis, s);
        }
      },
      "a"_a,
      "n"_a = nb::none(),
      "axis"_a = -1,
      "stream"_a = nb::none(),
      R"pbdoc(
        One dimensional inverse discrete Fourier Transform.

        Args:
            a (array): The input array.
            n (int, optional): Size of the transformed axis. The
               corresponding axis in the input is truncated or padded with
               zeros to match ``n``. The default value is ``a.shape[axis]``.
            axis (int, optional): Axis along which to perform the FFT. The
               default is ``-1``.

        Returns:
            array: The inverse DFT of the input along the given axis.
      )pbdoc");

  // ---- 2-D complex transforms ----
  // These forward to the n-D implementation; only the default ``axes``
  // (``[-2, -1]``) differs from ``fftn`` / ``ifftn``. ``axes`` may still be
  // passed explicitly as ``None``.
  m.def(
      "fft2",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::fftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::fftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[fft2] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::fftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a.none() = std::vector<int>{-2, -1},
      "stream"_a = nb::none(),
      R"pbdoc(
        Two dimensional discrete Fourier Transform.

        Args:
            a (array): The input array.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``[-2, -1]``.

        Returns:
            array: The DFT of the input along the given axes.
      )pbdoc");
  m.def(
      "ifft2",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::ifftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::ifftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[ifft2] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::ifftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a.none() = std::vector<int>{-2, -1},
      "stream"_a = nb::none(),
      R"pbdoc(
        Two dimensional inverse discrete Fourier Transform.

        Args:
            a (array): The input array.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``[-2, -1]``.

        Returns:
            array: The inverse DFT of the input along the given axes.
      )pbdoc");

  // ---- n-D complex transforms ----
  m.def(
      "fftn",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::fftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::fftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[fftn] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::fftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a = nb::none(),
      "stream"_a = nb::none(),
      R"pbdoc(
        n-dimensional discrete Fourier Transform.

        Args:
            a (array): The input array.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``None`` in which case the FFT is over the last
               ``len(s)`` axes or all axes if ``s`` is also ``None``.

        Returns:
            array: The DFT of the input along the given axes.
      )pbdoc");
  m.def(
      "ifftn",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::ifftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::ifftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[ifftn] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::ifftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a = nb::none(),
      "stream"_a = nb::none(),
      R"pbdoc(
        n-dimensional inverse discrete Fourier Transform.

        Args:
            a (array): The input array.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``None`` in which case the FFT is over the last
               ``len(s)`` axes or all axes if ``s`` is also ``None``.

        Returns:
            array: The inverse DFT of the input along the given axes.
      )pbdoc");

  // ---- 1-D real transforms ----
  m.def(
      "rfft",
      [](const mx::array& a,
         const std::optional<int>& n,
         int axis,
         mx::StreamOrDevice s) {
        if (n.has_value()) {
          return mx::fft::rfft(a, n.value(), axis, s);
        } else {
          return mx::fft::rfft(a, axis, s);
        }
      },
      "a"_a,
      "n"_a = nb::none(),
      "axis"_a = -1,
      "stream"_a = nb::none(),
      R"pbdoc(
        One dimensional discrete Fourier Transform on a real input.

        The output has the same shape as the input except along ``axis`` in
        which case it has size ``n // 2 + 1``.

        Args:
            a (array): The input array. If the array is complex it will be silently
               cast to a real type.
            n (int, optional): Size of the transformed axis. The
               corresponding axis in the input is truncated or padded with
               zeros to match ``n``. The default value is ``a.shape[axis]``.
            axis (int, optional): Axis along which to perform the FFT. The
               default is ``-1``.

        Returns:
            array: The DFT of the input along the given axis. The output
            data type will be complex.
      )pbdoc");
  m.def(
      "irfft",
      [](const mx::array& a,
         const std::optional<int>& n,
         int axis,
         mx::StreamOrDevice s) {
        if (n.has_value()) {
          return mx::fft::irfft(a, n.value(), axis, s);
        } else {
          return mx::fft::irfft(a, axis, s);
        }
      },
      "a"_a,
      "n"_a = nb::none(),
      "axis"_a = -1,
      "stream"_a = nb::none(),
      R"pbdoc(
        The inverse of :func:`rfft`.

        The output has the same shape as the input except along ``axis`` in
        which case it has size ``n``.

        Args:
            a (array): The input array.
            n (int, optional): Size of the transformed axis. The
               corresponding axis in the input is truncated or padded with
               zeros to match ``n // 2 + 1``. The default value is
               ``a.shape[axis] // 2 + 1``.
            axis (int, optional): Axis along which to perform the FFT. The
               default is ``-1``.

        Returns:
            array: The real array containing the inverse of :func:`rfft`.
      )pbdoc");

  // ---- 2-D real transforms ----
  // As with ``fft2`` / ``ifft2``, these forward to the n-D implementation with
  // a default ``axes`` of ``[-2, -1]``.
  m.def(
      "rfft2",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::rfftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::rfftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[rfft2] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::rfftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a.none() = std::vector<int>{-2, -1},
      "stream"_a = nb::none(),
      R"pbdoc(
        Two dimensional real discrete Fourier Transform.

        The output has the same shape as the input except along the dimensions in
        ``axes`` in which case it has sizes from ``s``. The last axis in ``axes`` is
        treated as the real axis and will have size ``s[-1] // 2 + 1``.

        Args:
            a (array): The input array. If the array is complex it will be silently
               cast to a real type.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``[-2, -1]``.

        Returns:
            array: The real DFT of the input along the given axes. The output
            data type will be complex.
      )pbdoc");
  m.def(
      "irfft2",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::irfftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::irfftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[irfft2] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::irfftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a.none() = std::vector<int>{-2, -1},
      "stream"_a = nb::none(),
      R"pbdoc(
        The inverse of :func:`rfft2`.

        Note the input is generally complex. The dimensions of the input
        specified in ``axes`` are padded or truncated to match the sizes
        from ``s``. The last axis in ``axes`` is treated as the real axis
        and will have size ``s[-1] // 2 + 1``.

        Args:
            a (array): The input array.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s`` except for the last axis
               which has size ``s[-1] // 2 + 1``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``[-2, -1]``.

        Returns:
            array: The real array containing the inverse of :func:`rfft2`.
      )pbdoc");

  // ---- n-D real transforms ----
  m.def(
      "rfftn",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::rfftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::rfftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[rfftn] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::rfftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a = nb::none(),
      "stream"_a = nb::none(),
      R"pbdoc(
        n-dimensional real discrete Fourier Transform.

        The output has the same shape as the input except along the dimensions in
        ``axes`` in which case it has sizes from ``s``. The last axis in ``axes`` is
        treated as the real axis and will have size ``s[-1] // 2 + 1``.

        Args:
            a (array): The input array. If the array is complex it will be silently
               cast to a real type.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``None`` in which case the FFT is over the last
               ``len(s)`` axes or all axes if ``s`` is also ``None``.

        Returns:
            array: The real DFT of the input along the given axes. The output
            data type will be complex.
      )pbdoc");
  m.def(
      "irfftn",
      [](const mx::array& a,
         const std::optional<mx::Shape>& n,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value() && n.has_value()) {
          return mx::fft::irfftn(a, n.value(), axes.value(), s);
        } else if (axes.has_value()) {
          return mx::fft::irfftn(a, axes.value(), s);
        } else if (n.has_value()) {
          throw std::invalid_argument(
              "[irfftn] `axes` should not be `None` if `s` is not `None`.");
        } else {
          return mx::fft::irfftn(a, s);
        }
      },
      "a"_a,
      "s"_a = nb::none(),
      "axes"_a = nb::none(),
      "stream"_a = nb::none(),
      R"pbdoc(
        The inverse of :func:`rfftn`.

        Note the input is generally complex. The dimensions of the input
        specified in ``axes`` are padded or truncated to match the sizes
        from ``s``. The last axis in ``axes`` is treated as the real axis
        and will have size ``s[-1] // 2 + 1``.

        Args:
            a (array): The input array.
            s (list(int), optional): Sizes of the transformed axes. The
               corresponding axes in the input are truncated or padded with
               zeros to match the sizes in ``s`` except for the last axis
               which has size ``s[-1] // 2 + 1``. The default value is the
               sizes of ``a`` along ``axes``.
            axes (list(int), optional): Axes along which to perform the FFT.
               The default is ``None`` in which case the FFT is over the last
               ``len(s)`` axes or all axes if ``s`` is also ``None``.

        Returns:
            array: The real array containing the inverse of :func:`rfftn`.
      )pbdoc");

  // ---- Frequency shifting helpers ----
  m.def(
      "fftshift",
      [](const mx::array& a,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value()) {
          return mx::fft::fftshift(a, axes.value(), s);
        } else {
          return mx::fft::fftshift(a, s);
        }
      },
      "a"_a,
      "axes"_a = nb::none(),
      "stream"_a = nb::none(),
      R"pbdoc(
        Shift the zero-frequency component to the center of the spectrum.

        Args:
            a (array): The input array.
            axes (list(int), optional): Axes over which to perform the shift.
               If ``None``, shift all axes. 

        Returns:
            array: The shifted array with the same shape as the input.
      )pbdoc");
  m.def(
      "ifftshift",
      [](const mx::array& a,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (axes.has_value()) {
          return mx::fft::ifftshift(a, axes.value(), s);
        } else {
          return mx::fft::ifftshift(a, s);
        }
      },
      "a"_a,
      "axes"_a = nb::none(),
      "stream"_a = nb::none(),
      R"pbdoc(
        The inverse of :func:`fftshift`. While identical to :func:`fftshift` for even-length axes,
        the behavior differs for odd-length axes.

        Args:
            a (array): The input array.
            axes (list(int), optional): Axes over which to perform the inverse shift.
               If ``None``, shift all axes. 

        Returns:
            array: The inverse-shifted array with the same shape as the input.
      )pbdoc");
}


================================================
FILE: python/src/indexing.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <numeric>
#include <optional>
#include <sstream>

#include "python/src/convert.h"
#include "python/src/indexing.h"

#include "mlx/ops.h"

// Returns true when the slice's ``start``, ``stop`` and ``step`` attributes
// are all ``None``. The attributes are inspected in that order and the check
// stops at the first one that is set.
bool is_none_slice(const nb::slice& in_slice) {
  const auto unset = [&in_slice](const char* field) {
    return nb::getattr(in_slice, field).is_none();
  };
  return unset("start") && unset("stop") && unset("step");
}

// Returns true when ``obj`` can be used as a scalar integer index.
//
// Booleans are rejected up front, then anything that fails ``PyIndex_Check``.
// Objects exposing an integer ``ndim`` attribute greater than zero are also
// rejected so multi-dimensional arrays (mx.array, np.ndarray) are not treated
// as scalars.
bool is_index_scalar(const nb::object& obj) {
  if (nb::isinstance<nb::bool_>(obj) || !PyIndex_Check(obj.ptr())) {
    return false;
  }

  // Without an ``ndim`` attribute there is nothing further to rule out.
  if (!nb::hasattr(obj, "ndim")) {
    return true;
  }

  const auto rank = nb::getattr(obj, "ndim");
  const bool multi_dim =
      nb::isinstance<nb::int_>(rank) && nb::cast<int>(rank) > 0;
  return !multi_dim;
}

// Converts ``obj`` to a 32-bit ``int`` via ``PyNumber_Index``.
//
// Throws ``nb::python_error`` when ``PyNumber_Index`` fails, and
// ``std::invalid_argument`` when the value lies outside
// [INT32_MIN, INT32_MAX].
int safe_to_int32(nb::object obj) {
  // ``steal`` takes over the reference returned by PyNumber_Index.
  nb::object as_index = nb::steal<nb::object>(PyNumber_Index(obj.ptr()));
  if (!as_index.is_valid()) {
    throw nb::python_error();
  }

  // Widen to 64 bits first so the range check below is meaningful.
  const auto wide = nb::cast<int64_t>(nb::cast<nb::int_>(as_index));
  const bool fits_in_32_bits = wide >= INT32_MIN && wide <= INT32_MAX;
  if (!fits_in_32_bits) {
    throw std::invalid_argument("Slice indices must be 32-bit integers.");
  }
  return static_cast<int>(wide);
}

// Reads one slice field: returns `default_val` for None, the 32-bit integer
// value for an index scalar, and throws std::invalid_argument otherwise.
int get_slice_int(nb::object obj, int default_val) {
  if (obj.is_none()) {
    return default_val;
  }
  if (is_index_scalar(obj)) {
    return safe_to_int32(obj);
  }
  throw std::invalid_argument("Slice indices must be integers or None.");
}

// Extracts (start, stop, step) from a Python slice into the output
// references, filling in defaults for missing fields.
//
// Defaults follow numpy's convention for an axis with n elements and step k:
//   - step defaults to 1
//   - start defaults to 0 for k > 0 and n - 1 for k < 0
//   - stop defaults to n for k > 0 and -n - 1 for k < 0
// The values are returned as given: negative entries are not wrapped and
// nothing is clamped to the axis size.
void get_slice_params(
    mx::ShapeElem& starts,
    mx::ShapeElem& ends,
    mx::ShapeElem& strides,
    const nb::slice& in_slice,
    int axis_size) {
  // The step has to be read first because it selects the other defaults.
  strides = get_slice_int(nb::getattr(in_slice, "step"), 1);
  const bool reversed = strides < 0;

  const int default_start = reversed ? axis_size - 1 : 0;
  const int default_stop = reversed ? -axis_size - 1 : axis_size;

  starts = get_slice_int(nb::getattr(in_slice, "start"), default_start);
  ends = get_slice_int(nb::getattr(in_slice, "stop"), default_stop);
}

// Converts a Python integer index into a scalar uint32 array, adding
// `axis_size` to negative values. No bounds check is performed.
mx::array get_int_index(nb::object idx, int axis_size) {
  const int raw = safe_to_int32(idx);
  const int wrapped = raw < 0 ? raw + axis_size : raw;
  return mx::array(wrapped, mx::uint32);
}

// Returns true for every object kind accepted as one entry of an index
// tuple: slice, integer scalar, mx::array, None, Ellipsis, or list.
bool is_valid_index_type(const nb::object& obj) {
  if (nb::isinstance<nb::slice>(obj)) {
    return true;
  }
  if (is_index_scalar(obj)) {
    return true;
  }
  if (nb::isinstance<mx::array>(obj)) {
    return true;
  }
  if (obj.is_none()) {
    return true;
  }
  if (nb::ellipsis().is(obj)) {
    return true;
  }
  return nb::isinstance<nb::list>(obj);
}

// Implements `src[start:stop:step]`: slices the first axis and keeps every
// other axis whole. Throws std::invalid_argument for 0-dimensional arrays.
mx::array mlx_get_item_slice(const mx::array& src, const nb::slice& in_slice) {
  // A 0-dim array cannot be sliced (parity with numpy)
  if (src.ndim() == 0) {
    throw std::invalid_argument(
        "too many indices for array: array is 0-dimensional");
  }

  // `src[:]` selects everything, so the input is returned as is
  if (is_none_slice(in_slice)) {
    return src;
  }

  // Start from the full range on every axis and override axis 0 only
  mx::Shape begin(src.ndim(), 0);
  auto finish = src.shape();
  mx::Shape step(src.ndim(), 1);

  const int first_axis_size = finish[0];
  get_slice_params(begin[0], finish[0], step[0], in_slice, first_axis_size);

  return slice(src, begin, finish, step);
}

// Implements `src[indices]` for a single array index by taking along axis 0.
// Throws std::invalid_argument for 0-dimensional sources and for boolean
// index arrays.
mx::array mlx_get_item_array(const mx::array& src, const mx::array& indices) {
  // A 0-dim array cannot be indexed (parity with numpy)
  if (src.ndim() == 0) {
    throw std::invalid_argument(
        "too many indices for array: array is 0-dimensional");
  }

  const bool is_mask = indices.dtype() == mx::bool_;
  if (is_mask) {
    throw std::invalid_argument("boolean indices are not yet supported");
  }

  // A lone index array always addresses the leading axis, as in numpy
  constexpr int kAxis = 0;
  return take(src, indices, kAxis);
}

// Implements `src[i]` for a single integer index by taking along axis 0.
// Throws std::invalid_argument for 0-dimensional sources.
mx::array mlx_get_item_int(const mx::array& src, const nb::object& idx) {
  // A 0-dim array cannot be indexed (parity with numpy)
  if (src.ndim() == 0) {
    throw std::invalid_argument(
        "too many indices for array: array is 0-dimensional");
  }

  // A lone integer always addresses the leading axis, as in numpy
  auto position = get_int_index(idx, src.shape(0));
  return take(src, position, 0);
}

// Gathers from the leading `indices.size()` axes of `src` using a mix of
// slices, integer scalars and index arrays.
//
// Args:
//   src: The array to gather from.
//   indices: One entry per leading axis of `src`. Slices are expanded into
//     explicit index ranges; scalars and arrays are used directly. Entries of
//     any other kind are ignored here (callers filter them out beforehand).
//   gather_first: When true the broadcast array/scalar index dimensions are
//     placed before the slice dimensions in the result, otherwise after them.
//   max_dims: Output. Set to the largest rank among the array indices.
//
// Returns:
//   The gathered array with the size-one gathered axes squeezed out.
mx::array mlx_gather_nd(
    mx::array src,
    const std::vector<nb::object>& indices,
    bool gather_first,
    int& max_dims) {
  max_dims = 0;
  std::vector<mx::array> gather_indices;
  std::vector<bool> is_slice(indices.size(), false);
  int num_slices = 0;

  // Resolve one slice bound against an axis of size `n` the way Python does:
  // negative values count from the end and out-of-range values are clamped.
  // Without the clamp a slice such as `:100` on a short axis would expand into
  // out-of-bounds gather indices, and a start below `-n` would wrap around
  // when converted to uint32.
  auto resolve_bound =
      [](mx::ShapeElem value, mx::ShapeElem n, bool reversed) {
        if (value < 0) {
          value += n;
          if (value < 0) {
            value = reversed ? -1 : 0;
          }
        } else if (value >= n) {
          value = reversed ? n - 1 : n;
        }
        return value;
      };

  // gather all the arrays
  for (int i = 0; i < static_cast<int>(indices.size()); i++) {
    auto& idx = indices[i];

    if (nb::isinstance<nb::slice>(idx)) {
      mx::ShapeElem start, end, stride;
      const auto axis_size = src.shape(i);
      get_slice_params(
          start, end, stride, nb::cast<nb::slice>(idx), axis_size);

      // Handle negative and out-of-range bounds
      const bool reversed = stride < 0;
      start = resolve_bound(start, axis_size, reversed);
      end = resolve_bound(end, axis_size, reversed);

      gather_indices.push_back(arange(start, end, stride, mx::uint32));
      num_slices++;
      is_slice[i] = true;
    } else if (is_index_scalar(idx)) {
      gather_indices.push_back(get_int_index(idx, src.shape(i)));
    } else if (nb::isinstance<mx::array>(idx)) {
      auto arr = nb::cast<mx::array>(idx);
      max_dims = std::max(static_cast<int>(arr.ndim()), max_dims);
      gather_indices.push_back(arr);
    }
  }

  const int num_gather = static_cast<int>(gather_indices.size());

  // reshape them so that the int/array indices are first
  if (gather_first) {
    int slice_index = 0;
    for (int i = 0; i < num_gather; i++) {
      if (is_slice[i]) {
        // Each slice gets its own trailing dimension
        mx::Shape index_shape(max_dims + num_slices, 1);
        index_shape[max_dims + slice_index] = gather_indices[i].shape(0);
        gather_indices[i] = reshape(gather_indices[i], std::move(index_shape));
        slice_index++;
      } else {
        // Arrays/scalars keep their shape and gain one unit dim per slice
        auto index_shape = gather_indices[i].shape();
        index_shape.insert(index_shape.end(), num_slices, 1);
        gather_indices[i] = reshape(gather_indices[i], std::move(index_shape));
      }
    }
  } else {
    // reshape them so that the int/array indices are last; in this layout the
    // first `num_slices` entries are the slices
    for (int i = 0; i < num_gather; i++) {
      if (i < num_slices) {
        mx::Shape index_shape(max_dims + num_slices, 1);
        index_shape[i] = gather_indices[i].shape(0);
        gather_indices[i] = reshape(gather_indices[i], std::move(index_shape));
      }
    }
  }

  // Do the gather: one element along each indexed axis, everything along the
  // remaining axes
  std::vector<int> axes(indices.size());
  std::iota(axes.begin(), axes.end(), 0);
  auto slice_sizes = src.shape();
  std::fill(slice_sizes.begin(), slice_sizes.begin() + indices.size(), 1);
  src = gather(src, gather_indices, axes, slice_sizes);

  // Squeeze the array index dims, which follow the index dimensions in the
  // gather output
  for (auto& ax : axes) {
    ax += max_dims + num_slices;
  }
  return mx::squeeze(src, axes);
}

// Replaces a single `...` in an index tuple with explicit full-range slices.
//
// Args:
//   shape: Shape of the array being indexed.
//   entries: The index tuple as given by the caller.
//
// Returns:
//   A pair (non_none_indices, indices): the number of entries that consume an
//   array axis (everything except None) after expansion, and the expanded
//   index list.
//
// Throws std::invalid_argument if an entry has an unsupported type or if more
// than one ellipsis is present. The function does not check that the number
// of indices fits the array; callers compare `non_none_indices` with the rank.
auto mlx_expand_ellipsis(const mx::Shape& shape, const nb::tuple& entries) {
  std::vector<nb::object> indices;

  // Go over all entries and note the position of ellipsis
  int non_none_indices_before = 0;
  int non_none_indices_after = 0;
  std::vector<nb::object> r_indices;
  int i = 0;
  bool has_ellipsis = false;
  const int num_entries = static_cast<int>(entries.size());

  // Start from dimension 0 till we hit an ellipsis
  for (; i < num_entries; i++) {
    auto idx = entries[i];
    if (!is_valid_index_type(idx)) {
      throw std::invalid_argument(
          "Cannot index mlx array using the given type yet");
    }
    if (!nb::ellipsis().is(idx)) {
      indices.push_back(idx);
      non_none_indices_before += !idx.is_none();
    } else {
      has_ellipsis = true;
      break;
    }
  }

  // If we do hit an ellipsis, collect indices from the back
  for (int j = num_entries - 1; j > i; j--) {
    auto idx = entries[j];
    if (!is_valid_index_type(idx)) {
      throw std::invalid_argument(
          "Cannot index mlx array using the given type yet");
    }
    if (nb::ellipsis().is(idx)) {
      throw std::invalid_argument(
          "An index can only have a single ellipsis (...)");
    }
    r_indices.push_back(idx);
    non_none_indices_after += !idx.is_none();
  }

  // Count up the number of non none indices
  int non_none_indices = non_none_indices_before + non_none_indices_after;

  // Expand ellipsis
  if (has_ellipsis) {
    // The bound is computed in signed arithmetic on purpose: when more indices
    // follow the ellipsis than the array has dimensions the difference is
    // negative and the loop must not run. An unsigned difference would wrap
    // around and read `shape` far out of bounds.
    const int ellipsis_end =
        static_cast<int>(shape.size()) - non_none_indices_after;
    for (int axis = non_none_indices_before; axis < ellipsis_end; axis++) {
      indices.push_back(
          nb::slice(mx::ShapeElem{0}, shape[axis], mx::ShapeElem{1}));
      non_none_indices++;
    }
  }

  // Insert indices collected after the ellipsis (they were gathered back to
  // front, so restore their original order)
  indices.insert(indices.end(), r_indices.rbegin(), r_indices.rend());

  return std::make_pair(non_none_indices, indices);
}

// Implements `src[entries]` for a tuple of indices that may mix slices,
// integer scalars, index arrays, lists, None and a single ellipsis.
//
// Array/scalar indices (when at least one real array is present) are resolved
// with a gather through mlx_gather_nd; whatever is left (slices, None, and
// integer scalars when no array is present) is resolved afterwards with one
// slice followed by squeeze / expand_dims.
//
// Throws std::invalid_argument when more non-None indices are given than the
// array has dimensions, or when an entry has an unsupported type.
mx::array mlx_get_item_nd(mx::array src, const nb::tuple& entries) {
  // No indices make this a noop
  if (entries.size() == 0) {
    return src;
  }

  // The plan is as follows:
  // 1. Replace the ellipsis with a series of slice(None)
  // 2. Convert list to array
  // 3. Loop over the indices and calculate the gather indices
  // 4. Calculate the remaining slices and reshapes

  // Ellipsis handling
  auto [non_none_indices, indices] = mlx_expand_ellipsis(src.shape(), entries);
  // List handling
  for (auto& idx : indices) {
    if (nb::isinstance<nb::list>(idx)) {
      idx = nb::cast(array_from_list(nb::cast<nb::list>(idx), {}));
    }
  }

  // Check for the number of indices passed
  if (non_none_indices > src.ndim()) {
    std::ostringstream msg;
    msg << "Too many indices for array with " << src.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // Gather handling
  //
  // Check whether we have arrays or integer indices and delegate to gather_nd
  // after removing the slices at the end and all Nones.
  std::vector<nb::object> remaining_indices;
  bool have_array = false;
  {
    // First check whether the results of gather are going to be 1st or
    // normally in between.
    //
    // gather_first becomes true when an array/scalar index is found after a
    // slice/None that itself came after an earlier array/scalar index, i.e.
    // the array-like indices are not contiguous.
    bool have_non_array = false;
    bool gather_first = false;
    for (auto& idx : indices) {
      if (nb::isinstance<mx::array>(idx) || is_index_scalar(idx)) {
        if (have_array && have_non_array) {
          gather_first = true;
          break;
        }
        have_array = true;
      } else {
        have_non_array |= have_array;
      }
    }

    // Count the real array indices (integer scalars are not counted)
    int n_arr = 0;
    for (auto& idx : indices) {
      n_arr += nb::isinstance<mx::array>(idx);
    }

    // Only take the gather path when at least one real array is present;
    // with integer scalars alone the slice path below handles everything.
    have_array &= n_arr > 0;

    if (have_array) {
      int last_array;
      // Then find the last array
      for (last_array = indices.size() - 1; last_array >= 0; last_array--) {
        auto& idx = indices[last_array];
        if (nb::isinstance<mx::array>(idx) || is_index_scalar(idx)) {
          break;
        }
      }

      // Everything up to and including the last array/scalar index takes part
      // in the gather, except None which does not consume a source axis.
      std::vector<nb::object> gather_indices;
      for (int i = 0; i <= last_array; i++) {
        auto& idx = indices[i];
        if (!idx.is_none()) {
          gather_indices.push_back(idx);
        }
      }
      int max_dims;
      src = mlx_gather_nd(src, gather_indices, gather_first, max_dims);

      // Reassemble the indices for the slicing or reshaping if there are any
      if (gather_first) {
        // The gathered array dims come first: keep them whole ...
        for (int i = 0; i < max_dims; i++) {
          remaining_indices.push_back(
              nb::slice(nb::none(), nb::none(), nb::none()));
        }
        // ... then keep the already-applied slices whole and preserve the
        // Nones that appeared before the last array ...
        for (int i = 0; i < last_array; i++) {
          auto& idx = indices[i];
          if (idx.is_none()) {
            remaining_indices.push_back(indices[i]);
          } else if (nb::isinstance<nb::slice>(idx)) {
            remaining_indices.push_back(
                nb::slice(nb::none(), nb::none(), nb::none()));
          }
        }
        // ... and finally apply the trailing indices that were not gathered.
        for (int i = last_array + 1; i < indices.size(); i++) {
          remaining_indices.push_back(indices[i]);
        }
      } else {
        // Leading slices were already applied by the gather (keep them
        // whole); leading Nones still have to be inserted.
        for (int i = 0; i < indices.size(); i++) {
          auto& idx = indices[i];
          if (nb::isinstance<mx::array>(idx) || is_index_scalar(idx)) {
            break;
          } else if (idx.is_none()) {
            remaining_indices.push_back(idx);
          } else {
            remaining_indices.push_back(
                nb::slice(nb::none(), nb::none(), nb::none()));
          }
        }
        // Then the gathered array dims are kept whole
        for (int i = 0; i < max_dims; i++) {
          remaining_indices.push_back(
              nb::slice(nb::none(), nb::none(), nb::none()));
        }
        // And the trailing indices that were not gathered are applied
        for (int i = last_array + 1; i < indices.size(); i++) {
          remaining_indices.push_back(indices[i]);
        }
      }
    }
  }
  // The gather consumed every index: nothing left to slice or reshape
  if (have_array && remaining_indices.empty()) {
    return src;
  }
  // No gather happened: all the indices are handled by the slice path
  if (remaining_indices.empty()) {
    remaining_indices = indices;
  }

  bool squeeze_needed = false;
  bool unsqueeze_needed = false;

  // Slice handling
  {
    mx::Shape starts(src.ndim(), 0);
    auto ends = src.shape();
    mx::Shape strides(src.ndim(), 1);
    int axis = 0;
    for (auto& idx : remaining_indices) {
      if (!idx.is_none()) {
        if (!have_array && is_index_scalar(idx)) {
          // An integer becomes a length-one slice; the axis is squeezed away
          // afterwards
          int st = safe_to_int32(idx);
          st = (st < 0) ? st + src.shape(axis) : st;

          starts[axis] = st;
          ends[axis] = st + 1;

          squeeze_needed = true;

        } else {
          get_slice_params(
              starts[axis],
              ends[axis],
              strides[axis],
              nb::cast<nb::slice>(idx),
              ends[axis]);
        }

        axis++;
      } else {
        // None does not consume a source axis; it adds one later
        unsqueeze_needed = true;
      }
    }
    src = slice(src, starts, ends, strides);
  }

  // Unsqueeze handling
  if (unsqueeze_needed || squeeze_needed) {
    std::vector<int> squeeze_axes;
    std::vector<int> unsqueeze_axes;
    for (int axis = 0; axis < remaining_indices.size(); ++axis) {
      auto& idx = remaining_indices[axis];
      if (unsqueeze_needed && idx.is_none()) {
        // Position in the output, i.e. after the squeezes seen so far
        unsqueeze_axes.push_back(axis - squeeze_axes.size());
      } else if (squeeze_needed && is_index_scalar(idx)) {
        // Position in the sliced array, which has no None axes yet
        squeeze_axes.push_back(axis - unsqueeze_axes.size());
      }
    }
    if (!squeeze_axes.empty()) {
      src = squeeze(src, std::move(squeeze_axes));
    }
    if (!unsqueeze_axes.empty()) {
      src = expand_dims(src, std::move(unsqueeze_axes));
    }
  }

  return src;
}

// Entry point for `src[obj]`: dispatches on the Python type of the index.
// Throws std::invalid_argument when the index type is not supported.
mx::array mlx_get_item(const mx::array& src, const nb::object& obj) {
  if (nb::isinstance<nb::slice>(obj)) {
    return mlx_get_item_slice(src, nb::cast<nb::slice>(obj));
  }
  if (nb::isinstance<mx::array>(obj)) {
    return mlx_get_item_array(src, nb::cast<mx::array>(obj));
  }
  if (is_index_scalar(obj)) {
    return mlx_get_item_int(src, obj);
  }
  if (nb::isinstance<nb::tuple>(obj)) {
    return mlx_get_item_nd(src, nb::cast<nb::tuple>(obj));
  }
  if (nb::isinstance<nb::ellipsis>(obj)) {
    // `src[...]` selects the whole array
    return src;
  }
  if (obj.is_none()) {
    // `src[None]` adds a leading unit axis
    return expand_dims(src, 0);
  }
  if (nb::isinstance<nb::list>(obj)) {
    // A list is converted to an index array first
    auto as_array = array_from_list(nb::cast<nb::list>(obj), {});
    return mlx_get_item_array(src, as_array);
  }
  throw std::invalid_argument("Cannot index mlx array using the given type.");
}

// Builds the (indices, updates, axes) scatter arguments for `src[i] = update`
// with a single integer index. Throws std::invalid_argument for
// 0-dimensional sources.
std::tuple<std::vector<mx::array>, mx::array, std::vector<int>>
mlx_scatter_args_int(
    const mx::array& src,
    const nb::object& idx,
    const mx::array& update) {
  if (src.ndim() == 0) {
    throw std::invalid_argument(
        "too many indices for array: array is 0-dimensional");
  }

  // Count the leading singleton dimensions of the update so they can be
  // dropped before broadcasting
  int leading = 0;
  while (leading < update.ndim() && update.shape(leading) == 1) {
    ++leading;
  }
  const auto& full_shape = update.shape();
  auto trimmed_shape = mx::Shape(full_shape.begin() + leading, full_shape.end());

  // The update is broadcast to the shape of src[0:1, ...]
  auto target_shape = src.shape();
  target_shape[0] = 1;

  std::vector<mx::array> index_arrays{get_int_index(idx, src.shape(0))};
  auto updates = broadcast_to(reshape(update, trimmed_shape), target_shape);
  return {index_arrays, updates, std::vector<int>{0}};
}

// Removes every size-one dimension at the front of `in`, stopping at the
// first dimension whose size is not one.
mx::array squeeze_leading_singletons(const mx::array& in) {
  std::vector<int> leading_axes;
  for (int ax = 0; ax < in.ndim() && in.shape(ax) == 1; ++ax) {
    leading_axes.push_back(ax);
  }
  return mx::squeeze(in, leading_axes);
}

// Builds the (indices, updates, axes) scatter arguments for
// `src[indices] = update` with a single index array addressing axis 0.
// Throws std::invalid_argument for 0-dimensional sources.
std::tuple<std::vector<mx::array>, mx::array, std::vector<int>>
mlx_scatter_args_array(
    const mx::array& src,
    const mx::array& indices,
    const mx::array& update) {
  if (src.ndim() == 0) {
    throw std::invalid_argument(
        "too many indices for array: array is 0-dimensional");
  }

  auto updates = squeeze_leading_singletons(update);

  // First broadcast the update to indices.shape + src.shape[1:] ...
  const auto& src_shape = src.shape();
  auto target_shape = indices.shape();
  target_shape.insert(
      target_shape.end(), src_shape.begin() + 1, src_shape.end());
  updates = broadcast_to(updates, target_shape);

  // ... then add a unit dimension for the scattered axis, giving
  // indices.shape + [1] + src.shape[1:]
  target_shape.insert(target_shape.begin() + indices.ndim(), 1);
  updates = reshape(updates, target_shape);

  return {std::vector<mx::array>{indices}, updates, std::vector<int>{0}};
}

// Builds the (indices, updates, axes) scatter arguments for
// `src[start:stop:step] = update` with a single slice over axis 0.
//
// A bare `:` returns no indices and the update broadcast to the full source
// shape. A unit-step slice scatters one contiguous block located by its start
// index. Any other step expands the slice into an explicit index array.
// Throws std::invalid_argument for 0-dimensional sources.
std::tuple<std::vector<mx::array>, mx::array, std::vector<int>>
mlx_scatter_args_slice(
    const mx::array& src,
    const nb::slice& in_slice,
    const mx::array& update) {
  // A 0-dim array cannot be sliced (parity with numpy)
  if (src.ndim() == 0) {
    throw std::invalid_argument(
        "too many indices for array: array is 0-dimensional");
  }

  // `src[:] = update` replaces everything: broadcast the update to the
  // source shape and return it without any indices
  if (is_none_slice(in_slice)) {
    return {
        {}, broadcast_to(squeeze_leading_singletons(update), src.shape()), {}};
  }

  // Read the slice bounds for the first axis
  const auto axis_size = src.shape(0);
  mx::ShapeElem start = 0;
  auto end = axis_size;
  mx::ShapeElem stride = 1;
  get_slice_params(start, end, stride, in_slice, axis_size);

  // Strided slices are turned into explicit indices
  if (stride != 1) {
    return mlx_scatter_args_array(
        src, arange(start, end, stride, mx::uint32), update);
  }

  // Unit stride: a single scatter of a contiguous block.
  // Squeeze out singleton dims from the start of update
  auto updates = squeeze_leading_singletons(update);

  // One index marking where the block starts
  auto block_start = mx::array({start}, {1}, mx::uint32);

  // Number of rows covered by the slice
  int slice_size = (end - start);

  // Broadcast the update to [1, slice_size] + src.shape[1:]
  mx::Shape block_shape = {1, slice_size};
  block_shape.insert(
      block_shape.end(), src.shape().begin() + 1, src.shape().end());
  updates = broadcast_to(updates, block_shape);

  return {
      std::vector<mx::array>{block_start}, updates, std::vector<int>{0}};
}

// Builds the (indices, updates, axes) scatter arguments for
// `src[entries] = update` where `entries` is a tuple that may mix slices,
// integer scalars, index arrays, lists, None and a single ellipsis.
//
// Returns one index array per indexed source axis (all broadcast to a common
// shape), the update reshaped to match what scatter expects, and the axes
// 0..k-1. When the tuple contains no axis-consuming index, no indices are
// returned and the update is simply broadcast to the source shape.
//
// Throws std::invalid_argument when too many indices are given or an entry
// has an unsupported type.
std::tuple<std::vector<mx::array>, mx::array, std::vector<int>>
mlx_scatter_args_nd(
    const mx::array& src,
    const nb::tuple& entries,
    const mx::array& update) {
  // Expand ellipses into a series of ':' slices
  auto [non_none_indices, indices] = mlx_expand_ellipsis(src.shape(), entries);

  // Convert List to array
  for (auto& idx : indices) {
    if (nb::isinstance<nb::list>(idx)) {
      idx = nb::cast(array_from_list(nb::cast<nb::list>(idx), {}));
    }
  }

  if (non_none_indices > src.ndim()) {
    std::ostringstream msg;
    msg << "Too many indices for array with " << src.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  auto up = squeeze_leading_singletons(update);

  // If no non-None indices return the broadcasted update
  if (non_none_indices == 0) {
    return {{}, broadcast_to(up, src.shape()), {}};
  }

  // Analyse the types of the indices
  size_t max_dim = 0; // largest rank among the array indices
  bool arrays_first = false; // array indices are separated by a slice/None
  int num_none = 0;
  int num_slices = 0;
  int num_arrays = 0;
  int num_strided_slices = 0; // slices whose step is not 1
  // Number of unit-step slices after the last array or strided slice; these
  // can be scattered as contiguous blocks instead of explicit indices.
  int num_simple_slices_post = 0;
  {
    bool have_array = false;
    bool have_non_array = false;
    for (auto& idx : indices) {
      if (idx.is_none()) {
        have_non_array = have_array;
        num_none++;

      } else if (nb::isinstance<nb::slice>(idx)) {
        have_non_array = have_array;
        num_slices++;

        auto slice = nb::cast<nb::slice>(idx);
        int stride = get_slice_int(nb::getattr(slice, "step"), 1);
        if (stride != 1) {
          num_strided_slices++;
          num_simple_slices_post = 0;
        } else {
          num_simple_slices_post++;
        }

      } else if (nb::isinstance<mx::array>(idx)) {
        have_array = true;
        if (have_array && have_non_array) {
          arrays_first = true;
        }
        max_dim = std::max(nb::cast<mx::array>(idx).ndim(), max_dim);
        num_arrays++;
        num_simple_slices_post = 0;
      }
      // Integer scalars are not tracked here; they are handled in the main
      // loop below.
    }
  }

  // We have index dims for the arrays, strided slices (implemented as arrays),
  // none
  int idx_ndim = max_dim + num_none + num_slices - num_simple_slices_post;

  // If we have simple non-strided slices, we also attach an index for that
  idx_ndim = idx_ndim == 0 ? 1 : idx_ndim;

  // Go over each index type and translate to the needed scatter args
  std::vector<mx::array> arr_indices;
  int slice_num = 0; // next free index dimension for a slice/None
  int array_num = 0; // array indices processed so far
  int ax = 0; // current source axis

  // We collect the shapes of the slices and updates during this process
  std::vector<int> update_shape(non_none_indices, 1);
  std::vector<int> slice_shapes;

  for (int i = 0; i < indices.size(); ++i) {
    auto& pyidx = indices[i];
    if (nb::isinstance<nb::slice>(pyidx)) {
      mx::ShapeElem start, end, stride;
      auto axis_size = src.shape(ax++);
      get_slice_params(
          start, end, stride, nb::cast<nb::slice>(pyidx), axis_size);

      // Handle negative indices
      // NOTE(review): the bounds are only shifted by the axis size here, not
      // clamped to it — confirm out-of-range slice bounds are acceptable.
      start = (start < 0) ? start + axis_size : start;
      end = (end < 0) ? end + axis_size : end;

      mx::Shape idx_shape(idx_ndim, 1);

      // If it's a simple slice, we only need to add the start index
      if (array_num >= num_arrays && num_strided_slices <= 0 && stride == 1) {
        auto idx = mx::array({start}, idx_shape, mx::uint32);
        slice_shapes.push_back(end - start);
        arr_indices.push_back(idx);

        // Add the shape to the update
        update_shape[ax - 1] = slice_shapes.back();
      }
      // Otherwise we expand the slice into indices using arange
      else {
        auto idx = arange(start, end, stride, mx::uint32);
        auto loc = slice_num + (arrays_first ? max_dim : 0);
        idx_shape[loc] = idx.size();
        arr_indices.push_back(reshape(idx, idx_shape));

        slice_num++;
        num_strided_slices--;

        // Add the shape to the update
        update_shape[ax - 1] = 1;
      }
    } else if (is_index_scalar(pyidx)) {
      // Add index to arrays
      arr_indices.push_back(get_int_index(pyidx, src.shape(ax++)));
      // Add the shape to the update
      update_shape[ax - 1] = 1;
    } else if (pyidx.is_none()) {
      // We only use the None's for bookkeeping dimensions
      slice_num++;
    } else if (nb::isinstance<mx::array>(pyidx)) {
      ax++;
      auto idx = nb::cast<mx::array>(pyidx);
      mx::Shape idx_shape(idx_ndim, 1);

      // Place the arrays in the correct dimension: right-aligned inside the
      // block of `max_dim` array dimensions, which starts at 0 when the arrays
      // come first and after the preceding slices/Nones otherwise
      int st = (!arrays_first) * slice_num + max_dim - idx.ndim();
      for (int j = 0; j < idx.ndim(); j++) {
        idx_shape[st + j] = idx.shape()[j];
      }
      arr_indices.push_back(reshape(idx, idx_shape));
      // After the last array, later slices go behind the array dimensions
      if (!arrays_first && ++array_num == num_arrays) {
        slice_num += max_dim;
      }

      // Add the shape to the update
      update_shape[ax - 1] = 1;
    } else {
      throw std::invalid_argument(
          "Cannot index mlx array using the given type yet");
    }
  }

  // Broadcast the update to the indices and slices
  arr_indices = broadcast_arrays(arr_indices);
  auto up_shape_broadcast = arr_indices[0].shape();

  // Target shape: index dims + contiguous slice sizes + untouched source dims
  up_shape_broadcast.insert(
      up_shape_broadcast.end(), slice_shapes.begin(), slice_shapes.end());
  up_shape_broadcast.insert(
      up_shape_broadcast.end(),
      src.shape().begin() + non_none_indices,
      src.shape().end());
  up = broadcast_to(up, up_shape_broadcast);

  // Reshape the update with the size-1 dims for the int and array indices
  auto up_reshape = arr_indices[0].shape();
  up_reshape.insert(up_reshape.end(), update_shape.begin(), update_shape.end());
  up_reshape.insert(
      up_reshape.end(),
      src.shape().begin() + non_none_indices,
      src.shape().end());

  up = reshape(up, up_reshape);

  // Collect axes
  std::vector<int> axes(arr_indices.size(), 0);
  std::iota(axes.begin(), axes.end(), 0);

  return {arr_indices, up, axes};
}

// Computes the (indices, updates, axes) arguments of a scatter that
// implements `src[obj] <op>= v`.
//
// Args:
//   src: The array being updated.
//   obj: The Python index (slice, array, integer, tuple, None, Ellipsis or
//     list).
//   v: The update value; it is converted to an array of `src`'s dtype.
//
// Returns:
//   The index arrays, the update broadcast/reshaped for scatter, and the
//   scattered axes. The index list is empty when the update covers the whole
//   array, in which case the returned update already has the source shape.
//
// Throws std::invalid_argument when the index type is not supported.
std::tuple<std::vector<mx::array>, mx::array, std::vector<int>>
mlx_compute_scatter_args(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  auto vals = to_array(v, src.dtype());
  if (nb::isinstance<nb::slice>(obj)) {
    return mlx_scatter_args_slice(src, nb::cast<nb::slice>(obj), vals);
  } else if (nb::isinstance<mx::array>(obj)) {
    return mlx_scatter_args_array(src, nb::cast<mx::array>(obj), vals);
  } else if (is_index_scalar(obj)) {
    return mlx_scatter_args_int(src, obj, vals);
  } else if (nb::isinstance<nb::tuple>(obj)) {
    return mlx_scatter_args_nd(src, nb::cast<nb::tuple>(obj), vals);
  } else if (obj.is_none() || nb::isinstance<nb::ellipsis>(obj)) {
    // `src[None]` and `src[...]` both address the whole array. The ellipsis
    // is accepted here so that updates agree with mlx_get_item, which already
    // supports a bare `...`.
    return {{}, broadcast_to(vals, src.shape()), {}};
  } else if (nb::isinstance<nb::list>(obj)) {
    return mlx_scatter_args_array(
        src, array_from_list(nb::cast<nb::list>(obj), {}), vals);
  }

  throw std::invalid_argument("Cannot index mlx array using the given type.");
}

// Tries to express `src[obj] <op>= v` as a single strided slice update.
//
// Args:
//   src: The array being updated.
//   obj: The Python index.
//   v: The update value; it is converted to an array of `src`'s dtype.
//
// Returns:
//   (update, starts, stops, strides). `update` is std::nullopt when the index
//   cannot be handled as a slice update: `src` is 0-dimensional, `obj` is not
//   a slice, tuple or integer, or the tuple contains an array or a list. In
//   that case the slice parameters describe the full array and the caller is
//   expected to fall back to another path. Otherwise `update` is the value
//   with its dimensions adjusted to line up with the sliced region.
//
// Throws std::invalid_argument when a tuple holds more non-None indices than
// `src` has dimensions or contains an unsupported entry.
std::tuple<std::optional<mx::array>, mx::Shape, mx::Shape, mx::Shape>
mlx_compute_slice_update_args(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // Build the slice params, defaulting to the full range on every axis
  mx::Shape starts(src.ndim(), 0);
  mx::Shape stops = src.shape();
  mx::Shape strides(src.ndim(), 1);

  // Can't route to slice update if not slice, tuple, or int
  if (src.ndim() == 0 ||
      (!nb::isinstance<nb::slice>(obj) && !nb::isinstance<nb::tuple>(obj) &&
       !is_index_scalar(obj))) {
    return std::make_tuple(
        std::nullopt, std::move(starts), std::move(stops), std::move(strides));
  }
  if (nb::isinstance<nb::tuple>(obj)) {
    // Can't route to slice update if any arrays are present
    for (auto idx : nb::cast<nb::tuple>(obj)) {
      if (nb::isinstance<mx::array>(idx) || nb::isinstance<nb::list>(idx)) {
        return std::make_tuple(
            std::nullopt,
            std::move(starts),
            std::move(stops),
            std::move(strides));
      }
    }
  }

  // Should be able to route to slice update, just extract the update value
  // and the slice arguments.

  // Cast v to an array and ensure it is the right type
  auto update = to_array(v, src.dtype());

  // Remove extra leading singleton dimensions from the update: only while the
  // update has more dimensions than the source, and never the last one
  int s = 0;
  while (s < static_cast<int>(update.ndim()) - 1 && update.shape(s) == 1 &&
         (update.ndim() - s) > src.ndim()) {
    s++;
  }
  auto squeeze_axes = std::vector<int>(s);
  std::iota(squeeze_axes.begin(), squeeze_axes.end(), 0);
  update = mx::squeeze(update, squeeze_axes);

  // Single int then make it a slice of size 1. The source has at least one
  // dimension here because 0-dimensional sources returned early above.
  if (is_index_scalar(obj)) {
    auto idx = safe_to_int32(obj);
    idx = idx < 0 ? idx + stops[0] : idx;
    starts[0] = idx;
    stops[0] = idx + 1;
    return std::make_tuple(
        update, std::move(starts), std::move(stops), std::move(strides));
  }

  // Simple slice, just extract it into the first dim
  if (nb::isinstance<nb::slice>(obj)) {
    // Read slice arguments
    get_slice_params(
        starts[0],
        stops[0],
        strides[0],
        nb::cast<nb::slice>(obj),
        src.shape(0));
    return std::make_tuple(
        update, std::move(starts), std::move(stops), std::move(strides));
  }

  // It must be a tuple
  auto entries = nb::cast<nb::tuple>(obj);

  // Expand ellipsis into a series of ':' slices
  auto [non_none_indices, indices] = mlx_expand_ellipsis(src.shape(), entries);

  // Dimension check
  if (non_none_indices > src.ndim()) {
    std::ostringstream msg;
    msg << "Too many indices for array with " << src.ndim() << " dimensions.";
    throw std::invalid_argument(msg.str());
  }

  // If no non-None indices return the broadcasted update
  if (non_none_indices == 0) {
    return std::make_tuple(
        broadcast_to(update, src.shape()),
        std::move(starts),
        std::move(stops),
        std::move(strides));
  }

  // Parse the update slice, walking the indices from last to first.
  //   ax      source axis addressed by the current index
  //   upd_ax  update dimension that lines up with the current index
  // Axes are recorded as negative positions so they stay valid while the
  // number of leading update dimensions changes.
  const int num_indices = static_cast<int>(indices.size());
  const int unspecified = static_cast<int>(src.ndim()) - non_none_indices;
  int ax = non_none_indices - 1;
  int upd_ax = static_cast<int>(update.ndim()) - unspecified - 1;
  std::vector<int> none_axes; // update dims to squeeze (matched with None)
  std::vector<int> scalar_axes; // update dims to insert (matched with ints)
  for (int i = num_indices - 1; i >= 0; --i) {
    auto& pyidx = indices[i];
    if (nb::isinstance<nb::slice>(pyidx)) {
      get_slice_params(
          starts[ax],
          stops[ax],
          strides[ax],
          nb::cast<nb::slice>(pyidx),
          src.shape(ax));
      ax--;
      upd_ax--;
    } else if (is_index_scalar(pyidx)) {
      int st = safe_to_int32(pyidx);
      // Wrap negative indices with the size of the source axis being
      // addressed. This must use `ax`, not `i`: `i` also counts None entries,
      // so the two differ as soon as a None precedes this index.
      st = (st < 0) ? st + src.shape(ax) : st;
      starts[ax] = st;
      stops[ax] = st + 1;
      if (upd_ax >= 0) {
        scalar_axes.push_back(i - num_indices - unspecified);
      }
      ax--;
    } else if (pyidx.is_none()) {
      if (upd_ax-- >= 0) {
        none_axes.push_back(i - num_indices - unspecified);
      }
    }
  }
  update = mx::squeeze(
      mx::expand_dims(update, std::move(scalar_axes)), std::move(none_axes));

  return std::make_tuple(
      update, std::move(starts), std::move(stops), std::move(strides));
}

// Returns `obj` as a boolean mask array when it is a Python bool, a boolean
// mx::array, a boolean C-contiguous CPU ndarray, or a list that converts to a
// boolean array. Returns std::nullopt for everything else.
std::optional<mx::array> extract_boolean_mask(const nb::object& obj) {
  using NDArray = nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu>;

  if (nb::isinstance<nb::bool_>(obj)) {
    return mx::array(nb::cast<bool>(obj), mx::bool_);
  }

  if (nb::isinstance<mx::array>(obj)) {
    auto candidate = nb::cast<mx::array>(obj);
    if (candidate.dtype() == mx::bool_) {
      return candidate;
    }
    return std::nullopt;
  }

  if (nb::isinstance<NDArray>(obj)) {
    auto candidate = nb::cast<NDArray>(obj);
    if (candidate.dtype() == nb::dtype<bool>()) {
      return nd_array_to_mlx(candidate, mx::bool_);
    }
    return std::nullopt;
  }

  if (nb::isinstance<nb::list>(obj)) {
    auto candidate = array_from_list(nb::cast<nb::list>(obj), {});
    if (candidate.dtype() == mx::bool_) {
      return candidate;
    }
    return std::nullopt;
  }

  return std::nullopt;
}

// Implements `src[obj] = v` by overwriting `src`'s descriptor with the
// updated array. Tries, in order: a slice update, a boolean-mask scatter, and
// a general scatter.
void mlx_set_item(
    mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // 1. Plain slices / integers / tuples without arrays
  auto [sliced, starts, stops, strides] =
      mlx_compute_slice_update_args(src, obj, v);
  if (sliced.has_value()) {
    src.overwrite_descriptor(
        slice_update(src, *sliced, starts, stops, strides));
    return;
  }

  // 2. Boolean masks
  auto mask = extract_boolean_mask(obj);
  if (mask) {
    auto mask_updates = to_array(v, src.dtype());
    auto masked = masked_scatter(src, *mask, mask_updates);
    src.overwrite_descriptor(masked);
    return;
  }

  // 3. Everything else goes through scatter; with no indices the update
  // already covers the whole array
  auto [indices, updates, axes] = mlx_compute_scatter_args(src, obj, v);
  if (indices.empty()) {
    src.overwrite_descriptor(updates);
    return;
  }
  auto scattered = scatter(src, indices, updates, axes);
  src.overwrite_descriptor(scattered);
}

// Returns a copy of `src` with `v` added at the positions selected by `obj`.
mx::array mlx_add_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // Prefer a slice update when the index allows it
  auto [sliced, starts, stops, strides] =
      mlx_compute_slice_update_args(src, obj, v);
  if (sliced.has_value()) {
    return slice_update_add(src, *sliced, starts, stops, strides);
  }

  // Otherwise scatter; with no indices the update covers the whole array
  auto [indices, updates, axes] = mlx_compute_scatter_args(src, obj, v);
  if (indices.empty()) {
    return src + updates;
  }
  return scatter_add(src, indices, updates, axes);
}

// Returns a copy of `src` with `v` subtracted at the positions selected by
// `obj`. Implemented as an addition of the negated update.
mx::array mlx_subtract_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // Prefer a slice update when the index allows it
  auto [sliced, starts, stops, strides] =
      mlx_compute_slice_update_args(src, obj, v);
  if (sliced.has_value()) {
    return slice_update_add(src, -(*sliced), starts, stops, strides);
  }

  // Otherwise scatter; with no indices the update covers the whole array
  auto [indices, updates, axes] = mlx_compute_scatter_args(src, obj, v);
  if (indices.empty()) {
    return src - updates;
  }
  return scatter_add(src, indices, -updates, axes);
}

// Returns a copy of `src` multiplied by `v` at the positions selected by
// `obj`.
mx::array mlx_multiply_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // Prefer a slice update when the index allows it
  auto [sliced, starts, stops, strides] =
      mlx_compute_slice_update_args(src, obj, v);
  if (sliced.has_value()) {
    return slice_update_prod(src, *sliced, starts, stops, strides);
  }

  // Otherwise scatter; with no indices the update covers the whole array
  auto [indices, updates, axes] = mlx_compute_scatter_args(src, obj, v);
  if (indices.empty()) {
    return src * updates;
  }
  return scatter_prod(src, indices, updates, axes);
}

// Returns a copy of `src` divided by `v` at the positions selected by `obj`.
// The indexed paths multiply by the reciprocal of the update; the
// whole-array path divides directly.
mx::array mlx_divide_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // Prefer a slice update when the index allows it
  auto [sliced, starts, stops, strides] =
      mlx_compute_slice_update_args(src, obj, v);
  if (sliced.has_value()) {
    return slice_update_prod(src, reciprocal(*sliced), starts, stops, strides);
  }

  // Otherwise scatter; with no indices the update covers the whole array
  auto [indices, updates, axes] = mlx_compute_scatter_args(src, obj, v);
  if (indices.empty()) {
    return src / updates;
  }
  return scatter_prod(src, indices, reciprocal(updates), axes);
}

// Returns a copy of `src` where the positions selected by `obj` hold the
// element-wise maximum of the existing value and `v`.
mx::array mlx_maximum_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // Prefer a slice update when the index allows it
  auto [sliced, starts, stops, strides] =
      mlx_compute_slice_update_args(src, obj, v);
  if (sliced.has_value()) {
    return slice_update_max(src, *sliced, starts, stops, strides);
  }

  // Otherwise scatter; with no indices the update covers the whole array
  auto [indices, updates, axes] = mlx_compute_scatter_args(src, obj, v);
  if (indices.empty()) {
    return maximum(src, updates);
  }
  return scatter_max(src, indices, updates, axes);
}

// Returns a copy of `src` where the positions selected by `obj` hold the
// element-wise minimum of the existing value and `v`.
mx::array mlx_minimum_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v) {
  // Prefer a slice update when the index allows it
  auto [sliced, starts, stops, strides] =
      mlx_compute_slice_update_args(src, obj, v);
  if (sliced.has_value()) {
    return slice_update_min(src, *sliced, starts, stops, strides);
  }

  // Otherwise scatter; with no indices the update covers the whole array
  auto [indices, updates, axes] = mlx_compute_scatter_args(src, obj, v);
  if (indices.empty()) {
    return minimum(src, updates);
  }
  return scatter_min(src, indices, updates, axes);
}


================================================
FILE: python/src/indexing.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <nanobind/nanobind.h>

#include "mlx/array.h"
#include "python/src/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;

// Indexing helpers operating on an array `src`, a Python object `obj` and
// (for the updating variants) a scalar-or-array operand `v`.
// NOTE(review): `obj` is presumably the Python index expression (ints,
// slices, arrays, tuples, ...) -- confirm against indexing.cpp.

// NOTE(review): presumably implements reads of the form `src[obj]` and
// returns the selected values -- confirm against the definition.
mx::array mlx_get_item(const mx::array& src, const nb::object& obj);

// Unlike the other helpers below, this one takes `src` by mutable reference
// and returns nothing.
// NOTE(review): presumably implements `src[obj] = v` by updating `src` in
// place -- confirm against the definition.
void mlx_set_item(
    mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v);

// The remaining helpers take `src` by const reference and return an array.

// NOTE(review): presumably the additive counterpart of the helpers below
// (adds `v` at the entries addressed by `obj`) -- confirm.
mx::array mlx_add_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v);

// NOTE(review): presumably subtracts `v` at the entries addressed by `obj`
// -- confirm.
mx::array mlx_subtract_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v);

// NOTE(review): presumably multiplies the entries addressed by `obj` by `v`
// -- confirm.
mx::array mlx_multiply_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v);

// Divides the entries addressed by `obj` by `v`. The definition performs a
// product update with the reciprocal of the update values, or `src / updates`
// when no scatter indices are produced.
mx::array mlx_divide_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v);

// Maximum update: the definition uses a slice/scatter max update, or
// `maximum(src, updates)` when no scatter indices are produced.
mx::array mlx_maximum_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v);

// Minimum update: the definition uses a slice/scatter min update, or
// `minimum(src, updates)` when no scatter indices are produced.
mx::array mlx_minimum_item(
    const mx::array& src,
    const nb::object& obj,
    const ScalarOrArray& v);


================================================
FILE: python/src/linalg.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <variant>

#include <nanobind/nanobind.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include "mlx/linalg.h"
#include "python/src/small_vector.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Creates the ``linalg`` submodule on ``parent_module`` and registers the
// Python bindings for the routines in ``mx::linalg``.
//
// Each binding follows the same layout: positional arguments, an optional
// ``nb::kw_only()`` marker, the keyword-only ``stream`` argument, an explicit
// ``nb::sig`` with the Python-facing signature, and the docstring.
void init_linalg(nb::module_& parent_module) {
  auto m = parent_module.def_submodule(
      "linalg", "mlx.core.linalg: linear algebra routines.");

  m.def(
      "norm",
      [](const mx::array& a,
         const std::variant<std::monostate, int, double, std::string>& ord_,
         const std::variant<std::monostate, int, std::vector<int>>& axis_,
         const bool keepdims,
         const mx::StreamOrDevice stream) {
        // Normalize ``axis`` to an optional list of axes; ``None`` (the
        // monostate alternative) leaves it unset.
        std::optional<std::vector<int>> axis = std::nullopt;
        if (auto pv = std::get_if<int>(&axis_); pv) {
          axis = std::vector<int>{*pv};
        } else if (auto pv = std::get_if<std::vector<int>>(&axis_); pv) {
          axis = *pv;
        }

        // Dispatch on the kind of ``ord``: none, named (string) or numeric.
        if (std::holds_alternative<std::monostate>(ord_)) {
          return mx::linalg::norm(a, axis, keepdims, stream);
        } else {
          if (auto pv = std::get_if<std::string>(&ord_); pv) {
            return mx::linalg::norm(a, *pv, axis, keepdims, stream);
          }
          // Integer and floating point orders share the ``double`` overload.
          double ord;
          if (auto pv = std::get_if<int>(&ord_); pv) {
            ord = *pv;
          } else {
            ord = std::get<double>(ord_);
          }
          return mx::linalg::norm(a, ord, axis, keepdims, stream);
        }
      },
      nb::arg(),
      "ord"_a = nb::none(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def norm(a: array, /, ord: Union[None, int, float, str] = None, axis: Union[None, int, list[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Matrix or vector norm.

        This function computes vector or matrix norms depending on the value of
        the ``ord`` and ``axis`` parameters.

        Args:
          a (array): Input array.  If ``axis`` is ``None``, ``a`` must be 1-D or 2-D,
            unless ``ord`` is ``None``. If both ``axis`` and ``ord`` are ``None``, the
            2-norm of ``a.flatten()`` will be returned.
          ord (int, float or str, optional): Order of the norm (see table under ``Notes``).
            If ``None``, the 2-norm (or Frobenius norm for matrices) will be computed
            along the given ``axis``.  Default: ``None``.
          axis (int or list(int), optional): If ``axis`` is an integer, it specifies the
            axis of ``a`` along which to compute the vector norms.  If ``axis`` is a
            2-tuple, it specifies the axes that hold 2-D matrices, and the matrix
            norms of these matrices are computed. If ``axis`` is ``None`` then
            either a vector norm (when ``a`` is 1-D) or a matrix norm (when ``a`` is
            2-D) is returned. Default: ``None``.
          keepdims (bool, optional): If ``True``, the axes which are normed over are
            left in the result as dimensions with size one. Default: ``False``.

        Returns:
          array: The output containing the norm(s).

        Notes:
          For values of ``ord < 1``, the result is, strictly speaking, not a
          mathematical norm, but it may still be useful for various numerical
          purposes.

          The following norms can be calculated:

          =====  ============================  ==========================
          ord    norm for matrices             norm for vectors
          =====  ============================  ==========================
          None   Frobenius norm                2-norm
          'fro'  Frobenius norm                --
          'nuc'  nuclear norm                  --
          inf    max(sum(abs(x), axis=1))      max(abs(x))
          -inf   min(sum(abs(x), axis=1))      min(abs(x))
          0      --                            sum(x != 0)
          1      max(sum(abs(x), axis=0))      as below
          -1     min(sum(abs(x), axis=0))      as below
          2      2-norm (largest sing. value)  as below
          -2     smallest singular value       as below
          other  --                            sum(abs(x)**ord)**(1./ord)
          =====  ============================  ==========================

          The Frobenius norm is given by [1]_:

              :math:`||A||_F = [\sum_{i,j} abs(a_{i,j})^2]^{1/2}`

          The nuclear norm is the sum of the singular values.

          Both the Frobenius and nuclear norm orders are only defined for
          matrices and raise a ``ValueError`` when ``a.ndim != 2``.

        References:
          .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*,
                 Baltimore, MD, Johns Hopkins University Press, 1985, pg. 15

        Examples:
          >>> import mlx.core as mx
          >>> from mlx.core import linalg as la
          >>> a = mx.arange(9) - 4
          >>> a
          array([-4, -3, -2, ..., 2, 3, 4], dtype=int32)
          >>> b = a.reshape((3,3))
          >>> b
          array([[-4, -3, -2],
                 [-1,  0,  1],
                 [ 2,  3,  4]], dtype=int32)
          >>> la.norm(a)
          array(7.74597, dtype=float32)
          >>> la.norm(b)
          array(7.74597, dtype=float32)
          >>> la.norm(b, 'fro')
          array(7.74597, dtype=float32)
          >>> la.norm(a, float("inf"))
          array(4, dtype=float32)
          >>> la.norm(b, float("inf"))
          array(9, dtype=float32)
          >>> la.norm(a, -float("inf"))
          array(0, dtype=float32)
          >>> la.norm(b, -float("inf"))
          array(2, dtype=float32)
          >>> la.norm(a, 1)
          array(20, dtype=float32)
          >>> la.norm(b, 1)
          array(7, dtype=float32)
          >>> la.norm(a, -1)
          array(0, dtype=float32)
          >>> la.norm(b, -1)
          array(6, dtype=float32)
          >>> la.norm(a, 2)
          array(7.74597, dtype=float32)
          >>> la.norm(a, 3)
          array(5.84804, dtype=float32)
          >>> la.norm(a, -3)
          array(0, dtype=float32)
          >>> c = mx.array([[ 1, 2, 3],
          ...               [-1, 1, 4]])
          >>> la.norm(c, axis=0)
          array([1.41421, 2.23607, 5], dtype=float32)
          >>> la.norm(c, axis=1)
          array([3.74166, 4.24264], dtype=float32)
          >>> la.norm(c, ord=1, axis=1)
          array([6, 6], dtype=float32)
          >>> m = mx.arange(8).reshape(2,2,2)
          >>> la.norm(m, axis=(1,2))
          array([3.74166, 11.225], dtype=float32)
          >>> la.norm(m[0, :, :]), la.norm(m[1, :, :])
          (array(3.74166, dtype=float32), array(11.225, dtype=float32))
      )pbdoc");
  m.def(
      "qr",
      &mx::linalg::qr,
      "a"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def qr(a: array, *, stream: Union[None, Stream, Device] = None) -> Tuple[array, array]"),
      R"pbdoc(
        The QR factorization of the input matrix.

        This function supports arrays with at least 2 dimensions. The matrices
        which are factorized are assumed to be in the last two dimensions of
        the input.

        Args:
            a (array): Input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            tuple(array, array): ``Q`` and ``R`` matrices such that ``Q @ R = a``.

        Example:
            >>> A = mx.array([[2., 3.], [1., 2.]])
            >>> Q, R = mx.linalg.qr(A, stream=mx.cpu)
            >>> Q
            array([[-0.894427, -0.447214],
                   [-0.447214, 0.894427]], dtype=float32)
            >>> R
            array([[-2.23607, -3.57771],
                   [0, 0.447214]], dtype=float32)
      )pbdoc");
  m.def(
      "svd",
      [](const mx::array& a,
         bool compute_uv /* = true */,
         mx::StreamOrDevice s /* = {} */) -> nb::object {
        const auto result = mx::linalg::svd(a, compute_uv, s);
        // A single-element result carries only the singular values; it is
        // returned as a bare array rather than a 1-tuple.
        if (result.size() == 1) {
          return nb::cast(result.at(0));
        } else {
          return nb::make_tuple(result.at(0), result.at(1), result.at(2));
        }
      },
      "a"_a,
      "compute_uv"_a = true,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def svd(a: array, compute_uv: bool = True, *, stream: Union[None, Stream, Device] = None) -> Union[Tuple[array, array, array], array]"),
      R"pbdoc(
        The Singular Value Decomposition (SVD) of the input matrix.

        This function supports arrays with at least 2 dimensions. When the input
        has more than two dimensions, the function iterates over all indices of the first
        a.ndim - 2 dimensions and for each combination SVD is applied to the last two indices.

        Args:
            a (array): Input array.
            compute_uv (bool, optional): If ``True``, return the ``U``, ``S``, and ``Vt`` components.
              If ``False``, return only the ``S`` array. Default: ``True``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            Union[tuple(array, ...), array]:
              If compute_uv is ``True`` returns the ``U``, ``S``, and ``Vt`` matrices, such that
              ``A = U @ diag(S) @ Vt``. If compute_uv is ``False`` returns singular values array ``S``.
      )pbdoc");
  m.def(
      "inv",
      &mx::linalg::inv,
      "a"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def inv(a: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the inverse of a square matrix.

        This function supports arrays with at least 2 dimensions. When the input
        has more than two dimensions, the inverse is computed for each matrix
        in the last two dimensions of ``a``.

        Args:
            a (array): Input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: ``ainv`` such that ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``
      )pbdoc");
  m.def(
      "tri_inv",
      &mx::linalg::tri_inv,
      "a"_a,
      "upper"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def tri_inv(a: array, upper: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the inverse of a triangular square matrix.

        This function supports arrays with at least 2 dimensions. When the input
        has more than two dimensions, the inverse is computed for each matrix
        in the last two dimensions of ``a``.

        Args:
            a (array): Input array.
            upper (bool, optional): Whether the array is upper or lower triangular. Defaults to ``False``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: ``ainv`` such that ``dot(a, ainv) = dot(ainv, a) = eye(a.shape[0])``
      )pbdoc");
  m.def(
      "cholesky",
      &mx::linalg::cholesky,
      "a"_a,
      "upper"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def cholesky(a: array, upper: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the Cholesky decomposition of a real symmetric positive semi-definite matrix.

        This function supports arrays with at least 2 dimensions. When the input
        has more than two dimensions, the Cholesky decomposition is computed for each matrix
        in the last two dimensions of ``a``.

        If the input matrix is not symmetric positive semi-definite, behaviour is undefined.

        Args:
            a (array): Input array.
            upper (bool, optional): If ``True``, return the upper triangular Cholesky factor.
              If ``False``, return the lower triangular Cholesky factor. Default: ``False``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
          array: If ``upper = False``, it returns a lower triangular ``L`` matrix such
          that ``L @ L.T = a``.  If ``upper = True``, it returns an upper triangular
          ``U`` matrix such that ``U.T @ U = a``.
      )pbdoc");
  // NOTE(review): the first argument is bound under the keyword name ``a``
  // while the advertised signature and docstring call it ``L``. The binding
  // name is left untouched so existing keyword callers keep working.
  m.def(
      "cholesky_inv",
      &mx::linalg::cholesky_inv,
      "a"_a,
      "upper"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def cholesky_inv(L: array, upper: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the inverse of a real symmetric positive semi-definite matrix using its Cholesky decomposition.

        Let :math:`\mathbf{A}` be a real symmetric positive semi-definite matrix and :math:`\mathbf{L}` its Cholesky decomposition such that:

        .. math::

          \begin{aligned}
            \mathbf{A} = \mathbf{L}\mathbf{L}^T
          \end{aligned}

        This function computes :math:`\mathbf{A}^{-1}`.

        This function supports arrays with at least 2 dimensions. When the input
        has more than two dimensions, the Cholesky inverse is computed for each matrix
        in the last two dimensions of :math:`\mathbf{L}`.

        If the input matrix is not a triangular matrix behaviour is undefined.

        Args:
            L (array): Input array.
            upper (bool, optional): Whether ``L`` is the upper (``True``) or
              lower (``False``) triangular Cholesky factor. Default: ``False``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
          array: :math:`\mathbf{A^{-1}}` where :math:`\mathbf{A} = \mathbf{L}\mathbf{L}^T`.
      )pbdoc");
  m.def(
      "pinv",
      &mx::linalg::pinv,
      "a"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def pinv(a: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the (Moore-Penrose) pseudo-inverse of a matrix.

        This function calculates a generalized inverse of a matrix using its
        singular-value decomposition. This function supports arrays with at least 2 dimensions.
        When the input has more than two dimensions, the inverse is computed for each
        matrix in the last two dimensions of ``a``.

        Args:
            a (array): Input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: ``aplus`` such that ``a @ aplus @ a = a``
      )pbdoc");
  m.def(
      "cross",
      &mx::linalg::cross,
      "a"_a,
      "b"_a,
      "axis"_a = -1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def cross(a: array, b: array, axis: int = -1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the cross product of two arrays along a specified axis.

        The cross product is defined for arrays with size 2 or 3 in the
        specified axis. If the size is 2 then the third value is assumed
        to be zero.

        Args:
            a (array): Input array.
            b (array): Input array.
            axis (int, optional): Axis along which to compute the cross
              product. Default: ``-1``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: The cross product of ``a`` and ``b`` along the specified axis.
      )pbdoc");
  m.def(
      "eigvals",
      &mx::linalg::eigvals,
      "a"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def eigvals(a: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the eigenvalues of a square matrix.

        This function differs from :func:`numpy.linalg.eigvals` in that the
        return type is always complex even if the eigenvalues are all real.

        This function supports arrays with at least 2 dimensions. When the
        input has more than two dimensions, the eigenvalues are computed for
        each matrix in the last two dimensions.

        Args:
            a (array): The input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: The eigenvalues (not necessarily in order).

        Example:
            >>> A = mx.array([[1., -2.], [-2., 1.]])
            >>> eigenvalues = mx.linalg.eigvals(A, stream=mx.cpu)
            >>> eigenvalues
            array([3+0j, -1+0j], dtype=complex64)
      )pbdoc");
  m.def(
      "eig",
      [](const mx::array& a, mx::StreamOrDevice s) {
        // Convert the C++ pair into a Python tuple.
        auto result = mx::linalg::eig(a, s);
        return nb::make_tuple(result.first, result.second);
      },
      "a"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def eig(a: array, *, stream: Union[None, Stream, Device] = None) -> Tuple[array, array]"),
      R"pbdoc(
        Compute the eigenvalues and eigenvectors of a square matrix.

        This function differs from :func:`numpy.linalg.eig` in that the
        return type is always complex even if the eigenvalues are all real.

        This function supports arrays with at least 2 dimensions. When the input
        has more than two dimensions, the eigenvalues and eigenvectors are
        computed for each matrix in the last two dimensions.

        Args:
            a (array): The input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            Tuple[array, array]:
              A tuple containing the eigenvalues and the normalized right
              eigenvectors. The column ``v[:, i]`` is the eigenvector
              corresponding to the i-th eigenvalue.

        Example:
            >>> A = mx.array([[1., -2.], [-2., 1.]])
            >>> w, v = mx.linalg.eig(A, stream=mx.cpu)
            >>> w
            array([3+0j, -1+0j], dtype=complex64)
            >>> v
            array([[0.707107+0j, 0.707107+0j],
                   [-0.707107+0j, 0.707107+0j]], dtype=complex64)
      )pbdoc");

  m.def(
      "eigvalsh",
      &mx::linalg::eigvalsh,
      "a"_a,
      "UPLO"_a = "L",
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def eigvalsh(a: array, UPLO: str = 'L', *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the eigenvalues of a complex Hermitian or real symmetric matrix.

        This function supports arrays with at least 2 dimensions. When the
        input has more than two dimensions, the eigenvalues are computed for
        each matrix in the last two dimensions.

        Args:
            a (array): Input array. Must be a real symmetric or complex
              Hermitian matrix.
            UPLO (str, optional): Whether to use the upper (``"U"``) or
              lower (``"L"``) triangle of the matrix.  Default: ``"L"``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: The eigenvalues in ascending order.

        Note:
            The input matrix is assumed to be symmetric (or Hermitian). Only
            the selected triangle is used. No checks for symmetry are performed.

        Example:
            >>> A = mx.array([[1., -2.], [-2., 1.]])
            >>> eigenvalues = mx.linalg.eigvalsh(A, stream=mx.cpu)
            >>> eigenvalues
            array([-1., 3.], dtype=float32)
      )pbdoc");
  m.def(
      "eigh",
      [](const mx::array& a, const std::string& UPLO, mx::StreamOrDevice s) {
        // Convert the C++ pair into a Python tuple.
        auto result = mx::linalg::eigh(a, UPLO, s);
        return nb::make_tuple(result.first, result.second);
      },
      "a"_a,
      "UPLO"_a = "L",
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def eigh(a: array, UPLO: str = 'L', *, stream: Union[None, Stream, Device] = None) -> Tuple[array, array]"),
      R"pbdoc(
        Compute the eigenvalues and eigenvectors of a complex Hermitian or
        real symmetric matrix.

        This function supports arrays with at least 2 dimensions. When the input
        has more than two dimensions, the eigenvalues and eigenvectors are
        computed for each matrix in the last two dimensions.

        Args:
            a (array): Input array. Must be a real symmetric or complex
              Hermitian matrix.
            UPLO (str, optional): Whether to use the upper (``"U"``) or
               lower (``"L"``) triangle of the matrix.  Default: ``"L"``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            Tuple[array, array]:
              A tuple containing the eigenvalues in ascending order and
              the normalized eigenvectors. The column ``v[:, i]`` is the
              eigenvector corresponding to the i-th eigenvalue.

        Note:
            The input matrix is assumed to be symmetric (or Hermitian). Only
            the selected triangle is used. No checks for symmetry are performed.

        Example:
            >>> A = mx.array([[1., -2.], [-2., 1.]])
            >>> w, v = mx.linalg.eigh(A, stream=mx.cpu)
            >>> w
            array([-1., 3.], dtype=float32)
            >>> v
            array([[ 0.707107, -0.707107],
                   [ 0.707107,  0.707107]], dtype=float32)
      )pbdoc");
  m.def(
      "lu",
      [](const mx::array& a, mx::StreamOrDevice s /* = {} */) {
        // Repack the three results into a Python tuple.
        auto result = mx::linalg::lu(a, s);
        return nb::make_tuple(result.at(0), result.at(1), result.at(2));
      },
      "a"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def lu(a: array, *, stream: Union[None, Stream, Device] = None) -> Tuple[array, array, array]"),
      R"pbdoc(
        Compute the LU factorization of the given matrix ``A``.

        Note, unlike the default behavior of ``scipy.linalg.lu``, the pivots
        are indices. To reconstruct the input use ``L[p, :] @ U`` for 2
        dimensions or ``mx.take_along_axis(L, p[..., None], axis=-2) @ U``
        for more than 2 dimensions.

        To construct the full permutation matrix do:

        .. code-block::

          P = mx.put_along_axis(mx.zeros_like(L), p[..., None], mx.array(1.0), axis=-1)

        Args:
            a (array): Input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            tuple(array, array, array):
              The ``p``, ``L``, and ``U`` arrays, such that ``A = L[p, :] @ U``
      )pbdoc");
  m.def(
      "lu_factor",
      &mx::linalg::lu_factor,
      "a"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def lu_factor(a: array, *, stream: Union[None, Stream, Device] = None) -> Tuple[array, array]"),
      R"pbdoc(
        Computes a compact representation of the LU factorization.

        Args:
            a (array): Input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            tuple(array, array): The ``LU`` matrix and ``pivots`` array.
      )pbdoc");
  m.def(
      "solve",
      &mx::linalg::solve,
      "a"_a,
      "b"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def solve(a: array, b: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the solution to a system of linear equations ``AX = B``.

        Args:
            a (array): Input array.
            b (array): Input array.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: The unique solution to the system ``AX = B``.
      )pbdoc");
  m.def(
      "solve_triangular",
      &mx::linalg::solve_triangular,
      "a"_a,
      "b"_a,
      nb::kw_only(),
      "upper"_a = false,
      "stream"_a = nb::none(),
      nb::sig(
          "def solve_triangular(a: array, b: array, *, upper: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Computes the solution of a triangular system of linear equations ``AX = B``.

        Args:
            a (array): Input array.
            b (array): Input array.
            upper (bool, optional): Whether the array is upper or lower
              triangular. Default: ``False``.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: The unique solution to the system ``AX = B``.
      )pbdoc");
}


================================================
FILE: python/src/load.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/stl/vector.h>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string_view>
#include <unordered_map>
#include <vector>

#include "mlx/io/load.h"
#include "mlx/ops.h"
#include "mlx/utils.h"
#include "python/src/load.h"
#include "python/src/small_vector.h"
#include "python/src/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

///////////////////////////////////////////////////////////////////////////////
// Helpers
///////////////////////////////////////////////////////////////////////////////

// Returns true when `obj` is a Python `str` or an instance of `pathlib.Path`.
bool is_str_or_path(nb::object obj) {
  if (!nb::isinstance<nb::str>(obj)) {
    // Not a string: fall back to an isinstance check against pathlib.Path,
    // which is looked up on every call.
    const nb::object path_cls = nb::module_::import_("pathlib").attr("Path");
    return nb::isinstance(obj, path_cls);
  }
  return true;
}

// Returns true when `file` exposes every attribute this file requires of a
// readable stream: `readinto`, `seek`, `tell` and `closed`.
bool is_istream_object(const nb::object& file) {
  static constexpr const char* kRequiredAttrs[] = {
      "readinto", "seek", "tell", "closed"};
  // Checked in order; stops at the first missing attribute.
  for (const char* attr_name : kRequiredAttrs) {
    if (!nb::hasattr(file, attr_name)) {
      return false;
    }
  }
  return true;
}

// Returns true when `file` exposes every attribute this file requires of a
// writable stream: `write`, `seek`, `tell` and `closed`.
bool is_ostream_object(const nb::object& file) {
  static constexpr const char* kRequiredAttrs[] = {
      "write", "seek", "tell", "closed"};
  // Checked in order; stops at the first missing attribute.
  for (const char* attr_name : kRequiredAttrs) {
    if (!nb::hasattr(file, attr_name)) {
      return false;
    }
  }
  return true;
}

// Asks `zipfile.is_zipfile` whether `file` is a zip archive.
//
// `zipfile` is the imported Python `zipfile` module. When `file` is a
// readable stream object, its position is recorded before the probe and
// restored afterwards.
bool is_zip_file(const nb::module_& zipfile, const nb::object& file) {
  const auto probe = [&zipfile, &file]() {
    return nb::cast<bool>(zipfile.attr("is_zipfile")(file));
  };

  // Paths (and anything else that is not a readable stream) need no
  // position bookkeeping.
  if (!is_istream_object(file)) {
    return probe();
  }

  auto saved_pos = file.attr("tell")();
  const bool looks_like_zip = probe();
  // Seek back to the saved absolute position (whence = 0).
  file.attr("seek")(saved_pos, 0);
  return looks_like_zip;
}

// Thin C++ wrapper around a Python `zipfile.ZipFile` instance.
//
// The constructor builds the `ZipFile` (always with `allowZip64=True`) and
// caches the list returned by `namelist()` together with the bound `open`,
// `read` and `close` methods.
class ZipFileWrapper {
 public:
  // `zipfile` is the imported Python `zipfile` module; `file`, `mode` and
  // `compression` are forwarded to the `ZipFile` constructor.
  ZipFileWrapper(
      const nb::module_& zipfile,
      const nb::object& file,
      char mode = 'r',
      int compression = 0)
      : zipfile_module_(zipfile),
        zipfile_object_(zipfile.attr("ZipFile")(
            file,
            "mode"_a = mode,
            "compression"_a = compression,
            "allowZip64"_a = true)),
        files_list_(zipfile_object_.attr("namelist")()),
        open_func_(zipfile_object_.attr("open")),
        read_func_(zipfile_object_.attr("read")),
        close_func_(zipfile_object_.attr("close")) {}

  // Member names captured at construction time, converted to C++ strings.
  std::vector<std::string> namelist() const {
    auto names = nb::cast<std::vector<std::string>>(files_list_);
    return names;
  }

  // Opens the archive member `key` in the given mode and returns the Python
  // file object.
  nb::object open(const std::string& key, char mode = 'r') {
    const bool writing = (mode == 'w');
    if (!writing) {
      return open_func_(key, "mode"_a = mode);
    }
    // Writes pass force_zip64, following numpy :
    // https://github.com/numpy/numpy/blob/db4f43983cb938f12c311e1f5b7165e270c393b4/numpy/lib/npyio.py#L742C36-L742C47
    return open_func_(key, "mode"_a = mode, "force_zip64"_a = true);
  }

 private:
  // Declaration order matters: it is the initialization order relied on by
  // the constructor's member-initializer list.
  nb::module_ zipfile_module_;
  nb::object zipfile_object_;
  nb::list files_list_;
  nb::object open_func_;
  nb::object read_func_;
  nb::object close_func_;
};

///////////////////////////////////////////////////////////////////////////////
// Loading
///////////////////////////////////////////////////////////////////////////////

class PyFileReader : public mx::io::Reader {
 public:
  PyFileReader(nb::object file)
      : pyistream_(file),
        readinto_func_(file.attr("readinto")),
        seek_func_(file.attr("seek")),
        tell_func_(file.attr("tell")) {}

  ~PyFileReader() {
    nb::gil_scoped_acquire gil;

    pyistream_.release().dec_ref();
    readinto_func_.release().dec_ref();
    seek_func_.release().dec_ref();
    tell_func_.release().dec_ref();
  }

  bool is_open() const override {
    bool out;
    {
      nb::gil_scoped_acquire gil;
      out = !nb::cast<bool>(pyistream_.attr("closed"));
    }
    return out;
  }

  bool good() const override {
    bool out;
    {
      nb::gil_scoped_acquire gil;
      out = !pyistream_.is_none();
    }
    return out;
  }

  size_t tell() override {
    size_t out;
    {
      nb::gil_scoped_acquire gil;
      out = nb::cast<size_t>(tell_func_());
    }
    return out;
  }

  void seek(int64_t off, std::ios_base::seekdir way = std::ios_base::beg)
      override {
    nb::gil_scoped_acquire gil;
    seek_func_(off, (int)way);
  }

  void read(char* data, size_t n) override {
    nb::gil_scoped_acquire gil;
    _read(data, n);
  }

  void read(char* data, size_t n, size_t offset) override {
    nb::gil_scoped_acquire gil;
    seek_func_(offset, (int)std::ios_base::beg);
    _read(data, n);
  }

  std::string label() const override {
    return "python file object";
  }

 private:
  void _read(char* data, size_t n) {
    nb::object memview =
        nb::steal<nb::object>(PyMemoryView_FromMemory(data, n, PyBUF_WRITE));
    if (!memview.is_valid()) {
      throw std::runtime_error("[load] Failed to create memoryview for read");
    }
    nb::object bytes_read = readinto_func_(memview);

    if (bytes_read.is_none() || nb::cast<size_t>(bytes_read) < n) {
      throw std::runtime_error("[load] Failed to read from python stream");
    }
  }

  nb::object pyistream_;
  nb::object readinto_func_;
  nb::object seek_func_;
  nb::object tell_func_;
};

// Loads a safetensors file from either a path (str / path-like) or an open
// binary input stream and returns the (arrays, metadata) pair.
//
// Throws std::invalid_argument when `file` is neither of those.
std::pair<
    std::unordered_map<std::string, mx::array>,
    std::unordered_map<std::string, std::string>>
mlx_load_safetensor_helper(nb::object file, mx::StreamOrDevice s) {
  // Path case: assume a .safetensors file path string.
  if (is_str_or_path(file)) {
    auto path = nb::cast<std::string>(nb::str(file));
    return mx::load_safetensors(path, s);
  }

  if (!is_istream_object(file)) {
    throw std::invalid_argument(
        "[load_safetensors] Input must be a file-like object, or string");
  }

  // The stream belongs to the caller, so evaluate every array right away
  // instead of leaving pending reads on a stream we do not control.
  auto loaded = mx::load_safetensors(std::make_shared<PyFileReader>(file), s);
  {
    nb::gil_scoped_release nogil;
    for (auto& entry : std::get<0>(loaded)) {
      entry.second.eval();
    }
  }
  return loaded;
}

// Loads a GGUF file. Only path inputs (str / path-like) are accepted; any
// other input raises std::invalid_argument.
mx::GGUFLoad mlx_load_gguf_helper(nb::object file, mx::StreamOrDevice s) {
  if (!is_str_or_path(file)) {
    throw std::invalid_argument("[load_gguf] Input must be a string");
  }

  // Assume .gguf file path string
  const std::string path = nb::cast<std::string>(nb::str(file));
  return mx::load_gguf(path, s);
}

// Loads every member of an .npz (zip) archive into a name -> array map.
//
// `file` may be a path or a file-like object accepted by `zipfile.ZipFile`.
// A trailing ".npy" is stripped from member names to form the keys. Throws
// std::invalid_argument when `zipfile.is_zipfile` rejects the input.
std::unordered_map<std::string, mx::array> mlx_load_npz_helper(
    nb::object file,
    mx::StreamOrDevice s) {
  const bool opened_from_path = is_str_or_path(file);

  nb::module_ zipfile = nb::module_::import_("zipfile");
  if (!is_zip_file(zipfile, file)) {
    throw std::invalid_argument(
        "[load_npz] Input must be a zip file or a file-like object that can be "
        "opened with zipfile.ZipFile");
  }

  // Member name in the archive -> loaded array
  std::unordered_map<std::string, mx::array> arrays;

  ZipFileWrapper archive(zipfile, file);
  for (const std::string& member : archive.namelist()) {
    // Each member is exposed by Python as its own file stream
    nb::object member_stream = archive.open(member);
    auto arr = mx::load(std::make_shared<PyFileReader>(member_stream), s);

    // Drop a ".npy" suffix, but only when something precedes it
    const size_t len = member.length();
    const bool has_npy_suffix =
        len > 4 && member.compare(len - 4, 4, ".npy") == 0;
    std::string key = has_npy_suffix ? member.substr(0, len - 4) : member;

    arrays.insert({key, arr});
  }

  // A stream handed in by the caller is not ours to keep reading from later,
  // so force evaluation now.
  if (!opened_from_path) {
    nb::gil_scoped_release nogil;
    for (auto& entry : arrays) {
      entry.second.eval();
    }
  }

  return arrays;
}

// Loads a single array in .npy format from a path or an open binary input
// stream. Throws std::invalid_argument for any other kind of input.
mx::array mlx_load_npy_helper(nb::object file, mx::StreamOrDevice s) {
  // Path case: assume a .npy file path string.
  if (is_str_or_path(file)) {
    return mx::load(nb::cast<std::string>(nb::str(file)), s);
  }

  if (!is_istream_object(file)) {
    throw std::invalid_argument(
        "[load_npy] Input must be a file-like object, or string");
  }

  // The stream is owned by the caller: evaluate immediately rather than
  // deferring reads on a stream we do not control.
  auto loaded = mx::load(std::make_shared<PyFileReader>(file), s);
  {
    nb::gil_scoped_release nogil;
    loaded.eval();
  }
  return loaded;
}

// Generic load entry point that dispatches on the file format.
//
// Args:
//   file: path (str / path-like) or binary file-like object.
//   format: one of "npy", "npz", "safetensors", "gguf". When absent it is
//     taken from the text after the last '.' of the path, or of the stream's
//     `name` attribute.
//   return_metadata: also return the metadata map (safetensors / gguf only).
//   s: stream or device forwarded to the format-specific loader.
//
// Throws std::invalid_argument when the input kind is unsupported, the format
// cannot be inferred or is unknown, or metadata is requested for npy / npz.
LoadOutputTypes mlx_load_helper(
    nb::object file,
    std::optional<std::string> format,
    bool return_metadata,
    mx::StreamOrDevice s) {
  if (!format.has_value()) {
    std::string fname;
    if (is_str_or_path(file)) {
      fname = nb::cast<std::string>(nb::str(file));
    } else if (is_istream_object(file)) {
      fname = nb::cast<std::string>(file.attr("name"));
    } else {
      throw std::invalid_argument(
          "[load] Input must be a file-like object opened in binary mode, or string");
    }
    size_t ext = fname.find_last_of('.');
    if (ext == std::string::npos) {
      throw std::invalid_argument(
          "[load] Could not infer file format from extension");
    }
    format.emplace(fname.substr(ext + 1));
  }

  const std::string& fmt = format.value();

  if (return_metadata && (fmt == "npy" || fmt == "npz")) {
    throw std::invalid_argument(
        "[load] metadata not supported for format " + fmt);
  }

  // For the formats that carry metadata, the loaded maps are moved into the
  // returned variant rather than copied entry by entry.
  if (fmt == "safetensors") {
    auto loaded = mlx_load_safetensor_helper(file, s);
    if (return_metadata) {
      return LoadOutputTypes(std::move(loaded));
    }
    return LoadOutputTypes(std::move(std::get<0>(loaded)));
  } else if (fmt == "npz") {
    return mlx_load_npz_helper(file, s);
  } else if (fmt == "npy") {
    return mlx_load_npy_helper(file, s);
  } else if (fmt == "gguf") {
    auto loaded = mlx_load_gguf_helper(file, s);
    if (return_metadata) {
      return LoadOutputTypes(std::move(loaded));
    }
    return LoadOutputTypes(std::move(std::get<0>(loaded)));
  } else {
    throw std::invalid_argument("[load] Unknown file format " + fmt);
  }
}

///////////////////////////////////////////////////////////////////////////////
// Saving
///////////////////////////////////////////////////////////////////////////////

class PyFileWriter : public mx::io::Writer {
 public:
  PyFileWriter(nb::object file)
      : pyostream_(file),
        write_func_(file.attr("write")),
        seek_func_(file.attr("seek")),
        tell_func_(file.attr("tell")) {}

  ~PyFileWriter() {
    nb::gil_scoped_acquire gil;

    pyostream_.release().dec_ref();
    write_func_.release().dec_ref();
    seek_func_.release().dec_ref();
    tell_func_.release().dec_ref();
  }

  bool is_open() const override {
    bool out;
    {
      nb::gil_scoped_acquire gil;
      out = !nb::cast<bool>(pyostream_.attr("closed"));
    }
    return out;
  }

  bool good() const override {
    bool out;
    {
      nb::gil_scoped_acquire gil;
      out = !pyostream_.is_none();
    }
    return out;
  }

  size_t tell() override {
    size_t out;
    {
      nb::gil_scoped_acquire gil;
      out = nb::cast<size_t>(tell_func_());
    }
    return out;
  }

  void seek(int64_t off, std::ios_base::seekdir way = std::ios_base::beg)
      override {
    nb::gil_scoped_acquire gil;
    seek_func_(off, (int)way);
  }

  void write(const char* data, size_t n) override {
    nb::gil_scoped_acquire gil;

    nb::object memview = nb::steal<nb::object>(
        PyMemoryView_FromMemory(const_cast<char*>(data), n, PyBUF_READ));
    if (!memview.is_valid()) {
      throw std::runtime_error("[load] Failed to create memoryview for write");
    }
    nb::object bytes_written = write_func_(memview);

    if (bytes_written.is_none() || nb::cast<size_t>(bytes_written) < n) {
      throw std::runtime_error("[load] Failed to write to python stream");
    }
  }

  std::string label() const override {
    return "python file object";
  }

 private:
  nb::object pyostream_;
  nb::object write_func_;
  nb::object seek_func_;
  nb::object tell_func_;
};

// Saves array `a` to `file`, which is either a path (str / path-like) or a
// writable, seekable Python stream. Throws std::invalid_argument otherwise.
void mlx_save_helper(nb::object file, mx::array a) {
  if (is_str_or_path(file)) {
    mx::save(nb::cast<std::string>(nb::str(file)), a);
    return;
  }

  if (!is_ostream_object(file)) {
    throw std::invalid_argument(
        "[save] Input must be a file-like object, or string");
  }

  auto py_writer = std::make_shared<PyFileWriter>(file);
  // Run the save with the GIL released; the writer re-acquires it itself
  // whenever it calls into Python.
  nb::gil_scoped_release nogil;
  mx::save(py_writer, a);
}

// Saves several arrays into a single .npz (zip) archive.
//
// Args:
//   file_: destination path or file-like object. For paths, ".npz" is
//     appended when the name does not already end with it.
//   args: positional arrays, stored under the names "arr_0", "arr_1", ...
//   kwargs: keyword arrays, stored under their keyword names.
//   compressed: use ZIP_DEFLATED instead of ZIP_STORED.
//
// Throws std::invalid_argument when a keyword name collides with one of the
// generated "arr_<i>" names.
void mlx_savez_helper(
    nb::object file_,
    nb::args args,
    const nb::kwargs& kwargs,
    bool compressed) {
  // Add .npz to the end of the filename if not already there
  nb::object file = file_;

  if (is_str_or_path(file)) {
    std::string fname = nb::cast<std::string>(nb::str(file_));

    // Add .npz to file name if it is not there
    if (fname.length() < 4 || fname.substr(fname.length() - 4, 4) != ".npz")
      fname += ".npz";

    file = nb::cast(fname);
  }

  // Collect args and kwargs
  auto arrays_dict =
      nb::cast<std::unordered_map<std::string, mx::array>>(kwargs);
  auto arrays_list = nb::cast<std::vector<mx::array>>(args);

  // Unsigned index to match the type of `size()`.
  for (size_t i = 0; i < arrays_list.size(); ++i) {
    std::string arr_name = "arr_" + std::to_string(i);

    if (arrays_dict.count(arr_name) > 0) {
      throw std::invalid_argument(
          "[savez] Cannot use un-named variables and keyword " + arr_name);
    }

    arrays_dict.insert({arr_name, arrays_list[i]});
  }

  // Create python ZipFile object depending on compression
  nb::module_ zipfile = nb::module_::import_("zipfile");
  int compression = nb::cast<int>(
      compressed ? zipfile.attr("ZIP_DEFLATED") : zipfile.attr("ZIP_STORED"));
  char mode = 'w';
  ZipFileWrapper zipfile_object(zipfile, file, mode, compression);

  // Save each array as "<name>.npy" inside the archive. Entries are bound by
  // reference so the key string and array handle are not copied per iteration.
  for (const auto& [k, a] : arrays_dict) {
    std::string fname = k + ".npy";
    auto py_ostream = zipfile_object.open(fname, 'w');
    auto writer = std::make_shared<PyFileWriter>(py_ostream);
    {
      nb::gil_scoped_release nogil;
      mx::save(writer, a);
    }
  }

  return;
}

// Saves the arrays in `d` (name -> array), plus optional string metadata `m`,
// in safetensors format to a path or a writable Python stream.
//
// Throws std::invalid_argument when the metadata is not a str -> str mapping
// or when `file` is neither a path nor an output stream.
void mlx_save_safetensor_helper(
    nb::object file,
    nb::dict d,
    std::optional<nb::dict> m) {
  // Stays empty when no metadata was supplied.
  std::unordered_map<std::string, std::string> metadata;
  if (m.has_value()) {
    try {
      metadata = nb::cast<std::unordered_map<std::string, std::string>>(*m);
    } catch (const nb::cast_error&) {
      throw std::invalid_argument(
          "[save_safetensors] Metadata must be a dictionary with string keys and values");
    }
  }

  auto tensors = nb::cast<std::unordered_map<std::string, mx::array>>(d);

  const bool to_path = is_str_or_path(file);
  if (!to_path && !is_ostream_object(file)) {
    throw std::invalid_argument(
        "[save_safetensors] Input must be a file-like object, or string");
  }

  if (to_path) {
    auto path = nb::cast<std::string>(nb::str(file));
    nb::gil_scoped_release nogil;
    mx::save_safetensors(path, tensors, metadata);
    return;
  }

  auto py_writer = std::make_shared<PyFileWriter>(file);
  nb::gil_scoped_release nogil;
  mx::save_safetensors(py_writer, tensors, metadata);
}

// Saves the arrays in `a` (name -> array), plus optional metadata `m`, in
// GGUF format. Only path destinations (str / path-like) are accepted; any
// other input raises std::invalid_argument.
void mlx_save_gguf_helper(
    nb::object file,
    nb::dict a,
    std::optional<nb::dict> m) {
  auto tensors = nb::cast<std::unordered_map<std::string, mx::array>>(a);

  if (!is_str_or_path(file)) {
    throw std::invalid_argument("[save_gguf] Input must be a string");
  }

  if (!m) {
    auto path = nb::cast<std::string>(nb::str(file));
    nb::gil_scoped_release nogil;
    mx::save_gguf(path, tensors);
    return;
  }

  auto metadata =
      nb::cast<std::unordered_map<std::string, mx::GGUFMetaData>>(m.value());
  auto path = nb::cast<std::string>(nb::str(file));
  nb::gil_scoped_release nogil;
  mx::save_gguf(path, tensors, metadata);
}


================================================
FILE: python/src/load.h
================================================
// Copyright © 2023-2024 Apple Inc.

#pragma once

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/variant.h>

#include <optional>
#include <string>
#include <unordered_map>
#include <variant>
#include "mlx/io.h"

namespace mx = mlx::core;
namespace nb = nanobind;

// Every shape of result the generic loader can produce:
//   - mx::array: a single array.
//   - unordered_map<string, array>: named arrays without metadata.
//   - mx::SafetensorsLoad / mx::GGUFLoad: named arrays together with the
//     format's metadata.
using LoadOutputTypes = std::variant<
    mx::array,
    std::unordered_map<std::string, mx::array>,
    mx::SafetensorsLoad,
    mx::GGUFLoad>;

// Load / save arrays in safetensors format. `file` is a Python object naming
// or wrapping the file; `d` maps names to arrays and `m` is optional metadata.
mx::SafetensorsLoad mlx_load_safetensor_helper(
    nb::object file,
    mx::StreamOrDevice s);
void mlx_save_safetensor_helper(
    nb::object file,
    nb::dict d,
    std::optional<nb::dict> m);

// Load arrays and metadata from a GGUF file.
mx::GGUFLoad mlx_load_gguf_helper(nb::object file, mx::StreamOrDevice s);

// Save arrays `d` and optional metadata `m` to a GGUF file.
void mlx_save_gguf_helper(
    nb::object file,
    nb::dict d,
    std::optional<nb::dict> m);

// Generic loader. `format` selects the file format when given;
// `return_metadata` asks for the metadata to be returned alongside the arrays.
LoadOutputTypes mlx_load_helper(
    nb::object file,
    std::optional<std::string> format,
    bool return_metadata,
    mx::StreamOrDevice s);
// Save a single array `a` to `file`.
void mlx_save_helper(nb::object file, mx::array a);
// Save several arrays (positional `args` and keyword `kwargs`) to one file,
// optionally `compressed`.
void mlx_savez_helper(
    nb::object file,
    nb::args args,
    const nb::kwargs& kwargs,
    bool compressed = false);


================================================
FILE: python/src/memory.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "mlx/memory.h"
#include <nanobind/nanobind.h>

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Registers the memory query / configuration functions on module `m`.
// Each binding forwards directly to the mlx::core function of the same name;
// the raw-string literals are the Python docstrings shown to users.
void init_memory(nb::module_& m) {
  m.def(
      "get_active_memory",
      &mx::get_active_memory,
      R"pbdoc(
      Get the actively used memory in bytes.

      Note, this will not always match memory use reported by the system because
      it does not include cached memory buffers.
      )pbdoc");
  m.def(
      "get_peak_memory",
      &mx::get_peak_memory,
      R"pbdoc(
      Get the peak amount of used memory in bytes.

      The maximum memory used recorded from the beginning of the program
      execution or since the last call to :func:`reset_peak_memory`.
      )pbdoc");
  m.def(
      "reset_peak_memory",
      &mx::reset_peak_memory,
      R"pbdoc(
      Reset the peak memory to zero.
      )pbdoc");
  m.def(
      "get_cache_memory",
      &mx::get_cache_memory,
      R"pbdoc(
      Get the cache size in bytes.

      The cache includes memory not currently used that has not been returned
      to the system allocator.
      )pbdoc");
  // The three setters below take a single `limit` argument; per their
  // docstrings each returns the previous limit.
  m.def(
      "set_memory_limit",
      &mx::set_memory_limit,
      "limit"_a,
      R"pbdoc(
      Set the memory limit.

      The memory limit is a guideline for the maximum amount of memory to use
      during graph evaluation. If the memory limit is exceeded and there is no
      more RAM (including swap when available) allocations will result in an
      exception.

      When metal is available the memory limit defaults to 1.5 times the
      maximum recommended working set size reported by the device.

      Args:
        limit (int): Memory limit in bytes.

      Returns:
        int: The previous memory limit in bytes.
      )pbdoc");
  m.def(
      "set_cache_limit",
      &mx::set_cache_limit,
      "limit"_a,
      R"pbdoc(
      Set the free cache limit.

      If using more than the given limit, free memory will be reclaimed
      from the cache on the next allocation. To disable the cache, set
      the limit to ``0``.

      The cache limit defaults to the memory limit. See
      :func:`set_memory_limit` for more details.

      Args:
        limit (int): The cache limit in bytes.

      Returns:
        int: The previous cache limit in bytes.
      )pbdoc");
  m.def(
      "set_wired_limit",
      &mx::set_wired_limit,
      "limit"_a,
      R"pbdoc(
      Set the wired size limit.

      .. note::
         * This function is only useful on macOS 15.0 or higher.
         * The wired limit should remain strictly less than the total
           memory size.

      The wired limit is the total size in bytes of memory that will be kept
      resident. The default value is ``0``.

      Setting a wired limit larger than system wired limit is an error. You can
      increase the system wired limit with:

      .. code-block::

        sudo sysctl iogpu.wired_limit_mb=<size_in_megabytes>

      Use :func:`device_info` to query the system wired limit
      (``"max_recommended_working_set_size"``) and the total memory size
      (``"memory_size"``).

      Args:
        limit (int): The wired limit in bytes.

      Returns:
        int: The previous wired limit in bytes.
      )pbdoc");
  m.def(
      "clear_cache",
      &mx::clear_cache,
      R"pbdoc(
      Clear the memory cache.

      After calling this, :func:`get_cache_memory` should return ``0``.
      )pbdoc");
}


================================================
FILE: python/src/metal.cpp
================================================
// Copyright © 2023-2024 Apple Inc.
#include <iostream>

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_map.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include "mlx/backend/metal/metal.h"
#include "mlx/device.h"
#include "mlx/memory.h"
#include "python/src/small_vector.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Prints a one-line deprecation notice for `old_fn` to stderr, pointing the
// user at `new_fn`. Always returns true so that the call can be used as the
// initializer of a variable.
bool DEPRECATE(const char* old_fn, const char* new_fn) {
  std::ostream& err = std::cerr;
  err << old_fn;
  err << " is deprecated and will be removed in a future "
      << "version. Use ";
  err << new_fn;
  err << " instead." << std::endl;
  return true;
}

// Function-like macro that shadows the DEPRECATE function above. It expands
// to a function-local `static bool` whose initializer calls that function, so
// the notice is printed only the first time a given use site is executed.
// Because it declares a variable named `dep`, use it at most once per scope.
#define DEPRECATE(oldfn, newfn) static bool dep = DEPRECATE(oldfn, newfn)

// Creates the `metal` submodule of `m` and registers its functions.
//
// Most entries are deprecated shims: each lambda emits a deprecation notice
// via DEPRECATE and then forwards to the top-level mlx::core function of the
// same name. `is_available`, `start_capture` and `stop_capture` bind the
// mx::metal functions directly.
void init_metal(nb::module_& m) {
  nb::module_ metal = m.def_submodule("metal", "mlx.metal");
  metal.def(
      "is_available",
      &mx::metal::is_available,
      R"pbdoc(
      Check if the Metal back-end is available.
      )pbdoc");
  // Deprecated memory shims.
  metal.def("get_active_memory", []() {
    DEPRECATE("mx.metal.get_active_memory", "mx.get_active_memory");
    return mx::get_active_memory();
  });
  metal.def("get_peak_memory", []() {
    DEPRECATE("mx.metal.get_peak_memory", "mx.get_peak_memory");
    return mx::get_peak_memory();
  });
  metal.def("reset_peak_memory", []() {
    DEPRECATE("mx.metal.reset_peak_memory", "mx.reset_peak_memory");
    mx::reset_peak_memory();
  });
  metal.def("get_cache_memory", []() {
    DEPRECATE("mx.metal.get_cache_memory", "mx.get_cache_memory");
    return mx::get_cache_memory();
  });
  metal.def(
      "set_memory_limit",
      [](size_t limit) {
        DEPRECATE("mx.metal.set_memory_limit", "mx.set_memory_limit");
        return mx::set_memory_limit(limit);
      },
      "limit"_a);
  metal.def(
      "set_cache_limit",
      [](size_t limit) {
        DEPRECATE("mx.metal.set_cache_limit", "mx.set_cache_limit");
        return mx::set_cache_limit(limit);
      },
      "limit"_a);
  metal.def(
      "set_wired_limit",
      [](size_t limit) {
        DEPRECATE("mx.metal.set_wired_limit", "mx.set_wired_limit");
        return mx::set_wired_limit(limit);
      },
      "limit"_a);
  metal.def("clear_cache", []() {
    DEPRECATE("mx.metal.clear_cache", "mx.clear_cache");
    mx::clear_cache();
  });
  // Metal capture controls (not deprecated).
  metal.def(
      "start_capture",
      &mx::metal::start_capture,
      "path"_a,
      R"pbdoc(
      Start a Metal capture.

      Args:
        path (str): The path to save the capture which should have
          the extension ``.gputrace``.
      )pbdoc");
  metal.def(
      "stop_capture",
      &mx::metal::stop_capture,
      R"pbdoc(
      Stop a Metal capture.
      )pbdoc");
  // Deprecated shim: reports info for GPU device 0.
  metal.def("device_info", []() {
    DEPRECATE("mx.metal.device_info", "mx.device_info");
    return mx::device_info(mx::Device(mx::Device::gpu, 0));
  });
}


================================================
FILE: python/src/mlx.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>

#include "mlx/version.h"

namespace mx = mlx::core;
namespace nb = nanobind;

void init_mlx_func(nb::module_&);
void init_array(nb::module_&);
void init_device(nb::module_&);
void init_stream(nb::module_&);
void init_metal(nb::module_&);
void init_cuda(nb::module_&);
void init_memory(nb::module_&);
void init_ops(nb::module_&);
void init_transforms(nb::module_&);
void init_random(nb::module_&);
void init_fft(nb::module_&);
void init_linalg(nb::module_&);
void init_constants(nb::module_&);
void init_fast(nb::module_&);
void init_distributed(nb::module_&);
void init_export(nb::module_&);

// Entry point of the `core` extension module: sets the module docstring,
// runs every init_* registration function on it and publishes `__version__`.
NB_MODULE(core, m) {
  m.doc() = "mlx: A framework for machine learning on Apple silicon.";

  // The returned module object is not used afterwards.
  // NOTE(review): presumably imported only for its import-time side effects —
  // confirm against mlx/_reprlib_fix.
  auto reprlib_fix = nb::module_::import_("mlx._reprlib_fix");
  // Turn off nanobind's leak warnings.
  nb::set_leak_warnings(false);

  // NOTE(review): the call order below may matter (init_mlx_func creates the
  // type object that mlx_func() allocates from) — confirm before reordering.
  init_mlx_func(m);
  init_device(m);
  init_stream(m);
  init_array(m);
  init_metal(m);
  init_cuda(m);
  init_memory(m);
  init_ops(m);
  init_transforms(m);
  init_random(m);
  init_fft(m);
  init_linalg(m);
  init_constants(m);
  init_fast(m);
  init_distributed(m);
  init_export(m);

  m.attr("__version__") = mx::version();
}


================================================
FILE: python/src/mlx_func.cpp
================================================
// Copyright © 2025 Apple Inc.

#include "python/src/mlx_func.h"

// A garbage collected function which wraps nb::cpp_function
// See https://github.com/wjakob/nanobind/discussions/919

// Instance layout of the `mlx.gc_func` Python type: a callable object that
// forwards calls to a wrapped nanobind function and reports its dependencies
// to Python's cyclic garbage collector.
struct gc_func {
  PyObject_HEAD
      // Vector call implementation that forwards calls to nanobind. Its
      // offset is published through `__vectorcalloffset__`.
      PyObject* (*vectorcall)(PyObject*, PyObject* const*, size_t, PyObject*);
  // The nanobind wrapper func. Owned: a reference is taken in mlx_func() and
  // dropped in tp_clear / tp_dealloc.
  PyObject* func;

  // The original wrapped func. Stored without taking a reference; attribute
  // lookups on the wrapper are forwarded to it.
  PyObject* orig_func;
  // A non-owning reference to dependencies owned by 'func'. They are visited
  // in tp_traverse.
  std::vector<PyObject*> deps;
};

// tp_traverse slot: reports the instance's type, the wrapped nanobind
// function and every recorded dependency to the garbage collector.
// The parameter names `visit` and `arg` are required by the Py_VISIT macro.
int gc_func_tp_traverse(PyObject* self, visitproc visit, void* arg) {
  Py_VISIT(Py_TYPE(self));
  auto* wrapper = reinterpret_cast<gc_func*>(self);
  Py_VISIT(wrapper->func);
  for (PyObject* dep : wrapper->deps) {
    Py_VISIT(dep);
  }
  return 0;
}

// tp_clear slot: drops the owned reference to the wrapped nanobind function
// and leaves the field null.
int gc_func_tp_clear(PyObject* self) {
  auto* wrapper = reinterpret_cast<gc_func*>(self);
  Py_CLEAR(wrapper->func);
  return 0;
}

// Getter for `__doc__`: returns the docstring of the wrapped nanobind function.
PyObject* gc_func_get_doc(PyObject* self, void*) {
  PyObject* wrapped = reinterpret_cast<gc_func*>(self)->func;
  return PyObject_GetAttrString(wrapped, "__doc__");
}

// Getter for `__nb_signature__`: returns the signature attribute of the
// wrapped nanobind function.
PyObject* gc_func_get_sig(PyObject* self, void*) {
  PyObject* wrapped = reinterpret_cast<gc_func*>(self)->func;
  return PyObject_GetAttrString(wrapped, "__nb_signature__");
}

// Vectorcall entry point of the wrapper: forwards the call unchanged to the
// wrapped nanobind function.
PyObject* gc_func_vectorcall(
    PyObject* self,
    PyObject* const* args,
    size_t nargs,
    PyObject* kwnames) {
  PyObject* target = reinterpret_cast<gc_func*>(self)->func;
  return PyObject_Vectorcall(target, args, nargs, kwnames);
}

// tp_dealloc slot for `mlx.gc_func` instances.
//
// Besides dropping the owned reference to the wrapped function, this
//   - runs the destructor of `deps`, so the vector's heap buffer is released
//     (the entries themselves are non-owning and are not decref'ed), and
//   - releases the instance's reference to its type, as required for
//     instances of heap types created with PyType_FromSpec.
void gc_func_dealloc(PyObject* self) {
  PyObject_GC_UnTrack(self);

  gc_func* w = (gc_func*)self;
  // Fetch the type before the object's memory is released.
  PyTypeObject* tp = Py_TYPE(self);

  Py_XDECREF(w->func);

  using Deps = std::vector<PyObject*>;
  w->deps.~Deps();

  PyObject_GC_Del(self);
  Py_DECREF((PyObject*)tp);
}

// Member table for the type. Its only entry publishes the offset of the
// `vectorcall` field as `__vectorcalloffset__`; the table ends with a
// null sentinel entry.
static PyMemberDef gc_func_members[] = {
    {"__vectorcalloffset__",
     T_PYSSIZET,
     (Py_ssize_t)offsetof(gc_func, vectorcall),
     READONLY,
     nullptr},
    {nullptr, 0, 0, 0, nullptr}};

// Read-only computed attributes (getter only, no setter) that are served by
// the wrapped nanobind function; the table ends with a null sentinel entry.
static PyGetSetDef gc_func_getset[] = {
    {"__doc__", gc_func_get_doc, nullptr, nullptr, nullptr},
    {"__nb_signature__", gc_func_get_sig, nullptr, nullptr, nullptr},
    {nullptr, nullptr, nullptr, nullptr, nullptr}};

// tp_getattro slot: attribute lookups on the wrapper are resolved against
// the original wrapped function instead of the wrapper itself.
static PyObject* gc_func_getattro(PyObject* self, PyObject* name_) {
  PyObject* original = reinterpret_cast<gc_func*>(self)->orig_func;
  return PyObject_GenericGetAttr(original, name_);
}

// Table of custom type slots we want to install: GC traverse / clear,
// the getset and member tables, attribute forwarding, calling via the
// vectorcall protocol (PyVectorcall_Call) and deallocation.
PyType_Slot gc_func_slots[] = {
    {Py_tp_traverse, (void*)gc_func_tp_traverse},
    {Py_tp_clear, (void*)gc_func_tp_clear},
    {Py_tp_getset, (void*)gc_func_getset},
    {Py_tp_getattro, (void*)gc_func_getattro},
    {Py_tp_members, (void*)gc_func_members},
    {Py_tp_call, (void*)PyVectorcall_Call},
    {Py_tp_dealloc, (void*)gc_func_dealloc},
    {0, 0}};

// Type specification for `mlx.gc_func`: a fixed-size, GC-tracked type that
// supports the vectorcall protocol.
static PyType_Spec gc_func_spec = {
    /* .name = */ "mlx.gc_func",
    /* .basicsize = */ (int)sizeof(gc_func),
    /* .itemsize = */ 0,
    /* .flags = */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
        Py_TPFLAGS_HAVE_VECTORCALL,
    /* .slots = */ gc_func_slots};

// The type object created from `gc_func_spec`; set by init_mlx_func() and
// null until then.
static PyTypeObject* gc_func_tp = nullptr;

// Wraps `func` in a garbage-collected `mlx.gc_func` callable.
//
// Args:
//   func: the nanobind function object that calls are forwarded to; the
//     wrapper takes its own reference to it.
//   orig_func: the original Python callable. It is stored without taking a
//     reference and is appended to the dependency list.
//   deps: Python objects to report to the garbage collector on behalf of
//     `func`; stored as non-owning pointers.
//
// Returns the new wrapper object. Throws nb::python_error if the wrapper
// cannot be allocated.
nb::callable mlx_func(
    nb::object func,
    const nb::callable& orig_func,
    std::vector<PyObject*> deps) {
  gc_func* r = (gc_func*)PyType_GenericAlloc(gc_func_tp, 0);
  if (!r) {
    // Allocation failed and a Python error is set: surface it rather than
    // dereferencing a null pointer below.
    throw nb::python_error();
  }
  r->func = func.inc_ref().ptr();
  r->orig_func = orig_func.ptr();
  deps.push_back(r->orig_func);
  r->deps = std::move(deps);
  r->vectorcall = gc_func_vectorcall;
  return nb::steal<nb::callable>((PyObject*)r);
}

// Creates the `mlx.gc_func` type object from its spec and stores it in
// `gc_func_tp`. Raises through nanobind when the type cannot be created.
// The module argument is not used.
void init_mlx_func(nb::module_& m) {
  PyObject* created = PyType_FromSpec(&gc_func_spec);
  gc_func_tp = reinterpret_cast<PyTypeObject*>(created);
  if (gc_func_tp == nullptr) {
    nb::raise("Could not register MLX function type.");
  }
}


================================================
FILE: python/src/mlx_func.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include <vector>

#include <nanobind/nanobind.h>
#include <nanobind/stl/function.h>

namespace nb = nanobind;
using namespace nb::literals;

// Core overload: wraps the Python function object `func` in a callable that
// forwards calls to it. `orig_func` is the original callable being wrapped
// and `deps` lists the raw pointers of the Python objects `func` depends on.
nb::callable mlx_func(
    nb::object func,
    const nb::callable& orig_func,
    std::vector<PyObject*> deps);

// Convenience overload for a C++ callable: `func` is first converted to a
// Python function with nb::cpp_function, and the raw pointer of every
// argument in `deps` (each must provide `.ptr()`) is collected into the
// dependency vector.
template <typename F, typename... Deps>
nb::callable mlx_func(F func, const nb::callable& orig_func, Deps&&... deps) {
  return mlx_func(
      nb::cpp_function(std::move(func)),
      orig_func,
      std::vector<PyObject*>{deps.ptr()...});
}

// Convenience overload for an existing Python function object: only the
// dependency pointers are gathered before forwarding to the core overload.
template <typename... Deps>
nb::callable
mlx_func(nb::object func, const nb::callable& orig_func, Deps&&... deps) {
  return mlx_func(
      std::move(func), orig_func, std::vector<PyObject*>{deps.ptr()...});
}


================================================
FILE: python/src/ops.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <numeric>
#include <ostream>
#include <variant>

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/tuple.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include "mlx/einsum.h"
#include "mlx/ops.h"
#include "mlx/utils.h"
#include "python/src/load.h"
#include "python/src/small_vector.h"
#include "python/src/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

using Scalar = std::variant<bool, int, double>;

// Maps the alternative held by a Python scalar to the dtype used for it:
// int -> int32, double -> float32, otherwise (bool) -> bool_.
mx::Dtype scalar_to_dtype(Scalar s) {
  if (std::holds_alternative<double>(s)) {
    return mx::float32;
  }
  if (std::holds_alternative<int>(s)) {
    return mx::int32;
  }
  return mx::bool_;
}

// Converts whichever alternative the scalar holds (bool, int or double) to a
// double.
double scalar_to_double(Scalar s) {
  return std::visit(
      [](auto held) { return static_cast<double>(held); }, s);
}

void init_ops(nb::module_& m) {
  m.def(
      "reshape",
      &mx::reshape,
      nb::arg(),
      "shape"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def reshape(a: array, /, shape: Sequence[int], *, stream: "
          "Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Reshape an array while preserving the size.

        Args:
            a (array): Input array.
            shape (tuple(int)): New shape.
            stream (Stream, optional): Stream or device. Defaults to ``None``
              in which case the default stream of the default device is used.

        Returns:
            array: The reshaped array.
      )pbdoc");
  // Python binding for mx.flatten. The caller-supplied stream/device has to
  // be forwarded to the op; without it the `stream` keyword would be accepted
  // but silently ignored and the op would fall back to its default stream.
  m.def(
      "flatten",
      [](const mx::array& a,
         int start_axis,
         int end_axis,
         const mx::StreamOrDevice& s) {
        return mx::flatten(a, start_axis, end_axis, s);
      },
      nb::arg(),
      "start_axis"_a = 0,
      "end_axis"_a = -1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def flatten(a: array, /, start_axis: int = 0, end_axis: int = "
          "-1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Flatten an array.

      The axes flattened will be between ``start_axis`` and ``end_axis``,
      inclusive. Negative axes are supported. After converting negative axis to
      positive, axes outside the valid range will be clamped to a valid value,
      ``start_axis`` to ``0`` and ``end_axis`` to ``ndim - 1``.

      Args:
          a (array): Input array.
          start_axis (int, optional): The first dimension to flatten. Defaults to ``0``.
          end_axis (int, optional): The last dimension to flatten. Defaults to ``-1``.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

      Returns:
          array: The flattened array.

      Example:
          >>> a = mx.array([[1, 2], [3, 4]])
          >>> mx.flatten(a)
          array([1, 2, 3, 4], dtype=int32)
          >>>
          >>> mx.flatten(a, start_axis=0, end_axis=-1)
          array([1, 2, 3, 4], dtype=int32)
  )pbdoc");
  m.def(
      "unflatten",
      &mx::unflatten,
      nb::arg(),
      "axis"_a,
      "shape"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def unflatten(a: array, /, axis: int, shape: Sequence[int], *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Unflatten an axis of an array to a shape.

      Args:
          a (array): Input array.
          axis (int): The axis to unflatten.
          shape (tuple(int)): The shape to unflatten to. At most one
            entry can be ``-1`` in which case the corresponding size will be
            inferred.
          stream (Stream, optional): Stream or device. Defaults to ``None``
            in which case the default stream of the default device is used.

      Returns:
          array: The unflattened array.

      Example:
          >>> a = mx.array([1, 2, 3, 4])
          >>> mx.unflatten(a, 0, (2, -1))
          array([[1, 2], [3, 4]], dtype=int32)
  )pbdoc");
  m.def(
      "squeeze",
      [](const mx::array& a, const IntOrVec& v, const mx::StreamOrDevice& s) {
        if (std::holds_alternative<std::monostate>(v)) {
          return mx::squeeze(a, s);
        } else if (auto pv = std::get_if<int>(&v); pv) {
          return mx::squeeze(a, *pv, s);
        } else {
          return mx::squeeze(a, std::get<std::vector<int>>(v), s);
        }
      },
      nb::arg(),
      "axis"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def squeeze(a: array, /, axis: Union[None, int, Sequence[int]] = "
          "None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Remove length one axes from an array.

        Args:
            a (array): Input array.
            axis (int or tuple(int), optional): Axes to remove. Defaults
              to ``None`` in which case all size one axes are removed.

        Returns:
            array: The output array with size one axes removed.
      )pbdoc");
  m.def(
      "expand_dims",
      [](const mx::array& a,
         const std::variant<int, std::vector<int>>& v,
         mx::StreamOrDevice s) {
        if (auto pv = std::get_if<int>(&v); pv) {
          return mx::expand_dims(a, *pv, s);
        } else {
          return mx::expand_dims(a, std::get<std::vector<int>>(v), s);
        }
      },
      nb::arg(),
      "axis"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def expand_dims(a: array, /, axis: Union[int, Sequence[int]], "
          "*, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Add a size one dimension at the given axis.

        Args:
            a (array): Input array.
            axis (int or tuple(int)): The index of the inserted dimensions.

        Returns:
            array: The array with inserted dimensions.
      )pbdoc");
  m.def(
      "abs",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::abs(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def abs(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise absolute value.

        Args:
            a (array): Input array.

        Returns:
            array: The absolute value of ``a``.
      )pbdoc");
  m.def(
      "sign",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::sign(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sign(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise sign.

        Args:
            a (array): Input array.

        Returns:
            array: The sign of ``a``.
      )pbdoc");
  m.def(
      "negative",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::negative(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def negative(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise negation.

        Args:
            a (array): Input array.

        Returns:
            array: The negative of ``a``.
      )pbdoc");
  m.def(
      "add",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::add(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def add(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise addition.

        Add two arrays with numpy-style broadcasting semantics. Either or both input arrays
        can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The sum of ``a`` and ``b``.
      )pbdoc");
  m.def(
      "subtract",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::subtract(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def subtract(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise subtraction.

        Subtract one array from another with numpy-style broadcasting semantics. Either or both
        input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The difference ``a - b``.
      )pbdoc");
  m.def(
      "multiply",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::multiply(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def multiply(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise multiplication.

        Multiply two arrays with numpy-style broadcasting semantics. Either or both
        input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The multiplication ``a * b``.
      )pbdoc");
  m.def(
      "divide",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::divide(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def divide(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise division.

        Divide two arrays with numpy-style broadcasting semantics. Either or both
        input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The quotient ``a / b``.
      )pbdoc");
  m.def(
      "divmod",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::divmod(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def divmod(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise quotient and remainder.

        The function ``divmod(a, b)`` is equivalent to but faster than
        ``(a // b, a % b)``. The function uses numpy-style broadcasting
        semantics. Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            tuple(array, array): The quotient ``a // b`` and remainder ``a % b``.
      )pbdoc");
  m.def(
      "floor_divide",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::floor_divide(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def floor_divide(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise integer division.

        If either array is a floating point type then it is equivalent to
        calling :func:`floor` after :func:`divide`.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The quotient ``a // b``.
      )pbdoc");
  m.def(
      "remainder",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::remainder(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def remainder(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise remainder of division.

        Computes the remainder of dividing a with b with numpy-style
        broadcasting semantics. Either or both input arrays can also be
        scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The remainder of ``a // b``.
      )pbdoc");
  m.def(
      "equal",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::equal(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def equal(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise equality.

        Equality comparison on two arrays with numpy-style broadcasting semantics.
        Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The element-wise comparison ``a == b``.
      )pbdoc");
  m.def(
      "not_equal",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::not_equal(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def not_equal(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise not equal.

        Not equal comparison on two arrays with numpy-style broadcasting semantics.
        Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The element-wise comparison ``a != b``.
      )pbdoc");
  m.def(
      "less",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::less(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def less(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise less than.

        Strict less than on two arrays with numpy-style broadcasting semantics.
        Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The element-wise comparison ``a < b``.
      )pbdoc");
  m.def(
      "less_equal",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::less_equal(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def less_equal(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise less than or equal.

        Less than or equal on two arrays with numpy-style broadcasting semantics.
        Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The element-wise comparison ``a <= b``.
      )pbdoc");
  m.def(
      "greater",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::greater(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def greater(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise greater than.

        Strict greater than on two arrays with numpy-style broadcasting semantics.
        Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The element-wise comparison ``a > b``.
      )pbdoc");
  m.def(
      "greater_equal",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::greater_equal(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def greater_equal(a: Union[scalar, array], b: Union[scalar, array], stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise greater or equal.

        Greater than or equal on two arrays with numpy-style broadcasting semantics.
        Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The element-wise comparison ``a >= b``.
      )pbdoc");
  m.def(
      "array_equal",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         bool equal_nan,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::array_equal(a, b, equal_nan, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "equal_nan"_a = false,
      "stream"_a = nb::none(),
      nb::sig(
          "def array_equal(a: Union[scalar, array], b: Union[scalar, array], equal_nan: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Array equality check.

        Compare two arrays for equality. Returns ``True`` if and only if the arrays
        have the same shape and their values are equal. The arrays need not have
        the same type to be considered equal.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.
            equal_nan (bool): If ``True``, NaNs are considered equal.
              Defaults to ``False``.

        Returns:
            array: A scalar boolean array.
      )pbdoc");
  m.def(
      "matmul",
      &mx::matmul,
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def matmul(a: array, b: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Matrix multiplication.

        Perform the (possibly batched) matrix multiplication of two arrays. This function supports
        broadcasting for arrays with more than two dimensions.

        - If the first array is 1-D then a 1 is prepended to its shape to make it
          a matrix. Similarly if the second array is 1-D then a 1 is appended to its
          shape to make it a matrix. In either case the singleton dimension is removed
          from the result.
        - A batched matrix multiplication is performed if the arrays have more than
          2 dimensions.  The matrix dimensions for the matrix product are the last
          two dimensions of each input.
        - All but the last two dimensions of each input are broadcast with one another using
          standard numpy-style broadcasting semantics.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The matrix product of ``a`` and ``b``.
      )pbdoc");
  m.def(
      "square",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::square(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def square(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise square.

        Args:
            a (array): Input array.

        Returns:
            array: The square of ``a``.
      )pbdoc");
  m.def(
      "sqrt",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::sqrt(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sqrt(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise square root.

        Args:
            a (array): Input array.

        Returns:
            array: The square root of ``a``.
      )pbdoc");
  m.def(
      "rsqrt",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::rsqrt(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def rsqrt(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise reciprocal and square root.

        Args:
            a (array): Input array.

        Returns:
            array: One over the square root of ``a``.
      )pbdoc");
  m.def(
      "reciprocal",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::reciprocal(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def reciprocal(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise reciprocal.

        Args:
            a (array): Input array.

        Returns:
            array: The reciprocal of ``a``.
      )pbdoc");
  m.def(
      "logical_not",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::logical_not(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def logical_not(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise logical not.

        Args:
            a (array): Input array or scalar.

        Returns:
            array: The boolean array containing the logical not of ``a``.
      )pbdoc");
  m.def(
      "logical_and",
      [](const ScalarOrArray& a, const ScalarOrArray& b, mx::StreamOrDevice s) {
        return mx::logical_and(to_array(a), to_array(b), s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def logical_and(a: array, b: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise logical and.

        Args:
            a (array): First input array or scalar.
            b (array): Second input array or scalar.

        Returns:
            array: The boolean array containing the logical and of ``a`` and ``b``.
    )pbdoc");

  m.def(
      "logical_or",
      [](const ScalarOrArray& a, const ScalarOrArray& b, mx::StreamOrDevice s) {
        return mx::logical_or(to_array(a), to_array(b), s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def logical_or(a: array, b: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise logical or.

        Args:
            a (array): First input array or scalar.
            b (array): Second input array or scalar.

        Returns:
            array: The boolean array containing the logical or of ``a`` and ``b``.
    )pbdoc");
  m.def(
      "logaddexp",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::logaddexp(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def logaddexp(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise log-add-exp.

        This is a numerically stable log-add-exp of two arrays with numpy-style
        broadcasting semantics. Either or both input arrays can also be scalars.

        The computation is a numerically stable version of ``log(exp(a) + exp(b))``.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The log-add-exp of ``a`` and ``b``.
      )pbdoc");
  m.def(
      "exp",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::exp(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def exp(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise exponential.

        Args:
            a (array): Input array.

        Returns:
            array: The exponential of ``a``.
      )pbdoc");
  m.def(
      "expm1",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::expm1(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def expm1(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise exponential minus 1.

        Computes ``exp(x) - 1`` with greater precision for small ``x``.

        Args:
            a (array): Input array.

        Returns:
            array: The expm1 of ``a``.
      )pbdoc");
  m.def(
      "erf",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::erf(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def erf(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise error function.

        .. math::
          \mathrm{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} \, dt

        Args:
            a (array): Input array.

        Returns:
            array: The error function of ``a``.
      )pbdoc");
  m.def(
      "erfinv",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::erfinv(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def erfinv(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse of :func:`erf`.

        Args:
            a (array): Input array.

        Returns:
            array: The inverse error function of ``a``.
      )pbdoc");
  m.def(
      "sin",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::sin(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sin(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise sine.

        Args:
            a (array): Input array.

        Returns:
            array: The sine of ``a``.
      )pbdoc");
  m.def(
      "cos",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::cos(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def cos(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise cosine.

        Args:
            a (array): Input array.

        Returns:
            array: The cosine of ``a``.
      )pbdoc");
  m.def(
      "tan",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::tan(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def tan(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise tangent.

        Args:
            a (array): Input array.

        Returns:
            array: The tangent of ``a``.
      )pbdoc");
  m.def(
      "arcsin",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::arcsin(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arcsin(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse sine.

        Args:
            a (array): Input array.

        Returns:
            array: The inverse sine of ``a``.
      )pbdoc");
  m.def(
      "arccos",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::arccos(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arccos(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse cosine.

        Args:
            a (array): Input array.

        Returns:
            array: The inverse cosine of ``a``.
      )pbdoc");
  m.def(
      "arctan",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::arctan(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arctan(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse tangent.

        Args:
            a (array): Input array.

        Returns:
            array: The inverse tangent of ``a``.
      )pbdoc");
  m.def(
      "arctan2",
      &mx::arctan2,
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arctan2(a: array, b: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse tangent of the ratio of two arrays.

        Args:
            a (array): Input array.
            b (array): Input array.

        Returns:
            array: The inverse tangent of the ratio of ``a`` and ``b``.
      )pbdoc");
  m.def(
      "sinh",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::sinh(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sinh(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise hyperbolic sine.

        Args:
            a (array): Input array.

        Returns:
            array: The hyperbolic sine of ``a``.
      )pbdoc");
  m.def(
      "cosh",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::cosh(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def cosh(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise hyperbolic cosine.

        Args:
            a (array): Input array.

        Returns:
            array: The hyperbolic cosine of ``a``.
      )pbdoc");
  m.def(
      "tanh",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::tanh(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def tanh(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise hyperbolic tangent.

        Args:
            a (array): Input array.

        Returns:
            array: The hyperbolic tangent of ``a``.
      )pbdoc");
  m.def(
      "arcsinh",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::arcsinh(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arcsinh(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse hyperbolic sine.

        Args:
            a (array): Input array.

        Returns:
            array: The inverse hyperbolic sine of ``a``.
      )pbdoc");
  m.def(
      "arccosh",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::arccosh(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arccosh(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse hyperbolic cosine.

        Args:
            a (array): Input array.

        Returns:
            array: The inverse hyperbolic cosine of ``a``.
      )pbdoc");
  m.def(
      "arctanh",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::arctanh(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arctanh(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise inverse hyperbolic tangent.

        Args:
            a (array): Input array.

        Returns:
            array: The inverse hyperbolic tangent of ``a``.
      )pbdoc");
  m.def(
      "degrees",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::degrees(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def degrees(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Convert angles from radians to degrees.

      Args:
          a (array): Input array.

      Returns:
          array: The angles in degrees.
    )pbdoc");
  m.def(
      "radians",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::radians(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def radians(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Convert angles from degrees to radians.

      Args:
          a (array): Input array.

      Returns:
          array: The angles in radians.
    )pbdoc");
  m.def(
      "log",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::log(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def log(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise natural logarithm.

        Args:
            a (array): Input array.

        Returns:
            array: The natural logarithm of ``a``.
      )pbdoc");
  m.def(
      "log2",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::log2(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def log2(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise base-2 logarithm.

        Args:
            a (array): Input array.

        Returns:
            array: The base-2 logarithm of ``a``.
      )pbdoc");
  m.def(
      "log10",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::log10(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def log10(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise base-10 logarithm.

        Args:
            a (array): Input array.

        Returns:
            array: The base-10 logarithm of ``a``.
      )pbdoc");
  m.def(
      "log1p",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::log1p(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def log1p(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise natural log of one plus the array.

        Args:
            a (array): Input array.

        Returns:
            array: The natural logarithm of one plus ``a``.
      )pbdoc");
  m.def(
      "stop_gradient",
      // Bound directly to the core op; no scalar promotion is performed here.
      &mx::stop_gradient,
      nb::arg(),
      nb::kw_only(),
      nb::arg("stream") = nb::none(),
      nb::sig(
          "def stop_gradient(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Stop gradients from being computed.

        The operation is the identity but it prevents gradients from flowing
        through the array.

        Args:
            a (array): Input array.

        Returns:
            array:
              The unchanged input ``a`` but without gradient flowing
              through it.
      )pbdoc");
  m.def(
      "sigmoid",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::sigmoid(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sigmoid(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise logistic sigmoid.

        The logistic sigmoid function is:

        .. math::
          \mathrm{sigmoid}(x) = \frac{1}{1 + e^{-x}}

        Args:
            a (array): Input array.

        Returns:
            array: The logistic sigmoid of ``a``.
      )pbdoc");
  m.def(
      "power",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [a, b] = to_arrays(a_, b_);
        return mx::power(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def power(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise power operation.

        Raise the elements of a to the powers in elements of b with numpy-style
        broadcasting semantics. Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: Bases of ``a`` raised to powers in ``b``.
      )pbdoc");
  m.def(
      "arange",
      // Overload taking an explicit start. `step` is optional (defaults to 1).
      [](Scalar start,
         Scalar stop,
         const std::optional<Scalar>& step,
         const std::optional<mx::Dtype>& dtype_,
         mx::StreamOrDevice s) {
        // Use the requested dtype if given; otherwise promote across the
        // dtypes of start, stop and (when provided) step.
        mx::Dtype dtype = dtype_
            ? *dtype_
            : mx::promote_types(
                  scalar_to_dtype(start),
                  step ? mx::promote_types(
                             scalar_to_dtype(stop), scalar_to_dtype(*step))
                       : scalar_to_dtype(stop));
        return mx::arange(
            scalar_to_double(start),
            scalar_to_double(stop),
            step ? scalar_to_double(*step) : 1.0,
            dtype,
            s);
      },
      "start"_a.noconvert(),
      "stop"_a.noconvert(),
      "step"_a.noconvert() = nb::none(),
      "dtype"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      // The signature must advertise `step = None` to match the binding's
      // actual default above.
      nb::sig(
          "def arange(start : Union[int, float], stop : Union[int, float], step : Union[None, int, float] = None, dtype: Optional[Dtype] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Generates ranges of numbers.

      Generate numbers in the half-open interval ``[start, stop)`` in
      increments of ``step``.

      Args:
          start (float or int, optional): Starting value which defaults to ``0``.
          stop (float or int): Stopping value.
          step (float or int, optional): Increment which defaults to ``1``.
          dtype (Dtype, optional): Specifies the data type of the output. If unspecified will default to ``float32`` if any of ``start``, ``stop``, or ``step`` are ``float``. Otherwise will default to ``int32``.

      Returns:
          array: The range of values.

      Note:
        Following the Numpy convention the actual increment used to
        generate numbers is ``dtype(start + step) - dtype(start)``.
        This can lead to unexpected results for example if `start + step`
        is a fractional value and the `dtype` is integral.
      )pbdoc");
  m.def(
      "arange",
      [](Scalar stop,
         const std::optional<Scalar>& step,
         const std::optional<mx::Dtype>& dtype_,
         mx::StreamOrDevice s) {
        mx::Dtype dtype = dtype_ ? *dtype_
            : step
            ? mx::promote_types(scalar_to_dtype(stop), scalar_to_dtype(*step))
            : scalar_to_dtype(stop);
        return mx::arange(
            0.0,
            scalar_to_double(stop),
            step ? scalar_to_double(*step) : 1.0,
            dtype,
            s);
      },
      "stop"_a.noconvert(),
      "step"_a.noconvert() = nb::none(),
      "dtype"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def arange(stop : Union[int, float], step : Union[None, int, float] = None, dtype: Optional[Dtype] = None, *, stream: Union[None, Stream, Device] = None) -> array"));
  m.def(
      "bartlett",
      &mlx::core::bartlett,
      "M"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      // Explicit signature, matching the style of the other window bindings.
      nb::sig(
          "def bartlett(M: int, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the Bartlett window.

        The Bartlett window is a triangular taper.

        .. math::
          w(n) = 1 - \frac{2|n - (M-1)/2|}{M-1}
           \qquad 0 \le n \le M-1

        Args:
            M (int): Number of points in the output window.

        Returns:
            array: The window, with the maximum value normalized to one (the value one
                   appears only if the number of samples is odd).
    )pbdoc");
  m.def(
      "hanning",
      &mlx::core::hanning,
      "M"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      // Explicit signature, matching the style of the other window bindings.
      nb::sig(
          "def hanning(M: int, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the Hanning window.

        The Hanning window is a taper formed by using a weighted cosine.

        .. math::
          w(n) = 0.5 - 0.5 \cos\left(\frac{2\pi n}{M-1}\right)
           \qquad 0 \le n \le M-1

        Args:
            M (int): Number of points in the output window.

        Returns:
            array: The window, with the maximum value normalized to one (the value one
                   appears only if the number of samples is odd).
    )pbdoc");
  m.def(
      "hamming",
      // Bound directly to the core implementation.
      &mlx::core::hamming,
      nb::arg("M"),
      nb::kw_only(),
      nb::arg("stream") = nb::none(),
      nb::sig(
          "def hamming(M: int, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the Hamming window.

        The Hamming window is a taper formed by using a weighted cosine.

        .. math::
           w(n) = 0.54 - 0.46 \cos\left(\frac{2\pi n}{M-1}\right)
           \qquad 0 \le n \le M-1

        Args:
            M (int): Number of points in the output window.

        Returns:
            array: The window, with the maximum value normalized to one (the value one
                   appears only if the number of samples is odd).
    )pbdoc");
  m.def(
      "blackman",
      &mlx::core::blackman,
      "M"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      // Explicit Python-facing signature for this binding.
      nb::sig(
          "def blackman(M: int, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the Blackman window.
        
        The Blackman window is a taper formed by using the first three terms of a summation of cosines.

        .. math::
          w(n) = 0.42 - 0.5 \cos\left(\frac{2\pi n}{M-1}\right) + 0.08 \cos\left(\frac{4\pi n}{M-1}\right)
           \qquad 0 \le n \le M-1
        
        Args:
            M (int): Number of points in the output window.
            
        Returns:
            array: The window, with the maximum value normalized to one (the value one
                   appears only if the number of samples is odd).
    )pbdoc");
  m.def(
      "linspace",
      [](Scalar start,
         Scalar stop,
         int num,
         std::optional<mx::Dtype> dtype,
         mx::StreamOrDevice s) {
        // Endpoints are converted to double; a None dtype falls back to
        // float32.
        return mx::linspace(
            scalar_to_double(start),
            scalar_to_double(stop),
            num,
            dtype.value_or(mx::float32),
            s);
      },
      "start"_a,
      "stop"_a,
      "num"_a = 50,
      "dtype"_a.none() = mx::float32,
      "stream"_a = nb::none(),
      // `num` is a plain int in the binding (None is not accepted), so the
      // signature advertises `int` rather than `Optional[int]`.
      nb::sig(
          "def linspace(start: scalar, stop: scalar, num: int = 50, dtype: Optional[Dtype] = float32, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate ``num`` evenly spaced numbers over interval ``[start, stop]``.

        Args:
            start (scalar): Starting value.
            stop (scalar): Stopping value.
            num (int, optional): Number of samples, defaults to ``50``.
            dtype (Dtype, optional): Specifies the data type of the output,
              default to ``float32``.

        Returns:
            array: The range of values.
      )pbdoc");
  m.def(
      "kron",
      // Bound directly to the core op; both operands may be passed by name.
      &mx::kron,
      "a"_a,
      "b"_a,
      nb::kw_only(),
      nb::arg("stream") = nb::none(),
      nb::sig(
          "def kron(a: array, b: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the Kronecker product of two arrays ``a`` and ``b``.

        Args:
          a (array): The first input array.
          b (array): The second input array.
          stream (Union[None, Stream, Device], optional): Optional stream or
            device for execution. Default: ``None``.

        Returns:
          array: The Kronecker product of ``a`` and ``b``.

        Examples:
          >>> a = mx.array([[1, 2], [3, 4]])
          >>> b = mx.array([[0, 5], [6, 7]])
          >>> result = mx.kron(a, b)
          >>> print(result)
          array([[0, 5, 0, 10],
                 [6, 7, 12, 14],
                 [0, 15, 0, 20],
                 [18, 21, 24, 28]], dtype=int32)
      )pbdoc");
  m.def(
      "take",
      [](const mx::array& a,
         const std::variant<nb::int_, mx::array>& indices,
         const std::optional<int>& axis,
         mx::StreamOrDevice s) {
        if (auto pv = std::get_if<nb::int_>(&indices); pv) {
          auto idx = nb::cast<int>(*pv);
          return axis ? mx::take(a, idx, axis.value(), s) : mx::take(a, idx, s);
        } else {
          auto indices_ = std::get<mx::array>(indices);
          return axis ? mx::take(a, indices_, axis.value(), s)
                      : mx::take(a, indices_, s);
        }
      },
      nb::arg(),
      "indices"_a,
      "axis"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def take(a: array, /, indices: Union[int, array], axis: Optional[int] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Take elements along an axis.

        The elements are taken from ``indices`` along the specified axis.
        If the axis is not specified the array is treated as a flattened
        1-D array prior to performing the take.

        As an example, if the ``axis=1`` this is equivalent to ``a[:, indices, ...]``.

        Args:
            a (array): Input array.
            indices (int or array): Integer index or input array with integral type.
            axis (int, optional): Axis along which to perform the take. If unspecified
              the array is treated as a flattened 1-D vector.

        Returns:
            array: The indexed values of ``a``.
      )pbdoc");
  m.def(
      "take_along_axis",
      [](const mx::array& a,
         const mx::array& indices,
         const std::optional<int>& axis,
         mx::StreamOrDevice s) {
        if (axis.has_value()) {
          return mx::take_along_axis(a, indices, axis.value(), s);
        } else {
          // axis=None: index into the flattened input along its only axis.
          return mx::take_along_axis(mx::reshape(a, {-1}, s), indices, 0, s);
        }
      },
      nb::arg(),
      "indices"_a,
      // `axis` accepts None but has no default, so it must always be passed.
      "axis"_a.none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      // The signature reflects that `axis` is required (no `= None` default).
      nb::sig(
          "def take_along_axis(a: array, /, indices: array, axis: Optional[int], *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Take values along an axis at the specified indices.

        Args:
            a (array): Input array.
            indices (array): Indices array. These should be broadcastable with
              the input array excluding the `axis` dimension.
            axis (int or None): Axis in the input to take the values from. If
              ``axis == None`` the array is flattened to 1D prior to the indexing
              operation.

        Returns:
            array: The output array.
      )pbdoc");
  m.def(
      "put_along_axis",
      [](const mx::array& a,
         const mx::array& indices,
         const mx::array& values,
         const std::optional<int>& axis,
         mx::StreamOrDevice s) {
        if (axis.has_value()) {
          return mx::put_along_axis(a, indices, values, axis.value(), s);
        } else {
          // axis=None: operate on the flattened destination, then restore
          // the original shape.
          return mx::reshape(
              mx::put_along_axis(
                  mx::reshape(a, {-1}, s), indices, values, 0, s),
              a.shape(),
              s);
        }
      },
      nb::arg(),
      "indices"_a,
      "values"_a,
      // `axis` accepts None but has no default, so it must always be passed.
      "axis"_a.none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      // The signature reflects that `axis` is required (no `= None` default).
      nb::sig(
          "def put_along_axis(a: array, /, indices: array, values: array, axis: Optional[int], *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Put values along an axis at the specified indices.

        Args:
            a (array): Destination array.
            indices (array): Indices array. These should be broadcastable with
              the input array excluding the `axis` dimension.
            values (array): Values array. These should be broadcastable with
              the indices.
            axis (int or None): Axis in the destination to put the values to. If
              ``axis == None`` the destination is flattened prior to the put
              operation.

        Returns:
            array: The output array.
      )pbdoc");
  m.def(
      "full",
      [](const std::variant<int, mx::Shape>& shape,
         const ScalarOrArray& vals,
         std::optional<mx::Dtype> dtype,
         mx::StreamOrDevice s) {
        if (auto pv = std::get_if<int>(&shape); pv) {
          return mx::full({*pv}, to_array(vals, dtype), s);
        } else {
          return mx::full(std::get<mx::Shape>(shape), to_array(vals, dtype), s);
        }
      },
      "shape"_a,
      "vals"_a,
      "dtype"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def full(shape: Union[int, Sequence[int]], vals: Union[scalar, array], dtype: Optional[Dtype] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Construct an array with the given value.

        Constructs an array of size ``shape`` filled with ``vals``. If ``vals``
        is an :obj:`array` it must be broadcastable to the given ``shape``.

        Args:
            shape (int or list(int)): The shape of the output array.
            vals (float or int or array): Values to fill the array with.
            dtype (Dtype, optional): Data type of the output array. If
              unspecified the output type is inferred from ``vals``.

        Returns:
            array: The output array with the specified shape and values.
      )pbdoc");
  m.def(
      "zeros",
      [](const std::variant<int, mx::Shape>& shape,
         std::optional<mx::Dtype> dtype,
         mx::StreamOrDevice s) {
        auto t = dtype.value_or(mx::float32);
        if (auto pv = std::get_if<int>(&shape); pv) {
          return mx::zeros({*pv}, t, s);
        } else {
          return mx::zeros(std::get<mx::Shape>(shape), t, s);
        }
      },
      "shape"_a,
      "dtype"_a.none() = mx::float32,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def zeros(shape: Union[int, Sequence[int]], dtype: Optional[Dtype] = float32, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Construct an array of zeros.

        Args:
            shape (int or list(int)): The shape of the output array.
            dtype (Dtype, optional): Data type of the output array. If
              unspecified the output type defaults to ``float32``.

        Returns:
            array: The array of zeros with the specified shape.
      )pbdoc");
  m.def(
      "asarray",
      [](const ArrayInitType& a, std::optional<mx::Dtype> dtype) {
        return create_array(a, dtype);
      },
      nb::arg(),
      "dtype"_a = nb::none(),
      nb::sig(
          "def asarray(a: Union[scalar, array, Sequence], dtype: "
          "Optional[Dtype] = None) -> array"),
      R"pbdoc(
        Convert the input to an array.

        Args:
            a: Input data.
            dtype (Dtype, optional): The desired data-type for the array.

        Returns:
            array: An array interpretation of the input.
      )pbdoc");
  m.def(
      "zeros_like",
      // Bound directly to the core op.
      &mx::zeros_like,
      nb::arg(),
      nb::kw_only(),
      nb::arg("stream") = nb::none(),
      nb::sig(
          "def zeros_like(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        An array of zeros like the input.

        Args:
            a (array): The input to take the shape and type from.

        Returns:
            array: The output array filled with zeros.
      )pbdoc");
  m.def(
      "ones",
      [](const std::variant<int, mx::Shape>& shape,
         std::optional<mx::Dtype> dtype,
         mx::StreamOrDevice s) {
        auto t = dtype.value_or(mx::float32);
        if (auto pv = std::get_if<int>(&shape); pv) {
          return mx::ones({*pv}, t, s);
        } else {
          return mx::ones(std::get<mx::Shape>(shape), t, s);
        }
      },
      "shape"_a,
      "dtype"_a.none() = mx::float32,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def ones(shape: Union[int, Sequence[int]], dtype: Optional[Dtype] = float32, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Construct an array of ones.

        Args:
            shape (int or list(int)): The shape of the output array.
            dtype (Dtype, optional): Data type of the output array. If
              unspecified the output type defaults to ``float32``.

        Returns:
            array: The array of ones with the specified shape.
      )pbdoc");
  m.def(
      "ones_like",
      // Bound directly to the core op.
      &mx::ones_like,
      nb::arg(),
      nb::kw_only(),
      nb::arg("stream") = nb::none(),
      nb::sig(
          "def ones_like(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        An array of ones like the input.

        Args:
            a (array): The input to take the shape and type from.

        Returns:
            array: The output array filled with ones.
      )pbdoc");
  m.def(
      "eye",
      [](int n,
         std::optional<int> m,
         int k,
         std::optional<mx::Dtype> dtype,
         mx::StreamOrDevice s) {
        return mx::eye(n, m.value_or(n), k, dtype.value_or(mx::float32), s);
      },
      "n"_a,
      "m"_a = nb::none(),
      "k"_a = 0,
      "dtype"_a.none() = mx::float32,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def eye(n: int, m: Optional[int] = None, k: int = 0, dtype: Optional[Dtype] = float32, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Create an identity matrix or a general diagonal matrix.

        Args:
            n (int): The number of rows in the output.
            m (int, optional): The number of columns in the output. Defaults to n.
            k (int, optional): Index of the diagonal. Defaults to 0 (main diagonal).
            dtype (Dtype, optional): Data type of the output array. Defaults to float32.
            stream (Stream, optional): Stream or device. Defaults to None.

        Returns:
            array: An array where all elements are equal to zero, except for the k-th diagonal, whose values are equal to one.
      )pbdoc");
  m.def(
      "identity",
      [](int n, std::optional<mx::Dtype> dtype, mx::StreamOrDevice s) {
        return mx::identity(n, dtype.value_or(mx::float32), s);
      },
      "n"_a,
      "dtype"_a.none() = mx::float32,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def identity(n: int, dtype: Optional[Dtype] = float32, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Create a square identity matrix.

        Args:
            n (int): The number of rows and columns in the output.
            dtype (Dtype, optional): Data type of the output array. Defaults to float32.
            stream (Stream, optional): Stream or device. Defaults to None.

        Returns:
            array: An identity matrix of size n x n.
      )pbdoc");
  m.def(
      "tri",
      [](int n,
         std::optional<int> m,
         int k,
         std::optional<mx::Dtype> type,
         mx::StreamOrDevice s) {
        // Column count falls back to n; a None dtype falls back to float32.
        return mx::tri(n, m.value_or(n), k, type.value_or(mx::float32), s);
      },
      "n"_a,
      "m"_a = nb::none(),
      "k"_a = 0,
      "dtype"_a.none() = mx::float32,
      nb::kw_only(),
      "stream"_a = nb::none(),
      // Signature defaults mirror the argument defaults declared above.
      nb::sig(
          "def tri(n: int, m: Optional[int] = None, k: int = 0, dtype: Optional[Dtype] = float32, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        An array with ones at and below the given diagonal and zeros elsewhere.

        Args:
          n (int): The number of rows in the output.
          m (int, optional): The number of cols in the output. Defaults to ``n``.
          k (int, optional): The diagonal of the 2-D array. Defaults to ``0``.
          dtype (Dtype, optional): Data type of the output array. Defaults to ``float32``.
          stream (Stream, optional): Stream or device. Defaults to ``None``.

        Returns:
          array: Array with its lower triangle filled with ones and zeros elsewhere
      )pbdoc");
  m.def(
      "tril",
      &mx::tril,
      "x"_a,
      "k"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      // Signature advertises `k = 0` to match the argument default above.
      nb::sig(
          "def tril(x: array, k: int = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Zeros the array above the given diagonal.

        Args:
          x (array): input array.
          k (int, optional): The diagonal of the 2-D array. Defaults to ``0``.
          stream (Stream, optional): Stream or device. Defaults to ``None``.

        Returns:
          array: Array zeroed above the given diagonal
      )pbdoc");
  m.def(
      "triu",
      &mx::triu,
      "x"_a,
      "k"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      // Signature advertises `k = 0` to match the argument default above.
      nb::sig(
          "def triu(x: array, k: int = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Zeros the array below the given diagonal.

        Args:
          x (array): input array.
          k (int, optional): The diagonal of the 2-D array. Defaults to ``0``.
          stream (Stream, optional): Stream or device. Defaults to ``None``.

        Returns:
          array: Array zeroed below the given diagonal
    )pbdoc");
  m.def(
      "allclose",
      // Bound directly to the core op; tolerances are positional-or-keyword,
      // equal_nan and stream are keyword-only.
      &mx::allclose,
      nb::arg(),
      nb::arg(),
      nb::arg("rtol") = 1e-5,
      nb::arg("atol") = 1e-8,
      nb::kw_only(),
      nb::arg("equal_nan") = false,
      nb::arg("stream") = nb::none(),
      nb::sig(
          "def allclose(a: array, b: array, /, rtol: float = 1e-05, atol: float = 1e-08, *, equal_nan: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Approximate comparison of two arrays.

        Infinite values are considered equal if they have the same sign, NaN values are not equal unless ``equal_nan`` is ``True``.

        The arrays are considered equal if:

        .. code-block::

         all(abs(a - b) <= (atol + rtol * abs(b)))

        Note unlike :func:`array_equal`, this function supports numpy-style
        broadcasting.

        Args:
            a (array): Input array.
            b (array): Input array.
            rtol (float): Relative tolerance.
            atol (float): Absolute tolerance.
            equal_nan (bool): If ``True``, NaNs are considered equal.
              Defaults to ``False``.

        Returns:
            array: The boolean output scalar indicating if the arrays are close.
      )pbdoc");
  m.def(
      "isclose",
      // Bound directly to the core op; tolerances are positional-or-keyword,
      // equal_nan and stream are keyword-only.
      &mx::isclose,
      nb::arg(),
      nb::arg(),
      "rtol"_a = 1e-5,
      "atol"_a = 1e-8,
      nb::kw_only(),
      "equal_nan"_a = false,
      "stream"_a = nb::none(),
      nb::sig(
          "def isclose(a: array, b: array, /, rtol: float = 1e-05, atol: float = 1e-08, *, equal_nan: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns a boolean array where two arrays are element-wise equal within a tolerance.

        Infinite values are considered equal if they have the same sign, NaN values are
        not equal unless ``equal_nan`` is ``True``.

        Two values are considered equal if:

        .. code-block::

         abs(a - b) <= (atol + rtol * abs(b))

        Note unlike :func:`array_equal`, this function supports numpy-style
        broadcasting.

        Args:
            a (array): Input array.
            b (array): Input array.
            rtol (float): Relative tolerance.
            atol (float): Absolute tolerance.
            equal_nan (bool): If ``True``, NaNs are considered equal.
              Defaults to ``False``.

        Returns:
            array: The boolean output array indicating which elements are close.
      )pbdoc");
  m.def(
      "all",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        return mx::all(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def all(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        An `and` reduction over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array with the corresponding axes reduced.
      )pbdoc");
  // Python binding for ``any``: forwards to mx::any after resolving the axes.
  m.def(
      "any",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::any(a, axes, keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def any(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        An `or` reduction over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array with the corresponding axes reduced.
      )pbdoc");
  // Python binding for ``minimum``. Both operands may be scalars or arrays;
  // to_arrays converts the pair before the core op is called.
  m.def(
      "minimum",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [lhs, rhs] = to_arrays(a_, b_);
        return mx::minimum(lhs, rhs, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def minimum(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise minimum.

        Take the element-wise min of two arrays with numpy-style broadcasting
        semantics. Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The min of ``a`` and ``b``.
      )pbdoc");
  // Python binding for ``maximum``. Both operands may be scalars or arrays;
  // to_arrays converts the pair before the core op is called.
  m.def(
      "maximum",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        auto [lhs, rhs] = to_arrays(a_, b_);
        return mx::maximum(lhs, rhs, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def maximum(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise maximum.

        Take the element-wise max of two arrays with numpy-style broadcasting
        semantics. Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The max of ``a`` and ``b``.
      )pbdoc");
  // Python binding for ``floor``. The input goes through to_array so the
  // binding also accepts whatever ScalarOrArray admits.
  m.def(
      "floor",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        auto input = to_array(a);
        return mx::floor(input, s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def floor(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise floor.

        Args:
            a (array): Input array.

        Returns:
            array: The floor of ``a``.
      )pbdoc");
  // Python binding for ``ceil``. The input goes through to_array so the
  // binding also accepts whatever ScalarOrArray admits.
  m.def(
      "ceil",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        auto input = to_array(a);
        return mx::ceil(input, s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def ceil(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise ceil.

        Args:
            a (array): Input array.

        Returns:
            array: The ceil of ``a``.
      )pbdoc");
  // Python binding for ``isnan``. The input is converted with to_array and
  // forwarded to mx::isnan.
  m.def(
      "isnan",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::isnan(to_array(a), s);
      },
      nb::arg(), // unnamed: the input can only be passed positionally
      nb::kw_only(), // arguments after this marker are keyword-only
      "stream"_a = nb::none(),
      // The signature string must mirror the annotations above: ``/`` marks
      // the positional-only input and ``*`` the keyword-only ``stream``.
      nb::sig(
          "def isnan(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return a boolean array indicating which elements are NaN.

        Args:
            a (array): Input array.

        Returns:
            array: The boolean array indicating which elements are NaN.
      )pbdoc");
  // Python binding for ``isinf``. The input is converted with to_array and
  // forwarded to mx::isinf.
  m.def(
      "isinf",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::isinf(to_array(a), s);
      },
      nb::arg(), // unnamed: the input can only be passed positionally
      nb::kw_only(), // arguments after this marker are keyword-only
      "stream"_a = nb::none(),
      // The signature string must mirror the annotations above: ``/`` marks
      // the positional-only input and ``*`` the keyword-only ``stream``.
      nb::sig(
          "def isinf(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return a boolean array indicating which elements are +/- infinity.

        Args:
            a (array): Input array.

        Returns:
            array: The boolean array indicating which elements are +/- infinity.
      )pbdoc");
  // Python binding for ``isfinite``. The input is converted with to_array
  // and forwarded to mx::isfinite.
  m.def(
      "isfinite",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::isfinite(to_array(a), s);
      },
      nb::arg(), // unnamed: the input can only be passed positionally
      nb::kw_only(), // arguments after this marker are keyword-only
      "stream"_a = nb::none(),
      // The signature string must mirror the annotations above: ``/`` marks
      // the positional-only input and ``*`` the keyword-only ``stream``.
      nb::sig(
          "def isfinite(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return a boolean array indicating which elements are finite.

        An element is finite if it is not infinite or NaN.

        Args:
            a (array): Input array.

        Returns:
            array: The boolean array indicating which elements are finite.
      )pbdoc");
  // Python binding for ``isposinf``. The input is converted with to_array
  // and forwarded to mx::isposinf.
  m.def(
      "isposinf",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::isposinf(to_array(a), s);
      },
      nb::arg(), // unnamed: the input can only be passed positionally
      nb::kw_only(), // arguments after this marker are keyword-only
      "stream"_a = nb::none(),
      // The signature string must mirror the annotations above: ``/`` marks
      // the positional-only input and ``*`` the keyword-only ``stream``.
      nb::sig(
          "def isposinf(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return a boolean array indicating which elements are positive infinity.

        Args:
            a (array): Input array.
            stream (Union[None, Stream, Device]): Optional stream or device.

        Returns:
            array: The boolean array indicating which elements are positive infinity.
      )pbdoc");
  // Python binding for ``isneginf``. The input is converted with to_array
  // and forwarded to mx::isneginf.
  m.def(
      "isneginf",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::isneginf(to_array(a), s);
      },
      nb::arg(), // unnamed: the input can only be passed positionally
      nb::kw_only(), // arguments after this marker are keyword-only
      "stream"_a = nb::none(),
      // The signature string must mirror the annotations above: ``/`` marks
      // the positional-only input and ``*`` the keyword-only ``stream``.
      nb::sig(
          "def isneginf(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return a boolean array indicating which elements are negative infinity.

        Args:
            a (array): Input array.
            stream (Union[None, Stream, Device]): Optional stream or device.

        Returns:
            array: The boolean array indicating which elements are negative infinity.
      )pbdoc");
  // Python binding for ``moveaxis``. The core function mx::moveaxis is bound
  // directly by pointer (no wrapper lambda), so argument conversion is left
  // entirely to nanobind.
  m.def(
      "moveaxis",
      &mx::moveaxis,
      nb::arg(), // unnamed: the input array is positional-only
      "source"_a,
      "destination"_a,
      nb::kw_only(), // ``stream`` below is keyword-only
      "stream"_a = nb::none(),
      nb::sig(
          "def moveaxis(a: array, /, source: int, destination: int, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Move an axis to a new position.

        Args:
            a (array): Input array.
            source (int): Specifies the source axis.
            destination (int): Specifies the destination axis.

        Returns:
            array: The array with the axis moved.
      )pbdoc");
  // Python binding for ``swapaxes``. The core function mx::swapaxes is bound
  // directly by pointer (no wrapper lambda).
  m.def(
      "swapaxes",
      &mx::swapaxes,
      nb::arg(), // unnamed: the input array is positional-only
      "axis1"_a,
      "axis2"_a,
      nb::kw_only(), // ``stream`` below is keyword-only
      "stream"_a = nb::none(),
      // Annotation spacing kept uniform with the other signature strings
      // (``axis1: int`` rather than ``axis1 : int``).
      nb::sig(
          "def swapaxes(a: array, /, axis1: int, axis2: int, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Swap two axes of an array.

        Args:
            a (array): Input array.
            axis1 (int): Specifies the first axis.
            axis2 (int): Specifies the second axis.

        Returns:
            array: The array with swapped axes.
      )pbdoc");
  // Python binding for ``transpose``. Picks the core overload depending on
  // whether an explicit axes permutation was supplied.
  m.def(
      "transpose",
      [](const mx::array& a,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (!axes) {
          // No permutation given: use the axes-less overload.
          return mx::transpose(a, s);
        }
        return mx::transpose(a, axes.value(), s);
      },
      nb::arg(),
      "axes"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def transpose(a: array, /, axes: Optional[Sequence[int]] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Transpose the dimensions of the array.

        Args:
            a (array): Input array.
            axes (list(int), optional): Specifies the source axis for each axis
              in the new array. The default is to reverse the axes.

        Returns:
            array: The transposed array.
      )pbdoc");
  // Python binding for ``permute_dims``: an alias that dispatches to the same
  // mx::transpose overloads as ``transpose``.
  m.def(
      "permute_dims",
      [](const mx::array& a,
         const std::optional<std::vector<int>>& axes,
         mx::StreamOrDevice s) {
        if (!axes) {
          // No permutation given: use the axes-less overload.
          return mx::transpose(a, s);
        }
        return mx::transpose(a, axes.value(), s);
      },
      nb::arg(),
      "axes"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def permute_dims(a: array, /, axes: Optional[Sequence[int]] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        See :func:`transpose`.
      )pbdoc");
  // Python binding for ``sum``: forwards to mx::sum after resolving the axes.
  m.def(
      "sum",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::sum(a, axes, keepdims, s);
      },
      // NOTE(review): unlike the sibling reductions, the input is bound with
      // the keyword name ``array`` while the signature string below advertises
      // a positional-only ``a``. Left as-is so existing keyword callers keep
      // working -- confirm which form is intended.
      "array"_a,
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sum(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Sum reduce the array over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array with the corresponding axes reduced.
      )pbdoc");
  // Python binding for ``prod``: forwards to mx::prod after resolving the
  // Python-side ``axis`` argument with get_reduce_axes.
  m.def(
      "prod",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        return mx::prod(a, get_reduce_axes(axis, a.ndim()), keepdims, s);
      },
      nb::arg(), // unnamed: the input array is positional-only
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(), // ``stream`` below is keyword-only
      "stream"_a = nb::none(),
      nb::sig(
          "def prod(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        A product reduction over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array with the corresponding axes reduced.
      )pbdoc");
  // Python binding for ``min``: forwards to mx::min after resolving the axes.
  m.def(
      "min",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::min(a, axes, keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def min(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        A `min` reduction over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array with the corresponding axes reduced.
      )pbdoc");
  // Python binding for ``max``: forwards to mx::max after resolving the axes.
  m.def(
      "max",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::max(a, axes, keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def max(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        A `max` reduction over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array with the corresponding axes reduced.
      )pbdoc");
  // Python binding for ``logcumsumexp``. Without an axis the input is
  // reshaped to one dimension and scanned along axis 0.
  m.def(
      "logcumsumexp",
      [](const mx::array& a,
         std::optional<int> axis,
         bool reverse,
         bool inclusive,
         mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          // No axis: collapse to a single dimension and scan along it.
          auto flat = mx::reshape(a, {-1}, s);
          return mx::logcumsumexp(flat, 0, reverse, inclusive, s);
        }
        return mx::logcumsumexp(a, axis.value(), reverse, inclusive, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      nb::kw_only(),
      "reverse"_a = false,
      "inclusive"_a = true,
      "stream"_a = nb::none(),
      nb::sig(
          "def logcumsumexp(a: array, /, axis: Optional[int] = None, *, reverse: bool = False, inclusive: bool = True, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the cumulative logsumexp of the elements along the given axis.

        Args:
          a (array): Input array
          axis (int, optional): Optional axis to compute the cumulative logsumexp
            over. If unspecified the cumulative logsumexp of the flattened array is
            returned.
          reverse (bool): Perform the cumulative logsumexp in reverse.
          inclusive (bool): The i-th element of the output includes the i-th
            element of the input.

        Returns:
          array: The output array.
      )pbdoc");
  // Python binding for ``logsumexp``: forwards to mx::logsumexp after
  // resolving the axes.
  m.def(
      "logsumexp",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::logsumexp(a, axes, keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def logsumexp(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        A `log-sum-exp` reduction over the given axes.

        The log-sum-exp reduction is a numerically stable version of:

        .. code-block::

          log(sum(exp(a), axis))

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array with the corresponding axes reduced.
      )pbdoc");
  // Python binding for ``mean``: forwards to mx::mean after resolving the
  // axes.
  m.def(
      "mean",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::mean(a, axes, keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def mean(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the mean(s) over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array of means.
      )pbdoc");
  // Python binding for ``median``: forwards to mx::median after resolving the
  // axes.
  m.def(
      "median",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::median(a, axes, keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def median(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the median(s) over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The output array of medians.
      )pbdoc");
  // Python binding for ``var``: forwards to mx::var after resolving the axes.
  m.def(
      "var",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         int ddof,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::var(a, axes, keepdims, ddof, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      "ddof"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def var(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, ddof: int = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the variance(s) over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.
            ddof (int, optional): The divisor to compute the variance
              is ``N - ddof``, defaults to 0.

        Returns:
            array: The output array of variances.
      )pbdoc");
  // Python binding for ``std``: forwards to mx::std after resolving the axes.
  m.def(
      "std",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool keepdims,
         int ddof,
         mx::StreamOrDevice s) {
        // Turn the Python-side ``axis`` argument into the axes list the core
        // op expects.
        const auto axes = get_reduce_axes(axis, a.ndim());
        return mx::std(a, axes, keepdims, ddof, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      "ddof"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def std(a: array, /, axis: Union[None, int, Sequence[int]] = None, keepdims: bool = False, ddof: int = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the standard deviation(s) over the given axes.

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or
              axes to reduce over. If unspecified this defaults
              to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.
            ddof (int, optional): The divisor to compute the variance
              is ``N - ddof``, defaults to 0.

        Returns:
            array: The output array of standard deviations.
      )pbdoc");
  // Python binding for ``split``. ``indices_or_sections`` is a variant: an
  // int selects the "number of sections" overload of mx::split, otherwise the
  // held mx::Shape is passed as the list of split indices.
  m.def(
      "split",
      [](const mx::array& a,
         const std::variant<int, mx::Shape>& indices_or_sections,
         int axis,
         mx::StreamOrDevice s) {
        if (auto pv = std::get_if<int>(&indices_or_sections); pv) {
          return mx::split(a, *pv, axis, s);
        } else {
          return mx::split(
              a, std::get<mx::Shape>(indices_or_sections), axis, s);
        }
      },
      nb::arg(), // unnamed: the input array is positional-only
      "indices_or_sections"_a,
      "axis"_a = 0,
      nb::kw_only(), // ``stream`` below is keyword-only
      "stream"_a = nb::none(),
      // The return annotation is a list, matching the "Returns" section of
      // the docstring below.
      nb::sig(
          "def split(a: array, /, indices_or_sections: Union[int, Sequence[int]], axis: int = 0, *, stream: Union[None, Stream, Device] = None) -> list[array]"),
      R"pbdoc(
        Split an array along a given axis.

        Args:
            a (array): Input array.
            indices_or_sections (int or list(int)): If ``indices_or_sections``
              is an integer the array is split into that many sections of equal
              size. An error is raised if this is not possible. If
              ``indices_or_sections`` is a list, then the indices are the split
              points, and the array is divided into
              ``len(indices_or_sections) + 1`` sub-arrays.
            axis (int, optional): Axis to split along, defaults to `0`.

        Returns:
            list(array): A list of split arrays.

        Example:

          >>> a = mx.array([1, 2, 3, 4], dtype=mx.int32)
          >>> mx.split(a, 2)
          [array([1, 2], dtype=int32), array([3, 4], dtype=int32)]
          >>> mx.split(a, [1, 3])
          [array([1], dtype=int32), array([2, 3], dtype=int32), array([4], dtype=int32)]

      )pbdoc");
  // Python binding for ``argmin``. Chooses between the per-axis and the
  // axis-less core overloads.
  m.def(
      "argmin",
      [](const mx::array& a,
         std::optional<int> axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          // No axis given: use the overload that takes none.
          return mx::argmin(a, keepdims, s);
        }
        return mx::argmin(a, axis.value(), keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def argmin(a: array, /, axis: Union[None, int] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Indices of the minimum values along the axis.

        Args:
            a (array): Input array.
            axis (int, optional): Optional axis to reduce over. If unspecified
              this defaults to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The ``uint32`` array with the indices of the minimum values.
      )pbdoc");
  // Python binding for ``argmax``. Chooses between the per-axis and the
  // axis-less core overloads.
  m.def(
      "argmax",
      [](const mx::array& a,
         std::optional<int> axis,
         bool keepdims,
         mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          // No axis given: use the overload that takes none.
          return mx::argmax(a, keepdims, s);
        }
        return mx::argmax(a, axis.value(), keepdims, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      "keepdims"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def argmax(a: array, /, axis: Union[None, int] = None, keepdims: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Indices of the maximum values along the axis.

        Args:
            a (array): Input array.
            axis (int, optional): Optional axis to reduce over. If unspecified
              this defaults to reducing over the entire array.
            keepdims (bool, optional): Keep reduced axes as
              singleton dimensions, defaults to `False`.

        Returns:
            array: The ``uint32`` array with the indices of the maximum values.
      )pbdoc");
  // Python binding for ``sort``. ``axis`` defaults to -1 but may be passed as
  // None, in which case the axis-less core overload is used.
  m.def(
      "sort",
      [](const mx::array& a, std::optional<int> axis, mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          return mx::sort(a, s);
        }
        return mx::sort(a, axis.value(), s);
      },
      nb::arg(),
      "axis"_a.none() = -1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def sort(a: array, /, axis: Union[None, int] = -1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns a sorted copy of the array.

        Args:
            a (array): Input array.
            axis (int or None, optional): Optional axis to sort over.
              If ``None``, this sorts over the flattened array.
              If unspecified, it defaults to -1 (sorting over the last axis).

        Returns:
            array: The sorted array.
      )pbdoc");
  // Python binding for ``argsort``. ``axis`` defaults to -1 but may be passed
  // as None, in which case the axis-less core overload is used.
  m.def(
      "argsort",
      [](const mx::array& a, std::optional<int> axis, mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          return mx::argsort(a, s);
        }
        return mx::argsort(a, axis.value(), s);
      },
      nb::arg(),
      "axis"_a.none() = -1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def argsort(a: array, /, axis: Union[None, int] = -1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns the indices that sort the array.

        Args:
            a (array): Input array.
            axis (int or None, optional): Optional axis to sort over.
              If ``None``, this sorts over the flattened array.
              If unspecified, it defaults to -1 (sorting over the last axis).

        Returns:
            array: The ``uint32`` array containing indices that sort the input.
      )pbdoc");
  // Python binding for ``partition``. ``axis`` defaults to -1 but may be
  // passed as None, in which case the axis-less core overload is used.
  m.def(
      "partition",
      [](const mx::array& a,
         int kth,
         std::optional<int> axis,
         mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          return mx::partition(a, kth, s);
        }
        return mx::partition(a, kth, axis.value(), s);
      },
      nb::arg(),
      "kth"_a,
      "axis"_a.none() = -1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def partition(a: array, /, kth: int, axis: Union[None, int] = -1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns a partitioned copy of the array such that the smaller ``kth``
        elements are first.

        The ordering of the elements in partitions is undefined.

        Args:
            a (array): Input array.
            kth (int): Element at the ``kth`` index will be in its sorted
              position in the output. All elements before the kth index will
              be less or equal to the ``kth`` element and all elements after
              will be greater or equal to the ``kth`` element in the output.
            axis (int or None, optional): Optional axis to partition over.
              If ``None``, this partitions over the flattened array.
              If unspecified, it defaults to ``-1``.

        Returns:
            array: The partitioned array.
      )pbdoc");
  // Python binding for ``argpartition``. ``axis`` defaults to -1 but may be
  // passed as None, in which case the axis-less core overload is used.
  m.def(
      "argpartition",
      [](const mx::array& a,
         int kth,
         std::optional<int> axis,
         mx::StreamOrDevice s) {
        if (axis) {
          return mx::argpartition(a, kth, *axis, s);
        } else {
          return mx::argpartition(a, kth, s);
        }
      },
      nb::arg(), // unnamed: the input array is positional-only
      "kth"_a,
      "axis"_a.none() = -1, // explicit None is accepted in addition to ints
      nb::kw_only(), // ``stream`` below is keyword-only
      "stream"_a = nb::none(),
      nb::sig(
          "def argpartition(a: array, /, kth: int, axis: Union[None, int] = -1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns the indices that partition the array.

        The ordering of the elements within a partition given by the indices
        is undefined.

        Args:
            a (array): Input array.
            kth (int): Element index at the ``kth`` position in the output will
              give the sorted position. All indices before the ``kth`` position
              will be of elements less or equal to the element at the ``kth``
              index and all indices after will be of elements greater or equal
              to the element at the ``kth`` index.
            axis (int or None, optional): Optional axis to partition over.
              If ``None``, this partitions over the flattened array.
              If unspecified, it defaults to ``-1``.

        Returns:
            array: The ``uint32`` array containing indices that partition the input.
      )pbdoc");
  // Python binding for ``topk``. ``axis`` defaults to -1 but may be passed as
  // None, in which case the axis-less core overload is used.
  m.def(
      "topk",
      [](const mx::array& a,
         int k,
         std::optional<int> axis,
         mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          return mx::topk(a, k, s);
        }
        return mx::topk(a, k, axis.value(), s);
      },
      nb::arg(),
      "k"_a,
      "axis"_a.none() = -1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def topk(a: array, /, k: int, axis: Union[None, int] = -1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns the ``k`` largest elements from the input along a given axis.

        The elements will not necessarily be in sorted order.

        Args:
            a (array): Input array.
            k (int): ``k`` top elements to be returned
            axis (int or None, optional): Optional axis to select over.
              If ``None``, this selects the top ``k`` elements over the
              flattened array. If unspecified, it defaults to ``-1``.

        Returns:
            array: The top ``k`` elements from the input.
      )pbdoc");
  // Python binding for ``broadcast_to``. The input is converted with to_array
  // before being handed to mx::broadcast_to.
  m.def(
      "broadcast_to",
      [](const ScalarOrArray& a, const mx::Shape& shape, mx::StreamOrDevice s) {
        auto input = to_array(a);
        return mx::broadcast_to(input, shape, s);
      },
      nb::arg(),
      "shape"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def broadcast_to(a: Union[scalar, array], /, shape: Sequence[int], *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Broadcast an array to the given shape.

        The broadcasting semantics are the same as Numpy.

        Args:
            a (array): Input array.
            shape (list(int)): The shape to broadcast to.

        Returns:
            array: The output array with the new shape.
      )pbdoc");
  // Python binding for ``broadcast_arrays``. The variadic positional
  // arguments are cast to a vector of arrays and broadcast together.
  m.def(
      "broadcast_arrays",
      [](const nb::args& args, mx::StreamOrDevice s) {
        auto inputs = nb::cast<std::vector<mx::array>>(args);
        return broadcast_arrays(inputs, s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def broadcast_arrays(*arrays: array, stream: Union[None, Stream, Device] = None) -> Tuple[array, ...]"),
      R"pbdoc(
        Broadcast arrays against one another.

        The broadcasting semantics are the same as Numpy.

        Args:
            *arrays (array): The input arrays.

        Returns:
            tuple(array): The output arrays with the broadcasted shape.
      )pbdoc");
  // Python binding for ``softmax``: resolves the Python-side ``axis`` with
  // get_reduce_axes and forwards to mx::softmax together with ``precise``.
  m.def(
      "softmax",
      [](const mx::array& a,
         const IntOrVec& axis,
         bool precise,
         mx::StreamOrDevice s) {
        return mx::softmax(a, get_reduce_axes(axis, a.ndim()), precise, s);
      },
      nb::arg(), // unnamed: the input array is positional-only
      "axis"_a = nb::none(),
      nb::kw_only(), // ``precise`` and ``stream`` below are keyword-only
      "precise"_a = false,
      "stream"_a = nb::none(),
      // The signature string lists every bound argument, including the
      // keyword-only ``precise`` flag.
      nb::sig(
          "def softmax(a: array, /, axis: Union[None, int, Sequence[int]] = None, *, precise: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Perform the softmax along the given axis.

        This operation is a numerically stable version of:

        .. code-block::

          exp(a) / sum(exp(a), axis, keepdims=True)

        Args:
            a (array): Input array.
            axis (int or list(int), optional): Optional axis or axes to compute
             the softmax over. If unspecified this performs the softmax over
             the full array.
            precise (bool, optional): Keyword-only flag forwarded to the
             underlying softmax to request a more precise computation,
             defaults to `False`.

        Returns:
            array: The output of the softmax.
      )pbdoc");
  // Python binding for ``concatenate``. ``axis`` defaults to 0 but may be
  // passed as None, in which case the axis-less core overload is used.
  m.def(
      "concatenate",
      [](const std::vector<mx::array>& arrays,
         std::optional<int> axis,
         mx::StreamOrDevice s) {
        if (axis) {
          return mx::concatenate(arrays, *axis, s);
        } else {
          return mx::concatenate(arrays, s);
        }
      },
      nb::arg(), // unnamed: ``arrays`` is positional-only
      "axis"_a.none() = 0, // explicit None is accepted in addition to ints
      nb::kw_only(), // ``stream`` below is keyword-only
      "stream"_a = nb::none(),
      // ``/`` in the signature string reflects that ``arrays`` has no keyword
      // name, as in the other bindings in this file.
      nb::sig(
          "def concatenate(arrays: list[array], /, axis: Optional[int] = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Concatenate the arrays along the given axis.

        Args:
            arrays (list(array)): Input :obj:`list` or :obj:`tuple` of arrays.
            axis (int, optional): Optional axis to concatenate along. If
              unspecified defaults to ``0``.

        Returns:
            array: The concatenated array.
      )pbdoc");
  // Alias binding: `concat` dispatches to the same mx::concatenate overloads
  // as `concatenate`, choosing by whether an axis was supplied.
  m.def(
      "concat",
      [](const std::vector<mx::array>& arrays,
         std::optional<int> axis,
         mx::StreamOrDevice s) {
        return axis ? mx::concatenate(arrays, *axis, s)
                    : mx::concatenate(arrays, s);
      },
      nb::arg(),
      "axis"_a.none() = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def concat(arrays: list[array], axis: Optional[int] = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        See :func:`concatenate`.
      )pbdoc");
  // Python binding for stack. The signature advertises `Optional[int]` and
  // the lambda handles an empty optional, so the argument annotation is
  // marked `.none()` (matching `concatenate`/`concat`) to make sure an
  // explicit `axis=None` reaches the lambda instead of being rejected by the
  // argument annotation.
  m.def(
      "stack",
      [](const std::vector<mx::array>& arrays,
         std::optional<int> axis,
         mx::StreamOrDevice s) {
        if (axis.has_value()) {
          return mx::stack(arrays, axis.value(), s);
        } else {
          // No axis given: use the overload that takes no axis argument.
          return mx::stack(arrays, s);
        }
      },
      nb::arg(),
      "axis"_a.none() = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def stack(arrays: list[array], axis: Optional[int] = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Stacks the arrays along a new axis.

        Args:
            arrays (list(array)): A list of arrays to stack.
            axis (int, optional): The axis in the result array along which the
              input arrays are stacked. Defaults to ``0``.
            stream (Stream, optional): Stream or device. Defaults to ``None``.

        Returns:
            array: The resulting stacked array.
      )pbdoc");
  // Python binding for meshgrid. The variadic positional arguments are cast
  // to a vector of arrays before calling into C++. The advertised signature
  // reports a list return (as the docstring does) and types `sparse` /
  // `indexing` as plain `bool` / `str`, matching the C++ parameter types of
  // the lambda.
  m.def(
      "meshgrid",
      [](nb::args arrays_,
         bool sparse,
         std::string indexing,
         mx::StreamOrDevice s) {
        std::vector<mx::array> arrays =
            nb::cast<std::vector<mx::array>>(arrays_);
        return mx::meshgrid(arrays, sparse, indexing, s);
      },
      "arrays"_a,
      "sparse"_a = false,
      "indexing"_a = "xy",
      "stream"_a = nb::none(),
      nb::sig(
          "def meshgrid(*arrays: array, sparse: bool = False, indexing: str = 'xy', stream: Union[None, Stream, Device] = None) -> list[array]"),
      R"pbdoc(
        Generate multidimensional coordinate grids from 1-D coordinate arrays

        Args:
            *arrays (array): Input arrays.
            sparse (bool, optional): If ``True``, a sparse grid is returned in which each output
              array has a single non-zero element. If ``False``, a dense grid is returned.
              Defaults to ``False``.
            indexing (str, optional): Cartesian ('xy') or matrix ('ij') indexing of the output arrays.
              Defaults to ``'xy'``.

        Returns:
            list(array): The output arrays.
      )pbdoc");
  // Python binding for repeat. Without an axis the axis-less C++ overload is
  // called; otherwise the explicit-axis overload is used.
  m.def(
      "repeat",
      [](const mx::array& array,
         int repeats,
         std::optional<int> axis,
         mx::StreamOrDevice s) {
        if (!axis) {
          return mx::repeat(array, repeats, s);
        }
        return mx::repeat(array, repeats, *axis, s);
      },
      nb::arg(),
      "repeats"_a,
      "axis"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def repeat(array: array, repeats: int, axis: Optional[int] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Repeat an array along a specified axis.

        Args:
            array (array): Input array.
            repeats (int): The number of repetitions for each element.
            axis (int, optional): The axis in which to repeat the array along. If
              unspecified it uses the flattened array of the input and repeats
              along axis 0.
            stream (Stream, optional): Stream or device. Defaults to ``None``.

        Returns:
            array: The resulting repeated array.
      )pbdoc");
  // Python binding for clip. Each limit is optional; a supplied limit is
  // converted with to_arrays(a, limit) and its `.second` member is what gets
  // forwarded to mx::clip.
  m.def(
      "clip",
      [](const mx::array& a,
         const std::optional<ScalarOrArray>& min,
         const std::optional<ScalarOrArray>& max,
         mx::StreamOrDevice s) {
        // Turn one optional Python-side limit into an optional array.
        auto as_limit = [&a](const std::optional<ScalarOrArray>& bound)
            -> std::optional<mx::array> {
          if (!bound.has_value()) {
            return std::nullopt;
          }
          return to_arrays(a, bound.value()).second;
        };
        // Convert the lower limit first, then the upper one.
        std::optional<mx::array> lower = as_limit(min);
        std::optional<mx::array> upper = as_limit(max);
        return mx::clip(a, lower, upper, s);
      },
      nb::arg(),
      "a_min"_a.none(),
      "a_max"_a.none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def clip(a: array, /, a_min: Union[scalar, array, None], a_max: Union[scalar, array, None], *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Clip the values of the array between the given minimum and maximum.

        If either ``a_min`` or ``a_max`` are ``None``, then corresponding edge
        is ignored. At least one of ``a_min`` and ``a_max`` cannot be ``None``.
        The input ``a`` and the limits must broadcast with one another.

        Args:
            a (array): Input array.
            a_min (scalar or array or None): Minimum value to clip to.
            a_max (scalar or array or None): Maximum value to clip to.

        Returns:
            array: The clipped array.
      )pbdoc");
  // Python binding for pad. `pad_width` arrives as one of four shapes (int,
  // 1-tuple, pair, list of pairs) and is dispatched to the matching mx::pad
  // overload. The fill value is converted to an array once up front, and the
  // list-of-pairs alternative is read by reference rather than copied.
  m.def(
      "pad",
      [](const mx::array& a,
         const std::variant<
             int,
             std::tuple<int>,
             std::pair<int, int>,
             std::vector<std::pair<int, int>>>& pad_width,
         const std::string& mode,
         const ScalarOrArray& constant_value,
         mx::StreamOrDevice s) {
        const auto fill = to_array(constant_value);
        if (auto pv = std::get_if<int>(&pad_width); pv) {
          return mx::pad(a, *pv, fill, mode, s);
        } else if (auto pv = std::get_if<std::tuple<int>>(&pad_width); pv) {
          // A 1-tuple is treated exactly like a bare integer.
          return mx::pad(a, std::get<0>(*pv), fill, mode, s);
        } else if (auto pv = std::get_if<std::pair<int, int>>(&pad_width); pv) {
          return mx::pad(a, *pv, fill, mode, s);
        } else {
          const auto& v = std::get<std::vector<std::pair<int, int>>>(pad_width);
          if (v.size() == 1) {
            // A single-entry list goes through the single-pair overload.
            return mx::pad(a, v[0], fill, mode, s);
          } else {
            return mx::pad(a, v, fill, mode, s);
          }
        }
      },
      nb::arg(),
      "pad_width"_a,
      "mode"_a = "constant",
      "constant_values"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def pad(a: array, pad_width: Union[int, tuple[int], tuple[int, int], list[tuple[int, int]]], mode: Literal['constant', 'edge'] = 'constant', constant_values: Union[scalar, array] = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Pad an array with a constant value

        Args:
            a (array): Input array.
            pad_width (int, tuple(int), tuple(int, int) or list(tuple(int, int))): Number of padded
              values to add to the edges of each axis:``((before_1, after_1),
              (before_2, after_2), ..., (before_N, after_N))``. If a single pair
              of integers is passed then ``(before_i, after_i)`` are all the same.
              If a single integer or tuple with a single integer is passed then
              all axes are extended by the same number on each side.
            mode: Padding mode. One of the following strings:
              "constant" (default): Pads with a constant value.
              "edge": Pads with the edge values of array.
            constant_values (array or scalar, optional): Optional constant value
              to pad the edges of the array with.

        Returns:
            array: The padded array.
      )pbdoc");
  // Python binding for as_strided. Missing `shape` falls back to the input's
  // shape; missing `strides` are derived from the chosen shape so that the
  // last axis has stride 1 and each earlier axis has the product of the later
  // extents.
  m.def(
      "as_strided",
      [](const mx::array& a,
         std::optional<mx::Shape> shape,
         std::optional<mx::Strides> strides,
         size_t offset,
         mx::StreamOrDevice s) {
        auto view_shape = shape.has_value() ? shape.value() : a.shape();
        mx::Strides view_strides;
        if (!strides.has_value()) {
          // Start with all ones, then accumulate products from the back.
          view_strides = mx::Strides(view_shape.size(), 1);
          int dim = view_shape.size() - 1;
          while (dim > 0) {
            view_strides[dim - 1] = view_shape[dim] * view_strides[dim];
            --dim;
          }
        } else {
          view_strides = strides.value();
        }
        return mx::as_strided(a, view_shape, view_strides, offset, s);
      },
      nb::arg(),
      "shape"_a = nb::none(),
      "strides"_a = nb::none(),
      "offset"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def as_strided(a: array, /, shape: Optional[Sequence[int]] = None, strides: Optional[Sequence[int]] = None, offset: int = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Create a view into the array with the given shape and strides.

        The resulting array will always be as if the provided array was row
        contiguous regardless of the provided arrays storage order and current
        strides.

        .. note::
           Note that this function should be used with caution as it changes
           the shape and strides of the array directly. This can lead to the
           resulting array pointing to invalid memory locations which can
           result into crashes.

        Args:
          a (array): Input array
          shape (list(int), optional): The shape of the resulting array. If
            None it defaults to ``a.shape()``.
          strides (list(int), optional): The strides of the resulting array. If
            None it defaults to the reverse exclusive cumulative product of
            ``a.shape()``.
          offset (int): Skip that many elements from the beginning of the input
            array.

        Returns:
          array: The output array which is the strided view of the input.
      )pbdoc");
  // Python binding for cumsum. With no axis the input is reshaped to 1-D and
  // scanned along axis 0.
  m.def(
      "cumsum",
      [](const mx::array& a,
         std::optional<int> axis,
         bool reverse,
         bool inclusive,
         mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          // Flatten first, then scan the single remaining axis.
          return mx::cumsum(mx::reshape(a, {-1}, s), 0, reverse, inclusive, s);
        }
        return mx::cumsum(a, axis.value(), reverse, inclusive, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      nb::kw_only(),
      "reverse"_a = false,
      "inclusive"_a = true,
      "stream"_a = nb::none(),
      nb::sig(
          "def cumsum(a: array, /, axis: Optional[int] = None, *, reverse: bool = False, inclusive: bool = True, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the cumulative sum of the elements along the given axis.

        Args:
          a (array): Input array
          axis (int, optional): Optional axis to compute the cumulative sum
            over. If unspecified the cumulative sum of the flattened array is
            returned.
          reverse (bool): Perform the cumulative sum in reverse.
          inclusive (bool): The i-th element of the output includes the i-th
            element of the input.

        Returns:
          array: The output array.
      )pbdoc");
  // Python binding for cumprod. With no axis the input is reshaped to 1-D and
  // scanned along axis 0.
  m.def(
      "cumprod",
      [](const mx::array& a,
         std::optional<int> axis,
         bool reverse,
         bool inclusive,
         mx::StreamOrDevice s) {
        return axis
            ? mx::cumprod(a, *axis, reverse, inclusive, s)
            : mx::cumprod(mx::reshape(a, {-1}, s), 0, reverse, inclusive, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      nb::kw_only(),
      "reverse"_a = false,
      "inclusive"_a = true,
      "stream"_a = nb::none(),
      nb::sig(
          "def cumprod(a: array, /, axis: Optional[int] = None, *, reverse: bool = False, inclusive: bool = True, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the cumulative product of the elements along the given axis.

        Args:
          a (array): Input array
          axis (int, optional): Optional axis to compute the cumulative product
            over. If unspecified the cumulative product of the flattened array is
            returned.
          reverse (bool): Perform the cumulative product in reverse.
          inclusive (bool): The i-th element of the output includes the i-th
            element of the input.

        Returns:
          array: The output array.
      )pbdoc");
  // Python binding for cummax. With no axis the input is reshaped to 1-D and
  // scanned along axis 0.
  m.def(
      "cummax",
      [](const mx::array& a,
         std::optional<int> axis,
         bool reverse,
         bool inclusive,
         mx::StreamOrDevice s) {
        if (!axis.has_value()) {
          // Flatten first, then scan the single remaining axis.
          return mx::cummax(mx::reshape(a, {-1}, s), 0, reverse, inclusive, s);
        }
        return mx::cummax(a, axis.value(), reverse, inclusive, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      nb::kw_only(),
      "reverse"_a = false,
      "inclusive"_a = true,
      "stream"_a = nb::none(),
      nb::sig(
          "def cummax(a: array, /, axis: Optional[int] = None, *, reverse: bool = False, inclusive: bool = True, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the cumulative maximum of the elements along the given axis.

        Args:
          a (array): Input array
          axis (int, optional): Optional axis to compute the cumulative maximum
            over. If unspecified the cumulative maximum of the flattened array is
            returned.
          reverse (bool): Perform the cumulative maximum in reverse.
          inclusive (bool): The i-th element of the output includes the i-th
            element of the input.

        Returns:
          array: The output array.
      )pbdoc");
  // Python binding for cummin. With no axis the input is reshaped to 1-D and
  // scanned along axis 0.
  m.def(
      "cummin",
      [](const mx::array& a,
         std::optional<int> axis,
         bool reverse,
         bool inclusive,
         mx::StreamOrDevice s) {
        return axis
            ? mx::cummin(a, *axis, reverse, inclusive, s)
            : mx::cummin(mx::reshape(a, {-1}, s), 0, reverse, inclusive, s);
      },
      nb::arg(),
      "axis"_a = nb::none(),
      nb::kw_only(),
      "reverse"_a = false,
      "inclusive"_a = true,
      "stream"_a = nb::none(),
      nb::sig(
          "def cummin(a: array, /, axis: Optional[int] = None, *, reverse: bool = False, inclusive: bool = True, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the cumulative minimum of the elements along the given axis.

        Args:
          a (array): Input array
          axis (int, optional): Optional axis to compute the cumulative minimum
            over. If unspecified the cumulative minimum of the flattened array is
            returned.
          reverse (bool): Perform the cumulative minimum in reverse.
          inclusive (bool): The i-th element of the output includes the i-th
            element of the input.

        Returns:
          array: The output array.
      )pbdoc");
  // Python binding for conj: converts the scalar-or-array argument with
  // to_array and forwards it to mx::conjugate.
  m.def(
      "conj",
      [](const ScalarOrArray& value, mx::StreamOrDevice stream) {
        return mx::conjugate(to_array(value), stream);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conj(a: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the elementwise complex conjugate of the input.
        Alias for `mx.conjugate`.

        Args:
          a (array): Input array

        Returns:
          array: The output array.
      )pbdoc");
  // Python binding for conjugate: converts the scalar-or-array argument with
  // to_array and forwards it to mx::conjugate.
  m.def(
      "conjugate",
      [](const ScalarOrArray& value, mx::StreamOrDevice stream) {
        return mx::conjugate(to_array(value), stream);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conjugate(a: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the elementwise complex conjugate of the input.
        Alias for `mx.conj`.

        Args:
          a (array): Input array

        Returns:
          array: The output array.
      )pbdoc");
  // Python binding for convolve, built on top of mx::conv1d. The longer
  // input acts as the signal and the shorter one as the kernel; the kernel is
  // reversed and both are reshaped to rank 3 before the conv1d call, whose
  // result is flattened back to 1-D.
  m.def(
      "convolve",
      [](const mx::array& a,
         const mx::array& v,
         const std::string& mode,
         mx::StreamOrDevice s) {
        if (a.ndim() != 1 || v.ndim() != 1) {
          throw std::invalid_argument("[convolve] Inputs must be 1D.");
        }

        if (a.size() == 0 || v.size() == 0) {
          throw std::invalid_argument("[convolve] Inputs cannot be empty.");
        }

        // Pick roles by length: when `a` is strictly shorter the two swap.
        const bool a_is_shorter = a.size() < v.size();
        mx::array signal = a_is_shorter ? v : a;
        mx::array kernel = a_is_shorter ? a : v;

        // Reverse the kernel with a negative-step slice over its only axis.
        kernel = mx::slice(
            kernel, {kernel.shape(0) - 1}, {-kernel.shape(0) - 1}, {-1}, s);

        signal = mx::reshape(signal, {1, -1, 1}, s);
        kernel = mx::reshape(kernel, {1, -1, 1}, s);

        // "valid" keeps the zero default; the other modes adjust it below.
        int padding = 0;

        if (mode == "full") {
          padding = kernel.size() - 1;
        } else if (mode == "same") {
          if (kernel.size() % 2) {
            // Odd kernel length: the same amount of padding on both sides.
            padding = kernel.size() / 2;
          } else {
            // Even kernel length: pad the signal explicitly, one element less
            // on the right than on the left, and leave conv1d padding at 0.
            int left = kernel.size() / 2;
            int right = std::max(0, left - 1);
            signal = mx::pad(
                signal,
                {{0, 0}, {left, right}, {0, 0}},
                mx::array(0),
                "constant",
                s);
          }
        } else if (mode != "valid") {
          throw std::invalid_argument("[convolve] Invalid mode.");
        }

        mx::array result = mx::conv1d(
            signal,
            kernel,
            /*stride = */ 1,
            /*padding = */ padding,
            /*dilation = */ 1,
            /*groups = */ 1,
            s);

        return mx::reshape(result, {-1}, s);
      },
      nb::arg(),
      nb::arg(),
      "mode"_a = "full",
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          R"(def convolve(a: array, v: array, /, mode: str = "full", *, stream: Union[None, Stream, Device] = None) -> array)"),
      R"pbdoc(
        The discrete convolution of 1D arrays.

        If ``v`` is longer than ``a``, then they are swapped.
        The conv filter is flipped following signal processing convention.

        Args:
            a (array): 1D Input array.
            v (array): 1D Input array.
            mode (str, optional): {'full', 'valid', 'same'}

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding for conv1d. The C++ function mx::conv1d is registered
  // directly (no wrapper lambda): the two leading arguments are unnamed,
  // stride/padding/dilation/groups carry the defaults listed below, and
  // `stream` is keyword-only.
  m.def(
      "conv1d",
      &mx::conv1d,
      nb::arg(),
      nb::arg(),
      "stride"_a = 1,
      "padding"_a = 0,
      "dilation"_a = 1,
      "groups"_a = 1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conv1d(input: array, weight: array, /, stride: int = 1, padding: int = 0, dilation: int = 1, groups: int = 1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        1D convolution over an input with several channels

        Args:
            input (array): Input array of shape ``(N, L, C_in)``.
            weight (array): Weight array of shape ``(C_out, K, C_in)``.
            stride (int, optional): Kernel stride. Default: ``1``.
            padding (int, optional): Input padding. Default: ``0``.
            dilation (int, optional): Kernel dilation. Default: ``1``.
            groups (int, optional): Input feature groups. Default: ``1``.

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding for conv2d. stride/padding/dilation each accept either a
  // single int or a pair of ints; a single int is duplicated for both
  // spatial dimensions before calling mx::conv2d.
  m.def(
      "conv2d",
      [](const mx::array& input,
         const mx::array& weight,
         const std::variant<int, std::pair<int, int>>& stride,
         const std::variant<int, std::pair<int, int>>& padding,
         const std::variant<int, std::pair<int, int>>& dilation,
         int groups,
         mx::StreamOrDevice s) {
        using IntPair = std::pair<int, int>;

        // Duplicate a lone int into a pair; return a supplied pair unchanged.
        auto to_pair = [](const std::variant<int, IntPair>& value) -> IntPair {
          if (const int* scalar = std::get_if<int>(&value)) {
            return IntPair{*scalar, *scalar};
          }
          return std::get<IntPair>(value);
        };

        const IntPair stride_hw = to_pair(stride);
        const IntPair padding_hw = to_pair(padding);
        const IntPair dilation_hw = to_pair(dilation);

        return mx::conv2d(
            input, weight, stride_hw, padding_hw, dilation_hw, groups, s);
      },
      nb::arg(),
      nb::arg(),
      "stride"_a = 1,
      "padding"_a = 0,
      "dilation"_a = 1,
      "groups"_a = 1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conv2d(input: array, weight: array, /, stride: Union[int, tuple[int, int]] = 1, padding: Union[int, tuple[int, int]] = 0, dilation: Union[int, tuple[int, int]] = 1, groups: int = 1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        2D convolution over an input with several channels

        Args:
            input (array): Input array of shape ``(N, H, W, C_in)``.
            weight (array): Weight array of shape ``(C_out, KH, KW, C_in)``.
            stride (int or tuple(int), optional): :obj:`tuple` of size 2 with
                kernel strides. All spatial dimensions get the same stride if
                only one number is specified. Default: ``1``.
            padding (int or tuple(int), optional): :obj:`tuple` of size 2 with
                symmetric input padding. All spatial dimensions get the same
                padding if only one number is specified. Default: ``0``.
            dilation (int or tuple(int), optional): :obj:`tuple` of size 2 with
                kernel dilation. All spatial dimensions get the same dilation
                if only one number is specified. Default: ``1``
            groups (int, optional): input feature groups. Default: ``1``.

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding for conv3d. stride/padding/dilation each accept either a
  // single int or a 3-tuple of ints; a single int is replicated for all three
  // spatial dimensions before calling mx::conv3d.
  m.def(
      "conv3d",
      [](const mx::array& input,
         const mx::array& weight,
         const std::variant<int, std::tuple<int, int, int>>& stride,
         const std::variant<int, std::tuple<int, int, int>>& padding,
         const std::variant<int, std::tuple<int, int, int>>& dilation,
         int groups,
         mx::StreamOrDevice s) {
        using IntTriple = std::tuple<int, int, int>;

        // Replicate a lone int three times; return a supplied tuple unchanged.
        auto to_triple =
            [](const std::variant<int, IntTriple>& value) -> IntTriple {
          if (const int* scalar = std::get_if<int>(&value)) {
            return IntTriple{*scalar, *scalar, *scalar};
          }
          return std::get<IntTriple>(value);
        };

        const IntTriple stride_dhw = to_triple(stride);
        const IntTriple padding_dhw = to_triple(padding);
        const IntTriple dilation_dhw = to_triple(dilation);

        return mx::conv3d(
            input,
            weight,
            stride_dhw,
            padding_dhw,
            dilation_dhw,
            groups,
            s);
      },
      nb::arg(),
      nb::arg(),
      "stride"_a = 1,
      "padding"_a = 0,
      "dilation"_a = 1,
      "groups"_a = 1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conv3d(input: array, weight: array, /, stride: Union[int, tuple[int, int, int]] = 1, padding: Union[int, tuple[int, int, int]] = 0, dilation: Union[int, tuple[int, int, int]] = 1, groups: int = 1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        3D convolution over an input with several channels

        Note: Only the default ``groups=1`` is currently supported.

        Args:
            input (array): Input array of shape ``(N, D, H, W, C_in)``.
            weight (array): Weight array of shape ``(C_out, KD, KH, KW, C_in)``.
            stride (int or tuple(int), optional): :obj:`tuple` of size 3 with
                kernel strides. All spatial dimensions get the same stride if
                only one number is specified. Default: ``1``.
            padding (int or tuple(int), optional): :obj:`tuple` of size 3 with
                symmetric input padding. All spatial dimensions get the same
                padding if only one number is specified. Default: ``0``.
            dilation (int or tuple(int), optional): :obj:`tuple` of size 3 with
                kernel dilation. All spatial dimensions get the same dilation
                if only one number is specified. Default: ``1``
            groups (int, optional): input feature groups. Default: ``1``.

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding for conv_transpose1d. The C++ function
  // mx::conv_transpose1d is registered directly (no wrapper lambda): the two
  // leading arguments are unnamed, stride/padding/dilation/output_padding/
  // groups carry the defaults listed below, and `stream` is keyword-only.
  m.def(
      "conv_transpose1d",
      &mx::conv_transpose1d,
      nb::arg(),
      nb::arg(),
      "stride"_a = 1,
      "padding"_a = 0,
      "dilation"_a = 1,
      "output_padding"_a = 0,
      "groups"_a = 1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conv_transpose1d(input: array, weight: array, /, stride: int = 1, padding: int = 0, dilation: int = 1, output_padding: int = 0, groups: int = 1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        1D transposed convolution over an input with several channels

        Args:
            input (array): Input array of shape ``(N, L, C_in)``.
            weight (array): Weight array of shape ``(C_out, K, C_in)``.
            stride (int, optional): Kernel stride. Default: ``1``.
            padding (int, optional): Input padding. Default: ``0``.
            dilation (int, optional): Kernel dilation. Default: ``1``.
            output_padding (int, optional): Output padding. Default: ``0``.
            groups (int, optional): Input feature groups. Default: ``1``.

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding for conv_transpose2d. stride/padding/dilation/
  // output_padding each accept either a single int or a pair of ints; a
  // single int is duplicated for both spatial dimensions before calling
  // mx::conv_transpose2d.
  m.def(
      "conv_transpose2d",
      [](const mx::array& input,
         const mx::array& weight,
         const std::variant<int, std::pair<int, int>>& stride,
         const std::variant<int, std::pair<int, int>>& padding,
         const std::variant<int, std::pair<int, int>>& dilation,
         const std::variant<int, std::pair<int, int>>& output_padding,
         int groups,
         mx::StreamOrDevice s) {
        using IntPair = std::pair<int, int>;

        // Duplicate a lone int into a pair; return a supplied pair unchanged.
        auto to_pair = [](const std::variant<int, IntPair>& value) -> IntPair {
          if (const int* scalar = std::get_if<int>(&value)) {
            return IntPair{*scalar, *scalar};
          }
          return std::get<IntPair>(value);
        };

        const IntPair stride_hw = to_pair(stride);
        const IntPair padding_hw = to_pair(padding);
        const IntPair dilation_hw = to_pair(dilation);
        const IntPair output_padding_hw = to_pair(output_padding);

        return mx::conv_transpose2d(
            input,
            weight,
            stride_hw,
            padding_hw,
            dilation_hw,
            output_padding_hw,
            groups,
            s);
      },
      nb::arg(),
      nb::arg(),
      "stride"_a = 1,
      "padding"_a = 0,
      "dilation"_a = 1,
      "output_padding"_a = 0,
      "groups"_a = 1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conv_transpose2d(input: array, weight: array, /, stride: Union[int, Tuple[int, int]] = 1, padding: Union[int, Tuple[int, int]] = 0, dilation: Union[int, Tuple[int, int]] = 1, output_padding: Union[int, Tuple[int, int]] = 0, groups: int = 1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        2D transposed convolution over an input with several channels

        Note: Only the default ``groups=1`` is currently supported.

        Args:
            input (array): Input array of shape ``(N, H, W, C_in)``.
            weight (array): Weight array of shape ``(C_out, KH, KW, C_in)``.
            stride (int or tuple(int), optional): :obj:`tuple` of size 2 with
                kernel strides. All spatial dimensions get the same stride if
                only one number is specified. Default: ``1``.
            padding (int or tuple(int), optional): :obj:`tuple` of size 2 with
                symmetric input padding. All spatial dimensions get the same
                padding if only one number is specified. Default: ``0``.
            dilation (int or tuple(int), optional): :obj:`tuple` of size 2 with
                kernel dilation. All spatial dimensions get the same dilation
                if only one number is specified. Default: ``1``
            output_padding (int or tuple(int), optional): :obj:`tuple` of size 2 with
                output padding. All spatial dimensions get the same output
                padding if only one number is specified. Default: ``0``.
            groups (int, optional): input feature groups. Default: ``1``.

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding for conv_transpose3d. stride/padding/dilation/
  // output_padding each accept either a single int or a 3-tuple of ints; a
  // single int is replicated for all three spatial dimensions before calling
  // mx::conv_transpose3d.
  m.def(
      "conv_transpose3d",
      [](const mx::array& input,
         const mx::array& weight,
         const std::variant<int, std::tuple<int, int, int>>& stride,
         const std::variant<int, std::tuple<int, int, int>>& padding,
         const std::variant<int, std::tuple<int, int, int>>& dilation,
         const std::variant<int, std::tuple<int, int, int>>& output_padding,
         int groups,
         mx::StreamOrDevice s) {
        using IntTriple = std::tuple<int, int, int>;

        // Replicate a lone int three times; return a supplied tuple unchanged.
        auto to_triple =
            [](const std::variant<int, IntTriple>& value) -> IntTriple {
          if (const int* scalar = std::get_if<int>(&value)) {
            return IntTriple{*scalar, *scalar, *scalar};
          }
          return std::get<IntTriple>(value);
        };

        const IntTriple stride_dhw = to_triple(stride);
        const IntTriple padding_dhw = to_triple(padding);
        const IntTriple dilation_dhw = to_triple(dilation);
        const IntTriple output_padding_dhw = to_triple(output_padding);

        return mx::conv_transpose3d(
            input,
            weight,
            stride_dhw,
            padding_dhw,
            dilation_dhw,
            output_padding_dhw,
            groups,
            s);
      },
      nb::arg(),
      nb::arg(),
      "stride"_a = 1,
      "padding"_a = 0,
      "dilation"_a = 1,
      "output_padding"_a = 0,
      "groups"_a = 1,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conv_transpose3d(input: array, weight: array, /, stride: Union[int, Tuple[int, int, int]] = 1, padding: Union[int, Tuple[int, int, int]] = 0, dilation: Union[int, Tuple[int, int, int]] = 1, output_padding: Union[int, Tuple[int, int, int]] = 0, groups: int = 1, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        3D transposed convolution over an input with several channels

        Note: Only the default ``groups=1`` is currently supported.

        Args:
            input (array): Input array of shape ``(N, D, H, W, C_in)``.
            weight (array): Weight array of shape ``(C_out, KD, KH, KW, C_in)``.
            stride (int or tuple(int), optional): :obj:`tuple` of size 3 with
                kernel strides. All spatial dimensions get the same stride if
                only one number is specified. Default: ``1``.
            padding (int or tuple(int), optional): :obj:`tuple` of size 3 with
                symmetric input padding. All spatial dimensions get the same
                padding if only one number is specified. Default: ``0``.
            dilation (int or tuple(int), optional): :obj:`tuple` of size 3 with
                kernel dilation. All spatial dimensions get the same dilation
                if only one number is specified. Default: ``1``
            output_padding (int or tuple(int), optional): :obj:`tuple` of size 3 with
                output padding. All spatial dimensions get the same output
                padding if only one number is specified. Default: ``0``.
            groups (int, optional): input feature groups. Default: ``1``.

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding for mx::conv_general. stride and the two dilation
  // arguments accept an int or a list of ints; padding additionally accepts a
  // (low, high) pair of lists. The variants are normalized to std::vector<int>
  // before the call.
  m.def(
      "conv_general",
      [](const mx::array& input,
         const mx::array& weight,
         const std::variant<int, std::vector<int>>& stride,
         const std::variant<
             int,
             std::vector<int>,
             std::pair<std::vector<int>, std::vector<int>>>& padding,
         const std::variant<int, std::vector<int>>& kernel_dilation,
         const std::variant<int, std::vector<int>>& input_dilation,
         int groups,
         bool flip,
         mx::StreamOrDevice s) {
        using IntVec = std::vector<int>;
        using IntVecPair = std::pair<IntVec, IntVec>;

        // A bare int becomes a one-element vector; a vector is copied as is.
        auto to_vec = [](const std::variant<int, IntVec>& value) -> IntVec {
          if (const int* scalar = std::get_if<int>(&value)) {
            return IntVec{*scalar};
          }
          return std::get<IntVec>(value);
        };

        IntVec stride_vec = to_vec(stride);
        IntVec kernel_dilation_vec = to_vec(kernel_dilation);
        IntVec input_dilation_vec = to_vec(input_dilation);

        // Padding has a third form: separate low and high lists.
        IntVec padding_lo_vec;
        IntVec padding_hi_vec;
        if (const int* scalar = std::get_if<int>(&padding)) {
          padding_lo_vec.push_back(*scalar);
          padding_hi_vec.push_back(*scalar);
        } else if (const IntVec* both = std::get_if<IntVec>(&padding)) {
          padding_lo_vec = *both;
          padding_hi_vec = *both;
        } else {
          // Bind by reference so each half is copied exactly once.
          const auto& [lo, hi] = std::get<IntVecPair>(padding);
          padding_lo_vec = lo;
          padding_hi_vec = hi;
        }

        // input and weight are const references, so std::move would be a
        // no-op; only the locally built vectors are moved.
        return mx::conv_general(
            /* array input = */ input,
            /* array weight = */ weight,
            /* std::vector<int> stride = */ std::move(stride_vec),
            /* std::vector<int> padding_lo = */ std::move(padding_lo_vec),
            /* std::vector<int> padding_hi = */ std::move(padding_hi_vec),
            /* std::vector<int> kernel_dilation = */
            std::move(kernel_dilation_vec),
            /* std::vector<int> input_dilation = */
            std::move(input_dilation_vec),
            /* int groups = */ groups,
            /* bool flip = */ flip,
            s);
      },
      nb::arg(),
      nb::arg(),
      "stride"_a = 1,
      "padding"_a = 0,
      "kernel_dilation"_a = 1,
      "input_dilation"_a = 1,
      "groups"_a = 1,
      "flip"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def conv_general(input: array, weight: array, /, stride: Union[int, Sequence[int]] = 1, padding: Union[int, Sequence[int], tuple[Sequence[int], Sequence[int]]] = 0, kernel_dilation: Union[int, Sequence[int]] = 1, input_dilation: Union[int, Sequence[int]] = 1, groups: int = 1, flip: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        General convolution over an input with several channels

        Args:
            input (array): Input array of shape ``(N, ..., C_in)``.
            weight (array): Weight array of shape ``(C_out, ..., C_in)``.
            stride (int or list(int), optional): :obj:`list` with kernel strides.
                All spatial dimensions get the same stride if
                only one number is specified. Default: ``1``.
            padding (int, list(int), or tuple(list(int), list(int)), optional):
                :obj:`list` with input padding. All spatial dimensions get the same
                padding if only one number is specified. Default: ``0``.
            kernel_dilation (int or list(int), optional): :obj:`list` with
                kernel dilation. All spatial dimensions get the same dilation
                if only one number is specified. Default: ``1``
            input_dilation (int or list(int), optional): :obj:`list` with
                input dilation. All spatial dimensions get the same dilation
                if only one number is specified. Default: ``1``
            groups (int, optional): Input feature groups. Default: ``1``.
            flip (bool, optional): Flip the order in which the spatial dimensions of
                the weights are processed. Performs the cross-correlation operator when
                ``flip`` is ``False`` and the convolution operator otherwise.
                Default: ``False``.

        Returns:
            array: The convolved array.
      )pbdoc");
  // Python binding `save`, implemented by mlx_save_helper. Both arguments are
  // named (`file`, `arr`), so they may be passed by keyword.
  m.def(
      "save",
      &mlx_save_helper,
      "file"_a,
      "arr"_a,
      nb::sig(
          "def save(file: Union[file, str, pathlib.Path], arr: array) -> None"),
      R"pbdoc(
        Save the array to a binary file in ``.npy`` format.

        Args:
            file (str, pathlib.Path, file): File to which the array is saved
            arr (array): Array to be saved.
      )pbdoc");
  // Python binding `savez`: forwards to mlx_savez_helper with compression off.
  m.def(
      "savez",
      [](nb::object file, nb::args args, const nb::kwargs& kwargs) {
        constexpr bool compressed = false;
        mlx_savez_helper(file, args, kwargs, compressed);
      },
      "file"_a,
      "args"_a,
      "kwargs"_a,
      nb::sig(
          "def savez(file: Union[file, str, pathlib.Path], *args, **kwargs)"),
      R"pbdoc(
        Save several arrays to a binary file in uncompressed ``.npz``
        format.

        .. code-block:: python

            import mlx.core as mx

            x = mx.ones((10, 10))
            mx.savez("my_path.npz", x=x)

            import mlx.nn as nn
            from mlx.utils import tree_flatten

            model = nn.TransformerEncoder(6, 128, 4)
            flat_params = tree_flatten(model.parameters())
            mx.savez("model.npz", **dict(flat_params))

        Args:
            file (file, str, pathlib.Path): Path to file to which the arrays are saved.
            *args (arrays): Arrays to be saved.
            **kwargs (arrays): Arrays to be saved. Each array will be saved
              with the associated keyword as the output file name.
      )pbdoc");
  // Python binding `savez_compressed`: forwards to mlx_savez_helper with
  // compression on.
  m.def(
      "savez_compressed",
      [](nb::object file, nb::args args, const nb::kwargs& kwargs) {
        constexpr bool compressed = true;
        mlx_savez_helper(file, args, kwargs, compressed);
      },
      nb::arg(),
      "args"_a,
      "kwargs"_a,
      nb::sig(
          "def savez_compressed(file: Union[file, str, pathlib.Path], *args, **kwargs)"),
      R"pbdoc(
        Save several arrays to a binary file in compressed ``.npz`` format.

        Args:
            file (file, str, pathlib.Path): Path to file to which the arrays are saved.
            *args (arrays): Arrays to be saved.
            **kwargs (arrays): Arrays to be saved. Each array will be saved
              with the associated keyword as the output file name.
      )pbdoc");
  // Python binding `load`, implemented by mlx_load_helper. `file` is unnamed
  // (positional-only); `stream` is keyword-only.
  m.def(
      "load",
      &mlx_load_helper,
      nb::arg(),
      "format"_a = nb::none(),
      "return_metadata"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def load(file: Union[file, str, pathlib.Path], /, format: Optional[str] = None, return_metadata: bool = False, *, stream: Union[None, Stream, Device] = None) -> Union[array, dict[str, array], Tuple[dict[str, array], dict[str, Any]]]"),
      R"pbdoc(
        Load array(s) from a binary file.

        The supported formats are ``.npy``, ``.npz``, ``.safetensors``, and
        ``.gguf``.

        Args:
            file (file, str, pathlib.Path): File in which the array is saved.
            format (str, optional): Format of the file. If ``None``, the
              format is inferred from the file extension. Supported formats:
              ``npy``, ``npz``, ``safetensors``, and ``gguf``.
              Default: ``None``.
            return_metadata (bool, optional): Load the metadata for formats
              which support metadata. The metadata will be returned as an
              additional dictionary. Default: ``False``.

        Returns:
            array, dict, or tuple:
                A single array if loading from a ``.npy`` file or a dict
                mapping names to arrays if loading from a ``.npz`` or
                ``.safetensors`` file. If ``return_metadata`` is ``True`` a
                tuple ``(arrays, metadata)`` will be returned where the second
                element is a dictionary containing the metadata.

        Warning:

          When loading unsupported quantization formats from GGUF, tensors
          will automatically cast to ``mx.float16``
      )pbdoc");
  // Python binding `save_safetensors`, implemented by
  // mlx_save_safetensor_helper. `metadata` defaults to None.
  m.def(
      "save_safetensors",
      &mlx_save_safetensor_helper,
      "file"_a,
      "arrays"_a,
      "metadata"_a = nb::none(),
      nb::sig(
          "def save_safetensors(file: Union[file, str, pathlib.Path], arrays: dict[str, array], metadata: Optional[dict[str, str]] = None)"),
      R"pbdoc(
        Save array(s) to a binary file in ``.safetensors`` format.

        See the `Safetensors documentation
        <https://huggingface.co/docs/safetensors/index>`_ for more
        information on the format.

        Args:
            file (file, str, pathlib.Path): File in which the array is saved.
            arrays (dict(str, array)): The dictionary of names to arrays to
              be saved.
            metadata (dict(str, str), optional): The dictionary of
              metadata to be saved.
      )pbdoc");
  // Python binding `save_gguf`, implemented by mlx_save_gguf_helper.
  // `metadata` defaults to None, and the advertised signature says so.
  m.def(
      "save_gguf",
      &mlx_save_gguf_helper,
      "file"_a,
      "arrays"_a,
      "metadata"_a = nb::none(),
      nb::sig(
          "def save_gguf(file: Union[file, str, pathlib.Path], arrays: dict[str, array], metadata: Optional[dict[str, Union[array, str, list[str]]]] = None)"),
      R"pbdoc(
        Save array(s) to a binary file in ``.gguf`` format.

        See the `GGUF documentation
        <https://github.com/ggerganov/ggml/blob/master/docs/gguf.md>`_ for
        more information on the format.

        Args:
            file (file, str, pathlib.Path): File in which the array is saved.
            arrays (dict(str, array)): The dictionary of names to arrays to
              be saved.
            metadata (dict(str, Union[array, str, list(str)]), optional): The
               dictionary of metadata to be saved. The values can be a scalar
               or 1D :obj:`array`, a :obj:`str`, or a :obj:`list` of
               :obj:`str`. Default: ``None``.
      )pbdoc");
  // Python binding `where`. All three inputs may be scalars or arrays: x and
  // y are converted together via to_arrays, the condition via to_array.
  m.def(
      "where",
      [](const ScalarOrArray& condition,
         const ScalarOrArray& x_,
         const ScalarOrArray& y_,
         mx::StreamOrDevice s) {
        auto [x, y] = to_arrays(x_, y_);
        return mx::where(to_array(condition), x, y, s);
      },
      "condition"_a,
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def where(condition: Union[scalar, array], x: Union[scalar, array], y: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Select from ``x`` or ``y`` according to ``condition``.

        The condition and input arrays must be the same shape or
        broadcastable with one another.

        Args:
          condition (scalar or array): The condition array.
          x (scalar or array): The input selected from where condition is
            ``True``.
          y (scalar or array): The input selected from where condition is
            ``False``.

        Returns:
            array: The output containing elements selected from
            ``x`` and ``y``.
      )pbdoc");
  // Python binding `nan_to_num`. The input may be a scalar or an array and is
  // converted with to_array; posinf/neginf are optional and passed through.
  m.def(
      "nan_to_num",
      [](const ScalarOrArray& a,
         float nan,
         const std::optional<float>& posinf,
         const std::optional<float>& neginf,
         mx::StreamOrDevice s) {
        return mx::nan_to_num(to_array(a), nan, posinf, neginf, s);
      },
      nb::arg(),
      "nan"_a = 0.0f,
      "posinf"_a = nb::none(),
      "neginf"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def nan_to_num(a: Union[scalar, array], /, nan: float = 0, posinf: Optional[float] = None, neginf: Optional[float] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Replace NaN and Inf values with finite numbers.

        Args:
            a (array): Input array
            nan (float, optional): Value to replace NaN with. Default: ``0``.
            posinf (float, optional): Value to replace positive infinities
              with. If ``None``, defaults to largest finite value for the
              given data type. Default: ``None``.
            neginf (float, optional): Value to replace negative infinities
              with. If ``None``, defaults to the negative of the largest
              finite value for the given data type. Default: ``None``.

        Returns:
            array: Output array with NaN and Inf replaced.
    )pbdoc");
  // Python binding `round`. `stream` is registered after nb::kw_only(), so the
  // advertised signature marks it keyword-only with `*`.
  m.def(
      "round",
      [](const ScalarOrArray& a, int decimals, mx::StreamOrDevice s) {
        return mx::round(to_array(a), decimals, s);
      },
      nb::arg(),
      "decimals"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def round(a: array, /, decimals: int = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Round to the given number of decimals.

        Basically performs:

        .. code-block:: python

          s = 10**decimals
          x = round(x * s) / s

        Args:
          a (array): Input array
          decimals (int): Number of decimal places to round to. (default: 0)

        Returns:
          array: An array of the same type as ``a`` rounded to the
          given number of decimals.
      )pbdoc");
  // Python binding bound directly to mx::quantized_matmul. The first two
  // arguments are unnamed (nb::arg()); `stream` follows nb::kw_only().
  m.def(
      "quantized_matmul",
      &mx::quantized_matmul,
      nb::arg(),
      nb::arg(),
      "scales"_a,
      "biases"_a = nb::none(),
      "transpose"_a = true,
      "group_size"_a = nb::none(),
      "bits"_a = nb::none(),
      "mode"_a = "affine",
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def quantized_matmul(x: array, w: array, /, scales: array, biases: Optional[array] = None, transpose: bool = True, group_size: Optional[int] = None, bits: Optional[int] = None, mode: str = 'affine', *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Perform the matrix multiplication with the quantized matrix ``w``. The
        quantization uses one floating point scale and bias per ``group_size`` of
        elements. Each element in ``w`` takes ``bits`` bits and is packed in an
        unsigned 32 bit integer.

        Args:
          x (array): Input array
          w (array): Quantized matrix packed in unsigned integers
          scales (array): The scales to use per ``group_size`` elements of ``w``
          biases (array, optional): The biases to use per ``group_size``
            elements of ``w``. Default: ``None``.
          transpose (bool, optional): Defines whether to multiply with the
            transposed ``w`` or not, namely whether we are performing
            ``x @ w.T`` or ``x @ w``. Default: ``True``.
          group_size (int, optional): The size of the group in ``w`` that shares a
            scale and bias. See supported values and defaults in the
            :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
          bits (int, optional): The number of bits occupied by each element of
            ``w`` in the quantized array. See supported values and defaults in the
            :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
          mode (str, optional): The quantization mode. Default: ``"affine"``.

        Returns:
          array: The result of the multiplication of ``x`` with ``w``.
      )pbdoc");
  // Python binding bound directly to mx::quantize. Only `stream` follows
  // nb::kw_only(); `global_scale` is registered before it and is therefore
  // positional-or-keyword, which the advertised signature reflects.
  m.def(
      "quantize",
      &mx::quantize,
      nb::arg(),
      "group_size"_a = nb::none(),
      "bits"_a = nb::none(),
      "mode"_a = "affine",
      "global_scale"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def quantize(w: array, /, group_size: Optional[int] = None, bits: Optional[int] = None, mode: str = 'affine', global_scale: Optional[array] = None, *, stream: Union[None, Stream, Device] = None) -> tuple[array, array, array]"),
      R"pbdoc(
        Quantize the array ``w``.

        Note, every ``group_size`` elements in a row of ``w`` are quantized
        together. Hence, the last dimension of ``w`` should be divisible by
        ``group_size``.

        .. warning::

          ``quantize`` only supports inputs with two or more dimensions with
          the last dimension divisible by ``group_size``

        The supported quantization modes are ``"affine"``, ``"mxfp4"``,
        ``"mxfp8"``, and ``"nvfp4"``. They are described in more detail below.

        Args:
          w (array): Array to be quantized
          group_size (int, optional): The size of the group in ``w`` that shares a
            scale and bias. See supported values and defaults in the
            :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
          bits (int, optional): The number of bits occupied by each element of
            ``w`` in the quantized array. See supported values and defaults in the
            :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
          mode (str, optional): The quantization mode. Default: ``"affine"``.
          global_scale (array, optional): The per-input float32 scale used for
            ``"nvfp4"`` quantization if provided. Default: ``None``.

        Returns:
          tuple: A tuple with either two or three elements containing:

          * w_q (array): The quantized version of ``w``
          * scales (array): The quantization scales
          * biases (array): The quantization biases (returned for ``mode=="affine"``).

        Notes:
          .. _quantize-modes:

          .. table:: Quantization modes

            ======  ======================   ==========================  =============  =====
            mode    group size               bits                        scale type     bias
            ======  ======================   ==========================  =============  =====
            affine  32, 64\ :sup:`*`, 128    2, 3, 4\ :sup:`*`, 5, 6, 8  same as input  yes
            mxfp4   32\ :sup:`*`             4\ :sup:`*`                 e8m0           no
            mxfp8   32\ :sup:`*`             8\ :sup:`*`                 e8m0           no
            nvfp4   16\ :sup:`*`             4\ :sup:`*`                 e4m3           no
            ======  ======================   ==========================  =============  =====

          :sup:`*` indicates the default value when unspecified.

          The ``"affine"`` mode quantizes groups of :math:`g` consecutive
          elements in a row of ``w``. For each group the quantized
          representation of each element :math:`\hat{w_i}` is computed as follows:

          .. math::

            \begin{aligned}
              \alpha &= \max_i w_i \\
              \beta &= \min_i w_i \\
              s &= \frac{\alpha - \beta}{2^b - 1} \\
              \hat{w_i} &= \textrm{round}\left( \frac{w_i - \beta}{s}\right).
            \end{aligned}

          After the above computation, :math:`\hat{w_i}` fits in :math:`b` bits
          and is packed in an unsigned 32-bit integer from the lower to upper
          bits. For instance, for 4-bit quantization we fit 8 elements in an
          unsigned 32 bit integer where the 1st element occupies the 4 least
          significant bits, the 2nd bits 4-7 etc.

          To dequantize the elements of ``w``, we also save :math:`s` and
          :math:`\beta` which are the returned ``scales`` and
          ``biases`` respectively.

          The ``"mxfp4"``, ``"mxfp8"``, and ``"nvfp4"`` modes similarly
          quantize groups of :math:`g` elements of ``w``. For the ``"mx"``
          modes, the group size must be ``32``.  For ``"nvfp4"`` the group
          size must be 16. The elements are quantized to 4-bit or 8-bit
          precision floating-point values: E2M1 for ``"fp4"`` and E4M3 for
          ``"fp8"``. There is a shared 8-bit scale per group. The ``"mx"``
          modes use an E8M0 scale and the ``"nv"`` mode uses an E4M3 scale.
          Unlike ``affine`` quantization, these modes do not have a bias
          value.

          More details on the ``"mx"`` formats can
          be found in the `specification <https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf>`_.
      )pbdoc");
  // Python binding bound directly to mx::dequantize. `w` is unnamed
  // (positional-only); only `stream` follows nb::kw_only().
  m.def(
      "dequantize",
      &mx::dequantize,
      nb::arg(),
      "scales"_a,
      "biases"_a = nb::none(),
      "group_size"_a = nb::none(),
      "bits"_a = nb::none(),
      "mode"_a = "affine",
      "global_scale"_a = nb::none(),
      "dtype"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def dequantize(w: array, /, scales: array, biases: Optional[array] = None, group_size: Optional[int] = None, bits: Optional[int] = None, mode: str = 'affine', global_scale: Optional[array] = None, dtype: Optional[Dtype] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Dequantize the matrix ``w`` using quantization parameters.

        Args:
          w (array): Matrix to be dequantized
          scales (array): The scales to use per ``group_size`` elements of ``w``.
          biases (array, optional): The biases to use per ``group_size``
             elements of ``w``. Default: ``None``.
          group_size (int, optional): The size of the group in ``w`` that shares a
            scale and bias. See supported values and defaults in the
            :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
          bits (int, optional): The number of bits occupied by each element of
            ``w`` in the quantized array. See supported values and defaults in the
            :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
          mode (str, optional): The quantization mode. Default: ``"affine"``.
          global_scale (array, optional): The per-input float32 scale used for
            ``"nvfp4"`` quantization if provided. Default: ``None``.
          dtype (Dtype, optional): The data type of the dequantized output. If
            ``None`` the return type is inferred from the scales and biases
            when possible and otherwise defaults to ``bfloat16``.
            Default: ``None``.

        Returns:
          array: The dequantized version of ``w``

        Notes:
          The currently supported quantization modes are ``"affine"``,
          ``"mxfp4"``, ``"mxfp8"``, and ``"nvfp4"``.

          For ``affine`` quantization, given the notation in :func:`quantize`,
          we compute :math:`w_i` from :math:`\hat{w_i}` and corresponding :math:`s`
          and :math:`\beta` as follows

          .. math::

            w_i = s \hat{w_i} + \beta
      )pbdoc");
  // Python binding bound directly to mx::gather_qmm. The first two arguments
  // are unnamed (nb::arg()); `sorted_indices` and `stream` follow
  // nb::kw_only() and are therefore keyword-only.
  m.def(
      "gather_qmm",
      &mx::gather_qmm,
      nb::arg(),
      nb::arg(),
      "scales"_a,
      "biases"_a = nb::none(),
      "lhs_indices"_a = nb::none(),
      "rhs_indices"_a = nb::none(),
      "transpose"_a = true,
      "group_size"_a = nb::none(),
      "bits"_a = nb::none(),
      "mode"_a = "affine",
      nb::kw_only(),
      "sorted_indices"_a = false,
      "stream"_a = nb::none(),
      nb::sig(
          "def gather_qmm(x: array, w: array, /, scales: array, biases: Optional[array] = None, lhs_indices: Optional[array] = None, rhs_indices: Optional[array] = None, transpose: bool = True, group_size: Optional[int] = None, bits: Optional[int] = None, mode: str = 'affine', *, sorted_indices: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Perform quantized matrix multiplication with matrix-level gather.

        This operation is the quantized equivalent to :func:`gather_mm`.
        Similar to :func:`gather_mm`, the indices ``lhs_indices`` and
        ``rhs_indices`` contain flat indices along the batch dimensions (i.e.
        all but the last two dimensions) of ``x`` and ``w`` respectively.

        Note that ``scales`` and ``biases`` must have the same batch dimensions
        as ``w`` since they represent the same quantized matrix.

        Args:
            x (array): Input array
            w (array): Quantized matrix packed in unsigned integers
            scales (array): The scales to use per ``group_size`` elements of ``w``
            biases (array, optional): The biases to use per ``group_size``
              elements of ``w``. Default: ``None``.
            lhs_indices (array, optional): Integer indices for ``x``. Default: ``None``.
            rhs_indices (array, optional): Integer indices for ``w``. Default: ``None``.
            transpose (bool, optional): Defines whether to multiply with the
              transposed ``w`` or not, namely whether we are performing
              ``x @ w.T`` or ``x @ w``. Default: ``True``.
            group_size (int, optional): The size of the group in ``w`` that shares a
              scale and bias. See supported values and defaults in the
              :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
            bits (int, optional): The number of bits occupied by each element of
              ``w`` in the quantized array. See supported values and defaults in the
              :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
            mode (str, optional): The quantization mode. Default: ``"affine"``.
            sorted_indices (bool, optional): May allow a faster implementation
              if the passed indices are sorted. Default: ``False``.

        Returns:
            array: The result of the multiplication of ``x`` with ``w``
              after gathering using ``lhs_indices`` and ``rhs_indices``.
      )pbdoc");
  // Python binding bound directly to mx::segmented_mm. The two matrix
  // operands are unnamed (nb::arg()); `segments` is named and `stream` is
  // keyword-only.
  m.def(
      "segmented_mm",
      &mx::segmented_mm,
      nb::arg(),
      nb::arg(),
      "segments"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def segmented_mm(a: array, b: array, /, segments: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Perform a matrix multiplication but segment the inner dimension and
        save the result for each segment separately.

        Args:
          a (array): Input array of shape ``MxK``.
          b (array): Input array of shape ``KxN``.
          segments (array): The offsets into the inner dimension for each segment.

        Returns:
          array: The result per segment of shape ``MxN``.
      )pbdoc");
  // Python binding `tensordot`. `axes` is either an integer count or a list
  // holding exactly two axis lists (one for each operand).
  m.def(
      "tensordot",
      [](const mx::array& a,
         const mx::array& b,
         const std::variant<int, std::vector<std::vector<int>>>& axes,
         mx::StreamOrDevice s) {
        // Integer form: hand the count straight to the library.
        if (const int* count = std::get_if<int>(&axes)) {
          return mx::tensordot(a, b, *count, s);
        }

        // List form: require exactly two axis lists.
        const auto& axis_lists = std::get<std::vector<std::vector<int>>>(axes);
        if (axis_lists.size() != 2) {
          throw std::invalid_argument(
              "[tensordot] axes must be a list of two lists.");
        }
        return mx::tensordot(a, b, axis_lists[0], axis_lists[1], s);
      },
      nb::arg(),
      nb::arg(),
      "axes"_a = 2,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def tensordot(a: array, b: array, /, axes: Union[int, list[Sequence[int]]] = 2, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Compute the tensor dot product along the specified axes.

        Args:
            a (array): Input array
            b (array): Input array
            axes (int or list(list(int)), optional): The number of dimensions to
              sum over. If an integer is provided, then sum over the last
              ``axes`` dimensions of ``a`` and the first ``axes`` dimensions of
              ``b``. If a list of lists is provided, then sum over the
              corresponding dimensions of ``a`` and ``b``. Default: 2.

        Returns:
            array: The tensor dot product.
      )pbdoc");
  // Python binding bound directly to mx::inner. Both operands are unnamed
  // (nb::arg()); `stream` is keyword-only.
  m.def(
      "inner",
      &mx::inner,
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def inner(a: array, b: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Ordinary inner product of vectors for 1-D arrays, in higher dimensions a sum product over the last axes.

      Args:
        a (array): Input array
        b (array): Input array

      Returns:
        array: The inner product.
    )pbdoc");
  // Python binding bound directly to mx::outer. Both operands are unnamed
  // (nb::arg()); `stream` is keyword-only.
  m.def(
      "outer",
      &mx::outer,
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def outer(a: array, b: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Compute the outer product of two 1-D arrays. If the arrays passed are not 1-D, a flatten op will be run beforehand.

      Args:
        a (array): Input array
        b (array): Input array

      Returns:
        array: The outer product.
    )pbdoc");
  // Python binding `tile`. `reps` may be a single int or a list of ints.
  m.def(
      "tile",
      [](const mx::array& a,
         const std::variant<int, std::vector<int>>& reps,
         mx::StreamOrDevice s) {
        const int* single = std::get_if<int>(&reps);
        if (single == nullptr) {
          // Already a list of repetition counts.
          return mx::tile(a, std::get<std::vector<int>>(reps), s);
        }
        // A lone integer is passed as a one-element repetition list.
        return mx::tile(a, {*single}, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def tile(a: array, reps: Union[int, Sequence[int]], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Construct an array by repeating ``a`` the number of times given by ``reps``.

      Args:
        a (array): Input array
        reps (int or list(int)): The number of times to repeat ``a`` along each axis.

      Returns:
        array: The tiled array.
    )pbdoc");
  // Python binding bound directly to mx::addmm. The three array operands are
  // unnamed (nb::arg()); `alpha` and `beta` default to 1.0 and `stream` is
  // keyword-only.
  m.def(
      "addmm",
      &mx::addmm,
      nb::arg(),
      nb::arg(),
      nb::arg(),
      "alpha"_a = 1.0f,
      "beta"_a = 1.0f,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def addmm(c: array, a: array, b: array, /, alpha: float = 1.0, beta: float = 1.0,  *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Matrix multiplication with addition and optional scaling.

        Perform the (possibly batched) matrix multiplication of two arrays and add to the result
        with optional scaling factors.

        Args:
            c (array): Input array or scalar.
            a (array): Input array or scalar.
            b (array): Input array or scalar.
            alpha (float, optional): Scaling factor for the
                matrix product of ``a`` and ``b`` (default: ``1``)
            beta (float, optional): Scaling factor for ``c`` (default: ``1``)

        Returns:
            array: ``alpha * (a @ b)  + beta * c``
      )pbdoc");
  // Bind mx::block_masked_mm directly. The docstring refers to the masks by
  // the keyword names registered below (mask_out / mask_lhs / mask_rhs).
  m.def(
      "block_masked_mm",
      &mx::block_masked_mm,
      nb::arg(),
      nb::arg(),
      "block_size"_a = 64,
      "mask_out"_a = nb::none(),
      "mask_lhs"_a = nb::none(),
      "mask_rhs"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def block_masked_mm(a: array, b: array, /, block_size: int = 64, mask_out: Optional[array] = None, mask_lhs: Optional[array] = None, mask_rhs: Optional[array] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Matrix multiplication with block masking.

        Perform the (possibly batched) matrix multiplication of two arrays with
        blocks of size ``block_size x block_size`` optionally masked out.

        Assuming ``a`` with shape (..., `M`, `K`) and ``b`` with shape (..., `K`, `N`)

        * ``mask_lhs`` must have shape (..., :math:`\lceil` `M` / ``block_size`` :math:`\rceil`, :math:`\lceil` `K` / ``block_size`` :math:`\rceil`)

        * ``mask_rhs`` must have shape (..., :math:`\lceil` `K` / ``block_size`` :math:`\rceil`, :math:`\lceil` `N` / ``block_size`` :math:`\rceil`)

        * ``mask_out`` must have shape (..., :math:`\lceil` `M` / ``block_size`` :math:`\rceil`, :math:`\lceil` `N` / ``block_size`` :math:`\rceil`)

        Note: Only ``block_size=64`` and ``block_size=32`` are currently supported

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.
            block_size (int, optional): Size of blocks to be masked. Must be ``32`` or ``64``. Default: ``64``.
            mask_out (array, optional): Mask for output. Default: ``None``.
            mask_lhs (array, optional): Mask for ``a``. Default: ``None``.
            mask_rhs (array, optional): Mask for ``b``. Default: ``None``.

        Returns:
            array: The output array.
      )pbdoc");
  // Bind mx::gather_mm directly. Both index arguments default to None in the
  // binding, so the signature string advertises them as optional.
  m.def(
      "gather_mm",
      &mx::gather_mm,
      nb::arg(),
      nb::arg(),
      "lhs_indices"_a = nb::none(),
      "rhs_indices"_a = nb::none(),
      nb::kw_only(),
      "sorted_indices"_a = false,
      "stream"_a = nb::none(),
      nb::sig(
          "def gather_mm(a: array, b: array, /, lhs_indices: Optional[array] = None, rhs_indices: Optional[array] = None, *, sorted_indices: bool = False, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Matrix multiplication with matrix-level gather.

        Performs a gather of the operands with the given indices followed by a
        (possibly batched) matrix multiplication of two arrays.  This operation
        is more efficient than explicitly applying a :func:`take` followed by a
        :func:`matmul`.

        The indices ``lhs_indices`` and ``rhs_indices`` contain flat indices
        along the batch dimensions (i.e. all but the last two dimensions) of
        ``a`` and ``b`` respectively.

        For ``a`` with shape ``(A1, A2, ..., AS, M, K)``, ``lhs_indices``
        contains indices from the range ``[0, A1 * A2 * ... * AS)``

        For ``b`` with shape ``(B1, B2, ..., BS, K, N)``, ``rhs_indices``
        contains indices from the range ``[0, B1 * B2 * ... * BS)``

        If only one index is passed and it is sorted, the ``sorted_indices``
        flag can be passed for a possible faster implementation.

        Args:
            a (array): Input array.
            b (array): Input array.
            lhs_indices (array, optional): Integer indices for ``a``. Default: ``None``
            rhs_indices (array, optional): Integer indices for ``b``. Default: ``None``
            sorted_indices (bool, optional): May allow a faster implementation
              if the passed indices are sorted. Default: ``False``.

        Returns:
            array: The output array.
      )pbdoc");
  m.def(
      "diagonal",
      &mx::diagonal,
      "a"_a,
      "offset"_a = 0,
      "axis1"_a = 0,
      "axis2"_a = 1,
      "stream"_a = nb::none(),
      nb::sig(
          "def diagonal(a: array, offset: int = 0, axis1: int = 0, axis2: int = 1, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return specified diagonals.

        If ``a`` is 2-D, then a 1-D array containing the diagonal at the given
        ``offset`` is returned.

        If ``a`` has more than two dimensions, then ``axis1`` and ``axis2``
        determine the 2D subarrays from which diagonals are extracted. The new
        shape is the original shape with ``axis1`` and ``axis2`` removed and a
        new dimension inserted at the end corresponding to the diagonal.

        Args:
          a (array): Input array
          offset (int, optional): Offset of the diagonal from the main diagonal.
            Can be positive or negative. Default: ``0``.
          axis1 (int, optional): The first axis of the 2-D sub-arrays from which
              the diagonals should be taken. Default: ``0``.
          axis2 (int, optional): The second axis of the 2-D sub-arrays from which
              the diagonals should be taken. Default: ``1``.

        Returns:
            array: The diagonals of the array.
      )pbdoc");
  m.def(
      "diag",
      &mx::diag,
      nb::arg(),
      "k"_a = 0,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def diag(a: array, /, k: int = 0, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Extract a diagonal or construct a diagonal matrix.
        If ``a`` is 1-D then a diagonal matrix is constructed with ``a`` on the
        :math:`k`-th diagonal. If ``a`` is 2-D then the :math:`k`-th diagonal is
        returned.

        Args:
            a (array): 1-D or 2-D input array.
            k (int, optional): The diagonal to extract or construct.
                Default: ``0``.

        Returns:
            array: The extracted diagonal or the constructed diagonal matrix.
        )pbdoc");
  m.def(
      "trace",
      [](const mx::array& a,
         int offset,
         int axis1,
         int axis2,
         std::optional<mx::Dtype> dtype,
         mx::StreamOrDevice s) {
        if (!dtype.has_value()) {
          return mx::trace(a, offset, axis1, axis2, s);
        }
        return mx::trace(a, offset, axis1, axis2, dtype.value(), s);
      },
      nb::arg(),
      "offset"_a = 0,
      "axis1"_a = 0,
      "axis2"_a = 1,
      "dtype"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def trace(a: array, /, offset: int = 0, axis1: int = 0, axis2: int = 1, dtype: Optional[Dtype] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Return the sum along a specified diagonal in the given array.

        Args:
          a (array): Input array
          offset (int, optional): Offset of the diagonal from the main diagonal.
            Can be positive or negative. Default: ``0``.
          axis1 (int, optional): The first axis of the 2-D sub-arrays from which
              the diagonals should be taken. Default: ``0``.
          axis2 (int, optional): The second axis of the 2-D sub-arrays from which
              the diagonals should be taken. Default: ``1``.
          dtype (Dtype, optional): Data type of the output array. If
              unspecified the output type is inferred from the input array.

        Returns:
            array: Sum of specified diagonal.
        )pbdoc");
  m.def(
      "atleast_1d",
      [](const nb::args& arys, mx::StreamOrDevice s) -> nb::object {
        if (arys.size() == 1) {
          return nb::cast(mx::atleast_1d(nb::cast<mx::array>(arys[0]), s));
        }
        return nb::cast(
            mx::atleast_1d(nb::cast<std::vector<mx::array>>(arys), s));
      },
      "arys"_a,
      "stream"_a = nb::none(),
      nb::sig(
          "def atleast_1d(*arys: array, stream: Union[None, Stream, Device] = None) -> Union[array, list[array]]"),
      R"pbdoc(
        Convert all arrays to have at least one dimension.

        Args:
            *arys: Input arrays.
            stream (Union[None, Stream, Device], optional): The stream to execute the operation on.

        Returns:
            array or list(array): An array or list of arrays with at least one dimension.
        )pbdoc");
  m.def(
      "atleast_2d",
      [](const nb::args& arys, mx::StreamOrDevice s) -> nb::object {
        if (arys.size() == 1) {
          return nb::cast(mx::atleast_2d(nb::cast<mx::array>(arys[0]), s));
        }
        return nb::cast(
            mx::atleast_2d(nb::cast<std::vector<mx::array>>(arys), s));
      },
      "arys"_a,
      "stream"_a = nb::none(),
      nb::sig(
          "def atleast_2d(*arys: array, stream: Union[None, Stream, Device] = None) -> Union[array, list[array]]"),
      R"pbdoc(
        Convert all arrays to have at least two dimensions.

        Args:
            *arys: Input arrays.
            stream (Union[None, Stream, Device], optional): The stream to execute the operation on.

        Returns:
            array or list(array): An array or list of arrays with at least two dimensions.
        )pbdoc");
  m.def(
      "atleast_3d",
      [](const nb::args& arys, mx::StreamOrDevice s) -> nb::object {
        if (arys.size() == 1) {
          return nb::cast(mx::atleast_3d(nb::cast<mx::array>(arys[0]), s));
        }
        return nb::cast(
            mx::atleast_3d(nb::cast<std::vector<mx::array>>(arys), s));
      },
      "arys"_a,
      "stream"_a = nb::none(),
      nb::sig(
          "def atleast_3d(*arys: array, stream: Union[None, Stream, Device] = None) -> Union[array, list[array]]"),
      R"pbdoc(
        Convert all arrays to have at least three dimensions.

        Args:
            *arys: Input arrays.
            stream (Union[None, Stream, Device], optional): The stream to execute the operation on.

        Returns:
            array or list(array): An array or list of arrays with at least three dimensions.
        )pbdoc");
  // issubdtype accepts either a Dtype or a Dtype::Category for each argument.
  // The lambda dispatches on the runtime Python type of both inputs and
  // throws std::invalid_argument when either is neither of the two.
  m.def(
      "issubdtype",
      [](const nb::object& d1, const nb::object& d2) {
        // Resolve the second argument once the first one's C++ type is known.
        auto dispatch_second = [](const auto& t1, const auto& d2) {
          if (nb::isinstance<mx::Dtype>(d2)) {
            return mx::issubdtype(t1, nb::cast<mx::Dtype>(d2));
          } else if (nb::isinstance<mx::Dtype::Category>(d2)) {
            return mx::issubdtype(t1, nb::cast<mx::Dtype::Category>(d2));
          } else {
            throw std::invalid_argument(
                "[issubdtype] Received invalid type for second input.");
          }
        };
        if (nb::isinstance<mx::Dtype>(d1)) {
          return dispatch_second(nb::cast<mx::Dtype>(d1), d2);
        } else if (nb::isinstance<mx::Dtype::Category>(d1)) {
          return dispatch_second(nb::cast<mx::Dtype::Category>(d1), d2);
        } else {
          throw std::invalid_argument(
              "[issubdtype] Received invalid type for first input.");
        }
      },
      ""_a,
      ""_a,
      nb::sig(
          "def issubdtype(arg1: Union[Dtype, DtypeCategory], arg2: Union[Dtype, DtypeCategory]) -> bool"),
      R"pbdoc(
        Check if a :obj:`Dtype` or :obj:`DtypeCategory` is a subtype
        of another.

        Args:
            arg1 (Union[Dtype, DtypeCategory]): First dtype or category.
            arg2 (Union[Dtype, DtypeCategory]): Second dtype or category.

        Returns:
            bool:
               A boolean indicating if the first input is a subtype of the
               second input.

        Example:

          >>> ints = mx.array([1, 2, 3], dtype=mx.int32)
          >>> mx.issubdtype(ints.dtype, mx.integer)
          True
          >>> mx.issubdtype(ints.dtype, mx.floating)
          False

          >>> floats = mx.array([1, 2, 3], dtype=mx.float32)
          >>> mx.issubdtype(floats.dtype, mx.integer)
          False
          >>> mx.issubdtype(floats.dtype, mx.floating)
          True

          Similar types of different sizes are not subdtypes of each other:

          >>> mx.issubdtype(mx.float64, mx.float32)
          False
          >>> mx.issubdtype(mx.float32, mx.float64)
          False

          but both are subtypes of `floating`:

          >>> mx.issubdtype(mx.float64, mx.floating)
          True
          >>> mx.issubdtype(mx.float32, mx.floating)
          True

          For convenience, dtype-like objects are allowed too:

          >>> mx.issubdtype(mx.float32, mx.inexact)
          True
          >>> mx.issubdtype(mx.signedinteger, mx.floating)
          False
      )pbdoc");
  // Operands are unnamed (positional-only) and ``stream`` follows
  // nb::kw_only(), so the signature string carries ``/`` and ``*`` to match.
  m.def(
      "bitwise_and",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        // Convert both inputs to arrays before dispatching to the op.
        auto [a, b] = to_arrays(a_, b_);
        return mx::bitwise_and(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def bitwise_and(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise bitwise and.

        Take the bitwise and of two arrays with numpy-style broadcasting
        semantics. Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The bitwise and ``a & b``.
      )pbdoc");
  // Operands are unnamed (positional-only) and ``stream`` follows
  // nb::kw_only(), so the signature string carries ``/`` and ``*`` to match.
  m.def(
      "bitwise_or",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        // Convert both inputs to arrays before dispatching to the op.
        auto [a, b] = to_arrays(a_, b_);
        return mx::bitwise_or(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def bitwise_or(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise bitwise or.

        Take the bitwise or of two arrays with numpy-style broadcasting
        semantics. Either or both input arrays can also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The bitwise or ``a | b``.
      )pbdoc");
  // Operands are unnamed (positional-only) and ``stream`` follows
  // nb::kw_only(), so the signature string carries ``/`` and ``*`` to match.
  m.def(
      "bitwise_xor",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        // Convert both inputs to arrays before dispatching to the op.
        auto [a, b] = to_arrays(a_, b_);
        return mx::bitwise_xor(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def bitwise_xor(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise bitwise xor.

        Take the bitwise exclusive or of two arrays with numpy-style
        broadcasting semantics. Either or both input arrays can also be
        scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The bitwise xor ``a ^ b``.
      )pbdoc");
  // Operands are unnamed (positional-only) and ``stream`` follows
  // nb::kw_only(), so the signature string carries ``/`` and ``*`` to match.
  m.def(
      "left_shift",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        // Convert both inputs to arrays before dispatching to the op.
        auto [a, b] = to_arrays(a_, b_);
        return mx::left_shift(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def left_shift(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise left shift.

        Shift the bits of the first input to the left by the second using
        numpy-style broadcasting semantics. Either or both input arrays can
        also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The bitwise left shift ``a << b``.
      )pbdoc");
  // Operands are unnamed (positional-only) and ``stream`` follows
  // nb::kw_only(), so the signature string carries ``/`` and ``*`` to match.
  m.def(
      "right_shift",
      [](const ScalarOrArray& a_,
         const ScalarOrArray& b_,
         mx::StreamOrDevice s) {
        // Convert both inputs to arrays before dispatching to the op.
        auto [a, b] = to_arrays(a_, b_);
        return mx::right_shift(a, b, s);
      },
      nb::arg(),
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def right_shift(a: Union[scalar, array], b: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise right shift.

        Shift the bits of the first input to the right by the second using
        numpy-style broadcasting semantics. Either or both input arrays can
        also be scalars.

        Args:
            a (array): Input array or scalar.
            b (array): Input array or scalar.

        Returns:
            array: The bitwise right shift ``a >> b``.
      )pbdoc");
  // The operand is unnamed (positional-only) and ``stream`` follows
  // nb::kw_only(), so the signature string carries ``/`` and ``*`` to match.
  m.def(
      "bitwise_invert",
      [](const ScalarOrArray& a_, mx::StreamOrDevice s) {
        // Convert the input to an array before dispatching to the op.
        auto a = to_array(a_);
        return mx::bitwise_invert(a, s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def bitwise_invert(a: Union[scalar, array], /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Element-wise bitwise inverse.

        Take the bitwise complement of the input.

        Args:
            a (array): Input array or scalar.

        Returns:
            array: The bitwise inverse ``~a``.
      )pbdoc");
  // ``a`` is unnamed (positional-only), ``dtype`` is named, and ``stream``
  // follows nb::kw_only(); the signature string mirrors that layout.
  m.def(
      "view",
      [](const ScalarOrArray& a, const mx::Dtype& dtype, mx::StreamOrDevice s) {
        return mx::view(to_array(a), dtype, s);
      },
      nb::arg(),
      "dtype"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def view(a: Union[scalar, array], /, dtype: Dtype, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        View the array as a different type.

        The output shape changes along the last axis if the input array's
        type and the input ``dtype`` do not have the same size.

        Note: the view op does not imply that the input and output arrays share
        their underlying data. The view only guarantees that the binary
        representation of each element (or group of elements) is the same.

        Args:
            a (array): Input array or scalar.
            dtype (Dtype): The data type to change to.

        Returns:
            array: The array with the new type.
      )pbdoc");
  // Bind mx::hadamard_transform directly. ``a`` is unnamed (positional-only),
  // ``scale`` defaults to None, and ``stream`` follows nb::kw_only(); the
  // signature string mirrors that layout.
  m.def(
      "hadamard_transform",
      &mx::hadamard_transform,
      nb::arg(),
      "scale"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def hadamard_transform(a: array, /, scale: Optional[float] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Perform the Walsh-Hadamard transform along the final axis.

        Equivalent to:

        .. code-block:: python

           from scipy.linalg import hadamard

           y = (hadamard(len(x)) @ x) * scale

        Supports sizes ``n = m*2^k`` for ``m`` in ``(1, 12, 20, 28)`` and ``2^k
        <= 8192`` for float32 and ``2^k <= 16384`` for float16/bfloat16.

        Args:
            a (array): Input array or scalar.
            scale (float, optional): Scale the output by this factor.
              Defaults to ``1/sqrt(a.shape[-1])`` so that the Hadamard matrix is orthonormal.

        Returns:
            array: The transformed array.
      )pbdoc");
  m.def(
      "einsum_path",
      [](const std::string& equation, const nb::args& operands) {
        auto arrays_list = nb::cast<std::vector<mx::array>>(operands);
        auto [path, str] = mx::einsum_path(equation, arrays_list);
        // Convert to list of tuples
        std::vector<nb::tuple> tuple_path;
        for (auto& p : path) {
          tuple_path.push_back(nb::tuple(nb::cast(p)));
        }
        return std::make_pair(tuple_path, str);
      },
      "subscripts"_a,
      "operands"_a,
      nb::sig("def einsum_path(subscripts: str, *operands)"),
      R"pbdoc(

      Compute the contraction order for the given Einstein summation.

      Args:
        subscripts (str): The Einstein summation convention equation.
        *operands (array): The input arrays.

      Returns:
        tuple(list(tuple(int, int)), str):
          The einsum path and a string containing information about the
          chosen path.
    )pbdoc");
  m.def(
      "einsum",
      [](const std::string& subscripts,
         const nb::args& operands,
         mx::StreamOrDevice s) {
        auto arrays_list = nb::cast<std::vector<mx::array>>(operands);
        return mx::einsum(subscripts, arrays_list, s);
      },
      "subscripts"_a,
      "operands"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def einsum(subscripts: str, *operands, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(

      Perform the Einstein summation convention on the operands.

      Args:
        subscripts (str): The Einstein summation convention equation.
        *operands (array): The input arrays.

      Returns:
        array: The output array.
    )pbdoc");
  // ``shift`` and ``axis`` are registered as named arguments, so only ``a``
  // is positional-only; the signature string places ``/`` accordingly.
  m.def(
      "roll",
      [](const mx::array& a,
         const std::variant<int, mx::Shape>& shift,
         const IntOrVec& axis,
         mx::StreamOrDevice s) {
        // Visit both variants together; a monostate axis selects the overload
        // of mx::roll that takes no axis argument.
        return std::visit(
            [&](auto sh, auto ax) -> mx::array {
              if constexpr (std::is_same_v<decltype(ax), std::monostate>) {
                return mx::roll(a, sh, s);
              } else {
                return mx::roll(a, sh, ax, s);
              }
            },
            shift,
            axis);
      },
      nb::arg(),
      "shift"_a,
      "axis"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def roll(a: array, /, shift: Union[int, Tuple[int]], axis: Union[None, int, Tuple[int]] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Roll array elements along a given axis.

        Elements that are rolled beyond the end of the array are introduced at
        the beginning and vice-versa.

        If the axis is not provided the array is flattened, rolled and then the
        shape is restored.

        Args:
          a (array): Input array
          shift (int or tuple(int)): The number of places by which elements
            are shifted. If positive the array is rolled to the right, if
            negative it is rolled to the left. If an int is provided but the
            axis is a tuple then the same value is used for all axes.
          axis (int or tuple(int), optional): The axis or axes along which to
            roll the elements.

        Returns:
          array: The rolled array.
      )pbdoc");
  m.def(
      "real",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::real(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def real(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns the real part of a complex array.

        Args:
            a (array): Input array.

        Returns:
            array: The real part of ``a``.
      )pbdoc");
  m.def(
      "imag",
      [](const ScalarOrArray& a, mx::StreamOrDevice s) {
        return mx::imag(to_array(a), s);
      },
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def imag(a: array, /, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Returns the imaginary part of a complex array.

        Args:
            a (array): Input array.

        Returns:
            array: The imaginary part of ``a``.
      )pbdoc");
  m.def(
      "slice",
      [](const mx::array& a,
         const mx::array& start_indices,
         std::vector<int> axes,
         mx::Shape slice_size,
         mx::StreamOrDevice s) {
        return mx::slice(
            a, start_indices, std::move(axes), std::move(slice_size), s);
      },
      nb::arg(),
      "start_indices"_a,
      "axes"_a,
      "slice_size"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def slice(a: array, start_indices: array, axes: Sequence[int], slice_size: Sequence[int], *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Extract a sub-array from the input array.

        Args:
          a (array): Input array
          start_indices (array): The index location to start the slice at.
          axes (tuple(int)): The axes corresponding to the indices in ``start_indices``.
          slice_size (tuple(int)): The size of the slice.

        Returns:
          array: The sliced output array.

        Example:

          >>> a = mx.array([[1, 2, 3], [4, 5, 6]])
          >>> mx.slice(a, start_indices=mx.array(1), axes=(0,), slice_size=(1, 2))
          array([[4, 5]], dtype=int32)
          >>>
          >>> mx.slice(a, start_indices=mx.array(1), axes=(1,), slice_size=(2, 1))
          array([[2],
                 [5]], dtype=int32)
      )pbdoc");
  // Thin wrapper that forwards to mx::slice_update. The docstring example
  // builds the start indices from a list and shows the result of writing a
  // (1, 2) block of ones at row 1, column 1.
  m.def(
      "slice_update",
      [](const mx::array& src,
         const mx::array& update,
         const mx::array& start_indices,
         std::vector<int> axes,
         mx::StreamOrDevice s) {
        return mx::slice_update(src, update, start_indices, axes, s);
      },
      nb::arg(),
      "update"_a,
      "start_indices"_a,
      "axes"_a,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def slice_update(a: array, update: array, start_indices: array, axes: Sequence[int], *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Update a sub-array of the input array.

        Args:
          a (array): The input array to update
          update (array): The update array.
          start_indices (array): The index location to start the slice at.
          axes (tuple(int)): The axes corresponding to the indices in ``start_indices``.

        Returns:
          array: The output array with the same shape and type as the input.

        Example:

          >>> a = mx.zeros((3, 3))
          >>> mx.slice_update(a, mx.ones((1, 2)), start_indices=mx.array([1, 1]), axes=(0, 1))
          array([[0, 0, 0],
                 [0, 1, 1],
                 [0, 0, 0]], dtype=float32)
      )pbdoc");
  m.def(
      "contiguous",
      &mx::contiguous,
      nb::arg(),
      "allow_col_major"_a = false,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def contiguous(a: array, /, allow_col_major: bool = False, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Force an array to be row contiguous. Copy if necessary.

      Args:
        a (array): The input to make contiguous
        allow_col_major (bool): Consider column major as contiguous and don't copy

      Returns:
        array: The row or col contiguous output.
    )pbdoc");
  m.def(
      "broadcast_shapes",
      [](const nb::args& shapes) {
        if (shapes.size() == 0)
          throw std::invalid_argument(
              "[broadcast_shapes] Must provide at least one shape.");

        mx::Shape result = nb::cast<mx::Shape>(shapes[0]);
        for (size_t i = 1; i < shapes.size(); ++i) {
          if (!nb::isinstance<mx::Shape>(shapes[i]) &&
              !nb::isinstance<nb::tuple>(shapes[i]))
            throw std::invalid_argument(
                "[broadcast_shapes] Expects a sequence of shapes (tuple or list of ints).");
          result = mx::broadcast_shapes(result, nb::cast<mx::Shape>(shapes[i]));
        }

        return nb::tuple(nb::cast(result));
      },
      nb::sig("def broadcast_shapes(*shapes: Sequence[int]) -> Tuple[int]"),
      R"pbdoc(
        Broadcast shapes.

        Returns the shape that results from broadcasting the supplied array shapes
        against each other.

        Args:
            *shapes (Sequence[int]): The shapes to broadcast.

        Returns:
            tuple: The broadcasted shape.

        Raises:
            ValueError: If the shapes cannot be broadcast.

        Example:
            >>> mx.broadcast_shapes((1,), (3, 1))
            (3, 1)
            >>> mx.broadcast_shapes((6, 7), (5, 6, 1), (7,))
            (5, 6, 7)
            >>> mx.broadcast_shapes((5, 1, 4), (1, 3, 1))
            (5, 3, 4)
      )pbdoc");
  m.def(
      "depends",
      [](const nb::object& inputs_, const nb::object& deps_) {
        bool return_vec = false;
        std::vector<mx::array> inputs;
        std::vector<mx::array> deps;
        if (nb::isinstance<mx::array>(inputs_)) {
          inputs = {nb::cast<mx::array>(inputs_)};
        } else {
          return_vec = true;
          inputs = {nb::cast<std::vector<mx::array>>(inputs_)};
        }
        if (nb::isinstance<mx::array>(deps_)) {
          deps = {nb::cast<mx::array>(deps_)};
        } else {
          deps = {nb::cast<std::vector<mx::array>>(deps_)};
        }
        auto out = depends(inputs, deps);
        if (return_vec) {
          return nb::cast(out);
        } else {
          return nb::cast(out[0]);
        }
      },
      nb::arg(),
      nb::arg(),
      nb::sig(
          "def depends(inputs: Union[array, Sequence[array]], dependencies: Union[array, Sequence[array]])"),
      R"pbdoc(
        Insert dependencies between arrays in the graph. The outputs are
        identical to ``inputs`` but with dependencies on ``dependencies``.

        Args:
            inputs (array or Sequence[array]): The input array or arrays.
            dependencies (array or Sequence[array]): The array or arrays
              to insert dependencies on.

        Returns:
            array or Sequence[array]: The outputs which depend on dependencies.
      )pbdoc");
  // Bind mx::qqmm directly. The docstring names every argument exactly as it
  // is registered below (notably ``global_scale_x`` / ``global_scale_w``).
  m.def(
      "qqmm",
      &mx::qqmm,
      nb::arg(), // x
      nb::arg(), // w_q
      "scales"_a = nb::none(), // scales w
      "group_size"_a = nb::none(),
      "bits"_a = nb::none(),
      "mode"_a = "nvfp4",
      "global_scale_x"_a = nb::none(),
      "global_scale_w"_a = nb::none(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def qqmm(x: array, w: array, scales: Optional[array] = None, group_size: Optional[int] = None, bits: Optional[int] = None, mode: str = 'nvfp4', global_scale_x: Optional[array] = None, global_scale_w: Optional[array] = None, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Perform a matrix multiplication using a possibly quantized weight matrix
      ``w`` and a non-quantized input ``x``. The input ``x`` is quantized on the
      fly. The weight matrix ``w`` is used as-is if it is already quantized;
      otherwise, it is quantized on the fly.

      If ``w`` is quantized, ``scales`` must be provided, and ``group_size``,
      ``bits``, and ``mode`` must match the parameters that were used to quantize
      ``w``.

      Notes:
        If ``w`` is expected to receive gradients, it must be provided in
        non-quantized form.

        If ``x`` and ``w`` are not quantized, their data types must be ``float32``,
        ``float16``, or ``bfloat16``.
        If ``w`` is quantized, it must be packed in unsigned integers.
        ``global_scale_x`` and ``global_scale_w`` are only used for ``nvfp4`` quantization.

      Args:
        x (array): Input array.
        w (array): Weight matrix. If quantized, it is packed in unsigned integers.
        scales (array, optional): The scales to use per ``group_size`` elements of
          ``w`` if ``w`` is quantized. Default: ``None``.
        group_size (int, optional): Number of elements in ``x`` and ``w`` that
          share a scale. See supported values and defaults in the
          :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
        bits (int, optional): Number of bits used to represent each element of
          ``x`` and ``w``. See supported values and defaults in the
          :ref:`table of quantization modes <quantize-modes>`. Default: ``None``.
        mode (str, optional): The quantization mode. Default: ``"nvfp4"``.
          Supported modes are ``nvfp4`` and ``mxfp8``. See the
          :ref:`table of quantization modes <quantize-modes>` for details.
        global_scale_x (array, optional): The per-input float32 scale used for ``x``
            with ``"nvfp4"`` quantization. Default: ``None``.
        global_scale_w (array, optional): The per-input float32 scale used for ``w``
            with ``"nvfp4"`` quantization. Default: ``None``.

      Returns:
        array: The result of the multiplication of quantized ``x`` with quantized ``w``.
  )pbdoc");
  m.def(
      "from_fp8",
      &mx::from_fp8,
      nb::arg(),
      "dtype"_a = mx::bfloat16,
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def from_fp8(x: array, dtype: Dtype = bfloat16, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Convert the array from fp8 (e4m3) to another floating-point type.

      Args:
        x (array): The input fp8 array with type ``uint8``.
        dtype (Dtype): The data type to convert to. Default: ``bfloat16``.

      Returns:
        array: The array converted from fp8.
  )pbdoc");
  m.def(
      "to_fp8",
      &mx::to_fp8,
      nb::arg(),
      nb::kw_only(),
      "stream"_a = nb::none(),
      nb::sig(
          "def to_fp8(x: array, *, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
      Convert the array to fp8 (e4m3) from another floating-point type.

      Args:
        x (array): The input array.

      Returns:
        array: The array converted to fp8 with type ``uint8``.
  )pbdoc");
}


================================================
FILE: python/src/random.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include <chrono>

#include "mlx/ops.h"
#include "mlx/random.h"
#include "python/src/small_vector.h"
#include "python/src/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Holds the PRNG key used by the random bindings when the caller does not
// pass an explicit ``key``.
//
// The state is a Python list whose element 0 is the current key array. It is
// created on first access rather than in the constructor.
class PyKeySequence {
 public:
  PyKeySequence() {
    // Release the Python-owned state before the interpreter shuts down.
    // NOTE(review): the callback keeps a raw `this`; confirm the object
    // always outlives the registered atexit hook.
    nb::module_::import_("atexit").attr("register")(
        nb::cpp_function([this]() { state_.reset(); }));
  }

  // Replace the current key with the one derived from `seed`.
  void seed(uint64_t seed) {
    auto fresh_key = mx::random::key(seed);
    state()[0] = fresh_key;
  }

  // Split the current key: the first half becomes the new state and the
  // second half is handed back to the caller.
  mx::array next() {
    mx::array current = nb::cast<mx::array>(state()[0]);
    auto halves = mx::random::split(current);
    state()[0] = halves.first;
    return halves.second;
  }

  // Access the state list, building it on first use.
  nb::list& state() {
    if (state_.has_value()) {
      return *state_;
    }

    // Function-local static: the wall-clock seed (milliseconds since the
    // epoch) is computed a single time and reused afterwards.
    static auto time_seed = []() {
      const auto since_epoch =
          std::chrono::system_clock::now().time_since_epoch();
      return std::chrono::duration_cast<std::chrono::milliseconds>(since_epoch)
          .count();
    }();

    state_.emplace();
    state_->append(mx::random::key(time_seed));
    return *state_;
  }

 private:
  // Empty until state() is first called, and again after the atexit hook ran.
  std::optional<nb::list> state_;
};

// Returns the key sequence that backs key-less random calls on the calling
// thread.
PyKeySequence& default_key() {
  // One instance per thread, so concurrent callers never share (and never
  // race on) the same key state.
  thread_local PyKeySequence per_thread_sequence;
  return per_thread_sequence;
}

// Registers the ``random`` submodule and all of its bindings on
// ``parent_module``.
//
// Every sampler accepts an optional ``key``. When the caller leaves it as
// ``None`` the binding pulls the next key from the calling thread's
// default_key() sequence, otherwise the supplied key is used as is.
void init_random(nb::module_& parent_module) {
  auto m = parent_module.def_submodule(
      "random",
      "mlx.core.random: functionality related to random number generation");

  // The callable is stored by the module and outlives this function, and it
  // reads no local variable, so it deliberately captures nothing.
  m.def("__getattr__", [](nb::handle key) -> nb::object {
    // Create random.state lazily to avoid initializing device during import.
    if (nb::isinstance<nb::str>(key) && nb::cast<std::string>(key) == "state") {
      return default_key().state();
    }
    // PyErr_Format sets the AttributeError and returns NULL, so the object
    // handed back here is empty and the pending Python error is what the
    // caller observes.
    return nb::steal(PyErr_Format(
        PyExc_AttributeError,
        "Module 'random' has no attribute %R",
        key.ptr()));
  });
  m.def(
      "seed",
      [](uint64_t seed) { default_key().seed(seed); },
      "seed"_a,
      R"pbdoc(
        Seed the global PRNG.

        Args:
            seed (int): Seed for the global PRNG.
      )pbdoc");
  m.def(
      "key",
      &mx::random::key,
      "seed"_a,
      R"pbdoc(
        Get a PRNG key from a seed.

        Args:
            seed (int): Seed for the PRNG.

        Returns:
            array: The PRNG key array.
      )pbdoc");
  m.def(
      "split",
      nb::overload_cast<const mx::array&, int, mx::StreamOrDevice>(
          &mx::random::split),
      "key"_a,
      "num"_a = 2,
      "stream"_a = nb::none(),
      nb::sig(
          "def split(key: array, num: int = 2, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Split a PRNG key into sub keys.

        Args:
            key (array): Input key to split.
            num (int, optional): Number of sub keys. Default: ``2``.

        Returns:
            array: The array of sub keys with ``num`` as its first dimension.
      )pbdoc");
  m.def(
      "uniform",
      [](const ScalarOrArray& low,
         const ScalarOrArray& high,
         const mx::Shape& shape,
         std::optional<mx::Dtype> type,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        return mx::random::uniform(
            to_array(low),
            to_array(high),
            shape,
            type.value_or(mx::float32),
            key,
            s);
      },
      "low"_a = 0,
      "high"_a = 1,
      "shape"_a = mx::Shape{},
      "dtype"_a.none() = mx::float32,
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def uniform(low: Union[scalar, array] = 0, high: Union[scalar, array] = 1, shape: Sequence[int] = [], dtype: Optional[Dtype] = float32, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate uniformly distributed random numbers.

        The values are sampled uniformly in the half-open interval ``[low, high)``.
        The lower and upper bound can be scalars or arrays and must be
        broadcastable to ``shape``.

        Args:
            low (scalar or array, optional): Lower bound of the distribution.
              Default: ``0``.
            high (scalar or array, optional): Upper bound of the distribution.
              Default: ``1``.
            shape (list(int), optional): Shape of the output. Default:``()``.
            dtype (Dtype, optional): Type of the output. Default: ``float32``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The output array random values.
      )pbdoc");
  m.def(
      "normal",
      [](const mx::Shape& shape,
         std::optional<mx::Dtype> type,
         const std::optional<ScalarOrArray>& loc_,
         const std::optional<ScalarOrArray>& scale_,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto dtype = type.value_or(mx::float32);
        auto key = key_ ? key_.value() : default_key().next();
        // loc/scale are converted with the output dtype when present and
        // forwarded as empty optionals otherwise.
        auto loc =
            loc_ ? std::make_optional(to_array(*loc_, dtype)) : std::nullopt;
        auto scale = scale_ ? std::make_optional(to_array(*scale_, dtype))
                            : std::nullopt;
        return mx::random::normal(shape, dtype, loc, scale, key, s);
      },
      "shape"_a = mx::Shape{},
      "dtype"_a.none() = mx::float32,
      "loc"_a = nb::none(),
      "scale"_a = nb::none(),
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def normal(shape: Sequence[int] = [], dtype: Optional[Dtype] = float32, loc: Union[scalar, array, None] = None, scale: Union[scalar, array, None] = None, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate normally distributed random numbers.

        If ``loc`` and ``scale`` are not provided the "standard" normal
        distribution is used. That means $x \sim \mathcal{N}(0, 1)$ for
        real numbers and $\text{Re}(x),\text{Im}(x) \sim \mathcal{N}(0,
        \frac{1}{2})$ for complex numbers.

        Args:
            shape (list(int), optional): Shape of the output. Default: ``()``.
            dtype (Dtype, optional): Type of the output. Default: ``float32``.
            loc (scalar or array, optional): Mean of the distribution.
              Default: ``None``.
            scale (scalar or array, optional): Standard deviation of the
              distribution. Default: ``None``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The output array of random values.
      )pbdoc");
  m.def(
      "multivariate_normal",
      [](const mx::array& mean,
         const mx::array& cov,
         const mx::Shape& shape,
         std::optional<mx::Dtype> type,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        return mx::random::multivariate_normal(
            mean, cov, shape, type.value_or(mx::float32), key, s);
      },
      "mean"_a,
      "cov"_a,
      "shape"_a = mx::Shape{},
      "dtype"_a.none() = mx::float32,
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def multivariate_normal(mean: array, cov: array, shape: Sequence[int] = [], dtype: Optional[Dtype] = float32, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate jointly-normal random samples given a mean and covariance.

        The matrix ``cov`` must be positive semi-definite. The behavior is
        undefined if it is not.  The only supported ``dtype`` is ``float32``.

        Args:
            mean (array): array of shape ``(..., n)``, the mean of the
              distribution.
            cov (array): array  of shape ``(..., n, n)``, the covariance
              matrix of the distribution. The batch shape ``...`` must be
              broadcast-compatible with that of ``mean``.
            shape (list(int), optional): The output shape must be
              broadcast-compatible with ``mean.shape[:-1]`` and ``cov.shape[:-2]``.
              If empty, the result shape is determined by broadcasting the batch
              shapes of ``mean`` and ``cov``. Default: ``[]``.
            dtype (Dtype, optional): The output type. Default: ``float32``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The output array of random values.
      )pbdoc");
  m.def(
      "randint",
      [](const ScalarOrArray& low,
         const ScalarOrArray& high,
         const mx::Shape& shape,
         std::optional<mx::Dtype> type,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        return mx::random::randint(
            to_array(low),
            to_array(high),
            shape,
            type.value_or(mx::int32),
            key,
            s);
      },
      "low"_a,
      "high"_a,
      "shape"_a = mx::Shape{},
      "dtype"_a.none() = mx::int32,
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def randint(low: Union[scalar, array], high: Union[scalar, array], shape: Sequence[int] = [], dtype: Optional[Dtype] = int32, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate random integers from the given interval.

        The values are sampled with equal probability from the integers in
        half-open interval ``[low, high)``. The lower and upper bound can be
        scalars or arrays and must be broadcastable to ``shape``.

        Args:
            low (scalar or array): Lower bound of the interval.
            high (scalar or array): Upper bound of the interval.
            shape (list(int), optional): Shape of the output. Default: ``()``.
            dtype (Dtype, optional): Type of the output. Default: ``int32``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The array of random integers.
      )pbdoc");
  m.def(
      "bernoulli",
      [](const ScalarOrArray& p_,
         const std::optional<mx::Shape> shape,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        auto p = to_array(p_);
        // Pick the overload with or without an explicit output shape.
        if (shape.has_value()) {
          return mx::random::bernoulli(p, shape.value(), key, s);
        } else {
          return mx::random::bernoulli(p, key, s);
        }
      },
      "p"_a = 0.5,
      "shape"_a = nb::none(),
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def bernoulli(p: Union[scalar, array] = 0.5, shape: Optional[Sequence[int]] = None, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate Bernoulli random values.

        The values are sampled from the bernoulli distribution with parameter
        ``p``. The parameter ``p`` can be a :obj:`float` or :obj:`array` and
        must be broadcastable to ``shape``.

        Args:
            p (float or array, optional): Parameter of the Bernoulli
              distribution. Default: ``0.5``.
            shape (list(int), optional): Shape of the output.
              Default: ``p.shape``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The array of random integers.
      )pbdoc");
  m.def(
      "truncated_normal",
      [](const ScalarOrArray& lower_,
         const ScalarOrArray& upper_,
         const std::optional<mx::Shape> shape_,
         std::optional<mx::Dtype> type,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        auto lower = to_array(lower_);
        auto upper = to_array(upper_);
        auto t = type.value_or(mx::float32);
        // Pick the overload with or without an explicit output shape.
        if (shape_.has_value()) {
          return mx::random::truncated_normal(
              lower, upper, shape_.value(), t, key, s);
        } else {
          return mx::random::truncated_normal(lower, upper, t, key, s);
        }
      },
      "lower"_a,
      "upper"_a,
      "shape"_a = nb::none(),
      "dtype"_a.none() = mx::float32,
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def truncated_normal(lower: Union[scalar, array], upper: Union[scalar, array], shape: Optional[Sequence[int]] = None, dtype: Optional[Dtype] = float32, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate values from a truncated normal distribution.

        The values are sampled from the truncated normal distribution
        on the domain ``(lower, upper)``. The bounds ``lower`` and ``upper``
        can be scalars or arrays and must be broadcastable to ``shape``.

        Args:
            lower (scalar or array): Lower bound of the domain.
            upper (scalar or array): Upper bound of the domain.
            shape (list(int), optional): The shape of the output.
              Default:``()``.
            dtype (Dtype, optional): The data type of the output.
              Default: ``float32``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The output array of random values.
      )pbdoc");
  m.def(
      "gumbel",
      [](const mx::Shape& shape,
         std::optional<mx::Dtype> type,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        return mx::random::gumbel(shape, type.value_or(mx::float32), key, s);
      },
      "shape"_a = mx::Shape{},
      "dtype"_a.none() = mx::float32,
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def gumbel(shape: Sequence[int] = [], dtype: Optional[Dtype] = float32, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Sample from the standard Gumbel distribution.

        The values are sampled from a standard Gumbel distribution
        with CDF ``exp(-exp(-x))``.

        Args:
            shape (list(int)): The shape of the output.
            dtype (Dtype, optional): The data type of the output.
              Default: ``float32``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array:
              The :class:`array` with shape ``shape`` and distributed according
              to the Gumbel distribution.
      )pbdoc");
  m.def(
      "categorical",
      [](const mx::array& logits,
         int axis,
         const std::optional<mx::Shape> shape,
         const std::optional<int> num_samples,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        // shape and num_samples are mutually exclusive; each selects a
        // different overload, and neither selects the plain one.
        if (shape.has_value() && num_samples.has_value()) {
          throw std::invalid_argument(
              "[categorical] At most one of shape or num_samples can be specified.");
        } else if (shape.has_value()) {
          return mx::random::categorical(logits, axis, shape.value(), key, s);
        } else if (num_samples.has_value()) {
          return mx::random::categorical(
              logits, axis, num_samples.value(), key, s);
        } else {
          return mx::random::categorical(logits, axis, key, s);
        }
      },
      "logits"_a,
      "axis"_a = -1,
      "shape"_a = nb::none(),
      "num_samples"_a = nb::none(),
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def categorical(logits: array, axis: int = -1, shape: Optional[Sequence[int]] = None, num_samples: Optional[int] = None, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Sample from a categorical distribution.

        The values are sampled from the categorical distribution specified by
        the unnormalized values in ``logits``. Note, at most one of ``shape``
        or ``num_samples`` can be specified. If both are ``None``, the output
        has the same shape as ``logits`` with the ``axis`` dimension removed.

        Args:
            logits (array): The *unnormalized* categorical distribution(s).
            axis (int, optional): The axis which specifies the distribution.
               Default: ``-1``.
            shape (list(int), optional): The shape of the output. This must
               be broadcast compatible with ``logits.shape`` with the ``axis``
               dimension removed. Default: ``None``
            num_samples (int, optional): The number of samples to draw from each
              of the categorical distributions in ``logits``. The output will have
              ``num_samples`` in the last dimension. Default: ``None``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The ``shape``-sized output array with type ``uint32``.
      )pbdoc");
  m.def(
      "laplace",
      [](const mx::Shape& shape,
         std::optional<mx::Dtype> type,
         float loc,
         float scale,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        return mx::random::laplace(
            shape, type.value_or(mx::float32), loc, scale, key, s);
      },
      "shape"_a = mx::Shape{},
      "dtype"_a.none() = mx::float32,
      "loc"_a = 0.0,
      "scale"_a = 1.0,
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def laplace(shape: Sequence[int] = [], dtype: Optional[Dtype] = float32, loc: float = 0.0, scale: float = 1.0, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Sample numbers from a Laplace distribution.

        Args:
            shape (list(int), optional): Shape of the output. Default: ``()``.
            dtype (Dtype, optional): Type of the output. Default: ``float32``.
            loc (float, optional): Mean of the distribution. Default: ``0.0``.
            scale (float, optional): The scale "b" of the Laplace distribution.
              Default:``1.0``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array: The output array of random values.
      )pbdoc");
  m.def(
      "permutation",
      [](const std::variant<nb::int_, mx::array>& x,
         int axis,
         const std::optional<mx::array>& key_,
         mx::StreamOrDevice s) {
        auto key = key_ ? key_.value() : default_key().next();
        // An integer argument selects the overload that takes a plain int;
        // an array argument is permuted along ``axis``.
        if (auto pv = std::get_if<nb::int_>(&x); pv) {
          return mx::random::permutation(nb::cast<int>(*pv), key, s);
        } else {
          return mx::random::permutation(std::get<mx::array>(x), axis, key, s);
        }
      },
      "x"_a,
      "axis"_a = 0,
      "key"_a = nb::none(),
      "stream"_a = nb::none(),
      nb::sig(
          "def permutation(x: Union[int, array], axis: int = 0, key: Optional[array] = None, stream: Union[None, Stream, Device] = None) -> array"),
      R"pbdoc(
        Generate a random permutation or permute the entries of an array.

        Args:
            x (int or array): If an integer is provided a random
              permutation of ``mx.arange(x)`` is returned. Otherwise the entries
              of ``x`` along the given axis are randomly permuted.
            axis (int, optional): The axis to permute along. Default: ``0``.
            key (array, optional): A PRNG key. Default: ``None``.

        Returns:
            array:
              The generated random permutation or randomly permuted input array.
      )pbdoc");
}


================================================
FILE: python/src/small_vector.h
================================================
// Copyright © 2025 Apple Inc.

#pragma once

#include "mlx/small_vector.h"

#include <nanobind/stl/detail/nb_list.h>

NAMESPACE_BEGIN(NB_NAMESPACE)
NAMESPACE_BEGIN(detail)

// nanobind caster for mlx::core::SmallVector: reads any Python sequence into
// the vector and exposes the vector to Python as a tuple.
template <typename Type, size_t Size, typename Alloc>
struct type_caster<mlx::core::SmallVector<Type, Size, Alloc>> {
  using List = mlx::core::SmallVector<Type, Size, Alloc>;
  using Caster = make_caster<Type>;

  NB_TYPE_CASTER(
      List,
      const_name("tuple[") + Caster::Name + const_name(", ...]"))

  // Fill `value` from the sequence `src`. Returns false when `src` is not a
  // sequence or when one of its elements cannot be converted to `Type`.
  bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) noexcept {
    size_t count;
    PyObject* owner;

    // seq_get sets 'count' and 'owner'. On failure the returned pointer and
    // both out-parameters are zero/NULL.
    PyObject** items = seq_get(src.ptr(), &count, &owner);

    value.clear();
    value.reserve(count);

    bool ok = items != nullptr;
    Caster element;

    flags = flags_for_local_caster<Type>(flags);

    for (size_t idx = 0; idx != count; ++idx) {
      const bool converted = element.from_python(items[idx], flags, cleanup) &&
          element.template can_cast<Type>();
      if (!converted) {
        ok = false;
        break;
      }

      value.push_back(element.operator cast_t<Type>());
    }

    // 'owner' may be NULL, hence the X variant.
    Py_XDECREF(owner);

    return ok;
  }

  // Build a Python tuple from `src`. Returns an invalid handle when the tuple
  // cannot be allocated or when an element fails to convert.
  template <typename T>
  static handle from_cpp(T&& src, rv_policy policy, cleanup_list* cleanup) {
    object result = steal(PyTuple_New(src.size()));

    if (!result.is_valid()) {
      return result.release();
    }

    Py_ssize_t pos = 0;
    for (auto&& item : src) {
      handle converted =
          Caster::from_cpp(forward_like_<T>(item), policy, cleanup);

      if (!converted.is_valid()) {
        // Drop the partially filled tuple and report the failure.
        result.reset();
        break;
      }

      NB_TUPLE_SET_ITEM(result.ptr(), pos++, converted.ptr());
    }

    return result.release();
  }
};

NAMESPACE_END(detail)
NAMESPACE_END(NB_NAMESPACE)


================================================
FILE: python/src/stream.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <sstream>

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/variant.h>

#include "mlx/stream.h"
#include "mlx/utils.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Wrapper bound to Python's ``with`` protocol: enter() creates the
// mx::StreamContext and exit() deletes it.
class PyStreamContext {
 public:
  // Remembers the stream or device to activate. Throws std::runtime_error
  // when `s` holds neither (std::monostate).
  PyStreamContext(mx::StreamOrDevice s) : _s(s), _inner(nullptr) {
    if (std::holds_alternative<std::monostate>(s)) {
      throw std::runtime_error(
          "[StreamContext] Invalid argument, please specify a stream or device.");
    }
  }

  // NOTE(review): calling enter() again before exit() overwrites the stored
  // pointer without deleting the previous context.
  void enter() {
    _inner = new mx::StreamContext(_s);
  }

  // Safe to call without a matching enter(): deleting a null pointer is a
  // no-op.
  void exit() {
    delete _inner;
    _inner = nullptr;
  }

 private:
  mx::StreamOrDevice _s;
  // Non-null only between enter() and exit().
  mx::StreamContext* _inner;
};

// Registers the stream related bindings on module ``m``: the ``Stream``
// class, the default-stream helpers, the ``StreamContext`` context manager
// with its ``stream()`` factory, and ``synchronize``.
void init_stream(nb::module_& m) {
  nb::class_<mx::Stream>(
      m,
      "Stream",
      R"pbdoc(
      A stream for running operations on a given device.
      )pbdoc")
      .def_ro("device", &mx::Stream::device)
      .def(
          "__repr__",
          [](const mx::Stream& s) {
            std::ostringstream os;
            os << s;
            return os.str();
          })
      // Comparing against a non-Stream object yields False instead of
      // raising.
      .def("__eq__", [](const mx::Stream& s, const nb::object& other) {
        return nb::isinstance<mx::Stream>(other) &&
            s == nb::cast<mx::Stream>(other);
      });

  nb::implicitly_convertible<mx::Device::DeviceType, mx::Device>();

  m.def(
      "default_stream",
      &mx::default_stream,
      "device"_a,
      R"pbdoc(Get the device's default stream.)pbdoc");
  m.def(
      "set_default_stream",
      &mx::set_default_stream,
      "stream"_a,
      R"pbdoc(
        Set the default stream.

        This will make the given stream the default for the
        stream's device. It will not change the default device.

        Args:
          stream (stream): Stream to make the default.
      )pbdoc");
  m.def(
      "new_stream",
      &mx::new_stream,
      "device"_a,
      R"pbdoc(Make a new stream on the given device.)pbdoc");

  nb::class_<PyStreamContext>(m, "StreamContext", R"pbdoc(
        A context manager for setting the current device and stream.

        See :func:`stream` for usage.

        Args:
            s: The stream or device to set as the default.
  )pbdoc")
      .def(nb::init<mx::StreamOrDevice>(), "s"_a)
      .def("__enter__", [](PyStreamContext& scm) { scm.enter(); })
      .def(
          "__exit__",
          // The exception arguments are accepted to satisfy the context
          // manager protocol and are otherwise ignored.
          [](PyStreamContext& scm,
             const std::optional<nb::type_object>& exc_type,
             const std::optional<nb::object>& exc_value,
             const std::optional<nb::object>& traceback) { scm.exit(); },
          "exc_type"_a = nb::none(),
          "exc_value"_a = nb::none(),
          "traceback"_a = nb::none());
  m.def(
      "stream",
      [](mx::StreamOrDevice s) { return PyStreamContext(s); },
      "s"_a,
      R"pbdoc(
        Create a context manager to set the default device and stream.

        Args:
            s: The :obj:`Stream` or :obj:`Device` to set as the default.

        Returns:
            A context manager that sets the default device and stream.

        Example:

        .. code-block:: python

          import mlx.core as mx

          # Create a context manager for the default device and stream.
          with mx.stream(mx.cpu):
              # Operations here will use mx.cpu by default.
              pass
      )pbdoc");
  m.def(
      "synchronize",
      [](const std::optional<mx::Stream>& s) {
        // Without a stream argument, use the no-argument overload.
        if (s) {
          mx::synchronize(s.value());
        } else {
          mx::synchronize();
        }
      },
      "stream"_a = nb::none(),
      R"pbdoc(
      Synchronize with the given stream.

      Args:
        stream (Stream, optional): The stream to synchronize with. If ``None``
           then the default stream of the default device is used.
           Default: ``None``.
      )pbdoc");
}


================================================
FILE: python/src/transforms.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <algorithm>
#include <numeric>
#include <sstream>
#include <unordered_set>

#include <nanobind/nanobind.h>
#include <nanobind/stl/optional.h>
#include <nanobind/stl/pair.h>
#include <nanobind/stl/string.h>
#include <nanobind/stl/unordered_set.h>
#include <nanobind/stl/variant.h>
#include <nanobind/stl/vector.h>

#include "mlx/array.h"
#include "mlx/compile.h"
#include "mlx/compile_impl.h"
#include "mlx/transforms.h"
#include "mlx/transforms_impl.h"
#include "mlx/utils.h"
#include "python/src/mlx_func.h"
#include "python/src/small_vector.h"
#include "python/src/trees.h"

namespace mx = mlx::core;
namespace nb = nanobind;
using namespace nb::literals;

// Needed for printing shapes and strides.
using mx::operator<<;

using IntOrVec = std::variant<int, std::vector<int>>;
using StrOrSet = std::variant<std::string, std::unordered_set<std::string>>;

// Returns the name of the Python type of `o` as a C++ string.
inline std::string type_name_str(const nb::handle& o) {
  const auto py_name = nb::type_name(o.type());
  return nb::cast<std::string>(py_name);
}

auto validate_argnums_argnames(
    const std::optional<IntOrVec>& argnums,
    const StrOrSet& argnames) {
  std::unordered_set<std::string> setnames;
  if (auto pv = std::get_if<std::string>(&argnames); pv) {
    setnames = {*pv};
  } else {
    setnames = std::get<std::unordered_set<std::string>>(argnames);
  }

  if (!argnums.has_value()) {
    // argnums was not provided and argnames was empty
    if (setnames.empty()) {
      return std::make_pair(std::vector<int>{0}, setnames);
    } else {
      return std::make_pair(std::vector<int>{}, setnames);
    }
  }

  std::vector<int> vecnums;
  if (auto pv = std::get_if<int>(&(*argnums)); pv) {
    vecnums = {*pv};
  } else {
    vecnums = std::get<std::vector<int>>(*argnums);
  }

  return std::make_pair(vecnums, setnames);
}

// Build the callable that backs the Python `value_and_grad`-style transforms.
//
// Arguments:
//   fun:              the Python function to differentiate.
//   argnums:          positional argument indices to differentiate with
//                     respect to. Must be non-negative and free of duplicates.
//   argnames:         keyword argument names to differentiate with respect to.
//   error_msg_tag:    prefix used for every error message raised here.
//   scalar_func_only: when true, `fun` must return an array; otherwise a
//                     tuple whose first element is an array is also accepted.
//
// Returns a lambda taking (args, kwargs) that calls `fun` through
// `mx::value_and_grad` and returns a pair (value, gradients) where the
// gradients mirror the python structure of the selected arguments.
//
// Throws std::invalid_argument when no argument is selected, when an index is
// negative or duplicated, and (at call time) when the selected arguments are
// missing or the return value of `fun` has an unsupported type.
auto py_value_and_grad(
    const nb::callable& fun,
    std::vector<int> argnums,
    std::unordered_set<std::string> argnames,
    const std::string& error_msg_tag,
    bool scalar_func_only) {
  // Sanitize argnums
  if (argnums.size() == 0 && argnames.size() == 0) {
    throw std::invalid_argument(
        error_msg_tag + " Gradient wrt no argument requested");
  }
  if (!argnums.empty()) {
    // Sorting puts the smallest index first (for the negativity check), makes
    // duplicates adjacent, and lets the call-time code below rely on
    // `argnums.back()` being the largest index.
    std::sort(argnums.begin(), argnums.end());
    if (argnums[0] < 0) {
      std::ostringstream msg;
      msg << error_msg_tag
          << " Can't compute the gradient of negative argument index "
          << argnums[0];
      throw std::invalid_argument(msg.str());
    }
    for (int i = 1; i < argnums.size(); ++i) {
      if (argnums[i] == argnums[i - 1]) {
        std::ostringstream msg;
        // Report the index that is actually duplicated.
        msg << error_msg_tag << " Duplicate argument index " << argnums[i]
            << " is not allowed.";
        throw std::invalid_argument(msg.str());
      }
    }
  }

  return [fun, argnums, argnames, error_msg_tag, scalar_func_only](
             nb::args& args, nb::kwargs& kwargs) {
    // Sanitize the input
    if (argnums.size() > 0 && argnums.back() >= args.size()) {
      std::ostringstream msg;
      msg << error_msg_tag << " Can't compute the gradient of argument index "
          << argnums.back() << " because the function is called with only "
          << args.size() << " positional arguments.";
      throw std::invalid_argument(msg.str());
    }

    for (auto& key : argnames) {
      if (!kwargs.contains(key)) {
        std::ostringstream msg;
        msg << error_msg_tag
            << " Can't compute the gradient of keyword argument '" << key
            << "' because the function is called with the "
            << "following keyword arguments {";
        for (auto item : kwargs) {
          msg << nb::cast<std::string>(item.first) << ",";
        }
        msg << "}";
        throw std::invalid_argument(msg.str());
      }
    }

    // Collect the arrays
    std::vector<mx::array> arrays;
    std::vector<nb::object> array_objects;
    // Appends every array found in `tree` to `arrays` / `array_objects`. In
    // strict mode any non-array leaf is an error.
    auto flatten_with_objects = [&arrays, &array_objects](
                                    auto tree, bool strict) {
      tree_visit(tree, [&](nb::handle obj) {
        if (nb::isinstance<mx::array>(obj)) {
          arrays.push_back(nb::cast<mx::array>(obj));
          array_objects.push_back(nb::borrow<nb::object>(obj));
        } else if (strict) {
          throw std::invalid_argument(
              "[tree_flatten] The argument should contain only arrays");
        }
      });
    };

    // `counts` starts as [0, n_0, n_1, ...] where n_k is the number of arrays
    // in the k-th differentiated argument; after the partial sum below
    // counts[k] is the offset of the k-th argument's gradients.
    std::vector<int> counts(1, 0);
    // Indices into `arrays` of the arrays we differentiate with respect to.
    std::vector<int> gradient_indices;
    for (int i = 0, j = 0; i < args.size(); ++i) {
      bool needs_grad = (j < argnums.size() && argnums[j] == i);
      auto pre_size = arrays.size();
      flatten_with_objects(args[i], /* strict = */ needs_grad);
      if (needs_grad) {
        auto old_size = gradient_indices.size();
        auto delta_size = arrays.size() - pre_size;
        gradient_indices.resize(old_size + delta_size);
        std::iota(
            gradient_indices.begin() + old_size,
            gradient_indices.end(),
            pre_size);
        j++;
        counts.push_back(delta_size);
      }
    }
    for (auto item : kwargs) {
      bool needs_grad =
          (argnames.find(nb::cast<std::string>(item.first)) != argnames.end());
      auto pre_size = arrays.size();
      flatten_with_objects(item.second, /* strict = */ needs_grad);
      if (needs_grad) {
        auto old_size = gradient_indices.size();
        auto delta_size = arrays.size() - pre_size;
        gradient_indices.resize(old_size + delta_size);
        std::iota(
            gradient_indices.begin() + old_size,
            gradient_indices.end(),
            pre_size);
        counts.push_back(delta_size);
      }
    }
    std::partial_sum(counts.cbegin(), counts.cend(), counts.begin());

    // value_out will hold the output of the python function in order to be
    // able to reconstruct the python tree of extra return values
    nb::object py_value_out;
    auto value_and_grads = mx::value_and_grad(
        [&fun,
         &array_objects,
         &args,
         &kwargs,
         &py_value_out,
         &error_msg_tag,
         scalar_func_only](const std::vector<mx::array>& a) {
          // Fill the (args, kwargs) tree with the arrays provided by the
          // transform.
          nb::list tree;
          tree.append(args);
          tree.append(kwargs);
          tree_fill(tree, a);

          // Call the python function
          py_value_out = fun(*tree[0], **tree[1]);

          // Replace the tracers with the originals. Don't overwrite
          // locations which were written to during the call to fun
          int index = 0;
          tree_visit_update(tree, [&](nb::handle node) {
            auto replace_arr = nb::cast<mx::array>(node);
            if (replace_arr.id() == a[index].id()) {
              return array_objects[index++];
            } else {
              index++;
              return nb::cast(replace_arr);
            }
          });

          // Validate the return value of the python function
          if (!nb::isinstance<mx::array>(py_value_out)) {
            if (scalar_func_only) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be a "
                  << "scalar array; but " << type_name_str(py_value_out)
                  << " was returned.";
              throw std::invalid_argument(msg.str());
            }
            if (!nb::isinstance<nb::tuple>(py_value_out)) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be either a "
                  << "scalar array or a tuple with the first value being a "
                  << "scalar array (Union[array, tuple[array, Any, ...]]); but "
                  << type_name_str(py_value_out) << " was returned.";
              throw std::invalid_argument(msg.str());
            }
            nb::tuple ret = nb::cast<nb::tuple>(py_value_out);
            if (ret.size() == 0) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be either a "
                  << "scalar array or a non-empty tuple. The first value should be a "
                  << "scalar array and the rest can be anything. Instead, "
                  << "we got an empty tuple.";
              throw std::invalid_argument(msg.str());
            }
            if (!nb::isinstance<mx::array>(ret[0])) {
              std::ostringstream msg;
              msg << error_msg_tag << " The return value of the function "
                  << "whose gradient we want to compute should be either a "
                  << "scalar array or a tuple with the first value being a "
                  << "scalar array (Union[array, tuple[array, Any, ...]]); but it "
                  << "was a tuple with the first value being of type "
                  << type_name_str(ret[0]) << " .";
              throw std::invalid_argument(msg.str());
            }
          }

          return tree_flatten(py_value_out, false);
        },
        gradient_indices)(arrays);

    auto value = value_and_grads.first;
    auto gradients = value_and_grads.second;

    // Put the gradients back in their container.
    // We have the following cases:
    //
    // 1. Single python positional argument has a gradient (eg argnums=[0])
    // 2. Many python positional arguments have gradients (eg argnums=[0, 1])
    // 3. A python keyword argument has gradients
    //
    // In case 1 we return the original python variable but with the gradients.
    // In case 2 we return a tuple of the above.
    // In case 3 we return a tuple containing a tuple and dict (sth like
    // (tuple(), dict(x=mx.array(5))) ).
    nb::object positional_grads;
    nb::object keyword_grads;
    nb::object py_grads;

    // Collect the gradients for the positional arguments
    if (argnums.size() == 1) {
      positional_grads = tree_unflatten(args[argnums[0]], gradients, counts[0]);
    } else if (argnums.size() > 1) {
      nb::list grads_;
      for (int i = 0; i < argnums.size(); i++) {
        grads_.append(tree_unflatten(args[argnums[i]], gradients, counts[i]));
      }
      positional_grads = nb::tuple(grads_);
    } else {
      positional_grads = nb::none();
    }

    // No keyword argument gradients so return the tuple of gradients
    if (argnames.size() == 0) {
      py_grads = positional_grads;
    } else {
      nb::dict grads_;
      int i = 0;
      // Keyword gradients were collected in kwargs iteration order, right
      // after the positional ones, hence the `argnums.size()` offset.
      for (auto item : kwargs) {
        auto k = nb::cast<std::string>(item.first);
        if (argnames.find(k) != argnames.end()) {
          grads_[k.c_str()] = tree_unflatten(
              nb::borrow(item.second), gradients, counts[i++ + argnums.size()]);
        }
      }
      keyword_grads = grads_;

      py_grads = nb::make_tuple(positional_grads, keyword_grads);
    }

    // Put the values back in the container
    nb::object return_value = tree_unflatten(py_value_out, value);
    return std::make_pair(return_value, py_grads);
  };
}

// Build the callable that backs the Python `vmap` transform.
//
// `in_axes` / `out_axes` are python trees matching the structure of the
// inputs / outputs of `fun`. Each leaf is an int or None (None means the
// corresponding array is not vectorized and is encoded as -1).
auto py_vmap(
    const nb::callable& fun,
    const nb::object& in_axes,
    const nb::object& out_axes) {
  return [fun, in_axes, out_axes](const nb::args& args) {
    // Flatten `axes` alongside `tree` into one int per array in `tree`.
    // `output_axes` widens the valid axis range by one.
    auto axes_to_flat_tree = [](const nb::object& tree,
                                const nb::object& axes,
                                bool output_axes) {
      std::vector<int> flat_axes;
      bool encountered_tuple = false;

      // Convert a python int axis to a validated non-negative axis for the
      // given array.
      auto checked_axis = [output_axes](
                              nb::handle py_axis, const nb::object& py_array) {
        int axis = nb::cast<int>(nb::cast<nb::int_>(py_axis));
        const mx::array& x = nb::cast<mx::array>(py_array);
        if (axis < 0) {
          axis += x.ndim() + output_axes;
        }
        if (axis < 0 || axis >= (x.ndim() + output_axes)) {
          std::ostringstream msg;
          msg << "[vmap] Invalid" << (output_axes ? " output " : " ")
              << "vectorization axis " << axis << " for array with shape "
              << x.shape();
          throw std::invalid_argument(msg.str());
        }
        return axis;
      };

      tree_visit(
          {tree, axes},
          [&flat_axes, &encountered_tuple, &checked_axis](
              const std::vector<nb::object>& inputs) {
            const nb::object& leaf = inputs[0];
            const nb::object& leaf_axis = inputs[1];

            if (!nb::isinstance<mx::array>(leaf)) {
              throw std::invalid_argument(
                  "[vmap] The arguments should contain only arrays");
            }

            if (leaf_axis.is_none()) {
              flat_axes.push_back(-1);
              return;
            }
            if (nb::isinstance<nb::int_>(leaf_axis)) {
              flat_axes.push_back(checked_axis(leaf_axis, leaf));
              return;
            }
            if (!nb::isinstance<nb::tuple>(leaf_axis)) {
              throw std::invalid_argument("[vmap] axis must be int or None.");
            }

            // A one-element tuple holding an int or None is also accepted.
            encountered_tuple = true;
            auto axis_tuple = nb::cast<nb::tuple>(leaf_axis);
            const bool single = (axis_tuple.size() == 1);
            if (single && nb::isinstance<nb::int_>(axis_tuple[0])) {
              flat_axes.push_back(checked_axis(axis_tuple[0], leaf));
            } else if (single && axis_tuple[0].is_none()) {
              flat_axes.push_back(-1);
            } else {
              throw std::invalid_argument("[vmap] axis must be int or None.");
            }
          });

      // The tuple form is only accepted when the tree itself is one array.
      if (encountered_tuple && !nb::isinstance<mx::array>(tree)) {
        throw std::invalid_argument("[vmap] axis must be int or None.");
      }
      return flat_axes;
    };

    // Inputs must be array or tree of arrays
    auto inputs = tree_flatten(args, true);
    auto flat_in_axes =
        axes_to_flat_tree((args.size() == 1) ? args[0] : args, in_axes, false);

    // py_outputs will hold the output of the python function in order to be
    // able to reconstruct the python tree of return values
    nb::object py_outputs;

    auto vmap_fn = [&fun, &args, &py_outputs](
                       const std::vector<mx::array>& a) {
      // Call the python function on the arguments rebuilt from `a`
      py_outputs = fun(*tree_unflatten(args, a));

      // Flatten the outputs
      return tree_flatten(py_outputs, true);
    };

    auto [trace_inputs, trace_outputs] =
        mx::detail::vmap_trace(vmap_fn, inputs, flat_in_axes);

    // The output axes can only be resolved once the outputs are known.
    auto flat_out_axes = axes_to_flat_tree(py_outputs, out_axes, true);

    // Perform the vmap
    auto outputs = mx::detail::vmap_replace(
        inputs, trace_inputs, trace_outputs, flat_in_axes, flat_out_axes);

    // Put the outputs back in the container
    return tree_unflatten(py_outputs, outputs);
  };
}

// Callable wrapper that backs the Python `compile` transform.
//
// It holds the Python function together with optional trees of captured
// inputs and outputs, and on every call flattens the arguments, invokes the
// function through `mx::detail::compile` and rebuilds the Python output
// structure.
struct PyCompiledFun {
  nb::callable fun;
  // Address of the Python callable. Passed to mx::detail::compile and
  // mx::detail::compile_erase to identify this function.
  std::uintptr_t fun_id;
  nb::object captured_inputs;
  nb::object captured_outputs;
  bool shapeless;

  // Data to attach to the compiled function that contains the python output
  // structure and the number of arrays in said structure.
  struct AttachedData {
    nb::object output_structure;
    int num_outputs;

    AttachedData(nb::object output_structure_, int num_outputs_)
        : output_structure(output_structure_), num_outputs(num_outputs_) {}
  };

  PyCompiledFun(
      const nb::callable& fun,
      nb::object inputs,
      nb::object outputs,
      bool shapeless)
      : fun(fun),
        fun_id(reinterpret_cast<std::uintptr_t>(fun.ptr())),
        captured_inputs(inputs),
        captured_outputs(outputs),
        shapeless(shapeless) {}

  PyCompiledFun(const PyCompiledFun&) = delete;
  PyCompiledFun& operator=(const PyCompiledFun&) = delete;
  PyCompiledFun& operator=(PyCompiledFun&& other) = delete;
  PyCompiledFun(PyCompiledFun&& other)
      : fun(std::move(other.fun)),
        fun_id(reinterpret_cast<std::uintptr_t>(fun.ptr())) {
    // Zero the id of the moved-from object so that its destructor does not
    // erase the entry that now belongs to this object.
    other.fun_id = 0;
    captured_inputs = std::move(other.captured_inputs);
    captured_outputs = std::move(other.captured_outputs);
    shapeless = other.shapeless;
  };

  // Flatten (args, kwargs), compile/call the function and return the outputs
  // in their original python structure.
  //
  // Throws std::invalid_argument if an argument leaf is not an array, a str,
  // an int, a float or None.
  nb::object call_impl(const nb::args& args, const nb::kwargs& kwargs) {
    // Flat array inputs
    std::vector<mx::array> inputs;

    // Compilation constants which includes the tree structure of the arguments
    std::vector<uint64_t> constants;

    // Reserve some large primes to signify the presence of an array, a list or
    // a dict in order to encode the structure of the pytree. We choose primes
    // to reduce slightly the chances of these numbers occurring by a
    // multiplication as values in the constants list.
    constexpr uint64_t array_identifier = 18446744073709551557UL;
    constexpr uint64_t list_identifier = 18446744073709551533UL;
    constexpr uint64_t dict_identifier = 18446744073709551521UL;
    constexpr uint64_t none_identifier = 10239356951478402889UL;

    // Flatten the tree with hashed constants and structure
    std::function<void(nb::handle)> recurse;
    recurse = [&](nb::handle obj) {
      if (nb::isinstance<nb::list>(obj)) {
        auto l = nb::cast<nb::list>(obj);
        constants.push_back(list_identifier);
        for (int i = 0; i < l.size(); ++i) {
          recurse(l[i]);
        }
      } else if (nb::isinstance<nb::tuple>(obj)) {
        // Tuples are encoded with the same identifier as lists.
        constants.push_back(list_identifier);
        for (auto item : obj) {
          recurse(item);
        }
      } else if (nb::isinstance<nb::dict>(obj)) {
        auto d = nb::cast<nb::dict>(obj);
        constants.push_back(dict_identifier);
        for (auto item : d) {
          auto r = item.first.attr("__hash__")();
          constants.push_back(nb::cast<int64_t>(r));
          recurse(item.second);
        }
      } else if (nb::isinstance<mx::array>(obj)) {
        inputs.push_back(nb::cast<mx::array>(obj));
        constants.push_back(array_identifier);
      } else if (nb::isinstance<nb::str>(obj)) {
        auto r = obj.attr("__hash__")();
        constants.push_back(nb::cast<int64_t>(r));
      } else if (nb::isinstance<nb::int_>(obj)) {
        constants.push_back(nb::cast<int64_t>(obj));
      } else if (nb::isinstance<nb::float_>(obj)) {
        // Store the bit pattern of the double. memcpy is used instead of a
        // pointer cast to avoid violating strict aliasing.
        static_assert(
            sizeof(double) == sizeof(uint64_t),
            "double must be 64 bits wide to be stored as a constant");
        const double value = nb::cast<double>(obj);
        uint64_t bits;
        std::memcpy(&bits, &value, sizeof(bits));
        constants.push_back(bits);
      } else if (obj.is_none()) {
        constants.push_back(none_identifier);
      } else {
        std::ostringstream msg;
        msg << "[compile] Function arguments must be trees of arrays "
            << "or constants (floats, ints, strings, or None), but received "
            << "type " << type_name_str(obj) << ".";
        throw std::invalid_argument(msg.str());
      }
    };

    // The order matters: positional arrays come first and `num_args` is the
    // offset at which the keyword arrays start.
    recurse(args);
    int num_args = inputs.size();
    recurse(kwargs);
    auto compile_fun = [this, &args, &kwargs, num_args](
                           const std::vector<mx::array>& a) {
      // Put tracers into captured inputs
      std::vector<mx::array> flat_in_captures;
      std::vector<mx::array> trace_captures;
      if (!captured_inputs.is_none()) {
        flat_in_captures = tree_flatten(captured_inputs, false);
        // The captured inputs are appended after the regular inputs (see
        // below) so their tracers are the tail of `a`.
        trace_captures.insert(
            trace_captures.end(), a.end() - flat_in_captures.size(), a.end());
        tree_fill(captured_inputs, trace_captures);
      }

      auto tree_outputs =
          fun(*tree_unflatten(args, a), **tree_unflatten(kwargs, a, num_args));
      auto [outputs, py_outputs] =
          tree_flatten_with_structure(std::move(tree_outputs), false);

      std::shared_ptr<void> extra_data =
          std::make_shared<AttachedData>(py_outputs, outputs.size());

      if (!captured_outputs.is_none()) {
        auto flat_out_captures = tree_flatten(captured_outputs, false);
        outputs.insert(
            outputs.end(),
            std::make_move_iterator(flat_out_captures.begin()),
            std::make_move_iterator(flat_out_captures.end()));
      }

      // Replace tracers with originals in captured inputs
      if (!captured_inputs.is_none()) {
        tree_replace(captured_inputs, trace_captures, flat_in_captures);
      }
      return mx::detail::ArraysAndExtra{outputs, extra_data};
    };

    if (!captured_inputs.is_none()) {
      auto flat_in_captures = tree_flatten(captured_inputs, false);
      inputs.insert(
          inputs.end(),
          std::make_move_iterator(flat_in_captures.begin()),
          std::make_move_iterator(flat_in_captures.end()));
    }

    // Compile and call
    auto [outputs, extra_data] =
        mx::detail::compile(compile_fun, fun_id, shapeless, constants)(inputs);

    // `extra_data` is the AttachedData created in `compile_fun`.
    auto* attached = static_cast<AttachedData*>(extra_data.get());
    int num_outputs = attached->num_outputs;
    nb::object py_outputs = attached->output_structure;

    if (!captured_outputs.is_none()) {
      // Everything after the first `num_outputs` arrays are captured outputs.
      std::vector<mx::array> captures(
          std::make_move_iterator(outputs.begin() + num_outputs),
          std::make_move_iterator(outputs.end()));
      tree_fill(captured_outputs, captures);
    }

    // Put the outputs back in the container
    return tree_unflatten_from_structure(std::move(py_outputs), outputs);
  }

  // Const call operator that forwards to the non-const `call_impl`.
  nb::object operator()(const nb::args& args, const nb::kwargs& kwargs) const {
    return const_cast<PyCompiledFun*>(this)->call_impl(args, kwargs);
  };

  ~PyCompiledFun() {
    // The python objects are released explicitly while holding the GIL.
    nb::gil_scoped_acquire gil;

    mx::detail::compile_erase(fun_id);
    fun.reset();
    captured_inputs.reset();
    captured_outputs.reset();
  }
};

// Callable wrapper around a Python function that runs it through
// `mx::checkpoint`.
class PyCheckpointedFun {
 public:
  PyCheckpointedFun(nb::callable fun) : fun_(std::move(fun)) {}
  ~PyCheckpointedFun() {
    // Release the python callable while holding the GIL.
    nb::gil_scoped_acquire gil;

    fun_.reset();
  }

  // The function object handed to `mx::checkpoint`. It rebuilds the python
  // arguments from flat arrays, calls the python function and records the
  // structure of its output.
  struct InnerFunction {
    nb::object fun_;
    nb::object args_structure_;
    // Weak so that this object does not keep the output structure alive.
    std::weak_ptr<nb::object> output_structure_;

    InnerFunction(
        nb::object fun,
        nb::object args_structure,
        std::weak_ptr<nb::object> output_structure)
        : fun_(std::move(fun)),
          args_structure_(std::move(args_structure)),
          output_structure_(output_structure) {}
    ~InnerFunction() {
      // Release the python objects while holding the GIL.
      nb::gil_scoped_acquire gil;

      fun_.reset();
      args_structure_.reset();
    }

    std::vector<mx::array> operator()(const std::vector<mx::array>& inputs) {
      // Rebuild the (args, kwargs) tuple from the flat inputs.
      auto call_args = nb::cast<nb::tuple>(
          tree_unflatten_from_structure(args_structure_, inputs));
      auto [flat_outputs, structure] = tree_flatten_with_structure(
          fun_(*call_args[0], **call_args[1]), false);
      // Only publish the structure if the owner is still around.
      auto owner = output_structure_.lock();
      if (owner) {
        *owner = structure;
      }
      return flat_outputs;
    }
  };

  nb::object call_impl(const nb::args& args, const nb::kwargs& kwargs) {
    // Filled in by InnerFunction when the python function is called.
    auto structure_slot = std::make_shared<nb::object>();

    auto packed_args = nb::make_tuple(args, kwargs);
    auto [flat_inputs, args_structure] =
        tree_flatten_with_structure(packed_args, false);

    auto checkpointed =
        mx::checkpoint(InnerFunction(fun_, args_structure, structure_slot));
    auto flat_outputs = checkpointed(flat_inputs);

    return tree_unflatten_from_structure(*structure_slot, flat_outputs);
  }

  // Const call operator that forwards to the non-const `call_impl`.
  nb::object operator()(const nb::args& args, const nb::kwargs& kwargs) const {
    auto* self = const_cast<PyCheckpointedFun*>(this);
    return self->call_impl(args, kwargs);
  }

 private:
  nb::callable fun_;
};

// Forward declarations of the tp_traverse / tp_clear slots of
// PyCustomFunction. They are defined after the class; tp_traverse is declared
// here so it can be named as a friend inside the class.
int py_custom_function_tp_traverse(PyObject* self, visitproc visit, void* arg);

int py_custom_function_tp_clear(PyObject* self);

/**
 * PyCustomFunction is the class that implements the python decorator
 * `mx.custom_function`.
 *
 * It implements a callable that instead of simply calling `fun` it creates a
 * CustomTransforms primitive via the `custom_function` C++ op which allows us
 * to redefine the vjp, jvp and vmap transformations.
 *
 * The implementation is verbose due to explicit handling of the destruction of
 * various python objects to make sure that there is no double-free and that
 * all of them are deleted while under GIL.
 *
 * Namely, for every one of the functions passed to the C++ `custom_function`
 * we create a callable struct that holds the following python objects (when
 * needed).
 *
 *    - An nb::callable which holds the passed function or transform
 *    - An nb::object holding input structure, namely the `(args, kwargs)`
 *      passed to the function in order to be able to recreate the arguments
 *      from the input arrays.
 *    - A std::shared_ptr<nb::object> holding the output structure name the
 *      structure of the return value of `fun`. It is a shared_ptr so that it
 *      can be set when the function is called and then used in the `vjp`
 *      transform. We delete the object only when the shared_ptr is about to be
 *      deleted see `output_structure_.use_count() == 1` to make sure that the
 *      object is deleted under GIL.
 */
class PyCustomFunction {
 public:
  PyCustomFunction(nb::callable fun) : fun_(std::move(fun)) {}
  ~PyCustomFunction() {
    nb::gil_scoped_acquire gil;
    reset();
  }

  struct InnerFunction {
    nb::callable fun_;
    nb::object input_structure_;
    std::shared_ptr<nb::object> output_structure_;

    InnerFunction(
        nb::callable fun,
        nb::object input_structure,
        std::shared_ptr<nb::object> output_structure)
        : fun_(std::move(fun)),
          input_structure_(std::move(input_structure)),
          output_structure_(std::move(output_structure)) {}
    ~InnerFunction() {
      nb::gil_scoped_acquire gil;

      fun_.reset();
      input_structure_.reset();
      if (output_structure_.use_count() == 1) {
        output_structure_->reset();
      }
    }

    std::vector<mx::array> operator()(const std::vector<mx::array>& inputs) {
      nb::gil_scoped_acquire gil;

      auto new_inputs = nb::cast<nb::tuple>(
          tree_unflatten_from_structure(input_structure_, inputs));
      std::vector<mx::array> outputs;
      std::tie(outputs, *output_structure_) =
          tree_flatten_with_structure(fun_(*new_inputs[0], **new_inputs[1]));
      return outputs;
    }
  };

  struct InnerVJPFunction {
    nb::callable vjp_fun_;
    nb::object input_structure_;
    std::shared_ptr<nb::object> output_structure_;

    InnerVJPFunction(
        nb::callable vjp_fun,
        nb::object input_structure,
        std::shared_ptr<nb::object> output_structure)
        : vjp_fun_(std::move(vjp_fun)),
          input_structure_(std::move(input_structure)),
          output_structure_(std::move(output_structure)) {}
    ~InnerVJPFunction() {
      nb::gil_scoped_acquire gil;

      vjp_fun_.reset();
      input_structure_.reset();
      if (output_structure_.use_count() == 1) {
        output_structure_->reset();
      }
    }

    std::vector<mx::array> operator()(
        const std::vector<mx::array>& primals,
        const std::vector<mx::array>& cotangents,
        const std::vector<mx::array>& outputs) {
      nb::gil_scoped_acquire gil;

      auto new_inputs = nb::cast<nb::tuple>(
          tree_unflatten_from_structure(input_structure_, primals));
      auto args = nb::cast<nb::tuple>(new_inputs[0]);
      auto new_cotangents =
          tree_unflatten_from_structure(*output_structure_, cotangents);
      auto new_outputs =
          tree_unflatten_from_structure(*output_structure_, outputs);

      if (args.size() == 1) {
        return tree_flatten(
            vjp_fun_(args[0], new_cotangents, new_outputs, **new_inputs[1]),
            false);
      } else {
        return tree_flatten(
            vjp_fun_(args, new_cotangents, new_outputs, **new_inputs[1]),
            false);
      }
    }
  };

  struct InnerJVPFunction {
    nb::callable jvp_fun_;
    nb::object input_structure_;

    InnerJVPFunction(nb::callable jvp_fun, nb::object input_structure)
        : jvp_fun_(std::move(jvp_fun)),
          input_structure_(std::move(input_structure)) {}
    ~InnerJVPFunction() {
      nb::gil_scoped_acquire gil;

      jvp_fun_.reset();
      input_structure_.reset();
    }

    std::vector<mx::array> operator()(
        const std::vector<mx::array>& primals,
        const std::vector<mx::array>& tangents,
        const std::vector<int>& argnums) {
      nb::gil_scoped_acquire gil;

      auto new_inputs = nb::cast<nb::tuple>(
          tree_unflatten_from_structure(input_structure_, primals));
      auto args = nb::cast<nb::tuple>(new_inputs[0]);
      auto kwargs = nb::cast<nb::dict>(new_inputs[1]);
      if (kwargs.size() > 0) {
        throw std::invalid_argument(
            "[custom jvp] Function should only accept positional arguments");
      }

      // Make a new pytree which has tangents or None when a tangent is not
      // available.
      std::vector<bool> have_tangents(primals.size(), false);
      for (auto arg : argnums) {
        have_tangents[arg] = true;
      }
      int array_index = 0;
      int tangent_index = 0;
      auto new_tangents =
          nb::cast<nb::tuple>(tree_map(args, [&](nb::handle element) {
            if (nb::isinstance<mx::array>(element) &&
                have_tangents[array_index++]) {
              return nb::cast(tangents[tangent_index++]);
            } else {
              return nb::none();
            }
          }));

      if (args.size() == 1) {
        return tree_flatten(jvp_fun_(args[0], new_tangents[0]), false);
      } else {
        return tree_flatten(jvp_fun_(args, new_tangents), false);
      }
    }
  };

  struct InnerVmapFunction {
    nb::callable vmap_fun_;
    nb::object input_structure_;

    InnerVmapFunction(nb::callable vmap_fun, nb::object input_structure)
        : vmap_fun_(std::move(vmap_fun)),
          input_structure_(std::move(input_structure)) {}
    ~InnerVmapFunction() {
      nb::gil_scoped_acquire gil;

      vmap_fun_.reset();
      input_structure_.reset();
    }

    std::pair<std::vector<mx::array>, std::vector<int>> operator()(
        const std::vector<mx::array>& inputs,
        const std::vector<int>& axes) {
      nb::gil_scoped_acquire gil;

      auto new_inputs = nb::cast<nb::tuple>(
          tree_unflatten_from_structure(input_structure_, inputs));
      auto args = nb::cast<nb::tuple>(new_inputs[0]);
      auto kwargs = nb::cast<nb::dict>(new_inputs[1]);
      if (kwargs.size() > 0) {
        throw std::invalid_argument(
            "[custom vmap] Function should only accept positional arguments");
      }

      int arr_index = 0;
      auto new_axes =
          nb::cast<nb::tuple>(tree_map(args, [&](nb::handle element) {
            int axis = axes[arr_index++];
            if (nb::isinstance<mx::array>(element) && axis >= 0) {
              return nb::cast(axis);
            } else {
              return nb::none();
            }
          }));

      nb::object result;
      if (args.size() == 1) {
        result = vmap_fun_(args[0], new_axes[0]);
      } else {
        result = vmap_fun_(args, new_axes);
      }

      if (!nb::isinstance<nb::tuple>(result)) {
        throw std::invalid_argument(
            "[custom vmap] Vmap function should return a tuple with 2 items.");
      }
      nb::tuple result_tuple = nb::cast<nb::tuple>(result);
      if (result_tuple.size() != 2) {
        throw std::invalid_argument(
            "[custom vmap] Vmap function should return a tuple with 2 items.");
      }

      std::vector<mx::array> outputs;
      std::vector<int> output_axes;
      tree_visit({result_tuple[0], result_tuple[1]}, [&](auto objects) {
        if (nb::isinstance<mx::array>(objects[0])) {
          outputs.push_back(nb::cast<mx::array>(objects[0]));
          output_axes.push_back(
              objects[1].is_none() ? -1 : nb::cast<int>(objects[1]));
        }
      });

      return {outputs, output_axes};
    }
  };

  nb::object call_impl(const nb::args& args, const nb::kwargs& kwargs) {
    if (!vjp_fun_.has_value() && !jvp_fun_.has_value() &&
        !vmap_fun_.has_value()) {
      return fun_(*args, **kwargs);
    }

    // Extract the inputs and their structure in capturable vars
    std::vector<mx::array> input_arrays;
    nb::object input_structure;
    auto full_args = nb::make_tuple(args, kwargs);
    std::tie(input_arrays, input_structure) =
        tree_flatten_with_structure(full_args, false);

    // The output structure will be stored here to be used in the custom vjp
    // function
    auto output_structure = std::make_shared<nb::object>();

    // Make a function that calls fun_ in the forward pass and vjp_ in the
    // backward pass. Then call it immediately and return the results.
    auto f = mx::custom_function(
        InnerFunction(fun_, input_structure, output_structure),
        make_vjp_function(input_structure, output_structure),
        make_jvp_function(input_structure),
        make_vmap_function(input_structure));

    auto outputs = f(input_arrays);
    return tree_unflatten_from_structure(*output_structure, outputs);
  }

  PyCustomFunction& set_vjp(nb::callable vjp_fun) {
    vjp_fun_ = vjp_fun;
    return *this;
  }

  PyCustomFunction& set_jvp(nb::callable jvp_fun) {
    jvp_fun_ = jvp_fun;
    return *this;
  }

  PyCustomFunction& set_vmap(nb::callable vmap_fun) {
    vmap_fun_ = vmap_fun;
    return *this;
  }
  void reset() {
    fun_.reset();
    if (vjp_fun_.has_value()) {
      (*vjp_fun_).reset();
    }
    if (jvp_fun_.has_value()) {
      (*jvp_fun_).reset();
    }
    if (vmap_fun_.has_value()) {
      (*vmap_fun_).reset();
    }
  }

  friend int py_custom_function_tp_traverse(PyObject*, visitproc, void*);

 private:
  std::optional<InnerVJPFunction> make_vjp_function(
      nb::object input_structure,
      std::shared_ptr<nb::object> output_structure) {
    if (!vjp_fun_.has_value()) {
      return std::nullopt;
    }

    return InnerVJPFunction(*vjp_fun_, input_structure, output_structure);
  }

  std::optional<InnerJVPFunction> make_jvp_function(
      nb::object input_structure) {
    if (!jvp_fun_.has_value()) {
      return std::nullopt;
    }

    return InnerJVPFunction(*jvp_fun_, input_structure);
  }

  std::optional<InnerVmapFunction> make_vmap_function(
      nb::object input_structure) {
    if (!vmap_fun_.has_value()) {
      return std::nullopt;
    }

    return InnerVmapFunction(*vmap_fun_, input_structure);
  }

  nb::callable fun_;
  std::optional<nb::callable> vjp_fun_;
  std::optional<nb::callable> jvp_fun_;
  std::optional<nb::callable> vmap_fun_;
};

// tp_traverse slot for PyCustomFunction: reports the type object and every
// Python callable held by the instance to the garbage collector.
// Note that Py_VISIT returns from this function when the visitor reports a
// non-zero result.
int py_custom_function_tp_traverse(PyObject* self, visitproc visit, void* arg) {
  Py_VISIT(Py_TYPE(self));

  // Only look at the C++ state once the instance is reported as ready.
  if (nb::inst_ready(self)) {
    auto* custom = nb::inst_ptr<PyCustomFunction>(self);

    nb::handle fun_handle = nb::find(custom->fun_);
    Py_VISIT(fun_handle.ptr());

    if (custom->vjp_fun_) {
      nb::handle vjp_handle = nb::find(*(custom->vjp_fun_));
      Py_VISIT(vjp_handle.ptr());
    }
    if (custom->jvp_fun_) {
      nb::handle jvp_handle = nb::find(*(custom->jvp_fun_));
      Py_VISIT(jvp_handle.ptr());
    }
    if (custom->vmap_fun_) {
      nb::handle vmap_handle = nb::find(*(custom->vmap_fun_));
      Py_VISIT(vmap_handle.ptr());
    }
  }
  return 0;
}
// tp_clear slot for PyCustomFunction: resets every callable held by the
// instance via PyCustomFunction::reset(). Always reports success.
int py_custom_function_tp_clear(PyObject* self) {
  nb::inst_ptr<PyCustomFunction>(self)->reset();
  return 0;
}
// Extra type slots installed on the custom_function Python type so that it
// takes part in garbage collection. The list is terminated by a zero entry.
PyType_Slot py_custom_function_slots[] = {
    {Py_tp_traverse,
     reinterpret_cast<void*>(&py_custom_function_tp_traverse)},
    {Py_tp_clear, reinterpret_cast<void*>(&py_custom_function_tp_clear)},
    {0, nullptr}};

// Register the transformation API on the module `m`: the `custom_function`
// class plus eval, async_eval, jvp, vjp, value_and_grad, grad, vmap, compile,
// disable_compile, enable_compile and checkpoint.
void init_transforms(nb::module_& m) {
  nb::class_<PyCustomFunction>(
      m,
      "custom_function",
      nb::type_slots(py_custom_function_slots),
      R"pbdoc(
      Set up a function for custom gradient and vmap definitions.

      This class is meant to be used as a function decorator. Instances are
      callables that behave identically to the wrapped function. However, when
      a function transformation is used (e.g. computing gradients using
      :func:`value_and_grad`) then the functions defined via
      :meth:`custom_function.vjp`, :meth:`custom_function.jvp` and
      :meth:`custom_function.vmap` are used instead of the default transformation.

      Note, all custom transformations are optional. Undefined transformations
      fall back to the default behaviour.

      Example:

        .. code-block:: python

            import mlx.core as mx

            @mx.custom_function
            def f(x, y):
                return mx.sin(x) * y

            @f.vjp
            def f_vjp(primals, cotangent, output):
                x, y = primals
                return cotangent * mx.cos(x) * y, cotangent * mx.sin(x)

            @f.jvp
            def f_jvp(primals, tangents):
              x, y = primals
              dx, dy = tangents
              return dx * mx.cos(x) * y + dy * mx.sin(x)

            @f.vmap
            def f_vmap(inputs, axes):
              x, y = inputs
              ax, ay = axes
              if ay != ax and ax is not None:
                  y = y.swapaxes(ay, ax)
              return mx.sin(x) * y, (ax or ay)

      All ``custom_function`` instances behave as pure functions. Namely, any
      variables captured will be treated as constants and no gradients will be
      computed with respect to the captured arrays. For instance:

        .. code-block:: python

          import mlx.core as mx

          def g(x, y):
            @mx.custom_function
            def f(x):
              return x * y

            @f.vjp
            def f_vjp(x, dx, fx):
              # Note that we have only x, dx and fx and nothing with respect to y
              raise ValueError("Abort!")

            return f(x)

          x = mx.array(2.0)
          y = mx.array(3.0)
          print(g(x, y))                     # prints 6.0
          print(mx.grad(g)(x, y))            # Raises exception
          print(mx.grad(g, argnums=1)(x, y)) # prints 0.0
      )pbdoc")
      .def(
          nb::init<nb::callable>(),
          "f"_a,
          nb::sig("def __init__(self, f: Callable)"))
      .def("__call__", &PyCustomFunction::call_impl)
      .def(
          "vjp",
          &PyCustomFunction::set_vjp,
          "f"_a,
          nb::sig("def vjp(self, f: Callable)"),
          R"pbdoc(
            Define a custom vjp for the wrapped function.

            The vjp function takes three arguments:

            - *primals*: A pytree that contains all the positional arguments to
              the function. It could be a single array, a tuple of arrays or a
              full blown tuple of dicts of arrays etc.
            - *cotangents*: A pytree that matches the structure of the output
              but contains the cotangents (usually the gradients of the loss
              function with respect to the outputs).
            - *outputs*: The outputs of the function to be used to avoid
              recomputing them for the gradient computation.

            The vjp function should return the same pytree structure as the
            primals but containing the corresponding computed cotangents.
          )pbdoc")
      .def(
          "jvp",
          &PyCustomFunction::set_jvp,
          "f"_a,
          nb::sig("def jvp(self, f: Callable)"),
          R"pbdoc(
            Define a custom jvp for the wrapped function.

            The jvp function takes two arguments:

            - *primals*: A pytree that contains all the positional arguments to
              the function. It could be a single array, a tuple of arrays or a
              full blown tuple of dicts of arrays etc.
            - *tangents*: A pytree that matches the structure of the inputs but
              instead contains the gradients wrt to each input. Tangents could
              be ``None`` if some inputs don't have an associated gradient.

            The jvp function should return the same pytree structure as the
            outputs of the function but containing the tangents.
          )pbdoc")
      .def(
          "vmap",
          &PyCustomFunction::set_vmap,
          "f"_a,
          nb::sig("def vmap(self, f: Callable)"),
          R"pbdoc(
            Define a custom vectorization transformation for the wrapped function.

            The vmap function takes two arguments:

            - *inputs*: A pytree that contains all the positional arguments to
              the function. It could be a single array, a tuple of arrays or a
              full blown tuple of dicts of arrays etc.
            - *axes*: A pytree that matches the structure of the inputs but
              instead contains the vectorization axis for each input or
              ``None`` if an input is not vectorized.

            The vmap function should return the outputs of the original
            function but vectorized over the provided axes. It should also
            return a pytree with the vectorization axes of each output. If some
            outputs are no longer vectorized, then their vectorization axis
            should be ``None``.
          )pbdoc");

  m.def(
      "eval",
      [](const nb::args& args) {
        std::vector<mx::array> arrays = tree_flatten(args, false);
        {
          nb::gil_scoped_release nogil;
          eval(arrays);
        }
      },
      nb::arg(),
      nb::sig("def eval(*args) -> None"),
      R"pbdoc(
        Evaluate an :class:`array` or tree of :class:`array`.

        Args:
            *args (arrays or trees of arrays): Each argument can be a single array
              or a tree of arrays. If a tree is given the nodes can be a Python
              :class:`list`, :class:`tuple` or :class:`dict`. Leaves which are not
              arrays are ignored.
      )pbdoc");
  m.def(
      "async_eval",
      [](const nb::args& args) {
        std::vector<mx::array> arrays = tree_flatten(args, false);
        {
          nb::gil_scoped_release nogil;
          async_eval(arrays);
        }
      },
      nb::arg(),
      nb::sig("def async_eval(*args)"),
      R"pbdoc(
        Asynchronously evaluate an :class:`array` or tree of :class:`array`.

        .. note::

          This is an experimental API and may change in future versions.

        Args:
            *args (arrays or trees of arrays): Each argument can be a single array
              or a tree of arrays. If a tree is given the nodes can be a Python
              :class:`list`, :class:`tuple` or :class:`dict`. Leaves which are not
              arrays are ignored.

        Example:
            >>> x = mx.array(1.0)
            >>> y = mx.exp(x)
            >>> mx.async_eval(y)
            >>> print(y)
            >>>
            >>> y = mx.exp(x)
            >>> mx.async_eval(y)
            >>> z = y + 3
            >>> mx.async_eval(z)
            >>> print(z)
      )pbdoc");
  m.def(
      "jvp",
      [](const nb::callable& fun,
         const std::vector<mx::array>& primals,
         const std::vector<mx::array>& tangents) {
        auto vfun = [&fun](const std::vector<mx::array>& primals) {
          auto out = fun(*nb::cast(primals));
          if (nb::isinstance<mx::array>(out)) {
            return std::vector<mx::array>{nb::cast<mx::array>(out)};
          } else {
            return nb::cast<std::vector<mx::array>>(out);
          }
        };
        return jvp(vfun, primals, tangents);
      },
      "fun"_a,
      "primals"_a,
      "tangents"_a,
      nb::sig(
          "def jvp(fun: Callable, primals: list[array], tangents: list[array]) -> tuple[list[array], list[array]]"),
      R"pbdoc(
        Compute the Jacobian-vector product.

        This computes the product of the Jacobian of a function ``fun`` evaluated
        at ``primals`` with the ``tangents``.

        Args:
            fun (Callable): A function which takes a variable number of :class:`array`
              and returns a single :class:`array` or list of :class:`array`.
            primals (list(array)): A list of :class:`array` at which to
              evaluate the Jacobian.
            tangents (list(array)): A list of :class:`array` which are the
              "vector" in the Jacobian-vector product. The ``tangents`` should be the
              same in number, shape, and type as the inputs of ``fun`` (i.e. the ``primals``).

        Returns:
            tuple(list(array), list(array)): A tuple with the outputs of
            ``fun`` in the first position and the Jacobian-vector products
            in the second position.

        Example:

         .. code-block:: python

             import mlx.core as mx

             outs, jvps = mx.jvp(mx.sin, (mx.array(1.0),), (mx.array(1.0),))

      )pbdoc");
  m.def(
      "vjp",
      [](const nb::callable& fun,
         const std::vector<mx::array>& primals,
         const std::vector<mx::array>& cotangents) {
        auto vfun = [&fun](const std::vector<mx::array>& primals) {
          auto out = fun(*nb::cast(primals));
          if (nb::isinstance<mx::array>(out)) {
            return std::vector<mx::array>{nb::cast<mx::array>(out)};
          } else {
            return nb::cast<std::vector<mx::array>>(out);
          }
        };
        return vjp(vfun, primals, cotangents);
      },
      "fun"_a,
      "primals"_a,
      "cotangents"_a,
      nb::sig(
          "def vjp(fun: Callable, primals: list[array], cotangents: list[array]) -> tuple[list[array], list[array]]"),
      R"pbdoc(
        Compute the vector-Jacobian product.

        Computes the product of the ``cotangents`` with the Jacobian of a
        function ``fun`` evaluated at ``primals``.

        Args:
          fun (Callable): A function which takes a variable number of :class:`array`
            and returns a single :class:`array` or list of :class:`array`.
          primals (list(array)): A list of :class:`array` at which to
            evaluate the Jacobian.
          cotangents (list(array)): A list of :class:`array` which are the
            "vector" in the vector-Jacobian product. The ``cotangents`` should be the
            same in number, shape, and type as the outputs of ``fun``.

        Returns:
            tuple(list(array), list(array)): A tuple with the outputs of
            ``fun`` in the first position and the vector-Jacobian products
            in the second position.

        Example:

         .. code-block:: python

             import mlx.core as mx

             outs, vjps = mx.vjp(mx.sin, (mx.array(1.0),), (mx.array(1.0),))

      )pbdoc");
  m.def(
      "value_and_grad",
      [](const nb::callable& fun,
         const std::optional<IntOrVec>& argnums,
         const StrOrSet& argnames) {
        auto [argnums_vec, argnames_set] =
            validate_argnums_argnames(argnums, argnames);
        return mlx_func(
            py_value_and_grad(
                fun, argnums_vec, argnames_set, "[value_and_grad]", false),
            fun);
      },
      "fun"_a,
      "argnums"_a = nb::none(),
      "argnames"_a = std::vector<std::string>{},
      nb::sig(
          "def value_and_grad(fun: Callable[P, R], argnums: Optional[Union[int, Sequence[int]]] = None, argnames: Union[str, Sequence[str]] = []) -> Callable[P, Tuple[R, Any]]"),
      R"pbdoc(
        Returns a function which computes the value and gradient of ``fun``.

        The function passed to :func:`value_and_grad` should return either
        a scalar loss or a tuple in which the first element is a scalar
        loss and the remaining elements can be anything.

        .. code-block:: python

            import mlx.core as mx

            def mse(params, inputs, targets):
                outputs = forward(params, inputs)
                lvalue = (outputs - targets).square().mean()
                return lvalue

            # Returns lvalue, dlvalue/dparams
            lvalue, grads = mx.value_and_grad(mse)(params, inputs, targets)

            def lasso(params, inputs, targets, a=1.0, b=1.0):
                outputs = forward(params, inputs)
                mse = (outputs - targets).square().mean()
                l1 = mx.abs(outputs - targets).mean()

                loss = a*mse + b*l1

                return loss, mse, l1

            (loss, mse, l1), grads = mx.value_and_grad(lasso)(params, inputs, targets)

        Args:
            fun (Callable): A function which takes a variable number of
              :class:`array` or trees of :class:`array` and returns
              a scalar output :class:`array` or a tuple the first element
              of which should be a scalar :class:`array`.
            argnums (int or list(int), optional): Specify the index (or indices)
              of the positional arguments of ``fun`` to compute the gradient
              with respect to. If neither ``argnums`` nor ``argnames`` are
              provided ``argnums`` defaults to ``0`` indicating ``fun``'s first
              argument.
            argnames (str or list(str), optional): Specify keyword arguments of
              ``fun`` to compute gradients with respect to. It defaults to [] so
              no gradients for keyword arguments by default.

        Returns:
            Callable: A function which returns a tuple where the first element
            is the output of `fun` and the second element is the gradients w.r.t.
            the loss.
      )pbdoc");
  m.def(
      "grad",
      [](const nb::callable& fun,
         const std::optional<IntOrVec>& argnums,
         const StrOrSet& argnames) {
        auto [argnums_vec, argnames_set] =
            validate_argnums_argnames(argnums, argnames);
        auto fn =
            py_value_and_grad(fun, argnums_vec, argnames_set, "[grad]", true);
        return mlx_func(
            [fn = std::move(fn)](nb::args& args, nb::kwargs& kwargs) {
              return fn(args, kwargs).second;
            },
            fun);
      },
      "fun"_a,
      "argnums"_a = nb::none(),
      "argnames"_a = std::vector<std::string>{},
      nb::sig(
          "def grad(fun: Callable[P, R], argnums: Optional[Union[int, Sequence[int]]] = None, argnames: Union[str, Sequence[str]] = []) -> Callable[P, Any]"),
      R"pbdoc(
        Returns a function which computes the gradient of ``fun``.

        Args:
            fun (Callable): A function which takes a variable number of
              :class:`array` or trees of :class:`array` and returns
              a scalar output :class:`array`.
            argnums (int or list(int), optional): Specify the index (or indices)
              of the positional arguments of ``fun`` to compute the gradient
              with respect to. If neither ``argnums`` nor ``argnames`` are
              provided ``argnums`` defaults to ``0`` indicating ``fun``'s first
              argument.
            argnames (str or list(str), optional): Specify keyword arguments of
              ``fun`` to compute gradients with respect to. It defaults to [] so
              no gradients for keyword arguments by default.

        Returns:
            Callable: A function which has the same input arguments as ``fun`` and
            returns the gradient(s).
      )pbdoc");
  m.def(
      "vmap",
      [](const nb::callable& fun,
         const nb::object& in_axes,
         const nb::object& out_axes) {
        return mlx_func(
            py_vmap(fun, in_axes, out_axes), fun, in_axes, out_axes);
      },
      "fun"_a,
      "in_axes"_a = 0,
      "out_axes"_a = 0,
      nb::sig(
          "def vmap(fun: Callable[P, R], in_axes: object = 0, out_axes: object = 0) -> Callable[P, R]"),
      R"pbdoc(
        Returns a vectorized version of ``fun``.

        Args:
            fun (Callable): A function which takes a variable number of
              :class:`array` or a tree of :class:`array` and returns
              a variable number of :class:`array` or a tree of :class:`array`.
            in_axes (int, optional): An integer or a valid prefix tree of the
              inputs to ``fun`` where each node specifies the vmapped axis. If
              the value is ``None`` then the corresponding input(s) are not vmapped.
              Defaults to ``0``.
            out_axes (int, optional): An integer or a valid prefix tree of the
              outputs of ``fun`` where each node specifies the vmapped axis. If
              the value is ``None`` then the corresponding outputs(s) are not vmapped.
              Defaults to ``0``.

        Returns:
            Callable: The vectorized function.
      )pbdoc");
  m.def(
      "compile",
      [](const nb::callable& fun,
         const nb::object& inputs,
         const nb::object& outputs,
         bool shapeless) {
        // Make sure each thread using mx.compile would clear its compile cache
        // before python interpreter exits.
        // The lambda is invoked immediately so that the registration runs
        // exactly once per thread, when this thread_local flag is initialized.
        [[maybe_unused]] static thread_local auto clear_cache = []() {
          auto atexit = nb::module_::import_("atexit");
          atexit.attr("register")(
              nb::cpp_function(&mx::detail::compile_clear_cache));
          return true;
        }();
        return mlx_func(
            nb::cpp_function(PyCompiledFun{fun, inputs, outputs, shapeless}),
            fun,
            inputs,
            outputs);
      },
      "fun"_a,
      "inputs"_a = nb::none(),
      "outputs"_a = nb::none(),
      "shapeless"_a = false,
      nb::sig(
          "def compile(fun: Callable[P, R], inputs: Optional[object] = None, outputs: Optional[object] = None, shapeless: bool = False) -> Callable[P, R]"),
      R"pbdoc(
        Returns a compiled function which produces the same output as ``fun``.

        Args:
            fun (Callable): A function which takes a variable number of
              :class:`array` or trees of :class:`array` and returns
              a variable number of :class:`array` or trees of :class:`array`.
            inputs (list or dict, optional): These inputs will be captured during
              the function compilation along with the inputs to ``fun``. The ``inputs``
              can be a :obj:`list` or a :obj:`dict` containing arbitrarily nested
              lists, dictionaries, or arrays. Leaf nodes that are not
              :obj:`array` are ignored. Default: ``None``
            outputs (list or dict, optional): These outputs will be captured and
              updated in a compiled function. The ``outputs`` can be a
              :obj:`list` or a :obj:`dict` containing arbitrarily nested lists,
              dictionaries, or arrays. Leaf nodes that are not :obj:`array` are ignored.
              Default: ``None``
            shapeless (bool, optional): A function compiled with the ``shapeless``
              option enabled will not be recompiled when the input shape changes. Not all
              functions can be compiled with ``shapeless`` enabled. Attempting to compile
              such functions with shapeless enabled will throw. Note, changing the number
              of dimensions or type of any input will result in a recompilation even with
              ``shapeless`` set to ``True``. Default: ``False``

        Returns:
            Callable: A compiled function which has the same input arguments
            as ``fun`` and returns the same output(s).
      )pbdoc");
  m.def(
      "disable_compile",
      &mx::disable_compile,
      R"pbdoc(
        Globally disable compilation. Setting the environment variable
        ``MLX_DISABLE_COMPILE`` can also be used to disable compilation.
      )pbdoc");
  m.def(
      "enable_compile",
      &mx::enable_compile,
      R"pbdoc(
        Globally enable compilation. This will override the environment
        variable ``MLX_DISABLE_COMPILE`` if set.
      )pbdoc");
  m.def(
      "checkpoint",
      [](nb::callable fun) { return mlx_func(PyCheckpointedFun{fun}, fun); },
      "fun"_a,
      nb::sig("def checkpoint(fun: Callable[P, R]) -> Callable[P, R]"),
      R"pbdoc(
      Transform the passed callable to one that performs gradient
      checkpointing with respect to the inputs of the callable.

      Use this to reduce memory use for gradient computations at the expense of
      increased computation.

      Args:
          fun (Callable): The function to checkpoint.

      Returns:
          A callable that recomputes intermediate states during gradient
          computation.
      )pbdoc");
}


================================================
FILE: python/src/trees.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "python/src/trees.h"

// Check that every entry of `subtrees` is a valid prefix of the first one.
// T is the container type of the first entry; U and V are the other container
// types. An entry is rejected when it is a T of a different size than the
// first entry, or when it is a U or a V. Any other entry is accepted.
// Throws std::invalid_argument on the first rejected entry.
template <typename T, typename U, typename V>
void validate_subtrees(const std::vector<nb::object>& subtrees) {
  int expected = nb::cast<T>(subtrees[0]).size();
  for (const auto& candidate : subtrees) {
    const bool wrong_length = nb::isinstance<T>(candidate) &&
        nb::cast<T>(candidate).size() != expected;
    const bool wrong_container =
        nb::isinstance<U>(candidate) || nb::isinstance<V>(candidate);
    if (wrong_length || wrong_container) {
      throw std::invalid_argument(
          "[tree_map] Additional input tree is not a valid prefix of the first tree.");
    }
  }
}

// Map `transform` over several trees at once and return a new tree.
//
// The first tree decides the traversal: lists, tuples and dicts are descended
// into, anything else is a leaf. At each step every other tree either is the
// same kind of container (and is indexed alongside the first tree) or is
// handed down unchanged as a prefix. `transform` receives, for every leaf of
// the first tree, the matching node of each tree.
//
// Throws std::invalid_argument when an additional tree is not a valid prefix
// of the first one.
nb::object tree_map(
    const std::vector<nb::object>& trees,
    std::function<nb::object(const std::vector<nb::object>&)> transform) {
  using NodeList = std::vector<nb::object>;
  std::function<nb::object(const NodeList&)> descend;

  descend = [&](const NodeList& nodes) {
    const size_t num_trees = nodes.size();

    if (nb::isinstance<nb::list>(nodes[0])) {
      validate_subtrees<nb::list, nb::tuple, nb::dict>(nodes);
      NodeList children(num_trees);
      nb::list mapped;
      for (int i = 0; i < nb::cast<nb::list>(nodes[0]).size(); ++i) {
        for (size_t t = 0; t < num_trees; ++t) {
          if (nb::isinstance<nb::list>(nodes[t])) {
            children[t] = nb::cast<nb::list>(nodes[t])[i];
          } else {
            children[t] = nodes[t];
          }
        }
        mapped.append(descend(children));
      }
      return nb::cast<nb::object>(mapped);
    }

    if (nb::isinstance<nb::tuple>(nodes[0])) {
      int length = nb::cast<nb::tuple>(nodes[0]).size();
      validate_subtrees<nb::tuple, nb::list, nb::dict>(nodes);
      auto tuple_type = nodes[0].type();
      NodeList children(num_trees);
      nb::list mapped;
      for (int i = 0; i < length; ++i) {
        for (size_t t = 0; t < num_trees; ++t) {
          if (nb::isinstance<nb::tuple>(nodes[t])) {
            children[t] = nb::cast<nb::tuple>(nodes[t])[i];
          } else {
            children[t] = nodes[t];
          }
        }
        mapped.append(descend(children));
      }
      if (PyTuple_CheckExact(nodes[0].ptr())) {
        return nb::cast<nb::object>(nb::tuple(mapped));
      }
      // Tuple subclasses are rebuilt through their own type: types exposing
      // `_fields` receive the elements unpacked, others receive the list.
      return nb::hasattr(tuple_type, "_fields") ? tuple_type(*mapped)
                                                : tuple_type(mapped);
    }

    if (nb::isinstance<nb::dict>(nodes[0])) {
      validate_subtrees<nb::dict, nb::list, nb::tuple>(nodes);
      NodeList children(num_trees);
      nb::dict mapped;
      for (auto entry : nb::cast<nb::dict>(nodes[0])) {
        for (size_t t = 0; t < num_trees; ++t) {
          if (nb::isinstance<nb::dict>(nodes[t])) {
            auto other = nb::cast<nb::dict>(nodes[t]);
            if (!other.contains(entry.first)) {
              throw std::invalid_argument(
                  "[tree_map] Tree is not a valid prefix tree of the first tree.");
            }
            children[t] = other[entry.first];
          } else {
            children[t] = nodes[t];
          }
        }
        mapped[entry.first] = descend(children);
      }
      return nb::cast<nb::object>(mapped);
    }

    return transform(nodes);
  };
  return descend(trees);
}

// Single-tree convenience overload: maps `transform` over every leaf of
// `tree` by delegating to the multi-tree tree_map with a one-element list.
nb::object tree_map(
    nb::object tree,
    std::function<nb::object(nb::handle)> transform) {
  // Take the node list by const reference so that it is not copied for every
  // leaf that is visited.
  return tree_map({tree}, [&](const std::vector<nb::object>& inputs) {
    return transform(inputs[0]);
  });
}

// Visit several trees at once.
//
// The first tree decides the traversal: lists, tuples and dicts are descended
// into, anything else is a leaf. At each step every other tree either is the
// same kind of container (and is indexed alongside the first tree) or is
// handed down unchanged as a prefix. `visitor` receives, for every leaf of
// the first tree, the matching node of each tree.
//
// Throws std::invalid_argument when an additional tree is not a valid prefix
// of the first one.
void tree_visit(
    const std::vector<nb::object>& trees,
    std::function<void(const std::vector<nb::object>&)> visitor) {
  using NodeList = std::vector<nb::object>;
  std::function<void(const NodeList&)> descend;

  descend = [&](const NodeList& nodes) {
    const size_t num_trees = nodes.size();

    if (nb::isinstance<nb::list>(nodes[0])) {
      NodeList children(num_trees);
      validate_subtrees<nb::list, nb::tuple, nb::dict>(nodes);
      for (int i = 0; i < nb::cast<nb::list>(nodes[0]).size(); ++i) {
        for (size_t t = 0; t < num_trees; ++t) {
          if (nb::isinstance<nb::list>(nodes[t])) {
            children[t] = nb::cast<nb::list>(nodes[t])[i];
          } else {
            children[t] = nodes[t];
          }
        }
        descend(children);
      }
      return;
    }

    if (nb::isinstance<nb::tuple>(nodes[0])) {
      NodeList children(num_trees);
      int length = nb::cast<nb::tuple>(nodes[0]).size();
      validate_subtrees<nb::tuple, nb::list, nb::dict>(nodes);
      for (int i = 0; i < length; ++i) {
        for (size_t t = 0; t < num_trees; ++t) {
          if (nb::isinstance<nb::tuple>(nodes[t])) {
            children[t] = nb::cast<nb::tuple>(nodes[t])[i];
          } else {
            children[t] = nodes[t];
          }
        }
        descend(children);
      }
      return;
    }

    if (nb::isinstance<nb::dict>(nodes[0])) {
      NodeList children(num_trees);
      validate_subtrees<nb::dict, nb::list, nb::tuple>(nodes);
      for (auto entry : nb::cast<nb::dict>(nodes[0])) {
        for (size_t t = 0; t < num_trees; ++t) {
          if (nb::isinstance<nb::dict>(nodes[t])) {
            auto other = nb::cast<nb::dict>(nodes[t]);
            if (!other.contains(entry.first)) {
              throw std::invalid_argument(
                  "[tree_visit] Tree is not a valid prefix tree of the first tree.");
            }
            children[t] = other[entry.first];
          } else {
            children[t] = nodes[t];
          }
        }
        descend(children);
      }
      return;
    }

    visitor(nodes);
  };
  return descend(trees);
}

// Visit every leaf of a single tree. Elements of lists and tuples and the
// values of dicts are descended into; any other node is handed to `visitor`.
void tree_visit(nb::handle tree, std::function<void(nb::handle)> visitor) {
  std::function<void(nb::handle)> walk;
  walk = [&](nb::handle node) {
    const bool is_sequence =
        nb::isinstance<nb::list>(node) || nb::isinstance<nb::tuple>(node);
    if (is_sequence) {
      for (auto element : node) {
        walk(element);
      }
      return;
    }
    if (nb::isinstance<nb::dict>(node)) {
      for (auto entry : nb::cast<nb::dict>(node)) {
        walk(entry.second);
      }
      return;
    }
    visitor(node);
  };

  walk(tree);
}

// Walk `tree` and replace every array leaf with the value returned by
// `visitor`. Lists and dicts are updated in place. Tuples are rebuilt as new
// objects, which take effect through the enclosing list or dict. Leaves that
// are not arrays are kept as they are. The value computed for the root itself
// is discarded.
void tree_visit_update(
    nb::object tree,
    std::function<nb::object(nb::handle)> visitor) {
  std::function<nb::object(nb::handle)> update;
  update = [&](nb::handle node) {
    if (nb::isinstance<nb::list>(node)) {
      auto items = nb::cast<nb::list>(node);
      for (int i = 0; i < items.size(); ++i) {
        items[i] = update(items[i]);
      }
      return nb::cast<nb::object>(items);
    }

    if (nb::isinstance<nb::tuple>(node)) {
      auto tuple_type = node.type();
      nb::list items(node);
      for (int i = 0; i < items.size(); ++i) {
        items[i] = update(items[i]);
      }
      if (PyTuple_CheckExact(node.ptr())) {
        return nb::cast<nb::object>(nb::tuple(items));
      }
      // Tuple subclasses are rebuilt through their own type: types exposing
      // `_fields` receive the elements unpacked, others receive the list.
      return nb::hasattr(tuple_type, "_fields") ? tuple_type(*items)
                                                : tuple_type(items);
    }

    if (nb::isinstance<nb::dict>(node)) {
      auto mapping = nb::cast<nb::dict>(node);
      for (auto entry : mapping) {
        mapping[entry.first] = update(entry.second);
      }
      return nb::cast<nb::object>(mapping);
    }

    if (nb::isinstance<mx::array>(node)) {
      return visitor(node);
    }

    return nb::cast<nb::object>(node);
  };
  update(tree);
}

// Overwrite the array leaves of a pytree with consecutive entries of `values`,
// in the order in which tree_visit_update reaches them. The tree is updated
// through tree_visit_update; leaves that are not arrays are left unchanged.
// `values` is indexed without a bounds check.
void tree_fill(nb::object& tree, const std::vector<mx::array>& values) {
  size_t next = 0;
  auto take_next = [&](nb::handle /* leaf */) {
    return nb::cast(values[next++]);
  };
  tree_visit_update(tree, take_next);
}

// Replace, inside `tree`, every array whose id matches src[i] with dst[i].
// Arrays that match no entry of `src` are written back unchanged. `dst` is
// indexed with the indices of `src`, so it needs at least as many entries.
void tree_replace(
    nb::object& tree,
    const std::vector<mx::array>& src,
    const std::vector<mx::array>& dst) {
  // Lookup table from the id of a source array to its replacement.
  std::unordered_map<uintptr_t, mx::array> replacements;
  for (size_t i = 0; i < src.size(); ++i) {
    replacements.insert({src[i].id(), dst[i]});
  }

  auto swap_leaf = [&](nb::handle leaf) {
    auto current = nb::cast<mx::array>(leaf);
    auto match = replacements.find(current.id());
    if (match == replacements.end()) {
      return nb::cast(current);
    }
    return nb::cast(match->second);
  };
  tree_visit_update(tree, swap_leaf);
}

// Collect the array leaves of `tree` into a vector, in visiting order.
// When `strict` is true a leaf that is not an array raises
// std::invalid_argument; otherwise such leaves are skipped.
std::vector<mx::array> tree_flatten(nb::handle tree, bool strict /* = true */) {
  std::vector<mx::array> leaves;

  auto collect = [&](nb::handle leaf) {
    if (!nb::isinstance<mx::array>(leaf)) {
      if (strict) {
        throw std::invalid_argument(
            "[tree_flatten] The argument should contain only arrays");
      }
      return;
    }
    leaves.push_back(nb::cast<mx::array>(leaf));
  };
  tree_visit(tree, collect);

  return leaves;
}

// Return a copy of `tree` in which every array leaf is replaced by the next
// entry of `values`, starting at position `index`. Leaves that are not arrays
// are carried over unchanged.
nb::object tree_unflatten(
    nb::object tree,
    const std::vector<mx::array>& values,
    int index /* = 0 */) {
  auto substitute = [&](nb::handle leaf) {
    if (!nb::isinstance<mx::array>(leaf)) {
      return nb::cast<nb::object>(leaf);
    }
    return nb::cast(values[index++]);
  };
  return tree_map(tree, substitute);
}

// Return the unique placeholder object that marks array positions in a tree
// structure. It is a capsule created on first use and stored in a function
// local static.
nb::object structure_sentinel() {
  static nb::object sentinel;

  if (sentinel.ptr() != nullptr) {
    return sentinel;
  }

  sentinel = nb::capsule(&sentinel);
  // probably not needed but this should make certain that we won't ever
  // delete the sentinel
  sentinel.inc_ref();
  return sentinel;
}

// Flatten `tree` into its array leaves and, at the same time, build a
// "structure" tree: a copy of `tree` in which every array is replaced by the
// structure sentinel. The pair can later be recombined with
// tree_unflatten_from_structure.
//
// When `strict` is true a leaf that is not an array raises
// std::invalid_argument; otherwise such leaves are kept in the structure.
std::pair<std::vector<mx::array>, nb::object> tree_flatten_with_structure(
    nb::object tree,
    bool strict /* = true */) {
  auto sentinel = structure_sentinel();
  std::vector<mx::array> flat_tree;
  auto structure = tree_map(
      tree,
      [&flat_tree, sentinel = std::move(sentinel), strict](nb::handle obj) {
        if (nb::isinstance<mx::array>(obj)) {
          flat_tree.push_back(nb::cast<mx::array>(obj));
          return sentinel;
        } else if (!strict) {
          return nb::cast<nb::object>(obj);
        } else {
          throw std::invalid_argument(
              "[tree_flatten] The argument should contain only arrays");
        }
      });

  // Move the results into the returned pair instead of copying the whole
  // vector of arrays and the structure object.
  return {std::move(flat_tree), std::move(structure)};
}

// Rebuild a tree from a structure produced by tree_flatten_with_structure:
// every occurrence of the structure sentinel is replaced by the next entry of
// `values`, starting at position `index`. All other leaves are carried over
// unchanged.
nb::object tree_unflatten_from_structure(
    nb::object structure,
    const std::vector<mx::array>& values,
    int index /* = 0 */) {
  auto placeholder = structure_sentinel();
  auto restore = [&](nb::handle leaf) {
    if (!leaf.is(placeholder)) {
      return nb::cast<nb::object>(leaf);
    }
    return nb::cast(values[index++]);
  };
  return tree_map(structure, restore);
}


================================================
FILE: python/src/trees.h
================================================
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <nanobind/nanobind.h>

#include "mlx/array.h"

namespace mx = mlx::core;
namespace nb = nanobind;

/**
 * Visit several trees together, calling `visitor` with a vector of objects.
 * NOTE(review): the implementation is not in this header; presumably the
 * vector holds the corresponding node of each input tree -- confirm.
 */
void tree_visit(
    const std::vector<nb::object>& trees,
    std::function<void(const std::vector<nb::object>&)> visitor);
/**
 * Single-tree variant: `visitor` receives a handle for each visited object.
 */
void tree_visit(nb::handle tree, std::function<void(nb::handle)> visitor);

/**
 * Map `transform` over several trees together and return the resulting
 * object.
 * NOTE(review): presumably the result mirrors the structure of the first
 * tree -- confirm against the implementation.
 */
nb::object tree_map(
    const std::vector<nb::object>& trees,
    std::function<nb::object(const std::vector<nb::object>&)> transform);

/**
 * Single-tree variant: `transform` receives a handle and returns the object
 * that takes its place in the returned tree.
 */
nb::object tree_map(
    nb::object tree,
    std::function<nb::object(nb::handle)> transform);

/**
 * Visit `tree` with a visitor that returns an object for each visited handle.
 * NOTE(review): the name suggests the returned objects are written back into
 * `tree` in place -- confirm against the implementation.
 */
void tree_visit_update(
    nb::object tree,
    std::function<nb::object(nb::handle)> visitor);

/**
 * Fill a pytree (recursive dict or list of dict or list) in place with the
 * given arrays. */
void tree_fill(nb::object& tree, const std::vector<mx::array>& values);

/**
 * Replace all the arrays from the src values with the dst values in the
 * tree.
 */
void tree_replace(
    nb::object& tree,
    const std::vector<mx::array>& src,
    const std::vector<mx::array>& dst);

/**
 * Flatten a tree into a vector of arrays. If strict is true, then the
 * function will throw if the tree contains a leaf which is not an array.
 */
std::vector<mx::array> tree_flatten(nb::handle tree, bool strict = true);

/**
 * Unflatten a tree from a vector of arrays.
 *
 * Every array leaf of `tree` is replaced by the next element of `values`,
 * starting at position `index`; other leaves are kept. The implementation
 * does not check that `values` holds enough elements.
 */
nb::object tree_unflatten(
    nb::object tree,
    const std::vector<mx::array>& values,
    int index = 0);

/**
 * Flatten a tree into its array leaves plus a "structure" tree in which each
 * array leaf has been replaced by a sentinel object. If strict is true, the
 * function throws std::invalid_argument when a leaf is not an array;
 * otherwise such leaves are kept in the structure.
 */
std::pair<std::vector<mx::array>, nb::object> tree_flatten_with_structure(
    nb::object tree,
    bool strict = true);

/**
 * Rebuild a tree from a structure produced by tree_flatten_with_structure:
 * each sentinel is replaced by the next element of `values`, starting at
 * position `index`. The implementation does not check that `values` holds
 * enough elements.
 */
nb::object tree_unflatten_from_structure(
    nb::object structure,
    const std::vector<mx::array>& values,
    int index = 0);


================================================
FILE: python/src/utils.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "python/src/utils.h"
#include "mlx/ops.h"
#include "mlx/utils.h"
#include "python/src/convert.h"

// Convert a Python scalar or array-like held in `v` to an mx::array.
//
// `dtype`, when given, is the requested output type:
// - bool:    uses `dtype` or bool_.
// - int:     defaults to int32, or int64 when the value does not fit in an
//            int. Throws std::invalid_argument if the value overflows a
//            requested integer type narrower than 8 bytes. A requested bool_
//            is promoted to int32.
// - float:   uses `dtype` only if it is a floating type, otherwise float32.
// - complex: always complex64.
// - mx::array: returned unchanged.
// - ndarray: converted with nd_array_to_mlx.
// - anything else: resolved through to_array_with_accessor.
mx::array to_array(
    const ScalarOrArray& v,
    std::optional<mx::Dtype> dtype /* = std::nullopt */) {
  if (auto as_bool = std::get_if<nb::bool_>(&v)) {
    return mx::array(nb::cast<bool>(*as_bool), dtype.value_or(mx::bool_));
  }

  if (auto as_int = std::get_if<nb::int_>(&v)) {
    auto val = nb::cast<int64_t>(*as_int);
    bool fits_in_int = val >= std::numeric_limits<int>::min() &&
        val <= std::numeric_limits<int>::max();
    auto out_t = dtype.value_or(fits_in_int ? mx::int32 : mx::int64);

    // Range check for integer targets narrower than 64 bits.
    if (mx::issubdtype(out_t, mx::integer) && out_t.size() < 8) {
      auto info = mx::iinfo(out_t);
      bool overflows = val < info.min || val > static_cast<int64_t>(info.max);
      if (overflows) {
        std::ostringstream msg;
        msg << "Converting " << val << " to " << out_t
            << " would result in overflow.";
        throw std::invalid_argument(msg.str());
      }
    }

    // bool_ is an exception and is always promoted
    if (out_t == mx::bool_) {
      return mx::array(val, mx::int32);
    }
    return mx::array(val, out_t);
  }

  if (auto as_float = std::get_if<nb::float_>(&v)) {
    auto requested = dtype.value_or(mx::float32);
    auto out_t =
        mx::issubdtype(requested, mx::floating) ? requested : mx::float32;
    return mx::array(nb::cast<float>(*as_float), out_t);
  }

  if (auto as_complex = std::get_if<std::complex<float>>(&v)) {
    return mx::array(static_cast<mx::complex64_t>(*as_complex), mx::complex64);
  }

  if (auto as_array = std::get_if<mx::array>(&v)) {
    return *as_array;
  }

  if (auto as_ndarray =
          std::get_if<nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu>>(
              &v)) {
    return nd_array_to_mlx(*as_ndarray, dtype);
  }

  return to_array_with_accessor(std::get<ArrayLike>(v).obj);
}

// Convert the two operands of a binary operation to mlx arrays.
//
// An operand counts as an "mlx array" when it already is an mx::array or when
// it is an ArrayLike object exposing an `__mlx_array__` method. When exactly
// one operand is an mlx array, the other one is converted with that array's
// dtype as the requested type (see to_array for how the request is honoured).
std::pair<mx::array, mx::array> to_arrays(
    const ScalarOrArray& a,
    const ScalarOrArray& b) {
  // Four cases:
  // - If both a and b are arrays leave their types alone
  // - If a is an array but b is not, treat b as a weak python type
  // - If b is an array but a is not, treat a as a weak python type
  // - If neither is an array convert to arrays but leave their types alone
  auto is_mlx_array = [](const ScalarOrArray& x) {
    return std::holds_alternative<mx::array>(x) ||
        (std::holds_alternative<ArrayLike>(x) &&
         nb::hasattr(std::get<ArrayLike>(x).obj, "__mlx_array__"));
  };
  auto get_mlx_array = [](const ScalarOrArray& x) {
    if (auto px = std::get_if<mx::array>(&x); px) {
      return *px;
    } else {
      // `__mlx_array__` is a method (to_array_with_accessor invokes it the
      // same way): call it and cast the result, rather than casting the bound
      // method object itself.
      return nb::cast<mx::array>(
          std::get<ArrayLike>(x).obj.attr("__mlx_array__")());
    }
  };

  if (is_mlx_array(a)) {
    auto arr_a = get_mlx_array(a);
    if (is_mlx_array(b)) {
      auto arr_b = get_mlx_array(b);
      return {arr_a, arr_b};
    }
    return {arr_a, to_array(b, arr_a.dtype())};
  } else if (is_mlx_array(b)) {
    auto arr_b = get_mlx_array(b);
    return {to_array(a, arr_b.dtype()), arr_b};
  } else {
    return {to_array(a), to_array(b)};
  }
}

// Obtain an mx::array from a Python object: either the object already is an
// array, or it provides an `__mlx_array__()` method whose result is cast to
// one. Any other object raises std::invalid_argument naming its type.
mx::array to_array_with_accessor(nb::object obj) {
  if (nb::isinstance<mx::array>(obj)) {
    return nb::cast<mx::array>(obj);
  }

  if (nb::hasattr(obj, "__mlx_array__")) {
    auto converted = obj.attr("__mlx_array__")();
    return nb::cast<mx::array>(converted);
  }

  std::ostringstream msg;
  msg << "Invalid type " << nb::type_name(obj.type()).c_str()
      << " received in array initialization.";
  throw std::invalid_argument(msg.str());
}


================================================
FILE: python/src/utils.h
================================================
// Copyright © 2023-2024 Apple Inc.
#pragma once
#include <numeric>
#include <optional>
#include <string>
#include <variant>

#include <nanobind/nanobind.h>
#include <nanobind/ndarray.h>
#include <nanobind/stl/complex.h>
#include <nanobind/stl/variant.h>

#include "mlx/array.h"
#include "python/src/convert.h"

namespace mx = mlx::core;
namespace nb = nanobind;

using IntOrVec = std::variant<std::monostate, int, std::vector<int>>;
using ScalarOrArray = std::variant<
    nb::bool_,
    nb::int_,
    nb::float_,
    // Must be above ndarray
    mx::array,
    // Must be above complex
    nb::ndarray<nb::ro, nb::c_contig, nb::device::cpu>,
    std::complex<float>,
    ArrayLike>;

inline std::vector<int> get_reduce_axes(const IntOrVec& v, int dims) {
  std::vector<int> axes;
  if (std::holds_alternative<std::monostate>(v)) {
    axes.resize(dims);
    std::iota(axes.begin(), axes.end(), 0);
  } else if (auto pv = std::get_if<int>(&v); pv) {
    axes.push_back(*pv);
  } else {
    axes = std::get<std::vector<int>>(v);
  }
  return axes;
}

inline bool is_comparable_with_array(const ScalarOrArray& v) {
  // Tells whether the value can be compared to an array (or already is an
  // mlx array).
  auto wrapped = std::get_if<ArrayLike>(&v);
  if (!wrapped) {
    // Every other alternative (nb::int_, nb::float_, etc.) is treated as
    // comparable with an array.
    return true;
  }
  // A generic object qualifies only if it is an array or knows how to
  // produce one.
  auto obj = wrapped->obj;
  if (nb::isinstance<mx::array>(obj)) {
    return true;
  }
  return nb::hasattr(obj, "__mlx_array__");
}

// Return a non-owning handle to the Python object stored in the ArrayLike
// alternative of `v`. std::get throws std::bad_variant_access when `v` holds
// another alternative.
inline nb::handle get_handle_of_object(const ScalarOrArray& v) {
  const auto& wrapped = std::get<ArrayLike>(v);
  return nb::handle(wrapped.obj.ptr());
}

// Throw std::invalid_argument reporting that `operation` cannot be applied
// between an mlx.core.array and the Python type of `operand`.
//
// `operand` must hold the ArrayLike alternative: its type name is obtained
// through get_handle_of_object, which uses std::get<ArrayLike>.
// The operand is taken by const reference so that the variant is not copied
// just to build an error message.
inline void throw_invalid_operation(
    const std::string& operation,
    const ScalarOrArray& operand) {
  std::ostringstream msg;
  msg << "Cannot perform " << operation << " on an mlx.core.array and "
      << nb::type_name(get_handle_of_object(operand).type()).c_str();
  throw std::invalid_argument(msg.str());
}

// Convert a Python scalar, mlx array, ndarray or array-like object to an
// mx::array. `dtype`, when provided, is the requested output type.
mx::array to_array(
    const ScalarOrArray& v,
    std::optional<mx::Dtype> dtype = std::nullopt);

// Convert both operands of a binary operation to mx::arrays. When exactly one
// of them is an mlx array, its dtype is passed as the requested type for the
// conversion of the other operand.
std::pair<mx::array, mx::array> to_arrays(
    const ScalarOrArray& a,
    const ScalarOrArray& b);

// Return `obj` as an mx::array, either directly or through its
// `__mlx_array__()` method. Throws std::invalid_argument for other objects.
mx::array to_array_with_accessor(nb::object obj);


================================================
FILE: python/tests/__main__.py
================================================
# Entry point used when the tests package is executed as a module.
from . import mlx_tests

# NOTE(review): presumably the unittest convention that hides this module's
# frames from failure tracebacks -- confirm.
__unittest = True

# MLXTestRunner (see mlx_tests.py) subclasses unittest.TestProgram;
# instantiating it builds and runs the test program.
mlx_tests.MLXTestRunner(module=None)


================================================
FILE: python/tests/cuda_skip.py
================================================
# Test identifiers, written as "<TestClass>.<test_method>", that
# MLXTestRunner.createTests (mlx_tests.py) removes from the suite when the
# selected device is the GPU and Metal is not available (assumed to be the
# CUDA backend). Entries are grouped by the feature that is not yet
# implemented (NYI).
cuda_skip = {
    "TestLayers.test_quantized_embedding",
    # Block masked matmul NYI
    "TestBlas.test_block_masked_matmul",
    # Gather matmul NYI
    "TestBlas.test_gather_matmul",
    "TestBlas.test_gather_matmul_grad",
    "TestBlas.test_gather_mm_sorted_vjp",
    # Lapack ops NYI
    "TestLinalg.test_cholesky",
    "TestLinalg.test_cholesky_inv",
    "TestLinalg.test_eig",
    "TestLinalg.test_eigh",
    "TestLinalg.test_inverse",
    "TestVmap.test_vmap_inverse",
    "TestLinalg.test_lu",
    "TestLinalg.test_lu_factor",
    "TestLinalg.test_pseudo_inverse",
    "TestLinalg.test_qr_factorization",
    "TestInit.test_orthogonal",
    "TestLinalg.test_svd_decomposition",
    "TestVmap.test_vmap_svd",
    "TestLinalg.test_tri_inverse",
    # Quantization NYI
    "TestQuantized.test_gather_matmul_grad",
    "TestQuantized.test_gather_qmm",
    "TestQuantized.test_gather_qmm_sorted",
    "TestQuantized.test_gather_qmm_grad",
    "TestQuantized.test_non_multiples",
    "TestQuantized.test_qmm",
    "TestQuantized.test_qmm_jvp",
    "TestQuantized.test_qmm_shapes",
    "TestQuantized.test_qmm_vjp",
    "TestQuantized.test_fp_qvm",
    "TestQuantized.test_qvm",
    "TestQuantized.test_qvm_splitk",
    "TestQuantized.test_qmv_small_non_multiples",
    "TestQuantized.test_small_matrix",
    "TestQuantized.test_throw",
    "TestQuantized.test_vjp_scales_biases",
    "TestExportImport.test_export_quantized_model",
}


================================================
FILE: python/tests/mlx_distributed_tests.py
================================================
# Copyright © 2025 Apple Inc.

import mlx.core as mx
import mlx.nn as nn
import mlx_tests
from mlx.nn.layers.distributed import shard_inplace, shard_linear
from mlx.nn.utils import average_gradients


class MLXDistributedCommonTestCase(mlx_tests.MLXTestCase):
    def test_average_gradients(self):
        # Wrap all_sum with a counting proxy and check how many reductions
        # average_gradients issues for different all_reduce_size settings.
        real_all_sum = mx.distributed.all_sum
        n_calls = 0
        xtype = None

        def counting_all_sum(x, **kwargs):
            nonlocal n_calls, xtype

            n_calls += 1
            # Only verified when an expected dtype has been set
            if xtype is not None:
                self.assertEqual(xtype, x.dtype)

            return real_all_sum(x, **kwargs)

        mx.distributed.all_sum = counting_all_sum

        try:
            grads = [mx.ones(10) for _ in range(10)]

            # (extra keyword arguments, expected number of all_sum calls)
            scenarios = (
                ({}, 1),
                ({"all_reduce_size": 4 * 50}, 2),
                ({"all_reduce_size": 0}, 10),
            )
            for extra_kwargs, expected_calls in scenarios:
                n_calls = 0
                averaged = average_gradients(grads, **extra_kwargs)
                mx.eval(averaged)
                self.assertEqual(len(averaged), 10)
                self.assertTrue(all(mx.all(g == 1) for g in averaged))
                self.assertEqual(n_calls, expected_calls)

        finally:
            # Always restore the real implementation
            mx.distributed.all_sum = real_all_sum

    def test_all_reduce(self):
        # Every rank builds the same (group size, *shape) tensor from a shared
        # key, contributes its own row, and compares the distributed reduction
        # with the local reduction over axis 0.
        group = mx.distributed.init()
        # (dtype, relative tolerance); a tolerance of 0 means exact match
        dtype_tolerances = [
            (mx.int8, 0),
            (mx.uint8, 0),
            (mx.int32, 0),
            (mx.uint32, 0),
            (mx.float32, 1e-6),
            (mx.float16, 5e-3),
            (mx.bfloat16, 1e-1),
        ]
        shapes = [
            (7,),
            (10,),
            (1024,),
            (1024, 1024),
        ]
        key = mx.random.key(0)

        for dtype, tol in dtype_tolerances:
            for shape in shapes:
                full_shape = (group.size(),) + shape
                full = (mx.random.uniform(shape=full_shape, key=key) * 10).astype(dtype)

                # All sum
                reduced = mx.distributed.all_sum(full[group.rank()], group=group)
                expected = full.sum(0)
                error = (reduced - expected).abs()
                if tol > 0:
                    error /= expected.abs()
                error = error.max()
                self.assertLessEqual(error, tol)

                # All max
                reduced = mx.distributed.all_max(full[group.rank()], group=group)
                expected = full.max(0)
                self.assertTrue(mx.all(reduced == expected))

                # All min
                reduced = mx.distributed.all_min(full[group.rank()], group=group)
                expected = full.min(0)
                self.assertTrue(mx.all(reduced == expected))

    def test_donation(self):
        # Peak memory must be the same whether or not a binary op follows the
        # all_sum.
        data = mx.random.normal((1024,))
        mx.eval(data)
        mx.synchronize()

        mx.reset_peak_memory()
        scale = mx.array(2.0)

        out = mx.distributed.all_sum(data)
        mx.eval(out)
        mx.synchronize()
        peak_all_sum = mx.get_peak_memory()

        out = mx.distributed.all_sum(data) * scale
        mx.eval(out)
        mx.synchronize()
        peak_all_sum_and_mul = mx.get_peak_memory()

        self.assertEqual(peak_all_sum, peak_all_sum_and_mul)

    def test_shard_linear(self):
        # Seed the prng to have the same inputs and weights generated everywhere
        mx.random.seed(0xF0F0F0F0)

        # Prepare inputs
        world = mx.distributed.init()
        col_start = world.rank() * 1024 // world.size()
        col_stop = (world.rank() + 1) * 1024 // world.size()
        cols = (slice(None), slice(col_start, col_stop))
        inputs = mx.random.normal((4, 1024))

        def assert_forward_matches(dense, to_sharded, to_all, **sharded_tol):
            # The "sharded-to-all" layer consumes this rank's input columns and
            # must reproduce the dense output; the "all-to-sharded" layer must
            # reproduce this rank's columns of the dense output.
            ref = dense(inputs)
            out_sharded = to_sharded(inputs)
            out_all = to_all(inputs[cols])
            self.assertTrue(mx.allclose(ref, out_all, atol=self.atol, rtol=self.rtol))
            self.assertTrue(mx.allclose(ref[cols], out_sharded, **sharded_tol))

        # Create and shard some linear layers
        lin = nn.Linear(1024, 1024, bias=True)
        assert_forward_matches(
            lin,
            shard_linear(lin, "all-to-sharded"),
            shard_linear(lin, "sharded-to-all"),
            atol=self.atol,
            rtol=self.rtol,
        )

        # And their quant versions (QuantizedMatmul is not supported on CUDA)
        if not mx.cuda.is_available():
            qlin = lin.to_quantized()
            assert_forward_matches(
                qlin,
                shard_linear(qlin, "all-to-sharded"),
                shard_linear(qlin, "sharded-to-all"),
            )

            # Test non-affine quantization modes (mxfp8)
            qlin_mxfp8 = lin.to_quantized(group_size=32, bits=8, mode="mxfp8")
            self.assertEqual(qlin_mxfp8.mode, "mxfp8")

            sharded_mxfp8 = shard_linear(qlin_mxfp8, "all-to-sharded")
            gathered_mxfp8 = shard_linear(qlin_mxfp8, "sharded-to-all")

            # Verify mode is propagated
            self.assertEqual(sharded_mxfp8.mode, "mxfp8")
            self.assertEqual(gathered_mxfp8.mode, "mxfp8")

            # Verify biases parameter is not set for mxfp8
            self.assertIsNone(sharded_mxfp8.get("biases"))
            self.assertIsNone(gathered_mxfp8.get("biases"))

            assert_forward_matches(qlin_mxfp8, sharded_mxfp8, gathered_mxfp8)

        # Check the backward works as expected
        def dummy_loss(model, x, y):
            return (model(x) * y).sum()

        mod = nn.Sequential(
            nn.Linear(128, 128),
            nn.Linear(128, 128),
            nn.Linear(128, 128),
            nn.Linear(128, 128),
        )
        # Alternate the two sharding schemes across the four layers
        smod = nn.Sequential(
            shard_linear(mod.layers[0], "all-to-sharded"),
            shard_linear(mod.layers[1], "sharded-to-all"),
            shard_linear(mod.layers[2], "all-to-sharded"),
            shard_linear(mod.layers[3], "sharded-to-all"),
        )

        dense_grad_fn = nn.value_and_grad(mod, dummy_loss)
        shard_grad_fn = nn.value_and_grad(smod, dummy_loss)

        small_x = mx.random.normal((4, 128))
        small_y = mx.random.normal((4, 128))

        l1, g1 = dense_grad_fn(mod, small_x, small_y)
        l2, g2 = shard_grad_fn(smod, small_x, small_y)
        mx.eval(l1, g1, l2, g2)

        rows = slice(
            world.rank() * 128 // world.size(), (world.rank() + 1) * 128 // world.size()
        )
        self.assertTrue(mx.allclose(l1, l2))

        dense = g1["layers"]
        sharded = g2["layers"]
        fixed_tol = {"atol": 1e-6, "rtol": 1e-4}
        class_tol = {"atol": self.atol, "rtol": self.rtol}
        # (dense gradient restricted to this rank, sharded gradient, tolerances)
        comparisons = [
            (dense[0]["weight"][rows], sharded[0]["weight"], fixed_tol),
            (dense[2]["weight"][rows], sharded[2]["weight"], fixed_tol),
            (dense[1]["weight"][:, rows], sharded[1]["weight"], fixed_tol),
            (dense[3]["weight"][:, rows], sharded[3]["weight"], fixed_tol),
            (dense[0]["bias"][rows], sharded[0]["bias"], fixed_tol),
            (dense[2]["bias"][rows], sharded[2]["bias"], fixed_tol),
            (dense[1]["bias"], sharded[1]["bias"], class_tol),
            (dense[3]["bias"], sharded[3]["bias"], class_tol),
        ]
        for expected, actual, tol in comparisons:
            self.assertTrue(mx.allclose(expected, actual, **tol))

    def test_shard_predicate(self):
        # Shard a stack of convolutions with a custom predicate and check that
        # the sharded model reproduces the output of the dense one.
        mx.random.seed(0xF0F0F0F0)

        class MyConv(nn.Module):
            def __init__(self, *args, **kwargs):
                super().__init__()
                # "aggregate" is consumed here; everything else goes to Conv2d
                self.aggregate = kwargs.pop("aggregate", False)
                self.conv = nn.Conv2d(*args, **kwargs)

            def __call__(self, x):
                out = self.conv(x)
                return mx.distributed.all_sum(out) if self.aggregate else out

        def sharding(path, weight):
            # Even layers are sharded on axis 0. Odd layers are sharded on the
            # last axis, except for their bias which gets None.
            segments = path.split(".")
            if int(segments[1]) % 2 == 0:
                return 0
            if segments[-1] == "bias":
                return None
            return -1

        channels = [(3, 128), (128, 128), (128, 128), (128, 3)]
        mod = nn.Sequential(
            *[MyConv(c_in, c_out, kernel_size=3) for c_in, c_out in channels]
        )
        # Odd layers sum their outputs across the group
        smod = nn.Sequential(
            *[
                MyConv(c_in, c_out, kernel_size=3, aggregate=(idx % 2 == 1))
                for idx, (c_in, c_out) in enumerate(channels)
            ]
        )
        smod.update(mod.parameters())
        shard_inplace(smod, sharding)

        x = mx.random.normal((4, 16, 16, 3))
        dense_out = mod(x)
        sharded_out = smod(x)
        self.assertTrue(mx.allclose(dense_out, sharded_out, atol=1e-6, rtol=1e-4))

    def test_all_gather(self):
        # Gathering a (2, 2, 4) tensor of ones from every rank must produce a
        # tensor of ones whose first axis is multiplied by the world size.
        world = mx.distributed.init()
        expected_shape = (world.size() * 2, 2, 4)
        for dtype in (
            mx.int8,
            mx.uint8,
            mx.int32,
            mx.uint32,
            mx.float32,
            mx.float16,
            mx.bfloat16,
        ):
            local = mx.ones((2, 2, 4), dtype=dtype)
            gathered = mx.distributed.all_gather(local)
            self.assertEqual(gathered.shape, expected_shape)
            self.assertTrue(mx.all(gathered == 1))


================================================
FILE: python/tests/mlx_tests.py
================================================
# Copyright © 2023 Apple Inc.

import os

# Use regular fp32 precision for tests
os.environ["MLX_ENABLE_TF32"] = "0"

# Do not abort on cache thrashing
os.environ["MLX_ENABLE_CACHE_THRASHING_CHECK"] = "0"

import platform
import unittest
from typing import Any, Callable, List, Tuple, Union

import mlx.core as mx
import numpy as np


class MLXTestRunner(unittest.TestProgram):
    """Test program that removes the tests listed in ``cuda_skip`` when the
    CUDA backend is assumed to be in use."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def createTests(self, *args, **kwargs):
        super().createTests(*args, **kwargs)

        # The DEVICE environment variable, when set, names an attribute of
        # mlx.core; otherwise the current default device is used.
        device_name = os.getenv("DEVICE", None)
        if device_name is None:
            device = mx.default_device()
        else:
            device = getattr(mx, device_name)

        # Assume the CUDA backend when running on a GPU without Metal
        assume_cuda = device == mx.gpu and not mx.metal.is_available()
        if not assume_cuda:
            return

        from cuda_skip import cuda_skip

        def iter_cases(node):
            # Depth-first walk yielding the individual tests of nested suites
            if isinstance(node, unittest.TestSuite):
                for child in node:
                    yield from iter_cases(child)
            else:
                yield node

        kept = unittest.TestSuite()
        for case in iter_cases(self.test):
            # Skip entries are keyed by "<TestClass>.<test_method>"
            short_id = ".".join(case.id().split(".")[-2:])
            if short_id in cuda_skip:
                print(f"Skipping {short_id}")
            else:
                kept.addTest(case)

        self.test = kept


class MLXTestCase(unittest.TestCase):
    """Base test case: applies the DEVICE environment override for each test
    and provides array comparison helpers."""

    @property
    def is_apple_silicon(self):
        """Whether the machine reports arm64 on Darwin."""
        on_arm64 = platform.machine() == "arm64"
        return on_arm64 and platform.system() == "Darwin"

    def setUp(self):
        # Remember the current default device so tearDown can restore it
        self.default = mx.default_device()
        device_name = os.getenv("DEVICE", None)
        if device_name is None:
            return
        mx.set_default_device(getattr(mx, device_name))

    def tearDown(self):
        mx.set_default_device(self.default)

    # Note: a tuple passed in args is treated as a shape request and replaced
    # by an mx.random.normal array of that shape
    def assertCmpNumpy(
        self,
        args: List[Union[Tuple[int], Any]],
        mx_fn: Callable[..., mx.array],
        np_fn: Callable[..., np.array],
        atol=1e-2,
        rtol=1e-2,
        dtype=mx.float32,
        **kwargs,
    ):
        """Run ``mx_fn`` and ``np_fn`` on the same arguments and compare the
        results with assertEqualArray."""
        assert dtype != mx.bfloat16, "numpy does not support bfloat16"

        mx_args = []
        for arg in args:
            if isinstance(arg, Tuple):
                arg = mx.random.normal(arg, dtype=dtype)
            mx_args.append(arg)

        mx_res = mx_fn(*mx_args, **kwargs)

        # numpy receives numpy copies of the mlx arrays
        np_args = [np.array(a) if isinstance(a, mx.array) else a for a in mx_args]
        np_res = np_fn(*np_args, **kwargs)

        return self.assertEqualArray(mx_res, mx.array(np_res), atol=atol, rtol=rtol)

    def assertEqualArray(
        self,
        mx_res: mx.array,
        expected: mx.array,
        atol=1e-2,
        rtol=1e-2,
    ):
        """Assert that both values have the same shape and dtype and are close
        within the given tolerances."""
        self.assertEqual(
            tuple(mx_res.shape),
            tuple(expected.shape),
            msg=f"shape mismatch expected={expected.shape} got={mx_res.shape}",
        )
        self.assertEqual(
            mx_res.dtype,
            expected.dtype,
            msg=f"dtype mismatch expected={expected.dtype} got={mx_res.dtype}",
        )

        res_is_mx = isinstance(mx_res, mx.array)
        expected_is_mx = isinstance(expected, mx.array)

        # Neither side is an mlx array: defer to numpy's comparison
        if not res_is_mx and not expected_is_mx:
            np.testing.assert_allclose(mx_res, expected, rtol=rtol, atol=atol)
            return

        # Exactly one side may need converting at this point
        if not res_is_mx:
            mx_res = mx.array(mx_res)
        elif not expected_is_mx:
            expected = mx.array(expected)

        self.assertTrue(mx.allclose(mx_res, expected, rtol=rtol, atol=atol))


================================================
FILE: python/tests/mpi_test_distributed.py
================================================
# Copyright © 2024 Apple Inc.

import mlx.core as mx
import mlx_distributed_tests
import mlx_tests


class TestMPIDistributed(mlx_distributed_tests.MLXDistributedCommonTestCase):
    @classmethod
    def setUpClass(cls):
        # Initialize the MPI backend (strict=True) and set the tolerances read
        # by the shared distributed tests.
        mx.distributed.init(strict=True, backend="mpi")
        cls.atol, cls.rtol = 1e-6, 1e-4

    def test_groups(self):
        # The test expects a world of exactly 8 ranks
        world = mx.distributed.init()
        rank = world.rank()
        self.assertEqual(world.size(), 8)
        self.assertTrue(0 <= rank < 8)

        # A second init must describe the same group
        again = mx.distributed.init()
        self.assertEqual(world.size(), again.size())
        self.assertEqual(world.rank(), again.rank())

        # Split by parity: two groups of four
        by_parity = world.split(rank % 2)
        self.assertEqual(by_parity.size(), 4)
        self.assertEqual(by_parity.rank(), rank // 2)

        # Split into consecutive pairs
        by_pair = world.split(rank // 2)
        self.assertEqual(by_pair.size(), 2)

    def test_all_reduce_extra(self):
        # all_sum / all_max / all_min for int16, uint16 and complex64, on both
        # the world group and a parity-split sub-group.
        world = mx.distributed.init()
        # (dtype, relative tolerance); a tolerance of 0 means exact match
        dtype_tolerances = [
            (mx.int16, 0),
            (mx.uint16, 0),
            (mx.complex64, 1e-6),
        ]
        shapes = [
            (7,),
            (10,),
            (1024,),
            (1024, 1024),
        ]
        key = mx.random.key(0)
        sub_group = world.split(world.rank() % 2)

        for dtype, tol in dtype_tolerances:
            for shape in shapes:
                for grp in (world, sub_group):
                    full_shape = (grp.size(),) + shape
                    full = (mx.random.uniform(shape=full_shape, key=key) * 10).astype(
                        dtype
                    )

                    # All sum
                    reduced = mx.distributed.all_sum(full[grp.rank()], group=grp)
                    expected = full.sum(0)
                    error = (reduced - expected).abs()
                    if tol > 0:
                        error /= expected.abs()
                    error = error.max()
                    self.assertLessEqual(error, tol)

                    # All max
                    reduced = mx.distributed.all_max(full[grp.rank()], group=grp)
                    expected = full.max(0)
                    self.assertTrue(mx.all(reduced == expected))

                    # All min
                    reduced = mx.distributed.all_min(full[grp.rank()], group=grp)
                    expected = full.min(0)
                    self.assertTrue(mx.all(reduced == expected))

    def test_all_gather_extra(self):
        # all_gather for int16, uint16 and complex64: first on the default
        # group, then on a parity-split sub-group.
        world = mx.distributed.init()
        extra_dtypes = [
            mx.int16,
            mx.uint16,
            mx.complex64,
        ]

        for dtype in extra_dtypes:
            local = mx.ones((2, 2, 4), dtype=dtype)
            gathered = mx.distributed.all_gather(local)
            self.assertEqual(gathered.shape, (world.size() * 2, 2, 4))
            self.assertTrue(mx.all(gathered == 1))

        sub = world.split(world.rank() % 2)
        for dtype in extra_dtypes:
            local = mx.ones((2, 2, 4), dtype=dtype)
            gathered = mx.distributed.all_gather(local, group=sub)
            self.assertEqual(gathered.shape, (sub.size() * 2, 2, 4))
            self.assertTrue(mx.all(gathered == 1))

    def test_mixed(self):
        # Make the following groups (rank of each process inside the group):
        # - world: 0 1 2 3 4 5 6 7
        # - sub_1: 0 1 0 1 0 1 0 1
        # - sub_2: 0 0 1 1 2 2 3 3
        #
        # The corresponding colors to make them are
        # - world: N/A
        # - sub_1: 0 0 1 1 2 2 3 3
        # - sub_2: 0 1 0 1 0 1 0 1

        world = mx.distributed.init()
        rank = world.rank()
        sub_1 = world.split(rank // 2)
        sub_2 = world.split(rank % 2)

        # Sum the ranks inside each pair, then gather the pair sums
        local = mx.ones((1, 8)) * rank
        pair_sum = mx.distributed.all_sum(local, group=sub_1)
        gathered = mx.distributed.all_gather(pair_sum, group=sub_2)
        expected = mx.arange(8).reshape(4, 2).sum(-1, keepdims=True)

        self.assertTrue(mx.all(gathered == expected))

    def test_send_recv(self):
        # Ranks are paired up; inside each pair the two processes alternate
        # between sending twice their buffer and receiving into it.
        world = mx.distributed.init()
        pairs = world.split(world.rank() // 2)
        peer = (pairs.rank() + 1) % 2
        sending = pairs.rank() == 0

        buf = mx.ones(10)
        for _ in range(10):
            if sending:
                mx.eval(mx.distributed.send(2 * buf, peer, group=pairs))
            else:
                buf = mx.distributed.recv_like(buf, peer, group=pairs)
                mx.eval(buf)
            sending = not sending

        expected = 1024 if pairs.rank() == 0 else 512
        self.assertTrue(mx.all(buf == expected))

        # Check recv and computation in same eval:
        other = mx.ones((5, 5)) + mx.array(2.0)
        if sending:
            buf = mx.distributed.send(2 * buf, peer, group=pairs)
        else:
            buf = mx.distributed.recv_like(buf, peer, group=pairs)
        mx.eval(other, buf)


# When executed as a script, run the tests through MLXTestRunner (a
# unittest.TestProgram subclass defined in mlx_tests.py).
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/nccl_test_distributed.py
================================================
# Copyright © 2024 Apple Inc.

import mlx.core as mx
import mlx.optimizers as optim
import mlx_distributed_tests
import mlx_tests
from mlx.nn.utils import average_gradients, fsdp_apply_gradients


class TestNCCLDistributed(mlx_distributed_tests.MLXDistributedCommonTestCase):
    @classmethod
    def setUpClass(cls):
        """Initialise the NCCL backend in strict mode and set class tolerances."""
        mx.distributed.init(strict=True, backend="nccl")
        cls.atol, cls.rtol = 1e-4, 1e-4

    def test_sum_scatter(self):
        """sum_scatter must equal this rank's leading-axis slice of all_sum."""
        world = mx.distributed.init()

        tolerances = [
            (mx.float32, 1e-6),
            (mx.float16, 5e-3),
            (mx.bfloat16, 1e-1),
        ]
        shapes = [(8,), (64,), (1024,), (1024, 1024)]
        # Seeded with the rank, so every process contributes different data.
        key = mx.random.key(world.rank())

        for dtype, rtol in tolerances:
            for shape in shapes:
                x = (mx.random.uniform(shape=shape, key=key) * 10).astype(dtype)

                # Sum scatter
                scattered = mx.distributed.sum_scatter(x)
                summed = mx.distributed.all_sum(x)

                # Slice of the full sum that belongs to this rank.
                rows = shape[0] // world.size()
                begin = world.rank() * rows
                end = begin + rows
                reference = summed[begin:end]

                error = (scattered - reference).abs()
                if rtol > 0:
                    error /= reference.abs()
                error = error.max()
                self.assertLessEqual(error, rtol)

    def test_groups(self):
        """Check world size/rank, repeated init(), and two ways of splitting."""
        world = mx.distributed.init()
        self.assertEqual(world.size(), 8)
        self.assertTrue(0 <= world.rank() < 8)

        # A second init() must describe the same world.
        again = mx.distributed.init()
        self.assertEqual(world.size(), again.size())
        self.assertEqual(world.rank(), again.rank())

        # Split by parity: two groups of four.
        group = world.split(world.rank() % 2)
        self.assertEqual(group.size(), 4)
        self.assertEqual(group.rank(), world.rank() // 2)

        # Split into neighbouring pairs: four groups of two.
        group = world.split(world.rank() // 2)
        self.assertEqual(group.size(), 2)

    def test_all_reduce_split(self):
        """all_sum / all_max / all_min over a parity split match local reductions."""
        world = mx.distributed.init()
        tolerances = [
            (mx.float32, 1e-6),
            (mx.float16, 5e-3),
            (mx.bfloat16, 1e-1),
        ]
        shapes = [(7,), (10,), (1024,), (1024, 1024)]
        # Fixed key: every rank generates the same stacked tensor.
        key = mx.random.key(0)
        group = world.split(world.rank() % 2)

        for dtype, rtol in tolerances:
            for shape in shapes:
                stacked = mx.random.uniform(shape=(group.size(),) + shape, key=key)
                stacked = (stacked * 10).astype(dtype)

                # All sum
                reduced = mx.distributed.all_sum(stacked[group.rank()], group=group)
                reference = stacked.sum(0)
                error = (reduced - reference).abs()
                if rtol > 0:
                    error /= reference.abs()
                error = error.max()
                self.assertLessEqual(error, rtol)

                # All max
                reduced = mx.distributed.all_max(stacked[group.rank()], group=group)
                reference = stacked.max(0)
                self.assertTrue(mx.all(reduced == reference))

                # All min
                reduced = mx.distributed.all_min(stacked[group.rank()], group=group)
                reference = stacked.min(0)
                self.assertTrue(mx.all(reduced == reference))

    def test_all_gather_split(self):
        """all_gather over a parity split concatenates along the first axis."""
        world = mx.distributed.init()
        group = world.split(world.rank() % 2)
        for dtype in (mx.float32, mx.float16, mx.bfloat16):
            local = mx.ones((2, 2, 4), dtype=dtype)
            gathered = mx.distributed.all_gather(local, group=group)
            self.assertEqual(gathered.shape, (group.size() * 2, 2, 4))
            self.assertTrue(mx.all(gathered == 1))

    def test_fsdp_apply_gradients(self):
        """Exercise fsdp_apply_gradients over the default (world) group.

        Three scenarios are checked in order:
        1. a plain SGD step keeps the parameter shapes and yields 0.99;
        2. with ``max_norm`` the call also returns the gradient norm and the
           update is scaled accordingly;
        3. the result agrees with average_gradients + apply_gradients.
        """
        world = mx.distributed.init()
        N = world.size()

        # Leading dimensions are multiples of the world size.
        params = {
            "w1": mx.ones((N * 10, 8)),
            "w2": mx.ones((N * 20,)),
        }
        grads = {
            "w1": mx.ones((N * 10, 8)) * 0.1,
            "w2": mx.ones((N * 20,)) * 0.1,
        }

        # Scenario 1: plain step, no clipping.
        optimizer = optim.SGD(learning_rate=0.1)
        updated_params_fsdp = fsdp_apply_gradients(grads, params, optimizer)
        mx.eval(updated_params_fsdp)

        self.assertEqual(updated_params_fsdp["w1"].shape, (N * 10, 8))
        self.assertEqual(updated_params_fsdp["w2"].shape, (N * 20,))

        # Expected value: 1 - lr * grad = 1 - 0.1 * 0.1 = 0.99.
        self.assertTrue(
            mx.allclose(
                updated_params_fsdp["w1"], mx.ones((N * 10, 8)) * 0.99, atol=1e-6
            )
        )
        self.assertTrue(
            mx.allclose(updated_params_fsdp["w2"], mx.ones((N * 20,)) * 0.99, atol=1e-6)
        )

        # Scenario 2: large gradients with clipping enabled.
        grads = {
            "w1": mx.ones((N * 10, 8)) * 10.0,
            "w2": mx.ones((N * 20,)) * 10.0,
        }

        # With max_norm set, the call returns (params, grad_norm).
        new_params_clipped, grad_norm = fsdp_apply_gradients(
            grads, params, optimizer, max_norm=1.0
        )
        mx.eval(new_params_clipped, grad_norm)

        self.assertIsNotNone(grad_norm)
        # L2 norm of (N*10*8 + N*20) entries that all equal 10.
        expected_norm = mx.sqrt((N * 10 * 8 + N * 20) * 100.0)
        self.assertTrue(mx.allclose(grad_norm, expected_norm, atol=1e-4, rtol=1e-4))
        self.assertEqual(new_params_clipped["w1"].shape, (N * 10, 8))
        self.assertEqual(new_params_clipped["w2"].shape, (N * 20,))

        # Gradients are scaled by max_norm / norm = 1 / expected_norm.
        scale = 1.0 / expected_norm
        expected_update = 1.0 - 0.1 * 10.0 * scale
        self.assertTrue(
            mx.allclose(
                new_params_clipped["w1"],
                mx.ones((N * 10, 8)) * expected_update,
                atol=1e-4,
                rtol=1e-4,
            )
        )
        self.assertTrue(
            mx.allclose(
                new_params_clipped["w2"],
                mx.ones((N * 20,)) * expected_update,
                atol=1e-4,
                rtol=1e-4,
            )
        )
        # Scenario 3: FSDP must agree with the DDP-style reference path.
        params = {"w": mx.ones((N * 4,))}
        grads = {"w": mx.ones((N * 4,)) * 0.5}

        optimizer_fsdp = optim.SGD(learning_rate=0.1)
        updated_params_fsdp = fsdp_apply_gradients(grads, params, optimizer_fsdp)

        optimizer_ddp = optim.SGD(learning_rate=0.1)
        avg_grads = average_gradients(grads)
        updated_params_ddp = optimizer_ddp.apply_gradients(avg_grads, params)
        # Both results are evaluated in a single eval call.
        mx.eval(updated_params_ddp, updated_params_fsdp)

        self.assertTrue(
            mx.allclose(
                updated_params_fsdp["w"], updated_params_ddp["w"], atol=1e-6, rtol=1e-4
            ),
        )

    def test_fsdp_ddp_apply_gradients(self):
        """Exercise fsdp_apply_gradients with separate FSDP and DP groups.

        The world is split into FSDP groups of ``S`` consecutive ranks and DP
        groups of ranks sharing the same position modulo ``S``. The same three
        scenarios as the world-group test are checked: plain step, clipped
        step, and agreement with average_gradients + apply_gradients.
        """
        world = mx.distributed.init()
        N = world.size()
        S = 4
        # Consecutive blocks of S ranks form one FSDP group ...
        fsdp_group = world.split(world.rank() // S)
        # ... and ranks with equal offset inside their block form a DP group.
        dp_group = world.split(world.rank() % S)

        self.assertEqual(fsdp_group.size(), S)
        self.assertEqual(dp_group.size(), N // S)

        # Leading dimensions are multiples of the FSDP group size.
        params = {
            "w1": mx.ones((S * 10, 8)),
            "w2": mx.ones((S * 20,)),
        }
        grads = {
            "w1": mx.ones((S * 10, 8)) * 0.1,
            "w2": mx.ones((S * 20,)) * 0.1,
        }

        # Scenario 1: plain step, no clipping.
        optimizer = optim.SGD(learning_rate=0.1)
        updated = fsdp_apply_gradients(
            grads,
            params,
            optimizer,
            fsdp_group=fsdp_group,
            dp_group=dp_group,
        )
        mx.eval(updated)

        self.assertEqual(updated["w1"].shape, (S * 10, 8))
        self.assertEqual(updated["w2"].shape, (S * 20,))

        # Expected value: 1 - lr * grad = 1 - 0.1 * 0.1 = 0.99.
        self.assertTrue(
            mx.allclose(updated["w1"], mx.ones((S * 10, 8)) * 0.99, atol=1e-6)
        )
        self.assertTrue(
            mx.allclose(updated["w2"], mx.ones((S * 20,)) * 0.99, atol=1e-6)
        )

        # Scenario 2: large gradients with clipping enabled.
        grads_big = {
            "w1": mx.ones((S * 10, 8)) * 10.0,
            "w2": mx.ones((S * 20,)) * 10.0,
        }

        optimizer2 = optim.SGD(learning_rate=0.1)
        # With max_norm set, the call returns (params, grad_norm).
        clipped, grad_norm = fsdp_apply_gradients(
            grads_big,
            params,
            optimizer2,
            fsdp_group=fsdp_group,
            dp_group=dp_group,
            max_norm=1.0,
        )
        mx.eval(clipped, grad_norm)

        self.assertIsNotNone(grad_norm)
        # L2 norm of (S*10*8 + S*20) entries that all equal 10.
        expected_norm = mx.sqrt((S * 10 * 8 + S * 20) * 100.0)
        self.assertTrue(mx.allclose(grad_norm, expected_norm, atol=1e-4, rtol=1e-4))
        self.assertEqual(clipped["w1"].shape, (S * 10, 8))
        self.assertEqual(clipped["w2"].shape, (S * 20,))

        # Gradients are scaled by max_norm / norm = 1 / expected_norm.
        scale = 1.0 / expected_norm
        expected_update = 1.0 - 0.1 * 10.0 * scale
        self.assertTrue(
            mx.allclose(
                clipped["w1"],
                mx.ones((S * 10, 8)) * expected_update,
                atol=1e-4,
                rtol=1e-4,
            )
        )
        self.assertTrue(
            mx.allclose(
                clipped["w2"],
                mx.ones((S * 20,)) * expected_update,
                atol=1e-4,
                rtol=1e-4,
            )
        )

        # Scenario 3: the hybrid path must agree with the DDP-style reference.
        params_eq = {"w": mx.ones((S * 4,))}
        grads_eq = {"w": mx.ones((S * 4,)) * 0.5}

        optimizer_hybrid = optim.SGD(learning_rate=0.1)
        updated_hybrid = fsdp_apply_gradients(
            grads_eq,
            params_eq,
            optimizer_hybrid,
            fsdp_group=fsdp_group,
            dp_group=dp_group,
        )

        optimizer_ddp = optim.SGD(learning_rate=0.1)
        avg_grads = average_gradients(grads_eq)
        updated_ddp = optimizer_ddp.apply_gradients(avg_grads, params_eq)
        # Both results are evaluated in a single eval call.
        mx.eval(updated_hybrid, updated_ddp)

        self.assertTrue(
            mx.allclose(updated_hybrid["w"], updated_ddp["w"], atol=1e-6, rtol=1e-4),
        )

    def test_fsdp_peak_memory(self):
        """FSDP gradient application must peak at less memory than DDP.

        Runs ten clipped Adam steps through the DDP path (average_gradients,
        clip_grad_norm, apply_gradients) and then ten through
        fsdp_apply_gradients, resetting the peak-memory counter before each
        phase and comparing the two peaks reported by mx.get_peak_memory().
        """
        world = mx.distributed.init()
        N = world.size()
        mx.random.seed(42)
        params = {
            "w1": mx.random.normal((N * 1024, 1024)),
            "w2": mx.random.normal((N * 2048, 512)),
        }
        grads = {
            "w1": mx.random.normal((N * 1024, 1024)),
            "w2": mx.random.normal((N * 2048, 512)),
        }
        # Materialise the inputs up front so that neither measured phase
        # includes generating them.
        mx.eval(params, grads)
        optimizer_ddp = optim.Adam(learning_rate=0.01)
        optimizer_fsdp = optim.Adam(learning_rate=0.01)

        def pseudo_step_ddp(grads, params, optimizer):
            # Reference path: average, clip, then apply on full tensors.
            grads = average_gradients(grads)
            grads, grad_norm = optim.clip_grad_norm(grads, max_norm=1.0)
            params = optimizer.apply_gradients(grads, params)
            return grad_norm, params

        def pseudo_step_fsdp(grads, params, optimizer):
            # Path under test: one call that also returns the gradient norm.
            params, grad_norm = fsdp_apply_gradients(
                grads, params, optimizer, max_norm=1.0
            )
            return grad_norm, params

        mx.reset_peak_memory()

        for _ in range(10):
            grad_norm, params = pseudo_step_ddp(grads, params, optimizer_ddp)
            mx.eval(grad_norm, params)

        ddp_peak_memory = mx.get_peak_memory()
        mx.reset_peak_memory()

        for _ in range(10):
            grad_norm, params = pseudo_step_fsdp(grads, params, optimizer_fsdp)
            mx.eval(grad_norm, params)

        fsdp_peak_memory = mx.get_peak_memory()
        # assertLess prints both peaks on failure; assertTrue(a < b) would
        # only report "False is not true".
        self.assertLess(fsdp_peak_memory, ddp_peak_memory)


# When this file is executed as a script, hand control to the project's
# test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/ring_test_distributed.py
================================================
# Copyright © 2024 Apple Inc.

import mlx.core as mx
import mlx_distributed_tests
import mlx_tests


class TestRingDistributed(mlx_distributed_tests.MLXDistributedCommonTestCase):
    @classmethod
    def setUpClass(cls):
        """Initialise the ring backend in strict mode and set class tolerances."""
        mx.distributed.init(strict=True, backend="ring")
        cls.atol, cls.rtol = 1e-6, 1e-4

    def test_groups(self):
        """The world has 8 ranks, init() is repeatable, and split() raises."""
        world = mx.distributed.init()
        self.assertEqual(world.size(), 8)
        self.assertTrue(0 <= world.rank() < 8)

        # A second init() must describe the same world.
        again = mx.distributed.init()
        self.assertEqual(world.size(), again.size())
        self.assertEqual(world.rank(), again.rank())

        # Splitting is expected to fail with RuntimeError on this backend.
        with self.assertRaises(RuntimeError):
            world.split(world.rank() % 2)

    def test_all_reduce_extra(self):
        """All-reduce ops on int16, uint16 and complex64 match local reductions."""
        world = mx.distributed.init()
        tolerances = [
            (mx.int16, 0),
            (mx.uint16, 0),
            (mx.complex64, 1e-6),
        ]
        shapes = [(7,), (10,), (1024,), (1024, 1024)]
        # Fixed key: every rank generates the same stacked tensor.
        key = mx.random.key(0)

        for dtype, rtol in tolerances:
            for shape in shapes:
                stacked = mx.random.uniform(shape=(world.size(),) + shape, key=key)
                stacked = (stacked * 10).astype(dtype)

                # All sum
                reduced = mx.distributed.all_sum(stacked[world.rank()])
                reference = stacked.sum(0)
                error = (reduced - reference).abs()
                # A zero tolerance means the absolute error itself must be 0.
                if rtol > 0:
                    error /= reference.abs()
                error = error.max()
                self.assertLessEqual(error, rtol)

                # All max
                reduced = mx.distributed.all_max(stacked[world.rank()])
                reference = stacked.max(0)
                self.assertTrue(mx.all(reduced == reference))

                # All min
                reduced = mx.distributed.all_min(stacked[world.rank()])
                reference = stacked.min(0)
                self.assertTrue(mx.all(reduced == reference))

    def test_all_gather_extra(self):
        """all_gather of ones in int16, uint16 and complex64 stays all ones."""
        world = mx.distributed.init()
        for dtype in (mx.int16, mx.uint16, mx.complex64):
            local = mx.ones((2, 2, 4), dtype=dtype)
            gathered = mx.distributed.all_gather(local)
            self.assertEqual(gathered.shape, (world.size() * 2, 2, 4))
            self.assertTrue(mx.all(gathered == 1))

    def test_send_recv(self):
        """Pass each rank's row one hop to the right for many dtypes and shapes."""
        world = mx.distributed.init()
        every_dtype = [
            mx.int8,
            mx.uint8,
            mx.int16,
            mx.uint16,
            mx.int32,
            mx.uint32,
            mx.float32,
            mx.float16,
            mx.bfloat16,
            mx.complex64,
        ]
        shapes = [(7,), (10,), (1024,), (1024, 1024)]
        # Fixed key: every rank generates the same stacked tensor, so the row
        # the left neighbour sends is known locally.
        key = mx.random.key(0)
        right = (world.rank() + 1) % world.size()
        left = (world.rank() + world.size() - 1) % world.size()

        for dtype in every_dtype:
            for shape in shapes:
                stacked = mx.random.uniform(shape=(world.size(),) + shape, key=key)
                stacked = (stacked * 10).astype(dtype)

                # Even ranks queue the send first, odd ranks the receive first.
                # NOTE(review): presumably this avoids all ranks blocking on
                # the same operation; confirm against the ring backend.
                if world.rank() % 2 == 0:
                    sent = mx.distributed.send(stacked[world.rank()], right)
                    received = mx.distributed.recv_like(sent, left)
                    mx.eval(sent, received)
                else:
                    received = mx.distributed.recv_like(stacked[world.rank()], left)
                    sent = mx.distributed.send(stacked[world.rank()], right)
                    mx.eval(received, sent)

                self.assertTrue(mx.all(sent == stacked[world.rank()]))
                self.assertTrue(mx.all(received == stacked[left]))

    def test_all_gather_vjp(self):
        """Gradient of the first gathered element is 1 on rank 0, else 0."""

        def first_gathered(x):
            return mx.distributed.all_gather(x)[0]

        gradient = mx.grad(first_gathered)(mx.array(1.0))
        expected = 1.0 if mx.distributed.init().rank() == 0 else 0.0
        self.assertEqual(gradient.item(), expected)


# When this file is executed as a script, hand control to the project's
# test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_array.py
================================================
# Copyright © 2023-2024 Apple Inc.

import gc
import operator
import os
import pickle
import platform
import sys
import unittest
import weakref
from copy import copy, deepcopy
from itertools import permutations

import mlx.core as mx
import mlx_tests
import numpy as np
import psutil

# TensorFlow is optional: record in `has_tf` whether it could be imported.
try:
    import tensorflow as tf

    has_tf = True
except ImportError:
    has_tf = False


class TestVersion(mlx_tests.MLXTestCase):
    """Sanity check on the package version string."""

    def test_version(self):
        """The version must begin with three dot-separated integer fields."""
        full_version = mx.__version__
        fields = full_version.split(".")
        self.assertGreaterEqual(len(fields), 3)

        # int() raises on a non-numeric field, failing the test.
        numeric_prefix = ".".join([str(int(field)) for field in fields[:3]])
        self.assertEqual(numeric_prefix, full_version[: len(numeric_prefix)])


class TestDtypes(mlx_tests.MLXTestCase):
    def test_dtypes(self):
        """Check the byte size and the string form of every listed dtype."""
        byte_sizes = [
            (mx.bool_, 1),
            (mx.uint8, 1),
            (mx.uint16, 2),
            (mx.uint32, 4),
            (mx.uint64, 8),
            (mx.int8, 1),
            (mx.int16, 2),
            (mx.int32, 4),
            (mx.int64, 8),
            (mx.float16, 2),
            (mx.float32, 4),
            (mx.bfloat16, 2),
            (mx.complex64, 8),
        ]
        for dtype, nbytes in byte_sizes:
            self.assertEqual(dtype.size, nbytes)

        printed_names = [
            (mx.bool_, "mlx.core.bool"),
            (mx.uint8, "mlx.core.uint8"),
            (mx.uint16, "mlx.core.uint16"),
            (mx.uint32, "mlx.core.uint32"),
            (mx.uint64, "mlx.core.uint64"),
            (mx.int8, "mlx.core.int8"),
            (mx.int16, "mlx.core.int16"),
            (mx.int32, "mlx.core.int32"),
            (mx.int64, "mlx.core.int64"),
            (mx.float16, "mlx.core.float16"),
            (mx.float32, "mlx.core.float32"),
            (mx.bfloat16, "mlx.core.bfloat16"),
            (mx.complex64, "mlx.core.complex64"),
        ]
        for dtype, text in printed_names:
            self.assertEqual(str(dtype), text)

    def test_scalar_conversion(self):
        """The result of np.min on a 0-d array converts to a matching mx array."""
        dtype_names = (
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "complex64",
        )

        for name in dtype_names:
            with self.subTest(dtype=name):
                np_array = np.array(2, dtype=getattr(np, name))
                np_reduced = np.min(np_array)

                # The reduction must keep the dtype and the (empty) shape.
                self.assertEqual(np_array.dtype, np_reduced.dtype)
                self.assertTupleEqual(np_array.shape, np_reduced.shape)

                converted = mx.array(np_reduced)
                self.assertEqual(np.array(converted), np_array)
                self.assertEqual(np.array(converted), np_reduced)
                self.assertEqual(converted.dtype, getattr(mx, name))
                self.assertListEqual(list(converted.shape), list(np_array.shape))
                self.assertListEqual(list(converted.shape), list(np_reduced.shape))

    def test_finfo(self):
        """mx.finfo rejects integer dtypes and agrees with np.finfo."""
        with self.assertRaises(ValueError):
            mx.finfo(mx.int32)

        dtype_pairs = ((mx.float32, np.float32), (mx.float16, np.float16))
        for mx_dtype, np_dtype in dtype_pairs:
            self.assertEqual(mx.finfo(mx_dtype).min, np.finfo(np_dtype).min)
            self.assertEqual(mx.finfo(mx_dtype).max, np.finfo(np_dtype).max)
            self.assertEqual(mx.finfo(mx_dtype).eps, np.finfo(np_dtype).eps)
            self.assertEqual(mx.finfo(mx_dtype).dtype, mx_dtype)

    def test_iinfo(self):
        """mx.iinfo rejects float dtypes and agrees with np.iinfo.

        For int32 and uint32 the min, max and ``dtype`` attributes are
        checked; for int8 only ``dtype`` is checked.
        """
        with self.assertRaises(ValueError):
            mx.iinfo(mx.float32)

        self.assertEqual(mx.iinfo(mx.int32).min, np.iinfo(np.int32).min)
        self.assertEqual(mx.iinfo(mx.int32).max, np.iinfo(np.int32).max)
        self.assertEqual(mx.iinfo(mx.int32).dtype, mx.int32)

        self.assertEqual(mx.iinfo(mx.uint32).min, np.iinfo(np.uint32).min)
        self.assertEqual(mx.iinfo(mx.uint32).max, np.iinfo(np.uint32).max)
        # Mirror the int32 group: the unsigned case must report its own dtype.
        self.assertEqual(mx.iinfo(mx.uint32).dtype, mx.uint32)

        self.assertEqual(mx.iinfo(mx.int8).dtype, mx.int8)


class TestEquality(mlx_tests.MLXTestCase):
    """Behaviour of ``==`` between mx arrays and other operands."""

    def test_array_eq_array(self):
        """Element-wise equality of two arrays."""
        base = mx.array([1, 2, 3])
        same = mx.array([1, 2, 3])
        different = mx.array([1, 2, 4])
        self.assertTrue(mx.all(base == same))
        self.assertFalse(mx.all(base == different))

    def test_array_eq_scalar(self):
        """Equality against Python scalars and a float array."""
        ints = mx.array([1, 2, 3])
        floats = mx.array([1, 2.5, 3.25])
        self.assertTrue(mx.any(ints == 1))
        self.assertFalse(mx.all(ints == 4))
        self.assertFalse(mx.all(ints == 2.5))
        self.assertTrue(mx.any(ints == floats))

    def test_list_equals_array(self):
        """Comparing an array with a list yields a falsy result."""
        arr = mx.array([1, 2, 3])

        # mlx array equality returns false if is compared with any kind of
        # object which is not an mlx array
        for as_list in ([1, 2, 3], [1, 2, 4]):
            self.assertFalse(arr == as_list)

    def test_tuple_equals_array(self):
        """Comparing an array with a tuple yields a falsy result."""
        arr = mx.array([1, 2, 3])

        # mlx array equality returns false if is compared with any kind of
        # object which is not an mlx array
        for as_tuple in ((1, 2, 3), (1, 2, 4)):
            self.assertFalse(arr == as_tuple)


class TestInequality(mlx_tests.MLXTestCase):
    def test_array_ne_array(self):
        """Element-wise inequality of two arrays."""
        base = mx.array([1, 2, 3])
        same = mx.array([1, 2, 3])
        different = mx.array([1, 2, 4])
        self.assertFalse(mx.any(base != same))
        self.assertTrue(mx.any(base != different))

    def test_array_ne_scalar(self):
        """Inequality against Python scalars and a float array."""
        ints = mx.array([1, 2, 3])
        floats = mx.array([1, 2.5, 3.25])

        # 1 occurs in the array, so not every element differs from it.
        self.assertFalse(mx.all(ints != 1))
        for scalar in (4, 1.5, 2.5):
            self.assertTrue(mx.any(ints != scalar))
        self.assertFalse(mx.all(ints != floats))

    def test_list_not_equals_array(self):
        """``!=`` between an array and a list yields a truthy result."""
        arr = mx.array([1, 2, 3])

        # mlx array inequality returns true if is compared with any kind of
        # object which is not an mlx array
        for as_list in ([1, 2, 3], [1, 2, 4]):
            self.assertTrue(arr != as_list)

    def test_dlx_device_type(self):
        """Check the (device_type, device_id) pair from ``__dlpack_device__``.

        The device type must be one of 1, 8 or 13 and the device id must be 0.
        Types 8 and 1 are cross-checked against ``mx.metal.is_available()``;
        type 13 gets no additional check.
        """
        a = mx.array([1, 2, 3])
        device_type, device_id = a.__dlpack_device__()
        # NOTE(review): 1 and 8 are treated below as CPU and Metal; 13 is
        # presumably DLPack's CUDA-managed device code -- confirm against the
        # DLPack header.
        self.assertIn(device_type, [1, 8, 13])
        self.assertEqual(device_id, 0)

        if device_type == 8:
            # Additional check if Metal is supposed to be available
            self.assertTrue(mx.metal.is_available())
        elif device_type == 1:
            # Additional check if CPU is the fallback
            self.assertFalse(mx.metal.is_available())

    def test_tuple_not_equals_array(self):
        """``!=`` between an array and a tuple yields a truthy result."""
        arr = mx.array([1, 2, 3])

        # mlx array inequality returns true if is compared with any kind of
        # object which is not an mlx array
        for as_tuple in ((1, 2, 3), (1, 2, 4)):
            self.assertTrue(arr != as_tuple)

    def test_obj_inequality_array(self):
        """Ordering comparisons with a str, list or tuple raise ValueError."""
        arr = mx.array([1, 2, 3])
        operands = ("hello", [1, 2, 3], (1, 2, 3))
        # operator.lt(a, b) is exactly `a < b`, and likewise for the others.
        comparisons = (operator.lt, operator.gt, operator.le, operator.ge)

        # check if object comparison(</>/<=/>=) with mlx array should throw an exception
        # if not, the tests will fail
        for operand in operands:
            for compare in comparisons:
                with self.assertRaises(ValueError):
                    compare(arr, operand)

    def test_invalid_op_on_array(self):
        """Arithmetic and bitwise ops with a str, list or tuple raise ValueError."""
        arr = mx.array([1, 2.5, 3.25])
        text = "hello"
        as_list = [1, 2.1, 3.25]
        as_tuple = (1, 2.5, 3.25)

        # Each operator.* function applies the corresponding Python operator;
        # imul and itruediv are the in-place forms `*=` and `/=`.
        attempts = (
            (operator.mul, text),
            (operator.imul, text),
            (operator.itruediv, as_list),
            (operator.floordiv, as_list),
            (operator.mod, as_list),
            (operator.pow, as_tuple),
            (operator.and_, as_tuple),
            (operator.or_, text),
        )
        for apply, operand in attempts:
            with self.assertRaises(ValueError):
                apply(arr, operand)


class TestArray(mlx_tests.MLXTestCase):
    def test_array_basics(self):
        """Construct arrays from scalars, lists and a tuple and check metadata.

        Covers size, ndim, itemsize, nbytes, shape, dtype, ``item()`` and its
        Python type, and ``len()``.
        """
        # Integer scalar: 0-d int32, four bytes.
        x = mx.array(1)
        self.assertEqual(x.size, 1)
        self.assertEqual(x.ndim, 0)
        self.assertEqual(x.itemsize, 4)
        self.assertEqual(x.nbytes, 4)
        self.assertEqual(x.shape, ())
        self.assertEqual(x.dtype, mx.int32)
        self.assertEqual(x.item(), 1)
        self.assertIsInstance(x.item(), int)

        # len() is undefined for a 0-d array.
        with self.assertRaises(TypeError):
            len(x)

        x = mx.array(1, mx.uint32)
        self.assertEqual(x.item(), 1)
        self.assertIsInstance(x.item(), int)

        x = mx.array(1, mx.int64)
        self.assertEqual(x.item(), 1)
        self.assertIsInstance(x.item(), int)

        x = mx.array(1, mx.bfloat16)
        self.assertEqual(x.item(), 1.0)

        # Float scalar.
        x = mx.array(1.0)
        self.assertEqual(x.size, 1)
        self.assertEqual(x.ndim, 0)
        self.assertEqual(x.shape, ())
        self.assertEqual(x.dtype, mx.float32)
        self.assertEqual(x.item(), 1.0)
        self.assertIsInstance(x.item(), float)

        # Boolean scalar.
        x = mx.array(False)
        self.assertEqual(x.size, 1)
        self.assertEqual(x.ndim, 0)
        self.assertEqual(x.shape, ())
        self.assertEqual(x.dtype, mx.bool_)
        self.assertEqual(x.item(), False)
        self.assertIsInstance(x.item(), bool)

        # Complex scalar.
        x = mx.array(complex(1, 1))
        self.assertEqual(x.ndim, 0)
        self.assertEqual(x.shape, ())
        self.assertEqual(x.dtype, mx.complex64)
        self.assertEqual(x.item(), complex(1, 1))
        self.assertIsInstance(x.item(), complex)

        # One-dimensional lists, with and without an explicit dtype.
        x = mx.array([True, False, True])
        self.assertEqual(x.dtype, mx.bool_)
        self.assertEqual(x.ndim, 1)
        self.assertEqual(x.shape, (3,))
        self.assertEqual(len(x), 3)

        x = mx.array([True, False, True], mx.float32)
        self.assertEqual(x.dtype, mx.float32)

        x = mx.array([0, 1, 2])
        self.assertEqual(x.dtype, mx.int32)
        self.assertEqual(x.ndim, 1)
        self.assertEqual(x.shape, (3,))

        x = mx.array([0, 1, 2], mx.float32)
        self.assertEqual(x.dtype, mx.float32)

        x = mx.array([0.0, 1.0, 2.0])
        self.assertEqual(x.dtype, mx.float32)
        self.assertEqual(x.ndim, 1)
        self.assertEqual(x.shape, (3,))

        x = mx.array([1j, 1 + 0j])
        self.assertEqual(x.dtype, mx.complex64)
        self.assertEqual(x.ndim, 1)
        self.assertEqual(x.shape, (2,))

        # From tuple
        x = mx.array((1, 2, 3), mx.int32)
        self.assertEqual(x.dtype, mx.int32)
        self.assertEqual(x.tolist(), [1, 2, 3])

    def test_bool_conversion(self):
        """Truthiness of a scalar array follows the value it holds."""
        cases = ((True, True), (False, False), (1.0, True), (0.0, False))
        for value, truthy in cases:
            arr = mx.array(value)
            if truthy:
                self.assertTrue(arr)
            else:
                self.assertFalse(arr)

    def test_int_type(self):
        """Check the dtype chosen for Python integers of various magnitudes."""
        x = mx.array(1)
        self.assertEqual(x.dtype, mx.int32)

        # Values too large for int32 are stored as int64 ...
        x = mx.array(2**32 - 1)
        self.assertEqual(x.dtype, mx.int64)
        x = mx.array(2**40)
        self.assertEqual(x.dtype, mx.int64)

        # ... unless a dtype is requested explicitly.
        x = mx.array(2**32 - 1, dtype=mx.uint32)
        self.assertEqual(x.dtype, mx.uint32)

        # Adding a large Python int to an int64 array keeps int64.
        x = mx.array([1, 2], dtype=mx.int64) + 0x80000000
        self.assertEqual(x.dtype, mx.int64)

    def test_construction_from_lists(self):
        """Build arrays from (nested) lists: empties, errors, promotion, casts."""
        # Empty lists at any nesting depth give float32 with a trailing 0 axis.
        empty_cases = [
            ([], (0,)),
            ([[], [], []], (3, 0)),
            ([[[], []], [[], []], [[], []]], (3, 2, 0)),
        ]
        for nested, shape in empty_cases:
            arr = mx.array(nested)
            self.assertEqual(arr.size, 0)
            self.assertEqual(arr.shape, shape)
            self.assertEqual(arr.dtype, mx.float32)

        # Check failure cases
        invalid_inputs = [
            [[[], []], [[]], [[], []]],
            [[[], []], [[1.0, 2.0], []], [[], []]],
            [[0, 1], [[0, 1], 1]],
            [[0, 1], ["hello", 1]],
        ]
        for invalid in invalid_inputs:
            with self.assertRaises(ValueError):
                mx.array(invalid)

        # Mixed element types are promoted.
        promotion_cases = [
            ([True, False, 3], mx.int32),
            ([True, False, 3, 4.0], mx.float32),
            ([[True, False], [1, 3], [2, 4.0]], mx.float32),
        ]
        for values, dtype in promotion_cases:
            self.assertEqual(mx.array(values).dtype, dtype)

        # An explicit dtype converts the float inputs.
        as_bool = mx.array([[1.0, 2.0], [0.0, 3.9]], mx.bool_)
        self.assertEqual(as_bool.dtype, mx.bool_)
        self.assertTrue(
            mx.array_equal(as_bool, mx.array([[True, True], [False, True]]))
        )

        as_int = mx.array([[1.0, 2.0], [0.0, 3.9]], mx.int32)
        self.assertTrue(mx.array_equal(as_int, mx.array([[1, 2], [0, 3]])))

        as_complex = mx.array([1 + 0j, 2j, True, 0], mx.complex64)
        self.assertEqual(as_complex.tolist(), [1 + 0j, 2j, 1 + 0j, 0j])

        # 4294967295 is the largest uint32 value; compare with NumPy.
        for mx_dtype, np_dtype in ((mx.uint32, np.uint32), (mx.float32, np.float32)):
            reference = np.array([0, 4294967295], dtype=np_dtype)
            arr = mx.array([0, 4294967295], dtype=mx_dtype)
            self.assertTrue(np.array_equal(arr, reference))

    def test_double_keeps_precision(self):
        """A Python float survives a float64 round trip, as scalar or 1-list."""
        value = 39.14223403241
        for source in (value, [value]):
            round_tripped = mx.array(source, dtype=mx.float64).item()
            self.assertEqual(round_tripped, value)

    def test_construction_from_lists_of_mlx_arrays(self):
        """Build arrays from lists whose elements are themselves mx arrays."""
        all_dtypes = [
            mx.bool_,
            mx.uint8,
            mx.uint16,
            mx.uint32,
            mx.uint64,
            mx.int8,
            mx.int16,
            mx.int32,
            mx.int64,
            mx.float16,
            mx.float32,
            mx.bfloat16,
            mx.complex64,
        ]
        for first_t, second_t in permutations(all_dtypes, 2):
            # check type promotion and numeric correctness
            first = mx.array([1.0], first_t)
            second = mx.array([2.0], second_t)
            built = mx.array([first, second])
            self.assertEqualArray(built, mx.stack([first, second], axis=0))

            # check heterogeneous construction with mlx arrays and python primitive types
            first = mx.array([True], first_t)
            second = mx.array([False], second_t)
            built = mx.array([[first, [2.0]], [[3.0], second]])
            reference = mx.array(
                [[[first.item()], [2.0]], [[3.0], [second.item()]]], built.dtype
            )
            self.assertEqualArray(built, reference)

        # check when create from an array which does not contain memory to the raw data
        lazy = mx.array([1.0]).astype(mx.bfloat16)  # lazy does not hold raw data
        for other_t in all_dtypes:
            other = mx.array([2.0], other_t)
            built = mx.array([lazy, other])
            self.assertEqualArray(built, mx.stack([lazy, other], axis=0))

        # shape check from `stack()`
        with self.assertRaises(ValueError) as caught:
            mx.array([lazy, 1.0])
        self.assertEqual(
            str(caught.exception), "Initialization encountered non-uniform length."
        )

        # shape check from `validate_shape`
        with self.assertRaises(ValueError) as caught:
            mx.array([1.0, lazy])
        self.assertEqual(
            str(caught.exception), "Initialization encountered non-uniform length."
        )

        # check that `[mx.array, ...]` retains the `mx.array` in the graph
        def doubled_sum(inp):
            combined = mx.array([inp, mx.array([2.0])])
            return (2 * combined).sum()

        gradient_fn = mx.grad(doubled_sum)
        self.assertEqual(gradient_fn(mx.array([1.0])).item(), 2.0)

    def test_init_from_array(self):
        """mx.array(existing_array[, dtype]) copies and optionally converts."""
        source = mx.array(3.0)
        self.assertTrue(mx.array_equal(source, mx.array(source)))

        conversions = (
            (mx.int32, 3),
            (mx.bool_, True),
            (mx.complex64, 3.0 + 0j),
        )
        for dtype, expected in conversions:
            converted = mx.array(source, dtype)
            self.assertEqual(converted.dtype, dtype)
            self.assertEqual(converted.item(), expected)

    def test_array_repr(self):
        """Check the `str()` form of scalars, vectors and nested arrays.

        Covers the scalar dtypes, the switch to an elided (`...`) form once an
        axis reaches 7 entries, multi-line layout of nested arrays, and the
        small integer, half precision and complex dtypes.
        """
        x = mx.array(True)
        self.assertEqual(str(x), "array(True, dtype=bool)")
        x = mx.array(1)
        self.assertEqual(str(x), "array(1, dtype=int32)")
        x = mx.array(1.0)
        self.assertEqual(str(x), "array(1, dtype=float32)")

        x = mx.array([1, 0, 1])
        self.assertEqual(str(x), "array([1, 0, 1], dtype=int32)")

        # Six entries are printed in full ...
        x = mx.array([1] * 6)
        expected = "array([1, 1, 1, 1, 1, 1], dtype=int32)"
        self.assertEqual(str(x), expected)

        # ... while seven are expected to be elided in the middle.
        x = mx.array([1] * 7)
        expected = "array([1, 1, 1, ..., 1, 1, 1], dtype=int32)"
        self.assertEqual(str(x), expected)

        x = mx.array([[1, 2], [1, 2], [1, 2]])
        expected = "array([[1, 2],\n       [1, 2],\n       [1, 2]], dtype=int32)"
        self.assertEqual(str(x), expected)

        x = mx.array([[[1, 2], [1, 2]], [[1, 2], [1, 2]]])
        expected = (
            "array([[[1, 2],\n"
            "        [1, 2]],\n"
            "       [[1, 2],\n"
            "        [1, 2]]], dtype=int32)"
        )
        self.assertEqual(str(x), expected)

        # The same 6-vs-7 threshold is checked along the row axis.
        x = mx.array([[1, 2]] * 6)
        expected = (
            "array([[1, 2],\n"
            "       [1, 2],\n"
            "       [1, 2],\n"
            "       [1, 2],\n"
            "       [1, 2],\n"
            "       [1, 2]], dtype=int32)"
        )
        self.assertEqual(str(x), expected)
        x = mx.array([[1, 2]] * 7)
        expected = (
            "array([[1, 2],\n"
            "       [1, 2],\n"
            "       [1, 2],\n"
            "       ...,\n"
            "       [1, 2],\n"
            "       [1, 2],\n"
            "       [1, 2]], dtype=int32)"
        )
        self.assertEqual(str(x), expected)

        x = mx.array([1], dtype=mx.int8)
        expected = "array([1], dtype=int8)"
        self.assertEqual(str(x), expected)
        x = mx.array([1], dtype=mx.int16)
        expected = "array([1], dtype=int16)"
        self.assertEqual(str(x), expected)
        x = mx.array([1], dtype=mx.uint8)
        expected = "array([1], dtype=uint8)"
        self.assertEqual(str(x), expected)

        # Fp16 is not supported in all platforms
        x = mx.array([1.2], dtype=mx.float16)
        expected = "array([1.2002], dtype=float16)"
        self.assertEqual(str(x), expected)

        x = mx.array([1 + 1j], dtype=mx.complex64)
        expected = "array([1+1j], dtype=complex64)"
        self.assertEqual(str(x), expected)
        # The negative-imaginary case was previously built but never checked.
        x = mx.array([1 - 1j], dtype=mx.complex64)
        expected = "array([1-1j], dtype=complex64)"
        self.assertEqual(str(x), expected)

    def test_array_to_list(self):
        """`tolist()` gives back the Python scalars / nested lists used as input."""
        for dtype in (mx.bool_, mx.uint32, mx.int32, mx.int64, mx.float32):
            scalar = mx.array(1, dtype)
            self.assertEqual(scalar.tolist(), 1)

        # Integer lists with inferred dtype, 1-D then 2-D
        for data in ([1, 2, 3, 4], [[1, 2], [3, 4]]):
            self.assertEqual(mx.array(data).tolist(), data)

        # 0/1 entries stored as bool are asserted equal to the int input
        flags = [[1, 0], [0, 1]]
        self.assertEqual(mx.array(flags, mx.bool_).tolist(), flags)

        # Floating point lists, 2-D then 3-D
        float_cases = (
            [[1.5, 2.5], [3.5, 4.5]],
            [[[0.5, 1.5], [2.5, 3.5]], [[4.5, 5.5], [6.5, 7.5]]],
        )
        for data in float_cases:
            self.assertEqual(mx.array(data).tolist(), data)

        # Empty arrays
        for data in ([], [[], []]):
            self.assertEqual(mx.array(data).tolist(), data)

        # Complex arrays
        complex_data = [0.5 + 0j, 1.5 + 1j, 2.5 + 0j, 3.5 + 1j]
        self.assertEqual(mx.array(complex_data).tolist(), complex_data)

        # Half types
        small_floats = [1.0, 2.0, 3.0, 4.0, 5.0]
        for half_dtype in (mx.float16, mx.bfloat16):
            halves = mx.array(small_floats, dtype=half_dtype)
            self.assertEqual(halves.tolist(), small_floats)

    def test_array_np_conversion(self):
        """Convert between NumPy and MLX arrays, checking shape, dtype and content."""
        # Shape test: empty NumPy inputs of rank 1, 2 and 3 keep their shape
        # and are expected to come out as float32.
        empty_cases = (
            ([], (0,)),
            ([[], [], []], (3, 0)),
            ([[[], []], [[], []], [[], []]], (3, 2, 0)),
        )
        for nested, expected_shape in empty_cases:
            converted = mx.array(np.array(nested))
            self.assertEqual(converted.size, 0)
            self.assertEqual(converted.shape, expected_shape)
            self.assertEqual(converted.dtype, mx.float32)

        # Content test
        twos = 2.0 * np.ones((3, 5, 4))
        from_np = mx.array(twos)
        self.assertEqual(from_np.dtype, mx.float32)
        self.assertEqual(from_np.ndim, 3)
        self.assertEqual(from_np.shape, (3, 5, 4))

        back_to_np = np.asarray(from_np)
        self.assertTrue(np.allclose(twos, back_to_np))

        # Zero-dimensional int32 input
        scalar_mx = mx.array(np.array(3, dtype=np.int32))
        self.assertEqual(scalar_mx.dtype, mx.int32)
        self.assertEqual(scalar_mx.ndim, 0)
        self.assertEqual(scalar_mx.shape, ())
        self.assertEqual(scalar_mx.item(), 3)

        # mlx to numpy test
        bools = np.asarray(mx.array([True, False, True]))
        self.assertEqual(bools.dtype, np.bool_)
        self.assertEqual(bools.ndim, 1)
        self.assertEqual(bools.shape, (3,))
        for position, expected_flag in enumerate((True, False, True)):
            self.assertEqual(bools[position], expected_flag)

        # complex64 mx <-> np
        complex_values = [0j, 1, 1 + 1j]
        mx_complex = mx.array(np.array(complex_values))
        self.assertEqual(mx_complex.dtype, mx.complex64)
        self.assertEqual(mx_complex.shape, (3,))
        self.assertEqual(mx_complex.tolist(), complex_values)

        np_complex = np.asarray(mx.array([0j, 1, 1 + 1j]))
        self.assertEqual(np_complex.dtype, np.complex64)
        self.assertEqual(np_complex.shape, (3,))
        self.assertEqual(np_complex.tolist(), complex_values)

    def test_array_np_dtype_conversion(self):
        """Each listed dtype maps to its same-named counterpart in both directions."""
        # The dtype attribute names are identical on `mx` and `np`, so the
        # pairs are looked up by name.
        shared_names = (
            "bool_",
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "complex64",
        )

        for name in shared_names:
            mlx_dtype = getattr(mx, name)
            np_dtype = getattr(np, name)

            # NumPy -> MLX: dtype and values must carry over.
            np_source = np.random.uniform(low=0, high=100, size=(32,)).astype(np_dtype)
            converted = mx.array(np_source)
            self.assertEqual(converted.dtype, mlx_dtype)
            self.assertTrue(np.allclose(converted, np_source))

            # MLX -> NumPy: only the resulting dtype is checked.
            mlx_source = mx.random.uniform(low=0, high=10, shape=(32,)).astype(
                mlx_dtype
            )
            exported = np.array(mlx_source)
            self.assertEqual(exported.dtype, np_dtype)

    def test_array_from_noncontiguous_np(self):
        """Converting a transposed NumPy array preserves its values per dtype."""
        for t in [np.int8, np.int32, np.float16, np.float32, np.complex64]:
            with self.subTest(dtype=t):
                # Cast to the dtype under test (previously every iteration used
                # complex64). Sampling from [0, 100) keeps the integer dtypes
                # from collapsing to all zeros.
                np_arr = np.random.uniform(low=0, high=100, size=(10, 10)).astype(t)
                # Transposing yields a view that is no longer C-contiguous.
                np_arr = np_arr.T
                mx_arr = mx.array(np_arr)
                self.assertTrue(mx.array_equal(np_arr, mx_arr))

    def test_array_np_shape_dim_check(self):
        """A NumPy array with a dimension of 2**31 is rejected with ValueError."""
        oversized = np.empty(2**31, dtype=np.bool_)
        expected_message = "Shape dimension falls outside supported `int` range."

        with self.assertRaises(ValueError) as raised:
            mx.array(oversized)

        self.assertEqual(str(raised.exception), expected_message)

    def test_dtype_promotion(self):
        """Adding arrays of two different dtypes gives the dtype NumPy gives."""
        # The dtype attribute names are identical on `mx` and `np`.
        shared_names = (
            "bool_",
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float32",
        )
        paired = [(getattr(mx, name), getattr(np, name)) for name in shared_names]

        for first, second in permutations(paired, 2):
            mlx_dt_1, np_dt_1 = first
            mlx_dt_2, np_dt_2 = second
            with self.subTest(dtype1=np_dt_1, dtype2=np_dt_2):
                np_sum = np.ones((3,), dtype=np_dt_1) + np.ones((3,), dtype=np_dt_2)
                mlx_sum = mx.ones((3,), dtype=mlx_dt_1) + mx.ones((3,), dtype=mlx_dt_2)

                # The reference dtype is whatever the NumPy sum converts to.
                self.assertEqual(mlx_sum.dtype, mx.array(np_sum).dtype)

        # float16 is checked separately with fixed expectations:
        # with float32 the result is float32, with int32 it stays float16.
        half = mx.ones((3,), dtype=mx.float16)

        with_single = half + mx.ones((3,), dtype=mx.float32)
        self.assertEqual(with_single.dtype, mx.float32)

        with_int = half + mx.ones((3,), dtype=mx.int32)
        self.assertEqual(with_int.dtype, mx.float16)

    def test_dtype_python_scalar_promotion(self):
        """Multiplying an array by a Python bool/int/float gives the expected dtype."""
        integer_dtypes = [
            mx.int8,
            mx.int16,
            mx.int32,
            mx.int64,
            mx.uint8,
            mx.uint16,
            mx.uint32,
            mx.uint64,
        ]

        # Rows: (array dtype, result with `False`, result with `0`, result with `1.0`)
        expectations = [(mx.bool_, mx.bool_, mx.int32, mx.float32)]
        # Integer arrays keep their dtype for bool/int scalars, become float32
        # for a float scalar.
        expectations += [(dt, dt, dt, mx.float32) for dt in integer_dtypes]
        # Floating point arrays keep their dtype for every scalar kind.
        expectations += [(dt, dt, dt, dt) for dt in (mx.float32, mx.float16)]

        for dtype_in, with_bool, with_int, with_float in expectations:
            scalar_cases = ((False, with_bool), (0, with_int), (1.0, with_float))
            for scalar, dtype_out in scalar_cases:
                operand = mx.array(0, dtype_in)
                product = operator.mul(operand, scalar)
                self.assertEqual(product.dtype, dtype_out)

    def test_array_comparison(self):
        """Element-wise ordering comparisons, array/array and array/scalar."""
        lhs = mx.array([0.0, 1.0, 5.0])
        rhs = mx.array([-1.0, 2.0, 5.0])

        # Array against array
        less = lhs < rhs
        self.assertEqual(less.tolist(), [False, True, False])
        less_equal = lhs <= rhs
        self.assertEqual(less_equal.tolist(), [False, True, True])
        greater = lhs > rhs
        self.assertEqual(greater.tolist(), [True, False, False])
        greater_equal = lhs >= rhs
        self.assertEqual(greater_equal.tolist(), [True, False, True])

        # Array against Python scalar, with the scalar on either side
        below_five = lhs < 5
        self.assertEqual(below_five.tolist(), [True, True, False])
        five_below = 5 < lhs
        self.assertEqual(five_below.tolist(), [False, False, False])
        five_at_most = 5 <= lhs
        self.assertEqual(five_at_most.tolist(), [False, False, True])
        above_one = lhs > 1
        self.assertEqual(above_one.tolist(), [False, False, True])
        at_least_one = lhs >= 1
        self.assertEqual(at_least_one.tolist(), [False, True, True])

    def test_array_neg(self):
        """Unary minus flips the sign of every element."""
        values = mx.array([-1.0, 4.0, 0.0])
        negated = -values

        self.assertEqual(negated.tolist(), [1.0, -4.0, 0.0])

    def test_array_type_cast(self):
        """`astype` converts float -> int32 and int -> float32."""
        floats = mx.array([0.1, 2.3, -1.3])
        # Expected values after the cast: the fractional part is dropped.
        whole_parts = [0, 2, -1]

        self.assertEqual(floats.astype(mx.int32).tolist(), whole_parts)
        self.assertEqual(floats.astype(mx.int32).dtype, mx.int32)

        back_to_float = mx.array(whole_parts).astype(mx.float32)
        self.assertEqual(back_to_float.dtype, mx.float32)

    def test_array_iteration(self):
        """Iterating an array walks its first axis."""
        # 1-D: each yielded element is a scalar equal to its position.
        counting = mx.array([0, 1, 2])
        for position, element in enumerate(counting):
            self.assertEqual(element.item(), position)

        # 2-D: unpacking yields one row per entry along the first axis.
        matrix = mx.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        first, second, third = matrix
        self.assertEqual(first.tolist(), [1.0, 2.0])
        self.assertEqual(second.tolist(), [3.0, 4.0])
        self.assertEqual(third.tolist(), [5.0, 6.0])

    def test_array_pickle(self):
        """A pickle round trip reproduces the array for every listed dtype."""
        nested = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
        signed = (mx.int8, mx.int16, mx.int32, mx.int64)
        unsigned = (mx.uint8, mx.uint16, mx.uint32, mx.uint64)
        non_integer = (mx.float16, mx.float32, mx.bfloat16, mx.complex64)

        for dtype in signed + unsigned + non_integer:
            original = mx.array(nested, dtype=dtype)
            restored = pickle.loads(pickle.dumps(original))
            self.assertEqualArray(restored, original)

    def test_array_copy(self):
        """`copy` and `deepcopy` give an equal array that can be changed separately."""
        nested = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
        signed = (mx.int8, mx.int16, mx.int32, mx.int64)
        unsigned = (mx.uint8, mx.uint16, mx.uint32, mx.uint64)
        non_integer = (mx.float16, mx.float32, mx.bfloat16, mx.complex64)

        for duplicate_with in (copy, deepcopy):
            for dtype in signed + unsigned + non_integer:
                original = mx.array(nested, dtype=dtype)
                clone = duplicate_with(original)
                self.assertEqualArray(clone, original)

                # Update the clone in place. The comparison below can only hold
                # if `original` was not changed along with it.
                clone -= 1
                self.assertEqualArray(clone, original - 1)

    def test_indexing(self):
        """Compare `array.__getitem__` against NumPy for many index forms.

        Covers ellipsis, slices (strided, negative, clipped, empty), integer
        and NumPy-integer indices, `None`, index arrays, and mixtures of
        these, using the NumPy result on the same data as the reference.
        """
        # Only ellipsis is a no-op
        a_mlx = mx.array([1])[...]
        self.assertEqual(a_mlx.shape, (1,))
        self.assertEqual(a_mlx.item(), 1)

        # Basic content check, slice indexing
        a_npy = np.arange(64, dtype=np.float32)
        a_mlx = mx.array(a_npy)
        a_sliced_mlx = a_mlx[2:50:4]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[2:50:4]))

        # Basic content check, mlx array indexing
        a_npy = np.arange(64, dtype=np.int32)
        a_npy = a_npy.reshape((8, 8))
        a_mlx = mx.array(a_npy)
        idx_npy = np.array([0, 1, 2, 7, 5], dtype=np.uint32)
        idx_mlx = mx.array(idx_npy)
        a_sliced_mlx = a_mlx[idx_mlx]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[idx_npy]))

        # Basic content check, int indexing
        a_sliced_mlx = a_mlx[5]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[5]))
        self.assertEqual(len(a_sliced_npy.shape), len(a_npy[5].shape))
        self.assertEqual(len(a_sliced_npy.shape), 1)
        self.assertEqual(a_sliced_npy.shape[0], a_npy[5].shape[0])

        # Basic content check, negative indexing
        a_sliced_mlx = a_mlx[-1]
        self.assertTrue(np.array_equal(a_sliced_mlx, a_npy[-1]))

        # NumPy integer scalar indexing
        a_sliced_mlx = a_mlx[np.int64(5)]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[np.int64(5)]))

        # Basic content check, empty index
        a_sliced_mlx = a_mlx[()]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[()]))

        # Basic content check, new axis
        a_sliced_mlx = a_mlx[None]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[None]))

        a_sliced_mlx = a_mlx[:, None]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[:, None]))

        # Multi dim indexing, all ints
        self.assertEqual(a_mlx[0, 0].item(), 0)
        self.assertEqual(a_mlx[0, 0].ndim, 0)

        # Multi dim indexing, all slices
        a_sliced_mlx = a_mlx[2:4, 5:]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[2:4, 5:]))

        a_sliced_mlx = a_mlx[:, 0:5]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[:, 0:5]))

        # Slicing, strides
        a_sliced_mlx = a_mlx[:, ::2]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[:, ::2]))

        # Slicing, -ve index
        a_sliced_mlx = a_mlx[-2:, :-1]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[-2:, :-1]))

        # Slicing, start > end
        a_sliced_mlx = a_mlx[8:3]
        self.assertEqual(a_sliced_mlx.size, 0)

        # Slicing, Clipping past the end
        a_sliced_mlx = a_mlx[7:10]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[7:10]))

        # Multi dim indexing, int and slices
        a_sliced_mlx = a_mlx[0, :5]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[0, :5]))

        a_sliced_mlx = a_mlx[:, -1]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[:, -1]))

        # Multi dim indexing, int and array
        a_sliced_mlx = a_mlx[idx_mlx, 0]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[idx_npy, 0]))

        # Multi dim indexing, array and slices
        a_sliced_mlx = a_mlx[idx_mlx, :5]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[idx_npy, :5]))

        a_sliced_mlx = a_mlx[:, idx_mlx]
        a_sliced_npy = np.asarray(a_sliced_mlx)
        self.assertTrue(np.array_equal(a_sliced_npy, a_npy[:, idx_npy]))

        # Multi dim indexing with multiple arrays
        def check_slices(arr_np, *idx_np):
            # Index the NumPy array and its MLX copy with the same indices
            # (NumPy index arrays are converted to MLX arrays) and require
            # equal results.
            arr_mlx = mx.array(arr_np)
            idx_mlx = [
                mx.array(idx) if isinstance(idx, np.ndarray) else idx for idx in idx_np
            ]
            slice_mlx = arr_mlx[tuple(idx_mlx)]
            # Reuse the result computed above instead of indexing a second time.
            self.assertTrue(np.array_equal(arr_np[tuple(idx_np)], slice_mlx))

        a_np = np.arange(16).reshape(4, 4)
        check_slices(a_np, np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3]))
        check_slices(a_np, np.array([0, 1, 2, 3]), np.array([1, 0, 3, 3]))
        check_slices(a_np, np.array([[0, 1]]), np.array([[0], [1], [3]]))

        a_np = np.arange(64).reshape(2, 4, 2, 4)
        check_slices(a_np, 0, np.array([0, 1, 2]))
        check_slices(a_np, slice(0, 1), np.array([0, 1, 2]))
        check_slices(
            a_np, slice(0, 1), np.array([0, 1, 2]), slice(None), slice(0, 4, 2)
        )
        check_slices(
            a_np, slice(0, 1), np.array([0, 1, 2]), slice(None), np.array([1, 2, 0])
        )
        check_slices(a_np, slice(0, 1), np.array([0, 1, 2]), 1, np.array([1, 2, 0]))
        check_slices(
            a_np, slice(0, 1), np.array([0, 1, 2]), np.array([1, 0, 0]), slice(0, 1)
        )
        check_slices(
            a_np,
            slice(0, 1),
            np.array([[0], [1], [2]]),
            np.array([[1, 0, 0]]),
            slice(0, 1),
        )
        check_slices(
            a_np,
            slice(0, 2),
            np.array([[0], [1], [2]]),
            slice(0, 2),
            np.array([[1, 0, 0]]),
        )
        # Every ordering of a mixed set of index kinds
        for p in permutations([slice(None), slice(None), 0, np.array([1, 0])]):
            check_slices(a_np, *p)
        for p in permutations(
            [slice(None), slice(None), 0, np.array([1, 0]), None, None]
        ):
            check_slices(a_np, *p)
        for p in permutations([0, np.array([1, 0]), None, Ellipsis, slice(None)]):
            check_slices(a_np, *p)

        # Non-contiguous arrays in slicing
        a_mlx = mx.reshape(mx.arange(128), (16, 8))
        a_mlx = a_mlx[::2, :]
        a_np = np.array(a_mlx)
        idx_np = np.arange(8)[::2]
        idx_mlx = mx.arange(8)[::2]
        self.assertTrue(
            np.array_equal(a_np[idx_np, idx_np], np.array(a_mlx[idx_mlx, idx_mlx]))
        )

        # Slicing with negative indices and integer
        a_np = np.arange(10).reshape(5, 2)
        a_mlx = mx.array(a_np)
        self.assertTrue(np.array_equal(a_np[2:-1, 0], np.array(a_mlx[2:-1, 0])))

    def test_indexing_grad(self):
        """Gradient of a gather-and-sum with respect to the data and the index."""
        values = mx.array([[1, 2], [3, 4]]).astype(mx.float32)
        # The index is passed as float32 and cast to int32 inside the function.
        positions = mx.array([0, 1, 0]).astype(mx.float32)

        def gather_and_sum(data, where):
            return data[where.astype(mx.int32)].sum()

        grad_fn = mx.grad(gather_and_sum, argnums=(0, 1))
        grad_values, grad_positions = grad_fn(values, positions)

        # Row 0 is selected twice and row 1 once.
        expected_values = mx.array([[2, 2], [1, 1]])
        self.assertTrue(mx.array_equal(grad_values, expected_values))

        # The index input is expected to receive an all-zero gradient.
        expected_positions = mx.zeros(positions.shape)
        self.assertTrue(mx.array_equal(grad_positions, expected_positions))

    def test_setitem(self):
        """Exercise `array.__setitem__` for scalar, slice, array and mixed indices.

        The first part mutates one small array step by step and checks its
        content after each assignment, so the statements are order dependent.
        The rest compares against NumPy through the local `check_slices`
        helper and checks that invalid index/update combinations raise
        `ValueError`.
        """
        # Zero-dimensional array assigned through a new axis
        a = mx.array(0)
        a[None] = 1
        self.assertEqual(a.item(), 1)

        # Integer indices: positive, negative and NumPy integer scalar
        a = mx.array([1, 2, 3])
        a[0] = 2
        self.assertEqual(a.tolist(), [2, 2, 3])

        a[-1] = 2
        self.assertEqual(a.tolist(), [2, 2, 2])

        a[np.int64(1)] = 9
        self.assertEqual(a.tolist(), [2, 9, 2])

        # An update with extra leading singleton dimensions is accepted
        a[0] = mx.array([[[1]]])
        self.assertEqual(a.tolist(), [1, 9, 2])

        # Scalar updates through full / new-axis / partial slices
        a[:] = 0
        self.assertEqual(a.tolist(), [0, 0, 0])

        a[None] = 1
        self.assertEqual(a.tolist(), [1, 1, 1])

        a[0:1] = 2
        self.assertEqual(a.tolist(), [2, 1, 1])

        a[0:2] = 3
        self.assertEqual(a.tolist(), [3, 3, 1])

        a[0:3] = 4
        self.assertEqual(a.tolist(), [4, 4, 4])

        # Array updates through slices
        a[0:1] = mx.array(0)
        self.assertEqual(a.tolist(), [0, 4, 4])

        a[0:1] = mx.array([1])
        self.assertEqual(a.tolist(), [1, 4, 4])

        # A two-element update does not fit a one-element slice
        with self.assertRaises(ValueError):
            a[0:1] = mx.array([2, 3])

        a[0:2] = mx.array([2, 2])
        self.assertEqual(a.tolist(), [2, 2, 4])

        a[:] = mx.array([[[[1, 1, 1]]]])
        self.assertEqual(a.tolist(), [1, 1, 1])

        # Array slices
        def check_slices(arr_np, update_np, *idx_np):
            # Apply the same assignment to a NumPy array and to its MLX copy
            # (NumPy index arrays are converted to MLX arrays) and require
            # the two results to be equal. `arr_np` is modified in place.
            arr_mlx = mx.array(arr_np)
            update_mlx = mx.array(update_np)
            idx_mlx = [
                mx.array(idx) if isinstance(idx, np.ndarray) else idx for idx in idx_np
            ]
            # A single index is passed bare rather than as a 1-tuple
            if len(idx_np) > 1:
                idx_np = tuple(idx_np)
                idx_mlx = tuple(idx_mlx)
            else:
                idx_np = idx_np[0]
                idx_mlx = idx_mlx[0]
            arr_np[idx_np] = update_np
            arr_mlx[idx_mlx] = update_mlx
            self.assertTrue(np.array_equal(arr_np, arr_mlx))

        check_slices(np.zeros((3, 3)), 1, 0)
        check_slices(np.zeros((3, 3)), 1, -1)
        check_slices(np.zeros((3, 3)), 1, slice(0, 2))
        check_slices(np.zeros((3, 3)), np.array([[0, 1, 2], [3, 4, 5]]), slice(0, 2))

        # Integer index into a zero-dimensional array
        with self.assertRaises(ValueError):
            a = mx.array(0)
            a[0] = mx.array(1)

        # Single index array with updates of various shapes
        check_slices(np.zeros((3, 3)), 1, np.array([0, 1, 2]))
        check_slices(np.zeros((3, 3)), np.array(3), np.array([0, 1, 2]))
        check_slices(np.zeros((3, 3)), np.array([3]), np.array([0, 1, 2]))
        check_slices(np.zeros((3, 3)), np.array([3]), np.array([0, 1]))
        check_slices(np.zeros((3, 2)), np.array([[3, 3], [4, 4]]), np.array([0, 1]))
        # NOTE(review): this call repeats the previous one verbatim
        check_slices(np.zeros((3, 2)), np.array([[3, 3], [4, 4]]), np.array([0, 1]))
        check_slices(
            np.zeros((3, 2)), np.array([[3, 3], [4, 4], [5, 5]]), np.array([0, 2, 1])
        )

        # Multiple slices
        a = mx.array(0)
        a[None, None] = 1
        self.assertEqual(a.item(), 1)

        a[None, None] = mx.array(2)
        self.assertEqual(a.item(), 2)

        a[None, None] = mx.array([[[3]]])
        self.assertEqual(a.item(), 3)

        a[()] = 4
        self.assertEqual(a.item(), 4)

        # `a_np` is shared by the next four calls, so each call starts from
        # the content left by the previous one.
        a_np = np.zeros((2, 3, 4, 5))
        check_slices(a_np, 1, np.array([0, 0]), slice(0, 2), slice(0, 3), 4)
        check_slices(
            a_np,
            np.arange(10).reshape(2, 5),
            np.array([0, 0]),
            np.array([0, 1]),
            np.array([2, 3]),
        )
        check_slices(
            a_np,
            np.array([[3], [4]]),
            np.array([0, 0]),
            np.array([0, 1]),
            np.array([2, 3]),
        )
        check_slices(
            a_np, np.arange(5), np.array([0, 0]), np.array([0, 1]), np.array([2, 3])
        )
        check_slices(np.zeros(5), np.arange(2), None, None, np.array([2, 3]))
        check_slices(
            np.zeros((4, 3, 4)),
            np.arange(3),
            np.array([2, 3]),
            slice(0, 3),
            np.array([2, 3]),
        )

        # Updates of length 2 or 3 are rejected for index arrays separated
        # by a new axis
        with self.assertRaises(ValueError):
            a = mx.zeros((4, 3, 4))
            a[mx.array([2, 3]), None, mx.array([2, 3])] = mx.arange(2)

        with self.assertRaises(ValueError):
            a = mx.zeros((4, 3, 4))
            a[mx.array([2, 3]), None, mx.array([2, 3])] = mx.arange(3)

        check_slices(np.zeros((4, 3, 4)), 1, np.array([2, 3]), None, np.array([2, 1]))
        check_slices(
            np.zeros((4, 3, 4)), np.arange(4), np.array([2, 3]), None, np.array([2, 1])
        )
        check_slices(
            np.zeros((4, 3, 4)),
            np.arange(2 * 4).reshape(2, 1, 4),
            np.array([2, 3]),
            None,
            np.array([2, 1]),
        )

        # Two slices with scalar, row, column and full-block updates
        check_slices(np.zeros((4, 4)), 1, slice(0, 2), slice(0, 2))
        check_slices(np.zeros((4, 4)), np.arange(2), slice(0, 2), slice(0, 2))
        check_slices(
            np.zeros((4, 4)), np.arange(2).reshape(2, 1), slice(0, 2), slice(0, 2)
        )
        check_slices(
            np.zeros((4, 4)), np.arange(4).reshape(2, 2), slice(0, 2), slice(0, 2)
        )

        # More than one ellipsis in an index
        with self.assertRaises(ValueError):
            a = mx.zeros((2, 2, 2))
            a[..., ...] = 1

        with self.assertRaises(ValueError):
            a = mx.zeros((2, 2, 2, 2, 2))
            a[0, ..., 0, ..., 0] = 1

        # More indices than dimensions
        with self.assertRaises(ValueError):
            a = mx.zeros((2, 2))
            a[0, 0, 0] = 1

        # Update of shape (5, 1, 3) assigned through `[:, 0]` of a (5, 4, 3) array
        with self.assertRaises(ValueError):
            a = mx.zeros((5, 4, 3))
            a[:, 0] = mx.ones((5, 1, 3))

        check_slices(np.zeros((2, 2, 2, 2)), 1, None, Ellipsis, None)
        check_slices(
            np.zeros((2, 2, 2, 2)), 1, np.array([0, 1]), Ellipsis, np.array([0, 1])
        )
        check_slices(
            np.zeros((2, 2, 2, 2)),
            np.arange(2 * 2 * 2).reshape(2, 2, 2),
            np.array([0, 1]),
            Ellipsis,
            np.array([0, 1]),
        )

        # Check slice assign with negative indices works
        a = mx.zeros((5, 5), mx.int32)
        a[2:-2, 2:-2] = 4
        self.assertEqual(a[2, 2].item(), 4)

        # Check slice array slice
        check_slices(
            np.zeros((5, 4, 4)),
            np.arange(4 * 2 * 3).reshape(4, 2, 3),
            slice(0, 4),
            np.array([1, 3]),
            slice(None, -1),
        )
        check_slices(
            np.zeros((5, 4, 4)),
            np.arange(4 * 2 * 2).reshape(4, 2, 2),
            slice(0, 4),
            np.array([1, 3]),
            slice(0, 4, 2),
        )

        check_slices(
            np.zeros((1, 10, 4)),
            np.arange(2 * 4).reshape(1, 2, 4),
            slice(None, None, None),
            np.array([1, 3]),
        )

        check_slices(
            np.zeros((3, 4, 5, 3)),
            np.arange(2 * 4 * 3 * 3).reshape(2, 4, 3, 3),
            np.array([2, 1]),
            slice(None, None, None),
            slice(None, None, 2),
            slice(None, None, None),
        )

        # Same as above with the trailing full slice left implicit
        check_slices(
            np.zeros((3, 4, 5, 3)),
            np.arange(2 * 4 * 3 * 3).reshape(2, 4, 3, 3),
            np.array([2, 1]),
            slice(None, None, None),
            slice(None, None, 2),
        )

        check_slices(np.zeros((5, 4, 3)), np.ones((5, 3)), slice(None), 0)

        check_slices(np.zeros((5, 4, 3)), np.ones((5, 1, 3)), slice(None), slice(0, 1))
        check_slices(
            np.ones((3, 4, 4, 4)), np.zeros((4, 4)), 0, slice(0, 4), 3, slice(0, 4)
        )

        # Scalar assignment through partial indices on higher-rank arrays
        x = mx.zeros((2, 3, 4, 5, 3))
        x[..., 0] = 1.0
        self.assertTrue(mx.array_equal(x[..., 0], mx.ones((2, 3, 4, 5))))

        x = mx.zeros((2, 3, 4, 5, 3))
        x[:, 0] = 1.0
        self.assertTrue(mx.array_equal(x[:, 0], mx.ones((2, 4, 5, 3))))

        x = mx.zeros((2, 2, 2, 2, 2, 2))
        x[0, 0] = 1
        self.assertTrue(mx.array_equal(x[0, 0], mx.ones((2, 2, 2, 2))))

        # With a new axis in the index, a (2, 2, 2) update is rejected
        a = mx.zeros((2, 2, 2))
        with self.assertRaises(ValueError):
            a[:, None, :] = mx.ones((2, 2, 2))

        # Ok, doesn't throw
        a[:, None, :] = mx.ones((2, 1, 2, 2))
        a[:, None, :] = mx.ones((2, 2))
        a[:, None, 0] = mx.ones((2,))
        a[:, None, 0] = mx.ones((1, 2))

    def test_array_at(self):
        """Exercise the `array.at[...]` update operations.

        Checks `add`, `subtract`, `multiply`, `divide`, `maximum` and
        `minimum`, first with an index array mixed with slices and then with
        slice-only indices. Also checks that `add` accumulates over repeated
        indices and that calling an op without indexing `at` raises
        `ValueError`.
        """
        a = mx.array(1)
        # `at` must be indexed before an update op is called
        with self.assertRaises(ValueError):
            a.at.add(1)

        a = a.at[None].add(1)
        self.assertEqual(a.item(), 2)

        a = mx.array([0, 1, 2])
        a = a.at[1].add(2)
        self.assertEqual(a.tolist(), [0, 3, 2])

        # A repeated index accumulates: position 0 receives +1 four times
        a = a.at[mx.array([0, 0, 0, 0])].add(1)
        self.assertEqual(a.tolist(), [4, 3, 2])

        a = mx.zeros((10, 10))
        a = a.at[0].add(mx.arange(10))
        self.assertEqual(a[0].tolist(), list(range(10)))

        a = mx.zeros((10, 10))
        index_x = mx.array([0, 2, 3, 7])
        index_y = mx.array([3, 3, 1, 2])
        u = mx.random.uniform(shape=(4,))
        a = a.at[index_x, index_y].add(u)
        self.assertTrue(mx.allclose(a.sum(), u.sum()))
        self.assertEqualArray(a.sum(), u.sum(), atol=1e-6, rtol=1e-5)
        self.assertEqual(a[index_x, index_y].tolist(), u.tolist())

        # Test all array.at ops
        a = mx.random.uniform(shape=(10, 5, 2))
        idx_x = mx.array([0, 4])
        update = mx.ones((2, 5))
        a[idx_x, :, 0] = 0
        a = a.at[idx_x, :, 0].add(update)
        self.assertEqualArray(a[idx_x, :, 0], update)
        a = a.at[idx_x, :, 0].subtract(update)
        self.assertEqualArray(a[idx_x, :, 0], mx.zeros_like(update))
        a = a.at[idx_x, :, 0].add(2 * update)
        self.assertEqualArray(a[idx_x, :, 0], 2 * update)
        a = a.at[idx_x, :, 0].multiply(2 * update)
        self.assertEqualArray(a[idx_x, :, 0], 4 * update)
        a = a.at[idx_x, :, 0].divide(3 * update)
        self.assertEqualArray(a[idx_x, :, 0], (4 / 3) * update)
        # For maximum/minimum the expected value is taken from the region's
        # content *before* the update. Building it from the updated array
        # would compare the result with itself.
        a[idx_x, :, 0] = 5
        update = mx.arange(10).reshape(2, 5)
        expected = mx.maximum(a[idx_x, :, 0], update)
        a = a.at[idx_x, :, 0].maximum(update)
        self.assertEqualArray(a[idx_x, :, 0], expected)
        a[idx_x, :, 0] = 5
        expected = mx.minimum(a[idx_x, :, 0], update)
        a = a.at[idx_x, :, 0].minimum(update)
        self.assertEqualArray(a[idx_x, :, 0], expected)

        # Update with more dimensions than the source
        update = mx.array([1.0, 2.0])[None, None, None]
        src = mx.array([1.0, 2.0])[None, :]
        src = src.at[0:1].add(update)
        self.assertTrue(mx.array_equal(src, mx.array([[2.0, 4.0]])))

        # Test all array.at ops with slice-only indices
        a = mx.random.uniform(shape=(10, 5, 2))
        update = mx.ones((2, 5))
        a[1:3, :, 0] = 0
        a = a.at[1:3, :, 0].add(update)
        self.assertEqualArray(a[1:3, :, 0], update)
        a = a.at[1:3, :, 0].subtract(update)
        self.assertEqualArray(a[1:3, :, 0], mx.zeros_like(update))
        a = a.at[1:3, :, 0].add(2 * update)
        self.assertEqualArray(a[1:3, :, 0], 2 * update)
        a = a.at[1:3, :, 0].multiply(2 * update)
        self.assertEqualArray(a[1:3, :, 0], 4 * update)
        a = a.at[1:3, :, 0].divide(3 * update)
        self.assertEqualArray(a[1:3, :, 0], (4 / 3) * update)
        # As above, the expected value comes from the pre-update content
        a[1:3, :, 0] = 5
        update = mx.arange(10).reshape(2, 5)
        expected = mx.maximum(a[1:3, :, 0], update)
        a = a.at[1:3, :, 0].maximum(update)
        self.assertEqualArray(a[1:3, :, 0], expected)
        a[1:3, :, 0] = 5
        expected = mx.minimum(a[1:3, :, 0], update)
        a = a.at[1:3, :, 0].minimum(update)
        self.assertEqualArray(a[1:3, :, 0], expected)

    def test_array_at_slice_update_extensive(self):
        # Covers ``array.at[...].add`` with slice indices where the update is
        # transposed, strided, or a view of another array. Every destination is
        # built with ``mx.zeros``, so after the add the indexed region is
        # expected to read back as the update.

        def add_into_zeros(shape, index, update):
            # Add ``update`` at ``index`` of a fresh zero array of ``shape`` and
            # require that the same index reads back as ``update``.
            out = mx.zeros(shape).at[index].add(update)
            self.assertEqualArray(out[index], update)

        seq20 = mx.arange(20, dtype=mx.float32).reshape(4, 5)
        seq30 = mx.arange(30, dtype=mx.float32).reshape(5, 6)
        seq60 = mx.arange(60, dtype=mx.float32).reshape(3, 4, 5)

        # Transposed update, shape (2, 5)
        add_into_zeros((4, 5), np.s_[1:3, :], mx.ones((5, 2)).T)

        # Transposed update written into a column slice
        add_into_zeros((5, 4), np.s_[:, 1:3], mx.ones((2, 5)).T)

        # Update is a row slice of another array, shape (2, 5)
        add_into_zeros((4, 5), np.s_[0:2, :], seq20[1:3, :])

        # Destination index and update are both 2-D slices
        add_into_zeros((5, 6), np.s_[1:4, 1:5], seq30[0:3, 0:4])

        # Update is a transposed slice of another array, shape (5, 2)
        add_into_zeros((5, 4), np.s_[:, 1:3], seq20[1:3, :].T)

        # Negative bounds in the destination slice
        add_into_zeros((5, 5), np.s_[-3:-1, :], mx.ones((2, 5)))

        # Strided destination slices
        add_into_zeros((6, 6), np.s_[1:5:2, 0:6:2], mx.ones((2, 3)))

        # Update is a slice of a transposed array, shape (5, 2)
        add_into_zeros((5, 4), np.s_[:, 1:3], seq20.T[:, 1:3])

        # 3-D destination with a transposed update, shape (3, 4, 5); the whole
        # result is compared rather than a re-indexed view.
        permuted = mx.ones((4, 3, 5)).transpose(1, 0, 2)
        full = mx.zeros((3, 4, 5)).at[:, :, :].add(permuted)
        self.assertEqualArray(full, permuted)

        # Update is a slice of a 3-D array
        add_into_zeros((3, 4, 5), np.s_[1:3, :, :], seq60[0:2, :, :])

        # Destination index mixes slices with an integer
        add_into_zeros((4, 5, 6), np.s_[1:3, 2, :], mx.ones((2, 6)))

        # Update is a strided slice, shape (3, 2, 5); whole result compared
        every_other = seq60[:, ::2, :]
        full = mx.zeros((3, 2, 5)).at[:, :, :].add(every_other)
        self.assertEqualArray(full, every_other)

    def test_slice_negative_step(self):
        # Applies the same negative-step index to a NumPy array and to its MLX
        # copy and requires both to select the same elements.

        def same_selection(ref, arr, index):
            self.assertTrue(np.array_equal(ref[index], arr[index]))

        line_np = np.arange(20)
        line_mx = mx.array(line_np)
        one_dim_indices = (
            np.s_[::-1],  # Basic negative slice
            np.s_[-3:3:-1],  # Bounds negative slice
            np.s_[25:-50:-1],  # Out-of-range bounds, negative slice
            np.s_[::-3],  # Jumping negative slice
            np.s_[-3:3:-3],  # Bounds and jumping negative slice
            np.s_[25:-50:-3],  # Out-of-range bounds and jumping negative slice
            np.s_[0:20:-3],  # Negative slice and ascending bounds
        )
        for index in one_dim_indices:
            same_selection(line_np, line_mx, index)

        # Multi-dim negative slices
        cube_np = np.arange(3 * 6 * 4).reshape(3, 6, 4)
        cube_mx = mx.array(cube_np)
        multi_dim_indices = (
            # Flip each dim
            np.s_[..., ::-1],
            np.s_[:, ::-1, :],
            np.s_[::-1, ...],
            # Flip pairs of dims
            np.s_[::-1, 1:5:2, ::-2],
            np.s_[::-1, ::-2, 1:5:2],
            # Flip all dims
            np.s_[::-1, ::-3, ::-2],
        )
        for index in multi_dim_indices:
            same_selection(cube_np, cube_mx, index)

    def test_api(self):
        # Every op is invoked both as the free function ``mx.<op>(x, ...)`` and
        # as the method ``x.<op>(...)``; the two results must agree in dtype,
        # shape and value.
        x = mx.array(np.random.rand(10, 10, 10))

        no_arg_ops = [
            "square",
            "sqrt",
            "rsqrt",
            "reciprocal",
            "exp",
            "log",
            "sin",
            "cos",
            "log1p",
            "abs",
            "log10",
            "log2",
            "conj",
        ]
        ops_with_args = [
            ("all", 1),
            ("any", 1),
            ("transpose", (0, 2, 1)),
            ("sum", 1),
            ("prod", 1),
            ("min", 1),
            ("max", 1),
            ("logcumsumexp", 1),
            ("logsumexp", 1),
            ("mean", 1),
            ("var", 1),
            ("argmin", 1),
            ("argmax", 1),
            ("cummax", 1),
            ("cummin", 1),
            ("cumprod", 1),
            ("cumsum", 1),
            ("diagonal", 0, 0, 1),
            ("flatten", 0, -1),
            ("moveaxis", 1, 2),
            ("round", 2),
            ("std", 1, True, 0),
            ("swapaxes", 1, 2),
        ]
        # Normalise everything to ``(name, *args)`` tuples, keeping the order.
        calls = [("reshape", (100, -1))]
        calls += [(name,) for name in no_arg_ops]
        calls += ops_with_args

        for name, *args in calls:
            from_function = getattr(mx, name)(x, *args)
            from_method = getattr(x, name)(*args)
            self.assertEqual(from_function.dtype, from_method.dtype)
            self.assertEqual(from_function.shape, from_method.shape)
            self.assertTrue(mx.array_equal(from_function, from_method))

        # ``split`` returns a sequence, so compare it piece by piece.
        parts_function = mx.split(x, 2)
        parts_method = x.split(2)
        self.assertEqual(len(parts_function), 2)
        self.assertEqual(len(parts_function), len(parts_method))
        for piece in (0, 1):
            self.assertTrue(
                mx.array_equal(parts_function[piece], parts_method[piece])
            )

        # ``squeeze`` needs an input with a size-1 axis.
        x = mx.array(np.random.rand(10, 10, 1))
        squeezed_function = mx.squeeze(x, axis=2)
        squeezed_method = x.squeeze(axis=2)
        self.assertEqual(squeezed_function.shape, squeezed_method.shape)
        self.assertTrue(mx.array_equal(squeezed_function, squeezed_method))

    def test_memoryless_copy(self):
        # Converting an MLX array with ``copy=False`` should produce a NumPy
        # array that does not own its data, for both a plain array and a
        # broadcast one.
        a_mx = mx.ones((2, 2))
        b_mx = mx.broadcast_to(a_mx, (5, 2, 2))

        # Make np arrays without copy
        a_np = np.array(a_mx, copy=False)
        b_np = np.array(b_mx, copy=False)

        # Check that each result is writeable but does not own the underlying
        # data. Both conversions are checked, not only the plain one.
        for view in (a_np, b_np):
            self.assertFalse(view.flags.owndata)
            self.assertTrue(view.flags.writeable)

        # Check contents
        self.assertTrue(np.array_equal(np.ones((2, 2), dtype=np.float32), a_np))
        self.assertTrue(np.array_equal(np.ones((5, 2, 2), dtype=np.float32), b_np))

        # Check strides: the broadcast leading axis has stride 0
        self.assertSequenceEqual(b_np.strides, (0, 8, 4))

    def test_np_array_conversion_copies_by_default(self):
        # Without ``copy=False`` the NumPy result must own its data and be
        # writeable.
        converted = np.array(mx.ones((2, 2)))
        flags = converted.flags
        self.assertTrue(flags.owndata)
        self.assertTrue(flags.writeable)

    def test_buffer_protocol(self):
        # Compares the buffer exported by MLX arrays (via ``memoryview``) with
        # the one NumPy exports for the same data: strides, shape, format and
        # contents. The third entry of each case is the NumPy info function
        # used to plant the dtype's min/max values, or None to plant nothing.
        cases = [
            (mx.bool_, np.bool_, None),
            (mx.uint8, np.uint8, np.iinfo),
            (mx.uint16, np.uint16, np.iinfo),
            (mx.uint32, np.uint32, np.iinfo),
            (mx.uint64, np.uint64, np.iinfo),
            (mx.int8, np.int8, np.iinfo),
            (mx.int16, np.int16, np.iinfo),
            (mx.int32, np.int32, np.iinfo),
            (mx.int64, np.int64, np.iinfo),
            (mx.float16, np.float16, np.finfo),
            (mx.float32, np.float32, np.finfo),
            (mx.complex64, np.complex64, np.finfo),
        ]

        def as_is(arr):
            return arr

        def transposed(arr):
            return arr.T

        for mlx_dtype, np_dtype, info_fn in cases:
            ref = np.random.uniform(low=0, high=100, size=(3, 4)).astype(np_dtype)
            if info_fn is not None:
                limits = info_fn(np_dtype)
                ref[0, 0] = limits.min
                ref[0, 1] = limits.max
            arr = mx.array(ref)
            label = f"{mlx_dtype}{np_dtype}"
            for view_of in (as_is, transposed):
                mv_mx = memoryview(view_of(arr))
                mv_np = memoryview(view_of(ref))
                self.assertEqual(mv_mx.strides, mv_np.strides, label)
                self.assertEqual(mv_mx.shape, mv_np.shape, label)

                # correct buffer format for 8 byte (unsigned) 'long long' is Q/q, see
                # https://docs.python.org/3.10/library/struct.html#format-characters
                # numpy returns L/l, as 'long' is equivalent to 'long long' on 64bit machines, so q and l are equivalent
                # see https://github.com/pybind/pybind11/issues/1908
                if np_dtype == np.uint64:
                    wanted_format = "Q"
                elif np_dtype == np.int64:
                    wanted_format = "q"
                # for windows long is 32bit and numpy returns L/l.
                elif np_dtype == np.uint32 and platform.system() == "Windows":
                    wanted_format = "I"
                elif np_dtype == np.int32 and platform.system() == "Windows":
                    wanted_format = "i"
                else:
                    wanted_format = mv_np.format
                self.assertEqual(mv_mx.format, wanted_format, label)

                self.assertFalse(mv_mx.readonly)
                round_trip = np.array(mv_mx, copy=False)
                self.assertEqualArray(round_trip, view_of(ref), atol=0, rtol=0)

        # extra test for bfloat16, which is not numpy convertible
        bf16 = mx.random.uniform(low=0, high=100, shape=(3, 4), dtype=mx.bfloat16)
        mv_bf16 = memoryview(bf16)
        self.assertEqual(mv_bf16.strides, (8, 2))
        self.assertEqual(mv_bf16.shape, (3, 4))
        self.assertEqual(mv_bf16.format, "B")
        with self.assertRaises(RuntimeError) as caught:
            np.array(bf16)
        message = str(caught.exception)
        self.assertTrue("Item size 2 for PEP 3118 buffer format string" in message)

        # Test buffer protocol with non-arrays ie bytes.
        # ord("a") * 257 places "a" in both bytes of each int16; adding 0..9
        # then advances only the low byte through "a".."j".
        packed = bytes(ord("a") * 257 + mx.arange(10).astype(mx.int16))
        self.assertEqual(len(packed), 20)
        if sys.byteorder == "little":
            high_bytes, low_bytes = packed[1::2], packed[::2]
        else:
            high_bytes, low_bytes = packed[::2], packed[1::2]
        self.assertEqual(b"aaaaaaaaaa", high_bytes)
        self.assertEqual(b"abcdefghij", low_bytes)

    def test_buffer_protocol_ref_counting(self):
        # A memoryview taken from an array is expected to keep that array
        # alive: the weak reference must stay valid until the memoryview is
        # released too.
        a = mx.arange(3)
        wr = weakref.ref(a)
        self.assertIsNotNone(wr())
        mv = memoryview(a)
        # Drop the named reference; only the memoryview still refers to it.
        a = None
        self.assertIsNotNone(wr())
        # With the memoryview gone the weak reference must be dead.
        mv = None
        self.assertIsNone(wr())

    def test_array_view_ref_counting(self):
        # A NumPy array created with ``copy=False`` is expected to keep the
        # source MLX array alive until the NumPy array itself is released.
        a = mx.arange(3)
        wr = weakref.ref(a)
        self.assertIsNotNone(wr())
        a_np = np.array(a, copy=False)
        # Drop the named reference; only the NumPy array still refers to it.
        a = None
        self.assertIsNotNone(wr())
        # With the NumPy array gone the weak reference must be dead.
        a_np = None
        self.assertIsNone(wr())

    @unittest.skipIf(not has_tf, "requires TensorFlow")
    def test_buffer_protocol_tf(self):
        # Compares the buffers exported by MLX arrays and TensorFlow tensors
        # holding the same data. Each case is (MLX dtype, TF dtype, NumPy dtype).
        cases = [
            (mx.bool_, tf.bool, np.bool_),
            (mx.uint8, tf.uint8, np.uint8),
            (mx.uint16, tf.uint16, np.uint16),
            (mx.uint32, tf.uint32, np.uint32),
            (mx.uint64, tf.uint64, np.uint64),
            (mx.int8, tf.int8, np.int8),
            (mx.int16, tf.int16, np.int16),
            (mx.int32, tf.int32, np.int32),
            (mx.int64, tf.int64, np.int64),
            (mx.float16, tf.float16, np.float16),
            (mx.float32, tf.float32, np.float32),
            (mx.complex64, tf.complex64, np.complex64),
        ]

        def as_is(value):
            return value

        def transposed(value):
            # TensorFlow tensors go through tf.transpose; anything else uses .T
            if isinstance(value, tf.Tensor):
                return tf.transpose(value)
            return value.T

        for mlx_dtype, tf_dtype, np_dtype in cases:
            host = np.random.uniform(low=0, high=100, size=(3, 4)).astype(np_dtype)
            tensor_tf = tf.constant(host, dtype=tf_dtype)
            tensor_mx = mx.array(np.array(tensor_tf))
            label = f"{mlx_dtype}{tf_dtype}"
            for view_of in (as_is, transposed):
                mv_mx = memoryview(view_of(tensor_mx))
                mv_tf = memoryview(view_of(tensor_tf))

                # Strides are only compared when both sides share a layout.
                both_c = mv_mx.c_contiguous and mv_tf.c_contiguous
                if both_c or (mv_mx.f_contiguous and mv_tf.f_contiguous):
                    self.assertEqual(mv_mx.strides, mv_tf.strides, label)

                self.assertEqual(mv_mx.shape, mv_tf.shape, label)
                self.assertFalse(mv_mx.readonly)
                round_trip = np.array(mv_mx)
                self.assertEqualArray(round_trip, view_of(tensor_tf), atol=0, rtol=0)

    def test_logical_overloads(self):
        # ``&`` and ``|`` with a float operand must be rejected.
        with self.assertRaises(ValueError):
            mx.array(1.0) & mx.array(1)
        with self.assertRaises(ValueError):
            mx.array(1.0) | mx.array(1)

        # Truth table samples for the boolean overloads: (result, expected).
        samples = (
            (mx.array(True) & True, True),
            (mx.array(True) & False, False),
            (mx.array(True) | False, True),
            (mx.array(False) | False, False),
            (~mx.array(False), True),
            (mx.array(False) ^ True, True),
        )
        for result, expected in samples:
            self.assertEqual(result.item(), expected)

    def test_inplace(self):
        # Checks the in-place operator overloads: after the operation the
        # result compares equal to the operand it was applied to, and (for the
        # dunder list) to what NumPy computes for the same inputs.
        inplace_dunders = [
            "__iadd__",
            "__isub__",
            "__imul__",
            "__ifloordiv__",
            "__imod__",
            "__ipow__",
            "__ixor__",
        ]

        for dunder in inplace_dunders:
            target = mx.array([1, 2, 3])
            target_np = np.array(target)
            returned = getattr(target, dunder)(3)
            self.assertTrue(mx.array_equal(target, returned))
            returned_np = getattr(target_np, dunder)(3)
            self.assertTrue(np.array_equal(returned_np, target))

        # In-place true division of an integer array is rejected
        with self.assertRaises(ValueError):
            ints = mx.array([1])
            ints /= 1

        # In-place true division of a float array
        original = mx.array([2.0])
        alias = original
        alias /= 2
        self.assertEqual(alias.item(), 1.0)
        self.assertEqual(alias.item(), original.item())

        # In-place logical and
        original = mx.array(True)
        alias = original
        alias &= False
        self.assertEqual(alias.item(), False)
        self.assertEqual(alias.item(), original.item())

        # In-place logical or
        original = mx.array(False)
        alias = original
        alias |= True
        self.assertEqual(alias.item(), True)
        self.assertEqual(alias.item(), original.item())

        # In-place matmul on its own
        original = mx.array([[1.0, 2.0], [3.0, 4.0]])
        alias = original
        alias @= original
        self.assertTrue(mx.array_equal(original, alias))

        # In-place xor
        flag = mx.array(False)
        flag ^= True
        self.assertEqual(flag.item(), True)

    def test_inplace_preserves_ids(self):
        # In-place arithmetic and item assignment must leave the name bound to
        # the same Python object, i.e. ``id`` does not change.
        arr = mx.array([1.0])
        identity = id(arr)

        arr += mx.array(2.0)
        self.assertEqual(id(arr), identity)

        arr[0] = 2.0
        self.assertEqual(id(arr), identity)

        arr -= mx.array(3.0)
        self.assertEqual(id(arr), identity)

        arr *= mx.array(3.0)
        self.assertEqual(id(arr), identity)

    def test_load_from_pickled_np(self):
        # A NumPy array that went through a pickle round trip must convert to
        # the same MLX array as the one that did not.
        originals = (
            np.array([1, 2, 3], dtype=np.int32),
            np.array([1.0, 2.0, 3.0], dtype=np.float16),
        )
        for original in originals:
            restored = pickle.loads(pickle.dumps(original))
            self.assertTrue(mx.array_equal(mx.array(original), mx.array(restored)))

    def test_multi_output_leak(self):
        # Runs a function that uses a multi-output op (divmod) twice and
        # requires the reported peak memory to be the same after both runs.
        def fun():
            a = mx.zeros((2**20))
            mx.eval(a)
            # Both outputs are dropped without being evaluated.
            b, c = mx.divmod(a, a)
            del b, c

        fun()
        mx.synchronize()
        peak_1 = mx.get_peak_memory()
        fun()
        mx.synchronize()
        peak_2 = mx.get_peak_memory()
        self.assertEqual(peak_1, peak_2)

        # Second variant: only one of the two outputs is used, and the result
        # returned by fun() is discarded by the caller.
        def fun():
            a = mx.array([1.0, 2.0, 3.0, 4.0])
            b, _ = mx.divmod(a, a)
            return mx.log(b)

        fun()
        mx.synchronize()
        peak_1 = mx.get_peak_memory()
        fun()
        mx.synchronize()
        peak_2 = mx.get_peak_memory()
        self.assertEqual(peak_1, peak_2)

    def test_add_numpy(self):
        # Adding a NumPy int32 scalar array to an MLX array gives an MLX int32.
        total = mx.array(1) + np.array(2, dtype=np.int32)
        self.assertEqual(total.dtype, mx.int32)
        self.assertEqual(total.item(), 3)

    def test_dlpack(self):
        # ``np.from_dlpack`` on an MLX array must give back the same values for
        # a scalar, a 2-D array, and a strided slice.
        strided = mx.arange(16).reshape(4, 4)[::2, ::2]
        sources = (
            mx.array(1, dtype=mx.int32),
            mx.array([[1.0, 2.0], [3.0, 4.0]]),
            strided,
        )
        for source in sources:
            imported = np.from_dlpack(source)
            self.assertTrue(mx.array_equal(imported, source))

    def test_getitem_with_list(self):
        # Indexing with a Python list must select the same elements as NumPy.
        single_list_cases = (
            (mx.array([1, 2, 3, 4, 5]), [0, 2, 4]),
            (mx.array([[1, 2], [3, 4], [5, 6]]), [0, 2]),
            (mx.arange(10).reshape(5, 2), [0, 2, 4]),
        )
        for arr, picks in single_list_cases:
            self.assertTrue(np.array_equal(arr[picks], np.array(arr)[picks]))

        # A list combined with an integer or a full slice, in either position.
        picks = [0, 2]
        grid = mx.arange(16).reshape(4, 4)
        grid_np = np.array(grid)
        everything = slice(None)
        for key in ((picks, 0), (picks, everything), (0, picks), (everything, picks)):
            self.assertTrue(np.array_equal(grid[key], grid_np[key]))

    def test_setitem_with_list(self):
        # Assigning through a Python list index must modify the same elements
        # as the identical assignment on a NumPy copy.
        single_list_cases = (
            (mx.array([1, 2, 3, 4, 5]), [0, 2, 4]),
            (mx.array([[1, 2], [3, 4], [5, 6]]), [0, 2]),
            (mx.arange(10).reshape(5, 2), [0, 2, 4]),
        )
        for arr, picks in single_list_cases:
            mirror = np.array(arr)
            arr[picks] = 3
            mirror[picks] = 3
            self.assertTrue(np.array_equal(arr, mirror))

        # A list combined with an integer or a full slice, in either position.
        # The assignments accumulate on the same pair of arrays, each writing a
        # distinct value (1, 2, 3, 4).
        picks = [0, 2]
        grid = mx.arange(16).reshape(4, 4)
        mirror = np.array(grid)
        everything = slice(None)
        keys = ((picks, 0), (picks, everything), (0, picks), (everything, picks))
        for value, key in enumerate(keys, start=1):
            grid[key] = value
            mirror[key] = value
            self.assertTrue(np.array_equal(grid, mirror))

    def test_setitem_with_boolean_mask(self):
        # Python list mask: masked positions take the source values in order
        target = mx.array([1.0, 2.0, 3.0])
        target[[True, False, True]] = mx.array([5.0, 6.0])
        self.assertTrue(mx.array_equal(target, mx.array([5.0, 2.0, 6.0])))

        # Scalar masks, first as an mx.array and then as a plain Python bool:
        # a true scalar mask assigns to every element
        for scalar_mask in (mx.array(True), True):
            target = mx.array([1.0, 2.0, 3.0])
            target[scalar_mask] = 5.0
            self.assertTrue(mx.array_equal(target, mx.array([5.0, 5.0, 5.0])))

        # Masks whose shape does not match the (10, 10, 10) array are rejected
        for mask_shape in ((1, 10, 10), (10, 10, 1)):
            mask_np = np.zeros(mask_shape, dtype=bool)
            with self.assertRaises(ValueError):
                mx.arange(1000).reshape(10, 10, 10)[mask_np] = 0

    def test_array_namespace(self):
        # The namespace returned by ``__array_namespace__`` exposes at least
        # ``array`` and ``add``.
        namespace = mx.array(1.0).__array_namespace__()
        for attribute in ("array", "add"):
            self.assertTrue(hasattr(namespace, attribute))

    def test_array_namespace_asarray(self):
        # ``asarray`` must be reachable through the array namespace and accept
        # lists, an explicit dtype, and existing MLX arrays.
        namespace = mx.array(1.0).__array_namespace__()
        self.assertTrue(hasattr(namespace, "asarray"))

        from_list = namespace.asarray([1, 2, 3])
        self.assertEqual(from_list.tolist(), [1, 2, 3])

        as_float = namespace.asarray([1, 2, 3], dtype=mx.float32)
        self.assertEqual(as_float.dtype, mx.float32)

        from_array = namespace.asarray(mx.array([4, 5, 6]))
        self.assertEqual(from_array.tolist(), [4, 5, 6])

    def test_asarray(self):
        # ``mx.asarray`` on sequences, scalars, MLX arrays and NumPy arrays,
        # with and without an explicit dtype.

        # Lists, tuples and mixed nestings all convert to the same nested list
        flat = [1, 2, 3]
        nested = [[1, 2], [3, 4]]
        sequence_cases = (
            # List inputs
            ([1, 2, 3], flat),
            ([[1, 2], [3, 4]], nested),
            # Tuple inputs
            ((1, 2, 3), flat),
            (((1, 2), (3, 4)), nested),
            # Mixed nesting
            ([(1, 2), (3, 4)], nested),
            (([1, 2], [3, 4]), nested),
        )
        for value, expected in sequence_cases:
            self.assertEqual(mx.asarray(value).tolist(), expected)

        # Scalar inputs; 3.14 is compared against the value ``item()`` gives
        # back after conversion
        scalar_cases = (
            (42, 42),
            (3.14, 3.140000104904175),
            (True, True),
            (1 + 2j, (1 + 2j)),
        )
        for value, expected in scalar_cases:
            self.assertEqual(mx.asarray(value).item(), expected)

        # MLX array inputs
        existing = mx.array([1, 2, 3])
        self.assertEqual(mx.asarray(existing).tolist(), [1, 2, 3])

        as_float = mx.asarray(mx.array([1, 2, 3], dtype=mx.int32), dtype=mx.float32)
        self.assertEqual(as_float.dtype, mx.float32)
        self.assertEqual(as_float.tolist(), [1.0, 2.0, 3.0])

        # NumPy array inputs
        from_numpy = mx.asarray(np.array([1.0, 2.0, 3.0], dtype=np.float32))
        self.assertEqual(from_numpy.tolist(), [1.0, 2.0, 3.0])
        self.assertEqual(from_numpy.dtype, mx.float32)

        # dtype parameter
        self.assertEqual(mx.asarray([1, 2, 3], dtype=mx.float32).dtype, mx.float32)
        self.assertEqual(mx.asarray(42, dtype=mx.float16).dtype, mx.float16)

    def test_to_scalar(self):
        # ``int()`` / ``float()`` work on scalar arrays and raise ValueError on
        # an array with more than one element.
        whole = mx.array(1)
        self.assertEqual(int(whole), 1)
        self.assertEqual(float(whole), 1)

        fractional = mx.array(1.5)
        self.assertEqual(float(fractional), 1.5)
        self.assertEqual(int(fractional), 1)

        column = mx.zeros((2, 1))
        for convert in (float, int):
            with self.assertRaises(ValueError):
                convert(column)

    def test_format(self):
        # Format specs apply to scalar arrays; a multi-element array rejects
        # them with TypeError but still formats with an empty spec.
        counting = mx.arange(3)
        self.assertEqual(f"{a[0]:.2f}" if False else f"{counting[0]:.2f}", "0.00")

        fraction = mx.array(0.35487)
        self.assertEqual(f"{fraction:.1f}", "0.4")

        with self.assertRaises(TypeError):
            _ = f"{counting:.2f}"

        triple = mx.array([1, 2, 3])
        self.assertEqual(f"{triple}", "array([1, 2, 3], dtype=int32)")

    def test_deep_graphs(self):
        # The following tests should simply run cleanly without a segfault or
        # crash due to exceeding recursion depth limits.
        # There are no assertions: each section builds a chain of 100,000 ops
        # and either drops it unevaluated or evaluates it.

        # Deep graph destroyed without eval
        x = mx.array([1.0, 2.0])
        for _ in range(100_000):
            x = mx.sin(x)
        del x

        # Duplicate input deep graph destroyed without eval
        # (there is no explicit ``del`` here; ``x`` is rebound by the next
        # section instead)
        x = mx.array([1.0, 2.0])
        for _ in range(100_000):
            x = x + x

        # Deep graph with siblings destroyed without eval
        x = mx.array([1, 2])
        for _ in range(100_000):
            x = mx.concatenate(mx.split(x, 2))
        del x

        # Deep graph with eval
        x = mx.array([1.0, 2.0])
        for _ in range(100_000):
            x = mx.sin(x)
        mx.eval(x)

    @unittest.skipIf(platform.system() == "Windows", "Memory info not accurate")
    def test_siblings_without_eval(self):
        # Repeatedly builds a small graph from a two-output split without
        # evaluating it, and requires the process RSS to be unchanged
        # afterwards.
        def get_mem():
            # Resident set size of the current process, as reported by psutil.
            process = psutil.Process(os.getpid())
            return process.memory_info().rss

        key = mx.array([1, 2])

        def t():
            # Only one of the two split outputs is returned; the caller
            # discards it.
            a, b = mx.split(key, 2)
            a = mx.reshape(a, [])
            b = mx.reshape(b, [])
            return b

        mx.synchronize()
        # One call and a garbage collection before the baseline is taken.
        t()
        gc.collect()
        expected = get_mem()
        for _ in range(100):
            t()
        used = get_mem()
        self.assertEqual(expected, used)

    def test_scalar_integer_conversion_overflow(self):
        # Combining an int32 array with a Python int above the int32 range must
        # raise, through both the operator and ``mx.add``.
        in_range = mx.array(2000000000, dtype=mx.int32)
        out_of_range = 3000000000
        with self.assertRaises(ValueError):
            in_range + out_of_range
        with self.assertRaises(ValueError):
            mx.add(in_range, out_of_range)

    def test_real_imag(self):
        # ``real`` / ``imag`` on a real-valued array: the imaginary part is 0
        real_valued = mx.array([1.0])
        self.assertEqual(real_valued.real.item(), 1.0)
        self.assertEqual(real_valued.imag.item(), 0.0)

        # ``real`` / ``imag`` on a complex array
        complex_valued = mx.array([1.0 + 1.0j])
        self.assertEqual(complex_valued.imag.item(), 1.0)
        self.assertEqual(complex_valued.real.item(), 1.0)

    def test_large_indices(self):
        # An index of 2**32 must raise ValueError, both as a slice bound and
        # as a plain integer index.
        small = mx.array([0, 1, 2])
        huge = 2**32
        with self.assertRaises(ValueError):
            small[:huge]
        with self.assertRaises(ValueError):
            small[huge]


# When this module is executed as a script, hand control to the runner
# provided by the mlx_tests helper module.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_autograd.py
================================================
# Copyright © 2023 Apple Inc.

import gc
import unittest

import mlx.core as mx
import mlx_tests


class TestAutograd(mlx_tests.MLXTestCase):
    def test_jvp(self):
        # One input, one output.
        def double(x):
            return 2 * x

        primal_out, tangent_out = mx.jvp(double, [mx.array(1.0)], [mx.array(2.0)])
        self.assertEqual(primal_out[0].item(), 2.0)
        self.assertEqual(tangent_out[0].item(), 4.0)

        # Two inputs, one output.
        def product(x, y):
            return x * y

        primals = [mx.array(4.0), mx.array(2.0)]
        tangents = [mx.array(3.0), mx.array(2.0)]
        _, tangent_out = mx.jvp(product, primals, tangents)
        self.assertEqual(tangent_out[0].item(), 4.0 * 2.0 + 2.0 * 3.0)

        # Three inputs, two outputs.
        def pair_products(x, y, z):
            return (x * y, y * z)

        primals = [mx.array(2.0), mx.array(4.0), mx.array(6.0)]
        tangents = [mx.array(1.0), mx.array(3.0), mx.array(1.0)]
        _, tangent_out = mx.jvp(pair_products, primals, tangents)
        self.assertEqual(len(tangent_out), 2)
        self.assertEqual(tangent_out[0].item(), 4.0 * 1.0 + 2.0 * 3.0)
        self.assertEqual(tangent_out[1].item(), 4.0 * 1.0 + 6.0 * 3.0)

    def test_jvp_comparison_tangent_dtype(self):
        # Regression test for issue #3081: the JVP tangent of a comparison op
        # should keep the dtype of the input tangent (float32 here) instead of
        # being bool, since bool tangents crash downstream ops such as negative.
        primal = mx.array([1.0, -2.0, 3.0])
        tangent = mx.ones_like(primal)

        comparisons = (
            mx.greater,
            mx.less,
            mx.equal,
            mx.greater_equal,
            mx.less_equal,
            mx.not_equal,
        )
        for compare in comparisons:
            # Bind the current op through a default so each function keeps its own.
            def against_zero(x, _op=compare):
                return _op(x, 0.0)

            _, tangents = mx.jvp(against_zero, [primal], [tangent])
            self.assertEqual(tangents[0].dtype, mx.float32)

    def test_vjp(self):
        # One input, one output.
        def double(x):
            return 2 * x

        primal_out, grads = mx.vjp(double, [mx.array(1.0)], [mx.array(2.0)])
        self.assertEqual(primal_out[0].item(), 2.0)
        self.assertEqual(grads[0].item(), 4.0)

        # Two inputs, one output: each input gets the cotangent scaled by the
        # other input.
        def product(x, y):
            return x * y

        _, grads = mx.vjp(product, [mx.array(4.0), mx.array(2.0)], [mx.array(3.0)])
        self.assertEqual(grads[0].item(), 6.0)
        self.assertEqual(grads[1].item(), 12.0)

        # Three inputs, two outputs: the middle input feeds both outputs.
        def pair_products(x, y, z):
            return (x * y, y * z)

        primals = [mx.array(2.0), mx.array(4.0), mx.array(6.0)]
        cotangents = [mx.array(1.0), mx.array(3.0)]
        _, grads = mx.vjp(pair_products, primals, cotangents)
        self.assertEqual(len(grads), 3)
        self.assertEqual(grads[0].item(), 4.0 * 1.0)
        self.assertEqual(grads[1].item(), 2.0 * 1.0 + 6.0 * 3.0)
        self.assertEqual(grads[2].item(), 4.0 * 3.0)

    def test_grad(self):
        def square(x):
            return x * x

        # value_and_grad returns both the value and the first derivative.
        value, dfdx = mx.value_and_grad(square)(mx.array(0.5))
        self.assertEqual(value.item(), 0.25)
        self.assertEqual(dfdx.item(), 1.0)

        dfdx = mx.grad(square)(mx.array(0.5))
        self.assertEqual(dfdx.item(), 1.0)

        # Higher-order derivatives are obtained by nesting grad.
        second = mx.grad(mx.grad(square))(mx.array(0.5))
        self.assertEqual(second.item(), 2.0)
        third = mx.grad(mx.grad(mx.grad(square)))(mx.array(0.5))
        self.assertEqual(third.item(), 0.0)

        # argnums picks the positional argument to differentiate.
        def product(x, y):
            return x * y

        x = mx.array(2.0)
        y = mx.array(3.0)
        wrt_x = mx.grad(product, argnums=0)(x, y)
        self.assertEqual(wrt_x.item(), 3.0)
        wrt_y = mx.grad(product, argnums=1)(x, y)
        self.assertEqual(wrt_y.item(), 2.0)

        # Non-array arguments may be passed through to the function.
        def first(x, y):
            return x

        value, dfdx = mx.value_and_grad(first)(mx.array(2.0), "hello")
        self.assertEqual(value.item(), 2.0)
        self.assertEqual(dfdx.item(), 1.0)

        dfdx = mx.grad(first)(mx.array(2.0), "hello")
        self.assertEqual(dfdx.item(), 1.0)

        # The function must return an array.
        def returns_string(x):
            return "hello"

        with self.assertRaises(ValueError):
            mx.grad(returns_string)(mx.array(2.0))

        # Invalid argument number or argument type.
        def identity(x):
            return x

        with self.assertRaises(ValueError):
            mx.grad(identity, argnums=2)(mx.array(2.0))
        with self.assertRaises(ValueError):
            mx.grad(identity, argnums=-2)(mx.array(2.0))
        with self.assertRaises(ValueError):
            mx.grad(identity)("hello")

        # The output must be a scalar array.
        def keepdims_sum(x):
            return mx.sum(x, keepdims=True)

        with self.assertRaises(ValueError):
            mx.grad(keepdims_sum)(mx.ones((2, 2)))

    def test_grad_trees(self):
        def product(x, y):
            return x * y

        # A tuple of argnums yields a tuple of gradients.
        value, grads = mx.value_and_grad(product, (0, 1))(
            mx.array(0.5), mx.array(2.0)
        )
        self.assertEqual(value.item(), 1.0)
        self.assertTrue(isinstance(grads, tuple))
        self.assertEqual(grads[0].item(), 2.0)
        self.assertEqual(grads[1].item(), 0.5)

        # A single integer argnum yields a bare array.
        value, grad_y = mx.value_and_grad(product, 1)(mx.array(0.5), mx.array(2.0))
        self.assertEqual(value.item(), 1.0)
        self.assertEqual(grad_y.item(), 0.5)

        # A dict argument produces a dict of gradients with the same keys.
        def dict_product(p):
            return p["x"] * p["y"]

        value, grads = mx.value_and_grad(dict_product)(
            {"x": mx.array(0.5), "y": mx.array(2.0)}
        )
        self.assertEqual(value.item(), 1.0)
        self.assertEqual(grads["x"].item(), 2.0)
        self.assertEqual(grads["y"].item(), 0.5)

        # A non-array leaf, or argnums beyond the single positional argument,
        # is an error.
        with self.assertRaises(ValueError):
            mx.value_and_grad(dict_product)({"x": 0.5, "y": mx.array(2.0)})
        with self.assertRaises(ValueError):
            mx.value_and_grad(dict_product, (0, 1))(
                {"x": mx.array(0.5), "y": mx.array(2.0)}
            )

        # Nested containers, including empty lists.
        def nested(p, b):
            return mx.square(p[0]["foo"][2]) * b

        value, grads = mx.value_and_grad(nested)(
            [{"foo": [[], [], mx.array(2.0)]}], mx.array(0.5)
        )
        self.assertEqual(value.item(), 2.0)
        self.assertEqual(grads[0]["foo"][2].item(), 2.0)

        # Malformed argnums are rejected when the transform is constructed.
        def identity(x):
            return x

        with self.assertRaises(TypeError):
            mx.value_and_grad(identity, (None, None))
        with self.assertRaises(ValueError):
            mx.value_and_grad(identity, tuple())
        with self.assertRaises(ValueError):
            mx.grad(identity, argnums=(0, 0))

    def test_auxiliary_values(self):
        # `fun` returns (loss, aux), where aux mixes arrays with plain Python
        # values.
        def fun(x, y):
            loss = (x * y).sum()
            aux = {
                "loss": loss,
                "foo": y.square() + x.square(),
                "bar": [1, 2, 3, y, x],
            }
            return loss, aux

        fun_value_grad = mx.value_and_grad(fun)
        fun_grad = mx.grad(fun)

        (loss, aux), grad = fun_value_grad(mx.ones((2, 2)), mx.ones((2, 2)))
        self.assertEqual(aux["loss"].item(), 4)
        self.assertTrue(mx.array_equal(grad, mx.ones((2, 2))))
        self.assertTrue(mx.array_equal(aux["foo"], 2 * mx.ones((2, 2))))
        self.assertEqual(aux["bar"][:3], [1, 2, 3])
        self.assertTrue(mx.array_equal(aux["bar"][3], mx.ones((2, 2))))
        self.assertTrue(mx.array_equal(aux["bar"][4], mx.ones((2, 2))))

        # Plain grad does not accept a function with auxiliary outputs.
        with self.assertRaises(ValueError):
            _ = fun_grad(mx.ones((2, 2)), mx.ones((2, 2)))

    def test_grad_kwargs(self):
        # Parameter names matter below because arguments are passed by keyword.
        def product(x, y):
            return x * y

        a, b = mx.array(0.5), mx.array(2.0)

        # Default: differentiate the first positional argument.
        dfdx = mx.grad(product)
        self.assertEqual(dfdx(a, b).item(), 2.0)
        self.assertEqual(dfdx(a, y=b).item(), 2.0)
        with self.assertRaises(ValueError):
            dfdx(x=a, y=b).item()

        # Keyword-only differentiation: the result is (None, {name: grad}).
        dfdy = mx.grad(product, argnums=[], argnames=["y"])
        with self.assertRaises(ValueError):
            dfdy(a, b)
        grads = dfdy(a, y=b)
        self.assertTrue(isinstance(grads, tuple))
        self.assertTrue(grads[0] is None)
        self.assertTrue(isinstance(grads[1], dict))
        self.assertEqual(grads[1]["y"].item(), 0.5)
        grads = dfdy(x=a, y=b)
        self.assertEqual(grads[1]["y"].item(), 0.5)
        self.assertEqual(len(grads[1]), 1)

        # One positional and one keyword argument.
        dfdxy = mx.grad(product, argnums=[0], argnames=["y"])
        with self.assertRaises(ValueError):
            dfdxy(a, b)
        with self.assertRaises(ValueError):
            dfdxy(x=a, y=b)
        grads = dfdxy(a, y=b)
        self.assertTrue(isinstance(grads, tuple))
        self.assertEqual(grads[0].item(), 2.0)
        self.assertTrue(isinstance(grads[1], dict))
        self.assertEqual(grads[1]["y"].item(), 0.5)

        # Two positional arguments plus one keyword argument.
        def triple_product(x, y, z):
            return x * y * z

        dfdxyz = mx.grad(triple_product, argnums=[0, 1], argnames=["z"])
        c = mx.array(4.0)
        grads = dfdxyz(a, b, z=c)
        self.assertTrue(isinstance(grads, tuple))
        self.assertTrue(isinstance(grads[0], tuple))
        self.assertEqual(grads[0][0].item(), 8.0)
        self.assertEqual(grads[0][1].item(), 2.0)
        self.assertTrue(isinstance(grads[1], dict))
        self.assertEqual(grads[1]["z"].item(), 1.0)

        # argnames alone, with argnums left unspecified.
        dfdy = mx.grad(product, argnames=["y"])
        grads = dfdy(a, y=b)
        self.assertTrue(isinstance(grads, tuple))
        self.assertTrue(grads[0] is None)
        self.assertTrue(isinstance(grads[1], dict))
        self.assertEqual(grads[1]["y"].item(), 0.5)

    def test_captured(self):
        a = mx.array(5.0)

        # `a` is captured from the enclosing scope; only the explicit argument
        # is differentiated.
        def f(x):
            return a + x

        def g(x):
            return a + a

        def h(x):
            return x + x

        dfdx = mx.grad(f)
        dgdx = mx.grad(g)
        dhdx = mx.grad(h)
        self.assertEqual(dfdx(a).item(), 1.0)
        self.assertEqual(dgdx(a).item(), 0.0)
        self.assertEqual(dhdx(a).item(), 2.0)

        # All second derivatives vanish.
        for first_derivative in (dfdx, dgdx, dhdx):
            second_derivative = mx.grad(first_derivative)
            self.assertEqual(second_derivative(a).item(), 0.0)

    def test_stop_gradient(self):
        shape = (4, 4)
        w_in = mx.ones(shape)
        x_in = mx.ones(shape)
        cotan = mx.ones(shape)

        def h(w, x):
            # Everything derived from x passes through stop_gradient.
            frozen = mx.stop_gradient(2 * x)
            return w @ (3 * frozen)

        _, vjps = mx.vjp(h, [w_in, x_in], [cotan])
        mx.eval(vjps)

        self.assertTrue(mx.allclose(vjps[0], 24.0 * mx.ones(shape)))
        self.assertTrue(mx.allclose(vjps[1], mx.zeros(shape)))

        # Same check with w captured instead of passed as a primal.
        def g(x):
            return h(w_in, x)

        _, vjps = mx.vjp(g, [x_in], [cotan])
        mx.eval(vjps)

        self.assertTrue(mx.allclose(vjps[0], mx.zeros(shape)))

    def test_update_state(self):
        # State rebound from inside the differentiated function must remain
        # usable (and evaluable) after grad returns.
        y = mx.array([1.0])
        state = mx.zeros((2,))

        def fn(y, x):
            nonlocal state
            scaled = y * x
            state = state + scaled
            return scaled.sum()

        x = mx.ones((2,))
        mx.grad(fn)(y, x)
        mx.eval(state)
        self.assertTrue(mx.allclose(state, mx.ones((2,))))

    def test_scatter_vjp(self):
        # Gradient w.r.t. the array being overwritten: overwritten slots get 0.
        def overwrite_then_sum(x, idx):
            x[idx] = 2.0
            return x.sum()

        grad = mx.grad(overwrite_then_sum)(
            mx.array([1.0, 2.0, 3.0, 4.0]), mx.array([1, 3])
        )
        self.assertTrue(mx.array_equal(grad, mx.array([1.0, 0.0, 1.0, 0.0])))
        self.assertEqual(grad.dtype, mx.float32)

        y = mx.array([0.0, 1.0, 2.0, 3.0])

        # Gradient w.r.t. the values written into a captured array.
        def write_then_sum(x, idx):
            y[idx] = x
            return y.sum()

        grad = mx.grad(write_then_sum)(mx.array([2.0, 3.0]), mx.array([1, 3]))
        self.assertTrue(mx.array_equal(grad, mx.array([1.0, 1.0])))
        self.assertEqual(grad.dtype, mx.float32)

    def test_scatter_add_vjp(self):
        def scatter_add(src, updates):
            return src.at[mx.array([1, 3])].add(updates)

        cotan = mx.array([4.0, 5.0, 6.0, 7.0])
        updates = mx.array([1.0, 2.0])
        primals = [mx.array([1.0, 2.0, 3.0, 4.0]), updates]
        _, vjps = mx.vjp(scatter_add, primals, [cotan])
        mx.eval(vjps)

        # src receives the full cotangent; updates receive it at the scattered
        # positions.
        d_src, d_updates = vjps
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 5.0, 6.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([5.0, 7.0])))

    def test_scatter_max_vjp(self):
        def scatter_max(src, updates):
            return src.at[mx.array([1, 3])].maximum(updates)

        cotan = mx.array([4.0, 5.0, 6.0, 7.0])

        def run_vjp(updates):
            primals = [mx.array([1.0, 2.0, 3.0, 4.0]), updates]
            _, vjps = mx.vjp(scatter_max, primals, [cotan])
            mx.eval(vjps)
            return vjps

        # Updates below the existing entries: src keeps every gradient.
        d_src, d_updates = run_vjp(mx.array([1.0, 2.0]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 5.0, 6.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([0.0, 0.0])))

        # Updates above the existing entries: the gradient moves to updates.
        d_src, d_updates = run_vjp(mx.array([5.0, 6.0]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 0.0, 6.0, 0.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([5.0, 7.0])))

    def test_scatter_min_vjp(self):
        def scatter_min(src, updates):
            return src.at[mx.array([1, 3])].minimum(updates)

        cotan = mx.array([4.0, 5.0, 6.0, 7.0])

        def run_vjp(updates):
            primals = [mx.array([1.0, 2.0, 3.0, 4.0]), updates]
            _, vjps = mx.vjp(scatter_min, primals, [cotan])
            mx.eval(vjps)
            return vjps

        # Updates above the existing entries: src keeps every gradient.
        d_src, d_updates = run_vjp(mx.array([5.0, 6.0]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 5.0, 6.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([0.0, 0.0])))

        # Updates below the existing entries: the gradient moves to updates.
        d_src, d_updates = run_vjp(mx.array([1.0, 1.0]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 0.0, 6.0, 0.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([5.0, 7.0])))

    def test_slice_update_max_vjp(self):
        def slice_max(src, updates):
            return src.at[1:3].maximum(updates)

        cotan = mx.array([4.0, 5.0, 6.0, 7.0])

        def run_vjp(updates):
            primals = [mx.array([1.0, 2.0, 3.0, 4.0]), updates]
            _, vjps = mx.vjp(slice_max, primals, [cotan])
            mx.eval(vjps)
            return vjps

        # Updates below the sliced entries: src keeps every gradient.
        d_src, d_updates = run_vjp(mx.array([[1.0, 2.0]]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 5.0, 6.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([[0.0, 0.0]])))

        # Updates above the sliced entries: the gradient moves to updates.
        d_src, d_updates = run_vjp(mx.array([[5.0, 6.0]]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 0.0, 0.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([[5.0, 6.0]])))

    def test_slice_update_min_vjp(self):
        def slice_min(src, updates):
            return src.at[1:3].minimum(updates)

        cotan = mx.array([4.0, 5.0, 6.0, 7.0])

        def run_vjp(updates):
            primals = [mx.array([1.0, 2.0, 3.0, 4.0]), updates]
            _, vjps = mx.vjp(slice_min, primals, [cotan])
            mx.eval(vjps)
            return vjps

        # Updates above the sliced entries: src keeps every gradient.
        d_src, d_updates = run_vjp(mx.array([[5.0, 6.0]]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 5.0, 6.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([[0.0, 0.0]])))

        # Updates below the sliced entries: the gradient moves to updates.
        d_src, d_updates = run_vjp(mx.array([[1.0, 1.0]]))
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 0.0, 0.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([[5.0, 6.0]])))

    def test_slice_update_add_vjp(self):
        def slice_add(src, updates):
            return src.at[1:3].add(updates)

        cotan = mx.array([4.0, 5.0, 6.0, 7.0])
        updates = mx.array([[1.0, 2.0]])
        primals = [mx.array([1.0, 2.0, 3.0, 4.0]), updates]
        _, vjps = mx.vjp(slice_add, primals, [cotan])
        mx.eval(vjps)

        # src receives the full cotangent; updates receive the sliced part.
        d_src, d_updates = vjps
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 5.0, 6.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([[5.0, 6.0]])))

    def test_slice_update_multiply_vjp(self):
        def slice_multiply(src, updates):
            return src.at[1:3].multiply(updates)

        cotan = mx.array([4.0, 5.0, 6.0, 7.0])
        updates = mx.array([[2.0, 3.0]])
        primals = [mx.array([1.0, 2.0, 3.0, 4.0]), updates]
        _, vjps = mx.vjp(slice_multiply, primals, [cotan])
        mx.eval(vjps)

        # Inside the slice each operand's gradient is the cotangent times the
        # other operand.
        d_src, d_updates = vjps
        self.assertTrue(mx.allclose(d_src, mx.array([4.0, 10.0, 18.0, 7.0])))
        self.assertTrue(mx.allclose(d_updates, mx.array([[10.0, 18.0]])))

    def test_split_against_slice(self):
        # The same computation expressed with split and with slicing must give
        # matching gradients.
        def via_split(x):
            first, _, last = x.split(3, -1)
            return (first * last).sum()

        def via_slice(x):
            width = x.shape[-1] // 3
            first = x[..., :width]
            last = x[..., -width:]
            return (first * last).sum()

        x = mx.random.uniform(shape=(100, 300))
        mx.eval(x)

        grad_split = mx.grad(via_split)
        grad_slice = mx.grad(via_slice)

        self.assertTrue(mx.allclose(grad_split(x), grad_slice(x)))

    def test_vjp_types(self):
        # The gradient dtype must match the input dtype for each float type.
        float_types = (mx.float16, mx.bfloat16, mx.float32)

        def identity(x):
            return x

        for dtype in float_types:
            grad = mx.grad(identity)(mx.array(1.0, dtype))
            self.assertEqual(grad.dtype, dtype)

        def total(x):
            return x.sum()

        for dtype in float_types:
            grad = mx.grad(total)(mx.array(1.0, dtype))
            self.assertEqual(grad.dtype, dtype)

        def total_of_sum(x, y):
            return (x + y).sum()

        for dtype in float_types:
            grad = mx.grad(total_of_sum)(mx.array(1.0, dtype), mx.array(1.0, dtype))
            self.assertEqual(grad.dtype, dtype)

    def test_power_grad(self):
        # Gradient of a power with a constant exponent, including at a zero
        # base.
        def squared(x):
            return x**2

        def to_three_halves(x):
            return x**1.5

        at_zero = mx.array(0.0)
        grad = mx.grad(squared)(at_zero)
        self.assertEqual(grad.item(), 0.0)

        at_zero = mx.array(0.0)
        grad = mx.grad(to_three_halves)(at_zero)
        self.assertEqual(grad.item(), 0.0)

        at_two = mx.array(2.0)
        grad = mx.grad(squared)(at_two)
        self.assertAlmostEqual(grad.item(), 4.0)

    def test_eval_in_grad(self):
        # Forcing evaluation of an intermediate inside the differentiated
        # function must not break the vjp.
        arr = mx.array([1.0])
        cotan = mx.array([1.0, 1.0])
        y = mx.array([2.0, 2.0])

        # Evaluation triggered indirectly through tolist().
        def with_tolist(x):
            shifted = x + y
            below_one = shifted < 1
            below_one.tolist()
            return shifted**2

        _, vjps = mx.vjp(with_tolist, (arr,), (cotan,))
        self.assertEqual(vjps[0].item(), 12.0)

        # Evaluation triggered explicitly through mx.eval().
        def with_eval(x):
            shifted = x + mx.array([1.0, 1.0])
            mx.eval(shifted)
            return shifted**2

        _, vjps = mx.vjp(with_eval, (arr,), (cotan,))
        self.assertEqual(vjps[0].item(), 8.0)

    def test_power_grad_zero_base_variable_exponent(self):
        # This method was previously a second `test_power_grad`. A later `def`
        # with the same name replaces the earlier one in the class body, so the
        # earlier `test_power_grad` was never collected or run. Giving this
        # test its own name lets unittest run both.
        #
        # The differentiated input appears in the base and as the exponent, and
        # with x == y the base is exactly zero.
        def fun(x, y):
            res = x - y
            return res**x

        grad = mx.grad(fun)(mx.array(1.0), mx.array(1.0))
        self.assertEqual(grad.item(), 1.0)

    def test_cumprod_grad(self):
        # The same three inputs (no zero, one zero, two zeros) are checked for
        # each cumprod configuration.
        inputs = (
            [2.0, 1.0, 2.0, 2.0, 3.0],
            [2.0, 0.0, 2.0, 2.0, 3.0],
            [2.0, 0.0, 2.0, 0.0, 3.0],
        )

        # (cumprod keyword arguments, expected gradient for each input above)
        cases = (
            (
                {},
                (
                    [20.0, 38.0, 18.0, 16.0, 8.0],
                    [1.0, 38.0, 0.0, 0.0, 0.0],
                    [1.0, 6.0, 0.0, 0.0, 0.0],
                ),
            ),
            (
                {"inclusive": False},
                (
                    [8.0, 14.0, 6.0, 4.0, 0.0],
                    [1.0, 14.0, 0.0, 0.0, 0.0],
                    [1.0, 6.0, 0.0, 0.0, 0.0],
                ),
            ),
            (
                {"inclusive": False, "reverse": True},
                (
                    [0.0, 12.0, 12.0, 15.0, 11.0],
                    [0.0, 12.0, 6.0, 9.0, 7.0],
                    [0.0, 0.0, 0.0, 9.0, 1.0],
                ),
            ),
            (
                {"reverse": True},
                (
                    [12.0, 36.0, 24.0, 27.0, 19.0],
                    [0.0, 36.0, 6.0, 9.0, 7.0],
                    [0.0, 0.0, 0.0, 9.0, 1.0],
                ),
            ),
        )

        for cumprod_kwargs, expected_grads in cases:
            # Bind the current kwargs through a default so each function keeps
            # its own configuration.
            def fun(y, _kwargs=cumprod_kwargs):
                return mx.cumprod(y, **_kwargs).sum()

            for values, expected_values in zip(inputs, expected_grads):
                y = mx.array(values)
                out = mx.grad(fun)(y)
                expected = mx.array(expected_values)
                self.assertTrue(mx.allclose(out, expected))

    def test_topk_grad(self):
        a = mx.array([[1, 2, 6, 4, 5], [9, 5, 6, 7, 8]], mx.float32)

        def top_two(x):
            return mx.topk(x, 2)

        # The cotangent of ones lands on the two largest entries of each row.
        _, vjps = mx.vjp(top_two, (a,), (mx.ones((2, 2)),))
        grad = vjps[0]
        expected = mx.array([[0, 0, 1, 0, 1], [1, 0, 0, 0, 1]], mx.float32)
        self.assertTrue(mx.array_equal(grad, expected))

    def test_custom_function(self):
        # Exercises mx.custom_function: first with no overrides, then with the
        # vjp, jvp and vmap rules replaced one after another, and finally with
        # pytree (dict) inputs and outputs.

        # Make a custom function
        my_exp = mx.custom_function(mx.exp)

        # Ensure everything works
        dy = mx.grad(my_exp)(mx.array(1.0))
        self.assertTrue(mx.allclose(dy, mx.exp(mx.array(1.0))))
        (ex,), (dex,) = mx.jvp(my_exp, [mx.array(1.0)], [mx.array(1.0)])
        self.assertTrue(mx.allclose(dex, mx.exp(mx.array(1.0))))
        self.assertTrue(mx.allclose(ex, dex))
        ex = mx.vmap(my_exp)(mx.ones(10))
        self.assertTrue(mx.allclose(ex, mx.exp(mx.ones(10))))

        # Ensure that the vjp is being overridden but everything else still
        # works.
        @my_exp.vjp
        def my_exp_vjp(x, dx, ex):
            return mx.ones_like(x) * 42

        dy = mx.grad(my_exp)(mx.array(1.0))
        self.assertTrue(mx.allclose(dy, mx.array(42.0)))
        (ex,), (dex,) = mx.jvp(my_exp, [mx.array(1.0)], [mx.array(1.0)])
        self.assertTrue(mx.allclose(dex, mx.exp(mx.array(1.0))))
        self.assertTrue(mx.allclose(ex, dex))
        ex = mx.vmap(my_exp)(mx.ones(10))
        self.assertTrue(mx.allclose(ex, mx.exp(mx.ones(10))))

        # Ensure that setting the jvp and vmap also works.
        @my_exp.jvp
        def my_exp_jvp(x, dx):
            return mx.ones_like(x) * 7 * dx

        @my_exp.vmap
        def my_exp_vmap(x, axis):
            return mx.ones_like(x) * 3, axis

        # The vjp override from above is still in effect alongside the new
        # jvp and vmap rules.
        dy = mx.grad(my_exp)(mx.array(1.0))
        self.assertTrue(mx.allclose(dy, mx.array(42.0)))
        (ex,), (dex,) = mx.jvp(my_exp, [mx.array(1.0)], [mx.array(1.0)])
        self.assertTrue(mx.allclose(dex, mx.array(7.0)))
        self.assertTrue(mx.allclose(ex, mx.exp(mx.array(1.0))))
        ex = mx.vmap(my_exp)(mx.ones(10))
        self.assertTrue(mx.allclose(ex, 3 * mx.ones(10)))

        # Test pytrees
        @mx.custom_function
        def my_double(params):
            return {"out": 2 * params["x"] * params["y"]}

        dy = mx.grad(lambda p: my_double(p)["out"].sum())(
            {"x": mx.ones(2), "y": mx.ones(2)}
        )
        self.assertTrue(mx.allclose(dy["x"], mx.ones(2) * 2))
        self.assertTrue(mx.allclose(dy["y"], mx.ones(2) * 2))

        # Replace the vjp with deliberately wrong gradients so the override is
        # observable.
        @my_double.vjp
        def random_grads(primals, cotangents, outputs):
            return {"x": mx.zeros_like(primals["x"]), "y": mx.ones_like(primals["y"])}

        dy = mx.grad(lambda p: my_double(p)["out"].sum())(
            {"x": mx.ones(2), "y": mx.ones(2)}
        )
        self.assertTrue(mx.allclose(dy["x"], mx.zeros(2)))
        self.assertTrue(mx.allclose(dy["y"], mx.ones(2)))

        def outer_f(a, b):
            return my_double({"x": a, "y": b})["out"]

        # Reference jvp taken before the jvp rule is overridden.
        inputs = [mx.random.normal(shape=(2,)) for i in range(2)]
        tans = [mx.random.normal(shape=(2,)) for i in range(2)]
        out1, dout1 = mx.jvp(outer_f, inputs, tans)

        # The custom jvp is the product rule plus a constant 1, so the
        # override shows up as an offset of exactly 1 in the tangent.
        @my_double.jvp
        def random_grads(primals, tangents):
            return {
                "out": 2 * primals["x"] * tangents["y"]
                + 2 * primals["y"] * tangents["x"]
                + 1
            }

        out2, dout2 = mx.jvp(outer_f, inputs, tans)
        self.assertTrue(mx.allclose(out1[0], out2[0]))
        self.assertTrue(mx.allclose(dout1[0] + 1, dout2[0]))

    def test_complex_vjps(self):
        # Gradient through the real part of a complex array.
        def twice_real(x):
            return (2.0 * mx.real(x)).sum()

        z = mx.array([0.0 + 1j, 1.0 + 0.0j, 0.5 + 0.5j])
        grad = mx.grad(twice_real)(z)
        self.assertTrue(mx.allclose(grad, 2 * mx.ones_like(z)))

        # Gradient through the imaginary part of a complex array.
        def twice_imag(x):
            return (2.0 * mx.imag(x)).sum()

        z = mx.array([0.0 + 1j, 1.0 + 0.0j, 0.5 + 0.5j])
        grad = mx.grad(twice_imag)(z)
        self.assertTrue(mx.allclose(grad, 2j * mx.ones_like(z)))

    def test_flatten_unflatten_vjps(self):
        # The gradient must come back in the shape of the original input.
        def sum_unflattened(x):
            return mx.unflatten(x, 0, (2, 2)).sum()

        grad = mx.grad(sum_unflattened)(mx.zeros((4, 8)))
        self.assertEqual(grad.shape, (4, 8))

        def sum_flattened(x):
            return mx.flatten(x, 0, 2).sum()

        grad = mx.grad(sum_flattened)(mx.zeros((2, 4, 8)))
        self.assertEqual(grad.shape, (2, 4, 8))

    def test_concatenate_vjps(self):
        # Inputs of different float dtypes: each gradient keeps the dtype of
        # its own input.
        def concat(x, y):
            return mx.concatenate([x, y])

        x = mx.array([1, 2, 3], mx.float32)
        y = mx.array([1, 2, 3], mx.float16)
        _, grads = mx.vjp(concat, (x, y), (mx.ones((6,)),))
        grad_x, grad_y = grads[0], grads[1]
        self.assertTrue(mx.allclose(grad_x, mx.ones(3)))
        self.assertTrue(mx.allclose(grad_y, mx.ones(3)))
        self.assertEqual(grad_x.dtype, mx.float32)
        self.assertEqual(grad_y.dtype, mx.float16)

    def test_matmul_jvps(self):
        a = mx.random.uniform(shape=(4, 4))
        b = mx.random.uniform(shape=(4, 4))
        c = mx.random.uniform(shape=(4, 4))
        d = mx.random.uniform(shape=(4, 4))

        # Plain matmul: tangent w.r.t. the left operand, the right operand,
        # then both.
        _, tangents = mx.jvp(lambda a: a @ b, (a,), (c,))
        self.assertTrue(mx.allclose(tangents[0], c @ b))

        _, tangents = mx.jvp(lambda b: a @ b, (b,), (d,))
        self.assertTrue(mx.allclose(tangents[0], a @ d))

        _, tangents = mx.jvp(lambda a, b: a @ b, (a, b), (c, d))
        self.assertTrue(mx.allclose(tangents[0], a @ d + c @ b))

        x = mx.random.uniform(shape=(4, 4))
        y = mx.random.uniform(shape=(4, 4))
        z = mx.random.uniform(shape=(4, 4))

        # addmm must agree with the equivalent matmul-plus-add for each subset
        # of differentiated inputs. Inputs that are not lambda parameters are
        # captured from the enclosing scope.
        cases = (
            (
                lambda a, b, c: a @ b + c,
                lambda a, b, c: mx.addmm(c, a, b),
                (a, b, c),
                (x, y, z),
            ),
            (
                lambda a, c: a @ b + c,
                lambda a, c: mx.addmm(c, a, b),
                (a, c),
                (x, z),
            ),
            (
                lambda b, c: a @ b + c,
                lambda b, c: mx.addmm(c, a, b),
                (b, c),
                (y, z),
            ),
            (
                lambda c: a @ b + c,
                lambda c: mx.addmm(c, a, b),
                (c,),
                (z,),
            ),
        )
        for unfused, fused, primals, tans in cases:
            _, (tangent,) = mx.jvp(unfused, primals, tans)
            _, (expected,) = mx.jvp(fused, primals, tans)
            self.assertTrue(mx.allclose(tangent, expected))

    def test_put_along_axis_grads(self):
        a = mx.zeros((5, 1))
        b = mx.ones((2, 1))

        def put_both(a, b):
            rows = mx.array([[0], [3]])
            return mx.put_along_axis(a, rows, b, axis=0)

        # VJP: rows 0 and 3 of `a` are overwritten, so their gradient goes to
        # `b` instead.
        cotan = mx.full((5, 1), 2.0)
        _, (grad_a, grad_b) = mx.vjp(put_both, (a, b), (cotan,))
        want_grad_a = mx.array([0.0, 2.0, 2.0, 0.0, 2.0])[:, None]
        want_grad_b = mx.array([2.0, 2.0])[:, None]
        self.assertTrue(mx.allclose(want_grad_a, grad_a))
        self.assertTrue(mx.allclose(want_grad_b, grad_b))

        # JVP with tangents for both inputs.
        tan_a = mx.full((5, 1), 2.0)
        tan_b = mx.full((2, 1), 3.0)
        _, (tangent_out,) = mx.jvp(put_both, (a, b), (tan_a, tan_b))
        want = mx.array([3.0, 2.0, 2.0, 3.0, 2.0])[:, None]
        self.assertTrue(mx.allclose(want, tangent_out))

        # JVP with `b` captured, so only `a` carries a tangent.
        def put_captured(a):
            rows = mx.array([[0], [3]])
            return mx.put_along_axis(a, rows, b, axis=0)

        _, (tangent_out,) = mx.jvp(put_captured, (a,), (tan_a,))
        want = mx.array([0.0, 2.0, 2.0, 0.0, 2.0])[:, None]
        self.assertTrue(mx.allclose(want, tangent_out))

    def test_slice_grads(self):
        # Checks vjp and jvp through a negative-step slice and through a
        # negative-step slice assignment.

        # Slice
        def fun(a):
            # On the 5-element input used below this selects every element in
            # reverse order (the assertions compare against [::-1]).
            return a[5:-6:-1]

        a = mx.ones(shape=(5,))
        cotan = mx.random.uniform(shape=(5,))
        _, (grad,) = mx.vjp(fun, (a,), (cotan,))
        self.assertTrue(mx.allclose(grad, cotan[::-1]))

        tan = mx.random.uniform(shape=(5,))
        mx.eval(tan)
        _, (grad,) = mx.jvp(fun, (a,), (tan,))
        self.assertTrue(mx.allclose(grad, tan[::-1]))

        # Slice update
        def fun(a, b):
            a[4:-5:-2] = b
            return a

        a = mx.ones(shape=(4,))
        b = mx.zeros(shape=(2,))

        cotan = mx.random.uniform(shape=(4,))
        _, (grad_a, grad_b) = mx.vjp(fun, (a, b), (cotan,))
        # The overwritten positions of `a` (indices 1 and 3 here) receive no
        # gradient; `b` receives the cotangent at those positions.
        expected_a = mx.array(cotan)
        expected_a[1::2] = 0.0
        self.assertTrue(mx.allclose(grad_a, expected_a))
        self.assertTrue(mx.allclose(grad_b, cotan[4:-5:-2]))

        tan_a = mx.random.uniform(shape=(4,))
        tan_b = mx.random.uniform(shape=(2,))
        _, (grad,) = mx.jvp(fun, (a, b), (tan_a, tan_b))
        # NOTE(review): `expected` is the same Python object as `tan_a`, so the
        # assignment below also changes `tan_a`; the jvp above has already
        # been built at this point.
        expected = tan_a
        expected[4:-5:-2] = tan_b
        self.assertTrue(mx.allclose(grad, expected))

    def test_leaks(self):
        # For each transform, build a reference cycle that goes through the
        # transformed function and check that active memory returns to its
        # starting value once the garbage collector has run.
        for transform in [
            mx.grad,
            mx.value_and_grad,
            mx.custom_function,
            mx.checkpoint,
        ]:
            mx.synchronize()
            gc.collect()
            mem_pre = mx.get_active_memory()

            def outer():
                d = {}

                def f(x):
                    return d["x"]

                # `f` closes over `d` and `d` stores the transformed `f`, so
                # the dict, the function and the array form a cycle.
                d["f"] = transform(f)
                d["x"] = mx.array([0] * 1000)

            for _ in range(5):
                outer()
                gc.collect()
            mem_post = mx.get_active_memory()
            self.assertEqual(mem_pre, mem_post)

    def test_grad_with_copies(self):
        # The same array object appears three times in the input list; each
        # position that is used gets its own gradient.
        shared = mx.array(2.0)
        arrays = [shared, shared, shared]

        def first_plus_last(arrays):
            return arrays[0] + arrays[2]

        grads = mx.grad(first_plus_last)(arrays)
        self.assertEqual(grads[0].item(), 1.0)
        self.assertEqual(grads[2].item(), 1.0)

    def test_grad_ids_pre_post(self):
        # After grad returns, untouched list entries must be the very same
        # Python objects as before the call.
        def take_first(arrs):
            return arrs[0]

        arrs = [mx.array(1.0)]
        original = arrs[0]
        mx.grad(take_first)(arrs)
        self.assertEqual(id(original), id(arrs[0]))

        # An entry the function replaces is a different object afterwards,
        # while its neighbours keep their identity.
        def replace_middle(arrs):
            arrs[1] = sum(arrs)
            return arrs[1]

        arrs = [mx.array(1.0), mx.array(1.0), mx.array(1.0)]
        before_0, before_1, before_2 = arrs

        mx.grad(replace_middle)(arrs)
        self.assertEqual(id(before_0), id(arrs[0]))
        self.assertNotEqual(id(before_1), id(arrs[1]))
        self.assertEqual(id(before_2), id(arrs[2]))

    def test_grad_with_inplace_update(self):
        # A list entry assigned inside the differentiated function must still
        # hold the assigned value after grad returns.
        def loss_fn(model):
            model[1] = mx.array(2.0)
            return model[0]

        model = [mx.array(0.0), mx.array(1.0)]

        mx.grad(loss_fn)(model)
        self.assertEqual(model[1].item(), 2.0)

    def test_autograd_types(self):
        # Gradients must come back in the same container type as the input:
        # a NamedTuple, a plain tuple, and a tuple subclass.
        from typing import NamedTuple

        class Vector(tuple):
            pass

        class State(NamedTuple):
            a: mx.array
            b: mx.array

        def loss_fn(x):
            out = State(x.a + 10, x.b * 10)
            return out.a.sum() + out.b.sum()

        def loss_fn_tuple(x):
            out = (x[0] + 10, x[1] * 10)
            return out[0].sum() + out[1].sum()

        def loss_fn_vector(x):
            out = Vector([x[0] + 10, x[1] * 10])
            return out[0].sum() + out[1].sum()

        # NamedTuple input.
        state_in = State(mx.array([1, 2, 3]), mx.array([4, 5, 6]))
        grads = mx.grad(loss_fn)(state_in)
        self.assertTrue(isinstance(grads, State))
        self.assertTrue(mx.array_equal(grads.a, mx.ones(3)))
        self.assertTrue(mx.array_equal(grads.b, mx.ones(3) * 10))

        # Plain tuple input.
        tuple_in = (mx.array([1, 2, 3]), mx.array([4, 5, 6]))
        grads = mx.grad(loss_fn_tuple)(tuple_in)
        self.assertTrue(isinstance(grads, tuple))
        self.assertTrue(mx.array_equal(grads[0], mx.ones(3)))
        self.assertTrue(mx.array_equal(grads[1], mx.ones(3) * 10))

        # Tuple-subclass input.
        vector_in = Vector([mx.array([1, 2, 3]), mx.array([4, 5, 6])])
        grads = mx.grad(loss_fn_vector)(vector_in)
        self.assertTrue(isinstance(grads, Vector))
        self.assertTrue(mx.array_equal(grads[0], mx.ones(3)))
        self.assertTrue(mx.array_equal(grads[1], mx.ones(3) * 10))

    def test_reduce_jvp(self):
        a = mx.arange(4)
        b = mx.array([3, 2, 1, 0])

        # (reduction, expected tangent) for primal `a` and tangent `b`.
        cases = (
            (mx.sum, 6),
            (mx.prod, 18),
            (mx.min, 3),
            (mx.max, 0),
        )
        for reduction, expected in cases:
            _, jout = mx.jvp(reduction, primals=(a,), tangents=(b,))
            self.assertEqual(jout[0].item(), expected)


# Script entry point: hand control to the test runner provided by mlx_tests.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_bf16.py
================================================
# Copyright © 2023 Apple Inc.

import math
import unittest
from itertools import permutations

import mlx.core as mx
import mlx_tests
import numpy as np

# PyTorch is an optional reference implementation; record whether it imported.
try:
    import torch
except ImportError:
    has_torch = False
else:
    has_torch = True


class TestBF16(mlx_tests.MLXTestCase):
    """Checks MLX ops on bfloat16 data against NumPy (and PyTorch if importable)."""

    def __test_ops(
        self,
        ref_op,  # Function that outputs array_like
        mlx_op,  # Function that outputs array_like
        np_args,  # Numpy arguments
        ref_transform=lambda x: x,
        mlx_transform=lambda x: mx.array(x),
        atol=1e-5,
    ):
        """Run a reference op and an MLX op on the same inputs and compare.

        Each entry of ``np_args`` is passed through ``ref_transform`` before
        being handed to ``ref_op`` and through ``mlx_transform`` before being
        handed to ``mlx_op``. The two results must agree under
        ``np.allclose`` with absolute tolerance ``atol``.
        """
        expected = ref_op(*(ref_transform(arg) for arg in np_args))
        actual = mlx_op(*(mlx_transform(arg) for arg in np_args))

        is_close = np.allclose(actual, expected, atol=atol)
        self.assertTrue(is_close)

    def __default_test(
        self,
        op,
        np_args,
        simple_transform=lambda x: x,
        atol_np=1e-3,
        atol_torch=1e-5,
        np_kwargs=dict(),
        mlx_kwargs=dict(),
        torch_kwargs=dict(),
        torch_op=None,
    ):
        """Check the MLX op named ``op`` on bfloat16 inputs against references.

        NumPy reference: ``np.<op>`` is evaluated on inputs that were rounded
        to bfloat16 and converted back to float32, and its output is rounded
        through bfloat16 the same way. PyTorch reference (only when the
        module-level import succeeded): ``torch.<torch_op or op>`` evaluated
        on bfloat16 tensors. ``simple_transform`` is applied to every input
        after conversion; callers use it to slice the operands.
        """

        def through_bf16(values):
            # Round to bfloat16 with MLX, then return float32 NumPy data.
            rounded = mx.array(values).astype(mx.bfloat16)
            return np.asarray(rounded.astype(mx.float32))

        def run_mlx(*args):
            result = getattr(mx, op)(*args, **mlx_kwargs)
            return np.asarray(result.astype(mx.float32))

        def run_numpy(*args):
            return through_bf16(getattr(np, op)(*args, **np_kwargs))

        def as_mlx_input(x):
            return simple_transform(mx.array(x).astype(mx.bfloat16))

        with self.subTest(reference="numpy"):
            self.__test_ops(
                run_numpy,
                run_mlx,
                np_args,
                ref_transform=lambda x: simple_transform(through_bf16(x)),
                mlx_transform=as_mlx_input,
                atol=atol_np,
            )

        if not has_torch:
            return

        with self.subTest(reference="torch"):
            torch_name = op if torch_op is None else torch_op

            def run_torch(*args):
                result = getattr(torch, torch_name)(*args, **torch_kwargs)
                return result.to(torch.float32).numpy()

            def as_torch_input(x):
                return simple_transform(torch.from_numpy(x).to(torch.bfloat16))

            self.__test_ops(
                run_torch,
                run_mlx,
                np_args,
                ref_transform=as_torch_input,
                mlx_transform=as_mlx_input,
                atol=atol_torch,
            )

    def test_unary_ops(self):
        # Elementwise unary ops on one shared random float32 input.
        sample = np.random.rand(18, 28, 38).astype(np.float32)
        for op in ("abs", "exp", "log", "square", "sqrt"):
            with self.subTest(op=op):
                self.__default_test(op, (sample,))

    def test_binary_ops(self):
        # Elementwise binary ops on full operands and on two sliced variants.
        lhs = np.random.rand(18, 28, 38)
        rhs = np.random.rand(18, 28, 38)
        operands = (lhs.astype(np.float32), rhs.astype(np.float32))

        # Applied to both operands after conversion: identity, first entry
        # along axis 0, first entry along axis 1.
        slicers = (
            lambda x: x,
            lambda x: x[:1],
            lambda x: x[:, :1],
        )
        for op in ("add", "subtract", "multiply", "divide", "maximum", "minimum"):
            with self.subTest(op=op):
                for slicer in slicers:
                    self.__default_test(op, operands, simple_transform=slicer)

    def test_reduction_ops(self):
        # min/max over single axes, axis pairs and all axes; the torch
        # reference uses amin/amax with ``dim`` in place of ``axis``.
        x = np.random.rand(18, 28, 38).astype(np.float32)
        axes_choices = (0, 1, 2, (0, 1), (0, 2), (1, 2), (0, 1, 2))

        for op in ("min", "max"):
            with self.subTest(op=op):
                for axes in axes_choices:
                    with self.subTest(axes=axes):
                        self.__default_test(
                            op,
                            (x,),
                            np_kwargs=dict(axis=axes),
                            mlx_kwargs=dict(axis=axes),
                            torch_kwargs=dict(dim=axes),
                            torch_op=f"a{op}",
                        )

    def test_arg_reduction_ops(self):
        # argmin/argmax on a bfloat16 array; the NumPy reference sees the
        # same values after the bfloat16 rounding, converted to float32.
        raw = np.random.rand(10, 12, 13).astype(np.float32)
        x = mx.array(raw).astype(mx.bfloat16)
        data = np.asarray(x.astype(mx.float32))

        op_names = ("argmin", "argmax")

        # Per-axis reductions, with and without keepdims
        for name in op_names:
            mlx_fn, np_fn = getattr(mx, name), getattr(np, name)
            for axis in range(3):
                for keep in (True, False):
                    got = mlx_fn(x, axis, keep).astype(mx.float32)
                    want = np_fn(data, axis, keepdims=keep)
                    self.assertEqual(got.tolist(), want.tolist())

        # Reductions without an axis argument
        for name in op_names:
            mlx_fn, np_fn = getattr(mx, name), getattr(np, name)

            got = mlx_fn(x, keepdims=True).astype(mx.float32)
            want = np_fn(data, keepdims=True)
            self.assertEqual(got.tolist(), want.tolist())

            got = mlx_fn(x).astype(mx.float32)
            want = np_fn(data)
            self.assertEqual(got.item(), want)

    def test_blas_ops(self):
        # bfloat16 matmul; only exercised when the GPU is the default device.
        if mx.default_device() != mx.gpu:
            return

        shape_pairs = (
            ((32, 32), (32, 32)),
            ((23, 57), (57, 1)),
            ((1, 3), (3, 128)),
            ((8, 128, 768), (768, 16)),
        )
        for shape_x, shape_y in shape_pairs:
            # Reseed per case so every shape pair draws from the same stream
            np.random.seed(42)
            with self.subTest(shape_x=shape_x, shape_y=shape_y):
                std = 1.0 / shape_x[-1]
                x = np.random.normal(0.0, std, size=shape_x)
                y = np.random.normal(0.0, std, size=shape_y)
                operands = (x.astype(np.float32), y.astype(np.float32))

                self.__default_test(
                    "matmul", operands, atol_np=1e-3, atol_torch=1e-3
                )

    @unittest.skipIf(not has_torch, "requires PyTorch")
    def test_conversion(self):
        # An mx.array built from a bfloat16 torch tensor keeps dtype and values.
        source = torch.tensor([1.0, 2.0, 3.0], dtype=torch.bfloat16)
        converted = mx.array(source)
        reference = mx.array([1.0, 2.0, 3.0], mx.bfloat16)
        self.assertEqual(converted.dtype, mx.bfloat16)
        self.assertTrue(mx.array_equal(converted, reference))


# Hand control to mlx_tests.MLXTestRunner when this file is executed directly.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_blas.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
import unittest
from itertools import permutations

import mlx.core as mx
import mlx_tests
import numpy as np


class TestBlas(mlx_tests.MLXTestCase):
    @property
    def dtypes(self):
        """Names of the dtypes the tests loop over (looked up on ``np`` via getattr)."""
        names = ["float32", "float16"]
        return names

    def __gemm_test(
        self,
        shape_a,
        shape_b,
        np_dtype=np.float32,
        f_np_a=lambda x: x,
        f_np_b=lambda x: x,
        f_mx_a=lambda x: x,
        f_mx_b=lambda x: x,
    ):
        """Compare ``a @ b`` from MLX against NumPy for one pair of shapes.

        Both operands are drawn from a seeded normal distribution and cast to
        ``np_dtype``. ``f_np_a`` / ``f_np_b`` are applied to the NumPy
        operands (after promotion to float32) and ``f_mx_a`` / ``f_mx_b`` to
        the MLX operands before the product; callers use them to transpose.
        The NumPy result is cast back to ``np_dtype`` for the comparison.
        """
        dtype_name = np.dtype(np_dtype).name
        with self.subTest(dtype=dtype_name, shape_a=shape_a, shape_b=shape_b):
            np.random.seed(42)
            std = 1.0 / max(np.sum(shape_a), 128)
            lhs = np.random.normal(0.0, std, shape_a).astype(np_dtype)
            rhs = np.random.normal(0.0, std, shape_b).astype(np_dtype)

            # MLX operands are built from the np_dtype data, before the
            # reference copies are promoted to float32.
            lhs_mx = f_mx_a(mx.array(lhs))
            rhs_mx = f_mx_b(mx.array(rhs))
            lhs_ref = f_np_a(lhs.astype(np.float32))
            rhs_ref = f_np_b(rhs.astype(np.float32))

            expected = lhs_ref @ rhs_ref
            actual = lhs_mx @ rhs_mx

            self.assertListEqual(list(expected.shape), list(actual.shape))
            self.assertTrue(
                np.allclose(actual, expected.astype(np_dtype), atol=1e-5)
            )

    def test_matmul_unaligned(self):
        # Square matmuls whose side is a power of two shifted by -2..2.
        # Returns early when no GPU is available.
        if not mx.is_available(mx.gpu):
            return

        base_sides = (4, 8, 16, 32, 64, 128)
        offsets = (-2, -1, 0, 1, 2)
        sides = [base + delta for base in base_sides for delta in offsets]

        for dtype in self.dtypes:
            np_dtype = getattr(np, dtype)
            for side in sides:
                square = (side, side)
                self.__gemm_test(square, square, np_dtype)

    def test_matvec_unaligned(self):
        # The vector operand is a slice that starts one element into a
        # 129-element array.
        mat = mx.random.normal(shape=(4, 128))
        vec = mx.random.normal(shape=(129,))[1:]
        actual = mat @ vec
        expected = np.array(mat) @ np.array(vec)
        self.assertTrue(np.allclose(actual, expected))

    def test_matmul_shapes(self):
        # Batched matmul for every transpose layout ("nn", "nt", "tn", "tt")
        # over a list of (B, M, N, K) problems. Returns early without a GPU.
        if not mx.is_available(mx.gpu):
            return

        shapes = [
            (1, 2, 1, 1),
            (1, 1, 2, 1),
            (3, 23, 457, 3),
        ]

        # The larger problems are added only when the GPU is the default device
        if mx.default_device() == mx.gpu:
            shapes.extend(
                [
                    (16, 768, 768, 128),
                    (1, 64, 64, 4096),
                ]
            )

        def np_swap_last(x):
            return np.transpose(x, (0, 2, 1))

        def mx_swap_last(x):
            return mx.transpose(x, (0, 2, 1))

        for dtype in self.dtypes:
            np_dtype = getattr(np, dtype)

            for B, M, N, K in shapes:
                for layout in ("nn", "nt", "tn", "tt"):
                    a_transposed = layout[0] == "t"
                    b_transposed = layout[1] == "t"

                    # A transposed operand is generated with its last two
                    # axes swapped and swapped back before the product.
                    shape_a = (B, K, M) if a_transposed else (B, M, K)
                    shape_b = (B, N, K) if b_transposed else (B, K, N)

                    hooks = {}
                    if a_transposed:
                        hooks.update(f_np_a=np_swap_last, f_mx_a=mx_swap_last)
                    if b_transposed:
                        hooks.update(f_np_b=np_swap_last, f_mx_b=mx_swap_last)

                    with self.subTest(transpose=layout):
                        self.__gemm_test(shape_a, shape_b, np_dtype, **hooks)

    def test_matmul(self):
        # Note: so far, matmul only works with floating-point types
        lhs = mx.array([[1.0, 2.0], [3.0, 4.0]])
        rhs = mx.array([[0.0, -1.0], [-3.0, 3.0]])
        expected = [[-6.0, 5.0], [-12.0, 9.0]]

        # The operator and the function form must give the same result
        for product in (lhs @ rhs, mx.matmul(lhs, rhs)):
            self.assertEqual(product.tolist(), expected)

        # Transposed matmul
        np.random.seed(0)
        a_npy = np.random.normal(0.0, 1.0 / 128, (128, 16)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (128, 16)).astype(np.float32)
        a_mlx, b_mlx = mx.array(a_npy), mx.array(b_npy)

        flip = (1, 0)
        # a @ b^T and a^T @ b
        pairs = (
            (a_npy @ np.transpose(b_npy, flip), a_mlx @ mx.transpose(b_mlx, flip)),
            (np.transpose(a_npy, flip) @ b_npy, mx.transpose(a_mlx, flip) @ b_mlx),
        )

        for want, got in pairs:
            self.assertListEqual(list(want.shape), list(got.shape))

        for want, got in pairs:
            self.assertTrue(np.allclose(got, want, atol=1e-6))

    def test_matmul_dtypes(self):
        # Batched 16x16 matmul in each dtype; the NumPy reference is asked
        # for its result in the same dtype.
        shape = (16, 16, 16)
        for dt in self.dtypes:
            np_dtype = getattr(np, dt)
            lhs = np.random.normal(0.0, 1.0 / 256, shape).astype(np_dtype)
            rhs = np.random.normal(0.0, 1.0 / 256, shape).astype(np_dtype)

            expected = np.matmul(lhs, rhs, dtype=np_dtype)
            actual = mx.array(lhs) @ mx.array(rhs)

            self.assertTrue(np.allclose(actual, expected, atol=1e-6))

    def test_matmul_batched(self):
        # Batched float32 matmuls: plain, transposed, broadcast, with a
        # vector operand, and an attention-style 4-D layout.

        def draw(*shape):
            # One float32 sample from the shared NumPy random stream
            return np.random.normal(0.0, 1.0 / 128, shape).astype(np.float32)

        def expect_match(got_mlx, want_npy):
            self.assertListEqual(list(want_npy.shape), list(got_mlx.shape))
            self.assertTrue(np.allclose(got_mlx, want_npy, atol=1e-6))

        np.random.seed(0)

        # Batched matmul
        a_npy = draw(32, 128, 16)
        b_npy = draw(32, 16, 16)
        a_mlx, b_mlx = mx.array(a_npy), mx.array(b_npy)
        expect_match(a_mlx @ b_mlx, a_npy @ b_npy)

        # Batched and transposed matmul (reuses the previous left operand)
        b_npy = draw(32, 128, 16)
        b_mlx = mx.array(b_npy)
        expect_match(
            a_mlx @ mx.transpose(b_mlx, (0, 2, 1)),
            a_npy @ np.transpose(b_npy, (0, 2, 1)),
        )

        # Batched matmul with simple broadcast
        a_npy = draw(32, 128, 16)
        b_npy = draw(16, 16)
        a_mlx, b_mlx = mx.array(a_npy), mx.array(b_npy)
        expect_match(a_mlx @ b_mlx, a_npy @ b_npy)

        # Both operands broadcasted
        d_npy = np.broadcast_to(b_npy, (5, 16, 16))
        d_mlx = mx.broadcast_to(b_mlx, (5, 16, 16))
        expect_match(d_mlx @ d_mlx, d_npy @ d_npy)

        # Batched and transposed matmul with simple broadcast
        a_npy = draw(32, 128, 16)
        b_npy = draw(128, 16)
        a_mlx, b_mlx = mx.array(a_npy), mx.array(b_npy)
        expect_match(
            a_mlx @ mx.transpose(b_mlx, (1, 0)),
            a_npy @ np.transpose(b_npy, (1, 0)),
        )

        # Matmul with vector
        a_npy = draw(32, 128, 16)
        b_npy = draw(16)
        a_mlx, b_mlx = mx.array(a_npy), mx.array(b_npy)
        expect_match(a_mlx @ b_mlx, a_npy @ b_npy)

        # Test Multiheaded attention style matmul
        a_npy = draw(64, 16, 4, 32)
        b_npy = draw(64, 16, 4, 32)
        a_mlx, b_mlx = mx.array(a_npy), mx.array(b_npy)

        # Swap axes 1 and 2 on both operands before multiplying
        heads_first = (0, 2, 1, 3)
        a_npy = np.transpose(a_npy, heads_first)
        b_npy = np.transpose(b_npy, heads_first)
        a_mlx = mx.transpose(a_mlx, heads_first)
        b_mlx = mx.transpose(b_mlx, heads_first)

        expect_match(
            a_mlx @ mx.transpose(b_mlx, (0, 1, 3, 2)),
            a_npy @ np.transpose(b_npy, (0, 1, 3, 2)),
        )

    def __gemv_test(
        self,
        shape_mat,
        shape_vec,
        np_dtype=np.float32,
        mat_first=True,
        np_mat_f=lambda x: x,
        np_vec_f=lambda x: x,
        mlx_mat_f=lambda x: x,
        mlx_vec_f=lambda x: x,
    ):
        """Compare a matrix-vector product from MLX against NumPy.

        Operands are drawn from a seeded normal distribution in ``np_dtype``.
        ``np_*_f`` / ``mlx_*_f`` are applied to the NumPy and MLX operands
        before the product. ``mat_first`` selects ``mat @ vec`` (True) or
        ``vec @ mat`` (False).
        """
        with self.subTest(
            shape_mat=shape_mat, shape_vec=shape_vec, mat_first=mat_first
        ):
            np.random.seed(42)
            std = 1.0 / max(np.sum(shape_mat), 32)
            mat_ref = np.random.normal(0.0, std, shape_mat).astype(np_dtype)
            vec_ref = np.random.normal(0.0, std, shape_vec).astype(np_dtype)

            # Build the MLX operands from the untransformed NumPy data
            mat_mx = mlx_mat_f(mx.array(mat_ref))
            vec_mx = mlx_vec_f(mx.array(vec_ref))
            mat_ref = np_mat_f(mat_ref)
            vec_ref = np_vec_f(vec_ref)

            if mat_first:
                expected, actual = mat_ref @ vec_ref, mat_mx @ vec_mx
            else:
                expected, actual = vec_ref @ mat_ref, vec_mx @ mat_mx

            # Due to some bug, numpy sometimes has NaNs on macOS
            # See https://github.com/ml-explore/mlx/pull/3063
            # Zero those positions in both results so they are not compared.
            nan_mask = np.isnan(expected)
            if np.any(nan_mask):
                nan_idx = np.where(nan_mask)
                expected[nan_idx] = 0.0
                actual[tuple(mx.array(idx) for idx in nan_idx)] = 0.0

            self.assertListEqual(list(expected.shape), list(actual.shape))
            self.assertTrue(np.allclose(actual, expected, atol=1e-5))

    def test_matrix_vector(self):
        # Matrix-vector and vector-matrix products for square shapes and for
        # sizes one below, at, and one above several base lengths.
        base_lengths = ((2, 2), (32, 32), (64, 64), (2048, 2048))
        offsets = (-1, 0, 1)

        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                # Basic square matrix test
                self.__gemv_test(
                    shape_mat=(64, 64), shape_vec=(64, 1), np_dtype=np_dtype
                )
                # Same shapes, with the column vector transposed into a row
                self.__gemv_test(
                    shape_mat=(64, 64),
                    shape_vec=(64, 1),
                    np_dtype=np_dtype,
                    mat_first=False,
                    np_vec_f=lambda v: np.transpose(v, (1, 0)),
                    mlx_vec_f=lambda v: mx.transpose(v, (1, 0)),
                )

                # Vector matrix product with aligned and unaligned shapes
                for in_len_base, out_len_base in base_lengths:
                    for d_in in offsets:
                        for d_out in offsets:
                            n_in = in_len_base + d_in
                            n_out = out_len_base + d_out

                            # Vec mat
                            self.__gemv_test(
                                (n_in, n_out),
                                (1, n_in),
                                mat_first=False,
                                np_dtype=np_dtype,
                            )

                            # Mat vec
                            self.__gemv_test(
                                (n_out, n_in),
                                (n_in, 1),
                                mat_first=True,
                                np_dtype=np_dtype,
                            )

    def test_matrix_vector_batched(self):
        # Matrix-vector products where one or both operands carry leading
        # batch dimensions.
        mat_vec_cases = (
            ((32, 128, 64), (32, 64, 1)),
            ((128, 64), (32, 64, 1)),
            ((32, 128, 64), (64, 1)),
            ((2, 1, 8, 1, 6, 128), (2, 1, 8, 4, 128, 1)),
        )
        vec_mat_cases = (
            ((32, 1, 128), (32, 128, 64)),
            ((32, 1, 128), (128, 64)),
            ((1, 128), (32, 128, 64)),
            ((1, 8, 4, 1, 128), (1, 8, 1, 128, 6)),
        )

        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                # Batched mat vec
                for mat_shape, vec_shape in mat_vec_cases:
                    self.__gemv_test(
                        mat_shape, vec_shape, mat_first=True, np_dtype=np_dtype
                    )

                # Batched vec mat
                for vec_shape, mat_shape in vec_mat_cases:
                    self.__gemv_test(
                        mat_shape, vec_shape, mat_first=False, np_dtype=np_dtype
                    )

    def test_matrix_vector_broadcast(self):
        # Operands are generated as a 64x64 matrix and a 64-element vector
        # and then expanded to the listed shapes with broadcast_to.

        def np_expand_to(shape):
            return lambda arr: np.broadcast_to(arr, shape)

        def mx_expand_to(shape):
            return lambda arr: mx.broadcast_to(arr, shape)

        mat_vec_targets = (
            ((32, 64, 64), (32, 64, 1)),
            ((64, 64), (32, 64, 1)),
            ((32, 64, 64), (64, 1)),
        )
        vec_mat_targets = (
            ((32, 1, 64), (32, 64, 64)),
            ((32, 1, 64), (64, 64)),
            ((1, 64), (32, 64, 64)),
        )

        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                # Different broadcasts mat vec
                for mat_target, vec_target in mat_vec_targets:
                    self.__gemv_test(
                        shape_mat=(64, 64),
                        shape_vec=(64, 1),
                        np_dtype=np_dtype,
                        np_mat_f=np_expand_to(mat_target),
                        np_vec_f=np_expand_to(vec_target),
                        mlx_mat_f=mx_expand_to(mat_target),
                        mlx_vec_f=mx_expand_to(vec_target),
                    )

                # Different broadcasts vec mat
                for vec_target, mat_target in vec_mat_targets:
                    self.__gemv_test(
                        shape_mat=(64, 64),
                        shape_vec=(1, 64),
                        np_dtype=np_dtype,
                        mat_first=False,
                        np_mat_f=np_expand_to(mat_target),
                        np_vec_f=np_expand_to(vec_target),
                        mlx_mat_f=mx_expand_to(mat_target),
                        mlx_vec_f=mx_expand_to(vec_target),
                    )

    def test_matrix_vector_attn(self):
        # Multi-query style attention check
        # Computes scores = q @ k and out = scores @ v (no softmax is applied)
        # with NumPy and with MLX on identical data, then compares both the
        # intermediate scores and the final output.
        for dtype in self.dtypes:
            # fmt: off
            for (B,  D, n_kv_heads, factor,  qsl,  ksl) in (
                (1, 16,          8,      4,    1,  256),
                (1, 16,          8,      4,   32,  256),
                (1, 16,          8,      4,  256,    1),
                (4, 16,          8,      4,    1,  256),
                (4, 16,          8,      4,  256,    1),
            ):
            # fmt: on
                with self.subTest(
                        B=B, # Batch size
                        D=D, # Dimension of mm
                        n_kv_heads=n_kv_heads, # key-value heads
                        factor=factor, # factor to get query heads
                        qsl=qsl, # Query sequence length
                        ksl=ksl, # Key sequence length
                        dtype=dtype # Data type
                    ):

                    np_dtype = getattr(np, dtype)

                    # Fix shapes for kqv
                    # Queries have `factor` times as many heads as keys/values.
                    n_q_heads = n_kv_heads * factor
                    Dk = D * n_kv_heads
                    Dq = D * n_q_heads
                    # Half-width of the uniform range the inputs are drawn from
                    scale = 1. / math.sqrt(Dk)

                    shape_queries = (B, qsl, Dq)
                    shape_keys = (B, ksl, Dk)
                    shape_values = (B, ksl, Dk)

                    # Prepare numpy arrays
                    q_np = np.random.uniform(-scale, scale, size=shape_queries).astype(np_dtype)
                    k_np = np.random.uniform(-scale, scale, size=shape_keys).astype(np_dtype)
                    v_np = np.random.uniform(-scale, scale, size=shape_values).astype(np_dtype)

                    # Rearrange to move heads up
                    # q -> (B, n_kv_heads, factor, qsl, D)
                    # k -> (B, n_kv_heads, 1, D, ksl)   (already transposed for q @ k)
                    # v -> (B, n_kv_heads, 1, ksl, D)
                    q_np_reshape = q_np.reshape(B, qsl, n_kv_heads, factor, -1).transpose(0, 2, 3, 1, 4)
                    k_np_reshape = k_np.reshape(B, ksl, n_kv_heads, 1, -1).transpose(0, 2, 3, 4, 1)
                    v_np_reshape = v_np.reshape(B, ksl, n_kv_heads, 1, -1).transpose(0, 2, 3, 1, 4)

                    # Do attn style matmul
                    # The size-1 axis of k and v broadcasts against `factor`.
                    s_np = q_np_reshape @ k_np_reshape
                    o_np = s_np @ v_np_reshape
                    # Back to (B, qsl, n_kv_heads * factor * D)
                    o_np = o_np.transpose(0, 3, 1, 2, 4).reshape(B, qsl, -1)

                    # Test mlx
                    q_mx = mx.array(q_np)
                    k_mx = mx.array(k_np)
                    v_mx = mx.array(v_np)

                    # Rearrange to move heads up
                    q_mx_reshape = q_mx.reshape(B, qsl, n_kv_heads, factor, -1).transpose(0, 2, 3, 1, 4)
                    k_mx_reshape = k_mx.reshape(B, ksl, n_kv_heads, 1, -1).transpose(0, 2, 3, 4, 1)
                    v_mx_reshape = v_mx.reshape(B, ksl, n_kv_heads, 1, -1).transpose(0, 2, 3, 1, 4)

                    # Do attn style matmul
                    s_mx = q_mx_reshape @ k_mx_reshape
                    o_mx = (s_mx @ v_mx_reshape)
                    o_mx = o_mx.transpose(0, 3, 1, 2, 4).reshape(B, qsl, -1)

                    # Check against np
                    self.assertListEqual(list(s_np.shape), list(s_mx.shape))
                    self.assertTrue(np.allclose(s_np, s_mx, atol=1e-4))

                    self.assertListEqual(list(o_np.shape), list(o_mx.shape))
                    self.assertTrue(np.allclose(o_np, o_mx, atol=1e-4))

    def test_matrix_vector_edgecases(self):
        # Tiny problems (every dimension in 1..4) with all-ones matrices and
        # vectors scaled per batch entry; results must match NumPy exactly.

        def batch_scaled_ones(shape, np_dtype):
            # Ones array whose i-th entry along axis 0 is multiplied by i + 1
            arr = np.ones(shape, dtype=np_dtype)
            for i in range(shape[0]):
                arr[i] *= i + 1.0
            return arr

        def expect_equal(c_mlx, c_npy):
            self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
            self.assertTrue(np.array_equal(c_mlx, c_npy))

        sizes = np.arange(1, 5)

        for dtype in self.dtypes:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)

                for in_vec_len in sizes:
                    for out_vec_len in sizes:
                        for batch_size in sizes:
                            with self.subTest(
                                problem_shape=(batch_size, in_vec_len, out_vec_len)
                            ):
                                mat_shape = (batch_size, out_vec_len, in_vec_len)

                                # Matrix vector
                                with self.subTest(transpose=False):
                                    mat = np.ones(mat_shape, dtype=np_dtype)
                                    vec = batch_scaled_ones(
                                        (batch_size, in_vec_len, 1), np_dtype
                                    )
                                    expect_equal(
                                        mx.array(mat) @ mx.array(vec), mat @ vec
                                    )

                                # Vector matrix
                                with self.subTest(transpose=True):
                                    mat = np.ones(mat_shape, dtype=np_dtype)
                                    vec = batch_scaled_ones(
                                        (batch_size, 1, out_vec_len), np_dtype
                                    )
                                    expect_equal(
                                        mx.array(vec) @ mx.array(mat), vec @ mat
                                    )

    def test_mismatch_stride_mm(self):
        # Products of sliced (strided / offset) views of two 4x16x16 arrays.
        # Each case is written once and applied to the NumPy pair and to the
        # MLX pair.
        np.random.seed(0)
        a_npy = np.random.normal(0.0, 1.0 / 128, (4, 16, 16)).astype(np.float32)
        b_npy = np.random.normal(0.0, 1.0 / 128, (4, 16, 16)).astype(np.float32)

        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)

        products = (
            # Matmul with batches
            lambda a, b: a[::2, :, :] @ b[1::2, :, :],
            # Matvec with batches
            lambda a, b: a[::2, :, :] @ b[1::2, :, 2:3],
            # Matmul with slice
            lambda a, b: a[:, :8, :] @ b[:, :, :8],
            # Matmul with slice
            lambda a, b: a[:, :, :8] @ b[:, :8, :],
            # Matmul transpose with slice
            lambda a, b: a[:, :8, :] @ b[:, :8, :].swapaxes(-1, -2),
            # Matmul transpose with slice
            lambda a, b: a[:, :, :8] @ b[:, :, :8].swapaxes(-1, -2),
            # Matvec with slice
            lambda a, b: a[:, :8, :] @ b[:, :, 6:7],
            # Matvec with slice
            lambda a, b: a[:, :, :8] @ b[:, 3:11, 2:3],
        )

        for product in products:
            c_npy = product(a_npy, b_npy)
            c_mlx = product(a_mlx, b_mlx)

            self.assertListEqual(list(c_npy.shape), list(c_mlx.shape))
            self.assertTrue(np.allclose(c_mlx, c_npy, atol=1e-5))

    def test_addmm(self):
        """Compare ``mx.addmm`` with ``alpha * (a @ b) + beta * c`` in NumPy.

        Covers batched, transposed, broadcast, vector and large-K operands for
        several broadcastable shapes of ``c``, then half-precision inputs.
        """
        np.random.seed(0)
        alpha = 0.5

        def draw(shape):
            # Small-variance float32 samples keep products well inside atol
            return np.random.normal(0.0, 1.0 / 128, shape).astype(np.float32)

        def check(lhs_np, rhs_np, lhs_mx, rhs_mx, bias_shapes, beta):
            # Run addmm for every bias shape and compare with the NumPy result
            for bias_shape in bias_shapes:
                bias_np = np.ones(bias_shape).astype(np.float32)
                bias_mx = mx.array(bias_np)

                want = alpha * (lhs_np @ rhs_np) + beta * bias_np
                got = mx.addmm(bias_mx, lhs_mx, rhs_mx, alpha, beta)

                self.assertListEqual(list(want.shape), list(got.shape))
                self.assertTrue(np.allclose(got, want, atol=1e-5))

        for beta in (1.0, 2.0):
            # A c that cannot broadcast to the product's shape is rejected
            with self.assertRaises(ValueError):
                mx.addmm(mx.zeros((2, 2, 2)), mx.zeros((2, 2)), mx.zeros((2, 2)))

            # Plain batched product
            a_npy = draw((32, 128, 16))
            b_npy = draw((32, 16, 16))
            a_mlx = mx.array(a_npy)
            b_mlx = mx.array(b_npy)
            check(
                a_npy,
                b_npy,
                a_mlx,
                b_mlx,
                ((1,), (1, 16), (32, 1, 16), (1, 128, 16)),
                beta,
            )

            # Batched product with a transposed right operand (a is reused)
            b_npy = draw((32, 128, 16))
            b_mlx = mx.array(b_npy)
            check(
                a_npy,
                np.transpose(b_npy, (0, 2, 1)),
                a_mlx,
                mx.transpose(b_mlx, (0, 2, 1)),
                ((1,), (32, 1, 128), (1, 128)),
                beta,
            )

            # Batched left operand against a single broadcast matrix
            a_npy = draw((32, 128, 16))
            b_npy = draw((16, 16))
            a_mlx = mx.array(a_npy)
            b_mlx = mx.array(b_npy)
            check(
                a_npy,
                b_npy,
                a_mlx,
                b_mlx,
                ((1,), (1, 16), (32, 1, 16), (1, 128, 16)),
                beta,
            )

            # Vector on the left, batched matrices on the right
            a_npy = draw((16,))
            b_npy = draw((32, 16, 128))
            a_mlx = mx.array(a_npy)
            b_mlx = mx.array(b_npy)
            check(a_npy, b_npy, a_mlx, b_mlx, ((1,), (128,), (32, 128)), beta)

            # Batched matrices on the left, vector on the right
            a_npy = draw((32, 128, 16))
            b_npy = draw((16,))
            a_mlx = mx.array(a_npy)
            b_mlx = mx.array(b_npy)
            check(a_npy, b_npy, a_mlx, b_mlx, ((1,), (32, 128)), beta)

            # Split K specialization
            a_npy = draw((64, 4096))
            b_npy = draw((4096, 32))
            a_mlx = mx.array(a_npy)
            b_mlx = mx.array(b_npy)
            check(
                a_npy,
                b_npy,
                a_mlx,
                b_mlx,
                ((1,), (1, 32), (64, 1), (64, 32)),
                beta,
            )

            # c given as a transposed array (and reused as the right operand)
            c_t = mx.ones((10, 5)).T
            square = mx.ones((5, 5))
            out = mx.addmm(c_t, square, c_t, beta=beta, alpha=alpha)
            expected = beta * c_t + alpha * (square @ c_t)
            self.assertTrue(mx.allclose(expected, out))

            # c given as a single row that broadcasts over the output rows
            lhs = mx.ones((5, 5))
            rhs = mx.ones((5, 5))
            row = mx.ones((1, 5))
            out = mx.addmm(row, lhs, rhs, beta=beta, alpha=alpha)
            expected = beta * row + alpha * (lhs @ rhs)
            self.assertTrue(mx.allclose(expected, out))

        # Half precision inputs, each with its own tolerance
        for half_type, tol in [(mx.float16, 1e-3), (mx.bfloat16, 1e-2)]:
            c = mx.ones((32, 32)).astype(half_type)
            a = mx.random.uniform(shape=(32, 32)).astype(half_type)
            b = mx.random.uniform(shape=(32, 32)).astype(half_type)
            out = mx.addmm(c, a, b, alpha=0.5, beta=2.0)
            expected = 0.5 * (a @ b) + 2.0 * c
            self.assertTrue(mx.allclose(out, expected, rtol=tol, atol=tol))

    def test_addmm_grad(self):
        """Outputs and VJPs of ``mx.addmm`` must match the unfused expression."""
        alpha = 2.0

        # Each entry is (B, M, N, K)
        problem_sizes = ((1, 64, 32, 128), (4, 28, 24, 47), (1, 1, 24, 47))

        for beta in (1.0, 0.5):

            def unfused(c, a, b):
                return alpha * (a @ b) + beta * c

            def fused(c, a, b):
                return mx.addmm(c, a, b, alpha, beta)

            for B, M, N, K in problem_sizes:
                cotangent = mx.ones((B, M, N))
                primals = [
                    mx.random.normal((B, M, N)),
                    mx.random.normal((B, M, K)),
                    mx.random.normal((B, K, N)),
                ]

                ref_out, ref_grads = mx.vjp(unfused, primals, [cotangent])
                test_out, test_grads = mx.vjp(fused, primals, [cotangent])

                self.assertTrue(mx.allclose(ref_out[0], test_out[0], atol=1e-4).item())

                for g_ref, g_test in zip(ref_grads, test_grads):
                    self.assertEqual(g_ref.shape, g_test.shape)
                    self.assertTrue(mx.allclose(g_ref, g_test, atol=1e-4).item())

    def test_empty_matmul(self):
        """Matmul and addmm with zero-sized dimensions give the expected results."""
        dense = [[1.0, 2.0], [2.0, 3.0]]

        # Plain matmul where one or both operands have a zero-sized dimension
        matmul_cases = (
            (mx.array([[], []]).T, mx.array(dense), (0, 2)),
            (mx.array(dense), mx.array([[], []]), (2, 0)),
            (mx.array([[], []]).T, mx.array([[], []]), (0, 0)),
        )
        for lhs, rhs, want_shape in matmul_cases:
            prod = lhs @ rhs
            mx.eval(prod)
            self.assertEqual(prod.shape, want_shape)

        # Empty vectors with a scalar c: the result is just c
        scalar_c = mx.array(1.0, dtype=mx.float32)
        empty_a = mx.array([], dtype=mx.float32)
        empty_b = mx.array([], dtype=mx.float32)
        out = mx.addmm(scalar_c, empty_a, empty_b)
        self.assertEqual(out.item(), 1.0)
        self.assertEqual(out.shape, ())

        # With an empty inner dimension a @ b is all zeros, so only beta * c remains
        a = mx.ones((2, 0))
        b = mx.ones((0, 2))
        c = mx.ones((2, 2))
        scalings = [(0.0, 1.0), (0.0, 2.0), (0.0, 0.5), (0.0, 0.0), (1.0, 2.0)]
        for alpha, beta in scalings:
            with self.subTest(alpha=alpha, beta=beta):
                got = mx.addmm(c, a, b, alpha=alpha, beta=beta)
                self.assertTrue(mx.allclose(got, c * beta))

        # Same property for a range of outer sizes
        shape_triples = [
            ((3, 0), (0, 3), (3, 3)),
            ((5, 0), (0, 5), (5, 5)),
            ((1, 0), (0, 10), (1, 10)),
            ((10, 0), (0, 1), (10, 1)),
        ]
        for shape_a, shape_b, shape_c in shape_triples:
            with self.subTest(shape_a=shape_a, shape_b=shape_b, shape_c=shape_c):
                a = mx.ones(shape_a)
                b = mx.ones(shape_b)
                c = mx.ones(shape_c)
                got = mx.addmm(c, a, b, alpha=0.5, beta=2.0)
                self.assertTrue(mx.allclose(got, c * 2.0))

        # Batched operands with an empty inner dimension
        a = mx.ones((2, 5, 0))
        b = mx.ones((2, 0, 5))
        c = mx.ones((2, 5, 5))
        got = mx.addmm(c, a, b, alpha=0.0, beta=3.0)
        self.assertTrue(mx.allclose(got, c * 3.0))

    def test_block_masked_matmul(self):
        """Check ``mx.block_masked_mm`` against a dense masked reference.

        For several sizes, block sizes and batch shapes, both the outputs and
        the VJPs are compared with a reference that expands each block mask to
        element resolution and applies it around a plain matmul. A final
        hand-built case with a transposed operand is compared with NumPy.
        """

        def ref_block_masked_mm(
            a, b, block_size, out_mask=None, lhs_mask=None, rhs_mask=None
        ):
            # Reference: expand the block masks to element resolution, mask the
            # operands, multiply, then mask the output.
            # Matrix dimensions the expanded masks are cropped to
            M = a.shape[-2]
            N = b.shape[-1]
            K = a.shape[-1]

            # Batch shape shared by the operands after broadcasting
            bsx_shape = np.broadcast_shapes(a.shape[:-2], b.shape[:-2])

            # Turn a per-block mask into a per-element mask of shape (..., Y, X)
            def expand_mask(mask, block_size, Y, X):
                # Insert a unit axis after each of the two mask axes ...
                mask = mx.expand_dims(mask, (-3, -1))
                mask_shape = list(bsx_shape) + list(mask.shape[-4:])
                # ... and broadcast both unit axes to block_size
                mask_shape[-1] = block_size
                x = mask_shape[-2] * block_size
                mask_shape[-3] = block_size
                y = mask_shape[-4] * block_size
                mask = mx.broadcast_to(mask, mask_shape)
                # Merge each (blocks, block_size) axis pair, then crop to (Y, X)
                mask_shape = mask_shape[:-4] + [y, x]
                return mask.reshape(mask_shape)[..., :Y, :X]

            a_masked = a
            b_masked = b

            if lhs_mask is not None:
                lhs_mask = expand_mask(lhs_mask, block_size, M, K).astype(mx.float32)
                a_masked = lhs_mask * a_masked

            if rhs_mask is not None:
                rhs_mask = expand_mask(rhs_mask, block_size, K, N).astype(mx.float32)
                b_masked = rhs_mask * b_masked

            out = a_masked @ b_masked

            if out_mask is not None:
                out_mask = expand_mask(out_mask, block_size, M, N).astype(mx.float32)
                out = out * out_mask
            return out

        # Compare outputs and the gradients w.r.t. a and b (masks held fixed)
        def run_test(a, b, block_size, out_mask, a_mask, b_mask, cotan):
            def f_ref(a_, b_):
                return ref_block_masked_mm(a_, b_, block_size, out_mask, a_mask, b_mask)

            def f_test(a_, b_):
                return mx.block_masked_mm(a_, b_, block_size, out_mask, a_mask, b_mask)

            out_ref, dout_ref = mx.vjp(f_ref, [a, b], [cotan])
            out_test, dout_test = mx.vjp(f_test, [a, b], [cotan])

            self.assertTrue(mx.allclose(out_ref[0], out_test[0], atol=1e-5).item())

            for r, t in zip(dout_ref, dout_test):
                self.assertEqual(r.shape, t.shape)
                self.assertTrue(mx.allclose(r, t, atol=1e-4).item())

        # Same as run_test, but the operand masks are differentiated as well
        def run_test_mask_vjp(a, b, block_size, out_mask, a_mask, b_mask, cotan):
            def f_ref(a_, b_, a_mask_, b_mask_):
                return ref_block_masked_mm(
                    a_, b_, block_size, out_mask, a_mask_, b_mask_
                )

            def f_test(a_, b_, a_mask_, b_mask_):
                return mx.block_masked_mm(
                    a_, b_, block_size, out_mask, a_mask_, b_mask_
                )

            out_ref, dout_ref = mx.vjp(f_ref, [a, b, a_mask, b_mask], [cotan])
            out_test, dout_test = mx.vjp(f_test, [a, b, a_mask, b_mask], [cotan])

            mx.eval((out_ref, dout_ref, out_test, dout_test))

            self.assertTrue(mx.allclose(out_ref[0], out_test[0], atol=1e-5).item())

            for r, t in zip(dout_ref, dout_test):
                self.assertEqual(r.shape, t.shape)
                self.assertTrue(mx.allclose(r, t, atol=1e-4).item())

        # Returns (bool mask, float mask) built from one normal sample: the bool
        # mask is True where the sample is negative, and the float mask is the
        # sample with those negative entries set to zero.
        def make_mask(tm_, tn_, batch, np_dtype):
            arr_np_mask = np.random.normal(size=batch + (tm_, tn_)).astype(np_dtype)
            arr_np_bool_mask = arr_np_mask < 0.0
            arr_np_mask[arr_np_bool_mask] = 0.0

            return mx.array(arr_np_bool_mask), mx.array(arr_np_mask)

        # Run every mask combination for one (M, N, K, block_size) configuration
        def test_shape(
            M,
            N,
            K,
            block_size,
            transpose=False,
            np_dtype=np.float32,
            batch_A=(),
            batch_B=(),
        ):
            with self.subTest(
                M=M,
                N=N,
                K=K,
                block_size=block_size,
                np_dtype=np_dtype,
                transpose=transpose,
                batch_A=batch_A,
                batch_B=batch_B,
            ):
                batch_out = np.broadcast_shapes(batch_A, batch_B)
                cotan = mx.ones(batch_out + (M, N))

                a_np = np.random.normal(size=batch_A + (M, K)).astype(np_dtype)
                b_np = np.random.normal(size=batch_B + (K, N)).astype(np_dtype)

                a_mx = mx.array(a_np)
                b_mx = mx.array(b_np)

                # Number of blocks along each dimension (ceiling division)
                tm = (M + block_size - 1) // block_size
                tn = (N + block_size - 1) // block_size
                tk = (K + block_size - 1) // block_size

                a_mx_bool_mask, a_mx_mask = make_mask(tm, tk, batch_A, np_dtype)
                b_mx_bool_mask, b_mx_mask = make_mask(tk, tn, batch_B, np_dtype)
                out_mx_bool_mask, out_mx_mask = make_mask(tm, tn, batch_out, np_dtype)

                # Boolean block masks: all three, output only, operands only
                run_test(
                    a_mx,
                    b_mx,
                    block_size,
                    out_mx_bool_mask,
                    a_mx_bool_mask,
                    b_mx_bool_mask,
                    cotan,
                )
                run_test(a_mx, b_mx, block_size, out_mx_bool_mask, None, None, cotan)
                run_test(
                    a_mx, b_mx, block_size, None, a_mx_bool_mask, b_mx_bool_mask, cotan
                )

                # Float block masks, including gradients w.r.t. the masks
                run_test(
                    a_mx, b_mx, block_size, out_mx_mask, a_mx_mask, b_mx_mask, cotan
                )
                run_test(a_mx, b_mx, block_size, None, a_mx_mask, b_mx_mask, cotan)
                run_test_mask_vjp(
                    a_mx, b_mx, block_size, out_mx_mask, a_mx_mask, b_mx_mask, cotan
                )
                run_test_mask_vjp(
                    a_mx, b_mx, block_size, None, a_mx_mask, b_mx_mask, cotan
                )

        # Each entry is (M, N, K, block_size)
        shapes = (
            (16, 16, 16, 32),
            (64, 64, 16, 32),
            (128, 128, 128, 32),
            (256, 256, 128, 64),
            (1, 128, 128, 32),
            (256, 1, 128, 64),
        )

        for M, N, K, block_size in shapes:
            test_shape(M, N, K, block_size)

        # Batch dimensions that have to be broadcast against each other
        test_shape(64, 64, 64, 32, batch_A=(1, 2), batch_B=(2, 2))
        test_shape(1, 128, 128, 32, batch_A=(1, 2), batch_B=(2, 2))
        test_shape(128, 1, 128, 32, batch_A=(1, 2), batch_B=(2, 2))

        # Hand-built case: a is passed transposed, once as the left operand
        # against a column vector and once as the right operand against a row
        # vector, with block size 32.
        a_np = np.ones((128, 256)).astype(np.float32)
        b_np = np.ones((128, 1)).astype(np.float32)
        d_np = np.ones((1, 256)).astype(np.float32)
        a_mask_np = np.random.normal(size=(4, 8)).astype(np.float32)
        b_mask_np = np.ones((4, 1)).astype(np.bool_)
        d_mask_np = np.ones((1, 8)).astype(np.bool_)
        c_mask_np = np.random.normal(size=(8, 1)).astype(np.float32)
        e_mask_np = np.random.normal(size=(1, 4)).astype(np.float32)

        # Zero the negative entries of the float masks
        a_mask_np[a_mask_np < 0.0] = 0.0
        e_mask_np[e_mask_np < 0.0] = 0.0
        c_mask_np[c_mask_np < 0.0] = 0.0

        a_mx = mx.array(a_np)
        b_mx = mx.array(b_np)
        d_mx = mx.array(d_np)
        a_mask_mx = mx.array(a_mask_np)
        b_mask_mx = mx.array(b_mask_np)
        d_mask_mx = mx.array(d_mask_np)
        e_mask_mx = mx.array(e_mask_np)
        c_mask_mx = mx.array(c_mask_np)

        c_mx = mx.block_masked_mm(a_mx.T, b_mx, 32, c_mask_mx, a_mask_mx.T, b_mask_mx)
        e_mx = mx.block_masked_mm(d_mx, a_mx.T, 32, e_mask_mx, d_mask_mx, a_mask_mx.T)

        # NumPy reference: expand the (4, 8) block mask to (128, 256) and apply it to a
        a_mask_np = np.broadcast_to(np.expand_dims(a_mask_np, (-3, -1)), (4, 32, 8, 32))
        a_mask_np = a_mask_np.reshape((128, 256))
        a_np *= a_mask_np

        c_np = a_np.T @ b_np
        e_np = d_np @ a_np.T

        # Expand the output mask of c from (8, 1) blocks to (256, 1) elements
        c_mask_np = np.broadcast_to(np.expand_dims(c_mask_np, (-2)), (8, 32, 1))
        c_mask_np = c_mask_np.reshape((256, 1))
        c_np *= c_mask_np

        # Expand the output mask of e from (1, 4) blocks to (1, 128) elements
        e_mask_np = np.broadcast_to(np.expand_dims(e_mask_np, (-1)), (1, 4, 32))
        e_mask_np = e_mask_np.reshape((1, 128))
        e_np *= e_mask_np

        self.assertTrue(np.allclose(c_mx, c_np, atol=1e-5))
        self.assertTrue(np.allclose(e_mx, e_np, atol=1e-5))

    def test_gather_matmul(self):
        """Compare ``mx.gather_mm`` with a NumPy gather-then-matmul reference.

        Covers several batch/index configurations, broadcast (non-contiguous)
        operands, and a single-row left operand with a transposed right one.
        """

        def np_gather_mm(a, b, lhs_indices=None, rhs_indices=None):
            # Flatten all batch dimensions so one index selects one matrix
            a = a.reshape((-1, a.shape[-2], a.shape[-1]))
            b = b.reshape((-1, b.shape[-2], b.shape[-1]))
            # Test against None explicitly: ``indices or default`` would swap
            # in the default for an empty index sequence and raises for a
            # multi-element ndarray, whose truth value is ambiguous.
            if lhs_indices is None:
                lhs_indices = np.arange(a.shape[0])
            if rhs_indices is None:
                rhs_indices = np.arange(b.shape[0])
            a = a[lhs_indices, :, :]
            b = b[rhs_indices, :, :]
            out = a @ b
            return out

        def test_shape(
            M,
            N,
            K,
            np_dtype=np.float32,
            batch_A=(),
            batch_B=(),
            lhs_indices=None,
            rhs_indices=None,
        ):
            # Run one configuration through both implementations and compare
            with self.subTest(
                M=M,
                N=N,
                K=K,
                np_dtype=np_dtype,
                batch_A=batch_A,
                batch_B=batch_B,
                lhs_indices=lhs_indices,
                rhs_indices=rhs_indices,
            ):
                a_np = np.random.normal(size=batch_A + (M, K)).astype(np_dtype)
                b_np = np.random.normal(size=batch_B + (K, N)).astype(np_dtype)

                a_mx = mx.array(a_np)
                b_mx = mx.array(b_np)

                out_np = np_gather_mm(a_np, b_np, lhs_indices, rhs_indices)

                # A missing index set is passed to gather_mm as None
                lhs_indices_mx = None if lhs_indices is None else mx.array(lhs_indices)
                rhs_indices_mx = None if rhs_indices is None else mx.array(rhs_indices)

                out_mx = mx.gather_mm(a_mx, b_mx, lhs_indices_mx, rhs_indices_mx)

                self.assertTrue(np.allclose(out_np, out_mx, atol=1e-5))

        inputs = (
            {
                "batch_A": (1,),
                "lhs_indices": (0,),
                "batch_B": (3,),
                "rhs_indices": (2, 1),
            },
            {
                "batch_A": (1,),
                "lhs_indices": None,
                "batch_B": (3,),
                "rhs_indices": (2, 1),
            },
            {
                "batch_A": (2,),
                "lhs_indices": None,
                "batch_B": (3,),
                "rhs_indices": (2, 1),
            },
            {
                "batch_A": (3,),
                "lhs_indices": (0, 2),
                "batch_B": (1,),
                "rhs_indices": (0,),
            },
            {
                "batch_A": (5,),
                "lhs_indices": (0, 2),
                "batch_B": (3,),
                "rhs_indices": (2, 1),
            },
            {
                "batch_A": (4, 2),
                "lhs_indices": (
                    (7, 6),
                    (5, 4),
                    (1, 2),
                ),
                "batch_B": (4, 1),
                "rhs_indices": ((2,), (0,), (1,)),
            },
        )

        # Every configuration is run with a square and a single-column product
        for kwargs in inputs:
            test_shape(32, 32, 32, **kwargs)
            test_shape(16, 1, 16, **kwargs)

        # Broadcast operands: both sides are expanded views, not dense copies
        a_np = np.random.normal(size=(5, 32, 32)).astype(np.float32)
        b_np = np.random.normal(size=(3, 32, 32)).astype(np.float32)
        a_mx = mx.array(a_np)
        b_mx = mx.array(b_np)

        # NumPy side
        a_np = a_np.reshape((5, 1, 32, 32))
        b_np = b_np.reshape((1, 3, 32, 32))

        a_np = np.broadcast_to(a_np, (5, 4, 32, 32))
        b_np = np.broadcast_to(b_np, (2, 3, 32, 32)).swapaxes(1, 0)

        lhs_indices = [0, 13, 12]
        rhs_indices = [0, 3, 5]

        out_np = np_gather_mm(a_np, b_np, lhs_indices, rhs_indices)

        # MLX side, built with the same sequence of views
        a_mx = a_mx.reshape((5, 1, 32, 32))
        b_mx = b_mx.reshape((1, 3, 32, 32))

        a_mx = mx.broadcast_to(a_mx, (5, 4, 32, 32))
        b_mx = mx.broadcast_to(b_mx, (2, 3, 32, 32)).swapaxes(1, 0)

        lhs_indices_mx = mx.array(lhs_indices)
        rhs_indices_mx = mx.array(rhs_indices)

        out_mx = mx.gather_mm(a_mx, b_mx, lhs_indices_mx, rhs_indices_mx)

        self.assertTrue(np.allclose(out_np, out_mx, atol=1e-5))

        # Gemv test: single-row left operand, transposed right operand
        a_np = np.random.normal(size=(5, 1, 32)).astype(np.float32)
        b_np = np.random.normal(size=(3, 16, 32)).astype(np.float32)
        a_mx = mx.array(a_np)
        b_mx = mx.array(b_np)

        lhs_indices = [3, 1]
        rhs_indices = [0, 2]

        b_np_t = np.swapaxes(b_np, -1, -2)
        out_np = np_gather_mm(a_np, b_np_t, lhs_indices, rhs_indices)

        lhs_indices_mx = mx.array(lhs_indices)
        rhs_indices_mx = mx.array(rhs_indices)

        b_mx_t = mx.swapaxes(b_mx, -1, -2)
        out_mx = mx.gather_mm(a_mx, b_mx_t, lhs_indices_mx, rhs_indices_mx)

        self.assertTrue(np.allclose(out_np, out_mx, atol=1e-5))

    def test_gather_matmul_grad(self):
        """VJPs of ``mx.gather_mm`` must match an explicit take-then-matmul."""
        lhs_indices = mx.array([[7, 6], [4, 1], [0, 2]], dtype=mx.uint32)
        rhs_indices = mx.array([[2], [0], [1]], dtype=mx.uint32)

        def reference(a, b):
            # Bring both index sets to a common (3, 2) shape before gathering
            lhs_full = mx.broadcast_to(lhs_indices, (3, 2))
            rhs_full = mx.broadcast_to(rhs_indices, (3, 2))
            rows, inner = a.shape[-2], a.shape[-1]
            cols = b.shape[-1]

            # Collapse the batch dimensions so each index picks one matrix
            flat_a = a.reshape((-1, rows, inner))
            flat_b = b.reshape((-1, inner, cols))

            return mx.take(flat_a, lhs_full, 0) @ mx.take(flat_b, rhs_full, 0)

        def gathered(a, b):
            return mx.gather_mm(a, b, lhs_indices, rhs_indices)

        a_mx = mx.random.normal((4, 2, 32, 32))
        b_mx = mx.random.normal((4, 1, 32, 32))

        # Forward values first
        out_test = gathered(a_mx, b_mx)
        out_ref = reference(a_mx, b_mx)
        self.assertTrue(mx.allclose(out_test, out_ref, atol=1e-5))

        # Then the gradients w.r.t. both operands
        cotangent = mx.ones_like(out_test)
        primals = [a_mx, b_mx]
        out_ref, grads_ref = mx.vjp(reference, primals, [cotangent])
        out_test, grads_test = mx.vjp(gathered, primals, [cotangent])

        for g_ref, g_test in zip(grads_ref, grads_test):
            self.assertEqual(g_ref.shape, g_test.shape)
            self.assertTrue(mx.allclose(g_ref, g_test, atol=1e-4).item())

    def test_gather_mm_sorted(self):
        """``gather_mm`` with ``sorted_indices=True`` matches index-then-matmul."""
        dtype_tols = [(mx.float32, 1e-4)]
        # The half-precision types are only added when CUDA is available
        if mx.cuda.is_available():
            dtype_tols.extend([(mx.float16, 1e-3), (mx.bfloat16, 1e-2)])

        for b_transposed in (True, False):
            for dtype, tol in dtype_tols:
                with self.subTest(b_transposed=b_transposed, dtype=dtype):
                    lhs = mx.random.normal((100, 1, 100), dtype=dtype)
                    bank = mx.random.normal((8, 100, 100), dtype=dtype)
                    if b_transposed:
                        bank = bank.swapaxes(-1, -2)
                    # Sorted indices into the bank of right-hand matrices
                    picks = mx.sort(mx.random.randint(0, 8, shape=(100,)))

                    want = lhs @ bank[picks]
                    got = mx.gather_mm(
                        lhs, bank, rhs_indices=picks, sorted_indices=True
                    )
                    self.assertTrue(mx.allclose(want, got, rtol=tol, atol=tol))

    def test_gather_mm_sorted_vjp(self):
        """Outputs and VJPs of sorted ``gather_mm`` match index-then-matmul."""
        a = mx.random.normal((100, 1, 100))
        b = mx.random.normal((8, 100, 100))
        rhs = mx.sort(mx.random.randint(0, 8, shape=(100,)))

        def reference(lhs, bank):
            return lhs @ bank[rhs]

        def gathered(lhs, bank):
            return mx.gather_mm(lhs, bank, rhs_indices=rhs, sorted_indices=True)

        cotangent = mx.random.normal((100, 1, 100))
        ref_out, ref_grads = mx.vjp(reference, [a, b], [cotangent])
        test_out, test_grads = mx.vjp(gathered, [a, b], [cotangent])

        # Forward value, then the gradient for each of the two operands
        self.assertTrue(mx.allclose(ref_out[0], test_out[0], atol=1e-4))
        self.assertTrue(mx.allclose(ref_grads[0], test_grads[0], atol=1e-4))
        self.assertTrue(mx.allclose(ref_grads[1], test_grads[1], atol=1e-4))

    def test_segmented_mm(self):
        """Compare ``mx.segmented_mm`` with a per-segment slice-and-matmul loop."""

        def reference(a, b, bounds):
            # One product per (start, stop) pair along the shared dimension
            return mx.stack(
                [a[:, lo:hi] @ b[lo:hi, :] for lo, hi in bounds.tolist()], axis=0
            )

        problem_sizes = [
            (10, 10, 10),
            (10, 10, 1000),
            (1000, 1000, 1000),
        ]
        # Segment boundaries as fractions of K
        breakpoints = [[0, 0, 1.0], [0, 0.5, 1.0], [r / 9 for r in range(10)]]
        # (a stored transposed, b stored transposed)
        layouts = ((False, False), (True, False), (False, True), (True, True))

        for M, N, K in problem_sizes:
            for fracs in breakpoints:
                pairs = [[lo, hi] for lo, hi in zip(fracs[:-1], fracs[1:])]
                segments = (K * mx.array(pairs)).astype(mx.uint32)
                segments = mx.minimum(K - 1, segments)

                for a_stored_t, b_stored_t in layouts:
                    a = mx.random.normal((K, M) if a_stored_t else (M, K))
                    b = mx.random.normal((N, K) if b_stored_t else (K, N))
                    lhs = a.T if a_stored_t else a
                    rhs = b.T if b_stored_t else b
                    want = reference(lhs, rhs, segments)
                    got = mx.segmented_mm(lhs, rhs, segments)
                    self.assertTrue(mx.allclose(want, got, atol=1e-4))

        # Three-dimensional operands are rejected
        with self.assertRaises(ValueError):
            a = mx.ones((2, 10, 10))
            s = mx.array([[0, 5], [5, 10]]).astype(mx.uint32)
            mx.segmented_mm(a, a, s)

        # Segments given with extra leading dimensions: build 16 consecutive
        # (start, stop) pairs from a sorted histogram and reshape them.
        a = mx.ones((10, 1000))
        labels = mx.random.randint(0, 16, shape=(1000,))
        counts = mx.zeros(16, dtype=labels.dtype).at[labels].add(1)
        ends = mx.cumsum(mx.sort(counts))
        edges = mx.concatenate([mx.array([0]), ends])
        # Overlapping windows of two consecutive edges
        windows = mx.as_strided(edges, (16, 2), (1, 1))
        windows = mx.reshape(windows, (2, 2, 4, 2))
        c = mx.segmented_mm(a, a.T, windows)
        self.assertEqual(c.shape, (2, 2, 4, 10, 10))

    def test_gemv_gemm_same_precision(self):
        """A one-row product must agree with the same row of a two-row product."""
        mx.random.seed(0)
        N = 256
        # The comparison only runs when a GPU device is available
        if not mx.is_available(mx.gpu):
            return

        dtype = mx.bfloat16
        row = mx.random.normal([1, N]).astype(dtype)
        two_rows = mx.concatenate([row, row], axis=0).astype(dtype)
        weights = mx.random.normal([N, 64]).astype(dtype)
        from_single = row @ weights
        from_pair = (two_rows @ weights)[0]
        self.assertTrue(mx.allclose(from_single, from_pair))

    def test_complex_gemv(self):
        """Complex matrix-vector products match NumPy and keep the right shapes."""
        M = 16
        N = 50

        def rand(shape):
            real_part = mx.random.uniform(shape=shape)
            imag_part = mx.random.uniform(shape=shape)
            return real_part + 1j * imag_part

        # Matrix times column vector
        mat = rand((M, N))
        vec = rand((N, 1))
        got = mx.matmul(mat, vec)
        want = np.matmul(mat, vec)
        self.assertTrue(np.allclose(got, want))

        # Transposed
        mat = rand((N, M))
        vec = rand((N, 1))
        got = mx.matmul(mat.T, vec)
        want = np.matmul(np.array(mat).T, vec)
        self.assertTrue(np.allclose(got, want))

        # Check shapes: complex matrix with a real 1-D vector
        mat = mx.random.normal((2, 3)).astype(mx.complex64)
        vec = mx.random.normal((3,))
        self.assertEqual((mat @ vec).shape, (2,))

        # Same operand kinds through addmm with a real 1-D c
        mat = mx.random.normal((2, 3)).astype(mx.complex64)
        vec = mx.random.normal((3,))
        bias = mx.random.normal((2,))
        self.assertEqual(mx.addmm(bias, mat, vec).shape, (2,))

    def test_complex_gemm(self):
        """Complex matrix products and ``addmm`` must match NumPy.

        Covers complex @ complex, ``addmm`` with complex operands, and a real
        left operand multiplied by a complex right operand.
        """
        M = 16
        K = 50
        N = 32

        def rand(shape):
            # Complex array with uniform real and imaginary parts
            return mx.random.uniform(shape=shape) + 1j * mx.random.uniform(shape=shape)

        a = rand((M, K))
        b = rand((K, N))
        c = mx.matmul(a, b)
        c_np = np.matmul(a, b)
        self.assertTrue(np.allclose(c, c_np))

        # Test addmm
        a = rand((M, K))
        b = rand((K, N))
        c = rand((M, N))
        out = mx.addmm(c, a, b, 2.0, 2.0)
        out_np = 2.0 * np.matmul(a, b) + 2.0 * c
        self.assertTrue(np.allclose(out, out_np))

        # complex with real
        a = rand((M, K)).real
        b = rand((K, N))
        c = mx.matmul(a, b)
        c_np = np.matmul(a, b)
        # Compare the mixed product itself, not the addmm result from above
        self.assertTrue(np.allclose(c, c_np))


# When executed as a script, hand control to the project's test runner
# (presumably runs the tests defined in this module).
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_compile.py
================================================
# Copyright © 2023-2024 Apple Inc.

import gc
import inspect
import io
import math
from functools import partial, wraps
from io import StringIO

import mlx.core as mx
import mlx_tests
import numpy as np


class TestCompile(mlx_tests.MLXTestCase):
    def test_simple_compile(self):
        # A compiled add keeps working across repeated calls, new shapes and
        # new dtypes.
        def fun(x, y):
            return x + y

        # NOTE(review): compile is invoked twice on the same function and only
        # the second handle is used -- confirm whether the repeat is deliberate.
        compiled_fn = mx.compile(fun)
        compiled_fn = mx.compile(fun)

        lhs = mx.array(1.0)
        rhs = mx.array(1.0)
        # First call, then a repeat with identical inputs
        for _ in range(2):
            result = compiled_fn(lhs, rhs)
            self.assertEqual(result.item(), 2.0)

        # Vector first operand, scalar second operand
        lhs = mx.array([1.0, 2.0])
        result = compiled_fn(lhs, rhs)
        self.assertTrue(mx.array_equal(result, mx.array([2.0, 3.0])))

        # Both operands are vectors
        rhs = mx.array([1.0, 2.0])
        result = compiled_fn(lhs, rhs)
        self.assertTrue(mx.array_equal(result, mx.array([2.0, 4.0])))

        # Integer operands: the output dtype must follow the inputs
        lhs = mx.array([1, 2], mx.int32)
        rhs = mx.array([1, 2], mx.int32)
        result = compiled_fn(lhs, rhs)
        self.assertEqual(result.dtype, mx.int32)
        self.assertTrue(mx.array_equal(result, mx.array([2, 4])))

    def test_compile_grad(self):
        # Compiled grad / value_and_grad functions must match the eager ones.
        def loss_fn(x):
            return mx.exp(x).sum()

        grad_fn = mx.grad(loss_fn)

        x = mx.array([0.5, -0.5, 1.2])
        dfdx = grad_fn(x)
        compile_grad_fn = mx.compile(grad_fn)
        # Call the compiled function here; calling grad_fn would only compare
        # the eager gradient with itself.
        c_dfdx = compile_grad_fn(x)

        self.assertTrue(mx.allclose(c_dfdx, dfdx))

        # Run it again without calling compile
        c_dfdx = compile_grad_fn(x)
        self.assertTrue(mx.allclose(c_dfdx, dfdx))

        # Run it again with calling compile
        c_dfdx = mx.compile(grad_fn)(x)
        self.assertTrue(mx.allclose(c_dfdx, dfdx))

        # Value and grad: the loss function returns (loss, auxiliary value)
        def loss_fn(x):
            return mx.exp(x).sum(), mx.sin(x)

        val_and_grad_fn = mx.value_and_grad(loss_fn)
        (loss, val), dfdx = val_and_grad_fn(x)
        (c_loss, c_val), c_dfdx = mx.compile(val_and_grad_fn)(x)

        self.assertTrue(mx.allclose(c_dfdx, dfdx))
        self.assertTrue(mx.allclose(c_loss, loss))
        self.assertTrue(mx.allclose(c_val, val))

    def test_compile_inputs_with_primitives(self):
        # Inputs handed to a compiled function may themselves be the result
        # of earlier (not yet evaluated) operations.
        def build_inputs():
            lhs = mx.array([1, 2, 3])
            rhs = mx.array([1, 2, 3])
            for _ in range(5):
                lhs = lhs + rhs
                rhs = rhs + 1
            return lhs, rhs

        def fun(x, y):
            return x * y

        # Eager reference on one set of inputs
        out = fun(*build_inputs())

        # Freshly built inputs for the compiled calls
        x, y = build_inputs()

        # Compile and call twice; both results must match the reference
        for _ in range(2):
            c_out = mx.compile(fun)(x, y)
            self.assertTrue(mx.array_equal(out, c_out))

    def test_compile_with_closure(self):
        # Values captured from the enclosing scope are fixed when the compiled
        # function is first traced: the assertions below expect rebinding or
        # mutating the enclosed names afterwards to have no effect.
        x = mx.array(1)

        def closure(y):
            return x + y

        compiled = mx.compile(closure)
        out = compiled(mx.array(1))
        self.assertEqual(out.item(), 2)

        # Try again
        out = compiled(mx.array(1))
        self.assertEqual(out.item(), 2)

        # Change the shape of the enclosed variable
        x = mx.array([1, 2])
        out = compiled(mx.array(1))

        # We still get the original input (closures are not updated)
        self.assertEqual(out.item(), 2)

        # Try with a tree of enclosed variables
        x = {"a": mx.array(1), "b": mx.array(2)}

        def closure(y):
            return x["a"] + y + x["b"]

        compiled = mx.compile(closure)
        out = compiled(mx.array(1))
        self.assertEqual(out.item(), 4)

        # Change the shape of one input
        x["a"] = mx.array([4, 5])
        out = compiled(mx.array(1))
        self.assertEqual(out.item(), 4)

        # Replace the other entry as well; the result is still unchanged
        x["b"] = mx.array([-6, -8])
        out = compiled(mx.array(1))
        self.assertEqual(out.item(), 4)

        # Enclosed variable is not evaluated yet
        x = mx.array(1)
        x = x + x

        def closure(y):
            return x + y

        compiled = mx.compile(closure)
        out = compiled(mx.array(2))
        self.assertEqual(out.item(), 4)

        # And again
        out = compiled(mx.array(2))
        self.assertEqual(out.item(), 4)

    def test_function_creates_array(self):
        # A compiled function may construct a new array inside its body.
        def fun(x):
            return x + mx.array(1)

        cfun = mx.compile(fun)

        # The second pass re-runs the same compiled handle
        for _ in range(2):
            result = cfun(mx.array(3))
            self.assertEqual(result.item(), 4)

    def test_enable_disable(self):
        # Globally disabling compilation must yield a graph with more
        # primitives; re-enabling must restore the compiled count.
        def fun(x):
            y = x + 1
            z = x + 1
            return y + z

        def count_prims(outputs):
            # Number of whitespace-separated tokens in the DOT export that
            # contain "label".
            buf = io.StringIO()
            mx.export_to_dot(buf, outputs)
            buf.seek(0)
            return sum("label" in token for token in buf.read().split())

        x = mx.array(1.0)
        cfun = mx.compile(fun)
        n_compiled = count_prims(cfun(x))

        # Check disabled
        mx.disable_compile()
        try:
            n_uncompiled = count_prims(cfun(x))
        finally:
            # Always restore the global switch so a failure while compilation
            # is off cannot leak into the tests that run afterwards.
            mx.enable_compile()
        self.assertLess(n_compiled, n_uncompiled)

        # Check re-enabled
        n_enable_compiled = count_prims(cfun(x))
        self.assertEqual(n_compiled, n_enable_compiled)

    def test_compile_two_input_grad(self):
        # Gradient of a two-argument loss: eager vs compiled.
        def loss(w, x):
            y = x * w
            return (y * mx.exp(y)).sum()

        x = mx.array([1.0, 0.5, 2.0, -0.5])
        w = mx.array([-1.0, 0.3, 1.0, -0.9])

        eager_grad = mx.grad(loss)(w, x)
        compiled_grad_fn = mx.compile(mx.grad(loss))
        compiled_grad = compiled_grad_fn(w, x)
        self.assertTrue(mx.allclose(eager_grad, compiled_grad))

    def test_vmap_compiled(self):
        # vmap over a compiled function must agree with vmap over the eager
        # function, for several in_axes configurations.
        def assert_vmap_matches(fn, args, **vmap_kwargs):
            expected_out = mx.vmap(fn, **vmap_kwargs)(*args)
            out = mx.vmap(mx.compile(fn), **vmap_kwargs)(*args)
            self.assertTrue(mx.allclose(expected_out, out))

        def simple_unary(x):
            return -mx.exp(x)

        x = mx.array([[1.0, 2.0], [2.0, 3.0]])
        assert_vmap_matches(simple_unary, (x,))

        def simple_binary(x, y):
            return mx.abs(mx.exp(x + y) + y)

        x = mx.array([[1.0, -3.0], [0.5, -0.5]])
        y = mx.array([[2.0, -1.0], [0.25, -0.25]])

        assert_vmap_matches(simple_binary, (x, y))
        assert_vmap_matches(simple_binary, (x, y), in_axes=(0, 1))

        # Second operand is not mapped
        y = mx.array([0.25, -0.25])
        assert_vmap_matches(simple_binary, (x, y), in_axes=(0, None))

        # Compiled function defined (and called) inside the vmapped function
        def simple_unary_outer(x):
            x = mx.abs(x)

            @mx.compile
            def simple_unary_inner(z):
                # Reads the enclosing x rather than its own argument
                return -mx.exp(x)

            return simple_unary_inner(x)

        reference = -mx.exp(mx.abs(x))
        mapped = mx.vmap(simple_unary_outer)(x)
        self.assertTrue(mx.allclose(reference, mapped))

    def test_vjp_vjp_compiled(self):
        # vjp and jvp through compiled functions must match the eager results.
        def simple_unary(x):
            return -mx.exp(x)

        primal = mx.array([[1.0, 2.0], [2.0, 3.0]])
        seed = mx.array([[1.0, 1.0], [1.0, 1.0]])

        # Same check for vjp first, then jvp
        for transform in (mx.vjp, mx.jvp):
            ref_out, ref_deriv = transform(simple_unary, (primal,), (seed,))
            got_out, got_deriv = transform(
                mx.compile(simple_unary), (primal,), (seed,)
            )
            self.assertTrue(mx.allclose(ref_deriv[0], got_deriv[0]))
            self.assertTrue(mx.allclose(ref_out[0], got_out[0]))

        def simple_binary(x, y):
            return mx.abs(mx.exp(x + y) + y)

        x = mx.array([[1.0, -3.0], [0.5, -0.5]])
        y = mx.array([[2.0, -1.0], [0.25, -0.25]])
        primals = (x, y)

        # vjp with a single cotangent of ones
        cotangents = (mx.ones_like(x),)
        ref_out, ref_vjp = mx.vjp(simple_binary, primals, cotangents)
        got_out, got_vjp = mx.vjp(mx.compile(simple_binary), primals, cotangents)
        self.assertTrue(mx.allclose(ref_out[0], got_out[0]))
        self.assertTrue(mx.allclose(ref_vjp[0], got_vjp[0]))
        self.assertTrue(mx.allclose(ref_vjp[1], got_vjp[1]))

        # jvp with one tangent per primal
        tangents = (mx.ones_like(x), mx.ones_like(y))
        ref_out, ref_jvp = mx.jvp(simple_binary, primals, tangents)
        got_out, got_jvp = mx.jvp(mx.compile(simple_binary), primals, tangents)
        self.assertTrue(mx.allclose(ref_jvp[0], got_jvp[0]))
        self.assertTrue(mx.allclose(ref_out[0], got_out[0]))

    def test_transform_over_eval_compiled(self):
        # grad of a function that calls mx.eval on an intermediate value,
        # with and without a compiled inner function.
        def eager_outer(x):
            y = mx.exp(mx.abs(x))
            mx.eval(y)
            return y.sum()

        x = mx.array([2.0, -1.0, 0.5])
        dfdx = mx.grad(eager_outer)(x)

        @mx.compile
        def simple_unary(x):
            return mx.exp(mx.abs(x))

        def compiled_outer(x):
            y = simple_unary(x)
            mx.eval(y)
            return y.sum()

        cdfdx = mx.grad(compiled_outer)(x)
        self.assertTrue(mx.allclose(dfdx, cdfdx))

    def test_compile_capture(self):
        # Exercises the `inputs=` / `outputs=` arguments of mx.compile with
        # dict, list and nested tuple/list containers of arrays.

        # Test update captured state outside compiled function
        state = {"y": mx.array(2)}

        @partial(mx.compile, inputs=state)
        def test_state(x):
            x = x + state["y"]
            return x

        test_state(mx.array(1))
        # Check the state is unchanged
        self.assertEqual(state["y"], 2)

        # Check the updated state is used
        state["y"] = mx.array(3)
        out = test_state(mx.array(1))
        self.assertEqual(out.item(), 4)

        # Capture list
        state = [mx.array(2)]

        @partial(mx.compile, inputs=state)
        def test_state(x):
            x = x + state[0]
            return x

        out = test_state(mx.array(1))
        self.assertEqual(out.item(), 3)
        state[0] = mx.array(3)
        out = test_state(mx.array(1))
        self.assertEqual(out.item(), 4)

        # Capture tuple of list
        state = ([mx.array(2)],)

        @partial(mx.compile, inputs=state)
        def test_state(x):
            x = x + state[0][0]
            return x

        out = test_state(mx.array(1))
        self.assertEqual(out.item(), 3)
        state[0][0] = mx.array(3)
        out = test_state(mx.array(1))
        self.assertEqual(out.item(), 4)

        # Test state updated inside compiled function
        state = {}

        @partial(mx.compile, outputs=state)
        def test_state(x):
            state["y"] = x + 3
            return mx.abs(x)

        test_state(mx.array(-1))
        self.assertEqual(state["y"].item(), 2)

        # Test state changed inside compiled function
        # triggers recompile
        state = {}

        @partial(mx.compile, inputs=state, outputs=state)
        def test_state(x):
            # "y" is absent on the first call, so it falls back to 0
            y = state.get("y", mx.array(0))
            state["y"] = x + y
            return x + 2 * y

        test_state(mx.array(1))
        self.assertEqual(state["y"].item(), 1)
        test_state(mx.array(1))
        self.assertEqual(state["y"].item(), 2)

    def test_compile_rng(self):
        # With the global random state passed as both captured input and
        # output, two successive calls must not return the same samples.
        @partial(mx.compile, inputs=mx.random.state, outputs=mx.random.state)
        def fun():
            return mx.random.uniform(shape=(10, 10))

        first_draw = fun()
        second_draw = fun()
        self.assertFalse(mx.allclose(first_draw, second_draw, 1e-2, 1e-2))

    def test_compile_kwargs(self):
        # Compiled functions accept array arguments passed by keyword.
        @mx.compile
        def fun(x, y, z):
            return x + y + z

        positional = mx.array(1)
        by_keyword = {"y": mx.array(2), "z": mx.array(3)}
        total = fun(positional, **by_keyword)
        self.assertEqual(total.item(), 6)

    def test_shapeless_compile(self):
        # Per the expected values below, a shapeless compiled function is
        # re-traced on a dtype or rank change but not on a shape-only change.
        y = 1

        @partial(mx.compile, shapeless=True)
        def fun(x):
            return x + y

        result = fun(mx.array([1, 2]))
        self.assertTrue(mx.array_equal(result, mx.array([2, 3])))

        # Only the length changes: no recompilation, so the new value of y
        # must not show up in the output
        y = 2
        result = fun(mx.array([1, 2, 3]))
        self.assertTrue(mx.array_equal(result, mx.array([2, 3, 4])))

        # A different dtype recompiles (and therefore sees y == 2)
        result = fun(mx.array([1.0, 2.0, 3.0]))
        self.assertTrue(mx.array_equal(result, mx.array([3.0, 4.0, 5.0])))

        # A different number of dimensions recompiles as well
        result = fun(mx.array([[1, 2, 3]]))
        self.assertTrue(mx.array_equal(result, mx.array([[3, 4, 5]])))

    def test_shapeless_compile_with_broadcasts(self):
        # A shapeless compiled multiply must broadcast like the eager one,
        # whichever side the smaller operand is on.
        def fun(x, y):
            return x * y

        x = mx.ones((2, 2))
        cfun = mx.compile(fun, shapeless=True)

        for y in (mx.array([2, 2]), mx.array([[3]])):
            self.assertTrue(mx.array_equal(cfun(x, y), fun(x, y)))
            self.assertTrue(mx.array_equal(cfun(y, x), fun(y, x)))

    def test_shapeless_compile_with_reduction(self):
        # Shapeless compile combined with sum reductions.
        z = 1

        @partial(mx.compile, shapeless=True)
        def fun(x, y):
            return x + y.sum(0, keepdims=True) + z

        def int_ones(n):
            return mx.ones((n, n), mx.int32)

        out = fun(int_ones(2), int_ones(2))
        self.assertTrue(mx.array_equal(out, mx.full(shape=(2, 2), vals=4)))

        # Larger inputs together with a changed z; the expected value of 5
        # corresponds to the original z == 1 still being in effect
        lhs = int_ones(3)
        rhs = int_ones(3)
        z = 2
        out = fun(lhs, rhs)
        self.assertTrue(mx.array_equal(out, mx.full(shape=(3, 3), vals=5)))

        x1 = mx.array([[1, 2], [3, 4], [5, 6]])
        x2 = mx.array([[1, 2]])

        # Reduction that keeps the reduced axis: trace on x1, check on x2
        def scale_keepdims(x):
            return x * x.sum(-1, keepdims=True)

        c_scale_keepdims = mx.compile(scale_keepdims, shapeless=True)
        mx.eval(c_scale_keepdims(x1))
        self.assertTrue(mx.array_equal(scale_keepdims(x2), c_scale_keepdims(x2)))

        # Reduction that drops the reduced axis
        def scale_dropdims(x):
            return x * x.sum(-1, keepdims=False)

        c_scale_dropdims = mx.compile(scale_dropdims, shapeless=True)
        self.assertTrue(mx.array_equal(scale_dropdims(x2), c_scale_dropdims(x2)))

    def test_shapeless_compile_unflatten(self):
        # unflatten with an inferred (-1) dimension under shapeless compile.
        def fun(x):
            return mx.unflatten(x, -1, (4, -1))

        compiled = mx.compile(fun, shapeless=True)
        result = compiled(mx.zeros((1, 1, 4 * 32)))
        self.assertEqual(result.shape, (1, 1, 4, 32))

    def test_shapeless_compile_gather(self):
        # Integer indexing on the middle axis under shapeless compile.
        def fun(x):
            return x[:, -1, :]

        compiled = mx.compile(fun, shapeless=True)
        result = compiled(mx.zeros((1, 1, 32)))
        self.assertEqual(result.shape, (1, 32))

    def test_shapeless_compile_full_like(self):
        # zeros_like / ones_like under shapeless compile must follow the
        # shape of whatever input they are called with.
        def zeros_fun(x):
            return mx.zeros_like(x)

        def ones_fun(x):
            return mx.ones_like(x)

        compiled_zero_like = mx.compile(zeros_fun, shapeless=True)
        compiled_ones_like = mx.compile(ones_fun, shapeless=True)

        # The first shape traces the functions, the second reuses them
        for shape in ((1, 1, 32), (2, 2, 16)):
            template = mx.zeros(shape)
            self.assertEqual(compiled_zero_like(template).shape, shape)
            self.assertEqual(compiled_ones_like(template).shape, shape)

    def test_compile_with_constant(self):
        # Non-array arguments (floats, tuples, bools, strings, nested lists)
        # are passed to compiled functions; the assertions check that a
        # changed constant value is reflected in the result.

        # Test float
        @partial(mx.compile)
        def fun(x, y):
            return x + y

        z = fun(mx.array(1.0), 1.0)
        self.assertEqual(z.item(), 2.0)

        z = fun(mx.array(1.0), 2.0)
        self.assertEqual(z.item(), 3.0)

        z = fun(mx.array(1.0), y=1.0)
        self.assertEqual(z.item(), 2.0)

        z = fun(mx.array(1.0), y=3.0)
        self.assertEqual(z.item(), 4.0)

        # Test tuple
        @partial(mx.compile)
        def fun(x, y=(1, 2)):
            return x + y[0] + y[1]

        z = fun(mx.array(1))
        self.assertEqual(z.item(), 4)

        z = fun(mx.array(1), (2, 2))
        self.assertEqual(z.item(), 5)

        z = fun(mx.array(1), (2, 1))
        self.assertEqual(z.item(), 4)

        # Test bool
        @partial(mx.compile)
        def fun(x, y):
            if y:
                return x + 1
            else:
                return x + 2

        z = fun(mx.array(1), True)
        self.assertEqual(z.item(), 2)

        z = fun(mx.array(1), False)
        self.assertEqual(z.item(), 3)

        # Test string
        @partial(mx.compile)
        def fun(x, y):
            if y == "one":
                return x + 1
            else:
                return x + 2

        z = fun(mx.array(1), "one")
        self.assertEqual(z.item(), 2)

        z = fun(mx.array(1), "two")
        self.assertEqual(z.item(), 3)

        # Test nested constant
        @partial(mx.compile)
        def fun(x, y):
            if y[0][0] == 1:
                return x + 1
            else:
                return x + 2

        z = fun(mx.array(1), [[1]])
        self.assertEqual(z.item(), 2)

        z = fun(mx.array(1), [[0]])
        self.assertEqual(z.item(), 3)

        # Two list constants whose lengths differ between the calls
        @partial(mx.compile)
        def fun(x, a, b):
            for ai in a:
                for bi in b:
                    x = bi * x + ai
            return x

        z = fun(mx.array(1), [1, 1], [2])
        self.assertEqual(z.item(), 7)

        z = fun(mx.array(1), [1], [1, 2])
        self.assertEqual(z.item(), 5)

        # Count how many times the Python body actually runs
        counter = [0]

        @partial(mx.compile)
        def fun(x, y):
            counter[0] += 1
            return x + y

        z = fun(mx.array(1), 1)
        self.assertEqual(z.item(), 2)

        # Array and constant swap positions
        z = fun(1, mx.array(1))
        self.assertEqual(z.item(), 2)

        # The body ran once for each of the two calls above
        self.assertEqual(counter[0], 2)

        y = 1.0

        # The result depends only on the enclosed y; `constant` is unused in
        # the body and serves to probe when the function is re-traced.
        @mx.compile
        def fun(x, constant):
            return x + y

        constant1 = "abc"
        out = fun(mx.array(0.0), constant1)
        self.assertEqual(out, mx.array(1.0))

        # new object, same value, no recompilation
        y = 2.0
        constant2 = "abc".encode("utf-8").decode("utf-8")
        out = fun(mx.array(0.0), constant2)
        self.assertEqual(out, mx.array(1.0))

        # new constant value, recompilation (the updated y is picked up)
        constant2 = "xyz"
        out = fun(mx.array(0.0), constant2)
        self.assertEqual(out, mx.array(2.0))

    def test_compile_inf(self):
        # isinf on a finite value inside a compiled function.
        @mx.compile
        def fun(x):
            return mx.isinf(x + 2)

        finite_input = mx.array([0.0])
        flag = fun(finite_input)
        self.assertEqual(flag.item(), False)

    def test_unsupported_input_types(self):
        # Passing an instance of an arbitrary user class to a compiled
        # function raises ValueError, positionally or by keyword.
        class MyClass:
            value = 1

        @mx.compile
        def fun(x, y):
            return x + y.value

        with self.assertRaises(ValueError):
            fun(mx.array(0.0), MyClass())

        with self.assertRaises(ValueError):
            fun(mx.array(0.0), y=MyClass())

    def test_compile_create_list(self):
        # A compiled function with no inputs that returns a list of arrays.
        @mx.compile
        def fun():
            return [0.1 * mx.zeros((2,)), 0.1 * mx.zeros((2,))]

        produced = fun()
        mx.eval(produced)

    def test_compile_vjp(self):
        # Compiling a step function that itself calls vjp / value_and_grad
        # must give the same results as running it eagerly.
        def fun(w):
            w1 = w + w
            w2 = w + w
            return w @ w1 + w2 @ w2

        def step(w):
            cotangent = mx.array([[1.0, 1.0], [1.0, 1.0]])
            out, grad = mx.vjp(fun, (w,), (cotangent,))
            return out[0], grad[0]

        w = mx.zeros((2, 2))
        mx.eval(w)

        eager = step(w)
        compiled = mx.compile(step)(w)
        self.assertTrue(mx.allclose(eager[0], compiled[0]))
        self.assertTrue(mx.allclose(eager[1], compiled[1]))

        def loss(w1, w2, x):
            x = x @ w1
            y = x @ w2
            x = x + y * y
            return (x * x).sum()

        w1 = mx.zeros((4, 4))
        w2 = mx.zeros((4, 4))
        x = mx.zeros((4, 4))

        def update(w1, w2, x):
            # Gradient with respect to the first argument only
            value, gradient = mx.value_and_grad(loss)(w1, w2, x)
            w1 = w1 + gradient
            return value, w1

        mx.eval(x, w1, w2)
        eager = update(w1, w2, x)
        compiled = mx.compile(update)(w1, w2, x)

        self.assertTrue(mx.allclose(eager[0], compiled[0]))
        self.assertTrue(mx.allclose(eager[1], compiled[1]))

    def test_shapeless_mean(self):
        # mean with keepdims under regular and shapeless compile.
        def mean(x):
            return mx.mean(x, keepdims=True)

        cfun = mx.compile(mean)
        out = cfun(mx.ones((5, 5)))
        self.assertTrue(mx.allclose(out, mx.array(1.0)))

        cmean = mx.compile(mean, shapeless=True)

        # One shapeless function, several input lengths
        for length in (2, 4, 7):
            x = mx.ones(length)
            out = cmean(x)
            self.assertTrue(mx.allclose(out, mean(x)))

    def test_compile_broadcast_only(self):
        # Scalar broadcast to shape (1,) and added to itself.
        def fn(a):
            a = mx.broadcast_to(a, (1,))
            return a + a

        compiled_fn = mx.compile(fn)
        out = compiled_fn(mx.array(2.0))
        # Make sure repr can be called
        self.assertTrue(repr(out) is not None)
        self.assertTrue(mx.array_equal(out, mx.array([4.0])))

    def test_compile_with_long_name(self):
        # A chain of twenty subtractions followed by one add.
        def fn(a, b):
            for _ in range(10):
                a = a - 1.0
                b = b - 1.0
            return a + b

        compiled_fn = mx.compile(fn)
        result = compiled_fn(mx.array(10.0), mx.array(20.0))
        self.assertEqual(result.item(), 10.0)

    def test_compile_multi_output(self):
        # Function returning a list of intermediates plus a reduction of the
        # last one; only the reduction is inspected.
        def fn(x):
            ys = [x]
            for i in range(5):
                ys.append(ys[-1] + x)
            return ys, mx.sum(ys[-1])

        x = mx.ones(1, dtype=mx.int32)
        _, compiled_total = mx.compile(fn)(x)
        _, eager_total = fn(x)
        self.assertEqual(compiled_total.item(), eager_total.item())
        self.assertEqual(compiled_total.item(), 6)

    def test_inf_constant(self):
        # isinf + where on a bfloat16 input containing an infinity.
        def fn(x):
            return mx.where(mx.isinf(x), 0, 1)

        x = mx.array([0, float("inf"), 1], dtype=mx.bfloat16)
        compiled_out = mx.compile(fn)(x)
        eager_out = fn(x)
        self.assertTrue(mx.array_equal(compiled_out, eager_out))

    def test_max_into_equal(self):
        # A max reduction whose result feeds directly into an equality test.
        x = mx.random.uniform(shape=(1, 2, 2))
        mx.eval(x)

        def fn():
            maxes = mx.max(x, axis=(1, 2), keepdims=True)
            return x == maxes

        compiled_mask = mx.compile(fn)()
        eager_mask = fn()
        self.assertTrue(mx.array_equal(eager_mask, compiled_mask))

    def test_dtypes(self):
        # Simple arithmetic compiled for bool and small integer dtypes.
        x = mx.array([0, 1, 2, 3])
        for dtype in (mx.bool_, mx.int8, mx.uint8, mx.int16, mx.uint16):
            # Each cast is applied to the previous iteration's array
            x = x.astype(dtype)
            mx.eval(x)

            def fn(x):
                return x * 1 + 0

            compiled_out = mx.compile(fn)(x)
            eager_out = fn(x)
            self.assertTrue(mx.array_equal(eager_out, compiled_out))

    def test_compile_without_captured_inputs(self):
        # Both scenarios expect ValueError when the compiled function reads an
        # enclosed array that is itself the result of a pending operation.
        x = mx.array([1, 2, 3]) + 2

        def fn(a):
            # Reads the enclosing x, which is the result of an add
            y = x + 1
            return a + y

        with self.assertRaises(ValueError):
            y = mx.compile(fn)(x)

        x = mx.array([1.0, 2.0]) + mx.array([1.0, 2.0])
        y = None

        def fn(x):
            nonlocal y
            if y is None:
                y = mx.array([1.0, 2.0])

            y = y + x
            return y

        # The eager call leaves the enclosing y bound to the result of an add
        fn(x)
        with self.assertRaises(ValueError):
            y = mx.compile(fn)(x)

    def test_compile_dynamic_dims(self):
        # Ten-dimensional inputs, one of them transposed.
        shape = (2,) * 10
        a = mx.random.uniform(shape=shape).T
        b = mx.random.uniform(shape=shape)
        mx.eval(a, b)

        def fn(a, b):
            return mx.abs(a + b)

        compiled_out = mx.compile(fn)(a, b)
        eager_out = fn(a, b)
        self.assertTrue(mx.allclose(compiled_out, eager_out))

    def test_compile_many_inputs(self):
        # Compiled functions with a large number of array inputs.
        inputs = [mx.ones((2, 2, 2, 2)) for _ in range(20)]
        inputs[0] = inputs[0].T

        @mx.compile
        def two_chain_sum(*inputs):
            # Two separate running sums (first ten / last ten), then combined
            head = inputs[0]
            for term in inputs[1:10]:
                head = head + term
            tail = inputs[10]
            for term in inputs[11:]:
                tail = tail + term
            return head + tail

        out = two_chain_sum(*inputs)
        self.assertTrue(mx.allclose(out, mx.full((2, 2), 20)))

        @mx.compile
        def pairwise_reduce(arrs):
            # Six rounds of adding neighbours: 64 arrays collapse to one
            for _ in range(6):
                arrs = [x + y for x, y in zip(arrs[::2], arrs[1::2])]
            return arrs[0]

        arrs = [mx.array([1.0, 2.0]) for _ in range(64)]
        out = pairwise_reduce(arrs)
        self.assertTrue(mx.allclose(out, mx.array([64.0, 128.0])))

        inputs = [mx.arange(16384).astype(mx.float16) for _ in range(8)]

        def paired_product(inputs):
            a = inputs[0] + inputs[1]
            b = inputs[2] + inputs[3]
            c = inputs[4] + inputs[5]
            d = inputs[6] + inputs[7]
            return a * b * c * d

        out = mx.compile(paired_product)(inputs)
        expected = paired_product(inputs)
        self.assertTrue(mx.allclose(out, expected))

    def test_compile_many_outputs(self):
        # Returns the fully reduced array together with the list produced by
        # the first reduction round.
        @mx.compile
        def fun(arr):
            level = [arr] * 64
            first_level = None
            for _ in range(6):
                level = [x + y for x, y in zip(level[::2], level[1::2])]
                if first_level is None:
                    first_level = level
            return level[0], first_level

        result = fun(mx.array([1.0, 2.0]))
        self.assertTrue(mx.allclose(result[0], mx.array([64.0, 128.0])))

    def test_shapeless_compile_matmul(self):
        # Vector @ vector under shapeless compile.
        a = mx.array([0.0, 1.0, 2.0])
        b = mx.array([0.0, 1.0, 2.0])

        fun = mx.compile(lambda a, b: a @ b, shapeless=True)
        compiled_product = fun(a, b)
        self.assertTrue(mx.allclose(compiled_product, a @ b))

    def test_shapeless_compile_slice_update(self):
        # Index assignment inside a shapeless compiled function, checked on
        # two input lengths against the eager function.
        def fun(x):
            x[2] = mx.array([3.0])
            return x

        cfun = mx.compile(fun, shapeless=True)

        a = mx.array([0.0, 1.0, 2.0, 3.0])
        self.assertTrue(mx.allclose(cfun(a), fun(a)))

        # Longer input, same compiled function
        a = mx.array([0.0, 1.0, 2.0, 3.0, 4.0])
        self.assertTrue(mx.allclose(cfun(a), fun(a)))

    def test_shapeless_compile_with_reshape(self):
        # Reshape whose target is computed from the traced input's shape.
        def fun(x):
            return x.reshape(x.shape[0] * x.shape[1], -1)

        compiled_fun = mx.compile(fun, shapeless=True)

        # Same leading dimensions, different trailing dimension
        for in_shape, out_shape in (((2, 3, 4), (6, 4)), ((2, 3, 8), (6, 8))):
            out = compiled_fun(mx.zeros(shape=in_shape))
            self.assertEqual(out.shape, out_shape)

        # Different leading dimensions are expected to raise
        incompatible = mx.zeros(shape=(5, 5, 5))

        with self.assertRaises(ValueError):
            compiled_fun(incompatible)

    def test_compile_shapeless_with_broadcast(self):
        # Broadcasting operations (broadcast_to, broadcast_arrays, batched
        # matmul) under shapeless compile, including through mx.grad.
        a = mx.array(0.0)
        b = mx.ones((2, 2))

        def fun(a):
            # Target shape comes from the enclosing b, not from the input
            return mx.broadcast_to(a, b.shape)

        cfun = mx.compile(fun, shapeless=True)
        # Works on the first shape
        cfun(a)

        # Fails on a different shape
        with self.assertRaises(ValueError):
            cfun(mx.array(0.0).reshape(1, 1, 1))

        def fun(a, b):
            return mx.broadcast_arrays(a, b)

        cfun = mx.compile(fun, shapeless=True)
        # a and b are rebound to the broadcast results here
        a, b = cfun(a, b)
        self.assertEqual(a.shape, (2, 2))
        self.assertEqual(b.shape, (2, 2))

        # Batched matmul
        a = mx.zeros((2, 1, 4, 2))
        b = mx.zeros((3, 2, 5))

        def fun(a, b):
            return a @ b

        cfun = mx.compile(fun, shapeless=True)
        out = cfun(a, b)
        self.assertEqual(out.shape, (2, 3, 4, 5))

        # Shapeless compile should be preserved over vjp, jvp, vmap
        def fun(args):
            return sum(args).sum()

        a = mx.array(0.0)
        b = mx.ones((2, 2))

        cfun = mx.compile(mx.grad(fun), shapeless=True)
        out = cfun((a, b))

        self.assertEqual(out[0].shape, ())
        self.assertEqual(out[1].shape, (2, 2))

        # Same compiled function with the argument shapes swapped
        out = cfun((b, a))

        self.assertEqual(out[0].shape, (2, 2))
        self.assertEqual(out[1].shape, ())

        # Same property, this time with a batched matmul inside the grad
        def fun(args):
            return (args[0] @ args[1]).sum()

        a = mx.zeros((2, 1, 4, 2))
        b = mx.zeros((3, 2, 5))

        cfun = mx.compile(mx.grad(fun), shapeless=True)
        out = cfun((a, b))

        self.assertEqual(out[0].shape, (2, 1, 4, 2))
        self.assertEqual(out[1].shape, (3, 2, 5))

        # Different batch sizes, same compiled function
        a = mx.zeros((3, 1, 4, 2))
        b = mx.zeros((2, 2, 5))

        out = cfun((a, b))

        self.assertEqual(out[0].shape, (3, 1, 4, 2))
        self.assertEqual(out[1].shape, (2, 2, 5))

    def test_leaks(self):
        # Active memory before and after repeatedly building a dict that holds
        # both a compiled function and an array must be equal. When Metal is
        # unavailable both readings are 0, so the final check is trivially true.
        gc.collect()
        if mx.metal.is_available():
            mem_pre = mx.get_active_memory()
        else:
            mem_pre = 0

        def outer():
            d = {}

            def f(x):
                return d["x"]

            # d holds the compiled f while f closes over d
            d["f"] = mx.compile(f)
            d["x"] = mx.array([0] * 1000)

        for _ in range(5):
            outer()
            gc.collect()

        if mx.metal.is_available():
            mem_post = mx.get_active_memory()
        else:
            mem_post = 0

        self.assertEqual(mem_pre, mem_post)

    def test_double_constant(self):
        # float64 input combined with Python float constants on the CPU
        # stream: eager and compiled results must be exactly equal.
        with mx.stream(mx.cpu):
            x = mx.array(1.0, dtype=mx.float64)

            def fun(x):
                return (x + math.pi) * 2.0

            eager_value = fun(x).item()
            compiled_value = mx.compile(fun)(x).item()
            self.assertEqual(eager_value, compiled_value)

    def test_shared_broadcast(self):
        # A broadcast result that is used by both outputs of the function.
        def fun(x, y, z):
            yy = mx.broadcast_to(y, z.shape)
            return (x + yy * z), yy.sum()

        a = mx.random.normal((10, 10))
        b = mx.array(0.1)
        c = mx.random.normal((10, 10))
        mx.eval(a, b, c)
        fc = mx.compile(fun)
        d = fc(a, b, c)

        # Export the graph and look for the expected primitive name
        dot_buffer = StringIO()
        mx.export_to_dot(dot_buffer, a=a, b=b, c=c, d1=d[0], d2=d[1])
        dot_text = dot_buffer.getvalue()

        self.assertTrue("CompiledBroadcastMultiplyAdd" in dot_text)

        # Numerical agreement with the eager function
        d_hat = fun(a, b, c)
        self.assertTrue(mx.allclose(d[0], d_hat[0]))
        self.assertTrue(mx.allclose(d[1], d_hat[1]))

    def test_compile_large_graph_with_broadcasts(self):
        # Many enclosed scalar arrays multiplied against a (2, 2) input.
        N = 20
        _as = [mx.array(2 * i, dtype=mx.float32) for i in range(N)]
        _bs = [mx.array(i, dtype=mx.float32) for i in range(N)]
        _c = mx.array(0.0)
        x = mx.random.normal((2, 2))

        def f(x):
            acc = 0
            for i in range(N):
                acc = acc + _as[i] * x * _bs[i] * _c
            return acc

        ref = f(x)
        mx.eval(ref)

        compiled_f = mx.compile(f)
        # Run the compiled function twice; the second result is compared
        for _ in range(2):
            y = compiled_f(x)
            mx.eval(y)

        self.assertTrue(mx.allclose(y, ref))

    def test_wrap_compiled(self):
        # functools.wraps must accept a compiled function as the wrapped
        # object; nothing is called, the decoration itself is the check.
        @mx.compile
        def inner():
            return None

        @wraps(inner)
        def wrapper():
            return None

    def test_compiled_preserves_attributes(self):
        # The compiled wrapper must expose the same name, qualified name,
        # docstring and signature as the function it wraps.
        def inner(x: mx.array, y: str):
            """
            A useful function.
            """
            pass

        c_inner = mx.compile(inner)
        for attribute in ("__name__", "__qualname__", "__doc__"):
            self.assertEqual(getattr(inner, attribute), getattr(c_inner, attribute))
        self.assertEqual(inspect.signature(inner), inspect.signature(c_inner))

    def test_compile_with_none(self):
        # None is accepted as an argument and selects a different branch
        # than an array argument does.
        @mx.compile
        def fun(x, y):
            if y is not None:
                return mx.abs(x + y)
            return mx.abs(x - 2.0)

        without_y = fun(mx.array(1.0), None)
        self.assertEqual(without_y.item(), 1.0)

        with_y = fun(mx.array(1.0), mx.array(2.0))
        self.assertEqual(with_y.item(), 3.0)

    def test_compile_changing_outputs(self):
        # One compiled function whose output structure (array, list, dict or
        # None) depends on the kind of arguments it receives.
        @mx.compile
        def fun(x, y):
            if y is None:
                return 2 * x
            elif (
                isinstance(x, mx.array)
                and isinstance(y, mx.array)
                and x.dtype == y.dtype == mx.float32
            ):
                return [x + y]
            elif y.dtype == mx.bool_:
                return {"a": x, "b": y * x}
            else:
                return None

        # Two float32 arrays -> single-element list
        as_list = fun(mx.array(1.0), mx.array(2.0))
        self.assertTrue(isinstance(as_list, list))
        self.assertEqual(as_list[0].item(), 3.0)

        # Boolean second argument -> dict
        as_dict = fun(mx.array(1.0), mx.array(True))
        self.assertTrue(isinstance(as_dict, dict))
        self.assertEqual(as_dict["a"].item(), 1.0)
        self.assertEqual(as_dict["b"].item(), 1.0)

        # None second argument -> plain array
        as_array = fun(mx.array(1.0), None)
        self.assertTrue(isinstance(as_array, mx.array))
        self.assertEqual(as_array.item(), 2.0)

        # Non-array first argument -> None
        as_none = fun(False, mx.array(1.0))
        self.assertTrue(as_none is None)

    def test_compile_changing_outputs_with_state(self):
        """Captured state stays correct while the output structure alternates."""
        state = [mx.array(1.0)]

        @partial(mx.compile, inputs=state, outputs=state)
        def fun(y):
            current = state[0]
            if y.dtype == mx.float32:
                state[0] = 2 * y
                return [current, y, current + y]
            elif y.dtype == mx.int32:
                state[0] *= 2
                return current + y

        # Alternate a float argument with an integer argument.
        for _ in range(10):
            fun(mx.array(1.0))
            fun(mx.array(1))

        self.assertEqual(state[0].item(), 4)

    def test_outputs_changing(self):
        """Two compiled functions share a body but return different outputs.

        ``fun`` returns both intermediate values while ``fun2`` returns only the
        last one; ``fun2`` must still produce the correct value after ``fun``
        has been compiled and evaluated.
        """

        @mx.compile
        def fun(x):
            x = mx.abs(mx.negative(x))
            y = mx.abs(x)
            return x, y

        @mx.compile
        def fun2(x):
            x = mx.abs(mx.negative(x))
            y = mx.abs(x)
            return y

        a, b = fun(mx.array(-1.0))
        mx.eval(a, b)

        # Same computation as ``fun``, but only the final value is an output.
        a = fun2(mx.array(-1.0))
        self.assertEqual(a.item(), 1.0)

    def test_multiple_compile_same_capture(self):
        """Two compiled lambdas capture arrays derived from the same intermediate.

        ``b`` and ``c`` are both computed from ``u``; each is captured by a
        separately compiled lambda. The compiled result must match the result
        of the same computation done without ``mx.compile``.
        """

        def fun(do_compile):
            t = mx.ones((10,))
            u = (1.0 - t) * 0.0 + t * 3.0

            o = mx.ones((6,))
            b = o[:, None] * u

            c = b * mx.ones_like(u)

            a = mx.ones((6,))
            if do_compile:
                # ``b`` and ``c`` are closed over, not passed as arguments.
                d = mx.compile(lambda x: x @ b)(a)
                e = mx.compile(lambda x: x @ c.T)(d)
            else:
                d = a @ b
                e = d @ c.T
            return e

        out = fun(True)
        mx.eval(out)
        # Reference value from the uncompiled path.
        expected = fun(False)
        self.assertTrue(mx.allclose(out, expected))

    def test_compile_types(self):
        """Compiled functions return the same container type as the original.

        Checks a plain ``tuple``, a ``NamedTuple`` and a ``tuple`` subclass, and
        verifies the element values in each case.
        """
        from typing import NamedTuple

        class Vector(tuple):
            pass

        class State(NamedTuple):
            a: mx.array
            b: mx.array

        def transform(x: State):
            return State(x.a + 10, x.b * 10)

        def transform_tuple(t):
            return (t[0] + 10, t[1] * 10)

        def transform_vector(t):
            return Vector([t[0] + 10, t[1] * 10])

        def assert_values(actual, values):
            # Element-wise equality against a freshly built reference array.
            self.assertTrue(mx.array_equal(actual, mx.array(values)))

        compiled_transform = mx.compile(transform)
        compiled_transform_tuple = mx.compile(transform_tuple)
        compiled_transform_vector = mx.compile(transform_vector)

        # Plain tuple in, plain tuple out.
        out1 = compiled_transform_tuple((mx.array([1, 2, 3]), mx.array([4, 5, 6])))
        self.assertIsInstance(out1, tuple)
        assert_values(out1[0], [11, 12, 13])
        assert_values(out1[1], [40, 50, 60])

        # NamedTuple in, NamedTuple out (fields accessed by name).
        out2 = compiled_transform(State(mx.array([1, 2, 3]), mx.array([4, 5, 6])))
        self.assertIsInstance(out2, State)
        assert_values(out2.a, [11, 12, 13])
        assert_values(out2.b, [40, 50, 60])

        # tuple subclass in, same subclass out.
        out3 = compiled_transform_vector(
            Vector([mx.array([1, 2, 3]), mx.array([4, 5, 6])])
        )
        self.assertIsInstance(out3, Vector)
        assert_values(out3[0], [11, 12, 13])
        assert_values(out3[1], [40, 50, 60])

    def test_compile_output_with_siblings(self):
        """Compile functions that use only part of a multi-output computation.

        First a compiled function returns just the first result of
        ``mx.divmod``; then a compiled value-and-grad step through
        ``mx.fast.layer_norm`` is evaluated.
        """

        @mx.compile
        def fun(x, y):
            quotient, _ = mx.divmod(mx.abs(x), mx.abs(y))
            return quotient

        result = fun(mx.array(1.0), mx.array(1.0))
        self.assertEqual(result.item(), 1.0)

        # Make sure the following compiles without issue
        table = mx.zeros((10, 32))
        weight = mx.zeros((32,))
        indices = mx.zeros(shape=(4, 32), dtype=mx.int32)
        mx.eval(indices, table, weight)

        def loss_fn(params, x):
            emb, w = params
            return mx.fast.layer_norm(emb[x], w, None, 1e-4).sum()

        loss_and_grad_fn = mx.value_and_grad(loss_fn)

        @mx.compile
        def step(emb, w, x):
            loss, grads = loss_and_grad_fn((emb, w), x)
            return loss, grads

        loss, grads = step(table, weight, indices)
        mx.eval(loss, grads)

    def test_compile_donates_input_buffer(self):
        """The compiled output should live at the input's data address.

        The input is evaluated, its data pointer recorded, and the only Python
        reference to it dropped before the output is evaluated; the output's
        data pointer is then expected to equal the recorded one.
        """
        mx.set_default_device(mx.cpu)

        def fun(x):
            return mx.sin(x) + 1

        compiled_fn = mx.compile(fun)

        input = mx.arange(16, dtype=mx.float32)
        mx.eval(input)
        # Address of the input's data, read through NumPy's array interface
        # without copying.
        in_ptr = np.asarray(input, copy=False).__array_interface__["data"][0]

        out = compiled_fn(input)
        del input  # Ensure the reference is dropped
        mx.eval(out)

        self.assertEqual(
            np.asarray(out, copy=False).__array_interface__["data"][0], in_ptr
        )


# Run this module's tests through ``mlx_tests.MLXTestRunner`` when executed as a script.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_constants.py
================================================
# Copyright © 2023 Apple Inc.

import unittest

import mlx.core as mx
import mlx_tests
import numpy as np


class TestConstants(mlx_tests.MLXTestCase):
    """Checks for the constants exposed as attributes of ``mlx.core``."""

    def test_constants_values(self):
        """Each constant holds its expected value."""
        # Check if mlx constants match expected values
        float_constants = (
            (mx.e, 2.71828182845904523536028747135266249775724709369995),
            (mx.euler_gamma, 0.5772156649015328606065120900824024310421),
            (mx.inf, float("inf")),
            (mx.pi, 3.1415926535897932384626433),
        )
        for actual, reference in float_constants:
            self.assertAlmostEqual(actual, reference)
        self.assertTrue(np.isnan(mx.nan))
        self.assertIsNone(mx.newaxis)

    def test_constants_availability(self):
        """Every constant is reachable as an attribute of ``mx``."""
        # Check if mlx constants are available
        for name in ("e", "euler_gamma", "inf", "nan", "newaxis", "pi"):
            self.assertTrue(hasattr(mx, name))

    def test_newaxis_for_reshaping_arrays(self):
        """Indexing with ``mx.newaxis`` turns a 1-D array into a column."""
        flat = mx.array([1, 2, 3, 4, 5])
        column = flat[:, mx.newaxis]
        self.assertTrue(
            mx.array_equal(column, mx.array([[1], [2], [3], [4], [5]]))
        )


# Run this module's tests through ``mlx_tests.MLXTestRunner`` when executed as a script.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_conv.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
import unittest
from itertools import permutations

import mlx.core as mx
import mlx_tests
import numpy as np

# Torch is an optional dependency; ``has_torch`` records whether it imported.
try:
    import torch
    import torch.nn.functional as F
except ImportError:
    has_torch = False
else:
    has_torch = True


class TestConv(mlx_tests.MLXTestCase):
    def test_numpy_conv(self):
        """Compare ``mx.convolve`` with ``np.convolve`` for every mode.

        The cases cover both the first operand being longer than the second
        and the reverse, in float16 and float32.
        """
        # (len(a), len(v), mode)
        cases = (
            (1, 1, "full"),
            (25, 5, "full"),
            (24, 5, "same"),
            (24, 4, "same"),
            (24, 4, "valid"),
            (4, 24, "full"),
            (5, 25, "same"),
            (4, 25, "valid"),
        )
        for dtype in ("float16", "float32"):
            np_dtype = getattr(np, dtype)
            tolerance = 1e-6 if dtype == "float32" else 1e-5
            for M, N, mode in cases:
                with self.subTest(dtype=dtype, M=M, N=N, mode=mode):
                    signal = np.random.rand(M).astype(np_dtype)
                    kernel = np.random.rand(N).astype(np_dtype)

                    expected = np.convolve(signal, kernel, mode=mode)
                    actual = mx.convolve(
                        mx.array(signal), mx.array(kernel), mode=mode
                    )

                    self.assertEqual(actual.shape, expected.shape)
                    self.assertTrue(np.allclose(actual, expected, atol=tolerance))

    def test_conv_1d_groups_flipped(self):
        """``mx.conv_general`` with ``flip=True`` and ``groups=2`` on a fixed input."""
        ramp5 = mx.arange(5).astype(mx.float32)
        ramp4 = mx.arange(4).astype(mx.float32)
        # The same ramp is repeated for both channels / both filters.
        signal = mx.broadcast_to(ramp5, (2, 5)).T
        kernel = mx.broadcast_to(ramp4, (2, 4))

        result = mx.conv_general(signal[None], kernel[..., None], flip=True, groups=2)

        reference = mx.array([4.0, 4.0, 10.0, 10.0]).reshape(1, 2, 2)
        self.assertTrue(mx.allclose(result, reference))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_1D(self):
        """Check ``mx.conv1d`` against ``torch.conv1d``.

        Covers ungrouped and grouped convolutions on freshly generated data,
        followed by inputs that have been passed through ``mx.transpose``.
        The NumPy data is generated as ``(N, iH, C)`` / ``(O, kH, C // groups)``
        and transposed with ``(0, 2, 1)`` before being handed to Torch; the
        Torch output is transposed back before comparison.
        """

        def run_conv1D(
            N,
            C,
            O,
            iH,
            kH,
            stride,
            padding,
            dilation=1,
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            """Run one configuration and compare shapes and values."""
            with self.subTest(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                iH=iH,
                kH=kH,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            ):
                np_dtype = getattr(np, dtype)
                # Reseed so every configuration sees reproducible data.
                np.random.seed(0)
                in_np = np.random.normal(0, 1.0 / C, (N, iH, C)).astype(np_dtype)
                wt_np = np.random.normal(0, 1.0 / C, (O, kH, C // groups)).astype(
                    np_dtype
                )

                in_mx, wt_mx = map(mx.array, (in_np, wt_np))
                in_pt, wt_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 2, 1)), (in_np, wt_np)
                )

                out_mx = mx.conv1d(
                    in_mx,
                    wt_mx,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.conv1d(
                    in_pt,
                    wt_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.transpose(out_pt, 2, 1)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt.numpy(), out_mx, atol=atol))

        # (iH, kH, stride, padding)
        spatial_cases = (
            (1, 1, 1, 0),
            (3, 3, 1, 0),
            (31, 5, 5, 2),
        )

        for dtype in ("float32",):
            for N, C, O in (
                (1, 1, 1),
                (1, 6, 1),
                (1, 1, 6),
                (4, 32, 64),
            ):
                for iH, kH, stride, padding in spatial_cases:
                    run_conv1D(N, C, O, iH, kH, stride, padding, dtype=dtype)

            # Groups tests. Kept inside the dtype loop so ``dtype`` is bound
            # explicitly rather than leaking out of a finished loop.
            N, C, O = (4, 32, 64)
            for iH, kH, stride, padding in spatial_cases:
                for group in (1, 2, 4, 8, 16, 32):
                    run_conv1D(
                        N, C, O, iH, kH, stride, padding, groups=group, dtype=dtype
                    )

        # Strided inputs tests
        for tpose_in, tpose_wt in (
            ((0, 2, 1), (0, 1, 2)),
            ((0, 2, 1), (0, 2, 1)),
        ):
            with self.subTest(name="strided", tpose_in=tpose_in, tpose_wt=tpose_wt):
                in_np = np.random.normal(0, 1.0 / 16, (16, 16, 16)).astype(np.float32)
                wt_np = np.random.normal(0, 1.0 / 16, (16, 16, 16)).astype(np.float32)

                in_mx, wt_mx = map(mx.array, (in_np, wt_np))
                in_mx_t = mx.transpose(in_mx, tpose_in)
                wt_mx_t = mx.transpose(wt_mx, tpose_wt)
                out_mx = mx.conv1d(in_mx_t, wt_mx_t)

                # Apply the same permutation in NumPy, then convert to the
                # axis order used for the Torch call.
                in_pt, wt_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 2, 1)),
                    (in_np.transpose(tpose_in), wt_np.transpose(tpose_wt)),
                )

                out_pt = torch.conv1d(in_pt, wt_pt)
                out_pt = torch.transpose(out_pt, 2, 1)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt.numpy(), out_mx, atol=1e-5))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_1D_grad(self):
        """Compare the ``mx.conv1d`` VJPs with ``torch.nn.functional.grad``.

        Both the gradient with respect to the input and the gradient with
        respect to the weight are checked for shape and value.
        """

        def run_conv1D_grad(
            N,
            C,
            O,
            iH,
            kH,
            stride,
            padding,
            dilation=1,
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            # Convolution options shared by the MLX and Torch calls.
            conv_kwargs = dict(
                stride=stride, padding=padding, dilation=dilation, groups=groups
            )
            with self.subTest(dtype=dtype, N=N, C=C, O=O, iH=iH, kH=kH, **conv_kwargs):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                # Output length, needed for the shape of the cotangent.
                oH = 1 + ((iH + 2 * padding - dilation * (kH - 1) - 1) // stride)

                def sample(shape):
                    return np.random.normal(0, 1.0 / C, shape).astype(np_dtype)

                # Draw order matters for reproducibility: input, weight, cotangent.
                in_np = sample((N, iH, C))
                wt_np = sample((O, kH, C))
                ct_np = sample((N, oH, O))
                arrays_np = (in_np, wt_np, ct_np)

                in_mx, wt_mx, ct_mx = (mx.array(a) for a in arrays_np)
                in_pt, wt_pt, ct_pt = (
                    torch.from_numpy(a.transpose(0, 2, 1)) for a in arrays_np
                )

                _, (mx_grad_in, mx_grad_wt) = mx.vjp(
                    lambda a, b: mx.conv1d(a, b, **conv_kwargs),
                    [in_mx, wt_mx],
                    [ct_mx],
                )

                pt_grad_in = F.grad.conv1d_input(
                    in_pt.shape, wt_pt, ct_pt, **conv_kwargs
                )
                pt_grad_wt = F.grad.conv1d_weight(
                    in_pt, wt_pt.shape, ct_pt, **conv_kwargs
                )
                pt_grad_in = torch.transpose(pt_grad_in, 2, 1).numpy()
                pt_grad_wt = torch.transpose(pt_grad_wt, 2, 1).numpy()

                # (Torch reference, MLX primal, MLX gradient)
                for reference, primal, grad in (
                    (pt_grad_in, in_mx, mx_grad_in),
                    (pt_grad_wt, wt_mx, mx_grad_wt),
                ):
                    self.assertEqual(reference.shape, grad.shape)
                    self.assertEqual(primal.shape, grad.shape)
                    self.assertTrue(np.allclose(reference, grad, atol=atol))

        channel_cases = (
            (1, 1, 1),
            (1, 6, 1),
            (1, 1, 6),
            (4, 32, 64),
        )
        # (iH, kH, stride, padding)
        spatial_cases = (
            (1, 1, 1, 0),
            (3, 3, 1, 0),
            (31, 5, 5, 2),
        )
        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for iH, kH, stride, padding in spatial_cases:
                    run_conv1D_grad(N, C, O, iH, kH, stride, padding, dtype=dtype)

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_2D(self):
        """Check ``mx.conv2d`` against ``torch.conv2d`` in float32 and bfloat16.

        Data is generated in NumPy as ``(N, iH, iW, C)`` / ``(O, kH, kW, C // groups)``,
        cast to the dtype under test on each side, and the outputs are compared
        in float32. Only the absolute tolerance depends on the dtype; the
        relative tolerance is ``np.allclose``'s default.
        """

        def run_conv2D(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1),
            groups=1,
            dtype="float32",
        ):
            """Run one configuration and compare shapes and values."""
            with self.subTest(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            ):
                np.random.seed(0)
                iH, iW = idim
                kH, kW = kdim
                scale = 1.0 / math.sqrt(kH * kW * C)
                in_np = np.random.normal(0.0, scale, (N, iH, iW, C))
                wt_np = np.random.normal(0.0, 1.0, (O, kH, kW, int(C / groups)))

                mx_dtype = getattr(mx, dtype)
                torch_dtype = getattr(torch, dtype)
                in_mx, wt_mx = map(
                    lambda x: mx.array(x).astype(mx_dtype), (in_np, wt_np)
                )
                # Torch receives the data with the last axis moved to position 1.
                in_pt, wt_pt = map(
                    lambda x: torch.from_numpy(x.transpose(0, 3, 1, 2))
                    .to("cpu")
                    .to(torch_dtype),
                    (in_np, wt_np),
                )

                out_mx = mx.conv2d(
                    in_mx,
                    wt_mx,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                ).astype(mx.float32)
                out_pt = torch.conv2d(
                    in_pt,
                    wt_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = (
                    torch.permute(out_pt, (0, 2, 3, 1))
                    .to(torch.float32)
                    .numpy(force=True)
                )

                self.assertEqual(out_pt.shape, out_mx.shape)
                # bfloat16 is compared with a much looser absolute tolerance.
                atol = 1e-1 if dtype == "bfloat16" else 1e-5
                self.assertTrue(np.allclose(out_pt, out_mx, atol=atol))

        # (idim, kdim, stride, padding)
        spatial_cases = (
            ((1, 1), (1, 1), (1, 1), (0, 0)),
            ((3, 3), (3, 1), (1, 1), (0, 0)),
            ((31, 31), (5, 5), (5, 5), (2, 2)),
        )

        for dtype in ("float32", "bfloat16"):
            for N, C, O in (
                (1, 1, 1),
                (1, 6, 1),
                (1, 1, 6),
                (4, 32, 64),
            ):
                for idim, kdim, stride, padding in spatial_cases:
                    run_conv2D(N, C, O, idim, kdim, stride, padding, dtype=dtype)

            # Groups tests
            N, C, O = (4, 32, 64)
            for idim, kdim, stride, padding in spatial_cases:
                for group in (1, 2, 4, 8, 16, 32):
                    run_conv2D(
                        N, C, O, idim, kdim, stride, padding, groups=group, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_2D_grad(self):
        """Compare the ``mx.conv2d`` VJPs with ``torch.nn.functional.grad``.

        Both the gradient with respect to the input and the gradient with
        respect to the weight are checked for shape and value.
        """

        def run_conv2D_grad(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            # Convolution options shared by the MLX and Torch calls.
            conv_kwargs = dict(
                stride=stride, padding=padding, dilation=dilation, groups=groups
            )
            with self.subTest(
                dtype=dtype, N=N, C=C, O=O, idim=idim, kdim=kdim, **conv_kwargs
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iH, iW = idim
                kH, kW = kdim
                scale = 1.0 / math.sqrt(kH * kW * C)

                # Output extents, needed for the shape of the cotangent.
                oH = 1 + (iH + 2 * padding[0] - dilation[0] * (kH - 1) - 1) // stride[0]
                oW = 1 + (iW + 2 * padding[1] - dilation[1] * (kW - 1) - 1) // stride[1]

                def sample(shape):
                    return np.random.normal(0.0, scale, shape).astype(np_dtype)

                # Draw order matters for reproducibility: input, weight, cotangent.
                in_np = sample((N, iH, iW, C))
                wt_np = sample((O, kH, kW, C))
                ct_np = sample((N, oH, oW, O))
                arrays_np = (in_np, wt_np, ct_np)

                in_mx, wt_mx, ct_mx = (mx.array(a) for a in arrays_np)
                in_pt, wt_pt, ct_pt = (
                    torch.from_numpy(a.transpose(0, 3, 1, 2)).to("cpu")
                    for a in arrays_np
                )

                _, (mx_grad_in, mx_grad_wt) = mx.vjp(
                    lambda a, b: mx.conv2d(a, b, **conv_kwargs),
                    [in_mx, wt_mx],
                    [ct_mx],
                )

                pt_grad_in = F.grad.conv2d_input(
                    in_pt.shape, wt_pt, ct_pt, **conv_kwargs
                )
                pt_grad_wt = F.grad.conv2d_weight(
                    in_pt, wt_pt.shape, ct_pt, **conv_kwargs
                )
                pt_grad_in = torch.permute(pt_grad_in, (0, 2, 3, 1)).numpy()
                pt_grad_wt = torch.permute(pt_grad_wt, (0, 2, 3, 1)).numpy()

                # (Torch reference, MLX primal, MLX gradient)
                for reference, primal, grad in (
                    (pt_grad_in, in_mx, mx_grad_in),
                    (pt_grad_wt, wt_mx, mx_grad_wt),
                ):
                    self.assertEqual(reference.shape, grad.shape)
                    self.assertEqual(primal.shape, grad.shape)
                    self.assertTrue(np.allclose(reference, grad, atol=atol))

        channel_cases = ((1, 1, 1), (1, 6, 1), (1, 1, 6), (4, 32, 64), (4, 16, 32))
        # (idim, kdim, stride, padding, dilation)
        spatial_cases = (
            ((1, 1), (1, 1), (1, 1), (0, 0), (1, 1)),
            ((3, 3), (3, 1), (1, 1), (0, 0), (1, 1)),
            ((31, 31), (5, 5), (5, 5), (2, 2), (1, 1)),
            ((32, 32), (3, 3), (2, 2), (1, 1), (1, 1)),
            ((31, 31), (5, 5), (5, 5), (2, 2), (3, 2)),
            ((32, 32), (3, 3), (2, 2), (1, 1), (3, 2)),
        )
        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for idim, kdim, stride, padding, dilation in spatial_cases:
                    run_conv2D_grad(
                        N, C, O, idim, kdim, stride, padding, dilation, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_3D(self):
        """Check ``mx.conv3d`` against ``torch.conv3d``, including one dilated case."""

        def run_conv3D(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            # Convolution options shared by the MLX and Torch calls.
            conv_kwargs = dict(
                stride=stride, padding=padding, dilation=dilation, groups=groups
            )
            with self.subTest(
                dtype=dtype, N=N, C=C, O=O, idim=idim, kdim=kdim, **conv_kwargs
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iD, iH, iW = idim
                kD, kH, kW = kdim
                scale = 1.0 / math.sqrt(kD * kH * kW * C)

                in_shape = (N, iD, iH, iW, C)
                wt_shape = (O, kD, kH, kW, C)
                in_np = np.random.normal(0.0, scale, in_shape).astype(np_dtype)
                wt_np = np.random.normal(0.0, 1.0, wt_shape).astype(np_dtype)

                in_mx, wt_mx = mx.array(in_np), mx.array(wt_np)
                # Torch receives the data with the last axis moved to position 1.
                in_pt, wt_pt = (
                    torch.from_numpy(a.transpose(0, 4, 1, 2, 3)).to("cpu")
                    for a in (in_np, wt_np)
                )

                out_mx = mx.conv3d(in_mx, wt_mx, **conv_kwargs)
                out_pt = torch.conv3d(in_pt, wt_pt, **conv_kwargs)
                out_pt = torch.permute(out_pt, (0, 2, 3, 4, 1)).numpy(force=True)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt, out_mx, atol=atol))

        channel_cases = (
            (1, 1, 1),
            (1, 6, 1),
            (1, 1, 6),
            (4, 16, 32),
        )
        # (idim, kdim, stride, padding)
        spatial_cases = (
            ((1, 1, 1), (1, 1, 1), (1, 1, 1), (0, 0, 0)),
            ((3, 3, 3), (3, 1, 1), (1, 1, 1), (0, 0, 0)),
            ((31, 31, 31), (5, 5, 5), (5, 5, 5), (2, 2, 2)),
        )
        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for idim, kdim, stride, padding in spatial_cases:
                    run_conv3D(N, C, O, idim, kdim, stride, padding, dtype=dtype)

            # Single extra configuration with a non-unit kernel dilation.
            run_conv3D(
                2,
                4,
                4,
                (6, 6, 6),
                (3, 1, 1),
                (1, 1, 1),
                (0, 0, 0),
                dilation=(2, 2, 2),
                dtype=dtype,
            )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_3D_grad(self):
        """Compare the ``mx.conv3d`` VJPs with ``torch.nn.functional.grad``.

        Both the gradient with respect to the input and the gradient with
        respect to the weight are checked for shape and value.
        """

        def run_conv3D_grad(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            # Convolution options shared by the MLX and Torch calls.
            conv_kwargs = dict(
                stride=stride, padding=padding, dilation=dilation, groups=groups
            )
            with self.subTest(
                dtype=dtype, N=N, C=C, O=O, idim=idim, kdim=kdim, **conv_kwargs
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iD, iH, iW = idim
                kD, kH, kW = kdim
                scale = 1.0 / math.sqrt(kD * kH * kW * C)

                # Output extents, needed for the shape of the cotangent.
                oD = 1 + (iD + 2 * padding[0] - dilation[0] * (kD - 1) - 1) // stride[0]
                oH = 1 + (iH + 2 * padding[1] - dilation[1] * (kH - 1) - 1) // stride[1]
                oW = 1 + (iW + 2 * padding[2] - dilation[2] * (kW - 1) - 1) // stride[2]

                def sample(shape):
                    return np.random.normal(0.0, scale, shape).astype(np_dtype)

                # Draw order matters for reproducibility: input, weight, cotangent.
                in_np = sample((N, iD, iH, iW, C))
                wt_np = sample((O, kD, kH, kW, C))
                ct_np = sample((N, oD, oH, oW, O))
                arrays_np = (in_np, wt_np, ct_np)

                in_mx, wt_mx, ct_mx = (mx.array(a) for a in arrays_np)
                in_pt, wt_pt, ct_pt = (
                    torch.from_numpy(a.transpose(0, 4, 1, 2, 3)).to("cpu")
                    for a in arrays_np
                )

                _, (mx_grad_in, mx_grad_wt) = mx.vjp(
                    lambda a, b: mx.conv3d(a, b, **conv_kwargs),
                    [in_mx, wt_mx],
                    [ct_mx],
                )

                pt_grad_in = F.grad.conv3d_input(
                    in_pt.shape, wt_pt, ct_pt, **conv_kwargs
                )
                pt_grad_wt = F.grad.conv3d_weight(
                    in_pt, wt_pt.shape, ct_pt, **conv_kwargs
                )
                pt_grad_in = torch.permute(pt_grad_in, (0, 2, 3, 4, 1)).numpy()
                pt_grad_wt = torch.permute(pt_grad_wt, (0, 2, 3, 4, 1)).numpy()

                # (Torch reference, MLX primal, MLX gradient)
                for reference, primal, grad in (
                    (pt_grad_in, in_mx, mx_grad_in),
                    (pt_grad_wt, wt_mx, mx_grad_wt),
                ):
                    self.assertEqual(reference.shape, grad.shape)
                    self.assertEqual(primal.shape, grad.shape)
                    self.assertTrue(np.allclose(reference, grad, atol=atol))

        channel_cases = ((1, 1, 1), (1, 6, 1), (1, 1, 6), (4, 16, 32), (4, 8, 16))
        # (idim, kdim, stride, padding, dilation)
        spatial_cases = (
            ((1, 1, 1), (1, 1, 1), (1, 1, 1), (0, 0, 0), (1, 1, 1)),
            ((3, 3, 3), (3, 1, 1), (1, 1, 1), (0, 0, 0), (1, 1, 1)),
            ((15, 15, 15), (5, 5, 5), (5, 5, 5), (2, 2, 2), (1, 1, 1)),
            ((16, 16, 16), (3, 3, 3), (2, 2, 2), (1, 1, 1), (1, 1, 1)),
            ((15, 15, 15), (5, 5, 5), (5, 5, 5), (2, 2, 2), (3, 2, 2)),
            ((16, 16, 16), (3, 3, 3), (2, 2, 2), (1, 1, 1), (3, 2, 2)),
        )
        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for idim, kdim, stride, padding, dilation in spatial_cases:
                    run_conv3D_grad(
                        N, C, O, idim, kdim, stride, padding, dilation, dtype=dtype
                    )

    def __conv_general_test(
        self,
        in_shape,
        wt_shape,
        stride=1,
        padding=0,
        kernel_dilation=1,
        input_dilation=1,
        groups=1,
        flip=False,
        np_dtype=np.float32,
        atol=1e-5,
    ):
        """Compare ``mx.conv_general`` with an equivalent Torch computation.

        Random input and weight arrays of the given shapes are generated with a
        fixed seed, run through ``mx.conv_general`` and through a Torch
        reference built from ``conv{1,2,3}d`` / ``conv_transpose{1,2,3}d``,
        and the two outputs are compared for shape and value.

        Args:
            in_shape: Shape of the input array, with channels on the last axis.
            wt_shape: Shape of the weight array, with channels on the last axis.
            stride, padding, kernel_dilation, input_dilation: An int, or one
                value per spatial dimension.
            groups: Number of convolution groups.
            flip: Whether the kernel is flipped along its spatial axes.
            np_dtype: NumPy dtype used for the generated data.
            atol: Absolute tolerance for the value comparison.
        """
        with self.subTest(
            in_shape=in_shape,
            wt_shape=wt_shape,
            stride=stride,
            padding=padding,
            kernel_dilation=kernel_dilation,
            input_dilation=input_dilation,
            groups=groups,
            flip=flip,
            np_dtype=np_dtype,
        ):
            np.random.seed(0)
            # Scale by the per-filter element count, capped at 0.3.
            scale = 1.0 / math.sqrt(np.prod(wt_shape[1:]))
            scale = min(0.3, scale)
            in_np = np.random.normal(0, scale, in_shape).astype(np_dtype)
            wt_np = np.random.normal(0, scale, wt_shape).astype(np_dtype)

            in_mx, wt_mx = map(mx.array, (in_np, wt_np))

            # Torch receives the data with the last axis moved to position 1.
            in_pt, wt_pt = map(
                lambda x: torch.from_numpy(np.moveaxis(x, -1, 1)).to("cpu"),
                (in_np, wt_np),
            )

            out_mx = mx.conv_general(
                in_mx,
                wt_mx,
                stride=stride,
                padding=padding,
                kernel_dilation=kernel_dilation,
                input_dilation=input_dilation,
                groups=groups,
                flip=flip,
            )

            def conv_general_pt(
                inp, wt, stride, padding, kernel_dilation, input_dilation, groups, flip
            ):
                """Torch reference for the general convolution."""
                C = inp.size()[1]
                ndim = inp.ndim - 2
                # Expand scalar options to one entry per spatial dimension.
                map_ints = lambda x: [x] * ndim if isinstance(x, int) else x

                stride, padding, kernel_dilation, input_dilation = map(
                    map_ints, (stride, padding, kernel_dilation, input_dilation)
                )

                torch_convt_list = (
                    F.conv_transpose1d,
                    F.conv_transpose2d,
                    F.conv_transpose3d,
                )
                torch_conv_list = (F.conv1d, F.conv2d, F.conv3d)

                conv_f = torch_conv_list[ndim - 1]
                convt_f = torch_convt_list[ndim - 1]

                if flip:
                    wt = torch.flip(wt, tuple(np.arange(2, wt.ndim)))

                # Element-wise check: comparing the list/tuple itself to 1 is
                # always False, which made this branch run unconditionally.
                if any(d != 1 for d in input_dilation):
                    # Input dilation is emulated with a per-channel transposed
                    # convolution whose kernel is a single one and whose
                    # stride is the dilation.
                    ones = torch.ones(
                        [C]
                        + [
                            1,
                        ]
                        * (ndim + 1)
                    ).to(inp.dtype)
                    inp = convt_f(inp, ones, stride=input_dilation, groups=C)

                return conv_f(
                    inp,
                    wt,
                    stride=stride,
                    padding=padding,
                    dilation=kernel_dilation,
                    groups=groups,
                )

            out_pt = conv_general_pt(
                in_pt,
                wt_pt,
                stride=stride,
                padding=padding,
                kernel_dilation=kernel_dilation,
                input_dilation=input_dilation,
                groups=groups,
                flip=flip,
            )

            # Move channels back to the last axis before comparing.
            out_pt = np.moveaxis(out_pt.numpy(), 1, -1)

            self.assertEqual(out_mx.shape, out_pt.shape)
            self.assertTrue(np.allclose(out_mx, out_pt, atol=atol))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_general(self):
        """Run the general-convolution comparison on a table of 2D settings."""
        in_shape = (2, 32, 32, 16)

        # Columns: wt_shape, stride, padding, kernel_dilation, input_dilation, flip
        settings = (
            ((32, 5, 5, 16), (1, 1), (2, 2), (2, 3), (1, 1), False),
            ((32, 5, 10, 16), (2, 3), (0, 0), (3, 2), (2, 4), False),
            ((32, 5, 10, 16), (2, 2), (3, 2), (3, 2), (2, 4), False),
            ((32, 5, 10, 16), (2, 3), (3, 2), (3, 2), (2, 5), False),
            ((32, 5, 5, 16), (2, 3), (0, 0), (3, 1), (2, 5), True),
        )

        for wt_shape, stride, padding, k_dilation, i_dilation, flip in settings:
            self.__conv_general_test(
                in_shape,
                wt_shape,
                stride,
                padding,
                k_dilation,
                i_dilation,
                flip=flip,
            )

    def test_conv_general_flip_grad(self):
        """Check the weight gradient of ``mx.conv_general`` when ``flip=True``.

        The first part compares against sums written out by hand for a 2x2
        kernel; the second compares the built-in flip with a kernel that is
        reversed manually along its spatial axes.
        """
        for dil in (1, 2):
            wt = mx.random.normal(shape=(1, 2, 2, 1))
            inp = mx.random.normal(shape=(1, 2, 2, 1))

            def flipped_conv(k):
                return mx.conv_general(
                    inp,
                    k,
                    stride=1,
                    padding=(1, 1),
                    kernel_dilation=1,
                    input_dilation=dil,
                    flip=True,
                )

            ct = mx.random.normal(shape=(1, 2 + dil, 2 + dil, 1))

            _, vjps = mx.vjp(flipped_conv, (wt,), (ct,))
            dw = vjps[0].squeeze()

            inp2d = inp.squeeze()
            ct2d = ct.squeeze()

            # Each kernel tap sees the cotangent sampled every ``dil`` steps,
            # starting either at the first or at the second position.
            head = slice(None, -1, dil)
            tail = slice(1, None, dil)
            expected = mx.array(
                [
                    [(ct2d[rows, cols] * inp2d).sum() for cols in (head, tail)]
                    for rows in (head, tail)
                ]
            )
            self.assertTrue(mx.allclose(dw, expected, rtol=1e-5, atol=1e-5))

        # Test with input dilation
        inputs = mx.random.normal((1, 14, 14, 2))
        kernel = mx.random.normal((2, 7, 7, 2))

        shared_kwargs = dict(
            stride=1,
            padding=([6, 6], [15, 15]),
            kernel_dilation=(1, 1),
            input_dilation=(16, 16),
            groups=1,
        )

        def loss_builtin_flip(k):
            return mx.conv_general(inputs, k, flip=True, **shared_kwargs).sum()

        def loss_manual_flip(k):
            # Reverse every spatial axis by gathering indices in descending order.
            for axis in range(1, k.ndim - 1):
                descending = mx.arange(k.shape[axis] - 1, -1, -1)
                k = mx.take(k, descending, axis=axis)
            return mx.conv_general(inputs, k, flip=False, **shared_kwargs).sum()

        grad = mx.grad(loss_builtin_flip)(kernel)
        expected_grad = mx.grad(loss_manual_flip)(kernel)
        self.assertTrue(mx.allclose(grad, expected_grad))

    def test_conv_groups_grad(self):
        """Grouped-convolution VJPs must match a per-group reference.

        The reference splits the input channels and the output filters into
        ``num_groups`` chunks, convolves each pair on its own and concatenates
        the results along the channel axis. Both the input and the weight
        cotangents are compared, without and with ``flip=True``.
        """

        def grouped_reference(x, w, **conv_kwargs):
            # One ungrouped convolution per group, stitched back together.
            num_groups = x.shape[-1] // w.shape[-1]
            ws = w.reshape(num_groups, -1, *w.shape[1:]).split(num_groups)
            xs = x.reshape(*x.shape[:-1], num_groups, -1).split(num_groups, axis=-2)
            return mx.concatenate(
                [
                    mx.conv_general(xg.squeeze(-2), wg.squeeze(0), **conv_kwargs)
                    for xg, wg in zip(xs, ws)
                ],
                axis=-1,
            )

        def conv1d_grouped(x, w):
            num_groups = x.shape[-1] // w.shape[-1]
            return mx.conv1d(x, w, groups=num_groups)

        def conv_general_flipped(x, w):
            num_groups = x.shape[-1] // w.shape[-1]
            return mx.conv_general(x, w, groups=num_groups, flip=True)

        def reference_flipped(x, w):
            return grouped_reference(x, w, flip=True)

        def check_grads(fn, fn_gt, w_shape, x_shape, cotan_shape):
            # Weights are drawn before inputs to keep the random stream order.
            with self.subTest(
                fn=fn.__name__, w_shape=w_shape, x_shape=x_shape
            ):
                w = mx.random.normal(shape=w_shape)
                x = mx.random.normal(shape=x_shape)
                cotans = (mx.ones(shape=cotan_shape),)
                grads = mx.vjp(fn, (x, w), cotans)[1]
                expected = mx.vjp(fn_gt, (x, w), cotans)[1]
                self.assertTrue(mx.allclose(expected[0], grads[0]))
                self.assertTrue(mx.allclose(expected[1], grads[1]))

        mx.random.seed(3)

        # Columns: weight shape, input shape, cotangent shape
        plain_cases = (
            ((2, 3, 1), (1, 5, 2), (1, 3, 2)),
            ((2, 3, 2), (1, 5, 4), (1, 3, 2)),
            ((6, 3, 2), (1, 5, 4), (1, 3, 6)),
            # Test 2D
            # NOTE(review): this case is also routed through mx.conv1d, as in
            # the original test - confirm that is intended.
            ((2, 3, 3, 1), (1, 5, 5, 2), (1, 3, 3, 2)),
        )
        for shapes in plain_cases:
            check_grads(conv1d_grouped, grouped_reference, *shapes)

        # Test with flip
        flip_cases = (
            ((2, 3, 1), (1, 5, 2), (1, 3, 2)),
            ((2, 3, 2), (1, 5, 4), (1, 3, 2)),
            # Test 2D
            ((2, 3, 3, 1), (1, 5, 5, 2), (1, 3, 3, 2)),
        )
        for shapes in flip_cases:
            check_grads(conv_general_flipped, reference_flipped, *shapes)

    def test_repeated_conv(self):
        """Two identical ``mx.conv2d`` calls must agree, checked eight times."""
        x = mx.random.normal((1, 3, 3, 320))
        w = mx.random.normal((320, 3, 3, 320))
        # Positional arguments after the weights, passed unchanged to both calls.
        conv_args = ((1, 1), (1, 1), (1, 1), 1)
        for _ in range(8):
            first = mx.conv2d(x, w, *conv_args)
            second = mx.conv2d(x, w, *conv_args)
            self.assertTrue(mx.allclose(first, second))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_depthwise(self):
        """Compare convolutions with ``groups == C`` against Torch, with and without flip."""

        # fmt: off
        configs = (
            # N,   H,   W,    C   kH,  kW,   O, strides, padding,  groups
            ( 2,  16,  16,   32,   1,   1,  32,  (2, 2),  (1, 1),    32),
            ( 1,  16,  16,   32,   3,   3,  32,  (2, 2),  (1, 1),    32),
            ( 1,  32,  32,   32,   7,   7,  32,  (1, 1),  (3, 3),    32),
            ( 3,  32,  32,   32,   5,   5,  32,  (1, 2),  (0, 0),    32),
            ( 1,  32,  32,   32,   7,   7,  32,  (2, 1),  (1, 3),    32),
        )
        # fmt: on

        # Half precision is only exercised when the default device is the GPU.
        on_gpu = mx.default_device() == mx.gpu
        dtypes = [np.float32, np.float16] if on_gpu else [np.float32]

        for N, H, W, C, kH, kW, O, strides, padding, groups in configs:
            in_shape = (N, H, W, C)
            wt_shape = (O, kH, kW, C // groups)
            for dtype in dtypes:
                tolerance = 2e-5 if dtype == np.float32 else 5e-4
                for flip in (False, True):
                    self.__conv_general_test(
                        in_shape,
                        wt_shape,
                        strides,
                        padding,
                        kernel_dilation=1,
                        input_dilation=1,
                        groups=groups,
                        flip=flip,
                        np_dtype=dtype,
                        atol=tolerance,
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_asymmetric_padding(self):
        """Check ``mx.conv_general`` with separate low/high padding against Torch.

        The Torch reference is computed with symmetric padding and its leading
        output entries are sliced off before the comparison.
        """
        # 3D case: padding ([0, 0, 0], [1, 1, 1]) with stride 2.
        inputs = np.random.normal(size=(2, 8, 8, 8, 3)).astype(np.float32)
        kernel = np.random.normal(size=(2, 3, 3, 3, 3)).astype(np.float32)
        strides = (2, 2, 2)

        channels_first_3d = (0, 4, 1, 2, 3)
        reference = torch.conv3d(
            torch.tensor(inputs).permute(*channels_first_3d),
            torch.tensor(kernel).permute(*channels_first_3d),
            stride=strides,
            padding=2,
        )
        reference = reference.permute(0, 2, 3, 4, 1)
        reference = reference[:, 1:, 1:, 1:, :].numpy()

        result = mx.conv_general(
            mx.array(inputs),
            mx.array(kernel),
            stride=strides,
            padding=([0, 0, 0], [1, 1, 1]),
        )

        self.assertTrue(
            mx.allclose(result, mx.array(reference), atol=1e-3, rtol=1e-3)
        )

        # 2D case: padding ([0, 0], [1, 0]) with stride 1.
        inputs = np.random.normal(size=(2, 10, 10, 3)).astype(np.float32)
        kernel = np.random.normal(size=(2, 2, 2, 3)).astype(np.float32)

        channels_first_2d = (0, 3, 1, 2)
        reference = torch.conv2d(
            torch.tensor(inputs).permute(*channels_first_2d),
            torch.tensor(kernel).permute(*channels_first_2d),
            stride=1,
            padding=(1, 0),
        )
        reference = reference.permute(0, 2, 3, 1)[:, 1:].numpy()

        result = mx.conv_general(
            mx.array(inputs),
            mx.array(kernel),
            stride=1,
            padding=([0, 0], [1, 0]),
        )
        self.assertTrue(
            mx.allclose(result, mx.array(reference), atol=1e-3, rtol=1e-3)
        )

    def test_basic_grad_shapes(self):
        """The kernel gradient of a grouped, strided convolution keeps the kernel shape."""

        def summed_conv(kernel, inputs, strides, groups):
            out = mx.conv_general(inputs, kernel, stride=strides, groups=groups)
            return mx.sum(out)

        # Differentiates with respect to the first argument, i.e. the kernel.
        kernel_grad = mx.grad(summed_conv)

        # Columns: input shape, kernel shape, strides, groups
        cases = [
            ((3, 5, 4), (6, 2, 2), (2,), 2),
            ((3, 5, 4), (24, 2, 1), (2,), 4),
            ((3, 5, 5, 4), (6, 2, 2, 2), (2, 1), 2),
            ((3, 5, 5, 4), (24, 2, 2, 1), (2, 2), 4),
        ]
        for in_shape, k_shape, strides, groups in cases:
            result = kernel_grad(
                mx.zeros(k_shape), mx.zeros(in_shape), strides, groups
            )
            self.assertEqual(result.shape, k_shape)

    def test_conv_1d_with_2d(self):
        """A 1D convolution must match a 2D one run over a singleton extra axis."""
        for channels in (16, 4):
            x = mx.random.uniform(shape=(2, 10, channels))
            y = mx.random.normal(shape=(channels, 3, channels))

            out = mx.conv1d(x, y, padding=1)

            # Insert a length-1 spatial axis and leave it unpadded.
            x_2d = mx.expand_dims(x, axis=2)
            y_2d = mx.expand_dims(y, axis=2)
            out_2d = mx.conv2d(x_2d, y_2d, padding=(1, 0))

            self.assertTrue(mx.allclose(out, out_2d.squeeze(2)))

    def test_conv2d_unaligned_channels(self):
        """With 21 input channels, the CPU stream and the default stream must agree."""
        in_channels = 21
        for out_channels in (32, 21):
            x = mx.random.uniform(shape=(2, 16, 16, in_channels))
            w = mx.random.uniform(shape=(out_channels, 3, 3, in_channels))
            on_cpu = mx.conv2d(x, w, stream=mx.cpu)
            on_default = mx.conv2d(x, w)
            self.assertTrue(mx.allclose(on_cpu, on_default))

    def test_conv2d_large_filter_small_channels(self):
        """A 182x182 single-channel filter must give matching CPU and default results."""
        x = mx.random.normal(shape=(1, 181, 181, 1))
        w = mx.random.normal(shape=(1, 182, 182, 1))
        # Both tuples are passed positionally, right after the weights.
        conv_args = ((1, 1), (1, 1))
        on_cpu = mx.conv2d(x, w, *conv_args, stream=mx.cpu)
        on_default = mx.conv2d(x, w, *conv_args)
        self.assertTrue(mx.allclose(on_cpu, on_default, rtol=1e-3, atol=1e-3))


# When executed directly as a script, run the tests through the runner
# provided by the mlx_tests helper module.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_conv_transpose.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
import unittest
from itertools import permutations

import mlx.core as mx
import mlx_tests
import numpy as np

# PyTorch is an optional dependency: tests that need it are skipped when
# ``has_torch`` is False.
try:
    import torch
    import torch.nn.functional as F

    has_torch = True
except ImportError:
    has_torch = False


class TestConvTranspose(mlx_tests.MLXTestCase):
    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_1D(self):
        """Compare ``mx.conv_transpose1d`` with ``torch.conv_transpose1d``.

        Covers a sweep over batch/channel sizes and kernel settings, a sweep
        with an explicit ``groups`` argument, and inputs/weights that are
        transposed views rather than freshly created arrays.
        """

        def run_conv_transpose_1D(
            N,
            C,
            O,
            iH,
            kH,
            stride,
            padding,
            output_padding=0,
            dilation=1,
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            # NOTE: ``output_padding`` is accepted but not forwarded to either
            # backend; it is kept so the helper's signature stays unchanged.
            with self.subTest(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                iH=iH,
                kH=kH,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            ):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                in_np = np.random.normal(0, 1.0 / C, (N, iH, C)).astype(np_dtype)
                wt_np = np.random.normal(0, 1.0 / C, (O, kH, C // groups)).astype(
                    np_dtype
                )

                in_mx, wt_mx = map(mx.array, (in_np, wt_np))
                # Torch takes channels-first inputs and (C, O, kH) weights here.
                in_pt = torch.from_numpy(in_np.transpose(0, 2, 1))
                wt_pt = torch.from_numpy(wt_np.transpose(2, 0, 1))

                out_mx = mx.conv_transpose1d(
                    in_mx,
                    wt_mx,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.conv_transpose1d(
                    in_pt,
                    wt_pt,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                out_pt = torch.transpose(out_pt, 2, 1)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt.numpy(), out_mx, atol=atol))

        # Columns: iH, kH, stride, padding
        kernel_settings = (
            (1, 1, 1, 0),
            (3, 3, 1, 0),
            (31, 5, 5, 2),
        )

        for dtype in ("float32",):
            for N, C, O in (
                (1, 1, 1),
                (1, 6, 1),
                (1, 1, 6),
                (4, 32, 64),
            ):
                for iH, kH, stride, padding in kernel_settings:
                    run_conv_transpose_1D(N, C, O, iH, kH, stride, padding, dtype=dtype)

            # Groups tests. Kept inside the dtype loop so ``dtype`` is bound
            # explicitly instead of leaking out of a finished loop.
            N, C, O = (4, 32, 64)
            for iH, kH, stride, padding in kernel_settings:
                for group in (1,):
                    run_conv_transpose_1D(
                        N, C, O, iH, kH, stride, padding, groups=group, dtype=dtype
                    )

        # Strided inputs tests
        for tpose_in, tpose_wt in (
            ((0, 2, 1), (0, 1, 2)),
            ((0, 2, 1), (0, 2, 1)),
        ):
            with self.subTest(name="strided", tpose_in=tpose_in, tpose_wt=tpose_wt):
                in_np = np.random.normal(0, 1.0 / 16, (16, 16, 16)).astype(np.float32)
                wt_np = np.random.normal(0, 1.0 / 16, (16, 16, 16)).astype(np.float32)

                in_mx, wt_mx = map(mx.array, (in_np, wt_np))
                in_mx_t = mx.transpose(in_mx, tpose_in)
                wt_mx_t = mx.transpose(wt_mx, tpose_wt)
                out_mx = mx.conv_transpose1d(in_mx_t, wt_mx_t)

                in_pt = torch.from_numpy(in_np.transpose(tpose_in).transpose(0, 2, 1))
                wt_pt = torch.from_numpy(wt_np.transpose(tpose_wt).transpose(2, 0, 1))

                out_pt = torch.conv_transpose1d(in_pt, wt_pt)
                out_pt = torch.transpose(out_pt, 2, 1)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt.numpy(), out_mx, atol=1e-5))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_1D_grad(self):
        """Compare ``mx.conv_transpose1d`` gradients with PyTorch autograd.

        Torch back-propagates the sum of its output; the resulting output
        gradient is reused as the cotangent for ``mx.vjp``.
        """

        def run_conv_transpose1D_grad(
            N,
            C,
            O,
            iH,
            kH,
            stride,
            padding,
            dilation=1,
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                iH=iH,
                kH=kH,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)

                sigma = 1.0 / C
                x_np = np.random.normal(0, sigma, (N, iH, C)).astype(np_dtype)
                w_np = np.random.normal(0, sigma, (O, kH, C)).astype(np_dtype)

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)
                x_pt = torch.from_numpy(x_np.transpose(0, 2, 1)).requires_grad_(True)
                w_pt = torch.from_numpy(w_np.transpose(2, 0, 1)).requires_grad_(True)

                y_pt = F.conv_transpose1d(
                    x_pt, w_pt, stride=stride, padding=padding, dilation=dilation
                )

                # Keep the gradient of the (non-leaf) output so it can be
                # handed to MLX as the cotangent.
                y_pt.retain_grad()
                y_pt.sum().backward()

                ref_dx = x_pt.grad.permute(0, 2, 1).numpy()
                ref_dw = w_pt.grad.permute(1, 2, 0).numpy()
                cotangent = mx.array(y_pt.grad.numpy().transpose(0, 2, 1))

                def conv_t(a, b):
                    return mx.conv_transpose1d(
                        a,
                        b,
                        stride=stride,
                        padding=padding,
                        dilation=dilation,
                        groups=groups,
                    )

                _, (dx, dw) = mx.vjp(conv_t, [x_mx, w_mx], [cotangent])

                # Input gradient first, then weight gradient.
                for ref, got, primal in ((ref_dx, dx, x_mx), (ref_dw, dw, w_mx)):
                    self.assertEqual(ref.shape, got.shape)
                    self.assertEqual(primal.shape, got.shape)
                    self.assertTrue(np.allclose(ref, got, atol=atol))

        # Columns: iH, kH, stride, padding
        kernel_settings = (
            (1, 1, 1, 0),
            (3, 3, 1, 0),
            (31, 5, 5, 2),
        )
        size_settings = (
            (1, 1, 1),
            (1, 6, 1),
            (1, 1, 6),
            (4, 32, 64),
        )

        for dtype in ("float32",):
            for N, C, O in size_settings:
                for iH, kH, stride, padding in kernel_settings:
                    run_conv_transpose1D_grad(
                        N, C, O, iH, kH, stride, padding, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_2D(self):
        """Compare ``mx.conv_transpose2d`` with ``torch.conv_transpose2d``."""

        def run_conv_transpose2D(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iH, iW = idim
                kH, kW = kdim

                # Only the input is scaled down; the weights use unit variance.
                sigma = 1.0 / math.sqrt(kH * kW * C)
                x_np = np.random.normal(0.0, sigma, (N, iH, iW, C)).astype(np_dtype)
                w_np = np.random.normal(0.0, 1.0, (O, kH, kW, C // groups)).astype(
                    np_dtype
                )

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)
                x_pt = torch.from_numpy(x_np.transpose(0, 3, 1, 2)).to("cpu")
                w_pt = torch.from_numpy(w_np.transpose(3, 0, 1, 2)).to("cpu")

                conv_kwargs = dict(
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                got = mx.conv_transpose2d(x_mx, w_mx, **conv_kwargs)
                ref = torch.conv_transpose2d(x_pt, w_pt, **conv_kwargs)
                ref = torch.permute(ref, (0, 2, 3, 1)).numpy(force=True)

                self.assertEqual(ref.shape, got.shape)
                self.assertTrue(np.allclose(ref, got, atol=atol))

        # Columns: idim, kdim, stride, padding
        kernel_settings = (
            ((1, 1), (1, 1), (1, 1), (0, 0)),
            ((3, 3), (3, 1), (1, 1), (0, 0)),
            ((31, 31), (5, 5), (5, 5), (2, 2)),
        )
        size_settings = (
            (1, 1, 1),
            (1, 6, 1),
            (1, 1, 6),
            (4, 32, 64),
        )

        for dtype in ("float32",):
            for N, C, O in size_settings:
                for idim, kdim, stride, padding in kernel_settings:
                    run_conv_transpose2D(
                        N, C, O, idim, kdim, stride, padding, dtype=dtype
                    )

            # Groups tests
            N, C, O = (4, 32, 64)
            for idim, kdim, stride, padding in kernel_settings:
                for group in (1,):
                    run_conv_transpose2D(
                        N, C, O, idim, kdim, stride, padding, groups=group, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_2D_grad(self):
        """Compare ``mx.conv_transpose2d`` gradients with PyTorch autograd.

        Torch back-propagates the sum of its output; the resulting output
        gradient is reused as the cotangent for ``mx.vjp``.
        """

        def run_conv_transpose2D_grad(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iH, iW = idim
                kH, kW = kdim
                sigma = 1.0 / math.sqrt(kH * kW * C * O)

                x_np = np.random.normal(0.0, sigma, (N, iH, iW, C)).astype(np_dtype)
                w_np = np.random.normal(0.0, sigma, (O, kH, kW, C)).astype(np_dtype)

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)
                x_pt = torch.from_numpy(x_np.transpose(0, 3, 1, 2)).requires_grad_(True)
                w_pt = torch.from_numpy(w_np.transpose(3, 0, 1, 2)).requires_grad_(True)

                y_pt = F.conv_transpose2d(
                    x_pt, w_pt, stride=stride, padding=padding, dilation=dilation
                )

                # Keep the gradient of the (non-leaf) output so it can be
                # handed to MLX as the cotangent.
                y_pt.retain_grad()
                y_pt.sum().backward()

                ref_dx = x_pt.grad.permute(0, 2, 3, 1).numpy()
                ref_dw = w_pt.grad.permute(1, 2, 3, 0).numpy()
                cotangent = mx.array(y_pt.grad.numpy().transpose(0, 2, 3, 1))

                def conv_t(a, b):
                    return mx.conv_transpose2d(
                        a,
                        b,
                        stride=stride,
                        padding=padding,
                        dilation=dilation,
                        groups=groups,
                    )

                _, (dx, dw) = mx.vjp(conv_t, [x_mx, w_mx], [cotangent])

                # Input gradient first, then weight gradient.
                for ref, got, primal in ((ref_dx, dx, x_mx), (ref_dw, dw, w_mx)):
                    self.assertEqual(ref.shape, got.shape)
                    self.assertEqual(primal.shape, got.shape)
                    self.assertTrue(np.allclose(ref, got, atol=atol))

        size_settings = ((1, 1, 1), (1, 6, 1), (1, 1, 6), (4, 32, 64), (4, 16, 32))
        # Columns: idim, kdim, stride, padding, dilation
        kernel_settings = (
            ((1, 1), (1, 1), (1, 1), (0, 0), (1, 1)),
            ((3, 3), (3, 1), (1, 1), (0, 0), (1, 1)),
            ((31, 31), (5, 5), (5, 5), (2, 2), (1, 1)),
            ((32, 32), (3, 3), (2, 2), (1, 1), (1, 1)),
            ((31, 31), (5, 5), (5, 5), (2, 2), (3, 2)),
            ((32, 32), (3, 3), (2, 2), (1, 1), (3, 2)),
        )

        for dtype in ("float32",):
            for N, C, O in size_settings:
                for idim, kdim, stride, padding, dilation in kernel_settings:
                    run_conv_transpose2D_grad(
                        N, C, O, idim, kdim, stride, padding, dilation, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_3D(self):
        """Compare ``mx.conv_transpose3d`` with ``torch.conv_transpose3d``."""

        def run_conv_transpose3D(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1, 1),
            groups=1,
            dtype="float32",
            atol=1e-5,
        ):
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iD, iH, iW = idim
                kD, kH, kW = kdim

                # Only the input is scaled down; the weights use unit variance.
                sigma = 1.0 / math.sqrt(kD * kH * kW * C * O)
                x_np = np.random.normal(0.0, sigma, (N, iD, iH, iW, C)).astype(
                    np_dtype
                )
                w_np = np.random.normal(0.0, 1.0, (O, kD, kH, kW, C)).astype(np_dtype)

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)
                x_pt = torch.from_numpy(x_np.transpose(0, 4, 1, 2, 3))
                w_pt = torch.from_numpy(w_np.transpose(4, 0, 1, 2, 3))

                conv_kwargs = dict(
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
                got = mx.conv_transpose3d(x_mx, w_mx, **conv_kwargs)
                ref = torch.conv_transpose3d(x_pt, w_pt, **conv_kwargs)
                ref = torch.permute(ref, (0, 2, 3, 4, 1)).numpy(force=True)

                self.assertEqual(ref.shape, got.shape)
                self.assertTrue(np.allclose(ref, got, atol=atol))

        size_settings = (
            (1, 1, 1),
            (1, 6, 1),
            (1, 1, 6),
            (2, 8, 16),
        )
        # Columns: idim, kdim, stride, padding
        kernel_settings = (
            ((1, 1, 1), (1, 1, 1), (1, 1, 1), (0, 0, 0)),
            ((3, 3, 3), (3, 1, 1), (1, 1, 1), (0, 0, 0)),
            ((15, 15, 15), (3, 3, 3), (3, 3, 3), (2, 2, 2)),
        )

        for dtype in ("float32",):
            for N, C, O in size_settings:
                for idim, kdim, stride, padding in kernel_settings:
                    run_conv_transpose3D(
                        N, C, O, idim, kdim, stride, padding, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_3D_grad(self):
        """Check ``mx.conv_transpose3d`` gradients against PyTorch autograd.

        For each configuration the cotangent is taken from PyTorch (the
        gradient of ``out.sum()`` with respect to the output) and pushed
        through ``mx.vjp``. The resulting input and weight gradients must
        agree with the PyTorch ones in shape and, within ``atol``, in value.
        """

        def run_conv_transpose3D_grad(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            dilation=(1, 1, 1),
            groups=1,
            dtype="float32",
            atol=1e-4,
        ):
            # Arguments shared by the PyTorch and the MLX convolution calls.
            conv_args = dict(
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iD, iH, iW = idim
                kD, kH, kW = kdim
                scale = 1.0 / math.sqrt(kD * kH * kW * C * O)

                # Channels-last data: (N, D, H, W, C) input, (O, kD, kH, kW, C) weight.
                x_np = np.random.normal(0.0, scale, (N, iD, iH, iW, C))
                x_np = x_np.astype(np_dtype)
                w_np = np.random.normal(0.0, scale, (O, kD, kH, kW, C))
                w_np = w_np.astype(np_dtype)

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)

                # Same data with the channel axis moved to the front for PyTorch.
                x_pt = torch.from_numpy(x_np.transpose(0, 4, 1, 2, 3))
                x_pt.requires_grad_(True)
                w_pt = torch.from_numpy(w_np.transpose(4, 0, 1, 2, 3))
                w_pt.requires_grad_(True)

                y_pt = F.conv_transpose3d(x_pt, w_pt, **conv_args)

                # Let PyTorch produce the cotangent: d(sum(y)) / dy.
                y_pt.retain_grad()
                y_pt.sum().backward()

                ref_dx = x_pt.grad.permute(0, 2, 3, 4, 1).numpy()
                ref_dw = w_pt.grad.permute(1, 2, 3, 4, 0).numpy()
                cotangent = mx.array(y_pt.grad.numpy().transpose(0, 2, 3, 4, 1))

                def conv(a, b):
                    return mx.conv_transpose3d(a, b, **conv_args)

                _, (mx_dx, mx_dw) = mx.vjp(conv, [x_mx, w_mx], [cotangent])

                # Input gradient first, then weight gradient.
                for ref, got, primal in (
                    (ref_dx, mx_dx, x_mx),
                    (ref_dw, mx_dw, w_mx),
                ):
                    self.assertEqual(ref.shape, got.shape)
                    self.assertEqual(primal.shape, got.shape)
                    self.assertTrue(np.allclose(ref, got, atol=atol))

        channel_cases = ((1, 1, 1), (1, 6, 1), (1, 1, 6), (2, 4, 8), (2, 8, 16))
        geometry_cases = (
            # idim, kdim, stride, padding, dilation
            ((1, 1, 1), (1, 1, 1), (1, 1, 1), (0, 0, 0), (1, 1, 1)),
            ((3, 3, 3), (3, 1, 1), (1, 1, 1), (0, 0, 0), (1, 1, 1)),
            ((7, 7, 7), (5, 5, 5), (5, 5, 5), (2, 2, 2), (1, 1, 1)),
            ((8, 8, 8), (3, 3, 3), (2, 2, 2), (1, 1, 1), (1, 1, 1)),
            ((7, 7, 7), (5, 5, 5), (3, 3, 3), (2, 2, 2), (3, 2, 2)),
            ((8, 8, 8), (3, 3, 3), (2, 2, 2), (1, 1, 1), (3, 2, 2)),
        )

        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for idim, kdim, stride, padding, dilation in geometry_cases:
                    run_conv_transpose3D_grad(
                        N, C, O, idim, kdim, stride, padding, dilation, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_tranpose_1d_output_padding(self):
        """Compare ``mx.conv_transpose1d`` with ``output_padding`` to PyTorch."""

        def run_conv_transpose_1d_output_padding(
            N, C, O, iH, kH, stride, padding, output_padding, dtype="float32", atol=1e-5
        ):
            # Arguments forwarded unchanged to both implementations.
            conv_args = dict(
                stride=stride,
                padding=padding,
                output_padding=output_padding,
            )
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                iH=iH,
                kH=kH,
                stride=stride,
                padding=padding,
                output_padding=output_padding,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                sigma = 1.0 / C
                x_np = np.random.normal(0, sigma, (N, iH, C)).astype(np_dtype)
                w_np = np.random.normal(0, sigma, (O, kH, C)).astype(np_dtype)

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)
                # PyTorch takes the channel axis before the spatial axis.
                x_pt = torch.from_numpy(x_np.transpose(0, 2, 1))
                w_pt = torch.from_numpy(w_np.transpose(2, 0, 1))

                out_mx = mx.conv_transpose1d(x_mx, w_mx, **conv_args)

                out_pt = torch.conv_transpose1d(x_pt, w_pt, **conv_args)
                # Move channels back to the last axis before comparing.
                out_pt = out_pt.transpose(2, 1)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt.numpy(), out_mx, atol=atol))

        channel_cases = ((1, 1, 1), (1, 6, 1), (4, 32, 64))
        geometry_cases = (
            # iH, kH, stride, padding, output_padding
            (3, 2, 2, 0, 1),
            (5, 3, 2, 1, 0),
            (7, 4, 3, 1, 2),
        )

        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for iH, kH, stride, padding, output_padding in geometry_cases:
                    run_conv_transpose_1d_output_padding(
                        N, C, O, iH, kH, stride, padding, output_padding, dtype=dtype
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_2d_output_padding(self):
        """Compare ``mx.conv_transpose2d`` with ``output_padding`` to PyTorch."""

        def run_conv_transpose_2d_output_padding(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            output_padding,
            dtype="float32",
            atol=1e-5,
        ):
            # Arguments forwarded unchanged to both implementations.
            conv_args = dict(
                stride=stride,
                padding=padding,
                output_padding=output_padding,
            )
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                output_padding=output_padding,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iH, iW = idim
                kH, kW = kdim
                sigma = 1.0 / C
                x_np = np.random.normal(0, sigma, (N, iH, iW, C)).astype(np_dtype)
                w_np = np.random.normal(0, sigma, (O, kH, kW, C)).astype(np_dtype)

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)
                # PyTorch takes the channel axis before the spatial axes.
                x_pt = torch.from_numpy(x_np.transpose(0, 3, 1, 2))
                w_pt = torch.from_numpy(w_np.transpose(3, 0, 1, 2))

                out_mx = mx.conv_transpose2d(x_mx, w_mx, **conv_args)

                out_pt = torch.conv_transpose2d(x_pt, w_pt, **conv_args)
                # Move channels back to the last axis before comparing.
                out_pt = out_pt.permute(0, 2, 3, 1).numpy(force=True)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt, out_mx, atol=atol))

        channel_cases = ((1, 1, 1), (1, 6, 1), (4, 32, 64))
        geometry_cases = (
            # idim, kdim, stride, padding, output_padding
            ((3, 3), (2, 2), (2, 2), (0, 0), (1, 1)),
            ((5, 5), (3, 3), (2, 2), (1, 1), (0, 0)),
            ((7, 7), (4, 4), (3, 3), (1, 1), (2, 2)),
        )

        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for idim, kdim, stride, padding, output_padding in geometry_cases:
                    run_conv_transpose_2d_output_padding(
                        N,
                        C,
                        O,
                        idim,
                        kdim,
                        stride,
                        padding,
                        output_padding,
                        dtype=dtype,
                    )

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_conv_transpose_3d_output_padding(self):
        """Compare ``mx.conv_transpose3d`` with ``output_padding`` to PyTorch."""

        def run_conv_transpose_3d_output_padding(
            N,
            C,
            O,
            idim,
            kdim,
            stride,
            padding,
            output_padding,
            dtype="float32",
            atol=1e-5,
        ):
            # Arguments forwarded unchanged to both implementations.
            conv_args = dict(
                stride=stride,
                padding=padding,
                output_padding=output_padding,
            )
            case = dict(
                dtype=dtype,
                N=N,
                C=C,
                O=O,
                idim=idim,
                kdim=kdim,
                stride=stride,
                padding=padding,
                output_padding=output_padding,
            )
            with self.subTest(**case):
                np_dtype = getattr(np, dtype)
                np.random.seed(0)
                iD, iH, iW = idim
                kD, kH, kW = kdim
                sigma = 1.0 / C
                x_np = np.random.normal(0, sigma, (N, iD, iH, iW, C))
                x_np = x_np.astype(np_dtype)
                w_np = np.random.normal(0, sigma, (O, kD, kH, kW, C))
                w_np = w_np.astype(np_dtype)

                x_mx = mx.array(x_np)
                w_mx = mx.array(w_np)
                # PyTorch takes the channel axis before the spatial axes.
                x_pt = torch.from_numpy(x_np.transpose(0, 4, 1, 2, 3))
                w_pt = torch.from_numpy(w_np.transpose(4, 0, 1, 2, 3))

                out_mx = mx.conv_transpose3d(x_mx, w_mx, **conv_args)

                out_pt = torch.conv_transpose3d(x_pt, w_pt, **conv_args)
                # Move channels back to the last axis before comparing.
                out_pt = out_pt.permute(0, 2, 3, 4, 1).numpy(force=True)

                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt, out_mx, atol=atol))

        channel_cases = ((1, 1, 1), (1, 6, 1), (4, 32, 64))
        geometry_cases = (
            # idim, kdim, stride, padding, output_padding
            ((3, 3, 3), (2, 2, 2), (2, 2, 2), (0, 0, 0), (1, 1, 1)),
            ((5, 5, 5), (3, 3, 3), (2, 2, 2), (1, 1, 1), (0, 0, 0)),
            ((7, 7, 7), (4, 4, 4), (3, 3, 3), (1, 1, 1), (2, 2, 2)),
        )

        for dtype in ("float32",):
            for N, C, O in channel_cases:
                for idim, kdim, stride, padding, output_padding in geometry_cases:
                    run_conv_transpose_3d_output_padding(
                        N,
                        C,
                        O,
                        idim,
                        kdim,
                        stride,
                        padding,
                        output_padding,
                        dtype=dtype,
                    )


if __name__ == "__main__":
    # Executed directly as a script: hand off to the runner from mlx_tests.
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_device.py
================================================
# Copyright © 2023 Apple Inc.

import unittest

import mlx.core as mx
import mlx_tests


# Don't inherit from MLXTestCase to avoid call to setUp
class TestDefaultDevice(unittest.TestCase):
    """Checks on the device returned by ``mx.default_device()``."""

    def test_mlx_default_device(self):
        """The default device is the GPU when one is available, else the CPU."""
        device = mx.default_device()

        if not mx.is_available(mx.gpu):
            # No GPU: the default is the CPU and selecting the GPU must fail.
            self.assertEqual(device.type, mx.Device(mx.cpu))
            with self.assertRaises(ValueError):
                mx.set_default_device(mx.gpu)
            return

        # GPU available: it is the default, it prints as expected, and it
        # compares equal to the ``mx.gpu`` constant from either side.
        self.assertEqual(device, mx.Device(mx.gpu))
        self.assertEqual(str(device), "Device(gpu, 0)")
        self.assertEqual(device, mx.gpu)
        self.assertEqual(mx.gpu, device)


class TestDevice(mlx_tests.MLXTestCase):
    """Tests for selecting devices globally, via a context, and per op."""

    def test_device(self):
        """Setting the default device works with a ``Device`` or ``mx.cpu``."""
        device = mx.default_device()

        try:
            cpu = mx.Device(mx.cpu)
            mx.set_default_device(cpu)
            self.assertEqual(mx.default_device(), cpu)
            self.assertEqual(str(cpu), "Device(cpu, 0)")

            mx.set_default_device(mx.cpu)
            self.assertEqual(mx.default_device(), mx.cpu)
            self.assertEqual(cpu, mx.cpu)
            self.assertEqual(mx.cpu, cpu)
        finally:
            # Restore device even when an assertion above fails, so the
            # changed default cannot leak into other tests.
            mx.set_default_device(device)

    @unittest.skipIf(not mx.is_available(mx.gpu), "GPU is not available")
    def test_device_context(self):
        """``mx.stream`` switches the default device only inside the block."""
        default = mx.default_device()
        diff = mx.cpu if default == mx.gpu else mx.gpu
        self.assertNotEqual(default, diff)
        with mx.stream(diff):
            a = mx.add(mx.zeros((2, 2)), mx.ones((2, 2)))
            mx.eval(a)
            self.assertEqual(mx.default_device(), diff)
        self.assertEqual(mx.default_device(), default)

    def test_op_on_device(self):
        """An op gives the same result for every accepted ``stream`` value."""
        x = mx.array(1.0)
        y = mx.array(1.0)

        a = mx.add(x, y, stream=None)
        b = mx.add(x, y, stream=mx.default_device())
        self.assertEqual(a.item(), b.item())
        b = mx.add(x, y, stream=mx.cpu)
        self.assertEqual(a.item(), b.item())

        # Use the backend-agnostic availability check, as the other tests in
        # this file do, so the GPU path is not limited to Metal.
        if mx.is_available(mx.gpu):
            b = mx.add(x, y, stream=mx.gpu)
            self.assertEqual(a.item(), b.item())


class TestStream(mlx_tests.MLXTestCase):
    """Tests for creating streams and running ops on them."""

    def test_stream(self):
        """Default and new streams report the device they were created for."""
        default_dev = mx.default_device()

        default_s = mx.default_stream(default_dev)
        self.assertEqual(default_s.device, default_dev)

        fresh_s = mx.new_stream(default_dev)
        self.assertEqual(fresh_s.device, default_dev)
        self.assertNotEqual(default_s, fresh_s)

        gpu_ok = mx.is_available(mx.gpu)
        for make_stream in (mx.default_stream, mx.new_stream):
            # GPU streams exist only when a GPU does; otherwise asking for
            # one raises.
            if gpu_ok:
                self.assertEqual(make_stream(mx.gpu).device, mx.gpu)
            else:
                with self.assertRaises(ValueError):
                    make_stream(mx.gpu)

            self.assertEqual(make_stream(mx.cpu).device, mx.cpu)

    def test_op_on_stream(self):
        """The same addition yields the same value on every stream."""
        x = mx.array(1.0)
        y = mx.array(1.0)

        expected = mx.add(x, y, stream=mx.default_stream(mx.default_device()))

        streams = []
        if mx.is_available(mx.gpu):
            streams.append(mx.default_stream(mx.gpu))
            streams.append(mx.new_stream(mx.gpu))
        streams.append(mx.default_stream(mx.cpu))
        streams.append(mx.new_stream(mx.cpu))

        for s in streams:
            got = mx.add(x, y, stream=s)
            self.assertEqual(expected.item(), got.item())


class TestDeviceInfo(mlx_tests.MLXTestCase):
    """Tests for ``mx.device_count`` and ``mx.device_info``."""

    def _check_info(self, info):
        # Shared checks: a dict with a non-empty name and an architecture key.
        self.assertIsInstance(info, dict)
        self.assertIn("device_name", info)
        self.assertTrue(len(info["device_name"]) > 0)
        self.assertIn("architecture", info)

    def test_device_count(self):
        """There is exactly one CPU and a non-negative number of GPUs."""
        n_cpu = mx.device_count(mx.cpu)
        self.assertIsInstance(n_cpu, int)
        self.assertEqual(n_cpu, 1)

        n_gpu = mx.device_count(mx.gpu)
        self.assertIsInstance(n_gpu, int)
        self.assertGreaterEqual(n_gpu, 0)

    def test_device_info_cpu(self):
        """CPU info carries a name and an architecture entry."""
        self._check_info(mx.device_info(mx.cpu))

    @unittest.skipIf(not mx.is_available(mx.gpu), "GPU is not available")
    def test_device_info_gpu(self):
        """Every GPU index reports a name and an architecture entry."""
        for index in range(mx.device_count(mx.gpu)):
            self._check_info(mx.device_info(mx.Device(mx.gpu, index)))

    def test_device_info_default(self):
        """Calling ``device_info`` without arguments returns a named dict."""
        info = mx.device_info()
        self.assertIsInstance(info, dict)
        self.assertIn("device_name", info)

if __name__ == "__main__":
    # Executed directly as a script: hand off to the runner from mlx_tests.
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_double.py
================================================
# Copyright © 2024 Apple Inc.

import math
import os
import unittest

import mlx.core as mx
import mlx_tests
import numpy as np


class TestDouble(mlx_tests.MLXTestCase):
    """Tests for ``float64`` arrays.

    The tests expect ``float64`` work to raise ``ValueError`` when the default
    device is the GPU, so double precision inputs are always created on the
    CPU stream. On the CPU, results are compared with the ``float32``
    computation of the same op.
    """

    def test_unary_ops(self):
        """Unary ops on ``float64`` match their ``float32`` results."""
        shape = (3, 3)
        x = mx.random.normal(shape=shape)

        if mx.default_device() == mx.gpu:
            with self.assertRaises(ValueError):
                x.astype(mx.float64)

        x_double = x.astype(mx.float64, stream=mx.cpu)

        ops = [
            mx.abs,
            mx.arccos,
            mx.arccosh,
            mx.arcsin,
            mx.arcsinh,
            mx.arctan,
            mx.arctanh,
            mx.ceil,
            mx.erf,
            mx.erfinv,
            mx.exp,
            mx.expm1,
            mx.floor,
            mx.log,
            mx.logical_not,
            mx.negative,
            mx.round,
            mx.sin,
            mx.sinh,
            mx.sqrt,
            mx.rsqrt,
            mx.tan,
            mx.tanh,
        ]
        for op in ops:
            if mx.default_device() == mx.gpu:
                with self.assertRaises(ValueError):
                    op(x_double)
                continue
            y = op(x)
            y_double = op(x_double)
            # The inputs are normal samples, so ops with a restricted domain
            # (log, sqrt, arccos, ...) may yield NaN; treat NaNs as equal.
            self.assertTrue(
                mx.allclose(y, y_double.astype(mx.float32, mx.cpu), equal_nan=True)
            )

    def test_binary_ops(self):
        """Binary ops on ``float64`` match their ``float32`` results."""
        shape = (3, 3)
        a = mx.random.normal(shape=shape)
        b = mx.random.normal(shape=shape)

        a_double = a.astype(mx.float64, stream=mx.cpu)
        b_double = b.astype(mx.float64, stream=mx.cpu)

        ops = [
            mx.add,
            mx.arctan2,
            mx.divide,
            mx.multiply,
            mx.subtract,
            mx.logical_and,
            mx.logical_or,
            mx.remainder,
            mx.maximum,
            mx.minimum,
            mx.power,
            mx.equal,
            mx.greater,
            mx.greater_equal,
            mx.less,
            mx.less_equal,
            mx.not_equal,
            mx.logaddexp,
        ]
        for op in ops:
            if mx.default_device() == mx.gpu:
                with self.assertRaises(ValueError):
                    op(a_double, b_double)
                continue
            y = op(a, b)
            y_double = op(a_double, b_double)
            # e.g. power with a negative base can produce NaN in both results.
            self.assertTrue(
                mx.allclose(y, y_double.astype(mx.float32, mx.cpu), equal_nan=True)
            )

    def test_where(self):
        """``mx.where`` selects the same values for ``float64`` operands."""
        shape = (3, 3)
        cond = mx.random.uniform(shape=shape) > 0.5
        a = mx.random.normal(shape=shape)
        b = mx.random.normal(shape=shape)

        a_double = a.astype(mx.float64, stream=mx.cpu)
        b_double = b.astype(mx.float64, stream=mx.cpu)

        if mx.default_device() == mx.gpu:
            with self.assertRaises(ValueError):
                mx.where(cond, a_double, b_double)
            return
        y = mx.where(cond, a, b)
        y_double = mx.where(cond, a_double, b_double)
        self.assertTrue(mx.allclose(y, y_double.astype(mx.float32, mx.cpu)))

    def test_reductions(self):
        """Reductions over each axis agree between ``float64`` and ``float32``."""
        shape = (32, 32)
        a = mx.random.normal(shape=shape)
        a_double = a.astype(mx.float64, stream=mx.cpu)

        axes = [0, 1, (0, 1)]
        ops = [mx.sum, mx.prod, mx.min, mx.max, mx.any, mx.all]

        for op in ops:
            for ax in axes:
                if mx.default_device() == mx.gpu:
                    with self.assertRaises(ValueError):
                        op(a_double, axis=ax)
                    continue
                # Reduce over the axis under test; without ``axis`` every
                # iteration would repeat the same full reduction.
                y = op(a, axis=ax)
                y_double = op(a_double, axis=ax)
                # A per-axis sum can land close to zero, where the float32
                # accumulation error exceeds the default absolute tolerance.
                self.assertTrue(
                    mx.allclose(y, y_double.astype(mx.float32, mx.cpu), atol=1e-5)
                )

    def test_get_and_set_item(self):
        """Array indexing reads and writes behave the same for ``float64``."""
        shape = (3, 3)
        a = mx.random.normal(shape=shape)
        b = mx.random.normal(shape=(2,))
        a_double = a.astype(mx.float64, stream=mx.cpu)
        b_double = b.astype(mx.float64, stream=mx.cpu)
        idx_i = mx.array([0, 2])
        idx_j = mx.array([0, 2])

        # Read through array indices.
        if mx.default_device() == mx.gpu:
            with self.assertRaises(ValueError):
                a_double[idx_i, idx_j]
        else:
            y = a[idx_i, idx_j]
            y_double = a_double[idx_i, idx_j]
            self.assertTrue(mx.allclose(y, y_double.astype(mx.float32, mx.cpu)))

        # Write through array indices.
        if mx.default_device() == mx.gpu:
            with self.assertRaises(ValueError):
                a_double[idx_i, idx_j] = b_double
        else:
            a[idx_i, idx_j] = b
            a_double[idx_i, idx_j] = b_double
            self.assertTrue(mx.allclose(a, a_double.astype(mx.float32, mx.cpu)))

    def test_gemm(self):
        """Matrix multiplication in ``float64`` matches ``float32``."""
        shape = (8, 8)
        a = mx.random.normal(shape=shape)
        b = mx.random.normal(shape=shape)

        a_double = a.astype(mx.float64, stream=mx.cpu)
        b_double = b.astype(mx.float64, stream=mx.cpu)

        if mx.default_device() == mx.gpu:
            with self.assertRaises(ValueError):
                a_double @ b_double
            return
        y = a @ b
        y_double = a_double @ b_double
        self.assertTrue(
            mx.allclose(y, y_double.astype(mx.float32, mx.cpu), equal_nan=True)
        )

    def test_type_promotion(self):
        """``float64 + int32`` promotes to ``float64``."""
        a = mx.array([4, 8], mx.float64)
        b = mx.array([4, 8], mx.int32)

        with mx.stream(mx.cpu):
            c = a + b
            self.assertEqual(c.dtype, mx.float64)

    def test_lapack(self):
        """``mx.linalg`` routines accept ``float64`` inputs on the CPU."""
        with mx.stream(mx.cpu):
            # QRF
            A = mx.array([[2.0, 3.0], [1.0, 2.0]], dtype=mx.float64)
            Q, R = mx.linalg.qr(A)
            out = Q @ R
            self.assertTrue(mx.allclose(out, A))
            out = Q.T @ Q
            self.assertTrue(mx.allclose(out, mx.eye(2)))
            self.assertTrue(mx.allclose(mx.tril(R, -1), mx.zeros_like(R)))
            self.assertEqual(Q.dtype, mx.float64)
            self.assertEqual(R.dtype, mx.float64)

            # SVD
            A = mx.array(
                [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=mx.float64
            )
            U, S, Vt = mx.linalg.svd(A)
            self.assertTrue(mx.allclose(U[:, : len(S)] @ mx.diag(S) @ Vt, A))

            # Inverse
            A = mx.array([[1, 2, 3], [6, -5, 4], [-9, 8, 7]], dtype=mx.float64)
            A_inv = mx.linalg.inv(A)
            self.assertTrue(mx.allclose(A @ A_inv, mx.eye(A.shape[0])))

            # Tri inv
            A = mx.array([[1, 0, 0], [6, -5, 0], [-9, 8, 7]], dtype=mx.float64)
            B = mx.array([[7, 0, 0], [3, -2, 0], [1, 8, 3]], dtype=mx.float64)
            AB = mx.stack([A, B])
            invs = mx.linalg.tri_inv(AB, upper=False)
            for M, M_inv in zip(AB, invs):
                self.assertTrue(mx.allclose(M @ M_inv, mx.eye(M.shape[0])))

            # Cholesky
            sqrtA = mx.array(
                [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype=mx.float64
            )
            A = sqrtA.T @ sqrtA / 81
            L = mx.linalg.cholesky(A)
            U = mx.linalg.cholesky(A, upper=True)
            self.assertTrue(mx.allclose(L @ L.T, A))
            self.assertTrue(mx.allclose(U.T @ U, A))

            # Pseudo inverse
            A = mx.array([[1, 2, 3], [6, -5, 4], [-9, 8, 7]], dtype=mx.float64)
            A_plus = mx.linalg.pinv(A)
            self.assertTrue(mx.allclose(A @ A_plus @ A, A))

            # Eigh
            def check_eigs_and_vecs(A_np, kwargs=None):
                # ``None`` instead of a shared mutable ``{}`` default.
                kwargs = kwargs or {}
                A = mx.array(A_np, dtype=mx.float64)
                eig_vals, eig_vecs = mx.linalg.eigh(A, **kwargs)
                eig_vals_np, _ = np.linalg.eigh(A_np, **kwargs)
                self.assertTrue(np.allclose(eig_vals, eig_vals_np))
                # Each column v with eigenvalue w must satisfy A v = w v.
                self.assertTrue(
                    mx.allclose(A @ eig_vecs, eig_vals[..., None, :] * eig_vecs)
                )

                eig_vals_only = mx.linalg.eigvalsh(A, **kwargs)
                self.assertTrue(mx.allclose(eig_vals, eig_vals_only))

            # Test a simple 2x2 symmetric matrix
            A_np = np.array([[1.0, 2.0], [2.0, 4.0]], dtype=np.float64)
            check_eigs_and_vecs(A_np)

            # Test a larger random symmetric matrix
            n = 5
            np.random.seed(1)
            A_np = np.random.randn(n, n).astype(np.float64)
            A_np = (A_np + A_np.T) / 2
            check_eigs_and_vecs(A_np)

            # Test with upper triangle
            check_eigs_and_vecs(A_np, {"UPLO": "U"})

            # LU factorization
            # Test 3x3 matrix
            a = mx.array(
                [[3.0, 1.0, 2.0], [1.0, 8.0, 6.0], [9.0, 2.0, 5.0]], dtype=mx.float64
            )
            P, L, U = mx.linalg.lu(a)
            self.assertTrue(mx.allclose(L[P, :] @ U, a))

            # Solve triangular
            # Test lower triangular matrix
            a = mx.array(
                [[4.0, 0.0, 0.0], [2.0, 3.0, 0.0], [1.0, -2.0, 5.0]], dtype=mx.float64
            )
            b = mx.array([8.0, 14.0, 3.0], dtype=mx.float64)

            result = mx.linalg.solve_triangular(a, b, upper=False)
            expected = np.linalg.solve(np.array(a), np.array(b))
            self.assertTrue(np.allclose(result, expected))

            # Test upper triangular matrix
            a = mx.array(
                [[3.0, 2.0, 1.0], [0.0, 5.0, 4.0], [0.0, 0.0, 6.0]], dtype=mx.float64
            )
            b = mx.array([13.0, 33.0, 18.0], dtype=mx.float64)

            result = mx.linalg.solve_triangular(a, b, upper=True)
            expected = np.linalg.solve(np.array(a), np.array(b))
            self.assertTrue(np.allclose(result, expected))

    def test_conversion(self):
        """``float64`` arrays convert to NumPy arrays and Python lists."""
        a = mx.array([1.0, 2.0], mx.float64)
        b = np.array(a)
        self.assertTrue(np.array_equal(a, b))

        a = mx.array([1.0, 2.0], mx.float64)
        b = a.tolist()
        self.assertEqual(b, [1.0, 2.0])

    def test_linspace(self):
        """A ``float64`` linspace ends exactly on the requested stop value."""
        with mx.stream(mx.cpu):
            vals = mx.linspace(0, math.pi, 2, mx.float64)
            self.assertEqual(vals.tolist()[1], math.pi)


if __name__ == "__main__":
    # Executed directly as a script: hand off to the runner from mlx_tests.
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_einsum.py
================================================
# Copyright © 2024 Apple Inc.

import unittest

import mlx.core as mx
import mlx_tests
import numpy as np


class TestEinsum(mlx_tests.MLXTestCase):
    """Tests for ``mx.einsum`` and ``mx.einsum_path`` with NumPy as reference."""

    # Extent given to each subscript letter when random operands are generated
    # from a subscripts string.
    _LABEL_SIZES = dict(zip("abcdefghij", [2, 3, 4, 5, 2, 3, 4, 5, 2, 3]))

    @classmethod
    def _random_operands(cls, subscripts):
        """Return one random MLX array per input term of ``subscripts``.

        Anything after ``->`` is ignored. Every remaining comma-separated term
        must consist only of letters present in ``_LABEL_SIZES``; the shape of
        each operand is read off its letters.
        """
        terms = subscripts.split("->")[0].split(",")
        return [
            mx.random.uniform(shape=tuple(cls._LABEL_SIZES[c] for c in term))
            for term in terms
        ]

    def _assert_matches_numpy(self, subscripts, *operands, exact=True, **tolerances):
        """Assert that ``mx.einsum`` and ``np.einsum`` agree.

        Args:
            subscripts: The einsum expression handed to both libraries.
            *operands: Arrays passed unchanged to both implementations.
            exact: When true, compare with ``np.array_equal``; otherwise
                compare with ``np.allclose``.
            **tolerances: Forwarded to ``np.allclose`` when ``exact`` is false.
        """
        mx_out = mx.einsum(subscripts, *operands)
        np_out = np.einsum(subscripts, *operands)
        if exact:
            self.assertTrue(np.array_equal(mx_out, np_out))
        else:
            self.assertTrue(np.allclose(mx_out, np_out, **tolerances))

    def test_simple_path(self):
        """``mx.einsum_path`` returns the expected order for small problems."""
        # Single-operand expressions produce the one-step path [(0,)].
        square = mx.zeros((5, 5))
        for subscripts in ("ii", "ij->i", "ii->i"):
            path = mx.einsum_path(subscripts, square)
            self.assertEqual(path[0], [(0,)])

        # Two operands: a single pairwise contraction.
        a = mx.zeros((5, 8))
        b = mx.zeros((8, 3))
        for subscripts in ("ij,jk", "ij,jk -> ijk"):
            path = mx.einsum_path(subscripts, a, b)
            self.assertEqual(path[0], [(0, 1)])

        # Three operands: the expected order changes with the operand shapes.
        three_operand_cases = (
            (((5, 8), (8, 3), (3, 7)), [(0, 1), (0, 1)]),
            (((5, 8), (8, 10), (10, 7)), [(1, 2), (0, 1)]),
        )
        for shapes, expected in three_operand_cases:
            operands = [mx.zeros(shape) for shape in shapes]
            path = mx.einsum_path("ij,jk,kl", *operands)
            self.assertEqual(path[0], expected)

    def test_longer_paths(self):
        """``mx.einsum_path`` agrees with ``np.einsum_path`` on longer chains."""
        chars = "abcdefghijklmopqABC"
        sizes = [2, 3, 4, 5, 4, 3, 2, 6, 5, 4, 3, 2, 5, 7, 4, 3, 2, 3, 4]
        dim_dict = dict(zip(chars, sizes))
        cases = [
            "eb,cb,fb->cef",
            "dd,fb,be,cdb->cef",
            "dd,fb,be,cdb->cef",
            "bca,cdb,dbf,afc->",
            "dcc,fce,ea,dbf->ab",
            "dcc,fce,ea,dbf->ab",
        ]

        for case in cases:
            with self.subTest(case=case):
                input_terms = case[: case.find("->")].split(",")
                np_inputs = [
                    np.ones([dim_dict[c] for c in term]) for term in input_terms
                ]
                np_path = np.einsum_path(case, *np_inputs)

                mx_inputs = [mx.array(i) for i in np_inputs]
                mx_path = mx.einsum_path(case, *mx_inputs)
                # NumPy's path list starts with an "einsum_path" marker that
                # has no counterpart in the MLX result, so drop it.
                self.assertEqual(np_path[0][1:], mx_path[0])

    def test_simple_einsum(self):
        """Single-operand einsum: diagonals, reductions and transposes."""
        # (subscripts, shape of the ``arange`` operand)
        cases = [
            ("ii->i", (4, 4)),
            ("iii->i", (2, 2, 2)),
            ("iijj->ij", (2, 2, 3, 3)),
            ("ijij->ij", (2, 3, 2, 3)),
            # Test some simple reductions
            ("ii", (2, 2)),
            ("ij->", (2, 4)),
            ("ij->i", (2, 4)),
            ("ij->j", (2, 4)),
            ("iii->", (2, 2, 2)),
            ("ijij->j", (2, 3, 2, 3)),
            # Test some simple transposes
            ("ij", (2, 4)),
            ("ij->ji", (2, 4)),
            ("ijk->jki", (2, 3, 4)),
        ]
        for subscripts, shape in cases:
            with self.subTest(subscripts=subscripts, shape=shape):
                a = mx.arange(int(np.prod(shape))).reshape(*shape)
                self._assert_matches_numpy(subscripts, a)

    def test_two_input_einsum(self):
        """Two-operand einsum over constant-filled arrays."""
        # (subscripts, (shape, fill) of first operand, (shape, fill) of second)
        cases = [
            # Matmul
            ("ik,kj", ((2, 8), 1.0), ((8, 2), 1.0)),
            # Matmul + transpose
            ("ik,kj->ji", ((2, 8), 1.0), ((8, 3), 1.0)),
            # Inner product
            ("i,i", ((4,), 1.0), ((4,), 1.0)),
            # Outer product
            ("i,j->ij", ((4,), 0.5), ((6,), 2.0)),
            # Elementwise multiply
            ("ij,ij->ij", ((2, 8), 1.0), ((2, 8), 1.0)),
            # Medley
            ("abcd,fgda->bfca", ((2, 8, 3, 5), 1.0), ((3, 7, 5, 2), 1.0)),
        ]
        for subscripts, first, second in cases:
            with self.subTest(subscripts=subscripts):
                self._assert_matches_numpy(
                    subscripts, mx.full(*first), mx.full(*second)
                )

    def test_sum_first(self):
        """A label absent from the output and from the other operand is summed."""
        a = mx.full((5, 8), 1.0)
        b = mx.full((8, 2), 1.0)
        self._assert_matches_numpy("ab,bc->c", a, b)

    def test_broadcasting(self):
        """Size-1 axes broadcast against larger axes that share a label."""
        # Label ``b`` has extent 1 in the first operand and 8 in the second.
        a = mx.full((5, 1), 1.0)
        b = mx.full((8, 2), 1.0)
        self._assert_matches_numpy("ab,bc->c", a, b)

        # Every label has extent 1 in exactly one of the two operands.
        a = mx.random.uniform(shape=(5, 1, 3, 1))
        b = mx.random.uniform(shape=(1, 7, 1, 2))
        self._assert_matches_numpy("abcd,cdab->abcd", a, b, exact=False)

    def test_attention(self):
        """Two chained einsums in the shape of an attention computation."""
        q = mx.random.uniform(shape=(2, 3, 4, 5))
        k = mx.random.uniform(shape=(2, 3, 4, 5))
        v = mx.random.uniform(shape=(2, 3, 4, 5))

        scores_mx = mx.einsum("itjk,iujk->ijtu", q, k)
        out_mx = mx.einsum("ijtu,iujk->itjk", scores_mx, v)

        scores_np = np.einsum("itjk,iujk->ijtu", q, k)
        out_np = np.einsum("ijtu,iujk->itjk", scores_np, v)

        self.assertTrue(np.allclose(out_mx, out_np))

    def test_multi_input_einsum(self):
        """Three operands contracted in a single call."""
        a = mx.ones((3, 4, 5))
        self._assert_matches_numpy("ijk,lmk,ijf->lf", a, a, a, exact=False)

    def test_opt_einsum_test_cases(self):
        """Run the opt_einsum contraction suite against NumPy."""
        # Test cases from
        # https://github.com/dgasmith/opt_einsum/blob/c826bb7df16f470a69f7bf90598fc27586209d11/opt_einsum/tests/test_contract.py#L11
        tests = [
            # Test hadamard-like products
            "a,ab,abc->abc",
            "a,b,ab->ab",
            # Test index-transformations
            "ea,fb,gc,hd,abcd->efgh",
            "ea,fb,abcd,gc,hd->efgh",
            "abcd,ea,fb,gc,hd->efgh",
            # Test complex contractions
            "acdf,jbje,gihb,hfac,gfac,gifabc,hfac",
            "cd,bdhe,aidb,hgca,gc,hgibcd,hgac",
            "abhe,hidj,jgba,hiab,gab",
            "bde,cdh,agdb,hica,ibd,hgicd,hiac",
            "chd,bde,agbc,hiad,hgc,hgi,hiad",
            "chd,bde,agbc,hiad,bdi,cgh,agdb",
            "bdhe,acad,hiab,agac,hibd",
            # Test collapse
            "ab,ab,c->",
            "ab,ab,c->c",
            "ab,ab,cd,cd->",
            "ab,ab,cd,cd->ac",
            "ab,ab,cd,cd->cd",
            "ab,ab,cd,cd,ef,ef->",
            # Test outer products
            "ab,cd,ef->abcdef",
            "ab,cd,ef->acdf",
            "ab,cd,de->abcde",
            "ab,cd,de->be",
            "ab,bcd,cd->abcd",
            "ab,bcd,cd->abd",
            # Random test cases that have previously failed
            "eb,cb,fb->cef",
            "dd,fb,be,cdb->cef",
            "bca,cdb,dbf,afc->",
            "dcc,fce,ea,dbf->ab",
            "fdf,cdd,ccd,afe->ae",
            "abcd,ad",
            "ed,fcd,ff,bcf->be",
            "baa,dcf,af,cde->be",
            "bd,db,eac->ace",
            "fff,fae,bef,def->abd",
            "efc,dbc,acf,fd->abe",
            # Inner products
            "ab,ab",
            "ab,ba",
            "abc,abc",
            "abc,bac",
            "abc,cba",
            # GEMM test cases
            "ab,bc",
            "ab,cb",
            "ba,bc",
            "ba,cb",
            "abcd,cd",
            "abcd,ab",
            "abcd,cdef",
            "abcd,cdef->feba",
            "abcd,efdc",
            # Inner then dot
            "aab,bc->ac",
            "ab,bcc->ac",
            "aab,bcc->ac",
            "baa,bcc->ac",
            "aab,ccb->ac",
            # Randomly built test cases
            "aab,fa,df,ecc->bde",
            "ecb,fef,bad,ed->ac",
            "bcf,bbb,fbf,fc->",
            "bb,ff,be->e",
            "bcb,bb,fc,fff->",
            "fbb,dfd,fc,fc->",
            "afd,ba,cc,dc->bf",
            "adb,bc,fa,cfc->d",
            "bbd,bda,fc,db->acf",
            "dba,ead,cad->bce",
            "aef,fbc,dca->bde",
        ]

        for test_case in tests:
            with self.subTest(subscripts=test_case):
                self._assert_matches_numpy(
                    test_case,
                    *self._random_operands(test_case),
                    exact=False,
                    rtol=1e-4,
                    atol=1e-4,
                )

    def test_ellipses(self):
        """Expressions containing ``...`` match NumPy; bad ones raise."""
        # Each entry is (explicit subscripts used only to size the operands,
        # ellipsis subscripts actually evaluated).
        tests = [
            ("abc->ab", "...c->..."),
            ("abcd->ad", "a...d->..."),
            ("abij,abgj->abig", "...ij,...gj->...ig"),
            ("abij,abgj->abig", "...ij,...gj->..."),
            ("abhh->abh", "...hh->...h"),
            ("abhh->abh", "...hh->...h"),
            ("bch,abcj->abchj", "...h,...j->...hj"),
            ("bc,cd->bd", "...c,cd"),
            ("abc,acd->bd", "...bc,...cd"),
            ("abcd,c->abd", "...cd,c"),
            ("abcd,c->abd", "...cd,c..."),
            ("abcd,c->abd", "...cd,c...->d..."),
            ("abc,b->abc", "ab...,b...->ab..."),
            ("abc,b->abc", "ab...,...b->ab..."),
            ("abc,b->abc", "ab...,b->ab..."),
            ("ab,bc->ac", "ab...,b...->a..."),
            ("ab,bc->ac", "ab...,...bc->a...c"),
            ("ab,bc->ac", "ab,b...->a..."),
            ("abcdef,defg->abcg", "...def,defg->...g"),
        ]
        for shape_spec, subscripts in tests:
            with self.subTest(subscripts=subscripts):
                self._assert_matches_numpy(
                    subscripts,
                    *self._random_operands(shape_spec),
                    exact=False,
                    rtol=1e-4,
                    atol=1e-4,
                )

        # More than one ellipsis inside a single term must be rejected.
        error_tests = [
            ("abc,abc->ab", "a...b...c,a...b...c->abc"),
        ]
        for shape_spec, subscripts in error_tests:
            inputs = self._random_operands(shape_spec)
            with self.assertRaises(ValueError):
                mx.einsum(subscripts, *inputs)


# Call the project's test runner only when this file is executed as a script
# (not when it is imported).
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_eval.py
================================================
# Copyright © 2023 Apple Inc.

import unittest
from functools import partial

import mlx.core as mx
import mlx_tests


class TestEval(mlx_tests.MLXTestCase):
    """Tests for ``mx.eval`` and ``mx.async_eval``."""

    def test_eval(self):
        """``mx.eval`` accepts several arrays as separate arguments."""
        arrs = [mx.ones((2, 2)) for _ in range(4)]
        mx.eval(*arrs)
        for arr in arrs:
            self.assertEqual(arr.tolist(), [[1, 1], [1, 1]])

    def test_retain_graph(self):
        """Calling ``mx.eval`` inside a function does not break ``mx.grad``."""

        def fun(x):
            y = 3 * x
            mx.eval(y)
            return 2 * y

        dfun_dx = mx.grad(fun)
        grad_value = dfun_dx(mx.array(1.0))
        self.assertEqual(grad_value.item(), 6.0)

    def test_eval_mixed(self):
        """``mx.eval`` accepts a list that mixes arrays with other objects."""
        x = mx.array(1) + 1 + 1
        state = [x, 0, "hello"]
        mx.eval(state)
        self.assertEqual(x.item(), 3)

    def test_async_eval(self):
        """Results scheduled with ``mx.async_eval`` can be read afterwards."""
        x = mx.array(1) + mx.array(1) + mx.array(1)
        mx.async_eval(x)
        self.assertEqual(x.item(), 3)

        # It should be safe to call eval on the array which has been async
        # eval'ed
        x = mx.array(1) + mx.array(1) + mx.array(1)
        mx.async_eval(x)
        mx.eval(x)
        self.assertEqual(x.item(), 3)

        # Schedule a second computation that consumes the first one.
        x = mx.array([1, 2, 3])
        y = 2 * x
        mx.async_eval(y)
        z = 2 * y
        mx.async_eval(z)
        self.assertTrue(mx.array_equal(y, mx.array([2, 4, 6])))
        self.assertTrue(mx.array_equal(z, mx.array([4, 8, 12])))

    def test_async_eval_twice(self):
        """Back-to-back async evals of dependent arrays, repeated 1000 times."""
        for _ in range(1000):
            x = mx.array(1) + mx.array(1) + mx.array(1)
            mx.async_eval(x)
            y = x + 1
            mx.async_eval(y)
            self.assertEqual(x.item(), 3)
            self.assertEqual(y.item(), 4)

    def test_async_eval_in_trace(self):
        """``mx.async_eval`` inside ``mx.grad`` or ``mx.vmap`` raises."""

        def fun(x):
            y = x + 1.0
            mx.async_eval(y)
            return mx.exp(y)

        # Raises
        with self.assertRaises(ValueError):
            mx.grad(fun)(mx.array(1.0))

        # Also raises
        with self.assertRaises(ValueError):
            mx.vmap(fun)(mx.ones((2, 2)))

    def test_async_eval_into_eval(self):
        """A blocking read may depend on an array that was async-eval'ed."""
        x = mx.array(1)
        y = x + 1
        mx.async_eval(y)
        a = y - 10
        b = mx.abs(a)
        self.assertEqual(b.item(), 8)

    def test_async_eval_into_eval_diff_stream(self):
        """As above, but the dependent op runs on a newly created CPU stream."""
        s = mx.new_stream(mx.cpu)
        x = mx.array(0)
        y = x - 5
        mx.async_eval(y)
        z = mx.abs(y, stream=s)
        self.assertEqual(z.item(), 5)

    def test_eval_slow_fast_multi_stream(self):
        """Combine a 20-op chain and a single op in an add on the CPU stream."""
        x = mx.ones((8000,))
        y = mx.abs(mx.array(-1.0))
        for _ in range(20):
            x = x + mx.array(1.0)
        z = mx.add(x, y, stream=mx.cpu)
        self.assertTrue(mx.allclose(z, mx.full((8000,), 22.0)))

        # Switch eval order
        x = mx.ones((8000,))
        y = mx.abs(mx.array(-1.0))
        for _ in range(20):
            x = x + mx.array(1.0)
        z = mx.add(y, x, stream=mx.cpu)
        self.assertTrue(mx.allclose(z, mx.full((8000,), 22.0)))

    def test_multi_output_eval_during_transform(self):
        """Repeated ``vjp`` calls that eval one ``divmod`` output keep peak memory flat."""
        x = mx.random.uniform(shape=(1024,))
        y = mx.ones((1024,))
        mx.eval(x, y)

        def fn(x):
            # Only the first of the two outputs is evaluated and returned.
            a, b = mx.divmod(x, x)
            mx.eval(a)
            return a

        # Two calls before the peak is sampled; a third call must not raise it.
        out = mx.vjp(fn, (x,), (y,))
        out = mx.vjp(fn, (x,), (y,))
        peak_mem = mx.get_peak_memory()
        out = mx.vjp(fn, (x,), (y,))
        self.assertEqual(peak_mem, mx.get_peak_memory())

    def test_async_eval_with_multiple_streams(self):
        """Interleave ``async_eval`` of a growing graph with blocking evals."""
        x = mx.array([1.0])
        y = mx.array([1.0])
        a = mx.array([1.0])
        b = mx.array([1.0])

        d = mx.default_device()
        # NOTE(review): this stream is created but never passed to an op;
        # confirm whether some op below was meant to run on it.
        s2 = mx.new_stream(d)

        # The test has no assertions; it only has to run to completion.
        for _ in range(50):
            for _ in range(20):
                x = x + y
            mx.async_eval(x)
            mx.eval(a + b)

    def test_donation_for_noops(self):
        """Chains of ``abs`` mixed with reshape/transpose/slice keep peak memory flat."""

        def fun(x):
            for _ in range(10):
                x = mx.abs(x)
                x = mx.reshape(x, (-1,))
                x = x.T.T
                x = mx.stop_gradient(x)
                x = mx.abs(x)
            return x

        x = mx.zeros((4096, 4096))
        mx.eval(x)
        pre = mx.get_peak_memory()
        out = fun(x)
        # Drop the local reference to the input before evaluating the result.
        del x
        mx.eval(out)
        post = mx.get_peak_memory()
        self.assertEqual(pre, post)

        def fun(x):
            for _ in range(10):
                x = mx.abs(x)
                x = x[:-1]
                x = mx.abs(x)
            return x

        x = mx.zeros((4096 * 4096,))
        mx.eval(x)
        pre = mx.get_peak_memory()
        out = fun(x)
        del x
        mx.eval(out)
        post = mx.get_peak_memory()
        self.assertEqual(pre, post)

    @unittest.skipIf(not mx.is_available(mx.gpu), "GPU is not available")
    def test_multistream_deadlock(self):
        """Graphs that hop between two GPU streams evaluate to completion."""
        s1 = mx.default_stream(mx.gpu)
        s2 = mx.new_stream(mx.gpu)

        # One op on the default stream followed by a long chain on the new one.
        x = mx.array(1.0)
        x = mx.abs(x, stream=s1)
        for _ in range(1000):
            x = mx.abs(x, stream=s2)
        mx.eval(x)

        s1 = mx.default_stream(mx.gpu)
        s2 = mx.new_stream(mx.gpu)
        old_limit = mx.set_memory_limit(1000)
        try:
            x = mx.ones((512, 512), stream=s2)
            for _ in range(80):
                x = mx.abs(x, stream=s1)
            y = mx.abs(x, stream=s2)
            z = mx.abs(y, stream=s2)
            mx.eval(z)
        finally:
            # Restore the limit even on failure so later tests are unaffected.
            mx.set_memory_limit(old_limit)


# Call the project's test runner only when this file is executed as a script
# (not when it is imported).
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_export_import.py
================================================
# Copyright © 2024 Apple Inc.

import gc
import os
import tempfile
import unittest

import mlx.core as mx
import mlx.nn as nn
import mlx_tests


class TestExportImport(mlx_tests.MLXTestCase):

    @classmethod
    def setUpClass(cls):
        """Create the temporary directory that the tests in this class write to."""
        tmp_dir = tempfile.TemporaryDirectory()
        # Keep the handle on the class so it can be cleaned up later.
        cls.test_dir_fid = tmp_dir
        cls.test_dir = tmp_dir.name
        if os.path.isdir(cls.test_dir):
            return
        os.mkdir(cls.test_dir)

    @classmethod
    def tearDownClass(cls):
        """Clean up the temporary directory handle stored in ``test_dir_fid``."""
        tmp_dir = cls.test_dir_fid
        tmp_dir.cleanup()

    def test_basic_export_import(self):
        """Round-trip simple functions and check input/output validation."""
        fn_path = os.path.join(self.test_dir, "fn.mlxfn")

        def roundtrip(fn, *export_args):
            # Export ``fn`` to the shared file and load it straight back.
            mx.export_function(fn_path, fn, *export_args)
            return mx.import_function(fn_path)

        # Function with no inputs
        def zeros_3x3():
            return mx.zeros((3, 3))

        loaded = roundtrip(zeros_3x3)
        reference = zeros_3x3()
        (result,) = loaded()
        self.assertTrue(mx.array_equal(result, reference))

        # Simple function with inputs
        def abs_sin(x):
            return mx.abs(mx.sin(x))

        inputs = mx.array([1.0, 2.0, 3.0, 4.0, 5.0])

        loaded = roundtrip(abs_sin, inputs)
        reference = abs_sin(inputs)
        (result,) = loaded(inputs)
        self.assertTrue(mx.allclose(result, reference))

        # Inputs in a list or tuple
        def abs_sin_stepwise(x):
            x = mx.abs(mx.sin(x))
            return x

        loaded = roundtrip(abs_sin_stepwise, [inputs])
        reference = abs_sin_stepwise(inputs)
        # A function exported with a list can be called with the list or the
        # bare array.
        for call_arg in ([inputs], inputs):
            (result,) = loaded(call_arg)
            self.assertTrue(mx.allclose(result, reference))

        loaded = roundtrip(abs_sin_stepwise, (inputs,))
        (result,) = loaded((inputs,))
        self.assertTrue(mx.allclose(result, reference))

        # Outputs in a list
        def list_output(x):
            return [mx.abs(mx.sin(x))]

        # Outputs in a tuple
        def tuple_output(x):
            return (mx.abs(mx.sin(x)),)

        for fn in (list_output, tuple_output):
            loaded = roundtrip(fn, inputs)
            (result,) = loaded(inputs)
            self.assertTrue(mx.allclose(result, reference))

        # Check throws on invalid inputs / outputs
        def abs_only(x):
            return mx.abs(x)

        for bad_args in (("hi",), (mx.array(1.0), "hi")):
            with self.assertRaises(ValueError):
                mx.export_function(fn_path, abs_only, *bad_args)

        def nested_index(x):
            return mx.abs(x[0][0])

        with self.assertRaises(ValueError):
            mx.export_function(fn_path, nested_index, [[mx.array(1.0)]])

        def array_and_int():
            return (mx.zeros((3, 3)), 1)

        def array_and_list():
            return (mx.zeros((3, 3)), [mx.zeros((3, 3))])

        for bad_fn in (array_and_int, array_and_list):
            with self.assertRaises(ValueError):
                mx.export_function(fn_path, bad_fn)

        def add(x, y):
            return x + y

        loaded = roundtrip(add, mx.array(1.0), mx.array(1.0))

        # Calls that do not match the exported inputs are rejected.
        bad_calls = (
            (mx.array(1.0), 1.0),
            (mx.array(1.0), mx.array(1.0), mx.array(1.0)),
            (mx.array(1.0), [mx.array(1.0)]),
        )
        for call_args in bad_calls:
            with self.assertRaises(ValueError):
                loaded(*call_args)

    def test_export_random_sample(self):
        """An exported random draw equals a fresh draw made after re-seeding."""
        fn_path = os.path.join(self.test_dir, "fn.mlxfn")

        def sample():
            return mx.random.uniform(shape=(3,))

        # Seed before exporting ...
        mx.random.seed(5)
        mx.export_function(fn_path, sample)
        loaded = mx.import_function(fn_path)
        (drawn,) = loaded()

        # ... and again before drawing the reference sample.
        mx.random.seed(5)
        reference = sample()

        self.assertTrue(mx.array_equal(drawn, reference))

    def test_export_with_kwargs(self):
        """Keyword inputs must be supplied under the names used at export time."""
        path = os.path.join(self.test_dir, "fn.mlxfn")

        def fun(x, z=None):
            out = x
            if z is not None:
                out += z
            return out

        x = mx.array([1, 2, 3])
        z = mx.array([2, 2, 2])
        expected = mx.array([3, 4, 5])

        # Positional ``x`` and keyword ``z``, given as a tuple and a dict.
        mx.export_function(path, fun, (x,), {"z": z})
        imported_fun = mx.import_function(path)

        # ``z`` passed positionally.
        with self.assertRaises(ValueError):
            imported_fun(x, z)

        # Keyword name that was never exported.
        with self.assertRaises(ValueError):
            imported_fun(x, y=z)

        with self.assertRaises(ValueError):
            imported_fun((x,), {"y": z})

        out = imported_fun(x, z=z)[0]
        self.assertTrue(mx.array_equal(out, expected))

        out = imported_fun((x,), {"z": z})[0]
        self.assertTrue(mx.array_equal(out, expected))

        # Same layout, exported with ordinary call syntax.
        mx.export_function(path, fun, x, z=z)
        imported_fun = mx.import_function(path)
        out = imported_fun(x, z=z)[0]
        self.assertTrue(mx.array_equal(out, expected))

        out = imported_fun((x,), {"z": z})[0]
        self.assertTrue(mx.array_equal(out, expected))

        # Only specify kwargs
        mx.export_function(path, fun, x=x, z=z)
        imported_fun = mx.import_function(path)
        # ``x`` was exported as a keyword, so passing it positionally fails.
        with self.assertRaises(ValueError):
            out = imported_fun(x, z=z)[0]

        out = imported_fun(x=x, z=z)[0]
        self.assertTrue(mx.array_equal(out, expected))

        out = imported_fun({"x": x, "z": z})[0]
        self.assertTrue(mx.array_equal(out, expected))

    def test_export_variable_inputs(self):
        """One exported file can hold several call signatures of a function."""
        fn_path = os.path.join(self.test_dir, "fn.mlxfn")

        def fun(x, y, z=None):
            out = x + y
            if z is not None:
                out += z
            return out

        # Record two signatures: with and without the keyword ``z``.
        with mx.exporter(fn_path, fun) as exporter:
            exporter(mx.array([1, 2, 3]), mx.array([1, 1, 1]))
            exporter(mx.array([1, 2, 3]), mx.array([1, 1, 1]), z=mx.array([2]))

        # Calling the exporter after the ``with`` block has exited raises.
        with self.assertRaises(RuntimeError):
            exporter(mx.array([1, 2, 3, 4]), mx.array([1, 1, 1, 1]))

        loaded = mx.import_function(fn_path)
        (result,) = loaded(mx.array([1, 2, 3]), mx.array([1, 1, 1]))[:1]
        self.assertTrue(mx.array_equal(result, mx.array([2, 3, 4])))

        result = loaded(mx.array([1, 2, 3]), mx.array([1, 1, 1]), z=mx.array([2]))[0]
        self.assertTrue(mx.array_equal(result, mx.array([4, 5, 6])))

        # Input shapes that were never recorded are rejected.
        with self.assertRaises(ValueError):
            loaded(mx.array([1, 2, 3, 4]), mx.array([1, 1, 1, 1]))

        # A function with a large constant
        big_constant = mx.zeros((16, 2048))
        mx.eval(big_constant)

        def add_to_constant(*args):
            return big_constant + sum(args)

        # Record the function with 0, 1, 2, 3 and 4 positional inputs.
        with mx.exporter(fn_path, add_to_constant) as exporter:
            for n_args in range(5):
                exporter(*([mx.array(1)] * n_args))

        # Check the exported file size < constant size + small amount
        size_budget = big_constant.nbytes + 8192
        file_size = os.path.getsize(fn_path)
        self.assertTrue(file_size < size_budget)

    def test_leaks(self):
        """Active memory is unchanged after exporters are created and collected.

        Active memory is only sampled when Metal is available; otherwise both
        readings are 0 and the final assertion holds trivially.
        """
        path = os.path.join(self.test_dir, "fn.mlxfn")
        mx.synchronize()
        if mx.metal.is_available():
            mem_pre = mx.get_active_memory()
        else:
            mem_pre = 0

        def outer():
            # ``d`` stores the exporter and an array, while ``f`` (handed to
            # the exporter) closes over ``d``.
            # NOTE(review): presumably this forms a reference cycle that only
            # the garbage collector can break; confirm the exporter keeps a
            # reference to ``f``.
            d = {}

            def f(x):
                return d["x"]

            d["f"] = mx.exporter(path, f)
            d["x"] = mx.array([0] * 1000)

        for _ in range(5):
            outer()
            # Force a collection after every round.
            gc.collect()

        if mx.metal.is_available():
            mem_post = mx.get_active_memory()
        else:
            mem_post = 0

        self.assertEqual(mem_pre, mem_post)

    def test_export_import_shapeless(self):
        """A shapeless export recorded with 1, 2 and 3 scalar inputs."""
        fn_path = os.path.join(self.test_dir, "fn.mlxfn")

        def total(*args):
            return sum(args)

        # Record the function once per supported number of inputs.
        with mx.exporter(fn_path, total, shapeless=True) as exporter:
            exporter(mx.array(1))
            exporter(mx.array(1), mx.array(2))
            exporter(mx.array(1), mx.array(2), mx.array(3))

        loaded = mx.import_function(fn_path)
        for n_inputs in (1, 2, 3):
            ones = [mx.array(1) for _ in range(n_inputs)]
            self.assertEqual(loaded(*ones)[0].item(), n_inputs)

        # A scalar paired with a length-3 vector is rejected.
        with self.assertRaises(ValueError):
            loaded(mx.array(10), mx.array([5, 10, 20]))

    def test_export_scatter_gather(self):
        """Round-trip ``take_along_axis`` and ``put_along_axis`` through export."""
        fn_path = os.path.join(self.test_dir, "fn.mlxfn")

        # Gather
        def gather(a, b):
            return mx.take_along_axis(a, b, axis=0)

        values = mx.random.uniform(shape=(4, 4))
        indices = mx.array([[0, 1, 2, 3], [1, 2, 0, 3]])
        mx.export_function(fn_path, gather, (values, indices))
        loaded = mx.import_function(fn_path)
        reference = gather(values, indices)
        result = loaded(values, indices)[0]
        self.assertTrue(mx.array_equal(reference, result))

        # Scatter
        def scatter(a, b, c):
            return mx.put_along_axis(a, b, c, axis=0)

        values = mx.random.uniform(shape=(4, 4))
        indices = mx.array([[0, 1, 2, 3], [1, 2, 0, 3]])
        updates = mx.random.uniform(shape=(2, 4))
        mx.export_function(fn_path, scatter, (values, indices, updates))
        loaded = mx.import_function(fn_path)
        reference = scatter(values, indices, updates)
        result = loaded(values, indices, updates)[0]
        self.assertTrue(mx.array_equal(reference, result))

    def test_export_conv(self):
        """A stack of three bias-free ``Conv2d`` layers survives export/import."""
        fn_path = os.path.join(self.test_dir, "fn.mlxfn")

        class Model(nn.Module):
            """Three convolutions applied one after another."""

            def __init__(self):
                super().__init__()
                # Settings common to all three layers.
                shared = dict(kernel_size=3, bias=False)
                self.c1 = nn.Conv2d(3, 16, stride=1, padding=1, **shared)
                self.c2 = nn.Conv2d(16, 16, stride=2, padding=1, **shared)
                self.c3 = nn.Conv2d(16, 16, stride=1, padding=2, **shared)

            def __call__(self, x):
                hidden = self.c1(x)
                hidden = self.c2(hidden)
                return self.c3(hidden)

        model = Model()
        mx.eval(model.parameters())

        def forward(x):
            return model(x)

        batch = mx.random.normal(shape=(4, 32, 32, 3))
        mx.export_function(fn_path, forward, batch)

        loaded = mx.import_function(fn_path)
        result = loaded(batch)[0]
        reference = forward(batch)
        self.assertTrue(mx.allclose(reference, result))

    def test_export_conv_shapeless(self):
        """Shapeless conv exports accept input shapes other than the exported one."""

        class SingleConv(nn.Module):
            """Module whose forward pass is exactly one convolution layer."""

            def __init__(self, conv):
                super().__init__()
                self.c = conv

            def __call__(self, x):
                return self.c(x)

        # (file name, layer factory, shape used for export, shapes to verify)
        specs = [
            # Conv1d (NLC)
            (
                "conv1d.mlxfn",
                lambda: nn.Conv1d(3, 8, kernel_size=3, stride=2, padding=1, bias=False),
                (4, 64, 3),
                [(4, 64, 3), (1, 33, 3), (2, 128, 3)],
            ),
            # Conv2d (NHWC)
            (
                "conv2d.mlxfn",
                lambda: nn.Conv2d(3, 6, kernel_size=3, stride=2, padding=1, bias=False),
                (2, 32, 32, 3),
                [(2, 32, 32, 3), (1, 31, 31, 3), (4, 64, 48, 3)],
            ),
            # Conv3d (NDHWC)
            (
                "conv3d.mlxfn",
                lambda: nn.Conv3d(2, 4, kernel_size=3, stride=2, padding=1, bias=False),
                (1, 8, 8, 8, 2),
                [(1, 8, 8, 8, 2), (2, 7, 8, 9, 2), (1, 16, 16, 4, 2)],
            ),
            # Grouped Conv2d (NHWC)
            (
                "conv2d_grouped.mlxfn",
                lambda: nn.Conv2d(
                    4, 6, kernel_size=3, stride=2, padding=1, groups=2, bias=False
                ),
                (2, 32, 32, 4),
                [(2, 32, 32, 4), (1, 32, 32, 4), (3, 15, 20, 4)],
            ),
        ]

        for file_name, make_conv, export_shape, check_shapes in specs:
            path = os.path.join(self.test_dir, file_name)

            model = SingleConv(make_conv())
            mx.eval(model.parameters())

            def forward(x):
                return model(x)

            example = mx.random.normal(shape=export_shape)
            mx.export_function(path, forward, example, shapeless=True)
            loaded = mx.import_function(path)

            # Compare imported and original functions on every listed shape.
            for shape in check_shapes:
                sample = mx.random.normal(shape=shape)
                self.assertTrue(mx.allclose(loaded(sample)[0], forward(sample)))

    def test_export_control_flow(self):
        """Export a function whose Python branch depends on an input's shape."""

        def fun(x, y):
            # The branch is picked from the leading dimension of ``y``.
            if y.shape[0] <= 2:
                return x + y
            return x + 2 * y

        y_inputs = (mx.array([1, 2, 3]), mx.array([1, 2]))
        for y in y_inputs:
            for shapeless in (True, False):
                with self.subTest(y=y, shapeless=shapeless):
                    x = mx.array(1)
                    export_path = os.path.join(self.test_dir, "control_flow.mlxfn")
                    mx.export_function(export_path, fun, x, y, shapeless=shapeless)

                    loaded = mx.import_function(export_path)
                    result = loaded(x, y)[0]
                    self.assertTrue(mx.array_equal(result, fun(x, y)))

    def test_export_quantized_model(self):
        """Export a quantized two-layer MLP and check that the imported
        function reproduces the model output, with and without ``shapeless``."""
        for shapeless in (True, False):
            with self.subTest(shapeless=shapeless):
                layers = [nn.Linear(1024, 512), nn.ReLU(), nn.Linear(512, 1024)]
                model = nn.Sequential(*layers)
                model.eval()
                mx.eval(model.parameters())
                input_data = mx.ones(shape=(512, 1024))

                nn.quantize(model)
                # Both linear layers are expected to be quantized in place.
                for idx in (0, 2):
                    self.assertTrue(
                        isinstance(model.layers[idx], nn.QuantizedLinear)
                    )
                mx.eval(model.parameters())

                export_path = os.path.join(self.test_dir, "quantized_linear.mlxfn")
                mx.export_function(export_path, model, input_data, shapeless=shapeless)

                imported_fn = mx.import_function(export_path)
                actual = imported_fn(input_data)[0]
                expected = model(input_data)
                self.assertTrue(mx.array_equal(actual, expected))

    def test_export_kwarg_ordering(self):
        """Check that an imported function gives ``x - y`` regardless of the
        order in which the keyword arguments are passed."""
        path = os.path.join(self.test_dir, "fun.mlxfn")

        def fn(x, y):
            return x - y

        mx.export_function(path, fn, x=mx.array(1.0), y=mx.array(1.0))
        imported = mx.import_function(path)

        # (keyword arguments in call order, expected value of x - y)
        cases = (
            ({"x": mx.array(2.0), "y": mx.array(3.0)}, -1.0),
            ({"y": mx.array(2.0), "x": mx.array(3.0)}, 1.0),
        )
        for kwargs, expected in cases:
            out = imported(**kwargs)[0]
            self.assertEqual(out.item(), expected)

    def test_export_with_callback(self):
        """Export through a callback instead of a file path and inspect the
        records passed to that callback."""

        def fn(x, y):
            return mx.log(mx.abs(x - y)).astype(mx.int32)

        # Length of each list-valued record, keyed by its record type.
        sizes = {"inputs": None, "outputs": None, "constants": None}
        seen = {"keywords": None}
        primitives = []
        primitive_args = []

        def callback(args):
            kind = args["type"]
            if kind in sizes:
                # The payload of these records lives under the type name.
                sizes[kind] = len(args[kind])
            elif kind == "keyword_inputs":
                seen["keywords"] = args["keywords"]
            elif kind == "primitive":
                primitives.append(args["name"])
                primitive_args.append(args["arguments"])

        mx.export_function(callback, fn, mx.array(1.0), y=mx.array(1.0))

        self.assertEqual(sizes["inputs"], 2)
        self.assertEqual(sizes["outputs"], 1)
        self.assertEqual(sizes["constants"], 0)

        keywords = seen["keywords"]
        self.assertEqual(len(keywords), 1)
        self.assertEqual(keywords[0][0], "y")

        self.assertEqual(primitives, ["Subtract", "Abs", "Log", "AsType"])
        expected_args = ([], [], [2], [mx.int32])
        for position, expected in enumerate(expected_args):
            self.assertEqual(primitive_args[position], expected)

    @unittest.skipIf(not mx.is_available(mx.gpu), "No GPU available")
    def test_export_import_custom_kernel(self):
        """Export a function that calls a custom GPU kernel and check the
        imported function matches the eager result."""
        # Pick the kernel factory and a copy kernel body for the active backend.
        if mx.metal.is_available():
            custom_kernel = mx.fast.metal_kernel
            source = """
                uint elem = thread_position_in_grid.x;
                out1[elem] = a[elem];
            """
        elif mx.cuda.is_available():
            custom_kernel = mx.fast.cuda_kernel
            source = """
                auto elem = cooperative_groups::this_grid().thread_rank();
                out1[elem] = a[elem];
            """

        kernel = custom_kernel(
            name="basic",
            input_names=["a"],
            output_names=["out1"],
            source=source,
        )

        def call(a):
            outputs = kernel(
                inputs=[a],
                grid=(4, 1, 1),
                threadgroup=(2, 1, 1),
                output_shapes=[(2, 2)],
                output_dtypes=[mx.float32],
                stream=mx.gpu,
            )
            return outputs[0]

        mx.random.seed(7)
        a = mx.random.normal(shape=(2, 2))

        path = os.path.join(self.test_dir, "fn.mlxfn")
        expected = call(a)
        mx.export_function(path, call, a)

        out = mx.import_function(path)(a)[0]
        self.assertTrue(mx.allclose(expected, out))

    def test_export_import_multi_with_constants(self):
        """Export one function for several input lengths through
        ``mx.exporter`` and check each imported result against eager mode.

        ``fun`` builds constants from ``y.shape[0]``, so every input length
        produces different constant values.
        """
        path = os.path.join(self.test_dir, "fn.mlxfn")

        def fun(y):
            n = y.shape[0]
            acc = mx.array(n)
            # Adds the constants n, n + 1, ..., n + 9 one at a time.
            for value in range(n, n + 10):
                acc = acc + mx.array(value)
            return acc * y.sum()

        ys = [mx.array([1] * length) for length in (1, 2, 3)]

        with mx.exporter(path, fun) as exporter:
            for y in ys:
                exporter(y)

        imported = mx.import_function(path)
        for y in ys:
            self.assertEqual(imported(y)[0].item(), fun(y).item())

    def test_export_import_scatter_sum(self):
        """Round-trip ``x.at[y].add(z)`` through export/import, using an index
        array that contains a repeated index."""

        def fun(x, y, z):
            return x.at[y].add(z)

        # x, indices (index 0 appears twice), updates
        args = (mx.array([1, 2, 3]), mx.array([0, 0, 1]), mx.array([1, 1, 1]))
        path = os.path.join(self.test_dir, "fn.mlxfn")
        mx.export_function(path, fun, *args)

        imported = mx.import_function(path)
        self.assertTrue(mx.array_equal(imported(*args)[0], fun(*args)))


# When this file is executed directly (rather than imported), hand control to
# the runner provided by the project's ``mlx_tests`` helper module.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_fast.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
import unittest

import mlx.core as mx
import mlx_tests


def rope_orig(x, dims, traditional, base, scale, offset, freqs=None):
    """Reference rotary position embedding built from elementary ops.

    Rotates the first ``dims`` features of ``x`` and passes any remaining
    features through unchanged.

    Args:
        x: Input array; positions run along axis ``-2``, features along ``-1``.
        dims: Number of leading features to rotate.
        traditional: If true, rotate interleaved (even, odd) feature pairs;
            otherwise pair feature ``i`` with feature ``i + dims // 2``.
        base: Base of the geometric frequency progression (used only when
            ``freqs`` is ``None``).
        scale: Multiplier applied to the positions.
        offset: Position offset; a scalar, or an array with more than one
            element that is broadcast along the leading axis of ``x``.
        freqs: Optional explicit frequencies; their reciprocals are used.

    Returns:
        The rotated array, with the same shape as ``x``.
    """
    seq_len = x.shape[-2]
    dtype = x.dtype
    half_D = dims // 2

    positions = mx.arange(seq_len, dtype=dtype)
    if isinstance(offset, mx.array) and offset.size > 1:
        # One offset per leading entry: add singleton axes so it broadcasts
        # against the position axis.
        middle_axes = tuple(range(1, x.ndim - 1))
        positions = mx.expand_dims(offset, middle_axes) + positions
    else:
        positions = offset + positions
    positions = positions * scale

    if freqs is not None:
        inv_freqs = (1 / freqs).astype(x.dtype)
    else:
        inv_freqs = mx.exp(
            -mx.arange(0.0, half_D, dtype=dtype) * (math.log(base) / half_D)
        )

    theta = mx.expand_dims(positions, -1) * inv_freqs
    costheta, sintheta = mx.cos(theta), mx.sin(theta)

    # Select the two halves of every rotated pair.
    if traditional:
        first, second = x[..., :dims:2], x[..., 1:dims:2]
    else:
        first, second = x[..., : dims // 2], x[..., dims // 2 : dims]

    rot_first = first * costheta - second * sintheta
    rot_second = first * sintheta + second * costheta
    has_tail = dims < x.shape[-1]

    if not traditional:
        pieces = [rot_first, rot_second]
        if has_tail:
            pieces.append(x[..., dims:])
        return mx.concatenate(pieces, axis=-1)

    # Re-interleave the pairs along a new trailing axis, then flatten it.
    rx = mx.concatenate([rot_first[..., None], rot_second[..., None]], axis=-1)
    if has_tail:
        rx = mx.reshape(rx, (*x.shape[:-1], dims))
        rx = mx.concatenate([rx, x[..., dims:]], axis=-1)
    return mx.reshape(rx, x.shape)


def rms_norm(x, weight, eps):
    """Reference RMS normalization over the last axis.

    The statistics are computed in float32; the normalized values are cast to
    ``weight.dtype`` before being multiplied by ``weight``.
    """
    x32 = x.astype(mx.float32)
    mean_square = x32.square().mean(-1, keepdims=True)
    normalized = x32 * mx.rsqrt(mean_square + eps)
    return weight * normalized.astype(weight.dtype)


def layer_norm(x, weight, bias, eps):
    """Reference layer normalization over the last axis.

    Mean and variance are computed in float32, the normalized values are cast
    back to the dtype of ``x``, and then ``weight`` and ``bias`` are applied
    when they are not ``None``.
    """
    x32 = x.astype(mx.float32)
    mean = x32.mean(axis=-1, keepdims=True)
    var = x32.var(axis=-1, keepdims=True)
    out = ((x32 - mean) * mx.rsqrt(var + eps)).astype(x.dtype)
    if weight is not None:
        out = out * weight
    if bias is not None:
        out = out + bias
    return out


class TestFast(mlx_tests.MLXTestCase):
    def test_rope(self):
        """Compare ``mx.fast.rope`` with ``rope_orig`` while sweeping base,
        dtype, offset and scale one at a time, then cover a transposed input
        and the error raised for integer inputs."""
        T = 4

        # Baseline: dims, dtype, base, scale, offset, traditional
        defaults = (8, mx.float32, 10000.0, 1.0, 0, False)
        dims0, dtype0, base0, scale0, offset0, _ = defaults

        # Absolute tolerance per dtype
        tolerances = {mx.float32: 1e-6, mx.float16: 1e-3, mx.bfloat16: 1e-2}

        # Values swept for each parameter
        dtypes = [mx.float32, mx.float16, mx.bfloat16]
        bases = [10000.0, 1000000.0]
        scales = [1.0, 2.0]
        offsets = [0, 3, mx.array(3)]

        def max_abs_diff(a, b):
            return mx.abs(a - b).max()

        def check(traditional, dtype, base, scale, offset):
            x = mx.random.uniform(shape=(2, T, dims0)).astype(dtype)
            rx = rope_orig(x, dims0, traditional, base, scale, offset)
            rx_fast = mx.fast.rope(
                x,
                dims0,
                traditional=traditional,
                base=base,
                scale=scale,
                offset=offset,
            )
            if dtype != mx.float32:
                # Low-precision inputs are also held to a float32 reference.
                ry = rope_orig(
                    x.astype(mx.float32), dims0, traditional, base, scale, offset
                )
                self.assertLess(max_abs_diff(ry, rx_fast), tolerances[dtype])
            self.assertLess(max_abs_diff(rx, rx_fast), tolerances[dtype])

        for traditional in (True, False):
            for base in bases:
                check(traditional, dtype0, base, scale0, offset0)
            for dtype in dtypes:
                check(traditional, dtype, base0, scale0, offset0)
            for offset in offsets:
                check(traditional, dtype0, base0, scale0, offset)
            for scale in scales:
                check(traditional, dtype0, base0, scale, offset0)

        # Test transpose into rope
        dims, _, base, scale, offset, traditional = defaults
        x = mx.random.uniform(shape=(1, 1, 4, dims)).swapaxes(1, 2)
        rx = rope_orig(x, dims, traditional, base, scale, offset)
        rx_fast = mx.fast.rope(
            1.0 * x,  # multiply here to allow donation
            dims,
            traditional=traditional,
            base=base,
            scale=scale,
            offset=offset,
        )
        self.assertLess(max_abs_diff(rx, rx_fast), tolerances[mx.float32])

        # Test raises with integer inputs
        x = (mx.random.uniform(shape=(2, T, dims)) * 10).astype(mx.int32)
        with self.assertRaises(ValueError):
            mx.fast.rope(
                x, dims, traditional=traditional, base=base, scale=scale, offset=offset
            )

    def test_rope_dims_validation(self):
        """``mx.fast.rope`` raises ``ValueError`` for unusable ``dims`` values
        and accepts even values up to the feature dimension."""
        T = 4
        feature_dim = 64
        x = mx.random.uniform(shape=(1, T, feature_dim))

        def rope_with(dims):
            return mx.fast.rope(
                x, dims=dims, traditional=False, base=10000.0, scale=1.0, offset=0
            )

        invalid_dims = (
            0,  # zero
            -2,  # negative
            7,  # odd
            128,  # larger than feature_dim
        )
        for dims in invalid_dims:
            with self.assertRaises(ValueError):
                rope_with(dims)

        # valid dims should not raise
        for dims in (32, feature_dim):
            rope_with(dims)

    def test_rope_with_freqs(self):
        """Exercise ``mx.fast.rope`` with an explicit ``freqs`` array.

        Covers rejection of badly shaped ``freqs``, agreement with
        ``rope_orig`` for float32 and float16 inputs, a single-vector input,
        and gradients with respect to ``x``.
        """
        mx.random.seed(0)

        T = 4
        dims = 8
        x = mx.random.uniform(shape=(2, T, dims))

        def fast_rope(inp, freqs):
            return mx.fast.rope(
                inp,
                dims,
                traditional=False,
                base=None,
                scale=1.0,
                offset=0,
                freqs=freqs,
            )

        # Check throws: wrong length, then wrong number of dimensions
        with self.assertRaises(ValueError):
            freqs = mx.random.uniform(shape=(dims - 1,))
            fast_rope(x, freqs)
        with self.assertRaises(ValueError):
            freqs = mx.random.uniform(shape=(1, dims))
            fast_rope(x, freqs)

        freqs = mx.random.uniform(shape=(dims // 2,))

        tolerances = {mx.float32: 1e-5, mx.float16: 1e-2}
        for dtype in [mx.float32, mx.float16]:
            x_ = x.astype(dtype)
            rx = rope_orig(x_, dims, False, None, 1.0, 0, freqs)
            rx_fast = fast_rope(x_, freqs)
            self.assertEqual(dtype, rx.dtype)
            self.assertLess(mx.abs(rx - rx_fast).max(), tolerances[dtype])
            # No early exit here: every dtype must be checked and the
            # single-vector and gradient checks below must also run.

        # Test single vector
        x = mx.random.uniform(shape=(1, 1, dims))
        rx = rope_orig(x, dims, False, None, 1.0, 0, freqs)
        rx_fast = fast_rope(x, freqs)
        self.assertLess(mx.abs(rx - rx_fast).max(), 1e-5)

        # Test grad with freqs
        f1 = lambda x, y: (rope_orig(x, dims, False, None, 1.0, 0, freqs) * y).sum()
        f2 = lambda x, y: (fast_rope(x, freqs) * y).sum()

        x = mx.random.uniform(shape=(2, 4, dims))
        y = mx.random.uniform(shape=(2, 4, dims))
        g1 = mx.grad(f1)(x, y)
        g2 = mx.grad(f2)(x, y)
        self.assertLess(mx.abs(g1 - g2).max(), 1e-5)

    def test_rope_grad(self):
        """Gradients of ``mx.fast.rope`` with respect to ``x`` match those of
        ``rope_orig`` for full and partial ``dims`` in both layouts."""
        D = 32
        base, scale, offset = 10000.0, 1.0, 0

        for dims in (D, D // 2):
            for traditional in (True, False):

                def f1(x, y):
                    rotated = rope_orig(x, dims, traditional, base, scale, offset)
                    return (rotated * y).sum()

                def f2(x, y):
                    rotated = mx.fast.rope(
                        x,
                        dims,
                        traditional=traditional,
                        base=base,
                        scale=scale,
                        offset=offset,
                    )
                    return (rotated * y).sum()

                x = mx.random.uniform(shape=(2, 100, D))
                y = mx.random.uniform(shape=(2, 100, D))
                g1 = mx.grad(f1)(x, y)
                g2 = mx.grad(f2)(x, y)
                self.assertLess(mx.abs(g1 - g2).max(), 1e-5)

    def test_rope_batch(self):
        """Exercise ``mx.fast.rope`` with one offset per batch entry and with
        a transposed (non-contiguous) batched input."""
        T = 4
        base = 10000.0
        scale = 1.0
        traditional = True
        batch_sizes = [3, 8, 11]
        num_heads = [1, 3, 5]
        dims = 32

        x = mx.random.uniform(shape=(8, 4, T, dims))

        # Three offsets for a batch of eight entries must be rejected.
        offset = mx.array([1, 2, 3])
        with self.assertRaises(ValueError):
            mx.fast.rope(
                x,
                dims,
                traditional=traditional,
                base=base,
                scale=scale,
                offset=offset,
            )

        for batch_size in batch_sizes:
            for n_head in num_heads:
                x = mx.random.uniform(shape=(batch_size, n_head, T, dims))
                offset = mx.arange(batch_size)
                rx = rope_orig(x, dims, traditional, base, scale, offset)
                rx_fast = mx.fast.rope(
                    x,
                    dims,
                    traditional=traditional,
                    base=base,
                    scale=scale,
                    offset=offset,
                )
                self.assertLess(mx.abs(rx - rx_fast).max(), 1e-5)

        # Transposed input, once as a full batch and once as a one-entry slice.
        x = mx.random.normal(shape=(2, 6, 8, 64)).transpose(0, 2, 1, 3)
        dims = 64
        offset = 0
        rx_fast = mx.fast.rope(
            x, dims, traditional=traditional, scale=scale, base=base, offset=offset
        )
        rx_fast_single = mx.fast.rope(
            x[0:1], dims, traditional=traditional, scale=scale, base=base, offset=offset
        )

        rx = rope_orig(x, dims, traditional, base, scale, offset)
        self.assertLess(mx.abs(rx - rx_fast).max(), 1e-5)
        # The single-entry result was previously built but never checked;
        # it must agree with the reference for the first batch entry.
        self.assertLess(mx.abs(rx[0:1] - rx_fast_single).max(), 1e-5)

    def test_rope_with_large_offset(self):
        """With ``offset=4000`` the bfloat16 output of ``mx.fast.rope`` stays
        within 1e-1 of the float32 output."""
        x = mx.random.normal(shape=(1, 1, 1024, 32))

        def rotate(inp):
            return mx.fast.rope(
                inp,
                32,
                traditional=False,
                scale=1.0,
                base=10000,
                offset=4000,
            )

        rx_fp32 = rotate(x)
        rx_bf16 = rotate(x.astype(mx.bfloat16))
        self.assertLess((rx_fp32 - rx_bf16).abs().max(), 1e-1)

    def test_rms_norm(self):
        """Compare ``mx.fast.rms_norm`` with the ``rms_norm`` reference across
        dtypes, epsilons and feature sizes, with and without a weight."""
        # Absolute tolerance per dtype
        tolerances = {mx.float32: 1e-6, mx.float16: 1e-3, mx.bfloat16: 1e-2}

        base_dtype, base_eps, base_dims = mx.float32, 1e-5, 32

        def max_abs_diff(a, b):
            return mx.abs(a - b).max()

        def draw(shape, dims, dtype):
            x = mx.random.uniform(shape=shape).astype(dtype)
            weight = mx.random.uniform(shape=(dims,)).astype(dtype)
            return x, weight

        # One parameter varies per sweep: dtype first, then eps, then dims.
        cases = (
            [(dt, base_eps, base_dims) for dt in (mx.float32, mx.float16, mx.bfloat16)]
            + [(base_dtype, e, base_dims) for e in (1e-3, 1e-5)]
            + [(base_dtype, base_eps, d) for d in (31, 32, 33)]
        )
        for dtype, eps, dims in cases:
            x, weight = draw((2, dims), dims, dtype)
            tol = tolerances[dtype]
            self.assertLess(
                max_abs_diff(rms_norm(x, weight, eps), mx.fast.rms_norm(x, weight, eps)),
                tol,
            )
            # Omitting the weight is compared with an all-ones weight.
            self.assertLess(
                max_abs_diff(
                    rms_norm(x, mx.ones_like(weight), eps),
                    mx.fast.rms_norm(x, None, eps),
                ),
                tol,
            )

        # Test > 4096
        dims, dtype, eps = 4099, mx.float32, 1e-5
        x, weight = draw((dims,), dims, dtype)
        self.assertLess(
            max_abs_diff(rms_norm(x, weight, eps), mx.fast.rms_norm(x, weight, eps)),
            1e-6,
        )

        # Wrong size w raises
        with self.assertRaises(ValueError):
            x = mx.random.uniform(shape=(1, 5))
            mx.fast.rms_norm(x, mx.ones((4,)), 1e-5)

    def test_rms_norm_grad(self):
        """First- and second-order gradients of ``mx.fast.rms_norm`` match the
        ``rms_norm`` reference, with and without a weight."""
        eps = 1e-5

        def f1(x, w, y):
            return (rms_norm(x, w, eps) * y).sum()

        def f2(x, w, y):
            return (mx.fast.rms_norm(x, w, eps) * y).sum()

        def f3(x, y):
            return (rms_norm(x, mx.ones((x.shape[-1],)), eps) * y).sum()

        def f4(x, y):
            return (mx.fast.rms_norm(x, None, eps) * y).sum()

        def max_abs_diff(a, b):
            return mx.abs(a - b).max()

        def scaled_diff(a, b):
            # Largest deviation relative to the mean magnitude of the reference.
            return mx.abs(a - b).max() / mx.abs(a).mean()

        # (feature size, leading dimensions)
        for D, lead in ((32, (8, 100)), (8192, (2, 2))):
            x = mx.random.uniform(shape=(*lead, D))
            w = mx.random.uniform(shape=(D,))
            y = mx.random.uniform(shape=(*lead, D))

            gx1, gw1 = mx.grad(f1, argnums=(0, 1))(x, w, y)
            gx2, gw2 = mx.grad(f2, argnums=(0, 1))(x, w, y)
            self.assertLess(max_abs_diff(gx1, gx2), 1e-5)
            self.assertLess(scaled_diff(gw1, gw2), 1e-5)

            gx1 = mx.grad(f3, argnums=(0,))(x, y)
            gx2 = mx.grad(f4, argnums=(0,))(x, y)
            self.assertLess(max_abs_diff(gx1, gx2), 1e-5)

        def gf(f):
            def inner(x, w, y):
                gx, gw = mx.grad(f, argnums=(0, 1))(x, w, y)
                return (gx + gw).sum()

            return inner

        # Second order, on the inputs left over from the last (D = 8192) case.
        gx1, gw1 = mx.grad(gf(f1), argnums=(0, 1))(x, w, y)
        gx2, gw2 = mx.grad(gf(f2), argnums=(0, 1))(x, w, y)
        self.assertLess(max_abs_diff(gx1, gx2), 1e-5)
        self.assertLess(scaled_diff(gw1, gw2), 1e-5)

    def test_layer_norm_dim_check(self):
        """A 129-element weight or bias with a ``(4, 128)`` input makes
        ``mx.fast.layer_norm`` raise ``ValueError``."""
        for slot in ("weight", "bias"):
            with self.assertRaises(ValueError):
                param = mx.ones((129,))
                x = mx.random.randint(low=0, high=10, shape=(4, 128))
                weight, bias = (param, None) if slot == "weight" else (None, param)
                mx.fast.layer_norm(x, weight, bias, 1e-3)

    def test_layer_norm(self):
        """Compare ``mx.fast.layer_norm`` with the ``layer_norm`` reference
        across dtypes, epsilons and feature sizes, for every combination of
        present / absent weight and bias."""
        # Absolute tolerance per dtype
        tolerances = {mx.float32: 1e-5, mx.float16: 5e-3, mx.bfloat16: 5e-2}

        base_dtype, base_eps, base_dims = mx.float32, 1e-5, 32

        def check(shape, dims, dtype, eps):
            x = mx.random.uniform(shape=shape).astype(dtype)
            weight = mx.random.uniform(shape=(dims,)).astype(dtype)
            bias = mx.random.uniform(shape=(dims,)).astype(dtype)
            combos = ((weight, bias), (weight, None), (None, bias), (None, None))
            for w, b in combos:
                rx = layer_norm(x, w, b, eps)
                rx_fast = mx.fast.layer_norm(x, w, b, eps)
                self.assertLess(mx.abs(rx - rx_fast).max(), tolerances[dtype])

        # One parameter varies per sweep: dtype first, then eps, then dims.
        cases = (
            [(dt, base_eps, base_dims) for dt in (mx.float32, mx.float16, mx.bfloat16)]
            + [(base_dtype, e, base_dims) for e in (1e-3, 1e-5)]
            + [(base_dtype, base_eps, d) for d in (31, 32, 33)]
        )
        for dtype, eps, dims in cases:
            check((2, dims), dims, dtype, eps)

        # Test > 4096
        dims, dtype, eps = 4099, mx.float32, 1e-5
        check((dims,), dims, dtype, eps)

    def test_slice_into_layer_norm(self):
        """``mx.fast.layer_norm`` on a slice of a larger array matches the
        ``layer_norm`` reference."""
        dim = 128
        eps = 1e-5
        full = mx.random.uniform(shape=(8, 100, dim))
        # Keep only the last entry along axis 1.
        x = full[:, 99:]
        rx_fast = mx.fast.layer_norm(x, weight=None, bias=None, eps=eps)
        rx = layer_norm(x, None, None, eps)
        self.assertLess(mx.abs(rx - rx_fast).max(), 1e-4)

    def test_layer_norm_grad(self):
        """First- and second-order gradients of ``mx.fast.layer_norm`` with
        respect to input, weight and bias match the ``layer_norm`` reference."""
        eps = 1e-5

        def f1(x, w, b, y):
            return (layer_norm(x, w, b, eps) * y).sum()

        def f2(x, w, b, y):
            return (mx.fast.layer_norm(x, w, b, eps) * y).sum()

        def scaled_diff(a, b):
            # Largest deviation relative to the mean magnitude of the reference.
            return mx.abs(a - b).max() / mx.abs(a).mean()

        # (feature size, tolerance)
        for D, tol in ((32, 1e-5), (8192, 5e-5)):
            x = mx.random.uniform(shape=(8, 100, D))
            w = mx.random.uniform(shape=(D,))
            b = mx.random.uniform(shape=(D,))
            y = mx.random.uniform(shape=(8, 100, D))

            gx1, gw1, gb1 = mx.grad(f1, argnums=(0, 1, 2))(x, w, b, y)
            gx2, gw2, gb2 = mx.grad(f2, argnums=(0, 1, 2))(x, w, b, y)
            self.assertLess(mx.abs(gx1 - gx2).max(), tol)
            self.assertLess(scaled_diff(gw1, gw2), tol)
            self.assertLess(scaled_diff(gb1, gb2), tol)

        def gf(f):
            def inner(x, w, b, y):
                gx, gw, gb = mx.grad(f, argnums=(0, 1, 2))(x, w, b, y)
                return ((gx + gw + gb) * y).sum()

            return inner

        # Second order, on the inputs left over from the last (D = 8192) case.
        gx1, gw1, gb1 = mx.grad(gf(f1), argnums=(0, 1, 2))(x, w, b, y)
        gx2, gw2, gb2 = mx.grad(gf(f2), argnums=(0, 1, 2))(x, w, b, y)
        self.assertLess(scaled_diff(gx1, gx2), 5e-5)
        self.assertLess(scaled_diff(gw1, gw2), 5e-5)
        # Both bias gradients of the second-order function are expected to be
        # numerically zero.
        self.assertLess(mx.abs(gb1).max(), 1e-9)
        self.assertLess(mx.abs(gb2).max(), 1e-9)

    def test_layer_norm_grad_no_bias(self):
        """Second-order gradient through layer_norm with a weight but no bias.

        Regression test: the VJP fallback used zeros_like(w) instead of
        zeros_like(b) for the bias placeholder gradient, and the resulting
        shape mismatch crashed higher-order differentiation.
        """
        D = 8
        eps = 1e-5
        x = mx.random.uniform(shape=(2, 4, D))
        w = mx.random.uniform(shape=(D,))
        y = mx.random.uniform(shape=(2, 4, D))
        mx.eval(x, w, y)

        def f_ref(x, w, y):
            return (layer_norm(x, w, None, eps) * y).sum()

        def f_fast(x, w, y):
            return (mx.fast.layer_norm(x, w, None, eps) * y).sum()

        def scaled_diff(a, b):
            return mx.abs(a - b).max() / mx.abs(a).mean()

        # First order should match the reference.
        gx1, gw1 = mx.grad(f_ref, argnums=(0, 1))(x, w, y)
        gx2, gw2 = mx.grad(f_fast, argnums=(0, 1))(x, w, y)
        self.assertLess(mx.abs(gx1 - gx2).max(), 1e-5)
        self.assertLess(scaled_diff(gw1, gw2), 1e-5)

        # Second order: without the fix this crashes, because the bias
        # placeholder gradient has shape (D,) (zeros_like(w)) where shape ()
        # (zeros_like(b)) is expected.
        def gf(f):
            def inner(x, w, y):
                gx, gw = mx.grad(f, argnums=(0, 1))(x, w, y)
                return ((gx + gw) * y).sum()

            return inner

        gx1, gw1 = mx.grad(gf(f_ref), argnums=(0, 1))(x, w, y)
        gx2, gw2 = mx.grad(gf(f_fast), argnums=(0, 1))(x, w, y)
        self.assertLess(scaled_diff(gx1, gx2), 5e-5)
        self.assertLess(scaled_diff(gw1, gw2), 5e-5)

    def test_layer_norm_grad_no_params(self):
        """Input gradient of ``mx.fast.layer_norm`` without weight or bias
        matches the ``layer_norm`` reference."""
        eps = 1e-5

        def f1(x):
            return layer_norm(x, None, None, eps).sum()

        def f2(x):
            return mx.fast.layer_norm(x, None, None, eps).sum()

        x = mx.random.normal(shape=(2, 2, 8))
        mx.eval(x)

        gx1 = mx.grad(f1)(x)
        gx2 = mx.grad(f2)(x)
        self.assertTrue(mx.allclose(gx1, gx2, atol=1e-6))

    def test_layer_norm_grad_params(self):
        """Gradients with respect to a ``(weight, bias)`` tuple match between
        ``mx.fast.layer_norm`` and the ``layer_norm`` reference."""
        eps = 1e-5

        def f1(params, x):
            weight, bias = params[0], params[1]
            return (layer_norm(x, weight, bias, eps)).sum()

        def f2(params, x):
            weight, bias = params[0], params[1]
            return (mx.fast.layer_norm(x, weight, bias, eps)).sum()

        def scaled_diff(a, b):
            return mx.abs(a - b).max() / mx.abs(a).mean()

        w = mx.ones((8,))
        b = mx.zeros((8,))
        x = mx.random.normal(shape=(2, 2, 8))
        mx.eval(x, w, b)

        gw1, gb1 = mx.grad(f1)((w, b), x)
        gw2, gb2 = mx.grad(f2)((w, b), x)
        self.assertLess(scaled_diff(gw1, gw2), 1e-5)
        self.assertLess(scaled_diff(gb1, gb2), 1e-5)

    def test_fast_transforms(self):
        # mx.fast.rope must agree with the reference rope_orig helper under
        # the vjp, jvp and vmap function transformations.
        x = mx.random.uniform(shape=(2, 2, 8))

        defaults = (8, False, 10000.0, 1.0, 0)
        dims, traditional, base, scale, offset = defaults

        def rope_ref(x):
            return rope_orig(x, *defaults)

        def rope_fast(x):
            return mx.fast.rope(
                x, dims, traditional=traditional, base=base, scale=scale, offset=offset
            )

        # VJP
        _, vjp_ref = mx.vjp(rope_ref, (x,), (mx.ones_like(x),))
        _, vjp_fast = mx.vjp(rope_fast, (x,), (mx.ones_like(x),))
        self.assertTrue(mx.allclose(vjp_ref[0], vjp_fast[0]))

        # JVP
        _, jvp_ref = mx.jvp(rope_ref, (x,), (mx.ones_like(x),))
        _, jvp_fast = mx.jvp(rope_fast, (x,), (mx.ones_like(x),))
        self.assertTrue(mx.allclose(jvp_ref[0], jvp_fast[0]))

        # VMAP (uses a fresh input with one extra leading axis)
        x = mx.random.uniform(shape=(2, 2, 2, 8))
        vmap_ref = mx.vmap(rope_ref)(x)
        vmap_fast = mx.vmap(rope_fast)(x)
        self.assertTrue(mx.allclose(vmap_ref, vmap_fast))

    @unittest.skipIf(not mx.is_available(mx.gpu), "No GPU available")
    def test_custom_kernel_basic(self):
        # A custom kernel that copies its single input to its single output
        # must return the input unchanged. The kernel body is written for
        # whichever of the Metal or CUDA backends reports itself available.
        if mx.metal.is_available():
            source = """
                uint elem = thread_position_in_grid.x;
                out1[elem] = a[elem];
            """
            custom_kernel = mx.fast.metal_kernel
        elif mx.cuda.is_available():
            source = """
                auto elem = cooperative_groups::this_grid().thread_rank();
                out1[elem] = a[elem];
            """
            custom_kernel = mx.fast.cuda_kernel

        mx.random.seed(7)
        data = mx.random.normal(shape=(2, 2))
        copy_kernel = custom_kernel(
            name="basic",
            input_names=["a"],
            output_names=["out1"],
            source=source,
        )
        # One thread per element: 4 threads in threadgroups of 2.
        results = copy_kernel(
            inputs=[data],
            grid=(4, 1, 1),
            threadgroup=(2, 1, 1),
            output_shapes=[(2, 2)],
            output_dtypes=[mx.float32],
            stream=mx.gpu,
        )
        self.assertTrue(mx.allclose(results[0], data))

    @unittest.skipIf(not mx.is_available(mx.gpu), "No GPU available")
    def test_custom_kernel_args(self):
        # Exercises a kernel taking four inputs (two arrays built here, an
        # integer array and a bare Python float) together with bool, int and
        # dtype template arguments, then checks both outputs against fixed
        # expected values for the seeded inputs.
        if mx.metal.is_available():
            source = """
                uint elem = thread_position_in_grid.x;
                T tmp = a[0];
                if (e) {
                    out1[elem] = a[1] + b[2] + c[3] + d + f;
                } else {
                    out1[elem] = 1;
                }
                out2[elem] = a[1] + b[2] + c[1] - d;
            """
            custom_kernel = mx.fast.metal_kernel
        elif mx.cuda.is_available():
            source = """
                auto elem = cooperative_groups::this_grid().thread_rank();
                T tmp = a[0];
                if (e) {
                    out1[elem] = a[1] + b[2] + static_cast<float>(c[3]) + d[0] + f;
                } else {
                    out1[elem] = 1;
                }
                out2[elem] = a[1] + b[2] + static_cast<float>(c[1]) - d[0];
            """
            custom_kernel = mx.fast.cuda_kernel

        mx.random.seed(7)
        a = mx.random.normal(shape=(3, 6))
        c = mx.random.normal(shape=(2, 2)).astype(mx.bfloat16)

        arg_kernel = custom_kernel(
            name="arg_test",
            input_names=["a", "b", "c", "d"],
            output_names=["out1", "out2"],
            source=source,
        )

        kernel_inputs = [a, mx.array([3, 4, 5]), c, 7.3]
        template_args = [("e", True), ("f", 3), ("T", mx.float16)]
        results = arg_kernel(
            inputs=kernel_inputs,
            template=template_args,
            grid=(6, 1, 1),
            threadgroup=(2, 1, 1),
            output_shapes=[(3, 2), (3, 2)],
            output_dtypes=[mx.float32, mx.int32],
            stream=mx.gpu,
        )

        expected_first = mx.full((3, 2), 14.0484)
        expected_second = mx.full((3, 2), -2, dtype=mx.int32)
        self.assertTrue(mx.allclose(results[0], expected_first))
        self.assertTrue(mx.allclose(results[1], expected_second))

    @unittest.skipIf(not mx.is_available(mx.gpu), "No GPU available")
    def test_custom_kernel_strides(self):
        # Runs an exp kernel twice on the same tiled input: once with
        # ensure_row_contiguous=True and plain linear indexing, and once with
        # ensure_row_contiguous=False where the kernel maps the element index
        # through elem_to_loc using the input's shape and strides.
        if mx.metal.is_available():
            source = """
                uint elem = thread_position_in_grid.x;
                uint loc = elem_to_loc(elem, inp_shape, inp_strides, inp_ndim);
                T tmp = inp[loc];
                out[elem] = metal::precise::exp(tmp) * threads_per_simdgroup;
            """
            source_contig = """
                uint elem = thread_position_in_grid.x;
                T tmp = inp[elem];
                out[elem] = metal::precise::exp(tmp) * threads_per_simdgroup;
            """
            custom_kernel = mx.fast.metal_kernel
        elif mx.cuda.is_available():
            source = """
                auto elem = cooperative_groups::this_grid().thread_rank();
                auto loc = elem_to_loc(elem, inp_shape.data(), inp_strides.data(), inp_ndim);
                T tmp = inp[loc];
                out[elem] = exp(tmp) * WARP_SIZE;
            """
            source_contig = """
                auto elem = cooperative_groups::this_grid().thread_rank();
                T tmp = inp[elem];
                out[elem] = exp(tmp) * WARP_SIZE;
            """
            custom_kernel = mx.fast.cuda_kernel

        mx.random.seed(7)
        a = mx.random.normal(shape=(3, 6))

        # non contiguous
        a = mx.tile(a[::2], [4, 1])

        for contig in (True, False):
            kernel_source = source_contig if contig else source
            exp_kernel = custom_kernel(
                name="myexp" + str(contig),
                input_names=["inp"],
                output_names=["out"],
                source=kernel_source,
                ensure_row_contiguous=contig,
            )
            results = exp_kernel(
                inputs=[a],
                template=[("T", mx.float32)],
                grid=(a.size, 1, 1),
                threadgroup=(256, 1, 1),
                output_shapes=[a.shape],
                output_dtypes=[a.dtype],
                stream=mx.gpu,
            )
            # The expected value hard-codes a SIMD-group / warp width of 32.
            expected = mx.exp(a) * 32
            self.assertTrue(mx.allclose(expected, results[0]))

    @unittest.skipIf(not mx.is_available(mx.gpu), "No GPU available")
    def test_custom_kernel_helper(self):
        # The kernel body calls do_exp, a templated helper supplied through
        # the `header` argument; the result must match mx.exp of the input.
        if mx.metal.is_available():
            header = """
            template <typename T>
            T do_exp(T x) {
                return metal::precise::exp(x);
            }
            """
            source = """
                uint elem = thread_position_in_grid.x;
                out1[elem] = do_exp(a[elem]);
            """
            custom_kernel = mx.fast.metal_kernel
        elif mx.cuda.is_available():
            header = """
            template <typename T>
            __device__ T do_exp(T x) {
                return exp(x);
            }
            """
            source = """
                auto elem = cooperative_groups::this_grid().thread_rank();
                out1[elem] = do_exp(a[elem]);
            """
            custom_kernel = mx.fast.cuda_kernel

        mx.random.seed(7)
        data = mx.random.normal(shape=(2, 2))
        helper_kernel = custom_kernel(
            name="helper",
            input_names=["a"],
            output_names=["out1"],
            header=header,
            source=source,
        )
        results = helper_kernel(
            inputs=[data],
            grid=(4, 1, 1),
            threadgroup=(2, 1, 1),
            output_shapes=[(2, 2)],
            output_dtypes=[mx.float32],
            stream=mx.gpu,
        )
        self.assertTrue(mx.allclose(results[0], mx.exp(data)))

    @unittest.skipIf(not mx.is_available(mx.gpu), "No GPU available")
    def test_custom_kernel_attributes(self):
        # The kernel writes the x extent of its threadgroup / block into the
        # output, which must equal the threadgroup size requested at launch.
        if mx.metal.is_available():
            source = "out[0] = threads_per_threadgroup.x;"
            custom_kernel = mx.fast.metal_kernel
        elif mx.cuda.is_available():
            source = "out[0] = blockDim.x;"
            custom_kernel = mx.fast.cuda_kernel

        a = mx.zeros(shape=(1, 1))
        attr_kernel = custom_kernel(
            name="test_fun",
            input_names=["a"],
            output_names=["out"],
            source=source,
        )
        results = attr_kernel(
            inputs=[a],
            grid=(2, 1, 1),
            threadgroup=(2, 1, 1),
            output_shapes=[(1, 1)],
            output_dtypes=[mx.uint32],
            stream=mx.gpu,
        )
        group_width = results[0]
        self.assertEqual(group_width.item(), 2)

    @unittest.skipIf(not mx.metal.is_available(), "Metal is not available")
    def test_custom_kernel_caching(self):
        # Builds two Metal kernels that share the name "my_kernel" but have
        # different sources; each call must reflect its own source (all zeros,
        # then all ones) rather than reusing the first kernel's result.
        def run_kernel(arr: mx.array, body):
            kernel = mx.fast.metal_kernel(
                name="my_kernel",
                input_names=["inp"],
                output_names=["out"],
                source=body,
            )
            results = kernel(
                inputs=[arr],
                grid=(arr.size, 1, 1),
                threadgroup=(arr.size, 1, 1),
                output_shapes=[arr.shape],
                output_dtypes=[arr.dtype],
                stream=mx.gpu,
            )
            return results[0]

        a = mx.random.normal(shape=(32,))

        zeros_source = """
            uint elem = thread_position_in_grid.x;
            out[elem] = 0.0;
        """
        filled = run_kernel(a, zeros_source)
        self.assertTrue(mx.array_equal(filled, mx.zeros_like(filled)))

        ones_source = """
            uint elem = thread_position_in_grid.x;
            out[elem] = 1.0;
        """
        filled = run_kernel(a, ones_source)
        self.assertTrue(mx.array_equal(filled, mx.ones_like(filled)))


# Run this module's tests through mlx_tests.MLXTestRunner when executed as a script.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_fast_sdpa.py
================================================
import math
import unittest
from itertools import product

import mlx.core as mx
import mlx_tests
import numpy as np


def mlx_ref_attn(q, k, v, scale=1.0, mask=None, sinks=None):
    """Reference attention built from basic MLX ops.

    The indexing below reads the head count from axis -3 and the sequence
    length from axis 2, i.e. q, k and v are treated as 4-D arrays laid out as
    (batch, heads, sequence, head_dim).

    Args:
        q: Queries; scaled by ``scale`` (cast to q's dtype) before the matmul.
        k: Keys. May have fewer heads than ``q``; the query heads are then
            grouped as (n_kv_heads, n_repeats).
        v: Values, with the same head count as ``k``.
        scale: Multiplier applied to ``q``.
        mask: ``None``, the string ``"causal"``, a boolean array (positions
            that are False are filled with the dtype's minimum), or any other
            array, which is added to the scores.
        sinks: Optional per-query-head values that are prepended to the scores
            as one extra column before the softmax and dropped afterwards.

    Returns:
        The attention output, reshaped back to (B, n_q_heads, L, -1) when the
        query heads were grouped.
    """
    q_dtype = q.dtype
    q = q * mx.array(scale, q_dtype)
    n_q_heads = q.shape[-3]
    n_kv_heads = k.shape[-3]
    n_repeats = n_q_heads // n_kv_heads

    B = q.shape[0]
    L = q.shape[2]
    kL = k.shape[2]

    if n_repeats > 1:
        # Grouped heads: split the query heads into (n_kv_heads, n_repeats)
        # and give k/v a singleton axis so they broadcast over the repeats.
        q = mx.reshape(q, [B, n_kv_heads, n_repeats, L, -1])
        k = mx.expand_dims(k, 2)
        v = mx.expand_dims(v, 2)

    scores = q @ mx.swapaxes(k, -1, -2)
    is_causal = mask == "causal"
    if mask is not None:

        if is_causal:
            # Align the last query with the last key: query i may attend to
            # keys up to index i + (kL - L).
            offset = kL - L
            q_indices = mx.arange(L) + offset
            k_indices = mx.arange(kL)
            mask = q_indices[:, None] >= k_indices[None]

        if n_repeats > 1 and mask.ndim >= 3:
            # Give the mask's head axis the same grouped layout as the scores.
            if mask.shape[-3] == 1:
                mask = mx.expand_dims(mask, -3)
            else:
                mask = mx.unflatten(mask, -3, (n_kv_heads, n_repeats))

        if mask.dtype == mx.bool_:
            scores = mx.where(mask, scores, mx.finfo(scores.dtype).min)
        else:
            scores += mask

    if sinks is not None:
        # Reshape sinks to broadcast against the scores, then prepend them as
        # an extra score column along the key axis.
        sinks = mx.expand_dims(sinks, (0, 2, 3))
        if n_repeats > 1:
            sinks = mx.unflatten(sinks, 1, (n_kv_heads, n_repeats))
        score_shape = list(scores.shape)
        score_shape[-1] = 1
        sinks = mx.broadcast_to(sinks, score_shape)
        scores = mx.concatenate([sinks, scores], axis=-1)

    scores = mx.softmax(scores, axis=-1, precise=True)
    if sinks is not None:
        # Drop the sink column so the weights line up with v again.
        scores = scores[..., 1:]

    out = scores @ v
    if n_repeats > 1:
        # Merge (n_kv_heads, n_repeats) back into a single head axis.
        out = mx.reshape(out, [B, n_q_heads, L, -1])
    return out


def do_attention(f, q, k, v, scale, mask=None, transpose=False):
    """Call the attention function ``f`` on q/k/v, optionally via swapped axes.

    When ``transpose`` is false, ``f`` is called directly with ``scale`` and
    ``mask`` passed as keywords. When it is true, axes 1 and 2 of q, k and v
    are swapped before the call and the output is swapped back afterwards.
    """
    if not transpose:
        return f(q, k, v, scale=scale, mask=mask)

    perm = (0, 2, 1, 3)
    q_t = mx.transpose(q, perm)
    k_t = mx.transpose(k, perm)
    v_t = mx.transpose(v, perm)
    o_t = f(q_t, k_t, v_t, scale=scale, mask=mask)
    return mx.transpose(o_t, perm)


def prepare_inputs(B, qL, kL, D, qH, kH, mask, transpose, dtype):
    """Build seeded q, k, v tensors, the softmax scale and the mask for a test.

    ``transpose`` selects whether the tensors are created as
    (B, length, heads, D) or (B, heads, length, D). A ``mask`` of
    ``"additive"`` or ``"bool"`` is replaced by a random (B, qH, qL, kL)
    array of that kind; any other value is returned unchanged.

    Returns:
        The tuple ``(q, k, v, scale, mask)`` with ``scale = 1 / sqrt(D)``.
    """
    mx.random.seed(0)

    scale = 1.0 / math.sqrt(D)
    if transpose:
        shape_q, shape_kv = (B, qL, qH, D), (B, kL, kH, D)
    else:
        shape_q, shape_kv = (B, qH, qL, D), (B, kH, kL, D)

    q = mx.random.uniform(0.0, 0.5, shape_q, dtype)
    k = mx.random.uniform(0.0, 0.5, shape_kv, dtype)
    v = mx.random.uniform(0.0, scale, shape_kv, dtype)

    mask_shape = (B, qH, qL, kL)
    if mask is None:
        pass
    elif mask == "additive":
        mask = mx.random.uniform(0.0, 0.5, mask_shape, dtype)
    elif mask == "bool":
        mask = mx.random.uniform(0.0, 1.0, mask_shape) < 0.5

    return q, k, v, scale, mask


# SDPA for MHA (n_heads == n_kv_heads)
def mlx_primitives_sdpa(q, k, v, scale, mask=None):
    """Scaled dot-product attention written with basic MLX ops.

    ``mask`` may be ``None``, the string ``"causal"``, a boolean array
    (False positions are filled with the float32 minimum) or another array,
    which is added to the scores. The softmax is evaluated in float32 and the
    weights are cast back to the dtype of the scores before multiplying by v.
    """
    scores = (q * scale) @ k.transpose(0, 1, 3, 2)
    q_len = q.shape[2]
    k_len = k.shape[2]
    is_causal = mask == "causal"
    if mask is not None:
        if is_causal:
            # Lower-triangular boolean mask aligned to the end of the keys.
            shift = k_len - q_len
            rows = mx.arange(q_len) + shift
            cols = mx.arange(k_len)
            mask = rows[:, None] >= cols[None]

        if mask.dtype == mx.bool_:
            scores = mx.where(mask, scores, mx.finfo(mx.float32).min)
        else:
            scores += mask
    weights = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
    return weights @ v


class TestFastSDPA(mlx_tests.MLXTestCase):
    def test_sdpa_vector_kv_transposed_head_seq(self):
        # Single-query attention where k and v are created with the sequence
        # axis before the head axis and then swapped into place, compared
        # against mlx_primitives_sdpa for several boolean masks.
        head_dim = 64
        n_q = 4
        n_kv = 1
        scale = 1.0
        mx.random.seed(0)
        q = 5e-1 * mx.random.normal(shape=(1, n_q, 1, head_dim))

        for seq_len in (43, 4096):
            kv_shape = (1, seq_len, n_kv, head_dim)
            k = (5e-1 * mx.random.normal(shape=kv_shape)).swapaxes(1, 2)
            v = (5e-1 * mx.random.normal(shape=kv_shape)).swapaxes(1, 2)
            masks = [
                mx.array(True),
                mx.array([True] * (seq_len - 10) + [False] * 10),
                mx.random.uniform(shape=(n_q, 1, seq_len)) > 0.2,
                mx.random.uniform(shape=(seq_len, 1, n_q)).T > 0.2,
            ]

            for m in masks:
                ref = mlx_primitives_sdpa(q, k, v, scale, mask=m)
                out = mx.fast.scaled_dot_product_attention(
                    q, k, v, scale=scale, mask=m
                )
                self.assertTrue(mx.allclose(ref, out, atol=1e-4, rtol=1e-4))

    def test_sdpa_vector(self):
        # Single-query attention against mlx_primitives_sdpa for a short and a
        # long key sequence, across boolean, additive, log-of-boolean and
        # causal masks. A mask whose shape does not fit must raise ValueError.
        D = 64
        L = 43
        Nq = 4
        Nkv = 1
        scale = 1.0

        def make_masks(length):
            # The order of the random draws here is part of the seeded setup.
            return [
                mx.array(True),
                mx.array([True] * (length - 10) + [False] * 10),
                mx.random.uniform(shape=(Nq, 1, length)) > 0.2,
                mx.random.uniform(shape=(length, 1, Nq)).T > 0.2,
                mx.random.uniform(shape=(Nq, 1, length)),
                mx.random.uniform(shape=(length, 1, Nq)).T,
                mx.log(mx.random.uniform(shape=(Nq, 1, length)) > 0.2),
                mx.log(mx.random.uniform(shape=(length, 1, Nq)).T > 0.2),
                "causal",
            ]

        def compare(q, k, v, masks):
            for m in masks:
                ref = mlx_primitives_sdpa(q, k, v, scale, mask=m)
                out = mx.fast.scaled_dot_product_attention(
                    q, k, v, scale=scale, mask=m
                )
                self.assertTrue(mx.allclose(ref, out, atol=1e-4, rtol=1e-4))

        mx.random.seed(0)
        q = 5e-1 * mx.random.normal(shape=(1, Nq, 1, D))
        k = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))
        v = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))

        with self.assertRaises(ValueError):
            mx.fast.scaled_dot_product_attention(
                q,
                k,
                v,
                scale=scale,
                mask=mx.full((Nq, 2, L), False),
            )

        compare(q, k, v, [None] + make_masks(L))

        L = 4096
        scale = 1.0
        mx.random.seed(0)
        q = 5e-1 * mx.random.normal(shape=(1, Nq, 1, D))
        k = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))
        v = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))

        compare(q, k, v, make_masks(L))

    def test_sdpa_fully_masked(self):
        # With a scalar False mask (nothing may be attended to) the output
        # must not contain NaNs, for several query lengths.
        kv_len = 8
        mask = mx.array(False)
        for head_dim in [128]:
            for q_len in [1, 8, 32]:
                q = mx.random.normal(shape=(1, 4, q_len, head_dim))
                k = mx.random.normal(shape=(1, 4, kv_len, head_dim))
                v = mx.random.normal(shape=(1, 4, kv_len, head_dim))

                out = mx.fast.scaled_dot_product_attention(q, k, v, mask=mask, scale=1)
                has_nan = mx.any(mx.isnan(out))
                self.assertFalse(has_nan)

    def test_sdpa_inf_score(self):
        # Setting the first key to -inf must give the same result as the
        # primitive-based reference implementation.
        kv_len = 8
        for head_dim, q_len in product((4, 128), (1, 8)):
            q = mx.ones(shape=(1, 4, q_len, head_dim))
            k = mx.ones(shape=(1, 4, kv_len, head_dim))
            v = mx.random.normal(shape=(1, 4, kv_len, head_dim))
            k[..., 0, :] = -float("inf")
            ref = mlx_primitives_sdpa(q, k, v, scale=1, mask=None)
            out = mx.fast.scaled_dot_product_attention(q, k, v, mask=None, scale=1)
            self.assertTrue(mx.allclose(ref, out, atol=1e-4, rtol=1e-4))

    def test_sdpa_few_query(self):
        # Attention with a small number of queries (Lq = 8) against
        # mlx_primitives_sdpa, first with a short key sequence and a query
        # built through swapaxes, then with a long key sequence.
        D = 64
        L = 43
        Lq = 8
        Nq = 8
        Nkv = 1
        scale = 1.0

        def make_masks(length):
            # The order of the random draws here is part of the seeded setup.
            return [
                None,
                mx.array(True),
                mx.array([True] * (length - 10) + [False] * 10),
                mx.random.uniform(shape=(Nq, 1, length)) > 0.2,
                mx.random.uniform(shape=(length, 1, Nq)).T > 0.2,
                "causal",
            ]

        def compare(q, k, v, masks):
            for m in masks:
                ref = mlx_primitives_sdpa(q, k, v, scale, mask=m)
                out = mx.fast.scaled_dot_product_attention(
                    q, k, v, scale=scale, mask=m
                )
                self.assertTrue(mx.allclose(ref, out, atol=1e-4, rtol=1e-4))

        mx.random.seed(0)
        q = (5e-1 * mx.random.normal(shape=(1, Lq, Nq, D))).swapaxes(1, 2)
        k = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))
        v = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))

        compare(q, k, v, make_masks(L))

        L = 4096
        scale = 1.0
        mx.random.seed(0)
        q = 5e-1 * mx.random.normal(shape=(1, Nq, Lq, D))
        k = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))
        v = 5e-1 * mx.random.normal(shape=(1, Nkv, L, D))

        compare(q, k, v, make_masks(L))

    @unittest.skip("Different head and value dims is not enabled")
    def test_sdpa_vector_value_dims(self):
        # Currently skipped. Single-query attention where the value head
        # dimension (128) differs from the query/key head dimension (192).
        qk_dim = 192
        v_dim = 128
        n_q = 4
        n_kv = 1
        scale = 1.0
        mx.random.seed(0)

        for seq_len in [43, 128, 237, 8192]:
            q = 5e-1 * mx.random.normal(shape=(1, n_q, 1, qk_dim))
            k = 5e-1 * mx.random.normal(shape=(1, n_kv, seq_len, qk_dim))
            v = 5e-1 * mx.random.normal(shape=(1, n_kv, seq_len, v_dim))
            expected = mlx_primitives_sdpa(q, k, v, scale)
            actual = mx.fast.scaled_dot_product_attention(q, k, v, scale=scale)
            self.assertTrue(mx.allclose(expected, actual, atol=1e-4, rtol=1e-4))

    def test_sdpa_vector_batched(self):
        # Batched (B = 2) attention with short sequences, stepping through
        # different head counts and swapaxes-produced layouts for q, k and v,
        # and finally an additive mask built through swapaxes. Each step
        # reuses whichever of q, k, v it does not replace.
        D = 64

        def compare(q, k, v, mask=None):
            out = mx.fast.scaled_dot_product_attention(q, k, v, mask=mask, scale=1.0)
            ref = mlx_ref_attn(q, k, v, mask=mask)
            self.assertTrue(mx.allclose(ref, out, atol=1e-4, rtol=1e-4))

        q = mx.random.normal(shape=(2, 1, 3, D))
        k = mx.random.normal(shape=(2, 1, 3, D))
        v = mx.random.normal(shape=(2, 1, 3, D))
        compare(q, k, v)

        # More query heads than key/value heads
        q = mx.random.normal(shape=(2, 4, 3, D))
        compare(q, k, v)

        # Query produced by swapping axes 1 and 2
        q = mx.random.normal(shape=(2, 3, 4, D)).swapaxes(1, 2)
        compare(q, k, v)

        # Keys produced by swapping axes 1 and 2 as well
        k = mx.random.normal(shape=(2, 3, 1, D)).swapaxes(1, 2)
        compare(q, k, v)

        # Two key/value heads, with only the keys built through swapaxes
        q = mx.random.normal(shape=(2, 4, 3, D))
        k = mx.random.normal(shape=(2, 3, 2, D)).swapaxes(1, 2)
        v = mx.random.normal(shape=(2, 2, 3, D))
        compare(q, k, v)

        # Additive mask whose first two axes were swapped
        q = mx.random.normal(shape=(2, 4, 3, D))
        k = mx.random.normal(shape=(2, 1, 3, D))
        v = mx.random.normal(shape=(2, 1, 3, D))
        mask = 10 * mx.random.normal(shape=(1, 2, 3, 3)).swapaxes(0, 1)
        compare(q, k, v, mask=mask)

    @unittest.skipIf(not mx.is_available(mx.gpu), "too slow on CPU")
    def test_sdpa(self):
        # Sweeps mx.fast.scaled_dot_product_attention against mlx_ref_attn
        # over every combination of dtype, transposed/non-transposed layout,
        # mask kind and shape listed below, one subTest per combination.
        # fmt: off
        shapes_64 = [
            # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
            (  1,    20,    20,       64,    3,     3),
            (  1,    63,    63,       64,   24,    24),
            (  1,   129,   129,       64,   24,    24),
            (  1,   400,   400,       64,   24,    24),
            (  1,   128,   128,       64,   32,    32),
            (  1,    64,   128,       64,   32,    32),
            (  1,    65,   128,       64,   32,     8),
            (  1,    64,   127,       64,   32,     8),
            (  1,    65,   127,       64,   32,     8),
            (  1,   127,    65,       64,   32,     8),
        ]
        shapes_128 = [
            # (  B,   qsl,   ksl, head_dim, n_qh, n_kvh)
            (  1,   128,   128,      128,   32,     8),
            (  1,    64,   128,      128,   32,     8),
            (  1,    65,   127,      128,   32,     8),
            (  1,   127,    65,      128,   32,     8),
        ]
        # Single-query shapes for a range of key lengths, with equal and with
        # fewer key/value heads.
        for ksl in [7, 9, 32, 63, 67, 129, 400, 2000]:
            shapes_128.append((1, 1, ksl, 128, 32, 32))
            shapes_128.append((1, 1, ksl, 128, 32, 8))
        # fmt: on

        shapes = shapes_64 + shapes_128
        dtypes = [mx.float16]
        # float32 is only added to the sweep when Metal is available.
        if mx.metal.is_available():
            dtypes.append(mx.float32)
        masks = [None, "additive", "bool", "causal"]
        transposes = (False, True)

        for dtype, t, mask_str, (B, qL, kL, D, qH, kH) in product(
            dtypes, transposes, masks, shapes
        ):
            with self.subTest(
                B=B,
                qsl=qL,
                ksl=kL,
                head_dim=D,
                n_q_heads=qH,
                n_kv_heads=kH,
                mask=mask_str,
                transpose=t,
                dtype=dtype,
            ):
                # prepare_inputs turns the "additive"/"bool" mask names into
                # arrays and passes None/"causal" through unchanged.
                q, k, v, scale, mask = prepare_inputs(
                    B, qL, kL, D, qH, kH, mask_str, t, dtype
                )

                out_ref = do_attention(mlx_ref_attn, q, k, v, scale, mask, t)

                out_fst = do_attention(
                    mx.fast.scaled_dot_product_attention,
                    q,
                    k,
                    v,
                    scale,
                    mask,
                    t,
                )

                # For causal mask when qL > kL, first qL-kL rows are undefined
                # Compare only the valid portion
                if mask_str == "causal" and qL > kL:
                    offset = qL - kL
                    if t:  # transpose=True: shape is (B, qL, qH, D)
                        out_ref = out_ref[:, offset:, :, :]
                        out_fst = out_fst[:, offset:, :, :]
                    else:  # transpose=False: shape is (B, qH, qL, D)
                        out_ref = out_ref[:, :, offset:, :]
                        out_fst = out_fst[:, :, offset:, :]

                atol = 2e-5 if dtype == mx.float32 else 3e-4

                self.assertListEqual(list(out_ref.shape), list(out_fst.shape))

                # Combined check: |fast - ref| <= atol + atol * |ref|, with
                # the same value used as absolute and relative tolerance.
                diff = mx.abs(out_fst - out_ref) - atol * mx.abs(out_ref)
                self.assertLessEqual(mx.max(diff).item(), atol)

    def test_sdpa_broadcast_mask(self):
        # A scalar boolean mask must be accepted and give the same result as
        # the primitive-based reference.
        mask = mx.array(True)
        head_dim, n_q, n_kv, seq_len = 64, 4, 1, 256
        scale = 1.0

        mx.random.seed(0)
        kv_shape = (1, n_kv, seq_len, head_dim)
        q = 5e-1 * mx.random.normal(shape=(1, n_q, seq_len, head_dim))
        k = 5e-1 * mx.random.normal(shape=kv_shape)
        v = 5e-1 * mx.random.normal(shape=kv_shape)
        expected = mlx_primitives_sdpa(q, k, v, scale, mask=mask)
        out = mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask=mask)
        self.assertTrue(mx.allclose(expected, out, atol=1e-4, rtol=1e-4))

    def test_sdpa_noncontiguous_inputs(self):
        # q, k and v are all produced by swapping axes 1 and 2 of freshly
        # generated arrays; the result must match mlx_ref_attn under an
        # all-True boolean mask.
        mask = mx.ones(shape=(4, 1, 7, 7), dtype=mx.bool_)
        mx.random.seed(0)

        def swapped_normal(shape):
            return mx.random.normal(shape=shape).swapaxes(1, 2)

        q = swapped_normal((4, 7, 32, 64))
        k = swapped_normal((4, 7, 8, 64))
        v = swapped_normal((4, 7, 8, 64))
        out = mx.fast.scaled_dot_product_attention(q, k, v, scale=1.0, mask=mask)
        expected = mlx_ref_attn(q, k, v, scale=1.0, mask=mask)
        self.assertTrue(mx.allclose(expected, out, atol=1e-4, rtol=1e-4))

    def test_sdpa_promote_mask(self):
        # A scalar bfloat16 additive mask combined with q/k/v created with the
        # default dtype must give the same result as the primitive-based
        # reference.
        mask = mx.array(2.0, mx.bfloat16)
        head_dim, n_q, n_kv, seq_len = 64, 4, 1, 256
        scale = 1.0

        mx.random.seed(0)
        kv_shape = (1, n_kv, seq_len, head_dim)
        q = 5e-1 * mx.random.normal(shape=(1, n_q, seq_len, head_dim))
        k = 5e-1 * mx.random.normal(shape=kv_shape)
        v = 5e-1 * mx.random.normal(shape=kv_shape)
        expected = mlx_primitives_sdpa(q, k, v, scale, mask=mask)
        out = mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, mask=mask)
        self.assertTrue(mx.allclose(expected, out, atol=1e-4, rtol=1e-4))

    def test_sdpa_nan_bug(self):
        # A windowed causal mask, first as booleans and then as its log
        # (an additive mask), must not produce NaNs and must stay within 1e-4
        # of mlx_ref_attn.
        N = 128
        qkv_shape = (1, 1, N, 128)
        q = mx.random.uniform(shape=qkv_shape)
        k = mx.random.uniform(shape=qkv_shape)
        v = mx.random.uniform(shape=qkv_shape)

        def check(mask):
            out = mx.fast.scaled_dot_product_attention(q, k, v, mask=mask, scale=1.0)
            expected = mlx_ref_attn(q, k, v, mask=mask, scale=1.0)
            self.assertFalse(mx.isnan(out).any().item())
            self.assertLessEqual(mx.abs(out - expected).max().item(), 1e-4)

        # Make boolean window causal mask
        positions = mx.arange(N)
        rows = positions[:, None]
        cols = positions[None]
        mask = (rows >= cols) & (rows <= cols + 111)
        check(mask)

        # And an additive one
        check(mx.log(mask))

    def test_sdpa_attention_sinks(self):
        # Checks argument validation for `sinks` (three cases that must raise
        # ValueError) and then compares against mlx_ref_attn over query
        # length, key length, key/value head count and dtype.
        B = 2
        N_q = N_kv = 8
        T_q = T_kv = 128
        D = 64

        q = mx.random.normal(shape=(B, N_q, T_q, D))
        k = mx.random.normal(shape=(B, N_kv, T_kv, D))
        v = mx.random.normal(shape=(B, N_kv, T_kv, D))
        scale = D**-0.5

        # Sinks created with the default dtype next to float16 q/k/v
        sinks = mx.random.normal(shape=(N_q,))
        with self.assertRaises(ValueError):
            mx.fast.scaled_dot_product_attention(
                q.astype(mx.float16),
                k.astype(mx.float16),
                v.astype(mx.float16),
                scale=scale,
                sinks=sinks,
            )

        # Wrong shapes: one element too many, then a scalar
        for bad_shape in ((N_q + 1,), ()):
            sinks = mx.random.normal(shape=bad_shape)
            with self.assertRaises(ValueError):
                mx.fast.scaled_dot_product_attention(q, k, v, scale=scale, sinks=sinks)

        sweep = product(
            (1, 128),
            (128, 4096),
            (2, 8),
            (mx.float16, mx.float32),
        )
        for T_q, T_kv, N_kv, dtype in sweep:
            with self.subTest(T_q=T_q, T_kv=T_kv, N_kv=N_kv, dtype=dtype):
                kv_shape = (B, N_kv, T_kv, D)
                q = mx.random.normal(shape=(B, N_q, T_q, D), dtype=dtype)
                k = mx.random.normal(shape=kv_shape, dtype=dtype)
                v = mx.random.normal(shape=kv_shape, dtype=dtype)
                sinks = 10 * mx.random.normal(shape=(N_q,), dtype=dtype)

                expected = mlx_ref_attn(q, k, v, scale, sinks=sinks)
                out = mx.fast.scaled_dot_product_attention(
                    q, k, v, scale=scale, sinks=sinks
                )
                atol = 1e-5 if dtype == mx.float32 else 1e-2
                self.assertTrue(mx.allclose(out, expected, atol=atol))

    def test_sdpa_grad(self):
        # Compares vjp outputs/cotangents and mx.grad results between
        # mlx_ref_attn and the fast kernel for float16 inputs, with equal and
        # with fewer key/value heads, and for each mask kind.
        # High tolerance due to cuDNN SDPA kernel requiring tf32.
        tolerance = {"rtol": 1e-2, "atol": 1e-2}

        def check_vjp(slow, fast, primals):
            cotangent = mx.ones_like(primals[0])
            out_slow, vjp_slow = mx.vjp(slow, primals, [cotangent])
            out_fast, vjp_fast = mx.vjp(fast, primals, [cotangent])

            self.assertTrue(mx.allclose(out_slow[0], out_fast[0], **tolerance))
            for idx in range(3):
                self.assertTrue(
                    mx.allclose(vjp_slow[idx], vjp_fast[idx], **tolerance)
                )

        def check_grad(slow, fast, args):
            grad_slow = mx.grad(slow)(*args)
            grad_fast = mx.grad(fast)(*args)

            self.assertTrue(mx.allclose(grad_slow, grad_fast, **tolerance))

        B, N_kv, T, D = (2, 8, 128, 64)
        scale = D**-0.5

        for N_q in (8, 32):
            q = mx.random.normal(shape=(B, N_q, T, D), dtype=mx.float16)
            k = mx.random.normal(shape=(B, N_kv, T, D), dtype=mx.float16)
            v = mx.random.normal(shape=(B, N_kv, T, D), dtype=mx.float16)

            mask_additive = mx.random.normal((B, N_q, T, T), dtype=mx.float16)
            mask_bool = mx.random.uniform(0, 1, (B, N_q, T, T), dtype=mx.float16) < 0.5

            for mask in (None, "causal", mask_additive, mask_bool):

                def sdpa_slow(q, k, v):
                    return mlx_ref_attn(q, k, v, scale=scale, mask=mask)

                def sdpa_fast(q, k, v):
                    return mx.fast.scaled_dot_product_attention(
                        q, k, v, scale=scale, mask=mask
                    )

                check_vjp(sdpa_slow, sdpa_fast, [q, k, v])

                def loss_slow(q, k, v):
                    return mlx_ref_attn(q, k, v, scale=scale, mask=mask).sum()

                def loss_fast(q, k, v):
                    return mx.fast.scaled_dot_product_attention(
                        q, k, v, scale=scale, mask=mask
                    ).sum()

                check_grad(loss_slow, loss_fast, [q, k, v])

    def test_sdpa_sliced(self):
        # k and v are prefixes (the first `offset` positions) sliced out of
        # longer arrays. The fast kernel is evaluated twice per configuration
        # and both results must match mlx_ref_attn.
        N = 8
        D = 64
        scale = D**-0.5

        configs = product(
            (1, 2, 4),
            (1, 8),
            (256, 512),
            (8, 9, 64, 79),
            (None, "causal"),
        )
        for B, T_q, T_kv, offset, mask in configs:
            with self.subTest(B=B, T_q=T_q, T_kv=T_kv, offset=offset, mask=mask):
                q = mx.random.normal((B, N, T_q, D), mx.float16)
                k = mx.random.normal((B, N, T_kv, D), mx.float16)[..., :offset, :]
                v = mx.random.normal((B, N, T_kv, D), mx.float16)[..., :offset, :]

                ref = mlx_ref_attn(q, k, v, scale=scale, mask=mask)

                # A tighter tolerance is used for the single-batch case.
                tol = 1e-3 if B == 1 else 1e-2
                tolerance = {"rtol": tol, "atol": tol}

                for _ in range(2):
                    out = mx.fast.scaled_dot_product_attention(
                        q, k, v, scale=scale, mask=mask
                    )
                    self.assertTrue(mx.allclose(ref, out, **tolerance))


# Run this module's tests through mlx_tests.MLXTestRunner when executed as a
# script; failfast=True is forwarded to the runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner(failfast=True)


================================================
FILE: python/tests/test_fft.py
================================================
# Copyright © 2023 Apple Inc.

import itertools
import unittest

import mlx.core as mx
import mlx_tests
import numpy as np

# PyTorch is optional: `has_torch` records whether the import succeeded so
# that tests needing it can be skipped instead of failing at import time.
try:
    import torch

    has_torch = True
except ImportError:
    has_torch = False


class TestFFT(mlx_tests.MLXTestCase):
    """Tests for ``mx.fft``.

    Forward values are compared against ``np.fft``; gradients are compared
    against ``torch.fft`` when PyTorch is installed.
    """

    def check_mx_np(self, op_mx, op_np, a_np, atol=1e-5, rtol=1e-6, **kwargs):
        """Apply ``op_np`` to ``a_np`` and ``op_mx`` to its MLX copy, then compare.

        Args:
            op_mx: Callable taking an ``mx.array`` as first argument.
            op_np: Callable taking the NumPy input as first argument.
            a_np: NumPy input; converted with ``mx.array`` for the MLX call.
            atol: Absolute tolerance for ``np.testing.assert_allclose``.
            rtol: Relative tolerance for ``np.testing.assert_allclose``.
            **kwargs: Forwarded unchanged to both ``op_mx`` and ``op_np``.

        Raises:
            AssertionError: If the two outputs differ beyond the tolerances.
        """
        out_np = op_np(a_np, **kwargs)
        a_mx = mx.array(a_np)
        out_mx = op_mx(a_mx, **kwargs)
        np.testing.assert_allclose(out_np, out_mx, atol=atol, rtol=rtol)

    def test_fft(self):
        """1D transforms: default call, ``n`` truncation/padding, and ``axis``."""
        r = np.random.rand(100).astype(np.float32)
        i = np.random.rand(100).astype(np.float32)
        a_np = r + 1j * i
        self.check_mx_np(mx.fft.fft, np.fft.fft, a_np)

        # Check with slicing and padding
        r = np.random.rand(100).astype(np.float32)
        i = np.random.rand(100).astype(np.float32)
        a_np = r + 1j * i
        self.check_mx_np(mx.fft.fft, np.fft.fft, a_np, n=80)
        self.check_mx_np(mx.fft.fft, np.fft.fft, a_np, n=120)

        # Check different axes
        r = np.random.rand(100, 100).astype(np.float32)
        i = np.random.rand(100, 100).astype(np.float32)
        a_np = r + 1j * i
        self.check_mx_np(mx.fft.fft, np.fft.fft, a_np, axis=0)
        self.check_mx_np(mx.fft.fft, np.fft.fft, a_np, axis=1)

        # Check real fft
        a_np = np.random.rand(100).astype(np.float32)
        self.check_mx_np(mx.fft.rfft, np.fft.rfft, a_np)
        self.check_mx_np(mx.fft.rfft, np.fft.rfft, a_np, n=80)
        self.check_mx_np(mx.fft.rfft, np.fft.rfft, a_np, n=120)

        # Check complex inverse
        r = np.random.rand(100, 100).astype(np.float32)
        i = np.random.rand(100, 100).astype(np.float32)
        a_np = r + 1j * i
        self.check_mx_np(mx.fft.ifft, np.fft.ifft, a_np)
        self.check_mx_np(mx.fft.ifft, np.fft.ifft, a_np, n=80)
        self.check_mx_np(mx.fft.ifft, np.fft.ifft, a_np, n=120)

        # Check real inverse, fed with the output of NumPy's rfft
        x = np.fft.rfft(np.real(a_np))
        self.check_mx_np(mx.fft.irfft, np.fft.irfft, x)

    def test_fftn(self):
        """2D / ND transforms over several ``axes`` and ``s`` combinations."""
        r = np.random.randn(8, 8, 8).astype(np.float32)
        i = np.random.randn(8, 8, 8).astype(np.float32)
        a = r + 1j * i

        axes = [None, (1, 2), (2, 1), (0, 2)]
        shapes = [None, (10, 5), (5, 10)]
        ops = [
            "fft2",
            "ifft2",
            "rfft2",
            "irfft2",
            "fftn",
            "ifftn",
            "rfftn",
            "irfftn",
        ]

        for op, ax, s in itertools.product(ops, axes, shapes):
            # An explicit output shape is only tested with explicit axes.
            if ax is None and s is not None:
                continue
            x = a
            if op in ["rfft2", "rfftn"]:
                # Real-input transforms take the real array.
                x = r
            elif op == "irfft2":
                # Inverse real transforms take NumPy's forward real output.
                x = np.ascontiguousarray(np.fft.rfft2(r, axes=ax, s=s))
            elif op == "irfftn":
                x = np.ascontiguousarray(np.fft.rfftn(r, axes=ax, s=s))
            mx_op = getattr(mx.fft, op)
            np_op = getattr(np.fft, op)
            self.check_mx_np(mx_op, np_op, x, axes=ax, s=s)

        # Explicitly exercise transposed layouts and axes that are not
        # physically last in memory order.
        xt = np.transpose(a, (1, 2, 0))
        self.check_mx_np(mx.fft.fftn, np.fft.fftn, xt, axes=(2, 0))
        self.check_mx_np(mx.fft.ifftn, np.fft.ifftn, xt, axes=(2, 0))

        rt = np.transpose(r, (1, 2, 0))
        self.check_mx_np(mx.fft.rfftn, np.fft.rfftn, rt, axes=(2, 0))
        irfft_in = np.ascontiguousarray(np.fft.rfftn(rt, axes=(2, 0)))
        self.check_mx_np(mx.fft.irfftn, np.fft.irfftn, irfft_in, axes=(2, 0))

    def _run_ffts(self, shape, atol=1e-4, rtol=1e-4):
        """Check fft, ifft, rfft and irfft on seeded random data of ``shape``.

        The NumPy seed is reset on every call, so inputs depend only on
        ``shape``.
        """
        np.random.seed(9)

        r = np.random.rand(*shape).astype(np.float32)
        i = np.random.rand(*shape).astype(np.float32)
        a_np = r + 1j * i
        self.check_mx_np(mx.fft.fft, np.fft.fft, a_np, atol=atol, rtol=rtol)
        self.check_mx_np(mx.fft.ifft, np.fft.ifft, a_np, atol=atol, rtol=rtol)

        self.check_mx_np(mx.fft.rfft, np.fft.rfft, r, atol=atol, rtol=rtol)

        # irfft is checked both with the original length given explicitly
        # and with the default output length.
        ia_np = np.fft.rfft(r)
        self.check_mx_np(
            mx.fft.irfft, np.fft.irfft, ia_np, atol=atol, rtol=rtol, n=shape[-1]
        )
        self.check_mx_np(mx.fft.irfft, np.fft.irfft, ia_np, atol=atol, rtol=rtol)

    def test_fft_shared_mem(self):
        """Run the 1D checks over a curated list of transform lengths."""
        nums = np.concatenate(
            [
                # small radix
                np.arange(2, 14),
                # powers of 2
                [2**k for k in range(4, 13)],
                # stockham
                [3 * 3 * 3, 3 * 11, 11 * 13 * 2, 7 * 4 * 13 * 11, 13 * 13 * 11],
                # rader
                [17, 23, 29, 17 * 8 * 3, 23 * 2, 1153, 1982],
                # bluestein
                [47, 83, 17 * 17],
                # large stockham
                [3159, 3645, 3969, 4004],
            ]
        )
        for batch_size in (1, 3, 32):
            for num in nums:
                # Lengths above 1024 get a looser absolute tolerance.
                atol = 1e-4 if num < 1025 else 1e-3
                self._run_ffts((batch_size, num), atol=atol)

    @unittest.skip("Too slow for CI but useful for local testing.")
    def test_fft_exhaustive(self):
        """Run the 1D checks for every length from 2 to 4096 (skipped)."""
        nums = range(2, 4097)
        for batch_size in (1, 3, 32):
            for num in nums:
                print(num)
                atol = 1e-4 if num < 1025 else 1e-3
                self._run_ffts((batch_size, num), atol=atol)

    def test_fft_big_powers_of_two(self):
        """Run the 1D checks for lengths 2**12 .. 2**19 with loosened tolerances."""
        # TODO: improve precision on big powers of two on GPU
        for k in range(12, 17):
            self._run_ffts((3, 2**k), atol=1e-3)

        for k in range(17, 20):
            self._run_ffts((3, 2**k), atol=1e-2)

    def test_fft_large_numbers(self):
        """Run the 1D checks for a handful of awkward, larger lengths."""
        numbers = [
            1037,  # 17 * 61
            18247,  # medium size prime factors
            1259 * 11,  # large prime factors
            7883,  # large prime
            3**8,  # large stockham decomposable
            3109,  # bluestein
            4006,  # large rader
        ]
        for large_num in numbers:
            self._run_ffts((1, large_num), atol=1e-3)

    def test_fft_contiguity(self):
        """FFT on strided, broadcast and tiled inputs must match NumPy."""
        r = np.random.rand(4, 8).astype(np.float32)
        i = np.random.rand(4, 8).astype(np.float32)
        a_np = r + 1j * i
        a_mx = mx.array(a_np)

        # non-contiguous in the FFT dim
        out_mx = mx.fft.fft(a_mx[:, ::2])
        out_np = np.fft.fft(a_np[:, ::2])
        np.testing.assert_allclose(out_np, out_mx, atol=1e-5, rtol=1e-5)

        # non-contiguous not in the FFT dim
        out_mx = mx.fft.fft(a_mx[::2])
        out_np = np.fft.fft(a_np[::2])
        np.testing.assert_allclose(out_np, out_mx, atol=1e-5, rtol=1e-5)

        # Build a transposed + broadcast input and make sure both libraries
        # agree on it before transforming it.
        out_mx = mx.broadcast_to(mx.reshape(mx.transpose(a_mx), (4, 8, 1)), (4, 8, 16))
        out_np = np.broadcast_to(np.reshape(np.transpose(a_np), (4, 8, 1)), (4, 8, 16))
        np.testing.assert_allclose(out_np, out_mx, atol=1e-5, rtol=1e-5)

        out2_mx = mx.fft.fft(mx.abs(out_mx) + 4)
        out2_np = np.fft.fft(np.abs(out_np) + 4)
        np.testing.assert_allclose(out2_mx, out2_np, atol=1e-5, rtol=1e-5)

        # Tiled input
        b_np = np.array([[0, 1, 2, 3]])
        out_mx = mx.abs(mx.fft.fft(mx.tile(mx.reshape(mx.array(b_np), (1, 4)), (4, 1))))
        out_np = np.abs(np.fft.fft(np.tile(np.reshape(np.array(b_np), (1, 4)), (4, 1))))
        np.testing.assert_allclose(out_mx, out_np, atol=1e-5, rtol=1e-5)

    def test_fft_into_ifft(self):
        """An MLX fft result fed to MLX ifft must match NumPy's ifft of it."""
        n_fft = 8193
        mx.random.seed(0)

        segment = mx.random.normal(shape=[1, n_fft]) + 1j * mx.random.normal(
            shape=(1, n_fft)
        )
        segment = mx.fft.fft(segment, n=n_fft)
        r = mx.fft.ifft(segment, n=n_fft)
        r_np = np.fft.ifft(segment, n=n_fft)
        self.assertTrue(np.allclose(r, r_np, atol=1e-5, rtol=1e-5))

    def test_fft_throws(self):
        """``irfftn`` on a scalar array must raise ``ValueError``."""
        x = mx.array(3.0)
        with self.assertRaises(ValueError):
            mx.fft.irfftn(x)

    def test_fftshift(self):
        """``fftshift`` must match NumPy for several shapes, axes and dtypes."""
        # Test 1D arrays
        r = np.random.rand(100).astype(np.float32)
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, r)

        # Test with specific axis
        r = np.random.rand(4, 6).astype(np.float32)
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, r, axes=[0])
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, r, axes=[1])
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, r, axes=[0, 1])

        # Test with negative axes
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, r, axes=[-1])

        # Test with odd lengths
        r = np.random.rand(5, 7).astype(np.float32)
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, r)
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, r, axes=[0])

        # Test with complex input
        r = np.random.rand(8, 8).astype(np.float32)
        i = np.random.rand(8, 8).astype(np.float32)
        c = r + 1j * i
        self.check_mx_np(mx.fft.fftshift, np.fft.fftshift, c)

    def test_ifftshift(self):
        """``ifftshift`` must match NumPy for several shapes, axes and dtypes."""
        # Test 1D arrays
        r = np.random.rand(100).astype(np.float32)
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, r)

        # Test with specific axis
        r = np.random.rand(4, 6).astype(np.float32)
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, r, axes=[0])
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, r, axes=[1])
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, r, axes=[0, 1])

        # Test with negative axes
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, r, axes=[-1])

        # Test with odd lengths
        r = np.random.rand(5, 7).astype(np.float32)
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, r)
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, r, axes=[0])

        # Test with complex input
        r = np.random.rand(8, 8).astype(np.float32)
        i = np.random.rand(8, 8).astype(np.float32)
        c = r + 1j * i
        self.check_mx_np(mx.fft.ifftshift, np.fft.ifftshift, c)

    def test_fftshift_errors(self):
        """Out-of-range axes raise; an empty array is returned unchanged."""
        # Test invalid axes
        x = mx.array(np.random.rand(4, 4).astype(np.float32))
        with self.assertRaises(ValueError):
            mx.fft.fftshift(x, axes=[2])
        with self.assertRaises(ValueError):
            mx.fft.fftshift(x, axes=[-3])

        # Test empty array
        x = mx.array([])
        self.assertTrue(mx.array_equal(mx.fft.fftshift(x), x))

    @unittest.skipIf(not has_torch, "requires PyTorch")
    def test_fft_grads(self):
        """Values and gradients of ``sum(|fft(x)|**2)`` must match PyTorch.

        Every combination of real/complex, forward/inverse, transform axes
        and input shape is checked with a relative error bound of 1e-4.
        """
        real = [True, False]
        inverse = [True, False]
        axes = [
            (-1,),
            (-2, -1),
        ]
        shapes = [
            (4, 4),
            (2, 4),
            (2, 7),
            (7, 7),
        ]

        # Keyed by (real, inverse).
        mxffts = {
            (True, True): mx.fft.irfftn,
            (True, False): mx.fft.rfftn,
            (False, True): mx.fft.ifftn,
            (False, False): mx.fft.fftn,
        }
        tffts = {
            (True, True): torch.fft.irfftn,
            (True, False): torch.fft.rfftn,
            (False, True): torch.fft.ifftn,
            (False, False): torch.fft.fftn,
        }

        for r, i, ax, sh in itertools.product(real, inverse, axes, shapes):
            with self.subTest(real=r, inverse=i, axes=ax, shape=sh):
                mx_op = mxffts[r, i]
                torch_op = tffts[r, i]

                # The transform axes are passed through explicitly so that
                # each entry of `axes` is really exercised.
                def f(x):
                    y = mx_op(x, axes=ax)
                    return (mx.abs(y) ** 2).sum()

                def g(x):
                    y = torch_op(x, dim=ax)
                    return (torch.abs(y) ** 2).sum()

                if r and not i:
                    # Only the forward real transform takes real input.
                    x = mx.random.normal(sh)
                else:
                    # Reinterpret trailing float pairs as complex64 values.
                    x = mx.random.normal((*sh, 2)).view(mx.complex64).squeeze()
                fx = f(x)
                gx = g(torch.tensor(x))
                self.assertLess((fx - gx).abs().max() / gx.abs().mean(), 1e-4)

                dfdx = mx.grad(f)(x)
                dgdx = torch.func.grad(g)(torch.tensor(x))
                self.assertLess((dfdx - dgdx).abs().max() / dgdx.abs().mean(), 1e-4)


# When executed directly as a script, hand control to the MLX test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_graph.py
================================================
# Copyright © 2023 Apple Inc.

import io
import unittest

import mlx.core as mx
import mlx_tests


class TestGraph(mlx_tests.MLXTestCase):
    """Smoke tests for ``mx.export_to_dot``."""

    def test_to_dot(self):
        """Exporting a graph must write a non-empty string.

        The DOT contents are deliberately not inspected so the output
        format stays free to change.
        """

        def exported_text(*outputs):
            # Export into an in-memory text buffer and return what was written.
            buffer = io.StringIO()
            mx.export_to_dot(buffer, *outputs)
            buffer.seek(0)
            return buffer.read()

        # A lone constant.
        lhs = mx.array(1.0)
        self.assertTrue(len(exported_text(lhs)) > 0)

        # A single binary operation.
        rhs = mx.array(2.0)
        total = lhs + rhs
        self.assertTrue(len(exported_text(total)) > 0)

        # A primitive with several outputs, all passed to the exporter.
        quotient_and_remainder = mx.divmod(lhs, rhs)
        self.assertTrue(len(exported_text(*quotient_and_remainder)) > 0)


# When executed directly as a script, hand control to the MLX test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_init.py
================================================
# Copyright © 2023 Apple Inc.
import unittest

import mlx.core as mx
import mlx.nn.init as init
import mlx_tests
import numpy as np


class TestInit(mlx_tests.MLXTestCase):
    """Tests for the initializers in ``mlx.nn.init``.

    Most tests only verify that the initializer returns an array with the
    shape of its input and the requested dtype.
    """

    def _check_shape_and_dtype(self, initializer, template, shape, dtype):
        """Apply ``initializer`` to ``template``; assert shape/dtype; return it."""
        result = initializer(template)
        self.assertEqual(result.shape, shape)
        self.assertEqual(result.dtype, dtype)
        return result

    def test_constant(self):
        """``init.constant`` fills the output with the requested value."""
        value = 5.0

        for dtype in [mx.float32, mx.float16]:
            initializer = init.constant(value, dtype)
            for shape in [(3,), (3, 3), (3, 3, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    result = self._check_shape_and_dtype(
                        initializer, mx.zeros(shape), shape, dtype
                    )
                    # The input is all zeros, so this only passes if the
                    # initializer actually wrote `value`.
                    self.assertTrue(mx.all(result == value).item())

    def test_normal(self):
        """``init.normal`` preserves shape and honours ``dtype``."""
        mean = 0.0
        std = 1.0
        for dtype in [mx.float32, mx.float16]:
            initializer = init.normal(mean, std, dtype=dtype)
            for shape in [(3,), (3, 3), (3, 3, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    self._check_shape_and_dtype(
                        initializer, mx.array(np.empty(shape)), shape, dtype
                    )

    def test_uniform(self):
        """``init.uniform`` output lies within ``[low, high]``."""
        low = -1.0
        high = 1.0

        for dtype in [mx.float32, mx.float16]:
            initializer = init.uniform(low, high, dtype)
            for shape in [(3,), (3, 3), (3, 3, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    result = self._check_shape_and_dtype(
                        initializer, mx.array(np.empty(shape)), shape, dtype
                    )
                    self.assertTrue(mx.all(result >= low) and mx.all(result <= high))

    def test_identity(self):
        """``init.identity`` yields the identity for a square input and raises
        ``ValueError`` for a non-square one."""
        for dtype in [mx.float32, mx.float16]:
            initializer = init.identity(dtype)
            with self.subTest(dtype=dtype):
                result = initializer(mx.zeros((3, 3)))
                self.assertTrue(mx.array_equal(result, mx.eye(3)))
                self.assertEqual(result.dtype, dtype)
                with self.assertRaises(ValueError):
                    initializer(mx.zeros((3, 2)))

    def test_glorot_normal(self):
        """``init.glorot_normal`` preserves shape and honours ``dtype``."""
        for dtype in [mx.float32, mx.float16]:
            initializer = init.glorot_normal(dtype)
            for shape in [(3, 3), (3, 3, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    self._check_shape_and_dtype(
                        initializer, mx.array(np.empty(shape)), shape, dtype
                    )

    def test_glorot_uniform(self):
        """``init.glorot_uniform`` preserves shape and honours ``dtype``."""
        for dtype in [mx.float32, mx.float16]:
            initializer = init.glorot_uniform(dtype)
            for shape in [(3, 3), (3, 3, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    self._check_shape_and_dtype(
                        initializer, mx.array(np.empty(shape)), shape, dtype
                    )

    def test_he_normal(self):
        """``init.he_normal`` preserves shape and honours ``dtype``."""
        for dtype in [mx.float32, mx.float16]:
            initializer = init.he_normal(dtype)
            for shape in [(3, 3), (3, 3, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    self._check_shape_and_dtype(
                        initializer, mx.array(np.empty(shape)), shape, dtype
                    )

    def test_he_uniform(self):
        """``init.he_uniform`` preserves shape and honours ``dtype``."""
        for dtype in [mx.float32, mx.float16]:
            initializer = init.he_uniform(dtype)
            for shape in [(3, 3), (3, 3, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    self._check_shape_and_dtype(
                        initializer, mx.array(np.empty(shape)), shape, dtype
                    )

    def test_sparse(self):
        """``init.sparse`` zeroes at least the requested fraction of entries
        and raises ``ValueError`` for a 1D input."""
        mean = 0.0
        std = 1.0
        sparsity = 0.5
        for dtype in [mx.float32, mx.float16]:
            initializer = init.sparse(sparsity, mean, std, dtype=dtype)
            for shape in [(3, 2), (2, 2), (4, 3)]:
                with self.subTest(dtype=dtype, shape=shape):
                    result = self._check_shape_and_dtype(
                        initializer, mx.array(np.empty(shape)), shape, dtype
                    )
                    num_zeros = mx.sum(result == 0).item()
                    self.assertGreaterEqual(num_zeros, sparsity * shape[0] * shape[1])
            with self.assertRaises(ValueError):
                initializer(mx.zeros((1,)))

    def test_orthogonal(self):
        """``init.orthogonal`` output has orthonormal rows (square input) or
        orthonormal columns (input with more rows than columns)."""
        initializer = init.orthogonal(gain=1.0, dtype=mx.float32)

        # Test with a square matrix
        shape = (4, 4)
        result = initializer(mx.zeros(shape, dtype=mx.float32))
        self.assertEqual(result.shape, shape)
        self.assertEqual(result.dtype, mx.float32)

        I = result @ result.T
        eye = mx.eye(shape[0], dtype=mx.float32)
        self.assertTrue(
            mx.allclose(I, eye, atol=1e-5), "Orthogonal init failed on a square matrix."
        )

        # Test with a rectangular matrix: more rows than cols
        shape = (6, 4)
        result = initializer(mx.zeros(shape, dtype=mx.float32))
        self.assertEqual(result.shape, shape)
        self.assertEqual(result.dtype, mx.float32)

        # With more rows than columns only result.T @ result is checked.
        I = result.T @ result
        eye = mx.eye(shape[1], dtype=mx.float32)
        self.assertTrue(
            mx.allclose(I, eye, atol=1e-5),
            "Orthogonal init failed on a rectangular matrix.",
        )


# When executed directly as a script, hand control to the MLX test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_linalg.py
================================================
# Copyright © 2023 Apple Inc.

import itertools
import math
import unittest

import mlx.core as mx
import mlx_tests
import numpy as np


class TestLinalg(mlx_tests.MLXTestCase):
    def test_norm(self):
        """``mx.linalg.norm`` must agree with ``np.linalg.norm``.

        Three groups are covered: explicit axes with every supported
        ``ord``, ``ord`` without axes, and neither ``ord`` nor axes.
        """
        vector_ords = [None, 0.5, 0, 1, 2, 3, -1, float("inf"), -float("inf")]
        matrix_ords = [None, "fro", "nuc", -1, 1, -2, 2, float("inf"), -float("inf")]
        tol = {"atol": 1e-5, "rtol": 1e-6}
        keepdims_options = [True, False]

        # Explicit axes: one axis selects vector norms, two select matrix norms.
        for shape in [(3,), (2, 3), (2, 3, 3)]:
            count = math.prod(shape)
            ndim = len(shape)
            x_mx = mx.arange(1, count + 1, dtype=mx.float32).reshape(shape)
            x_np = np.arange(1, count + 1, dtype=np.float32).reshape(shape)
            for num_axes in range(1, ndim):
                ords = vector_ords if num_axes == 1 else matrix_ords
                axis_choices = itertools.combinations(range(ndim), num_axes)
                for axis, keepdims, o in itertools.product(
                    axis_choices, keepdims_options, ords
                ):
                    # These orders are evaluated on the CPU; everything
                    # else runs on the default device.
                    if o in ["nuc", -2, 2]:
                        stream = mx.cpu
                    else:
                        stream = mx.default_device()
                    expected = np.linalg.norm(x_np, ord=o, axis=axis, keepdims=keepdims)
                    actual = mx.linalg.norm(
                        x_mx, ord=o, axis=axis, keepdims=keepdims, stream=stream
                    )
                    with self.subTest(shape=shape, ord=o, axis=axis, keepdims=keepdims):
                        self.assertTrue(np.allclose(expected, actual, **tol))

        # Only `ord` given, no axis.
        for shape in [(3,), (2, 3)]:
            count = math.prod(shape)
            x_mx = mx.arange(1, count + 1).reshape(shape)
            x_np = np.arange(1, count + 1).reshape(shape)
            for o, keepdims in itertools.product(
                [None, 1, -1, float("inf"), -float("inf")], keepdims_options
            ):
                expected = np.linalg.norm(x_np, ord=o, keepdims=keepdims)
                actual = mx.linalg.norm(x_mx, ord=o, keepdims=keepdims)
                with self.subTest(shape=shape, ord=o, keepdims=keepdims):
                    self.assertTrue(np.allclose(expected, actual, **tol))

        # Neither `ord` nor axis given.
        for shape in [(3,), (2, 3), (2, 3, 3)]:
            count = math.prod(shape)
            x_mx = mx.arange(1, count + 1).reshape(shape)
            x_np = np.arange(1, count + 1).reshape(shape)
            for keepdims in keepdims_options:
                expected = np.linalg.norm(x_np, keepdims=keepdims)
                actual = mx.linalg.norm(x_mx, keepdims=keepdims)
                with self.subTest(shape=shape, keepdims=keepdims):
                    self.assertTrue(np.allclose(expected, actual, **tol))

    def test_complex_norm(self):
        """Norms of complex inputs must agree with ``np.linalg.norm``."""
        tol = {"atol": 1e-5, "rtol": 1e-6}

        def random_complex(shape):
            # Real part is drawn first, then the imaginary part.
            real_part = np.random.uniform(size=shape).astype(np.float32)
            imag_part = np.random.uniform(size=shape).astype(np.float32)
            return real_part + 1j * imag_part

        for shape in [(3,), (2, 3), (2, 3, 3)]:
            x_np = random_complex(shape)
            x_mx = mx.array(x_np)

            # Default norm over the whole array.
            expected = np.linalg.norm(x_np)
            actual = mx.linalg.norm(x_mx)
            with self.subTest(shape=shape):
                self.assertTrue(np.allclose(expected, actual, **tol))

            # Default norm over every proper subset of axes.
            ndim = len(shape)
            for num_axes in range(1, ndim):
                for axis in itertools.combinations(range(ndim), num_axes):
                    expected = np.linalg.norm(x_np, axis=axis)
                    actual = mx.linalg.norm(x_mx, axis=axis)
                    with self.subTest(shape=shape, axis=axis):
                        self.assertTrue(np.allclose(expected, actual, **tol))

        # Frobenius norm of a complex square matrix.
        x_np = random_complex((4, 4))
        x_mx = mx.array(x_np)
        expected = np.linalg.norm(x_np, ord="fro")
        actual = mx.linalg.norm(x_mx, ord="fro")
        self.assertTrue(np.allclose(expected, actual, **tol))

    def test_qr_factorization(self):
        """QR factors must rebuild the input, with orthonormal Q columns and
        an upper-triangular R; invalid inputs must raise ``ValueError``."""
        rejected_inputs = (
            mx.array(0.0),
            mx.array([0.0, 1.0]),
            mx.array([[0, 1], [1, 0]]),
        )
        for bad in rejected_inputs:
            with self.assertRaises(ValueError):
                mx.linalg.qr(bad)

        def assert_square_qr(mat, q, r):
            # Q @ R rebuilds the input, Q.T @ Q is the identity and R has
            # nothing below its main diagonal.
            self.assertTrue(mx.allclose(q @ r, mat))
            self.assertTrue(mx.allclose(q.T @ q, mx.eye(2), rtol=1e-5, atol=1e-7))
            self.assertTrue(mx.allclose(mx.tril(r, -1), mx.zeros_like(r)))

        first = mx.array([[2.0, 3.0], [1.0, 2.0]])
        Q, R = mx.linalg.qr(first, stream=mx.cpu)
        assert_square_qr(first, Q, R)
        self.assertEqual(Q.dtype, mx.float32)
        self.assertEqual(R.dtype, mx.float32)

        # Multiple matrices
        second = mx.array([[-1.0, 2.0], [-4.0, 1.0]])
        batch = mx.stack([first, second])
        Qs, Rs = mx.linalg.qr(batch, stream=mx.cpu)
        for mat, q, r in zip(batch, Qs, Rs):
            assert_square_qr(mat, q, r)

        # Non square matrices
        loose = {"rtol": 1e-4, "atol": 1e-6}
        for shape in [(4, 8), (8, 4)]:
            mat = mx.random.uniform(shape=shape)
            Q, R = mx.linalg.qr(mat, stream=mx.cpu)
            self.assertTrue(mx.allclose(Q @ R, mat, **loose))
            gram = Q.T @ Q
            self.assertTrue(mx.allclose(gram, mx.eye(min(mat.shape)), **loose))

    def test_svd_decomposition(self):
        """SVD factors must rebuild the input, and the singular values must
        reproduce its Frobenius norm, for float32, float64 and complex64."""
        rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
        tol = {"rtol": 1e-5, "atol": 1e-7}

        def rebuild(u, s, vt):
            # Only the leading len(s) columns of U take part in the product.
            return u[:, : len(s)] @ mx.diag(s) @ vt

        A = mx.array(rows, dtype=mx.float32)
        U, S, Vt = mx.linalg.svd(A, compute_uv=True, stream=mx.cpu)
        self.assertTrue(mx.allclose(rebuild(U, S, Vt), A, **tol))

        S = mx.linalg.svd(A, compute_uv=False, stream=mx.cpu)
        self.assertTrue(
            mx.allclose(mx.linalg.norm(S), mx.linalg.norm(A, ord="fro"), **tol)
        )

        # Multiple matrices
        B = A + 10.0
        AB = mx.stack([A, B])
        Us, Ss, Vts = mx.linalg.svd(AB, compute_uv=True, stream=mx.cpu)
        for M, U, S, Vt in zip([A, B], Us, Ss, Vts):
            self.assertTrue(mx.allclose(rebuild(U, S, Vt), M, **tol))

        Ss = mx.linalg.svd(AB, compute_uv=False, stream=mx.cpu)
        for M, S in zip([A, B], Ss):
            self.assertTrue(
                mx.allclose(mx.linalg.norm(S), mx.linalg.norm(M, ord="fro"), **tol)
            )

        # Test float64 - use CPU stream since float64 is not supported on GPU
        with mx.stream(mx.cpu):
            A_f64 = mx.array(rows, dtype=mx.float64)
            U_f64, S_f64, Vt_f64 = mx.linalg.svd(A_f64, compute_uv=True)
            mx.eval(U_f64, S_f64, Vt_f64)
            self.assertTrue(mx.allclose(rebuild(U_f64, S_f64, Vt_f64), A_f64, **tol))
            self.assertEqual(S_f64.dtype, mx.float64)

        # Test complex64 - use CPU stream since complex64 is not supported on GPU
        with mx.stream(mx.cpu):
            A_c64 = mx.array(
                [[1.0 + 1j, 2.0 + 2j], [3.0 + 3j, 4.0 + 4j]], dtype=mx.complex64
            )
            U_c64, S_c64, Vt_c64 = mx.linalg.svd(A_c64, compute_uv=True)
            mx.eval(U_c64, S_c64, Vt_c64)
            self.assertTrue(mx.allclose(rebuild(U_c64, S_c64, Vt_c64), A_c64, **tol))
            # Singular values are real; the factors keep the complex dtype.
            self.assertEqual(S_c64.dtype, mx.float32)
            self.assertEqual(U_c64.dtype, mx.complex64)
            self.assertEqual(Vt_c64.dtype, mx.complex64)

    def test_inverse(self):
        """``M @ inv(M)`` must be the identity for single and batched input."""
        base = mx.array([[1, 2, 3], [6, -5, 4], [-9, 8, 7]], dtype=mx.float32)
        base_inv = mx.linalg.inv(base, stream=mx.cpu)
        identity = mx.eye(base.shape[0])
        self.assertTrue(mx.allclose(base @ base_inv, identity, rtol=0, atol=1e-6))

        # Multiple matrices
        shifted = base - 100
        batch = mx.stack([base, shifted])
        batch_inv = mx.linalg.inv(batch, stream=mx.cpu)
        for mat, mat_inv in zip(batch, batch_inv):
            product = mat @ mat_inv
            self.assertTrue(
                mx.allclose(product, mx.eye(mat.shape[0]), rtol=0, atol=1e-5)
            )

    def test_tri_inverse(self):
        """``tri_inv`` must invert triangular matrices and return a result
        confined to the requested triangle."""
        lower_a = mx.array([[1, 0, 0], [6, -5, 0], [-9, 8, 7]], dtype=mx.float32)
        lower_b = mx.array([[7, 0, 0], [3, -2, 0], [1, 8, 3]], dtype=mx.float32)

        for upper in (False, True):
            # The upper-triangular inputs are the transposes of the lower ones.
            if upper:
                pair = [lower_a.T, lower_b.T]
            else:
                pair = [lower_a, lower_b]
            batch = mx.stack(pair)
            batch_inv = mx.linalg.tri_inv(batch, upper=upper, stream=mx.cpu)
            for mat, mat_inv in zip(batch, batch_inv):
                identity = mx.eye(mat.shape[0])
                self.assertTrue(
                    mx.allclose(mat @ mat_inv, identity, rtol=0, atol=1e-5)
                )

        # Ensure that tri_inv will 0-out the supposedly 0 triangle
        dense = mx.random.normal((2, 8, 8))
        from_upper = mx.linalg.tri_inv(dense, upper=True, stream=mx.cpu)
        from_lower = mx.linalg.tri_inv(dense, upper=False, stream=mx.cpu)
        self.assertTrue(mx.all(from_upper == mx.triu(from_upper)))
        self.assertTrue(mx.all(from_lower == mx.tril(from_lower)))

    def test_cholesky(self):
        """Lower and upper Cholesky factors must rebuild the input matrix."""
        tol = {"rtol": 1e-5, "atol": 1e-7}

        root = mx.array(
            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype=mx.float32
        )
        # Input is built as a Gram matrix, scaled down by 81.
        A = root.T @ root / 81

        lower = mx.linalg.cholesky(A, stream=mx.cpu)
        upper = mx.linalg.cholesky(A, upper=True, stream=mx.cpu)
        self.assertTrue(mx.allclose(lower @ lower.T, A, **tol))
        self.assertTrue(mx.allclose(upper.T @ upper, A, **tol))

        # Multiple matrices
        B = A + 1 / 9
        batch = mx.stack([A, B])
        factors = mx.linalg.cholesky(batch, stream=mx.cpu)
        for mat, factor in zip(batch, factors):
            self.assertTrue(mx.allclose(factor @ factor.T, mat, **tol))

    def test_pseudo_inverse(self):
        """``M @ pinv(M) @ M`` must reproduce ``M``, including when ``M`` is
        singular."""
        base = mx.array([[1, 2, 3], [6, -5, 4], [-9, 8, 7]], dtype=mx.float32)
        base_pinv = mx.linalg.pinv(base, stream=mx.cpu)
        round_trip = base @ base_pinv @ base
        self.assertTrue(mx.allclose(round_trip, base, rtol=0, atol=1e-5))

        # Multiple matrices
        shifted = base - 100
        batch = mx.stack([base, shifted])
        batch_pinv = mx.linalg.pinv(batch, stream=mx.cpu)
        for mat, mat_pinv in zip(batch, batch_pinv):
            round_trip = mat @ mat_pinv @ mat
            self.assertTrue(mx.allclose(round_trip, mat, rtol=0, atol=1e-3))

        # Test singular matrix
        singular = mx.array([[4.0, 1.0], [4.0, 1.0]])
        singular_pinv = mx.linalg.pinv(singular, stream=mx.cpu)
        self.assertTrue(mx.allclose(singular @ singular_pinv @ singular, singular))

    def test_cholesky_inv(self):
        """``cholesky_inv`` applied to a Cholesky factor of ``A`` must give a
        matrix whose product with ``A`` is the identity.

        Both the lower and the upper factor are checked, for a single
        matrix and for a stacked pair.
        """
        mx.random.seed(7)

        # Seeded random Gram matrix used as the input.
        N = 3
        A = mx.random.uniform(shape=(N, N))
        A = A @ A.T

        for upper in (False, True):
            L = mx.linalg.cholesky(A, upper=upper, stream=mx.cpu)
            A_inv = mx.linalg.cholesky_inv(L, upper=upper, stream=mx.cpu)
            self.assertTrue(mx.allclose(A @ A_inv, mx.eye(N), atol=1e-4))

        # Multiple matrices
        B = A + 1 / 9
        AB = mx.stack([A, B])
        for upper in (False, True):
            Ls = mx.linalg.cholesky(AB, upper=upper, stream=mx.cpu)
            AB_inv = mx.linalg.cholesky_inv(Ls, upper=upper, stream=mx.cpu)
            for M, M_inv in zip(AB, AB_inv):
                self.assertTrue(mx.allclose(M @ M_inv, mx.eye(N), atol=1e-4))

    def test_cross_product(self):
        """``mx.linalg.cross`` must agree with ``np.cross`` and must reject
        vectors of the wrong size."""
        # Each entry is (lhs, rhs, extra keyword arguments for both calls).
        cases = [
            # Plain float vectors
            (mx.array([1.0, 2.0, 3.0]), mx.array([4.0, 5.0, 6.0]), {}),
            # Negative values
            (mx.array([-1.0, -2.0, -3.0]), mx.array([4.0, -5.0, 6.0]), {}),
            # Integer values
            (mx.array([1, 2, 3]), mx.array([4, 5, 6]), {}),
            # 2D arrays with an explicit axis
            (
                mx.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]),
                mx.array([[4.0, 5.0, 6.0], [1.0, 2.0, 3.0]]),
                {"axis": 1},
            ),
            # Broadcasting between (2, 1, 3) and (1, 2, 3)
            (
                mx.random.uniform(shape=(2, 1, 3)),
                mx.random.uniform(shape=(1, 2, 3)),
                {},
            ),
            # Type promotion between float and integer operands
            (mx.array([1.0, 2.0, 3.0]), mx.array([4, 5, 6]), {}),
        ]
        for lhs, rhs, extra in cases:
            actual = mx.linalg.cross(lhs, rhs, **extra)
            reference = np.cross(lhs, rhs, **extra)
            self.assertTrue(np.allclose(actual, reference))

        # Test with incorrect vector size (should raise an exception)
        short_lhs = mx.array([1.0])
        short_rhs = mx.array([4.0])
        with self.assertRaises(ValueError):
            mx.linalg.cross(short_lhs, short_rhs)

    def test_eig(self):
        """Check ``mx.linalg.eig`` and ``mx.linalg.eigvals`` on the CPU.

        Results are validated through the defining relation ``A @ V == w * V``
        and by requiring that ``eigvals`` returns the same eigenvalues as
        ``eig``. Invalid shapes must raise ``ValueError``.
        """
        tols = {"atol": 1e-5, "rtol": 1e-5}

        def check_eigs_and_vecs(A_np, kwargs=None):
            # ``None`` stands for "no extra keyword arguments"; this avoids a
            # mutable default shared between calls.
            kwargs = {} if kwargs is None else kwargs
            A = mx.array(A_np)
            eig_vals, eig_vecs = mx.linalg.eig(A, stream=mx.cpu, **kwargs)
            # Column j of eig_vecs must be scaled by eigenvalue j.
            self.assertTrue(
                mx.allclose(A @ eig_vecs, eig_vals[..., None, :] * eig_vecs, **tols)
            )
            eig_vals_only = mx.linalg.eigvals(A, stream=mx.cpu, **kwargs)
            self.assertTrue(mx.allclose(eig_vals, eig_vals_only, **tols))

        def check_on_cpu_stream(A_np, dtype):
            # Runs entirely under the CPU default stream and additionally pins
            # the output dtype to complex64.
            with mx.stream(mx.cpu):
                A = mx.array(A_np, dtype=dtype)
                eig_vals, eig_vecs = mx.linalg.eig(A)
                mx.eval(eig_vals, eig_vecs)
                self.assertTrue(
                    mx.allclose(A @ eig_vecs, eig_vals[..., None, :] * eig_vecs, **tols)
                )
                # Eigenvalues and eigenvectors should be complex64 (output dtype)
                self.assertEqual(eig_vals.dtype, mx.complex64)
                self.assertEqual(eig_vecs.dtype, mx.complex64)

        # Test a simple 2x2 matrix
        A_np = np.array([[1.0, 1.0], [3.0, 4.0]], dtype=np.float32)
        check_eigs_and_vecs(A_np)

        # Test complex eigenvalues
        A_np = np.array([[1.0, -1.0], [1.0, 1.0]], dtype=np.float32)
        check_eigs_and_vecs(A_np)

        # Test a larger random matrix (general, not symmetrised)
        n = 5
        np.random.seed(1)
        A_np = np.random.randn(n, n).astype(np.float32)
        check_eigs_and_vecs(A_np)

        # Test with batched input
        A_np = np.random.randn(3, n, n).astype(np.float32)
        check_eigs_and_vecs(A_np)

        # Test float64 - use CPU stream since float64 is not supported on GPU
        check_on_cpu_stream(
            np.array([[1.0, 1.0], [3.0, 4.0]], dtype=np.float64), mx.float64
        )

        # Test complex64 input - use CPU stream since complex64 is not supported on GPU
        check_on_cpu_stream(
            np.array(
                [[1.0 + 1j, 2.0 + 2j], [3.0 + 3j, 4.0 + 4j]], dtype=np.complex64
            ),
            mx.complex64,
        )

        # Test error cases
        with self.assertRaises(ValueError):
            mx.linalg.eig(mx.array([1.0, 2.0]))  # 1D array

        with self.assertRaises(ValueError):
            mx.linalg.eig(
                mx.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            )  # Non-square matrix

        with self.assertRaises(ValueError):
            mx.linalg.eigvals(mx.array([1.0, 2.0]))  # 1D array

        with self.assertRaises(ValueError):
            mx.linalg.eigvals(
                mx.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            )  # Non-square matrix

    def test_eigh(self):
        """Check ``mx.linalg.eigh`` and ``mx.linalg.eigvalsh`` on the CPU.

        Eigenvalues are compared with ``np.linalg.eigh``; eigenvectors are
        validated through ``A @ V == w * V``. Invalid shapes must raise
        ``ValueError``.
        """
        tols = {"atol": 1e-5, "rtol": 1e-5}

        def check_eigs_and_vecs(A_np, kwargs=None):
            # ``None`` stands for "no extra keyword arguments"; this avoids a
            # mutable default shared between calls.
            kwargs = {} if kwargs is None else kwargs
            A = mx.array(A_np)
            eig_vals, eig_vecs = mx.linalg.eigh(A, stream=mx.cpu, **kwargs)
            # The same keywords (e.g. UPLO) are forwarded to NumPy.
            eig_vals_np, _ = np.linalg.eigh(A_np, **kwargs)
            self.assertTrue(np.allclose(eig_vals, eig_vals_np, **tols))
            self.assertTrue(
                mx.allclose(A @ eig_vecs, eig_vals[..., None, :] * eig_vecs, **tols)
            )

            eig_vals_only = mx.linalg.eigvalsh(A, stream=mx.cpu, **kwargs)
            self.assertTrue(mx.allclose(eig_vals, eig_vals_only, **tols))

        # Test a simple 2x2 symmetric matrix
        A_np = np.array([[1.0, 2.0], [2.0, 4.0]], dtype=np.float32)
        check_eigs_and_vecs(A_np)

        # Test a larger random symmetric matrix
        n = 5
        np.random.seed(1)
        A_np = np.random.randn(n, n).astype(np.float32)
        A_np = (A_np + A_np.T) / 2
        check_eigs_and_vecs(A_np)

        # Test with upper triangle
        check_eigs_and_vecs(A_np, {"UPLO": "U"})

        # Test with batched input
        A_np = np.random.randn(3, n, n).astype(np.float32)
        A_np = (A_np + np.transpose(A_np, (0, 2, 1))) / 2
        check_eigs_and_vecs(A_np)

        # Test with complex inputs: build a Hermitian matrix from random
        # real/imaginary pairs.
        A_np = (
            np.random.randn(8, 8, 2).astype(np.float32).view(np.complex64).squeeze(-1)
        )
        A_np = A_np + A_np.T.conj()
        check_eigs_and_vecs(A_np)

        # Test error cases
        with self.assertRaises(ValueError):
            mx.linalg.eigh(mx.array([1.0, 2.0]))  # 1D array

        with self.assertRaises(ValueError):
            mx.linalg.eigh(
                mx.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            )  # Non-square matrix

        with self.assertRaises(ValueError):
            mx.linalg.eigvalsh(mx.array([1.0, 2.0]))  # 1D array

        with self.assertRaises(ValueError):
            mx.linalg.eigvalsh(
                mx.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            )  # Non-square matrix

    def test_lu(self):
        """Check ``mx.linalg.lu`` by rebuilding the input from ``P``, ``L``, ``U``.

        Scalar, 1-D and integer inputs must raise ``ValueError``.
        """
        rejected_inputs = (
            mx.array(0.0),  # scalar
            mx.array([0.0, 1.0]),  # 1-D
            mx.array([[0, 1], [1, 0]]),  # integer matrix
        )
        for bad in rejected_inputs:
            with self.assertRaises(ValueError):
                mx.linalg.lu(bad, stream=mx.cpu)

        # Square 3x3 input
        square = mx.array([[3.0, 1.0, 2.0], [1.0, 8.0, 6.0], [9.0, 2.0, 5.0]])
        P, L, U = mx.linalg.lu(square, stream=mx.cpu)
        self.assertTrue(mx.allclose(L[P, :] @ U, square))

        # Same matrix repeated over two leading batch dimensions; rows of L are
        # permuted per batch element with take_along_axis.
        batched = mx.broadcast_to(square, (5, 5, 3, 3))
        P, L, U = mx.linalg.lu(batched, stream=mx.cpu)
        permuted_L = mx.take_along_axis(L, P[..., None], axis=-2)
        self.assertTrue(mx.allclose(permuted_L @ U, batched))

        # Rectangular inputs: wide (2x3) and then tall (3x2)
        wide = mx.array([[3.0, 1.0, 2.0], [1.0, 8.0, 6.0]])
        P, L, U = mx.linalg.lu(wide, stream=mx.cpu)
        self.assertTrue(mx.allclose(L[P, :] @ U, wide))

        tall = mx.array([[3.0, 1.0], [1.0, 8.0], [9.0, 2.0]])
        P, L, U = mx.linalg.lu(tall, stream=mx.cpu)
        self.assertTrue(mx.allclose(L[P, :] @ U, tall))

    def test_lu_factor(self):
        """Check ``mx.linalg.lu_factor`` by rebuilding the row-permuted input."""
        mx.random.seed(7)

        # Random 5x5 matrix
        a = mx.random.uniform(shape=(5, 5))
        LU, pivots = mx.linalg.lu_factor(a, stream=mx.cpu)
        n = a.shape[-1]

        # Replay the recorded row swaps, in order, on the identity permutation.
        perm = list(range(n))
        for row, swap_with in enumerate(pivots.tolist()):
            perm[row], perm[swap_with] = perm[swap_with], perm[row]

        # The packed result holds the strictly-lower part of L (its unit
        # diagonal is added back here) and U in the upper triangle.
        unit_lower = mx.add(mx.tril(LU, k=-1), mx.eye(n))
        upper = mx.triu(LU)
        self.assertTrue(mx.allclose(unit_lower @ upper, a[perm, :]))

    def test_solve(self):
        """Compare ``mx.linalg.solve`` on the CPU with ``np.linalg.solve``."""
        mx.random.seed(7)

        def assert_solution_matches(lhs, rhs, **tolerance):
            # ``tolerance`` is forwarded unchanged to np.allclose.
            ours = mx.linalg.solve(lhs, rhs, stream=mx.cpu)
            reference = np.linalg.solve(lhs, rhs)
            self.assertTrue(np.allclose(ours, reference, **tolerance))

        # 3x3 system with a 1-D right-hand side
        a = mx.array([[3.0, 1.0, 2.0], [1.0, 8.0, 6.0], [9.0, 2.0, 5.0]])
        b = mx.array([11.0, 35.0, 28.0])
        assert_solution_matches(a, b)

        # Symmetric positive-definite system
        N = 5
        a = mx.random.uniform(shape=(N, N))
        a = mx.matmul(a, a.T) + N * mx.eye(N)
        b = mx.random.uniform(shape=(N, 1))
        assert_solution_matches(a, b)

        # Leading batch dimensions
        a = mx.random.uniform(shape=(5, 5, 4, 4))
        b = mx.random.uniform(shape=(5, 5, 4, 1))
        assert_solution_matches(a, b, atol=1e-5)

        # Large system, checked with a looser tolerance
        N = 1000
        a = mx.random.uniform(shape=(N, N))
        b = mx.random.uniform(shape=(N, 1))
        assert_solution_matches(a, b, atol=1e-3)

        # Several right-hand-side columns
        a = mx.random.uniform(shape=(5, 5))
        b = mx.random.uniform(shape=(5, 8))
        assert_solution_matches(a, b)

        # Batched version of the previous system; the rhs batch dimensions
        # broadcast against those of the matrix.
        a = mx.broadcast_to(a, (3, 2, 5, 5))
        b = mx.broadcast_to(b, (3, 1, 5, 8))
        assert_solution_matches(a, b, rtol=1e-5, atol=1e-5)

    def test_solve_triangular(self):
        """Compare ``mx.linalg.solve_triangular`` with ``np.linalg.solve``."""

        def assert_solution_matches(tri, rhs, upper):
            ours = mx.linalg.solve_triangular(tri, rhs, upper=upper, stream=mx.cpu)
            reference = np.linalg.solve(tri, rhs)
            self.assertTrue(np.allclose(ours, reference))

        # Lower-triangular system
        lower = mx.array([[4.0, 0.0, 0.0], [2.0, 3.0, 0.0], [1.0, -2.0, 5.0]])
        lower_rhs = mx.array([8.0, 14.0, 3.0])
        assert_solution_matches(lower, lower_rhs, upper=False)

        # Upper-triangular system
        upper_tri = mx.array([[3.0, 2.0, 1.0], [0.0, 5.0, 4.0], [0.0, 0.0, 6.0]])
        upper_rhs = mx.array([13.0, 33.0, 18.0])
        assert_solution_matches(upper_tri, upper_rhs, upper=True)

        # Batched upper-triangular system with eight right-hand-side columns
        batched_tri = mx.broadcast_to(upper_tri, (3, 4, 3, 3))
        batched_rhs = mx.broadcast_to(mx.expand_dims(upper_rhs, -1), (3, 4, 3, 8))
        assert_solution_matches(batched_tri, batched_rhs, upper=True)


# When this file is executed directly as a script, hand control to
# mlx_tests.MLXTestRunner().
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_load.py
================================================
# Copyright © 2023 Apple Inc.

import os
import platform
import tempfile
import unittest
from pathlib import Path

import mlx.core as mx
import mlx_tests
import numpy as np


class TestLoad(mlx_tests.MLXTestCase):
    dtypes = [
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "int8",
        "int16",
        "int32",
        "int64",
        "float32",
        "float16",
        "complex64",
    ]

    @classmethod
    def setUpClass(cls):
        """Create the temporary directory shared by every test in this class."""
        scratch = tempfile.TemporaryDirectory()
        # Keep the handle so tearDownClass can remove the directory again.
        cls.test_dir_fid = scratch
        cls.test_dir = scratch.name
        # Defensive guard: make sure the directory exists before tests write to it.
        if not os.path.isdir(cls.test_dir):
            os.mkdir(cls.test_dir)

    @classmethod
    def tearDownClass(cls):
        """Remove the temporary directory created in ``setUpClass``."""
        cls.test_dir_fid.cleanup()

    def test_save_and_load(self):
        """Round-trip ``.npy`` files between MLX and NumPy by file path.

        Every dtype in ``self.dtypes`` is combined with several shapes, and each
        file is read back by both libraries. A final case passes
        ``pathlib.Path`` objects instead of strings.
        """
        shapes = [(1,), (23,), (1024, 1024), (4, 6, 3, 1, 2)]
        for dt in self.dtypes:
            with self.subTest(dtype=dt):
                for idx, shape in enumerate(shapes):
                    with self.subTest(shape=shape):
                        mlx_path = os.path.join(self.test_dir, f"mlx_{dt}_{idx}.npy")
                        npy_path = os.path.join(self.test_dir, f"npy_{dt}_{idx}.npy")

                        # Draw float data, then cast to the dtype under test.
                        raw = np.random.uniform(0.0, 32.0, size=shape)
                        original_npy = raw.astype(getattr(np, dt))
                        original_mlx = mx.array(original_npy)

                        mx.save(mlx_path, original_mlx)
                        np.save(npy_path, original_npy)

                        # Written by MLX, read by MLX
                        mlx_from_mlx = mx.load(mlx_path)
                        self.assertTrue(mx.array_equal(mlx_from_mlx, original_mlx))

                        # Written by NumPy, read by MLX
                        mlx_from_npy = mx.load(npy_path)
                        self.assertTrue(mx.array_equal(mlx_from_npy, original_mlx))

                        # Written by MLX, read by NumPy
                        npy_from_mlx = np.load(mlx_path)
                        self.assertTrue(np.array_equal(npy_from_mlx, original_npy))

        # Same round trip with pathlib.Path arguments
        path_obj = Path(os.path.join(self.test_dir, "mlx_path.npy"))
        ones = mx.ones((32,))
        mx.save(path_obj, ones)
        reloaded = mx.load(path_obj)
        self.assertTrue(mx.array_equal(reloaded, ones))

    def test_load_npy_dtype(self):
        """Load a float64 ``.npy`` file and reject a complex one built from float64."""
        npy_path = os.path.join(self.test_dir, "mlx_path.npy")

        # float64 data must load as float64 with identical values.
        doubles = np.random.randn(8).astype(np.float64)
        np.save(npy_path, doubles)
        loaded = mx.load(npy_path, stream=mx.cpu)
        self.assertEqual(loaded.dtype, mx.float64)
        self.assertTrue(np.array_equal(np.array(loaded), doubles))

        # Mixing float64 with a complex scalar gives a complex array on the
        # NumPy side; loading that file is expected to raise.
        real_part = np.random.randn(8).astype(np.float64)
        imag_part = np.random.randn(8).astype(np.float64)
        np.save(npy_path, real_part + 0j * imag_part)
        with self.assertRaises(Exception):
            mx.load(npy_path, stream=mx.cpu)

    def test_save_and_load_safetensors(self):
        """Round-trip safetensors files by path and by open file object."""
        test_file = os.path.join(self.test_dir, "test.safetensors")

        # A non-string metadata value is rejected.
        with self.assertRaises(Exception):
            mx.save_safetensors(test_file, {"a": mx.ones((4, 4))}, {"testing": 0})

        # The path may be given as ``str`` or ``pathlib.Path``; metadata comes
        # back as the second element when ``return_metadata=True``.
        expected_meta = {"testing": "test", "format": "mlx"}
        for as_path in [str, Path]:
            mx.save_safetensors(
                as_path(test_file),
                {"test": mx.ones((2, 2))},
                {"testing": "test", "format": "mlx"},
            )
            loaded = mx.load(as_path(test_file), return_metadata=True)
            self.assertEqual(len(loaded), 2)
            self.assertEqual(loaded[1], expected_meta)

        shapes = [(1,), (23,), (1024, 1024), (4, 6, 3, 1, 2)]
        for dt in self.dtypes + ["bfloat16"]:
            with self.subTest(dtype=dt):
                for idx, shape in enumerate(shapes):
                    with self.subTest(shape=shape):
                        st_path = os.path.join(
                            self.test_dir, f"mlx_{dt}_{idx}_fs.safetensors"
                        )
                        # Random data for float dtypes, ones for everything else.
                        if dt in ["float32", "float16", "bfloat16"]:
                            tensor = mx.random.normal(
                                shape=shape, dtype=getattr(mx, dt)
                            )
                        else:
                            tensor = mx.ones(shape, dtype=getattr(mx, dt))
                        saved = {"test": tensor}

                        # Go through binary file objects rather than paths.
                        with open(st_path, "wb") as sink:
                            mx.save_safetensors(sink, saved)
                        with open(st_path, "rb") as source:
                            restored = mx.load(source)

                        self.assertTrue("test" in restored)
                        self.assertTrue(
                            mx.array_equal(restored["test"], saved["test"])
                        )

    @unittest.skipIf(platform.system() == "Windows", "GGUF is disabled on Windows")
    def test_save_and_load_gguf(self):
        """Round-trip GGUF files for the supported dtypes and several shapes.

        A final case saves and loads through ``pathlib.Path`` objects.
        """
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        shapes = [(1,), (23,), (1024, 1024), (4, 6, 3, 1, 2)]

        # TODO: Add support for other dtypes (self.dtypes + ["bfloat16"])
        supported_dtypes = ["float16", "float32", "int8", "int16", "int32"]
        for dt in supported_dtypes:
            with self.subTest(dtype=dt):
                for i, shape in enumerate(shapes):
                    with self.subTest(shape=shape):
                        save_file_mlx = os.path.join(
                            self.test_dir, f"mlx_{dt}_{i}_fs.gguf"
                        )
                        # Random data for float dtypes, ones for integer dtypes.
                        save_dict = {
                            "test": (
                                mx.random.normal(shape=shape, dtype=getattr(mx, dt))
                                if dt in ["float32", "float16", "bfloat16"]
                                else mx.ones(shape, dtype=getattr(mx, dt))
                            )
                        }

                        mx.save_gguf(save_file_mlx, save_dict)
                        load_dict = mx.load(save_file_mlx)

                        self.assertTrue("test" in load_dict)
                        self.assertTrue(
                            mx.array_equal(load_dict["test"], save_dict["test"])
                        )

        # Path-object round trip. The shape is taken explicitly from the list
        # instead of relying on the loop variable leaking out of the loops above.
        save_file_mlx = os.path.join(self.test_dir, "mlx_path_test_fs.gguf")
        save_dict = {"test": mx.ones(shapes[-1])}
        mx.save_gguf(Path(save_file_mlx), save_dict)
        load_dict = mx.load(Path(save_file_mlx))
        self.assertTrue("test" in load_dict)
        self.assertTrue(mx.array_equal(load_dict["test"], save_dict["test"]))

    def test_load_f8_e4m3(self):
        """Load a hand-built safetensors file holding an ``F8_E4M3`` tensor.

        The raw bytes are decoded with ``mx.from_fp8`` and compared with the
        expected bfloat16 values.
        """
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        reference = mx.array(
            [
                0,
                448,
                -448,
                -0.875,
                0.4375,
                -0.005859,
                -1.25,
                -1.25,
                -1.5,
                -0.0039,
            ],
            dtype=mx.bfloat16,
        )
        # Eight length bytes (value 72), the JSON header padded with spaces to
        # 72 characters, then the ten payload bytes.
        blob = b'H\x00\x00\x00\x00\x00\x00\x00{"tensor":{"dtype":"F8_E4M3","shape":[10],"data_offsets":[0,10]}}       \x00~\xfe\xb6.\x83\xba\xba\xbc\x82'
        with tempfile.NamedTemporaryFile(suffix=".safetensors") as tmp:
            tmp.write(blob)
            # Rewind so the loader reads from the start of the file.
            tmp.seek(0)
            raw = mx.load(tmp)["tensor"]
        self.assertTrue(mx.allclose(mx.from_fp8(raw), reference))

    @unittest.skipIf(platform.system() == "Windows", "GGUF is disabled on Windows")
    def test_save_and_load_gguf_metadata_basic(self):
        """Save GGUF files with empty and string metadata and read them back."""
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        gguf_path = os.path.join(self.test_dir, "mlx_gguf_with_metadata.gguf")
        tensors = {"test": mx.ones((4, 4), dtype=mx.int32)}

        def assert_tensor_roundtrip(loaded):
            self.assertTrue("test" in loaded)
            self.assertTrue(mx.array_equal(loaded["test"], tensors["test"]))

        # Saving with an empty metadata dict is accepted
        mx.save_gguf(gguf_path, tensors, {})

        # Plain load returns only the tensors
        assert_tensor_roundtrip(mx.load(gguf_path))

        # Loading with metadata returns an empty metadata dict
        loaded, loaded_meta = mx.load(gguf_path, return_metadata=True)
        assert_tensor_roundtrip(loaded)
        self.assertEqual(len(loaded_meta), 0)

        # A single string entry survives the round trip
        mx.save_gguf(gguf_path, tensors, {"meta": "data"})
        loaded, loaded_meta = mx.load(gguf_path, return_metadata=True)
        assert_tensor_roundtrip(loaded)
        self.assertEqual(len(loaded_meta), 1)
        self.assertTrue("meta" in loaded_meta)
        self.assertEqual(loaded_meta["meta"], "data")

    @unittest.skipIf(platform.system() == "Windows", "GGUF is disabled on Windows")
    def test_save_and_load_gguf_metadata_arrays(self):
        """Round-trip array-valued GGUF metadata and reject unsupported dtypes."""
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        gguf_path = os.path.join(self.test_dir, "mlx_gguf_with_metadata.gguf")
        tensors = {"test": mx.ones((4, 4), dtype=mx.int32)}

        # Scalars and one-dimensional arrays of each accepted dtype
        accepted_dtypes = [
            mx.uint8,
            mx.int8,
            mx.uint16,
            mx.int16,
            mx.uint32,
            mx.int32,
            mx.uint64,
            mx.int64,
            mx.float32,
        ]
        for t in accepted_dtypes:
            for shape in [(), (2,)]:
                value = mx.random.uniform(shape=shape).astype(t)
                mx.save_gguf(gguf_path, tensors, {"meta": value})
                _, loaded_meta = mx.load(gguf_path, return_metadata=True)
                self.assertEqual(len(loaded_meta), 1)
                self.assertTrue("meta" in loaded_meta)
                # Both the values and the dtype must be preserved.
                self.assertTrue(mx.array_equal(loaded_meta["meta"], value))
                self.assertEqual(loaded_meta["meta"].dtype, value.dtype)

        # These dtypes are expected to be rejected as metadata values.
        for t in [mx.float16, mx.bfloat16, mx.complex64]:
            with self.assertRaises(ValueError):
                mx.save_gguf(gguf_path, tensors, {"meta": mx.array(1, t)})

    @unittest.skipIf(platform.system() == "Windows", "GGUF is disabled on Windows")
    def test_save_and_load_gguf_metadata_mixed(self):
        """Round-trip GGUF metadata mixing strings, string lists and arrays."""
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        gguf_path = os.path.join(self.test_dir, "mlx_gguf_with_metadata.gguf")
        tensors = {"test": mx.ones((4, 4), dtype=mx.int32)}

        def roundtrip_metadata(meta):
            # Write the file with ``meta`` and return only the metadata read back.
            mx.save_gguf(gguf_path, tensors, meta)
            _, loaded_meta = mx.load(gguf_path, return_metadata=True)
            return loaded_meta

        # One array entry next to one string entry
        scalar = mx.array(1.5)
        loaded_meta = roundtrip_metadata({"meta1": scalar, "meta2": "data"})
        self.assertEqual(len(loaded_meta), 2)
        self.assertTrue("meta1" in loaded_meta)
        self.assertTrue(mx.array_equal(loaded_meta["meta1"], scalar))
        self.assertEqual(loaded_meta["meta1"].dtype, scalar.dtype)
        self.assertTrue("meta2" in loaded_meta)
        self.assertEqual(loaded_meta["meta2"], "data")

        # A list of strings
        string_list_meta = {"meta": ["data1", "data2", "data345"]}
        loaded_meta = roundtrip_metadata(string_list_meta)
        self.assertEqual(len(loaded_meta), 1)
        self.assertEqual(loaded_meta["meta"], string_list_meta["meta"])

        # Every supported kind of value in a single file
        combined_meta = {
            "meta1": ["data1", "data2", "data345"],
            "meta2": mx.array([1, 2, 3, 4]),
            "meta3": "data",
            "meta4": mx.array(1.5),
        }
        loaded_meta = roundtrip_metadata(combined_meta)
        self.assertEqual(len(loaded_meta), 4)
        for key, original in combined_meta.items():
            if isinstance(original, mx.array):
                self.assertTrue(mx.array_equal(loaded_meta[key], original))
            else:
                self.assertEqual(loaded_meta[key], original)

    def test_save_and_load_fs(self):
        """Round-trip ``.npy`` data through open file objects instead of paths."""
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        shapes = [(1,), (23,), (1024, 1024), (4, 6, 3, 1, 2)]
        for dt in self.dtypes:
            with self.subTest(dtype=dt):
                for idx, shape in enumerate(shapes):
                    with self.subTest(shape=shape):
                        mlx_path = os.path.join(
                            self.test_dir, f"mlx_{dt}_{idx}_fs.npy"
                        )
                        npy_path = os.path.join(
                            self.test_dir, f"npy_{dt}_{idx}_fs.npy"
                        )

                        # Draw float data, then cast to the dtype under test.
                        raw = np.random.uniform(0.0, 32.0, size=shape)
                        original_npy = raw.astype(getattr(np, dt))
                        original_mlx = mx.array(original_npy)

                        # MLX writes into an already-open binary file.
                        with open(mlx_path, "wb") as sink:
                            mx.save(sink, original_mlx)

                        np.save(npy_path, original_npy)

                        # Written by MLX, read by MLX from a file object
                        with open(mlx_path, "rb") as source:
                            mlx_from_mlx = mx.load(source)
                        self.assertTrue(mx.array_equal(mlx_from_mlx, original_mlx))

                        # Written by NumPy, read by MLX from a file object
                        with open(npy_path, "rb") as source:
                            mlx_from_npy = mx.load(source)
                        self.assertTrue(mx.array_equal(mlx_from_npy, original_mlx))

                        # Written by MLX, read by NumPy
                        npy_from_mlx = np.load(mlx_path)
                        self.assertTrue(np.array_equal(npy_from_mlx, original_npy))

    def test_savez_and_loadz(self):
        """Round-trip ``.npz`` archives (plain and compressed) between MLX and NumPy."""
        if not os.path.isdir(self.test_dir):
            os.mkdir(self.test_dir)

        for dt in self.dtypes:
            with self.subTest(dtype=dt):
                shapes = [(6,), (6, 6), (4, 1, 3, 1, 2)]

                def archive_path(prefix, variant):
                    return os.path.join(self.test_dir, f"{prefix}_{dt}_{variant}.npz")

                mlx_plain = archive_path("mlx", "uncomp")
                npy_plain = archive_path("npy", "uncomp")
                mlx_packed = archive_path("mlx", "comp")
                npy_packed = archive_path("npy", "comp")

                # One named array per shape, in NumPy and MLX form
                arrays_npy = {
                    f"save_arr_{idx}": np.random.uniform(0.0, 32.0, size=shape).astype(
                        getattr(np, dt)
                    )
                    for idx, shape in enumerate(shapes)
                }
                arrays_mlx = {name: mx.array(arr) for name, arr in arrays_npy.items()}

                # Write all four archives
                np.savez(npy_plain, **arrays_npy)
                mx.savez(mlx_plain, **arrays_mlx)
                np.savez_compressed(npy_packed, **arrays_npy)
                mx.savez_compressed(mlx_packed, **arrays_mlx)

                for npy_file, mlx_file in (
                    (npy_plain, mlx_plain),
                    (npy_packed, mlx_packed),
                ):
                    # Written by MLX, read by MLX
                    for name, arr in mx.load(mlx_file).items():
                        self.assertTrue(mx.array_equal(arrays_mlx[name], arr))

                    # Written by NumPy, read by MLX
                    for name, arr in mx.load(npy_file).items():
                        self.assertTrue(mx.array_equal(arrays_mlx[name], arr))

                    # Written by MLX, read by NumPy
                    for name, arr in np.load(mlx_file).items():
                        self.assertTrue(np.array_equal(arrays_npy[name], arr))

    def test_non_contiguous(self):
        """Save and reload arrays that are not plain row-contiguous buffers.

        A broadcast array is written as ``.npy``, safetensors and GGUF; a
        transposed array is written as safetensors and GGUF. GGUF is only
        exercised off Windows, while the safetensors checks run everywhere.
        """
        # GGUF is disabled on Windows, so only the GGUF parts are gated.
        gguf_enabled = platform.system() != "Windows"

        def roundtrip_safetensors(arr):
            path = os.path.join(self.test_dir, "a.safetensors")
            mx.save_safetensors(path, {"a": arr})
            self.assertTrue(mx.array_equal(arr, mx.load(path)["a"]))

        def roundtrip_gguf(arr):
            path = os.path.join(self.test_dir, "a.gguf")
            mx.save_gguf(path, {"a": arr})
            self.assertTrue(mx.array_equal(arr, mx.load(path)["a"]))

        # Broadcast input
        a = mx.broadcast_to(mx.array([1, 2]), [4, 2])

        save_file = os.path.join(self.test_dir, "a.npy")
        mx.save(save_file, a)
        aload = mx.load(save_file)
        self.assertTrue(mx.array_equal(a, aload))

        roundtrip_safetensors(a)
        if gguf_enabled:
            roundtrip_gguf(a)

        # safetensors and gguf only work with row contiguous
        # make sure col contiguous is handled properly
        a = mx.arange(4).reshape(2, 2).T
        roundtrip_safetensors(a)
        if gguf_enabled:
            roundtrip_gguf(a)

    def test_load_donation(self):
        """Check that multiplying a freshly loaded array adds no peak memory.

        Peak memory is recorded after a plain load and again after a load
        followed by a multiply; the two readings must be equal.
        NOTE(review): going by the test name, this presumably verifies that the
        loaded buffer is donated to the multiply's output - confirm.
        """
        x = mx.random.normal((1024,))
        mx.eval(x)
        save_file = os.path.join(self.test_dir, "donation.npy")
        mx.save(save_file, x)
        # Finish all outstanding work before the measurement starts.
        mx.synchronize()

        # The peak counter is reset once; both readings below share it.
        mx.reset_peak_memory()
        # Created before the first reading so it is counted in both readings.
        scale = mx.array(2.0)
        y = mx.load(save_file)
        mx.eval(y)
        mx.synchronize()
        load_only = mx.get_peak_memory()
        # Second pass: the same load, followed by an elementwise multiply.
        y = mx.load(save_file) * scale
        mx.eval(y)
        mx.synchronize()
        load_with_binary = mx.get_peak_memory()

        self.assertEqual(load_only, load_with_binary)


# When this file is executed directly as a script, hand control to
# mlx_tests.MLXTestRunner().
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_losses.py
================================================
# Copyright © 2023 Apple Inc.

import unittest

import mlx.core as mx
import mlx.nn as nn
import mlx_tests
import numpy as np


class TestLosses(mlx_tests.MLXTestCase):
    def test_cross_entropy(self):
        """Exercise ``nn.losses.cross_entropy`` with index and probability targets.

        Covers the plain, weighted, and label-smoothed variants, and checks that
        reducing over ``axis=0`` of transposed logits matches ``axis=-1``.
        """
        cross_entropy = nn.losses.cross_entropy

        def assert_close(actual, desired):
            self.assertTrue(mx.allclose(actual, desired))

        class_ids = mx.array([0, 1])
        one_hot = mx.array([[1.0, 0.0], [0.0, 1.0]])

        # Unweighted, unsmoothed: infinite logits with class-index targets.
        inf_logits = mx.array([[0.0, -float("inf")], [-float("inf"), 0.0]])
        assert_close(
            cross_entropy(inf_logits, class_ids, reduction="none"),
            mx.array([0.0, 0.0]),
        )

        # The same logits with probability targets produce NaNs, like PyTorch.
        nan_loss = cross_entropy(inf_logits, one_hot, reduction="none")
        self.assertTrue(mx.isnan(nan_loss).all())

        finite_logits = mx.array([[2.0, -1.0], [-1.0, 2.0]])
        class_weights = mx.array([1.0, 2.0])

        # Weighted, unsmoothed: both target encodings share one expectation.
        weighted_expected = mx.array([0.04858735, 0.0971747])
        for target in (class_ids, one_hot):
            assert_close(
                cross_entropy(
                    finite_logits, target, weights=class_weights, reduction="none"
                ),
                weighted_expected,
            )

        # Unweighted, label-smoothed: again checked for both target encodings.
        smoothed_expected = mx.array([0.498587, 0.498587])
        for target in (class_ids, one_hot):
            assert_close(
                cross_entropy(
                    finite_logits, target, label_smoothing=0.3, reduction="none"
                ),
                smoothed_expected,
            )

        # Weighted and label-smoothed: class-index targets only.
        assert_close(
            cross_entropy(
                finite_logits,
                class_ids,
                weights=class_weights,
                label_smoothing=0.3,
                reduction="none",
            ),
            mx.array([0.49858734, 0.9971747]),
        )

        # Class axis first (transposed input) must agree with class axis last.
        random_logits = mx.random.normal((4, 8))
        labels = mx.array([1, 2, 3, 0])
        along_first = cross_entropy(random_logits.T, labels, axis=0)
        along_last = cross_entropy(random_logits, labels, axis=-1)
        assert_close(along_first, along_last)

    def test_binary_cross_entropy(self):
        """Check ``nn.losses.binary_cross_entropy`` on logits and on probabilities.

        Each input set is verified for the ``none``, ``mean`` and ``sum``
        reductions; the logits case additionally checks per-element weights.
        """
        bce = nn.losses.binary_cross_entropy

        def assert_close(actual, desired):
            self.assertTrue(mx.allclose(actual, desired))

        def check_all_reductions(inputs, labels, per_element, **options):
            # "none" must match element-wise; "mean"/"sum" must match the same
            # reduction applied to the per-element expectation.
            assert_close(
                bce(inputs, labels, reduction="none", **options), per_element
            )
            for mode, reducer in (("mean", mx.mean), ("sum", mx.sum)):
                assert_close(
                    bce(inputs, labels, reduction=mode, **options),
                    reducer(per_element),
                )

        labels = mx.array([0, 0, 1, 1])

        # Inputs interpreted as logits (the default).
        logits = mx.array([0.105361, 0.223144, 1.20397, 0.916291])
        check_all_reductions(
            logits, labels, mx.array([0.747215, 0.810930, 0.262365, 0.336472])
        )

        # Logits with per-element weights, no reduction.
        sample_weights = mx.array([1.0, 2.0, 1.0, 2.0])
        assert_close(
            bce(logits, labels, weights=sample_weights, reduction="none"),
            mx.array([0.747215, 1.62186, 0.262365, 0.672944]),
        )

        # Inputs interpreted as probabilities.
        probs = mx.array([0.5, 0.6, 0.7, 0.8])
        check_all_reductions(
            probs,
            labels,
            mx.array([0.693147, 0.916291, 0.356675, 0.223144]),
            with_logits=False,
        )

        # Probabilities at, or extremely close to, the 0/1 boundaries.
        tiny = 1e-59
        boundary_probs = mx.array([0, tiny, 1 - tiny, 1])
        check_all_reductions(
            boundary_probs,
            labels,
            mx.array([0.0, tiny, tiny, 0.0]),
            with_logits=False,
        )

    def test_l1_loss(self):
        """L1 loss between identical predictions and targets is zero everywhere."""
        preds = mx.array([0.5, 0.2, 0.9, 0.0])
        truth = mx.array([0.5, 0.2, 0.9, 0.0])

        # Per-element expectation; reduced expectations are derived from it.
        zero_losses = mx.array([0, 0, 0, 0]).astype(mx.float32)

        elementwise = nn.losses.l1_loss(preds, truth, reduction="none")
        self.assertTrue(
            mx.array_equal(elementwise, zero_losses),
            "Test failed for l1_loss --reduction='none'",
        )

        for mode, reducer in (("sum", mx.sum), ("mean", mx.mean)):
            reduced = nn.losses.l1_loss(preds, truth, reduction=mode)
            self.assertTrue(mx.array_equal(reduced, reducer(zero_losses)))

    def test_mse_loss(self):
        """Compare ``mse_loss`` with hand-computed squared errors for each reduction."""
        preds = mx.array([0.5, 0.2, 0.9, 0.0])
        truth = mx.array([0.7, 0.1, 0.8, 0.2])
        squared_errors = mx.array([0.04, 0.01, 0.01, 0.04])

        # No reduction: element-wise comparison within a relative tolerance.
        elementwise = nn.losses.mse_loss(preds, truth, reduction="none")
        self.assertTrue(
            np.allclose(elementwise, squared_errors, rtol=1e-5),
            "Test case failed for mse_loss --reduction='none'",
        )

        # Reduced variants are compared against the reduced expectation.
        reduced_cases = (
            ("mean", mx.mean, "Test case failed for mse_loss --reduction='mean'"),
            ("sum", mx.sum, "Test case failed for mse_loss --reduction='sum'"),
        )
        for mode, reducer, failure_msg in reduced_cases:
            reduced = nn.losses.mse_loss(preds, truth, reduction=mode)
            self.assertEqual(reduced, reducer(squared_errors), failure_msg)

    def test_smooth_l1_loss(self):
        """Compare ``smooth_l1_loss`` (beta = 1.0) with hand-computed values."""
        preds = mx.array([1.5, 2.5, 0.5, 3.5])
        truth = mx.array([1.0, 2.0, 0.5, 2.5])
        beta = 1.0
        per_element = mx.array([0.125, 0.125, 0.0, 0.5])

        # No reduction: exact element-wise match.
        elementwise = nn.losses.smooth_l1_loss(preds, truth, beta, reduction="none")
        self.assertTrue(
            mx.array_equal(elementwise, per_element),
            "Test case failed for smooth_l1_loss --reduction='none'",
        )

        # Reduced variants are compared against the reduced expectation.
        reduced_cases = (
            ("sum", mx.sum, "Test case failed for smooth_l1_loss --reduction='sum'"),
            (
                "mean",
                mx.mean,
                "Test case failed for smooth_l1_loss --reduction='mean'",
            ),
        )
        for mode, reducer, failure_msg in reduced_cases:
            reduced = nn.losses.smooth_l1_loss(preds, truth, beta, reduction=mode)
            self.assertEqual(reduced, reducer(per_element), failure_msg)

    def test_nll_loss(self):
        """``nll_loss`` is zero when the target class has input 0 in every row."""
        inputs = mx.array([[0.0, -float("inf")], [-float("inf"), 0.0]])
        labels = mx.array([0, 1])
        per_element = mx.array([0.0, 0.0])

        # No reduction: exact element-wise match.
        elementwise = nn.losses.nll_loss(inputs, labels, reduction="none")
        self.assertTrue(mx.array_equal(elementwise, per_element))

        # Reduced variants are compared against the reduced expectation.
        for mode, reducer in (("mean", mx.mean), ("sum", mx.sum)):
            reduced = nn.losses.nll_loss(inputs, labels, reduction=mode)
            self.assertEqual(reduced, reducer(per_element))

    def test_gaussian_nll_loss(self):
        """Check ``gaussian_nll_loss`` with and without ``full=True``, all reductions."""
        inputs = mx.array([[0.1, 0.2], [0.3, 0.4]])
        targets = mx.array([[0.2, 0.1], [0.1, 0.2]])
        variances = mx.array([[0.1, 0.2], [0.3, 0.4]])

        # (extra keyword arguments, expected per-element loss); the first case
        # leaves `full` at its default.
        scenarios = (
            ({}, mx.array([[-1.101293, -0.779719], [-0.535320, -0.408145]])),
            (
                {"full": True},
                mx.array([[-0.182354, 0.139220], [0.383619, 0.510793]]),
            ),
        )
        # Reduced expectations are derived from the per-element expectation.
        reductions = (
            ("none", lambda values: values),
            ("mean", mx.mean),
            ("sum", mx.sum),
        )

        for options, per_element in scenarios:
            for mode, reducer in reductions:
                loss = nn.losses.gaussian_nll_loss(
                    inputs, targets, variances, reduction=mode, **options
                )
                self.assertTrue(mx.allclose(loss, reducer(per_element)))

    def test_kl_div_loss(self):
        """Check ``kl_div_loss`` on log-probability inputs for every reduction."""
        log_p = mx.log(mx.array([[0.5, 0.5], [0.8, 0.2]]))
        log_q = mx.log(mx.array([[0.5, 0.5], [0.2, 0.8]]))
        per_row = mx.array([0.0, 0.831777])

        # Reduced expectations are derived from the per-row expectation.
        reductions = (
            ("none", lambda values: values),
            ("mean", mx.mean),
            ("sum", mx.sum),
        )
        for mode, reducer in reductions:
            loss = nn.losses.kl_div_loss(log_p, log_q, reduction=mode)
            self.assertTrue(mx.allclose(loss, reducer(per_row)))

    def test_triplet_loss(self):
        """Check ``triplet_loss`` on two anchor/positive/negative triples."""
        anchor_pts = mx.array([[1, 2, 3], [1, 2, 3]])
        positive_pts = mx.array([[4, 5, 6], [0, -1, 2]])
        negative_pts = mx.array([[7, 8, 9], [3, 2, 3]])
        per_triple = mx.array([0, 2.31662])

        # Reduced expectations are derived from the per-triple expectation.
        reductions = (
            ("none", lambda values: values),
            ("mean", mx.mean),
            ("sum", mx.sum),
        )
        for mode, reducer in reductions:
            loss = nn.losses.triplet_loss(
                anchor_pts, positive_pts, negative_pts, reduction=mode
            )
            self.assertTrue(mx.allclose(loss, reducer(per_triple)))

    def test_hinge_loss(self):
        """Mean hinge loss of all-ones inputs against all-zero targets is 1.0."""
        shape = (2, 4)
        mean_loss = nn.losses.hinge_loss(
            mx.ones(shape), mx.zeros(shape), reduction="mean"
        )
        self.assertEqual(mean_loss, 1.0)

    def test_huber_loss(self):
        """Mean Huber loss of all-ones inputs against all-zero targets is 0.5."""
        shape = (2, 4)
        mean_loss = nn.losses.huber_loss(
            mx.ones(shape), mx.zeros(shape), reduction="mean"
        )
        self.assertEqual(mean_loss, 0.5)

    def test_log_cosh_loss(self):
        """Mean log-cosh loss of ones vs. zeros matches 0.433781 to six places."""
        shape = (2, 4)
        mean_loss = nn.losses.log_cosh_loss(
            mx.ones(shape), mx.zeros(shape), reduction="mean"
        )
        self.assertAlmostEqual(mean_loss.item(), 0.433781, places=6)

    def test_cosine_similarity_loss(self):
        """Check ``cosine_similarity_loss`` on two embedding pairs, all reductions."""
        left = mx.array([[0.5, 0.5, 0.2, 0.9], [0.1, 0.3, 0.5, 0.5]])
        right = mx.array([[0.6, 0.4, 0.3, 0.8], [0.2, 0.5, 0.6, 0.4]])
        per_pair = mx.array([0.985344, 0.961074])

        # Reduced expectations are derived from the per-pair expectation.
        reductions = (
            ("none", lambda values: values),
            ("mean", mx.mean),
            ("sum", mx.sum),
        )
        for mode, reducer in reductions:
            loss = nn.losses.cosine_similarity_loss(left, right, reduction=mode)
            self.assertTrue(mx.allclose(loss, reducer(per_pair)))

    def test_margin_ranking_loss(self):
        """Check ``margin_ranking_loss`` with the default margin and with 0.5."""
        first = mx.array([-0.573409, -0.765166, -0.0638])
        second = mx.array([0.75596, 0.225763, 0.256995])
        ranks = mx.array([1, 1, -1])

        # Default margin.
        default_margin = nn.losses.margin_ranking_loss(
            first, second, ranks, reduction="none"
        )
        self.assertTrue(
            mx.allclose(default_margin, mx.array([1.329369, 0.990929, 0.0]))
        )

        # Explicit margin of 0.5.
        half_margin = nn.losses.margin_ranking_loss(
            first, second, ranks, margin=0.5, reduction="none"
        )
        self.assertTrue(
            mx.allclose(half_margin, mx.array([1.829369, 1.490929, 0.179205]))
        )


# When executed directly as a script, hand control to the project's test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_memory.py
================================================
# Copyright © 2023-2024 Apple Inc.

import unittest

import mlx.core as mx
import mlx_tests


class TestMemory(mlx_tests.MLXTestCase):
    def test_memory_info(self):
        """Exercise the cache/memory limit setters and the memory query functions."""
        # With the cache limit set to 0, freeing an evaluated array is expected
        # to leave nothing in the cache.
        old_limit = mx.set_cache_limit(0)

        a = mx.zeros((4096,))
        mx.eval(a)
        del a
        self.assertEqual(mx.get_cache_memory(), 0)
        # The setter is expected to return the previously active limit: first
        # the 0 set above, then the restored original.
        self.assertEqual(mx.set_cache_limit(old_limit), 0)
        self.assertEqual(mx.set_cache_limit(old_limit), old_limit)

        # Same "returns the previous value" contract for the memory limit.
        old_limit = mx.set_memory_limit(10)
        self.assertEqual(mx.set_memory_limit(old_limit), 10)
        self.assertEqual(mx.set_memory_limit(old_limit), old_limit)

        # Query active and peak memory
        a = mx.zeros((4096,))
        mx.eval(a)
        mx.synchronize()
        active_mem = mx.get_active_memory()
        # 4096 elements at 4 bytes each -- assumes the default dtype is 32-bit.
        self.assertTrue(active_mem >= 4096 * 4)

        # Allocate and free a second array: active memory should return to the
        # earlier reading, while the peak should cover both arrays at once.
        b = mx.zeros((4096,))
        mx.eval(b)
        del b
        mx.synchronize()

        new_active_mem = mx.get_active_memory()
        self.assertEqual(new_active_mem, active_mem)
        peak_mem = mx.get_peak_memory()
        self.assertTrue(peak_mem >= 4096 * 8)

        # Only checked when Metal is available: the freed array is expected to
        # be held in the cache.
        if mx.metal.is_available():
            cache_mem = mx.get_cache_memory()
            self.assertTrue(cache_mem >= 4096 * 4)

        mx.clear_cache()
        self.assertEqual(mx.get_cache_memory(), 0)

        mx.reset_peak_memory()
        self.assertEqual(mx.get_peak_memory(), 0)

    @unittest.skipIf(not mx.metal.is_available(), "Metal is not available")
    def test_wired_memory(self):
        """Check ``mx.set_wired_limit`` return values and its upper-bound validation.

        The limit that was active before the test is captured from the first
        setter call and restored in ``finally`` so the test does not leave the
        process-wide wired limit modified, even when an assertion fails.
        """
        original_limit = mx.set_wired_limit(1000)
        try:
            # The setter returns the previously active limit.
            previous_limit = mx.set_wired_limit(0)
            self.assertEqual(previous_limit, 1000)

            # A limit above the recommended working-set size must be rejected.
            max_size = mx.device_info(mx.gpu)["max_recommended_working_set_size"]
            with self.assertRaises(ValueError):
                mx.set_wired_limit(max_size + 10)
        finally:
            mx.set_wired_limit(original_limit)

    def test_active_memory_count(self):
        """Active memory must return to its starting value after arrays are freed."""
        # Start from a settled state with an empty cache before taking the baseline.
        mx.synchronize()
        mx.clear_cache()
        init_mem = mx.get_active_memory()
        # Allocate and free a (128, 128) array, then a smaller (90, 128) one.
        # NOTE(review): presumably the second allocation is meant to reuse the
        # larger cached buffer -- confirm against the allocator.
        a = mx.zeros((128, 128))
        mx.eval(a)
        mx.synchronize()
        del a
        a = mx.zeros((90, 128))
        mx.eval(a)
        mx.synchronize()
        del a
        self.assertEqual(init_mem, mx.get_active_memory())


# When executed directly as a script, hand control to the project's test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_nn.py
================================================
# Copyright © 2023-2024 Apple Inc.

import os
import tempfile
import unittest

import mlx.core as mx
import mlx.nn as nn
import mlx_tests
import numpy as np
from mlx.utils import tree_flatten, tree_map, tree_reduce


class TestBase(mlx_tests.MLXTestCase):
    def test_module_utilities(self):
        """Check ``children``, ``leaf_modules``, ``eval``/``train`` and ``apply_to_modules``."""
        net = nn.Sequential(
            nn.Sequential(nn.Linear(2, 10), nn.relu),
            nn.Sequential(nn.Linear(10, 10), nn.ReLU()),
            nn.Linear(10, 1),
            mx.sigmoid,
        )

        # children(): a dict holding one "layers" list with four entries; the
        # entry for the plain function is an empty dict.
        kids = net.children()
        self.assertIsInstance(kids, dict)
        self.assertEqual(len(kids), 1)
        layer_entries = kids["layers"]
        self.assertIsInstance(layer_entries, list)
        self.assertEqual(len(layer_entries), 4)
        self.assertEqual(layer_entries[3], {})
        self.assertEqual(len(tree_flatten(kids, is_leaf=nn.Module.is_module)), 3)

        # leaf_modules(): flattened paths paired with the exact module objects.
        leaf_items = tree_flatten(net.leaf_modules(), is_leaf=nn.Module.is_module)
        self.assertEqual(len(leaf_items), 4)
        expected_leaves = (
            ("layers.0.layers.0", net.layers[0].layers[0]),
            ("layers.1.layers.0", net.layers[1].layers[0]),
            ("layers.1.layers.1", net.layers[1].layers[1]),
            ("layers.2", net.layers[2]),
        )
        for item, (want_path, want_module) in zip(leaf_items, expected_leaves):
            self.assertEqual(item[0], want_path)
            self.assertIs(item[1], want_module)

        # eval()/train() must be reflected by every module that is visited.
        net.eval()
        net.apply_to_modules(lambda k, m: self.assertFalse(m.training))

        net.train()
        net.apply_to_modules(lambda k, m: self.assertTrue(m.training))

    def test_module_attributes(self):
        """An attribute can switch between ``None`` and array values freely."""

        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                # Starts as None and is replaced by an array in initialize().
                self.val = None
                self.initialize()

            def initialize(self):
                self.val = mx.array(1.0)

        instance = Model()
        self.assertTrue(mx.array_equal(instance.val, mx.array(1.0)))

        # Array -> None.
        instance.val = None
        self.assertEqual(instance.val, None)

        # None -> array.
        instance.val = mx.array([3])
        self.assertEqual(instance.val.item(), 3)

    def test_model_with_dict(self):
        """Arrays stored in a dict attribute are reported as dotted-path parameters."""

        class DictModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.weights = {"w1": mx.zeros((2, 2)), "w2": mx.ones((2, 2))}

        flat_params = tree_flatten(DictModule().parameters(), destination={})
        self.assertEqual(len(flat_params), 2)

        expected_entries = (
            ("weights.w1", mx.zeros((2, 2))),
            ("weights.w2", mx.ones((2, 2))),
        )
        for path, want in expected_entries:
            self.assertTrue(mx.array_equal(flat_params[path], want))

    def test_save_npz_weights(self):
        """Round-trip a model's weights through an ``.npz`` file.

        The temporary directory is managed by a ``with`` block so it is removed
        even if saving, loading, or evaluation raises.
        """

        def make_model():
            return nn.Sequential(nn.Linear(2, 2), nn.ReLU(), nn.Linear(2, 2))

        m = make_model()
        with tempfile.TemporaryDirectory() as tdir:
            npz_file = os.path.join(tdir, "model.npz")
            m.save_weights(npz_file)
            m_load = make_model()
            m_load.load_weights(npz_file)

            # Eval before leaving the block so the model file is unlocked when
            # the directory is cleaned up.
            mx.eval(m_load.state)

        eq_tree = tree_map(mx.array_equal, m.parameters(), m_load.parameters())
        self.assertTrue(all(tree_flatten(eq_tree)))

    def test_save_safetensors_weights(self):
        """Round-trip a model's weights through a ``.safetensors`` file.

        The temporary directory is managed by a ``with`` block so it is removed
        even if saving, loading, or evaluation raises.
        """

        def make_model():
            return nn.Sequential(nn.Linear(2, 2), nn.ReLU(), nn.Linear(2, 2), nn.ReLU())

        m = make_model()
        with tempfile.TemporaryDirectory() as tdir:
            safetensors_file = os.path.join(tdir, "model.safetensors")
            m.save_weights(safetensors_file)
            m_load = make_model()
            m_load.load_weights(safetensors_file)

            # Eval before leaving the block so the model file is unlocked when
            # the directory is cleaned up.
            mx.eval(m_load.state)

        eq_tree = tree_map(mx.array_equal, m.parameters(), m_load.parameters())
        self.assertTrue(all(tree_flatten(eq_tree)))

    def test_load_from_weights(self):
        """Check strict and non-strict ``load_weights`` from in-memory lists."""

        def ones(*shape):
            return mx.ones(shape)

        linear = nn.Linear(2, 2)

        # Too few weights: rejected when strict, accepted (and applied) otherwise.
        partial = [("weight", ones(2, 2))]
        with self.assertRaises(ValueError):
            linear.load_weights(partial)

        linear.load_weights(partial, strict=False)
        self.assertTrue(mx.array_equal(linear.weight, partial[0][1]))

        # Misspelled name: rejected when strict, tolerated otherwise.
        with self.assertRaises(ValueError):
            linear.load_weights([("weihgt", ones(2, 2))])
        linear.load_weights([("weihgt", ones(2, 2))], strict=False)

        # Strict loads that must fail: too many weights, wrong shape, wrong type.
        rejected = (
            [("weight", ones(2, 2)), ("bias", ones(2)), ("bias2", ones(2))],
            [("weight", ones(2, 2)), ("bias", ones(2, 1))],
            [("weight", ones(2, 2)), ("bias", 3)],
        )
        for bad_weights in rejected:
            with self.assertRaises(ValueError):
                linear.load_weights(bad_weights)

        # An empty list is acceptable when not strict.
        linear.load_weights([], strict=False)

        # Non-strict loading filters weights addressed to layers that do not
        # exist. Flat keys such as "extra.weight" are silently dropped by
        # Module.update, whereas nested indexed keys such as "layers.1.weight"
        # would raise an IndexError in tree_unflatten/update if not filtered.
        seq = nn.Sequential(nn.Linear(2, 2))
        surplus = [
            ("layers.0.weight", ones(2, 2)),
            ("layers.0.bias", ones(2)),
            ("layers.1.weight", ones(2, 2)),
            ("layers.1.bias", ones(2)),
        ]
        seq.load_weights(surplus, strict=False)
        self.assertTrue(mx.array_equal(seq.layers[0].weight, ones(2, 2)))
        self.assertEqual(len(seq.layers), 1)

    def test_module_state(self):
        """A value written into ``module.state`` can be read back unchanged."""
        layer = nn.Linear(10, 1)
        key, value = "hello", "world"
        layer.state[key] = value
        self.assertEqual(layer.state[key], value)

    def test_chaining(self):
        """``freeze``, ``unfreeze``, ``update``, ``eval`` and ``train`` can be chained."""
        seq = nn.Sequential(nn.Linear(2, 2), nn.ReLU(), nn.Linear(2, 1))

        # A freeze/unfreeze round trip must not change the parameter count.
        count_before = len(seq.parameters())
        seq.freeze().unfreeze()
        self.assertEqual(len(seq.parameters()), count_before)

        # update() and eval() chain; the result reports evaluation mode.
        current_params = seq.parameters()
        after_eval = seq.update(current_params).eval()
        self.assertFalse(after_eval._training)

        # train() chains as well and switches back to training mode.
        after_train = seq.train()
        self.assertTrue(after_train._training)

    def test_quantize(self):
        """Check which layers ``nn.quantize`` replaces under various options."""

        def fresh_model(**linear_options):
            return nn.Sequential(
                nn.Embedding(5, 256),
                nn.ReLU(),
                nn.Linear(256, 256, **linear_options),
            )

        def assert_layer_types(model, first, second, third):
            for layer, kind in zip(model.layers, (first, second, third)):
                self.assertIsInstance(layer, kind)

        def only_linear(path, module):
            return isinstance(module, nn.Linear)

        # Default call: embedding and linear layers are both quantized.
        net = fresh_model()
        nn.quantize(net)
        assert_layer_types(net, nn.QuantizedEmbedding, nn.ReLU, nn.QuantizedLinear)

        # A class predicate restricts quantization to the linear layer.
        net = fresh_model()
        nn.quantize(net, class_predicate=only_linear)
        assert_layer_types(net, nn.Embedding, nn.ReLU, nn.QuantizedLinear)

        # Quantizing the same model again in mxfp4 mode without a predicate.
        nn.quantize(net, group_size=32, mode="mxfp4")
        assert_layer_types(net, nn.QuantizedEmbedding, nn.ReLU, nn.QuantizedLinear)
        self.assertIsInstance(net.layers[2].scales, mx.array)

        # mxfp8 with quantized inputs, limited to the bias-free linear layer.
        net = fresh_model(bias=False)
        nn.quantize(
            net,
            group_size=32,
            mode="mxfp8",
            quantize_input=True,
            class_predicate=only_linear,
        )
        assert_layer_types(net, nn.Embedding, nn.ReLU, nn.QQLinear)

        # Check that Embedding does not support quantize_input
        net = fresh_model(bias=False)
        with self.assertRaises(ValueError):
            nn.quantize(net, group_size=32, mode="mxfp8", quantize_input=True)

    def test_quantize_freeze(self):
        """Unfreezing ``scales`` on a quantized linear yields trainable parameters."""
        quantized = nn.Linear(512, 512).to_quantized()
        quantized.unfreeze(keys=["scales"])

        # Sum the element counts of everything reported as trainable.
        trainable_size = tree_reduce(
            lambda total, param: total + param.size,
            quantized.trainable_parameters(),
            0,
        )
        self.assertGreater(trainable_size, 0)

    def test_quantized_sharded_linear_construction(self):
        """Sharded layers built from a quantized linear keep its weight shape."""
        in_features, out_features = 1536, 1024
        sharded_kinds = (
            nn.QuantizedAllToShardedLinear,
            nn.QuantizedShardedToAllLinear,
        )
        for width in (2, 3, 4, 5, 6, 8):
            quantized = nn.Linear(in_features, out_features).to_quantized(bits=width)
            for kind in sharded_kinds:
                sharded = kind.from_quantized_linear(quantized)
                self.assertEqual(sharded.weight.shape, quantized.weight.shape)

    def test_grad_of_module(self):
        """``mx.grad`` accepts a module instance as the differentiated argument."""

        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                self.m1 = nn.Linear(3, 3)

        net = Model()
        zero_input = mx.zeros((3,))

        def summed_output(module):
            return module.m1(zero_input).sum()

        # Only checks that the call completes; the gradient itself is discarded.
        mx.grad(summed_output)(net)

    def test_update(self):
        """``Module.update`` raises ``ValueError`` for malformed update trees."""
        seq = nn.Sequential(nn.Linear(3, 3), nn.Linear(3, 3))

        invalid_updates = (
            # Parameter name that does not exist on the layer.
            {"layers": [{"value": 0}]},
            # A string where a per-layer dict is expected.
            {"layers": ["hello"]},
            # Existing parameter name with a value of the wrong type.
            {"layers": [{"weight": "hi"}]},
        )
        for updates in invalid_updates:
            with self.assertRaises(ValueError):
                seq.update(updates)

    def test_update_modules(self):
        """Check what ``update_modules`` rejects and what it accepts."""
        seq = nn.Sequential(nn.Linear(3, 3), nn.Linear(3, 3))

        # Rejected by default: an unknown attribute, then non-module values.
        for bad_tree in ({"values": [0, 1]}, {"layers": [0, 1]}):
            with self.assertRaises(ValueError):
                seq.update_modules(bad_tree)

        class MyModule(nn.Module):
            def __init__(self):
                super().__init__()
                self.test = mx.array(1.0)
                self.list = [mx.array(1.0), mx.array(2.0)]

        # Attributes that hold arrays cannot be replaced through update_modules.
        holder = MyModule()
        for bad_tree in ({"test": "hi"}, {"list": ["hi"]}):
            with self.assertRaises(ValueError):
                holder.update_modules(bad_tree)

        # Allow updating a strict subset
        seq = nn.Sequential(nn.Linear(3, 3), nn.Linear(3, 3))
        seq.update_modules({"layers": [{}, nn.Linear(3, 4)]})
        self.assertEqual(seq.layers[1].weight.shape, (4, 3))

        # Using leaf_modules in the update should always work
        class MyModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.stuff = [nn.Linear(2, 2), 0, nn.Linear(2, 2)]
                self.more_stuff = {"hi": nn.Linear(2, 2), "bye": 0}

        mixed = MyModel()
        mixed.update_modules(mixed.leaf_modules())

    def test_parameter_deletion(self):
        """After ``del layer.weight`` the attribute no longer exists."""
        layer = nn.Linear(32, 32)
        del layer.weight
        still_present = hasattr(layer, "weight")
        self.assertFalse(still_present)

    def test_circular_leaks(self):
        """Active memory must be unchanged after a model is created, updated, and dropped."""
        # Evaluate one random array up front, before the baseline is taken.
        # NOTE(review): presumably this keeps one-time allocations out of the
        # measurement -- confirm.
        y = mx.random.uniform(1)
        mx.eval(y)

        def make_and_update():
            # Build a model, materialize its parameters, and call
            # update_modules with an empty tree; the model goes out of scope
            # when this function returns.
            model = nn.Linear(1024, 512)
            mx.eval(model.parameters())
            leaves = {}
            model.update_modules(leaves)

        mx.synchronize()
        pre = mx.get_active_memory()
        make_and_update()
        mx.synchronize()
        post = mx.get_active_memory()
        # Any memory still held by the dropped model would make post exceed pre.
        self.assertEqual(pre, post)


class TestLayers(mlx_tests.MLXTestCase):
    def test_identity(self):
        # nn.Identity must hand back something with the input's shape.
        x = mx.zeros((10, 4))
        y = nn.Identity()(x)
        self.assertEqual(x.shape, y.shape)

    def test_linear(self):
        # A 4 -> 8 linear layer maps a (10, 4) batch to (10, 8).
        x = mx.zeros((10, 4))
        projection = nn.Linear(input_dims=4, output_dims=8)
        self.assertEqual(projection(x).shape, (10, 8))

    def test_bilinear(self):
        # Two inputs of widths 2 and 4 are combined into a (10, 6) output.
        left = mx.zeros((10, 2))
        right = mx.zeros((10, 4))
        bilinear = nn.Bilinear(input1_dims=2, input2_dims=4, output_dims=6)
        self.assertEqual(bilinear(left, right).shape, (10, 6))

    def test_group_norm(self):
        # Checks the per-group mean/variance of nn.GroupNorm output, first with
        # the default weight/bias and then after scaling by 2 and shifting by 3,
        # for both the default layout and ``pytorch_compatible=True``.
        ramp = mx.arange(100, dtype=mx.float32).reshape(1, 10, 10, 1)
        ramp = mx.broadcast_to(ramp, (2, 10, 10, 4))
        x = mx.concatenate([ramp, 0.5 * ramp], axis=-1)

        def group_stats(norm, view_shape, axes):
            # Normalize ``x`` and reduce the output over the given view/axes.
            out = norm(x).reshape(*view_shape)
            return out.mean(axis=axes), out.var(axis=axes)

        layouts = (
            # Group norm in groups last mode
            ({}, (2, -1, 2), 1),
            # Group norm in groups first mode
            ({"pytorch_compatible": True}, (2, -1, 2, 4), (1, -1)),
        )
        for extra_kwargs, view_shape, axes in layouts:
            norm = nn.GroupNorm(2, 8, **extra_kwargs)

            mu, sigma2 = group_stats(norm, view_shape, axes)
            self.assertTrue(np.allclose(mu, np.zeros_like(mu), atol=1e-6))
            self.assertTrue(np.allclose(sigma2, np.ones_like(sigma2), atol=1e-6))

            # Scale by 2 and shift by 3: mean becomes 3 and variance 4.
            norm.weight = norm.weight * 2
            norm.bias = norm.bias + 3
            mu, sigma2 = group_stats(norm, view_shape, axes)
            self.assertTrue(np.allclose(mu, 3 * np.ones_like(mu), atol=1e-6))
            self.assertTrue(np.allclose(sigma2, 4 * np.ones_like(sigma2), atol=1e-6))

    def test_instance_norm(self):
        # Compares nn.InstanceNorm(dims=3) against hard-coded reference outputs
        # for inputs with one, two and three axes between the leading axis and
        # the trailing axis of size 3, then checks the layer's string form.
        # Every case asserts that the output shape equals the input shape and
        # that the values match the reference within atol=1e-5.

        # Test InstanceNorm1d (input literal has shape (2, 4, 3))
        x = mx.array(
            [
                [
                    [-0.0119524, 1.1263, 2.02223],
                    [-0.500331, 0.517899, -1.21143],
                    [1.12958, -0.21413, -2.48738],
                    [1.39955, 0.891329, 1.63289],
                ],
                [
                    [0.241417, -0.619157, -0.77484],
                    [-1.42512, 0.970817, -1.31352],
                    [2.739, -1.2506, 1.56844],
                    [-1.23175, 0.32756, 1.13969],
                ],
            ]
        )
        inorm = nn.InstanceNorm(dims=3)
        y = inorm(x)
        expected_y = [
            [
                [-0.657082, 1.07593, 1.0712],
                [-1.27879, -0.123074, -0.632505],
                [0.796101, -1.56572, -1.30476],
                [1.13978, 0.612862, 0.866067],
            ],
            [
                [0.0964426, -0.557906, -0.759885],
                [-0.904772, 1.30444, -1.20013],
                [1.59693, -1.29752, 1.15521],
                [-0.7886, 0.550987, 0.804807],
            ],
        ]
        self.assertTrue(x.shape == y.shape)
        self.assertTrue(np.allclose(y, expected_y, atol=1e-5))
        # Test InstanceNorm2d (input literal has shape (2, 3, 3, 3))
        x = mx.array(
            [
                [
                    [
                        [-0.458824, 0.483254, -0.58611],
                        [-0.447996, -0.176577, -0.622545],
                        [0.0486988, -0.0611224, 1.8845],
                    ],
                    [
                        [1.13049, 0.345315, -0.926389],
                        [0.301795, 0.99207, -0.184927],
                        [-2.23876, -0.758631, -1.12639],
                    ],
                    [
                        [0.0986325, -1.82973, -0.241765],
                        [-1.25257, 0.154442, -0.556204],
                        [-0.329399, -0.319107, 0.830584],
                    ],
                ],
                [
                    [
                        [1.04407, 0.073752, 0.407081],
                        [0.0800776, 1.2513, 1.20627],
                        [0.782321, -0.444367, 0.563132],
                    ],
                    [
                        [0.671423, -1.21689, -1.88979],
                        [-0.110299, -1.42248, 1.17838],
                        [0.159905, 0.516452, -0.539121],
                    ],
                    [
                        [0.810252, 1.50456, 1.08659],
                        [0.182597, 0.0576239, 0.973883],
                        [-0.0621687, 0.184253, 0.784216],
                    ],
                ],
            ]
        )
        inorm = nn.InstanceNorm(dims=3)
        y = inorm(x)
        expected_y = [
            [
                [
                    [-0.120422, 0.801503, -0.463983],
                    [-0.108465, -0.0608611, -0.504602],
                    [0.440008, 0.090032, 2.29032],
                ],
                [
                    [1.63457, 0.621224, -0.843335],
                    [0.719488, 1.4665, -0.0167344],
                    [-2.08591, -0.821575, -1.0663],
                ],
                [
                    [0.495147, -2.22145, -0.0800989],
                    [-0.996913, 0.371763, -0.430643],
                    [0.022495, -0.24714, 1.11538],
                ],
            ],
            [
                [
                    [1.5975, 0.0190292, -0.0123306],
                    [-0.776381, 1.28291, 0.817237],
                    [0.952927, -0.537076, 0.149652],
                ],
                [
                    [0.679836, -1.36624, -2.39651],
                    [-1.24519, -1.5869, 0.788287],
                    [-0.579802, 0.494186, -0.994499],
                ],
                [
                    [1.02171, 1.55474, 0.693008],
                    [-0.523922, 0.00171862, 0.576016],
                    [-1.12667, 0.137632, 0.37914],
                ],
            ],
        ]
        self.assertTrue(x.shape == y.shape)
        self.assertTrue(np.allclose(y, expected_y, atol=1e-5))
        # Test InstanceNorm3d (input literal has shape (2, 3, 2, 2, 3))
        x = mx.array(
            [
                [
                    [
                        [[0.777621, 0.528145, -1.56133], [-2.1722, 0.128192, 0.153862]],
                        [
                            [-1.41317, 0.476288, -1.20411],
                            [0.284446, -0.649858, 0.152112],
                        ],
                    ],
                    [
                        [[0.11, -0.12431, 1.18768], [-0.837743, 1.93502, 0.00236324]],
                        [
                            [-2.40205, -1.25873, -2.04243],
                            [0.336682, -0.261986, 1.54289],
                        ],
                    ],
                    [
                        [
                            [0.789185, -1.63747, 0.67917],
                            [-1.42998, -1.73247, -0.402572],
                        ],
                        [
                            [-0.459489, -2.15559, -0.249959],
                            [0.0298199, 0.10275, -0.821897],
                        ],
                    ],
                ],
                [
                    [
                        [
                            [-2.12354, 0.643973, 0.72391],
                            [0.317797, -0.682916, 0.016364],
                        ],
                        [
                            [-0.146628, -0.987925, 0.573199],
                            [0.0329215, 1.54086, 0.213092],
                        ],
                    ],
                    [
                        [
                            [-1.55784, 0.71179, -0.0678402],
                            [2.41031, -0.290786, 0.00449439],
                        ],
                        [
                            [0.226341, 0.057712, -1.58342],
                            [0.265387, -0.742304, 1.28133],
                        ],
                    ],
                    [
                        [
                            [0.990317, -0.399875, -0.357647],
                            [0.475161, -1.10479, -1.07389],
                        ],
                        [
                            [-1.37804, 1.40097, 0.141618],
                            [-0.501041, 0.0723374, -0.386141],
                        ],
                    ],
                ],
            ]
        )
        inorm = nn.InstanceNorm(dims=3)
        y = inorm(x)
        expected_y = [
            [
                [
                    [[1.23593, 0.821849, -1.30944], [-1.54739, 0.462867, 0.357126]],
                    [[-0.831204, 0.775304, -0.962338], [0.770588, -0.23548, 0.355425]],
                ],
                [
                    [[0.605988, 0.236231, 1.36163], [-0.288258, 2.0846, 0.209922]],
                    [[-1.76427, -0.78198, -1.77689], [0.819875, 0.112659, 1.70677]],
                ],
                [
                    [[1.24684, -1.12192, 0.867539], [-0.847068, -1.20719, -0.183531]],
                    [
                        [0.0686449, -1.58697, -0.0352458],
                        [0.530334, 0.440032, -0.590967],
                    ],
                ],
            ],
            [
                [
                    [[-1.75315, 0.733967, 1.04349], [0.343736, -0.822472, 0.080661]],
                    [[-0.0551618, -1.18025, 0.838402], [0.0990544, 1.78602, 0.348368]],
                ],
                [
                    [[-1.26726, 0.813517, -0.033924], [2.14101, -0.362504, 0.0645089]],
                    [[0.265184, 0.0462839, -2.09632], [0.298721, -0.892134, 1.80203]],
                ],
                [
                    [[0.921369, -0.490465, -0.428293], [0.478897, -1.31732, -1.40296]],
                    [[-1.11283, 1.62192, 0.251107], [-0.35957, 0.0634394, -0.467067]],
                ],
            ],
        ]
        self.assertTrue(x.shape == y.shape)
        self.assertTrue(np.allclose(y, expected_y, atol=1e-5))
        # Test repr: the string form must report dims, eps and the affine flag.
        self.assertTrue(str(inorm) == "InstanceNorm(3, eps=1e-05, affine=False)")

    def test_batch_norm(self):
        # Checks nn.BatchNorm against hard-coded reference values for 2-D and
        # 3-D inputs, in training and eval mode, with and without affine
        # parameters, and verifies how the running statistics are exposed via
        # ``parameters()`` / ``trainable_parameters()``.

        # The seed is fixed because the expected values below are tied to the
        # exact random input drawn here.
        mx.random.seed(42)
        x = mx.random.normal((5, 4), dtype=mx.float32)

        # Batch norm
        bn = nn.BatchNorm(num_features=4, affine=True)
        # A fresh layer starts with running_mean == 0 and running_var == 1.
        self.assertTrue(mx.allclose(bn.running_mean, mx.zeros_like(bn.running_mean)))
        self.assertTrue(mx.allclose(bn.running_var, mx.ones_like(bn.running_var)))
        y = bn(x)
        expected_y = mx.array(
            [
                [-0.439520, 1.647328, -0.955515, 1.966031],
                [-1.726690, -1.449826, -0.234026, -0.723364],
                [0.938414, -0.349603, -0.354470, -0.175369],
                [0.305006, 0.234914, -0.393017, -0.459385],
                [0.922789, -0.082813, 1.937028, -0.607913],
            ],
        )
        # Running statistics expected after this single forward pass.
        expected_mean = mx.array([0.008929, 0.005680, -0.016092, 0.027778])
        expected_var = mx.array([0.928435, 1.00455, 1.04117, 0.94258])
        self.assertTrue(x.shape == y.shape)
        self.assertTrue(mx.allclose(y, expected_y, atol=1e-5))
        self.assertTrue(mx.allclose(bn.running_mean, expected_mean, atol=1e-5))
        self.assertTrue(mx.allclose(bn.running_var, expected_var, atol=1e-5))

        # test eval mode
        # NOTE(review): the expected output differs from the training-mode one
        # above, presumably because eval mode normalizes with the running
        # statistics rather than the batch statistics -- confirm.
        bn.eval()
        y = bn(x)
        expected_y = mx.array(
            [
                [-0.15984, 1.73159, -1.25456, 1.57891],
                [-0.872193, -1.4281, -0.414439, -0.228678],
                [0.602743, -0.30566, -0.554687, 0.139639],
                [0.252199, 0.29066, -0.599572, -0.0512532],
                [0.594096, -0.0334829, 2.11359, -0.151081],
            ]
        )

        self.assertTrue(x.shape == y.shape)
        self.assertTrue(mx.allclose(y, expected_y, atol=1e-5))

        # test_no_affine
        # The reference values are the same as for the affine layer in
        # training mode above.
        bn = nn.BatchNorm(num_features=4, affine=False)
        y = bn(x)
        expected_y = mx.array(
            [
                [-0.439520, 1.647328, -0.955515, 1.966031],
                [-1.726690, -1.449826, -0.234026, -0.723364],
                [0.938414, -0.349603, -0.354470, -0.175369],
                [0.305006, 0.234914, -0.393017, -0.459385],
                [0.922789, -0.082813, 1.937028, -0.607913],
            ]
        )
        self.assertTrue(x.shape == y.shape)
        self.assertTrue(mx.allclose(y, expected_y, atol=1e-5))

        # test with 3D input
        # Re-seed so the (N, L, C) input matches the reference values below.
        mx.random.seed(42)
        N = 2
        L = 4
        C = 5
        x = mx.random.normal((N, L, C), dtype=mx.float32)

        # Batch norm
        bn = nn.BatchNorm(num_features=C, affine=True)
        self.assertTrue(mx.allclose(bn.running_mean, mx.zeros_like(bn.running_mean)))
        self.assertTrue(mx.allclose(bn.running_var, mx.ones_like(bn.running_var)))
        y = bn(x)
        self.assertTrue(x.shape == y.shape)
        expected_y = mx.array(
            [
                [
                    [-0.335754, 0.342054, 1.02653, 0.628588, -1.63899],
                    [1.92092, 0.432319, 0.343043, 1.95489, 1.0696],
                    [-0.853748, 1.3661, 0.868569, 0.0199196, -0.887284],
                    [0.459206, -0.684822, -0.706354, -0.271531, 0.566341],
                ],
                [
                    [-0.921179, 0.684951, -0.77466, -0.490372, -0.247032],
                    [1.10839, -2.13179, 0.628924, -1.62639, -0.539708],
                    [-0.348943, 0.412194, -2.03818, 0.524972, 1.64568],
                    [-1.02889, -0.421, 0.652127, -0.740079, 0.0313996],
                ],
            ]
        )
        self.assertTrue(mx.allclose(y, expected_y, atol=1e-5))
        expected_mean = mx.array(
            [[[0.00207845, -5.3259e-05, 0.04755, -0.0697296, 0.0236228]]]
        )
        expected_var = mx.array([[[0.968415, 1.05322, 0.96913, 0.932305, 0.967224]]])
        self.assertTrue(mx.allclose(bn.running_mean, expected_mean, atol=1e-5))
        self.assertTrue(mx.allclose(bn.running_var, expected_var, atol=1e-5))

        # A 5-D input must be rejected with a ValueError.
        x = mx.random.normal((N, L, C, L, C), dtype=mx.float32)
        with self.assertRaises(ValueError):
            y = bn(x)

        # Check that the running stats are in the param dictionary
        bn_parameters = bn.parameters()
        self.assertIn("running_mean", bn_parameters)
        self.assertIn("running_var", bn_parameters)
        self.assertIn("weight", bn_parameters)
        self.assertIn("bias", bn_parameters)

        # ... but only weight and bias are reported as trainable.
        bn_trainable = bn.trainable_parameters()
        self.assertNotIn("running_mean", bn_trainable)
        self.assertNotIn("running_var", bn_trainable)
        self.assertIn("weight", bn_trainable)
        self.assertIn("bias", bn_trainable)

        # Calling unfreeze() must not turn the running stats into trainables.
        bn.unfreeze()
        bn_trainable = bn.trainable_parameters()
        self.assertNotIn("running_mean", bn_trainable)
        self.assertNotIn("running_var", bn_trainable)
        self.assertIn("weight", bn_trainable)
        self.assertIn("bias", bn_trainable)

    def test_batch_norm_stats(self):
        # After one training-mode forward pass the running statistics must be
        # (1 - momentum) * previous + momentum * batch statistic, where the
        # batch statistic is reduced over every axis except the last.
        batch_size, num_features = 2, 4
        h, w = 3, 3
        momentum = 0.1

        def run_case(data_shape, reduce_axes):
            # Run one forward pass on a fresh layer and check both statistics.
            layer = nn.BatchNorm(num_features)
            layer.train()
            prev_mean, prev_var = layer.running_mean, layer.running_var

            batch = mx.random.normal(data_shape)
            layer(batch)

            batch_mean = mx.mean(batch, axis=reduce_axes)
            batch_var = mx.var(batch, axis=reduce_axes)
            want_mean = (1 - momentum) * prev_mean + momentum * batch_mean
            want_var = (1 - momentum) * prev_var + momentum * batch_var
            self.assertTrue(mx.allclose(layer.running_mean, want_mean, atol=1e-5))
            self.assertTrue(mx.allclose(layer.running_var, want_var, atol=1e-5))
            return layer, want_mean, want_var

        # 2-D input: reduce over the batch axis only.
        run_case((batch_size, num_features), 0)

        # 4-D input: reduce over the batch axis and both middle axes.
        layer, want_mean, want_var = run_case(
            (batch_size, h, w, num_features), (0, 1, 2)
        )

        # The stored statistics must also keep the expected shape.
        self.assertEqual(layer.running_mean.shape, want_mean.shape)
        self.assertEqual(layer.running_var.shape, want_var.shape)

    def test_conv1d(self):
        # Output-shape and parameter checks for nn.Conv1d with the default
        # settings, stride, dilation, disabled bias and grouped channels.
        batch, length, kernel = 5, 12, 3
        ch_in, ch_out = 2, 4
        base = dict(in_channels=ch_in, out_channels=ch_out, kernel_size=kernel)
        x = mx.ones((batch, length, ch_in))

        # With an all-ones weight every output equals kernel * ch_in.
        conv = nn.Conv1d(**base)
        conv.weight = mx.ones_like(conv.weight)
        out = conv(x)
        self.assertEqual(out.shape, (batch, length - kernel + 1, ch_out))
        self.assertTrue(
            mx.allclose(out, mx.full(out.shape, kernel * ch_in, mx.float32))
        )

        # Stride 2; a bias parameter exists by default.
        strided = nn.Conv1d(stride=2, **base)
        out = strided(x)
        self.assertEqual(out.shape, (batch, (length - kernel + 1) // 2, ch_out))
        self.assertTrue("bias" in strided.parameters())

        # Dilation widens the receptive field to (kernel - 1) * dilation + 1.
        dilation = 2
        dilated = nn.Conv1d(dilation=dilation, **base)
        out = dilated(x)
        self.assertEqual(
            out.shape, (batch, length - (kernel - 1) * dilation, ch_out)
        )

        # bias=False drops the bias parameter.
        biasless = nn.Conv1d(bias=False, **base)
        self.assertTrue("bias" not in biasless.parameters())

        # Grouped convolution: the weight's last axis shrinks to ch_in // groups.
        groups = ch_in
        grouped = nn.Conv1d(groups=groups, **base)
        out = grouped(x)
        self.assertEqual(grouped.weight.shape, (ch_out, kernel, ch_in // groups))
        self.assertEqual(out.shape, (batch, length - kernel + 1, ch_out))

    def test_conv2d(self):
        # Shape and value checks for nn.Conv2d on an all-ones input: with such
        # an input each output equals the sum of the kernel taps that overlap
        # the (unpadded) image.
        tol = 1e-4
        x = mx.ones((4, 8, 8, 3))

        # A kernel covering the whole image acts as a per-sample mean once the
        # weights are set to 1 / (8 * 8 * 3).
        conv = nn.Conv2d(3, 1, 8)
        out = conv(x)
        self.assertEqual(out.shape, (4, 1, 1, 1))
        conv.weight = mx.ones_like(conv.weight) / 8 / 8 / 3
        out = conv(x)
        self.assertTrue(np.allclose(out[:, 0, 0, 0], x.mean(axis=(1, 2, 3))))

        # 3x3 conv no padding stride 1
        conv = nn.Conv2d(3, 8, 3)
        out = conv(x)
        self.assertEqual(out.shape, (4, 6, 6, 8))
        self.assertLess(mx.abs(out - conv.weight.sum((1, 2, 3))).max(), tol)

        # 3x3 conv padding 1 stride 1
        conv = nn.Conv2d(3, 8, 3, padding=1)
        out = conv(x)
        self.assertEqual(out.shape, (4, 8, 8, 8))
        self.assertLess(
            mx.abs(out[:, 1:7, 1:7] - conv.weight.sum((1, 2, 3))).max(), tol
        )
        # On corners and edges only part of the kernel overlaps real pixels.
        border_cases = (
            (out[:, 0, 0], conv.weight[:, 1:, 1:]),
            (out[:, 7, 7], conv.weight[:, :-1, :-1]),
            (out[:, 1:7, 7], conv.weight[:, :, :-1]),
            (out[:, 7, 1:7], conv.weight[:, :-1, :]),
        )
        for patch, taps in border_cases:
            self.assertLess(
                mx.abs(patch - taps.sum(axis=(1, 2, 3))).max(),
                tol,
            )

        # 3x3 conv no padding stride 2
        conv = nn.Conv2d(3, 8, 3, padding=0, stride=2)
        out = conv(x)
        self.assertEqual(out.shape, (4, 3, 3, 8))
        self.assertLess(mx.abs(out - conv.weight.sum((1, 2, 3))).max(), tol)

        # 3x3 conv with dilation 2
        conv = nn.Conv2d(3, 8, 3, dilation=2)
        out = conv(x)
        self.assertEqual(out.shape, (4, 4, 4, 8))
        self.assertLess(mx.abs(out - conv.weight.sum((1, 2, 3))).max(), tol)

        # 3x3 conv groups > 1
        x = mx.ones((4, 7, 7, 4))
        conv = nn.Conv2d(4, 8, 3, padding=1, stride=1, groups=2)
        out = conv(x)
        self.assertEqual(out.shape, (4, 7, 7, 8))

    def test_sequential(self):
        # nn.Sequential exposes its children under "layers" and accepts a
        # plain function in place of an activation module.
        x = mx.ones((10, 2))
        net = nn.Sequential(nn.Linear(2, 10), nn.ReLU(), nn.Linear(10, 1))
        out_with_module = net(x)
        self.assertEqual(out_with_module.shape, (10, 1))

        tree = net.parameters()
        self.assertTrue("layers" in tree)
        per_layer = tree["layers"]
        self.assertEqual(len(per_layer), 3)
        self.assertTrue("weight" in per_layer[0])
        # The ReLU in the middle has no parameters.
        self.assertEqual(len(per_layer[1]), 0)
        self.assertTrue("weight" in per_layer[2])

        # Swapping the module for the function form must not change the output.
        net.layers[1] = nn.relu
        out_with_function = net(x)
        self.assertTrue(mx.array_equal(out_with_module, out_with_function))

    def test_gelu(self):
        # Compares nn.GELU with JAX reference values and bounds the error of
        # the two functional approximations on a grid over [-6, 6).
        samples = [1.15286231, -0.81037411, 0.35816911, 0.77484438, 0.66276414]

        # From: jax.nn.gelu(np.array(inputs), approximate=False)
        exact_ref = np.array(
            [1.0093501, -0.16925684, 0.22918941, 0.60498625, 0.49459383]
        )
        # From: jax.nn.gelu(np.array(inputs), approximate=True)
        approx_ref = np.array(
            [1.0091482, -0.1693441, 0.22918446, 0.60491, 0.4945476]
        )

        exact_out = nn.GELU()(mx.array(samples))
        self.assertTrue(np.allclose(exact_out, exact_ref))

        # Test the precise/tanh approximation: both must match the reference
        # and each other.
        precise_out = nn.GELU(approx="precise")(mx.array(samples))
        tanh_out = nn.GELU(approx="tanh")(mx.array(samples))
        comparisons = (
            (precise_out, approx_ref),
            (tanh_out, approx_ref),
            (precise_out, tanh_out),
        )
        for lhs, rhs in comparisons:
            self.assertTrue(np.allclose(lhs, rhs))

        # Crudely check the approximations
        grid = mx.arange(-6.0, 6.0, 12 / 100)
        exact = nn.gelu(grid)
        approx_err = mx.abs(exact - nn.gelu_approx(grid)).max()
        fast_err = mx.abs(exact - nn.gelu_fast_approx(grid)).max()
        self.assertLess(approx_err, 0.0005)
        self.assertLess(fast_err, 0.025)

    def test_sin_pe(self):
        # Each of the 10 encodings is 16-dimensional and has a dot product of
        # 1 with itself (within 1e-5).
        encoder = nn.SinusoidalPositionalEncoding(16, min_freq=0.01)
        positions = mx.arange(10)
        codes = encoder(positions)

        self.assertEqual(codes.shape, (10, 16))
        gram = codes @ codes.T
        self_similarity = gram[mx.arange(10), mx.arange(10)]
        self.assertLess(mx.abs(self_similarity - 1).max(), 1e-5)

    def test_sigmoid(self):
        # The core op, the functional activation and the module must agree
        # exactly (zero tolerance).
        x = mx.array([1.0, 0.0, -1.0])
        reference = mx.sigmoid(x)
        from_function = nn.activations.sigmoid(x)
        from_module = nn.Sigmoid()(x)

        for candidate in (from_function, from_module):
            self.assertEqualArray(reference, candidate, atol=0, rtol=0)

    def test_relu(self):
        # Negative entries are clamped to zero; shape and dtype are preserved.
        out = nn.relu(mx.array([1.0, -1.0, 0.0]))
        want = mx.array([1.0, 0.0, 0.0])
        self.assertTrue(mx.array_equal(out, want))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_leaky_relu(self):
        # Functional form with its default slope, then the module with 0.1.
        x = mx.array([1.0, -1.0, 0.0])

        out = nn.leaky_relu(x)
        want = mx.array([1.0, -0.01, 0.0])
        self.assertTrue(mx.array_equal(out, want))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

        module = nn.LeakyReLU(negative_slope=0.1)
        out = module(x)
        want = mx.array([1.0, -0.1, 0.0])
        self.assertTrue(mx.array_equal(out, want))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_elu(self):
        # Functional ELU with its default alpha, then the module with
        # alpha=1.1, both compared to 4-decimal references within 1e-4.
        tol = 1e-4
        x = mx.array([1.0, -1.0, 0.0])

        out = nn.elu(x)
        want = mx.array([1.0, -0.6321, 0.0])
        self.assertTrue(mx.all(mx.abs(out - want) < tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

        out = nn.ELU(alpha=1.1)(x)
        want = mx.array([1.0, -0.6953, 0.0])
        self.assertTrue(mx.all(mx.abs(out - want) < tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_relu6(self):
        # Values are clamped into [0, 6]; shape and dtype are preserved.
        out = nn.relu6(mx.array([1.0, -1.0, 0.0, 7.0, -7.0]))
        want = mx.array([1.0, 0.0, 0.0, 6.0, 0.0])
        self.assertTrue(mx.array_equal(out, want))
        self.assertEqual(out.shape, (5,))
        self.assertEqual(out.dtype, mx.float32)

    def test_softmax(self):
        # Compared to 4-decimal reference values within 1e-4.
        tol = 1e-4
        out = nn.softmax(mx.array([1.0, -1.0, 0.0]))
        want = mx.array([0.6652, 0.0900, 0.2447])
        within_tol = mx.abs(out - want) < tol
        self.assertTrue(mx.all(within_tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_softmin(self):
        # The smallest input receives the largest weight.
        tol = 1e-4
        out = nn.softmin(mx.array([1.0, 2.0, 3.0]))
        want = mx.array([0.6652, 0.2447, 0.0900])
        within_tol = mx.abs(out - want) < tol
        self.assertTrue(mx.all(within_tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_softplus(self):
        # Compared to 4-decimal reference values within 1e-4.
        tol = 1e-4
        out = nn.softplus(mx.array([1.0, -1.0, 0.0]))
        want = mx.array([1.3133, 0.3133, 0.6931])
        within_tol = mx.abs(out - want) < tol
        self.assertTrue(mx.all(within_tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_softsign(self):
        # Inputs +-1 map to +-0.5 and 0 stays 0.
        tol = 1e-4
        out = nn.softsign(mx.array([1.0, -1.0, 0.0]))
        want = mx.array([0.5, -0.5, 0.0])
        within_tol = mx.abs(out - want) < tol
        self.assertTrue(mx.all(within_tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_softshrink(self):
        # Functional form with its default threshold, then the module with
        # lambd=0.7; both within 1e-4 of the reference.
        tol = 1e-4
        x = mx.array([1.0, -1.0, 0.0])

        out = nn.softshrink(x)
        want = mx.array([0.5, -0.5, 0.0])
        self.assertTrue(mx.all(mx.abs(out - want) < tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

        out = nn.Softshrink(lambd=0.7)(x)
        want = mx.array([0.3, -0.3, 0.0])
        self.assertTrue(mx.all(mx.abs(out - want) < tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_celu(self):
        # Functional CELU with its default alpha, then the module with
        # alpha=1.1; both within 1e-4 of the reference.
        tol = 1e-4
        x = mx.array([1.0, -1.0, 0.0])

        out = nn.celu(x)
        want = mx.array([1.0, -0.6321, 0.0])
        self.assertTrue(mx.all(mx.abs(out - want) < tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

        out = nn.CELU(alpha=1.1)(x)
        want = mx.array([1.0, -0.6568, 0.0])
        self.assertTrue(mx.all(mx.abs(out - want) < tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_log_softmax(self):
        # Compared to 4-decimal reference values within 1e-4.
        tol = 1e-4
        out = nn.log_softmax(mx.array([1.0, 2.0, 3.0]))
        want = mx.array([-2.4076, -1.4076, -0.4076])
        within_tol = mx.abs(out - want) < tol
        self.assertTrue(mx.all(within_tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_log_sigmoid(self):
        # Compared to 4-decimal reference values within 1e-4.
        tol = 1e-4
        out = nn.log_sigmoid(mx.array([1.0, -1.0, 0.0]))
        want = mx.array([-0.3133, -1.3133, -0.6931])
        within_tol = mx.abs(out - want) < tol
        self.assertTrue(mx.all(within_tol))
        self.assertEqual(out.shape, (3,))
        self.assertEqual(out.dtype, mx.float32)

    def test_prelu(self):
        # With a default-constructed PReLU the input -1.0 maps to -0.25.
        out = nn.PReLU()(mx.array([1.0, -1.0, 0.0, 0.5]))
        want = mx.array([1.0, -0.25, 0.0, 0.5])
        self.assertEqualArray(out, want)

    def test_mish(self):
        # nn.Mish compared to 4-decimal reference values.
        out = nn.Mish()(mx.array([1.0, -1.0, 0.0, 0.5]))
        want = mx.array([0.8651, -0.3034, 0.0000, 0.3752])
        self.assertEqualArray(out, want)

    def test_hardswish(self):
        # Samples at -3 and 3 plus three points in between.
        tol = 1e-4
        out = nn.hardswish(mx.array([-3.0, -1.5, 0.0, 1.5, 3.0]))
        want = mx.array([0.0, -0.375, 0.0, 1.125, 3.0])
        within_tol = mx.abs(out - want) < tol
        self.assertTrue(mx.all(within_tol))
        self.assertEqual(out.shape, (5,))
        self.assertEqual(out.dtype, mx.float32)

    def test_glu(self):
        # A last axis of size 4 is reduced to size 2 by the gated unit.
        gate_input = mx.array([[[1.0, 2.0, 3.0, 4.0]]], dtype=mx.float32)
        want = mx.array([[[0.952574, 1.96403]]], dtype=mx.float32)
        self.assertEqualArray(nn.glu(gate_input), want)

    def test_hard_tanh(self):
        # Values outside [-1, 1] are clamped; the rest pass through.
        out = nn.hard_tanh(mx.array([1.0, -2.0, 0.0, 0.5, 2.0]))
        want = mx.array([1.0, -1.0, 0.0, 0.5, 1.0])
        self.assertTrue(mx.array_equal(out, want))
        self.assertEqual(out.shape, (5,))
        self.assertEqual(out.dtype, mx.float32)

    def test_hard_shrink(self):
        # With the default threshold the +-0.5 entries are zeroed; with
        # lambd=0.1 every non-zero entry survives.
        x = mx.array([1.0, -0.5, 0.0, 0.5, -1.5])
        cases = (
            ({}, [1.0, 0.0, 0.0, 0.0, -1.5]),
            ({"lambd": 0.1}, [1.0, -0.5, 0.0, 0.5, -1.5]),
        )
        for kwargs, want in cases:
            out = nn.hard_shrink(x, **kwargs)
            self.assertTrue(mx.array_equal(out, mx.array(want)))
            self.assertEqual(out.shape, (5,))
            self.assertEqual(out.dtype, mx.float32)

    def test_rope(self):
        # For several constructor configurations nn.RoPE must preserve the
        # input shape (with and without an offset) and the input dtype.
        shape = (1, 3, 4)
        configs = ({}, {"traditional": False}, {"base": 10000}, {"scale": 0.25})
        for config in configs:
            rope = nn.RoPE(4, **config)
            x = mx.random.uniform(shape=shape)

            rotated = rope(x)
            self.assertEqual(rotated.shape, shape)
            self.assertEqual(rotated.dtype, mx.float32)

            shifted = rope(x, offset=3)
            self.assertEqual(shifted.shape, shape)

            half = rope(x.astype(mx.float16))
            self.assertEqual(half.dtype, mx.float16)

    def test_alibi(self):
        # nn.ALiBi keeps the shape and dtype of the array it is applied to.
        shape = (1, 8, 20, 20)
        bias_layer = nn.ALiBi()
        scores = mx.random.uniform(shape=shape)

        biased = bias_layer(scores)
        self.assertEqual(biased.shape, shape)
        self.assertEqual(biased.dtype, mx.float32)

        biased_half = bias_layer(scores.astype(mx.float16))
        self.assertEqual(biased_half.dtype, mx.float16)

    def test_dropout(self):
        # nn.Dropout must keep shape and dtype for float32, bfloat16 and float16.
        for dtype in (mx.float32, mx.bfloat16, mx.float16):
            x = mx.ones((2, 4), dtype=dtype)
            out = nn.Dropout(0.5)(x)
            self.assertEqual(out.shape, x.shape)
            self.assertEqual(out.dtype, dtype)

    def test_dropout2d(self):
        # nn.Dropout2d must keep shape and dtype of a 4-D input for float32,
        # bfloat16 and float16.
        for dtype in (mx.float32, mx.bfloat16, mx.float16):
            x = mx.ones((2, 4, 4, 4), dtype=dtype)
            out = nn.Dropout2d(0.5)(x)
            self.assertEqual(out.shape, x.shape)
            self.assertEqual(out.dtype, dtype)

    def test_dropout3d(self):
        # nn.Dropout3d must keep shape and dtype of a 5-D input for float32,
        # bfloat16 and float16.
        for dtype in (mx.float32, mx.bfloat16, mx.float16):
            x = mx.ones((2, 4, 4, 4, 4), dtype=dtype)
            out = nn.Dropout3d(0.5)(x)
            self.assertEqual(out.shape, x.shape)
            self.assertEqual(out.dtype, dtype)

    def test_upsample(self):
        # Checks nn.Upsample output values for the "nearest" and "linear" modes,
        # with and without align_corners, plus the module's string form.
        # Every input is written in (b, c, h, w) order and then transposed to
        # (b, h, w, c); the expected arrays go through the same transpose.
        b, h, w, c = 1, 2, 2, 1
        scale_factor = 2
        upsample_nearest = nn.Upsample(
            scale_factor=scale_factor, mode="nearest", align_corners=True
        )
        upsample_bilinear = nn.Upsample(
            scale_factor=scale_factor, mode="linear", align_corners=True
        )
        upsample_bilinear_no_align_corners = nn.Upsample(
            scale_factor=scale_factor, mode="linear", align_corners=False
        )
        upsample_nearest_no_align_corners = nn.Upsample(
            scale_factor=scale_factor, mode="nearest", align_corners=False
        )
        # Test single feature map, align corners
        x = mx.arange(b * h * w * c).reshape((b, c, h, w)).transpose((0, 2, 3, 1))
        expected_nearest = mx.array(
            [[[[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 3, 3], [2, 2, 3, 3]]]]
        ).transpose((0, 2, 3, 1))
        expected_bilinear = mx.array(
            [
                [
                    [
                        [0, 0.333333, 0.666667, 1],
                        [0.666667, 1, 1.33333, 1.66667],
                        [1.33333, 1.66667, 2, 2.33333],
                        [2, 2.33333, 2.66667, 3],
                    ]
                ]
            ]
        ).transpose((0, 2, 3, 1))
        # Compare before `x` is rebound below; without these two checks the
        # align-corners expectations above are built but never exercised.
        self.assertTrue(np.allclose(upsample_nearest(x), expected_nearest))
        self.assertTrue(np.allclose(upsample_bilinear(x), expected_bilinear))
        # Test single feature map, no align corners
        x = (
            mx.arange(1, b * h * w * c + 1)
            .reshape((b, c, h, w))
            .transpose((0, 2, 3, 1))
        )
        expected_bilinear_no_align_corners = mx.array(
            [
                [
                    [
                        [1.0000, 1.2500, 1.7500, 2.0000],
                        [1.5000, 1.7500, 2.2500, 2.5000],
                        [2.5000, 2.7500, 3.2500, 3.5000],
                        [3.0000, 3.2500, 3.7500, 4.0000],
                    ]
                ]
            ]
        ).transpose((0, 2, 3, 1))
        expected_nearest_no_align_corners = mx.array(
            [[[[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]]]]
        ).transpose((0, 2, 3, 1))
        self.assertTrue(
            np.allclose(
                upsample_nearest_no_align_corners(x), expected_nearest_no_align_corners
            )
        )
        self.assertTrue(
            np.allclose(
                upsample_bilinear_no_align_corners(x),
                expected_bilinear_no_align_corners,
            )
        )

        # Test a more complex batch
        b, h, w, c = 2, 3, 3, 2
        scale_factor = 2
        x = mx.arange((b * h * w * c)).reshape((b, c, h, w)).transpose((0, 2, 3, 1))

        upsample_nearest = nn.Upsample(
            scale_factor=scale_factor, mode="nearest", align_corners=True
        )
        upsample_bilinear = nn.Upsample(
            scale_factor=scale_factor, mode="linear", align_corners=True
        )

        expected_nearest = mx.array(
            [
                [
                    [
                        [0.0, 0.0, 1.0, 1.0, 2.0, 2.0],
                        [0.0, 0.0, 1.0, 1.0, 2.0, 2.0],
                        [3.0, 3.0, 4.0, 4.0, 5.0, 5.0],
                        [3.0, 3.0, 4.0, 4.0, 5.0, 5.0],
                        [6.0, 6.0, 7.0, 7.0, 8.0, 8.0],
                        [6.0, 6.0, 7.0, 7.0, 8.0, 8.0],
                    ],
                    [
                        [9.0, 9.0, 10.0, 10.0, 11.0, 11.0],
                        [9.0, 9.0, 10.0, 10.0, 11.0, 11.0],
                        [12.0, 12.0, 13.0, 13.0, 14.0, 14.0],
                        [12.0, 12.0, 13.0, 13.0, 14.0, 14.0],
                        [15.0, 15.0, 16.0, 16.0, 17.0, 17.0],
                        [15.0, 15.0, 16.0, 16.0, 17.0, 17.0],
                    ],
                ],
                [
                    [
                        [18.0, 18.0, 19.0, 19.0, 20.0, 20.0],
                        [18.0, 18.0, 19.0, 19.0, 20.0, 20.0],
                        [21.0, 21.0, 22.0, 22.0, 23.0, 23.0],
                        [21.0, 21.0, 22.0, 22.0, 23.0, 23.0],
                        [24.0, 24.0, 25.0, 25.0, 26.0, 26.0],
                        [24.0, 24.0, 25.0, 25.0, 26.0, 26.0],
                    ],
                    [
                        [27.0, 27.0, 28.0, 28.0, 29.0, 29.0],
                        [27.0, 27.0, 28.0, 28.0, 29.0, 29.0],
                        [30.0, 30.0, 31.0, 31.0, 32.0, 32.0],
                        [30.0, 30.0, 31.0, 31.0, 32.0, 32.0],
                        [33.0, 33.0, 34.0, 34.0, 35.0, 35.0],
                        [33.0, 33.0, 34.0, 34.0, 35.0, 35.0],
                    ],
                ],
            ]
        ).transpose((0, 2, 3, 1))
        expected_bilinear = mx.array(
            [
                [
                    [
                        [0.0, 0.4, 0.8, 1.2, 1.6, 2.0],
                        [1.2, 1.6, 2.0, 2.4, 2.8, 3.2],
                        [2.4, 2.8, 3.2, 3.6, 4.0, 4.4],
                        [3.6, 4.0, 4.4, 4.8, 5.2, 5.6],
                        [4.8, 5.2, 5.6, 6.0, 6.4, 6.8],
                        [6.0, 6.4, 6.8, 7.2, 7.6, 8.0],
                    ],
                    [
                        [9.0, 9.4, 9.8, 10.2, 10.6, 11.0],
                        [10.2, 10.6, 11.0, 11.4, 11.8, 12.2],
                        [11.4, 11.8, 12.2, 12.6, 13.0, 13.4],
                        [12.6, 13.0, 13.4, 13.8, 14.2, 14.6],
                        [13.8, 14.2, 14.6, 15.0, 15.4, 15.8],
                        [15.0, 15.4, 15.8, 16.2, 16.6, 17.0],
                    ],
                ],
                [
                    [
                        [18.0, 18.4, 18.8, 19.2, 19.6, 20.0],
                        [19.2, 19.6, 20.0, 20.4, 20.8, 21.2],
                        [20.4, 20.8, 21.2, 21.6, 22.0, 22.4],
                        [21.6, 22.0, 22.4, 22.8, 23.2, 23.6],
                        [22.8, 23.2, 23.6, 24.0, 24.4, 24.8],
                        [24.0, 24.4, 24.8, 25.2, 25.6, 26.0],
                    ],
                    [
                        [27.0, 27.4, 27.8, 28.2, 28.6, 29.0],
                        [28.2, 28.6, 29.0, 29.4, 29.8, 30.2],
                        [29.4, 29.8, 30.2, 30.6, 31.0, 31.4],
                        [30.6, 31.0, 31.4, 31.8, 32.2, 32.6],
                        [31.8, 32.2, 32.6, 33.0, 33.4, 33.8],
                        [33.0, 33.4, 33.8, 34.2, 34.6, 35.0],
                    ],
                ],
            ]
        ).transpose((0, 2, 3, 1))
        self.assertTrue(np.allclose(upsample_nearest(x), expected_nearest))
        self.assertTrue(np.allclose(upsample_bilinear(x), expected_bilinear))

        # Test different height and width scale_factor
        b, h, w, c = 1, 2, 2, 2
        x = mx.arange(b * h * w * c).reshape((b, c, h, w)).transpose((0, 2, 3, 1))
        upsample_nearest = nn.Upsample(
            scale_factor=(2, 3), mode="nearest", align_corners=True
        )
        upsample_bilinear = nn.Upsample(
            scale_factor=(2, 3), mode="linear", align_corners=True
        )

        expected_nearest = mx.array(
            [
                [
                    [
                        [0, 0, 0, 1, 1, 1],
                        [0, 0, 0, 1, 1, 1],
                        [2, 2, 2, 3, 3, 3],
                        [2, 2, 2, 3, 3, 3],
                    ],
                    [
                        [4, 4, 4, 5, 5, 5],
                        [4, 4, 4, 5, 5, 5],
                        [6, 6, 6, 7, 7, 7],
                        [6, 6, 6, 7, 7, 7],
                    ],
                ]
            ]
        ).transpose((0, 2, 3, 1))
        expected_bilinear = mx.array(
            [
                [
                    [
                        [0, 0.2, 0.4, 0.6, 0.8, 1],
                        [0.666667, 0.866667, 1.06667, 1.26667, 1.46667, 1.66667],
                        [1.33333, 1.53333, 1.73333, 1.93333, 2.13333, 2.33333],
                        [2, 2.2, 2.4, 2.6, 2.8, 3],
                    ],
                    [
                        [4, 4.2, 4.4, 4.6, 4.8, 5],
                        [4.66667, 4.86667, 5.06667, 5.26667, 5.46667, 5.66667],
                        [5.33333, 5.53333, 5.73333, 5.93333, 6.13333, 6.33333],
                        [6, 6.2, 6.4, 6.6, 6.8, 7],
                    ],
                ]
            ]
        ).transpose((0, 2, 3, 1))
        self.assertTrue(np.allclose(upsample_nearest(x), expected_nearest))
        self.assertTrue(np.allclose(upsample_bilinear(x), expected_bilinear))

        # Test repr
        self.assertEqual(
            str(nn.Upsample(scale_factor=2)),
            "Upsample(scale_factor=2.0, mode='nearest', align_corners=False)",
        )
        self.assertEqual(
            str(nn.Upsample(scale_factor=(2, 3))),
            "Upsample(scale_factor=(2.0, 3.0), mode='nearest', align_corners=False)",
        )

    def test_pooling(self):
        # Compares the 1d/2d/3d max- and average-pooling layers against
        # hand-written expected outputs for several kernel/stride/padding
        # combinations, and checks the layers' string forms.
        # Max-pool results are compared exactly; average-pool results use
        # np.allclose where the expected literals are rounded decimals.

        # Test 1d pooling
        # Input has shape (2, 4, 3); the expectations below pool along axis 1.
        x = mx.array(
            [
                [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]],
                [[12, 13, 14], [15, 16, 17], [18, 19, 20], [21, 22, 23]],
            ]
        )
        expected_max_pool_output_no_padding_stride_1 = [
            [[3.0, 4.0, 5.0], [6.0, 7.0, 8.0], [9.0, 10.0, 11.0]],
            [[15.0, 16.0, 17.0], [18.0, 19.0, 20.0], [21.0, 22.0, 23.0]],
        ]
        expected_max_pool_output_no_padding_stride_2 = [
            [[3.0, 4.0, 5.0], [9.0, 10.0, 11.0]],
            [[15.0, 16.0, 17.0], [21.0, 22.0, 23.0]],
        ]
        expected_max_pool_output_padding_1_stride_2 = [
            [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [9.0, 10.0, 11.0]],
            [[12.0, 13.0, 14.0], [18.0, 19.0, 20.0], [21.0, 22.0, 23.0]],
        ]
        expected_max_pool_output_padding_1_stride_2_kernel_3 = [
            [[3.0, 4.0, 5.0], [9.0, 10.0, 11.0]],
            [[15.0, 16.0, 17.0], [21.0, 22.0, 23.0]],
        ]
        expected_avg_pool_output_no_padding_stride_1 = [
            [
                [1.5000, 2.5000, 3.5000],
                [4.5000, 5.5000, 6.5000],
                [7.5000, 8.5000, 9.5000],
            ],
            [
                [13.5000, 14.5000, 15.5000],
                [16.5000, 17.5000, 18.5000],
                [19.5000, 20.5000, 21.5000],
            ],
        ]
        expected_avg_pool_output_no_padding_stride_2 = [
            [[1.5000, 2.5000, 3.5000], [7.5000, 8.5000, 9.5000]],
            [[13.5000, 14.5000, 15.5000], [19.5000, 20.5000, 21.5000]],
        ]
        # With padding, these expected means count the padded positions as
        # zeros and still divide by the full kernel size, e.g. (9 + 0) / 2 = 4.5.
        expected_avg_pool_output_padding_1_stride_2 = [
            [
                [0.0000, 0.5000, 1.0000],
                [4.5000, 5.5000, 6.5000],
                [4.5000, 5.0000, 5.5000],
            ],
            [
                [6.0000, 6.5000, 7.0000],
                [16.5000, 17.5000, 18.5000],
                [10.5000, 11.0000, 11.5000],
            ],
        ]
        expected_avg_pool_output_padding_1_kernel_3 = [
            [[1, 1.66667, 2.33333], [6, 7, 8]],
            [[9, 9.66667, 10.3333], [18, 19, 20]],
        ]
        self.assertTrue(
            np.array_equal(
                nn.MaxPool1d(kernel_size=2, stride=1, padding=0)(x),
                expected_max_pool_output_no_padding_stride_1,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool1d(kernel_size=2, stride=2, padding=0)(x),
                expected_max_pool_output_no_padding_stride_2,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool1d(kernel_size=2, stride=2, padding=1)(x),
                expected_max_pool_output_padding_1_stride_2,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool1d(kernel_size=3, stride=2, padding=1)(x),
                expected_max_pool_output_padding_1_stride_2_kernel_3,
            )
        )
        self.assertTrue(
            np.allclose(
                nn.AvgPool1d(kernel_size=2, stride=1, padding=0)(x),
                expected_avg_pool_output_no_padding_stride_1,
            )
        )
        self.assertTrue(
            np.allclose(
                nn.AvgPool1d(kernel_size=2, stride=2, padding=0)(x),
                expected_avg_pool_output_no_padding_stride_2,
            )
        )
        self.assertTrue(
            np.allclose(
                nn.AvgPool1d(kernel_size=2, stride=2, padding=1)(x),
                expected_avg_pool_output_padding_1_stride_2,
            )
        )
        self.assertTrue(
            np.allclose(
                nn.AvgPool1d(kernel_size=3, stride=2, padding=1)(x),
                expected_avg_pool_output_padding_1_kernel_3,
            )
        )
        # Test 2d pooling
        # Input has shape (1, 4, 4, 2); the last axis holds two value planes
        # (0..15 and 16..31) that are pooled independently in the expectations.
        x = mx.array(
            [
                [
                    [[0, 16], [1, 17], [2, 18], [3, 19]],
                    [[4, 20], [5, 21], [6, 22], [7, 23]],
                    [[8, 24], [9, 25], [10, 26], [11, 27]],
                    [[12, 28], [13, 29], [14, 30], [15, 31]],
                ]
            ]
        )
        expected_max_pool_output_no_padding_stride_1 = [
            [
                [[5, 21], [6, 22], [7, 23]],
                [[9, 25], [10, 26], [11, 27]],
                [[13, 29], [14, 30], [15, 31]],
            ]
        ]
        expected_max_pool_output_no_padding_stride_2 = [
            [[[5, 21], [7, 23]], [[13, 29], [15, 31]]]
        ]
        expected_max_pool_output_padding_1 = [
            [
                [[0, 16], [2, 18], [3, 19]],
                [[8, 24], [10, 26], [11, 27]],
                [[12, 28], [14, 30], [15, 31]],
            ]
        ]
        expected_mean_pool_output_no_padding_stride_1 = [
            [
                [[2.5000, 18.5000], [3.5000, 19.5000], [4.5000, 20.5000]],
                [[6.5000, 22.5000], [7.5000, 23.5000], [8.5000, 24.5000]],
                [[10.5000, 26.5000], [11.5000, 27.5000], [12.5000, 28.5000]],
            ]
        ]
        expected_mean_pool_output_no_padding_stride_2 = [
            [
                [[2.5000, 18.5000], [4.5000, 20.5000]],
                [[10.5000, 26.5000], [12.5000, 28.5000]],
            ]
        ]
        expected_mean_pool_output_padding_1 = [
            [
                [[0.0000, 4.0000], [0.7500, 8.7500], [0.7500, 4.7500]],
                [[3.0000, 11.0000], [7.5000, 23.5000], [4.5000, 12.5000]],
                [[3.0000, 7.0000], [6.7500, 14.7500], [3.7500, 7.7500]],
            ]
        ]
        self.assertTrue(
            np.array_equal(
                nn.MaxPool2d(kernel_size=2, stride=1, padding=0)(x),
                expected_max_pool_output_no_padding_stride_1,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool2d(kernel_size=2, stride=2, padding=0)(x),
                expected_max_pool_output_no_padding_stride_2,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool2d(kernel_size=2, stride=2, padding=1)(x),
                expected_max_pool_output_padding_1,
            )
        )
        # Average pooling
        # The stride-2 cases below are compared exactly: every expected value
        # is a multiple of 0.25.
        self.assertTrue(
            np.allclose(
                nn.AvgPool2d(kernel_size=2, stride=1, padding=0)(x),
                expected_mean_pool_output_no_padding_stride_1,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.AvgPool2d(kernel_size=2, stride=2, padding=0)(x),
                expected_mean_pool_output_no_padding_stride_2,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.AvgPool2d(kernel_size=2, stride=2, padding=1)(x),
                expected_mean_pool_output_padding_1,
            )
        )
        # Test multiple batches
        # Input has shape (2, 4, 4, 2), filled with consecutive integers.
        x = mx.array(
            [
                [
                    [[0, 1], [2, 3], [4, 5], [6, 7]],
                    [[8, 9], [10, 11], [12, 13], [14, 15]],
                    [[16, 17], [18, 19], [20, 21], [22, 23]],
                    [[24, 25], [26, 27], [28, 29], [30, 31]],
                ],
                [
                    [[32, 33], [34, 35], [36, 37], [38, 39]],
                    [[40, 41], [42, 43], [44, 45], [46, 47]],
                    [[48, 49], [50, 51], [52, 53], [54, 55]],
                    [[56, 57], [58, 59], [60, 61], [62, 63]],
                ],
            ]
        )
        expected_max_pool_output = [
            [[[10.0, 11.0], [14.0, 15.0]], [[26.0, 27.0], [30.0, 31.0]]],
            [[[42.0, 43.0], [46.0, 47.0]], [[58.0, 59.0], [62.0, 63.0]]],
        ]
        expected_avg_pool_output = [
            [[[2.22222, 2.66667], [5.33333, 6]], [[11.3333, 12], [20, 21]]],
            [[[16.4444, 16.8889], [26.6667, 27.3333]], [[32.6667, 33.3333], [52, 53]]],
        ]
        self.assertTrue(
            np.array_equal(
                nn.MaxPool2d(kernel_size=3, stride=2, padding=1)(x),
                expected_max_pool_output,
            )
        )
        self.assertTrue(
            np.allclose(
                nn.AvgPool2d(kernel_size=3, stride=2, padding=1)(x),
                expected_avg_pool_output,
            )
        )
        # Test irregular kernel (2, 4), stride (3, 1) and padding (1, 2)
        # Input has shape (2, 4, 4, 3), filled with consecutive integers.
        x = mx.array(
            [
                [
                    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]],
                    [[12, 13, 14], [15, 16, 17], [18, 19, 20], [21, 22, 23]],
                    [[24, 25, 26], [27, 28, 29], [30, 31, 32], [33, 34, 35]],
                    [[36, 37, 38], [39, 40, 41], [42, 43, 44], [45, 46, 47]],
                ],
                [
                    [[48, 49, 50], [51, 52, 53], [54, 55, 56], [57, 58, 59]],
                    [[60, 61, 62], [63, 64, 65], [66, 67, 68], [69, 70, 71]],
                    [[72, 73, 74], [75, 76, 77], [78, 79, 80], [81, 82, 83]],
                    [[84, 85, 86], [87, 88, 89], [90, 91, 92], [93, 94, 95]],
                ],
            ]
        )
        expected_irregular_max_pool_output = [
            [
                [
                    [3.0, 4.0, 5.0],
                    [6.0, 7.0, 8.0],
                    [9.0, 10.0, 11.0],
                    [9.0, 10.0, 11.0],
                    [9.0, 10.0, 11.0],
                ],
                [
                    [39.0, 40.0, 41.0],
                    [42.0, 43.0, 44.0],
                    [45.0, 46.0, 47.0],
                    [45.0, 46.0, 47.0],
                    [45.0, 46.0, 47.0],
                ],
            ],
            [
                [
                    [51.0, 52.0, 53.0],
                    [54.0, 55.0, 56.0],
                    [57.0, 58.0, 59.0],
                    [57.0, 58.0, 59.0],
                    [57.0, 58.0, 59.0],
                ],
                [
                    [87.0, 88.0, 89.0],
                    [90.0, 91.0, 92.0],
                    [93.0, 94.0, 95.0],
                    [93.0, 94.0, 95.0],
                    [93.0, 94.0, 95.0],
                ],
            ],
        ]
        expected_irregular_average_pool_output = [
            [
                [
                    [0.3750, 0.6250, 0.8750],
                    [1.1250, 1.5000, 1.8750],
                    [2.2500, 2.7500, 3.2500],
                    [2.2500, 2.6250, 3.0000],
                    [1.8750, 2.1250, 2.3750],
                ],
                [
                    [15.7500, 16.2500, 16.7500],
                    [24.7500, 25.5000, 26.2500],
                    [34.5000, 35.5000, 36.5000],
                    [27.0000, 27.7500, 28.5000],
                    [18.7500, 19.2500, 19.7500],
                ],
            ],
            [
                [
                    [12.3750, 12.6250, 12.8750],
                    [19.1250, 19.5000, 19.8750],
                    [26.2500, 26.7500, 27.2500],
                    [20.2500, 20.6250, 21.0000],
                    [13.8750, 14.1250, 14.3750],
                ],
                [
                    [39.7500, 40.2500, 40.7500],
                    [60.7500, 61.5000, 62.2500],
                    [82.5000, 83.5000, 84.5000],
                    [63.0000, 63.7500, 64.5000],
                    [42.7500, 43.2500, 43.7500],
                ],
            ],
        ]
        self.assertTrue(
            np.array_equal(
                nn.MaxPool2d(kernel_size=(2, 4), stride=(3, 1), padding=(1, 2))(x),
                expected_irregular_max_pool_output,
            )
        )
        self.assertTrue(
            np.allclose(
                nn.AvgPool2d(kernel_size=(2, 4), stride=(3, 1), padding=(1, 2))(x),
                expected_irregular_average_pool_output,
            )
        )
        # Test repr
        # Scalar arguments are shown expanded to one entry per pooled axis; in
        # the first case, where stride is omitted, it is shown equal to the
        # kernel size.
        self.assertEqual(
            str(nn.MaxPool1d(kernel_size=3, padding=2)),
            "MaxPool1d(kernel_size=(3,), stride=(3,), padding=(2,))",
        )
        self.assertEqual(
            str(nn.AvgPool1d(kernel_size=2, stride=3)),
            "AvgPool1d(kernel_size=(2,), stride=(3,), padding=(0,))",
        )
        self.assertEqual(
            str(nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
            "MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))",
        )
        self.assertEqual(
            str(nn.AvgPool2d(kernel_size=(1, 2), stride=2, padding=(1, 2))),
            "AvgPool2d(kernel_size=(1, 2), stride=(2, 2), padding=(1, 2))",
        )
        # Test 3d pooling
        # Input has shape (1, 2, 3, 3, 3), filled with consecutive integers.
        x = mx.array(
            [
                [
                    [
                        [[0, 1, 2], [3, 4, 5], [6, 7, 8]],
                        [[9, 10, 11], [12, 13, 14], [15, 16, 17]],
                        [[18, 19, 20], [21, 22, 23], [24, 25, 26]],
                    ],
                    [
                        [[27, 28, 29], [30, 31, 32], [33, 34, 35]],
                        [[36, 37, 38], [39, 40, 41], [42, 43, 44]],
                        [[45, 46, 47], [48, 49, 50], [51, 52, 53]],
                    ],
                ]
            ]
        )
        expected_max_pool_output_no_padding_stride_1 = [
            [[[[39, 40, 41], [42, 43, 44]], [[48, 49, 50], [51, 52, 53]]]]
        ]

        expected_max_pool_output_no_padding_stride_2 = [[[[[39, 40, 41]]]]]
        expected_max_pool_output_padding_1 = [
            [
                [[[0, 1, 2], [6, 7, 8]], [[18, 19, 20], [24, 25, 26]]],
                [[[27, 28, 29], [33, 34, 35]], [[45, 46, 47], [51, 52, 53]]],
            ]
        ]
        expected_irregular_max_pool_output = [
            [
                [[[9, 10, 11], [12, 13, 14], [15, 16, 17]]],
                [[[36, 37, 38], [39, 40, 41], [42, 43, 44]]],
            ]
        ]

        self.assertTrue(
            np.array_equal(
                nn.MaxPool3d(kernel_size=2, stride=1, padding=0)(x),
                expected_max_pool_output_no_padding_stride_1,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool3d(kernel_size=2, stride=2, padding=0)(x),
                expected_max_pool_output_no_padding_stride_2,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool3d(kernel_size=2, stride=2, padding=1)(x),
                expected_max_pool_output_padding_1,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.MaxPool3d(kernel_size=(1, 2, 1), stride=(1, 2, 1))(x),
                expected_irregular_max_pool_output,
            )
        )
        self.assertEqual(
            str(nn.MaxPool3d(kernel_size=3, stride=3, padding=2)),
            "MaxPool3d(kernel_size=(3, 3, 3), stride=(3, 3, 3), padding=(2, 2, 2))",
        )

        # 3d average pooling on the same input. All expected values here are
        # multiples of 0.125, and the comparisons below are exact.
        expected_avg_pool_output_no_padding_stride_1 = [
            [
                [
                    [[19.5, 20.5, 21.5], [22.5, 23.5, 24.5]],
                    [[28.5, 29.5, 30.5], [31.5, 32.5, 33.5]],
                ]
            ]
        ]

        expected_avg_pool_output_no_padding_stride_2 = [[[[[19.5, 20.5, 21.5]]]]]
        expected_avg_pool_output_padding_1 = [
            [
                [
                    [[0, 0.125, 0.25], [1.125, 1.375, 1.625]],
                    [[3.375, 3.625, 3.875], [9, 9.5, 10]],
                ],
                [
                    [[3.375, 3.5, 3.625], [7.875, 8.125, 8.375]],
                    [[10.125, 10.375, 10.625], [22.5, 23, 23.5]],
                ],
            ]
        ]
        expected_irregular_avg_pool_output = [
            [
                [[[4.5, 5.5, 6.5], [7.5, 8.5, 9.5], [10.5, 11.5, 12.5]]],
                [[[31.5, 32.5, 33.5], [34.5, 35.5, 36.5], [37.5, 38.5, 39.5]]],
            ]
        ]

        self.assertTrue(
            np.array_equal(
                nn.AvgPool3d(kernel_size=2, stride=1, padding=0)(x),
                expected_avg_pool_output_no_padding_stride_1,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.AvgPool3d(kernel_size=2, stride=2, padding=0)(x),
                expected_avg_pool_output_no_padding_stride_2,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.AvgPool3d(kernel_size=2, stride=2, padding=1)(x),
                expected_avg_pool_output_padding_1,
            )
        )
        self.assertTrue(
            np.array_equal(
                nn.AvgPool3d(kernel_size=(1, 2, 1), stride=(1, 2, 1))(x),
                expected_irregular_avg_pool_output,
            )
        )
        self.assertEqual(
            str(nn.AvgPool3d(kernel_size=3, stride=3, padding=2)),
            "AvgPool3d(kernel_size=(3, 3, 3), stride=(3, 3, 3), padding=(2, 2, 2))",
        )

    def test_set_dtype(self):
        # Walks a Linear layer through a sequence of set_dtype calls and, after
        # each one, checks the dtype of every parameter.
        def expect_param_dtype(module, expected):
            for name, param in tree_flatten(module.parameters()):
                self.assertEqual(param.dtype, expected, f"dtype mismatch for {name}")

        linear = nn.Linear(input_dims=4, output_dims=8, bias=True)
        expect_param_dtype(linear, mx.float32)

        # (positional args, keyword args, dtype every parameter must have after)
        steps = [
            # No predicate given: float32 parameters become bfloat16.
            ((mx.bfloat16,), {}, mx.bfloat16),
            # Predicate rejects everything: parameters stay bfloat16.
            ((mx.float32, lambda dt: False), {}, mx.bfloat16),
            # Predicate accepts everything: floats are cast to int32.
            ((mx.int32, lambda dt: True), {}, mx.int32),
            # Explicit predicate=None: int32 parameters become int64.
            ((mx.int64,), {"predicate": None}, mx.int64),
            # Predicate accepts integer dtypes, which all parameters now are.
            ((mx.int16, lambda dt: mx.issubdtype(dt, mx.integer)), {}, mx.int16),
        ]
        for args, kwargs, expected in steps:
            linear.set_dtype(*args, **kwargs)
            expect_param_dtype(linear, expected)

    def test_rnn(self):
        # Shape checks for nn.RNN on 3-d and 2-d inputs, with and without an
        # explicit initial hidden state, plus rejection of a string nonlinearity.
        rnn = nn.RNN(input_size=5, hidden_size=12, bias=True)
        batch = mx.random.normal((2, 25, 5))
        self.assertEqual(rnn(batch).shape, (2, 25, 12))

        def relu(v):
            return mx.maximum(0, v)

        # A user-supplied callable nonlinearity, this time without bias.
        rnn = nn.RNN(5, 12, bias=False, nonlinearity=relu)
        self.assertEqual(rnn(batch).shape, (2, 25, 12))

        # A nonlinearity given by name (a string) must be refused.
        self.assertRaises(ValueError, nn.RNN, 5, 12, nonlinearity="tanh")

        # 2-d input: the output keeps the leading dimension of 44.
        sequence = mx.random.normal((44, 5))
        states = rnn(sequence)
        self.assertEqual(states.shape, (44, 12))

        # Feed the last state back in as the initial hidden state.
        states = rnn(sequence, hidden=states[-1, :])
        self.assertEqual(states.shape, (44, 12))

    def test_gru(self):
        # Shape checks for nn.GRU on 3-d and 2-d inputs, with and without an
        # initial hidden state, plus the hidden=None / zeros equivalence.
        gru = nn.GRU(5, 12, bias=True)
        batch = mx.random.normal((2, 25, 5))

        states = gru(batch)
        self.assertEqual(states.shape, (2, 25, 12))

        # Continue from the final state of the previous call.
        states = gru(batch, hidden=states[:, -1, :])
        self.assertEqual(states.shape, (2, 25, 12))

        sequence = mx.random.normal((44, 5))
        states = gru(sequence)
        self.assertEqual(states.shape, (44, 12))

        # Here the hidden state is passed positionally.
        states = gru(sequence, states[-1, :])
        self.assertEqual(states.shape, (44, 12))

        # hidden=None should be equivalent to hidden=zeros (issue #3249)
        for use_bias in (True, False):
            gru = nn.GRU(5, 12, bias=use_bias)
            batch = mx.random.normal((2, 25, 5))
            from_none = gru(batch)
            from_zeros = gru(batch, hidden=mx.zeros((2, 12)))
            self.assertTrue(mx.allclose(from_none, from_zeros).item())

    def test_lstm(self):
        # Shape checks for nn.LSTM: the layer returns a (hidden, cell) pair and
        # both members must have the expected shape.
        def expect_shapes(states, shape):
            hidden, cell = states
            self.assertEqual(hidden.shape, shape)
            self.assertEqual(cell.shape, shape)
            return hidden, cell

        lstm = nn.LSTM(5, 12)
        batch = mx.random.normal((2, 25, 5))

        hidden, cell = expect_shapes(lstm(batch), (2, 25, 12))

        # Continue from the final hidden and cell states of the previous call.
        hidden, cell = expect_shapes(
            lstm(batch, hidden=hidden[:, -1, :], cell=cell[:, -1, :]),
            (2, 25, 12),
        )

        sequence = mx.random.normal((44, 5))
        hidden, cell = expect_shapes(lstm(sequence), (44, 12))

        # A fresh 2-d input, started from the previous final states.
        sequence = mx.random.normal((44, 5))
        expect_shapes(
            lstm(sequence, hidden=hidden[-1, :], cell=cell[-1, :]),
            (44, 12),
        )

    def test_quantized_embedding(self):
        # An 8-bit QuantizedEmbedding built from an Embedding must stay close to
        # the original, both for index lookup and when used via as_linear.
        def cosine(a, b):
            # Cosine similarity along the last axis.
            dot = (a * b).sum(-1)
            norm_a = mx.linalg.norm(a, axis=-1)
            norm_b = mx.linalg.norm(b, axis=-1)
            return dot / norm_a / norm_b

        emb = nn.Embedding(32, 256)
        qemb = nn.QuantizedEmbedding.from_embedding(emb, bits=8)

        # Lookup: the largest absolute error must be below the largest scale.
        indices = mx.array([2, 6, 9, 3, 0, 3])
        dense = emb(indices)
        quantized = qemb(indices)
        worst_error = (dense - quantized).abs().max()
        self.assertLess(worst_error, qemb.scales.max())

        # as_linear: outputs must point in almost the same direction.
        x = mx.random.uniform(shape=(2, 256))
        dense = emb.as_linear(x)
        quantized = qemb.as_linear(x)
        self.assertGreater(cosine(dense, quantized).min(), 0.99)

    def test_causal_mask(self):
        # For both half-precision dtypes the additive causal mask must contain
        # no NaNs, and its top-right entry must be negative.
        for dtype in (mx.float16, mx.bfloat16):
            mask = nn.MultiHeadAttention.create_additive_causal_mask(4, dtype)
            self.assertFalse(mx.any(mx.isnan(mask)))
            self.assertTrue(mask[0, -1].item() < 0)

    def test_attention(self):
        # Self-attention (same array as queries, keys and values) must return
        # an array with the input's shape.
        mha = nn.MultiHeadAttention(32, 4)
        tokens = mx.random.normal(shape=(2, 5, 32))
        attended = mha(tokens, tokens, tokens)
        self.assertEqual(attended.shape, tokens.shape)


# When executed directly as a script, hand control to the shared MLX test
# runner (presumably it discovers and runs the tests in this module).
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_ops.py
================================================
# Copyright © 2023-2024 Apple Inc.

import math
import os
import unittest
from itertools import permutations, product

import mlx.core as mx
import mlx_tests
import numpy as np


def np_wrap_between(x, a):
    """Wraps `x` between `[-a, a]`.

    Computes ``remainder(x + a, 2 * a) - a`` element-wise, so for ``a > 0``
    the result lies in ``[-a, a)``.

    Args:
        x: Scalar or NumPy array of real values to wrap.
        a: Half-width of the target interval.

    Returns:
        The wrapped value(s); arrays keep the shape of ``x``.
    """
    two_a = 2 * a
    zero = 0
    rem = np.remainder(np.add(x, a), two_a)
    if isinstance(rem, np.ndarray):
        # Element-wise guard against a negative remainder. `np.where` is the
        # element-wise conditional; `np.select` expects a *list* of condition
        # arrays and raises on empty input.
        rem = np.where(rem < zero, np.add(rem, two_a), rem)
    else:
        rem = np.add(rem, two_a) if rem < zero else rem
    return np.subtract(rem, a)


def np_logaddexp(x1: np.ndarray, x2: np.ndarray):
    """NumPy reference implementation of ``log(exp(x1) + exp(x2))``.

    Floating inputs use ``max + log1p(exp(-|x1 - x2|))``. Where ``x1 - x2``
    is NaN (for example two infinities of the same sign, or a NaN input) the
    result is ``x1 + x2`` for that element only.

    Any other dtype takes the second path, which is written for complex input:
    the imaginary part of the result is wrapped with ``np_wrap_between`` using
    ``np.pi`` as the half-width.

    Args:
        x1: First operand (NumPy array or NumPy scalar; must expose ``dtype``).
        x2: Second operand, broadcast-compatible with ``x1``.

    Returns:
        The element-wise log-add-exp of ``x1`` and ``x2``.
    """
    amax = np.maximum(x1, x2)
    if np.issubdtype(x1.dtype, np.floating):
        delta = np.subtract(x1, x2)
        if isinstance(delta, np.ndarray):
            # Must be an element-wise choice: `np.select` would read the NaN
            # mask as a list of scalar conditions, and a single True entry
            # would overwrite the whole result with that element's sum.
            return np.where(
                np.isnan(delta),
                np.add(x1, x2),
                np.add(amax, np.log1p(np.exp(np.negative(np.abs(delta))))),
            )
        else:
            return (
                np.add(x1, x2)
                if np.isnan(delta)
                else np.add(amax, np.log1p(np.exp(np.negative(np.abs(delta)))))
            )
    else:
        # x1 + x2 - 2 * amax equals min - max when amax is one of the operands.
        delta = np.subtract(np.add(x1, x2), np.multiply(amax, 2))
        out = np.add(amax, np.log1p(np.exp(delta)))
        return np.real(out) + 1j * np_wrap_between(np.imag(out), np.pi)


def np_cumlogaddexp(x1: np.ndarray, axis: int = -1):
    """Cumulative ``np_logaddexp`` of ``x1`` along ``axis``.

    Each position along ``axis`` holds the log-add-exp of every element up to
    and including it, built up one step at a time.

    Args:
        x1: Input array; it is copied, not modified.
        axis: Axis to accumulate along. Defaults to the last axis.

    Returns:
        A new array with the same shape as ``x1``.
    """
    out = x1.copy()
    # View `out` with `axis` moved to the front so that `steps[i]` really
    # indexes along `axis`; writes through the view land in `out`.
    steps = np.moveaxis(out, axis, 0)
    for i in range(1, steps.shape[0]):
        steps[i] = np_logaddexp(steps[i], steps[i - 1])
    return out


class TestOps(mlx_tests.MLXTestCase):
    def test_full_ones_zeros(self):
        # Integer shape with a scalar fill value.
        filled = mx.full(2, 3.0)
        self.assertEqual(filled.shape, (2,))
        self.assertEqual(filled.tolist(), [3.0, 3.0])

        # Tuple shape; a Python float fill yields float32.
        filled = mx.full((2, 3), 2.0)
        self.assertEqual(filled.dtype, mx.float32)
        self.assertEqual(filled.shape, (2, 3))
        self.assertEqual(filled.tolist(), [[2, 2, 2], [2, 2, 2]])

        # An array fill value is repeated per row and keeps its dtype.
        filled = mx.full([3, 2], mx.array([False, True]))
        self.assertEqual(filled.dtype, mx.bool_)
        self.assertEqual(filled.tolist(), [[False, True]] * 3)

        filled = mx.full([3, 2], mx.array([2.0, 3.0]))
        self.assertEqual(filled.tolist(), [[2, 3]] * 3)

        # zeros / ones with an integer shape.
        for factory, value in ((mx.zeros, 0.0), (mx.ones, 1.0)):
            vec = factory(2)
            self.assertEqual(vec.shape, (2,))
            self.assertEqual(vec.tolist(), [value, value])

        # Explicit dtypes, plus the *_like variants.
        builders = ((mx.zeros, mx.zeros_like, 0), (mx.ones, mx.ones_like, 1))
        for dtype in (mx.bool_, mx.int32, mx.float32):
            for factory, like_factory, value in builders:
                arr = factory([2, 2], dtype)
                self.assertEqual(arr.dtype, dtype)
                reference = mx.array([[value, value], [value, value]])
                self.assertTrue(mx.array_equal(arr, reference))
                twin = like_factory(arr)
                self.assertEqual(twin.dtype, dtype)
                self.assertTrue(mx.array_equal(twin, arr))

    def test_scalar_inputs(self):
        def check(result, dtype, value):
            self.assertEqual(result.dtype, dtype)
            self.assertEqual(result.item(), value)

        # Check combinations of python types
        python_cases = (
            (False, True, mx.bool_, True),
            (1, 2, mx.int32, 3),
            (1.0, 2.0, mx.float32, 3.0),
            (True, 2, mx.int32, 3),
            (True, 2.0, mx.float32, 3.0),
            (1, 2.0, mx.float32, 3.0),
            (2, True, mx.int32, 3),
            (2.0, True, mx.float32, 3.0),
            (2.0, 1, mx.float32, 3.0),
        )
        for lhs, rhs, dtype, value in python_cases:
            check(mx.add(lhs, rhs), dtype, value)

        # Check combinations with mlx arrays
        check(mx.add(mx.array(True), False), mx.bool_, True)
        check(mx.add(mx.array(1), False), mx.int32, 1.0)

        # Edge case: take the type of the scalar
        check(mx.add(mx.array(True), 1), mx.int32, 2)
        check(mx.add(mx.array(1.0), 1), mx.float32, 2.0)
        check(mx.add(1, mx.array(1.0)), mx.float32, 2.0)

        # Every binary op must agree with NumPy on plain Python scalars.
        op_names = (
            "add",
            "subtract",
            "multiply",
            "divide",
            "floor_divide",
            "remainder",
            "equal",
            "not_equal",
            "less",
            "greater",
            "less_equal",
            "greater_equal",
            "maximum",
            "minimum",
        )
        # Avoid subtract from bool and divide by 0
        lefts = [-1, 0, 1, -1.0, 1.0]
        rights = [True, -1, 1, -1.0, 1.0]
        for name in op_names:
            np_fn = getattr(np, name)
            mx_fn = getattr(mx, name)
            for lhs, rhs in product(lefts, rights):
                self.assertEqual(np_fn(lhs, rhs).item(), mx_fn(lhs, rhs).item())

    def test_add(self):
        def check(result, dtype, value):
            self.assertEqual(result.dtype, dtype)
            self.assertEqual(result.item(), value)

        total = mx.add(mx.array(1), mx.array(1))
        self.assertEqual(total.item(), 2)

        # bool array combined with a Python int gives int32.
        flag = mx.array(False, mx.bool_)
        check(flag + 1, mx.int32, 1)
        check(2 + flag, mx.int32, 2)

        # Integer arrays keep their dtype with Python ints and become
        # float32 with Python floats, on either side of the operator.
        for int_dtype in (mx.uint32, mx.int64):
            one = mx.array(1, int_dtype)
            check(one + 3, int_dtype, 4)
            check(3 + one, int_dtype, 4)
            check(one + 3.0, mx.float32, 4.0)
            check(3.0 + one, mx.float32, 4.0)

        # float32 array with a Python int stays float32.
        one = mx.array(1, mx.float32)
        check(one + 3, mx.float32, 4)
        check(3 + one, mx.float32, 4)

    def test_subtract(self):
        four = mx.array(4.0)
        three = mx.array(3.0)

        # Function form, then the operator with the scalar on each side.
        computations = (
            lambda: mx.subtract(four, three),
            lambda: four - 3.0,
            lambda: 5.0 - four,
        )
        for compute in computations:
            diff = compute()
            self.assertEqual(diff.dtype, mx.float32)
            self.assertEqual(diff.item(), 1.0)

    def test_multiply(self):
        two = mx.array(2.0)
        three = mx.array(3.0)

        # Function form, then the operator with the scalar on each side.
        computations = (
            lambda: mx.multiply(two, three),
            lambda: two * 3.0,
            lambda: 3.0 * two,
        )
        for compute in computations:
            prod = compute()
            self.assertEqual(prod.dtype, mx.float32)
            self.assertEqual(prod.item(), 6.0)

    def test_divide(self):
        def check(result, dtype, value):
            self.assertEqual(result.dtype, dtype)
            self.assertEqual(result.item(), value)

        two = mx.array(2.0)
        four = mx.array(4.0)

        check(mx.divide(two, four), mx.float32, 0.5)
        check(two / 4.0, mx.float32, 0.5)
        check(1.0 / two, mx.float32, 0.5)

        # Dividing a float16 array by / into a Python float stays float16.
        half = two.astype(mx.float16)
        self.assertEqual((half / 4.0).dtype, mx.float16)

        half = half.astype(mx.float16)
        self.assertEqual((4.0 / half).dtype, mx.float16)

        # Integer operands: true division gives float32, floor division int32.
        five = mx.array(5)
        divisor = mx.array(2)
        check(five / divisor, mx.float32, 2.5)
        check(five // divisor, mx.int32, 2)

    def test_remainder(self):
        for dtype in (mx.int32, mx.float32):

            def check(result, value):
                self.assertEqual(result.dtype, dtype)
                self.assertEqual(result.item(), value)

            two = mx.array(2, dtype=dtype)
            four = mx.array(4, dtype=dtype)

            forward = mx.remainder(two, four)
            backward = mx.remainder(four, two)
            self.assertEqual(forward.dtype, dtype)
            self.assertEqual(forward.item(), 2)
            self.assertEqual(backward.item(), 0)

            # Operator form; the result takes the sign of the divisor.
            check(two % 4, 2)
            check(1 % two, 1)
            check(-1 % two, 1)
            check(-1 % -two, -1)

            centered = mx.arange(10).astype(dtype) - 5
            by_pos = centered % 5
            by_neg = centered % -5
            self.assertEqual(by_pos.tolist(), [0, 1, 2, 3, 4, 0, 1, 2, 3, 4])
            self.assertEqual(by_neg.tolist(), [0, -4, -3, -2, -1, 0, -4, -3, -2, -1])

        wrapped = -mx.ones(64) % mx.full(64, 2)
        self.assertTrue(mx.array_equal(wrapped, mx.ones(64)))

    def test_comparisons(self):
        lhs = mx.array([0.0, 1.0, 5.0])
        rhs = mx.array([-1.0, 2.0, 5.0])

        # Array against array.
        pairwise = (
            (mx.less, [False, True, False]),
            (mx.less_equal, [False, True, True]),
            (mx.greater, [True, False, False]),
            (mx.greater_equal, [True, False, True]),
        )
        for op, want in pairwise:
            self.assertEqual(op(lhs, rhs).tolist(), want)

        # Array against Python scalar, on either side.
        with_scalar = (
            (mx.less, (lhs, 5), [True, True, False]),
            (mx.less, (5, lhs), [False, False, False]),
            (mx.less_equal, (5, lhs), [False, False, True]),
            (mx.greater, (lhs, 1), [False, False, True]),
            (mx.greater_equal, (lhs, 1), [False, True, True]),
        )
        for op, args, want in with_scalar:
            self.assertEqual(op(*args).tolist(), want)

        lhs = mx.array([0.0, 1.0, 5.0, -1.0])
        rhs = mx.array([0.0, 2.0, 5.0, 3.0])
        same = [True, False, True, False]
        self.assertEqual(mx.equal(lhs, rhs).tolist(), same)
        self.assertEqual(mx.not_equal(lhs, rhs).tolist(), [not s for s in same])

    def test_array_equal(self):
        base = mx.array([1, 2, 3, 4])
        self.assertTrue(mx.array_equal(base, mx.array([1, 2, 3, 4])))

        # Different values.
        self.assertFalse(mx.array_equal(base, mx.array([1, 2, 4, 5])))

        # Different shapes.
        self.assertFalse(mx.array_equal(base, mx.array([1, 2, 3])))

        # Can still be equal with different types
        self.assertTrue(mx.array_equal(base, mx.array([1.0, 2.0, 3.0, 4.0])))

        # NaNs only compare equal when equal_nan is requested.
        with_nan = [0.0, float("nan")]
        left = mx.array(with_nan)
        right = mx.array(with_nan)
        self.assertFalse(mx.array_equal(left, right))
        self.assertTrue(mx.array_equal(left, right, equal_nan=True))

        for dtype in (mx.float32, mx.float16, mx.bfloat16, mx.complex64):
            with self.subTest(type=dtype):
                left = mx.array(with_nan).astype(dtype)
                right = mx.array(with_nan).astype(dtype)
                self.assertFalse(mx.array_equal(left, right))
                self.assertTrue(mx.array_equal(left, right, equal_nan=True))

    def test_isnan(self):
        values = [0.0, float("nan")]
        want = [False, True]

        # Default dtype first, then each explicit cast.
        self.assertEqual(mx.isnan(mx.array(values)).tolist(), want)
        for dtype in (mx.float16, mx.bfloat16, mx.complex64):
            cast = mx.array(values).astype(dtype)
            self.assertEqual(mx.isnan(cast).tolist(), want)

        # Scalar case: 0 * inf.
        self.assertEqual(mx.isnan(0 * mx.array(float("inf"))).tolist(), True)

    def test_isinf(self):
        values = [0.0, float("inf")]
        want = [False, True]

        # Default dtype first, then each explicit cast.
        self.assertEqual(mx.isinf(mx.array(values)).tolist(), want)
        for dtype in (mx.float16, mx.bfloat16, mx.complex64):
            cast = mx.array(values).astype(dtype)
            self.assertEqual(mx.isinf(cast).tolist(), want)

        # Scalar case: 0 * inf.
        self.assertEqual(mx.isinf(0 * mx.array(float("inf"))).tolist(), False)

        # Integer extremes are never infinite.
        int_cases = (
            ([-2147483648, 0, 2147483647], mx.int32),
            ([-32768, 0, 32767], mx.int16),
        )
        for limits, dtype in int_cases:
            flags = mx.isinf(mx.array(limits, dtype=dtype))
            self.assertEqual(flags.tolist(), [False, False, False])

    def test_isfinite(self):
        want = [True, False, False]
        values = mx.array([0.0, float("inf"), float("nan")])
        self.assertEqual(mx.isfinite(values).tolist(), want)

        # Casts are chained: float32 -> float16 -> bfloat16.
        for dtype in (mx.float16, mx.bfloat16):
            values = values.astype(dtype)
            self.assertEqual(mx.isfinite(values).tolist(), want)

    def test_tri(self):
        # Compare against np.tri for every shape / diagonal pairing.
        shapes = [[4], [4, 4], [2, 10]]
        diagonals = [-1, 0, 1, -2]
        for shape, diag in product(shapes, diagonals):
            self.assertCmpNumpy(shape, mx.tri, np.tri, k=diag)

        # Default dtype and explicit dtype.
        self.assertEqual(mx.tri(1, 1).dtype, mx.float32)
        self.assertEqual(mx.tri(1, 1, dtype=mx.bfloat16).dtype, mx.bfloat16)

    def test_tril(self):
        for k in (-1, 0, 1, -2):
            self.assertCmpNumpy([(10, 10)], mx.tril, np.tril, k=k)

        # A one-dimensional input is rejected.
        one_dim = mx.zeros(1)
        with self.assertRaises(Exception):
            mx.tril(one_dim)

    def test_triu(self):
        for k in (-1, 0, 1, -2):
            self.assertCmpNumpy([(10, 10)], mx.triu, np.triu, k=k)

        # A one-dimensional input is rejected.
        one_dim = mx.zeros(1)
        with self.assertRaises(Exception):
            mx.triu(one_dim)

    def test_minimum(self):
        lhs = mx.array([0.0, -5, 10.0])
        rhs = mx.array([1.0, -7.0, 3.0])
        self.assertListEqual(mx.minimum(lhs, rhs).tolist(), [0, -7, 3])

        # NaN wins regardless of which side it is on.
        nan = mx.array([float("nan")])
        zero = mx.array([0.0])
        for first, second in ((nan, zero), (zero, nan)):
            self.assertTrue(math.isnan(mx.minimum(first, second).item()))

    def test_maximum(self):
        lhs = mx.array([0.0, -5, 10.0])
        rhs = mx.array([1.0, -7.0, 3.0])
        self.assertListEqual(mx.maximum(lhs, rhs).tolist(), [1, -5, 10])

        # NaN wins regardless of which side it is on.
        nan = mx.array([float("nan")])
        zero = mx.array([0.0])
        for first, second in ((nan, zero), (zero, nan)):
            self.assertTrue(math.isnan(mx.maximum(first, second).item()))

    def test_floor(self):
        values = mx.array([-22.03, 19.98, -27, 9, 0.0, -np.inf, np.inf])
        floored = mx.floor(values).tolist()
        self.assertListEqual(floored, [-23, 19, -27, 9, 0, -np.inf, np.inf])

        # Complex input is rejected.
        complex_values = mx.array([22 + 3j, 19 + 98j])
        with self.assertRaises(ValueError):
            mx.floor(complex_values)

    def test_ceil(self):
        values = mx.array([-22.03, 19.98, -27, 9, 0.0, -np.inf, np.inf])
        ceiled = mx.ceil(values).tolist()
        self.assertListEqual(ceiled, [-22, 20, -27, 9, 0, -np.inf, np.inf])

        # Complex input is rejected.
        complex_values = mx.array([22 + 3j, 19 + 98j])
        with self.assertRaises(ValueError):
            mx.ceil(complex_values)

    def test_isposinf(self):
        # Zero and negative infinity must not be reported as +inf.
        x = mx.array([0.0, float("-inf")])
        self.assertEqual(mx.isposinf(x).tolist(), [False, False])

        x = mx.array([0.0, float("-inf")]).astype(mx.float16)
        self.assertEqual(mx.isposinf(x).tolist(), [False, False])

        x = mx.array([0.0, float("-inf")]).astype(mx.bfloat16)
        self.assertEqual(mx.isposinf(x).tolist(), [False, False])

        x = mx.array([0.0, float("-inf")]).astype(mx.complex64)
        self.assertEqual(mx.isposinf(x).tolist(), [False, False])

        # Positive case: without it every expectation in this test is False,
        # so an implementation that always answers False would pass.
        x = mx.array([0.0, float("inf")])
        self.assertEqual(mx.isposinf(x).tolist(), [False, True])

        # Scalar case: 0 * inf.
        self.assertEqual(mx.isposinf(0 * mx.array(float("inf"))).tolist(), False)

        # Integer extremes are never infinite.
        x = mx.array([-2147483648, 0, 2147483647], dtype=mx.int32)
        result = mx.isposinf(x)
        self.assertEqual(result.tolist(), [False, False, False])

        x = mx.array([-32768, 0, 32767], dtype=mx.int16)
        result = mx.isposinf(x)
        self.assertEqual(result.tolist(), [False, False, False])

    def test_isneginf(self):
        values = [0.0, float("-inf")]
        want = [False, True]

        # Default dtype first, then each explicit cast.
        self.assertEqual(mx.isneginf(mx.array(values)).tolist(), want)
        for dtype in (mx.float16, mx.bfloat16, mx.complex64):
            cast = mx.array(values).astype(dtype)
            self.assertEqual(mx.isneginf(cast).tolist(), want)

        # Scalar case: 0 * inf.
        self.assertEqual(mx.isneginf(0 * mx.array(float("inf"))).tolist(), False)

        # Integer extremes are never infinite.
        int_cases = (
            ([-2147483648, 0, 2147483647], mx.int32),
            ([-32768, 0, 32767], mx.int16),
        )
        for limits, dtype in int_cases:
            flags = mx.isneginf(mx.array(limits, dtype=dtype))
            self.assertEqual(flags.tolist(), [False, False, False])

    def test_round(self):
        # float
        floats = mx.array(
            [0.5, -0.5, 1.5, -1.5, -21.03, 19.98, -27, 9, 0.0, -np.inf, np.inf]
        )
        want = [0, -0, 2, -2, -21, 20, -27, 9, 0, -np.inf, np.inf]
        self.assertListEqual(mx.round(floats).tolist(), want)

        # complex
        rounded = mx.round(mx.array([22.2 + 3.6j, 18.5 + 98.2j]))
        self.assertListEqual(rounded.tolist(), [22 + 4j, 18 + 98j])

        # decimals
        by_decimals = [
            mx.round(mx.array([15, 122], mx.int32), decimals=places)
            for places in (0, -1, -2)
        ]
        for rounded in by_decimals:
            self.assertEqual(rounded.dtype, mx.int32)
        int_wants = ([15, 122], [20, 120], [0, 100])
        for rounded, want in zip(by_decimals, int_wants):
            self.assertListEqual(rounded.tolist(), want)

        one_place = mx.round(mx.array([1.537, 1.471], mx.float32), decimals=1)
        two_places = mx.round(mx.array([1.537, 1.471], mx.float32), decimals=2)
        self.assertTrue(mx.allclose(one_place, mx.array([1.5, 1.5])))
        self.assertTrue(mx.allclose(two_places, mx.array([1.54, 1.47])))

        # check round to nearest for different types
        halves_want = [-4.0, -4.0, -2.0, -2.0, -0.0, 0.0, 2.0, 2.0, 4.0, 4.0]
        for dtype in (mx.bfloat16, mx.float16, mx.float32):
            halves = mx.round(mx.arange(10, dtype=dtype) - 4.5)
            self.assertEqual(halves.astype(mx.float32).tolist(), halves_want)

    def test_transpose_noargs(self):
        # Without explicit axes a 2x3 matrix becomes its 3x2 transpose.
        matrix = mx.array([[0, 1, 1], [1, 0, 0]])
        transposed = mx.transpose(matrix).tolist()
        self.assertListEqual(transposed, [[0, 1], [1, 0], [1, 0]])

    def test_transpose_axis(self):
        # Swap the last two axes of a (2, 3, 4) array.
        first = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
        second = [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
        cube = mx.array([first, second])

        want = [
            [[0, 4, 8], [1, 5, 9], [2, 6, 10], [3, 7, 11]],
            [[12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23]],
        ]
        swapped = mx.transpose(cube, axes=(0, 2, 1))
        self.assertListEqual(swapped.tolist(), want)

    def test_move_swap_axes(self):
        # Function and method forms must agree on the resulting shape.
        arr = mx.zeros((2, 3, 4))
        moved_shape = (3, 4, 2)
        swapped_shape = (4, 3, 2)

        self.assertEqual(mx.moveaxis(arr, 0, 2).shape, moved_shape)
        self.assertEqual(arr.moveaxis(0, 2).shape, moved_shape)
        self.assertEqual(mx.swapaxes(arr, 0, 2).shape, swapped_shape)
        self.assertEqual(arr.swapaxes(0, 2).shape, swapped_shape)

    def test_sum(self):
        def assert_sums_match(ref, arr, axes):
            # Shape and values of mx.sum must equal np.sum for every axis spec.
            for axis in axes:
                sum_npy = np.sum(ref, axis=axis)
                sum_mlx = np.asarray(mx.sum(arr, axis=axis))
                self.assertListEqual(list(sum_npy.shape), list(sum_mlx.shape))
                self.assertTrue(np.all(sum_npy == sum_mlx))

        small = mx.array([[1, 2], [3, 3]])
        self.assertEqual(mx.sum(small).item(), 9)
        kept = mx.sum(small, keepdims=True)
        self.assertEqual(kept, mx.array(9))
        self.assertEqual(kept.shape, (1, 1))

        self.assertEqual(mx.sum(small, axis=0).tolist(), [4, 5])
        self.assertEqual(mx.sum(small, axis=1).tolist(), [3, 6])

        # Dense 4-D input, single and multiple axes.
        dense_np = np.arange(3 * 5 * 4 * 7).astype(np.float32)
        dense_np = np.reshape(dense_np, (3, 5, 4, 7))
        dense_mx = mx.array(dense_np)
        assert_sums_match(
            dense_np,
            dense_mx,
            (None, 0, 1, 2, 3, (0, 1), (2, 3), (1, 2, 3)),
        )

        # Strided slice that is then broadcast.
        base_np = np.array([1.0, 2.0, 3.0, 4.0]).astype(np.float32)
        base_mx = mx.array(base_np)

        view_np = np.broadcast_to(base_np[0:4:2], (2, 2))
        view_mx = mx.broadcast_to(base_mx[0:4:2], (2, 2))
        assert_sums_match(view_np, view_mx, (None, 0, 1, (0, 1)))

        # 6-D input reduced over all axes but one.
        deep_np = (
            np.arange(3 * 2 * 3 * 3 * 3 * 3)
            .reshape(3, 2, 3, 3, 3, 3)
            .astype(np.float32)
        )
        deep_mx = mx.array(deep_np)
        reduce_axes = (0, 1, 3, 4, 5)

        got = deep_mx.sum(axis=reduce_axes)
        want = deep_np.sum(axis=reduce_axes)
        self.assertTrue(np.array_equal(got, want))

    def test_prod(self):
        grid = mx.array([[1, 2], [3, 3]])

        # Full reduction, with and without keepdims.
        self.assertEqual(mx.prod(grid).item(), 18)
        kept = mx.prod(grid, keepdims=True)
        self.assertEqual(kept, mx.array(18))
        self.assertEqual(kept.shape, (1, 1))

        # Per-axis reduction.
        for axis, want in ((0, [3, 6]), (1, [2, 9])):
            self.assertEqual(mx.prod(grid, axis=axis).tolist(), want)

    def test_min_and_max(self):
        grid = mx.array([[1, 2], [3, 4]])

        # Full reductions.
        self.assertEqual(mx.min(grid).item(), 1)
        self.assertEqual(mx.max(grid).item(), 4)

        # keepdims retains a (1, 1) shape.
        for reduce_op, extreme in ((mx.min, 1), (mx.max, 4)):
            kept = reduce_op(grid, keepdims=True)
            self.assertEqual(kept.shape, (1, 1))
            self.assertEqual(kept, mx.array(extreme))

        # Per-axis reductions.
        axis_cases = (
            (mx.min, 0, [1, 2]),
            (mx.min, 1, [1, 3]),
            (mx.max, 0, [3, 4]),
            (mx.max, 1, [2, 4]),
        )
        for reduce_op, axis, want in axis_cases:
            self.assertEqual(reduce_op(grid, axis=axis).tolist(), want)

    def test_argmin_argmax(self):
        data = np.random.rand(10, 12, 13)
        arr = mx.array(data)
        op_names = ("argmin", "argmax")

        # Along each axis, with and without keepdims.
        for name, axis, keep in product(op_names, range(3), (True, False)):
            got = getattr(mx, name)(arr, axis, keep)
            want = getattr(np, name)(data, axis, keepdims=keep)
            self.assertEqual(got.tolist(), want.tolist())

        # Over the flattened array.
        for name in op_names:
            got = getattr(mx, name)(arr, keepdims=True)
            want = getattr(np, name)(data, keepdims=True)
            self.assertEqual(got.tolist(), want.tolist())
            got = getattr(mx, name)(arr)
            want = getattr(np, name)(data)
            self.assertEqual(got.item(), want)

    def test_broadcast(self):
        def check(np_src, mx_src, shape):
            # mx.broadcast_to must match np.broadcast_to in shape and values.
            b_npy = np.broadcast_to(np_src, shape)
            b_mlx = mx.broadcast_to(mx_src, shape)
            self.assertListEqual(list(b_npy.shape), list(b_mlx.shape))
            self.assertTrue(np.array_equal(b_npy, b_mlx))

        src_np = np.reshape(np.arange(200), (10, 20))
        src_mx = mx.array(src_np)

        # New leading axis of size 30, then of size 1.
        check(src_np, src_mx, (30, 10, 20))
        check(src_np, src_mx, (1, 10, 20))

        # Python scalar source.
        check(1, 1, (10, 20))

    def test_logsumexp(self):
        # Reference: shift by the maximum, exponentiate, sum, take the log
        # and shift back.
        def logsumexp(x, axes=None):
            maxs = mx.max(x, axis=axes, keepdims=True)
            return mx.log(mx.sum(mx.exp(x - maxs), axis=axes, keepdims=True)) + maxs

        x = mx.array(
            [
                [1.0, 2.0],
                [3.0, 4.0],
            ]
        )
        self.assertTrue(math.isclose(mx.logsumexp(x).item(), logsumexp(x).item()))

        x = mx.random.uniform(shape=(1025,))
        self.assertTrue(mx.allclose(mx.logsumexp(x), logsumexp(x)))

        # Transposed
        x = mx.random.uniform(shape=(2, 2, 8))
        x = x.swapaxes(0, 1)
        self.assertTrue(mx.allclose(mx.logsumexp(x), logsumexp(x)))

        # Broadcast
        x = mx.broadcast_to(mx.random.uniform(shape=(2, 1, 8)), (2, 2, 8))
        self.assertTrue(mx.allclose(mx.logsumexp(x), logsumexp(x)))

        # Large
        # The (1025,) input used to be overwritten by a copy of the broadcast
        # case before it was checked, so this section only repeated the
        # broadcast check.
        x = mx.random.uniform(shape=(1025,))
        self.assertTrue(mx.allclose(mx.logsumexp(x), logsumexp(x)))

    def test_mean(self):
        grid = mx.array([[1, 2], [3, 4]])

        # Full reduction, with and without keepdims.
        self.assertEqual(mx.mean(grid).item(), 2.5)
        kept = mx.mean(grid, keepdims=True)
        self.assertEqual(kept, mx.array(2.5))
        self.assertEqual(kept.shape, (1, 1))

        # Per-axis reduction.
        for axis, want in ((0, [2, 3]), (1, [1.5, 3.5])):
            self.assertEqual(mx.mean(grid, axis=axis).tolist(), want)

    def test_median(self):
        # Empty input is rejected.
        empty = mx.array([])
        with self.assertRaises(ValueError):
            mx.median(empty, axis=0)

        # Out-of-range and repeated axes are rejected.
        odd = mx.array([0, 1, 2, 3, 4])
        for bad_axes in ((0, 1), (0, 0)):
            with self.assertRaises(ValueError):
                mx.median(odd, axis=bad_axes)

        # Odd length: the middle element.
        mid = mx.median(odd)
        self.assertEqual(mid.shape, ())
        self.assertEqual(mid.item(), 2)
        mid = mx.median(odd, keepdims=True)
        self.assertEqual(mid.shape, (1,))

        # Even length: average of the two middle elements.
        even = mx.array([0, 1, 2, 3, 4, 5])
        self.assertEqual(mx.median(even).item(), 2.5)

        # Multi-axis reductions against NumPy.
        samples = mx.random.normal((5, 5, 5, 5))
        for axes in ((0, 2), (1, 3), (0, 1, 3)):
            got = mx.median(samples, axis=axes, keepdims=True)
            want = np.median(samples, axis=axes, keepdims=True)
            self.assertTrue(np.allclose(got, want))

    def test_var(self):
        grid = mx.array([[1, 2], [3, 4]])

        # Full reduction, with and without keepdims.
        self.assertEqual(mx.var(grid).item(), 1.25)
        kept = mx.var(grid, keepdims=True)
        self.assertEqual(kept, mx.array(1.25))
        self.assertEqual(kept.shape, (1, 1))

        # Per-axis reduction.
        for axis, want in ((0, [1.0, 1.0]), (1, [0.25, 0.25])):
            self.assertEqual(mx.var(grid, axis=axis).tolist(), want)

        # ddof equal to or above the element count yields inf.
        for ddof in (2, 3):
            pair = mx.array([1.0, 2.0])
            spread = mx.var(pair, ddof=ddof)
            self.assertEqual(spread.item(), float("inf"))

    def test_std(self):
        # Compare with NumPy's std on the same random sample.
        sample = mx.random.uniform(shape=(5, 5))
        sample_np = np.array(sample)
        got = mx.std(sample).item()
        want = sample_np.std().item()
        self.assertAlmostEqual(got, want, places=6)

    def test_abs(self):
        values = mx.array([-1.0, 1.0, -2.0, 3.0])
        got = mx.abs(values)
        want = np.abs(values, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

        # Method form and the abs() builtin agree.
        self.assertTrue(np.allclose(values.abs(), abs(values)))

    def test_negative(self):
        # Element-wise negation against NumPy.
        values = mx.array([-1.0, 1.0, -2.0, 3.0])
        got = mx.negative(values)
        want = np.negative(values, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_sign(self):
        # Real input.
        reals = mx.array([-1.0, 1.0, 0.0, -2.0, 3.0])
        got = mx.sign(reals)
        want = np.sign(reals, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

        # Complex input.
        real_part = mx.array([-1.0, 1.0, 0.0, -2.0, 3.0])
        imag_part = mx.array([-4.0, -3.0, 1.0, 0.0, 3.0])
        z = real_part + imag_part * 1j
        got = mx.sign(z)
        # np.sign differs in NumPy 1 and 2 so
        # we manually implement the NumPy 2 version here.
        want = z / np.abs(z)
        self.assertTrue(np.allclose(got, want))

    def test_logical_not(self):
        # Float input: compared against NumPy's logical_not.
        values = mx.array([-1.0, 1.0, 0.0, 1.0, -2.0, 3.0])
        got = mx.logical_not(values)
        want = np.logical_not(values)
        self.assertTrue(np.array_equal(got, want))

    def test_logical_and(self):
        # All four truth-table rows.
        lhs = mx.array([True, False, True, False])
        rhs = mx.array([True, True, False, False])
        want = np.logical_and(lhs, rhs)

        got = mx.logical_and(lhs, rhs)
        self.assertTrue(np.array_equal(got, want))

        # test overloaded operator
        got = lhs & rhs
        self.assertTrue(np.array_equal(got, want))

    def test_logical_or(self):
        # All four truth-table rows.
        lhs = mx.array([True, False, True, False])
        rhs = mx.array([True, True, False, False])
        want = np.logical_or(lhs, rhs)

        got = mx.logical_or(lhs, rhs)
        self.assertTrue(np.array_equal(got, want))

        # test overloaded operator
        got = lhs | rhs
        self.assertTrue(np.array_equal(got, want))

    def test_square(self):
        # Element-wise square against NumPy.
        values = mx.array([0.1, 0.5, 1.0, 10.0])
        got = mx.square(values)
        want = np.square(values, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_sqrt(self):
        # Element-wise square root against NumPy.
        values = mx.array([0.1, 0.5, 1.0, 10.0])
        got = mx.sqrt(values)
        want = np.sqrt(values, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_rsqrt(self):
        # Reciprocal square root; the reference is 1 / np.sqrt.
        values = mx.array([0.1, 0.5, 1.0, 10.0])
        got = mx.rsqrt(values)
        want = 1.0 / np.sqrt(values, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_reciprocal(self):
        # Element-wise reciprocal against NumPy.
        values = mx.array([0.1, 0.5, 1.0, 2.0])
        got = mx.reciprocal(values)
        want = np.reciprocal(values, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_logaddexp(self):
        # Real inputs against np.logaddexp.
        lhs = mx.array([0, 1, 2, 9.0])
        rhs = mx.array([1, 0, 4, 2.5])
        got = mx.logaddexp(lhs, rhs)
        want = np.logaddexp(lhs, rhs, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

        # Complex test
        lhs = mx.array([0, 1, 2, 9.0]) + 1j
        rhs = mx.array([1, 0, 4, 2.5]) + 1j
        got = mx.logaddexp(lhs, rhs)
        want = np_logaddexp(np.array(lhs), np.array(rhs))
        self.assertTrue(np.allclose(got, want))

        # A NaN operand gives a NaN result.
        nan = mx.array([float("nan")])
        zero = mx.array([0.0])
        self.assertTrue(math.isnan(mx.logaddexp(nan, zero).item()))

    def test_log(self):
        # Real input.
        reals = mx.array([1, 0.5, 10, 100])
        got = mx.log(reals)
        want = np.log(reals, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

        # Complex input (1 + 2j).
        z = mx.array(1.0) + 1j * mx.array(2.0)
        got = mx.log(z)
        want = np.log(np.array(z))
        self.assertTrue(np.allclose(got, want))

    def test_log2(self):
        # Real input.
        reals = mx.array([0.5, 1, 2, 10, 16])
        got = mx.log2(reals)
        want = np.log2(reals, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

        # Complex input (1 + 2j).
        z = mx.array(1.0) + 1j * mx.array(2.0)
        got = mx.log2(z)
        want = np.log2(np.array(z))
        self.assertTrue(np.allclose(got, want))

    def test_log10(self):
        # Real input.
        reals = mx.array([0.1, 1, 10, 20, 100])
        got = mx.log10(reals)
        want = np.log10(reals, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

        # Complex input (1 + 2j).
        z = mx.array(1.0) + 1j * mx.array(2.0)
        got = mx.log10(z)
        want = np.log10(np.array(z))
        self.assertTrue(np.allclose(got, want))

    def test_exp(self):
        # Element-wise exponential against NumPy.
        values = mx.array([0, 0.5, -0.5, 5])
        got = mx.exp(values)
        want = np.exp(values, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_expm1(self):
        a = mx.array([-88, -87, 0, 0.5, -0.5, 5, 87, 88, 89, 90])
        result = mx.expm1(a)
        # Silence NumPy's overflow reporting only while the reference is
        # computed. The context manager restores the previous setting even if
        # np.expm1 raises, which the manual seterr save/restore did not.
        with np.errstate(over="ignore"):
            expected = np.expm1(a)
        self.assertTrue(np.allclose(result, expected, rtol=1e-3, atol=1e-4))

    def test_erf(self):
        # The reference values come from math.erf, one point at a time.
        points = [-5, 0.0, 0.5, 1.0, 2.0, 10.0]
        arr = mx.array(points)
        want = np.array([math.erf(p) for p in points])
        self.assertTrue(np.allclose(mx.erf(arr), want))

    def test_erfinv(self):
        # Inputs cover out-of-domain values (NaN), the domain edges (+/-inf),
        # zero, and one interior point on each side of zero. The list used to
        # contain 0.5 twice, which left the negative interior untested.
        inputs = [-5.0, -1.0, -0.5, 0.0, 0.5, 1.0, 5.0]
        x = mx.array(inputs)
        # Output of:
        # scipy.special.erfinv([-5.0, -1.0, -0.5, 0.0, 0.5, 1.0, 5.0])
        expected = np.array(
            [
                float("nan"),
                -float("inf"),
                -0.47693628,
                0.0,
                0.47693628,
                float("inf"),
                float("nan"),
            ]
        ).astype(np.float32)
        self.assertTrue(np.allclose(mx.erfinv(x), expected, equal_nan=True))

        # The same near-1 value repeated 8 times must give the same result
        # in every slot.
        result = mx.erfinv(mx.array([0.9999999403953552] * 8))
        expected = mx.array([3.8325066566467285] * 8)
        self.assertTrue(mx.allclose(result, expected))

    def test_sin(self):
        # Sine at a handful of multiples of pi/4, compared with NumPy.
        quarter = math.pi / 4
        angles = mx.array(
            [0, quarter, math.pi / 2, math.pi, 3 * math.pi / 4, 2 * math.pi]
        )
        got = mx.sin(angles)
        want = np.sin(angles, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_cos(self):
        # Cosine at a handful of multiples of pi/4, compared with NumPy.
        quarter = math.pi / 4
        angles = mx.array(
            [0, quarter, math.pi / 2, math.pi, 3 * math.pi / 4, 2 * math.pi]
        )
        got = mx.cos(angles)
        want = np.cos(angles, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_degrees(self):
        # Radian-to-degree conversion must match np.degrees.
        quarter = math.pi / 4
        radians_in = mx.array(
            [0, quarter, math.pi / 2, math.pi, 3 * math.pi / 4, 2 * math.pi]
        )
        got = mx.degrees(radians_in)
        want = np.degrees(radians_in, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_radians(self):
        # Degree-to-radian conversion must match np.radians.
        degrees_in = mx.array([0.0, 45.0, 90.0, 180.0, 270.0, 360.0])
        got = mx.radians(degrees_in)
        want = np.radians(degrees_in, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

    def test_log1p(self):
        # Real inputs, compared with NumPy in float32.
        reals = mx.array([1, 0.5, 10, 100])
        got = mx.log1p(reals)
        want = np.log1p(reals, dtype=np.float32)
        self.assertTrue(np.allclose(got, want))

        # Same magnitudes with a unit imaginary part, compared in complex64.
        complexes = mx.array([1, 0.5, 10, 100]) + 1j
        got = mx.log1p(complexes)
        want = np.log1p(complexes, dtype=np.complex64)
        self.assertTrue(np.allclose(got, want))

    def test_sigmoid(self):
        # Reference: the textbook formula 1 / (1 + exp(-x)) evaluated by NumPy.
        values = mx.array([0.0, 1.0, -1.0, 5.0, -5.0])
        got = mx.sigmoid(values)
        want = 1 / (1 + np.exp(-values, dtype=np.float32))
        self.assertTrue(np.allclose(got, want))

        # Low precision: in float16 the output must not collapse to exactly
        # 0.0 at -8 or exactly 1.0 at +8.
        low = mx.array(-8.0).astype(mx.float16)
        self.assertNotEqual(mx.sigmoid(low).item(), 0.0)
        high = mx.array(8.0).astype(mx.float16)
        self.assertNotEqual(mx.sigmoid(high).item(), 1.0)

    def test_allclose(self):
        # Identical scalars are close under the default tolerances.
        one = mx.array(1.0)
        same = mx.array(1.0)
        self.assertTrue(mx.allclose(one, same).item())

        # A gap of 0.1 fails by default; the extra positional arguments
        # (presumably rtol then atol -- confirm against the mx.allclose
        # signature) decide whether it is accepted.
        near = mx.array(1.1)
        self.assertFalse(mx.allclose(one, near).item())
        tolerance_cases = (
            ((0.1,), True),
            ((0.01,), False),
            ((0.01, 0.1), True),
        )
        for tolerances, should_match in tolerance_cases:
            check = self.assertTrue if should_match else self.assertFalse
            check(mx.allclose(one, near, *tolerances).item())

        # Infinity compared with itself counts as close.
        inf = mx.array(float("inf"))
        self.assertTrue(mx.allclose(inf, inf).item())

    def test_isclose(self):
        # Infinities only match when their signs agree.
        pos_inf = float("inf")
        neg_inf = float("-inf")
        lhs = mx.array([pos_inf, pos_inf, neg_inf])
        rhs = mx.array([pos_inf, neg_inf, neg_inf])
        self.assertListEqual(mx.isclose(lhs, rhs).tolist(), [True, False, True])

        # NaN is not close to itself unless equal_nan is requested.
        nan_arr = mx.array([np.nan])
        self.assertListEqual(mx.isclose(nan_arr, nan_arr).tolist(), [False])

        nan_arr = mx.array([np.nan])
        self.assertListEqual(
            mx.isclose(nan_arr, nan_arr, equal_nan=True).tolist(), [True]
        )

    def test_all(self):
        # 2x2 boolean matrix with a single False in the top-right corner.
        flags = mx.array([[True, False], [True, True]])

        # Full reductions
        self.assertFalse(mx.all(flags).item())
        self.assertEqual(mx.all(flags, keepdims=True).shape, (1, 1))
        self.assertFalse(mx.all(flags, axis=[0, 1]).item())

        # Per-axis reductions, with the axis given as a list or a bare int
        per_axis = (
            ([0], [True, False]),
            ([1], [False, True]),
            (0, [True, False]),
            (1, [False, True]),
        )
        for axis, want in per_axis:
            self.assertEqual(mx.all(flags, axis=axis).tolist(), want)

    def test_any(self):
        # 2x2 boolean matrix with a single True in the top-left corner.
        flags = mx.array([[True, False], [False, False]])

        # Full reductions
        self.assertTrue(mx.any(flags).item())
        self.assertEqual(mx.any(flags, keepdims=True).shape, (1, 1))
        self.assertTrue(mx.any(flags, axis=[0, 1]).item())

        # Per-axis reductions, with the axis given as a list or a bare int
        per_axis = (
            ([0], [True, False]),
            ([1], [True, False]),
            (0, [True, False]),
            (1, [True, False]),
        )
        for axis, want in per_axis:
            self.assertEqual(mx.any(flags, axis=axis).tolist(), want)

    def test_stop_gradient(self):
        # Only the `2 * x` term should contribute to the gradient; the
        # `3 * x` term is wrapped in stop_gradient.
        def loss(v):
            live = 2 * v
            blocked = mx.stop_gradient(3 * v)
            return mx.sum(live + blocked)

        point = mx.array([0.0, 0.1, -3])
        gradient = mx.grad(loss)(point).tolist()
        self.assertListEqual(gradient, [2, 2, 2])

    def test_kron(self):
        # Basic vector test
        left = mx.array([1, 2])
        right = mx.array([3, 4])
        self.assertEqual(mx.kron(left, right).tolist(), [3, 4, 6, 8])

        # Basic matrix test
        left = mx.array([[1, 2], [3, 4]])
        right = mx.array([[0, 5], [6, 7]])
        want = [
            [0, 5, 0, 10],
            [6, 7, 12, 14],
            [0, 15, 0, 20],
            [18, 21, 24, 28],
        ]
        self.assertEqual(mx.kron(left, right).tolist(), want)

        # Operands of different rank: (2,) with (2, 2)
        left = mx.array([1, 2])
        right = mx.array([[3, 4], [5, 6]])
        self.assertEqual(
            mx.kron(left, right).tolist(), [[3, 4, 6, 8], [5, 6, 10, 12]]
        )

        # An empty operand is rejected
        empty = mx.array([])
        other = mx.array([1, 2])
        with self.assertRaises(ValueError):
            mx.kron(empty, other)

    def test_take(self):
        # mx.take is validated against np.take for flat and per-axis lookups,
        # then checked for scalar indices and zero-sized inputs.

        # Shape: 4 x 3 x 2
        nested = [
            [[1, 3], [-2, -2], [-3, -2]],
            [[2, 4], [-3, 2], [-4, -2]],
            [[2, 3], [2, 4], [2, 1]],
            [[1, -5], [3, -1], [2, 3]],
        ]

        a = mx.array(nested)
        a_npy = np.array(nested)

        # Flattened lookup (no axis given)
        picks = [0, -1]
        got = mx.take(a, mx.array(picks)).tolist()
        want = np.take(a_npy, np.array(picks)).tolist()
        self.assertListEqual(got, want)

        # Lookups along a single axis, with negative indices and a negative axis
        for picks, axis in (([-1, 2, 0], 0), ([0, 0, -2], 1), ([0, -1, -1], -1)):
            got = mx.take(a, mx.array(picks), axis=axis).tolist()
            want = np.take(a_npy, np.array(picks), axis=axis).tolist()
            self.assertListEqual(got, want)

        # A 2 x 3 index array into an 8 x 8 x 8 cube
        a_npy = np.arange(8 * 8 * 8, dtype=np.int32).reshape((8, 8, 8))
        idx_npy = np.arange(6, dtype=np.uint32).reshape((2, 3))
        a_mlx = mx.array(a_npy)
        idx_mlx = mx.array(idx_npy)

        taken_np = np.take(a_npy, idx_npy)
        taken_mx = mx.take(a_mlx, idx_mlx)
        self.assertEqual(taken_np.shape, taken_mx.shape)
        self.assertListEqual(taken_np.tolist(), taken_mx.tolist())

        for axis in (0, 1, 2):
            taken_np = np.take(a_npy, idx_npy, axis=axis)
            taken_mx = mx.take(a_mlx, idx_mlx, axis=axis)
            self.assertEqual(taken_np.shape, taken_mx.shape)
            self.assertListEqual(taken_np.tolist(), taken_mx.tolist())

        # Take with integer index
        grid = mx.arange(8).reshape(2, 4)
        row = mx.take(grid, 1, axis=0)
        self.assertTrue(mx.array_equal(row, mx.array([4, 5, 6, 7])))
        column = mx.take(grid, 1, axis=1)
        self.assertTrue(mx.array_equal(column, mx.array([1, 5])))

        # The index array's dimensions replace the taken axis in the output
        index_shapes = (
            (mx.array(1), (4,)),
            (mx.array([1]), (1, 4)),
            (mx.array([[1]]), (1, 1, 4)),
        )
        for index, want_shape in index_shapes:
            self.assertEqual(mx.take(grid, index, axis=0).shape, want_shape)

        # Take from empty array works in some cases
        hollow = mx.zeros((4, 0))
        out = mx.take(hollow, mx.array([1, 2]), axis=0)
        self.assertEqual(out.shape, (2, 0))
        self.assertEqual(out.dtype, hollow.dtype)
        with self.assertRaises(ValueError):
            mx.take(hollow, mx.array([[1]]), axis=1)

    def test_take_along_axis(self):
        # mx.take_along_axis is compared with np.take_along_axis on a 2x2x2
        # array, once flattened (axis=None) and once per axis.
        a_np = np.arange(8).reshape(2, 2, 2)
        a_mlx = mx.array(a_np)
        idx_np = np.array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0])
        idx_mlx = mx.array(idx_np)

        for ax in [None, 0, 1, 2]:
            # Identity check is the idiomatic (and unambiguous) None test.
            if ax is None:
                shape = [-1]
            else:
                # The 12 indices are laid out as 2x2x2 with the taken axis
                # stretched to length 3.
                shape = [2] * 3
                shape[ax] = 3
            out_np = np.take_along_axis(a_np, idx_np.reshape(shape), axis=ax)
            out_mlx = mx.take_along_axis(a_mlx, mx.reshape(idx_mlx, shape), axis=ax)
            self.assertTrue(np.array_equal(out_np, np.array(out_mlx)))

    def test_put_along_axis(self):
        # mx.put_along_axis is compared with np.put_along_axis (which updates
        # its first argument in place) for the flattened case and for each
        # axis, then for a broadcast scalar update and an empty input.
        for ax in [None, 0, 1, 2]:
            a_np = np.arange(16).reshape(2, 2, 4).astype(np.int32)
            a_mlx = mx.array(a_np)

            if ax is None:
                idx_np = np.random.permutation(a_np.size)
                values_np = np.random.randint(low=0, high=100, size=(16,))
            else:
                shape = list(a_np.shape)
                shape[ax] = 2
                # Two distinct positions along `ax`, broadcast over the
                # other dimensions.
                idx_np = np.random.choice(a_np.shape[ax], replace=False, size=(2,))
                idx_np = np.expand_dims(idx_np, list(range(1, 2 - ax + 1)))
                idx_np = np.broadcast_to(idx_np, shape)
                values_np = np.random.randint(low=0, high=100, size=shape)

            # astype returns a new array; the results used to be discarded,
            # so the intended casts never took effect.
            idx_np = idx_np.astype(np.int32)
            values_np = values_np.astype(a_np.dtype)

            idx_mlx = mx.array(idx_np)
            values_mlx = mx.array(values_np)

            np.put_along_axis(a_np, idx_np, values_np, axis=ax)
            out_mlx = mx.put_along_axis(a_mlx, idx_mlx, values_mlx, axis=ax)
            self.assertTrue(np.array_equal(a_np, out_mlx))

        # Scalar update written through indices along a negative axis
        source = mx.zeros((1, 1, 8, 32))
        indices = mx.array([0, 2, 4, 5]).reshape((1, 1, 4, 1))
        update = mx.array(1.0)

        out_mlx = mx.put_along_axis(source, indices, update, axis=-2)
        out_np = np.array(source)
        np.put_along_axis(out_np, np.array(indices), np.array(update), axis=-2)
        self.assertTrue(np.array_equal(out_np, np.array(out_mlx)))

        # Empty input, indices and values must evaluate to an empty result
        a = mx.array([], mx.float32)
        b = mx.put_along_axis(a, a, a, axis=None)
        mx.eval(b)
        self.assertEqual(b.size, 0)
        self.assertEqual(b.shape, a.shape)

    def test_split(self):
        # Three single-element pieces from a 1-D array
        vec = mx.array([1, 2, 3])
        for position, piece in enumerate(mx.split(vec, 3)):
            self.assertEqual(piece.item(), position + 1)

        # Row-wise split of a 3 x 2 matrix
        mat = mx.array([[1, 2], [3, 4], [5, 6]])
        rows = mx.split(mat, 3, axis=0)
        top, middle, bottom = rows
        self.assertEqual(top.tolist(), [[1, 2]])
        self.assertEqual(middle.tolist(), [[3, 4]])
        self.assertEqual(bottom.tolist(), [[5, 6]])

        # Axis 2 does not exist on a 2-D array
        with self.assertRaises(ValueError):
            mx.split(mat, 3, axis=2)

        # Split at explicit boundaries rather than into equal parts
        seq = mx.arange(8)
        head, body, tail = mx.split(seq, [1, 5])
        self.assertEqual(head.tolist(), [0])
        self.assertEqual(body.tolist(), [1, 2, 3, 4])
        self.assertEqual(tail.tolist(), [5, 6, 7])

    def test_split_invalid_num_splits(self):
        """Regression: split with num_splits <= 0 should raise, not crash."""
        flat = mx.arange(6)

        # num_splits = 0 was UB via divide-by-zero; num_splits = -1 was a
        # SIGBUS via a huge allocation. Both must now raise cleanly.
        for bad_count in (0, -1):
            with self.assertRaises(ValueError):
                mx.split(flat, bad_count)

        # Also check with explicit axis
        grid = mx.zeros((4, 6))
        for bad_count, axis in ((0, 1), (-2, 0)):
            with self.assertRaises(ValueError):
                mx.split(grid, bad_count, axis=axis)

    def test_arange_overload_dispatch(self):
        nan = float("nan")
        inf = float("inf")

        # Non-finite start/stop/step combinations that must be rejected
        rejected = [
            (nan, 1, 5),
            (0, nan, 5),
            (0, 2, nan),
            (0, inf, inf),
            (inf, 1, inf),
            (inf, 1, 5),
        ]
        for bad_args in rejected:
            with self.assertRaises(ValueError):
                mx.arange(*bad_args)

        # A stop value one past INT_MAX is a TypeError rather than a ValueError
        with self.assertRaises(TypeError):
            int_max = 2147483647
            mx.arange(0, int_max + 1, 1)

        # Positional and keyword call forms
        self.assertListEqual(mx.arange(5).tolist(), [0, 1, 2, 3, 4])
        self.assertListEqual(mx.arange(1, 5).tolist(), [1, 2, 3, 4])
        self.assertListEqual(mx.arange(-3, step=-1).tolist(), [0, -1, -2])
        self.assertListEqual(
            mx.arange(stop=2, step=0.5).tolist(), [0, 0.5, 1.0, 1.5]
        )

        # start and step without a stop is not a valid call
        with self.assertRaises(TypeError):
            mx.arange(start=1, step=2)

        self.assertListEqual(mx.arange(stop=3).tolist(), [0, 1, 2])

    def test_arange_inferred_dtype(self):
        # Each row: positional args, keyword args, dtype the result must have.
        cases = [
            ((5,), {}, mx.int32),
            ((5.0,), {}, mx.float32),
            ((1, 3.0), {}, mx.float32),
            ((1, 3), {"dtype": mx.float32}, mx.float32),
            ((1, 5, 1), {}, mx.int32),
            ((1.0, 5, 1), {}, mx.float32),
            ((1, 5.0, 1), {}, mx.float32),
            ((1, 5, 1.0), {}, mx.float32),
            ((1.0, 3.0, 0.2), {"dtype": mx.int32}, mx.int32),
        ]
        for args, kwargs, want_dtype in cases:
            produced = mx.arange(*args, **kwargs)
            self.assertEqual(produced.dtype, want_dtype)

    def test_arange_corner_cases_cast(self):
        # Fractional steps with an integer dtype: the expected values show
        # the step collapsing to 0 (or to -1 for -1.2) while the element
        # count still follows the float range.
        out = mx.arange(0, 3, 0.2, dtype=mx.int32)
        self.assertListEqual(out.tolist(), [0] * 15)
        self.assertEqual(out.dtype, mx.int32)

        out = mx.arange(-1, -4, -0.9, dtype=mx.int32)
        self.assertListEqual(out.tolist(), [-1] * 4)
        self.assertEqual(out.dtype, mx.int32)

        out = mx.arange(-1, -20, -1.2, dtype=mx.int32)
        # -1, -2, ..., -16
        self.assertListEqual(out.tolist(), list(range(-1, -17, -1)))
        self.assertEqual(out.dtype, mx.int32)

        # Step larger than the whole range yields just the start value
        out = mx.arange(0, 10, 100)
        self.assertListEqual(out.tolist(), [0])
        self.assertEqual(out.dtype, mx.int32)

        # Step pointing away from stop yields nothing
        out = mx.arange(10, 0, 1)
        self.assertListEqual(out.tolist(), [])

        inf = float("inf")
        out = mx.arange(10, 0, inf)
        self.assertListEqual(out.tolist(), [])

        # Infinite step towards stop yields just the start value
        out = mx.arange(0, 10, inf)
        self.assertListEqual(out.tolist(), [0])

        out = mx.arange(0, -10, float("-inf"))
        self.assertListEqual(out.tolist(), [0])

    def test_hanning_general(self):
        # Ten-point window must match NumPy's.
        window = mx.hanning(10)
        self.assertTrue(np.allclose(window, np.hanning(10), atol=1e-5))

        # A single-point window is exactly 1.
        single = mx.hanning(1)
        self.assertEqual(single.item(), 1.0)

        # A zero-length window is empty but still float32.
        empty = mx.hanning(0)
        self.assertEqual(empty.size, 0)
        self.assertEqual(empty.dtype, mx.float32)

    def test_hamming_general(self):
        # Ten-point window must match NumPy's.
        window = mx.hamming(10)
        self.assertTrue(np.allclose(window, np.hamming(10), atol=1e-5))

        # A single-point window is exactly 1.
        single = mx.hamming(1)
        self.assertEqual(single.item(), 1.0)

        # A zero-length window is empty but still float32.
        empty = mx.hamming(0)
        self.assertEqual(empty.size, 0)
        self.assertEqual(empty.dtype, mx.float32)

    def test_bartlett_general(self):
        # Ten-point window must match NumPy's.
        window = mx.bartlett(10)
        self.assertTrue(np.allclose(window, np.bartlett(10), atol=1e-5))

        # A single-point window is exactly 1.
        single = mx.bartlett(1)
        self.assertEqual(single.item(), 1.0)

        # A zero-length window is empty but still float32.
        empty = mx.bartlett(0)
        self.assertEqual(empty.size, 0)
        self.assertEqual(empty.dtype, mx.float32)

    def test_blackman_general(self):
        # Ten-point window must match NumPy's.
        window = mx.blackman(10)
        self.assertTrue(np.allclose(window, np.blackman(10), atol=1e-5))

        # A single-point window is exactly 1.
        single = mx.blackman(1)
        self.assertEqual(single.item(), 1.0)

        # A zero-length window is empty but still float32.
        empty = mx.blackman(0)
        self.assertEqual(empty.size, 0)
        self.assertEqual(empty.dtype, mx.float32)

    def test_unary_ops(self):
        # Element-wise unary ops are compared with their NumPy namesakes on
        # random data, in float16 and float32.
        def compare(np_fn, mx_fn, np_in, mx_in, atol, rtol):
            want = np_fn(np_in)
            got = mx_fn(mx_in)
            mx.eval(got)
            self.assertTrue(np.allclose(want, got, atol=atol, rtol=rtol))

        data = np.random.rand(18, 28, 38)
        # (dtype name, atol, rtol)
        tolerances = [("float16", 1e-3, 1e-3), ("float32", 1e-6, 1e-5)]
        for op in ["abs", "exp", "log", "square", "sqrt"]:
            with self.subTest(op=op):
                for dtype, atol, rtol in tolerances:
                    with self.subTest(dtype=dtype):
                        np_in = data.astype(getattr(np, dtype))
                        mx_in = mx.array(np_in)
                        compare(
                            getattr(np, op), getattr(mx, op), np_in, mx_in, atol, rtol
                        )

    def test_unary_ops_from_non_array(self):
        # Unary ops must accept a Python scalar or a NumPy array directly,
        # without the caller wrapping it in an mx.array first.
        op_names = (
            "abs", "exp", "log", "square", "sqrt",
            "sin", "cos", "tan",
            "sinh", "cosh", "tanh",
            "sign", "negative", "expm1",
            "arcsin", "arccos", "arctan",
            "arcsinh", "arctanh",
            "degrees", "radians",
            "log2", "log10", "log1p",
            "floor", "ceil", "conjugate",
        )

        scalar = 0.5
        vector = np.random.rand(10).astype(np.float32)
        for op in op_names:
            with self.subTest(op=op):
                np_fn = getattr(np, op)

                # Test from scalar
                want = np_fn(scalar)
                got = getattr(mx, op)(scalar)
                self.assertTrue(np.allclose(want, got, equal_nan=True))

                # Test from NumPy
                want = np_fn(vector)
                got = getattr(mx, op)(vector)
                self.assertTrue(np.allclose(want, np.array(got), equal_nan=True))

    def test_trig_ops(self):
        # Forward values and VJPs of the trigonometric / hyperbolic functions
        # and their "arc" inverses are compared against NumPy.
        def compare(np_fn, mx_fn, np_in, mx_in, atol, rtol):
            want = np_fn(np_in)
            got = mx_fn(mx_in)
            mx.eval(got)

            self.assertTrue(
                np.allclose(want, got, atol=atol, rtol=rtol, equal_nan=True)
            )

        x = np.random.rand(9, 12, 18)
        xi = np.random.rand(9, 12, 18)
        forward_ops = ["sin", "cos", "tan"] + ["sinh", "cosh", "tanh"]
        # (dtype name, atol, rtol)
        real_tolerances = [("float16", 1e-3, 1e-3), ("float32", 1e-6, 1e-5)]

        for op in forward_ops:
            inverse = "arc" + op

            # Forward op on real inputs
            with self.subTest(op=op):
                for dtype, atol, rtol in real_tolerances:
                    with self.subTest(dtype=dtype):
                        np_in = x.astype(getattr(np, dtype))
                        mx_in = mx.array(np_in)
                        compare(
                            getattr(np, op), getattr(mx, op), np_in, mx_in, atol, rtol
                        )

            # Forward op on complex inputs
            with self.subTest(op=op):
                dtype = "complex64"
                with self.subTest(dtype=dtype):
                    np_in = (x + 1.0j * xi).astype(getattr(np, dtype))
                    mx_in = mx.array(np_in)
                    compare(getattr(np, op), getattr(mx, op), np_in, mx_in, 1e-5, 1e-5)

            # Inverse op, fed with the forward op's output
            with self.subTest(op=inverse):
                for dtype, atol, rtol in real_tolerances:
                    with self.subTest(dtype=dtype):
                        np_in = getattr(np, op)(x).astype(getattr(np, dtype))
                        mx_in = mx.array(np_in)
                        compare(
                            getattr(np, inverse),
                            getattr(mx, inverse),
                            np_in,
                            mx_in,
                            atol,
                            rtol,
                        )

        # Test grads
        # Closed-form VJPs used as the NumPy reference.
        np_vjp_funcs = {
            "sin": lambda primal, cotan: cotan * np.cos(primal),
            "cos": lambda primal, cotan: -cotan * np.sin(primal),
            "tan": lambda primal, cotan: cotan / (np.cos(primal) ** 2),
            "sinh": lambda primal, cotan: cotan * np.cosh(primal),
            "cosh": lambda primal, cotan: cotan * np.sinh(primal),
            "tanh": lambda primal, cotan: cotan / (np.cosh(primal) ** 2),
            "arcsin": lambda primal, cotan: cotan / np.sqrt(1.0 - primal**2),
            "arccos": lambda primal, cotan: -cotan / np.sqrt(1.0 - primal**2),
            "arctan": lambda primal, cotan: cotan / (1.0 + primal**2),
            "arctan2": lambda primal, cotan: cotan / (1.0 + primal**2),
            "arcsinh": lambda primal, cotan: cotan / np.sqrt(primal**2 + 1),
            "arccosh": lambda primal, cotan: cotan / np.sqrt(primal**2 - 1),
            "arctanh": lambda primal, cotan: cotan / (1.0 - primal**2),
        }

        def check_vjp(name, primal_np):
            # `x` plays the role of the cotangent on both sides.
            primal_mx = mx.array(primal_np)
            cotan_np = x.astype(np.float32)
            cotan_mx = mx.array(cotan_np)

            def np_vjp(cotan):
                return np_vjp_funcs[name](primal_np, cotan)

            def mx_vjp(cotan):
                return mx.vjp(getattr(mx, name), [primal_mx], [cotan])[1][0]

            compare(np_vjp, mx_vjp, cotan_np, cotan_mx, 1e-5, 1e-5)

        with self.subTest(name="grads"):
            for op in forward_ops:
                with self.subTest(op=op):
                    check_vjp(op, xi.astype(np.float32))

                with self.subTest(op="arc" + op):
                    primal_np = getattr(np, op)(xi).astype(np.float32)

                    # To avoid divide by zero error
                    near_one = np.isclose(primal_np, 1.0)
                    if op == "cosh":
                        primal_np[near_one] += 1e-3
                    elif op == "cos":
                        primal_np[near_one] -= 1e-3

                    check_vjp("arc" + op, primal_np)

    def test_binary_ops(self):
        # Binary ops are compared with NumPy over many dtypes, both on
        # same-shaped operands and with the left operand cut down to length 1
        # along each leading axis in turn.
        def compare(np_fn, mx_fn, x1, x2, y1, y2, atol):
            lhs_pairs = (
                (x1, y1),
                (x1[:1], y1[:1]),
                (x1[:, :1], y1[:, :1]),
                (x1[:, :, :1], y1[:, :, :1]),
            )
            for lhs_np, lhs_mx in lhs_pairs:
                want = np_fn(lhs_np, x2)
                got = mx_fn(lhs_mx, y2)
                mx.eval(got)
                self.assertTrue(np.allclose(want, got, atol=atol))

        # Inputs are clamped to at least 0.1.
        x1 = np.maximum(np.random.rand(18, 28, 38), 0.1)
        x2 = np.maximum(np.random.rand(18, 28, 38), 0.1)
        y1 = mx.array(x1)
        y2 = mx.array(x2)
        mx.eval(y1, y2)

        int_dtypes = [
            "int8",
            "int16",
            "int32",
            "int64",
            "uint8",
            "uint16",
            "uint32",
            "uint64",
        ]
        float_dtypes = ["float16", "float32"]
        # Ops that are only exercised on a subset of dtypes.
        restricted = {
            "divide": float_dtypes,
            "power": float_dtypes,
            "floor_divide": ["float32"] + int_dtypes,
        }
        op_names = [
            "add",
            "subtract",
            "multiply",
            "divide",
            "floor_divide",
            "maximum",
            "minimum",
            "power",
        ]

        for op in op_names:
            with self.subTest(op=op):
                for dtype in restricted.get(op, int_dtypes + float_dtypes):
                    atol = 1e-3 if dtype == "float16" else 1e-6
                    with self.subTest(dtype=dtype):
                        # Integer dtypes get the inputs scaled by 10 before
                        # the cast.
                        scale = 10 if dtype in int_dtypes else 1
                        np_dtype = getattr(np, dtype)
                        x1_ = (x1 * scale).astype(np_dtype)
                        x2_ = (x2 * scale).astype(np_dtype)
                        y1_ = mx.array(x1_)
                        y2_ = mx.array(x2_)
                        compare(
                            getattr(np, op), getattr(mx, op), x1_, x2_, y1_, y2_, atol
                        )

    def test_irregular_binary_ops(self):
        # Addition on transposed, broadcast and strided operands is compared
        # with NumPy. The RNG is seeded, so the order of random draws below
        # is kept fixed.
        ranks = [2, 3, 4, 5]
        side = 3
        trials_per_rank = 2
        np.random.seed(0)

        # Check transposed binary ops
        for rank in ranks:
            cube = [side] * rank
            anp = np.random.randint(-20, 20, (side**rank,)).reshape(cube)
            bnp = np.random.randint(-20, 20, (side**rank,)).reshape(cube)
            for _ in range(trials_per_rank * rank):
                amlx = mx.array(anp)
                bmlx = mx.array(bnp)
                perm_a = np.random.permutation(rank).tolist()
                perm_b = np.random.permutation(rank).tolist()
                want = np.add(anp.transpose(perm_a), bnp.transpose(perm_b))
                got = mx.add(mx.transpose(amlx, perm_a), mx.transpose(bmlx, perm_b))
                self.assertTrue(np.array_equal(want, got))

        # Check broadcast binary ops
        for rank in ranks:
            anp = np.random.randint(-20, 20, (side**rank,)).reshape([side] * rank)
            for kept in range(rank):
                bnp = np.random.randint(-20, 20, (side**kept,)).reshape([side] * kept)
                for _ in range(trials_per_rank * rank):
                    amlx = mx.array(anp)
                    bmlx = mx.array(bnp)
                    # `kept` full-size axes and the rest of size 1, in a
                    # random order.
                    b_shape = [1] * (rank - kept) + [side] * kept
                    np.random.shuffle(b_shape)
                    want = np.add(anp, bnp.reshape(b_shape))
                    got = mx.add(amlx, mx.reshape(bmlx, b_shape))
                    self.assertTrue(np.array_equal(want, got))

        # Check strided binary ops
        every = slice(None)
        every_other = slice(None, None, 2)
        for rank in ranks:
            a = np.random.randint(-20, 20, (10,) * rank)
            b = np.random.randint(-20, 20, (10,) * rank)
            a_ = mx.array(a)
            b_ = mx.array(b)
            for order in permutations(range(rank)):
                for strided_axis in range(rank):
                    # Step 2 along one axis, full slices everywhere else.
                    idx = tuple(
                        [every] * strided_axis
                        + [every_other]
                        + [every] * (rank - strided_axis - 1)
                    )
                    want = a.transpose(order)[idx] + b[idx]
                    got = mx.transpose(a_, order)[idx] + b_[idx]
                    self.assertTrue(np.array_equal(want, got))

    def test_softmax(self):
        # Reference implementation: subtract the max before exponentiating.
        def np_softmax(x, axis):
            shifted = np.exp(x - np.max(x, axis=axis, keepdims=True))
            return shifted / np.sum(shifted, axis=axis, keepdims=True)

        all_axes = (None, 0, 1, 2, (0, 1), (1, 2), (0, 2), (0, 1, 2))
        for dtype, atol in [(np.float32, 1e-6), (np.float16, 1e-3)]:
            a_npy = np.random.randn(16, 8, 32).astype(dtype)
            a_mlx = mx.array(a_npy)

            for axes in all_axes:
                want = np_softmax(a_npy, axes)
                got = mx.softmax(a_mlx, axes)
                self.assertTrue(np.allclose(want, got, atol=atol))

        # All -inf except a trailing 0: the result must be finite, with all
        # of the mass on the last element.
        for length in [100, 2049, 4097, 8193]:
            logits = np.full(length, -np.inf)
            logits[-1] = 0.0
            probs = mx.softmax(mx.array(logits))
            self.assertFalse(np.any(np.isnan(probs)))
            self.assertTrue((probs[:-1] < 1e-9).all())
            self.assertEqual(probs[-1], 1)

        # Sliced inputs
        y = mx.random.uniform(shape=(8, 4))
        sliced = mx.softmax(y[:, 0:2], axis=-1)
        self.assertAlmostEqual(sliced.sum().item(), 8.0, 5)

        # Precise
        for low_dtype in [mx.float16, mx.bfloat16]:
            a = (10 * mx.random.normal(shape=(1024,))).astype(low_dtype)
            want = mx.softmax(a.astype(mx.float32)).astype(low_dtype)
            got = mx.softmax(a, axis=-1, precise=True)
            self.assertTrue(mx.allclose(want, got))

        # All Infs give NaNs
        for n in [127, 128, 129]:
            all_neg_inf = mx.full((n,), vals=-float("inf"))
            self.assertTrue(mx.all(mx.isnan(mx.softmax(all_neg_inf))))

        # Transposed inputs
        a = mx.random.uniform(shape=(32, 32, 32))
        direct = mx.softmax(a, axis=-1)
        via_swap = mx.softmax(a.swapaxes(0, 1), axis=-1).swapaxes(0, 1)
        self.assertEqual((direct - via_swap).abs().max().item(), 0.0)

        # A 0-d input has no axis to reduce over
        with self.assertRaises(ValueError):
            mx.softmax(mx.array(1.0), axis=-1)

    def test_concatenate(self):
        # Concatenation is compared with NumPy for every axis (including the
        # flattening axis=None) and every transposition of the second operand.
        a_npy = np.random.randn(32, 32, 32)
        b_npy = np.random.randn(32, 32, 32)
        a_mlx = mx.array(a_npy)
        b_mlx = mx.array(b_npy)

        for axis in (None, 0, 1, 2):
            for perm in permutations([0, 1, 2]):
                want = np.concatenate([a_npy, np.transpose(b_npy, perm)], axis=axis)
                got = mx.concatenate([a_mlx, mx.transpose(b_mlx, perm)], axis=axis)
                self.assertEqual(list(want.shape), list(got.shape))
                self.assertTrue(np.allclose(want, got, atol=1e-6))

        # Operands of different rank cannot be concatenated
        with self.assertRaises(ValueError):
            tall = mx.array([[1, 2], [1, 2], [1, 2]])
            flat = mx.array([1, 2])
            mx.concatenate([tall, flat], axis=0)

        # Concatenate with 0-sized array
        hollow = mx.zeros((2, 0, 2))
        solid = mx.zeros((2, 2, 2))
        joined = mx.concatenate([hollow, solid], axis=1)
        self.assertTrue(mx.array_equal(joined, solid))

    def test_meshgrid(self):
        # Every scenario builds the same grids with mx.meshgrid and
        # np.meshgrid and compares them element-wise.
        x = mx.array([1, 2, 3], dtype=mx.int32)
        y = np.array([1, 2, 3], dtype=np.int32)

        # Test single input
        grids_mx = mx.meshgrid(x)
        grids_np = np.meshgrid(y)
        self.assertEqualArray(grids_mx[0], mx.array(grids_np[0]))

        # Test sparse
        a_mlx, b_mlx, c_mlx = mx.meshgrid(x, x, x, sparse=True)
        a_np, b_np, c_np = np.meshgrid(y, y, y, sparse=True)
        for got, want in ((a_mlx, a_np), (b_mlx, b_np), (c_mlx, c_np)):
            self.assertEqualArray(got, mx.array(want))

        # Test different lengths
        x = mx.array([1, 2], dtype=mx.int32)
        y = mx.array([1, 2, 3], dtype=mx.int32)
        z = np.array([1, 2], dtype=np.int32)
        w = np.array([1, 2, 3], dtype=np.int32)
        a_mlx, b_mlx = mx.meshgrid(x, y)
        a_np, b_np = np.meshgrid(z, w)
        for got, want in ((a_mlx, a_np), (b_mlx, b_np)):
            self.assertEqualArray(got, mx.array(want))

        # Test empty input
        x = mx.array([], dtype=mx.int32)
        y = np.array([], dtype=np.int32)
        grids_mx = mx.meshgrid(x)
        grids_np = np.meshgrid(y)
        self.assertEqualArray(grids_mx[0], mx.array(grids_np[0]))

        # Test float32 input
        x = mx.array([1.1, 2.2, 3.3], dtype=mx.float32)
        y = np.array([1.1, 2.2, 3.3], dtype=np.float32)
        grids_mx = mx.meshgrid(x, x, x)
        grids_np = np.meshgrid(y, y, y)
        for i in range(3):
            self.assertEqualArray(grids_mx[i], mx.array(grids_np[i]))

        # Test ij indexing
        x = mx.array([1.1, 2.2, 3.3, 4.4, 5.5], dtype=mx.float32)
        y = np.array([1.1, 2.2, 3.3, 4.4, 5.5], dtype=np.float32)
        grids_mx = mx.meshgrid(x, x, indexing="ij")
        grids_np = np.meshgrid(y, y, indexing="ij")
        for i in range(2):
            self.assertEqualArray(grids_mx[i], mx.array(grids_np[i]))

        # Test different lengths, sparse, and ij indexing
        a = mx.array([1, 2], dtype=mx.int64)
        b = mx.array([1, 2, 3], dtype=mx.int64)
        c = mx.array([1, 2, 3, 4], dtype=mx.int64)
        x = np.array([1, 2], dtype=np.int64)
        y = np.array([1, 2, 3], dtype=np.int64)
        z = np.array([1, 2, 3, 4], dtype=np.int64)
        a_mlx, b_mlx, c_mlx = mx.meshgrid(a, b, c, sparse=True, indexing="ij")
        a_np, b_np, c_np = np.meshgrid(x, y, z, sparse=True, indexing="ij")
        for got, want in ((a_mlx, a_np), (b_mlx, b_np), (c_mlx, c_np)):
            self.assertEqualArray(got, mx.array(want))

    def test_pad(self):
        # Compare mx.pad with np.pad (constant and edge modes), check the
        # pad-width shorthand forms, and verify the vjp of a constant pad.
        cases = [
            ([(1, 1), (1, 1), (1, 1)], 0),
            ([(1, 1), (1, 1), (1, 1)], 5),
            ([(3, 0), (0, 2), (5, 7)], 0),
            ([(3, 0), (0, 2), (5, 7)], -7),
            ([(0, 0), (0, 0), (0, 0)], 0),
        ]

        def assert_matches(ref, got):
            # Same shape and (nearly) the same values.
            self.assertEqual(list(ref.shape), list(got.shape))
            self.assertTrue(np.allclose(ref, got, atol=1e-6))

        for widths, fill in cases:
            with self.subTest(pad_width=widths, value=fill):
                src_np = np.random.randn(16, 16, 16).astype(np.float32)
                src_mx = mx.array(src_np)

                assert_matches(
                    np.pad(src_np, widths, constant_values=fill),
                    mx.pad(src_mx, widths, constant_values=fill),
                )
                assert_matches(
                    np.pad(src_np, widths, mode="edge"),
                    mx.pad(src_mx, widths, mode="edge"),
                )

        # Shorthand pad widths and the output shape each should produce.
        unit = mx.zeros((1, 1, 1))
        shorthand = [
            (1, (3, 3, 3)),
            ((1,), (3, 3, 3)),
            ([1], (3, 3, 3)),
            ((1, 2), (4, 4, 4)),
            ([(1, 2)], (4, 4, 4)),
            (((1, 2),), (4, 4, 4)),
            (((1, 2), (2, 1), (2, 2)), (4, 4, 5)),
        ]
        for widths, want in shorthand:
            self.assertEqual(mx.pad(unit, widths).shape, want)

        # Gradient: the vjp of a pad is the cotangent cropped to the input.
        primal = mx.array(np.random.rand(16, 16).astype(np.float32))
        cotangent = mx.ones((22, 22))

        def pad_fn(x):
            return mx.pad(x, ((4, 2), (2, 4)))

        _, grads = mx.vjp(pad_fn, [primal], [cotangent])
        self.assertTrue(mx.allclose(cotangent[4:-2, 2:-4], grads[0]).item())

    def test_where(self):
        # Compare mx.where with np.where for scalar and array operands, then
        # exercise a non-contiguous, six-dimensional input.
        scalar_cases = (
            [True, mx.array([[1, 2], [3, 4]]), 1],
            [True, 1, mx.array([[1, 2], [3, 4]])],
        )
        for args in scalar_cases:
            self.assertCmpNumpy(args, mx.where, np.where)

        mask = mx.array([[True, False], [False, True]])
        on_true = mx.array([[1, 2], [3, 4]])
        on_false = mx.array([5, 6])
        self.assertCmpNumpy([mask, on_true, on_false], mx.where, np.where)

        # Check non-contiguous input with several dimensions
        base = mx.ones(shape=(1, 4, 4, 1))
        view = mx.as_strided(base, [1, 2, 2, 3, 3, 1], [16, 4, 1, 4, 1, 1])
        cleaned = mx.where(mx.isnan(view), mx.nan, view)
        self.assertTrue(mx.allclose(cleaned, mx.ones_like(cleaned)))

    def test_nan_to_num(self):
        # Compare mx.nan_to_num with np.nan_to_num, both with the default
        # replacement values and with explicit nan/posinf/neginf overrides.
        a = mx.array([6, float("inf"), 2, 0])
        out_mx = mx.nan_to_num(a)
        out_np = np.nan_to_num(a)
        self.assertTrue(np.allclose(out_mx, out_np))

        specials = [float("inf"), 6.9, float("nan"), float("-inf")]
        for t in [mx.float32, mx.float16]:
            with self.subTest(dtype=t):
                # Cast to the dtype under test so that the default
                # replacements (the dtype's largest finite values) are
                # exercised per dtype; previously this check ignored `t`
                # and simply ran twice on float32.
                a = mx.array(specials).astype(t)
                out_mx = mx.nan_to_num(a)
                out_np = np.nan_to_num(a)
                self.assertTrue(np.allclose(out_mx, out_np))

                # Explicit replacement values.
                out_np = np.nan_to_num(a, nan=0.0, posinf=1000, neginf=-1000)
                out_mx = mx.nan_to_num(a, nan=0.0, posinf=1000, neginf=-1000)
                self.assertTrue(np.allclose(out_mx, out_np))

    def test_as_strided(self):
        # Compare mx.as_strided with NumPy's stride tricks for several
        # shape/stride/offset combinations, then check a negative stride.
        src_np = np.random.randn(128).astype(np.float32)
        src_mx = mx.array(src_np)

        shapes = [(10, 10), (5, 5), (2, 20), (10,)]
        strides = [(3, 3), (7, 1), (1, 5), (4,)]
        offsets = [0, 1, 3]
        for shape, stride in zip(shapes, strides):
            for offset in offsets:
                # The stride passed to NumPy is the MLX stride multiplied by 4.
                byte_strides = np.multiply(stride, 4)
                ref = np.lib.stride_tricks.as_strided(
                    src_np[offset:], shape, byte_strides
                )
                got = mx.as_strided(src_mx, shape, stride, offset)
                self.assertTrue(np.array_equal(ref, got))

        # A stride of -1 starting at the last element reverses the array.
        vec = mx.random.uniform(shape=(32,))
        flipped = mx.as_strided(vec, (vec.size,), (-1,), vec.size - 1)
        self.assertTrue(mx.array_equal(flipped, vec[::-1]))

    def test_logcumsumexp(self):
        # Compare mx.logcumsumexp with np.logaddexp.accumulate on random
        # data and infinite edge cases, then with np_cumlogaddexp on a
        # complex input.
        ref_scan = np.logaddexp.accumulate
        mlx_scan = mx.logcumsumexp

        data_np = np.random.randn(32, 32, 32).astype(np.float32)
        data_mx = mx.array(data_np)

        for axis in (0, 1, 2):
            ref = ref_scan(data_np, axis=axis)
            got = mlx_scan(data_mx, axis=axis)
            self.assertTrue(np.allclose(ref, got, rtol=1e-3, atol=1e-3))

        # Inputs made of / containing infinities.
        neg_inf = -float("inf")
        pos_inf = float("inf")
        edge_np = [
            np.float32([neg_inf] * 8),
            np.float32([neg_inf, 0, neg_inf]),
            np.float32([neg_inf, pos_inf, neg_inf]),
        ]
        edge_mx = [mx.array(e) for e in edge_np]

        for e_np, e_mx in zip(edge_np, edge_mx):
            ref = ref_scan(e_np, axis=0)
            got = mlx_scan(e_mx, axis=0)
            self.assertTrue(np.allclose(ref, got, rtol=1e-3, atol=1e-3))

        # Complex tests

        z_np = np.array([1, 2, 3]).astype(np.float32) + 1j
        z_mx = mx.array(z_np)
        ref = np_cumlogaddexp(z_np, axis=-1)
        got = mlx_scan(z_mx, axis=-1)
        self.assertTrue(np.allclose(ref, got, rtol=1e-3, atol=1e-3))

    def test_scans(self):
        # Exercise the cumulative ops (cumsum, cumprod, cummax, cummin):
        # comparison with NumPy on float, complex and integer data, the
        # inclusive/reverse flags, half-precision inputs, sizes around
        # 1024/2048, a scan over a non-last axis, and peak-memory stability.
        a_npy = np.random.randn(32, 32, 32).astype(np.float32)
        a_mlx = mx.array(a_npy)

        # float32: compare with NumPy along each axis and flattened (None).
        for op in ["cumsum", "cumprod"]:
            npop = getattr(np, op)
            mxop = getattr(mx, op)
            for axis in (None, 0, 1, 2):
                c_npy = npop(a_npy, axis=axis)
                c_mlx = mxop(a_mlx, axis=axis)
                self.assertTrue(np.allclose(c_npy, c_mlx, rtol=1e-3, atol=1e-3))

        # Complex test

        a_npy = np.random.randn(32, 32, 32).astype(np.float32) + 0.5j
        a_mlx = mx.array(a_npy)

        for op in ["cumsum", "cumprod"]:
            npop = getattr(np, op)
            mxop = getattr(mx, op)
            for axis in (None, 0, 1, 2):
                c_npy = npop(a_npy, axis=axis)
                c_mlx = mxop(a_mlx, axis=axis)
                self.assertTrue(np.allclose(c_npy, c_mlx, rtol=1e-3, atol=1e-3))

        # Integer inputs: results must match NumPy exactly when NumPy is told
        # to accumulate in the input dtype.
        a_mlx = mx.random.randint(shape=(32, 32, 32), low=-100, high=100)
        for dt in [mx.int32, mx.int64]:
            mxx = a_mlx.astype(dt)
            npx = np.array(mxx)
            for op in ["cumsum", "cumprod"]:
                npop = getattr(np, op)
                mxop = getattr(mx, op)
                for axis in (None, 0, 1, 2):
                    c_npy = npop(npx, axis=axis, dtype=npx.dtype)
                    c_mlx = mxop(mxx, axis=axis)
                    self.assertTrue(np.array_equal(c_npy, c_mlx))

        # Internal consistency of the inclusive/reverse flags for all four ops.
        a_mlx = mx.random.randint(shape=(32, 32, 32), low=-100, high=100)
        for op in ["cumsum", "cumprod", "cummax", "cummin"]:
            mxop = getattr(mx, op)
            # Exclusive forward scan equals the inclusive scan shifted by one.
            c1 = mxop(a_mlx, axis=2)
            c2 = mxop(a_mlx, axis=2, inclusive=False, reverse=False)
            self.assertTrue(mx.array_equal(c1[:, :, :-1], c2[:, :, 1:]))
            c1 = mxop(a_mlx, axis=1)
            c2 = mxop(a_mlx, axis=1, inclusive=False, reverse=False)
            self.assertTrue(mx.array_equal(c1[:, :-1, :], c2[:, 1:, :]))
            c1 = mxop(a_mlx, axis=0)
            c2 = mxop(a_mlx, axis=0, inclusive=False, reverse=False)
            self.assertTrue(mx.array_equal(c1[:-1, :, :], c2[1:, :, :]))

            # Inclusive reverse scan equals: flip input, scan forward, flip back.
            rev_idx = mx.arange(31, -1, -1)
            c1 = mxop(a_mlx[:, :, rev_idx], axis=2)[:, :, rev_idx]
            c2 = mxop(a_mlx, axis=2, inclusive=True, reverse=True)
            self.assertTrue(mx.array_equal(c1, c2))
            c1 = mxop(a_mlx[:, rev_idx, :], axis=1)[:, rev_idx, :]
            c2 = mxop(a_mlx, axis=1, inclusive=True, reverse=True)
            self.assertTrue(mx.array_equal(c1, c2))
            c1 = mxop(a_mlx[rev_idx, :, :], axis=0)[rev_idx, :, :]
            c2 = mxop(a_mlx, axis=0, inclusive=True, reverse=True)
            self.assertTrue(mx.array_equal(c1, c2))

            # Exclusive reverse scan equals the flipped inclusive scan shifted
            # by one in the opposite direction.
            rev_idx = mx.arange(31, -1, -1)
            c1 = mxop(a_mlx[:, :, rev_idx], axis=2)[:, :, rev_idx][:, :, 1:]
            c2 = mxop(a_mlx, axis=2, inclusive=False, reverse=True)[:, :, :-1]
            self.assertTrue(mx.array_equal(c1, c2))
            c1 = mxop(a_mlx[:, rev_idx, :], axis=1)[:, rev_idx, :][:, 1:, :]
            c2 = mxop(a_mlx, axis=1, inclusive=False, reverse=True)[:, :-1, :]
            self.assertTrue(mx.array_equal(c1, c2))
            c1 = mxop(a_mlx[rev_idx, :, :], axis=0)[rev_idx, :, :][1:, :, :]
            c2 = mxop(a_mlx, axis=0, inclusive=False, reverse=True)[:-1, :, :]
            self.assertTrue(mx.array_equal(c1, c2))

        # Half-precision cumsum, checked against a lower-triangular mask
        # multiplied with the input and summed over the last axis.
        a = mx.random.uniform(shape=(8, 32))
        mat = mx.tri(32)
        for t in [mx.float16, mx.bfloat16]:
            a_t = a.astype(t)
            mat_t = mat.astype(t)
            out = mx.cumsum(a_t, axis=-1)
            expected = (mat_t * a_t[:, None, :]).sum(axis=-1)
            self.assertTrue(mx.allclose(out, expected, rtol=0.02, atol=1e-3))
        # Lengths just below, at and just above 1024 and 2048.
        sizes = [1023, 1024, 1025, 2047, 2048, 2049]
        for s in sizes:
            a = mx.ones((s,), mx.int32)
            out = mx.cumsum(a)
            expected = mx.arange(1, s + 1, dtype=mx.int32)
            self.assertTrue(mx.array_equal(expected, out))

            # non-contiguous scan
            a = mx.ones((s, 2), mx.int32)
            out = mx.cumsum(a, axis=0)
            expected = mx.repeat(expected[:, None], 2, axis=1)
            self.assertTrue(mx.array_equal(expected, out))

        # Test donation
        def fn(its):
            x = mx.ones((32,))
            for _ in range(its):
                x = mx.cumsum(x)
            return x

        # Peak memory must be the same after chaining 2 and 4 scans.
        mx.synchronize()
        mx.eval(fn(2))
        mx.synchronize()
        mem2 = mx.get_peak_memory()
        mx.eval(fn(4))
        mx.synchronize()
        mem4 = mx.get_peak_memory()
        self.assertEqual(mem2, mem4)

    def test_squeeze_expand(self):
        # Output shapes of squeeze (function and method forms) and expand_dims.
        x = mx.zeros((2, 1, 2, 1))
        squeeze_cases = [
            ((), (2, 2)),
            ((1,), (2, 2, 1)),
            (([1, 3],), (2, 2)),
        ]
        for args, want in squeeze_cases:
            self.assertEqual(mx.squeeze(x, *args).shape, want)
        for args, want in squeeze_cases:
            self.assertEqual(x.squeeze(*args).shape, want)

        # Squeezing an array without singleton axes leaves the shape alone.
        x = mx.zeros((2, 2))
        self.assertEqual(mx.squeeze(x).shape, (2, 2))

        expand_cases = [
            (0, (1, 2, 2)),
            ((0, 1), (1, 1, 2, 2)),
            ([0, -1], (1, 2, 2, 1)),
        ]
        for axes, want in expand_cases:
            self.assertEqual(mx.expand_dims(x, axes).shape, want)

    def test_sort(self):
        # Compare mx.sort / mx.argsort with NumPy over dtypes, axes and
        # strided inputs, followed by long (32769-element) axes, broadcast
        # (zero-stride) inputs, a very large GPU-only case, and a few small
        # regression-style checks.
        shape = (6, 4, 10)
        tests = product(
            ("int32", "float32"),  # type
            (None, 0, 1, 2),  # axis
            (True, False),  # strided
        )
        for dtype, axis, strided in tests:
            with self.subTest(dtype=dtype, axis=axis, strided=strided):
                # Re-seed so that every sub-test sees the same data.
                np.random.seed(0)
                np_dtype = getattr(np, dtype)
                a_np = np.random.uniform(0, 100, size=shape).astype(np_dtype)
                a_mx = mx.array(a_np)
                if strided:
                    a_mx = a_mx[::2, :, ::2]
                    a_np = a_np[::2, :, ::2]

                b_np = np.sort(a_np, axis=axis)
                b_mx = mx.sort(a_mx, axis=axis)

                self.assertTrue(np.array_equal(b_np, b_mx))
                self.assertEqual(b_mx.dtype, a_mx.dtype)

                # Compare the values gathered through the argsort indices
                # rather than the indices themselves.
                c_np = np.argsort(a_np, axis=axis)
                c_mx = mx.argsort(a_mx, axis=axis)
                d_np = np.take_along_axis(a_np, c_np, axis=axis)
                d_mx = mx.take_along_axis(a_mx, c_mx, axis=axis)

                self.assertTrue(np.array_equal(d_np, d_mx))
                self.assertEqual(c_mx.dtype, mx.uint32)

        # Set random seed
        np.random.seed(0)

        # Test multi-block sort
        for strided in (False, True):
            with self.subTest(strided=strided):
                a_np = np.random.normal(size=(32769,)).astype(np.float32)
                a_mx = mx.array(a_np)

                if strided:
                    a_mx = a_mx[::3]
                    a_np = a_np[::3]

                b_np = np.sort(a_np)
                b_mx = mx.sort(a_mx)

                self.assertTrue(np.array_equal(b_np, b_mx))
                self.assertEqual(b_mx.dtype, a_mx.dtype)

                # Test multi-dim multi-block sort
                a_np = np.random.normal(size=(2, 4, 32769)).astype(np.float32)
                a_mx = mx.array(a_np)

                if strided:
                    a_mx = a_mx[..., ::3]
                    a_np = a_np[..., ::3]

                b_np = np.sort(a_np, axis=-1)
                b_mx = mx.sort(a_mx, axis=-1)

                self.assertTrue(np.array_equal(b_np, b_mx))
                self.assertEqual(b_mx.dtype, a_mx.dtype)

                # Same, with the long axis in the middle.
                a_np = np.random.normal(size=(2, 32769, 4)).astype(np.float32)
                a_mx = mx.array(a_np)

                if strided:
                    a_mx = a_mx[:, ::3]
                    a_np = a_np[:, ::3]

                b_np = np.sort(a_np, axis=1)
                b_mx = mx.sort(a_mx, axis=1)

                self.assertTrue(np.array_equal(b_np, b_mx))
                self.assertEqual(b_mx.dtype, a_mx.dtype)

        # test 0 strides
        a_np = np.array([1, 0, 2, 1, 3, 0, 4, 0])
        a_mx = mx.array(a_np)
        b_np = np.broadcast_to(a_np, (16, 8))
        b_mx = mx.broadcast_to(a_mx, (16, 8))
        mx.eval(b_mx)
        for axis in (0, 1):
            c_np = np.sort(b_np, axis=axis)
            c_mx = mx.sort(b_mx, axis=axis)
            self.assertTrue(np.array_equal(c_np, c_mx))
            self.assertEqual(b_mx.dtype, c_mx.dtype)

        # Test very large array
        if mx.default_device() == mx.gpu:
            a_np = np.random.normal(20, 20, size=(2**22)).astype(np.float32)
            a_mx = mx.array(a_np)

            b_np = np.sort(a_np)
            b_mx = mx.sort(a_mx)
            self.assertTrue(np.array_equal(b_np, b_mx))

        # 1D strided sort
        a = mx.array([[4, 3], [2, 1], [5, 4], [3, 2]])
        out = mx.argsort(a[:, 1])
        expected = mx.array([1, 3, 0, 2], dtype=mx.uint32)
        self.assertTrue(mx.array_equal(out, expected))

        # Sorting an already-sorted 1D array is the identity
        out = mx.sort(mx.array([1, 2, 3]), axis=0)
        self.assertTrue(mx.array_equal(out, mx.array([1, 2, 3])))

        # Test array with singleton dims around the sorted axis
        x = np.random.uniform(size=(1, 4, 8, 1)).astype(np.float32)
        y_np = np.sort(x, axis=-2)
        y_mx = mx.sort(mx.array(x), axis=-2)
        self.assertTrue(np.array_equal(y_np, y_mx))

        # Test many segments
        a = mx.random.uniform(shape=(512, 128))
        y_mx = mx.sort(a, axis=-1)
        y_np = np.sort(np.array(a), axis=-1)
        self.assertTrue(np.array_equal(y_np, y_mx))

    def test_partition(self):
        # Compare the kth element produced by mx.partition with NumPy, and
        # sanity-check mx.topk for non-negative kth.
        shape = (3, 4, 5)
        grid = product(("int32", "float32"), (None, 0, 1, 2), (-2, 0, 2))
        for dtype, axis, kth in grid:
            with self.subTest(dtype=dtype, axis=axis, kth=kth):
                # Identical data for every sub-test.
                np.random.seed(0)
                np_dtype = getattr(np, dtype)
                src_np = np.random.uniform(0, 100, size=shape).astype(np_dtype)
                src_mx = mx.array(src_np)

                part_np = np.partition(src_np, kth, axis=axis)
                part_mx = mx.partition(src_mx, kth, axis=axis)

                # Only the element at position kth is compared.
                pivot_np = np.take(part_np, (kth,), axis=axis)
                pivot_mx = np.take(np.array(part_mx), (kth,), axis=axis)

                self.assertTrue(np.array_equal(pivot_np, pivot_mx))
                self.assertEqual(part_mx.dtype, src_mx.dtype)

                if kth < 0:
                    continue

                top_mx = mx.topk(src_mx, kth, axis=axis)
                threshold_np = np.take(
                    np.partition(src_np, -kth, axis=axis), (-kth,), axis=axis
                )
                self.assertTrue(np.all(threshold_np <= top_mx))
                self.assertEqual(top_mx.dtype, src_mx.dtype)
                if axis is not None:
                    extent = src_mx.shape[axis]
                else:
                    extent = src_mx.size
                kept = top_mx.shape[axis or 0]
                self.assertEqual(kept, (kth + extent) % extent)

    def test_argpartition(self):
        # argpartition along axis 0 for a broadcast input and a transposed
        # input.
        broadcast_in = mx.broadcast_to(mx.array([1, 2, 3]), (2, 3))
        got = mx.argpartition(broadcast_in, kth=1, axis=0)
        want = mx.array([[0, 0, 0], [1, 1, 1]])
        self.assertTrue(mx.array_equal(got, want))

        transposed_in = mx.array([[1, 2], [3, 4]]).T
        got = mx.argpartition(transposed_in, kth=1, axis=0)
        want = mx.array([[0, 0], [1, 1]])
        self.assertTrue(mx.array_equal(got, want))

    @unittest.skipIf(
        os.getenv("LOW_MEMORY", None) is not None,
        "This test requires a lot of memory",
    )
    def test_large_binary(self):
        # Broadcast add on a 1000 x 2147484 int8 array (more than 2**31
        # elements); skipped when the LOW_MEMORY environment variable is set.
        lhs = mx.ones([1000, 2147484], mx.int8)
        rhs = mx.ones([2147484], mx.int8)
        total = lhs + rhs
        self.assertEqual(total[0, 0].item(), 2)

    def test_eye(self):
        # Compare mx.eye with np.eye: square, non-square, and positive /
        # negative diagonal offsets.
        cases = [
            ([3], {}),
            ([3, 4], {}),
            ([3, 4], {"k": 1}),
            ([5, 6], {"k": -2}),
        ]
        for args, kwargs in cases:
            self.assertCmpNumpy(args, mx.eye, np.eye, **kwargs)

    def test_stack(self):
        # Compare mx.stack with np.stack for 1D and 2D operands, using the
        # default axis and axis=1.
        for shape in ((2,), (1, 2)):
            first_mx = mx.ones(shape)
            first_np = np.ones(shape)
            second_mx = mx.ones(shape)
            second_np = np.ones(shape)

            for kwargs in ({}, {"axis": 1}):
                got = mx.stack([first_mx, second_mx], **kwargs)
                ref = np.stack([first_np, second_np], **kwargs)
                self.assertTrue(np.array_equal(got, ref))

    def test_flatten(self):
        # Output shapes of flatten in both function and method form.
        x = mx.zeros([2, 3, 4])
        cases = [
            ({}, (2 * 3 * 4,)),
            ({"start_axis": 1}, (2, 3 * 4)),
            ({"end_axis": 1}, (2 * 3, 4)),
        ]
        for kwargs, want in cases:
            self.assertEqual(mx.flatten(x, **kwargs).shape, want)
        for kwargs, want in cases:
            self.assertEqual(x.flatten(**kwargs).shape, want)

    def test_clip(self):
        # Compare mx.clip with np.clip for scalar, missing and array bounds,
        # then check the output dtype for several bound types.

        def assert_clip(data, np_bounds, mx_bounds):
            want = np.clip(data, *np_bounds)
            got = mx.clip(mx.array(data), *mx_bounds)
            self.assertTrue(np.array_equal(got, want))

        # Both bounds, lower bound only, upper bound only.
        assert_clip(np.array([1, 4, 3, 8, 5], np.int32), (2, 6), (2, 6))
        assert_clip(np.array([-1, 1, 0, 5], np.int32), (0, None), (0, None))
        assert_clip(np.array([2, 3, 4, 5], np.int32), (None, 4), (None, 4))

        # Array-valued lower bound.
        mins = np.array([3, 1, 5, 5])
        assert_clip(
            np.array([2, 3, 4, 5], np.int32), (mins, 4), (mx.array(mins), 4)
        )

        # Array-valued lower and upper bounds.
        maxs = np.array([5, -1, 2, 9])
        assert_clip(
            np.array([2, 3, 4, 5], np.int32),
            (mins, maxs),
            (mx.array(mins), mx.array(maxs)),
        )

        # Check clip output types
        ints = mx.array([1, 2, 3], mx.int16)
        self.assertEqual(mx.clip(ints, a_min=0, a_max=5).dtype, mx.int16)
        self.assertEqual(mx.clip(ints, a_min=0.0, a_max=5).dtype, mx.float32)

        halves = mx.array([1, 2, 3], mx.float16)
        self.assertEqual(mx.clip(halves, a_min=0.0, a_max=5).dtype, mx.float16)

        halves = mx.array([1, 2, 3], mx.float16)
        result_type = mx.clip(halves, a_min=0.0, a_max=mx.array(1.0)).dtype
        self.assertEqual(result_type, mx.float32)

    def test_linspace(self):
        # Compare mx.linspace with np.linspace and check that the endpoints
        # are reproduced exactly for random ranges.

        def assert_same(got, ref_np):
            self.assertEqualArray(got, mx.array(ref_np))

        # Default num = 50
        assert_same(mx.linspace(0, 1), np.linspace(0, 1))

        # int64 dtype
        assert_same(mx.linspace(0, 10, 5, mx.int64), np.linspace(0, 10, 5, dtype=int))

        # Negative sequence with float start and stop
        assert_same(mx.linspace(-2.7, -0.7, 7), np.linspace(-2.7, -0.7, 7))

        # Step size of 1/9, which has no finite binary representation
        assert_same(mx.linspace(0, 1, 10), np.linspace(0, 1, 10))

        # num equal to 1
        assert_same(mx.linspace(1, 10, 1), np.linspace(1, 10, 1))

        # Ensure that the start and stop are always the ones provided
        endpoints = mx.random.normal((16, 2)).tolist()
        counts = (2 + mx.random.uniform(shape=(16,)) * 10).astype(mx.uint32).tolist()
        for (start, stop), count in zip(endpoints, counts):
            values = mx.linspace(start, stop, count).tolist()
            self.assertEqual(values[0], start)
            self.assertEqual(values[-1], stop)

    def test_repeat(self):
        # Compare mx.repeat with np.repeat for several inputs and axes.
        cube = mx.array([[[13, 3], [16, 6]], [[14, 4], [15, 5]], [[11, 1], [12, 2]]])

        # Zero repetitions
        self.assertCmpNumpy([cube, 0], mx.repeat, np.repeat)
        # Explicit axes 0 and 1
        for axis in (0, 1):
            self.assertCmpNumpy([cube, 2], mx.repeat, np.repeat, axis=axis)
        # No axis given
        self.assertCmpNumpy([cube, 2], mx.repeat, np.repeat)

        # 1D input along axis 0
        vector = mx.array([1, 3, 2])
        self.assertCmpNumpy([vector, 3], mx.repeat, np.repeat, axis=0)

        # 2D input along axis 0
        matrix = mx.array([[1, 2, 3], [4, 5, 4], [0, 1, 2]])
        self.assertCmpNumpy([matrix, 2], mx.repeat, np.repeat, axis=0)

    def test_tensordot(self):
        # Compare mx.tensordot with np.tensordot for explicit axis lists and
        # an integer axes count.
        # No fp16 matmuls on common cpu backend
        dtypes = [mx.float16, mx.float32] if self.is_apple_silicon else [mx.float32]

        cases = [
            ([(3, 4, 5), (4, 3, 2)], ([1, 0], [0, 1])),
            ([(3, 4, 5), (4, 5, 6)], 2),
            ([(3, 5, 4, 6), (6, 4, 5, 3)], ([2, 1, 3], [1, 2, 0])),
        ]
        for dtype in dtypes:
            with self.subTest(dtype=dtype):
                for shapes, axes in cases:
                    self.assertCmpNumpy(
                        shapes,
                        mx.tensordot,
                        np.tensordot,
                        dtype=dtype,
                        axes=axes,
                    )

    def test_inner(self):
        # Compare mx.inner with np.inner for a few shape combinations.
        shape_pairs = (
            [(3,), (3,)],
            [(1, 1, 2), (3, 2)],
            [(2, 3, 4), (4,)],
        )
        for shapes in shape_pairs:
            self.assertCmpNumpy(shapes, mx.inner, np.inner)

    def test_outer(self):
        # Compare mx.outer with np.outer for shape-specified, real and
        # complex operands.
        self.assertCmpNumpy([(3,), (3,)], mx.outer, np.outer)

        real_args = [mx.ones(5), mx.linspace(-2, 2, 5)]
        self.assertCmpNumpy(real_args, mx.outer, np.outer)

        complex_args = [1j * mx.linspace(2, -2, 5), mx.ones(5)]
        self.assertCmpNumpy(complex_args, mx.outer, np.outer)

    def test_divmod(self):
        # Compare the quotient returned by mx.divmod with np.divmod over
        # several shapes (with and without broadcasting) and dtypes.
        shape_pairs = [
            ((1,), (1,)),
            ((1,), (10,)),
            ((10,), (1,)),
            ((3,), (3,)),
            ((2, 2, 2), (1, 2, 1)),
            ((2, 1, 2), (1, 2, 1)),
            ((2, 2, 2, 2), (2, 2, 2, 2)),
        ]
        dtypes = [np.uint16, np.uint32, np.int32, np.float16, np.float32]
        for (s1, s2), t in product(shape_pairs, dtypes):
            # Operands are drawn from [1, 100), so the divisor is never zero.
            lhs = np.random.uniform(1, 100, size=s1).astype(t)
            rhs = np.random.uniform(1, 100, size=s2).astype(t)
            ref = np.divmod(lhs, rhs)
            got = mx.divmod(mx.array(lhs), mx.array(rhs))
            quotients_match = np.allclose(ref[0], got[0])
            self.assertTrue(quotients_match, msg=f"Shapes {s1} {s2}, Type {t}")

    def test_tile(self):
        # Compare mx.tile with np.tile; reps may be shorter than, equal to or
        # longer than the input rank.
        cases = (
            [(2,), [2]],
            [(2, 3, 4), [2]],
            [(2, 3, 4), [2, 1]],
            [(2, 3, 4), [2, 2]],
            [(3,), [2, 2, 2]],
        )
        for args in cases:
            self.assertCmpNumpy(args, mx.tile, np.tile)

    def test_empty_matmuls(self):
        # Products over an empty contraction dimension evaluate to zero.
        empty_lhs = mx.array([])
        empty_rhs = mx.array([])
        self.assertEqual(mx.inner(empty_lhs, empty_rhs).item(), 0.0)

        tall = mx.zeros((10, 0))
        wide = mx.zeros((0, 10))
        product_mat = tall @ wide
        self.assertTrue(mx.array_equal(product_mat, mx.zeros((10, 10))))

    def test_diagonal(self):
        # mx.diagonal on a (2, 3, 4) array with explicit offset and axes.
        x = mx.array(
            [
                [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]],
                [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]],
            ]
        )
        # Each entry: (positional arguments after x, expected nested list).
        cases = [
            ((0, -1, 0), [[0, 13], [4, 17], [8, 21]]),
            ((-1, 2, 0), [[1, 14], [5, 18], [9, 22]]),
        ]
        for args, want in cases:
            self.assertListEqual(mx.diagonal(x, *args).tolist(), want)

    def test_diag(self):
        # mx.diag: building a matrix from a vector and extracting a diagonal
        # from a matrix, with and without an offset.

        # 1D input builds a diagonal matrix
        vec = mx.array([1, 2, 3, 4])
        want = mx.array([[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]])
        got = mx.diag(vec)
        self.assertTrue(mx.array_equal(got, want))

        # 1D input with an offset
        vec = mx.array([2, 6])
        got = mx.diag(vec, k=5)
        want = mx.array(np.diag(vec, k=5))
        self.assertTrue(mx.array_equal(got, want))

        # 2D input extracts the main diagonal
        square = mx.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        want = mx.array([1, 5, 9])
        got = mx.diag(square)
        self.assertTrue(mx.array_equal(got, want))

        # 2D input with an offset
        want = mx.array([2, 6])
        got = mx.diag(square, 1)
        self.assertTrue(mx.array_equal(got, want))

        # Non-square input, main diagonal
        rect = mx.array([[1, 2, 3], [4, 5, 6]])
        got = mx.diag(rect)
        want = mx.array(np.diag(rect))
        self.assertTrue(mx.array_equal(got, want))

        # Non-square input, offsets outside and inside the matrix
        for offset in (10, -10, -1):
            got = mx.diag(rect, k=offset)
            want = mx.array(np.diag(rect, k=offset))
            self.assertTrue(mx.array_equal(got, want))

    def test_trace(self):
        # Compare mx.trace with np.trace for the offset, axis and dtype
        # arguments.
        mat_mx = mx.arange(9, dtype=mx.int64).reshape((3, 3))
        mat_np = np.arange(9, dtype=np.int64).reshape((3, 3))

        # Plain 2D trace
        got = mx.trace(mat_mx)
        ref = np.trace(mat_np)
        self.assertEqualArray(got, mx.array(ref))

        # Output dtype
        got = mx.trace(mat_mx, dtype=mx.float16)
        ref = np.trace(mat_np, dtype=np.float16)
        self.assertEqualArray(got, mx.array(ref))

        # Offset diagonal
        got = mx.trace(mat_mx, offset=1)
        ref = np.trace(mat_np, offset=1)
        self.assertEqualArray(got, mx.array(ref))

        # Explicit axis1 / axis2 on a 3D input
        cube_mx = mx.arange(27, dtype=mx.int64).reshape(3, 3, 3)
        cube_np = np.arange(27, dtype=np.int64).reshape(3, 3, 3)

        got = mx.trace(cube_mx, axis1=1, axis2=2)
        ref = np.trace(cube_np, axis1=1, axis2=2)
        self.assertEqualArray(got, mx.array(ref))

        # offset, axis1, axis2 and dtype together
        got = mx.trace(cube_mx, offset=1, axis1=1, axis2=2, dtype=mx.float32)
        ref = np.trace(cube_np, offset=1, axis1=1, axis2=2, dtype=np.float32)
        self.assertEqualArray(got, mx.array(ref))

    def test_atleast_1d(self):
        # Compare mx.atleast_1d with np.atleast_1d on nested lists of rank
        # 1, 2 and 4, and check that the multi-argument call agrees with the
        # single-argument call.
        nested = [
            [1],
            [1, 2, 3],
            [1, 2, 3, 4],
            [[1], [2], [3]],
            [[1, 2], [3, 4]],
            [[1, 2, 3], [4, 5, 6]],
            [[[[1]], [[2]], [[3]]]],
        ]

        promoted = [mx.atleast_1d(mx.array(values)) for values in nested]
        batched = mx.atleast_1d(*promoted)

        for idx, values in enumerate(nested):
            got = mx.atleast_1d(mx.array(values))
            ref = np.atleast_1d(np.array(values))
            self.assertEqual(got.shape, ref.shape)
            self.assertEqual(got.ndim, ref.ndim)
            self.assertTrue(mx.array_equal(got, batched[idx]))

    def test_atleast_2d(self):
        # Compare mx.atleast_2d with np.atleast_2d on nested lists of rank
        # 1, 2 and 4, and check that the multi-argument call agrees with the
        # single-argument call.
        nested = [
            [1],
            [1, 2, 3],
            [1, 2, 3, 4],
            [[1], [2], [3]],
            [[1, 2], [3, 4]],
            [[1, 2, 3], [4, 5, 6]],
            [[[[1]], [[2]], [[3]]]],
        ]

        promoted = [mx.atleast_2d(mx.array(values)) for values in nested]
        batched = mx.atleast_2d(*promoted)

        for idx, values in enumerate(nested):
            got = mx.atleast_2d(mx.array(values))
            ref = np.atleast_2d(np.array(values))
            self.assertEqual(got.shape, ref.shape)
            self.assertEqual(got.ndim, ref.ndim)
            self.assertTrue(mx.array_equal(got, batched[idx]))

    def test_atleast_3d(self):
        """``mx.atleast_3d`` agrees with NumPy on shape/ndim, for one or many inputs."""
        samples = [
            [1],
            [1, 2, 3],
            [1, 2, 3, 4],
            [[1], [2], [3]],
            [[1, 2], [3, 4]],
            [[1, 2, 3], [4, 5, 6]],
            [[[[1]], [[2]], [[3]]]],
        ]

        # Variadic form: all inputs are passed in a single call.
        promoted = [mx.atleast_3d(mx.array(sample)) for sample in samples]
        batched = mx.atleast_3d(*promoted)

        for idx, sample in enumerate(samples):
            got = mx.atleast_3d(mx.array(sample))
            want = np.atleast_3d(np.array(sample))
            self.assertEqual(got.shape, want.shape)
            self.assertEqual(got.ndim, want.ndim)
            # The single-input and variadic forms must produce the same array.
            self.assertTrue(mx.array_equal(got, batched[idx]))

    def test_issubdtype(self):
        """``mx.issubdtype`` agrees with ``np.issubdtype`` for every pair of names.

        Each name in ``cats`` is looked up on both ``mx`` and ``np``, so only
        dtypes/categories that exist under the same name in both are listed.
        """
        # bfloat16 has no NumPy counterpart, so it is checked on its own.
        self.assertTrue(mx.issubdtype(mx.bfloat16, mx.inexact))

        cats = [
            "complexfloating",
            "floating",
            "inexact",
            "signedinteger",
            "unsignedinteger",
            "integer",
            "number",
            "generic",
            "bool_",
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "complex64",
        ]

        for a in cats:
            for b in cats:
                self.assertEqual(
                    mx.issubdtype(getattr(mx, a), getattr(mx, b)),
                    np.issubdtype(getattr(np, a), getattr(np, b)),
                    f"mx and np don't agree on {a}, {b}",
                )

    def test_bitwise_ops(self):
        """Check bitwise operators against NumPy for every integer dtype.

        Covers and/or/xor, left/right shifts, inversion (both the ``~``
        operator and ``mx.bitwise_invert``) and broadcasting of ``|`` on
        boolean inputs.
        """
        types = [
            mx.uint8,
            mx.uint16,
            mx.uint32,
            mx.uint64,
            mx.int8,
            mx.int16,
            mx.int32,
            mx.int64,
        ]
        a = mx.random.randint(0, 4096, (1000,))
        b = mx.random.randint(0, 4096, (1000,))
        for op in ["bitwise_and", "bitwise_or", "bitwise_xor"]:
            for t in types:
                a_mlx = a.astype(t)
                b_mlx = b.astype(t)
                a_np = np.array(a_mlx)
                b_np = np.array(b_mlx)
                out_mlx = getattr(mx, op)(a_mlx, b_mlx)
                out_np = getattr(np, op)(a_np, b_np)
                self.assertTrue(np.array_equal(np.array(out_mlx), out_np))
        for op in ["left_shift", "right_shift"]:
            for t in types:
                a_mlx = a.astype(t)
                # Shift amounts are drawn from [0, t.size).
                b_mlx = mx.random.randint(0, t.size, (1000,)).astype(t)
                a_np = np.array(a_mlx)
                b_np = np.array(b_mlx)
                out_mlx = getattr(mx, op)(a_mlx, b_mlx)
                out_np = getattr(np, op)(a_np, b_np)
                self.assertTrue(np.array_equal(np.array(out_mlx), out_np))

        for t in types:
            a_mlx = a.astype(t)
            a_np = np.array(a_mlx)

            out_mlx = ~a_mlx
            out_np = ~a_np
            self.assertTrue(np.array_equal(np.array(out_mlx), out_np))

            out_mlx = mx.bitwise_invert(a_mlx)
            # The reference has to come from NumPy, not MLX, otherwise the
            # check compares MLX with itself. ``np.invert`` is used because
            # ``np.bitwise_invert`` only exists in NumPy 2.0 and later.
            out_np = np.invert(a_np)
            self.assertTrue(np.array_equal(np.array(out_mlx), out_np))

        # Check broadcasting
        a = mx.ones((3, 1, 5), dtype=mx.bool_)
        b = mx.zeros((1, 2, 5), dtype=mx.bool_)
        c = a | b
        self.assertEqual(c.shape, (3, 2, 5))
        self.assertTrue(mx.array_equal(c, mx.ones((3, 2, 5), dtype=mx.bool_)))

    def test_bitwise_grad(self):
        """The VJP of a bitwise AND is all zeros for both inputs."""
        dims = (4, 3)
        a = mx.array(np.random.randint(0, 10, size=dims))
        b = mx.array(np.random.randint(0, 10, size=dims))
        cotangent = mx.array(np.random.randint(0, 10, size=dims))

        def bitwise(lhs, rhs):
            return lhs.astype(mx.int32) & rhs.astype(mx.int32)

        _, input_grads = mx.vjp(bitwise, [a, b], [cotangent])
        for grad in input_grads:
            has_nonzero = np.any(np.array(grad))
            self.assertFalse(has_nonzero)

    def test_conjugate(self):
        """``conjugate``/``conj`` (function and method forms) agree with NumPy."""
        dims = (3, 5, 7)
        real_part = np.random.normal(size=dims)
        imag_part = np.random.normal(size=dims)
        z = (real_part + 1j * imag_part).astype(np.complex64)

        # Module-level functions.
        for name in ["conjugate", "conj"]:
            got = getattr(mx, name)(mx.array(z))
            want = getattr(np, name)(z)
            self.assertTrue(np.array_equal(np.array(got), want))

        # Array method.
        got = mx.array(z).conj()
        want = z.conj()
        self.assertTrue(np.array_equal(np.array(got), want))

    def test_view(self):
        """``array.view`` reinterprets data like NumPy and round-trips on strided inputs."""
        view_names = ["bool_", "int16", "float32", "int64"]

        def assert_round_trip(arr):
            # Viewing as another dtype and back to int32 must recover ``arr``.
            for name in view_names:
                viewed = arr.view(getattr(mx, name))
                restored = viewed.view(mx.int32)
                self.assertTrue(mx.array_equal(restored, arr, equal_nan=True))

        # Check scalar
        scalar = mx.array(1, mx.int8).view(mx.uint8).item()
        self.assertEqual(scalar, 1)

        a = mx.random.randint(shape=(4, 2, 4), low=-100, high=100)
        a_np = np.array(a)
        for name in view_names:
            got = a.view(getattr(mx, name))
            want = a_np.view(getattr(np, name))
            self.assertTrue(np.array_equal(got, want, equal_nan=True))

        # Irregular strides
        a = mx.random.randint(shape=(2, 4), low=-100, high=100)
        a = mx.broadcast_to(a, shape=(4, 2, 4))
        assert_round_trip(a)

        a = mx.random.randint(shape=(4, 4), low=-100, high=100).T
        assert_round_trip(a)

    def _hadamard(self, N):
        # Matches scipy.linalg.hadamard
        H = np.array([[1]], dtype=np.int64)
        for i in range(0, np.log2(N).astype(np.int64)):
            H = np.vstack((np.hstack((H, H)), np.hstack((H, -H))))
        return H

    def test_hadamard(self):
        """Check ``mx.hadamard_transform`` against explicit Hadamard matrices.

        Covers: rejection of an empty input, scalar inputs (with and without
        ``scale``), a size-1 transform axis, CPU-vs-GPU agreement on large
        sizes (only when the default device is the GPU), and a NumPy reference
        for sizes ``m * 2**k`` with ``m`` in ``{1, 28}``.
        """
        with self.assertRaises(ValueError):
            mx.hadamard_transform(mx.array([]))

        # 28 x 28 sign matrix written as '+' / '-' characters, one row per line.
        h28_str = """
        +------++----++-+--+-+--++--
        -+-----+++-----+-+--+-+--++-
        --+-----+++---+-+-+----+--++
        ---+-----+++---+-+-+-+--+--+
        ----+-----+++---+-+-+++--+--
        -----+-----++++--+-+--++--+-
        ------++----++-+--+-+--++--+
        --++++-+-------++--+++-+--+-
        ---++++-+-----+-++--+-+-+--+
        +---+++--+----++-++--+-+-+--
        ++---++---+----++-++--+-+-+-
        +++---+----+----++-++--+-+-+
        ++++--------+-+--++-++--+-+-
        -++++--------+++--++--+--+-+
        -+-++-++--++--+--------++++-
        +-+-++--+--++--+--------++++
        -+-+-++--+--++--+----+---+++
        +-+-+-++--+--+---+---++---++
        ++-+-+-++--+------+--+++---+
        -++-+-+-++--+------+-++++---
        +-++-+---++--+------+-++++--
        -++--++-+-++-+++----++------
        +-++--++-+-++-+++-----+-----
        ++-++---+-+-++-+++-----+----
        -++-++-+-+-+-+--+++-----+---
        --++-++++-+-+----+++-----+--
        +--++-+-++-+-+----+++-----+-
        ++--++-+-++-+-+----++------+
        """

        def parse_h_string(h_str):
            # '+' maps to 1, any other character to -1; rows are whitespace-separated.
            return np.array(
                [[1 if s == "+" else -1 for s in row] for row in h_str.split()]
            )

        h28 = parse_h_string(h28_str)

        # Scalar input is returned unchanged.
        x = mx.array(5)
        y = mx.hadamard_transform(x)
        self.assertEqual(y.item(), 5)

        # Scalar input with an explicit scale: 5 * 0.2 == 1.
        x = mx.array(5)
        y = mx.hadamard_transform(x, scale=0.2)
        self.assertEqual(y.item(), 1)

        # A last axis of size 1 leaves the input unchanged.
        x = mx.random.normal((8, 8, 1))
        y = mx.hadamard_transform(x)
        self.assertTrue(mx.all(y == x).item())

        # Too slow to compare to numpy so let's compare CPU to GPU
        if mx.default_device() == mx.gpu:
            rk = mx.random.key(42)
            for k in range(14, 17):
                for m in [1, 3, 5, 7]:
                    x = mx.random.normal((4, m * 2**k), key=rk)
                    y1 = mx.hadamard_transform(x, stream=mx.cpu)
                    y2 = mx.hadamard_transform(x, stream=mx.gpu)
                    self.assertLess(mx.abs(y1 - y2).max().item(), 5e-6)

        np.random.seed(7)
        tests = product([np.float32, np.float16, np.int32], [1, 28], range(1, 14))
        for dtype, m, k in tests:
            # skip large m=28 cases because they're very slow in NumPy
            if m > 1 and k > 8:
                continue
            with self.subTest(dtype=dtype, m=m, k=k):
                n = m * 2**k
                b = 4
                scale = 0.34
                x = np.random.normal(size=(b, n)).astype(dtype)
                # contiguity check
                x = mx.array(x)[::2]
                y = mx.hadamard_transform(x, scale=scale)
                mx.eval(y)
                # Reference matrix: plain Hadamard for m == 1, otherwise the
                # Kronecker product of the 28 x 28 matrix with it.
                h = (
                    self._hadamard(2**k)
                    if m == 1
                    else np.kron(h28, self._hadamard(2**k))
                )
                y_np = np.einsum("ij,bj->bi", h, x) * scale
                # Non-float32 dtypes get a looser tolerance that grows with k.
                atol = 2e-4 if dtype == np.float32 else 5e-2 * k
                np.testing.assert_allclose(y, y_np, atol=atol)

                # bfloat16 emulation on M1 means 2**14 doesn't fit in threadgroup memory
                if dtype == np.float16 and k < 14:
                    y_bf16 = mx.hadamard_transform(x.astype(mx.bfloat16), scale=scale)
                    np.testing.assert_allclose(
                        y_bf16.astype(mx.float16), y, atol=atol * 2
                    )

    def test_hadamard_grad_vmap(self):
        """VJP and nested ``vmap`` of ``mx.hadamard_transform`` match a matmul reference."""
        np.random.seed(4)

        for k in range(2, 8):
            n = 2**k
            # Draw order (x, then c) is kept so the seeded values are unchanged.
            x_np = np.random.normal(size=(n,))
            h_np = self._hadamard(n)
            c_np = np.random.normal(size=(n,))
            x = mx.array(x_np).astype(mx.float32)
            h = mx.array(h_np).astype(mx.float32)
            c = mx.array(c_np).astype(mx.float32)

            def reference_transform(v):
                # Explicit matrix form, normalised by sqrt of the last dimension.
                return h @ v / mx.sqrt(v.shape[-1])

            expected = mx.vjp(reference_transform, [x], [c])
            actual = mx.vjp(mx.hadamard_transform, [x], [c])
            np.testing.assert_allclose(expected, actual, atol=1e-4)

            for axis in (0, 1, 2):
                ref_batched = mx.vmap(mx.vmap(reference_transform, 0, 0), axis, axis)
                op_batched = mx.vmap(mx.vmap(mx.hadamard_transform, 0, 0), axis, axis)

                xb = mx.array(np.random.normal(size=(n, n, n)))
                expected = ref_batched(xb)
                actual = op_batched(xb)
                np.testing.assert_allclose(expected, actual, atol=1e-4)

    def test_roll(self):
        """``mx.roll`` matches ``np.roll`` with and without explicit axes."""
        x = mx.arange(10).reshape(2, 5)

        def assert_same(want, got):
            self.assertTrue(mx.array_equal(want, got).item())

        # No axis given: scalar shift and a tuple of shifts.
        for shift in [-2, -1, 0, 1, 2]:
            assert_same(np.roll(x, shift), mx.roll(x, shift))

            triple = (shift, shift, shift)
            assert_same(np.roll(x, triple), mx.roll(x, triple))

        shifts = [
            1,
            2,
            -1,
            -2,
            (1, 1),
            (-1, 2),
            (33, 33),
        ]
        axes = [
            0,
            1,
            (1, 0),
            (0, 1),
            (0, 0),
            (1, 1),
        ]
        # Every shift/axis combination.
        for shift in shifts:
            for axis in axes:
                assert_same(np.roll(x, shift, axis), mx.roll(x, shift, axis))

    def test_roll_errors(self):
        """Rolling an empty array returns an array equal to the input."""
        empty = mx.array([])
        rolled = mx.roll(empty, [0], [0])
        self.assertTrue(mx.array_equal(rolled, empty))

    def test_real_imag(self):
        """``mx.real``/``mx.imag`` on real and on complex inputs."""
        re = mx.random.uniform(shape=(4, 4))

        # Real input: real part is the input, imaginary part is zero.
        self.assertTrue(mx.array_equal(re, mx.real(re)))
        self.assertTrue(mx.array_equal(mx.zeros_like(re), mx.imag(re)))

        # Complex input: each part comes back as float32.
        im = mx.random.uniform(shape=(4, 4))
        z = re + 1j * im
        self.assertEqual(mx.real(z).dtype, mx.float32)
        self.assertTrue(mx.array_equal(mx.real(z), re))
        self.assertEqual(mx.imag(z).dtype, mx.float32)
        self.assertTrue(mx.array_equal(mx.imag(z), im))

    def test_dynamic_slicing(self):
        """``mx.slice`` / ``mx.slice_update`` with array-valued start indices."""
        start = mx.array([1, 2, 3])
        axes = (0, 1, 2)

        # Read a block starting at (1, 2, 3).
        src = mx.random.randint(0, 100, shape=(4, 4, 4))
        want = src[1:, 2:, 3:]
        got = mx.slice(src, start, axes, (3, 2, 1))
        self.assertTrue(mx.array_equal(want, got))

        # Write a block starting at (1, 2, 3).
        base = mx.zeros(shape=(4, 4, 4))
        patch = mx.random.randint(0, 100, shape=(3, 2, 1))
        got = mx.slice_update(base, patch, start, axes)
        want = mx.zeros_like(base)
        want[1:, 2:, 3:] = patch
        self.assertTrue(mx.array_equal(want, got))

    def test_broadcast_arrays(self):
        """``mx.broadcast_arrays`` keeps dtypes and yields the common shape."""
        # Scalars: shapes stay empty and each dtype is preserved.
        int_out, float_out = mx.broadcast_arrays(mx.array(1), mx.array(1.0))
        self.assertEqual(int_out.shape, ())
        self.assertEqual(int_out.dtype, mx.int32)
        self.assertEqual(float_out.shape, ())
        self.assertEqual(float_out.dtype, mx.float32)

        # Different ranks broadcast to a shared shape.
        lhs, rhs = mx.broadcast_arrays(mx.zeros((3, 1, 2)), mx.zeros((4, 1)))
        self.assertEqual(lhs.shape, (3, 4, 2))
        self.assertEqual(rhs.shape, (3, 4, 2))

    def test_slice_update_reversed(self):
        """Strided assignment into a reversed slice writes the expected elements."""
        forward = mx.array([1, 2, 3, 4])
        backward = forward[::-1]
        backward[::2] = 0
        expected = mx.array([0, 3, 0, 1])
        self.assertTrue(mx.array_equal(backward, expected))

    def test_slice_with_negative_stride(self):
        """Reversing with ``[::-1]`` flips rows; an even number of flips is a no-op."""
        mat = mx.random.uniform(shape=(128, 4))
        flipped = mat[::-1]
        self.assertTrue(mx.array_equal(flipped[-1, :], mat[0, :]))

        seq = mx.arange(8)
        for _ in range(4):
            seq = seq[::-1]
        self.assertTrue(mx.array_equal(seq, mx.arange(8)))

    def test_complex_ops(self):
        """Unary ops on complex inputs agree with NumPy."""
        z = mx.array(
            [
                3.0 + 4.0j,
                -5.0 + 12.0j,
                -8.0 + 0.0j,
                0.0 + 9.0j,
                0.0 + 0.0j,
            ]
        )

        for name in ["arccos", "arcsin", "arctan", "square", "sqrt"]:
            with self.subTest(op=name):
                want = getattr(np, name)(z)
                got = getattr(mx, name)(z)
                self.assertTrue(np.allclose(got, want))

        # rsqrt uses a separate input whose last entry is nonzero.
        z = mx.array(
            [
                3.0 + 4.0j,
                -5.0 + 12.0j,
                -8.0 + 0.0j,
                0.0 + 9.0j,
                9.0 + 1.0j,
            ]
        )
        want = 1.0 / np.sqrt(z)
        self.assertTrue(np.allclose(mx.rsqrt(z), want))

    def test_complex_power(self):
        """Complex zero raised to 2 is zero; raised to NaN is NaN."""
        zero = mx.array(0j)

        squared = mx.power(zero, 2)
        self.assertEqual(squared.item(), 0j)

        nan_power = mx.power(mx.array(0j), float("nan"))
        self.assertTrue(mx.isnan(nan_power))

    def test_irregular_alignments(self):
        """Unary, binary and ternary ops on inputs sliced off by one element."""
        # Unaligned unary op
        ones = mx.ones((64, 1))
        negated = -ones[1:]
        self.assertTrue(mx.all(negated == -1.0))

        # Unaligned binary op
        ones = mx.ones((64, 1))
        tail = ones[1:]
        doubled = tail + tail
        self.assertTrue(mx.all(doubled == 2.0))

        # Unaligned ternary op
        ones = mx.ones((64, 1))
        zeros = mx.zeros((63, 1))
        cond = mx.ones((63, 1)).astype(mx.bool_)
        picked = mx.where(cond, ones[1:], zeros)
        self.assertTrue(mx.all(picked == 1.0))

    def test_integer_power(self):
        """Integer ``power`` is exact, and a negative exponent terminates."""
        x = mx.power(2, mx.array([8, 8, 8, 8, 8, 8, 8, 8]))
        self.assertTrue(mx.all(x == 256))

        # Doesn't hang. MLX builds the graph lazily, so the result has to be
        # evaluated for the computation to actually run.
        x = mx.power(2, -1)
        mx.eval(x)

    def test_depends(self):
        """``mx.depends`` returns its inputs unchanged, for list and array forms."""
        # List form: a list in, a list out.
        base = mx.array([1.0, 2.0, 3.0])
        value = mx.exp(base)
        dependency = mx.log(base)
        (out,) = mx.depends([value], [dependency])[:1]
        self.assertTrue(mx.array_equal(out, value))

        # Single-array form.
        base = mx.array([1.0, 2.0, 3.0])
        value = mx.exp(base)
        dependency = mx.log(base)
        out = mx.depends(value, dependency)
        self.assertTrue(mx.array_equal(out, value))

    def test_masked_scatter(self):
        """Boolean-mask assignment (``a[mask] = src``) across several input layouts."""
        # boolean mask updates matching numpy semantics
        vec = mx.array([1.0, 2.0, 3.0])
        vec[mx.array([True, False, True])] = mx.array([5.0, 6.0])
        self.assertTrue(mx.array_equal(vec, mx.array([5.0, 2.0, 6.0])))

        # non-boolean mask raises
        vec = mx.array([1.0, 2.0, 3.0])
        int_mask = mx.array([1, 0, 1])
        values = mx.array([4.0, 5.0])
        with self.assertRaises((TypeError, ValueError)):
            vec[int_mask] = values

        # mask matching leading dimension selects entire trailing slices
        mat = mx.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
        row_mask = mx.array([True, False])
        mat[row_mask] = mx.array([2.0, 3.0, 4.0])
        want = mx.array([[2.0, 3.0, 4.0], [1.0, 1.0, 1.0]])
        self.assertTrue(mx.array_equal(mat, want))

        # scalar source applies to all selected entries
        mat = mx.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
        row_mask = mx.array([True, False])
        mat[row_mask] = 2.0
        want = mx.array([[2.0, 2.0, 2.0], [1.0, 1.0, 1.0]])
        self.assertTrue(mx.array_equal(mat, want))

        # mask with no updates leaves values unchanged
        mat = mx.array([[7.0, 8.0], [9.0, 10.0]])
        none_selected = mx.zeros_like(mat).astype(mx.bool_)
        mat[none_selected] = mx.array([1.0])
        self.assertTrue(mx.array_equal(mat, mx.array([[7.0, 8.0], [9.0, 10.0]])))

        # empty mask leaves array unchanged
        empty = mx.zeros((0,), dtype=mx.float32)
        empty_mask = mx.zeros((0,), dtype=mx.bool_)
        empty[empty_mask] = mx.zeros((0,), dtype=mx.float32)
        self.assertTrue(mx.array_equal(empty, mx.zeros((0,), dtype=mx.float32)))

        # strided target, mask, and source derived from slices
        strided_target = mx.arange(10.0, dtype=mx.float32)[1::2]
        strided_mask = mx.array(
            [False, True, False, False, True, False, False, True, False, False],
            dtype=mx.bool_,
        )[1::2]
        strided_src = mx.arange(-4.0, 0.0, dtype=mx.float32)[::2]

        strided_target[strided_mask] = strided_src
        want = mx.array([-4.0, 3.0, 5.0, -2.0, 9.0], dtype=mx.float32)
        self.assertTrue(mx.array_equal(strided_target, want))

    def test_broadcast_shapes(self):
        """``mx.broadcast_shapes`` returns the common shape or raises ``ValueError``."""
        # Each entry is (input shapes, expected broadcast shape).
        valid_cases = [
            # Basic broadcasting
            (((1, 2, 3), (3,)), (1, 2, 3)),
            (((4, 1, 6), (5, 6)), (4, 5, 6)),
            (((5, 1, 4), (1, 3, 4)), (5, 3, 4)),
            # Multiple arguments
            (((1, 1), (1, 8), (7, 1)), (7, 8)),
            (((6, 1, 5), (1, 7, 1), (6, 7, 5)), (6, 7, 5)),
            # Same shapes
            (((3, 4, 5), (3, 4, 5)), (3, 4, 5)),
            # Single argument
            (((2, 3),), (2, 3)),
            # Empty shapes
            (((), ()), ()),
            (((), (1,)), (1,)),
            (((1,), ()), (1,)),
            # Broadcasting with zeroes
            (((0,), (0,)), (0,)),
            (((1, 0, 5), (3, 1, 5)), (3, 0, 5)),
            (((5, 0), (0, 5, 0)), (0, 5, 0)),
        ]
        for shapes, expected in valid_cases:
            self.assertEqual(mx.broadcast_shapes(*shapes), expected)

        # Error cases: incompatible shapes, and no arguments at all.
        invalid_cases = [
            ((3, 4), (4, 3)),
            ((2, 3, 4), (2, 5, 4)),
            (),
        ]
        for shapes in invalid_cases:
            with self.assertRaises(ValueError):
                mx.broadcast_shapes(*shapes)

    def test_sort_nan(self):
        """``mx.sort`` places NaN last for each floating-point dtype."""
        for dtype in [mx.float32, mx.float16, mx.bfloat16]:
            with self.subTest(dtype=dtype):
                x = mx.array([3.0, mx.nan, 2.0, 0.0], dtype=dtype)
                expected = mx.array([0.0, 2.0, 3.0, mx.nan], dtype=dtype)
                self.assertTrue(mx.array_equal(mx.sort(x), expected, equal_nan=True))

        # TODO: sorting complex inputs that contain NaN is not covered here.
        # The previous version built such an array but never sorted or
        # asserted on it, so that dead statement was removed.

    def test_argsort_nan(self):
        """Gathering with ``mx.argsort`` indices yields sorted values with NaN last."""
        for dtype in [mx.float32, mx.float16, mx.bfloat16]:
            with self.subTest(dtype=dtype):
                values = mx.array([3.0, mx.nan, 2.0, 0.0], dtype=dtype)
                want = mx.array([0.0, 2.0, 3.0, mx.nan], dtype=dtype)
                order = mx.argsort(values)
                gathered = mx.take(values, order)
                self.assertTrue(mx.array_equal(gathered, want, equal_nan=True))

    def test_to_from_fp8(self):
        """``to_fp8`` followed by ``from_fp8`` round-trips these values, both signs."""
        magnitudes = mx.array(
            [448, 256, 192, 128, 96, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3, 2, 0.015625]
        )

        def round_trip(arr):
            return mx.from_fp8(mx.to_fp8(arr))

        self.assertTrue(mx.array_equal(round_trip(magnitudes), magnitudes))
        self.assertTrue(mx.array_equal(round_trip(-magnitudes), -magnitudes))


# When executed directly as a script, hand control to the test runner
# provided by ``mlx_tests``.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_optimizers.py
================================================
# Copyright © 2023 Apple Inc.

import inspect
import math
import unittest
from functools import partial

import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as opt
import mlx.utils
import mlx_tests
import numpy as np
from mlx.utils import tree_flatten, tree_map, tree_unflatten

# torch is an optional dependency: ``has_torch`` records whether it imported
# so that tests comparing against it can be skipped when it is missing.
try:
    import torch
    import torch.nn.functional as F
except ImportError:
    has_torch = False
else:
    has_torch = True


def get_all_optimizers():
    """Return ``{name: class}`` for every ``opt.Optimizer`` subclass in ``opt``.

    The base ``opt.Optimizer`` class itself is excluded.
    """
    return {
        name: member
        for name, member in inspect.getmembers(opt, inspect.isclass)
        if issubclass(member, opt.Optimizer) and member != opt.Optimizer
    }


def tree_equal(fn, *args):
    """Return True when ``fn`` is truthy on every leaf of the mapped trees.

    ``fn`` is applied leaf-wise over ``args`` with ``tree_map``; the result is
    flattened and all leaf values must be truthy.
    """
    leafwise = tree_map(fn, *args)
    for _, value in tree_flatten(leafwise):
        if not value:
            return False
    return True


# Optimizer classes exercised by the generic tests below, keyed by class name.
optimizers_dict = get_all_optimizers()
# MultiOptimizer is left out of the generic tests.
# NOTE(review): presumably because it cannot be constructed from a learning
# rate alone, as those tests do — confirm against its constructor.
del optimizers_dict["MultiOptimizer"]


class TestOptimizers(mlx_tests.MLXTestCase):
    def test_optimizer_state(self):
        """Optimizer state supports item assignment and wholesale replacement."""
        sgd = opt.SGD(0.1)

        # Item assignment on the existing state.
        sgd.state["hello"] = "world"
        self.assertEqual(sgd.state["hello"], "world")

        # Replacing the state object entirely.
        sgd.state = {0: 1}
        self.assertEqual(sgd.state, {0: 1})

    def test_optimizers(self):
        """Every optimizer returns updated parameters with unchanged shapes."""
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }
        grads = tree_map(mx.ones_like, params)

        for optimizer_cls in optimizers_dict.values():
            optimizer = optimizer_cls(0.1)
            updated = optimizer.apply_gradients(grads, params)
            mx.eval(updated)
            shapes_match = tree_equal(
                lambda before, after: before.shape == after.shape, params, updated
            )
            self.assertTrue(shapes_match)

    def test_types_conserved(self):
        """float16 parameters stay float16 after an update, for every optimizer."""
        params = {"w": mx.ones((5, 5), mx.float16)}
        grads = tree_map(mx.ones_like, params)
        for optimizer_cls in optimizers_dict.values():
            updated = optimizer_cls(0.1).apply_gradients(grads, params)
            self.assertEqual(updated["w"].dtype, mx.float16)

    def test_sgd(self):
        """SGD state ``v`` is zero after ``init`` and equals the gradient after one step."""
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }
        grads = tree_map(mx.ones_like, params)

        # Explicit init
        sgd = opt.SGD(learning_rate=1e-2, momentum=0.9)
        sgd.init(params)
        starts_at_zero = tree_equal(
            lambda p, s: mx.array_equal(s["v"], mx.zeros_like(p)),
            params,
            sgd.state,
        )
        self.assertTrue(starts_at_zero)

        # Implicit init
        sgd = opt.SGD(learning_rate=1e-2, momentum=0.9)
        sgd.apply_gradients(grads, params)
        tracks_gradient = tree_equal(
            lambda g, s: mx.array_equal(s["v"], g), grads, sgd.state
        )
        self.assertTrue(tracks_gradient)

    def test_rmsprop(self):
        """RMSprop state ``v`` is zero after ``init`` and ``(1 - alpha) * g`` after one step."""
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }
        grads = tree_map(mx.ones_like, params)

        # Explicit init
        rmsprop = opt.RMSprop(learning_rate=1e-2)
        rmsprop.init(params)
        starts_at_zero = tree_equal(
            lambda p, s: mx.array_equal(s["v"], mx.zeros_like(p)),
            params,
            rmsprop.state,
        )
        self.assertTrue(starts_at_zero)

        # Implicit init
        alpha = 0.99
        rmsprop = opt.RMSprop(learning_rate=1e-2, alpha=alpha)
        rmsprop.apply_gradients(grads, params)
        matches_first_step = tree_equal(
            lambda g, s: mx.allclose(s["v"], (1 - alpha) * g), grads, rmsprop.state
        )
        self.assertTrue(matches_first_step)

    def test_adagrad(self):
        """Adagrad state ``v`` is all zeros after an explicit ``init``."""
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }

        # Explicit init
        optim = opt.Adagrad(learning_rate=1e-2)
        optim.init(params)
        self.assertTrue(
            tree_equal(
                lambda p, s: mx.array_equal(s["v"], mx.zeros_like(p)),
                params,
                optim.state,
            )
        )

    def test_adadelta(self):
        """AdaDelta states ``v`` and ``u`` are all zeros after an explicit ``init``."""
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }

        # Explicit init
        optim = opt.AdaDelta(learning_rate=1e-2)
        optim.init(params)
        self.assertTrue(
            tree_equal(
                lambda p, s: mx.array_equal(s["v"], mx.zeros_like(p)),
                params,
                optim.state,
            )
        )
        self.assertTrue(
            tree_equal(
                lambda p, s: mx.array_equal(s["u"], mx.zeros_like(p)),
                params,
                optim.state,
            )
        )

    def test_adam(self):
        """Adam-family states start at zero; float16 params stay float16 after a step."""
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }
        grads = tree_map(mx.ones_like, params)

        # Explicit init
        for optimizer_cls in [opt.Adam, opt.AdamW, opt.Adamax]:
            optimizer = optimizer_cls(learning_rate=1e-2)
            optimizer.init(params)
            for key in ("v", "m"):
                starts_at_zero = tree_equal(
                    lambda p, s: mx.array_equal(s[key], mx.zeros_like(p)),
                    params,
                    optimizer.state,
                )
                self.assertTrue(starts_at_zero)

        # Test for correct gradient type propagation
        def to_half(x):
            return x.astype(mx.float16)

        params = tree_map(to_half, params)
        grads = tree_map(to_half, grads)
        adam = opt.Adam(1e-2, bias_correction=True)
        updated = adam.apply_gradients(grads, params)
        self.assertTrue(tree_equal(lambda p: p.dtype == mx.float16, updated))

    @unittest.skipIf(not has_torch, "requires Torch")
    def test_adamw_matches_pytorch(self):
        """One AdamW step in MLX gives the same gradients and parameters as PyTorch.

        Both models start from the same weights and see the same data, so the
        gradients and the updated parameters are compared with ``np.allclose``.
        """
        mx.random.seed(0)
        np.random.seed(0)

        model = nn.Linear(3, 1)
        # Snapshot the initial parameters before the MLX update modifies them.
        init_weight = np.array(model.weight.tolist())
        init_bias = np.array(model.bias.tolist())

        def loss_fn(model, x, y):
            pred = model(x)
            return nn.losses.mse_loss(pred, y)

        x = np.random.rand(3, 3)
        y = np.random.rand(3, 1)

        optimizer = opt.AdamW(learning_rate=3e-4, bias_correction=True)
        loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
        loss, grads = loss_and_grad_fn(model, mx.array(x), mx.array(y))
        optimizer.update(model, grads)

        # Equivalent torch code
        torch_model = torch.nn.Linear(3, 1)

        # copy over the parameters
        torch_model.weight.data = torch.tensor(init_weight, dtype=torch.float32)
        torch_model.bias.data = torch.tensor(init_bias, dtype=torch.float32)

        torch_optimizer = torch.optim.AdamW(torch_model.parameters(), lr=3e-4)
        torch_optimizer.zero_grad()
        pred = torch_model(torch.tensor(x, dtype=torch.float32))
        loss = torch.nn.MSELoss()(pred, torch.tensor(y, dtype=torch.float32))
        loss.backward()
        torch_optimizer.step()

        # Gradients must agree ...
        for name, param in torch_model.named_parameters():
            mlx_grad = np.array(grads[name])
            torch_grad = param.grad.detach().numpy()
            self.assertTrue(np.allclose(torch_grad, mlx_grad))

        # ... and so must the parameters after one optimizer step.
        for name, param in torch_model.named_parameters():
            mlx_param = np.array(model[name])
            torch_param = param.data.detach().numpy()
            self.assertTrue(np.allclose(torch_param, mlx_param))

    def test_lion(self):
        """Lion state ``m`` is all zeros after an explicit ``init``."""
        params = {
            "first": [mx.zeros((10,)), mx.zeros((1,))],
            "second": mx.zeros((1,)),
        }

        # Explicit init
        optim = opt.Lion(learning_rate=1e-2)
        optim.init(params)
        self.assertTrue(
            tree_equal(
                lambda p, s: mx.array_equal(s["m"], mx.zeros_like(p)),
                params,
                optim.state,
            )
        )

    def test_adafactor(self):
        """Adafactor keeps dtype and shape over two steps and counts them in ``step``."""

        def run_two_steps(x):
            # Apply two updates, checking dtype/shape each time; return the optimizer.
            params = {"x": x}
            grad = {"x": mx.ones_like(x)}
            optimizer = opt.Adafactor()
            for _ in range(2):
                updated = optimizer.apply_gradients(grad, params)
                self.assertEqual(updated["x"].dtype, x.dtype)
                self.assertEqual(updated["x"].shape, x.shape)
            return optimizer

        run_two_steps(mx.zeros((5, 5)))

        half_optimizer = run_two_steps(mx.zeros((5, 5), mx.float16))
        self.assertEqual(half_optimizer.state["step"], 2)

    def test_muon(self):
        """Muon: zero-initialised state, shape-preserving updates that change the params."""
        params = {
            "first": [mx.zeros((10, 5)), mx.zeros((1,))],
            "second": mx.zeros((3, 3)),
            "conv": mx.zeros((16, 8, 3, 3)),
        }
        grads = tree_map(mx.ones_like, params)

        # Explicit init
        muon = opt.Muon(learning_rate=1e-2, momentum=0.95, nesterov=True)
        muon.init(params)
        starts_at_zero = tree_equal(
            lambda p, s: mx.array_equal(s["v"], mx.zeros_like(p)),
            params,
            muon.state,
        )
        self.assertTrue(starts_at_zero)

        # Test update
        updated = muon.apply_gradients(grads, params)

        # Check that shapes are preserved
        same_shapes = tree_equal(lambda p, u: p.shape == u.shape, params, updated)
        self.assertTrue(same_shapes)

        # Check that parameters actually changed
        unchanged = tree_equal(lambda p, u: mx.array_equal(p, u), params, updated)
        self.assertFalse(unchanged)

        # Test with different configurations
        without_nesterov = opt.Muon(learning_rate=1e-2, momentum=0.95, nesterov=False)
        without_nesterov.apply_gradients(grads, params)

        without_momentum = opt.Muon(learning_rate=1e-2, momentum=0.0)
        without_momentum.apply_gradients(grads, params)

    def test_compiled_optimizer(self):
        """One SGD step gives the same parameters uncompiled and under ``mx.compile``.

        Three variants are run from the same starting parameters: an
        uncompiled step, a compiled step that takes and returns parameters and
        optimizer state explicitly, and a compiled step that declares model and
        optimizer state through ``inputs``/``outputs``.
        """
        model = nn.Linear(10, 10)
        x = mx.random.uniform(shape=(2, 10))
        optim = opt.SGD(learning_rate=1e-2, momentum=0.9)

        # Starting parameters, restored before each later variant.
        orig_params = model.parameters()

        def loss(model, x):
            return model(x).sum()

        # Uncompiled version
        def step(x):
            _, grad = nn.value_and_grad(model, loss)(model, x)
            optim.update(model, grad)

        step(x)
        uncompiled_params = model.parameters()

        # Pure version
        def loss(params, x):
            model.update(params)
            return model(x).sum()

        # Reset the model and use a fresh optimizer.
        model.update(orig_params)
        optim = opt.SGD(learning_rate=1e-2, momentum=0.9)

        @mx.compile
        def step(params, opt_state, x):
            grad = mx.grad(loss)(params, x)
            optim.state = opt_state
            params = optim.apply_gradients(grad, params)
            return params, optim.state

        optim.init(model.parameters())
        pure_params, _ = step(model.parameters(), optim.state, x)
        self.assertTrue(mx.allclose(pure_params["weight"], uncompiled_params["weight"]))
        self.assertTrue(mx.allclose(pure_params["bias"], uncompiled_params["bias"]))

        # Impure version
        def loss(model, x):
            return model(x).sum()

        # Reset the model and use a fresh optimizer.
        model.update(orig_params)
        optim = opt.SGD(learning_rate=1e-2, momentum=0.9)
        # State objects handed to mx.compile as both inputs and outputs.
        state = [model.state, optim.state]

        @partial(mx.compile, inputs=state, outputs=state)
        def step(x):
            _, grad = nn.value_and_grad(model, loss)(model, x)
            optim.update(model, grad)

        step(x)
        impure_params = model.parameters()
        self.assertTrue(
            mx.allclose(impure_params["weight"], uncompiled_params["weight"])
        )
        self.assertTrue(mx.allclose(impure_params["bias"], uncompiled_params["bias"]))

    def test_update_lr_compiled(self):
        """Changing ``learning_rate`` is picked up by an already-compiled update."""
        params = {"w": mx.ones((5, 5))}
        grads = tree_map(mx.ones_like, params)
        optim = opt.SGD(-1.0)

        @partial(mx.compile, inputs=optim.state)
        def update(grads):
            return optim.apply_gradients(grads, params)

        # lr = -1: 1 - (-1) * 1 == 2
        first = update(grads)
        self.assertTrue(mx.allclose(first["w"], mx.full((5, 5), 2.0)))

        # lr = -2: 1 - (-2) * 1 == 3
        optim.learning_rate = -2.0
        second = update(grads)
        self.assertTrue(mx.allclose(second["w"], mx.full((5, 5), 3.0)))


# Learning-rate schedule checks, followed by a few optimizer utility tests
# (gradient clipping, restoring state, MultiOptimizer).
class TestSchedulers(mlx_tests.MLXTestCase):
    def test_decay_lr(self):
        # Every optimizer class in ``optimizers_dict`` should expose the
        # step-decayed learning rate after each ``apply_gradients`` call.
        for make_optimizer in optimizers_dict.values():
            schedule = opt.step_decay(1e-1, 0.9, 1)
            optimizer = make_optimizer(learning_rate=schedule)

            weights = {"w": mx.ones((5, 5))}
            unit_grads = tree_map(mx.ones_like, weights)

            for step in range(10):
                optimizer.apply_gradients(unit_grads, weights)
                self.assertAlmostEqual(
                    optimizer.learning_rate, 0.1 * (0.9**step), delta=1e-7
                )

    def test_step_decay(self):
        # With a step size of 1000, step 2500 is expected to have decayed twice.
        schedule = opt.step_decay(1e-1, 0.9, 1000)
        self.assertAlmostEqual(schedule(2500), 0.1 * (0.9**2), delta=1e-7)

    def test_exponential_decay(self):
        # Ten steps of a 0.99 decay applied to an initial rate of 0.1.
        schedule = opt.exponential_decay(1e-1, 0.99)
        self.assertAlmostEqual(schedule(10), 0.1 * (0.99**10), delta=1e-7)

    def test_cosine_decay(self):
        # Plain cosine decay evaluated part-way through its 10 steps.
        plain = opt.cosine_decay(0.1, 10)
        halfway_value = 0.1 * 0.5 * (1.0 + math.cos(math.pi * 4 / 10))
        self.assertAlmostEqual(plain(4), halfway_value, delta=1e-7)

        # With an end value: still above it before the last decay step, and
        # equal to it once the step count is past the decay window.
        floor = 0.05
        floored = opt.cosine_decay(0.1, 10, floor)
        self.assertGreater(floored(9), floor)
        self.assertEqual(floored(20), floor)

    def test_schedule_joiner(self):
        constants = [lambda _: 3, lambda _: 4, lambda _: 5]

        # Three boundaries for three schedules is rejected.
        with self.assertRaises(ValueError):
            opt.schedulers.join_schedules(constants, [2, 3, 4])

        # Two boundaries: the active schedule switches at steps 2 and 4.
        joined = opt.schedulers.join_schedules(constants, [2, 4])
        for step, want in ((0, 3), (1, 3), (2, 4), (3, 4), (5, 5), (7, 5)):
            self.assertEqual(joined(step).item(), want)

    def test_linear_warmup_with_cosine_decay(self):
        # NOTE(review): every ``delta=1e-1`` below is several orders of
        # magnitude larger than the ~1e-5 values being compared, so those
        # assertions are very loose - consider tightening them.
        warmup = opt.schedulers.linear_schedule(0.0, 1e-5, 100)
        decay = opt.schedulers.cosine_decay(1e-5, 100)
        combined = opt.schedulers.join_schedules([warmup, decay], [101])

        self.assertEqual(combined(0), 0.0)
        self.assertAlmostEqual(combined(101), 1e-5, delta=1e-1)

        optimizer = opt.Adam(learning_rate=combined)

        def run_empty_updates(count):
            # Empty model / gradients: only the optimizer's own state advances.
            for _ in range(count):
                optimizer.update({}, {})

        run_empty_updates(100)
        self.assertAlmostEqual(optimizer.learning_rate.item(), 1e-5, delta=1e-1)

        run_empty_updates(100)
        want = 1e-5 * 0.5 * (1.0 + math.cos(math.pi * 200 / 10))
        self.assertAlmostEqual(optimizer.learning_rate.item(), want, delta=1e-1)

    def test_compile_with_schedule(self):
        # A compiled update with the optimizer state as both input and output
        # must keep the scheduled learning rate moving from call to call.
        schedule = opt.exponential_decay(1e-1, 0.9)
        optimizer = opt.SGD(learning_rate=schedule)

        def advance():
            optimizer.update({}, {})

        compiled_advance = mx.compile(
            advance, inputs=optimizer.state, outputs=optimizer.state
        )

        for step in range(5):
            compiled_advance()
            self.assertAlmostEqual(schedule(step), optimizer.learning_rate.item())

    def test_clip_grad_norm(self):
        # Gradients whose norm is far below the limit must come back unchanged.
        small = {
            "first": [mx.array([0.1, 0.2]), mx.array([0.1])],
            "second": mx.array([0.3]),
        }
        untouched, _ = opt.clip_grad_norm(small, 10.0)
        self.assertTrue(
            tree_equal(mx.array_equal, small, untouched),
            "Gradients should not be modified when clipping is not necessary.",
        )

        # Gradients well above the limit must be scaled down to it.
        large = {
            "first": [mx.array([10, 20]), mx.array([10])],
            "second": mx.array([30]),
        }
        limit = 1.0
        clipped, total_norm = opt.clip_grad_norm(large, limit)

        # Recompute the global norm from the leaves of the clipped tree.
        squared_sum = sum(mx.square(leaf).sum() for _, leaf in tree_flatten(clipped))
        self.assertAlmostEqual(
            mx.sqrt(squared_sum).item(),
            limit,
            places=6,
            msg="Clipped gradients norm should be close to the specified max_norm.",
        )

        # Each leaf should equal the input scaled by limit / reported norm.
        factor = limit / total_norm
        expected = tree_map(lambda leaf: leaf * factor, large)
        self.assertTrue(
            tree_equal(lambda a, b: mx.allclose(a, b, atol=1e-6), expected, clipped),
            "Gradients were not scaled correctly during clipping.",
        )

    def test_init_from_state(self):
        # An optimizer whose state was assigned from a flattened/unflattened
        # copy of another optimizer's state must be usable for an update.
        class Net(nn.Module):
            def __init__(self):
                super().__init__()
                self.l1 = nn.Linear(2, 2)
                self.drop = nn.Dropout(p=0.5)
                self.l2 = nn.Linear(2, 2)
                self.vals = [nn.Linear(2, 2), nn.ReLU(), nn.ReLU()]

        net = Net()
        source = opt.Adam(learning_rate=3e-4)
        source.init(net.trainable_parameters())

        # Flatten the state the way it would be done for serialization.
        flat_state = tree_flatten(source.state)

        # Fresh optimizer that only receives the unflattened state.
        restored = opt.Adam(learning_rate=3e-4)
        restored.state = tree_unflatten(flat_state)

        # The parameters double as stand-in gradients; this must not raise.
        restored.update(net, net.trainable_parameters())

    def test_multi_optimizer(self):
        # MultiOptimizer with one filter: parameters with ndim > 1 go to the
        # first optimizer (Adam), everything else to the second one (SGD).
        class Net(nn.Module):
            def __init__(self):
                super().__init__()
                self.l1 = nn.Linear(2, 2)
                self.drop = nn.Dropout(p=0.5)
                self.l2 = nn.Linear(2, 2)
                self.vals = [nn.Linear(2, 2), nn.ReLU(), nn.ReLU()]

        def is_matrix(name, weight):
            return weight.ndim > 1

        net = Net()
        multi = opt.MultiOptimizer(
            [opt.Adam(learning_rate=0.001), opt.SGD(learning_rate=0.1)],
            [is_matrix],
        )
        multi.init(net.trainable_parameters())

        states = multi.state["states"]
        self.assertEqual(len(states), 2)

        adam_entries = tree_flatten(states[0])
        sgd_entries = tree_flatten(states[1])
        # The "- 2" presumably discounts two non-parameter entries per state
        # (e.g. step and learning rate) - TODO confirm against the optimizer.
        self.assertEqual((len(sgd_entries) - 2) * 2, len(adam_entries) - 2)
        self.assertFalse(any("bias" in key for key, _ in adam_entries))
        self.assertFalse(any("weight" in key for key, _ in sgd_entries))


# When this file is executed directly as a script, hand control to the
# project's test runner from ``mlx_tests``.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_quantized.py
================================================
# Copyright © 2023 Apple Inc.

import unittest
from itertools import product

import mlx.core as mx
import mlx_tests


class TestQuantized(mlx_tests.MLXTestCase):
    def test_quantize_dequantize(self):
        # Round-trip a random matrix through quantize/dequantize for every
        # group size / bit width pair and require the per-element error to stay
        # within the magnitude of the corresponding group's scale.
        w = mx.random.normal(shape=(128, 512))
        for gs in [32, 64, 128]:
            for b in [2, 3, 4, 5, 6, 8]:
                with self.subTest(gs=gs, b=b):
                    w_q, scales, biases = mx.quantize(w, group_size=gs, bits=b)
                    w_hat = mx.dequantize(w_q, scales, biases, gs, b)
                    # One row of errors per scale, i.e. per quantization group.
                    errors = (w - w_hat).abs().reshape(*scales.shape, -1)
                    eps = 1e-6
                    # Take the magnitude first and add the tolerance afterwards:
                    # adding eps before abs() would shrink the bound whenever a
                    # scale is negative instead of widening it.
                    bound = scales[..., None].abs() + eps
                    self.assertTrue((errors <= bound).all())

        # test quantize/dequantize 0s
        a = mx.zeros((256, 512))
        for gs in [32, 64, 128]:
            for b in [2, 3, 4, 5, 6, 8]:
                with self.subTest(gs=gs, b=b, zeros=True):
                    w_q, scales, biases = mx.quantize(a, gs, b)
                    a_hat = mx.dequantize(w_q, scales, biases, gs, b)
                    self.assertTrue(mx.all(a_hat == 0))

    def test_mxfp4_quantize_dequantize(self):
        # Round-trip through the "mxfp4" mode. The input is drawn from a table
        # of 16 signed values; unsupported bits / group_size / dtype arguments
        # must raise ValueError.
        magnitudes = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
        # Positive half followed by the negated half (including -0.0).
        table = mx.array(magnitudes + [-m for m in magnitudes])

        w = table[mx.random.randint(0, 16, shape=(128, 512))].reshape(-1, 32)
        # First entry of every 32-wide row is pinned to 6 - presumably so each
        # group has the same maximum magnitude; TODO confirm intent.
        w[:, 0] = 6
        w = (w + 3e-6).astype(mx.bfloat16)

        # Invalid bits / group size
        for rejected in ({"bits": 3}, {"group_size": 64}):
            with self.assertRaises(ValueError):
                mx.quantize(w, mode="mxfp4", **rejected)

        w_q, scales = mx.quantize(w, mode="mxfp4")
        for rejected in ({"bits": 3}, {"group_size": 64}):
            with self.assertRaises(ValueError):
                mx.dequantize(w_q, scales, mode="mxfp4", **rejected)

        # Invalid output type
        with self.assertRaises(ValueError):
            mx.dequantize(
                w_q, scales, group_size=32, bits=4, mode="mxfp4", dtype=mx.int32
            )

        restored = mx.dequantize(w_q, scales, mode="mxfp4")
        self.assertTrue(mx.allclose(w, restored, rtol=1e-5, atol=1e-5))

        # An all-zero input must come back as all zeros.
        zeros = mx.zeros((256, 512))
        z_q, z_scales = mx.quantize(zeros, mode="mxfp4")
        z_hat = mx.dequantize(z_q, z_scales, mode="mxfp4")
        self.assertTrue(mx.all(z_hat == 0))

    def test_mxfp8_quantize_dequantize(self):
        # Round-trip uniform values in [-1, 1) through the "mxfp8" mode with a
        # loose (1e-1) tolerance, and check that unsupported bits / group_size
        # arguments raise ValueError.
        w = (2 * mx.random.uniform(shape=(512, 32)) - 1).astype(mx.bfloat16)

        # Invalid bits / group size
        for rejected in ({"bits": 3}, {"group_size": 32, "bits": 7}):
            with self.assertRaises(ValueError):
                mx.quantize(w, mode="mxfp8", **rejected)

        w_q, scales = mx.quantize(w, group_size=32, mode="mxfp8")

        for rejected in ({"group_size": 16}, {"bits": 4}):
            with self.assertRaises(ValueError):
                mx.dequantize(w_q, scales, mode="mxfp8", **rejected)

        restored = mx.dequantize(w_q, scales, mode="mxfp8")
        self.assertTrue(mx.allclose(w, restored, rtol=1e-1, atol=1e-1))

        # An all-zero input must come back as all zeros.
        zeros = mx.zeros((256, 512))
        z_q, z_scales = mx.quantize(zeros, mode="mxfp8")
        z_hat = mx.dequantize(z_q, z_scales, mode="mxfp8")
        self.assertTrue(mx.all(z_hat == 0))

    def test_nvfp4_quantize_dequantize(self):
        # Round-trip through the "nvfp4" mode (rows of 16 values), check the
        # rejected bits / group_size arguments, and finally repeat the round
        # trip with an explicit global_scale where that is supported.
        magnitudes = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
        # Positive half followed by the negated half (including -0.0).
        table = mx.array(magnitudes + [-m for m in magnitudes])

        w = table[mx.random.randint(0, 16, shape=(128, 512))].reshape(-1, 16)
        # First entry of every 16-wide row is pinned to 6 - presumably so each
        # group has the same maximum magnitude; TODO confirm intent.
        w[:, 0] = 6
        w = (w + 3e-6).astype(mx.bfloat16)

        # Invalid bits / group size
        for rejected in ({"bits": 3}, {"group_size": 64}):
            with self.assertRaises(ValueError):
                mx.quantize(w, mode="nvfp4", **rejected)

        w_q, scales = mx.quantize(w, mode="nvfp4")

        for rejected in ({"bits": 3}, {"group_size": 32}):
            with self.assertRaises(ValueError):
                mx.dequantize(w_q, scales, mode="nvfp4", **rejected)

        restored = mx.dequantize(w_q, scales, mode="nvfp4")
        self.assertTrue(mx.allclose(w, restored, rtol=1e-5, atol=1e-5))

        # An all-zero input must come back as all zeros.
        zeros = mx.zeros((256, 512))
        z_q, z_scales = mx.quantize(zeros, mode="nvfp4")
        z_hat = mx.dequantize(z_q, z_scales, mode="nvfp4")
        self.assertTrue(mx.all(z_hat == 0))

        # Test nvfp4 quantize/dequantize with tensor-scale global_scale
        # currently supported only on cpu and cuda
        global_scale = (
            None if mx.metal.is_available() else w.abs().max().astype(mx.float32)
        )

        g_q, g_scales = mx.quantize(w, mode="nvfp4", global_scale=global_scale)
        g_hat = mx.dequantize(
            g_q,
            g_scales,
            group_size=16,
            bits=4,
            mode="nvfp4",
            global_scale=global_scale,
        )
        self.assertTrue(mx.allclose(w, g_hat, rtol=1e-5, atol=1e-5))

    def test_qqmv(self):
        # Compare ``mx.qqmm`` on a single-row input against a plain matmul of
        # the dequantized input with the dequantized (transposed) weights.
        k_x, k_w = mx.random.split(mx.random.key(0))
        cases = product(
            [256, 512, 67],  # M
            [64, 256],  # N
            ["nvfp4", "mxfp8"],  # mode
        )
        for M, N, mode in cases:
            with self.subTest(shape=(M, N), mode=mode):
                x = mx.random.normal(shape=(1, N), key=k_x)
                # Reference input: x after a quantize/dequantize round trip.
                x_packed = mx.quantize(x, mode=mode)
                x_hat = mx.dequantize(*x_packed, mode=mode, dtype=mx.float32)

                w = mx.random.normal(shape=(M, N), key=k_w)
                w_q, scales = mx.quantize(w, mode=mode)
                w_hat = mx.dequantize(w_q, scales, mode=mode, dtype=mx.float32)

                y_q = mx.qqmm(x, w_q, scales, mode=mode)
                y_hat = x_hat @ mx.swapaxes(w_hat, -1, -2)
                self.assertEqual(y_q.shape, y_hat.shape)
                self.assertLess((y_q - y_hat).abs().max(), 1e-3)

    def test_qmm(self):
        # ``quantized_matmul`` versus a matmul with dequantized weights over a
        # grid of group sizes, bit widths, shapes and both weight layouts.
        k_x, k_w = mx.random.split(mx.random.key(0))
        # Half precision when the default device is the GPU, float32 otherwise.
        on_gpu = mx.default_device() == mx.gpu
        dtype = mx.float16 if on_gpu else mx.float32
        # Slightly looser tolerance for the half-precision run.
        tol = 1e-3 if dtype == mx.float32 else 1.5e-3

        cases = product(
            [128, 64, 32],  # group_size
            [2, 4, 8],  # bits
            [8, 32, 33, 64],  # M
            [128, 256],  # N
            [128, 256],  # K
            [True, False],  # transposed
        )
        for group_size, bits, M, N, K, transposed in cases:
            with self.subTest(
                shape=(M, N, K),
                group_size=group_size,
                bits=bits,
                transposed=transposed,
            ):
                w_shape = (N, K) if transposed else (K, N)
                # Both operands are scaled by 1 / sqrt(K).
                x = (mx.random.normal(shape=(M, K), key=k_x) / K**0.5).astype(dtype)
                w = (mx.random.normal(shape=w_shape, key=k_w) / K**0.5).astype(dtype)

                w_q, scales, biases = mx.quantize(w, group_size, bits)
                w_hat = mx.dequantize(w_q, scales, biases, group_size, bits)
                y_q = mx.quantized_matmul(
                    x, w_q, scales, biases, transposed, group_size, bits
                )
                if transposed:
                    y_hat = x @ w_hat.T
                else:
                    y_hat = x @ w_hat

                self.assertEqual(y_q.shape, y_hat.shape)
                self.assertLess((y_q - y_hat).abs().max(), tol)

    def test_qmm_vjp(self):
        # The VJP of ``quantized_matmul`` with respect to x must equal a
        # quantized matmul of the cotangent with the opposite ``transpose``.
        k_x, k_w = mx.random.split(mx.random.key(0))

        bits, group_size = 8, 64
        M, N, K = 64, 1024, 512

        x = mx.random.normal(shape=(2, M, K), key=k_x)
        cotangent = mx.ones(shape=(2, M, N))

        for transposed in (True, False):
            w_shape = (N, K) if transposed else (K, N)
            w = mx.random.normal(shape=w_shape, key=k_w)
            w_q, scales, biases = mx.quantize(w, group_size, bits)

            def forward(inp):
                return mx.quantized_matmul(
                    inp, w_q, scales, biases, transposed, group_size, bits
                )

            _, vjps = mx.vjp(forward, primals=(x,), cotangents=(cotangent,))

            expected = mx.quantized_matmul(
                cotangent, w_q, scales, biases, not transposed, group_size, bits
            )
            self.assertTrue(mx.allclose(vjps[0], expected))

    def test_qmm_jvp(self):
        # The JVP of ``quantized_matmul`` with respect to x must equal the same
        # quantized matmul applied to the tangent.
        key = mx.random.key(0)
        k1, k2 = mx.random.split(key)

        bits = 8
        group_size = 64
        M = 64
        N = 128
        K = 128

        x = mx.random.normal(shape=(2, M, K), key=k1)
        # The tangent must have the shape of the primal ``x``, i.e. (2, M, K).
        # Use K rather than N here: the two only coincide because N == K in
        # this test, and N would break as soon as the sizes differ.
        x_tan = mx.ones(shape=(2, M, K))

        transposes = [True, False]
        for transposed in transposes:
            w = mx.random.normal(shape=(N, K) if transposed else (K, N), key=k2)
            w_q, scales, biases = mx.quantize(w, group_size, bits)

            def fn(x):
                return mx.quantized_matmul(
                    x, w_q, scales, biases, transposed, group_size, bits
                )

            _, jvp_out = mx.jvp(fn, primals=(x,), tangents=(x_tan,))

            expected_out = mx.quantized_matmul(
                x_tan, w_q, scales, biases, transposed, group_size, bits
            )
            self.assertTrue(mx.allclose(jvp_out[0], expected_out))

    def test_qmm_shapes(self):
        # ``quantized_matmul`` with 2-D and 4-D inputs, once against transposed
        # weights and once against non-transposed weights.
        k_x, k_w = mx.random.split(mx.random.key(0))
        group_size, bits = 64, 4
        input_shapes = [(3, 256), (2, 1, 7, 256)]

        for w_shape, transpose in (((32, 256), True), ((256, 256), False)):
            w = mx.random.normal(shape=w_shape, key=k_w)
            w_q, scales, biases = mx.quantize(w, group_size, bits)
            w_hat = mx.dequantize(w_q, scales, biases, group_size, bits)
            # Right-hand operand of the reference matmul.
            reference_w = w_hat.T if transpose else w_hat

            for s in input_shapes:
                x = mx.random.normal(shape=s, key=k_x)
                y_q = mx.quantized_matmul(
                    x, w_q, scales, biases, transpose, group_size, bits
                )
                y_hat = x @ reference_w
                self.assertEqual(y_q.shape, y_hat.shape)
                self.assertLess((y_q - y_hat).abs().max(), 1e-3)

    def test_qmv(self):
        # Single-row inputs against transposed quantized weights. B == 0 means
        # un-batched weights combined with a batch of three inputs.
        k_x, k_w = mx.random.split(mx.random.key(0))
        cases = product(
            [128, 64, 32],  # group_size
            [2, 3, 4, 5, 6, 8],  # bits
            [256, 512, 67],  # M
            [64, 256],  # N
            [0, 1, 3, 8],  # B
        )
        for group_size, bits, M, N, B in cases:
            # Combinations where the group is wider than a weight row are skipped.
            if group_size > N:
                continue
            with self.subTest(shape=(B, M, N), group_size=group_size, bits=bits):
                batched = B != 0
                x_shape = (B, 1, N) if batched else (3, 1, N)
                w_shape = (B, M, N) if batched else (M, N)

                x = mx.random.normal(shape=x_shape, key=k_x)
                w = mx.random.normal(shape=w_shape, key=k_w)
                w_q, scales, biases = mx.quantize(w, group_size, bits)
                w_hat = mx.dequantize(w_q, scales, biases, group_size, bits)

                y_q = mx.quantized_matmul(
                    x, w_q, scales, biases, True, group_size, bits
                )
                y_hat = x @ mx.swapaxes(w_hat, -1, -2)
                self.assertEqual(y_q.shape, y_hat.shape)
                self.assertLess((y_q - y_hat).abs().max(), 1e-3)

    def test_fp_qmv(self):
        # Single-row inputs against transposed weights quantized in the
        # floating-point modes, compared with a matmul on dequantized weights.
        key = mx.random.key(0)
        k1, k2 = mx.random.split(key)

        def check(x_shape, w_shape, mode):
            # One comparison: same keys every time, so inputs depend only on
            # the requested shapes.
            x = mx.random.normal(shape=x_shape, key=k1)
            w = mx.random.normal(shape=w_shape, key=k2)
            w_q, scales = mx.quantize(w, mode=mode)
            w_hat = mx.dequantize(w_q, scales, mode=mode)
            y_q = mx.quantized_matmul(
                x,
                w_q,
                scales,
                transpose=True,
                mode=mode,
            )
            y_hat = x @ mx.swapaxes(w_hat, -1, -2)
            self.assertEqual(y_q.shape, y_hat.shape)
            self.assertLess((y_q - y_hat).abs().max(), 1e-3)

        tests = product(
            [256, 512, 67],  # M
            [64, 256],  # N
            [0, 1, 3, 8],  # B
        )
        modes = ["mxfp4", "nvfp4", "mxfp8"]
        for M, N, B in tests:
            for mode in modes:
                with self.subTest(shape=(B, M, N), mode=mode):
                    # B == 0: un-batched weights with a batch of three inputs.
                    x_shape = (3, 1, N) if B == 0 else (B, 1, N)
                    w_shape = (M, N) if B == 0 else (B, M, N)
                    check(x_shape, w_shape, mode)

        # Test multiple of 16 but not 32
        M = 128
        N = 48
        mode = "nvfp4"
        # This case is un-batched, so the label carries only (M, N); it must
        # not reuse the ``B`` left over from the loop above.
        with self.subTest(shape=(M, N), mode=mode):
            check((1, N), (M, N), mode)

    def test_qvm(self):
        # Single-row inputs against non-transposed quantized weights. B == 0
        # means both operands are un-batched.
        k_x, k_w = mx.random.split(mx.random.key(0))
        cases = product(
            [128, 64, 32],  # group_size
            [2, 3, 4, 5, 6, 8],  # bits
            [32, 128, 256],  # M
            [128, 256, 67],  # N
            [0, 1, 3, 8],  # B
        )
        for group_size, bits, M, N, B in cases:
            with self.subTest(shape=(B, M, N), group_size=group_size, bits=bits):
                # Combinations where the group is wider than a weight row
                # (the last weight axis has size M) are skipped.
                if M < group_size:
                    continue

                unbatched = B == 0
                x_shape = (1, N) if unbatched else (B, 1, N)
                w_shape = (N, M) if unbatched else (B, N, M)

                x = mx.random.normal(shape=x_shape, key=k_x)
                w = mx.random.normal(shape=w_shape, key=k_w)
                w_q, scales, biases = mx.quantize(w, group_size, bits)
                w_hat = mx.dequantize(w_q, scales, biases, group_size, bits)

                y_q = mx.quantized_matmul(
                    x, w_q, scales, biases, False, group_size, bits
                )
                y_hat = x @ w_hat
                self.assertEqual(y_q.shape, y_hat.shape)
                self.assertLess((y_q - y_hat).abs().max(), 1e-3)

    def test_qvm_splitk(self):
        # Non-transposed quantized matmul with a very long reduction axis
        # (N = 16384). Going by the test name this targets a split-K code
        # path - not verifiable from this file.
        k_x, k_w = mx.random.split(mx.random.key(0))
        cases = product(
            [128, 64, 32],  # group_size
            [2, 4, 8],  # bits
            [128],  # M
            [16384],  # N
            [1, 3],  # B
        )
        for group_size, bits, M, N, B in cases:
            with self.subTest(shape=(B, M, N), group_size=group_size, bits=bits):
                unbatched = B == 0
                x_shape = (1, N) if unbatched else (B, 1, N)
                w_shape = (N, M) if unbatched else (B, N, M)

                # Inputs are scaled by 0.1.
                x = 1e-1 * mx.random.normal(shape=x_shape, key=k_x)
                w = 1e-1 * mx.random.normal(shape=w_shape, key=k_w)
                w_q, scales, biases = mx.quantize(w, group_size, bits)
                w_hat = mx.dequantize(w_q, scales, biases, group_size, bits)

                y_q = mx.quantized_matmul(
                    x, w_q, scales, biases, False, group_size, bits
                )
                y_hat = x @ w_hat
                self.assertEqual(y_q.shape, y_hat.shape)
                self.assertLess((y_q - y_hat).abs().max(), 2e-3)

        # Test with 1D vector
        group_size, bits, N = 32, 8, 2048
        vec = 1e-1 * mx.random.normal(shape=(N,), key=k_x)
        square_w = 1e-1 * mx.random.normal(shape=(N, N), key=k_w)
        w_q, scales, biases = mx.quantize(square_w, group_size, bits)
        w_hat = mx.dequantize(w_q, scales, biases, group_size, bits)

        y_q = mx.quantized_matmul(vec, w_q, scales, biases, False, group_size, bits)
        y_hat = vec @ w_hat
        self.assertEqual(y_q.shape, y_hat.shape)
        self.assertLess((y_q - y_hat).abs().max(), 2e-3)

    def test_fp_qvm(self):
        # Single-row inputs against non-transposed weights quantized in the
        # floating-point modes, compared with a matmul on dequantized weights.
        k_x, k_w = mx.random.split(mx.random.key(0))
        cases = [
            *product(
                [32, 128, 256],  # M
                [128, 256, 67],  # N
                [0, 1, 3, 8],  # B
            ),
            # Add a splitk
            (128, 16384, 0),
        ]

        for M, N, B in cases:
            for mode in ("mxfp4", "nvfp4", "mxfp8"):
                with self.subTest(shape=(B, M, N), mode=mode):
                    unbatched = B == 0
                    x_shape = (1, N) if unbatched else (B, 1, N)
                    w_shape = (N, M) if unbatched else (B, N, M)

                    x = mx.random.normal(shape=x_shape, key=k_x)
                    w = mx.random.normal(shape=w_shape, key=k_w)
                    w_q, scales = mx.quantize(w, mode=mode)
                    w_hat = mx.dequantize(w_q, scales, mode=mode)

                    y_q = mx.quantized_matmul(
                        x, w_q, scales, transpose=False, mode=mode
                    )
                    y_hat = x @ w_hat
                    self.assertEqual(y_q.shape, y_hat.shape)
                    self.assertLess((y_q - y_hat).abs().max(), 2e-3)

    def test_mode_error_cases(self):
        # Every call below is expected to raise ValueError: unknown mode,
        # non-floating inputs, missing biases, or integer scales / biases.
        def rejects(fn, *args, **kwargs):
            self.assertRaises(ValueError, fn, *args, **kwargs)

        affine = {"bits": 4, "group_size": 32}

        w = mx.random.normal(shape=(256, 256))
        x = mx.random.normal(shape=(1, 256))

        # Invalid mode
        rejects(mx.quantize, w, mode="xyz")

        wq, scales, biases = mx.quantize(w, **affine)

        rejects(mx.dequantize, wq, scales, biases, mode="xyz", **affine)
        rejects(mx.quantized_matmul, x, wq, scales, biases, mode="xyz", **affine)

        rhs_indices = mx.array(0)
        rejects(
            mx.gather_qmm,
            x,
            wq,
            scales,
            biases,
            rhs_indices=rhs_indices,
            mode="xyz",
            **affine,
        )

        # Only quantize floating point types
        rejects(mx.quantize, mx.zeros((128, 128), mx.int32))
        rejects(mx.quantize, mx.zeros((128, 128), mx.int32), mode="mxfp4")

        # Must have bias for affine
        rejects(mx.dequantize, wq, scales, None, **affine)
        rejects(mx.quantized_matmul, x, wq, scales, None, **affine)
        rejects(
            mx.gather_qmm, x, wq, scales, None, rhs_indices=rhs_indices, **affine
        )

        # Must be floating point
        int_x = mx.zeros(shape=(256,), dtype=mx.int32)
        int_scales = mx.zeros(scales.shape, dtype=mx.int32)
        int_biases = mx.zeros(int_scales.shape, dtype=mx.int32)
        rejects(mx.dequantize, wq, int_scales, int_biases, **affine)
        rejects(mx.quantized_matmul, int_x, wq, int_scales, int_biases, **affine)
        rejects(
            mx.gather_qmm,
            int_x,
            wq,
            int_scales,
            int_biases,
            rhs_indices=rhs_indices,
            **affine,
        )

    def test_throw(self):
        # Mismatched operand layouts must raise ValueError; the matching
        # transposed call must evaluate without error.
        x = mx.random.normal(shape=(10, 512))
        w = mx.random.normal(shape=(32, 512))
        w_q, scales, biases = mx.quantize(w)

        # Argument tuples (everything after ``x``) that have to be rejected.
        rejected_calls = [
            (w_q.T, scales, biases),
            (w_q.T, scales.T, biases),
            (w_q, scales, biases, False),
            (w_q, scales.T, biases.T),
        ]
        for args in rejected_calls:
            with self.assertRaises(ValueError):
                mx.quantized_matmul(x, *args)

        y = mx.quantized_matmul(x, w_q, scales, biases, True)
        mx.eval(y)

    def test_small_matrix(self):
        # Weights with only 8 rows (optionally batched), multiplied from both
        # sides and compared with a matmul on the dequantized weights.
        # (input shape, transpose) pairs, in the order they are exercised.
        input_cases = [
            ((3, 1, 256), True),  # qmv
            ((3, 4, 256), True),  # qmv
            ((3, 10, 256), True),  # qmm_t
            ((3, 1, 8), False),  # qvm
            ((3, 10, 8), False),  # qmm
        ]

        for w_shape in [(8, 256), (1, 8, 256), (3, 8, 256)]:
            with self.subTest(w_shape=w_shape):
                w = mx.random.normal(shape=(w_shape))
                w_q, scales, biases = mx.quantize(w)
                w_hat = mx.dequantize(w_q, scales, biases)
                w_hat_t = mx.swapaxes(w_hat, -1, -2)

                for x_shape, transpose in input_cases:
                    x = mx.random.normal(shape=x_shape)
                    y_q = mx.quantized_matmul(
                        x, w_q, scales, biases, transpose=transpose
                    )
                    y_hat = x @ (w_hat_t if transpose else w_hat)
                    self.assertEqual(y_q.shape, y_hat.shape)
                    self.assertLess((y_q - y_hat).abs().max(), 1e-3)

    def test_non_multiples(self):
        # Weight row counts that are not "round" (33, 3, 99), multiplied from
        # both sides and compared with a matmul on the dequantized weights.
        #
        # Each plan is (weight shape, [(input shape, transpose, check shape)]).
        # The qmv cases do not compare output shapes, only values.
        plans = [
            (
                (33, 256),
                [
                    ((1, 256), True, False),  # qmv
                    ((10, 256), True, True),  # qmm_t
                    ((1, 33), False, True),  # qvm
                    ((10, 33), False, True),  # qmm
                ],
            ),
            # Smaller than 8
            (
                (3, 256),
                [
                    ((1, 256), True, False),  # qmv
                    ((10, 256), True, True),  # qmm_t
                    ((1, 3), False, True),  # qvm
                    ((10, 3), False, True),  # qmm
                ],
            ),
            # Test with larger than 128 unaligned sizes
            ((99, 256), [((129, 256), True, True)]),
        ]

        for w_shape, input_cases in plans:
            w = mx.random.normal(shape=w_shape)
            w_q, scales, biases = mx.quantize(w)
            w_hat = mx.dequantize(w_q, scales, biases)

            for x_shape, transpose, check_shape in input_cases:
                x = mx.random.normal(shape=x_shape)
                y_q = mx.quantized_matmul(x, w_q, scales, biases, transpose=transpose)
                y_hat = x @ (w_hat.T if transpose else w_hat)
                if check_shape:
                    self.assertEqual(y_q.shape, y_hat.shape)
                self.assertLess((y_q - y_hat).abs().max(), 1e-3)

    def test_qmv_small_non_multiples(self):
        """Transposed quantized matmul with very small K and N.

        Computes ``[MxK] @ [NxK].T = [MxN]`` for a handful of tiny shapes and
        several quantization settings, comparing ``mx.quantized_matmul``
        against a dense matmul with the dequantized weights.
        """
        # (M, K, N): input rows, weight columns, weight rows.
        shapes = [(1, 32, 3), (2, 32, 10), (1, 32, 5), (4, 32, 7)]
        # (bits, group_size, mode) quantization settings.
        settings = [(4, 32, "affine"), (6, 32, "affine"), (4, 16, "nvfp4")]

        for M, K, N in shapes:
            for bits, group_size, mode in settings:
                qargs = dict(group_size=group_size, bits=bits, mode=mode)
                # No batch dimension is used here, hence batch=None in the label.
                with self.subTest(M=M, K=K, N=N, batch=None, **qargs):
                    weight = mx.random.normal(shape=(N, K))
                    packed, *extras = mx.quantize(weight, **qargs)
                    dense = mx.dequantize(packed, *extras, **qargs)

                    # qmv/qmm_t (transpose=True): [MxK] @ [NxK].T = [MxN]
                    inp = mx.random.normal(shape=(M, K))
                    got = mx.quantized_matmul(
                        inp, packed, *extras, transpose=True, **qargs
                    )
                    want = inp @ mx.swapaxes(dense, -1, -2)
                    self.assertEqual(got.shape, want.shape)
                    self.assertLess((got - want).abs().max(), 1e-3)

    def test_gather_qmm(self):
        """Compare ``mx.gather_qmm`` with ``mx.gather_mm`` on dequantized weights.

        For several batch/index configurations and quantization modes, the
        quantized gather-matmul must match (``atol=1e-4``) the dense
        gather-matmul computed with the dequantized weights.
        """

        def quantize(w, transpose=True, group_size=None, bits=None, mode="affine"):
            # Only the affine mode returns biases; the other modes leave b as None.
            q_kwargs = dict(group_size=group_size, bits=bits, mode=mode)
            if mode != "affine":
                qw, s = mx.quantize(w, **q_kwargs)
                b = None
            else:
                qw, s, b = mx.quantize(w, **q_kwargs)
            w_hat = mx.dequantize(qw, s, b, **q_kwargs)
            return (w_hat.swapaxes(-1, -2) if transpose else w_hat), qw, s, b

        def test_shape(
            M,
            N,
            K,
            dtype=mx.float32,
            batch_A=(),
            batch_B=(),
            lhs_indices=None,
            rhs_indices=None,
            transpose=True,
            group_size=None,
            bits=None,
            mode="affine",
        ):
            labels = dict(
                M=M,
                N=N,
                K=K,
                dtype=dtype,
                batch_A=batch_A,
                batch_B=batch_B,
                lhs_indices=lhs_indices,
                rhs_indices=rhs_indices,
                transpose=transpose,
                group_size=group_size,
                bits=bits,
                mode=mode,
            )
            with self.subTest(**labels):
                # Weights are stored (N, K) when transposed, (K, N) otherwise.
                w_tail = (N, K) if transpose else (K, N)
                x = mx.random.normal(shape=batch_A + (M, K)).astype(dtype)
                w = mx.random.normal(shape=batch_B + w_tail).astype(dtype)
                w_hat, qw, s, b = quantize(w, transpose, group_size, bits, mode=mode)

                lhs = None if lhs_indices is None else mx.array(lhs_indices)
                rhs = None if rhs_indices is None else mx.array(rhs_indices)

                expected = mx.gather_mm(x, w_hat, lhs, rhs)
                actual = mx.gather_qmm(
                    x,
                    qw,
                    s,
                    b,
                    lhs,
                    rhs,
                    transpose=transpose,
                    group_size=group_size,
                    bits=bits,
                    mode=mode,
                )
                self.assertTrue(mx.allclose(expected, actual, atol=1e-4))

        def case(batch_A, lhs_indices, batch_B, rhs_indices, **extra):
            # Build one keyword set for test_shape.
            return dict(
                batch_A=batch_A,
                lhs_indices=lhs_indices,
                batch_B=batch_B,
                rhs_indices=rhs_indices,
                **extra,
            )

        inputs = (
            case((1,), (0,), (3,), (2, 1)),
            case((1,), None, (3,), (2, 1)),
            case((2,), None, (3,), (2, 1)),
            case((3,), (0, 2), (1,), (0,)),
            case((5,), (0, 2), (3,), (2, 1)),
            case(
                (4, 2),
                ((7, 6), (5, 4), (1, 2)),
                (4, 1),
                ((2,), (0,), (1,)),
            ),
            case((1,), (0,), (3,), (2, 1), mode="nvfp4"),
            case((1,), (0,), (3,), (2, 1), mode="mxfp4"),
            case((1,), (0,), (3,), (2, 1), mode="mxfp8"),
        )

        # (M, N, K, transpose) problems run for every gather configuration.
        problems = (
            (1, 32, 128, True),
            (32, 32, 256, True),
            (1, 32, 256, True),
            (32, 256, 32, False),
            (1, 256, 32, False),
            (32, 32, 512, True),
            (1, 32, 512, True),
            (32, 512, 32, False),
            (1, 512, 32, False),
        )

        for kwargs in inputs:
            for M, N, K, transpose in problems:
                test_shape(M, N, K, transpose=transpose, **kwargs)

    def test_qmm_fp_type(self):
        """The output dtype of fp-mode quantized matmuls equals the input dtype.

        Checked for both ``mx.quantized_matmul`` and ``mx.gather_qmm`` with
        bfloat16, float16 and float32 inputs in the "mxfp8" and "mxfp4" modes.
        """
        rhs = mx.array([[2], [0], [1]], dtype=mx.uint32)

        for mode in ("mxfp8", "mxfp4"):
            for in_dtype in (mx.bfloat16, mx.float16, mx.float32):
                x = mx.random.normal((32, 256)).astype(in_dtype)

                # Plain quantized matmul against a single weight matrix.
                wq, s = mx.quantize(mx.random.normal((32, 256)), mode=mode)
                plain = mx.quantized_matmul(x, wq, s, mode=mode)
                self.assertEqual(plain.dtype, in_dtype)

                # Gathered variant over a stack of four weight matrices.
                wq, s = mx.quantize(mx.random.normal((4, 32, 256)), mode=mode)
                gathered = mx.gather_qmm(x, wq, s, rhs_indices=rhs, mode=mode)
                self.assertEqual(gathered.dtype, in_dtype)

    def test_gather_matmul_grad(self):
        """Gradient of ``gather_qmm`` w.r.t. ``x`` matches the dense ``gather_mm``.

        Both the summed outputs and the gradients with respect to the first
        argument are compared with ``atol=1e-4``.
        """
        lhs = mx.array([[7, 6], [4, 1], [0, 2]], dtype=mx.uint32)
        rhs = mx.array([[2], [0], [1]], dtype=mx.uint32)

        x = mx.random.normal((4, 2, 32, 256))
        w = mx.random.normal((4, 1, 32, 256))

        # Quantize, then rebuild the dense weights in the transposed layout
        # that gather_mm consumes.
        qw, s, b = mx.quantize(w, group_size=64, bits=4)
        w_hat = mx.dequantize(qw, s, b, group_size=64, bits=4).swapaxes(-1, -2)

        def dense_loss(x, w, i1, i2):
            return mx.gather_mm(x, w, i1, i2).sum()

        def quantized_loss(x, qw, s, b, i1, i2):
            return mx.gather_qmm(x, qw, s, b, i1, i2, transpose=True).sum()

        loss_ref = dense_loss(x, w_hat, lhs, rhs)
        loss_q = quantized_loss(x, qw, s, b, lhs, rhs)
        self.assertTrue(mx.allclose(loss_ref, loss_q, atol=1e-4))

        grad_ref = mx.grad(dense_loss)(x, w_hat, lhs, rhs)
        grad_q = mx.grad(quantized_loss)(x, qw, s, b, lhs, rhs)
        self.assertTrue(mx.allclose(grad_ref, grad_q, atol=1e-4))

    def test_gather_qmm_sorted(self):
        """Check ``gather_qmm`` with and without ``sorted_indices=True``.

        For each parameter row, four results are computed from the same
        inputs and required to agree within ``tol``:

        * ``y1``: dense ``gather_mm`` on the dequantized weights (reference),
        * ``y2``: ``gather_qmm`` with unsorted indices,
        * ``y3``: ``gather_mm`` on index-sorted inputs, then unsorted,
        * ``y4``: ``gather_qmm`` on index-sorted inputs, then unsorted.

        Inputs are drawn from fixed keys, so each parameter row is fully
        deterministic.
        """

        def quantize(w, transpose=True, group_size=None, mode="affine"):
            # Only the affine mode returns biases. group_size is forwarded in
            # both branches so quantize and dequantize see the same setting.
            if mode == "affine":
                qw, s, b = mx.quantize(w, group_size=group_size, mode=mode)
            else:
                qw, s = mx.quantize(w, group_size=group_size, mode=mode)
                b = None

            w_hat = mx.dequantize(qw, s, b, group_size=group_size, mode=mode)
            if transpose:
                w_hat = w_hat.swapaxes(-1, -2)
            return w_hat, qw, s, b

        def gather_sort(x, indices):
            # Flatten the (N, M) index grid and sort it. Each sorted entry
            # takes the x row it came from (order // M). inv_order undoes
            # the sort afterwards.
            _, M = indices.shape
            indices = indices.flatten()
            order = mx.argsort(indices)
            inv_order = mx.argsort(order)
            return x.flatten(0, -3)[order // M], indices[order], inv_order

        def scatter_unsort(x, inv_order, shape=None):
            # Restore the original ordering and, optionally, the index-grid shape.
            x = x[inv_order]
            if shape is not None:
                x = mx.unflatten(x, 0, shape)
            return x

        # Exact duplicate rows were removed: with fixed keys they re-ran an
        # identical computation.
        parameters = [
            # L, K, D, E, I, transpose, mode
            (32, 512, 512, 4, 2, True, "affine"),
            (32, 512, 544, 4, 2, True, "mxfp4"),
            (32, 512, 544, 4, 2, True, "nvfp4"),
            (32, 512, 544, 4, 2, True, "mxfp8"),
            (133, 512, 512, 4, 2, True, "affine"),
            (133, 512, 555, 4, 2, True, "affine"),
            (64, 512, 512, 4, 2, False, "affine"),
            (64, 512, 544, 4, 2, False, "mxfp4"),
            (64, 512, 544, 4, 2, False, "nvfp4"),
            (64, 512, 544, 4, 2, False, "mxfp8"),
            (133, 512, 512, 4, 2, False, "affine"),
            (133, 512, 544, 4, 2, False, "affine"),
            (133, 512, 555, 4, 2, False, "affine"),
        ]

        key = mx.random.key(0)
        k1, k2, k3 = mx.random.split(key, 3)
        on_gpu = mx.default_device() == mx.gpu

        for L, K, D, E, I, transpose, mode in parameters:
            with self.subTest(L=L, K=K, D=D, E=E, I=I, transpose=transpose, mode=mode):
                # Half precision is only used on the GPU. Affine uses float16
                # with an explicit group size; the other modes use bfloat16
                # and the mode's default group size.
                if mode == "affine":
                    group_size = 64
                    dtype = mx.float16 if on_gpu else mx.float32
                else:
                    group_size = None
                    dtype = mx.bfloat16 if on_gpu else mx.float32

                K, D = (K, D) if transpose else (D, K)
                ishape = (L, I)
                xshape = (L, 1, 1, K)
                wshape = (E, D, K) if transpose else (E, K, D)

                indices = (mx.random.uniform(shape=ishape, key=k1) * E).astype(
                    mx.uint32
                )
                x = mx.random.normal(xshape, key=k2) / K**0.5
                w = mx.random.normal(wshape, key=k3) / K**0.5

                x = x.astype(dtype)
                w = w.astype(dtype)

                # From here on w is the dequantized reference; wq holds
                # (quantized weights, scales, biases).
                w, *wq = quantize(
                    w, group_size=group_size, mode=mode, transpose=transpose
                )

                y1 = mx.gather_mm(x, w, rhs_indices=indices)
                y2 = mx.gather_qmm(
                    x,
                    *wq,
                    group_size=group_size,
                    mode=mode,
                    transpose=transpose,
                    rhs_indices=indices,
                )
                xs, idx, inv_order = gather_sort(x, indices)
                y3 = mx.gather_mm(xs, w, rhs_indices=idx, sorted_indices=True)

                y4 = mx.gather_qmm(
                    xs,
                    *wq,
                    group_size=group_size,
                    mode=mode,
                    rhs_indices=idx,
                    transpose=transpose,
                    sorted_indices=True,
                )
                y3 = scatter_unsort(y3, inv_order, indices.shape)
                y4 = scatter_unsort(y4, inv_order, indices.shape)

                tol = 1.5e-5 if (dtype == mx.float32) else 2.5e-4

                self.assertLess((y1 - y2).abs().max(), tol)
                self.assertLess((y1 - y3).abs().max(), tol)
                self.assertLess((y1 - y4).abs().max(), tol)

                self.assertTrue(mx.allclose(y1, y2, atol=tol))
                self.assertTrue(mx.allclose(y1, y3, atol=tol))
                self.assertTrue(mx.allclose(y1, y4, atol=tol))

    def test_gather_qmm_grad(self):
        """VJPs of ``gather_qmm`` w.r.t. x, scales and biases match a reference.

        The reference gathers the weight, scale and bias slices by plain
        indexing and then calls ``mx.quantized_matmul``. The tested path calls
        ``mx.gather_qmm`` with sorted ``rhs_indices``.
        """
        key = mx.random.key(0)
        k1, k2, k3, k4 = mx.random.split(key, 4)
        dtype = mx.float32

        x = mx.random.normal((16, 1, 256), key=k1).astype(dtype)
        w, s, b = mx.quantize(mx.random.normal((4, 256, 256), key=k2).astype(dtype))
        indices = mx.sort(mx.random.randint(0, 4, shape=(16,), key=k3))
        cotan = mx.random.normal((16, 1, 256), key=k4).astype(dtype)

        def reference(x, s, b):
            # Gather by indexing, then do an ordinary quantized matmul.
            return mx.quantized_matmul(
                x, w[indices], s[indices], b[indices], transpose=True
            )

        def fused(x, s, b):
            return mx.gather_qmm(
                x,
                w,
                s,
                b,
                transpose=True,
                lhs_indices=None,
                rhs_indices=indices,
                sorted_indices=True,
            )

        primals = [x, s, b]
        (o1,), (dx1, ds1, db1) = mx.vjp(reference, primals, [cotan])
        (o2,), (dx2, ds2, db2) = mx.vjp(fused, primals, [cotan])

        self.assertLess((o1 - o2).abs().max(), 1e-4)
        self.assertTrue(mx.allclose(o1, o2, atol=1e-4))
        self.assertTrue(mx.allclose(dx1, dx2, atol=1e-4))
        # Scale and bias cotangents are compared with a looser tolerance.
        self.assertTrue(mx.allclose(ds1, ds2, atol=1e-3))
        self.assertTrue(mx.allclose(db1, db2, atol=1e-3))

    def test_vjp_scales_biases(self):
        """Gradients w.r.t. scales and biases agree with central differences.

        The analytic gradient from ``mx.grad`` is spot-checked at a few
        indices against ``(f(p + eps) - f(p - eps)) / (2 * eps)``.
        """
        mx.random.seed(0)
        x = mx.random.normal(shape=(2, 2, 512))
        w = mx.random.normal(shape=(512, 512))
        quant = dict(bits=4, group_size=64)
        wq, s, b = mx.quantize(w, **quant)

        def mm(sb, x, wq):
            return mx.quantized_matmul(x, wq, *sb, **quant).sum()

        params = (s, b)
        analytic = mx.grad(mm)((s, b), x, wq)

        eps = 8e-3
        # numerical grad check with a few indices
        for idx in [(0, 0), (11, 4), (22, 7)]:
            for which, param in enumerate(params):
                # Perturb one entry in place: +eps, then -eps relative to the
                # start, then back to the starting value.
                param[idx] += eps
                loss_plus = mm(params, x, wq)
                param[idx] -= 2 * eps
                loss_minus = mm(params, x, wq)
                param[idx] += eps
                numeric = (loss_plus - loss_minus) / (2 * eps)
                self.assertAlmostEqual(analytic[which][idx], numeric, delta=2e-2)

    def test_fp_vjp_scales_throws(self):
        """Differentiating w.r.t. scales raises ``ValueError`` in the fp modes.

        Checked for both ``mx.quantized_matmul`` and ``mx.gather_qmm`` in the
        "mxfp4", "mxfp8" and "nvfp4" modes.
        """
        mx.random.seed(0)
        x = mx.random.normal(shape=(2, 512))
        w = mx.random.normal(shape=(512, 512))
        rhs_indices = mx.array(0)

        for mode in ("mxfp4", "mxfp8", "nvfp4"):
            wq, s = mx.quantize(w, mode=mode)

            def mm(s, x, wq):
                return mx.quantized_matmul(x, wq, s, mode=mode).sum()

            def gmm(s, x, wq):
                return mx.gather_qmm(
                    x,
                    wq,
                    s,
                    rhs_indices=rhs_indices,
                    mode=mode,
                ).sum()

            # Should raise: the gradient is taken w.r.t. the first argument,
            # which is the scales.
            for loss in (mm, gmm):
                with self.assertRaises(ValueError):
                    mx.grad(loss)(s, x, wq)

    def test_quantize_strided(self):
        """Dequantize gives the same result for broadcast and contiguous scales.

        The scales are replaced by a single uint8 value broadcast to the
        original scales' shape. Dequantizing with that array must match
        dequantizing with its ``mx.contiguous`` copy.
        """
        mode = "nvfp4"
        size = 64
        weights = mx.random.normal(shape=(size, size))
        w_q, real_scales = mx.quantize(weights, mode=mode)

        # NOTE(review): presumably the broadcast array is a strided view rather
        # than a materialized buffer, which is what the test name refers to.
        broadcast_scales = mx.broadcast_to(mx.array(56, mx.uint8), real_scales.shape)
        from_broadcast = mx.dequantize(w_q, broadcast_scales, mode=mode)
        from_contiguous = mx.dequantize(
            w_q, mx.contiguous(broadcast_scales), mode=mode
        )
        self.assertTrue(mx.allclose(from_broadcast, from_contiguous))


# When this file is executed as a script, hand control to the project's
# MLXTestRunner from mlx_tests.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_random.py
================================================
# Copyright © 2023 Apple Inc.

import math
import unittest

import mlx.core as mx
import mlx_tests


class TestRandom(mlx_tests.MLXTestCase):
    """Tests for the samplers and key utilities in ``mx.random``."""

    def test_global_rng(self):
        """Re-seeding the global generator replays the same sequence of draws."""
        mx.random.seed(3)
        a = mx.random.uniform()
        b = mx.random.uniform()

        mx.random.seed(3)
        x = mx.random.uniform()
        y = mx.random.uniform()

        self.assertEqual(a.item(), x.item())
        self.assertEqual(y.item(), b.item())

    def test_key(self):
        """Keys built from the same seed are equal; different seeds differ."""
        k1 = mx.random.key(0)
        k2 = mx.random.key(0)
        self.assertTrue(mx.array_equal(k1, k2))

        k2 = mx.random.key(1)
        self.assertFalse(mx.array_equal(k1, k2))

    def test_key_split(self):
        """Splitting a key is deterministic and yields distinct sub-keys."""
        key = mx.random.key(0)

        k1, k2 = mx.random.split(key)
        self.assertFalse(mx.array_equal(k1, k2))

        r1, r2 = mx.random.split(key)
        self.assertTrue(mx.array_equal(k1, r1))
        self.assertTrue(mx.array_equal(k2, r2))

        keys = mx.random.split(key, 10)
        self.assertEqual(keys.shape, (10, 2))

    def test_uniform(self):
        """Shape, dtype, key determinism and value range of ``uniform``."""
        key = mx.random.key(0)
        a = mx.random.uniform(key=key)
        self.assertEqual(a.shape, ())
        self.assertEqual(a.dtype, mx.float32)

        b = mx.random.uniform(key=key)
        self.assertEqual(a.item(), b.item())

        a = mx.random.uniform(shape=(2, 3))
        self.assertEqual(a.shape, (2, 3))

        # Both bounds are checked explicitly. The previous `(a > -1) < 5`
        # compared a boolean array with 5 and was therefore always true.
        a = mx.random.uniform(shape=(1000,), low=-1, high=5)
        self.assertTrue(mx.all(mx.logical_and(a >= -1, a < 5)).item())

        a = mx.random.uniform(shape=(1000,), low=mx.array(-1), high=5)
        self.assertTrue(mx.all(mx.logical_and(a >= -1, a < 5)).item())

        a = mx.random.uniform(low=-0.1, high=0.1, shape=(1,), dtype=mx.bfloat16)
        self.assertEqual(a.dtype, mx.bfloat16)

        self.assertEqual(mx.random.uniform().dtype, mx.random.uniform(dtype=None).dtype)

    def test_normal_and_laplace(self):
        """Shared checks for the ``normal`` and ``laplace`` samplers."""
        # Same tests for normal and laplace.
        for distribution_sampler in [mx.random.normal, mx.random.laplace]:
            key = mx.random.key(0)
            a = distribution_sampler(key=key)
            self.assertEqual(a.shape, ())
            self.assertEqual(a.dtype, mx.float32)

            b = distribution_sampler(key=key)
            self.assertEqual(a.item(), b.item())

            a = distribution_sampler(shape=(2, 3))
            self.assertEqual(a.shape, (2, 3))

            ## Generate in float16 or bfloat16
            for t in [mx.float16, mx.bfloat16]:
                a = distribution_sampler(dtype=t)
                self.assertEqual(a.dtype, t)

            # Generate with a given mean and standard deviation
            loc = 1.0
            scale = 2.0

            a = distribution_sampler(shape=(3, 2), loc=loc, scale=scale, key=key)
            b = scale * distribution_sampler(shape=(3, 2), key=key) + loc
            self.assertTrue(mx.allclose(a, b))

            a = distribution_sampler(
                shape=(3, 2), loc=loc, scale=scale, dtype=mx.float16, key=key
            )
            b = (
                scale * distribution_sampler(shape=(3, 2), dtype=mx.float16, key=key)
                + loc
            )
            self.assertTrue(mx.allclose(a, b))

            self.assertEqual(
                distribution_sampler().dtype, distribution_sampler(dtype=None).dtype
            )

            # Test not getting -inf or inf with half precision
            for hp in [mx.float16, mx.bfloat16]:
                a = abs(distribution_sampler(shape=(10000,), loc=0, scale=1, dtype=hp))
                self.assertTrue(mx.all(a < mx.inf))

    def test_multivariate_normal(self):
        """Shapes, dtype validation, shape errors and moments (CPU stream)."""
        key = mx.random.key(0)
        mean = mx.array([0, 0])
        cov = mx.array([[1, 0], [0, 1]])

        a = mx.random.multivariate_normal(mean, cov, key=key, stream=mx.cpu)
        self.assertEqual(a.shape, (2,))

        ## Check dtypes
        for t in [mx.float32]:
            a = mx.random.multivariate_normal(
                mean, cov, dtype=t, key=key, stream=mx.cpu
            )
            self.assertEqual(a.dtype, t)
        for t in [
            mx.int8,
            mx.int32,
            mx.int64,
            mx.uint8,
            mx.uint32,
            mx.uint64,
            mx.float16,
            mx.bfloat16,
        ]:
            with self.assertRaises(ValueError):
                mx.random.multivariate_normal(
                    mean, cov, dtype=t, key=key, stream=mx.cpu
                )

        ## Check incompatible shapes
        with self.assertRaises(ValueError):
            mean = mx.zeros((2, 2))
            cov = mx.zeros((2, 2))
            mx.random.multivariate_normal(mean, cov, shape=(3,), key=key, stream=mx.cpu)

        with self.assertRaises(ValueError):
            mean = mx.zeros((2))
            cov = mx.zeros((2, 2, 2))
            mx.random.multivariate_normal(mean, cov, shape=(3,), key=key, stream=mx.cpu)

        with self.assertRaises(ValueError):
            mean = mx.zeros((3,))
            cov = mx.zeros((2, 2))
            mx.random.multivariate_normal(mean, cov, key=key, stream=mx.cpu)

        with self.assertRaises(ValueError):
            mean = mx.zeros((2,))
            cov = mx.zeros((2, 3))
            mx.random.multivariate_normal(mean, cov, key=key, stream=mx.cpu)

        ## Different shape of mean and cov
        mean = mx.array([[0, 7], [1, 2], [3, 4]])
        cov = mx.array([[1, 0.5], [0.5, 1]])
        a = mx.random.multivariate_normal(mean, cov, shape=(4, 3), stream=mx.cpu)
        self.assertEqual(a.shape, (4, 3, 2))

        ## Check correctness of the mean and covariance
        n_test = int(1e5)

        def check_jointly_gaussian(data, mean, cov):
            # Compare the empirical mean and covariance of the samples with
            # the requested ones. The absolute tolerance shrinks with
            # sqrt(n_test).
            empirical_mean = mx.mean(data, axis=0)
            empirical_cov = (
                (data - empirical_mean).T @ (data - empirical_mean) / data.shape[0]
            )
            N = data.shape[1]
            self.assertTrue(
                mx.allclose(
                    empirical_mean, mean, rtol=0.0, atol=10 * N**2 / math.sqrt(n_test)
                )
            )
            self.assertTrue(
                mx.allclose(
                    empirical_cov, cov, rtol=0.0, atol=10 * N**2 / math.sqrt(n_test)
                )
            )

        mean = mx.array([4.0, 7.0])
        cov = mx.array([[2, 0.5], [0.5, 1]])
        data = mx.random.multivariate_normal(
            mean, cov, shape=(n_test,), key=key, stream=mx.cpu
        )
        check_jointly_gaussian(data, mean, cov)

        mean = mx.arange(3)
        cov = mx.array([[1, -1, 0.5], [-1, 1, -0.5], [0.5, -0.5, 1]])
        data = mx.random.multivariate_normal(
            mean, cov, shape=(n_test,), key=key, stream=mx.cpu
        )
        check_jointly_gaussian(data, mean, cov)

    def test_randint(self):
        """Shape, dtype, determinism, broadcasting and range of ``randint``."""
        a = mx.random.randint(0, 1, [])
        self.assertEqual(a.shape, ())
        self.assertEqual(a.dtype, mx.int32)

        shape = (88,)
        low = mx.array(3)
        high = mx.array(15)

        key = mx.random.key(0)
        a = mx.random.randint(low, high, shape, key=key)
        self.assertEqual(a.shape, shape)
        self.assertEqual(a.dtype, mx.int32)

        # Check using the same key yields the same value
        b = mx.random.randint(low, high, shape, key=key)
        self.assertListEqual(a.tolist(), b.tolist())

        shape = (3, 4)
        low = mx.reshape(mx.array([0] * 3), [3, 1])
        high = mx.reshape(mx.array([12, 13, 14, 15]), [1, 4])

        a = mx.random.randint(low, high, shape)
        self.assertEqual(a.shape, shape)

        a = mx.random.randint(-10, 10, [1000, 1000])
        self.assertTrue(mx.all(-10 <= a).item() and mx.all(a < 10).item())

        # With low above high, every sample is expected to equal low.
        a = mx.random.randint(10, -10, [1000, 1000])
        self.assertTrue(mx.all(a == 10).item())

        self.assertEqual(
            mx.random.randint(0, 1).dtype, mx.random.randint(0, 1, dtype=None).dtype
        )

    def test_bernoulli(self):
        """Shape, dtype, out-of-range probabilities and bad arguments."""
        a = mx.random.bernoulli()
        self.assertEqual(a.shape, ())
        self.assertEqual(a.dtype, mx.bool_)

        a = mx.random.bernoulli(mx.array(0.5), [5])
        self.assertEqual(a.shape, (5,))

        # Probabilities above 1 / below 0 give always-True / always-False.
        a = mx.random.bernoulli(mx.array([2.0, -2.0]))
        self.assertEqual(a.tolist(), [True, False])
        self.assertEqual(a.shape, (2,))

        p = mx.array([0.1, 0.2, 0.3])
        # mx.reshape returns a new array. Keep the result so the (1, 3)
        # probabilities are what gets broadcast to (4, 3).
        p = mx.reshape(p, [1, 3])
        x = mx.random.bernoulli(p, [4, 3])
        self.assertEqual(x.shape, (4, 3))

        with self.assertRaises(ValueError):
            mx.random.bernoulli(p, [2])  # Bad shape

        with self.assertRaises(ValueError):
            mx.random.bernoulli(0, [2])  # Bad type

    def test_truncated_normal(self):
        """Shape, dtype, bounds, broadcasting and bad shapes."""
        a = mx.random.truncated_normal(-2.0, 2.0)
        self.assertEqual(a.size, 1)
        self.assertEqual(a.dtype, mx.float32)

        a = mx.random.truncated_normal(mx.array([]), mx.array([]))
        self.assertEqual(a.dtype, mx.float32)
        self.assertEqual(a.size, 0)

        lower = mx.reshape(mx.array([-2.0, 0.0]), [1, 2])
        upper = mx.reshape(mx.array([0.0, 1.0, 2.0]), [3, 1])
        a = mx.random.truncated_normal(lower, upper)

        self.assertEqual(a.shape, (3, 2))
        self.assertTrue(mx.all(lower <= a).item() and mx.all(a <= upper).item())

        # With the lower bound above the upper bound, the sample is expected
        # to equal the lower bound.
        a = mx.random.truncated_normal(2.0, -2.0)
        self.assertTrue(mx.all(a == 2.0).item())

        a = mx.random.truncated_normal(-3.0, 3.0, [542, 399])
        self.assertEqual(a.shape, (542, 399))

        lower = mx.array([-2.0, -1.0])
        higher = mx.array([1.0, 2.0, 3.0])
        with self.assertRaises(ValueError):
            mx.random.truncated_normal(lower, higher)  # Bad shape

        self.assertEqual(
            mx.random.truncated_normal(0, 1).dtype,
            mx.random.truncated_normal(0, 1, dtype=None).dtype,
        )

    def test_gumbel(self):
        """Shape, dtype and a loose check on the sample mean of ``gumbel``."""
        samples = mx.random.gumbel(shape=(100, 100))
        self.assertEqual(samples.shape, (100, 100))
        self.assertEqual(samples.dtype, mx.float32)
        mean = 0.5772
        # Std deviation of the sample mean is small (<0.02),
        # so this test is pretty conservative
        self.assertTrue(mx.abs(mx.mean(samples) - mean) < 0.2)

        self.assertEqual(
            mx.random.gumbel((1, 1)).dtype, mx.random.gumbel((1, 1), dtype=None).dtype
        )

    def test_categorical(self):
        """Output shapes, dtype and value range of ``categorical``."""
        logits = mx.zeros((10, 20))
        self.assertEqual(mx.random.categorical(logits, -1).shape, (10,))
        self.assertEqual(mx.random.categorical(logits, 0).shape, (20,))
        self.assertEqual(mx.random.categorical(logits, 1).shape, (10,))

        out = mx.random.categorical(logits)
        self.assertEqual(out.shape, (10,))
        self.assertEqual(out.dtype, mx.uint32)
        self.assertTrue(mx.max(out).item() < 20)

        out = mx.random.categorical(logits, 0, [5, 20])
        self.assertEqual(out.shape, (5, 20))
        self.assertTrue(mx.max(out).item() < 10)

        out = mx.random.categorical(logits, 1, num_samples=7)
        self.assertEqual(out.shape, (10, 7))
        out = mx.random.categorical(logits, 0, num_samples=7)
        self.assertEqual(out.shape, (20, 7))

        # shape and num_samples cannot both be given.
        with self.assertRaises(ValueError):
            mx.random.categorical(logits, shape=[10, 5], num_samples=5)

    def test_permutation(self):
        """``permutation`` reorders its input without changing the elements."""
        x = sorted(mx.random.permutation(4).tolist())
        self.assertEqual([0, 1, 2, 3], x)

        x = mx.array([0, 1, 2, 3])
        x = sorted(mx.random.permutation(x).tolist())
        self.assertEqual([0, 1, 2, 3], x)

        # A second draw from the global generator. The original computed this
        # permutation but never checked it.
        x = mx.array([0, 1, 2, 3])
        x = sorted(mx.random.permutation(x).tolist())
        self.assertEqual([0, 1, 2, 3], x)

        # 2-D
        x = mx.arange(16).reshape(4, 4)
        out = mx.sort(mx.random.permutation(x, axis=0), axis=0)
        self.assertTrue(mx.array_equal(x, out))
        out = mx.sort(mx.random.permutation(x, axis=1), axis=1)
        self.assertTrue(mx.array_equal(x, out))

        # Basically 0 probability this should fail.
        sorted_x = mx.arange(16384)
        x = mx.random.permutation(16384)
        self.assertFalse(mx.array_equal(sorted_x, x))

        # Preserves shape / doesn't cast input to int
        x = mx.random.permutation(mx.array([[1]]))
        self.assertEqual(x.shape, (1, 1))

    def test_complex_normal(self):
        """``normal`` with ``dtype=mx.complex64`` keeps shape and dtype."""
        sample = mx.random.normal(tuple(), dtype=mx.complex64)
        self.assertEqual(sample.shape, tuple())
        self.assertEqual(sample.dtype, mx.complex64)

        sample = mx.random.normal((1, 2, 3, 4), dtype=mx.complex64)
        self.assertEqual(sample.shape, (1, 2, 3, 4))
        self.assertEqual(sample.dtype, mx.complex64)

        sample = mx.random.normal((1, 2, 3, 4), dtype=mx.complex64, scale=2.0, loc=3.0)
        self.assertEqual(sample.shape, (1, 2, 3, 4))
        self.assertEqual(sample.dtype, mx.complex64)

        # A complex loc is accepted as well.
        sample = mx.random.normal(
            (1, 2, 3, 4), dtype=mx.complex64, scale=2.0, loc=3.0 + 1j
        )
        self.assertEqual(sample.shape, (1, 2, 3, 4))
        self.assertEqual(sample.dtype, mx.complex64)

    def test_broadcastable_scale_loc(self):
        """Array-valued ``loc``/``scale`` must broadcast to the sample shape."""
        b = mx.random.normal((10, 2))
        sample = mx.random.normal((2, 10, 2), loc=b, scale=b)
        mx.eval(sample)
        self.assertEqual(sample.shape, (2, 10, 2))

        # A (10,) array cannot broadcast against a trailing dimension of 2.
        with self.assertRaises(ValueError):
            b = mx.random.normal((10,))
            sample = mx.random.normal((2, 10, 2), loc=b, scale=b)

        b = mx.random.normal((3, 1, 2))
        sample = mx.random.normal((3, 4, 2), dtype=mx.float16, loc=b, scale=b)
        mx.eval(sample)
        self.assertEqual(sample.shape, (3, 4, 2))
        self.assertEqual(sample.dtype, mx.float16)


# When this file is executed as a script, hand control to the project's
# MLXTestRunner from mlx_tests.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_reduce.py
================================================
# Copyright © 2023 Apple Inc.

import unittest
from itertools import combinations, permutations

import mlx.core as mx
import mlx_tests
import numpy as np


class TestReduce(mlx_tests.MLXTestCase):
    def test_axis_permutation_sums(self):
        # For every transposition of the input, sum over every non-empty subset
        # of axes and compare the result with NumPy.
        for shape in [(5, 5, 1, 5, 5), (65, 65, 1, 65)]:
            with self.subTest(shape=shape):
                ndim = len(shape)
                reference = (np.random.randn(*shape) * 128).astype(np.int32)
                candidate = mx.array(reference)
                # All non-empty axis subsets, smallest subsets first.
                axis_subsets = [
                    subset
                    for size in range(1, ndim + 1)
                    for subset in combinations(range(ndim), size)
                ]
                for t in permutations(range(ndim)):
                    with self.subTest(t=t):
                        reference_t = np.transpose(reference, t)
                        candidate_t = mx.transpose(candidate, t)
                        for a in axis_subsets:
                            with self.subTest(a=a):
                                z_npy = np.sum(reference_t, axis=a)
                                z_mlx = mx.sum(candidate_t, axis=a)
                                mx.eval(z_mlx)
                                self.assertTrue(np.all(z_npy == z_mlx))

    def test_expand_sums(self):
        # Sum over broadcast (expanded) inputs: the singleton axes 1, 3 and 5
        # are expanded to 5 in every combination before reducing.
        base_np = np.random.randn(5, 1, 5, 1, 5, 1).astype(np.float32)
        base_mx = mx.array(base_np)
        # Every non-empty subset of the six axes.
        reduce_axes = [
            subset for size in range(1, 7) for subset in combinations(range(6), size)
        ]
        for m in range(1, 4):
            for ax in combinations([1, 3, 5], m):
                # Even axes are already 5; odd axes become 5 only when selected.
                shape = [5 if (i % 2 == 0 or i in ax) else 1 for i in range(6)]
                with self.subTest(shape=shape):
                    wide_np = np.broadcast_to(base_np, shape)
                    wide_mx = mx.broadcast_to(base_mx, shape)
                    for a in reduce_axes:
                        with self.subTest(a=a):
                            z_npy = np.sum(wide_np, axis=a) / 1000
                            z_mlx = mx.sum(wide_mx, axis=a) / 1000
                            mx.eval(z_mlx)
                            close = np.allclose(z_npy, np.array(z_mlx), atol=1e-4)
                            self.assertTrue(close)

    def test_dtypes(self):
        # sum / prod / min / max must agree with NumPy for each dtype and for
        # every choice of reduction axes on a 3-D input.
        dtype_names = [
            "int8",
            "int16",
            "int32",
            "uint8",
            "uint16",
            "uint32",
            "int64",
            "uint64",
            "complex64",
            "float32",
        ]
        axes_choices = (None, 0, 1, 2, (0, 1), (0, 2), (1, 2), (0, 1, 2))

        for dtype in dtype_names:
            with self.subTest(dtype=dtype):
                np_dtype = getattr(np, dtype)
                x = np.random.uniform(0, 2, size=(3, 3, 3)).astype(np_dtype)
                y = mx.array(x)

                for op in ("sum", "prod", "min", "max"):
                    with self.subTest(op=op):
                        np_op = getattr(np, op)
                        mlx_op = getattr(mx, op)
                        # The NumPy accumulator dtype is pinned to the input
                        # dtype for sum / prod only.
                        np_extra = {"dtype": np_dtype} if op in ("sum", "prod") else {}

                        for axes in axes_choices:
                            with self.subTest(axes=axes):
                                r_np = np_op(x, axis=axes, **np_extra)
                                r_mlx = mlx_op(y, axis=axes)
                                mx.eval(r_mlx)
                                self.assertTrue(np.allclose(r_np, r_mlx, atol=1e-4))

    def test_arg_reduce(self):
        # argmin / argmax must agree with NumPy for every listed dtype, along
        # each axis (with and without keepdims) and over the whole array.
        dtypes = [
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
        ]
        for dtype in dtypes:
            with self.subTest(dtype=dtype):
                # np.random.rand draws from [0, 1); casting that straight to an
                # integer dtype truncates every entry to 0 and leaves nothing
                # for argmin / argmax to discriminate. Scaling to [0, 100)
                # first gives varied data that still fits every dtype above.
                data = (np.random.rand(10, 12, 13) * 100).astype(getattr(np, dtype))
                x = mx.array(data)
                for op in ["argmin", "argmax"]:
                    for axis in range(3):
                        for kd in [True, False]:
                            a = getattr(mx, op)(x, axis, kd)
                            b = getattr(np, op)(data, axis, keepdims=kd)
                            self.assertEqual(a.tolist(), b.tolist())

                # Reduction over all elements (no axis given).
                for op in ["argmin", "argmax"]:
                    a = getattr(mx, op)(x, keepdims=True)
                    b = getattr(np, op)(data, keepdims=True)
                    self.assertEqual(a.tolist(), b.tolist())
                    a = getattr(mx, op)(x)
                    b = getattr(np, op)(data)
                    self.assertEqual(a.item(), b)

    def test_edge_case(self):
        # Reduce three axes of a transposed input that has a singleton axis
        # and compare with NumPy.
        source = (mx.random.normal((100, 1, 100, 100)) * 128).astype(mx.int32)
        permuted = mx.transpose(source, (0, 3, 1, 2))

        reduced = mx.sum(permuted, axis=(0, 2, 3))
        mx.eval(reduced)
        reference = np.array(permuted).sum(axis=(0, 2, 3))
        self.assertTrue(np.all(reference == reduced))

    def test_sum_bool(self):
        # Summing a boolean mask has to count the True entries like NumPy does.
        mask = np.random.uniform(0, 1, size=(10, 10, 10)) > 0.5
        mask_mx = mx.array(mask)
        expected_count = mask.sum().item()
        self.assertEqual(expected_count, mask_mx.sum().item())

    def test_many_reduction_axes(self):
        # A single multi-axis sum must equal reducing the same axes one at a
        # time (keepdims=True keeps the axis numbering stable between steps).
        def check(x, axes):
            out = mx.sum(x, axis=axes, keepdims=True)
            stepwise = x
            for single_axis in axes:
                stepwise = mx.sum(stepwise, axis=single_axis, keepdims=True)
            self.assertTrue(mx.array_equal(out, stepwise))

        # Every other axis of a 5-, 7- and 9-dimensional input.
        for ndim in (5, 7, 9):
            x = mx.random.randint(0, 10, shape=(4,) * ndim)
            check(x, tuple(range(0, ndim, 2)))

        # Ten dimensions with the first two axes swapped before reducing.
        x = mx.random.randint(0, 10, shape=(4,) * 9 + (128,))
        x = mx.transpose(x, (1, 0, 2, 3, 4, 5, 6, 7, 8, 9))
        check(x, (1, 3, 5, 7, 9))

    def test_nan_propagation(self):
        # After writing mx.nan into a few random positions, max / min along
        # either axis must match NumPy (NaNs compared as equal).
        dtypes = [
            "uint8",
            "uint16",
            "uint32",
            "int8",
            "int16",
            "int32",
            "float16",
            "float32",
        ]
        reductions = [(op, axis) for op in ["max", "min"] for axis in [0, 1]]

        for dtype in dtypes:
            with self.subTest(dtype=dtype):
                x = (mx.random.normal((4, 4)) * 10).astype(getattr(mx, dtype))
                # Three random (row, column) positions to overwrite.
                positions = mx.reshape(mx.random.randint(0, 4, shape=(6,)), (3, 2))
                for pos in positions:
                    x[pos[0], pos[1]] = mx.nan
                x_np = np.array(x)

                for op, axis in reductions:
                    out = getattr(mx, op)(x, axis=axis)
                    ref = getattr(np, op)(x_np, axis=axis)
                    self.assertTrue(np.array_equal(out, ref, equal_nan=True))

    def test_nan_propagation_complex64(self):
        # Each value set places a NaN in a different element, in either the
        # real or the imaginary part.
        value_sets = [
            [1 + 1j, 2 + 2j, 3 + 3j, mx.nan + 4j],
            [1 + 1j, 2 + 2j, 3 + mx.nan * 1j, 4 + 4j],
            [1 + 1j, 2 + mx.nan * 1j, 3 + 3j, 4 + 4j],
            [mx.nan + 1j, 2 + 2j, 3 + 3j, 4 + 4j],
        ]
        mx_arrays = [
            mx.array(values, dtype=mx.complex64).reshape(2, 2) for values in value_sets
        ]
        np_arrays = [np.array(arr) for arr in mx_arrays]

        # max / min along both axes must match NumPy, with NaNs treated as equal.
        for mx_arr, np_arr in zip(mx_arrays, np_arrays):
            for axis in [0, 1]:
                for op in ["max", "min"]:
                    out = getattr(mx, op)(mx_arr, axis=axis)
                    ref = getattr(np, op)(np_arr, axis=axis)
                    self.assertTrue(np.array_equal(out, ref, equal_nan=True))

    def test_long_column(self):
        # Column sums over a tall (8192 x 64) integer matrix must match NumPy.
        tall_np = (np.random.randn(8192, 64) * 32).astype(np.int32)
        tall_mx = mx.array(tall_np)

        expected = tall_np.sum(0)
        actual = tall_mx.sum(0)
        self.assertTrue(np.all(expected == actual))


# When this file is executed directly, hand control to the runner provided by
# the mlx_tests helper module; failfast=True is forwarded to that runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner(failfast=True)


================================================
FILE: python/tests/test_tree.py
================================================
# Copyright © 2023 Apple Inc.

import unittest

import mlx.core as mx
import mlx.nn as nn
import mlx.utils
import mlx_tests


class TestTreeUtils(mlx_tests.MLXTestCase):
    def test_tree_map(self):
        # tree_map applies the function to every leaf and keeps the keys.
        def increment(leaf):
            return leaf + 1

        mapped = mlx.utils.tree_map(increment, {"a": 0, "b": 1, "c": 2})
        self.assertEqual(mapped, {"a": 1, "b": 2, "c": 3})

    def test_tree_flatten(self):
        # Flattening yields (key, leaf) pairs in order; unflattening them
        # rebuilds the original nested structure.
        nested = [{"a": 1, "b": 2}, "c"]
        flat = mlx.utils.tree_flatten(nested)
        leaves = tuple(leaf for _, leaf in flat)
        self.assertEqual(leaves, (1, 2, "c"))
        self.assertEqual(mlx.utils.tree_unflatten(flat), nested)

    def test_merge(self):
        # Trees with disjoint keys merge into their union.
        left = {"a": 0}
        right = {"b": 1}
        merged = mlx.utils.tree_merge(left, right)
        self.assertEqual({"a": 0, "b": 1}, merged)

        # Merging trees that share a key raises.
        for first, second in ((left, left), (merged, left)):
            with self.assertRaises(ValueError):
                mlx.utils.tree_merge(first, second)

        # Two partial parameter trees (the second padded with None for the
        # missing first layer) must merge into the parameters of the combined
        # module.
        mod1 = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
        mod2 = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
        combined = nn.Sequential(mod1, mod2)

        partial_first = {"layers": [mod1.parameters()]}
        partial_second = {"layers": [None, mod2.parameters()]}
        merged_params = mlx.utils.tree_merge(partial_first, partial_second)

        merged_flat = mlx.utils.tree_flatten(merged_params)
        reference_flat = mlx.utils.tree_flatten(combined.parameters())
        for (k1, v1), (k2, v2) in zip(merged_flat, reference_flat):
            self.assertEqual(k1, k2)
            self.assertTrue(mx.array_equal(v1, v2))

    def test_supported_trees(self):
        # tree_map must return the same container type it was given (list,
        # tuple, tuple subclass, NamedTuple, dict) with the function applied
        # to every leaf.
        from typing import NamedTuple

        class Vector(tuple):
            pass

        class Params(NamedTuple):
            m: mx.array
            b: mx.array

        def add_one(x):
            return x + 1

        def check_leaves(first, second):
            # Inputs are always [0, 1] and 2, so outputs are [1, 2] and 3.
            self.assertTrue(mx.array_equal(first, mx.array([1, 2])))
            self.assertTrue(mx.array_equal(second, mx.array(3)))

        list1 = [mx.array([0, 1]), mx.array(2)]
        tuple1 = (mx.array([0, 1]), mx.array(2))
        vector1 = Vector([mx.array([0, 1]), mx.array(2)])
        params1 = Params(m=mx.array([0, 1]), b=mx.array(2))
        dict1 = {"m": mx.array([0, 1]), "b": mx.array(2)}

        list2, tuple2, vector2, params2, dict2 = (
            mlx.utils.tree_map(add_one, tree)
            for tree in (list1, tuple1, vector1, params1, dict1)
        )

        # Positional containers.
        for mapped, kind in ((list2, list), (tuple2, tuple), (vector2, Vector)):
            self.assertTrue(isinstance(mapped, kind))
            check_leaves(mapped[0], mapped[1])

        self.assertTrue(isinstance(dict2, dict))
        check_leaves(dict2["m"], dict2["b"])

        self.assertTrue(isinstance(params2, Params))
        check_leaves(params2.m, params2.b)


# When this file is executed directly, hand control to the runner provided by
# the mlx_tests helper module.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_upsample.py
================================================
# Copyright © 2023-2024 Apple Inc.

import unittest

import mlx.core as mx
import mlx.nn as nn
import mlx_tests
import numpy as np

try:
    import torch
    import torch.nn.functional as F

    has_torch = True
except ImportError as e:
    has_torch = False


class TestUpsample(mlx_tests.MLXTestCase):
    @unittest.skipIf(not has_torch, "requires Torch")
    def test_torch_upsample(self):
        # Compare nn.Upsample against torch's F.interpolate over a grid of
        # batch sizes, channel counts, input sizes, scale factors and modes.

        # MLX mode name -> name of the corresponding torch interpolation mode.
        torch_mode_names = {
            "nearest": "nearest",
            "linear": "bilinear",
            "cubic": "bicubic",
        }

        def run_upsample(
            N,
            C,
            idim,
            scale_factor,
            mode,
            align_corner,
            dtype="float32",
            atol=1e-5,
        ):
            with self.subTest(
                N=N,
                C=C,
                idim=idim,
                scale_factor=scale_factor,
                mode=mode,
                align_corner=align_corner,
            ):
                # Fixed seed: every configuration sees reproducible input.
                np.random.seed(0)
                height, width = idim
                sample = np.random.normal(-1.0, 1.0, (N, height, width, C))
                sample = sample.astype(getattr(np, dtype))

                in_mx = mx.array(sample)
                # The (N, H, W, C) sample is transposed to (N, C, H, W) for torch.
                in_pt = torch.from_numpy(sample.transpose(0, 3, 1, 2)).to("cpu")

                upsampler = nn.Upsample(
                    scale_factor=scale_factor,
                    mode=mode,
                    align_corners=align_corner,
                )
                out_mx = upsampler(in_mx)

                out_pt = F.interpolate(
                    in_pt,
                    scale_factor=scale_factor,
                    mode=torch_mode_names[mode],
                    # align_corners is passed as None for "nearest".
                    align_corners=None if mode == "nearest" else align_corner,
                )
                # Back to (N, H, W, C) so both outputs share a layout.
                out_pt = torch.permute(out_pt, (0, 2, 3, 1)).numpy(force=True)
                self.assertEqual(out_pt.shape, out_mx.shape)
                self.assertTrue(np.allclose(out_pt, out_mx, atol=atol))

        # Only test cases in which the target sizes are integers; otherwise
        # mlx and torch select different indices and the results differ
        # numerically.
        size_and_scale = (
            ((2, 2), (1.0, 1.0)),
            ((2, 2), (1.5, 1.5)),
            ((2, 2), (2.0, 2.0)),
            ((4, 4), (0.5, 0.5)),
            ((7, 7), (2.0, 2.0)),
            ((10, 10), (0.2, 0.2)),
            ((10, 10), (0.3, 0.3)),
            ((11, 21), (3.0, 3.0)),
            ((11, 21), (3.0, 2.0)),
        )

        for dtype in ("float32",):
            for N, C in ((1, 1), (2, 3)):
                for idim, scale_factor in size_and_scale:
                    for mode in ("cubic", "linear", "nearest"):
                        for align_corner in (False, True):
                            # The nearest / align_corner=True combination is
                            # not exercised.
                            if not (mode == "nearest" and align_corner):
                                run_upsample(
                                    N,
                                    C,
                                    idim,
                                    scale_factor,
                                    mode,
                                    align_corner,
                                    dtype=dtype,
                                )


# When this file is executed directly, hand control to the runner provided by
# the mlx_tests helper module.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: python/tests/test_vmap.py
================================================
# Copyright © 2023-2024 Apple Inc.

import gc
import unittest

import mlx.core as mx
import mlx_tests


class TestVmap(mlx_tests.MLXTestCase):
    def test_basics(self):
        # Each entry is (vmap keyword arguments, call argument); every one of
        # these combinations has to be rejected with a ValueError.
        rejected = [
            # Can't vmap over scalars
            ({}, mx.array(1.0)),
            # Invalid input
            ({}, "hello"),
            # Invalid axes
            ({"in_axes": "hello"}, mx.array([0, 1])),
            ({"in_axes": 2}, mx.array([0, 1])),
            ({"out_axes": "hello"}, mx.array([0, 1])),
            ({"out_axes": 2}, mx.array([0, 1])),
        ]
        for vmap_kwargs, argument in rejected:
            with self.assertRaises(ValueError):
                mx.vmap(mx.exp, **vmap_kwargs)(argument)

    def test_unary(self):
        # vmap of an elementwise unary op must give the same result as calling
        # the op directly, whichever axis is mapped.
        ops = [
            "abs",
            "cos",
            "erf",
            "erfinv",
            "exp",
            "log",
            "log1p",
            "log2",
            "log10",
            "logical_not",
            "negative",
            "reciprocal",
            "rsqrt",
            "sigmoid",
            "sign",
            "sin",
            "sqrt",
            "square",
            "degrees",
            "radians",
        ]

        def same(result, reference):
            # equal_nan: the comparison treats NaNs in both arrays as equal.
            self.assertTrue(mx.array_equal(result, reference, equal_nan=True))

        for opname in ops:
            with self.subTest(op=opname):
                op = getattr(mx, opname)

                vector = mx.arange(5)
                same(mx.vmap(op)(vector), op(vector))

                matrix = mx.arange(8).reshape(2, 4)
                same(mx.vmap(op)(matrix), op(matrix))
                same(mx.vmap(op, in_axes=1, out_axes=1)(matrix), op(matrix))

    def test_binary(self):
        # vmap of an elementwise binary op must match the direct call, also
        # when the two operands are mapped over different axes.
        ops = [
            "add",
            "divide",
            "equal",
            "greater",
            "greater_equal",
            "less",
            "less_equal",
            "logaddexp",
            "maximum",
            "minimum",
            "multiply",
            "power",
            "subtract",
            "logical_or",
            "logical_and",
        ]

        def same(result, reference):
            self.assertTrue(mx.array_equal(result, reference))

        for opname in ops:
            with self.subTest(op=opname):
                op = getattr(mx, opname)

                lhs = mx.random.uniform(shape=(5,))
                rhs = mx.random.uniform(shape=(5,))
                same(mx.vmap(op)(lhs, rhs), op(lhs, rhs))

                lhs = mx.random.uniform(shape=(2, 4))
                rhs = mx.random.uniform(shape=(2, 4))
                same(mx.vmap(op)(lhs, rhs), op(lhs, rhs))
                same(mx.vmap(op, in_axes=(0, 0), out_axes=0)(lhs, rhs), op(lhs, rhs))

                # Second operand mapped over axis 1: equivalent to transposing it.
                rhs = mx.random.uniform(shape=(4, 2))
                same(mx.vmap(op, in_axes=(0, 1), out_axes=0)(lhs, rhs), op(lhs, rhs.T))
                same(
                    mx.vmap(op, in_axes=(0, 1), out_axes=1)(lhs, rhs),
                    op(lhs, rhs.T).T,
                )

    def test_tree(self):
        # vmap over functions whose inputs and / or outputs are nested
        # containers of arrays, with in_axes / out_axes given as scalars or as
        # matching trees.
        def my_fun(tree):
            return (tree["a"] + tree["b"][0]) * tree["b"][1]

        tree = {
            "a": mx.random.uniform(shape=(2, 4)),
            "b": (
                mx.random.uniform(shape=(2, 4)),
                mx.random.uniform(shape=(2, 4)),
            ),
        }
        # The reference is computed once and reused for every axis spec below
        # that maps all leaves over axis 0.
        expected = my_fun(tree)
        out = mx.vmap(my_fun)(tree)
        self.assertTrue(mx.array_equal(out, expected))

        # An in_axes tree whose structure does not match the input is rejected.
        with self.assertRaises(ValueError):
            mx.vmap(my_fun, in_axes={"a": 0, "b": ((0, 0), 0)}, out_axes=0)(tree)

        # A single axis given for the whole "b" subtree.
        out = mx.vmap(my_fun, in_axes={"a": 0, "b": 0}, out_axes=0)(tree)
        self.assertTrue(mx.array_equal(out, expected))

        out = mx.vmap(my_fun, in_axes={"a": 0, "b": (0, 0)}, out_axes=0)(tree)
        self.assertTrue(mx.array_equal(out, expected))

        # Leaves of "b" mapped over axis 1: equivalent to transposing them.
        tree = {
            "a": mx.random.uniform(shape=(2, 4)),
            "b": (
                mx.random.uniform(shape=(4, 2)),
                mx.random.uniform(shape=(4, 2)),
            ),
        }
        out = mx.vmap(my_fun, in_axes={"a": 0, "b": (1, 1)}, out_axes=0)(tree)
        expected = (tree["a"] + tree["b"][0].T) * tree["b"][1].T
        self.assertTrue(mx.array_equal(out, expected))

        # Tree-valued outputs.
        def my_fun(x, y):
            return {"a": x + y, "b": x * y}

        x = mx.random.uniform(shape=(2, 4))
        y = mx.random.uniform(shape=(2, 4))
        out = mx.vmap(my_fun, in_axes=0, out_axes=0)(x, y)
        expected = my_fun(x, y)
        self.assertTrue(mx.array_equal(out["a"], expected["a"]))
        self.assertTrue(mx.array_equal(out["b"], expected["b"]))

        # out_axes must match the structure of the output tree.
        with self.assertRaises(ValueError):
            mx.vmap(my_fun, in_axes=0, out_axes=(0, 1))(x, y)

        with self.assertRaises(ValueError):
            mx.vmap(my_fun, in_axes=0, out_axes={"a": 0, "c": 1})(x, y)

        # Different output axis per leaf: "a" comes back transposed.
        out = mx.vmap(my_fun, in_axes=0, out_axes={"a": 1, "b": 0})(x, y)
        self.assertTrue(mx.array_equal(out["a"].T, expected["a"]))
        self.assertTrue(mx.array_equal(out["b"], expected["b"]))

    def test_vmap_indexing(self):
        # vmap over array indexing, mapping the source, the indices, or both.
        def take(src, idx):
            return src[idx]

        def take2(src, idx_a, idx_b):
            return src[idx_a, idx_b]

        def same(result, reference):
            self.assertTrue(mx.array_equal(result, reference))

        x = mx.arange(16).reshape(2, 2, 2, 2)
        inds = mx.array([[0, 1, 0], [1, 1, 0]])

        # Source and indices both mapped over their leading axis.
        same(
            mx.vmap(take, in_axes=(0, 0))(x, inds),
            mx.array(
                [
                    [[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 1], [2, 3]]],
                    [[[12, 13], [14, 15]], [[12, 13], [14, 15]], [[8, 9], [10, 11]]],
                ]
            ),
        )

        # Only the source is mapped; the full index array is reused each time.
        same(
            mx.vmap(take, in_axes=(0, None))(x, inds),
            mx.array(
                [
                    [
                        [[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[0, 1], [2, 3]]],
                        [[[4, 5], [6, 7]], [[4, 5], [6, 7]], [[0, 1], [2, 3]]],
                    ],
                    [
                        [[[8, 9], [10, 11]], [[12, 13], [14, 15]], [[8, 9], [10, 11]]],
                        [[[12, 13], [14, 15]], [[12, 13], [14, 15]], [[8, 9], [10, 11]]],
                    ],
                ]
            ),
        )

        # Only the indices are mapped; the whole source is indexed each time.
        same(
            mx.vmap(take, in_axes=(None, 0))(x, inds),
            mx.array(
                [
                    [
                        [[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
                        [[[8, 9], [10, 11]], [[12, 13], [14, 15]]],
                        [[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
                    ],
                    [
                        [[[8, 9], [10, 11]], [[12, 13], [14, 15]]],
                        [[[8, 9], [10, 11]], [[12, 13], [14, 15]]],
                        [[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
                    ],
                ]
            ),
        )

        # Two mapped index arrays addressing the first two source axes.
        inds2 = mx.array([[0, 1, 0], [0, 1, 0]])
        same(
            mx.vmap(take2, in_axes=(None, 0, 0))(x, inds, inds2),
            mx.array(
                [
                    [[[0, 1], [2, 3]], [[12, 13], [14, 15]], [[0, 1], [2, 3]]],
                    [[[8, 9], [10, 11]], [[12, 13], [14, 15]], [[0, 1], [2, 3]]],
                ]
            ),
        )

    def test_vmap_reduce(self):
        # Each case: (input shape, reduction, vmap kwargs, output shape, value).
        # The input is all ones, so every output element equals the number of
        # elements reduced per mapped slice.
        def sum_all(x):
            return x.sum()

        def sum_all_keepdims(x):
            return x.sum(keepdims=True)

        def sum_axis0(x):
            return x.sum(axis=0)

        def sum_axes01(x):
            return x.sum(axis=(0, 1))

        cases = [
            ((5, 5), sum_all, {}, (5,), 5),
            ((5, 5), sum_all_keepdims, {}, (5, 1), 5),
            ((5, 5), sum_axis0, {}, (5,), 5),
            ((5, 3, 2), sum_axes01, {}, (5,), 6),
            ((5, 3, 2), sum_axes01, {"in_axes": (1,)}, (3,), 10),
            ((5, 3, 2), sum_axes01, {"in_axes": (2,)}, (2,), 15),
        ]
        for in_shape, reduction, vmap_kwargs, out_shape, value in cases:
            ones = mx.ones(in_shape, mx.int32)
            out = mx.vmap(reduction, **vmap_kwargs)(ones)
            self.assertTrue(mx.array_equal(out, mx.full(out_shape, value)))

    def test_vmap_argreduce(self):
        # Per-row argmin / argmax through vmap.
        rows = mx.array([[1, 2, 3], [2, 3, 1]])
        for reducer, positions in ((mx.argmin, [0, 2]), (mx.argmax, [2, 1])):
            out = mx.vmap(lambda x: reducer(x))(rows)
            self.assertTrue(mx.array_equal(out, mx.array(positions)))

    def test_vmap_mean(self):
        # One level of vmap: mean of each row.
        matrix = mx.arange(8).reshape(2, 4)
        row_means = mx.vmap(mx.mean)(matrix)
        self.assertTrue(mx.allclose(row_means, mx.mean(matrix, axis=1)))

        # Two nested vmaps: mean over the last axis of a 3-D input.
        cube = mx.arange(16).reshape(2, 2, 4)
        inner_means = mx.vmap(mx.vmap(mx.mean))(cube)
        self.assertTrue(mx.allclose(inner_means, mx.mean(cube, axis=2)))

    def test_mismatch_input_sizes(self):
        # Inputs whose mapped axes have different lengths must be rejected.
        def add(x, y):
            return x + y

        a = mx.ones((10, 1))

        # Mapped axis 0 has length 10 for a but 1 for b.
        b = mx.ones((1, 1, 1, 5))
        with self.assertRaises(ValueError):
            mx.vmap(add)(a, b)

        # Mapped axis 0 of a has length 10, mapped axis 1 of b has length 5.
        b = mx.ones((10, 5))
        with self.assertRaises(ValueError):
            mx.vmap(add, in_axes=(0, 1))(a, b)

    def test_vmap_matmul(self):
        # vmap of matmul / addmm must agree with the equivalent batched call
        # on operands whose mapped axis has been moved to the front.
        lhs = mx.random.uniform(shape=(2, 3, 4))
        rhs = mx.random.uniform(shape=(4, 3))

        # matmul, mapping the leading axis of the left operand only
        batched = mx.vmap(mx.matmul, in_axes=(0, None))(lhs, rhs)
        self.assertTrue(mx.allclose(batched, lhs @ rhs))

        # addmm
        bias = mx.random.uniform(shape=(3,))
        batched = mx.vmap(mx.addmm, in_axes=(None, 0, None))(bias, lhs, rhs)
        self.assertTrue(mx.allclose(batched, mx.addmm(bias, lhs, rhs)))

        rhs = mx.random.uniform(shape=(4, 2))

        # matmul, mapping axis 1 of the left operand and emitting on axis 1
        batched = mx.vmap(mx.matmul, in_axes=(1, None), out_axes=(1,))(lhs, rhs)
        reference = mx.moveaxis(mx.moveaxis(lhs, 1, 0) @ rhs, 0, 1)
        self.assertTrue(mx.allclose(batched, reference))

        # addmm
        bias = mx.random.uniform(shape=(2,))
        batched = mx.vmap(mx.addmm, in_axes=(None, 1, None))(bias, lhs, rhs)
        reference = mx.addmm(bias, mx.moveaxis(lhs, 1, 0), rhs)
        self.assertTrue(mx.allclose(batched, reference))

        lhs = mx.random.uniform(shape=(2, 3, 4))
        rhs = mx.random.uniform(shape=(4, 2, 3))

        # matmul, both operands mapped (axis 0 and axis 1 respectively)
        batched = mx.vmap(mx.matmul, in_axes=(0, 1))(lhs, rhs)
        reference = lhs @ mx.moveaxis(rhs, 1, 0)
        self.assertTrue(mx.allclose(batched, reference))

        # addmm, all three operands mapped over different axes
        bias = mx.random.uniform(shape=(3, 3, 2))
        batched = mx.vmap(mx.addmm, in_axes=(2, 0, 1))(bias, lhs, rhs)
        reference = mx.addmm(mx.moveaxis(bias, 2, 0), lhs, mx.moveaxis(rhs, 1, 0))
        self.assertTrue(mx.allclose(batched, reference))

    def test_vmap_svd(self):
        # vmap over mx.linalg.svd on the CPU stream, with and without the
        # singular vectors, mapping over axis 0 and then axis 1.
        a = mx.random.uniform(shape=(3, 4, 2))

        def cpu_svd_full(x):
            return mx.linalg.svd(x, compute_uv=True, stream=mx.cpu)

        def cpu_svd_singular(x):
            return mx.linalg.svd(x, compute_uv=False, stream=mx.cpu)

        def verify(axis, count, rows, cols, matrix_at):
            # `count` matrices of shape (rows, cols) are decomposed;
            # matrix_at(i) returns the i-th of them from `a`.
            Us, Ss, Vts = mx.vmap(cpu_svd_full, in_axes=(axis,))(a)
            self.assertEqual(Us.shape, (count, rows, rows))
            self.assertEqual(Ss.shape, (count, cols))
            self.assertEqual(Vts.shape, (count, cols, cols))

            Sv = mx.vmap(cpu_svd_singular, in_axes=(axis,))(a)
            self.assertEqual(Sv.shape, (count, cols))

            for i in range(count):
                M = matrix_at(i)
                U, S, Vt = Us[i], Ss[i], Vts[i]
                # The factors must reconstruct the matrix.
                rebuilt = U[:, : len(S)] @ mx.diag(S) @ Vt
                self.assertTrue(mx.allclose(rebuilt, M, rtol=1e-5, atol=1e-7))
                # The norm of the singular values must equal the Frobenius
                # norm of the matrix.
                self.assertTrue(
                    mx.allclose(
                        mx.linalg.norm(Sv[i]),
                        mx.linalg.norm(M, ord="fro"),
                        rtol=1e-5,
                        atol=1e-7,
                    )
                )

        # Vmap over the first axis (this is already supported natively by the primitive).
        verify(0, a.shape[0], a.shape[1], a.shape[2], lambda i: a[i])

        # Vmap over the second axis.
        verify(1, a.shape[1], a.shape[0], a.shape[2], lambda i: a[:, i, :])

    def test_vmap_inverse(self):
        # vmap over mx.linalg.inv on the CPU stream; every matrix times its
        # computed inverse must be close to the identity.
        mx.random.seed(42)
        a = mx.random.uniform(shape=(3, 4, 4))

        def cpu_inv(x):
            return mx.linalg.inv(x, stream=mx.cpu)

        def is_identity(product, size):
            return mx.allclose(product, mx.eye(size), rtol=1e-4, atol=1e-5)

        # Vmap over the first axis (this is already supported natively by the primitive).
        invs = mx.vmap(cpu_inv, in_axes=(0,))(a)
        for i in range(a.shape[0]):
            self.assertTrue(is_identity(a[i] @ invs[i], a.shape[1]))

        a = mx.random.uniform(shape=(4, 3, 4))

        # Without vmapping, each input matrix is not square.
        with self.assertRaises(ValueError):
            mx.eval(cpu_inv(a))

        # Vmap over the second axis.
        invs = mx.vmap(cpu_inv, in_axes=(1,))(a)
        for i in range(a.shape[1]):
            self.assertTrue(is_identity(a[:, i, :] @ invs[i], a.shape[0]))

    def test_vmap_gather(self):
        # vmap over array indexing (gather) with one and then two index
        # arrays, mapping the source and / or the indices over various axes.
        def gather(a, idx):
            return a[idx]

        a = mx.array([[1, 2], [3, 4]])
        idx = mx.array(0)
        # Element 0 of every row.
        out = mx.vmap(gather, (0, None))(a, idx)
        self.assertTrue(mx.array_equal(out, mx.array([1, 3])))

        # Element 0 of every column.
        out = mx.vmap(gather, (1, None))(a, idx)
        self.assertTrue(mx.array_equal(out, mx.array([1, 2])))

        # One index per row.
        idx = mx.array([0, 1])
        out = mx.vmap(gather, (0, 0))(a, idx)
        self.assertTrue(mx.array_equal(out, mx.array([1, 4])))

        a = mx.ones((2, 3, 4))
        idx = mx.zeros(4, mx.int32)
        out = mx.vmap(gather, (2, 0))(a, idx)
        self.assertEqual(out.shape, (4, 3))

        # The vmapped callable can be stored and called later. (The original
        # also built an unused (0, None) wrapper here that was overwritten
        # immediately; it has been dropped.)
        f = mx.vmap(gather, (0, 0))
        out = f(mx.ones((2, 3, 4)), mx.zeros(2, dtype=mx.int32))
        self.assertEqual(out.shape, (2, 4))

        # Two index arrays addressing the first two axes of the source.
        def gather(a, idxa, idxb):
            return a[idxa, idxb]

        a = mx.ones((2, 3, 4))
        idxa = mx.zeros((2, 3), mx.int32)
        idxb = mx.zeros(3, mx.int32)
        out = mx.vmap(gather, (0, 0, None))(a, idxa, idxb)
        self.assertEqual(out.shape, (2, 3))

        idxa = mx.zeros((3, 1, 2), mx.int32)
        idxb = mx.zeros((2, 3, 1, 2), mx.int32)
        out = mx.vmap(gather, (0, None, 0))(a, idxa, idxb)
        self.assertEqual(out.shape, (2, 3, 1, 2))

        # Same as above with the second index array mapped over its last axis.
        idxa = mx.zeros((3, 1, 2), mx.int32)
        idxb = mx.zeros((3, 1, 2, 2), mx.int32)
        out = mx.vmap(gather, (0, None, 3))(a, idxa, idxb)
        self.assertEqual(out.shape, (2, 3, 1, 2))

    def test_vmap_scatter(self):
        # vmap over functions that write into an array through item assignment
        # or through the `.at[...]` interface. `scatter` is redefined several
        # times below with different signatures; each section uses the most
        # recent definition.

        # Overwrite position 0 of the (mapped) input with 0.0.
        def scatter(a):
            a[mx.array(0)] = mx.array(0.0)
            return a

        a = mx.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]])
        # Mapped over rows: the first element of every row is zeroed.
        out = mx.vmap(scatter)(a)
        expected = mx.array([[0.0, 2.0, 3.0], [0.0, 3.0, 4.0]])
        self.assertTrue(mx.allclose(out, expected))

        # Mapped over columns: the first element of every column is zeroed,
        # i.e. the whole first row.
        out = mx.vmap(scatter, in_axes=(1,), out_axes=1)(a)
        expected = mx.array([[0.0, 0.0, 0.0], [2.0, 3.0, 4.0]])
        self.assertTrue(mx.allclose(out, expected))

        # Add 1.0 at position 0 of the (mapped) input.
        def scatter_add(a):
            return a.at[mx.array(0)].add(mx.array(1.0))

        a = mx.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]])
        out = mx.vmap(scatter_add)(a)
        expected = mx.array([[2.0, 2.0, 3.0], [3.0, 3.0, 4.0]])
        self.assertTrue(mx.allclose(out, expected))

        out = mx.vmap(scatter_add, in_axes=(1,), out_axes=1)(a)
        expected = mx.array([[2.0, 3.0, 4.0], [2.0, 3.0, 4.0]])
        self.assertTrue(mx.allclose(out, expected))

        # Multiple indices
        def scatter(a):
            a[mx.array([0, 1]), mx.array([0, 1])] = mx.array((1.0, 1.0))
            return a

        a = mx.zeros((3, 3, 3))

        # Reference for axis 0: the un-mapped result on one 3x3 slice, repeated
        # three times along a new leading axis.
        expected = mx.repeat(scatter(mx.zeros((3, 3)))[None], 3, axis=0)
        out = mx.vmap(scatter, in_axes=(0,), out_axes=0)(a)
        self.assertTrue(mx.allclose(out, expected))

        # Mapping over axis 1: the written positions are (0, 0) and (1, 1) in
        # the remaining axes (0, 2), for every index along axis 1.
        expected = mx.zeros((3, 3, 3))
        expected[0, :, 0] = 1
        expected[1, :, 1] = 1
        out = mx.vmap(scatter, in_axes=(1,), out_axes=1)(a)
        self.assertTrue(mx.allclose(out, expected))

        # Mapping over axis 2: positions (0, 0) and (1, 1) in axes (0, 1), for
        # every index along axis 2.
        expected = mx.zeros((3, 3, 3))
        expected[0, 0, :] = 1
        expected[1, 1, :] = 1
        out = mx.vmap(scatter, in_axes=(2,), out_axes=2)(a)
        self.assertTrue(mx.allclose(out, expected))

        # vmap over src and indices
        def scatter(a, idx):
            a[idx] = mx.array(1.0)
            return a

        a = mx.zeros((3, 4))
        idx = mx.array([0, 1, 2])
        # Row i gets a 1 at column idx[i] = i, which is a 3x4 identity pattern.
        out = mx.vmap(scatter, in_axes=(0, 0), out_axes=0)(a, idx)
        self.assertTrue(mx.allclose(out, mx.eye(n=3, m=4)))

        # vmap over only indices
        # The same `a` is passed again; the expected values below are built
        # from zeros, so this depends on `a` still being all zeros here.
        out = mx.vmap(scatter, in_axes=(None, 0), out_axes=0)(a, idx)
        expected = mx.zeros((3, 3, 4))
        expected[0, 0] = 1
        expected[1, 1] = 1
        expected[2, 2] = 1
        self.assertTrue(mx.allclose(out, expected))

        # vmap over src, indices, updates
        def scatter(a, idx, updates):
            a[idx] = updates
            return a

        a = mx.zeros((3, 4))
        idx = mx.array([0, 1, 2])
        updates = mx.array([1, 2, 3])
        out = mx.vmap(scatter, in_axes=(0, 0, 0), out_axes=0)(a, idx, updates)
        # diag(..., k=-1) is 4x4 with 1, 2, 3 below the main diagonal; dropping
        # its first row leaves a 3x4 matrix with updates[i] at (i, i).
        expected = mx.diag(mx.array([1, 2, 3]), k=-1)[1:]
        self.assertTrue(mx.allclose(out, expected))

        # vmap over only updates
        def scatter(a, idx, updates):
            a[idx] = updates
            return a

        a = mx.zeros((3, 4))
        idx = mx.array([0])
        updates = mx.array([1, 2, 3])
        out = mx.vmap(scatter, in_axes=(None, None, 0), out_axes=0)(a, idx, updates)
        # For each update value, row 0 of a fresh copy of `a` is filled with
        # that value.
        expected = mx.zeros((3, 3, 4))
        expected[:, 0] = mx.array([1, 2, 3])[:, None]
        self.assertTrue(mx.allclose(out, expected))

    def test_vmap_const_func(self):
        # A function that ignores its inputs still has to produce one output
        # per mapped element, so the result length follows the mapped axis.
        lhs = mx.random.uniform(shape=(2, 3, 4))
        rhs = mx.random.uniform(shape=(4, 3))

        def const_func(a, b):
            return mx.array(2)

        # (in_axes, expected size of the mapped axis)
        valid_cases = [
            ((0, None), 2),
            ((None, 0), 4),
            ((1, 1), 3),
        ]
        for axes, batch in valid_cases:
            result = mx.vmap(const_func, in_axes=axes)(lhs, rhs)
            self.assertTrue(mx.array_equal(mx.full((batch,), 2), result))

        # No mapped input at all, and mapped axes of different sizes
        # (2 vs. 4), are both expected to be rejected.
        for axes in [(None, None), (0, 0)]:
            with self.assertRaises(ValueError):
                mx.vmap(const_func, in_axes=axes)(lhs, rhs)

    def test_vmap_concatenate(self):
        data = mx.random.uniform(shape=(2, 2, 2))

        def cat_fun(x, y):
            return mx.concatenate([x, y], axis=1)

        def cat_constant(x):
            return mx.concatenate([x, mx.ones((2, 1))], 1)

        # Two mapped inputs, batched along different axes (0 and 2): compare
        # against concatenating the corresponding slices one at a time.
        slices = []
        for i in range(2):
            slices.append(mx.concatenate([data[i], data[:, :, i]], axis=1))
        expected = mx.stack(slices)
        batched = mx.vmap(cat_fun, in_axes=(0, 2))(data, data)
        self.assertTrue(mx.array_equal(batched, expected))

        # A mapped input concatenated with a constant created inside the
        # mapped function.
        expected = mx.concatenate([data, mx.ones((2, 2, 1))], axis=2)
        batched = mx.vmap(cat_constant)(data)
        self.assertTrue(mx.array_equal(batched, expected))

    def test_vmap_take_along_axis(self):
        def fun(a, idx):
            return mx.take_along_axis(a, idx, axis=0)

        # (source shape, index shape, in_axes); an axis of None means that
        # argument is shared by every mapped call.
        cases = [
            ((4, 5, 1), (2, 4, 1), (0, 1)),
            ((4, 5, 1), (2, 1), (0, None)),
            ((5, 1), (4, 2, 1), (None, 0)),
        ]
        for src_shape, idx_shape, axes in cases:
            src = mx.zeros(src_shape)
            indices = mx.zeros(idx_shape, mx.int32)
            taken = mx.vmap(fun, in_axes=axes)(src, indices)
            # Every combination yields the same batched output shape.
            self.assertEqual(taken.shape, (4, 2, 1))

    def test_vmap_put_along_axis(self):
        def fun(a, idx, upd):
            return mx.put_along_axis(a, idx, upd, axis=0)

        # (destination shape, index shape, update shape, in_axes); an axis of
        # None means that argument is shared by every mapped call.
        cases = [
            ((4, 5, 1), (2, 4, 1), (2, 4, 1), (0, 1, 1)),
            ((4, 5, 1), (2, 4, 1), (2, 1), (0, 1, None)),
            ((4, 5, 1), (2, 1), (2, 1), (0, None, None)),
            ((5, 1), (2, 4, 1), (2, 4, 1), (None, 1, 1)),
        ]
        for dst_shape, idx_shape, upd_shape, axes in cases:
            dst = mx.zeros(dst_shape)
            indices = mx.ones(idx_shape, mx.int32)
            updates = mx.ones(upd_shape)
            result = mx.vmap(fun, in_axes=axes)(dst, indices, updates)
            # Every combination yields the same batched output shape.
            self.assertEqual(result.shape, (4, 5, 1))

    def test_vmap_split_vmap(self):
        # Split into two halves along axis 1 and concatenate them back in
        # swapped order; the shape must be unchanged, with or without vmap.
        def fun(x):
            halves = mx.split(x, 2, 1)
            first, second = halves
            return mx.concatenate([second, first], 1)

        unbatched = fun(mx.ones((5, 6, 7)))
        batched = mx.vmap(fun, in_axes=1)(mx.ones((5, 4, 6, 7)))

        self.assertEqual(unbatched.shape, (5, 6, 7))
        # The mapped axis (size 4) is moved to the front of the output.
        self.assertEqual(batched.shape, (4, 5, 6, 7))

    def test_leaks(self):
        # Checks that active memory returns to its starting value after
        # repeatedly creating a vmapped closure and the dict it reads from,
        # which reference each other (d["f"] wraps f, and f reads d).
        gc.collect()
        mx.synchronize()
        # Memory is only measured when Metal is available; otherwise both
        # readings are 0 and the final assertion holds trivially.
        if mx.metal.is_available():
            mem_pre = mx.get_active_memory()
        else:
            mem_pre = 0

        def outer():
            d = {}

            def f(x):
                return d["x"]

            # The dict holds both the transformed function and an array, and
            # the function closes over the dict.
            d["f"] = mx.vmap(f)
            d["x"] = mx.array([0] * 1000)

        for _ in range(5):
            outer()
            gc.collect()

        mx.synchronize()
        if mx.metal.is_available():
            mem_post = mx.get_active_memory()
        else:
            mem_post = 0

        self.assertEqual(mem_pre, mem_post)

    def test_vmap_flatten(self):
        # Flattening the first two axes of each mapped slice merges the two
        # axes that remain once the mapped axis is removed.
        def fun(x):
            return mx.flatten(x, 0, 1)

        cube = mx.zeros((2, 3, 4))

        # Default mapping axis.
        default_mapped = mx.vmap(fun)(cube)
        self.assertEqual(default_mapped.shape, (2, 12))

        # Explicit mapping over the other two axes.
        for axis, expected_shape in ((1, (3, 8)), (2, (4, 6))):
            mapped = mx.vmap(fun, in_axes=(axis,))(cube)
            self.assertEqual(mapped.shape, expected_shape)

    def test_vmap_conv(self):
        # Compares vmapped mx.conv1d against stacking per-element conv1d
        # results, mapping over the input, the weights, or both.

        # vmap input only
        x = mx.random.uniform(shape=(2, 2, 5, 4))
        w = mx.random.uniform(shape=(8, 3, 4))

        expected = mx.stack([mx.conv1d(xi, w) for xi in x])
        out = mx.vmap(mx.conv1d, in_axes=(0, None))(x, w)
        self.assertTrue(mx.allclose(expected, out))

        # Same data with the mapped axis moved from position 0 to position 2.
        x = mx.moveaxis(x, 0, 2)
        out = mx.vmap(mx.conv1d, in_axes=(2, None))(x, w)
        self.assertTrue(mx.allclose(expected, out))

        # vmap weights only
        x = mx.random.uniform(shape=(2, 5, 4))
        w = mx.random.uniform(shape=(3, 8, 3, 4))

        expected = mx.stack([mx.conv1d(x, wi) for wi in w])
        out = mx.vmap(mx.conv1d, in_axes=(None, 0))(x, w)
        self.assertTrue(mx.allclose(expected, out))

        # Same weights with the mapped axis moved from position 0 to position 1.
        w = mx.moveaxis(w, 0, 1)
        out = mx.vmap(mx.conv1d, in_axes=(None, 1))(x, w)
        self.assertTrue(mx.allclose(expected, out))

        # vmap weights and input
        x = mx.random.uniform(shape=(3, 2, 5, 4))
        w = mx.random.uniform(shape=(3, 8, 3, 4))

        expected = mx.stack([mx.conv1d(xi, wi) for xi, wi in zip(x, w)])
        out = mx.vmap(mx.conv1d, in_axes=(0, 0))(x, w)
        self.assertTrue(mx.allclose(expected, out))

        # Both mapped, along non-leading axes: axis 1 of the input and the
        # last axis of the weights.
        x = mx.random.uniform(shape=(2, 3, 5, 4))
        w = mx.random.uniform(shape=(8, 3, 4, 3))

        expected = mx.stack([mx.conv1d(x[:, i], w[..., i]) for i in range(3)])
        out = mx.vmap(mx.conv1d, in_axes=(1, 3))(x, w)
        self.assertTrue(mx.allclose(expected, out))

        # Test with groups
        x = mx.random.uniform(shape=(3, 2, 5, 8))
        w = mx.random.uniform(shape=(3, 2, 3, 4))

        def gconv(x, w):
            return mx.conv1d(x, w, groups=2)

        expected = mx.stack([gconv(xi, wi) for xi, wi in zip(x, w)])
        out = mx.vmap(gconv, in_axes=(0, 0))(x, w)
        self.assertTrue(mx.allclose(expected, out))

    def test_vmap_types(self):
        # Checks that a vmapped function returns the same container type the
        # original function returns: a plain tuple, a NamedTuple, and a tuple
        # subclass.

        from typing import NamedTuple

        class Vector(tuple):
            pass

        class State(NamedTuple):
            a: mx.array
            b: mx.array

        def transform(x: State):
            return State(x.a + 10, x.b * 10)

        def transform_tuple(t):
            return (t[0] + 10, t[1] * 10)

        def transform_vector(t):
            return Vector([t[0] + 10, t[1] * 10])

        # NOTE(review): `x` is not used by any statement below.
        x = State(mx.array(1), mx.array(2))

        vmap_transform = mx.vmap(transform)
        vmap_transform_tuple = mx.vmap(transform_tuple)
        vmap_transform_vector = mx.vmap(transform_vector)

        # Plain tuple in, plain tuple out.
        x_batch_tuple = (mx.array([1, 2, 3]), mx.array([4, 5, 6]))
        out1 = vmap_transform_tuple(x_batch_tuple)

        self.assertTrue(isinstance(out1, tuple))
        self.assertTrue(mx.array_equal(out1[0], mx.array([11, 12, 13])))
        self.assertTrue(mx.array_equal(out1[1], mx.array([40, 50, 60])))

        # NamedTuple in, NamedTuple out (fields accessed by name).
        x_batch = State(mx.array([1, 2, 3]), mx.array([4, 5, 6]))
        out2 = vmap_transform(x_batch)
        self.assertTrue(isinstance(out2, State))
        self.assertTrue(mx.array_equal(out2.a, mx.array([11, 12, 13])))
        self.assertTrue(mx.array_equal(out2.b, mx.array([40, 50, 60])))

        # Tuple subclass in, tuple subclass out.
        x_batch_vector = Vector([mx.array([1, 2, 3]), mx.array([4, 5, 6])])
        out3 = vmap_transform_vector(x_batch_vector)
        self.assertTrue(isinstance(out3, Vector))
        self.assertTrue(mx.array_equal(out3[0], mx.array([11, 12, 13])))
        self.assertTrue(mx.array_equal(out3[1], mx.array([40, 50, 60])))

    def test_vmap_masked_scatter(self):
        # Exercises vmap over boolean-mask assignment (`x[m] = src`) with each
        # combination of batched / shared destination, mask and source.
        def scatter_fn(x, m, src):
            x[m] = src
            return x

        # Batched sources
        a = mx.array([[10, 20, 30, 40], [50, 60, 70, 80]])
        mask = mx.array([[False, True, True, True], [True, False, True, True]])
        src = mx.array([[1, 2, 3], [4, 5, 6]])

        expected = mx.array([[10, 1, 2, 3], [4, 60, 5, 6]])
        vmap_scatter = mx.vmap(scatter_fn, in_axes=(0, 0, 0))
        out = vmap_scatter(a, mask, src)
        self.assertTrue(mx.array_equal(expected, out))

        # Shared source across batch (matching mask populations)
        a = mx.array([[0, 0, 0], [5, 5, 5]])
        mask = mx.array([[True, False, True], [False, True, True]])
        src = mx.array([9, 8])

        expected = mx.array([[9, 0, 8], [5, 9, 8]])
        vmap_scatter = mx.vmap(scatter_fn, in_axes=(0, 0, None))
        out = vmap_scatter(a, mask, src)
        self.assertTrue(mx.array_equal(expected, out))

        # Shared destination with batched mask and sources
        a = mx.array([10, 20, 30, 40])
        mask = mx.array([[True, False, False, True], [False, True, True, False]])
        src = mx.array([[1, 2], [3, 4]])

        expected = mx.array([[1, 20, 30, 2], [10, 3, 4, 40]])
        vmap_scatter = mx.vmap(scatter_fn, in_axes=(None, 0, 0))
        out = vmap_scatter(a, mask, src)
        self.assertTrue(mx.array_equal(expected, out))

        # Shared mask across batch with batched sources
        a = mx.array([[0, 0, 0, 0], [10, 20, 30, 40]])
        mask = mx.array([True, False, True, False])
        src = mx.array([[7, 8], [9, 10]])

        expected = mx.array([[7, 0, 8, 0], [9, 20, 10, 40]])
        vmap_scatter = mx.vmap(scatter_fn, in_axes=(0, None, 0))
        out = vmap_scatter(a, mask, src)
        self.assertTrue(mx.array_equal(expected, out))

        # Uneven mask populations with scalar broadcast
        a = mx.array([[0.0, 0.0, 0.0, 0.0], [10.0, 20.0, 30.0, 40.0]])
        mask = mx.array([[True, False, True, True], [False, True, False, False]])
        shared_src = mx.array(1.5)

        expected = mx.array(
            [[1.5, 0.0, 1.5, 1.5], [10.0, 1.5, 30.0, 40.0]], dtype=a.dtype
        )
        vmap_scatter = mx.vmap(scatter_fn, in_axes=(0, 0, None))
        out = vmap_scatter(a, mask, shared_src)
        self.assertTrue(mx.array_equal(expected, out))

        # Shared src with identical masks must restart for each batch
        # (both rows consume src from its first element: 1, 2, 3)
        a = mx.array([[0, 0, 0, 0, 0], [10, 20, 30, 40, 50]])
        mask = mx.array(
            [[True, True, True, False, False], [True, True, True, False, False]]
        )
        src = mx.array([1, 2, 3, 4, 5])

        expected = mx.array([[1, 2, 3, 0, 0], [1, 2, 3, 40, 50]])
        vmap_scatter = mx.vmap(scatter_fn, in_axes=(0, 0, None))
        out = vmap_scatter(a, mask, src)
        self.assertTrue(mx.array_equal(expected, out))

        # Double vmap
        # The inner vmap shares src across its batch; the outer vmap maps all
        # three arguments. The reference is built with explicit nested loops.
        a = mx.zeros((8, 8, 8))
        mask = mx.random.normal((8, 8, 8)) > 0
        src = mx.random.normal((8, 8))
        # NOTE(review): the `+ 0` presumably creates a fresh array so that the
        # in-place assignment inside scatter_fn does not modify `a` -- confirm.
        expected = mx.stack(
            [
                mx.stack(
                    [scatter_fn(a[i, j] + 0, mask[i, j], src[i]) for j in range(8)]
                )
                for i in range(8)
            ]
        )
        double_scatter = mx.vmap(
            mx.vmap(scatter_fn, in_axes=(0, 0, None)), in_axes=(0, 0, 0)
        )
        out = double_scatter(a + 0, mask, src)
        self.assertTrue(mx.array_equal(expected, out))


# When this file is executed directly, hand control to the MLX test runner.
if __name__ == "__main__":
    mlx_tests.MLXTestRunner()


================================================
FILE: setup.py
================================================
# Copyright © 2023 Apple Inc.

import datetime
import os
import platform
import re
import subprocess
from functools import partial
from pathlib import Path

from setuptools import Extension, find_namespace_packages, setup
from setuptools.command.bdist_wheel import bdist_wheel
from setuptools.command.build_ext import build_ext


def cuda_toolkit_major_version():
    """Return the major version reported by ``nvcc --version``.

    Returns:
        The integer following the word ``release`` in nvcc's output, or
        ``None`` when the output contains no such token.

    Raises:
        FileNotFoundError / subprocess.CalledProcessError: propagated from
            ``subprocess.check_output`` when nvcc cannot be run or fails.
    """
    # stderr is merged into stdout so the banner is captured either way.
    banner = subprocess.check_output(
        ["nvcc", "--version"], stderr=subprocess.STDOUT
    ).decode()
    match = re.search(r"release (\d+)", banner)
    return int(match.group(1)) if match else None


def get_version():
    """Build the package version string from ``mlx/version.h`` and the environment.

    The base ``major.minor.patch`` comes from the ``MLX_VERSION_*`` defines in
    ``mlx/version.h`` (path relative to the current working directory).

    * ``PYPI_RELEASE`` set and ``DEV_RELEASE`` unset: the base version.
    * ``DEV_RELEASE`` set: base version plus ``.devYYYYMMDD``.
    * neither set: base version plus ``.devYYYYMMDD+<short git hash>``.

    Raises:
        subprocess.CalledProcessError: if ``git rev-parse`` fails when the
            git hash is needed.
    """
    with open("mlx/version.h", "r") as header:
        for line in header:
            # The value is the last whitespace-separated token of the define.
            if "#define MLX_VERSION_MAJOR" in line:
                major = line.split()[-1]
            if "#define MLX_VERSION_MINOR" in line:
                minor = line.split()[-1]
            if "#define MLX_VERSION_PATCH" in line:
                patch = line.split()[-1]
    version = ".".join((major, minor, patch))

    pypi_release = int(os.environ.get("PYPI_RELEASE", 0))
    dev_release = int(os.environ.get("DEV_RELEASE", 0))

    # A plain PyPI release carries the bare header version.
    if pypi_release and not dev_release:
        return version

    # Everything else is stamped with today's date.
    today = datetime.date.today()
    version += f".dev{today.year}{today.month:02d}{today.day:02d}"
    if dev_release:
        return version

    # Neither flag set: also record the current commit as a local version.
    rev_parse = subprocess.run(
        "git rev-parse --short HEAD".split(),
        capture_output=True,
        check=True,
    )
    git_hash = rev_parse.stdout.strip().decode()
    return f"{version}+{git_hash}"


# Build configuration derived from the environment and the host platform.
# MLX_BUILD_STAGE selects the release build stage (0 when unset).
build_stage = int(os.environ.get("MLX_BUILD_STAGE", 0))
build_macos = platform.system() == "Darwin"
# The CUDA build is detected from the literal flag inside CMAKE_ARGS.
build_cuda = "MLX_BUILD_CUDA=ON" in os.environ.get("CMAKE_ARGS", "")


# A CMakeExtension needs a sourcedir instead of a file list.
# The name must be the _single_ output extension from the CMake build.
# If you need multiple extensions, see scikit-build.
class CMakeExtension(Extension):
    """Extension with no source list that records a CMake source directory."""

    def __init__(self, name: str, sourcedir: str = "") -> None:
        # The source list is left empty; only the directory is recorded.
        Extension.__init__(self, name, sources=[])
        source_root = Path(sourcedir).resolve()
        self.sourcedir = os.fspath(source_root)


class CMakeBuild(build_ext):
    """``build_ext`` command that configures, builds and installs via CMake."""

    def build_extension(self, ext: CMakeExtension) -> None:
        """Configure and build ``ext`` with CMake in a per-extension build dir.

        Environment variables read: ``DEBUG`` (when the command's own debug
        flag is unset), ``CMAKE_ARGS``, ``ARCHFLAGS`` (macOS only) and
        ``CMAKE_BUILD_PARALLEL_LEVEL``. Sets ``CCACHE_BASEDIR`` and
        ``CCACHE_NOHASHDIR`` in the current process environment.

        Raises:
            subprocess.CalledProcessError: if either CMake invocation fails.
        """
        # Must be in this form due to bug in .resolve() only fixed in Python 3.10+
        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)  # type: ignore[no-untyped-call]
        extdir = ext_fullpath.parent.resolve()

        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        build_temp = Path(self.build_temp) / ext.name
        if not build_temp.exists():
            build_temp.mkdir(parents=True)

        # By default both the MLX libraries and the Python bindings land in
        # the extension directory; each release stage redirects one of them
        # to the temporary build dir so it is left out of the wheel.
        install_prefix = extdir
        pybind_out_dir = extdir
        if build_stage == 1:
            # Don't include MLX libraries in the wheel
            install_prefix = build_temp
        elif build_stage == 2:
            # Don't include Python bindings in the wheel
            pybind_out_dir = build_temp
        cmake_args = [
            f"-DCMAKE_INSTALL_PREFIX={install_prefix}",
            f"-DMLX_PYTHON_BINDINGS_OUTPUT_DIRECTORY={pybind_out_dir}",
            f"-DCMAKE_BUILD_TYPE={cfg}",
            "-DMLX_BUILD_PYTHON_BINDINGS=ON",
            "-DMLX_BUILD_TESTS=OFF",
            "-DMLX_BUILD_BENCHMARKS=OFF",
            "-DMLX_BUILD_EXAMPLES=OFF",
            "-DBUILD_SHARED_LIBS=ON",
        ]
        if build_stage == 2 and build_cuda:
            # Last arch is always real and virtual for forward-compatibility
            cuda_archs = ";".join(
                (
                    "75-real",
                    "80-real",
                    "90a-real",
                    "100a-real",
                    "120a-real",
                    "120-virtual",
                )
            )
            cmake_args += [f"-DMLX_CUDA_ARCHITECTURES={cuda_archs}"]
            # Search CUDA libs from python packages.
            cmake_args += ["-DMLX_LOAD_CUDA_LIBS_FROM_PYTHON=ON"]

        # Some generators require explicitly passing config when building.
        build_args = ["--config", cfg]
        # Adding CMake arguments set as environment variable
        # (needed e.g. to build for ARM OSx on conda-forge)
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        # Pass version to C++
        cmake_args += [f"-DMLX_VERSION={self.distribution.get_version()}"]  # type: ignore[attr-defined]

        if build_macos:
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # os.cpu_count() returns None when the count cannot be determined;
            # in that case leave the job count to the build tool's default
            # instead of passing the invalid flag "-jNone".
            cpu_count = os.cpu_count()
            if cpu_count:
                build_args += [f"-j{cpu_count}"]

        # Avoid cache miss when building from temporary dirs.
        os.environ["CCACHE_BASEDIR"] = os.path.realpath(self.build_temp)
        os.environ["CCACHE_NOHASHDIR"] = "true"

        # Configure step, then build + install in the same build directory.
        subprocess.run(
            ["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True
        )
        subprocess.run(
            ["cmake", "--build", ".", "--target", "install", *build_args],
            cwd=build_temp,
            check=True,
        )

    # Make sure to copy mlx.metallib for inplace builds
    def run(self):
        """Run the regular build, mirror it for in-place builds, install stubs.

        Raises:
            StopIteration: if no extension named ``mlx.core`` is registered.
            subprocess.CalledProcessError: if the stub install step fails.
        """
        super().run()

        ext = next(ext for ext in self.extensions if ext.name == "mlx.core")

        # Based on https://github.com/pypa/setuptools/blob/main/setuptools/command/build_ext.py#L102
        if self.inplace:
            # Resolve inplace package dir
            build_py = self.get_finalized_command("build_py")
            inplace_file, regular_file = self._get_inplace_equivalent(build_py, ext)

            inplace_dir = str(Path(inplace_file).parent.resolve())
            regular_dir = str(Path(regular_file).parent.resolve())

            # Copy the whole package directory, not only the extension module.
            self.copy_tree(regular_dir, inplace_dir)

        # Build type stubs.
        build_temp = Path(self.build_temp) / ext.name
        subprocess.run(
            ["cmake", "--install", build_temp, "--component", "core_stub"],
            check=True,
        )


class MLXBdistWheel(bdist_wheel):
    """``bdist_wheel`` variant that adjusts the wheel tag for stage-2 builds."""

    def get_tag(self) -> tuple[str, str, str]:
        impl, abi, plat_name = super().get_tag()
        if build_stage != 2:
            return (impl, abi, plat_name)
        # Stage 2: keep only the platform tag from the default computation;
        # use the generic python tag and no ABI tag.
        return (self.python_tag, "none", plat_name)


# The README next to this file is passed to setup() as the long description.
long_description = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")


# Package definition. Depending on MLX_BUILD_STAGE this produces either the
# main `mlx` distribution or a backend-specific one (see the notes below).
if __name__ == "__main__":
    package_dir = {"": "python"}
    packages = find_namespace_packages(
        where="python",
        exclude=[
            "src",
            "tests",
            "scripts",
            "mlx.lib",
            "mlx.include",
            "mlx.share",
            "mlx.share.**",
            "mlx.include.**",
        ],
    )

    version = get_version()

    # Arguments shared by both setup() calls below; each call adds the
    # distribution name and its own package / dependency lists.
    _setup = partial(
        setup,
        version=version,
        author="MLX Contributors",
        author_email="mlx@group.apple.com",
        description="A framework for machine learning on Apple silicon.",
        long_description=long_description,
        long_description_content_type="text/markdown",
        license="MIT",
        url="https://github.com/ml-explore/mlx",
        include_package_data=True,
        package_dir=package_dir,
        zip_safe=False,
        python_requires=">=3.10",
        ext_modules=[CMakeExtension("mlx.core")],
        cmdclass={
            "build_ext": CMakeBuild,
            "bdist_wheel": MLXBdistWheel,
        },
    )

    package_data = {"mlx.core": ["*.pyi"]}

    extras = {
        "dev": [
            "numpy>=2",
            "pre-commit",
            "psutil",
            "torch>=2.9",
            "typing_extensions",
        ],
    }
    entry_points = {
        "console_scripts": [
            "mlx.launch = mlx._distributed_utils.launch:main",
            "mlx.distributed_config = mlx._distributed_utils.config:main",
        ]
    }
    install_requires = []

    # Release builds for PyPI are in two stages.
    # Each stage should be run from a clean build:
    #   python setup.py clean --all
    #
    # Stage 1:
    #  - Triggered with `MLX_BUILD_STAGE=1`
    #  - Include everything except backend-specific binaries (e.g. libmlx.so, mlx.metallib, etc)
    #  - Wheel has Python ABI and platform tags
    #  - Wheel should be built for the cross-product of python version and platforms
    #  - Package name is mlx and it depends on subpackage in stage 2 (e.g. mlx-metal)
    # Stage 2:
    #  - Triggered with `MLX_BUILD_STAGE=2`
    #  - Includes only backend-specific binaries (e.g. libmlx.so, mlx.metallib, etc)
    #  - Wheel has only platform tags
    #  - Wheel should be built only for different platforms
    #  - Package name is back-end specific, e.g. mlx-metal
    if build_stage != 2:
        # Stage 0 (default, single self-contained build) or stage 1.
        if build_stage == 1:
            # The backend packages are pinned to exactly this version.
            install_requires.append(
                f'mlx-metal=={version}; platform_system == "Darwin"'
            )
            # The unversioned "cuda" extra maps to the CUDA 12 package.
            extras["cuda"] = [f'mlx-cuda-12=={version}; platform_system == "Linux"']
            for toolkit in [12, 13]:
                extras[f"cuda{toolkit}"] = [
                    f'mlx-cuda-{toolkit}=={version}; platform_system == "Linux"'
                ]
            extras["cpu"] = [f'mlx-cpu=={version}; platform_system == "Linux"']

        _setup(
            name="mlx",
            packages=packages,
            extras_require=extras,
            entry_points=entry_points,
            install_requires=install_requires,
            package_data=package_data,
        )
    else:
        # Stage 2: the distribution name depends on the backend being built.
        if build_macos:
            name = "mlx-metal"
        elif build_cuda:
            toolkit = cuda_toolkit_major_version()
            name = f"mlx-cuda-{toolkit}"
            # Note: update following files when new dependency is added:
            # * .github/actions/build-cuda-release/action.yml
            # * mlx/backend/cuda/CMakeLists.txt
            if toolkit == 12:
                install_requires += [
                    "nvidia-cublas-cu12==12.9.*",
                    "nvidia-cuda-nvrtc-cu12==12.9.*",
                ]
            elif toolkit == 13:
                install_requires += [
                    "nvidia-cublas",
                    "nvidia-cuda-nvrtc",
                ]
            else:
                # Also reached when nvcc's version could not be parsed
                # (toolkit is None).
                raise ValueError(f"Unknown toolkit {toolkit}")
            install_requires += [
                f"nvidia-cudnn-cu{toolkit}==9.*",
                f"nvidia-nccl-cu{toolkit}",
            ]

        else:
            name = "mlx-cpu"
        _setup(
            name=name,
            packages=["mlx"],
            install_requires=install_requires,
        )


================================================
FILE: tests/CMakeLists.txt
================================================
# Fetch doctest, which the `tests` executable below links against.
FetchContent_Declare(
  doctest
  GIT_REPOSITORY https://github.com/onqtam/doctest.git
  GIT_TAG v2.4.12)
FetchContent_MakeAvailable(doctest)

add_executable(tests ${PROJECT_SOURCE_DIR}/tests/tests.cpp)

# gpu_tests.cpp is added when either the Metal or the CUDA backend is built
# (the variable keeps its METAL_ prefix in both cases).
if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
  set(METAL_TEST_SOURCES gpu_tests.cpp)
endif()

# Provides doctest_discover_tests(), used further down.
include(${doctest_SOURCE_DIR}/scripts/cmake/doctest.cmake)

target_sources(
  tests
  PRIVATE allocator_tests.cpp
          array_tests.cpp
          arg_reduce_tests.cpp
          autograd_tests.cpp
          blas_tests.cpp
          compile_tests.cpp
          custom_vjp_tests.cpp
          creations_tests.cpp
          device_tests.cpp
          einsum_tests.cpp
          export_import_tests.cpp
          eval_tests.cpp
          fft_tests.cpp
          load_tests.cpp
          ops_tests.cpp
          random_tests.cpp
          scheduler_tests.cpp
          utils_tests.cpp
          vmap_tests.cpp
          linalg_tests.cpp
          ${METAL_TEST_SOURCES})

target_link_libraries(tests PRIVATE mlx doctest)
# Sanitizer flags are defined outside this file; they expand to nothing when
# unset.
target_compile_options(tests PRIVATE ${SANITIZER_COMPILE_FLAGS})
target_link_options(tests PRIVATE ${SANITIZER_LINK_FLAGS})

# Register the discovered doctest cases with CTest, plus one CTest entry that
# runs the whole test binary.
doctest_discover_tests(tests)
add_test(NAME tests COMMAND tests)

# Standalone test: verify clean exit when GPU work is in-flight during teardown.
# (Cannot be a doctest case because the crash occurs during static destruction.)
add_executable(test_teardown test_teardown.cpp)
target_link_libraries(test_teardown PRIVATE mlx)
target_compile_options(test_teardown PRIVATE ${SANITIZER_COMPILE_FLAGS})
target_link_options(test_teardown PRIVATE ${SANITIZER_LINK_FLAGS})
add_test(NAME teardown COMMAND test_teardown)


================================================
FILE: tests/allocator_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <stdexcept>

#include "doctest/doctest.h"

#include "mlx/allocator.h"

using namespace mlx::core;

TEST_CASE("test simple allocations") {
  // One float: store a value through the raw pointer and read it back.
  {
    auto buf = allocator::malloc(sizeof(float));
    float* value = static_cast<float*>(buf.raw_ptr());
    *value = 0.5f;
    CHECK_EQ(*value, 0.5f);
    allocator::free(buf);
  }

  // A block of ints: write to every element, then release the buffer.
  {
    constexpr int count = 128;
    auto buf = allocator::malloc(count * sizeof(int));
    auto* values = static_cast<int*>(buf.raw_ptr());
    for (int idx = 0; idx < count; ++idx) {
      values[idx] = idx;
    }
    allocator::free(buf);
  }

  // A zero-byte request is allocated and freed like any other.
  {
    auto buf = allocator::malloc(0);
    allocator::free(buf);
  }
}

TEST_CASE("test large allocations") {
  // Allocate and immediately release a 2^30-byte buffer, 100 times over.
  const size_t one_gib = 1 << 30;
  int remaining = 100;
  while (remaining-- > 0) {
    auto buf = allocator::malloc(one_gib);
    allocator::free(buf);
  }
}


================================================
FILE: tests/arg_reduce_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"
#include "mlx/primitives.h"

using namespace mlx::core;

// Builds an ArgReduce of kind `r` along `axis` of `x` on the default stream
// of device `d`, evaluates it, and checks every element of the uint32 result
// buffer against `expected_output`, element by element in buffer order.
//
// `out_shape` is the shape given to the result array. `expected_output` must
// contain exactly one value per output element.
void test_arg_reduce_small(
    Device d,
    const array& x,
    ArgReduce::ReduceType r,
    Shape out_shape,
    int axis,
    std::vector<int> expected_output) {
  auto s = default_stream(d);
  auto y =
      array(out_shape, uint32, std::make_shared<ArgReduce>(s, r, axis), {x});
  y.eval();
  // Stop this test case right away when the caller supplied the wrong number
  // of expected values; otherwise the loop below would read past the end of
  // `expected_output`.
  REQUIRE_EQ(expected_output.size(), y.size());
  const uint32_t* ydata = y.data<uint32_t>();
  // size_t index: y.size() is unsigned, so avoid a signed/unsigned compare.
  for (size_t i = 0; i < y.size(); ++i) {
    CHECK_EQ(expected_output[i], ydata[i]);
  }
}

// Evaluates the same ArgReduce (kind `r`, along `axis`, result shape
// `out_shape`) on the default CPU stream and the default GPU stream and
// checks that the two results are equal.
void test_arg_reduce_against_cpu(
    const array& x,
    ArgReduce::ReduceType r,
    Shape out_shape,
    int axis) {
  // Build the reduction node for the default stream of the given device.
  auto reduce_on = [&](Device device) {
    return array(
        out_shape,
        uint32,
        std::make_shared<ArgReduce>(default_stream(device), r, axis),
        {x});
  };

  auto cpu_result = reduce_on(Device::cpu);
  auto gpu_result = reduce_on(Device::gpu);
  cpu_result.eval();
  gpu_result.eval();
  CHECK(array_equal(cpu_result, gpu_result).item<bool>());
}

// Checks ArgMin / ArgMax along each axis of a small (2, 3, 4) integer array
// against hand-written indices, first on the CPU and then, when Metal is
// available, on the GPU with the same expectations.
TEST_CASE("test arg reduce small") {
  // The six values 0, 2, 1, 7, 5, -5 repeated four times.
  auto x = array(
      {0, 2, 1, 7, 5, -5, 0, 2, 1, 7, 5, -5,
       0, 2, 1, 7, 5, -5, 0, 2, 1, 7, 5, -5},
      {2, 3, 4});
  test_arg_reduce_small(
      Device::cpu, x, ArgReduce::ArgMin, {2, 3}, 2, {0, 1, 3, 0, 1, 3});
  test_arg_reduce_small(
      Device::cpu, x, ArgReduce::ArgMin, {2, 4}, 1, {0, 1, 1, 2, 0, 1, 1, 2});
  // Both slices along axis 0 hold identical data, so index 0 is expected
  // everywhere.
  test_arg_reduce_small(
      Device::cpu,
      x,
      ArgReduce::ArgMin,
      {3, 4},
      0,
      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
  test_arg_reduce_small(
      Device::cpu, x, ArgReduce::ArgMax, {2, 3}, 2, {3, 0, 1, 3, 0, 1});
  test_arg_reduce_small(
      Device::cpu, x, ArgReduce::ArgMax, {2, 4}, 1, {1, 2, 2, 0, 1, 2, 2, 0});
  test_arg_reduce_small(
      Device::cpu,
      x,
      ArgReduce::ArgMax,
      {3, 4},
      0,
      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});

  // Everything below repeats the checks on the GPU.
  if (!metal::is_available()) {
    INFO("Skipping arg reduction gpu tests");
    return;
  }

  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMin, {2, 3}, 2, {0, 1, 3, 0, 1, 3});
  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMin, {2, 4}, 1, {0, 1, 1, 2, 0, 1, 1, 2});
  test_arg_reduce_small(
      Device::gpu,
      x,
      ArgReduce::ArgMin,
      {3, 4},
      0,
      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMax, {2, 3}, 2, {3, 0, 1, 3, 0, 1});
  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMax, {2, 4}, 1, {1, 2, 2, 0, 1, 2, 2, 0});
  test_arg_reduce_small(
      Device::gpu,
      x,
      ArgReduce::ArgMax,
      {3, 4},
      0,
      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
}

// Compares GPU and CPU ArgMin / ArgMax results on random inputs: a
// (127, 92, 55) array reduced along each axis, and a 1-D array of 1234
// elements reduced to a scalar. Skipped entirely when Metal is unavailable.
TEST_CASE("test arg reduce against cpu") {
  if (!metal::is_available()) {
    INFO("Skipping arg reduction gpu tests");
    return;
  }

  auto x = random::uniform(array(0.0), array(1.0), {127, 92, 55});
  // Evaluated up front, before the array is handed to both reductions.
  x.eval();
  test_arg_reduce_against_cpu(x, ArgReduce::ArgMin, {127, 92}, 2);
  test_arg_reduce_against_cpu(x, ArgReduce::ArgMin, {127, 55}, 1);
  test_arg_reduce_against_cpu(x, ArgReduce::ArgMin, {92, 55}, 0);
  test_arg_reduce_against_cpu(x, ArgReduce::ArgMax, {127, 92}, 2);
  test_arg_reduce_against_cpu(x, ArgReduce::ArgMax, {127, 55}, 1);
  test_arg_reduce_against_cpu(x, ArgReduce::ArgMax, {92, 55}, 0);

  // 1-D input: the output shape is empty (a scalar).
  auto y = random::uniform(array(0.0), array(1.0), {1234});
  y.eval();
  test_arg_reduce_against_cpu(y, ArgReduce::ArgMin, {}, 0);
  test_arg_reduce_against_cpu(y, ArgReduce::ArgMax, {}, 0);
}

// Like test_arg_reduce_small, but builds its own fixed (2, 3, 4) input
// instead of taking one: runs ArgReduce `r` along `axis` on device `d` and
// compares each output element with `expected_output`.
//
// NOTE(review): despite the `_bool` suffix, the input literal below holds
// integers, and no call to this helper appears in this file.
void test_arg_reduce_small_bool(
    Device d,
    ArgReduce::ReduceType r,
    Shape out_shape,
    int axis,
    std::vector<int> expected_output) {
  auto s = default_stream(d);
  auto x = array(
      {0, 2, 1, 7, 5, -5, 0, 2, 1, 7, 5, -5,
       0, 2, 1, 7, 5, -5, 0, 2, 1, 7, 5, -5},
      {2, 3, 4});
  x.eval();
  auto y =
      array(out_shape, uint32, std::make_shared<ArgReduce>(s, r, axis), {x});
  y.eval();
  const uint32_t* ydata = y.data<uint32_t>();
  // `expected_output` is indexed up to y.size() without a length check, so
  // callers must pass one value per output element.
  for (int i = 0; i < y.size(); i++) {
    CHECK_EQ(expected_output[i], ydata[i]);
  }
}

// Checks GPU ArgMin / ArgMax along each axis of a (2, 3, 4) boolean array
// against hand-written indices. Skipped when Metal is unavailable; there is
// no CPU counterpart in this test case.
TEST_CASE("test arg reduce bool") {
  if (!metal::is_available()) {
    INFO("Skipping arg reduction gpu tests");
    return;
  }
  // The second half of the literal repeats the first twelve values.
  auto x = array(
      {false, true,  true,  false, false, false, false, true,
       true,  false, true,  true,  false, true,  true,  false,
       false, false, false, true,  true,  false, true,  true},
      {2, 3, 4});
  x.eval();
  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMin, {2, 3}, 2, {0, 0, 1, 0, 0, 1});
  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMin, {2, 4}, 1, {0, 1, 1, 0, 0, 1, 1, 0});
  // Both slices along axis 0 are identical, so index 0 is expected everywhere.
  test_arg_reduce_small(
      Device::gpu,
      x,
      ArgReduce::ArgMin,
      {3, 4},
      0,
      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMax, {2, 3}, 2, {1, 3, 0, 1, 3, 0});
  test_arg_reduce_small(
      Device::gpu, x, ArgReduce::ArgMax, {2, 4}, 1, {2, 0, 0, 1, 2, 0, 0, 1});
  test_arg_reduce_small(
      Device::gpu,
      x,
      ArgReduce::ArgMax,
      {3, 4},
      0,
      {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
}

TEST_CASE("test arg reduce edge cases") {
  // On a scalar input both reductions report index 0.
  auto min_index = argmin(array(1.0));
  auto max_index = argmax(array(1.0));
  CHECK_EQ(min_index.item<uint32_t>(), 0);
  CHECK_EQ(max_index.item<uint32_t>(), 0);

  // An empty input is rejected by both reductions.
  CHECK_THROWS(argmin(array({})));
  CHECK_THROWS(argmax(array({})));
}

// Checks ArgMin on the CPU for an input that has been transposed (axes
// reordered to {2, 0, 1}) before the reduction.
TEST_CASE("test arg reduce irregular strides") {
  auto x = array(
      {0, 2, 1, 7, 5, -5, 0, 2, 1, 7, 5, -5,
       0, 2, 1, 7, 5, -5, 0, 2, 1, 7, 5, -5},
      {2, 3, 4});
  // After the transpose the array has shape (4, 2, 3); the reduction runs
  // over its last axis, which was axis 1 of the original.
  x = transpose(x, {2, 0, 1});
  x.eval();
  test_arg_reduce_small(
      Device::cpu, x, ArgReduce::ArgMin, {4, 2}, 2, {0, 0, 1, 1, 1, 1, 2, 2});

  // NOTE(review): no GPU checks follow this guard, so both branches end the
  // test case without further assertions.
  if (!metal::is_available()) {
    INFO("Skipping arg reduction gpu tests");
    return;
  }
}


================================================
FILE: tests/array_tests.cpp
================================================
// Copyright © 2023 Apple Inc.
#include <cassert>
#include <climits>
#include <stdexcept>
#include <vector>

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

// Exercises basic `array` construction and its metadata accessors: scalars
// (with and without an explicit dtype), shaped and empty arrays, data/shape
// size mismatches, id sharing on copy/assignment, and construction from raw
// pointers and iterators.
TEST_CASE("test array basics") {
  // Scalar: zero dimensions, so any shape(i) lookup is out of range.
  array x(1.0);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.ndim(), 0);
  CHECK_EQ(x.shape(), Shape{});
  CHECK_THROWS_AS(x.shape(0), std::out_of_range);
  CHECK_THROWS_AS(x.shape(-1), std::out_of_range);
  CHECK_EQ(x.strides(), Strides{});
  CHECK_EQ(x.itemsize(), sizeof(float));
  CHECK_EQ(x.nbytes(), sizeof(float));
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.item<float>(), 1.0);

  // Scalar with specified type: an int value stored as float32
  x = array(1, float32);
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.item<float>(), 1.0);

  // Scalar with specified type: an int value stored as bool_
  x = array(1, bool_);
  CHECK_EQ(x.dtype(), bool_);
  CHECK_EQ(x.itemsize(), sizeof(bool));
  CHECK_EQ(x.nbytes(), sizeof(bool));
  CHECK_EQ(x.item<bool>(), true);

  // Check shaped arrays: a one-element list gives a 1-D array, and shape()
  // accepts negative indices.
  x = array({1.0});
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.ndim(), 1);
  CHECK_EQ(x.shape(), Shape{1});
  CHECK_EQ(x.shape(0), 1);
  CHECK_EQ(x.shape(-1), 1);
  CHECK_THROWS_AS(x.shape(1), std::out_of_range);
  CHECK_THROWS_AS(x.shape(-2), std::out_of_range);
  CHECK_EQ(x.strides(), Strides{1});
  CHECK_EQ(x.item<float>(), 1.0);

  // Check empty array: float32 with zero bytes; item() is rejected.
  x = array({});
  CHECK_EQ(x.size(), 0);
  CHECK_EQ(x.dtype(), float32);
  CHECK_EQ(x.itemsize(), sizeof(float));
  CHECK_EQ(x.nbytes(), 0);
  CHECK_THROWS_AS(x.item<float>(), std::invalid_argument);

  // Two-element array: nbytes is itemsize times size.
  x = array({1.0, 1.0});
  CHECK_EQ(x.size(), 2);
  CHECK_EQ(x.shape(), Shape{2});
  CHECK_EQ(x.itemsize(), sizeof(float));
  CHECK_EQ(x.nbytes(), x.itemsize() * x.size());

  // Accessing item in non-scalar array throws
  CHECK_THROWS_AS(x.item<float>(), std::invalid_argument);

  // Explicit 2-D shape: strides are those of a row-major layout.
  x = array({1.0, 1.0, 1.0}, {1, 3});
  CHECK_EQ(x.size(), 3);
  CHECK_EQ(x.shape(), Shape{1, 3});
  CHECK_EQ(x.strides(), Strides{3, 1});

  // Test wrong size/shapes throw:
  CHECK_THROWS_AS(array({1.0, 1.0, 1.0}, {4}), std::invalid_argument);
  CHECK_THROWS_AS(array({1.0, 1.0, 1.0}, {1, 4}), std::invalid_argument);
  CHECK_THROWS_AS(array({1.0, 1.0, 1.0}, {1, 2}), std::invalid_argument);

  // Test array ids work as expected: copies and assignments share the id of
  // their source, while independently constructed arrays differ.
  x = array(1.0);
  auto y = x;
  CHECK_EQ(y.id(), x.id());
  array z(2.0);
  CHECK_NE(z.id(), x.id());
  z = x;
  CHECK_EQ(z.id(), x.id());

  // Array creation from pointer
  float data[] = {0.0, 1.0, 2.0, 3.0};
  x = array(data, {4});
  CHECK_EQ(x.dtype(), float32);
  CHECK(array_equal(x, array({0.0, 1.0, 2.0, 3.0})).item<bool>());

  // Array creation from vectors (each scope's `data` shadows the C array
  // declared above)
  {
    std::vector<int> data = {0, 1, 2, 3};
    x = array(data.begin(), {4});
    CHECK_EQ(x.dtype(), int32);
    CHECK(array_equal(x, array({0, 1, 2, 3})).item<bool>());
  }

  {
    std::vector<bool> data = {false, true, false, true};
    x = array(data.begin(), {4});
    CHECK_EQ(x.dtype(), bool_);
    CHECK(array_equal(x, array({false, true, false, true})).item<bool>());
  }

  // Regression: vector<bool>::reference to fp16/bf16 stored raw bits
  {
    std::vector<bool> data = {true, false, true};
    auto bf = array(data.begin(), {3}, bfloat16);
    CHECK(array_equal(bf, array({1.0f, 0.0f, 1.0f}, bfloat16)).item<bool>());

    auto fp = array(data.begin(), {3}, float16);
    CHECK(array_equal(fp, array({1.0f, 0.0f, 1.0f}, float16)).item<bool>());
  }
}

// Checks that each supported C++ element type maps to the expected MLX dtype,
// for scalar construction and for initializer-list construction, and that an
// explicit dtype argument converts the supplied data.
TEST_CASE("test array types") {
// Builds a scalar and a two-element array from a `T` holding 42 and checks
// that both report `mlx_type`. The expansion declares `val` and `x`, so every
// use has to live in its own scope.
#define basic_dtype_test(T, mlx_type) \
  T val = 42;                         \
  array x(val);                       \
  CHECK_EQ(x.dtype(), mlx_type);      \
  CHECK_EQ(x.item<T>(), val);         \
  x = array({val, val});              \
  CHECK_EQ(x.dtype(), mlx_type);

  // bool_
  {
    array x(true);
    CHECK_EQ(x.dtype(), bool_);
    CHECK_EQ(x.item<bool>(), true);

    x = array({true, false});
    CHECK_EQ(x.dtype(), bool_);

    // Explicit dtype converts the bool data to float32.
    x = array({true, false}, float32);
    CHECK_EQ(x.dtype(), float32);
    CHECK(array_equal(x, array({1.0f, 0.0f})).item<bool>());
  }

  // uint8
  {
    basic_dtype_test(uint8_t, uint8);
  }

  // uint16
  {
    basic_dtype_test(uint16_t, uint16);
  }

  // uint32
  {
    basic_dtype_test(uint32_t, uint32);
  }

  // uint64
  {
    basic_dtype_test(uint64_t, uint64);
  }

  // int8
  {
    basic_dtype_test(int8_t, int8);
  }

  // int16
  {
    basic_dtype_test(int16_t, int16);
  }

  // int32
  {
    basic_dtype_test(int32_t, int32);
  }

  // int64
  {
    basic_dtype_test(int64_t, int64);
  }

  // float16
  {
    basic_dtype_test(float16_t, float16);
  }

  // float32
  {
    basic_dtype_test(float, float32);
  }

  // bfloat16
  {
    basic_dtype_test(bfloat16_t, bfloat16);
  }

#undef basic_dtype_test

  // uint32: largest representable value and unsigned literals
  {
    uint32_t val = UINT_MAX;
    array x(val);
    CHECK_EQ(x.dtype(), uint32);
    CHECK_EQ(x.item<uint32_t>(), val);

    x = array({1u, 2u});
    CHECK_EQ(x.dtype(), uint32);
  }

  // int32: negative values, and int data converted to bool_ on request
  {
    array x(-1);
    CHECK_EQ(x.dtype(), int32);
    CHECK_EQ(x.item<int>(), -1);

    x = array({-1, 2});
    CHECK_EQ(x.dtype(), int32);

    std::vector<int> data{0, 1, 2};
    x = array(data.data(), {static_cast<int>(data.size())}, bool_);
    CHECK_EQ(x.dtype(), bool_);
    CHECK(array_equal(x, array({false, true, true})).item<bool>());
  }

  // int64: a value that does not fit in 32 bits
  {
    int64_t val = static_cast<int64_t>(INT_MIN) - 1;
    array x(val);
    CHECK_EQ(x.dtype(), int64);
    CHECK_EQ(x.item<int64_t>(), val);

    x = array({val, val});
    CHECK_EQ(x.dtype(), int64);
  }

  // float32: double inputs are also stored as float32
  {
    array x(3.14f);
    CHECK_EQ(x.dtype(), float32);
    CHECK_EQ(x.item<float>(), 3.14f);

    x = array(1.25);
    CHECK_EQ(x.dtype(), float32);
    CHECK_EQ(x.item<float>(), 1.25f);

    x = array({1.0f, 2.0f});
    CHECK_EQ(x.dtype(), float32);

    x = array({1.0, 2.0});
    CHECK_EQ(x.dtype(), float32);

    std::vector<double> data{1.0, 2.0, 4.0};
    x = array(data.data(), {static_cast<int>(data.size())});
    CHECK_EQ(x.dtype(), float32);
    CHECK(array_equal(x, array({1.0f, 2.0f, 4.0f})).item<bool>());
  }

  // complex64
  {
    CHECK_EQ(sizeof(complex64_t), sizeof(std::complex<float>));

    complex64_t v = {1.0f, 1.0f};
    array x(v);
    CHECK_EQ(x.dtype(), complex64);
    CHECK_EQ(x.item<complex64_t>(), v);

    // Construction from std::complex<float> must behave the same way. These
    // checks inspect `y`; they previously re-checked `x`, which left this
    // constructor unverified.
    array y(std::complex<float>{1.0f, 1.0f});
    CHECK_EQ(y.dtype(), complex64);
    CHECK_EQ(y.item<complex64_t>(), v);
  }
}

// Checks `data_size()` and the `contiguous` / `row_contiguous` /
// `col_contiguous` flags on arrays produced by direct construction, zeros/ones,
// broadcast_to, transpose, reshape, slice and split. Several stanzas reuse the
// `x` assigned by the stanza before them, so statement order matters.
TEST_CASE("test array metadata") {
  // Scalar
  array x(1.0f);
  CHECK_EQ(x.data_size(), 1);
  CHECK_EQ(x.flags().contiguous, true);
  CHECK_EQ(x.flags().row_contiguous, true);
  CHECK_EQ(x.flags().col_contiguous, true);

  // Single element with an all-ones shape
  x = array({1.0f}, {1, 1, 1});
  CHECK_EQ(x.data_size(), 1);
  CHECK_EQ(x.flags().contiguous, true);
  CHECK_EQ(x.flags().row_contiguous, true);
  CHECK_EQ(x.flags().col_contiguous, true);

  // Shape {1, 2}: expected to be both row and column contiguous
  x = array({1.0f, 1.0f}, {1, 2});
  CHECK_EQ(x.data_size(), 2);
  CHECK_EQ(x.flags().contiguous, true);
  CHECK_EQ(x.flags().row_contiguous, true);
  CHECK_EQ(x.flags().col_contiguous, true);

  // zeros with leading singleton dimensions
  x = zeros({1, 1, 4});
  eval(x);
  CHECK_EQ(x.data_size(), 4);
  CHECK_EQ(x.flags().contiguous, true);
  CHECK_EQ(x.flags().row_contiguous, true);
  CHECK_EQ(x.flags().col_contiguous, true);

  // Shape {2, 4}: row contiguous only
  x = zeros({2, 4});
  eval(x);
  CHECK_EQ(x.data_size(), 8);
  CHECK_EQ(x.flags().contiguous, true);
  CHECK_EQ(x.flags().row_contiguous, true);
  CHECK_EQ(x.flags().col_contiguous, false);

  // Broadcast of a scalar to an all-ones shape
  x = array(1.0f);
  auto y = broadcast_to(x, {1, 1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Broadcast of a scalar to a larger shape: data_size stays 1 and the
  // result is flagged contiguous but neither row nor column contiguous
  y = broadcast_to(x, {2, 8, 10});
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, false);

  // Broadcast of a scalar to a shape with a zero-sized dimension
  y = broadcast_to(x, {1, 0});
  eval(y);
  CHECK_EQ(y.data_size(), 0);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Broadcast of a non-scalar to a shape with a zero-sized dimension
  y = broadcast_to(zeros({4, 2, 1}), {4, 2, 0});
  eval(y);
  CHECK_EQ(y.data_size(), 0);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Transpose of a scalar
  x = array(1.0f);
  y = transpose(x);
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Transpose of an all-ones shape, default axes
  x = ones({1, 1, 1});
  y = transpose(x);
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Transpose of an all-ones shape, identity permutation
  x = ones({1, 1, 1});
  y = transpose(x, {0, 1, 2});
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Transpose of an all-ones shape, non-identity permutation
  x = ones({1, 1, 1});
  y = transpose(x, {1, 2, 0});
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Transpose of a {4, 1} array
  x = ones({4, 1});
  y = transpose(x);
  eval(y);
  CHECK_EQ(y.data_size(), 4);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Default transpose of a {2, 3, 4} array: column contiguous only
  x = ones({2, 3, 4});
  y = transpose(x);
  eval(y);
  CHECK_EQ(y.data_size(), 24);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Swapping only the last two axes: neither row nor column contiguous
  y = transpose(x, {0, 2, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 24);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, false);

  // Applying the same swap twice: row contiguous again
  y = transpose(transpose(x, {0, 2, 1}), {0, 2, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 24);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, false);

  // Reshape of a scalar to an all-ones shape
  x = array(1.0f);
  y = reshape(x, {1, 1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Reshape of {2, 4} to a flat vector
  x = ones({2, 4});
  y = reshape(x, {8});
  eval(y);
  CHECK_EQ(y.data_size(), 8);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Reshape with trailing singleton dimensions
  y = reshape(x, {8, 1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 8);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Reshape with singleton dimensions on both sides
  y = reshape(x, {1, 8, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 8);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Reshape of a flat vector to {2, 3, 2}: row contiguous only
  x = ones({12});
  y = reshape(x, {2, 3, 2});
  eval(y);
  CHECK_EQ(y.data_size(), 12);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, false);

  // Slice of a scalar with empty start/stop lists
  x = array(1.0f);
  y = slice(x, {}, {});
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Slice of a one-element array with out-of-range bounds and a large step
  x = array({1.0f});
  y = slice(x, {-10}, {10}, {10});
  eval(y);
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Slice covering the whole {1, 3} array with unit steps
  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {1, 3}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 3);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // NOTE(review): same inputs and checks as the stanza directly above.
  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {1, 3}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 3);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Slice whose first dimension is empty (start == stop)
  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {0, 3}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 0);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Slice keeping the first two of three columns
  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {1, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 2);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Non-unit steps that still select a single element
  x = array({1.0f, 2.0f, 3.0f}, {1, 3});
  y = slice(x, {0, 0}, {1, 2}, {2, 3});
  eval(y);
  CHECK_EQ(y.shape(), Shape{1, 1});
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Step of 2 along the last axis: no contiguity flag is expected to be set
  // (data_size is not checked in this stanza)
  x = array({0.0f, 1.0f, 2.0f, 3.0f}, {1, 4});
  y = slice(x, {0, 0}, {1, 4}, {1, 2});
  eval(y);
  CHECK_EQ(y.shape(), Shape{1, 2});
  CHECK_EQ(y.flags().contiguous, false);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, false);

  // Strided slice of a broadcast scalar: data_size is still 1
  x = broadcast_to(array(1.0f), {4, 10});
  y = slice(x, {0, 0}, {4, 10}, {2, 2});
  eval(y);
  CHECK_EQ(y.shape(), Shape{2, 5});
  CHECK_EQ(y.data_size(), 1);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, false);

  // First row of a broadcast two-element vector
  x = broadcast_to(array({1.0f, 2.0f}), {4, 2});
  y = slice(x, {0, 0}, {1, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 2);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Second row of the same broadcast array
  y = slice(x, {1, 0}, {2, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 2);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Full slice of a {2, 2} array: row contiguous only
  x = array({0.0f, 1.0f, 2.0f, 3.0f}, {2, 2});
  y = slice(x, {0, 0}, {2, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 4);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, true);
  CHECK_EQ(y.flags().col_contiguous, false);

  // Full slice of its transpose: column contiguous only
  y = slice(transpose(x), {0, 0}, {2, 2}, {1, 1});
  eval(y);
  CHECK_EQ(y.data_size(), 4);
  CHECK_EQ(y.flags().contiguous, true);
  CHECK_EQ(y.flags().row_contiguous, false);
  CHECK_EQ(y.flags().col_contiguous, true);

  // Splitting {2, 4} into two parts with the default axis: every part is
  // expected to be fully contiguous. The loop variable shadows the outer `y`.
  x = ones({2, 4});
  auto out = split(x, 2);
  eval(out);
  for (auto y : out) {
    CHECK_EQ(y.data_size(), 4);
    CHECK_EQ(y.flags().contiguous, true);
    CHECK_EQ(y.flags().row_contiguous, true);
    CHECK_EQ(y.flags().col_contiguous, true);
  }
  // Splitting into four parts along axis 1: no part is expected to carry any
  // contiguity flag.
  out = split(x, 4, 1);
  eval(out);
  for (auto y : out) {
    CHECK_EQ(y.flags().contiguous, false);
    CHECK_EQ(y.flags().row_contiguous, false);
    CHECK_EQ(y.flags().col_contiguous, false);
  }
}

// Verifies begin()/end() iteration over an array.
TEST_CASE("test array iteration") {
  // A 0-d array cannot be iterated: begin() has to throw.
  auto arr = array(1);
  CHECK_THROWS(arr.begin());

  // Dereferencing an iterator yields a const type. decltype does not evaluate
  // its operand, so this is safe even on the 0-d array.
  CHECK(std::is_const_v<decltype(*arr.begin())>);

  // 1-D: the values 1..5 are visited in order.
  arr = array({1, 2, 3, 4, 5});
  int visited = 0;
  for (auto elem : arr) {
    ++visited;
    CHECK_EQ(elem.item<int>(), visited);
  }
  CHECK_EQ(visited, 5);

  // 2-D: each step yields one row, and two steps reach end().
  arr = array({1, 2, 3, 4}, {2, 2});
  auto row_it = arr.begin();
  CHECK(array_equal(*row_it, array({1, 2})).item<bool>());
  CHECK(array_equal(*(row_it + 1), array({3, 4})).item<bool>());
  CHECK_EQ(arr.begin() + 2, arr.end());
}

// Wraps an allocator-provided buffer in an array with a custom deleter and
// uses that array in a computation.
TEST_CASE("test array shared buffer") {
  Shape shape = {2, 2};
  auto n_elem = shape[0] * shape[1];

  // Allocate raw storage and fill every element with 2.
  allocator::Buffer buf_b = allocator::malloc(n_elem * sizeof(float));
  void* buf_b_ptr = buf_b.raw_ptr();
  float* float_buf_b = static_cast<float*>(buf_b_ptr);

  for (float* it = float_buf_b; it != float_buf_b + n_elem; ++it) {
    *it = 2.;
  }

  CHECK_EQ(float_buf_b[0], static_cast<float*>(buf_b_ptr)[0]);

  // The deleter checks that it is handed the same storage before freeing it.
  auto deleter = [float_buf_b](allocator::Buffer buf) {
    float* released = static_cast<float*>(buf.raw_ptr());
    CHECK_EQ(float_buf_b, released);
    CHECK_EQ(float_buf_b[0], released[0]);
    allocator::free(buf);
  };

  array a = ones(shape, float32);
  array b = array(buf_b, shape, float32, deleter);

  // Only checks that evaluating an expression over the wrapped buffer works.
  eval(a + b);
}

// Empty initializer lists produce zero-sized arrays of the expected dtype.
TEST_CASE("test make empty array") {
  // With no dtype given, the result is float32.
  auto a = array({});
  CHECK_EQ(a.size(), 0);
  CHECK_EQ(a.dtype(), float32);

  // An explicitly requested dtype is kept even though there is no data.
  for (const auto& dt : {int32, float32, bool_}) {
    a = array({}, dt);
    CHECK_EQ(a.size(), 0);
    CHECK_EQ(a.dtype(), dt);
  }
}

// Builds an array on top of caller-owned memory and checks that the supplied
// deleter runs exactly once, with the original pointer, when the array goes
// away.
TEST_CASE("test make array from user buffer") {
  const int size = 4096;
  std::vector<int> buffer(size, 0);

  // Counts only the calls that receive the buffer's own pointer.
  int count = 0;
  auto deleter = [&count, data = buffer.data()](void* ptr) {
    // make sure pointer is correct
    count += (ptr == data) ? 1 : 0;
  };

  {
    auto a = array(buffer.data(), Shape{size}, int32, deleter);
    // The pointer-identity check is only made when Metal is available.
    if (metal::is_available()) {
      CHECK_EQ(buffer.data(), a.data<int>());
    }

    // Zero-filled buffer plus one should equal an array of ones.
    auto b = a + array(1);
    eval(b);
    auto expected = ones({size});
    CHECK(array_equal(b, expected).item<bool>());
  }

  // deleter should always get called
  CHECK_EQ(count, 1);
}

// shape(i) and strides(i) accept negative indices counted from the end and
// throw std::out_of_range outside [-ndim, ndim).
TEST_CASE("test negative indexing for shape/strides") {
  // 2x3 array filled with ones
  std::vector<float> data(6, 1.0f);
  array a(data.begin(), Shape{2, 3});

  // shape: negative indices mirror the positive ones
  CHECK_EQ(a.shape(-1), a.shape(1));
  CHECK_EQ(a.shape(-1), 3);
  CHECK_EQ(a.shape(-2), a.shape(0));
  CHECK_EQ(a.shape(-2), 2);

  // strides: same convention
  CHECK_EQ(a.strides(-1), a.strides(1));
  CHECK_EQ(a.strides(-1), 1);
  CHECK_EQ(a.strides(-2), a.strides(0));
  CHECK_EQ(a.strides(-2), 3);

  // One step past either end is rejected.
  CHECK_THROWS_AS(a.shape(-3), std::out_of_range);
  CHECK_THROWS_AS(a.shape(2), std::out_of_range);
  CHECK_THROWS_AS(a.strides(-3), std::out_of_range);
  CHECK_THROWS_AS(a.strides(2), std::out_of_range);
}


================================================
FILE: tests/autograd_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

// Required for using M_2_SQRTPI in MSVC.
#define _USE_MATH_DEFINES

#include <algorithm>
#include <cmath>
#include <numeric>
#include <sstream>
#include <vector>
#include "doctest/doctest.h"

#include "mlx/graph_utils.h"
#include "mlx/mlx.h"

#include "mlx/backend/cuda/cuda.h"

using namespace mlx::core;

// Covers stop_gradient as a pass-through op, in combination with vmap, and
// under vjp/jvp where it must cut the derivative.
TEST_CASE("test stop gradient") {
  // Forward pass: values and dtype come through unchanged.
  auto x = zeros({5, 5});
  auto y = stop_gradient(x);
  CHECK(array_equal(y, zeros({5, 5})).item<bool>());

  x = zeros({5, 5}, int32);
  y = stop_gradient(x);
  CHECK_EQ(y.dtype(), int32);
  CHECK(array_equal(y, zeros({5, 5}, int32)).item<bool>());

  // vmap with stop_gradient applied to the result.
  {
    auto stopped_sum = [](array input) {
      return stop_gradient(add(input, ones({2})));
    };
    auto out = vmap(stopped_sum)(ones({3, 2}));
    CHECK(array_equal(out, full({3, 2}, 2.0)).item<bool>());
  }

  // vmap with stop_gradient applied to the input.
  {
    auto sum_of_stopped = [](array input) {
      return add(stop_gradient(input), ones({2}));
    };
    auto out = vmap(sum_of_stopped)(ones({3, 2}));
    CHECK(array_equal(out, full({3, 2}, 2.0)).item<bool>());
  }

  // The whole output is stopped: both derivatives are zero.
  {
    auto x = array(1.);
    auto fun = [](array in) { return stop_gradient(add(in, in)); };

    const auto cotangent = vjp(fun, x, array(1.)).second;
    CHECK(array_equal(cotangent, array(0.)).item<bool>());

    const auto tangent = jvp(fun, x, array(1.)).second;
    CHECK(array_equal(tangent, array(0.)).item<bool>());
  }

  // Only one operand is stopped: the other still contributes 1.
  {
    auto x = array(1.);
    auto fun = [](array in) { return add(in, stop_gradient(in)); };

    const auto cotangent = vjp(fun, x, array(1.)).second;
    CHECK(array_equal(cotangent, array(1.)).item<bool>());

    const auto tangent = jvp(fun, x, array(1.)).second;
    CHECK(array_equal(tangent, array(1.)).item<bool>());
  }

  // A chain of ten adds behind stop_gradient: the printed derivative graph
  // must stay short (fewer than five lines).
  {
    auto x = array(1.);
    auto fun = [](array in) {
      for (int i = 0; i < 10; ++i) {
        in = add(in, in);
      }
      return stop_gradient(in);
    };

    // Number of newline characters in the printed graph of `out`.
    auto graph_lines = [](array out) {
      std::ostringstream g_ss;
      print_graph(g_ss, out);
      auto g_str = g_ss.str();
      return std::count(g_str.begin(), g_str.end(), '\n');
    };

    {
      auto count = graph_lines(vjp(fun, x, array(1.)).second);
      CHECK(count < 5);
    }
    {
      auto count = graph_lines(jvp(fun, x, array(1.)).second);
      CHECK(count < 5);
    }
  }
}

// Forward-mode (jvp) checks: multi-input and single-input forms, eval inside
// the traced function, captured inputs, argument validation, and outputs
// that do not depend on the input.
TEST_CASE("test jvp") {
  // Multi-input form: value 1 + 1, tangent 1 + 3.
  {
    auto sum_fn = [](const std::vector<array>& inputs) {
      return std::vector<array>{add(inputs[0], inputs[1])};
    };
    auto lhs = array(1.0f);
    auto rhs = array(1.0f);
    auto [out, dout] = jvp(sum_fn, {lhs, rhs}, {array(1.0f), array(3.0f)});
    CHECK_EQ(out[0].item<float>(), 2.0f);
    CHECK_EQ(dout[0].item<float>(), 4.0f);
  }

  // Evaling in function while tracing performs graph retention
  {
    auto traced = [](const array& x) {
      auto scaled = 3 * x;
      eval(scaled);
      CHECK(scaled.is_available());
      CHECK(scaled.has_primitive());
      CHECK(scaled.is_tracer());
      return 2 * scaled;
    };
    auto tangent = jvp(traced, array(1.0f), array(1.0f)).second;
    CHECK_EQ(tangent.item<float>(), 6.0f);
  }

  // Only one argument: the captured constant contributes nothing.
  {
    auto captured = array(1.0f);
    auto add_captured = [captured](array in) { return add(captured, in); };
    auto primal = array(1.0f);
    auto tangent = jvp(add_captured, primal, array(3.0f)).second;
    CHECK_EQ(tangent.item<float>(), 3.0f);
  }

  // Input also in capture clause: only the argument path is differentiated.
  {
    auto x = array(1.0f);
    auto plus_x = [x](array in) { return in + x; };
    auto tangent = jvp(plus_x, x, array(1.0f)).second;
    CHECK_EQ(tangent.item<float>(), 1.0f);
  }

  // Throws on incorrectly shaped inputs
  {
    auto doubled = [](array in) { return add(in, in); };
    CHECK_THROWS_AS(
        jvp(doubled, array(1), array({1, 1})), std::invalid_argument);
  }

  // Throws on wrong number of inputs
  {
    auto passthrough = [](std::vector<array> inputs) {
      return std::vector<array>{inputs[0], inputs[1]};
    };
    CHECK_THROWS_AS(
        jvp(passthrough, {array(1), array(1)}, {array(1)}),
        std::invalid_argument);
  }

  // No dependence between input and output: the tangent is all zeros.
  {
    auto constant = [](array in) { return array({1.0, 1.0}); };
    auto tangent = jvp(constant, array(1.0f), array(1.0f)).second;
    CHECK(array_equal(tangent, zeros({2})).item<bool>());
  }
}

// Reverse-mode (vjp) checks: single-input forms, captured inputs, cotangent
// validation, independent outputs, and functions with several outputs.
TEST_CASE("test vjp") {
  // Adding a captured constant: value 2, gradient 1.
  {
    auto primal = array(1.0f);
    auto offset = array(1.0f);
    auto shifted = [offset](array in) { return add(in, offset); };
    auto [out, dout] = vjp(shifted, primal, array(1.0f));
    CHECK_EQ(out.item<float>(), 2.0f);
    CHECK_EQ(dout.item<float>(), 1.0f);
  }

  // The input is used three times: the gradient is three times the cotangent.
  {
    auto primal = array(1.0f);
    auto tripled = [](array in) { return in + in + in; };
    auto unit_grad = vjp(tripled, primal, array(1.0f)).second;
    CHECK_EQ(unit_grad.item<float>(), 3.0f);
    auto scaled_grad = vjp(tripled, primal, array(2.)).second;
    CHECK_EQ(scaled_grad.item<float>(), 6.0f);
  }

  // Input also in capture clause
  {
    auto x = array(1.0f);
    auto plus_x = [x](array in) { return in + x; };
    auto grad_x = vjp(plus_x, x, array(1.0f)).second;
    CHECK_EQ(grad_x.item<float>(), 1.0f);
  }

  // Throws on incorrectly shaped outputs
  {
    auto doubled = [](array in) { return add(in, in); };
    CHECK_THROWS_AS(
        vjp(doubled, zeros({1}), zeros({2})), std::invalid_argument);
  }

  // Throws on wrong number of outputs
  {
    auto duplicated = [](std::vector<array> inputs) {
      return std::vector<array>{inputs[0], inputs[0]};
    };
    CHECK_THROWS_AS(
        vjp(duplicated, {zeros({1})}, {zeros({2})}), std::invalid_argument);
  }

  // No dependence between input and output: the gradient is all zeros.
  {
    auto constant = [](array in) { return array(1.); };
    auto grad_in = vjp(constant, zeros({2}), array(1.)).second;
    CHECK(array_equal(grad_in, zeros({2})).item<bool>());
  }

  // Handles multiple outputs: outputs are (x*y, y*z) with cotangents (2, 3).
  {
    auto x = array(1.);
    auto y = array(2.);
    auto z = array(3.);
    auto products = [](const std::vector<array>& in) {
      return std::vector<array>{in[0] * in[1], in[1] * in[2]};
    };
    auto grads = vjp(products, {x, y, z}, {array(2.), array(3.)}).second;
    CHECK_EQ(grads.size(), 3);
    CHECK_EQ(grads[0].item<float>(), 2.0f * 2.0f);
    CHECK_EQ(grads[1].item<float>(), 1.0f * 2.0f + 3.0f * 3.0f);
    CHECK_EQ(grads[2].item<float>(), 3.0f * 2.0f);
  }
}

// Exercises grad / value_and_grad: first and higher-order derivatives, eval
// inside a traced function, data-dependent control flow, and selection of
// the differentiated argument(s) for multi-input functions.
TEST_CASE("test grad") {
  // f(x) = x + 1 at x = 1: value 2, first derivative 1, second derivative 0.
  {
    auto x = array(1.0);
    auto fun = [](array in) { return in + 1; };
    auto [y, dfdx] = value_and_grad(fun)(x);
    CHECK_EQ(y.item<float>(), 2.0f);
    CHECK_EQ(dfdx.item<float>(), 1.0f);
    auto [z, d2fdx2] = value_and_grad(grad(fun))(x);
    CHECK_EQ(z.item<float>(), 1.0f);
    CHECK_EQ(d2fdx2.item<float>(), 0.0f);
  }

  // Same function written with add(); checked through grad and grad(grad).
  {
    auto x = array(1.);
    auto fun = [](array in) { return add(in, array(1.)); };
    auto dfdx = grad(fun);
    CHECK(array_equal(dfdx(x), array(1.)).item<bool>());
    auto d2fdx2 = grad(grad(fun));
    CHECK(array_equal(d2fdx2(x), array(0.)).item<bool>());
  }

  // exp: the first three derivatives at 1 are all approximately exp(1).
  {
    auto x = array(1.);
    auto expfn = [](array input) { return exp(input); };
    auto dfdx = grad(expfn);
    CHECK_EQ(dfdx(x).item<float>(), doctest::Approx(std::exp(1.0f)));
    auto d2fdx2 = grad(grad(expfn));
    CHECK_EQ(d2fdx2(x).item<float>(), doctest::Approx(std::exp(1.0f)));
    auto d3fdx3 = grad(grad(grad(expfn)));
    CHECK_EQ(d3fdx3(x).item<float>(), doctest::Approx(std::exp(1.0f)));
  }

  {
    // No graph retention since the output is independent of y
    auto y = ones({3, 3});
    auto fn1 = [y](array x) {
      x = x + 2.0f;
      eval(y);
      CHECK(x.is_tracer());
      CHECK(!y.is_tracer());
      CHECK(y.is_available());
      CHECK(!y.has_primitive());
      return square(x);
    };
    // d/dx (x + 2)^2 at x = 1 is 2 * 3 = 6.
    auto dfdx = grad(fn1)(array(1.0f));
    CHECK_EQ(dfdx.item<float>(), 6.0f);

    // Graph automatically retained to compute the grad
    auto fn2 = [](array x) {
      x = x + 2.0f;
      eval(x);
      CHECK(x.is_tracer());
      CHECK(x.is_available());
      CHECK(x.has_primitive());
      return square(x);
    };
    dfdx = grad(fn2)(array(1.0f));
    CHECK_EQ(dfdx.item<float>(), 6.0f);
  }

  // Control flow in grad computation
  {
    // Branches on the evaluated value of x + 2: square above 3, 4 * x otherwise.
    auto fn = [](array x) {
      x = x + array(2.0f);
      if (x.item<float>() > 3) {
        return square(x);
      } else {
        return 4 * x;
      }
    };

    // 0.5 + 2 = 2.5 takes the linear branch: gradient 4.
    auto dfdx = grad(fn)(array(0.5f));
    CHECK_EQ(dfdx.item<float>(), 4.0f);

    // 1.5 + 2 = 3.5 takes the square branch: gradient 2 * 3.5 = 7.
    dfdx = grad(fn)(array(1.5f));
    CHECK_EQ(dfdx.item<float>(), 7.0f);
  }

  // Grad with multiple inputs
  {
    // f(x, y) = x * y at (2, 3): df/dx = 3, df/dy = 2.
    auto fn = [](std::vector<array> inputs) { return inputs[0] * inputs[1]; };
    auto x = array(2.0f);
    auto y = array(3.0f);

    // With no argument index given, the gradient is taken for input 0.
    auto [value, grads] = value_and_grad(fn)({x, y});
    CHECK_EQ(value.item<float>(), 6.0f);
    CHECK_EQ(grads[0].item<float>(), 3.0f);

    auto dfdx = grad(fn)({x, y})[0];
    CHECK_EQ(dfdx.item<float>(), 3.0f);

    auto dfdy = grad(fn, 1)({x, y})[0];
    CHECK_EQ(dfdy.item<float>(), 2.0f);

    // Negative indexing
    dfdy = grad(fn, -1)({x, y})[0];
    CHECK_EQ(dfdy.item<float>(), 2.0f);

    // Several argument indices at once
    grads = grad(fn, {0, 1})({x, y});
    CHECK_EQ(grads[0].item<float>(), 3.0f);
    CHECK_EQ(grads[1].item<float>(), 2.0f);

    // Rejected index lists: empty, out of range, duplicated, too negative.
    CHECK_THROWS_AS(
        grad(fn, std::vector<int>{})({x, y}), std::invalid_argument);
    CHECK_THROWS_AS(grad(fn, {0, 1, 2})({x, y}), std::invalid_argument);
    CHECK_THROWS_AS(grad(fn, {0, 0})({x, y}), std::invalid_argument);
    CHECK_THROWS_AS(grad(fn, -3)({x, y}), std::invalid_argument);
  }
}

// Derivatives through astype and full.
TEST_CASE("test creation grads") {
  // astype: the vjp result takes the input dtype, the jvp result takes the
  // output dtype.
  {
    auto to_int = [](array a) { return astype(a, int32); };
    auto primal = ones({4, 4}, float32);

    auto pulled_back = vjp(to_int, primal, full({4, 4}, 2, int32)).second;
    CHECK_EQ(pulled_back.dtype(), float32);
    CHECK(array_equal(pulled_back, full({4, 4}, 2.0f)).item<bool>());

    auto pushed_forward = jvp(to_int, primal, full({4, 4}, 2, float32)).second;
    CHECK_EQ(pushed_forward.dtype(), int32);
    CHECK(array_equal(pushed_forward, full({4, 4}, 2, int32)).item<bool>());
  }

  // full: the vjp sums the cotangent over the 5 * 5 broadcast positions
  // (25 * 2 = 50 per element); the jvp fills the output with the tangent.
  {
    auto fill = [](array a) { return full({5, 5, 2}, a); };
    auto primal = ones({2}, float32);

    auto pulled_back = vjp(fill, primal, full({5, 5, 2}, 2.0f)).second;
    CHECK(array_equal(pulled_back, array({50.0f, 50.0f})).item<bool>());

    auto pushed_forward = jvp(fill, primal, array({3.0f, 3.0f})).second;
    CHECK(array_equal(pushed_forward, full({5, 5, 2}, 3.0f)).item<bool>());
  }
}

TEST_CASE("test op vjps") {
  // Test abs
  {
    auto out = vjp([](array in) { return abs(in); }, array(-5.0f), array(1.0f));
    CHECK_EQ(out.second.item<float>(), -1.0f);
  }

  // Test sign
  {
    auto out =
        vjp([](array in) { return sign(in); }, array(-5.0f), array(10.0f));
    CHECK_EQ(out.second.item<float>(), 0.0f);
  }

  // Test negate
  {
    auto out = vjp([](array in) { return -in; }, array(1.0), array(2.0));
    CHECK(array_equal(out.second, array(-2.)).item<bool>());
  }

  // Test square
  {
    auto out =
        vjp([](array in) { return square(in); }, array(2.0f), array(3.0f));
    CHECK_EQ(out.second.item<float>(), 12.0f);
  }

  // Test sqrt
  {
    auto out = vjp(
        [](array in) { return mlx::core::sqrt(in); }, array(4.0f), array(8.0f));
    CHECK_EQ(out.second.item<float>(), 2.0f);
  }

  // Test rsqrt
  {
    auto out =
        vjp([](array in) { return rsqrt(in); }, array(4.0f), array(8.0f));
    CHECK_EQ(out.second.item<float>(), -0.5f);
  }

  // Test exp
  {
    auto out = vjp([](array in) { return exp(in); }, array(1.0f), array(2.0f));
    CHECK_EQ(out.second.item<float>(), doctest::Approx(2.0f * std::exp(1.0f)));
  }

  // Test sin
  {
    auto out =
        vjp([](array input) { return sin(input); }, array(1.0f), array(1.0f));
    CHECK(out.second.item<float>() == doctest::Approx(std::cos(1.0f)));
  }

  // Test cos
  {
    auto out =
        vjp([](array input) { return cos(input); }, array(1.0f), array(1.0f));
    CHECK(out.second.item<float>() == doctest::Approx(-std::sin(1.0f)));
  }

  // Test arctan
  {
    auto out = vjp(
        [](array input) { return arctan(input); }, array(2.0f), array(1.0f));
    CHECK(out.second.item<float>() == doctest::Approx(0.2f));
  }

  // Test arctan2
  {
    auto out = vjp(
        [](const std::vector<array>& xs) {
          return std::vector<array>{arctan2(xs[0], xs[1])};
        },
        {array(2.0f), array(3.0f)},
        {array(1.0f)});
    CHECK(out.second[0].item<float>() == doctest::Approx(3.0f / 13.0f));
    CHECK(out.second[1].item<float>() == doctest::Approx(-2.0f / 13.0f));
  }

  // Test log
  {
    auto out = vjp([](array in) { return log(in); }, array(2.0f), array(1.0f));
    CHECK_EQ(out.second.item<float>(), 0.5f);

    out = vjp([](array in) { return log(in); }, array(2.0f), array(2.0f));
    CHECK_EQ(out.second.item<float>(), 1.0f);
  }

  // Test log1p
  {
    auto out =
        vjp([](array in) { return log1p(in); }, array(1.0f), array(1.0f));
    CHECK_EQ(out.second.item<float>(), 0.5f);

    out = vjp([](array in) { return log1p(in); }, array(1.0f), array(2.0f));
    CHECK_EQ(out.second.item<float>(), 1.0f);
  }

  constexpr auto inf = std::numeric_limits<float>::infinity();

  // Test erf
  {
    auto out = vjp([](array in) { return erf(in); }, array(inf), array(1.0f));
    CHECK_EQ(out.second.item<float>(), doctest::Approx(0.0f));

    out = vjp([](array in) { return erf(in); }, array(-inf), array(2.0f));
    CHECK_EQ(out.second.item<float>(), doctest::Approx(0.0f));

    out = vjp([](array in) { return erf(in); }, array(0.0f), array(1.0f));
    CHECK_EQ(out.second.item<float>(), static_cast<float>(M_2_SQRTPI));
  }

  // Test erfinv
  {
    auto out =
        vjp([](array in) { return erfinv(in); }, array(1.0f), array(1.0f));
    CHECK_EQ(out.second.item<float>(), inf);

    out = vjp([](array in) { return erfinv(in); }, array(-1.0f), array(2.0f));
    CHECK_EQ(out.second.item<float>(), inf);

    out = vjp([](array in) { return erfinv(in); }, array(0.0f), array(1.0f));
    CHECK_EQ(out.second.item<float>(), static_cast<float>(1.0 / M_2_SQRTPI));
  }

  // Test sigmoid
  {
    auto out =
        vjp([](array in) { return sigmoid(in); }, array(0.0f), array(1.0f));
    CHECK_EQ(out.second.item<float>(), 0.25f);

    out = vjp([](array in) { return sigmoid(in); }, array(0.0f), array(2.0f));
    CHECK_EQ(out.second.item<float>(), 0.5f);
  }

  // Test add
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{inputs[0] + inputs[1]};
    };
    auto out = vjp(fun, {array(1.0), array(2.0)}, {array(3.0)}).second;
    CHECK_EQ(out[0].item<float>(), 3.0);
    CHECK_EQ(out[1].item<float>(), 3.0);

    // Check with broadcasting
    out = vjp(fun, {ones({3, 1}), ones({1, 2})}, {full({3, 2}, 2.0)}).second;
    CHECK(array_equal(out[0], full({3, 1}, 4.0)).item<bool>());
    CHECK(array_equal(out[1], full({1, 2}, 6.0)).item<bool>());
  }

  // Test subtract
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{inputs[0] - inputs[1]};
    };
    auto out = vjp(fun, {array(1.0), array(2.0)}, {array(3.0)}).second;
    CHECK_EQ(out[0].item<float>(), 3.0);
    CHECK_EQ(out[1].item<float>(), -3.0);

    // Check with broadcasting
    out = vjp(fun, {ones({3, 1}), ones({1, 2})}, {ones({3, 2})}).second;
    CHECK(array_equal(out[0], full({3, 1}, 2.0)).item<bool>());
    CHECK(array_equal(out[1], full({1, 2}, -3.0)).item<bool>());
  }

  // Test multiply
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{inputs[0] * inputs[1]};
    };
    auto out = vjp(fun, {array(4.0f), array(2.0f)}, {array(3.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 6.0f);
    CHECK_EQ(out[1].item<float>(), 12.0f);

    // Check with broadcasting
    out = vjp(fun, {full({3, 1}, 2.0f), full({1, 2}, 4.0f)}, {ones({3, 2})})
              .second;
    CHECK(array_equal(out[0], full({3, 1}, 8.0f)).item<bool>());
    CHECK(array_equal(out[1], full({1, 2}, 6.0)).item<bool>());
  }

  // Test divide
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{inputs[0] / inputs[1]};
    };
    auto out = vjp(fun, {array(4.0f), array(2.0f)}, {array(1.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 0.5f);
    CHECK_EQ(out[1].item<float>(), -1.0f);

    // Check with broadcasting
    out = vjp(fun, {full({3, 1}, 4.0f), full({1, 2}, 2.0f)}, {ones({3, 2})})
              .second;
    CHECK(array_equal(out[0], full({3, 1}, 1.0f)).item<bool>());
    CHECK(array_equal(out[1], full({1, 2}, -3.0f)).item<bool>());
  }

  // Test maximum
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{maximum(inputs[0], inputs[1])};
    };
    auto out = vjp(fun, {array(5.0f), array(2.0f)}, {array(2.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 2.0f);
    CHECK_EQ(out[1].item<float>(), 0.0f);

    out = vjp(fun, {array(2.0f), array(2.0f)}, {array(1.0f)}).second;
    auto out_a = out[0].item<float>();
    auto out_b = out[1].item<float>();
    // When inputs are equal at most one gradient is nonzero
    CHECK(
        ((out_a == 1.0f && out_b == 0.0f) || (out_a == 0.0f && out_b == 1.0f)));
  }

  // Test minimum
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{minimum(inputs[0], inputs[1])};
    };
    auto out = vjp(fun, {array(4.0f), array(2.0f)}, {array(2.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 0.0f);
    CHECK_EQ(out[1].item<float>(), 2.0f);

    out = vjp(fun, {array(2.0f), array(2.0f)}, {array(1.0f)}).second;
    auto out_a = out[0].item<float>();
    auto out_b = out[1].item<float>();
    CHECK(
        ((out_a == 1.0f && out_b == 0.0f) || (out_a == 0.0f && out_b == 1.0f)));
  }

  // Test logaddexp
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{logaddexp(inputs[0], inputs[1])};
    };

    constexpr auto inf = std::numeric_limits<float>::infinity();

    auto out = vjp(fun, {array(2.0), array(2.0f)}, {array(1.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 0.5f);
    CHECK_EQ(out[1].item<float>(), 0.5f);
    out = vjp(fun, {array(2.0), array(2.0f)}, {array(2.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 1.0f);
    CHECK_EQ(out[1].item<float>(), 1.0f);

    out = vjp(fun, {array(inf), array(2.0f)}, {array(1.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 1.0f);
    CHECK_EQ(out[1].item<float>(), 0.0f);

    out = vjp(fun, {array(-inf), array(2.0f)}, {array(1.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 0.0f);
    CHECK_EQ(out[1].item<float>(), 1.0f);

    out = vjp(fun, {array(-10.0f), array(-inf)}, {array(1.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 1.0f);
    CHECK_EQ(out[1].item<float>(), 0.0f);

    out = vjp(fun, {array(-inf), array(-inf)}, {array(1.0f)}).second;
    CHECK(std::isnan(out[0].item<float>()));
    CHECK(std::isnan(out[1].item<float>()));
  }

  // Test power
  {
    auto fun = [](std::vector<array> inputs) {
      return std::vector<array>{power(inputs[0], inputs[1])};
    };
    auto out = vjp(fun, {array(4.0f), array(3.0f)}, {array(1.0f)}).second;
    CHECK_EQ(out[0].item<float>(), 48.0f);
    CHECK_EQ(out[1].item<float>(), std::log(4.0f) * 64.0f);
  }

  // Test sum
  {
    std::vector<int> axes;
    auto fun = [&axes](array input) { return sum(input, axes); };
    axes = {};
    auto out = vjp(fun, array(2.0f), array(3.0f)).second;
    CHECK_EQ(out.item<float>(), 3.0f);

    axes = {0};
    out = vjp(fun, array({}), array(3.0f)).second;
    CHECK_EQ(out.size(), 0);
    CHECK_EQ(out.shape(), Shape{0});

    axes = {0};
    out = vjp(fun, ones({2, 2, 2}), array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2}))
              .second;
    auto expected =
        array({1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f}, {2, 2, 2});
    CHECK(array_equal(out, expected).item<bool>());

    axes = {1};
    out = vjp(fun, ones({2, 2, 2}), array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2}))
              .second;
    expected =
        array({1.0f, 2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 3.0f, 4.0f}, {2, 2, 2});
    CHECK(array_equal(out, expected).item<bool>());

    axes = {2};
    out = vjp(fun, ones({2, 2, 2}), array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2}))
              .second;
    expected =
        array({1.0f, 1.0f, 2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f}, {2, 2, 2});
    CHECK(array_equal(out, expected).item<bool>());
  }

  // Test prod
  {
    std::vector<int> axes;
    auto fun = [&axes](array input) { return prod(input, axes); };
    axes = {};
    auto out = vjp(fun, array(2.0f), array(3.0f)).second;
    CHECK_EQ(out.item<float>(), 3.0f);

    axes = {0};
    out = vjp(fun,
              array({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2, 3}),
              array(
                  {1.0f, 2.0f, 3.0f},
                  {
                      3,
                  }))
              .second;
    auto expected = array({4.0f, 10.0f, 18.0f, 1.0f, 4.0f, 9.0f}, {2, 3});
    CHECK(array_equal(out, expected).item<bool>());

    axes = {0, 1};
    out = vjp(fun,
              array({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2, 3}),
              array(1.0f))
              .second;
    expected = array({720.0f, 360.0f, 240.0f, 180.0f, 144.0f, 120.0f}, {2, 3});
    CHECK(array_equal(out, expected).item<bool>());
  }
}

TEST_CASE("test gather and take grads") {
  // Builds a unary function that calls `take` (without an axis argument) on
  // its input using a fixed set of indices.
  //
  // The indices are captured by value: the closure is returned and invoked
  // after this factory has finished, and it is implementation-defined whether
  // the by-value `indices` parameter is still alive at that point. A
  // by-reference capture could therefore dangle.
  auto linear_f = [](array indices) {
    return [indices](array input) { return take(input, indices); };
  };

  // Check linear takes: only the first row of the 4x4 source is gathered, so
  // only that row receives a cotangent.
  auto src = ones({4, 4});
  auto ind = array({0, 1, 2, 3}, uint32);
  auto out = vjp(linear_f(ind), src, ones({4})).second;
  auto out_1 = take(out, array({0}, uint32), 0);
  auto out_2 = take(out, array({1, 2, 3}, uint32), 0);
  CHECK(array_equal(out_1, ones({1, 4})).item<bool>());
  CHECK(array_equal(out_2, zeros({3, 4})).item<bool>());
  auto tangent = reshape(arange(16), {4, 4});
  out = jvp(linear_f(ind), src, tangent).second;
  CHECK(array_equal(out, array({0, 1, 2, 3})).item<bool>());

  // Repeated indices: the cotangents of all four takes accumulate in slot 0.
  src = ones({4});
  ind = array({0, 0, 0, 0}, uint32);
  out = vjp(linear_f(ind), src, ones({4})).second;
  out_1 = take(out, array({0}, uint32));
  CHECK_EQ(out_1.item<float>(), 4.0f);

  tangent = arange(4);
  out = jvp(linear_f(ind), src, tangent).second;
  CHECK(array_equal(out, array({0, 0, 0, 0})).item<bool>());

  // Check axis takes
  src = ones({4, 4});
  ind = array({0, 1, 2, 3}, uint32);

  auto fun = [&ind](array input) { return take(input, ind, 0); };

  out = vjp(fun, src, ones({4, 4})).second;
  CHECK(array_equal(out, src).item<bool>());

  out = jvp(fun, src, ones({4, 4})).second;
  CHECK(array_equal(out, src).item<bool>());

  // Check index throw: differentiating with respect to the indices themselves
  // is expected to be rejected.
  auto fun_throw = [](std::vector<array> inputs) {
    return std::vector<array>{take(inputs[0], inputs[1])};
  };

  CHECK_THROWS_AS(
      vjp(fun_throw, {src, ind}, {ones({4, 4})}), std::invalid_argument);

  CHECK_THROWS_AS(
      jvp(fun_throw, {src, ind}, {ones({4, 4}), ind}), std::invalid_argument);
}

TEST_CASE("test slice grads") {
  // The slice bounds live in locals captured by reference, so the same
  // function picks up the new bounds each time they are reassigned below.
  Shape lo = {5, 0, 0};
  Shape hi = {7, 2, 4};
  Shape step = {1, 1, 1};

  auto slice_fn = [&lo, &hi, &step](array input) {
    return slice(input, lo, hi, step);
  };

  // 3-D unit-stride slice: the 2*2*4 cotangent entries land in the input grad.
  auto primal = ones({8, 8, 8});
  auto result = vjp(slice_fn, primal, ones({2, 2, 4})).second;
  CHECK_EQ(sum(result).item<float>(), 16.);

  result = jvp(slice_fn, primal, full({8, 8, 8}, 2.0f)).second;
  CHECK(array_equal(result, full({2, 2, 4}, 2.0f)).item<bool>());

  // 2-D slice of the last two rows: the first two rows get no gradient.
  primal = ones({4, 4});
  lo = {2, 0};
  hi = {4, 4};
  step = {1, 1};
  result = vjp(slice_fn, primal, ones({2, 4})).second;
  auto top_rows = take(result, array({0, 1}, uint32), 0);
  auto bottom_rows = take(result, array({2, 3}, uint32), 0);

  CHECK(array_equal(top_rows, zeros({2, 4})).item<bool>());
  CHECK(array_equal(bottom_rows, ones({2, 4})).item<bool>());

  // Strided slice: cotangent values are scattered to every other position.
  lo = {0, 0};
  hi = {4, 4};
  step = {2, 2};
  auto cotan = array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
  result = vjp(slice_fn, primal, cotan).second;
  auto want = astype(
      array({1, 0, 2, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0}, {4, 4}), float32);
  CHECK(array_equal(result, want).item<bool>());

  result = jvp(slice_fn, primal, ones({4, 4})).second;
  CHECK(array_equal(result, ones({2, 2})).item<bool>());

  // Empty slices.
  lo = {0, 0};
  hi = {0, 4};
  cotan = reshape(array({}), {0, 2});
  result = vjp(slice_fn, primal, cotan).second;
  CHECK(array_equal(result, zeros({4, 4})).item<bool>());

  result = jvp(slice_fn, primal, ones({4, 4})).second;
  CHECK_EQ(result.size(), 0);
}

TEST_CASE("test min and max vjp") {
  // Both sections reduce over `axes`, which the reduction lambda captures by
  // reference so it can be changed between checks. The expected values below
  // show the cotangent being split evenly between tied extrema.

  // Test min
  {
    std::vector<int> axes;
    auto fun = [&axes](array input) { return min(input, axes); };

    // No reduction axes on a scalar: the cotangent passes straight through.
    axes = {};
    auto out = vjp(fun, array(2.0f), array(3.0f)).second;
    CHECK_EQ(out.item<float>(), 3.0f);

    axes = {0};
    auto in = reshape(array({1.0f, 2.0f, 2.0f, -1.0f}), {2, 2});
    auto v = array({3.0f, 7.0f});
    out = vjp(fun, in, v).second;
    auto expected = reshape(array({3.0f, 0.0f, 0.0f, 7.0f}), {2, 2});
    CHECK(array_equal(out, expected).item<bool>());

    // Ties: each minimum appears twice, so each tied entry gets half.
    axes = {0, 2};
    in = reshape(
        array({1.0f, 0.0f, 0.0f, 1.0f, -1.0f, -1.0f, 1.0f, 0.0f}), {2, 2, 2});
    v = array({3.0f, 7.0f});
    out = vjp(fun, in, v).second;
    expected = reshape(
        array({0.0f, 0.0f, 3.5f, 0.0f, 1.5f, 1.5f, 0.0f, 3.5f}), {2, 2, 2});
    CHECK(array_equal(out, expected).item<bool>());
  }

  // Test max
  {
    std::vector<int> axes;
    auto fun = [&axes](array input) { return max(input, axes); };

    // No reduction axes on a scalar: the cotangent passes straight through.
    axes = {};
    auto out = vjp(fun, array(2.0f), array(3.0f)).second;
    CHECK_EQ(out.item<float>(), 3.0f);

    axes = {0};
    auto in = reshape(array({1.0f, 2.0f, 2.0f, -1.0f}), {2, 2});
    auto v = array({3.0f, 7.0f});
    out = vjp(fun, in, v).second;
    auto expected = reshape(array({0.0f, 7.0f, 3.0f, 0.0f}), {2, 2});
    CHECK(array_equal(out, expected).item<bool>());

    // Ties: the second output's maximum appears twice, so it is halved.
    axes = {0, 2};
    in = reshape(
        array({1.0f, 0.0f, 0.0f, 1.0f, -1.0f, -1.0f, 1.0f, 0.0f}), {2, 2, 2});
    v = array({3.0f, 7.0f});
    out = vjp(fun, in, v).second;
    expected = reshape(
        array({3.0f, 0.0f, 0.0f, 3.5f, 0.0f, 0.0f, 3.5f, 0.0f}), {2, 2, 2});
    CHECK(array_equal(out, expected).item<bool>());
  }
}

TEST_CASE("test reshape and transpose grads") {
  // reshape: the vjp maps the cotangent back to the input shape and the jvp
  // maps the tangent forward to the output shape.
  {
    auto to_3x4 = [](array a) { return reshape(a, {3, 4}); };

    auto grad_in = vjp(to_3x4, ones({12}), full({3, 4}, 2.0f)).second;
    CHECK(array_equal(grad_in, full({12}, 2.0f)).item<bool>());

    auto tan_out = jvp(to_3x4, ones({12}), full({12}, 2.0f)).second;
    CHECK(array_equal(tan_out, full({3, 4}, 2.0f)).item<bool>());
  }

  // transpose: the vjp applies the inverse permutation to the cotangent and
  // the jvp applies the same permutation to the tangent.
  {
    auto permute = [](array a) { return transpose(a, {1, 2, 0}); };

    auto cotangent = reshape(arange(24), {3, 4, 2});
    auto grad_in = vjp(permute, ones({2, 3, 4}), cotangent).second;
    CHECK(array_equal(grad_in, transpose(cotangent, {2, 0, 1})).item<bool>());

    auto tan_in = reshape(arange(24), {2, 3, 4});
    auto tan_out = jvp(permute, ones({2, 3, 4}), tan_in).second;
    CHECK(array_equal(tan_out, transpose(tan_in, {1, 2, 0})).item<bool>());
  }
}

TEST_CASE("test copy grads") {
  // copy is the identity for differentiation: cotangents and tangents are
  // expected to pass through unchanged.
  auto copy_fn = [](array a) { return copy(a); };

  auto cotangent = arange(4, float32);
  auto result = vjp(copy_fn, ones({4}), cotangent).second;
  CHECK(array_equal(result, arange(4, float32)).item<bool>());

  auto tan_in = arange(4, float32);
  result = jvp(copy_fn, ones({4}), tan_in).second;
  CHECK(array_equal(result, tan_in).item<bool>());
}

TEST_CASE("test matmul vjp") {
  // Two-input wrapper around matmul, in the vector-in/vector-out form.
  auto mm = [](std::vector<array> inputs) {
    return std::vector<array>{matmul(inputs[0], inputs[1])};
  };

  // (1x2) @ (2x1) with a 1x1 cotangent.
  auto lhs = array({1.0f, 2.0f}, {1, 2});
  auto rhs = array({3.0f, 4.0f}, {2, 1});
  auto grads = vjp(mm, {lhs, rhs}, {array({2.0f}, {1, 1})}).second;

  CHECK(array_equal(grads[0], array({6.0f, 8.0f}, {1, 2})).item<bool>());
  CHECK(array_equal(grads[1], array({2.0f, 4.0f}, {2, 1})).item<bool>());

  // (2x1) @ (1x2), an outer product, with a 2x2 cotangent.
  lhs = array({1.0f, 2.0f}, {2, 1});
  rhs = array({3.0f, 4.0f}, {1, 2});
  grads = vjp(mm, {lhs, rhs}, {array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2})}).second;
  CHECK(array_equal(grads[0], array({11.0f, 25.0f}, {2, 1})).item<bool>());
  CHECK(array_equal(grads[1], array({7.0f, 10.0f}, {1, 2})).item<bool>());

  // Batched case: two independent (2x1) @ (1x2) products.
  lhs = array({1.0f, 2.0f, 1.0f, 2.0f}, {2, 2, 1});
  rhs = array({1.0f, 1.0f, 2.0f, 2.0f}, {2, 1, 2});
  auto batched_grads = vjp(mm, {lhs, rhs}, {ones({2, 2, 2})}).second;
  auto want_lhs = array({2.0f, 2.0f, 4.0f, 4.0f}, {2, 2, 1});
  auto want_rhs = array({3.0f, 3.0f, 3.0f, 3.0f}, {2, 1, 2});
  CHECK(array_equal(batched_grads[0], want_lhs).item<bool>());
  CHECK(array_equal(batched_grads[1], want_rhs).item<bool>());
}

TEST_CASE("test concatenate grads") {
  // Five single-element pieces; only pieces 2 and 4 are differentiated.
  auto pieces = split(arange(5, float32), 5);
  eval(pieces);

  // Replaces pieces 2 and 4 with the traced inputs before concatenating, so
  // the remaining pieces act as constants.
  auto concat_fn = [&pieces](const std::vector<array>& inputs) {
    pieces[2] = inputs[0];
    pieces[4] = inputs[1];
    return std::vector<array>{concatenate(pieces, 0)};
  };

  // vjp: each input receives the cotangent entry at its own position.
  auto result =
      vjp(concat_fn, {pieces[2], pieces[4]}, {arange(5, float32)}).second;

  CHECK_EQ(result.size(), 2);
  CHECK_EQ(result[0].item<float>(), 2.0f);
  CHECK_EQ(result[1].item<float>(), 4.0f);

  // jvp: tangents appear at positions 2 and 4, with zeros for the constants.
  result = jvp(concat_fn,
               {pieces[2], pieces[4]},
               {array({2.0f}, {1}), array({3.0f}, {1})})
               .second;
  CHECK_EQ(result.size(), 1);
  CHECK(array_equal(result[0], array({0.0f, 0.0f, 2.0f, 0.0f, 3.0f}))
            .item<bool>());
}

TEST_CASE("test split grads") {
  array x = arange(6, float32);
  eval(x);

  // All three chunks contribute: chunk0 * chunk1 + chunk2.
  {
    auto combine_all = [](const array& x) {
      auto chunks = split(x, 3);
      return chunks[0] * chunks[1] + chunks[2];
    };
    auto grad_x = vjp(combine_all, {x}, {ones({2})}).second;

    CHECK_EQ(grad_x.size(), 6);
    CHECK(array_equal(grad_x, array({2.0f, 3.0f, 0.0f, 1.0f, 1.0f, 1.0f}))
              .item<bool>());
  }

  // The middle chunk is unused, so its gradient entries must be zero.
  {
    auto skip_middle = [](const array& x) {
      auto chunks = split(x, 3);
      return chunks[0] * chunks[2];
    };
    auto grad_x = vjp(skip_middle, {x}, {ones({2})}).second;

    CHECK_EQ(grad_x.size(), 6);
    CHECK(array_equal(grad_x, array({4.0f, 5.0f, 0.0f, 0.0f, 0.0f, 1.0f}))
              .item<bool>());
  }
}

TEST_CASE("test comparison grads") {
  auto x = ones({3, 1});
  auto y = zeros({1, 3});

  // Returns true when both vjps and the jvp of the binary comparison `fn`
  // are all zeros of the expected shapes.
  auto check_vjp_jvp = [&x, &y](auto fn) {
    auto as_vector_fn = [&fn](std::vector<array> inputs) {
      return std::vector<array>{fn(inputs[0], inputs[1], default_device())};
    };
    auto out_shape = broadcast_shapes(x.shape(), y.shape());

    std::vector<array> input_grads =
        vjp(as_vector_fn, {x, y}, {ones(out_shape)}).second;
    const bool dx_is_zero =
        array_equal(input_grads[0], zeros(x.shape())).item<bool>();
    const bool dy_is_zero =
        array_equal(input_grads[1], zeros(y.shape())).item<bool>();

    std::vector<array> out_tangents =
        jvp(as_vector_fn, {x, y}, {ones(x.shape()), ones(y.shape())}).second;
    const bool jvp_is_zero =
        array_equal(out_tangents[0], zeros(out_shape)).item<bool>();

    return dx_is_zero && dy_is_zero && jvp_is_zero;
  };

  CHECK(check_vjp_jvp(equal));
  CHECK(check_vjp_jvp(greater));
  CHECK(check_vjp_jvp(less));
  CHECK(check_vjp_jvp(greater_equal));
  CHECK(check_vjp_jvp(less_equal));
}

TEST_CASE("test as_strided grads") {
  auto x = ones({11});
  Shape view_shape = {5, 5};
  Strides view_strides = {1, 1};
  size_t start = 0;

  // The view parameters are captured by reference and changed between checks.
  auto strided_view = [&view_shape, &view_strides, &start](array input) {
    return as_strided(input, view_shape, view_strides, start);
  };

  // Overlapping 5x5 window: each element's gradient is the number of times
  // it appears in the view.
  auto grad_x = vjp(strided_view, x, ones(view_shape)).second;
  auto want = array({1, 2, 3, 4, 5, 4, 3, 2, 1, 0, 0});
  CHECK(array_equal(grad_x, want).item<bool>());

  // Same window shifted by one element.
  start = 1;
  grad_x = vjp(strided_view, x, ones(view_shape)).second;
  want = array({0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0});
  CHECK(array_equal(grad_x, want).item<bool>());

  // Zero stride on the first axis: three elements are each repeated 3 times.
  start = 3;
  view_shape = {3, 3};
  view_strides = {0, 1};
  grad_x = vjp(strided_view, x, ones(view_shape)).second;
  want = array({0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0});
  CHECK(array_equal(grad_x, want).item<bool>());

  // Same view with a non-uniform cotangent: columns of the cotangent are
  // summed into the three source elements.
  start = 3;
  view_shape = {3, 3};
  view_strides = {0, 1};
  grad_x =
      vjp(strided_view, x, reshape(astype(arange(9), x.dtype()), {3, 3}))
          .second;
  want = array({0, 0, 0, 9, 12, 15, 0, 0, 0, 0, 0});
  CHECK(array_equal(grad_x, want).item<bool>());
}

TEST_CASE("test jvp from vjp") {
  // For element-wise ops with all-ones cotangents and tangents, the summed
  // vjps are expected to equal the jvp. Each section checks that identity for
  // one family of ops.

  // Unary element-wise ops
  {
    auto x = random::uniform({5, 10});
    eval(x);

    // `fn` is an op taking (array, device); returns true when vjp == jvp.
    auto compute_derivs = [&x](auto fn) {
      auto fn_wrap = [&fn](array input) { return fn(input, default_device()); };

      // Compute vjp
      array vjp_out = vjp(fn_wrap, x, ones(x.shape())).second;

      // Compute jvp
      array jvp_out = jvp(fn_wrap, x, ones(x.shape())).second;

      return array_equal(vjp_out, jvp_out).item<bool>();
    };

    CHECK(compute_derivs(mlx::core::abs));
    CHECK(compute_derivs(mlx::core::cos));
    CHECK(compute_derivs(mlx::core::erf));
    CHECK(compute_derivs(mlx::core::erfinv));
    CHECK(compute_derivs(mlx::core::exp));
    CHECK(compute_derivs(mlx::core::log));
    CHECK(compute_derivs(mlx::core::log1p));
    CHECK(compute_derivs(mlx::core::negative));
    CHECK(compute_derivs(mlx::core::sigmoid));
    CHECK(compute_derivs(mlx::core::sign));
    CHECK(compute_derivs(mlx::core::sin));
    CHECK(compute_derivs(mlx::core::square));
    CHECK(compute_derivs(mlx::core::sqrt));
    CHECK(compute_derivs(mlx::core::rsqrt));
  }

  // Binary element-wise ops
  {
    auto x = random::uniform({5, 10});
    auto y = random::uniform({5, 10});
    eval(x, y);

    // `fn` is an op taking (array, array, device).
    auto compute_derivs = [&x, &y](auto fn) {
      auto fn_wrap = [&fn](std::vector<array> inputs) {
        return std::vector<array>{fn(inputs[0], inputs[1], default_device())};
      };

      // Compute vjp and add results
      auto vjps = vjp(fn_wrap, {x, y}, {ones(x.shape())}).second;
      array vjp_out = add(vjps[0], vjps[1]);

      // Compute jvp
      array jvp_out =
          jvp(fn_wrap, {x, y}, {ones(x.shape()), ones(y.shape())}).second[0];
      return array_equal(vjp_out, jvp_out).item<bool>();
    };

    CHECK(compute_derivs(add));
    CHECK(compute_derivs(divide));
    CHECK(compute_derivs(logaddexp));
    CHECK(compute_derivs(maximum));
    CHECK(compute_derivs(minimum));
    CHECK(compute_derivs(multiply));
    CHECK(compute_derivs(subtract));
    CHECK(compute_derivs(power));
  }

  // Conditional selection element-wise op
  {
    auto condition = random::randint(0, 2, {5, 10});
    auto x = random::uniform({5, 10});
    auto y = random::uniform({5, 10});
    eval(condition, x, y);

    // `fn` is an op taking (condition, array, array, device).
    auto compute_derivs = [&condition, &x, &y](auto fn) {
      auto fn_wrap = [&fn](std::vector<array> inputs) {
        return std::vector<array>{
            fn(inputs[0], inputs[1], inputs[2], default_device())};
      };

      // Compute vjp and add results
      auto vjps = vjp(fn_wrap, {condition, x, y}, {ones(x.shape())}).second;
      auto vjp_out = add(add(vjps[0], vjps[1]), vjps[2]);

      // Compute jvp; tangents are listed in the same order as the primals
      // (condition, x, y).
      array jvp_out =
          jvp(fn_wrap,
              {condition, x, y},
              {ones(condition.shape()), ones(x.shape()), ones(y.shape())})
              .second[0];

      array result = array_equal(vjp_out, jvp_out);
      return result.item<bool>();
    };

    CHECK(compute_derivs(where));
  }
}

TEST_CASE("test complex gradients") {
  // complex + complex
  {
    auto sum_fn = [](std::vector<array> inputs) {
      return std::vector<array>{add(inputs[0], inputs[1], default_device())};
    };

    // Forward mode: the output tangent is the sum of the input tangents.
    auto lhs = array(complex64_t{1.0, 1.0});
    auto rhs = array(complex64_t{1.0, 1.0});
    auto lhs_tan = array(complex64_t{1.0, 2.0});
    auto rhs_tan = array(complex64_t{2.0, 1.0});
    auto tangents_out = jvp(sum_fn, {lhs, rhs}, {lhs_tan, rhs_tan}).second;
    CHECK_EQ(tangents_out[0].item<complex64_t>(), complex64_t{3.0, 3.0});

    // Reverse mode: each input receives the cotangent unchanged.
    auto cotangent = array(complex64_t{3.0, 3.0});
    auto grads = vjp(sum_fn, {lhs, rhs}, {cotangent}).second;
    CHECK_EQ(grads[0].item<complex64_t>(), complex64_t{3.0, 3.0});
    CHECK_EQ(grads[1].item<complex64_t>(), complex64_t{3.0, 3.0});
  }

  // complex * real: the gradient for the real input must stay float32.
  {
    auto product_fn =
        [](const std::vector<array>& inputs) -> std::vector<array> {
      return {multiply(inputs[0], inputs[1])};
    };

    // Forward mode
    auto lhs = array(complex64_t{2.0, 4.0});
    auto rhs = array(3.0f);
    auto lhs_tan = array(complex64_t{1.0, 2.0});
    auto rhs_tan = array(2.0f);
    auto tangents_out = jvp(product_fn, {lhs, rhs}, {lhs_tan, rhs_tan}).second;
    CHECK_EQ(tangents_out[0].item<complex64_t>(), complex64_t{7.0, 14.0});

    // Reverse mode
    auto cotangent = array(complex64_t{2.0, 3.0});
    auto grads = vjp(product_fn, {lhs, rhs}, {cotangent}).second;
    CHECK_EQ(grads[0].dtype(), complex64);
    CHECK_EQ(grads[0].item<complex64_t>(), complex64_t{6.0, 9.0});
    CHECK_EQ(grads[1].dtype(), float32);
    CHECK_EQ(grads[1].item<float>(), 16);
  }

  // complex / complex
  {
    auto quotient_fn =
        [](const std::vector<array>& inputs) -> std::vector<array> {
      return {divide(inputs[0], inputs[1])};
    };

    // Forward mode
    auto lhs = array(complex64_t{2.0, 3.0});
    auto rhs = array(complex64_t{1.0, 2.0});
    auto lhs_tan = array(complex64_t{3.0, 4.0});
    auto rhs_tan = array(complex64_t{4.0, -2.0});
    auto tangents_out =
        jvp(quotient_fn, {lhs, rhs}, {lhs_tan, rhs_tan}).second;
    CHECK_EQ(
        tangents_out[0].item<complex64_t>(),
        doctest::Approx(complex64_t{2.6, 2.8}));

    // Reverse mode
    auto cotangent = array(complex64_t{2.0, -4.0});
    auto grads = vjp(quotient_fn, {lhs, rhs}, {cotangent}).second;
    CHECK_EQ(grads[0].item<complex64_t>(), complex64_t{2.0, 0.0});
    CHECK_EQ(grads[1].item<complex64_t>(), complex64_t{-3.2, -0.4});
  }
}

TEST_CASE("test scan grads") {
  // Each section builds a scan over axis 0 whose `reverse` / `inclusive`
  // flags are captured by reference and toggled between checks. The flags are
  // declared `bool` so they are passed to the scan ops without an implicit
  // int-to-bool conversion.

  // Test cumsum
  {
    int axis = 0;
    bool reverse = false;
    bool inclusive = true;
    auto fun = [&axis, &reverse, &inclusive](array x) {
      return cumsum(x, axis, reverse, inclusive);
    };

    // Forward inclusive
    auto out = vjp(fun, ones({4}), ones({4})).second;
    auto expected = array({4.0f, 3.0f, 2.0f, 1.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Reverse inclusive
    reverse = true;
    out = vjp(fun, ones({4}), ones({4})).second;
    expected = array({1.0f, 2.0f, 3.0f, 4.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Reverse exclusive
    reverse = true;
    inclusive = false;
    out = vjp(fun, ones({4}), ones({4})).second;
    expected = array({0.0f, 1.0f, 2.0f, 3.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Forward exclusive
    reverse = false;
    inclusive = false;
    out = vjp(fun, ones({4}), ones({4})).second;
    expected = array({3.0f, 2.0f, 1.0f, 0.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());
  }

  // Test cumprod
  {
    int axis = 0;
    bool reverse = false;
    bool inclusive = true;
    auto fun = [&axis, &reverse, &inclusive](array x) {
      return cumprod(x, axis, reverse, inclusive);
    };

    // Forward inclusive
    auto x = array({1.0f, 2.0f, 3.0f, 4.0f}, {4});
    auto g = array({1.0f, 2.0f, 3.0f, 4.0f}, {4});
    auto out = vjp(fun, x, g).second;
    auto expected = array({119.0f, 59.0f, 38.0f, 24.0f}, {4});
    CHECK(allclose(out, expected).item<bool>());

    // Reverse inclusive
    reverse = true;
    out = vjp(fun, x, g).second;
    expected = array({24.0f, 36.0f, 36.0f, 31.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Reverse exclusive
    inclusive = false;
    out = vjp(fun, x, g).second;
    expected = array({0.0f, 12.0f, 16.0f, 15.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Forward exclusive
    reverse = false;
    out = vjp(fun, x, g).second;
    expected = array({32.0f, 15.0f, 8.0f, 0.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());
  }

  // Test cumsum jvp
  {
    int axis = 0;
    bool reverse = false;
    bool inclusive = true;
    auto fun = [&axis, &reverse, &inclusive](array x) {
      return cumsum(x, axis, reverse, inclusive);
    };

    // Forward inclusive
    auto x = array({1.0f, 2.0f, 3.0f, 4.0f}, {4});
    auto out = jvp(fun, x, ones({4})).second;
    auto expected = array({1.0f, 2.0f, 3.0f, 4.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Reverse inclusive
    reverse = true;
    out = jvp(fun, x, ones({4})).second;
    expected = array({4.0f, 3.0f, 2.0f, 1.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Reverse exclusive
    inclusive = false;
    out = jvp(fun, x, ones({4})).second;
    expected = array({3.0f, 2.0f, 1.0f, 0.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());

    // Forward exclusive
    reverse = false;
    out = jvp(fun, x, ones({4})).second;
    expected = array({0.0f, 1.0f, 2.0f, 3.0f}, {4});
    CHECK(array_equal(out, expected).item<bool>());
  }
}

TEST_CASE("test update state") {
  // Arrays assigned to captured variables while a function is being
  // differentiated must still be usable and evaluable afterwards.
  auto scale = array({1.0});
  auto weights = array({1.0, 1.0});
  auto accum = array({0.0, 0.0});

  // Overwrites both captured arrays as a side effect of computing the loss.
  auto loss_fn = [&accum, &weights](array s) {
    weights = s * weights;
    accum = accum + weights;
    return sum(weights);
  };

  grad(loss_fn)(scale);
  eval(accum);

  CHECK(!accum.has_primitive());
  CHECK(accum.is_available());
  CHECK(array_equal(accum, array({1.0, 1.0})).item<bool>());
}

TEST_CASE("test grad types") {
  // Single-input grad: the gradient dtype must match the input dtype.
  {
    auto total = [](array x) { return sum(x); };

    for (auto dtype : {float16, bfloat16, float32}) {
      auto input = array(1.0, dtype);
      auto gradient = grad(total)(input);
      CHECK_EQ(gradient.dtype(), dtype);
    }
  }

  // Multi-input grad: the same dtype check on the first returned gradient.
  {
    auto total_of_sum = [](std::vector<array> inputs) {
      return sum(inputs[0] + inputs[1]);
    };

    for (auto dtype : {float16, bfloat16, float32}) {
      auto first = array(1.0, dtype);
      auto second = array(1.0, dtype);
      auto gradients = grad(total_of_sum)({first, second});
      CHECK_EQ(gradients[0].dtype(), dtype);
    }
  }
}

TEST_CASE("test grad dynamic slices") {
  // slice with an array-valued start index: the cotangent lands in the
  // selected row and the other row gets zeros.
  {
    auto first_row = [](const array& x) {
      return slice(x, array({0}), {0}, {1, 2});
    };
    auto primal = array({1, 2, 3, 4}, {2, 2});
    auto grad_in = vjp(first_row, primal, array({1, 1}, {1, 2})).second;
    CHECK(array_equal(grad_in, array({1, 1, 0, 0}, {2, 2})).item<bool>());
  }

  // slice_update with an array-valued start index: the overwritten row of the
  // destination gets zero gradient and the update gets the cotangent of the
  // region it was written to.
  {
    auto write_first_row = [](const std::vector<array>& inputs) {
      const auto& dest = inputs[0];
      const auto& patch = inputs[1];
      return std::vector<array>{slice_update(dest, patch, array({0}), {0})};
    };
    auto dest = zeros({2, 2});
    auto patch = array({3.f, 4.f}, {1, 2});
    auto grads = vjp(write_first_row, {dest, patch}, {ones({2, 2})}).second;
    CHECK(allclose(grads[0], array({0.f, 0.f, 1.f, 1.f}, {2, 2})).item<bool>());
    CHECK(allclose(grads[1], ones({1, 2})).item<bool>());
  }
}

TEST_CASE("test masked_scatter autograd") {
  // Test jvp: masked positions take the source tangent, the rest keep the
  // destination tangent.
  {
    auto dest = array({10.f, 20.f, 30.f, 40.f}, {4});
    auto mask = array({false, true, false, true}, bool_);
    auto values = array({7.f, 8.f}, {2});

    auto dest_tan = array({1.f, 2.f, 3.f, 4.f}, {4});
    auto values_tan = array({9.f, 11.f}, {2});

    auto scatter_fn = [&mask](const std::vector<array>& in) {
      return std::vector<array>{masked_scatter(in[0], mask, in[1])};
    };

    auto tangents_out =
        jvp(scatter_fn, {dest, values}, {dest_tan, values_tan}).second;
    CHECK_EQ(tangents_out.size(), 1);
    CHECK(array_equal(tangents_out[0], array({1.f, 9.f, 3.f, 11.f}, {4}))
              .item<bool>());
  }

  // Test vjp: the destination gradient is zero where the mask is set, and
  // every scattered source element receives the cotangent.
  {
    auto dest = array({10.f, 20.f, 30.f, 40.f}, {4});
    auto mask = array({true, false, false, true}, bool_);
    auto values = array({7.f, 8.f}, {2});

    auto scatter_sum = [&mask](const std::vector<array>& xs) {
      return std::vector<array>{sum(masked_scatter(xs[0], mask, xs[1]))};
    };

    auto result = vjp(scatter_sum, {dest, values}, {array(1.f)});
    const auto& input_grads = result.second;

    CHECK(array_equal(input_grads[0], array({0.f, 1.f, 1.f, 0.f}, {4}))
              .item<bool>());
    CHECK(array_equal(input_grads[1], array({1.f, 1.f}, {2})).item<bool>());
  }
}


================================================
FILE: tests/blas_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <numeric>

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test matmul") {
  // A scalar operand is rejected.
  auto lhs = array(1);
  auto rhs = array({1.0});
  CHECK_THROWS_AS(matmul(lhs, rhs), std::invalid_argument);

  // Vector times vector yields a scalar.
  lhs = array({1.0});
  rhs = array({1.0});
  auto prod = matmul(lhs, rhs);
  CHECK_EQ(prod.shape(), Shape{});
  CHECK_EQ(prod.size(), 1);
  CHECK_EQ(prod.dtype(), float32);
  CHECK_EQ(prod.item<float>(), 1.0f);

  // Mismatched inner dimensions are rejected for every rank combination.
  lhs = ones({2, 4});
  rhs = ones({2});
  CHECK_THROWS_AS(matmul(lhs, rhs), std::invalid_argument);

  lhs = ones({2, 4});
  rhs = ones({3, 2});
  CHECK_THROWS_AS(matmul(lhs, rhs), std::invalid_argument);

  lhs = ones({2, 4});
  rhs = ones({4, 3, 2});
  CHECK_THROWS_AS(matmul(lhs, rhs), std::invalid_argument);

  lhs = ones({2});
  rhs = ones({4, 2});
  CHECK_THROWS_AS(matmul(lhs, rhs), std::invalid_argument);

  lhs = ones({2, 3});
  rhs = ones({4, 2});
  CHECK_THROWS_AS(matmul(lhs, rhs), std::invalid_argument);

  lhs = ones({2, 4, 3});
  rhs = ones({4, 2});
  CHECK_THROWS_AS(matmul(lhs, rhs), std::invalid_argument);

  // Plain 2-D product.
  lhs = ones({2, 4});
  rhs = ones({4, 2});
  prod = matmul(lhs, rhs);
  CHECK(array_equal(prod, full({2, 2}, 4.0f)).item<bool>());

  // Mixed int32 / float32 operands.
  lhs = ones({2, 4}, int32);
  rhs = ones({4, 2}, float32);
  prod = matmul(lhs, rhs);
  CHECK(array_equal(prod, full({2, 2}, 4.0f)).item<bool>());

  // Check single dimensions
  lhs = ones({4});
  rhs = ones({4, 2});
  prod = matmul(lhs, rhs);
  CHECK(array_equal(prod, full({2}, 4.0f)).item<bool>());

  lhs = ones({2, 4});
  rhs = ones({4});
  prod = matmul(lhs, rhs);
  CHECK(array_equal(prod, full({2}, 4.0f)).item<bool>());

  lhs = ones({4});
  rhs = ones({4});
  prod = matmul(lhs, rhs);
  CHECK(array_equal(prod, full({}, 4.0f)).item<bool>());

  // Test transposed arrays
  lhs = array({1.0f, 1.0f, 1.0f, 1.0f}, {1, 4});
  rhs = array({1.0f, 1.0f, 1.0f, 1.0f}, {4, 1});
  prod = matmul(transpose(lhs), transpose(rhs));
  CHECK(array_equal(prod, ones({4, 4})).item<bool>());

  lhs = array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
  rhs = array({1.0f, 2.0f, 1.0f, 2.0f}, {2, 2});
  prod = matmul(transpose(lhs), rhs);
  CHECK(
      array_equal(prod, array({4.0f, 8.0f, 6.0f, 12.0f}, {2, 2})).item<bool>());

  prod = matmul(lhs, transpose(rhs));
  CHECK(array_equal(prod, array({5.0f, 5.0f, 11.0f, 11.0f}, {2, 2}))
            .item<bool>());

  prod = matmul(transpose(lhs), transpose(rhs));
  CHECK(array_equal(prod, array({7.0f, 7.0f, 10.0f, 10.0f}, {2, 2}))
            .item<bool>());

  // Test broadcasting for both arrays
  lhs = ones({5, 4, 2});
  rhs = ones({2, 3});
  prod = matmul(lhs, rhs);
  CHECK(array_equal(prod, full({5, 4, 3}, 2.0f)).item<bool>());

  lhs = ones({5, 1, 4, 2});
  rhs = ones({1, 7, 2, 3});
  prod = matmul(lhs, rhs);
  CHECK(array_equal(prod, full({5, 7, 4, 3}, 2.0f)).item<bool>());

  // Test batched matmul with transpose
  lhs = ones({2, 2, 4});
  rhs = ones({2, 4, 2});
  prod = matmul(transpose(lhs, {0, 2, 1}), transpose(rhs, {0, 2, 1}));
  CHECK(array_equal(prod, full({2, 4, 4}, 2.0f)).item<bool>());
}


================================================
FILE: tests/compile_tests.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

// Required for using M_SQRT2 in MSVC.
#define _USE_MATH_DEFINES

#include "doctest/doctest.h"

#include "mlx/mlx.h"
#include "mlx/primitives.h"

using namespace mlx::core;

// Returns a one-element vector holding the sum of the first two inputs.
std::vector<array> simple_fun(const std::vector<array>& inputs) {
  const auto& lhs = inputs[0];
  const auto& rhs = inputs[1];
  return std::vector<array>{lhs + rhs};
}

TEST_CASE("test simple compile") {
  // A single compiled callable is reused for every input variation below.
  auto compiled_add = compile(simple_fun);
  auto expected_pair = array({3.0f, 4.0f});

  // Scalar inputs, called twice with the same values.
  auto result = compiled_add({array(1.0f), array(2.0f)})[0];
  CHECK_EQ(result.item<float>(), 3.0f);

  result = compiled_add({array(1.0f), array(2.0f)})[0];
  CHECK_EQ(result.item<float>(), 3.0f);

  // Change the shapes
  result = compiled_add({array({1.0f, 2.0f}), array(2.0f)})[0];
  CHECK(array_equal(result, expected_pair).item<bool>());

  result = compiled_add({array(2.0f), array({1.0f, 2.0f})})[0];
  CHECK(array_equal(result, expected_pair).item<bool>());

  // Change the types
  result = compiled_add({array(2, int32), array({1.0f, 2.0f})})[0];
  CHECK(array_equal(result, expected_pair).item<bool>());

  result = compiled_add({array(2.0f), array({1, 2}, int32)})[0];
  CHECK(array_equal(result, expected_pair).item<bool>());
}

std::vector<array> grad_fun(const std::vector<array>& inputs) {
  auto loss = [](std::vector<array> ins) { return exp(ins[0] + ins[1]); };
  return grad(loss, {0, 1})(inputs);
}

TEST_CASE("test compile with grad") {
  auto lhs = array(1.0f);
  auto rhs = array(1.0f);

  // Gradients from the plain function serve as the reference for the
  // compiled version.
  auto reference = grad_fun({lhs, rhs});
  auto compiled = compile(grad_fun)({lhs, rhs});

  for (int i = 0; i < 2; ++i) {
    CHECK(allclose(compiled[i], reference[i]).item<bool>());
  }
}

TEST_CASE("test compile inputs with primitive") {
  auto [key_a, key_b] = random::split(random::key(0));

  // Reference result from the uncompiled function.
  auto lhs = random::uniform({5, 5}, key_a);
  auto rhs = random::uniform({5, 5}, key_b);
  auto reference = simple_fun({lhs, rhs})[0];

  // Rebuild the inputs from the same keys and feed them to the compiled
  // function.
  lhs = random::uniform({5, 5}, key_a);
  rhs = random::uniform({5, 5}, key_b);
  auto result = compile(simple_fun)({lhs, rhs})[0];
  CHECK(array_equal(reference, result).item<bool>());

  // Same thing twice
  result = compile(simple_fun)({lhs, rhs})[0];
  CHECK(array_equal(reference, result).item<bool>());
}

// Adds a freshly created scalar 1.0f to the first input.
std::vector<array> fun_creats_array(const std::vector<array>& inputs) {
  auto one = array(1.0f);
  return std::vector<array>{inputs[0] + one};
}

TEST_CASE("test compile with created array") {
  auto compiled = compile(fun_creats_array);

  // First call: 2 + 1.
  auto results = compiled({array(2.0f)});
  CHECK_EQ(results[0].item<float>(), 3.0f);

  // Try again
  results = compiled({array(2.0f)});
  CHECK_EQ(results[0].item<float>(), 3.0f);
}

// Multiplies the first input by the integer scalar 2.
std::vector<array> inner_fun(const std::vector<array>& inputs) {
  auto two = array(2);
  return std::vector<array>{two * inputs[0]};
}

// Sums the two inputs, passes the sum through a compiled inner_fun, and
// returns the sum plus that result.
std::vector<array> outer_fun(const std::vector<array>& inputs) {
  auto total = inputs[0] + inputs[1];
  auto compiled_inner = compile(inner_fun);
  auto doubled = compiled_inner({total})[0];
  return std::vector<array>{total + doubled};
}

TEST_CASE("test nested compile") {
  auto compiled_outer = compile(outer_fun);

  // (1 + 2) + 2 * (1 + 2) == 9
  auto result = compiled_outer({array(1), array(2)})[0];
  CHECK_EQ(result.item<int>(), 9);

  // Try again
  result = compiled_outer({array(1), array(2)})[0];
  CHECK_EQ(result.item<int>(), 9);
}

TEST_CASE("test enable and disable compile") {
  // With compile enabled, compiling a null function must throw.
  CHECK_THROWS(compile(nullptr));

  // With compile disabled the same call is expected not to throw. The call is
  // wrapped in an assertion so that an unexpected exception is reported as a
  // failed check and enable_compile() below still runs; otherwise compile
  // would stay disabled for every test case that runs afterwards.
  disable_compile();
  CHECK_NOTHROW(compile(nullptr));
  enable_compile();

  // Re-enabling restores the throwing behavior.
  CHECK_THROWS(compile(nullptr));
}

// Ignores its inputs: builds two separate scalar arrays holding -1.0f and
// returns the absolute value of each.
auto add_scalars(const std::vector<array>&) {
  auto a = array(-1.0f);
  auto b = array(-1.0f);
  return std::vector<array>{abs(a), abs(b)};
}

// Ignores its inputs: builds a = {-1, 2} and returns
// {maximum(a, 0), maximum(-a, 0), their sum}. Each maximum gets its own
// freshly created scalar 0.0f.
auto max_scalars(const std::vector<array>&) {
  auto a = array({-1.0f, 2.0f});
  auto b = maximum(a, array(0.0f));
  auto c = maximum(-a, array(0.0f));
  auto d = b + c;
  return std::vector<array>{b, c, d};
}

TEST_CASE("test simplify scalars") {
  set_compile_mode(CompileMode::no_fuse);

  // The two abs outputs are expected to share the same input array.
  {
    auto compiled = compile(add_scalars);
    auto outs = compiled({});
    auto first = outs[0];
    auto second = outs[1];
    CHECK(first.inputs()[0].id() == second.inputs()[0].id());
  }

  // The two maximum outputs are expected to share their second input.
  {
    auto input = array({-1.0f, 2.0f});
    auto outs = compile(max_scalars)({input});
    auto max_pos = outs[0];
    auto max_neg = outs[1];
    auto total = outs[2];
    CHECK(max_pos.inputs()[1].id() == max_neg.inputs()[1].id());
  }

  set_compile_mode(CompileMode::enabled);
}

// Returns exp(a) + exp(a) for a = inputs[0], deliberately calling exp twice
// on the same input.
auto exp_two(const std::vector<array>& inputs) {
  auto a = inputs[0];
  return std::vector<array>{exp(a) + exp(a)};
}

TEST_CASE("test simplify") {
  set_compile_mode(CompileMode::no_fuse);

  auto input = array({1.0f, 2.0f});
  auto summed = compile(exp_two)({input})[0];

  // Both operands of the addition are expected to be the same array.
  const auto& operands = summed.inputs();
  CHECK(operands[0].id() == operands[1].id());

  set_compile_mode(CompileMode::enabled);
}

TEST_CASE("test simplify noops") {
  set_compile_mode(CompileMode::no_fuse);

  auto input = array({1.0f, 2.0f});
  // exp wrapped in stop_gradient and copy calls.
  auto wrapped_exp =
      [](const std::vector<array>& inputs) -> std::vector<array> {
    return {copy(stop_gradient(exp(stop_gradient(inputs[0]))))};
  };

  // The compiled output is expected to take the original input directly.
  auto result = compile(wrapped_exp)({input})[0];
  CHECK(result.inputs()[0].id() == input.id());

  set_compile_mode(CompileMode::enabled);
}

// Returns cos(a) + sin(a) for a = inputs[0]: two different unary ops on the
// same input.
auto add_diff(const std::vector<array>& inputs) {
  auto a = inputs[0];
  return std::vector<array>{cos(a) + sin(a)};
}

TEST_CASE("test no simplify") {
  set_compile_mode(CompileMode::no_fuse);

  auto input = array({1.0f, 2.0f});
  auto summed = compile(add_diff)({input})[0];

  // cos and sin differ, so the two operands must stay distinct arrays.
  const auto& operands = summed.inputs();
  CHECK(operands[0].id() != operands[1].id());

  set_compile_mode(CompileMode::enabled);
}

// Ignores its inputs: calls divmod twice on the same scalars (1.0, 2.0) and
// returns the element-wise sums of the two result pairs.
auto multi_one(const std::vector<array>&) {
  auto lhs = array(1.0);
  auto rhs = array(2.0);
  auto first = divmod(lhs, rhs);
  auto second = divmod(lhs, rhs);
  auto sum0 = first[0] + second[0];
  auto sum1 = first[1] + second[1];
  return std::vector<array>{sum0, sum1};
}

// Ignores its inputs: returns divmod of two separately created scalars that
// both hold 1.0.
auto multi_two(const std::vector<array>&) {
  auto lhs = array(1.0);
  auto rhs = array(1.0);
  auto results = divmod(lhs, rhs);
  return results;
}

auto multi_three(const std::vector<array>&) {
  auto a = array(1.0);
  auto b = array(2.0);
  auto c = divmod(a, b);
  auto d = divmod(a, b);
  auto e = stack({c[0], c[1], d[0], d[1]});
  return std::vector<array>{e};
}

TEST_CASE("test simplify multi output") {
  set_compile_mode(CompileMode::no_fuse);

  // Each sum is expected to add an array to itself.
  {
    auto outs = compile(multi_one)({});
    auto sum0 = outs[0];
    auto sum1 = outs[1];
    CHECK_EQ(sum0.inputs()[0].id(), sum0.inputs()[1].id());
    CHECK_EQ(sum1.inputs()[0].id(), sum1.inputs()[1].id());
  }

  // Both divmod outputs are expected to read one and the same input array.
  {
    auto outs = compile(multi_two)({});
    CHECK_EQ(outs[0].inputs()[0].id(), outs[0].inputs()[1].id());
    CHECK_EQ(outs[0].inputs()[0].id(), outs[1].inputs()[0].id());
    CHECK_EQ(outs[1].inputs()[0].id(), outs[1].inputs()[1].id());
  }

  // Make sure the output order of multi-output primitives
  // is respected in simplification
  {
    auto stacked = compile(multi_three)({})[0];
    CHECK_EQ(stacked.inputs().size(), 4);
    CHECK_EQ(stacked.inputs().at(0).id(), stacked.inputs().at(2).id());
    CHECK_EQ(stacked.inputs().at(1).id(), stacked.inputs().at(3).id());
    CHECK(array_equal(stacked, array({0.0f, 1.0f, 0.0f, 1.0f})).item<bool>());
  }

  set_compile_mode(CompileMode::enabled);
}

// No fusion: a single exp of the first input.
auto unary_fused_0(const std::vector<array>& inputs) {
  auto result = exp(inputs[0]);
  return std::vector<array>{result};
}

// All compilable: abs(negative(exp(x))) for x = inputs[0].
auto unary_fused_1(const std::vector<array>& inputs) {
  auto exponent = exp(inputs[0]);
  auto negated = negative(exponent);
  return std::vector<array>{abs(negated)};
}

// Same op chain as unary_fused_1 (exp, negative, abs) in a separate
// function.
auto unary_fused_1_copy(const std::vector<array>& inputs) {
  auto exponent = exp(inputs[0]);
  auto negated = negative(exponent);
  return std::vector<array>{abs(negated)};
}

// Same ops as unary_fused_1 but in a different order: negative first, then
// exp, then abs.
auto unary_fused_1_diff(const std::vector<array>& inputs) {
  auto negated = negative(inputs[0]);
  auto exponent = exp(negated);
  return std::vector<array>{abs(exponent)};
}

// Output into un-compilable primitive: the unary chain feeds a sum.
auto unary_fused_2(const std::vector<array>& inputs) {
  auto chain = abs(negative(exp(inputs[0])));
  auto reduced = sum(chain, true);
  return std::vector<array>{reduced};
}

// Input from un-compilable primitive: a sum feeds the unary chain.
auto unary_fused_3(const std::vector<array>& inputs) {
  auto reduced = sum(inputs[0], true);
  auto chain = exp(abs(negative(reduced)));
  return std::vector<array>{chain};
}

TEST_CASE("test compile unary fused") {
  // NB: some of these tests are brittle and may need to be
  // updated if we change compile conditions

  // A single exp stays an Exp primitive reading the input directly.
  {
    auto compiled = compile(unary_fused_0);
    auto input = array(2.0);
    auto result = compiled({input})[0];

    auto& prim = result.primitive();
    CHECK_EQ(typeid(prim), typeid(Exp));
    CHECK_EQ(result.inputs()[0].id(), input.id());
  }

  // A chain of three unary ops becomes one Compiled primitive.
  {
    auto compiled = compile(unary_fused_1);
    auto input = array(2.0);
    auto result = compiled({input})[0];

    auto& prim = result.primitive();
    CHECK_EQ(typeid(prim), typeid(Compiled));
    CHECK_EQ(result.inputs()[0].id(), input.id());

    auto reference = unary_fused_1({array(2.0)})[0];
    CHECK(allclose(result, reference).item<bool>());
  }

  // The chain is fused; the trailing sum stays a Reduce on top of it.
  {
    auto compiled = compile(unary_fused_2);
    auto input = array({1.0, 2.0});
    auto results = compiled({input});
    CHECK_EQ(results.size(), 1);

    auto& reduce_prim = results[0].primitive();
    // NB: this test is brittle, will need to update
    // it if we change compile conditions
    CHECK_EQ(typeid(reduce_prim), typeid(Reduce));
    auto fused = results[0].inputs()[0];
    auto& fused_prim = fused.primitive();
    CHECK_EQ(typeid(fused_prim), typeid(Compiled));
    CHECK_EQ(fused.inputs()[0].id(), input.id());
  }

  // The leading sum stays a Reduce; the chain after it is fused.
  {
    auto compiled = compile(unary_fused_3);
    auto input = array({1.0, 2.0});
    auto results = compiled({input});

    auto& fused_prim = results[0].primitive();
    CHECK_EQ(typeid(fused_prim), typeid(Compiled));
    auto reduced = results[0].inputs()[0];
    CHECK_EQ(results[0].inputs().size(), 1);
    auto& reduce_prim = reduced.primitive();
    CHECK_EQ(typeid(reduce_prim), typeid(Reduce));
    CHECK_EQ(reduced.inputs()[0].id(), input.id());
  }

  // Is equivalent works
  {
    auto same_a = compile(unary_fused_1)({array(1.0)});
    auto same_b = compile(unary_fused_1_copy)({array(1.0)});
    CHECK(same_a[0].primitive().is_equivalent(same_b[0].primitive()));
    auto different = compile(unary_fused_1_diff)({array(1.0)});
    CHECK(!same_a[0].primitive().is_equivalent(different[0].primitive()));
  }
}

// All compilable: a single addition of the first two inputs.
auto binary_fused_0(const std::vector<array>& inputs) {
  auto total = inputs[0] + inputs[1];
  return std::vector<array>{total};
}

// Binary into unary: abs of the sum of the first two inputs.
auto binary_fused_1(const std::vector<array>& inputs) {
  auto total = inputs[0] + inputs[1];
  return std::vector<array>{abs(total)};
}

// Binary into binary: (inputs[0] + inputs[1]) + inputs[0].
auto binary_fused_2(const std::vector<array>& inputs) {
  auto partial = inputs[0] + inputs[1];
  auto total = partial + inputs[0];
  return std::vector<array>{total};
}

// Binary into unary into un-compilable: sum(abs(inputs[0] + inputs[1])).
auto binary_fused_3(const std::vector<array>& inputs) {
  auto magnitude = abs(inputs[0] + inputs[1]);
  auto reduced = sum(magnitude, true);
  return std::vector<array>{reduced};
}

TEST_CASE("test compile binary fused") {
  // A lone addition stays an Add primitive.
  {
    auto compiled = compile(binary_fused_0);
    auto lhs = array(2.0);
    auto rhs = array(2.0);
    auto result = compiled({lhs, rhs})[0];

    auto& prim = result.primitive();
    CHECK_EQ(typeid(prim), typeid(Add));
    CHECK_EQ(result.inputs()[0].id(), lhs.id());
  }

  // Add followed by abs becomes one Compiled primitive over both inputs.
  {
    auto compiled = compile(binary_fused_1);
    auto lhs = array(2.0);
    auto rhs = array(2.0);
    auto result = compiled({lhs, rhs})[0];

    auto& prim = result.primitive();
    CHECK_EQ(typeid(prim), typeid(Compiled));
    CHECK_EQ(result.inputs()[0].id(), lhs.id());
    CHECK_EQ(result.inputs()[1].id(), rhs.id());

    auto reference = binary_fused_1({lhs, rhs})[0];
    CHECK_EQ(result.item<float>(), reference.item<float>());
  }

  // Two chained additions become one Compiled primitive over both inputs.
  {
    auto compiled = compile(binary_fused_2);
    auto lhs = array(2.0);
    auto rhs = array(2.0);
    auto result = compiled({lhs, rhs})[0];

    auto& prim = result.primitive();
    CHECK_EQ(typeid(prim), typeid(Compiled));
    CHECK_EQ(result.inputs()[0].id(), lhs.id());
    CHECK_EQ(result.inputs()[1].id(), rhs.id());
  }

  // The add/abs part is fused; the trailing sum stays a Reduce.
  {
    auto compiled = compile(binary_fused_3);
    auto lhs = array({1.0, 2.0});
    auto rhs = array({1.0, 2.0});
    auto result = compiled({lhs, rhs})[0];

    auto& reduce_prim = result.primitive();
    CHECK_EQ(typeid(reduce_prim), typeid(Reduce));

    auto fused = result.inputs()[0];
    auto& fused_prim = fused.primitive();
    CHECK_EQ(typeid(fused_prim), typeid(Compiled));
    CHECK_EQ(fused.inputs()[0].id(), lhs.id());
    CHECK_EQ(fused.inputs()[1].id(), rhs.id());
  }
}

// Computes x * (1 + erf(x / sqrt(2))) / 2 for x = inputs[0].
auto gelu_1(const std::vector<array>& inputs) {
  auto& x = inputs[0];
  auto erf_term = erf(x / M_SQRT2);
  auto result = x * (1.0f + erf_term) / 2.0f;
  return std::vector<array>{result};
}

TEST_CASE("test compile gelu") {
  // Scalar input: one Compiled primitive with four inputs, none of which
  // has inputs of its own.
  {
    auto compiled = compile(gelu_1);
    auto input = array(1.0);
    auto result = compiled({input})[0];
    auto& prim = result.primitive();
    CHECK_EQ(typeid(prim), typeid(Compiled));
    CHECK_EQ(result.inputs().size(), 4);
    for (auto& operand : result.inputs()) {
      CHECK(operand.inputs().empty());
    }
    auto reference = gelu_1({input})[0];
    CHECK(allclose(result, reference).item<bool>());
  }

  // Same checks with a two-element input.
  {
    auto compiled = compile(gelu_1);
    auto input = array({1.0, 0.5});
    auto result = compiled({input})[0];
    auto& prim = result.primitive();
    CHECK_EQ(typeid(prim), typeid(Compiled));
    CHECK_EQ(result.inputs().size(), 4);
    for (auto& operand : result.inputs()) {
      CHECK(operand.inputs().empty());
    }

    auto reference = gelu_1({input})[0];
    CHECK(allclose(result, reference).item<bool>());
  }
}

// Uncompilable input outside fused tape: exp(inputs[0]) is consumed both by
// another exp and by a sum.
auto unary_with_two_outputs(const std::vector<array>& inputs) {
  auto shared = exp(inputs[0]);
  auto exp_again = exp(shared);
  auto reduced = sum(shared, true);
  return std::vector<array>{exp_again, reduced};
}

// Returns {x * abs(exp(y)), sum(x)} for x = inputs[0], y = inputs[1].
auto uncompilable_inputs(const std::vector<array>& inputs) {
  auto& x = inputs[0];
  auto& y = inputs[1];
  auto scaled = x * abs(exp(y));
  auto reduced = sum(x, true);
  return std::vector<array>{scaled, reduced};
}

// Returns {x / abs(exp(y)), sum(x)} for x = inputs[0], y = inputs[1].
// Division is not commutative, so the operand order is observable.
auto uncompilable_inputs_order_matters(const std::vector<array>& inputs) {
  auto& x = inputs[0];
  auto& y = inputs[1];
  auto quotient = x / abs(exp(y));
  auto reduced = sum(x, true);
  return std::vector<array>{quotient, reduced};
}

TEST_CASE("test compile tape with outside parents") {
  // Outputs are expected to be plain Exp and Reduce primitives.
  {
    auto compiled = compile(unary_with_two_outputs);
    auto input = array({2.0, 2.0});
    auto results = compiled({input});

    auto& first_prim = results[0].primitive();
    CHECK_EQ(typeid(first_prim), typeid(Exp));
    auto& second_prim = results[1].primitive();
    CHECK_EQ(typeid(second_prim), typeid(Reduce));
  }

  // First output is a two-input Compiled primitive, second a Reduce; values
  // match the uncompiled function.
  {
    auto compiled = compile(uncompilable_inputs);
    auto lhs = array({2.0, 2.0});
    auto rhs = array({1.6, 0.6});
    auto results = compiled({lhs, rhs});

    auto& first_prim = results[0].primitive();
    CHECK_EQ(typeid(first_prim), typeid(Compiled));
    auto& second_prim = results[1].primitive();
    CHECK_EQ(typeid(second_prim), typeid(Reduce));
    CHECK_EQ(results[0].inputs().size(), 2);

    auto reference = uncompilable_inputs({lhs, rhs});
    CHECK(allclose(results[0], reference[0]).item<bool>());
    CHECK(allclose(results[1], reference[1]).item<bool>());
  }

  // Same structure with division in place of multiplication.
  {
    auto compiled = compile(uncompilable_inputs_order_matters);
    auto lhs = array({2.0, 2.0});
    auto rhs = array({1.6, 0.6});
    auto results = compiled({lhs, rhs});

    auto& first_prim = results[0].primitive();
    CHECK_EQ(typeid(first_prim), typeid(Compiled));
    auto& second_prim = results[1].primitive();
    CHECK_EQ(typeid(second_prim), typeid(Reduce));
    CHECK_EQ(results[0].inputs().size(), 2);

    auto reference = uncompilable_inputs_order_matters({lhs, rhs});
    CHECK(allclose(results[0], reference[0]).item<bool>());
    CHECK(allclose(results[1], reference[1]).item<bool>());
  }
}

auto compile_across_streams(const std::vector<array>& inputs) {
  auto s2 = new_stream(default_device());
  auto x = exp(abs(inputs[0]));
  auto y = exp(abs(x, s2), s2);
  return std::vector<array>{y};
}

TEST_CASE("test compile across streams") {
  auto compiled = compile(compile_across_streams);
  auto input = array({2.0f});
  auto result = compiled({input})[0];

  // Outer Compiled primitive with exactly one input.
  auto& outer_prim = result.primitive();
  CHECK_EQ(typeid(outer_prim), typeid(Compiled));
  CHECK_EQ(result.inputs().size(), 1);

  // That input is itself a Compiled primitive reading the original input.
  auto inner = result.inputs()[0];
  auto& inner_prim = inner.primitive();
  CHECK_EQ(typeid(inner_prim), typeid(Compiled));
  CHECK_EQ(inner.inputs()[0].id(), input.id());
}

auto unary_compile_outputs(const std::vector<array>& inputs) {
  auto x = abs(inputs[0]);
  auto y = square(x);
  return std::vector<array>{x, y};
}

// With x = inputs[0] and y = inputs[1], returns {x + y, (x + y) + y}.
auto binary_compile_outputs(const std::vector<array>& inputs) {
  auto first = inputs[0];
  auto second = inputs[1];
  first = first + second;
  second = first + second;
  return std::vector<array>{first, second};
}

TEST_CASE("test compile internal output") {
  // Unary case: both outputs are Compiled and are siblings of each other.
  {
    auto compiled = compile(unary_compile_outputs);
    auto input = array({3, -2});
    auto results = compiled({input});
    auto& first_prim = results[0].primitive();
    CHECK_EQ(typeid(first_prim), typeid(Compiled));
    auto& second_prim = results[1].primitive();
    CHECK_EQ(typeid(second_prim), typeid(Compiled));
    CHECK_EQ(results[0].siblings()[0].id(), results[1].id());
    auto reference = unary_compile_outputs({input});
    CHECK(array_equal(results[0], reference[0]).item<bool>());
    CHECK(array_equal(results[1], reference[1]).item<bool>());
  }

  // Binary case: both outputs are Compiled and match the uncompiled values.
  {
    auto compiled = compile(binary_compile_outputs);
    auto lhs = array({3, -2});
    auto rhs = array({1, -1});
    auto results = compiled({lhs, rhs});
    auto& first_prim = results[0].primitive();
    CHECK_EQ(typeid(first_prim), typeid(Compiled));
    auto& second_prim = results[1].primitive();
    CHECK_EQ(typeid(second_prim), typeid(Compiled));
    auto reference = binary_compile_outputs({lhs, rhs});
    CHECK(array_equal(results[0], reference[0]).item<bool>());
    CHECK(array_equal(results[1], reference[1]).item<bool>());
  }
}

// Applies cos(sin(.)) to the first input ten times in a row.
auto deep_unary_compile(const std::vector<array>& inputs) {
  auto acc = inputs[0];
  int remaining = 10;
  while (remaining-- > 0) {
    acc = cos(sin(acc));
  }
  return std::vector<array>{acc};
}

TEST_CASE("test compile deep graph") {
  auto input = array({3.0f, -2.0f});

  // Compiled and uncompiled results must agree.
  auto compiled = compile(deep_unary_compile);
  auto result = compiled({input})[0];
  auto reference = deep_unary_compile({input})[0];
  CHECK(allclose(result, reference).item<bool>());
}

// x = abs(exp(input)); y = abs(exp(sum(x))); returns x + y, so x is used
// both directly and through the sum.
auto repeat_input_to_compiled(const std::vector<array>& inputs) {
  auto first = abs(exp(inputs[0]));
  auto reduced = sum(first);
  auto second = abs(exp(reduced));
  return std::vector<array>{first + second};
}

TEST_CASE("test compile repeat input") {
  auto input = array({3.0f, -2.0f});

  // Compiled and uncompiled results must agree.
  auto compiled = compile(repeat_input_to_compiled);
  auto result = compiled({input})[0];
  auto reference = repeat_input_to_compiled({input})[0];
  CHECK(allclose(result, reference).item<bool>());
}

// Returns exp(exp(x)) for x = inputs[0].
auto compile_unary_inner(const std::vector<array>& inputs) {
  auto value = inputs[0];
  auto once = exp(value);
  return std::vector<array>{exp(once)};
}

// Applies the compiled compile_unary_inner twice in succession.
auto compile_unary_outer(const std::vector<array>& inputs) {
  auto compiled_inner = compile(compile_unary_inner);
  auto first_pass = compiled_inner(inputs);
  return compiled_inner(first_pass);
}

TEST_CASE("test compile compiled function") {
  auto compiled = compile(compile_unary_outer);
  auto input = array({1.0f});
  auto result = compiled({input})[0];

  // The output is a Compiled primitive that reads the original input
  // directly.
  auto& prim = result.primitive();
  CHECK_EQ(typeid(prim), typeid(Compiled));
  CHECK_EQ(result.inputs()[0].id(), input.id());
}

auto grad_unary_compiled(const std::vector<array>& inputs) {
  auto gradfn = value_and_grad(compile(compile_unary_inner));
  auto [out, grad] = gradfn(inputs);
  return std::vector{out[0], grad[0]};
}

TEST_CASE("test transform compiled function") {
  auto compiled = compile(grad_unary_compiled);
  auto input = array(1.0f);
  auto results = compiled({input});

  // Value and gradient come out of one Compiled primitive as siblings.
  auto& prim = results[0].primitive();
  CHECK_EQ(typeid(prim), typeid(Compiled));
  CHECK_EQ(results[0].siblings()[0].id(), results[1].id());

  // Neither of the first two inputs is produced by a primitive.
  const auto& operands = results[0].inputs();
  CHECK(!operands[0].has_primitive());
  CHECK(!operands[1].has_primitive());
}

TEST_CASE("test fusion kernel reuse") {
  auto cfun = compile(gelu_1);

  // First call: a two-element float input.
  auto x = array({2.0f, -2.0f});
  auto y = cfun({x})[0];
  auto p = std::dynamic_pointer_cast<Compiled>(y.primitive_ptr());
  // Stop here instead of dereferencing a null pointer if the output is not
  // backed by a Compiled primitive.
  REQUIRE(p != nullptr);
  eval(y);

  std::string lib_name = p->lib_name();
  CHECK(!lib_name.empty());

  // Second call: same function, input of a different shape ({2, 5}).
  x = astype(reshape(arange(10), {2, 5}), float32);
  auto z = cfun({x})[0];
  auto pz = std::dynamic_pointer_cast<Compiled>(z.primitive_ptr());
  REQUIRE(pz != nullptr);
  eval(z);

  std::string lib_name_z = pz->lib_name();
  CHECK(!lib_name_z.empty());

  // Both calls are expected to report the same library name.
  CHECK_EQ(lib_name, lib_name_z);
}

// Returns xs[0] + xs[0] + xs[0].
auto add3(const std::vector<array>& xs) {
  const auto& value = xs[0];
  return std::vector<array>{value + value + value};
}

TEST_CASE("test fusion types") {
  auto cfun = compile(add3);

  // First call: float input.
  auto x = array({2.0f, -2.0f});
  auto y = cfun({x})[0];
  auto p = std::dynamic_pointer_cast<Compiled>(y.primitive_ptr());
  // Stop here instead of dereferencing a null pointer if the output is not
  // backed by a Compiled primitive.
  REQUIRE(p != nullptr);
  eval(y);

  std::string lib_name = p->lib_name();
  CHECK(!lib_name.empty());

  // Second call: same function with an int32 input.
  x = array({2, -2}, int32);
  auto z = cfun({x})[0];
  auto pz = std::dynamic_pointer_cast<Compiled>(z.primitive_ptr());
  REQUIRE(pz != nullptr);
  eval(z);

  std::string lib_name_z = pz->lib_name();
  CHECK(!lib_name_z.empty());
}

auto compile_shapeless_not_ok(const std::vector<array>& inputs) {
  auto x = reshape(inputs[0], {2, 2});
  return std::vector<array>{x};
}

// Adds a constant one-element array holding 2 to the first input.
auto compile_shapeless_ok(const std::vector<array>& inputs) {
  auto shifted = inputs[0] + array({2});
  return std::vector<array>{shifted};
}

TEST_CASE("test shapeless compile") {
  // A function that reshapes to a fixed {2, 2}: the four-element call is
  // made first, then a five-element input must throw.
  {
    auto cfun = compile(compile_shapeless_not_ok, /* shapeless */ true);
    cfun({array({1, 2, 3, 4})});
    CHECK_THROWS(cfun({array({1, 2, 3, 4, 5})}));
  }

  // A function that only adds a constant: calls with different input sizes
  // are compared through the id of the constant operand (inputs()[1]).
  {
    auto cfun = compile(compile_shapeless_ok, /* shapeless */ true);
    auto out = cfun({array({1, 2})})[0];
    auto out2 = cfun({array({1, 2, 3, 4})})[0];

    // Not making a new constant array since no recompile,
    // hence the ids should be the same
    CHECK_EQ(out.inputs()[1].id(), out2.inputs()[1].id());
    CHECK(array_equal(out2, array({3, 4, 5, 6})).item<bool>());

    // Recompile since type changes
    out2 = cfun({array({1.0, 2.0})})[0];
    CHECK_NE(out.inputs()[1].id(), out2.inputs()[1].id());

    // Recompile since ndim changes
    out2 = cfun({array({1.0, 2.0}, {1, 2})})[0];
    CHECK_NE(out.inputs()[1].id(), out2.inputs()[1].id());
  }
}

// Adds an {8, 8} array of zeros to the first input.
auto compile_broadcast_add(const std::vector<array>& inputs) {
  auto zero_block = zeros({8, 8});
  auto total = inputs[0] + zero_block;
  return std::vector<array>{total};
}

TEST_CASE("test compile strides") {
  {
    auto compiled = compile(compile_broadcast_add);
    auto input = zeros({1, 8, 8});
    auto result = compiled({input})[0];
    eval(result);
    // The evaluated {1, 8, 8} + {8, 8} result must carry three strides.
    CHECK_EQ(result.strides().size(), 3);
  }
}

TEST_CASE("test compile change streams") {
  auto compiled = compile(simple_fun);

  // Without a stream context the output primitive is on the default stream
  // of the default device.
  auto result = compiled({array(1.0f), array(2.0f)})[0];
  CHECK_EQ(result.primitive().stream(), default_stream(default_device()));

  // Inside a StreamContext for a new stream, the same compiled function must
  // produce an output on that stream.
  auto other_stream = new_stream(default_device());
  StreamContext stream_scope(other_stream);
  result = compiled({array(1.0f), array(2.0f)})[0];
  CHECK_EQ(result.primitive().stream(), other_stream);
}

TEST_CASE("test compile lambda") {
  // A capture-less lambda can be compiled and called directly.
  auto fun = [](const std::vector<array>& inputs) {
    return std::vector<array>{abs(inputs[0])};
  };

  auto out = compile(fun)({array(-1)});
  CHECK_EQ(out[0].item<int>(), 1);

  // Holder for compiled callables created in the nested scopes below.
  decltype(compile(nullptr)) c_local_fun;
  {
    auto local_fun = [](const std::vector<array>& inputs) {
      return std::vector<array>{abs(inputs[0])};
    };
    c_local_fun = compile(local_fun);
  }

  // This is ok even though local_fun is out of scope
  out = c_local_fun({array(-1)});
  CHECK_EQ(out[0].item<int>(), 1);

  {
    // Here the lambda captures x by value.
    int x = 2;
    auto local_fun = [x](const std::vector<array>& inputs) {
      return std::vector<array>{inputs[0] + x};
    };
    c_local_fun = compile(local_fun);
  }
  // Also ok even though local_fun is out of scope.
  out = c_local_fun({array(0)});
  CHECK_EQ(out[0].item<int>(), 2);

  // Here the lambda captures x by reference; x stays alive for the rest of
  // the test.
  int x = 2;
  auto fun_with_capture = [&x](const std::vector<array>& inputs) {
    return std::vector<array>{inputs[0] + x};
  };
  auto cfun = compile(fun_with_capture);
  out = cfun({array(0)});
  CHECK_EQ(out[0].item<int>(), 2);

  // Doesn't recompile
  // (the result still reflects x == 2 even though x is now 3)
  x = 3;
  out = cfun({array(0)});
  CHECK_EQ(out[0].item<int>(), 2);

  // Recompiles
  // (a fresh compile() call picks up the current value of x)
  auto cfun2 = compile(fun_with_capture);
  out = cfun2({array(0)});
  CHECK_EQ(out[0].item<int>(), 3);
}

TEST_CASE("test compile with no-ops") {
  // abs -> stop_gradient -> abs
  auto abs_twice = [](const std::vector<array>& inputs) {
    return std::vector<array>{abs(stop_gradient(abs(inputs[0])))};
  };

  auto input = array(1.0);
  auto result = compile(abs_twice)({input})[0];

  // The compiled output must read the original input directly.
  CHECK_EQ(result.inputs()[0].id(), input.id());
}

TEST_CASE("test compile random bits") {
  // Two random::bits calls on the same key that differ only in their second
  // argument (4 vs. 2); the results are summed.
  auto sum_of_bits = [](const std::vector<array>& inputs) {
    auto key = inputs[0];
    auto bits_4 = random::bits({32, 32}, 4, key);
    auto bits_2 = random::bits({32, 32}, 2, key);
    return std::vector<array>{bits_4 + bits_2};
  };

  auto key = random::key(0);
  auto reference = sum_of_bits({key})[0];
  auto result = compile(sum_of_bits)({key})[0];
  CHECK(array_equal(result, reference).item<bool>());
}


================================================
FILE: tests/creations_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test arange") {
  // Check type is inferred correctly
  {
    // Inferred from the argument types.
    auto seq = arange(10);
    CHECK_EQ(seq.dtype(), int32);

    seq = arange(10.0);
    CHECK_EQ(seq.dtype(), float32);

    // Explicit dtype with a single stop argument.
    seq = arange(10, float32);
    CHECK_EQ(seq.dtype(), float32);

    seq = arange(10, float16);
    CHECK_EQ(seq.dtype(), float16);

    seq = arange(10, bfloat16);
    CHECK_EQ(seq.dtype(), bfloat16);

    seq = arange(10.0, int32);
    CHECK_EQ(seq.dtype(), int32);

    // Start/stop forms.
    seq = arange(0, 10);
    CHECK_EQ(seq.dtype(), int32);

    seq = arange(0.0, 10.0, int32);
    CHECK_EQ(seq.dtype(), int32);

    seq = arange(0.0, 10.0);
    CHECK_EQ(seq.dtype(), float32);

    seq = arange(0, 10, float32);
    CHECK_EQ(seq.dtype(), float32);

    // Start/stop/step forms.
    seq = arange(0, 10, 0.1, float32);
    CHECK_EQ(seq.dtype(), float32);

    seq = arange(0.0, 10.0, 0.5, int32);
    CHECK_EQ(seq.dtype(), int32);

    // Unsigned dtype with each argument count.
    seq = arange(10.0, uint32);
    CHECK_EQ(seq.dtype(), uint32);
    seq = arange(0.0, 10.0, uint32);
    CHECK_EQ(seq.dtype(), uint32);
    seq = arange(0.0, 10.0, 0.5, uint32);
    CHECK_EQ(seq.dtype(), uint32);

    // arange unsupported for bool_
    CHECK_THROWS_AS(arange(10, bool_), std::invalid_argument);
  }

  // Check correct sizes
  {
    auto seq = arange(10);
    CHECK_EQ(seq.size(), 10);

    seq = arange(0.0, 10.0, 0.5);
    CHECK_EQ(seq.size(), 20);

    seq = arange(0.0, 10.0, 0.45);
    CHECK_EQ(seq.size(), 23);

    // Steps equal to, just under, and far over the range length.
    seq = arange(0, 10, 10);
    CHECK_EQ(seq.size(), 1);

    seq = arange(0, 10, 9);
    CHECK_EQ(seq.size(), 2);

    seq = arange(0, 10, 100);
    CHECK_EQ(seq.size(), 1);

    // Negative stop with positive and negative steps.
    seq = arange(0, -10, 1);
    CHECK_EQ(seq.size(), 0);

    seq = arange(0, -10, -1);
    CHECK_EQ(seq.size(), 10);

    seq = arange(0, -10, -10);
    CHECK_EQ(seq.size(), 1);
  }

  // Check values
  {
    auto seq = arange(0, 3);
    CHECK(array_equal(seq, array({0, 1, 2})).item<bool>());

    seq = arange(0, 3, 2);
    CHECK(array_equal(seq, array({0, 2})).item<bool>());

    seq = arange(0, 3, 3);
    CHECK(array_equal(seq, array({0})).item<bool>());

    // Step pointing away from stop yields an empty array.
    seq = arange(0, -3, 1);
    CHECK(array_equal(seq, array({})).item<bool>());

    seq = arange(0, 3, -1);
    CHECK(array_equal(seq, array({})).item<bool>());

    seq = arange(0, -3, -1);
    CHECK(array_equal(seq, array({0, -1, -2})).item<bool>());

    // Floating-point arguments with an int32 dtype.
    seq = arange(0.0, 5.0, 0.5, int32);
    CHECK(array_equal(seq, zeros({10})).item<bool>());

    seq = arange(0.0, 5.0, 1.5, int32);
    CHECK(array_equal(seq, array({0, 1, 2, 3})).item<bool>());

    // Half-precision dtypes.
    seq = arange(0.0, 5.0, 1.0, float16);
    CHECK(array_equal(seq, array({0, 1, 2, 3, 4}, float16)).item<bool>());

    seq = arange(0.0, 5.0, 1.0, bfloat16);
    CHECK(array_equal(seq, array({0, 1, 2, 3, 4}, bfloat16)).item<bool>());

    seq = arange(0.0, 5.0, 1.5, bfloat16);
    CHECK(array_equal(seq, array({0., 1.5, 3., 4.5}, bfloat16)).item<bool>());
  }
}

TEST_CASE("test astype") {
  // Check type conversions
  {
    // Integer scalar to float32.
    auto source = array(1);
    auto converted = astype(source, float32);
    CHECK_EQ(converted.dtype(), float32);
    CHECK_EQ(converted.item<float>(), 1.0f);

    // Integer scalar to int32.
    converted = astype(source, int32);
    CHECK_EQ(converted.dtype(), int32);
    CHECK_EQ(converted.item<int>(), 1);

    // Negative float scalar to int32.
    source = array(-3.0f);
    converted = astype(source, int32);
    CHECK_EQ(converted.dtype(), int32);
    CHECK_EQ(converted.item<int>(), -3);
  }
}

TEST_CASE("test full") {
  // Check throws on bad shape
  {
    CHECK_THROWS(full({-5, 0}, 0));
    CHECK_THROWS(full({0, -5}, 0));
  }

  // Check full works for different types
  {
    // Scalar (empty shape) results.
    auto filled = full({}, 0);
    CHECK_EQ(filled.dtype(), int32);
    CHECK_EQ(filled.item<int>(), 0);

    filled = full({}, 0.0);
    CHECK_EQ(filled.dtype(), float32);
    CHECK_EQ(filled.item<float>(), 0);

    filled = full({}, false);
    CHECK_EQ(filled.item<bool>(), false);

    filled = full({}, 0, int32);
    CHECK_EQ(filled.item<int>(), 0);

    filled = full({}, 0, float32);
    CHECK_EQ(filled.item<float>(), 0);

    // Two-dimensional results.
    filled = full({1, 2}, 2, float32);
    CHECK(array_equal(filled, array({2.0, 2.0}, {1, 2})).item<bool>());

    filled = full({2, 1}, 2, float32);
    CHECK(array_equal(filled, array({2.0, 2.0}, {2, 1})).item<bool>());

    // Boolean and unsigned dtypes.
    filled = full({2}, false);
    CHECK_EQ(filled.dtype(), bool_);
    CHECK(array_equal(filled, array({false, false})).item<bool>());

    filled = full({2}, 1.0, bool_);
    CHECK_EQ(filled.dtype(), bool_);
    CHECK(array_equal(filled, array({true, true})).item<bool>());

    filled = full({2}, 1.0, uint32);
    CHECK_EQ(filled.dtype(), uint32);
    CHECK(array_equal(filled, array({1, 1})).item<bool>());

    // An empty value array cannot fill a non-empty shape.
    CHECK_THROWS_AS(full({2}, array({})), std::invalid_argument);
  }

  // Check broadcasting works
  {
    auto filled = full({2, 2}, array({3, 4}, {2, 1}));
    CHECK(array_equal(filled, array({3, 3, 4, 4}, {2, 2})).item<bool>());
    filled = full({2, 2}, array({3, 4}, {1, 2}));
    CHECK(array_equal(filled, array({3, 4, 3, 4}, {2, 2})).item<bool>());
  }

  // Check zeros and ones
  {
    auto filled = zeros({2, 2}, float32);
    CHECK_EQ(filled.shape(), Shape{2, 2});
    CHECK_EQ(filled.ndim(), 2);
    CHECK_EQ(filled.dtype(), float32);
    auto reference = array({0.0, 0.0, 0.0, 0.0}, {2, 2});
    CHECK(array_equal(filled, reference).item<bool>());

    filled = ones({2, 2}, float32);
    CHECK_EQ(filled.shape(), Shape{2, 2});
    CHECK_EQ(filled.ndim(), 2);
    CHECK_EQ(filled.dtype(), float32);
    reference = array({1.0, 1.0, 1.0, 1.0}, {2, 2});
    CHECK(array_equal(filled, reference).item<bool>());

    // The *_like variants keep the dtype and values.
    filled = zeros({2, 2}, int32);
    reference = zeros_like(filled);
    CHECK_EQ(reference.dtype(), int32);
    CHECK(array_equal(filled, reference).item<bool>());

    filled = ones({2, 2}, int32);
    reference = ones_like(filled);
    CHECK_EQ(reference.dtype(), int32);
    CHECK(array_equal(filled, reference).item<bool>());
  }

  // Works for empty shape and empty array
  {
    array filled = ones({}, int32);
    CHECK_EQ(filled.shape(), Shape{});
    CHECK_EQ(filled.item<int>(), 1);

    filled = full({0}, array({}));
    CHECK_EQ(filled.shape(), Shape{0});
    CHECK_EQ(filled.size(), 0);

    CHECK_THROWS_AS(full({}, array({})), std::invalid_argument);
  }
}


================================================
FILE: tests/custom_vjp_tests.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test simple custom vjp") {
  auto one = array(1.0);
  auto lhs = array(2.0);
  auto rhs = array(3.0);

  // Returns {a * b, a + b}.
  auto product_and_sum = [](const std::vector<array>& inputs) {
    return std::vector<array>{inputs[0] * inputs[1], inputs[0] + inputs[1]};
  };
  // Same function, but with a custom vjp that always returns {one, one}.
  auto with_custom_vjp = custom_vjp(
      product_and_sum,
      [&](const std::vector<array>&,
          const std::vector<array>&,
          const std::vector<array>&) { return std::vector<array>{one, one}; });

  // Regular vjp of the plain function.
  auto [values, grads] = vjp(product_and_sum, {lhs, rhs}, {one, one});
  CHECK_EQ(values[0].item<float>(), 6.0f);
  CHECK_EQ(values[1].item<float>(), 5.0f);
  CHECK_EQ(grads[0].item<float>(), 4.0f);
  CHECK_EQ(grads[1].item<float>(), 3.0f);

  // Same outputs, but the gradients come from the custom vjp.
  std::tie(values, grads) = vjp(with_custom_vjp, {lhs, rhs}, {one, one});
  CHECK_EQ(values[0].item<float>(), 6.0f);
  CHECK_EQ(values[1].item<float>(), 5.0f);
  CHECK_EQ(grads[0].item<float>(), 1.0f);
  CHECK_EQ(grads[1].item<float>(), 1.0f);
}

TEST_CASE("test checkpointing") {
  auto one = array(1.0);
  auto lhs = array(2.0);
  auto rhs = array(3.0);

  // Counts how many times the wrapped function body runs.
  int call_count = 0;
  auto squared_mix = [&call_count](const std::vector<array>& inputs) {
    ++call_count;
    auto product = inputs[0] * inputs[1];
    auto total = inputs[0] + inputs[1];
    return std::vector<array>{square(product + total)};
  };
  auto checkpointed = checkpoint(squared_mix);

  // (2 * 3 + 2 + 3)^2 == 121
  auto [values, grads] = vjp(checkpointed, {lhs, rhs}, {one});
  CHECK_EQ(values[0].item<float>(), 121.0f);
  CHECK_EQ(grads[0].item<float>(), 88.0f);
  CHECK_EQ(grads[1].item<float>(), 66.0f);
  // The function body is expected to have run exactly twice.
  CHECK_EQ(call_count, 2);
}


================================================
FILE: tests/device_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include <cstdlib>

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test device placement") {
  auto original_device = default_device();
  Device expected_default = gpu::is_available() ? Device::gpu : Device::cpu;

  // Only check the default when the DEVICE environment variable is unset.
  const bool device_env_set = std::getenv("DEVICE") != nullptr;
  if (!device_env_set) {
    CHECK_EQ(original_device, expected_default);
  }

  array lhs(1.0f);
  array rhs(1.0f);
  auto total = add(lhs, rhs, default_device());
  if (!gpu::is_available()) {
    // Without a GPU, selecting it must be rejected.
    CHECK_THROWS_AS(set_default_device(Device::gpu), std::invalid_argument);
    CHECK_THROWS_AS(add(lhs, rhs, Device::gpu), std::invalid_argument);
  } else {
    total = add(lhs, rhs, Device::gpu);
    total = add(lhs, rhs, Device(Device::gpu, 0));
  }

  // Set the default device to the CPU
  set_default_device(Device::cpu);
  CHECK_EQ(default_device(), Device::cpu);

  // Revert
  set_default_device(original_device);
}


================================================
FILE: tests/einsum_tests.cpp
================================================
// Copyright © 2024 Apple Inc.

#include "doctest/doctest.h"
#include "mlx/mlx.h"

using namespace mlx::core;

namespace std {

// Required to make doctest compile.
// Writes the nested vector as e.g. [[1, 2], [0, 1]] so that a comparison
// printed through this operator shows the actual values rather than nothing.
ostream& operator<<(ostream& os, const vector<vector<int>>& rows) {
  os << "[";
  for (size_t i = 0; i < rows.size(); ++i) {
    if (i > 0) {
      os << ", ";
    }
    os << "[";
    for (size_t j = 0; j < rows[i].size(); ++j) {
      if (j > 0) {
        os << ", ";
      }
      os << rows[i][j];
    }
    os << "]";
  }
  os << "]";
  return os;
}

} // namespace std

TEST_CASE("test einsum path") {
  // Three matrices chained together.
  std::vector<std::vector<int>> want = {{1, 2}, {0, 1}};
  auto got =
      einsum_path("ij,jk,kl", {ones({2, 2}), ones({2, 4}), ones({4, 2})}).first;
  CHECK_EQ(got, want);

  // A single operand that is only permuted.
  want = {{0}};
  got = einsum_path("jki", {ones({2, 3, 4})}).first;
  CHECK_EQ(got, want);

  // Two vectors sharing one index, with sizes 2 and 1.
  want = {{0, 1}};
  got = einsum_path("i,i", {ones({2}), ones({1})}).first;
  CHECK_EQ(got, want);

  // Two operands with an implicit output.
  want = {{0, 1}};
  got = einsum_path("ij,jk", {ones({2, 2}), ones({2, 2})}).first;
  CHECK_EQ(got, want);

  // Two operands with an explicit output.
  want = {{0, 1}};
  got = einsum_path("ijk,jil->kl", {ones({3, 4, 5}), ones({4, 3, 2})}).first;
  CHECK_EQ(got, want);

  // Five operands reduced to a scalar.
  want = {{0, 3}, {1, 3}, {0, 2}, {0, 1}};
  got = einsum_path(
            "ijk,ilm,njm,nlk,abc->",
            {ones({2, 6, 8}),
             ones({2, 4, 5}),
             ones({3, 6, 5}),
             ones({3, 4, 8}),
             ones({9, 4, 7})})
            .first;
  CHECK_EQ(got, want);

  // Five operands with a four-index output.
  want = {{0, 2}, {0, 3}, {0, 2}, {0, 1}};
  got = einsum_path(
            "ea,fb,abcd,gc,hd->efgh",
            {ones({10, 10}),
             ones({10, 10}),
             ones({10, 10, 10, 10}),
             ones({10, 10}),
             ones({10, 10})})
            .first;
  CHECK_EQ(got, want);
}

TEST_CASE("test einsum") {
  // Every one of these subscript/operand combinations must be rejected.
  CHECK_THROWS(einsum("i,j", {array({1.0})}));
  CHECK_THROWS(einsum("ijk", {full({2, 2}, 2.0f)}));
  CHECK_THROWS(einsum("", {}));
  CHECK_THROWS(einsum("ij", {array({1, 2})}));
  CHECK_THROWS(einsum("", {array({1, 2})}));
  CHECK_THROWS(einsum("i,ij", {array({1, 2}), array({2, 3})}));
  CHECK_THROWS(einsum("i,i", {array({1, 2}), array({2, 3, 4})}));
  CHECK_THROWS(einsum("i->ii", {array({1, 2})}));
  CHECK_THROWS(einsum("12", {zeros({4, 4})}));
  CHECK_THROWS(einsum("ii->i", {zeros({3, 2})}));

  // Single operand: the result takes the shape {4, 2, 3}.
  auto result = einsum("jki", {full({2, 3, 4}, 3.0f)});
  auto want = full({4, 2, 3}, 3.0f);
  CHECK_EQ(allclose(result, want).item<bool>(), true);

  // Matrix product of constant 2x2 matrices.
  result = einsum("ij,jk->ik", {full({2, 2}, 2.0f), full({2, 2}, 3.0f)});
  want = array({12.0f, 12.0f, 12.0f, 12.0f}, {2, 2});
  CHECK_EQ(allclose(result, want).item<bool>(), true);

  // Outer product of two constant vectors.
  result = einsum("i,j->ij", {full({2}, 15.0f), full({4}, 20.0f)});
  want = full({2, 4}, 300.0f);
  CHECK_EQ(allclose(result, want).item<bool>(), true);
}


================================================
FILE: tests/eval_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test eval") {
  // Three scalars built from a double, an int and a bool, evaluated together.
  {
    array real_value(1.0);
    array int_value(1);
    array flag(true);
    eval({real_value, int_value, flag});
    CHECK_EQ(real_value.item<float>(), 1.0);
  }

  // A mix of scalars and a 2x2 array of ones, evaluated together.
  {
    array real_value(1.0);
    array grid = ones({2, 2});
    array flag(true);
    eval({real_value, grid, flag});
    CHECK(array_equal(grid, array({1.0, 1.0, 1.0, 1.0}, {2, 2})).item<bool>());
  }
}

TEST_CASE("test eval multiple") {
  // First pass: eval called with a braced list of arrays.
  auto lhs = ones({10, 10});
  auto rhs = ones({10, 10});
  eval({lhs, rhs});
  CHECK(array_equal(lhs, rhs).item<bool>());

  auto total = lhs + rhs;
  auto diff = lhs - rhs;
  eval({total, diff});
  CHECK(array_equal(total, full({10, 10}, 2.0f)).item<bool>());
  CHECK(array_equal(diff, full({10, 10}, 0.0f)).item<bool>());

  // Second pass: same checks, with the arrays passed as separate arguments.
  lhs = ones({10, 10});
  rhs = ones({10, 10});
  eval(lhs, rhs);
  CHECK(array_equal(lhs, rhs).item<bool>());

  total = lhs + rhs;
  diff = lhs - rhs;
  eval(total, diff);
  CHECK(array_equal(total, full({10, 10}, 2.0f)).item<bool>());
  CHECK(array_equal(diff, full({10, 10}, 0.0f)).item<bool>());
}

TEST_CASE("test eval with tracer when not tracing") {
  // Since we are not tracing it doesn't matter that the array flags are
  // tracers they will always be detached.
  auto x = array(1);
  x.set_tracer(true);
  CHECK(!x.is_tracer());
  eval(x);
  CHECK(!x.has_primitive());
  CHECK(x.is_available());

  x = ones({2, 3});
  x.set_tracer(true);
  eval(x);
  CHECK(!x.has_primitive());
  CHECK(x.is_available());
}

TEST_CASE("test eval graph retention when not tracing") {
  // No tracing is active, so the tracer flag on an input does not keep the
  // graph alive: evaluated arrays end up without a primitive.
  auto lhs = array(1);
  lhs.set_tracer(true);
  auto rhs = array(2);
  auto total = lhs + rhs;
  eval(total);
  CHECK(!total.has_primitive());
  CHECK(total.is_available());
  CHECK_EQ(total.item<int>(), 3);

  // Clearing the flag on the result changes nothing.
  total.set_tracer(false);
  CHECK_EQ(total.item<int>(), 3);
  CHECK(!total.has_primitive());
  CHECK(total.is_available());

  // Intermediate nodes of a longer chain are checked after evaluating only
  // the final array.
  total = lhs + rhs;
  auto second = total + lhs;
  auto third = second + rhs;
  eval(third);
  CHECK(!total.has_primitive());
  CHECK(total.is_available());
  CHECK(!second.has_primitive());
  CHECK(second.is_available());
}


================================================
FILE: tests/export_import_tests.cpp
================================================
// Copyright © 2024 Apple Inc.

#include <filesystem>
#include <stdexcept>
#include <vector>

#include "doctest/doctest.h"

#include "mlx/export.h"
#include "mlx/mlx.h"

using namespace mlx::core;

namespace {
// Returns `name` joined onto the system temporary directory, as a string.
std::string get_temp_file(const std::string& name) {
  const std::filesystem::path full_path =
      std::filesystem::temp_directory_path() / name;
  return full_path.string();
}
} // namespace

TEST_CASE("test export basic functions") {
  std::string path = get_temp_file("model.mlxfn");

  // Function under export: -exp(args[0]).
  auto neg_exp = [](std::vector<array> args) -> std::vector<array> {
    return {negative(exp(args[0]))};
  };

  // Export with a two-element example input, then load it back.
  export_function(path, neg_exp, {array({1.0, 2.0})});
  auto loaded = import_function(path);

  // Two inputs where one was exported.
  CHECK_THROWS_AS(loaded({array({1.0}), array({2.0})}), std::invalid_argument);

  // One element where two were exported.
  CHECK_THROWS_AS(loaded({array({1.0})}), std::invalid_argument);

  // float16 input passed to the loaded function.
  CHECK_THROWS_AS(loaded({array({1.0}, float16)}), std::invalid_argument);

  // A matching input gives the same result as the original function.
  auto want = neg_exp({array({1.0, -1.0})});
  auto got = loaded({array({1.0, -1.0})});
  CHECK(allclose(want[0], got[0]).item<bool>());
}

TEST_CASE("test export function with no inputs") {
  // Ignores its arguments and always returns a 2x2 array of zeros.
  auto make_zeros = [](std::vector<array> args) -> std::vector<array> {
    return {zeros({2, 2})};
  };

  std::string path = get_temp_file("model.mlxfn");

  // Export with an empty input list and load it back.
  export_function(path, make_zeros, {});
  auto loaded = import_function(path);

  auto want = make_zeros({});
  auto got = loaded({});
  CHECK(allclose(want[0], got[0]).item<bool>());
}

TEST_CASE("test export multi output primitives") {
  std::string path = get_temp_file("model.mlxfn");

  // divmod yields two outputs; both are compared after the round trip.
  auto quot_rem = [](std::vector<array> args) -> std::vector<array> {
    return {divmod(args[0], args[1])};
  };

  auto operands = std::vector<array>{array({5.0, -10.0}), array({3.0, -2.0})};
  export_function(path, quot_rem, operands);

  auto loaded = import_function(path);

  auto want = quot_rem(operands);
  auto got = loaded(operands);
  CHECK(allclose(want[0], got[0]).item<bool>());
  CHECK(allclose(want[1], got[1]).item<bool>());
}

TEST_CASE("test export primitives with state") {
  std::string path = get_temp_file("model.mlxfn");

  // argpartition is called with extra integer arguments (2 and 0), which the
  // exported function has to reproduce after being loaded.
  auto partition_fn = [](std::vector<array> args) -> std::vector<array> {
    return {argpartition(args[0], 2, 0)};
  };

  auto input = array({1, 3, 2, 4, 5, 7, 6, 8}, {4, 2});
  export_function(path, partition_fn, {input});

  auto loaded = import_function(path);

  auto want = partition_fn({input});
  auto got = loaded({input});
  CHECK(allclose(want[0], got[0]).item<bool>());
}

TEST_CASE("test export functions with kwargs") {
  std::string path = get_temp_file("model.mlxfn");

  // Adds the values stored under the keys "x" and "y".
  auto add_xy = [](const Kwargs& kwargs) -> std::vector<array> {
    return {kwargs.at("x") + kwargs.at("y")};
  };

  export_function(path, add_xy, {{"x", array(1)}, {"y", array(2)}});
  auto loaded = import_function(path);

  // Positional arguments are rejected.
  CHECK_THROWS(loaded({array(1), array(2)}));

  // An extra key is rejected.
  CHECK_THROWS(loaded({{"x", array(1)}, {"y", array(2)}, {"z", array(3)}}));

  // Keys that differ from the exported ones are rejected.
  CHECK_THROWS(loaded({{"a", array(1)}, {"b", array(2)}}));

  // Matching keys work, with or without an empty positional list.
  auto sum_xy = loaded({{"x", array(1)}, {"y", array(2)}})[0];
  CHECK_EQ(sum_xy.item<int>(), 3);
  sum_xy = loaded({}, {{"x", array(1)}, {"y", array(2)}})[0];
  CHECK_EQ(sum_xy.item<int>(), 3);
}

TEST_CASE("test export function with variable inputs") {
  std::string file_path = get_temp_file("model.mlxfn");

  // Adds every input to a constant {1, 1, 1, 1} array, for any input count.
  auto fun = [](const std::vector<array>& args) -> std::vector<array> {
    auto out = array({1, 1, 1, 1});
    // Iterate by const reference: the inputs are only read, so copying each
    // array handle per iteration is unnecessary.
    for (const auto& x : args) {
      out = out + x;
    }
    return {out};
  };

  // Record the function for two and for three inputs. The exporter lives in
  // its own scope so it is destroyed before the file is imported.
  {
    auto fn_exporter = exporter(file_path, fun);
    fn_exporter({array(0), array(0)});
    fn_exporter({array(0), array(0), array(0)});
  }

  auto imported_fun = import_function(file_path);

  // Call with two inputs
  auto out = imported_fun({array(1), array(2)})[0];

  CHECK(array_equal(out, array({4, 4, 4, 4})).item<bool>());

  // Call with three inputs
  out = imported_fun({array(1), array(2), array(3)})[0];
  CHECK(array_equal(out, array({7, 7, 7, 7})).item<bool>());
}

TEST_CASE("test export function on different stream") {
  std::string path = get_temp_file("model.mlxfn");

  // The exported op is pinned to an explicitly constructed CPU stream with
  // index 1000.
  auto abs_on_stream = [](const std::vector<array>& args) -> std::vector<array> {
    return {abs(args[0], Stream(1000, Device::cpu))};
  };

  export_function(path, abs_on_stream, {array({0, 1, 2})});

  // Importing should create a new stream on which the computation can run.
  auto loaded = import_function(path);
  eval(loaded({array({0, 1, 2})}));
}


================================================
FILE: tests/fft_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

// Checks fft::fft / fft::ifft on small inputs: rejected inputs, output dtype
// and size, known transform values, and per-axis transforms.
TEST_CASE("test fft basics") {
  // An array built from a single scalar value is rejected by both transforms.
  array x(1.0);
  CHECK_THROWS(fft::fft(x));
  CHECK_THROWS(fft::ifft(x));

  // One-element real input: the output is complex64 and keeps the value.
  x = array({1.0});
  auto y = fft::fft(x);
  CHECK_EQ(y.dtype(), complex64);
  CHECK_EQ(y.size(), x.size());
  CHECK_EQ(y.item<complex64_t>(), complex64_t{1.0f, 0.0f});

  y = fft::ifft(x);
  CHECK_EQ(y.dtype(), complex64);
  CHECK_EQ(y.size(), x.size());
  CHECK_EQ(y.item<complex64_t>(), complex64_t{1.0f, 0.0f});

  // One-element complex input: both transforms return the same value.
  x = array({complex64_t{1.0f, 1.0f}}, complex64);
  y = fft::fft(x);
  CHECK_EQ(y.size(), x.size());
  CHECK_EQ(y.item<complex64_t>(), complex64_t{1.0f, 1.0f});

  y = fft::ifft(x);
  CHECK_EQ(y.dtype(), complex64);
  CHECK_EQ(y.size(), x.size());
  CHECK_EQ(y.item<complex64_t>(), complex64_t{1.0f, 1.0f});

  // Four-element real input compared against hard-coded forward and inverse
  // results.
  {
    x = array({0.0f, 1.0f, 2.0f, 3.0f});
    y = fft::fft(x);
    std::initializer_list<complex64_t> expected = {
        {6.0, 0.0},
        {-2.0, 2.0},
        {-2.0, 0.0},
        {-2.0, -2.0},
    };
    CHECK_EQ(y.size(), x.size());
    CHECK(array_equal(y, array(expected)).item<bool>());

    y = fft::ifft(x);
    std::initializer_list<complex64_t> expected_inv = {
        {1.5, 0.0},
        {-0.5, -0.5},
        {-0.5, 0.0},
        {-0.5, 0.5},
    };
    CHECK(array_equal(y, array(expected_inv)).item<bool>());
  }

  // Four-element complex input, plus a round trip through ifft.
  {
    std::initializer_list<complex64_t> vals = {
        {1.0f, 1.0f}, {2.0f, 1.0f}, {1.0f, 2.0f}, {2.0f, 2.0f}};
    x = array(vals);
    y = fft::fft(x);
    std::initializer_list<complex64_t> expected = {
        {6.0, 6.0},
        {-1.0, -1.0},
        {-2.0, 0.0},
        {1.0, -1.0},
    };
    CHECK_EQ(y.size(), x.size());
    CHECK(array_equal(y, array(expected)).item<bool>());
    CHECK(array_equal(fft::ifft(y), x).item<bool>());
  }

  // Specify axes
  {
    x = array({0.0f, 1.0f, 2.0f, 3.0f}, {2, 2});
    // Transform along axis 0, then invert along the same axis.
    std::initializer_list<complex64_t> expected_0 = {
        {2.0, 0.0},
        {4.0, 0.0},
        {-2.0, 0.0},
        {-2.0, 0.0},
    };
    y = fft::fft(x, 0);
    CHECK(array_equal(y, array(expected_0, {2, 2})).item<bool>());
    CHECK(array_equal(fft::ifft(y, 0), x).item<bool>());
    // Transform along axis 1, then invert along the same axis.
    std::initializer_list<complex64_t> expected_1 = {
        {1.0, 0.0},
        {-1.0, 0.0},
        {5.0, 0.0},
        {-1.0, 0.0},
    };
    y = fft::fft(x, 1);
    CHECK(array_equal(y, array(expected_1, {2, 2})).item<bool>());
    CHECK(array_equal(fft::ifft(y, 1), x).item<bool>());
  }
}

TEST_CASE("test real ffts") {
  // One-element real input: complex64 output holding the same value.
  auto signal = array({1.0});
  auto spectrum = fft::rfft(signal);
  CHECK_EQ(spectrum.dtype(), complex64);
  CHECK_EQ(spectrum.size(), signal.size());
  CHECK_EQ(spectrum.item<complex64_t>(), complex64_t{1.0f, 0.0f});

  // Four real samples produce size / 2 + 1 output values.
  {
    signal = array({0.0f, 1.0f, 2.0f, 3.0f});
    spectrum = fft::rfft(signal);
    std::initializer_list<complex64_t> want = {
        {6.0, 0.0}, {-2.0, 2.0}, {-2.0, -0.0}};
    CHECK_EQ(spectrum.size(), signal.size() / 2 + 1);
    CHECK(array_equal(spectrum, array(want)).item<bool>());
  }

  // irfft rejects an array built from a single complex value.
  signal = array(complex64_t{1, 1});
  CHECK_THROWS(fft::irfft(signal));

  // Two complex values yield two float32 values.
  signal = array({complex64_t{0, 1}, complex64_t{1, 0}});
  spectrum = fft::irfft(signal);
  CHECK_EQ(spectrum.size(), 2);
  CHECK_EQ(spectrum.dtype(), float32);
  CHECK(array_equal(spectrum, array({0.5f, -0.5f})).item<bool>());
}

// Checks the multi-dimensional transforms (fftn, fft2, rfftn, irfftn and
// their inverses): argument validation, known values, output shapes and
// dtypes, and inputs produced by transpose.
TEST_CASE("test fftn") {
  // Each of these shape/axes argument combinations must throw
  // std::invalid_argument for a {5, 5, 5} input.
  auto x = zeros({5, 5, 5});
  CHECK_THROWS_AS(fft::fftn(x, {}, {0, 3}), std::invalid_argument);
  CHECK_THROWS_AS(fft::fftn(x, {}, {0, -4}), std::invalid_argument);
  CHECK_THROWS_AS(fft::fftn(x, {}, {0, 0}), std::invalid_argument);
  CHECK_THROWS_AS(fft::fftn(x, {5, 5, 5}, {0}), std::invalid_argument);
  CHECK_THROWS_AS(fft::fftn(x, {0}, {}, {}), std::invalid_argument);
  CHECK_THROWS_AS(fft::fftn(x, {1, -1}, {}, {}), std::invalid_argument);

  // Test 2D FFT
  {
    x = array({0.0f, 1.0f, 2.0f, 3.0f}, {2, 2});
    std::initializer_list<complex64_t> expected = {
        {6.0, 0.0},
        {-2.0, 0.0},
        {-4.0, 0.0},
        {0.0, 0.0},
    };
    auto y = fft::fft2(x);
    CHECK(array_equal(y, array(expected, {2, 2})).item<bool>());
    // The inverse transform recovers the input.
    CHECK(array_equal(fft::ifft2(y), x).item<bool>());
  }

  // Test 3D FFT
  {
    x = reshape(arange(8, float32), {2, 2, 2});
    std::initializer_list<complex64_t> expected = {
        {28.0, 0.0},
        {-4.0, 0.0},
        {-8.0, 0.0},
        {0.0, 0.0},
        {-16.0, 0.0},
        {0.0, 0.0},
        {0.0, 0.0},
        {0.0, 0.0},
    };
    auto y = fft::fftn(x);
    CHECK(array_equal(y, array(expected, {2, 2, 2})).item<bool>());
    CHECK(array_equal(fft::ifftn(y), x).item<bool>());

    // Output shapes of rfftn for a {5, 4} input, with default axes and with
    // the axes given as {1, 0}.
    x = reshape(arange(20, float32), {5, 4});
    y = fft::rfftn(x);
    CHECK_EQ(y.shape(), Shape{5, 3});
    y = fft::rfftn(x, {1, 0});
    CHECK_EQ(y.shape(), Shape{3, 4});

    // Output shapes of irfftn for the same input and axes.
    x = reshape(arange(20, float32), {5, 4});
    y = fft::irfftn(x);
    CHECK_EQ(y.shape(), Shape{5, 6});
    y = fft::irfftn(x, {1, 0});
    CHECK_EQ(y.shape(), Shape{8, 4});
  }

  // Check the types of real ffts
  {
    // Real input: rfft2 / rfftn return complex64 with shape {5, 3}.
    x = zeros({5, 5}, float32);
    auto y = fft::rfft2(x);
    CHECK_EQ(y.shape(), Shape{5, 3});
    CHECK_EQ(y.dtype(), complex64);

    y = fft::rfftn(x);
    CHECK_EQ(y.shape(), Shape{5, 3});
    CHECK_EQ(y.dtype(), complex64);

    // Complex input: irfft2 / irfftn return float32 with shape {5, 8}.
    x = zeros({5, 5}, complex64);
    y = fft::irfft2(x);
    CHECK_EQ(y.shape(), Shape{5, 8});
    CHECK_EQ(y.dtype(), float32);

    y = fft::irfftn(x);
    CHECK_EQ(y.shape(), Shape{5, 8});
    CHECK_EQ(y.dtype(), float32);
  }

  // Test non-contiguous layouts and axes that are not physically last.
  {
    // Complex round trip over axes {2, 0} of a transposed input.
    x = astype(
        transpose(reshape(arange(24, float32), {2, 3, 4}), {1, 2, 0}),
        complex64);
    auto y = fft::fftn(x, {2, 0});
    CHECK_EQ(y.shape(), x.shape());
    CHECK(allclose(fft::ifftn(y, {2, 0}), x, 1e-5, 1e-5).item<bool>());

    // Real round trip over the same axes of a transposed input.
    auto r = transpose(reshape(arange(60, float32), {3, 4, 5}), {1, 2, 0});
    auto yr = fft::rfftn(r, {2, 0});
    CHECK_EQ(yr.shape(), Shape{3, 5, 3});
    CHECK(allclose(fft::irfftn(yr, {2, 0}), r, 1e-5, 1e-5).item<bool>());
  }
}

TEST_CASE("test fft with provided shape") {
  // 5x5 input; the second argument sets the length along the chosen axis.
  auto input = ones({5, 5});

  // Complex transform along axis 0, with lengths 7 and 3.
  auto out = fft::fft(input, 7, 0);
  CHECK_EQ(out.shape(), Shape{7, 5});

  out = fft::fft(input, 3, 0);
  CHECK_EQ(out.shape(), Shape{3, 5});

  // Complex transform along axis 1, with lengths 7 and 3.
  out = fft::fft(input, 7, 1);
  CHECK_EQ(out.shape(), Shape{5, 7});

  out = fft::fft(input, 3, 1);
  CHECK_EQ(out.shape(), Shape{5, 3});

  // Real transform: the transformed axis ends up with n / 2 + 1 entries.
  out = fft::rfft(input, 7, 0);
  CHECK_EQ(out.shape(), Shape{4, 5});

  out = fft::rfft(input, 3, 0);
  CHECK_EQ(out.shape(), Shape{2, 5});

  out = fft::rfft(input, 3, 1);
  CHECK_EQ(out.shape(), Shape{5, 2});
}

TEST_CASE("test fft vmap") {
  auto input = reshape(arange(8), {2, 4});

  // Complex transform: vmap with default axes matches a direct call, and
  // vmap over axis 1 matches a direct call along axis 0.
  auto complex_fn = [](array v) { return fft::fft(v); };
  auto mapped = vmap(complex_fn)(input);
  CHECK(array_equal(mapped, fft::fft(input)).item<bool>());

  mapped = vmap(complex_fn, 1, 1)(input);
  CHECK(array_equal(mapped, fft::fft(input, 0)).item<bool>());

  // The same two checks for the real transform.
  auto real_fn = [](array v) { return fft::rfft(v); };

  mapped = vmap(real_fn)(input);
  CHECK(array_equal(mapped, fft::rfft(input)).item<bool>());

  mapped = vmap(real_fn, 1, 1)(input);
  CHECK(array_equal(mapped, fft::rfft(input, 0)).item<bool>());
}

// Checks the vjp and jvp of fft, ifft, rfft and irfft against closed-form
// expressions built from the opposite transform, then checks that the vjps of
// the N-dimensional variants return gradients of the expected shape.
TEST_CASE("test fft grads") {
  // Regular
  auto fft_fn = [](array x) { return fft::fft(x); };
  auto cotangent = astype(arange(10), complex64);
  auto vjp_out = vjp(fft_fn, zeros_like(cotangent), cotangent).second;
  // Expected vjp: ifft of the cotangent scaled by the length (10).
  CHECK(array_equal(fft::ifft(cotangent) * 10, vjp_out).item<bool>());

  auto tangent = astype(arange(10), complex64);
  auto jvp_out = jvp(fft_fn, zeros_like(tangent), tangent).second;
  // Expected jvp: the transform applied to the tangent.
  CHECK(array_equal(fft::fft(tangent), jvp_out).item<bool>());

  // Inverse
  auto ifft_fn = [](array x) { return fft::ifft(x); };
  vjp_out = vjp(ifft_fn, zeros_like(cotangent), cotangent).second;
  // Expected vjp: fft of the cotangent scaled by 1 / length (0.1).
  CHECK(array_equal(fft::fft(cotangent) * 0.1, vjp_out).item<bool>());

  jvp_out = jvp(ifft_fn, zeros_like(tangent), tangent).second;
  CHECK(array_equal(fft::ifft(tangent), jvp_out).item<bool>());

  // Real
  auto rfft_fn = [](array x) { return fft::rfft(x); };
  cotangent = astype(arange(6), complex64);
  vjp_out = vjp(rfft_fn, zeros({10}), cotangent).second;
  // The first and last cotangent entries are weighted 1.0 and the interior
  // entries 0.5 before the inverse real transform of length 10.
  array mask({1.0, 0.5, 0.5, 0.5, 0.5, 1.0}, complex64);
  auto expected = fft::irfft(cotangent * mask, 10, 0) * 10;
  CHECK(array_equal(expected, vjp_out).item<bool>());

  tangent = astype(arange(10), float32);
  jvp_out = jvp(rfft_fn, zeros_like(tangent), tangent).second;
  CHECK(array_equal(fft::rfft(tangent), jvp_out).item<bool>());

  // Inverse real
  auto irfft_fn = [](array x) { return fft::irfft(x); };
  cotangent = astype(arange(10), float32);
  vjp_out = vjp(irfft_fn, astype(zeros({6}), complex64), cotangent).second;
  // The first and last entries are weighted 0.1 and the interior entries 0.2
  // after the real transform of the cotangent.
  mask = array({0.1, 0.2, 0.2, 0.2, 0.2, 0.1}, float32);
  expected = fft::rfft(cotangent) * mask;
  CHECK(array_equal(expected, vjp_out).item<bool>());

  tangent = astype(arange(10), complex64);
  jvp_out = jvp(irfft_fn, zeros_like(tangent), tangent).second;
  CHECK(array_equal(fft::irfft(tangent), jvp_out).item<bool>());

  // Check ND vjps run properly
  // Only the gradient shape is checked here; it must match the primal's.
  vjp_out = vjp([](array x) { return fft::fftn(x); },
                astype(zeros({5, 5}), complex64),
                astype(zeros({5, 5}), complex64))
                .second;
  CHECK_EQ(vjp_out.shape(), Shape{5, 5});

  vjp_out = vjp([](array x) { return fft::ifftn(x); },
                astype(zeros({5, 5}), complex64),
                astype(zeros({5, 5}), complex64))
                .second;
  CHECK_EQ(vjp_out.shape(), Shape{5, 5});

  vjp_out = vjp([](array x) { return fft::rfftn(x); },
                zeros({5, 9}),
                astype(zeros({5, 5}), complex64))
                .second;
  CHECK_EQ(vjp_out.shape(), Shape{5, 9});

  vjp_out = vjp([](array x) { return fft::irfftn(x); },
                astype(zeros({5, 5}), complex64),
                zeros({5, 8}))
                .second;
  CHECK_EQ(vjp_out.shape(), Shape{5, 5});
}

// Checks fft::fftshift and fft::ifftshift on 1D and 2D inputs against
// hard-coded expected orderings, and checks that out-of-range axes throw.
TEST_CASE("test fftshift and ifftshift") {
  // Test 1D array with even length
  auto x = arange(8);
  auto y = fft::fftshift(x);
  CHECK_EQ(y.shape(), x.shape());
  // The two halves of the input swap places.
  CHECK(array_equal(y, array({4, 5, 6, 7, 0, 1, 2, 3})).item<bool>());

  // Test 1D array with odd length
  x = arange(7);
  y = fft::fftshift(x);
  CHECK_EQ(y.shape(), x.shape());
  CHECK(array_equal(y, array({4, 5, 6, 0, 1, 2, 3})).item<bool>());

  // Test 2D array
  x = reshape(arange(16), {4, 4});
  y = fft::fftshift(x);
  auto expected =
      array({10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5}, {4, 4});
  CHECK(array_equal(y, expected).item<bool>());

  // Test with specific axes
  y = fft::fftshift(x, {0});
  expected =
      array({8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, {4, 4});
  CHECK(array_equal(y, expected).item<bool>());

  y = fft::fftshift(x, {1});
  expected =
      array({2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, {4, 4});
  CHECK(array_equal(y, expected).item<bool>());

  // Test ifftshift (inverse operation)
  // For even length the expected result equals the fftshift result above.
  x = arange(8);
  y = fft::ifftshift(x);
  CHECK_EQ(y.shape(), x.shape());
  CHECK(array_equal(y, array({4, 5, 6, 7, 0, 1, 2, 3})).item<bool>());

  // Test ifftshift with odd length (different from fftshift)
  x = arange(7);
  y = fft::ifftshift(x);
  CHECK_EQ(y.shape(), x.shape());
  CHECK(array_equal(y, array({3, 4, 5, 6, 0, 1, 2})).item<bool>());

  // Test 2D ifftshift
  x = reshape(arange(16), {4, 4});
  y = fft::ifftshift(x);
  expected =
      array({10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5}, {4, 4});
  CHECK(array_equal(y, expected).item<bool>());

  // Test error cases
  // x is 4x4 here, so axes 3 and -5 are both out of range.
  CHECK_THROWS_AS(fft::fftshift(x, {3}), std::invalid_argument);
  CHECK_THROWS_AS(fft::fftshift(x, {-5}), std::invalid_argument);
  CHECK_THROWS_AS(fft::ifftshift(x, {3}), std::invalid_argument);
  CHECK_THROWS_AS(fft::ifftshift(x, {-5}), std::invalid_argument);
}


================================================
FILE: tests/gpu_tests.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include <array>

#include "doctest/doctest.h"
#include "mlx/mlx.h"

using namespace mlx::core;

// Dtypes that the "check all types" loops in this file iterate over when
// comparing GPU results against CPU results.
static const std::array<Dtype, 5> types =
    {bool_, uint32, int32, int64, float32};

TEST_CASE("test gpu arange") {
  for (const auto dtype : types) {
    // bool_ is skipped for arange.
    if (dtype == bool_) {
      continue;
    }

    // Integer start/stop/step.
    auto cpu_result = arange(1, 100, 2, dtype, Device::cpu);
    auto gpu_result = arange(1, 100, 2, dtype, Device::gpu);
    CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());

    // Fractional step.
    cpu_result = arange(1, 5, 0.25, dtype, Device::cpu);
    gpu_result = arange(1, 5, 0.25, dtype, Device::gpu);
    CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
  }
}

TEST_CASE("test gpu full") {
  // Constant fill must agree between devices for every listed dtype.
  for (const auto dtype : types) {
    auto cpu_result = full({4, 4}, 2, dtype, Device::cpu);
    auto gpu_result = full({4, 4}, 2, dtype, Device::gpu);
    CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
  }

  // The fill value may itself be an array that is broadcast to the shape.
  {
    auto filled = full({2, 2}, array({3, 4}, {2, 1}), Device::gpu);
    CHECK(array_equal(filled, array({3, 3, 4, 4}, {2, 2}), Device::cpu)
              .item<bool>());
    filled = full({2, 2}, array({3, 4}, {1, 2}), Device::gpu);
    CHECK(array_equal(filled, array({3, 4, 3, 4}, {2, 2}), Device::cpu)
              .item<bool>());
  }

  // zeros and ones created on the GPU.
  {
    auto gpu_values = zeros({2, 2}, float32, Device::gpu);
    auto want = array({0.0, 0.0, 0.0, 0.0}, {2, 2});
    CHECK(array_equal(gpu_values, want, Device::cpu).item<bool>());

    gpu_values = ones({2, 2}, float32, Device::gpu);
    want = array({1.0, 1.0, 1.0, 1.0}, {2, 2});
    CHECK(array_equal(gpu_values, want, Device::cpu).item<bool>());
  }
}

TEST_CASE("test gpu astype") {
  array values = array({-4, -3, -2, -1, 0, 1, 2, 3});

  // Cast the flat input to every listed dtype on both devices.
  for (const auto dtype : types) {
    auto cpu_result = astype(values, dtype, Device::cpu);
    auto gpu_result = astype(values, dtype, Device::gpu);
    CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
  }

  // Repeat after reshaping and transposing the input.
  values = transpose(reshape(values, {2, 2, 2}), {1, 2, 0});
  for (const auto dtype : types) {
    auto cpu_result = astype(values, dtype, Device::cpu);
    auto gpu_result = astype(values, dtype, Device::gpu);
    CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
  }
}

TEST_CASE("test gpu reshape") {
  // Reshape of a flat input.
  array values = array({0, 1, 2, 3, 4, 5, 6, 7});
  auto cpu_result = reshape(values, {2, 2, 2});
  auto gpu_result = reshape(values, {2, 2, 2}, Device::gpu);
  CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());

  // Reshape of a transposed input, first to 2D ...
  values = transpose(reshape(values, {2, 2, 2}), {1, 2, 0});
  cpu_result = reshape(values, {4, 2});
  gpu_result = reshape(values, {4, 2}, Device::gpu);
  CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());

  // ... and then to 1D.
  cpu_result = reshape(values, {8});
  gpu_result = reshape(values, {8}, Device::gpu);
  CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
}

// Checks full reductions (all, any, sum, prod) run on Device::gpu against
// hard-coded expected values for several input sizes and dtypes.
TEST_CASE("test gpu reduce") {
  // Scalar input and an array built from an empty initializer list.
  {
    array a(true);
    CHECK_EQ(all(a, Device::gpu).item<bool>(), true);
    CHECK_EQ(any(a, Device::gpu).item<bool>(), true);

    a = array(std::initializer_list<bool>{});
    CHECK_EQ(all(a, Device::gpu).item<bool>(), true);
    CHECK_EQ(any(a, Device::gpu).item<bool>(), false);
  }

  // `all` over 33 ones, then with only the last element cleared.
  {
    std::vector<int> vals(33, 1);
    array a(vals.data(), {33});
    CHECK_EQ(all(a, Device::gpu).item<bool>(), true);

    vals[32] = 0;
    a = array(vals.data(), {33});
    CHECK_EQ(all(a, Device::gpu).item<bool>(), false);
  }

  // `any` over 33 zeros, then with only the last element set.
  {
    std::vector<int> vals(33, 0);
    array a(vals.data(), {33});
    CHECK_EQ(any(a, Device::gpu).item<bool>(), false);

    vals[32] = 1;
    a = array(vals.data(), {33});
    CHECK_EQ(any(a, Device::gpu).item<bool>(), true);
  }

  // A larger input (1 << 14 elements) with a few scattered non-zero entries.
  {
    std::vector<int> vals(1 << 14, 0);
    array a(vals.data(), {1 << 14});
    CHECK_EQ(all(a, Device::gpu).item<bool>(), false);
    CHECK_EQ(any(a, Device::gpu).item<bool>(), false);

    vals[4] = 1;
    vals[999] = 1;
    vals[2000] = 1;
    a = array(vals.data(), {1 << 14});
    CHECK_EQ(all(a, Device::gpu).item<bool>(), false);
    CHECK_EQ(any(a, Device::gpu).item<bool>(), true);
  }

  // sum and prod
  {
    // Boolean input: the sum is read back as uint32_t, the product as bool.
    array a = array({true, false, true});
    CHECK_EQ(sum(a, Device::gpu).item<uint32_t>(), 2);
    CHECK_EQ(prod(a, Device::gpu).item<bool>(), false);

    a = array({true, true, true});
    CHECK_EQ(sum(a, Device::gpu).item<uint32_t>(), 3);
    CHECK_EQ(prod(a, Device::gpu).item<bool>(), true);

    a = full({2, 2, 2}, 2.0f);
    CHECK_EQ(sum(a, Device::gpu).item<float>(), 16.0f);
    CHECK_EQ(prod(a, Device::gpu).item<float>(), 256.0f);

    a = full({500, 2, 2}, 1u);
    CHECK_EQ(sum(a, Device::gpu).item<uint32_t>(), 2000);
    CHECK_EQ(prod(a, Device::gpu).item<uint32_t>(), 1u);

    a = full({500, 2, 2}, 1);
    CHECK_EQ(sum(a, Device::gpu).item<int32_t>(), 2000);
    CHECK_EQ(prod(a, Device::gpu).item<int32_t>(), 1);
  }

  // sum and prod overflow
  // The expected sums exceed the range of the uint8 / uint16 input dtypes;
  // the results are read back as uint32_t.
  {
    auto a = full({256, 2, 2}, 1u, uint8);
    CHECK_EQ(sum(a, Device::gpu).item<uint32_t>(), 256 * 4);
    CHECK_EQ(prod(a, Device::gpu).item<uint32_t>(), 1);

    a = full({65535, 2, 2}, 1u, uint16);
    CHECK_EQ(sum(a, Device::gpu).item<uint32_t>(), 65535 * 4);
    CHECK_EQ(prod(a, Device::gpu).item<uint32_t>(), 1);
  }
}

TEST_CASE("test gpu reduce with axes") {
  // Reductions over a subset of axes, plus a broadcast input.
  {
    array data(1.0f);
    data = broadcast_to(data, {2, 2, 2});
    CHECK_EQ(sum(data, Device::gpu).item<float>(), 8.0f);

    data = ones({2, 4, 8, 16});

    // One axis at a time.
    for (auto axis : {0, 1, 2, 3}) {
      auto gpu_result = sum(data, axis, false, Device::gpu);
      auto cpu_result = sum(data, axis, false, Device::cpu);
      CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
    }

    // Axis 0 together with one other axis.
    for (auto axis : {1, 2, 3}) {
      auto gpu_result = sum(data, {0, axis}, false, Device::gpu);
      auto cpu_result = sum(data, {0, axis}, false, Device::cpu);
      CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
    }

    // Axes 0 and 1 together with one trailing axis.
    for (auto axis : {2, 3}) {
      auto gpu_result = sum(data, {0, 1, axis}, false, Device::gpu);
      auto cpu_result = sum(data, {0, 1, axis}, false, Device::cpu);
      CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
    }
  }
}

// Checks element-wise binary operations run on Device::gpu (add, subtract,
// multiply, divide, maximum/minimum, comparisons, logaddexp) against
// hard-coded expected values and, in places, against the CPU result.
TEST_CASE("test gpu binary ops") {
  // scalar-scalar
  {
    array a(2.0f);
    array b(4.0f);
    auto out = add(a, b, Device::gpu);
    CHECK_EQ(out.item<float>(), 6.0f);
  }

  // scalar-vector and vector-scalar
  {
    array a(2.0f);
    array b({2.0f, 4.0f, 6.0f});
    auto out = add(a, b, Device::gpu);
    auto expected = array({4.0f, 6.0f, 8.0f});
    CHECK(array_equal(out, expected, Device::cpu).item<bool>());
    out = add(b, a, Device::gpu);
    CHECK(array_equal(out, expected, Device::cpu).item<bool>());
  }

  // vector-vector
  {
    array a({0.0f, 1.0f, 2.0f});
    array b({3.0f, 4.0f, 5.0f});
    auto out = add(a, b, Device::gpu);
    auto expected = array({3.0f, 5.0f, 7.0f});
    CHECK(array_equal(out, expected, Device::cpu).item<bool>());
  }

  // general
  // Both operands are transposed differently before being added.
  {
    array a({0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}, {2, 2, 2});
    array b({0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}, {2, 2, 2});
    a = transpose(a, {0, 2, 1});
    b = transpose(b, {1, 0, 2});
    auto out_gpu = add(a, b, Device::gpu);
    auto out_cpu = add(a, b, Device::cpu);
    auto expected =
        array({0.0f, 3.0f, 5.0f, 8.0f, 6.0f, 9.0f, 11.0f, 14.0f}, {2, 2, 2});
    CHECK(array_equal(out_gpu, out_cpu, Device::cpu).item<bool>());
    CHECK(array_equal(out_gpu, expected, Device::cpu).item<bool>());
  }

  // Check all types work
  for (auto t : types) {
    auto a = astype(array({0, 1, 2}), t);
    auto b = astype(array({3, 4, 5}), t);
    auto out_cpu = add(a, b, Device::cpu);
    auto out_gpu = add(a, b, Device::gpu);
    CHECK(array_equal(out_gpu, out_cpu, Device::cpu).item<bool>());
  }

  // Check subtraction
  {
    auto a = array({3, 2, 1});
    auto b = array({1, 1, 1});
    auto out = subtract(a, b, Device::gpu);
    CHECK(array_equal(out, array({2, 1, 0}), Device::cpu).item<bool>());
  }

  // Check multiplication
  {
    auto a = array({1, 2, 3});
    auto b = array({2, 2, 2});
    auto out = multiply(a, b, Device::gpu);
    CHECK(array_equal(out, array({2, 4, 6}), Device::cpu).item<bool>());
  }

  // Check division
  {
    auto x = array(1.0f);
    auto y = array(1.0f);
    CHECK_EQ(divide(x, y, Device::gpu).item<float>(), 1.0f);

    x = array(1.0f);
    y = array(0.5);
    CHECK_EQ(divide(x, y, Device::gpu).item<float>(), 2.0f);

    // 1 / 0 must produce an infinity.
    x = array(1.0f);
    y = array(0.0f);
    CHECK(std::isinf(divide(x, y, Device::gpu).item<float>()));

    // 0 / 0 must produce a NaN.
    x = array(0.0f);
    y = array(0.0f);
    CHECK(std::isnan(divide(x, y, Device::gpu).item<float>()));
  }

  // Check maximum and minimum
  {
    auto x = array(1.0f);
    auto y = array(0.0f);
    CHECK_EQ(maximum(x, y, Device::gpu).item<float>(), 1.0f);
    CHECK_EQ(minimum(x, y, Device::gpu).item<float>(), 0.0f);
    y = array(2.0f);
    CHECK_EQ(maximum(x, y, Device::gpu).item<float>(), 2.0f);
    CHECK_EQ(minimum(x, y, Device::gpu).item<float>(), 1.0f);
  }

  // Check equal
  {
    array x(1.0f);
    array y(1.0f);
    CHECK(equal(x, y, Device::gpu).item<bool>());
    x = array(0.0f);
    CHECK(!equal(x, y, Device::gpu).item<bool>());
  }

  // Greater and less
  {
    // x > y, both argument orders, then the tie case.
    array x(1.0f);
    array y(0.0f);
    CHECK(greater(x, y, Device::gpu).item<bool>());
    CHECK(greater_equal(x, y, Device::gpu).item<bool>());
    CHECK(!greater(y, x, Device::gpu).item<bool>());
    CHECK(!greater_equal(y, x, Device::gpu).item<bool>());
    y = array(1.0f);
    CHECK(!greater(x, y, Device::gpu).item<bool>());
    CHECK(greater_equal(x, y, Device::gpu).item<bool>());

    // x < y, both argument orders, then the tie case.
    x = array(0.0f);
    y = array(1.0f);
    CHECK(less(x, y, Device::gpu).item<bool>());
    CHECK(less_equal(x, y, Device::gpu).item<bool>());
    CHECK(!less(y, x, Device::gpu).item<bool>());
    CHECK(!less_equal(y, x, Device::gpu).item<bool>());
    y = array(0.0f);
    CHECK(!less(x, y, Device::gpu).item<bool>());
    CHECK(less_equal(x, y, Device::gpu).item<bool>());
  }

  // Check logaddexp
  // Infinite operands: +inf with a finite value, -inf with a finite value,
  // and -inf with -inf.
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    array x(inf);
    array y(2.0f);
    auto out = logaddexp(x, y, Device::gpu);
    CHECK_EQ(out.item<float>(), inf);

    x = array(-inf);
    out = logaddexp(x, y, Device::gpu);
    CHECK_EQ(out.item<float>(), 2.0f);

    y = array(-inf);
    out = logaddexp(x, y, Device::gpu);
    CHECK_EQ(out.item<float>(), -inf);
  }
}

TEST_CASE("test gpu unary ops") {
  // abs on a plain three-element input.
  {
    array input({-1.0f, 0.0f, 1.0f});
    auto want = array({1.0f, 0.0f, 1.0f});
    CHECK(array_equal(abs(input, Device::gpu), want, Device::cpu)
              .item<bool>());
  }

  // abs on slices of a larger buffer: a step-2 slice and an offset slice.
  {
    array buffer({-1.0f, 0.0f, 1.0f, 1.0f, -1.0f, 1.0f, 3.0f, -3.0f});
    auto view = slice(buffer, {0}, {8}, {2});
    auto want = array({1.0f, 1.0f, 1.0f, 3.0f});
    CHECK(array_equal(abs(view, Device::gpu), want, Device::cpu).item<bool>());

    view = slice(buffer, {4}, {8});
    want = array({1.0f, 1.0f, 3.0f, 3.0f});
    CHECK(array_equal(abs(view, Device::gpu), want, Device::cpu).item<bool>());
  }

  // negative on a scalar.
  {
    array one(1.0f);
    CHECK_EQ(negative(one, Device::gpu).item<float>(), -1.0f);
  }

  // negative must agree between devices for every listed dtype except bool_.
  for (const auto dtype : types) {
    if (dtype == bool_) {
      continue;
    }
    auto input = astype(array({1}), dtype);
    auto cpu_result = negative(input, Device::cpu);
    auto gpu_result = negative(input, Device::gpu);
    CHECK(array_equal(gpu_result, cpu_result, Device::cpu).item<bool>());
  }

  // log1p at -1, at 0, at a tiny positive value and at -2.
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    array value(-1.0f);
    CHECK_EQ(log1p(value, Device::gpu).item<float>(), -inf);

    value = array(0.0f);
    CHECK_EQ(log1p(value, Device::gpu).item<float>(), 0.0f);

    value = array(1e-9f);
    CHECK_EQ(log1p(value, Device::gpu).item<float>(), 1e-9f);

    value = array(-2.0f);
    CHECK(std::isnan(log1p(value, Device::gpu).item<float>()));
  }
}

TEST_CASE("test gpu random") {
  // The same key must yield the same (known) scalar draw every time.
  {
    const auto seed = random::key(0);
    auto first = random::bits({}, 4, seed, Device::gpu);
    auto second = random::bits({}, 4, seed, Device::gpu);
    CHECK_EQ(first.item<uint32_t>(), 1797259609u);
    CHECK_EQ(first.item<uint32_t>(), second.item<uint32_t>());
  }

  // A different key yields a different known draw.
  {
    const auto seed = random::key(1);
    auto draw = random::bits({}, 4, seed, Device::gpu);
    CHECK_EQ(draw.item<uint32_t>(), 507451445u);
  }

  // Non-scalar output shape.
  {
    const auto seed = random::key(0);
    const auto draws = random::bits({3, 1}, 4, seed, Device::gpu);
    const auto want = array({4146024105u, 1351547692u, 2718843009u}, {3, 1});
    CHECK(array_equal(draws, want, Device::cpu).item<bool>());
  }
}

TEST_CASE("test gpu matmul") {
  // All-ones 2x2 operands: every entry of the product is 2.

  // Plain 2-D product
  {
    const auto lhs = ones({2, 2});
    const auto rhs = ones({2, 2});
    const auto product = matmul(lhs, rhs, Device::gpu);
    const auto want = full({2, 2}, 2.0f);
    CHECK(array_equal(product, want, Device::cpu).item<bool>());
  }

  // Batched matmul
  {
    const auto lhs = ones({3, 2, 2});
    const auto rhs = ones({3, 2, 2});
    const auto product = matmul(lhs, rhs, Device::gpu);
    const auto want = full({3, 2, 2}, 2.0f);
    CHECK(array_equal(product, want, Device::cpu).item<bool>());
  }

  // Broadcast batched matmul
  {
    const auto lhs = ones({1, 3, 2, 2});
    const auto rhs = ones({3, 1, 2, 2});
    const auto product = matmul(lhs, rhs, Device::gpu);
    const auto want = full({3, 3, 2, 2}, 2.0f);
    CHECK(array_equal(product, want, Device::cpu).item<bool>());
  }
}

TEST_CASE("test gpu validation") {
  // Smoke test: the calls below only have to evaluate without error; no
  // result is asserted.
  //
  // Run this test with Metal validation enabled
  // METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 ./tests/tests \
  //     -tc="test gpu validation"

  // Unary, binary and reduction ops on arrays built from an empty list
  auto x = array({});
  eval(exp(x));

  auto y = array({});
  eval(add(x, y));

  eval(sum(x));

  // Gather with a scalar index along axis 0, with slice sizes {0} and {2}
  x = array({1, 2, 3});
  y = array(0);
  eval(gather(x, y, 0, {0}));
  eval(gather(x, y, 0, {2}));

  // NOTE(review): these two calls repeat the pair above verbatim; confirm
  // whether a different op was intended here.
  eval(gather(x, y, 0, {0}));
  eval(gather(x, y, 0, {2}));

  // Scatter with the same scalar index
  eval(scatter(x, y, array({2}), 0));

  // arange from 0 to -3 with a positive step; the comparison result against
  // an empty array is evaluated but deliberately not checked
  x = arange(0, -3, 1);
  eval(x);
  array_equal(x, array({})).item<bool>();

  x = array({1.0, 0.0});
  eval(argmax(x));

  // scatter_max with no index arrays and no axes
  eval(scatter_max(array(1), {}, array(2), std::vector<int>{}));
}

TEST_CASE("test memory info") {
  // Exercises the allocator bookkeeping API: cache limit, memory limit, and
  // the active / peak / cache memory counters.

  // Test cache limits
  {
    // With the cache limit at 0, a buffer released at scope exit must not be
    // retained in the cache.
    auto old_limit = set_cache_limit(0);
    {
      auto a = zeros({4096});
      eval(a);
    }
    CHECK_EQ(get_cache_memory(), 0);
    // The setter returns the value it replaces: first the 0 set above, then
    // the restored original limit.
    CHECK_EQ(set_cache_limit(old_limit), 0);
    CHECK_EQ(set_cache_limit(old_limit), old_limit);
  }

  // Test memory limits
  {
    // Same round-trip as above: the setter returns the previous limit.
    auto old_limit = set_memory_limit(10);
    CHECK_EQ(set_memory_limit(old_limit), 10);
    CHECK_EQ(set_memory_limit(old_limit), old_limit);
  }

  // Query active and peak memory
  {
    auto a = zeros({4096});
    eval(a);
    synchronize();
    auto active_mem = get_active_memory();
    // 4096 float32 elements occupy at least 4096 * 4 bytes.
    CHECK(active_mem >= 4096 * 4);
    {
      // A second array of the same size that goes out of scope right away.
      auto b = zeros({4096});
      eval(b);
    }
    synchronize();
    // After `b` is released, active memory is back to its earlier value...
    auto new_active_mem = get_active_memory();
    CHECK_EQ(new_active_mem, active_mem);
    // ...while the peak still reflects both arrays having been live at once.
    auto peak_mem = get_peak_memory();
    CHECK(peak_mem >= 4096 * 8);

    // The released buffer is expected to be held in the cache.
    auto cache_mem = get_cache_memory();
    CHECK(cache_mem >= 4096 * 4);
  }

  // Clearing the cache must drop the cached bytes to zero.
  clear_cache();
  CHECK_EQ(get_cache_memory(), 0);
}


================================================
FILE: tests/linalg_tests.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

#include "doctest/doctest.h"

#include <cmath>

#include "mlx/mlx.h"
#include "mlx/ops.h"

using namespace mlx::core;
using namespace mlx::core::linalg;

TEST_CASE("[mlx.core.linalg.norm] no ord") {
  // norm() without an explicit order: expected values below are the square
  // root of the sum of squares over the reduced axes.

  // Zero dimensions
  array x(2.0);
  CHECK_EQ(norm(x).item<float>(), 2.0f);
  // A scalar has no axis 0 to reduce over.
  CHECK_THROWS(norm(x, 0));

  // 1-D input: axis 0 and axis -1 name the same (only) axis.
  x = array({1, 2, 3});
  float expected = std::sqrt(1 + 4 + 9);
  CHECK_EQ(norm(x).item<float>(), doctest::Approx(expected));
  CHECK_EQ(norm(x, 0, false).item<float>(), doctest::Approx(expected));
  CHECK_EQ(norm(x, -1, false).item<float>(), doctest::Approx(expected));
  // keepdims = true keeps the reduced axis, so the result stays 1-D.
  CHECK_EQ(norm(x, -1, true).ndim(), 1);
  // Axis 1 is out of range for a 1-D input.
  CHECK_THROWS(norm(x, 1));

  // 3x3 input holding 0..8 in row-major order.
  x = reshape(arange(9), {3, 3});
  expected =
      std::sqrt(0 + 1 + 2 * 2 + 3 * 3 + 4 * 4 + 5 * 5 + 6 * 6 + 7 * 7 + 8 * 8);

  // Reducing over everything and over both axes explicitly must agree.
  CHECK_EQ(norm(x).item<float>(), doctest::Approx(expected));
  CHECK_EQ(
      norm(x, std::vector<int>{0, 1}).item<float>(), doctest::Approx(expected));
  // Axis 0: one value per column.
  CHECK(allclose(
            norm(x, 0, false),
            array(
                {std::sqrt(0 + 3 * 3 + 6 * 6),
                 std::sqrt(1 + 4 * 4 + 7 * 7),
                 std::sqrt(2 * 2 + 5 * 5 + 8 * 8)}))
            .item<bool>());
  // Axis 1: one value per row.
  CHECK(allclose(
            norm(x, 1, false),
            array(
                {std::sqrt(0 + 1 + 2 * 2),
                 std::sqrt(3 * 3 + 4 * 4 + 5 * 5),
                 std::sqrt(6 * 6 + 7 * 7 + 8 * 8)}))
            .item<bool>());

  // 2x3x3 input holding 0..17.
  x = reshape(arange(18), {2, 3, 3});
  // Last axis: one value per row of each 3x3 slice.
  CHECK(allclose(
            norm(x, 2, false),
            array(
                {
                    std::sqrt(0 + 1 + 2 * 2),
                    std::sqrt(3 * 3 + 4 * 4 + 5 * 5),
                    std::sqrt(6 * 6 + 7 * 7 + 8 * 8),
                    std::sqrt(9 * 9 + 10 * 10 + 11 * 11),
                    std::sqrt(12 * 12 + 13 * 13 + 14 * 14),
                    std::sqrt(15 * 15 + 16 * 16 + 17 * 17),
                },
                {2, 3}))
            .item<bool>());
  // Last two axes: one value per 3x3 slice.
  CHECK(allclose(
            norm(x, std::vector<int>{1, 2}, false),
            array(
                {std::sqrt(
                     0 + 1 + 2 * 2 + 3 * 3 + 4 * 4 + 5 * 5 + 6 * 6 + 7 * 7 +
                     8 * 8),
                 std::sqrt(
                     9 * 9 + 10 * 10 + 11 * 11 + 12 * 12 + 13 * 13 + 14 * 14 +
                     15 * 15 + 16 * 16 + 17 * 17)},
                {2}))
            .item<bool>());
  // Three axes at once are rejected.
  CHECK_THROWS(norm(x, std::vector<int>{0, 1, 2}));
}

TEST_CASE("[mlx.core.linalg.norm] double ord") {
  // norm() with a numeric order, covering vector norms (one axis) and matrix
  // norms (two axes). The 2 / -2 matrix norms are pinned to Device::cpu.

  // A scalar input is rejected when an order is given.
  CHECK_THROWS(norm(array(0), 2.0));

  array x({1, 2, 3});

  // ord = 2: square root of the sum of squares.
  float expected = std::sqrt(1 + 4 + 9);
  CHECK_EQ(norm(x, 2.0).item<float>(), doctest::Approx(expected));
  CHECK_EQ(norm(x, 2.0, 0).item<float>(), doctest::Approx(expected));
  // Axis 1 is out of range for a 1-D input.
  CHECK_THROWS(norm(x, 2.0, 1));

  // ord = 1: sum of absolute values.
  expected = 1 + 2 + 3;
  CHECK_EQ(norm(x, 1.0).item<float>(), doctest::Approx(expected));

  // ord = 0: number of non-zero entries.
  expected = 3;
  CHECK_EQ(norm(x, 0.0).item<float>(), doctest::Approx(expected));

  // ord = +inf: largest absolute value.
  expected = 3;
  CHECK_EQ(
      norm(x, std::numeric_limits<double>::infinity()).item<float>(),
      doctest::Approx(expected));

  // ord = -inf: smallest absolute value.
  expected = 1;
  CHECK_EQ(
      norm(x, -std::numeric_limits<double>::infinity()).item<float>(),
      doctest::Approx(expected));

  // 3x3 float input holding 0..8 in row-major order.
  x = reshape(arange(9, float32), {3, 3});

  CHECK(allclose(
            norm(x, 2.0, 0, false),
            array(
                {std::sqrt(0 + 3 * 3 + 6 * 6),
                 std::sqrt(1 + 4 * 4 + 7 * 7),
                 std::sqrt(2 * 2 + 5 * 5 + 8 * 8)}))
            .item<bool>());
  // std::sqrt is spelled out here as everywhere else in this file: an
  // unqualified sqrt would depend on <cmath> also exporting ::sqrt while
  // mlx::core::sqrt is visible through the using-directive.
  CHECK(allclose(
            norm(x, 2.0, 1, false),
            array(
                {std::sqrt(0 + 1 + 2 * 2),
                 std::sqrt(3 * 3 + 4 * 4 + 5 * 5),
                 std::sqrt(6 * 6 + 7 * 7 + 8 * 8)}))
            .item<bool>());

  // Matrix norms: the order of the two axes matters for ord = 1 / -1.
  CHECK_EQ(
      norm(x, 1.0, std::vector<int>{0, 1}).item<float>(),
      doctest::Approx(15.0));
  CHECK_EQ(
      norm(x, 1.0, std::vector<int>{1, 0}).item<float>(),
      doctest::Approx(21.0));
  CHECK_EQ(
      norm(x, -1.0, std::vector<int>{0, 1}).item<float>(),
      doctest::Approx(9.0));
  CHECK_EQ(
      norm(x, -1.0, std::vector<int>{1, 0}).item<float>(),
      doctest::Approx(3.0));
  CHECK_EQ(
      norm(x, 2.0, std::vector<int>{0, 1}, false, Device::cpu).item<float>(),
      doctest::Approx(14.226707));
  CHECK_EQ(
      norm(x, 2.0, std::vector<int>{1, 0}, false, Device::cpu).item<float>(),
      doctest::Approx(14.226707));
  CHECK_EQ(
      norm(x, -2.0, std::vector<int>{0, 1}, false, Device::cpu).item<float>(),
      doctest::Approx(0.0));
  CHECK_EQ(
      norm(x, -2.0, std::vector<int>{1, 0}, false, Device::cpu).item<float>(),
      doctest::Approx(0.0));
  // keepdims = true keeps both reduced axes as size-1 dimensions.
  CHECK_EQ(norm(x, 1.0, std::vector<int>{0, 1}, true).shape(), Shape{1, 1});
  CHECK_EQ(norm(x, 1.0, std::vector<int>{1, 0}, true).shape(), Shape{1, 1});
  CHECK_EQ(norm(x, -1.0, std::vector<int>{0, 1}, true).shape(), Shape{1, 1});
  CHECK_EQ(norm(x, -1.0, std::vector<int>{1, 0}, true).shape(), Shape{1, 1});
  CHECK_EQ(
      norm(x, 2.0, std::vector<int>{0, 1}, true, Device::cpu).shape(),
      Shape{1, 1});
  CHECK_EQ(
      norm(x, 2.0, std::vector<int>{1, 0}, true, Device::cpu).shape(),
      Shape{1, 1});
  CHECK_EQ(
      norm(x, -2.0, std::vector<int>{0, 1}, true, Device::cpu).shape(),
      Shape{1, 1});
  CHECK_EQ(
      norm(x, -2.0, std::vector<int>{1, 0}, true, Device::cpu).shape(),
      Shape{1, 1});

  // Negative axes give the same results as {0, 1} on a 2-D input.
  CHECK_EQ(
      norm(x, -1.0, std::vector<int>{-2, -1}, false).item<float>(),
      doctest::Approx(9.0));
  CHECK_EQ(
      norm(x, 1.0, std::vector<int>{-2, -1}, false).item<float>(),
      doctest::Approx(15.0));
  CHECK_EQ(
      norm(x, -2.0, std::vector<int>{-2, -1}, false, Device::cpu).item<float>(),
      doctest::Approx(0.0));
  CHECK_EQ(
      norm(x, 2.0, std::vector<int>{-2, -1}, false, Device::cpu).item<float>(),
      doctest::Approx(14.226707));

  // 2x3x3 float input holding 0..17.
  x = reshape(arange(18, float32), {2, 3, 3});
  // Three axes at once are rejected.
  CHECK_THROWS(norm(x, 2.0, std::vector{0, 1, 2}));
  CHECK(allclose(
            norm(x, 3.0, 0),
            array(
                {9.,
                 10.00333222,
                 11.02199456,
                 12.06217728,
                 13.12502645,
                 14.2094363,
                 15.31340617,
                 16.43469751,
                 17.57113899},
                {3, 3}))
            .item<bool>());
  CHECK(allclose(
            norm(x, 3.0, 2),
            array(
                {2.08008382,
                 6.,
                 10.23127655,
                 14.5180117,
                 18.82291607,
                 23.13593104},
                {2, 3}))
            .item<bool>());
  // ord = 0 along each axis: non-zero counts (only element 0 is zero).
  CHECK(
      allclose(
          norm(x, 0.0, 0), array({1., 2., 2., 2., 2., 2., 2., 2., 2.}, {3, 3}))
          .item<bool>());
  CHECK(allclose(norm(x, 0.0, 1), array({2., 3., 3., 3., 3., 3.}, {2, 3}))
            .item<bool>());
  CHECK(allclose(norm(x, 0.0, 2), array({2., 3., 3., 3., 3., 3.}, {2, 3}))
            .item<bool>());
  // ord = 1 along each axis: sums of absolute values.
  CHECK(allclose(
            norm(x, 1.0, 0),
            array({9., 11., 13., 15., 17., 19., 21., 23., 25.}, {3, 3}))
            .item<bool>());
  CHECK(allclose(norm(x, 1.0, 1), array({9., 12., 15., 36., 39., 42.}, {2, 3}))
            .item<bool>());
  CHECK(allclose(norm(x, 1.0, 2), array({3., 12., 21., 30., 39., 48.}, {2, 3}))
            .item<bool>());

  // Matrix norms over axis pairs of the 3-D input.
  CHECK(allclose(norm(x, 1.0, std::vector<int>{0, 1}), array({21., 23., 25.}))
            .item<bool>());
  CHECK(allclose(norm(x, 1.0, std::vector<int>{1, 2}), array({15., 42.}))
            .item<bool>());
  CHECK(allclose(norm(x, -1.0, std::vector<int>{0, 1}), array({9., 11., 13.}))
            .item<bool>());
  CHECK(allclose(norm(x, -1.0, std::vector<int>{1, 2}), array({9., 36.}))
            .item<bool>());
  CHECK(allclose(norm(x, -1.0, std::vector<int>{1, 0}), array({9., 12., 15.}))
            .item<bool>());
  CHECK(allclose(norm(x, -1.0, std::vector<int>{2, 1}), array({3, 30}))
            .item<bool>());
  CHECK(allclose(norm(x, -1.0, std::vector<int>{1, 2}), array({9, 36}))
            .item<bool>());
  CHECK(allclose(
            norm(x, 2.0, std::vector<int>{0, 1}, false, Device::cpu),
            array({22.045408, 24.155825, 26.318918}))
            .item<bool>());
  CHECK(allclose(
            norm(x, 2.0, std::vector<int>{1, 2}, false, Device::cpu),
            array({14.226707, 39.759212}))
            .item<bool>());
  CHECK(allclose(
            norm(x, -2.0, std::vector<int>{0, 1}, false, Device::cpu),
            array({3, 2.7378995, 2.5128777}))
            .item<bool>());
  // Expected values here are ~1e-16, so the absolute tolerance carries the
  // comparison.
  CHECK(allclose(
            norm(x, -2.0, std::vector<int>{1, 2}, false, Device::cpu),
            array({4.979028e-16, 7.009628e-16}),
            /* rtol = */ 1e-5,
            /* atol = */ 1e-6)
            .item<bool>());
}

TEST_CASE("[mlx.core.linalg.norm] string ord") {
  // Axis pairs used throughout this test.
  const std::vector<int> axes_01{0, 1};
  const std::vector<int> axes_10{1, 0};
  const std::vector<int> axes_12{1, 2};
  const std::vector<int> axes_21{2, 1};
  const std::vector<int> axes_last_two{-2, -1};

  // A string order is rejected for a 1-D input.
  {
    const array vec({1, 2, 3});
    CHECK_THROWS(norm(vec, "fro"));
  }

  // 3x3 input holding 0..8.
  {
    const array mat = reshape(arange(9, float32), {3, 3});

    // Unknown order names are rejected.
    CHECK_THROWS(norm(mat, "bad ord"));

    // "f" and "fro" are two spellings of the same norm.
    CHECK_EQ(
        norm(mat, "f", axes_01).item<float>(),
        doctest::Approx(14.2828568570857));
    CHECK_EQ(
        norm(mat, "fro", axes_01).item<float>(),
        doctest::Approx(14.2828568570857));
    CHECK_EQ(
        norm(mat, "nuc", axes_01, false, Device::cpu).item<float>(),
        doctest::Approx(15.491934));
  }

  // 2x3x3 input holding 0..17.
  const array cube = reshape(arange(18, float32), {2, 3, 3});

  // Expected "fro" results when reducing the leading / trailing axis pair.
  const auto fro_leading = array({22.24859546, 24.31049156, 26.43860813});
  const auto fro_trailing = array({14.28285686, 39.7617907});

  CHECK(allclose(norm(cube, "fro", axes_01), fro_leading).item<bool>());
  CHECK(allclose(norm(cube, "fro", axes_12), fro_trailing).item<bool>());

  // Swapping the order of the two axes leaves the result unchanged.
  CHECK(allclose(norm(cube, "f", axes_01), fro_leading).item<bool>());
  CHECK(allclose(norm(cube, "f", axes_10), fro_leading).item<bool>());
  CHECK(allclose(norm(cube, "f", axes_12), fro_trailing).item<bool>());
  CHECK(allclose(norm(cube, "f", axes_21), fro_trailing).item<bool>());

  // "nuc" results, pinned to the CPU device.
  const auto nuc_trailing = array({15.491934, 40.211937});
  CHECK(allclose(
            norm(cube, "nuc", axes_01, false, Device::cpu),
            array({25.045408, 26.893724, 28.831797}))
            .item<bool>());
  CHECK(allclose(norm(cube, "nuc", axes_12, false, Device::cpu), nuc_trailing)
            .item<bool>());
  // Negative axes address the same trailing pair.
  CHECK(allclose(
            norm(cube, "nuc", axes_last_two, false, Device::cpu), nuc_trailing)
            .item<bool>());
}

TEST_CASE("test QR factorization") {
  // Checks input validation of linalg::qr and, for a 2x2 matrix, that the
  // factors reproduce the input, Q is orthogonal and R is upper triangular.

  // 0D and 1D throw
  CHECK_THROWS(linalg::qr(array(0.0)));
  CHECK_THROWS(linalg::qr(array({0.0, 1.0})));

  // Unsupported types throw
  CHECK_THROWS(linalg::qr(array({0, 1}, {1, 2})));

  array A = array({2., 3., 1., 2.}, {2, 2});
  auto [Q, R] = linalg::qr(A, Device::cpu);

  // Q * R reproduces A
  auto out = matmul(Q, R);
  CHECK(allclose(out, A).item<bool>());

  // Orthogonality means Q^T * Q == I. (Q * Q == I would additionally require
  // Q to be symmetric, which is not a property of a QR factor in general.)
  out = matmul(transpose(Q), Q);
  CHECK(allclose(out, eye(2), 1e-5, 1e-7).item<bool>());

  // R is upper triangular: everything strictly below the diagonal is zero
  CHECK(allclose(tril(R, -1), zeros_like(R)).item<bool>());

  // Float input yields float32 factors
  CHECK_EQ(Q.dtype(), float32);
  CHECK_EQ(R.dtype(), float32);
}

TEST_CASE("test SVD factorization") {
  // Rank-0 and rank-1 inputs are rejected
  CHECK_THROWS(linalg::svd(array(0.0)));
  CHECK_THROWS(linalg::svd(array({0.0, 1.0})));

  // Integer input is rejected
  CHECK_THROWS(linalg::svd(array({0, 1}, {1, 2})));

  const auto key = random::key(42);
  const auto A = mlx::core::random::normal({5, 4}, key);

  // Full decomposition: three outputs, in the order U, S, Vt
  const auto factors = linalg::svd(A, true, Device::cpu);
  CHECK_EQ(factors.size(), 3);

  const auto& U = factors[0];
  const auto& S = factors[1];
  const auto& Vt = factors[2];

  CHECK_EQ(U.shape(), Shape{5, 5});
  CHECK_EQ(S.shape(), Shape{4});
  CHECK_EQ(Vt.shape(), Shape{4, 4});

  // Keep only as many columns of U as there are singular values, then
  // rebuild A from the three factors.
  const auto U_thin = slice(U, {0, 0}, {U.shape(0), S.shape(0)});
  const auto rebuilt = matmul(matmul(U_thin, diag(S)), Vt);

  CHECK(
      allclose(rebuilt, A, /* rtol = */ 1e-3, /* atol = */ 1e-3).item<bool>());
  CHECK_EQ(U.dtype(), float32);
  CHECK_EQ(S.dtype(), float32);
  CHECK_EQ(Vt.dtype(), float32);

  // Singular values only: the first output is the vector of singular values
  const auto values_only = linalg::svd(A, false, Device::cpu);
  const auto SV = values_only[0];

  CHECK_EQ(SV.shape(), Shape{4});
  CHECK_EQ(SV.dtype(), float32);

  // The 2-norm of the singular values equals the Frobenius norm of A
  CHECK(allclose(norm(SV), norm(A, "fro")).item<bool>());
}

TEST_CASE("test matrix inversion") {
  // Rank-0 and rank-1 inputs are rejected
  CHECK_THROWS(linalg::inv(array(0.0), Device::cpu));
  CHECK_THROWS(linalg::inv(array({0.0, 1.0}), Device::cpu));

  // Integer input is rejected
  CHECK_THROWS(linalg::inv(array({0, 1}, {1, 2}), Device::cpu));

  // A 2x3 matrix is rejected
  CHECK_THROWS(linalg::inv(array({1, 2, 3, 4, 5, 6}, {2, 3}), Device::cpu));

  // For a random 5x5 matrix, the inverse must act as both a right and a
  // left inverse.
  const auto key = random::key(42);
  const auto mat = random::normal({5, 5}, key);
  const auto mat_inv = linalg::inv(mat, Device::cpu);
  const auto eye_n = eye(mat.shape(0));

  const auto right_product = matmul(mat, mat_inv);
  CHECK(allclose(right_product, eye_n, /* rtol = */ 0, /* atol = */ 1e-6)
            .item<bool>());

  const auto left_product = matmul(mat_inv, mat);
  CHECK(allclose(left_product, eye_n, /* rtol = */ 0, /* atol = */ 1e-6)
            .item<bool>());
}

TEST_CASE("test matrix cholesky") {
  // Rank-0 and rank-1 inputs are rejected
  CHECK_THROWS(linalg::cholesky(array(0.0), /* upper = */ false, Device::cpu));
  CHECK_THROWS(
      linalg::cholesky(array({0.0, 1.0}), /* upper = */ false, Device::cpu));

  // Integer input is rejected
  CHECK_THROWS(
      linalg::cholesky(
          array({0, 1}, {1, 2}), /* upper = */ false, Device::cpu));

  // A 2x3 matrix is rejected
  CHECK_THROWS(
      linalg::cholesky(
          array({1, 2, 3, 4, 5, 6}, {2, 3}), /* upper = */ false, Device::cpu));

  // Build the input as M * M^T from a random 5x5 matrix M.
  const auto key = random::key(220398);
  const auto root = random::normal({5, 5}, key);
  const auto A = matmul(root, transpose(root));

  // Lower factor: L * L^T reproduces A
  const auto lower = linalg::cholesky(A, /* upper = */ false, Device::cpu);
  const auto from_lower = matmul(lower, transpose(lower));
  // Upper factor: U^T * U reproduces A
  const auto upper = linalg::cholesky(A, /* upper = */ true, Device::cpu);
  const auto from_upper = matmul(transpose(upper), upper);

  CHECK(allclose(from_lower, A, /* rtol = */ 0, /* atol = */ 1e-6)
            .item<bool>());
  CHECK(allclose(from_upper, A, /* rtol = */ 0, /* atol = */ 1e-6)
            .item<bool>());
}

TEST_CASE("test matrix pseudo-inverse") {
  // Rank-0 and rank-1 inputs are rejected
  CHECK_THROWS(linalg::pinv(array(0.0), Device::cpu));
  CHECK_THROWS(linalg::pinv(array({0.0, 1.0}), Device::cpu));

  // Integer input is rejected
  CHECK_THROWS(linalg::pinv(array({0, 1}, {1, 2}), Device::cpu));

  // Checks the two identities every case below relies on:
  //   M * M+ * M == M   and   M+ * M * M+ == M+
  const auto check_identities = [](const array& mat, const array& mat_pinv) {
    const auto mat_again = matmul(matmul(mat, mat_pinv), mat);
    CHECK(allclose(mat_again, mat, /* rtol = */ 1e-5, /* atol = */ 1e-5)
              .item<bool>());
    const auto pinv_again = matmul(matmul(mat_pinv, mat), mat_pinv);
    CHECK(allclose(pinv_again, mat_pinv, /* rtol = */ 1e-5, /* atol = */ 1e-5)
              .item<bool>());
  };

  // Checks that the pseudo-inverse is not (numerically) the zero matrix.
  const auto check_not_zero = [](const array& mat_pinv) {
    const auto all_zero = zeros_like(mat_pinv, Device::cpu);
    CHECK_FALSE(allclose(all_zero, mat_pinv, /* rtol = */ 0, /* atol = */ 1e-6)
                    .item<bool>());
  };

  { // Square m == n
    const auto mat = array({1.0, 2.0, 3.0, 4.0}, {2, 2});
    const auto mat_pinv = linalg::pinv(mat, Device::cpu);
    check_identities(mat, mat_pinv);
  }
  { // Rectangular matrix m < n
    const auto key = random::key(42);
    const auto mat = random::normal({4, 5}, key);
    const auto mat_pinv = linalg::pinv(mat, Device::cpu);
    check_not_zero(mat_pinv);
    check_identities(mat, mat_pinv);
  }
  { // Rectangular matrix m > n
    const auto key = random::key(10);
    const auto mat = random::normal({6, 5}, key);
    const auto mat_pinv = linalg::pinv(mat, Device::cpu);
    check_not_zero(mat_pinv);
    check_identities(mat, mat_pinv);
  }
}

TEST_CASE("test cross product") {
  // Compares linalg::cross against hand-expanded component formulas
  // (a_y*b_z - a_z*b_y, a_z*b_x - a_x*b_z, a_x*b_y - a_y*b_x).
  using namespace mlx::core::linalg;

  // Test for vectors of length 3
  array a = array({1.0, 2.0, 3.0});
  array b = array({4.0, 5.0, 6.0});

  array expected = array(
      {2.0 * 6.0 - 3.0 * 5.0, 3.0 * 4.0 - 1.0 * 6.0, 1.0 * 5.0 - 2.0 * 4.0});

  array result = cross(a, b);
  CHECK(allclose(result, expected).item<bool>());

  // Test for vectors of length 3 with negative values
  a = array({-1.0, -2.0, -3.0});
  b = array({4.0, -5.0, 6.0});

  expected = array(
      {-2.0 * 6.0 - (-3.0 * -5.0),
       -3.0 * 4.0 - (-1.0 * 6.0),
       -1.0 * -5.0 - (-2.0 * 4.0)});

  result = cross(a, b);
  CHECK(allclose(result, expected).item<bool>());

  // Test a length-2 second operand: this does not throw. The expected value
  // is computed as if the missing third component of b were 0.
  b = array({1.0, 2.0});
  expected = array(
      {-2.0 * 0.0 - (-3.0 * 2.0),
       -3.0 * 1.0 - (-1.0 * 0.0),
       -1.0 * 2.0 - (-2.0 * 1.0)});

  result = cross(a, b);
  CHECK(allclose(result, expected).item<bool>());

  // Test for vectors of length 3 with integer values
  a = array({1, 2, 3});
  b = array({4, 5, 6});

  expected = array({2 * 6 - 3 * 5, 3 * 4 - 1 * 6, 1 * 5 - 2 * 4});

  result = cross(a, b);
  CHECK(allclose(result, expected).item<bool>());
}

TEST_CASE("test matrix eigh") {
  // Rank-0 and rank-1 inputs are rejected by both entry points
  CHECK_THROWS(linalg::eigh(array(0.0)));
  CHECK_THROWS(linalg::eigh(array({0.0, 1.0})));
  CHECK_THROWS(linalg::eigvalsh(array(0.0)));
  CHECK_THROWS(linalg::eigvalsh(array({0.0, 1.0})));

  // Integer input is rejected
  CHECK_THROWS(linalg::eigh(array({0, 1}, {1, 2})));

  // A 2x3 matrix is rejected
  CHECK_THROWS(linalg::eigh(array({1, 2, 3, 4, 5, 6}, {2, 3})));

  // Symmetric 2x2 matrix [[1, 2], [2, 4]]
  const array sym = array({1.0, 2.0, 2.0, 4.0}, {2, 2}, float32);
  auto [values, vectors] = linalg::eigh(sym, "L", Device::cpu);

  // Eigenvalues are 0 and 5
  const array want_values = array({0.0, 5.0});
  const auto values_match = allclose(
      values,
      want_values,
      /* rtol = */ 1e-5,
      /* atol = */ 1e-5);
  CHECK(values_match.item<bool>());

  // V * V^T must be the identity
  const auto gram = matmul(vectors, transpose(vectors));
  CHECK(allclose(
            gram,
            eye(2),
            /* rtol = */ 1e-5,
            /* atol = */ 1e-5)
            .item<bool>());

  // A * V must equal V with each column scaled by its eigenvalue
  const auto applied = matmul(sym, vectors);
  const auto scaled = values * vectors;
  CHECK(allclose(applied, scaled).item<bool>());
}

TEST_CASE("test lu") {
  // Runs linalg::lu on `mat`, gathers the rows of output 1 along axis -2 with
  // the indices in output 0, multiplies by output 2 and checks that the
  // product reproduces `mat`.
  const auto check_reconstruction = [](const array& mat) {
    const auto factors = linalg::lu(mat, Device::cpu);
    const auto row_indices = expand_dims(factors[0], -1);
    const auto lower = take_along_axis(factors[1], row_indices, -2);
    const array rebuilt = matmul(lower, factors[2]);
    CHECK(allclose(mat, rebuilt).item<bool>());
  };

  // Test 2x2 matrix
  const array small = array({1., 2., 3., 4.}, {2, 2});
  check_reconstruction(small);

  // Test 3x3 matrix
  const array medium = array({1., 2., 3., 4., 5., 6., 7., 8., 10.}, {3, 3});
  check_reconstruction(medium);

  // Test batch dimension
  const array batched = broadcast_to(medium, {3, 3, 3});
  check_reconstruction(batched);
}

TEST_CASE("test solve") {
  // Rank-0 and rank-1 coefficient inputs are rejected
  CHECK_THROWS(linalg::solve(array(0.), array(0.), Device::cpu));
  CHECK_THROWS(linalg::solve(array({0.}), array({0.}), Device::cpu));

  // Integer input is rejected
  CHECK_THROWS(
      linalg::solve(array({0, 1, 1, 2}, {2, 2}), array({1, 3}), Device::cpu));

  // A 3x2 coefficient matrix is rejected
  {
    const array tall = reshape(arange(6), {3, 2});
    const array rhs = reshape(arange(3), {3, 1});
    CHECK_THROWS(linalg::solve(tall, rhs, Device::cpu));
  }

  // Solves lhs * x = rhs on the CPU and reports whether lhs * x gives back
  // rhs.
  const auto solution_matches = [](const array& lhs, const array& rhs) {
    const array x = linalg::solve(lhs, rhs, Device::cpu);
    return allclose(matmul(lhs, x), rhs).item<bool>();
  };

  // Test 2x2 matrix with 1D rhs
  {
    const array lhs = array({2., 1., 1., 3.}, {2, 2});
    const array rhs = array({8., 13.}, {2});
    CHECK(solution_matches(lhs, rhs));
  }

  // Test 3x3 matrix, then the same system repeated along a batch dimension
  {
    const array lhs = array({1., 2., 3., 4., 5., 6., 7., 8., 10.}, {3, 3});
    const array rhs = array({6., 15., 25.}, {3, 1});
    CHECK(solution_matches(lhs, rhs));

    const array lhs_batch = broadcast_to(lhs, {5, 3, 3});
    const array rhs_batch = broadcast_to(rhs, {5, 3, 1});
    CHECK(solution_matches(lhs_batch, rhs_batch));
  }

  // Test multi-column rhs, then the same system repeated along a batch
  // dimension
  {
    const array lhs = array({2., 1., 1., 1., 3., 2., 1., 0., 0.}, {3, 3});
    const array rhs = array({4., 2., 5., 3., 6., 1.}, {3, 2});
    CHECK(solution_matches(lhs, rhs));

    const array lhs_batch = broadcast_to(lhs, {5, 3, 3});
    const array rhs_batch = broadcast_to(rhs, {5, 3, 2});
    CHECK(solution_matches(lhs_batch, rhs_batch));
  }
}

// NOTE(review): the test-case name misspells "triangular"; it is kept as is
// so that existing -tc filters continue to match.
TEST_CASE("test solve_triangluar") {
  // Lower triangular system
  {
    const array lower = array({2., 0., 0., 3., 1., 0., 1., -1., 1.}, {3, 3});
    const array rhs = array({2., 5., 0.});
    const array want = array({1., 2., 1.});

    const array got =
        linalg::solve_triangular(lower, rhs, /* upper = */ false, Device::cpu);
    CHECK(allclose(want, got).item<bool>());
  }

  // Upper triangular system
  {
    const array upper = array({2., 1., 3., 0., 4., 2., 0., 0., 1.}, {3, 3});
    const array rhs = array({5., 14., 3.});
    const array want = array({-3., 2., 3.});

    const array got =
        linalg::solve_triangular(upper, rhs, /* upper = */ true, Device::cpu);
    CHECK(allclose(want, got).item<bool>());
  }
}


================================================
FILE: tests/load_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <filesystem>
#include <stdexcept>
#include <vector>

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

// Returns the path of `name` placed inside the system temporary directory,
// as a string. Nothing is created on disk.
std::string get_temp_file(const std::string& name) {
  const std::filesystem::path full_path =
      std::filesystem::temp_directory_path() / name;
  return full_path.string();
}

TEST_CASE("test save_safetensors") {
  // Writes two arrays plus string metadata to a .safetensors file and checks
  // that loading returns the same metadata, dtypes, shapes and values.
  const std::string file_path = get_temp_file("test_arr.safetensors");

  std::unordered_map<std::string, array> saved_arrays;
  saved_arrays.insert({"test", array({1.0, 2.0, 3.0, 4.0})});
  saved_arrays.insert({"test2", ones({2, 2})});

  std::unordered_map<std::string, std::string> saved_metadata;
  saved_metadata.insert({"test", "test"});
  saved_metadata.insert({"test2", "test2"});

  save_safetensors(file_path, saved_arrays, saved_metadata);
  auto [loaded_arrays, loaded_metadata] = load_safetensors(file_path);

  CHECK_EQ(loaded_metadata, saved_metadata);

  // Exactly the two saved entries come back
  CHECK_EQ(loaded_arrays.size(), 2);
  CHECK_EQ(loaded_arrays.count("test"), 1);
  CHECK_EQ(loaded_arrays.count("test2"), 1);

  // 1-D entry
  array flat = loaded_arrays.at("test");
  CHECK_EQ(flat.dtype(), float32);
  CHECK_EQ(flat.shape(), Shape{4});
  CHECK(array_equal(flat, array({1.0, 2.0, 3.0, 4.0})).item<bool>());

  // 2-D entry
  array square = loaded_arrays.at("test2");
  CHECK_EQ(square.dtype(), float32);
  CHECK_EQ(square.shape(), Shape{2, 2});
  CHECK(array_equal(square, ones({2, 2})).item<bool>());
}

TEST_CASE("test gguf") {
  // Round-trips weights (with and without string metadata) through a .gguf
  // file and checks which dtypes save_gguf accepts.
  std::string file_path = get_temp_file("test_arr.gguf");
  using dict = std::unordered_map<std::string, array>;
  dict original_weights = {
      {"test", array({1.0f, 2.0f, 3.0f, 4.0f})},
      {"test2", reshape(arange(6), {3, 2})}};

  {
    // Check saving loading just arrays, no metadata
    save_gguf(file_path, original_weights);
    auto [loaded_weights, loaded_metadata] = load_gguf(file_path);
    CHECK_EQ(loaded_metadata.size(), 0);
    CHECK_EQ(loaded_weights.size(), 2);
    CHECK_EQ(loaded_weights.count("test"), 1);
    CHECK_EQ(loaded_weights.count("test2"), 1);
    // Bind by reference: no need to copy each key/array pair per iteration.
    for (const auto& [k, v] : loaded_weights) {
      CHECK(array_equal(v, original_weights.at(k)).item<bool>());
    }
  }

  // Test saving and loading string metadata
  std::unordered_map<std::string, GGUFMetaData> original_metadata;
  original_metadata.insert({"test_str", "my string"});

  save_gguf(file_path, original_weights, original_metadata);
  auto [loaded_weights, loaded_metadata] = load_gguf(file_path);
  CHECK_EQ(loaded_metadata.size(), 1);
  CHECK_EQ(loaded_metadata.count("test_str"), 1);
  CHECK_EQ(std::get<std::string>(loaded_metadata.at("test_str")), "my string");

  CHECK_EQ(loaded_weights.size(), 2);
  CHECK_EQ(loaded_weights.count("test"), 1);
  CHECK_EQ(loaded_weights.count("test2"), 1);
  for (const auto& [k, v] : loaded_weights) {
    CHECK(array_equal(v, original_weights.at(k)).item<bool>());
  }

  // Saving any of these dtypes must throw
  std::vector<Dtype> unsupported_types = {
      bool_, uint8, uint32, uint64, int64, bfloat16, complex64};
  for (auto t : unsupported_types) {
    dict to_save = {{"test", astype(arange(5), t)}};
    CHECK_THROWS(save_gguf(file_path, to_save, original_metadata));
  }

  // These dtypes must round-trip unchanged
  std::vector<Dtype> supported_types = {int8, int32, float16, float32};
  for (auto t : supported_types) {
    auto arr = astype(arange(5), t);
    dict to_save = {{"test", arr}};
    save_gguf(file_path, to_save, original_metadata);
    // Distinct names so the outer loaded_weights / loaded_metadata bindings
    // are not shadowed.
    const auto& [reloaded_weights, reloaded_metadata] = load_gguf(file_path);
    CHECK(array_equal(reloaded_weights.at("test"), arr).item<bool>());
  }
}

TEST_CASE("test gguf metadata") {
  // Round-trips each supported kind of GGUF metadata value (scalar array,
  // 1-D array, string, vector of strings) and checks that unsupported array
  // metadata is rejected on save.
  std::string file_path = get_temp_file("test_arr.gguf");
  using dict = std::unordered_map<std::string, array>;
  dict original_weights = {
      {"test", array({1.0f, 2.0f, 3.0f, 4.0f})},
      {"test2", reshape(arange(6), {3, 2})}};

  // Scalar array
  {
    std::unordered_map<std::string, GGUFMetaData> original_metadata;
    original_metadata.insert({"test_arr", array(1.0)});
    save_gguf(file_path, original_weights, original_metadata);

    auto [loaded_weights, loaded_metadata] = load_gguf(file_path);
    CHECK_EQ(loaded_metadata.size(), 1);
    CHECK_EQ(loaded_metadata.count("test_arr"), 1);

    auto arr = std::get<array>(loaded_metadata.at("test_arr"));
    CHECK_EQ(arr.item<float>(), 1.0f);
  }

  // 1D Array
  {
    std::unordered_map<std::string, GGUFMetaData> original_metadata;
    auto arr = array({1.0, 2.0});
    original_metadata.insert({"test_arr", arr});
    save_gguf(file_path, original_weights, original_metadata);

    auto [loaded_weights, loaded_metadata] = load_gguf(file_path);
    CHECK_EQ(loaded_metadata.size(), 1);
    CHECK_EQ(loaded_metadata.count("test_arr"), 1);

    auto loaded_arr = std::get<array>(loaded_metadata.at("test_arr"));
    CHECK(array_equal(arr, loaded_arr).item<bool>());

    // Preserves dims: a one-element 1-D array must come back 1-D
    arr = array({1.0});
    original_metadata["test_arr"] = arr;
    save_gguf(file_path, original_weights, original_metadata);

    std::tie(loaded_weights, loaded_metadata) = load_gguf(file_path);
    CHECK_EQ(loaded_metadata.size(), 1);
    CHECK_EQ(loaded_metadata.count("test_arr"), 1);

    loaded_arr = std::get<array>(loaded_metadata.at("test_arr"));
    CHECK(array_equal(arr, loaded_arr).item<bool>());
  }

  // > 1D array throws
  {
    std::unordered_map<std::string, GGUFMetaData> original_metadata;
    original_metadata.insert({"test_arr", array({1.0}, {1, 1})});
    CHECK_THROWS(save_gguf(file_path, original_weights, original_metadata));
  }

  // empty array throws
  {
    std::unordered_map<std::string, GGUFMetaData> original_metadata;
    original_metadata.insert({"test_arr", array({})});
    CHECK_THROWS(save_gguf(file_path, original_weights, original_metadata));
  }

  // vector of string
  {
    std::unordered_map<std::string, GGUFMetaData> original_metadata;
    std::vector<std::string> data = {"data1", "data2", "data1234"};
    original_metadata.insert({"meta", data});
    save_gguf(file_path, original_weights, original_metadata);

    auto [loaded_weights, loaded_metadata] = load_gguf(file_path);
    CHECK_EQ(loaded_metadata.size(), 1);
    CHECK_EQ(loaded_metadata.count("meta"), 1);
    auto& strs = std::get<std::vector<std::string>>(loaded_metadata["meta"]);
    CHECK_EQ(strs.size(), 3);
    // Unsigned index: matches the type returned by size().
    for (std::size_t i = 0; i < strs.size(); ++i) {
      CHECK_EQ(strs[i], data[i]);
    }
  }

  // vector of string, string, scalar, and array
  {
    std::unordered_map<std::string, GGUFMetaData> original_metadata;
    std::vector<std::string> data = {"data1", "data2", "data1234"};
    original_metadata.insert({"meta1", data});
    original_metadata.insert({"meta2", array(2.5)});
    original_metadata.insert({"meta3", array({1, 2, 3})});
    original_metadata.insert({"meta4", "last"});
    save_gguf(file_path, original_weights, original_metadata);

    auto [loaded_weights, loaded_metadata] = load_gguf(file_path);
    CHECK_EQ(loaded_metadata.size(), 4);
    auto& strs = std::get<std::vector<std::string>>(loaded_metadata["meta1"]);
    CHECK_EQ(strs.size(), 3);
    for (std::size_t i = 0; i < strs.size(); ++i) {
      CHECK_EQ(strs[i], data[i]);
    }

    // Each entry gets its own reference. Assigning through a single
    // `auto&` would overwrite the "meta2" entry stored in the map.
    auto& scalar_meta = std::get<array>(loaded_metadata["meta2"]);
    CHECK_EQ(scalar_meta.item<float>(), 2.5);

    auto& array_meta = std::get<array>(loaded_metadata["meta3"]);
    CHECK(array_equal(array_meta, array({1, 2, 3})).item<bool>());

    auto& str = std::get<std::string>(loaded_metadata["meta4"]);
    CHECK_EQ(str, "last");
  }
}

TEST_CASE("test single array serialization") {
  // Samples a float32 array of the given shape, saves it to `file_name` in
  // the temporary directory, loads it back and checks that dtype, shape and
  // contents are unchanged.
  const auto check_round_trip = [](const Shape& shape,
                                   const std::string& file_name) {
    auto saved = random::uniform(-5.f, 5.f, shape, float32);
    const std::string file_path = get_temp_file(file_name);

    save(file_path, saved);
    auto loaded = load(file_path);

    CHECK_EQ(saved.dtype(), loaded.dtype());
    CHECK_EQ(saved.shape(), loaded.shape());
    CHECK(array_equal(saved, loaded).item<bool>());
  };

  // Basic test
  check_round_trip(Shape{2, 5, 12}, "test_arr.npy");

  // Other shapes: single element, 1-D, and 5-D with a unit dimension
  check_round_trip(Shape{1}, "test_arr_0.npy");
  check_round_trip(Shape{46}, "test_arr_1.npy");
  check_round_trip(Shape{5, 2, 1, 3, 4}, "test_arr_2.npy");
}


================================================
FILE: tests/ops_tests.cpp
================================================
// Copyright © 2023-2024 Apple Inc.

// Required for using M_PI_2 in MSVC.
#define _USE_MATH_DEFINES
#include <cmath>
#include <numeric>

#include "doctest/doctest.h"

#include "mlx/backend/cuda/cuda.h"
#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test copy") {
  // Scalar: the copy is a different array (new id) holding the same value.
  {
    array scalar(1.0);
    auto duplicate = copy(scalar);
    CHECK_EQ(duplicate.shape(), Shape{});
    CHECK_NE(duplicate.id(), scalar.id());
    CHECK_EQ(duplicate.item<float>(), 1.0f);
  }

  // 2-D int32 input: shape, dtype and contents carry over to the copy.
  {
    auto column = array({1, 2}, {2, 1});
    auto duplicate = copy(column);
    CHECK_EQ(duplicate.shape(), Shape{2, 1});
    CHECK_EQ(duplicate.dtype(), int32);
    CHECK_NE(duplicate.id(), column.id());
    CHECK(array_equal(duplicate, column).item<bool>());
  }
}

// Exercises reshape(): shape validation and -1 inference, empty arrays,
// stride/buffer reuse when reshaping transposed inputs, and the row/column
// contiguity flags of the evaluated result.
TEST_CASE("test reshape") {
  // Scalar input: only shapes with exactly one element are accepted, and at
  // most one dimension may be given as -1.
  array x(1.0);
  CHECK_EQ(reshape(x, {}).shape(), Shape{});
  CHECK_THROWS_AS(reshape(x, {2}), std::invalid_argument);
  auto y = reshape(x, {1, 1, 1});
  CHECK_EQ(y.shape(), Shape{1, 1, 1});
  y = reshape(x, {-1, 1, 1});
  CHECK_EQ(y.shape(), Shape{1, 1, 1});
  y = reshape(x, {1, 1, -1});
  CHECK_EQ(y.shape(), Shape{1, 1, 1});
  CHECK_THROWS_AS(reshape(x, {1, -1, -1}), std::invalid_argument);
  CHECK_THROWS_AS(reshape(x, {2, -1}), std::invalid_argument);

  // 8-element input: the target size must match, and -1 is resolved from the
  // remaining dimensions.
  x = zeros({2, 2, 2});
  y = reshape(x, {8});
  CHECK_EQ(y.shape(), Shape{8});
  CHECK_THROWS_AS(reshape(x, {7}), std::invalid_argument);
  y = reshape(x, {-1});
  CHECK_EQ(y.shape(), Shape{8});
  y = reshape(x, {-1, 2});
  CHECK_EQ(y.shape(), Shape{4, 2});
  CHECK_THROWS_AS(reshape(x, {-1, 7}), std::invalid_argument);

  // Works with empty array
  x = array({});
  y = reshape(x, {0, 0, 0});
  CHECK_EQ(y.shape(), Shape{0, 0, 0});
  y.eval();
  CHECK_EQ(y.size(), 0);
  // An empty array cannot become a shape that holds one element.
  CHECK_THROWS_AS(reshape(x, {}), std::invalid_argument);
  CHECK_THROWS_AS(reshape(x, {1}), std::invalid_argument);
  y = reshape(x, {1, 5, 0});
  CHECK_EQ(y.shape(), Shape{1, 5, 0});

  // Check that reshaping a transposed array doesn't result in a copy
  // (verified below by comparing the data pointers of x and y).
  x = reshape(arange(64), {2, 4, 8});
  x.eval();
  CHECK_EQ(x.strides()[0], 32);
  CHECK_EQ(x.strides()[1], 8);
  CHECK_EQ(x.strides()[2], 1);
  y = reshape(transpose(x, {0, 2, 1}), {2, 4, 2, 4});
  y.eval();
  CHECK_EQ(y.strides()[0], 32);
  CHECK_EQ(y.strides()[1], 2);
  CHECK_EQ(y.strides()[2], 1);
  CHECK_EQ(y.strides()[3], 8);
  CHECK_EQ(x.data<int32_t>(), y.data<int32_t>());

  // Split transposed (2, 8, 4) -> (2, 8, 2, 2)
  y = reshape(transpose(x, {0, 2, 1}), {2, 8, 2, 2});
  y.eval();
  CHECK_EQ(y.strides()[0], 32);
  CHECK_EQ(y.strides()[1], 1);
  CHECK_EQ(y.strides()[2], 16);
  CHECK_EQ(y.strides()[3], 8);
  CHECK_EQ(x.data<int32_t>(), y.data<int32_t>());

  // Split transposed (2, 8, 4) -> (2, 8, 2, 1, 2)
  y = reshape(transpose(x, {0, 2, 1}), {2, 8, 2, 1, 2});
  y.eval();
  CHECK_EQ(y.strides()[0], 32);
  CHECK_EQ(y.strides()[1], 1);
  CHECK_EQ(y.strides()[2], 16);
  // y.strides()[3] can be anything since y.shape()[3] == 1
  CHECK_EQ(y.strides()[4], 8);
  CHECK_EQ(x.data<int32_t>(), y.data<int32_t>());

  // Split transposed (2, 8, 4) -> (2, 8, 2, 1, 2, 1)
  y = reshape(transpose(x, {0, 2, 1}), {2, 8, 2, 1, 2, 1});
  y.eval();
  CHECK_EQ(y.strides()[0], 32);
  CHECK_EQ(y.strides()[1], 1);
  CHECK_EQ(y.strides()[2], 16);
  // y.strides()[3] can be anything since y.shape()[3] == 1
  CHECK_EQ(y.strides()[4], 8);
  // y.strides()[5] can be anything since y.shape()[5] == 1
  CHECK_EQ(x.data<int32_t>(), y.data<int32_t>());

  // Check contiguity preservation
  // Row-contiguous input: reshapes stay row-contiguous.
  x = ones({10, 10});
  eval(x);
  CHECK(x.flags().row_contiguous);
  CHECK(!x.flags().col_contiguous);
  y = reshape(x, {2, 5, 10});
  eval(y);
  CHECK(y.flags().row_contiguous);
  CHECK(!y.flags().col_contiguous);
  y = reshape(x, {10, 1, 10, 1});
  eval(y);
  CHECK(y.flags().row_contiguous);
  CHECK(!y.flags().col_contiguous);
  // Transposed (column-contiguous) input: the flags of the reshaped result
  // depend on the target shape, as asserted case by case below.
  x = transpose(x, {1, 0});
  eval(x);
  CHECK(!x.flags().row_contiguous);
  CHECK(x.flags().col_contiguous);
  y = reshape(x, {2, 5, 10});
  eval(y);
  CHECK(!y.flags().row_contiguous);
  CHECK(y.flags().col_contiguous);
  y = reshape(x, {2, 50});
  eval(y);
  CHECK(y.flags().row_contiguous);
  CHECK(!y.flags().col_contiguous);
  y = reshape(x, {10, 1, 10, 1});
  eval(y);
  CHECK(!y.flags().row_contiguous);
  CHECK(y.flags().col_contiguous);
}

TEST_CASE("test flatten") {
  array x = zeros({2, 3, 4});

  // Shape obtained by flattening the current `x` over axes [start, end].
  auto flat_shape = [&x](int start, int end) {
    return flatten(x, start, end).shape();
  };
  const Shape fully_flat({2 * 3 * 4});
  const Shape tail_flat({2, 3 * 4});

  // Default arguments collapse every axis.
  CHECK_EQ(flatten(x).shape(), fully_flat);

  // Explicit ranges, including an out-of-range end and negative axes.
  CHECK_EQ(flat_shape(1, 1), Shape({2, 3, 4}));
  CHECK_EQ(flat_shape(1, 2), tail_flat);
  CHECK_EQ(flat_shape(1, 3), tail_flat);
  CHECK_EQ(flat_shape(1, -1), tail_flat);
  CHECK_EQ(flat_shape(-2, -1), tail_flat);
  CHECK_EQ(flat_shape(-3, -1), fully_flat);
  CHECK_EQ(flat_shape(-4, -1), fully_flat);

  // Check start > end throws
  CHECK_THROWS(flat_shape(2, 1));

  // Check start >= ndim throws
  CHECK_THROWS(flat_shape(5, 6));

  // Check end < 0 throws
  CHECK_THROWS(flat_shape(-5, -4));

  // Check scalar flattens to 1D
  x = array(1);
  const Shape one_elem({1});
  CHECK_EQ(flat_shape(-3, -1), one_elem);
  CHECK_EQ(flat_shape(0, 0), one_elem);
}

TEST_CASE("test unflatten") {
  // Unflattening a scalar is rejected.
  {
    array scalar = array(1);
    CHECK_THROWS(unflatten(scalar, 0, {1, 1}));
  }

  // One-element vector: valid split plus the rejected argument combinations.
  {
    array single = array({1});
    auto expanded = unflatten(single, 0, {1, 1});
    CHECK_EQ(expanded.shape(), Shape({1, 1}));
    CHECK_THROWS(unflatten(single, 1, {1, 1}));
    CHECK_THROWS(unflatten(single, 0, {-1, -1}));
    CHECK_THROWS(unflatten(single, 0, {-1, 2}));
    CHECK_THROWS(unflatten(single, 0, {}));
  }

  // Splitting the trailing axis of a (4, 8) array into (2, 2, 2).
  {
    array matrix = zeros({4, 8});
    auto expanded = unflatten(matrix, 1, {2, 2, 2});
    CHECK_EQ(expanded.shape(), Shape({4, 2, 2, 2}));
  }
}

TEST_CASE("test squeeze and expand") {
  // squeeze: remove singleton axes, either all of them or a chosen subset.
  {
    array padded = zeros({2, 1, 2, 1, 2, 1});
    const Shape compact{2, 2, 2};
    CHECK_EQ(squeeze(padded).shape(), compact);
    CHECK_EQ(squeeze(padded, {1, 3, 5}).shape(), compact);
    CHECK_EQ(squeeze(padded, {-1, -3, -5}).shape(), compact);
    CHECK_EQ(squeeze(padded, 1).shape(), Shape{2, 2, 1, 2, 1});
    CHECK_EQ(squeeze(padded, -1).shape(), Shape{2, 1, 2, 1, 2});

    // Axes of size 2 and repeated axes (also via negative aliases) throw.
    CHECK_THROWS(squeeze(padded, 0));
    CHECK_THROWS(squeeze(padded, 2));
    CHECK_THROWS(squeeze(padded, {1, 3, 1}));
    CHECK_THROWS(squeeze(padded, {1, 3, -3}));
  }

  // expand_dims: insert singleton axes at one or several positions.
  {
    array square = zeros({2, 2});
    CHECK_EQ(expand_dims(square, 0).shape(), Shape{1, 2, 2});
    CHECK_EQ(expand_dims(square, -1).shape(), Shape{2, 2, 1});
    CHECK_EQ(expand_dims(square, 1).shape(), Shape{2, 1, 2});
    CHECK_EQ(expand_dims(square, {0, 1, 2}).shape(), Shape{1, 1, 1, 2, 2});
    CHECK_EQ(
        expand_dims(square, {0, 1, 2, 5, 6, 7}).shape(),
        Shape{1, 1, 1, 2, 2, 1, 1, 1});

    // Out-of-range and repeated axes throw.
    CHECK_THROWS(expand_dims(square, 3));
    CHECK_THROWS(expand_dims(square, -4));
    CHECK_THROWS(expand_dims(square, {0, 1, 0}));
    CHECK_THROWS(expand_dims(square, {0, 1, -4}));
  }
}

// Exercises slice() with static start/stop/stride arguments: argument
// validation, index clamping, empty results, 2-D slicing, contiguity flags of
// the evaluated result, and the reported data_size() of the sliced output.
TEST_CASE("test slice") {
  // Scalar input: only empty start/stop lists are accepted.
  array x = array(3);
  auto out = slice(x, {}, {});
  CHECK_EQ(out.item<int>(), 3);
  CHECK_THROWS_AS(slice(x, {1}, {2}), std::invalid_argument);
  CHECK_THROWS_AS(slice(x, {}, {2}), std::invalid_argument);
  CHECK_THROWS_AS(slice(x, {0}, {}), std::invalid_argument);

  // One-element vector: negative and out-of-range bounds still select the
  // single element.
  x = array({3});
  out = slice(x, {0}, {1});
  CHECK_EQ(out.item<int>(), 3);
  out = slice(x, {-1}, {1});
  CHECK_EQ(out.item<int>(), 3);

  out = slice(x, {-3}, {10});
  CHECK_EQ(out.item<int>(), 3);

  // start > stop yields an empty result rather than an error.
  out = slice(x, {1}, {0});
  eval(out);
  CHECK_EQ(out.shape(), Shape{0});

  // Explicit strides, including one larger than the axis.
  out = slice(x, {0}, {1}, {1});
  CHECK_EQ(out.item<int>(), 3);

  out = slice(x, {0}, {1}, {10});
  CHECK_EQ(out.item<int>(), 3);

  // 2-D slicing on a (2, 4) array holding 0..7.
  x = array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 4});
  out = slice(x, {0, 0}, {2, 2});
  CHECK(array_equal(out, array({0, 1, 4, 5}, {2, 2})).item<bool>());

  out = slice(x, {0, 0}, {0, 2});
  CHECK(array_equal(out, reshape(array({}), {0, 2})).item<bool>());

  out = slice(x, {0, 2}, {2, 3});
  CHECK(array_equal(out, array({2, 6}, {2, 1})).item<bool>());

  out = slice(x, {0, 0}, {2, 4}, {1, 2});
  CHECK(array_equal(out, array({0, 2, 4, 6}, {2, 2})).item<bool>());

  // Check contiguity preservation
  // Row-contiguous input: only the slice that keeps full rows stays
  // row-contiguous.
  x = ones({10, 10});
  eval(x);
  CHECK(x.flags().row_contiguous);
  CHECK(!x.flags().col_contiguous);
  out = slice(x, {0, 0}, {10, 5});
  eval(out);
  CHECK(!out.flags().row_contiguous);
  CHECK(!out.flags().col_contiguous);
  out = slice(x, {0, 0}, {5, 10});
  eval(out);
  CHECK(out.flags().row_contiguous);
  CHECK(!out.flags().col_contiguous);
  // Column-contiguous (transposed) input: the mirror-image expectations.
  x = transpose(x, {1, 0});
  eval(x);
  CHECK(!x.flags().row_contiguous);
  CHECK(x.flags().col_contiguous);
  out = slice(x, {0, 0}, {10, 5});
  eval(out);
  CHECK(!out.flags().row_contiguous);
  CHECK(out.flags().col_contiguous);
  out = slice(x, {0, 0}, {5, 10});
  eval(out);
  CHECK(!out.flags().row_contiguous);
  CHECK(!out.flags().col_contiguous);

  // A slice with strides of 2 is flagged as not contiguous at all.
  x = ones({6, 4, 10});
  out = slice(x, {0, 0, 0}, {6, 4, 10}, {2, 1, 2});
  eval(out);
  CHECK(!out.flags().contiguous);
  CHECK(!out.flags().row_contiguous);
  CHECK(!out.flags().col_contiguous);

  // Check data size correctness
  // NOTE(review): the expected values look like the element span from the
  // first to the last selected element in the source buffer — confirm against
  // the definition of array::data_size().
  x = ones({4});
  out = slice(x, {0}, {2});
  eval(out);
  CHECK_EQ(out.data_size(), 2);

  out = slice(x, {2}, {4});
  eval(out);
  CHECK_EQ(out.data_size(), 2);

  out = slice(x, {0}, {4}, {2});
  eval(out);
  CHECK_EQ(out.data_size(), 3);

  x = ones({4, 4});
  out = slice(x, {0, 0}, {2, 4});
  eval(out);
  CHECK_EQ(out.data_size(), 8);

  out = slice(x, {0, 0}, {1, 2});
  eval(out);
  CHECK_EQ(out.data_size(), 2);

  out = slice(x, {0, 1}, {4, 4});
  eval(out);
  CHECK_EQ(out.data_size(), 15);

  out = slice(x, {1, 2}, {3, 4});
  eval(out);
  CHECK_EQ(out.data_size(), 6);

  x = ones({4, 4, 4});
  out = slice(x, {0, 0, 0}, {4, 2, 2});
  eval(out);
  CHECK_EQ(out.data_size(), 54);

  x = ones({4, 4, 4});
  out = slice(x, {2, 2, 2}, {3, 3, 3});
  eval(out);
  CHECK_EQ(out.data_size(), 1);

  x = ones({4, 4, 4});
  out = slice(x, {2, 2, 2}, {3, 4, 3});
  eval(out);
  CHECK_EQ(out.data_size(), 5);

  // Negative strides: a fully reversed view covers the whole buffer.
  x = ones({8});
  out = slice(x, {7}, {-9}, {-1});
  eval(out);
  CHECK_EQ(out.data_size(), 8);

  // NOTE(review): this check repeats the previous one verbatim; it may have
  // been meant to cover a different start/stop/stride combination.
  out = slice(x, {7}, {-9}, {-1});
  eval(out);
  CHECK_EQ(out.data_size(), 8);

  x = ones({4, 2});
  out = slice(x, {3, 0}, {-5, 2}, {-1, 1});
  eval(out);
  CHECK_EQ(out.data_size(), 8);
}

TEST_CASE("test slice update") {
  array dst = array({0., 0., 0., 0., 0., 0., 0., 0.}, {8}, float32);
  array update = array({1., 2., 3., 4.}, {4}, float32);

  // Forward window [2, 6): reading the same window back returns the update.
  auto updated = slice_update(dst, update, {2}, {6}, {1});
  auto window = slice(updated, {2}, {6}, {1});
  CHECK(array_equal(window, update).item<bool>());

  // Reversed window from index 5 down to (but excluding) 1.
  updated = slice_update(dst, update, {5}, {1}, {-1});
  window = slice(updated, {5}, {1}, {-1});
  CHECK(array_equal(window, update).item<bool>());

  // A length-4 update written over a whole (2, 4) destination ends up in
  // both rows.
  dst = reshape(dst, {2, 4});
  updated = slice_update(dst, update, {0, 0}, {2, 4}, {1, 1});
  updated = reshape(updated, {8});
  auto first_row = slice(updated, {0}, {4}, {1});
  auto second_row = slice(updated, {4}, {8}, {1});
  CHECK(array_equal(first_row, update).item<bool>());
  CHECK(array_equal(second_row, update).item<bool>());
}

// Exercises slice_update_add(): the update is accumulated into the selected
// window of the source instead of overwriting it.
TEST_CASE("test slice update add") {
  // Basic slice update add
  auto x = zeros({8}, float32);
  auto y = ones({4}, float32);
  auto out = slice_update_add(x, y, {2}, {6}, {1});
  auto expected = array({0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f});
  CHECK(array_equal(out, expected).item<bool>());

  // Overlapping slice update add
  // Windows [2, 6) and [4, 8) overlap at indices 4 and 5, which end up at 2.
  x = zeros({8}, float32);
  y = ones({4}, float32);
  out = slice_update_add(x, y, {2}, {6}, {1});
  out = slice_update_add(out, y, {4}, {8}, {1});
  expected = array({0.0f, 0.0f, 1.0f, 1.0f, 2.0f, 2.0f, 1.0f, 1.0f});
  CHECK(array_equal(out, expected).item<bool>());

  // Slice update add with stride
  // Stride 2 over [1, 7) touches indices 1, 3 and 5.
  x = zeros({10}, float32);
  y = ones({3}, float32);
  out = slice_update_add(x, y, {1}, {7}, {2});
  expected =
      array({0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f});
  CHECK(array_equal(out, expected).item<bool>());

  // 2D slice update add
  // The 2x2 update lands in the centre of the 4x4 destination.
  x = zeros({4, 4}, float32);
  y = ones({2, 2}, float32);
  out = slice_update_add(x, y, {1, 1}, {3, 3}, {1, 1});
  expected = reshape(
      array(
          {0.0f,
           0.0f,
           0.0f,
           0.0f,
           0.0f,
           1.0f,
           1.0f,
           0.0f,
           0.0f,
           1.0f,
           1.0f,
           0.0f,
           0.0f,
           0.0f,
           0.0f,
           0.0f},
          {4, 4}),
      {4, 4});
  CHECK(array_equal(out, expected).item<bool>());

  // Overlapping 2D slice update add
  // The two 2x2 windows share only element (1, 1), which accumulates to 2.
  x = zeros({4, 4}, float32);
  y = ones({2, 2}, float32);
  out = slice_update_add(x, y, {0, 0}, {2, 2}, {1, 1});
  out = slice_update_add(out, y, {1, 1}, {3, 3}, {1, 1});
  expected = reshape(
      array(
          {1.0f,
           1.0f,
           0.0f,
           0.0f,
           1.0f,
           2.0f,
           1.0f,
           0.0f,
           0.0f,
           1.0f,
           1.0f,
           0.0f,
           0.0f,
           0.0f,
           0.0f,
           0.0f},
          {4, 4}),
      {4, 4});
  CHECK(array_equal(out, expected).item<bool>());

  // Slice update add with an integer dtype (both operands are int32 here;
  // mixed-dtype operands are not covered by this case)
  x = zeros({4}, int32);
  y = ones({2}, int32);
  out = slice_update_add(x, y, {1}, {3}, {1});
  expected = array({0, 1, 1, 0});
  CHECK(array_equal(out, expected).item<bool>());

  // Empty slice update add
  // An empty window with an empty update leaves the source unchanged.
  x = arange(4, float32);
  y = array({});
  out = slice_update_add(x, y, {0}, {0}, {1});
  CHECK(array_equal(out, x).item<bool>());

  // Full array slice update add
  x = ones({4}, float32);
  y = full({4}, 2.0f, float32);
  out = slice_update_add(x, y, {0}, {4}, {1});
  expected = array({3.0f, 3.0f, 3.0f, 3.0f});
  CHECK(array_equal(out, expected).item<bool>());
}

TEST_CASE("test dynamic slice") {
  // (2, 3) source holding 0..5; the start indices are supplied as an array.
  auto grid = reshape(arange(6), {2, 3});

  // Rejected start/axes combinations for a (1, 1) slice size.
  CHECK_THROWS(slice(grid, array({1, 0, 0}), {0, 0, 0}, {1, 1}));
  CHECK_THROWS(slice(grid, array({1, 0}), {0}, {1, 1}));
  CHECK_THROWS(slice(grid, array({1}), {3}, {1, 1}));
  CHECK_THROWS(slice(grid, array({1, 0}), {0, 0}, {1, 1}));

  // A (2, 4) slice size and a float32 start array are rejected as well.
  CHECK_THROWS(slice(grid, array({1}), {0}, {2, 4}));
  CHECK_THROWS(slice(grid, array({1.0f}, float32), {0}, {1, 1}));

  // Start at row 1: the first two entries of that row.
  auto window = slice(grid, array({1}), {0}, {1, 2});
  auto row_head = array({3, 4}, {1, 2});
  CHECK(array_equal(window, row_head).item<bool>());

  // Start at (row 1, column 1): the last two entries of that row.
  window = slice(grid, array({1, 1}), {0, 1}, {1, 2});
  auto row_tail = array({4, 5}, {1, 2});
  CHECK(array_equal(window, row_tail).item<bool>());
}

TEST_CASE("test dynamic slice update") {
  auto dst = zeros({2, 3}, int32);

  // Rejected start/axes combinations for a (1, 2) update.
  {
    auto patch = ones({1, 2}, int32);
    CHECK_THROWS(slice_update(dst, patch, array({1, 0, 0}), {0, 0, 0}));
    CHECK_THROWS(slice_update(dst, patch, array({1, 0}), {0}));
    CHECK_THROWS(slice_update(dst, patch, array({1}), {3}));
    CHECK_THROWS(slice_update(dst, patch, array({1, 0}), {0, 0}));
  }

  // Updates of shape (4) and (1, 4), and a float32 start array, are rejected.
  {
    auto flat_patch = ones({4}, int32);
    CHECK_THROWS(slice_update(dst, flat_patch, array({1}), {0}));
    auto wide_patch = ones({1, 4}, int32);
    CHECK_THROWS(slice_update(dst, wide_patch, array({1}), {0}));
    CHECK_THROWS(slice_update(dst, wide_patch, array({1.0f}, float32), {0}));
  }

  // Start at row 1: the update fills the first two entries of that row.
  {
    auto patch = ones({1, 2}, int32);
    auto updated = slice_update(dst, patch, array({1}), {0});
    auto want = reshape(array({0, 0, 0, 1, 1, 0}), {2, 3});
    CHECK(array_equal(updated, want).item<bool>());
  }

  // Start at (row 1, column 1): the update fills the last two entries.
  {
    auto patch = ones({1, 2}, int32);
    auto updated = slice_update(dst, patch, array({1, 1}), {0, 1});
    auto want = reshape(array({0, 0, 0, 0, 1, 1}), {2, 3});
    CHECK(array_equal(updated, want).item<bool>());
  }
}

// Exercises split() in both forms: splitting into a number of equal parts and
// splitting at explicit indices, along the default and a chosen axis.
TEST_CASE("test split") {
  // A scalar cannot be split.
  array x = array(1);
  CHECK_THROWS(split(x, 0));

  // Regression: non-scalar split with num_splits <= 0
  CHECK_THROWS(split(array({0, 1, 2, 3, 4, 5}), 0));
  CHECK_THROWS(split(array({0, 1, 2, 3, 4, 5}), -1));

  x = array({3});
  CHECK_EQ(split(x, 1)[0].item<int>(), 3);

  // Axes 1 and -2 do not exist on a 1-D array.
  x = array({0, 1, 2});
  CHECK_THROWS(split(x, 3, 1));
  CHECK_THROWS(split(x, 3, -2));

  auto out = split(x, 3, 0);
  CHECK_EQ(out.size(), 3);

  // Axis -1 refers to the same (only) axis; each part holds one element.
  out = split(x, 3, -1);
  CHECK_EQ(out.size(), 3);
  for (auto i = 0; i < 3; ++i) {
    CHECK_EQ(out[i].shape(), Shape{1});
    CHECK_EQ(out[i].dtype(), int32);
    CHECK_EQ(out[i].item<int>(), i);
  }

  // 2-D input: rows by default, columns with axis 1.
  x = array({0, 1, 2, 3, 4, 5}, {2, 3});
  out = split(x, 2);
  CHECK(array_equal(out[0], array({0, 1, 2}, {1, 3})).item<bool>());
  CHECK(array_equal(out[1], array({3, 4, 5}, {1, 3})).item<bool>());
  out = split(x, 3, 1);
  CHECK(array_equal(out[0], array({0, 3}, {2, 1})).item<bool>());
  CHECK(array_equal(out[1], array({1, 4}, {2, 1})).item<bool>());
  CHECK(array_equal(out[2], array({2, 5}, {2, 1})).item<bool>());

  x = zeros({8, 12});
  out = split(x, 2);
  CHECK_EQ(out.size(), 2);
  CHECK_EQ(out[0].shape(), Shape{4, 12});
  CHECK_EQ(out[1].shape(), Shape{4, 12});
  out = split(x, 3, 1);
  CHECK_EQ(out.size(), 3);
  CHECK_EQ(out[0].shape(), Shape{8, 4});
  CHECK_EQ(out[1].shape(), Shape{8, 4});
  CHECK_EQ(out[2].shape(), Shape{8, 4});

  // Index-based splitting: an empty index list returns the input as the only
  // part.
  out = split(x, Shape{});
  CHECK_EQ(out.size(), 1);
  CHECK_EQ(out[0].shape(), x.shape());

  out = split(x, {3, 7});
  CHECK_EQ(out.size(), 3);
  CHECK_EQ(out[0].shape(), Shape{3, 12});
  CHECK_EQ(out[1].shape(), Shape{4, 12});
  CHECK_EQ(out[2].shape(), Shape{1, 12});

  // An index past the end gives the whole input followed by an empty part.
  out = split(x, Shape{20});
  CHECK_EQ(out.size(), 2);
  CHECK_EQ(out[0].shape(), Shape{8, 12});
  CHECK_EQ(out[1].shape(), Shape{0, 12});

  // Negative indices
  out = split(x, Shape{-5});
  CHECK_EQ(out[0].shape(), Shape{3, 12});
  CHECK_EQ(out[1].shape(), Shape{5, 12});

  // Different axis
  out = split(x, {2, 8}, 1);
  CHECK_EQ(out[0].shape(), Shape{8, 2});
  CHECK_EQ(out[1].shape(), Shape{8, 6});
  CHECK_EQ(out[2].shape(), Shape{8, 4});

  // Out of order indices
  // A decreasing pair of indices produces an empty part; later parts may
  // revisit earlier elements.
  x = arange(5);
  out = split(x, {2, 1, 2});
  CHECK(array_equal(out[0], array({0, 1})).item<bool>());
  CHECK(array_equal(out[1], array({})).item<bool>());
  CHECK(array_equal(out[2], array({1})).item<bool>());
  CHECK(array_equal(out[3], array({2, 3, 4})).item<bool>());
}

TEST_CASE("test swap and move axes") {
  // Test swapaxes
  {
    // A scalar has no axis 0 to swap.
    array scalar(0.0);
    CHECK_THROWS(swapaxes(scalar, 0, 0));

    array vec = zeros({2});
    CHECK_THROWS(swapaxes(vec, 0, 1));
    CHECK_EQ(swapaxes(vec, 0, 0).shape(), Shape{2});
    CHECK_EQ(swapaxes(vec, -1, -1).shape(), Shape{2});

    array cube = zeros({2, 3, 4});
    // Valid axes for a 3-D array are -3..2; anything outside throws.
    CHECK_THROWS(swapaxes(cube, 0, -4));
    CHECK_THROWS(swapaxes(cube, 0, 3));
    CHECK_THROWS(swapaxes(cube, 3, 0));
    CHECK_THROWS(swapaxes(cube, -4, 0));
    CHECK_EQ(swapaxes(cube, 0, 2).shape(), Shape{4, 3, 2});
    CHECK_EQ(swapaxes(cube, 0, 1).shape(), Shape{3, 2, 4});
    CHECK_EQ(swapaxes(cube, 0, -1).shape(), Shape{4, 3, 2});
    CHECK_EQ(swapaxes(cube, -2, 2).shape(), Shape{2, 4, 3});
  }

  // Test moveaxis
  {
    array scalar = array(0.0);
    CHECK_THROWS(moveaxis(scalar, 0, 0));

    array vec = zeros({2});
    CHECK_THROWS(moveaxis(vec, 0, 1));
    CHECK_EQ(moveaxis(vec, 0, 0).shape(), Shape{2});
    CHECK_EQ(moveaxis(vec, -1, -1).shape(), Shape{2});

    array cube = zeros({2, 3, 4});
    CHECK_THROWS(moveaxis(cube, 0, -4));
    CHECK_THROWS(moveaxis(cube, 0, 3));
    CHECK_THROWS(moveaxis(cube, 3, 0));
    CHECK_THROWS(moveaxis(cube, -4, 0));
    // Unlike a swap, moving axis 0 to the end shifts the other axes forward.
    CHECK_EQ(moveaxis(cube, 0, 2).shape(), Shape{3, 4, 2});
    CHECK_EQ(moveaxis(cube, 0, 1).shape(), Shape{3, 2, 4});
    CHECK_EQ(moveaxis(cube, 0, -1).shape(), Shape{3, 4, 2});
    CHECK_EQ(moveaxis(cube, -2, 2).shape(), Shape{2, 4, 3});
  }
}

// Exercises transpose(): default and explicit axis orders, negative axes,
// argument validation, empty arrays, element order of the result, and the
// row_contiguous flag when only singleton axes are permuted.
TEST_CASE("test transpose") {
  // Scalar: the default transpose is a no-op; any explicit axis is invalid.
  array x(1);
  auto y = transpose(x);
  CHECK_EQ(y.shape(), Shape{});
  CHECK_EQ(y.item<int>(), 1);
  CHECK_THROWS_AS(transpose(x, {0}), std::invalid_argument);
  CHECK_THROWS_AS(transpose(x, {1}), std::invalid_argument);

  x = array({1}, {1});
  y = transpose(x);
  CHECK_EQ(y.shape(), Shape{1});
  CHECK_EQ(y.item<int>(), 1);

  // Negative indices
  y = transpose(x, {-1});
  CHECK_EQ(y.shape(), Shape{1});
  CHECK_EQ(y.item<int>(), 1);

  // Out-of-range axis and too many axes are rejected.
  CHECK_THROWS_AS(transpose(x, {1}), std::invalid_argument);
  CHECK_THROWS_AS(transpose(x, {0, 0}), std::invalid_argument);

  // Works with empty array
  x = array({});
  y = transpose(x);
  CHECK_EQ(y.shape(), Shape{0});
  y.eval();
  CHECK_EQ(y.size(), 0);

  // (2, 3) input: the default order and the equivalent explicit/negative
  // orders all give shape (3, 2).
  x = array({1, 2, 3, 4, 5, 6}, {2, 3});
  y = transpose(x);
  CHECK_EQ(y.shape(), Shape{3, 2});
  y = transpose(x, {-1, 0});
  CHECK_EQ(y.shape(), Shape{3, 2});
  y = transpose(x, {-1, -2});
  CHECK_EQ(y.shape(), Shape{3, 2});
  y.eval();
  CHECK(array_equal(y, array({1, 4, 2, 5, 3, 6}, {3, 2})).item<bool>());
  // The identity permutation returns the input unchanged.
  y = transpose(x, {0, 1});
  CHECK_EQ(y.shape(), Shape{2, 3});
  CHECK(array_equal(y, x).item<bool>());
  y = transpose(x, {0, -1});
  CHECK_EQ(y.shape(), Shape{2, 3});
  CHECK(array_equal(y, x).item<bool>());

  // Wrong number of axes or repeated axes are rejected.
  CHECK_THROWS_AS(transpose(x, {}), std::invalid_argument);
  CHECK_THROWS_AS(transpose(x, {0}), std::invalid_argument);
  CHECK_THROWS_AS(transpose(x, {0, 0}), std::invalid_argument);
  CHECK_THROWS_AS(transpose(x, {0, 0, 0}), std::invalid_argument);
  CHECK_THROWS_AS(transpose(x, {0, 1, 1}), std::invalid_argument);

  // (2, 3, 2) input holding 1..12: the default transpose reverses the axes,
  // so the shape happens to stay (2, 3, 2) while the element order changes.
  x = array({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {2, 3, 2});
  y = transpose(x);
  CHECK_EQ(y.shape(), Shape{2, 3, 2});
  auto expected = array({1, 7, 3, 9, 5, 11, 2, 8, 4, 10, 6, 12}, {2, 3, 2});
  CHECK(array_equal(y, expected).item<bool>());

  y = transpose(x, {0, 1, 2});
  CHECK_EQ(y.shape(), Shape{2, 3, 2});
  CHECK(array_equal(y, x).item<bool>());
  y = transpose(x, {1, 0, 2});
  CHECK_EQ(y.shape(), Shape{3, 2, 2});
  expected = array({1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}, {3, 2, 2});
  CHECK(array_equal(y, expected).item<bool>());
  y = transpose(x, {0, 2, 1});
  CHECK_EQ(y.shape(), Shape{2, 2, 3});
  expected = array({1, 3, 5, 2, 4, 6, 7, 9, 11, 8, 10, 12}, {2, 2, 3});
  CHECK(array_equal(y, expected).item<bool>());

  // Check reshaping a transposed array
  x = array({0, 1, 2, 3, 4, 5, 6, 7}, {4, 2});
  x = reshape(transpose(x), {2, 2, 2});
  expected = array({0, 2, 4, 6, 1, 3, 5, 7}, {2, 2, 2});
  CHECK(array_equal(x, expected).item<bool>());

  // Check maintaining contiguous status
  // Only the two singleton axes (0 and 2) are exchanged here, and the result
  // is still reported as row-contiguous.
  x = array({0, 1, 2, 3, 4, 5, 6, 7}, {1, 4, 1, 2});
  CHECK(x.flags().row_contiguous);
  x = transpose(x, {2, 1, 0, 3});
  eval(x);
  CHECK(x.flags().row_contiguous);
}

// Exercises the element-wise comparison ops (equal, not_equal, greater,
// greater_equal, less, less_equal), their operator forms with array and
// scalar operands, and array_equal, including broadcasting and NaN handling.
TEST_CASE("test comparison ops") {
  // Empty array
  // Comparing two empty arrays yields an empty bool array.
  {
    array x({});
    array y({});
    auto z = x == y;
    CHECK_EQ(z.dtype(), bool_);
    CHECK_EQ(z.shape(), Shape{0});
  }

  // Basic cases
  // Equality and inequality, via functions and via operators with a scalar on
  // either side.
  {
    array x(1.0);
    array y(1.0);
    CHECK(equal(x, y).item<bool>());
    CHECK((x == y).item<bool>());
    CHECK((x == 1.0f).item<bool>());
    CHECK((1.0f == y).item<bool>());

    CHECK(!(x != y).item<bool>());
    CHECK(!not_equal(x, y).item<bool>());
    CHECK(!(1.0f != y).item<bool>());
    CHECK(!(x != 1.0f).item<bool>());

    CHECK(array_equal(x, y).item<bool>());

    x = array(0.0);
    CHECK(!equal(x, y).item<bool>());
    CHECK(!array_equal(x, y).item<bool>());
    CHECK(not_equal(x, y).item<bool>());
  }

  // Greater and less
  // x = 1 and y = 0; the strict forms are false at equality while the
  // non-strict forms are true.
  {
    array x(1.0);
    array y(0.0);
    CHECK(greater(x, y).item<bool>());
    CHECK((x > 0.0f).item<bool>());
    CHECK((1.0f > y).item<bool>());
    CHECK(greater_equal(x, y).item<bool>());
    CHECK((1.0f >= y).item<bool>());
    CHECK(!(x > 1.0f).item<bool>());
    CHECK((x >= 1.0f).item<bool>());

    CHECK(less(y, x).item<bool>());
    CHECK((y < 1.0).item<bool>());
    CHECK((y <= 1.0f).item<bool>());
    CHECK(!(x < 1.0).item<bool>());
    CHECK((x <= 1.0f).item<bool>());
  }

  // Check array_equal works
  // Arrays are equal only when both shape and values match.
  {
    auto x = zeros({5, 5});
    auto y = zeros({5, 5});
    CHECK(array_equal(x, y).item<bool>());

    x = zeros({1, 1});
    CHECK(!array_equal(x, y).item<bool>());

    x = ones({5, 5});
    CHECK(!array_equal(x, y).item<bool>());

    // NaNs compare unequal unless the third argument is true.
    x = array({0.0f, 1.0f, NAN});
    y = array({0.0f, 1.0f, NAN});
    CHECK(!array_equal(x, y).item<bool>());
    CHECK(array_equal(x, y, true).item<bool>());
  }

  // Check other types
  {
    auto x = zeros({5, 5}, int32);
    auto y = zeros({5, 5}, int32);
    CHECK(array_equal(x, y).item<bool>());

    x = ones({5, 5}, bool_);
    y = ones({5, 5}, bool_);
    CHECK(array_equal(x, y).item<bool>());
  }

  // Check type promotion
  // float vs int and bool vs int operands holding "one" compare equal.
  {
    array x(1.0f);
    array y(1);
    CHECK_EQ(equal(x, y).item<bool>(), true);

    x = array(true, bool_);
    CHECK_EQ(equal(x, y).item<bool>(), true);
  }

  // Broadcasting works
  // A (1, 2) operand against a (2, 1) operand gives a (2, 2) bool result;
  // entry (i, j) compares x[0, j] with y[i, 0].
  {
    auto x = zeros({1, 2});
    auto y = zeros({2, 1});
    auto z = equal(x, y);
    CHECK_EQ(z.dtype(), bool_);
    CHECK_EQ(z.shape(), Shape{2, 2});
    auto expected = array({true, true, true, true}, {2, 2});
    CHECK(array_equal(z, expected).item<bool>());

    x = array({1.0, 2.0}, {1, 2});
    y = array({1.0, 2.0}, {2, 1});
    z = equal(x, y);
    CHECK_EQ(z.dtype(), bool_);
    CHECK_EQ(z.shape(), Shape{2, 2});
    expected = array({true, false, false, true}, {2, 2});
    CHECK(array_equal(z, expected).item<bool>());

    expected = array({false, true, false, false}, {2, 2});
    z = greater(x, y);
    CHECK(array_equal(z, expected).item<bool>());

    expected = array({true, true, false, true}, {2, 2});
    z = greater_equal(x, y);
    CHECK(array_equal(z, expected).item<bool>());

    expected = array({false, false, true, false}, {2, 2});
    z = less(x, y);
    CHECK(array_equal(z, expected).item<bool>());

    expected = array({true, false, true, true}, {2, 2});
    z = less_equal(x, y);
    CHECK(array_equal(z, expected).item<bool>());
  }
}

TEST_CASE("test is nan") {
  // float32 scalars
  array finite(1.0f);
  CHECK_FALSE(isnan(finite).item<bool>());

  array not_a_number(NAN);
  CHECK(isnan(not_a_number).item<bool>());

  // Arrays that are not entirely NaN
  array eye = identity(7);
  CHECK_FALSE(all(isnan(eye)).item<bool>());

  array mixed = array({1.0f, NAN, 2.0f});
  CHECK_FALSE(all(isnan(mixed)).item<bool>());

  // Reduced-precision dtypes: finite values first, then NaNs.
  for (auto half_dtype : {bfloat16, float16}) {
    array value(1.0f, half_dtype);
    CHECK_FALSE(isnan(value).item<bool>());
  }
  for (auto half_dtype : {bfloat16, float16}) {
    array value(NAN, half_dtype);
    CHECK(isnan(value).item<bool>());
  }
}

TEST_CASE("test is inf") {
  const auto pos_inf = std::numeric_limits<float>::infinity();
  const auto neg_inf = -std::numeric_limits<float>::infinity();

  // float32 scalars: finite, +inf and -inf.
  array finite(1.0f);
  CHECK_FALSE(isinf(finite).item<bool>());

  array positive(pos_inf);
  CHECK(isinf(positive).item<bool>());

  CHECK(isinf(array(neg_inf)).item<bool>());

  // An all-finite matrix contains no infinities.
  array eye = identity(7);
  CHECK_FALSE(any(isinf(eye)).item<bool>());

  // Element-wise result on a mixed vector.
  array mixed = array({1.0f, pos_inf, 2.0f});
  CHECK(array_equal(array({false, true, false}), isinf(mixed)).item<bool>());

  // Reduced-precision dtypes: finite values first, then infinities.
  for (auto half_dtype : {bfloat16, float16}) {
    array value(1.0f, half_dtype);
    CHECK_FALSE(isinf(value).item<bool>());
  }
  for (auto half_dtype : {bfloat16, float16}) {
    array value(pos_inf, half_dtype);
    CHECK(isinf(value).item<bool>());
  }
}

TEST_CASE("test all close") {
  array reference(1.0f);

  // Identical values are close under the default tolerances.
  {
    array same(1.0f);
    CHECK(allclose(reference, same).item<bool>());
  }

  // 1.0 vs 1.1: the outcome depends on the tolerances passed as the third
  // and fourth arguments.
  {
    array shifted = array(1.1f);
    CHECK_FALSE(allclose(reference, shifted).item<bool>());
    CHECK(allclose(reference, shifted, 0.1).item<bool>());
    CHECK_FALSE(allclose(reference, shifted, 0.01).item<bool>());
    CHECK(allclose(reference, shifted, 0.01, 0.1).item<bool>());
  }
}

TEST_CASE("test is close") {
  const float pos_inf = std::numeric_limits<float>::infinity();
  const double quiet_nan = std::nan("1");

  // True when the element-wise result equals the expected bool array.
  auto matches = [](const array& got, const array& want) {
    return array_equal(got, want).item<bool>();
  };

  // Infinities of the same sign are close to each other.
  {
    array lhs({1.0, pos_inf});
    array rhs({1.0, pos_inf});
    CHECK(matches(isclose(lhs, rhs), array({true, true})));
  }
  {
    array lhs({1.0, -pos_inf});
    array rhs({1.0, -pos_inf});
    CHECK(matches(isclose(lhs, rhs), array({true, true})));
  }
  // Infinities of opposite sign are not.
  {
    array lhs({1.0, pos_inf});
    array rhs({1.0, -pos_inf});
    CHECK(matches(isclose(lhs, rhs), array({true, false})));
  }
  // By default a NaN is close to nothing, not even another NaN.
  {
    array lhs({1.0, quiet_nan, quiet_nan});
    array rhs({1.0, quiet_nan, 2.0});
    CHECK(matches(isclose(lhs, rhs), array({true, false, false})));
  }
  // With the fifth argument set to true, NaN matches NaN.
  {
    array lhs({1.0, quiet_nan, quiet_nan});
    array rhs({1.0, quiet_nan, 2.0});
    CHECK(matches(
        isclose(lhs, rhs, 1e-5, 1e-8, true), array({true, true, false})));
  }
}

TEST_CASE("test reduction ops") {
  // Check shapes and throws correctly
  {
    auto x = array(1);
    auto out = sum(x);
    CHECK_EQ(out.ndim(), 0);
    CHECK_THROWS_AS(sum(x, 0), std::out_of_range);
    CHECK_THROWS_AS(sum(x, -1), std::out_of_range);
    out = sum(x, std::vector<int>{});
    CHECK_EQ(out.shape(), Shape{});
    CHECK_EQ(out.size(), 1);

    x = array({});
    out = sum(x);
    CHECK_EQ(out.shape(), Shape{});
    CHECK_EQ(out.size(), 1);
    out = sum(x, true);
    CHECK_EQ(out.shape(), Shape{1});
    out = sum(x, std::vector<int>{});
    CHECK_EQ(out.shape(), x.shape());

    x = zeros({2});
    out = sum(x);
    CHECK_EQ(out.ndim(), 0);
    out = sum(x, -1);
    CHECK_EQ(out.ndim(), 0);
    out = sum(x, -1, true);
    CHECK_EQ(out.ndim(), 1);
    CHECK_EQ(out.shape(), Shape{1});

    CHECK_THROWS_AS(sum(x, 1), std::out_of_range);
    CHECK_THROWS_AS(sum(x, -2), std::out_of_range);
    CHECK_THROWS_AS(sum(x, {0, 0}), std::invalid_argument);
    CHECK_THROWS_AS(sum(x, {-1, 0}), std::invalid_argument);

    x = zeros({2, 3, 4});
    out = sum(x, {0, 2});
    CHECK_EQ(out.shape(), Shape{3});
    out = sum(x, std::vector<int>{});
    CHECK_EQ(out.shape(), x.shape());

    out = sum(x, {0, -1});
    CHECK_EQ(out.shape(), Shape{3});

    out = sum(x, {0, -1}, true);
    CHECK_EQ(out.shape(), Shape{1, 3, 1});

    out = sum(x, true);
    CHECK_EQ(out.shape(), Shape{1, 1, 1});

    out = sum(x);
    CHECK_EQ(out.shape(), Shape{});

    CHECK_THROWS_AS(sum(x, 3), std::out_of_range);
    CHECK_THROWS_AS(sum(x, -4), std::out_of_range);
    CHECK_THROWS_AS(sum(x, {0, 1, -2}), std::invalid_argument);
  }

  // Test sum
  {
    auto x = array({});
    CHECK_EQ(sum(x).item<float>(), 0.0f);

    x = array({1, 2, 3});
    CHECK_EQ(sum(x).item<int>(), 6);
    CHECK(array_equal(sum(x, std::vector<int>{}), x).item<bool>());

    x = ones({2, 3});
    CHECK(array_equal(sum(x, 1), full({2}, 3.0f)).item<bool>());
    CHECK(array_equal(sum(x, 0), full({3}, 2.0f)).item<bool>());
    CHECK_EQ(sum(x, {0, 1}).item<float>(), 6.0f);

    x = ones({2, 3, 4});
    CHECK(array_equal(sum(x, 0), full({3, 4}, 2.0f)).item<bool>());
    CHECK(array_equal(sum(x, 1), full({2, 4}, 3.0f)).item<bool>());
    CHECK(array_equal(sum(x, 2), full({2, 3}, 4.0f)).item<bool>());
    CHECK(array_equal(sum(x, {0, 1}), full({4}, 6.0f)).item<bool>());
    CHECK(array_equal(sum(x, {0, 2}), full({3}, 8.0f)).item<bool>());
    CHECK(array_equal(sum(x, {1, 2}), full({2}, 12.0f)).item<bool>());

    // Output for bool gets higher precision
    x = array({true, true, true});
    CHECK_EQ(sum(x).item<int32_t>(), 3);

    x = array(2.0f);
    x = broadcast_to(x, {2, 2, 2});
    CHECK_EQ(sum(x).item<float>(), 16.0f);

    // Tests with non-uniform results after reduction
    x = array({1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f}, {2, 3});
    CHECK(array_equal(sum(x, 0), full({3}, 3.0f)).item<bool>());
    CHECK(array_equal(sum(x, 1), array({3.0f, 6.0f}, {2})).item<bool>());
  }

  // Test unsigned sum
  {
    const int num_elems = 1000;

    auto x = astype(full({num_elems}, 255), uint8);
    CHECK_EQ(sum(x, Device::cpu).item<uint32_t>(), 255 * num_elems);

    x = astype(full({num_elems}, 65535), uint16);
    CHECK_EQ(sum(x, Device::cpu).item<uint32_t>(), 65535 * num_elems);

    x = full({3, 3, 3}, 10000, uint32);
    CHECK_EQ(sum(x, Device::cpu).item<uint32_t>(), 270000);

    x = full({3, 3, 3}, 10000, uint64);
    CHECK_EQ(sum(x, Device::cpu).item<uint64_t>(), 270000);
  }

  // Test prod
  {
    auto x = array({});
    CHECK_EQ(prod(x).item<float>(), 1.0f);

    x = array({2, 2, 2});
    CHECK_EQ(prod(x).item<int>(), 8);
    CHECK(array_equal(prod(x, std::vector<int>{}), x).item<bool>());

    x = full({2, 3}, 2.0f);
    CHECK(array_equal(prod(x, 1), full({2}, 8.0f)).item<bool>());
    CHECK(array_equal(prod(x, 0), full({3}, 4.0f)).item<bool>());
    CHECK_EQ(prod(x, {0, 1}).item<float>(), 64.0f);

    x = full({2, 3, 4}, 2.0f);
    CHECK(array_equal(prod(x, 0), full({3, 4}, 4.0f)).item<bool>());
    CHECK(array_equal(prod(x, 1), full({2, 4}, 8.0f)).item<bool>());
    CHECK(array_equal(prod(x, 2), full({2, 3}, 16.0f)).item<bool>());
    CHECK(array_equal(prod(x, {0, 1}), full({4}, 64.0f)).item<bool>());
    CHECK(array_equal(prod(x, {0, 2}), full({3}, 256.0f)).item<bool>());
    CHECK(array_equal(prod(x, {1, 2}), full({2}, 4096.0f)).item<bool>());

    // Tests with non-uniform results after reduction
    x = array({1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f}, {2, 3});
    CHECK(array_equal(prod(x, 0), full({3}, 2.0f)).item<bool>());
    CHECK(array_equal(prod(x, 1), array({1.0f, 8.0f}, {2})).item<bool>());

    x = array({true, true, true, false, true, false}, {2, 3});
    CHECK(array_equal(prod(x, 0), array({false, true, false})).item<bool>());
    CHECK(array_equal(prod(x, 1), array({true, false})).item<bool>());
  }

  // Test unsigned prod
  {
    auto x = array({255, 255}, {2}, uint8);
    CHECK_EQ(prod(x, Device::cpu).item<uint32_t>(), 65025);

    x = array({65535, 2}, {2}, uint16);
    CHECK_EQ(prod(x, Device::cpu).item<uint32_t>(), 131070);

    x = array({100000, 2}, {2}, uint32);
    CHECK_EQ(prod(x, Device::cpu).item<uint32_t>(), 200000);

    x = array({100000, 2}, {2}, uint64);
    CHECK_EQ(prod(x, Device::cpu).item<uint64_t>(), 200000);
  }

  // Test all
  {
    auto x = array({});
    CHECK_EQ(all(x).item<bool>(), true);

    x = array({2, 2, 2});
    CHECK_EQ(all(x).item<bool>(), true);
    auto out = all(x, std::vector<int>{});
    CHECK(array_equal(out, array({true, true, true})).item<bool>());

    x = array({0, 2, 2});
    CHECK_EQ(all(x).item<bool>(), false);

    x = array({true, true, true, false, true, false}, {2, 3});
    CHECK(array_equal(all(x, 1), array({true, false})).item<bool>());
    CHECK(array_equal(all(x, 0), array({false, true, false})).item<bool>());
  }

  // Test any
  {
    auto x = array({});
    CHECK_EQ(any(x).item<bool>(), false);

    x = array({0, 0, 0});
    CHECK_EQ(any(x).item<bool>(), false);

    x = array({0, 2, 0});
    CHECK_EQ(any(x).item<bool>(), true);
    auto out = any(x, std::vector<int>{});
    CHECK(array_equal(out, array({false, true, false})).item<bool>());

    x = array({true, false, true, false, false, false}, {2, 3});
    CHECK(array_equal(any(x, 1), array({true, false})).item<bool>());
    CHECK(array_equal(any(x, 0), array({true, false, true})).item<bool>());
  }

  // Test max and min
  {
    auto x = array({});
    CHECK_THROWS(max(x));
    CHECK_THROWS(min(x));

    x = array({1.0f, 2.0f, 3.0f});
    CHECK_EQ(max(x).item<float>(), 3.0f);
    CHECK_EQ(min(x).item<float>(), 1.0f);

    x = array({-2.0f, -1.0f});
    CHECK_EQ(max(x).item<float>(), -1.0f);
    CHECK_EQ(min(x).item<float>(), -2.0f);

    constexpr float inf = std::numeric_limits<float>::infinity();
    x = array({inf});
    CHECK_EQ(min(x).item<float>(), inf);

    x = array({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2, 3});
    CHECK(array_equal(max(x, 0), array({4.0f, 5.0f, 6.0f})).item<bool>());
    CHECK(array_equal(max(x, 1), array({3.0f, 6.0f})).item<bool>());
    CHECK(array_equal(min(x, 0), array({1.0f, 2.0f, 3.0f})).item<bool>());
    CHECK(array_equal(min(x, 1), array({1.0f, 4.0f})).item<bool>());

    x = array({1u, 2u, 3u});
    CHECK_EQ(max(x).item<uint32_t>(), 3u);
    CHECK_EQ(min(x).item<uint32_t>(), 1u);

    x = array({1u, 2u, 3u, 4u, 5u, 6u}, {2, 3});
    CHECK(array_equal(max(x, 0), array({4u, 5u, 6u})).item<bool>());
    CHECK(array_equal(max(x, 1), array({3u, 6u})).item<bool>());
    CHECK(array_equal(min(x, 0), array({1u, 2u, 3u})).item<bool>());
    CHECK(array_equal(min(x, 1), array({1u, 4u})).item<bool>());

    x = array({true, false, true, false, false, false}, {2, 3});
    CHECK(array_equal(max(x, 1), array({true, false})).item<bool>());
    CHECK(array_equal(max(x, 0), array({true, false, true})).item<bool>());

    x = array({true, true, true, false, true, false}, {2, 3});
    CHECK(array_equal(min(x, 1), array({true, false})).item<bool>());
    CHECK(array_equal(min(x, 0), array({false, true, false})).item<bool>());

    x = array({1.0f, NAN, 3.0f, 4.0f, 5.0f, 6.0f}, {2, 3});
    CHECK(array_equal(max(x, 0), array({4.0f, NAN, 6.0f}), true).item<bool>());
    CHECK(array_equal(max(x, 1), array({NAN, 6.0f}), true).item<bool>());
  }

  // Test logsumexp
  {
    auto x = array({});
    CHECK_THROWS(logsumexp(x));

    constexpr float inf = std::numeric_limits<float>::infinity();

    x = array({-inf, -inf});
    CHECK_EQ(logsumexp(x).item<float>(), -inf);

    x = repeat(array(-inf), 5000);
    CHECK_EQ(logsumexp(x).item<float>(), -inf);

    x = array({0.0f, -inf});
    CHECK_EQ(logsumexp(x).item<float>(), 0.0f);

    x = array({0.0f, inf});
    CHECK_EQ(logsumexp(x).item<float>(), inf);

    x = reshape(arange(6, float32), {2, 3});

    std::vector<float> nums = {0.0f, 1.0f, 2.0f, 3.0f};
    x = array(nums.data(), {2, 2});
    auto y = logsumexp(x, {0, 1}, true);
    CHECK_EQ(y.shape(), Shape{1, 1});
    auto result = std::log(
        std::exp(nums[0]) + std::exp(nums[1]) + std::exp(nums[2]) +
        std::exp(nums[3]));
    CHECK(y.item<float>() == doctest::Approx(result));
    auto expected = array(
        {std::log(std::exp(nums[0]) + std::exp(nums[2])),
         std::log(std::exp(nums[1]) + std::exp(nums[3]))});
    CHECK(allclose(logsumexp(x, 0), expected).item<bool>());

    expected = array(
        {std::log(std::exp(nums[0]) + std::exp(nums[1])),
         std::log(std::exp(nums[2]) + std::exp(nums[3]))});
    CHECK(allclose(logsumexp(x, 1), expected).item<bool>());
  }

  // Test softmax
  {
    for (auto t : {float16, bfloat16, float32}) {
      const auto rtol = t == float32 ? 1e-5 : 1e-2;
      auto x = array({}, t);
      CHECK(array_equal(x, softmax(x)).item<bool>());

      // all zeros
      x = array({0., 0., 0., 0.}, t);
      auto y = array({0.25, 0.25, 0.25, 0.25}, t);
      CHECK(array_equal(y, softmax(x)).item<bool>());
      CHECK(array_equal(y, softmax(x, -1)).item<bool>());
      CHECK(array_equal(y, softmax(x, std::vector<int>{-1})).item<bool>());
      CHECK(array_equal(y, softmax(x, std::vector<int>{0})).item<bool>());

      auto ones = array(1.0f, t);
      CHECK(array_equal(ones, sum(softmax(x))).item<bool>());

      // all ones
      x = array({1., 1., 1., 1.}, t);
      CHECK(array_equal(y, softmax(x)).item<bool>());
      CHECK(array_equal(ones, sum(softmax(x))).item<bool>());

      // negative values
      x = array({-1., -2., -3., -4.}, t);
      y = array({0.643914, 0.236883, 0.0871443, 0.0320586}, t);
      CHECK(allclose(y, softmax(x), rtol).item<bool>());
      CHECK(allclose(ones, sum(softmax(x)), rtol).item<bool>());

      // positive and negative values
      x = array({1., 0., -1., 0.}, t);
      y = array({0.534447, 0.196612, 0.0723295, 0.196612}, t);
      CHECK(allclose(y, softmax(x), rtol).item<bool>());
      CHECK(allclose(ones, sum(softmax(x)), rtol).item<bool>());

      // large positive values
      x = array({1000., 1000., 1000.}, t);
      y = array({0.333333, 0.333333, 0.333333}, t);
      CHECK(allclose(y, softmax(x)).item<bool>());
      CHECK(array_equal(ones, sum(softmax(x))).item<bool>());

      // large negative values
      x = negative(x);
      CHECK(allclose(y, softmax(x)).item<bool>());
      CHECK(array_equal(ones, sum(softmax(x))).item<bool>());
    }
  }
}

TEST_CASE("test irregular binary ops") {
  // Element-wise add where both operands are strided 1D slices. The slices
  // use different steps (4 and 2) but both end up with 32 elements.
  {
    auto lhs = slice(full({128}, 1.0f), {0}, {128}, {4});
    auto rhs = slice(full({64}, 1.0f), {0}, {64}, {2});
    auto twos = full({32}, 2.0f);
    CHECK(array_equal(add(lhs, rhs), twos).item<bool>());
  }

  // Binary ops between a 32x32 matrix and an operand that has to be
  // broadcast: first a length-32 vector, then the same data as a column.
  {
    auto mat = full({32, 32}, 4.0f);
    auto vec = full({32}, 4.0f);
    auto eights = full({32, 32}, 8.0f);
    CHECK(array_equal(add(mat, vec), eights).item<bool>());

    auto col = reshape(vec, {32, 1});
    CHECK(array_equal(add(mat, col), eights).item<bool>());
    CHECK(array_equal(subtract(col, mat), zeros({32, 32})).item<bool>());
  }
}

TEST_CASE("test arithmetic unary ops") {
  // Exercises the element-wise unary ops one section at a time. Most sections
  // follow the same pattern: scalar values, an empty array, an integer input
  // (checking the result dtype / value), and inputs that are not plain
  // contiguous buffers (a broadcast scalar and one half of a column split).

  // Test negative
  {
    array x(1.0f);
    CHECK_EQ(negative(x).item<float>(), -1.0f);
    CHECK_EQ((-x).item<float>(), -1.0f);

    // works on empty array
    CHECK(array_equal(-array({}), array({})).item<bool>());

    // Throws on bool
    CHECK_THROWS(negative(array(true)));
  }

  // Test logical not: the result is always bool_, whatever the input dtype
  {
    array x(false);
    CHECK_EQ(logical_not(x).item<bool>(), true);

    x = array(1.0f);
    auto y = logical_not(x);
    CHECK_EQ(y.dtype(), bool_);
    CHECK_EQ(y.item<bool>(), false);

    x = array(0);
    y = logical_not(x);
    CHECK_EQ(y.dtype(), bool_);
    CHECK_EQ(y.item<bool>(), true);
  }

  // Test logical and: mixed int / float operands still produce bool_
  {
    array x(true);
    array y(true);
    CHECK_EQ(logical_and(x, y).item<bool>(), true);

    x = array(1.0f);
    y = array(1.0f);
    auto z = logical_and(x, y);
    CHECK_EQ(z.dtype(), bool_);
    CHECK_EQ(z.item<bool>(), true);

    x = array(0);
    y = array(1.0f);
    z = logical_and(x, y);
    CHECK_EQ(z.dtype(), bool_);
    CHECK_EQ(z.item<bool>(), false);
  }

  // Test logical or: mixed int / float operands still produce bool_
  {
    array x(false);
    array y(false);
    CHECK_EQ(logical_or(x, y).item<bool>(), false);

    x = array(1.0f);
    y = array(1.0f);
    auto z = logical_or(x, y);
    CHECK_EQ(z.dtype(), bool_);
    CHECK_EQ(z.item<bool>(), true);

    x = array(0);
    y = array(1.0f);
    z = logical_or(x, y);
    CHECK_EQ(z.dtype(), bool_);
    CHECK_EQ(z.item<bool>(), true);
  }

  // Test abs
  {
    array x({-1.0f, 0.0f, 1.0f});
    CHECK(array_equal(abs(x), array({1.0f, 0.0f, 1.0f})).item<bool>());

    // works on empty array
    CHECK(array_equal(abs(array({})), array({})).item<bool>());

    // int32
    x = array({-1, 0, 1});
    CHECK(array_equal(abs(x), array({1, 0, 1})).item<bool>());

    // uint32
    x = array({1u, 0u, 1u});
    CHECK(array_equal(abs(x), array({1u, 0u, 1u})).item<bool>());

    // bool
    x = array({false, true});
    CHECK(array_equal(abs(x), array({false, true})).item<bool>());
  }

  // Test sign: every input below is already in {-1, 0, 1} (or unsigned /
  // bool), so sign(x) is expected to equal x itself.
  {
    array x({-1.0f, 0.0f, 1.0f});
    CHECK(array_equal(sign(x), x).item<bool>());

    // works on empty array
    CHECK(array_equal(sign(array({})), array({})).item<bool>());

    // int32
    x = array({-1, 0, 1});
    CHECK(array_equal(sign(x), x).item<bool>());

    // uint32
    x = array({1u, 0u, 1u});
    CHECK(array_equal(sign(x), x).item<bool>());

    // bool
    x = array({false, true});
    CHECK(array_equal(sign(x), x).item<bool>());

    // uint64: large non-zero values, several with the top bit set, must all
    // map to 1. The same check is repeated for arrays of 4, 3, 2 and 1
    // elements (presumably to cover different element-count code paths --
    // TODO confirm).
    array x_uint64(
        {uint64_t(0xa11cc311cb6acd70),
         uint64_t(0x7a375ac3ebb533f3),
         uint64_t(0x734969adf9d7190c),
         uint64_t(0xb400515a4f673424)});
    array expected(
        {uint64_t(0x0000000000000001),
         uint64_t(0x0000000000000001),
         uint64_t(0x0000000000000001),
         uint64_t(0x0000000000000001)});
    CHECK(array_equal(sign(x_uint64), expected).item<bool>());

    x_uint64 = array(
        {uint64_t(0xa11cc311cb6acd70),
         uint64_t(0x7a375ac3ebb533f3),
         uint64_t(0x734969adf9d7190c)});
    expected = array(
        {uint64_t(0x0000000000000001),
         uint64_t(0x0000000000000001),
         uint64_t(0x0000000000000001)});
    CHECK(array_equal(sign(x_uint64), expected).item<bool>());

    x_uint64 =
        array({uint64_t(0xa11cc311cb6acd70), uint64_t(0x7a375ac3ebb533f3)});
    expected =
        array({uint64_t(0x0000000000000001), uint64_t(0x0000000000000001)});
    CHECK(array_equal(sign(x_uint64), expected).item<bool>());

    x_uint64 = array({uint64_t(0xa11cc311cb6acd70)});
    expected = array({uint64_t(0x0000000000000001)});
    CHECK(array_equal(sign(x_uint64), expected).item<bool>());

    // Boundary values: all bits set, and exactly one
    x_uint64 = array({uint64_t(0xffffffffffffffff)});
    expected = array({uint64_t(0x0000000000000001)});
    CHECK(array_equal(sign(x_uint64), expected).item<bool>());

    x_uint64 = array({uint64_t(0x0000000000000001)});
    expected = array({uint64_t(0x0000000000000001)});
    CHECK(array_equal(sign(x_uint64), expected).item<bool>());
  }

  // Shared by the floor/ceil, exp and log sections below
  constexpr float neginf = -std::numeric_limits<float>::infinity();

  // Test floor and ceil
  {
    array x(1.0f);
    CHECK_EQ(floor(x).item<float>(), 1.0f);
    CHECK_EQ(ceil(x).item<float>(), 1.0f);

    x = array(1.5f);
    CHECK_EQ(floor(x).item<float>(), 1.0f);
    CHECK_EQ(ceil(x).item<float>(), 2.0f);

    x = array(-1.5f);
    CHECK_EQ(floor(x).item<float>(), -2.0f);
    CHECK_EQ(ceil(x).item<float>(), -1.0f);

    // -inf passes through unchanged
    x = array(neginf);
    CHECK_EQ(floor(x).item<float>(), neginf);
    CHECK_EQ(ceil(x).item<float>(), neginf);

    // Complex inputs are rejected
    x = array(std::complex<float>(1.0f, 1.0f));
    CHECK_THROWS_AS(floor(x), std::invalid_argument);
    CHECK_THROWS_AS(ceil(x), std::invalid_argument);
  }

  // Test round
  {
    // The .5 inputs are expected to round to the nearest even integer
    array x({0.5, -0.5, 1.5, -1.5, 2.3, 2.6});
    CHECK(array_equal(round(x), array({0, -0, 2, -2, 2, 3})).item<bool>());

    // A decimals argument of -1 rounds integers to the nearest ten
    x = array({11, 222, 32});
    CHECK(array_equal(round(x, -1), array({10, 220, 30})).item<bool>());
  }

  // Test exponential
  {
    array x(0.0);
    CHECK_EQ(exp(x).item<float>(), 1.0);

    x = array(2.0);
    CHECK_EQ(exp(x).item<float>(), doctest::Approx(std::exp(2.0f)));

    CHECK(array_equal(exp(array({})), array({})).item<bool>());

    x = array(neginf);
    CHECK_EQ(exp(x).item<float>(), doctest::Approx(0.0f));

    // Integer input type
    x = array(2);
    CHECK_EQ(x.dtype(), int32);
    CHECK_EQ(exp(x).item<float>(), doctest::Approx(std::exp(2.0f)));

    // Input is irregularly strided
    x = broadcast_to(array(1.0f), {2, 2, 2});
    CHECK(allclose(exp(x), full({2, 2, 2}, std::exp(1.0f))).item<bool>());

    x = split(array({0.0f, 1.0f, 2.0f, 3.0f}, {2, 2}), 2, 1)[0];
    auto expected = array({std::exp(0.0f), std::exp(2.0f)}, {2, 1});
    CHECK(allclose(exp(x), expected).item<bool>());

    // Complex of -inf
    constexpr float inf = std::numeric_limits<float>::infinity();
    x = array(complex64_t{-inf, -inf});
    CHECK_EQ(exp(x).item<complex64_t>(), complex64_t{0, 0});
  }

  // Test expm1
  {
    array x(-1.0f);
    CHECK_EQ(expm1(x).item<float>(), doctest::Approx(std::expm1(-1.0f)));

    x = array(1.0f);
    CHECK_EQ(expm1(x).item<float>(), doctest::Approx(std::expm1(1.0f)));

    // Integer input type
    x = array(1);
    CHECK_EQ(expm1(x).dtype(), float32);
    CHECK_EQ(expm1(x).item<float>(), doctest::Approx(std::expm1(1.0f)));
  }

  // Test sine
  {
    array x(0.0);
    CHECK_EQ(sin(x).item<float>(), 0.0);

    x = array(M_PI_2);
    CHECK(sin(x).item<float>() == doctest::Approx(std::sin(M_PI_2)));

    CHECK(array_equal(sin(array({})), array({})).item<bool>());

    // Integer input type
    x = array(0);
    CHECK_EQ(x.dtype(), int32);
    CHECK_EQ(sin(x).item<float>(), std::sin(0.0f));

    // Input is irregularly strided
    x = broadcast_to(array(1.0f), {2, 2, 2});
    CHECK(allclose(sin(x), full({2, 2, 2}, std::sin(1.0f))).item<bool>());

    x = split(array({0.0f, 1.0f, 2.0f, 3.0f}, {2, 2}), 2, 1)[0];
    auto expected = array({std::sin(0.0f), std::sin(2.0f)}, {2, 1});
    CHECK(allclose(sin(x), expected).item<bool>());
  }

  // Test cos
  {
    array x(0.0);
    CHECK_EQ(cos(x).item<float>(), doctest::Approx(1.0));

    x = array(M_PI_2);
    CHECK(cos(x).item<float>() == doctest::Approx(std::cos(M_PI_2)));

    CHECK(array_equal(cos(array({})), array({})).item<bool>());

    // Integer input type
    x = array(0);
    CHECK_EQ(x.dtype(), int32);
    CHECK(cos(x).item<float>() == doctest::Approx(std::cos(0.0f)));

    // Input is irregularly strided
    x = broadcast_to(array(1.0f), {2, 2, 2});
    CHECK(allclose(cos(x), full({2, 2, 2}, std::cos(1.0f))).item<bool>());

    x = split(array({0.0f, 1.0f, 2.0f, 3.0f}, {2, 2}), 2, 1)[0];
    auto expected = array({std::cos(0.0f), std::cos(2.0f)}, {2, 1});
    CHECK(allclose(cos(x), expected).item<bool>());
  }

  // Test degrees
  {
    array x(0.0);
    CHECK_EQ(degrees(x).item<float>(), 0.0);

    x = array(M_PI_2);
    CHECK(degrees(x).item<float>() == doctest::Approx(90.0));

    CHECK(array_equal(degrees(array({})), array({})).item<bool>());

    // Integer input type
    x = array(0);
    CHECK_EQ(x.dtype(), int32);
    CHECK_EQ(degrees(x).item<float>(), 0.0);

    // Input is irregularly strided
    x = broadcast_to(array(M_PI_2), {2, 2, 2});
    CHECK(allclose(degrees(x), full({2, 2, 2}, 90.0)).item<bool>());

    // First column of the 2x2 angle matrix holds 0 and pi
    float angles[] = {0.0f, M_PI_2, M_PI, 3.0f * M_PI_2};
    x = split(array(angles, {2, 2}), 2, 1)[0];
    auto expected = array({0.0f, 180.0f}, {2, 1});
    CHECK(allclose(degrees(x), expected).item<bool>());
  }

  // Test radians
  {
    array x(0.0);
    CHECK_EQ(radians(x).item<float>(), 0.0);

    x = array(90.0);
    CHECK(radians(x).item<float>() == doctest::Approx(M_PI_2));

    CHECK(array_equal(radians(array({})), array({})).item<bool>());

    // Integer input type
    x = array(90);
    CHECK_EQ(x.dtype(), int32);
    CHECK(radians(x).item<float>() == doctest::Approx(M_PI_2));

    // Input is irregularly strided
    x = broadcast_to(array(90.0f), {2, 2, 2});
    CHECK(allclose(radians(x), full({2, 2, 2}, M_PI_2)).item<bool>());

    // First column of the 2x2 degree matrix holds 0 and 180
    x = split(array({0.0f, 90.0f, 180.0f, 270.0f}, {2, 2}), 2, 1)[0];
    float angles[] = {0.0f, M_PI};
    auto expected = array(angles, {2, 1});
    CHECK(allclose(radians(x), expected).item<bool>());
  }

  // Test log
  {
    array x(0.0);
    CHECK_EQ(log(x).item<float>(), neginf);

    // The reference values are spelled std::log so they cannot be confused
    // with the array op of the same name that is under test.
    x = array(1.0);
    CHECK_EQ(log(x).item<float>(), std::log(1.0f));

    // Integer input type
    x = array(1);
    CHECK_EQ(log(x).dtype(), float32);
    CHECK_EQ(log(x).item<float>(), std::log(1.0f));

    // Input is irregularly strided
    x = broadcast_to(array(1.0f), {2, 2, 2});
    CHECK(array_equal(log(x), full({2, 2, 2}, std::log(1.0f))).item<bool>());

    x = split(array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2}), 2, 1)[0];
    auto expected = array({std::log(1.0f), std::log(3.0f)}, {2, 1});
    CHECK(array_equal(log(x), expected).item<bool>());
  }

  // Test log2
  {
    array x(0.0);
    CHECK_EQ(log2(x).item<float>(), neginf);

    x = array(1.0);
    CHECK_EQ(log2(x).item<float>(), 0.0f);

    x = array(1024.0f);
    CHECK_EQ(log2(x).item<float>(), 10.0f);
  }

  // Test log10
  {
    array x(0.0);
    CHECK_EQ(log10(x).item<float>(), neginf);

    x = array(1.0);
    CHECK_EQ(log10(x).item<float>(), 0.0f);

    x = array(1000.0f);
    CHECK_EQ(log10(x).item<float>(), 3.0f);
  }

  // Test log1p
  {
    array x(-1.0f);
    CHECK_EQ(log1p(x).item<float>(), neginf);

    x = array(1.0f);
    CHECK_EQ(log1p(x).item<float>(), std::log1pf(1.0f));

    // Integer input type
    x = array(1);
    CHECK_EQ(log1p(x).dtype(), float32);
    CHECK_EQ(log1p(x).item<float>(), std::log1pf(1.0f));

    // Input is irregularly strided
    x = broadcast_to(array(1.0f), {2, 2, 2});
    CHECK(
        array_equal(log1p(x), full({2, 2, 2}, std::log1pf(1.0f))).item<bool>());

    x = split(array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2}), 2, 1)[0];
    auto expected = array({std::log1pf(1.0f), std::log1pf(3.0f)}, {2, 1});
    CHECK(array_equal(log1p(x), expected).item<bool>());
  }

  // Test sigmoid
  {
    array x(0.0);
    CHECK_EQ(sigmoid(x).item<float>(), 0.5f);

    // Integer input type
    x = array(0);
    CHECK_EQ(sigmoid(x).dtype(), float32);
    CHECK_EQ(sigmoid(x).item<float>(), 0.5f);

    // Saturates to exactly 1 / 0 at +/- infinity
    constexpr auto inf = std::numeric_limits<float>::infinity();
    x = array(inf);
    CHECK_EQ(sigmoid(x).item<float>(), 1.0f);
    x = array(-inf);
    CHECK_EQ(sigmoid(x).item<float>(), 0.0f);
  }

  // Test square
  {
    array x(3.0);
    CHECK_EQ(square(x).item<float>(), 9.0);

    // Integer input is read back as an integer
    x = array(2);
    CHECK_EQ(square(x).item<int>(), 4);

    x = full({3, 3}, 2.0f);
    CHECK(array_equal(square(x), full({3, 3}, 4.0f)).item<bool>());
  }

  // Test sqrt and rsqrt
  {
    array x(4.0);
    CHECK_EQ(sqrt(x).item<float>(), 2.0);
    CHECK_EQ(rsqrt(x).item<float>(), 0.5);

    x = full({3, 3}, 9.0f);
    CHECK(array_equal(sqrt(x), full({3, 3}, 3.0f)).item<bool>());

    // Integer input is read back as float
    x = array(4, int32);
    CHECK_EQ(sqrt(x).item<float>(), 2.0f);
    CHECK_EQ(rsqrt(x).item<float>(), 0.5f);
  }

  // Test reciprocal
  {
    array x(8.0);
    CHECK_EQ(reciprocal(x).item<float>(), 0.125f);

    // Integer input produces a float32 result
    x = array(2);
    auto out = reciprocal(x);
    CHECK_EQ(out.dtype(), float32);
    CHECK_EQ(out.item<float>(), 0.5f);

    x = full({3, 3}, 2.0f);
    CHECK(array_equal(reciprocal(x), full({3, 3}, 0.5f)).item<bool>());
  }
}

TEST_CASE("test error functions") {
  // Covers erf and erfinv: special values (0, +/-inf, +/-1), NaN for erfinv
  // inputs outside [-1, 1], result dtypes for int32 / float16 / bfloat16
  // inputs, and agreement with SciPy reference values.
  constexpr float inf = std::numeric_limits<float>::infinity();
  array x(0.0f);
  CHECK_EQ(erf(x).item<float>(), 0.0f);
  x = array(inf);
  CHECK_EQ(erf(x).item<float>(), 1.0f);
  x = array(-inf);
  CHECK_EQ(erf(x).item<float>(), -1.0f);

  // Integer input produces a float32 result
  x = array(1, int32);
  CHECK_EQ(erf(x).dtype(), float32);

  x = array(0.0f);
  CHECK_EQ(erfinv(x).item<float>(), 0.0f);
  x = array(1.0f);
  CHECK_EQ(erfinv(x).item<float>(), inf);
  x = array(-1.0f);
  CHECK_EQ(erfinv(x).item<float>(), -inf);

  // Integer input produces a float32 result
  x = array(1, int32);
  CHECK_EQ(erfinv(x).dtype(), float32);

  // Inputs outside [-1, 1] give NaN
  x = array(2.0f);
  CHECK(std::isnan(erfinv(x).item<float>()));
  x = array(-2.0f);
  CHECK(std::isnan(erfinv(x).item<float>()));

  auto vals = {0.9f, 0.5f, 0.1f, -0.1f, -0.5f, -0.9f};
  // Expected values are generated from scipy's error function:
  //   python -c "import scipy.special as ss;
  //   vals = [0.9, 0.5, 0.1, -0.1, -0.5, -0.9];
  //   print([ss.erf(x) for x in vals])"
  {
    auto expected = {
        0.7969082124228322,
        0.5204998778130465,
        0.1124629160182849,
        -0.1124629160182849,
        -0.5204998778130465,
        -0.7969082124228322};
    // size_t index: initializer_list::size() is unsigned
    for (size_t i = 0; i < vals.size(); ++i) {
      x = array(vals.begin()[i]);
      CHECK_EQ(erf(x).item<float>(), doctest::Approx(expected.begin()[i]));
    }
  }

  // Expected values are generated from scipy's inverse error function:
  //   python -c "import scipy.special as ss;
  //   vals = [0.9, 0.5, 0.1, -0.1, -0.5, -0.9];
  //   print([ss.erfinv(x) for x in vals])"
  {
    auto expected = {
        1.1630871536766738,
        0.4769362762044699,
        0.08885599049425778,
        -0.08885599049425769,
        -0.4769362762044699,
        -1.1630871536766743};
    for (size_t i = 0; i < vals.size(); ++i) {
      x = array(vals.begin()[i]);
      CHECK_EQ(erfinv(x).item<float>(), doctest::Approx(expected.begin()[i]));
    }
  }

  // float16_t
  {
    array x(0.0f, float16);
    auto out = erf(x);
    CHECK_EQ(out.dtype(), float16);
    CHECK_EQ(out.item<float16_t>(), 0.0f);

    out = erfinv(x);
    CHECK_EQ(out.dtype(), float16);
    CHECK_EQ(out.item<float16_t>(), 0.0f);
  }

  // bfloat
  {
    array x(0.0f, bfloat16);
    auto out = erf(x);
    CHECK_EQ(out.dtype(), bfloat16);
    CHECK_EQ(out.item<bfloat16_t>(), 0.0f);

    // Read the result with the element type that matches its dtype; reading
    // a bfloat16 value as float16_t would only look right for zero.
    out = erfinv(x);
    CHECK_EQ(out.dtype(), bfloat16);
    CHECK_EQ(out.item<bfloat16_t>(), 0.0f);
  }
}

TEST_CASE("test arithmetic binary ops") {
  // Scalar addition through the named op and through operator+
  auto lhs = array(1.0);
  auto rhs = array(1.0);
  auto total = add(lhs, rhs);
  CHECK_EQ(total.item<float>(), 2.0);
  total = lhs + rhs;
  CHECK_EQ(total.item<float>(), 2.0);
  total = add(total, lhs);
  CHECK_EQ(total.item<float>(), 3.0);
  total.eval(); // No-op
  CHECK_EQ(total.item<float>(), 3.0);

  // Chain a few adds: start from 1 and add 1 ten more times
  auto chained = lhs;
  for (int step = 1; step <= 10; ++step) {
    chained = add(chained, lhs);
  }
  CHECK_EQ(chained.item<float>(), 11.0);

  // Works for different shapes
  {
    auto row_a = array({1.0, 2.0, 3.0}, {1, 3});
    auto row_b = array({1.0, 2.0, 3.0}, {1, 3});
    auto doubled = add(row_a, row_b);
    CHECK_EQ(doubled.shape(), Shape{1, 3});
    CHECK(array_equal(doubled, array({2.0, 4.0, 6.0}, {1, 3})).item<bool>());
  }

  // Works with scalars on either side of the operator
  {
    auto row = array({1.0, 2.0, 3.0}, {1, 3});
    auto shifted = row + 2.0;
    CHECK_EQ(shifted.dtype(), float32);
    CHECK(array_equal(shifted, array({3.0, 4.0, 5.0}, {1, 3})).item<bool>());
    shifted = 2.0 + row;
    CHECK_EQ(shifted.dtype(), float32);
    CHECK(array_equal(shifted, array({3.0, 4.0, 5.0}, {1, 3})).item<bool>());

    // Check type promotion: an int scalar does not change a float array...
    shifted = 2 + row;
    CHECK_EQ(shifted.dtype(), float32);

    // ...and a float scalar turns an int array into float32
    shifted = 2.0 + array({1, 2, 3});
    CHECK_EQ(shifted.dtype(), float32);
    CHECK(array_equal(shifted, array({3.0, 4.0, 5.0})).item<bool>());
  }

  // Broadcasting works
  {
    auto ones_view = broadcast_to(array({1.0}), {10});
    auto twos_view = broadcast_to(array({2.0}), {10});
    auto threes = add(ones_view, twos_view);
    CHECK(array_equal(threes, full({10}, 3.0)).item<bool>());
  }
  {
    auto row = array({1.0, 2.0}, {1, 2});
    auto col = array({1.0, 2.0}, {2, 1});
    auto grid = add(row, col);
    CHECK_EQ(grid.shape(), Shape{2, 2});
    CHECK(array_equal(grid, array({2.0, 3.0, 3.0, 4.0}, {2, 2})).item<bool>());
  }
  {
    auto unit = ones({3, 2, 1});
    auto bumped = unit + 2.0;
    CHECK_EQ(bumped.shape(), Shape{3, 2, 1});
    CHECK(array_equal(
              bumped, array({3.0, 3.0, 3.0, 3.0, 3.0, 3.0}, {3, 2, 1}))
              .item<bool>());
  }

  // Works for empty arrays
  {
    auto empty_a = array({});
    auto empty_b = array({});
    auto empty_sum = empty_a + empty_b;
    empty_sum.eval();
    CHECK_EQ(empty_sum.size(), 0);
    CHECK_EQ(empty_sum.shape(), Shape{0});
  }

  // Check subtraction
  {
    auto minuend = array({3, 2, 1});
    auto subtrahend = array({1, 1, 1});
    CHECK(array_equal(minuend - subtrahend, array({2, 1, 0})).item<bool>());
  }

  // Check multiplication
  {
    auto values = array({1, 2, 3});
    auto factor = array({2, 2, 2});
    CHECK(array_equal(values * factor, array({2, 4, 6})).item<bool>());
  }

  // Check division: results are read back as float for int and bool inputs
  CHECK_EQ(divide(array(1), array(1)).item<float>(), 1.0f);
  CHECK_EQ(divide(array(1), array(0.5)).item<float>(), 2.0f);
  CHECK_EQ(divide(array(1), array(4)).item<float>(), 0.25f);
  CHECK_EQ(divide(array(true), array(true)).item<float>(), 1.0f);
  CHECK_EQ(divide(array(false), array(true)).item<float>(), 0.0f);
  // true / false is infinite, false / false is NaN
  CHECK(std::isinf(divide(array(true), array(false)).item<float>()));
  CHECK(std::isnan(divide(array(false), array(false)).item<float>()));

  // Check maximum and minimum
  {
    auto one = array(1.0f);
    auto other = array(0.0f);
    CHECK_EQ(maximum(one, other).item<float>(), 1.0f);
    CHECK_EQ(minimum(one, other).item<float>(), 0.0f);
    other = array(2.0f);
    CHECK_EQ(maximum(one, other).item<float>(), 2.0f);
    CHECK_EQ(minimum(one, other).item<float>(), 1.0f);
  }

  // Check logaddexp
  CHECK_EQ(logaddexp(array(0.0f), array(0.0f)).item<float>(), std::log(2.0f));

  // Unsigned inputs with a very large gap between the operands
  CHECK_EQ(logaddexp(array(0u), array(10000u)).item<float>(), 10000.0f);

  // Every combination of infinite operands
  constexpr float inf = std::numeric_limits<float>::infinity();
  CHECK_EQ(logaddexp(array(inf), array(3.0f)).item<float>(), inf);
  CHECK_EQ(logaddexp(array(-inf), array(3.0f)).item<float>(), 3.0f);
  CHECK_EQ(logaddexp(array(-inf), array(-inf)).item<float>(), -inf);
  CHECK_EQ(logaddexp(array(inf), array(inf)).item<float>(), inf);
  CHECK_EQ(logaddexp(array(-inf), array(inf)).item<float>(), inf);

  // Complex operand paired with a complex -inf leaves the first unchanged
  {
    auto finite = array(complex64_t{1, 1});
    auto vanishing = array(complex64_t{-inf, -inf});
    CHECK_EQ(
        logaddexp(finite, vanishing).item<complex64_t>(), complex64_t{1, 1});
  }
}

TEST_CASE("test broadcast") {
  // Part 1 checks broadcast_shapes on pairs of shapes (including zero-sized
  // dimensions and incompatible pairs that must throw). Part 2 checks
  // broadcast_to on actual arrays, including the strides of the result.

  auto s = broadcast_shapes({1}, {1, 2});
  CHECK_EQ(s, Shape{1, 2});

  s = broadcast_shapes({1, 2}, {1});
  CHECK_EQ(s, Shape{1, 2});

  s = broadcast_shapes({2, 2}, {});
  CHECK_EQ(s, Shape{2, 2});

  s = broadcast_shapes({}, {1, 1});
  CHECK_EQ(s, Shape{1, 1});

  s = broadcast_shapes({1, 2, 1}, {2});
  CHECK_EQ(s, Shape{1, 2, 2});

  s = broadcast_shapes({2}, {1, 2, 1});
  CHECK_EQ(s, Shape{1, 2, 2});

  s = broadcast_shapes({2, 2, 2}, {1, 2, 1});
  CHECK_EQ(s, Shape{2, 2, 2});

  s = broadcast_shapes({2, 2, 2, 1}, {1, 2, 1});
  CHECK_EQ(s, Shape{2, 2, 2, 1});

  // Zero-sized dimensions
  s = broadcast_shapes({0}, {0, 0});
  CHECK_EQ(s, Shape{0, 0});

  CHECK_EQ(broadcast_shapes({}, {0}), Shape{0});

  s = broadcast_shapes({5, 0}, {0, 5, 0});
  CHECK_EQ(s, Shape{0, 5, 0});

  // A dimension of 1 broadcasts against a dimension of 0 and yields 0. The
  // scalar and {1} cases are checked with the arguments in both orders.
  CHECK_EQ(broadcast_shapes({0}, {}), Shape{0});
  CHECK_EQ(broadcast_shapes({1}, {0}), Shape{0});
  CHECK_EQ(broadcast_shapes({0}, {1}), Shape{0});
  CHECK_EQ(broadcast_shapes({1}, {0, 0}), Shape{0, 0});
  CHECK_EQ(broadcast_shapes({1, 1}, {0}), Shape{1, 0});
  CHECK_EQ(broadcast_shapes({1, 1}, {0, 0}), Shape{0, 0});
  CHECK_EQ(broadcast_shapes({2, 1}, {1, 0}), Shape{2, 0});
  CHECK_EQ(broadcast_shapes({2, 1}, {2, 0}), Shape{2, 0});
  CHECK_EQ(broadcast_shapes({2, 1}, {1, 2, 0}), Shape{1, 2, 0});
  // ...but any other size cannot broadcast against 0
  CHECK_THROWS_AS(broadcast_shapes({2}, {0}), std::invalid_argument);
  CHECK_THROWS_AS(broadcast_shapes({2, 1}, {0, 0}), std::invalid_argument);

  // Mismatched non-unit dimensions
  CHECK_THROWS_AS(broadcast_shapes({3}, {2}), std::invalid_argument);
  CHECK_THROWS_AS(broadcast_shapes({1, 3}, {2}), std::invalid_argument);
  CHECK_THROWS_AS(broadcast_shapes({3}, {1, 2}), std::invalid_argument);
  CHECK_THROWS_AS(
      broadcast_shapes({1, 3, 2}, {1, 2, 2}), std::invalid_argument);

  // broadcast_to on arrays
  auto x = full({1, 1}, 2.3f);
  CHECK_EQ(broadcast_to(x, {1, 1}).item<float>(), 2.3f);

  // Broadcast dimensions get a stride of 0 once evaluated
  x = broadcast_to(x, {5, 1});
  CHECK_EQ(x.shape(), Shape{5, 1});
  x.eval();
  CHECK_EQ(x.strides(), Strides{0, 0});

  // A {5, 1} array cannot become {1, 5}, but it can become {5, 5}
  CHECK_THROWS_AS(broadcast_to(x, {1, 5}), std::invalid_argument);
  x = broadcast_to(x, {5, 5});
  CHECK_EQ(x.shape(), Shape{5, 5});

  x = zeros({2, 1, 2});
  x = broadcast_to(x, {4, 2, 1, 2});
  CHECK_EQ(x.shape(), Shape{4, 2, 1, 2});
  x.eval();
  CHECK_EQ(x.strides(), Strides{0, 2, 0, 1});

  // Broadcast on empty arrays works as expected
  x = array({});
  CHECK_THROWS_AS(broadcast_to(x, {1}), std::invalid_argument);

  // Broadcast to empty array works as expected
  x = array({1});
  auto y = broadcast_to(x, {0});
  eval(y);
  CHECK_EQ(y.size(), 0);
  CHECK_EQ(y.shape(), Shape{0});

  x = array({1, 2}, {2, 1});
  y = broadcast_to(x, {2, 0});
  eval(y);
  CHECK_EQ(y.size(), 0);
  CHECK_EQ(y.shape(), Shape{2, 0});

  // Check repeat application works
  x = zeros({2});
  x = broadcast_to(broadcast_to(x, {2, 2}), {2, 2});
  CHECK_EQ(x.shape(), Shape{2, 2});
  x.eval();
  CHECK_EQ(x.strides(), Strides{0, 1});
  x = broadcast_to(broadcast_to(x, {2, 2}), {2, 2, 2});
  CHECK_EQ(x.shape(), Shape{2, 2, 2});
  x.eval();
  CHECK_EQ(x.strides(), Strides{0, 0, 1});

  // Broadcast on transposed array works
  x = array({0, 1, 2, 3, 4, 5}, {2, 3});
  x = broadcast_to(transpose(x), {2, 3, 2});
  CHECK_EQ(x.shape(), Shape{2, 3, 2});
  y = broadcast_to(array({0, 3, 1, 4, 2, 5}, {3, 2}), {2, 3, 2});
  CHECK(array_equal(x, y).item<bool>());

  // Reshape on broadcasted array works
  x = array(1.0);
  x = broadcast_to(x, {2});
  x = reshape(x, {1, 2});
  CHECK(array_equal(x, ones({1, 2})).item<bool>());
}

TEST_CASE("test gather") {
  // ---- Invalid arguments: every call in this section must throw ----

  // Empty input, non-empty indices/slice
  CHECK_THROWS(gather(array({}), array({1}), 0, {1}));

  // More indices than dimensions
  CHECK_THROWS(gather(array(0), array({1}), 0, {1}));

  // Mismatch dimensions and indices
  CHECK_THROWS(gather(array({0}), {array({0})}, {0, 1}, {1}));
  CHECK_THROWS(gather(array({0}), array({0}), -1, {1}));

  // Repeat dimensions
  CHECK_THROWS(
      gather(array({0}, {1, 1}), {array({0}), array({0})}, {0, 0}, {1, 1}));

  // Slice sizes incorrect
  CHECK_THROWS(gather(array({0}), array({0}), 0, {2}));
  CHECK_THROWS(gather(array({0}), array({0}), 0, {0, 0}));
  CHECK_THROWS(gather(array({0}), array({0}), 0, {-1}));

  // Wrong index type
  CHECK_THROWS(gather(array({0}), array({0.0f}), 0, {0}));
  CHECK_THROWS(
      gather(array({0}, {1, 1}), {array({0}), array({0.0f})}, {0, 1}, {1, 1}));

  // Index arrays must be broadcastable
  CHECK_THROWS(gather(
      array({0}, {1, 1}),
      {array({0, 0, 0}, {3}), array({0, 0}, {2})},
      {0, 1},
      {1, 1}));

  // ---- Valid gathers ----

  // Basic test of correctness with 1D input
  {
    auto src = arange(20);
    auto idx = arange(10);
    auto picked = gather(src, idx, 0, {1});
    CHECK_EQ(picked.shape(), Shape{10, 1});
    CHECK(array_equal(reshape(picked, {-1}), idx).item<bool>());

    // A single unsigned index
    auto single = gather(src, array({15}, uint32), 0, {1});
    CHECK_EQ(single.shape(), Shape{1, 1});
    CHECK_EQ(single.item<int32_t>(), 15);

    // No index gather works: the result is just the leading slice
    auto leading = gather(src, {}, std::vector<int>{}, {10});
    CHECK_EQ(leading.shape(), Shape{10});
    CHECK(array_equal(leading, arange(10)).item<bool>());
  }

  // Basic test of correctness with 2D input
  {
    // Two full rows of a 4x32 matrix
    auto src = reshape(arange(128), {4, 32});
    auto rows = array({0, 1}, uint32);
    auto picked = gather(src, rows, 0, {1, 32});
    CHECK_EQ(picked.shape(), Shape{2, 1, 32});
    CHECK(array_equal(reshape(picked, {64}), arange(64)).item<bool>());

    // First column of the same data viewed as 64x2
    src = reshape(src, {64, 2});
    auto first = array({0}, uint32);
    picked = gather(src, first, 0, {64, 1});
    CHECK_EQ(picked.shape(), Shape{1, 64, 1});
    auto every_other = reshape(arange(0, 128, 2), {1, 64, 1});
    CHECK(array_equal(picked, every_other).item<bool>());
  }

  // Basic test of correctness with 3D input
  {
    auto src = reshape(arange(256), {8, 4, 8});
    auto first = array({0}, uint32);
    auto picked = gather(src, first, 0, {8, 1, 1});
    CHECK_EQ(picked.shape(), Shape{1, 8, 1, 1});
    auto every_32nd = reshape(arange(0, 256, 32), {1, 8, 1, 1});
    CHECK(array_equal(picked, every_32nd).item<bool>());
  }

  // Source is a broadcast of {1, 2} to 20 rows: index along one axis, then
  // along both axes at once
  {
    auto tiled = broadcast_to(array({1, 2}), {20, 2});
    auto picked = gather(tiled, array({5}), 0, {1, 1});
    CHECK_EQ(picked.item<int>(), 1);
    picked = gather(tiled, {array({5}), array({1})}, {0, 1}, {1, 1});
    CHECK_EQ(picked.item<int>(), 2);
  }
}

TEST_CASE("test take") {
  // Covers take(): empty inputs, single/multiple row selection, a middle
  // axis, broadcast (irregularly strided) indices, scalar indices, argument
  // validation, negative indices, and output shapes.

  // Empty takes: the result has the shape of the (empty) index array
  auto empty = astype(array({}), int32);
  auto z = take(array({1}), empty);
  CHECK_EQ(z.shape(), Shape{0});
  empty = reshape(empty, {1, 0, 1});
  z = take(array({1}), empty);
  CHECK_EQ(z.shape(), Shape{1, 0, 1});

  // A scalar index into an empty array must throw
  CHECK_THROWS(take(array({}), array(1)));

  // Empty indices into an empty array yield an empty result
  z = take(array({}), empty);
  CHECK_EQ(z.size(), 0);

  // Take a single row
  auto x = reshape(arange(256), {8, 4, 8});
  z = take(x, array({0}, uint32), 0);
  CHECK_EQ(z.shape(), Shape{1, 4, 8});
  z = reshape(z, {32});
  CHECK(array_equal(z, arange(32)).item<bool>());

  z = take(x, array({1}, uint32), 0);
  z = reshape(z, {32});
  CHECK(array_equal(z, arange(32, 64)).item<bool>());

  // Take multiple rows
  x = arange(256);
  x = reshape(x, {8, 4, 8});
  z = take(x, array({0, 1}, uint32), 0);
  z = reshape(z, {64});
  CHECK(array_equal(z, arange(64)).item<bool>());

  // Take along middle axis
  x = reshape(arange(8), {2, 2, 2});
  z = take(x, array({0}), 1);
  CHECK(array_equal(z, array({0, 1, 4, 5}, {2, 1, 2})).item<bool>());

  // Irregular strides test: the index array is a broadcast scalar
  auto a = array({1, 2, 3}, float32);
  auto indices = broadcast_to(array(0), {10});
  auto b = take(a, indices);
  CHECK(array_equal(b, ones({10})).item<bool>());

  // Take with 0 dim index returns a 0 dim result
  z = take(array({0, 1, 2}), array(0));
  CHECK_EQ(z.item<int>(), 0);
  CHECK_EQ(z.ndim(), 0);

  // Check take with float indices throws
  CHECK_THROWS(take(array({}), array({})));
  CHECK_THROWS(take(a, array({1.0, 2.0, 3.0})));

  // Check axis: out-of-range axes (negative and positive) throw
  a = array({1, 2, 3, 4}, {2, 2});
  CHECK_THROWS(take(a, array({1}), -3));
  CHECK_THROWS(take(a, array({1}), 2));

  // Check negative indices
  a = array({1, 2, 3, 4}, {2, 2});
  CHECK_EQ(take(a, array({-1})).item<int>(), 4);
  CHECK(array_equal(take(a, array({1, -1})), array({2, 4})).item<bool>());
  CHECK(array_equal(take(a, array(-1), 0), array({3, 4})).item<bool>());

  // Check shapes: exercise every axis of arrays with singleton dimensions
  a = zeros({2, 1, 1});
  auto out = take(a, array({1}), 0);
  CHECK(array_equal(out, zeros({1, 1, 1})).item<bool>());
  out = take(a, array({0}), 1);
  CHECK(array_equal(out, zeros({2, 1, 1})).item<bool>());
  // Previously this repeated the axis-1 check verbatim; cover the last axis.
  out = take(a, array({0}), 2);
  CHECK(array_equal(out, zeros({2, 1, 1})).item<bool>());
  a = zeros({1, 2, 1});
  out = take(a, array({0}), 0);
  CHECK(array_equal(out, zeros({1, 2, 1})).item<bool>());
  out = take(a, array({0}), 1);
  CHECK(array_equal(out, zeros({1, 1, 1})).item<bool>());
  out = take(a, array({0, 1}), 1);
  CHECK(array_equal(out, zeros({1, 2, 1})).item<bool>());

  // Indices have wrong shape
  a = zeros({2, 3, 4});
  CHECK_THROWS(take(a, zeros({1, 3, 4}), 1));
  CHECK_THROWS(take(a, zeros({2, 3, 7}), 1));
  CHECK_THROWS(take(a, zeros({2, 3, 2}), 0));
}

TEST_CASE("test take along axis") {
  // Scalar (0-d) inputs are rejected
  auto src = array(1);
  CHECK_THROWS(take_along_axis(src, array(0), 0));

  // Invalid axis, or indices whose rank differs from the input's
  src = arange(5);
  CHECK_THROWS(take_along_axis(src, array({1}), 1));
  CHECK_THROWS(take_along_axis(src, array({1}, {1, 1}), 0));
  CHECK_THROWS(take_along_axis(src, array(1), -1));

  // Positive and negative axis select the same element on a 1-D input
  auto res = take_along_axis(src, array({1}), 0);
  CHECK_EQ(res.item<int>(), 1);
  res = take_along_axis(src, array({1}), -1);
  CHECK_EQ(res.item<int>(), 1);

  // Empty input array
  src = reshape(array({}), {1, 0});
  CHECK_THROWS(take_along_axis(src, array({1}), 0));

  res = take_along_axis(src, reshape(array({1}), {1, 1}), 0);
  eval(res); // Force evaluation so a runtime failure would surface here
  CHECK_EQ(res.shape(), Shape{1, 0});

  // Empty input with empty indices
  auto idx = reshape(astype(array({}), int32), {1, 0});
  res = take_along_axis(src, idx, 0);
  eval(res); // Force evaluation so a runtime failure would surface here
  CHECK_EQ(res.shape(), Shape{1, 0});

  // 2x2 input, indices along axis 0
  src = array({1, 2, 3, 4}, {2, 2});
  idx = array({0, 1}, {1, 2});
  res = take_along_axis(src, idx, 0);
  auto want = array({1, 4}, {1, 2});
  CHECK(array_equal(res, want).item<bool>());

  idx = array({0, 1, 0, 1, 0, 0, 1, 0}, {4, 2}, int32);
  res = take_along_axis(src, idx, 0);
  want = array({1, 4, 1, 4, 1, 2, 3, 2}, {4, 2});
  CHECK(array_equal(res, want).item<bool>());

  // 2x2 input, indices along axis 1
  idx = array({0, 1}, {2, 1});
  res = take_along_axis(src, idx, 1);
  want = array({1, 4}, {2, 1});
  CHECK(array_equal(res, want).item<bool>());

  // A {1, 1} index broadcasts against the non-indexed dimension
  idx = array({0}, {1, 1});
  res = take_along_axis(src, idx, 0);
  want = array({1, 2}, {1, 2});
  CHECK(array_equal(res, want).item<bool>());
  res = take_along_axis(src, idx, 1);
  want = array({1, 3}, {2, 1});
  CHECK(array_equal(res, want).item<bool>());

  idx = array({0, 1, 1, 0, 0, 1}, {2, 3}, int32);
  res = take_along_axis(src, idx, 1);
  want = array({1, 2, 2, 3, 3, 4}, {2, 3});
  CHECK(array_equal(res, want).item<bool>());

  // 3-D input: the same index array applied along each axis in turn
  src = reshape(arange(8), {2, 2, 2});
  idx = array({0, 1, 0, 0, 1, 0, 0, 1}, {2, 2, 2});

  res = take_along_axis(src, idx, 0);
  want = array({0, 5, 2, 3, 4, 1, 2, 7}, {2, 2, 2});
  CHECK(array_equal(res, want).item<bool>());

  res = take_along_axis(src, idx, 1);
  want = array({0, 3, 0, 1, 6, 5, 4, 7}, {2, 2, 2});
  CHECK(array_equal(res, want).item<bool>());

  res = take_along_axis(src, idx, 2);
  want = array({0, 1, 2, 2, 5, 4, 6, 7}, {2, 2, 2});
  CHECK(array_equal(res, want).item<bool>());
}

TEST_CASE("test put along axis") {
  // Scalar (0-d) inputs are rejected
  auto dst = array(1);
  auto scalar_value = array(1);
  CHECK_THROWS(put_along_axis(dst, array(0), scalar_value, 0));

  // Invalid axis, or indices whose rank differs from the input's
  dst = arange(5);
  CHECK_THROWS(put_along_axis(dst, array({1}), array({0}), 1));
  CHECK_THROWS(put_along_axis(dst, array({1}, {1, 1}), array({0}), 0));
  CHECK_THROWS(put_along_axis(dst, array(1), array(0), -1));

  // Overwrite element 1 of [0, 1, 2, 3, 4] with 0
  auto want = array({0, 0, 2, 3, 4});
  auto res = put_along_axis(dst, array({1}), array({0}), 0);
  CHECK(array_equal(res, want).item<bool>());

  // Empty input array
  dst = reshape(array({}), {1, 0});
  CHECK_THROWS(put_along_axis(dst, array({1}), array({0}), 0));

  // NOTE(review): this section calls take_along_axis, not put_along_axis.
  // It looks carried over from the take_along_axis test; confirm whether
  // put_along_axis with empty indices was meant to be exercised here.
  auto idx = reshape(astype(array({}), int32), {1, 0});
  res = take_along_axis(dst, idx, 0);
  eval(res); // Force evaluation so a runtime failure would surface here
  CHECK_EQ(res.shape(), Shape{1, 0});

  // 2x2 input: a single value is written at one row index per column
  dst = array({1, 2, 3, 4}, {2, 2});
  idx = array({0, 1}, {1, 2});
  res = put_along_axis(dst, idx, array({0}), 0);
  want = array({0, 2, 3, 0}, {2, 2});
  CHECK(array_equal(res, want).item<bool>());

  // Full-shape indices and values along axis 0
  idx = array({0, 0, 1, 1}, {2, 2}, int32);
  auto vals = array({2, 3, 4, 5}, {2, 2}, int32);
  res = put_along_axis(dst, idx, vals, 0);
  want = array({2, 3, 4, 5}, {2, 2});
  CHECK(array_equal(res, want).item<bool>());

  // Same pattern along axis 1
  idx = array({0, 1}, {2, 1});
  res = put_along_axis(dst, idx, array({0}), 1);
  want = array({0, 2, 3, 0}, {2, 2});
  CHECK(array_equal(res, want).item<bool>());
}

TEST_CASE("test scatter") {
  // ---- Argument validation ----

  // More indices than dimensions
  CHECK_THROWS(scatter(array(0), array({1}), array(1), 0));

  // Number of index arrays / axes disagree, or the axis is invalid
  CHECK_THROWS(scatter(array({0}), {array({0})}, array({1}, {1, 1}), {0, 1}));
  CHECK_THROWS(scatter(array({0}), array({0}), array({1}, {1, 1}), -1));

  // The same axis listed twice
  CHECK_THROWS(scatter(
      array({0}, {1, 1}), {array({0}), array({0})}, array({1}), {0, 0}));

  // Updates with the wrong rank or size
  CHECK_THROWS(scatter(array({0}), array({0}), array({0, 1}), 0));
  CHECK_THROWS(scatter(array({0}), array({0}), array({0, 1}, {2, 1}), 0));
  CHECK_THROWS(scatter(array({0}, {1}), array({0}), array({0, 1}, {1, 2}), 0));

  // Floating point indices
  CHECK_THROWS(scatter(array({0}), array({0.0f}), array({0}, {1, 1}), 0));
  CHECK_THROWS(scatter(
      array({0}, {1, 1}),
      {array({0}), array({0.0f})},
      array({1}, {1, 1, 1}),
      {0, 1}));

  // Index arrays that cannot be broadcast together
  CHECK_THROWS(scatter(
      array({0}, {1, 1}),
      {array({0, 0, 0}, {3}), array({0, 0}, {2})},
      ones({3, 2, 1, 1}),
      {0, 1}));

  // ---- Single element updates ----

  // Plain scatter
  auto target = zeros({4}, float32);
  auto index = arange(2);
  auto upd = ones({2, 1}, float32);
  auto result = scatter(target, index, upd, 0);
  CHECK(array_equal(result, array({1.0f, 1.0f, 0.0f, 0.0f})).item<bool>());

  // scatter_add accumulates over the repeated index 0
  target = ones({4}, float32);
  index = array({0, 0, 3});
  upd = ones({3, 1}, float32);
  result = scatter_add(target, index, upd, 0);
  CHECK(array_equal(result, array({3.0f, 1.0f, 1.0f, 2.0f})).item<bool>());

  // scatter_prod multiplies over the repeated index 0
  target = ones({4}, float32);
  index = array({0, 0, 3});
  upd = full({3, 1}, 2.0f, float32);
  result = scatter_prod(target, index, upd, 0);
  CHECK(array_equal(result, array({4.0f, 1.0f, 1.0f, 2.0f})).item<bool>());

  // scatter_max
  target = ones({4}, float32);
  index = array({0, 0, 3});
  upd = array({1.0f, 6.0f, -2.0f}, {3, 1});
  result = scatter_max(target, index, upd, 0);
  CHECK(array_equal(result, array({6.0f, 1.0f, 1.0f, 1.0f})).item<bool>());

  // scatter_min
  target = ones({4}, float32);
  index = array({0, 0, 3});
  upd = array({1.0f, -6.0f, 2.0f}, {3, 1});
  result = scatter_min(target, index, upd, 0);
  CHECK(array_equal(result, array({-6.0f, 1.0f, 1.0f, 1.0f})).item<bool>());

  // Empty indices and updates leave the input unchanged
  target = arange(4, float32);
  index = astype(array({}), uint32);
  upd = reshape(array({}), {0, 1});
  result = scatter(target, index, upd, 0);
  CHECK(array_equal(result, target).item<bool>());

  // ---- Whole-row updates ----

  // Each update is a full row
  target = zeros({4, 4}, float32);
  index = array({0, 1, 2, 3});
  upd = reshape(arange(16, float32), {4, 1, 4});
  result = scatter(target, index, upd, 0);
  CHECK(array_equal(result, reshape(arange(16, float32), {4, 4})).item<bool>());

  // Same, but the updates come from a transpose (col contiguous)
  target = zeros({4, 4}, float32);
  index = array({0, 1, 2, 3});
  upd = transpose(reshape(arange(16, float32), {4, 1, 4}));
  result = scatter(target, index, upd, 0);
  CHECK(array_equal(result, transpose(reshape(arange(16, float32), {4, 4})))
            .item<bool>());

  // Irregularly strided (broadcast) index with colliding reductions
  target = zeros({10}, float32);
  index = broadcast_to(array(3), {10});
  upd = ones({10, 1}, float32);
  result = scatter_add(target, index, upd, 0);
  CHECK_EQ(take(result, array(3)).item<float>(), 10);

  // 1 element array with 0 dim index
  target = array({1}, int32);
  upd = array({2}, int32);
  result = scatter_max(target, array(0), upd, 0);
  CHECK_EQ(result.item<int>(), 2);

  // No index arrays or axes
  result = scatter_max(array(1), {}, array(2), std::vector<int>{});
  CHECK_EQ(result.item<int>(), 2);

  // Irregularly strided (broadcast) updates
  target = ones({3, 3});
  upd = broadcast_to(array({2, 2, 2}), {1, 3, 3});
  index = array({0});
  result = scatter(target, index, upd, 0);
  CHECK(array_equal(result, ones({3, 3}) * 2).item<bool>());

  // Scatter along axis 1 instead of axis 0
  target = zeros({2, 3});
  upd = array({1, 2, 3, 4}, {2, 2, 1});
  index = array({0, 2});
  result = scatter(target, index, upd, 1);
  auto want = array({1, 0, 3, 2, 0, 4}, {2, 3});
  CHECK(array_equal(result, want).item<bool>());

  // Multiple index arrays
  target = zeros({2, 2});
  upd = array({1, 2}, {2, 1, 1});
  index = array({0, 1});
  result = scatter(target, {index, index}, upd, {0, 1});
  CHECK(array_equal(result, array({1, 0, 0, 2}, {2, 2})).item<bool>());

  // Index arrays that broadcast against each other
  target = zeros({2, 2});
  upd = array({5, 2, 9, 1}, {2, 2, 1, 1});
  auto row_index = array({0, 1}, {2, 1});
  auto col_index = array({0, 1}, {1, 2});
  result = scatter(target, {row_index, col_index}, upd, {0, 1});
  CHECK(array_equal(result, array({5, 2, 9, 1}, {2, 2})).item<bool>());

  // Broadcasted operand
  target = broadcast_to(array({0, 0}), {2, 2});
  upd = array({1, 1}, {2, 1, 1});
  index = array({0, 1});
  result = scatter_add(target, index, upd, 0);
  CHECK(array_equal(result, array({1, 0, 1, 0}, {2, 2})).item<bool>());

  // 1D scatter: write one row of a 2x4 integer matrix
  {
    auto dst = zeros({2, 4}, int32);
    auto src = reshape(array({1, 2, 3, 4}), {1, 1, 4});
    auto idx = array({1});
    auto row_want = reshape(array({0, 0, 0, 0, 1, 2, 3, 4}), {2, 4});
    auto row_result = scatter(dst, idx, src, 0);
    CHECK(array_equal(row_result, row_want).item<bool>());
  }

  // 1D indices with 2D update: write a 2x2 patch at (row 1, col 2)
  {
    auto dst = zeros({3, 4}, int32);
    auto indices = {array({1}), array({2})};
    auto axes = {0, 1};
    auto patch = reshape(array({1, 2, 3, 4}, int32), {1, 2, 2});
    auto patch_result = scatter(dst, indices, patch, axes);
    auto patch_want =
        reshape(array({0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4}), {3, 4});
    CHECK(array_equal(patch_result, patch_want).item<bool>());
  }
}

TEST_CASE("test masked_scatter") {
  // An integer mask is rejected
  CHECK_THROWS(masked_scatter(array({1, 2}), array({1, 2}), array({1, 2})));

  // A {4, 1} mask cannot be broadcast to a {2, 2} destination
  CHECK_THROWS(masked_scatter(
      array({1, 2, 3, 4}, {2, 2}),
      array({false, true, true, false}, {4, 1}),
      array({1, 2})));

  // 1D mask: source values land on the true positions, in order
  {
    auto dst = zeros({4}, int32);
    auto selector = array({true, true, false, true});
    auto values = array({1, 2, 4});
    auto result = masked_scatter(dst, selector, values);
    auto want = array({1, 2, 0, 4});
    CHECK(array_equal(result, want).item<bool>());
  }

  // All-false mask: the destination comes back unchanged
  {
    auto dst = zeros({4}, int32);
    auto selector = array({false, false, false, false});
    auto values = array({1, 2, 4});
    auto result = masked_scatter(dst, selector, values);
    CHECK(array_equal(result, dst).item<bool>());
  }

  // A 1D mask is broadcast against a 2D destination
  {
    auto dst = zeros({2, 2}, int32);
    auto selector = array({true, false});
    auto values = array({5, 6, 7, 8}, {2, 2});
    auto result = masked_scatter(dst, selector, values);
    auto want = array({5, 6, 0, 0}, {2, 2});
    CHECK(array_equal(result, want).item<bool>());
  }
}

TEST_CASE("test is positive infinity") {
  const float pos_inf = std::numeric_limits<float>::infinity();

  // float32 scalars
  auto finite = array(1.0f);
  CHECK_FALSE(isposinf(finite).item<bool>());

  auto infinite = array(pos_inf);
  CHECK(isposinf(infinite).item<bool>());

  // Arrays that are not entirely +inf
  array eye7 = identity(7);
  CHECK_FALSE(all(isposinf(eye7)).item<bool>());

  auto mixed = array({1.0f, pos_inf, 2.0f});
  CHECK_FALSE(all(isposinf(mixed)).item<bool>());

  // Half precision dtypes
  auto finite_bf16 = array(1.0f, bfloat16);
  CHECK_FALSE(isposinf(finite_bf16).item<bool>());

  auto infinite_f16 = array(pos_inf, float16);
  CHECK(isposinf(infinite_f16).item<bool>());

  auto infinite_bf16 = array(pos_inf, bfloat16);
  CHECK(isposinf(infinite_bf16).item<bool>());
}

TEST_CASE("test is negative infinity") {
  const float neg_inf = -std::numeric_limits<float>::infinity();

  // float32 scalars
  auto finite = array(1.0f);
  CHECK_FALSE(isneginf(finite).item<bool>());

  auto infinite = array(neg_inf);
  CHECK(isneginf(infinite).item<bool>());

  // Arrays that are not entirely -inf
  array eye7 = identity(7);
  CHECK_FALSE(all(isneginf(eye7)).item<bool>());

  auto mixed = array({1.0f, neg_inf, 2.0f});
  CHECK_FALSE(all(isneginf(mixed)).item<bool>());

  // Half precision dtypes
  auto finite_bf16 = array(1.0f, bfloat16);
  CHECK_FALSE(isneginf(finite_bf16).item<bool>());

  auto infinite_f16 = array(neg_inf, float16);
  CHECK(isneginf(infinite_f16).item<bool>());

  auto infinite_bf16 = array(neg_inf, bfloat16);
  CHECK(isneginf(infinite_bf16).item<bool>());
}

TEST_CASE("test scatter types") {
  // Scatter ones onto the diagonal of a 4x4 zero matrix of dtype `t`.
  auto scatter_diagonal = [](auto t) {
    auto in = zeros({4, 4}, t);
    auto inds = {arange(4), arange(4)};
    auto updates = ones({4, 1, 1}, t);
    return scatter(in, inds, updates, {0, 1});
  };

  // The 4x4 identity pattern expressed in dtype `t`.
  auto diagonal_ones = [](auto t) {
    return array(
        {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1}, {4, 4}, t);
  };

  // Bool and small integer dtypes are compared exactly
  for (auto t : {bool_, uint8, uint16, int8, int16}) {
    auto out = scatter_diagonal(t);
    auto expected = diagonal_ones(t);
    CHECK(array_equal(out, expected).item<bool>());
  }

  // Half precision dtypes are compared with allclose
  for (auto t : {float16, bfloat16}) {
    auto out = scatter_diagonal(t);
    auto expected = diagonal_ones(t);
    CHECK(allclose(out, expected).item<bool>());
  }
}

TEST_CASE("test complex ops") {
  // Creation: full() with a complex fill value yields a complex64 array
  {
    auto filled = full({2, 2}, complex64_t{1, 1});
    CHECK_EQ(filled.dtype(), complex64);
    std::initializer_list<complex64_t> want = {{1, 1}, {1, 1}, {1, 1}, {1, 1}};
    CHECK(array_equal(filled, array(want, {2, 2})).item<bool>());
  }

  // Unary ops on [i, 1, 1 + i]
  {
    std::initializer_list<complex64_t> inputs = {{0, 1}, {1, 0}, {1, 1}};
    auto z = array(inputs);

    // abs() of a complex array is real valued
    auto res = abs(z);
    CHECK_EQ(res.dtype(), float32);
    CHECK(array_equal(res, array({1.0f, 1.0f, std::sqrt(2.0f)})).item<bool>());

    res = negative(z);
    std::initializer_list<complex64_t> negated = {{0, -1}, {-1, 0}, {-1, -1}};
    CHECK(array_equal(res, array(negated)).item<bool>());

    res = exp(z);
    std::initializer_list<complex64_t> exp_want = {
        {0.54030231, 0.84147098}, {2.71828183, 0.}, {1.46869394, 2.28735529}};
    CHECK(allclose(res, array(exp_want)).item<bool>());

    res = sin(z);
    std::initializer_list<complex64_t> sin_want = {
        {0., 1.17520119}, {0.84147098, 0.}, {1.29845758, 0.63496391}};
    CHECK(allclose(res, array(sin_want)).item<bool>());

    res = cos(z);
    std::initializer_list<complex64_t> cos_want = {
        {1.54308063, -0.}, {0.54030231, -0.}, {0.83373003, -0.98889771}};
    CHECK(allclose(res, array(cos_want)).item<bool>());
  }

  // Binary ops between [i, 1, 1 + i] and [2, 1 + i, i]
  {
    std::initializer_list<complex64_t> lhs_vals = {{0, 1}, {1, 0}, {1, 1}};
    auto lhs = array(lhs_vals);

    std::initializer_list<complex64_t> rhs_vals = {{2, 0}, {1, 1}, {0, 1}};
    auto rhs = array(rhs_vals);

    auto res = add(lhs, rhs);
    std::initializer_list<complex64_t> sum_want = {{2, 1}, {2, 1}, {1, 2}};
    CHECK(array_equal(res, array(sum_want)).item<bool>());

    res = subtract(lhs, rhs);
    std::initializer_list<complex64_t> diff_want = {{-2, 1}, {0, -1}, {1, 0}};
    CHECK(array_equal(res, array(diff_want)).item<bool>());

    res = multiply(lhs, rhs);
    std::initializer_list<complex64_t> prod_want = {{0, 2}, {1, 1}, {-1, 1}};
    CHECK(array_equal(res, array(prod_want)).item<bool>());

    res = maximum(lhs, rhs);
    std::initializer_list<complex64_t> max_want = {{2, 0}, {1, 1}, {1, 1}};
    CHECK(array_equal(res, array(max_want)).item<bool>());
  }

  // Reductions are only checked when the default device is the CPU
  if (default_device() == Device::cpu) {
    std::initializer_list<complex64_t> inputs = {{0, 0}, {1, 0}, {0, 1}};
    auto z = array(inputs);
    CHECK_EQ(max(z).item<complex64_t>(), complex64_t{1, 0});
    CHECK_EQ(min(z).item<complex64_t>(), complex64_t{0, 0});
    CHECK_EQ(sum(z).item<complex64_t>(), complex64_t{1, 1});
    CHECK_EQ(prod(z).item<complex64_t>(), complex64_t{0, 0});
  }
}

TEST_CASE("test as_strided op") {
  auto base = arange(10);

  // Unit strides in both dimensions: each row starts one element later
  auto view = as_strided(base, {3, 3}, {1, 1}, 0);
  auto want = array({0, 1, 2, 1, 2, 3, 2, 3, 4}, {3, 3});
  CHECK(array_equal(view, want).item<bool>());

  // Zero stride in the first dimension: every row is identical
  view = as_strided(base, {3, 3}, {0, 3}, 0);
  want = array({0, 3, 6, 0, 3, 6, 0, 3, 6}, {3, 3});
  CHECK(array_equal(view, want).item<bool>());

  // Transposed input with a non-zero offset
  base = reshape(base, {2, 5}); // 0 1 2 3 ...
  base = transpose(base, {1, 0}); // 0 5 1 6 2 7 ...
  view = as_strided(base, {3, 3}, {2, 1}, 1);
  want = array({5, 1, 6, 6, 2, 7, 7, 3, 8}, {3, 3});
  CHECK(array_equal(view, want).item<bool>());
}

TEST_CASE("test scan op") {
  // The trailing cumsum arguments below are (axis, reverse, inclusive), as
  // the expected values show.

  // 2x3 input scanned along the last axis, all four flag combinations
  auto input = array({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {2, 3});

  auto scanned = cumsum(input, 1, false, true);
  auto want = array({1.0f, 3.0f, 6.0f, 4.0f, 9.0f, 15.0f}, {2, 3});
  CHECK(array_equal(scanned, want).item<bool>());

  scanned = cumsum(input, 1, false, false);
  want = array({0.0f, 1.0f, 3.0f, 0.0f, 4.0f, 9.0f}, {2, 3});
  CHECK(array_equal(scanned, want).item<bool>());

  scanned = cumsum(input, 1, true, true);
  want = array({6.0f, 5.0f, 3.0f, 15.0f, 11.0f, 6.0f}, {2, 3});
  CHECK(array_equal(scanned, want).item<bool>());

  scanned = cumsum(input, 1, true, false);
  want = array({5.0f, 3.0f, 0.0f, 11.0f, 6.0f, 0.0f}, {2, 3});
  CHECK(array_equal(scanned, want).item<bool>());

  // 2x2x2 input, forward inclusive scans along axes 0 and 1
  input = array({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {2, 2, 2});

  scanned = cumsum(input, 0, false, true);
  want = array({1.0f, 2.0f, 3.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f}, {2, 2, 2});
  CHECK(array_equal(scanned, want).item<bool>());

  scanned = cumsum(input, 1, false, true);
  want = array({1.0f, 2.0f, 4.0f, 6.0f, 5.0f, 6.0f, 12.0f, 14.0f}, {2, 2, 2});
  CHECK(array_equal(scanned, want).item<bool>());

  // Same input, reverse inclusive scans along axes 0 and 1
  input = array({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {2, 2, 2});

  scanned = cumsum(input, 0, true, true);
  want = array({6.0f, 8.0f, 10.0f, 12.0f, 5.0f, 6.0f, 7.0f, 8.0f}, {2, 2, 2});
  CHECK(array_equal(scanned, want).item<bool>());

  scanned = cumsum(input, 1, true, true);
  want = array({4.0f, 6.0f, 3.0f, 4.0f, 12.0f, 14.0f, 7.0f, 8.0f}, {2, 2, 2});
  CHECK(array_equal(scanned, want).item<bool>());

  // 4x2 input, exclusive scans along axis 0 in both directions
  input = reshape(input, {4, 2});

  scanned = cumsum(input, 0, false, false);
  want = array({0.0f, 0.0f, 1.0f, 2.0f, 4.0f, 6.0f, 9.0f, 12.0f}, {4, 2});
  CHECK(array_equal(scanned, want).item<bool>());

  scanned = cumsum(input, 0, true, false);
  want = array({15.0f, 18.0f, 12.0f, 14.0f, 7.0f, 8.0f, 0.0f, 0.0f}, {4, 2});
  CHECK(array_equal(scanned, want).item<bool>());

  // Check the vmap implementation
  auto scan_first_axis = [](array v) { return cumsum(v, 0, false, true); };

  scanned = vmap(scan_first_axis, 0, 0)(input);
  want = array({1.0f, 3.0f, 3.0f, 7.0f, 5.0f, 11.0f, 7.0f, 15.0f}, {4, 2});
  CHECK(array_equal(scanned, want).item<bool>());

  scanned = vmap(scan_first_axis, 1, 1)(input);
  want = array({1.0f, 2.0f, 4.0f, 6.0f, 9.0f, 12.0f, 16.0f, 20.0f}, {4, 2});
  CHECK(array_equal(scanned, want).item<bool>());
}

TEST_CASE("test pad") {
  // Output shapes for the scalar, single-pair, and per-axis overloads
  auto input = zeros({1, 2, 3});
  CHECK_EQ(pad(input, 1).shape(), Shape{3, 4, 5});
  CHECK_EQ(pad(input, {0, 1}).shape(), Shape{2, 3, 4});
  CHECK_EQ(pad(input, {{1, 1}, {1, 2}, {3, 1}}).shape(), Shape{3, 5, 7});

  // Padding a 2x2 matrix by 1 surrounds it with a border of zeros
  input = array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
  auto bordered = pad(input, 1);
  // clang-format off
  auto want = array(
      {0.0f, 0.0f, 0.0f, 0.0f,
       0.0f, 1.0f, 2.0f, 0.0f,
       0.0f, 3.0f, 4.0f, 0.0f,
       0.0f, 0.0f, 0.0f, 0.0f},
      {4, 4});
  // clang-format on
  CHECK(array_equal(bordered, want).item<bool>());
}

TEST_CASE("test power") {
  // Integer bases and exponents
  CHECK_EQ(power(array(1), array(2)).item<int>(), 1);
  CHECK_EQ((power(array(-1), array(2))).item<int>(), 1);
  CHECK_EQ((power(array(-1), array(3))).item<int>(), -1);

  // Boolean bases and exponents
  CHECK_EQ((power(array(true), array(false))).item<bool>(), true);
  CHECK_EQ((power(array(false), array(false))).item<bool>(), true);
  CHECK_EQ((power(array(true), array(true))).item<bool>(), true);
  CHECK_EQ((power(array(false), array(true))).item<bool>(), false);

  // Floating point
  auto x = array(2.0f);
  CHECK_EQ(
      (power(x, array(0.5))).item<float>(),
      doctest::Approx(std::pow(2.0f, 0.5f)));
  CHECK_EQ(power(x, array(2.0f)).item<float>(), 4.0f);

  // Negative base with a fractional exponent gives NaN
  CHECK(std::isnan((power(array(-1.0f), array(0.5))).item<float>()));

  // Complex: compare against std::pow component-wise.
  // std::abs is spelled out so the floating point overload is always chosen;
  // an unqualified abs() may bind to the C `int abs(int)`, which would
  // truncate the difference and make the tolerance check vacuous.
  auto a = complex64_t{0.5, 0.5};
  auto b = complex64_t{0.5, 0.5};
  auto expected = std::pow(a, b);
  auto out = (power(array(a), array(b))).item<complex64_t>();
  CHECK(std::abs(out.real() - expected.real()) < 1e-7);
  CHECK(std::abs(out.imag() - expected.imag()) < 1e-7);

  a = complex64_t{-1.2, 0.1};
  b = complex64_t{2.2, 0.0};
  expected = std::pow(a, b);
  out = (power(array(a), array(b))).item<complex64_t>();
  CHECK(std::abs(out.real() - expected.real()) < 1e-6);
  CHECK(std::abs(out.imag() - expected.imag()) < 1e-6);
}

TEST_CASE("test where") {
  // Covers where(): scalar and array conditions, broadcasting between the
  // condition and both branches, non-bool conditions, infinities, and
  // higher-rank inputs.
  const float inf = std::numeric_limits<float>::infinity();

  // Scalar condition with scalar branches
  array condition(true);
  array x(1.0f);
  array y(0.0f);
  auto out = where(condition, x, y);
  CHECK_EQ(out.dtype(), float32);
  CHECK_EQ(out.item<float>(), 1.0f);

  // Scalar condition: the selected branch is broadcast to the common shape
  x = array({1, 2}, {2, 1});
  y = array({3, 4}, {1, 2});
  CHECK(array_equal(where(condition, x, y), broadcast_to(x, {2, 2}))
            .item<bool>());

  condition = array(false);
  CHECK(array_equal(where(condition, x, y), broadcast_to(y, {2, 2}))
            .item<bool>());

  // 1-D condition broadcast against 2-D branches.
  // The checks below assert on `out` directly; it was previously assigned
  // and then ignored while where() was recomputed inside the CHECK.
  condition = array({true, false});
  out = where(condition, x, y);
  auto expected = array({1, 4, 2, 4}, {2, 2});
  CHECK(array_equal(out, expected).item<bool>());

  // Full-shape condition
  condition = array({true, false, false, true}, {2, 2});
  out = where(condition, x, y);
  expected = array({1, 4, 3, 2}, {2, 2});
  CHECK(array_equal(out, expected).item<bool>());

  // Full-shape condition with scalar branches
  x = array(1);
  y = array(2);
  out = where(condition, x, y);
  expected = array({1, 2, 2, 1}, {2, 2});
  CHECK(array_equal(out, expected).item<bool>());

  // Scalar condition with 1-D branches
  condition = array(true);
  x = array({1, 2, 3});
  y = array({3, 6, 13});
  CHECK(array_equal(where(condition, x, y), array({1, 2, 3})).item<bool>());

  condition = array(false);
  x = array({1, 2, 3});
  y = array({3, 6, 13});
  CHECK(array_equal(where(condition, x, y), array({3, 6, 13})).item<bool>());

  // Integer condition: non-zero selects x
  condition = array({1, 1, 0});
  x = array({1, 2, 3});
  y = array({11, 12, 13});
  CHECK(array_equal(where(condition, x, y), array({1, 2, 13})).item<bool>());

  // All three operands broadcast along different axes
  condition = array({true, false}, {2, 1, 1});
  x = array({1, 2, 3, 4}, {2, 1, 2});
  y = array({11, 22, 33, 44}, {2, 2, 1});
  expected = array({1, 2, 1, 2, 33, 33, 44, 44}, {2, 2, 2});
  CHECK(array_equal(where(condition, x, y), expected).item<bool>());

  // Infinities pass through unchanged from either branch
  condition = array({true, false, false});
  x = array({inf, 2.0, 3.0});
  y = array({10.0, 20.0, -inf});
  CHECK(array_equal(where(condition, x, y), array({inf, 20.0, -inf}))
            .item<bool>());

  // 4-dim optimized case.
  condition = array({false});
  x = array({1, 2}, {2, 1, 1, 1});
  y = array({3, 4}, {1, 1, 2, 1});
  CHECK(array_equal(where(condition, x, y), array({3, 4, 3, 4}, {2, 1, 2, 1}))
            .item<bool>());

  // 5-dim optimized case.
  condition = array({true, false}, {2, 1, 1, 1, 1});
  x = array({1, 2, 3, 4}, {2, 1, 1, 1, 2});
  y = array({11, 22}, {1, 1, 2, 1, 1});
  CHECK(array_equal(
            where(condition, x, y),
            array({1, 2, 1, 2, 11, 11, 22, 22}, {2, 1, 2, 1, 2}))
            .item<bool>());
}

TEST_CASE("test stack") {
  // Stacking a single empty array adds a length-1 axis
  auto first = array({});
  CHECK_EQ(stack({first}, 0).shape(), Shape{1, 0});
  CHECK_EQ(stack({first}, 1).shape(), Shape{0, 1});

  // Stacking a single 1-D array
  first = array({1, 2, 3}, {3});
  CHECK_EQ(stack({first}, 0).shape(), Shape{1, 3});
  CHECK_EQ(stack({first}, 1).shape(), Shape{3, 1});

  // Two arrays: default, explicit, and negative axes
  auto second = array({4, 5, 6}, {3});
  auto both = std::vector<array>{first, second};
  CHECK_EQ(stack(both).shape(), Shape{2, 3});
  CHECK_EQ(stack(both, 0).shape(), Shape{2, 3});
  CHECK_EQ(stack(both, 1).shape(), Shape{3, 2});
  CHECK_EQ(stack(both, -1).shape(), Shape{3, 2});
  CHECK_EQ(stack(both, -2).shape(), Shape{2, 3});

  // NOTE(review): as far as I know, the string passed to doctest's
  // CHECK_THROWS_MESSAGE is only a message attached to the assertion and is
  // not matched against the exception text (CHECK_THROWS_WITH does that).
  // Confirm which was intended here and in the check at the bottom.
  CHECK_THROWS_MESSAGE(stack({}, 0), "No arrays provided for stacking");

  // Mixed dtypes: the stacked result here is float16
  first = array({1, 2, 3}, {3}, float16);
  second = array({4, 5, 6}, {3}, int32);
  CHECK_EQ(stack({first, second}, 0).dtype(), float16);

  // Mismatched shapes must throw
  first = array({1, 2, 3}, {3}, int32);
  second = array({4, 5, 6, 7}, {4}, int32);
  CHECK_THROWS_MESSAGE(
      stack({first, second}, 0),
      "All arrays must have the same shape and dtype");
}

TEST_CASE("test full_like") {
  auto int16_base = array({1, 2, 3}, {3}, int16);

  // Array fill value with an explicit dtype: the explicit dtype wins
  auto filled = full_like(int16_base, array(7.5f), float16);
  auto want = array({7.5, 7.5, 7.5}, {3}, float16);
  CHECK_EQ(filled.dtype(), float16);
  CHECK(array_equal(filled, want).item<bool>());

  // Array fill value without a dtype: the base array's dtype is used
  filled = full_like(int16_base, array(4.0f));
  want = array({4, 4, 4}, {3}, int16);
  CHECK_EQ(filled.dtype(), int16);
  CHECK(array_equal(filled, want).item<bool>());

  // Scalar fill value with an explicit dtype
  filled = full_like(int16_base, 3.25f, float32);
  want = array({3.25f, 3.25f, 3.25f}, {3}, float32);
  CHECK_EQ(filled.dtype(), float32);
  CHECK(array_equal(filled, want).item<bool>());

  // Scalar fill value without a dtype: the base array's dtype is used
  auto float_base = array({1.0f, 2.0f}, {2}, float32);
  filled = full_like(float_base, 2);
  want = array({2.0f, 2.0f}, {2}, float32);
  CHECK_EQ(filled.dtype(), float32);
  CHECK(array_equal(filled, want).item<bool>());
}

TEST_CASE("test eye") {
  // Square case: eye(3) is the 3x3 identity
  {
    auto square = eye(3);
    CHECK_EQ(square.shape(), Shape{3, 3});
    auto want =
        array({1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}, {3, 3});
    CHECK(array_equal(square, want).item<bool>());
  }

  // Rectangular case: ones on the main diagonal of a 3x2 matrix
  {
    auto tall = eye(3, 2);
    CHECK_EQ(tall.shape(), Shape{3, 2});
    auto want = array({1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f}, {3, 2});
    CHECK(array_equal(tall, want).item<bool>());
  }
}

TEST_CASE("test tri") {
  // tri(4, 4, 0) is checked against a lower-triangular matrix of ones
  // (diagonal included).
  auto lower_ones = tri(4, 4, 0, float32);
  CHECK_EQ(lower_ones.shape(), Shape{4, 4});
  // clang-format off
  auto want = array(
      {1.0f, 0.0f, 0.0f, 0.0f,
       1.0f, 1.0f, 0.0f, 0.0f,
       1.0f, 1.0f, 1.0f, 0.0f,
       1.0f, 1.0f, 1.0f, 1.0f},
      {4, 4});
  // clang-format on
  CHECK(array_equal(lower_ones, want).item<bool>());
}

TEST_CASE("test tril") {
  // tril(..., 0) on a matrix of twos: entries above the diagonal become zero
  auto twos = full({4, 4}, 2.0f, float32);
  auto lower = tril(twos, 0);
  CHECK_EQ(lower.shape(), Shape{4, 4});
  // clang-format off
  auto want = array(
      {2.0f, 0.0f, 0.0f, 0.0f,
       2.0f, 2.0f, 0.0f, 0.0f,
       2.0f, 2.0f, 2.0f, 0.0f,
       2.0f, 2.0f, 2.0f, 2.0f},
      {4, 4});
  // clang-format on
  CHECK(array_equal(lower, want).item<bool>());
}

TEST_CASE("test triu") {
  // triu(..., 0) on a matrix of twos: entries below the diagonal become zero
  auto twos = full({4, 4}, 2.0f, float32);
  auto upper = triu(twos, 0);
  CHECK_EQ(upper.shape(), Shape{4, 4});
  // clang-format off
  auto want = array(
      {2.0f, 2.0f, 2.0f, 2.0f,
       0.0f, 2.0f, 2.0f, 2.0f,
       0.0f, 0.0f, 2.0f, 2.0f,
       0.0f, 0.0f, 0.0f, 2.0f},
      {4, 4});
  // clang-format on
  CHECK(array_equal(upper, want).item<bool>());
}

TEST_CASE("test identity") {
  // identity(4) is checked against ones on the diagonal, zeros elsewhere
  auto ident = identity(4);
  CHECK_EQ(ident.shape(), Shape{4, 4});
  // clang-format off
  auto want = array(
      {1.0f, 0.0f, 0.0f, 0.0f,
       0.0f, 1.0f, 0.0f, 0.0f,
       0.0f, 0.0f, 1.0f, 0.0f,
       0.0f, 0.0f, 0.0f, 1.0f},
      {4, 4});
  // clang-format on
  CHECK(array_equal(ident, want).item<bool>());
}

TEST_CASE("test eye with positive k offset") {
  // k = 1 places the ones on the first superdiagonal of a 3x4 matrix
  auto shifted = eye(3, 4, 1);
  CHECK_EQ(shifted.shape(), Shape{3, 4});
  auto want = array(
      {0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
      {3, 4});
  CHECK(array_equal(shifted, want).item<bool>());
}

TEST_CASE("test eye with negative k offset") {
  // k = -1 places the ones on the first subdiagonal of a 4x3 matrix
  auto shifted = eye(4, 3, -1);
  CHECK_EQ(shifted.shape(), Shape{4, 3});
  auto want = array(
      {0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f},
      {4, 3});
  CHECK(array_equal(shifted, want).item<bool>());
}

TEST_CASE("test basic clipping") {
  // Values below 2 are raised to 2, values above 6 are lowered to 6
  auto input = array({1.0f, 4.0f, 3.0f, 8.0f, 5.0f}, {5});
  auto want = array({2.0f, 4.0f, 3.0f, 6.0f, 5.0f}, {5});
  auto lower = array(2.0f);
  auto upper = array(6.0f);
  auto result = clip(input, lower, upper);
  CHECK(array_equal(result, want).item<bool>());
}

TEST_CASE("test clipping with only min") {
  // With no upper bound, only values below 0 are changed
  auto input = array({-1.0f, 1.0f, 0.0f, 5.0f}, {4});
  auto want = array({0.0f, 1.0f, 0.0f, 5.0f}, {4});
  auto result = clip(input, array(0.0f), std::nullopt);
  CHECK(array_equal(result, want).item<bool>());
}

TEST_CASE("test clipping with only max") {
  // With no lower bound, only values above 4 are changed
  auto input = array({2.0f, 3.0f, 4.0f, 5.0f}, {4});
  auto want = array({2.0f, 3.0f, 4.0f, 4.0f}, {4});
  auto result = clip(input, std::nullopt, array(4.0f));
  CHECK(array_equal(result, want).item<bool>());
}

TEST_CASE("test linspace") {
  // Default dtype: five evenly spaced points from 0 to 10, both inclusive
  auto points = linspace(0, 10, 5);
  auto want = array({0.0f, 2.5f, 5.0f, 7.5f, 10.0f}, {5});
  CHECK(array_equal(points, want).item<bool>());

  // int32 output: the expected values are the truncated float points
  points = linspace(0, 10, 5, int32);
  want = array({0, 2, 5, 7, 10}, {5});
  CHECK(array_equal(points, want).item<bool>());

  // Requesting zero points yields an empty array
  points = linspace(0, 1, 0);
  want = array(std::initializer_list<float>{}, {0});
  CHECK(array_equal(points, want).item<bool>());
}

TEST_CASE("test quantize dequantize") {
  // 128 x 512 matrix in which every row is 0, 1, ..., 511
  auto column_of_ones = ones({128, 1});
  auto ramp = expand_dims(arange(0, 512, float32), 0);
  auto x = column_of_ones * ramp;

  // Round-trip through quantize/dequantize with group size 128 at 2, 4, and
  // 8 bits
  for (int bits : {2, 4, 8}) {
    int el_per_int = 32 / bits;
    auto packed = quantize(x, 128, bits);
    auto x_q = packed[0];
    auto scales = packed[1];
    auto biases = packed[2];

    // 512 / 128 = 4 groups per row, each with its own scale and bias
    CHECK_EQ(x_q.shape(), Shape{128, 512 / el_per_int});
    CHECK_EQ(scales.shape(), Shape{128, 4});
    CHECK_EQ(biases.shape(), Shape{128, 4});

    // The reconstruction error bound tightens as the bit width grows
    auto x_hat = dequantize(x_q, scales, biases, 128, bits);
    auto max_diff = max(abs(x - x_hat)).item<float>();
    CHECK(max_diff <= 127.0 / (1 << bits));
  }
}

TEST_CASE("test repeat") {
  // Covers repeat() along explicit axes, with no axis, with zero repeats,
  // and with a negative repeat count.
  auto data = array({13, 3, 16, 6, 14, 4, 15, 5, 11, 1, 12, 2}, {3, 2, 2});
  auto repeat_axis_0 = repeat(data, 2, 0);
  auto expected_axis_0 = array(
      {13, 3, 16, 6, 13, 3, 16, 6, 14, 4, 15, 5,
       14, 4, 15, 5, 11, 1, 12, 2, 11, 1, 12, 2},
      {6, 2, 2});

  auto repeat_axis_1 = repeat(data, 2, 1);
  auto expected_axis_1 = array(
      {13, 3, 13, 3, 16, 6, 16, 6, 14, 4, 14, 4,
       15, 5, 15, 5, 11, 1, 11, 1, 12, 2, 12, 2},
      {3, 4, 2});

  // No axis given: the expected result is 1-D, i.e. the flattened input
  // with each element repeated (not a repeat along the last axis).
  auto repeat_axis_2 = repeat(data, 2);
  auto expected_axis_2 = array(
      {13, 13, 3, 3, 16, 16, 6, 6, 14, 14, 4, 4,
       15, 15, 5, 5, 11, 11, 1, 1, 12, 12, 2, 2},
      {24});

  // check output
  CHECK(array_equal(repeat_axis_0, expected_axis_0).item<bool>());
  CHECK(array_equal(repeat_axis_1, expected_axis_1).item<bool>());
  CHECK(array_equal(repeat_axis_2, expected_axis_2).item<bool>());

  // 1-D input
  auto data_2 = array({1, 3, 2}, {3});
  auto repeat_2 = repeat(data_2, 2, 0);
  auto expected_2 = array({1, 1, 3, 3, 2, 2}, {6});
  CHECK(array_equal(repeat_2, expected_2).item<bool>());

  // 2-D input repeated along axis 0
  auto data_3 = array({1, 2, 3, 4, 5, 4, 0, 1, 2}, {3, 3});
  auto repeat_3 = repeat(data_3, 2, 0);
  auto expected_3 =
      array({1, 2, 3, 1, 2, 3, 4, 5, 4, 4, 5, 4, 0, 1, 2, 0, 1, 2}, {6, 3});
  CHECK(array_equal(repeat_3, expected_3).item<bool>());

  // 0 repeats: the result must be empty. This used to re-assert
  // repeat_2 == expected_2, so the zero-repeat result was never checked.
  // Only the element count is asserted so the check does not depend on the
  // exact empty shape the implementation returns.
  auto repeat_4 = repeat(data_3, 0, 0);
  CHECK_EQ(repeat_4.size(), 0);

  // negative repeats
  CHECK_THROWS_AS(repeat(data_3, -3, 0), std::invalid_argument);
}

// Checks tile() for repetition lists that are shorter than, equal to and
// longer than the rank of the input.
TEST_CASE("tile") {
  auto arrays_match = [](const array& got, const array& want) {
    return array_equal(got, want).item<bool>();
  };

  auto vec = array({1, 2, 3}, {3});
  auto mat = array({1, 2, 3, 4}, {2, 2});

  // 1-D input, one repetition count.
  CHECK(arrays_match(tile(vec, {2}), array({1, 2, 3, 1, 2, 3}, {6})));

  // 2-D input, one repetition count: only the last axis grows.
  CHECK(arrays_match(
      tile(mat, {2}), array({1, 2, 1, 2, 3, 4, 3, 4}, {2, 4})));

  // 2-D input tiled along the first axis only.
  CHECK(arrays_match(
      tile(mat, {4, 1}),
      array({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}, {8, 2})));

  // 2-D input tiled along both axes.
  CHECK(arrays_match(
      tile(mat, {2, 2}),
      array({1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4}, {4, 4})));

  // 1-D input with three repetition counts: the result is 3-D.
  CHECK(arrays_match(
      tile(vec, {2, 2, 2}),
      array(
          {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
           1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3},
          {2, 2, 6})));
}

// Checks tensordot() with explicit axis lists, with mismatched contraction
// dimensions, and with an integer number of axes.
TEST_CASE("tensordot") {
  // Explicit axes: contract (1, 0) of the left with (0, 1) of the right.
  {
    auto lhs = reshape(arange(60.), {3, 4, 5});
    auto rhs = reshape(arange(24.), {4, 3, 2});
    auto product = tensordot(lhs, rhs, {1, 0}, {0, 1});
    auto want = array(
        {4400, 4730, 4532, 4874, 4664, 5018, 4796, 5162, 4928, 5306}, {5, 2});
    CHECK(array_equal(product, want).item<bool>());
  }

  // The requested axes pair up dimensions of different sizes.
  {
    auto lhs = reshape(arange(360.), {3, 4, 5, 6});
    auto rhs = reshape(arange(360.), {6, 4, 5, 3});
    CHECK_THROWS_AS(
        tensordot(lhs, rhs, {2, 1, 3}, {1, 2, 0}), std::invalid_argument);
  }

  // Integer form: contract two axes.
  {
    auto lhs = reshape(arange(60.), {3, 4, 5});
    auto rhs = reshape(arange(120.), {4, 5, 6});
    auto product = tensordot(lhs, rhs, 2);
    auto want = array(
        {14820., 15010., 15200., 15390., 15580., 15770.,
         37620., 38210., 38800., 39390., 39980., 40570.,
         60420., 61410., 62400., 63390., 64380., 65370.},
        {3, 6});
    CHECK(array_equal(product, want).item<bool>());
  }
}

// Checks the outer product of two 1-D arrays.
TEST_CASE("outer") {
  // [1, 2, 3, 4] x [1, 2, 3] -> multiplication table of shape (4, 3).
  {
    auto lhs = arange(1.0, 5.0);
    auto rhs = arange(1.0, 4.0);
    auto want = array(
        {1.0, 2.0, 3.0,
         2.0, 4.0, 6.0,
         3.0, 6.0, 9.0,
         4.0, 8.0, 12.0},
        {4, 3});
    CHECK(array_equal(outer(lhs, rhs), want).item<bool>());
  }

  // Ones on the left: every row of the result equals the right operand.
  {
    auto lhs = ones({5});
    auto rhs = linspace(-2., 2., 5);
    auto want = array(
        {-2., -1., 0., 1., 2.,
         -2., -1., 0., 1., 2.,
         -2., -1., 0., 1., 2.,
         -2., -1., 0., 1., 2.,
         -2., -1., 0., 1., 2.},
        {5, 5});
    CHECK(array_equal(outer(lhs, rhs), want).item<bool>());
  }
}

// Checks inner() for mismatched shapes, vectors, higher-rank operands and a
// scalar operand.
TEST_CASE("inner") {
  auto arrays_match = [](const array& got, const array& want) {
    return array_equal(got, want).item<bool>();
  };

  // Last dimensions 5 and 3 do not agree.
  CHECK_THROWS_AS(
      inner(reshape(arange(5.), {1, 5}), reshape(arange(6.), {2, 3})),
      std::invalid_argument);

  // Two vectors give a scalar.
  {
    auto lhs = array({1., 2., 3.});
    auto rhs = array({0., 1., 0.});
    auto dot = inner(lhs, rhs);
    CHECK_EQ(dot.item<float>(), 2.f);
  }

  // 3-D with 1-D: contraction over the last axis.
  {
    auto lhs = reshape(arange(24.), {2, 3, 4});
    auto rhs = arange(4.);
    auto want = array({14., 38., 62., 86., 110., 134.}, {2, 3});
    CHECK(arrays_match(inner(lhs, rhs), want));
  }

  // 3-D with 2-D.
  {
    auto lhs = reshape(arange(2.), {1, 1, 2});
    auto rhs = reshape(arange(6.), {3, 2});
    auto want = array({1., 3., 5.}, {1, 1, 3});
    CHECK(arrays_match(inner(lhs, rhs), want));
  }

  // A scalar operand scales the other operand.
  {
    auto want = array({7., 0., 0., 7.}, {2, 2});
    CHECK(arrays_match(inner(eye(2), array(7.)), want));
  }
}

// Exercises divmod(), which returns a two-element result: out[0] is compared
// against the quotients and out[1] against the remainders. Besides the
// numerical results, the test covers how the two outputs behave in the graph:
// evaluating both, using both in a further op, and keeping only one of them
// alive.
TEST_CASE("test divmod") {
  // Integer inputs, divisor 1: quotient is the input, remainder is zero.
  auto x = array({1, 2, 3});
  auto y = array({1, 1, 1});
  auto out = divmod(x, y);
  CHECK(array_equal(out[0], array({1, 2, 3})).item<bool>());
  CHECK(array_equal(out[1], array({0, 0, 0})).item<bool>());

  // Integer inputs with non-zero remainders.
  x = array({5, 6, 7});
  y = array({2, 2, 2});
  out = divmod(x, y);
  CHECK(array_equal(out[0], array({2, 3, 3})).item<bool>());
  CHECK(array_equal(out[1], array({1, 0, 1})).item<bool>());

  // Siblings should be gone after evaling the graph
  CHECK(out[0].siblings().empty());
  CHECK(out[1].siblings().empty());

  // Floating point inputs: the quotient is still a whole number.
  x = array({5.0, 6.0, 7.0});
  y = array({2.0, 2.0, 2.0});
  out = divmod(x, y);
  CHECK(array_equal(out[0], array({2.0, 3.0, 3.0})).item<bool>());
  CHECK(array_equal(out[1], array({1.0, 0.0, 1.0})).item<bool>());

  // Complex inputs are rejected.
  x = array({1.0}, complex64);
  y = array({2.0}, complex64);
  CHECK_THROWS(divmod(x, y));

  // Check that we can eval on both outputs
  x = array({1.0});
  y = array({2.0});
  out = divmod(x, y);
  eval(out);
  CHECK_EQ(out[0].item<float>(), 0.0);
  CHECK_EQ(out[1].item<float>(), 1.0);

  // Check nested in the graph
  x = array({1.0});
  y = array({2.0});
  out = divmod(x, y);
  auto z = out[0] + out[1];
  CHECK_EQ(z.item<float>(), 1.0);

  // Check that we can still eval when one output goes out of scope
  std::vector<array> out_holder;
  {
    // Only the quotient is copied out; the temporary result vector (and with
    // it the remainder) is destroyed at the end of the statement.
    out_holder.push_back(divmod(x, y)[0]);
  }
  eval(out_holder);
  CHECK_EQ(out_holder[0].item<float>(), 0.0);

  // Check that we can still eval when the other output goes out of scope
  out_holder.clear();
  {
    // Same as above, but this time only the remainder is kept.
    out_holder.push_back(divmod(x, y)[1]);
  }
  eval(out_holder);
  CHECK_EQ(out_holder[0].item<float>(), 1.0);
}

// Checks diagonal(a, offset, axis1, axis2) on 1-D to 4-D inputs, including
// invalid axes, offsets past the edge and duplicate axes.
TEST_CASE("test diagonal") {
  auto arrays_match = [](const array& got, const array& want) {
    return array_equal(got, want).item<bool>();
  };

  // (4, 2) matrix: default arguments and out-of-range axes.
  {
    auto tall = array({0, 1, 2, 3, 4, 5, 6, 7}, {4, 2});
    CHECK(arrays_match(diagonal(tall), array({0, 3}, {2})));

    CHECK_THROWS_AS(diagonal(tall, 1, 6, 0), std::out_of_range);
    CHECK_THROWS_AS(diagonal(tall, 1, 0, -3), std::out_of_range);
  }

  // (3, 4) matrix: swapped axes and negative offsets.
  {
    auto wide = array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, {3, 4});
    CHECK(arrays_match(diagonal(wide, 2, 1, 0), array({8}, {1})));
    CHECK(arrays_match(diagonal(wide, -1, 0, 1), array({4, 9}, {2})));

    // An offset beyond the matrix gives an empty diagonal.
    auto empty_diag = diagonal(wide, -5, 0, 1);
    eval(empty_diag);
    CHECK_EQ(empty_diag.shape(), Shape{0});
  }

  // (3, 2, 2) cube: the diagonal becomes the last axis of the result.
  {
    auto cube = array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, {3, 2, 2});
    CHECK(arrays_match(diagonal(cube, 1, 0, 1), array({2, 3}, {2, 1})));
    CHECK(arrays_match(diagonal(cube, 0, 2, 0), array({0, 5, 2, 7}, {2, 2})));
    CHECK(
        arrays_match(diagonal(cube, 1, -1, 0), array({4, 9, 6, 11}, {2, 2})));
  }

  // 4-D input, plus the same axis given twice.
  {
    auto hyper = reshape(arange(16), {2, 2, 2, 2});
    CHECK(arrays_match(
        diagonal(hyper, 0, 0, 1),
        array({0, 12, 1, 13, 2, 14, 3, 15}, {2, 2, 2})));

    CHECK_THROWS_AS(diagonal(hyper, 0, 1, 1), std::invalid_argument);
  }

  // A 1-D input is rejected.
  {
    auto line = array({0, 1}, {2});
    CHECK_THROWS_AS(diagonal(line, 0, 0, 1), std::invalid_argument);
  }
}

// Checks diag(): a 1-D input is placed on a diagonal of a square matrix, a
// 2-D input has one of its diagonals extracted.
TEST_CASE("test diag") {
  auto arrays_match = [](const array& got, const array& want) {
    return array_equal(got, want).item<bool>();
  };

  // Too few or too many dimensions
  CHECK_THROWS(diag(array(0.0)));
  CHECK_THROWS(diag(array({0.0}, {1, 1, 1})));

  // 1-D input: build a matrix with the values on the k-th diagonal.
  {
    auto values = array({0, 1, 2, 3}, {4});

    CHECK(arrays_match(
        diag(values, 0),
        array(
            {0, 0, 0, 0,
             0, 1, 0, 0,
             0, 0, 2, 0,
             0, 0, 0, 3},
            {4, 4})));

    CHECK(arrays_match(
        diag(values, 1),
        array(
            {0, 0, 0, 0, 0,
             0, 0, 1, 0, 0,
             0, 0, 0, 2, 0,
             0, 0, 0, 0, 3,
             0, 0, 0, 0, 0},
            {5, 5})));

    CHECK(arrays_match(
        diag(values, -1),
        array(
            {0, 0, 0, 0, 0,
             0, 0, 0, 0, 0,
             0, 1, 0, 0, 0,
             0, 0, 2, 0, 0,
             0, 0, 0, 3, 0},
            {5, 5})));
  }

  // 2-D input: extract the k-th diagonal.
  {
    auto matrix = array({0, 1, 2, 3, 4, 5, 6, 7, 8}, {3, 3});

    CHECK(arrays_match(diag(matrix, 0), array({0, 4, 8}, {3})));
    CHECK(arrays_match(diag(matrix, 1), array({1, 5}, {2})));
    CHECK(arrays_match(diag(matrix, -1), array({3, 7}, {2})));
  }
}

// Checks issubdtype() for every dtype and every dtype category against the
// full category hierarchy, and for every pair of concrete dtypes.
TEST_CASE("test issubdtype") {
  const auto cats = {
      complexfloating,
      floating,
      inexact,
      signedinteger,
      unsignedinteger,
      integer,
      number,
      generic};
  const auto types = {
      bool_,
      uint8,
      uint16,
      uint32,
      uint64,
      int8,
      int16,
      int32,
      int64,
      float16,
      float32,
      bfloat16,
      complex64};

  // Asserts whether `subject` (a dtype or a category) belongs to each
  // category. The flags follow the order
  //   complexfloating, floating, inexact, signedinteger, unsignedinteger,
  //   integer, number
  // Membership in `generic` is always expected.
  auto expect_membership = [](const auto& subject,
                              bool is_complexfloating,
                              bool is_floating,
                              bool is_inexact,
                              bool is_signedinteger,
                              bool is_unsignedinteger,
                              bool is_integer,
                              bool is_number) {
    CHECK_EQ(issubdtype(subject, complexfloating), is_complexfloating);
    CHECK_EQ(issubdtype(subject, floating), is_floating);
    CHECK_EQ(issubdtype(subject, inexact), is_inexact);
    CHECK_EQ(issubdtype(subject, signedinteger), is_signedinteger);
    CHECK_EQ(issubdtype(subject, unsignedinteger), is_unsignedinteger);
    CHECK_EQ(issubdtype(subject, integer), is_integer);
    CHECK_EQ(issubdtype(subject, number), is_number);
    CHECK(issubdtype(subject, generic));
  };

  // Concrete dtypes against the categories, selected by the dtype's kind.
  for (const auto& dtype : types) {
    CHECK(issubdtype(dtype, dtype));
    CHECK(issubdtype(dtype, generic));
    switch (kindof(dtype)) {
      case Dtype::Kind::b:
        expect_membership(
            dtype, false, false, false, false, false, false, false);
        break;
      case Dtype::Kind::u:
        expect_membership(dtype, false, false, false, false, true, true, true);
        break;
      case Dtype::Kind::i:
        expect_membership(dtype, false, false, false, true, false, true, true);
        break;
      case Dtype::Kind::f:
        expect_membership(dtype, false, true, true, false, false, false, true);
        break;
      case Dtype::Kind::c:
        expect_membership(dtype, true, false, true, false, false, false, true);
        break;
      case Dtype::Kind::V:
        expect_membership(dtype, false, true, true, false, false, false, true);
        break;
    }
  }

  // Concrete dtypes against each other: only identical dtypes match.
  for (const auto& lhs : types) {
    CHECK(issubdtype(lhs, lhs));
    CHECK(issubdtype(lhs, generic));
    for (auto rhs : types) {
      CHECK_EQ(issubdtype(lhs, rhs), lhs == rhs);
    }
  }

  // Categories against categories.
  for (const auto& category : cats) {
    CHECK(issubdtype(category, category));
    switch (category) {
      case Dtype::Category::complexfloating:
        expect_membership(
            category, true, false, true, false, false, false, true);
        break;
      case Dtype::Category::floating:
        expect_membership(
            category, false, true, true, false, false, false, true);
        break;
      case Dtype::Category::inexact:
        expect_membership(
            category, false, false, true, false, false, false, true);
        break;
      case Dtype::Category::signedinteger:
        expect_membership(
            category, false, false, false, true, false, true, true);
        break;
      case Dtype::Category::unsignedinteger:
        expect_membership(
            category, false, false, false, false, true, true, true);
        break;
      case Dtype::Category::integer:
        expect_membership(
            category, false, false, false, false, false, true, true);
        break;
      case Dtype::Category::number:
        expect_membership(
            category, false, false, false, false, false, false, true);
        break;
      case Dtype::Category::generic:
        expect_membership(
            category, false, false, false, false, false, false, false);
        break;
    }
  }
}

// atleast_1d() promotes a scalar to shape {1} and leaves inputs that already
// have one or more dimensions untouched.
TEST_CASE("test atleast_1d") {
  auto expect_result = [](const array& input, int rank, const Shape& dims) {
    auto promoted = atleast_1d(input);
    CHECK_EQ(promoted.ndim(), rank);
    CHECK_EQ(promoted.shape(), dims);
  };

  expect_result(array(1), 1, Shape{1});
  expect_result(array({1, 2, 3}, {3}), 1, Shape{3});
  expect_result(array({1, 2, 3}, {3, 1}), 2, Shape{3, 1});
}

// The vector overload of atleast_1d() promotes each input independently.
TEST_CASE("test atleast_1d vector") {
  auto inputs = std::vector<array>{
      array(1), array({1, 2, 3}, {3}), array({1, 2, 3}, {3, 1})};
  auto promoted = atleast_1d(inputs);
  CHECK_EQ(promoted.size(), 3);

  // scalar -> {1}
  CHECK_EQ(promoted[0].ndim(), 1);
  CHECK_EQ(promoted[0].shape(), Shape{1});

  // 1-D stays as it is
  CHECK_EQ(promoted[1].ndim(), 1);
  CHECK_EQ(promoted[1].shape(), Shape{3});

  // 2-D stays as it is
  CHECK_EQ(promoted[2].ndim(), 2);
  CHECK_EQ(promoted[2].shape(), Shape{3, 1});
}

// atleast_2d() promotes a scalar to {1, 1}, a vector of length n to {1, n},
// and leaves 2-D inputs untouched.
TEST_CASE("test atleast_2d") {
  auto expect_result = [](const array& input, int rank, const Shape& dims) {
    auto promoted = atleast_2d(input);
    CHECK_EQ(promoted.ndim(), rank);
    CHECK_EQ(promoted.shape(), dims);
  };

  expect_result(array(1), 2, Shape{1, 1});
  expect_result(array({1, 2, 3}, {3}), 2, Shape{1, 3});
  expect_result(array({1, 2, 3}, {3, 1}), 2, Shape{3, 1});
}

// The vector overload of atleast_2d() promotes each input independently.
TEST_CASE("test atleast_2d vector") {
  auto inputs = std::vector<array>{
      array(1), array({1, 2, 3}, {3}), array({1, 2, 3}, {3, 1})};
  auto promoted = atleast_2d(inputs);
  CHECK_EQ(promoted.size(), 3);

  // scalar -> {1, 1}
  CHECK_EQ(promoted[0].ndim(), 2);
  CHECK_EQ(promoted[0].shape(), Shape{1, 1});

  // {3} -> {1, 3}
  CHECK_EQ(promoted[1].ndim(), 2);
  CHECK_EQ(promoted[1].shape(), Shape{1, 3});

  // 2-D stays as it is
  CHECK_EQ(promoted[2].ndim(), 2);
  CHECK_EQ(promoted[2].shape(), Shape{3, 1});
}

// atleast_3d() promotes a scalar to {1, 1, 1}, a vector of length n to
// {1, n, 1} and an (m, n) matrix to {m, n, 1}.
TEST_CASE("test atleast_3d") {
  auto expect_result = [](const array& input, int rank, const Shape& dims) {
    auto promoted = atleast_3d(input);
    CHECK_EQ(promoted.ndim(), rank);
    CHECK_EQ(promoted.shape(), dims);
  };

  expect_result(array(1), 3, Shape{1, 1, 1});
  expect_result(array({1, 2, 3}, {3}), 3, Shape{1, 3, 1});
  expect_result(array({1, 2, 3}, {3, 1}), 3, Shape{3, 1, 1});
}

// The vector overload of atleast_3d() promotes each input independently.
TEST_CASE("test atleast_3d vector") {
  auto inputs = std::vector<array>{
      array(1), array({1, 2, 3}, {3}), array({1, 2, 3}, {3, 1})};
  auto promoted = atleast_3d(inputs);
  CHECK_EQ(promoted.size(), 3);

  // scalar -> {1, 1, 1}
  CHECK_EQ(promoted[0].ndim(), 3);
  CHECK_EQ(promoted[0].shape(), Shape{1, 1, 1});

  // {3} -> {1, 3, 1}
  CHECK_EQ(promoted[1].ndim(), 3);
  CHECK_EQ(promoted[1].shape(), Shape{1, 3, 1});

  // {3, 1} -> {3, 1, 1}
  CHECK_EQ(promoted[2].ndim(), 3);
  CHECK_EQ(promoted[2].shape(), Shape{3, 1, 1});
}

// Checks topk(a, k, axis) on the (2, 5) matrix [[0..4], [5..9]].
TEST_CASE("test topk") {
  auto grid = reshape(arange(10), {2, 5});

  // Largest element of each row.
  auto row_max = topk(grid, 1, 1);
  CHECK(array_equal(row_max, array({4, 9}, {2, 1})).item<bool>());

  // k equal to the axis length along axis 0: compared against the input.
  auto both_rows = topk(grid, 2, 0);
  CHECK(array_equal(both_rows, grid).item<bool>());

  // Largest element of each column, i.e. the second row.
  auto col_max = topk(grid, 1, 0);
  CHECK(array_equal(col_max, array({5, 6, 7, 8, 9}, {1, 5})).item<bool>());
}

// Checks meshgrid() with a single input, inputs of different lengths, and
// the sparse option.
TEST_CASE("test meshgrid") {
  auto arrays_match = [](const array& got, const array& want) {
    return array_equal(got, want).item<bool>();
  };

  auto three = array({1, 2, 3}, {3});
  auto two = array({4, 5}, {2});

  // Test default
  {
    auto grids = meshgrid(std::vector<array>{three});
    CHECK(arrays_match(grids[0], three));
  }

  // Test different lengths
  {
    auto grids = meshgrid(std::vector<array>{three, two});
    CHECK(arrays_match(grids[0], array({1, 2, 3, 1, 2, 3}, {2, 3})));
    CHECK(arrays_match(grids[1], array({4, 4, 4, 5, 5, 5}, {2, 3})));
  }

  // Test sparse true: the outputs keep size-1 axes instead of being
  // broadcast to the full grid.
  {
    auto grids = meshgrid(std::vector<array>{three, three}, true);
    CHECK(arrays_match(grids[0], array({1, 2, 3}, {1, 3})));
    CHECK(arrays_match(grids[1], array({1, 2, 3}, {3, 1})));
  }
}

// Checks conv1d() on a float16 input of shape (1, 3, 2) with stride 1 and
// padding 1, once with a single group and once with two groups.
TEST_CASE("test conv1d") {
  auto input = astype(
      array(
          {0.5488135, 0.71518937,
           0.60276338, 0.54488318,
           0.4236548, 0.64589411},
          {1, 3, 2}),
      float16);

  const int stride = 1;
  const int padding = 1;

  // groups == 1: every filter sees both input channels.
  {
    const int groups = 1;
    auto weights = astype(
        array(
            {0.43758721, 0.891773, 0.96366276,
             0.38344152, 0.79172504, 0.52889492,

             0.56804456, 0.92559664, 0.07103606,
             0.0871293, 0.0202184, 0.83261985,

             0.77815675, 0.87001215, 0.97861834,
             0.79915856, 0.46147936, 0.78052918,

             0.11827443, 0.63992102, 0.14335329,
             0.94466892, 0.52184832, 0.41466194},
            {4, 3, 2}),
        float16);

    auto want = array(
        {1.56836, 0.567383, 1.8125, 1.29492,
         2.34375, 1.61035, 2.77539, 1.61328,
         1.40527, 0.933105, 1.87402, 1.09082},
        {1, 3, 4});

    auto got =
        conv1d(input, weights, stride, padding, /* dilation= */ 1, groups);
    CHECK(allclose(got, want).item<bool>());
  }

  // groups == 2: every filter sees a single input channel.
  {
    const int groups = 2;
    auto weights = array(
        {0.43758721, 0.891773, 0.96366276,
         0.38344152, 0.79172504, 0.52889492,
         0.56804456, 0.92559664, 0.07103606,
         0.0871293, 0.0202184, 0.83261985},
        {4, 3, 1});

    auto want = array(
        {1.07007, 0.753201, 0.700818, 0.468176,
         1.18568, 0.91152, 0.956607, 0.611213,
         0.641404, 0.566401, 0.907472, 0.0605397},
        {1, 3, 4});

    auto got =
        conv1d(input, weights, stride, padding, /* dilation= */ 1, groups);
    CHECK(allclose(got, want).item<bool>());
  }
}

// Checks conv2d() with stride 1 and no padding: a single group, two groups,
// and two groups on a batch of two identical samples.
TEST_CASE("test conv2d") {
  auto single_input = array(
      {0.57429284, -0.21628855, -0.18673691, -0.3793517,
       0.3059678, -0.8137168, 0.6168841, -0.26912728},
      {1, 2, 2, 2});

  std::pair<int, int> stride{1, 1};
  std::pair<int, int> padding{0, 0};

  // Weights and expected output shared by both grouped cases.
  auto grouped_weights = array(
      {0.3190391, -0.24937038,
       1.46210794, -2.06014071,
       -0.3224172, -0.38405435,
       1.13376944, -1.09989127,
       -0.17242821, -0.87785842,
       0.04221375, 0.58281521,
       -1.10061918, 1.14472371,
       0.90159072, 0.50249434},
      {4, 2, 2, 1});
  auto grouped_want = array(
      {-0.59372161, -0.44505326, 0.17910982, -1.06507601}, {1, 1, 1, 4});

  // groups == 1
  {
    const int groups = 1;
    auto weights = array(
        {0.3190391, -0.24937038, 1.4621079, -2.0601406,
         -0.3224172, -0.38405436, 1.1337694, -1.0998913,
         -0.1724282, -0.8778584, 0.04221375, 0.58281523,
         -1.1006192, 1.1447237, 0.9015907, 0.50249434,
         0.90085596, -0.68372786, -0.12289023, -0.93576944,
         -0.26788807, 0.53035545, -0.69166076, -0.39675352,
         -0.6871727, -0.84520566, -0.6712461, -0.0126646,
         -1.1173104, 0.2344157, 1.6598022, 0.74204415},
        {4, 2, 2, 2});
    auto want =
        array({1.9549234, -0.98542136, 0.2097499, 0.20991313}, {1, 1, 1, 4});

    auto got = conv2d(
        single_input, weights, stride, padding, /* dilation= */ {1, 1}, groups);
    CHECK(allclose(got, want).item<bool>());
  }

  // groups == 2
  {
    const int groups = 2;
    auto got = conv2d(
        single_input,
        grouped_weights,
        stride,
        padding,
        /* dilation= */ {1, 1},
        groups);
    CHECK(allclose(got, grouped_want).item<bool>());
  }

  // groups == 2 on a batch holding the same sample twice. The expected array
  // keeps a batch dimension of 1 and is compared against both samples.
  {
    const int groups = 2;
    auto batched_input = array(
        {0.57429284, -0.21628855, -0.18673691, -0.3793517,
         0.3059678, -0.8137168, 0.6168841, -0.26912728,

         0.57429284, -0.21628855, -0.18673691, -0.3793517,
         0.3059678, -0.8137168, 0.6168841, -0.26912728},
        {2, 2, 2, 2});

    auto got = conv2d(
        batched_input,
        grouped_weights,
        stride,
        padding,
        /* dilation= */ {1, 1},
        groups);
    CHECK(allclose(got, grouped_want).item<bool>());
  }
}

// Checks trace() on 2-D inputs, on a 3-D input with explicit axes, and with
// an explicit output dtype.
TEST_CASE("test trace") {
  // The identity matrix of size 3 sums to 3 on its diagonal.
  auto identity_trace = trace(eye(3)).item<float>();
  CHECK_EQ(identity_trace, 3.0);

  // Integer matrix: 1 + 5 + 9.
  auto int_matrix = array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {3, 3}, int32);
  auto int_trace = trace(int_matrix).item<int>();
  CHECK_EQ(int_trace, 15);

  // 3-D input: the trace is taken over the two named axes.
  auto cube = reshape(arange(8), {2, 2, 2});

  auto over_first_two = trace(cube, 0, 0, 1);
  CHECK(array_equal(over_first_two, array({6, 8}, {2})).item<bool>());

  auto over_last_two = trace(cube, 0, 1, 2, float32);
  CHECK(array_equal(over_last_two, array({3, 11}, {2})).item<bool>());
}

// Checks view(): rejected reinterpretations and an int64 -> int32 view.
TEST_CASE("test view") {
  // A scalar cannot be viewed as int64.
  auto scalar = array(3);
  CHECK_THROWS(view(scalar, int64));

  // Neither can this three-element array.
  auto triple = array({1, 2, 3});
  CHECK_THROWS(view(triple, int64));

  // Each int64 value shows up as two int32 values: the value itself followed
  // by a zero.
  auto wide = array({1, 2, 3, 4}, int64);
  auto narrow = view(wide, int32);
  auto want = array({1, 0, 2, 0, 3, 0, 4, 0});
  CHECK(array_equal(narrow, want).item<bool>());
}

// Checks roll() on the (2, 5) matrix [[0..4], [5..9]] for shifts with and
// without an axis, for repeated axes, for per-axis shifts and for an empty
// input.
TEST_CASE("test roll") {
  auto grid = reshape(arange(10), {2, 5});

  // Compares a rolled array with the expected values laid out as (2, 5).
  auto rolls_to = [](const array& rolled, const array& want) {
    return array_equal(rolled, want).item<bool>();
  };

  // No axis: the expected values correspond to rolling the flattened data.
  CHECK(rolls_to(roll(grid, 2), array({8, 9, 0, 1, 2, 3, 4, 5, 6, 7}, {2, 5})));
  CHECK(
      rolls_to(roll(grid, -2), array({2, 3, 4, 5, 6, 7, 8, 9, 0, 1}, {2, 5})));

  // Along axis 1 each row is rolled on its own.
  CHECK(rolls_to(
      roll(grid, 2, 1), array({3, 4, 0, 1, 2, 8, 9, 5, 6, 7}, {2, 5})));
  CHECK(rolls_to(
      roll(grid, -2, 1), array({2, 3, 4, 0, 1, 7, 8, 9, 5, 6}, {2, 5})));

  // A repeated axis applies the shift once per occurrence.
  CHECK(rolls_to(
      roll(grid, 2, {0, 0, 0}), array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {2, 5})));
  CHECK(rolls_to(
      roll(grid, 1, {1, 1, 1}), array({2, 3, 4, 0, 1, 7, 8, 9, 5, 6}, {2, 5})));

  // One shift per axis.
  CHECK(rolls_to(
      roll(grid, {1, 2}, {0, 1}),
      array({8, 9, 5, 6, 7, 3, 4, 0, 1, 2}, {2, 5})));

  // Rolling an empty array gives an empty array.
  CHECK(rolls_to(roll(array({}), 0, 0), array({})));
}

// Checks that contiguous() materializes row-major storage by default and
// column-major storage when its second argument is true.
TEST_CASE("test contiguous") {
  // A broadcast input ends up with row-major strides.
  {
    auto row_major = contiguous(broadcast_to(array({1, 2, 3}), {2, 2, 3}));
    eval(row_major);
    CHECK(row_major.flags().row_contiguous);
    CHECK_EQ(row_major.strides(), decltype(row_major.strides()){6, 3, 1});
  }

  // A transposed input with the flag set ends up with column-major strides.
  {
    auto matrix = array({1, 2, 1, 2}, {2, 2});
    auto col_major = contiguous(transpose(matrix), true);
    eval(col_major);
    CHECK(col_major.flags().col_contiguous);
    CHECK_EQ(col_major.strides(), decltype(col_major.strides()){1, 2});
  }
}

// Checks left_shift()/right_shift() for every integer dtype, for int8 edge
// values, and for boolean inputs.
TEST_CASE("test bitwise shift operations") {
  std::vector<Dtype> int_dtypes = {
      int8, int16, int32, int64, uint8, uint16, uint32, uint64};

  // 1 << 2 == 4 and 4 >> 2 == 1, with the input dtype preserved.
  for (const auto& dt : int_dtypes) {
    array ones_vec = full({4}, 1, dt);
    array shift_by_two = full({4}, 2, dt);

    auto shifted_left = left_shift(ones_vec, shift_by_two);
    CHECK_EQ(shifted_left.dtype(), dt);
    CHECK(array_equal(shifted_left, array({4, 4, 4, 4}, dt)).item<bool>());

    auto shifted_right = right_shift(full({4}, 4, dt), shift_by_two);
    CHECK_EQ(shifted_right.dtype(), dt);
    CHECK(array_equal(shifted_right, full({4}, 1, dt)).item<bool>());
  }

  // int8 extremes: the left shift drops the top bit, the right shift keeps
  // the sign.
  {
    array extremes = array({127, -128}, int8);
    array shift_by_one = array({1, 1}, int8);
    auto shifted_left = left_shift(extremes, shift_by_one);
    auto shifted_right = right_shift(extremes, shift_by_one);

    CHECK(array_equal(shifted_left, array({-2, 0}, int8)).item<bool>());
    CHECK(array_equal(shifted_right, array({63, -64}, int8)).item<bool>());
  }

  // Boolean operands produce uint8 results.
  {
    array bool_values = full({4}, true, bool_);
    array bool_shifts = full({4}, true, bool_);
    auto shifted_left = left_shift(bool_values, bool_shifts);
    auto shifted_right = right_shift(bool_values, bool_shifts);

    CHECK_EQ(shifted_left.dtype(), uint8);
    CHECK(array_equal(shifted_left, full({4}, 2, uint8)).item<bool>());

    CHECK_EQ(shifted_right.dtype(), uint8);
    CHECK(array_equal(shifted_right, full({4}, 0, uint8)).item<bool>());
  }
}

// Checks conv_transpose1d() with stride 2 and an output padding of 1.
TEST_CASE("test conv_transpose1d with output_padding") {
  auto input = array({1.0, 2.0, 3.0}, {1, 1, 3});
  auto weights = array({1.0, 1.0, 1.0}, {1, 1, 3});

  const int stride = 2;
  const int padding = 0;
  const int dilation = 1;
  const int output_padding = 1;
  const int groups = 1;

  auto want = array({6.0, 0.0}, {1, 2, 1});
  auto got = conv_transpose1d(
      input, weights, stride, padding, dilation, output_padding, groups);

  CHECK(array_equal(got, want).item<bool>());
}

// Checks conv_transpose2d() with stride (2, 2) and an output padding of
// (1, 1).
TEST_CASE("test conv_transpose2d with output_padding") {
  auto input = array({1.0, 2.0, 3.0, 4.0}, {1, 1, 2, 2});
  auto weights = array({1.0, 1.0, 1.0, 1.0}, {2, 1, 1, 2});

  std::pair<int, int> stride{2, 2};
  std::pair<int, int> padding{0, 0};
  std::pair<int, int> dilation{1, 1};
  std::pair<int, int> output_padding{1, 1};
  const int groups = 1;

  // Laid out as (1, 2, 4, 2): only the first and third rows of the first
  // plane are non-zero.
  auto want = array(
      {3.0, 3.0,
       0.0, 0.0,
       7.0, 7.0,
       0.0, 0.0,

       0.0, 0.0,
       0.0, 0.0,
       0.0, 0.0,
       0.0, 0.0},
      {1, 2, 4, 2});
  auto got = conv_transpose2d(
      input, weights, stride, padding, dilation, output_padding, groups);

  CHECK(array_equal(got, want).item<bool>());
}

// Checks conv_transpose3d() with stride (2, 2, 2) and an output padding of
// (1, 1, 1).
TEST_CASE("test conv_transpose3d with output_padding") {
  auto input =
      array({1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, {1, 1, 2, 2, 2});
  auto weights = array({1.0, 1.0}, {1, 1, 1, 1, 2});

  std::tuple<int, int, int> stride{2, 2, 2};
  std::tuple<int, int, int> padding{0, 0, 0};
  std::tuple<int, int, int> dilation{1, 1, 1};
  std::tuple<int, int, int> output_padding{1, 1, 1};
  const int groups = 1;

  // Laid out as (1, 2, 4, 4, 1): one line per row of four values.
  auto want = array(
      {3.0, 0.0, 7.0, 0.0,
       0.0, 0.0, 0.0, 0.0,
       11.0, 0.0, 15.0, 0.0,
       0.0, 0.0, 0.0, 0.0,

       0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0},
      {1, 2, 4, 4, 1});
  auto got = conv_transpose3d(
      input, weights, stride, padding, dilation, output_padding, groups);

  CHECK(array_equal(got, want).item<bool>());
}

// Checks the to_fp8()/from_fp8() round trip: values that must survive
// unchanged, noisy values that must land on the expected neighbours, and
// out-of-range values.
TEST_CASE("test fp8 conversion") {
  // These values round-trip unchanged for every floating point dtype.
  for (auto dt : {float32, float16, bfloat16}) {
    array exact({-1.125, -1.0, 0.0, 1.0, 1.125, 4.5, 448.0}, dt);
    auto restored = from_fp8(to_fp8(exact), dt);
    CHECK(array_equal(restored, exact).item<bool>());
  }

  // Slightly perturbed inputs come back as the unperturbed values.
  {
    array exact({-1.125, -1.0, 0.0, 1.0, 1.125, 4.5, 448.0});
    array noisy({-1.135, -1.01, 0.0001, 1.01, 1.135, 4.6, 447.0});
    auto restored = from_fp8(to_fp8(noisy), float32);
    CHECK(array_equal(restored, exact).item<bool>());
  }

  // Overflow
  {
    auto too_large = array({-600.0, 600.0});
    auto restored = from_fp8(to_fp8(too_large), float32);
    auto want = array({-448.0f, 448.0f});
    CHECK(array_equal(restored, want, true).item<bool>());
  }
}

// Checks that maximum() and minimum() return NaN wherever either operand is
// NaN. All comparisons treat NaNs as equal.
TEST_CASE("test max min with nan") {
  auto nan_aware_match = [](const array& got, const array& want) {
    return array_equal(got, want, true).item<bool>();
  };

  // Test maximum and minimum with NaN values
  {
    auto lhs = array({0.0f, 1.0f, NAN, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f});
    auto rhs = array({NAN, 1.0f, NAN, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f});
    auto want_max = array({NAN, 1.0f, NAN, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f});
    auto want_min = array({NAN, 1.0f, NAN, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f});

    auto got_max = maximum(lhs, rhs);
    auto got_min = minimum(lhs, rhs);
    CHECK(nan_aware_match(got_max, want_max));
    CHECK(nan_aware_match(got_min, want_min));
  }

  // Test with all NaN values
  {
    auto lhs = array({NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN});
    auto rhs = array({NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN});
    auto got_max = maximum(lhs, rhs);
    auto got_min = minimum(lhs, rhs);

    auto want = array({NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN});
    CHECK(nan_aware_match(got_max, want));
    CHECK(nan_aware_match(got_min, want));
  }
}


================================================
FILE: tests/random_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include <numeric>

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test random key") {
  // A key is a pair of 32-bit words: the upper and lower halves of the seed.
  const int64_t bit32 = static_cast<int64_t>(1) << 32;

  struct KeyCase {
    int64_t seed;
    int high_word;
    int low_word;
  };
  const KeyCase cases[] = {
      {0, 0, 0},
      {1, 0, 1},
      {bit32, 1, 0},
      {bit32 + 1, 1, 1},
  };

  for (const auto& c : cases) {
    auto key = random::key(c.seed);
    CHECK(array_equal(key, array({c.high_word, c.low_word})).item<bool>());
  }
}

TEST_CASE("test global rng") {
  // Seeding the global generator with the same value must replay the same
  // sequence of draws.
  random::seed(4);
  auto first_draw = random::bits({});
  auto second_draw = random::bits({});

  random::seed(4);
  auto first_replay = random::bits({});
  auto second_replay = random::bits({});

  CHECK_EQ(first_draw.item<uint32_t>(), first_replay.item<uint32_t>());
  CHECK_EQ(second_draw.item<uint32_t>(), second_replay.item<uint32_t>());
}

TEST_CASE("test random split") {
  // Default split: one key in, two keys out.
  {
    auto [first_key, second_key] = random::split(random::key(0));
    CHECK(array_equal(first_key, array({4146024105u, 967050713u}))
              .item<bool>());
    CHECK(array_equal(second_key, array({2718843009u, 1272950319u}))
              .item<bool>());
  }

  // Splitting into three keys gives a (3, 2) array of key words.
  {
    auto want = array(
        {2467461003u,
         428148500u,
         3186719485u,
         3840466878u,
         2562233961u,
         1946702221u},
        {3, 2});
    auto got = random::split(random::key(0), 3);
    CHECK(array_equal(got, want).item<bool>());
  }
}

TEST_CASE("test random bits") {
  // Test shapes, types, and sizes
  {
    auto key = random::key(0);
    auto x = random::bits({}, key);
    CHECK_EQ(x.size(), 1);
    CHECK_EQ(x.dtype(), uint32);

    x = random::bits({0}, key);
    CHECK(array_equal(x, array({})).item<bool>());

    // Check wrong key type or shape. These call bits directly so that the
    // key validation of the function under test is what gets exercised.
    key = array({0, 0});
    CHECK_THROWS_AS(random::bits({}, key), std::invalid_argument);
    key = array({0, 0}, {1, 2});
    CHECK_THROWS_AS(random::bits({}, key), std::invalid_argument);
    key = array({0u, 0u, 0u}, {3, 1});
    CHECK_THROWS_AS(random::bits({}, key), std::invalid_argument);
    key = array({0u, 0u}, {2, 1});
    CHECK_THROWS_AS(random::bits({}, key), std::invalid_argument);
  }

  // Expected bits in the following tests were generated from
  // Jax's Threefry 2x32 implementation using the following in
  // python:
  //
  // ```
  //   import jax
  //   import jax.prng
  //   shape = (SET THIS)
  //   seed = (SET THIS)
  //   width = (SET THIS)
  //   key = jax.random.PRNGKey(seed)
  //   print(jax.prng.threefry_prng_impl.random_bits(key, width, shape))
  // ```

  // Scalar draws with key 0 at each supported width. The same key must give
  // the same bits on repeated calls.
  {
    auto key = random::key(0);
    auto x = random::bits({}, key);
    auto y = random::bits({}, key);
    CHECK_EQ(x.item<uint32_t>(), 1797259609u);
    CHECK_EQ(x.item<uint32_t>(), y.item<uint32_t>());

    x = random::bits({}, 2, key);
    CHECK_EQ(x.item<uint16_t>(), 345);

    x = random::bits({}, 1, key);
    CHECK_EQ(x.item<uint8_t>(), 89);
  }

  // Scalar draws with key 1, plus rejection of unsupported widths.
  {
    auto key = random::key(1);
    auto x = random::bits({}, key);
    CHECK_EQ(x.item<uint32_t>(), 507451445u);

    x = random::bits({}, 2, key);
    CHECK_EQ(x.item<uint16_t>(), 6197);

    x = random::bits({}, 1, key);
    CHECK_EQ(x.item<uint8_t>(), 53);

    CHECK_THROWS(random::bits({}, 0, key));
    CHECK_THROWS(random::bits({}, 5, key));
    CHECK_THROWS(random::bits({}, -1, key));
  }

  // Multi-element draws, including 16-bit outputs with odd and even counts.
  {
    auto key = random::key(0);
    auto x = random::bits({3, 1}, key);
    auto expected = array({4146024105u, 1351547692u, 2718843009u}, {3, 1});
    CHECK(array_equal(x, expected).item<bool>());

    x = random::bits({5}, 2, key);
    expected = array({20137, 63263, 64300, 20622, 16513}, uint16);
    CHECK(array_equal(x, expected).item<bool>());
    expected = array({20137, 63263, 64300, 20622, 16513, 41486}, uint16);
    x = random::bits({6}, 2, key);
    CHECK(array_equal(x, expected).item<bool>());
    expected = array({20137, 63263, 1497, 14756, 16513, 41486, 44591}, uint16);
    x = random::bits({7}, 2, key);
    CHECK(array_equal(x, expected).item<bool>());
    x = random::bits({8}, 2, key);
    expected =
        array({20137, 63263, 1497, 14756, 16513, 41486, 44591, 19423}, uint16);
    CHECK(array_equal(x, expected).item<bool>());
  }

  // Vmap over the key
  {
    auto key = array({0u, 0u, 1u, 1u}, {2, 2});
    auto shape = Shape{3};
    // shape is captured by reference so the "Vmap twice" cases below can
    // change the requested shape without redefining fn.
    auto fn = [&shape](array k) { return random::bits(shape, k); };

    auto expected = array(
        {4146024105u,
         1351547692u,
         2718843009u,
         3725146706u,
         1802982961u,
         1349634643u},
        {2, 3});
    CHECK(array_equal(vmap(fn)(key), expected).item<bool>());
    expected = array(
        {2441914641u,
         1110694964u,
         3819641963u,
         2441914641u,
         1110694964u,
         3819641963u},
        {2, 3});
    CHECK(array_equal(vmap(fn, 1)(key), expected).item<bool>());

    // Vmap twice
    key = array(
        {0u,
         0u,
         1u,
         1u,
         2u,
         2u,

         3u,
         3u,
         4u,
         4u,
         5u,
         5u},
        {3, 2, 2});
    shape = {2};
    auto out = vmap(vmap(fn))(key);
    expected = array(
        {928981903u,
         3453687069u,
         3606183818u,
         460005496u,

         2799733733u,
         856293553u,
         4081856343u,
         3445925136u,

         2775548010u,
         1430281703u,
         305173070u,
         2615843348u},
        {3, 2, 2});
    CHECK(array_equal(out, expected).item<bool>());

    out = vmap(vmap(fn, 1), 0)(key);
    expected = array(
        {1948878966u,
         4237131848u,
         1948878966u,
         4237131848u,

         2531170506u,
         1858648356u,
         2531170506u,
         1858648356u,

         740561898u,
         4234094099u,
         740561898u,
         4234094099u},
        {3, 2, 2});
    CHECK(array_equal(out, expected).item<bool>());
  }

  // Vmap smaller type: each row of the vmapped output must match a direct
  // call with the corresponding key.
  {
    auto key = array({0u, 0u, 1u, 1u}, {2, 2});
    auto fn = [](array k) { return random::bits({5}, 2, k); };

    auto out = vmap(fn)(key);
    auto x1 = random::bits({5}, 2, take(key, array(0), 0));
    auto x2 = random::bits({5}, 2, take(key, array(1), 0));

    CHECK(array_equal(take(out, array(0), 0), x1).item<bool>());
    CHECK(array_equal(take(out, array(1), 0), x2).item<bool>());
  }
}

TEST_CASE("test random uniform") {
  // Test shapes, types, and sizes
  {
    auto x = random::uniform({});
    CHECK_EQ(x.size(), 1);
    CHECK_EQ(x.dtype(), float32);

    x = random::uniform({}, float16);
    CHECK_EQ(x.size(), 1);
    CHECK_EQ(x.dtype(), float16);

    x = random::uniform({0});
    CHECK(array_equal(x, array({})).item<bool>());

    // Non float type throws
    CHECK_THROWS_AS(random::uniform({}, int32), std::invalid_argument);

    // dtype respected
    x = random::uniform(-.1, .1, {0}, bfloat16);
    CHECK_EQ(x.dtype(), bfloat16);

    // Check broadcasting
    x = random::uniform(zeros({3, 1}), ones({1, 3}), {3, 3});
    CHECK_EQ(x.shape(), Shape{3, 3});
    CHECK_THROWS_AS(
        random::uniform(zeros({3, 3}), 1.0, {1, 3}), std::invalid_argument);
    CHECK_THROWS_AS(
        random::uniform(zeros({3, 3}), 1.0, {2, 3}), std::invalid_argument);
    CHECK_THROWS_AS(
        random::uniform(zeros({3, 1}), ones({1, 3}), {1, 3}),
        std::invalid_argument);

    // Check wrong key type or shape
    auto key = array({0, 0});
    CHECK_THROWS_AS(random::uniform({}, key), std::invalid_argument);
    key = array({0, 0}, {1, 2});
    CHECK_THROWS_AS(random::uniform({}, key), std::invalid_argument);
    key = array({0u, 0u, 0u}, {3, 1});
    CHECK_THROWS_AS(random::uniform({}, key), std::invalid_argument);
    key = array({0u, 0u}, {2, 1});
    CHECK_THROWS_AS(random::uniform({}, key), std::invalid_argument);
  }

  // Expected bits in the following tests were generated from
  // Jax's Threefry 2x32 implementation using the following in
  // python:
  //
  // ```
  //   import jax
  //   import jax.prng
  //   shape = (SET THIS)
  //   seed = (SET THIS)
  //   key = jax.random.PRNGKey(seed)
  //   print(jax.prng.threefry_prng_impl.random_bits(key, 32, shape))
  // ```

  // Maps raw 32-bit output to the float the uniform sampler is expected to
  // produce from it.
  constexpr auto to_float = [](uint32_t n) {
    return static_cast<float>(n) / UINT32_MAX;
  };

  {
    auto key = random::key(0);
    auto x = random::uniform({}, key);
    auto y = random::uniform({}, key);
    auto expected = to_float(1797259609);
    CHECK_EQ(x.item<float>(), expected);
    CHECK_EQ(x.item<float>(), y.item<float>());
  }

  {
    auto key = random::key(1);
    auto x = random::uniform({}, key);
    auto expected = to_float(507451445);
    CHECK_EQ(x.item<float>(), expected);
  }

  {
    auto key = random::key(0);
    auto x = random::uniform({3, 1}, key);
    auto expected = array(
        {to_float(4146024105), to_float(1351547692), to_float(2718843009)},
        {3, 1});
    CHECK(array_equal(x, expected).item<bool>());
  }

  // Check vmap
  {
    auto key = random::key(0);
    auto fun = [](array k, array low) {
      return random::uniform(low, 1, {3}, float32, k);
    };
    auto out = vmap(fun, -1)(key, zeros({2, 3}));
    CHECK_EQ(out.shape(), Shape{2, 3});

    key = zeros({2, 2}, uint32);
    out = vmap(fun)(key, zeros({2, 3}));
    CHECK_EQ(out.shape(), Shape{2, 3});
  }

  // Check bounds are respected
  {
    auto key = random::key(128291);
    auto out = random::uniform(array(-1.0f), array(1.0f), {100}, float32, key);
    CHECK(all(less(out, array(1.0f))).item<bool>());
    CHECK(all(greater_equal(out, array(-1.0f))).item<bool>());
  }

  // Check float16
  {
    auto key = random::key(0);
    auto out = random::uniform({1000}, float16, key);
    CHECK_EQ(out.dtype(), float16);
    CHECK(all(less(out, array(1.0f))).item<bool>());
    CHECK(all(greater_equal(out, array(0.0f))).item<bool>());
    CHECK(!all(equal(out, array(0.0f))).item<bool>());
    // std::abs is spelled out: an unqualified abs on a float can resolve to
    // the C int overload, which would truncate and make this check vacuous.
    CHECK(std::abs(float(mean(out).item<float16_t>()) - 0.5f) < 0.02);
  }

  // Check bfloat16
  {
    auto key = random::key(0);
    auto out = random::uniform({1000}, bfloat16, key);
    CHECK_EQ(out.dtype(), bfloat16);
    CHECK(all(less(out, array(1.0f))).item<bool>());
    CHECK(all(greater_equal(out, array(0.0f))).item<bool>());
    CHECK(!all(equal(out, array(0.0f))).item<bool>());
    CHECK(std::abs(float(mean(out).item<bfloat16_t>()) - 0.5f) < 0.02);
  }
}

TEST_CASE("test random normal") {
  // Test shapes, types, and sizes
  {
    auto x = random::normal({});
    CHECK_EQ(x.size(), 1);
    CHECK_EQ(x.dtype(), float32);

    // Empty shape: exercise normal itself rather than uniform.
    x = random::normal({0});
    CHECK(array_equal(x, array({})).item<bool>());

    // Non float type throws
    CHECK_THROWS_AS(random::normal({}, int32), std::invalid_argument);

    // Check wrong key type or shape
    auto key = array({0, 0});
    CHECK_THROWS_AS(random::normal({}, key), std::invalid_argument);
    key = array({0, 0}, {1, 2});
    CHECK_THROWS_AS(random::normal({}, key), std::invalid_argument);
    key = array({0u, 0u, 0u}, {3, 1});
    CHECK_THROWS_AS(random::normal({}, key), std::invalid_argument);
    key = array({0u, 0u}, {2, 1});
    CHECK_THROWS_AS(random::normal({}, key), std::invalid_argument);
  }

  // Samples must be finite and have a mean close to zero. std::abs is spelled
  // out for the scalar checks: an unqualified abs on a float can resolve to
  // the C int overload, which would truncate and make the check vacuous.
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    auto key = random::key(128291);
    auto out = random::normal({100}, key);
    CHECK(all(less(abs(out), array(inf))).item<bool>());
    CHECK(std::abs(mean(out).item<float>()) < 0.1);
  }

  // Same checks for float16
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    auto key = random::key(128291);
    auto out = random::normal({200}, float16, key);
    CHECK_EQ(out.dtype(), float16);
    CHECK(all(less(abs(out), array(inf))).item<bool>());
    CHECK(std::abs(float(mean(out).item<float16_t>())) < 0.1);
  }

  // Same checks for bfloat16
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    auto key = random::key(128291);
    auto out = random::normal({200}, bfloat16, key);
    CHECK_EQ(out.dtype(), bfloat16);
    CHECK(all(less(abs(out), array(inf))).item<bool>());
    CHECK(std::abs(float(mean(out).item<bfloat16_t>())) < 0.1);
  }
}

TEST_CASE("test random multivariate_normal") {
  // Scope switch to the cpu for SVDs
  StreamContext sc(Device::cpu);

  // Identity covariance: output shape is the requested shape plus the
  // event dimension.
  {
    auto mu = zeros({3});
    auto sigma = eye(3);
    auto draws = random::multivariate_normal(mu, sigma, {1000}, float32);
    CHECK_EQ(draws.shape(), Shape{1000, 3});
    CHECK_EQ(draws.dtype(), float32);
  }

  // Limit case
  {
    auto mu = array({0, 0});
    auto sigma = reshape(array({1., -1, -.1, 1.}), {2, 2});
    auto draws = random::multivariate_normal(mu, sigma, {1}, float32);
    CHECK_EQ(draws.shape(), Shape{1, 2});
    CHECK_EQ(draws.dtype(), float32);
  }

  // Check wrong shapes

  // A (3, 1) mean is rejected.
  {
    auto mu = zeros({3, 1});
    auto sigma = eye(3);
    CHECK_THROWS_AS(
        random::multivariate_normal(mu, sigma, {1000}, float32),
        std::invalid_argument);
  }
  // A covariance with leading batch dimensions is accepted.
  {
    auto mu = zeros({3});
    auto sigma = zeros({1, 2, 3, 3});
    auto draws = random::multivariate_normal(mu, sigma, {1000, 2}, float32);
    CHECK_EQ(draws.shape(), Shape{1000, 2, 3});
  }
  // A 3-element mean with a 4x4 covariance is rejected.
  {
    auto mu = zeros({3});
    auto sigma = eye(4);
    CHECK_THROWS_AS(
        random::multivariate_normal(mu, sigma, {1000, 3}, float32),
        std::invalid_argument);
  }

  // Check wrong type
  {
    auto mu = zeros({3});
    auto sigma = eye(3);
    CHECK_THROWS_AS(
        random::multivariate_normal(mu, sigma, {1000, 3}, float16),
        std::invalid_argument);
  }
}

TEST_CASE("test random randint") {
  // Floating point output types are rejected.
  CHECK_THROWS_AS(
      random::randint(array(3), array(5), {1}, float32), std::invalid_argument);

  // The requested integer or bool dtype is respected.
  auto x = random::randint(0, 10, {}, uint32);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.dtype(), uint32);

  x = random::randint(0, 2, {}, bool_);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.dtype(), bool_);

  x = random::randint(0, 2, {}, int32);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.dtype(), int32);

  x = random::randint(0, 2, {}, int64);
  CHECK_EQ(x.size(), 1);
  CHECK_EQ(x.dtype(), int64);

  // Check all in bounds
  auto low = -10.0;
  auto high = 20.0;
  x = random::randint(low, high, {1000, 1000});
  CHECK((all(low <= x).item<bool>() && all(x < high).item<bool>()));

  // Check high < low => all equals to low
  low = 20.0;
  high = -10.0;
  x = random::randint(low, high, {3, 3});
  CHECK(all(equal(x, array(low))).item<bool>());

  // Check wrong key type or shape. A valid integer dtype is requested so
  // that the malformed key, not the dtype, is what triggers the exception.
  auto key = array({0, 0}, {1, 2});
  CHECK_THROWS_AS(
      random::randint(low, high, {}, int32, key), std::invalid_argument);
}

TEST_CASE("test random bernoulli") {
  // With no arguments a single bool is drawn.
  auto draw = random::bernoulli();
  CHECK_EQ(draw.size(), 1);
  CHECK_EQ(draw.dtype(), bool_);

  // Bernoulli parameter can have floating point type
  draw = random::bernoulli(array(0.5, float16));
  CHECK_EQ(draw.size(), 1);
  CHECK_EQ(draw.dtype(), bool_);

  // An integer-typed parameter is rejected.
  CHECK_THROWS(random::bernoulli(array(1, int32)));

  // Negative numbers allowed in Jax
  draw = random::bernoulli(array(-1.0));
  CHECK_FALSE(draw.item<bool>());

  // A parameter above one always yields true.
  draw = random::bernoulli(array(5.0));
  CHECK(draw.item<bool>());

  // Return array with correct shape
  draw = random::bernoulli(0.5, {3, 3});
  CHECK_EQ(draw.shape(), Shape{3, 3});

  // Try with p = {}
  draw = random::bernoulli(array({}));
  CHECK_EQ(draw.size(), 0);

  // Try broadcasting
  auto probs = reshape(array({0.1, 0.2, 0.3}), {1, 3});
  draw = random::bernoulli(probs, {4, 3});
  CHECK_EQ(draw.shape(), Shape{4, 3});

  // An empty parameter cannot be broadcast to a non-empty shape.
  CHECK_THROWS_AS(random::bernoulli(array({}), {3, 3}), std::invalid_argument);

  // Ask for the wrong shape => throws
  probs = array({0.1, 0.2, 0.3});
  CHECK_THROWS_AS(random::bernoulli(probs, Shape{2}), std::invalid_argument);

  // Check wrong key type or shape
  auto bad_key = array({0, 0}, {1, 2});
  CHECK_THROWS_AS(
      random::bernoulli(array(0.5), bad_key), std::invalid_argument);
}

TEST_CASE("Test truncated normal") {
  // Scalar bounds shared by most of the checks below.
  const auto lo = array(-2.0);
  const auto hi = array(2.0);

  // Defaults: a single float32 sample.
  auto sample = random::truncated_normal(lo, hi);
  CHECK_EQ(sample.size(), 1);
  CHECK_EQ(sample.dtype(), float32);

  // Requested dtype
  sample = random::truncated_normal(lo, hi, {}, float16);
  CHECK_EQ(sample.size(), 1);
  CHECK_EQ(sample.dtype(), float16);

  // Requested shape
  sample = random::truncated_normal(lo, hi, {3, 4});
  CHECK_EQ(sample.shape(), Shape{3, 4});

  // Empty array
  sample = random::truncated_normal(array({}), array({}));
  CHECK_EQ(sample.size(), 0);

  // Broadcast
  auto lower = reshape(array({-2.0, -3.0}), {1, 2});
  auto higher = reshape(array({0.0, 3.0, 1.5}), {3, 1});
  sample = random::truncated_normal(lower, higher);

  // All in bounds
  CHECK_EQ(sample.shape(), Shape{3, 2});
  CHECK((
      all(sample <= higher).item<bool>() && all(lower <= sample).item<bool>()));

  // high < low => all equal to low
  sample = random::truncated_normal(hi, lo);
  CHECK(all(sample == hi).item<bool>());

  // Non broadcastable => throws
  CHECK_THROWS_AS(
      random::truncated_normal(lower, higher, {4, 2}), std::invalid_argument);

  // A key with the wrong type or shape is rejected.
  auto bad_key = array({0, 0}, {1, 2});
  CHECK_THROWS_AS(
      random::truncated_normal(lo, hi, {1, 1}, float32, bad_key),
      std::invalid_argument);
}

TEST_CASE("test categorical") {
  using random::categorical;

  auto logits = zeros({10, 20});

  // Invalid axes
  for (int bad_axis : {2, -3}) {
    CHECK_THROWS(categorical(logits, bad_axis));
  }

  // Invalid requested shapes
  CHECK_THROWS(categorical(logits, 1, Shape{1}));
  CHECK_THROWS(categorical(logits, 1, Shape{11}));
  CHECK_THROWS(categorical(logits, 1, {10, 1}));

  // The sampled axis is removed from the output shape.
  CHECK_EQ(categorical(logits, -1).shape(), Shape{10});
  CHECK_EQ(categorical(logits, 0).shape(), Shape{20});
  CHECK_EQ(categorical(logits, 1).shape(), Shape{10});

  // Default axis: indices are uint32 and below the number of categories.
  {
    auto picks = categorical(logits);
    CHECK_EQ(picks.shape(), Shape{10});
    CHECK_EQ(picks.dtype(), uint32);
    CHECK(max(picks).item<uint32_t>() < 20);
  }

  // Explicit output shape when sampling along axis 0.
  {
    auto picks = categorical(logits, 0, {5, 20});
    CHECK_EQ(picks.shape(), Shape{5, 20});
    CHECK(max(picks).item<uint32_t>() < 10);
  }

  // Infinite logits make the outcome deterministic.
  float inf = std::numeric_limits<float>::infinity();
  logits = array({1.0f, -2.0f, inf, 4.0f, 3.0f});
  CHECK_EQ(categorical(logits).item<uint32_t>(), 2);

  logits = array({-inf, -2.0f, -inf, -inf});
  CHECK_EQ(categorical(logits).item<uint32_t>(), 1);

  // With a sample count, the count becomes the trailing output dimension.
  logits = zeros({5, 4, 3});
  CHECK_EQ(categorical(logits, -1, 7).shape(), Shape{5, 4, 7});
  CHECK_EQ(categorical(logits, -2, 7).shape(), Shape{5, 3, 7});
  CHECK_EQ(categorical(logits, -3, 7).shape(), Shape{4, 3, 7});
}

TEST_CASE("test laplace") {
  // Test shapes, types, and sizes
  {
    auto x = random::laplace({});
    CHECK_EQ(x.size(), 1);
    CHECK_EQ(x.dtype(), float32);

    // Non float type throws
    CHECK_THROWS_AS(random::laplace({}, int32), std::invalid_argument);

    // Check wrong key type or shape
    auto key = array({0, 0});
    CHECK_THROWS_AS(random::laplace({}, key), std::invalid_argument);
    key = array({0, 0}, {1, 2});
    CHECK_THROWS_AS(random::laplace({}, key), std::invalid_argument);
    key = array({0u, 0u, 0u}, {3, 1});
    CHECK_THROWS_AS(random::laplace({}, key), std::invalid_argument);
    key = array({0u, 0u}, {2, 1});
    CHECK_THROWS_AS(random::laplace({}, key), std::invalid_argument);
  }

  // Statistical checks on a large float32 sample.
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    auto key = random::key(128291);
    auto out = random::laplace({1000000}, key);
    float sample_mean = mean(out).item<float>();
    float sample_variance = var(out).item<float>();

    CHECK(all(less(abs(out), array(inf))).item<bool>());
    // std::abs is spelled out for scalar checks, matching the variance and
    // kurtosis checks below: an unqualified abs on a float can resolve to the
    // C int overload, which would truncate and make the check vacuous.
    CHECK(std::abs(sample_mean) < 0.1);

    // Chebyshev's inequality.
    for (int k = 1; k <= 5; ++k) {
      float prob_above =
          mean(greater_equal(out, array(k * std::sqrt(sample_variance))))
              .item<float>();
      float bound = 1 / std::pow(k, 2);
      CHECK(prob_above < bound);
    }

    // Expected variance for Laplace distribution is 2*scale^2.
    float expected_variance = 2.0;
    CHECK(std::abs(sample_variance - expected_variance) < 0.01);

    // Expected excess kurtosis of the Laplace distribution is 3. The sample
    // value is the fourth central moment over the squared variance, minus 3.
    array fourth_pows = power(out - sample_mean, array(4));
    float sample_excess_kurtosis =
        mean(fourth_pows).item<float>() / std::pow(sample_variance, 2) - 3;
    float expected_excess_kurtosis = 3.0;
    CHECK(
        std::abs(sample_excess_kurtosis - expected_excess_kurtosis) < 0.1);
  }

  // float16 samples: finite, with a mean close to zero.
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    auto key = random::key(128291);
    auto out = random::laplace({10000}, float16, key);
    CHECK_EQ(out.dtype(), float16);
    CHECK(all(less(abs(out), array(inf))).item<bool>());
    CHECK(std::abs(float(mean(out).item<float16_t>())) < 0.1);
  }

  // bfloat16 samples: finite, with a mean close to zero.
  {
    constexpr float inf = std::numeric_limits<float>::infinity();
    auto key = random::key(128291);
    auto out = random::laplace({10000}, bfloat16, key);
    CHECK_EQ(out.dtype(), bfloat16);
    CHECK(all(less(abs(out), array(inf))).item<bool>());
    CHECK(std::abs(float(mean(out).item<bfloat16_t>())) < 0.1);
  }
}


================================================
FILE: tests/scheduler_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"
#include "mlx/scheduler.h"

using namespace mlx::core;

TEST_CASE("test stream management") {
  // The default stream and a freshly created one both live on the default
  // device but are distinct streams.
  auto default_s = default_stream(default_device());
  CHECK_EQ(default_s.device, default_device());

  auto fresh_s = new_stream(default_device());
  CHECK_EQ(fresh_s.device, default_device());
  CHECK_NE(default_s, fresh_s);

  const bool has_gpu = gpu::is_available();

  // Check that default streams have the correct devices
  if (!has_gpu) {
    CHECK_THROWS_AS(default_stream(Device::gpu), std::invalid_argument);
  } else {
    auto gpu_s = default_stream(Device::gpu);
    CHECK_EQ(gpu_s.device, Device::gpu);
  }
  auto cpu_s = default_stream(Device::cpu);
  CHECK_EQ(cpu_s.device, Device::cpu);

  // New streams also report the device they were created on.
  cpu_s = new_stream(Device::cpu);
  CHECK_EQ(cpu_s.device, Device::cpu);

  if (!has_gpu) {
    CHECK_THROWS_AS(new_stream(Device::gpu), std::invalid_argument);
  } else {
    auto gpu_s = new_stream(Device::gpu);
    CHECK_EQ(gpu_s.device, Device::gpu);
  }
}

TEST_CASE("test get streams") {
  // Scans the whole list and reports whether any entry equals the target.
  auto contains = [](const auto& list, const auto& target) {
    bool seen = false;
    for (const auto& candidate : list) {
      seen = seen || (candidate == target);
    }
    return seen;
  };

  auto known = get_streams();

  // At least the default CPU stream exists
  CHECK(known.size() >= 1);

  // All default streams should be in the list
  auto cpu_default = default_stream(Device::cpu);
  bool found_cpu = contains(known, cpu_default);
  CHECK(found_cpu);

  // New streams show up
  auto cpu_fresh = new_stream(Device::cpu);
  known = get_streams();
  bool found_new = contains(known, cpu_fresh);
  CHECK(found_new);
}

// Enqueues two dependent tasks on two different CPU streams, with the
// dependent task submitted first, and checks that both complete in the
// required order.
TEST_CASE("test asynchronous launch") {
  auto s1 = default_stream(Device::cpu);
  auto s2 = new_stream(Device::cpu);

  // Make sure streams execute asynchronously
  int x = 1;
  // p1/f1 signal that fn1 has finished; p2/f2 signal that fn2 has finished.
  auto p1 = std::make_shared<std::promise<void>>();
  auto p2 = std::make_shared<std::promise<void>>();
  auto f1 = p1->get_future().share();
  auto f2 = p2->get_future().share();
  // fn1: increment x, then signal completion.
  auto fn1 = [&x, p = std::move(p1)]() {
    x++;
    p->set_value();
  };
  // fn2: block until fn1 has signalled, multiply x, then signal completion.
  auto fn2 = [&x, p = std::move(p2), f = std::move(f1)]() {
    f.wait();
    x *= 5;
    p->set_value();
  };

  // fn2 is launched first and is waiting on fn1 but since
  // they are on different streams there is no deadlock.
  scheduler::enqueue(s2, std::move(fn2));
  scheduler::enqueue(s1, std::move(fn1));

  // Wait for fn2, which itself cannot finish before fn1 has run.
  f2.wait();

  // (1 + 1) * 5: the increment must have happened before the multiply.
  CHECK_EQ(x, 10);
}

// Blocks stream 1 with a task waiting on a future, then runs work that is
// explicitly placed on stream 2. The future is only fulfilled after that
// work has been evaluated.
TEST_CASE("test stream placement") {
  auto s1 = default_stream(Device::cpu);
  auto s2 = new_stream(Device::cpu);

  {
    // Wait on stream 1
    auto p = std::make_shared<std::promise<void>>();
    auto f = p->get_future().share();
    scheduler::enqueue(s1, [f = std::move(f)]() { f.wait(); });

    // Do some work on stream 2
    auto x = zeros({100}, float32, s2);
    auto y = ones({100}, float32, s2);
    auto z = add(x, y, s2);
    eval(z);
    // Release the task blocking stream 1 only after the eval has returned.
    p->set_value();
  }

  {
    // Wait on stream 1
    auto p = std::make_shared<std::promise<void>>();
    auto f = p->get_future().share();
    scheduler::enqueue(s1, [f = std::move(f)]() { f.wait(); });

    // Do some work on stream 2
    auto fn = [&s2](array a) { return add(a, add(a, a, s2), s2); };
    auto x = zeros({100}, s2);

    // The whole vjp computation should happen
    // on the second stream otherwise this will hang.
    auto [out, dout] = vjp(fn, x, ones({100}, s2));

    // The whole jvp computation should happen on the
    // second stream.
    std::tie(out, dout) = jvp(fn, x, ones({100}, s2));
    // Only the jvp results assigned just above are evaluated here.
    eval(out, dout);

    // Release the task blocking stream 1 only after the eval has returned.
    p->set_value();
  }
}

TEST_CASE("test scheduler races") {
  // Start from two evaluated inputs of different sizes.
  auto small = zeros({1});
  auto large = zeros({100});
  eval(small, large);

  // Evaluate one op on the small input, then rebuild it unevaluated.
  auto small_exp = exp(small);
  eval(small_exp);
  small_exp = exp(small);

  // Build a long chain of ops on the large input.
  for (int step = 0; step != 10000; ++step) {
    large = exp(large);
  }

  // Evaluate the short graph and the long chain together.
  eval(small_exp, large);
}


================================================
FILE: tests/test_teardown.cpp
================================================
// Copyright © 2026 Apple Inc.
//
// Regression test for https://github.com/ml-explore/mlx/issues/3126
// Verifies that the process exits cleanly when a background thread is
// performing GPU work and the main thread exits.

#include <chrono>
#include <iostream>
#include <thread>

#include "mlx/mlx.h"

namespace mx = mlx::core;

int main() {
  using namespace std::chrono_literals;

  // Work for the background thread: a long chain of matmuls that is still
  // running when the main thread returns.
  auto background_work = [] {
    constexpr int kSize = 2048;
    constexpr int kSteps = 1000;
    constexpr int kEvalEvery = 10;

    auto mat = mx::random::normal({kSize, kSize});
    std::cout << "START" << std::endl;
    for (int step = 0; step < kSteps; ++step) {
      mat = mx::matmul(mat, mat);
      // Eval periodically to avoid building a huge graph
      if (step % kEvalEvery == 0) {
        mx::eval(mat);
        std::cout << "Step " << step << std::endl;
      }
    }
    mx::eval(mat);
    std::cout << "Done: " << mat.shape(0) << "x" << mat.shape(1) << std::endl;
  };

  std::thread worker(background_work);

  // Give the worker time to start, then leave it running while main returns.
  std::this_thread::sleep_for(1s);
  worker.detach();
  std::cout << "Main thread exiting." << std::endl;
  return 0;
}


================================================
FILE: tests/tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#define DOCTEST_CONFIG_IMPLEMENT
#include "doctest/doctest.h"

#include <cstdlib>

#include "mlx/mlx.h"

using namespace mlx::core;

int main(int argc, char** argv) {
  doctest::Context context;

  // DEVICE=cpu forces the CPU; otherwise the GPU is used when available.
  const char* requested = std::getenv("DEVICE");
  const bool force_cpu =
      requested != nullptr && std::string(requested) == "cpu";

  if (force_cpu) {
    set_default_device(Device::cpu);
  } else if (is_available(Device::gpu)) {
    // Use generic GPU availability check (works for Metal on macOS, or CUDA on
    // Linux/Windows)
    set_default_device(Device::gpu);
  }

  // Hand the command line to doctest and report its result as the exit code.
  context.applyCommandLine(argc, argv);
  return context.run();
}


================================================
FILE: tests/utils_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test type promotion") {
  // Inputs that all share one type promote to that type.
  for (auto dtype : {bool_, uint32, int32, int64, float32}) {
    auto scalar = array(0, dtype);
    CHECK_EQ(result_type({scalar}), dtype);

    std::vector<array> same_typed = {array(0, dtype), array(0, dtype)};
    CHECK_EQ(result_type(same_typed), dtype);
  }

  // bool with int32 promotes to int32.
  std::vector<array> bool_and_int = {array(false), array(0, int32)};
  CHECK_EQ(result_type(bool_and_int), int32);

  // Adding a float32 to the mix promotes to float32.
  std::vector<array> with_float = {
      array(0, int32), array(false), array(0.0f)};
  CHECK_EQ(result_type(with_float), float32);
}

TEST_CASE("test normalize axis") {
  constexpr int ndim = 3;

  // In-range non-negative axes map to themselves.
  for (int axis = 0; axis < ndim; ++axis) {
    CHECK_EQ(normalize_axis_index(axis, ndim), axis);
  }

  // In-range negative axes count back from the last dimension.
  for (int axis = -1; axis >= -ndim; --axis) {
    CHECK_EQ(normalize_axis_index(axis, ndim), axis + ndim);
  }

  // One step past either end of the valid range throws.
  CHECK_THROWS(normalize_axis_index(3, 3));
  CHECK_THROWS(normalize_axis_index(-4, 3));
}

TEST_CASE("test finfo") {
  const auto f32_info = finfo(float32);
  const auto c64_info = finfo(complex64);
  const auto f16_info = finfo(float16);

  // complex64 reports the limits of float32.
  CHECK_EQ(f32_info.dtype, float32);
  CHECK_EQ(c64_info.dtype, float32);
  CHECK_EQ(f16_info.dtype, float16);

  CHECK_EQ(f32_info.min, std::numeric_limits<float>::lowest());
  CHECK_EQ(f32_info.max, std::numeric_limits<float>::max());
  CHECK_EQ(c64_info.min, std::numeric_limits<float>::lowest());
  CHECK_EQ(c64_info.max, std::numeric_limits<float>::max());

  CHECK_EQ(f16_info.min, -65504);
  CHECK_EQ(f16_info.max, 65504);
}

TEST_CASE("test iinfo") {
  // The reported dtype matches the queried integer type.
  CHECK_EQ(iinfo(int8).dtype, int8);
  CHECK_EQ(iinfo(int64).dtype, int64);

  // Limits at the widest signed and unsigned types.
  CHECK_EQ(iinfo(int64).max, std::numeric_limits<int64_t>::max());
  CHECK_EQ(iinfo(uint64).max, std::numeric_limits<uint64_t>::max());
  CHECK_EQ(iinfo(uint64).min, 0);
  CHECK_EQ(iinfo(int64).min, std::numeric_limits<int64_t>::min());
}


================================================
FILE: tests/vmap_tests.cpp
================================================
// Copyright © 2023 Apple Inc.

#include "doctest/doctest.h"

#include "mlx/mlx.h"

using namespace mlx::core;

TEST_CASE("test simple vmap") {
  // Collapses an element-wise comparison of two arrays into a single bool.
  auto same = [](const array& lhs, const array& rhs) {
    return array_equal(lhs, rhs).item<bool>();
  };

  // Reshape under vmap.
  {
    auto as_2x2 = [](array input) { return reshape(input, {2, 2}); };
    auto as_flat4 = [](array input) { return reshape(input, {4}); };

    auto rows = zeros({3, 4});
    CHECK(same(vmap(as_2x2)(rows), zeros({3, 2, 2})));

    auto cube = array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 2, 2});

    // Default axes: map over input axis 0, stack results on output axis 0.
    auto want = array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 4});
    CHECK(same(vmap(as_flat4)(cube), want));

    // Map over input axis 1, results still stacked on output axis 0.
    want = array({0, 1, 4, 5, 2, 3, 6, 7}, {2, 4});
    CHECK(same(vmap(as_flat4, 1)(cube), want));

    // Map over input axis 1 and put the mapped axis at output position 1.
    want = array({0, 2, 1, 3, 4, 6, 5, 7}, {4, 2});
    CHECK(same(vmap(as_flat4, 1, 1)(cube), want));
  }

  // Broadcast under vmap.
  {
    auto widen = [](array input) { return broadcast_to(input, {4, 2}); };

    // Pairing a real axis with -1 between input and output is rejected when
    // the transform is built.
    CHECK_THROWS_AS(vmap(widen, 0, -1), std::invalid_argument);
    CHECK_THROWS_AS(vmap(widen, -1, 0), std::invalid_argument);

    auto tall = zeros({3, 2});
    CHECK(same(vmap(widen)(tall), zeros({3, 4, 2})));
    CHECK(same(vmap(widen, 0, 1)(tall), zeros({4, 3, 2})));
    CHECK(same(vmap(widen, 0, 2)(tall), zeros({4, 2, 3})));

    // Mapping a {2, 3} input over axis 0 leaves slices of length 3, and
    // broadcasting those to {4, 2} must fail at call time.
    auto wide = zeros({2, 3});
    auto over_rows = vmap(widen, 0, 2);
    CHECK_THROWS_AS(over_rows(wide), std::invalid_argument);

    // Mapping the same input over axis 1 leaves slices of length 2.
    CHECK(same(vmap(widen, 1)(wide), zeros({3, 4, 2})));
    CHECK(same(vmap(widen, 1, 1)(wide), zeros({4, 3, 2})));
    CHECK(same(vmap(widen, 1, 2)(wide), zeros({4, 2, 3})));
  }

  // Transpose under vmap.
  {
    auto flip = [](array input) { return transpose(input); };

    auto mat = array({0, 1, 2, 3, 4, 5}, {3, 2});
    CHECK(same(vmap(flip)(mat), mat));
    CHECK(same(vmap(flip, 0, 1)(mat), transpose(mat)));

    auto cube = array({0, 1, 2, 3, 4, 5, 6, 7}, {2, 2, 2});
    CHECK(same(vmap(flip)(cube), transpose(cube, {0, 2, 1})));
    CHECK(same(vmap(flip, 1, 1)(cube), transpose(cube, {2, 1, 0})));
    CHECK(same(vmap(flip, 2, 2)(cube), transpose(cube, {1, 0, 2})));

    // Two nested vmaps leave only the trailing two axes to be transposed.
    auto hyper = array(
        {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {2, 2, 2, 2});
    CHECK(same(vmap(vmap(flip))(hyper), transpose(hyper, {0, 1, 3, 2})));
  }

  // Binary add under vmap.
  {
    auto sum_pair = [](std::vector<array> inputs) {
      return std::vector<array>{add(inputs[0], inputs[1])};
    };

    array lhs({1.0, 2.0}, {2, 1});
    array rhs({2.0, 3.0}, {2, 1});
    auto total = vmap(sum_pair)({lhs, rhs})[0];
    CHECK(same(total, array({3.0, 5.0}, {2, 1})));

    // Each input is mapped over a different axis.
    lhs = ones({2, 1, 3});
    rhs = ones({3, 2});
    total = vmap(sum_pair, {2, 0})({lhs, rhs})[0];
    CHECK(same(total, full({3, 2, 2}, 2.0)));

    // First input is not mapped (-1).
    lhs = array(1.);
    rhs = ones({3, 2});
    total = vmap(sum_pair, {-1, 0})({lhs, rhs})[0];
    CHECK(same(total, full({3, 2}, 2.0)));

    // Second input is not mapped (-1).
    lhs = ones({3, 2});
    rhs = array(1.);
    total = vmap(sum_pair, {0, -1})({lhs, rhs})[0];
    CHECK(same(total, full({3, 2}, 2.0)));

    // A mapped input combined with an unmapped (-1) output is rejected.
    CHECK_THROWS_AS(vmap(sum_pair, {-1, 0}, {-1}), std::invalid_argument);
    CHECK_THROWS_AS(vmap(sum_pair, {0, -1}, {-1}), std::invalid_argument);

    // Nested vmap matches the plain element-wise sum.
    lhs = ones({3, 2, 1});
    rhs = ones({3, 2, 1});
    total = vmap(vmap(sum_pair))({lhs, rhs})[0];
    CHECK(same(total, lhs + rhs));
  }

  // Ternary where under vmap.
  {
    auto choose = [](std::vector<array> inputs) {
      return std::vector<array>{where(inputs[0], inputs[1], inputs[2])};
    };

    array mask({true, false}, {2, 1});
    array on_true({1.0, 2.0}, {2, 1});
    array on_false({2.0, 4.0}, {2, 1});
    auto picked = vmap(choose)({mask, on_true, on_false})[0];
    CHECK(same(picked, array({1.0, 4.0}, {2, 1})));

    // Every input is mapped over a different axis.
    mask = array({true, true, false}, {1, 3});
    on_true = ones({2, 1, 3});
    on_false = zeros({3, 2});
    picked = vmap(choose, {1, 2, 0})({mask, on_true, on_false})[0];
    CHECK(same(
        picked, array({1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}, {3, 2, 2})));

    // Same inputs, with the mapped axis placed at output position 1.
    picked = vmap(choose, {1, 2, 0}, {1})({mask, on_true, on_false})[0];
    CHECK(same(
        picked, array({1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0}, {2, 3, 2})));

    // Only the last input is mapped.
    mask = array({true, false});
    on_true = array(2.);
    on_false = ones({3, 2});
    picked = vmap(choose, {-1, -1, 0})({mask, on_true, on_false})[0];
    CHECK(same(picked, array({2, 1, 2, 1, 2, 1}, {3, 2})));

    // Only the middle input is mapped.
    mask = array({true, false});
    on_true = ones({3, 2});
    on_false = array(2.);
    picked = vmap(choose, {-1, 0, -1})({mask, on_true, on_false})[0];
    CHECK(same(picked, array({1, 2, 1, 2, 1, 2}, {3, 2})));

    // Inconsistent input/output axis specifications are rejected up front.
    CHECK_THROWS_AS(vmap(choose, {-1, -1, -1}, {0}), std::invalid_argument);
    CHECK_THROWS_AS(vmap(choose, {-1, 0, -1}, {-1}), std::invalid_argument);
    CHECK_THROWS_AS(vmap(choose, {-1, -1, 0}, {-1}), std::invalid_argument);
    CHECK_THROWS_AS(vmap(choose, {0, -1, -1}, {-1}), std::invalid_argument);

    // Nested vmap.
    mask = array({1, 1, 1, 0, 0, 0}, {3, 2, 1});
    on_true = ones({3, 2, 1});
    on_false = full({3, 2, 1}, 2);
    picked = vmap(vmap(choose))({mask, on_true, on_false})[0];
    CHECK(same(picked, array({1, 1, 1, 2, 2, 2}, {3, 2, 1})));
  }

  // Closure capturing an array built from several ops, single-array form.
  {
    auto bias = add(add(ones({2}), zeros({2})), zeros({2}));
    auto shift = [bias](const array& input) { return add(input, bias); };

    auto batch = ones({3, 2});
    CHECK(same(vmap(shift)(batch), full({3, 2}, 2.0f)));
  }

  // Closure capturing a derived array, vector form with an unmapped input.
  {
    auto base = ones({4});
    auto doubled = base + base;
    auto shifted = vmap(
        [doubled](std::vector<array> inputs) {
          return std::vector<array>{add(doubled, inputs[1])};
        },
        {-1, 0});
    auto batch = ones({3, 4});
    CHECK(same(shifted({base, batch})[0], full({3, 4}, 3.0)));
  }
}

TEST_CASE("test vmap with eval") {
  // Calls eval() on an intermediate derived from inputs[0] while the function
  // is being transformed; inputs[1] is only used lazily.
  auto fun = [](std::vector<array> inputs) {
    auto x = inputs[0] + 1;
    auto y = inputs[1] + 2;
    eval(x);
    auto out = add(x, y);
    return std::vector<array>{out};
  };

  // Not ok to eval a function of a vmapped input
  auto vfun = vmap(fun);
  array x({1.0, 2.0}, {2, 1});
  array y({2.0, 3.0}, {2, 1});
  CHECK_THROWS(vfun({x, y}));

  // Ok to eval functions of non-vmapped input
  x = array(1.0);
  vfun = vmap(fun, {-1, 0});
  CHECK(array_equal(vfun({x, y})[0], array({6.0f, 7.0f}, {2, 1})).item<bool>());

  // Still not ok once the evaluated input is vmapped again. The same function
  // is reused here; a second, identical lambda adds nothing to the check.
  x = array({1.0, 2.0}, {2, 1});
  CHECK_THROWS(vmap(fun)({x, y}));
}

TEST_CASE("test vmap comparison ops") {
  // Element-wise equality under vmap.
  {
    auto is_equal = [](std::vector<array> inputs) {
      return std::vector<array>{equal(inputs[0], inputs[1])};
    };

    // Both inputs mapped over axis 0: all-zero inputs compare equal everywhere.
    {
      auto lhs = zeros({2, 3}, float32);
      auto rhs = zeros({2, 3}, float32);
      auto result = vmap(is_equal)({lhs, rhs})[0];
      CHECK(all(result).item<bool>());
    }

    // Only the first input is mapped; the second is shared by every slice.
    {
      auto lhs = zeros({2, 3}, float32);
      auto rhs = zeros({3}, float32);
      auto result = vmap(is_equal, {0, -1})({lhs, rhs})[0];
      CHECK(all(result).item<bool>());
    }

    // Same axes, but the second row of the mapped input differs from zero.
    {
      auto lhs = array({0, 0, 0, 1, 1, 1}, {2, 3});
      auto rhs = zeros({3}, float32);
      auto result = vmap(is_equal, {0, -1})({lhs, rhs})[0];
      auto want = array({true, true, true, false, false, false}, {2, 3});
      CHECK(array_equal(result, want).item<bool>());
    }
  }
}

TEST_CASE("test vmap creation ops") {
  // astype: the cast applies to every mapped slice.
  {
    auto to_int = [](array in) { return astype(in, int32); };
    auto source = zeros({2, 3}, float32);
    auto cast = vmap(to_int)(source);
    CHECK_EQ(cast.dtype(), int32);
    CHECK(array_equal(cast, zeros({2, 3}, int32)).item<bool>());
  }

  // full: each mapped value fills its own length-2 output.
  {
    auto fill_pair = [](array in) { return full({2}, in); };

    // Scalar slices taken from a 1-d input.
    auto values = array({1, 2, 3});
    auto filled = vmap(fill_pair)(values);
    auto want = array({1, 1, 2, 2, 3, 3}, {3, 2});
    CHECK(array_equal(filled, want).item<bool>());

    // Length-1 slices taken from a {3, 1} input give the same result.
    values = array({1, 2, 3}, {3, 1});
    filled = vmap(fill_pair)(values);
    want = array({1, 1, 2, 2, 3, 3}, {3, 2});
    CHECK(array_equal(filled, want).item<bool>());

    // Mapping a {1, 3} input over axis 0 leaves length-3 slices, which are
    // rejected for a length-2 fill. Mapping over axis 1 works.
    values = array({1, 2, 3}, {1, 3});
    CHECK_THROWS_AS(vmap(fill_pair)(values), std::invalid_argument);
    filled = vmap(fill_pair, 1, 1)(values);
    want = array({1, 2, 3, 1, 2, 3}, {2, 3});
    CHECK(array_equal(filled, want).item<bool>());
  }
}

TEST_CASE("test vmap slice") {
  // Strided 1-d slice applied to every row of a {2, 8} input.
  {
    auto every_other = [](array in) { return slice(in, {4}, {8}, {2}); };
    auto grid = reshape(arange(16), {2, 8});
    auto sliced = vmap(every_other)(grid);
    auto want = reshape(array({4, 6, 12, 14}), {2, 2});
    CHECK(array_equal(sliced, want).item<bool>());
  }

  // 2-d slice, mapped over input axis 1 with results stacked on output axis 0.
  {
    auto window = [](array in) { return slice(in, {0, 1}, {2, 3}); };
    auto cube = reshape(arange(12), {2, 2, 3});
    auto sliced = vmap(window, 1, 0)(cube);
    auto want = reshape(array({1, 2, 7, 8, 4, 5, 10, 11}), {2, 2, 2});
    CHECK(array_equal(sliced, want).item<bool>());
  }
}

TEST_CASE("test vmap concatenate") {
  // Joins all inputs along axis 0 of each mapped slice.
  auto join = [](std::vector<array> inputs) {
    return std::vector<array>{concatenate(inputs, 0)};
  };
  auto same = [](const array& lhs, const array& rhs) {
    return array_equal(lhs, rhs).item<bool>();
  };

  auto first = reshape(arange(4), {2, 2});
  auto second = reshape(arange(4), {2, 2});

  // Both inputs mapped over axis 0 (the default).
  auto joined = vmap(join)({first, second})[0];
  CHECK(same(joined, reshape(array({0, 1, 0, 1, 2, 3, 2, 3}), {2, 4})));

  // Both inputs mapped over axis 1.
  joined = vmap(join, {1, 1})({first, second})[0];
  CHECK(same(joined, reshape(array({0, 2, 0, 2, 1, 3, 1, 3}), {2, 4})));

  // First input mapped over axis 0, second over axis 1.
  joined = vmap(join, {0, 1})({first, second})[0];
  CHECK(same(joined, reshape(array({0, 1, 0, 2, 2, 3, 1, 3}), {2, 4})));
}

TEST_CASE("test vmap gather") {
  // Only the source is mapped; the index array is shared by every slice.
  {
    auto pick = [](std::vector<array> inputs) {
      auto source = inputs[0];
      auto idx = inputs[1];
      auto gathered = gather(source, idx, 0, {1, 2, 2});
      return std::vector<array>{squeeze(gathered, 2)};
    };
    auto source = zeros({2, 2, 2, 2});
    auto idx = array({0, 1, 0, 0, 1, 0}, {2, 3});

    // Mapped axis placed at the default output position 0.
    auto result = vmap(pick, {0, -1})({source, idx})[0];
    CHECK_EQ(result.shape(), Shape{2, 2, 3, 2, 2});

    // Mapped axis placed at output position 3.
    result = vmap(pick, {0, -1}, {3})({source, idx})[0];
    CHECK_EQ(result.shape(), Shape{2, 3, 2, 2, 2});
  }

  // Source and index array are both mapped over axis 0.
  {
    auto pick = [](std::vector<array> inputs) {
      auto source = inputs[0];
      auto idx = inputs[1];
      auto gathered = gather(source, idx, 0, {1, 2, 2});
      return std::vector<array>{squeeze(gathered, 1)};
    };
    auto source = zeros({2, 2, 2, 2});
    auto idx = array({0, 1, 0, 0, 1, 0}, {2, 3});

    auto result = vmap(pick, {0, 0})({source, idx})[0];
    CHECK_EQ(result.shape(), Shape{2, 3, 2, 2});
  }

  // Only the index array is mapped; the full 4-d source is shared.
  {
    auto pick = [](std::vector<array> inputs) {
      auto source = inputs[0];
      auto idx = inputs[1];
      auto gathered = gather(source, idx, 0, {1, 2, 2, 2});
      return std::vector<array>{squeeze(gathered, 1)};
    };
    auto source = zeros({2, 2, 2, 2});
    auto idx = array({0, 1, 0, 0, 1, 0}, {2, 3});

    auto result = vmap(pick, {-1, 0})({source, idx})[0];
    CHECK_EQ(result.shape(), Shape{2, 3, 2, 2, 2});
  }

  // Two index arrays gathering along axes 0 and 1 of an unmapped source.
  {
    auto pick = [](std::vector<array> inputs) {
      auto source = inputs[0];
      // Everything after the source is treated as an index array.
      auto idx_list = std::vector<array>(inputs.begin() + 1, inputs.end());
      auto gathered = gather(source, idx_list, {0, 1}, {1, 1, 2, 2});
      return std::vector<array>{squeeze(gathered, {1, 2})};
    };
    auto source = zeros({2, 2, 2, 2});
    auto idx_a = array({0, 1, 0, 0, 1, 0}, {2, 3});
    auto idx_b = array({0, 1, 0, 0, 1, 0}, {2, 3});

    // Both index arrays mapped over axis 0.
    auto result = vmap(pick, {-1, 0, 0})({source, idx_a, idx_b})[0];
    CHECK_EQ(result.shape(), Shape{2, 3, 2, 2});

    // Second index array stored as {3, 2} and mapped over axis 1 instead.
    idx_b = array({0, 1, 0, 0, 1, 0}, {3, 2});
    result = vmap(pick, {-1, 0, 1})({source, idx_a, idx_b})[0];
    CHECK_EQ(result.shape(), Shape{2, 3, 2, 2});
  }
}

TEST_CASE("test vmap scatter") {
  // Builds a vector-in/vector-out function that scatters fixed updates into
  // its single input at fixed indices along the given axes.
  auto make_scatter_fn = [](const std::vector<array>& indices,
                            const array& updates,
                            const std::vector<int>& axes) {
    return [indices, updates, axes](const std::vector<array>& inputs) {
      auto target = inputs.at(0);
      return std::vector<array>{scatter(target, indices, updates, axes)};
    };
  };

  {
    // vmap src on axis 0, scatter on axis 0.
    auto src = zeros({2, 3, 4});
    auto idx = array({1});
    auto upd = reshape(array({1, 2}, float32), {1, 1, 2});

    auto scatter_fn = make_scatter_fn({idx}, upd, std::vector<int>{0});
    auto result = vmap(scatter_fn, /* in_axes = */ {0})({src})[0];
    auto want = array(
        {0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0},
        {2, 3, 4},
        float32);
    CHECK(array_equal(result, want).item<bool>());
  }

  {
    // vmap src on axis 1, scatter on axis 0.
    auto src = zeros({3, 2, 4});
    auto idx = array({1});
    auto upd = reshape(array({1, 2}, float32), {1, 1, 2});

    auto scatter_fn = make_scatter_fn({idx}, upd, std::vector<int>{0});
    auto result =
        vmap(scatter_fn, /* in_axes = */ {1}, /* out_axes = */ {1})({src})[0];
    auto want = array(
        {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0,
         1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
        {3, 2, 4},
        float32);
    CHECK(array_equal(result, want).item<bool>());
  }

  {
    // vmap src on axis 0, scatter on axis 1.
    auto src = zeros({2, 3, 4});
    auto idx = array({1});
    auto upd = reshape(array({1, 2}, float32), {1, 2, 1});

    auto scatter_fn = make_scatter_fn({idx}, upd, std::vector<int>{1});
    auto result = vmap(scatter_fn, /* in_axes = */ {0})({src})[0];
    auto want = array(
        {0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0},
        {2, 3, 4},
        float32);
    CHECK(array_equal(result, want).item<bool>());
  }

  {
    // vmap src on axis 2, scatter on axes (0, 1).
    auto src = zeros({2, 3, 2});
    std::vector<array> idx_list = {array({1}), array({2})};
    std::vector<int> scatter_axes = {0, 1};
    auto upd = reshape(array({1}, float32), {1, 1, 1});

    auto scatter_fn = make_scatter_fn(idx_list, upd, scatter_axes);
    auto result =
        vmap(scatter_fn, /* in_axes = */ {2}, /* out_axes = */ {2})({src})[0];
    auto want =
        array({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, {2, 3, 2}, float32);
    CHECK(array_equal(result, want).item<bool>());
  }
}

TEST_CASE("test vmap SVD") {
  // Decomposition with the second svd argument set to true; the outputs are
  // read below as U, S and Vt.
  auto decompose = [](std::vector<array> inputs) {
    return linalg::svd(inputs.at(0), true, Device::cpu);
  };

  // Same call with the second argument set to false; only the first output
  // (the singular values) is inspected.
  auto values_only = [](std::vector<array> inputs) {
    return linalg::svd(inputs.at(0), false, Device::cpu);
  };

  auto a = astype(reshape(arange(24), {3, 4, 2}), float32);
  const auto dim0 = a.shape(0);
  const auto dim1 = a.shape(1);
  const auto dim2 = a.shape(2);

  // Mapping over axis 1 leaves dim0 x dim2 matrices, dim1 of them.
  {
    auto parts = vmap(decompose, /* in_axes = */ {1})({a});

    CHECK_EQ(parts.at(0).shape(), Shape{dim1, dim0, dim0}); // U
    CHECK_EQ(parts.at(1).shape(), Shape{dim1, dim2}); // S
    CHECK_EQ(parts.at(2).shape(), Shape{dim1, dim2, dim2}); // Vt
  }

  // Mapping over axis 2 leaves dim0 x dim1 matrices, dim2 of them.
  {
    auto parts = vmap(decompose, /* in_axes = */ {2})({a});

    CHECK_EQ(parts.at(0).shape(), Shape{dim2, dim0, dim0}); // U
    CHECK_EQ(parts.at(1).shape(), Shape{dim2, dim0}); // S
    CHECK_EQ(parts.at(2).shape(), Shape{dim2, dim1, dim1}); // Vt
  }

  // Singular values only, mapped over axis 1.
  {
    auto parts = vmap(values_only, /* in_axes = */ {1})({a});
    CHECK_EQ(parts.at(0).shape(), Shape{dim1, dim2});
  }

  // Singular values only, mapped over axis 2.
  {
    auto parts = vmap(values_only, /* in_axes = */ {2})({a});
    CHECK_EQ(parts.at(0).shape(), Shape{dim2, dim0});
  }
}

TEST_CASE("test vmap dynamic slices") {
  auto same = [](const array& lhs, const array& rhs) {
    return array_equal(lhs, rhs).item<bool>();
  };

  // Slice whose start index is given as an array: two elements from index 1
  // along axis 0 of every mapped slice.
  {
    auto take_two = [](std::vector<array> inputs) {
      return std::vector<array>{slice(inputs[0], array({1}), {0}, {2})};
    };
    auto grid = reshape(arange(12), {3, 4});

    // Mapped over rows.
    auto taken = vmap(take_two)({grid})[0];
    CHECK(same(taken, array({1, 2, 5, 6, 9, 10}, {3, 2})));

    // Mapped over columns, with the mapped axis kept at position 1.
    taken = vmap(take_two, /* in_axes */ {1}, /* out_axes */ {1})({grid})[0];
    CHECK(same(taken, array({4, 5, 6, 7, 8, 9, 10, 11}, {2, 4})));
  }

  // Slice update whose start index is given as an array: the update is
  // written at index 1 along axis 0 of every mapped slice.
  {
    auto write_at_one = [](std::vector<array> inputs) {
      return std::vector<array>{
          slice_update(inputs[0], inputs[1], array({1}), {0})};
    };
    auto base = zeros({2, 2});
    auto patch = ones({2, 1});

    // Both inputs mapped over axis 0.
    auto written = vmap(write_at_one)({base, patch})[0];
    CHECK(same(written, array({0, 1, 0, 1}, {2, 2})));

    // Base mapped over axis 1, update over axis 0, output axis 1.
    written = vmap(write_at_one, /* in_axes */ {1, 0}, /* out_axes */ {1})(
        {base, patch})[0];
    CHECK(same(written, array({0, 0, 1, 1}, {2, 2})));
  }
}