Repository: rust-lang-nursery/packed_simd Branch: master Commit: d938e39bee9b Files: 363 Total size: 939.2 KB Directory structure: gitextract_ltzo2pap/ ├── .appveyor.yml ├── .github/ │ └── workflows/ │ ├── benchmarks.yml │ ├── ci.yml │ ├── docs.yml │ └── run-ci-script.yml ├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── bors.toml ├── build.rs ├── ci/ │ ├── all.sh │ ├── android-install-ndk.sh │ ├── android-install-sdk.sh │ ├── android-sysimage.sh │ ├── benchmark.sh │ ├── deploy_and_run_on_ios_simulator.rs │ ├── docker/ │ │ ├── aarch64-linux-android/ │ │ │ └── Dockerfile │ │ ├── aarch64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── arm-unknown-linux-gnueabi/ │ │ │ └── Dockerfile │ │ ├── arm-unknown-linux-gnueabihf/ │ │ │ └── Dockerfile │ │ ├── armv7-linux-androideabi/ │ │ │ └── Dockerfile │ │ ├── armv7-unknown-linux-gnueabihf/ │ │ │ └── Dockerfile │ │ ├── i586-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── i686-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── mips-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── mips64-unknown-linux-gnuabi64/ │ │ │ └── Dockerfile │ │ ├── mips64el-unknown-linux-gnuabi64/ │ │ │ └── Dockerfile │ │ ├── mipsel-unknown-linux-musl/ │ │ │ └── Dockerfile │ │ ├── powerpc-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── powerpc64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── powerpc64le-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── s390x-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── sparc64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── thumbv7neon-linux-androideabi/ │ │ │ └── Dockerfile │ │ ├── thumbv7neon-unknown-linux-gnueabihf/ │ │ │ └── Dockerfile │ │ ├── wasm32-unknown-unknown/ │ │ │ └── Dockerfile │ │ ├── x86_64-linux-android/ │ │ │ └── Dockerfile │ │ ├── x86_64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ └── x86_64-unknown-linux-gnu-emulated/ │ │ └── Dockerfile │ ├── dox.sh │ ├── linux-s390x.sh │ ├── linux-sparc64.sh │ ├── lld-shim.rs │ ├── max_line_width.sh │ ├── run-docker.sh │ ├── run.sh │ ├── run_examples.sh │ ├── runtest-android.rs │ ├── setup_benchmarks.sh │ └── test-runner-linux ├── contributing.md ├── examples/ │ ├── Cargo.toml │ ├── aobench/ │ │ ├── Cargo.toml │ │ ├── benches/ │ │ │ ├── ambient_occlusion.rs │ │ │ ├── isec_plane.rs │ │ │ ├── isec_sphere.rs │ │ │ ├── random.rs │ │ │ └── scanlines.rs │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── rustfmt.toml │ │ ├── src/ │ │ │ ├── ambient_occlusion.rs │ │ │ ├── geometry/ │ │ │ │ ├── mod.rs │ │ │ │ ├── plane.rs │ │ │ │ ├── ray.rs │ │ │ │ ├── rayxN.rs │ │ │ │ ├── sphere.rs │ │ │ │ ├── vec.rs │ │ │ │ └── vecxN.rs │ │ │ ├── image.rs │ │ │ ├── intersection/ │ │ │ │ ├── mod.rs │ │ │ │ ├── packet.rs │ │ │ │ ├── ray_plane.rs │ │ │ │ ├── ray_sphere.rs │ │ │ │ └── single.rs │ │ │ ├── ispc_.rs │ │ │ ├── lib.rs │ │ │ ├── main.rs │ │ │ ├── random.rs │ │ │ ├── scalar.rs │ │ │ ├── scalar_parallel.rs │ │ │ ├── scene/ │ │ │ │ ├── mod.rs │ │ │ │ ├── random.rs │ │ │ │ └── test.rs │ │ │ ├── tiled.rs │ │ │ ├── tiled_parallel.rs │ │ │ ├── vector.rs │ │ │ └── vector_parallel.rs │ │ └── volta/ │ │ ├── .gitignore │ │ └── ao.ispc │ ├── dot_product/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── scalar.rs │ │ └── simd.rs │ ├── fannkuch_redux/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── fannkuchredux-output.txt │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── scalar.rs │ │ └── simd.rs │ ├── mandelbrot/ │ │ ├── Cargo.toml │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── src/ │ │ │ ├── ispc_tasks.rs │ │ │ ├── lib.rs 
│ │ │ ├── main.rs │ │ │ ├── mandelbrot-output.txt │ │ │ ├── scalar_par.rs │ │ │ └── simd_par.rs │ │ └── volta/ │ │ └── mandelbrot.ispc │ ├── matrix_inverse/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── scalar.rs │ │ └── simd.rs │ ├── nbody/ │ │ ├── Cargo.toml │ │ ├── benches/ │ │ │ └── algs.rs │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── nbody-output.txt │ │ ├── scalar.rs │ │ └── simd.rs │ ├── options_pricing/ │ │ ├── Cargo.toml │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── src/ │ │ │ ├── ispc_.rs │ │ │ ├── lib.rs │ │ │ ├── main.rs │ │ │ ├── scalar.rs │ │ │ ├── simd.rs │ │ │ ├── simd_kernels.rs │ │ │ ├── simd_par.rs │ │ │ └── sum.rs │ │ └── volta/ │ │ ├── options.ispc │ │ └── options_defs.h │ ├── rust-toolchain │ ├── slice_sum/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ └── main.rs │ ├── spectral_norm/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── scalar.rs │ │ ├── simd.rs │ │ └── spectralnorm-output.txt │ ├── stencil/ │ │ ├── Cargo.toml │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── src/ │ │ │ ├── ispc_loops.rs │ │ │ ├── lib.rs │ │ │ ├── main.rs │ │ │ ├── scalar.rs │ │ │ ├── simd.rs │ │ │ └── simd_par.rs │ │ └── volta/ │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── common.mk │ │ ├── stencil.cpp │ │ ├── stencil.ispc │ │ ├── stencil_serial.cpp │ │ ├── tasksys.cpp │ │ └── timing.h │ └── triangle_xform/ │ ├── Cargo.toml │ ├── readme.md │ └── src/ │ ├── lib.rs │ ├── scalar.rs │ └── simd.rs ├── micro_benchmarks/ │ ├── Cargo.toml │ ├── benches/ │ │ └── mask_reductions.rs │ └── rust-toolchain ├── perf-guide/ │ ├── .gitignore │ ├── book.toml │ └── src/ │ ├── SUMMARY.md │ ├── ascii.css │ ├── bound_checks.md │ ├── float-math/ │ │ ├── approx.md │ │ ├── fma.md │ │ ├── fp.md │ │ └── svml.md │ ├── introduction.md │ ├── prof/ │ │ ├── linux.md │ │ ├── mca.md │ │ └── profiling.md │ ├── target-feature/ │ │ ├── attribute.md │ │ ├── features.md │ │ ├── inlining.md │ │ ├── practice.md │ │ ├── runtime.md │ │ └── rustflags.md │ └── vert-hor-ops.md ├── rust-toolchain ├── rustfmt.toml ├── src/ │ ├── api/ │ │ ├── bit_manip.rs │ │ ├── bitmask.rs │ │ ├── cast/ │ │ │ ├── macros.rs │ │ │ ├── v128.rs │ │ │ ├── v16.rs │ │ │ ├── v256.rs │ │ │ ├── v32.rs │ │ │ ├── v512.rs │ │ │ └── v64.rs │ │ ├── cast.rs │ │ ├── cmp/ │ │ │ ├── eq.rs │ │ │ ├── ord.rs │ │ │ ├── partial_eq.rs │ │ │ ├── partial_ord.rs │ │ │ └── vertical.rs │ │ ├── cmp.rs │ │ ├── default.rs │ │ ├── fmt/ │ │ │ ├── binary.rs │ │ │ ├── debug.rs │ │ │ ├── lower_hex.rs │ │ │ ├── octal.rs │ │ │ └── upper_hex.rs │ │ ├── fmt.rs │ │ ├── from/ │ │ │ ├── from_array.rs │ │ │ └── from_vector.rs │ │ ├── from.rs │ │ ├── hash.rs │ │ ├── into_bits/ │ │ │ ├── arch_specific.rs │ │ │ ├── macros.rs │ │ │ ├── v128.rs │ │ │ ├── v16.rs │ │ │ ├── v256.rs │ │ │ ├── v32.rs │ │ │ ├── v512.rs │ │ │ └── v64.rs │ │ ├── into_bits.rs │ │ ├── math/ │ │ │ ├── float/ │ │ │ │ ├── abs.rs │ │ │ │ ├── consts.rs │ │ │ │ ├── cos.rs │ │ │ │ ├── exp.rs │ │ │ │ ├── ln.rs │ │ │ │ ├── mul_add.rs │ │ │ │ ├── mul_adde.rs │ │ │ │ ├── powf.rs │ │ │ │ ├── recpre.rs │ │ │ │ ├── rsqrte.rs │ │ │ │ ├── sin.rs │ │ │ │ ├── sqrt.rs │ │ │ │ ├── sqrte.rs │ │ │ │ └── tanh.rs │ │ │ └── float.rs │ │ ├── math.rs │ │ ├── minimal/ │ │ │ ├── iuf.rs │ │ │ ├── mask.rs │ │ │ └── ptr.rs │ │ ├── minimal.rs │ │ ├── ops/ │ │ │ ├── scalar_arithmetic.rs │ │ │ ├── scalar_bitwise.rs │ │ │ ├── scalar_mask_bitwise.rs │ │ │ ├── scalar_shifts.rs │ │ │ ├── vector_arithmetic.rs │ │ │ ├── vector_bitwise.rs │ 
│ │ ├── vector_float_min_max.rs │ │ │ ├── vector_int_min_max.rs │ │ │ ├── vector_mask_bitwise.rs │ │ │ ├── vector_neg.rs │ │ │ ├── vector_rotates.rs │ │ │ └── vector_shifts.rs │ │ ├── ops.rs │ │ ├── ptr/ │ │ │ └── gather_scatter.rs │ │ ├── ptr.rs │ │ ├── reductions/ │ │ │ ├── bitwise.rs │ │ │ ├── float_arithmetic.rs │ │ │ ├── integer_arithmetic.rs │ │ │ ├── mask.rs │ │ │ └── min_max.rs │ │ ├── reductions.rs │ │ ├── select.rs │ │ ├── shuffle.rs │ │ ├── shuffle1_dyn.rs │ │ ├── slice/ │ │ │ ├── from_slice.rs │ │ │ └── write_to_slice.rs │ │ ├── slice.rs │ │ └── swap_bytes.rs │ ├── api.rs │ ├── codegen/ │ │ ├── bit_manip.rs │ │ ├── llvm.rs │ │ ├── math/ │ │ │ ├── float/ │ │ │ │ ├── abs.rs │ │ │ │ ├── cos.rs │ │ │ │ ├── cos_pi.rs │ │ │ │ ├── exp.rs │ │ │ │ ├── ln.rs │ │ │ │ ├── macros.rs │ │ │ │ ├── mul_add.rs │ │ │ │ ├── mul_adde.rs │ │ │ │ ├── powf.rs │ │ │ │ ├── sin.rs │ │ │ │ ├── sin_cos_pi.rs │ │ │ │ ├── sin_pi.rs │ │ │ │ ├── sqrt.rs │ │ │ │ ├── sqrte.rs │ │ │ │ └── tanh.rs │ │ │ └── float.rs │ │ ├── math.rs │ │ ├── pointer_sized_int.rs │ │ ├── reductions/ │ │ │ ├── mask/ │ │ │ │ ├── aarch64.rs │ │ │ │ ├── arm.rs │ │ │ │ ├── fallback.rs │ │ │ │ ├── fallback_impl.rs │ │ │ │ ├── x86/ │ │ │ │ │ ├── avx.rs │ │ │ │ │ ├── avx2.rs │ │ │ │ │ ├── sse.rs │ │ │ │ │ └── sse2.rs │ │ │ │ └── x86.rs │ │ │ └── mask.rs │ │ ├── reductions.rs │ │ ├── shuffle.rs │ │ ├── shuffle1_dyn.rs │ │ ├── swap_bytes.rs │ │ ├── v128.rs │ │ ├── v16.rs │ │ ├── v256.rs │ │ ├── v32.rs │ │ ├── v512.rs │ │ ├── v64.rs │ │ ├── vPtr.rs │ │ └── vSize.rs │ ├── codegen.rs │ ├── lib.rs │ ├── masks.rs │ ├── sealed.rs │ ├── testing/ │ │ ├── macros.rs │ │ └── utils.rs │ ├── testing.rs │ ├── v128.rs │ ├── v16.rs │ ├── v256.rs │ ├── v32.rs │ ├── v512.rs │ ├── v64.rs │ ├── vPtr.rs │ └── vSize.rs ├── tests/ │ └── endianness.rs └── verify/ └── verify/ ├── Cargo.toml ├── readme.md ├── rust-toolchain └── src/ ├── api/ │ ├── math/ │ │ └── float/ │ │ ├── mod.rs │ │ └── mul_add.rs │ ├── math.rs │ ├── ops/ │ │ ├── vector_rotates/ │ │ │ └── x86.rs │ │ └── vector_rotates.rs │ ├── ops.rs │ ├── reductions/ │ │ ├── mask/ │ │ │ ├── avx.rs │ │ │ ├── avx2.rs │ │ │ ├── sse.rs │ │ │ └── sse2.rs │ │ └── mask.rs │ └── reductions.rs ├── api.rs └── lib.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .appveyor.yml ================================================ matrix: allow_failures: # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/72 - TARGET: i686-pc-windows-msvc - TARGET: i686-pc-windows-gnu - TARGET: x86_64-pc-windows-gnu fast_finish: true environment: matrix: - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 NOVERIFY: "1" - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 RUSTFLAGS: "-C target-feature=+sse4.2" NOVERIFY: "1" - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 RUSTFLAGS: "-C target-feature=+avx" NOVERIFY: "1" - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 RUSTFLAGS: "-C target-feature=+avx2" NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 RUSTFLAGS: "-C target-feature=+sse4.2" NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 RUSTFLAGS: "-C target-feature=+avx" NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 RUSTFLAGS: "-C target-feature=+avx2" NOVERIFY: "1" - TARGET: x86_64-pc-windows-gnu MSYSTEM: MINGW64 - TARGET: i686-pc-windows-gnu MSYSTEM: MINGW32 - TARGET: x86_64-pc-windows-gnu MSYSTEM: MINGW64 
install: - ps: if (ls -r . -fi "*.rs" | sls "`t") { throw "Found tab character" } - ps: Start-FileDownload "https://static.rust-lang.org/dist/rust-nightly-${env:TARGET}.exe" -FileName "rust-install.exe" - ps: .\rust-install.exe /VERYSILENT /NORESTART /DIR="C:\rust" | Out-Null - ps: $env:PATH="$env:PATH;C:\rust\bin" - set PATH=c:\msys64\%MSYSTEM%\bin;c:\msys64\usr\bin;%PATH% - rustc -vV - cargo -vV build: false test_script: bash -c "ci/run.sh" ================================================ FILE: .github/workflows/benchmarks.yml ================================================ name: benchmarks on: push: branches: - master pull_request: workflow_dispatch: jobs: x86_64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml with: target: x86_64-unknown-linux-gnu setup_script: ci/setup_benchmarks.sh script: ci/benchmark.sh norun: 1 verify: 1 # FIXME: figure out how to add downloaded ispc to PATH # features: ispc x86_64-apple-darwin: uses: ./.github/workflows/run-ci-script.yml with: target: x86_64-apple-darwin runner: macos-latest setup_script: ci/setup_benchmarks.sh script: ci/benchmark.sh norun: 1 verify: 1 # FIXME: figure out how to add downloaded ispc to PATH # features: ispc ================================================ FILE: .github/workflows/ci.yml ================================================ name: ci # trigger for all PRs and changes to master on: push: branches: - master pull_request: jobs: rustfmt: uses: ./.github/workflows/run-ci-script.yml with: script: ci/all.sh check_fmt || true x86_64-unknown-linux-android: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-linux-android armv7-linux-androideabi: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: armv7-linux-androideabi aarch64-unknown-linux-android-NEON: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: aarch64-linux-android rustflags: -Ctarget-feature=+neon thumbv7neon-linux-androideabi: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: thumbv7neon-linux-androideabi i586-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i586-unknown-linux-gnu rustflags: -Crelocation-model=static i586-unknown-linux-gnu-SSE: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i586-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+sse i586-unknown-linux-gnu-SSE2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i586-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+sse2 i686-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i686-unknown-linux-gnu rustflags: -Crelocation-model=static i686-unknown-linux-gnu-SSE4_2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i686-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+sse4.2 i686-unknown-linux-gnu-AVX2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i686-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+avx2 x86_64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-unknown-linux-gnu x86_64-unknown-linux-gnu-SSE4_2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-unknown-linux-gnu rustflags: 
-Ctarget-feature=+sse4.2 x86_64-unknown-linux-gnu-AVX2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-unknown-linux-gnu rustflags: -Ctarget-feature=+avx2 arm-unknown-linux-gnueabihf: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: arm-unknown-linux-gnueabihf armv7-unknown-linux-gnueabihf: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: armv7-unknown-linux-gnueabihf armv7-unknown-linux-gnueabihf-NEON: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: armv7-unknown-linux-gnueabihf rustflags: -Ctarget-feature=+neon thumbv7neon-unknown-linux-gnueabihf: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: thumbv7neon-unknown-linux-gnueabihf aarch64-unknown-linux-gnu-NEON: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: aarch64-unknown-linux-gnu rustflags: -Ctarget-feature=+neon powerpc-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: powerpc-unknown-linux-gnu powerpc64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: powerpc64-unknown-linux-gnu powerpc64le-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: powerpc64le-unknown-linux-gnu powerpc64le-unknown-linux-gnu-ALTIVEC: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: powerpc64le-unknown-linux-gnu rustflags: -Ctarget-feature=+altivec powerpc64le-unknown-linux-gnu-VSX: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: powerpc64le-unknown-linux-gnu rustflags: -Ctarget-feature=+vsx s390x-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: s390x-unknown-linux-gnu sparc64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: sparc64-unknown-linux-gnu wasm32-unknown-unknown: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: wasm32-unknown-unknown x86_64-apple-darwin-SSE4_2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: x86_64-apple-darwin rustflags: -Ctarget-feature=+sse4.2 x86_64-apple-darwin-AVX: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: x86_64-apple-darwin rustflags: -Ctarget-feature=+avx x86_64-apple-ios: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: x86_64-apple-ios aarch64-apple-ios: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: aarch64-apple-ios rustflags: -Ctarget-feature=+neon ================================================ FILE: .github/workflows/docs.yml ================================================ name: docs on: push: branches: - master jobs: docs: uses: ./.github/workflows/run-ci-script.yml with: setup_script: cargo install mdbook script: ci/dox.sh ================================================ FILE: .github/workflows/run-ci-script.yml ================================================ name: run-ci-script on: workflow_call: inputs: runner: required: false type: string default: ubuntu-latest target: required: false type: string default: '' 
rustflags: required: false type: string default: '' script: required: false type: string default: ci/run-docker.sh setup_script: required: false type: string norun: required: false type: string default: '' verify: required: false type: string default: '' features: required: false type: string default: '' jobs: run-ci-script: runs-on: ${{ inputs.runner }} steps: - name: Checkout uses: actions/checkout@v2 - name: Init Rustup Cache uses: actions/cache@v2 with: path: | ~/.rustup/toolchains key: ${{ runner.os }}-cargo-${{ hashFiles('**/rust-toolchain') }} - name: Install Toolchain uses: dtolnay/rust-toolchain@nightly with: # FIXME: change to nightly once https://github.com/rust-lang/packed_simd/pull/350 is merged # needs to be kept in sync with the toolchain files targets: ${{ inputs.target }} components: rustfmt - name: Generate Lockfile run: cargo generate-lockfile - name: Init Cargo Cache uses: actions/cache@v2 with: path: | ~/.cargo/bin/ ~/.cargo/registry/index/ ~/.cargo/registry/cache/ ~/.cargo/git/db/ target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - name: Setup if: ${{ inputs.setup_script != '' }} run: ${{ inputs.setup_script }} env: TARGET: ${{ inputs.target }} RUSTFLAGS: ${{ inputs.rustflags }} NORUN: ${{ inputs.norun }} VERIFY: ${{ inputs.verify }} FEATURES: ${{ inputs.features }} - name: Run CI Script timeout-minutes: 30 run: ${{ inputs.script }} env: TARGET: ${{ inputs.target }} RUSTFLAGS: ${{ inputs.rustflags }} NORUN: ${{ inputs.norun }} VERIFY: ${{ inputs.verify }} FEATURES: ${{ inputs.features }} ================================================ FILE: .gitignore ================================================ Cargo.lock target/ # llvm-ir and assembly *.ll *.d # png files output by benchmarks *.png # -*- mode: gitignore; -*- *~ \#*\# /.emacs.desktop /.emacs.desktop.lock *.elc auto-save-list tramp .\#* # Org-mode .org-id-locations *_archive # flymake-mode *_flymake.* # eshell files /eshell/history /eshell/lastdir # elpa packages /elpa/ # reftex files *.rel # AUCTeX auto folder /auto/ # cask packages .cask/ dist/ # Flycheck flycheck_*.el # server auth directory /server/ # projectiles files .projectile # directory configuration .dir-locals.el ================================================ FILE: .travis.yml ================================================ language: rust rust: nightly os: linux dist: focal stages: - tools - build-test-verify # Passes full test suite, permit no regressions (unless it's rustup :/) - 32bit-tier1 - 64bit-tier2 - 32bit-tier2 jobs: fast_finish: true include: # Android: - env: TARGET=x86_64-linux-android name: "x86_64-unknown-linux-android + SSE2" stage: build-test-verify - env: TARGET=arm-linux-androideabi name: "arm-linux-androideabi" stage: build-test-verify - name: "aarch64-unknown-linux-android + NEON" env: TARGET=aarch64-linux-android RUSTFLAGS="-C target-feature=+neon" stage: build-test-verify - env: TARGET="thumbv7neon-linux-androideabi" name: "thumbv7neon-linux-androideabi" stage: 32bit-tier2 # Linux: - env: TARGET=i586-unknown-linux-gnu name: "i586-unknown-linux-gnu" stage: 32bit-tier2 - env: TARGET=i586-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse" name: "i586-unknown-linux-gnu + SSE" stage: 32bit-tier2 - env: TARGET=i586-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse2" name: "i586-unknown-linux-gnu + SSE2" stage: 32bit-tier2 - env: TARGET=i686-unknown-linux-gnu name: "i686-unknown-linux-gnu + SSE2" stage: 32bit-tier1 - env: TARGET=i686-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse4.2" name: 
"i686-unknown-linux-gnu + SSE4.2" stage: 32bit-tier1 - env: TARGET=i686-unknown-linux-gnu RUSTFLAGS="-C target-feature=+avx2" name: "i686-unknown-linux-gnu + AVX2" stage: 32bit-tier1 - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse4.2" name: "x86_64-unknown-linux-gnu + SSE4.2" stage: build-test-verify - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS="-C target-feature=+avx2" name: "x86_64-unknown-linux-gnu + AVX2" stage: build-test-verify - env: TARGET=arm-unknown-linux-gnueabihf name: "arm-unknown-linux-gnueabihf" stage: build-test-verify - env: TARGET=armv7-unknown-linux-gnueabihf name: "armv7-unknown-linux-gnueabihf" stage: build-test-verify - env: TARGET=armv7-unknown-linux-gnueabihf RUSTFLAGS="-C target-feature=+neon" name: "armv7-unknown-linux-gnueabihf + NEON" stage: build-test-verify - env: TARGET="thumbv7neon-unknown-linux-gnueabihf" name: "thumbv7neon-unknown-linux-gnueabihf" stage: 32bit-tier2 - name: "aarch64-unknown-linux-gnu + NEON" env: TARGET=aarch64-unknown-linux-gnu RUSTFLAGS="-C target-feature=+neon" stage: build-test-verify - env: TARGET=mips-unknown-linux-gnu name: "mips-unknown-linux-gnu" stage: 32bit-tier2 - env: TARGET=mipsel-unknown-linux-musl name: "mipsel-unknown-linux-musl" stage: 32bit-tier2 - env: TARGET=mips64-unknown-linux-gnuabi64 name: "mips64-unknown-linux-gnuabi64" stage: 64bit-tier2 - env: TARGET=mips64el-unknown-linux-gnuabi64 name: "mips64el-unknown-linux-gnuabi64" stage: 64bit-tier2 # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/18 # env: TARGET=mips64el-unknown-linux-gnuabi64 RUSTFLAGS="-C target-feature=+msa -C target-cpu=mips64r6" - env: TARGET=powerpc-unknown-linux-gnu name: "powerpc-unknown-linux-gnu" stage: 32bit-tier2 - env: TARGET=powerpc64-unknown-linux-gnu name: "powerpc64-unknown-linux-gnu" stage: 64bit-tier2 - name: "powerpc64le-unknown-linux-gnu" env: TARGET=powerpc64le-unknown-linux-gnu stage: build-test-verify - name: "powerpc64le-unknown-linux-gnu + ALTIVEC" env: TARGET=powerpc64le-unknown-linux-gnu RUSTFLAGS="-C target-feature=+altivec" stage: build-test-verify - name: "powerpc64le-unknown-linux-gnu + VSX" env: TARGET=powerpc64le-unknown-linux-gnu RUSTFLAGS="-C target-feature=+vsx" stage: build-test-verify - name: "s390x-unknown-linux-gnu" env: TARGET=s390x-unknown-linux-gnu stage: 64bit-tier2 - env: TARGET=sparc64-unknown-linux-gnu name: "sparc64-unknown-linux-gnu" stage: 64bit-tier2 # WebAssembly: - env: TARGET=wasm32-unknown-unknown name: "wasm32-unknown-unknown" stage: 32bit-tier2 # MacOSX: - os: osx env: TARGET=x86_64-apple-darwin RUSTFLAGS="-C target-feature=+sse4.2" name: "x86_64-apple-darwin + SSE4.2" install: true script: ci/run.sh osx_image: xcode10 stage: build-test-verify # Travis-CI OSX build bots do not support AVX2: - os: osx env: TARGET=x86_64-apple-darwin RUSTFLAGS="-C target-feature=+avx" name: "x86_64-apple-darwin + AVX" install: true script: ci/run.sh osx_image: xcode10 stage: build-test-verify # *BSDs: #- env: TARGET=i686-unknown-freebsd NORUN=1 # script: ci/run.sh #- env: TARGET=x86_64-unknown-freebsd NORUN=1 # script: ci/run.sh #- env: TARGET=x86_64-unknown-netbsd NORUN=1 # script: ci/run.sh # Solaris: #- env: TARGET=x86_64-sun-solaris NORUN=1 # script: ci/run.sh # iOS: - os: osx env: TARGET=x86_64-apple-ios name: "x86_64-apple-ios + SSE2" script: ci/run.sh osx_image: xcode9.4 stage: 64bit-tier2 - name: "aarch64-apple-ios + NEON" env: TARGET=aarch64-apple-ios RUSTFLAGS="-C target-feature=+neon" os: osx osx_image: xcode9.4 script: ci/run.sh stage: 64bit-tier2 # 
BENCHMARKS: - name: "Benchmarks - x86_64-unknown-linux-gnu" install: TARGET=x86_64-unknown-linux-gnu ./ci/setup_benchmarks.sh # FIXME: Use `core_arch,sleef-sys` features once they works again script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=ispc ci/benchmark.sh stage: tools - name: "Benchmarks - x86_64-apple-darwin" install: TARGET=x86_64-apple-darwin ./ci/setup_benchmarks.sh # FIXME: Use `core_arch,sleef-sys` features once they works again script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=ispc ci/benchmark.sh os: osx osx_image: xcode9.4 stage: tools # TOOLS: - name: "Documentation" before_install: - sudo add-apt-repository -y ppa:deadsnakes/ppa - sudo apt-get update -y - sudo apt-get install -y python3.9 install: - cargo install mdbook script: ci/dox.sh stage: tools - name: "rustfmt" install: true script: | rustup toolchain install nightly -c rustfmt --allow-downgrade ci/all.sh check_fmt || true stage: tools allow_failures: # FIXME: ISPC cannot be found? - name: "Benchmarks - x86_64-apple-darwin" # FIXME: i686 fails in inlining, apparently - stage: 32bit-tier1 #- env: TARGET=i686-unknown-freebsd NORUN=1 #- env: TARGET=x86_64-unknown-freebsd NORUN=1 #- env: TARGET=x86_64-unknown-netbsd NORUN=1 #- env: TARGET=x86_64-sun-solaris NORUN=1 # FIXME: TBD - stage: 64bit-tier2 - stage: 32bit-tier2 # FIXME: iOS # https://github.com/rust-lang-nursery/packed_simd/issues/26 - env: TARGET=x86_64-apple-ios # Is this related to the above? Mysterious test failure - name: "aarch64-apple-ios + NEON" install: travis_retry rustup target add $TARGET before_script: cargo generate-lockfile script: travis_wait 50 ci/run-docker.sh after_script: sleep 5 env: global: secure: "lPHv7s6+AxQYNaFncycVFQt++Y1asQmMhOikQU1ztlP8CK7+hn2m98cg/euOJyzIOb2iJ3ZX4cGZkzw4lc59MQBByb1GtDbazQoUOzVDbVfe9BDD2f8JVoIFh1CMfjPKQ7Gg/rJqWlwrUlSd5GNxPCutKjY7qZhJuR6SQbJjlWaGN2Vd4fVCzKXz8fHRXgMEZS+d+CR4Nsrkb83J3Z4s5kSdJmhYxJ61AWjuzJVwUh4l3/HEYlSL5XXpuh5R2i7W16h1PlNdaTUgkZli1lHzO8+6Q8LzX9+XiLIEVX9lw3A2NdIKGz8E/+7Qs5oYOkwYhjROsDQxIK7xkSM30bQuN7cwMBybAVIyOPJkqXQ1dQyp83KSdsOj7JMyDDRvcEDLI6ehRlm5EcdH7YrReuboN81iUo0Sa7VsuUmgj5hjERCt9r30f9aWuitABai7vKRtjglg7Sp5CrEVPA4PQs6PqKCCRogoggbXJ/Z5Dyw/RZaXPeNR9+qIKN1Vjm9Gew1sRN2JK/3+vXTKtyJXH/uBxgJt4jQlbuShOJuF+BSfTF88sMe67a/357SSOIb4JkaCyd0flDCWYE8576kaHPlVVMT2peXee0LeRXm1e13nG3Na0t3LS/orJLPHOShNQGoDj7qAP5aEKggRya896JGwtvlaBHHTmSQh65G7cyNErZo=" branches: only: - staging # bors r+ - trying # bors try - master notifications: email: on_success: never ================================================ FILE: Cargo.toml ================================================ [package] name = "packed_simd" version = "0.3.9" description = "Portable Packed SIMD vectors" documentation = "https://docs.rs/crate/packed_simd/" homepage = "https://github.com/rust-lang/packed_simd" repository = "https://github.com/rust-lang/packed_simd" keywords = ["simd", "vector", "portability"] categories = ["hardware-support", "concurrency", "no-std", "data-structures"] license = "MIT OR Apache-2.0" build = "build.rs" edition = "2018" [package.metadata.docs.rs] features = ["into_bits"] rustdoc-args = ["--cfg", "doc_cfg"] # To build locally: # RUSTDOCFLAGS="--cfg doc_cfg" cargo +nightly doc --features into_bits --no-deps --open [badges] is-it-maintained-issue-resolution = { repository = "rust-lang/packed_simd" } is-it-maintained-open-issues = { repository = "rust-lang/packed_simd" } maintenance = { status = "experimental" } [dependencies] cfg-if = "1.0.0" core_arch = { version = "0.1.5", optional = true } num-traits = { version = "0.2.14", 
default-features = false, features = ["libm"] } [features] default = [] into_bits = [] libcore_neon = [] [dev-dependencies] paste = "^1" arrayvec = { version = "^0.5", default-features = false } [target.'cfg(target_arch = "x86_64")'.dependencies.sleef-sys] version = "0.1.2" optional = true [target.wasm32-unknown-unknown.dev-dependencies] # Keep in sync with the version on Dockerfile. wasm-bindgen = "=0.2.87" wasm-bindgen-test = "=0.3.37" ================================================ FILE: LICENSE-APACHE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: LICENSE-MIT ================================================ Copyright (c) 2014 The Rust Project Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # `Simd<[T; N]>` ## Implementation of [Rust RFC #2366: `std::simd`][rfc2366] [![Latest Version]][crates.io] [![docs]][master_docs] **WARNING**: this crate only supports the most recent nightly Rust toolchain and will be superseded by [`#![feature(portable_simd)]`](https://github.com/rust-lang/portable-simd). ## Documentation * [API docs (`master` branch)][master_docs] * [Performance guide][perf_guide] * [API docs (`docs.rs`)][docs.rs] * [RFC2366 `std::simd`][rfc2366]: - contains motivation, design rationale, discussion, etc. ## Examples Most of the examples come with both a scalar and a vectorized implementation. * [`aobench`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench) * [`fannkuch_redux`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/fannkuch_redux) * [`matrix inverse`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/matrix_inverse) * [`mandelbrot`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/mandelbrot) * [`n-body`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/nbody) * [`options_pricing`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/options_pricing) * [`spectral_norm`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/spectral_norm) * [`triangle transform`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/triangle_xform) * [`stencil`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/stencil) * [`vector dot product`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/dot_product) ## Cargo features * `into_bits` (default: disabled): enables `FromBits`/`IntoBits` trait implementations for the vector types. 
These allow reinterpreting the bits of a vector type as those of another vector
type safely by just using the `.into_bits()` method (see the usage sketch at the
end of this README).

## Performance

The following [ISPC] examples are also part of `packed_simd`'s
[`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/)
directory, where `packed_simd`+[`rayon`][rayon] are used to emulate [ISPC]'s
Single-Program-Multiple-Data (SPMD) programming model. The performance results
on different hardware are shown in the `readme.md` of each example. The
following table summarizes the performance ranges, where `+` means speed-up and
`-` means slowdown:

* `aobench`: `[-1.02x, +1.53x]`,
* `stencil`: `[+1.06x, +1.72x]`,
* `mandelbrot`: `[-1.74x, +1.2x]`,
* `options_pricing`:
  * `black_scholes`: `+1.0x`
  * `binomial_put`: `+1.4x`

While SPMD is not the intended use case for `packed_simd`, it is possible to
combine the library with [`rayon`][rayon] to poorly emulate [ISPC]'s SPMD
programming model in Rust. Writing performant code is not as straightforward as
with [ISPC], but with some care (e.g. see the [Performance Guide][perf_guide])
one can easily match and often out-perform [ISPC]'s "default performance".

## Platform support

The following table describes the supported platforms: `build` shows whether
the library compiles without issues for a given target, while `run` shows
whether the test suite passes for a given target.

| **Linux**                              | **build** | **run** |
|----------------------------------------|-----------|---------|
| `i586-unknown-linux-gnu`               | ✓         | ✗       |
| `i686-unknown-linux-gnu`               | ✓         | ✗       |
| `x86_64-unknown-linux-gnu`             | ✓         | ✓       |
| `arm-unknown-linux-gnueabihf`          | ✓         | ✓       |
| `armv7-unknown-linux-gnueabi`          | ✓         | ✓       |
| `aarch64-unknown-linux-gnu`            | ✓         | ✓       |
| `powerpc-unknown-linux-gnu`            | ✓         | ✗       |
| `powerpc64-unknown-linux-gnu`          | ✓         | ✗       |
| `powerpc64le-unknown-linux-gnu`        | ✓         | ✓       |
| `s390x-unknown-linux-gnu`              | ✓         | ✗       |
| `sparc64-unknown-linux-gnu`            | ✓         | ✗       |
| `thumbv7neon-unknown-linux-gnueabihf`  | ✓         | ✓       |
| **MacOSX**                             | **build** | **run** |
| `x86_64-apple-darwin`                  | ✓         | ✓       |
| **Android**                            | **build** | **run** |
| `x86_64-linux-android`                 | ✓         | ✓       |
| `armv7-linux-androideabi`              | ✓         | ✗       |
| `aarch64-linux-android`                | ✓         | ✗       |
| `thumbv7neon-linux-androideabi`        | ✓         | ✗       |
| **iOS**                                | **build** | **run** |
| `x86_64-apple-ios`                     | ✗         | ✗       |
| `aarch64-apple-ios`                    | ✗         | ✗       |

## Machine code verification

The [`verify/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/verify)
crate's tests disassemble the portable packed vector APIs at run-time and
compare the generated machine code against the desired one to make sure that
this crate remains efficient.

## License

This project is licensed under either of

* [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
  ([LICENSE-APACHE](LICENSE-APACHE))
* [MIT License](http://opensource.org/licenses/MIT)
  ([LICENSE-MIT](LICENSE-MIT))

at your option.

## Contributing

We welcome all people who want to contribute. Please see the
[contributing instructions] for more information.

Contributions in any form (issues, pull requests, etc.) to this project must
adhere to Rust's [Code of Conduct].

Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in `packed_simd` by you, as defined in the Apache-2.0 license,
shall be dual licensed as above, without any additional terms or conditions.
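As an illustration of the `into_bits` Cargo feature described above, here is a
minimal usage sketch (not part of the repository): it assumes the feature is
enabled, e.g. `packed_simd = { version = "0.3", features = ["into_bits"] }`,
and a recent nightly toolchain.

```rust
// Minimal sketch: reinterpret the 128 bits of an `f32x4` as a `u32x4`.
// Requires the optional `into_bits` cargo feature and a nightly toolchain.
use packed_simd::{f32x4, u32x4, IntoBits};

fn main() {
    let x = f32x4::new(1.0, 2.0, 3.0, 4.0);
    // Both vectors are 4 x 32 bits wide, so this is a plain bit reinterpretation.
    let bits: u32x4 = x.into_bits();
    // Lane 0 holds the bit pattern of 1.0f32 (0x3F80_0000).
    assert_eq!(bits.extract(0), 1.0f32.to_bits());
    println!("{:?}", bits);
}
```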
[travis]: https://travis-ci.com/rust-lang/packed_simd [Travis-CI Status]: https://travis-ci.com/rust-lang/packed_simd.svg?branch=master [appveyor]: https://ci.appveyor.com/project/gnzlbg/packed-simd [Appveyor Status]: https://ci.appveyor.com/api/projects/status/hd7v9dvr442hgdix?svg=true [Latest Version]: https://img.shields.io/crates/v/packed_simd.svg [crates.io]: https://crates.io/crates/packed_simd [docs]: https://docs.rs/packed_simd/badge.svg [docs.rs]: https://docs.rs/packed_simd [master_docs]: https://rust-lang-nursery.github.io/packed_simd/packed_simd/ [perf_guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/ [rfc2366]: https://github.com/rust-lang/rfcs/pull/2366 [ISPC]: https://ispc.github.io/ [rayon]: https://crates.io/crates/rayon [boost_license]: https://www.boost.org/LICENSE_1_0.txt [SLEEF]: https://sleef.org/ [sleef_sys]: https://crates.io/crates/sleef-sys [contributing instructions]: contributing.md [Code of Conduct]: https://www.rust-lang.org/en-US/conduct.html ================================================ FILE: bors.toml ================================================ status = [ "continuous-integration/travis-ci/push" ] ================================================ FILE: build.rs ================================================ fn main() { let target = std::env::var("TARGET").expect("TARGET environment variable not defined"); if target.contains("neon") { println!("cargo:rustc-cfg=libcore_neon"); } } ================================================ FILE: ci/all.sh ================================================ #!/usr/bin/env bash # # Performs an operation on all targets set -ex : "${1?The all.sh script requires one argument.}" op=$1 cargo_clean() { cargo clean } cargo_check_fmt() { cargo fmt --all -- --check } cargo_fmt() { cargo fmt --all } cargo_clippy() { cargo clippy --all -- -D clippy::perf } CMD="-1" case $op in clean*) CMD=cargo_clean ;; check_fmt*) CMD=cargo_check_fmt ;; fmt*) CMD=cargo_fmt ;; clippy) CMD=cargo_clippy ;; *) echo "Unknown operation: \"${op}\"" exit 1 ;; esac echo "Operation is: ${CMD}" # On src/ $CMD # Check examples/ for dir in examples/*/ do dir=${dir%*/} ( cd "${dir%*/}" $CMD ) done ( cd verify/verify $CMD ) ( cd micro_benchmarks $CMD ) ================================================ FILE: ci/android-install-ndk.sh ================================================ #!/usr/bin/env sh # Copyright 2016 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # # Licensed under the Apache License, Version 2.0 or the MIT license # , at your # option. This file may not be copied, modified, or distributed # except according to those terms. set -ex ANDROID_NDK_URL=https://dl.google.com/android/repository ANDROID_NDK_ARCHIVE=android-ndk-r25b-linux.zip curl -fO "$ANDROID_NDK_URL/$ANDROID_NDK_ARCHIVE" unzip -q $ANDROID_NDK_ARCHIVE rm $ANDROID_NDK_ARCHIVE mv android-ndk-* ndk rm -rf android-ndk-* ================================================ FILE: ci/android-install-sdk.sh ================================================ #!/usr/bin/env sh # Copyright 2016 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # # Licensed under the Apache License, Version 2.0 or the MIT license # , at your # option. This file may not be copied, modified, or distributed # except according to those terms. 
set -ex # Prep the SDK and emulator # # Note that the update process requires that we accept a bunch of licenses, and # we can't just pipe `yes` into it for some reason, so we take the same strategy # located in https://github.com/appunite/docker by just wrapping it in a script # which apparently magically accepts the licenses. mkdir sdk curl --retry 5 https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O unzip -d sdk sdk-tools-linux-3859397.zip case "$1" in arm | armv7) abi=armeabi-v7a ;; aarch64) abi=arm64-v8a ;; i686) abi=x86 ;; x86_64) abi=x86_64 ;; *) echo "invalid arch: $1" exit 1 ;; esac; # --no_https avoids # javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found yes | ./sdk/tools/bin/sdkmanager --licenses --no_https yes | ./sdk/tools/bin/sdkmanager --no_https \ "emulator" \ "platform-tools" \ "platforms;android-24" \ "system-images;android-24;default;$abi" echo "no" | ./sdk/tools/bin/avdmanager create avd \ --name "${1}" \ --package "system-images;android-24;default;$abi" ================================================ FILE: ci/android-sysimage.sh ================================================ #!/usr/bin/env bash # Copyright 2017 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # # Licensed under the Apache License, Version 2.0 or the MIT license # , at your # option. This file may not be copied, modified, or distributed # except according to those terms. set -ex URL=https://dl.google.com/android/repository/sys-img/android main() { local arch="${1}" local name="${2}" local dest=/system local td td="$(mktemp -d)" apt-get install --no-install-recommends e2tools pushd "${td}" curl --retry 5 -O "${URL}/${name}" unzip -q "${name}" local system system="$(find . -name system.img)" mkdir -p ${dest}/{bin,lib,lib64} # Extract android linker and libraries to /system # This allows android executables to be run directly (or with qemu) if [ "${arch}" = "x86_64" ] || [ "${arch}" = "arm64" ]; then e2cp -p "${system}:/bin/linker64" "${dest}/bin/" e2cp -p "${system}:/lib64/libdl.so" "${dest}/lib64/" e2cp -p "${system}:/lib64/libc.so" "${dest}/lib64/" e2cp -p "${system}:/lib64/libm.so" "${dest}/lib64/" else e2cp -p "${system}:/bin/linker" "${dest}/bin/" e2cp -p "${system}:/lib/libdl.so" "${dest}/lib/" e2cp -p "${system}:/lib/libc.so" "${dest}/lib/" e2cp -p "${system}:/lib/libm.so" "${dest}/lib/" fi # clean up apt-get purge --auto-remove -y e2tools popd rm -rf "${td}" } main "${@}" ================================================ FILE: ci/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs all benchmarks. Controlled by the following environment variables: # # FEATURES={} - cargo features to pass to all benchmarks (e.g. core_arch,sleef-sys,ispc) # NORUN={1} - only builds the benchmarks set -ex if [[ ${NORUN} != 1 ]]; then # Most benchmarks require hyperfine; require it upfront. hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi # If the ispc benchmark feature is enabled, ispc must be in the path of the # benchmarks. 
if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } fi # An example with a benchmark.sh is a benchmark: for dir in examples/*/ do dir=${dir%*/} cd ${dir%*/} if [ -f "benchmark.sh" ]; then ./benchmark.sh fi cd - done ================================================ FILE: ci/deploy_and_run_on_ios_simulator.rs ================================================ // Copyright 2017 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. // This is a script to deploy and execute a binary on an iOS simulator. // The primary use of this is to be able to run unit tests on the simulator and // retrieve the results. // // To do this through Cargo instead, use Dinghy // (https://github.com/snipsco/dinghy): cargo dinghy install, then cargo dinghy // test. use std::env; use std::fs::{self, File}; use std::io::Write; use std::path::Path; use std::process; use std::process::Command; macro_rules! t { ($e:expr) => (match $e { Ok(e) => e, Err(e) => panic!("{} failed with: {}", stringify!($e), e), }) } // Step one: Wrap as an app fn package_as_simulator_app(crate_name: &str, test_binary_path: &Path) { println!("Packaging simulator app"); drop(fs::remove_dir_all("ios_simulator_app")); t!(fs::create_dir("ios_simulator_app")); t!(fs::copy(test_binary_path, Path::new("ios_simulator_app").join(crate_name))); let mut f = t!(File::create("ios_simulator_app/Info.plist")); t!(f.write_all(format!(r#" CFBundleExecutable {} CFBundleIdentifier com.rust.unittests "#, crate_name).as_bytes())); } // Step two: Start the iOS simulator fn start_simulator() { println!("Looking for iOS simulator"); let output = t!(Command::new("xcrun").arg("simctl").arg("list").output()); assert!(output.status.success()); let mut simulator_exists = false; let mut simulator_booted = false; let mut found_rust_sim = false; let stdout = t!(String::from_utf8(output.stdout)); for line in stdout.lines() { if line.contains("rust_ios") { if found_rust_sim { panic!("Duplicate rust_ios simulators found. Please \ double-check xcrun simctl list."); } simulator_exists = true; simulator_booted = line.contains("(Booted)"); found_rust_sim = true; } } if simulator_exists == false { println!("Creating iOS simulator"); Command::new("xcrun") .arg("simctl") .arg("create") .arg("rust_ios") .arg("com.apple.CoreSimulator.SimDeviceType.iPhone-SE") .arg("com.apple.CoreSimulator.SimRuntime.iOS-10-2") .check_status(); } else if simulator_booted == true { println!("Shutting down already-booted simulator"); Command::new("xcrun") .arg("simctl") .arg("shutdown") .arg("rust_ios") .check_status(); } println!("Starting iOS simulator"); // We can't uninstall the app (if present) as that will hang if the // simulator isn't completely booted; just erase the simulator instead. 
Command::new("xcrun").arg("simctl").arg("erase").arg("rust_ios").check_status(); Command::new("xcrun").arg("simctl").arg("boot").arg("rust_ios").check_status(); } // Step three: Install the app fn install_app_to_simulator() { println!("Installing app to simulator"); Command::new("xcrun") .arg("simctl") .arg("install") .arg("booted") .arg("ios_simulator_app/") .check_status(); } // Step four: Run the app fn run_app_on_simulator() { println!("Running app"); let output = t!(Command::new("xcrun") .arg("simctl") .arg("launch") .arg("--console") .arg("booted") .arg("com.rust.unittests") .output()); println!("stdout --\n{}\n", String::from_utf8_lossy(&output.stdout)); println!("stderr --\n{}\n", String::from_utf8_lossy(&output.stderr)); let stdout = String::from_utf8_lossy(&output.stdout); let failed = stdout.lines() .find(|l| l.contains("FAILED")) .map(|l| l.contains("FAILED")) .unwrap_or(false); let passed = stdout.lines() .find(|l| l.contains("test result: ok")) .map(|l| l.contains("test result: ok")) .unwrap_or(false); println!("Shutting down simulator"); Command::new("xcrun") .arg("simctl") .arg("shutdown") .arg("rust_ios") .check_status(); if !(passed && !failed) { panic!("tests didn't pass"); } } trait CheckStatus { fn check_status(&mut self); } impl CheckStatus for Command { fn check_status(&mut self) { println!("\trunning: {:?}", self); assert!(t!(self.status()).success()); } } fn main() { let args: Vec = env::args().collect(); if args.len() != 2 { println!("Usage: {} ", args[0]); process::exit(-1); } let test_binary_path = Path::new(&args[1]); let crate_name = test_binary_path.file_name().unwrap(); package_as_simulator_app(crate_name.to_str().unwrap(), test_binary_path); start_simulator(); install_app_to_simulator(); run_app_on_simulator(); } ================================================ FILE: ci/docker/aarch64-linux-android/Dockerfile ================================================ FROM ubuntu:16.04 RUN dpkg --add-architecture i386 && \ apt-get update && \ apt-get install -y --no-install-recommends \ file \ make \ curl \ ca-certificates \ python \ unzip \ expect \ openjdk-9-jre \ libstdc++6:i386 \ libpulse0 \ gcc \ libc6-dev WORKDIR /android/ COPY android* /android/ ENV ANDROID_ARCH=aarch64 ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools RUN sh /android/android-install-ndk.sh $ANDROID_ARCH RUN sh /android/android-install-sdk.sh $ANDROID_ARCH RUN mv /root/.android /tmp RUN chmod 777 -R /tmp/.android RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* ENV PATH=$PATH:/rust/bin \ CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=aarch64-linux-android-gcc \ CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest \ OBJDUMP=aarch64-linux-android-objdump \ HOME=/tmp ADD runtest-android.rs /tmp/runtest.rs ENTRYPOINT [ \ "bash", \ "-c", \ # set SHELL so android can detect a 64bits system, see # http://stackoverflow.com/a/41789144 "SHELL=/bin/dash /android/sdk/emulator/emulator @aarch64 -no-window & \ rustc /tmp/runtest.rs -o /tmp/runtest && \ exec \"$@\"", \ "--" \ ] ================================================ FILE: ci/docker/aarch64-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-aarch64-linux-gnu \ libc6-dev-arm64-cross \ qemu-user \ make \ file ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \ 
CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64 -L /usr/aarch64-linux-gnu" \ OBJDUMP=aarch64-linux-gnu-objdump ================================================ FILE: ci/docker/arm-unknown-linux-gnueabi/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ libc6-armel-cross \ libc6-dev-armel-cross \ binutils-arm-linux-gnueabi \ gcc-arm-linux-gnueabi \ qemu-user \ make \ file ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_LINKER=arm-linux-gnueabi-gcc \ CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_RUNNER="qemu-arm -L /usr/arm-linux-gnueabi" \ OBJDUMP=arm-linux-gnueabi-objdump ================================================ FILE: ci/docker/arm-unknown-linux-gnueabihf/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-arm-linux-gnueabihf \ libc6-dev-armhf-cross \ qemu-user \ make \ file ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \ OBJDUMP=arm-linux-gnueabihf-objdump ================================================ FILE: ci/docker/armv7-linux-androideabi/Dockerfile ================================================ FROM ubuntu:16.04 RUN dpkg --add-architecture i386 && \ apt-get update && \ apt-get install -y --no-install-recommends \ file \ make \ curl \ ca-certificates \ python \ unzip \ expect \ openjdk-9-jre \ libstdc++6:i386 \ libpulse0 \ gcc \ libc6-dev WORKDIR /android/ COPY android* /android/ ENV ANDROID_ARCH=arm ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools RUN sh /android/android-install-ndk.sh $ANDROID_ARCH RUN sh /android/android-install-sdk.sh $ANDROID_ARCH RUN mv /root/.android /tmp RUN chmod 777 -R /tmp/.android RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* ENV PATH=$PATH:/rust/bin \ CARGO_TARGET_ARM_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \ CARGO_TARGET_ARM_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \ OBJDUMP=arm-linux-androideabi-objdump \ HOME=/tmp ADD runtest-android.rs /tmp/runtest.rs ENTRYPOINT [ \ "bash", \ "-c", \ # set SHELL so android can detect a 64bits system, see # http://stackoverflow.com/a/41789144 "SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \ rustc /tmp/runtest.rs -o /tmp/runtest && \ exec \"$@\"", \ "--" \ ] ================================================ FILE: ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-arm-linux-gnueabihf \ libc6-dev-armhf-cross \ qemu-user \ make \ file ENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \ OBJDUMP=arm-linux-gnueabihf-objdump ================================================ FILE: ci/docker/i586-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc-multilib \ libc6-dev \ file \ make \ ca-certificates ================================================ FILE: ci/docker/i686-unknown-linux-gnu/Dockerfile ================================================ 
FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc-multilib \ libc6-dev \ file \ make \ ca-certificates ================================================ FILE: ci/docker/mips-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-mips-linux-gnu libc6-dev-mips-cross \ qemu-system-mips \ qemu-user \ make \ file ENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER=mips-linux-gnu-gcc \ CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER="qemu-mips -L /usr/mips-linux-gnu" \ OBJDUMP=mips-linux-gnu-objdump ================================================ FILE: ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-mips64-linux-gnuabi64 libc6-dev-mips64-cross \ qemu-system-mips64 qemu-user ENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER=mips64-linux-gnuabi64-gcc \ CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64 -L /usr/mips64-linux-gnuabi64" \ OBJDUMP=mips64-linux-gnuabi64-objdump ================================================ FILE: ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-mips64el-linux-gnuabi64 libc6-dev-mips64el-cross \ qemu-system-mips64el ENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER=mips64el-linux-gnuabi64-gcc \ CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64el -L /usr/mips64el-linux-gnuabi64" \ OBJDUMP=mips64el-linux-gnuabi64-objdump ================================================ FILE: ci/docker/mipsel-unknown-linux-musl/Dockerfile ================================================ FROM ubuntu:18.10 RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ gcc \ libc6-dev \ make \ qemu-user \ qemu-system-mips \ bzip2 \ curl \ file RUN mkdir /toolchain # Note that this originally came from: # https://downloads.openwrt.org/snapshots/trunk/malta/generic/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 RUN curl -L https://ci-mirrors.rust-lang.org/libc/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 | \ tar xjf - -C /toolchain --strip-components=2 ENV PATH=$PATH:/rust/bin:/toolchain/bin \ CC_mipsel_unknown_linux_musl=mipsel-openwrt-linux-gcc \ CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_LINKER=mipsel-openwrt-linux-gcc \ CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_RUNNER="qemu-mipsel -L /toolchain" ================================================ FILE: ci/docker/powerpc-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \ qemu-system-ppc \ make \ file ENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER=powerpc-linux-gnu-gcc \ CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc -cpu Vger -L /usr/powerpc-linux-gnu" \ CC=powerpc-linux-gnu-gcc \ OBJDUMP=powerpc-linux-gnu-objdump ================================================ FILE: ci/docker/powerpc64-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get 
update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-powerpc64-linux-gnu \ libc6-dev-ppc64-cross \ qemu-user \ qemu-system-ppc \ make \ file ENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER=powerpc64-linux-gnu-gcc \ CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64 -L /usr/powerpc64-linux-gnu" \ CC=powerpc64-linux-gnu-gcc \ OBJDUMP=powerpc64-linux-gnu-objdump ================================================ FILE: ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \ qemu-system-ppc file make ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER=powerpc64le-linux-gnu-gcc \ CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64le -L /usr/powerpc64le-linux-gnu" \ CC=powerpc64le-linux-gnu-gcc \ OBJDUMP=powerpc64le-linux-gnu-objdump ================================================ FILE: ci/docker/s390x-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ curl \ cmake \ gcc \ libc6-dev \ g++-s390x-linux-gnu \ libc6-dev-s390x-cross \ qemu-user \ make \ file ENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \ CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER="qemu-s390x -L /usr/s390x-linux-gnu" \ CC_s390x_unknown_linux_gnu=s390x-linux-gnu-gcc \ CXX_s390x_unknown_linux_gnu=s390x-linux-gnu-g++ \ OBJDUMP=s390x-linux-gnu-objdump ================================================ FILE: ci/docker/sparc64-unknown-linux-gnu/Dockerfile ================================================ FROM debian:bookworm RUN apt-get update && apt-get install -y --no-install-recommends \ curl ca-certificates \ gcc libc6-dev \ gcc-sparc64-linux-gnu libc6-dev-sparc64-cross \ qemu-system-sparc64 openbios-sparc seabios ipxe-qemu \ p7zip-full cpio COPY linux-sparc64.sh / RUN bash /linux-sparc64.sh COPY test-runner-linux / ENV CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_LINKER=sparc64-linux-gnu-gcc \ CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_RUNNER="/test-runner-linux sparc64" \ CC_sparc64_unknown_linux_gnu=sparc64-linux-gnu-gcc \ PATH=$PATH:/rust/bin ================================================ FILE: ci/docker/thumbv7neon-linux-androideabi/Dockerfile ================================================ FROM ubuntu:16.04 RUN dpkg --add-architecture i386 && \ apt-get update && \ apt-get install -y --no-install-recommends \ file \ make \ curl \ ca-certificates \ python \ unzip \ expect \ openjdk-9-jre \ libstdc++6:i386 \ libpulse0 \ gcc \ libc6-dev WORKDIR /android/ COPY android* /android/ ENV ANDROID_ARCH=arm ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools RUN sh /android/android-install-ndk.sh $ANDROID_ARCH RUN sh /android/android-install-sdk.sh $ANDROID_ARCH RUN mv /root/.android /tmp RUN chmod 777 -R /tmp/.android RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* ENV PATH=$PATH:/rust/bin \ CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \ CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \ OBJDUMP=arm-linux-androideabi-objdump \ HOME=/tmp ADD runtest-android.rs /tmp/runtest.rs ENTRYPOINT [ \ "bash", \ "-c", \ # set SHELL so android can detect a 64bits system, see # http://stackoverflow.com/a/41789144 
"SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \ rustc /tmp/runtest.rs -o /tmp/runtest && \ exec \"$@\"", \ "--" \ ] ================================================ FILE: ci/docker/thumbv7neon-unknown-linux-gnueabihf/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-arm-linux-gnueabihf \ libc6-dev-armhf-cross \ qemu-user \ make \ file ENV CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \ OBJDUMP=arm-linux-gnueabihf-objdump ================================================ FILE: ci/docker/wasm32-unknown-unknown/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update -y && apt-get install -y --no-install-recommends \ ca-certificates \ clang \ cmake \ curl \ git \ libc6-dev \ make \ ninja-build \ python-is-python3 \ xz-utils # Install `wasm2wat` RUN git clone --recursive https://github.com/WebAssembly/wabt RUN make -C wabt -j$(nproc) ENV PATH=$PATH:/wabt/bin # Install `wasm-bindgen-test-runner` RUN curl -L https://github.com/rustwasm/wasm-bindgen/releases/download/0.2.87/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl.tar.gz \ | tar xzf - # Keep in sync with the version on Cargo.toml. ENV PATH=$PATH:/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner # Install `node` RUN curl https://nodejs.org/dist/v14.16.0/node-v14.16.0-linux-x64.tar.xz | tar xJf - ENV PATH=$PATH:/node-v14.16.0-linux-x64/bin # We use a shim linker that removes `--strip-debug` when passed to LLD. While # this typically results in invalid debug information in release mode it doesn't # result in an invalid names section which is what we're interested in. COPY lld-shim.rs / ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_LINKER=/tmp/lld-shim # Rustc isn't available until this container starts, so defer compilation of the # shim. ENTRYPOINT /rust/bin/rustc /lld-shim.rs -o /tmp/lld-shim && exec bash "$@" ================================================ FILE: ci/docker/x86_64-linux-android/Dockerfile ================================================ FROM ubuntu:20.04 RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ curl \ gcc \ libc-dev \ python \ unzip \ file \ make WORKDIR /android/ ENV ANDROID_ARCH=x86_64 COPY android-install-ndk.sh /android/ RUN sh /android/android-install-ndk.sh ENV STDARCH_ASSERT_INSTR_LIMIT=30 # We do not run x86_64-linux-android tests on an android emulator. # See ci/android-sysimage.sh for informations about how tests are run. 
COPY android-sysimage.sh /android/ RUN bash /android/android-sysimage.sh x86_64 x86_64-24_r07.zip ENV PATH=$PATH:/rust/bin:/android/ndk/toolchains/llvm/prebuilt/linux-x86_64/bin \ CARGO_TARGET_X86_64_LINUX_ANDROID_LINKER=x86_64-linux-android21-clang \ CC_x86_64_linux_android=x86_64-linux-android21-clang \ CXX_x86_64_linux_android=x86_64-linux-android21-clang++ \ OBJDUMP=llvm-objdump \ HOME=/tmp ================================================ FILE: ci/docker/x86_64-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ libc6-dev \ file \ make \ ca-certificates \ cmake \ libclang-dev \ clang ================================================ FILE: ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ libc6-dev \ file \ make \ ca-certificates \ wget \ bzip2 \ cmake \ libclang-dev \ clang RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2 RUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2 ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.16.0-2018-01-30-lin/sde64 --" ================================================ FILE: ci/dox.sh ================================================ #!/bin/sh set -ex rm -rf target/doc mkdir -p target/doc # Build API documentation cargo doc --features=into_bits # Build Performance Guide # FIXME: https://github.com/rust-lang-nursery/mdBook/issues/780 # mdbook build perf-guide -d target/doc/perf-guide cd perf-guide mdbook build cd - cp -r perf-guide/book target/doc/perf-guide # If we're on travis, not a PR, and on the right branch, publish! if [ "$TRAVIS_PULL_REQUEST" = "false" ] && [ "$TRAVIS_BRANCH" = "master" ]; then python3 -vV pip -vV python3.9 -vV pip install ghp_import --user ghp-import -n target/doc git push -qf https://${GH_PAGES}@github.com/${TRAVIS_REPO_SLUG}.git gh-pages fi ================================================ FILE: ci/linux-s390x.sh ================================================ set -ex mkdir -m 777 /qemu cd /qemu curl -LO https://github.com/qemu/qemu/raw/master/pc-bios/s390-ccw.img curl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/kernel.debian curl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/initrd.debian mv kernel.debian kernel mv initrd.debian initrd.gz mkdir init cd init gunzip -c ../initrd.gz | cpio -id rm ../initrd.gz cp /usr/s390x-linux-gnu/lib/libgcc_s.so.1 usr/lib/ chmod a+w . ================================================ FILE: ci/linux-sparc64.sh ================================================ set -ex mkdir -m 777 /qemu cd /qemu curl -LO https://cdimage.debian.org/cdimage/ports/9.0/sparc64/iso-cd/debian-9.0-sparc64-NETINST-1.iso 7z e debian-9.0-sparc64-NETINST-1.iso boot/initrd.gz 7z e debian-9.0-sparc64-NETINST-1.iso boot/sparc64 mv sparc64 kernel rm debian-9.0-sparc64-NETINST-1.iso mkdir init cd init gunzip -c ../initrd.gz | cpio -id rm ../initrd.gz cp /usr/sparc64-linux-gnu/lib/libgcc_s.so.1 usr/lib/ chmod a+w . 
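The Dockerfiles above wire Cargo to custom test runners through `CARGO_TARGET_<TRIPLE>_RUNNER` (a `qemu-user` invocation for most Linux targets, `/tmp/runtest` for the Android images, `/test-runner-linux` for the s390x/sparc64 system-emulation images prepared by the two scripts above). Cargo's contract is simple: it invokes the runner with the path of the freshly built test binary, followed by any test-harness arguments. A minimal sketch of such a runner in Rust, assuming a `qemu-aarch64` user-mode setup (the target, sysroot path, and names are illustrative, not part of this repository):

```rust
// Hypothetical runner sketch: Cargo calls it as `<runner> <test-binary> [args...]`.
// It re-executes the cross-compiled binary under user-mode qemu and propagates
// the exit status back to Cargo.
use std::env;
use std::process::{exit, Command};

fn main() {
    let mut args = env::args_os().skip(1);
    let binary = args.next().expect("usage: runner <test-binary> [args...]");

    let status = Command::new("qemu-aarch64")
        .arg("-L")
        .arg("/usr/aarch64-linux-gnu") // assumed sysroot, as in the Dockerfiles above
        .arg(&binary)
        .args(args) // forward harness flags such as `--quiet`
        .status()
        .expect("failed to spawn qemu-aarch64");

    exit(status.code().unwrap_or(1));
}
```

`ci/runtest-android.rs` and `ci/test-runner-linux` below follow the same contract, but push the binary onto an emulator or boot it inside a qemu VM instead of running it directly.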
================================================ FILE: ci/lld-shim.rs ================================================ use std::os::unix::prelude::*; use std::process::Command; use std::env; fn main() { let args = env::args() .skip(1) .filter(|s| s != "--strip-debug") .collect::>(); panic!("failed to exec: {}", Command::new("rust-lld").args(&args).exec()); } ================================================ FILE: ci/max_line_width.sh ================================================ #!/usr/bin/env sh set -x export success=true find . -iname '*.rs' | while read -r file; do result=$(grep '.\{79\}' "${file}" | grep --invert 'http') if [ "${result}" = "" ] then : else echo "file \"${file}\": $result" exit 1 fi done ================================================ FILE: ci/run-docker.sh ================================================ # Small script to run tests for a target (or all targets) inside all the # respective docker images. set -ex run() { echo "Building docker container for TARGET=${TARGET} RUSTFLAGS=${RUSTFLAGS}" docker build -t packed_simd -f ci/docker/${TARGET}/Dockerfile ci/ mkdir -p target target=$(echo "${TARGET}" | sed 's/-emulated//') echo "Running docker" docker run \ --user `id -u`:`id -g` \ --rm \ --init \ --volume $HOME/.cargo:/cargo \ --env CARGO_HOME=/cargo \ --volume `rustc --print sysroot`:/rust:ro \ --env TARGET=$target \ --env NORUN \ --env NOVERIFY \ --env RUSTFLAGS \ --volume `pwd`:/checkout:ro \ --volume `pwd`/target:/checkout/target \ --workdir /checkout \ --privileged \ packed_simd \ bash \ -c 'PATH=$PATH:/rust/bin exec ci/run.sh' } if [ -z "${TARGET}" ]; then for d in `ls ci/docker/`; do run $d done else run ${TARGET} fi ================================================ FILE: ci/run.sh ================================================ #!/usr/bin/env bash set -ex : ${TARGET?"The TARGET environment variable must be set."} # Tests are all super fast anyway, and they fault often enough on travis that # having only one thread increases debuggability to be worth it. #export RUST_TEST_THREADS=1 #export RUST_BACKTRACE=full #export RUST_TEST_NOCAPTURE=1 # Some appveyor builds run out-of-memory; this attempts to mitigate that: # https://github.com/rust-lang-nursery/packed_simd/issues/39 # export RUSTFLAGS="${RUSTFLAGS} -C codegen-units=1" # export CARGO_BUILD_JOBS=1 export CARGO_SUBCMD=test if [[ "${NORUN}" == "1" ]]; then export CARGO_SUBCMD=build fi if [[ ${TARGET} == "x86_64-apple-ios" ]] || [[ ${TARGET} == "i386-apple-ios" ]]; then export RUSTFLAGS="${RUSTFLAGS} -Clink-arg=-mios-simulator-version-min=7.0" rustc ./ci/deploy_and_run_on_ios_simulator.rs -o $HOME/runtest export CARGO_TARGET_X86_64_APPLE_IOS_RUNNER=$HOME/runtest export CARGO_TARGET_I386_APPLE_IOS_RUNNER=$HOME/runtest fi # The source directory is read-only. Need to copy internal crates to the target # directory for their Cargo.lock to be properly written. 
mkdir target || true rustc --version cargo --version echo "TARGET=${TARGET}" echo "HOST=${HOST}" echo "RUSTFLAGS=${RUSTFLAGS}" echo "NORUN=${NORUN}" echo "NOVERIFY=${NOVERIFY}" echo "CARGO_SUBCMD=${CARGO_SUBCMD}" echo "CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS}" echo "CARGO_INCREMENTAL=${CARGO_INCREMENTAL}" echo "RUST_TEST_THREADS=${RUST_TEST_THREADS}" echo "RUST_BACKTRACE=${RUST_BACKTRACE}" echo "RUST_TEST_NOCAPTURE=${RUST_TEST_NOCAPTURE}" cargo_test() { cmd="cargo ${CARGO_SUBCMD} --verbose --target=${TARGET} ${@}" if [ "${NORUN}" != "1" ] then if [ "$TARGET" != "wasm32-unknown-unknown" ] then cmd="$cmd -- --quiet" fi fi mkdir target || true ${cmd} 2>&1 | tee > target/output if [[ ${PIPESTATUS[0]} != 0 ]]; then cat target/output return 1 fi } cargo_test_impl() { ORIGINAL_RUSTFLAGS=${RUSTFLAGS} RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v16 --cfg test_v32 --cfg test_v64" cargo_test ${@} RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v128 --cfg test_v256" cargo_test ${@} RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v512" cargo_test ${@} RUSTFLAGS=${ORIGINAL_RUSTFLAGS} } # Debug run: if [[ "${TARGET}" != "wasm32-unknown-unknown" ]]; then # Run wasm32-unknown-unknown in release mode only cargo_test_impl fi if [[ "${TARGET}" == "x86_64-unknown-linux-gnu" ]] || [[ "${TARGET}" == "x86_64-pc-windows-msvc" ]]; then # use sleef on linux and windows x86_64 builds # FIXME: Use `core_arch,sleef-sys` features once they works again cargo_test_impl --release --features=into_bits else # FIXME: Use `core_arch` feature once it works again cargo_test_impl --release --features=into_bits fi # Verify code generation if [[ "${NOVERIFY}" != "1" ]]; then cp -r verify/verify target/verify export STDSIMD_ASSERT_INSTR_LIMIT=30 if [[ "${TARGET}" == "i586-unknown-linux-gnu" ]]; then export STDSIMD_ASSERT_INSTR_LIMIT=50 fi cargo_test --release --manifest-path=target/verify/Cargo.toml fi # FIXME: Figure out which examples take too long to run and ignore or adjust those #. ci/run_examples.sh ================================================ FILE: ci/run_examples.sh ================================================ # Runs all examples. # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/55 # All examples fail to build for `armv7-apple-ios`. 
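Stepping back to `ci/run.sh` above: `cargo_test_impl` runs the suite three times, each time injecting a different set of `--cfg` flags (`test_v16`/`test_v32`/`test_v64`, then `test_v128`/`test_v256`, then `test_v512`) through `RUSTFLAGS`, so each invocation only compiles the tests for a subset of vector widths. A minimal sketch, assuming a conventional cfg-gated test module (the module and assertions are illustrative, not the crate's actual test layout):

```rust
// Only compiled when RUSTFLAGS contains `--cfg test_v256`,
// e.g. the second batch run by cargo_test_impl in ci/run.sh.
#[cfg(all(test, test_v256))]
mod test_v256_widths {
    use packed_simd::f32x8; // 256-bit wide vector: 8 x f32

    #[test]
    fn splat_and_sum() {
        let v = f32x8::splat(2.0);
        assert_eq!(v.sum(), 16.0);
    }
}
```

Splitting the widths across separate compilations presumably keeps compile times and test binaries manageable, which matters most on the slower emulated targets driven by the Dockerfiles above.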
if [[ ${TARGET} == "armv7-apple-ios" ]]; then exit 0 fi # FIXME: travis exceeds 50 minutes on these targets # Skipping the examples is an attempt at preventing travis from timing-out if [[ ${TARGET} == "arm-linux-androidabi" ]] || [[ ${TARGET} == "aarch64-linux-androidabi" ]] \ || [[ ${TARGET} == "sparc64-unknown-linux-gnu" ]]; then exit 0 fi if [[ ${TARGET} == "wasm32-unknown-unknown" ]]; then exit 0 fi cp -r examples/aobench target/aobench cargo_test --manifest-path=target/aobench/Cargo.toml --release --no-default-features cargo_test --manifest-path=target/aobench/Cargo.toml --release --features=256bit cp -r examples/dot_product target/dot_product cargo_test --manifest-path=target/dot_product/Cargo.toml --release cp -r examples/fannkuch_redux target/fannkuch_redux cargo_test --manifest-path=target/fannkuch_redux/Cargo.toml --release # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/56 if [[ ${TARGET} != "i586-unknown-linux-gnu" ]]; then cp -r examples/mandelbrot target/mandelbrot cargo_test --manifest-path=target/mandelbrot/Cargo.toml --release fi cp -r examples/matrix_inverse target/matrix_inverse cargo_test --manifest-path=target/matrix_inverse/Cargo.toml --release cp -r examples/nbody target/nbody cargo_test --manifest-path=target/nbody/Cargo.toml --release cp -r examples/spectral_norm target/spectral_norm cargo_test --manifest-path=target/spectral_norm/Cargo.toml --release if [[ ${TARGET} != "i586-unknown-linux-gnu" ]]; then cp -r examples/stencil target/stencil cargo_test --manifest-path=target/stencil/Cargo.toml --release fi cp -r examples/triangle_xform target/triangle_xform cargo_test --manifest-path=target/triangle_xform/Cargo.toml --release ================================================ FILE: ci/runtest-android.rs ================================================ use std::env; use std::process::Command; use std::path::{Path, PathBuf}; fn main() { let args = env::args_os() .skip(1) .filter(|arg| arg != "--quiet") .collect::>(); assert_eq!(args.len(), 1); let test = PathBuf::from(&args[0]); let dst = Path::new("/data/local/tmp").join(test.file_name().unwrap()); let status = Command::new("adb") .arg("wait-for-device") .status() .expect("failed to run: adb wait-for-device"); assert!(status.success()); let status = Command::new("adb") .arg("push") .arg(&test) .arg(&dst) .status() .expect("failed to run: adb pushr"); assert!(status.success()); let output = Command::new("adb") .arg("shell") .arg(&dst) .output() .expect("failed to run: adb shell"); assert!(status.success()); println!("status: {}\nstdout ---\n{}\nstderr ---\n{}", output.status, String::from_utf8_lossy(&output.stdout), String::from_utf8_lossy(&output.stderr)); let stdout = String::from_utf8_lossy(&output.stdout); let mut lines = stdout.lines().filter(|l| l.starts_with("test result")); if !lines.all(|l| l.contains("test result: ok") && l.contains("0 failed")) { panic!("failed to find successful test run"); } } ================================================ FILE: ci/setup_benchmarks.sh ================================================ #!/usr/bin/env bash set -ex # Get latest ISPC binary for the target and put it in the path git clone https://github.com/gnzlbg/ispc-binaries cp ispc-binaries/ispc-${TARGET} ispc ================================================ FILE: ci/test-runner-linux ================================================ #!/bin/sh set -e arch=$1 prog=$2 cd /qemu/init cp -f $2 prog find . | cpio --create --format='newc' --quiet | gzip > ../initrd.gz cd .. 
timeout 30s qemu-system-$arch \ -m 1024 \ -nographic \ -kernel kernel \ -initrd initrd.gz \ -append init=/prog > output || true # remove kernel messages tr -d '\r' < output | egrep -v '^\[' # if the output contains a failure, return error ! grep FAILED output > /dev/null ================================================ FILE: contributing.md ================================================ # Contributing to `packed_simd` Welcome! If you are reading this document, it means you are interested in contributing to the `packed_simd` crate. ## Reporting issues All issues with this crate are tracked using GitHub's [Issue Tracker]. You can use issues to bring bugs to the attention of the maintainers, to discuss certain problems encountered with the crate, or to request new features (although feature requests should be limited to things mentioned in the [RFC]). One thing to keep in mind is to always use the **latest** nightly toolchain when working on this crate. Due to the nature of this project, we use a lot of unstable features, meaning breakage happens often. [Issue Tracker]: https://github.com/rust-lang-nursery/packed_simd/issues [RFC]: https://github.com/rust-lang/rfcs/pull/2366 ### LLVM issues The Rust compiler relies on [LLVM](https://llvm.org/) for machine code generation, and quite a few LLVM bugs have been discovered during the development of this project. If you encounter issues with incorrect/suboptimal codegen, which you do not encounter when using the [SIMD vendor intrinsics](https://doc.rust-lang.org/nightly/std/arch/), it is likely the issue is with LLVM, or this crate's interaction with it. You should first open an issue **in this repo** to help us track the problem, and we will help determine what is the exact cause of the problem. If LLVM is indeed the cause, the issue will be reported upstream to the [LLVM bugtracker](https://bugs.llvm.org/). ## Submitting Pull Requests New code is submitted to the crate using GitHub's [pull request] mechanism. You should first fork this repository, make your changes (preferably in a new branch), then use GitHub's web UI to create a new PR. [pull request]: https://help.github.com/articles/about-pull-requests/ ### Examples The `examples` directory contains code showcasing SIMD code written with this crate, usually in comparison to scalar or ISPC code. If you have a project / idea which uses SIMD, we'd love to add it to the examples list. Every example should include a small `README`, describing the example code's purpose. If your example could potentially work as a benchmark, then add a `benchmark.sh` script to allow running the example benchmark code in CI. See an existing example's [`benchmark.sh`](examples/aobench/benchmark.sh) for a sample. Don't forget to update the crate's top-level `README` with a link to your example. ### Perf guide The objective of the [performance guide][perf-guide] is to be a comprehensive resource detailing the process of optimizing Rust code with SIMD support. If you believe a certain section could be reworded, or if you have any tips & tricks related to SIMD which you'd like to share, please open a PR. [mdBook] is used to manage the formatting of the guide as a book. 
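Returning to the LLVM-issues guidance in `contributing.md` above: a practical way to compare this crate's codegen against the vendor intrinsics is to compile two functionally equivalent kernels and inspect their assembly. A minimal, hypothetical sketch (the function names and the choice of operation are illustrative):

```rust
// Compare the generated code of these two kernels, e.g. with
// `cargo rustc --release -- --emit=asm` or `objdump -d`.
use packed_simd::f32x4;

#[inline(never)]
pub fn add_packed(a: f32x4, b: f32x4) -> f32x4 {
    a + b // expected to lower to an `addps` on x86_64
}

#[cfg(target_arch = "x86_64")]
#[inline(never)]
pub fn add_intrinsic(
    a: std::arch::x86_64::__m128,
    b: std::arch::x86_64::__m128,
) -> std::arch::x86_64::__m128 {
    // `_mm_add_ps` maps directly to `addps`; SSE is part of the x86_64 baseline.
    unsafe { std::arch::x86_64::_mm_add_ps(a, b) }
}
```

If the `packed_simd` version produces materially worse code than the intrinsic version for the same operation, that is the kind of discrepancy worth reporting in an issue here first.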
[perf-guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/ [mdBook]: https://github.com/rust-lang-nursery/mdBook ================================================ FILE: examples/Cargo.toml ================================================ # FIXME: Many members of this workspace, including aobench, mandelbrot, and stencil, # currently trigger a "null pointer deref" warning. # This is likely due to unsoundness inside packed_simd. [workspace] members = [ "aobench", "dot_product", "fannkuch_redux", "mandelbrot", "matrix_inverse", "nbody", "options_pricing", "slice_sum", "spectral_norm", "stencil", "triangle_xform", ] [profile.release] # Remember to uncomment this when profiling # debug = 2 # You can set the following to lto = 'thin' and 'codegen-units=16' # for better compile times at the cost of performance lto = 'fat' codegen-units = 1 incremental = false panic = 'abort' [profile.bench] # Same as above lto = 'fat' codegen-units = 1 incremental = false ================================================ FILE: examples/aobench/Cargo.toml ================================================ [package] name = "aobench" version = "0.1.0" authors = ["gnzlbg "] autobenches = false edition = "2018" [[bin]] name = "aobench" path = "src/main.rs" [lib] name = "aobench_lib" path = "src/lib.rs" [dependencies] structopt = "^0.3" failure = "^0.1" png = "^0.15" packed_simd = { package = "packed_simd", path = "../.." } rayon = "^1.0" time = "^0.1" cfg-if = "^0.1" ispc = { version = "^1.0.4", optional = true } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [dev-dependencies] criterion = { version = '^0.3', features=['real_blackbox'] } [features] default = [ "256bit" ] 256bit = [] sleef-sys = [ "packed_simd/sleef-sys" ] core_arch = [ "packed_simd/core_arch" ] [[bench]] name = "isec_sphere" path = "benches/isec_sphere.rs" harness = false [[bench]] name = "isec_plane" path = "benches/isec_plane.rs" harness = false [[bench]] name = "ambient_occlusion" path = "benches/ambient_occlusion.rs" harness = false [[bench]] name = "random" path = "benches/random.rs" harness = false ================================================ FILE: examples/aobench/benches/ambient_occlusion.rs ================================================ //! Benchmarks intersection between rays and planes #![feature(stdsimd)] use aobench_lib::*; use criterion::*; use intersection::Isect; use aobench_lib::scene::Test; fn hit_scalar(c: &mut Criterion) { let mut scene = Test::default(); c.bench( "scalar", Benchmark::new("ao_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut scene); let mut v = ambient_occlusion::scalar(s, isect); black_box(&mut v); }) }) .throughput(Throughput::Elements(1)), ); } fn hit_vector(c: &mut Criterion) { let mut scene = Test::default(); c.bench( "vector", Benchmark::new("ao_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut scene); let mut v = ambient_occlusion::vector(s, isect); black_box(&mut v); }) }) .throughput(Throughput::Elements(1)), ); } criterion_group!(benches, hit_scalar, hit_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/isec_plane.rs ================================================ //! 
Benchmarks intersection between rays and planes #![feature(stdsimd)] use criterion::*; use crate::geometry::{f32xN, Plane, Ray, RayxN, V3DxN, V3D}; use crate::intersection::{Intersect, Isect, IsectxN}; use aobench_lib::*; fn hit_scalar(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: 10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_plane_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, true); }) }) .throughput(Throughput::Elements(1)), ); } fn miss_scalar(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: -10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_plane_miss", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, false); }) }) .throughput(Throughput::Elements(1)), ); } fn hit_vector(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: 10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_plane_hit", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.all(), true); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } fn miss_vector(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: -10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_plane_miss", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.any(), false); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } criterion_group!(benches, hit_scalar, miss_scalar, hit_vector, miss_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/isec_sphere.rs ================================================ //! 
Benchmarks intersection between rays and spheres #![feature(stdsimd)] use crate::geometry::{f32xN, Ray, RayxN, Sphere, V3DxN, V3D}; use crate::intersection::{Intersect, Isect, IsectxN}; use aobench_lib::*; use criterion::*; fn hit_scalar(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: 10., }, radius: 1., }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_sphere_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, true); }) }) .throughput(Throughput::Elements(1)), ); } fn miss_scalar(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: -10., }, radius: 1., }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_sphere_miss", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, false); }) }) .throughput(Throughput::Elements(1)), ); } fn hit_vector(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: 10., }, radius: 1., }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_sphere_hit", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.all(), true); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } fn miss_vector(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: -10., }, radius: 1., }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_sphere_miss", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.any(), false); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } criterion_group!(benches, hit_scalar, miss_scalar, hit_vector, miss_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/random.rs ================================================ //! 
Benchmarks PNRG #![feature(stdsimd)] use aobench_lib::geometry::f32xN; use aobench_lib::random; use criterion::*; fn random_scalar(c: &mut Criterion) { c.bench( "scalar", Benchmark::new("random", move |b| { let mut rng = random::scalar::thread_rng(); b.iter(|| { black_box(rng.gen()); }) }) .throughput(Throughput::Elements(1)), ); } fn random_vector(c: &mut Criterion) { c.bench( "vector", Benchmark::new("random", move |b| { let mut rng = random::vector::thread_rng(); b.iter(|| { black_box(rng.gen()); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } criterion_group!(benches, random_scalar, random_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/scanlines.rs ================================================ #![feature(test)] use test::{black_box, Bencher}; #[bench] fn scanlines_scalar(b: &mut Bencher) { let width = 50; let height = 50; let width = black_box(width); let height = black_box(height); let mut fdata = Vec::new(); fdata.resize(width * height * 3, 0.); fdata = black_box(fdata); b.iter(|| { black_box(&mut fdata); aobench_lib::scalar::scanlines(0, height, width, height, 2, &mut fdata); }); } #[bench] fn scanlines_vector(b: &mut Bencher) { let width = 50; let height = 50; let width = black_box(width); let height = black_box(height); let mut fdata = Vec::new(); fdata.resize(width * height * 3, 0.); fdata = black_box(fdata); b.iter(|| { black_box(&mut fdata); aobench_lib::vector::scanlines(0, height, width, height, 2, &mut fdata); }); } ================================================ FILE: examples/aobench/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs aobench benchmarks set -ex export WIDTH=800 export HEIGHT=600 if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi ALGS=("scalar" "scalar_par" "vector" "vector_par" "tiled" "tiled_par") if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } ALGS+=("ispc" "ispc_tasks") fi echo "Benchmark 256-bit wide vectors" RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --no-default-features \ --features="${FEATURES},256bit" if [[ "${VERIFY}" == "1" ]]; then RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo test --release --no-default-features \ --features="${FEATURES},256bit" fi if [[ "${NORUN}" == "1" ]]; then exit 0 fi for alg in "${ALGS[@]}" do hyperfine "../target/release/aobench ${WIDTH} ${HEIGHT} --algo ${alg}" done echo "Benchmark 128-bit wide vectors" RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --no-default-features \ --features="${FEATURES}" for alg in "${ALGS[@]}" do hyperfine "../target/release/aobench ${WIDTH} ${HEIGHT} --algo ${alg}" done ================================================ FILE: examples/aobench/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/ao.ispc"]; for s in &ispc_files[..] 
{ cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); cfg.compile("aobench"); } } }
================================================ FILE: examples/aobench/readme.md ================================================

# Ambient Occlusion Benchmark

> Originally written by Syoyo Fujita: https://github.com/syoyo/aobench

`aobench` is a small ambient occlusion renderer for benchmarking real-world floating point performance in various languages.

![image_vector_par](https://user-images.githubusercontent.com/904614/41043073-653aa5be-69a3-11e8-8a9d-007def8516cc.png)

## Instructions

To run it with the default target options (replace `${NAME}` with an algorithm name):

```
> cargo run --release -- 800 600 --algo ${NAME}
```

Use `RUSTFLAGS` to set the target CPU, for example:

```
> RUSTFLAGS="-C target-cpu=native" cargo run --release -- 800 600 --algo ${NAME}
```

## Results

```
./benchmark.sh
```

On a dual core AVX1 i5 @1.8 GHz:

| 800 x 600    | time [ms] (Rust) | speedup vs `scalar` [-] |
|--------------|------------------|-------------------------|
| `scalar`     | 5884             | 1.0x                    |
| `scalar_par` | 2206             | 2.7x                    |
| `vector`     | 1458             | 4.0x                    |
| `vector_par` | 622              | 9.5x                    |
| `tiled`      | 1328             | 4.4x                    |
| `tiled_par`  | 578              | 10.2x                   |
| `ispc`       | 1158             | 5.1x                    |
| `ispc_tasks` | 567              | 10.4x                   |

`tiled_par` is 1.02x slower than `ispc_tasks`.

On a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz:

| 800 x 600    | time [ms] (Rust) | speedup vs `scalar` [-] |
|--------------|------------------|-------------------------|
| `scalar`     | 2981             | 1.0x                    |
| `scalar_par` | 163              | 18.2x                   |
| `vector`     | 692              | 4.3x                    |
| `vector_par` | 98               | 30.4x                   |
| `tiled`      | 640              | 4.7x                    |
| `tiled_par`  | 98               | 30.4x                   |
| `ispc`       | 576              | 5.2x                    |
| `ispc_tasks` | 150              | 19.9x                   |

`tiled_par` is 1.53x faster than `ispc_tasks`.

On a 40 core Xeon Gold 6148 CPU @ 2.40GHz:

| 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 3215 | 1.0x | | `scalar_par` | 186 | 17.0x | | `vector` | 802 | 4.0x | | `vector_par` | 106 | 30.3x | | `tiled` | 770 | 4.2x | | `tiled_par` | 102 | 32.1x | | `ispc` | 491 | 6.5x | | `ispc_tasks` | 153 | 21.7x | `tiled_par` is 1.5x faster than `ispc_tasks`. ## Overview There are 4 main pieces in the `aobench` benchmark: * ray-plane intersection algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/intersection/ray_plane.rs) * ray-sphere intersection algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/intersection/ray_sphere.rs) * ambient occlusion algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/ambient_occlusion.rs) * ray-casting the pixels: * scalar serial: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/scalar.rs) * scalar parallel: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/scalar_parallel.rs) * vector serial: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/vector.rs) * vector parallel: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/vector_parallel.rs) The scalar and vectorized implementations of the intersection and ao algorithms are in the same file so that they can be easily compared. As a comparison, the ISPC sources of the same benchmark are [here](https://github.com/ispc/ispc/tree/master/examples/aobench). ================================================ FILE: examples/aobench/rustfmt.toml ================================================ max_width = 79 ================================================ FILE: examples/aobench/src/ambient_occlusion.rs ================================================ //! Ambient Occlusion implementations use crate::geometry::{f32xN, Ray, RayxN, Selectable, V3DxN, V3D}; use crate::intersection::{Intersect, Isect, IsectxN}; use crate::scene::Scene; use std::f32::consts::PI; /// Scalar ambient occlusion algorithm #[inline(always)] pub fn scalar(scene: &mut S, isect: &Isect) -> f32 { let mut occlusion: f32 = 0.0; let basis = isect.n.ortho_basis(); let eps: f32 = 0.0001; let origin = isect.p + eps * isect.n; let ntheta: usize = S::NAO_SAMPLES; let nphi: usize = S::NAO_SAMPLES; for _i in 0..ntheta { for _j in 0..nphi { let theta = scene.rand().sqrt(); let phi = 2. * PI * scene.rand(); let n = V3D { x: phi.cos() * theta, y: phi.sin() * theta, z: (1.0 - theta * theta).sqrt(), }; let dir = basis * n; let ray = Ray { origin, dir }; let mut occ_isect = Isect::default(); for s in scene.spheres() { occ_isect = ray.intersect(s, occ_isect); } occ_isect = ray.intersect(scene.plane(), occ_isect); if occ_isect.hit { occlusion += 1.; } } } 1. 
- occlusion / (ntheta * nphi) as f32 } /// Vectorized ambient occlusion algorithm using ray packets #[inline(always)] pub fn vector(scene: &mut S, isect: &Isect) -> f32 { let mut occlusion = f32xN::splat(0.0); let basis = isect.n.ortho_basis(); let eps: f32 = 0.0001; let origin = isect.p + eps * isect.n; let origin = V3DxN { x: f32xN::splat(origin.x), y: f32xN::splat(origin.y), z: f32xN::splat(origin.z), }; let ntheta: usize = S::NAO_SAMPLES; let nphi: usize = S::NAO_SAMPLES; for _i in 0..ntheta { for _j in (0..nphi).step_by(f32xN::lanes()) { let (theta, phi) = scene.rand_f32xN(); let theta = theta.sqrte(); let (sin, cos) = (2. * phi).sin_cos_pi(); let n = V3DxN { x: cos * theta, y: sin * theta, z: (f32xN::splat(1.0) - theta * theta).sqrt(), }; let dir = basis * n; let ray = RayxN { origin, dir }; let mut occ_isect = IsectxN::default(); for s in scene.spheres() { occ_isect = ray.intersect(s, occ_isect); } occ_isect = ray.intersect(scene.plane(), occ_isect); occlusion += occ_isect.hit.sel(f32xN::splat(1.), f32xN::splat(0.)); } } 1. - occlusion.sum() / (ntheta * nphi) as f32 } /// Vectorized ambient occlusion algorithm using ray packets #[inline(always)] pub fn vector_tiled(scene: &mut S, isect: &IsectxN) -> f32xN { let mut occlusion = f32xN::splat(0.0); let basis = isect.n.ortho_basis(); let eps = f32xN::splat(0.0001); let origin = isect.p + eps * isect.n; let ntheta: usize = S::NAO_SAMPLES; let nphi: usize = S::NAO_SAMPLES; for _i in 0..ntheta { for _j in 0..nphi { let (theta, phi) = scene.rand_f32xN(); let theta = theta.sqrte(); let (sin, cos) = (2. * phi).sin_cos_pi(); let n = V3DxN { x: cos * theta, y: sin * theta, z: (1.0 - theta * theta).sqrt(), }; let dir = basis * n; let ray = RayxN { origin, dir }; let mut occ_isect = IsectxN::default(); for s in scene.spheres() { occ_isect = ray.intersect(s, occ_isect); } occ_isect = ray.intersect(scene.plane(), occ_isect); occlusion += occ_isect.hit.sel(f32xN::splat(1.), f32xN::splat(0.)); } } f32xN::splat(1.) - occlusion / (ntheta * nphi) as f32 } #[cfg(test)] mod tests { use super::*; use crate::geometry::V3D; #[test] fn sanity_hit() { let scene = crate::scene::Test::default(); let mut scene_scalar = scene.clone(); let mut scene_vector = scene.clone(); let ray = Ray { origin: V3D::default(), dir: V3D { x: -0.2, y: -0.2, z: -0.2, }, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); assert!(isect.hit); let ao_scalar = scalar(&mut scene_scalar, &isect); let ao_vector = vector(&mut scene_vector, &isect); assert_eq!(ao_scalar, ao_vector); } #[test] fn sanity_miss() { let scene = crate::scene::Test::default(); let mut scene_scalar = scene.clone(); let mut scene_vector = scene.clone(); let ray = Ray { origin: V3D::default(), dir: V3D { x: 0.2, y: 0.2, z: 0.2, }, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); assert!(!isect.hit); let ao_scalar = scalar(&mut scene_scalar, &isect); let ao_vector = vector(&mut scene_vector, &isect); assert_eq!(ao_scalar, ao_vector); } } ================================================ FILE: examples/aobench/src/geometry/mod.rs ================================================ //! 
Geometry utilities use packed_simd::*; mod plane; mod ray; mod sphere; mod vec; mod rayxN; mod vecxN; pub use self::plane::Plane; pub use self::ray::Ray; pub use self::sphere::Sphere; pub use self::vec::{Dot, M3x3, V3D}; pub use self::rayxN::RayxN; pub use self::vecxN::{Selectable, V3DxN}; #[cfg(feature = "256bit")] pub type f32xN = f32x8; #[cfg(feature = "256bit")] pub type u32xN = u32x8; #[cfg(feature = "256bit")] pub type usizexN = usizex8; #[cfg(feature = "256bit")] pub type m32xN = m32x8; #[cfg(feature = "256bit")] pub type pf32xN = Simd<[*mut f32; 8]>; #[cfg(not(feature = "256bit"))] pub type f32xN = f32x4; #[cfg(not(feature = "256bit"))] pub type u32xN = u32x4; #[cfg(not(feature = "256bit"))] pub type usizexN = usizex4; #[cfg(not(feature = "256bit"))] pub type m32xN = m32x4; #[cfg(not(feature = "256bit"))] pub type pf32xN = Simd<[*mut f32; 4]>; pub trait IncrV { type Element; fn incr(x: Self::Element, step: Self::Element) -> Self; } impl IncrV for f32xN { type Element = f32; #[inline(always)] fn incr(x: f32, step: f32) -> Self { #[cfg(feature = "256bit")] { Self::new( x + 0. * step, x + 1. * step, x + 2. * step, x + 3. * step, x + 4. * step, x + 5. * step, x + 6. * step, x + 7. * step, ) } #[cfg(not(feature = "256bit"))] { Self::new( x + 0. * step, x + 1. * step, x + 2. * step, x + 3. * step, ) } } } impl IncrV for u32xN { type Element = u32; #[inline(always)] fn incr(x: u32, step: u32) -> Self { #[cfg(feature = "256bit")] { Self::new( x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step, x + 4 * step, x + 5 * step, x + 6 * step, x + 7 * step, ) } #[cfg(not(feature = "256bit"))] { Self::new(x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step) } } } impl IncrV for usizexN { type Element = usize; #[inline(always)] fn incr(x: usize, step: usize) -> Self { #[cfg(feature = "256bit")] { Self::new( x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step, x + 4 * step, x + 5 * step, x + 6 * step, x + 7 * step, ) } #[cfg(not(feature = "256bit"))] { Self::new(x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step) } } } ================================================ FILE: examples/aobench/src/geometry/plane.rs ================================================ //! Plane use crate::geometry::V3D; #[derive(Copy, Clone, Debug)] pub struct Plane { pub p: V3D, pub n: V3D, } ================================================ FILE: examples/aobench/src/geometry/ray.rs ================================================ //! A ray use crate::geometry::V3D; /// Ray starting at `origin` in `dir` direction. #[derive(Copy, Clone, Debug)] pub struct Ray { pub origin: V3D, pub dir: V3D, } ================================================ FILE: examples/aobench/src/geometry/rayxN.rs ================================================ //! Four packed rays use crate::geometry::{Ray, V3DxN}; /// Four packed rays starting at `origin` in `dir` direction. #[derive(Copy, Clone, Debug)] pub struct RayxN { pub origin: V3DxN, pub dir: V3DxN, } impl RayxN { pub fn get(&self, idx: usize) -> Ray { Ray { origin: self.origin.get(idx), dir: self.dir.get(idx), } } } ================================================ FILE: examples/aobench/src/geometry/sphere.rs ================================================ //! Sphere use crate::geometry::V3D; #[derive(Copy, Clone, Debug)] pub struct Sphere { pub center: V3D, pub radius: f32, } ================================================ FILE: examples/aobench/src/geometry/vec.rs ================================================ //! 
A simple vector type use std::ops::*; #[derive(Copy, Clone, Debug, PartialEq)] pub struct V3D { pub x: f32, pub y: f32, pub z: f32, } impl Default for V3D { #[inline(always)] #[must_use] fn default() -> Self { Self { x: 0., y: 0., z: 0., } } } pub type M3x3 = [V3D; 3]; impl V3D { #[inline(always)] #[must_use] pub fn cross(self, o: Self) -> Self { Self { x: self.y * o.z - self.z * o.y, y: self.z * o.x - self.x * o.z, z: self.x * o.y - self.y * o.x, } } #[inline(always)] #[must_use] pub fn normalized(self) -> Self { let len2 = self.dot(self); let invlen = len2.sqrt().recip(); invlen * self } #[inline(always)] #[must_use] pub fn ortho_basis(self) -> M3x3 { let n = self; let mut basis = [Self::default(), Self::default(), n]; if n.x < 0.6 && n.x > -0.6 { basis[1].x = 1.0; } else if n.y < 0.6 && n.y > -0.6 { basis[1].y = 1.0; } else if n.z < 0.6 && n.z > -0.6 { basis[1].z = 1.0; } else { basis[1].x = 1.0; } basis[0] = basis[1].cross(basis[2]).normalized(); basis[1] = basis[2].cross(basis[0]).normalized(); basis } // Fuzzy float comparison between vectors #[inline(always)] #[must_use] pub fn almost_eq(&self, rhs: &Self) -> bool { const EPSILON: f32 = 1E-3; (self.x - rhs.x).abs() < EPSILON && (self.y - rhs.y).abs() < EPSILON && (self.z - rhs.z).abs() < EPSILON } } impl Add for V3D { type Output = Self; #[inline(always)] fn add(self, o: Self) -> Self::Output { Self { x: self.x + o.x, y: self.y + o.y, z: self.z + o.z, } } } impl Sub for V3D { type Output = Self; #[inline(always)] fn sub(self, o: Self) -> Self::Output { Self { x: self.x - o.x, y: self.y - o.y, z: self.z - o.z, } } } impl Mul for V3D { type Output = Self; fn mul(self, o: Self) -> Self::Output { Self { x: self.x * o.x, y: self.y * o.y, z: self.z * o.z, } } } impl Mul for V3D { type Output = Self; #[inline(always)] fn mul(self, o: f32) -> Self::Output { Self { x: self.x * o, y: self.y * o, z: self.z * o, } } } impl Mul for f32 { type Output = V3D; #[inline(always)] fn mul(self, o: V3D) -> Self::Output { o * self } } impl Mul for M3x3 { type Output = V3D; #[inline(always)] fn mul(self, o: V3D) -> Self::Output { V3D { x: o.dot(V3D { x: self[0].x, y: self[1].x, z: self[2].x, }), y: o.dot(V3D { x: self[0].y, y: self[1].y, z: self[2].y, }), z: o.dot(V3D { x: self[0].z, y: self[1].z, z: self[2].z, }), } } } /// Vector dot product pub trait Dot { type Output; fn dot(self, _: O) -> Self::Output; } impl Dot for V3D { type Output = f32; #[inline(always)] fn dot(self, o: Self) -> Self::Output { self.x * o.x + self.y * o.y + self.z * o.z } } ================================================ FILE: examples/aobench/src/geometry/vecxN.rs ================================================ //! 
A simple vector type use std::ops::*; use crate::geometry::{f32xN, m32xN, Dot, M3x3, V3D}; #[derive(Copy, Clone, Debug)] pub struct V3DxN { pub x: f32xN, pub y: f32xN, pub z: f32xN, } impl Default for V3DxN { #[inline(always)] #[must_use] fn default() -> Self { Self { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), } } } impl V3DxN { #[inline(always)] #[must_use] pub fn normalized(self) -> Self { let len2 = self.dot(self); let invlen = len2.rsqrte(); invlen * self } pub fn get(&self, idx: usize) -> V3D { V3D { x: self.x.extract(idx), y: self.y.extract(idx), z: self.z.extract(idx), } } #[must_use] #[inline(always)] pub fn ortho_basis(self) -> [Self; 3] { let n = self; let mut basis = [Self::default(), Self::default(), n]; let max = f32xN::splat(0.6); let min = f32xN::splat(-0.6); let one = f32xN::splat(1.0); let mx = n.x.lt(max) & n.x.gt(min); let my = n.y.lt(max) & n.y.gt(min); let mz = n.z.lt(max) & n.z.gt(min); basis[1].x = (mx | (!mx & !my & !mz)).select(one, basis[1].x); basis[1].y = (!mx & my).select(one, basis[1].y); basis[1].z = (!mx & !my & mz).select(one, basis[1].z); basis[0] = basis[1].cross(basis[2]).normalized(); basis[1] = basis[2].cross(basis[0]).normalized(); basis } #[inline(always)] #[must_use] pub fn cross(self, o: Self) -> Self { Self { x: self.y * o.z - self.z * o.y, y: self.z * o.x - self.x * o.z, z: self.x * o.y - self.y * o.x, } } } impl Add for V3DxN { type Output = Self; #[inline(always)] fn add(self, o: Self) -> Self::Output { Self { x: self.x + o.x, y: self.y + o.y, z: self.z + o.z, } } } impl Mul for V3DxN { type Output = Self; #[inline(always)] fn mul(self, o: Self) -> Self::Output { Self { x: self.x * o.x, y: self.y * o.y, z: self.z * o.z, } } } impl Mul for f32xN { type Output = V3DxN; #[inline(always)] fn mul(self, o: V3DxN) -> Self::Output { V3DxN { x: self * o.x, y: self * o.y, z: self * o.z, } } } impl Mul for [V3DxN; 3] { type Output = V3DxN; #[inline(always)] fn mul(self, o: V3DxN) -> Self::Output { V3DxN { x: o.dot(V3DxN { x: self[0].x, y: self[1].x, z: self[2].x, }), y: o.dot(V3DxN { x: self[0].y, y: self[1].y, z: self[2].y, }), z: o.dot(V3DxN { x: self[0].z, y: self[1].z, z: self[2].z, }), } } } impl Sub for V3DxN { type Output = Self; #[inline(always)] fn sub(self, o: V3D) -> Self::Output { Self { x: self.x - f32xN::splat(o.x), y: self.y - f32xN::splat(o.y), z: self.z - f32xN::splat(o.z), } } } impl Dot for V3DxN { type Output = f32xN; #[inline(always)] fn dot(self, o: Self) -> Self::Output { self.x.mul_adde(o.x, self.y.mul_adde(o.y, self.z * o.z)) } } impl Dot for V3DxN { type Output = f32xN; #[inline(always)] fn dot(self, o: V3D) -> Self::Output { self.x.mul_adde( f32xN::splat(o.x), self.y.mul_adde(f32xN::splat(o.y), self.z * o.z), ) } } pub trait Selectable { type Output; fn sel(self, a: O, b: P) -> Self::Output; } impl Selectable for m32xN { type Output = f32xN; #[inline(always)] fn sel(self, a: f32xN, b: f32xN) -> f32xN { self.select(a, b) } } impl Selectable for m32xN { type Output = V3DxN; #[inline(always)] fn sel(self, a: V3DxN, b: V3DxN) -> V3DxN { V3DxN { x: self.select(a.x, b.x), y: self.select(a.y, b.y), z: self.select(a.z, b.z), } } } impl Selectable for m32xN { type Output = V3DxN; #[inline(always)] fn sel(self, a: V3D, b: V3DxN) -> V3DxN { V3DxN { x: self.select(f32xN::splat(a.x), b.x), y: self.select(f32xN::splat(a.y), b.y), z: self.select(f32xN::splat(a.z), b.z), } } } impl Mul for M3x3 { type Output = V3DxN; #[inline(always)] fn mul(self, o: V3DxN) -> Self::Output { V3DxN { x: o.x.mul_adde( f32xN::splat(self[0].x), 
o.y.mul_adde( f32xN::splat(self[1].x), o.z * f32xN::splat(self[2].x), ), ), y: o.x.mul_adde( f32xN::splat(self[0].y), o.y.mul_adde( f32xN::splat(self[1].y), o.z * f32xN::splat(self[2].y), ), ), z: o.x.mul_adde( f32xN::splat(self[0].z), o.y.mul_adde( f32xN::splat(self[1].z), o.z * f32xN::splat(self[2].z), ), ), } } } ================================================ FILE: examples/aobench/src/image.rs ================================================ //! Image utilities use failure::Error; #[allow(unused)] use png::{BitDepth, ColorType, Encoder}; use std::path::Path; /// PNG image in RGB format pub struct Image { width: usize, height: usize, data: Vec, pub fdata: Vec, } impl Image { pub fn new(width: usize, height: usize) -> Self { Self { width, height, data: vec![0_u8; width * height * 3 /* RGBA */], fdata: vec![0_f32; width * height * 3 /* RGBA */], } } /// Image's `(width, height)` pub fn size(&self) -> (usize, usize) { (self.width, self.height) } /// Writes the pixels into a png image at `output`. /// /// `soa` specifies whether the bytes in `fdata` are in a Struct of Arrays (rrr...ggg...bbb...) /// or Array of Structs (rgbrgbrgb...) format. pub fn write_png( &mut self, output: &Path, soa: bool, ) -> Result<(), Error> { fn clamp(x: f32) -> u8 { let mut i = (x * 255.5) as isize; if i < 0 { i = 0 }; if i > 255 { i = 255 }; i as u8 } use std::fs::File; use std::io::BufWriter; let file = File::create(output)?; let buf_writer = &mut BufWriter::new(file); let mut encoder = Encoder::new( buf_writer, self.width as u32, self.height as u32, ); encoder.set_color(ColorType::RGB); encoder.set_depth(BitDepth::Eight); let mut writer = encoder.write_header().unwrap(); if soa { let len = (self.width * self.height) as usize; let (r, tail) = self.fdata.split_at(len); let (g, b) = tail.split_at(len); assert!(r.len() == len); assert!(g.len() == len); assert!(b.len() == len); for i in 0..len { self.data[3 * i + 0] = clamp(r[i]); self.data[3 * i + 1] = clamp(g[i]); self.data[3 * i + 2] = clamp(b[i]); } } else { for (&fp, up) in self.fdata.iter().zip(self.data.iter_mut()) { (*up) = clamp(fp); } } writer.write_image_data(&self.data)?; Ok(()) } } ================================================ FILE: examples/aobench/src/intersection/mod.rs ================================================ //! Intersection functions /// Intersection of `I` with `Self` pub trait Intersect { type Isect; fn intersect(&self, other: &I, isect: Self::Isect) -> Self::Isect; } mod packet; mod ray_plane; mod ray_sphere; mod single; pub use self::packet::IsectxN; pub use self::single::Isect; ================================================ FILE: examples/aobench/src/intersection/packet.rs ================================================ //! SIMD intersection result use crate::geometry::{f32xN, m32xN, V3DxN}; use crate::intersection::Isect; /// Intersection result #[derive(Copy, Clone, Debug)] pub struct IsectxN { pub t: f32xN, pub p: V3DxN, pub n: V3DxN, pub hit: m32xN, } impl Default for IsectxN { #[inline] fn default() -> Self { Self { t: f32xN::splat(1e17), hit: m32xN::splat(false), p: V3DxN::default(), n: V3DxN::default(), } } } impl IsectxN { pub fn get(&self, idx: usize) -> Isect { Isect { t: self.t.extract(idx), p: self.p.get(idx), n: self.n.get(idx), hit: self.hit.extract(idx), } } } ================================================ FILE: examples/aobench/src/intersection/ray_plane.rs ================================================ //! 
Intersection of a ray with a plane use crate::geometry::{f32xN, Dot, Plane, Ray, RayxN, Selectable}; use crate::intersection::{Intersect, Isect, IsectxN}; // Scalar ray-plane intersection impl Intersect for Ray { type Isect = Isect; #[inline(always)] fn intersect(&self, plane: &Plane, mut isect: Isect) -> Isect { let ray = self; let d = -plane.p.dot(plane.n); let v = ray.dir.dot(plane.n); if v.abs() < 1e-17 { return isect; } let t = -(ray.origin.dot(plane.n) + d) / v; if t > 0. && t < isect.t { isect.t = t; isect.hit = true; isect.p = ray.origin + t * ray.dir; isect.n = plane.n; } isect } } // Vector ray-plane intersection for a packet of rays impl Intersect for RayxN { type Isect = IsectxN; #[inline(always)] fn intersect(&self, plane: &Plane, mut isect: IsectxN) -> IsectxN { let ray = self; let d = -plane.p.dot(plane.n); let v = ray.dir.dot(plane.n); let _old_isect = isect; let m = v.abs().ge(f32xN::splat(1e-17)); if m.any() { let t = m.sel(-(ray.origin.dot(plane.n) + d) / v, isect.t); let m = m & t.gt(f32xN::splat(0.)) & t.lt(isect.t); if m.any() { isect.t = m.sel(t, isect.t); isect.hit |= m; isect.p = m.sel(ray.origin + t * ray.dir, isect.p); isect.n = m.sel(plane.n, isect.n); } } #[cfg(debug_assertions)] { // Check that the vector and the scalar version produce the same results // for the same inputs in debug builds for i in 0..f32xN::lanes() { let old_isect_i = _old_isect.get(i); let ray_i = self.get(i); let isect_i = ray_i.intersect(plane, old_isect_i); assert!(isect_i.almost_eq(&isect.get(i)), "{:?} !~= {:?}\n\nplane: {:?}\n\nold_isect: {:?}\n\nrays: {:?}\n\ni: {:?}\nold_isect_i: {:?}\nray_i: {:?}\n\n", isect_i, isect.get(i), plane, _old_isect, self, i, old_isect_i, ray_i); } } isect } } #[cfg(test)] mod tests { use super::*; use crate::geometry::{m32xN, V3DxN, V3D}; #[test] fn sanity() { let plane = Plane { p: V3D { x: 0., y: 0., z: -10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let ray_hit = Ray { origin: V3D::default(), dir: V3D { x: 0.01, y: 0.01, z: -1., }, }; let ray_miss = Ray { origin: V3D::default(), dir: V3D { x: 0., y: 0., z: 1., }, }; let isect_hit = ray_hit.intersect(&plane, Isect::default()); assert!(isect_hit.hit); let isect_miss = ray_miss.intersect(&plane, Isect::default()); assert!(!isect_miss.hit); // hit, miss, hit, miss #[cfg(feature = "256bit")] let z_val = f32xN::new(-1., 1., -1., 1., -1., 1., -1., 1.); #[cfg(not(feature = "256bit"))] let z_val = f32xN::new(-1., 1., -1., 1.); let rays = RayxN { origin: V3DxN::default(), dir: V3DxN { x: f32xN::splat(0.01), y: f32xN::splat(0.01), z: z_val, }, }; let isectxN = rays.intersect(&plane, IsectxN::default()); #[cfg(feature = "256bit")] let expected = m32xN::new(true, false, true, false, true, false, true, false); #[cfg(not(feature = "256bit"))] let expected = m32xN::new(true, false, true, false); assert_eq!(isectxN.hit, expected); assert_eq!(isect_hit.t, isectxN.t.extract(0)); assert_eq!(isect_hit.t, isectxN.t.extract(2)); assert_eq!(isect_miss.t, isectxN.t.extract(1)); assert_eq!(isect_miss.t, isectxN.t.extract(3)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(0)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(0)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(0)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(2)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(2)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(2)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(1)); assert_eq!(isect_miss.p.y, isectxN.p.y.extract(1)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(1)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(3)); 
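// The remaining checks below are a sketch of the invariant this test relies on:
// lanes 1 and 3 hold rays that miss the plane, so their intersection point and
// normal must still be the `IsectxN::default()` values, i.e. identical to the
// untouched scalar miss result.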
assert_eq!(isect_miss.p.y, isectxN.p.y.extract(3)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(3)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(0)); assert_eq!(isect_hit.n.y, isectxN.n.y.extract(0)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(0)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(2)); assert_eq!(isect_hit.n.y, isectxN.n.y.extract(2)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(2)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(1)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(1)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(1)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(3)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(3)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(3)); } #[test] fn bug() { let plane = Plane { p: V3D { x: 0., y: -0.5, z: 0., }, n: V3D { x: 0., y: 1., z: 0., }, }; let isect = IsectxN { t: f32xN::splat(2.1931846), p: V3DxN { x: f32xN::splat(-0.2608384), y: f32xN::splat(-0.28958648), z: f32xN::splat(-2.6699374), }, n: V3DxN { x: f32xN::splat(0.47832328), y: f32xN::splat(-0.579173), z: f32xN::splat(0.6601253), }, hit: m32xN::splat(true), }; let rays = RayxN { origin: V3DxN { x: f32xN::splat(-0.5), y: f32xN::splat(-0.4999), z: f32xN::splat(-0.5), }, dir: V3DxN { x: f32xN::splat(0.10904764), y: f32xN::splat(0.095894136), z: f32xN::splat(-0.98940027), }, }; let r = rays.intersect(&plane, isect); assert_eq!(r.hit, m32xN::splat(true)); } } ================================================ FILE: examples/aobench/src/intersection/ray_sphere.rs ================================================ //! Intersection of a ray with a sphere. use crate::geometry::{f32xN, Dot, Ray, RayxN, Selectable, Sphere}; use crate::intersection::{Intersect, Isect, IsectxN}; // Scalar ray-sphere intersection impl Intersect for Ray { type Isect = Isect; #[inline(always)] fn intersect(&self, sphere: &Sphere, mut isect: Isect) -> Isect { let ray = self; let rs = ray.origin - sphere.center; let b = rs.dot(ray.dir); let c = rs.dot(rs) - sphere.radius * sphere.radius; let d = b * b - c; if d > 0. { let t = -b - d.sqrt(); if t > 0. 
&& t < isect.t { isect.t = t; isect.hit = true; isect.p = ray.origin + t * ray.dir; isect.n = (isect.p - sphere.center).normalized(); } } isect } } // Vector ray-sphere intersection for a packet of rays impl Intersect for RayxN { type Isect = IsectxN; #[inline(always)] fn intersect(&self, sphere: &Sphere, mut isect: IsectxN) -> IsectxN { let ray = self; let rs = ray.origin - sphere.center; let b = rs.dot(ray.dir); let radius = f32xN::splat(sphere.radius); let c = radius.mul_adde(-radius, rs.dot(rs)); let d = b.mul_adde(b, -c); let _old_isect = isect; let m = d.gt(f32xN::splat(0.)); if m.any() { let t = m.sel(-b - d.sqrt(), isect.t); let m = m & t.gt(f32xN::splat(0.)) & t.lt(isect.t); if m.any() { isect.t = m.sel(t, isect.t); isect.hit |= m; isect.p = m.sel(ray.origin + t * ray.dir, isect.p); isect.n = m.sel((isect.p - sphere.center).normalized(), isect.n); } } #[cfg(debug_assertions)] { // Check that the vector and the scalar version produce the same results // for the same inputs in debug builds for i in 0..f32xN::lanes() { let old_isect_i = _old_isect.get(i); let ray_i = self.get(i); let isect_i = ray_i.intersect(sphere, old_isect_i); assert!(isect_i.almost_eq(&isect.get(i)), "{:?} !~= {:?}\n\nsphere: {:?}\n\nold_isect: {:?}\n\nrays: {:?}\n\ni: {:?}\nold_isect_i: {:?}\nray_i: {:?}\n\n", isect_i, isect.get(i), sphere, _old_isect, self, i, old_isect_i, ray_i); } } isect } } #[cfg(test)] mod tests { use super::*; use crate::geometry::{m32xN, V3DxN, V3D}; #[test] fn sanity() { let sphere = Sphere { center: V3D { x: 0., y: 0., z: -10., }, radius: 1., }; let ray_hit = Ray { origin: V3D::default(), dir: V3D { x: 0.01, y: 0.01, z: -1., }, }; let ray_miss = Ray { origin: V3D::default(), dir: V3D { x: 0., y: 0., z: 1., }, }; let isect_hit = ray_hit.intersect(&sphere, Isect::default()); assert!(isect_hit.hit); let isect_miss = ray_miss.intersect(&sphere, Isect::default()); assert!(!isect_miss.hit); // hit, miss, hit, miss #[cfg(feature = "256bit")] let z_val = f32xN::new(-1., 1., -1., 1., -1., 1., -1., 1.); #[cfg(not(feature = "256bit"))] let z_val = f32xN::new(-1., 1., -1., 1.); let rays = RayxN { origin: V3DxN::default(), dir: V3DxN { x: f32xN::splat(0.01), y: f32xN::splat(0.01), z: z_val, }, }; let isectxN = rays.intersect(&sphere, IsectxN::default()); #[cfg(feature = "256bit")] let expected = m32xN::new(true, false, true, false, true, false, true, false); #[cfg(not(feature = "256bit"))] let expected = m32xN::new(true, false, true, false); assert_eq!(isectxN.hit, expected); assert_eq!(isect_hit.t, isectxN.t.extract(0)); assert_eq!(isect_hit.t, isectxN.t.extract(2)); assert_eq!(isect_miss.t, isectxN.t.extract(1)); assert_eq!(isect_miss.t, isectxN.t.extract(3)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(0)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(0)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(0)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(2)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(2)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(2)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(1)); assert_eq!(isect_miss.p.y, isectxN.p.y.extract(1)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(1)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(3)); assert_eq!(isect_miss.p.y, isectxN.p.y.extract(3)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(3)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(0)); assert_eq!(isect_hit.n.y, isectxN.n.y.extract(0)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(0)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(2)); assert_eq!(isect_hit.n.y, 
isectxN.n.y.extract(2)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(2)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(1)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(1)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(1)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(3)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(3)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(3)); } } ================================================ FILE: examples/aobench/src/intersection/single.rs ================================================ //! Scalar intersection result use crate::geometry::V3D; /// Intersection result #[derive(Copy, Clone, Debug)] pub struct Isect { pub t: f32, pub p: V3D, pub n: V3D, pub hit: bool, } impl Default for Isect { #[inline] fn default() -> Self { Self { t: 1e17, hit: false, p: V3D::default(), n: V3D::default(), } } } impl Isect { #[inline(always)] #[must_use] pub fn almost_eq(&self, rhs: &Self) -> bool { const EPSILON: f32 = 1E-3; (self.t - rhs.t).abs() < EPSILON && self.p.almost_eq(&rhs.p) && self.n.almost_eq(&rhs.n) && self.hit == rhs.hit } } ================================================ FILE: examples/aobench/src/ispc_.rs ================================================ //! Includes the ISPC implementations. use crate::*; use ispc::*; ispc_module!(aobench); pub fn ao( _scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); unsafe { self::aobench::ao_ispc( w as i32, h as i32, nsubsamples as i32, img.fdata.as_mut_ptr(), ) } } pub fn ao_tasks( _scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); unsafe { self::aobench::ao_ispc_tasks( w as i32, h as i32, nsubsamples as i32, img.fdata.as_mut_ptr(), ) } } ================================================ FILE: examples/aobench/src/lib.rs ================================================ //! aobench: Ambient Occlusion Renderer benchmark. //! //! Based on [aobench](https://code.google.com/archive/p/aobench/) by Syoyo //! Fujita. // FIXME: Null pointer deref warning triggered in this example, // likely inside a macro expansion deriving from packed_simd. #![deny(rust_2018_idioms)] #![allow(non_snake_case, non_camel_case_types)] #![allow( clippy::many_single_char_names, clippy::similar_names, clippy::cast_precision_loss, clippy::inline_always, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::identity_op, clippy::erasing_op, clippy::must_use_candidate, clippy::float_cmp )] pub mod ambient_occlusion; pub mod geometry; pub mod image; pub mod intersection; pub mod random; pub mod scene; #[cfg(feature = "ispc")] pub mod ispc_; pub mod scalar; pub mod scalar_parallel; pub mod tiled; pub mod tiled_parallel; pub mod vector; pub mod vector_parallel; pub use self::image::Image; pub use self::scene::Scene; ================================================ FILE: examples/aobench/src/main.rs ================================================ //! aobench: Ambient Occlusion Renderer benchmark. //! //! Based on [aobench](https://code.google.com/archive/p/aobench/) by Syoyo //! Fujita. #![deny(rust_2018_idioms)] use aobench_lib::*; use std::path::PathBuf; use structopt::StructOpt; /// Command-line arguments. #[derive(StructOpt, Debug)] struct Opt { /// Image width. width: usize, /// Image height. height: usize, /// Algorithm #[structopt(short = "a", long = "algo")] algo: String, /// Output file. 
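    /// Defaults to `image_<algorithm>.png` in the current directory when not specified.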
#[structopt(short = "o", long = "output", parse(from_os_str))] output: Option, } const ALGORITHMS: &[&str] = &[ "scalar", "scalar_par", "vector", "vector_par", "tiled", "tiled_par", "ispc", "ispc_tasks", ]; fn main() { let opt = Opt::from_args(); let mut scene = aobench_lib::scene::Random::default(); let mut img = Image::new(opt.width, opt.height); let algorithm_name = opt.algo.as_str(); if let Some(algorithm) = ALGORITHMS.iter().find(|&&a| a == algorithm_name) { let d = time::Duration::span(|| match *algorithm { "scalar" => scalar::ao(&mut scene, 2, &mut img), "scalar_par" => scalar_parallel::ao(&mut scene, 2, &mut img), "vector" => vector::ao(&mut scene, 2, &mut img), "vector_par" => vector_parallel::ao(&mut scene, 2, &mut img), "tiled" => tiled::ao(&mut scene, 2, &mut img), "tiled_par" => tiled_parallel::ao(&mut scene, 2, &mut img), "ispc" => { #[cfg(feature = "ispc")] { ispc_::ao(&mut scene, 2, &mut img) } #[cfg(not(feature = "ispc"))] { panic!("the `ispc` algorithm requires building with --features=ispc"); } } "ispc_tasks" => { #[cfg(feature = "ispc")] { ispc_::ao_tasks(&mut scene, 2, &mut img) } #[cfg(not(feature = "ispc"))] { panic!("the `ispc_task` algorithm requires building with --features=ispc"); } } _ => unreachable!(), }); let image_path = opt.output.unwrap_or_else(|| { PathBuf::from(format!("image_{}.png", algorithm)) }); img.write_png(&image_path, false) .expect("failed to write image"); println!("time: {} ms", d.num_milliseconds()); } else { let mut error = format!( "unknown algorithm: \"{}\"\nAvailable algorithms:", algorithm_name ); for a in ALGORITHMS { error.push_str(&format!("\n- {}", a)); } panic!("{}", error); } } ================================================ FILE: examples/aobench/src/random.rs ================================================ //! Pseudo random number generators. //! //! Currently only `LFSR113` is implemented, since that is what ISPC uses, and it //! allows us to compare Rust's codegen with that of ISPC for the same //! algorithms. //! //! Use `{scalar,vector}::thread_rng()` to get a handle to the thread-local //! random number generator, and call `.gen()` to generate an `f32` or an //! `f32xN`. /// Scalar pseudo random number generator pub mod scalar { use std::cell::UnsafeCell; use std::rc::Rc; // Note: This implementation could be vectorized using an `u32x4`. struct RngT(u32, u32, u32, u32); impl RngT { fn from_seed(x: u32) -> Self { let z0 = x; let z1 = x ^ 0xbeef_f00d; let z2 = ((x & 0xffff_u32) << 16) | (x >> 16); let z3 = ((x & 0xff_u32) << 24) | ((x & 0xff00_u32) << 8) | ((x & 0x00ff_0000_u32) >> 8) | (x & 0xff00_0000_u32) >> 24; Self(z0, z1, z2, z3) } pub fn gen_u32(&mut self) -> u32 { let mut b = ((self.0 << 6) ^ self.0) >> 13; self.0 = ((self.0 & 4_294_967_294_u32) << 18) ^ b; b = ((self.1 << 2) ^ self.1) >> 27; self.1 = ((self.1 & 4_294_967_288_u32) << 2) ^ b; b = ((self.2 << 13) ^ self.2) >> 21; self.2 = ((self.2 & 4_294_967_280_u32) << 7) ^ b; b = ((self.3 << 3) ^ self.3) >> 12; self.3 = ((self.3 & 4_294_967_168_u32) << 13) ^ b; self.0 ^ self.1 ^ self.2 ^ self.3 } pub fn gen(&mut self) -> f32 { let mut v = self.gen_u32(); v &= (1_u32 << 23) - 1; let v = f32::from_bits(0x3F80_0000 | v); v - 1. 
} } #[derive(Clone)] pub struct RngH { rng: Rc>, } impl RngH { pub fn gen(&mut self) -> f32 { unsafe { (*self.rng.get()).gen() } } } thread_local!( static THREAD_RNG_KEY: Rc> = { Rc::new(UnsafeCell::new(RngT::from_seed(1))) } ); pub fn thread_rng() -> RngH { RngH { rng: THREAD_RNG_KEY.with(Clone::clone), } } } /// Vector pseudo random number generator pub mod vector { use crate::geometry::{f32xN, u32xN, IncrV}; use std::cell::UnsafeCell; use std::rc::Rc; struct RngT(u32xN, u32xN, u32xN, u32xN); impl RngT { fn from_seed(x: u32xN) -> Self { let z0 = x; let z1 = x ^ u32xN::splat(0xbeef_f00d); let z2 = ((x & u32xN::splat(0xffff)) << 16) | (x >> 16); let z3 = ((x & u32xN::splat(0xff)) << 24) | ((x & u32xN::splat(0xff00)) << 8) | ((x & u32xN::splat(0x00ff_0000)) >> 8) | (x & u32xN::splat(0xff00_0000)) >> 24; Self(z0, z1, z2, z3) } #[inline(always)] pub fn gen_u32(&mut self) -> u32xN { let mut b = ((self.0 << 6) ^ self.0) >> 13; self.0 = ((self.0 & u32xN::splat(4_294_967_294)) << 18) ^ b; b = ((self.1 << 2) ^ self.1) >> 27; self.1 = ((self.1 & u32xN::splat(4_294_967_288)) << 2) ^ b; b = ((self.2 << 13) ^ self.2) >> 21; self.2 = ((self.2 & u32xN::splat(4_294_967_280)) << 7) ^ b; b = ((self.3 << 3) ^ self.3) >> 12; self.3 = ((self.3 & u32xN::splat(4_294_967_168)) << 13) ^ b; self.0 ^ self.1 ^ self.2 ^ self.3 } #[inline(always)] pub fn gen(&mut self) -> f32xN { let mut v = self.gen_u32(); v &= u32xN::splat((1_u32 << 23) - 1); let v: f32xN = unsafe { std::mem::transmute(u32xN::splat(0x3F80_0000) | v) }; v - f32xN::splat(1.) } } #[derive(Clone)] pub struct RngH { rng: Rc>, } impl RngH { #[inline(always)] pub fn gen(&mut self) -> f32xN { unsafe { (*self.rng.get()).gen() } } } thread_local!( static THREAD_RNG_KEY: Rc> = { Rc::new(UnsafeCell::new(RngT::from_seed(::incr(0, 1)))) } ); pub fn thread_rng() -> RngH { RngH { rng: THREAD_RNG_KEY.with(Clone::clone), } } } ================================================ FILE: examples/aobench/src/scalar.rs ================================================ //! Scalar serial aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; pub fn ao( scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); let image = &mut img.fdata; let ns = nsubsamples; for y in 0..h { for x in 0..w { let offset = 3 * (y * w + x); for u in 0..ns { for v in 0..ns { let (x, y, u, v, h, w, ns) = ( x as f32, y as f32, u as f32, v as f32, h as f32, w as f32, ns as f32, ); let dir: V3D = V3D { x: (x + u / ns - w / 2.) / (w / 2.) * w / h, y: -(y + v / ns - h / 2.) / (h / 2.), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::scalar(scene, &isect) } else { 0. }; // Update image for AO for this ray image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } // Normalize image pixels by number of samples taken per pixel let ns = (ns * ns) as f32; image[offset + 0] /= ns; image[offset + 1] /= ns; image[offset + 2] /= ns; } } } ================================================ FILE: examples/aobench/src/scalar_parallel.rs ================================================ //! 
Scalar parallel aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; use rayon::prelude::*; pub fn ao(_: &mut S, nsubsamples: usize, img: &mut crate::Image) { let (w, h) = img.size(); let ns = nsubsamples; img.fdata .par_chunks_mut(3 * w) .enumerate() .for_each(|(y, image)| { assert!(image.len() == 3 * w); let mut scene = S::default(); for x in 0..w { let offset = 3 * x; for u in 0..ns { for v in 0..ns { let (x, y, u, v, h, w, ns) = ( x as f32, y as f32, u as f32, v as f32, h as f32, w as f32, ns as f32, ); let dir: V3D = V3D { x: (x + u / ns - w / 2.) / (w / 2.) * w / h, y: -(y + v / ns - h / 2.) / (h / 2.), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::scalar(&mut scene, &isect) } else { 0. }; // Update image for AO for this ray image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } // Normalize image pixels by number of samples taken per pixel let ns = (ns * ns) as f32; image[offset + 0] /= ns; image[offset + 1] /= ns; image[offset + 2] /= ns; } }); } ================================================ FILE: examples/aobench/src/scene/mod.rs ================================================ /// Scene interface use crate::geometry::{f32xN, Plane, Sphere}; pub trait Scene: Send + Sync + Default { const NAO_SAMPLES: usize; fn rand(&mut self) -> f32; fn plane(&self) -> &Plane; fn spheres(&self) -> &[Sphere]; fn rand_f32xN(&mut self) -> (f32xN, f32xN) { #[cfg(feature = "256bit")] { let r = [ self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), ]; ( f32xN::new(r[0], r[2], r[4], r[6], r[8], r[10], r[12], r[14]), f32xN::new(r[1], r[3], r[5], r[7], r[9], r[11], r[13], r[15]), ) } #[cfg(not(feature = "256bit"))] { let r = [ self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), ]; ( f32xN::new(r[0], r[2], r[4], r[6]), f32xN::new(r[1], r[3], r[5], r[7]), ) } } } mod random; pub use self::random::Random; mod test; pub use self::test::Test; ================================================ FILE: examples/aobench/src/scene/random.rs ================================================ //! 
Aobench scene: 3 spheres and a plane using a random number generator use crate::geometry::{f32xN, Plane, Sphere, V3D}; use crate::scene::Scene; #[derive(Clone)] pub struct Random { pub plane: Plane, pub spheres: [Sphere; 3], } impl Default for Random { fn default() -> Self { let plane = Plane { p: V3D { x: 0., y: -0.5, z: 0., }, n: V3D { x: 0., y: 1., z: 0., }, }; let spheres = [ Sphere { center: V3D { x: -2., y: 0., z: -3.5, }, radius: 0.5, }, Sphere { center: V3D { x: -0.5, y: 0., z: -3., }, radius: 0.5, }, Sphere { center: V3D { x: 1., y: 0., z: -2.2, }, radius: 0.5, }, ]; Self { plane, spheres } } } impl Scene for Random { const NAO_SAMPLES: usize = 8; #[inline(always)] fn rand(&mut self) -> f32 { crate::random::scalar::thread_rng().gen() } #[inline(always)] fn plane(&self) -> &Plane { &self.plane } #[inline(always)] fn spheres(&self) -> &[Sphere] { &self.spheres } #[inline(always)] fn rand_f32xN(&mut self) -> (f32xN, f32xN) { let mut rng = crate::random::vector::thread_rng(); (rng.gen(), rng.gen()) } } ================================================ FILE: examples/aobench/src/scene/test.rs ================================================ //! Aobench scene: 3 spheres and a plane using a random number generator use crate::geometry::{Plane, Sphere, V3D}; use crate::scene::Scene; use std::num::Wrapping; #[derive(Clone)] pub struct Test { pub plane: Plane, pub spheres: [Sphere; 3], rands: Vec, rand_step: Wrapping, } impl Default for Test { fn default() -> Self { let plane = Plane { p: V3D { x: 0., y: -0.5, z: 0., }, n: V3D { x: 0., y: 1., z: 0., }, }; let spheres = [ Sphere { center: V3D { x: -2., y: 0., z: -3.5, }, radius: 0.5, }, Sphere { center: V3D { x: -0.5, y: 0., z: -3., }, radius: 0.5, }, Sphere { center: V3D { x: 1., y: 0., z: -2.2, }, radius: 0.5, }, ]; let mut rands = Vec::new(); let mut rng = crate::random::scalar::thread_rng(); for _ in 0..2 * Self::NAO_SAMPLES * Self::NAO_SAMPLES { rands.push(rng.gen()); } let rand_step = Wrapping(0); Self { plane, spheres, rands, rand_step, } } } impl Scene for Test { const NAO_SAMPLES: usize = 8; fn rand(&mut self) -> f32 { let v = self.rands[self.rand_step.0]; self.rand_step += Wrapping(1); if self.rand_step >= Wrapping(2 * Self::NAO_SAMPLES * Self::NAO_SAMPLES) { self.rand_step = Wrapping(0); } v } fn plane(&self) -> &Plane { &self.plane } fn spheres(&self) -> &[Sphere] { &self.spheres } } ================================================ FILE: examples/aobench/src/tiled.rs ================================================ //! SIMD serial aobench use crate::ambient_occlusion; use crate::geometry::{f32xN, pf32xN, usizexN, IncrV, RayxN, V3DxN}; use crate::intersection::{Intersect, IsectxN}; use crate::scene::Scene; use cfg_if::cfg_if; #[inline(always)] fn ao_impl( scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); assert_eq!(w % f32xN::lanes(), 0); let image = &mut img.fdata; let ns = nsubsamples; let inv_ns = 1. 
/ (ns as f32); let ptr = pf32xN::splat(image.as_mut_ptr()); for y in 0..h { let yf = f32xN::splat(y as f32); for x in (0..w).step_by(f32xN::lanes()) { let xf = f32xN::incr(x as f32, 1.); let offset = usizexN::splat(3 * (y * w + x)); let r_ptr = unsafe { ptr.add(offset + usizexN::incr(0, 3)) }; let g_ptr = unsafe { ptr.add(offset + usizexN::incr(1, 3)) }; let b_ptr = unsafe { ptr.add(offset + usizexN::incr(2, 3)) }; for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (hf, wf) = (h as f32, w as f32); let dir = V3DxN { x: (xf + f32xN::splat(du - (wf / 2.))) / f32xN::splat((wf / 2.) * hf / wf), y: -(yf + f32xN::splat(dv - (hf / 2.))) / f32xN::splat(hf / 2.), z: f32xN::splat(-1.), }; let dir = dir.normalized(); let ray = RayxN { origin: V3DxN::default(), dir, }; let mut isect = IsectxN::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); if isect.hit.any() { let ret = ambient_occlusion::vector_tiled(scene, &isect) * f32xN::splat(inv_ns * inv_ns); unsafe { let img_r = r_ptr.read(isect.hit, f32xN::splat(0.)); let img_g = g_ptr.read(isect.hit, f32xN::splat(0.)); let img_b = b_ptr.read(isect.hit, f32xN::splat(0.)); r_ptr.write(isect.hit, img_r + ret); g_ptr.write(isect.hit, img_g + ret); b_ptr.write(isect.hit, img_b + ret); } } } } } } } cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[target_feature(enable = "sse4.2")] unsafe fn ao_sse42(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx")] unsafe fn ao_avx(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx,fma")] unsafe fn ao_avx_fma(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx2,fma")] unsafe fn ao_avx2(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { unsafe { if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") { ao_avx2(scene, nsubsamples, img); } else if is_x86_feature_detected!("avx") { if is_x86_feature_detected!("fma") { ao_avx_fma(scene, nsubsamples, img); } else { ao_avx(scene, nsubsamples, img); } } else if is_x86_feature_detected!("sse4.2") { ao_sse42(scene, nsubsamples, img); } else { ao_impl(scene, nsubsamples, img); } } } } else { pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } } } ================================================ FILE: examples/aobench/src/tiled_parallel.rs ================================================ //! SIMD tiled parallel aobench use crate::ambient_occlusion; use crate::geometry::{f32xN, pf32xN, usizexN, IncrV, RayxN, V3DxN}; use crate::intersection::{Intersect, IsectxN}; use crate::scene::Scene; use rayon::prelude::*; pub fn ao(_: &mut S, nsubsamples: usize, img: &mut crate::Image) { let (w, h) = img.size(); assert_eq!(w % f32xN::lanes(), 0); let ns = nsubsamples; let inv_ns = 1. 
/ (ns as f32); let ptr = usizexN::splat(img.fdata.as_mut_ptr() as usize); img.fdata .par_chunks_mut(3 * w) .enumerate() .for_each(|(y, image)| { assert!(image.len() == 3 * w); let mut scene = S::default(); let yf = f32xN::splat(y as f32); let ptr: pf32xN = unsafe { std::mem::transmute(ptr) }; for x in (0..w).step_by(f32xN::lanes()) { let xf = f32xN::incr(x as f32, 1.); let offset = usizexN::splat(3 * (y * w + x)); let r_ptr = unsafe { ptr.add(offset + usizexN::incr(0, 3)) }; let g_ptr = unsafe { ptr.add(offset + usizexN::incr(1, 3)) }; let b_ptr = unsafe { ptr.add(offset + usizexN::incr(2, 3)) }; for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (hf, wf) = (h as f32, w as f32); let dir = V3DxN { x: (xf + f32xN::splat(du - (wf / 2.))) / f32xN::splat((wf / 2.) * hf / wf), y: -(yf + f32xN::splat(dv - (hf / 2.))) / f32xN::splat(hf / 2.), z: f32xN::splat(-1.), }; let dir = dir.normalized(); let ray = RayxN { origin: V3DxN::default(), dir, }; let mut isect = IsectxN::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); if isect.hit.any() { let ret = ambient_occlusion::vector_tiled( &mut scene, &isect, ) * f32xN::splat(inv_ns * inv_ns); unsafe { let img_r = r_ptr.read(isect.hit, f32xN::splat(0.)); let img_g = g_ptr.read(isect.hit, f32xN::splat(0.)); let img_b = b_ptr.read(isect.hit, f32xN::splat(0.)); r_ptr.write(isect.hit, img_r + ret); g_ptr.write(isect.hit, img_g + ret); b_ptr.write(isect.hit, img_b + ret); } } } } } }); } ================================================ FILE: examples/aobench/src/vector.rs ================================================ //! SIMD serial aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; use cfg_if::cfg_if; #[inline(always)] fn ao_impl( scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); let image = &mut img.fdata; let ns = nsubsamples; let inv_ns = 1. / (ns as f32); for y in 0..h { for x in 0..w { let offset = 3 * (y * w + x); for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (x, y, h, w) = (x as f32, y as f32, h as f32, w as f32); let dir = V3D { x: (x + du - (w * 0.5)) / (w * 0.5) * w / h, y: -(y + dv - (h * 0.5)) / (h * 0.5), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::vector(scene, &isect) } else { 0. }; let ret = ret * inv_ns * inv_ns; // Update image for AO for this ray // (already normalized) image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } } } } cfg_if! 
{ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[target_feature(enable = "sse4.2")] unsafe fn ao_sse42(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx")] unsafe fn ao_avx(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx,fma")] unsafe fn ao_avx_fma(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx2,fma")] unsafe fn ao_avx2(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { unsafe { if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") { ao_avx2(scene, nsubsamples, img); } else if is_x86_feature_detected!("avx") { if is_x86_feature_detected!("fma") { ao_avx_fma(scene, nsubsamples, img); } else { ao_avx(scene, nsubsamples, img); } } else if is_x86_feature_detected!("sse4.2") { ao_sse42(scene, nsubsamples, img); } else { ao_impl(scene, nsubsamples, img); } } } } else { pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } } } ================================================ FILE: examples/aobench/src/vector_parallel.rs ================================================ //! SIMD parallel aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; use rayon::prelude::*; pub fn ao(_: &mut S, nsubsamples: usize, img: &mut crate::Image) { let (w, h) = img.size(); let ns = nsubsamples; let inv_ns = 1. / (ns as f32); img.fdata .par_chunks_mut(3 * w) .enumerate() .for_each(|(y, image)| { assert!(image.len() == 3 * w); let mut scene = S::default(); for x in 0..w { let offset = 3 * x; for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (x, y, h, w) = (x as f32, y as f32, h as f32, w as f32); let dir = V3D { x: (x + du - (w / 2.)) / (w / 2.) * w / h, y: -(y + dv - (h / 2.)) / (h / 2.), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::vector(&mut scene, &isect) } else { 0. }; let ret = ret * inv_ns * inv_ns; // Update image for AO for this ray // (already normalized) image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } } }); } ================================================ FILE: examples/aobench/volta/.gitignore ================================================ ao *.ppm objs/ ================================================ FILE: examples/aobench/volta/ao.ispc ================================================ // -*- mode: c++ -*- /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench */ #define NAO_SAMPLES 8 #define M_PI 3.1415926535f typedef float<3> vec; struct Isect { float t; vec p; vec n; int hit; }; struct Sphere { vec center; float radius; }; struct Plane { vec p; vec n; }; struct Ray { vec org; vec dir; }; static inline float dot(vec a, vec b) { return a.x * b.x + a.y * b.y + a.z * b.z; } static inline vec vcross(vec v0, vec v1) { vec ret; ret.x = v0.y * v1.z - v0.z * v1.y; ret.y = v0.z * v1.x - v0.x * v1.z; ret.z = v0.x * v1.y - v0.y * v1.x; return ret; } static inline void vnormalize(vec &v) { float len2 = dot(v, v); float invlen = rsqrt(len2); v *= invlen; } static void ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) { float d = -dot(plane.p, plane.n); float v = dot(ray.dir, plane.n); cif (abs(v) < 1.0e-17) return; else { float t = -(dot(ray.org, plane.n) + d) / v; cif ((t > 0.0) && (t < isect.t)) { isect.t = t; isect.hit = 1; isect.p = ray.org + ray.dir * t; isect.n = plane.n; } } } static inline void ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) { vec rs = ray.org - sphere.center; float B = dot(rs, ray.dir); float C = dot(rs, rs) - sphere.radius * sphere.radius; float D = B * B - C; cif (D > 0.) { float t = -B - sqrt(D); cif ((t > 0.0) && (t < isect.t)) { isect.t = t; isect.hit = 1; isect.p = ray.org + t * ray.dir; isect.n = isect.p - sphere.center; vnormalize(isect.n); } } } static void orthoBasis(vec basis[3], vec n) { basis[2] = n; basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; if ((n.x < 0.6) && (n.x > -0.6)) { basis[1].x = 1.0; } else if ((n.y < 0.6) && (n.y > -0.6)) { basis[1].y = 1.0; } else if ((n.z < 0.6) && (n.z > -0.6)) { basis[1].z = 1.0; } else { basis[1].x = 1.0; } basis[0] = vcross(basis[1], basis[2]); vnormalize(basis[0]); basis[1] = vcross(basis[2], basis[0]); vnormalize(basis[1]); } static float ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3], RNGState &rngstate) { float eps = 0.0001f; vec p, n; vec basis[3]; float occlusion = 0.0; p = isect.p + eps * isect.n; orthoBasis(basis, isect.n); static const uniform int ntheta = NAO_SAMPLES; static const uniform int nphi = NAO_SAMPLES; for (uniform int j = 0; j < ntheta; j++) { for (uniform int i = 0; i < nphi; i++) { Ray ray; Isect occIsect; float theta = sqrt(frandom(&rngstate)); float phi = 2.0f * M_PI * frandom(&rngstate); float x = cos(phi) * theta; float y = sin(phi) * theta; float z = sqrt(1.0 - theta * theta); // local . 
global float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; ray.org = p; ray.dir.x = rx; ray.dir.y = ry; ray.dir.z = rz; occIsect.t = 1.0e+17; occIsect.hit = 0; for (uniform int snum = 0; snum < 3; ++snum) ray_sphere_intersect(occIsect, ray, spheres[snum]); ray_plane_intersect (occIsect, ray, plane); if (occIsect.hit) occlusion += 1.0; } } occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); return occlusion; } /* Compute the image for the scanlines from [y0,y1), for an overall image of width w and height h. */ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; static uniform Sphere spheres[3] = { { { -2.0f, 0.0f, -3.5f }, 0.5f }, { { -0.5f, 0.0f, -3.0f }, 0.5f }, { { 1.0f, 0.0f, -2.2f }, 0.5f } }; RNGState rngstate; seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); float invSamples = 1.f / nsubsamples; foreach_tiled(y = y0 ... y1, x = 0 ... w, u = 0 ... nsubsamples, v = 0 ... nsubsamples) { float du = (float)u * invSamples, dv = (float)v * invSamples; // Figure out x,y pixel in NDC float px = (x + du - (w / 2.0f)) / (w / 2.0f); float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); // Scale NDC based on width/height ratio, supporting non-square image output px *= (float)w / (float)h; float ret = 0.f; Ray ray; Isect isect; ray.org = 0.f; // Poor man's perspective projection ray.dir.x = px; ray.dir.y = py; ray.dir.z = -1.0; vnormalize(ray.dir); isect.t = 1.0e+17; isect.hit = 0; for (uniform int snum = 0; snum < 3; ++snum) ray_sphere_intersect(isect, ray, spheres[snum]); ray_plane_intersect(isect, ray, plane); // Note use of 'coherent' if statement; the set of rays we // trace will often all hit or all miss the scene cif (isect.hit) { ret = ambient_occlusion(isect, plane, spheres, rngstate); ret *= invSamples * invSamples; int offset = 3 * (y * w + x); atomic_add_local(&image[offset], ret); atomic_add_local(&image[offset+1], ret); atomic_add_local(&image[offset+2], ret); } } } export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { ao_scanlines(0, h, w, h, nsubsamples, image); } static void task ao_task(uniform int width, uniform int height, uniform int nsubsamples, uniform float image[]) { ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); } export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { launch[h] ao_task(w, h, nsubsamples, image); } ================================================ FILE: examples/dot_product/Cargo.toml ================================================ [package] name = "dot_product" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [lib] name = "dot_product_lib" path = "src/lib.rs" ================================================ FILE: examples/dot_product/readme.md ================================================ # Vector dot product ================================================ FILE: examples/dot_product/src/lib.rs ================================================ //! 
Vector dot product #![deny(rust_2018_idioms)] #![feature(custom_inner_attributes)] #![allow(clippy::must_use_candidate, clippy::float_cmp)] pub mod scalar; pub mod simd; #[cfg(test)] #[rustfmt::skip] fn test f32>(f: F) { let tests: &[(&[f32], &[f32], f32)] = &[ (&[0_f32, 0., 0., 0.], &[0_f32, 0., 0., 0.], 0_f32), (&[0_f32, 0., 0., 1.], &[0_f32, 0., 0., 1.], 1_f32), (&[1_f32, 2., 3., 4.], &[0_f32, 0., 0., 0.], 0_f32), (&[1_f32, 2., 3., 4.], &[1_f32, 2., 3., 4.], 30_f32), (&[1_f32, 2., 3., 4., 1., 2., 3., 4.], &[1_f32, 1., 1., 1., 1., 1., 1., 1.], 20_f32), ]; for &(a, b, output) in tests { assert_eq!(f(a, b), output); } } ================================================ FILE: examples/dot_product/src/scalar.rs ================================================ //! Scalar implementation pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 { assert_eq!(a.len(), b.len()); a.iter().zip(b.iter()).map(|v| v.0 * v.1).sum() } #[cfg(test)] #[test] fn test() { crate::test(dot_prod) } ================================================ FILE: examples/dot_product/src/simd.rs ================================================ //! Scalar implementation use packed_simd::f32x4; pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 { assert_eq!(a.len(), b.len()); assert!(a.len() % 4 == 0); a.chunks_exact(4) .map(f32x4::from_slice_unaligned) .zip(b.chunks_exact(4).map(f32x4::from_slice_unaligned)) .map(|(a, b)| a * b) .sum::() .sum() } #[cfg(test)] #[test] fn test() { crate::test(dot_prod) } ================================================ FILE: examples/fannkuch_redux/Cargo.toml ================================================ [package] name = "fannkuch_redux" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [[bin]] name = "fannkuch_redux" path = "src/main.rs" [lib] name = "fannkuch_redux_lib" path = "src/lib.rs" ================================================ FILE: examples/fannkuch_redux/readme.md ================================================ # Fannkuch redux This is the [`fannkuch redux` benchmark from the benchmarksgame][bg]. ## Background and description The fannkuch benchmark is defined by programs in [Performing Lisp Analysis of the FANNKUCH Benchmark](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.35.5124), Kenneth R. Anderson and Duane Rettig. FANNKUCH is an abbreviation for the German word __Pfannkuchen_, or pancakes, in analogy to flipping pancakes. The conjecture is that the maximum count is approximated by `n*log(n)` when `n` goes to infinity. Each program should: * Take a permutation of `{1,...,n}`, for example: `{4,2,1,5,3}`. * Take the first element, here `4`, and reverse the order of the first `4` elements: `{5,1,2,4,3}`. * Repeat this until the first element is a `1`, so flipping won't change anything more: `{3,4,2,1,5}`, `{2,4,3,1,5}`, `{4,2,3,1,5}`, `{1,3,2,4,5}`. * Count the number of flips, here `5`. * Keep a checksum * `checksum = checksum + (if permutation_index is even then flips_count else -flips_count)` * `checksum = checksum + (toggle_sign_-1_1 * flips_count)` * Do this for all `n!` permutations, and record the maximum number of flips needed for any permutation. ## Usage It takes two arguments in this order: * `n`: the input sequence length: `{1, ..., n}` * (optional) `algorithm`: the algorithm to use - defaults to the fastest one. 
* `0`: scalar algorithm * `1`: SIMD algorithm [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/fannkuchredux.html#fannkuchredux ================================================ FILE: examples/fannkuch_redux/src/fannkuchredux-output.txt ================================================ 228 Pfannkuchen(7) = 16 ================================================ FILE: examples/fannkuch_redux/src/lib.rs ================================================ //! Fannkuch redux #![deny(warnings, rust_2018_idioms)] #![allow(non_snake_case, non_camel_case_types)] #![allow( clippy::similar_names, clippy::many_single_char_names, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::cast_possible_wrap, clippy::must_use_candidate, clippy::float_cmp )] pub mod scalar; pub mod simd; pub fn fannkuch_redux(n: usize, alg: usize) -> (i32, i32) { match alg { 0 => simd::fannkuch_redux(n), 1 => scalar::fannkuch_redux(n), v => panic!("unknown algorithm value: {}", v), } } ================================================ FILE: examples/fannkuch_redux/src/main.rs ================================================ #![deny(rust_2018_idioms)] use fannkuch_redux_lib::*; fn run(o: &mut O, n: usize, alg: usize) { let (checksum, maxflips) = fannkuch_redux(n, alg); writeln!(o, "{}\nPfannkuchen({}) = {}", checksum, n, maxflips).unwrap(); } fn main() { let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap(); assert!((3..=14).contains(&n), "n = {} is out-of-range [3, 14]", n); let alg = if let Some(v) = std::env::args().nth(2) { v.parse().unwrap() } else { 0 }; run(&mut std::io::stdout(), n, alg); } #[cfg(test)] mod tests { use super::*; static OUTPUT: &[u8] = include_bytes!("fannkuchredux-output.txt"); #[test] fn verify_output_simd() { let mut out: Vec = Vec::new(); run(&mut out, 7, 0); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } #[test] fn verify_output_scalar() { let mut out: Vec = Vec::new(); run(&mut out, 7, 1); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } } ================================================ FILE: examples/fannkuch_redux/src/scalar.rs ================================================ //! 
Scalar fannkuch redux implementation use std::{cmp, mem, thread}; // FIXME: replace with slice rotate fn rotate(x: &mut [i32]) { let mut prev = x[0]; for place in x.iter_mut().rev() { prev = mem::replace(place, prev) } } fn next_permutation(perm: &mut [i32], count: &mut [i32]) { for i in 1..perm.len() { rotate(&mut perm[..=i]); let count_i = &mut count[i]; if *count_i >= i as i32 { *count_i = 0; } else { *count_i += 1; break; } } } #[derive(Clone, Copy)] struct P { p: [i32; 16], } #[derive(Clone, Copy)] struct Perm { cnt: [i32; 16], fact: [u32; 16], n: u32, permcount: u32, perm: P, } impl Perm { fn new(n: u32) -> Self { let mut fact = [1; 16]; for i in 1..=n as usize { fact[i] = fact[i - 1] * i as u32; } Self { cnt: [0; 16], fact, n, permcount: 0, perm: P { p: [0; 16] } } } fn get(&mut self, mut idx: i32) -> P { let mut pp = [0_u8; 16]; self.permcount = idx as u32; for (i, place) in self.perm.p.iter_mut().enumerate() { *place = i as i32 + 1; } for i in (1..self.n as usize).rev() { let d = idx / self.fact[i] as i32; self.cnt[i] = d; idx %= self.fact[i] as i32; for (place, val) in pp.iter_mut().zip(self.perm.p[..=i].iter()) { *place = (*val) as u8 } let d = d as usize; for j in 0..=i { self.perm.p[j] = i32::from(if j + d <= i { pp[j + d] } else { pp[j + d - i - 1] }); } } self.perm } fn count(&self) -> u32 { self.permcount } fn max(&self) -> u32 { self.fact[self.n as usize] } fn next(&mut self) -> P { next_permutation(&mut self.perm.p, &mut self.cnt); self.permcount += 1; self.perm } } fn reverse(tperm: &mut [i32], k: usize) { tperm[..k].reverse() } fn work(mut perm: Perm, n: usize, max: usize) -> (i32, i32) { let mut checksum = 0; let mut maxflips = 0; let mut p = perm.get(n as i32); while perm.count() < max as u32 { let mut flips = 0; while p.p[0] != 1 { let k = p.p[0] as usize; reverse(&mut p.p, k); flips += 1; } checksum += if perm.count() % 2 == 0 { flips } else { -flips }; maxflips = cmp::max(maxflips, flips); p = perm.next(); } (checksum, maxflips) } pub fn fannkuch_redux(n: usize) -> (i32, i32) { let perm = Perm::new(n as u32); let m = 1; let mut futures = vec![]; let k = perm.max() / m; for j in (0..).map(|x| x * k).take_while(|&j| j < k * m) { let max = cmp::min(j + k, perm.max()); futures .push(thread::spawn(move || work(perm, j as usize, max as usize))) } let mut checksum = 0; let mut maxflips = 0; for fut in futures { let (cs, mf) = fut.join().unwrap(); checksum += cs; maxflips = cmp::max(maxflips, mf); } (checksum, maxflips) } #[cfg(test)] #[test] fn test() { assert_eq!(fannkuch_redux(7), (228, 16)); } ================================================ FILE: examples/fannkuch_redux/src/simd.rs ================================================ //! 
Vectorized fannkuch redux implementation use packed_simd::*; struct State { s: [u8; 16], flip_masks: [u8x16; 16], rotate_masks: [u8x16; 16], maxflips: i32, odd: u16, checksum: i32, } impl Default for State { fn default() -> Self { Self { s: [0; 16], flip_masks: [u8x16::splat(0); 16], rotate_masks: [u8x16::splat(0); 16], maxflips: 0, odd: 0, checksum: 0, } } } impl State { fn rotate_sisd(&mut self, n: usize) { let c = self.s[0]; for i in 1..=n { self.s[i - 1] = self.s[i]; } self.s[n] = c; } fn popmasks(&mut self) { let mut mask = [0_u8; 16]; for i in 0..16 { for (j, m) in mask.iter_mut().enumerate() { *m = j as u8; } for x in 0..(i + 1) / 2 { mask.swap(x, i - x); } self.flip_masks[i] = u8x16::from_slice_unaligned(&mask); for (j, s) in self.s.iter_mut().enumerate() { *s = j as u8; } self.rotate_sisd(i); self.rotate_masks[i] = self.load_s(); } } fn rotate(&mut self, n: usize) { self.load_s() .shuffle1_dyn(self.rotate_masks[n]) .write_to_slice_unaligned(&mut self.s) } fn load_s(&self) -> u8x16 { u8x16::from_slice_unaligned(&self.s) } fn tk(&mut self, n: usize) { #[derive(Copy, Clone, Debug)] struct Perm { perm: u8x16, start: u8, odd: u16, } let mut perms = [Perm { perm: u8x16::splat(0), start: 0, odd: 0 }; 60]; let mut i = 0; let mut c = [0_u8; 16]; let mut perm_max = 0; // Cache this locally outside the loop, since the compiler // can't optimize accesses to it otherwise. let mut odd = self.odd; while i < n { while i < n && perm_max < 60 { self.rotate(i); if c[i] as usize >= i { c[i] = 0; i += 1; continue; } c[i] += 1; i = 1; odd = !odd; if self.s[0] != 0 { if self.s[self.s[0] as usize] == 0 { if self.maxflips == 0 { self.maxflips = 1 } self.checksum += if odd == 0 { 1 } else { -1 }; } else { perms[perm_max].perm = self.load_s(); perms[perm_max].start = self.s[0]; perms[perm_max].odd = odd; perm_max += 1; } } } let mut k = 0; while k < std::cmp::max(1, perm_max) - 1 { let pk = &perms[k]; let pk1 = &perms[k + 1]; let mut perm1 = pk.perm; let mut perm2 = pk1.perm; let mut f1 = 0; let mut f2 = 0; let mut toterm1 = pk.start; let mut toterm2 = pk1.start; while toterm1 != 0 && toterm2 != 0 { perm1 = perm1.shuffle1_dyn(self.flip_masks[toterm1 as usize]); perm2 = perm2.shuffle1_dyn(self.flip_masks[toterm2 as usize]); toterm1 = perm1.extract(0); toterm2 = perm2.extract(0); f1 += 1; f2 += 1; } while toterm1 != 0 { perm1 = perm1.shuffle1_dyn(self.flip_masks[toterm1 as usize]); toterm1 = perm1.extract(0); f1 += 1; } while toterm2 != 0 { perm2 = perm2.shuffle1_dyn(self.flip_masks[toterm2 as usize]); toterm2 = perm2.extract(0); f2 += 1; } if f1 > self.maxflips { self.maxflips = f1 } if f2 > self.maxflips { self.maxflips = f2 } self.checksum += if pk.odd == 0 { f1 } else { -f1 }; self.checksum += if pk1.odd == 0 { f2 } else { -f2 }; k += 2; } while k < perm_max { let pk = &perms[k]; let mut perm = pk.perm; let mut f = 0; let mut toterm = pk.start; while toterm != 0 { perm = perm.shuffle1_dyn(self.flip_masks[toterm as usize]); toterm = perm.extract(0); f += 1; } if f > self.maxflips { self.maxflips = f } self.checksum += if pk.odd == 0 { f } else { -f }; k += 1 } perm_max = 0; } } } pub fn fannkuch_redux(n: usize) -> (i32, i32) { let mut state = State::default(); state.popmasks(); for i in 0..n { state.s[i] = i as u8 } state.tk(n); (state.checksum, state.maxflips) } #[cfg(test)] #[test] fn test() { assert_eq!(fannkuch_redux(7), (228, 16)); } ================================================ FILE: examples/mandelbrot/Cargo.toml ================================================ [package] name = "mandelbrot" 
version = "0.1.0" authors = ["gnzlbg "] build = "build.rs" edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } rayon = "^1.0" ispc = { version = "^1.0.4", optional = true } structopt = { version = "0.3.0", features = ["color"] } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [[bin]] name = "mandelbrot" path = "src/main.rs" [lib] name = "mandelbrot_lib" path = "src/lib.rs" [features] default = [] sleef-sys = ["packed_simd/sleef-sys"] core_arch = ["packed_simd/core_arch"] ================================================ FILE: examples/mandelbrot/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs mandelbrot benchmarks set -ex WIDTH=800 HEIGHT=800 if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } fi RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --features="${FEATURES}" if [[ "${VERIFY}" == "1" ]]; then RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo test --release --features="${FEATURES}" fi if [[ "${NORUN}" == "1" ]]; then exit 0 fi hyperfine "../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo scalar" hyperfine "../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo simd" if echo "$FEATURES" | grep -q "ispc"; then hyperfine "../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo ispc" fi ================================================ FILE: examples/mandelbrot/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/mandelbrot.ispc"]; for s in &ispc_files[..] { cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); cfg.compile("mandelbrot"); } } } ================================================ FILE: examples/mandelbrot/readme.md ================================================ # Mandelbrot This is the [`mandelbrot` benchmark from the benchmarksgame][bg]. ## Background http://mathworld.wolfram.com/MandelbrotSet.html ## Usage It takes four arguments in this order: * `width`: width of the image to render * `height`: height of the image to render * `algorithm`: algorithm to use: * `scalar`: scalar algorithm * `simd`: parallelized SIMD algorithm * `ispc`: ISPC + tasks algorithm * `--color` (optional): enables colorized output, which also determines the image format. * disabled (default): PBM: Portable BitMap format (black & white output) * enabled: PPM: Portable PixMap format (colored output) The resulting image is piped to `stdout`. `cargo run --release -- 400 400 --algo simd > output.ppm` outputs: ![run_400_png](https://user-images.githubusercontent.com/904614/43190942-72bdb834-8ffa-11e8-9dcf-a9a9632ae907.png) `cargo run --release -- 400 400 --algo simd --color > output.ppm` outputs: ![run_400_400_1_1_png](https://user-images.githubusercontent.com/904614/43190948-759969a4-8ffa-11e8-81a9-35e5baef3e86.png) ## Performance ``` ./benchmark.sh ``` On a dual core AVX1 i5 @1.8 GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |------------|---------------------|-------------| | `scalar` | 86.6 | 1.0x | | `simd` | 21.0 | 4.1x | | `ispc` | 25.7 | 3.4x | `simd` algorithm is ~1.2x faster than `ispc`. On a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |------------|---------------------|-------------------------| | `scalar` | 50.8 | 1.0x | | `simd` | 25.1 | 2x | | `ispc` | 14.4 | 3.52x | `simd` algorithm is ~1.74x slower than `ispc`. On a 40 core Xeon Gold 6148 CPU @ 2.40GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |------------|---------------------|-------------| | `scalar` | 59.9 | 1.0x | | `simd` | 29.9 | 2.0x | | `ispc` | 30.3 | 2.0x | `simd` algorithm is as fast as `ispc`. [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot ================================================ FILE: examples/mandelbrot/src/ispc_tasks.rs ================================================ //! Includes the ISPC implementations. use crate::*; use ispc::*; ispc_module!(mandelbrot); pub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec { let (width, height) = dims; let Range { start: left, end: right } = xr; let Range { start: top, end: bottom } = yr; let len = width * height; let mut out = Vec::with_capacity(len); unsafe { mandelbrot::mandelbrot_ispc( left, bottom, right, top, height as i32, width as i32, ITER_LIMIT as i32, out.as_mut_ptr() as *mut i32, ); out.set_len(len); } out } ================================================ FILE: examples/mandelbrot/src/lib.rs ================================================ //! The mandelbrot benchmark from the [benchmarks game][bg]. //! //! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot // FIXME: Null pointer deref warning triggered in this example, // likely inside a macro expansion deriving from packed_simd. #![deny(rust_2018_idioms)] #![allow( clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::cast_possible_truncation, clippy::must_use_candidate )] use rayon::prelude::*; use std::{io, ops}; // Each algorithm implementation must expose a single public function, // `generate`: fn generate(dimensions: Dimensions, xr: Range, yr: Range) -> // Vec; // // Generates the Mandelbrot fractal for a region of Cartesian space, // where X is bounded by `xr.begin..xr.end` and Y by `yr.begin..yr.end`. // // Returns a vector of dimensions `width * height`, where each byte is // the number of iterations the corresponding point reached before diverging. #[cfg(feature = "ispc")] mod ispc_tasks; mod scalar_par; mod simd_par; type Range = ops::Range; type Region = (Range, Range); /// The width and height of a generated image pub type Dimensions = (usize, usize); /// The Mandelbrot algorithms supported by this crate. #[derive(Debug, Copy, Clone)] pub enum Algorithm { /// Scalar parallel algorithm Scalar, /// Parallel SIMD algorithm using Rayon Simd, /// ISPC SIMD + parallel tasks algorithm Ispc, } pub struct Mandelbrot { dims: Dimensions, data: Vec, } impl Mandelbrot { /// Generates a new image of the Mandelbrot fractal. pub fn generate(dims: Dimensions, algo: Algorithm) -> Self { Self::generate_region(dims, DEFAULT_REGION, algo) } /// Generates a new image containing a certain region of the Mandelbrot /// fractal. pub fn generate_region( dims: Dimensions, region: Region, algo: Algorithm, ) -> Self { let data = match algo { Algorithm::Scalar => { scalar_par::generate(dims, region.0, region.1) } Algorithm::Simd => simd_par::generate(dims, region.0, region.1), #[cfg(feature = "ispc")] Algorithm::Ispc => ispc_tasks::generate(dims, region.0, region.1), #[cfg(not(feature = "ispc"))] Algorithm::Ispc => unimplemented!( "This crate was built with the `ispc` feature disabled" ), }; Self { dims, data } } /// Writes the PBM / PPM header to the output. 
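    // The Netpbm header emitted by this function is, e.g.,
    //   "P4\n<width> <height>\n"      for black & white output (PBM), or
    //   "P6\n<width> <height> 255\n"  for color output (PPM),
    // where "P4"/"P6" is the binary-format magic number and 255 is the
    // maximum value per color channel; the raster data follows the header.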
fn write_header( &self, f: &mut dyn io::Write, color: bool, ) -> io::Result<()> { writeln!(f, "P{}", if color { 6 } else { 4 })?; write!(f, "{} {}", self.dims.0, self.dims.1)?; if color { write!(f, " 255")?; } writeln!(f) } /// Outputs a black/white PBM bitmap to the given writer. pub fn output_pbm(&self, f: &mut dyn io::Write) -> io::Result<()> { self.write_header(f, false)?; assert_eq!( self.data.len() % 8, 0, "Output data must be a multiple of 8" ); let buf = self .data .par_chunks(8) .map(|ch| { let mut result = 0; ch.iter().enumerate().for_each(|(i, &count)| { let undiverged = count == ITER_LIMIT; result |= (undiverged as u8) << (7 - i); }); result }) .collect::>(); f.write_all(&buf) } /// Outputs a color PPM image to the given writer. pub fn output_ppm(&self, f: &mut dyn io::Write) -> io::Result<()> { self.write_header(f, true)?; let buf = self .data .par_iter() .flat_map(|&val| { const COLORS: &[(f32, f32, f32)] = &[ (0.0, 7.0, 100.0), (32.0, 107.0, 203.0), (237.0, 255.0, 255.0), (255.0, 170.0, 0.0), (0.0, 2.0, 0.0), ]; const SCALE: u32 = 12; let color_count = COLORS.len() as u32; let color = if val == ITER_LIMIT { vec![0, 0, 0] } else { let val = (val % SCALE) * color_count / SCALE; let left = val % color_count; let right = (left + 1) % color_count; let alpha = (val - left) as f32; let (r1, g1, b1) = COLORS[left as usize]; let (r2, g2, b2) = COLORS[right as usize]; vec![ (r1 + (r2 - r1) * alpha) as u8, (g1 + (g2 - g1) * alpha) as u8, (b1 + (b2 - b1) * alpha) as u8, ] }; color.into_par_iter() }) .collect::>(); f.write_all(&buf) } } /// Returns the default region of space to generate an image for. /// /// This is the region containing the fractal most people think of when they /// think of Mandelbrot, since values outside definitely diverge. const DEFAULT_REGION: (Range, Range) = (-1.5..0.5, -1.0..1.0); /// Threshold for Mandelbrot sequence divergence /// /// Complex numbers which have a modulus squared greater than this are /// considered to be diverging. const THRESHOLD: f64 = 4.0; /// Maximum amount of iterations to perform /// /// Increasing this will make more features to be visible in the image, /// assuming the resolution is large enoguh. 
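// Points whose count reaches `ITER_LIMIT` without diverging are treated as
// members of the set: they become set bits in the PBM output and black
// pixels in the PPM output.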
const ITER_LIMIT: u32 = 50; #[cfg(test)] mod tests { use super::*; #[test] #[cfg_attr(windows, ignore)] fn verify_all() { let width = 400; let height = 800; let dims = (width, height); let verify = |actual: &[u32], expected: &[u32]| { if actual != expected { for row in 0..height { for column in 0..width { let idx = row * width + column; assert_eq!( actual[idx], expected[idx], "difference at ({}, {})", row, column, ); } } } }; eprintln!("Generating Mandelbrot with scalar algorithm"); let scalar = scalar_par::generate(dims, DEFAULT_REGION.0, DEFAULT_REGION.1); assert_eq!(scalar.len(), width * height); eprintln!("Generating Mandelbrot with SIMD algorithm"); let simd = simd_par::generate(dims, DEFAULT_REGION.0, DEFAULT_REGION.1); verify(&simd[..], &scalar[..]); } fn verify_algo(algo: Algorithm) { static OUTPUT: &[u8] = include_bytes!("mandelbrot-output.txt"); let (width, height) = (200, 200); let dims = (width, height); let mb = Mandelbrot::generate(dims, algo); let out = { let mut out = Vec::with_capacity(width * height); mb.output_pbm(&mut out).unwrap(); out }; assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { out.into_iter().zip(OUTPUT.iter()).enumerate().for_each( |(i, (a, &b))| { assert_eq!( a, b, "byte {} differs - {:#08b} != {:#08b} (expected)", i, a, b, ); }, ); } } #[test] fn verify_output_scalar() { verify_algo(Algorithm::Scalar); } #[test] #[cfg_attr(windows, ignore)] fn verify_output_simd() { verify_algo(Algorithm::Simd); } } ================================================ FILE: examples/mandelbrot/src/main.rs ================================================ //! The Mandelbrot benchmark from the [benchmarksgame][bg] //! //! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot #![deny(rust_2018_idioms)] use mandelbrot_lib::*; use std::io; use structopt::StructOpt; /// Mandelbrot image generator. /// /// Output is printed to `stdout`. #[derive(StructOpt)] struct Opt { /// Image width. width: usize, /// Image height. height: usize, /// Enable this to output a color image. #[structopt(short = "c", long = "color")] color: bool, /// Algorithm #[structopt(short = "a", long = "algo")] algo: String, } const ALGORITHMS: &[&str] = &["scalar", "simd", "ispc"]; fn main() { let opt = Opt::from_args(); let algo = match opt.algo.as_str() { "scalar" => Algorithm::Scalar, "simd" => Algorithm::Simd, "ispc" => Algorithm::Ispc, algo => panic!( "Unknown algorithm: {:?}\nAvailable algorithms: {:?}", algo, ALGORITHMS ), }; let mb = Mandelbrot::generate((opt.width, opt.height), algo); let mut stdout = io::stdout(); if opt.color { mb.output_ppm(&mut stdout).unwrap(); } else { mb.output_pbm(&mut stdout).unwrap(); } } ================================================ FILE: examples/mandelbrot/src/scalar_par.rs ================================================ //! 
Scalar mandelbrot implementation use crate::*; /// Complex number #[repr(align(16))] #[derive(Copy, Clone)] struct Complex { real: f64, imag: f64, } impl Complex { /// Returns true if this member of the Mandelbrot sequence is diverging #[inline] fn diverged(&self) -> bool { let Self { real: x, imag: y } = self; let xx = x * x; let yy = y * y; let sum = xx + yy; sum > THRESHOLD } } /// An iterator yielding the infinite Mandelbrot sequence struct MandelbrotIter { /// Initial value which generated this sequence start: Complex, /// Current iteration value current: Complex, } impl MandelbrotIter { /// Creates a new Mandelbrot sequence iterator for a given starting point fn new(start: Complex) -> Self { Self { start, current: start } } /// Returns the number of iterations it takes for the Mandelbrot sequence /// to diverge at this point, or `ITER_LIMIT` if it doesn't diverge. fn count(mut self) -> u32 { let mut z = self.start; for i in 0..ITER_LIMIT { if z.diverged() { return i; } z = self.next().unwrap(); } ITER_LIMIT } } impl Iterator for MandelbrotIter { type Item = Complex; /// Generates the next value in the sequence #[inline] fn next(&mut self) -> Option { let Complex { real: c_x, imag: c_y } = self.start; let Complex { real: x, imag: y } = self.current; let xx = x * x; let yy = y * y; let xy = x * y; let new_x = c_x + (xx - yy); let new_y = c_y + (xy + xy); self.current = Complex { real: new_x, imag: new_y }; Some(self.current) } } pub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec { let (width, height) = dims; let xs = { let dx = (xr.end - xr.start) / (width as f64); let mut buf = Vec::new(); (0..width) .into_par_iter() .map(|j| xr.start + dx * (j as f64)) .collect_into_vec(&mut buf); buf }; let dy = (yr.end - yr.start) / (height as f64); let len = width * height; let mut out = Vec::with_capacity(len); unsafe { out.set_len(len); } out.par_chunks_mut(width).enumerate().for_each(|(i, row)| { let y = yr.start + dy * (i as f64); row.iter_mut().enumerate().for_each(|(j, count)| { let x = xs[j]; let z = Complex { real: x, imag: y }; *count = MandelbrotIter::new(z).count() as u32; }); }); out } ================================================ FILE: examples/mandelbrot/src/simd_par.rs ================================================ //! Vectorized parallel Mandelbrot implementation #![allow(non_camel_case_types)] use crate::*; use packed_simd::*; type u64s = u64x8; type u32s = u32x8; type f64s = f64x8; type m64s = m64x8; /// Storage for complex numbers in SIMD format. /// The real and imaginary parts are kept in separate registers. #[derive(Copy, Clone)] struct Complex { real: f64s, imag: f64s, } impl Complex { /// Returns a mask describing which members of the Mandelbrot sequence /// haven't diverged yet #[inline] fn undiverged(&self) -> m64s { let Self { real: x, imag: y } = *self; let xx = x * x; let yy = y * y; let sum = xx + yy; sum.le(f64s::splat(THRESHOLD)) } } /// Mandelbrot sequence iterator using SIMD. struct MandelbrotIter { /// Initial value which generated this sequence start: Complex, /// Current iteration value current: Complex, } impl MandelbrotIter { /// Creates a new Mandelbrot sequence iterator for a given starting point fn new(start: Complex) -> Self { Self { start, current: start } } /// Returns the number of iterations it takes for each member of the /// Mandelbrot sequence to diverge at this point, or `ITER_LIMIT` if /// they don't diverge. 
/// /// This function will operate on N complex numbers at once, where N is the /// number of lanes in a SIMD vector of doubles. fn count(mut self) -> u32s { let mut z = self.start; let mut count = u64s::splat(0); for _ in 0..ITER_LIMIT { // Keep track of those lanes which haven't diverged yet. The other // ones will be masked off. let undiverged = z.undiverged(); // Stop the iteration if they all diverged. Note that we don't do // this check every iteration, since a branch // misprediction can hurt more than doing some extra // calculations. if undiverged.none() { break; } count += undiverged.select(u64s::splat(1), u64s::splat(0)); z = self.next().unwrap(); } count.cast() } } impl Iterator for MandelbrotIter { type Item = Complex; /// Generates the next values in the sequence #[inline] fn next(&mut self) -> Option { let Complex { real: c_x, imag: c_y } = self.start; let Complex { real: x, imag: y } = self.current; let xx = x * x; let yy = y * y; let xy = x * y; let new_x = c_x + (xx - yy); let new_y = c_y + (xy + xy); self.current = Complex { real: new_x, imag: new_y }; Some(self.current) } } pub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec { let (width, height) = dims; let block_size = f64s::lanes(); assert_eq!( width % block_size, 0, "image width = {} is not divisible by the number of vector lanes = {}", width, block_size, ); let width_in_blocks = width / block_size; // The initial X values are the same for every row. let xs = unsafe { let dx = (xr.end - xr.start) / (width as f64); let mut buf: Vec = vec![f64s::splat(0.); width_in_blocks]; std::slice::from_raw_parts_mut(buf.as_mut_ptr() as *mut f64, width) .iter_mut() .enumerate() .for_each(|(j, x)| { *x = xr.start + dx * (j as f64); }); buf }; let dy = (yr.end - yr.start) / (height as f64); let len = width_in_blocks * height; let mut out = Vec::with_capacity(len); unsafe { out.set_len(len); } out.par_chunks_mut(width_in_blocks).enumerate().for_each(|(i, row)| { let y = f64s::splat(yr.start + dy * (i as f64)); row.iter_mut().enumerate().for_each(|(j, count)| { let x = xs[j]; let z = Complex { real: x, imag: y }; *count = MandelbrotIter::new(z).count(); }); }); // This is safe, we're transmuting from a more-aligned type to a // less-aligned one. #[allow(clippy::unsound_collection_transmute)] unsafe { let mut out: Vec = std::mem::transmute(out); out.set_len(width * height); out } } ================================================ FILE: examples/mandelbrot/volta/mandelbrot.ispc ================================================ /* Copyright (c) 2010-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ static inline int mandel(double c_re, double c_im, int count) { double z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.) break; double new_re = z_re*z_re - z_im*z_im; double new_im = 2.f * z_re * z_im; unmasked { z_re = c_re + new_re; z_im = c_im + new_im; } } return i; } export void mandelbrot_ispc(uniform double x0, uniform double y0, uniform double x1, uniform double y1, uniform int width, uniform int height, uniform int maxIterations, uniform int output[]) { double dx = (x1 - x0) / width; double dy = (y1 - y0) / height; for (uniform int j = 0; j < height; j++) { // Note that we'll be doing programCount computations in parallel, // so increment i by that much. This assumes that width evenly // divides programCount. foreach (i = 0 ... width) { // Figure out the position on the complex plane to compute the // number of iterations at. Note that the x values are // different across different program instances, since its // initializer incorporates the value of the programIndex // variable. double x = x0 + i * dx; double y = y0 + j * dy; int index = j * width + i; output[index] = mandel(x, y, maxIterations); } } } ================================================ FILE: examples/matrix_inverse/Cargo.toml ================================================ [package] name = "matrix_inverse" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [lib] name = "matrix_inverse_lib" path = "src/lib.rs" ================================================ FILE: examples/matrix_inverse/readme.md ================================================ # 4x4 matrix inverse ================================================ FILE: examples/matrix_inverse/src/lib.rs ================================================ //! 4x4 matrix inverse #![feature(custom_inner_attributes)] #![deny(rust_2018_idioms)] #![allow(clippy::must_use_candidate)] pub mod scalar; pub mod simd; #[derive(Copy, Clone, Debug, PartialEq, PartialOrd)] pub struct Matrix4x4([[f32; 4]; 4]); #[cfg(test)] #[rustfmt::skip] fn test Option>(f: F) { let tests: &[(Matrix4x4, Option)] = &[ // Identity: (Matrix4x4([ [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], ]), Some(Matrix4x4([ [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], ])) ), // None: (Matrix4x4([ [1., 2., 3., 4.], [12., 11., 10., 9.], [5., 6., 7., 8.], [16., 15., 14., 13.], ]), None ), // Other: (Matrix4x4([ [1., 1., 1., 0.], [0., 3., 1., 2.], [2., 3., 1., 0.], [1., 0., 2., 1.], ]), Some(Matrix4x4([ [-3., -0.5, 1.5, 1.0], [ 1., 0.25, -0.25, -0.5], [ 3., 0.25, -1.25, -0.5], [-3., 0.0, 1.0, 1.0], ])) ), ]; for &(input, output) in tests { assert_eq!(f(input), output); } } ================================================ FILE: examples/matrix_inverse/src/scalar.rs ================================================ //! 
Scalar implementation #[rustfmt::skip] use crate::*; #[allow(clippy::too_many_lines)] pub fn inv4x4(m: Matrix4x4) -> Option { let m = m.0; let mut inv = [ [ // row 0: // 0,0: m[1][1] * m[2][2] * m[3][3] - m[1][1] * m[2][3] * m[3][2] - m[2][1] * m[1][2] * m[3][3] + m[2][1] * m[1][3] * m[3][2] + m[3][1] * m[1][2] * m[2][3] - m[3][1] * m[1][3] * m[2][2], // 0,1: -m[0][1] * m[2][2] * m[3][3] + m[0][1] * m[2][3] * m[3][2] + m[2][1] * m[0][2] * m[3][3] - m[2][1] * m[0][3] * m[3][2] - m[3][1] * m[0][2] * m[2][3] + m[3][1] * m[0][3] * m[2][2], // 0,2: m[0][1] * m[1][2] * m[3][3] - m[0][1] * m[1][3] * m[3][2] - m[1][1] * m[0][2] * m[3][3] + m[1][1] * m[0][3] * m[3][2] + m[3][1] * m[0][2] * m[1][3] - m[3][1] * m[0][3] * m[1][2], // 0,3: -m[0][1] * m[1][2] * m[2][3] + m[0][1] * m[1][3] * m[2][2] + m[1][1] * m[0][2] * m[2][3] - m[1][1] * m[0][3] * m[2][2] - m[2][1] * m[0][2] * m[1][3] + m[2][1] * m[0][3] * m[1][2], ], [ // row 1 // 1,0: -m[1][0] * m[2][2] * m[3][3] + m[1][0] * m[2][3] * m[3][2] + m[2][0] * m[1][2] * m[3][3] - m[2][0] * m[1][3] * m[3][2] - m[3][0] * m[1][2] * m[2][3] + m[3][0] * m[1][3] * m[2][2], // 1,1: m[0][0] * m[2][2] * m[3][3] - m[0][0] * m[2][3] * m[3][2] - m[2][0] * m[0][2] * m[3][3] + m[2][0] * m[0][3] * m[3][2] + m[3][0] * m[0][2] * m[2][3] - m[3][0] * m[0][3] * m[2][2], // 1,2: -m[0][0] * m[1][2] * m[3][3] + m[0][0] * m[1][3] * m[3][2] + m[1][0] * m[0][2] * m[3][3] - m[1][0] * m[0][3] * m[3][2] - m[3][0] * m[0][2] * m[1][3] + m[3][0] * m[0][3] * m[1][2], // 1,3: m[0][0] * m[1][2] * m[2][3] - m[0][0] * m[1][3] * m[2][2] - m[1][0] * m[0][2] * m[2][3] + m[1][0] * m[0][3] * m[2][2] + m[2][0] * m[0][2] * m[1][3] - m[2][0] * m[0][3] * m[1][2], ], [ // row 2 // 2,0: m[1][0] * m[2][1] * m[3][3] - m[1][0] * m[2][3] * m[3][1] - m[2][0] * m[1][1] * m[3][3] + m[2][0] * m[1][3] * m[3][1] + m[3][0] * m[1][1] * m[2][3] - m[3][0] * m[1][3] * m[2][1], // 2,1: -m[0][0] * m[2][1] * m[3][3] + m[0][0] * m[2][3] * m[3][1] + m[2][0] * m[0][1] * m[3][3] - m[2][0] * m[0][3] * m[3][1] - m[3][0] * m[0][1] * m[2][3] + m[3][0] * m[0][3] * m[2][1], // 2,2: m[0][0] * m[1][1] * m[3][3] - m[0][0] * m[1][3] * m[3][1] - m[1][0] * m[0][1] * m[3][3] + m[1][0] * m[0][3] * m[3][1] + m[3][0] * m[0][1] * m[1][3] - m[3][0] * m[0][3] * m[1][1], // 2,3: -m[0][0] * m[1][1] * m[2][3] + m[0][0] * m[1][3] * m[2][1] + m[1][0] * m[0][1] * m[2][3] - m[1][0] * m[0][3] * m[2][1] - m[2][0] * m[0][1] * m[1][3] + m[2][0] * m[0][3] * m[1][1], ], [ // row 3 // 3,0: -m[1][0] * m[2][1] * m[3][2] + m[1][0] * m[2][2] * m[3][1] + m[2][0] * m[1][1] * m[3][2] - m[2][0] * m[1][2] * m[3][1] - m[3][0] * m[1][1] * m[2][2] + m[3][0] * m[1][2] * m[2][1], // 3,1: m[0][0] * m[2][1] * m[3][2] - m[0][0] * m[2][2] * m[3][1] - m[2][0] * m[0][1] * m[3][2] + m[2][0] * m[0][2] * m[3][1] + m[3][0] * m[0][1] * m[2][2] - m[3][0] * m[0][2] * m[2][1], // 3,2: -m[0][0] * m[1][1] * m[3][2] + m[0][0] * m[1][2] * m[3][1] + m[1][0] * m[0][1] * m[3][2] - m[1][0] * m[0][2] * m[3][1] - m[3][0] * m[0][1] * m[1][2] + m[3][0] * m[0][2] * m[1][1], // 3,3: m[0][0] * m[1][1] * m[2][2] - m[0][0] * m[1][2] * m[2][1] - m[1][0] * m[0][1] * m[2][2] + m[1][0] * m[0][2] * m[2][1] + m[2][0] * m[0][1] * m[1][2] - m[2][0] * m[0][2] * m[1][1], ], ]; let det = m[0][0] * inv[0][0] + m[0][1] * inv[1][0] + m[0][2] * inv[2][0] + m[0][3] * inv[3][0]; if det == 0. { return None; } let det_inv = 1. 
/ det; for row in &mut inv { for elem in row.iter_mut() { *elem *= det_inv; } } Some(Matrix4x4(inv)) } #[cfg(test)] #[test] fn test() { crate::test(inv4x4) } ================================================ FILE: examples/matrix_inverse/src/simd.rs ================================================ //! 4x4 matrix inverse using SIMD use crate::*; use packed_simd::shuffle; use packed_simd::f32x4; pub fn inv4x4(m: Matrix4x4) -> Option { let m = m.0; let m_0 = f32x4::from_slice_unaligned(&m[0]); let m_1 = f32x4::from_slice_unaligned(&m[1]); let m_2 = f32x4::from_slice_unaligned(&m[2]); let m_3 = f32x4::from_slice_unaligned(&m[3]); let tmp1: f32x4 = shuffle!(m_0, m_1, [0, 1, 4, 5]); let row1: f32x4 = shuffle!(m_2, m_3, [0, 1, 4, 5]); let row0 = shuffle!(tmp1, row1, [0, 2, 4, 6]); let row1: f32x4 = shuffle!(row1, tmp1, [1, 3, 5, 7]); let tmp1: f32x4 = shuffle!(m_0, m_1, [2, 3, 6, 7]); let row3: f32x4 = shuffle!(m_2, m_3, [2, 3, 6, 7]); let row2 = shuffle!(tmp1, row3, [0, 2, 4, 6]); let row3 = shuffle!(row3, tmp1, [1, 3, 5, 7]); let tmp1: f32x4 = row2 * row3; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor0 = row1 * tmp1; let minor1 = row0 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor0 = (row1 * tmp1) - minor0; let minor1 = (row0 * tmp1) - minor1; let minor1 = shuffle!(minor1, [2, 3, 0, 1]); let tmp1 = row1 * row2; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor0 = (row3 * tmp1) + minor0; let minor3 = row0 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor0 = minor0 - row3 * tmp1; let minor3 = row0 * tmp1 - minor3; let minor3 = shuffle!(minor3, [2, 3, 0, 1]); let tmp1 = row3 * shuffle!(row1, [2, 3, 0, 1]); let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let row2 = shuffle!(row2, [2, 3, 0, 1]); let minor0 = row2 * tmp1 + minor0; let minor2 = row0 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor0 = minor0 - row2 * tmp1; let minor2 = row0 * tmp1 - minor2; let minor2 = shuffle!(minor2, [2, 3, 0, 1]); let tmp1 = row0 * row1; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor2 = minor2 + row3 * tmp1; let minor3 = row2 * tmp1 - minor3; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor2 = row3 * tmp1 - minor2; let minor3 = minor3 - row2 * tmp1; let tmp1 = row0 * row3; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor1 = minor1 - row2 * tmp1; let minor2 = row1 * tmp1 + minor2; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor1 = row2 * tmp1 + minor1; let minor2 = minor2 - row1 * tmp1; let tmp1 = row0 * row2; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor1 = row3 * tmp1 + minor1; let minor3 = minor3 - row1 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor1 = minor1 - row3 * tmp1; let minor3 = row1 * tmp1 + minor3; let det = row0 * minor0; let det = shuffle!(det, [2, 3, 0, 1]) + det; let det = shuffle!(det, [1, 0, 3, 2]) + det; if det.sum() == 0. 
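    // A zero determinant means the matrix is singular, so there is no inverse.
    // Otherwise, `recpre()` below yields an approximate reciprocal of the
    // determinant, which is then refined with one Newton-Raphson step:
    //   x1 = 2*x0 - det*x0*x0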
{ return None; } let tmp1 = det.recpre(); let det = tmp1 + tmp1 - det * tmp1 * tmp1; let res0 = minor0 * det; let res1 = minor1 * det; let res2 = minor2 * det; let res3 = minor3 * det; let mut m = m; res0.write_to_slice_unaligned(&mut m[0]); res1.write_to_slice_unaligned(&mut m[1]); res2.write_to_slice_unaligned(&mut m[2]); res3.write_to_slice_unaligned(&mut m[3]); Some(Matrix4x4(m)) } #[cfg(test)] #[test] fn test() { crate::test(inv4x4) } ================================================ FILE: examples/nbody/Cargo.toml ================================================ [package] name = "nbody" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [[bin]] name = "nbody" path = "src/main.rs" [lib] name = "nbody_lib" path = "src/lib.rs" [features] default = [ ] sleef-sys = [ "packed_simd/sleef-sys" ] core_arch = [ "packed_simd/core_arch" ] ================================================ FILE: examples/nbody/benches/algs.rs ================================================ //! n-body benchmarks #![feature(test)] extern crate nbody_lib; extern crate test; use test::{black_box, Bencher}; #[bench] fn simd(b: &mut Bencher) { b.iter(|| black_box(nbody_lib::simd::run(black_box(10_000)))) } #[bench] fn scalar(b: &mut Bencher) { b.iter(|| black_box(nbody_lib::scalar::run(black_box(10_000)))) } ================================================ FILE: examples/nbody/readme.md ================================================ # N-Body This is the [`n-body` benchmark from the benchmarksgame][bg]. It models the orbits of Jovian planets, using the same simple symplectic-integrator. ## Usage It takes two arguments in this order: * `n`: the number of iterations to perform * (optional) `algorithm`: the algorithm to use - defaults to the fastest one. * `0`: scalar algorithm * `1`: SIMD algorithm ## Implementation There are three kernels, two of which are only run twice independently of the number of iterations (`offset_momentum` and `energy`). The `advance` kernel is run once per iterations and uses 100% of the running time. [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody ================================================ FILE: examples/nbody/src/lib.rs ================================================ //! The N-body benchmark from the [benchmarks game][bg]. //! //! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody #![deny(rust_2018_idioms)] #![allow( clippy::similar_names, clippy::excessive_precision, clippy::must_use_candidate )] pub mod scalar; pub mod simd; pub fn run(n: usize, alg: usize) -> (f64, f64) { match alg { 0 => scalar::run(n), 1 => simd::run(n), v => panic!("unknown algorithm value: {}", v), } } #[cfg(test)] const RESULTS: &[(usize, &str, &str)] = &[(1_000_usize, "-0.169075164", "-0.169087605")]; ================================================ FILE: examples/nbody/src/main.rs ================================================ //! The N-body benchmark from the [benchmarks game][bg]. //! //! 
[bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody #![deny(rust_2018_idioms)] fn run(o: &mut O, n: usize, alg: usize) { let (energy_before, energy_after) = nbody_lib::run(n, alg); writeln!(o, "{:.9}", energy_before).unwrap(); writeln!(o, "{:.9}", energy_after).unwrap(); } fn main() { let n: usize = std::env::args() .nth(1) .expect("need one arg") .parse() .expect("argument should be a usize"); let alg: usize = if let Some(v) = std::env::args().nth(2) { v.parse().expect("second argument must be a usize") } else { 1 // SIMD algorithm }; run(&mut std::io::stdout(), n, alg); } #[cfg(test)] mod tests { use super::*; static OUTPUT: &[u8] = include_bytes!("nbody-output.txt"); #[test] fn verify_output_simd() { let mut out: Vec = Vec::new(); run(&mut out, 1000, 0); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } #[test] fn verify_output_scalar() { let mut out: Vec = Vec::new(); run(&mut out, 1000, 1); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } } ================================================ FILE: examples/nbody/src/nbody-output.txt ================================================ -0.169075164 -0.169087605 ================================================ FILE: examples/nbody/src/scalar.rs ================================================ // The Computer Language Benchmarks Game // https://benchmarksgame-team.pages.debian.net // // contributed by the Rust Project Developers // contributed by TeXitoi use std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; const DAYS_PER_YEAR: f64 = 365.24; struct Body { x: [f64; 3], v: [f64; 3], mass: f64, } const N_BODIES: usize = 5; #[allow(clippy::unreadable_literal)] const BODIES: [Body; N_BODIES] = [ // Sun Body { x: [0., 0., 0.], v: [0., 0., 0.], mass: SOLAR_MASS }, // Jupiter Body { x: [ 4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01, ], v: [ 1.66007664274403694e-03 * DAYS_PER_YEAR, 7.69901118419740425e-03 * DAYS_PER_YEAR, -6.90460016972063023e-05 * DAYS_PER_YEAR, ], mass: 9.54791938424326609e-04 * SOLAR_MASS, }, // Saturn Body { x: [ 8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01, ], v: [ -2.76742510726862411e-03 * DAYS_PER_YEAR, 4.99852801234917238e-03 * DAYS_PER_YEAR, 2.30417297573763929e-05 * DAYS_PER_YEAR, ], mass: 2.85885980666130812e-04 * SOLAR_MASS, }, // Uranus Body { x: [ 1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01, ], v: [ 2.96460137564761618e-03 * DAYS_PER_YEAR, 2.37847173959480950e-03 * DAYS_PER_YEAR, -2.96589568540237556e-05 * DAYS_PER_YEAR, ], mass: 4.36624404335156298e-05 * SOLAR_MASS, }, // Neptune Body { x: [ 1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01, ], v: [ 2.68067772490389322e-03 * DAYS_PER_YEAR, 1.62824170038242295e-03 * DAYS_PER_YEAR, -9.51592254519715870e-05 * DAYS_PER_YEAR, ], mass: 5.15138902046611451e-05 * SOLAR_MASS, }, ]; fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { let mut b_slice: &mut [_] = bodies; while let Some(bi) = shift_mut_ref(&mut b_slice) { for bj in b_slice.iter_mut() { let mut dx = [0.; 3]; for (dx, (x_i, x_j)) in dx.iter_mut().zip(bi.x.iter().zip(bj.x.iter())) { *dx = x_i - x_j; } let mut d2: f64 = 0.; for dx in &dx { d2 += dx * dx; } let mag = dt / 
(d2 * d2.sqrt()); let massi_mag = bi.mass * mag; let massj_mag = bj.mass * mag; for (v_j, (v_i, dx)) in bj.v.iter_mut().zip(bi.v.iter_mut().zip(dx.iter())) { *v_j += dx * massi_mag; *v_i -= dx * massj_mag; } } for (x, v) in bi.x.iter_mut().zip(bi.v.iter()) { *x += dt * v; } } } fn energy(bodies: &[Body; N_BODIES]) -> f64 { let mut e = 0.0; let mut bodies = bodies.iter(); while let Some(bi) = bodies.next() { let mut e_l = 0.; for v in &bi.v { e_l += v * v; } e += e_l * bi.mass / 2.0; for bj in bodies.clone() { let mut dist = 0.; for (xi, xj) in bi.x.iter().zip(bj.x.iter()) { let dx = xi - xj; dist += dx * dx; } e -= bi.mass * bj.mass / dist.sqrt(); } } e } fn offset_momentum(bodies: &mut [Body; N_BODIES]) { let mut p = [0.; 3]; for bi in bodies.iter() { for (p, v) in p.iter_mut().zip(bi.v.iter()) { *p += v * bi.mass; } } let sun = &mut bodies[0]; for (v, p) in sun.v.iter_mut().zip(p.iter()) { *v = -p / SOLAR_MASS; } } /// Pop a mutable reference off the head of a slice, mutating the slice to no /// longer contain the mutable reference. #[allow(clippy::mut_mut)] fn shift_mut_ref<'a, T>(r: &mut &'a mut [T]) -> Option<&'a mut T> { if r.is_empty() { return None; } let tmp = std::mem::replace(r, &mut []); let (h, t) = tmp.split_at_mut(1); *r = t; Some(&mut h[0]) } pub fn run(n: usize) -> (f64, f64) { let mut bodies = BODIES; offset_momentum(&mut bodies); let a = energy(&bodies); for _ in 0..n { advance(&mut bodies, 0.01); } let b = energy(&bodies); (a, b) } #[cfg(test)] mod tests { #[test] fn test() { for &(size, a_e, b_e) in crate::RESULTS { let (a, b) = super::run(size); assert_eq!(format!("{:.9}", a), a_e); assert_eq!(format!("{:.9}", b), b_e); } } } ================================================ FILE: examples/nbody/src/simd.rs ================================================ #![deny(warnings)] use packed_simd::*; use std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; const DAYS_PER_YEAR: f64 = 365.24; pub struct Body { pub x: f64x4, pub v: f64x4, pub mass: f64, } const N_BODIES: usize = 5; #[allow(clippy::unreadable_literal)] const BODIES: [Body; N_BODIES] = [ // sun: Body { x: f64x4::new(0., 0., 0., 0.), v: f64x4::new(0., 0., 0., 0.), mass: SOLAR_MASS, }, // jupiter: Body { x: f64x4::new( 4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01, 0., ), v: f64x4::new( 1.66007664274403694e-03 * DAYS_PER_YEAR, 7.69901118419740425e-03 * DAYS_PER_YEAR, -6.90460016972063023e-05 * DAYS_PER_YEAR, 0., ), mass: 9.54791938424326609e-04 * SOLAR_MASS, }, // saturn: Body { x: f64x4::new( 8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01, 0., ), v: f64x4::new( -2.76742510726862411e-03 * DAYS_PER_YEAR, 4.99852801234917238e-03 * DAYS_PER_YEAR, 2.30417297573763929e-05 * DAYS_PER_YEAR, 0., ), mass: 2.85885980666130812e-04 * SOLAR_MASS, }, // uranus: Body { x: f64x4::new( 1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01, 0., ), v: f64x4::new( 2.96460137564761618e-03 * DAYS_PER_YEAR, 2.37847173959480950e-03 * DAYS_PER_YEAR, -2.96589568540237556e-05 * DAYS_PER_YEAR, 0., ), mass: 4.36624404335156298e-05 * SOLAR_MASS, }, // neptune: Body { x: f64x4::new( 1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01, 0., ), v: f64x4::new( 2.68067772490389322e-03 * DAYS_PER_YEAR, 1.62824170038242295e-03 * DAYS_PER_YEAR, -9.51592254519715870e-05 * DAYS_PER_YEAR, 0., ), mass: 5.15138902046611451e-05 * SOLAR_MASS, }, ]; pub fn offset_momentum(bodies: &mut [Body; N_BODIES]) { let (sun, rest) = bodies.split_at_mut(1); let sun 
= &mut sun[0]; for body in rest { let m_ratio = body.mass / SOLAR_MASS; sun.v -= body.v * m_ratio; } } pub fn energy(bodies: &[Body; N_BODIES]) -> f64 { let mut e = 0.; for i in 0..N_BODIES { let bi = &bodies[i]; e += bi.mass * (bi.v * bi.v).sum() * 0.5; for bj in bodies.iter().take(N_BODIES).skip(i + 1) { let dx = bi.x - bj.x; e -= bi.mass * bj.mass / (dx * dx).sum().sqrt() } } e } pub fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { const N: usize = N_BODIES * (N_BODIES - 1) / 2; // compute distance between bodies: let mut r = [f64x4::splat(0.); N]; { let mut i = 0; for j in 0..N_BODIES { for k in j + 1..N_BODIES { r[i] = bodies[j].x - bodies[k].x; i += 1; } } } let mut mag = [0.0; N]; let mut i = 0; while i < N { let d2s = f64x2::new((r[i] * r[i]).sum(), (r[i + 1] * r[i + 1]).sum()); let dmags = f64x2::splat(dt) / (d2s * d2s.sqrte()); dmags.write_to_slice_unaligned(&mut mag[i..]); i += 2; } i = 0; for j in 0..N_BODIES { for k in j + 1..N_BODIES { let f = r[i] * mag[i]; bodies[j].v -= f * bodies[k].mass; bodies[k].v += f * bodies[j].mass; i += 1 } } for body in bodies { body.x += dt * body.v } } pub fn run_k(n: usize, k: K) -> (f64, f64) where K: Fn(&mut [Body; N_BODIES], f64), { let mut bodies = BODIES; offset_momentum(&mut bodies); let energy_before = energy(&bodies); for _ in 0..n { k(&mut bodies, 0.01); } let energy_after = energy(&bodies); (energy_before, energy_after) } pub fn run(n: usize) -> (f64, f64) { run_k(n, advance) } #[cfg(test)] mod tests { #[test] fn test() { for &(size, a_e, b_e) in crate::RESULTS { let (a, b) = super::run(size); assert_eq!(format!("{:.9}", a), a_e); assert_eq!(format!("{:.9}", b), b_e); } } } ================================================ FILE: examples/options_pricing/Cargo.toml ================================================ [package] name = "options_pricing" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." 
} time = "^0.1" rayon = "^1.0" ispc = { version = "^1.0.4", optional = true } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [[bin]] name = "options_pricing" path = "src/main.rs" [lib] name = "options_pricing_lib" path = "src/lib.rs" [features] default = [] core_arch = [ "packed_simd/core_arch" ] sleef-sys = [ "packed_simd/sleef-sys" ] ispc_libm = [ "ispc" ] ================================================ FILE: examples/options_pricing/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs options_pricing benchmarks set -ex NUM_OPTIONS_BLACK_SCHOLES=10000000 if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi # Black-Scholes: ALGS=("black_scholes_scalar" "black_scholes_simd" "black_scholes_simd_par") if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } ALGS+=("black_scholes_ispc" "black_scholes_ispc_tasks") fi RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --features="${FEATURES}" if [[ "${NORUN}" == "1" ]]; then exit 0 fi #for alg in "${ALGS[@]}" #do # hyperfine "../target/release/options_pricing ${NUM_OPTIONS_BLACK_SCHOLES} ${alg}" #done # Binomial put: ALGS=("binomial_put_scalar" "binomial_put_simd" "binomial_put_simd_par") if echo "$FEATURES" | grep -q "ispc"; then ALGS+=("binomial_put_ispc" "binomial_put_ispc_tasks") fi NUM_OPTIONS_BINOMIAL_PUT=500000 for alg in "${ALGS[@]}" do hyperfine "../target/release/options_pricing ${NUM_OPTIONS_BINOMIAL_PUT} ${alg}" done ================================================ FILE: examples/options_pricing/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/options.ispc"]; for s in &ispc_files[..] { cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); #[cfg(feature = "ispc_libm")] { // Use the system's libm cfg.math_lib(ispc::opt::MathLib::System); } cfg.compile("options"); } } } ================================================ FILE: examples/options_pricing/readme.md ================================================ # Options Pricing ISPC example This is the [`options` ISPC benchmark][ispc]: > This program implements both the Black-Scholes and > Binomial options pricing models. ## Usage ``` cargo run --release --features=ispc -- ${SIZE} ${ALGORITHM} ``` ## Results ``` ./benchmark.sh ``` ## Black-Scholes On a dual core AVX1 i5 @1.8 GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 998 | 1.0x | | `simd` | 367 | 2.7x | | `par_simd` | 246 | 4.1x | | `ispc` | 360 | 2.8x | | `ispc+tasks` | 248 | 4.0x | `par_simd` and `ispc+tasks` algorithms are on par. ## Binomial put On a dual core AVX1 i5 @1.8 GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 2057 | 1.0x | | `simd` | 651 | 3.2x | | `par_simd` | 279 | 4.3x | | `ispc` | 805 | 7.4x | | `ispc+tasks` | 404 | 5.1x | `par_simd` algorithm is ~1.4x faster than `ispc+tasks`. [ispc]: https://github.com/ispc/ispc/tree/master/examples/options ================================================ FILE: examples/options_pricing/src/ispc_.rs ================================================ //! Includes the ISPC implementations. use ispc::*; ispc_module!(options); pub mod black_scholes { use super::*; pub fn serial( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::black_scholes_ispc( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } pub fn tasks( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::black_scholes_ispc_tasks( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } } pub mod binomial_put { use super::*; pub fn serial( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::binomial_put_ispc( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } pub fn tasks( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::binomial_put_ispc_tasks( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } } #[cfg(test)] mod tests { use super::*; #[test] fn black_scholes() { const NOPTS: usize = 1_000_000; let mut serial = crate::State::new(NOPTS); let mut tasks = crate::State::new(NOPTS); let serial_sum = serial.exec(black_scholes::serial); let tasks_sum = tasks.exec(black_scholes::tasks); assert_eq!(serial, tasks); assert_eq!(serial_sum, tasks_sum); } #[test] fn binomial_put() { const NOPTS: usize = 1_000_000; let mut serial = crate::State::new(NOPTS); let mut tasks = crate::State::new(NOPTS); let serial_sum = serial.exec(binomial_put::serial); let tasks_sum = tasks.exec(binomial_put::tasks); assert_eq!(serial, tasks); assert_eq!(serial_sum, tasks_sum); } } ================================================ FILE: examples/options_pricing/src/lib.rs ================================================ #![deny(rust_2018_idioms)] #![allow( clippy::inline_always, clippy::many_single_char_names, clippy::excessive_precision, clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::must_use_candidate, clippy::too_many_arguments, clippy::float_cmp )] use packed_simd::f32x8 as f32s; use packed_simd::f64x8 as f64s; const BINOMIAL_NUM: usize = 64; #[cfg(feature = "ispc")] pub mod ispc_; pub mod scalar; pub mod simd; pub mod simd_kernels; pub mod simd_par; pub mod sum; #[derive(PartialEq, Debug)] pub struct State { s: Vec, x: Vec, t: Vec, r: Vec, v: Vec, result: Vec, count: usize, } impl State { pub fn new(count: usize) -> Self { Self { s: vec![100.; count], x: vec![98.; count], t: vec![2.; count], r: vec![0.02; count], v: 
vec![5.; count], result: vec![0.0; count], count, } } pub fn exec(&mut self, model: F) -> f64 where F: Fn( &[f32], &[f32], &[f32], &[f32], &[f32], &mut [f32], usize, ) -> f64, { model( &self.s, &self.x, &self.t, &self.r, &self.v, &mut self.result, self.count, ) } } #[cfg(test)] fn almost_equal(a: f64, b: f64, max_rel_diff: f64) -> bool { let diff = (a - b).abs(); let a = a.abs(); let b = b.abs(); let largest = a.max(b); diff <= largest * max_rel_diff } ================================================ FILE: examples/options_pricing/src/main.rs ================================================ #![deny(warnings, rust_2018_idioms)] #![feature(custom_inner_attributes)] use options_pricing_lib::*; #[rustfmt::skip] fn run(name: &str, count: usize, f: F) where F: Fn(&[f32], &[f32], &[f32], &[f32], &[f32], &mut [f32], usize) -> f64, { let mut d = State::new(count); let t = time::Duration::span(move || { d.exec(f); } ); println!("{}: {} ms", name, t.num_milliseconds()); } macro_rules! ispc_alg { ($name:tt, $count:ident, $fun:path) => {{ #[cfg(feature = "ispc")] { run($name, $count, $fun); } #[cfg(not(feature = "ispc"))] { panic!("algorithm {} requires --feature=ispc", $name); } }}; } fn main() { let mut args = std::env::args(); args.next(); let num_options: usize = args .next() .unwrap() .parse() .expect("expected argument 1 of type usize: num_options"); let algorithm: String = args .next() .unwrap() .parse() .expect("expected argument 2 of type String: algorithm"); match algorithm.as_str() { "black_scholes_ispc_tasks" => ispc_alg!( "black_scholes_ispc_tasks", num_options, ispc_::black_scholes::tasks ), "black_scholes_ispc" => ispc_alg!( "black_scholes_ispc", num_options, ispc_::black_scholes::serial ), "binomial_put_ispc_tasks" => ispc_alg!( "binomial_put_ispc_tasks", num_options, ispc_::binomial_put::tasks ), "binomial_put_ispc" => ispc_alg!( "binomial_put_ispc", num_options, ispc_::binomial_put::serial ), "black_scholes_scalar" => { run("black_scholes_scalar", num_options, scalar::black_scholes) } "binomial_put_scalar" => { run("binomial_put_scalar", num_options, scalar::binomial_put) } "black_scholes_simd" => { run("black_scholes_simd", num_options, simd::black_scholes) } "binomial_put_simd" => { run("binomial_put_simd", num_options, simd::binomial_put) } "black_scholes_simd_par" => { run("black_scholes_simd_par", num_options, simd_par::black_scholes) } "binomial_put_simd_par" => { run("binomial_put_simd_par", num_options, simd_par::binomial_put) } _ => panic!("unknown algorithm: {}", algorithm), } } ================================================ FILE: examples/options_pricing/src/scalar.rs ================================================ //! Scalar implementation // Cumulative normal distribution function #[inline(always)] fn cnd(x: f32) -> f32 { const INV_SQRT_2PI: f32 = 0.398_942_280_40; let l = x.abs(); let k = 1. / (1. + 0.231_641_9 * l); let k2 = k * k; let k3 = k2 * k; let k4 = k2 * k2; let k5 = k3 * k2; let w: f32 = 0.319_381_53 * k - 0.356_563_782 * k2 + 1.781_477_937 * k3 + -1.821_255_978 * k4 + 1.330_274_429 * k5; let w = w * INV_SQRT_2PI * (-l * l * 0.5).exp(); if x > 0. { 1. 
- w } else { w } } pub fn black_scholes( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { for i in 0..count { let s = sa[i]; let x = xa[i]; let t = ta[i]; let r = ra[i]; let v = va[i]; let d1 = ((s / x).ln() + (r + v * v * 0.5) * t) / (v * t.sqrt()); let d2 = d1 - v * t.sqrt(); result[i] = s * cnd(d1) - x * (-r * t).exp() * cnd(d2); } crate::sum::slice_scalar(&result) } pub fn binomial_put( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { use crate::BINOMIAL_NUM; for i in 0..count { let s = sa[i]; let x = xa[i]; let t = ta[i]; let r = ra[i]; let v = va[i]; let dt = t / BINOMIAL_NUM as f32; let u = (v * dt.sqrt()).exp(); let d = 1. / u; let disc = (r * dt).exp(); let pu = (disc - d) / (u - d); let mut vs = [0_f32; BINOMIAL_NUM]; for (j, v) in vs.iter_mut().enumerate() { let e = (2_i32 * (j as i32)).wrapping_sub(BINOMIAL_NUM as i32); let upow = u.powf(e as f32); *v = 0_f32.max(x - s * upow); } for j in (0..BINOMIAL_NUM).rev() { for k in 0..j { vs[k] = ((1. - pu) * vs[k] + pu * vs[k + 1]) / disc; } } result[i] = vs[0]; } crate::sum::slice_scalar(&result) } #[cfg(feature = "ispc")] #[cfg(test)] mod tests { use super::*; use crate::almost_equal; #[test] fn black_scholes_ispc() { const NOPTS: usize = 1_000_000; let mut scalar = crate::State::new(NOPTS); let mut ispc = crate::State::new(NOPTS); let scalar_sum = scalar.exec(black_scholes); let ispc_sum = ispc.exec(crate::ispc_::black_scholes::serial); assert_eq!(scalar, ispc); assert_eq!(scalar_sum, ispc_sum); } #[test] fn binomial_put_ispc() { const NOPTS: usize = 1_000_000; let mut scalar = crate::State::new(NOPTS); let mut ispc = crate::State::new(NOPTS); let scalar_sum = scalar.exec(binomial_put); let ispc_sum = ispc.exec(crate::ispc_::binomial_put::serial); // FIXME: results differ slightly for each value of the result vector // need to figure out why // assert_eq!(scalar, ispc); assert!(almost_equal(scalar_sum, ispc_sum, 1e-5)); } } ================================================ FILE: examples/options_pricing/src/simd.rs ================================================ //! 
SIMD implementation use crate::f32s; pub fn serial( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, kernel: K, ) -> f64 where K: Fn(f32s, f32s, f32s, f32s, f32s) -> f32s, { assert_eq!(count % f32s::lanes(), 0); for i in (0..count).step_by(f32s::lanes()) { unsafe { let s = f32s::from_slice_unaligned_unchecked(&sa[i..]); let x = f32s::from_slice_unaligned_unchecked(&xa[i..]); let t = f32s::from_slice_unaligned_unchecked(&ta[i..]); let r = f32s::from_slice_unaligned_unchecked(&ra[i..]); let v = f32s::from_slice_unaligned_unchecked(&va[i..]); let r = kernel(s, x, t, r, v); r.write_to_slice_unaligned_unchecked(&mut result[i..]); } } crate::sum::slice(&result) } pub fn black_scholes( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { serial( sa, xa, ta, ra, va, result, count, crate::simd_kernels::black_scholes, ) } pub fn binomial_put( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { serial( sa, xa, ta, ra, va, result, count, crate::simd_kernels::binomial_put, ) } #[cfg(test)] mod tests { use super::*; use crate::almost_equal; #[test] fn black_scholes_scalar() { const NOPTS: usize = 1_000_000; let mut simd = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_sum = simd.exec(black_scholes); let scalar_sum = scalar.exec(crate::scalar::black_scholes); assert_eq!(simd, scalar); assert_eq!(simd_sum, scalar_sum); } #[test] fn binomial_put_scalar() { const NOPTS: usize = 1_000_000; let mut simd = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_sum = simd.exec(binomial_put); let scalar_sum = scalar.exec(crate::scalar::binomial_put); // assert_eq!(simd, scalar); // assert_eq!(simd_sum, scalar_sum); assert!(almost_equal(simd_sum, scalar_sum, 1e-5)); } } ================================================ FILE: examples/options_pricing/src/simd_kernels.rs ================================================ use crate::f32s; // Cumulative normal distribution function #[inline(always)] pub fn cnd(x: f32s) -> f32s { const INV_SQRT_2PI: f32s = f32s::splat(0.398_942_280_40); let l = x.abs(); let k = 1. / (1. + 0.231_641_9 * l); let k2 = k * k; let k3 = k2 * k; let k4 = k2 * k2; let k5 = k3 * k2; let w: f32s = 0.319_381_53 * k - 0.356_563_782 * k2 + 1.781_477_937 * k3 + -1.821_255_978 * k4 + 1.330_274_429 * k5; let w = w * INV_SQRT_2PI * (-l * l * 0.5).exp(); x.gt(f32s::splat(0.)).select(1. - w, w) } #[inline(always)] pub fn black_scholes(s: f32s, x: f32s, t: f32s, r: f32s, v: f32s) -> f32s { let d1 = ((s / x).ln() + (r + v * v * 0.5) * t) / (v * t.sqrt()); let d2 = d1 - v * t.sqrt(); s * cnd(d1) - x * (-r * t).exp() * cnd(d2) } #[inline(always)] pub fn binomial_put(s: f32s, x: f32s, t: f32s, r: f32s, v: f32s) -> f32s { use crate::BINOMIAL_NUM; let dt = t / BINOMIAL_NUM as f32; let u = (v * dt.sqrt()).exp(); let d = 1. / u; let disc = (r * dt).exp(); let inv_disc = 1. / disc; let pu = (disc - d) / (u - d); let o_m_pu = 1. 
- pu; let mut vs = [f32s::splat(0.); BINOMIAL_NUM]; for (j, v) in vs.iter_mut().enumerate() { let e = (2_i32 * (j as i32)).wrapping_sub(BINOMIAL_NUM as i32); let upow = u.powf(f32s::splat(e as f32)); *v = f32s::splat(0.).max(x - s * upow); } for j in (0..BINOMIAL_NUM).rev() { for k in 0..j { vs[k] = (o_m_pu * vs[k] + pu * vs[k + 1]) * inv_disc; } } vs[0] } ================================================ FILE: examples/options_pricing/src/simd_par.rs ================================================ //! SIMD implementation use crate::f32s; pub fn parallel( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, kernel: K, ) -> f64 where K: Fn(f32s, f32s, f32s, f32s, f32s) -> f32s + Sync + Send, { use rayon::prelude::*; assert_eq!(count % f32s::lanes(), 0); result.par_chunks_mut(f32s::lanes()).enumerate().for_each( |(i, result)| { debug_assert!(result.len() == 8); unsafe { let s = f32s::from_slice_unaligned_unchecked(&sa[i..]); let x = f32s::from_slice_unaligned_unchecked(&xa[i..]); let t = f32s::from_slice_unaligned_unchecked(&ta[i..]); let r = f32s::from_slice_unaligned_unchecked(&ra[i..]); let v = f32s::from_slice_unaligned_unchecked(&va[i..]); let r = kernel(s, x, t, r, v); r.write_to_slice_unaligned_unchecked(result); } }, ); crate::sum::slice(&result) } pub fn black_scholes( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { parallel( sa, xa, ta, ra, va, result, count, crate::simd_kernels::black_scholes, ) } pub fn binomial_put( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { parallel( sa, xa, ta, ra, va, result, count, crate::simd_kernels::binomial_put, ) } #[cfg(test)] mod tests { use super::*; use crate::almost_equal; #[test] fn black_scholes_scalar() { const NOPTS: usize = 1_000_000; let mut simd_par = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_par_sum = simd_par.exec(black_scholes); let scalar_sum = scalar.exec(crate::scalar::black_scholes); assert_eq!(simd_par, scalar); assert_eq!(simd_par_sum, scalar_sum); } #[test] fn binomial_put_scalar() { const NOPTS: usize = 1_000_000; let mut simd_par = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_par_sum = simd_par.exec(binomial_put); let scalar_sum = scalar.exec(crate::scalar::binomial_put); // assert_eq!(simd_par, scalar); // assert_eq!(simd_par_sum, scalar_sum); assert!(almost_equal(simd_par_sum, scalar_sum, 1e-5)); } } ================================================ FILE: examples/options_pricing/src/sum.rs ================================================ //! Implements different algorithms for summing a slice of `f32`s use super::{f32s, f64s}; pub fn slice(x: &[f32]) -> f64 { assert_eq!(f32s::lanes(), f64s::lanes()); assert_eq!(x.len() % f32s::lanes(), 0); let mut sum = f64s::splat(0.); for i in (0..x.len()).step_by(f32s::lanes()) { unsafe { use packed_simd::Cast; let v: f64s = f32s::from_slice_unaligned_unchecked(&x[i..]).cast(); sum += v; } } sum.sum() } pub fn slice_scalar(x: &[f32]) -> f64 { let mut sum = 0_f64; for &x in x { sum += f64::from(x); } sum } ================================================ FILE: examples/options_pricing/volta/options.ispc ================================================ // -*- mode: c++ -*- /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "options_defs.h" // Cumulative normal distribution function static inline float CND(float X) { float L = abs(X); float k = 1.0 / (1.0 + 0.2316419 * L); float k2 = k*k; float k3 = k2*k; float k4 = k2*k2; float k5 = k3*k2; const float invSqrt2Pi = 0.39894228040f; float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + -1.821255978f * k4 + 1.330274429f * k5); w *= invSqrt2Pi * exp(-L * L * .5f); if (X > 0.f) w = 1.0 - w; return w; } static inline uniform double sum(const uniform float result[], uniform int count) { double s = 0.0; foreach (i = 0 ... count) { s += (double)result[i]; } return reduce_add(s); } task void bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int first = taskIndex * (count/taskCount); uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); foreach (i = first ... last) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); float d2 = d1 - v * sqrt(T); result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2); } } export uniform double black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int nTasks = max((int)64, (int)count/16384); launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count); sync; return sum(result, count); } export uniform double black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { foreach (i = 0 ... count) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); float d2 = d1 - v * sqrt(T); result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2); } return sum(result, count); } static inline float binomial_put(float S, float X, float T, float r, float v) { float V[BINOMIAL_NUM]; float dt = T / BINOMIAL_NUM; float u = exp(v * sqrt(dt)); float d = 1. 
/ u; float disc = exp(r * dt); float Pu = (disc - d) / (u - d); for (uniform int j = 0; j < BINOMIAL_NUM; ++j) { float upow = pow(u, (float)(2*j-BINOMIAL_NUM)); V[j] = max(0., X - S * upow); } for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j) for (uniform int k = 0; k < j; ++k) V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; return V[0]; } export uniform double binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { foreach (i = 0 ... count) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; result[i] = binomial_put(S, X, T, r, v); } return sum(result, count); } task void binomial_task(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int first = taskIndex * (count/taskCount); uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); foreach (i = first ... last) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; result[i] = binomial_put(S, X, T, r, v); } } export uniform double binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int nTasks = max((int)64, (int)count/16384); launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count); sync; return sum(result, count); } ================================================ FILE: examples/options_pricing/volta/options_defs.h ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef OPTIONS_DEFS_H #define OPTIONS_DEFS_H 1 #define BINOMIAL_NUM 64 #endif // OPTIONS_DEFS_H ================================================ FILE: examples/rust-toolchain ================================================ nightly ================================================ FILE: examples/slice_sum/Cargo.toml ================================================ [package] name = "slice_sum" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [[bin]] name = "slice_sum" path = "src/main.rs" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } rayon = "^1.0" time = "^0.1" rand = "0.7.0" ================================================ FILE: examples/slice_sum/readme.md ================================================ # Computes the sum of a slice of floating-point numbers This example show-cases the performance difference of computing the sum of a `&[f32]` slice using horizontal or vertical operations. To run it: ``` RUSTFLAGS="-C target-cpu=native" cargo run --release ``` On my machine it prints: ``` vertical: 155 ms horizontal: 424 ms ``` that is, on my particular the slice sum algorithm using horizontal vector additions operation is ~2.7x slower than the one using vertical vector operations. ================================================ FILE: examples/slice_sum/src/main.rs ================================================ #![deny(rust_2018_idioms)] use packed_simd::f32x8 as f32s; use std::{mem, slice}; fn init(n: usize) -> Vec { use rand::distributions::Standard; use rand::prelude::*; thread_rng().sample_iter(&Standard).take(n).collect() } fn sum_ver(x: &[f32]) -> f32 { assert_eq!(x.len() % f32s::lanes(), 0); x.chunks_exact(f32s::lanes()) .map(f32s::from_slice_unaligned) .sum::() .sum() } fn sum_hor(x: &[f32]) -> f32 { assert_eq!(x.len() % f32s::lanes(), 0); x.chunks_exact(f32s::lanes()) .map(f32s::from_slice_unaligned) .map(f32s::sum) .sum() } fn sum_ver_par(x: &[f32]) -> f32 { use rayon::prelude::*; let len: usize = x.len(); assert_eq!(len % 8, 0); // find the first properly aligned element let (i, _): (usize, _) = x .iter() .enumerate() .find(|&(_, y): &(usize, &f32)| { (y as *const f32) as usize % mem::align_of::() == 0 }) .unwrap(); let (head, tail) = x.split_at(i); let head_sum: f32 = head.iter().sum(); #[allow(clippy::cast_ptr_alignment)] let tail: &[f32s] = unsafe { slice::from_raw_parts( tail.as_ptr() as *const f32s, tail.len() / f32s::lanes(), ) }; let tail_sum: f32s = tail.into_par_iter().sum(); head_sum + tail_sum.sum() } fn main() { let n: usize = std::env::args() .nth(1) .unwrap_or_else(|| "1000000000".to_string()) .parse() .expect("argument should be a usize"); assert_eq!(n % 8, 0, "argument should be a multiple of 8"); let s: &[f32] = &init(n); let iter = time::Duration::span(|| { let v: f32 = s.iter().sum(); assert!(!v.is_nan()); }); println!("std::iter::sum: {} ms", iter.num_milliseconds()); let rayon = time::Duration::span(|| { use rayon::prelude::*; let v: f32 = s.par_iter().sum(); assert!(!v.is_nan()); }); println!("rayon::sum: {} ms", rayon.num_milliseconds()); let ver = time::Duration::span(|| { assert!(!sum_ver(s).is_nan()); }); println!("vertical: {} ms", ver.num_milliseconds()); let hor = time::Duration::span(|| { assert!(!sum_hor(s).is_nan()); }); println!("horizontal: {} ms", hor.num_milliseconds()); let ver_par = time::Duration::span(|| { assert!(!sum_ver_par(s).is_nan()); }); println!("vertical_par: {} ms", ver_par.num_milliseconds()); } ================================================ FILE: examples/spectral_norm/Cargo.toml 
================================================ [package] name = "spectral_norm" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [[bin]] name = "spectral_norm" path = "src/main.rs" [lib] name = "spectral_norm_lib" path = "src/lib.rs" ================================================ FILE: examples/spectral_norm/readme.md ================================================ # Spectral norm This is the [`spectral-norm` benchmark from the benchmarksgame][bg]. ## Background and description MathWorld: ["Hundred-Dollar, Hundred-Digit Challenge Problems"](http://mathworld.wolfram.com/Hundred-DollarHundred-DigitChallengeProblems.html), [Challenge #3](http://mathworld.wolfram.com/SpectralNorm.html). Each program should: * calculate the spectral norm of an infinite matrix `A`, with entries `a11=1`, `a12=1/2`, `a21=1/3`, `a13=1/4`, `a22=1/5`, `a31=1/6`, etc. * implement 4 separate functions / procedures / methods like the [C# program](https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/spectralnorm-csharpcore-1.html) ## Usage It takes two arguments in this order: * `n`: the size of the matrix `A` (n-times-n) * (optional) `algorithm`: the algorithm to use - defaults to the fastest one. * `0`: scalar algorithm * `1`: SIMD algorithm [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/spectralnorm.html#spectralnorm ================================================ FILE: examples/spectral_norm/src/lib.rs ================================================ //! Spectral Norm #![deny(rust_2018_idioms)] #![allow(non_snake_case, non_camel_case_types)] #![allow( clippy::cast_precision_loss, clippy::must_use_candidate )] pub mod scalar; pub mod simd; fn A(i: usize, j: usize) -> f64 { ((i + j) * (i + j + 1) / 2 + i + 1) as f64 } pub fn spectral_norm(n: usize, alg: usize) -> f64 { match alg { 0 => simd::spectral_norm(n), 1 => scalar::spectral_norm(n), v => panic!("unknown algorithm value: {}", v), } } ================================================ FILE: examples/spectral_norm/src/main.rs ================================================ extern crate spectral_norm_lib; use spectral_norm_lib::*; fn run(o: &mut O, n: usize, alg: usize) { let answer = spectral_norm(n, alg); writeln!(o, "{:.9}", answer).unwrap(); } fn main() { let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap(); let alg = if let Some(v) = std::env::args().nth(2) { v.parse().unwrap() } else { 0 }; run(&mut std::io::stdout(), n, alg); } #[cfg(test)] mod tests { use super::*; static OUTPUT: &[u8] = include_bytes!("spectralnorm-output.txt"); #[test] fn verify_output_simd() { let mut out: Vec = Vec::new(); run(&mut out, 100, 0); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } #[test] fn verify_output_scalar() { let mut out: Vec = Vec::new(); run(&mut out, 100, 1); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } } ================================================ FILE: examples/spectral_norm/src/scalar.rs ================================================ //! 
Scalar spectral norm implementation use crate::*; use std::{ iter::*, ops::{Add, Div}, }; struct f64x2(f64, f64); impl Add for f64x2 { type Output = Self; fn add(self, rhs: Self) -> Self { Self(self.0 + rhs.0, self.1 + rhs.1) } } impl Div for f64x2 { type Output = Self; fn div(self, rhs: Self) -> Self { Self(self.0 / rhs.0, self.1 / rhs.1) } } pub fn spectral_norm(n: usize) -> f64 { assert!(n % 2 == 0, "only even lengths are accepted"); let mut u = vec![1.0; n]; let mut v = u.clone(); let mut tmp = v.clone(); for _ in 0..10 { mult_AtAv(&u, &mut v, &mut tmp); mult_AtAv(&v, &mut u, &mut tmp); } (dot(&u, &v) / dot(&v, &v)).sqrt() } fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) { mult_Av(v, tmp); mult_Atv(tmp, out); } fn mult_Av(v: &[f64], out: &mut [f64]) { mult(v, out, 0, A); } fn mult_Atv(v: &[f64], out: &mut [f64]) { mult(v, out, 0, |i, j| A(j, i)); } fn mult(v: &[f64], out: &mut [f64], start: usize, a: F) where F: Fn(usize, usize) -> f64, { for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) { let mut sum = f64x2(0.0, 0.0); for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) { let top = f64x2(chunk[0], chunk[1]); let bot = f64x2(a(i, j), a(i, j + 1)); sum = sum + top / bot; } let f64x2(a, b) = sum; *slot = a + b; } } fn dot(x: &[f64], y: &[f64]) -> f64 { x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b) } #[cfg(test)] #[test] fn test() { assert_eq!(&format!("{:.9}", spectral_norm(100)), "1.274219991"); } ================================================ FILE: examples/spectral_norm/src/simd.rs ================================================ //! Vectorized spectral norm implementation use crate::*; use packed_simd::*; fn mult_Av(v: &[f64], out: &mut [f64]) { assert!(v.len() == out.len()); assert!(v.len() % 2 == 0); for (i, out) in out.iter_mut().enumerate() { let mut sum = f64x2::splat(0.0); let mut j = 0; while j < v.len() { let b = f64x2::from_slice_unaligned(&v[j..]); let a = f64x2::new(A(i, j), A(i, j + 1)); sum += b / a; j += 2 } *out = sum.sum(); } } fn mult_Atv(v: &[f64], out: &mut [f64]) { assert!(v.len() == out.len()); assert!(v.len() % 2 == 0); for (i, out) in out.iter_mut().enumerate() { let mut sum = f64x2::splat(0.0); let mut j = 0; while j < v.len() { let b = f64x2::from_slice_unaligned(&v[j..]); let a = f64x2::new(A(j, i), A(j + 1, i)); sum += b / a; j += 2 } *out = sum.sum(); } } fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) { mult_Av(v, tmp); mult_Atv(tmp, out); } pub fn spectral_norm(n: usize) -> f64 { assert!(n % 2 == 0, "only even lengths are accepted"); let mut u = vec![1.0; n]; let mut v = u.clone(); let mut tmp = u.clone(); for _ in 0..10 { mult_AtAv(&u, &mut v, &mut tmp); mult_AtAv(&v, &mut u, &mut tmp); } (dot(&u, &v) / dot(&v, &v)).sqrt() } fn dot(x: &[f64], y: &[f64]) -> f64 { // This is auto-vectorized: x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b) } #[cfg(test)] #[test] fn test() { assert_eq!(&format!("{:.9}", spectral_norm(100)), "1.274219991"); } ================================================ FILE: examples/spectral_norm/src/spectralnorm-output.txt ================================================ 1.274219991 ================================================ FILE: examples/stencil/Cargo.toml ================================================ [package] name = "stencil" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." 
} time = "^0.1" rayon = "^1.0" ispc = { version = "^1.0.4", optional = true } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [[bin]] name = "stencil" path = "src/main.rs" [lib] name = "stencil_lib" path = "src/lib.rs" [features] default = [] core_arch = ["packed_simd/core_arch"] sleef-sys = ["packed_simd/sleef-sys"] ================================================ FILE: examples/stencil/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs aobench benchmarks set -ex if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi algs=("0" "1" "2") if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } algs+=( "3" "4" ) fi RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --no-default-features \ --features="${FEATURES}" if [[ "${VERIFY}" == "1" ]]; then RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo test --release --no-default-features \ --features="${FEATURES}" fi if [[ "${NORUN}" == "1" ]]; then exit 0 fi for alg in "${algs[@]}" do hyperfine "../target/release/stencil ${alg}" done ================================================ FILE: examples/stencil/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/stencil.ispc"]; for s in &ispc_files[..] { cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); cfg.compile("stencil"); } } } ================================================ FILE: examples/stencil/readme.md ================================================ # Stencil This is the generic [`stencil` ISPC benchmark][ispc]. ## Usage ``` cargo run --release --features=ispc ``` will run all benchmarks including the ISPC ones. ## Results ``` ./benchmark.sh ``` On a dual core AVX1 i5 @1.8 GHz: | 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] |
|--------------|---------------------|-------------------------|
| `scalar`     | 2842                | 1.0x                    |
| `vector`     | 630                 | 4.5x                    |
| `vector_par` | 444                 | 6.4x                    |
| `ispc`       | 558                 | 5.0x                    |
| `ispc_tasks` | 470                 | 6.0x                    |

`vector_par` is 1.06x faster than `ispc_tasks`.

On a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz:

| 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] |
|--------------|---------------------|-------------------------|
| `scalar`     | 1499                | 1.0x                    |
| `vector`     | 276                 | 5.4x                    |
| `vector_par` | 167                 | 9.0x                    |
| `ispc`       | 287                 | 5.2x                    |
| `ispc_tasks` | 395                 | 3.8x                    |

`vector_par` is 1.72x faster than `ispc`.

On a 40 core Xeon Gold 6148 CPU @ 2.40GHz:

| 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 1654 | 1.0x | | `vector` | 278 | 6.0x | | `vector_par` | 148 | 11.2x | | `ispc` | 185 | 9.0x | | `ispc_tasks` | 401 | 4.1x | `vector_par` is 1.25x faster than `ispc`. [ispc]: https://github.com/ispc/ispc/tree/master/examples/stencil ================================================ FILE: examples/stencil/src/ispc_loops.rs ================================================ //! Includes the ISPC implementations. use ispc::*; ispc_module!(stencil); pub fn serial( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { unsafe { self::stencil::loop_stencil_ispc( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef.as_ptr(), vsq.as_ptr(), a_even.as_mut_ptr(), a_odd.as_mut_ptr(), ); } } pub fn tasks( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { unsafe { self::stencil::loop_stencil_ispc_tasks( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef.as_ptr(), vsq.as_ptr(), a_even.as_mut_ptr(), a_odd.as_mut_ptr(), ); } } ================================================ FILE: examples/stencil/src/lib.rs ================================================ #![feature(custom_inner_attributes, stmt_expr_attributes)] // FIXME: Null pointer deref warning triggered in this example, // likely inside a macro expansion deriving from packed_simd. #![deny(rust_2018_idioms)] #![allow( clippy::similar_names, clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::inline_always, clippy::must_use_candidate )] #[cfg(feature = "ispc")] pub mod ispc_loops; pub mod scalar; pub mod simd; pub mod simd_par; #[derive(Clone, PartialEq, Debug)] pub struct Data { a: (Vec, Vec), vsq: Vec, coeff: [f32; 4], n: (i32, i32, i32), t: (i32, i32), x: (i32, i32), y: (i32, i32), z: (i32, i32), } impl Data { pub fn default() -> Self { Self::from_bounds(6, 4, 128, 128, 128) } pub fn benchmark() -> Self { Self::from_bounds(6, 4, 256, 256, 256) } pub fn from_bounds( max_t: i32, width: i32, n_x: i32, n_y: i32, n_z: i32, ) -> Self { #[rustfmt::skip] Self::new( 0, max_t, width, n_x - width, width, n_y - width, width, n_z - width, n_x, n_y, n_z, ) } /// Initializes data pub fn new( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, ) -> Self { let n = (n_x * n_y * n_z) as usize; let mut data = Self { a: (vec![0_f32; n], vec![0_f32; n]), vsq: vec![0_f32; n], coeff: [0.5, -0.25, 0.125, -0.0625], n: (n_x, n_y, n_z), t: (t0, t1), x: (x0, x1), y: (y0, y1), z: (z0, z1), }; data.reinit(); data } pub fn reinit(&mut self) { let mut offset: usize = 0; for z in 0..self.n.2 { for y in 0..self.n.1 { for x in 0..self.n.0 { unsafe { *self.a.0.get_unchecked_mut(offset) = if x < self.n.0 / 2 { x as f32 / self.n.0 as f32 } else { y as f32 / self.n.1 as f32 }; *self.a.1.get_unchecked_mut(offset) = 0.; *self.vsq.get_unchecked_mut(offset) = (x * y * z) as f32 / (self.n.0 * self.n.1 * self.n.2) as f32; offset += 1; } } } } } #[rustfmt::skip] pub fn exec(&mut self, f: F) where F: Fn(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, &[f32; 4], &[f32], &mut [f32], &mut [f32]), { f( self.t.0, self.t.1, self.x.0, self.x.1, self.y.0, self.y.1, self.z.0, self.z.1, self.n.0, self.n.1, 
self.n.2, &self.coeff, &self.vsq, &mut self.a.0, &mut self.a.1, ); } } #[cfg(test)] fn assert_data_eq(a: &Data, b: &Data) { if a == b { return; } assert_eq!(a.coeff, b.coeff, "coeffs differ"); assert_eq!(a.n, b.n, "n differ"); assert_eq!(a.t, b.t, "t differ"); assert_eq!(a.x, b.x, "x differ"); assert_eq!(a.y, b.y, "y differ"); assert_eq!(a.z, b.z, "z differ"); for z in 0..a.n.2 { for y in 0..a.n.1 { for x in 0..a.n.0 { let idx = (x + y * a.n.1 + z * a.n.1 * a.n.0) as usize; const EPSILON: f32 = 1E-4; assert!( (a.vsq[idx] - b.vsq[idx]).abs() < EPSILON, "vsq diff at idx = {} ({}, {}, {})", idx, x, y, z, ); assert!( (a.a.0[idx] - b.a.0[idx]).abs() < EPSILON, "a.0 diff at idx = {} ({}, {}, {})", idx, x, y, z, ); assert!( (a.a.1[idx] - b.a.1[idx]).abs() < EPSILON, "a.1 diff at idx = {} ({}, {}, {})", idx, x, y, z, ); } } } } ================================================ FILE: examples/stencil/src/main.rs ================================================ #![feature(custom_inner_attributes)] use stencil_lib::*; use std::env; #[rustfmt::skip] fn run(name: &str, f: F) where F: Fn(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, &[f32; 4], &[f32], &mut [f32], &mut [f32]), { let mut d = Data::benchmark(); let t = time::Duration::span(move || d.exec(f)); println!("{}: {} ms", name, t.num_milliseconds()); } fn main() { let mut args = env::args(); args.next(); let alg: usize = args.next().unwrap().parse().unwrap(); match alg { 0 => run("scalar", self::scalar::scalar), 1 => run("vector", self::simd::x8), 2 => run("vector_par", self::simd_par::x8_par), 3 => { #[cfg(feature = "ispc")] { run("ispc", self::ispc_loops::serial); } #[cfg(not(feature = "ispc"))] { panic!("error: algorithm requires binary to be compiled with the ispc feature") } } 4 => { #[cfg(feature = "ispc")] { run("ispc+tasks", self::ispc_loops::tasks); } #[cfg(not(feature = "ispc"))] { panic!("error: algorithm requires binary to be compiled with the ispc feature") } } _ => panic!("unknown algorithm"), } } ================================================ FILE: examples/stencil/src/scalar.rs ================================================ //! Scalar implementation pub fn step( x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, _n_z: i32, coef: &[f32; 4], vsq: &[f32], a_in: &[f32], a_out: &mut [f32], ) { let n_xy = n_x * n_y; for z in z0..z1 { for y in y0..y1 { for x in x0..x1 { let index = (z * n_xy) + (y * n_x) + x; macro_rules! a_cur { ($x:expr, $y:expr, $z:expr) => { a_in[(index + $x + $y * n_x + $z * n_xy) as usize] }; } macro_rules! a_next { ($x:expr, $y:expr, $z:expr) => { a_out[(index + $x + $y * n_x + $z * n_xy) as usize] }; } let mut div: f32 = coef[0] * a_cur!(0, 0, 0); for i in 1..4 { div += coef[i as usize] * (a_cur!(i, 0, 0) + a_cur!(-i, 0, 0) + a_cur!(0, i, 0) + a_cur!(0, -i, 0) + a_cur!(0, 0, i) + a_cur!(0, 0, -i)); } a_next!(0, 0, 0) = 2. 
* a_cur!(0, 0, 0) - a_next!(0, 0, 0) + vsq[index as usize] * div; } } } } pub fn scalar( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { for t in t0..t1 { if t & 1 == 0 { step( x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ); } else { step( x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_odd, a_even, ); } } } #[cfg(all(test, feature = "ispc"))] mod tests { use super::scalar; use crate::ispc_loops::serial; use crate::{assert_data_eq, Data}; #[test] fn scalar_ispc_verify() { let mut data_scalar = Data::default(); data_scalar.exec(scalar); let mut data_ispc = Data::default(); data_ispc.exec(serial); assert_data_eq(&data_scalar, &data_ispc); } } ================================================ FILE: examples/stencil/src/simd.rs ================================================ //! SIMD implementation use packed_simd::*; #[inline(always)] pub(crate) fn step_x8( x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, _n_z: i32, coef: &[f32; 4], vsq: &[f32], a_in: &[f32], a_out: &mut [f32], ) { assert!((x1 - x0) % f32x8::lanes() as i32 == 0); let n_xy = n_x * n_y; for z in z0..z1 { let z_idx = z * n_xy; for y in y0..y1 { let y_idx = y * n_x; for x in (x0..x1).step_by(f32x8::lanes()) { unsafe { let out_idx = x + y_idx; let index: i32 = z_idx + out_idx; macro_rules! a_cur { ($x:expr, $y:expr, $z:expr) => { f32x8::from_slice_unaligned_unchecked( &a_in.get_unchecked( (index + $x + $y * n_x + $z * n_xy) as usize.., ), ) }; } let cur_0 = a_cur!(0, 0, 0); let mut div: f32x8 = *coef.get_unchecked(0) * cur_0; for i in 1..4 { let coef = f32x8::splat(*coef.get_unchecked(i)); let sum = { let i = i as i32; a_cur!(i, 0, 0) + a_cur!(-i, 0, 0) + a_cur!(0, i, 0) + a_cur!(0, -i, 0) + a_cur!(0, 0, i) + a_cur!(0, 0, -i) }; div = coef.mul_adde(sum, div); } let vsq = f32x8::from_slice_unaligned_unchecked( vsq.get_unchecked(index as usize..), ); let sum = cur_0.mul_adde( f32x8::splat(2.), -f32x8::from_slice_unaligned_unchecked( a_out.get_unchecked(out_idx as usize..), ), ); let r = vsq.mul_adde(div, sum); r.write_to_slice_unaligned_unchecked( &mut a_out.get_unchecked_mut(out_idx as usize..), ); } } } } } #[inline(always)] fn x8_impl( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { for t in t0..t1 { if t & 1 == 0 { a_odd .chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_odd)| { let z = z as i32; #[rustfmt::skip] step_x8(x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ); }); } else { a_even .chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_even)| { let z = z as i32; #[rustfmt::skip] step_x8(x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_odd, a_even, ); }); } } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2,fma")] unsafe fn x8_impl_avx2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx")] unsafe fn x8_impl_avx( t0: i32, t1: i32, x0: i32, x1: 
i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.2")] unsafe fn x8_impl_sse42( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse2")] unsafe fn x8_impl_sse2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } unsafe fn x8_impl_def( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } pub fn x8( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") { #[rustfmt::skip] x8_impl_avx2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("avx") { #[rustfmt::skip] x8_impl_avx(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse4.2") { #[rustfmt::skip] x8_impl_sse42(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse2") { #[rustfmt::skip] x8_impl_sse2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else { #[rustfmt::skip] x8_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] unsafe { #[rustfmt::skip] x8_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(test)] mod tests { use super::x8; use crate::scalar::scalar; use crate::{assert_data_eq, Data}; #[test] fn simd_scalar_verify() { let mut data_simd = Data::default(); data_simd.exec(x8); let mut data_scalar = Data::default(); data_scalar.exec(scalar); assert_data_eq(&data_simd, &data_scalar); } #[cfg(feature = "ispc")] #[test] fn simd_ispc_verify() { use crate::ispc_loops::serial; let mut data_simd = Data::default(); data_simd.exec(x8); let mut data_ispc = Data::default(); data_ispc.exec(serial); assert_data_eq(&data_simd, &data_ispc); } } ================================================ FILE: examples/stencil/src/simd_par.rs ================================================ //! SIMD+Rayon implementation. 
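As a point of reference, a minimal usage sketch for this module: it drives the same 15-argument kernel signature through the `Data::exec` harness from `src/lib.rs`, mirroring what `src/main.rs` does for the `vector_par` algorithm, except that it uses the smaller `Data::default()` grid instead of `Data::benchmark()`. The `main` function below is illustrative only and not part of the example crate.

```rust
// Minimal sketch: run the SIMD+Rayon stencil kernel over the default grid.
use stencil_lib::{simd_par, Data};

fn main() {
    // Data::default() builds a 128x128x128 volume with a halo width of 4
    // and 6 time steps (t0 = 0, t1 = 6).
    let mut d = Data::default();
    // x8_par feature-detects AVX2/AVX/SSE4.2/SSE2 at runtime and processes
    // the volume in parallel, one z-slice per rayon task.
    d.exec(simd_par::x8_par);
}
```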
use crate::simd::step_x8; use rayon::prelude::*; #[inline(always)] fn x8_par_impl( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { assert!((z1 - z0) <= n_z); for t in t0..t1 { if t & 1 == 0 { a_odd .par_chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_odd)| { let z = z as i32; #[rustfmt::skip] step_x8( x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ); }); } else { a_even .par_chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_even)| { let z = z as i32; #[rustfmt::skip] step_x8( x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_odd, a_even, ); }); } } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] unsafe fn x8_par_impl_avx2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx")] unsafe fn x8_par_impl_avx( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.2")] unsafe fn x8_par_impl_sse42( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse2")] unsafe fn x8_par_impl_sse2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } unsafe fn x8_par_impl_def( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } pub fn x8_par( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { if is_x86_feature_detected!("avx2") { #[rustfmt::skip] x8_par_impl_avx2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("avx") { #[rustfmt::skip] x8_par_impl_avx(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse4.2") { #[rustfmt::skip] x8_par_impl_sse42(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse2") { #[rustfmt::skip] x8_par_impl_sse2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else { #[rustfmt::skip] 
x8_par_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] unsafe { #[rustfmt::skip] x8_par_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(test)] mod tests { use super::x8_par; use crate::scalar::scalar; use crate::{assert_data_eq, Data}; #[test] fn simd_par_verify() { let mut data_simd_par = Data::default(); data_simd_par.exec(x8_par); let mut data_scalar = Data::default(); data_scalar.exec(scalar); assert_data_eq(&data_simd_par, &data_scalar); } } ================================================ FILE: examples/stencil/volta/.gitignore ================================================ # Files built by ISPC /objs/ /stencil ================================================ FILE: examples/stencil/volta/Makefile ================================================ EXAMPLE=stencil CPP_SRC=stencil.cpp stencil_serial.cpp ISPC_SRC=stencil.ispc ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 ISPC_ARM_TARGETS=neon include common.mk ================================================ FILE: examples/stencil/volta/common.mk ================================================ TASK_CXX=tasksys.cpp TASK_LIB=-lpthread TASK_OBJ=objs/tasksys.o CXX=clang++ CXXFLAGS+=-Iobjs/ -O3 -march=native CC=clang CCFLAGS+=-Iobjs/ -O3 -march=native LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc ISPC_FLAGS+=-O3 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o) COMMA=, ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS))) #$(info multi-target detected: $(ISPC_IA_TARGETS)) ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o) endif ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o) endif ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o) endif ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o) endif ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o) endif ifneq (,$(findstring avx512knl,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512knl.o) endif ifneq (,$(findstring avx512skx,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512skx.o) endif endif ISPC_TARGETS=$(ISPC_IA_TARGETS) ARCH_BIT:=$(shell getconf LONG_BIT) ifeq ($(ARCH_BIT),32) ISPC_FLAGS += --arch=x86 CXXFLAGS += -m32 CCFLAGS += -m32 else ISPC_FLAGS += --arch=x86-64 CXXFLAGS += -m64 CCFLAGS += -m64 endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) else $(error Unknown architecture $(ARCH) from uname -m) endif CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o)) CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o)) OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS) default: $(EXAMPLE) all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar .PHONY: dirs clean dirs: /bin/mkdir -p objs/ objs/%.cpp objs/%.o objs/%.h: dirs clean: /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/%.o: %.cpp dirs $(ISPC_HEADER) $(CXX) $< $(CXXFLAGS) -c -o $@ objs/%.o: 
%.c dirs $(ISPC_HEADER) $(CC) $< $(CCFLAGS) -c -o $@ objs/%.o: ../%.cpp dirs $(CXX) $< $(CXXFLAGS) -c -o $@ objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o objs/%_ispc_avx512knl.o objs/%_ispc_avx512skx.o : %.ispc dirs $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 $(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) ================================================ FILE: examples/stencil/volta/stencil.cpp ================================================ /* Copyright (c) 2010-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #define NOMINMAX #pragma warning (disable: 4244) #pragma warning (disable: 4305) #endif #include #include #include #include #include #include "../timing.h" #include "stencil_ispc.h" using namespace ispc; extern void loop_stencil_serial(int t0, int t1, int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, const float coef[5], const float vsq[], float Aeven[], float Aodd[]); void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) { int offset = 0; for (int z = 0; z < Nz; ++z) for (int y = 0; y < Ny; ++y) for (int x = 0; x < Nx; ++x, ++offset) { A[0][offset] = (x < Nx / 2) ? x / float(Nx) : y / float(Ny); A[1][offset] = 0; vsq[offset] = x*y*z / float(Nx * Ny * Nz); } } int main(int argc, char *argv[]) { static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here int Nx = 256, Ny = 256, Nz = 256; int width = 4; if (argc > 1) { if (strncmp(argv[1], "--scale=", 8) == 0) { float scale = atof(argv[1] + 8); Nx *= scale; Ny *= scale; Nz *= scale; } } if ((argc == 4) || (argc == 5)) { for (int i = 0; i < 3; i++) { test_iterations[i] = atoi(argv[argc - 3 + i]); } } float *Aserial[2], *Aispc[2]; Aserial[0] = new float [Nx * Ny * Nz]; Aserial[1] = new float [Nx * Ny * Nz]; Aispc[0] = new float [Nx * Ny * Nz]; Aispc[1] = new float [Nx * Ny * Nz]; float *vsq = new float [Nx * Ny * Nz]; float coeff[4] = { 0.5, -.25, .125, -.0625 }; InitData(Nx, Ny, Nz, Aispc, vsq); // // Compute the image using the ispc implementation on one core; report // the minimum time of three runs. // double minTimeISPC = 1e30; for (unsigned int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPC = std::min(minTimeISPC, dt); } printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); InitData(Nx, Ny, Nz, Aispc, vsq); // // Compute the image using the ispc implementation with tasks; report // the minimum time of three runs. // double minTimeISPCTasks = 1e30; for (unsigned int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPCTasks = std::min(minTimeISPCTasks, dt); } printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); InitData(Nx, Ny, Nz, Aserial, vsq); // // And run the serial implementation 3 times, again reporting the // minimum time. 
// double minTimeSerial = 1e30; for (unsigned int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aserial[0], Aserial[1]); double dt = get_elapsed_mcycles(); printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minTimeSerial = std::min(minTimeSerial, dt); } printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); // Check for agreement int offset = 0; for (int z = 0; z < Nz; ++z) for (int y = 0; y < Ny; ++y) for (int x = 0; x < Nx; ++x, ++offset) { float error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / Aserial[1][offset]); if (error > 1e-4) printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", x, y, z, Aispc[1][offset], Aserial[1][offset]); } return 0; } ================================================ FILE: examples/stencil/volta/stencil.ispc ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ static void stencil_step(uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform const float Ain[], uniform float Aout[]) { const uniform int Nxy = Nx * Ny; foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1) { int index = (z * Nxy) + (y * Nx) + x; #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] float div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + A_cur(0, +1, 0) + A_cur(0, -1, 0) + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + A_cur(0, +2, 0) + A_cur(0, -2, 0) + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + A_cur(0, +3, 0) + A_cur(0, -3, 0) + A_cur(0, 0, +3) + A_cur(0, 0, -3)); A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + vsq[index] * div; } } static task void stencil_step_task(uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform const float Ain[], uniform float Aout[]) { stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, Nx, Ny, Nz, coef, vsq, Ain, Aout); } export void loop_stencil_ispc_tasks(uniform int t0, uniform int t1, uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform float Aeven[], uniform float Aodd[]) { for (uniform int t = t0; t < t1; ++t) { // Parallelize across cores as well: each task will work on a slice // of 1 in the z extent of the volume. if ((t & 1) == 0) launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, coef, vsq, Aeven, Aodd); else launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, coef, vsq, Aodd, Aeven); // We need to wait for all of the launched tasks to finish before // starting the next iteration. sync; } } export void loop_stencil_ispc(uniform int t0, uniform int t1, uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform float Aeven[], uniform float Aodd[]) { for (uniform int t = t0; t < t1; ++t) { if ((t & 1) == 0) stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aeven, Aodd); else stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aodd, Aeven); } } ================================================ FILE: examples/stencil/volta/stencil_serial.cpp ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ static void stencil_step_serial(int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, const float coef[4], const float vsq[], const float Ain[], float Aout[]) { int Nxy = Nx * Ny; for (int z = z0; z < z1; ++z) { for (int y = y0; y < y1; ++y) { for (int x = x0; x < x1; ++x) { int index = (z * Nxy) + (y * Nx) + x; #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] float div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + A_cur(0, +1, 0) + A_cur(0, -1, 0) + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + A_cur(0, +2, 0) + A_cur(0, -2, 0) + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + A_cur(0, +3, 0) + A_cur(0, -3, 0) + A_cur(0, 0, +3) + A_cur(0, 0, -3)); A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + vsq[index] * div; } } } } void loop_stencil_serial(int t0, int t1, int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, const float coef[4], const float vsq[], float Aeven[], float Aodd[]) { for (int t = t0; t < t1; ++t) { if ((t & 1) == 0) stencil_step_serial(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aeven, Aodd); else stencil_step_serial(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aodd, Aeven); } } ================================================ FILE: examples/stencil/volta/tasksys.cpp ================================================ /* Copyright (c) 2011-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* This file implements simple task systems that provide the three entrypoints used by ispc-generated to code to handle 'launch' and 'sync' statements in ispc programs. See the section "Task Parallelism: Language Syntax" in the ispc documentation for information about using task parallelism in ispc programs, and see the section "Task Parallelism: Runtime Requirements" for information about the task-related entrypoints that are implemented here. There are several task systems in this file, built using: - Microsoft's Concurrency Runtime (ISPC_USE_CONCRT) - Apple's Grand Central Dispatch (ISPC_USE_GCD) - bare pthreads (ISPC_USE_PTHREADS, ISPC_USE_PTHREADS_FULLY_SUBSCRIBED) - Cilk Plus (ISPC_USE_CILK) - TBB (ISPC_USE_TBB_TASK_GROUP, ISPC_USE_TBB_PARALLEL_FOR) - OpenMP (ISPC_USE_OMP) - HPX (ISPC_USE_HPX) The task system implementation can be selected at compile time, by defining the appropriate preprocessor symbol on the command line (for e.g.: -D ISPC_USE_TBB). Not all combinations of platform and task system are meaningful. If no task system is requested, a reasonable default task system for the platform is selected. Here are the task systems that can be selected: #define ISPC_USE_GCD #define ISPC_USE_CONCRT #define ISPC_USE_PTHREADS #define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define ISPC_USE_CILK #define ISPC_USE_OMP #define ISPC_USE_TBB_TASK_GROUP #define ISPC_USE_TBB_PARALLEL_FOR The ISPC_USE_PTHREADS_FULLY_SUBSCRIBED model essentially takes over the machine by assigning one pthread to each hyper-thread, and then uses spinlocks and atomics for task management. This model is useful for KNC where tasks can take over the machine, but less so when there are other tasks that need running on the machine. #define ISPC_USE_CREW #define ISPC_USE_HPX The HPX model requires the HPX runtime environment to be set up. This can be done manually, e.g. with hpx::init, or by including hpx/hpx_main.hpp which uses the main() function as entry point and sets up the runtime system. Number of threads can be specified as commandline parameter with --hpx:threads, use "all" to spawn one thread per processing unit. 
*/ #if !(defined ISPC_USE_CONCRT || defined ISPC_USE_GCD || \ defined ISPC_USE_PTHREADS || defined ISPC_USE_PTHREADS_FULLY_SUBSCRIBED || \ defined ISPC_USE_TBB_TASK_GROUP || defined ISPC_USE_TBB_PARALLEL_FOR || \ defined ISPC_USE_OMP || defined ISPC_USE_CILK || \ defined ISPC_USE_HPX) // If no task model chosen from the compiler cmdline, pick a reasonable default #if defined(_WIN32) || defined(_WIN64) #define ISPC_USE_CONCRT #elif defined(__linux__) #define ISPC_USE_PTHREADS #elif defined(__APPLE__) #define ISPC_USE_GCD #endif #if defined(__KNC__) #define ISPC_USE_PTHREADS #endif #endif // No task model specified on compiler cmdline #if defined(_WIN32) || defined(_WIN64) #define ISPC_IS_WINDOWS #elif defined(__linux__) #define ISPC_IS_LINUX #elif defined(__APPLE__) #define ISPC_IS_APPLE #endif #if defined(__KNC__) #define ISPC_IS_KNC #endif #define DBG(x) #ifdef ISPC_IS_WINDOWS #define NOMINMAX #include #endif // ISPC_IS_WINDOWS #ifdef ISPC_USE_CONCRT #include using namespace Concurrency; #endif // ISPC_USE_CONCRT #ifdef ISPC_USE_GCD #include #include #endif // ISPC_USE_GCD #ifdef ISPC_USE_PTHREADS #include #include #include #include #include #include #include #include #include #include #include #endif // ISPC_USE_PTHREADS #ifdef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #include #include #include #include #include #include #include #include #include #include #include //#include #include #endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #ifdef ISPC_USE_TBB_PARALLEL_FOR #include #endif // ISPC_USE_TBB_PARALLEL_FOR #ifdef ISPC_USE_TBB_TASK_GROUP #include #endif // ISPC_USE_TBB_TASK_GROUP #ifdef ISPC_USE_CILK #include #endif // ISPC_USE_TBB #ifdef ISPC_USE_OMP #include #endif // ISPC_USE_OMP #ifdef ISPC_USE_HPX #include #include #endif // ISPC_USE_HPX #ifdef ISPC_IS_LINUX #include #endif // ISPC_IS_LINUX #include #include #include #include #include #include // Signature of ispc-generated 'task' functions typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, int taskIndex, int taskCount, int taskIndex0, int taskIndex1, int taskIndex2, int taskCount0, int taskCount1, int taskCount2); // Small structure used to hold the data for each task #ifdef _MSC_VER __declspec(align(16)) #endif struct TaskInfo { TaskFuncType func; void *data; int taskIndex; int taskCount3d[3]; #if defined( ISPC_USE_CONCRT) event taskEvent; #endif int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } int taskIndex0() const { return taskIndex % taskCount3d[0]; } int taskIndex1() const { return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; } int taskIndex2() const { return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); } int taskCount0() const { return taskCount3d[0]; } int taskCount1() const { return taskCount3d[1]; } int taskCount2() const { return taskCount3d[2]; } TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } } #ifndef _MSC_VER __attribute__((aligned(32))); #endif ; // ispc expects these functions to have C linkage / not be mangled extern "C" { void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); void ISPCSync(void *handle); } /////////////////////////////////////////////////////////////////////////// // TaskGroupBase #define LOG_TASK_QUEUE_CHUNK_SIZE 14 #define MAX_TASK_QUEUE_CHUNKS 8 #define TASK_QUEUE_CHUNK_SIZE (1<> LOG_TASK_QUEUE_CHUNK_SIZE); int offset = index & (TASK_QUEUE_CHUNK_SIZE-1); if (chunk == MAX_TASK_QUEUE_CHUNKS) { fprintf(stderr, "A total of %d tasks have 
been launched from the " "current function--the simple built-in task system can handle " "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE " "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. " "Sorry! Exiting.\n", index); exit(1); } if (taskInfo[chunk] == NULL) taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE]; return &taskInfo[chunk][offset]; } inline void * TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) { char *basePtr = memBuffers[curMemBuffer]; intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset); iptr = (iptr + (alignment-1)) & ~(alignment-1); int newOffset = int(iptr - (intptr_t)basePtr + size); if (newOffset < memBufferSize[curMemBuffer]) { curMemBufferOffset = newOffset; return (char *)iptr; } ++curMemBuffer; curMemBufferOffset = 0; assert(curMemBuffer < NUM_MEM_BUFFERS); int allocSize = 1 << (12 + curMemBuffer); allocSize = std::max(int(size+alignment), allocSize); char *newBuf = new char[allocSize]; memBufferSize[curMemBuffer] = allocSize; memBuffers[curMemBuffer] = newBuf; return AllocMemory(size, alignment); } /////////////////////////////////////////////////////////////////////////// // Atomics and the like static inline void lMemFence() { // Windows atomic functions already contain the fence // KNC doesn't need the memory barrier #if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS __sync_synchronize(); #endif } static void * lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) { #ifdef ISPC_IS_WINDOWS return InterlockedCompareExchangePointer(v, newValue, oldValue); #else void *result = __sync_val_compare_and_swap(v, oldValue, newValue); lMemFence(); return result; #endif // ISPC_IS_WINDOWS } static int32_t lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) { #ifdef ISPC_IS_WINDOWS return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue); #else int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue); lMemFence(); return result; #endif // ISPC_IS_WINDOWS } static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif } /////////////////////////////////////////////////////////////////////////// #ifdef ISPC_USE_CONCRT // With ConcRT, we don't need to extend TaskGroupBase at all. class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_CONCRT #ifdef ISPC_USE_GCD /* With Grand Central Dispatch, we associate a GCD dispatch group with each task group. (We'll later wait on this dispatch group when we need to wait on all of the tasks in the group to finish.) 
*/ class TaskGroup : public TaskGroupBase { public: TaskGroup() { gcdGroup = dispatch_group_create(); } void Launch(int baseIndex, int count); void Sync(); private: dispatch_group_t gcdGroup; }; #endif // ISPC_USE_GCD #ifdef ISPC_USE_PTHREADS static void *lTaskEntry(void *arg); class TaskGroup : public TaskGroupBase { public: TaskGroup() { numUnfinishedTasks = 0; waitingTasks.reserve(128); inActiveList = false; } void Reset() { TaskGroupBase::Reset(); numUnfinishedTasks = 0; assert(inActiveList == false); lMemFence(); } void Launch(int baseIndex, int count); void Sync(); private: friend void *lTaskEntry(void *arg); int32_t numUnfinishedTasks; int32_t pad[3]; std::vector waitingTasks; bool inActiveList; }; #endif // ISPC_USE_PTHREADS #ifdef ISPC_USE_CILK class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_CILK #ifdef ISPC_USE_OMP class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_OMP #ifdef ISPC_USE_TBB_PARALLEL_FOR class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_TBB_PARALLEL_FOR #ifdef ISPC_USE_TBB_TASK_GROUP class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); private: tbb::task_group tbbTaskGroup; }; #endif // ISPC_USE_TBB_TASK_GROUP #ifdef ISPC_USE_HPX class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); private: std::vector> futures; }; #endif // ISPC_USE_HPX /////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////// // Grand Central Dispatch #ifdef ISPC_USE_GCD /* A simple task system for ispc programs based on Apple's Grand Central Dispatch. */ static dispatch_queue_t gcdQueue; static volatile int32_t lock = 0; static void InitTaskSystem() { if (gcdQueue != NULL) return; while (1) { if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { if (gcdQueue == NULL) { gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); assert(gcdQueue != NULL); lMemFence(); } lock = 0; break; } } } static void lRunTask(void *ti) { TaskInfo *taskInfo = (TaskInfo *)ti; // FIXME: these are bogus values; may cause bugs in code that depends // on them having unique values in different threads. int threadIndex = 0; int threadCount = 1; // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, taskInfo->taskIndex, taskInfo->taskCount(), taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(), taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2()); } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; ++i) { TaskInfo *ti = GetTaskInfo(baseIndex + i); dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask); } } inline void TaskGroup::Sync() { dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER); } #endif // ISPC_USE_GCD /////////////////////////////////////////////////////////////////////////// // Concurrency Runtime #ifdef ISPC_USE_CONCRT static void InitTaskSystem() { // No initialization needed } static void __cdecl lRunTask(LPVOID param) { TaskInfo *ti = (TaskInfo *)param; // Actually run the task. // FIXME: like the GCD implementation for OS X, this is passing bogus // values for the threadIndex and threadCount builtins, which in turn // will cause bugs in code that uses those. 
int threadIndex = 0; int threadCount = 1; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); // Signal the event that this task is done ti->taskEvent.set(); } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; ++i) CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i)); } inline void TaskGroup::Sync() { for (int i = 0; i < nextTaskInfoIndex; ++i) { TaskInfo *ti = GetTaskInfo(i); ti->taskEvent.wait(); ti->taskEvent.reset(); } } #endif // ISPC_USE_CONCRT /////////////////////////////////////////////////////////////////////////// // pthreads #ifdef ISPC_USE_PTHREADS static volatile int32_t lock = 0; static int nThreads; static pthread_t *threads = NULL; static pthread_mutex_t taskSysMutex; static std::vector activeTaskGroups; static sem_t *workerSemaphore; static void * lTaskEntry(void *arg) { int threadIndex = (int)((int64_t)arg); int threadCount = nThreads; while (1) { int err; // // Wait on the semaphore until we're woken up due to the arrival of // more work. // if ((err = sem_wait(workerSemaphore)) != 0) { fprintf(stderr, "Error from sem_wait: %s\n", strerror(err)); exit(1); } // // Acquire the mutex // if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); exit(1); } if (activeTaskGroups.size() == 0) { // // Task queue is empty, go back and wait on the semaphore // if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } continue; } // // Get the last task group on the active list and the last task // from its waiting tasks list. // TaskGroup *tg = activeTaskGroups.back(); assert(tg->waitingTasks.size() > 0); int taskNumber = tg->waitingTasks.back(); tg->waitingTasks.pop_back(); if (tg->waitingTasks.size() == 0) { // We just took the last task from this task group, so remove // it from the active list. activeTaskGroups.pop_back(); tg->inActiveList = false; } if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // // And now actually run the task // DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg)); TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, myTask->taskCount(), myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the "number of unfinished tasks" counter in the task // group. // lMemFence(); lAtomicAdd(&tg->numUnfinishedTasks, -1); } pthread_exit(NULL); return 0; } static void InitTaskSystem() { if (threads == NULL) { while (1) { if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { if (threads == NULL) { // We launch one fewer thread than there are cores, // since the main thread here will also grab jobs from // the task queue itself. 
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1; int err; if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) { fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); exit(1); } char name[32]; bool success = false; srand(time(NULL)); for (int i = 0; i < 10; i++) { sprintf(name, "ispc_task.%d.%d", (int)getpid(), (int)rand()); workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); if (workerSemaphore != SEM_FAILED) { success = true; break; } fprintf(stderr, "Failed to create %s\n", name); } if (!success) { fprintf(stderr, "Error creating semaphore (%s): %s\n", name, strerror(errno)); exit(1); } threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t)); for (int i = 0; i < nThreads; ++i) { err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)((long long)i)); if (err != 0) { fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); exit(1); } } activeTaskGroups.reserve(64); } // Make sure all of the above goes to memory before we // clear the lock. lMemFence(); lock = 0; break; } } } } inline void TaskGroup::Launch(int baseCoord, int count) { // // Acquire mutex, add task // int err; if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); exit(1); } // Add the corresponding set of tasks to the waiting-to-be-run list for // this task group. // // FIXME: it's a little ugly to hold a global mutex for this when we // only need to make sure no one else is accessing this task group's // waitingTasks list. (But a small experiment in switching to a // per-TaskGroup mutex showed worse performance!) for (int i = 0; i < count; ++i) waitingTasks.push_back(baseCoord + i); // Add the task group to the global active list if it isn't there // already. if (inActiveList == false) { activeTaskGroups.push_back(this); inActiveList = true; } if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // // Update the count of the number of tasks left to run in this task // group. // lMemFence(); lAtomicAdd(&numUnfinishedTasks, count); // // Post to the worker semaphore to wake up worker threads that are // sleeping waiting for tasks to show up // for (int i = 0; i < count; ++i) if ((err = sem_post(workerSemaphore)) != 0) { fprintf(stderr, "Error from sem_post: %s\n", strerror(err)); exit(1); } } inline void TaskGroup::Sync() { DBG(fprintf(stderr, "syncing %p - %d unfinished\n", tg, numUnfinishedTasks)); while (numUnfinishedTasks > 0) { // All of the tasks in this group aren't finished yet. We'll try // to help out here since we don't have anything else to do... DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", tg, numUnfinishedTasks)); // // Acquire the global task system mutex to grab a task to work on // int err; if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); exit(1); } TaskInfo *myTask = NULL; TaskGroup *runtg = this; if (waitingTasks.size() > 0) { int taskNumber = waitingTasks.back(); waitingTasks.pop_back(); if (waitingTasks.size() == 0) { // There's nothing left to start running from this group, // so remove it from the active task list. 
activeTaskGroups.erase(std::find(activeTaskGroups.begin(), activeTaskGroups.end(), this)); inActiveList = false; } myTask = GetTaskInfo(taskNumber); DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, tg)); } else { // Other threads are already working on all of the tasks in // this group, so we can't help out by running one ourself. // We'll try to run one from another group to make ourselves // useful here. if (activeTaskGroups.size() == 0) { // No active task groups left--there's nothing for us to do. if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // FIXME: We basically end up busy-waiting here, which is // extra wasteful in a world with hyper-threading. It would // be much better to put this thread to sleep on a // condition variable that was signaled when the last task // in this group was finished. #ifndef ISPC_IS_KNC usleep(1); #else _mm_delay_32(8); #endif continue; } // Get a task to run from another task group. runtg = activeTaskGroups.back(); assert(runtg->waitingTasks.size() > 0); int taskNumber = runtg->waitingTasks.back(); runtg->waitingTasks.pop_back(); if (runtg->waitingTasks.size() == 0) { // There's left to start running from this group, so remove // it from the active task list. activeTaskGroups.pop_back(); runtg->inActiveList = false; } myTask = runtg->GetTaskInfo(taskNumber); DBG(fprintf(stderr, "running task %d from other group %p in sync\n", taskNumber, runtg)); } if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // // Do work for _myTask_ // // FIXME: bogus values for thread index/thread count here as well.. myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(), myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the number of unfinished tasks counter // lMemFence(); lAtomicAdd(&runtg->numUnfinishedTasks, -1); } DBG(fprintf(stderr, "sync for %p done!n", tg)); } #endif // ISPC_USE_PTHREADS /////////////////////////////////////////////////////////////////////////// // Cilk Plus #ifdef ISPC_USE_CILK static void InitTaskSystem() { // No initialization needed } inline void TaskGroup::Launch(int baseIndex, int count) { cilk_for(int i = 0; i < count; i++) { TaskInfo *ti = GetTaskInfo(baseIndex + i); // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 ti->func(ti->data, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } inline void TaskGroup::Sync() { } #endif // ISPC_USE_CILK /////////////////////////////////////////////////////////////////////////// // OpenMP #ifdef ISPC_USE_OMP static void InitTaskSystem() { // No initialization needed } inline void TaskGroup::Launch(int baseIndex, int count) { #pragma omp parallel { const int threadIndex = omp_get_thread_num(); const int threadCount = omp_get_num_threads(); #pragma omp for schedule(runtime) for(int i = 0; i < count; i++) { TaskInfo *ti = GetTaskInfo(baseIndex + i); // Actually run the task. 
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } } inline void TaskGroup::Sync() { } #endif // ISPC_USE_OMP /////////////////////////////////////////////////////////////////////////// // Thread Building Blocks #ifdef ISPC_USE_TBB_PARALLEL_FOR static void InitTaskSystem() { // No initialization needed by default //tbb::task_scheduler_init(); } inline void TaskGroup::Launch(int baseIndex, int count) { tbb::parallel_for(0, count, [=](int i) { TaskInfo *ti = GetTaskInfo(baseIndex + i); // Actually run the task. // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount(); ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } inline void TaskGroup::Sync() { } #endif // ISPC_USE_TBB_PARALLEL_FOR #ifdef ISPC_USE_TBB_TASK_GROUP static void InitTaskSystem() { // No initialization needed by default //tbb::task_scheduler_init(); } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; i++) { tbbTaskGroup.run([=]() { TaskInfo *ti = GetTaskInfo(baseIndex + i); // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount(); ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } } inline void TaskGroup::Sync() { tbbTaskGroup.wait(); } #endif // ISPC_USE_TBB_TASK_GROUP /////////////////////////////////////////////////////////////////////////// // ISPC_USE_HPX #ifdef ISPC_USE_HPX static void InitTaskSystem() { } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; ++i) { TaskInfo *ti = GetTaskInfo(baseIndex + i); int threadIndex = i; int threadCount = count; futures.push_back(hpx::async(ti->func, ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2())); } } inline void TaskGroup::Sync() { hpx::wait_all(futures); futures.clear(); } #endif /////////////////////////////////////////////////////////////////////////// #ifndef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define MAX_FREE_TASK_GROUPS 64 static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS]; static inline TaskGroup * AllocTaskGroup() { for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { TaskGroup *tg = freeTaskGroups[i]; if (tg != NULL) { void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg); if (ptr != NULL) { return (TaskGroup *)ptr; } } } return new TaskGroup; } static inline void FreeTaskGroup(TaskGroup *tg) { tg->Reset(); for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { if (freeTaskGroups[i] == NULL) { void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL); if (ptr == NULL) return; } } delete tg; } /////////////////////////////////////////////////////////////////////////// void ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) { const int count = count0*count1*count2; TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); taskGroup = AllocTaskGroup(); *taskGroupPtr = taskGroup; } else taskGroup = (TaskGroup 
*)(*taskGroupPtr); int baseIndex = taskGroup->AllocTaskInfo(count); for (int i = 0; i < count; ++i) { TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i); ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = i; ti->taskCount3d[0] = count0; ti->taskCount3d[1] = count1; ti->taskCount3d[2] = count2; } taskGroup->Launch(baseIndex, count); } void ISPCSync(void *h) { TaskGroup *taskGroup = (TaskGroup *)h; if (taskGroup != NULL) { taskGroup->Sync(); FreeTaskGroup(taskGroup); } } void * ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) { TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); taskGroup = AllocTaskGroup(); *taskGroupPtr = taskGroup; } else taskGroup = (TaskGroup *)(*taskGroupPtr); return taskGroup->AllocMemory(size, alignment); } #else // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define MAX_LIVE_TASKS 1024 pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; // Small structure used to hold the data for each task struct Task { public: TaskFuncType func; void *data; volatile int32_t taskIndex; int taskCount; volatile int numDone; int liveIndex; // index in live task queue inline int noMoreWork() { return taskIndex >= taskCount; } /*! given thread is done working on this task --> decrease num locks */ // inline void lock() { lAtomicAdd(&locks,1); } // inline void unlock() { lAtomicAdd(&locks,-1); } inline int nextJob() { return lAtomicAdd(&taskIndex,1); } inline int numJobs() { return taskCount; } inline void schedule(int idx) { taskIndex = 0; numDone = 0; liveIndex = idx; } inline void run(int idx, int threadIdx); inline void markOneDone() { lAtomicAdd(&numDone,1); } inline void wait() { while (!noMoreWork()) { int next = nextJob(); if (next < numJobs()) run(next, 0); } while (numDone != taskCount) { #ifndef ISPC_IS_KNC usleep(1); #else _mm_delay_32(8); #endif } } }; /////////////////////////////////////////////////////////////////////////// class TaskSys { static int numThreadsRunning; struct LiveTask { volatile int locks; /*!< num locks on this task. gets initialized to NUM_THREADS+1, then counted down by every thread that sees this. this value is only valid when 'active' is set to true */ volatile int active; /*! workers will spin on this until it becomes active */ Task *task; inline void doneWithThis() { lAtomicAdd(&locks,-1); } LiveTask() : active(0), locks(-1) {} }; public: volatile int nextScheduleIndex; /*! 
next index in the task queue where we'll insert a live task */ // inline int inc_begin() { int old = begin; begin = (begin+1)%MAX_TASKS; return old; } // inline int inc_end() { int old = end; end = (end+1)%MAX_TASKS; return old; } LiveTask taskQueue[MAX_LIVE_TASKS]; std::stack taskMem; static TaskSys *global; TaskSys() : nextScheduleIndex(0) { TaskSys::global = this; Task *mem = new Task[MAX_LIVE_TASKS]; //< could actually be more than _live_ tasks for (int i=0;ischedule(liveIndex); taskQueue[liveIndex].locks = numThreadsRunning+1; // num _worker_ threads plus creator taskQueue[liveIndex].active = true; pthread_mutex_unlock(&mutex); } void sync(Task *task) { task->wait(); int liveIndex = task->liveIndex; while (taskQueue[liveIndex].locks > 1) { #ifndef ISPC_IS_KNC usleep(1); #else _mm_delay_32(8); #endif } _mm_free(task->data); pthread_mutex_lock(&mutex); taskMem.push(task); // recycle task index taskQueue[liveIndex].active = false; pthread_mutex_unlock(&mutex); } }; void TaskSys::threadFct() { int myIndex = 0; //lAtomicAdd(&threadIdx,1); while (1) { while (!taskQueue[myIndex].active) { #ifndef ISPC_IS_KNC usleep(4); #else _mm_delay_32(32); #endif continue; } Task *mine = taskQueue[myIndex].task; while (!mine->noMoreWork()) { int job = mine->nextJob(); if (job >= mine->numJobs()) break; mine->run(job,myIndex); } taskQueue[myIndex].doneWithThis(); myIndex = (myIndex+1)%MAX_LIVE_TASKS; } } inline void Task::run(int idx, int threadIdx) { (*this->func)(data,threadIdx,TaskSys::global->nThreads,idx,taskCount); markOneDone(); } void *_threadFct(void *data) { ((TaskSys*)data)->threadFct(); return NULL; } void TaskSys::createThreads() { init(); int reserved = 4; int minid = 2; nThreads = sysconf(_SC_NPROCESSORS_ONLN) - reserved; thread = (pthread_t *)malloc(nThreads * sizeof(pthread_t)); numThreadsRunning = 0; for (int i = 0; i < nThreads; ++i) { pthread_attr_t attr; pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, 2*1024 * 1024); int threadID = minid+i; cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(threadID,&cpuset); int ret = pthread_attr_setaffinity_np(&attr,sizeof(cpuset),&cpuset); int err = pthread_create(&thread[i], &attr, &_threadFct, this); ++numThreadsRunning; if (err != 0) { fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); exit(1); } } } TaskSys * TaskSys::global = NULL; int TaskSys::numThreadsRunning = 0; /////////////////////////////////////////////////////////////////////////// void ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { Task *ti = *(Task**)taskGroupPtr; ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = 0; ti->taskCount = count; TaskSys::global->schedule(ti); } void ISPCSync(void *h) { Task *task = (Task *)h; assert(task); TaskSys::global->sync(task); } void *ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) { TaskSys::init(); Task *task = TaskSys::global->allocOne(); *taskGroupPtr = task; task->data = _mm_malloc(size,alignment); return task->data;//*taskGroupPtr; } #endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED ================================================ FILE: examples/stencil/volta/timing.h ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #ifdef __arm__ #include // There's no easy way to get a hardware clock counter on ARM, so instead // we'll pretend it's a 1GHz processor and then compute pretend cycles // based on elapsed time from gettimeofday(). __inline__ uint64_t rdtsc() { static bool first = true; static struct timeval tv_start; if (first) { gettimeofday(&tv_start, NULL); first = false; return 0; } struct timeval tv; gettimeofday(&tv, NULL); tv.tv_sec -= tv_start.tv_sec; tv.tv_usec -= tv_start.tv_usec; return (1000000ull * tv.tv_sec + tv.tv_usec) * 1000ull; } #include static inline double rtc(void) { struct timeval Tvalue; double etime; struct timezone dummy; gettimeofday(&Tvalue,&dummy); etime = (double) Tvalue.tv_sec + 1.e-6*((double) Tvalue.tv_usec); return etime; } #else // __arm__ #ifdef WIN32 #include #define rdtsc __rdtsc #else // WIN32 __inline__ uint64_t rdtsc() { uint32_t low, high; #ifdef __x86_64 __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx" ); #else __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" ::: "%eax", "%ebx", "%ecx", "%edx" ); #endif __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); return (uint64_t)high << 32 | low; } #include static inline double rtc(void) { struct timeval Tvalue; double etime; struct timezone dummy; gettimeofday(&Tvalue,&dummy); etime = (double) Tvalue.tv_sec + 1.e-6*((double) Tvalue.tv_usec); return etime; } #endif // !WIN32 #endif // !__arm__ static uint64_t start, end; static double tstart, tend; static inline void reset_and_start_timer() { start = rdtsc(); #ifndef WIN32 // Unused in Windows build, rtc() causing link errors tstart = rtc(); #endif } /* Returns the number of millions of elapsed processor cycles since the last reset_and_start_timer() call. */ static inline double get_elapsed_mcycles() { end = rdtsc(); return (end-start) / (1024. * 1024.); } #ifndef WIN32 // Unused in Windows build, rtc() causing link errors static inline double get_elapsed_msec() { tend = rtc(); return (tend - tstart)*1e3; } #endif ================================================ FILE: examples/triangle_xform/Cargo.toml ================================================ [package] name = "triangle_xform" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." 
} [dev-dependencies] rand = "0.7.0" time = "0.1.40" ================================================ FILE: examples/triangle_xform/readme.md ================================================ # Transforming triangle vertices using a transformation matrix ## Description This example contains the SIMD implementation of a common computer graphics task: transforming vertices with a matrix. ## Implementation There are two implementations: - scalar version, uses an array-of-structures layout, where each triangle contains three vertices, and each vertex contains only a 3D position vector; the algorithm operates on **one triangle at a time**. - SIMD version, uses a structure-of-arrays layout, where the structure contains, for each of the X, Y, and Z components of a 3D vector, an array of their values; the algorithm operates on **up to N triangles at once**, where N is number of lanes in a SIMD register. To simplify the implementation, the transformation matrix is composed only of simple rotation, scaling and translation matrices. Both implementations are single-threaded. They can be easily parallelized using [rayon] and dividing the list of triangles into chunks. [rayon]: https://github.com/rayon-rs/rayon ## Benchmark results This crate is mainly intended for educational purposes, since performance improvements will likely come from using the transformed triangles in SIMD layout further down the pipeline. In order to compare the generated results, the tests will convert the SIMD output back into a scalar representation. That being said, the crate's tests also come with a micro-benchmark. It is recommended to increase the `TRIANGLE_COUNT` constant to the point where you get accurate benchmark results. Run the unit tests in release mode, and with `stdout` capture disabled: ```sh cargo test --release -- --no-capture ``` Benchmark results on an Intel i5 with AVX, for 2^24 triangles: | algorithm | time | |-----------|--------| | scalar | 255 ms | | simd | 237 ms | (**Note**: the benchmark does not take into account the time required for transforming the data into an SIMD layout) SIMD is a mere 7% faster than the scalar algorithm, since LLVM was already able to vectorize most of the multiplication code. Since we're not doing a lot of processing on the triangles after transforming them, this "benchmark" is very limited by memory bandwidth. ================================================ FILE: examples/triangle_xform/src/lib.rs ================================================ #![allow(clippy::must_use_candidate)] /// Simple matrix type. /// The memory layout is the same as the one for Direct3D/OpenGL: fourth vector /// represents the translation vector `[x, y, z]`. type Matrix = [[f32; 3]; 4]; /// Scalar implementation of the triangle transform. pub mod scalar; /// SIMD implementation of the triangle transform. pub mod simd; #[cfg(test)] mod tests { use super::*; use rand::prelude::*; const TRIANGLE_COUNT: usize = 1 << 5; #[test] fn compare_scalar_simd() { let dist = rand::distributions::Standard; let mut rng = thread_rng(); // Generate a random triangle let triangles = dist .sample_iter(&mut rng) .take(TRIANGLE_COUNT) .collect::>(); // Generate a random matrix let mat: Matrix = dist.sample(&mut rng); // Benchmark scalar performance let mut scalar_xformed = Vec::new(); let scalar_dur = time::Duration::span(|| { scalar_xformed = triangles .iter() .map(|tri| tri.transform(mat)) .collect::>(); }); // Convert the random triangles to a structure-of-arrays format. 
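// (TRIANGLE_COUNT is a multiple of VecF::lanes(), so every chunk below is a full SIMD batch, which `simd::Triangle::pack` asserts.)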
let triangles = triangles .chunks(simd::VecF::lanes()) .map(|tris| simd::Triangle::pack(tris)) .collect::>(); // Benchmark SIMD performance let mut simd_xformed = Vec::new(); let simd_dur = time::Duration::span(|| { simd_xformed = triangles .iter() .map(|tri| tri.transform(mat)) .collect::>(); }); println!("scalar: {} ms", scalar_dur.num_milliseconds()); println!("simd: {} ms", simd_dur.num_milliseconds()); // Convert SIMD results back to AOS layout for comparison test let simd_xformed = simd_xformed .into_iter() .flat_map(|tri| tri.unpack()) .collect::>(); const EPSILON: f32 = 1E-5; if scalar_xformed != simd_xformed { scalar_xformed.into_iter().zip(simd_xformed.into_iter()).for_each( |(a, b)| { if a != b { a.0.iter().zip(b.0.iter()).for_each( |(v1, v2)| { v1.iter().zip(v2.iter()).for_each( |(a, b)| { assert!( (a - b).abs() <= EPSILON, "Vertex components do not match" ); }, ); }, ); } }, ); } } } ================================================ FILE: examples/triangle_xform/src/scalar.rs ================================================ use super::Matrix; /// Vertex data: a single 3D vector of floats, representing position. pub type Vertex = [f32; 3]; /// Triangle type for array-of-structs layout. #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct Triangle(pub [Vertex; 3]); impl Triangle { /// Transforms this triangle by multiplying with a matrix. #[inline] pub fn transform(self, mat: Matrix) -> Self { let mut xformed: [Vertex; 3] = Default::default(); let vertices = self.0; let col_a = mat[0]; let col_b = mat[1]; let col_c = mat[2]; let col_d = mat[3]; for k in 0..3 { let v = vertices[k]; let x = col_a[0] * v[0] + col_b[0] * v[1] + col_c[0] * v[2] + col_d[0]; let y = col_a[1] * v[0] + col_b[1] * v[1] + col_c[1] * v[2] + col_d[1]; let z = col_a[2] * v[0] + col_b[2] * v[1] + col_c[2] * v[2] + col_d[2]; xformed[k] = [x, y, z]; } Self(xformed) } } #[cfg(test)] mod tests { use super::*; use rand::{distributions::Standard, prelude::*}; impl Distribution for Standard { fn sample(&self, rng: &mut R) -> Triangle { Triangle(self.sample(rng)) } } #[test] fn translate() { let tri = Triangle([[-0.5, -0.5, 0.0], [0.5, -0.5, 0.0], [0.0, 0.5, 0.0]]); let (x, y, z) = (-0.25, 0.5, 1.0); let matrix = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [x, y, z]]; let tri = tri.transform(matrix); let expected = Triangle([[-0.75, 0.0, 1.0], [0.25, 0.0, 1.0], [-0.25, 1.0, 1.0]]); assert_eq!(tri, expected); } } ================================================ FILE: examples/triangle_xform/src/simd.rs ================================================ use super::Matrix; /// SIMD vector of floats pub type VecF = packed_simd::f32x8; /// SIMD batch of N triangles, where N is SIMD width. #[derive(Debug, Default, Copy, Clone)] pub struct Triangle { pub x: [VecF; 3], pub y: [VecF; 3], pub z: [VecF; 3], } impl Triangle { /// Combines N scalar triangles into a single SIMD triangle. pub fn pack(tris: &[crate::scalar::Triangle]) -> Self { assert_eq!(tris.len(), VecF::lanes()); let mut x = [VecF::splat(0.0); 3]; let mut y = [VecF::splat(0.0); 3]; let mut z = [VecF::splat(0.0); 3]; (0..3).for_each(|k| { let x = &mut x[k]; let y = &mut y[k]; let z = &mut z[k]; (0..VecF::lanes()).for_each(|i| { let t = tris[i]; let vertex = t.0[k]; let tx = vertex[0]; let ty = vertex[1]; let tz = vertex[2]; *x = x.replace(i, tx); *y = y.replace(i, ty); *z = z.replace(i, tz); }); }); Self { x, y, z } } /// Unpacks the N scalar triangles into an array-of-structures layout. 
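/// This is the inverse of `pack`: lane `i` of each component vector becomes the vertex data of the `i`-th scalar triangle.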
pub fn unpack(self) -> Vec { let mut tris = [crate::scalar::Triangle::default(); VecF::lanes()]; (0..3).for_each(|k| { (0..VecF::lanes()).for_each(|i| { let vtx = &mut tris[i].0; vtx[k][0] = self.x[k].extract(i); vtx[k][1] = self.y[k].extract(i); vtx[k][2] = self.z[k].extract(i); }); }); tris.to_vec() } /// Transforms this triangle by multiplying with a matrix. #[inline] pub fn transform(self, mat: Matrix) -> Self { let mut tri = Self::default(); let x = self.x; let y = self.y; let z = self.z; let col_a = mat[0]; let col_b = mat[1]; let col_c = mat[2]; let col_d = mat[3]; for k in 0..3 { let x = x[k]; let y = y[k]; let z = z[k]; tri.x[k] = col_a[0] * x + col_b[0] * y + col_c[0] * z + col_d[0]; tri.y[k] = col_a[1] * x + col_b[1] * y + col_c[1] * z + col_d[1]; tri.z[k] = col_a[2] * x + col_b[2] * y + col_c[2] * z + col_d[2]; } tri } } ================================================ FILE: micro_benchmarks/Cargo.toml ================================================ [package] name = "micro_benchmarks" version = "0.1.0" authors = ["gnzlbg "] autobenches = false edition = "2018" [dev-dependencies] packed_simd = { package = "packed_simd", path = ".." } paste = "0.1.3" criterion = "0.3" [profile.bench] opt-level = 3 debug = false lto = 'fat' debug-assertions = false codegen-units = 1 [[bench]] name = "mask_reductions" harness = false ================================================ FILE: micro_benchmarks/benches/mask_reductions.rs ================================================ //! Benchmarks for the mask reductions `all`, `any`, and `none`. #![deny(rust_2018_idioms)] #![feature(test)] use packed_simd::*; use test::black_box; use criterion::{Benchmark, Criterion, Throughput}; const NO_ITERATIONS: u32 = 1_000; macro_rules! bench { ($id:ident) => { paste::item! { fn [<$id _all>](c: &mut Criterion) { c.bench( stringify!($id), Benchmark::new("all", |b| b.iter(|| { let mut x: $id = Default::default(); for _ in 0..NO_ITERATIONS { if black_box(x).all() { black_box(&mut x); } } })).throughput(Throughput::Elements(NO_ITERATIONS)) ); } fn [<$id _any>](c: &mut Criterion) { c.bench( stringify!($id), Benchmark::new("any", |b| b.iter(|| { let mut x: $id = Default::default(); for _ in 0..NO_ITERATIONS { if black_box(x).any() { black_box(&mut x); } } })).throughput(Throughput::Elements(NO_ITERATIONS)) ); } fn [<$id _none>](c: &mut Criterion) { c.bench( stringify!($id), Benchmark::new("none", |b| b.iter(|| { let mut x: $id = Default::default(); for _ in 0..NO_ITERATIONS { if black_box(x).none() { black_box(&mut x); } } })).throughput(Throughput::Elements(NO_ITERATIONS)) ); } } }; ($($id:ident),*) => { $( bench!($id); )* paste::item! 
{ criterion_group!( benches, $([<$id _all>]),*, $([<$id _any>]),*, $([<$id _none>]),* ); } }; } bench!( m8x2, // 16-bit wide types m8x8, m16x4, m32x2, // 64-bit wide types m8x16, m16x8, m32x4, m64x2, m128x1, // 128-bit wide types m8x32, m16x16, m32x8, m64x4, m128x2, // 256-bit wide types m8x64, m16x32, m32x16, m64x8, m128x4 // 512-bit wide types ); criterion_main!(benches); ================================================ FILE: micro_benchmarks/rust-toolchain ================================================ nightly ================================================ FILE: perf-guide/.gitignore ================================================ /book ================================================ FILE: perf-guide/book.toml ================================================ [book] authors = ["Gonzalo Brito Gadeschi", "Gabriel Majeri"] multilingual = false src = "src" title = "Rust SIMD Performance Guide" description = "This book describes how to write performant SIMD code in Rust." [build] create-missing = false [output.html] additional-css = ["./src/ascii.css"] ================================================ FILE: perf-guide/src/SUMMARY.md ================================================ # Summary [Introduction](./introduction.md) - [Floating-point Math](./float-math/fp.md) - [Short-vector Math Library](./float-math/svml.md) - [Approximate functions](./float-math/approx.md) - [Fused multiply-accumulate](./float-math/fma.md) - [Target features](./target-feature/features.md) - [Using `RUSTFLAGS`](./target-feature/rustflags.md) - [Using the `target_feature` attribute](./target-feature/attribute.md) - [Interaction with inlining](./target-feature/inlining.md) - [Detecting features at runtime](./target-feature/runtime.md) - [Bounds checking](./bound_checks.md) - [Vertical and horizontal operations](./vert-hor-ops.md) - [Performance profiling](./prof/profiling.md) - [Profiling on Linux](./prof/linux.md) - [Using machine code analyzers](./prof/mca.md) ================================================ FILE: perf-guide/src/ascii.css ================================================ code { /* "Source Code Pro" breaks ASCII art */ font-family: Consolas, "Ubuntu Mono", Menlo, "DejaVu Sans Mono", monospace; } ================================================ FILE: perf-guide/src/bound_checks.md ================================================ # Bounds checking Reading and writing packed vectors to/from slices is checked by default. Independently of the configuration options used, the safe functions: * `Simd<[T; N]>::from_slice_aligned(& s[..])` * `Simd<[T; N]>::write_to_slice_aligned(&mut s[..])` always check that: * the slice is big enough to hold the vector * the slice is suitably aligned to perform an aligned load/store for a `Simd<[T; N]>` (this alignment is often much larger than that of `T`). There are `_unaligned` versions that use unaligned load and stores, as well as `unsafe` `_unchecked` that do not perform any checks iff `debug-assertions = false` / `debug = false`. That is, the `_unchecked` methods do still assert size and alignment in debug builds and could also do so in release builds depending on the configuration options. These assertions do often significantly impact performance and you should be aware of them. 
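For illustration, here is a minimal sketch of how these slice methods behave, assuming an `f32x4` vector and the `_unaligned` variants (arbitrary slices are rarely aligned enough for the `_aligned` ones):

```rust
use packed_simd::f32x4;

fn main() {
    let v = [0.0_f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];

    // Checked load: panics if the slice has fewer than 4 elements.
    let a = f32x4::from_slice_unaligned(&v[..4]);

    // Unchecked load: the caller must guarantee the length; with
    // debug-assertions enabled this still asserts.
    let b = unsafe { f32x4::from_slice_unaligned_unchecked(&v[4..]) };

    // Checked store back into a slice.
    let mut out = [0.0_f32; 4];
    (a + b).write_to_slice_unaligned(&mut out);
    assert_eq!(out, [4.0, 6.0, 8.0, 10.0]);
}
```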
================================================ FILE: perf-guide/src/float-math/approx.md ================================================ # Approximate functions ================================================ FILE: perf-guide/src/float-math/fma.md ================================================ # Fused Multiply Add ================================================ FILE: perf-guide/src/float-math/fp.md ================================================ # Floating-point math This chapter contains information pertaining to working with floating-point numbers. ================================================ FILE: perf-guide/src/float-math/svml.md ================================================ # Short Vector Math Library ================================================ FILE: perf-guide/src/introduction.md ================================================ # Introduction ## What is SIMD ## History of SIMD in Rust ## Discover packed_simd Writing fast and portable SIMD algorithms using `packed_simd` is, unfortunately, not trivial. There are many pitfalls that one should be aware of, and some idioms that help avoid those pitfalls. This book attempts to document these best practices and provides practical examples on how to apply the tips to _your_ code. ================================================ FILE: perf-guide/src/prof/linux.md ================================================ # Performance profiling on Linux ## Using `perf` [perf](https://perf.wiki.kernel.org/) is the most powerful performance profiler for Linux, featuring support for various hardware Performance Monitoring Units, as well as integration with the kernel's performance events framework. We will only look at how the `perf` command can be used to profile SIMD code. Full system profiling is outside of the scope of this book. ### Recording The first step is to record a program's execution during an average workload. It helps if you can isolate the parts of your program which have performance issues, and set up a benchmark which can be easily (re)run. Build the benchmark binary in release mode, after having enabled debug info: ```sh $ cargo build --release Finished release [optimized + debuginfo] target(s) in 0.02s ``` Then use the `perf record` subcommand: ```sh $ perf record --call-graph=dwarf ./target/release/my-program [ perf record: Woken up 10 times to write data ] [ perf record: Captured and wrote 2,356 MB perf.data (292 samples) ] ``` Instead of using `--call-graph=dwarf`, which can become pretty slow, you can use `--call-graph=lbr` if you have a processor with support for Last Branch Record (i.e. Intel Haswell and newer). `perf` will, by default, record the count of CPU cycles it takes to execute various parts of your program. You can use the `-e` command line option to enable other performance events, such as `cache-misses`. Use `perf list` to get a list of all hardware counters supported by your CPU. ### Viewing the report The next step is getting a bird's eye view of the program's execution. `perf` provides a `ncurses`-based interface which will get you started. Use `perf report` to open a visualization of your program's performance: ```sh perf report --hierarchy -M intel ``` `--hierarchy` will display a tree-like structure of where your program spent most of its time. `-M intel` enables disassembly output with Intel syntax, which is subjectively more readable than the default AT&T syntax. Here is the output from profiling the `nbody` benchmark: ``` - 100,00% nbody - 94,18% nbody + 93,48% [.]
nbody_lib::simd::advance + 0,70% [.] nbody_lib::run + 5,06% libc-2.28.so ``` If you move with the arrow keys to any node in the tree, you can then press `a` to have `perf` _annotate_ that node. This means it will: - disassemble the function - associate every instruction with the percentage of time which was spent executing it - interleave the disassembly with the source code, assuming it found the debug symbols (you can use `s` to toggle this behaviour) `perf` will, by default, open the instruction which it identified as being the hottest spot in the function: ``` 0,76 │ movapd xmm2,xmm0 0,38 │ movhlps xmm2,xmm0 │ addpd xmm2,xmm0 │ unpcklpd xmm1,xmm2 12,50 │ sqrtpd xmm0,xmm1 1,52 │ mulpd xmm0,xmm1 ``` In this case, `sqrtpd` will be highlighted in red, since that's the instruction which the CPU spends most of its time executing. ## Using Valgrind Valgrind is a set of tools which initially helped C/C++ programmers find unsafe memory accesses in their code. Nowadays the project also has - a heap profiler called `massif` - a cache utilization profiler called `cachegrind` - a call-graph performance profiler called `callgrind` ================================================ FILE: perf-guide/src/prof/mca.md ================================================ # Machine code analysis tools ## The microarchitecture of modern CPUs While you might have heard of Instruction Set Architectures, such as `x86` or `arm` or `mips`, the term _microarchitecture_ (also written here as _µ-arch_) refers to the internal details of an actual family of CPUs, such as Intel's _Haswell_ or AMD's _Jaguar_. Replacing scalar code with SIMD code will improve performance on all CPUs supporting the required vector extensions. However, due to microarchitectural differences, the actual speed-up at runtime might vary. **Example**: a simple example arises when optimizing for AMD K8 CPUs. The assembly generated for an empty function should look like this: ```asm nop ret ``` The `nop` is used to align the `ret` instruction for better performance. However, the compiler will actually generate the following code: ```asm repz ret ``` The `repz` instruction will repeat the following instruction until a certain condition is met. Of course, in this situation, the function will simply immediately return, and the `ret` instruction is still aligned. However, AMD K8's branch predictor performs better with the latter code. For those looking to absolutely maximize performance for a certain target µ-arch, you will have to read some CPU manuals, or ask the compiler to do it for you with `-C target-cpu`. ### Summary of CPU internals Modern processors are able to execute instructions out-of-order for better performance, by utilizing tricks such as [branch prediction], [instruction pipelining], or [superscalar execution]. [branch prediction]: https://en.wikipedia.org/wiki/Branch_predictor [instruction pipelining]: https://en.wikipedia.org/wiki/Instruction_pipelining [superscalar execution]: https://en.wikipedia.org/wiki/Superscalar_processor SIMD instructions are also subject to these optimizations, meaning it can get pretty difficult to determine where the slowdown happens.
For example, if the profiler reports a store operation is slow, one of two things could be happening: - the store is limited by the CPU's memory bandwidth, which is actually an ideal scenario, all things considered; - memory bandwidth is nowhere near its peak, but the value to be stored is at the end of a long chain of operations, and this store is where the profiler encountered the pipeline stall; Since most profilers are simple tools which don't understand the subtleties of instruction scheduling, you will need more specialized tools to tell these situations apart. ## Analyzing the machine code Certain tools have knowledge of internal CPU microarchitecture, i.e. they know - how many physical [register files] a CPU actually has - what the latency / throughput of an instruction is - what [µ-ops] are generated for a set of instructions and many other architectural details. [register files]: https://en.wikipedia.org/wiki/Register_file [µ-ops]: https://en.wikipedia.org/wiki/Micro-operation These tools are therefore able to provide accurate information as to why some instructions are inefficient, and where the bottleneck is. The disadvantage is that the output of these tools requires advanced knowledge of the target architecture to understand, i.e. they **cannot** point out what the cause of the issue is explicitly. ## Intel's Architecture Code Analyzer (IACA) [IACA] is a free tool offered by Intel for analyzing the performance of various computational kernels. Being a proprietary, closed source tool, it _only_ supports Intel's µ-arches. [IACA]: https://software.intel.com/en-us/articles/intel-architecture-code-analyzer ## llvm-mca ================================================ FILE: perf-guide/src/prof/profiling.md ================================================ # Performance profiling While the rest of the book provides practical advice on how to improve the performance of SIMD code, this chapter is dedicated to [**performance profiling**][profiling]. Profiling consists of recording a program's execution in order to identify program hotspots. **Important**: most profilers require debug information in order to accurately link the program hotspots back to the corresponding source code lines. Rust will disable debug info generation by default for optimized builds, but you can change that [in your `Cargo.toml`][cargo-ref]. [profiling]: https://en.wikipedia.org/wiki/Profiling_(computer_programming) [cargo-ref]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-profile-sections ================================================ FILE: perf-guide/src/target-feature/attribute.md ================================================ # The `target_feature` attribute ================================================ FILE: perf-guide/src/target-feature/features.md ================================================ # Enabling target features Not all processors of a certain architecture will have SIMD processing units, and using a SIMD instruction which is not supported will trigger undefined behavior. To allow building safe, portable programs, the Rust compiler will **not**, by default, generate any sort of vector instructions, unless it can statically determine they are supported. For example, on AMD64, SSE2 support is architecturally guaranteed. The `x86_64-apple-darwin` target enables up to SSSE3. To get a definitive list of which features are enabled by default on various platforms, refer to the target specifications [in the compiler's source code][targets].
[targets]: https://github.com/rust-lang/rust/tree/master/src/librustc_target/spec ================================================ FILE: perf-guide/src/target-feature/inlining.md ================================================ # Inlining ================================================ FILE: perf-guide/src/target-feature/practice.md ================================================ # Target features in practice Using `RUSTFLAGS` will allow the crate being compiled, as well as all its transitive dependencies, to use certain target features. A technique used to avoid undefined behavior at runtime is to compile and ship multiple binaries, each compiled with a certain set of features. This might not be feasible in some cases, and can quickly get out of hand as more and more vector extensions are added to an architecture. Rust can be more flexible: you can build a single binary/library which automatically picks the best supported vector instructions depending on the host machine. The trick consists of monomorphizing parts of the code during building, and then using run-time feature detection to select the right code path when running. **NOTE** (x86 specific): because the AVX (256-bit) registers extend the existing SSE (128-bit) registers, mixing SSE and AVX instructions in a program can cause performance issues. The solution is to compile all code, even the code written with 128-bit vectors, with the AVX target feature enabled. This will cause the compiler to prefix the generated instructions with the [VEX] prefix. [VEX]: https://en.wikipedia.org/wiki/VEX_prefix ================================================ FILE: perf-guide/src/target-feature/runtime.md ================================================ # Detecting host features at runtime ================================================ FILE: perf-guide/src/target-feature/rustflags.md ================================================ # Using RUSTFLAGS One of the easiest ways to benefit from SIMD is to allow the compiler to generate code using certain vector instruction extensions. The environment variable `RUSTFLAGS` can be used to pass options for code generation to the Rust compiler. These flags will affect **all** compiled crates. There are two flags which can be used to enable specific vector extensions: ## target-feature - Syntax: `-C target-feature=<features>` - Provides the compiler with a comma-separated set of instruction extensions to enable. **Example**: Use `-C target-feature=+sse3,+avx` to enable generating instructions for [Streaming SIMD Extensions 3](https://en.wikipedia.org/wiki/SSE3) and [Advanced Vector Extensions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions). - To list target triples for all targets supported by Rust, use: ```sh rustc --print target-list ``` - To list all supported target features for a certain target triple, use: ```sh rustc --target=${TRIPLE} --print target-features ``` - Note that all CPU features are independent, and will have to be enabled individually. **Example**: Setting `-C target-feature=+avx2` will _not_ enable `fma`, even though all CPUs which support AVX2 also support FMA. To enable both, one has to use `-C target-feature=+avx2,+fma`. - Some features also depend on other features, which need to be enabled for the target instructions to be generated. **Example**: Unless `v7` is specified as the target CPU (see below), to enable NEON on ARM it is necessary to use `-C target-feature=+v7,+neon`.
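Whether a feature actually ended up enabled for a given build can be checked from Rust code with the `cfg` mechanism. The following is a minimal sketch (the function name `simd_support` is purely illustrative; feature names are spelled the same way as in `-C target-feature`):

```rust
// Selected at compile time, e.g. when building with
// RUSTFLAGS='-C target-feature=+avx2,+fma'.
#[cfg(target_feature = "avx2")]
fn simd_support() -> &'static str {
    "AVX2 enabled at compile time"
}

#[cfg(not(target_feature = "avx2"))]
fn simd_support() -> &'static str {
    "AVX2 not enabled at compile time"
}

fn main() {
    println!("{}", simd_support());
    // `cfg!` exposes the same information as a boolean expression.
    println!("FMA enabled: {}", cfg!(target_feature = "fma"));
}
```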
## target-cpu - Syntax: `-C target-cpu=` - Sets the identifier of a CPU family / model for which to build and optimize the code. **Example**: `RUSTFLAGS='-C target-cpu=cortex-a75'` - To list all supported target CPUs for a certain target triple, use: ```sh rustc --target=${TRIPLE} --print target-cpus ``` **Example**: ```sh rustc --target=i686-pc-windows-msvc --print target-cpus ``` - The compiler will translate this into a list of target features. Therefore, individual feature checks (`#[cfg(target_feature = "...")]`) will still work properly. - It will cause the code generator to optimize the generated code for that specific CPU model. - Using `native` as the CPU model will cause Rust to generate and optimize code for the CPU running the compiler. It is useful when building programs which you plan to only use locally. This should never be used when the generated programs are meant to be run on other computers, such as when packaging for distribution or cross-compiling. ================================================ FILE: perf-guide/src/vert-hor-ops.md ================================================ # Vertical and horizontal operations In SIMD terminology, each vector has a certain "width" (number of lanes). A vector processor is able to perform two kinds of operations on a vector: - Vertical operations: operate on two vectors of the same width, result has same width **Example**: vertical addition of two `f32x4` vectors %0 == | 2 | -3.5 | 0 | 7 | + + + + %1 == | 4 | 1.5 | -1 | 0 | = = = = %0 + %1 == | 6 | -2 | -1 | 7 | - Horizontal operations: reduce the elements of two vectors in some way, the result's elements combine information from the two original ones **Example**: horizontal addition of two `u64x2` vectors %0 == | 1 | 3 | └─+───┘ └───────┐ │ %1 == | 4 | -1 | │ └─+──┘ │ └───┐ │ │ │ ┌─────│───┘ ▼ ▼ %0 + %1 == | 4 | 3 | ## Performance consideration of horizontal operations The result of vertical operations, like vector negation: `-a`, for a given lane, does not depend on the result of the operation for the other lanes. The result of horizontal operations, like the vector `sum` reduction: `a.sum()`, depends on the value of all vector lanes. In virtually all architectures vertical operations are fast, while horizontal operations are, by comparison, very slow. Consider the following two functions for computing the sum of all `f32` values in a slice: ```rust fn fast_sum(x: &[f32]) -> f32 { assert!(x.len() % 4 == 0); let mut sum = f32x4::splat(0.); // [0., 0., 0., 0.] for i in (0..x.len()).step_by(4) { sum += f32x4::from_slice_unaligned(&x[i..]); } sum.sum() } fn slow_sum(x: &[f32]) -> f32 { assert!(x.len() % 4 == 0); let mut sum: f32 = 0.; for i in (0..x.len()).step_by(4) { sum += f32x4::from_slice_unaligned(&x[i..]).sum(); } sum } ``` The inner loop over the slice is where the bulk of the work actually happens. There, the `fast_sum` function perform vertical operations into a vector, doing a single horizontal reduction at the end, while the `slow_sum` function performs horizontal vector operations inside of the loop. On all widely-used architectures, `fast_sum` is a large constant factor faster than `slow_sum`. You can run the [slice_sum]() example and see for yourself. On the particular machine tested there the algorithm using the horizontal vector addition is 2.7x slower than the one using vertical vector operations! 
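As a small concrete sketch of the distinction, here are the two kinds of operations expressed with `packed_simd` types, reusing the lane values from the vertical-addition diagram above:

```rust
use packed_simd::f32x4;

fn main() {
    let a = f32x4::new(2.0, -3.5, 0.0, 7.0);
    let b = f32x4::new(4.0, 1.5, -1.0, 0.0);

    // Vertical operation: lane-wise addition, each lane independent.
    assert_eq!(a + b, f32x4::new(6.0, -2.0, -1.0, 7.0));

    // Horizontal operation: reduces all lanes of one vector to a scalar.
    assert_eq!(a.sum(), 5.5);
}
```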
================================================ FILE: rust-toolchain ================================================ nightly ================================================ FILE: rustfmt.toml ================================================ max_width = 110 use_small_heuristics = "Max" wrap_comments = true edition = "2018" error_on_line_overflow = true ================================================ FILE: src/api/bit_manip.rs ================================================ //! Bit manipulations. macro_rules! impl_bit_manip { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns the number of ones in the binary representation of /// the lanes of `self`. #[inline] pub fn count_ones(self) -> Self { super::codegen::bit_manip::BitManip::ctpop(self) } /// Returns the number of zeros in the binary representation of /// the lanes of `self`. #[inline] pub fn count_zeros(self) -> Self { super::codegen::bit_manip::BitManip::ctpop(!self) } /// Returns the number of leading zeros in the binary /// representation of the lanes of `self`. #[inline] pub fn leading_zeros(self) -> Self { super::codegen::bit_manip::BitManip::ctlz(self) } /// Returns the number of trailing zeros in the binary /// representation of the lanes of `self`. #[inline] pub fn trailing_zeros(self) -> Self { super::codegen::bit_manip::BitManip::cttz(self) } } test_if! { $test_tt: paste::item! { #[allow(overflowing_literals)] pub mod [<$id _bit_manip>] { #![allow(const_item_mutation)] use super::*; const LANE_WIDTH: usize = mem::size_of::<$elem_ty>() * 8; macro_rules! test_func { ($x:expr, $func:ident) => {{ let mut actual = $x; for i in 0..$id::lanes() { actual = actual.replace( i, $x.extract(i).$func() as $elem_ty ); } let expected = $x.$func(); assert_eq!(actual, expected); }}; } const BYTES: [u8; 64] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ]; fn load_bytes() -> $id { let elems: &mut [$elem_ty] = unsafe { slice::from_raw_parts_mut( BYTES.as_mut_ptr() as *mut $elem_ty, $id::lanes(), ) }; $id::from_slice_unaligned(elems) } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn count_ones() { test_func!($id::splat(0), count_ones); test_func!($id::splat(!0), count_ones); test_func!(load_bytes(), count_ones); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn count_zeros() { test_func!($id::splat(0), count_zeros); test_func!($id::splat(!0), count_zeros); test_func!(load_bytes(), count_zeros); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn leading_zeros() { test_func!($id::splat(0), leading_zeros); test_func!($id::splat(1), leading_zeros); // some implementations use `pshufb` which has unique // behavior when the 8th bit is set. 
test_func!($id::splat(0b1000_0010), leading_zeros); test_func!($id::splat(!0), leading_zeros); test_func!( $id::splat(1 << (LANE_WIDTH - 1)), leading_zeros ); test_func!(load_bytes(), leading_zeros); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn trailing_zeros() { test_func!($id::splat(0), trailing_zeros); test_func!($id::splat(1), trailing_zeros); test_func!($id::splat(0b1000_0010), trailing_zeros); test_func!($id::splat(!0), trailing_zeros); test_func!( $id::splat(1 << (LANE_WIDTH - 1)), trailing_zeros ); test_func!(load_bytes(), trailing_zeros); } } } } }; } ================================================ FILE: src/api/bitmask.rs ================================================ //! Bitmask API macro_rules! impl_bitmask { ($id:ident | $ibitmask_ty:ident | ($set:expr, $clear:expr) | $test_tt:tt) => { impl $id { /// Creates a bitmask with the MSB of each vector lane. /// /// If the vector has less than 8 lanes, the bits that do not /// correspond to any vector lanes are cleared. #[inline] pub fn bitmask(self) -> $ibitmask_ty { unsafe { codegen::llvm::simd_bitmask(self.0) } } } test_if! { $test_tt: paste::item! { #[cfg(not( // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/210 target_endian = "big" ))] pub mod [<$id _bitmask>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn bitmask() { // clear all lanes let vec = $id::splat($clear as _); let bitmask: $ibitmask_ty = 0; assert_eq!(vec.bitmask(), bitmask); // set even lanes let mut vec = $id::splat($clear as _); for i in 0..$id::lanes() { if i % 2 == 0 { vec = vec.replace(i, $set as _); } } // create bitmask with even lanes set: let mut bitmask: $ibitmask_ty = 0; for i in 0..$id::lanes() { if i % 2 == 0 { bitmask |= 1 << i; } } assert_eq!(vec.bitmask(), bitmask); // set odd lanes let mut vec = $id::splat($clear as _); for i in 0..$id::lanes() { if i % 2 != 0 { vec = vec.replace(i, $set as _); } } // create bitmask with odd lanes set: let mut bitmask: $ibitmask_ty = 0; for i in 0..$id::lanes() { if i % 2 != 0 { bitmask |= 1 << i; } } assert_eq!(vec.bitmask(), bitmask); // set all lanes let vec = $id::splat($set as _); let mut bitmask: $ibitmask_ty = 0; for i in 0..$id::lanes() { bitmask |= 1 << i; } assert_eq!(vec.bitmask(), bitmask); } } } } }; } ================================================ FILE: src/api/cast/macros.rs ================================================ //! Macros implementing `FromCast` macro_rules! impl_from_cast_ { ($id:ident[$test_tt:tt]: $from_ty:ident) => { impl crate::api::cast::FromCast<$from_ty> for $id { #[inline] fn from_cast(x: $from_ty) -> Self { use crate::llvm::simd_cast; debug_assert_eq!($from_ty::lanes(), $id::lanes()); Simd(unsafe { simd_cast(x.0) }) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _from_cast_ $from_ty>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn test() { assert_eq!($id::lanes(), $from_ty::lanes()); } } } } }; } macro_rules! impl_from_cast { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_cast_!($id[$test_tt]: $from_ty); )* } } macro_rules! 
impl_from_cast_mask_ { ($id:ident[$test_tt:tt]: $from_ty:ident) => { impl crate::api::cast::FromCast<$from_ty> for $id { #[inline] fn from_cast(x: $from_ty) -> Self { debug_assert_eq!($from_ty::lanes(), $id::lanes()); x.ne($from_ty::default()) .select($id::splat(true), $id::splat(false)) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _from_cast_ $from_ty>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn test() { assert_eq!($id::lanes(), $from_ty::lanes()); let x = $from_ty::default(); let m: $id = x.cast(); assert!(m.none()); } } } } }; } macro_rules! impl_from_cast_mask { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_cast_mask_!($id[$test_tt]: $from_ty); )* } } #[allow(unused)] macro_rules! impl_into_cast { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_cast_!($from_ty[$test_tt]: $id); )* } } ================================================ FILE: src/api/cast/v128.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 128-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!(i8x16[test_v128]: u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!(u8x16[test_v128]: i8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast_mask!(m8x16[test_v128]: i8x16, u8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!( i16x8[test_v128]: i8x8, u8x8, m8x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u16x8[test_v128]: i8x8, u8x8, m8x8, i16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m16x8[test_v128]: i8x8, u8x8, m8x8, i16x8, u16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( f32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( f64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( isizex2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, usizex2, msizex2 ); impl_from_cast!( 
usizex2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, msizex2 ); impl_from_cast_mask!( msizex2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2 ); // FIXME[test_v128]: 64-bit single element vectors into_cast impls impl_from_cast!(i128x1[test_v128]: u128x1, m128x1); impl_from_cast!(u128x1[test_v128]: i128x1, m128x1); impl_from_cast!(m128x1[test_v128]: i128x1, u128x1); ================================================ FILE: src/api/cast/v16.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 16-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!( i8x2[test_v16]: u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u8x2[test_v16]: i8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m8x2[test_v16]: i8x2, u8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); ================================================ FILE: src/api/cast/v256.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 256-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!(i8x32[test_v256]: u8x32, m8x32, i16x32, u16x32, m16x32); impl_from_cast!(u8x32[test_v256]: i8x32, m8x32, i16x32, u16x32, m16x32); impl_from_cast_mask!(m8x32[test_v256]: i8x32, u8x32, i16x32, u16x32, m16x32); impl_from_cast!(i16x16[test_v256]: i8x16, u8x16, m8x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!(u16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast_mask!(m16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, u16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!( i32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( f32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( f64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i128x2[test_v256]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, 
m64x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u128x2[test_v256]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m128x2[test_v256]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, m64x2, f64x2, i128x2, u128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( isizex4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, usizex4, msizex4 ); impl_from_cast!( usizex4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, msizex4 ); impl_from_cast_mask!( msizex4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4 ); ================================================ FILE: src/api/cast/v32.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 32-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!( i8x4[test_v32]: u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u8x4[test_v32]: i8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m8x4[test_v32]: i8x4, u8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i16x2[test_v32]: i8x2, u8x2, m8x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u16x2[test_v32]: i8x2, u8x2, m8x2, i16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m16x2[test_v32]: i8x2, u8x2, m8x2, i16x2, u16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); ================================================ FILE: src/api/cast/v512.rs ================================================ //! 
`FromCast` and `IntoCast` implementations for portable 512-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!(i8x64[test_v512]: u8x64, m8x64); impl_from_cast!(u8x64[test_v512]: i8x64, m8x64); impl_from_cast_mask!(m8x64[test_v512]: i8x64, u8x64); impl_from_cast!(i16x32[test_v512]: i8x32, u8x32, m8x32, u16x32, m16x32); impl_from_cast!(u16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, m16x32); impl_from_cast_mask!(m16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, u16x32); impl_from_cast!(i32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, u32x16, f32x16, m32x16); impl_from_cast!(u32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, f32x16, m32x16); impl_from_cast!(f32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, m32x16); impl_from_cast_mask!(m32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16); impl_from_cast!( i64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( f64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i128x4[test_v512]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u128x4[test_v512]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m128x4[test_v512]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, m64x4, f64x4, i128x4, u128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( isizex8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, usizex8, msizex8 ); impl_from_cast!( usizex8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, msizex8 ); impl_from_cast_mask!( msizex8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8 ); ================================================ FILE: src/api/cast/v64.rs ================================================ //! 
`FromCast` and `IntoCast` implementations for portable 64-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!( i8x8[test_v64]: u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u8x8[test_v64]: i8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m8x8[test_v64]: i8x8, u8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i16x4[test_v64]: i8x4, u8x4, m8x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u16x4[test_v64]: i8x4, u8x4, m8x4, i16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m16x4[test_v64]: i8x4, u8x4, m8x4, i16x4, u16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( f32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); ================================================ FILE: src/api/cast.rs ================================================ //! Implementation of `FromCast` and `IntoCast`. #![allow(clippy::module_name_repetitions)] /// Numeric cast from `T` to `Self`. /// /// > Note: This is a temporary workaround until the conversion traits /// specified > in [RFC2484] are implemented. /// /// Numeric cast between vectors with the same number of lanes, such that: /// /// * casting integer vectors whose lane types have the same size (e.g. `i32xN` /// -> `u32xN`) is a **no-op**, /// /// * casting from a larger integer to a smaller integer (e.g. `u32xN` -> /// `u8xN`) will **truncate**, /// /// * casting from a smaller integer to a larger integer (e.g. `u8xN` -> /// `u32xN`) will: /// * **zero-extend** if the source is unsigned, or /// * **sign-extend** if the source is signed, /// /// * casting from a float to an integer will **round the float towards zero**, /// /// * casting from an integer to float will produce the floating point /// representation of the integer, **rounding to nearest, ties to even**, /// /// * casting from an `f32` to an `f64` is perfect and lossless, /// /// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**. /// /// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484 pub trait FromCast: crate::marker::Sized { /// Numeric cast from `T` to `Self`. fn from_cast(_: T) -> Self; } /// Numeric cast from `Self` to `T`. /// /// > Note: This is a temporary workaround until the conversion traits /// specified > in [RFC2484] are implemented. /// /// Numeric cast between vectors with the same number of lanes, such that: /// /// * casting integer vectors whose lane types have the same size (e.g. 
`i32xN` /// -> `u32xN`) is a **no-op**, /// /// * casting from a larger integer to a smaller integer (e.g. `u32xN` -> /// `u8xN`) will **truncate**, /// /// * casting from a smaller integer to a larger integer (e.g. `u8xN` -> /// `u32xN`) will: /// * **zero-extend** if the source is unsigned, or /// * **sign-extend** if the source is signed, /// /// * casting from a float to an integer will **round the float towards zero**, /// /// * casting from an integer to float will produce the floating point /// representation of the integer, **rounding to nearest, ties to even**, /// /// * casting from an `f32` to an `f64` is perfect and lossless, /// /// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**. /// /// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484 pub trait Cast: crate::marker::Sized { /// Numeric cast from `self` to `T`. fn cast(self) -> T; } /// `FromCast` implies `Cast`. impl Cast for T where U: FromCast, { #[inline] fn cast(self) -> U { U::from_cast(self) } } /// `FromCast` and `Cast` are reflexive impl FromCast for T { #[inline] fn from_cast(t: Self) -> Self { t } } #[macro_use] mod macros; mod v16; pub use self::v16::*; mod v32; pub use self::v32::*; mod v64; pub use self::v64::*; mod v128; pub use self::v128::*; mod v256; pub use self::v256::*; mod v512; pub use self::v512::*; ================================================ FILE: src/api/cmp/eq.rs ================================================ //! Implements `Eq` for vector types. macro_rules! impl_cmp_eq { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::cmp::Eq for $id {} impl crate::cmp::Eq for LexicographicallyOrdered<$id> {} test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_eq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn eq() { fn foo(_: E) {} let a = $id::splat($false); foo(a); } } } } }; } ================================================ FILE: src/api/cmp/ord.rs ================================================ //! Implements `Ord` for vector types. macro_rules! impl_cmp_ord { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl $id { /// Returns a wrapper that implements `Ord`. #[inline] pub fn lex_ord(&self) -> LexicographicallyOrdered<$id> { LexicographicallyOrdered(*self) } } impl crate::cmp::Ord for LexicographicallyOrdered<$id> { #[inline] fn cmp(&self, other: &Self) -> crate::cmp::Ordering { match self.partial_cmp(other) { Some(x) => x, None => unsafe { crate::hint::unreachable_unchecked() }, } } } test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_ord>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn eq() { fn foo(_: E) {} let a = $id::splat($false); foo(a.partial_lex_ord()); foo(a.lex_ord()); } } } } }; } ================================================ FILE: src/api/cmp/partial_eq.rs ================================================ //! Implements `PartialEq` for vector types. macro_rules! 
impl_cmp_partial_eq { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892 #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq<$id> for $id { #[inline] fn eq(&self, other: &Self) -> bool { $id::eq(*self, *other).all() } #[inline] fn ne(&self, other: &Self) -> bool { $id::ne(*self, *other).any() } } // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892 #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq> for LexicographicallyOrdered<$id> { #[inline] fn eq(&self, other: &Self) -> bool { self.0 == other.0 } #[inline] fn ne(&self, other: &Self) -> bool { self.0 != other.0 } } test_if! { $test_tt: paste::item! { pub mod [<$id _cmp_PartialEq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_eq() { let a = $id::splat($false); let b = $id::splat($true); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); if $id::lanes() > 1 { let a = $id::splat($false).replace(0, $true); let b = $id::splat($true); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); } } } } } }; } ================================================ FILE: src/api/cmp/partial_ord.rs ================================================ //! Implements `PartialOrd` for vector types. //! //! This implements a lexicographical order. macro_rules! impl_cmp_partial_ord { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns a wrapper that implements `PartialOrd`. #[inline] pub fn partial_lex_ord(&self) -> LexicographicallyOrdered<$id> { LexicographicallyOrdered(*self) } } impl crate::cmp::PartialOrd> for LexicographicallyOrdered<$id> { #[inline] fn partial_cmp(&self, other: &Self) -> Option { if PartialEq::eq(self, other) { Some(crate::cmp::Ordering::Equal) } else if PartialOrd::lt(self, other) { Some(crate::cmp::Ordering::Less) } else if PartialOrd::gt(self, other) { Some(crate::cmp::Ordering::Greater) } else { None } } #[inline] fn lt(&self, other: &Self) -> bool { let m_lt = self.0.lt(other.0); let m_eq = self.0.eq(other.0); for i in 0..$id::lanes() { if m_eq.extract(i) { continue; } return m_lt.extract(i); } false } #[inline] fn le(&self, other: &Self) -> bool { self.lt(other) | PartialEq::eq(self, other) } #[inline] fn ge(&self, other: &Self) -> bool { self.gt(other) | PartialEq::eq(self, other) } #[inline] fn gt(&self, other: &Self) -> bool { let m_gt = self.0.gt(other.0); let m_eq = self.0.eq(other.0); for i in 0..$id::lanes() { if m_eq.extract(i) { continue; } return m_gt.extract(i); } false } } }; } macro_rules! test_cmp_partial_ord_int { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _cmp_PartialOrd>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_lex_ord() { use crate::testing::utils::{test_cmp}; // constant values let a = $id::splat(0); let b = $id::splat(1); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); // variable values: a = [0, 1, 2, 3]; b = [3, 2, 1, 0] let mut a = $id::splat(0); let mut b = $id::splat(0); for i in 0..$id::lanes() { a = a.replace(i, i as $elem_ty); b = b.replace(i, ($id::lanes() - i) as $elem_ty); } test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); // variable values: a = [0, 1, 2, 3]; b = [0, 1, 2, 4] let mut b = a; b = b.replace( $id::lanes() - 1, a.extract($id::lanes() - 1) + 1 as $elem_ty ); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); if $id::lanes() > 2 { // variable values a = [0, 1, 0, 0]; b = [0, 1, 2, 3] let b = a; let mut a = $id::splat(0); a = a.replace(1, 1 as $elem_ty); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); // variable values: a = [0, 1, 2, 3]; b = [0, 1, 3, 2] let mut b = a; b = b.replace( 2, a.extract($id::lanes() - 1) + 1 as $elem_ty ); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); } } } } } }; } macro_rules! test_cmp_partial_ord_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _cmp_PartialOrd>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_lex_ord() { use crate::testing::utils::{test_cmp}; use crate::cmp::Ordering; // constant values let a = $id::splat(false); let b = $id::splat(true); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); // variable values: // a = [false, false, false, false]; // b = [false, false, false, true] let a = $id::splat(false); let mut b = $id::splat(false); b = b.replace($id::lanes() - 1, true); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); // variable values: // a = [true, true, true, false]; // b = [true, true, true, true] let mut a = $id::splat(true); let b = $id::splat(true); a = a.replace($id::lanes() - 1, false); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); if $id::lanes() > 2 { // variable values // a = [false, true, false, false]; // b = [false, true, true, true] let mut a = $id::splat(false); let mut b = $id::splat(true); a = a.replace(1, true); b = b.replace(0, false); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); } } } } } }; } ================================================ FILE: src/api/cmp/vertical.rs ================================================ //! Vertical (lane-wise) vector comparisons returning vector masks. macro_rules! impl_cmp_vertical { ( [$elem_ty:ident; $elem_count:expr]: $id:ident, $mask_ty:ident, $is_mask:expr,($true:expr, $false:expr) | $test_tt:tt ) => { impl $id { /// Lane-wise equality comparison. #[inline] pub fn eq(self, other: Self) -> $mask_ty { use crate::llvm::simd_eq; Simd(unsafe { simd_eq(self.0, other.0) }) } /// Lane-wise inequality comparison. #[inline] pub fn ne(self, other: Self) -> $mask_ty { use crate::llvm::simd_ne; Simd(unsafe { simd_ne(self.0, other.0) }) } /// Lane-wise less-than comparison. #[inline] pub fn lt(self, other: Self) -> $mask_ty { use crate::llvm::{simd_gt, simd_lt}; if $is_mask { Simd(unsafe { simd_gt(self.0, other.0) }) } else { Simd(unsafe { simd_lt(self.0, other.0) }) } } /// Lane-wise less-than-or-equals comparison. #[inline] pub fn le(self, other: Self) -> $mask_ty { use crate::llvm::{simd_ge, simd_le}; if $is_mask { Simd(unsafe { simd_ge(self.0, other.0) }) } else { Simd(unsafe { simd_le(self.0, other.0) }) } } /// Lane-wise greater-than comparison. 
#[inline] pub fn gt(self, other: Self) -> $mask_ty { use crate::llvm::{simd_gt, simd_lt}; if $is_mask { Simd(unsafe { simd_lt(self.0, other.0) }) } else { Simd(unsafe { simd_gt(self.0, other.0) }) } } /// Lane-wise greater-than-or-equals comparison. #[inline] pub fn ge(self, other: Self) -> $mask_ty { use crate::llvm::{simd_ge, simd_le}; if $is_mask { Simd(unsafe { simd_le(self.0, other.0) }) } else { Simd(unsafe { simd_ge(self.0, other.0) }) } } } test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_vertical>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn cmp() { let a = $id::splat($false); let b = $id::splat($true); let r = a.lt(b); let e = $mask_ty::splat(true); assert!(r == e); let r = a.le(b); assert!(r == e); let e = $mask_ty::splat(false); let r = a.gt(b); assert!(r == e); let r = a.ge(b); assert!(r == e); let r = a.eq(b); assert!(r == e); let mut a = a; let mut b = b; let mut e = e; for i in 0..$id::lanes() { if i % 2 == 0 { a = a.replace(i, $false); b = b.replace(i, $true); e = e.replace(i, true); } else { a = a.replace(i, $true); b = b.replace(i, $false); e = e.replace(i, false); } } let r = a.lt(b); assert!(r == e); } } } } }; } ================================================ FILE: src/api/cmp.rs ================================================ //! Implement cmp traits for vector types #[macro_use] mod partial_eq; #[macro_use] mod eq; #[macro_use] mod partial_ord; #[macro_use] mod ord; #[macro_use] mod vertical; ================================================ FILE: src/api/default.rs ================================================ //! Implements `Default` for vector types. macro_rules! impl_default { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl Default for $id { #[inline] fn default() -> Self { Self::splat($elem_ty::default()) } } test_if!{ $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _default>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn default() { let a = $id::default(); for i in 0..$id::lanes() { assert_eq!(a.extract(i), $elem_ty::default()); } } } } } }; } ================================================ FILE: src/api/fmt/binary.rs ================================================ //! Implement Octal formatting macro_rules! impl_fmt_binary { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::Binary for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _fmt_binary>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn binary() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#b}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "") .replace(")", "").split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#b}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt/debug.rs ================================================ //! Implement debug formatting macro_rules! impl_fmt_debug_tests { ([$elem_ty:ty; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { pub mod [<$id _fmt_debug>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn debug() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::default(); let mut s = TinyString::new(); write!(&mut s, "{:?}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "") .replace(")", "").split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:?}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } macro_rules! impl_fmt_debug { ([$elem_ty:ty; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::Debug for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } impl_fmt_debug_tests!([$elem_ty; $elem_count]: $id | $test_tt); }; } ================================================ FILE: src/api/fmt/lower_hex.rs ================================================ //! Implement `LowerHex` formatting macro_rules! impl_fmt_lower_hex { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::LowerHex for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _fmt_lower_hex>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn lower_hex() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#x}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "").replace(")", "") .split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#x}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt/octal.rs ================================================ //! Implement Octal formatting macro_rules! impl_fmt_octal { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::Octal for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! { pub mod [<$id _fmt_octal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn octal_hex() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#o}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "").replace(")", "") .split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#o}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt/upper_hex.rs ================================================ //! Implement `UpperHex` formatting macro_rules! impl_fmt_upper_hex { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::UpperHex for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _fmt_upper_hex>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn upper_hex() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#X}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "").replace(")", "") .split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#X}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt.rs ================================================ //! Implements formatting APIs #[macro_use] mod debug; #[macro_use] mod lower_hex; #[macro_use] mod upper_hex; #[macro_use] mod octal; #[macro_use] mod binary; ================================================ FILE: src/api/from/from_array.rs ================================================ //! Implements `From<[T; N]>` and `Into<[T; N]>` for vector types. macro_rules! impl_from_array { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($non_default_array:expr, $non_default_vec:expr)) => { impl From<[$elem_ty; $elem_count]> for $id { #[inline] fn from(array: [$elem_ty; $elem_count]) -> Self { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { array }.vec } } } impl From<$id> for [$elem_ty; $elem_count] { #[inline] fn from(vec: $id) -> Self { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { vec }.array } } } // FIXME: `Into::into` is not inline, but due to // the blanket impl in `std`, which is not // marked `default`, we cannot override it here with // specialization. /* impl Into<[$elem_ty; $elem_count]> for $id { #[inline] fn into(self) -> [$elem_ty; $elem_count] { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { vec: self }.array } } } impl Into<$id> for [$elem_ty; $elem_count] { #[inline] fn into(self) -> $id { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { array: self }.vec } } } */ test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] mod [<$id _from>] { use super::*; #[test] #[cfg_attr(miri, ignore)] fn array() { let vec: $id = Default::default(); // FIXME: Workaround for arrays with more than 32 // elements. // // Safe because we never take a reference to any // uninitialized element. 
union W { array: [$elem_ty; $elem_count], other: () } let mut array = W { other: () }; for i in 0..$elem_count { let default: $elem_ty = Default::default(); // note: array.other is the active member and // initialized so we can take a reference to it: let p = unsafe { &mut array.other as *mut () as *mut $elem_ty }; // note: default is a valid bit-pattern for // $elem_ty: unsafe { crate::ptr::write(p.wrapping_add(i), default) }; } // note: the array variant of the union is properly // initialized: let mut array = unsafe { array.array }; array[0] = $non_default_array; let vec = vec.replace(0, $non_default_vec); let vec_from_array = $id::from(array); assert_eq!(vec_from_array, vec); let array_from_vec = <[$elem_ty; $elem_count]>::from(vec); // FIXME: Workaround for arrays with more than 32 // elements. for i in 0..$elem_count { assert_eq!(array_from_vec[i], array[i]); } let vec_from_into_array: $id = array.into(); assert_eq!(vec_from_into_array, vec); let array_from_into_vec: [$elem_ty; $elem_count] = vec.into(); // FIXME: Workaround for arrays with more than 32 // elements. for i in 0..$elem_count { assert_eq!(array_from_into_vec[i], array[i]); } } } } } }; } ================================================ FILE: src/api/from/from_vector.rs ================================================ //! Implements `From` and `Into` for vector types. macro_rules! impl_from_vector { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | $source:ident) => { impl From<$source> for $id { #[inline] fn from(source: $source) -> Self { fn static_assert_same_number_of_lanes() where T: crate::sealed::Simd, U: crate::sealed::Simd, { } use crate::llvm::simd_cast; static_assert_same_number_of_lanes::<$id, $source>(); Simd(unsafe { simd_cast(source.0) }) } } // FIXME: `Into::into` is not inline, but due to the blanket impl in // `std`, which is not marked `default`, we cannot override it here // with specialization. /* impl Into<$id> for $source { #[inline] fn into(self) -> $id { unsafe { simd_cast(self) } } } */ test_if! { $test_tt: paste::item! { pub mod [<$id _from_ $source>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from() { assert_eq!($id::lanes(), $source::lanes()); let source: $source = Default::default(); let vec: $id = Default::default(); let e = $id::from(source); assert_eq!(e, vec); let e: $id = source.into(); assert_eq!(e, vec); } } } } }; } macro_rules! impl_from_vectors { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | $($source:ident),*) => { $( impl_from_vector!( [$elem_ty; $elem_count]: $id | $test_tt | $source ); )* } } ================================================ FILE: src/api/from.rs ================================================ //! Implementations of the `From` and `Into` traits #[macro_use] mod from_array; #[macro_use] mod from_vector; ================================================ FILE: src/api/hash.rs ================================================ //! Implements `Hash` for vector types. macro_rules! impl_hash { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::hash::Hash for $id { #[inline] fn hash(&self, state: &mut H) { unsafe { union A { data: [$elem_ty; $id::lanes()], vec: $id, } A { vec: *self }.data.hash(state) } } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _hash>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn hash() { use crate::hash::{Hash, Hasher}; #[allow(deprecated)] use crate::hash::{SipHasher13}; type A = [$elem_ty; $id::lanes()]; let a: A = [42 as $elem_ty; $id::lanes()]; assert_eq!( crate::mem::size_of::(), crate::mem::size_of::<$id>() ); #[allow(deprecated)] let mut a_hash = SipHasher13::new(); let mut v_hash = a_hash.clone(); a.hash(&mut a_hash); // Integer within mantissa^1 range. #[allow(clippy::float_cmp)] let v = $id::splat(42 as $elem_ty); v.hash(&mut v_hash); assert_eq!(a_hash.finish(), v_hash.finish()); } } } } }; } ================================================ FILE: src/api/into_bits/arch_specific.rs ================================================ //! `FromBits` and `IntoBits` between portable vector types and the //! architecture-specific vector types. #[rustfmt::skip] // FIXME: MIPS FromBits/IntoBits #[allow(unused)] use crate::*; /// This macro implements FromBits for the portable and the architecture /// specific vector types. /// /// The "leaf" case is at the bottom, and the most generic case is at the top. /// The generic case is split into smaller cases recursively. macro_rules! impl_arch { ([$arch_head_i:ident[$arch_head_tt:tt]: $($arch_head_ty:ident),*], $([$arch_tail_i:ident[$arch_tail_tt:tt]: $($arch_tail_ty:ident),*]),* | from: $($from_ty:ident),* | into: $($into_ty:ident),* | test: $test_tt:tt) => { impl_arch!( [$arch_head_i[$arch_head_tt]: $($arch_head_ty),*] | from: $($from_ty),* | into: $($into_ty),* | test: $test_tt ); impl_arch!( $([$arch_tail_i[$arch_tail_tt]: $($arch_tail_ty),*]),* | from: $($from_ty),* | into: $($into_ty),* | test: $test_tt ); }; ([$arch:ident[$arch_tt:tt]: $($arch_ty:ident),*] | from: $($from_ty:ident),* | into: $($into_ty:ident),* | test: $test_tt:tt) => { // note: if target is "arm", "+v7,+neon" must be enabled // and the std library must be recompiled with them #[cfg(any( not(target_arch = "arm"), all(target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon))) )] // note: if target is "powerpc", "altivec" must be enabled // and the std library must be recompiled with it #[cfg(any( not(target_arch = "powerpc"), all(target_feature = "altivec", feature = "core_arch"), ))] #[cfg(target_arch = $arch_tt)] use crate::arch::$arch::{ $($arch_ty),* }; #[cfg(any( not(target_arch = "arm"), all(target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon))) )] #[cfg(any( not(target_arch = "powerpc"), all(target_feature = "altivec", feature = "core_arch"), ))] #[cfg(target_arch = $arch_tt)] impl_arch!($($arch_ty),* | $($from_ty),* | $($into_ty),* | test: $test_tt); }; ($arch_head:ident, $($arch_tail:ident),* | $($from_ty:ident),* | $($into_ty:ident),* | test: $test_tt:tt) => { impl_arch!($arch_head | $($from_ty),* | $($into_ty),* | test: $test_tt); impl_arch!($($arch_tail),* | $($from_ty),* | $($into_ty),* | test: $test_tt); }; ($arch_head:ident | $($from_ty:ident),* | $($into_ty:ident),* | test: $test_tt:tt) => { impl_from_bits!($arch_head[$test_tt]: $($from_ty),*); impl_into_bits!($arch_head[$test_tt]: $($into_ty),*); }; } //////////////////////////////////////////////////////////////////////////////// // Implementations for the 64-bit wide vector types: // FIXME: 64-bit single element types // FIXME: arm/aarch float16x4_t missing impl_arch!( [ arm["arm"]: int8x8_t, uint8x8_t, poly8x8_t, int16x4_t, uint16x4_t, poly16x4_t, int32x2_t, uint32x2_t, 
float32x2_t, int64x1_t, uint64x1_t ], [ aarch64["aarch64"]: int8x8_t, uint8x8_t, poly8x8_t, int16x4_t, uint16x4_t, poly16x4_t, int32x2_t, uint32x2_t, float32x2_t, int64x1_t, uint64x1_t, float64x1_t ] | from: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2 | into: i8x8, u8x8, i16x4, u16x4, i32x2, u32x2, f32x2 | test: test_v64 ); //////////////////////////////////////////////////////////////////////////////// // Implementations for the 128-bit wide vector types: // FIXME: arm/aarch float16x8_t missing // FIXME: ppc vector_pixel missing // FIXME: ppc64 vector_Float16 missing // FIXME: ppc64 vector_signed_long_long missing // FIXME: ppc64 vector_unsigned_long_long missing // FIXME: ppc64 vector_bool_long_long missing // FIXME: ppc64 vector_signed___int128 missing // FIXME: ppc64 vector_unsigned___int128 missing impl_arch!( [x86["x86"]: __m128, __m128i, __m128d], [x86_64["x86_64"]: __m128, __m128i, __m128d], [ arm["arm"]: int8x16_t, uint8x16_t, poly8x16_t, int16x8_t, uint16x8_t, poly16x8_t, int32x4_t, uint32x4_t, float32x4_t, int64x2_t, uint64x2_t ], [ aarch64["aarch64"]: int8x16_t, uint8x16_t, poly8x16_t, int16x8_t, uint16x8_t, poly16x8_t, int32x4_t, uint32x4_t, float32x4_t, int64x2_t, uint64x2_t, float64x2_t ], [ powerpc["powerpc"]: vector_signed_char, vector_unsigned_char, vector_signed_short, vector_unsigned_short, vector_signed_int, vector_unsigned_int, vector_float ], [ powerpc64["powerpc64"]: vector_signed_char, vector_unsigned_char, vector_signed_short, vector_unsigned_short, vector_signed_int, vector_unsigned_int, vector_float, vector_signed_long, vector_unsigned_long, vector_double ] | from: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1 | test: test_v128 ); impl_arch!( [powerpc["powerpc"]: vector_bool_char], [powerpc64["powerpc64"]: vector_bool_char] | from: m8x16, m16x8, m32x4, m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16 | test: test_v128 ); impl_arch!( [powerpc["powerpc"]: vector_bool_short], [powerpc64["powerpc64"]: vector_bool_short] | from: m16x8, m32x4, m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16, m16x8 | test: test_v128 ); impl_arch!( [powerpc["powerpc"]: vector_bool_int], [powerpc64["powerpc64"]: vector_bool_int] | from: m32x4, m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16, m16x8, m32x4 | test: test_v128 ); impl_arch!( [powerpc64["powerpc64"]: vector_bool_long] | from: m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16, m16x8, m32x4, m64x2 | test: test_v128 ); //////////////////////////////////////////////////////////////////////////////// // Implementations for the 256-bit wide vector types impl_arch!( [x86["x86"]: __m256, __m256i, __m256d], [x86_64["x86_64"]: __m256, __m256i, __m256d] | from: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 | into: i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, f32x8, i64x4, u64x4, f64x4, i128x2, u128x2 | test: test_v256 ); //////////////////////////////////////////////////////////////////////////////// // FIXME: Implementations for the 512-bit wide vector types 
================================================ FILE: src/api/into_bits/macros.rs ================================================ //! Macros implementing `FromBits` macro_rules! impl_from_bits_ { ($id:ident[$test_tt:tt]: $from_ty:ident) => { impl crate::api::into_bits::FromBits<$from_ty> for $id { #[inline] fn from_bits(x: $from_ty) -> Self { unsafe { crate::mem::transmute(x) } } } test_if! { $test_tt: paste::item! { pub mod [<$id _from_bits_ $from_ty>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn test() { use crate::{ ptr::{read_unaligned}, mem::{size_of, zeroed} }; use crate::IntoBits; assert_eq!(size_of::<$id>(), size_of::<$from_ty>()); // This is safe because we never create a reference to // uninitialized memory: let a: $from_ty = unsafe { zeroed() }; let b_0: $id = crate::FromBits::from_bits(a); let b_1: $id = a.into_bits(); // Check that these are byte-wise equal, that is, // that the bit patterns are identical: for i in 0..size_of::<$id>() { // This is safe because we only read initialized // memory in bounds. Also, taking a reference to // `b_i` is ok because the fields are initialized. unsafe { let b_0_v: u8 = read_unaligned( (&b_0 as *const $id as *const u8) .wrapping_add(i) ); let b_1_v: u8 = read_unaligned( (&b_1 as *const $id as *const u8) .wrapping_add(i) ); assert_eq!(b_0_v, b_1_v); } } } } } } }; } macro_rules! impl_from_bits { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_bits_!($id[$test_tt]: $from_ty); )* } } #[allow(unused)] macro_rules! impl_into_bits { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_bits_!($from_ty[$test_tt]: $id); )* } } ================================================ FILE: src/api/into_bits/v128.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 128-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!( i8x16[test_v128]: u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u8x16[test_v128]: i8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m8x16[test_v128]: m16x8, m32x4, m64x2, m128x1); impl_from_bits!( i16x8[test_v128]: i8x16, u8x16, m8x16, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u16x8[test_v128]: i8x16, u8x16, m8x16, i16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m16x8[test_v128]: m32x4, m64x2, m128x1); impl_from_bits!( i32x4[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u32x4[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( f32x4[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m32x4[test_v128]: m64x2, m128x1); impl_from_bits!( i64x2[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u64x2[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( f64x2[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m64x2[test_v128]: m128x1); impl_from_bits!( i128x1[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, u128x1, m128x1 ); impl_from_bits!( u128x1[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, m128x1 ); // note: m128x1 cannot be constructed from all the other masks bit patterns in // here ================================================ FILE: src/api/into_bits/v16.rs ================================================ //! `FromBits` and `IntoBits` implementations for portable 16-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!(i8x2[test_v16]: u8x2, m8x2); impl_from_bits!(u8x2[test_v16]: i8x2, m8x2); // note: m8x2 cannot be constructed from all i8x2 or u8x2 bit patterns ================================================ FILE: src/api/into_bits/v256.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 256-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!( i8x32[test_v256]: u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u8x32[test_v256]: i8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m8x32[test_v256]: m16x16, m32x8, m64x4, m128x2); impl_from_bits!( i16x16[test_v256]: i8x32, u8x32, m8x32, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u16x16[test_v256]: i8x32, u8x32, m8x32, i16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m16x16[test_v256]: m32x8, m64x4, m128x2); impl_from_bits!( i32x8[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u32x8[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( f32x8[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m32x8[test_v256]: m64x4, m128x2); impl_from_bits!( i64x4[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u64x4[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( f64x4[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m64x4[test_v256]: m128x2); impl_from_bits!( i128x2[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, u128x2, m128x2 ); impl_from_bits!( u128x2[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, m128x2 ); // note: m128x2 cannot be constructed from all the other masks bit patterns in // here ================================================ FILE: src/api/into_bits/v32.rs ================================================ //! `FromBits` and `IntoBits` implementations for portable 32-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!(i8x4[test_v32]: u8x4, m8x4, i16x2, u16x2, m16x2); impl_from_bits!(u8x4[test_v32]: i8x4, m8x4, i16x2, u16x2, m16x2); impl_from_bits!(m8x4[test_v32]: m16x2); impl_from_bits!(i16x2[test_v32]: i8x4, u8x4, m8x4, u16x2, m16x2); impl_from_bits!(u16x2[test_v32]: i8x4, u8x4, m8x4, i16x2, m16x2); // note: m16x2 cannot be constructed from all m8x4 bit patterns ================================================ FILE: src/api/into_bits/v512.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 512-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!( i8x64[test_v512]: u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u8x64[test_v512]: i8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m8x64[test_v512]: m16x32, m32x16, m64x8, m128x4); impl_from_bits!( i16x32[test_v512]: i8x64, u8x64, m8x64, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u16x32[test_v512]: i8x64, u8x64, m8x64, i16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m16x32[test_v512]: m32x16, m64x8, m128x4); impl_from_bits!( i32x16[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u32x16[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( f32x16[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m32x16[test_v512]: m64x8, m128x4); impl_from_bits!( i64x8[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u64x8[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( f64x8[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m64x8[test_v512]: m128x4); impl_from_bits!( i128x4[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, u128x4, m128x4 ); impl_from_bits!( u128x4[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, m128x4 ); // note: m128x4 cannot be constructed from all the other masks bit patterns in // here ================================================ FILE: src/api/into_bits/v64.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 64-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!(i8x8[test_v64]: u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(u8x8[test_v64]: i8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(m8x8[test_v64]: m16x4, m32x2); impl_from_bits!(i16x4[test_v64]: i8x8, u8x8, m8x8, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(u16x4[test_v64]: i8x8, u8x8, m8x8, i16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(m16x4[test_v64]: m32x2); impl_from_bits!(i32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, u32x2, f32x2, m32x2); impl_from_bits!(u32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, f32x2, m32x2); impl_from_bits!(f32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, m32x2); // note: m32x2 cannot be constructed from all m16x4 or m8x8 bit patterns ================================================ FILE: src/api/into_bits.rs ================================================ //! Implementation of `FromBits` and `IntoBits`. /// Safe lossless bitwise conversion from `T` to `Self`. #[cfg_attr(doc_cfg, doc(cfg(feature = "into_bits")))] pub trait FromBits: crate::marker::Sized { /// Safe lossless bitwise transmute from `T` to `Self`. fn from_bits(t: T) -> Self; } /// Safe lossless bitwise conversion from `Self` to `T`. #[cfg_attr(doc_cfg, doc(cfg(feature = "into_bits")))] pub trait IntoBits: crate::marker::Sized { /// Safe lossless bitwise transmute from `self` to `T`. fn into_bits(self) -> T; } /// `FromBits` implies `IntoBits`. impl IntoBits for T where U: FromBits, { #[inline] fn into_bits(self) -> U { debug_assert!(crate::mem::size_of::() == crate::mem::size_of::()); U::from_bits(self) } } /// `FromBits` and `IntoBits` are reflexive impl FromBits for T { #[inline] fn from_bits(t: Self) -> Self { t } } #[macro_use] mod macros; mod v16; pub use self::v16::*; mod v32; pub use self::v32::*; mod v64; pub use self::v64::*; mod v128; pub use self::v128::*; mod v256; pub use self::v256::*; mod v512; pub use self::v512::*; mod arch_specific; pub use self::arch_specific::*; ================================================ FILE: src/api/math/float/abs.rs ================================================ //! Implements vertical (lane-wise) floating-point `abs`. macro_rules! impl_math_float_abs { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Absolute value. #[inline] pub fn abs(self) -> Self { use crate::codegen::math::float::abs::Abs; Abs::abs(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_abs>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn abs() { let o = $id::splat(1 as $elem_ty); assert_eq!(o, o.abs()); let mo = $id::splat(-1 as $elem_ty); assert_eq!(o, mo.abs()); } } } } }; } ================================================ FILE: src/api/math/float/consts.rs ================================================ macro_rules! impl_float_consts { ([$elem_ty:ident; $elem_count:expr]: $id:ident) => { impl $id { /// Machine epsilon value. pub const EPSILON: $id = $id::splat(core::$elem_ty::EPSILON); /// Smallest finite value. pub const MIN: $id = $id::splat(core::$elem_ty::MIN); /// Smallest positive normal value. pub const MIN_POSITIVE: $id = $id::splat(core::$elem_ty::MIN_POSITIVE); /// Largest finite value. pub const MAX: $id = $id::splat(core::$elem_ty::MAX); /// Not a Number (NaN). 
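// A usage sketch of the splatted constants defined here together with the
// lane-wise `abs` implemented above and the crate's lane-wise `is_nan`,
// using the concrete `f32x4` instantiation:
//
//     use packed_simd::f32x4;
//     let v = f32x4::new(-1.0, 0.5, f32::NAN, -0.0);
//     assert_eq!(v.abs().extract(0), 1.0);                       // lane-wise absolute value
//     assert!(v.is_nan().extract(2));                            // lane-wise NaN test
//     assert_eq!(f32x4::PI.extract(3), core::f32::consts::PI);   // splatted constant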
pub const NAN: $id = $id::splat(core::$elem_ty::NAN); /// Infinity (∞). pub const INFINITY: $id = $id::splat(core::$elem_ty::INFINITY); /// Negative infinity (-∞). pub const NEG_INFINITY: $id = $id::splat(core::$elem_ty::NEG_INFINITY); /// Archimedes' constant (π) pub const PI: $id = $id::splat(core::$elem_ty::consts::PI); /// π/2 pub const FRAC_PI_2: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_2); /// π/3 pub const FRAC_PI_3: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_3); /// π/4 pub const FRAC_PI_4: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_4); /// π/6 pub const FRAC_PI_6: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_6); /// π/8 pub const FRAC_PI_8: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_8); /// 1/π pub const FRAC_1_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_1_PI); /// 2/π pub const FRAC_2_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_2_PI); /// 2/sqrt(π) pub const FRAC_2_SQRT_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_2_SQRT_PI); /// sqrt(2) pub const SQRT_2: $id = $id::splat(core::$elem_ty::consts::SQRT_2); /// 1/sqrt(2) pub const FRAC_1_SQRT_2: $id = $id::splat(core::$elem_ty::consts::FRAC_1_SQRT_2); /// Euler's number (e) pub const E: $id = $id::splat(core::$elem_ty::consts::E); /// log2(e) pub const LOG2_E: $id = $id::splat(core::$elem_ty::consts::LOG2_E); /// log10(e) pub const LOG10_E: $id = $id::splat(core::$elem_ty::consts::LOG10_E); /// ln(2) pub const LN_2: $id = $id::splat(core::$elem_ty::consts::LN_2); /// ln(10) pub const LN_10: $id = $id::splat(core::$elem_ty::consts::LN_10); } }; } ================================================ FILE: src/api/math/float/cos.rs ================================================ //! Implements vertical (lane-wise) floating-point `cos`. macro_rules! impl_math_float_cos { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Cosine. #[inline] pub fn cos(self) -> Self { use crate::codegen::math::float::cos::Cos; Cos::cos(self) } /// Cosine of `self * PI`. #[inline] pub fn cos_pi(self) -> Self { use crate::codegen::math::float::cos_pi::CosPi; CosPi::cos_pi(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_cos>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn cos() { use crate::$elem_ty::consts::PI; let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let p = $id::splat(PI as $elem_ty); let ph = $id::splat(PI as $elem_ty / 2.); let z_r = $id::splat((PI as $elem_ty / 2.).cos()); let o_r = $id::splat((PI as $elem_ty).cos()); assert_eq!(o, z.cos()); assert_eq!(z_r, ph.cos()); assert_eq!(o_r, p.cos()); } } } } }; } ================================================ FILE: src/api/math/float/exp.rs ================================================ //! Implements vertical (lane-wise) floating-point `exp`. macro_rules! impl_math_float_exp { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns the exponential function of `self`: `e^(self)`. #[inline] pub fn exp(self) -> Self { use crate::codegen::math::float::exp::Exp; Exp::exp(self) } } test_if!{ $test_tt: paste::item! 
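// A usage sketch of the lane-wise `cos` and `exp` above, using the concrete
// `f32x4` instantiation; the tolerance is chosen loosely because `exp` is an
// approximation:
//
//     use packed_simd::f32x4;
//     assert_eq!(f32x4::splat(0.0).cos(), f32x4::splat(1.0));    // cos(0) == 1 exactly
//     let e = f32x4::splat(1.0).exp();                           // lane-wise e^x
//     let err = (e - f32x4::splat(core::f32::consts::E)).abs();
//     assert!(err.le(f32x4::splat(1e-3)).all());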
{ pub mod [<$id _math_exp>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn exp() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); assert_eq!(o, z.exp()); let e = $id::splat(crate::f64::consts::E as $elem_ty); let tol = $id::splat(2.4e-4 as $elem_ty); assert!((e - o.exp()).abs().le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/ln.rs ================================================ //! Implements vertical (lane-wise) floating-point `ln`. macro_rules! impl_math_float_ln { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns the natural logarithm of `self`. #[inline] pub fn ln(self) -> Self { use crate::codegen::math::float::ln::Ln; Ln::ln(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_ln>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ln() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); assert_eq!(z, o.ln()); let e = $id::splat(crate::f64::consts::E as $elem_ty); let tol = $id::splat(2.4e-4 as $elem_ty); assert!((o - e.ln()).abs().le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/mul_add.rs ================================================ //! Implements vertical (lane-wise) floating-point `mul_add`. macro_rules! impl_math_float_mul_add { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Fused multiply add: `self * y + z` #[inline] pub fn mul_add(self, y: Self, z: Self) -> Self { use crate::codegen::math::float::mul_add::MulAdd; MulAdd::mul_add(self, y, z) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_mul_add>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn mul_add() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let t3 = $id::splat(3 as $elem_ty); let f = $id::splat(4 as $elem_ty); assert_eq!(z, z.mul_add(z, z)); assert_eq!(o, o.mul_add(o, z)); assert_eq!(o, o.mul_add(z, o)); assert_eq!(o, z.mul_add(o, o)); assert_eq!(t, o.mul_add(o, o)); assert_eq!(t, o.mul_add(t, z)); assert_eq!(t, t.mul_add(o, z)); assert_eq!(f, t.mul_add(t, z)); assert_eq!(f, t.mul_add(o, t)); assert_eq!(t3, t.mul_add(o, o)); } } } } }; } ================================================ FILE: src/api/math/float/mul_adde.rs ================================================ //! Implements vertical (lane-wise) floating-point `mul_adde`. macro_rules! impl_math_float_mul_adde { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Fused multiply add estimate: ~= `self * y + z` /// /// While fused multiply-add (`fma`) has infinite precision, /// `mul_adde` has _at worst_ the same precision of a multiply followed by an add. /// This might be more efficient on architectures that do not have an `fma` instruction. #[inline] pub fn mul_adde(self, y: Self, z: Self) -> Self { use crate::codegen::math::float::mul_adde::MulAddE; MulAddE::mul_adde(self, y, z) } } test_if!{ $test_tt: paste::item! 
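// A usage sketch of `mul_add` and `mul_adde`, using the concrete `f32x4`
// instantiation; for these small integer-valued inputs both are exact:
//
//     use packed_simd::f32x4;
//     let (a, b, c) = (f32x4::splat(2.0), f32x4::splat(3.0), f32x4::splat(1.0));
//     assert_eq!(a.mul_add(b, c), f32x4::splat(7.0));    // fused a * b + c
//     assert_eq!(a.mul_adde(b, c), f32x4::splat(7.0));   // estimate variant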
{ pub mod [<$id _math_mul_adde>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn mul_adde() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let t3 = $id::splat(3 as $elem_ty); let f = $id::splat(4 as $elem_ty); assert_eq!(z, z.mul_adde(z, z)); assert_eq!(o, o.mul_adde(o, z)); assert_eq!(o, o.mul_adde(z, o)); assert_eq!(o, z.mul_adde(o, o)); assert_eq!(t, o.mul_adde(o, o)); assert_eq!(t, o.mul_adde(t, z)); assert_eq!(t, t.mul_adde(o, z)); assert_eq!(f, t.mul_adde(t, z)); assert_eq!(f, t.mul_adde(o, t)); assert_eq!(t3, t.mul_adde(o, o)); } } } } }; } ================================================ FILE: src/api/math/float/powf.rs ================================================ //! Implements vertical (lane-wise) floating-point `powf`. macro_rules! impl_math_float_powf { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Raises `self` number to the floating point power of `x`. #[inline] pub fn powf(self, x: Self) -> Self { use crate::codegen::math::float::powf::Powf; Powf::powf(self, x) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_powf>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn powf() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); assert_eq!(o, o.powf(z)); assert_eq!(o, t.powf(z)); assert_eq!(o, o.powf(o)); assert_eq!(t, t.powf(o)); let f = $id::splat(4 as $elem_ty); assert_eq!(f, t.powf(t)); } } } } }; } ================================================ FILE: src/api/math/float/recpre.rs ================================================ //! Implements vertical (lane-wise) floating-point `recpre`. macro_rules! impl_math_float_recpre { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Reciprocal estimate: `~= 1. / self`. /// /// FIXME: The precision of the estimate is currently unspecified. #[inline] pub fn recpre(self) -> Self { $id::splat(1.) / self } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_recpre>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn recpre() { let tol = $id::splat(2.4e-4 as $elem_ty); let o = $id::splat(1 as $elem_ty); let error = (o - o.recpre()).abs(); assert!(error.le(tol).all()); let t = $id::splat(2 as $elem_ty); let e = 0.5; let error = (e - t.recpre()).abs(); assert!(error.le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/rsqrte.rs ================================================ //! Implements vertical (lane-wise) floating-point `rsqrte`. macro_rules! impl_math_float_rsqrte { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Reciprocal square-root estimate: `~= 1. / self.sqrt()`. /// /// FIXME: The precision of the estimate is currently unspecified. #[inline] pub fn rsqrte(self) -> Self { unsafe { use crate::llvm::simd_fsqrt; $id::splat(1.) / Simd(simd_fsqrt(self.0)) } } } test_if!{ $test_tt: paste::item! 
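// A usage sketch of the `recpre` and `rsqrte` estimates, using the concrete
// `f32x4` instantiation; since the estimate precision is unspecified, the
// assertions use a loose tolerance:
//
//     use packed_simd::f32x4;
//     let x = f32x4::splat(4.0);
//     let tol = f32x4::splat(1e-3);
//     assert!((x.recpre() - f32x4::splat(0.25)).abs().le(tol).all());  // ~ 1 / x
//     assert!((x.rsqrte() - f32x4::splat(0.5)).abs().le(tol).all());   // ~ 1 / sqrt(x)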
{ pub mod [<$id _math_rsqrte>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn rsqrte() { use crate::$elem_ty::consts::SQRT_2; let tol = $id::splat(2.4e-4 as $elem_ty); let o = $id::splat(1 as $elem_ty); let error = (o - o.rsqrte()).abs(); assert!(error.le(tol).all()); let t = $id::splat(2 as $elem_ty); let e = 1. / SQRT_2; let error = (e - t.rsqrte()).abs(); assert!(error.le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/sin.rs ================================================ //! Implements vertical (lane-wise) floating-point `sin`. macro_rules! impl_math_float_sin { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Sine. #[inline] pub fn sin(self) -> Self { use crate::codegen::math::float::sin::Sin; Sin::sin(self) } /// Sine of `self * PI`. #[inline] pub fn sin_pi(self) -> Self { use crate::codegen::math::float::sin_pi::SinPi; SinPi::sin_pi(self) } /// Sine and cosine of `self * PI`. #[inline] pub fn sin_cos_pi(self) -> (Self, Self) { use crate::codegen::math::float::sin_cos_pi::SinCosPi; SinCosPi::sin_cos_pi(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_sin>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sin() { use crate::$elem_ty::consts::PI; let z = $id::splat(0 as $elem_ty); let p = $id::splat(PI as $elem_ty); let ph = $id::splat(PI as $elem_ty / 2.); let o_r = $id::splat((PI as $elem_ty / 2.).sin()); let z_r = $id::splat((PI as $elem_ty).sin()); assert_eq!(z, z.sin()); assert_eq!(o_r, ph.sin()); assert_eq!(z_r, p.sin()); } } } } }; } ================================================ FILE: src/api/math/float/sqrt.rs ================================================ //! Implements vertical (lane-wise) floating-point `sqrt`. macro_rules! impl_math_float_sqrt { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { #[inline] pub fn sqrt(self) -> Self { use crate::codegen::math::float::sqrt::Sqrt; Sqrt::sqrt(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_sqrt>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sqrt() { use crate::$elem_ty::consts::SQRT_2; let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); assert_eq!(z, z.sqrt()); assert_eq!(o, o.sqrt()); let t = $id::splat(2 as $elem_ty); let e = $id::splat(SQRT_2); assert_eq!(e, t.sqrt()); } } } } }; } ================================================ FILE: src/api/math/float/sqrte.rs ================================================ //! Implements vertical (lane-wise) floating-point `sqrte`. macro_rules! impl_math_float_sqrte { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Square-root estimate. /// /// FIXME: The precision of the estimate is currently unspecified. #[inline] pub fn sqrte(self) -> Self { use crate::codegen::math::float::sqrte::Sqrte; Sqrte::sqrte(self) } } test_if!{ $test_tt: paste::item! 
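// A usage sketch contrasting the exact `sqrt` with the `sqrte` estimate,
// using the concrete `f32x4` instantiation and a loose tolerance since the
// estimate precision is unspecified:
//
//     use packed_simd::f32x4;
//     let x = f32x4::splat(9.0);
//     assert_eq!(x.sqrt(), f32x4::splat(3.0));            // exact square root
//     let err = (x.sqrte() - f32x4::splat(3.0)).abs();
//     assert!(err.le(f32x4::splat(1e-2)).all());          // square-root estimate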
{ pub mod [<$id _math_sqrte>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sqrte() { use crate::$elem_ty::consts::SQRT_2; let tol = $id::splat(2.4e-4 as $elem_ty); let z = $id::splat(0 as $elem_ty); let error = (z - z.sqrte()).abs(); assert!(error.le(tol).all()); let o = $id::splat(1 as $elem_ty); let error = (o - o.sqrte()).abs(); assert!(error.le(tol).all()); let t = $id::splat(2 as $elem_ty); let e = $id::splat(SQRT_2 as $elem_ty); let error = (e - t.sqrte()).abs(); assert!(error.le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/tanh.rs ================================================ //! Implements vertical (lane-wise) floating-point `tanh`. macro_rules! impl_math_float_tanh { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Tanh. #[inline] pub fn tanh(self) -> Self { use crate::codegen::math::float::tanh::Tanh; Tanh::tanh(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_tanh>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn tanh() { let z = $id::splat(0 as $elem_ty); assert_eq!(z, z.tanh()); } } } } }; } ================================================ FILE: src/api/math/float.rs ================================================ //! Implements vertical floating-point math operations. #[macro_use] mod abs; #[macro_use] mod consts; #[macro_use] mod cos; #[macro_use] mod exp; #[macro_use] mod powf; #[macro_use] mod ln; #[macro_use] mod mul_add; #[macro_use] mod mul_adde; #[macro_use] mod recpre; #[macro_use] mod rsqrte; #[macro_use] mod sin; #[macro_use] mod sqrt; #[macro_use] mod sqrte; #[macro_use] mod tanh; macro_rules! impl_float_category { ([$elem_ty:ident; $elem_count:expr]: $id:ident, $mask_ty:ident) => { impl $id { #[inline] pub fn is_nan(self) -> $mask_ty { self.ne(self) } #[inline] pub fn is_infinite(self) -> $mask_ty { self.eq(Self::INFINITY) | self.eq(Self::NEG_INFINITY) } #[inline] pub fn is_finite(self) -> $mask_ty { !(self.is_nan() | self.is_infinite()) } } }; } ================================================ FILE: src/api/math.rs ================================================ //! Implements vertical math operations #[macro_use] mod float; ================================================ FILE: src/api/minimal/iuf.rs ================================================ //! Minimal API of signed integer, unsigned integer, and floating-point //! vectors. macro_rules! impl_minimal_iuf { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt | $($elem_name:ident),+ | $(#[$doc:meta])*) => { $(#[$doc])* pub type $id = Simd<[$elem_ty; $elem_count]>; impl sealed::Simd for $id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } impl $id { /// Creates a new instance with each vector elements initialized /// with the provided values. #[inline] #[allow(clippy::too_many_arguments)] pub const fn new($($elem_name: $elem_ty),*) -> Self { Simd(codegen::$id($($elem_name as $ielem_ty),*)) } /// Returns the number of vector lanes. #[inline] pub const fn lanes() -> usize { $elem_count } /// Constructs a new instance with each element initialized to /// `value`. #[inline] pub const fn splat(value: $elem_ty) -> Self { Simd(codegen::$id($({ #[allow(non_camel_case_types, dead_code)] struct $elem_name; value as $ielem_ty }),*)) } /// Extracts the value at `index`. 
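// A usage sketch of the minimal API (`new`, `lanes`, `extract`, `replace`),
// using the concrete `i32x4` instantiation:
//
//     use packed_simd::i32x4;
//     let v = i32x4::new(0, 1, 2, 3);
//     assert_eq!(i32x4::lanes(), 4);
//     assert_eq!(v.extract(2), 2);        // panics if the index is out of bounds
//     let w = v.replace(0, 42);           // returns a new vector
//     assert_eq!(w.extract(0), 42);
//     assert_eq!(v.extract(0), 0);        // the original vector is unchanged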
/// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] pub fn extract(self, index: usize) -> $elem_ty { assert!(index < $elem_count); unsafe { self.extract_unchecked(index) } } /// Extracts the value at `index`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty { use crate::llvm::simd_extract; let e: $ielem_ty = simd_extract(self.0, index as u32); e as $elem_ty } /// Returns a new vector where the value at `index` is replaced by `new_value`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] #[must_use = "replace does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub fn replace(self, index: usize, new_value: $elem_ty) -> Self { assert!(index < $elem_count); unsafe { self.replace_unchecked(index, new_value) } } /// Returns a new vector where the value at `index` is replaced by `new_value`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] #[must_use = "replace_unchecked does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub unsafe fn replace_unchecked( self, index: usize, new_value: $elem_ty, ) -> Self { use crate::llvm::simd_insert; Simd(simd_insert(self.0, index as u32, new_value as $ielem_ty)) } } test_if!{ $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _minimal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn minimal() { // lanes: assert_eq!($elem_count, $id::lanes()); // splat and extract / extract_unchecked: const VAL: $elem_ty = 7 as $elem_ty; const VEC: $id = $id::splat(VAL); for i in 0..$id::lanes() { assert_eq!(VAL, VEC.extract(i)); assert_eq!( VAL, unsafe { VEC.extract_unchecked(i) } ); } // replace / replace_unchecked let new_vec = VEC.replace(0, 42 as $elem_ty); for i in 0..$id::lanes() { if i == 0 { assert_eq!(42 as $elem_ty, new_vec.extract(i)); } else { assert_eq!(VAL, new_vec.extract(i)); } } let new_vec = unsafe { VEC.replace_unchecked(0, 42 as $elem_ty) }; for i in 0..$id::lanes() { if i == 0 { assert_eq!(42 as $elem_ty, new_vec.extract(i)); } else { assert_eq!(VAL, new_vec.extract(i)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn extract_panic_oob() { const VAL: $elem_ty = 7 as $elem_ty; const VEC: $id = $id::splat(VAL); let _ = VEC.extract($id::lanes()); } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn replace_panic_oob() { const VAL: $elem_ty = 7 as $elem_ty; const VEC: $id = $id::splat(VAL); let _ = VEC.replace($id::lanes(), 42 as $elem_ty); } } } } } } ================================================ FILE: src/api/minimal/mask.rs ================================================ //! Minimal API of mask vectors. macro_rules! 
impl_minimal_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt | $($elem_name:ident),+ | $(#[$doc:meta])*) => { $(#[$doc])* pub type $id = Simd<[$elem_ty; $elem_count]>; impl sealed::Simd for $id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } impl $id { /// Creates a new instance with each vector elements initialized /// with the provided values. #[inline] #[allow(clippy::too_many_arguments)] pub const fn new($($elem_name: bool),*) -> Self { Simd(codegen::$id($(Self::bool_to_internal($elem_name)),*)) } /// Converts a boolean type into the type of the vector lanes. #[inline] #[allow(clippy::indexing_slicing)] const fn bool_to_internal(x: bool) -> $ielem_ty { [0 as $ielem_ty, !(0 as $ielem_ty)][x as usize] } /// Returns the number of vector lanes. #[inline] pub const fn lanes() -> usize { $elem_count } /// Constructs a new instance with each element initialized to /// `value`. #[inline] pub const fn splat(value: bool) -> Self { Simd(codegen::$id($({ #[allow(non_camel_case_types, dead_code)] struct $elem_name; Self::bool_to_internal(value) }),*)) } /// Extracts the value at `index`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] pub fn extract(self, index: usize) -> bool { assert!(index < $elem_count); unsafe { self.extract_unchecked(index) } } /// Extracts the value at `index`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn extract_unchecked(self, index: usize) -> bool { use crate::llvm::simd_extract; let x: $ielem_ty = simd_extract(self.0, index as u32); x != 0 } /// Returns a new vector where the value at `index` is replaced by /// `new_value`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] #[must_use = "replace does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub fn replace(self, index: usize, new_value: bool) -> Self { assert!(index < $elem_count); unsafe { self.replace_unchecked(index, new_value) } } /// Returns a new vector where the value at `index` is replaced by /// `new_value`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] #[must_use = "replace_unchecked does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub unsafe fn replace_unchecked( self, index: usize, new_value: bool, ) -> Self { use crate::llvm::simd_insert; Simd(simd_insert(self.0, index as u32, Self::bool_to_internal(new_value))) } } test_if!{ $test_tt: paste::item! 
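// A usage sketch of the minimal mask API, using the concrete `m32x4`
// instantiation:
//
//     use packed_simd::m32x4;
//     let m = m32x4::new(true, false, true, false);
//     assert!(!m.extract(1));
//     let m = m.replace(1, true);         // returns a new mask
//     assert!(m.extract(1));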
{ pub mod [<$id _minimal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn minimal() { // TODO: test new // lanes: assert_eq!($elem_count, $id::lanes()); // splat and extract / extract_unchecked: let vec = $id::splat(true); for i in 0..$id::lanes() { assert_eq!(true, vec.extract(i)); assert_eq!(true, unsafe { vec.extract_unchecked(i) } ); } // replace / replace_unchecked let new_vec = vec.replace(0, false); for i in 0..$id::lanes() { if i == 0 { assert_eq!(false, new_vec.extract(i)); } else { assert_eq!(true, new_vec.extract(i)); } } let new_vec = unsafe { vec.replace_unchecked(0, false) }; for i in 0..$id::lanes() { if i == 0 { assert_eq!(false, new_vec.extract(i)); } else { assert_eq!(true, new_vec.extract(i)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn extract_panic_oob() { let vec = $id::splat(false); let _ = vec.extract($id::lanes()); } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn replace_panic_oob() { let vec = $id::splat(false); let _ = vec.replace($id::lanes(), true); } } } } } } ================================================ FILE: src/api/minimal/ptr.rs ================================================ //! Minimal API of pointer vectors. macro_rules! impl_minimal_p { ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident, $usize_ty:ident, $isize_ty:ident | $ref:ident | $test_tt:tt | $($elem_name:ident),+ | ($true:expr, $false:expr) | $(#[$doc:meta])*) => { $(#[$doc])* pub type $id = Simd<[$elem_ty; $elem_count]>; impl sealed::Simd for $id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } impl $id { /// Creates a new instance with each vector elements initialized /// with the provided values. #[inline] #[allow(clippy::too_many_arguments)] pub const fn new($($elem_name: $elem_ty),*) -> Self { Simd(codegen::$id($($elem_name),*)) } /// Returns the number of vector lanes. #[inline] pub const fn lanes() -> usize { $elem_count } /// Constructs a new instance with each element initialized to /// `value`. #[inline] pub const fn splat(value: $elem_ty) -> Self { Simd(codegen::$id($({ #[allow(non_camel_case_types, dead_code)] struct $elem_name; value }),*)) } /// Constructs a new instance with each element initialized to /// `null`. #[inline] pub const fn null() -> Self { Self::splat(crate::ptr::null_mut() as $elem_ty) } /// Returns a mask that selects those lanes that contain `null` /// pointers. #[inline] pub fn is_null(self) -> $mask_ty { self.eq(Self::null()) } /// Extracts the value at `index`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] pub fn extract(self, index: usize) -> $elem_ty { assert!(index < $elem_count); unsafe { self.extract_unchecked(index) } } /// Extracts the value at `index`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty { use crate::llvm::simd_extract; simd_extract(self.0, index as u32) } /// Returns a new vector where the value at `index` is replaced by /// `new_value`. /// /// # Panics /// /// If `index >= Self::lanes()`. 
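// A usage sketch of the pointer-vector API (`splat`, `is_null`, `replace`),
// assuming the crate's `cptrx4<T>` alias for vectors of four `*const T`:
//
//     use packed_simd::cptrx4;
//     let x = 7_i32;
//     let v = cptrx4::<i32>::splat(&x as *const i32);
//     assert!(!v.is_null().any());
//     let v = v.replace(0, core::ptr::null());
//     assert!(v.is_null().any() && !v.is_null().all());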
#[inline] #[must_use = "replace does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] #[allow(clippy::not_unsafe_ptr_arg_deref)] pub fn replace(self, index: usize, new_value: $elem_ty) -> Self { assert!(index < $elem_count); unsafe { self.replace_unchecked(index, new_value) } } /// Returns a new vector where the value at `index` is replaced by `new_value`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] #[must_use = "replace_unchecked does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub unsafe fn replace_unchecked( self, index: usize, new_value: $elem_ty, ) -> Self { use crate::llvm::simd_insert; Simd(simd_insert(self.0, index as u32, new_value)) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _minimal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn minimal() { // lanes: assert_eq!($elem_count, $id::::lanes()); // splat and extract / extract_unchecked: let VAL7: <$id as sealed::Simd>::Element = $ref!(7); let VAL42: <$id as sealed::Simd>::Element = $ref!(42); let VEC: $id = $id::splat(VAL7); for i in 0..$id::::lanes() { assert_eq!(VAL7, VEC.extract(i)); assert_eq!( VAL7, unsafe { VEC.extract_unchecked(i) } ); } // replace / replace_unchecked let new_vec = VEC.replace(0, VAL42); for i in 0..$id::::lanes() { if i == 0 { assert_eq!(VAL42, new_vec.extract(i)); } else { assert_eq!(VAL7, new_vec.extract(i)); } } let new_vec = unsafe { VEC.replace_unchecked(0, VAL42) }; for i in 0..$id::::lanes() { if i == 0 { assert_eq!(VAL42, new_vec.extract(i)); } else { assert_eq!(VAL7, new_vec.extract(i)); } } let mut n = $id::::null(); assert_eq!( n, $id::::splat(unsafe { crate::mem::zeroed() }) ); assert!(n.is_null().all()); n = n.replace( 0, unsafe { crate::mem::transmute(1_isize) } ); assert!(!n.is_null().all()); if $id::::lanes() > 1 { assert!(n.is_null().any()); } else { assert!(!n.is_null().any()); } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn extract_panic_oob() { let VAL: <$id as sealed::Simd>::Element = $ref!(7); let VEC: $id = $id::splat(VAL); let _ = VEC.extract($id::::lanes()); } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn replace_panic_oob() { let VAL: <$id as sealed::Simd>::Element = $ref!(7); let VAL42: <$id as sealed::Simd>::Element = $ref!(42); let VEC: $id = $id::splat(VAL); let _ = VEC.replace($id::::lanes(), VAL42); } } } } impl crate::fmt::Debug for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!( f, "{}<{}>(", stringify!($id), crate::intrinsics::type_name::() )?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _fmt_debug>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn debug() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::::default(); let mut s = TinyString::new(); write!(&mut s, "{:?}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!( s.starts_with(beg.as_str()), "s = {} (should start with = {})", s, beg ); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "") .replace(")", "").split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:?}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } impl Default for $id { #[inline] fn default() -> Self { // FIXME: ptrs do not implement default Self::null() } } test_if!{ $test_tt: paste::item! { pub mod [<$id _default>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn default() { let a = $id::::default(); for i in 0..$id::::lanes() { assert_eq!( a.extract(i), unsafe { crate::mem::zeroed() } ); } } } } } impl $id { /// Lane-wise equality comparison. #[inline] pub fn eq(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_eq; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_eq(a.0, b.0)) } } /// Lane-wise inequality comparison. #[inline] pub fn ne(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_ne; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_ne(a.0, b.0)) } } /// Lane-wise less-than comparison. #[inline] pub fn lt(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_lt; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_lt(a.0, b.0)) } } /// Lane-wise less-than-or-equals comparison. #[inline] pub fn le(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_le; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_le(a.0, b.0)) } } /// Lane-wise greater-than comparison. #[inline] pub fn gt(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_gt; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_gt(a.0, b.0)) } } /// Lane-wise greater-than-or-equals comparison. #[inline] pub fn ge(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_ge; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_ge(a.0, b.0)) } } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _cmp_vertical>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn cmp() { let a = $id::::null(); let b = $id::::splat(unsafe { crate::mem::transmute(1_isize) }); let r = a.lt(b); let e = $mask_ty::splat(true); assert!(r == e); let r = a.le(b); assert!(r == e); let e = $mask_ty::splat(false); let r = a.gt(b); assert!(r == e); let r = a.ge(b); assert!(r == e); let r = a.eq(b); assert!(r == e); let mut a = a; let mut b = b; let mut e = e; for i in 0..$id::::lanes() { if i % 2 == 0 { a = a.replace( i, unsafe { crate::mem::transmute(0_isize) } ); b = b.replace( i, unsafe { crate::mem::transmute(1_isize) } ); e = e.replace(i, true); } else { a = a.replace( i, unsafe { crate::mem::transmute(1_isize) } ); b = b.replace( i, unsafe { crate::mem::transmute(0_isize) } ); e = e.replace(i, false); } } let r = a.lt(b); assert!(r == e); } } } } #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq<$id> for $id { #[inline] fn eq(&self, other: &Self) -> bool { $id::::eq(*self, *other).all() } #[inline] fn ne(&self, other: &Self) -> bool { $id::::ne(*self, *other).any() } } // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892 #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq>> for LexicographicallyOrdered<$id> { #[inline] fn eq(&self, other: &Self) -> bool { self.0 == other.0 } #[inline] fn ne(&self, other: &Self) -> bool { self.0 != other.0 } } test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_PartialEq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_eq() { let a = $id::::null(); let b = $id::::splat(unsafe { crate::mem::transmute(1_isize) }); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); if $id::::lanes() > 1 { let a = $id::::null().replace(0, unsafe { crate::mem::transmute(1_isize) }); let b = $id::::splat(unsafe { crate::mem::transmute(1_isize) }); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); } } } } } impl crate::cmp::Eq for $id {} impl crate::cmp::Eq for LexicographicallyOrdered<$id> {} test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_eq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn eq() { fn foo(_: E) {} let a = $id::::null(); foo(a); } } } } impl From<[$elem_ty; $elem_count]> for $id { #[inline] fn from(array: [$elem_ty; $elem_count]) -> Self { unsafe { // FIXME: unnecessary zeroing; better than UB. let mut u: Self = crate::mem::zeroed(); crate::ptr::copy_nonoverlapping( &array as *const [$elem_ty; $elem_count] as *const u8, &mut u as *mut Self as *mut u8, crate::mem::size_of::() ); u } } } impl Into<[$elem_ty; $elem_count]> for $id { #[inline] fn into(self) -> [$elem_ty; $elem_count] { unsafe { // FIXME: unnecessary zeroing; better than UB. let mut u: [$elem_ty; $elem_count] = crate::mem::zeroed(); crate::ptr::copy_nonoverlapping( &self as *const $id as *const u8, &mut u as *mut [$elem_ty; $elem_count] as *mut u8, crate::mem::size_of::() ); u } } } test_if!{ $test_tt: paste::item! 
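// A usage sketch of the array conversions implemented above, assuming the
// crate's `mptrx2<T>` alias for vectors of two `*mut T`:
//
//     use packed_simd::mptrx2;
//     let (mut a, mut b) = (1_i32, 2_i32);
//     let arr = [&mut a as *mut i32, &mut b as *mut i32];
//     let v = mptrx2::<i32>::from(arr);          // array -> vector
//     let back: [*mut i32; 2] = v.into();        // vector -> array
//     assert_eq!(back, arr);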
{ pub mod [<$id _from>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn array() { let values = [1_i32; $elem_count]; let mut vec: $id = Default::default(); let mut array = [ $id::::null().extract(0); $elem_count ]; for i in 0..$elem_count { let ptr = &values[i] as *const i32 as *mut i32; vec = vec.replace(i, ptr); array[i] = ptr; } // FIXME: there is no impl of From<$id> for [$elem_ty; N] // let a0 = From::from(vec); // assert_eq!(a0, array); #[allow(unused_assignments)] let mut a1 = array; a1 = vec.into(); assert_eq!(a1, array); let v0: $id = From::from(array); assert_eq!(v0, vec); let v1: $id = array.into(); assert_eq!(v1, vec); } } } } impl $id { /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary. #[inline] pub fn from_slice_aligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr(); assert!( target_ptr.align_offset(crate::mem::align_of::()) == 0 ); Self::from_slice_aligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn from_slice_unaligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); Self::from_slice_unaligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary, the behavior is undefined. #[inline] pub unsafe fn from_slice_aligned_unchecked(slice: &[$elem_ty]) -> Self { #[allow(clippy::cast_ptr_alignment)] *(slice.as_ptr().cast()) } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn from_slice_unaligned_unchecked( slice: &[$elem_ty], ) -> Self { use crate::mem::size_of; let target_ptr = slice.as_ptr().cast(); let mut x = Self::splat(crate::ptr::null_mut() as $elem_ty); let self_ptr = &mut x as *mut Self as *mut u8; crate::ptr::copy_nonoverlapping( target_ptr, self_ptr, size_of::(), ); x } } test_if!{ $test_tt: paste::item! { pub mod [<$id _slice_from_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_unaligned() { let (null, non_null) = ptr_vals!($id); let mut unaligned = [ non_null; $id::::lanes() + 1 ]; unaligned[0] = null; let vec = $id::::from_slice_unaligned( &unaligned[1..] ); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_unaligned_fail() { let (_null, non_null) = ptr_vals!($id); let unaligned = [non_null; $id::::lanes() + 1]; // the slice is not large enough => panic let _vec = $id::::from_slice_unaligned( &unaligned[2..] 
); } union A { data: [<$id as sealed::Simd>::Element; 2 * $id::::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_aligned() { let (null, non_null) = ptr_vals!($id); let mut aligned = A { data: [null; 2 * $id::::lanes()], }; for i in $id::::lanes()..(2 * $id::::lanes()) { unsafe { aligned.data[i] = non_null; } } let vec = unsafe { $id::::from_slice_aligned( &aligned.data[$id::::lanes()..] ) }; for (index, &b) in unsafe { aligned.data.iter().enumerate() } { if index < $id::::lanes() { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!( b, vec.extract(index - $id::::lanes()) ); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_lanes() { let (_null, non_null) = ptr_vals!($id); let aligned = A { data: [non_null; 2 * $id::::lanes()], }; // the slice is not large enough => panic let _vec = unsafe { $id::::from_slice_aligned( &aligned.data[2 * $id::::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_align() { unsafe { let (null, _non_null) = ptr_vals!($id); let aligned = A { data: [null; 2 * $id::::lanes()], }; // get a pointer to the front of data let ptr = aligned.data.as_ptr(); // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset( crate::mem::align_of::<$id>() ) == 0 { // the pointer is properly aligned, so // from_slice_aligned won't fail here (e.g. this // can happen for i128x1). So we panic to make // the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s = slice::from_raw_parts( ptr, $id::::lanes() ); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let _vec = $id::::from_slice_aligned(s); } } } } } impl $id { /// Writes the values of the vector to the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary. #[inline] pub fn write_to_slice_aligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr(); assert!( target_ptr.align_offset(crate::mem::align_of::()) == 0 ); self.write_to_slice_aligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn write_to_slice_unaligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); self.write_to_slice_unaligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary, the behavior is /// undefined. #[inline] pub unsafe fn write_to_slice_aligned_unchecked( self, slice: &mut [$elem_ty], ) { #[allow(clippy::cast_ptr_alignment)] *(slice.as_mut_ptr().cast()) = self; } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. 
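// The same `from_slice_*` / `write_to_slice_*` API is implemented for the
// portable vectors as well; a sketch using the concrete `f32x4` type:
//
//     use packed_simd::f32x4;
//     let src = [1.0_f32, 2.0, 3.0, 4.0];
//     let v = f32x4::from_slice_unaligned(&src);   // panics if src.len() < 4
//     let mut dst = [0.0_f32; 4];
//     v.write_to_slice_unaligned(&mut dst);
//     assert_eq!(dst, src);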
#[inline] pub unsafe fn write_to_slice_unaligned_unchecked( self, slice: &mut [$elem_ty], ) { let target_ptr = slice.as_mut_ptr().cast(); let self_ptr = &self as *const Self as *const u8; crate::ptr::copy_nonoverlapping( self_ptr, target_ptr, crate::mem::size_of::(), ); } } test_if!{ $test_tt: paste::item! { pub mod [<$id _slice_write_to_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_unaligned() { let (null, non_null) = ptr_vals!($id); let mut unaligned = [null; $id::::lanes() + 1]; let vec = $id::::splat(non_null); vec.write_to_slice_unaligned(&mut unaligned[1..]); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_unaligned_fail() { let (null, non_null) = ptr_vals!($id); let mut unaligned = [null; $id::::lanes() + 1]; let vec = $id::::splat(non_null); // the slice is not large enough => panic vec.write_to_slice_unaligned(&mut unaligned[2..]); } union A { data: [<$id as sealed::Simd>::Element; 2 * $id::::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_aligned() { let (null, non_null) = ptr_vals!($id); let mut aligned = A { data: [null; 2 * $id::::lanes()], }; let vec = $id::::splat(non_null); unsafe { vec.write_to_slice_aligned( &mut aligned.data[$id::::lanes()..] ) }; for (index, &b) in unsafe { aligned.data.iter().enumerate() } { if index < $id::::lanes() { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!( b, vec.extract(index - $id::::lanes()) ); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_lanes() { let (null, non_null) = ptr_vals!($id); let mut aligned = A { data: [null; 2 * $id::::lanes()], }; let vec = $id::::splat(non_null); // the slice is not large enough => panic unsafe { vec.write_to_slice_aligned( &mut aligned.data[2 * $id::::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_align() { let (null, non_null) = ptr_vals!($id); unsafe { let mut aligned = A { data: [null; 2 * $id::::lanes()], }; // get a pointer to the front of data let ptr = aligned.data.as_mut_ptr(); // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset( crate::mem::align_of::<$id>() ) == 0 { // the pointer is properly aligned, so // write_to_slice_aligned won't fail here (e.g. // this can happen for i128x1). 
So we panic to // make the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s = slice::from_raw_parts_mut( ptr, $id::::lanes() ); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let vec = $id::::splat(non_null); vec.write_to_slice_aligned(s); } } } } } impl crate::hash::Hash for $id { #[inline] fn hash(&self, state: &mut H) { let s: $usize_ty = unsafe { crate::mem::transmute(*self) }; s.hash(state) } } test_if! { $test_tt: paste::item! { pub mod [<$id _hash>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn hash() { use crate::hash::{Hash, Hasher}; #[allow(deprecated)] use crate::hash::{SipHasher13}; let values = [1_i32; $elem_count]; let mut vec: $id = Default::default(); let mut array = [ $id::::null().extract(0); $elem_count ]; for i in 0..$elem_count { let ptr = &values[i] as *const i32 as *mut i32; vec = vec.replace(i, ptr); array[i] = ptr; } #[allow(deprecated)] let mut a_hash = SipHasher13::new(); let mut v_hash = a_hash.clone(); array.hash(&mut a_hash); vec.hash(&mut v_hash); assert_eq!(a_hash.finish(), v_hash.finish()); } } } } impl $id { /// Calculates the offset from a pointer. /// /// `count` is in units of `T`; e.g. a count of `3` represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// If any of the following conditions are violated, the result is /// Undefined Behavior: /// /// * Both the starting and resulting pointer must be either in /// bounds or one byte past the end of an allocated object. /// /// * The computed offset, in bytes, cannot overflow an `isize`. /// /// * The offset being in bounds cannot rely on "wrapping around" /// the address space. That is, the infinite-precision sum, in bytes /// must fit in a `usize`. /// /// The compiler and standard library generally tries to ensure /// allocations never reach a size where an offset is a concern. For /// instance, `Vec` and `Box` ensure they never allocate more than /// `isize::MAX` bytes, so `vec.as_ptr().offset(vec.len() as isize)` /// is always safe. /// /// Most platforms fundamentally can't even construct such an /// allocation. For instance, no known 64-bit platform can ever /// serve a request for 263 bytes due to page-table limitations or /// splitting the address space. However, some 32-bit and 16-bit /// platforms may successfully serve a request for more than /// `isize::MAX` bytes with things like Physical Address Extension. /// As such, memory acquired directly from allocators or memory /// mapped files may be too large to handle with this function. /// /// Consider using `wrapping_offset` instead if these constraints /// are difficult to satisfy. The only advantage of this method is /// that it enables more aggressive compiler optimizations. #[inline] pub unsafe fn offset(self, count: $isize_ty) -> Self { // FIXME: should use LLVM's `add nsw nuw` self.wrapping_offset(count) } /// Calculates the offset from a pointer using wrapping arithmetic. /// /// `count` is in units of `T`; e.g. a count of `3` represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// The resulting pointer does not need to be in bounds, but it is /// potentially hazardous to dereference (which requires unsafe). 
///
/// Always use `.offset(count)` instead when possible, because
/// offset allows the compiler to optimize better.
#[inline]
pub fn wrapping_offset(self, count: $isize_ty) -> Self {
    unsafe {
        let x: $isize_ty = crate::mem::transmute(self);
        // note: {+,*} currently performs a `wrapping_{add, mul}`
        crate::mem::transmute(
            x + (count * crate::mem::size_of::<T>() as isize)
        )
    }
}

/// Calculates the distance between two pointers.
///
/// The returned value is in units of `T`: the distance in bytes is
/// divided by `mem::size_of::<T>()`.
///
/// This function is the inverse of `offset`.
///
/// # Safety
///
/// If any of the following conditions are violated, the result is
/// Undefined Behavior:
///
/// * Both the starting and other pointer must be either in bounds
/// or one byte past the end of the same allocated object.
///
/// * The distance between the pointers, in bytes, cannot overflow
/// an `isize`.
///
/// * The distance between the pointers, in bytes, must be an exact
/// multiple of the size of `T`.
///
/// * The distance being in bounds cannot rely on "wrapping around"
/// the address space.
///
/// The compiler and standard library generally try to ensure
/// allocations never reach a size where an offset is a concern. For
/// instance, `Vec` and `Box` ensure they never allocate more than
/// `isize::MAX` bytes, so `ptr_into_vec.offset_from(vec.as_ptr())`
/// is always safe.
///
/// Most platforms fundamentally can't even construct such an
/// allocation. For instance, no known 64-bit platform can ever
/// serve a request for 2^63 bytes due to page-table limitations or
/// splitting the address space. However, some 32-bit and 16-bit
/// platforms may successfully serve a request for more than
/// `isize::MAX` bytes with things like Physical Address Extension.
/// As such, memory acquired directly from allocators or memory
/// mapped files may be too large to handle with this function.
///
/// Consider using `wrapping_offset_from` instead if these constraints
/// are difficult to satisfy. The only advantage of this method is
/// that it enables more aggressive compiler optimizations.
#[inline]
pub unsafe fn offset_from(self, origin: Self) -> $isize_ty {
    // FIXME: should use LLVM's `sub nsw nuw`.
    self.wrapping_offset_from(origin)
}

/// Calculates the distance between two pointers.
///
/// The returned value is in units of `T`: the distance in bytes is
/// divided by `mem::size_of::<T>()`.
///
/// If the address difference between the two pointers is not a
/// multiple of `mem::size_of::<T>()` then the result of the
/// division is rounded towards zero.
///
/// Though this method is safe for any two pointers, note that its
/// result will be mostly useless if the two pointers aren't into
/// the same allocated object, for example if they point to two
/// different local variables.
#[inline]
pub fn wrapping_offset_from(self, origin: Self) -> $isize_ty {
    let x: $isize_ty = unsafe { crate::mem::transmute(self) };
    let y: $isize_ty = unsafe { crate::mem::transmute(origin) };
    // note: {-,/} currently perform wrapping_{sub, div}
    (y - x) / (crate::mem::size_of::<T>() as isize)
}

/// Calculates the offset from a pointer (convenience for
/// `.offset(count as isize)`).
///
/// `count` is in units of `T`; e.g. a count of 3 represents a
/// pointer offset of `3 * size_of::<T>()` bytes.
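///
/// # Example
///
/// A small, hedged sketch; `cptrx4<i32>` and `usizex4` are assumed to be
/// concrete vector types provided elsewhere in this crate:
///
/// ```ignore
/// let a = [0_i32, 1, 2, 3];
/// // All four lanes start at &a[0]; `add` then advances lane i by idx[i]
/// // elements.
/// let ptrs = cptrx4::<i32>::splat(a.as_ptr());
/// let idx = usizex4::new(0, 1, 2, 3);
/// let ptrs = unsafe { ptrs.add(idx) };
/// assert_eq!(ptrs.extract(3), unsafe { a.as_ptr().add(3) });
/// ```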
///
/// # Safety
///
/// If any of the following conditions are violated, the result is
/// Undefined Behavior:
///
/// * Both the starting and resulting pointer must be either in
/// bounds or one byte past the end of an allocated object.
///
/// * The computed offset, in bytes, cannot overflow an `isize`.
///
/// * The offset being in bounds cannot rely on "wrapping around"
/// the address space. That is, the infinite-precision sum must fit
/// in a `usize`.
///
/// The compiler and standard library generally try to ensure
/// allocations never reach a size where an offset is a concern. For
/// instance, `Vec` and `Box` ensure they never allocate more than
/// `isize::MAX` bytes, so `vec.as_ptr().add(vec.len())` is always
/// safe.
///
/// Most platforms fundamentally can't even construct such an
/// allocation. For instance, no known 64-bit platform can ever
/// serve a request for 2^63 bytes due to page-table limitations or
/// splitting the address space. However, some 32-bit and 16-bit
/// platforms may successfully serve a request for more than
/// `isize::MAX` bytes with things like Physical Address Extension.
/// As such, memory acquired directly from allocators or memory
/// mapped files may be too large to handle with this function.
///
/// Consider using `wrapping_offset` instead if these constraints
/// are difficult to satisfy. The only advantage of this method is
/// that it enables more aggressive compiler optimizations.
#[inline]
#[allow(clippy::should_implement_trait)]
pub unsafe fn add(self, count: $usize_ty) -> Self {
    self.offset(count.cast())
}

/// Calculates the offset from a pointer (convenience for
/// `.offset((count as isize).wrapping_neg())`).
///
/// `count` is in units of `T`; e.g. a `count` of 3 represents a
/// pointer offset of `3 * size_of::<T>()` bytes.
///
/// # Safety
///
/// If any of the following conditions are violated, the result is
/// Undefined Behavior:
///
/// * Both the starting and resulting pointer must be either in
/// bounds or one byte past the end of an allocated object.
///
/// * The computed offset cannot exceed `isize::MAX` **bytes**.
///
/// * The offset being in bounds cannot rely on "wrapping around"
/// the address space. That is, the infinite-precision sum must fit
/// in a `usize`.
///
/// The compiler and standard library generally try to ensure
/// allocations never reach a size where an offset is a concern. For
/// instance, `Vec` and `Box` ensure they never allocate more than
/// `isize::MAX` bytes, so
/// `vec.as_ptr().add(vec.len()).sub(vec.len())` is always safe.
///
/// Most platforms fundamentally can't even construct such an
/// allocation. For instance, no known 64-bit platform can ever
/// serve a request for 2^63 bytes due to page-table
/// limitations or splitting the address space. However, some 32-bit
/// and 16-bit platforms may successfully serve a request for more
/// than `isize::MAX` bytes with things like Physical Address
/// Extension. As such, memory acquired directly from allocators or
/// memory mapped files *may* be too large to handle with this
/// function.
///
/// Consider using `wrapping_offset` instead if these constraints
/// are difficult to satisfy. The only advantage of this method is
/// that it enables more aggressive compiler optimizations.
#[inline]
#[allow(clippy::should_implement_trait)]
pub unsafe fn sub(self, count: $usize_ty) -> Self {
    let x: $isize_ty = count.cast();
    // note: - is currently wrapping_neg
    self.offset(-x)
}

/// Calculates the offset from a pointer using wrapping arithmetic.
/// (convenience for `.wrapping_offset(count as isize)`) /// /// `count` is in units of T; e.g. a `count` of 3 represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// The resulting pointer does not need to be in bounds, but it is /// potentially hazardous to dereference (which requires `unsafe`). /// /// Always use `.add(count)` instead when possible, because `add` /// allows the compiler to optimize better. #[inline] pub fn wrapping_add(self, count: $usize_ty) -> Self { self.wrapping_offset(count.cast()) } /// Calculates the offset from a pointer using wrapping arithmetic. /// (convenience for `.wrapping_offset((count as /// isize).wrapping_sub())`) /// /// `count` is in units of T; e.g. a `count` of 3 represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// The resulting pointer does not need to be in bounds, but it is /// potentially hazardous to dereference (which requires `unsafe`). /// /// Always use `.sub(count)` instead when possible, because `sub` /// allows the compiler to optimize better. #[inline] pub fn wrapping_sub(self, count: $usize_ty) -> Self { let x: $isize_ty = count.cast(); self.wrapping_offset(-1 * x) } } impl $id { /// Shuffle vector elements according to `indices`. #[inline] pub fn shuffle1_dyn(self, indices: I) -> Self where Self: codegen::shuffle1_dyn::Shuffle1Dyn, { codegen::shuffle1_dyn::Shuffle1Dyn::shuffle1_dyn(self, indices) } } test_if! { $test_tt: paste::item! { pub mod [<$id _shuffle1_dyn>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn shuffle1_dyn() { let (null, non_null) = ptr_vals!($id); // alternating = [non_null, null, non_null, null, ...] let mut alternating = $id::::splat(null); for i in 0..$id::::lanes() { if i % 2 == 0 { alternating = alternating.replace(i, non_null); } } type Indices = <$id as codegen::shuffle1_dyn::Shuffle1Dyn>::Indices; // even = [0, 0, 2, 2, 4, 4, ..] let even = { let mut v = Indices::splat(0); for i in 0..$id::::lanes() { if i % 2 == 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 - 1).into()); } } v }; // odd = [1, 1, 3, 3, 5, 5, ...] let odd = { let mut v = Indices::splat(0); for i in 0..$id::::lanes() { if i % 2 != 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 + 1).into()); } } v }; assert_eq!( alternating.shuffle1_dyn(even), $id::::splat(non_null) ); if $id::::lanes() > 1 { assert_eq!( alternating.shuffle1_dyn(odd), $id::::splat(null) ); } } } } } }; } ================================================ FILE: src/api/minimal.rs ================================================ #[macro_use] mod iuf; #[macro_use] mod mask; #[macro_use] mod ptr; ================================================ FILE: src/api/ops/scalar_arithmetic.rs ================================================ //! Vertical (lane-wise) vector-scalar / scalar-vector arithmetic operations. macro_rules! 
impl_ops_scalar_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Add<$elem_ty> for $id { type Output = Self; #[inline] fn add(self, other: $elem_ty) -> Self { self + $id::splat(other) } } impl crate::ops::Add<$id> for $elem_ty { type Output = $id; #[inline] fn add(self, other: $id) -> $id { $id::splat(self) + other } } impl crate::ops::Sub<$elem_ty> for $id { type Output = Self; #[inline] fn sub(self, other: $elem_ty) -> Self { self - $id::splat(other) } } impl crate::ops::Sub<$id> for $elem_ty { type Output = $id; #[inline] fn sub(self, other: $id) -> $id { $id::splat(self) - other } } impl crate::ops::Mul<$elem_ty> for $id { type Output = Self; #[inline] fn mul(self, other: $elem_ty) -> Self { self * $id::splat(other) } } impl crate::ops::Mul<$id> for $elem_ty { type Output = $id; #[inline] fn mul(self, other: $id) -> $id { $id::splat(self) * other } } impl crate::ops::Div<$elem_ty> for $id { type Output = Self; #[inline] fn div(self, other: $elem_ty) -> Self { self / $id::splat(other) } } impl crate::ops::Div<$id> for $elem_ty { type Output = $id; #[inline] fn div(self, other: $id) -> $id { $id::splat(self) / other } } impl crate::ops::Rem<$elem_ty> for $id { type Output = Self; #[inline] fn rem(self, other: $elem_ty) -> Self { self % $id::splat(other) } } impl crate::ops::Rem<$id> for $elem_ty { type Output = $id; #[inline] fn rem(self, other: $id) -> $id { $id::splat(self) % other } } impl crate::ops::AddAssign<$elem_ty> for $id { #[inline] fn add_assign(&mut self, other: $elem_ty) { *self = *self + other; } } impl crate::ops::SubAssign<$elem_ty> for $id { #[inline] fn sub_assign(&mut self, other: $elem_ty) { *self = *self - other; } } impl crate::ops::MulAssign<$elem_ty> for $id { #[inline] fn mul_assign(&mut self, other: $elem_ty) { *self = *self * other; } } impl crate::ops::DivAssign<$elem_ty> for $id { #[inline] fn div_assign(&mut self, other: $elem_ty) { *self = *self / other; } } impl crate::ops::RemAssign<$elem_ty> for $id { #[inline] fn rem_assign(&mut self, other: $elem_ty) { *self = *self % other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_scalar_arith>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_scalar_arithmetic() { let zi = 0 as $elem_ty; let oi = 1 as $elem_ty; let ti = 2 as $elem_ty; let fi = 4 as $elem_ty; let z = $id::splat(zi); let o = $id::splat(oi); let t = $id::splat(ti); let f = $id::splat(fi); // add assert_eq!(zi + z, z); assert_eq!(z + zi, z); assert_eq!(oi + z, o); assert_eq!(o + zi, o); assert_eq!(ti + z, t); assert_eq!(t + zi, t); assert_eq!(ti + t, f); assert_eq!(t + ti, f); // sub assert_eq!(zi - z, z); assert_eq!(z - zi, z); assert_eq!(oi - z, o); assert_eq!(o - zi, o); assert_eq!(ti - z, t); assert_eq!(t - zi, t); assert_eq!(fi - t, t); assert_eq!(f - ti, t); assert_eq!(f - o - o, t); assert_eq!(f - oi - oi, t); // mul assert_eq!(zi * z, z); assert_eq!(z * zi, z); assert_eq!(zi * o, z); assert_eq!(z * oi, z); assert_eq!(zi * t, z); assert_eq!(z * ti, z); assert_eq!(oi * t, t); assert_eq!(o * ti, t); assert_eq!(ti * t, f); assert_eq!(t * ti, f); // div assert_eq!(zi / o, z); assert_eq!(z / oi, z); assert_eq!(ti / o, t); assert_eq!(t / oi, t); assert_eq!(fi / o, f); assert_eq!(f / oi, f); assert_eq!(ti / t, o); assert_eq!(t / ti, o); assert_eq!(fi / t, t); assert_eq!(f / ti, t); // rem assert_eq!(oi % o, z); assert_eq!(o % oi, z); assert_eq!(fi % t, z); assert_eq!(f % ti, z); { let mut v = z; assert_eq!(v, z); v += oi; // add_assign assert_eq!(v, o); v -= oi; // sub_assign assert_eq!(v, z); v = t; v *= oi; // mul_assign assert_eq!(v, t); v *= ti; assert_eq!(v, f); v /= oi; // div_assign assert_eq!(v, f); v /= ti; assert_eq!(v, t); v %= ti; // rem_assign assert_eq!(v, z); } } } } } }; } ================================================ FILE: src/api/ops/scalar_bitwise.rs ================================================ //! Vertical (lane-wise) vector-scalar / scalar-vector bitwise operations. macro_rules! impl_ops_scalar_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::BitXor<$elem_ty> for $id { type Output = Self; #[inline] fn bitxor(self, other: $elem_ty) -> Self { self ^ $id::splat(other) } } impl crate::ops::BitXor<$id> for $elem_ty { type Output = $id; #[inline] fn bitxor(self, other: $id) -> $id { $id::splat(self) ^ other } } impl crate::ops::BitAnd<$elem_ty> for $id { type Output = Self; #[inline] fn bitand(self, other: $elem_ty) -> Self { self & $id::splat(other) } } impl crate::ops::BitAnd<$id> for $elem_ty { type Output = $id; #[inline] fn bitand(self, other: $id) -> $id { $id::splat(self) & other } } impl crate::ops::BitOr<$elem_ty> for $id { type Output = Self; #[inline] fn bitor(self, other: $elem_ty) -> Self { self | $id::splat(other) } } impl crate::ops::BitOr<$id> for $elem_ty { type Output = $id; #[inline] fn bitor(self, other: $id) -> $id { $id::splat(self) | other } } impl crate::ops::BitAndAssign<$elem_ty> for $id { #[inline] fn bitand_assign(&mut self, other: $elem_ty) { *self = *self & other; } } impl crate::ops::BitOrAssign<$elem_ty> for $id { #[inline] fn bitor_assign(&mut self, other: $elem_ty) { *self = *self | other; } } impl crate::ops::BitXorAssign<$elem_ty> for $id { #[inline] fn bitxor_assign(&mut self, other: $elem_ty) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_scalar_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_scalar_bitwise() { let zi = 0 as $elem_ty; let oi = 1 as $elem_ty; let ti = 2 as $elem_ty; let z = $id::splat(zi); let o = $id::splat(oi); let t = $id::splat(ti); // BitAnd: assert_eq!(oi & o, o); assert_eq!(o & oi, o); assert_eq!(oi & z, z); assert_eq!(o & zi, z); assert_eq!(zi & o, z); assert_eq!(z & oi, z); assert_eq!(zi & z, z); assert_eq!(z & zi, z); assert_eq!(ti & t, t); assert_eq!(t & ti, t); assert_eq!(ti & o, z); assert_eq!(t & oi, z); assert_eq!(oi & t, z); assert_eq!(o & ti, z); // BitOr: assert_eq!(oi | o, o); assert_eq!(o | oi, o); assert_eq!(oi | z, o); assert_eq!(o | zi, o); assert_eq!(zi | o, o); assert_eq!(z | oi, o); assert_eq!(zi | z, z); assert_eq!(z | zi, z); assert_eq!(ti | t, t); assert_eq!(t | ti, t); assert_eq!(zi | t, t); assert_eq!(z | ti, t); assert_eq!(ti | z, t); assert_eq!(t | zi, t); // BitXOR: assert_eq!(oi ^ o, z); assert_eq!(o ^ oi, z); assert_eq!(zi ^ z, z); assert_eq!(z ^ zi, z); assert_eq!(zi ^ o, o); assert_eq!(z ^ oi, o); assert_eq!(oi ^ z, o); assert_eq!(o ^ zi, o); assert_eq!(ti ^ t, z); assert_eq!(t ^ ti, z); assert_eq!(ti ^ z, t); assert_eq!(t ^ zi, t); assert_eq!(zi ^ t, t); assert_eq!(z ^ ti, t); { // AndAssign: let mut v = o; v &= ti; assert_eq!(v, z); } { // OrAssign: let mut v = z; v |= oi; assert_eq!(v, o); } { // XORAssign: let mut v = z; v ^= oi; assert_eq!(v, o); } } } } } }; } ================================================ FILE: src/api/ops/scalar_mask_bitwise.rs ================================================ //! Vertical (lane-wise) vector-vector bitwise operations. macro_rules! impl_ops_scalar_mask_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::BitXor for $id { type Output = Self; #[inline] fn bitxor(self, other: bool) -> Self { self ^ $id::splat(other) } } impl crate::ops::BitXor<$id> for bool { type Output = $id; #[inline] fn bitxor(self, other: $id) -> $id { $id::splat(self) ^ other } } impl crate::ops::BitAnd for $id { type Output = Self; #[inline] fn bitand(self, other: bool) -> Self { self & $id::splat(other) } } impl crate::ops::BitAnd<$id> for bool { type Output = $id; #[inline] fn bitand(self, other: $id) -> $id { $id::splat(self) & other } } impl crate::ops::BitOr for $id { type Output = Self; #[inline] fn bitor(self, other: bool) -> Self { self | $id::splat(other) } } impl crate::ops::BitOr<$id> for bool { type Output = $id; #[inline] fn bitor(self, other: $id) -> $id { $id::splat(self) | other } } impl crate::ops::BitAndAssign for $id { #[inline] fn bitand_assign(&mut self, other: bool) { *self = *self & other; } } impl crate::ops::BitOrAssign for $id { #[inline] fn bitor_assign(&mut self, other: bool) { *self = *self | other; } } impl crate::ops::BitXorAssign for $id { #[inline] fn bitxor_assign(&mut self, other: bool) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_scalar_mask_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_scalar_mask_bitwise() { let ti = true; let fi = false; let t = $id::splat(ti); let f = $id::splat(fi); assert!(t != f); assert!(!(t == f)); // BitAnd: assert_eq!(ti & f, f); assert_eq!(t & fi, f); assert_eq!(fi & t, f); assert_eq!(f & ti, f); assert_eq!(ti & t, t); assert_eq!(t & ti, t); assert_eq!(fi & f, f); assert_eq!(f & fi, f); // BitOr: assert_eq!(ti | f, t); assert_eq!(t | fi, t); assert_eq!(fi | t, t); assert_eq!(f | ti, t); assert_eq!(ti | t, t); assert_eq!(t | ti, t); assert_eq!(fi | f, f); assert_eq!(f | fi, f); // BitXOR: assert_eq!(ti ^ f, t); assert_eq!(t ^ fi, t); assert_eq!(fi ^ t, t); assert_eq!(f ^ ti, t); assert_eq!(ti ^ t, f); assert_eq!(t ^ ti, f); assert_eq!(fi ^ f, f); assert_eq!(f ^ fi, f); { // AndAssign: let mut v = f; v &= ti; assert_eq!(v, f); } { // OrAssign: let mut v = f; v |= ti; assert_eq!(v, t); } { // XORAssign: let mut v = f; v ^= ti; assert_eq!(v, t); } } } } } }; } ================================================ FILE: src/api/ops/scalar_shifts.rs ================================================ //! Vertical (lane-wise) vector-scalar shifts operations. macro_rules! impl_ops_scalar_shifts { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Shl for $id { type Output = Self; #[inline] fn shl(self, other: u32) -> Self { self << $id::splat(other as $elem_ty) } } impl crate::ops::Shr for $id { type Output = Self; #[inline] fn shr(self, other: u32) -> Self { self >> $id::splat(other as $elem_ty) } } impl crate::ops::ShlAssign for $id { #[inline] fn shl_assign(&mut self, other: u32) { *self = *self << other; } } impl crate::ops::ShrAssign for $id { #[inline] fn shr_assign(&mut self, other: u32) { *self = *self >> other; } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_scalar_shifts>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg_attr(any(target_arch = "s390x", target_arch = "sparc64"), allow(unreachable_code, unused_variables) )] #[cfg(not(target_arch = "aarch64"))] //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317 fn ops_scalar_shifts() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); { let zi = 0 as u32; let oi = 1 as u32; let ti = 2 as u32; let maxi = (mem::size_of::<$elem_ty>() * 8 - 1) as u32; // shr assert_eq!(z >> zi, z); assert_eq!(z >> oi, z); assert_eq!(z >> ti, z); assert_eq!(z >> ti, z); #[cfg(any(target_arch = "s390x", target_arch = "sparc64"))] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/13 return; } assert_eq!(o >> zi, o); assert_eq!(t >> zi, t); assert_eq!(f >> zi, f); assert_eq!(f >> maxi, z); assert_eq!(o >> oi, z); assert_eq!(t >> oi, o); assert_eq!(t >> ti, z); assert_eq!(f >> oi, t); assert_eq!(f >> ti, o); assert_eq!(f >> maxi, z); // shl assert_eq!(z << zi, z); assert_eq!(o << zi, o); assert_eq!(t << zi, t); assert_eq!(f << zi, f); assert_eq!(f << maxi, z); assert_eq!(o << oi, t); assert_eq!(o << ti, f); assert_eq!(t << oi, f); { // shr_assign let mut v = o; v >>= oi; assert_eq!(v, z); } { // shl_assign let mut v = o; v <<= oi; assert_eq!(v, t); } } } } } } }; } ================================================ FILE: src/api/ops/vector_arithmetic.rs ================================================ //! 
Vertical (lane-wise) vector-vector arithmetic operations. macro_rules! impl_ops_vector_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Add for $id { type Output = Self; #[inline] fn add(self, other: Self) -> Self { use crate::llvm::simd_add; unsafe { Simd(simd_add(self.0, other.0)) } } } impl crate::ops::Sub for $id { type Output = Self; #[inline] fn sub(self, other: Self) -> Self { use crate::llvm::simd_sub; unsafe { Simd(simd_sub(self.0, other.0)) } } } impl crate::ops::Mul for $id { type Output = Self; #[inline] fn mul(self, other: Self) -> Self { use crate::llvm::simd_mul; unsafe { Simd(simd_mul(self.0, other.0)) } } } impl crate::ops::Div for $id { type Output = Self; #[inline] fn div(self, other: Self) -> Self { use crate::llvm::simd_div; unsafe { Simd(simd_div(self.0, other.0)) } } } impl crate::ops::Rem for $id { type Output = Self; #[inline] fn rem(self, other: Self) -> Self { use crate::llvm::simd_rem; unsafe { Simd(simd_rem(self.0, other.0)) } } } impl crate::ops::AddAssign for $id { #[inline] fn add_assign(&mut self, other: Self) { *self = *self + other; } } impl crate::ops::SubAssign for $id { #[inline] fn sub_assign(&mut self, other: Self) { *self = *self - other; } } impl crate::ops::MulAssign for $id { #[inline] fn mul_assign(&mut self, other: Self) { *self = *self * other; } } impl crate::ops::DivAssign for $id { #[inline] fn div_assign(&mut self, other: Self) { *self = *self / other; } } impl crate::ops::RemAssign for $id { #[inline] fn rem_assign(&mut self, other: Self) { *self = *self % other; } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_vector_arith>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_vector_arithmetic() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); // add assert_eq!(z + z, z); assert_eq!(o + z, o); assert_eq!(t + z, t); assert_eq!(t + t, f); // sub assert_eq!(z - z, z); assert_eq!(o - z, o); assert_eq!(t - z, t); assert_eq!(f - t, t); assert_eq!(f - o - o, t); // mul assert_eq!(z * z, z); assert_eq!(z * o, z); assert_eq!(z * t, z); assert_eq!(o * t, t); assert_eq!(t * t, f); // div assert_eq!(z / o, z); assert_eq!(t / o, t); assert_eq!(f / o, f); assert_eq!(t / t, o); assert_eq!(f / t, t); // rem assert_eq!(o % o, z); assert_eq!(f % t, z); { let mut v = z; assert_eq!(v, z); v += o; // add_assign assert_eq!(v, o); v -= o; // sub_assign assert_eq!(v, z); v = t; v *= o; // mul_assign assert_eq!(v, t); v *= t; assert_eq!(v, f); v /= o; // div_assign assert_eq!(v, f); v /= t; assert_eq!(v, t); v %= t; // rem_assign assert_eq!(v, z); } } } } } }; } ================================================ FILE: src/api/ops/vector_bitwise.rs ================================================ //! Vertical (lane-wise) vector-vector bitwise operations. macro_rules! 
impl_ops_vector_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::Not for $id { type Output = Self; #[inline] fn not(self) -> Self { Self::splat($true) ^ self } } impl crate::ops::BitXor for $id { type Output = Self; #[inline] fn bitxor(self, other: Self) -> Self { use crate::llvm::simd_xor; unsafe { Simd(simd_xor(self.0, other.0)) } } } impl crate::ops::BitAnd for $id { type Output = Self; #[inline] fn bitand(self, other: Self) -> Self { use crate::llvm::simd_and; unsafe { Simd(simd_and(self.0, other.0)) } } } impl crate::ops::BitOr for $id { type Output = Self; #[inline] fn bitor(self, other: Self) -> Self { use crate::llvm::simd_or; unsafe { Simd(simd_or(self.0, other.0)) } } } impl crate::ops::BitAndAssign for $id { #[inline] fn bitand_assign(&mut self, other: Self) { *self = *self & other; } } impl crate::ops::BitOrAssign for $id { #[inline] fn bitor_assign(&mut self, other: Self) { *self = *self | other; } } impl crate::ops::BitXorAssign for $id { #[inline] fn bitxor_assign(&mut self, other: Self) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_vector_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_vector_bitwise() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let m = $id::splat(!z.extract(0)); // Not: assert_eq!(!z, m); assert_eq!(!m, z); // BitAnd: assert_eq!(o & o, o); assert_eq!(o & z, z); assert_eq!(z & o, z); assert_eq!(z & z, z); assert_eq!(t & t, t); assert_eq!(t & o, z); assert_eq!(o & t, z); // BitOr: assert_eq!(o | o, o); assert_eq!(o | z, o); assert_eq!(z | o, o); assert_eq!(z | z, z); assert_eq!(t | t, t); assert_eq!(z | t, t); assert_eq!(t | z, t); // BitXOR: assert_eq!(o ^ o, z); assert_eq!(z ^ z, z); assert_eq!(z ^ o, o); assert_eq!(o ^ z, o); assert_eq!(t ^ t, z); assert_eq!(t ^ z, t); assert_eq!(z ^ t, t); { // AndAssign: let mut v = o; v &= t; assert_eq!(v, z); } { // OrAssign: let mut v = z; v |= o; assert_eq!(v, o); } { // XORAssign: let mut v = z; v ^= o; assert_eq!(v, o); } } } } } }; } ================================================ FILE: src/api/ops/vector_float_min_max.rs ================================================ //! Vertical (lane-wise) vector `min` and `max` for floating-point vectors. macro_rules! impl_ops_vector_float_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Minimum of two vectors. /// /// Returns a new vector containing the minimum value of each of /// the input vector lanes. #[inline] pub fn min(self, x: Self) -> Self { use crate::llvm::simd_fmin; unsafe { Simd(simd_fmin(self.0, x.0)) } } /// Maximum of two vectors. /// /// Returns a new vector containing the maximum value of each of /// the input vector lanes. #[inline] pub fn max(self, x: Self) -> Self { use crate::llvm::simd_fmax; unsafe { Simd(simd_fmax(self.0, x.0)) } } } test_if!{ $test_tt: paste::item! { #[cfg(not(any( // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/223 all(target_arch = "mips", target_endian = "big"), target_arch = "mips64", )))] pub mod [<$id _ops_vector_min_max>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn min_max() { let n = crate::$elem_ty::NAN; let o = $id::splat(1. as $elem_ty); let t = $id::splat(2. as $elem_ty); let mut m = o; // [1., 2., 1., 2., ...] 
let mut on = o; for i in 0..$id::lanes() { if i % 2 == 0 { m = m.replace(i, 2. as $elem_ty); on = on.replace(i, n); } } assert_eq!(o.min(t), o); assert_eq!(t.min(o), o); assert_eq!(m.min(o), o); assert_eq!(o.min(m), o); assert_eq!(m.min(t), m); assert_eq!(t.min(m), m); assert_eq!(o.max(t), t); assert_eq!(t.max(o), t); assert_eq!(m.max(o), m); assert_eq!(o.max(m), m); assert_eq!(m.max(t), t); assert_eq!(t.max(m), t); assert_eq!(on.min(o), o); assert_eq!(o.min(on), o); assert_eq!(on.max(o), o); assert_eq!(o.max(on), o); } } } } }; } ================================================ FILE: src/api/ops/vector_int_min_max.rs ================================================ //! Vertical (lane-wise) vector `min` and `max` for integer vectors. macro_rules! impl_ops_vector_int_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Minimum of two vectors. /// /// Returns a new vector containing the minimum value of each of /// the input vector lanes. #[inline] pub fn min(self, x: Self) -> Self { self.lt(x).select(self, x) } /// Maximum of two vectors. /// /// Returns a new vector containing the maximum value of each of /// the input vector lanes. #[inline] pub fn max(self, x: Self) -> Self { self.gt(x).select(self, x) } } test_if!{$test_tt: paste::item! { pub mod [<$id _ops_vector_min_max>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn min_max() { let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let mut m = o; for i in 0..$id::lanes() { if i % 2 == 0 { m = m.replace(i, 2 as $elem_ty); } } assert_eq!(o.min(t), o); assert_eq!(t.min(o), o); assert_eq!(m.min(o), o); assert_eq!(o.min(m), o); assert_eq!(m.min(t), m); assert_eq!(t.min(m), m); assert_eq!(o.max(t), t); assert_eq!(t.max(o), t); assert_eq!(m.max(o), m); assert_eq!(o.max(m), m); assert_eq!(m.max(t), t); assert_eq!(t.max(m), t); } } } } }; } ================================================ FILE: src/api/ops/vector_mask_bitwise.rs ================================================ //! Vertical (lane-wise) vector-vector bitwise operations. macro_rules! impl_ops_vector_mask_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::Not for $id { type Output = Self; #[inline] fn not(self) -> Self { Self::splat($true) ^ self } } impl crate::ops::BitXor for $id { type Output = Self; #[inline] fn bitxor(self, other: Self) -> Self { use crate::llvm::simd_xor; unsafe { Simd(simd_xor(self.0, other.0)) } } } impl crate::ops::BitAnd for $id { type Output = Self; #[inline] fn bitand(self, other: Self) -> Self { use crate::llvm::simd_and; unsafe { Simd(simd_and(self.0, other.0)) } } } impl crate::ops::BitOr for $id { type Output = Self; #[inline] fn bitor(self, other: Self) -> Self { use crate::llvm::simd_or; unsafe { Simd(simd_or(self.0, other.0)) } } } impl crate::ops::BitAndAssign for $id { #[inline] fn bitand_assign(&mut self, other: Self) { *self = *self & other; } } impl crate::ops::BitOrAssign for $id { #[inline] fn bitor_assign(&mut self, other: Self) { *self = *self | other; } } impl crate::ops::BitXorAssign for $id { #[inline] fn bitxor_assign(&mut self, other: Self) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_vector_mask_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_vector_mask_bitwise() { let t = $id::splat(true); let f = $id::splat(false); assert!(t != f); assert!(!(t == f)); // Not: assert_eq!(!t, f); assert_eq!(t, !f); // BitAnd: assert_eq!(t & f, f); assert_eq!(f & t, f); assert_eq!(t & t, t); assert_eq!(f & f, f); // BitOr: assert_eq!(t | f, t); assert_eq!(f | t, t); assert_eq!(t | t, t); assert_eq!(f | f, f); // BitXOR: assert_eq!(t ^ f, t); assert_eq!(f ^ t, t); assert_eq!(t ^ t, f); assert_eq!(f ^ f, f); { // AndAssign: let mut v = f; v &= t; assert_eq!(v, f); } { // OrAssign: let mut v = f; v |= t; assert_eq!(v, t); } { // XORAssign: let mut v = f; v ^= t; assert_eq!(v, t); } } } } } }; } ================================================ FILE: src/api/ops/vector_neg.rs ================================================ //! Vertical (lane-wise) vector `Neg`. macro_rules! impl_ops_vector_neg { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Neg for $id { type Output = Self; #[inline] fn neg(self) -> Self { Self::splat(-1 as $elem_ty) * self } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_vector_neg>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn neg() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); let nz = $id::splat(-(0 as $elem_ty)); let no = $id::splat(-(1 as $elem_ty)); let nt = $id::splat(-(2 as $elem_ty)); let nf = $id::splat(-(4 as $elem_ty)); assert_eq!(-z, nz); assert_eq!(-o, no); assert_eq!(-t, nt); assert_eq!(-f, nf); assert_eq!(z, -nz); assert_eq!(o, -no); assert_eq!(t, -nt); assert_eq!(f, -nf); } } } } }; } ================================================ FILE: src/api/ops/vector_rotates.rs ================================================ //! Vertical (lane-wise) vector rotates operations. #![allow(unused)] macro_rules! impl_ops_vector_rotates { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Shifts the bits of each lane to the left by the specified /// amount in the corresponding lane of `n`, wrapping the /// truncated bits to the end of the resulting integer. /// /// Note: this is neither the same operation as `<<` nor equivalent /// to `slice::rotate_left`. #[inline] pub fn rotate_left(self, n: $id) -> $id { const LANE_WIDTH: $elem_ty = crate::mem::size_of::<$elem_ty>() as $elem_ty * 8; // Protect against undefined behavior for over-long bit shifts let n = n % LANE_WIDTH; (self << n) | (self >> ((LANE_WIDTH - n) % LANE_WIDTH)) } /// Shifts the bits of each lane to the right by the specified /// amount in the corresponding lane of `n`, wrapping the /// truncated bits to the beginning of the resulting integer. /// /// Note: this is neither the same operation as `>>` nor equivalent /// to `slice::rotate_right`. #[inline] pub fn rotate_right(self, n: $id) -> $id { const LANE_WIDTH: $elem_ty = crate::mem::size_of::<$elem_ty>() as $elem_ty * 8; // Protect against undefined behavior for over-long bit shifts let n = n % LANE_WIDTH; (self >> n) | (self << ((LANE_WIDTH - n) % LANE_WIDTH)) } } test_if!{ $test_tt: paste::item! 
{ // FIXME: // https://github.com/rust-lang-nursery/packed_simd/issues/75 #[cfg(not(any( target_arch = "s390x", target_arch = "sparc64", )))] pub mod [<$id _ops_vector_rotate>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "aarch64"))] //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317 fn rotate_ops() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); let max = $id::splat( (mem::size_of::<$elem_ty>() * 8 - 1) as $elem_ty); // rotate_right assert_eq!(z.rotate_right(z), z); assert_eq!(z.rotate_right(o), z); assert_eq!(z.rotate_right(t), z); assert_eq!(o.rotate_right(z), o); assert_eq!(t.rotate_right(z), t); assert_eq!(f.rotate_right(z), f); assert_eq!(f.rotate_right(max), f << 1); assert_eq!(o.rotate_right(o), o << max); assert_eq!(t.rotate_right(o), o); assert_eq!(t.rotate_right(t), o << max); assert_eq!(f.rotate_right(o), t); assert_eq!(f.rotate_right(t), o); // rotate_left assert_eq!(z.rotate_left(z), z); assert_eq!(o.rotate_left(z), o); assert_eq!(t.rotate_left(z), t); assert_eq!(f.rotate_left(z), f); assert_eq!(f.rotate_left(max), t); assert_eq!(o.rotate_left(o), t); assert_eq!(o.rotate_left(t), f); assert_eq!(t.rotate_left(o), f); } } } } }; } ================================================ FILE: src/api/ops/vector_shifts.rs ================================================ //! Vertical (lane-wise) vector-vector shifts operations. macro_rules! impl_ops_vector_shifts { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Shl<$id> for $id { type Output = Self; #[inline] fn shl(self, other: Self) -> Self { use crate::llvm::simd_shl; unsafe { Simd(simd_shl(self.0, other.0)) } } } impl crate::ops::Shr<$id> for $id { type Output = Self; #[inline] fn shr(self, other: Self) -> Self { use crate::llvm::simd_shr; unsafe { Simd(simd_shr(self.0, other.0)) } } } impl crate::ops::ShlAssign<$id> for $id { #[inline] fn shl_assign(&mut self, other: Self) { *self = *self << other; } } impl crate::ops::ShrAssign<$id> for $id { #[inline] fn shr_assign(&mut self, other: Self) { *self = *self >> other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_vector_shifts>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg_attr(any(target_arch = "s390x", target_arch = "sparc64"), allow(unreachable_code, unused_variables) )] #[cfg(not(target_arch = "aarch64"))] //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317 fn ops_vector_shifts() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); let max =$id::splat( (mem::size_of::<$elem_ty>() * 8 - 1) as $elem_ty ); // shr assert_eq!(z >> z, z); assert_eq!(z >> o, z); assert_eq!(z >> t, z); assert_eq!(z >> t, z); #[cfg(any(target_arch = "s390x", target_arch = "sparc64"))] { // FIXME: rust produces bad codegen for shifts: // https://github.com/rust-lang-nursery/packed_simd/issues/13 return; } assert_eq!(o >> z, o); assert_eq!(t >> z, t); assert_eq!(f >> z, f); assert_eq!(f >> max, z); assert_eq!(o >> o, z); assert_eq!(t >> o, o); assert_eq!(t >> t, z); assert_eq!(f >> o, t); assert_eq!(f >> t, o); assert_eq!(f >> max, z); // shl assert_eq!(z << z, z); assert_eq!(o << z, o); assert_eq!(t << z, t); assert_eq!(f << z, f); assert_eq!(f << max, z); assert_eq!(o << o, t); assert_eq!(o << t, f); assert_eq!(t << o, f); { // shr_assign let mut v = o; v >>= o; assert_eq!(v, z); } { // shl_assign let mut v = o; v <<= o; assert_eq!(v, t); } } } } } }; } ================================================ FILE: src/api/ops.rs ================================================ //! Implementation of the `ops` traits #[macro_use] mod vector_mask_bitwise; #[macro_use] mod scalar_mask_bitwise; #[macro_use] mod vector_arithmetic; #[macro_use] mod scalar_arithmetic; #[macro_use] mod vector_bitwise; #[macro_use] mod scalar_bitwise; #[macro_use] mod vector_shifts; #[macro_use] mod scalar_shifts; #[macro_use] mod vector_rotates; #[macro_use] mod vector_neg; #[macro_use] mod vector_int_min_max; #[macro_use] mod vector_float_min_max; ================================================ FILE: src/api/ptr/gather_scatter.rs ================================================ //! Implements masked gather and scatters for vectors of pointers macro_rules! impl_ptr_read { ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident | $test_tt:tt) => { impl $id where [T; $elem_count]: sealed::SimdArray, { /// Reads selected vector elements from memory. /// /// Instantiates a new vector by reading the values from `self` for /// those lanes whose `mask` is `true`, and using the elements of /// `value` otherwise. /// /// No memory is accessed for those lanes of `self` whose `mask` is /// `false`. /// /// # Safety /// /// This method is unsafe because it dereferences raw pointers. The /// pointers must be aligned to `mem::align_of::()`. #[inline] pub unsafe fn read( self, mask: Simd<[M; $elem_count]>, value: Simd<[T; $elem_count]>, ) -> Simd<[T; $elem_count]> where M: sealed::Mask, [M; $elem_count]: sealed::SimdArray, { use crate::llvm::simd_gather; Simd(simd_gather(value.0, self.0, mask.0)) } } test_if! { $test_tt: paste::item! 
{ mod [<$id _read>] { use super::*; #[test] fn read() { let mut v = [0_i32; $elem_count]; for i in 0..$elem_count { v[i] = i as i32; } let mut ptr = $id::::null(); for i in 0..$elem_count { ptr = ptr.replace(i, &v[i] as *const i32 as *mut i32 ); } // all mask elements are true: let mask = $mask_ty::splat(true); let def = Simd::<[i32; $elem_count]>::splat(42_i32); let r: Simd<[i32; $elem_count]> = unsafe { ptr.read(mask, def) }; assert_eq!( r, Simd::<[i32; $elem_count]>::from_slice_unaligned( &v ) ); let mut mask = mask; for i in 0..$elem_count { if i % 2 != 0 { mask = mask.replace(i, false); } } // even mask elements are true, odd ones are false: let r: Simd<[i32; $elem_count]> = unsafe { ptr.read(mask, def) }; let mut e = v; for i in 0..$elem_count { if i % 2 != 0 { e[i] = 42; } } assert_eq!( r, Simd::<[i32; $elem_count]>::from_slice_unaligned( &e ) ); // all mask elements are false: let mask = $mask_ty::splat(false); let def = Simd::<[i32; $elem_count]>::splat(42_i32); let r: Simd<[i32; $elem_count]> = unsafe { ptr.read(mask, def) } ; assert_eq!(r, def); } } } } }; } macro_rules! impl_ptr_write { ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident | $test_tt:tt) => { impl $id where [T; $elem_count]: sealed::SimdArray, { /// Writes selected vector elements to memory. /// /// Writes the lanes of `values` for which the mask is `true` to /// their corresponding memory addresses in `self`. /// /// No memory is accessed for those lanes of `self` whose `mask` is /// `false`. /// /// Overlapping memory addresses of `self` are written to in order /// from the lest-significant to the most-significant element. /// /// # Safety /// /// This method is unsafe because it dereferences raw pointers. The /// pointers must be aligned to `mem::align_of::()`. #[inline] pub unsafe fn write(self, mask: Simd<[M; $elem_count]>, value: Simd<[T; $elem_count]>) where M: sealed::Mask, [M; $elem_count]: sealed::SimdArray, { use crate::llvm::simd_scatter; simd_scatter(value.0, self.0, mask.0) } } test_if! { $test_tt: paste::item! { mod [<$id _write>] { use super::*; #[test] fn write() { // forty_two = [42, 42, 42, ...] let forty_two = Simd::<[i32; $elem_count]>::splat(42_i32); // This test will write to this array let mut arr = [0_i32; $elem_count]; for i in 0..$elem_count { arr[i] = i as i32; } // arr = [0, 1, 2, ...] let mut ptr = $id::::null(); for i in 0..$elem_count { ptr = ptr.replace(i, unsafe { arr.as_ptr().add(i) as *mut i32 }); } // ptr = [&arr[0], &arr[1], ...] // write `forty_two` to all elements of `v` { let backup = arr; unsafe { ptr.write($mask_ty::splat(true), forty_two) }; assert_eq!(arr, [42_i32; $elem_count]); arr = backup; // arr = [0, 1, 2, ...] } // write 42 to even elements of arr: { // set odd elements of the mask to false let mut mask = $mask_ty::splat(true); for i in 0..$elem_count { if i % 2 != 0 { mask = mask.replace(i, false); } } // mask = [true, false, true, false, ...] // expected result r = [42, 1, 42, 3, 42, 5, ...] let mut r = arr; for i in 0..$elem_count { if i % 2 == 0 { r[i] = 42; } } let backup = arr; unsafe { ptr.write(mask, forty_two) }; assert_eq!(arr, r); arr = backup; // arr = [0, 1, 2, 3, ...] } // write 42 to no elements of arr { let backup = arr; unsafe { ptr.write($mask_ty::splat(false), forty_two) }; assert_eq!(arr, backup); } } } } } }; } ================================================ FILE: src/api/ptr.rs ================================================ //! 
Vector of pointers #[macro_use] mod gather_scatter; ================================================ FILE: src/api/reductions/bitwise.rs ================================================ //! Implements portable horizontal bitwise vector reductions. #![allow(unused)] macro_rules! impl_reduction_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt | ($convert:expr) | ($true:expr, $false:expr) ) => { impl $id { /// Lane-wise bitwise `and` of the vector elements. /// /// Note: if the vector has one lane, the first element of the /// vector is returned. #[inline] pub fn and(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_and; let r: $ielem_ty = unsafe { simd_reduce_and(self.0) }; $convert(r) } #[cfg(target_arch = "aarch64")] { // FIXME: broken on aarch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x &= self.extract(i) as $elem_ty; } x } } /// Lane-wise bitwise `or` of the vector elements. /// /// Note: if the vector has one lane, the first element of the /// vector is returned. #[inline] pub fn or(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_or; let r: $ielem_ty = unsafe { simd_reduce_or(self.0) }; $convert(r) } #[cfg(target_arch = "aarch64")] { // FIXME: broken on aarch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x |= self.extract(i) as $elem_ty; } x } } /// Lane-wise bitwise `xor` of the vector elements. /// /// Note: if the vector has one lane, the first element of the /// vector is returned. #[inline] pub fn xor(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_xor; let r: $ielem_ty = unsafe { simd_reduce_xor(self.0) }; $convert(r) } #[cfg(target_arch = "aarch64")] { // FIXME: broken on aarch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x ^= self.extract(i) as $elem_ty; } x } } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _reduction_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn and() { let v = $id::splat($false); assert_eq!(v.and(), $false); let v = $id::splat($true); assert_eq!(v.and(), $true); let v = $id::splat($false); let v = v.replace(0, $true); if $id::lanes() > 1 { assert_eq!(v.and(), $false); } else { assert_eq!(v.and(), $true); } let v = $id::splat($true); let v = v.replace(0, $false); assert_eq!(v.and(), $false); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn or() { let v = $id::splat($false); assert_eq!(v.or(), $false); let v = $id::splat($true); assert_eq!(v.or(), $true); let v = $id::splat($false); let v = v.replace(0, $true); assert_eq!(v.or(), $true); let v = $id::splat($true); let v = v.replace(0, $false); if $id::lanes() > 1 { assert_eq!(v.or(), $true); } else { assert_eq!(v.or(), $false); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn xor() { let v = $id::splat($false); assert_eq!(v.xor(), $false); let v = $id::splat($true); if $id::lanes() > 1 { assert_eq!(v.xor(), $false); } else { assert_eq!(v.xor(), $true); } let v = $id::splat($false); let v = v.replace(0, $true); assert_eq!(v.xor(), $true); let v = $id::splat($true); let v = v.replace(0, $false); if $id::lanes() > 1 { assert_eq!(v.xor(), $true); } else { assert_eq!(v.xor(), $false); } } } } } }; } ================================================ FILE: src/api/reductions/float_arithmetic.rs ================================================ //! Implements portable horizontal float vector arithmetic reductions. macro_rules! impl_reduction_float_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Horizontal sum of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7)) /// /// If one of the vector element is `NaN` the reduction returns /// `NaN`. The resulting `NaN` is not required to be equal to any /// of the `NaN`s in the vector. #[inline] pub fn sum(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_add_ordered; unsafe { simd_reduce_add_ordered(self.0, 0 as $elem_ty) } } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x += self.extract(i) as $elem_ty; } x } } /// Horizontal product of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7)) /// /// If one of the vector element is `NaN` the reduction returns /// `NaN`. The resulting `NaN` is not required to be equal to any /// of the `NaN`s in the vector. 
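///
/// # Example
///
/// A short sketch, assuming `f32x4`, one of the crate's concrete
/// floating-point vector types:
///
/// ```ignore
/// let v = f32x4::new(1., 2., 3., 4.);
/// // Tree reduction: (1. * 2.) * (3. * 4.)
/// assert_eq!(v.product(), 24.);
/// ```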
#[inline] pub fn product(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_mul_ordered; unsafe { simd_reduce_mul_ordered(self.0, 1 as $elem_ty) } } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x *= self.extract(i) as $elem_ty; } x } } } impl crate::iter::Sum for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0.), crate::ops::Add::add) } } impl crate::iter::Product for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1.), crate::ops::Mul::mul) } } impl<'a> crate::iter::Sum<&'a $id> for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0.), |a, b| crate::ops::Add::add(a, *b)) } } impl<'a> crate::iter::Product<&'a $id> for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1.), |a, b| crate::ops::Mul::mul(a, *b)) } } test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _reduction_float_arith>] { use super::*; fn alternating(x: usize) -> $id { let mut v = $id::splat(1 as $elem_ty); for i in 0..$id::lanes() { if i % x == 0 { v = v.replace(i, 2 as $elem_ty); } } v } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sum() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.sum(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.sum(), $id::lanes() as $elem_ty); let v = alternating(2); assert_eq!( v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty ); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn product() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.product(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.product(), 1 as $elem_ty); let f = match $id::lanes() { 64 => 16, 32 => 8, 16 => 4, _ => 2, }; let v = alternating(f); assert_eq!( v.product(), (2_usize.pow(($id::lanes() / f) as u32) as $elem_ty) ); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unreachable_code)] fn sum_nan() { // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732 // https://github.com/rust-lang-nursery/packed_simd/issues/6 return; let n0 = crate::$elem_ty::NAN; let v0 = $id::splat(-3.0); for i in 0..$id::lanes() { let mut v = v0.replace(i, n0); // If the vector contains a NaN the result is NaN: assert!( v.sum().is_nan(), "nan at {} => {} | {:?}", i, v.sum(), v ); for j in 0..i { v = v.replace(j, n0); assert!(v.sum().is_nan()); } } let v = $id::splat(n0); assert!(v.sum().is_nan(), "all nans | {:?}", v); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unreachable_code)] fn product_nan() { // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732 // https://github.com/rust-lang-nursery/packed_simd/issues/6 return; let n0 = crate::$elem_ty::NAN; let v0 = $id::splat(-3.0); for i in 0..$id::lanes() { let mut v = v0.replace(i, n0); // If the vector contains a NaN the result is NaN: assert!( v.product().is_nan(), "nan at {} => {} | {:?}", i, v.product(), v ); for j in 0..i { v = v.replace(j, n0); assert!(v.product().is_nan()); } } let v = $id::splat(n0); assert!(v.product().is_nan(), "all nans | {:?}", v); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unused, 
dead_code)] fn sum_roundoff() { // Performs a tree-reduction fn tree_reduce_sum(a: &[$elem_ty]) -> $elem_ty { assert!(!a.is_empty()); if a.len() == 1 { a[0] } else if a.len() == 2 { a[0] + a[1] } else { let mid = a.len() / 2; let (left, right) = a.split_at(mid); tree_reduce_sum(left) + tree_reduce_sum(right) } } let mut start = crate::$elem_ty::EPSILON; let mut scalar_reduction = 0. as $elem_ty; let mut v = $id::splat(0. as $elem_ty); for i in 0..$id::lanes() { let c = if i % 2 == 0 { 1e3 } else { -1. }; start *= ::core::$elem_ty::consts::PI * c; scalar_reduction += start; v = v.replace(i, start); } let simd_reduction = v.sum(); let mut a = [0. as $elem_ty; $id::lanes()]; v.write_to_slice_unaligned(&mut a); let tree_reduction = tree_reduce_sum(&a); // tolerate 1 ULP difference: let red_bits = simd_reduction.to_bits(); let tree_bits = tree_reduction.to_bits(); assert!( if red_bits > tree_bits { red_bits - tree_bits } else { tree_bits - red_bits } < 2, "vector: {:?} | simd_reduction: {:?} | \ tree_reduction: {} | scalar_reduction: {}", v, simd_reduction, tree_reduction, scalar_reduction ); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unused, dead_code)] fn product_roundoff() { use ::core::convert::TryInto; // Performs a tree-reduction fn tree_reduce_product(a: &[$elem_ty]) -> $elem_ty { assert!(!a.is_empty()); if a.len() == 1 { a[0] } else if a.len() == 2 { a[0] * a[1] } else { let mid = a.len() / 2; let (left, right) = a.split_at(mid); tree_reduce_product(left) * tree_reduce_product(right) } } let mut start = crate::$elem_ty::EPSILON; let mut scalar_reduction = 1. as $elem_ty; let mut v = $id::splat(0. as $elem_ty); for i in 0..$id::lanes() { let c = if i % 2 == 0 { 1e3 } else { -1. }; start *= ::core::$elem_ty::consts::PI * c; scalar_reduction *= start; v = v.replace(i, start); } let simd_reduction = v.product(); let mut a = [0. as $elem_ty; $id::lanes()]; v.write_to_slice_unaligned(&mut a); let tree_reduction = tree_reduce_product(&a); // FIXME: Too imprecise, even only for product(f32x8). // Figure out how to narrow this down. let ulp_limit = $id::lanes() / 2; let red_bits = simd_reduction.to_bits(); let tree_bits = tree_reduction.to_bits(); assert!( if red_bits > tree_bits { red_bits - tree_bits } else { tree_bits - red_bits } < ulp_limit.try_into().unwrap(), "vector: {:?} | simd_reduction: {:?} | \ tree_reduction: {} | scalar_reduction: {}", v, simd_reduction, tree_reduction, scalar_reduction ); } } } } }; } ================================================ FILE: src/api/reductions/integer_arithmetic.rs ================================================ //! Implements portable horizontal integer vector arithmetic reductions. macro_rules! impl_reduction_integer_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt) => { impl $id { /// Horizontal wrapping sum of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7)) /// /// If an operation overflows it returns the mathematical result /// modulo `2^n` where `n` is the number of times it overflows. 
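///
/// # Example
///
/// A short sketch, assuming the crate's `u8x4` vector type:
///
/// ```ignore
/// let v = u8x4::new(1, 2, 3, 4);
/// assert_eq!(v.wrapping_sum(), 10);
/// // On overflow the sum wraps: (255 * 4) mod 2^8 == 252
/// let m = u8x4::splat(u8::max_value());
/// assert_eq!(m.wrapping_sum(), 252);
/// ```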
#[inline] pub fn wrapping_sum(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_add_ordered; let v: $ielem_ty = unsafe { simd_reduce_add_ordered(self.0, 0 as $ielem_ty) }; v as $elem_ty } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x = x.wrapping_add(self.extract(i) as $elem_ty); } x } } /// Horizontal wrapping product of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7)) /// /// If an operation overflows it returns the mathematical result /// modulo `2^n` where `n` is the number of times it overflows. #[inline] pub fn wrapping_product(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_mul_ordered; let v: $ielem_ty = unsafe { simd_reduce_mul_ordered(self.0, 1 as $ielem_ty) }; v as $elem_ty } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x = x.wrapping_mul(self.extract(i) as $elem_ty); } x } } } impl crate::iter::Sum for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0), crate::ops::Add::add) } } impl crate::iter::Product for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1), crate::ops::Mul::mul) } } impl<'a> crate::iter::Sum<&'a $id> for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0), |a, b| crate::ops::Add::add(a, *b)) } } impl<'a> crate::iter::Product<&'a $id> for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1), |a, b| crate::ops::Mul::mul(a, *b)) } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _reduction_int_arith>] { use super::*; fn alternating(x: usize) -> $id { let mut v = $id::splat(1 as $elem_ty); for i in 0..$id::lanes() { if i % x == 0 { v = v.replace(i, 2 as $elem_ty); } } v } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_sum() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.wrapping_sum(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.wrapping_sum(), $id::lanes() as $elem_ty); let v = alternating(2); if $id::lanes() > 1 { assert_eq!( v.wrapping_sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty ); } else { assert_eq!( v.wrapping_sum(), 2 as $elem_ty ); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_sum_overflow() { let start = $elem_ty::max_value() - ($id::lanes() as $elem_ty / 2); let v = $id::splat(start as $elem_ty); let vwrapping_sum = v.wrapping_sum(); let mut wrapping_sum = start; for _ in 1..$id::lanes() { wrapping_sum = wrapping_sum.wrapping_add(start); } assert_eq!(wrapping_sum, vwrapping_sum, "v = {:?}", v); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_product() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.wrapping_product(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.wrapping_product(), 1 as $elem_ty); let f = match $id::lanes() { 64 => 16, 32 => 8, 16 => 4, _ => 2, }; let v = alternating(f); if $id::lanes() > 1 { assert_eq!( v.wrapping_product(), (2_usize.pow(($id::lanes() / f) as u32) as $elem_ty) ); } else { assert_eq!( v.wrapping_product(), 2 as $elem_ty ); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_product_overflow() { let start = $elem_ty::max_value() - ($id::lanes() as $elem_ty / 2); let v = $id::splat(start as $elem_ty); let vmul = v.wrapping_product(); let mut mul = start; for _ in 1..$id::lanes() { mul = mul.wrapping_mul(start); } assert_eq!(mul, vmul, "v = {:?}", v); } } } } }; } ================================================ FILE: src/api/reductions/mask.rs ================================================ //! Implements portable horizontal mask reductions. macro_rules! impl_reduction_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Are `all` vector lanes `true`? #[inline] pub fn all(self) -> bool { unsafe { crate::codegen::reductions::mask::All::all(self) } } /// Is `any` vector lane `true`? #[inline] pub fn any(self) -> bool { unsafe { crate::codegen::reductions::mask::Any::any(self) } } /// Are `all` vector lanes `false`? #[inline] pub fn none(self) -> bool { !self.any() } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _reduction>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn all() { let a = $id::splat(true); assert!(a.all()); let a = $id::splat(false); assert!(!a.all()); if $id::lanes() > 1 { for i in 0..$id::lanes() { let mut a = $id::splat(true); a = a.replace(i, false); assert!(!a.all()); let mut a = $id::splat(false); a = a.replace(i, true); assert!(!a.all()); } } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn any() { let a = $id::splat(true); assert!(a.any()); let a = $id::splat(false); assert!(!a.any()); if $id::lanes() > 1 { for i in 0..$id::lanes() { let mut a = $id::splat(true); a = a.replace(i, false); assert!(a.any()); let mut a = $id::splat(false); a = a.replace(i, true); assert!(a.any()); } } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn none() { let a = $id::splat(true); assert!(!a.none()); let a = $id::splat(false); assert!(a.none()); if $id::lanes() > 1 { for i in 0..$id::lanes() { let mut a = $id::splat(true); a = a.replace(i, false); assert!(!a.none()); let mut a = $id::splat(false); a = a.replace(i, true); assert!(!a.none()); } } } } } } }; } ================================================ FILE: src/api/reductions/min_max.rs ================================================ //! Implements portable horizontal vector min/max reductions. macro_rules! impl_reduction_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt) => { impl $id { /// Largest vector element value. #[inline] pub fn max_element(self) -> $elem_ty { #[cfg(not(any( target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc64", target_arch = "wasm32", )))] { use crate::llvm::simd_reduce_max; let v: $ielem_ty = unsafe { simd_reduce_max(self.0) }; v as $elem_ty } #[cfg(any( target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc64", target_arch = "wasm32", ))] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 // FIXME: broken on WASM32 // https://github.com/rust-lang-nursery/packed_simd/issues/91 let mut x = self.extract(0); for i in 1..$id::lanes() { x = x.max(self.extract(i)); } x } } /// Smallest vector element value. #[inline] pub fn min_element(self) -> $elem_ty { #[cfg(not(any( target_arch = "aarch64", target_arch = "arm", all(target_arch = "x86", not(target_feature = "sse2")), target_arch = "powerpc64", target_arch = "wasm32", ),))] { use crate::llvm::simd_reduce_min; let v: $ielem_ty = unsafe { simd_reduce_min(self.0) }; v as $elem_ty } #[cfg(any( target_arch = "aarch64", target_arch = "arm", all(target_arch = "x86", not(target_feature = "sse2")), target_arch = "powerpc64", target_arch = "wasm32", ))] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 // FIXME: broken on i586-unknown-linux-gnu // https://github.com/rust-lang-nursery/packed_simd/issues/22 // FIXME: broken on WASM32 // https://github.com/rust-lang-nursery/packed_simd/issues/91 let mut x = self.extract(0); for i in 1..$id::lanes() { x = x.min(self.extract(i)); } x } } } test_if! {$test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. 
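// A quick sketch of what the reductions above compute, assuming an
// `i32x4` instantiation of this macro (illustrative only):
//
//     let v = i32x4::new(-7, 2, 42, 5);
//     assert_eq!(v.max_element(), 42);
//     assert_eq!(v.min_element(), -7);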
#[allow(clippy::float_cmp)] pub mod [<$id _reduction_min_max>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] pub fn max_element() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.max_element(), 0 as $elem_ty); if $id::lanes() > 1 { let v = v.replace(1, 1 as $elem_ty); assert_eq!(v.max_element(), 1 as $elem_ty); } let v = v.replace(0, 2 as $elem_ty); assert_eq!(v.max_element(), 2 as $elem_ty); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] pub fn min_element() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.min_element(), 0 as $elem_ty); if $id::lanes() > 1 { let v = v.replace(1, 1 as $elem_ty); assert_eq!(v.min_element(), 0 as $elem_ty); } let v = $id::splat(1 as $elem_ty); let v = v.replace(0, 2 as $elem_ty); if $id::lanes() > 1 { assert_eq!(v.min_element(), 1 as $elem_ty); } else { assert_eq!(v.min_element(), 2 as $elem_ty); } if $id::lanes() > 1 { let v = $id::splat(2 as $elem_ty); let v = v.replace(1, 1 as $elem_ty); assert_eq!(v.min_element(), 1 as $elem_ty); } } } } } }; } macro_rules! test_reduction_float_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _reduction_min_max_nan>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn min_element_test() { let n = crate::$elem_ty::NAN; assert_eq!(n.min(-3.), -3.); assert_eq!((-3. as $elem_ty).min(n), -3.); let v0 = $id::splat(-3.); let target_with_broken_last_lane_nan = !cfg!(any( target_arch = "arm", target_arch = "aarch64", all(target_arch = "x86", not(target_feature = "sse2") ), target_arch = "powerpc64", target_arch = "wasm32", )); // The vector is initialized to `-3.`s: [-3, -3, -3, -3] for i in 0..$id::lanes() { // We replace the i-th element of the vector with // `NaN`: [-3, -3, -3, NaN] let mut v = v0.replace(i, n); // If the NaN is in the last place, the LLVM // implementation of these methods is broken on some // targets: if i == $id::lanes() - 1 && target_with_broken_last_lane_nan { assert_eq!(v.min_element(), -3., "[A]: nan at {} => {} | {:?}", i, v.min_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result // is still always `-3.` unless all elements of // the vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if j == i-1 { assert!(v.min_element().is_nan(), "[B]: nan at {} => {} | {:?}", i, v.min_element(), v); } else { assert_eq!(v.min_element(), -3., "[B]: nan at {} => {} | {:?}", i, v.min_element(), v); } } // We are done here, since we were in the last // lane which is the last iteration of the loop. break } // We are not in the last lane, and there is only // one `NaN` in the vector. // If the vector has one lane, the result is `NaN`: if $id::lanes() == 1 { assert!(v.min_element().is_nan(), "[C]: all nans | v={:?} | min={} | \ is_nan: {}", v, v.min_element(), v.min_element().is_nan() ); // And we are done, since the vector only has // one lane anyways. break; } // The vector has more than one lane, since there is // only one `NaN` in the vector, the result is // always `-3`. 
assert_eq!(v.min_element(), -3., "[D]: nan at {} => {} | {:?}", i, v.min_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result is // still always `-3.` unless all elements of the // vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if i == $id::lanes() - 1 && j == i - 1 { // All elements of the vector are `NaN`s, // therefore the result is NaN as well. // // Note: the #lanes of the vector is > 1, so // "i - 1" does not overflow. assert!(v.min_element().is_nan(), "[E]: all nans | v={:?} | min={} | \ is_nan: {}", v, v.min_element(), v.min_element().is_nan()); } else { // There are non-`NaN` elements in the // vector, therefore the result is `-3.`: assert_eq!(v.min_element(), -3., "[F]: nan at {} => {} | {:?}", i, v.min_element(), v); } } } // If the vector contains all NaNs the result is NaN: assert!($id::splat(n).min_element().is_nan(), "all nans | v={:?} | min={} | is_nan: {}", $id::splat(n), $id::splat(n).min_element(), $id::splat(n).min_element().is_nan()); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn max_element_test() { let n = crate::$elem_ty::NAN; assert_eq!(n.max(-3.), -3.); assert_eq!((-3. as $elem_ty).max(n), -3.); let v0 = $id::splat(-3.); let target_with_broken_last_lane_nan = !cfg!(any( target_arch = "arm", target_arch = "aarch64", target_arch = "powerpc64", target_arch = "wasm32", )); // The vector is initialized to `-3.`s: [-3, -3, -3, -3] for i in 0..$id::lanes() { // We replace the i-th element of the vector with // `NaN`: [-3, -3, -3, NaN] let mut v = v0.replace(i, n); // If the NaN is in the last place, the LLVM // implementation of these methods is broken on some // targets: if i == $id::lanes() - 1 && target_with_broken_last_lane_nan { assert_eq!(v.max_element(), -3., "[A]: nan at {} => {} | {:?}", i, v.max_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result // is still always `-3.` unless all elements of // the vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if j == i-1 { assert!(v.min_element().is_nan(), "[B]: nan at {} => {} | {:?}", i, v.min_element(), v); } else { assert_eq!(v.max_element(), -3., "[B]: nan at {} => {} | {:?}", i, v.max_element(), v); } } // We are done here, since we were in the last // lane which is the last iteration of the loop. break } // We are not in the last lane, and there is only // one `NaN` in the vector. // If the vector has one lane, the result is `NaN`: if $id::lanes() == 1 { assert!(v.max_element().is_nan(), "[C]: all nans | v={:?} | min={} | \ is_nan: {}", v, v.max_element(), v.max_element().is_nan()); // And we are done, since the vector only has // one lane anyways. break; } // The vector has more than one lane, since there is // only one `NaN` in the vector, the result is // always `-3`. assert_eq!(v.max_element(), -3., "[D]: nan at {} => {} | {:?}", i, v.max_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result is // still always `-3.` unless all elements of the // vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if i == $id::lanes() - 1 && j == i - 1 { // All elements of the vector are `NaN`s, // therefore the result is NaN as well. // // Note: the #lanes of the vector is > 1, so // "i - 1" does not overflow. 
assert!(v.max_element().is_nan(), "[E]: all nans | v={:?} | max={} | \ is_nan: {}", v, v.max_element(), v.max_element().is_nan()); } else { // There are non-`NaN` elements in the // vector, therefore the result is `-3.`: assert_eq!(v.max_element(), -3., "[F]: nan at {} => {} | {:?}", i, v.max_element(), v); } } } // If the vector contains all NaNs the result is NaN: assert!($id::splat(n).max_element().is_nan(), "all nans | v={:?} | max={} | is_nan: {}", $id::splat(n), $id::splat(n).max_element(), $id::splat(n).max_element().is_nan()); } } } } }; } ================================================ FILE: src/api/reductions.rs ================================================ //! Reductions #[macro_use] mod float_arithmetic; #[macro_use] mod integer_arithmetic; #[macro_use] mod bitwise; #[macro_use] mod mask; #[macro_use] mod min_max; ================================================ FILE: src/api/select.rs ================================================ //! Implements mask's `select`. /// Implements mask select method macro_rules! impl_select { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Selects elements of `a` and `b` using mask. /// /// The lanes of the result for which the mask is `true` contain /// the values of `a`. The remaining lanes contain the values of /// `b`. #[inline] pub fn select(self, a: Simd, b: Simd) -> Simd where T: sealed::SimdArray::NT>, { use crate::llvm::simd_select; Simd(unsafe { simd_select(self.0, a.0, b.0) }) } } test_select!(bool, $id, $id, (false, true) | $test_tt); }; } macro_rules! test_select { ( $elem_ty:ident, $mask_ty:ident, $vec_ty:ident,($small:expr, $large:expr) | $test_tt:tt ) => { test_if! { $test_tt: paste::item! { pub mod [<$vec_ty _select>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn select() { let o = $small as $elem_ty; let t = $large as $elem_ty; let a = $vec_ty::splat(o); let b = $vec_ty::splat(t); let m = a.lt(b); assert_eq!(m.select(a, b), a); let m = b.lt(a); assert_eq!(m.select(b, a), a); let mut c = a; let mut d = b; let mut m_e = $mask_ty::splat(false); for i in 0..$vec_ty::lanes() { if i % 2 == 0 { let c_tmp = c.extract(i); c = c.replace(i, d.extract(i)); d = d.replace(i, c_tmp); } else { m_e = m_e.replace(i, true); } } let m = c.lt(d); assert_eq!(m_e, m); assert_eq!(m.select(c, d), a); } } } } }; } ================================================ FILE: src/api/shuffle.rs ================================================ //! Implements portable vector shuffles with immediate indices. // FIXME: comprehensive tests // https://github.com/rust-lang-nursery/packed_simd/issues/20 /// Shuffles vector elements. /// /// This macro returns a new vector that contains a shuffle of the elements in /// one (`shuffle!(vec, [indices...])`) or two (`shuffle!(vec0, vec1, /// [indices...])`) input vectors. /// /// The type of `vec0` and `vec1` must be equal, and the element type of the /// resulting vector is the element type of the input vector. /// /// The number of `indices` must be a power-of-two in range `[0, 64)`, since /// currently, the largest vector supported by the library has 64 lanes. The /// length of the resulting vector equals the number of indices provided. /// /// The indices must be in range `[0, M * N)` where `M` is the number of input /// vectors (`1` or `2`) and `N` is the number of lanes of the input vectors. 
/// The indices `i` in range `[0, N)` refer to the `i`-th element of `vec0`, /// while the indices in range `[N, 2*N)` refer to the `i - N`-th element of /// `vec1`. /// /// # Examples /// /// Shuffling elements of two vectors: /// /// ``` /// # use packed_simd::*; /// # fn main() { /// // Shuffle allows reordering the elements: /// let x = i32x4::new(1, 2, 3, 4); /// let y = i32x4::new(5, 6, 7, 8); /// let r = shuffle!(x, y, [4, 0, 5, 1]); /// assert_eq!(r, i32x4::new(5, 1, 6, 2)); /// /// // The resulting vector can als be smaller than the input: /// let r = shuffle!(x, y, [1, 6]); /// assert_eq!(r, i32x2::new(2, 7)); /// /// // Or larger: /// let r = shuffle!(x, y, [1, 3, 4, 2, 1, 7, 2, 2]); /// assert_eq!(r, i32x8::new(2, 4, 5, 3, 2, 8, 3, 3)); /// // At most 2 * the number of lanes in the input vector. /// # } /// ``` /// /// Shuffling elements of one vector: /// /// ``` /// # use packed_simd::*; /// # fn main() { /// // Shuffle allows reordering the elements of a vector: /// let x = i32x4::new(1, 2, 3, 4); /// let r = shuffle!(x, [2, 1, 3, 0]); /// assert_eq!(r, i32x4::new(3, 2, 4, 1)); /// /// // The resulting vector can be smaller than the input: /// let r = shuffle!(x, [1, 3]); /// assert_eq!(r, i32x2::new(2, 4)); /// /// // Equal: /// let r = shuffle!(x, [1, 3, 2, 0]); /// assert_eq!(r, i32x4::new(2, 4, 3, 1)); /// /// // Or larger: /// let r = shuffle!(x, [1, 3, 2, 2, 1, 3, 2, 2]); /// assert_eq!(r, i32x8::new(2, 4, 3, 3, 2, 4, 3, 3)); /// // At most 2 * the number of lanes in the input vector. /// # } /// ``` #[macro_export] macro_rules! shuffle { ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector2::<{[$l0, $l1]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector4::<{[$l0, $l1, $l2, $l3]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector8::<{[$l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr, $l8:expr, $l9:expr, $l10:expr, $l11:expr, $l12:expr, $l13:expr, $l14:expr, $l15:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector16::<{ [ $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10, $l11, $l12, $l13, $l14, $l15, ] }, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr, $l8:expr, $l9:expr, $l10:expr, $l11:expr, $l12:expr, $l13:expr, $l14:expr, $l15:expr, $l16:expr, $l17:expr, $l18:expr, $l19:expr, $l20:expr, $l21:expr, $l22:expr, $l23:expr, $l24:expr, $l25:expr, $l26:expr, $l27:expr, $l28:expr, $l29:expr, $l30:expr, $l31:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector32::<{ [ $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10, $l11, $l12, $l13, $l14, $l15, $l16, $l17, $l18, $l19, $l20, $l21, $l22, $l23, $l24, $l25, $l26, $l27, $l28, $l29, $l30, $l31, ] }, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr, $l8:expr, $l9:expr, $l10:expr, $l11:expr, $l12:expr, $l13:expr, $l14:expr, $l15:expr, $l16:expr, $l17:expr, $l18:expr, $l19:expr, 
$l20:expr, $l21:expr, $l22:expr, $l23:expr, $l24:expr, $l25:expr, $l26:expr, $l27:expr, $l28:expr, $l29:expr, $l30:expr, $l31:expr, $l32:expr, $l33:expr, $l34:expr, $l35:expr, $l36:expr, $l37:expr, $l38:expr, $l39:expr, $l40:expr, $l41:expr, $l42:expr, $l43:expr, $l44:expr, $l45:expr, $l46:expr, $l47:expr, $l48:expr, $l49:expr, $l50:expr, $l51:expr, $l52:expr, $l53:expr, $l54:expr, $l55:expr, $l56:expr, $l57:expr, $l58:expr, $l59:expr, $l60:expr, $l61:expr, $l62:expr, $l63:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector64::<{[ $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10, $l11, $l12, $l13, $l14, $l15, $l16, $l17, $l18, $l19, $l20, $l21, $l22, $l23, $l24, $l25, $l26, $l27, $l28, $l29, $l30, $l31, $l32, $l33, $l34, $l35, $l36, $l37, $l38, $l39, $l40, $l41, $l42, $l43, $l44, $l45, $l46, $l47, $l48, $l49, $l50, $l51, $l52, $l53, $l54, $l55, $l56, $l57, $l58, $l59, $l60, $l61, $l62, $l63, ]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec:expr, [$($l:expr),*]) => { match $vec { v => shuffle!(v, v, [$($l),*]) } }; } ================================================ FILE: src/api/shuffle1_dyn.rs ================================================ //! Shuffle vector elements according to a dynamic vector of indices. macro_rules! impl_shuffle1_dyn { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Shuffle vector elements according to `indices`. #[inline] pub fn shuffle1_dyn(self, indices: I) -> Self where Self: codegen::shuffle1_dyn::Shuffle1Dyn, { codegen::shuffle1_dyn::Shuffle1Dyn::shuffle1_dyn(self, indices) } } }; } macro_rules! test_shuffle1_dyn { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { pub mod [<$id _shuffle1_dyn>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn shuffle1_dyn() { let increasing = { let mut v = $id::splat(0 as $elem_ty); for i in 0..$id::lanes() { v = v.replace(i, i as $elem_ty); } v }; let decreasing = { let mut v = $id::splat(0 as $elem_ty); for i in 0..$id::lanes() { v = v.replace( i, ($id::lanes() - 1 - i) as $elem_ty ); } v }; type Indices = < $id as codegen::shuffle1_dyn::Shuffle1Dyn >::Indices; let increasing_ids: Indices = increasing.cast(); let decreasing_ids: Indices = decreasing.cast(); assert_eq!( increasing.shuffle1_dyn(increasing_ids), increasing, "(i,i)=>i" ); assert_eq!( decreasing.shuffle1_dyn(increasing_ids), decreasing, "(d,i)=>d" ); assert_eq!( increasing.shuffle1_dyn(decreasing_ids), decreasing, "(i,d)=>d" ); assert_eq!( decreasing.shuffle1_dyn(decreasing_ids), increasing, "(d,d)=>i" ); for i in 0..$id::lanes() { let v_ids: Indices = $id::splat(i as $elem_ty).cast(); assert_eq!(increasing.shuffle1_dyn(v_ids), $id::splat(increasing.extract(i)) ); assert_eq!(decreasing.shuffle1_dyn(v_ids), $id::splat(decreasing.extract(i)) ); assert_eq!( $id::splat(i as $elem_ty) .shuffle1_dyn(increasing_ids), $id::splat(i as $elem_ty) ); assert_eq!( $id::splat(i as $elem_ty) .shuffle1_dyn(decreasing_ids), $id::splat(i as $elem_ty) ); } } } } } }; } macro_rules! test_shuffle1_dyn_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { pub mod [<$id _shuffle1_dyn>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn shuffle1_dyn() { // alternating = [true, false, true, false, ...] 
let mut alternating = $id::splat(false); for i in 0..$id::lanes() { if i % 2 == 0 { alternating = alternating.replace(i, true); } } type Indices = < $id as codegen::shuffle1_dyn::Shuffle1Dyn >::Indices; // even = [0, 0, 2, 2, 4, 4, ..] let even = { let mut v = Indices::splat(0); for i in 0..$id::lanes() { if i % 2 == 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 - 1).into()); } } v }; // odd = [1, 1, 3, 3, 5, 5, ...] let odd = { let mut v = Indices::splat(0); for i in 0..$id::lanes() { if i % 2 != 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 + 1).into()); } } v }; assert_eq!( alternating.shuffle1_dyn(even), $id::splat(true) ); if $id::lanes() > 1 { assert_eq!( alternating.shuffle1_dyn(odd), $id::splat(false) ); } } } } } }; } ================================================ FILE: src/api/slice/from_slice.rs ================================================ //! Implements methods to read a vector type from a slice. macro_rules! impl_slice_from_slice { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary. #[inline] pub fn from_slice_aligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr(); assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); Self::from_slice_aligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn from_slice_unaligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); Self::from_slice_unaligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary, the behavior is undefined. #[inline] pub unsafe fn from_slice_aligned_unchecked(slice: &[$elem_ty]) -> Self { debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr(); debug_assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); #[allow(clippy::cast_ptr_alignment)] *(target_ptr as *const Self) } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn from_slice_unaligned_unchecked(slice: &[$elem_ty]) -> Self { use crate::mem::size_of; debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr().cast(); let mut x = Self::splat(0 as $elem_ty); let self_ptr = &mut x as *mut Self as *mut u8; crate::ptr::copy_nonoverlapping(target_ptr, self_ptr, size_of::()); x } } test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. 
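// A minimal usage sketch for the constructors above, assuming an `f32x4`
// instantiation of this macro (illustrative only):
//
//     let data = [1.0_f32, 2.0, 3.0, 4.0, 5.0];
//     // from_slice_unaligned only requires data[1..].len() >= f32x4::lanes():
//     let v = f32x4::from_slice_unaligned(&data[1..]);
//     assert_eq!(v, f32x4::new(2.0, 3.0, 4.0, 5.0));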
#[allow(clippy::float_cmp)] pub mod [<$id _slice_from_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_unaligned() { let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; unaligned[0] = 0 as $elem_ty; let vec = $id::from_slice_unaligned(&unaligned[1..]); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_unaligned_fail() { let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; unaligned[0] = 0 as $elem_ty; // the slice is not large enough => panic let _vec = $id::from_slice_unaligned(&unaligned[2..]); } union A { data: [$elem_ty; 2 * $id::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_aligned() { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; for i in $id::lanes()..(2 * $id::lanes()) { unsafe { aligned.data[i] = 42 as $elem_ty; } } let vec = unsafe { $id::from_slice_aligned( &aligned.data[$id::lanes()..] ) }; for (index, &b) in unsafe { aligned.data.iter().enumerate() } { if index < $id::lanes() { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!( b, vec.extract(index - $id::lanes()) ); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_lanes() { let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; let _vec = unsafe { $id::from_slice_aligned( &aligned.data[2 * $id::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_align() { unsafe { let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; // get a pointer to the front of data let ptr: *const $elem_ty = aligned.data.as_ptr() as *const $elem_ty; // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset( crate::mem::align_of::<$id>() ) == 0 { // the pointer is properly aligned, so // from_slice_aligned won't fail here (e.g. this // can happen for i128x1). So we panic to make // the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s: &[$elem_ty] = slice::from_raw_parts( ptr, $id::lanes() ); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let _vec = $id::from_slice_aligned(s); } } } } } }; } ================================================ FILE: src/api/slice/write_to_slice.rs ================================================ //! Implements methods to write a vector type to a slice. macro_rules! impl_slice_write_to_slice { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Writes the values of the vector to the `slice`. 
/// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary. #[inline] pub fn write_to_slice_aligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr(); assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); self.write_to_slice_aligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn write_to_slice_unaligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); self.write_to_slice_unaligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary, the behavior is /// undefined. #[inline] pub unsafe fn write_to_slice_aligned_unchecked(self, slice: &mut [$elem_ty]) { debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr(); debug_assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); #[allow(clippy::cast_ptr_alignment)] #[allow(clippy::cast_ptr_alignment)] #[allow(clippy::cast_ptr_alignment)] #[allow(clippy::cast_ptr_alignment)] *(target_ptr as *mut Self) = self; } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn write_to_slice_unaligned_unchecked(self, slice: &mut [$elem_ty]) { debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr().cast(); let self_ptr = &self as *const Self as *const u8; crate::ptr::copy_nonoverlapping(self_ptr, target_ptr, crate::mem::size_of::()); } } test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _slice_write_to_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_unaligned() { let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; let vec = $id::splat(42 as $elem_ty); vec.write_to_slice_unaligned(&mut unaligned[1..]); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_unaligned_fail() { let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; let vec = $id::splat(42 as $elem_ty); vec.write_to_slice_unaligned(&mut unaligned[2..]); } union A { data: [$elem_ty; 2 * $id::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_aligned() { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; let vec = $id::splat(42 as $elem_ty); unsafe { vec.write_to_slice_aligned( &mut aligned.data[$id::lanes()..] 
); for (idx, &b) in aligned.data.iter().enumerate() { if idx < $id::lanes() { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!( b, vec.extract(idx - $id::lanes()) ); } } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_lanes() { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; let vec = $id::splat(42 as $elem_ty); unsafe { vec.write_to_slice_aligned( &mut aligned.data[2 * $id::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_align() { unsafe { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; // get a pointer to the front of data let ptr: *mut $elem_ty = aligned.data.as_mut_ptr() as *mut $elem_ty; // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset(crate::mem::align_of::<$id>()) == 0 { // the pointer is properly aligned, so // write_to_slice_aligned won't fail here (e.g. // this can happen for i128x1). So we panic to // make the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s: &mut [$elem_ty] = slice::from_raw_parts_mut(ptr, $id::lanes()); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let vec = $id::splat(42 as $elem_ty); vec.write_to_slice_aligned(s); } } } } } }; } ================================================ FILE: src/api/slice.rs ================================================ //! Slice from/to methods #[macro_use] mod from_slice; #[macro_use] mod write_to_slice; ================================================ FILE: src/api/swap_bytes.rs ================================================ //! Horizontal swap bytes macro_rules! impl_swap_bytes { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Reverses the byte order of the vector. #[inline] pub fn swap_bytes(self) -> Self { super::codegen::swap_bytes::SwapBytes::swap_bytes(self) } /// Converts self to little endian from the target's endianness. /// /// On little endian this is a no-op. On big endian the bytes are /// swapped. #[inline] pub fn to_le(self) -> Self { #[cfg(target_endian = "little")] { self } #[cfg(not(target_endian = "little"))] { self.swap_bytes() } } /// Converts self to big endian from the target's endianness. /// /// On big endian this is a no-op. On little endian the bytes are /// swapped. #[inline] pub fn to_be(self) -> Self { #[cfg(target_endian = "big")] { self } #[cfg(not(target_endian = "big"))] { self.swap_bytes() } } /// Converts a vector from little endian to the target's endianness. /// /// On little endian this is a no-op. On big endian the bytes are /// swapped. #[inline] pub fn from_le(x: Self) -> Self { #[cfg(target_endian = "little")] { x } #[cfg(not(target_endian = "little"))] { x.swap_bytes() } } /// Converts a vector from big endian to the target's endianness. /// /// On big endian this is a no-op. On little endian the bytes are /// swapped. 
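///
/// # Examples
///
/// A sketch of the relationship to `swap_bytes` (illustrative; the concrete
/// lane values depend on the target's endianness):
///
/// ```
/// # use packed_simd::*;
/// let x = u16x2::new(0x0102, 0x0304);
/// if cfg!(target_endian = "big") {
///     assert_eq!(u16x2::from_be(x), x); // no-op on big endian
/// } else {
///     assert_eq!(u16x2::from_be(x), x.swap_bytes()); // bytes are swapped
/// }
/// ```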
#[inline] pub fn from_be(x: Self) -> Self { #[cfg(target_endian = "big")] { x } #[cfg(not(target_endian = "big"))] { x.swap_bytes() } } } test_if! { $test_tt: paste::item! { pub mod [<$id _swap_bytes>] { use super::*; const BYTES: [u8; 64] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ]; macro_rules! swap { ($func: ident) => {{ // catch possible future >512 vectors assert!(mem::size_of::<$id>() <= 64); let mut actual = BYTES; let elems: &mut [$elem_ty] = unsafe { slice::from_raw_parts_mut( actual.as_mut_ptr() as *mut $elem_ty, $id::lanes(), ) }; let vec = $id::from_slice_unaligned(elems); $id::$func(vec).write_to_slice_unaligned(elems); actual }}; } macro_rules! test_swap { ($func: ident) => {{ let actual = swap!($func); let expected = BYTES.iter().rev() .skip(64 - crate::mem::size_of::<$id>()); assert!(actual.iter().zip(expected) .all(|(x, y)| x == y)); }}; } macro_rules! test_no_swap { ($func: ident) => {{ let actual = swap!($func); let expected = BYTES.iter() .take(mem::size_of::<$id>()); assert!(actual.iter().zip(expected) .all(|(x, y)| x == y)); }}; } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn swap_bytes() { test_swap!(swap_bytes); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn to_le() { #[cfg(target_endian = "little")] { test_no_swap!(to_le); } #[cfg(not(target_endian = "little"))] { test_swap!(to_le); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn to_be() { #[cfg(target_endian = "big")] { test_no_swap!(to_be); } #[cfg(not(target_endian = "big"))] { test_swap!(to_be); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_le() { #[cfg(target_endian = "little")] { test_no_swap!(from_le); } #[cfg(not(target_endian = "little"))] { test_swap!(from_le); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_be() { #[cfg(target_endian = "big")] { test_no_swap!(from_be); } #[cfg(not(target_endian = "big"))] { test_swap!(from_be); } } } } } }; } ================================================ FILE: src/api.rs ================================================ //! Implements the Simd<[T; N]> APIs #[macro_use] mod bitmask; pub(crate) mod cast; #[macro_use] mod cmp; #[macro_use] mod default; #[macro_use] mod fmt; #[macro_use] mod from; #[macro_use] mod hash; #[macro_use] mod math; #[macro_use] mod minimal; #[macro_use] mod ops; #[macro_use] mod ptr; #[macro_use] mod reductions; #[macro_use] mod select; #[macro_use] mod shuffle; #[macro_use] mod shuffle1_dyn; #[macro_use] mod slice; #[macro_use] mod swap_bytes; #[macro_use] mod bit_manip; #[cfg(feature = "into_bits")] pub(crate) mod into_bits; macro_rules! 
impl_i { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])*); impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_scalar_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_int_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); impl_reduction_integer_arithmetic!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_min_max!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0) ); impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 1)); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1) ); impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt ); impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_bitmask!($tuple_id | $ibitmask_ty | (-1, 0) | $test_tt); test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt); test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt); test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! 
impl_u { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])*); impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_scalar_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_int_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); impl_reduction_integer_arithmetic!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_min_max!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0) ); impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 1)); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 0) ); impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt ); impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_bitmask!($tuple_id | $ibitmask_ty | ($ielem_ty::max_value(), 0) | $test_tt); test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt); test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt); test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! 
impl_f { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident | $ielem_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])*); impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_float_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); impl_reduction_float_arithmetic!( [$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_reduction_min_max!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 1.)); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 0.) ); impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_float_consts!([$elem_ty; $elem_n]: $tuple_id); impl_float_category!([$elem_ty; $elem_n]: $tuple_id, $mask_ty); // floating-point math impl_math_float_abs!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_cos!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_exp!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_ln!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_mul_add!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_mul_adde!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_powf!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_recpre!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_rsqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_sin!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_sqrt!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_sqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_tanh!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1., 0.) | $test_tt ); test_select!($elem_ty, $mask_ty, $tuple_id, (1., 2.) | $test_tt); test_reduction_float_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! 
impl_m { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_mask!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])* ); impl_ops_vector_mask_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_ops_scalar_mask_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_reduction_bitwise!( [bool; $elem_n]: $tuple_id | $ielem_ty | $test_tt | (|x|{ x != 0 }) | (true, false) ); impl_reduction_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_debug!([bool; $elem_n]: $tuple_id | $test_tt); impl_from_array!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (crate::$elem_ty::new(true), true) ); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([bool; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_cmp_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $tuple_id, true, (true, false) | $test_tt ); impl_select!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_ord!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (false, true) ); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_bitmask!($tuple_id | $ibitmask_ty | (true, false) | $test_tt); test_cmp_partial_ord_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt); test_shuffle1_dyn_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! impl_const_p { ([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident, $usize_ty:ident, $isize_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_p!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty | ref_ | $test_tt | $($elem_ids),* | (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])* ); impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt); } } macro_rules! impl_mut_p { ([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident, $usize_ty:ident, $isize_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_p!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty | ref_mut_ | $test_tt | $($elem_ids),* | (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])* ); impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt); impl_ptr_write!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt); } } ================================================ FILE: src/codegen/bit_manip.rs ================================================ //! LLVM bit manipulation intrinsics. 
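//!
//! The `BitManip` trait below routes each vector type to the `llvm.ctpop`,
//! `llvm.ctlz` and `llvm.cttz` intrinsic matching its lane width and lane
//! count, with scalar fallbacks where the intrinsics are known to be broken
//! (s390x, and `cttz` for `u8x8`/`i8x8` on aarch64). The crate's
//! `count_ones`/`leading_zeros`/`trailing_zeros`-style wrappers build on it.
//! Lane-wise the semantics match the scalar methods; an illustrative sketch
//! assuming `u8x2`:
//!
//! ```ignore
//! let x = u8x2::new(0b0000_0100, 0);
//! assert_eq!(x.count_ones(),     u8x2::new(1, 0));
//! assert_eq!(x.leading_zeros(),  u8x2::new(5, 8));
//! assert_eq!(x.trailing_zeros(), u8x2::new(2, 8));
//! ```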
#[rustfmt::skip] pub(crate) use crate::*; #[allow(improper_ctypes, dead_code)] extern "C" { #[link_name = "llvm.ctlz.v2i8"] fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; #[link_name = "llvm.ctlz.v4i8"] fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; #[link_name = "llvm.ctlz.v8i8"] fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; #[link_name = "llvm.ctlz.v16i8"] fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; #[link_name = "llvm.ctlz.v32i8"] fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; #[link_name = "llvm.ctlz.v64i8"] fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; #[link_name = "llvm.ctlz.v2i16"] fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; #[link_name = "llvm.ctlz.v4i16"] fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; #[link_name = "llvm.ctlz.v8i16"] fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; #[link_name = "llvm.ctlz.v16i16"] fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; #[link_name = "llvm.ctlz.v32i16"] fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; #[link_name = "llvm.ctlz.v2i32"] fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; #[link_name = "llvm.ctlz.v4i32"] fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; #[link_name = "llvm.ctlz.v8i32"] fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; #[link_name = "llvm.ctlz.v16i32"] fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; #[link_name = "llvm.ctlz.v2i64"] fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; #[link_name = "llvm.ctlz.v4i64"] fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; #[link_name = "llvm.ctlz.v8i64"] fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; #[link_name = "llvm.ctlz.v1i128"] fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; #[link_name = "llvm.ctlz.v2i128"] fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; #[link_name = "llvm.ctlz.v4i128"] fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; #[link_name = "llvm.cttz.v2i8"] fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; #[link_name = "llvm.cttz.v4i8"] fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; #[link_name = "llvm.cttz.v8i8"] fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; #[link_name = "llvm.cttz.v16i8"] fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; #[link_name = "llvm.cttz.v32i8"] fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; #[link_name = "llvm.cttz.v64i8"] fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; #[link_name = "llvm.cttz.v2i16"] fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; #[link_name = "llvm.cttz.v4i16"] fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; #[link_name = "llvm.cttz.v8i16"] fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; #[link_name = "llvm.cttz.v16i16"] fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; #[link_name = "llvm.cttz.v32i16"] fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; #[link_name = "llvm.cttz.v2i32"] fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; #[link_name = "llvm.cttz.v4i32"] fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; #[link_name = "llvm.cttz.v8i32"] fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; #[link_name = "llvm.cttz.v16i32"] fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; #[link_name = "llvm.cttz.v2i64"] fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; #[link_name = "llvm.cttz.v4i64"] fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; #[link_name = "llvm.cttz.v8i64"] fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; #[link_name = 
"llvm.cttz.v1i128"] fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; #[link_name = "llvm.cttz.v2i128"] fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; #[link_name = "llvm.cttz.v4i128"] fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; #[link_name = "llvm.ctpop.v2i8"] fn ctpop_u8x2(x: u8x2) -> u8x2; #[link_name = "llvm.ctpop.v4i8"] fn ctpop_u8x4(x: u8x4) -> u8x4; #[link_name = "llvm.ctpop.v8i8"] fn ctpop_u8x8(x: u8x8) -> u8x8; #[link_name = "llvm.ctpop.v16i8"] fn ctpop_u8x16(x: u8x16) -> u8x16; #[link_name = "llvm.ctpop.v32i8"] fn ctpop_u8x32(x: u8x32) -> u8x32; #[link_name = "llvm.ctpop.v64i8"] fn ctpop_u8x64(x: u8x64) -> u8x64; #[link_name = "llvm.ctpop.v2i16"] fn ctpop_u16x2(x: u16x2) -> u16x2; #[link_name = "llvm.ctpop.v4i16"] fn ctpop_u16x4(x: u16x4) -> u16x4; #[link_name = "llvm.ctpop.v8i16"] fn ctpop_u16x8(x: u16x8) -> u16x8; #[link_name = "llvm.ctpop.v16i16"] fn ctpop_u16x16(x: u16x16) -> u16x16; #[link_name = "llvm.ctpop.v32i16"] fn ctpop_u16x32(x: u16x32) -> u16x32; #[link_name = "llvm.ctpop.v2i32"] fn ctpop_u32x2(x: u32x2) -> u32x2; #[link_name = "llvm.ctpop.v4i32"] fn ctpop_u32x4(x: u32x4) -> u32x4; #[link_name = "llvm.ctpop.v8i32"] fn ctpop_u32x8(x: u32x8) -> u32x8; #[link_name = "llvm.ctpop.v16i32"] fn ctpop_u32x16(x: u32x16) -> u32x16; #[link_name = "llvm.ctpop.v2i64"] fn ctpop_u64x2(x: u64x2) -> u64x2; #[link_name = "llvm.ctpop.v4i64"] fn ctpop_u64x4(x: u64x4) -> u64x4; #[link_name = "llvm.ctpop.v8i64"] fn ctpop_u64x8(x: u64x8) -> u64x8; #[link_name = "llvm.ctpop.v1i128"] fn ctpop_u128x1(x: u128x1) -> u128x1; #[link_name = "llvm.ctpop.v2i128"] fn ctpop_u128x2(x: u128x2) -> u128x2; #[link_name = "llvm.ctpop.v4i128"] fn ctpop_u128x4(x: u128x4) -> u128x4; } pub(crate) trait BitManip { fn ctpop(self) -> Self; fn ctlz(self) -> Self; fn cttz(self) -> Self; } macro_rules! impl_bit_manip { (inner: $ty:ident, $scalar:ty, $uty:ident, $ctpop:ident, $ctlz:ident, $cttz:ident) => { // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192 #[cfg(target_arch = "s390x")] impl_bit_manip! { scalar: $ty, $scalar } #[cfg(not(target_arch = "s390x"))] impl BitManip for $ty { #[inline] fn ctpop(self) -> Self { let y: $uty = self.cast(); unsafe { $ctpop(y).cast() } } #[inline] fn ctlz(self) -> Self { let y: $uty = self.cast(); // the ctxx intrinsics need compile-time constant // `is_zero_undef` unsafe { $ctlz(y, false).cast() } } #[inline] fn cttz(self) -> Self { let y: $uty = self.cast(); unsafe { $cttz(y, false).cast() } } } }; (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => { #[cfg(target_arch = "s390x")] impl_bit_manip! 
{ scalar: $ty, $scalar } #[cfg(not(target_arch = "s390x"))] impl BitManip for $ty { #[inline] fn ctpop(self) -> Self { let y: $uty = self.cast(); $uty::ctpop(y).cast() } #[inline] fn ctlz(self) -> Self { let y: $uty = self.cast(); $uty::ctlz(y).cast() } #[inline] fn cttz(self) -> Self { let y: $uty = self.cast(); $uty::cttz(y).cast() } } }; (scalar: $ty:ident, $scalar:ty) => { impl BitManip for $ty { #[inline] fn ctpop(self) -> Self { let mut ones = self; for i in 0..Self::lanes() { ones = ones.replace(i, self.extract(i).count_ones() as $scalar); } ones } #[inline] fn ctlz(self) -> Self { let mut lz = self; for i in 0..Self::lanes() { lz = lz.replace(i, self.extract(i).leading_zeros() as $scalar); } lz } #[inline] fn cttz(self) -> Self { let mut tz = self; for i in 0..Self::lanes() { tz = tz.replace(i, self.extract(i).trailing_zeros() as $scalar); } tz } } }; ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty, $ctpop:ident, $ctlz:ident, $cttz:ident) => { impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz } impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz } }; (sized: $usize:ident, $uscalar:ty, $isize:ident, $iscalar:ty, $ty:ident) => { impl_bit_manip! { sized_inner: $usize, $uscalar, $ty } impl_bit_manip! { sized_inner: $isize, $iscalar, $ty } }; } impl_bit_manip! { u8x2 , u8, i8x2, i8, ctpop_u8x2, ctlz_u8x2, cttz_u8x2 } impl_bit_manip! { u8x4 , u8, i8x4, i8, ctpop_u8x4, ctlz_u8x4, cttz_u8x4 } #[cfg(not(target_arch = "aarch64"))] // see below impl_bit_manip! { u8x8 , u8, i8x8, i8, ctpop_u8x8, ctlz_u8x8, cttz_u8x8 } impl_bit_manip! { u8x16 , u8, i8x16, i8, ctpop_u8x16, ctlz_u8x16, cttz_u8x16 } impl_bit_manip! { u8x32 , u8, i8x32, i8, ctpop_u8x32, ctlz_u8x32, cttz_u8x32 } impl_bit_manip! { u8x64 , u8, i8x64, i8, ctpop_u8x64, ctlz_u8x64, cttz_u8x64 } impl_bit_manip! { u16x2 , u16, i16x2, i16, ctpop_u16x2, ctlz_u16x2, cttz_u16x2 } impl_bit_manip! { u16x4 , u16, i16x4, i16, ctpop_u16x4, ctlz_u16x4, cttz_u16x4 } impl_bit_manip! { u16x8 , u16, i16x8, i16, ctpop_u16x8, ctlz_u16x8, cttz_u16x8 } impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 } impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 } impl_bit_manip! { u32x2 , u32, i32x2, i32, ctpop_u32x2, ctlz_u32x2, cttz_u32x2 } impl_bit_manip! { u32x4 , u32, i32x4, i32, ctpop_u32x4, ctlz_u32x4, cttz_u32x4 } impl_bit_manip! { u32x8 , u32, i32x8, i32, ctpop_u32x8, ctlz_u32x8, cttz_u32x8 } impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 } impl_bit_manip! { u64x2 , u64, i64x2, i64, ctpop_u64x2, ctlz_u64x2, cttz_u64x2 } impl_bit_manip! { u64x4 , u64, i64x4, i64, ctpop_u64x4, ctlz_u64x4, cttz_u64x4 } impl_bit_manip! { u64x8 , u64, i64x8, i64, ctpop_u64x8, ctlz_u64x8, cttz_u64x8 } impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 } impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 } impl_bit_manip! 
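// Editor's sketch (illustrative addition, not part of the original file):
// for a hypothetical `impl_bit_manip! { scalar: u32x4, u32 }` invocation,
// the `scalar:` arm above expands to a plain per-lane loop over the scalar
// integer methods:
//
//     impl BitManip for u32x4 {
//         #[inline]
//         fn ctpop(self) -> Self {
//             let mut ones = self;
//             for i in 0..Self::lanes() {
//                 ones = ones.replace(i, self.extract(i).count_ones() as u32);
//             }
//             ones
//         }
//         // ctlz/cttz are analogous, via leading_zeros()/trailing_zeros().
//     }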
{ u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 } #[cfg(target_arch = "aarch64")] impl BitManip for u8x8 { #[inline] fn ctpop(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctpop_u8x8(y).cast() } } #[inline] fn ctlz(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctlz_u8x8(y, false).cast() } } #[inline] fn cttz(self) -> Self { // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 // intrinsics let mut tz = self; for i in 0..Self::lanes() { tz = tz.replace(i, self.extract(i).trailing_zeros() as u8); } tz } } #[cfg(target_arch = "aarch64")] impl BitManip for i8x8 { #[inline] fn ctpop(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctpop_u8x8(y).cast() } } #[inline] fn ctlz(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctlz_u8x8(y, false).cast() } } #[inline] fn cttz(self) -> Self { // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 // intrinsics let mut tz = self; for i in 0..Self::lanes() { tz = tz.replace(i, self.extract(i).trailing_zeros() as i8); } tz } } cfg_if! { if #[cfg(target_pointer_width = "8")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 } } else if #[cfg(target_pointer_width = "16")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u16x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 } } else if #[cfg(target_pointer_width = "32")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 } } else if #[cfg(target_pointer_width = "64")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 } } else { compile_error!("unsupported target_pointer_width"); } } ================================================ FILE: src/codegen/llvm.rs ================================================ //! LLVM's platform intrinsics #![allow(dead_code)] use crate::sealed::Shuffle; #[allow(unused_imports)] // FIXME: spurious warning? 
use crate::sealed::Simd;

extern "platform-intrinsic" {
    fn simd_shuffle<T, I, U>(x: T, y: T, idx: I) -> U;
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector2<const IDX: [u32; 2], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 2], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector4<const IDX: [u32; 4], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 4], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector8<const IDX: [u32; 8], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 8], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector16<const IDX: [u32; 16], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 16], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector32<const IDX: [u32; 32], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 32], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector64<const IDX: [u32; 64], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 64], Output = U> {
    simd_shuffle(x, y, IDX)
}

extern "platform-intrinsic" {
    pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_le<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_gt<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;

    pub(crate) fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
    pub(crate) fn simd_extract<T, U>(x: T, idx: u32) -> U;

    pub(crate) fn simd_cast<T, U>(x: T) -> U;

    pub(crate) fn simd_add<T>(x: T, y: T) -> T;
    pub(crate) fn simd_sub<T>(x: T, y: T) -> T;
    pub(crate) fn simd_mul<T>(x: T, y: T) -> T;
    pub(crate) fn simd_div<T>(x: T, y: T) -> T;
    pub(crate) fn simd_rem<T>(x: T, y: T) -> T;
    pub(crate) fn simd_shl<T>(x: T, y: T) -> T;
    pub(crate) fn simd_shr<T>(x: T, y: T) -> T;
    pub(crate) fn simd_and<T>(x: T, y: T) -> T;
    pub(crate) fn simd_or<T>(x: T, y: T) -> T;
    pub(crate) fn simd_xor<T>(x: T, y: T) -> T;

    pub(crate) fn simd_reduce_add_unordered<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_mul_unordered<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;
    pub(crate) fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U;
    pub(crate) fn simd_reduce_min<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_max<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_min_nanless<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_max_nanless<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_and<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_or<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_xor<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_all<T>(x: T) -> bool;
    pub(crate) fn simd_reduce_any<T>(x: T) -> bool;

    pub(crate) fn simd_select<M, T>(m: M, a: T, b: T) -> T;

    pub(crate) fn simd_fmin<T>(a: T, b: T) -> T;
    pub(crate) fn simd_fmax<T>(a: T, b: T) -> T;

    pub(crate) fn simd_fsqrt<T>(a: T) -> T;
    pub(crate) fn simd_fma<T>(a: T, b: T, c: T) -> T;

    pub(crate) fn simd_gather<T, P, M>(value: T, pointers: P, mask: M) -> T;
    pub(crate) fn simd_scatter<T, P, M>(value: T, pointers: P, mask: M);

    pub(crate) fn simd_bitmask<T, U>(value: T) -> U;
}

================================================ FILE: src/codegen/math/float/abs.rs ================================================ //!
Vertical floating-point `fabs` #![allow(unused)] // FIXME 64-bit 1 elem vectors fabs use crate::*; pub(crate) trait Abs { fn abs(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.fabs.v2f32"] fn fabs_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.fabs.v4f32"] fn fabs_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.fabs.v8f32"] fn fabs_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.fabs.v16f32"] fn fabs_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit fabsgle elem vectors #[link_name = "llvm.fabs.v1f64"] fn fabs_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.fabs.v2f64"] fn fabs_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.fabs.v4f64"] fn fabs_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.fabs.v8f64"] fn fabs_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.fabs.f32"] fn fabs_f32(x: f32) -> f32; #[link_name = "llvm.fabs.f64"] fn fabs_f64(x: f64) -> f64; } gen_unary_impl_table!(Abs, abs); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: fabs_f32); impl_unary!(f32x4[f32; 4]: fabs_f32); impl_unary!(f32x8[f32; 8]: fabs_f32); impl_unary!(f32x16[f32; 16]: fabs_f32); impl_unary!(f64x2[f64; 2]: fabs_f64); impl_unary!(f64x4[f64; 4]: fabs_f64); impl_unary!(f64x8[f64; 8]: fabs_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx2); impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx2); impl_unary!(f32x4: Sleef_fabsf4_avx2128); impl_unary!(f32x8: Sleef_fabsf8_avx2); impl_unary!(f64x2: Sleef_fabsd2_avx2128); impl_unary!(f64x4: Sleef_fabsd4_avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx); impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx); impl_unary!(f32x4: Sleef_fabsf4_sse4); impl_unary!(f32x8: Sleef_fabsf8_avx); impl_unary!(f64x2: Sleef_fabsd2_sse4); impl_unary!(f64x4: Sleef_fabsd4_avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f32x16[q => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f64x8[q => f64x2]: Sleef_fabsd2_sse4); impl_unary!(f32x4: Sleef_fabsf4_sse4); impl_unary!(f32x8[h => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f64x2: Sleef_fabsd2_sse4); impl_unary!(f64x4[h => f64x2]: Sleef_fabsd2_sse4); } else { impl_unary!(f32x2[f32; 2]: fabs_f32); impl_unary!(f32x16: fabs_v16f32); impl_unary!(f64x8: fabs_v8f64); impl_unary!(f32x4: fabs_v4f32); impl_unary!(f32x8: fabs_v8f32); impl_unary!(f64x2: fabs_v2f64); impl_unary!(f64x4: fabs_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: fabs_f32); impl_unary!(f32x4: fabs_v4f32); impl_unary!(f32x8: fabs_v8f32); impl_unary!(f32x16: fabs_v16f32); impl_unary!(f64x2: fabs_v2f64); impl_unary!(f64x4: fabs_v4f64); impl_unary!(f64x8: fabs_v8f64); } } ================================================ FILE: src/codegen/math/float/cos.rs ================================================ //! 
Vertical floating-point `cos` #![allow(unused)] // FIXME 64-bit 1 elem vector cos use crate::*; pub(crate) trait Cos { fn cos(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.cos.v2f32"] fn cos_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.cos.v4f32"] fn cos_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.cos.v8f32"] fn cos_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.cos.v16f32"] fn cos_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit cosgle elem vectors #[link_name = "llvm.cos.v1f64"] fn cos_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.cos.v2f64"] fn cos_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.cos.v4f64"] fn cos_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.cos.v8f64"] fn cos_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.cos.f32"] fn cos_f32(x: f32) -> f32; #[link_name = "llvm.cos.f64"] fn cos_f64(x: f64) -> f64; } gen_unary_impl_table!(Cos, cos); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: cos_f32); impl_unary!(f32x4[f32; 4]: cos_f32); impl_unary!(f32x8[f32; 8]: cos_f32); impl_unary!(f32x16[f32; 16]: cos_f32); impl_unary!(f64x2[f64; 2]: cos_f64); impl_unary!(f64x4[f64; 4]: cos_f64); impl_unary!(f64x8[f64; 8]: cos_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx2); impl_unary!(f32x4: Sleef_cosf4_u10avx2128); impl_unary!(f32x8: Sleef_cosf8_u10avx2); impl_unary!(f64x2: Sleef_cosd2_u10avx2128); impl_unary!(f64x4: Sleef_cosd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx); impl_unary!(f32x4: Sleef_cosf4_u10sse4); impl_unary!(f32x8: Sleef_cosf8_u10avx); impl_unary!(f64x2: Sleef_cosd2_u10sse4); impl_unary!(f64x4: Sleef_cosd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_cosd2_u10sse4); impl_unary!(f32x4: Sleef_cosf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f64x2: Sleef_cosd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_cosd2_u10sse4); } else { impl_unary!(f32x2[f32; 2]: cos_f32); impl_unary!(f32x16: cos_v16f32); impl_unary!(f64x8: cos_v8f64); impl_unary!(f32x4: cos_v4f32); impl_unary!(f32x8: cos_v8f32); impl_unary!(f64x2: cos_v2f64); impl_unary!(f64x4: cos_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: cos_f32); impl_unary!(f32x4: cos_v4f32); impl_unary!(f32x8: cos_v8f32); impl_unary!(f32x16: cos_v16f32); impl_unary!(f64x2: cos_v2f64); impl_unary!(f64x4: cos_v4f64); impl_unary!(f64x8: cos_v8f64); } } ================================================ FILE: src/codegen/math/float/cos_pi.rs ================================================ //! Vertical floating-point `cos` #![allow(unused)] // FIXME 64-bit 1 elem vectors cos_pi use crate::*; pub(crate) trait CosPi { fn cos_pi(self) -> Self; } gen_unary_impl_table!(CosPi, cos_pi); macro_rules! impl_def { ($vid:ident, $PI:path) => { impl CosPi for $vid { #[inline] fn cos_pi(self) -> Self { (self * Self::splat($PI)).cos() } } }; } macro_rules! 
impl_def32 { ($vid:ident) => { impl_def!($vid, crate::f32::consts::PI); }; } macro_rules! impl_def64 { ($vid:ident) => { impl_def!($vid, crate::f64::consts::PI); }; } cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx2); impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx2); impl_unary!(f32x4: Sleef_cospif4_u05avx2128); impl_unary!(f32x8: Sleef_cospif8_u05avx2); impl_unary!(f64x2: Sleef_cospid2_u05avx2128); impl_unary!(f64x4: Sleef_cospid4_u05avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx); impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx); impl_unary!(f32x4: Sleef_cospif4_u05sse4); impl_unary!(f32x8: Sleef_cospif8_u05avx); impl_unary!(f64x2: Sleef_cospid2_u05sse4); impl_unary!(f64x4: Sleef_cospid4_u05avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f32x16[q => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f64x8[q => f64x2]: Sleef_cospid2_u05sse4); impl_unary!(f32x4: Sleef_cospif4_u05sse4); impl_unary!(f32x8[h => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f64x2: Sleef_cospid2_u05sse4); impl_unary!(f64x4[h => f64x2]: Sleef_cospid2_u05sse4); } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } ================================================ FILE: src/codegen/math/float/exp.rs ================================================ //! Vertical floating-point `exp` #![allow(unused)] // FIXME 64-bit expgle elem vectors misexpg use crate::*; pub(crate) trait Exp { fn exp(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.exp.v2f32"] fn exp_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.exp.v4f32"] fn exp_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.exp.v8f32"] fn exp_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.exp.v16f32"] fn exp_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit expgle elem vectors #[link_name = "llvm.exp.v1f64"] fn exp_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.exp.v2f64"] fn exp_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.exp.v4f64"] fn exp_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.exp.v8f64"] fn exp_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.exp.f32"] fn exp_f32(x: f32) -> f32; #[link_name = "llvm.exp.f64"] fn exp_f64(x: f64) -> f64; } gen_unary_impl_table!(Exp, exp); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: exp_f32); impl_unary!(f32x4[f32; 4]: exp_f32); impl_unary!(f32x8[f32; 8]: exp_f32); impl_unary!(f32x16[f32; 16]: exp_f32); impl_unary!(f64x2[f64; 2]: exp_f64); impl_unary!(f64x4[f64; 4]: exp_f64); impl_unary!(f64x8[f64; 8]: exp_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
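// Editor's note (illustrative addition, not part of the original file): as in
// the other vertical math files, `exp` is dispatched in two layers: the outer
// `cfg_if!` chooses a per-lane scalar fallback on s390x (issue #14), SLEEF
// bindings on x86_64 when the `sleef-sys` feature is enabled, or the generic
// `llvm.exp.*` intrinsics; the inner `cfg_if!` below then picks the SLEEF
// symbol set for the available target feature (avx2/avx/sse4.2/sse2).
// A rough usage sketch, assuming the public `exp` method forwards here:
//
//     use packed_simd::f32x4;
//     let y = f32x4::new(0.0, 1.0, 2.0, 3.0).exp(); // lane-wise e^x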
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx2); impl_unary!(f32x4: Sleef_expf4_u10avx2128); impl_unary!(f32x8: Sleef_expf8_u10avx2); impl_unary!(f64x2: Sleef_expd2_u10avx2128); impl_unary!(f64x4: Sleef_expd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx); impl_unary!(f32x4: Sleef_expf4_u10sse4); impl_unary!(f32x8: Sleef_expf8_u10avx); impl_unary!(f64x2: Sleef_expd2_u10sse4); impl_unary!(f64x4: Sleef_expd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse4); impl_unary!(f32x4: Sleef_expf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f64x2: Sleef_expd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse4); } else if #[cfg(target_feature = "sse2")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse2); impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse2); impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse2); impl_unary!(f32x4: Sleef_expf4_u10sse2); impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse2); impl_unary!(f64x2: Sleef_expd2_u10sse2); impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse2); } else { impl_unary!(f32x2[f32; 2]: exp_f32); impl_unary!(f32x16: exp_v16f32); impl_unary!(f64x8: exp_v8f64); impl_unary!(f32x4: exp_v4f32); impl_unary!(f32x8: exp_v8f32); impl_unary!(f64x2: exp_v2f64); impl_unary!(f64x4: exp_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: exp_f32); impl_unary!(f32x4: exp_v4f32); impl_unary!(f32x8: exp_v8f32); impl_unary!(f32x16: exp_v16f32); impl_unary!(f64x2: exp_v2f64); impl_unary!(f64x4: exp_v4f64); impl_unary!(f64x8: exp_v8f64); } } ================================================ FILE: src/codegen/math/float/ln.rs ================================================ //! Vertical floating-point `ln` #![allow(unused)] // FIXME 64-bit lngle elem vectors mislng use crate::*; pub(crate) trait Ln { fn ln(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.log.v2f32"] fn ln_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.log.v4f32"] fn ln_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.log.v8f32"] fn ln_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.log.v16f32"] fn ln_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit lngle elem vectors #[link_name = "llvm.log.v1f64"] fn ln_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.log.v2f64"] fn ln_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.log.v4f64"] fn ln_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.log.v8f64"] fn ln_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.log.f32"] fn ln_f32(x: f32) -> f32; #[link_name = "llvm.log.f64"] fn ln_f64(x: f64) -> f64; } gen_unary_impl_table!(Ln, ln); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: ln_f32); impl_unary!(f32x4[f32; 4]: ln_f32); impl_unary!(f32x8[f32; 8]: ln_f32); impl_unary!(f32x16[f32; 16]: ln_f32); impl_unary!(f64x2[f64; 2]: ln_f64); impl_unary!(f64x4[f64; 4]: ln_f64); impl_unary!(f64x8[f64; 8]: ln_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx2); impl_unary!(f32x4: Sleef_logf4_u10avx2128); impl_unary!(f32x8: Sleef_logf8_u10avx2); impl_unary!(f64x2: Sleef_logd2_u10avx2128); impl_unary!(f64x4: Sleef_logd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx); impl_unary!(f32x4: Sleef_logf4_u10sse4); impl_unary!(f32x8: Sleef_logf8_u10avx); impl_unary!(f64x2: Sleef_logd2_u10sse4); impl_unary!(f64x4: Sleef_logd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse4); impl_unary!(f32x4: Sleef_logf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f64x2: Sleef_logd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse4); } else if #[cfg(target_feature = "sse2")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse2); impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse2); impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse2); impl_unary!(f32x4: Sleef_logf4_u10sse2); impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse2); impl_unary!(f64x2: Sleef_logd2_u10sse2); impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse2); } else { impl_unary!(f32x2[f32; 2]: ln_f32); impl_unary!(f32x16: ln_v16f32); impl_unary!(f64x8: ln_v8f64); impl_unary!(f32x4: ln_v4f32); impl_unary!(f32x8: ln_v8f32); impl_unary!(f64x2: ln_v2f64); impl_unary!(f64x4: ln_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: ln_f32); impl_unary!(f32x4: ln_v4f32); impl_unary!(f32x8: ln_v8f32); impl_unary!(f32x16: ln_v16f32); impl_unary!(f64x2: ln_v2f64); impl_unary!(f64x4: ln_v4f64); impl_unary!(f64x8: ln_v8f64); } } ================================================ FILE: src/codegen/math/float/macros.rs ================================================ //! Utility macros #![allow(unused)] macro_rules! impl_unary_ { // implementation mapping 1:1 (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; transmute($fun(transmute(self))) } } } }; // implementation mapping 1:1 for when `$fun` is a generic function // like some of the fp math rustc intrinsics (e.g. `fn fun(x: T) -> T`). 
(gen | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; transmute($fun(self.0)) } } } }; (scalar | $trait_id:ident, $trait_method:ident, $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { union U { vec: $vec_id, scalars: [$sid; $scount], } let mut scalars = U { vec: self }.scalars; for i in &mut scalars { *i = $fun(*i); } U { scalars }.vec } } } }; // implementation calling fun twice on each of the vector halves: (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vech_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, halves: [$vech_id; 2], } let mut halves = U { vec: self }.halves; *halves.get_unchecked_mut(0) = transmute($fun(transmute(*halves.get_unchecked(0)))); *halves.get_unchecked_mut(1) = transmute($fun(transmute(*halves.get_unchecked(1)))); U { halves }.vec } } } }; // implementation calling fun four times on each of the vector quarters: (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vecq_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, quarters: [$vecq_id; 4], } let mut quarters = U { vec: self }.quarters; *quarters.get_unchecked_mut(0) = transmute($fun(transmute(*quarters.get_unchecked(0)))); *quarters.get_unchecked_mut(1) = transmute($fun(transmute(*quarters.get_unchecked(1)))); *quarters.get_unchecked_mut(2) = transmute($fun(transmute(*quarters.get_unchecked(2)))); *quarters.get_unchecked_mut(3) = transmute($fun(transmute(*quarters.get_unchecked(3)))); U { quarters }.vec } } } }; // implementation calling fun once on a vector twice as large: (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vect_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::{transmute, uninitialized}; union U { vec: [$vec_id; 2], twice: $vect_id, } let twice = U { vec: [self, uninitialized()] }.twice; let twice = transmute($fun(transmute(twice))); *(U { twice }.vec.get_unchecked(0)) } } } }; } macro_rules! gen_unary_impl_table { ($trait_id:ident, $trait_method:ident) => { macro_rules! impl_unary { ($vid:ident: $fun:ident) => { impl_unary_!(vec | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[g]: $fun:ident) => { impl_unary_!(gen | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { impl_unary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun); }; ($vid:ident[s]: $fun:ident) => { impl_unary_!(scalar | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl_unary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun); }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl_unary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun); }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl_unary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun); }; } }; } macro_rules! 
impl_tertiary_ {
    // implementation mapping 1:1
    (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::transmute;
                    transmute($fun(transmute(self), transmute(y), transmute(z)))
                }
            }
        }
    };
    (scalar | $trait_id:ident, $trait_method:ident, $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    union U {
                        vec: $vec_id,
                        scalars: [$sid; $scount],
                    }
                    let mut x = U { vec: self }.scalars;
                    let y = U { vec: y }.scalars;
                    let z = U { vec: z }.scalars;
                    for (x, (y, z)) in x.iter_mut().zip(y.iter().zip(z.iter())) {
                        *x = $fun(*x, *y, *z);
                    }
                    U { scalars: x }.vec
                }
            }
        }
    };
    // implementation calling fun twice on each of the vector halves:
    (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vech_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::transmute;
                    union U {
                        vec: $vec_id,
                        halves: [$vech_id; 2],
                    }
                    let mut x_halves = U { vec: self }.halves;
                    let y_halves = U { vec: y }.halves;
                    let z_halves = U { vec: z }.halves;
                    *x_halves.get_unchecked_mut(0) = transmute($fun(
                        transmute(*x_halves.get_unchecked(0)),
                        transmute(*y_halves.get_unchecked(0)),
                        transmute(*z_halves.get_unchecked(0)),
                    ));
                    *x_halves.get_unchecked_mut(1) = transmute($fun(
                        transmute(*x_halves.get_unchecked(1)),
                        transmute(*y_halves.get_unchecked(1)),
                        transmute(*z_halves.get_unchecked(1)),
                    ));
                    U { halves: x_halves }.vec
                }
            }
        }
    };
    // implementation calling fun four times on each of the vector quarters:
    (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vecq_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::transmute;
                    union U {
                        vec: $vec_id,
                        quarters: [$vecq_id; 4],
                    }
                    let mut x_quarters = U { vec: self }.quarters;
                    let y_quarters = U { vec: y }.quarters;
                    let z_quarters = U { vec: z }.quarters;
                    *x_quarters.get_unchecked_mut(0) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(0)),
                        transmute(*y_quarters.get_unchecked(0)),
                        transmute(*z_quarters.get_unchecked(0)),
                    ));
                    *x_quarters.get_unchecked_mut(1) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(1)),
                        transmute(*y_quarters.get_unchecked(1)),
                        transmute(*z_quarters.get_unchecked(1)),
                    ));
                    *x_quarters.get_unchecked_mut(2) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(2)),
                        transmute(*y_quarters.get_unchecked(2)),
                        transmute(*z_quarters.get_unchecked(2)),
                    ));
                    *x_quarters.get_unchecked_mut(3) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(3)),
                        transmute(*y_quarters.get_unchecked(3)),
                        transmute(*z_quarters.get_unchecked(3)),
                    ));
                    U { quarters: x_quarters }.vec
                }
            }
        }
    };
    // implementation calling fun once on a vector twice as large:
    (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vect_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::{transmute, uninitialized};
                    union U {
                        vec: [$vec_id; 2],
                        twice: $vect_id,
                    }
                    let x_twice = U { vec: [self, uninitialized()] }.twice;
                    let y_twice = U { vec: [y, uninitialized()] }.twice;
                    let z_twice = U { vec: [z, uninitialized()] }.twice;
                    let twice: $vect_id = transmute($fun(
                        transmute(x_twice),
                        transmute(y_twice),
                        transmute(z_twice),
                    ));
                    *(U { twice }.vec.get_unchecked(0))
                }
            }
        }
    };
}

macro_rules!
gen_tertiary_impl_table { ($trait_id:ident, $trait_method:ident) => { macro_rules! impl_tertiary { ($vid:ident: $fun:ident) => { impl_tertiary_!(vec | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun); }; ($vid:ident[s]: $fun:ident) => { impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl_tertiary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun); }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl_tertiary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun); }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl_tertiary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun); }; } }; } macro_rules! impl_binary_ { // implementation mapping 1:1 (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::transmute; transmute($fun(transmute(self), transmute(y))) } } } }; (scalar | $trait_id:ident, $trait_method:ident, $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { union U { vec: $vec_id, scalars: [$sid; $scount], } let mut x = U { vec: self }.scalars; let y = U { vec: y }.scalars; for (x, y) in x.iter_mut().zip(&y) { *x = $fun(*x, *y); } U { scalars: x }.vec } } } }; // implementation calling fun twice on each of the vector halves: (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vech_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, halves: [$vech_id; 2], } let mut x_halves = U { vec: self }.halves; let y_halves = U { vec: y }.halves; *x_halves.get_unchecked_mut(0) = transmute($fun( transmute(*x_halves.get_unchecked(0)), transmute(*y_halves.get_unchecked(0)), )); *x_halves.get_unchecked_mut(1) = transmute($fun( transmute(*x_halves.get_unchecked(1)), transmute(*y_halves.get_unchecked(1)), )); U { halves: x_halves }.vec } } } }; // implementation calling fun four times on each of the vector quarters: (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vecq_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, quarters: [$vecq_id; 4], } let mut x_quarters = U { vec: self }.quarters; let y_quarters = U { vec: y }.quarters; *x_quarters.get_unchecked_mut(0) = transmute($fun( transmute(*x_quarters.get_unchecked(0)), transmute(*y_quarters.get_unchecked(0)), )); *x_quarters.get_unchecked_mut(1) = transmute($fun( transmute(*x_quarters.get_unchecked(1)), transmute(*y_quarters.get_unchecked(1)), )); *x_quarters.get_unchecked_mut(2) = transmute($fun( transmute(*x_quarters.get_unchecked(2)), transmute(*y_quarters.get_unchecked(2)), )); *x_quarters.get_unchecked_mut(3) = transmute($fun( transmute(*x_quarters.get_unchecked(3)), transmute(*y_quarters.get_unchecked(3)), )); U { quarters: x_quarters }.vec } } } }; // implementation calling fun once on a vector twice as large: (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vect_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::{transmute, uninitialized}; union U { vec: [$vec_id; 
2], twice: $vect_id, } let x_twice = U { vec: [self, uninitialized()] }.twice; let y_twice = U { vec: [y, uninitialized()] }.twice; let twice: $vect_id = transmute($fun(transmute(x_twice), transmute(y_twice))); *(U { twice }.vec.get_unchecked(0)) } } } }; } macro_rules! gen_binary_impl_table { ($trait_id:ident, $trait_method:ident) => { macro_rules! impl_binary { ($vid:ident: $fun:ident) => { impl_binary_!(vec | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { impl_binary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun); }; ($vid:ident[s]: $fun:ident) => { impl_binary_!(scalar | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl_binary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun); }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl_binary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun); }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl_binary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun); }; } }; } ================================================ FILE: src/codegen/math/float/mul_add.rs ================================================ //! Vertical floating-point `mul_add` #![allow(unused)] use crate::*; // FIXME: 64-bit 1 element mul_add pub(crate) trait MulAdd { fn mul_add(self, y: Self, z: Self) -> Self; } #[cfg(not(target_arch = "s390x"))] #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.fma.v2f32"] fn fma_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; #[link_name = "llvm.fma.v4f32"] fn fma_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; #[link_name = "llvm.fma.v8f32"] fn fma_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; #[link_name = "llvm.fma.v16f32"] fn fma_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; /* FIXME 64-bit single elem vectors #[link_name = "llvm.fma.v1f64"] fn fma_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; */ #[link_name = "llvm.fma.v2f64"] fn fma_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; #[link_name = "llvm.fma.v4f64"] fn fma_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; #[link_name = "llvm.fma.v8f64"] fn fma_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; } gen_tertiary_impl_table!(MulAdd, mul_add); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 macro_rules! impl_broken { ($id:ident) => { impl MulAdd for $id { #[inline] fn mul_add(self, y: Self, z: Self) -> Self { self * y + z } } }; } impl_broken!(f32x2); impl_broken!(f32x4); impl_broken!(f32x8); impl_broken!(f32x16); impl_broken!(f64x2); impl_broken!(f64x4); impl_broken!(f64x8); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
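// Editor's note (illustrative addition, not part of the original file): the
// bracket suffix in the `impl_unary!`/`impl_binary!`/`impl_tertiary!` tables
// selects one of the strategies defined in macros.rs above:
//
//     impl_tertiary!(f32x4: Sleef_fmaf4_avx2128);             // 1:1 call
//     impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2);   // two halves
//     impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4);   // four quarters
//     impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128); // pad to a vector
//                                                             // twice as wide
//
// The `t =>` case pads the upper lanes with uninitialized data, calls the
// wider function once, and keeps only the lower half of the result.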
{ if #[cfg(target_feature = "avx2")] { impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128); impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2); impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx2); impl_tertiary!(f32x4: Sleef_fmaf4_avx2128); impl_tertiary!(f32x8: Sleef_fmaf8_avx2); impl_tertiary!(f64x2: Sleef_fmad2_avx2128); impl_tertiary!(f64x4: Sleef_fmad4_avx2); } else if #[cfg(target_feature = "avx")] { impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx); impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx); impl_tertiary!(f32x4: Sleef_fmaf4_sse4); impl_tertiary!(f32x8: Sleef_fmaf8_avx); impl_tertiary!(f64x2: Sleef_fmad2_sse4); impl_tertiary!(f64x4: Sleef_fmad4_avx); } else if #[cfg(target_feature = "sse4.2")] { impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f64x8[q => f64x2]: Sleef_fmad2_sse4); impl_tertiary!(f32x4: Sleef_fmaf4_sse4); impl_tertiary!(f32x8[h => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f64x2: Sleef_fmad2_sse4); impl_tertiary!(f64x4[h => f64x2]: Sleef_fmad2_sse4); } else { impl_tertiary!(f32x2: fma_v2f32); impl_tertiary!(f32x16: fma_v16f32); impl_tertiary!(f64x8: fma_v8f64); impl_tertiary!(f32x4: fma_v4f32); impl_tertiary!(f32x8: fma_v8f32); impl_tertiary!(f64x2: fma_v2f64); impl_tertiary!(f64x4: fma_v4f64); } } } else { impl_tertiary!(f32x2: fma_v2f32); impl_tertiary!(f32x4: fma_v4f32); impl_tertiary!(f32x8: fma_v8f32); impl_tertiary!(f32x16: fma_v16f32); // impl_tertiary!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors impl_tertiary!(f64x2: fma_v2f64); impl_tertiary!(f64x4: fma_v4f64); impl_tertiary!(f64x8: fma_v8f64); } } ================================================ FILE: src/codegen/math/float/mul_adde.rs ================================================ //! Approximation for floating-point `mul_add` use crate::*; // FIXME: 64-bit 1 element mul_adde pub(crate) trait MulAddE { fn mul_adde(self, y: Self, z: Self) -> Self; } #[cfg(not(target_arch = "s390x"))] #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.fmuladd.v2f32"] fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; #[link_name = "llvm.fmuladd.v4f32"] fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; #[link_name = "llvm.fmuladd.v8f32"] fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; #[link_name = "llvm.fmuladd.v16f32"] fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; /* FIXME 64-bit single elem vectors #[link_name = "llvm.fmuladd.v1f64"] fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; */ #[link_name = "llvm.fmuladd.v2f64"] fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; #[link_name = "llvm.fmuladd.v4f64"] fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; #[link_name = "llvm.fmuladd.v8f64"] fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; } macro_rules! 
impl_mul_adde { ($id:ident : $fn:ident) => { impl MulAddE for $id { #[inline] fn mul_adde(self, y: Self, z: Self) -> Self { #[cfg(not(target_arch = "s390x"))] { use crate::mem::transmute; unsafe { transmute($fn(transmute(self), transmute(y), transmute(z))) } } #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 self * y + z } } } }; } impl_mul_adde!(f32x2: fmuladd_v2f32); impl_mul_adde!(f32x4: fmuladd_v4f32); impl_mul_adde!(f32x8: fmuladd_v8f32); impl_mul_adde!(f32x16: fmuladd_v16f32); // impl_mul_adde!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors impl_mul_adde!(f64x2: fmuladd_v2f64); impl_mul_adde!(f64x4: fmuladd_v4f64); impl_mul_adde!(f64x8: fmuladd_v8f64); ================================================ FILE: src/codegen/math/float/powf.rs ================================================ //! Vertical floating-point `powf` #![allow(unused)] // FIXME 64-bit powfgle elem vectors mispowfg use crate::*; pub(crate) trait Powf { fn powf(self, x: Self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.pow.v2f32"] fn powf_v2f32(x: f32x2, y: f32x2) -> f32x2; #[link_name = "llvm.pow.v4f32"] fn powf_v4f32(x: f32x4, y: f32x4) -> f32x4; #[link_name = "llvm.pow.v8f32"] fn powf_v8f32(x: f32x8, y: f32x8) -> f32x8; #[link_name = "llvm.pow.v16f32"] fn powf_v16f32(x: f32x16, y: f32x16) -> f32x16; /* FIXME 64-bit powfgle elem vectors #[link_name = "llvm.pow.v1f64"] fn powf_v1f64(x: f64x1, y: f64x1) -> f64x1; */ #[link_name = "llvm.pow.v2f64"] fn powf_v2f64(x: f64x2, y: f64x2) -> f64x2; #[link_name = "llvm.pow.v4f64"] fn powf_v4f64(x: f64x4, y: f64x4) -> f64x4; #[link_name = "llvm.pow.v8f64"] fn powf_v8f64(x: f64x8, y: f64x8) -> f64x8; #[link_name = "llvm.pow.f32"] fn powf_f32(x: f32, y: f32) -> f32; #[link_name = "llvm.pow.f64"] fn powf_f64(x: f64, y: f64) -> f64; } gen_binary_impl_table!(Powf, powf); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_binary!(f32x2[f32; 2]: powf_f32); impl_binary!(f32x4[f32; 4]: powf_f32); impl_binary!(f32x8[f32; 8]: powf_f32); impl_binary!(f32x16[f32; 16]: powf_f32); impl_binary!(f64x2[f64; 2]: powf_f64); impl_binary!(f64x4[f64; 4]: powf_f64); impl_binary!(f64x8[f64; 8]: powf_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
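// Editor's note (illustrative addition, not part of the original file): the
// two mul-add files above differ only in the LLVM intrinsic they bind.
// `mul_add.rs` uses `llvm.fma.*`, which always produces a fused multiply-add
// with a single rounding, while `mul_adde.rs` uses `llvm.fmuladd.*`, which
// lets the backend choose between fusing and a plain multiply-then-add:
//
//     // semantics, per lane:
//     //   mul_add(a, b, c)  == fma(a, b, c)                   (always fused)
//     //   mul_adde(a, b, c) == fma(a, b, c)  or  a * b + c    (target's choice)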
{ if #[cfg(target_feature = "avx2")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10avx2128); impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx2); impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx2); impl_binary!(f32x4: Sleef_powf4_u10avx2128); impl_binary!(f32x8: Sleef_powf8_u10avx2); impl_binary!(f64x2: Sleef_powd2_u10avx2128); impl_binary!(f64x4: Sleef_powd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx); impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx); impl_binary!(f32x4: Sleef_powf4_u10sse4); impl_binary!(f32x8: Sleef_powf8_u10avx); impl_binary!(f64x2: Sleef_powd2_u10sse4); impl_binary!(f64x4: Sleef_powd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse4); impl_binary!(f32x4: Sleef_powf4_u10sse4); impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f64x2: Sleef_powd2_u10sse4); impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse4); } else if #[cfg(target_feature = "sse2")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse2); impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse2); impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse2); impl_binary!(f32x4: Sleef_powf4_u10sse2); impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse2); impl_binary!(f64x2: Sleef_powd2_u10sse2); impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse2); } else { impl_binary!(f32x2[f32; 2]: powf_f32); impl_binary!(f32x4: powf_v4f32); impl_binary!(f32x8: powf_v8f32); impl_binary!(f32x16: powf_v16f32); impl_binary!(f64x2: powf_v2f64); impl_binary!(f64x4: powf_v4f64); impl_binary!(f64x8: powf_v8f64); } } } else { impl_binary!(f32x2[f32; 2]: powf_f32); impl_binary!(f32x4: powf_v4f32); impl_binary!(f32x8: powf_v8f32); impl_binary!(f32x16: powf_v16f32); impl_binary!(f64x2: powf_v2f64); impl_binary!(f64x4: powf_v4f64); impl_binary!(f64x8: powf_v8f64); } } ================================================ FILE: src/codegen/math/float/sin.rs ================================================ //! Vertical floating-point `sin` #![allow(unused)] // FIXME 64-bit 1 elem vectors sin use crate::*; pub(crate) trait Sin { fn sin(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.sin.v2f32"] fn sin_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.sin.v4f32"] fn sin_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.sin.v8f32"] fn sin_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.sin.v16f32"] fn sin_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit single elem vectors #[link_name = "llvm.sin.v1f64"] fn sin_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.sin.v2f64"] fn sin_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.sin.v4f64"] fn sin_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.sin.v8f64"] fn sin_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.sin.f32"] fn sin_f32(x: f32) -> f32; #[link_name = "llvm.sin.f64"] fn sin_f64(x: f64) -> f64; } gen_unary_impl_table!(Sin, sin); cfg_if! 
{ if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: sin_f32); impl_unary!(f32x4[f32; 4]: sin_f32); impl_unary!(f32x8[f32; 8]: sin_f32); impl_unary!(f32x16[f32; 16]: sin_f32); impl_unary!(f64x2[f64; 2]: sin_f64); impl_unary!(f64x4[f64; 4]: sin_f64); impl_unary!(f64x8[f64; 8]: sin_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx2); impl_unary!(f32x4: Sleef_sinf4_u10avx2128); impl_unary!(f32x8: Sleef_sinf8_u10avx2); impl_unary!(f64x2: Sleef_sind2_u10avx2128); impl_unary!(f64x4: Sleef_sind4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx); impl_unary!(f32x4: Sleef_sinf4_u10sse4); impl_unary!(f32x8: Sleef_sinf8_u10avx); impl_unary!(f64x2: Sleef_sind2_u10sse4); impl_unary!(f64x4: Sleef_sind4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sind2_u10sse4); impl_unary!(f32x4: Sleef_sinf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f64x2: Sleef_sind2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sind2_u10sse4); } else { impl_unary!(f32x2[f32; 2]: sin_f32); impl_unary!(f32x16: sin_v16f32); impl_unary!(f64x8: sin_v8f64); impl_unary!(f32x4: sin_v4f32); impl_unary!(f32x8: sin_v8f32); impl_unary!(f64x2: sin_v2f64); impl_unary!(f64x4: sin_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: sin_f32); impl_unary!(f32x4: sin_v4f32); impl_unary!(f32x8: sin_v8f32); impl_unary!(f32x16: sin_v16f32); impl_unary!(f64x2: sin_v2f64); impl_unary!(f64x4: sin_v4f64); impl_unary!(f64x8: sin_v8f64); } } ================================================ FILE: src/codegen/math/float/sin_cos_pi.rs ================================================ //! Vertical floating-point `sin_cos` #![allow(unused)] // FIXME 64-bit 1 elem vectors sin_cos use crate::*; pub(crate) trait SinCosPi: Sized { type Output; fn sin_cos_pi(self) -> Self::Output; } macro_rules! impl_def { ($vid:ident, $PI:path) => { impl SinCosPi for $vid { type Output = (Self, Self); #[inline] fn sin_cos_pi(self) -> Self::Output { let v = self * Self::splat($PI); (v.sin(), v.cos()) } } }; } macro_rules! impl_def32 { ($vid:ident) => { impl_def!($vid, crate::f32::consts::PI); }; } macro_rules! impl_def64 { ($vid:ident) => { impl_def!($vid, crate::f64::consts::PI); }; } macro_rules! 
impl_unary_t { ($vid:ident: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::transmute; transmute($fun(transmute(self))) } } } }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::{transmute, uninitialized}; union U { vec: [$vid; 2], twice: $vid_t, } let twice = U { vec: [self, uninitialized()] }.twice; let twice = transmute($fun(transmute(twice))); union R { twice: ($vid_t, $vid_t), vecs: ([$vid; 2], [$vid; 2]), } let r = R { twice }.vecs; (*r.0.get_unchecked(0), *r.0.get_unchecked(1)) } } } }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::transmute; union U { vec: $vid, halves: [$vid_h; 2], } let halves = U { vec: self }.halves; let res_0: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(0)))); let res_1: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(1)))); union R { result: ($vid, $vid), halves: ([$vid_h; 2], [$vid_h; 2]), } R { halves: ([res_0.0, res_1.0], [res_0.1, res_1.1]) }.result } } } }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::transmute; union U { vec: $vid, quarters: [$vid_q; 4], } let quarters = U { vec: self }.quarters; let res_0: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(0)))); let res_1: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(1)))); let res_2: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(2)))); let res_3: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(3)))); union R { result: ($vid, $vid), quarters: ([$vid_q; 4], [$vid_q; 4]), } R { quarters: ( [res_0.0, res_1.0, res_2.0, res_3.0], [res_0.1, res_1.1, res_2.1, res_3.1], ), } .result } } } }; } cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
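// Editor's note (illustrative addition, not part of the original file): the
// `impl_def!` fallback above computes both results from one scaled input,
// i.e. `sin_cos_pi(x) == (sin(PI * x), cos(PI * x))`. A rough usage sketch,
// assuming the public `sin_cos_pi` method forwards to this trait:
//
//     use packed_simd::f32x4;
//     let (s, c) = f32x4::splat(0.5).sin_cos_pi();
//     // every lane: s ~= 1.0 (sin(pi/2)), c ~= 0.0 (cos(pi/2))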
{ if #[cfg(target_feature = "avx2")] { impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05avx2128); impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx2); impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx2); impl_unary_t!(f32x4: Sleef_sincospif4_u05avx2128); impl_unary_t!(f32x8: Sleef_sincospif8_u05avx2); impl_unary_t!(f64x2: Sleef_sincospid2_u05avx2128); impl_unary_t!(f64x4: Sleef_sincospid4_u05avx2); } else if #[cfg(target_feature = "avx")] { impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx); impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx); impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x8: Sleef_sincospif8_u05avx); impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); impl_unary_t!(f64x4: Sleef_sincospid4_u05avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x16[q => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f64x8[q => f64x2]: Sleef_sincospid2_u05sse4); impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x8[h => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); impl_unary_t!(f64x4[h => f64x2]: Sleef_sincospid2_u05sse4); } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } ================================================ FILE: src/codegen/math/float/sin_pi.rs ================================================ //! Vertical floating-point `sin_pi` #![allow(unused)] // FIXME 64-bit 1 elem vectors sin_pi use crate::*; pub(crate) trait SinPi { fn sin_pi(self) -> Self; } gen_unary_impl_table!(SinPi, sin_pi); macro_rules! impl_def { ($vid:ident, $PI:path) => { impl SinPi for $vid { #[inline] fn sin_pi(self) -> Self { (self * Self::splat($PI)).sin() } } }; } macro_rules! impl_def32 { ($vid:ident) => { impl_def!($vid, crate::f32::consts::PI); }; } macro_rules! impl_def64 { ($vid:ident) => { impl_def!($vid, crate::f64::consts::PI); }; } cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx2); impl_unary!(f32x4: Sleef_sinpif4_u05avx2128); impl_unary!(f32x8: Sleef_sinpif8_u05avx2); impl_unary!(f64x2: Sleef_sinpid2_u05avx2128); impl_unary!(f64x4: Sleef_sinpid4_u05avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx); impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx); impl_unary!(f32x4: Sleef_sinpif4_u05sse4); impl_unary!(f32x8: Sleef_sinpif8_u05avx); impl_unary!(f64x2: Sleef_sinpid2_u05sse4); impl_unary!(f64x4: Sleef_sinpid4_u05avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sinpid2_u05sse4); impl_unary!(f32x4: Sleef_sinpif4_u05sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f64x2: Sleef_sinpid2_u05sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sinpid2_u05sse4); } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } ================================================ FILE: src/codegen/math/float/sqrt.rs ================================================ //! Vertical floating-point `sqrt` #![allow(unused)] // FIXME 64-bit 1 elem vectors sqrt use crate::*; pub(crate) trait Sqrt { fn sqrt(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.sqrt.v2f32"] fn sqrt_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.sqrt.v4f32"] fn sqrt_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.sqrt.v8f32"] fn sqrt_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.sqrt.v16f32"] fn sqrt_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit sqrtgle elem vectors #[link_name = "llvm.sqrt.v1f64"] fn sqrt_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.sqrt.v2f64"] fn sqrt_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.sqrt.v4f64"] fn sqrt_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.sqrt.v8f64"] fn sqrt_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.sqrt.f32"] fn sqrt_f32(x: f32) -> f32; #[link_name = "llvm.sqrt.f64"] fn sqrt_f64(x: f64) -> f64; } gen_unary_impl_table!(Sqrt, sqrt); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: sqrt_f32); impl_unary!(f32x4[f32; 4]: sqrt_f32); impl_unary!(f32x8[f32; 8]: sqrt_f32); impl_unary!(f32x16[f32; 16]: sqrt_f32); impl_unary!(f64x2[f64; 2]: sqrt_f64); impl_unary!(f64x4[f64; 4]: sqrt_f64); impl_unary!(f64x8[f64; 8]: sqrt_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx2); impl_unary!(f32x4: Sleef_sqrtf4_avx2128); impl_unary!(f32x8: Sleef_sqrtf8_avx2); impl_unary!(f64x2: Sleef_sqrtd2_avx2128); impl_unary!(f64x4: Sleef_sqrtd4_avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx); impl_unary!(f32x4: Sleef_sqrtf4_sse4); impl_unary!(f32x8: Sleef_sqrtf8_avx); impl_unary!(f64x2: Sleef_sqrtd2_sse4); impl_unary!(f64x4: Sleef_sqrtd4_avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_sse4); impl_unary!(f32x4: Sleef_sqrtf4_sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f64x2: Sleef_sqrtd2_sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_sse4); } else { impl_unary!(f32x2[f32; 2]: sqrt_f32); impl_unary!(f32x16: sqrt_v16f32); impl_unary!(f64x8: sqrt_v8f64); impl_unary!(f32x4: sqrt_v4f32); impl_unary!(f32x8: sqrt_v8f32); impl_unary!(f64x2: sqrt_v2f64); impl_unary!(f64x4: sqrt_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: sqrt_f32); impl_unary!(f32x4: sqrt_v4f32); impl_unary!(f32x8: sqrt_v8f32); impl_unary!(f32x16: sqrt_v16f32); impl_unary!(f64x2: sqrt_v2f64); impl_unary!(f64x4: sqrt_v4f64); impl_unary!(f64x8: sqrt_v8f64); } } ================================================ FILE: src/codegen/math/float/sqrte.rs ================================================ //! Vertical floating-point `sqrt` #![allow(unused)] // FIXME 64-bit 1 elem vectors sqrte use crate::llvm::simd_fsqrt; use crate::*; pub(crate) trait Sqrte { fn sqrte(self) -> Self; } gen_unary_impl_table!(Sqrte, sqrte); cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
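// Editor's note (illustrative addition, not part of the original file):
// `sqrte` is the relaxed-precision square-root estimate. The SLEEF symbols
// below carry the `_u35` suffix (3.5 ULP error bound) rather than the
// `_u05`/`_u10` (0.5/1.0 ULP) variants used elsewhere, and the non-SLEEF
// fallback simply reuses the `simd_fsqrt` platform intrinsic. A rough usage
// sketch, assuming the public `sqrte` method forwards to this trait:
//
//     use packed_simd::f64x4;
//     let r = f64x4::new(1.0, 4.0, 9.0, 16.0).sqrte(); // ~ [1.0, 2.0, 3.0, 4.0]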
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx2); impl_unary!(f32x4: Sleef_sqrtf4_u35avx2128); impl_unary!(f32x8: Sleef_sqrtf8_u35avx2); impl_unary!(f64x2: Sleef_sqrtd2_u35avx2128); impl_unary!(f64x4: Sleef_sqrtd4_u35avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx); impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); impl_unary!(f32x8: Sleef_sqrtf8_u35avx); impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); impl_unary!(f64x4: Sleef_sqrtd4_u35avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_u35sse4); impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_u35sse4); } else { impl_unary!(f32x2[g]: simd_fsqrt); impl_unary!(f32x16[g]: simd_fsqrt); impl_unary!(f64x8[g]: simd_fsqrt); impl_unary!(f32x4[g]: simd_fsqrt); impl_unary!(f32x8[g]: simd_fsqrt); impl_unary!(f64x2[g]: simd_fsqrt); impl_unary!(f64x4[g]: simd_fsqrt); } } } else { impl_unary!(f32x2[g]: simd_fsqrt); impl_unary!(f32x4[g]: simd_fsqrt); impl_unary!(f32x8[g]: simd_fsqrt); impl_unary!(f32x16[g]: simd_fsqrt); impl_unary!(f64x2[g]: simd_fsqrt); impl_unary!(f64x4[g]: simd_fsqrt); impl_unary!(f64x8[g]: simd_fsqrt); } } ================================================ FILE: src/codegen/math/float/tanh.rs ================================================ //! Vertical floating-point `tanh` #![allow(unused)] // FIXME 64-bit 1 elem vectors tanh #[cfg(not(feature = "std"))] use num_traits::Float; use crate::*; pub(crate) trait Tanh { fn tanh(self) -> Self; } macro_rules! define_tanh { ($name:ident, $basetype:ty, $simdtype:ty, $lanes:expr, $trait:path) => { fn $name(x: $simdtype) -> $simdtype { use core::intrinsics::transmute; let mut buf: [$basetype; $lanes] = unsafe { transmute(x) }; for elem in &mut buf { *elem = <$basetype as $trait>::tanh(*elem); } unsafe { transmute(buf) } } }; (f32 => $name:ident, $type:ty, $lanes:expr) => { define_tanh!($name, f32, $type, $lanes, Float); }; (f64 => $name:ident, $type:ty, $lanes:expr) => { define_tanh!($name, f64, $type, $lanes, Float); }; } // llvm does not seem to expose the hyperbolic versions of trigonometric // functions; we thus call the classical rust versions on all of them (which // stem from cmath). define_tanh!(f32 => tanh_v2f32, f32x2, 2); define_tanh!(f32 => tanh_v4f32, f32x4, 4); define_tanh!(f32 => tanh_v8f32, f32x8, 8); define_tanh!(f32 => tanh_v16f32, f32x16, 16); define_tanh!(f64 => tanh_v2f64, f64x2, 2); define_tanh!(f64 => tanh_v4f64, f64x4, 4); define_tanh!(f64 => tanh_v8f64, f64x8, 8); fn tanh_f32(x: f32) -> f32 { Float::tanh(x) } fn tanh_f64(x: f64) -> f64 { Float::tanh(x) } gen_unary_impl_table!(Tanh, tanh); cfg_if! 
{ if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: tanh_f32); impl_unary!(f32x4[f32; 4]: tanh_f32); impl_unary!(f32x8[f32; 8]: tanh_f32); impl_unary!(f32x16[f32; 16]: tanh_f32); impl_unary!(f64x2[f64; 2]: tanh_f64); impl_unary!(f64x4[f64; 4]: tanh_f64); impl_unary!(f64x8[f64; 8]: tanh_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx2); impl_unary!(f32x4: Sleef_tanhf4_u10avx2128); impl_unary!(f32x8: Sleef_tanhf8_u10avx2); impl_unary!(f64x2: Sleef_tanhd2_u10avx2128); impl_unary!(f64x4: Sleef_tanhd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx); impl_unary!(f32x4: Sleef_tanhf4_u10sse4); impl_unary!(f32x8: Sleef_tanhf8_u10avx); impl_unary!(f64x2: Sleef_tanhd2_u10sse4); impl_unary!(f64x4: Sleef_tanhd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_tanhd2_u10sse4); impl_unary!(f32x4: Sleef_tanhf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f64x2: Sleef_tanhd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_tanhd2_u10sse4); } else { impl_unary!(f32x2[f32; 2]: tanh_f32); impl_unary!(f32x16: tanh_v16f32); impl_unary!(f64x8: tanh_v8f64); impl_unary!(f32x4: tanh_v4f32); impl_unary!(f32x8: tanh_v8f32); impl_unary!(f64x2: tanh_v2f64); impl_unary!(f64x4: tanh_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: tanh_f32); impl_unary!(f32x4: tanh_v4f32); impl_unary!(f32x8: tanh_v8f32); impl_unary!(f32x16: tanh_v16f32); impl_unary!(f64x2: tanh_v2f64); impl_unary!(f64x4: tanh_v4f64); impl_unary!(f64x8: tanh_v8f64); } } ================================================ FILE: src/codegen/math/float.rs ================================================ //! Vertical floating-point math operations. #![allow(clippy::useless_transmute)] #[macro_use] pub(crate) mod macros; pub(crate) mod abs; pub(crate) mod cos; pub(crate) mod cos_pi; pub(crate) mod exp; pub(crate) mod ln; pub(crate) mod mul_add; pub(crate) mod mul_adde; pub(crate) mod powf; pub(crate) mod sin; pub(crate) mod sin_cos_pi; pub(crate) mod sin_pi; pub(crate) mod sqrt; pub(crate) mod sqrte; pub(crate) mod tanh; ================================================ FILE: src/codegen/math.rs ================================================ //! Vertical math operations pub(crate) mod float; ================================================ FILE: src/codegen/pointer_sized_int.rs ================================================ //! Provides `isize` and `usize` use cfg_if::cfg_if; cfg_if! 
{ if #[cfg(target_pointer_width = "8")] { pub(crate) type isize_ = i8; pub(crate) type usize_ = u8; } else if #[cfg(target_pointer_width = "16")] { pub(crate) type isize_ = i16; pub(crate) type usize_ = u16; } else if #[cfg(target_pointer_width = "32")] { pub(crate) type isize_ = i32; pub(crate) type usize_ = u32; } else if #[cfg(target_pointer_width = "64")] { pub(crate) type isize_ = i64; pub(crate) type usize_ = u64; } else if #[cfg(target_pointer_width = "128")] { pub(crate) type isize_ = i128; pub(crate) type usize_ = u128; } else { compile_error!("unsupported target_pointer_width"); } } ================================================ FILE: src/codegen/reductions/mask/aarch64.rs ================================================ //! Mask reductions implementation for `aarch64` targets /// 128-bit wide vectors macro_rules! aarch64_128_neon_impl { ($id:ident, $vmin:ident, $vmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn all(self) -> bool { use crate::arch::aarch64::$vmin; $vmin(crate::mem::transmute(self)) != 0 } } impl Any for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn any(self) -> bool { use crate::arch::aarch64::$vmax; $vmax(crate::mem::transmute(self)) != 0 } } }; } /// 64-bit wide vectors macro_rules! aarch64_64_neon_impl { ($id:ident, $vec128:ident) => { impl All for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn all(self) -> bool { // Duplicates the 64-bit vector into a 128-bit one and // calls all on that. union U { halves: ($id, $id), vec: $vec128, } U { halves: (self, self) }.vec.all() } } impl Any for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn any(self) -> bool { union U { halves: ($id, $id), vec: $vec128, } U { halves: (self, self) }.vec.any() } } }; } /// Mask reduction implementation for `aarch64` targets macro_rules! impl_mask_reductions { // 64-bit wide masks (m8x8) => { aarch64_64_neon_impl!(m8x8, m8x16); }; (m16x4) => { aarch64_64_neon_impl!(m16x4, m16x8); }; (m32x2) => { aarch64_64_neon_impl!(m32x2, m32x4); }; // 128-bit wide masks (m8x16) => { aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8); }; (m16x8) => { aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16); }; (m32x4) => { aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32); }; // Fallback to LLVM's default code-generation: ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask/arm.rs ================================================ //! Mask reductions implementation for `arm` targets /// Implementation for ARM + v7 + NEON for 64-bit or 128-bit wide vectors with /// more than two elements. macro_rules!
arm_128_v7_neon_impl { ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn all(self) -> bool { use crate::arch::arm::$vpmin; use crate::mem::transmute; union U { halves: ($half, $half), vec: $id, } let halves = U { vec: self }.halves; let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1))); h.all() } } impl Any for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn any(self) -> bool { use crate::arch::arm::$vpmax; use crate::mem::transmute; union U { halves: ($half, $half), vec: $id, } let halves = U { vec: self }.halves; let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1))); h.any() } } }; } /// Mask reduction implementation for `arm` targets macro_rules! impl_mask_reductions { // 128-bit wide masks (m8x16) => { arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8); }; (m16x8) => { arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16); }; (m32x4) => { arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32); }; // Fallback to LLVM's default code-generation: ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask/fallback.rs ================================================ //! Default mask reduction implementations. /// Default mask reduction implementation macro_rules! impl_mask_reductions { ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask/fallback_impl.rs ================================================ //! Default implementation of a mask reduction for any target. macro_rules! fallback_to_other_impl { ($id:ident, $other:ident) => { impl All for $id { #[inline] unsafe fn all(self) -> bool { let m: $other = crate::mem::transmute(self); m.all() } } impl Any for $id { #[inline] unsafe fn any(self) -> bool { let m: $other = crate::mem::transmute(self); m.any() } } }; } /// Fallback implementation. macro_rules! 
fallback_impl { // 16-bit wide masks: (m8x2) => { impl All for m8x2 { #[inline] unsafe fn all(self) -> bool { let i: u16 = crate::mem::transmute(self); i == u16::max_value() } } impl Any for m8x2 { #[inline] unsafe fn any(self) -> bool { let i: u16 = crate::mem::transmute(self); i != 0 } } }; // 32-bit wide masks (m8x4) => { impl All for m8x4 { #[inline] unsafe fn all(self) -> bool { let i: u32 = crate::mem::transmute(self); i == u32::max_value() } } impl Any for m8x4 { #[inline] unsafe fn any(self) -> bool { let i: u32 = crate::mem::transmute(self); i != 0 } } }; (m16x2) => { fallback_to_other_impl!(m16x2, m8x4); }; // 64-bit wide masks: (m8x8) => { impl All for m8x8 { #[inline] unsafe fn all(self) -> bool { let i: u64 = crate::mem::transmute(self); i == u64::max_value() } } impl Any for m8x8 { #[inline] unsafe fn any(self) -> bool { let i: u64 = crate::mem::transmute(self); i != 0 } } }; (m16x4) => { fallback_to_other_impl!(m16x4, m8x8); }; (m32x2) => { fallback_to_other_impl!(m32x2, m16x4); }; // FIXME: 64x1 maxk // 128-bit wide masks: (m8x16) => { impl All for m8x16 { #[inline] unsafe fn all(self) -> bool { let i: u128 = crate::mem::transmute(self); i == u128::max_value() } } impl Any for m8x16 { #[inline] unsafe fn any(self) -> bool { let i: u128 = crate::mem::transmute(self); i != 0 } } }; (m16x8) => { fallback_to_other_impl!(m16x8, m8x16); }; (m32x4) => { fallback_to_other_impl!(m32x4, m16x8); }; (m64x2) => { fallback_to_other_impl!(m64x2, m32x4); }; (m128x1) => { fallback_to_other_impl!(m128x1, m64x2); }; // 256-bit wide masks (m8x32) => { impl All for m8x32 { #[inline] unsafe fn all(self) -> bool { let i: [u128; 2] = crate::mem::transmute(self); let o: [u128; 2] = [u128::max_value(); 2]; i == o } } impl Any for m8x32 { #[inline] unsafe fn any(self) -> bool { let i: [u128; 2] = crate::mem::transmute(self); let o: [u128; 2] = [0; 2]; i != o } } }; (m16x16) => { fallback_to_other_impl!(m16x16, m8x32); }; (m32x8) => { fallback_to_other_impl!(m32x8, m16x16); }; (m64x4) => { fallback_to_other_impl!(m64x4, m32x8); }; (m128x2) => { fallback_to_other_impl!(m128x2, m64x4); }; // 512-bit wide masks (m8x64) => { impl All for m8x64 { #[inline] unsafe fn all(self) -> bool { let i: [u128; 4] = crate::mem::transmute(self); let o: [u128; 4] = [u128::max_value(); 4]; i == o } } impl Any for m8x64 { #[inline] unsafe fn any(self) -> bool { let i: [u128; 4] = crate::mem::transmute(self); let o: [u128; 4] = [0; 4]; i != o } } }; (m16x32) => { fallback_to_other_impl!(m16x32, m8x64); }; (m32x16) => { fallback_to_other_impl!(m32x16, m16x32); }; (m64x8) => { fallback_to_other_impl!(m64x8, m32x16); }; (m128x4) => { fallback_to_other_impl!(m128x4, m64x8); }; // Masks with pointer-sized elements64 (msizex2) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex2, m64x2); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex2, m32x2); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex4) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex4, m64x4); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex4, m32x4); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex8) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex8, m64x8); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex8, m32x8); } else { compile_error!("unsupported target_pointer_width"); } } }; } macro_rules! 
recurse_half { ($vid:ident, $vid_h:ident) => { impl All for $vid { #[inline] unsafe fn all(self) -> bool { union U { halves: ($vid_h, $vid_h), vec: $vid, } let halves = U { vec: self }.halves; halves.0.all() && halves.1.all() } } impl Any for $vid { #[inline] unsafe fn any(self) -> bool { union U { halves: ($vid_h, $vid_h), vec: $vid, } let halves = U { vec: self }.halves; halves.0.any() || halves.1.any() } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/avx.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `AVX` /// `x86`/`x86_64` 256-bit `AVX` implementation /// FIXME: it might be faster here to do two `_mm_movmask_epi8` #[cfg(target_feature = "avx")] macro_rules! x86_m8x32_avx_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "avx")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_testc_si256; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_testc_si256; _mm256_testc_si256(crate::mem::transmute(self), crate::mem::transmute($id::splat(true))) != 0 } } impl Any for $id { #[inline] #[target_feature(enable = "avx")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_testz_si256; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_testz_si256; _mm256_testz_si256(crate::mem::transmute(self), crate::mem::transmute(self)) == 0 } } }; } /// `x86`/`x86_64` 256-bit m32x8 `AVX` implementation macro_rules! x86_m32x8_avx_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_ps; // _mm256_movemask_ps(a) creates a 8bit mask containing the // most significant bit of each lane of `a`. If all bits are // set, then all 8 lanes of the mask are true. _mm256_movemask_ps(crate::mem::transmute(self)) == 0b_1111_1111_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_ps; _mm256_movemask_ps(crate::mem::transmute(self)) != 0 } } }; } /// `x86`/`x86_64` 256-bit m64x4 `AVX` implementation macro_rules! x86_m64x4_avx_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_pd; // _mm256_movemask_pd(a) creates a 4bit mask containing the // most significant bit of each lane of `a`. If all bits are // set, then all 4 lanes of the mask are true. _mm256_movemask_pd(crate::mem::transmute(self)) == 0b_1111_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_pd; _mm256_movemask_pd(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/avx2.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `AVX2`. 
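//!
//! A minimal scalar sketch of the `movemask`-style reduction used below
//! (illustrative only; `all_from_movemask`/`any_from_movemask` are
//! hypothetical helpers, not part of this crate's API):
//! `_mm256_movemask_epi8` packs the most-significant bit of each of the
//! 32 bytes into an `i32`, so the reductions amount to:
//!
//! ```rust,ignore
//! fn all_from_movemask(bits: i32) -> bool { bits == -1 } // all 32 MSBs set
//! fn any_from_movemask(bits: i32) -> bool { bits != 0 }  // at least one MSB set
//! ```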
#![allow(unused)] /// x86/x86_64 256-bit m8x32 AVX2 implementation macro_rules! x86_m8x32_avx2_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_epi8; // _mm256_movemask_epi8(a) creates a 32bit mask containing the // most significant bit of each byte of `a`. If all // bits are set, then all 32 lanes of the mask are // true. _mm256_movemask_epi8(crate::mem::transmute(self)) == -1_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_epi8; _mm256_movemask_epi8(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/sse.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `SSE`. #![allow(unused)] /// `x86`/`x86_64` 128-bit `m32x4` `SSE` implementation macro_rules! x86_m32x4_sse_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_ps; // _mm_movemask_ps(a) creates a 4bit mask containing the // most significant bit of each lane of `a`. If all // bits are set, then all 4 lanes of the mask are // true. _mm_movemask_ps(crate::mem::transmute(self)) == 0b_1111_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_ps; _mm_movemask_ps(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/sse2.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `SSE2`. #![allow(unused)] /// `x86`/`x86_64` 128-bit m64x2 `SSE2` implementation macro_rules! x86_m64x2_sse2_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_pd; // _mm_movemask_pd(a) creates a 2bit mask containing the // most significant bit of each lane of `a`. If all // bits are set, then all 2 lanes of the mask are // true. _mm_movemask_pd(crate::mem::transmute(self)) == 0b_11_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_pd; _mm_movemask_pd(crate::mem::transmute(self)) != 0 } } }; } /// `x86`/`x86_64` 128-bit m8x16 `SSE2` implementation macro_rules! 
x86_m8x16_sse2_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_epi8; // _mm_movemask_epi8(a) creates a 16bit mask containing the // most significant bit of each byte of `a`. If all // bits are set, then all 16 lanes of the mask are // true. _mm_movemask_epi8(crate::mem::transmute(self)) == i32::from(u16::max_value()) } } impl Any for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_epi8; _mm_movemask_epi8(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets #[cfg(target_feature = "sse")] #[macro_use] mod sse; #[cfg(target_feature = "sse2")] #[macro_use] mod sse2; #[cfg(target_feature = "avx")] #[macro_use] mod avx; #[cfg(target_feature = "avx2")] #[macro_use] mod avx2; /// x86 64-bit m8x8 implementation macro_rules! x86_m8x8_impl { ($id:ident) => { fallback_impl!($id); }; } /// x86 128-bit m8x16 implementation macro_rules! x86_m8x16_impl { ($id:ident) => { cfg_if! { if #[cfg(target_feature = "sse2")] { x86_m8x16_sse2_impl!($id); } else { fallback_impl!($id); } } }; } /// x86 128-bit m32x4 implementation macro_rules! x86_m32x4_impl { ($id:ident) => { cfg_if! { if #[cfg(target_feature = "sse")] { x86_m32x4_sse_impl!($id); } else { fallback_impl!($id); } } }; } /// x86 128-bit m64x2 implementation macro_rules! x86_m64x2_impl { ($id:ident) => { cfg_if! { if #[cfg(target_feature = "sse2")] { x86_m64x2_sse2_impl!($id); } else if #[cfg(target_feature = "sse")] { x86_m32x4_sse_impl!($id); } else { fallback_impl!($id); } } }; } /// x86 256-bit m8x32 implementation macro_rules! x86_m8x32_impl { ($id:ident, $half_id:ident) => { cfg_if! { if #[cfg(target_feature = "avx2")] { x86_m8x32_avx2_impl!($id); } else if #[cfg(target_feature = "avx")] { x86_m8x32_avx_impl!($id); } else if #[cfg(target_feature = "sse2")] { recurse_half!($id, $half_id); } else { fallback_impl!($id); } } }; } /// x86 256-bit m32x8 implementation macro_rules! x86_m32x8_impl { ($id:ident, $half_id:ident) => { cfg_if! { if #[cfg(target_feature = "avx")] { x86_m32x8_avx_impl!($id); } else if #[cfg(target_feature = "sse")] { recurse_half!($id, $half_id); } else { fallback_impl!($id); } } }; } /// x86 256-bit m64x4 implementation macro_rules! x86_m64x4_impl { ($id:ident, $half_id:ident) => { cfg_if! { if #[cfg(target_feature = "avx")] { x86_m64x4_avx_impl!($id); } else if #[cfg(target_feature = "sse")] { recurse_half!($id, $half_id); } else { fallback_impl!($id); } } }; } /// Fallback implementation. macro_rules! x86_intr_impl { ($id:ident) => { impl All for $id { #[inline] unsafe fn all(self) -> bool { use crate::llvm::simd_reduce_all; simd_reduce_all(self.0) } } impl Any for $id { #[inline] unsafe fn any(self) -> bool { use crate::llvm::simd_reduce_any; simd_reduce_any(self.0) } } }; } /// Mask reduction implementation for `x86` and `x86_64` targets macro_rules! 
impl_mask_reductions { // 64-bit wide masks (m8x8) => { x86_m8x8_impl!(m8x8); }; (m16x4) => { x86_m8x8_impl!(m16x4); }; (m32x2) => { x86_m8x8_impl!(m32x2); }; // 128-bit wide masks (m8x16) => { x86_m8x16_impl!(m8x16); }; (m16x8) => { x86_m8x16_impl!(m16x8); }; (m32x4) => { x86_m32x4_impl!(m32x4); }; (m64x2) => { x86_m64x2_impl!(m64x2); }; (m128x1) => { x86_intr_impl!(m128x1); }; // 256-bit wide masks: (m8x32) => { x86_m8x32_impl!(m8x32, m8x16); }; (m16x16) => { x86_m8x32_impl!(m16x16, m16x8); }; (m32x8) => { x86_m32x8_impl!(m32x8, m32x4); }; (m64x4) => { x86_m64x4_impl!(m64x4, m64x2); }; (m128x2) => { x86_intr_impl!(m128x2); }; (msizex2) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex2, m64x2); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex2, m32x2); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex4) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex4, m64x4); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex4, m32x4); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex8) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex8, m64x8); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex8, m32x8); } else { compile_error!("unsupported target_pointer_width"); } } }; // Fallback to LLVM's default code-generation: ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask.rs ================================================ //! Code generation workaround for `all()` mask horizontal reduction. //! //! Works around [LLVM bug 36702]. //! //! [LLVM bug 36702]: https://bugs.llvm.org/show_bug.cgi?id=36702 #![allow(unused_macros)] use crate::*; pub(crate) trait All: crate::marker::Sized { unsafe fn all(self) -> bool; } pub(crate) trait Any: crate::marker::Sized { unsafe fn any(self) -> bool; } #[macro_use] mod fallback_impl; cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[macro_use] mod x86; } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon)))] { #[macro_use] mod arm; } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { #[macro_use] mod aarch64; } else { #[macro_use] mod fallback; } } impl_mask_reductions!(m8x2); impl_mask_reductions!(m8x4); impl_mask_reductions!(m8x8); impl_mask_reductions!(m8x16); impl_mask_reductions!(m8x32); impl_mask_reductions!(m8x64); impl_mask_reductions!(m16x2); impl_mask_reductions!(m16x4); impl_mask_reductions!(m16x8); impl_mask_reductions!(m16x16); impl_mask_reductions!(m16x32); impl_mask_reductions!(m32x2); impl_mask_reductions!(m32x4); impl_mask_reductions!(m32x8); impl_mask_reductions!(m32x16); // FIXME: 64-bit single element vector // impl_mask_reductions!(m64x1); impl_mask_reductions!(m64x2); impl_mask_reductions!(m64x4); impl_mask_reductions!(m64x8); impl_mask_reductions!(m128x1); impl_mask_reductions!(m128x2); impl_mask_reductions!(m128x4); impl_mask_reductions!(msizex2); impl_mask_reductions!(msizex4); impl_mask_reductions!(msizex8); ================================================ FILE: src/codegen/reductions.rs ================================================ pub(crate) mod mask; ================================================ FILE: src/codegen/shuffle.rs ================================================ //! 
Implementations of the `ShuffleResult` trait for the different numbers of //! lanes and vector element types. use crate::masks::*; use crate::sealed::{Seal, Shuffle}; macro_rules! impl_shuffle { ($array:ty, $base:ty, $out:ty) => { impl Seal<$array> for $base {} impl Shuffle<$array> for $base { type Output = $out; } }; } impl_shuffle! { [u32; 2], i8, crate::codegen::i8x2 } impl_shuffle! { [u32; 4], i8, crate::codegen::i8x4 } impl_shuffle! { [u32; 8], i8, crate::codegen::i8x8 } impl_shuffle! { [u32; 16], i8, crate::codegen::i8x16 } impl_shuffle! { [u32; 32], i8, crate::codegen::i8x32 } impl_shuffle! { [u32; 64], i8, crate::codegen::i8x64 } impl_shuffle! { [u32; 2], u8, crate::codegen::u8x2 } impl_shuffle! { [u32; 4], u8, crate::codegen::u8x4 } impl_shuffle! { [u32; 8], u8, crate::codegen::u8x8 } impl_shuffle! { [u32; 16], u8, crate::codegen::u8x16 } impl_shuffle! { [u32; 32], u8, crate::codegen::u8x32 } impl_shuffle! { [u32; 64], u8, crate::codegen::u8x64 } impl_shuffle! { [u32; 2], m8, crate::codegen::m8x2 } impl_shuffle! { [u32; 4], m8, crate::codegen::m8x4 } impl_shuffle! { [u32; 8], m8, crate::codegen::m8x8 } impl_shuffle! { [u32; 16], m8, crate::codegen::m8x16 } impl_shuffle! { [u32; 32], m8, crate::codegen::m8x32 } impl_shuffle! { [u32; 64], m8, crate::codegen::m8x64 } impl_shuffle! { [u32; 2], i16, crate::codegen::i16x2 } impl_shuffle! { [u32; 4], i16, crate::codegen::i16x4 } impl_shuffle! { [u32; 8], i16, crate::codegen::i16x8 } impl_shuffle! { [u32; 16], i16, crate::codegen::i16x16 } impl_shuffle! { [u32; 32], i16, crate::codegen::i16x32 } impl_shuffle! { [u32; 2], u16, crate::codegen::u16x2 } impl_shuffle! { [u32; 4], u16, crate::codegen::u16x4 } impl_shuffle! { [u32; 8], u16, crate::codegen::u16x8 } impl_shuffle! { [u32; 16], u16, crate::codegen::u16x16 } impl_shuffle! { [u32; 32], u16, crate::codegen::u16x32 } impl_shuffle! { [u32; 2], m16, crate::codegen::m16x2 } impl_shuffle! { [u32; 4], m16, crate::codegen::m16x4 } impl_shuffle! { [u32; 8], m16, crate::codegen::m16x8 } impl_shuffle! { [u32; 16], m16, crate::codegen::m16x16 } impl_shuffle! { [u32; 2], i32, crate::codegen::i32x2 } impl_shuffle! { [u32; 4], i32, crate::codegen::i32x4 } impl_shuffle! { [u32; 8], i32, crate::codegen::i32x8 } impl_shuffle! { [u32; 16], i32, crate::codegen::i32x16 } impl_shuffle! { [u32; 2], u32, crate::codegen::u32x2 } impl_shuffle! { [u32; 4], u32, crate::codegen::u32x4 } impl_shuffle! { [u32; 8], u32, crate::codegen::u32x8 } impl_shuffle! { [u32; 16], u32, crate::codegen::u32x16 } impl_shuffle! { [u32; 2], f32, crate::codegen::f32x2 } impl_shuffle! { [u32; 4], f32, crate::codegen::f32x4 } impl_shuffle! { [u32; 8], f32, crate::codegen::f32x8 } impl_shuffle! { [u32; 16], f32, crate::codegen::f32x16 } impl_shuffle! { [u32; 2], m32, crate::codegen::m32x2 } impl_shuffle! { [u32; 4], m32, crate::codegen::m32x4 } impl_shuffle! { [u32; 8], m32, crate::codegen::m32x8 } impl_shuffle! { [u32; 16], m32, crate::codegen::m32x16 } /* FIXME: 64-bit single element vector impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], i64, crate::codegen::i64x2 } impl_shuffle! { [u32; 4], i64, crate::codegen::i64x4 } impl_shuffle! { [u32; 8], i64, crate::codegen::i64x8 } /* FIXME: 64-bit single element vector impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], u64, crate::codegen::u64x2 } impl_shuffle! { [u32; 4], u64, crate::codegen::u64x4 } impl_shuffle! { [u32; 8], u64, crate::codegen::u64x8 } /* FIXME: 64-bit single element vector impl_shuffle! 
{ [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], f64, crate::codegen::f64x2 } impl_shuffle! { [u32; 4], f64, crate::codegen::f64x4 } impl_shuffle! { [u32; 8], f64, crate::codegen::f64x8 } /* FIXME: 64-bit single element vector impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], m64, crate::codegen::m64x2 } impl_shuffle! { [u32; 4], m64, crate::codegen::m64x4 } impl_shuffle! { [u32; 8], m64, crate::codegen::m64x8 } impl_shuffle! { [u32; 2], isize, crate::codegen::isizex2 } impl_shuffle! { [u32; 4], isize, crate::codegen::isizex4 } impl_shuffle! { [u32; 8], isize, crate::codegen::isizex8 } impl_shuffle! { [u32; 2], usize, crate::codegen::usizex2 } impl_shuffle! { [u32; 4], usize, crate::codegen::usizex4 } impl_shuffle! { [u32; 8], usize, crate::codegen::usizex8 } impl_shuffle! { [u32; 2], msize, crate::codegen::msizex2 } impl_shuffle! { [u32; 4], msize, crate::codegen::msizex4 } impl_shuffle! { [u32; 8], msize, crate::codegen::msizex8 } impl<T> Seal<[u32; 2]> for *const T {} impl<T> Shuffle<[u32; 2]> for *const T { type Output = crate::codegen::cptrx2; } impl<T> Seal<[u32; 4]> for *const T {} impl<T> Shuffle<[u32; 4]> for *const T { type Output = crate::codegen::cptrx4; } impl<T> Seal<[u32; 8]> for *const T {} impl<T> Shuffle<[u32; 8]> for *const T { type Output = crate::codegen::cptrx8; } impl<T> Seal<[u32; 2]> for *mut T {} impl<T> Shuffle<[u32; 2]> for *mut T { type Output = crate::codegen::mptrx2; } impl<T> Seal<[u32; 4]> for *mut T {} impl<T> Shuffle<[u32; 4]> for *mut T { type Output = crate::codegen::mptrx4; } impl<T> Seal<[u32; 8]> for *mut T {} impl<T> Shuffle<[u32; 8]> for *mut T { type Output = crate::codegen::mptrx8; } impl_shuffle! { [u32; 1], i128, crate::codegen::i128x1 } impl_shuffle! { [u32; 2], i128, crate::codegen::i128x2 } impl_shuffle! { [u32; 4], i128, crate::codegen::i128x4 } impl_shuffle! { [u32; 1], u128, crate::codegen::u128x1 } impl_shuffle! { [u32; 2], u128, crate::codegen::u128x2 } impl_shuffle! { [u32; 4], u128, crate::codegen::u128x4 } impl_shuffle! { [u32; 1], m128, crate::codegen::m128x1 } impl_shuffle! { [u32; 2], m128, crate::codegen::m128x2 } impl_shuffle! { [u32; 4], m128, crate::codegen::m128x4 } ================================================ FILE: src/codegen/shuffle1_dyn.rs ================================================ //! Shuffle vector lanes with run-time indices. use crate::*; pub trait Shuffle1Dyn { type Indices; fn shuffle1_dyn(self, _: Self::Indices) -> Self; } // Fallback implementation macro_rules! impl_fallback { ($id:ident) => { impl Shuffle1Dyn for $id { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let mut result = Self::splat(0); for i in 0..$id::lanes() { result = result.replace(i, self.extract(indices.extract(i) as usize)); } result } } }; } macro_rules! impl_shuffle1_dyn { (u8x8) => { cfg_if! { if #[cfg(all( any( all(target_arch = "aarch64", target_feature = "neon"), all(target_arch = "doesnotexist", target_feature = "v7", target_feature = "neon") ), any(feature = "core_arch", libcore_neon) ) )] { impl Shuffle1Dyn for u8x8 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::vtbl1_u8; #[cfg(target_arch = "doesnotexist")] use crate::arch::arm::vtbl1_u8; // This is safe because the binary is compiled with // neon enabled at compile-time and can therefore only // run on CPUs that have it enabled.
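// `vtbl1_u8` is the NEON byte table look-up: conceptually,
// `result[i] = if indices[i] < 8 { self[indices[i] as usize] } else { 0 }`,
// which agrees with the scalar `impl_fallback!` behaviour for in-range indices.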
unsafe { Simd(mem::transmute( vtbl1_u8(mem::transmute(self.0), crate::mem::transmute(indices.0)) )) } } } } else { impl_fallback!(u8x8); } } }; (u8x16) => { cfg_if! { if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "ssse3"))] { impl Shuffle1Dyn for u8x16 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_shuffle_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_shuffle_epi8; // This is safe because the binary is compiled with // ssse3 enabled at compile-time and can therefore only // run on CPUs that have it enabled. unsafe { Simd(mem::transmute( _mm_shuffle_epi8(mem::transmute(self.0), crate::mem::transmute(indices)) )) } } } } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", any(feature = "core_arch", libcore_neon)))] { impl Shuffle1Dyn for u8x16 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { use crate::arch::aarch64::vqtbl1q_u8; // This is safe because the binary is compiled with // neon enabled at compile-time and can therefore only // run on CPUs that have it enabled. unsafe { Simd(mem::transmute( vqtbl1q_u8(mem::transmute(self.0), crate::mem::transmute(indices.0)) )) } } } } else if #[cfg(all(target_arch = "doesnotexist", target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon)))] { impl Shuffle1Dyn for u8x16 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { use crate::arch::arm::vtbl2_u8; // This is safe because the binary is compiled with // neon enabled at compile-time and can therefore only // run on CPUs that have it enabled. unsafe { union U { j: u8x16, s: (u8x8, u8x8), } let (i0, i1) = U { j: y }.s; let r0 = vtbl2_u8( mem::transmute(x), crate::mem::transmute(i0) ); let r1 = vtbl2_u8( mem::transmute(x), crate::mem::transmute(i1) ); let r = U { s: (r0, r1) }.j; Simd(mem::transmute(r)) } } } } else { impl_fallback!(u8x16); } } }; (u16x8) => { impl Shuffle1Dyn for u16x8 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let indices: u8x8 = (indices * 2).cast(); let indices: u8x16 = shuffle!(indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]); let v = u8x16::new(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); let indices = indices + v; unsafe { let s: u8x16 = crate::mem::transmute(self); crate::mem::transmute(s.shuffle1_dyn(indices)) } } } }; (u32x4) => { cfg_if! { if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))] { impl Shuffle1Dyn for u32x4 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "x86")] use crate::arch::x86::{_mm_permutevar_ps}; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::{_mm_permutevar_ps}; unsafe { crate::mem::transmute( _mm_permutevar_ps( crate::mem::transmute(self.0), crate::mem::transmute(indices.0) ) ) } } } } else { impl Shuffle1Dyn for u32x4 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let indices: u8x4 = (indices * 4).cast(); let indices: u8x16 = shuffle!( indices, [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] ); let v = u8x16::new( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ); let indices = indices + v; unsafe { let s: u8x16 =crate::mem::transmute(self); crate::mem::transmute(s.shuffle1_dyn(indices)) } } } } } }; (u64x2) => { cfg_if! 
{ if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))] { impl Shuffle1Dyn for u64x2 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "x86")] use crate::arch::x86::{_mm_permutevar_pd}; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::{_mm_permutevar_pd}; // _mm_permutevar_pd uses the _second_ bit of each // element to perform the selection, that is: 0b00 => 0, // 0b10 => 1: let indices = indices << 1; unsafe { crate::mem::transmute( _mm_permutevar_pd( crate::mem::transmute(self), crate::mem::transmute(indices) ) ) } } } } else { impl Shuffle1Dyn for u64x2 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let indices: u8x2 = (indices * 8).cast(); let indices: u8x16 = shuffle!( indices, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ); let v = u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 ); let indices = indices + v; unsafe { let s: u8x16 =crate::mem::transmute(self); crate::mem::transmute(s.shuffle1_dyn(indices)) } } } } } }; (u128x1) => { impl Shuffle1Dyn for u128x1 { type Indices = Self; #[inline] fn shuffle1_dyn(self, _indices: Self::Indices) -> Self { self } } }; ($id:ident) => { impl_fallback!($id); }; } impl_shuffle1_dyn!(u8x2); impl_shuffle1_dyn!(u8x4); impl_shuffle1_dyn!(u8x8); impl_shuffle1_dyn!(u8x16); impl_shuffle1_dyn!(u8x32); impl_shuffle1_dyn!(u8x64); impl_shuffle1_dyn!(u16x2); impl_shuffle1_dyn!(u16x4); impl_shuffle1_dyn!(u16x8); impl_shuffle1_dyn!(u16x16); impl_shuffle1_dyn!(u16x32); impl_shuffle1_dyn!(u32x2); impl_shuffle1_dyn!(u32x4); impl_shuffle1_dyn!(u32x8); impl_shuffle1_dyn!(u32x16); impl_shuffle1_dyn!(u64x2); impl_shuffle1_dyn!(u64x4); impl_shuffle1_dyn!(u64x8); impl_shuffle1_dyn!(usizex2); impl_shuffle1_dyn!(usizex4); impl_shuffle1_dyn!(usizex8); impl_shuffle1_dyn!(u128x1); impl_shuffle1_dyn!(u128x2); impl_shuffle1_dyn!(u128x4); // Implementation for non-unsigned vector types macro_rules! 
impl_shuffle1_dyn_non_u { ($id:ident, $uid:ident) => { impl Shuffle1Dyn for $id { type Indices = $uid; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { unsafe { let u: $uid = crate::mem::transmute(self); crate::mem::transmute(u.shuffle1_dyn(indices)) } } } }; } impl_shuffle1_dyn_non_u!(i8x2, u8x2); impl_shuffle1_dyn_non_u!(i8x4, u8x4); impl_shuffle1_dyn_non_u!(i8x8, u8x8); impl_shuffle1_dyn_non_u!(i8x16, u8x16); impl_shuffle1_dyn_non_u!(i8x32, u8x32); impl_shuffle1_dyn_non_u!(i8x64, u8x64); impl_shuffle1_dyn_non_u!(i16x2, u16x2); impl_shuffle1_dyn_non_u!(i16x4, u16x4); impl_shuffle1_dyn_non_u!(i16x8, u16x8); impl_shuffle1_dyn_non_u!(i16x16, u16x16); impl_shuffle1_dyn_non_u!(i16x32, u16x32); impl_shuffle1_dyn_non_u!(i32x2, u32x2); impl_shuffle1_dyn_non_u!(i32x4, u32x4); impl_shuffle1_dyn_non_u!(i32x8, u32x8); impl_shuffle1_dyn_non_u!(i32x16, u32x16); impl_shuffle1_dyn_non_u!(i64x2, u64x2); impl_shuffle1_dyn_non_u!(i64x4, u64x4); impl_shuffle1_dyn_non_u!(i64x8, u64x8); impl_shuffle1_dyn_non_u!(isizex2, usizex2); impl_shuffle1_dyn_non_u!(isizex4, usizex4); impl_shuffle1_dyn_non_u!(isizex8, usizex8); impl_shuffle1_dyn_non_u!(i128x1, u128x1); impl_shuffle1_dyn_non_u!(i128x2, u128x2); impl_shuffle1_dyn_non_u!(i128x4, u128x4); impl_shuffle1_dyn_non_u!(m8x2, u8x2); impl_shuffle1_dyn_non_u!(m8x4, u8x4); impl_shuffle1_dyn_non_u!(m8x8, u8x8); impl_shuffle1_dyn_non_u!(m8x16, u8x16); impl_shuffle1_dyn_non_u!(m8x32, u8x32); impl_shuffle1_dyn_non_u!(m8x64, u8x64); impl_shuffle1_dyn_non_u!(m16x2, u16x2); impl_shuffle1_dyn_non_u!(m16x4, u16x4); impl_shuffle1_dyn_non_u!(m16x8, u16x8); impl_shuffle1_dyn_non_u!(m16x16, u16x16); impl_shuffle1_dyn_non_u!(m16x32, u16x32); impl_shuffle1_dyn_non_u!(m32x2, u32x2); impl_shuffle1_dyn_non_u!(m32x4, u32x4); impl_shuffle1_dyn_non_u!(m32x8, u32x8); impl_shuffle1_dyn_non_u!(m32x16, u32x16); impl_shuffle1_dyn_non_u!(m64x2, u64x2); impl_shuffle1_dyn_non_u!(m64x4, u64x4); impl_shuffle1_dyn_non_u!(m64x8, u64x8); impl_shuffle1_dyn_non_u!(msizex2, usizex2); impl_shuffle1_dyn_non_u!(msizex4, usizex4); impl_shuffle1_dyn_non_u!(msizex8, usizex8); impl_shuffle1_dyn_non_u!(m128x1, u128x1); impl_shuffle1_dyn_non_u!(m128x2, u128x2); impl_shuffle1_dyn_non_u!(m128x4, u128x4); impl_shuffle1_dyn_non_u!(f32x2, u32x2); impl_shuffle1_dyn_non_u!(f32x4, u32x4); impl_shuffle1_dyn_non_u!(f32x8, u32x8); impl_shuffle1_dyn_non_u!(f32x16, u32x16); impl_shuffle1_dyn_non_u!(f64x2, u64x2); impl_shuffle1_dyn_non_u!(f64x4, u64x4); impl_shuffle1_dyn_non_u!(f64x8, u64x8); // Implementation for non-unsigned vector types macro_rules! impl_shuffle1_dyn_ptr { ($id:ident, $uid:ident) => { impl Shuffle1Dyn for $id { type Indices = $uid; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { unsafe { let u: $uid = crate::mem::transmute(self); crate::mem::transmute(u.shuffle1_dyn(indices)) } } } }; } impl_shuffle1_dyn_ptr!(cptrx2, usizex2); impl_shuffle1_dyn_ptr!(cptrx4, usizex4); impl_shuffle1_dyn_ptr!(cptrx8, usizex8); impl_shuffle1_dyn_ptr!(mptrx2, usizex2); impl_shuffle1_dyn_ptr!(mptrx4, usizex4); impl_shuffle1_dyn_ptr!(mptrx8, usizex8); ================================================ FILE: src/codegen/swap_bytes.rs ================================================ //! Horizontal swap bytes reductions. // FIXME: investigate using `llvm.bswap` // https://github.com/rust-lang-nursery/packed_simd/issues/19 use crate::*; pub(crate) trait SwapBytes { fn swap_bytes(self) -> Self; } macro_rules! 
impl_swap_bytes { (v16: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] fn swap_bytes(self) -> Self { shuffle!(self, [1, 0]) } } )+ }; (v32: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x4 = crate::mem::transmute(self); let result: u8x4 = shuffle!(bytes, [3, 2, 1, 0]); crate::mem::transmute(result) } } } )+ }; (v64: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x8 = crate::mem::transmute(self); let result: u8x8 = shuffle!( bytes, [7, 6, 5, 4, 3, 2, 1, 0] ); crate::mem::transmute(result) } } } )+ }; (v128: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x16 = crate::mem::transmute(self); let result: u8x16 = shuffle!(bytes, [ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]); crate::mem::transmute(result) } } } )+ }; (v256: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x32 = crate::mem::transmute(self); let result: u8x32 = shuffle!(bytes, [ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]); crate::mem::transmute(result) } } } )+ }; (v512: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x64 = crate::mem::transmute(self); let result: u8x64 = shuffle!(bytes, [ 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]); crate::mem::transmute(result) } } } )+ }; } impl_swap_bytes!(v16: u8x2, i8x2,); impl_swap_bytes!(v32: u8x4, i8x4, u16x2, i16x2,); // FIXME: 64-bit single element vector impl_swap_bytes!(v64: u8x8, i8x8, u16x4, i16x4, u32x2, i32x2 /* u64x1, i64x1, */,); impl_swap_bytes!(v128: u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, u128x1, i128x1,); impl_swap_bytes!(v256: u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,); impl_swap_bytes!(v512: u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,); cfg_if! { if #[cfg(target_pointer_width = "8")] { impl_swap_bytes!(v16: isizex2, usizex2,); impl_swap_bytes!(v32: isizex4, usizex4,); impl_swap_bytes!(v64: isizex8, usizex8,); } else if #[cfg(target_pointer_width = "16")] { impl_swap_bytes!(v32: isizex2, usizex2,); impl_swap_bytes!(v64: isizex4, usizex4,); impl_swap_bytes!(v128: isizex8, usizex8,); } else if #[cfg(target_pointer_width = "32")] { impl_swap_bytes!(v64: isizex2, usizex2,); impl_swap_bytes!(v128: isizex4, usizex4,); impl_swap_bytes!(v256: isizex8, usizex8,); } else if #[cfg(target_pointer_width = "64")] { impl_swap_bytes!(v128: isizex2, usizex2,); impl_swap_bytes!(v256: isizex4, usizex4,); impl_swap_bytes!(v512: isizex8, usizex8,); } else { compile_error!("unsupported target_pointer_width"); } } ================================================ FILE: src/codegen/v128.rs ================================================ //! 
Internal 128-bit wide vector types use crate::masks::*; #[rustfmt::skip] impl_simd_array!( [i8; 16]: i8x16 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [u8; 16]: u8x16 | u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 ); #[rustfmt::skip] impl_simd_array!( [m8; 16]: m8x16 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); impl_simd_array!([i16; 8]: i16x8 | i16, i16, i16, i16, i16, i16, i16, i16); impl_simd_array!([u16; 8]: u16x8 | u16, u16, u16, u16, u16, u16, u16, u16); impl_simd_array!([m16; 8]: m16x8 | i16, i16, i16, i16, i16, i16, i16, i16); impl_simd_array!([i32; 4]: i32x4 | i32, i32, i32, i32); impl_simd_array!([u32; 4]: u32x4 | u32, u32, u32, u32); impl_simd_array!([f32; 4]: f32x4 | f32, f32, f32, f32); impl_simd_array!([m32; 4]: m32x4 | i32, i32, i32, i32); impl_simd_array!([i64; 2]: i64x2 | i64, i64); impl_simd_array!([u64; 2]: u64x2 | u64, u64); impl_simd_array!([f64; 2]: f64x2 | f64, f64); impl_simd_array!([m64; 2]: m64x2 | i64, i64); impl_simd_array!([i128; 1]: i128x1 | i128); impl_simd_array!([u128; 1]: u128x1 | u128); impl_simd_array!([m128; 1]: m128x1 | i128); ================================================ FILE: src/codegen/v16.rs ================================================ //! Internal 16-bit wide vector types use crate::masks::*; impl_simd_array!([i8; 2]: i8x2 | i8, i8); impl_simd_array!([u8; 2]: u8x2 | u8, u8); impl_simd_array!([m8; 2]: m8x2 | i8, i8); ================================================ FILE: src/codegen/v256.rs ================================================ //! Internal 256-bit wide vector types use crate::masks::*; #[rustfmt::skip] impl_simd_array!( [i8; 32]: i8x32 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [u8; 32]: u8x32 | u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 ); #[rustfmt::skip] impl_simd_array!( [m8; 32]: m8x32 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [i16; 16]: i16x16 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); #[rustfmt::skip] impl_simd_array!( [u16; 16]: u16x16 | u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16 ); #[rustfmt::skip] impl_simd_array!( [m16; 16]: m16x16 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); impl_simd_array!([i32; 8]: i32x8 | i32, i32, i32, i32, i32, i32, i32, i32); impl_simd_array!([u32; 8]: u32x8 | u32, u32, u32, u32, u32, u32, u32, u32); impl_simd_array!([f32; 8]: f32x8 | f32, f32, f32, f32, f32, f32, f32, f32); impl_simd_array!([m32; 8]: m32x8 | i32, i32, i32, i32, i32, i32, i32, i32); impl_simd_array!([i64; 4]: i64x4 | i64, i64, i64, i64); impl_simd_array!([u64; 4]: u64x4 | u64, u64, u64, u64); impl_simd_array!([f64; 4]: f64x4 | f64, f64, f64, f64); impl_simd_array!([m64; 4]: m64x4 | i64, i64, i64, i64); impl_simd_array!([i128; 2]: i128x2 | i128, i128); impl_simd_array!([u128; 2]: u128x2 | u128, u128); impl_simd_array!([m128; 2]: m128x2 | i128, i128); ================================================ FILE: src/codegen/v32.rs ================================================ //! 
Internal 32-bit wide vector types use crate::masks::*; impl_simd_array!([i8; 4]: i8x4 | i8, i8, i8, i8); impl_simd_array!([u8; 4]: u8x4 | u8, u8, u8, u8); impl_simd_array!([m8; 4]: m8x4 | i8, i8, i8, i8); impl_simd_array!([i16; 2]: i16x2 | i16, i16); impl_simd_array!([u16; 2]: u16x2 | u16, u16); impl_simd_array!([m16; 2]: m16x2 | i16, i16); ================================================ FILE: src/codegen/v512.rs ================================================ //! Internal 512-bit wide vector types use crate::masks::*; #[rustfmt::skip] impl_simd_array!( [i8; 64]: i8x64 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [u8; 64]: u8x64 | u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 ); #[rustfmt::skip] impl_simd_array!( [m8; 64]: m8x64 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [i16; 32]: i16x32 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); #[rustfmt::skip] impl_simd_array!( [u16; 32]: u16x32 | u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16 ); #[rustfmt::skip] impl_simd_array!( [m16; 32]: m16x32 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); #[rustfmt::skip] impl_simd_array!( [i32; 16]: i32x16 | i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 ); #[rustfmt::skip] impl_simd_array!( [u32; 16]: u32x16 | u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32 ); #[rustfmt::skip] impl_simd_array!( [f32; 16]: f32x16 | f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32 ); #[rustfmt::skip] impl_simd_array!( [m32; 16]: m32x16 | i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 ); impl_simd_array!([i64; 8]: i64x8 | i64, i64, i64, i64, i64, i64, i64, i64); impl_simd_array!([u64; 8]: u64x8 | u64, u64, u64, u64, u64, u64, u64, u64); impl_simd_array!([f64; 8]: f64x8 | f64, f64, f64, f64, f64, f64, f64, f64); impl_simd_array!([m64; 8]: m64x8 | i64, i64, i64, i64, i64, i64, i64, i64); impl_simd_array!([i128; 4]: i128x4 | i128, i128, i128, i128); impl_simd_array!([u128; 4]: u128x4 | u128, u128, u128, u128); impl_simd_array!([m128; 4]: m128x4 | i128, i128, i128, i128); ================================================ FILE: src/codegen/v64.rs ================================================ //! 
Internal 64-bit wide vector types use crate::masks::*; impl_simd_array!([i8; 8]: i8x8 | i8, i8, i8, i8, i8, i8, i8, i8); impl_simd_array!([u8; 8]: u8x8 | u8, u8, u8, u8, u8, u8, u8, u8); impl_simd_array!([m8; 8]: m8x8 | i8, i8, i8, i8, i8, i8, i8, i8); impl_simd_array!([i16; 4]: i16x4 | i16, i16, i16, i16); impl_simd_array!([u16; 4]: u16x4 | u16, u16, u16, u16); impl_simd_array!([m16; 4]: m16x4 | i16, i16, i16, i16); impl_simd_array!([i32; 2]: i32x2 | i32, i32); impl_simd_array!([u32; 2]: u32x2 | u32, u32); impl_simd_array!([f32; 2]: f32x2 | f32, f32); impl_simd_array!([m32; 2]: m32x2 | i32, i32); impl_simd_array!([i64; 1]: i64x1 | i64); impl_simd_array!([u64; 1]: u64x1 | u64); impl_simd_array!([f64; 1]: f64x1 | f64); impl_simd_array!([m64; 1]: m64x1 | i64); ================================================ FILE: src/codegen/vPtr.rs ================================================ //! Pointer vector types macro_rules! impl_simd_ptr { ([$ptr_ty:ty; $elem_count:expr]: $tuple_id:ident | $ty:ident | $($tys:ty),*) => { #[derive(Copy, Clone)] #[repr(simd)] pub struct $tuple_id<$ty>($(pub(crate) $tys),*); //^^^^^^^ leaked through SimdArray impl<$ty> crate::sealed::Seal for [$ptr_ty; $elem_count] {} impl<$ty> crate::sealed::SimdArray for [$ptr_ty; $elem_count] { type Tuple = $tuple_id<$ptr_ty>; type T = $ptr_ty; const N: usize = $elem_count; type NT = [u32; $elem_count]; } impl<$ty> crate::sealed::Seal for $tuple_id<$ptr_ty> {} impl<$ty> crate::sealed::Simd for $tuple_id<$ptr_ty> { type Element = $ptr_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } } } impl_simd_ptr!([*const T; 2]: cptrx2 | T | T, T); impl_simd_ptr!([*const T; 4]: cptrx4 | T | T, T, T, T); impl_simd_ptr!([*const T; 8]: cptrx8 | T | T, T, T, T, T, T, T, T); impl_simd_ptr!([*mut T; 2]: mptrx2 | T | T, T); impl_simd_ptr!([*mut T; 4]: mptrx4 | T | T, T, T, T); impl_simd_ptr!([*mut T; 8]: mptrx8 | T | T, T, T, T, T, T, T, T); ================================================ FILE: src/codegen/vSize.rs ================================================ //! Vector types with pointer-sized elements use crate::codegen::pointer_sized_int::{isize_, usize_}; use crate::masks::*; impl_simd_array!([isize; 2]: isizex2 | isize_, isize_); impl_simd_array!([usize; 2]: usizex2 | usize_, usize_); impl_simd_array!([msize; 2]: msizex2 | isize_, isize_); impl_simd_array!([isize; 4]: isizex4 | isize_, isize_, isize_, isize_); impl_simd_array!([usize; 4]: usizex4 | usize_, usize_, usize_, usize_); impl_simd_array!([msize; 4]: msizex4 | isize_, isize_, isize_, isize_); impl_simd_array!([isize; 8]: isizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_); impl_simd_array!([usize; 8]: usizex8 | usize_, usize_, usize_, usize_, usize_, usize_, usize_, usize_); impl_simd_array!([msize; 8]: msizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_); ================================================ FILE: src/codegen.rs ================================================ //! Code-generation utilities pub(crate) mod bit_manip; pub(crate) mod llvm; pub(crate) mod math; pub(crate) mod reductions; pub(crate) mod shuffle; pub(crate) mod shuffle1_dyn; pub(crate) mod swap_bytes; macro_rules! 
impl_simd_array { ([$elem_ty:ident; $elem_count:expr]: $tuple_id:ident | $($elem_tys:ident),*) => { #[derive(Copy, Clone)] #[repr(simd)] pub struct $tuple_id($(pub(crate) $elem_tys),*); //^^^^^^^ leaked through SimdArray impl crate::sealed::Seal for [$elem_ty; $elem_count] {} impl crate::sealed::SimdArray for [$elem_ty; $elem_count] { type Tuple = $tuple_id; type T = $elem_ty; const N: usize = $elem_count; type NT = [u32; $elem_count]; } impl crate::sealed::Seal for $tuple_id {} impl crate::sealed::Simd for $tuple_id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } } } pub(crate) mod pointer_sized_int; pub(crate) mod v16; pub(crate) use self::v16::*; pub(crate) mod v32; pub(crate) use self::v32::*; pub(crate) mod v64; pub(crate) use self::v64::*; pub(crate) mod v128; pub(crate) use self::v128::*; pub(crate) mod v256; pub(crate) use self::v256::*; pub(crate) mod v512; pub(crate) use self::v512::*; pub(crate) mod vSize; pub(crate) use self::vSize::*; pub(crate) mod vPtr; pub(crate) use self::vPtr::*; ================================================ FILE: src/lib.rs ================================================ //! # Portable packed SIMD vectors //! //! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366: //! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) . //! //! The examples available in the //! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples) //! sub-directory of the crate showcase how to use the library in practice. //! //! ## Table of contents //! //! - [Introduction](#introduction) //! - [Vector types](#vector-types) //! - [Conditional operations](#conditional-operations) //! - [Conversions](#conversions) //! - [Hardware Features](#hardware-features) //! - [Performance guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/) //! //! ## Introduction //! //! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N` //! elements of type `T` as well as many type aliases for this type: for //! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`. //! //! The operations on packed vectors are, by default, "vertical", that is, they //! are applied to each vector lane in isolation of the others: //! //! ``` //! # use packed_simd::*; //! let a = i32x4::new(1, 2, 3, 4); //! let b = i32x4::new(5, 6, 7, 8); //! assert_eq!(a + b, i32x4::new(6, 8, 10, 12)); //! ``` //! //! Many "horizontal" operations are also provided: //! //! ``` //! # use packed_simd::*; //! # let a = i32x4::new(1, 2, 3, 4); //! assert_eq!(a.wrapping_sum(), 10); //! ``` //! //! In virtually all architectures vertical operations are fast, while //! horizontal operations are, by comparison, much slower. That is, the //! most portably-efficient way of performing a reduction over a slice //! is to collect the results into a vector using vertical operations, //! and performing a single horizontal operation at the end: //! //! ``` //! # use packed_simd::*; //! fn reduce(x: &[i32]) -> i32 { //! assert_eq!(x.len() % 4, 0); //! let mut sum = i32x4::splat(0); // [0, 0, 0, 0] //! for i in (0..x.len()).step_by(4) { //! sum += i32x4::from_slice_unaligned(&x[i..]); //! } //! sum.wrapping_sum() //! } //! //! let x = [0, 1, 2, 3, 4, 5, 6, 7]; //! assert_eq!(reduce(&x), 28); //! ``` //! //! ## Vector types //! //! The vector type aliases are named according to the following scheme: //! //! > `{element_type}x{number_of_lanes} == Simd<[element_type; //! number_of_lanes]>` //! //! 
where the following element types are supported: //! //! * `i{element_width}`: signed integer //! * `u{element_width}`: unsigned integer //! * `f{element_width}`: float //! * `m{element_width}`: mask (see below) //! * `*{const,mut} T`: `const` and `mut` pointers //! //! ## Basic operations //! //! ``` //! # use packed_simd::*; //! // Sets all elements to `0`: //! let a = i32x4::splat(0); //! //! // Reads a vector from a slice: //! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5]; //! let b = i32x4::from_slice_unaligned(&arr); //! //! // Reads the 4-th element of a vector: //! assert_eq!(b.extract(3), 1); //! //! // Returns a new vector where the 4-th element is replaced with `1`: //! let a = a.replace(3, 1); //! assert_eq!(a, b); //! //! // Writes a vector to a slice: //! let a = a.replace(2, 1); //! a.write_to_slice_unaligned(&mut arr[4..]); //! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]); //! ``` //! //! ## Conditional operations //! //! One often needs to perform an operation on some lanes of the vector. Vector //! masks, like `m32x4`, allow selecting on which vector lanes an operation is //! to be performed: //! //! ``` //! # use packed_simd::*; //! let a = i32x4::new(1, 1, 2, 2); //! //! // Add `1` to the first two lanes of the vector. //! let m = m16x4::new(true, true, false, false); //! let a = m.select(a + 1, a); //! assert_eq!(a, i32x4::splat(2)); //! ``` //! //! The elements of a vector mask are either `true` or `false`. Here `true` //! means that a lane is "selected", while `false` means that a lane is not //! selected. //! //! All vector masks implement a `mask.select(a: T, b: T) -> T` method that //! works on all vectors that have the same number of lanes as the mask. The //! resulting vector contains the elements of `a` for those lanes for which the //! mask is `true`, and the elements of `b` otherwise. //! //! The example constructs a mask with the first two lanes set to `true` and //! the last two lanes set to `false`. This selects the first two lanes of `a + //! 1` and the last two lanes of `a`, producing a vector where the first two //! lanes have been incremented by `1`. //! //! > note: mask `select` can be used on vector types that have the same number //! > of lanes as the mask. The example shows this by using [`m16x4`] instead //! > of [`m32x4`]. It is _typically_ more performant to use a mask element //! > width equal to the element width of the vectors being operated upon. //! > This is, however, not true for 512-bit wide vectors when targeting //! > AVX-512, where the most efficient masks use only 1-bit per element. //! //! All vertical comparison operations return masks: //! //! ``` //! # use packed_simd::*; //! let a = i32x4::new(1, 1, 3, 3); //! let b = i32x4::new(2, 2, 0, 0); //! //! // ge: >= (Greater or Equal; see also lt, le, gt, eq, ne). //! let m = a.ge(i32x4::splat(2)); //! //! if m.any() { //! // all / any / none allow coherent control flow //! let d = m.select(a, b); //! assert_eq!(d, i32x4::new(2, 2, 3, 3)); //! } //! ``` //! //! ## Conversions //! //! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for //! vectors with the same number of lanes when the conversion is value //! preserving (same as in `std`). //! //! * **safe bitwise conversions**: The cargo feature `into_bits` provides the //! `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise //! `transmute`s when all bit patterns of the source type are valid bit //! patterns of the target type and are also implemented for the
architecture-specific vector types of `std::arch`. For example, `let x: //! u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit //! patterns are valid `u8x8` bit patterns. However, the opposite is not //! true, not all `u8x8` bit patterns are valid `m8x8` bit-patterns, so this //! operation cannot be performed safely using `x.into_bits()`; one needs to //! use `unsafe { crate::mem::transmute(x) }` for that, making sure that the //! value in the `u8x8` is a valid bit-pattern of `m8x8`. //! //! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`] //! (`x.cast()`), just like `as`: //! //! * casting integer vectors whose lane types have the same size (e.g. //! `i32xN` -> `u32xN`) is a **no-op**, //! //! * casting from a larger integer to a smaller integer (e.g. `u32xN` -> //! `u8xN`) will **truncate**, //! //! * casting from a smaller integer to a larger integer (e.g. `u8xN` -> //! `u32xN`) will: //! * **zero-extend** if the source is unsigned, or //! * **sign-extend** if the source is signed, //! //! * casting from a float to an integer will **round the float towards //! zero**, //! //! * casting from an integer to float will produce the floating point //! representation of the integer, **rounding to nearest, ties to even**, //! //! * casting from an `f32` to an `f64` is perfect and lossless, //! //! * casting from an `f64` to an `f32` **rounds to nearest, ties to even**. //! //! Numeric casts are not very "precise": sometimes lossy, sometimes value //! preserving, etc. //! //! ## Hardware Features //! //! This crate can use different hardware features based on your configured //! `RUSTFLAGS`. For example, with no configured `RUSTFLAGS`, `u64x8` on //! x86_64 will use SSE2 operations like `PCMPEQD`. If you configure //! `RUSTFLAGS='-C target-feature=+avx2,+avx'` on supported x86_64 hardware //! the same `u64x8` may use wider AVX2 operations like `VPCMPEQQ`. It is //! important for performance and for hardware support requirements that //! you choose an appropriate set of `target-feature` and `target-cpu` //! options during builds. For more information, see the [Performance //! guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/) #![feature( adt_const_params, repr_simd, rustc_attrs, platform_intrinsics, stdsimd, arm_target_feature, link_llvm_intrinsics, core_intrinsics, stmt_expr_attributes, custom_inner_attributes, )] #![allow(non_camel_case_types, non_snake_case, // FIXME: these types are unsound in C FFI already // See https://github.com/rust-lang/rust/issues/53346 improper_ctypes_definitions, incomplete_features, clippy::cast_possible_truncation, clippy::cast_lossless, clippy::cast_possible_wrap, clippy::cast_precision_loss, // TODO: manually add the `#[must_use]` attribute where appropriate clippy::must_use_candidate, // This lint is currently broken for generic code // See https://github.com/rust-lang/rust-clippy/issues/3410 clippy::use_self, clippy::wrong_self_convention, clippy::from_over_into, )] #![cfg_attr(test, feature(hashmap_internals))] #![cfg_attr(doc_cfg, feature(doc_cfg))] #![deny(rust_2018_idioms, clippy::missing_inline_in_public_items)] #![no_std] use cfg_if::cfg_if; cfg_if! 
{ if #[cfg(feature = "core_arch")] { #[allow(unused_imports)] use core_arch as arch; } else { #[allow(unused_imports)] use core::arch; } } #[cfg(all(target_arch = "wasm32", test))] use wasm_bindgen_test::*; #[allow(unused_imports)] use core::{ /* arch (handled above), */ cmp, f32, f64, fmt, hash, hint, i128, i16, i32, i64, i8, intrinsics, isize, iter, marker, mem, ops, ptr, slice, u128, u16, u32, u64, u8, usize, }; #[macro_use] mod testing; #[macro_use] mod api; mod codegen; mod sealed; pub use crate::sealed::{Mask, Shuffle, Simd as SimdVector, SimdArray}; /// Packed SIMD vector type. /// /// # Examples /// /// ``` /// # use packed_simd::Simd; /// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3); /// assert_eq!(v.extract(2), 2); /// ``` #[repr(transparent)] #[derive(Copy, Clone)] pub struct Simd<A: sealed::SimdArray>( // FIXME: this type should be private, // but it currently must be public for the // `shuffle!` macro to work: it needs to // access the internal `repr(simd)` type // to call the shuffle intrinsics. #[doc(hidden)] pub A::Tuple, ); impl<A: sealed::SimdArray> sealed::Seal for Simd<A> {} /// Wrapper over `T` implementing a lexicographical order via the `PartialOrd` /// and/or `Ord` traits. #[repr(transparent)] #[derive(Copy, Clone, Debug)] #[allow(clippy::missing_inline_in_public_items)] pub struct LexicographicallyOrdered<T>(T); mod masks; pub use self::masks::*; mod v16; pub use self::v16::*; mod v32; pub use self::v32::*; mod v64; pub use self::v64::*; mod v128; pub use self::v128::*; mod v256; pub use self::v256::*; mod v512; pub use self::v512::*; mod vSize; pub use self::vSize::*; mod vPtr; pub use self::vPtr::*; pub use self::api::cast::*; #[cfg(feature = "into_bits")] pub use self::api::into_bits::*; // Re-export the shuffle intrinsics required by the `shuffle!` macro. #[doc(hidden)] pub use self::codegen::llvm::{ __shuffle_vector16, __shuffle_vector2, __shuffle_vector32, __shuffle_vector4, __shuffle_vector64, __shuffle_vector8, }; pub(crate) mod llvm { pub(crate) use crate::codegen::llvm::*; } ================================================ FILE: src/masks.rs ================================================ //! Mask types macro_rules!
impl_mask_ty { ($id:ident : $elem_ty:ident | #[$doc:meta]) => { #[$doc] #[derive(Copy, Clone)] pub struct $id($elem_ty); impl crate::sealed::Seal for $id {} impl crate::sealed::Mask for $id { #[inline] fn test(&self) -> bool { $id::test(self) } } impl $id { /// Instantiate a mask with `value` #[inline] pub fn new(x: bool) -> Self { if x { $id(!0) } else { $id(0) } } /// Test if the mask is set #[inline] pub fn test(&self) -> bool { self.0 != 0 } } impl Default for $id { #[inline] fn default() -> Self { $id(0) } } #[allow(clippy::partialeq_ne_impl)] impl PartialEq<$id> for $id { #[inline] fn eq(&self, other: &Self) -> bool { self.0 == other.0 } #[inline] fn ne(&self, other: &Self) -> bool { self.0 != other.0 } } impl Eq for $id {} impl PartialOrd<$id> for $id { #[inline] fn partial_cmp(&self, other: &Self) -> Option<crate::cmp::Ordering> { use crate::cmp::Ordering; if self == other { Some(Ordering::Equal) } else if self.0 > other.0 { // Note: // * false = 0_i // * true == !0_i == -1_i Some(Ordering::Less) } else { Some(Ordering::Greater) } } #[inline] fn lt(&self, other: &Self) -> bool { self.0 > other.0 } #[inline] fn gt(&self, other: &Self) -> bool { self.0 < other.0 } #[inline] fn le(&self, other: &Self) -> bool { self.0 >= other.0 } #[inline] fn ge(&self, other: &Self) -> bool { self.0 <= other.0 } } impl Ord for $id { #[inline] fn cmp(&self, other: &Self) -> crate::cmp::Ordering { match self.partial_cmp(other) { Some(x) => x, None => unsafe { crate::hint::unreachable_unchecked() }, } } } impl crate::hash::Hash for $id { #[inline] fn hash<H: crate::hash::Hasher>(&self, state: &mut H) { (self.0 != 0).hash(state); } } impl crate::fmt::Debug for $id { #[inline] fn fmt(&self, fmtter: &mut crate::fmt::Formatter<'_>) -> Result<(), crate::fmt::Error> { write!(fmtter, "{}({})", stringify!($id), self.0 != 0) } } }; } impl_mask_ty!(m8: i8 | /// 8-bit wide mask. ); impl_mask_ty!(m16: i16 | /// 16-bit wide mask. ); impl_mask_ty!(m32: i32 | /// 32-bit wide mask. ); impl_mask_ty!(m64: i64 | /// 64-bit wide mask. ); impl_mask_ty!(m128: i128 | /// 128-bit wide mask. ); impl_mask_ty!(msize: isize | /// isize-wide mask. ); ================================================ FILE: src/sealed.rs ================================================ //! Sealed traits /// A sealed trait; this is logically private to the crate /// and will prevent implementations from outside the crate. pub trait Seal {} /// Trait implemented by arrays that can be SIMD types. pub trait SimdArray: Seal { /// The type of the #[repr(simd)] type. type Tuple: Copy + Clone; /// The element type of the vector. type T; /// The number of elements in the array. const N: usize; /// The type: `[u32; Self::N]`. type NT; } /// This trait is used to constrain the arguments /// and result type of the portable shuffles. #[doc(hidden)] pub trait Shuffle<Lanes>: Seal { // Lanes is a `[u32; N]` where `N` is the number of vector lanes /// The result type of the shuffle. type Output; } /// This trait is implemented by all SIMD vector types. pub trait Simd: Seal { /// Element type of the SIMD vector. type Element; /// The number of elements in the SIMD vector. const LANES: usize; /// The type: `[u32; Self::N]`. type LanesType; } /// This trait is implemented by all mask types. pub trait Mask: Seal { fn test(&self) -> bool; } ================================================ FILE: src/testing/macros.rs ================================================ //! Testing macros macro_rules!
test_if { ($cfg_tt:tt: $it:item) => { #[cfg(any( // Test everything if: // // * tests are enabled, // * no features about exclusively testing // specific vector classes are enabled all(test, not(any( test_v16, test_v32, test_v64, test_v128, test_v256, test_v512, test_none, // disables all tests ))), // Test if: // // * tests are enabled // * a particular cfg token tree returns true all(test, $cfg_tt), ))] $it }; } #[cfg(test)] #[allow(unused)] macro_rules! ref_ { ($anything:tt) => { &$anything }; } #[cfg(test)] #[allow(unused)] macro_rules! ref_mut_ { ($anything:tt) => { &mut $anything }; } ================================================ FILE: src/testing/utils.rs ================================================ //! Testing utilities #![allow(dead_code)] // FIXME: Or don't. But it's true this is a problematic comparison. #![allow(clippy::neg_cmp_op_on_partial_ord)] use crate::{cmp::PartialOrd, fmt::Debug, LexicographicallyOrdered}; /// Tests PartialOrd for `a` and `b` where `a < b` is true. pub fn test_lt(a: LexicographicallyOrdered, b: LexicographicallyOrdered) where LexicographicallyOrdered: Debug + PartialOrd, { assert!(a < b, "{:?}, {:?}", a, b); assert!(b > a, "{:?}, {:?}", a, b); assert!(!(a == b), "{:?}, {:?}", a, b); assert_ne!(a, b, "{:?}, {:?}", a, b); assert!(a <= b, "{:?}, {:?}", a, b); assert!(b >= a, "{:?}, {:?}", a, b); // The elegance of the mathematical expression of irreflexivity is more // than clippy can handle. #[allow(clippy::eq_op)] { // Irreflexivity assert!(!(a < a), "{:?}, {:?}", a, b); assert!(!(b < b), "{:?}, {:?}", a, b); assert!(!(a > a), "{:?}, {:?}", a, b); assert!(!(b > b), "{:?}, {:?}", a, b); assert!(a <= a, "{:?}, {:?}", a, b); assert!(b <= b, "{:?}, {:?}", a, b); } } /// Tests PartialOrd for `a` and `b` where `a <= b` is true. 
pub fn test_le(a: LexicographicallyOrdered, b: LexicographicallyOrdered) where LexicographicallyOrdered: Debug + PartialOrd, { assert!(a <= b, "{:?}, {:?}", a, b); assert!(b >= a, "{:?}, {:?}", a, b); assert!(a <= b, "{:?}, {:?}", a, b); assert!(b >= a, "{:?}, {:?}", a, b); if a == b { assert!(!(a < b), "{:?}, {:?}", a, b); assert!(!(b > a), "{:?}, {:?}", a, b); assert!(!(a != b), "{:?}, {:?}", a, b); } else { assert_ne!(a, b, "{:?}, {:?}", a, b); test_lt(a, b); } } /// Test PartialOrd::partial_cmp for `a` and `b` returning `Ordering` pub fn test_cmp( a: LexicographicallyOrdered, b: LexicographicallyOrdered, o: Option, ) where LexicographicallyOrdered: PartialOrd + Debug, T: Debug + crate::sealed::Simd + Copy + Clone, ::Element: Default + Copy + Clone + PartialOrd, { assert!(T::LANES <= 64, "array length in these two arrays needs updating"); let mut arr_a: [T::Element; 64] = [Default::default(); 64]; let mut arr_b: [T::Element; 64] = [Default::default(); 64]; unsafe { crate::ptr::write_unaligned(arr_a.as_mut_ptr() as *mut LexicographicallyOrdered, a) } unsafe { crate::ptr::write_unaligned(arr_b.as_mut_ptr() as *mut LexicographicallyOrdered, b) } let expected = arr_a[0..T::LANES].partial_cmp(&arr_b[0..T::LANES]); let result = a.partial_cmp(&b); assert_eq!(expected, result, "{:?}, {:?}", a, b); assert_eq!(o, result, "{:?}, {:?}", a, b); match o { Some(crate::cmp::Ordering::Less) => { test_lt(a, b); test_le(a, b); } Some(crate::cmp::Ordering::Greater) => { test_lt(b, a); test_le(b, a); } Some(crate::cmp::Ordering::Equal) => { assert!(a == b, "{:?}, {:?}", a, b); assert!(!(a != b), "{:?}, {:?}", a, b); assert!(!(a < b), "{:?}, {:?}", a, b); assert!(!(b < a), "{:?}, {:?}", a, b); assert!(!(a > b), "{:?}, {:?}", a, b); assert!(!(b > a), "{:?}, {:?}", a, b); test_le(a, b); test_le(b, a); } None => { assert!(!(a == b), "{:?}, {:?}", a, b); assert!(!(a != b), "{:?}, {:?}", a, b); assert!(!(a < b), "{:?}, {:?}", a, b); assert!(!(a > b), "{:?}, {:?}", a, b); assert!(!(b < a), "{:?}, {:?}", a, b); assert!(!(b > a), "{:?}, {:?}", a, b); assert!(!(a <= b), "{:?}, {:?}", a, b); assert!(!(b <= a), "{:?}, {:?}", a, b); assert!(!(a >= b), "{:?}, {:?}", a, b); assert!(!(b >= a), "{:?}, {:?}", a, b); } } } // Returns a tuple containing two distinct pointer values of the same type as // the element type of the Simd vector `$id`. #[allow(unused)] macro_rules! ptr_vals { ($id:ty) => { // expands to an expression #[allow(unused_unsafe)] unsafe { // all bits cleared let clear: <$id as sealed::Simd>::Element = crate::mem::zeroed(); // all bits set let set: <$id as sealed::Simd>::Element = crate::mem::transmute(-1_isize); (clear, set) } }; } ================================================ FILE: src/testing.rs ================================================ //! Testing macros and other utilities. #[macro_use] mod macros; #[cfg(test)] #[macro_use] pub(crate) mod utils; ================================================ FILE: src/v128.rs ================================================ //! 128-bit wide vector types #[rustfmt::skip] use crate::*; impl_i!([i8; 16]: i8x16, m8x16 | i8, u16 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: | /// A 128-bit vector with 16 `i8` lanes. ); impl_u!([u8; 16]: u8x16, m8x16 | u8, u16 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: | /// A 128-bit vector with 16 `u8` lanes. 
); impl_m!([m8; 16]: m8x16 | i8, u16 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: m16x16 | /// A 128-bit vector mask with 16 `m8` lanes. ); impl_i!([i16; 8]: i16x8, m16x8 | i16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8 | /// A 128-bit vector with 8 `i16` lanes. ); impl_u!([u16; 8]: u16x8, m16x8 | u16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 | From: u8x8 | /// A 128-bit vector with 8 `u16` lanes. ); impl_m!([m16; 8]: m16x8 | i16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m8x8, m32x8 | /// A 128-bit vector mask with 8 `m16` lanes. ); impl_i!([i32; 4]: i32x4, m32x4 | i32, u8 | test_v128 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4 | /// A 128-bit vector with 4 `i32` lanes. ); impl_u!([u32; 4]: u32x4, m32x4 | u32, u8 | test_v128 | x0, x1, x2, x3 | From: u8x4, u16x4 | /// A 128-bit vector with 4 `u32` lanes. ); impl_f!([f32; 4]: f32x4, m32x4 | f32 | test_v128 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4 | /// A 128-bit vector with 4 `f32` lanes. ); impl_m!([m32; 4]: m32x4 | i32, u8 | test_v128 | x0, x1, x2, x3 | From: m8x4, m16x4, m64x4 | /// A 128-bit vector mask with 4 `m32` lanes. ); impl_i!([i64; 2]: i64x2, m64x2 | i64, u8 | test_v128 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2 | /// A 128-bit vector with 2 `i64` lanes. ); impl_u!([u64; 2]: u64x2, m64x2 | u64, u8 | test_v128 | x0, x1 | From: u8x2, u16x2, u32x2 | /// A 128-bit vector with 2 `u64` lanes. ); impl_f!([f64; 2]: f64x2, m64x2 | f64 | test_v128 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2, f32x2 | /// A 128-bit vector with 2 `f64` lanes. ); impl_m!([m64; 2]: m64x2 | i64, u8 | test_v128 | x0, x1 | From: m8x2, m16x2, m32x2, m128x2 | /// A 128-bit vector mask with 2 `m64` lanes. ); impl_i!([i128; 1]: i128x1, m128x1 | i128, u8 | test_v128 | x0 | From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1, i64x1, u64x1 */ | // FIXME: unary small vector types /// A 128-bit vector with 1 `i128` lane. ); impl_u!([u128; 1]: u128x1, m128x1 | u128, u8 | test_v128 | x0 | From: /*u8x1, u16x1, u32x1, u64x1 */ | // FIXME: unary small vector types /// A 128-bit vector with 1 `u128` lane. ); impl_m!([m128; 1]: m128x1 | i128, u8 | test_v128 | x0 | From: /*m8x1, m16x1, m32x1, m64x1 */ | // FIXME: unary small vector types /// A 128-bit vector mask with 1 `m128` lane. ); ================================================ FILE: src/v16.rs ================================================ //! 16-bit wide vector types use crate::*; impl_i!([i8; 2]: i8x2, m8x2 | i8, u8 | test_v16 | x0, x1 | From: | /// A 16-bit vector with 2 `i8` lanes. ); impl_u!([u8; 2]: u8x2, m8x2 | u8, u8 | test_v16 | x0, x1 | From: | /// A 16-bit vector with 2 `u8` lanes. ); impl_m!([m8; 2]: m8x2 | i8, u8 | test_v16 | x0, x1 | From: m16x2, m32x2, m64x2, m128x2 | /// A 16-bit vector mask with 2 `m8` lanes. ); ================================================ FILE: src/v256.rs ================================================ //! 256-bit wide vector types #[rustfmt::skip] use crate::*; impl_i!([i8; 32]: i8x32, m8x32 | i8, u32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: | /// A 256-bit vector with 32 `i8` lanes. 
); impl_u!([u8; 32]: u8x32, m8x32 | u8, u32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: | /// A 256-bit vector with 32 `u8` lanes. ); impl_m!([m8; 32]: m8x32 | i8, u32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: | /// A 256-bit vector mask with 32 `m8` lanes. ); impl_i!([i16; 16]: i16x16, m16x16 | i16, u16 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: i8x16, u8x16 | /// A 256-bit vector with 16 `i16` lanes. ); impl_u!([u16; 16]: u16x16, m16x16 | u16, u16 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: u8x16 | /// A 256-bit vector with 16 `u16` lanes. ); impl_m!([m16; 16]: m16x16 | i16, u16 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: m8x16 | /// A 256-bit vector mask with 16 `m16` lanes. ); impl_i!([i32; 8]: i32x8, m32x8 | i32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8 | /// A 256-bit vector with 8 `i32` lanes. ); impl_u!([u32; 8]: u32x8, m32x8 | u32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: u8x8, u16x8 | /// A 256-bit vector with 8 `u32` lanes. ); impl_f!([f32; 8]: f32x8, m32x8 | f32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8 | /// A 256-bit vector with 8 `f32` lanes. ); impl_m!([m32; 8]: m32x8 | i32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m8x8, m16x8 | /// A 256-bit vector mask with 8 `m32` lanes. ); impl_i!([i64; 4]: i64x4, m64x4 | i64, u8 | test_v256 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4 | /// A 256-bit vector with 4 `i64` lanes. ); impl_u!([u64; 4]: u64x4, m64x4 | u64, u8 | test_v256 | x0, x1, x2, x3 | From: u8x4, u16x4, u32x4 | /// A 256-bit vector with 4 `u64` lanes. ); impl_f!([f64; 4]: f64x4, m64x4 | f64 | test_v256 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4, f32x4 | /// A 256-bit vector with 4 `f64` lanes. ); impl_m!([m64; 4]: m64x4 | i64, u8 | test_v256 | x0, x1, x2, x3 | From: m8x4, m16x4, m32x4 | /// A 256-bit vector mask with 4 `m64` lanes. ); impl_i!([i128; 2]: i128x2, m128x2 | i128, u8 | test_v256 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2, i64x2, u64x2 | /// A 256-bit vector with 2 `i128` lanes. ); impl_u!([u128; 2]: u128x2, m128x2 | u128, u8 | test_v256 | x0, x1 | From: u8x2, u16x2, u32x2, u64x2 | /// A 256-bit vector with 2 `u128` lanes. ); impl_m!([m128; 2]: m128x2 | i128, u8 | test_v256 | x0, x1 | From: m8x2, m16x2, m32x2, m64x2 | /// A 256-bit vector mask with 2 `m128` lanes. ); ================================================ FILE: src/v32.rs ================================================ //! 32-bit wide vector types use crate::*; impl_i!([i8; 4]: i8x4, m8x4 | i8, u8 | test_v32 | x0, x1, x2, x3 | From: | /// A 32-bit vector with 4 `i8` lanes. ); impl_u!([u8; 4]: u8x4, m8x4 | u8, u8 | test_v32 | x0, x1, x2, x3 | From: | /// A 32-bit vector with 4 `u8` lanes. ); impl_m!([m8; 4]: m8x4 | i8, u8 | test_v32 | x0, x1, x2, x3 | From: m16x4, m32x4, m64x4 | /// A 32-bit vector mask with 4 `m8` lanes. ); impl_i!([i16; 2]: i16x2, m16x2 | i16, u8 | test_v32 | x0, x1 | From: i8x2, u8x2 | /// A 32-bit vector with 2 `i16` lanes. 
); impl_u!([u16; 2]: u16x2, m16x2 | u16, u8 | test_v32 | x0, x1 | From: u8x2 | /// A 32-bit vector with 2 `u16` lanes. ); impl_m!([m16; 2]: m16x2 | i16, u8 | test_v32 | x0, x1 | From: m8x2, m32x2, m64x2, m128x2 | /// A 32-bit vector mask with 2 `m16` lanes. ); ================================================ FILE: src/v512.rs ================================================ //! 512-bit wide vector types #[rustfmt::skip] use crate::*; impl_i!([i8; 64]: i8x64, m8x64 | i8, u64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 | From: | /// A 512-bit vector with 64 `i8` lanes. ); impl_u!([u8; 64]: u8x64, m8x64 | u8, u64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 | From: | /// A 512-bit vector with 64 `u8` lanes. ); impl_m!([m8; 64]: m8x64 | i8, u64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 | From: | /// A 512-bit vector mask with 64 `m8` lanes. ); impl_i!([i16; 32]: i16x32, m16x32 | i16, u32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: i8x32, u8x32 | /// A 512-bit vector with 32 `i16` lanes. ); impl_u!([u16; 32]: u16x32, m16x32 | u16, u32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: u8x32 | /// A 512-bit vector with 32 `u16` lanes. ); impl_m!([m16; 32]: m16x32 | i16, u32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: m8x32 | /// A 512-bit vector mask with 32 `m16` lanes. ); impl_i!([i32; 16]: i32x16, m32x16 | i32, u16 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: i8x16, u8x16, i16x16, u16x16 | /// A 512-bit vector with 16 `i32` lanes. ); impl_u!([u32; 16]: u32x16, m32x16 | u32, u16 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: u8x16, u16x16 | /// A 512-bit vector with 16 `u32` lanes. ); impl_f!([f32; 16]: f32x16, m32x16 | f32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: i8x16, u8x16, i16x16, u16x16 | /// A 512-bit vector with 16 `f32` lanes. ); impl_m!([m32; 16]: m32x16 | i32, u16 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: m8x16, m16x16 | /// A 512-bit vector mask with 16 `m32` lanes. ); impl_i!([i64; 8]: i64x8, m64x8 | i64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8, i32x8, u32x8 | /// A 512-bit vector with 8 `i64` lanes. 
); impl_u!([u64; 8]: u64x8, m64x8 | u64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: u8x8, u16x8, u32x8 | /// A 512-bit vector with 8 `u64` lanes. ); impl_f!([f64; 8]: f64x8, m64x8 | f64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8, i32x8, u32x8, f32x8 | /// A 512-bit vector with 8 `f64` lanes. ); impl_m!([m64; 8]: m64x8 | i64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m8x8, m16x8, m32x8 | /// A 512-bit vector mask with 8 `m64` lanes. ); impl_i!([i128; 4]: i128x4, m128x4 | i128, u8 | test_v512 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4, i64x4, u64x4 | /// A 512-bit vector with 4 `i128` lanes. ); impl_u!([u128; 4]: u128x4, m128x4 | u128, u8 | test_v512 | x0, x1, x2, x3 | From: u8x4, u16x4, u32x4, u64x4 | /// A 512-bit vector with 4 `u128` lanes. ); impl_m!([m128; 4]: m128x4 | i128, u8 | test_v512 | x0, x1, x2, x3 | From: m8x4, m16x4, m32x4, m64x4 | /// A 512-bit vector mask with 4 `m128` lanes. ); ================================================ FILE: src/v64.rs ================================================ //! 64-bit wide vector types #[rustfmt::skip] use super::*; impl_i!([i8; 8]: i8x8, m8x8 | i8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A 64-bit vector with 8 `i8` lanes. ); impl_u!([u8; 8]: u8x8, m8x8 | u8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A 64-bit vector with 8 `u8` lanes. ); impl_m!([m8; 8]: m8x8 | i8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m16x8, m32x8 | /// A 64-bit vector mask with 8 `m8` lanes. ); impl_i!([i16; 4]: i16x4, m16x4 | i16, u8 | test_v64 | x0, x1, x2, x3 | From: i8x4, u8x4 | /// A 64-bit vector with 4 `i16` lanes. ); impl_u!([u16; 4]: u16x4, m16x4 | u16, u8 | test_v64 | x0, x1, x2, x3 | From: u8x4 | /// A 64-bit vector with 4 `u16` lanes. ); impl_m!([m16; 4]: m16x4 | i16, u8 | test_v64 | x0, x1, x2, x3 | From: m8x4, m32x4, m64x4 | /// A 64-bit vector mask with 4 `m16` lanes. ); impl_i!([i32; 2]: i32x2, m32x2 | i32, u8 | test_v64 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2 | /// A 64-bit vector with 2 `i32` lanes. ); impl_u!([u32; 2]: u32x2, m32x2 | u32, u8 | test_v64 | x0, x1 | From: u8x2, u16x2 | /// A 64-bit vector with 2 `u32` lanes. ); impl_m!([m32; 2]: m32x2 | i32, u8 | test_v64 | x0, x1 | From: m8x2, m16x2, m64x2, m128x2 | /// A 64-bit vector mask with 2 `m32` lanes. ); impl_f!([f32; 2]: f32x2, m32x2 | f32 | test_v64 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2 | /// A 64-bit vector with 2 `f32` lanes. ); /* impl_i!([i64; 1]: i64x1, m64x1 | i64, u8 | test_v64 | x0 | From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1*/ | // FIXME: primitive to vector conversion /// A 64-bit vector with 1 `i64` lanes. ); impl_u!([u64; 1]: u64x1, m64x1 | u64, u8 | test_v64 | x0 | From: /*u8x1, u16x1, u32x1*/ | // FIXME: primitive to vector conversion /// A 64-bit vector with 1 `u64` lanes. ); impl_m!([m64; 1]: m64x1 | i64, u8 | test_v64 | x0 | From: /*m8x1, m16x1, m32x1, */ m128x1 | // FIXME: unary small vector types /// A 64-bit vector mask with 1 `m64` lanes. ); impl_f!([f64; 1]: f64x1, m64x1 | f64 | test_v64 | x0 | From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1, f32x1*/ | // FIXME: unary small vector types /// A 64-bit vector with 1 `f64` lanes. ); */ ================================================ FILE: src/vPtr.rs ================================================ //! 
Vectors of pointers #[rustfmt::skip] use crate::*; impl_const_p!( [*const T; 2]: cptrx2, msizex2, usizex2, isizex2 | test_v128 | x0, x1 | From: | /// A vector with 2 `*const T` lanes ); impl_mut_p!( [*mut T; 2]: mptrx2, msizex2, usizex2, isizex2 | test_v128 | x0, x1 | From: | /// A vector with 2 `*mut T` lanes ); impl_const_p!( [*const T; 4]: cptrx4, msizex4, usizex4, isizex4 | test_v256 | x0, x1, x2, x3 | From: | /// A vector with 4 `*const T` lanes ); impl_mut_p!( [*mut T; 4]: mptrx4, msizex4, usizex4, isizex4 | test_v256 | x0, x1, x2, x3 | From: | /// A vector with 4 `*mut T` lanes ); impl_const_p!( [*const T; 8]: cptrx8, msizex8, usizex8, isizex8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `*const T` lanes ); impl_mut_p!( [*mut T; 8]: mptrx8, msizex8, usizex8, isizex8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `*mut T` lanes ); ================================================ FILE: src/vSize.rs ================================================ //! Vectors with pointer-sized elements use crate::codegen::pointer_sized_int::{isize_, usize_}; use crate::*; impl_i!([isize; 2]: isizex2, msizex2 | isize_, u8 | test_v128 | x0, x1| From: | /// A vector with 2 `isize` lanes. ); impl_u!([usize; 2]: usizex2, msizex2 | usize_, u8 | test_v128 | x0, x1| From: | /// A vector with 2 `usize` lanes. ); impl_m!([msize; 2]: msizex2 | isize_, u8 | test_v128 | x0, x1 | From: | /// A vector mask with 2 `msize` lanes. ); impl_i!([isize; 4]: isizex4, msizex4 | isize_, u8 | test_v256 | x0, x1, x2, x3 | From: | /// A vector with 4 `isize` lanes. ); impl_u!([usize; 4]: usizex4, msizex4 | usize_, u8 | test_v256 | x0, x1, x2, x3| From: | /// A vector with 4 `usize` lanes. ); impl_m!([msize; 4]: msizex4 | isize_, u8 | test_v256 | x0, x1, x2, x3 | From: | /// A vector mask with 4 `msize` lanes. ); impl_i!([isize; 8]: isizex8, msizex8 | isize_, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `isize` lanes. ); impl_u!([usize; 8]: usizex8, msizex8 | usize_, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `usize` lanes. ); impl_m!([msize; 8]: msizex8 | isize_, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector mask with 8 `msize` lanes. 
); ================================================ FILE: tests/endianness.rs ================================================ #[cfg(target_arch = "wasm32")] use wasm_bindgen_test::*; use packed_simd::*; use std::{mem, slice}; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_indexing() { let v = i32x4::new(0, 1, 2, 3); assert_eq!(v.extract(0), 0); assert_eq!(v.extract(1), 1); assert_eq!(v.extract(2), 2); assert_eq!(v.extract(3), 3); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_bitcasts() { #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let t: i16x8 = unsafe { mem::transmute(x) }; let e: i16x8 = if cfg!(target_endian = "little") { i16x8::new(256, 770, 1284, 1798, 2312, 2826, 3340, 3854) } else { i16x8::new(1, 515, 1029, 1543, 2057, 2571, 3085, 3599) }; assert_eq!(t, e); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_casts() { #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let t: i16x16 = x.into(); // simd_cast #[rustfmt::skip] let e = i16x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); assert_eq!(t, e); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_load_and_stores() { #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let mut y: [i16; 8] = [0; 8]; x.write_to_slice_unaligned(unsafe { slice::from_raw_parts_mut(&mut y as *mut _ as *mut i8, 16) }); let e: [i16; 8] = if cfg!(target_endian = "little") { [256, 770, 1284, 1798, 2312, 2826, 3340, 3854] } else { [1, 515, 1029, 1543, 2057, 2571, 3085, 3599] }; assert_eq!(y, e); let z = i8x16::from_slice_unaligned(unsafe { slice::from_raw_parts(&y as *const _ as *const i8, 16) }); assert_eq!(z, x); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_array_union() { union A { data: [f32; 4], vec: f32x4, } let x: [f32; 4] = unsafe { A { vec: f32x4::new(0., 1., 2., 3.) }.data }; // As all of these are integer values within the mantissa^1 range, it // would be very unusual for them to actually fail to compare. #[allow(clippy::float_cmp)] { assert_eq!(x[0], 0_f32); assert_eq!(x[1], 1_f32); assert_eq!(x[2], 2_f32); assert_eq!(x[3], 3_f32); } let y: f32x4 = unsafe { A { data: [3., 2., 1., 0.] 
}.vec }; assert_eq!(y, f32x4::new(3., 2., 1., 0.)); union B { data: [i8; 16], vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: [i8; 16] = unsafe { B { vec: x }.data }; for (i, v) in x.iter().enumerate() { assert_eq!(i as i8, *v); } #[rustfmt::skip] let y = [ 15, 14, 13, 12, 11, 19, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]; #[rustfmt::skip] let e = i8x16::new( 15, 14, 13, 12, 11, 19, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); let z = unsafe { B { data: y }.vec }; assert_eq!(z, e); union C { data: [i16; 8], vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: [i16; 8] = unsafe { C { vec: x }.data }; let e: [i16; 8] = if cfg!(target_endian = "little") { [256, 770, 1284, 1798, 2312, 2826, 3340, 3854] } else { [1, 515, 1029, 1543, 2057, 2571, 3085, 3599] }; assert_eq!(x, e); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_tuple_access() { type F32x4T = (f32, f32, f32, f32); union A { data: F32x4T, vec: f32x4, } let x: F32x4T = unsafe { A { vec: f32x4::new(0., 1., 2., 3.) }.data }; // As all of these are integer values within the mantissa^1 range, it // would be very unusual for them to actually fail to compare. #[allow(clippy::float_cmp)] { assert_eq!(x.0, 0_f32); assert_eq!(x.1, 1_f32); assert_eq!(x.2, 2_f32); assert_eq!(x.3, 3_f32); } let y: f32x4 = unsafe { A { data: (3., 2., 1., 0.) }.vec }; assert_eq!(y, f32x4::new(3., 2., 1., 0.)); #[rustfmt::skip] type I8x16T = (i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8); union B { data: I8x16T, vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: I8x16T = unsafe { B { vec: x }.data }; assert_eq!(x.0, 0); assert_eq!(x.1, 1); assert_eq!(x.2, 2); assert_eq!(x.3, 3); assert_eq!(x.4, 4); assert_eq!(x.5, 5); assert_eq!(x.6, 6); assert_eq!(x.7, 7); assert_eq!(x.8, 8); assert_eq!(x.9, 9); assert_eq!(x.10, 10); assert_eq!(x.11, 11); assert_eq!(x.12, 12); assert_eq!(x.13, 13); assert_eq!(x.14, 14); assert_eq!(x.15, 15); #[rustfmt::skip] let y = ( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); let z: i8x16 = unsafe { B { data: y }.vec }; #[rustfmt::skip] let e = i8x16::new( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); assert_eq!(e, z); #[rustfmt::skip] type I16x8T = (i16, i16, i16, i16, i16, i16, i16, i16); union C { data: I16x8T, vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: I16x8T = unsafe { C { vec: x }.data }; let e: [i16; 8] = if cfg!(target_endian = "little") { [256, 770, 1284, 1798, 2312, 2826, 3340, 3854] } else { [1, 515, 1029, 1543, 2057, 2571, 3085, 3599] }; assert_eq!(x.0, e[0]); assert_eq!(x.1, e[1]); assert_eq!(x.2, e[2]); assert_eq!(x.3, e[3]); assert_eq!(x.4, e[4]); assert_eq!(x.5, e[5]); assert_eq!(x.6, e[6]); assert_eq!(x.7, e[7]); #[rustfmt::skip] #[repr(C)] #[derive(Copy ,Clone)] pub struct Tup(pub i8, pub i8, pub i16, pub i8, pub i8, pub i16, pub i8, pub i8, pub i16, pub i8, pub i8, pub i16); union D { data: Tup, vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: Tup = unsafe { D { vec: x }.data }; let e: [i16; 12] = if cfg!(target_endian = "little") { [0, 1, 770, 4, 5, 1798, 8, 9, 2826, 12, 13, 3854] } else { [0, 1, 515, 4, 5, 1543, 8, 9, 2571, 12, 13, 3599] }; assert_eq!(x.0 as i16, e[0]); assert_eq!(x.1 as i16, e[1]); assert_eq!(x.2 as 
i16, e[2]); assert_eq!(x.3 as i16, e[3]); assert_eq!(x.4 as i16, e[4]); assert_eq!(x.5 as i16, e[5]); assert_eq!(x.6 as i16, e[6]); assert_eq!(x.7 as i16, e[7]); assert_eq!(x.8 as i16, e[8]); assert_eq!(x.9 as i16, e[9]); assert_eq!(x.10 as i16, e[10]); assert_eq!(x.11 as i16, e[11]); } ================================================ FILE: verify/verify/Cargo.toml ================================================ [package] name = "verify" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dev-dependencies] stdarch-test = { git = "https://github.com/rust-lang/stdarch.git" } packed_simd = { package = "packed_simd", path = "../.." } cfg-if = "^0.1" paste = "^0.1.3" ================================================ FILE: verify/verify/readme.md ================================================ # Machine code verification ## Quick start To run the verification tests run: ``` cargo test --release ``` on this crate, optionally passing the required target features via `RUSTFLAGS`. For example, `RUSTFLAGS="-C target-feature=+avx2"`. This crate only contains tests, and the tests only run in `--release` mode. Therefore, building this crate with anything other than `cargo test --release` does not make much sense. ## How it works This crate verifies the machine code generated for some of the portable packed vector APIs by disassembling the API at run-time and comparing the generated machine code against the desired one for a particular target and set of target features. This is done by using the [`stdarch-test`](https://github.com/rust-lang/stdarch/tree/master/crates/stdarch-test) crate, which exposes the `assert_instr` procedural macro. It is used like this: ```rust // The verification functions must be #[inline]: #[inline] // Enable the target features required for the desired code generation // on the different targets: #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), target_feature(enable = "avx512f,avx512vl") )] // Check that the disassembly contains a particular instruction: #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), assert_instr(vpro) )] unsafe fn rotate_right_variable(x: u64x8, v: u64x8) -> u64x8 { x.rotate_right(v) } ``` The `assert_instr` procedural macro creates a test that contains an `#[inline(never)]` function that calls the API. It then gets a function pointer to this function, and calls `stdarch_test::assert` with it, the function name, and the expected assembly instruction. `stdarch_test` uses `objdump` or similar to disassemble itself; it then looks for the function address and name in the disassembly, and verifies that the machine code for the function contains the instruction.
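A new verification of a vertical operation follows the same pattern. The sketch below is illustrative only: the function name, the chosen vector type, and the expected `vaddps` instruction are assumptions rather than part of the existing test-suite, and, like the modules under `src/api/`, it assumes `use packed_simd::*;` and `use stdarch_test::assert_instr;` are in scope:

```rust
// Illustrative sketch: check that a vertical `f32x8` addition
// compiles down to AVX's 256-bit `vaddps` when AVX is enabled.
#[inline]
#[cfg_attr(
    any(target_arch = "x86", target_arch = "x86_64"),
    target_feature(enable = "avx")
)]
#[cfg_attr(
    any(target_arch = "x86", target_arch = "x86_64"),
    assert_instr(vaddps)
)]
unsafe fn add_f32x8(a: f32x8, b: f32x8) -> f32x8 {
    a + b
}
```

Such a test is then run like any other, e.g. `RUSTFLAGS="-C target-feature=+avx" cargo test --release`.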
================================================ FILE: verify/verify/rust-toolchain ================================================ nightly ================================================ FILE: verify/verify/src/api/math/float/mod.rs ================================================ mod mul_add; ================================================ FILE: verify/verify/src/api/math/float/mul_add.rs ================================================ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86 { mod f32x4 { #![allow(unused)] use packed_simd::*; use stdarch_test::assert_instr; #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmadd)] unsafe fn fused_multiply_add(a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a.mul_add(b, c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmsub)] unsafe fn fused_multiply_sub(a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a.mul_add(b, -c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfnmadd)] unsafe fn fused_negate_multiply_add( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { a.mul_add(-b, c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfnmsub)] unsafe fn fused_negate_multiply_sub( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { a.mul_add(-b, -c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmaddsub)] unsafe fn fused_multiply_add_sub( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { let add = a.mul_add(b, c); let sub = a.mul_add(b, -c); m32x4::new(false, true, false, true).select(add, sub) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmsubadd)] unsafe fn fused_multiply_sub_add( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { let add = a.mul_add(b, c); let sub = a.mul_add(b, -c); m32x4::new(true, false, true, false).select(add, sub) } } } ================================================ FILE: verify/verify/src/api/math.rs ================================================ mod float; ================================================ FILE: verify/verify/src/api/ops/vector_rotates/x86.rs ================================================ mod u64x8 { #![allow(unused)] use packed_simd::*; use stdarch_test::assert_instr; #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_right_variable(x: u64x8, v: u64x8) -> u64x8 { x.rotate_right(v) } #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_left_variable(x: u64x8, v: u64x8) -> u64x8 { x.rotate_left(v) } #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_right(x: u64x8) -> u64x8 { x.rotate_right(u64x8::splat(12)) } #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_left(x: u64x8) -> u64x8 { x.rotate_left(u64x8::splat(12)) } #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[assert_instr(vpro)] unsafe fn rotate_left_x2(x: u64x2) -> u64x2 { x.rotate_left(u64x2::splat(12)) } } ================================================ FILE: verify/verify/src/api/ops/vector_rotates.rs ================================================ use cfg_if::cfg_if; cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { mod x86; } } ================================================ FILE: verify/verify/src/api/ops.rs ================================================ mod vector_rotates; ================================================ FILE: verify/verify/src/api/reductions/mask/avx.rs ================================================ //! 
Verification of the mask reduction API for `x86`/`x86_64`+`SSE2` use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["avx"] => $instr); } } // 128-bit wide: verify!(m8x16 => vpmovmskb); verify!(m16x8 => vpmovmskb); verify!(m32x4 => vmovmskps); verify!(m64x2 => vmovmskpd); // FIXME: verify!(m128x1 => vmovmskpd); // 256-bit wide: verify!(m8x32 => vptest); verify!(m16x16 => vptest); verify!(m32x8 => vmovmskps); verify!(m64x4 => vmovmskpd); // FIXME: verify!(m128x2 => vmovmskpd); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask/avx2.rs ================================================ //! Verification of the mask reduction API for `x86`/`x86_64`+`SSE2` use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["avx2"] => $instr); } } // 128-bit wide: verify!(m8x16 => vpmovmskb); verify!(m16x8 => vpmovmskb); verify!(m32x4 => vmovmskps); verify!(m64x2 => vmovmskpd); // FIXME: verify!(m128x1 => vmovmskpd); // 256-bit wide: verify!(m8x32 => vpmovmskb); verify!(m16x16 => vpmovmskb); verify!(m32x8 => vmovmskps); verify!(m64x4 => vmovmskpd); // FIXME: verify!(m128x2 => vmovmskpd); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask/sse.rs ================================================ //! Verification of the mask reduction API for `x86`/`x86_64`+`SSE` #![allow(unused)] use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["sse"] => $instr); } } // 128-bit wide: verify!(m32x4 => movmskps); verify!(m64x2 => movmskps); // FIXME: verify!(m128x1 => movmskps); // 256-bit wide: verify!(m32x8 => movmskps); verify!(m64x4 => movmskps); // FIXME: verify!(m128x2 => movmskps); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask/sse2.rs ================================================ //! Verification of the mask reduction API for `x86`/`x86_64`+`SSE2` use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["sse2"] => $instr); } } // 128-bit wide: verify!(m8x16 => pmovmskb); verify!(m16x8 => pmovmskb); verify!(m32x4 => movmskps); verify!(m64x2 => movmskpd); // FIXME: verify!(m128x1 => movmskpd); // 256-bit wide: verify!(m8x32 => pmovmskb); verify!(m16x16 => pmovmskb); verify!(m32x8 => movmskps); verify!(m64x4 => movmskpd); // FIXME: verify!(m128x2 => movmskpd); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask.rs ================================================ //! Verify the mask reduction API. use cfg_if::cfg_if; #[allow(unused)] macro_rules! verify_mask { ($mask_id:ident[$target_feature:tt] => $all_instr:tt, $any_instr:tt, $none_instr:tt) => { paste::item! 
{ #[inline] #[target_feature(enable = $target_feature)] #[assert_instr($all_instr)] pub unsafe fn [<$mask_id _all>](x: $mask_id) -> bool { x.all() } #[inline] #[target_feature(enable = $target_feature)] #[assert_instr($any_instr)] pub unsafe fn [<$mask_id _any>](x: $mask_id) -> bool { x.any() } #[inline] #[target_feature(enable = $target_feature)] #[assert_instr($none_instr)] pub unsafe fn [<$mask_id _none>](x: $mask_id) -> bool { x.none() } } }; ($mask_id:ident[$target_feature:tt] => $instr:tt) => { verify_mask!($mask_id[$target_feature] => $instr, $instr, $instr); }; } cfg_if! { if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")), target_feature = "sse")] { // FIXME: avx512 #[cfg(all(not(target_feature = "avx512f"), target_feature = "avx2"))] mod avx2; #[cfg(all(not(target_feature = "avx2"), target_feature = "avx"))] mod avx; #[cfg(all(not(target_feature = "avx"), target_feature = "sse2"))] mod sse2; #[cfg(all(not(target_feature = "sse2"), target_feature = "sse"))] mod sse; } } ================================================ FILE: verify/verify/src/api/reductions.rs ================================================ mod mask; ================================================ FILE: verify/verify/src/api.rs ================================================ use cfg_if::cfg_if; cfg_if! { if #[cfg(debug_assertions)] { compile_error!("the verify tests only run in --release mode"); } } mod math; mod ops; mod reductions; ================================================ FILE: verify/verify/src/lib.rs ================================================ // FIXME: these types are unsound in C FFI already // See https://github.com/rust-lang/rust/issues/53346 #![allow(improper_ctypes_definitions)] #![deny(rust_2018_idioms)] #![cfg_attr(test, feature(avx512_target_feature, abi_vectorcall))] #[cfg(test)] mod api;