[
  {
    "path": ".appveyor.yml",
    "content": "matrix:\n  allow_failures:\n    # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/72\n    - TARGET: i686-pc-windows-msvc\n    - TARGET: i686-pc-windows-gnu\n    - TARGET: x86_64-pc-windows-gnu\n  fast_finish: true\n\nenvironment:\n  matrix:\n    - TARGET: x86_64-pc-windows-msvc\n      MSYSTEM: MINGW64\n      NOVERIFY: \"1\"\n    - TARGET: x86_64-pc-windows-msvc\n      MSYSTEM: MINGW64\n      RUSTFLAGS: \"-C target-feature=+sse4.2\"\n      NOVERIFY: \"1\"\n    - TARGET: x86_64-pc-windows-msvc\n      MSYSTEM: MINGW64\n      RUSTFLAGS: \"-C target-feature=+avx\"\n      NOVERIFY: \"1\"\n    - TARGET: x86_64-pc-windows-msvc\n      MSYSTEM: MINGW64\n      RUSTFLAGS: \"-C target-feature=+avx2\"\n      NOVERIFY: \"1\"\n\n    - TARGET: i686-pc-windows-msvc\n      MSYSTEM: MINGW32\n      NOVERIFY: \"1\"\n    - TARGET: i686-pc-windows-msvc\n      MSYSTEM: MINGW32\n      RUSTFLAGS: \"-C target-feature=+sse4.2\"\n      NOVERIFY: \"1\"\n    - TARGET: i686-pc-windows-msvc\n      MSYSTEM: MINGW32\n      RUSTFLAGS: \"-C target-feature=+avx\"\n      NOVERIFY: \"1\"\n    - TARGET: i686-pc-windows-msvc\n      MSYSTEM: MINGW32\n      RUSTFLAGS: \"-C target-feature=+avx2\"\n      NOVERIFY: \"1\"\n\n    - TARGET: x86_64-pc-windows-gnu\n      MSYSTEM: MINGW64\n\n    - TARGET: i686-pc-windows-gnu\n      MSYSTEM: MINGW32\n    - TARGET: x86_64-pc-windows-gnu\n      MSYSTEM: MINGW64\ninstall:\n  - ps: if (ls -r . -fi \"*.rs\" | sls \"`t\") { throw \"Found tab character\" }\n  - ps: Start-FileDownload \"https://static.rust-lang.org/dist/rust-nightly-${env:TARGET}.exe\" -FileName \"rust-install.exe\"\n  - ps: .\\rust-install.exe /VERYSILENT /NORESTART /DIR=\"C:\\rust\" | Out-Null\n  - ps: $env:PATH=\"$env:PATH;C:\\rust\\bin\"\n  - set PATH=c:\\msys64\\%MSYSTEM%\\bin;c:\\msys64\\usr\\bin;%PATH%\n  - rustc -vV\n  - cargo -vV\nbuild: false\ntest_script: bash -c \"ci/run.sh\"\n"
  },
  {
    "path": ".github/workflows/benchmarks.yml",
    "content": "name: benchmarks\n\non:\n  push:\n    branches:\n      - master\n  pull_request:\n  workflow_dispatch:\n\njobs:\n  x86_64-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    with:\n      target: x86_64-unknown-linux-gnu\n      setup_script: ci/setup_benchmarks.sh\n      script: ci/benchmark.sh\n      norun: 1\n      verify: 1\n      # FIXME: figure out how to add downloaded ispc to PATH\n      # features: ispc\n  x86_64-apple-darwin:\n    uses: ./.github/workflows/run-ci-script.yml\n    with:\n      target: x86_64-apple-darwin\n      runner: macos-latest\n      setup_script: ci/setup_benchmarks.sh\n      script: ci/benchmark.sh\n      norun: 1\n      verify: 1\n      # FIXME: figure out how to add downloaded ispc to PATH\n      # features: ispc\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: ci\n\n# trigger for all PRs and changes to master\non:\n  push:\n    branches:\n      - master\n  pull_request:\n\njobs:\n  rustfmt:\n    uses: ./.github/workflows/run-ci-script.yml\n    with:\n      script: ci/all.sh check_fmt || true\n  x86_64-unknown-linux-android:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: x86_64-linux-android\n  armv7-linux-androideabi:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: armv7-linux-androideabi\n  aarch64-unknown-linux-android-NEON:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: aarch64-linux-android\n      rustflags: -Ctarget-feature=+neon\n  thumbv7neon-linux-androideabi:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: thumbv7neon-linux-androideabi\n  i586-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: i586-unknown-linux-gnu\n      rustflags: -Crelocation-model=static\n  i586-unknown-linux-gnu-SSE:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: i586-unknown-linux-gnu\n      rustflags: -Crelocation-model=static -Ctarget-feature=+sse\n  i586-unknown-linux-gnu-SSE2:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: i586-unknown-linux-gnu\n      rustflags: -Crelocation-model=static -Ctarget-feature=+sse2\n  i686-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: i686-unknown-linux-gnu\n      rustflags: -Crelocation-model=static\n  i686-unknown-linux-gnu-SSE4_2:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    
with:\n      target: i686-unknown-linux-gnu\n      rustflags: -Crelocation-model=static -Ctarget-feature=+sse4.2\n  i686-unknown-linux-gnu-AVX2:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: i686-unknown-linux-gnu\n      rustflags: -Crelocation-model=static -Ctarget-feature=+avx2\n  x86_64-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: x86_64-unknown-linux-gnu\n  x86_64-unknown-linux-gnu-SSE4_2:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: x86_64-unknown-linux-gnu\n      rustflags: -Ctarget-feature=+sse4.2\n  x86_64-unknown-linux-gnu-AVX2:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: x86_64-unknown-linux-gnu\n      rustflags: -Ctarget-feature=+avx2\n  arm-unknown-linux-gnueabihf:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: arm-unknown-linux-gnueabihf\n  armv7-unknown-linux-gnueabihf:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: armv7-unknown-linux-gnueabihf\n  armv7-unknown-linux-gnueabihf-NEON:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: armv7-unknown-linux-gnueabihf\n      rustflags: -Ctarget-feature=+neon\n  thumbv7neon-unknown-linux-gnueabihf:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: thumbv7neon-unknown-linux-gnueabihf\n  aarch64-unknown-linux-gnu-NEON:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: aarch64-unknown-linux-gnu\n      rustflags: -Ctarget-feature=+neon\n  powerpc-unknown-linux-gnu:\n    uses: 
./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: powerpc-unknown-linux-gnu\n  powerpc64-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: powerpc64-unknown-linux-gnu\n  powerpc64le-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: powerpc64le-unknown-linux-gnu\n  powerpc64le-unknown-linux-gnu-ALTIVEC:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: powerpc64le-unknown-linux-gnu\n      rustflags: -Ctarget-feature=+altivec\n  powerpc64le-unknown-linux-gnu-VSX:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      target: powerpc64le-unknown-linux-gnu\n      rustflags: -Ctarget-feature=+vsx\n  s390x-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: s390x-unknown-linux-gnu\n  sparc64-unknown-linux-gnu:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: sparc64-unknown-linux-gnu\n  wasm32-unknown-unknown:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: false\n    with:\n      target: wasm32-unknown-unknown\n  x86_64-apple-darwin-SSE4_2:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      runner: macos-latest\n      script: ci/run.sh\n      target: x86_64-apple-darwin\n      rustflags: -Ctarget-feature=+sse4.2\n  x86_64-apple-darwin-AVX:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      runner: macos-latest\n      script: ci/run.sh\n      target: x86_64-apple-darwin\n      rustflags: -Ctarget-feature=+avx\n  x86_64-apple-ios:\n    uses: 
./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      runner: macos-latest\n      script: ci/run.sh\n      target: x86_64-apple-ios\n  aarch64-apple-ios:\n    uses: ./.github/workflows/run-ci-script.yml\n    strategy:\n      fail-fast: true\n    with:\n      runner: macos-latest\n      script: ci/run.sh\n      target: aarch64-apple-ios\n      rustflags: -Ctarget-feature=+neon\n"
  },
  {
    "path": ".github/workflows/docs.yml",
    "content": "name: docs\n\non:\n  push:\n    branches:\n      - master\n\njobs:\n  docs:\n    uses: ./.github/workflows/run-ci-script.yml\n    with:\n      setup_script: cargo install mdbook\n      script: ci/dox.sh\n"
  },
  {
    "path": ".github/workflows/run-ci-script.yml",
    "content": "name: run-ci-script\n\non:\n  workflow_call:\n    inputs:\n      runner:\n        required: false\n        type: string\n        default: ubuntu-latest\n      target:\n        required: false\n        type: string\n        default: ''\n      rustflags:\n        required: false\n        type: string\n        default: ''\n      script:\n        required: false\n        type: string\n        default: ci/run-docker.sh\n      setup_script:\n        required: false\n        type: string\n      norun:\n        required: false\n        type: string\n        default: ''\n      verify:\n        required: false\n        type: string\n        default: ''\n      features:\n        required: false\n        type: string\n        default: ''\n\njobs:\n  run-ci-script:\n    runs-on: ${{ inputs.runner }}\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v2\n      - name: Init Rustup Cache\n        uses: actions/cache@v2\n        with:\n          path: |\n            ~/.rustup/toolchains\n          key: ${{ runner.os }}-cargo-${{ hashFiles('**/rust-toolchain') }}\n      - name: Install Toolchain\n        uses: dtolnay/rust-toolchain@nightly\n        with:\n          # FIXME: change to nightly once https://github.com/rust-lang/packed_simd/pull/350 is merged\n          # needs to be kept in sync with the toolchain files\n          targets: ${{ inputs.target }}\n          components: rustfmt\n      - name: Generate Lockfile\n        run: cargo generate-lockfile\n      - name: Init Cargo Cache\n        uses: actions/cache@v2\n        with:\n          path: |\n            ~/.cargo/bin/\n            ~/.cargo/registry/index/\n            ~/.cargo/registry/cache/\n            ~/.cargo/git/db/\n            target/\n          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}\n      - name: Setup\n        if: ${{ inputs.setup_script != '' }}\n        run: ${{ inputs.setup_script }}\n        env:\n          TARGET: ${{ inputs.target }}\n          
RUSTFLAGS: ${{ inputs.rustflags }}\n          NORUN: ${{ inputs.norun }}\n          VERIFY: ${{ inputs.verify }}\n          FEATURES: ${{ inputs.features }}\n      - name: Run CI Script\n        timeout-minutes: 30\n        run: ${{ inputs.script }}\n        env:\n          TARGET: ${{ inputs.target }}\n          RUSTFLAGS: ${{ inputs.rustflags }}\n          NORUN: ${{ inputs.norun }}\n          VERIFY: ${{ inputs.verify }}\n          FEATURES: ${{ inputs.features }}\n"
  },
  {
    "path": ".gitignore",
    "content": "Cargo.lock\ntarget/\n\n# llvm-ir and assembly\n*.ll\n*.d\n\n# png files output by benchmarks\n*.png\n\n# -*- mode: gitignore; -*-\n*~\n\\#*\\#\n/.emacs.desktop\n/.emacs.desktop.lock\n*.elc\nauto-save-list\ntramp\n.\\#*\n\n# Org-mode\n.org-id-locations\n*_archive\n\n# flymake-mode\n*_flymake.*\n\n# eshell files\n/eshell/history\n/eshell/lastdir\n\n# elpa packages\n/elpa/\n\n# reftex files\n*.rel\n\n# AUCTeX auto folder\n/auto/\n\n# cask packages\n.cask/\ndist/\n\n# Flycheck\nflycheck_*.el\n\n# server auth directory\n/server/\n\n# projectiles files\n.projectile\n\n# directory configuration\n.dir-locals.el"
  },
  {
    "path": ".travis.yml",
    "content": "language: rust\nrust: nightly\nos: linux\ndist: focal\n\nstages:\n  - tools\n  - build-test-verify # Passes full test suite, permit no regressions (unless it's rustup :/)\n  - 32bit-tier1\n  - 64bit-tier2\n  - 32bit-tier2\n\njobs:\n  fast_finish: true\n  include:\n    # Android:\n    - env: TARGET=x86_64-linux-android\n      name: \"x86_64-unknown-linux-android + SSE2\"\n      stage: build-test-verify\n    - env: TARGET=arm-linux-androideabi\n      name: \"arm-linux-androideabi\"\n      stage: build-test-verify\n    - name: \"aarch64-unknown-linux-android + NEON\"\n      env: TARGET=aarch64-linux-android RUSTFLAGS=\"-C target-feature=+neon\"\n      stage: build-test-verify\n    - env: TARGET=\"thumbv7neon-linux-androideabi\"\n      name: \"thumbv7neon-linux-androideabi\"\n      stage: 32bit-tier2\n    # Linux:\n    - env: TARGET=i586-unknown-linux-gnu\n      name: \"i586-unknown-linux-gnu\"\n      stage: 32bit-tier2\n    - env: TARGET=i586-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+sse\"\n      name: \"i586-unknown-linux-gnu + SSE\"\n      stage: 32bit-tier2\n    - env: TARGET=i586-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+sse2\"\n      name: \"i586-unknown-linux-gnu + SSE2\"\n      stage: 32bit-tier2\n    - env: TARGET=i686-unknown-linux-gnu\n      name: \"i686-unknown-linux-gnu + SSE2\"\n      stage: 32bit-tier1\n    - env: TARGET=i686-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+sse4.2\"\n      name: \"i686-unknown-linux-gnu + SSE4.2\"\n      stage: 32bit-tier1\n    - env: TARGET=i686-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+avx2\"\n      name: \"i686-unknown-linux-gnu + AVX2\"\n      stage: 32bit-tier1\n    - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+sse4.2\"\n      name: \"x86_64-unknown-linux-gnu + SSE4.2\"\n      stage: build-test-verify\n    - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+avx2\"\n      name: \"x86_64-unknown-linux-gnu + AVX2\"\n      stage: 
build-test-verify\n    - env: TARGET=arm-unknown-linux-gnueabihf\n      name: \"arm-unknown-linux-gnueabihf\"\n      stage: build-test-verify\n    - env: TARGET=armv7-unknown-linux-gnueabihf\n      name: \"armv7-unknown-linux-gnueabihf\"\n      stage: build-test-verify\n    - env: TARGET=armv7-unknown-linux-gnueabihf RUSTFLAGS=\"-C target-feature=+neon\"\n      name: \"armv7-unknown-linux-gnueabihf + NEON\"\n      stage: build-test-verify\n    - env: TARGET=\"thumbv7neon-unknown-linux-gnueabihf\"\n      name: \"thumbv7neon-unknown-linux-gnueabihf\"\n      stage: 32bit-tier2\n    - name: \"aarch64-unknown-linux-gnu + NEON\"\n      env: TARGET=aarch64-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+neon\"\n      stage: build-test-verify\n    - env: TARGET=mips-unknown-linux-gnu\n      name: \"mips-unknown-linux-gnu\"\n      stage: 32bit-tier2\n    - env: TARGET=mipsel-unknown-linux-musl\n      name: \"mipsel-unknown-linux-musl\"\n      stage: 32bit-tier2\n    - env: TARGET=mips64-unknown-linux-gnuabi64\n      name: \"mips64-unknown-linux-gnuabi64\"\n      stage: 64bit-tier2\n    - env: TARGET=mips64el-unknown-linux-gnuabi64\n      name: \"mips64el-unknown-linux-gnuabi64\"\n      stage: 64bit-tier2\n      # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/18\n      # env: TARGET=mips64el-unknown-linux-gnuabi64 RUSTFLAGS=\"-C target-feature=+msa -C target-cpu=mips64r6\"\n    - env: TARGET=powerpc-unknown-linux-gnu\n      name: \"powerpc-unknown-linux-gnu\"\n      stage: 32bit-tier2\n    - env: TARGET=powerpc64-unknown-linux-gnu\n      name: \"powerpc64-unknown-linux-gnu\"\n      stage: 64bit-tier2\n    - name: \"powerpc64le-unknown-linux-gnu\"\n      env: TARGET=powerpc64le-unknown-linux-gnu\n      stage: build-test-verify\n    - name: \"powerpc64le-unknown-linux-gnu + ALTIVEC\"\n      env: TARGET=powerpc64le-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+altivec\"\n      stage: build-test-verify\n    - name: \"powerpc64le-unknown-linux-gnu + VSX\"\n   
   env: TARGET=powerpc64le-unknown-linux-gnu RUSTFLAGS=\"-C target-feature=+vsx\"\n      stage: build-test-verify\n    - name: \"s390x-unknown-linux-gnu\"\n      env: TARGET=s390x-unknown-linux-gnu\n      stage: 64bit-tier2\n    - env: TARGET=sparc64-unknown-linux-gnu\n      name: \"sparc64-unknown-linux-gnu\"\n      stage: 64bit-tier2\n    # WebAssembly:\n    - env: TARGET=wasm32-unknown-unknown\n      name: \"wasm32-unknown-unknown\"\n      stage: 32bit-tier2\n    # MacOSX:\n    - os: osx\n      env: TARGET=x86_64-apple-darwin RUSTFLAGS=\"-C target-feature=+sse4.2\"\n      name: \"x86_64-apple-darwin + SSE4.2\"\n      install: true\n      script: ci/run.sh\n      osx_image: xcode10\n      stage: build-test-verify\n      # Travis-CI OSX build bots do not support AVX2:\n    - os: osx\n      env: TARGET=x86_64-apple-darwin RUSTFLAGS=\"-C target-feature=+avx\"\n      name: \"x86_64-apple-darwin + AVX\"\n      install: true\n      script: ci/run.sh\n      osx_image: xcode10\n      stage: build-test-verify\n    # *BSDs:\n    #- env: TARGET=i686-unknown-freebsd NORUN=1\n    #  script: ci/run.sh\n    #- env: TARGET=x86_64-unknown-freebsd NORUN=1\n    #  script: ci/run.sh\n    #- env: TARGET=x86_64-unknown-netbsd NORUN=1\n    #  script: ci/run.sh\n    # Solaris:\n    #- env: TARGET=x86_64-sun-solaris NORUN=1\n    #  script: ci/run.sh\n    # iOS:\n    - os: osx\n      env: TARGET=x86_64-apple-ios\n      name: \"x86_64-apple-ios + SSE2\"\n      script: ci/run.sh\n      osx_image: xcode9.4\n      stage: 64bit-tier2\n    - name: \"aarch64-apple-ios + NEON\"\n      env: TARGET=aarch64-apple-ios RUSTFLAGS=\"-C target-feature=+neon\"\n      os: osx\n      osx_image: xcode9.4\n      script: ci/run.sh\n      stage: 64bit-tier2\n    # BENCHMARKS:\n    - name: \"Benchmarks - x86_64-unknown-linux-gnu\"\n      install: TARGET=x86_64-unknown-linux-gnu ./ci/setup_benchmarks.sh\n      # FIXME: Use `core_arch,sleef-sys` features once they work again\n      script: PATH=$(pwd):$PATH 
NORUN=1 VERIFY=1 FEATURES=ispc ci/benchmark.sh\n      stage: tools\n    - name: \"Benchmarks - x86_64-apple-darwin\"\n      install: TARGET=x86_64-apple-darwin ./ci/setup_benchmarks.sh\n      # FIXME: Use `core_arch,sleef-sys` features once they work again\n      script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=ispc ci/benchmark.sh\n      os: osx\n      osx_image: xcode9.4\n      stage: tools\n    # TOOLS:\n    - name: \"Documentation\"\n      before_install:\n        - sudo add-apt-repository -y ppa:deadsnakes/ppa\n        - sudo apt-get update -y\n        - sudo apt-get install -y python3.9\n      install:\n        - cargo install mdbook\n      script: ci/dox.sh\n      stage: tools\n    - name: \"rustfmt\"\n      install: true\n      script: |\n        rustup toolchain install nightly -c rustfmt --allow-downgrade\n        ci/all.sh check_fmt || true\n      stage: tools\n\n  allow_failures:\n    # FIXME: ISPC cannot be found?\n    - name: \"Benchmarks - x86_64-apple-darwin\"\n    # FIXME: i686 fails in inlining, apparently\n    - stage: 32bit-tier1\n    #- env: TARGET=i686-unknown-freebsd NORUN=1\n    #- env: TARGET=x86_64-unknown-freebsd NORUN=1\n    #- env: TARGET=x86_64-unknown-netbsd NORUN=1\n    #- env: TARGET=x86_64-sun-solaris NORUN=1\n\n    # FIXME: TBD\n    - stage: 64bit-tier2\n    - stage: 32bit-tier2\n\n    # FIXME: iOS\n    # https://github.com/rust-lang-nursery/packed_simd/issues/26\n    - env: TARGET=x86_64-apple-ios\n    # Is this related to the above? 
Mysterious test failure\n    - name: \"aarch64-apple-ios + NEON\"\n\ninstall: travis_retry rustup target add $TARGET\nbefore_script: cargo generate-lockfile\nscript: travis_wait 50 ci/run-docker.sh\nafter_script: sleep 5\n\nenv:\n  global:\n    secure: \"lPHv7s6+AxQYNaFncycVFQt++Y1asQmMhOikQU1ztlP8CK7+hn2m98cg/euOJyzIOb2iJ3ZX4cGZkzw4lc59MQBByb1GtDbazQoUOzVDbVfe9BDD2f8JVoIFh1CMfjPKQ7Gg/rJqWlwrUlSd5GNxPCutKjY7qZhJuR6SQbJjlWaGN2Vd4fVCzKXz8fHRXgMEZS+d+CR4Nsrkb83J3Z4s5kSdJmhYxJ61AWjuzJVwUh4l3/HEYlSL5XXpuh5R2i7W16h1PlNdaTUgkZli1lHzO8+6Q8LzX9+XiLIEVX9lw3A2NdIKGz8E/+7Qs5oYOkwYhjROsDQxIK7xkSM30bQuN7cwMBybAVIyOPJkqXQ1dQyp83KSdsOj7JMyDDRvcEDLI6ehRlm5EcdH7YrReuboN81iUo0Sa7VsuUmgj5hjERCt9r30f9aWuitABai7vKRtjglg7Sp5CrEVPA4PQs6PqKCCRogoggbXJ/Z5Dyw/RZaXPeNR9+qIKN1Vjm9Gew1sRN2JK/3+vXTKtyJXH/uBxgJt4jQlbuShOJuF+BSfTF88sMe67a/357SSOIb4JkaCyd0flDCWYE8576kaHPlVVMT2peXee0LeRXm1e13nG3Na0t3LS/orJLPHOShNQGoDj7qAP5aEKggRya896JGwtvlaBHHTmSQh65G7cyNErZo=\"\nbranches:\n  only:\n    - staging # bors r+\n    - trying  # bors try\n    - master\nnotifications:\n  email:\n    on_success: never\n"
  },
  {
    "path": "Cargo.toml",
    "content": "[package]\nname = \"packed_simd\"\nversion = \"0.3.9\"\ndescription = \"Portable Packed SIMD vectors\"\ndocumentation = \"https://docs.rs/crate/packed_simd/\"\nhomepage = \"https://github.com/rust-lang/packed_simd\"\nrepository = \"https://github.com/rust-lang/packed_simd\"\nkeywords = [\"simd\", \"vector\", \"portability\"]\ncategories = [\"hardware-support\", \"concurrency\", \"no-std\", \"data-structures\"]\nlicense = \"MIT OR Apache-2.0\"\nbuild = \"build.rs\"\nedition = \"2018\"\n\n[package.metadata.docs.rs]\nfeatures = [\"into_bits\"]\nrustdoc-args = [\"--cfg\", \"doc_cfg\"]\n# To build locally:\n# RUSTDOCFLAGS=\"--cfg doc_cfg\" cargo +nightly doc --features into_bits --no-deps --open\n\n[badges]\nis-it-maintained-issue-resolution = { repository = \"rust-lang/packed_simd\" }\nis-it-maintained-open-issues = { repository = \"rust-lang/packed_simd\" }\nmaintenance = { status = \"experimental\" }\n\n[dependencies]\ncfg-if = \"1.0.0\"\ncore_arch = { version = \"0.1.5\", optional = true }\nnum-traits = { version = \"0.2.14\", default-features = false, features = [\"libm\"] }\n\n[features]\ndefault = []\ninto_bits = []\nlibcore_neon = []\n\n[dev-dependencies]\npaste = \"^1\"\narrayvec = { version = \"^0.5\", default-features = false }\n\n[target.'cfg(target_arch = \"x86_64\")'.dependencies.sleef-sys]\nversion = \"0.1.2\"\noptional = true\n\n[target.wasm32-unknown-unknown.dev-dependencies]\n# Keep in sync with the version on Dockerfile.\nwasm-bindgen = \"=0.2.87\"\nwasm-bindgen-test = \"=0.3.37\"\n"
  },
  {
    "path": "LICENSE-APACHE",
    "content": "                              Apache License\n                        Version 2.0, January 2004\n                     http://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. Definitions.\n\n   \"License\" shall mean the terms and conditions for use, reproduction,\n   and distribution as defined by Sections 1 through 9 of this document.\n\n   \"Licensor\" shall mean the copyright owner or entity authorized by\n   the copyright owner that is granting the License.\n\n   \"Legal Entity\" shall mean the union of the acting entity and all\n   other entities that control, are controlled by, or are under common\n   control with that entity. For the purposes of this definition,\n   \"control\" means (i) the power, direct or indirect, to cause the\n   direction or management of such entity, whether by contract or\n   otherwise, or (ii) ownership of fifty percent (50%) or more of the\n   outstanding shares, or (iii) beneficial ownership of such entity.\n\n   \"You\" (or \"Your\") shall mean an individual or Legal Entity\n   exercising permissions granted by this License.\n\n   \"Source\" form shall mean the preferred form for making modifications,\n   including but not limited to software source code, documentation\n   source, and configuration files.\n\n   \"Object\" form shall mean any form resulting from mechanical\n   transformation or translation of a Source form, including but\n   not limited to compiled object code, generated documentation,\n   and conversions to other media types.\n\n   \"Work\" shall mean the work of authorship, whether in Source or\n   Object form, made available under the License, as indicated by a\n   copyright notice that is included in or attached to the work\n   (an example is provided in the Appendix below).\n\n   \"Derivative Works\" shall mean any work, whether in Source or Object\n   form, that is based on (or derived from) the Work and for which the\n   editorial revisions, 
annotations, elaborations, or other modifications\n   represent, as a whole, an original work of authorship. For the purposes\n   of this License, Derivative Works shall not include works that remain\n   separable from, or merely link (or bind by name) to the interfaces of,\n   the Work and Derivative Works thereof.\n\n   \"Contribution\" shall mean any work of authorship, including\n   the original version of the Work and any modifications or additions\n   to that Work or Derivative Works thereof, that is intentionally\n   submitted to Licensor for inclusion in the Work by the copyright owner\n   or by an individual or Legal Entity authorized to submit on behalf of\n   the copyright owner. For the purposes of this definition, \"submitted\"\n   means any form of electronic, verbal, or written communication sent\n   to the Licensor or its representatives, including but not limited to\n   communication on electronic mailing lists, source code control systems,\n   and issue tracking systems that are managed by, or on behalf of, the\n   Licensor for the purpose of discussing and improving the Work, but\n   excluding communication that is conspicuously marked or otherwise\n   designated in writing by the copyright owner as \"Not a Contribution.\"\n\n   \"Contributor\" shall mean Licensor and any individual or Legal Entity\n   on behalf of whom a Contribution has been received by Licensor and\n   subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of\n   this License, each Contributor hereby grants to You a perpetual,\n   worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n   copyright license to reproduce, prepare Derivative Works of,\n   publicly display, publicly perform, sublicense, and distribute the\n   Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of\n   this License, each Contributor hereby grants to You a perpetual,\n   worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n   (except as stated in this section) patent license to make, have made,\n   use, offer to sell, sell, import, and otherwise transfer the Work,\n   where such license applies only to those patent claims licensable\n   by such Contributor that are necessarily infringed by their\n   Contribution(s) alone or by combination of their Contribution(s)\n   with the Work to which such Contribution(s) was submitted. If You\n   institute patent litigation against any entity (including a\n   cross-claim or counterclaim in a lawsuit) alleging that the Work\n   or a Contribution incorporated within the Work constitutes direct\n   or contributory patent infringement, then any patent licenses\n   granted to You under this License for that Work shall terminate\n   as of the date such litigation is filed.\n\n4. Redistribution. You may reproduce and distribute copies of the\n   Work or Derivative Works thereof in any medium, with or without\n   modifications, and in Source or Object form, provided that You\n   meet the following conditions:\n\n   (a) You must give any other recipients of the Work or\n       Derivative Works a copy of this License; and\n\n   (b) You must cause any modified files to carry prominent notices\n       stating that You changed the files; and\n\n   (c) You must retain, in the Source form of any Derivative Works\n       that You distribute, all copyright, patent, trademark, and\n       attribution notices from the Source form of the Work,\n       excluding those notices that do not pertain to any part of\n       the Derivative Works; and\n\n   (d) If the Work includes a \"NOTICE\" text file as part of its\n       distribution, then any Derivative Works that You distribute must\n       include a readable copy of the attribution notices contained\n       within such NOTICE file, excluding 
those notices that do not\n       pertain to any part of the Derivative Works, in at least one\n       of the following places: within a NOTICE text file distributed\n       as part of the Derivative Works; within the Source form or\n       documentation, if provided along with the Derivative Works; or,\n       within a display generated by the Derivative Works, if and\n       wherever such third-party notices normally appear. The contents\n       of the NOTICE file are for informational purposes only and\n       do not modify the License. You may add Your own attribution\n       notices within Derivative Works that You distribute, alongside\n       or as an addendum to the NOTICE text from the Work, provided\n       that such additional attribution notices cannot be construed\n       as modifying the License.\n\n   You may add Your own copyright statement to Your modifications and\n   may provide additional or different license terms and conditions\n   for use, reproduction, or distribution of Your modifications, or\n   for any such Derivative Works as a whole, provided Your use,\n   reproduction, and distribution of the Work otherwise complies with\n   the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise,\n   any Contribution intentionally submitted for inclusion in the Work\n   by You to the Licensor shall be under the terms and conditions of\n   this License, without any additional terms or conditions.\n   Notwithstanding the above, nothing herein shall supersede or modify\n   the terms of any separate license agreement you may have executed\n   with Licensor regarding such Contributions.\n\n6. Trademarks. This License does not grant permission to use the trade\n   names, trademarks, service marks, or product names of the Licensor,\n   except as required for reasonable and customary use in describing the\n   origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. 
Unless required by applicable law or\n   agreed to in writing, Licensor provides the Work (and each\n   Contributor provides its Contributions) on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n   implied, including, without limitation, any warranties or conditions\n   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n   PARTICULAR PURPOSE. You are solely responsible for determining the\n   appropriateness of using or redistributing the Work and assume any\n   risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory,\n   whether in tort (including negligence), contract, or otherwise,\n   unless required by applicable law (such as deliberate and grossly\n   negligent acts) or agreed to in writing, shall any Contributor be\n   liable to You for damages, including any direct, indirect, special,\n   incidental, or consequential damages of any character arising as a\n   result of this License or out of the use or inability to use the\n   Work (including but not limited to damages for loss of goodwill,\n   work stoppage, computer failure or malfunction, or any and all\n   other commercial damages or losses), even if such Contributor\n   has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing\n   the Work or Derivative Works thereof, You may choose to offer,\n   and charge a fee for, acceptance of support, warranty, indemnity,\n   or other liability obligations and/or rights consistent with this\n   License. 
However, in accepting such obligations, You may act only\n   on Your own behalf and on Your sole responsibility, not on behalf\n   of any other Contributor, and only if You agree to indemnify,\n   defend, and hold each Contributor harmless for any liability\n   incurred by, or claims asserted against, such Contributor by reason\n   of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\n   To apply the Apache License to your work, attach the following\n   boilerplate notice, with the fields enclosed by brackets \"[]\"\n   replaced with your own identifying information. (Don't include\n   the brackets!)  The text should be enclosed in the appropriate\n   comment syntax for the file format. We also recommend that a\n   file or class name and description of purpose be included on the\n   same \"printed page\" as the copyright notice for easier\n   identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n\thttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n"
  },
  {
    "path": "LICENSE-MIT",
    "content": "Copyright (c) 2014 The Rust Project Developers\n\nPermission is hereby granted, free of charge, to any\nperson obtaining a copy of this software and associated\ndocumentation files (the \"Software\"), to deal in the\nSoftware without restriction, including without\nlimitation the rights to use, copy, modify, merge,\npublish, distribute, sublicense, and/or sell copies of\nthe Software, and to permit persons to whom the Software\nis furnished to do so, subject to the following\nconditions:\n\nThe above copyright notice and this permission notice\nshall be included in all copies or substantial portions\nof the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF\nANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED\nTO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A\nPARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT\nSHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\nCLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION\nOF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR\nIN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\nDEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# `Simd<[T; N]>`\n\n## Implementation of [Rust RFC #2366: `std::simd`][rfc2366]\n\n[![Latest Version]][crates.io] [![docs]][master_docs]\n\n**WARNING**: this crate only supports the most recent nightly Rust toolchain\nand will be superseded by [`#![feature(portable_simd)]`](https://github.com/rust-lang/portable-simd).\n\n## Documentation\n\n* [API docs (`master` branch)][master_docs]\n* [Performance guide][perf_guide]\n* [API docs (`docs.rs`)][docs.rs]\n* [RFC2366 `std::simd`][rfc2366]: - contains motivation, design rationale,\n  discussion, etc.\n\n## Examples\n\nMost of the examples come with both a scalar and a vectorized implementation.\n\n* [`aobench`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench)\n* [`fannkuch_redux`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/fannkuch_redux)\n* [`matrix inverse`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/matrix_inverse)\n* [`mandelbrot`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/mandelbrot)\n* [`n-body`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/nbody)\n* [`options_pricing`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/options_pricing)\n* [`spectral_norm`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/spectral_norm)\n* [`triangle transform`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/triangle_xform)\n* [`stencil`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/stencil)\n* [`vector dot product`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/dot_product)\n\n## Cargo features\n\n* `into_bits` (default: disabled): enables `FromBits`/`IntoBits` trait\n  implementations for the vector types. 
These allow reinterpreting the bits of a\n  vector type as those of another vector type safely by just using the\n  `.into_bits()` method.\n\n## Performance\n\nThe following [ISPC] examples are also part of `packed_simd`'s\n[`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/)\ndirectory, where `packed_simd`+[`rayon`][rayon] are used to emulate [ISPC]'s\nSingle-Program-Multiple-Data (SPMD) programming model. The performance results\non different hardware are shown in the `readme.md` of each example. The following\ntable summarizes the performance ranges, where `+` means speed-up and `-`\nslowdown:\n\n* `aobench`: `[-1.02x, +1.53x]`,\n* `stencil`: `[+1.06x, +1.72x]`,\n* `mandelbrot`: `[-1.74x, +1.2x]`,\n* `options_pricing`:\n   * `black_scholes`: `+1.0x`\n   * `binomial_put`: `+1.4x`\n\n While SPMD is not the intended use case for `packed_simd`, it is possible to\n combine the library with [`rayon`][rayon] to poorly emulate [ISPC]'s SPMD programming\n model in Rust. Writing performant code is not as straightforward as with\n [ISPC], but with some care (e.g. 
see the [Performance Guide][perf_guide]) one\n can easily match and often out-perform [ISPC]'s \"default performance\".\n\n## Platform support\n\nThe following table describes the supported platforms: `build` shows whether\nthe library compiles without issues for a given target, while `run` shows\nwhether the test suite passes for a given target.\n\n| **Linux**                             | **build** | **run** |\n|---------------------------------------|-----------|---------|\n| `i586-unknown-linux-gnu`              | ✓         | ✗       |\n| `i686-unknown-linux-gnu`              | ✓         | ✗       |\n| `x86_64-unknown-linux-gnu`            | ✓         | ✓       |\n| `arm-unknown-linux-gnueabihf`         | ✓         | ✓       |\n| `armv7-unknown-linux-gnueabi`         | ✓         | ✓       |\n| `aarch64-unknown-linux-gnu`           | ✓         | ✓       |\n| `powerpc-unknown-linux-gnu`           | ✓         | ✗       |\n| `powerpc64-unknown-linux-gnu`         | ✓         | ✗       |\n| `powerpc64le-unknown-linux-gnu`       | ✓         | ✓       |\n| `s390x-unknown-linux-gnu`             | ✓         | ✗       |\n| `sparc64-unknown-linux-gnu`           | ✓         | ✗       |\n| `thumbv7neon-unknown-linux-gnueabihf` | ✓         | ✓       |\n| **MacOSX**                            | **build** | **run** |\n| `x86_64-apple-darwin`                 | ✓         | ✓       |\n| **Android**                           | **build** | **run** |\n| `x86_64-linux-android`                | ✓         | ✓       |\n| `armv7-linux-androideabi`             | ✓         | ✗       |\n| `aarch64-linux-android`               | ✓         | ✗       |\n| `thumbv7neon-linux-androideabi`       | ✓         | ✗       |\n| **iOS**                               | **build** | **run** |\n| `x86_64-apple-ios`                    | ✗         | ✗       |\n| `aarch64-apple-ios`                   | ✗         | ✗       |\n\n\n## Machine code 
verification\n\nThe\n[`verify/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/verify)\ncrate tests disassemble the portable packed vector APIs at run-time and\ncompare the generated machine code against the desired one to make sure that\nthis crate remains efficient.\n\n## License\n\nThis project is licensed under either of\n\n* [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)\n  ([LICENSE-APACHE](LICENSE-APACHE))\n\n* [MIT License](http://opensource.org/licenses/MIT)\n  ([LICENSE-MIT](LICENSE-MIT))\n\nat your option.\n\n## Contributing\n\nWe welcome all people who want to contribute.\nPlease see the [contributing instructions] for more information.\n\nContributions in any form (issues, pull requests, etc.) to this project\nmust adhere to Rust's [Code of Conduct].\n\nUnless you explicitly state otherwise, any contribution intentionally submitted\nfor inclusion in `packed_simd` by you, as defined in the Apache-2.0 license, shall be\ndual licensed as above, without any additional terms or conditions.\n\n[travis]: https://travis-ci.com/rust-lang/packed_simd\n[Travis-CI Status]: https://travis-ci.com/rust-lang/packed_simd.svg?branch=master\n[appveyor]: https://ci.appveyor.com/project/gnzlbg/packed-simd\n[Appveyor Status]: https://ci.appveyor.com/api/projects/status/hd7v9dvr442hgdix?svg=true\n[Latest Version]: https://img.shields.io/crates/v/packed_simd.svg\n[crates.io]: https://crates.io/crates/packed_simd\n[docs]: https://docs.rs/packed_simd/badge.svg\n[docs.rs]: https://docs.rs/packed_simd\n[master_docs]: https://rust-lang-nursery.github.io/packed_simd/packed_simd/\n[perf_guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/\n[rfc2366]: https://github.com/rust-lang/rfcs/pull/2366\n[ISPC]: https://ispc.github.io/\n[rayon]: https://crates.io/crates/rayon\n[boost_license]: https://www.boost.org/LICENSE_1_0.txt\n[SLEEF]: https://sleef.org/\n[sleef_sys]: https://crates.io/crates/sleef-sys\n[contributing 
instructions]: contributing.md\n[Code of Conduct]: https://www.rust-lang.org/en-US/conduct.html\n"
  },
  {
    "path": "bors.toml",
    "content": "status = [\n    \"continuous-integration/travis-ci/push\"\n]"
  },
  {
    "path": "build.rs",
    "content": "fn main() {\n    let target = std::env::var(\"TARGET\").expect(\"TARGET environment variable not defined\");\n    if target.contains(\"neon\") {\n        println!(\"cargo:rustc-cfg=libcore_neon\");\n    }\n}\n"
  },
  {
    "path": "ci/all.sh",
    "content": "#!/usr/bin/env bash\n#\n# Performs an operation on all targets\n\nset -ex\n\n: \"${1?The all.sh script requires one argument.}\"\n\nop=$1\n\ncargo_clean() {\n    cargo clean\n}\n\ncargo_check_fmt() {\n    cargo fmt --all -- --check\n}\n\ncargo_fmt() {\n    cargo fmt --all\n}\n\ncargo_clippy() {\n    cargo clippy --all -- -D clippy::perf\n}\n\nCMD=\"-1\"\n\ncase $op in\n    clean*)\n        CMD=cargo_clean\n        ;;\n    check_fmt*)\n        CMD=cargo_check_fmt\n        ;;\n    fmt*)\n        CMD=cargo_fmt\n        ;;\n    clippy)\n        CMD=cargo_clippy\n        ;;\n    *)\n        echo \"Unknown operation: \\\"${op}\\\"\"\n        exit 1\n        ;;\nesac\n\necho \"Operation is: ${CMD}\"\n\n# On src/\n$CMD\n\n# Check examples/\nfor dir in examples/*/\ndo\n    dir=${dir%*/}\n    (\n        cd \"${dir%*/}\"\n        $CMD\n    )\ndone\n\n(\n    cd verify/verify\n    $CMD\n)\n\n(\n    cd micro_benchmarks\n    $CMD\n)\n"
  },
  {
    "path": "ci/android-install-ndk.sh",
    "content": "#!/usr/bin/env sh\n# Copyright 2016 The Rust Project Developers. See the COPYRIGHT\n# file at the top-level directory of this distribution and at\n# http://rust-lang.org/COPYRIGHT.\n#\n# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or\n# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license\n# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your\n# option. This file may not be copied, modified, or distributed\n# except according to those terms.\n\nset -ex\n\nANDROID_NDK_URL=https://dl.google.com/android/repository\nANDROID_NDK_ARCHIVE=android-ndk-r25b-linux.zip\n\ncurl -fO \"$ANDROID_NDK_URL/$ANDROID_NDK_ARCHIVE\"\nunzip -q $ANDROID_NDK_ARCHIVE\nrm $ANDROID_NDK_ARCHIVE\nmv android-ndk-* ndk\nrm -rf android-ndk-*\n"
  },
  {
    "path": "ci/android-install-sdk.sh",
    "content": "#!/usr/bin/env sh\n# Copyright 2016 The Rust Project Developers. See the COPYRIGHT\n# file at the top-level directory of this distribution and at\n# http://rust-lang.org/COPYRIGHT.\n#\n# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or\n# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license\n# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your\n# option. This file may not be copied, modified, or distributed\n# except according to those terms.\n\nset -ex\n\n# Prep the SDK and emulator\n#\n# Note that the update process requires that we accept a bunch of licenses, and\n# we can't just pipe `yes` into it for some reason, so we take the same strategy\n# located in https://github.com/appunite/docker by just wrapping it in a script\n# which apparently magically accepts the licenses.\n\nmkdir sdk\ncurl --retry 5 https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O\nunzip -d sdk sdk-tools-linux-3859397.zip\n\ncase \"$1\" in\n  arm | armv7)\n    abi=armeabi-v7a\n    ;;\n\n  aarch64)\n    abi=arm64-v8a\n    ;;\n\n  i686)\n    abi=x86\n    ;;\n\n  x86_64)\n    abi=x86_64\n    ;;\n\n  *)\n    echo \"invalid arch: $1\"\n    exit 1\n    ;;\nesac;\n\n# --no_https avoids\n     # javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found\nyes | ./sdk/tools/bin/sdkmanager --licenses --no_https\nyes | ./sdk/tools/bin/sdkmanager --no_https \\\n        \"emulator\" \\\n        \"platform-tools\" \\\n        \"platforms;android-24\" \\\n        \"system-images;android-24;default;$abi\"\n\necho \"no\" |\n    ./sdk/tools/bin/avdmanager create avd \\\n        --name \"${1}\" \\\n        --package \"system-images;android-24;default;$abi\"\n"
  },
  {
    "path": "ci/android-sysimage.sh",
    "content": "#!/usr/bin/env bash\n\n# Copyright 2017 The Rust Project Developers. See the COPYRIGHT\n# file at the top-level directory of this distribution and at\n# http://rust-lang.org/COPYRIGHT.\n#\n# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or\n# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license\n# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your\n# option. This file may not be copied, modified, or distributed\n# except according to those terms.\n\nset -ex\n\nURL=https://dl.google.com/android/repository/sys-img/android\n\nmain() {\n    local arch=\"${1}\"\n    local name=\"${2}\"\n    local dest=/system\n    local td\n    td=\"$(mktemp -d)\"\n\n    apt-get install --no-install-recommends e2tools\n\n    pushd \"${td}\"\n    curl --retry 5 -O \"${URL}/${name}\"\n    unzip -q \"${name}\"\n\n    local system\n    system=\"$(find . -name system.img)\"\n    mkdir -p ${dest}/{bin,lib,lib64}\n\n    # Extract android linker and libraries to /system\n    # This allows android executables to be run directly (or with qemu)\n    if [ \"${arch}\" = \"x86_64\" ] || [ \"${arch}\" = \"arm64\" ]; then\n        e2cp -p \"${system}:/bin/linker64\" \"${dest}/bin/\"\n        e2cp -p \"${system}:/lib64/libdl.so\" \"${dest}/lib64/\"\n        e2cp -p \"${system}:/lib64/libc.so\" \"${dest}/lib64/\"\n        e2cp -p \"${system}:/lib64/libm.so\" \"${dest}/lib64/\"\n    else\n        e2cp -p \"${system}:/bin/linker\" \"${dest}/bin/\"\n        e2cp -p \"${system}:/lib/libdl.so\" \"${dest}/lib/\"\n        e2cp -p \"${system}:/lib/libc.so\" \"${dest}/lib/\"\n        e2cp -p \"${system}:/lib/libm.so\" \"${dest}/lib/\"\n    fi\n\n    # clean up\n    apt-get purge --auto-remove -y e2tools\n\n    popd\n\n    rm -rf \"${td}\"\n}\n\nmain \"${@}\"\n"
  },
  {
    "path": "ci/benchmark.sh",
    "content": "#!/usr/bin/env bash\n#\n# Runs all benchmarks. Controlled by the following environment variables:\n#\n# FEATURES={} - cargo features to pass to all benchmarks (e.g. core_arch,sleef-sys,ispc)\n# NORUN={1}   - only builds the benchmarks\n\nset -ex\n\nif [[ ${NORUN} != 1 ]]; then\n    # Most benchmarks require hyperfine; require it upfront.\n    hash hyperfine 2>/dev/null || { echo >&2 \"hyperfine is not in PATH.\"; exit 1; }\nfi\n\n\n# If the ispc benchmark feature is enabled, ispc must be in the path of the\n# benchmarks. \nif echo \"$FEATURES\" | grep -q \"ispc\"; then\n    hash ispc 2>/dev/null || { echo >&2 \"ispc is not in PATH.\"; exit 1; }\nfi\n\n# An example with a benchmark.sh is a benchmark:\nfor dir in examples/*/\ndo\n    dir=${dir%*/}\n    cd ${dir%*/}\n    if [ -f \"benchmark.sh\" ]; then\n        ./benchmark.sh\n    fi\n    cd -\ndone\n\n"
  },
  {
    "path": "ci/deploy_and_run_on_ios_simulator.rs",
    "content": "// Copyright 2017 The Rust Project Developers. See the COPYRIGHT\n// file at the top-level directory of this distribution and at\n// http://rust-lang.org/COPYRIGHT.\n//\n// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or\n// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license\n// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your\n// option. This file may not be copied, modified, or distributed\n// except according to those terms.\n\n// This is a script to deploy and execute a binary on an iOS simulator.\n// The primary use of this is to be able to run unit tests on the simulator and\n// retrieve the results.\n//\n// To do this through Cargo instead, use Dinghy\n// (https://github.com/snipsco/dinghy): cargo dinghy install, then cargo dinghy\n// test.\n\nuse std::env;\nuse std::fs::{self, File};\nuse std::io::Write;\nuse std::path::Path;\nuse std::process;\nuse std::process::Command;\n\nmacro_rules! t {\n    ($e:expr) => (match $e {\n        Ok(e) => e,\n        Err(e) => panic!(\"{} failed with: {}\", stringify!($e), e),\n    })\n}\n\n// Step one: Wrap as an app\nfn package_as_simulator_app(crate_name: &str, test_binary_path: &Path) {\n    println!(\"Packaging simulator app\");\n    drop(fs::remove_dir_all(\"ios_simulator_app\"));\n    t!(fs::create_dir(\"ios_simulator_app\"));\n    t!(fs::copy(test_binary_path,\n                Path::new(\"ios_simulator_app\").join(crate_name)));\n\n    let mut f = t!(File::create(\"ios_simulator_app/Info.plist\"));\n    t!(f.write_all(format!(r#\"\n        <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n        <!DOCTYPE plist PUBLIC\n                \"-//Apple//DTD PLIST 1.0//EN\"\n                \"http://www.apple.com/DTDs/PropertyList-1.0.dtd\">\n        <plist version=\"1.0\">\n            <dict>\n                <key>CFBundleExecutable</key>\n                <string>{}</string>\n                <key>CFBundleIdentifier</key>\n                
<string>com.rust.unittests</string>\n            </dict>\n        </plist>\n    \"#, crate_name).as_bytes()));\n}\n\n// Step two: Start the iOS simulator\nfn start_simulator() {\n    println!(\"Looking for iOS simulator\");\n    let output = t!(Command::new(\"xcrun\").arg(\"simctl\").arg(\"list\").output());\n    assert!(output.status.success());\n    let mut simulator_exists = false;\n    let mut simulator_booted = false;\n    let mut found_rust_sim = false;\n    let stdout = t!(String::from_utf8(output.stdout));\n    for line in stdout.lines() {\n        if line.contains(\"rust_ios\") {\n            if found_rust_sim {\n                panic!(\"Duplicate rust_ios simulators found. Please \\\n                        double-check xcrun simctl list.\");\n            }\n            simulator_exists = true;\n            simulator_booted = line.contains(\"(Booted)\");\n            found_rust_sim = true;\n        }\n    }\n\n    if simulator_exists == false {\n        println!(\"Creating iOS simulator\");\n        Command::new(\"xcrun\")\n                .arg(\"simctl\")\n                .arg(\"create\")\n                .arg(\"rust_ios\")\n                .arg(\"com.apple.CoreSimulator.SimDeviceType.iPhone-SE\")\n                .arg(\"com.apple.CoreSimulator.SimRuntime.iOS-10-2\")\n                .check_status();\n    } else if simulator_booted == true {\n        println!(\"Shutting down already-booted simulator\");\n        Command::new(\"xcrun\")\n                .arg(\"simctl\")\n                .arg(\"shutdown\")\n                .arg(\"rust_ios\")\n                .check_status();\n    }\n\n    println!(\"Starting iOS simulator\");\n    // We can't uninstall the app (if present) as that will hang if the\n    // simulator isn't completely booted; just erase the simulator instead.\n    Command::new(\"xcrun\").arg(\"simctl\").arg(\"erase\").arg(\"rust_ios\").check_status();\n    
Command::new(\"xcrun\").arg(\"simctl\").arg(\"boot\").arg(\"rust_ios\").check_status();\n}\n\n// Step three: Install the app\nfn install_app_to_simulator() {\n    println!(\"Installing app to simulator\");\n    Command::new(\"xcrun\")\n            .arg(\"simctl\")\n            .arg(\"install\")\n            .arg(\"booted\")\n            .arg(\"ios_simulator_app/\")\n            .check_status();\n}\n\n// Step four: Run the app\nfn run_app_on_simulator() {\n    println!(\"Running app\");\n    let output = t!(Command::new(\"xcrun\")\n                    .arg(\"simctl\")\n                    .arg(\"launch\")\n                    .arg(\"--console\")\n                    .arg(\"booted\")\n                    .arg(\"com.rust.unittests\")\n                    .output());\n\n    println!(\"stdout --\\n{}\\n\", String::from_utf8_lossy(&output.stdout));\n    println!(\"stderr --\\n{}\\n\", String::from_utf8_lossy(&output.stderr));\n\n    let stdout = String::from_utf8_lossy(&output.stdout);\n    let failed = stdout.lines()\n        .find(|l| l.contains(\"FAILED\"))\n        .map(|l| l.contains(\"FAILED\"))\n        .unwrap_or(false);\n\n    let passed = stdout.lines()\n        .find(|l| l.contains(\"test result: ok\"))\n        .map(|l| l.contains(\"test result: ok\"))\n        .unwrap_or(false);\n\n    println!(\"Shutting down simulator\");\n    Command::new(\"xcrun\")\n        .arg(\"simctl\")\n        .arg(\"shutdown\")\n        .arg(\"rust_ios\")\n        .check_status();\n    if !(passed && !failed) {\n        panic!(\"tests didn't pass\");\n    }\n}\n\ntrait CheckStatus {\n    fn check_status(&mut self);\n}\n\nimpl CheckStatus for Command {\n    fn check_status(&mut self) {\n        println!(\"\\trunning: {:?}\", self);\n        assert!(t!(self.status()).success());\n    }\n}\n\nfn main() {\n    let args: Vec<String> = env::args().collect();\n    if args.len() != 2 {\n        println!(\"Usage: {} <executable>\", args[0]);\n        process::exit(-1);\n    }\n\n    let 
test_binary_path = Path::new(&args[1]);\n    let crate_name = test_binary_path.file_name().unwrap();\n\n    package_as_simulator_app(crate_name.to_str().unwrap(), test_binary_path);\n    start_simulator();\n    install_app_to_simulator();\n    run_app_on_simulator();\n}\n"
  },
  {
    "path": "ci/docker/aarch64-linux-android/Dockerfile",
    "content": "FROM ubuntu:16.04\n\nRUN dpkg --add-architecture i386 && \\\n    apt-get update && \\\n    apt-get install -y --no-install-recommends \\\n  file \\\n  make \\\n  curl \\\n  ca-certificates \\\n  python \\\n  unzip \\\n  expect \\\n  openjdk-9-jre \\\n  libstdc++6:i386 \\\n  libpulse0 \\\n  gcc \\\n  libc6-dev\n\nWORKDIR /android/\nCOPY android* /android/\n\nENV ANDROID_ARCH=aarch64\nENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools\n\nRUN sh /android/android-install-ndk.sh $ANDROID_ARCH\nRUN sh /android/android-install-sdk.sh $ANDROID_ARCH\nRUN mv /root/.android /tmp\nRUN chmod 777 -R /tmp/.android\nRUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*\n\nENV PATH=$PATH:/rust/bin \\\n    CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=aarch64-linux-android-gcc \\\n    CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest \\\n    OBJDUMP=aarch64-linux-android-objdump \\\n    HOME=/tmp\n\nADD runtest-android.rs /tmp/runtest.rs\nENTRYPOINT [ \\\n  \"bash\", \\\n  \"-c\", \\\n  # set SHELL so android can detect a 64bits system, see\n  # http://stackoverflow.com/a/41789144\n  \"SHELL=/bin/dash /android/sdk/emulator/emulator @aarch64 -no-window & \\\n   rustc /tmp/runtest.rs -o /tmp/runtest && \\\n   exec \\\"$@\\\"\", \\\n  \"--\" \\\n]\n"
  },
  {
    "path": "ci/docker/aarch64-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc \\\n  ca-certificates \\\n  libc6-dev \\\n  gcc-aarch64-linux-gnu \\\n  libc6-dev-arm64-cross \\\n  qemu-user \\\n  make \\\n  file\n\nENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \\\n    CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER=\"qemu-aarch64 -L /usr/aarch64-linux-gnu\" \\\n    OBJDUMP=aarch64-linux-gnu-objdump\n"
  },
  {
    "path": "ci/docker/arm-unknown-linux-gnueabi/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc \\\n  ca-certificates \\\n  libc6-dev \\\n  libc6-armel-cross \\\n  libc6-dev-armel-cross \\\n  binutils-arm-linux-gnueabi \\\n  gcc-arm-linux-gnueabi \\\n  qemu-user \\\n  make \\\n  file\nENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_LINKER=arm-linux-gnueabi-gcc \\\n    CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_RUNNER=\"qemu-arm -L /usr/arm-linux-gnueabi\" \\\n    OBJDUMP=arm-linux-gnueabi-objdump\n"
  },
  {
    "path": "ci/docker/arm-unknown-linux-gnueabihf/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc \\\n  ca-certificates \\\n  libc6-dev \\\n  gcc-arm-linux-gnueabihf \\\n  libc6-dev-armhf-cross \\\n  qemu-user \\\n  make \\\n  file\nENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \\\n    CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER=\"qemu-arm -L /usr/arm-linux-gnueabihf\" \\\n    OBJDUMP=arm-linux-gnueabihf-objdump\n"
  },
  {
    "path": "ci/docker/armv7-linux-androideabi/Dockerfile",
    "content": "FROM ubuntu:16.04\n\nRUN dpkg --add-architecture i386 && \\\n    apt-get update && \\\n    apt-get install -y --no-install-recommends \\\n  file \\\n  make \\\n  curl \\\n  ca-certificates \\\n  python \\\n  unzip \\\n  expect \\\n  openjdk-9-jre \\\n  libstdc++6:i386 \\\n  libpulse0 \\\n  gcc \\\n  libc6-dev\n\nWORKDIR /android/\nCOPY android* /android/\n\nENV ANDROID_ARCH=arm\nENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools\n\nRUN sh /android/android-install-ndk.sh $ANDROID_ARCH\nRUN sh /android/android-install-sdk.sh $ANDROID_ARCH\nRUN mv /root/.android /tmp\nRUN chmod 777 -R /tmp/.android\nRUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*\n\nENV PATH=$PATH:/rust/bin \\\n    CARGO_TARGET_ARM_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \\\n    CARGO_TARGET_ARM_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \\\n    OBJDUMP=arm-linux-androideabi-objdump \\\n    HOME=/tmp\n\nADD runtest-android.rs /tmp/runtest.rs\nENTRYPOINT [ \\\n  \"bash\", \\\n  \"-c\", \\\n  # set SHELL so android can detect a 64bits system, see\n  # http://stackoverflow.com/a/41789144\n  \"SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \\\n   rustc /tmp/runtest.rs -o /tmp/runtest && \\\n   exec \\\"$@\\\"\", \\\n  \"--\" \\\n]\n"
  },
  {
    "path": "ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc \\\n  ca-certificates \\\n  libc6-dev \\\n  gcc-arm-linux-gnueabihf \\\n  libc6-dev-armhf-cross \\\n  qemu-user \\\n  make \\\n  file\nENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \\\n    CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER=\"qemu-arm -L /usr/arm-linux-gnueabihf\" \\\n    OBJDUMP=arm-linux-gnueabihf-objdump\n"
  },
  {
    "path": "ci/docker/i586-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc-multilib \\\n  libc6-dev \\\n  file \\\n  make \\\n  ca-certificates\n"
  },
  {
    "path": "ci/docker/i686-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc-multilib \\\n  libc6-dev \\\n  file \\\n  make \\\n  ca-certificates\n"
  },
  {
    "path": "ci/docker/mips-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:18.04\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n        gcc libc6-dev qemu-user ca-certificates \\\n        gcc-mips-linux-gnu libc6-dev-mips-cross \\\n        qemu-system-mips \\\n        qemu-user \\\n        make \\\n        file\n\nENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER=mips-linux-gnu-gcc \\\n    CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER=\"qemu-mips -L /usr/mips-linux-gnu\" \\\n    OBJDUMP=mips-linux-gnu-objdump"
  },
  {
    "path": "ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile",
    "content": "FROM ubuntu:18.04\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n        gcc libc6-dev qemu-user ca-certificates \\\n        gcc-mips64-linux-gnuabi64 libc6-dev-mips64-cross \\\n        qemu-system-mips64 qemu-user\n\nENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER=mips64-linux-gnuabi64-gcc \\\n    CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER=\"qemu-mips64 -L /usr/mips64-linux-gnuabi64\" \\\n    OBJDUMP=mips64-linux-gnuabi64-objdump"
  },
  {
    "path": "ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile",
    "content": "FROM ubuntu:18.04\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n        gcc libc6-dev qemu-user ca-certificates \\\n        gcc-mips64el-linux-gnuabi64 libc6-dev-mips64el-cross \\\n        qemu-system-mips64el\n\nENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER=mips64el-linux-gnuabi64-gcc \\\n    CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER=\"qemu-mips64el -L /usr/mips64el-linux-gnuabi64\" \\\n    OBJDUMP=mips64el-linux-gnuabi64-objdump"
  },
  {
    "path": "ci/docker/mipsel-unknown-linux-musl/Dockerfile",
    "content": "FROM ubuntu:18.10\n\nRUN apt-get update && \\\n    apt-get install -y --no-install-recommends \\\n    ca-certificates \\\n    gcc \\\n    libc6-dev \\\n    make \\\n    qemu-user \\\n    qemu-system-mips \\\n    bzip2 \\\n    curl \\\n    file\n\nRUN mkdir /toolchain\n\n# Note that this originally came from:\n# https://downloads.openwrt.org/snapshots/trunk/malta/generic/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2\nRUN curl -L https://ci-mirrors.rust-lang.org/libc/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 | \\\n      tar xjf - -C /toolchain --strip-components=2\n\nENV PATH=$PATH:/rust/bin:/toolchain/bin \\\n    CC_mipsel_unknown_linux_musl=mipsel-openwrt-linux-gcc \\\n    CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_LINKER=mipsel-openwrt-linux-gcc \\\n    CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_RUNNER=\"qemu-mipsel -L /toolchain\"\n"
  },
  {
    "path": "ci/docker/powerpc-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:22.04\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n        gcc libc6-dev qemu-user ca-certificates \\\n        gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \\\n        qemu-system-ppc \\\n        make \\\n        file\n\nENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER=powerpc-linux-gnu-gcc \\\n    CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER=\"qemu-ppc -cpu Vger -L /usr/powerpc-linux-gnu\" \\\n    CC=powerpc-linux-gnu-gcc \\\n    OBJDUMP=powerpc-linux-gnu-objdump\n"
  },
  {
    "path": "ci/docker/powerpc64-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:22.04\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n    gcc \\\n    ca-certificates \\\n    libc6-dev \\\n    gcc-powerpc64-linux-gnu \\\n    libc6-dev-ppc64-cross \\\n    qemu-user  \\\n    qemu-system-ppc \\\n    make \\\n    file \n\nENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER=powerpc64-linux-gnu-gcc \\\n    CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER=\"qemu-ppc64 -L /usr/powerpc64-linux-gnu\" \\\n    CC=powerpc64-linux-gnu-gcc \\\n    OBJDUMP=powerpc64-linux-gnu-objdump\n"
  },
  {
    "path": "ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:22.04\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n        gcc libc6-dev qemu-user ca-certificates \\\n        gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \\\n        qemu-system-ppc file make\n\nENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER=powerpc64le-linux-gnu-gcc \\\n    CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER=\"qemu-ppc64le -L /usr/powerpc64le-linux-gnu\" \\\n    CC=powerpc64le-linux-gnu-gcc \\\n    OBJDUMP=powerpc64le-linux-gnu-objdump\n"
  },
  {
    "path": "ci/docker/s390x-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:22.04\n\nRUN apt-get update && \\\n    apt-get install -y --no-install-recommends \\\n    ca-certificates \\\n    curl \\\n    cmake \\\n    gcc \\\n    libc6-dev \\\n    g++-s390x-linux-gnu \\\n    libc6-dev-s390x-cross \\\n    qemu-user \\\n    make \\\n    file\n\nENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \\\n    CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER=\"qemu-s390x -L /usr/s390x-linux-gnu\" \\\n    CC_s390x_unknown_linux_gnu=s390x-linux-gnu-gcc \\\n    CXX_s390x_unknown_linux_gnu=s390x-linux-gnu-g++ \\\n    OBJDUMP=s390x-linux-gnu-objdump\n"
  },
  {
    "path": "ci/docker/sparc64-unknown-linux-gnu/Dockerfile",
    "content": "FROM debian:bookworm\n\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n        curl ca-certificates \\\n        gcc libc6-dev \\\n        gcc-sparc64-linux-gnu libc6-dev-sparc64-cross \\\n        qemu-system-sparc64 openbios-sparc seabios ipxe-qemu \\\n        p7zip-full cpio\n\nCOPY linux-sparc64.sh /\nRUN bash /linux-sparc64.sh\n\nCOPY test-runner-linux /\n\nENV CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_LINKER=sparc64-linux-gnu-gcc \\\n    CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_RUNNER=\"/test-runner-linux sparc64\" \\\n    CC_sparc64_unknown_linux_gnu=sparc64-linux-gnu-gcc \\\n    PATH=$PATH:/rust/bin\n"
  },
  {
    "path": "ci/docker/thumbv7neon-linux-androideabi/Dockerfile",
    "content": "FROM ubuntu:16.04\n\nRUN dpkg --add-architecture i386 && \\\n    apt-get update && \\\n    apt-get install -y --no-install-recommends \\\n  file \\\n  make \\\n  curl \\\n  ca-certificates \\\n  python \\\n  unzip \\\n  expect \\\n  openjdk-9-jre \\\n  libstdc++6:i386 \\\n  libpulse0 \\\n  gcc \\\n  libc6-dev\n\nWORKDIR /android/\nCOPY android* /android/\n\nENV ANDROID_ARCH=arm\nENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools\n\nRUN sh /android/android-install-ndk.sh $ANDROID_ARCH\nRUN sh /android/android-install-sdk.sh $ANDROID_ARCH\nRUN mv /root/.android /tmp\nRUN chmod 777 -R /tmp/.android\nRUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*\n\nENV PATH=$PATH:/rust/bin \\\n    CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \\\n    CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \\\n    OBJDUMP=arm-linux-androideabi-objdump \\\n    HOME=/tmp\n\nADD runtest-android.rs /tmp/runtest.rs\nENTRYPOINT [ \\\n  \"bash\", \\\n  \"-c\", \\\n  # set SHELL so android can detect a 64bits system, see\n  # http://stackoverflow.com/a/41789144\n  \"SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \\\n   rustc /tmp/runtest.rs -o /tmp/runtest && \\\n   exec \\\"$@\\\"\", \\\n  \"--\" \\\n]\n"
  },
  {
    "path": "ci/docker/thumbv7neon-unknown-linux-gnueabihf/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc \\\n  ca-certificates \\\n  libc6-dev \\\n  gcc-arm-linux-gnueabihf \\\n  libc6-dev-armhf-cross \\\n  qemu-user \\\n  make \\\n  file\nENV CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \\\n    CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_RUNNER=\"qemu-arm -L /usr/arm-linux-gnueabihf\" \\\n    OBJDUMP=arm-linux-gnueabihf-objdump\n"
  },
  {
    "path": "ci/docker/wasm32-unknown-unknown/Dockerfile",
    "content": "FROM ubuntu:22.04\n\nRUN apt-get update -y && apt-get install -y --no-install-recommends \\\n  ca-certificates \\\n  clang \\\n  cmake \\\n  curl \\\n  git \\\n  libc6-dev \\\n  make \\\n  ninja-build \\\n  python-is-python3 \\\n  xz-utils\n\n# Install `wasm2wat`\nRUN git clone --recursive https://github.com/WebAssembly/wabt\nRUN make -C wabt -j$(nproc)\nENV PATH=$PATH:/wabt/bin\n\n# Install `wasm-bindgen-test-runner`\nRUN curl -L https://github.com/rustwasm/wasm-bindgen/releases/download/0.2.87/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl.tar.gz \\\n  | tar xzf -\n# Keep in sync with the version on Cargo.toml.\nENV PATH=$PATH:/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl\nENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner\n\n# Install `node`\nRUN curl https://nodejs.org/dist/v14.16.0/node-v14.16.0-linux-x64.tar.xz | tar xJf -\nENV PATH=$PATH:/node-v14.16.0-linux-x64/bin\n\n# We use a shim linker that removes `--strip-debug` when passed to LLD. While\n# this typically results in invalid debug information in release mode it doesn't\n# result in an invalid names section which is what we're interested in.\nCOPY lld-shim.rs /\nENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_LINKER=/tmp/lld-shim\n\n# Rustc isn't available until this container starts, so defer compilation of the\n# shim.\nENTRYPOINT /rust/bin/rustc /lld-shim.rs -o /tmp/lld-shim && exec bash \"$@\"\n"
  },
  {
    "path": "ci/docker/x86_64-linux-android/Dockerfile",
    "content": "FROM ubuntu:20.04\n\nRUN apt-get update && \\\n    apt-get install -y --no-install-recommends \\\n  ca-certificates \\\n  curl \\\n  gcc \\\n  libc-dev \\\n  python \\\n  unzip \\\n  file \\\n  make\n\nWORKDIR /android/\nENV ANDROID_ARCH=x86_64\nCOPY android-install-ndk.sh /android/\nRUN sh /android/android-install-ndk.sh\n\nENV STDARCH_ASSERT_INSTR_LIMIT=30\n\n# We do not run x86_64-linux-android tests on an android emulator.\n# See ci/android-sysimage.sh for informations about how tests are run.\nCOPY android-sysimage.sh /android/\nRUN bash /android/android-sysimage.sh x86_64 x86_64-24_r07.zip\n\nENV PATH=$PATH:/rust/bin:/android/ndk/toolchains/llvm/prebuilt/linux-x86_64/bin \\\n    CARGO_TARGET_X86_64_LINUX_ANDROID_LINKER=x86_64-linux-android21-clang \\\n    CC_x86_64_linux_android=x86_64-linux-android21-clang \\\n    CXX_x86_64_linux_android=x86_64-linux-android21-clang++ \\\n    OBJDUMP=llvm-objdump \\\n    HOME=/tmp\n"
  },
  {
    "path": "ci/docker/x86_64-unknown-linux-gnu/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc \\\n  libc6-dev \\\n  file \\\n  make \\\n  ca-certificates \\\n  cmake \\\n  libclang-dev \\\n  clang\n"
  },
  {
    "path": "ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile",
    "content": "FROM ubuntu:18.04\nRUN apt-get update && apt-get install -y --no-install-recommends \\\n  gcc \\\n  libc6-dev \\\n  file \\\n  make \\\n  ca-certificates \\\n  wget \\\n  bzip2 \\\n  cmake \\\n  libclang-dev \\\n  clang\n\nRUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2\nRUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2\nENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER=\"/sde-external-8.16.0-2018-01-30-lin/sde64 --\"\n"
  },
  {
    "path": "ci/dox.sh",
    "content": "#!/bin/sh\n\nset -ex\n\nrm -rf target/doc\nmkdir -p target/doc\n\n# Build API documentation\ncargo doc --features=into_bits\n\n# Build Performance Guide\n# FIXME: https://github.com/rust-lang-nursery/mdBook/issues/780\n# mdbook build perf-guide -d target/doc/perf-guide\ncd perf-guide\nmdbook build\ncd -\ncp -r perf-guide/book target/doc/perf-guide\n\n# If we're on travis, not a PR, and on the right branch, publish!\nif [ \"$TRAVIS_PULL_REQUEST\" = \"false\" ] && [ \"$TRAVIS_BRANCH\" = \"master\" ]; then\n  python3 -vV\n  pip -vV\n  python3.9 -vV\n  pip install ghp_import --user\n  ghp-import -n target/doc\n  git push -qf https://${GH_PAGES}@github.com/${TRAVIS_REPO_SLUG}.git gh-pages\nfi\n"
  },
  {
    "path": "ci/linux-s390x.sh",
    "content": "set -ex\n\nmkdir -m 777 /qemu\ncd /qemu\n\ncurl -LO https://github.com/qemu/qemu/raw/master/pc-bios/s390-ccw.img\ncurl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/kernel.debian\ncurl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/initrd.debian\n\nmv kernel.debian kernel\nmv initrd.debian initrd.gz\n\nmkdir init\ncd init\ngunzip -c ../initrd.gz | cpio -id\nrm ../initrd.gz\ncp /usr/s390x-linux-gnu/lib/libgcc_s.so.1 usr/lib/\nchmod a+w .\n"
  },
  {
    "path": "ci/linux-sparc64.sh",
    "content": "set -ex\n\nmkdir -m 777 /qemu\ncd /qemu\n\ncurl -LO https://cdimage.debian.org/cdimage/ports/9.0/sparc64/iso-cd/debian-9.0-sparc64-NETINST-1.iso\n7z e debian-9.0-sparc64-NETINST-1.iso boot/initrd.gz\n7z e debian-9.0-sparc64-NETINST-1.iso boot/sparc64\nmv sparc64 kernel\nrm debian-9.0-sparc64-NETINST-1.iso\n\nmkdir init\ncd init\ngunzip -c ../initrd.gz | cpio -id\nrm ../initrd.gz\ncp /usr/sparc64-linux-gnu/lib/libgcc_s.so.1 usr/lib/\nchmod a+w .\n"
  },
  {
    "path": "ci/lld-shim.rs",
    "content": "use std::os::unix::prelude::*;\nuse std::process::Command;\nuse std::env;\n\nfn main() {\n    let args = env::args()\n        .skip(1)\n        .filter(|s| s != \"--strip-debug\")\n        .collect::<Vec<_>>();\n    panic!(\"failed to exec: {}\", Command::new(\"rust-lld\").args(&args).exec());\n}\n"
  },
  {
    "path": "ci/max_line_width.sh",
    "content": "#!/usr/bin/env sh\n\nset -x\n\nexport success=true\n\nfind . -iname '*.rs' | while read -r file; do\n    result=$(grep '.\\{79\\}' \"${file}\" | grep --invert 'http')\n    if [ \"${result}\" = \"\" ]\n    then\n        :\n    else\n        echo \"file \\\"${file}\\\": $result\"\n        exit 1\n    fi\ndone\n\n"
  },
  {
    "path": "ci/run-docker.sh",
    "content": "# Small script to run tests for a target (or all targets) inside all the\n# respective docker images.\n\nset -ex\n\nrun() {\n    echo \"Building docker container for TARGET=${TARGET} RUSTFLAGS=${RUSTFLAGS}\"\n    docker build -t packed_simd -f ci/docker/${TARGET}/Dockerfile ci/\n    mkdir -p target\n    target=$(echo \"${TARGET}\" | sed 's/-emulated//')\n    echo \"Running docker\"\n    docker run \\\n      --user `id -u`:`id -g` \\\n      --rm \\\n      --init \\\n      --volume $HOME/.cargo:/cargo \\\n      --env CARGO_HOME=/cargo \\\n      --volume `rustc --print sysroot`:/rust:ro \\\n      --env TARGET=$target \\\n      --env NORUN \\\n      --env NOVERIFY \\\n      --env RUSTFLAGS \\\n      --volume `pwd`:/checkout:ro \\\n      --volume `pwd`/target:/checkout/target \\\n      --workdir /checkout \\\n      --privileged \\\n      packed_simd \\\n      bash \\\n      -c 'PATH=$PATH:/rust/bin exec ci/run.sh'\n}\n\nif [ -z \"${TARGET}\" ]; then\n  for d in `ls ci/docker/`; do\n    run $d\n  done\nelse\n  run ${TARGET}\nfi\n"
  },
  {
    "path": "ci/run.sh",
    "content": "#!/usr/bin/env bash\n\nset -ex\n\n: ${TARGET?\"The TARGET environment variable must be set.\"}\n\n# Tests are all super fast anyway, and they fault often enough on travis that\n# having only one thread increases debuggability to be worth it.\n#export RUST_TEST_THREADS=1\n#export RUST_BACKTRACE=full\n#export RUST_TEST_NOCAPTURE=1\n\n# Some appveyor builds run out-of-memory; this attempts to mitigate that:\n# https://github.com/rust-lang-nursery/packed_simd/issues/39\n# export RUSTFLAGS=\"${RUSTFLAGS} -C codegen-units=1\"\n# export CARGO_BUILD_JOBS=1\n\nexport CARGO_SUBCMD=test\nif [[ \"${NORUN}\" == \"1\" ]]; then\n    export CARGO_SUBCMD=build\nfi\n\nif [[ ${TARGET} == \"x86_64-apple-ios\" ]] || [[ ${TARGET} == \"i386-apple-ios\" ]]; then\n    export RUSTFLAGS=\"${RUSTFLAGS} -Clink-arg=-mios-simulator-version-min=7.0\"\n    rustc ./ci/deploy_and_run_on_ios_simulator.rs -o $HOME/runtest\n    export CARGO_TARGET_X86_64_APPLE_IOS_RUNNER=$HOME/runtest\n    export CARGO_TARGET_I386_APPLE_IOS_RUNNER=$HOME/runtest\nfi\n\n# The source directory is read-only. 
Need to copy internal crates to the target\n# directory for their Cargo.lock to be properly written.\nmkdir target || true\n\nrustc --version\ncargo --version\necho \"TARGET=${TARGET}\"\necho \"HOST=${HOST}\"\necho \"RUSTFLAGS=${RUSTFLAGS}\"\necho \"NORUN=${NORUN}\"\necho \"NOVERIFY=${NOVERIFY}\"\necho \"CARGO_SUBCMD=${CARGO_SUBCMD}\"\necho \"CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS}\"\necho \"CARGO_INCREMENTAL=${CARGO_INCREMENTAL}\"\necho \"RUST_TEST_THREADS=${RUST_TEST_THREADS}\"\necho \"RUST_BACKTRACE=${RUST_BACKTRACE}\"\necho \"RUST_TEST_NOCAPTURE=${RUST_TEST_NOCAPTURE}\"\n\ncargo_test() {\n    cmd=\"cargo ${CARGO_SUBCMD} --verbose --target=${TARGET} ${@}\"\n    if [ \"${NORUN}\" != \"1\" ]\n    then\n        if [ \"$TARGET\" != \"wasm32-unknown-unknown\" ]\n        then\n            cmd=\"$cmd -- --quiet\"\n        fi\n    fi\n    mkdir target || true\n    ${cmd} 2>&1 | tee > target/output\n    if [[ ${PIPESTATUS[0]} != 0 ]]; then\n        cat target/output\n        return 1\n    fi\n}\n\ncargo_test_impl() {\n    ORIGINAL_RUSTFLAGS=${RUSTFLAGS}\n    RUSTFLAGS=\"${ORIGINAL_RUSTFLAGS} --cfg test_v16  --cfg test_v32 --cfg test_v64\" cargo_test ${@}\n    RUSTFLAGS=\"${ORIGINAL_RUSTFLAGS} --cfg test_v128 --cfg test_v256\" cargo_test ${@}\n    RUSTFLAGS=\"${ORIGINAL_RUSTFLAGS} --cfg test_v512\" cargo_test ${@}\n    RUSTFLAGS=${ORIGINAL_RUSTFLAGS}\n}\n\n# Debug run:\nif [[ \"${TARGET}\" != \"wasm32-unknown-unknown\" ]]; then\n   # Run wasm32-unknown-unknown in release mode only\n   cargo_test_impl\nfi\n\nif [[ \"${TARGET}\" == \"x86_64-unknown-linux-gnu\" ]] || [[ \"${TARGET}\" == \"x86_64-pc-windows-msvc\" ]]; then\n    # use sleef on linux and windows x86_64 builds\n    # FIXME: Use `core_arch,sleef-sys` features once they works again\n    cargo_test_impl --release --features=into_bits\nelse\n    # FIXME: Use `core_arch` feature once it works again\n    cargo_test_impl --release --features=into_bits\nfi\n\n# Verify code generation\nif [[ \"${NOVERIFY}\" != \"1\" ]]; 
then\n    cp -r verify/verify target/verify\n    export STDSIMD_ASSERT_INSTR_LIMIT=30\n    if [[ \"${TARGET}\" == \"i586-unknown-linux-gnu\" ]]; then\n        export STDSIMD_ASSERT_INSTR_LIMIT=50\n    fi\n    cargo_test --release --manifest-path=target/verify/Cargo.toml\nfi\n\n# FIXME: Figure out which examples take too long to run and ignore or adjust those\n#. ci/run_examples.sh\n"
  },
  {
    "path": "ci/run_examples.sh",
    "content": "# Runs all examples.\n\n# FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/55\n# All examples fail to build for `armv7-apple-ios`.\nif [[ ${TARGET} == \"armv7-apple-ios\" ]]; then\n    exit 0\nfi\n\n# FIXME: travis exceeds 50 minutes on these targets\n# Skipping the examples is an attempt at preventing travis from timing-out\nif [[ ${TARGET} == \"arm-linux-androidabi\" ]] || [[ ${TARGET} == \"aarch64-linux-androidabi\" ]] \\\n    || [[ ${TARGET} == \"sparc64-unknown-linux-gnu\" ]]; then\n    exit 0\nfi\n\nif [[ ${TARGET} == \"wasm32-unknown-unknown\" ]]; then\n    exit 0\nfi\n\ncp -r examples/aobench target/aobench\ncargo_test --manifest-path=target/aobench/Cargo.toml --release --no-default-features\ncargo_test --manifest-path=target/aobench/Cargo.toml --release --features=256bit\n\ncp -r examples/dot_product target/dot_product\ncargo_test --manifest-path=target/dot_product/Cargo.toml --release\n\ncp -r examples/fannkuch_redux target/fannkuch_redux\ncargo_test --manifest-path=target/fannkuch_redux/Cargo.toml --release\n\n# FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/56\nif [[ ${TARGET} != \"i586-unknown-linux-gnu\" ]]; then\n    cp -r examples/mandelbrot target/mandelbrot\n    cargo_test --manifest-path=target/mandelbrot/Cargo.toml --release\nfi\n\ncp -r examples/matrix_inverse target/matrix_inverse\ncargo_test --manifest-path=target/matrix_inverse/Cargo.toml --release\n\ncp -r examples/nbody target/nbody\ncargo_test --manifest-path=target/nbody/Cargo.toml --release\n\ncp -r examples/spectral_norm target/spectral_norm\ncargo_test --manifest-path=target/spectral_norm/Cargo.toml --release\n\nif [[ ${TARGET} != \"i586-unknown-linux-gnu\" ]]; then\n    cp -r examples/stencil target/stencil\n    cargo_test --manifest-path=target/stencil/Cargo.toml --release\nfi\n\ncp -r examples/triangle_xform target/triangle_xform\ncargo_test --manifest-path=target/triangle_xform/Cargo.toml --release\n"
  },
  {
    "path": "ci/runtest-android.rs",
    "content": "use std::env;\nuse std::process::Command;\nuse std::path::{Path, PathBuf};\n\nfn main() {\n    let args = env::args_os()\n        .skip(1)\n        .filter(|arg| arg != \"--quiet\")\n        .collect::<Vec<_>>();\n    assert_eq!(args.len(), 1);\n    let test = PathBuf::from(&args[0]);\n    let dst = Path::new(\"/data/local/tmp\").join(test.file_name().unwrap());\n\n    let status = Command::new(\"adb\")\n        .arg(\"wait-for-device\")\n        .status()\n        .expect(\"failed to run: adb wait-for-device\");\n    assert!(status.success());\n\n    let status = Command::new(\"adb\")\n        .arg(\"push\")\n        .arg(&test)\n        .arg(&dst)\n        .status()\n        .expect(\"failed to run: adb pushr\");\n    assert!(status.success());\n\n    let output = Command::new(\"adb\")\n        .arg(\"shell\")\n        .arg(&dst)\n        .output()\n        .expect(\"failed to run: adb shell\");\n    assert!(status.success());\n\n    println!(\"status: {}\\nstdout ---\\n{}\\nstderr ---\\n{}\",\n             output.status,\n             String::from_utf8_lossy(&output.stdout),\n             String::from_utf8_lossy(&output.stderr));\n\n    let stdout = String::from_utf8_lossy(&output.stdout);\n    let mut lines = stdout.lines().filter(|l| l.starts_with(\"test result\"));\n    if !lines.all(|l| l.contains(\"test result: ok\") && l.contains(\"0 failed\")) {\n        panic!(\"failed to find successful test run\");\n    }\n}\n"
  },
  {
    "path": "ci/setup_benchmarks.sh",
    "content": "#!/usr/bin/env bash\n\nset -ex\n\n# Get latest ISPC binary for the target and put it in the path\ngit clone https://github.com/gnzlbg/ispc-binaries\ncp ispc-binaries/ispc-${TARGET} ispc\n"
  },
  {
    "path": "ci/test-runner-linux",
    "content": "#!/bin/sh\n\nset -e\n\narch=$1\nprog=$2\n\ncd /qemu/init\ncp -f $2 prog\nfind . | cpio --create --format='newc' --quiet | gzip > ../initrd.gz\ncd ..\n\ntimeout 30s qemu-system-$arch \\\n        -m 1024 \\\n        -nographic \\\n        -kernel kernel \\\n        -initrd initrd.gz \\\n        -append init=/prog > output || true\n\n# remove kernel messages\ntr -d '\\r' < output | egrep -v '^\\['\n\n# if the output contains a failure, return error\n! grep FAILED output > /dev/null\n"
  },
  {
    "path": "contributing.md",
    "content": "# Contributing to `packed_simd`\n\nWelcome! If you are reading this document, it means you are interested in contributing\nto the `packed_simd` crate.\n\n## Reporting issues\n\nAll issues with this crate are tracked using GitHub's [Issue Tracker].\n\nYou can use issues to bring bugs to the attention of the maintainers, to discuss\ncertain problems encountered with the crate, or to request new features (although\nfeature requests should be limited to things mentioned in the [RFC]).\n\nOne thing to keep in mind is to always use the **latest** nightly toolchain when\nworking on this crate. Due to the nature of this project, we use a lot of unstable\nfeatures, meaning breakage happens often.\n\n[Issue Tracker]: https://github.com/rust-lang-nursery/packed_simd/issues\n[RFC]: https://github.com/rust-lang/rfcs/pull/2366\n\n### LLVM issues\n\nThe Rust compiler relies on [LLVM](https://llvm.org/) for machine code generation,\nand quite a few LLVM bugs have been discovered during the development of this project.\n\nIf you encounter issues with incorrect/suboptimal codegen, which you do not encounter\nwhen using the [SIMD vendor intrinsics](https://doc.rust-lang.org/nightly/std/arch/),\nit is likely the issue is with LLVM, or this crate's interaction with it.\n\nYou should first open an issue **in this repo** to help us track the problem, and we\nwill help determine what is the exact cause of the problem.\nIf LLVM is indeed the cause, the issue will be reported upstream to the\n[LLVM bugtracker](https://bugs.llvm.org/).\n\n## Submitting Pull Requests\n\nNew code is submitted to the crate using GitHub's [pull request] mechanism.\nYou should first fork this repository, make your changes (preferably in a new\nbranch), then use GitHub's web UI to create a new PR.\n\n[pull request]: https://help.github.com/articles/about-pull-requests/\n\n### Examples\n\nThe `examples` directory contains code showcasing SIMD code written with this crate,\nusually in comparison to 
scalar or ISPC code. If you have a project / idea which\nuses SIMD, we'd love to add it to the examples list.\n\nEvery example should include a small `README`, describing the example code's purpose.\nIf your example could potentially work as a benchmark, then add a `benchmark.sh`\nscript to allow running the example benchmark code in CI. See an existing example's\n[`benchmark.sh`](examples/aobench/benchmark.sh) for a sample.\n\nDon't forget to update the crate's top-level `README` with a link to your example.\n\n### Perf guide\n\nThe objective of the [performance guide][perf-guide] is to be a comprehensive\nresource detailing the process of optimizing Rust code with SIMD support.\n\nIf you believe a certain section could be reworded, or if you have any tips & tricks\nrelated to SIMD which you'd like to share, please open a PR.\n\n[mdBook] is used to manage the formatting of the guide as a book.\n\n[perf-guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/\n[mdBook]: https://github.com/rust-lang-nursery/mdBook\n"
  },
  {
    "path": "examples/Cargo.toml",
    "content": "# FIXME: Many members of this workspace, including aobench, mandelbrot, and stencil,\n# currently trigger a \"null pointer deref\" warning.\n# This is likely due to unsoundness inside packed_simd.\n[workspace]\nmembers = [\n    \"aobench\",\n    \"dot_product\",\n    \"fannkuch_redux\",\n    \"mandelbrot\",\n    \"matrix_inverse\",\n    \"nbody\",\n    \"options_pricing\",\n    \"slice_sum\",\n    \"spectral_norm\",\n    \"stencil\",\n    \"triangle_xform\",\n]\n\n[profile.release]\n# Remember to uncomment this when profiling\n# debug = 2\n\n# You can set the following to lto = 'thin' and 'codegen-units=16'\n# for better compile times at the cost of performance\nlto = 'fat'\ncodegen-units = 1\nincremental = false\npanic = 'abort'\n\n[profile.bench]\n# Same as above\nlto = 'fat'\ncodegen-units = 1\nincremental = false"
  },
  {
    "path": "examples/aobench/Cargo.toml",
    "content": "[package]\nname = \"aobench\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nautobenches = false\nedition = \"2018\"\n\n[[bin]]\nname = \"aobench\"\npath = \"src/main.rs\"\n\n[lib]\nname = \"aobench_lib\"\npath = \"src/lib.rs\"\n\n[dependencies]\nstructopt = \"^0.3\"\nfailure = \"^0.1\"\npng = \"^0.15\"\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\nrayon = \"^1.0\"\ntime = \"^0.1\"\ncfg-if = \"^0.1\"\nispc = { version = \"^1.0.4\", optional = true }\n\n[build-dependencies]\nispc = { version = \"^1.0.4\", optional = true }\n\n[dev-dependencies]\ncriterion = { version = '^0.3', features=['real_blackbox'] }\n\n[features]\ndefault = [ \"256bit\" ]\n256bit = []\nsleef-sys = [ \"packed_simd/sleef-sys\" ]\ncore_arch = [ \"packed_simd/core_arch\" ]\n\n[[bench]]\nname = \"isec_sphere\"\npath = \"benches/isec_sphere.rs\"\nharness = false\n\n[[bench]]\nname = \"isec_plane\"\npath = \"benches/isec_plane.rs\"\nharness = false\n\n[[bench]]\nname = \"ambient_occlusion\"\npath = \"benches/ambient_occlusion.rs\"\nharness = false\n\n[[bench]]\nname = \"random\"\npath = \"benches/random.rs\"\nharness = false\n"
  },
  {
    "path": "examples/aobench/benches/ambient_occlusion.rs",
    "content": "//! Benchmarks intersection between rays and planes\n#![feature(stdsimd)]\n\nuse aobench_lib::*;\nuse criterion::*;\nuse intersection::Isect;\nuse aobench_lib::scene::Test;\n\nfn hit_scalar(c: &mut Criterion) {\n    let mut scene = Test::default();\n    c.bench(\n        \"scalar\",\n        Benchmark::new(\"ao_hit\", move |b| {\n            b.iter(|| {\n                let mut isect = Isect::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut scene);\n                let mut v = ambient_occlusion::scalar(s, isect);\n                black_box(&mut v);\n            })\n        })\n        .throughput(Throughput::Elements(1)),\n    );\n}\n\nfn hit_vector(c: &mut Criterion) {\n    let mut scene = Test::default();\n\n    c.bench(\n        \"vector\",\n        Benchmark::new(\"ao_hit\", move |b| {\n            b.iter(|| {\n                let mut isect = Isect::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut scene);\n                let mut v = ambient_occlusion::vector(s, isect);\n                black_box(&mut v);\n            })\n        })\n        .throughput(Throughput::Elements(1)),\n    );\n}\n\ncriterion_group!(benches, hit_scalar, hit_vector);\ncriterion_main!(benches);\n"
  },
  {
    "path": "examples/aobench/benches/isec_plane.rs",
    "content": "//! Benchmarks intersection between rays and planes\n#![feature(stdsimd)]\n\nuse criterion::*;\n\nuse crate::geometry::{f32xN, Plane, Ray, RayxN, V3DxN, V3D};\nuse crate::intersection::{Intersect, Isect, IsectxN};\nuse aobench_lib::*;\n\nfn hit_scalar(c: &mut Criterion) {\n    let mut s = Plane {\n        p: V3D {\n            x: 0.,\n            y: 0.,\n            z: 10.,\n        },\n        n: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n    let mut r = Ray {\n        origin: V3D {\n            x: 0.,\n            y: 0.,\n            z: 0.,\n        },\n        dir: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n\n    c.bench(\n        \"scalar\",\n        Benchmark::new(\"isec_plane_hit\", move |b| {\n            b.iter(|| {\n                let mut isect = Isect::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                let r = black_box(&mut r);\n                let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n                assert_eq!(v.hit, true);\n            })\n        })\n        .throughput(Throughput::Elements(1)),\n    );\n}\n\nfn miss_scalar(c: &mut Criterion) {\n    let mut s = Plane {\n        p: V3D {\n            x: 0.,\n            y: 0.,\n            z: -10.,\n        },\n        n: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n    let mut r = Ray {\n        origin: V3D {\n            x: 0.,\n            y: 0.,\n            z: 0.,\n        },\n        dir: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n\n    c.bench(\n        \"scalar\",\n        Benchmark::new(\"isec_plane_miss\", move |b| {\n            b.iter(|| {\n                let mut isect = Isect::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                
let r = black_box(&mut r);\n                let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n                assert_eq!(v.hit, false);\n            })\n        })\n        .throughput(Throughput::Elements(1)),\n    );\n}\n\nfn hit_vector(c: &mut Criterion) {\n    let mut s = Plane {\n        p: V3D {\n            x: 0.,\n            y: 0.,\n            z: 10.,\n        },\n        n: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n    let mut r = RayxN {\n        origin: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(0.),\n        },\n        dir: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(1.),\n        },\n    };\n\n    c.bench(\n        \"vector\",\n        Benchmark::new(\"isec_plane_hit\", move |b| {\n            b.iter(|| {\n                let mut isect = IsectxN::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                let r = black_box(&mut r);\n                let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n                assert_eq!(v.hit.all(), true);\n            })\n        })\n        .throughput(Throughput::Elements(f32xN::lanes() as u64)),\n    );\n}\n\nfn miss_vector(c: &mut Criterion) {\n    let mut s = Plane {\n        p: V3D {\n            x: 0.,\n            y: 0.,\n            z: -10.,\n        },\n        n: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n    let mut r = RayxN {\n        origin: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(0.),\n        },\n        dir: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(1.),\n        },\n    };\n\n    c.bench(\n        \"vector\",\n        
Benchmark::new(\"isec_plane_miss\", move |b| {\n            b.iter(|| {\n                let mut isect = IsectxN::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                let r = black_box(&mut r);\n                let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n                assert_eq!(v.hit.any(), false);\n            })\n        })\n        .throughput(Throughput::Elements(f32xN::lanes() as u64)),\n    );\n}\n\ncriterion_group!(benches, hit_scalar, miss_scalar, hit_vector, miss_vector);\ncriterion_main!(benches);\n"
  },
  {
    "path": "examples/aobench/benches/isec_sphere.rs",
    "content": "//! Benchmarks intersection between rays and spheres\n#![feature(stdsimd)]\n\nuse crate::geometry::{f32xN, Ray, RayxN, Sphere, V3DxN, V3D};\nuse crate::intersection::{Intersect, Isect, IsectxN};\nuse aobench_lib::*;\nuse criterion::*;\n\nfn hit_scalar(c: &mut Criterion) {\n    let mut s = Sphere {\n        center: V3D {\n            x: 0.,\n            y: 0.,\n            z: 10.,\n        },\n        radius: 1.,\n    };\n\n    let mut r = Ray {\n        origin: V3D {\n            x: 0.,\n            y: 0.,\n            z: 0.,\n        },\n        dir: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n\n    c.bench(\n        \"scalar\",\n        Benchmark::new(\"isec_sphere_hit\", move |b| {\n            b.iter(|| {\n                let mut isect = Isect::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                let r = black_box(&mut r);\n                let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n                assert_eq!(v.hit, true);\n            })\n        })\n        .throughput(Throughput::Elements(1)),\n    );\n}\n\nfn miss_scalar(c: &mut Criterion) {\n    let mut s = Sphere {\n        center: V3D {\n            x: 0.,\n            y: 0.,\n            z: -10.,\n        },\n        radius: 1.,\n    };\n    let mut r = Ray {\n        origin: V3D {\n            x: 0.,\n            y: 0.,\n            z: 0.,\n        },\n        dir: V3D {\n            x: 0.,\n            y: 0.,\n            z: 1.,\n        },\n    };\n\n    c.bench(\n        \"scalar\",\n        Benchmark::new(\"isec_sphere_miss\", move |b| {\n            b.iter(|| {\n                let mut isect = Isect::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                let r = black_box(&mut r);\n                let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n     
           assert_eq!(v.hit, false);\n            })\n        })\n        .throughput(Throughput::Elements(1)),\n    );\n}\n\nfn hit_vector(c: &mut Criterion) {\n    let mut s = Sphere {\n        center: V3D {\n            x: 0.,\n            y: 0.,\n            z: 10.,\n        },\n        radius: 1.,\n    };\n    let mut r = RayxN {\n        origin: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(0.),\n        },\n        dir: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(1.),\n        },\n    };\n\n    c.bench(\n        \"vector\",\n        Benchmark::new(\"isec_sphere_hit\", move |b| {\n            b.iter(|| {\n                let mut isect = IsectxN::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                let r = black_box(&mut r);\n                let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n                assert_eq!(v.hit.all(), true);\n            })\n        })\n        .throughput(Throughput::Elements(f32xN::lanes() as u64)),\n    );\n}\n\nfn miss_vector(c: &mut Criterion) {\n    let mut s = Sphere {\n        center: V3D {\n            x: 0.,\n            y: 0.,\n            z: -10.,\n        },\n        radius: 1.,\n    };\n    let mut r = RayxN {\n        origin: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(0.),\n        },\n        dir: V3DxN {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(1.),\n        },\n    };\n\n    c.bench(\n        \"vector\",\n        Benchmark::new(\"isec_sphere_miss\", move |b| {\n            b.iter(|| {\n                let mut isect = IsectxN::default();\n                let isect = black_box(&mut isect);\n                let s = black_box(&mut s);\n                let r = black_box(&mut r);\n              
  let mut v = r.intersect(s, *isect);\n                black_box(&mut v);\n                assert_eq!(v.hit.any(), false);\n            })\n        })\n        .throughput(Throughput::Elements(f32xN::lanes() as u64)),\n    );\n}\n\ncriterion_group!(benches, hit_scalar, miss_scalar, hit_vector, miss_vector);\ncriterion_main!(benches);\n"
  },
  {
    "path": "examples/aobench/benches/random.rs",
    "content": "//! Benchmarks PRNG\n#![feature(stdsimd)]\n\nuse aobench_lib::geometry::f32xN;\nuse aobench_lib::random;\nuse criterion::*;\n\nfn random_scalar(c: &mut Criterion) {\n    c.bench(\n        \"scalar\",\n        Benchmark::new(\"random\", move |b| {\n            let mut rng = random::scalar::thread_rng();\n            b.iter(|| {\n                black_box(rng.gen());\n            })\n        })\n        .throughput(Throughput::Elements(1)),\n    );\n}\n\nfn random_vector(c: &mut Criterion) {\n    c.bench(\n        \"vector\",\n        Benchmark::new(\"random\", move |b| {\n            let mut rng = random::vector::thread_rng();\n            b.iter(|| {\n                black_box(rng.gen());\n            })\n        })\n        .throughput(Throughput::Elements(f32xN::lanes() as u64)),\n    );\n}\n\ncriterion_group!(benches, random_scalar, random_vector);\ncriterion_main!(benches);\n"
  },
  {
    "path": "examples/aobench/benches/scanlines.rs",
    "content": "#![feature(test)]\n\nuse test::{black_box, Bencher};\n\n#[bench]\nfn scanlines_scalar(b: &mut Bencher) {\n    let width = 50;\n    let height = 50;\n    let width = black_box(width);\n    let height = black_box(height);\n\n    let mut fdata = Vec::new();\n    fdata.resize(width * height * 3, 0.);\n    fdata = black_box(fdata);\n    b.iter(|| {\n        black_box(&mut fdata);\n        aobench_lib::scalar::scanlines(0, height, width, height, 2, &mut fdata);\n    });\n}\n\n#[bench]\nfn scanlines_vector(b: &mut Bencher) {\n    let width = 50;\n    let height = 50;\n    let width = black_box(width);\n    let height = black_box(height);\n\n    let mut fdata = Vec::new();\n    fdata.resize(width * height * 3, 0.);\n    fdata = black_box(fdata);\n    b.iter(|| {\n        black_box(&mut fdata);\n        aobench_lib::vector::scanlines(0, height, width, height, 2, &mut fdata);\n    });\n}\n"
  },
  {
    "path": "examples/aobench/benchmark.sh",
    "content": "#!/usr/bin/env bash\n#\n# Runs aobench benchmarks\n\nset -ex\n\nexport WIDTH=800\nexport HEIGHT=600\n\nif [[ ${NORUN} != 1 ]]; then\n    hash hyperfine 2>/dev/null || { echo >&2 \"hyperfine is not in PATH.\"; exit 1; }\nfi\n\nALGS=(\"scalar\" \"scalar_par\" \"vector\" \"vector_par\" \"tiled\" \"tiled_par\")\nif echo \"$FEATURES\" | grep -q \"ispc\"; then\n    hash ispc 2>/dev/null || { echo >&2 \"ispc is not in PATH.\"; exit 1; }\n    ALGS+=(\"ispc\" \"ispc_tasks\")\nfi\n\necho \"Benchmark 256-bit wide vectors\"\nRUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n         cargo build --release --no-default-features \\\n         --features=\"${FEATURES},256bit\"\n\nif [[ \"${VERIFY}\" == \"1\" ]]; then\n    RUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n    cargo test --release --no-default-features \\\n          --features=\"${FEATURES},256bit\"\nfi\n\nif [[ \"${NORUN}\" == \"1\" ]]; then\n    exit 0\nfi\n\nfor alg in \"${ALGS[@]}\"\ndo\n    hyperfine \"../target/release/aobench ${WIDTH} ${HEIGHT} --algo ${alg}\"\ndone\n\necho \"Benchmark 128-bit wide vectors\"\nRUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n         cargo build --release --no-default-features \\\n         --features=\"${FEATURES}\"\nfor alg in \"${ALGS[@]}\"\ndo\n    hyperfine \"../target/release/aobench ${WIDTH} ${HEIGHT} --algo ${alg}\"\ndone\n"
  },
  {
    "path": "examples/aobench/build.rs",
    "content": "fn main() {\n    println!(\"cargo:rerun-if-changed=build.rs\");\n\n    #[cfg(feature = \"ispc\")]\n    {\n        if std::env::var(\"CARGO_FEATURE_ISPC\").is_ok() {\n            let mut cfg = ispc::Config::new();\n\n            if cfg!(windows) {\n                cfg.debug(false);\n            }\n\n            let ispc_files = vec![\"volta/ao.ispc\"];\n\n            for s in &ispc_files[..] {\n                cfg.file(*s);\n            }\n\n            cfg.target_isas(vec![\n                ispc::opt::TargetISA::SSE2i32x4,\n                ispc::opt::TargetISA::SSE4i32x4,\n                ispc::opt::TargetISA::AVX1i32x8,\n                ispc::opt::TargetISA::AVX2i32x8,\n                ispc::opt::TargetISA::AVX512KNLi32x16,\n            ]);\n\n            cfg.compile(\"aobench\");\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/readme.md",
    "content": "# Ambient Occlusion Benchmark\n\n> Originally written by Syoyo Fujita: https://github.com/syoyo/aobench\n\n`aobench` is a small ambient occlusion renderer for benchmarking real-world\nfloating point performance in various languages.\n\n![image_vector_par](https://user-images.githubusercontent.com/904614/41043073-653aa5be-69a3-11e8-8a9d-007def8516cc.png)\n\n## Instructions\n\n\nTo run it with the default target options (replace `${NAME}` with an algorithm name):\n\n```\n> cargo run --release -- 800 600 --algo ${NAME}\n```\n\nUse `RUSTFLAGS` to set the target CPU, for example:\n\n```\n> RUSTFLAGS=\"-C target-cpu=native\" cargo run --release -- 800 600 --algo ${NAME}\n```\n\n## Results\n\n```\n./benchmark.sh\n```\n\nOn a dual core AVX1 i5 @1.8 GHz:\n\n| 800 x 600    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     | 5884                | 1.0x                    |\n| `scalar_par` | 2206                | 2.7x                    |\n| `vector`     | 1458                | 4.0x                    |\n| `vector_par` | 622                 | 9.5x                    |\n| `tiled`      | 1328                | 4.4x                    |\n| `tiled_par`  | 578                 | 10.2x                   |\n| `ispc`       | 1158                | 5.1x                    |\n| `ispc_tasks` | 567                 | 10.4x                   |\n\n`tiled_par` is 1.02x slower than `ispc_tasks`.\n\nOn a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz:\n\n| 800 x 600    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     | 2981                | 1.0x                    |\n| `scalar_par` | 163                 | 18.2x                   |\n| `vector`     | 692                 | 4.3x                    |\n| `vector_par` | 98                  | 30.4x                   |\n| `tiled`      | 640                 | 4.7x                    |\n| 
`tiled_par`  | 98                  | 30.4x                   |\n| `ispc`       | 576                 | 5.2x                    |\n| `ispc_tasks` | 150                 | 19.9x                   |\n\n`tiled_par` is 1.53x faster than `ispc_tasks`.\n\n\nOn a 40 core Xeon Gold 6148 CPU @ 2.40GHz:\n\n| 800 x 600    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     | 3215                | 1.0x                    |\n| `scalar_par` | 186                 | 17.0x                   |\n| `vector`     | 802                 | 4.0x                    |\n| `vector_par` | 106                 | 30.3x                   |\n| `tiled`      | 770                 | 4.2x                    |\n| `tiled_par`  | 102                 | 32.1x                   |\n| `ispc`       | 491                 | 6.5x                    |\n| `ispc_tasks` | 153                 | 21.7x                   |\n\n`tiled_par` is 1.5x faster than `ispc_tasks`.\n\n## Overview\n\nThere are 4 main pieces in the `aobench` benchmark:\n\n* ray-plane intersection algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/intersection/ray_plane.rs)\n* ray-sphere intersection algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/intersection/ray_sphere.rs)\n* ambient occlusion algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/ambient_occlusion.rs)\n* ray-casting the pixels:\n  * scalar serial: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/scalar.rs)\n  * scalar parallel: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/scalar_parallel.rs)\n  * vector serial: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/vector.rs)\n  * vector parallel: 
[source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/vector_parallel.rs)\n\nThe scalar and vectorized implementations of the intersection and ao algorithms\nare in the same file so that they can be easily compared.\n\nAs a comparison, the ISPC sources of the same benchmark are [here](https://github.com/ispc/ispc/tree/master/examples/aobench).\n"
  },
  {
    "path": "examples/aobench/rustfmt.toml",
    "content": "max_width = 79"
  },
  {
    "path": "examples/aobench/src/ambient_occlusion.rs",
    "content": "//! Ambient Occlusion implementations\n\nuse crate::geometry::{f32xN, Ray, RayxN, Selectable, V3DxN, V3D};\nuse crate::intersection::{Intersect, Isect, IsectxN};\nuse crate::scene::Scene;\nuse std::f32::consts::PI;\n\n/// Scalar ambient occlusion algorithm\n#[inline(always)]\npub fn scalar<S: Scene>(scene: &mut S, isect: &Isect) -> f32 {\n    let mut occlusion: f32 = 0.0;\n\n    let basis = isect.n.ortho_basis();\n    let eps: f32 = 0.0001;\n    let origin = isect.p + eps * isect.n;\n\n    let ntheta: usize = S::NAO_SAMPLES;\n    let nphi: usize = S::NAO_SAMPLES;\n    for _i in 0..ntheta {\n        for _j in 0..nphi {\n            let theta = scene.rand().sqrt();\n            let phi = 2. * PI * scene.rand();\n\n            let n = V3D {\n                x: phi.cos() * theta,\n                y: phi.sin() * theta,\n                z: (1.0 - theta * theta).sqrt(),\n            };\n            let dir = basis * n;\n            let ray = Ray { origin, dir };\n\n            let mut occ_isect = Isect::default();\n            for s in scene.spheres() {\n                occ_isect = ray.intersect(s, occ_isect);\n            }\n            occ_isect = ray.intersect(scene.plane(), occ_isect);\n\n            if occ_isect.hit {\n                occlusion += 1.;\n            }\n        }\n    }\n\n    1. 
- occlusion / (ntheta * nphi) as f32\n}\n\n/// Vectorized ambient occlusion algorithm using ray packets\n#[inline(always)]\npub fn vector<S: Scene>(scene: &mut S, isect: &Isect) -> f32 {\n    let mut occlusion = f32xN::splat(0.0);\n\n    let basis = isect.n.ortho_basis();\n    let eps: f32 = 0.0001;\n    let origin = isect.p + eps * isect.n;\n    let origin = V3DxN {\n        x: f32xN::splat(origin.x),\n        y: f32xN::splat(origin.y),\n        z: f32xN::splat(origin.z),\n    };\n\n    let ntheta: usize = S::NAO_SAMPLES;\n    let nphi: usize = S::NAO_SAMPLES;\n    for _i in 0..ntheta {\n        for _j in (0..nphi).step_by(f32xN::lanes()) {\n            let (theta, phi) = scene.rand_f32xN();\n            let theta = theta.sqrte();\n            let (sin, cos) = (2. * phi).sin_cos_pi();\n\n            let n = V3DxN {\n                x: cos * theta,\n                y: sin * theta,\n                z: (f32xN::splat(1.0) - theta * theta).sqrt(),\n            };\n            let dir = basis * n;\n            let ray = RayxN { origin, dir };\n\n            let mut occ_isect = IsectxN::default();\n            for s in scene.spheres() {\n                occ_isect = ray.intersect(s, occ_isect);\n            }\n            occ_isect = ray.intersect(scene.plane(), occ_isect);\n\n            occlusion += occ_isect.hit.sel(f32xN::splat(1.), f32xN::splat(0.));\n        }\n    }\n\n    1. 
- occlusion.sum() / (ntheta * nphi) as f32\n}\n\n/// Vectorized ambient occlusion algorithm using ray packets\n#[inline(always)]\npub fn vector_tiled<S: Scene>(scene: &mut S, isect: &IsectxN) -> f32xN {\n    let mut occlusion = f32xN::splat(0.0);\n\n    let basis = isect.n.ortho_basis();\n    let eps = f32xN::splat(0.0001);\n    let origin = isect.p + eps * isect.n;\n\n    let ntheta: usize = S::NAO_SAMPLES;\n    let nphi: usize = S::NAO_SAMPLES;\n    for _i in 0..ntheta {\n        for _j in 0..nphi {\n            let (theta, phi) = scene.rand_f32xN();\n            let theta = theta.sqrte();\n            let (sin, cos) = (2. * phi).sin_cos_pi();\n\n            let n = V3DxN {\n                x: cos * theta,\n                y: sin * theta,\n                z: (1.0 - theta * theta).sqrt(),\n            };\n            let dir = basis * n;\n            let ray = RayxN { origin, dir };\n\n            let mut occ_isect = IsectxN::default();\n            for s in scene.spheres() {\n                occ_isect = ray.intersect(s, occ_isect);\n            }\n            occ_isect = ray.intersect(scene.plane(), occ_isect);\n\n            occlusion += occ_isect.hit.sel(f32xN::splat(1.), f32xN::splat(0.));\n        }\n    }\n\n    f32xN::splat(1.) 
- occlusion / (ntheta * nphi) as f32\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::geometry::V3D;\n\n    #[test]\n    fn sanity_hit() {\n        let scene = crate::scene::Test::default();\n        let mut scene_scalar = scene.clone();\n        let mut scene_vector = scene.clone();\n        let ray = Ray {\n            origin: V3D::default(),\n            dir: V3D {\n                x: -0.2,\n                y: -0.2,\n                z: -0.2,\n            },\n        };\n        let mut isect = Isect::default();\n\n        for s in scene.spheres() {\n            isect = ray.intersect(s, isect);\n        }\n        isect = ray.intersect(scene.plane(), isect);\n\n        assert!(isect.hit);\n\n        let ao_scalar = scalar(&mut scene_scalar, &isect);\n        let ao_vector = vector(&mut scene_vector, &isect);\n        assert_eq!(ao_scalar, ao_vector);\n    }\n\n    #[test]\n    fn sanity_miss() {\n        let scene = crate::scene::Test::default();\n        let mut scene_scalar = scene.clone();\n        let mut scene_vector = scene.clone();\n\n        let ray = Ray {\n            origin: V3D::default(),\n            dir: V3D {\n                x: 0.2,\n                y: 0.2,\n                z: 0.2,\n            },\n        };\n        let mut isect = Isect::default();\n\n        for s in scene.spheres() {\n            isect = ray.intersect(s, isect);\n        }\n        isect = ray.intersect(scene.plane(), isect);\n\n        assert!(!isect.hit);\n\n        let ao_scalar = scalar(&mut scene_scalar, &isect);\n        let ao_vector = vector(&mut scene_vector, &isect);\n        assert_eq!(ao_scalar, ao_vector);\n    }\n\n}\n"
  },
  {
    "path": "examples/aobench/src/geometry/mod.rs",
    "content": "//! Geometry utilities\n\nuse packed_simd::*;\n\nmod plane;\nmod ray;\nmod sphere;\nmod vec;\n\nmod rayxN;\nmod vecxN;\n\npub use self::plane::Plane;\npub use self::ray::Ray;\npub use self::sphere::Sphere;\npub use self::vec::{Dot, M3x3, V3D};\n\npub use self::rayxN::RayxN;\npub use self::vecxN::{Selectable, V3DxN};\n\n#[cfg(feature = \"256bit\")]\npub type f32xN = f32x8;\n#[cfg(feature = \"256bit\")]\npub type u32xN = u32x8;\n#[cfg(feature = \"256bit\")]\npub type usizexN = usizex8;\n#[cfg(feature = \"256bit\")]\npub type m32xN = m32x8;\n#[cfg(feature = \"256bit\")]\npub type pf32xN = Simd<[*mut f32; 8]>;\n\n#[cfg(not(feature = \"256bit\"))]\npub type f32xN = f32x4;\n#[cfg(not(feature = \"256bit\"))]\npub type u32xN = u32x4;\n#[cfg(not(feature = \"256bit\"))]\npub type usizexN = usizex4;\n#[cfg(not(feature = \"256bit\"))]\npub type m32xN = m32x4;\n#[cfg(not(feature = \"256bit\"))]\npub type pf32xN = Simd<[*mut f32; 4]>;\n\npub trait IncrV {\n    type Element;\n    fn incr(x: Self::Element, step: Self::Element) -> Self;\n}\n\nimpl IncrV for f32xN {\n    type Element = f32;\n    #[inline(always)]\n    fn incr(x: f32, step: f32) -> Self {\n        #[cfg(feature = \"256bit\")]\n        {\n            Self::new(\n                x + 0. * step,\n                x + 1. * step,\n                x + 2. * step,\n                x + 3. * step,\n                x + 4. * step,\n                x + 5. * step,\n                x + 6. * step,\n                x + 7. * step,\n            )\n        }\n        #[cfg(not(feature = \"256bit\"))]\n        {\n            Self::new(\n                x + 0. * step,\n                x + 1. * step,\n                x + 2. * step,\n                x + 3. 
* step,\n            )\n        }\n    }\n}\n\nimpl IncrV for u32xN {\n    type Element = u32;\n    #[inline(always)]\n    fn incr(x: u32, step: u32) -> Self {\n        #[cfg(feature = \"256bit\")]\n        {\n            Self::new(\n                x + 0 * step,\n                x + 1 * step,\n                x + 2 * step,\n                x + 3 * step,\n                x + 4 * step,\n                x + 5 * step,\n                x + 6 * step,\n                x + 7 * step,\n            )\n        }\n        #[cfg(not(feature = \"256bit\"))]\n        {\n            Self::new(x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step)\n        }\n    }\n}\n\nimpl IncrV for usizexN {\n    type Element = usize;\n    #[inline(always)]\n    fn incr(x: usize, step: usize) -> Self {\n        #[cfg(feature = \"256bit\")]\n        {\n            Self::new(\n                x + 0 * step,\n                x + 1 * step,\n                x + 2 * step,\n                x + 3 * step,\n                x + 4 * step,\n                x + 5 * step,\n                x + 6 * step,\n                x + 7 * step,\n            )\n        }\n        #[cfg(not(feature = \"256bit\"))]\n        {\n            Self::new(x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step)\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/geometry/plane.rs",
    "content": "//! Plane\n\nuse crate::geometry::V3D;\n\n#[derive(Copy, Clone, Debug)]\npub struct Plane {\n    pub p: V3D,\n    pub n: V3D,\n}\n"
  },
  {
    "path": "examples/aobench/src/geometry/ray.rs",
    "content": "//! A ray\n\nuse crate::geometry::V3D;\n\n/// Ray starting at `origin` in `dir` direction.\n#[derive(Copy, Clone, Debug)]\npub struct Ray {\n    pub origin: V3D,\n    pub dir: V3D,\n}\n"
  },
  {
    "path": "examples/aobench/src/geometry/rayxN.rs",
    "content": "//! Packed rays\n\nuse crate::geometry::{Ray, V3DxN};\n\n/// Packed rays starting at `origin` in `dir` direction.\n#[derive(Copy, Clone, Debug)]\npub struct RayxN {\n    pub origin: V3DxN,\n    pub dir: V3DxN,\n}\n\nimpl RayxN {\n    pub fn get(&self, idx: usize) -> Ray {\n        Ray {\n            origin: self.origin.get(idx),\n            dir: self.dir.get(idx),\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/geometry/sphere.rs",
    "content": "//! Sphere\n\nuse crate::geometry::V3D;\n\n#[derive(Copy, Clone, Debug)]\npub struct Sphere {\n    pub center: V3D,\n    pub radius: f32,\n}\n"
  },
  {
    "path": "examples/aobench/src/geometry/vec.rs",
    "content": "//! A simple vector type\n\nuse std::ops::*;\n\n#[derive(Copy, Clone, Debug, PartialEq)]\npub struct V3D {\n    pub x: f32,\n    pub y: f32,\n    pub z: f32,\n}\n\nimpl Default for V3D {\n    #[inline(always)]\n    #[must_use]\n    fn default() -> Self {\n        Self {\n            x: 0.,\n            y: 0.,\n            z: 0.,\n        }\n    }\n}\n\npub type M3x3 = [V3D; 3];\n\nimpl V3D {\n    #[inline(always)]\n    #[must_use]\n    pub fn cross(self, o: Self) -> Self {\n        Self {\n            x: self.y * o.z - self.z * o.y,\n            y: self.z * o.x - self.x * o.z,\n            z: self.x * o.y - self.y * o.x,\n        }\n    }\n    #[inline(always)]\n    #[must_use]\n    pub fn normalized(self) -> Self {\n        let len2 = self.dot(self);\n        let invlen = len2.sqrt().recip();\n        invlen * self\n    }\n    #[inline(always)]\n    #[must_use]\n    pub fn ortho_basis(self) -> M3x3 {\n        let n = self;\n        let mut basis = [Self::default(), Self::default(), n];\n\n        if n.x < 0.6 && n.x > -0.6 {\n            basis[1].x = 1.0;\n        } else if n.y < 0.6 && n.y > -0.6 {\n            basis[1].y = 1.0;\n        } else if n.z < 0.6 && n.z > -0.6 {\n            basis[1].z = 1.0;\n        } else {\n            basis[1].x = 1.0;\n        }\n\n        basis[0] = basis[1].cross(basis[2]).normalized();\n        basis[1] = basis[2].cross(basis[0]).normalized();\n        basis\n    }\n    // Fuzzy float comparison between vectors\n    #[inline(always)]\n    #[must_use]\n    pub fn almost_eq(&self, rhs: &Self) -> bool {\n        const EPSILON: f32 = 1E-3;\n        (self.x - rhs.x).abs() < EPSILON\n            && (self.y - rhs.y).abs() < EPSILON\n            && (self.z - rhs.z).abs() < EPSILON\n    }\n}\n\nimpl Add for V3D {\n    type Output = Self;\n    #[inline(always)]\n    fn add(self, o: Self) -> Self::Output {\n        Self {\n            x: self.x + o.x,\n            y: self.y + o.y,\n            z: self.z + o.z,\n        
}\n    }\n}\n\nimpl Sub for V3D {\n    type Output = Self;\n    #[inline(always)]\n    fn sub(self, o: Self) -> Self::Output {\n        Self {\n            x: self.x - o.x,\n            y: self.y - o.y,\n            z: self.z - o.z,\n        }\n    }\n}\n\nimpl Mul for V3D {\n    type Output = Self;\n    fn mul(self, o: Self) -> Self::Output {\n        Self {\n            x: self.x * o.x,\n            y: self.y * o.y,\n            z: self.z * o.z,\n        }\n    }\n}\n\nimpl Mul<f32> for V3D {\n    type Output = Self;\n    #[inline(always)]\n    fn mul(self, o: f32) -> Self::Output {\n        Self {\n            x: self.x * o,\n            y: self.y * o,\n            z: self.z * o,\n        }\n    }\n}\n\nimpl Mul<V3D> for f32 {\n    type Output = V3D;\n    #[inline(always)]\n    fn mul(self, o: V3D) -> Self::Output {\n        o * self\n    }\n}\n\nimpl Mul<V3D> for M3x3 {\n    type Output = V3D;\n    #[inline(always)]\n    fn mul(self, o: V3D) -> Self::Output {\n        V3D {\n            x: o.dot(V3D {\n                x: self[0].x,\n                y: self[1].x,\n                z: self[2].x,\n            }),\n            y: o.dot(V3D {\n                x: self[0].y,\n                y: self[1].y,\n                z: self[2].y,\n            }),\n            z: o.dot(V3D {\n                x: self[0].z,\n                y: self[1].z,\n                z: self[2].z,\n            }),\n        }\n    }\n}\n\n/// Vector dot product\npub trait Dot<O> {\n    type Output;\n    fn dot(self, _: O) -> Self::Output;\n}\n\nimpl Dot<V3D> for V3D {\n    type Output = f32;\n    #[inline(always)]\n    fn dot(self, o: Self) -> Self::Output {\n        self.x * o.x + self.y * o.y + self.z * o.z\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/geometry/vecxN.rs",
    "content": "//! A simple vector type\n\nuse std::ops::*;\n\nuse crate::geometry::{f32xN, m32xN, Dot, M3x3, V3D};\n\n#[derive(Copy, Clone, Debug)]\npub struct V3DxN {\n    pub x: f32xN,\n    pub y: f32xN,\n    pub z: f32xN,\n}\n\nimpl Default for V3DxN {\n    #[inline(always)]\n    #[must_use]\n    fn default() -> Self {\n        Self {\n            x: f32xN::splat(0.),\n            y: f32xN::splat(0.),\n            z: f32xN::splat(0.),\n        }\n    }\n}\n\nimpl V3DxN {\n    #[inline(always)]\n    #[must_use]\n    pub fn normalized(self) -> Self {\n        let len2 = self.dot(self);\n        let invlen = len2.rsqrte();\n        invlen * self\n    }\n\n    pub fn get(&self, idx: usize) -> V3D {\n        V3D {\n            x: self.x.extract(idx),\n            y: self.y.extract(idx),\n            z: self.z.extract(idx),\n        }\n    }\n\n    #[must_use]\n    #[inline(always)]\n    pub fn ortho_basis(self) -> [Self; 3] {\n        let n = self;\n        let mut basis = [Self::default(), Self::default(), n];\n\n        let max = f32xN::splat(0.6);\n        let min = f32xN::splat(-0.6);\n        let one = f32xN::splat(1.0);\n\n        let mx = n.x.lt(max) & n.x.gt(min);\n        let my = n.y.lt(max) & n.y.gt(min);\n        let mz = n.z.lt(max) & n.z.gt(min);\n\n        basis[1].x = (mx | (!mx & !my & !mz)).select(one, basis[1].x);\n        basis[1].y = (!mx & my).select(one, basis[1].y);\n        basis[1].z = (!mx & !my & mz).select(one, basis[1].z);\n\n        basis[0] = basis[1].cross(basis[2]).normalized();\n        basis[1] = basis[2].cross(basis[0]).normalized();\n        basis\n    }\n\n    #[inline(always)]\n    #[must_use]\n    pub fn cross(self, o: Self) -> Self {\n        Self {\n            x: self.y * o.z - self.z * o.y,\n            y: self.z * o.x - self.x * o.z,\n            z: self.x * o.y - self.y * o.x,\n        }\n    }\n}\n\nimpl Add for V3DxN {\n    type Output = Self;\n    #[inline(always)]\n    fn add(self, o: Self) -> Self::Output {\n      
  Self {\n            x: self.x + o.x,\n            y: self.y + o.y,\n            z: self.z + o.z,\n        }\n    }\n}\n\nimpl Mul for V3DxN {\n    type Output = Self;\n    #[inline(always)]\n    fn mul(self, o: Self) -> Self::Output {\n        Self {\n            x: self.x * o.x,\n            y: self.y * o.y,\n            z: self.z * o.z,\n        }\n    }\n}\n\nimpl Mul<V3DxN> for f32xN {\n    type Output = V3DxN;\n    #[inline(always)]\n    fn mul(self, o: V3DxN) -> Self::Output {\n        V3DxN {\n            x: self * o.x,\n            y: self * o.y,\n            z: self * o.z,\n        }\n    }\n}\n\nimpl Mul<V3DxN> for [V3DxN; 3] {\n    type Output = V3DxN;\n    #[inline(always)]\n    fn mul(self, o: V3DxN) -> Self::Output {\n        V3DxN {\n            x: o.dot(V3DxN {\n                x: self[0].x,\n                y: self[1].x,\n                z: self[2].x,\n            }),\n            y: o.dot(V3DxN {\n                x: self[0].y,\n                y: self[1].y,\n                z: self[2].y,\n            }),\n            z: o.dot(V3DxN {\n                x: self[0].z,\n                y: self[1].z,\n                z: self[2].z,\n            }),\n        }\n    }\n}\n\nimpl Sub<V3D> for V3DxN {\n    type Output = Self;\n    #[inline(always)]\n    fn sub(self, o: V3D) -> Self::Output {\n        Self {\n            x: self.x - f32xN::splat(o.x),\n            y: self.y - f32xN::splat(o.y),\n            z: self.z - f32xN::splat(o.z),\n        }\n    }\n}\n\nimpl Dot<V3DxN> for V3DxN {\n    type Output = f32xN;\n    #[inline(always)]\n    fn dot(self, o: Self) -> Self::Output {\n        self.x.mul_adde(o.x, self.y.mul_adde(o.y, self.z * o.z))\n    }\n}\n\nimpl Dot<V3D> for V3DxN {\n    type Output = f32xN;\n    #[inline(always)]\n    fn dot(self, o: V3D) -> Self::Output {\n        self.x.mul_adde(\n            f32xN::splat(o.x),\n            self.y.mul_adde(f32xN::splat(o.y), self.z * o.z),\n        )\n    }\n}\n\npub trait Selectable<O, P> {\n    type 
Output;\n    fn sel(self, a: O, b: P) -> Self::Output;\n}\n\nimpl Selectable<f32xN, f32xN> for m32xN {\n    type Output = f32xN;\n    #[inline(always)]\n    fn sel(self, a: f32xN, b: f32xN) -> f32xN {\n        self.select(a, b)\n    }\n}\n\nimpl Selectable<V3DxN, V3DxN> for m32xN {\n    type Output = V3DxN;\n    #[inline(always)]\n    fn sel(self, a: V3DxN, b: V3DxN) -> V3DxN {\n        V3DxN {\n            x: self.select(a.x, b.x),\n            y: self.select(a.y, b.y),\n            z: self.select(a.z, b.z),\n        }\n    }\n}\n\nimpl Selectable<V3D, V3DxN> for m32xN {\n    type Output = V3DxN;\n    #[inline(always)]\n    fn sel(self, a: V3D, b: V3DxN) -> V3DxN {\n        V3DxN {\n            x: self.select(f32xN::splat(a.x), b.x),\n            y: self.select(f32xN::splat(a.y), b.y),\n            z: self.select(f32xN::splat(a.z), b.z),\n        }\n    }\n}\n\nimpl Mul<V3DxN> for M3x3 {\n    type Output = V3DxN;\n    #[inline(always)]\n    fn mul(self, o: V3DxN) -> Self::Output {\n        V3DxN {\n            x: o.x.mul_adde(\n                f32xN::splat(self[0].x),\n                o.y.mul_adde(\n                    f32xN::splat(self[1].x),\n                    o.z * f32xN::splat(self[2].x),\n                ),\n            ),\n            y: o.x.mul_adde(\n                f32xN::splat(self[0].y),\n                o.y.mul_adde(\n                    f32xN::splat(self[1].y),\n                    o.z * f32xN::splat(self[2].y),\n                ),\n            ),\n            z: o.x.mul_adde(\n                f32xN::splat(self[0].z),\n                o.y.mul_adde(\n                    f32xN::splat(self[1].z),\n                    o.z * f32xN::splat(self[2].z),\n                ),\n            ),\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/image.rs",
    "content": "//! Image utilities\n\nuse failure::Error;\n#[allow(unused)]\nuse png::{BitDepth, ColorType, Encoder};\nuse std::path::Path;\n\n/// PNG image in RGB format\npub struct Image {\n    width: usize,\n    height: usize,\n    data: Vec<u8>,\n    pub fdata: Vec<f32>,\n}\n\nimpl Image {\n    pub fn new(width: usize, height: usize) -> Self {\n        Self {\n            width,\n            height,\n            data: vec![0_u8; width * height * 3 /* RGBA */],\n            fdata: vec![0_f32; width * height * 3 /* RGBA */],\n        }\n    }\n\n    /// Image's `(width, height)`\n    pub fn size(&self) -> (usize, usize) {\n        (self.width, self.height)\n    }\n    /// Writes the pixels into a png image at `output`.\n    ///\n    /// `soa` specifies whether the bytes in `fdata` are in a Struct of Arrays (rrr...ggg...bbb...)\n    /// or Array of Structs (rgbrgbrgb...) format.\n    pub fn write_png(\n        &mut self,\n        output: &Path,\n        soa: bool,\n    ) -> Result<(), Error> {\n        fn clamp(x: f32) -> u8 {\n            let mut i = (x * 255.5) as isize;\n\n            if i < 0 {\n                i = 0\n            };\n            if i > 255 {\n                i = 255\n            };\n\n            i as u8\n        }\n\n        use std::fs::File;\n        use std::io::BufWriter;\n\n        let file = File::create(output)?;\n        let buf_writer = &mut BufWriter::new(file);\n        let mut encoder = Encoder::new(\n            buf_writer,\n            self.width as u32,\n            self.height as u32,\n        );\n\n        encoder.set_color(ColorType::RGB);\n        encoder.set_depth(BitDepth::Eight);\n        let mut writer = encoder.write_header().unwrap();\n\n        if soa {\n            let len = (self.width * self.height) as usize;\n            let (r, tail) = self.fdata.split_at(len);\n            let (g, b) = tail.split_at(len);\n            assert!(r.len() == len);\n            assert!(g.len() == len);\n            assert!(b.len() 
== len);\n\n            for i in 0..len {\n                self.data[3 * i + 0] = clamp(r[i]);\n                self.data[3 * i + 1] = clamp(g[i]);\n                self.data[3 * i + 2] = clamp(b[i]);\n            }\n        } else {\n            for (&fp, up) in self.fdata.iter().zip(self.data.iter_mut()) {\n                (*up) = clamp(fp);\n            }\n        }\n\n        writer.write_image_data(&self.data)?;\n        Ok(())\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/intersection/mod.rs",
    "content": "//! Intersection functions\n\n/// Intersection of `I` with `Self`\npub trait Intersect<I> {\n    type Isect;\n    fn intersect(&self, other: &I, isect: Self::Isect) -> Self::Isect;\n}\n\nmod packet;\nmod ray_plane;\nmod ray_sphere;\nmod single;\n\npub use self::packet::IsectxN;\npub use self::single::Isect;\n"
  },
  {
    "path": "examples/aobench/src/intersection/packet.rs",
    "content": "//! SIMD intersection result\n\nuse crate::geometry::{f32xN, m32xN, V3DxN};\nuse crate::intersection::Isect;\n\n/// Intersection result\n#[derive(Copy, Clone, Debug)]\npub struct IsectxN {\n    pub t: f32xN,\n    pub p: V3DxN,\n    pub n: V3DxN,\n    pub hit: m32xN,\n}\n\nimpl Default for IsectxN {\n    #[inline]\n    fn default() -> Self {\n        Self {\n            t: f32xN::splat(1e17),\n            hit: m32xN::splat(false),\n            p: V3DxN::default(),\n            n: V3DxN::default(),\n        }\n    }\n}\n\nimpl IsectxN {\n    pub fn get(&self, idx: usize) -> Isect {\n        Isect {\n            t: self.t.extract(idx),\n            p: self.p.get(idx),\n            n: self.n.get(idx),\n            hit: self.hit.extract(idx),\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/intersection/ray_plane.rs",
    "content": "//! Intersection of a ray with a plane\n\nuse crate::geometry::{f32xN, Dot, Plane, Ray, RayxN, Selectable};\nuse crate::intersection::{Intersect, Isect, IsectxN};\n\n// Scalar ray-plane intersection\nimpl Intersect<Plane> for Ray {\n    type Isect = Isect;\n    #[inline(always)]\n    fn intersect(&self, plane: &Plane, mut isect: Isect) -> Isect {\n        let ray = self;\n        let d = -plane.p.dot(plane.n);\n        let v = ray.dir.dot(plane.n);\n\n        if v.abs() < 1e-17 {\n            return isect;\n        }\n\n        let t = -(ray.origin.dot(plane.n) + d) / v;\n\n        if t > 0. && t < isect.t {\n            isect.t = t;\n            isect.hit = true;\n            isect.p = ray.origin + t * ray.dir;\n            isect.n = plane.n;\n        }\n\n        isect\n    }\n}\n\n// Vector ray-plane intersection for a packet of rays\nimpl Intersect<Plane> for RayxN {\n    type Isect = IsectxN;\n    #[inline(always)]\n    fn intersect(&self, plane: &Plane, mut isect: IsectxN) -> IsectxN {\n        let ray = self;\n        let d = -plane.p.dot(plane.n);\n        let v = ray.dir.dot(plane.n);\n\n        let _old_isect = isect;\n\n        let m = v.abs().ge(f32xN::splat(1e-17));\n        if m.any() {\n            let t = m.sel(-(ray.origin.dot(plane.n) + d) / v, isect.t);\n            let m = m & t.gt(f32xN::splat(0.)) & t.lt(isect.t);\n\n            if m.any() {\n                isect.t = m.sel(t, isect.t);\n                isect.hit |= m;\n                isect.p = m.sel(ray.origin + t * ray.dir, isect.p);\n                isect.n = m.sel(plane.n, isect.n);\n            }\n        }\n\n        #[cfg(debug_assertions)]\n        {\n            // Check that the vector and the scalar version produce the same results\n            // for the same inputs in debug builds\n            for i in 0..f32xN::lanes() {\n                let old_isect_i = _old_isect.get(i);\n                let ray_i = self.get(i);\n                let isect_i = 
ray_i.intersect(plane, old_isect_i);\n                assert!(isect_i.almost_eq(&isect.get(i)), \"{:?} !~= {:?}\\n\\nplane: {:?}\\n\\nold_isect: {:?}\\n\\nrays: {:?}\\n\\ni: {:?}\\nold_isect_i: {:?}\\nray_i: {:?}\\n\\n\", isect_i, isect.get(i), plane, _old_isect, self, i, old_isect_i, ray_i);\n            }\n        }\n\n        isect\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::geometry::{m32xN, V3DxN, V3D};\n\n    #[test]\n    fn sanity() {\n        let plane = Plane {\n            p: V3D {\n                x: 0.,\n                y: 0.,\n                z: -10.,\n            },\n            n: V3D {\n                x: 0.,\n                y: 0.,\n                z: 1.,\n            },\n        };\n\n        let ray_hit = Ray {\n            origin: V3D::default(),\n            dir: V3D {\n                x: 0.01,\n                y: 0.01,\n                z: -1.,\n            },\n        };\n        let ray_miss = Ray {\n            origin: V3D::default(),\n            dir: V3D {\n                x: 0.,\n                y: 0.,\n                z: 1.,\n            },\n        };\n\n        let isect_hit = ray_hit.intersect(&plane, Isect::default());\n        assert!(isect_hit.hit);\n        let isect_miss = ray_miss.intersect(&plane, Isect::default());\n        assert!(!isect_miss.hit);\n\n        // hit, miss, hit, miss\n\n        #[cfg(feature = \"256bit\")]\n        let z_val = f32xN::new(-1., 1., -1., 1., -1., 1., -1., 1.);\n        #[cfg(not(feature = \"256bit\"))]\n        let z_val = f32xN::new(-1., 1., -1., 1.);\n\n        let rays = RayxN {\n            origin: V3DxN::default(),\n            dir: V3DxN {\n                x: f32xN::splat(0.01),\n                y: f32xN::splat(0.01),\n                z: z_val,\n            },\n        };\n\n        let isectxN = rays.intersect(&plane, IsectxN::default());\n\n        #[cfg(feature = \"256bit\")]\n        let expected =\n            m32xN::new(true, false, true, false, true, 
false, true, false);\n        #[cfg(not(feature = \"256bit\"))]\n        let expected = m32xN::new(true, false, true, false);\n\n        assert_eq!(isectxN.hit, expected);\n\n        assert_eq!(isect_hit.t, isectxN.t.extract(0));\n        assert_eq!(isect_hit.t, isectxN.t.extract(2));\n        assert_eq!(isect_miss.t, isectxN.t.extract(1));\n        assert_eq!(isect_miss.t, isectxN.t.extract(3));\n\n        assert_eq!(isect_hit.p.x, isectxN.p.x.extract(0));\n        assert_eq!(isect_hit.p.y, isectxN.p.y.extract(0));\n        assert_eq!(isect_hit.p.z, isectxN.p.z.extract(0));\n\n        assert_eq!(isect_hit.p.x, isectxN.p.x.extract(2));\n        assert_eq!(isect_hit.p.y, isectxN.p.y.extract(2));\n        assert_eq!(isect_hit.p.z, isectxN.p.z.extract(2));\n\n        assert_eq!(isect_miss.p.x, isectxN.p.x.extract(1));\n        assert_eq!(isect_miss.p.y, isectxN.p.y.extract(1));\n        assert_eq!(isect_miss.p.z, isectxN.p.z.extract(1));\n\n        assert_eq!(isect_miss.p.x, isectxN.p.x.extract(3));\n        assert_eq!(isect_miss.p.y, isectxN.p.y.extract(3));\n        assert_eq!(isect_miss.p.z, isectxN.p.z.extract(3));\n\n        assert_eq!(isect_hit.n.x, isectxN.n.x.extract(0));\n        assert_eq!(isect_hit.n.y, isectxN.n.y.extract(0));\n        assert_eq!(isect_hit.n.z, isectxN.n.z.extract(0));\n\n        assert_eq!(isect_hit.n.x, isectxN.n.x.extract(2));\n        assert_eq!(isect_hit.n.y, isectxN.n.y.extract(2));\n        assert_eq!(isect_hit.n.z, isectxN.n.z.extract(2));\n\n        assert_eq!(isect_miss.n.x, isectxN.n.x.extract(1));\n        assert_eq!(isect_miss.n.y, isectxN.n.y.extract(1));\n        assert_eq!(isect_miss.n.z, isectxN.n.z.extract(1));\n\n        assert_eq!(isect_miss.n.x, isectxN.n.x.extract(3));\n        assert_eq!(isect_miss.n.y, isectxN.n.y.extract(3));\n        assert_eq!(isect_miss.n.z, isectxN.n.z.extract(3));\n    }\n\n    #[test]\n    fn bug() {\n        let plane = Plane {\n            p: V3D {\n                x: 0.,\n                
y: -0.5,\n                z: 0.,\n            },\n            n: V3D {\n                x: 0.,\n                y: 1.,\n                z: 0.,\n            },\n        };\n        let isect = IsectxN {\n            t: f32xN::splat(2.1931846),\n            p: V3DxN {\n                x: f32xN::splat(-0.2608384),\n                y: f32xN::splat(-0.28958648),\n                z: f32xN::splat(-2.6699374),\n            },\n            n: V3DxN {\n                x: f32xN::splat(0.47832328),\n                y: f32xN::splat(-0.579173),\n                z: f32xN::splat(0.6601253),\n            },\n            hit: m32xN::splat(true),\n        };\n        let rays = RayxN {\n            origin: V3DxN {\n                x: f32xN::splat(-0.5),\n                y: f32xN::splat(-0.4999),\n                z: f32xN::splat(-0.5),\n            },\n            dir: V3DxN {\n                x: f32xN::splat(0.10904764),\n                y: f32xN::splat(0.095894136),\n                z: f32xN::splat(-0.98940027),\n            },\n        };\n        let r = rays.intersect(&plane, isect);\n        assert_eq!(r.hit, m32xN::splat(true));\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/intersection/ray_sphere.rs",
    "content": "//! Intersection of a ray with a sphere.\n\nuse crate::geometry::{f32xN, Dot, Ray, RayxN, Selectable, Sphere};\nuse crate::intersection::{Intersect, Isect, IsectxN};\n\n// Scalar ray-sphere intersection\nimpl Intersect<Sphere> for Ray {\n    type Isect = Isect;\n    #[inline(always)]\n    fn intersect(&self, sphere: &Sphere, mut isect: Isect) -> Isect {\n        let ray = self;\n        let rs = ray.origin - sphere.center;\n\n        let b = rs.dot(ray.dir);\n        let c = rs.dot(rs) - sphere.radius * sphere.radius;\n        let d = b * b - c;\n\n        if d > 0. {\n            let t = -b - d.sqrt();\n\n            if t > 0. && t < isect.t {\n                isect.t = t;\n                isect.hit = true;\n                isect.p = ray.origin + t * ray.dir;\n                isect.n = (isect.p - sphere.center).normalized();\n            }\n        }\n\n        isect\n    }\n}\n\n// Vector ray-sphere intersection for a packet of rays\nimpl Intersect<Sphere> for RayxN {\n    type Isect = IsectxN;\n    #[inline(always)]\n    fn intersect(&self, sphere: &Sphere, mut isect: IsectxN) -> IsectxN {\n        let ray = self;\n        let rs = ray.origin - sphere.center;\n\n        let b = rs.dot(ray.dir);\n        let radius = f32xN::splat(sphere.radius);\n        let c = radius.mul_adde(-radius, rs.dot(rs));\n        let d = b.mul_adde(b, -c);\n\n        let _old_isect = isect;\n\n        let m = d.gt(f32xN::splat(0.));\n        if m.any() {\n            let t = m.sel(-b - d.sqrt(), isect.t);\n            let m = m & t.gt(f32xN::splat(0.)) & t.lt(isect.t);\n\n            if m.any() {\n                isect.t = m.sel(t, isect.t);\n                isect.hit |= m;\n                isect.p = m.sel(ray.origin + t * ray.dir, isect.p);\n                isect.n =\n                    m.sel((isect.p - sphere.center).normalized(), isect.n);\n            }\n        }\n\n        #[cfg(debug_assertions)]\n        {\n            // Check that the vector and the scalar 
version produce the same results\n            // for the same inputs in debug builds\n            for i in 0..f32xN::lanes() {\n                let old_isect_i = _old_isect.get(i);\n                let ray_i = self.get(i);\n                let isect_i = ray_i.intersect(sphere, old_isect_i);\n                assert!(isect_i.almost_eq(&isect.get(i)), \"{:?} !~= {:?}\\n\\nsphere: {:?}\\n\\nold_isect: {:?}\\n\\nrays: {:?}\\n\\ni: {:?}\\nold_isect_i: {:?}\\nray_i: {:?}\\n\\n\", isect_i, isect.get(i), sphere, _old_isect, self, i, old_isect_i, ray_i);\n            }\n        }\n\n        isect\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::geometry::{m32xN, V3DxN, V3D};\n\n    #[test]\n    fn sanity() {\n        let sphere = Sphere {\n            center: V3D {\n                x: 0.,\n                y: 0.,\n                z: -10.,\n            },\n            radius: 1.,\n        };\n\n        let ray_hit = Ray {\n            origin: V3D::default(),\n            dir: V3D {\n                x: 0.01,\n                y: 0.01,\n                z: -1.,\n            },\n        };\n        let ray_miss = Ray {\n            origin: V3D::default(),\n            dir: V3D {\n                x: 0.,\n                y: 0.,\n                z: 1.,\n            },\n        };\n\n        let isect_hit = ray_hit.intersect(&sphere, Isect::default());\n        assert!(isect_hit.hit);\n        let isect_miss = ray_miss.intersect(&sphere, Isect::default());\n        assert!(!isect_miss.hit);\n\n        // hit, miss, hit, miss\n        #[cfg(feature = \"256bit\")]\n        let z_val = f32xN::new(-1., 1., -1., 1., -1., 1., -1., 1.);\n        #[cfg(not(feature = \"256bit\"))]\n        let z_val = f32xN::new(-1., 1., -1., 1.);\n\n        let rays = RayxN {\n            origin: V3DxN::default(),\n            dir: V3DxN {\n                x: f32xN::splat(0.01),\n                y: f32xN::splat(0.01),\n                z: z_val,\n            },\n        };\n\n        
let isectxN = rays.intersect(&sphere, IsectxN::default());\n\n        #[cfg(feature = \"256bit\")]\n        let expected =\n            m32xN::new(true, false, true, false, true, false, true, false);\n        #[cfg(not(feature = \"256bit\"))]\n        let expected = m32xN::new(true, false, true, false);\n\n        assert_eq!(isectxN.hit, expected);\n\n        assert_eq!(isect_hit.t, isectxN.t.extract(0));\n        assert_eq!(isect_hit.t, isectxN.t.extract(2));\n        assert_eq!(isect_miss.t, isectxN.t.extract(1));\n        assert_eq!(isect_miss.t, isectxN.t.extract(3));\n\n        assert_eq!(isect_hit.p.x, isectxN.p.x.extract(0));\n        assert_eq!(isect_hit.p.y, isectxN.p.y.extract(0));\n        assert_eq!(isect_hit.p.z, isectxN.p.z.extract(0));\n\n        assert_eq!(isect_hit.p.x, isectxN.p.x.extract(2));\n        assert_eq!(isect_hit.p.y, isectxN.p.y.extract(2));\n        assert_eq!(isect_hit.p.z, isectxN.p.z.extract(2));\n\n        assert_eq!(isect_miss.p.x, isectxN.p.x.extract(1));\n        assert_eq!(isect_miss.p.y, isectxN.p.y.extract(1));\n        assert_eq!(isect_miss.p.z, isectxN.p.z.extract(1));\n\n        assert_eq!(isect_miss.p.x, isectxN.p.x.extract(3));\n        assert_eq!(isect_miss.p.y, isectxN.p.y.extract(3));\n        assert_eq!(isect_miss.p.z, isectxN.p.z.extract(3));\n\n        assert_eq!(isect_hit.n.x, isectxN.n.x.extract(0));\n        assert_eq!(isect_hit.n.y, isectxN.n.y.extract(0));\n        assert_eq!(isect_hit.n.z, isectxN.n.z.extract(0));\n\n        assert_eq!(isect_hit.n.x, isectxN.n.x.extract(2));\n        assert_eq!(isect_hit.n.y, isectxN.n.y.extract(2));\n        assert_eq!(isect_hit.n.z, isectxN.n.z.extract(2));\n\n        assert_eq!(isect_miss.n.x, isectxN.n.x.extract(1));\n        assert_eq!(isect_miss.n.y, isectxN.n.y.extract(1));\n        assert_eq!(isect_miss.n.z, isectxN.n.z.extract(1));\n\n        assert_eq!(isect_miss.n.x, isectxN.n.x.extract(3));\n        assert_eq!(isect_miss.n.y, isectxN.n.y.extract(3));\n        
assert_eq!(isect_miss.n.z, isectxN.n.z.extract(3));\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/intersection/single.rs",
    "content": "//! Scalar intersection result\n\nuse crate::geometry::V3D;\n\n/// Intersection result\n#[derive(Copy, Clone, Debug)]\npub struct Isect {\n    pub t: f32,\n    pub p: V3D,\n    pub n: V3D,\n    pub hit: bool,\n}\n\nimpl Default for Isect {\n    #[inline]\n    fn default() -> Self {\n        Self {\n            t: 1e17,\n            hit: false,\n            p: V3D::default(),\n            n: V3D::default(),\n        }\n    }\n}\n\nimpl Isect {\n    #[inline(always)]\n    #[must_use]\n    pub fn almost_eq(&self, rhs: &Self) -> bool {\n        const EPSILON: f32 = 1E-3;\n        (self.t - rhs.t).abs() < EPSILON\n            && self.p.almost_eq(&rhs.p)\n            && self.n.almost_eq(&rhs.n)\n            && self.hit == rhs.hit\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/ispc_.rs",
    "content": "//! Includes the ISPC implementations.\nuse crate::*;\nuse ispc::*;\n\nispc_module!(aobench);\n\npub fn ao<S: Scene>(\n    _scene: &mut S,\n    nsubsamples: usize,\n    img: &mut crate::Image,\n) {\n    let (w, h) = img.size();\n    unsafe {\n        self::aobench::ao_ispc(\n            w as i32,\n            h as i32,\n            nsubsamples as i32,\n            img.fdata.as_mut_ptr(),\n        )\n    }\n}\n\npub fn ao_tasks<S: Scene>(\n    _scene: &mut S,\n    nsubsamples: usize,\n    img: &mut crate::Image,\n) {\n    let (w, h) = img.size();\n    unsafe {\n        self::aobench::ao_ispc_tasks(\n            w as i32,\n            h as i32,\n            nsubsamples as i32,\n            img.fdata.as_mut_ptr(),\n        )\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/lib.rs",
    "content": "//! aobench: Ambient Occlusion Renderer benchmark.\n//!\n//! Based on [aobench](https://code.google.com/archive/p/aobench/) by Syoyo\n//! Fujita.\n// FIXME: Null pointer deref warning triggered in this example,\n// likely inside a macro expansion deriving from packed_simd.\n#![deny(rust_2018_idioms)]\n#![allow(non_snake_case, non_camel_case_types)]\n#![allow(\n    clippy::many_single_char_names,\n    clippy::similar_names,\n    clippy::cast_precision_loss,\n    clippy::inline_always,\n    clippy::cast_possible_truncation,\n    clippy::cast_sign_loss,\n    clippy::identity_op,\n    clippy::erasing_op,\n    clippy::must_use_candidate,\n    clippy::float_cmp\n)]\n\npub mod ambient_occlusion;\npub mod geometry;\npub mod image;\npub mod intersection;\npub mod random;\npub mod scene;\n\n#[cfg(feature = \"ispc\")]\npub mod ispc_;\npub mod scalar;\npub mod scalar_parallel;\npub mod tiled;\npub mod tiled_parallel;\npub mod vector;\npub mod vector_parallel;\n\npub use self::image::Image;\npub use self::scene::Scene;\n"
  },
  {
    "path": "examples/aobench/src/main.rs",
    "content": "//! aobench: Ambient Occlusion Renderer benchmark.\n//!\n//! Based on [aobench](https://code.google.com/archive/p/aobench/) by Syoyo\n//! Fujita.\n#![deny(rust_2018_idioms)]\n\nuse aobench_lib::*;\nuse std::path::PathBuf;\nuse structopt::StructOpt;\n\n/// Command-line arguments.\n#[derive(StructOpt, Debug)]\nstruct Opt {\n    /// Image width.\n    width: usize,\n    /// Image height.\n    height: usize,\n\n    /// Algorithm\n    #[structopt(short = \"a\", long = \"algo\")]\n    algo: String,\n\n    /// Output file.\n    #[structopt(short = \"o\", long = \"output\", parse(from_os_str))]\n    output: Option<PathBuf>,\n}\n\nconst ALGORITHMS: &[&str] = &[\n    \"scalar\",\n    \"scalar_par\",\n    \"vector\",\n    \"vector_par\",\n    \"tiled\",\n    \"tiled_par\",\n    \"ispc\",\n    \"ispc_tasks\",\n];\n\nfn main() {\n    let opt = Opt::from_args();\n    let mut scene = aobench_lib::scene::Random::default();\n    let mut img = Image::new(opt.width, opt.height);\n\n    let algorithm_name = opt.algo.as_str();\n\n    if let Some(algorithm) = ALGORITHMS.iter().find(|&&a| a == algorithm_name)\n    {\n        let d = time::Duration::span(|| match *algorithm {\n            \"scalar\" => scalar::ao(&mut scene, 2, &mut img),\n            \"scalar_par\" => scalar_parallel::ao(&mut scene, 2, &mut img),\n            \"vector\" => vector::ao(&mut scene, 2, &mut img),\n            \"vector_par\" => vector_parallel::ao(&mut scene, 2, &mut img),\n            \"tiled\" => tiled::ao(&mut scene, 2, &mut img),\n            \"tiled_par\" => tiled_parallel::ao(&mut scene, 2, &mut img),\n            \"ispc\" => {\n                #[cfg(feature = \"ispc\")]\n                {\n                    ispc_::ao(&mut scene, 2, &mut img)\n                }\n                #[cfg(not(feature = \"ispc\"))]\n                {\n                    panic!(\"the `ispc` algorithm requires building with --features=ispc\");\n                }\n            }\n            \"ispc_tasks\" => 
{\n                #[cfg(feature = \"ispc\")]\n                {\n                    ispc_::ao_tasks(&mut scene, 2, &mut img)\n                }\n                #[cfg(not(feature = \"ispc\"))]\n                {\n                    panic!(\"the `ispc_task` algorithm requires building with --features=ispc\");\n                }\n            }\n            _ => unreachable!(),\n        });\n        let image_path = opt.output.unwrap_or_else(|| {\n            PathBuf::from(format!(\"image_{}.png\", algorithm))\n        });\n        img.write_png(&image_path, false)\n            .expect(\"failed to write image\");\n\n        println!(\"time: {} ms\", d.num_milliseconds());\n    } else {\n        let mut error = format!(\n            \"unknown algorithm: \\\"{}\\\"\\nAvailable algorithms:\",\n            algorithm_name\n        );\n        for a in ALGORITHMS {\n            error.push_str(&format!(\"\\n- {}\", a));\n        }\n        panic!(\"{}\", error);\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/random.rs",
    "content": "//! Pseudo random number generators.\n//!\n//! Currently only `LFSR113` is implemented, since that is what ISPC uses, and it\n//! allows us to compare Rust's codegen with that of ISPC for the same\n//! algorithms.\n//!\n//! Use `{scalar,vector}::thread_rng()` to get a handle to the thread-local\n//! random number generator, and call `.gen()` to generate an `f32` or an\n//! `f32xN`.\n\n/// Scalar pseudo random number generator\npub mod scalar {\n    use std::cell::UnsafeCell;\n    use std::rc::Rc;\n\n    // Note: This implementation could be vectorized using an `u32x4`.\n    struct RngT(u32, u32, u32, u32);\n\n    impl RngT {\n        fn from_seed(x: u32) -> Self {\n            let z0 = x;\n            let z1 = x ^ 0xbeef_f00d;\n            let z2 = ((x & 0xffff_u32) << 16) | (x >> 16);\n            let z3 = ((x & 0xff_u32) << 24)\n                | ((x & 0xff00_u32) << 8)\n                | ((x & 0x00ff_0000_u32) >> 8)\n                | (x & 0xff00_0000_u32) >> 24;\n            Self(z0, z1, z2, z3)\n        }\n\n        pub fn gen_u32(&mut self) -> u32 {\n            let mut b = ((self.0 << 6) ^ self.0) >> 13;\n            self.0 = ((self.0 & 4_294_967_294_u32) << 18) ^ b;\n            b = ((self.1 << 2) ^ self.1) >> 27;\n            self.1 = ((self.1 & 4_294_967_288_u32) << 2) ^ b;\n            b = ((self.2 << 13) ^ self.2) >> 21;\n            self.2 = ((self.2 & 4_294_967_280_u32) << 7) ^ b;\n            b = ((self.3 << 3) ^ self.3) >> 12;\n            self.3 = ((self.3 & 4_294_967_168_u32) << 13) ^ b;\n            self.0 ^ self.1 ^ self.2 ^ self.3\n        }\n\n        pub fn gen(&mut self) -> f32 {\n            let mut v = self.gen_u32();\n            v &= (1_u32 << 23) - 1;\n            let v = f32::from_bits(0x3F80_0000 | v);\n            v - 1.\n        }\n    }\n\n    #[derive(Clone)]\n    pub struct RngH {\n        rng: Rc<UnsafeCell<RngT>>,\n    }\n\n    impl RngH {\n        pub fn gen(&mut self) -> f32 {\n            unsafe { 
(*self.rng.get()).gen() }\n        }\n    }\n\n    thread_local!(\n        static THREAD_RNG_KEY: Rc<UnsafeCell<RngT>> = {\n            Rc::new(UnsafeCell::new(RngT::from_seed(1)))\n        }\n    );\n\n    pub fn thread_rng() -> RngH {\n        RngH {\n            rng: THREAD_RNG_KEY.with(Clone::clone),\n        }\n    }\n}\n\n/// Vector pseudo random number generator\npub mod vector {\n    use crate::geometry::{f32xN, u32xN, IncrV};\n    use std::cell::UnsafeCell;\n    use std::rc::Rc;\n    struct RngT(u32xN, u32xN, u32xN, u32xN);\n\n    impl RngT {\n        fn from_seed(x: u32xN) -> Self {\n            let z0 = x;\n            let z1 = x ^ u32xN::splat(0xbeef_f00d);\n            let z2 = ((x & u32xN::splat(0xffff)) << 16) | (x >> 16);\n            let z3 = ((x & u32xN::splat(0xff)) << 24)\n                | ((x & u32xN::splat(0xff00)) << 8)\n                | ((x & u32xN::splat(0x00ff_0000)) >> 8)\n                | (x & u32xN::splat(0xff00_0000)) >> 24;\n            Self(z0, z1, z2, z3)\n        }\n\n        #[inline(always)]\n        pub fn gen_u32(&mut self) -> u32xN {\n            let mut b = ((self.0 << 6) ^ self.0) >> 13;\n            self.0 = ((self.0 & u32xN::splat(4_294_967_294)) << 18) ^ b;\n            b = ((self.1 << 2) ^ self.1) >> 27;\n            self.1 = ((self.1 & u32xN::splat(4_294_967_288)) << 2) ^ b;\n            b = ((self.2 << 13) ^ self.2) >> 21;\n            self.2 = ((self.2 & u32xN::splat(4_294_967_280)) << 7) ^ b;\n            b = ((self.3 << 3) ^ self.3) >> 12;\n            self.3 = ((self.3 & u32xN::splat(4_294_967_168)) << 13) ^ b;\n            self.0 ^ self.1 ^ self.2 ^ self.3\n        }\n\n        #[inline(always)]\n        pub fn gen(&mut self) -> f32xN {\n            let mut v = self.gen_u32();\n            v &= u32xN::splat((1_u32 << 23) - 1);\n            let v: f32xN =\n                unsafe { std::mem::transmute(u32xN::splat(0x3F80_0000) | v) };\n            v - f32xN::splat(1.)\n        }\n    }\n\n    #[derive(Clone)]\n   
 pub struct RngH {\n        rng: Rc<UnsafeCell<RngT>>,\n    }\n\n    impl RngH {\n        #[inline(always)]\n        pub fn gen(&mut self) -> f32xN {\n            unsafe { (*self.rng.get()).gen() }\n        }\n    }\n\n    thread_local!(\n        static THREAD_RNG_KEY: Rc<UnsafeCell<RngT>> = {\n            Rc::new(UnsafeCell::new(RngT::from_seed(<u32xN as IncrV>::incr(0, 1))))\n        }\n    );\n\n    pub fn thread_rng() -> RngH {\n        RngH {\n            rng: THREAD_RNG_KEY.with(Clone::clone),\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/scalar.rs",
    "content": "//! Scalar serial aobench\n\nuse crate::ambient_occlusion;\nuse crate::geometry::{Ray, V3D};\nuse crate::intersection::{Intersect, Isect};\nuse crate::scene::Scene;\n\npub fn ao<S: Scene>(\n    scene: &mut S,\n    nsubsamples: usize,\n    img: &mut crate::Image,\n) {\n    let (w, h) = img.size();\n    let image = &mut img.fdata;\n    let ns = nsubsamples;\n    for y in 0..h {\n        for x in 0..w {\n            let offset = 3 * (y * w + x);\n            for u in 0..ns {\n                for v in 0..ns {\n                    let (x, y, u, v, h, w, ns) = (\n                        x as f32, y as f32, u as f32, v as f32, h as f32,\n                        w as f32, ns as f32,\n                    );\n                    let dir: V3D = V3D {\n                        x: (x + u / ns - w / 2.) / (w / 2.) * w / h,\n                        y: -(y + v / ns - h / 2.) / (h / 2.),\n                        z: -1.,\n                    };\n                    let dir = dir.normalized();\n\n                    let ray = Ray {\n                        origin: V3D::default(),\n                        dir,\n                    };\n\n                    let mut isect = Isect::default();\n                    for s in scene.spheres() {\n                        isect = ray.intersect(s, isect);\n                    }\n                    isect = ray.intersect(scene.plane(), isect);\n\n                    let ret = if isect.hit {\n                        ambient_occlusion::scalar(scene, &isect)\n                    } else {\n                        0.\n                    };\n\n                    // Update image for AO for this ray\n                    image[offset + 0] += ret;\n                    image[offset + 1] += ret;\n                    image[offset + 2] += ret;\n                }\n            }\n            // Normalize image pixels by number of samples taken per pixel\n            let ns = (ns * ns) as f32;\n            image[offset + 0] /= ns;\n            
image[offset + 1] /= ns;\n            image[offset + 2] /= ns;\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/scalar_parallel.rs",
    "content": "//! Scalar parallel aobench\n\nuse crate::ambient_occlusion;\nuse crate::geometry::{Ray, V3D};\nuse crate::intersection::{Intersect, Isect};\nuse crate::scene::Scene;\nuse rayon::prelude::*;\n\npub fn ao<S: Scene>(_: &mut S, nsubsamples: usize, img: &mut crate::Image) {\n    let (w, h) = img.size();\n    let ns = nsubsamples;\n    img.fdata\n        .par_chunks_mut(3 * w)\n        .enumerate()\n        .for_each(|(y, image)| {\n            assert!(image.len() == 3 * w);\n            let mut scene = S::default();\n            for x in 0..w {\n                let offset = 3 * x;\n                for u in 0..ns {\n                    for v in 0..ns {\n                        let (x, y, u, v, h, w, ns) = (\n                            x as f32, y as f32, u as f32, v as f32, h as f32,\n                            w as f32, ns as f32,\n                        );\n                        let dir: V3D = V3D {\n                            x: (x + u / ns - w / 2.) / (w / 2.) * w / h,\n                            y: -(y + v / ns - h / 2.) 
/ (h / 2.),\n                            z: -1.,\n                        };\n                        let dir = dir.normalized();\n\n                        let ray = Ray {\n                            origin: V3D::default(),\n                            dir,\n                        };\n\n                        let mut isect = Isect::default();\n                        for s in scene.spheres() {\n                            isect = ray.intersect(s, isect);\n                        }\n                        isect = ray.intersect(scene.plane(), isect);\n\n                        let ret = if isect.hit {\n                            ambient_occlusion::scalar(&mut scene, &isect)\n                        } else {\n                            0.\n                        };\n\n                        // Update image for AO for this ray\n                        image[offset + 0] += ret;\n                        image[offset + 1] += ret;\n                        image[offset + 2] += ret;\n                    }\n                }\n                // Normalize image pixels by number of samples taken per pixel\n                let ns = (ns * ns) as f32;\n                image[offset + 0] /= ns;\n                image[offset + 1] /= ns;\n                image[offset + 2] /= ns;\n            }\n        });\n}\n"
  },
  {
    "path": "examples/aobench/src/scene/mod.rs",
    "content": "/// Scene interface\nuse crate::geometry::{f32xN, Plane, Sphere};\n\npub trait Scene: Send + Sync + Default {\n    const NAO_SAMPLES: usize;\n    fn rand(&mut self) -> f32;\n    fn plane(&self) -> &Plane;\n    fn spheres(&self) -> &[Sphere];\n    fn rand_f32xN(&mut self) -> (f32xN, f32xN) {\n        #[cfg(feature = \"256bit\")]\n        {\n            let r = [\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n            ];\n            (\n                f32xN::new(r[0], r[2], r[4], r[6], r[8], r[10], r[12], r[14]),\n                f32xN::new(r[1], r[3], r[5], r[7], r[9], r[11], r[13], r[15]),\n            )\n        }\n        #[cfg(not(feature = \"256bit\"))]\n        {\n            let r = [\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n                self.rand(),\n            ];\n            (\n                f32xN::new(r[0], r[2], r[4], r[6]),\n                f32xN::new(r[1], r[3], r[5], r[7]),\n            )\n        }\n    }\n}\n\nmod random;\npub use self::random::Random;\n\nmod test;\npub use self::test::Test;\n"
  },
  {
    "path": "examples/aobench/src/scene/random.rs",
    "content": "//! Aobench scene: 3 spheres and a plane using a random number generator\n\nuse crate::geometry::{f32xN, Plane, Sphere, V3D};\nuse crate::scene::Scene;\n\n#[derive(Clone)]\npub struct Random {\n    pub plane: Plane,\n    pub spheres: [Sphere; 3],\n}\n\nimpl Default for Random {\n    fn default() -> Self {\n        let plane = Plane {\n            p: V3D {\n                x: 0.,\n                y: -0.5,\n                z: 0.,\n            },\n            n: V3D {\n                x: 0.,\n                y: 1.,\n                z: 0.,\n            },\n        };\n        let spheres = [\n            Sphere {\n                center: V3D {\n                    x: -2.,\n                    y: 0.,\n                    z: -3.5,\n                },\n                radius: 0.5,\n            },\n            Sphere {\n                center: V3D {\n                    x: -0.5,\n                    y: 0.,\n                    z: -3.,\n                },\n                radius: 0.5,\n            },\n            Sphere {\n                center: V3D {\n                    x: 1.,\n                    y: 0.,\n                    z: -2.2,\n                },\n                radius: 0.5,\n            },\n        ];\n        Self { plane, spheres }\n    }\n}\n\nimpl Scene for Random {\n    const NAO_SAMPLES: usize = 8;\n    #[inline(always)]\n    fn rand(&mut self) -> f32 {\n        crate::random::scalar::thread_rng().gen()\n    }\n    #[inline(always)]\n    fn plane(&self) -> &Plane {\n        &self.plane\n    }\n    #[inline(always)]\n    fn spheres(&self) -> &[Sphere] {\n        &self.spheres\n    }\n    #[inline(always)]\n    fn rand_f32xN(&mut self) -> (f32xN, f32xN) {\n        let mut rng = crate::random::vector::thread_rng();\n        (rng.gen(), rng.gen())\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/scene/test.rs",
    "content": "//! Aobench scene: 3 spheres and a plane using a random number generator\n\nuse crate::geometry::{Plane, Sphere, V3D};\nuse crate::scene::Scene;\nuse std::num::Wrapping;\n\n#[derive(Clone)]\npub struct Test {\n    pub plane: Plane,\n    pub spheres: [Sphere; 3],\n    rands: Vec<f32>,\n    rand_step: Wrapping<usize>,\n}\n\nimpl Default for Test {\n    fn default() -> Self {\n        let plane = Plane {\n            p: V3D {\n                x: 0.,\n                y: -0.5,\n                z: 0.,\n            },\n            n: V3D {\n                x: 0.,\n                y: 1.,\n                z: 0.,\n            },\n        };\n        let spheres = [\n            Sphere {\n                center: V3D {\n                    x: -2.,\n                    y: 0.,\n                    z: -3.5,\n                },\n                radius: 0.5,\n            },\n            Sphere {\n                center: V3D {\n                    x: -0.5,\n                    y: 0.,\n                    z: -3.,\n                },\n                radius: 0.5,\n            },\n            Sphere {\n                center: V3D {\n                    x: 1.,\n                    y: 0.,\n                    z: -2.2,\n                },\n                radius: 0.5,\n            },\n        ];\n        let mut rands = Vec::new();\n        let mut rng = crate::random::scalar::thread_rng();\n        for _ in 0..2 * Self::NAO_SAMPLES * Self::NAO_SAMPLES {\n            rands.push(rng.gen());\n        }\n        let rand_step = Wrapping(0);\n        Self {\n            plane,\n            spheres,\n            rands,\n            rand_step,\n        }\n    }\n}\n\nimpl Scene for Test {\n    const NAO_SAMPLES: usize = 8;\n    fn rand(&mut self) -> f32 {\n        let v = self.rands[self.rand_step.0];\n        self.rand_step += Wrapping(1);\n        if self.rand_step\n            >= Wrapping(2 * Self::NAO_SAMPLES * Self::NAO_SAMPLES)\n        {\n            self.rand_step = 
Wrapping(0);\n        }\n        v\n    }\n    fn plane(&self) -> &Plane {\n        &self.plane\n    }\n    fn spheres(&self) -> &[Sphere] {\n        &self.spheres\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/tiled.rs",
    "content": "//! SIMD serial aobench\n\nuse crate::ambient_occlusion;\nuse crate::geometry::{f32xN, pf32xN, usizexN, IncrV, RayxN, V3DxN};\nuse crate::intersection::{Intersect, IsectxN};\nuse crate::scene::Scene;\nuse cfg_if::cfg_if;\n\n#[inline(always)]\nfn ao_impl<S: Scene>(\n    scene: &mut S,\n    nsubsamples: usize,\n    img: &mut crate::Image,\n) {\n    let (w, h) = img.size();\n    assert_eq!(w % f32xN::lanes(), 0);\n    let image = &mut img.fdata;\n    let ns = nsubsamples;\n    let inv_ns = 1. / (ns as f32);\n    let ptr = pf32xN::splat(image.as_mut_ptr());\n    for y in 0..h {\n        let yf = f32xN::splat(y as f32);\n        for x in (0..w).step_by(f32xN::lanes()) {\n            let xf = f32xN::incr(x as f32, 1.);\n            let offset = usizexN::splat(3 * (y * w + x));\n            let r_ptr = unsafe { ptr.add(offset + usizexN::incr(0, 3)) };\n            let g_ptr = unsafe { ptr.add(offset + usizexN::incr(1, 3)) };\n            let b_ptr = unsafe { ptr.add(offset + usizexN::incr(2, 3)) };\n\n            for u in 0..ns {\n                for v in 0..ns {\n                    let du = (u as f32) * inv_ns;\n                    let dv = (v as f32) * inv_ns;\n                    let (hf, wf) = (h as f32, w as f32);\n\n                    let dir = V3DxN {\n                        x: (xf + f32xN::splat(du - (wf / 2.)))\n                            / f32xN::splat((wf / 2.) 
* hf / wf),\n                        y: -(yf + f32xN::splat(dv - (hf / 2.)))\n                            / f32xN::splat(hf / 2.),\n                        z: f32xN::splat(-1.),\n                    };\n                    let dir = dir.normalized();\n\n                    let ray = RayxN {\n                        origin: V3DxN::default(),\n                        dir,\n                    };\n\n                    let mut isect = IsectxN::default();\n                    for s in scene.spheres() {\n                        isect = ray.intersect(s, isect);\n                    }\n                    isect = ray.intersect(scene.plane(), isect);\n\n                    if isect.hit.any() {\n                        let ret =\n                            ambient_occlusion::vector_tiled(scene, &isect)\n                                * f32xN::splat(inv_ns * inv_ns);\n\n                        unsafe {\n                            let img_r =\n                                r_ptr.read(isect.hit, f32xN::splat(0.));\n                            let img_g =\n                                g_ptr.read(isect.hit, f32xN::splat(0.));\n                            let img_b =\n                                b_ptr.read(isect.hit, f32xN::splat(0.));\n\n                            r_ptr.write(isect.hit, img_r + ret);\n                            g_ptr.write(isect.hit, img_g + ret);\n                            b_ptr.write(isect.hit, img_b + ret);\n                        }\n                    }\n                }\n            }\n        }\n    }\n}\n\ncfg_if! 
{\n    if #[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))] {\n        #[target_feature(enable = \"sse4.2\")]\n        unsafe fn ao_sse42<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                     img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        #[target_feature(enable = \"avx\")]\n        unsafe fn ao_avx<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                   img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        #[target_feature(enable = \"avx,fma\")]\n        unsafe fn ao_avx_fma<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                   img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        #[target_feature(enable = \"avx2,fma\")]\n        unsafe fn ao_avx2<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                    img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        pub fn ao<S: Scene>(scene: &mut S, nsubsamples: usize,\n                            img: &mut crate::Image) {\n            unsafe {\n                if is_x86_feature_detected!(\"avx2\") && is_x86_feature_detected!(\"fma\") {\n                    ao_avx2(scene, nsubsamples, img);\n                } else if is_x86_feature_detected!(\"avx\") {\n                    if is_x86_feature_detected!(\"fma\") {\n                        ao_avx_fma(scene, nsubsamples, img);\n                    } else {\n                        ao_avx(scene, nsubsamples, img);\n                    }\n                } else if is_x86_feature_detected!(\"sse4.2\") {\n                    ao_sse42(scene, nsubsamples, img);\n                } else {\n                    ao_impl(scene, nsubsamples, img);\n                }\n            }\n        }\n    } else {\n        pub fn ao<S: Scene>(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) {\n            
ao_impl(scene, nsubsamples, img);\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/tiled_parallel.rs",
    "content": "//! SIMD tiled parallel aobench\n\nuse crate::ambient_occlusion;\nuse crate::geometry::{f32xN, pf32xN, usizexN, IncrV, RayxN, V3DxN};\nuse crate::intersection::{Intersect, IsectxN};\nuse crate::scene::Scene;\nuse rayon::prelude::*;\n\npub fn ao<S: Scene>(_: &mut S, nsubsamples: usize, img: &mut crate::Image) {\n    let (w, h) = img.size();\n    assert_eq!(w % f32xN::lanes(), 0);\n    let ns = nsubsamples;\n    let inv_ns = 1. / (ns as f32);\n    let ptr = usizexN::splat(img.fdata.as_mut_ptr() as usize);\n    img.fdata\n        .par_chunks_mut(3 * w)\n        .enumerate()\n        .for_each(|(y, image)| {\n            assert!(image.len() == 3 * w);\n            let mut scene = S::default();\n            let yf = f32xN::splat(y as f32);\n            let ptr: pf32xN = unsafe { std::mem::transmute(ptr) };\n            for x in (0..w).step_by(f32xN::lanes()) {\n                let xf = f32xN::incr(x as f32, 1.);\n                let offset = usizexN::splat(3 * (y * w + x));\n                let r_ptr = unsafe { ptr.add(offset + usizexN::incr(0, 3)) };\n                let g_ptr = unsafe { ptr.add(offset + usizexN::incr(1, 3)) };\n                let b_ptr = unsafe { ptr.add(offset + usizexN::incr(2, 3)) };\n\n                for u in 0..ns {\n                    for v in 0..ns {\n                        let du = (u as f32) * inv_ns;\n                        let dv = (v as f32) * inv_ns;\n                        let (hf, wf) = (h as f32, w as f32);\n\n                        let dir = V3DxN {\n                            x: (xf + f32xN::splat(du - (wf / 2.)))\n                                / f32xN::splat((wf / 2.) 
* hf / wf),\n                            y: -(yf + f32xN::splat(dv - (hf / 2.)))\n                                / f32xN::splat(hf / 2.),\n                            z: f32xN::splat(-1.),\n                        };\n                        let dir = dir.normalized();\n\n                        let ray = RayxN {\n                            origin: V3DxN::default(),\n                            dir,\n                        };\n\n                        let mut isect = IsectxN::default();\n                        for s in scene.spheres() {\n                            isect = ray.intersect(s, isect);\n                        }\n                        isect = ray.intersect(scene.plane(), isect);\n\n                        if isect.hit.any() {\n                            let ret = ambient_occlusion::vector_tiled(\n                                &mut scene, &isect,\n                            ) * f32xN::splat(inv_ns * inv_ns);\n\n                            unsafe {\n                                let img_r =\n                                    r_ptr.read(isect.hit, f32xN::splat(0.));\n                                let img_g =\n                                    g_ptr.read(isect.hit, f32xN::splat(0.));\n                                let img_b =\n                                    b_ptr.read(isect.hit, f32xN::splat(0.));\n\n                                r_ptr.write(isect.hit, img_r + ret);\n                                g_ptr.write(isect.hit, img_g + ret);\n                                b_ptr.write(isect.hit, img_b + ret);\n                            }\n                        }\n                    }\n                }\n            }\n        });\n}\n"
  },
  {
    "path": "examples/aobench/src/vector.rs",
    "content": "//! SIMD serial aobench\n\nuse crate::ambient_occlusion;\nuse crate::geometry::{Ray, V3D};\nuse crate::intersection::{Intersect, Isect};\nuse crate::scene::Scene;\nuse cfg_if::cfg_if;\n\n#[inline(always)]\nfn ao_impl<S: Scene>(\n    scene: &mut S,\n    nsubsamples: usize,\n    img: &mut crate::Image,\n) {\n    let (w, h) = img.size();\n    let image = &mut img.fdata;\n    let ns = nsubsamples;\n    let inv_ns = 1. / (ns as f32);\n    for y in 0..h {\n        for x in 0..w {\n            let offset = 3 * (y * w + x);\n            for u in 0..ns {\n                for v in 0..ns {\n                    let du = (u as f32) * inv_ns;\n                    let dv = (v as f32) * inv_ns;\n\n                    let (x, y, h, w) =\n                        (x as f32, y as f32, h as f32, w as f32);\n\n                    let dir = V3D {\n                        x: (x + du - (w * 0.5)) / (w * 0.5) * w / h,\n                        y: -(y + dv - (h * 0.5)) / (h * 0.5),\n                        z: -1.,\n                    };\n                    let dir = dir.normalized();\n\n                    let ray = Ray {\n                        origin: V3D::default(),\n                        dir,\n                    };\n\n                    let mut isect = Isect::default();\n                    for s in scene.spheres() {\n                        isect = ray.intersect(s, isect);\n                    }\n                    isect = ray.intersect(scene.plane(), isect);\n\n                    let ret = if isect.hit {\n                        ambient_occlusion::vector(scene, &isect)\n                    } else {\n                        0.\n                    };\n                    let ret = ret * inv_ns * inv_ns;\n\n                    // Update image for AO for this ray\n                    // (already normalized)\n                    image[offset + 0] += ret;\n                    image[offset + 1] += ret;\n                    image[offset + 2] += ret;\n                
}\n            }\n        }\n    }\n}\n\ncfg_if! {\n    if #[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))] {\n        #[target_feature(enable = \"sse4.2\")]\n        unsafe fn ao_sse42<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                     img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        #[target_feature(enable = \"avx\")]\n        unsafe fn ao_avx<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                   img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        #[target_feature(enable = \"avx,fma\")]\n        unsafe fn ao_avx_fma<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                   img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        #[target_feature(enable = \"avx2,fma\")]\n        unsafe fn ao_avx2<S: Scene>(scene: &mut S, nsubsamples: usize,\n                                    img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n\n        pub fn ao<S: Scene>(scene: &mut S, nsubsamples: usize,\n                            img: &mut crate::Image) {\n            unsafe {\n                if is_x86_feature_detected!(\"avx2\") && is_x86_feature_detected!(\"fma\") {\n                    ao_avx2(scene, nsubsamples, img);\n                } else if is_x86_feature_detected!(\"avx\") {\n                    if is_x86_feature_detected!(\"fma\") {\n                        ao_avx_fma(scene, nsubsamples, img);\n                    } else {\n                        ao_avx(scene, nsubsamples, img);\n                    }\n                } else if is_x86_feature_detected!(\"sse4.2\") {\n                    ao_sse42(scene, nsubsamples, img);\n                } else {\n                    ao_impl(scene, nsubsamples, img);\n                }\n            }\n        }\n    } else {\n        pub fn ao<S: Scene>(scene: &mut S, 
nsubsamples: usize, img: &mut crate::Image) {\n            ao_impl(scene, nsubsamples, img);\n        }\n    }\n}\n"
  },
  {
    "path": "examples/aobench/src/vector_parallel.rs",
    "content": "//! SIMD parallel aobench\n\nuse crate::ambient_occlusion;\nuse crate::geometry::{Ray, V3D};\nuse crate::intersection::{Intersect, Isect};\nuse crate::scene::Scene;\nuse rayon::prelude::*;\n\npub fn ao<S: Scene>(_: &mut S, nsubsamples: usize, img: &mut crate::Image) {\n    let (w, h) = img.size();\n    let ns = nsubsamples;\n    let inv_ns = 1. / (ns as f32);\n    img.fdata\n        .par_chunks_mut(3 * w)\n        .enumerate()\n        .for_each(|(y, image)| {\n            assert!(image.len() == 3 * w);\n            let mut scene = S::default();\n            for x in 0..w {\n                let offset = 3 * x;\n                for u in 0..ns {\n                    for v in 0..ns {\n                        let du = (u as f32) * inv_ns;\n                        let dv = (v as f32) * inv_ns;\n\n                        let (x, y, h, w) =\n                            (x as f32, y as f32, h as f32, w as f32);\n\n                        let dir = V3D {\n                            x: (x + du - (w / 2.)) / (w / 2.) 
* w / h,\n                            y: -(y + dv - (h / 2.)) / (h / 2.),\n                            z: -1.,\n                        };\n                        let dir = dir.normalized();\n\n                        let ray = Ray {\n                            origin: V3D::default(),\n                            dir,\n                        };\n\n                        let mut isect = Isect::default();\n                        for s in scene.spheres() {\n                            isect = ray.intersect(s, isect);\n                        }\n                        isect = ray.intersect(scene.plane(), isect);\n\n                        let ret = if isect.hit {\n                            ambient_occlusion::vector(&mut scene, &isect)\n                        } else {\n                            0.\n                        };\n                        let ret = ret * inv_ns * inv_ns;\n\n                        // Update image for AO for this ray\n                        // (already normalized)\n                        image[offset + 0] += ret;\n                        image[offset + 1] += ret;\n                        image[offset + 2] += ret;\n                    }\n                }\n            }\n        });\n}\n"
  },
  {
    "path": "examples/aobench/volta/.gitignore",
    "content": "ao\n*.ppm\nobjs/\n"
  },
  {
    "path": "examples/aobench/volta/ao.ispc",
    "content": "// -*- mode: c++ -*-\n/*\n  Copyright (c) 2010-2011, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
\n*/\n/*\n  Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench\n*/\n\n#define NAO_SAMPLES\t\t8\n#define M_PI 3.1415926535f\n\ntypedef float<3> vec;\n\nstruct Isect {\n    float      t;\n    vec        p;\n    vec        n;\n    int        hit; \n};\n\nstruct Sphere {\n    vec        center;\n    float      radius;\n};\n\nstruct Plane {\n    vec    p;\n    vec    n;\n};\n\nstruct Ray {\n    vec org;\n    vec dir;\n};\n\nstatic inline float dot(vec a, vec b) {\n    return a.x * b.x + a.y * b.y + a.z * b.z;\n}\n\nstatic inline vec vcross(vec v0, vec v1) {\n    vec ret;\n    ret.x = v0.y * v1.z - v0.z * v1.y;\n    ret.y = v0.z * v1.x - v0.x * v1.z;\n    ret.z = v0.x * v1.y - v0.y * v1.x;\n    return ret;\n}\n\nstatic inline void vnormalize(vec &v) {\n    float len2 = dot(v, v);\n    float invlen = rsqrt(len2);\n    v *= invlen;\n}\n\n\nstatic void\nray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) {\n    float d = -dot(plane.p, plane.n);\n    float v = dot(ray.dir, plane.n);\n\n    cif (abs(v) < 1.0e-17) \n        return;\n    else {\n        float t = -(dot(ray.org, plane.n) + d) / v;\n\n        cif ((t > 0.0) && (t < isect.t)) {\n            isect.t = t;\n            isect.hit = 1;\n            isect.p = ray.org + ray.dir * t;\n            isect.n = plane.n;\n        }\n    }\n}\n\n\nstatic inline void\nray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) {\n    vec rs = ray.org - sphere.center;\n\n    float B = dot(rs, ray.dir);\n    float C = dot(rs, rs) - sphere.radius * sphere.radius;\n    float D = B * B - C;\n\n    cif (D > 0.) 
{\n        float t = -B - sqrt(D);\n\n        cif ((t > 0.0) && (t < isect.t)) {\n            isect.t = t;\n            isect.hit = 1;\n            isect.p = ray.org + t * ray.dir;\n            isect.n = isect.p - sphere.center;\n            vnormalize(isect.n);\n        }\n    }\n}\n\n\nstatic void\northoBasis(vec basis[3], vec n) {\n    basis[2] = n;\n    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;\n\n    if ((n.x < 0.6) && (n.x > -0.6)) {\n        basis[1].x = 1.0;\n    } else if ((n.y < 0.6) && (n.y > -0.6)) {\n        basis[1].y = 1.0;\n    } else if ((n.z < 0.6) && (n.z > -0.6)) {\n        basis[1].z = 1.0;\n    } else {\n        basis[1].x = 1.0;\n    }\n\n    basis[0] = vcross(basis[1], basis[2]);\n    vnormalize(basis[0]);\n\n    basis[1] = vcross(basis[2], basis[0]);\n    vnormalize(basis[1]);\n}\n\n\nstatic float\nambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3],\n                  RNGState &rngstate) {\n    float eps = 0.0001f;\n    vec p, n;\n    vec basis[3];\n    float occlusion = 0.0;\n\n    p = isect.p + eps * isect.n;\n\n    orthoBasis(basis, isect.n);\n\n    static const uniform int ntheta = NAO_SAMPLES;\n    static const uniform int nphi   = NAO_SAMPLES;\n    for (uniform int j = 0; j < ntheta; j++) {\n        for (uniform int i = 0; i < nphi; i++) {\n            Ray ray;\n            Isect occIsect;\n\n            float theta = sqrt(frandom(&rngstate));\n            float phi   = 2.0f * M_PI * frandom(&rngstate);\n            float x = cos(phi) * theta;\n            float y = sin(phi) * theta;\n            float z = sqrt(1.0 - theta * theta);\n\n            // local . 
global\n            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;\n            float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y;\n            float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z;\n\n            ray.org = p;\n            ray.dir.x = rx;\n            ray.dir.y = ry;\n            ray.dir.z = rz;\n\n            occIsect.t   = 1.0e+17;\n            occIsect.hit = 0;\n\n            for (uniform int snum = 0; snum < 3; ++snum)\n                ray_sphere_intersect(occIsect, ray, spheres[snum]); \n            ray_plane_intersect (occIsect, ray, plane); \n\n            if (occIsect.hit) occlusion += 1.0;\n        }\n    }\n\n    occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi);\n    return occlusion;\n}\n\n\n/* Compute the image for the scanlines from [y0,y1), for an overall image\n   of width w and height h.\n */\nstatic void ao_scanlines(uniform int y0, uniform int y1, uniform int w, \n                         uniform int h,  uniform int nsubsamples, \n                         uniform float image[]) {\n    static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };\n    static uniform Sphere spheres[3] = {\n        { { -2.0f, 0.0f, -3.5f }, 0.5f },\n        { { -0.5f, 0.0f, -3.0f }, 0.5f },\n        { { 1.0f, 0.0f, -2.2f }, 0.5f } };\n    RNGState rngstate;\n\n    seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15)));\n    float invSamples = 1.f / nsubsamples;\n\n    foreach_tiled(y = y0 ... y1, x = 0 ... w, \n                  u = 0 ... nsubsamples, v = 0 ... 
nsubsamples) {\n        float du = (float)u * invSamples, dv = (float)v * invSamples;\n\n        // Figure out x,y pixel in NDC\n        float px =  (x + du - (w / 2.0f)) / (w / 2.0f);\n        float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);\n\n        // Scale NDC based on width/height ratio, supporting non-square image output\n        px *= (float)w / (float)h;\n\n        float ret = 0.f;\n        Ray ray;\n        Isect isect;\n\n        ray.org = 0.f;\n\n        // Poor man's perspective projection\n        ray.dir.x = px;\n        ray.dir.y = py;\n        ray.dir.z = -1.0;\n        vnormalize(ray.dir);\n\n        isect.t   = 1.0e+17;\n        isect.hit = 0;\n\n        for (uniform int snum = 0; snum < 3; ++snum)\n            ray_sphere_intersect(isect, ray, spheres[snum]);\n        ray_plane_intersect(isect, ray, plane);\n\n        // Note use of 'coherent' if statement; the set of rays we\n        // trace will often all hit or all miss the scene\n        cif (isect.hit) {\n            ret = ambient_occlusion(isect, plane, spheres, rngstate);\n            ret *= invSamples * invSamples;\n\n            int offset = 3 * (y * w + x);\n            atomic_add_local(&image[offset], ret);\n            atomic_add_local(&image[offset+1], ret);\n            atomic_add_local(&image[offset+2], ret);\n        }\n    }\n}\n\n\nexport void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, \n                    uniform float image[]) {\n    ao_scanlines(0, h, w, h, nsubsamples, image);\n}\n\n\nstatic void task ao_task(uniform int width, uniform int height, \n                         uniform int nsubsamples, uniform float image[]) {\n    ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);\n}\n\n\nexport void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, \n                          uniform float image[]) {\n    launch[h] ao_task(w, h, nsubsamples, image);\n}\n"
  },
  {
    "path": "examples/dot_product/Cargo.toml",
    "content": "[package]\nname = \"dot_product\"\nversion = \"0.1.0\"\nauthors = [\"Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\n\n[lib]\nname = \"dot_product_lib\"\npath = \"src/lib.rs\"\n"
  },
  {
    "path": "examples/dot_product/readme.md",
    "content": "# Vector dot product\n"
  },
  {
    "path": "examples/dot_product/src/lib.rs",
    "content": "//! Vector dot product\n#![deny(rust_2018_idioms)]\n#![feature(custom_inner_attributes)]\n#![allow(clippy::must_use_candidate, clippy::float_cmp)]\n\npub mod scalar;\npub mod simd;\n\n#[cfg(test)]\n#[rustfmt::skip]\nfn test<F: Fn(&[f32], &[f32]) -> f32>(f: F) {\n    let tests: &[(&[f32], &[f32], f32)] = &[\n        (&[0_f32, 0., 0., 0.], &[0_f32, 0., 0., 0.], 0_f32),\n        (&[0_f32, 0., 0., 1.], &[0_f32, 0., 0., 1.], 1_f32),\n        (&[1_f32, 2., 3., 4.], &[0_f32, 0., 0., 0.], 0_f32),\n        (&[1_f32, 2., 3., 4.], &[1_f32, 2., 3., 4.], 30_f32),\n        (&[1_f32, 2., 3., 4., 1., 2., 3., 4.], &[1_f32, 1., 1., 1., 1., 1., 1., 1.], 20_f32),\n    ];\n\n    for &(a, b, output) in tests {\n        assert_eq!(f(a, b), output);\n    }\n}\n"
  },
  {
    "path": "examples/dot_product/src/scalar.rs",
    "content": "//! Scalar implementation\n\npub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {\n    assert_eq!(a.len(), b.len());\n    a.iter().zip(b.iter()).map(|v| v.0 * v.1).sum()\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    crate::test(dot_prod)\n}\n"
  },
  {
    "path": "examples/dot_product/src/simd.rs",
    "content": "//! Scalar implementation\n\nuse packed_simd::f32x4;\n\npub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {\n    assert_eq!(a.len(), b.len());\n    assert!(a.len() % 4 == 0);\n\n    a.chunks_exact(4)\n        .map(f32x4::from_slice_unaligned)\n        .zip(b.chunks_exact(4).map(f32x4::from_slice_unaligned))\n        .map(|(a, b)| a * b)\n        .sum::<f32x4>()\n        .sum()\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    crate::test(dot_prod)\n}\n"
  },
  {
    "path": "examples/fannkuch_redux/Cargo.toml",
    "content": "[package]\nname = \"fannkuch_redux\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\n\n[[bin]]\nname = \"fannkuch_redux\"\npath = \"src/main.rs\"\n\n[lib]\nname = \"fannkuch_redux_lib\"\npath = \"src/lib.rs\"\n"
  },
  {
    "path": "examples/fannkuch_redux/readme.md",
    "content": "# Fannkuch redux\n\nThis is the [`fannkuch redux` benchmark from the benchmarksgame][bg]. \n\n## Background and description\n\nThe fannkuch benchmark is defined by programs in [Performing Lisp Analysis of\nthe FANNKUCH\nBenchmark](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.35.5124),\nKenneth R. Anderson and Duane Rettig. FANNKUCH is an abbreviation for the German\nword __Pfannkuchen_, or pancakes, in analogy to flipping pancakes. The conjecture\nis that the maximum count is approximated by `n*log(n)` when `n` goes to infinity.\n\nEach program should:\n\n* Take a permutation of `{1,...,n}`, for example: `{4,2,1,5,3}`.\n\n* Take the first element, here `4`, and reverse the order of the first `4`\n  elements: `{5,1,2,4,3}`.\n\n* Repeat this until the first element is a `1`, so flipping won't change\n  anything more: `{3,4,2,1,5}`, `{2,4,3,1,5}`, `{4,2,3,1,5}`, `{1,3,2,4,5}`.\n\n* Count the number of flips, here `5`.\n\n* Keep a checksum\n\n  * `checksum = checksum + (if permutation_index is even then flips_count else\n    -flips_count)`\n\n  * `checksum = checksum + (toggle_sign_-1_1 * flips_count)`\n\n* Do this for all `n!` permutations, and record the maximum number of flips\n  needed for any permutation.\n\n## Usage\n\nIt takes two arguments in this order:\n\n* `n`: the input sequence length: `{1, ..., n}`\n* (optional) `algorithm`: the algorithm to use - defaults to the fastest one.\n  * `0`: scalar algorithm\n  * `1`: SIMD algorithm\n\n[bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/fannkuchredux.html#fannkuchredux\n"
  },
  {
    "path": "examples/fannkuch_redux/src/fannkuchredux-output.txt",
    "content": "228\nPfannkuchen(7) = 16\n"
  },
  {
    "path": "examples/fannkuch_redux/src/lib.rs",
    "content": "//! Fannkuch redux\n#![deny(warnings, rust_2018_idioms)]\n#![allow(non_snake_case, non_camel_case_types)]\n#![allow(\n    clippy::similar_names,\n    clippy::many_single_char_names,\n    clippy::cast_possible_truncation,\n    clippy::cast_sign_loss,\n    clippy::cast_possible_wrap,\n    clippy::must_use_candidate,\n    clippy::float_cmp\n)]\n\npub mod scalar;\npub mod simd;\n\npub fn fannkuch_redux(n: usize, alg: usize) -> (i32, i32) {\n    match alg {\n        0 => simd::fannkuch_redux(n),\n        1 => scalar::fannkuch_redux(n),\n        v => panic!(\"unknown algorithm value: {}\", v),\n    }\n}\n"
  },
  {
    "path": "examples/fannkuch_redux/src/main.rs",
    "content": "#![deny(rust_2018_idioms)]\n\nuse fannkuch_redux_lib::*;\n\nfn run<O: std::io::Write>(o: &mut O, n: usize, alg: usize) {\n    let (checksum, maxflips) = fannkuch_redux(n, alg);\n    writeln!(o, \"{}\\nPfannkuchen({}) = {}\", checksum, n, maxflips).unwrap();\n}\n\nfn main() {\n    let n: usize =\n        std::env::args().nth(1).expect(\"need one arg\").parse().unwrap();\n    assert!((3..=14).contains(&n), \"n = {} is out-of-range [3, 14]\", n);\n    let alg = if let Some(v) = std::env::args().nth(2) {\n        v.parse().unwrap()\n    } else {\n        0\n    };\n\n    run(&mut std::io::stdout(), n, alg);\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    static OUTPUT: &[u8] = include_bytes!(\"fannkuchredux-output.txt\");\n    #[test]\n    fn verify_output_simd() {\n        let mut out: Vec<u8> = Vec::new();\n\n        run(&mut out, 7, 0);\n\n        assert_eq!(out.len(), OUTPUT.len());\n        if out != OUTPUT {\n            for i in 0..out.len() {\n                assert_eq!(\n                    out[i], OUTPUT[i],\n                    \"byte {} differs - is: {:#08b} - should: {:#08b}\",\n                    i, out[i], OUTPUT[i]\n                );\n            }\n        }\n    }\n    #[test]\n    fn verify_output_scalar() {\n        let mut out: Vec<u8> = Vec::new();\n\n        run(&mut out, 7, 1);\n\n        assert_eq!(out.len(), OUTPUT.len());\n        if out != OUTPUT {\n            for i in 0..out.len() {\n                assert_eq!(\n                    out[i], OUTPUT[i],\n                    \"byte {} differs - is: {:#08b} - should: {:#08b}\",\n                    i, out[i], OUTPUT[i]\n                );\n            }\n        }\n    }\n\n}\n"
  },
  {
    "path": "examples/fannkuch_redux/src/scalar.rs",
    "content": "//! Scalar fannkuch redux implementation\n\nuse std::{cmp, mem, thread};\n\n// FIXME: replace with slice rotate\nfn rotate(x: &mut [i32]) {\n    let mut prev = x[0];\n    for place in x.iter_mut().rev() {\n        prev = mem::replace(place, prev)\n    }\n}\n\nfn next_permutation(perm: &mut [i32], count: &mut [i32]) {\n    for i in 1..perm.len() {\n        rotate(&mut perm[..=i]);\n        let count_i = &mut count[i];\n        if *count_i >= i as i32 {\n            *count_i = 0;\n        } else {\n            *count_i += 1;\n            break;\n        }\n    }\n}\n\n#[derive(Clone, Copy)]\nstruct P {\n    p: [i32; 16],\n}\n\n#[derive(Clone, Copy)]\nstruct Perm {\n    cnt: [i32; 16],\n    fact: [u32; 16],\n    n: u32,\n    permcount: u32,\n    perm: P,\n}\n\nimpl Perm {\n    fn new(n: u32) -> Self {\n        let mut fact = [1; 16];\n        for i in 1..=n as usize {\n            fact[i] = fact[i - 1] * i as u32;\n        }\n        Self { cnt: [0; 16], fact, n, permcount: 0, perm: P { p: [0; 16] } }\n    }\n\n    fn get(&mut self, mut idx: i32) -> P {\n        let mut pp = [0_u8; 16];\n        self.permcount = idx as u32;\n        for (i, place) in self.perm.p.iter_mut().enumerate() {\n            *place = i as i32 + 1;\n        }\n\n        for i in (1..self.n as usize).rev() {\n            let d = idx / self.fact[i] as i32;\n            self.cnt[i] = d;\n            idx %= self.fact[i] as i32;\n            for (place, val) in pp.iter_mut().zip(self.perm.p[..=i].iter()) {\n                *place = (*val) as u8\n            }\n\n            let d = d as usize;\n            for j in 0..=i {\n                self.perm.p[j] = i32::from(if j + d <= i {\n                    pp[j + d]\n                } else {\n                    pp[j + d - i - 1]\n                });\n            }\n        }\n\n        self.perm\n    }\n\n    fn count(&self) -> u32 {\n        self.permcount\n    }\n    fn max(&self) -> u32 {\n        self.fact[self.n as usize]\n    
}\n\n    fn next(&mut self) -> P {\n        next_permutation(&mut self.perm.p, &mut self.cnt);\n        self.permcount += 1;\n\n        self.perm\n    }\n}\n\nfn reverse(tperm: &mut [i32], k: usize) {\n    tperm[..k].reverse()\n}\n\nfn work(mut perm: Perm, n: usize, max: usize) -> (i32, i32) {\n    let mut checksum = 0;\n    let mut maxflips = 0;\n\n    let mut p = perm.get(n as i32);\n\n    while perm.count() < max as u32 {\n        let mut flips = 0;\n\n        while p.p[0] != 1 {\n            let k = p.p[0] as usize;\n            reverse(&mut p.p, k);\n            flips += 1;\n        }\n\n        checksum += if perm.count() % 2 == 0 { flips } else { -flips };\n        maxflips = cmp::max(maxflips, flips);\n\n        p = perm.next();\n    }\n\n    (checksum, maxflips)\n}\n\npub fn fannkuch_redux(n: usize) -> (i32, i32) {\n    let perm = Perm::new(n as u32);\n\n    let m = 1;\n    let mut futures = vec![];\n    let k = perm.max() / m;\n\n    for j in (0..).map(|x| x * k).take_while(|&j| j < k * m) {\n        let max = cmp::min(j + k, perm.max());\n\n        futures\n            .push(thread::spawn(move || work(perm, j as usize, max as usize)))\n    }\n\n    let mut checksum = 0;\n    let mut maxflips = 0;\n    for fut in futures {\n        let (cs, mf) = fut.join().unwrap();\n        checksum += cs;\n        maxflips = cmp::max(maxflips, mf);\n    }\n    (checksum, maxflips)\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    assert_eq!(fannkuch_redux(7), (228, 16));\n}\n"
  },
  {
    "path": "examples/fannkuch_redux/src/simd.rs",
    "content": "//! Vectorized fannkuch redux implementation\n\nuse packed_simd::*;\n\nstruct State {\n    s: [u8; 16],\n    flip_masks: [u8x16; 16],\n    rotate_masks: [u8x16; 16],\n\n    maxflips: i32,\n    odd: u16,\n    checksum: i32,\n}\n\nimpl Default for State {\n    fn default() -> Self {\n        Self {\n            s: [0; 16],\n            flip_masks: [u8x16::splat(0); 16],\n            rotate_masks: [u8x16::splat(0); 16],\n\n            maxflips: 0,\n            odd: 0,\n            checksum: 0,\n        }\n    }\n}\n\nimpl State {\n    fn rotate_sisd(&mut self, n: usize) {\n        let c = self.s[0];\n        for i in 1..=n {\n            self.s[i - 1] = self.s[i];\n        }\n        self.s[n] = c;\n    }\n    fn popmasks(&mut self) {\n        let mut mask = [0_u8; 16];\n        for i in 0..16 {\n            for (j, m) in mask.iter_mut().enumerate() {\n                *m = j as u8;\n            }\n\n            for x in 0..(i + 1) / 2 {\n                mask.swap(x, i - x);\n            }\n\n            self.flip_masks[i] = u8x16::from_slice_unaligned(&mask);\n\n            for (j, s) in self.s.iter_mut().enumerate() {\n                *s = j as u8;\n            }\n            self.rotate_sisd(i);\n            self.rotate_masks[i] = self.load_s();\n        }\n    }\n    fn rotate(&mut self, n: usize) {\n        self.load_s()\n            .shuffle1_dyn(self.rotate_masks[n])\n            .write_to_slice_unaligned(&mut self.s)\n    }\n\n    fn load_s(&self) -> u8x16 {\n        u8x16::from_slice_unaligned(&self.s)\n    }\n\n    fn tk(&mut self, n: usize) {\n        #[derive(Copy, Clone, Debug)]\n        struct Perm {\n            perm: u8x16,\n            start: u8,\n            odd: u16,\n        }\n\n        let mut perms = [Perm { perm: u8x16::splat(0), start: 0, odd: 0 }; 60];\n\n        let mut i = 0;\n        let mut c = [0_u8; 16];\n        let mut perm_max = 0;\n        // Cache this locally outside the loop, since the compiler\n        // can't 
optimize accesses to it otherwise.\n        let mut odd = self.odd;\n\n        while i < n {\n            while i < n && perm_max < 60 {\n                self.rotate(i);\n                if c[i] as usize >= i {\n                    c[i] = 0;\n                    i += 1;\n                    continue;\n                }\n\n                c[i] += 1;\n                i = 1;\n                odd = !odd;\n                if self.s[0] != 0 {\n                    if self.s[self.s[0] as usize] == 0 {\n                        if self.maxflips == 0 {\n                            self.maxflips = 1\n                        }\n                        self.checksum += if odd == 0 { 1 } else { -1 };\n                    } else {\n                        perms[perm_max].perm = self.load_s();\n                        perms[perm_max].start = self.s[0];\n                        perms[perm_max].odd = odd;\n                        perm_max += 1;\n                    }\n                }\n            }\n\n            let mut k = 0;\n            while k < std::cmp::max(1, perm_max) - 1 {\n                let pk = &perms[k];\n                let pk1 = &perms[k + 1];\n                let mut perm1 = pk.perm;\n                let mut perm2 = pk1.perm;\n\n                let mut f1 = 0;\n                let mut f2 = 0;\n                let mut toterm1 = pk.start;\n                let mut toterm2 = pk1.start;\n\n                while toterm1 != 0 && toterm2 != 0 {\n                    perm1 =\n                        perm1.shuffle1_dyn(self.flip_masks[toterm1 as usize]);\n                    perm2 =\n                        perm2.shuffle1_dyn(self.flip_masks[toterm2 as usize]);\n                    toterm1 = perm1.extract(0);\n                    toterm2 = perm2.extract(0);\n\n                    f1 += 1;\n                    f2 += 1;\n                }\n                while toterm1 != 0 {\n                    perm1 =\n                        perm1.shuffle1_dyn(self.flip_masks[toterm1 as 
usize]);\n                    toterm1 = perm1.extract(0);\n                    f1 += 1;\n                }\n                while toterm2 != 0 {\n                    perm2 =\n                        perm2.shuffle1_dyn(self.flip_masks[toterm2 as usize]);\n                    toterm2 = perm2.extract(0);\n                    f2 += 1;\n                }\n\n                if f1 > self.maxflips {\n                    self.maxflips = f1\n                }\n                if f2 > self.maxflips {\n                    self.maxflips = f2\n                }\n                self.checksum += if pk.odd == 0 { f1 } else { -f1 };\n                self.checksum += if pk1.odd == 0 { f2 } else { -f2 };\n\n                k += 2;\n            }\n            while k < perm_max {\n                let pk = &perms[k];\n                let mut perm = pk.perm;\n                let mut f = 0;\n                let mut toterm = pk.start;\n                while toterm != 0 {\n                    perm = perm.shuffle1_dyn(self.flip_masks[toterm as usize]);\n                    toterm = perm.extract(0);\n                    f += 1;\n                }\n                if f > self.maxflips {\n                    self.maxflips = f\n                }\n                self.checksum += if pk.odd == 0 { f } else { -f };\n                k += 1\n            }\n            perm_max = 0;\n        }\n    }\n}\n\npub fn fannkuch_redux(n: usize) -> (i32, i32) {\n    let mut state = State::default();\n    state.popmasks();\n\n    for i in 0..n {\n        state.s[i] = i as u8\n    }\n    state.tk(n);\n\n    (state.checksum, state.maxflips)\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    assert_eq!(fannkuch_redux(7), (228, 16));\n}\n"
  },
  {
    "path": "examples/mandelbrot/Cargo.toml",
    "content": "[package]\nname = \"mandelbrot\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nbuild = \"build.rs\"\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\nrayon = \"^1.0\"\nispc = { version = \"^1.0.4\", optional = true }\nstructopt = { version = \"0.3.0\", features = [\"color\"] }\n\n[build-dependencies]\nispc = { version = \"^1.0.4\", optional = true }\n\n[[bin]]\nname = \"mandelbrot\"\npath = \"src/main.rs\"\n\n[lib]\nname = \"mandelbrot_lib\"\npath = \"src/lib.rs\"\n\n[features]\ndefault = []\nsleef-sys = [\"packed_simd/sleef-sys\"]\ncore_arch = [\"packed_simd/core_arch\"]\n"
  },
  {
    "path": "examples/mandelbrot/benchmark.sh",
    "content": "#!/usr/bin/env bash\n#\n# Runs mandelbrot benchmarks\n\nset -ex\n\nWIDTH=800\nHEIGHT=800\n\nif [[ ${NORUN} != 1 ]]; then\n    hash hyperfine 2>/dev/null || { echo >&2 \"hyperfine is not in PATH.\"; exit 1; }\nfi\n\nif echo \"$FEATURES\" | grep -q \"ispc\"; then\n    hash ispc 2>/dev/null || { echo >&2 \"ispc is not in PATH.\"; exit 1; }\nfi\n\nRUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n         cargo build --release --features=\"${FEATURES}\"\n\nif [[ \"${VERIFY}\" == \"1\" ]]; then\n    RUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n             cargo test --release --features=\"${FEATURES}\"\nfi\n\nif [[ \"${NORUN}\" == \"1\" ]]; then\n    exit 0\nfi\n\nhyperfine \"../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo scalar\"\nhyperfine \"../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo simd\"\n\nif echo \"$FEATURES\" | grep -q \"ispc\"; then\n    hyperfine \"../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo ispc\"\nfi\n"
  },
  {
    "path": "examples/mandelbrot/build.rs",
    "content": "fn main() {\n    println!(\"cargo:rerun-if-changed=build.rs\");\n\n    #[cfg(feature = \"ispc\")]\n    {\n        if std::env::var(\"CARGO_FEATURE_ISPC\").is_ok() {\n            let mut cfg = ispc::Config::new();\n\n            if cfg!(windows) {\n                cfg.debug(false);\n            }\n\n            let ispc_files = vec![\"volta/mandelbrot.ispc\"];\n\n            for s in &ispc_files[..] {\n                cfg.file(*s);\n            }\n\n            cfg.target_isas(vec![\n                ispc::opt::TargetISA::SSE2i32x4,\n                ispc::opt::TargetISA::SSE4i32x4,\n                ispc::opt::TargetISA::AVX1i32x8,\n                ispc::opt::TargetISA::AVX2i32x8,\n                ispc::opt::TargetISA::AVX512KNLi32x16,\n            ]);\n\n            cfg.compile(\"mandelbrot\");\n        }\n    }\n}\n"
  },
  {
    "path": "examples/mandelbrot/readme.md",
    "content": "# Mandelbrot\n\nThis is the [`mandelbrot` benchmark from the benchmarksgame][bg].\n\n## Background\n\nhttp://mathworld.wolfram.com/MandelbrotSet.html\n\n## Usage\n\nIt takes four arguments in this order:\n\n* `width`: width of the image to render\n* `height`: height of the image to render\n* `algorithm`: algorithm to use:\n  * `scalar`: scalar algorithm\n  * `simd`: parallelized SIMD algorithm\n  * `ispc`: ISPC + tasks algorithm\n* `--color` (optional): enables colorized output, which also determines the image format.\n  * disabled (default): PBM: Portable BitMap format (black & white output)\n  * enabled: PPM: Portable PixMap format (colored output)\n\nThe resulting image is piped to `stdout`.\n\n`cargo run --release -- 400 400 --algo simd > output.ppm` outputs:\n\n![run_400_png](https://user-images.githubusercontent.com/904614/43190942-72bdb834-8ffa-11e8-9dcf-a9a9632ae907.png)\n\n`cargo run --release -- 400 400 --algo simd --color > output.ppm` outputs:\n\n![run_400_400_1_1_png](https://user-images.githubusercontent.com/904614/43190948-759969a4-8ffa-11e8-81a9-35e5baef3e86.png)\n\n## Performance\n\n```\n./benchmark.sh\n```\n\nOn a dual core AVX1 i5 @1.8 GHz:\n\n| 800 x 800  | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|------------|---------------------|-------------|\n| `scalar`   | 86.6                | 1.0x        |\n| `simd`     | 21.0                | 4.1x        |\n| `ispc`     | 25.7                | 3.4x        |\n\n`simd` algorithm is ~1.2x faster than `ispc`.\n\nOn a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz:\n\n| 800 x 800  | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|------------|---------------------|-------------------------|\n| `scalar`   | 50.8                | 1.0x                    |\n| `simd`     | 25.1                | 2x                      |\n| `ispc`     | 14.4                | 3.52x                   |\n\n`simd` algorithm is ~1.74x slower than `ispc`.\n\nOn a 40 core Xeon Gold 6148 CPU @ 2.40GHz:\n\n| 800 x 800  | 
time [ms] <br> Rust | speedup vs `scalar` [-] |\n|------------|---------------------|-------------|\n| `scalar`   | 59.9                | 1.0x        |\n| `simd`     | 29.9                | 2.0x        |\n| `ispc`     | 30.3                | 2.0x        |\n\n`simd` algorithm is as fast as `ispc`.\n\n[bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot\n"
  },
  {
    "path": "examples/mandelbrot/src/ispc_tasks.rs",
    "content": "//! Includes the ISPC implementations.\nuse crate::*;\nuse ispc::*;\n\nispc_module!(mandelbrot);\n\npub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec<u32> {\n    let (width, height) = dims;\n    let Range { start: left, end: right } = xr;\n    let Range { start: top, end: bottom } = yr;\n\n    let len = width * height;\n    let mut out = Vec::with_capacity(len);\n\n    unsafe {\n        mandelbrot::mandelbrot_ispc(\n            left,\n            bottom,\n            right,\n            top,\n            height as i32,\n            width as i32,\n            ITER_LIMIT as i32,\n            out.as_mut_ptr() as *mut i32,\n        );\n\n        out.set_len(len);\n    }\n\n    out\n}\n"
  },
  {
    "path": "examples/mandelbrot/src/lib.rs",
    "content": "//! The mandelbrot benchmark from the [benchmarks game][bg].\n//!\n//! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot\n\n// FIXME: Null pointer deref warning triggered in this example,\n// likely inside a macro expansion deriving from packed_simd.\n#![deny(rust_2018_idioms)]\n#![allow(\n    clippy::cast_precision_loss,\n    clippy::cast_sign_loss,\n    clippy::cast_possible_truncation,\n    clippy::must_use_candidate\n)]\n\nuse rayon::prelude::*;\nuse std::{io, ops};\n\n// Each algorithm implementation must expose a single public function,\n// `generate`:   fn generate(dimensions: Dimensions, xr: Range, yr: Range) ->\n// Vec<u8>;\n//\n// Generates the Mandelbrot fractal for a region of Cartesian space,\n// where X is bounded by `xr.begin..xr.end` and Y by `yr.begin..yr.end`.\n//\n// Returns a vector of dimensions `width * height`, where each byte is\n// the number of iterations the corresponding point reached before diverging.\n\n#[cfg(feature = \"ispc\")]\nmod ispc_tasks;\nmod scalar_par;\nmod simd_par;\n\ntype Range = ops::Range<f64>;\ntype Region = (Range, Range);\n\n/// The width and height of a generated image\npub type Dimensions = (usize, usize);\n\n/// The Mandelbrot algorithms supported by this crate.\n#[derive(Debug, Copy, Clone)]\npub enum Algorithm {\n    /// Scalar parallel algorithm\n    Scalar,\n    /// Parallel SIMD algorithm using Rayon\n    Simd,\n    /// ISPC SIMD + parallel tasks algorithm\n    Ispc,\n}\n\npub struct Mandelbrot {\n    dims: Dimensions,\n    data: Vec<u32>,\n}\n\nimpl Mandelbrot {\n    /// Generates a new image of the Mandelbrot fractal.\n    pub fn generate(dims: Dimensions, algo: Algorithm) -> Self {\n        Self::generate_region(dims, DEFAULT_REGION, algo)\n    }\n\n    /// Generates a new image containing a certain region of the Mandelbrot\n    /// fractal.\n    pub fn generate_region(\n        dims: Dimensions, region: Region, algo: Algorithm,\n    ) 
-> Self {\n        let data = match algo {\n            Algorithm::Scalar => {\n                scalar_par::generate(dims, region.0, region.1)\n            }\n            Algorithm::Simd => simd_par::generate(dims, region.0, region.1),\n            #[cfg(feature = \"ispc\")]\n            Algorithm::Ispc => ispc_tasks::generate(dims, region.0, region.1),\n            #[cfg(not(feature = \"ispc\"))]\n            Algorithm::Ispc => unimplemented!(\n                \"This crate was built with the `ispc` feature disabled\"\n            ),\n        };\n\n        Self { dims, data }\n    }\n\n    /// Writes the PBM / PPM header to the output.\n    fn write_header(\n        &self, f: &mut dyn io::Write, color: bool,\n    ) -> io::Result<()> {\n        writeln!(f, \"P{}\", if color { 6 } else { 4 })?;\n        write!(f, \"{} {}\", self.dims.0, self.dims.1)?;\n        if color {\n            write!(f, \" 255\")?;\n        }\n        writeln!(f)\n    }\n\n    /// Outputs a black/white PBM bitmap to the given writer.\n    pub fn output_pbm(&self, f: &mut dyn io::Write) -> io::Result<()> {\n        self.write_header(f, false)?;\n\n        assert_eq!(\n            self.data.len() % 8,\n            0,\n            \"Output data must be a multiple of 8\"\n        );\n        let buf = self\n            .data\n            .par_chunks(8)\n            .map(|ch| {\n                let mut result = 0;\n                ch.iter().enumerate().for_each(|(i, &count)| {\n                    let undiverged = count == ITER_LIMIT;\n                    result |= (undiverged as u8) << (7 - i);\n                });\n                result\n            })\n            .collect::<Vec<u8>>();\n\n        f.write_all(&buf)\n    }\n\n    /// Outputs a color PPM image to the given writer.\n    pub fn output_ppm(&self, f: &mut dyn io::Write) -> io::Result<()> {\n        self.write_header(f, true)?;\n\n        let buf = self\n            .data\n            .par_iter()\n            .flat_map(|&val| {\n      
          const COLORS: &[(f32, f32, f32)] = &[\n                    (0.0, 7.0, 100.0),\n                    (32.0, 107.0, 203.0),\n                    (237.0, 255.0, 255.0),\n                    (255.0, 170.0, 0.0),\n                    (0.0, 2.0, 0.0),\n                ];\n                const SCALE: u32 = 12;\n\n                let color_count = COLORS.len() as u32;\n\n                let color = if val == ITER_LIMIT {\n                    vec![0, 0, 0]\n                } else {\n                    let val = (val % SCALE) * color_count / SCALE;\n                    let left = val % color_count;\n                    let right = (left + 1) % color_count;\n\n                    let alpha = (val - left) as f32;\n                    let (r1, g1, b1) = COLORS[left as usize];\n                    let (r2, g2, b2) = COLORS[right as usize];\n                    vec![\n                        (r1 + (r2 - r1) * alpha) as u8,\n                        (g1 + (g2 - g1) * alpha) as u8,\n                        (b1 + (b2 - b1) * alpha) as u8,\n                    ]\n                };\n\n                color.into_par_iter()\n            })\n            .collect::<Vec<_>>();\n\n        f.write_all(&buf)\n    }\n}\n\n/// Returns the default region of space to generate an image for.\n///\n/// This is the region containing the fractal most people think of when they\n/// think of Mandelbrot, since values outside definitely diverge.\nconst DEFAULT_REGION: (Range, Range) = (-1.5..0.5, -1.0..1.0);\n\n/// Threshold for Mandelbrot sequence divergence\n///\n/// Complex numbers which have a modulus squared greater than this are\n/// considered to be diverging.\nconst THRESHOLD: f64 = 4.0;\n\n/// Maximum amount of iterations to perform\n///\n/// Increasing this will make more features to be visible in the image,\n/// assuming the resolution is large enough.\nconst ITER_LIMIT: u32 = 50;\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n\n    #[test]\n    #[cfg_attr(windows, ignore)]\n    fn 
verify_all() {\n        let width = 400;\n        let height = 800;\n\n        let dims = (width, height);\n\n        let verify = |actual: &[u32], expected: &[u32]| {\n            if actual != expected {\n                for row in 0..height {\n                    for column in 0..width {\n                        let idx = row * width + column;\n                        assert_eq!(\n                            actual[idx], expected[idx],\n                            \"difference at ({}, {})\",\n                            row, column,\n                        );\n                    }\n                }\n            }\n        };\n\n        eprintln!(\"Generating Mandelbrot with scalar algorithm\");\n        let scalar =\n            scalar_par::generate(dims, DEFAULT_REGION.0, DEFAULT_REGION.1);\n        assert_eq!(scalar.len(), width * height);\n\n        eprintln!(\"Generating Mandelbrot with SIMD algorithm\");\n        let simd =\n            simd_par::generate(dims, DEFAULT_REGION.0, DEFAULT_REGION.1);\n        verify(&simd[..], &scalar[..]);\n    }\n\n    fn verify_algo(algo: Algorithm) {\n        static OUTPUT: &[u8] = include_bytes!(\"mandelbrot-output.txt\");\n\n        let (width, height) = (200, 200);\n\n        let dims = (width, height);\n        let mb = Mandelbrot::generate(dims, algo);\n\n        let out = {\n            let mut out = Vec::with_capacity(width * height);\n            mb.output_pbm(&mut out).unwrap();\n            out\n        };\n\n        assert_eq!(out.len(), OUTPUT.len());\n\n        if out != OUTPUT {\n            out.into_iter().zip(OUTPUT.iter()).enumerate().for_each(\n                |(i, (a, &b))| {\n                    assert_eq!(\n                        a, b,\n                        \"byte {} differs - {:#08b} != {:#08b} (expected)\",\n                        i, a, b,\n                    );\n                },\n            );\n        }\n    }\n\n    #[test]\n    fn verify_output_scalar() {\n        
verify_algo(Algorithm::Scalar);\n    }\n\n    #[test]\n    #[cfg_attr(windows, ignore)]\n    fn verify_output_simd() {\n        verify_algo(Algorithm::Simd);\n    }\n}\n"
  },
  {
    "path": "examples/mandelbrot/src/main.rs",
    "content": "//! The Mandelbrot benchmark from the [benchmarksgame][bg]\n//!\n//! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot\n\n#![deny(rust_2018_idioms)]\n\nuse mandelbrot_lib::*;\nuse std::io;\nuse structopt::StructOpt;\n\n/// Mandelbrot image generator.\n///\n/// Output is printed to `stdout`.\n#[derive(StructOpt)]\nstruct Opt {\n    /// Image width.\n    width: usize,\n    /// Image height.\n    height: usize,\n\n    /// Enable this to output a color image.\n    #[structopt(short = \"c\", long = \"color\")]\n    color: bool,\n\n    /// Algorithm\n    #[structopt(short = \"a\", long = \"algo\")]\n    algo: String,\n}\n\nconst ALGORITHMS: &[&str] = &[\"scalar\", \"simd\", \"ispc\"];\n\nfn main() {\n    let opt = Opt::from_args();\n\n    let algo = match opt.algo.as_str() {\n        \"scalar\" => Algorithm::Scalar,\n        \"simd\" => Algorithm::Simd,\n        \"ispc\" => Algorithm::Ispc,\n        algo => panic!(\n            \"Unknown algorithm: {:?}\\nAvailable algorithms: {:?}\",\n            algo, ALGORITHMS\n        ),\n    };\n\n    let mb = Mandelbrot::generate((opt.width, opt.height), algo);\n\n    let mut stdout = io::stdout();\n    if opt.color {\n        mb.output_ppm(&mut stdout).unwrap();\n    } else {\n        mb.output_pbm(&mut stdout).unwrap();\n    }\n}\n"
  },
  {
    "path": "examples/mandelbrot/src/scalar_par.rs",
    "content": "//! Scalar mandelbrot implementation\n\nuse crate::*;\n\n/// Complex number\n#[repr(align(16))]\n#[derive(Copy, Clone)]\nstruct Complex {\n    real: f64,\n    imag: f64,\n}\n\nimpl Complex {\n    /// Returns true if this member of the Mandelbrot sequence is diverging\n    #[inline]\n    fn diverged(&self) -> bool {\n        let Self { real: x, imag: y } = self;\n\n        let xx = x * x;\n        let yy = y * y;\n        let sum = xx + yy;\n\n        sum > THRESHOLD\n    }\n}\n\n/// An iterator yielding the infinite Mandelbrot sequence\nstruct MandelbrotIter {\n    /// Initial value which generated this sequence\n    start: Complex,\n    /// Current iteration value\n    current: Complex,\n}\n\nimpl MandelbrotIter {\n    /// Creates a new Mandelbrot sequence iterator for a given starting point\n    fn new(start: Complex) -> Self {\n        Self { start, current: start }\n    }\n\n    /// Returns the number of iterations it takes for the Mandelbrot sequence\n    /// to diverge at this point, or `ITER_LIMIT` if it doesn't diverge.\n    fn count(mut self) -> u32 {\n        let mut z = self.start;\n        for i in 0..ITER_LIMIT {\n            if z.diverged() {\n                return i;\n            }\n\n            z = self.next().unwrap();\n        }\n        ITER_LIMIT\n    }\n}\n\nimpl Iterator for MandelbrotIter {\n    type Item = Complex;\n\n    /// Generates the next value in the sequence\n    #[inline]\n    fn next(&mut self) -> Option<Complex> {\n        let Complex { real: c_x, imag: c_y } = self.start;\n        let Complex { real: x, imag: y } = self.current;\n\n        let xx = x * x;\n        let yy = y * y;\n        let xy = x * y;\n\n        let new_x = c_x + (xx - yy);\n        let new_y = c_y + (xy + xy);\n\n        self.current = Complex { real: new_x, imag: new_y };\n\n        Some(self.current)\n    }\n}\n\npub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec<u32> {\n    let (width, height) = dims;\n\n    let xs = {\n       
 let dx = (xr.end - xr.start) / (width as f64);\n\n        let mut buf = Vec::new();\n\n        (0..width)\n            .into_par_iter()\n            .map(|j| xr.start + dx * (j as f64))\n            .collect_into_vec(&mut buf);\n\n        buf\n    };\n\n    let dy = (yr.end - yr.start) / (height as f64);\n\n    let len = width * height;\n    let mut out = Vec::with_capacity(len);\n    unsafe {\n        out.set_len(len);\n    }\n\n    out.par_chunks_mut(width).enumerate().for_each(|(i, row)| {\n        let y = yr.start + dy * (i as f64);\n        row.iter_mut().enumerate().for_each(|(j, count)| {\n            let x = xs[j];\n            let z = Complex { real: x, imag: y };\n            *count = MandelbrotIter::new(z).count() as u32;\n        });\n    });\n\n    out\n}\n"
  },
  {
    "path": "examples/mandelbrot/src/simd_par.rs",
    "content": "//! Vectorized parallel Mandelbrot implementation\n#![allow(non_camel_case_types)]\n\nuse crate::*;\nuse packed_simd::*;\n\ntype u64s = u64x8;\ntype u32s = u32x8;\ntype f64s = f64x8;\ntype m64s = m64x8;\n\n/// Storage for complex numbers in SIMD format.\n/// The real and imaginary parts are kept in separate registers.\n#[derive(Copy, Clone)]\nstruct Complex {\n    real: f64s,\n    imag: f64s,\n}\n\nimpl Complex {\n    /// Returns a mask describing which members of the Mandelbrot sequence\n    /// haven't diverged yet\n    #[inline]\n    fn undiverged(&self) -> m64s {\n        let Self { real: x, imag: y } = *self;\n\n        let xx = x * x;\n        let yy = y * y;\n        let sum = xx + yy;\n\n        sum.le(f64s::splat(THRESHOLD))\n    }\n}\n\n/// Mandelbrot sequence iterator using SIMD.\nstruct MandelbrotIter {\n    /// Initial value which generated this sequence\n    start: Complex,\n    /// Current iteration value\n    current: Complex,\n}\n\nimpl MandelbrotIter {\n    /// Creates a new Mandelbrot sequence iterator for a given starting point\n    fn new(start: Complex) -> Self {\n        Self { start, current: start }\n    }\n\n    /// Returns the number of iterations it takes for each member of the\n    /// Mandelbrot sequence to diverge at this point, or `ITER_LIMIT` if\n    /// they don't diverge.\n    ///\n    /// This function will operate on N complex numbers at once, where N is the\n    /// number of lanes in a SIMD vector of doubles.\n    fn count(mut self) -> u32s {\n        let mut z = self.start;\n        let mut count = u64s::splat(0);\n        for _ in 0..ITER_LIMIT {\n            // Keep track of those lanes which haven't diverged yet. The other\n            // ones will be masked off.\n            let undiverged = z.undiverged();\n\n            // Stop the iteration if they all diverged. 
Note that we don't do\n            // this check every iteration, since a branch\n            // misprediction can hurt more than doing some extra\n            // calculations.\n            if undiverged.none() {\n                break;\n            }\n\n            count += undiverged.select(u64s::splat(1), u64s::splat(0));\n\n            z = self.next().unwrap();\n        }\n        count.cast()\n    }\n}\n\nimpl Iterator for MandelbrotIter {\n    type Item = Complex;\n\n    /// Generates the next values in the sequence\n    #[inline]\n    fn next(&mut self) -> Option<Complex> {\n        let Complex { real: c_x, imag: c_y } = self.start;\n        let Complex { real: x, imag: y } = self.current;\n\n        let xx = x * x;\n        let yy = y * y;\n        let xy = x * y;\n\n        let new_x = c_x + (xx - yy);\n        let new_y = c_y + (xy + xy);\n\n        self.current = Complex { real: new_x, imag: new_y };\n\n        Some(self.current)\n    }\n}\n\npub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec<u32> {\n    let (width, height) = dims;\n\n    let block_size = f64s::lanes();\n\n    assert_eq!(\n        width % block_size,\n        0,\n        \"image width = {} is not divisible by the number of vector lanes = {}\",\n        width,\n        block_size,\n    );\n\n    let width_in_blocks = width / block_size;\n\n    // The initial X values are the same for every row.\n    let xs = unsafe {\n        let dx = (xr.end - xr.start) / (width as f64);\n        let mut buf: Vec<f64s> = vec![f64s::splat(0.); width_in_blocks];\n\n        std::slice::from_raw_parts_mut(buf.as_mut_ptr() as *mut f64, width)\n            .iter_mut()\n            .enumerate()\n            .for_each(|(j, x)| {\n                *x = xr.start + dx * (j as f64);\n            });\n\n        buf\n    };\n\n    let dy = (yr.end - yr.start) / (height as f64);\n\n    let len = width_in_blocks * height;\n    let mut out = Vec::with_capacity(len);\n    unsafe {\n        out.set_len(len);\n   
 }\n\n    out.par_chunks_mut(width_in_blocks).enumerate().for_each(|(i, row)| {\n        let y = f64s::splat(yr.start + dy * (i as f64));\n        row.iter_mut().enumerate().for_each(|(j, count)| {\n            let x = xs[j];\n            let z = Complex { real: x, imag: y };\n            *count = MandelbrotIter::new(z).count();\n        });\n    });\n\n    // This is safe, we're transmuting from a more-aligned type to a\n    // less-aligned one.\n    #[allow(clippy::unsound_collection_transmute)]\n    unsafe {\n        let mut out: Vec<u32> = std::mem::transmute(out);\n        out.set_len(width * height);\n        out\n    }\n}\n"
  },
  {
    "path": "examples/mandelbrot/volta/mandelbrot.ispc",
    "content": "/*\n  Copyright (c) 2010-2012, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
\n*/\n\nstatic inline int mandel(double c_re, double c_im, int count) {\n    double z_re = c_re, z_im = c_im;\n    int i;\n    for (i = 0; i < count; ++i) {\n        if (z_re * z_re + z_im * z_im > 4.)\n            break;\n\n        double new_re = z_re*z_re - z_im*z_im;\n        double new_im = 2.f * z_re * z_im;\n        unmasked {\n            z_re = c_re + new_re;\n            z_im = c_im + new_im;\n        }\n    }\n\n    return i;\n}\n\nexport void mandelbrot_ispc(uniform double x0, uniform double y0, \n                            uniform double x1, uniform double y1,\n                            uniform int width, uniform int height, \n                            uniform int maxIterations,\n                            uniform int output[])\n{\n    double dx = (x1 - x0) / width;\n    double dy = (y1 - y0) / height;\n\n    for (uniform int j = 0; j < height; j++) {\n        // Note that we'll be doing programCount computations in parallel,\n        // so increment i by that much.  This assumes that width evenly\n        // divides programCount.\n        foreach (i = 0 ... width) {\n            // Figure out the position on the complex plane to compute the\n            // number of iterations at.  Note that the x values are\n            // different across different program instances, since its\n            // initializer incorporates the value of the programIndex\n            // variable.\n            double x = x0 + i * dx;\n            double y = y0 + j * dy;\n\n            int index = j * width + i;\n            output[index] = mandel(x, y, maxIterations);\n        }\n    }\n}"
  },
  {
    "path": "examples/matrix_inverse/Cargo.toml",
    "content": "[package]\nname = \"matrix_inverse\"\nversion = \"0.1.0\"\nauthors = [\"Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\n\n[lib]\nname = \"matrix_inverse_lib\"\npath = \"src/lib.rs\"\n"
  },
  {
    "path": "examples/matrix_inverse/readme.md",
    "content": "# 4x4 matrix inverse \n"
  },
  {
    "path": "examples/matrix_inverse/src/lib.rs",
    "content": "//! 4x4 matrix inverse\n#![feature(custom_inner_attributes)]\n#![deny(rust_2018_idioms)]\n#![allow(clippy::must_use_candidate)]\n\npub mod scalar;\npub mod simd;\n\n#[derive(Copy, Clone, Debug, PartialEq, PartialOrd)]\npub struct Matrix4x4([[f32; 4]; 4]);\n\n#[cfg(test)]\n#[rustfmt::skip]\nfn test<F: Fn(Matrix4x4) -> Option<Matrix4x4>>(f: F) {\n    let tests: &[(Matrix4x4, Option<Matrix4x4>)] = &[\n        // Identity:\n        (Matrix4x4([\n            [1., 0., 0., 0.],\n            [0., 1., 0., 0.],\n            [0., 0., 1., 0.],\n            [0., 0., 0., 1.],\n         ]),\n         Some(Matrix4x4([\n             [1., 0., 0., 0.],\n             [0., 1., 0., 0.],\n             [0., 0., 1., 0.],\n             [0., 0., 0., 1.],\n         ]))\n        ),\n        // None:\n        (Matrix4x4([\n            [1., 2., 3., 4.],\n            [12., 11., 10., 9.],\n            [5., 6., 7., 8.],\n            [16., 15., 14., 13.],\n        ]),\n         None\n        ),\n        // Other:\n        (Matrix4x4([\n            [1., 1., 1., 0.],\n            [0., 3., 1., 2.],\n            [2., 3., 1., 0.],\n            [1., 0., 2., 1.],\n        ]),\n         Some(Matrix4x4([\n             [-3., -0.5,   1.5,  1.0],\n             [ 1., 0.25, -0.25, -0.5],\n             [ 3., 0.25, -1.25, -0.5],\n             [-3., 0.0,    1.0,  1.0],\n         ]))\n        ),\n\n\n    ];\n\n    for &(input, output) in tests {\n        assert_eq!(f(input), output);\n    }\n}\n"
  },
  {
    "path": "examples/matrix_inverse/src/scalar.rs",
    "content": "//! Scalar implementation\n#[rustfmt::skip]\nuse crate::*;\n\n#[allow(clippy::too_many_lines)]\npub fn inv4x4(m: Matrix4x4) -> Option<Matrix4x4> {\n    let m = m.0;\n\n    let mut inv = [\n        [ // row 0:\n            // 0,0:\n            m[1][1]  * m[2][2] * m[3][3] -\n            m[1][1]  * m[2][3] * m[3][2] -\n            m[2][1]  * m[1][2]  * m[3][3] +\n            m[2][1]  * m[1][3]  * m[3][2] +\n            m[3][1] * m[1][2]  * m[2][3] -\n            m[3][1] * m[1][3]  * m[2][2],\n            // 0,1:\n           -m[0][1]  * m[2][2] * m[3][3] +\n            m[0][1]  * m[2][3] * m[3][2] +\n            m[2][1]  * m[0][2] * m[3][3] -\n            m[2][1]  * m[0][3] * m[3][2] -\n            m[3][1] * m[0][2] * m[2][3] +\n            m[3][1] * m[0][3] * m[2][2],\n            // 0,2:\n            m[0][1]  * m[1][2] * m[3][3] -\n            m[0][1]  * m[1][3] * m[3][2] -\n            m[1][1]  * m[0][2] * m[3][3] +\n            m[1][1]  * m[0][3] * m[3][2] +\n            m[3][1] * m[0][2] * m[1][3] -\n            m[3][1] * m[0][3] * m[1][2],\n            // 0,3:\n           -m[0][1] * m[1][2] * m[2][3] +\n            m[0][1] * m[1][3] * m[2][2] +\n            m[1][1] * m[0][2] * m[2][3] -\n            m[1][1] * m[0][3] * m[2][2] -\n            m[2][1] * m[0][2] * m[1][3] +\n            m[2][1] * m[0][3] * m[1][2],\n        ],\n        [ // row 1\n            // 1,0:\n           -m[1][0]  * m[2][2] * m[3][3] +\n            m[1][0]  * m[2][3] * m[3][2] +\n            m[2][0]  * m[1][2]  * m[3][3] -\n            m[2][0]  * m[1][3]  * m[3][2] -\n            m[3][0] * m[1][2]  * m[2][3] +\n            m[3][0] * m[1][3]  * m[2][2],\n            // 1,1:\n            m[0][0]  * m[2][2] * m[3][3] -\n            m[0][0]  * m[2][3] * m[3][2] -\n            m[2][0]  * m[0][2] * m[3][3] +\n            m[2][0]  * m[0][3] * m[3][2] +\n            m[3][0] * m[0][2] * m[2][3] -\n            m[3][0] * m[0][3] * m[2][2],\n            // 1,2:\n           -m[0][0]  * 
m[1][2] * m[3][3] +\n            m[0][0]  * m[1][3] * m[3][2] +\n            m[1][0]  * m[0][2] * m[3][3] -\n            m[1][0]  * m[0][3] * m[3][2] -\n            m[3][0] * m[0][2] * m[1][3] +\n            m[3][0] * m[0][3] * m[1][2],\n            // 1,3:\n            m[0][0] * m[1][2] * m[2][3] -\n            m[0][0] * m[1][3] * m[2][2] -\n            m[1][0] * m[0][2] * m[2][3] +\n            m[1][0] * m[0][3] * m[2][2] +\n            m[2][0] * m[0][2] * m[1][3] -\n            m[2][0] * m[0][3] * m[1][2],\n        ],\n        [ // row 2\n            // 2,0:\n            m[1][0]  * m[2][1] * m[3][3] -\n            m[1][0]  * m[2][3] * m[3][1] -\n            m[2][0]  * m[1][1] * m[3][3] +\n            m[2][0]  * m[1][3] * m[3][1] +\n            m[3][0] * m[1][1] * m[2][3] -\n            m[3][0] * m[1][3] * m[2][1],\n            // 2,1:\n           -m[0][0]  * m[2][1] * m[3][3] +\n            m[0][0]  * m[2][3] * m[3][1] +\n            m[2][0]  * m[0][1] * m[3][3] -\n            m[2][0]  * m[0][3] * m[3][1] -\n            m[3][0] * m[0][1] * m[2][3] +\n            m[3][0] * m[0][3] * m[2][1],\n            // 2,2:\n            m[0][0]  * m[1][1] * m[3][3] -\n            m[0][0]  * m[1][3] * m[3][1] -\n            m[1][0]  * m[0][1] * m[3][3] +\n            m[1][0]  * m[0][3] * m[3][1] +\n            m[3][0] * m[0][1] * m[1][3] -\n            m[3][0] * m[0][3] * m[1][1],\n            // 2,3:\n           -m[0][0] * m[1][1] * m[2][3] +\n            m[0][0] * m[1][3] * m[2][1] +\n            m[1][0] * m[0][1] * m[2][3] -\n            m[1][0] * m[0][3] * m[2][1] -\n            m[2][0] * m[0][1] * m[1][3] +\n            m[2][0] * m[0][3] * m[1][1],\n        ],\n        [ // row 3\n            // 3,0:\n           -m[1][0]  * m[2][1] * m[3][2] +\n            m[1][0]  * m[2][2] * m[3][1] +\n            m[2][0]  * m[1][1] * m[3][2] -\n            m[2][0]  * m[1][2] * m[3][1] -\n            m[3][0] * m[1][1] * m[2][2] +\n            m[3][0] * m[1][2] * m[2][1],\n            
// 3,1:\n            m[0][0]  * m[2][1] * m[3][2] -\n            m[0][0]  * m[2][2] * m[3][1] -\n            m[2][0]  * m[0][1] * m[3][2] +\n            m[2][0]  * m[0][2] * m[3][1] +\n            m[3][0] * m[0][1] * m[2][2] -\n            m[3][0] * m[0][2] * m[2][1],\n            // 3,2:\n           -m[0][0]  * m[1][1] * m[3][2] +\n            m[0][0]  * m[1][2] * m[3][1] +\n            m[1][0]  * m[0][1] * m[3][2] -\n            m[1][0]  * m[0][2] * m[3][1] -\n            m[3][0] * m[0][1] * m[1][2] +\n            m[3][0] * m[0][2] * m[1][1],\n            // 3,3:\n            m[0][0] * m[1][1] * m[2][2] -\n            m[0][0] * m[1][2] * m[2][1] -\n            m[1][0] * m[0][1] * m[2][2] +\n            m[1][0] * m[0][2] * m[2][1] +\n            m[2][0] * m[0][1] * m[1][2] -\n            m[2][0] * m[0][2] * m[1][1],\n        ],\n    ];\n\n    let det = m[0][0] * inv[0][0] + m[0][1] * inv[1][0] +\n              m[0][2] * inv[2][0] + m[0][3] * inv[3][0];\n    if det == 0. { return None; }\n\n    let det_inv = 1. / det;\n\n    for row in &mut inv {\n        for elem in row.iter_mut() {\n            *elem *= det_inv;\n        }\n    }\n\n    Some(Matrix4x4(inv))\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    crate::test(inv4x4)\n}\n"
  },
  {
    "path": "examples/matrix_inverse/src/simd.rs",
    "content": "//! 4x4 matrix inverse using SIMD\nuse crate::*;\nuse packed_simd::shuffle;\n\nuse packed_simd::f32x4;\n\npub fn inv4x4(m: Matrix4x4) -> Option<Matrix4x4> {\n    let m = m.0;\n    let m_0 = f32x4::from_slice_unaligned(&m[0]);\n    let m_1 = f32x4::from_slice_unaligned(&m[1]);\n    let m_2 = f32x4::from_slice_unaligned(&m[2]);\n    let m_3 = f32x4::from_slice_unaligned(&m[3]);\n\n    let tmp1: f32x4 = shuffle!(m_0, m_1, [0, 1, 4, 5]);\n    let row1: f32x4 = shuffle!(m_2, m_3, [0, 1, 4, 5]);\n\n    let row0 = shuffle!(tmp1, row1, [0, 2, 4, 6]);\n    let row1: f32x4 = shuffle!(row1, tmp1, [1, 3, 5, 7]);\n\n    let tmp1: f32x4 = shuffle!(m_0, m_1, [2, 3, 6, 7]);\n    let row3: f32x4 = shuffle!(m_2, m_3, [2, 3, 6, 7]);\n    let row2 = shuffle!(tmp1, row3, [0, 2, 4, 6]);\n    let row3 = shuffle!(row3, tmp1, [1, 3, 5, 7]);\n\n    let tmp1: f32x4 = row2 * row3;\n    let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]);\n\n    let minor0 = row1 * tmp1;\n    let minor1 = row0 * tmp1;\n    let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]);\n    let minor0 = (row1 * tmp1) - minor0;\n    let minor1 = (row0 * tmp1) - minor1;\n    let minor1 = shuffle!(minor1, [2, 3, 0, 1]);\n\n    let tmp1 = row1 * row2;\n    let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]);\n    let minor0 = (row3 * tmp1) + minor0;\n    let minor3 = row0 * tmp1;\n    let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]);\n\n    let minor0 = minor0 - row3 * tmp1;\n    let minor3 = row0 * tmp1 - minor3;\n    let minor3 = shuffle!(minor3, [2, 3, 0, 1]);\n\n    let tmp1 = row3 * shuffle!(row1, [2, 3, 0, 1]);\n    let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]);\n    let row2 = shuffle!(row2, [2, 3, 0, 1]);\n    let minor0 = row2 * tmp1 + minor0;\n    let minor2 = row0 * tmp1;\n    let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]);\n    let minor0 = minor0 - row2 * tmp1;\n    let minor2 = row0 * tmp1 - minor2;\n    let minor2 = shuffle!(minor2, [2, 3, 0, 1]);\n\n    let tmp1 = row0 * row1;\n    let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]);\n    let minor2 = minor2 + 
row3 * tmp1;\n    let minor3 = row2 * tmp1 - minor3;\n    let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]);\n    let minor2 = row3 * tmp1 - minor2;\n    let minor3 = minor3 - row2 * tmp1;\n\n    let tmp1 = row0 * row3;\n    let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]);\n    let minor1 = minor1 - row2 * tmp1;\n    let minor2 = row1 * tmp1 + minor2;\n    let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]);\n    let minor1 = row2 * tmp1 + minor1;\n    let minor2 = minor2 - row1 * tmp1;\n\n    let tmp1 = row0 * row2;\n    let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]);\n    let minor1 = row3 * tmp1 + minor1;\n    let minor3 = minor3 - row1 * tmp1;\n    let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]);\n    let minor1 = minor1 - row3 * tmp1;\n    let minor3 = row1 * tmp1 + minor3;\n\n    let det = row0 * minor0;\n    let det = shuffle!(det, [2, 3, 0, 1]) + det;\n    let det = shuffle!(det, [1, 0, 3, 2]) + det;\n\n    if det.sum() == 0. {\n        return None;\n    }\n    let tmp1 = det.recpre();\n    let det = tmp1 + tmp1 - det * tmp1 * tmp1;\n\n    let res0 = minor0 * det;\n    let res1 = minor1 * det;\n    let res2 = minor2 * det;\n    let res3 = minor3 * det;\n\n    let mut m = m;\n\n    res0.write_to_slice_unaligned(&mut m[0]);\n    res1.write_to_slice_unaligned(&mut m[1]);\n    res2.write_to_slice_unaligned(&mut m[2]);\n    res3.write_to_slice_unaligned(&mut m[3]);\n\n    Some(Matrix4x4(m))\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    crate::test(inv4x4)\n}\n"
  },
  {
    "path": "examples/nbody/Cargo.toml",
    "content": "[package]\nname = \"nbody\"\nversion = \"0.1.0\"\nauthors = [\"Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\n\n[[bin]]\nname = \"nbody\"\npath = \"src/main.rs\"\n\n[lib]\nname = \"nbody_lib\"\npath = \"src/lib.rs\"\n\n[features]\ndefault = [ ]\nsleef-sys = [ \"packed_simd/sleef-sys\" ]\ncore_arch = [ \"packed_simd/core_arch\" ]\n"
  },
  {
    "path": "examples/nbody/benches/algs.rs",
    "content": "//! n-body benchmarks\n#![feature(test)]\n\nextern crate nbody_lib;\nextern crate test;\n\nuse test::{black_box, Bencher};\n\n#[bench]\nfn simd(b: &mut Bencher) {\n    b.iter(|| black_box(nbody_lib::simd::run(black_box(10_000))))\n}\n\n#[bench]\nfn scalar(b: &mut Bencher) {\n    b.iter(|| black_box(nbody_lib::scalar::run(black_box(10_000))))\n}\n"
  },
  {
    "path": "examples/nbody/readme.md",
    "content": "# N-Body\n\nThis is the [`n-body` benchmark from the benchmarksgame][bg]. It models the orbits\nof Jovian planets, using the same simple symplectic-integrator.\n\n## Usage\n\nIt takes two arguments in this order:\n\n* `n`: the number of iterations to perform\n* (optional) `algorithm`: the algorithm to use - defaults to the fastest one.\n  * `0`: scalar algorithm\n  * `1`: SIMD algorithm\n\n## Implementation\n\nThere are three kernels, two of which are only run twice independently of the\nnumber of iterations (`offset_momentum` and `energy`). The `advance` kernel is\nrun once per iterations and uses 100% of the running time.\n\n[bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody\n"
  },
  {
    "path": "examples/nbody/src/lib.rs",
    "content": "//! The N-body benchmark from the [benchmarks game][bg].\n//!\n//! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody\n#![deny(rust_2018_idioms)]\n#![allow(\n    clippy::similar_names,\n    clippy::excessive_precision,\n    clippy::must_use_candidate\n)]\n\npub mod scalar;\npub mod simd;\n\npub fn run(n: usize, alg: usize) -> (f64, f64) {\n    match alg {\n        0 => scalar::run(n),\n        1 => simd::run(n),\n        v => panic!(\"unknown algorithm value: {}\", v),\n    }\n}\n\n#[cfg(test)]\nconst RESULTS: &[(usize, &str, &str)] =\n    &[(1_000_usize, \"-0.169075164\", \"-0.169087605\")];\n"
  },
  {
    "path": "examples/nbody/src/main.rs",
    "content": "//! The N-body benchmark from the [benchmarks game][bg].\n//!\n//! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody\n#![deny(rust_2018_idioms)]\n\nfn run<O: std::io::Write>(o: &mut O, n: usize, alg: usize) {\n    let (energy_before, energy_after) = nbody_lib::run(n, alg);\n\n    writeln!(o, \"{:.9}\", energy_before).unwrap();\n    writeln!(o, \"{:.9}\", energy_after).unwrap();\n}\n\nfn main() {\n    let n: usize = std::env::args()\n        .nth(1)\n        .expect(\"need one arg\")\n        .parse()\n        .expect(\"argument should be a usize\");\n\n    let alg: usize = if let Some(v) = std::env::args().nth(2) {\n        v.parse().expect(\"second argument must be a usize\")\n    } else {\n        1 // SIMD algorithm\n    };\n\n    run(&mut std::io::stdout(), n, alg);\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    static OUTPUT: &[u8] = include_bytes!(\"nbody-output.txt\");\n    #[test]\n    fn verify_output_simd() {\n        let mut out: Vec<u8> = Vec::new();\n\n        run(&mut out, 1000, 0);\n\n        assert_eq!(out.len(), OUTPUT.len());\n        if out != OUTPUT {\n            for i in 0..out.len() {\n                assert_eq!(\n                    out[i], OUTPUT[i],\n                    \"byte {} differs - is: {:#08b} - should: {:#08b}\",\n                    i, out[i], OUTPUT[i]\n                );\n            }\n        }\n    }\n    #[test]\n    fn verify_output_scalar() {\n        let mut out: Vec<u8> = Vec::new();\n\n        run(&mut out, 1000, 1);\n\n        assert_eq!(out.len(), OUTPUT.len());\n        if out != OUTPUT {\n            for i in 0..out.len() {\n                assert_eq!(\n                    out[i], OUTPUT[i],\n                    \"byte {} differs - is: {:#08b} - should: {:#08b}\",\n                    i, out[i], OUTPUT[i]\n                );\n            }\n        }\n    }\n\n}\n"
  },
  {
    "path": "examples/nbody/src/nbody-output.txt",
    "content": "-0.169075164\n-0.169087605\n"
  },
  {
    "path": "examples/nbody/src/scalar.rs",
    "content": "// The Computer Language Benchmarks Game\n// https://benchmarksgame-team.pages.debian.net\n//\n// contributed by the Rust Project Developers\n// contributed by TeXitoi\n\nuse std::f64::consts::PI;\nconst SOLAR_MASS: f64 = 4.0 * PI * PI;\nconst DAYS_PER_YEAR: f64 = 365.24;\n\nstruct Body {\n    x: [f64; 3],\n    v: [f64; 3],\n    mass: f64,\n}\n\nconst N_BODIES: usize = 5;\n#[allow(clippy::unreadable_literal)]\nconst BODIES: [Body; N_BODIES] = [\n    // Sun\n    Body { x: [0., 0., 0.], v: [0., 0., 0.], mass: SOLAR_MASS },\n    // Jupiter\n    Body {\n        x: [\n            4.84143144246472090e+00,\n            -1.16032004402742839e+00,\n            -1.03622044471123109e-01,\n        ],\n        v: [\n            1.66007664274403694e-03 * DAYS_PER_YEAR,\n            7.69901118419740425e-03 * DAYS_PER_YEAR,\n            -6.90460016972063023e-05 * DAYS_PER_YEAR,\n        ],\n        mass: 9.54791938424326609e-04 * SOLAR_MASS,\n    },\n    // Saturn\n    Body {\n        x: [\n            8.34336671824457987e+00,\n            4.12479856412430479e+00,\n            -4.03523417114321381e-01,\n        ],\n        v: [\n            -2.76742510726862411e-03 * DAYS_PER_YEAR,\n            4.99852801234917238e-03 * DAYS_PER_YEAR,\n            2.30417297573763929e-05 * DAYS_PER_YEAR,\n        ],\n        mass: 2.85885980666130812e-04 * SOLAR_MASS,\n    },\n    // Uranus\n    Body {\n        x: [\n            1.28943695621391310e+01,\n            -1.51111514016986312e+01,\n            -2.23307578892655734e-01,\n        ],\n        v: [\n            2.96460137564761618e-03 * DAYS_PER_YEAR,\n            2.37847173959480950e-03 * DAYS_PER_YEAR,\n            -2.96589568540237556e-05 * DAYS_PER_YEAR,\n        ],\n        mass: 4.36624404335156298e-05 * SOLAR_MASS,\n    },\n    // Neptune\n    Body {\n        x: [\n            1.53796971148509165e+01,\n            -2.59193146099879641e+01,\n            1.79258772950371181e-01,\n        ],\n        v: [\n            
2.68067772490389322e-03 * DAYS_PER_YEAR,\n            1.62824170038242295e-03 * DAYS_PER_YEAR,\n            -9.51592254519715870e-05 * DAYS_PER_YEAR,\n        ],\n        mass: 5.15138902046611451e-05 * SOLAR_MASS,\n    },\n];\n\nfn advance(bodies: &mut [Body; N_BODIES], dt: f64) {\n    let mut b_slice: &mut [_] = bodies;\n    while let Some(bi) = shift_mut_ref(&mut b_slice) {\n        for bj in b_slice.iter_mut() {\n            let mut dx = [0.; 3];\n            for (dx, (x_i, x_j)) in\n                dx.iter_mut().zip(bi.x.iter().zip(bj.x.iter()))\n            {\n                *dx = x_i - x_j;\n            }\n\n            let mut d2: f64 = 0.;\n            for dx in &dx {\n                d2 += dx * dx;\n            }\n            let mag = dt / (d2 * d2.sqrt());\n\n            let massi_mag = bi.mass * mag;\n            let massj_mag = bj.mass * mag;\n            for (v_j, (v_i, dx)) in\n                bj.v.iter_mut().zip(bi.v.iter_mut().zip(dx.iter()))\n            {\n                *v_j += dx * massi_mag;\n                *v_i -= dx * massj_mag;\n            }\n        }\n        for (x, v) in bi.x.iter_mut().zip(bi.v.iter()) {\n            *x += dt * v;\n        }\n    }\n}\n\nfn energy(bodies: &[Body; N_BODIES]) -> f64 {\n    let mut e = 0.0;\n    let mut bodies = bodies.iter();\n    while let Some(bi) = bodies.next() {\n        let mut e_l = 0.;\n        for v in &bi.v {\n            e_l += v * v;\n        }\n        e += e_l * bi.mass / 2.0;\n        for bj in bodies.clone() {\n            let mut dist = 0.;\n            for (xi, xj) in bi.x.iter().zip(bj.x.iter()) {\n                let dx = xi - xj;\n                dist += dx * dx;\n            }\n            e -= bi.mass * bj.mass / dist.sqrt();\n        }\n    }\n    e\n}\n\nfn offset_momentum(bodies: &mut [Body; N_BODIES]) {\n    let mut p = [0.; 3];\n    for bi in bodies.iter() {\n        for (p, v) in p.iter_mut().zip(bi.v.iter()) {\n            *p += v * bi.mass;\n        }\n    }\n    let 
sun = &mut bodies[0];\n    for (v, p) in sun.v.iter_mut().zip(p.iter()) {\n        *v = -p / SOLAR_MASS;\n    }\n}\n\n/// Pop a mutable reference off the head of a slice, mutating the slice to no\n/// longer contain the mutable reference.\n#[allow(clippy::mut_mut)]\nfn shift_mut_ref<'a, T>(r: &mut &'a mut [T]) -> Option<&'a mut T> {\n    if r.is_empty() {\n        return None;\n    }\n    let tmp = std::mem::replace(r, &mut []);\n    let (h, t) = tmp.split_at_mut(1);\n    *r = t;\n    Some(&mut h[0])\n}\n\npub fn run(n: usize) -> (f64, f64) {\n    let mut bodies = BODIES;\n    offset_momentum(&mut bodies);\n    let a = energy(&bodies);\n    for _ in 0..n {\n        advance(&mut bodies, 0.01);\n    }\n    let b = energy(&bodies);\n    (a, b)\n}\n\n#[cfg(test)]\nmod tests {\n    #[test]\n    fn test() {\n        for &(size, a_e, b_e) in crate::RESULTS {\n            let (a, b) = super::run(size);\n            assert_eq!(format!(\"{:.9}\", a), a_e);\n            assert_eq!(format!(\"{:.9}\", b), b_e);\n        }\n    }\n}\n"
  },
  {
    "path": "examples/nbody/src/simd.rs",
    "content": "#![deny(warnings)]\n\nuse packed_simd::*;\n\nuse std::f64::consts::PI;\nconst SOLAR_MASS: f64 = 4.0 * PI * PI;\nconst DAYS_PER_YEAR: f64 = 365.24;\n\npub struct Body {\n    pub x: f64x4,\n    pub v: f64x4,\n    pub mass: f64,\n}\nconst N_BODIES: usize = 5;\n#[allow(clippy::unreadable_literal)]\nconst BODIES: [Body; N_BODIES] = [\n    // sun:\n    Body {\n        x: f64x4::new(0., 0., 0., 0.),\n        v: f64x4::new(0., 0., 0., 0.),\n        mass: SOLAR_MASS,\n    },\n    // jupiter:\n    Body {\n        x: f64x4::new(\n            4.84143144246472090e+00,\n            -1.16032004402742839e+00,\n            -1.03622044471123109e-01,\n            0.,\n        ),\n        v: f64x4::new(\n            1.66007664274403694e-03 * DAYS_PER_YEAR,\n            7.69901118419740425e-03 * DAYS_PER_YEAR,\n            -6.90460016972063023e-05 * DAYS_PER_YEAR,\n            0.,\n        ),\n        mass: 9.54791938424326609e-04 * SOLAR_MASS,\n    },\n    // saturn:\n    Body {\n        x: f64x4::new(\n            8.34336671824457987e+00,\n            4.12479856412430479e+00,\n            -4.03523417114321381e-01,\n            0.,\n        ),\n        v: f64x4::new(\n            -2.76742510726862411e-03 * DAYS_PER_YEAR,\n            4.99852801234917238e-03 * DAYS_PER_YEAR,\n            2.30417297573763929e-05 * DAYS_PER_YEAR,\n            0.,\n        ),\n        mass: 2.85885980666130812e-04 * SOLAR_MASS,\n    },\n    // uranus:\n    Body {\n        x: f64x4::new(\n            1.28943695621391310e+01,\n            -1.51111514016986312e+01,\n            -2.23307578892655734e-01,\n            0.,\n        ),\n        v: f64x4::new(\n            2.96460137564761618e-03 * DAYS_PER_YEAR,\n            2.37847173959480950e-03 * DAYS_PER_YEAR,\n            -2.96589568540237556e-05 * DAYS_PER_YEAR,\n            0.,\n        ),\n        mass: 4.36624404335156298e-05 * SOLAR_MASS,\n    },\n    // neptune:\n    Body {\n        x: f64x4::new(\n            
1.53796971148509165e+01,\n            -2.59193146099879641e+01,\n            1.79258772950371181e-01,\n            0.,\n        ),\n        v: f64x4::new(\n            2.68067772490389322e-03 * DAYS_PER_YEAR,\n            1.62824170038242295e-03 * DAYS_PER_YEAR,\n            -9.51592254519715870e-05 * DAYS_PER_YEAR,\n            0.,\n        ),\n        mass: 5.15138902046611451e-05 * SOLAR_MASS,\n    },\n];\n\npub fn offset_momentum(bodies: &mut [Body; N_BODIES]) {\n    let (sun, rest) = bodies.split_at_mut(1);\n    let sun = &mut sun[0];\n    for body in rest {\n        let m_ratio = body.mass / SOLAR_MASS;\n        sun.v -= body.v * m_ratio;\n    }\n}\n\npub fn energy(bodies: &[Body; N_BODIES]) -> f64 {\n    let mut e = 0.;\n    for i in 0..N_BODIES {\n        let bi = &bodies[i];\n        e += bi.mass * (bi.v * bi.v).sum() * 0.5;\n        for bj in bodies.iter().take(N_BODIES).skip(i + 1) {\n            let dx = bi.x - bj.x;\n            e -= bi.mass * bj.mass / (dx * dx).sum().sqrt()\n        }\n    }\n    e\n}\n\npub fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {\n    const N: usize = N_BODIES * (N_BODIES - 1) / 2;\n\n    // compute distance between bodies:\n    let mut r = [f64x4::splat(0.); N];\n    {\n        let mut i = 0;\n        for j in 0..N_BODIES {\n            for k in j + 1..N_BODIES {\n                r[i] = bodies[j].x - bodies[k].x;\n                i += 1;\n            }\n        }\n    }\n\n    let mut mag = [0.0; N];\n    let mut i = 0;\n    while i < N {\n        let d2s = f64x2::new((r[i] * r[i]).sum(), (r[i + 1] * r[i + 1]).sum());\n        let dmags = f64x2::splat(dt) / (d2s * d2s.sqrte());\n        dmags.write_to_slice_unaligned(&mut mag[i..]);\n        i += 2;\n    }\n\n    i = 0;\n    for j in 0..N_BODIES {\n        for k in j + 1..N_BODIES {\n            let f = r[i] * mag[i];\n            bodies[j].v -= f * bodies[k].mass;\n            bodies[k].v += f * bodies[j].mass;\n            i += 1\n        }\n    }\n    for body in 
bodies {\n        body.x += dt * body.v\n    }\n}\n\npub fn run_k<K>(n: usize, k: K) -> (f64, f64)\nwhere\n    K: Fn(&mut [Body; N_BODIES], f64),\n{\n    let mut bodies = BODIES;\n    offset_momentum(&mut bodies);\n    let energy_before = energy(&bodies);\n    for _ in 0..n {\n        k(&mut bodies, 0.01);\n    }\n    let energy_after = energy(&bodies);\n\n    (energy_before, energy_after)\n}\n\npub fn run(n: usize) -> (f64, f64) {\n    run_k(n, advance)\n}\n\n#[cfg(test)]\nmod tests {\n    #[test]\n    fn test() {\n        for &(size, a_e, b_e) in crate::RESULTS {\n            let (a, b) = super::run(size);\n            assert_eq!(format!(\"{:.9}\", a), a_e);\n            assert_eq!(format!(\"{:.9}\", b), b_e);\n        }\n    }\n}\n"
  },
  {
    "path": "examples/options_pricing/Cargo.toml",
    "content": "[package]\nname = \"options_pricing\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\ntime = \"^0.1\"\nrayon = \"^1.0\"\nispc = { version = \"^1.0.4\", optional = true }\n\n[build-dependencies]\nispc = { version = \"^1.0.4\", optional = true }\n\n[[bin]]\nname = \"options_pricing\"\npath = \"src/main.rs\"\n\n[lib]\nname = \"options_pricing_lib\"\npath = \"src/lib.rs\"\n\n[features]\ndefault = []\ncore_arch = [ \"packed_simd/core_arch\" ]\nsleef-sys = [ \"packed_simd/sleef-sys\" ]\nispc_libm = [ \"ispc\" ]\n"
  },
  {
    "path": "examples/options_pricing/benchmark.sh",
    "content": "#!/usr/bin/env bash\n#\n# Runs options_pricing benchmarks\n\nset -ex\n\nNUM_OPTIONS_BLACK_SCHOLES=10000000\n\nif [[ ${NORUN} != 1 ]]; then\n    hash hyperfine 2>/dev/null || { echo >&2 \"hyperfine is not in PATH.\"; exit 1; }\nfi\n\n# Black-Scholes:\nALGS=(\"black_scholes_scalar\" \"black_scholes_simd\" \"black_scholes_simd_par\")\nif echo \"$FEATURES\" | grep -q \"ispc\"; then\n    hash ispc 2>/dev/null || { echo >&2 \"ispc is not in PATH.\"; exit 1; }\n    ALGS+=(\"black_scholes_ispc\" \"black_scholes_ispc_tasks\")\nfi\n\n\nRUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n         cargo build --release --features=\"${FEATURES}\"\n\nif [[ \"${NORUN}\" == \"1\" ]]; then\n    exit 0\nfi\n\n#for alg in \"${ALGS[@]}\"\n#do\n#    hyperfine \"../target/release/options_pricing ${NUM_OPTIONS_BLACK_SCHOLES} ${alg}\"\n#done\n\n# Binomial put:\nALGS=(\"binomial_put_scalar\" \"binomial_put_simd\" \"binomial_put_simd_par\")\nif echo \"$FEATURES\" | grep -q \"ispc\"; then\n    ALGS+=(\"binomial_put_ispc\" \"binomial_put_ispc_tasks\")\nfi\n\nNUM_OPTIONS_BINOMIAL_PUT=500000\n\nfor alg in \"${ALGS[@]}\"\ndo\n    hyperfine \"../target/release/options_pricing ${NUM_OPTIONS_BINOMIAL_PUT} ${alg}\"\ndone\n"
  },
  {
    "path": "examples/options_pricing/build.rs",
    "content": "fn main() {\n    println!(\"cargo:rerun-if-changed=build.rs\");\n\n    #[cfg(feature = \"ispc\")]\n    {\n        if std::env::var(\"CARGO_FEATURE_ISPC\").is_ok() {\n            let mut cfg = ispc::Config::new();\n\n            if cfg!(windows) {\n                cfg.debug(false);\n            }\n\n            let ispc_files = vec![\"volta/options.ispc\"];\n\n            for s in &ispc_files[..] {\n                cfg.file(*s);\n            }\n\n            cfg.target_isas(vec![\n                ispc::opt::TargetISA::SSE2i32x4,\n                ispc::opt::TargetISA::SSE4i32x4,\n                ispc::opt::TargetISA::AVX1i32x8,\n                ispc::opt::TargetISA::AVX2i32x8,\n                ispc::opt::TargetISA::AVX512KNLi32x16,\n            ]);\n\n            #[cfg(feature = \"ispc_libm\")]\n            {\n                // Use the system's libm\n                cfg.math_lib(ispc::opt::MathLib::System);\n            }\n\n            cfg.compile(\"options\");\n        }\n    }\n}\n"
  },
  {
    "path": "examples/options_pricing/readme.md",
    "content": "# Options Pricing ISPC example\n\nThis is the [`options` ISPC benchmark][ispc]:\n\n> This program implements both the Black-Scholes and \n> Binomial options pricing models.\n\n## Usage\n\n```\ncargo run --release --features=ispc -- ${SIZE} ${ALGORITHM}\n```\n\n## Results\n\n```\n./benchmark.sh\n```\n\n## Black-Scholes\n\nOn a dual core AVX1 i5 @1.8 GHz:\n\n| 800 x 800    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     |                998 | 1.0x                       |\n| `simd`       |                367 | 2.7x                       |\n| `par_simd`   |               246 | 4.1x                       |\n| `ispc`       |                360 | 2.8x                       |\n| `ispc+tasks` |               248 | 4.0x                       |\n\n`par_simd` and `ispc+tasks` algorithms are on par.\n\n## Binomial put\n\nOn a dual core AVX1 i5 @1.8 GHz:\n\n| 800 x 800    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     |               2057 | 1.0x                       |\n| `simd`       |               651 | 3.2x                       |\n| `par_simd`   |               279 | 7.4x                       |\n| `ispc`       |                805 | 2.6x                       |\n| `ispc+tasks` |               404 | 5.1x                       |\n\n`par_simd` algorithm is ~1.4x faster than `ispc+tasks`.\n\n\n[ispc]: https://github.com/ispc/ispc/tree/master/examples/options\n"
  },
  {
    "path": "examples/options_pricing/src/ispc_.rs",
    "content": "//! Includes the ISPC implementations.\n\nuse ispc::*;\nispc_module!(options);\n\npub mod black_scholes {\n    use super::*;\n\n    pub fn serial(\n        sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n        result: &mut [f32], count: usize,\n    ) -> f64 {\n        unsafe {\n            self::options::black_scholes_ispc(\n                sa.as_ptr() as *mut f32,\n                xa.as_ptr() as *mut f32,\n                ta.as_ptr() as *mut f32,\n                ra.as_ptr() as *mut f32,\n                va.as_ptr() as *mut f32,\n                result.as_mut_ptr(),\n                count as i32,\n            )\n        }\n    }\n\n    pub fn tasks(\n        sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n        result: &mut [f32], count: usize,\n    ) -> f64 {\n        unsafe {\n            self::options::black_scholes_ispc_tasks(\n                sa.as_ptr() as *mut f32,\n                xa.as_ptr() as *mut f32,\n                ta.as_ptr() as *mut f32,\n                ra.as_ptr() as *mut f32,\n                va.as_ptr() as *mut f32,\n                result.as_mut_ptr(),\n                count as i32,\n            )\n        }\n    }\n}\n\npub mod binomial_put {\n    use super::*;\n\n    pub fn serial(\n        sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n        result: &mut [f32], count: usize,\n    ) -> f64 {\n        unsafe {\n            self::options::binomial_put_ispc(\n                sa.as_ptr() as *mut f32,\n                xa.as_ptr() as *mut f32,\n                ta.as_ptr() as *mut f32,\n                ra.as_ptr() as *mut f32,\n                va.as_ptr() as *mut f32,\n                result.as_mut_ptr(),\n                count as i32,\n            )\n        }\n    }\n\n    pub fn tasks(\n        sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n        result: &mut [f32], count: usize,\n    ) -> f64 {\n        unsafe {\n            self::options::binomial_put_ispc_tasks(\n  
              sa.as_ptr() as *mut f32,\n                xa.as_ptr() as *mut f32,\n                ta.as_ptr() as *mut f32,\n                ra.as_ptr() as *mut f32,\n                va.as_ptr() as *mut f32,\n                result.as_mut_ptr(),\n                count as i32,\n            )\n        }\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    #[test]\n    fn black_scholes() {\n        const NOPTS: usize = 1_000_000;\n        let mut serial = crate::State::new(NOPTS);\n        let mut tasks = crate::State::new(NOPTS);\n\n        let serial_sum = serial.exec(black_scholes::serial);\n        let tasks_sum = tasks.exec(black_scholes::tasks);\n\n        assert_eq!(serial, tasks);\n        assert_eq!(serial_sum, tasks_sum);\n    }\n\n    #[test]\n    fn binomial_put() {\n        const NOPTS: usize = 1_000_000;\n        let mut serial = crate::State::new(NOPTS);\n        let mut tasks = crate::State::new(NOPTS);\n\n        let serial_sum = serial.exec(binomial_put::serial);\n        let tasks_sum = tasks.exec(binomial_put::tasks);\n\n        assert_eq!(serial, tasks);\n        assert_eq!(serial_sum, tasks_sum);\n    }\n}\n"
  },
  {
    "path": "examples/options_pricing/src/lib.rs",
    "content": "#![deny(rust_2018_idioms)]\n#![allow(\n    clippy::inline_always,\n    clippy::many_single_char_names,\n    clippy::excessive_precision,\n    clippy::cast_precision_loss,\n    clippy::cast_possible_truncation,\n    clippy::cast_possible_wrap,\n    clippy::must_use_candidate,\n    clippy::too_many_arguments,\n    clippy::float_cmp\n)]\n\nuse packed_simd::f32x8 as f32s;\nuse packed_simd::f64x8 as f64s;\n\nconst BINOMIAL_NUM: usize = 64;\n\n#[cfg(feature = \"ispc\")]\npub mod ispc_;\npub mod scalar;\npub mod simd;\npub mod simd_kernels;\npub mod simd_par;\npub mod sum;\n\n#[derive(PartialEq, Debug)]\npub struct State {\n    s: Vec<f32>,\n    x: Vec<f32>,\n    t: Vec<f32>,\n    r: Vec<f32>,\n    v: Vec<f32>,\n    result: Vec<f32>,\n    count: usize,\n}\n\nimpl State {\n    pub fn new(count: usize) -> Self {\n        Self {\n            s: vec![100.; count],\n            x: vec![98.; count],\n            t: vec![2.; count],\n            r: vec![0.02; count],\n            v: vec![5.; count],\n            result: vec![0.0; count],\n            count,\n        }\n    }\n    pub fn exec<F>(&mut self, model: F) -> f64\n    where\n        F: Fn(\n            &[f32],\n            &[f32],\n            &[f32],\n            &[f32],\n            &[f32],\n            &mut [f32],\n            usize,\n        ) -> f64,\n    {\n        model(\n            &self.s,\n            &self.x,\n            &self.t,\n            &self.r,\n            &self.v,\n            &mut self.result,\n            self.count,\n        )\n    }\n}\n\n#[cfg(test)]\nfn almost_equal(a: f64, b: f64, max_rel_diff: f64) -> bool {\n    let diff = (a - b).abs();\n    let a = a.abs();\n    let b = b.abs();\n    let largest = a.max(b);\n\n    diff <= largest * max_rel_diff\n}\n"
  },
  {
    "path": "examples/options_pricing/src/main.rs",
    "content": "#![deny(warnings, rust_2018_idioms)]\n#![feature(custom_inner_attributes)]\n\nuse options_pricing_lib::*;\n\n#[rustfmt::skip]\nfn run<F>(name: &str, count: usize, f: F)\nwhere\n    F: Fn(&[f32], &[f32], &[f32], &[f32], &[f32], &mut [f32], usize) -> f64,\n{\n    let mut d = State::new(count);\n    let t = time::Duration::span(move || { d.exec(f); } );\n    println!(\"{}: {} ms\", name, t.num_milliseconds());\n}\n\nmacro_rules! ispc_alg {\n    ($name:tt, $count:ident, $fun:path) => {{\n        #[cfg(feature = \"ispc\")]\n        {\n            run($name, $count, $fun);\n        }\n        #[cfg(not(feature = \"ispc\"))]\n        {\n            panic!(\"algorithm {} requires --feature=ispc\", $name);\n        }\n    }};\n}\n\nfn main() {\n    let mut args = std::env::args();\n    args.next();\n    let num_options: usize = args\n        .next()\n        .unwrap()\n        .parse()\n        .expect(\"expected argument 1 of type usize: num_options\");\n    let algorithm: String = args\n        .next()\n        .unwrap()\n        .parse()\n        .expect(\"expected argument 2 of type String: algorithm\");\n\n    match algorithm.as_str() {\n        \"black_scholes_ispc_tasks\" => ispc_alg!(\n            \"black_scholes_ispc_tasks\",\n            num_options,\n            ispc_::black_scholes::tasks\n        ),\n        \"black_scholes_ispc\" => ispc_alg!(\n            \"black_scholes_ispc\",\n            num_options,\n            ispc_::black_scholes::serial\n        ),\n        \"binomial_put_ispc_tasks\" => ispc_alg!(\n            \"binomial_put_ispc_tasks\",\n            num_options,\n            ispc_::binomial_put::tasks\n        ),\n        \"binomial_put_ispc\" => ispc_alg!(\n            \"binomial_put_ispc\",\n            num_options,\n            ispc_::binomial_put::serial\n        ),\n        \"black_scholes_scalar\" => {\n            run(\"black_scholes_scalar\", num_options, scalar::black_scholes)\n        }\n        \"binomial_put_scalar\" => 
{\n            run(\"binomial_put_scalar\", num_options, scalar::binomial_put)\n        }\n        \"black_scholes_simd\" => {\n            run(\"black_scholes_simd\", num_options, simd::black_scholes)\n        }\n        \"binomial_put_simd\" => {\n            run(\"binomial_put_simd\", num_options, simd::binomial_put)\n        }\n        \"black_scholes_simd_par\" => {\n            run(\"black_scholes_simd_par\", num_options, simd_par::black_scholes)\n        }\n        \"binomial_put_simd_par\" => {\n            run(\"binomial_put_simd_par\", num_options, simd_par::binomial_put)\n        }\n        _ => panic!(\"unknown algorithm: {}\", algorithm),\n    }\n}\n"
  },
  {
    "path": "examples/options_pricing/src/scalar.rs",
    "content": "//! Scalar implementation\n\n// Cumulative normal distribution function\n#[inline(always)]\nfn cnd(x: f32) -> f32 {\n    const INV_SQRT_2PI: f32 = 0.398_942_280_40;\n\n    let l = x.abs();\n    let k = 1. / (1. + 0.231_641_9 * l);\n    let k2 = k * k;\n    let k3 = k2 * k;\n    let k4 = k2 * k2;\n    let k5 = k3 * k2;\n    let w: f32 = 0.319_381_53 * k - 0.356_563_782 * k2\n        + 1.781_477_937 * k3\n        + -1.821_255_978 * k4\n        + 1.330_274_429 * k5;\n    let w = w * INV_SQRT_2PI * (-l * l * 0.5).exp();\n\n    if x > 0. {\n        1. - w\n    } else {\n        w\n    }\n}\n\npub fn black_scholes(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize,\n) -> f64 {\n    for i in 0..count {\n        let s = sa[i];\n        let x = xa[i];\n        let t = ta[i];\n        let r = ra[i];\n        let v = va[i];\n        let d1 = ((s / x).ln() + (r + v * v * 0.5) * t) / (v * t.sqrt());\n        let d2 = d1 - v * t.sqrt();\n        result[i] = s * cnd(d1) - x * (-r * t).exp() * cnd(d2);\n    }\n    crate::sum::slice_scalar(&result)\n}\n\npub fn binomial_put(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize,\n) -> f64 {\n    use crate::BINOMIAL_NUM;\n\n    for i in 0..count {\n        let s = sa[i];\n        let x = xa[i];\n        let t = ta[i];\n        let r = ra[i];\n        let v = va[i];\n\n        let dt = t / BINOMIAL_NUM as f32;\n        let u = (v * dt.sqrt()).exp();\n        let d = 1. / u;\n        let disc = (r * dt).exp();\n        let pu = (disc - d) / (u - d);\n\n        let mut vs = [0_f32; BINOMIAL_NUM];\n        for (j, v) in vs.iter_mut().enumerate() {\n            let e = (2_i32 * (j as i32)).wrapping_sub(BINOMIAL_NUM as i32);\n            let upow = u.powf(e as f32);\n            *v = 0_f32.max(x - s * upow);\n        }\n\n        for j in (0..BINOMIAL_NUM).rev() {\n            for k in 0..j {\n                vs[k] = ((1. 
- pu) * vs[k] + pu * vs[k + 1]) / disc;\n            }\n        }\n\n        result[i] = vs[0];\n    }\n    crate::sum::slice_scalar(&result)\n}\n\n#[cfg(feature = \"ispc\")]\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::almost_equal;\n    #[test]\n    fn black_scholes_ispc() {\n        const NOPTS: usize = 1_000_000;\n        let mut scalar = crate::State::new(NOPTS);\n        let mut ispc = crate::State::new(NOPTS);\n\n        let scalar_sum = scalar.exec(black_scholes);\n        let ispc_sum = ispc.exec(crate::ispc_::black_scholes::serial);\n\n        assert_eq!(scalar, ispc);\n        assert_eq!(scalar_sum, ispc_sum);\n    }\n\n    #[test]\n    fn binomial_put_ispc() {\n        const NOPTS: usize = 1_000_000;\n        let mut scalar = crate::State::new(NOPTS);\n        let mut ispc = crate::State::new(NOPTS);\n\n        let scalar_sum = scalar.exec(binomial_put);\n        let ispc_sum = ispc.exec(crate::ispc_::binomial_put::serial);\n\n        // FIXME: results differ slightly for each value of the result vector\n        // need to figure out why\n        // assert_eq!(scalar, ispc);\n        assert!(almost_equal(scalar_sum, ispc_sum, 1e-5));\n    }\n}\n"
  },
  {
    "path": "examples/options_pricing/src/simd.rs",
    "content": "//! SIMD implementation\n\nuse crate::f32s;\n\npub fn serial<K>(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize, kernel: K,\n) -> f64\nwhere\n    K: Fn(f32s, f32s, f32s, f32s, f32s) -> f32s,\n{\n    assert_eq!(count % f32s::lanes(), 0);\n    for i in (0..count).step_by(f32s::lanes()) {\n        unsafe {\n            let s = f32s::from_slice_unaligned_unchecked(&sa[i..]);\n            let x = f32s::from_slice_unaligned_unchecked(&xa[i..]);\n            let t = f32s::from_slice_unaligned_unchecked(&ta[i..]);\n            let r = f32s::from_slice_unaligned_unchecked(&ra[i..]);\n            let v = f32s::from_slice_unaligned_unchecked(&va[i..]);\n            let r = kernel(s, x, t, r, v);\n            r.write_to_slice_unaligned_unchecked(&mut result[i..]);\n        }\n    }\n    crate::sum::slice(&result)\n}\n\npub fn black_scholes(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize,\n) -> f64 {\n    serial(\n        sa,\n        xa,\n        ta,\n        ra,\n        va,\n        result,\n        count,\n        crate::simd_kernels::black_scholes,\n    )\n}\n\npub fn binomial_put(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize,\n) -> f64 {\n    serial(\n        sa,\n        xa,\n        ta,\n        ra,\n        va,\n        result,\n        count,\n        crate::simd_kernels::binomial_put,\n    )\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::almost_equal;\n    #[test]\n    fn black_scholes_scalar() {\n        const NOPTS: usize = 1_000_000;\n        let mut simd = crate::State::new(NOPTS);\n        let mut scalar = crate::State::new(NOPTS);\n\n        let simd_sum = simd.exec(black_scholes);\n        let scalar_sum = scalar.exec(crate::scalar::black_scholes);\n\n        assert_eq!(simd, scalar);\n        assert_eq!(simd_sum, scalar_sum);\n    }\n\n    #[test]\n    fn 
binomial_put_scalar() {\n        const NOPTS: usize = 1_000_000;\n        let mut simd = crate::State::new(NOPTS);\n        let mut scalar = crate::State::new(NOPTS);\n\n        let simd_sum = simd.exec(binomial_put);\n        let scalar_sum = scalar.exec(crate::scalar::binomial_put);\n\n        // assert_eq!(simd, scalar);\n        // assert_eq!(simd_sum, scalar_sum);\n        assert!(almost_equal(simd_sum, scalar_sum, 1e-5));\n    }\n}\n"
  },
  {
    "path": "examples/options_pricing/src/simd_kernels.rs",
    "content": "use crate::f32s;\n\n// Cumulative normal distribution function\n#[inline(always)]\npub fn cnd(x: f32s) -> f32s {\n    const INV_SQRT_2PI: f32s = f32s::splat(0.398_942_280_40);\n\n    let l = x.abs();\n    let k = 1. / (1. + 0.231_641_9 * l);\n    let k2 = k * k;\n    let k3 = k2 * k;\n    let k4 = k2 * k2;\n    let k5 = k3 * k2;\n    let w: f32s = 0.319_381_53 * k - 0.356_563_782 * k2\n        + 1.781_477_937 * k3\n        + -1.821_255_978 * k4\n        + 1.330_274_429 * k5;\n    let w = w * INV_SQRT_2PI * (-l * l * 0.5).exp();\n\n    x.gt(f32s::splat(0.)).select(1. - w, w)\n}\n\n#[inline(always)]\npub fn black_scholes(s: f32s, x: f32s, t: f32s, r: f32s, v: f32s) -> f32s {\n    let d1 = ((s / x).ln() + (r + v * v * 0.5) * t) / (v * t.sqrt());\n    let d2 = d1 - v * t.sqrt();\n    s * cnd(d1) - x * (-r * t).exp() * cnd(d2)\n}\n\n#[inline(always)]\npub fn binomial_put(s: f32s, x: f32s, t: f32s, r: f32s, v: f32s) -> f32s {\n    use crate::BINOMIAL_NUM;\n\n    let dt = t / BINOMIAL_NUM as f32;\n    let u = (v * dt.sqrt()).exp();\n    let d = 1. / u;\n    let disc = (r * dt).exp();\n    let inv_disc = 1. / disc;\n    let pu = (disc - d) / (u - d);\n    let o_m_pu = 1. - pu;\n\n    let mut vs = [f32s::splat(0.); BINOMIAL_NUM];\n    for (j, v) in vs.iter_mut().enumerate() {\n        let e = (2_i32 * (j as i32)).wrapping_sub(BINOMIAL_NUM as i32);\n        let upow = u.powf(f32s::splat(e as f32));\n        *v = f32s::splat(0.).max(x - s * upow);\n    }\n\n    for j in (0..BINOMIAL_NUM).rev() {\n        for k in 0..j {\n            vs[k] = (o_m_pu * vs[k] + pu * vs[k + 1]) * inv_disc;\n        }\n    }\n\n    vs[0]\n}\n"
  },
  {
    "path": "examples/options_pricing/src/simd_par.rs",
    "content": "//! SIMD implementation\n\nuse crate::f32s;\n\npub fn parallel<K>(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize, kernel: K,\n) -> f64\nwhere\n    K: Fn(f32s, f32s, f32s, f32s, f32s) -> f32s + Sync + Send,\n{\n    use rayon::prelude::*;\n    assert_eq!(count % f32s::lanes(), 0);\n    result.par_chunks_mut(f32s::lanes()).enumerate().for_each(\n        |(i, result)| {\n            debug_assert!(result.len() == 8);\n            unsafe {\n                let s = f32s::from_slice_unaligned_unchecked(&sa[i..]);\n                let x = f32s::from_slice_unaligned_unchecked(&xa[i..]);\n                let t = f32s::from_slice_unaligned_unchecked(&ta[i..]);\n                let r = f32s::from_slice_unaligned_unchecked(&ra[i..]);\n                let v = f32s::from_slice_unaligned_unchecked(&va[i..]);\n                let r = kernel(s, x, t, r, v);\n                r.write_to_slice_unaligned_unchecked(result);\n            }\n        },\n    );\n    crate::sum::slice(&result)\n}\n\npub fn black_scholes(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize,\n) -> f64 {\n    parallel(\n        sa,\n        xa,\n        ta,\n        ra,\n        va,\n        result,\n        count,\n        crate::simd_kernels::black_scholes,\n    )\n}\n\npub fn binomial_put(\n    sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32],\n    result: &mut [f32], count: usize,\n) -> f64 {\n    parallel(\n        sa,\n        xa,\n        ta,\n        ra,\n        va,\n        result,\n        count,\n        crate::simd_kernels::binomial_put,\n    )\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use crate::almost_equal;\n    #[test]\n    fn black_scholes_scalar() {\n        const NOPTS: usize = 1_000_000;\n        let mut simd_par = crate::State::new(NOPTS);\n        let mut scalar = crate::State::new(NOPTS);\n\n        let simd_par_sum = 
simd_par.exec(black_scholes);\n        let scalar_sum = scalar.exec(crate::scalar::black_scholes);\n\n        assert_eq!(simd_par, scalar);\n        assert_eq!(simd_par_sum, scalar_sum);\n    }\n\n    #[test]\n    fn binomial_put_scalar() {\n        const NOPTS: usize = 1_000_000;\n        let mut simd_par = crate::State::new(NOPTS);\n        let mut scalar = crate::State::new(NOPTS);\n\n        let simd_par_sum = simd_par.exec(binomial_put);\n        let scalar_sum = scalar.exec(crate::scalar::binomial_put);\n\n        // assert_eq!(simd_par, scalar);\n        // assert_eq!(simd_par_sum, scalar_sum);\n        assert!(almost_equal(simd_par_sum, scalar_sum, 1e-5));\n    }\n}\n"
  },
  {
    "path": "examples/options_pricing/src/sum.rs",
    "content": "//! Implements different algorithms for summing a slice of `f32`s\n\nuse super::{f32s, f64s};\n\npub fn slice(x: &[f32]) -> f64 {\n    assert_eq!(f32s::lanes(), f64s::lanes());\n    assert_eq!(x.len() % f32s::lanes(), 0);\n\n    let mut sum = f64s::splat(0.);\n    for i in (0..x.len()).step_by(f32s::lanes()) {\n        unsafe {\n            use packed_simd::Cast;\n            let v: f64s = f32s::from_slice_unaligned_unchecked(&x[i..]).cast();\n            sum += v;\n        }\n    }\n    sum.sum()\n}\n\npub fn slice_scalar(x: &[f32]) -> f64 {\n    let mut sum = 0_f64;\n    for &x in x {\n        sum += f64::from(x);\n    }\n    sum\n}\n"
  },
  {
    "path": "examples/options_pricing/volta/options.ispc",
    "content": "// -*- mode: c++ -*-\n/*\n  Copyright (c) 2010-2011, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
\n*/\n\n#include \"options_defs.h\"\n\n// Cumulative normal distribution function\nstatic inline float\nCND(float X) {\n    float L = abs(X);\n\n    float k = 1.0 / (1.0 + 0.2316419 * L);\n    float k2 = k*k;\n    float k3 = k2*k;\n    float k4 = k2*k2;\n    float k5 = k3*k2;\n\n    const float invSqrt2Pi = 0.39894228040f;\n    float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 +\n               -1.821255978f * k4 + 1.330274429f * k5);\n    w *= invSqrt2Pi * exp(-L * L * .5f);\n\n    if (X > 0.f)\n        w = 1.0 - w;\n    return w;\n}\n\nstatic inline\nuniform double sum(const uniform float result[], uniform int count) {\n    double s = 0.0;\n    foreach (i = 0 ... count) {\n        s += (double)result[i];\n    }\n    return reduce_add(s);\n}\n\ntask void\nbs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],\n        uniform float ra[], uniform float va[], \n        uniform float result[], uniform int count) {\n    uniform int first = taskIndex * (count/taskCount);\n    uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));\n\n    foreach (i = first ... 
last) {\n        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];\n\n        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));\n        float d2 = d1 - v * sqrt(T);\n\n        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);\n    }\n}\n\nexport uniform double\nblack_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],\n                         uniform float ra[], uniform float va[], \n                         uniform float result[], uniform int count) {\n    uniform int nTasks = max((int)64, (int)count/16384);\n    launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count);\n    sync;\n    return sum(result, count);\n}\n\n\nexport uniform double\nblack_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],\n                   uniform float ra[], uniform float va[], \n                   uniform float result[], uniform int count) {\n    foreach (i = 0 ... count) {\n        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];\n\n        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));\n        float d2 = d1 - v * sqrt(T);\n\n        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);\n    }\n\n    return sum(result, count);\n}\n\n\nstatic inline float\nbinomial_put(float S, float X, float T, float r, float v) {\n    float V[BINOMIAL_NUM];\n\n    float dt = T / BINOMIAL_NUM;\n    float u = exp(v * sqrt(dt));\n    float d = 1. 
/ u;\n    float disc = exp(r * dt);\n    float Pu = (disc - d) / (u - d);\n\n    for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {\n        float upow = pow(u, (float)(2*j-BINOMIAL_NUM));\n        V[j] = max(0., X - S * upow);\n    }\n\n    for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)\n        for (uniform int k = 0; k < j; ++k)\n            V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;\n    return V[0];\n}\n\n\nexport uniform double\nbinomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], \n                  uniform float ra[], uniform float va[], \n                  uniform float result[], uniform int count) {\n    foreach (i = 0 ... count) {\n        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];\n        result[i] = binomial_put(S, X, T, r, v);\n    }\n\n    return sum(result, count);\n}\n\n\ntask void\nbinomial_task(uniform float Sa[], uniform float Xa[], \n              uniform float Ta[], uniform float ra[], \n              uniform float va[], uniform float result[], \n              uniform int count) {\n    uniform int first = taskIndex * (count/taskCount);\n    uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));\n\n    foreach (i = first ... last) {\n        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];\n        result[i] = binomial_put(S, X, T, r, v);\n    }\n}\n\n\nexport uniform double\nbinomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[], \n                        uniform float Ta[], uniform float ra[], \n                        uniform float va[], uniform float result[], \n                        uniform int count) {\n    uniform int nTasks = max((int)64, (int)count/16384);\n    launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count);\n    sync;\n    return sum(result, count);\n}"
  },
  {
    "path": "examples/options_pricing/volta/options_defs.h",
    "content": "/*\n  Copyright (c) 2010-2011, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  \n*/\n\n#ifndef OPTIONS_DEFS_H\n#define OPTIONS_DEFS_H 1\n\n#define BINOMIAL_NUM 64\n\n\n#endif // OPTIONS_DEFS_H\n"
  },
  {
    "path": "examples/rust-toolchain",
    "content": "nightly"
  },
  {
    "path": "examples/slice_sum/Cargo.toml",
    "content": "[package]\nname = \"slice_sum\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[[bin]]\nname = \"slice_sum\"\npath = \"src/main.rs\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\nrayon = \"^1.0\"\ntime = \"^0.1\"\nrand = \"0.7.0\"\n"
  },
  {
    "path": "examples/slice_sum/readme.md",
    "content": "# Computes the sum of a slice of floating-point numbers\n\nThis example show-cases the performance difference of computing the sum of a\n`&[f32]` slice using horizontal or vertical operations. \n\nTo run it:\n\n```\nRUSTFLAGS=\"-C target-cpu=native\" cargo run --release\n```\n\nOn my machine it prints:\n\n```\nvertical: 155 ms\nhorizontal: 424 ms\n```\n\nthat is, on my particular the slice sum algorithm using horizontal vector\nadditions operation is ~2.7x slower than the one using vertical vector\noperations.\n"
  },
  {
    "path": "examples/slice_sum/src/main.rs",
    "content": "#![deny(rust_2018_idioms)]\n\nuse packed_simd::f32x8 as f32s;\nuse std::{mem, slice};\n\nfn init(n: usize) -> Vec<f32> {\n    use rand::distributions::Standard;\n    use rand::prelude::*;\n    thread_rng().sample_iter(&Standard).take(n).collect()\n}\n\nfn sum_ver(x: &[f32]) -> f32 {\n    assert_eq!(x.len() % f32s::lanes(), 0);\n\n    x.chunks_exact(f32s::lanes())\n        .map(f32s::from_slice_unaligned)\n        .sum::<f32s>()\n        .sum()\n}\n\nfn sum_hor(x: &[f32]) -> f32 {\n    assert_eq!(x.len() % f32s::lanes(), 0);\n\n    x.chunks_exact(f32s::lanes())\n        .map(f32s::from_slice_unaligned)\n        .map(f32s::sum)\n        .sum()\n}\n\nfn sum_ver_par(x: &[f32]) -> f32 {\n    use rayon::prelude::*;\n    let len: usize = x.len();\n    assert_eq!(len % 8, 0);\n\n    // find the first properly aligned element\n    let (i, _): (usize, _) = x\n        .iter()\n        .enumerate()\n        .find(|&(_, y): &(usize, &f32)| {\n            (y as *const f32) as usize % mem::align_of::<f32s>() == 0\n        })\n        .unwrap();\n\n    let (head, tail) = x.split_at(i);\n    let head_sum: f32 = head.iter().sum();\n\n    #[allow(clippy::cast_ptr_alignment)]\n    let tail: &[f32s] = unsafe {\n        slice::from_raw_parts(\n            tail.as_ptr() as *const f32s,\n            tail.len() / f32s::lanes(),\n        )\n    };\n    let tail_sum: f32s = tail.into_par_iter().sum();\n    head_sum + tail_sum.sum()\n}\n\nfn main() {\n    let n: usize = std::env::args()\n        .nth(1)\n        .unwrap_or_else(|| \"1000000000\".to_string())\n        .parse()\n        .expect(\"argument should be a usize\");\n\n    assert_eq!(n % 8, 0, \"argument should be a multiple of 8\");\n\n    let s: &[f32] = &init(n);\n\n    let iter = time::Duration::span(|| {\n        let v: f32 = s.iter().sum();\n        assert!(!v.is_nan());\n    });\n    println!(\"std::iter::sum: {} ms\", iter.num_milliseconds());\n\n    let rayon = time::Duration::span(|| {\n        use 
rayon::prelude::*;\n        let v: f32 = s.par_iter().sum();\n        assert!(!v.is_nan());\n    });\n    println!(\"rayon::sum: {} ms\", rayon.num_milliseconds());\n\n    let ver = time::Duration::span(|| {\n        assert!(!sum_ver(s).is_nan());\n    });\n    println!(\"vertical: {} ms\", ver.num_milliseconds());\n\n    let hor = time::Duration::span(|| {\n        assert!(!sum_hor(s).is_nan());\n    });\n    println!(\"horizontal: {} ms\", hor.num_milliseconds());\n    let ver_par = time::Duration::span(|| {\n        assert!(!sum_ver_par(s).is_nan());\n    });\n    println!(\"vertical_par: {} ms\", ver_par.num_milliseconds());\n}\n"
  },
  {
    "path": "examples/spectral_norm/Cargo.toml",
    "content": "[package]\nname = \"spectral_norm\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\n\n[[bin]]\nname = \"spectral_norm\"\npath = \"src/main.rs\"\n\n[lib]\nname = \"spectral_norm_lib\"\npath = \"src/lib.rs\"\n"
  },
  {
    "path": "examples/spectral_norm/readme.md",
    "content": "# Spectral norm\n\nThis is the [`spectral-norm` benchmark from the benchmarksgame][bg]. \n\n## Background and description\n\nMathWorld: [\"Hundred-Dollar, Hundred-Digit Challenge Problems\"](http://mathworld.wolfram.com/Hundred-DollarHundred-DigitChallengeProblems.html), [Challenge #3](http://mathworld.wolfram.com/SpectralNorm.html).\n\nEach program should:\n\n* calculate the spectral norm of an infinite matrix `A`, with entries `a11=1`,\n  `a12=1/2`, `a21=1/3`, `a13=1/4`, `a22=1/5`, `a31=1/6`, etc.\n\n* implement 4 separate functions / procedures / methods like the [C#\n  program](https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/spectralnorm-csharpcore-1.html)\n\n## Usage\n\nIt takes two arguments in this order:\n\n* `n`: the size of the matrix `A` (n-times-n)\n* (optional) `algorithm`: the algorithm to use - defaults to the fastest one.\n  * `0`: scalar algorithm\n  * `1`: SIMD algorithm\n\n[bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/spectralnorm.html#spectralnorm\n"
  },
  {
    "path": "examples/spectral_norm/src/lib.rs",
    "content": "//! Spectral Norm\n#![deny(rust_2018_idioms)]\n#![allow(non_snake_case, non_camel_case_types)]\n#![allow(\n    clippy::cast_precision_loss,\n    clippy::must_use_candidate\n)]\n\npub mod scalar;\npub mod simd;\n\nfn A(i: usize, j: usize) -> f64 {\n    ((i + j) * (i + j + 1) / 2 + i + 1) as f64\n}\n\npub fn spectral_norm(n: usize, alg: usize) -> f64 {\n    match alg {\n        0 => simd::spectral_norm(n),\n        1 => scalar::spectral_norm(n),\n        v => panic!(\"unknown algorithm value: {}\", v),\n    }\n}\n"
  },
  {
    "path": "examples/spectral_norm/src/main.rs",
    "content": "extern crate spectral_norm_lib;\nuse spectral_norm_lib::*;\n\nfn run<O: std::io::Write>(o: &mut O, n: usize, alg: usize) {\n    let answer = spectral_norm(n, alg);\n    writeln!(o, \"{:.9}\", answer).unwrap();\n}\n\nfn main() {\n    let n: usize =\n        std::env::args().nth(1).expect(\"need one arg\").parse().unwrap();\n\n    let alg = if let Some(v) = std::env::args().nth(2) {\n        v.parse().unwrap()\n    } else {\n        0\n    };\n\n    run(&mut std::io::stdout(), n, alg);\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    static OUTPUT: &[u8] = include_bytes!(\"spectralnorm-output.txt\");\n    #[test]\n    fn verify_output_simd() {\n        let mut out: Vec<u8> = Vec::new();\n\n        run(&mut out, 100, 0);\n\n        assert_eq!(out.len(), OUTPUT.len());\n        if out != OUTPUT {\n            for i in 0..out.len() {\n                assert_eq!(\n                    out[i], OUTPUT[i],\n                    \"byte {} differs - is: {:#08b} - should: {:#08b}\",\n                    i, out[i], OUTPUT[i]\n                );\n            }\n        }\n    }\n    #[test]\n    fn verify_output_scalar() {\n        let mut out: Vec<u8> = Vec::new();\n\n        run(&mut out, 100, 1);\n\n        assert_eq!(out.len(), OUTPUT.len());\n        if out != OUTPUT {\n            for i in 0..out.len() {\n                assert_eq!(\n                    out[i], OUTPUT[i],\n                    \"byte {} differs - is: {:#08b} - should: {:#08b}\",\n                    i, out[i], OUTPUT[i]\n                );\n            }\n        }\n    }\n\n}\n"
  },
  {
    "path": "examples/spectral_norm/src/scalar.rs",
    "content": "//! Scalar spectral norm implementation\n\nuse crate::*;\nuse std::{\n    iter::*,\n    ops::{Add, Div},\n};\n\nstruct f64x2(f64, f64);\nimpl Add for f64x2 {\n    type Output = Self;\n    fn add(self, rhs: Self) -> Self {\n        Self(self.0 + rhs.0, self.1 + rhs.1)\n    }\n}\nimpl Div for f64x2 {\n    type Output = Self;\n    fn div(self, rhs: Self) -> Self {\n        Self(self.0 / rhs.0, self.1 / rhs.1)\n    }\n}\n\npub fn spectral_norm(n: usize) -> f64 {\n    assert!(n % 2 == 0, \"only even lengths are accepted\");\n    let mut u = vec![1.0; n];\n    let mut v = u.clone();\n    let mut tmp = v.clone();\n    for _ in 0..10 {\n        mult_AtAv(&u, &mut v, &mut tmp);\n        mult_AtAv(&v, &mut u, &mut tmp);\n    }\n    (dot(&u, &v) / dot(&v, &v)).sqrt()\n}\n\nfn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {\n    mult_Av(v, tmp);\n    mult_Atv(tmp, out);\n}\n\nfn mult_Av(v: &[f64], out: &mut [f64]) {\n    mult(v, out, 0, A);\n}\n\nfn mult_Atv(v: &[f64], out: &mut [f64]) {\n    mult(v, out, 0, |i, j| A(j, i));\n}\n\nfn mult<F>(v: &[f64], out: &mut [f64], start: usize, a: F)\nwhere\n    F: Fn(usize, usize) -> f64,\n{\n    for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) {\n        let mut sum = f64x2(0.0, 0.0);\n        for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) {\n            let top = f64x2(chunk[0], chunk[1]);\n            let bot = f64x2(a(i, j), a(i, j + 1));\n            sum = sum + top / bot;\n        }\n        let f64x2(a, b) = sum;\n        *slot = a + b;\n    }\n}\n\nfn dot(x: &[f64], y: &[f64]) -> f64 {\n    x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b)\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    assert_eq!(&format!(\"{:.9}\", spectral_norm(100)), \"1.274219991\");\n}\n"
  },
  {
    "path": "examples/spectral_norm/src/simd.rs",
    "content": "//! Vectorized spectral norm implementation\n\nuse crate::*;\nuse packed_simd::*;\n\nfn mult_Av(v: &[f64], out: &mut [f64]) {\n    assert!(v.len() == out.len());\n    assert!(v.len() % 2 == 0);\n\n    for (i, out) in out.iter_mut().enumerate() {\n        let mut sum = f64x2::splat(0.0);\n\n        let mut j = 0;\n        while j < v.len() {\n            let b = f64x2::from_slice_unaligned(&v[j..]);\n            let a = f64x2::new(A(i, j), A(i, j + 1));\n            sum += b / a;\n            j += 2\n        }\n        *out = sum.sum();\n    }\n}\n\nfn mult_Atv(v: &[f64], out: &mut [f64]) {\n    assert!(v.len() == out.len());\n    assert!(v.len() % 2 == 0);\n\n    for (i, out) in out.iter_mut().enumerate() {\n        let mut sum = f64x2::splat(0.0);\n\n        let mut j = 0;\n        while j < v.len() {\n            let b = f64x2::from_slice_unaligned(&v[j..]);\n            let a = f64x2::new(A(j, i), A(j + 1, i));\n            sum += b / a;\n            j += 2\n        }\n        *out = sum.sum();\n    }\n}\n\nfn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {\n    mult_Av(v, tmp);\n    mult_Atv(tmp, out);\n}\n\npub fn spectral_norm(n: usize) -> f64 {\n    assert!(n % 2 == 0, \"only even lengths are accepted\");\n\n    let mut u = vec![1.0; n];\n    let mut v = u.clone();\n    let mut tmp = u.clone();\n\n    for _ in 0..10 {\n        mult_AtAv(&u, &mut v, &mut tmp);\n        mult_AtAv(&v, &mut u, &mut tmp);\n    }\n    (dot(&u, &v) / dot(&v, &v)).sqrt()\n}\n\nfn dot(x: &[f64], y: &[f64]) -> f64 {\n    // This is auto-vectorized:\n    x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b)\n}\n\n#[cfg(test)]\n#[test]\nfn test() {\n    assert_eq!(&format!(\"{:.9}\", spectral_norm(100)), \"1.274219991\");\n}\n"
  },
  {
    "path": "examples/spectral_norm/src/spectralnorm-output.txt",
    "content": "1.274219991\n"
  },
  {
    "path": "examples/stencil/Cargo.toml",
    "content": "[package]\nname = \"stencil\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\ntime = \"^0.1\"\nrayon = \"^1.0\"\nispc = { version = \"^1.0.4\", optional = true }\n\n[build-dependencies]\nispc = { version = \"^1.0.4\", optional = true }\n\n[[bin]]\nname = \"stencil\"\npath = \"src/main.rs\"\n\n[lib]\nname = \"stencil_lib\"\npath = \"src/lib.rs\"\n\n[features]\ndefault = []\ncore_arch = [\"packed_simd/core_arch\"]\nsleef-sys = [\"packed_simd/sleef-sys\"]\n"
  },
  {
    "path": "examples/stencil/benchmark.sh",
    "content": "#!/usr/bin/env bash\n#\n# Runs aobench benchmarks\n\nset -ex\n\nif [[ ${NORUN} != 1 ]]; then\n    hash hyperfine 2>/dev/null || { echo >&2 \"hyperfine is not in PATH.\"; exit 1; }\nfi\n\nalgs=(\"0\" \"1\" \"2\")\nif echo \"$FEATURES\" | grep -q \"ispc\"; then\n    hash ispc 2>/dev/null || { echo >&2 \"ispc is not in PATH.\"; exit 1; }\n    algs+=( \"3\" \"4\" )\nfi\n\nRUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n         cargo build --release --no-default-features \\\n         --features=\"${FEATURES}\"\n\nif [[ \"${VERIFY}\" == \"1\" ]]; then\n    RUSTFLAGS=\"-C target-cpu=native ${RUSTFLAGS}\" \\\n             cargo test --release --no-default-features \\\n             --features=\"${FEATURES}\"\nfi\n\nif [[ \"${NORUN}\" == \"1\" ]]; then\n    exit 0\nfi\n\nfor alg in \"${algs[@]}\"\ndo\n    hyperfine \"../target/release/stencil ${alg}\"\ndone\n"
  },
  {
    "path": "examples/stencil/build.rs",
    "content": "fn main() {\n    println!(\"cargo:rerun-if-changed=build.rs\");\n\n    #[cfg(feature = \"ispc\")]\n    {\n        if std::env::var(\"CARGO_FEATURE_ISPC\").is_ok() {\n            let mut cfg = ispc::Config::new();\n\n            if cfg!(windows) {\n                cfg.debug(false);\n            }\n\n            let ispc_files = vec![\"volta/stencil.ispc\"];\n\n            for s in &ispc_files[..] {\n                cfg.file(*s);\n            }\n\n            cfg.target_isas(vec![\n                ispc::opt::TargetISA::SSE2i32x4,\n                ispc::opt::TargetISA::SSE4i32x4,\n                ispc::opt::TargetISA::AVX1i32x8,\n                ispc::opt::TargetISA::AVX2i32x8,\n                ispc::opt::TargetISA::AVX512KNLi32x16,\n            ]);\n\n            cfg.compile(\"stencil\");\n        }\n    }\n}\n"
  },
  {
    "path": "examples/stencil/readme.md",
    "content": "# Stencil\n\nThis is the generic [`stencil` ISPC benchmark][ispc]. \n\n## Usage\n\n```\ncargo run --release --features=ispc\n```\n\nwill run all benchmarks including the ISPC ones. \n\n\n## Results\n\n```\n./benchmark.sh\n```\n\nOn a dual core AVX1 i5 @1.8 GHz:\n\n| 800 x 600    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     |                2842 |                    1.0x |\n| `vector`     |                 630 |                    4.5x |\n| `vector_par` |                 444 |                    6.4x |\n| `ispc`       |                 558 |                     5.0x |\n| `ispc_tasks` |                 470 |                    6.0x |\n\n`vector_par` is 1.06x faster than `ispc_tasks`.\n\nOn a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz:\n\n| 800 x 600    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     |                1499 | 1.0x                    |\n| `vector`     |                 276 | 5.4x                    |\n| `vector_par` |                 167 | 9.0x                    |\n| `ispc`       |                 287 | 5.2x                    |\n| `ispc_tasks` |                 395 | 3.8x                    |\n\n`vector_par` is 1.72x faster than `ispc_tasks`.\n\nOn a 40 core Xeon Gold 6148 CPU @ 2.40GHz:\n\n| 800 x 600    | time [ms] <br> Rust | speedup vs `scalar` [-] |\n|--------------|---------------------|-------------------------|\n| `scalar`     |                1654 |                    1.0x |\n| `vector`     |                 278 |                    6.0x |\n| `vector_par` |                 148 |                    11.2x |\n| `ispc`       |                 185 |                     9.0x |\n| `ispc_tasks` |                 401 |                    4.1x |\n\n`vector_par` is 1.25x faster than `ispc`.\n\n\n[ispc]: https://github.com/ispc/ispc/tree/master/examples/stencil\n"
  },
  {
    "path": "examples/stencil/src/ispc_loops.rs",
    "content": "//! Includes the ISPC implementations.\n\nuse ispc::*;\nispc_module!(stencil);\n\npub fn serial(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    unsafe {\n        self::stencil::loop_stencil_ispc(\n            t0,\n            t1,\n            x0,\n            x1,\n            y0,\n            y1,\n            z0,\n            z1,\n            n_x,\n            n_y,\n            n_z,\n            coef.as_ptr(),\n            vsq.as_ptr(),\n            a_even.as_mut_ptr(),\n            a_odd.as_mut_ptr(),\n        );\n    }\n}\n\npub fn tasks(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    unsafe {\n        self::stencil::loop_stencil_ispc_tasks(\n            t0,\n            t1,\n            x0,\n            x1,\n            y0,\n            y1,\n            z0,\n            z1,\n            n_x,\n            n_y,\n            n_z,\n            coef.as_ptr(),\n            vsq.as_ptr(),\n            a_even.as_mut_ptr(),\n            a_odd.as_mut_ptr(),\n        );\n    }\n}\n"
  },
  {
    "path": "examples/stencil/src/lib.rs",
    "content": "#![feature(custom_inner_attributes, stmt_expr_attributes)]\n// FIXME: Null pointer deref warning triggered in this example,\n// likely inside a macro expansion deriving from packed_simd.\n#![deny(rust_2018_idioms)]\n#![allow(\n    clippy::similar_names,\n    clippy::cast_precision_loss,\n    clippy::cast_sign_loss,\n    clippy::too_many_arguments,\n    clippy::cast_possible_wrap,\n    clippy::cast_possible_truncation,\n    clippy::inline_always,\n    clippy::must_use_candidate\n)]\n\n#[cfg(feature = \"ispc\")]\npub mod ispc_loops;\npub mod scalar;\npub mod simd;\npub mod simd_par;\n\n#[derive(Clone, PartialEq, Debug)]\npub struct Data {\n    a: (Vec<f32>, Vec<f32>),\n    vsq: Vec<f32>,\n    coeff: [f32; 4],\n    n: (i32, i32, i32),\n    t: (i32, i32),\n    x: (i32, i32),\n    y: (i32, i32),\n    z: (i32, i32),\n}\n\nimpl Data {\n    pub fn default() -> Self {\n        Self::from_bounds(6, 4, 128, 128, 128)\n    }\n\n    pub fn benchmark() -> Self {\n        Self::from_bounds(6, 4, 256, 256, 256)\n    }\n\n    pub fn from_bounds(\n        max_t: i32, width: i32, n_x: i32, n_y: i32, n_z: i32,\n    ) -> Self {\n        #[rustfmt::skip]\n        Self::new(\n            0, max_t,\n            width, n_x - width, width, n_y - width, width, n_z - width,\n            n_x, n_y, n_z,\n        )\n    }\n\n    /// Initializes data\n    pub fn new(\n        t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32,\n        z1: i32, n_x: i32, n_y: i32, n_z: i32,\n    ) -> Self {\n        let n = (n_x * n_y * n_z) as usize;\n        let mut data = Self {\n            a: (vec![0_f32; n], vec![0_f32; n]),\n            vsq: vec![0_f32; n],\n            coeff: [0.5, -0.25, 0.125, -0.0625],\n            n: (n_x, n_y, n_z),\n            t: (t0, t1),\n            x: (x0, x1),\n            y: (y0, y1),\n            z: (z0, z1),\n        };\n\n        data.reinit();\n        data\n    }\n\n    pub fn reinit(&mut self) {\n        let mut offset: usize = 0;\n        for 
z in 0..self.n.2 {\n            for y in 0..self.n.1 {\n                for x in 0..self.n.0 {\n                    unsafe {\n                        *self.a.0.get_unchecked_mut(offset) =\n                            if x < self.n.0 / 2 {\n                                x as f32 / self.n.0 as f32\n                            } else {\n                                y as f32 / self.n.1 as f32\n                            };\n                        *self.a.1.get_unchecked_mut(offset) = 0.;\n                        *self.vsq.get_unchecked_mut(offset) = (x * y * z)\n                            as f32\n                            / (self.n.0 * self.n.1 * self.n.2) as f32;\n                        offset += 1;\n                    }\n                }\n            }\n        }\n    }\n\n    #[rustfmt::skip]\n    pub fn exec<F>(&mut self, f: F)\n    where\n        F: Fn(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,\n            &[f32; 4], &[f32], &mut [f32], &mut [f32]),\n    {\n        f(\n            self.t.0, self.t.1,\n            self.x.0, self.x.1,\n            self.y.0, self.y.1,\n            self.z.0, self.z.1,\n            self.n.0, self.n.1, self.n.2,\n            &self.coeff, &self.vsq, &mut self.a.0, &mut self.a.1,\n        );\n    }\n}\n\n#[cfg(test)]\nfn assert_data_eq(a: &Data, b: &Data) {\n    if a == b {\n        return;\n    }\n    assert_eq!(a.coeff, b.coeff, \"coeffs differ\");\n    assert_eq!(a.n, b.n, \"n differ\");\n    assert_eq!(a.t, b.t, \"t differ\");\n    assert_eq!(a.x, b.x, \"x differ\");\n    assert_eq!(a.y, b.y, \"y differ\");\n    assert_eq!(a.z, b.z, \"z differ\");\n\n    for z in 0..a.n.2 {\n        for y in 0..a.n.1 {\n            for x in 0..a.n.0 {\n                let idx = (x + y * a.n.1 + z * a.n.1 * a.n.0) as usize;\n\n                const EPSILON: f32 = 1E-4;\n\n                assert!(\n                    (a.vsq[idx] - b.vsq[idx]).abs() < EPSILON,\n                    \"vsq diff at idx = {} ({}, {}, {})\",\n        
            idx,\n                    x,\n                    y,\n                    z,\n                );\n\n                assert!(\n                    (a.a.0[idx] - b.a.0[idx]).abs() < EPSILON,\n                    \"a.0 diff at idx = {} ({}, {}, {})\",\n                    idx,\n                    x,\n                    y,\n                    z,\n                );\n\n                assert!(\n                    (a.a.1[idx] - b.a.1[idx]).abs() < EPSILON,\n                    \"a.1 diff at idx = {} ({}, {}, {})\",\n                    idx,\n                    x,\n                    y,\n                    z,\n                );\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "examples/stencil/src/main.rs",
    "content": "#![feature(custom_inner_attributes)]\n\nuse stencil_lib::*;\n\nuse std::env;\n\n#[rustfmt::skip]\nfn run<F>(name: &str, f: F)\nwhere\n    F: Fn(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,\n        &[f32; 4], &[f32], &mut [f32], &mut [f32]),\n{\n    let mut d = Data::benchmark();\n    let t = time::Duration::span(move || d.exec(f));\n    println!(\"{}: {} ms\", name, t.num_milliseconds());\n}\n\nfn main() {\n    let mut args = env::args();\n    args.next();\n    let alg: usize = args.next().unwrap().parse().unwrap();\n\n    match alg {\n        0 => run(\"scalar\", self::scalar::scalar),\n        1 => run(\"vector\", self::simd::x8),\n        2 => run(\"vector_par\", self::simd_par::x8_par),\n        3 => {\n            #[cfg(feature = \"ispc\")]\n            {\n                run(\"ispc\", self::ispc_loops::serial);\n            }\n            #[cfg(not(feature = \"ispc\"))]\n            {\n                panic!(\"error: algorithm requires binary to be compiled with the ispc feature\")\n            }\n        }\n        4 => {\n            #[cfg(feature = \"ispc\")]\n            {\n                run(\"ispc+tasks\", self::ispc_loops::tasks);\n            }\n            #[cfg(not(feature = \"ispc\"))]\n            {\n                panic!(\"error: algorithm requires binary to be compiled with the ispc feature\")\n            }\n        }\n        _ => panic!(\"unknown algorithm\"),\n    }\n}\n"
  },
  {
    "path": "examples/stencil/src/scalar.rs",
    "content": "//! Scalar implementation\n\npub fn step(\n    x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32,\n    _n_z: i32, coef: &[f32; 4], vsq: &[f32], a_in: &[f32], a_out: &mut [f32],\n) {\n    let n_xy = n_x * n_y;\n\n    for z in z0..z1 {\n        for y in y0..y1 {\n            for x in x0..x1 {\n                let index = (z * n_xy) + (y * n_x) + x;\n\n                macro_rules! a_cur {\n                    ($x:expr, $y:expr, $z:expr) => {\n                        a_in[(index + $x + $y * n_x + $z * n_xy) as usize]\n                    };\n                }\n\n                macro_rules! a_next {\n                    ($x:expr, $y:expr, $z:expr) => {\n                        a_out[(index + $x + $y * n_x + $z * n_xy) as usize]\n                    };\n                }\n\n                let mut div: f32 = coef[0] * a_cur!(0, 0, 0);\n                for i in 1..4 {\n                    div += coef[i as usize]\n                        * (a_cur!(i, 0, 0)\n                            + a_cur!(-i, 0, 0)\n                            + a_cur!(0, i, 0)\n                            + a_cur!(0, -i, 0)\n                            + a_cur!(0, 0, i)\n                            + a_cur!(0, 0, -i));\n                }\n                a_next!(0, 0, 0) = 2. 
* a_cur!(0, 0, 0) - a_next!(0, 0, 0)\n                    + vsq[index as usize] * div;\n            }\n        }\n    }\n}\n\npub fn scalar(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    for t in t0..t1 {\n        if t & 1 == 0 {\n            step(\n                x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even,\n                a_odd,\n            );\n        } else {\n            step(\n                x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_odd,\n                a_even,\n            );\n        }\n    }\n}\n\n#[cfg(all(test, feature = \"ispc\"))]\nmod tests {\n    use super::scalar;\n    use crate::ispc_loops::serial;\n    use crate::{assert_data_eq, Data};\n\n    #[test]\n\n    fn scalar_ispc_verify() {\n        let mut data_scalar = Data::default();\n        data_scalar.exec(scalar);\n\n        let mut data_ispc = Data::default();\n        data_ispc.exec(serial);\n\n        assert_data_eq(&data_scalar, &data_ispc);\n    }\n}\n"
  },
  {
    "path": "examples/stencil/src/simd.rs",
    "content": "//! SIMD implementation\n\nuse packed_simd::*;\n\n#[inline(always)]\npub(crate) fn step_x8(\n    x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32,\n    _n_z: i32, coef: &[f32; 4], vsq: &[f32], a_in: &[f32], a_out: &mut [f32],\n) {\n    assert!((x1 - x0) % f32x8::lanes() as i32 == 0);\n    let n_xy = n_x * n_y;\n    for z in z0..z1 {\n        let z_idx = z * n_xy;\n        for y in y0..y1 {\n            let y_idx = y * n_x;\n            for x in (x0..x1).step_by(f32x8::lanes()) {\n                unsafe {\n                    let out_idx = x + y_idx;\n                    let index: i32 = z_idx + out_idx;\n                    macro_rules! a_cur {\n                        ($x:expr, $y:expr, $z:expr) => {\n                            f32x8::from_slice_unaligned_unchecked(\n                                &a_in.get_unchecked(\n                                    (index + $x + $y * n_x + $z * n_xy)\n                                        as usize..,\n                                ),\n                            )\n                        };\n                    }\n\n                    let cur_0 = a_cur!(0, 0, 0);\n                    let mut div: f32x8 = *coef.get_unchecked(0) * cur_0;\n\n                    for i in 1..4 {\n                        let coef = f32x8::splat(*coef.get_unchecked(i));\n\n                        let sum = {\n                            let i = i as i32;\n                            a_cur!(i, 0, 0)\n                                + a_cur!(-i, 0, 0)\n                                + a_cur!(0, i, 0)\n                                + a_cur!(0, -i, 0)\n                                + a_cur!(0, 0, i)\n                                + a_cur!(0, 0, -i)\n                        };\n\n                        div = coef.mul_adde(sum, div);\n                    }\n\n                    let vsq = f32x8::from_slice_unaligned_unchecked(\n                        vsq.get_unchecked(index as usize..),\n           
         );\n\n                    let sum = cur_0.mul_adde(\n                        f32x8::splat(2.),\n                        -f32x8::from_slice_unaligned_unchecked(\n                            a_out.get_unchecked(out_idx as usize..),\n                        ),\n                    );\n\n                    let r = vsq.mul_adde(div, sum);\n                    r.write_to_slice_unaligned_unchecked(\n                        &mut a_out.get_unchecked_mut(out_idx as usize..),\n                    );\n                }\n            }\n        }\n    }\n}\n\n#[inline(always)]\nfn x8_impl(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    for t in t0..t1 {\n        if t & 1 == 0 {\n            a_odd\n                .chunks_mut((n_x * n_y) as usize)\n                .enumerate()\n                .skip(z0 as usize)\n                .take((z1 - z0) as usize)\n                .for_each(|(z, a_odd)| {\n                    let z = z as i32;\n                    #[rustfmt::skip]\n                    step_x8(x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z,\n                        coef, vsq, a_even, a_odd,\n                    );\n                });\n        } else {\n            a_even\n                .chunks_mut((n_x * n_y) as usize)\n                .enumerate()\n                .skip(z0 as usize)\n                .take((z1 - z0) as usize)\n                .for_each(|(z, a_even)| {\n                    let z = z as i32;\n                    #[rustfmt::skip]\n                    step_x8(x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z,\n                            coef, vsq, a_odd, a_even,\n                    );\n                });\n        }\n    }\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"avx2,fma\")]\nunsafe fn x8_impl_avx2(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: 
i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    #[rustfmt::skip]\n    x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n            coef, vsq, a_even, a_odd)\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"avx\")]\nunsafe fn x8_impl_avx(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    #[rustfmt::skip]\n    x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n            coef, vsq, a_even, a_odd)\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"sse4.2\")]\nunsafe fn x8_impl_sse42(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    #[rustfmt::skip]\n    x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n            coef, vsq, a_even, a_odd)\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"sse2\")]\nunsafe fn x8_impl_sse2(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    #[rustfmt::skip]\n    x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n            coef, vsq, a_even, a_odd)\n}\n\nunsafe fn x8_impl_def(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    #[rustfmt::skip]\n    x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n            coef, vsq, a_even, a_odd)\n}\n\npub fn x8(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: 
i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    #[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n    unsafe {\n        if is_x86_feature_detected!(\"avx2\") && is_x86_feature_detected!(\"fma\")\n        {\n            #[rustfmt::skip]\n            x8_impl_avx2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                         coef, vsq, a_even, a_odd)\n        } else if is_x86_feature_detected!(\"avx\") {\n            #[rustfmt::skip]\n            x8_impl_avx(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                         coef, vsq, a_even, a_odd)\n        } else if is_x86_feature_detected!(\"sse4.2\") {\n            #[rustfmt::skip]\n            x8_impl_sse42(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                         coef, vsq, a_even, a_odd)\n        } else if is_x86_feature_detected!(\"sse2\") {\n            #[rustfmt::skip]\n            x8_impl_sse2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                         coef, vsq, a_even, a_odd)\n        } else {\n            #[rustfmt::skip]\n            x8_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                        coef, vsq, a_even, a_odd)\n        }\n    }\n\n    #[cfg(not(any(target_arch = \"x86\", target_arch = \"x86_64\")))]\n    unsafe {\n        #[rustfmt::skip]\n        x8_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                    coef, vsq, a_even, a_odd)\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::x8;\n    use crate::scalar::scalar;\n    use crate::{assert_data_eq, Data};\n\n    #[test]\n    fn simd_scalar_verify() {\n        let mut data_simd = Data::default();\n        data_simd.exec(x8);\n\n        let mut data_scalar = Data::default();\n        data_scalar.exec(scalar);\n\n        assert_data_eq(&data_simd, &data_scalar);\n    }\n\n    #[cfg(feature = \"ispc\")]\n    #[test]\n    fn simd_ispc_verify() {\n        use 
crate::ispc_loops::serial;\n\n        let mut data_simd = Data::default();\n        data_simd.exec(x8);\n\n        let mut data_ispc = Data::default();\n        data_ispc.exec(serial);\n\n        assert_data_eq(&data_simd, &data_ispc);\n    }\n}\n"
  },
  {
    "path": "examples/stencil/src/simd_par.rs",
    "content": "//! SIMD+Rayon implementation.\nuse crate::simd::step_x8;\nuse rayon::prelude::*;\n\n#[inline(always)]\nfn x8_par_impl(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    assert!((z1 - z0) <= n_z);\n    for t in t0..t1 {\n        if t & 1 == 0 {\n            a_odd\n                .par_chunks_mut((n_x * n_y) as usize)\n                .enumerate()\n                .skip(z0 as usize)\n                .take((z1 - z0) as usize)\n                .for_each(|(z, a_odd)| {\n                    let z = z as i32;\n                    #[rustfmt::skip]\n                    step_x8(\n                        x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z,\n                        coef, vsq, a_even, a_odd,\n                    );\n                });\n        } else {\n            a_even\n                .par_chunks_mut((n_x * n_y) as usize)\n                .enumerate()\n                .skip(z0 as usize)\n                .take((z1 - z0) as usize)\n                .for_each(|(z, a_even)| {\n                    let z = z as i32;\n                    #[rustfmt::skip]\n                    step_x8(\n                        x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z,\n                        coef, vsq, a_odd, a_even,\n                    );\n                });\n        }\n    }\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"avx2\")]\nunsafe fn x8_par_impl_avx2(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    x8_par_impl(\n        t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even,\n        a_odd,\n    )\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"avx\")]\nunsafe fn 
x8_par_impl_avx(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    x8_par_impl(\n        t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even,\n        a_odd,\n    )\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"sse4.2\")]\nunsafe fn x8_par_impl_sse42(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    x8_par_impl(\n        t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even,\n        a_odd,\n    )\n}\n\n#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n#[target_feature(enable = \"sse2\")]\nunsafe fn x8_par_impl_sse2(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    x8_par_impl(\n        t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even,\n        a_odd,\n    )\n}\n\nunsafe fn x8_par_impl_def(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    x8_par_impl(\n        t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even,\n        a_odd,\n    )\n}\n\npub fn x8_par(\n    t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,\n    n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],\n    a_even: &mut [f32], a_odd: &mut [f32],\n) {\n    #[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\n    unsafe {\n        if is_x86_feature_detected!(\"avx2\") {\n            #[rustfmt::skip]\n            x8_par_impl_avx2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n  
                           coef, vsq, a_even, a_odd)\n        } else if is_x86_feature_detected!(\"avx\") {\n            #[rustfmt::skip]\n            x8_par_impl_avx(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                            coef, vsq, a_even, a_odd)\n        } else if is_x86_feature_detected!(\"sse4.2\") {\n            #[rustfmt::skip]\n            x8_par_impl_sse42(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                              coef, vsq, a_even, a_odd)\n        } else if is_x86_feature_detected!(\"sse2\") {\n            #[rustfmt::skip]\n            x8_par_impl_sse2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                             coef, vsq, a_even, a_odd)\n        } else {\n            #[rustfmt::skip]\n            x8_par_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                            coef, vsq, a_even, a_odd)\n        }\n    }\n\n    #[cfg(not(any(target_arch = \"x86\", target_arch = \"x86_64\")))]\n    unsafe {\n        #[rustfmt::skip]\n        x8_par_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,\n                        coef, vsq, a_even, a_odd)\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::x8_par;\n    use crate::scalar::scalar;\n    use crate::{assert_data_eq, Data};\n\n    #[test]\n    fn simd_par_verify() {\n        let mut data_simd_par = Data::default();\n        data_simd_par.exec(x8_par);\n\n        let mut data_scalar = Data::default();\n        data_scalar.exec(scalar);\n\n        assert_data_eq(&data_simd_par, &data_scalar);\n    }\n}\n"
  },
  {
    "path": "examples/stencil/volta/.gitignore",
    "content": "# Files built by ISPC\n/objs/\n/stencil\n"
  },
  {
    "path": "examples/stencil/volta/Makefile",
    "content": "\nEXAMPLE=stencil\nCPP_SRC=stencil.cpp stencil_serial.cpp\nISPC_SRC=stencil.ispc\nISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16\nISPC_ARM_TARGETS=neon\n\ninclude common.mk\n"
  },
  {
    "path": "examples/stencil/volta/common.mk",
    "content": "\nTASK_CXX=tasksys.cpp\nTASK_LIB=-lpthread\nTASK_OBJ=objs/tasksys.o\n\nCXX=clang++\nCXXFLAGS+=-Iobjs/ -O3 -march=native\nCC=clang\nCCFLAGS+=-Iobjs/ -O3 -march=native\n\nLIBS=-lm $(TASK_LIB) -lstdc++\nISPC=ispc\nISPC_FLAGS+=-O3\nISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)\n\nARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)\n\nifeq ($(ARCH),x86)\n  ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o)\n  COMMA=,\n  ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS)))\n    #$(info multi-target detected: $(ISPC_IA_TARGETS))\n    ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS)))\n      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o)\n    endif\n    ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS)))\n      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o)\n    endif\n    ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS)))\n      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o)\n    endif\n    ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS)))\n      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o)\n    endif\n    ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS)))\n      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o)\n    endif\n    ifneq (,$(findstring avx512knl,$(ISPC_IA_TARGETS)))\n      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512knl.o)\n    endif\n    ifneq (,$(findstring avx512skx,$(ISPC_IA_TARGETS)))\n      ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512skx.o)\n    endif\n  endif\n  ISPC_TARGETS=$(ISPC_IA_TARGETS)\n  ARCH_BIT:=$(shell getconf LONG_BIT)\n  ifeq ($(ARCH_BIT),32)\n    ISPC_FLAGS += --arch=x86\n    CXXFLAGS += -m32\n    CCFLAGS += -m32\n  else\n    ISPC_FLAGS += --arch=x86-64\n    CXXFLAGS += -m64\n    CCFLAGS += -m64\n  endif\nelse ifeq ($(ARCH),arm)\n  ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o))\n  ISPC_TARGETS=$(ISPC_ARM_TARGETS)\nelse\n  $(error Unknown architecture $(ARCH) from uname 
-m)\nendif\n\nCPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))\nCC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))\nOBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)\n\ndefault: $(EXAMPLE)\n\nall: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar\n\n.PHONY: dirs clean\n\ndirs:\n\t/bin/mkdir -p objs/\n\nobjs/%.cpp objs/%.o objs/%.h: dirs\n\nclean:\n\t/bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test\n\n$(EXAMPLE): $(OBJS)\n\t$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)\n\nobjs/%.o: %.cpp dirs $(ISPC_HEADER)\n\t$(CXX) $< $(CXXFLAGS) -c -o $@\n\nobjs/%.o: %.c dirs $(ISPC_HEADER)\n\t$(CC) $< $(CCFLAGS) -c -o $@\n\nobjs/%.o: ../%.cpp dirs\n\t$(CXX) $< $(CXXFLAGS) -c -o $@\n\nobjs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs\n\nobjs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o objs/%_ispc_avx512knl.o objs/%_ispc_avx512skx.o : %.ispc dirs\n\t$(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h\n\nobjs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC)\n\t$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h\n\nobjs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp\n\t$(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@\n\n$(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o\n\t$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)\n\nobjs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC)\n\t$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h\n\nobjs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp\n\t$(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@\n\n$(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o\n\t$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)\n\nobjs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)\n\t$(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1\n\n$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o\n\t$(CXX) $(CXXFLAGS) -o $@ $^ 
$(LIBS)\n"
  },
  {
    "path": "examples/stencil/volta/stencil.cpp",
    "content": "/*\n  Copyright (c) 2010-2014, Intel Corporation\n  All rights reserved.\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
\n*/\n\n#ifdef _MSC_VER\n#define _CRT_SECURE_NO_WARNINGS\n#define NOMINMAX\n#pragma warning (disable: 4244)\n#pragma warning (disable: 4305)\n#endif\n\n#include <cstdlib>\n#include <stdio.h>\n#include <algorithm>\n#include <string.h>\n#include <math.h>\n#include \"../timing.h\"\n#include \"stencil_ispc.h\"\nusing namespace ispc;\n\n\nextern void loop_stencil_serial(int t0, int t1, int x0, int x1,\n                                int y0, int y1, int z0, int z1,\n                                int Nx, int Ny, int Nz,\n                                const float coef[5], \n                                const float vsq[],\n                                float Aeven[], float Aodd[]);\n\n\nvoid InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {\n    int offset = 0;\n    for (int z = 0; z < Nz; ++z)\n        for (int y = 0; y < Ny; ++y)\n            for (int x = 0; x < Nx; ++x, ++offset) {\n                A[0][offset] = (x < Nx / 2) ? x / float(Nx) : y / float(Ny);\n                A[1][offset] = 0;\n                vsq[offset] = x*y*z / float(Nx * Ny * Nz);\n            }\n}\n\n\nint main(int argc, char *argv[]) {\n    static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here\n    int Nx = 256, Ny = 256, Nz = 256;\n    int width = 4;\n\n    if (argc > 1) {\n        if (strncmp(argv[1], \"--scale=\", 8) == 0) {\n            float scale = atof(argv[1] + 8);\n            Nx *= scale;\n            Ny *= scale;\n            Nz *= scale;\n        }\n    }\n    if ((argc == 4) || (argc == 5)) {\n        for (int i = 0; i < 3; i++) {\n            test_iterations[i] = atoi(argv[argc - 3 + i]);\n        }\n    }\n\n    float *Aserial[2], *Aispc[2];\n    Aserial[0] = new float [Nx * Ny * Nz];\n    Aserial[1] = new float [Nx * Ny * Nz];\n    Aispc[0] = new float [Nx * Ny * Nz];\n    Aispc[1] = new float [Nx * Ny * Nz];\n    float *vsq = new float [Nx * Ny * Nz];\n\n    float coeff[4] = { 0.5, -.25, .125, -.0625 }; \n\n    InitData(Nx, 
Ny, Nz, Aispc, vsq);\n    //\n    // Compute the image using the ispc implementation on one core; report\n    // the minimum time of three runs.\n    //\n    double minTimeISPC = 1e30;\n    for (unsigned int i = 0; i < test_iterations[0]; ++i) {\n        reset_and_start_timer();\n        loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,\n                          width, Nz - width, Nx, Ny, Nz, coeff, vsq,\n                          Aispc[0], Aispc[1]);\n        double dt = get_elapsed_mcycles();\n        printf(\"@time of ISPC run:\\t\\t\\t[%.3f] million cycles\\n\", dt);\n        minTimeISPC = std::min(minTimeISPC, dt);\n    }\n\n    printf(\"[stencil ispc 1 core]:\\t\\t[%.3f] million cycles\\n\", minTimeISPC);\n\n    InitData(Nx, Ny, Nz, Aispc, vsq);\n\n    //\n    // Compute the image using the ispc implementation with tasks; report\n    // the minimum time of three runs.\n    //\n    double minTimeISPCTasks = 1e30;\n    for (unsigned int i = 0; i < test_iterations[1]; ++i) {\n        reset_and_start_timer();\n        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,\n                                width, Nz - width, Nx, Ny, Nz, coeff, vsq,\n                                Aispc[0], Aispc[1]);\n        double dt = get_elapsed_mcycles();\n        printf(\"@time of ISPC + TASKS run:\\t\\t\\t[%.3f] million cycles\\n\", dt);\n        minTimeISPCTasks = std::min(minTimeISPCTasks, dt);\n    }\n\n    printf(\"[stencil ispc + tasks]:\\t\\t[%.3f] million cycles\\n\", minTimeISPCTasks);\n\n    InitData(Nx, Ny, Nz, Aserial, vsq);\n\n    // \n    // And run the serial implementation 3 times, again reporting the\n    // minimum time.\n    //\n    double minTimeSerial = 1e30;\n    for (unsigned int i = 0; i < test_iterations[2]; ++i) {\n        reset_and_start_timer();\n        loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,\n                            width, Nz - width, Nx, Ny, Nz, coeff, vsq,\n                            
Aserial[0], Aserial[1]);\n        double dt = get_elapsed_mcycles();\n        printf(\"@time of serial run:\\t\\t\\t[%.3f] million cycles\\n\", dt);\n        minTimeSerial = std::min(minTimeSerial, dt);\n    }\n\n    printf(\"[stencil serial]:\\t\\t[%.3f] million cycles\\n\", minTimeSerial);\n\n    printf(\"\\t\\t\\t\\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\\n\",\n           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);\n\n    // Check for agreement\n    int offset = 0;\n    for (int z = 0; z < Nz; ++z)\n        for (int y = 0; y < Ny; ++y)\n            for (int x = 0; x < Nx; ++x, ++offset) {\n                float error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /\n                                    Aserial[1][offset]);\n                if (error > 1e-4)\n                    printf(\"Error @ (%d,%d,%d): ispc = %f, serial = %f\\n\",\n                           x, y, z, Aispc[1][offset], Aserial[1][offset]);\n            }\n\n    return 0;\n}\n"
  },
  {
    "path": "examples/stencil/volta/stencil.ispc",
    "content": "/*\n  Copyright (c) 2010-2011, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
\n*/\n\nstatic void\nstencil_step(uniform int x0, uniform int x1,\n             uniform int y0, uniform int y1,\n             uniform int z0, uniform int z1,\n             uniform int Nx, uniform int Ny, uniform int Nz,\n             uniform const float coef[4], uniform const float vsq[],\n             uniform const float Ain[], uniform float Aout[]) {\n    const uniform int Nxy = Nx * Ny;\n\n    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {\n        int index = (z * Nxy) + (y * Nx) + x;\n#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]\n#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]\n        float div = coef[0] * A_cur(0, 0, 0) +\n            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +\n                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +\n                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +\n            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +\n                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +\n                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +\n            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +\n                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +\n                       A_cur(0, 0, +3) + A_cur(0, 0, -3));\n\n        A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + \n            vsq[index] * div;\n    }\n}\n\nstatic task void\nstencil_step_task(uniform int x0, uniform int x1,\n                  uniform int y0, uniform int y1,\n                  uniform int z0,\n                  uniform int Nx, uniform int Ny, uniform int Nz,\n                  uniform const float coef[4], uniform const float vsq[],\n                  uniform const float Ain[], uniform float Aout[]) {\n    stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1,\n                 Nx, Ny, Nz, coef, vsq, Ain, Aout);\n}\n\n\nexport void\nloop_stencil_ispc_tasks(uniform int t0, uniform int t1, \n                        uniform int x0, uniform int x1,\n                        uniform int 
y0, uniform int y1,\n                        uniform int z0, uniform int z1,\n                        uniform int Nx, uniform int Ny, uniform int Nz,\n                        uniform const float coef[4], \n                        uniform const float vsq[],\n                        uniform float Aeven[], uniform float Aodd[])\n{\n    for (uniform int t = t0; t < t1; ++t) {\n        // Parallelize across cores as well: each task will work on a slice\n        // of 1 in the z extent of the volume.\n        if ((t & 1) == 0)\n            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, \n                                            coef, vsq, Aeven, Aodd);\n        else\n            launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, \n                                            coef, vsq, Aodd, Aeven);\n\n        // We need to wait for all of the launched tasks to finish before\n        // starting the next iteration.\n        sync;\n    }\n}\n\nexport void\nloop_stencil_ispc(uniform int t0, uniform int t1, \n                  uniform int x0, uniform int x1,\n                  uniform int y0, uniform int y1,\n                  uniform int z0, uniform int z1,\n                  uniform int Nx, uniform int Ny, uniform int Nz,\n                  uniform const float coef[4], \n                  uniform const float vsq[],\n                  uniform float Aeven[], uniform float Aodd[])\n{\n    for (uniform int t = t0; t < t1; ++t) {\n        if ((t & 1) == 0)\n            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, \n                         Aeven, Aodd);\n        else\n            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, \n                         Aodd, Aeven);\n    }\n}\n"
  },
  {
    "path": "examples/stencil/volta/stencil_serial.cpp",
    "content": "/*\n  Copyright (c) 2010-2011, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
\n*/\n\n\nstatic void\nstencil_step_serial(int x0, int x1,\n             int y0, int y1,\n             int z0, int z1,\n             int Nx, int Ny, int Nz,\n             const float coef[4], const float vsq[],\n             const float Ain[], float Aout[]) {\n    int Nxy = Nx * Ny;\n\n    for (int z = z0; z < z1; ++z) {\n        for (int y = y0; y < y1; ++y) {\n            for (int x = x0; x < x1; ++x) {\n                int index = (z * Nxy) + (y * Nx) + x;\n#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]\n#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]\n                float div = coef[0] * A_cur(0, 0, 0) +\n                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +\n                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +\n                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +\n                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +\n                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +\n                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +\n                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +\n                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +\n                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));\n\n                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +\n                    vsq[index] * div;\n            }\n        }\n    }\n}\n\n\nvoid loop_stencil_serial(int t0, int t1,\n                         int x0, int x1,\n                         int y0, int y1,\n                         int z0, int z1,\n                         int Nx, int Ny, int Nz,\n                         const float coef[4],\n                         const float vsq[],\n                         float Aeven[], float Aodd[])\n{\n    for (int t = t0; t < t1; ++t) {\n        if ((t & 1) == 0)\n          stencil_step_serial(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, 
coef, vsq,\n                         Aeven, Aodd);\n        else\n            stencil_step_serial(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,\n                         Aodd, Aeven);\n    }\n}\n"
  },
  {
    "path": "examples/stencil/volta/tasksys.cpp",
    "content": "/*\n  Copyright (c) 2011-2012, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  \n*/\n\n/*\n  This file implements simple task systems that provide the three\n  entrypoints used by ispc-generated to code to handle 'launch' and 'sync'\n  statements in ispc programs.  
See the section \"Task Parallelism: Language\n  Syntax\" in the ispc documentation for information about using task\n  parallelism in ispc programs, and see the section \"Task Parallelism:\n  Runtime Requirements\" for information about the task-related entrypoints\n  that are implemented here.\n\n  There are several task systems in this file, built using:\n    - Microsoft's Concurrency Runtime (ISPC_USE_CONCRT)\n    - Apple's Grand Central Dispatch (ISPC_USE_GCD)\n    - bare pthreads (ISPC_USE_PTHREADS, ISPC_USE_PTHREADS_FULLY_SUBSCRIBED)\n    - Cilk Plus (ISPC_USE_CILK)\n    - TBB (ISPC_USE_TBB_TASK_GROUP, ISPC_USE_TBB_PARALLEL_FOR)\n    - OpenMP (ISPC_USE_OMP)\n    - HPX (ISPC_USE_HPX)\n\n  The task system implementation can be selected at compile time, by defining \n  the appropriate preprocessor symbol on the command line (for e.g.: -D ISPC_USE_TBB).\n  Not all combinations of platform and task system are meaningful.\n  If no task system is requested, a reasonable default task system for the platform\n  is selected.  Here are the task systems that can be selected:\n\n#define ISPC_USE_GCD\n#define ISPC_USE_CONCRT\n#define ISPC_USE_PTHREADS\n#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED\n#define ISPC_USE_CILK\n#define ISPC_USE_OMP\n#define ISPC_USE_TBB_TASK_GROUP\n#define ISPC_USE_TBB_PARALLEL_FOR\n\n  The ISPC_USE_PTHREADS_FULLY_SUBSCRIBED model essentially takes over the machine\n  by assigning one pthread to each hyper-thread, and then uses spinlocks and atomics\n  for task management.  This model is useful for KNC where tasks can take over \n  the machine, but less so when there are other tasks that need running on the machine.\n\n#define ISPC_USE_CREW\n#define ISPC_USE_HPX\n  The HPX model requires the HPX runtime environment to be set up. This can be\n  done manually, e.g. 
with hpx::init, or by including hpx/hpx_main.hpp which\n  uses the main() function as entry point and sets up the runtime system.\n  Number of threads can be specified as commandline parameter with\n  --hpx:threads, use \"all\" to spawn one thread per processing unit.\n\n*/\n\n#if !(defined ISPC_USE_CONCRT          || defined ISPC_USE_GCD              || \\\n      defined ISPC_USE_PTHREADS        || defined ISPC_USE_PTHREADS_FULLY_SUBSCRIBED || \\\n      defined ISPC_USE_TBB_TASK_GROUP  || defined ISPC_USE_TBB_PARALLEL_FOR || \\\n      defined ISPC_USE_OMP             || defined ISPC_USE_CILK             || \\\n      defined ISPC_USE_HPX)\n\n    // If no task model chosen from the compiler cmdline, pick a reasonable default\n    #if defined(_WIN32) || defined(_WIN64)\n      #define ISPC_USE_CONCRT\n    #elif defined(__linux__)\n    #define ISPC_USE_PTHREADS\n    #elif defined(__APPLE__)\n      #define ISPC_USE_GCD\n    #endif\n    #if defined(__KNC__)\n      #define ISPC_USE_PTHREADS\n    #endif\n\n#endif // No task model specified on compiler cmdline\n\n#if defined(_WIN32) || defined(_WIN64)\n#define ISPC_IS_WINDOWS\n#elif defined(__linux__)\n#define ISPC_IS_LINUX\n#elif defined(__APPLE__)\n#define ISPC_IS_APPLE\n#endif\n#if defined(__KNC__)\n#define ISPC_IS_KNC\n#endif\n\n\n#define DBG(x) \n\n#ifdef ISPC_IS_WINDOWS\n  #define NOMINMAX\n  #include <windows.h>\n#endif // ISPC_IS_WINDOWS\n#ifdef ISPC_USE_CONCRT\n  #include <concrt.h>\n  using namespace Concurrency;\n#endif // ISPC_USE_CONCRT\n#ifdef ISPC_USE_GCD\n  #include <dispatch/dispatch.h>\n  #include <pthread.h>\n#endif // ISPC_USE_GCD\n#ifdef ISPC_USE_PTHREADS\n  #include <pthread.h>\n  #include <semaphore.h>\n  #include <unistd.h>\n  #include <fcntl.h>\n  #include <errno.h>\n  #include <sys/types.h>\n  #include <sys/stat.h>\n  #include <sys/param.h>\n  #include <sys/sysctl.h>\n  #include <vector>\n  #include <algorithm>\n#endif // ISPC_USE_PTHREADS\n#ifdef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED\n#include 
<pthread.h>\n#include <semaphore.h>\n#include <unistd.h>\n#include <fcntl.h>\n#include <errno.h>\n#include <sys/types.h>\n#include <sys/stat.h>\n#include <sys/param.h>\n#include <sys/sysctl.h>\n#include <vector>\n#include <algorithm>\n//#include <stdexcept>\n#include <stack>\n#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED\n#ifdef ISPC_USE_TBB_PARALLEL_FOR\n  #include <tbb/parallel_for.h>\n#endif // ISPC_USE_TBB_PARALLEL_FOR\n#ifdef ISPC_USE_TBB_TASK_GROUP\n  #include <tbb/task_group.h>\n#endif // ISPC_USE_TBB_TASK_GROUP\n#ifdef ISPC_USE_CILK\n  #include <cilk/cilk.h>\n#endif // ISPC_USE_TBB\n#ifdef ISPC_USE_OMP\n  #include <omp.h>\n#endif // ISPC_USE_OMP\n#ifdef ISPC_USE_HPX\n#include <hpx/include/async.hpp>\n#include <hpx/lcos/wait_all.hpp>\n#endif // ISPC_USE_HPX\n#ifdef ISPC_IS_LINUX\n  #include <malloc.h>\n#endif // ISPC_IS_LINUX\n\n#include <stdio.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <assert.h>\n#include <string.h>\n#include <algorithm>\n\n// Signature of ispc-generated 'task' functions\ntypedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,\n                             int taskIndex, int taskCount,\n                             int taskIndex0, int taskIndex1, int taskIndex2,\n                             int taskCount0, int taskCount1, int taskCount2);\n\n// Small structure used to hold the data for each task\n#ifdef _MSC_VER\n__declspec(align(16))\n#endif\nstruct TaskInfo {\n    TaskFuncType func;\n    void *data;\n    int taskIndex;\n    int taskCount3d[3];\n#if defined(  ISPC_USE_CONCRT)\n    event taskEvent;\n#endif\n    int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; }\n    int taskIndex0() const \n    {\n      return taskIndex % taskCount3d[0];\n    }\n    int taskIndex1() const \n    {\n      return ( taskIndex / taskCount3d[0] ) % taskCount3d[1];\n    }\n    int taskIndex2() const \n    {\n      return taskIndex / ( taskCount3d[0]*taskCount3d[1] );\n    }\n    int taskCount0() const { 
return taskCount3d[0]; }\n    int taskCount1() const { return taskCount3d[1]; }\n    int taskCount2() const { return taskCount3d[2]; }\n    TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); }\n}\n#ifndef _MSC_VER\n__attribute__((aligned(32)));\n#endif\n;\n\n// ispc expects these functions to have C linkage / not be mangled\nextern \"C\" { \n    void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz);\n    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);\n    void ISPCSync(void *handle);\n}\n\n///////////////////////////////////////////////////////////////////////////\n// TaskGroupBase\n\n#define LOG_TASK_QUEUE_CHUNK_SIZE 14\n#define MAX_TASK_QUEUE_CHUNKS 8\n#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)\n\n#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)\n\n#define NUM_MEM_BUFFERS 16\n\nclass TaskGroup;\n\n/** The TaskGroupBase structure provides common functionality for \"task\n    groups\"; a task group is the set of tasks launched from within a single\n    ispc function.  When the function is ready to return, it waits for all\n    of the tasks in its task group to finish before it actually returns.\n */\nclass TaskGroupBase {\npublic:\n    void Reset();\n\n    int AllocTaskInfo(int count);\n    TaskInfo *GetTaskInfo(int index);\n\n    void *AllocMemory(int64_t size, int32_t alignment);\n\nprotected:\n    TaskGroupBase();\n    ~TaskGroupBase();\n\n    int nextTaskInfoIndex;\n\nprivate:\n    /* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as\n       needed by the calling function.  We hold up to MAX_TASK_QUEUE_CHUNKS\n       of these (and then exit at runtime if more than this many tasks are\n       launched.)\n     */\n    TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];\n\n    /* We also allocate chunks of memory to service ISPCAlloc() calls.  The\n       memBuffers[] array holds pointers to this memory.  
The first element\n       of this array is initialized to point to mem and then any subsequent\n       elements required are initialized with dynamic allocation.\n     */\n    int curMemBuffer, curMemBufferOffset;\n    int memBufferSize[NUM_MEM_BUFFERS];\n    char *memBuffers[NUM_MEM_BUFFERS];\n    char mem[256];\n};\n\n\ninline TaskGroupBase::TaskGroupBase() { \n    nextTaskInfoIndex = 0; \n\n    curMemBuffer = 0; \n    curMemBufferOffset = 0;\n    memBuffers[0] = mem;\n    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);\n    for (int i = 1; i < NUM_MEM_BUFFERS; ++i) {\n        memBuffers[i] = NULL;\n        memBufferSize[i] = 0;\n    }\n\n    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)\n        taskInfo[i] = NULL;\n}\n\n\ninline TaskGroupBase::~TaskGroupBase() {\n    // Note: don't delete memBuffers[0], since it points to the start of\n    // the \"mem\" member!\n    for (int i = 1; i < NUM_MEM_BUFFERS; ++i)\n        delete[](memBuffers[i]);\n}\n\n\ninline void\nTaskGroupBase::Reset() {\n    nextTaskInfoIndex = 0; \n    curMemBuffer = 0; \n    curMemBufferOffset = 0;\n}\n\n\ninline int\nTaskGroupBase::AllocTaskInfo(int count) {\n    int ret = nextTaskInfoIndex;\n    nextTaskInfoIndex += count;\n    return ret;\n}\n\n\ninline TaskInfo *\nTaskGroupBase::GetTaskInfo(int index) {\n    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);\n    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);\n\n    if (chunk == MAX_TASK_QUEUE_CHUNKS) {\n        fprintf(stderr, \"A total of %d tasks have been launched from the \"\n                \"current function--the simple built-in task system can handle \"\n                \"no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE \"\n                \"and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation.  \"\n                \"Sorry!  
Exiting.\\n\", index);\n        exit(1);\n    }\n\n    if (taskInfo[chunk] == NULL)\n        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];\n    return &taskInfo[chunk][offset];\n}\n\n\ninline void *\nTaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {\n    char *basePtr = memBuffers[curMemBuffer];\n    intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset);\n    iptr = (iptr + (alignment-1)) & ~(alignment-1);\n\n    int newOffset = int(iptr - (intptr_t)basePtr + size);\n    if (newOffset < memBufferSize[curMemBuffer]) {\n        curMemBufferOffset = newOffset;\n        return (char *)iptr;\n    }\n\n    ++curMemBuffer;\n    curMemBufferOffset = 0;\n    assert(curMemBuffer < NUM_MEM_BUFFERS);\n\n    int allocSize = 1 << (12 + curMemBuffer);\n    allocSize = std::max(int(size+alignment), allocSize);\n    char *newBuf = new char[allocSize];\n    memBufferSize[curMemBuffer] = allocSize;\n    memBuffers[curMemBuffer] = newBuf;\n    return AllocMemory(size, alignment);\n}\n\n\n///////////////////////////////////////////////////////////////////////////\n// Atomics and the like\n\nstatic inline void\nlMemFence() {\n    // Windows atomic functions already contain the fence\n    // KNC doesn't need the memory barrier\n#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS\n    __sync_synchronize();\n#endif\n}\n\nstatic void *\nlAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {\n#ifdef ISPC_IS_WINDOWS\n    return InterlockedCompareExchangePointer(v, newValue, oldValue);\n#else\n    void *result = __sync_val_compare_and_swap(v, oldValue, newValue);\n    lMemFence();\n    return result;\n#endif // ISPC_IS_WINDOWS\n}\n\nstatic int32_t \nlAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {\n#ifdef ISPC_IS_WINDOWS\n    return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);\n#else\n    int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue);\n    lMemFence();\n    return 
result;\n#endif // ISPC_IS_WINDOWS\n}\n\nstatic inline int32_t \nlAtomicAdd(volatile int32_t *v, int32_t delta) {\n#ifdef ISPC_IS_WINDOWS\n    return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;\n#else\n    return __sync_fetch_and_add(v, delta);\n#endif\n}\n\n///////////////////////////////////////////////////////////////////////////\n\n#ifdef ISPC_USE_CONCRT\n// With ConcRT, we don't need to extend TaskGroupBase at all.\nclass TaskGroup : public TaskGroupBase {\npublic:\n    void Launch(int baseIndex, int count);\n    void Sync();\n};\n#endif // ISPC_USE_CONCRT\n\n#ifdef ISPC_USE_GCD\n/* With Grand Central Dispatch, we associate a GCD dispatch group with each\n   task group.  (We'll later wait on this dispatch group when we need to\n   wait on all of the tasks in the group to finish.)\n */\nclass TaskGroup : public TaskGroupBase {\npublic:\n    TaskGroup() {\n        gcdGroup = dispatch_group_create();\n    }\n\n    void Launch(int baseIndex, int count);\n    void Sync();\n\nprivate:\n    dispatch_group_t gcdGroup;\n};\n#endif // ISPC_USE_GCD\n\n#ifdef ISPC_USE_PTHREADS\nstatic void *lTaskEntry(void *arg);\n\nclass TaskGroup : public TaskGroupBase {\npublic:\n    TaskGroup() {\n        numUnfinishedTasks = 0;\n        waitingTasks.reserve(128);\n        inActiveList = false;\n    }\n\n    void Reset() {\n        TaskGroupBase::Reset();\n        numUnfinishedTasks = 0;\n        assert(inActiveList == false);\n        lMemFence();\n    }\n\n    void Launch(int baseIndex, int count);\n    void Sync();\n\nprivate:\n    friend void *lTaskEntry(void *arg);\n\n    int32_t numUnfinishedTasks;\n    int32_t pad[3];\n    std::vector<int> waitingTasks;\n    bool inActiveList;\n};\n\n#endif // ISPC_USE_PTHREADS\n\n#ifdef ISPC_USE_CILK\n\nclass TaskGroup : public TaskGroupBase {\npublic:\n    void Launch(int baseIndex, int count);\n    void Sync();\n\n};\n\n#endif // ISPC_USE_CILK\n\n#ifdef ISPC_USE_OMP\n\nclass TaskGroup : public TaskGroupBase {\npublic:\n    void 
Launch(int baseIndex, int count);\n    void Sync();\n\n};\n\n#endif // ISPC_USE_OMP\n\n#ifdef ISPC_USE_TBB_PARALLEL_FOR\n\nclass TaskGroup : public TaskGroupBase {\npublic:\n    void Launch(int baseIndex, int count);\n    void Sync();\n\n};\n\n#endif // ISPC_USE_TBB_PARALLEL_FOR\n\n#ifdef ISPC_USE_TBB_TASK_GROUP\n\nclass TaskGroup : public TaskGroupBase {\npublic:\n    void Launch(int baseIndex, int count);\n    void Sync();\nprivate:\n    tbb::task_group tbbTaskGroup;\n};\n\n#endif // ISPC_USE_TBB_TASK_GROUP\n\n#ifdef ISPC_USE_HPX\n\nclass TaskGroup : public TaskGroupBase {\npublic:\n    void Launch(int baseIndex, int count);\n    void Sync();\nprivate:\n    std::vector<hpx::future<void>> futures;\n};\n\n#endif // ISPC_USE_HPX\n\n///////////////////////////////////////////////////////////////////////////\n\n///////////////////////////////////////////////////////////////////////////\n// Grand Central Dispatch\n\n#ifdef ISPC_USE_GCD\n\n/* A simple task system for ispc programs based on Apple's Grand Central\n   Dispatch. 
*/\n\nstatic dispatch_queue_t gcdQueue;\nstatic volatile int32_t lock = 0;\n\nstatic void\nInitTaskSystem() {\n    if (gcdQueue != NULL)\n        return;\n\n    while (1) {\n        if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {\n            if (gcdQueue == NULL) {\n                gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);\n                assert(gcdQueue != NULL);\n                lMemFence();\n            }\n            lock = 0;\n            break;\n        }\n    }\n}\n\n\nstatic void\nlRunTask(void *ti) {\n    TaskInfo *taskInfo = (TaskInfo *)ti;\n    // FIXME: these are bogus values; may cause bugs in code that depends\n    // on them having unique values in different threads.\n    int threadIndex = 0;\n    int threadCount = 1;\n\n    // Actually run the task\n    taskInfo->func(taskInfo->data, threadIndex, threadCount, \n                   taskInfo->taskIndex, taskInfo->taskCount(),\n            taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(),\n            taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2());\n}\n\n\ninline void\nTaskGroup::Launch(int baseIndex, int count) {\n    for (int i = 0; i < count; ++i) {\n        TaskInfo *ti = GetTaskInfo(baseIndex + i);\n        dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);\n    }\n}\n\n\ninline void\nTaskGroup::Sync() {\n    dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);\n}\n\n#endif // ISPC_USE_GCD\n\n///////////////////////////////////////////////////////////////////////////\n// Concurrency Runtime\n\n#ifdef ISPC_USE_CONCRT\n\nstatic void\nInitTaskSystem() {\n    // No initialization needed\n}\n\n\nstatic void __cdecl\nlRunTask(LPVOID param) {\n    TaskInfo *ti = (TaskInfo *)param;\n    \n    // Actually run the task. 
\n    // FIXME: like the GCD implementation for OS X, this is passing bogus\n    // values for the threadIndex and threadCount builtins, which in turn\n    // will cause bugs in code that uses those.\n    int threadIndex = 0;\n    int threadCount = 1;\n    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),\n            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),\n            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());\n\n    // Signal the event that this task is done\n    ti->taskEvent.set();\n}\n\n\ninline void\nTaskGroup::Launch(int baseIndex, int count) {\n    for (int i = 0; i < count; ++i)\n        CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i));\n}\n\n\ninline void\nTaskGroup::Sync() {\n    for (int i = 0; i < nextTaskInfoIndex; ++i) {\n        TaskInfo *ti = GetTaskInfo(i);\n        ti->taskEvent.wait();\n        ti->taskEvent.reset();\n    }\n}\n\n#endif // ISPC_USE_CONCRT\n\n///////////////////////////////////////////////////////////////////////////\n// pthreads\n\n#ifdef ISPC_USE_PTHREADS\n\nstatic volatile int32_t lock = 0;\n\nstatic int nThreads;\nstatic pthread_t *threads = NULL;\n\nstatic pthread_mutex_t taskSysMutex;\nstatic std::vector<TaskGroup *> activeTaskGroups;\nstatic sem_t *workerSemaphore;\n\nstatic void *\nlTaskEntry(void *arg) {\n    int threadIndex = (int)((int64_t)arg);\n    int threadCount = nThreads;\n\n    while (1) {\n        int err;\n        //\n        // Wait on the semaphore until we're woken up due to the arrival of\n        // more work.\n        //\n        if ((err = sem_wait(workerSemaphore)) != 0) {\n            fprintf(stderr, \"Error from sem_wait: %s\\n\", strerror(err));\n            exit(1);\n        }\n\n        //\n        // Acquire the mutex\n        //\n        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {\n            fprintf(stderr, \"Error from pthread_mutex_lock: %s\\n\", strerror(err));\n            exit(1);\n        }\n\n        if 
(activeTaskGroups.size() == 0) {\n            //\n            // Task queue is empty, go back and wait on the semaphore\n            //\n            if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {\n                fprintf(stderr, \"Error from pthread_mutex_unlock: %s\\n\", strerror(err));\n                exit(1);\n            }\n            continue;\n        }\n\n        //\n        // Get the last task group on the active list and the last task\n        // from its waiting tasks list.\n        //\n        TaskGroup *tg = activeTaskGroups.back();\n        assert(tg->waitingTasks.size() > 0);\n        int taskNumber = tg->waitingTasks.back();\n        tg->waitingTasks.pop_back();\n\n        if (tg->waitingTasks.size() == 0) {\n            // We just took the last task from this task group, so remove\n            // it from the active list.\n            activeTaskGroups.pop_back();\n            tg->inActiveList = false;\n        }\n    \n        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {\n            fprintf(stderr, \"Error from pthread_mutex_unlock: %s\\n\", strerror(err));\n            exit(1);\n        }\n\n        //\n        // And now actually run the task\n        //\n        DBG(fprintf(stderr, \"running task %d from group %p\\n\", taskNumber, tg));\n        TaskInfo *myTask = tg->GetTaskInfo(taskNumber);\n        myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,\n                     myTask->taskCount(),\n            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),\n            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());\n\n        //\n        // Decrement the \"number of unfinished tasks\" counter in the task\n        // group.\n        //\n        lMemFence();\n        lAtomicAdd(&tg->numUnfinishedTasks, -1);\n    }\n\n    pthread_exit(NULL);\n    return 0;\n}\n\n\nstatic void\nInitTaskSystem() {\n    if (threads == NULL) {\n        while (1) {\n            if 
(lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {\n                if (threads == NULL) {\n                    // We launch one fewer thread than there are cores,\n                    // since the main thread here will also grab jobs from\n                    // the task queue itself.\n                    nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1;\n\n                    int err;\n                    if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {\n                        fprintf(stderr, \"Error creating mutex: %s\\n\", strerror(err));\n                        exit(1);\n                    }\n\n                    char name[32];\n                    bool success = false;\n                    srand(time(NULL));\n                    for (int i = 0; i < 10; i++) {\n                        sprintf(name, \"ispc_task.%d.%d\", (int)getpid(), (int)rand());\n                        workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);\n                        if (workerSemaphore != SEM_FAILED) {\n                            success = true;\n                            break;\n                        }\n                        fprintf(stderr, \"Failed to create %s\\n\", name);\n                    }\n\n                    if (!success) {\n                        fprintf(stderr, \"Error creating semaphore (%s): %s\\n\", name, strerror(errno));\n                        exit(1);\n                    }\n\n                    threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));\n                    for (int i = 0; i < nThreads; ++i) {\n                      err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)((long long)i));\n                        if (err != 0) {\n                            fprintf(stderr, \"Error creating pthread %d: %s\\n\", i, strerror(err));\n                            exit(1);\n                        }\n                    }\n\n                    activeTaskGroups.reserve(64);\n                }\n\n                // 
Make sure all of the above goes to memory before we\n                // clear the lock.\n                lMemFence();\n                lock = 0;\n                break;\n            }\n        }\n    }\n}\n\n\ninline void\nTaskGroup::Launch(int baseCoord, int count) {\n    //\n    // Acquire mutex, add task\n    //\n    int err;\n    if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {\n        fprintf(stderr, \"Error from pthread_mutex_lock: %s\\n\", strerror(err));\n        exit(1);\n    }\n\n    // Add the corresponding set of tasks to the waiting-to-be-run list for\n    // this task group.\n    //\n    // FIXME: it's a little ugly to hold a global mutex for this when we\n    // only need to make sure no one else is accessing this task group's\n    // waitingTasks list.  (But a small experiment in switching to a\n    // per-TaskGroup mutex showed worse performance!)\n    for (int i = 0; i < count; ++i)\n        waitingTasks.push_back(baseCoord + i);\n\n    // Add the task group to the global active list if it isn't there\n    // already.\n    if (inActiveList == false) {\n        activeTaskGroups.push_back(this);\n        inActiveList = true;\n    }\n\n    if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {\n        fprintf(stderr, \"Error from pthread_mutex_unlock: %s\\n\", strerror(err));\n        exit(1);\n    }\n\n    //\n    // Update the count of the number of tasks left to run in this task\n    // group.\n    //\n    lMemFence();\n    lAtomicAdd(&numUnfinishedTasks, count);\n\n    //\n    // Post to the worker semaphore to wake up worker threads that are\n    // sleeping waiting for tasks to show up\n    //\n    for (int i = 0; i < count; ++i)\n        if ((err = sem_post(workerSemaphore)) != 0) {\n            fprintf(stderr, \"Error from sem_post: %s\\n\", strerror(err));\n            exit(1);\n        }\n}\n\n\ninline void\nTaskGroup::Sync() {\n    DBG(fprintf(stderr, \"syncing %p - %d unfinished\\n\", tg, numUnfinishedTasks));\n\n    while 
(numUnfinishedTasks > 0) {\n        // All of the tasks in this group aren't finished yet.  We'll try\n        // to help out here since we don't have anything else to do...\n\n        DBG(fprintf(stderr, \"while syncing %p - %d unfinished\\n\", tg, \n                    numUnfinishedTasks));\n\n        //\n        // Acquire the global task system mutex to grab a task to work on\n        //\n        int err;\n        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {\n            fprintf(stderr, \"Error from pthread_mutex_lock: %s\\n\", strerror(err));\n            exit(1);\n        }\n\n        TaskInfo *myTask = NULL;\n        TaskGroup *runtg = this;\n        if (waitingTasks.size() > 0) {\n            int taskNumber = waitingTasks.back();\n            waitingTasks.pop_back();\n\n            if (waitingTasks.size() == 0) {\n                // There's nothing left to start running from this group,\n                // so remove it from the active task list.\n                activeTaskGroups.erase(std::find(activeTaskGroups.begin(),\n                                                 activeTaskGroups.end(), this));\n                inActiveList = false;\n            }\n            myTask = GetTaskInfo(taskNumber);\n            DBG(fprintf(stderr, \"running task %d from group %p in sync\\n\", taskNumber, tg));\n        }\n        else {\n            // Other threads are already working on all of the tasks in\n            // this group, so we can't help out by running one ourself.\n            // We'll try to run one from another group to make ourselves\n            // useful here.\n            if (activeTaskGroups.size() == 0) {\n                // No active task groups left--there's nothing for us to do.\n                if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {\n                    fprintf(stderr, \"Error from pthread_mutex_unlock: %s\\n\", strerror(err));\n                    exit(1);\n                }\n                // FIXME: We basically end 
up busy-waiting here, which is\n                // extra wasteful in a world with hyper-threading.  It would\n                // be much better to put this thread to sleep on a\n                // condition variable that was signaled when the last task\n                // in this group was finished.\n#ifndef ISPC_IS_KNC\n                usleep(1);\n#else\n                _mm_delay_32(8);\n#endif\n                continue;\n            }\n\n            // Get a task to run from another task group.\n            runtg = activeTaskGroups.back();\n            assert(runtg->waitingTasks.size() > 0);\n\n            int taskNumber = runtg->waitingTasks.back();\n            runtg->waitingTasks.pop_back();\n            if (runtg->waitingTasks.size() == 0) {\n                // There's left to start running from this group, so remove\n                // it from the active task list.\n                activeTaskGroups.pop_back();\n                runtg->inActiveList = false;\n            }\n            myTask = runtg->GetTaskInfo(taskNumber);\n            DBG(fprintf(stderr, \"running task %d from other group %p in sync\\n\", \n                        taskNumber, runtg));\n        }\n\n        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {\n            fprintf(stderr, \"Error from pthread_mutex_unlock: %s\\n\", strerror(err));\n            exit(1);\n        }\n    \n        //\n        // Do work for _myTask_\n        //\n        // FIXME: bogus values for thread index/thread count here as well..\n        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(),\n            myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(),\n            myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2());\n\n        //\n        // Decrement the number of unfinished tasks counter\n        //\n        lMemFence();\n        lAtomicAdd(&runtg->numUnfinishedTasks, -1);\n    }\n    DBG(fprintf(stderr, \"sync for %p done!n\", tg));\n}\n\n#endif // 
ISPC_USE_PTHREADS\n\n///////////////////////////////////////////////////////////////////////////\n// Cilk Plus\n\n#ifdef ISPC_USE_CILK\n\nstatic void\nInitTaskSystem() {\n    // No initialization needed\n}\n\ninline void\nTaskGroup::Launch(int baseIndex, int count) {\n    cilk_for(int i = 0; i < count; i++) {\n        TaskInfo *ti = GetTaskInfo(baseIndex + i);\n\n        // Actually run the task. \n        // Cilk does not expose the task -> thread mapping so we pretend it's 1:1\n        ti->func(ti->data, ti->taskIndex, ti->taskCount(),\n            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),\n            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());\n    }\n}\n\ninline void\nTaskGroup::Sync() {\n}\n\n#endif // ISPC_USE_CILK\n\n///////////////////////////////////////////////////////////////////////////\n// OpenMP\n\n#ifdef ISPC_USE_OMP\n\nstatic void\nInitTaskSystem() {\n        // No initialization needed\n}\n\ninline void\nTaskGroup::Launch(int baseIndex, int count) {\n#pragma omp parallel\n  {\n    const int threadIndex = omp_get_thread_num();\n    const int threadCount = omp_get_num_threads();\n\n#pragma omp for schedule(runtime)\n    for(int i = 0; i < count; i++) \n    {\n        TaskInfo *ti = GetTaskInfo(baseIndex + i);\n\n        // Actually run the task. 
\n        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),\n            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),\n            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());\n    }\n  }\n}\n\ninline void\nTaskGroup::Sync() {\n}\n\n#endif // ISPC_USE_OMP\n\n///////////////////////////////////////////////////////////////////////////\n// Thread Building Blocks\n\n#ifdef ISPC_USE_TBB_PARALLEL_FOR\n\nstatic void\nInitTaskSystem() {\n    // No initialization needed by default\n    //tbb::task_scheduler_init();\n}\n\ninline void\nTaskGroup::Launch(int baseIndex, int count) {\n    tbb::parallel_for(0, count, [=](int i) {\n        TaskInfo *ti = GetTaskInfo(baseIndex + i);\n\n        // Actually run the task. \n        // TBB does not expose the task -> thread mapping so we pretend it's 1:1\n        int threadIndex = ti->taskIndex;\n        int threadCount = ti->taskCount();\n\n        ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),\n            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),\n            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());\n    });\n}\n\ninline void\nTaskGroup::Sync() {\n}\n\n#endif // ISPC_USE_TBB_PARALLEL_FOR\n\n#ifdef ISPC_USE_TBB_TASK_GROUP\n\nstatic void\nInitTaskSystem() {\n    // No initialization needed by default\n    //tbb::task_scheduler_init();\n}\n\ninline void\nTaskGroup::Launch(int baseIndex, int count) {\n    for (int i = 0; i < count; i++) {\n        tbbTaskGroup.run([=]() {\n            TaskInfo *ti = GetTaskInfo(baseIndex + i);\n\n            // TBB does not expose the task -> thread mapping so we pretend it's 1:1\n            int threadIndex = ti->taskIndex;\n            int threadCount = ti->taskCount();\n            ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),\n            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),\n            ti->taskCount0(), ti->taskCount1(), ti->taskCount2());\n        
});\n    }\n}\n\ninline void\nTaskGroup::Sync() {\n    tbbTaskGroup.wait();\n}\n\n#endif // ISPC_USE_TBB_TASK_GROUP\n\n///////////////////////////////////////////////////////////////////////////\n// ISPC_USE_HPX\n\n#ifdef ISPC_USE_HPX\n\nstatic void\nInitTaskSystem() {\n}\n\ninline void\nTaskGroup::Launch(int baseIndex, int count) {\n    for (int i = 0; i < count; ++i) {\n        TaskInfo *ti = GetTaskInfo(baseIndex + i);\n        int threadIndex = i;\n        int threadCount = count;\n        futures.push_back(hpx::async(ti->func, ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(),\n            ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(),\n            ti->taskCount0(), ti->taskCount1(), ti->taskCount2()));\n    }\n}\n\ninline void\nTaskGroup::Sync() {\n    hpx::wait_all(futures);\n    futures.clear();\n}\n#endif\n///////////////////////////////////////////////////////////////////////////\n\n#ifndef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED\n\n#define MAX_FREE_TASK_GROUPS 64\nstatic TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];\n\nstatic inline TaskGroup *\nAllocTaskGroup() {\n    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {\n        TaskGroup *tg = freeTaskGroups[i];\n        if (tg != NULL) {\n            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);\n            if (ptr != NULL) {\n                return (TaskGroup *)ptr;\n            }\n        }\n    }\n\n    return new TaskGroup;\n}\n\n\nstatic inline void\nFreeTaskGroup(TaskGroup *tg) {\n    tg->Reset();\n\n    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {\n        if (freeTaskGroups[i] == NULL) {\n            void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);\n            if (ptr == NULL)\n                return;\n        }\n    }\n\n    delete tg;\n}\n\n///////////////////////////////////////////////////////////////////////////\n\nvoid\nISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int 
count1, int count2) {\n    const int count = count0*count1*count2;\n    TaskGroup *taskGroup;\n    if (*taskGroupPtr == NULL) {\n        InitTaskSystem();\n        taskGroup = AllocTaskGroup();\n        *taskGroupPtr = taskGroup;\n    }\n    else\n        taskGroup = (TaskGroup *)(*taskGroupPtr);\n\n    int baseIndex = taskGroup->AllocTaskInfo(count);\n    for (int i = 0; i < count; ++i) {\n        TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);\n        ti->func = (TaskFuncType)func;\n        ti->data = data;\n        ti->taskIndex = i;\n        ti->taskCount3d[0] = count0;\n        ti->taskCount3d[1] = count1;\n        ti->taskCount3d[2] = count2;\n    }\n    taskGroup->Launch(baseIndex, count);\n}\n\n\nvoid\nISPCSync(void *h) {\n    TaskGroup *taskGroup = (TaskGroup *)h;\n    if (taskGroup != NULL) {\n        taskGroup->Sync();\n        FreeTaskGroup(taskGroup);\n    }\n}\n\n\nvoid *\nISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {\n    TaskGroup *taskGroup;\n    if (*taskGroupPtr == NULL) {\n        InitTaskSystem();\n        taskGroup = AllocTaskGroup();\n        *taskGroupPtr = taskGroup;\n    }\n    else\n        taskGroup = (TaskGroup *)(*taskGroupPtr);\n\n    return taskGroup->AllocMemory(size, alignment);\n}\n\n#else  // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED\n\n#define MAX_LIVE_TASKS 1024\n\npthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;\n\n// Small structure used to hold the data for each task\nstruct Task {\npublic:\n    TaskFuncType func;\n    void *data;\n    volatile int32_t taskIndex;\n    int taskCount;\n\n    volatile int numDone;\n    int liveIndex; // index in live task queue\n\n    inline int  noMoreWork() { return taskIndex >= taskCount; }\n    /*! 
given thread is done working on this task --> decrease num locks */\n    // inline void lock() { lAtomicAdd(&locks,1); }\n    // inline void unlock() { lAtomicAdd(&locks,-1); }\n    inline int  nextJob() { return lAtomicAdd(&taskIndex,1); }\n    inline int  numJobs() { return taskCount; }\n    inline void schedule(int idx) { taskIndex = 0; numDone = 0; liveIndex = idx; }\n    inline void run(int idx, int threadIdx);\n    inline void markOneDone() { lAtomicAdd(&numDone,1); }\n    inline void wait()\n    {\n        while (!noMoreWork()) {\n            int next = nextJob();\n            if (next < numJobs()) run(next, 0);\n        }\n        while (numDone != taskCount) {\n#ifndef ISPC_IS_KNC\n            usleep(1);\n#else\n            _mm_delay_32(8);\n#endif\n        }\n    }\n};\n\n///////////////////////////////////////////////////////////////////////////\nclass TaskSys {\n    static int numThreadsRunning;\n    struct LiveTask\n    {\n        volatile int locks; /*!< num locks on this task. gets\n                                 initialized to NUM_THREADS+1, then counted\n                                 down by every thread that sees this. this\n                                 value is only valid when 'active' is set\n                                 to true */\n        volatile int active; /*! workers will spin on this until it\n                                 becomes active */\n        Task *task;\n\n        inline void doneWithThis() { lAtomicAdd(&locks,-1); }\n        LiveTask() : active(0), locks(-1) {}\n    };\n\npublic:\n    volatile int nextScheduleIndex; /*! 
next index in the task queue\n                                        where we'll insert a live task */\n\n    // inline int inc_begin() { int old = begin; begin = (begin+1)%MAX_TASKS; return old; }\n    // inline int inc_end() { int old = end; end = (end+1)%MAX_TASKS; return old; }\n\n    LiveTask taskQueue[MAX_LIVE_TASKS];\n    std::stack<Task *> taskMem;\n\n    static TaskSys *global;\n\n    TaskSys() : nextScheduleIndex(0)\n    {\n        TaskSys::global = this;\n        Task *mem = new Task[MAX_LIVE_TASKS]; //< could actually be more than _live_ tasks\n        for (int i=0;i<MAX_LIVE_TASKS;i++) {\n            taskMem.push(mem+i);\n        }\n        createThreads();\n    }\n\n    inline Task *allocOne()\n    {\n        pthread_mutex_lock(&mutex);\n        if (taskMem.empty()) {\n            fprintf(stderr, \"Too many live tasks.  \"\n                    \"Change the value of MAX_LIVE_TASKS and recompile.\\n\");\n            exit(1);\n        }\n        Task *task = taskMem.top();\n        taskMem.pop();\n        pthread_mutex_unlock(&mutex);\n        return task;\n    }\n\n    static inline void init()\n    {\n        if (global) return;\n        pthread_mutex_lock(&mutex);\n        if (global == NULL) global = new TaskSys;\n        pthread_mutex_unlock(&mutex);\n    }\n\n    void createThreads();\n    int nThreads;\n    pthread_t *thread;\n\n    void threadFct();\n\n    inline void schedule(Task *t)\n    {\n        pthread_mutex_lock(&mutex);\n        int liveIndex = nextScheduleIndex;\n        nextScheduleIndex = (nextScheduleIndex+1)%MAX_LIVE_TASKS;\n        if (taskQueue[liveIndex].active) {\n            fprintf(stderr, \"Out of task queue resources.  
\"\n                    \"Change the value of MAX_LIVE_TASKS and recompile.\\n\");\n            exit(1);\n        }\n        taskQueue[liveIndex].task = t;\n        t->schedule(liveIndex);\n        taskQueue[liveIndex].locks = numThreadsRunning+1; // num _worker_ threads plus creator\n        taskQueue[liveIndex].active = true;\n        pthread_mutex_unlock(&mutex);\n    }\n\n    void sync(Task *task)\n    {\n        task->wait();\n        int liveIndex = task->liveIndex;\n        while (taskQueue[liveIndex].locks > 1) {\n#ifndef ISPC_IS_KNC\n            usleep(1);\n#else\n            _mm_delay_32(8);\n#endif\n        }\n        _mm_free(task->data);\n        pthread_mutex_lock(&mutex);\n        taskMem.push(task); // recycle task index\n        taskQueue[liveIndex].active = false;\n        pthread_mutex_unlock(&mutex);\n    }\n};\n\n\nvoid TaskSys::threadFct() \n{\n    int myIndex = 0; //lAtomicAdd(&threadIdx,1);\n    while (1) {\n        while (!taskQueue[myIndex].active) {\n#ifndef ISPC_IS_KNC\n            usleep(4);\n#else\n            _mm_delay_32(32);\n#endif\n            continue;\n        }\n\n        Task *mine = taskQueue[myIndex].task;\n        while (!mine->noMoreWork()) {\n            int job = mine->nextJob();\n            if (job >= mine->numJobs()) break;\n            mine->run(job,myIndex);\n        }\n        taskQueue[myIndex].doneWithThis();\n        myIndex = (myIndex+1)%MAX_LIVE_TASKS;\n    }\n}\n\n\ninline void Task::run(int idx, int threadIdx) {\n    (*this->func)(data,threadIdx,TaskSys::global->nThreads,idx,taskCount);\n    markOneDone();\n}\n\n\nvoid *_threadFct(void *data) {\n    ((TaskSys*)data)->threadFct();\n    return NULL;\n}\n\n\nvoid TaskSys::createThreads() \n{\n    init();\n    int reserved = 4;\n    int minid = 2;\n    nThreads = sysconf(_SC_NPROCESSORS_ONLN) - reserved;\n\n    thread = (pthread_t *)malloc(nThreads * sizeof(pthread_t));\n\n    numThreadsRunning = 0;\n    for (int i = 0; i < nThreads; ++i) {\n        
pthread_attr_t attr;\n        pthread_attr_init(&attr);\n        pthread_attr_setstacksize(&attr, 2*1024 * 1024);\n\n        int threadID = minid+i;\n        cpu_set_t cpuset;\n        CPU_ZERO(&cpuset);\n        CPU_SET(threadID,&cpuset);\n        int ret = pthread_attr_setaffinity_np(&attr,sizeof(cpuset),&cpuset);\n\n        int err = pthread_create(&thread[i], &attr, &_threadFct, this);\n        ++numThreadsRunning;\n        if (err != 0) {\n            fprintf(stderr, \"Error creating pthread %d: %s\\n\", i, strerror(err));\n            exit(1);\n        }\n    }\n}\n\nTaskSys * TaskSys::global = NULL;\nint TaskSys::numThreadsRunning = 0;\n\n///////////////////////////////////////////////////////////////////////////\n\nvoid ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) \n{\n    Task *ti = *(Task**)taskGroupPtr;\n    ti->func = (TaskFuncType)func;\n    ti->data = data;\n    ti->taskIndex = 0;\n    ti->taskCount = count;\n    TaskSys::global->schedule(ti);\n}\n\nvoid ISPCSync(void *h) \n{\n    Task *task = (Task *)h; \n    assert(task);\n    TaskSys::global->sync(task);\n}\n\nvoid *ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) \n{\n    TaskSys::init();\n    Task *task = TaskSys::global->allocOne();\n    *taskGroupPtr = task;\n    task->data = _mm_malloc(size,alignment);\n    return task->data;//*taskGroupPtr;\n}\n\n#endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED\n"
  },
  {
    "path": "examples/stencil/volta/timing.h",
    "content": "/*\n  Copyright (c) 2010-2011, Intel Corporation\n  All rights reserved.\n\n  Redistribution and use in source and binary forms, with or without\n  modification, are permitted provided that the following conditions are\n  met:\n\n    * Redistributions of source code must retain the above copyright\n      notice, this list of conditions and the following disclaimer.\n\n    * Redistributions in binary form must reproduce the above copyright\n      notice, this list of conditions and the following disclaimer in the\n      documentation and/or other materials provided with the distribution.\n\n    * Neither the name of Intel Corporation nor the names of its\n      contributors may be used to endorse or promote products derived from\n      this software without specific prior written permission.\n\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\n   IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\n   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\n   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER\n   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
\n*/\n\n#include <stdint.h>\n\n#ifdef __arm__\n#include <sys/time.h>\n// There's no easy way to get a hardware clock counter on ARM, so instead\n// we'll pretend it's a 1GHz processor and then compute pretend cycles\n// based on elapsed time from gettimeofday().\n__inline__ uint64_t rdtsc() {\n  static bool first = true;\n  static struct timeval tv_start;\n  if (first) {\n    gettimeofday(&tv_start, NULL);\n    first = false;\n    return 0;\n  }\n\n  struct timeval tv;\n  gettimeofday(&tv, NULL);\n  tv.tv_sec -= tv_start.tv_sec;\n  tv.tv_usec -= tv_start.tv_usec;\n  return (1000000ull * tv.tv_sec + tv.tv_usec) * 1000ull;\n}\n\n#include <sys/time.h>\nstatic inline double rtc(void)\n{\n  struct timeval Tvalue;\n  double etime;\n  struct timezone dummy;\n\n  gettimeofday(&Tvalue,&dummy);\n  etime =  (double) Tvalue.tv_sec +\n    1.e-6*((double) Tvalue.tv_usec);\n  return etime;\n}\n\n#else // __arm__\n\n#ifdef WIN32\n#include <windows.h>\n#define rdtsc __rdtsc\n#else // WIN32\n__inline__ uint64_t rdtsc() {\n  uint32_t low, high;\n#ifdef __x86_64\n  __asm__ __volatile__ (\"xorl %%eax,%%eax \\n    cpuid\"\n                        ::: \"%rax\", \"%rbx\", \"%rcx\", \"%rdx\" );\n#else\n  __asm__ __volatile__ (\"xorl %%eax,%%eax \\n    cpuid\"\n                        ::: \"%eax\", \"%ebx\", \"%ecx\", \"%edx\" );\n#endif\n  __asm__ __volatile__ (\"rdtsc\" : \"=a\" (low), \"=d\" (high));\n  return (uint64_t)high << 32 | low;\n}\n\n#include <sys/time.h>\nstatic inline double rtc(void)\n{\n  struct timeval Tvalue;\n  double etime;\n  struct timezone dummy;\n\n  gettimeofday(&Tvalue,&dummy);\n  etime =  (double) Tvalue.tv_sec +\n    1.e-6*((double) Tvalue.tv_usec);\n  return etime;\n}\n\n#endif // !WIN32\n#endif // !__arm__            \n            \nstatic uint64_t start,  end;\nstatic double  tstart, tend;\n\nstatic inline void reset_and_start_timer()\n{\n    start = rdtsc();\n#ifndef WIN32\n    // Unused in Windows build, rtc() causing link errors\n    tstart = 
rtc();\n#endif\n}\n\n/* Returns the number of millions of elapsed processor cycles since the\n   last reset_and_start_timer() call. */\nstatic inline double get_elapsed_mcycles()\n{\n    end = rdtsc();\n    return (end-start) / (1024. * 1024.);\n}\n\n#ifndef WIN32\n// Unused in Windows build, rtc() causing link errors\nstatic inline double get_elapsed_msec()\n{\n    tend = rtc();\n    return (tend - tstart)*1e3;\n}\n#endif\n"
  },
  {
    "path": "examples/triangle_xform/Cargo.toml",
    "content": "[package]\nname = \"triangle_xform\"\nversion = \"0.1.0\"\nauthors = [\"Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dependencies]\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\n\n[dev-dependencies]\nrand = \"0.7.0\"\ntime = \"0.1.40\"\n"
  },
  {
    "path": "examples/triangle_xform/readme.md",
    "content": "# Transforming triangle vertices using a transformation matrix\n\n## Description\n\nThis example contains the SIMD implementation of a common computer graphics task:\ntransforming vertices with a matrix.\n\n## Implementation\n\nThere are two implementations:\n\n- scalar version, uses an array-of-structures layout, where each triangle contains\n  three vertices, and each vertex contains only a 3D position vector; the algorithm\n  operates on **one triangle at a time**.\n\n- SIMD version, uses a structure-of-arrays layout, where the structure contains, for\n  each of the X, Y, and Z components of a 3D vector, an array of their values; the\n  algorithm operates on **up to N triangles at once**, where N is the number of lanes in a\n  SIMD register.\n\nTo simplify the implementation, the transformation matrix is composed only of simple\nrotation, scaling and translation matrices.\n\nBoth implementations are single-threaded. They can be easily parallelized using [rayon]\nand dividing the list of triangles into chunks.\n\n[rayon]: https://github.com/rayon-rs/rayon\n\n## Benchmark results\n\nThis crate is mainly intended for educational purposes, since performance improvements\nwill likely come from using the transformed triangles in SIMD layout further down the\npipeline.\n\nIn order to compare the generated results, the tests will convert the SIMD output back\ninto a scalar representation.\n\nThat being said, the crate's tests also come with a micro-benchmark.\nIt is recommended to increase the `TRIANGLE_COUNT` constant to the point where\nyou get accurate benchmark results.\n\nRun the unit tests in release mode, and with `stdout` capture disabled:\n\n```sh\ncargo test --release -- --nocapture\n```\n\nBenchmark results on an Intel i5 with AVX, for 2^24 triangles:\n\n| algorithm |  time  |\n|-----------|--------|\n|  scalar   | 255 ms |\n|  simd     | 237 ms |\n\n(**Note**: the benchmark does not take into account the time required for transforming\nthe data 
into an SIMD layout)\n\nSIMD is a mere 7% faster than the scalar algorithm, since LLVM was already able to\nvectorize most of the multiplication code. Since we're not doing a lot of processing\non the triangles after transforming them, this \"benchmark\" is very limited by memory\nbandwidth.\n"
  },
  {
    "path": "examples/triangle_xform/src/lib.rs",
    "content": "#![allow(clippy::must_use_candidate)]\n\n/// Simple matrix type.\n/// The memory layout is the same as the one for Direct3D/OpenGL: fourth vector\n/// represents the translation vector `[x, y, z]`.\ntype Matrix = [[f32; 3]; 4];\n\n/// Scalar implementation of the triangle transform.\npub mod scalar;\n/// SIMD implementation of the triangle transform.\npub mod simd;\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use rand::prelude::*;\n\n    const TRIANGLE_COUNT: usize = 1 << 5;\n\n    #[test]\n    fn compare_scalar_simd() {\n        let dist = rand::distributions::Standard;\n        let mut rng = thread_rng();\n\n        // Generate a random triangle\n        let triangles = dist\n            .sample_iter(&mut rng)\n            .take(TRIANGLE_COUNT)\n            .collect::<Vec<scalar::Triangle>>();\n\n        // Generate a random matrix\n        let mat: Matrix = dist.sample(&mut rng);\n\n        // Benchmark scalar performance\n        let mut scalar_xformed = Vec::new();\n        let scalar_dur = time::Duration::span(|| {\n            scalar_xformed = triangles\n                .iter()\n                .map(|tri| tri.transform(mat))\n                .collect::<Vec<_>>();\n        });\n\n        // Convert the random triangles to a structure-of-arrays format.\n        let triangles = triangles\n            .chunks(simd::VecF::lanes())\n            .map(|tris| simd::Triangle::pack(tris))\n            .collect::<Vec<_>>();\n\n        // Benchmark SIMD performance\n        let mut simd_xformed = Vec::new();\n        let simd_dur = time::Duration::span(|| {\n            simd_xformed = triangles\n                .iter()\n                .map(|tri| tri.transform(mat))\n                .collect::<Vec<_>>();\n        });\n\n        println!(\"scalar: {} ms\", scalar_dur.num_milliseconds());\n        println!(\"simd: {} ms\", simd_dur.num_milliseconds());\n\n        // Convert SIMD results back to AOS layout for comparison test\n        let 
simd_xformed = simd_xformed\n            .into_iter()\n            .flat_map(|tri| tri.unpack())\n            .collect::<Vec<_>>();\n\n        const EPSILON: f32 = 1E-5;\n\n        if scalar_xformed != simd_xformed {\n            scalar_xformed.into_iter().zip(simd_xformed.into_iter()).for_each(\n                |(a, b)| {\n                    if a != b {\n                        a.0.iter().zip(b.0.iter()).for_each(\n                            |(v1, v2)| {\n                                v1.iter().zip(v2.iter()).for_each(\n                                    |(a, b)| {\n                                        assert!(\n                                            (a - b).abs() <= EPSILON,\n                                            \"Vertex components do not match\"\n                                        );\n                                    },\n                                );\n                            },\n                        );\n                    }\n                },\n            );\n        }\n    }\n}\n"
  },
  {
    "path": "examples/triangle_xform/src/scalar.rs",
    "content": "use super::Matrix;\n\n/// Vertex data: a single 3D vector of floats, representing position.\npub type Vertex = [f32; 3];\n\n/// Triangle type for array-of-structs layout.\n#[derive(Debug, Default, Copy, Clone, PartialEq)]\npub struct Triangle(pub [Vertex; 3]);\n\nimpl Triangle {\n    /// Transforms this triangle by multiplying with a matrix.\n    #[inline]\n    pub fn transform(self, mat: Matrix) -> Self {\n        let mut xformed: [Vertex; 3] = Default::default();\n\n        let vertices = self.0;\n\n        let col_a = mat[0];\n        let col_b = mat[1];\n        let col_c = mat[2];\n        let col_d = mat[3];\n\n        for k in 0..3 {\n            let v = vertices[k];\n\n            let x =\n                col_a[0] * v[0] + col_b[0] * v[1] + col_c[0] * v[2] + col_d[0];\n            let y =\n                col_a[1] * v[0] + col_b[1] * v[1] + col_c[1] * v[2] + col_d[1];\n            let z =\n                col_a[2] * v[0] + col_b[2] * v[1] + col_c[2] * v[2] + col_d[2];\n\n            xformed[k] = [x, y, z];\n        }\n\n        Self(xformed)\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use super::*;\n    use rand::{distributions::Standard, prelude::*};\n\n    impl Distribution<Triangle> for Standard {\n        fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Triangle {\n            Triangle(self.sample(rng))\n        }\n    }\n\n    #[test]\n    fn translate() {\n        let tri =\n            Triangle([[-0.5, -0.5, 0.0], [0.5, -0.5, 0.0], [0.0, 0.5, 0.0]]);\n\n        let (x, y, z) = (-0.25, 0.5, 1.0);\n\n        let matrix =\n            [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [x, y, z]];\n\n        let tri = tri.transform(matrix);\n\n        let expected =\n            Triangle([[-0.75, 0.0, 1.0], [0.25, 0.0, 1.0], [-0.25, 1.0, 1.0]]);\n\n        assert_eq!(tri, expected);\n    }\n}\n"
  },
  {
    "path": "examples/triangle_xform/src/simd.rs",
    "content": "use super::Matrix;\n\n/// SIMD vector of floats\npub type VecF = packed_simd::f32x8;\n\n/// SIMD batch of N triangles, where N is SIMD width.\n#[derive(Debug, Default, Copy, Clone)]\npub struct Triangle {\n    pub x: [VecF; 3],\n    pub y: [VecF; 3],\n    pub z: [VecF; 3],\n}\n\nimpl Triangle {\n    /// Combines N scalar triangles into a single SIMD triangle.\n    pub fn pack(tris: &[crate::scalar::Triangle]) -> Self {\n        assert_eq!(tris.len(), VecF::lanes());\n\n        let mut x = [VecF::splat(0.0); 3];\n        let mut y = [VecF::splat(0.0); 3];\n        let mut z = [VecF::splat(0.0); 3];\n        (0..3).for_each(|k| {\n            let x = &mut x[k];\n            let y = &mut y[k];\n            let z = &mut z[k];\n\n            (0..VecF::lanes()).for_each(|i| {\n                let t = tris[i];\n                let vertex = t.0[k];\n                let tx = vertex[0];\n                let ty = vertex[1];\n                let tz = vertex[2];\n\n                *x = x.replace(i, tx);\n                *y = y.replace(i, ty);\n                *z = z.replace(i, tz);\n            });\n        });\n\n        Self { x, y, z }\n    }\n\n    /// Unpacks the N scalar triangles into an array-of-structures layout.\n    pub fn unpack(self) -> Vec<crate::scalar::Triangle> {\n        let mut tris = [crate::scalar::Triangle::default(); VecF::lanes()];\n\n        (0..3).for_each(|k| {\n            (0..VecF::lanes()).for_each(|i| {\n                let vtx = &mut tris[i].0;\n                vtx[k][0] = self.x[k].extract(i);\n                vtx[k][1] = self.y[k].extract(i);\n                vtx[k][2] = self.z[k].extract(i);\n            });\n        });\n\n        tris.to_vec()\n    }\n\n    /// Transforms this triangle by multiplying with a matrix.\n    #[inline]\n    pub fn transform(self, mat: Matrix) -> Self {\n        let mut tri = Self::default();\n\n        let x = self.x;\n        let y = self.y;\n        let z = self.z;\n\n        let col_a = 
mat[0];\n        let col_b = mat[1];\n        let col_c = mat[2];\n        let col_d = mat[3];\n\n        for k in 0..3 {\n            let x = x[k];\n            let y = y[k];\n            let z = z[k];\n\n            tri.x[k] = col_a[0] * x + col_b[0] * y + col_c[0] * z + col_d[0];\n            tri.y[k] = col_a[1] * x + col_b[1] * y + col_c[1] * z + col_d[1];\n            tri.z[k] = col_a[2] * x + col_b[2] * y + col_c[2] * z + col_d[2];\n        }\n\n        tri\n    }\n}\n"
  },
  {
    "path": "micro_benchmarks/Cargo.toml",
    "content": "[package]\nname = \"micro_benchmarks\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nautobenches = false\nedition = \"2018\"\n\n[dev-dependencies]\npacked_simd = { package = \"packed_simd\", path = \"..\" }\npaste = \"0.1.3\"\ncriterion = \"0.3\"\n\n[profile.bench]\nopt-level = 3\ndebug = false\nlto = 'fat'\ndebug-assertions = false\ncodegen-units = 1\n\n[[bench]]\nname = \"mask_reductions\"\nharness = false\n"
  },
  {
    "path": "micro_benchmarks/benches/mask_reductions.rs",
    "content": "//! Benchmarks for the mask reductions `all`, `any`, and `none`.\n#![deny(rust_2018_idioms)]\n#![feature(test)]\n\nuse packed_simd::*;\nuse test::black_box;\n\nuse criterion::{Benchmark, Criterion, Throughput};\nconst NO_ITERATIONS: u32 = 1_000;\n\nmacro_rules! bench {\n    ($id:ident) => {\n        paste::item! {\n            fn [<$id _all>](c: &mut Criterion) {\n                c.bench(\n                    stringify!($id),\n                    Benchmark::new(\"all\", |b| b.iter(|| {\n                        let mut x: $id = Default::default();\n                        for _ in 0..NO_ITERATIONS {\n                            if black_box(x).all() {\n                                black_box(&mut x);\n                            }\n                        }\n                    })).throughput(Throughput::Elements(NO_ITERATIONS))\n                );\n            }\n            fn [<$id _any>](c: &mut Criterion) {\n                c.bench(\n                    stringify!($id),\n                    Benchmark::new(\"any\", |b| b.iter(|| {\n                        let mut x: $id = Default::default();\n                        for _ in 0..NO_ITERATIONS {\n                            if black_box(x).any() {\n                                black_box(&mut x);\n                            }\n                        }\n                    })).throughput(Throughput::Elements(NO_ITERATIONS))\n                );\n            }\n            fn [<$id _none>](c: &mut Criterion) {\n                c.bench(\n                    stringify!($id),\n                    Benchmark::new(\"none\", |b| b.iter(|| {\n                        let mut x: $id = Default::default();\n                        for _ in 0..NO_ITERATIONS {\n                            if black_box(x).none() {\n                                black_box(&mut x);\n                            }\n                        }\n                    })).throughput(Throughput::Elements(NO_ITERATIONS))\n                
);\n            }\n        }\n    };\n    ($($id:ident),*) => {\n        $( bench!($id); )*\n        paste::item! {\n            criterion_group!(\n                benches,\n                $([<$id _all>]),*, $([<$id _any>]),*, $([<$id _none>]),*\n            );\n        }\n    };\n}\n\nbench!(\n    m8x2, // 16-bit wide types\n    m8x8, m16x4, m32x2, // 64-bit wide types\n    m8x16, m16x8, m32x4, m64x2, m128x1, // 128-bit wide types\n    m8x32, m16x16, m32x8, m64x4, m128x2, // 256-bit wide types\n    m8x64, m16x32, m32x16, m64x8, m128x4 // 512-bit wide types\n);\n\ncriterion_main!(benches);\n"
  },
  {
    "path": "micro_benchmarks/rust-toolchain",
    "content": "nightly"
  },
  {
    "path": "perf-guide/.gitignore",
    "content": "/book\n"
  },
  {
    "path": "perf-guide/book.toml",
    "content": "[book]\nauthors = [\"Gonzalo Brito Gadeschi\", \"Gabriel Majeri\"]\nmultilingual = false\nsrc = \"src\"\ntitle = \"Rust SIMD Performance Guide\"\ndescription = \"This book describes how to write performant SIMD code in Rust.\"\n\n[build]\ncreate-missing = false\n\n[output.html]\nadditional-css = [\"./src/ascii.css\"]\n"
  },
  {
    "path": "perf-guide/src/SUMMARY.md",
    "content": "# Summary\n\n[Introduction](./introduction.md)\n\n- [Floating-point Math](./float-math/fp.md)\n  - [Short-vector Math Library](./float-math/svml.md)\n  - [Approximate functions](./float-math/approx.md)\n  - [Fused multiply-accumulate](./float-math/fma.md)\n\n- [Target features](./target-feature/features.md)\n  - [Using `RUSTFLAGS`](./target-feature/rustflags.md)\n  - [Using the `target_feature` attribute](./target-feature/attribute.md)\n  - [Interaction with inlining](./target-feature/inlining.md)\n  - [Detecting features at runtime](./target-feature/runtime.md)\n\n- [Bounds checking](./bound_checks.md)\n- [Vertical and horizontal operations](./vert-hor-ops.md)\n\n- [Performance profiling](./prof/profiling.md)\n  - [Profiling on Linux](./prof/linux.md)\n  - [Using machine code analyzers](./prof/mca.md)\n"
  },
  {
    "path": "perf-guide/src/ascii.css",
    "content": "code {\n    /* \"Source Code Pro\" breaks ASCII art */\n    font-family: Consolas, \"Ubuntu Mono\", Menlo, \"DejaVu Sans Mono\", monospace;\n}\n"
  },
  {
    "path": "perf-guide/src/bound_checks.md",
    "content": "# Bounds checking\n\nReading and writing packed vectors to/from slices is checked by default.\nIndependently of the configuration options used, the safe functions:\n\n* `Simd<[T; N]>::from_slice_aligned(& s[..])`\n* `Simd<[T; N]>::write_to_slice_aligned(&mut s[..])`\n\nalways check that:\n\n* the slice is big enough to hold the vector\n* the slice is suitably aligned to perform an aligned load/store for a `Simd<[T;\n  N]>` (this alignment is often much larger than that of `T`).\n\nThere are `_unaligned` versions that use unaligned load and stores, as well as\n`unsafe` `_unchecked` that do not perform any checks iff `debug-assertions =\nfalse` / `debug = false`. That is, the `_unchecked` methods do still assert size\nand alignment in debug builds and could also do so in release builds depending\non the configuration options.\n\nThese assertions do often significantly impact performance and you should be\naware of them.\n"
  },
  {
    "path": "perf-guide/src/float-math/approx.md",
    "content": "# Approximate functions\n\n<!-- TODO:\n\nExplain that they exists, that they are often _much_ faster, how to use them,\nthat people should check whether the error is good enough for their\napplications. Explain that this error is currently unstable and might change.\n-->\n"
  },
  {
    "path": "perf-guide/src/float-math/fma.md",
    "content": "# Fused Multiply Add\n\n<!-- TODO:\nExplain that this is a compound operation, infinite precision, difference\nbetween `mul_add` and `mul_adde`, that LLVM cannot do this by itself, etc.\n-->\n"
  },
  {
    "path": "perf-guide/src/float-math/fp.md",
    "content": "# Floating-point math\n\nThis chapter contains information pertaining to working with floating-point numbers.\n"
  },
  {
    "path": "perf-guide/src/float-math/svml.md",
    "content": "# Short Vector Math Library\n\n<!-- TODO:\nExplain how is short-vector math performed by default (just scalarized libm calls).\n\nExplain how to enable `sleef`, etc.\n-->\n"
  },
  {
    "path": "perf-guide/src/introduction.md",
    "content": "# Introduction\n\n## What is SIMD\n\n<!-- TODO:\ndescribe what SIMD is, which algorithms can benefit from it,\ngive usage examples\n-->\n\n## History of SIMD in Rust\n\n<!-- TODO:\ndiscuss history of unstable std::simd,\nstabilization of std::arch, etc.\n-->\n\n## Discover packed_simd\n\n<!-- TODO: describe scope of this project -->\n\nWriting fast and portable SIMD algorithms using `packed_simd` is, unfortunately,\nnot trivial. There are many pitfalls that one should be aware of, and some idioms\nthat help avoid those pitfalls.\n\nThis book attempts to document these best practices and provides practical examples\non how to apply the tips to _your_ code.\n"
  },
  {
    "path": "perf-guide/src/prof/linux.md",
    "content": "# Performance profiling on Linux\n\n## Using `perf`\n\n[perf](https://perf.wiki.kernel.org/) is the most powerful performance profiler\nfor Linux, featuring support for various hardware Performance Monitoring Units,\nas well as integration with the kernel's performance events framework.\n\nWe will only look at how the `perf` command can be used to profile SIMD code.\nFull system profiling is outside of the scope of this book.\n\n### Recording\n\nThe first step is to record a program's execution during an average workload.\nIt helps if you can isolate the parts of your program which have performance\nissues, and set up a benchmark which can be easily (re)run.\n\nBuild the benchmark binary in release mode, after having enabled debug info:\n\n```sh\n$ cargo build --release\nFinished release [optimized + debuginfo] target(s) in 0.02s\n```\n\nThen use the `perf record` subcommand:\n\n```sh\n$ perf record --call-graph=dwarf ./target/release/my-program\n[ perf record: Woken up 10 times to write data ]\n[ perf record: Captured and wrote 2,356 MB perf.data (292 samples) ]\n```\n\nInstead of using `--call-graph=dwarf`, which can become pretty slow, you can use\n`--call-graph=lbr` if you have a processor with support for Last Branch Record\n(i.e. Intel Haswell and newer).\n\n`perf` will, by default, record the count of CPU cycles it takes to execute\nvarious parts of your program. You can use the `-e` command line option\nto enable other performance events, such as `cache-misses`. Use `perf list`\nto get a list of all hardware counters supported by your CPU.\n\n### Viewing the report\n\nThe next step is getting a bird's eye view of the program's execution.\n`perf` provides a `ncurses`-based interface which will get you started.\n\nUse `perf report` to open a visualization of your program's performance:\n\n```sh\nperf report --hierarchy -M intel\n```\n\n`--hierarchy` will display a tree-like structure of where your program spent\nmost of its time. 
`-M intel` enables disassembly output with Intel syntax, which\nis subjectively more readable than the default AT&T syntax.\n\nHere is the output from profiling the `nbody` benchmark:\n\n```\n- 100,00% nbody\n  - 94,18% nbody\n    + 93,48% [.] nbody_lib::simd::advance\n    + 0,70% [.] nbody_lib::run\n    + 5,06% libc-2.28.so\n```\n\nIf you move with the arrow keys to any node in the tree, you can then press `a`\nto have `perf` _annotate_ that node. This means it will:\n\n- disassemble the function\n\n- associate every instruction with the percentage of time which was spent executing it\n\n- interleave the disassembly with the source code,\n  assuming it found the debug symbols\n  (you can use `s` to toggle this behaviour)\n\n`perf` will, by default, open the instruction which it identified as being the\nhottest spot in the function:\n\n```\n0,76  │ movapd xmm2,xmm0\n0,38  │ movhlps xmm2,xmm0\n      │ addpd  xmm2,xmm0\n      │ unpcklpd xmm1,xmm2\n12,50 │ sqrtpd xmm0,xmm1\n1,52  │ mulpd  xmm0,xmm1\n```\n\nIn this case, `sqrtpd` will be highlighted in red, since that's the instruction\nwhich the CPU spends most of its time executing.\n\n## Using Valgrind\n\nValgrind is a set of tools which initially helped C/C++ programmers find unsafe\nmemory accesses in their code. Nowadays the project also has\n\n- a heap profiler called `massif`\n\n- a cache utilization profiler called `cachegrind`\n\n- a call-graph performance profiler called `callgrind`\n\n<!--\nTODO: explain valgrind's dynamic binary translation, warn about massive\nslowdown, talk about `kcachegrind` for a GUI\n-->\n"
  },
  {
    "path": "perf-guide/src/prof/mca.md",
    "content": "# Machine code analysis tools\n\n## The microarchitecture of modern CPUs\n\nWhile you might have heard of Instruction Set Architectures, such as `x86` or\n`arm` or `mips`, the term _microarchitecture_ (also written here as _µ-arch_),\nrefers to the internal details of an actual family of CPUs, such as Intel's\n_Haswell_ or AMD's _Jaguar_.\n\nReplacing scalar code with SIMD code will improve performance on all CPUs\nsupporting the required vector extensions.\nHowever, due to microarchitectural differences, the actual speed-up at\nruntime might vary.\n\n**Example**: a simple example arises when optimizing for AMD K8 CPUs.\nThe assembly generated for an empty function should look like this:\n\n```asm\nnop\nret\n```\n\nThe `nop` is used to align the `ret` instruction for better performance.\nHowever, the compiler will actually generate the following code:\n\n```asm\nrepz ret\n```\n\nThe `repz` instruction will repeat the following instruction until a certain\ncondition. Of course, in this situation, the function will simply immediately\nreturn, and the `ret` instruction is still aligned.\nHowever, AMD K8's branch predictor performs better with the latter code.\n\nFor those looking to absolutely maximize performance for a certain target µ-arch,\nyou will have to read some CPU manuals, or ask the compiler to do it for you\nwith `-C target-cpu`.\n\n### Summary of CPU internals\n\nModern processors are able to execute instructions out-of-order for better performance,\nby utilizing tricks such as [branch prediction], [instruction pipelining],\nor [superscalar execution].\n\n[branch prediction]: https://en.wikipedia.org/wiki/Branch_predictor\n[instruction pipelining]: https://en.wikipedia.org/wiki/Instruction_pipelining\n[superscalar execution]: https://en.wikipedia.org/wiki/Superscalar_processor\n\nSIMD instructions are also subject to these optimizations, meaning it can get pretty\ndifficult to determine where the slowdown happens.\nFor example, if the 
profiler reports a store operation is slow, one of two things\ncould be happening:\n\n- the store is limited by the CPU's memory bandwidth, which is actually an ideal\n  scenario, all things considered;\n\n- memory bandwidth is nowhere near its peak, but the value to be stored is at the\n  end of a long chain of operations, and this store is where the profiler\n  encountered the pipeline stall;\n\nSince most profilers are simple tools which don't understand the subtleties of\ninstruction scheduling, you should take their reports with a grain of salt.\n\n## Analyzing the machine code\n\nCertain tools have knowledge of internal CPU microarchitecture, i.e. they know\n\n- how many physical [register files] a CPU actually has\n\n- what is the latency / throughput of an instruction\n\n- what [µ-ops] are generated for a set of instructions\n\nand many other architectural details.\n\n[register files]: https://en.wikipedia.org/wiki/Register_file\n[µ-ops]: https://en.wikipedia.org/wiki/Micro-operation\n\nThese tools are therefore able to provide accurate information as to why some\ninstructions are inefficient, and where the bottleneck is.\n\nThe disadvantage is that the output of these tools requires advanced knowledge\nof the target architecture to understand, i.e. they **cannot** point out what\nthe cause of the issue is explicitly.\n\n## Intel's Architecture Code Analyzer (IACA)\n\n[IACA] is a free tool offered by Intel for analyzing the performance of various\ncomputational kernels.\n\nBeing a proprietary, closed source tool, it _only_ supports Intel's µ-arches.\n\n[IACA]: https://software.intel.com/en-us/articles/intel-architecture-code-analyzer\n\n## llvm-mca\n\n<!--\nTODO: once LLVM 7 gets released, write a chapter on using llvm-mca\nwith SIMD disassembly.\n-->\n"
  },
  {
    "path": "perf-guide/src/prof/profiling.md",
    "content": "# Performance profiling\n\nWhile the rest of the book provides practical advice on how to improve the performance\nof SIMD code, this chapter is dedicated to [**performance profiling**][profiling].\nProfiling consists of recording a program's execution in order to identify program\nhotspots.\n\n**Important**: most profilers require debug information in order to accurately\nlink the program hotspots back to the corresponding source code lines. Rust will\ndisable debug info generation by default for optimized builds, but you can change\nthat [in your `Cargo.toml`][cargo-ref].\n\n[profiling]: https://en.wikipedia.org/wiki/Profiling_(computer_programming)\n[cargo-ref]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-profile-sections\n"
  },
  {
    "path": "perf-guide/src/target-feature/attribute.md",
    "content": "# The `target_feature` attribute\n\n<!-- TODO:\nExplain the `#[target_feature]` attribute\n-->\n"
  },
  {
    "path": "perf-guide/src/target-feature/features.md",
    "content": "# Enabling target features\n\nNot all processors of a certain architecture will have SIMD processing units,\nand using a SIMD instruction which is not supported will trigger undefined behavior.\n\nTo allow building safe, portable programs, the Rust compiler will **not**, by default,\ngenerate any sort of vector instructions, unless it can statically determine\nthey are supported. For example, on AMD64, SSE2 support is architecturally guaranteed.\nThe `x86_64-apple-darwin` target enables up to SSSE3. To get a definitive list of\nwhich features are enabled by default on various platforms, refer to the target\nspecifications [in the compiler's source code][targets].\n\n[targets]: https://github.com/rust-lang/rust/tree/master/src/librustc_target/spec\n"
  },
  {
    "path": "perf-guide/src/target-feature/inlining.md",
    "content": "# Inlining\n\n<!-- TODO:\nExplain how the `#[target_feature]` attribute interacts with inlining\n-->\n"
  },
  {
    "path": "perf-guide/src/target-feature/practice.md",
    "content": "# Target features in practice\n\nUsing `RUSTFLAGS` will allow the crate being compiled, as well as all its\ntransitive dependencies to use certain target features.\n\nA technique used to avoid undefined behavior at runtime is to compile and\nship multiple binaries, each compiled with a certain set of features.\nThis might not be feasible in some cases, and can quickly get out of hand\nas more and more vector extensions are added to an architecture.\n\nRust can be more flexible: you can build a single binary/library which automatically\npicks the best supported vector instructions depending on the host machine.\nThe trick consists of monomorphizing parts of the code during building, and then\nusing run-time feature detection to select the right code path when running.\n\n<!-- TODO\nExplain how to create efficient functions that dispatch to different\nimplementations at run-time without issues (e.g. using `#[inline(always)]` for\nthe impls, wrapping in `#[target_feature]`, and then wrapping those in a function\nthat does run-time feature detection).\n-->\n\n**NOTE** (x86 specific): because the AVX (256-bit) registers extend the existing\nSSE (128-bit) registers, mixing SSE and AVX instructions in a program can cause\nperformance issues.\n\nThe solution is to compile all code, even the code written with 128-bit vectors,\nwith the AVX target feature enabled. This will cause the compiler to prefix the\ngenerated instructions with the [VEX] prefix.\n\n[VEX]: https://en.wikipedia.org/wiki/VEX_prefix\n"
  },
  {
    "path": "perf-guide/src/target-feature/runtime.md",
    "content": "# Detecting host features at runtime\n\n<!-- TODO:\nExplain cost (how it works).\n-->\n"
  },
  {
    "path": "perf-guide/src/target-feature/rustflags.md",
    "content": "# Using RUSTFLAGS\n\nOne of the easiest ways to benefit from SIMD is to allow the compiler\nto generate code using certain vector instruction extensions.\n\nThe environment variable `RUSTFLAGS` can be used to pass options for code\ngeneration to the Rust compiler. These flags will affect **all** compiled crates.\n\nThere are two flags which can be used to enable specific vector extensions:\n\n## target-feature\n\n- Syntax: `-C target-feature=<features>`\n\n- Provides the compiler with a comma-separated set of instruction extensions\n  to enable.\n\n  **Example**: Use `-C target-feature=+sse3,+avx` to enable generating instructions\n  for [Streaming SIMD Extensions 3](https://en.wikipedia.org/wiki/SSE3) and\n  [Advanced Vector Extensions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions).\n\n- To list target triples for all targets supported by Rust, use:\n\n  ```sh\n  rustc --print target-list\n  ```\n\n- To list all supported target features for a certain target triple, use:\n\n  ```sh\n  rustc --target=${TRIPLE} --print target-features\n  ```\n\n- Note that all CPU features are independent, and will have to be enabled individually.\n\n  **Example**: Setting `-C target-feature=+avx2` will _not_ enable `fma`, even though\n  all CPUs which support AVX2 also support FMA. To enable both, one has to use\n  `-C target-feature=+avx2,+fma`\n\n- Some features also depend on other features, which need to be enabled for the\n  target instructions to be generated.\n\n  **Example**: Unless `v7` is specified as the target CPU (see below), to enable\n  NEON on ARM it is necessary to use `-C target-feature=+v7,+neon`.\n\n## target-cpu\n\n- Syntax: `-C target-cpu=<cpu>`\n\n- Sets the identifier of a CPU family / model for which to build and optimize the code.\n\n  **Example**: `RUSTFLAGS='-C target-cpu=cortex-a75'`\n\n- To list all supported target CPUs for a certain target triple, use:\n\n  ```sh\n  rustc --target=${TRIPLE} --print target-cpus\n  ```\n\n  **Example**:\n\n  ```sh\n  rustc --target=i686-pc-windows-msvc --print target-cpus\n  ```\n\n- The compiler will translate this into a list of target features. Therefore,\n  individual feature checks (`#[cfg(target_feature = \"...\")]`) will still\n  work properly.\n\n- It will cause the code generator to optimize the generated code for that\n  specific CPU model.\n\n- Using `native` as the CPU model will cause Rust to generate and optimize code\n  for the CPU running the compiler. It is useful when building programs which you\n  plan to only use locally. This should never be used when the generated programs\n  are meant to be run on other computers, such as when packaging for distribution\n  or cross-compiling.\n"
  },
  {
    "path": "perf-guide/src/vert-hor-ops.md",
    "content": "# Vertical and horizontal operations\n\nIn SIMD terminology, each vector has a certain \"width\" (number of lanes).\nA vector processor is able to perform two kinds of operations on a vector:\n\n- Vertical operations:\n  operate on two vectors of the same width, result has same width\n\n**Example**: vertical addition of two `f32x4` vectors\n\n      %0     == | 2 | -3.5 |  0 | 7 |\n                  +     +     +   +\n      %1     == | 4 |  1.5 | -1 | 0 |\n                  =     =     =   =\n    %0 + %1  == | 6 |  -2  | -1 | 7 |\n\n- Horizontal operations:\n  reduce the elements of two vectors in some way,\n  the result's elements combine information from the two original ones\n\n**Example**: horizontal addition of two `u64x2` vectors\n\n      %0     == | 1 |  3 |\n                  └─+───┘\n                    └───────┐\n                            │\n      %1     == | 4 | -1 |  │\n                  └─+──┘    │\n                    └───┐   │\n                        │   │\n                  ┌─────│───┘\n                  ▼     ▼\n    %0 + %1  == | 4 |   3 |\n\n## Performance consideration of horizontal operations\n\nThe result of vertical operations, like vector negation: `-a`, for a given lane,\ndoes not depend on the result of the operation for the other lanes. 
The result\nof horizontal operations, like the vector `sum` reduction: `a.sum()`, depends on\nthe value of all vector lanes.\n\nIn virtually all architectures vertical operations are fast, while horizontal\noperations are, by comparison, very slow.\n\nConsider the following two functions for computing the sum of all `f32` values\nin a slice:\n\n```rust\nfn fast_sum(x: &[f32]) -> f32 {\n    assert!(x.len() % 4 == 0);\n    let mut sum = f32x4::splat(0.); // [0., 0., 0., 0.]\n    for i in (0..x.len()).step_by(4) {\n        sum += f32x4::from_slice_unaligned(&x[i..]);\n    }\n    sum.sum()\n}\n\nfn slow_sum(x: &[f32]) -> f32 {\n    assert!(x.len() % 4 == 0);\n    let mut sum: f32 = 0.;\n    for i in (0..x.len()).step_by(4) {\n        sum += f32x4::from_slice_unaligned(&x[i..]).sum();\n    }\n    sum\n}\n```\n\nThe inner loop over the slice is where the bulk of the work actually happens.\nThere, the `fast_sum` function performs vertical operations into a vector, doing\na single horizontal reduction at the end, while the `slow_sum` function performs\nhorizontal vector operations inside of the loop.\n\nOn all widely-used architectures, `fast_sum` is a large constant factor faster\nthan `slow_sum`. You can run the [slice_sum]() example and see for yourself. On\nthe particular machine tested there the algorithm using the horizontal vector\naddition is 2.7x slower than the one using vertical vector operations!\n"
  },
  {
    "path": "rust-toolchain",
    "content": "nightly\n"
  },
  {
    "path": "rustfmt.toml",
    "content": "max_width = 110\nuse_small_heuristics = \"Max\"\nwrap_comments = true\nedition = \"2018\"\nerror_on_line_overflow = true"
  },
  {
    "path": "src/api/bit_manip.rs",
    "content": "//! Bit manipulations.\n\nmacro_rules! impl_bit_manip {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Returns the number of ones in the binary representation of\n            /// the lanes of `self`.\n            #[inline]\n            pub fn count_ones(self) -> Self {\n                super::codegen::bit_manip::BitManip::ctpop(self)\n            }\n\n            /// Returns the number of zeros in the binary representation of\n            /// the lanes of `self`.\n            #[inline]\n            pub fn count_zeros(self) -> Self {\n                super::codegen::bit_manip::BitManip::ctpop(!self)\n            }\n\n            /// Returns the number of leading zeros in the binary\n            /// representation of the lanes of `self`.\n            #[inline]\n            pub fn leading_zeros(self) -> Self {\n                super::codegen::bit_manip::BitManip::ctlz(self)\n            }\n\n            /// Returns the number of trailing zeros in the binary\n            /// representation of the lanes of `self`.\n            #[inline]\n            pub fn trailing_zeros(self) -> Self {\n                super::codegen::bit_manip::BitManip::cttz(self)\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                #[allow(overflowing_literals)]\n                pub mod [<$id _bit_manip>] {\n                    #![allow(const_item_mutation)]\n                    use super::*;\n\n                    const LANE_WIDTH: usize = mem::size_of::<$elem_ty>() * 8;\n\n                    macro_rules! 
test_func {\n                        ($x:expr, $func:ident) => {{\n                            let mut actual = $x;\n                            for i in 0..$id::lanes() {\n                                actual = actual.replace(\n                                    i,\n                                    $x.extract(i).$func() as $elem_ty\n                                );\n                            }\n                            let expected = $x.$func();\n                            assert_eq!(actual, expected);\n                        }};\n                    }\n\n                    const BYTES: [u8; 64] = [\n                        0, 1, 2, 3, 4, 5, 6, 7,\n                        8, 9, 10, 11, 12, 13, 14, 15,\n                        16, 17, 18, 19, 20, 21, 22, 23,\n                        24, 25, 26, 27, 28, 29, 30, 31,\n                        32, 33, 34, 35, 36, 37, 38, 39,\n                        40, 41, 42, 43, 44, 45, 46, 47,\n                        48, 49, 50, 51, 52, 53, 54, 55,\n                        56, 57, 58, 59, 60, 61, 62, 63,\n                    ];\n\n                    fn load_bytes() -> $id {\n                        let elems: &mut [$elem_ty] = unsafe {\n                            slice::from_raw_parts_mut(\n                                BYTES.as_mut_ptr() as *mut $elem_ty,\n                                $id::lanes(),\n                            )\n                        };\n                        $id::from_slice_unaligned(elems)\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn count_ones() {\n                        test_func!($id::splat(0), count_ones);\n                        test_func!($id::splat(!0), count_ones);\n                        test_func!(load_bytes(), count_ones);\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n     
               #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn count_zeros() {\n                        test_func!($id::splat(0), count_zeros);\n                        test_func!($id::splat(!0), count_zeros);\n                        test_func!(load_bytes(), count_zeros);\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn leading_zeros() {\n                        test_func!($id::splat(0), leading_zeros);\n                        test_func!($id::splat(1), leading_zeros);\n                        // some implementations use `pshufb` which has unique\n                        // behavior when the 8th bit is set.\n                        test_func!($id::splat(0b1000_0010), leading_zeros);\n                        test_func!($id::splat(!0), leading_zeros);\n                        test_func!(\n                            $id::splat(1 << (LANE_WIDTH - 1)),\n                            leading_zeros\n                        );\n                        test_func!(load_bytes(), leading_zeros);\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn trailing_zeros() {\n                        test_func!($id::splat(0), trailing_zeros);\n                        test_func!($id::splat(1), trailing_zeros);\n                        test_func!($id::splat(0b1000_0010), trailing_zeros);\n                        test_func!($id::splat(!0), trailing_zeros);\n                        test_func!(\n                            $id::splat(1 << (LANE_WIDTH - 1)),\n                            trailing_zeros\n                        );\n                        test_func!(load_bytes(), trailing_zeros);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/bitmask.rs",
    "content": "//! Bitmask API\n\nmacro_rules! impl_bitmask {\n    ($id:ident | $ibitmask_ty:ident | ($set:expr, $clear:expr)\n     | $test_tt:tt) => {\n        impl $id {\n            /// Creates a bitmask with the MSB of each vector lane.\n            ///\n            /// If the vector has less than 8 lanes, the bits that do not\n            /// correspond to any vector lanes are cleared.\n            #[inline]\n            pub fn bitmask(self) -> $ibitmask_ty {\n                unsafe { codegen::llvm::simd_bitmask(self.0) }\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                #[cfg(not(\n                    // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/210\n                    target_endian = \"big\"\n                ))]\n                pub mod [<$id _bitmask>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn bitmask() {\n                        // clear all lanes\n                        let vec = $id::splat($clear as _);\n                        let bitmask: $ibitmask_ty = 0;\n                        assert_eq!(vec.bitmask(), bitmask);\n\n                        // set even lanes\n                        let mut vec = $id::splat($clear as _);\n                        for i in 0..$id::lanes() {\n                            if i % 2 == 0 {\n                                vec = vec.replace(i, $set as _);\n                            }\n                        }\n                        // create bitmask with even lanes set:\n                        let mut bitmask: $ibitmask_ty = 0;\n                        for i in 0..$id::lanes() {\n                            if i % 2 == 0 {\n                                bitmask |= 1 << i;\n                            }\n                        }\n                        
assert_eq!(vec.bitmask(), bitmask);\n\n\n                        // set odd lanes\n                        let mut vec = $id::splat($clear as _);\n                        for i in 0..$id::lanes() {\n                            if i % 2 != 0 {\n                                vec = vec.replace(i, $set as _);\n                            }\n                        }\n                        // create bitmask with odd lanes set:\n                        let mut bitmask: $ibitmask_ty = 0;\n                        for i in 0..$id::lanes() {\n                            if i % 2 != 0 {\n                                bitmask |= 1 << i;\n                            }\n                        }\n                        assert_eq!(vec.bitmask(), bitmask);\n\n                        // set all lanes\n                        let vec = $id::splat($set as _);\n                        let mut bitmask: $ibitmask_ty = 0;\n                        for i in 0..$id::lanes() {\n                            bitmask |= 1 << i;\n                        }\n                        assert_eq!(vec.bitmask(), bitmask);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/cast/macros.rs",
    "content": "//! Macros implementing `FromCast`\n\nmacro_rules! impl_from_cast_ {\n    ($id:ident[$test_tt:tt]: $from_ty:ident) => {\n        impl crate::api::cast::FromCast<$from_ty> for $id {\n            #[inline]\n            fn from_cast(x: $from_ty) -> Self {\n                use crate::llvm::simd_cast;\n                debug_assert_eq!($from_ty::lanes(), $id::lanes());\n                Simd(unsafe { simd_cast(x.0) })\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _from_cast_ $from_ty>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn test() {\n                        assert_eq!($id::lanes(), $from_ty::lanes());\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! impl_from_cast {\n    ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => {\n        $(\n            impl_from_cast_!($id[$test_tt]: $from_ty);\n        )*\n    }\n}\n\nmacro_rules! impl_from_cast_mask_ {\n    ($id:ident[$test_tt:tt]: $from_ty:ident) => {\n        impl crate::api::cast::FromCast<$from_ty> for $id {\n            #[inline]\n            fn from_cast(x: $from_ty) -> Self {\n                debug_assert_eq!($from_ty::lanes(), $id::lanes());\n                x.ne($from_ty::default())\n                    .select($id::splat(true), $id::splat(false))\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _from_cast_ $from_ty>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn test() {\n                        assert_eq!($id::lanes(), $from_ty::lanes());\n\n                        let x = $from_ty::default();\n                        let m: $id = x.cast();\n                        assert!(m.none());\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! impl_from_cast_mask {\n    ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => {\n        $(\n            impl_from_cast_mask_!($id[$test_tt]: $from_ty);\n        )*\n    }\n}\n\n#[allow(unused)]\nmacro_rules! impl_into_cast {\n    ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => {\n        $(\n            impl_from_cast_!($from_ty[$test_tt]: $id);\n        )*\n    }\n}\n"
  },
  {
    "path": "src/api/cast/v128.rs",
    "content": "//! `FromCast` and `IntoCast` implementations for portable 128-bit wide vectors\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_from_cast!(i8x16[test_v128]: u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);\nimpl_from_cast!(u8x16[test_v128]: i8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);\nimpl_from_cast_mask!(m8x16[test_v128]: i8x16, u8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);\n\nimpl_from_cast!(\n    i16x8[test_v128]: i8x8,\n    u8x8,\n    m8x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast!(\n    u16x8[test_v128]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast_mask!(\n    m16x8[test_v128]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\n\nimpl_from_cast!(\n    i32x4[test_v128]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    u32x4[test_v128]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    f32x4[test_v128]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    
msizex4\n);\nimpl_from_cast_mask!(\n    m32x4[test_v128]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\n\nimpl_from_cast!(\n    i64x2[test_v128]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    u64x2[test_v128]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    f64x2[test_v128]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast_mask!(\n    m64x2[test_v128]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\n\nimpl_from_cast!(\n    isizex2[test_v128]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    usizex2[test_v128]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    msizex2\n);\nimpl_from_cast_mask!(\n    msizex2[test_v128]: i8x2,\n    
u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2\n);\n\n// FIXME[test_v128]: 64-bit single element vectors into_cast impls\nimpl_from_cast!(i128x1[test_v128]: u128x1, m128x1);\nimpl_from_cast!(u128x1[test_v128]: i128x1, m128x1);\nimpl_from_cast!(m128x1[test_v128]: i128x1, u128x1);\n"
  },
  {
    "path": "src/api/cast/v16.rs",
    "content": "//! `FromCast` and `IntoCast` implementations for portable 16-bit wide vectors\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_from_cast!(\n    i8x2[test_v16]: u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    u8x2[test_v16]: i8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast_mask!(\n    m8x2[test_v16]: i8x2,\n    u8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\n"
  },
  {
    "path": "src/api/cast/v256.rs",
    "content": "//! `FromCast` and `IntoCast` implementations for portable 256-bit wide vectors\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_from_cast!(i8x32[test_v256]: u8x32, m8x32, i16x32, u16x32, m16x32);\nimpl_from_cast!(u8x32[test_v256]: i8x32, m8x32, i16x32, u16x32, m16x32);\nimpl_from_cast_mask!(m8x32[test_v256]: i8x32, u8x32, i16x32, u16x32, m16x32);\n\nimpl_from_cast!(i16x16[test_v256]: i8x16, u8x16, m8x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);\nimpl_from_cast!(u16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, m16x16, i32x16, u32x16, f32x16, m32x16);\nimpl_from_cast_mask!(m16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, u16x16, i32x16, u32x16, f32x16, m32x16);\n\nimpl_from_cast!(\n    i32x8[test_v256]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast!(\n    u32x8[test_v256]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast!(\n    f32x8[test_v256]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast_mask!(\n    m32x8[test_v256]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\n\nimpl_from_cast!(\n    i64x4[test_v256]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    u64x4[test_v256]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n   
 m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    f64x4[test_v256]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast_mask!(\n    m64x4[test_v256]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\n\nimpl_from_cast!(\n    i128x2[test_v256]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    u128x2[test_v256]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast_mask!(\n    m128x2[test_v256]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    m64x2,\n    f64x2,\n    i128x2,\n    u128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\n\nimpl_from_cast!(\n    isizex4[test_v256]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    usizex4[test_v256]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    
u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    msizex4\n);\nimpl_from_cast_mask!(\n    msizex4[test_v256]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4\n);\n"
  },
  {
    "path": "src/api/cast/v32.rs",
    "content": "//! `FromCast` and `IntoCast` implementations for portable 32-bit wide vectors\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_from_cast!(\n    i8x4[test_v32]: u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    u8x4[test_v32]: i8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast_mask!(\n    m8x4[test_v32]: i8x4,\n    u8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\n\nimpl_from_cast!(\n    i16x2[test_v32]: i8x2,\n    u8x2,\n    m8x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    u16x2[test_v32]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast_mask!(\n    m16x2[test_v32]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\n"
  },
  {
    "path": "src/api/cast/v512.rs",
    "content": "//! `FromCast` and `IntoCast` implementations for portable 512-bit wide vectors\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_from_cast!(i8x64[test_v512]: u8x64, m8x64);\nimpl_from_cast!(u8x64[test_v512]: i8x64, m8x64);\nimpl_from_cast_mask!(m8x64[test_v512]: i8x64, u8x64);\n\nimpl_from_cast!(i16x32[test_v512]: i8x32, u8x32, m8x32, u16x32, m16x32);\nimpl_from_cast!(u16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, m16x32);\nimpl_from_cast_mask!(m16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, u16x32);\n\nimpl_from_cast!(i32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, u32x16, f32x16, m32x16);\nimpl_from_cast!(u32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, f32x16, m32x16);\nimpl_from_cast!(f32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, m32x16);\nimpl_from_cast_mask!(m32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16);\n\nimpl_from_cast!(\n    i64x8[test_v512]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast!(\n    u64x8[test_v512]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast!(\n    f64x8[test_v512]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast_mask!(\n    m64x8[test_v512]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\n\nimpl_from_cast!(\n    i128x4[test_v512]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    
m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    u128x4[test_v512]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast_mask!(\n    m128x4[test_v512]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    m64x4,\n    f64x4,\n    i128x4,\n    u128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\n\nimpl_from_cast!(\n    isizex8[test_v512]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast!(\n    usizex8[test_v512]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    msizex8\n);\nimpl_from_cast_mask!(\n    msizex8[test_v512]: i8x8,\n    u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8\n);\n"
  },
  {
    "path": "src/api/cast/v64.rs",
    "content": "//! `FromCast` and `IntoCast` implementations for portable 64-bit wide vectors\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_from_cast!(\n    i8x8[test_v64]: u8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast!(\n    u8x8[test_v64]: i8x8,\n    m8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\nimpl_from_cast_mask!(\n    m8x8[test_v64]: i8x8,\n    u8x8,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    isizex8,\n    usizex8,\n    msizex8\n);\n\nimpl_from_cast!(\n    i16x4[test_v64]: i8x4,\n    u8x4,\n    m8x4,\n    u16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast!(\n    u16x4[test_v64]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    m16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\nimpl_from_cast_mask!(\n    m16x4[test_v64]: i8x4,\n    u8x4,\n    m8x4,\n    i16x4,\n    u16x4,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x4,\n    u128x4,\n    m128x4,\n    isizex4,\n    usizex4,\n    msizex4\n);\n\nimpl_from_cast!(\n    i32x2[test_v64]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    u32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    u32x2[test_v64]: i8x2,\n    u8x2,\n    
m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    f32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast!(\n    f32x2[test_v64]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    m32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\nimpl_from_cast_mask!(\n    m32x2[test_v64]: i8x2,\n    u8x2,\n    m8x2,\n    i16x2,\n    u16x2,\n    m16x2,\n    i32x2,\n    u32x2,\n    f32x2,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x2,\n    u128x2,\n    m128x2,\n    isizex2,\n    usizex2,\n    msizex2\n);\n"
  },
  {
    "path": "src/api/cast.rs",
    "content": "//! Implementation of `FromCast` and `IntoCast`.\n#![allow(clippy::module_name_repetitions)]\n\n/// Numeric cast from `T` to `Self`.\n///\n/// > Note: This is a temporary workaround until the conversion traits\n/// specified > in [RFC2484] are implemented.\n///\n/// Numeric cast between vectors with the same number of lanes, such that:\n///\n/// * casting integer vectors whose lane types have the same size (e.g. `i32xN`\n/// -> `u32xN`) is a **no-op**,\n///\n/// * casting from a larger integer to a smaller integer (e.g. `u32xN` ->\n/// `u8xN`) will **truncate**,\n///\n/// * casting from a smaller integer to a larger integer   (e.g. `u8xN` ->\n///   `u32xN`) will:\n///    * **zero-extend** if the source is unsigned, or\n///    * **sign-extend** if the source is signed,\n///\n/// * casting from a float to an integer will **round the float towards zero**,\n///\n/// * casting from an integer to float will produce the floating point\n/// representation of the integer, **rounding to nearest, ties to even**,\n///\n/// * casting from an `f32` to an `f64` is perfect and lossless,\n///\n/// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.\n///\n/// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484\npub trait FromCast<T>: crate::marker::Sized {\n    /// Numeric cast from `T` to `Self`.\n    fn from_cast(_: T) -> Self;\n}\n\n/// Numeric cast from `Self` to `T`.\n///\n/// > Note: This is a temporary workaround until the conversion traits\n/// specified > in [RFC2484] are implemented.\n///\n/// Numeric cast between vectors with the same number of lanes, such that:\n///\n/// * casting integer vectors whose lane types have the same size (e.g. `i32xN`\n/// -> `u32xN`) is a **no-op**,\n///\n/// * casting from a larger integer to a smaller integer (e.g. `u32xN` ->\n/// `u8xN`) will **truncate**,\n///\n/// * casting from a smaller integer to a larger integer   (e.g. 
`u8xN` ->\n///   `u32xN`) will:\n///    * **zero-extend** if the source is unsigned, or\n///    * **sign-extend** if the source is signed,\n///\n/// * casting from a float to an integer will **round the float towards zero**,\n///\n/// * casting from an integer to float will produce the floating point\n/// representation of the integer, **rounding to nearest, ties to even**,\n///\n/// * casting from an `f32` to an `f64` is perfect and lossless,\n///\n/// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.\n///\n/// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484\npub trait Cast<T>: crate::marker::Sized {\n    /// Numeric cast from `self` to `T`.\n    fn cast(self) -> T;\n}\n\n/// `FromCast` implies `Cast`.\nimpl<T, U> Cast<U> for T\nwhere\n    U: FromCast<T>,\n{\n    #[inline]\n    fn cast(self) -> U {\n        U::from_cast(self)\n    }\n}\n\n/// `FromCast` and `Cast` are reflexive\nimpl<T> FromCast<T> for T {\n    #[inline]\n    fn from_cast(t: Self) -> Self {\n        t\n    }\n}\n\n#[macro_use]\nmod macros;\n\nmod v16;\npub use self::v16::*;\n\nmod v32;\npub use self::v32::*;\n\nmod v64;\npub use self::v64::*;\n\nmod v128;\npub use self::v128::*;\n\nmod v256;\npub use self::v256::*;\n\nmod v512;\npub use self::v512::*;\n"
  },
  {
    "path": "src/api/cmp/eq.rs",
    "content": "//! Implements `Eq` for vector types.\n\nmacro_rules! impl_cmp_eq {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $test_tt:tt |\n        ($true:expr, $false:expr)\n    ) => {\n        impl crate::cmp::Eq for $id {}\n        impl crate::cmp::Eq for LexicographicallyOrdered<$id> {}\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _cmp_eq>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn eq() {\n                        fn foo<E: crate::cmp::Eq>(_: E) {}\n                        let a = $id::splat($false);\n                        foo(a);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/cmp/ord.rs",
    "content": "//! Implements `Ord` for vector types.\n\nmacro_rules! impl_cmp_ord {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $test_tt:tt |\n        ($true:expr, $false:expr)\n    ) => {\n        impl $id {\n            /// Returns a wrapper that implements `Ord`.\n            #[inline]\n            pub fn lex_ord(&self) -> LexicographicallyOrdered<$id> {\n                LexicographicallyOrdered(*self)\n            }\n        }\n\n        impl crate::cmp::Ord for LexicographicallyOrdered<$id> {\n            #[inline]\n            fn cmp(&self, other: &Self) -> crate::cmp::Ordering {\n                match self.partial_cmp(other) {\n                    Some(x) => x,\n                    None => unsafe { crate::hint::unreachable_unchecked() },\n                }\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _cmp_ord>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn eq() {\n                        fn foo<E: crate::cmp::Ord>(_: E) {}\n                        let a = $id::splat($false);\n                        foo(a.partial_lex_ord());\n                        foo(a.lex_ord());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/cmp/partial_eq.rs",
    "content": "//! Implements `PartialEq` for vector types.\n\nmacro_rules! impl_cmp_partial_eq {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $test_tt:tt |\n        ($true:expr, $false:expr)\n    ) => {\n        // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892\n        #[allow(clippy::partialeq_ne_impl)]\n        impl crate::cmp::PartialEq<$id> for $id {\n            #[inline]\n            fn eq(&self, other: &Self) -> bool {\n                $id::eq(*self, *other).all()\n            }\n            #[inline]\n            fn ne(&self, other: &Self) -> bool {\n                $id::ne(*self, *other).any()\n            }\n        }\n\n        // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892\n        #[allow(clippy::partialeq_ne_impl)]\n        impl crate::cmp::PartialEq<LexicographicallyOrdered<$id>> for LexicographicallyOrdered<$id> {\n            #[inline]\n            fn eq(&self, other: &Self) -> bool {\n                self.0 == other.0\n            }\n            #[inline]\n            fn ne(&self, other: &Self) -> bool {\n                self.0 != other.0\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _cmp_PartialEq>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn partial_eq() {\n                        let a = $id::splat($false);\n                        let b = $id::splat($true);\n\n                        assert!(a != b);\n                        assert!(!(a == b));\n                        assert!(a == a);\n                        assert!(!(a != a));\n\n                        if $id::lanes() > 1 {\n                            let a = $id::splat($false).replace(0, $true);\n                            let b = $id::splat($true);\n\n                            assert!(a != b);\n                            assert!(!(a == b));\n                            assert!(a == a);\n                            assert!(!(a != a));\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/cmp/partial_ord.rs",
    "content": "//! Implements `PartialOrd` for vector types.\n//!\n//! This implements a lexicographical order.\n\nmacro_rules! impl_cmp_partial_ord {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Returns a wrapper that implements `PartialOrd`.\n            #[inline]\n            pub fn partial_lex_ord(&self) -> LexicographicallyOrdered<$id> {\n                LexicographicallyOrdered(*self)\n            }\n        }\n\n        impl crate::cmp::PartialOrd<LexicographicallyOrdered<$id>> for LexicographicallyOrdered<$id> {\n            #[inline]\n            fn partial_cmp(&self, other: &Self) -> Option<crate::cmp::Ordering> {\n                if PartialEq::eq(self, other) {\n                    Some(crate::cmp::Ordering::Equal)\n                } else if PartialOrd::lt(self, other) {\n                    Some(crate::cmp::Ordering::Less)\n                } else if PartialOrd::gt(self, other) {\n                    Some(crate::cmp::Ordering::Greater)\n                } else {\n                    None\n                }\n            }\n            #[inline]\n            fn lt(&self, other: &Self) -> bool {\n                let m_lt = self.0.lt(other.0);\n                let m_eq = self.0.eq(other.0);\n                for i in 0..$id::lanes() {\n                    if m_eq.extract(i) {\n                        continue;\n                    }\n                    return m_lt.extract(i);\n                }\n                false\n            }\n            #[inline]\n            fn le(&self, other: &Self) -> bool {\n                self.lt(other) | PartialEq::eq(self, other)\n            }\n            #[inline]\n            fn ge(&self, other: &Self) -> bool {\n                self.gt(other) | PartialEq::eq(self, other)\n            }\n            #[inline]\n            fn gt(&self, other: &Self) -> bool {\n                let m_gt = self.0.gt(other.0);\n                let m_eq = self.0.eq(other.0);\n     
           for i in 0..$id::lanes() {\n                    if m_eq.extract(i) {\n                        continue;\n                    }\n                    return m_gt.extract(i);\n                }\n                false\n            }\n        }\n    };\n}\n\nmacro_rules! test_cmp_partial_ord_int {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _cmp_PartialOrd>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn partial_lex_ord() {\n                        use crate::testing::utils::{test_cmp};\n                        // constant values\n                        let a = $id::splat(0);\n                        let b = $id::splat(1);\n\n                        test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Less));\n                        test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Greater));\n                        test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Equal));\n                        test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Equal));\n\n                        // variable values: a = [0, 1, 2, 3]; b = [3, 2, 1, 0]\n                        let mut a = $id::splat(0);\n                        let mut b = $id::splat(0);\n                        for i in 0..$id::lanes() {\n                            a = a.replace(i, i as $elem_ty);\n                            b = b.replace(i, ($id::lanes() - i) as $elem_ty);\n                        }\n                        test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                      
           Some(crate::cmp::Ordering::Less));\n                        test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Greater));\n                        test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Equal));\n                        test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Equal));\n\n                        // variable values: a = [0, 1, 2, 3]; b = [0, 1, 2, 4]\n                        let mut b = a;\n                        b = b.replace(\n                            $id::lanes() - 1,\n                            a.extract($id::lanes() - 1) + 1 as $elem_ty\n                        );\n                        test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Less));\n                        test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Greater));\n                        test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Equal));\n                        test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(crate::cmp::Ordering::Equal));\n\n                        if $id::lanes() > 2 {\n                            // variable values a = [0, 1, 0, 0]; b = [0, 1, 2, 3]\n                            let b = a;\n                            let mut a = $id::splat(0);\n                            a = a.replace(1, 1 as $elem_ty);\n                            test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Less));\n                            test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Greater));\n                   
         test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Equal));\n                            test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Equal));\n\n                            // variable values: a = [0, 1, 2, 3]; b = [0, 1, 3, 2]\n                            let mut b = a;\n                            b = b.replace(\n                                2, a.extract($id::lanes() - 1) + 1 as $elem_ty\n                            );\n                            test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Less));\n                            test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Greater));\n                            test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Equal));\n                            test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                     Some(crate::cmp::Ordering::Equal));\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! test_cmp_partial_ord_mask {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _cmp_PartialOrd>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn partial_lex_ord() {\n                        use crate::testing::utils::{test_cmp};\n                        use crate::cmp::Ordering;\n\n                        // constant values\n                        let a = $id::splat(false);\n                        let b = $id::splat(true);\n\n                        test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(Ordering::Less));\n                        test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(Ordering::Greater));\n                        test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(Ordering::Equal));\n                        test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(Ordering::Equal));\n\n                        // variable values:\n                        // a = [false, false, false, false];\n                        // b = [false, false, false, true]\n                        let a = $id::splat(false);\n                        let mut b = $id::splat(false);\n                        b = b.replace($id::lanes() - 1, true);\n                        test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(Ordering::Less));\n                        test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(Ordering::Greater));\n                        test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(Ordering::Equal));\n                        test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(Ordering::Equal));\n\n                        // variable values:\n            
            // a = [true, true, true, false];\n                        // b = [true, true, true, true]\n                        let mut a = $id::splat(true);\n                        let b = $id::splat(true);\n                        a = a.replace($id::lanes() - 1, false);\n                        test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(Ordering::Less));\n                        test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(Ordering::Greater));\n                        test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                 Some(Ordering::Equal));\n                        test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                 Some(Ordering::Equal));\n\n                        if $id::lanes() > 2 {\n                            // variable values\n                            // a = [false, true, false, false];\n                            // b = [false, true, true, true]\n                            let mut a = $id::splat(false);\n                            let mut b = $id::splat(true);\n                            a = a.replace(1, true);\n                            b = b.replace(0, false);\n                            test_cmp(a.partial_lex_ord(), b.partial_lex_ord(),\n                                     Some(Ordering::Less));\n                            test_cmp(b.partial_lex_ord(), a.partial_lex_ord(),\n                                     Some(Ordering::Greater));\n                            test_cmp(a.partial_lex_ord(), a.partial_lex_ord(),\n                                     Some(Ordering::Equal));\n                            test_cmp(b.partial_lex_ord(), b.partial_lex_ord(),\n                                     Some(Ordering::Equal));\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/cmp/vertical.rs",
    "content": "//! Vertical (lane-wise) vector comparisons returning vector masks.\n\nmacro_rules! impl_cmp_vertical {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident,\n        $mask_ty:ident,\n        $is_mask:expr,($true:expr, $false:expr) | $test_tt:tt\n    ) => {\n        impl $id {\n            /// Lane-wise equality comparison.\n            #[inline]\n            pub fn eq(self, other: Self) -> $mask_ty {\n                use crate::llvm::simd_eq;\n                Simd(unsafe { simd_eq(self.0, other.0) })\n            }\n\n            /// Lane-wise inequality comparison.\n            #[inline]\n            pub fn ne(self, other: Self) -> $mask_ty {\n                use crate::llvm::simd_ne;\n                Simd(unsafe { simd_ne(self.0, other.0) })\n            }\n\n            /// Lane-wise less-than comparison.\n            #[inline]\n            pub fn lt(self, other: Self) -> $mask_ty {\n                use crate::llvm::{simd_gt, simd_lt};\n                if $is_mask {\n                    Simd(unsafe { simd_gt(self.0, other.0) })\n                } else {\n                    Simd(unsafe { simd_lt(self.0, other.0) })\n                }\n            }\n\n            /// Lane-wise less-than-or-equals comparison.\n            #[inline]\n            pub fn le(self, other: Self) -> $mask_ty {\n                use crate::llvm::{simd_ge, simd_le};\n                if $is_mask {\n                    Simd(unsafe { simd_ge(self.0, other.0) })\n                } else {\n                    Simd(unsafe { simd_le(self.0, other.0) })\n                }\n            }\n\n            /// Lane-wise greater-than comparison.\n            #[inline]\n            pub fn gt(self, other: Self) -> $mask_ty {\n                use crate::llvm::{simd_gt, simd_lt};\n                if $is_mask {\n                    Simd(unsafe { simd_lt(self.0, other.0) })\n                } else {\n                    Simd(unsafe { simd_gt(self.0, other.0) })\n             
   }\n            }\n\n            /// Lane-wise greater-than-or-equals comparison.\n            #[inline]\n            pub fn ge(self, other: Self) -> $mask_ty {\n                use crate::llvm::{simd_ge, simd_le};\n                if $is_mask {\n                    Simd(unsafe { simd_le(self.0, other.0) })\n                } else {\n                    Simd(unsafe { simd_ge(self.0, other.0) })\n                }\n            }\n        }\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _cmp_vertical>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn cmp() {\n                        let a = $id::splat($false);\n                        let b = $id::splat($true);\n\n                        let r = a.lt(b);\n                        let e = $mask_ty::splat(true);\n                        assert!(r == e);\n                        let r = a.le(b);\n                        assert!(r == e);\n\n                        let e = $mask_ty::splat(false);\n                        let r = a.gt(b);\n                        assert!(r == e);\n                        let r = a.ge(b);\n                        assert!(r == e);\n                        let r = a.eq(b);\n                        assert!(r == e);\n\n                        let mut a = a;\n                        let mut b = b;\n                        let mut e = e;\n                        for i in 0..$id::lanes() {\n                            if i % 2 == 0 {\n                                a = a.replace(i, $false);\n                                b = b.replace(i, $true);\n                                e = e.replace(i, true);\n                            } else {\n                                a = a.replace(i, $true);\n                                b = b.replace(i, $false);\n                                e = e.replace(i, 
false);\n                            }\n                        }\n                        let r = a.lt(b);\n                        assert!(r == e);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/cmp.rs",
    "content": "//! Implement cmp traits for vector types\n\n#[macro_use]\nmod partial_eq;\n\n#[macro_use]\nmod eq;\n\n#[macro_use]\nmod partial_ord;\n\n#[macro_use]\nmod ord;\n\n#[macro_use]\nmod vertical;\n"
  },
  {
    "path": "src/api/default.rs",
    "content": "//! Implements `Default` for vector types.\n\nmacro_rules! impl_default {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl Default for $id {\n            #[inline]\n            fn default() -> Self {\n                Self::splat($elem_ty::default())\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                // Comparisons use integer casts within mantissa^1 range.\n                #[allow(clippy::float_cmp)]\n                pub mod [<$id _default>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn default() {\n                        let a = $id::default();\n                        for i in 0..$id::lanes() {\n                            assert_eq!(a.extract(i), $elem_ty::default());\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/fmt/binary.rs",
    "content": "//! Implement Binary formatting\n\nmacro_rules! impl_fmt_binary {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::fmt::Binary for $id {\n            #[allow(clippy::missing_inline_in_public_items)]\n            fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result {\n                write!(f, \"{}(\", stringify!($id))?;\n                for i in 0..$elem_count {\n                    if i > 0 {\n                        write!(f, \", \")?;\n                    }\n                    self.extract(i).fmt(f)?;\n                }\n                write!(f, \")\")\n            }\n        }\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _fmt_binary>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn binary() {\n                        use arrayvec::{ArrayString,ArrayVec};\n                        type TinyString = ArrayString<[u8; 512]>;\n\n                        use crate::fmt::Write;\n                        let v = $id::splat($elem_ty::default());\n                        let mut s = TinyString::new();\n                        write!(&mut s, \"{:#b}\", v).unwrap();\n\n                        let mut beg = TinyString::new();\n                        write!(&mut beg, \"{}(\", stringify!($id)).unwrap();\n                        assert!(s.starts_with(beg.as_str()));\n                        assert!(s.ends_with(\")\"));\n                        let s: ArrayVec<[TinyString; 64]>\n                            = s.replace(beg.as_str(), \"\")\n                            .replace(\")\", \"\").split(\",\")\n                            .map(|v| TinyString::from(v.trim()).unwrap())\n                            .collect();\n                        assert_eq!(s.len(), $id::lanes());\n                        for (index, ss) in s.into_iter().enumerate() {\n                            let mut e = TinyString::new();\n                            write!(&mut e, \"{:#b}\", v.extract(index)).unwrap();\n                            assert_eq!(ss, e);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/fmt/debug.rs",
    "content": "//! Implement debug formatting\n\nmacro_rules! impl_fmt_debug_tests {\n    ([$elem_ty:ty; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _fmt_debug>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn debug() {\n                        use arrayvec::{ArrayString,ArrayVec};\n                        type TinyString = ArrayString<[u8; 512]>;\n\n                        use crate::fmt::Write;\n                        let v = $id::default();\n                        let mut s = TinyString::new();\n                        write!(&mut s, \"{:?}\", v).unwrap();\n\n                        let mut beg = TinyString::new();\n                        write!(&mut beg, \"{}(\", stringify!($id)).unwrap();\n                        assert!(s.starts_with(beg.as_str()));\n                        assert!(s.ends_with(\")\"));\n                        let s: ArrayVec<[TinyString; 64]>\n                            = s.replace(beg.as_str(), \"\")\n                            .replace(\")\", \"\").split(\",\")\n                            .map(|v| TinyString::from(v.trim()).unwrap())\n                            .collect();\n                        assert_eq!(s.len(), $id::lanes());\n                        for (index, ss) in s.into_iter().enumerate() {\n                            let mut e = TinyString::new();\n                            write!(&mut e, \"{:?}\", v.extract(index)).unwrap();\n                            assert_eq!(ss, e);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! 
impl_fmt_debug {\n    ([$elem_ty:ty; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::fmt::Debug for $id {\n            #[allow(clippy::missing_inline_in_public_items)]\n            fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result {\n                write!(f, \"{}(\", stringify!($id))?;\n                for i in 0..$elem_count {\n                    if i > 0 {\n                        write!(f, \", \")?;\n                    }\n                    self.extract(i).fmt(f)?;\n                }\n                write!(f, \")\")\n            }\n        }\n        impl_fmt_debug_tests!([$elem_ty; $elem_count]: $id | $test_tt);\n    };\n}\n"
  },
  {
    "path": "src/api/fmt/lower_hex.rs",
    "content": "//! Implement `LowerHex` formatting\n\nmacro_rules! impl_fmt_lower_hex {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::fmt::LowerHex for $id {\n            #[allow(clippy::missing_inline_in_public_items)]\n            fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result {\n                write!(f, \"{}(\", stringify!($id))?;\n                for i in 0..$elem_count {\n                    if i > 0 {\n                        write!(f, \", \")?;\n                    }\n                    self.extract(i).fmt(f)?;\n                }\n                write!(f, \")\")\n            }\n        }\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _fmt_lower_hex>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn lower_hex() {\n                        use arrayvec::{ArrayString,ArrayVec};\n                        type TinyString = ArrayString<[u8; 512]>;\n\n                        use crate::fmt::Write;\n                        let v = $id::splat($elem_ty::default());\n                        let mut s = TinyString::new();\n                        write!(&mut s, \"{:#x}\", v).unwrap();\n\n                        let mut beg = TinyString::new();\n                        write!(&mut beg, \"{}(\", stringify!($id)).unwrap();\n                        assert!(s.starts_with(beg.as_str()));\n                        assert!(s.ends_with(\")\"));\n                        let s: ArrayVec<[TinyString; 64]>\n                            = s.replace(beg.as_str(), \"\").replace(\")\", \"\")\n                            .split(\",\")\n                            .map(|v| TinyString::from(v.trim()).unwrap())\n                            .collect();\n                        assert_eq!(s.len(), 
$id::lanes());\n                        for (index, ss) in s.into_iter().enumerate() {\n                            let mut e = TinyString::new();\n                            write!(&mut e, \"{:#x}\", v.extract(index)).unwrap();\n                        assert_eq!(ss, e);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/fmt/octal.rs",
    "content": "//! Implement Octal formatting\n\nmacro_rules! impl_fmt_octal {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::fmt::Octal for $id {\n            #[allow(clippy::missing_inline_in_public_items)]\n            fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result {\n                write!(f, \"{}(\", stringify!($id))?;\n                for i in 0..$elem_count {\n                    if i > 0 {\n                        write!(f, \", \")?;\n                    }\n                    self.extract(i).fmt(f)?;\n                }\n                write!(f, \")\")\n            }\n        }\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _fmt_octal>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn octal_hex() {\n                        use arrayvec::{ArrayString,ArrayVec};\n                        type TinyString = ArrayString<[u8; 512]>;\n\n                        use crate::fmt::Write;\n                        let v = $id::splat($elem_ty::default());\n                        let mut s = TinyString::new();\n                        write!(&mut s, \"{:#o}\", v).unwrap();\n\n                        let mut beg = TinyString::new();\n                        write!(&mut beg, \"{}(\", stringify!($id)).unwrap();\n                        assert!(s.starts_with(beg.as_str()));\n                        assert!(s.ends_with(\")\"));\n                        let s: ArrayVec<[TinyString; 64]>\n                            = s.replace(beg.as_str(), \"\").replace(\")\", \"\")\n                            .split(\",\")\n                            .map(|v| TinyString::from(v.trim()).unwrap())\n                            .collect();\n                        assert_eq!(s.len(), $id::lanes());\n               
         for (index, ss) in s.into_iter().enumerate() {\n                            let mut e = TinyString::new();\n                            write!(&mut e, \"{:#o}\", v.extract(index)).unwrap();\n                            assert_eq!(ss, e);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/fmt/upper_hex.rs",
    "content": "//! Implement `UpperHex` formatting\n\nmacro_rules! impl_fmt_upper_hex {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::fmt::UpperHex for $id {\n            #[allow(clippy::missing_inline_in_public_items)]\n            fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result {\n                write!(f, \"{}(\", stringify!($id))?;\n                for i in 0..$elem_count {\n                    if i > 0 {\n                        write!(f, \", \")?;\n                    }\n                    self.extract(i).fmt(f)?;\n                }\n                write!(f, \")\")\n            }\n        }\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _fmt_upper_hex>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn upper_hex() {\n                        use arrayvec::{ArrayString,ArrayVec};\n                        type TinyString = ArrayString<[u8; 512]>;\n\n                        use crate::fmt::Write;\n                        let v = $id::splat($elem_ty::default());\n                        let mut s = TinyString::new();\n                        write!(&mut s, \"{:#X}\", v).unwrap();\n\n                        let mut beg = TinyString::new();\n                        write!(&mut beg, \"{}(\", stringify!($id)).unwrap();\n                        assert!(s.starts_with(beg.as_str()));\n                        assert!(s.ends_with(\")\"));\n                        let s: ArrayVec<[TinyString; 64]>\n                            = s.replace(beg.as_str(), \"\").replace(\")\", \"\")\n                            .split(\",\")\n                            .map(|v| TinyString::from(v.trim()).unwrap())\n                            .collect();\n                        assert_eq!(s.len(), 
$id::lanes());\n                        for (index, ss) in s.into_iter().enumerate() {\n                            let mut e = TinyString::new();\n                            write!(&mut e, \"{:#X}\", v.extract(index)).unwrap();\n                            assert_eq!(ss, e);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/fmt.rs",
    "content": "//! Implements formatting APIs\n\n#[macro_use]\nmod debug;\n#[macro_use]\nmod lower_hex;\n#[macro_use]\nmod upper_hex;\n#[macro_use]\nmod octal;\n#[macro_use]\nmod binary;\n"
  },
  {
    "path": "src/api/from/from_array.rs",
    "content": "//! Implements `From<[T; N]>` and `Into<[T; N]>` for vector types.\n\nmacro_rules! impl_from_array {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt\n     | ($non_default_array:expr, $non_default_vec:expr)) => {\n        impl From<[$elem_ty; $elem_count]> for $id {\n            #[inline]\n            fn from(array: [$elem_ty; $elem_count]) -> Self {\n                union U {\n                    array: [$elem_ty; $elem_count],\n                    vec: $id,\n                }\n                unsafe { U { array }.vec }\n            }\n        }\n\n        impl From<$id> for [$elem_ty; $elem_count] {\n            #[inline]\n            fn from(vec: $id) -> Self {\n                union U {\n                    array: [$elem_ty; $elem_count],\n                    vec: $id,\n                }\n                unsafe { U { vec }.array }\n            }\n        }\n\n        // FIXME: `Into::into` is not inline, but due to\n        // the blanket impl in `std`, which is not\n        // marked `default`, we cannot override it here with\n        // specialization.\n        /*\n        impl Into<[$elem_ty; $elem_count]> for $id {\n            #[inline]\n            fn into(self) -> [$elem_ty; $elem_count] {\n                union U {\n                    array: [$elem_ty; $elem_count],\n                    vec: $id,\n                }\n                unsafe { U { vec: self }.array }\n            }\n        }\n\n        impl Into<$id> for [$elem_ty; $elem_count] {\n            #[inline]\n            fn into(self) -> $id {\n                union U {\n                    array: [$elem_ty; $elem_count],\n                    vec: $id,\n                }\n                unsafe { U { array: self }.vec }\n            }\n        }\n        */\n\n        test_if! {\n            $test_tt:\n            paste::item! 
{\n                // Comparisons use integer casts within mantissa^1 range.\n                #[allow(clippy::float_cmp)]\n                mod [<$id _from>] {\n                    use super::*;\n                    #[test]\n                    #[cfg_attr(miri, ignore)]\n                    fn array() {\n                        let vec: $id = Default::default();\n\n                        // FIXME: Workaround for arrays with more than 32\n                        // elements.\n                        //\n                        // Safe because we never take a reference to any\n                        // uninitialized element.\n                        union W {\n                            array: [$elem_ty; $elem_count],\n                            other: ()\n                        }\n                        let mut array = W { other: () };\n                        for i in 0..$elem_count {\n                            let default: $elem_ty = Default::default();\n                            // note: array.other is the active member and\n                            // initialized so we can take a reference to it:\n                            let p = unsafe {\n                                &mut array.other as *mut () as *mut $elem_ty\n                            };\n                            // note: default is a valid bit-pattern for\n                            // $elem_ty:\n                            unsafe {\n                                crate::ptr::write(p.wrapping_add(i), default)\n                            };\n                        }\n                        // note: the array variant of the union is properly\n                        // initialized:\n                        let mut array = unsafe {\n                            array.array\n                        };\n\n                        array[0] = $non_default_array;\n                        let vec = vec.replace(0, $non_default_vec);\n\n                        let vec_from_array = 
$id::from(array);\n                        assert_eq!(vec_from_array, vec);\n                        let array_from_vec\n                            = <[$elem_ty; $elem_count]>::from(vec);\n                        // FIXME: Workaround for arrays with more than 32\n                        // elements.\n                        for i in 0..$elem_count {\n                            assert_eq!(array_from_vec[i], array[i]);\n                        }\n\n                        let vec_from_into_array: $id = array.into();\n                        assert_eq!(vec_from_into_array, vec);\n                        let array_from_into_vec: [$elem_ty; $elem_count]\n                            = vec.into();\n                        // FIXME: Workaround for arrays with more than 32\n                        // elements.\n                        for i in 0..$elem_count {\n                            assert_eq!(array_from_into_vec[i], array[i]);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/from/from_vector.rs",
    "content": "//! Implements `From` and `Into` for vector types.\n\nmacro_rules! impl_from_vector {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt\n     | $source:ident) => {\n        impl From<$source> for $id {\n            #[inline]\n            fn from(source: $source) -> Self {\n                fn static_assert_same_number_of_lanes<T, U>()\n                where\n                    T: crate::sealed::Simd,\n                    U: crate::sealed::Simd<LanesType = T::LanesType>,\n                {\n                }\n                use crate::llvm::simd_cast;\n                static_assert_same_number_of_lanes::<$id, $source>();\n                Simd(unsafe { simd_cast(source.0) })\n            }\n        }\n\n        // FIXME: `Into::into` is not inline, but due to the blanket impl in\n        // `std`, which is not marked `default`, we cannot override it here\n        // with specialization.\n\n        /*\n           impl Into<$id> for $source {\n               #[inline]\n               fn into(self) -> $id {\n                   unsafe { simd_cast(self) }\n               }\n           }\n        */\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _from_ $source>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn from() {\n                        assert_eq!($id::lanes(), $source::lanes());\n                        let source: $source = Default::default();\n                        let vec: $id = Default::default();\n\n                        let e = $id::from(source);\n                        assert_eq!(e, vec);\n\n                        let e: $id = source.into();\n                        assert_eq!(e, vec);\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! 
impl_from_vectors {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt\n     | $($source:ident),*) => {\n        $(\n            impl_from_vector!(\n                [$elem_ty; $elem_count]: $id | $test_tt | $source\n            );\n        )*\n    }\n}\n"
  },
  {
    "path": "src/api/from.rs",
    "content": "//! Implementations of the `From` and `Into` traits\n\n#[macro_use]\nmod from_array;\n\n#[macro_use]\nmod from_vector;\n"
  },
  {
    "path": "src/api/hash.rs",
    "content": "//! Implements `Hash` for vector types.\n\nmacro_rules! impl_hash {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::hash::Hash for $id {\n            #[inline]\n            fn hash<H: crate::hash::Hasher>(&self, state: &mut H) {\n                unsafe {\n                    union A {\n                        data: [$elem_ty; $id::lanes()],\n                        vec: $id,\n                    }\n                    A { vec: *self }.data.hash(state)\n                }\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _hash>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn hash() {\n                        use crate::hash::{Hash, Hasher};\n                        #[allow(deprecated)]\n                        use crate::hash::{SipHasher13};\n                        type A = [$elem_ty; $id::lanes()];\n                        let a: A = [42 as $elem_ty; $id::lanes()];\n                        assert_eq!(\n                            crate::mem::size_of::<A>(),\n                            crate::mem::size_of::<$id>()\n                        );\n                        #[allow(deprecated)]\n                        let mut a_hash = SipHasher13::new();\n                        let mut v_hash = a_hash.clone();\n                        a.hash(&mut a_hash);\n\n                        // Integer within mantissa^1 range.\n                        #[allow(clippy::float_cmp)]\n                        let v = $id::splat(42 as $elem_ty);\n                        v.hash(&mut v_hash);\n                        assert_eq!(a_hash.finish(), v_hash.finish());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/into_bits/arch_specific.rs",
    "content": "//! `FromBits` and `IntoBits` between portable vector types and the\n//! architecture-specific vector types.\n#[rustfmt::skip]\n\n// FIXME: MIPS FromBits/IntoBits\n\n#[allow(unused)]\nuse crate::*;\n\n/// This macro implements FromBits for the portable and the architecture\n/// specific vector types.\n///\n/// The \"leaf\" case is at the bottom, and the most generic case is at the top.\n/// The generic case is split into smaller cases recursively.\nmacro_rules! impl_arch {\n    ([$arch_head_i:ident[$arch_head_tt:tt]: $($arch_head_ty:ident),*],\n     $([$arch_tail_i:ident[$arch_tail_tt:tt]: $($arch_tail_ty:ident),*]),* |\n     from: $($from_ty:ident),* | into: $($into_ty:ident),* |\n     test: $test_tt:tt) => {\n        impl_arch!(\n            [$arch_head_i[$arch_head_tt]: $($arch_head_ty),*] |\n            from: $($from_ty),* |\n            into: $($into_ty),* |\n            test: $test_tt\n        );\n        impl_arch!(\n            $([$arch_tail_i[$arch_tail_tt]: $($arch_tail_ty),*]),* |\n            from: $($from_ty),* |\n            into: $($into_ty),* |\n            test: $test_tt\n        );\n    };\n    ([$arch:ident[$arch_tt:tt]: $($arch_ty:ident),*] |\n     from: $($from_ty:ident),* | into: $($into_ty:ident),* |\n     test: $test_tt:tt) => {\n        // note: if target is \"arm\", \"+v7,+neon\" must be enabled\n        // and the std library must be recompiled with them\n        #[cfg(any(\n            not(target_arch = \"arm\"),\n            all(target_feature = \"v7\", target_feature = \"neon\",\n                any(feature = \"core_arch\", libcore_neon)))\n        )]\n        // note: if target is \"powerpc\", \"altivec\" must be enabled\n        // and the std library must be recompiled with it\n        #[cfg(any(\n            not(target_arch = \"powerpc\"),\n            all(target_feature = \"altivec\", feature = \"core_arch\"),\n        ))]\n        #[cfg(target_arch = $arch_tt)]\n        use crate::arch::$arch::{\n            
$($arch_ty),*\n        };\n\n        #[cfg(any(\n            not(target_arch = \"arm\"),\n            all(target_feature = \"v7\", target_feature = \"neon\",\n                any(feature = \"core_arch\", libcore_neon)))\n        )]\n        #[cfg(any(\n            not(target_arch = \"powerpc\"),\n            all(target_feature = \"altivec\", feature = \"core_arch\"),\n        ))]\n        #[cfg(target_arch = $arch_tt)]\n        impl_arch!($($arch_ty),* | $($from_ty),* | $($into_ty),* |\n                   test: $test_tt);\n    };\n    ($arch_head:ident, $($arch_tail:ident),* | $($from_ty:ident),*\n     | $($into_ty:ident),* | test: $test_tt:tt) => {\n        impl_arch!($arch_head | $($from_ty),* | $($into_ty),* |\n                   test: $test_tt);\n        impl_arch!($($arch_tail),* | $($from_ty),* | $($into_ty),* |\n                   test: $test_tt);\n    };\n    ($arch_head:ident | $($from_ty:ident),* | $($into_ty:ident),* |\n     test: $test_tt:tt) => {\n        impl_from_bits!($arch_head[$test_tt]: $($from_ty),*);\n        impl_into_bits!($arch_head[$test_tt]: $($into_ty),*);\n    };\n}\n\n////////////////////////////////////////////////////////////////////////////////\n// Implementations for the 64-bit wide vector types:\n\n// FIXME: 64-bit single element types\n// FIXME: arm/aarch float16x4_t missing\nimpl_arch!(\n    [\n        arm[\"arm\"]: int8x8_t,\n        uint8x8_t,\n        poly8x8_t,\n        int16x4_t,\n        uint16x4_t,\n        poly16x4_t,\n        int32x2_t,\n        uint32x2_t,\n        float32x2_t,\n        int64x1_t,\n        uint64x1_t\n    ],\n    [\n        aarch64[\"aarch64\"]: int8x8_t,\n        uint8x8_t,\n        poly8x8_t,\n        int16x4_t,\n        uint16x4_t,\n        poly16x4_t,\n        int32x2_t,\n        uint32x2_t,\n        float32x2_t,\n        int64x1_t,\n        uint64x1_t,\n        float64x1_t\n    ] | from: i8x8,\n    u8x8,\n    m8x8,\n    i16x4,\n    u16x4,\n    m16x4,\n    i32x2,\n    u32x2,\n    f32x2,\n    m32x2 | 
into: i8x8,\n    u8x8,\n    i16x4,\n    u16x4,\n    i32x2,\n    u32x2,\n    f32x2 | test: test_v64\n);\n\n////////////////////////////////////////////////////////////////////////////////\n// Implementations for the 128-bit wide vector types:\n\n// FIXME: arm/aarch float16x8_t missing\n// FIXME: ppc vector_pixel missing\n// FIXME: ppc64 vector_Float16 missing\n// FIXME: ppc64 vector_signed_long_long missing\n// FIXME: ppc64 vector_unsigned_long_long missing\n// FIXME: ppc64 vector_bool_long_long missing\n// FIXME: ppc64 vector_signed___int128 missing\n// FIXME: ppc64 vector_unsigned___int128 missing\nimpl_arch!(\n    [x86[\"x86\"]: __m128, __m128i, __m128d],\n    [x86_64[\"x86_64\"]: __m128, __m128i, __m128d],\n    [\n        arm[\"arm\"]: int8x16_t,\n        uint8x16_t,\n        poly8x16_t,\n        int16x8_t,\n        uint16x8_t,\n        poly16x8_t,\n        int32x4_t,\n        uint32x4_t,\n        float32x4_t,\n        int64x2_t,\n        uint64x2_t\n    ],\n    [\n        aarch64[\"aarch64\"]: int8x16_t,\n        uint8x16_t,\n        poly8x16_t,\n        int16x8_t,\n        uint16x8_t,\n        poly16x8_t,\n        int32x4_t,\n        uint32x4_t,\n        float32x4_t,\n        int64x2_t,\n        uint64x2_t,\n        float64x2_t\n    ],\n    [\n        powerpc[\"powerpc\"]: vector_signed_char,\n        vector_unsigned_char,\n        vector_signed_short,\n        vector_unsigned_short,\n        vector_signed_int,\n        vector_unsigned_int,\n        vector_float\n    ],\n    [\n        powerpc64[\"powerpc64\"]: vector_signed_char,\n        vector_unsigned_char,\n        vector_signed_short,\n        vector_unsigned_short,\n        vector_signed_int,\n        vector_unsigned_int,\n        vector_float,\n        vector_signed_long,\n        vector_unsigned_long,\n        vector_double\n    ] | from: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n  
  i128x1,\n    u128x1,\n    m128x1 | into: i8x16,\n    u8x16,\n    i16x8,\n    u16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    i128x1,\n    u128x1 | test: test_v128\n);\n\nimpl_arch!(\n    [powerpc[\"powerpc\"]: vector_bool_char],\n    [powerpc64[\"powerpc64\"]: vector_bool_char] | from: m8x16,\n    m16x8,\n    m32x4,\n    m64x2,\n    m128x1 | into: i8x16,\n    u8x16,\n    i16x8,\n    u16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    i128x1,\n    u128x1,\n    // Masks:\n    m8x16 | test: test_v128\n);\n\nimpl_arch!(\n    [powerpc[\"powerpc\"]: vector_bool_short],\n    [powerpc64[\"powerpc64\"]: vector_bool_short] | from: m16x8,\n    m32x4,\n    m64x2,\n    m128x1 | into: i8x16,\n    u8x16,\n    i16x8,\n    u16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    i128x1,\n    u128x1,\n    // Masks:\n    m8x16,\n    m16x8 | test: test_v128\n);\n\nimpl_arch!(\n    [powerpc[\"powerpc\"]: vector_bool_int],\n    [powerpc64[\"powerpc64\"]: vector_bool_int] | from: m32x4,\n    m64x2,\n    m128x1 | into: i8x16,\n    u8x16,\n    i16x8,\n    u16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    i128x1,\n    u128x1,\n    // Masks:\n    m8x16,\n    m16x8,\n    m32x4 | test: test_v128\n);\n\nimpl_arch!(\n    [powerpc64[\"powerpc64\"]: vector_bool_long] | from: m64x2,\n    m128x1 | into: i8x16,\n    u8x16,\n    i16x8,\n    u16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    i128x1,\n    u128x1,\n    // Masks:\n    m8x16,\n    m16x8,\n    m32x4,\n    m64x2 | test: test_v128\n);\n\n////////////////////////////////////////////////////////////////////////////////\n// Implementations for the 256-bit wide vector types\n\nimpl_arch!(\n    [x86[\"x86\"]: __m256, __m256i, __m256d],\n    [x86_64[\"x86_64\"]: __m256, __m256i, __m256d] | from: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    
u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2 | into: i8x32,\n    u8x32,\n    i16x16,\n    u16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    i128x2,\n    u128x2 | test: test_v256\n);\n\n////////////////////////////////////////////////////////////////////////////////\n// FIXME: Implementations for the 512-bit wide vector types\n"
  },
  {
    "path": "src/api/into_bits/macros.rs",
    "content": "//! Macros implementing `FromBits`\n\nmacro_rules! impl_from_bits_ {\n    ($id:ident[$test_tt:tt]: $from_ty:ident) => {\n        impl crate::api::into_bits::FromBits<$from_ty> for $id {\n            #[inline]\n            fn from_bits(x: $from_ty) -> Self {\n                unsafe { crate::mem::transmute(x) }\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _from_bits_ $from_ty>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn test() {\n                        use crate::{\n                            ptr::{read_unaligned},\n                            mem::{size_of, zeroed}\n                        };\n                        use crate::IntoBits;\n                        assert_eq!(size_of::<$id>(),\n                                   size_of::<$from_ty>());\n                        // This is safe because we never create a reference to\n                        // uninitialized memory:\n                        let a: $from_ty = unsafe { zeroed() };\n\n                        let b_0: $id = crate::FromBits::from_bits(a);\n                        let b_1: $id = a.into_bits();\n\n                        // Check that these are byte-wise equal, that is,\n                        // that the bit patterns are identical:\n                        for i in 0..size_of::<$id>() {\n                            // This is safe because we only read initialized\n                            // memory in bounds. 
Also, taking a reference to\n                            // `b_i` is ok because the fields are initialized.\n                            unsafe {\n                                let b_0_v: u8 = read_unaligned(\n                                    (&b_0 as *const $id as *const u8)\n                                        .wrapping_add(i)\n                                );\n                                let b_1_v: u8 = read_unaligned(\n                                    (&b_1 as *const $id as *const u8)\n                                        .wrapping_add(i)\n                                );\n                                assert_eq!(b_0_v, b_1_v);\n                            }\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! impl_from_bits {\n    ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => {\n        $(\n            impl_from_bits_!($id[$test_tt]: $from_ty);\n        )*\n    }\n}\n\n#[allow(unused)]\nmacro_rules! impl_into_bits {\n    ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => {\n        $(\n            impl_from_bits_!($from_ty[$test_tt]: $id);\n        )*\n    }\n}\n"
  },
  {
    "path": "src/api/into_bits/v128.rs",
    "content": "//! `FromBits` and `IntoBits` implementations for portable 128-bit wide vectors\n#[rustfmt::skip]\n\n#[allow(unused)]  // wasm_bindgen_test\nuse crate::*;\n\nimpl_from_bits!(\n    i8x16[test_v128]: u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(\n    u8x16[test_v128]: i8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(m8x16[test_v128]: m16x8, m32x4, m64x2, m128x1);\n\nimpl_from_bits!(\n    i16x8[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(\n    u16x8[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(m16x8[test_v128]: m32x4, m64x2, m128x1);\n\nimpl_from_bits!(\n    i32x4[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(\n    u32x4[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(\n    f32x4[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(m32x4[test_v128]: m64x2, 
m128x1);\n\nimpl_from_bits!(\n    i64x2[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(\n    u64x2[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(\n    f64x2[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    m64x2,\n    i128x1,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(m64x2[test_v128]: m128x1);\n\nimpl_from_bits!(\n    i128x1[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    u128x1,\n    m128x1\n);\nimpl_from_bits!(\n    u128x1[test_v128]: i8x16,\n    u8x16,\n    m8x16,\n    i16x8,\n    u16x8,\n    m16x8,\n    i32x4,\n    u32x4,\n    f32x4,\n    m32x4,\n    i64x2,\n    u64x2,\n    f64x2,\n    m64x2,\n    i128x1,\n    m128x1\n);\n// note: m128x1 cannot be constructed from all the other masks bit patterns in\n// here\n"
  },
  {
    "path": "src/api/into_bits/v16.rs",
    "content": "//! `FromBits` and `IntoBits` implementations for portable 16-bit wide vectors\n#[rustfmt::skip]\n\n#[allow(unused)]  // wasm_bindgen_test\nuse crate::*;\n\nimpl_from_bits!(i8x2[test_v16]: u8x2, m8x2);\nimpl_from_bits!(u8x2[test_v16]: i8x2, m8x2);\n// note: m8x2 cannot be constructed from all i8x2 or u8x2 bit patterns\n"
  },
  {
    "path": "src/api/into_bits/v256.rs",
    "content": "//! `FromBits` and `IntoBits` implementations for portable 256-bit wide vectors\n#[rustfmt::skip]\n\n#[allow(unused)]  // wasm_bindgen_test\nuse crate::*;\n\nimpl_from_bits!(\n    i8x32[test_v256]: u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(\n    u8x32[test_v256]: i8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(m8x32[test_v256]: m16x16, m32x8, m64x4, m128x2);\n\nimpl_from_bits!(\n    i16x16[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(\n    u16x16[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(m16x16[test_v256]: m32x8, m64x4, m128x2);\n\nimpl_from_bits!(\n    i32x8[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(\n    u32x8[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(\n    f32x8[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(m32x8[test_v256]: 
m64x4, m128x2);\n\nimpl_from_bits!(\n    i64x4[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(\n    u64x4[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(\n    f64x4[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    m64x4,\n    i128x2,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(m64x4[test_v256]: m128x2);\n\nimpl_from_bits!(\n    i128x2[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    u128x2,\n    m128x2\n);\nimpl_from_bits!(\n    u128x2[test_v256]: i8x32,\n    u8x32,\n    m8x32,\n    i16x16,\n    u16x16,\n    m16x16,\n    i32x8,\n    u32x8,\n    f32x8,\n    m32x8,\n    i64x4,\n    u64x4,\n    f64x4,\n    m64x4,\n    i128x2,\n    m128x2\n);\n// note: m128x2 cannot be constructed from all the other masks bit patterns in\n// here\n"
  },
  {
    "path": "src/api/into_bits/v32.rs",
    "content": "//! `FromBits` and `IntoBits` implementations for portable 32-bit wide vectors\n#[rustfmt::skip]\n\n#[allow(unused)]  // wasm_bindgen_test\nuse crate::*;\n\nimpl_from_bits!(i8x4[test_v32]: u8x4, m8x4, i16x2, u16x2, m16x2);\nimpl_from_bits!(u8x4[test_v32]: i8x4, m8x4, i16x2, u16x2, m16x2);\nimpl_from_bits!(m8x4[test_v32]: m16x2);\n\nimpl_from_bits!(i16x2[test_v32]: i8x4, u8x4, m8x4, u16x2, m16x2);\nimpl_from_bits!(u16x2[test_v32]: i8x4, u8x4, m8x4, i16x2, m16x2);\n// note: m16x2 cannot be constructed from all m8x4 bit patterns\n"
  },
  {
    "path": "src/api/into_bits/v512.rs",
    "content": "//! `FromBits` and `IntoBits` implementations for portable 512-bit wide vectors\n#[rustfmt::skip]\n\n#[allow(unused)]  // wasm_bindgen_test\nuse crate::*;\n\nimpl_from_bits!(\n    i8x64[test_v512]: u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(\n    u8x64[test_v512]: i8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(m8x64[test_v512]: m16x32, m32x16, m64x8, m128x4);\n\nimpl_from_bits!(\n    i16x32[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(\n    u16x32[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(m16x32[test_v512]: m32x16, m64x8, m128x4);\n\nimpl_from_bits!(\n    i32x16[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(\n    u32x16[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(\n    f32x16[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    
m128x4\n);\nimpl_from_bits!(m32x16[test_v512]: m64x8, m128x4);\n\nimpl_from_bits!(\n    i64x8[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(\n    u64x8[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(\n    f64x8[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    m64x8,\n    i128x4,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(m64x8[test_v512]: m128x4);\n\nimpl_from_bits!(\n    i128x4[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    u128x4,\n    m128x4\n);\nimpl_from_bits!(\n    u128x4[test_v512]: i8x64,\n    u8x64,\n    m8x64,\n    i16x32,\n    u16x32,\n    m16x32,\n    i32x16,\n    u32x16,\n    f32x16,\n    m32x16,\n    i64x8,\n    u64x8,\n    f64x8,\n    m64x8,\n    i128x4,\n    m128x4\n);\n// note: m128x4 cannot be constructed from all the other masks bit patterns in\n// here\n"
  },
  {
    "path": "src/api/into_bits/v64.rs",
    "content": "//! `FromBits` and `IntoBits` implementations for portable 64-bit wide vectors\n#[rustfmt::skip]\n\n#[allow(unused)]  // wasm_bindgen_test\nuse crate::*;\n\nimpl_from_bits!(i8x8[test_v64]: u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2);\nimpl_from_bits!(u8x8[test_v64]: i8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2);\nimpl_from_bits!(m8x8[test_v64]: m16x4, m32x2);\n\nimpl_from_bits!(i16x4[test_v64]: i8x8, u8x8, m8x8, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2);\nimpl_from_bits!(u16x4[test_v64]: i8x8, u8x8, m8x8, i16x4, m16x4, i32x2, u32x2, f32x2, m32x2);\nimpl_from_bits!(m16x4[test_v64]: m32x2);\n\nimpl_from_bits!(i32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, u32x2, f32x2, m32x2);\nimpl_from_bits!(u32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, f32x2, m32x2);\nimpl_from_bits!(f32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, m32x2);\n// note: m32x2 cannot be constructed from all m16x4 or m8x8 bit patterns\n"
  },
  {
    "path": "src/api/into_bits.rs",
    "content": "//! Implementation of `FromBits` and `IntoBits`.\n\n/// Safe lossless bitwise conversion from `T` to `Self`.\n#[cfg_attr(doc_cfg, doc(cfg(feature = \"into_bits\")))]\npub trait FromBits<T>: crate::marker::Sized {\n    /// Safe lossless bitwise transmute from `T` to `Self`.\n    fn from_bits(t: T) -> Self;\n}\n\n/// Safe lossless bitwise conversion from `Self` to `T`.\n#[cfg_attr(doc_cfg, doc(cfg(feature = \"into_bits\")))]\npub trait IntoBits<T>: crate::marker::Sized {\n    /// Safe lossless bitwise transmute from `self` to `T`.\n    fn into_bits(self) -> T;\n}\n\n/// `FromBits` implies `IntoBits`.\nimpl<T, U> IntoBits<U> for T\nwhere\n    U: FromBits<T>,\n{\n    #[inline]\n    fn into_bits(self) -> U {\n        debug_assert!(crate::mem::size_of::<Self>() == crate::mem::size_of::<U>());\n        U::from_bits(self)\n    }\n}\n\n/// `FromBits` and `IntoBits` are reflexive\nimpl<T> FromBits<T> for T {\n    #[inline]\n    fn from_bits(t: Self) -> Self {\n        t\n    }\n}\n\n#[macro_use]\nmod macros;\n\nmod v16;\npub use self::v16::*;\n\nmod v32;\npub use self::v32::*;\n\nmod v64;\npub use self::v64::*;\n\nmod v128;\npub use self::v128::*;\n\nmod v256;\npub use self::v256::*;\n\nmod v512;\npub use self::v512::*;\n\nmod arch_specific;\npub use self::arch_specific::*;\n"
  },
  {
    "path": "src/api/math/float/abs.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `abs`.\n\nmacro_rules! impl_math_float_abs {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Absolute value.\n            #[inline]\n            pub fn abs(self) -> Self {\n                use crate::codegen::math::float::abs::Abs;\n                Abs::abs(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_abs>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn abs() {\n                        let o = $id::splat(1 as $elem_ty);\n                        assert_eq!(o, o.abs());\n\n                        let mo = $id::splat(-1 as $elem_ty);\n                        assert_eq!(o, mo.abs());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/consts.rs",
    "content": "macro_rules! impl_float_consts {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident) => {\n        impl $id {\n            /// Machine epsilon value.\n            pub const EPSILON: $id = $id::splat(core::$elem_ty::EPSILON);\n\n            /// Smallest finite value.\n            pub const MIN: $id = $id::splat(core::$elem_ty::MIN);\n\n            /// Smallest positive normal value.\n            pub const MIN_POSITIVE: $id = $id::splat(core::$elem_ty::MIN_POSITIVE);\n\n            /// Largest finite value.\n            pub const MAX: $id = $id::splat(core::$elem_ty::MAX);\n\n            /// Not a Number (NaN).\n            pub const NAN: $id = $id::splat(core::$elem_ty::NAN);\n\n            /// Infinity (∞).\n            pub const INFINITY: $id = $id::splat(core::$elem_ty::INFINITY);\n\n            /// Negative infinity (-∞).\n            pub const NEG_INFINITY: $id = $id::splat(core::$elem_ty::NEG_INFINITY);\n\n            /// Archimedes' constant (π)\n            pub const PI: $id = $id::splat(core::$elem_ty::consts::PI);\n\n            /// π/2\n            pub const FRAC_PI_2: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_2);\n\n            /// π/3\n            pub const FRAC_PI_3: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_3);\n\n            /// π/4\n            pub const FRAC_PI_4: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_4);\n\n            /// π/6\n            pub const FRAC_PI_6: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_6);\n\n            /// π/8\n            pub const FRAC_PI_8: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_8);\n\n            /// 1/π\n            pub const FRAC_1_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_1_PI);\n\n            /// 2/π\n            pub const FRAC_2_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_2_PI);\n\n            /// 2/sqrt(π)\n            pub const FRAC_2_SQRT_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_2_SQRT_PI);\n\n            /// sqrt(2)\n            pub 
const SQRT_2: $id = $id::splat(core::$elem_ty::consts::SQRT_2);\n\n            /// 1/sqrt(2)\n            pub const FRAC_1_SQRT_2: $id = $id::splat(core::$elem_ty::consts::FRAC_1_SQRT_2);\n\n            /// Euler's number (e)\n            pub const E: $id = $id::splat(core::$elem_ty::consts::E);\n\n            /// log<sub>2</sub>(e)\n            pub const LOG2_E: $id = $id::splat(core::$elem_ty::consts::LOG2_E);\n\n            /// log<sub>10</sub>(e)\n            pub const LOG10_E: $id = $id::splat(core::$elem_ty::consts::LOG10_E);\n\n            /// ln(2)\n            pub const LN_2: $id = $id::splat(core::$elem_ty::consts::LN_2);\n\n            /// ln(10)\n            pub const LN_10: $id = $id::splat(core::$elem_ty::consts::LN_10);\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/cos.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `cos`.\n\nmacro_rules! impl_math_float_cos {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Cosine.\n            #[inline]\n            pub fn cos(self) -> Self {\n                use crate::codegen::math::float::cos::Cos;\n                Cos::cos(self)\n            }\n\n            /// Cosine of `self * PI`.\n            #[inline]\n            pub fn cos_pi(self) -> Self {\n                use crate::codegen::math::float::cos_pi::CosPi;\n                CosPi::cos_pi(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_cos>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn cos() {\n                        use crate::$elem_ty::consts::PI;\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let p = $id::splat(PI as $elem_ty);\n                        let ph = $id::splat(PI as $elem_ty / 2.);\n                        let z_r = $id::splat((PI as $elem_ty / 2.).cos());\n                        let o_r = $id::splat((PI as $elem_ty).cos());\n\n                        assert_eq!(o, z.cos());\n                        assert_eq!(z_r, ph.cos());\n                        assert_eq!(o_r, p.cos());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/exp.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `exp`.\n\nmacro_rules! impl_math_float_exp {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Returns the exponential function of `self`: `e^(self)`.\n            #[inline]\n            pub fn exp(self) -> Self {\n                use crate::codegen::math::float::exp::Exp;\n                Exp::exp(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_exp>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn exp() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        assert_eq!(o, z.exp());\n\n                        let e = $id::splat(crate::f64::consts::E as $elem_ty);\n                        let tol = $id::splat(2.4e-4 as $elem_ty);\n                        assert!((e - o.exp()).abs().le(tol).all());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/ln.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `ln`.\n\nmacro_rules! impl_math_float_ln {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Returns the natural logarithm of `self`.\n            #[inline]\n            pub fn ln(self) -> Self {\n                use crate::codegen::math::float::ln::Ln;\n                Ln::ln(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_ln>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn ln() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        assert_eq!(z, o.ln());\n\n                        let e = $id::splat(crate::f64::consts::E as $elem_ty);\n                        let tol = $id::splat(2.4e-4 as $elem_ty);\n                        assert!((o - e.ln()).abs().le(tol).all());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/mul_add.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `mul_add`.\n\nmacro_rules! impl_math_float_mul_add {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Fused multiply add: `self * y + z`\n            #[inline]\n            pub fn mul_add(self, y: Self, z: Self) -> Self {\n                use crate::codegen::math::float::mul_add::MulAdd;\n                MulAdd::mul_add(self, y, z)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_mul_add>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn mul_add() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let t3 = $id::splat(3 as $elem_ty);\n                        let f = $id::splat(4 as $elem_ty);\n\n                        assert_eq!(z, z.mul_add(z, z));\n                        assert_eq!(o, o.mul_add(o, z));\n                        assert_eq!(o, o.mul_add(z, o));\n                        assert_eq!(o, z.mul_add(o, o));\n\n                        assert_eq!(t, o.mul_add(o, o));\n                        assert_eq!(t, o.mul_add(t, z));\n                        assert_eq!(t, t.mul_add(o, z));\n\n                        assert_eq!(f, t.mul_add(t, z));\n                        assert_eq!(f, t.mul_add(o, t));\n                        assert_eq!(t3, t.mul_add(o, o));\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/mul_adde.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `mul_adde`.\n\nmacro_rules! impl_math_float_mul_adde {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Fused multiply add estimate: ~= `self * y + z`\n            ///\n            /// While fused multiply-add (`fma`) has infinite precision,\n            /// `mul_adde` has _at worst_ the same precision of a multiply followed by an add.\n            /// This might be more efficient on architectures that do not have an `fma` instruction.\n            #[inline]\n            pub fn mul_adde(self, y: Self, z: Self) -> Self {\n                use crate::codegen::math::float::mul_adde::MulAddE;\n                MulAddE::mul_adde(self, y, z)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_mul_adde>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn mul_adde() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let t3 = $id::splat(3 as $elem_ty);\n                        let f = $id::splat(4 as $elem_ty);\n\n                        assert_eq!(z, z.mul_adde(z, z));\n                        assert_eq!(o, o.mul_adde(o, z));\n                        assert_eq!(o, o.mul_adde(z, o));\n                        assert_eq!(o, z.mul_adde(o, o));\n\n                        assert_eq!(t, o.mul_adde(o, o));\n                        assert_eq!(t, o.mul_adde(t, z));\n                        assert_eq!(t, t.mul_adde(o, z));\n\n                        assert_eq!(f, t.mul_adde(t, z));\n                        assert_eq!(f, t.mul_adde(o, t));\n                        assert_eq!(t3, t.mul_adde(o, 
o));\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/powf.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `powf`.\n\nmacro_rules! impl_math_float_powf {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Raises `self` number to the floating point power of `x`.\n            #[inline]\n            pub fn powf(self, x: Self) -> Self {\n                use crate::codegen::math::float::powf::Powf;\n                Powf::powf(self, x)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_powf>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn powf() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        assert_eq!(o, o.powf(z));\n                        assert_eq!(o, t.powf(z));\n                        assert_eq!(o, o.powf(o));\n                        assert_eq!(t, t.powf(o));\n\n                        let f = $id::splat(4 as $elem_ty);\n                        assert_eq!(f, t.powf(t));\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/recpre.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `recpre`.\n\nmacro_rules! impl_math_float_recpre {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Reciprocal estimate: `~= 1. / self`.\n            ///\n            /// FIXME: The precision of the estimate is currently unspecified.\n            #[inline]\n            pub fn recpre(self) -> Self {\n                $id::splat(1.) / self\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_recpre>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn recpre() {\n                        let tol = $id::splat(2.4e-4 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let error = (o - o.recpre()).abs();\n                        assert!(error.le(tol).all());\n\n                        let t = $id::splat(2 as $elem_ty);\n                        let e = 0.5;\n                        let error = (e - t.recpre()).abs();\n                        assert!(error.le(tol).all());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/rsqrte.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `rsqrte`.\n\nmacro_rules! impl_math_float_rsqrte {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Reciprocal square-root estimate: `~= 1. / self.sqrt()`.\n            ///\n            /// FIXME: The precision of the estimate is currently unspecified.\n            #[inline]\n            pub fn rsqrte(self) -> Self {\n                unsafe {\n                    use crate::llvm::simd_fsqrt;\n                    $id::splat(1.) / Simd(simd_fsqrt(self.0))\n                }\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_rsqrte>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn rsqrte() {\n                        use crate::$elem_ty::consts::SQRT_2;\n                        let tol = $id::splat(2.4e-4 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let error = (o - o.rsqrte()).abs();\n                        assert!(error.le(tol).all());\n\n                        let t = $id::splat(2 as $elem_ty);\n                        let e = 1. / SQRT_2;\n                        let error = (e - t.rsqrte()).abs();\n                        assert!(error.le(tol).all());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/sin.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `sin`.\n\nmacro_rules! impl_math_float_sin {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Sine.\n            #[inline]\n            pub fn sin(self) -> Self {\n                use crate::codegen::math::float::sin::Sin;\n                Sin::sin(self)\n            }\n\n            /// Sine of `self * PI`.\n            #[inline]\n            pub fn sin_pi(self) -> Self {\n                use crate::codegen::math::float::sin_pi::SinPi;\n                SinPi::sin_pi(self)\n            }\n\n            /// Sine and cosine of `self * PI`.\n            #[inline]\n            pub fn sin_cos_pi(self) -> (Self, Self) {\n                use crate::codegen::math::float::sin_cos_pi::SinCosPi;\n                SinCosPi::sin_cos_pi(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_sin>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn sin() {\n                        use crate::$elem_ty::consts::PI;\n                        let z = $id::splat(0 as $elem_ty);\n                        let p = $id::splat(PI as $elem_ty);\n                        let ph = $id::splat(PI as $elem_ty / 2.);\n                        let o_r = $id::splat((PI as $elem_ty / 2.).sin());\n                        let z_r = $id::splat((PI as $elem_ty).sin());\n\n                        assert_eq!(z, z.sin());\n                        assert_eq!(o_r, ph.sin());\n                        assert_eq!(z_r, p.sin());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/sqrt.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `sqrt`.\n\nmacro_rules! impl_math_float_sqrt {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            #[inline]\n            pub fn sqrt(self) -> Self {\n                use crate::codegen::math::float::sqrt::Sqrt;\n                Sqrt::sqrt(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_sqrt>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn sqrt() {\n                        use crate::$elem_ty::consts::SQRT_2;\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        assert_eq!(z, z.sqrt());\n                        assert_eq!(o, o.sqrt());\n\n                        let t = $id::splat(2 as $elem_ty);\n                        let e = $id::splat(SQRT_2);\n                        assert_eq!(e, t.sqrt());\n\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/sqrte.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `sqrte`.\n\nmacro_rules! impl_math_float_sqrte {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Square-root estimate.\n            ///\n            /// FIXME: The precision of the estimate is currently unspecified.\n            #[inline]\n            pub fn sqrte(self) -> Self {\n                use crate::codegen::math::float::sqrte::Sqrte;\n                Sqrte::sqrte(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_sqrte>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn sqrte() {\n                        use crate::$elem_ty::consts::SQRT_2;\n                        let tol = $id::splat(2.4e-4 as $elem_ty);\n\n                        let z = $id::splat(0 as $elem_ty);\n                        let error = (z - z.sqrte()).abs();\n                        assert!(error.le(tol).all());\n\n                        let o = $id::splat(1 as $elem_ty);\n                        let error = (o - o.sqrte()).abs();\n                        assert!(error.le(tol).all());\n\n                        let t = $id::splat(2 as $elem_ty);\n                        let e = $id::splat(SQRT_2 as $elem_ty);\n                        let error = (e - t.sqrte()).abs();\n\n                        assert!(error.le(tol).all());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float/tanh.rs",
    "content": "//! Implements vertical (lane-wise) floating-point `tanh`.\n\nmacro_rules! impl_math_float_tanh {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Tanh.\n            #[inline]\n            pub fn tanh(self) -> Self {\n                use crate::codegen::math::float::tanh::Tanh;\n                Tanh::tanh(self)\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _math_tanh>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn tanh() {\n                        let z = $id::splat(0 as $elem_ty);\n\n                        assert_eq!(z, z.tanh());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math/float.rs",
    "content": "//! Implements vertical floating-point math operations.\n\n#[macro_use]\nmod abs;\n\n#[macro_use]\nmod consts;\n\n#[macro_use]\nmod cos;\n\n#[macro_use]\nmod exp;\n\n#[macro_use]\nmod powf;\n\n#[macro_use]\nmod ln;\n\n#[macro_use]\nmod mul_add;\n\n#[macro_use]\nmod mul_adde;\n\n#[macro_use]\nmod recpre;\n\n#[macro_use]\nmod rsqrte;\n\n#[macro_use]\nmod sin;\n\n#[macro_use]\nmod sqrt;\n\n#[macro_use]\nmod sqrte;\n\n#[macro_use]\nmod tanh;\n\nmacro_rules! impl_float_category {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident, $mask_ty:ident) => {\n        impl $id {\n            #[inline]\n            pub fn is_nan(self) -> $mask_ty {\n                self.ne(self)\n            }\n\n            #[inline]\n            pub fn is_infinite(self) -> $mask_ty {\n                self.eq(Self::INFINITY) | self.eq(Self::NEG_INFINITY)\n            }\n\n            #[inline]\n            pub fn is_finite(self) -> $mask_ty {\n                !(self.is_nan() | self.is_infinite())\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/math.rs",
    "content": "//! Implements vertical math operations\n\n#[macro_use]\nmod float;\n"
  },
  {
    "path": "src/api/minimal/iuf.rs",
    "content": "//! Minimal API of signed integer, unsigned integer, and floating-point\n//! vectors.\n\nmacro_rules! impl_minimal_iuf {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident |\n     $test_tt:tt | $($elem_name:ident),+ | $(#[$doc:meta])*) => {\n\n        $(#[$doc])*\n        pub type $id = Simd<[$elem_ty; $elem_count]>;\n\n        impl sealed::Simd for $id {\n            type Element = $elem_ty;\n            const LANES: usize = $elem_count;\n            type LanesType = [u32; $elem_count];\n        }\n\n        impl $id {\n            /// Creates a new instance with each vector elements initialized\n            /// with the provided values.\n            #[inline]\n            #[allow(clippy::too_many_arguments)]\n            pub const fn new($($elem_name: $elem_ty),*) -> Self {\n                Simd(codegen::$id($($elem_name as $ielem_ty),*))\n            }\n\n            /// Returns the number of vector lanes.\n            #[inline]\n            pub const fn lanes() -> usize {\n                $elem_count\n            }\n\n            /// Constructs a new instance with each element initialized to\n            /// `value`.\n            #[inline]\n            pub const fn splat(value: $elem_ty) -> Self {\n                Simd(codegen::$id($({\n                    #[allow(non_camel_case_types, dead_code)]\n                    struct $elem_name;\n                    value as $ielem_ty\n                }),*))\n            }\n\n            /// Extracts the value at `index`.\n            ///\n            /// # Panics\n            ///\n            /// If `index >= Self::lanes()`.\n            #[inline]\n            pub fn extract(self, index: usize) -> $elem_ty {\n                assert!(index < $elem_count);\n                unsafe { self.extract_unchecked(index) }\n            }\n\n            /// Extracts the value at `index`.\n            ///\n            /// # Safety\n            ///\n            /// If `index >= Self::lanes()` the 
behavior is undefined.\n            #[inline]\n            pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty {\n                use crate::llvm::simd_extract;\n                let e: $ielem_ty = simd_extract(self.0, index as u32);\n                e as $elem_ty\n            }\n\n            /// Returns a new vector where the value at `index` is replaced by `new_value`.\n            ///\n            /// # Panics\n            ///\n            /// If `index >= Self::lanes()`.\n            #[inline]\n            #[must_use = \"replace does not modify the original value - \\\n                          it returns a new vector with the value at `index` \\\n                          replaced by `new_value`d\"\n            ]\n            pub fn replace(self, index: usize, new_value: $elem_ty) -> Self {\n                assert!(index < $elem_count);\n                unsafe { self.replace_unchecked(index, new_value) }\n            }\n\n            /// Returns a new vector where the value at `index` is replaced by `new_value`.\n            ///\n            /// # Safety\n            ///\n            /// If `index >= Self::lanes()` the behavior is undefined.\n            #[inline]\n            #[must_use = \"replace_unchecked does not modify the original value - \\\n                          it returns a new vector with the value at `index` \\\n                          replaced by `new_value`d\"\n            ]\n            pub unsafe fn replace_unchecked(\n                self,\n                index: usize,\n                new_value: $elem_ty,\n            ) -> Self {\n                use crate::llvm::simd_insert;\n                Simd(simd_insert(self.0, index as u32, new_value as $ielem_ty))\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                // Comparisons use integer casts within mantissa^1 range.\n                #[allow(clippy::float_cmp)]\n                pub mod [<$id _minimal>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn minimal() {\n                        // lanes:\n                        assert_eq!($elem_count, $id::lanes());\n\n                        // splat and extract / extract_unchecked:\n                        const VAL: $elem_ty = 7 as $elem_ty;\n                        const VEC: $id = $id::splat(VAL);\n                        for i in 0..$id::lanes() {\n                            assert_eq!(VAL, VEC.extract(i));\n                            assert_eq!(\n                                VAL, unsafe { VEC.extract_unchecked(i) }\n                            );\n                        }\n\n                        // replace / replace_unchecked\n                        let new_vec = VEC.replace(0, 42 as $elem_ty);\n                        for i in 0..$id::lanes() {\n                            if i == 0 {\n                                assert_eq!(42 as $elem_ty, new_vec.extract(i));\n                            } else {\n                                assert_eq!(VAL, new_vec.extract(i));\n                            }\n                        }\n                        let new_vec = unsafe {\n                            VEC.replace_unchecked(0, 42 as $elem_ty)\n                        };\n                        for i in 0..$id::lanes() {\n                            if i == 0 {\n                                assert_eq!(42 as $elem_ty, new_vec.extract(i));\n                            } else {\n                                assert_eq!(VAL, new_vec.extract(i));\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not 
support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn extract_panic_oob() {\n                        const VAL: $elem_ty = 7 as $elem_ty;\n                        const VEC: $id = $id::splat(VAL);\n                        let _ = VEC.extract($id::lanes());\n                    }\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn replace_panic_oob() {\n                        const VAL: $elem_ty = 7 as $elem_ty;\n                        const VEC: $id = $id::splat(VAL);\n                        let _ = VEC.replace($id::lanes(), 42 as $elem_ty);\n                    }\n                }\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/api/minimal/mask.rs",
    "content": "//! Minimal API of mask vectors.\n\nmacro_rules! impl_minimal_mask {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident\n    | $test_tt:tt | $($elem_name:ident),+ | $(#[$doc:meta])*) => {\n        $(#[$doc])*\n        pub type $id = Simd<[$elem_ty; $elem_count]>;\n\n        impl sealed::Simd for $id {\n            type Element = $elem_ty;\n            const LANES: usize = $elem_count;\n            type LanesType = [u32; $elem_count];\n        }\n\n        impl $id {\n            /// Creates a new instance with each vector elements initialized\n            /// with the provided values.\n            #[inline]\n            #[allow(clippy::too_many_arguments)]\n            pub const fn new($($elem_name: bool),*) -> Self {\n                Simd(codegen::$id($(Self::bool_to_internal($elem_name)),*))\n            }\n\n            /// Converts a boolean type into the type of the vector lanes.\n            #[inline]\n            #[allow(clippy::indexing_slicing)]\n            const fn bool_to_internal(x: bool) -> $ielem_ty {\n                [0 as $ielem_ty, !(0 as $ielem_ty)][x as usize]\n            }\n\n            /// Returns the number of vector lanes.\n            #[inline]\n            pub const fn lanes() -> usize {\n                $elem_count\n            }\n\n            /// Constructs a new instance with each element initialized to\n            /// `value`.\n            #[inline]\n            pub const fn splat(value: bool) -> Self {\n                Simd(codegen::$id($({\n                    #[allow(non_camel_case_types, dead_code)]\n                    struct $elem_name;\n                    Self::bool_to_internal(value)\n                }),*))\n            }\n\n            /// Extracts the value at `index`.\n            ///\n            /// # Panics\n            ///\n            /// If `index >= Self::lanes()`.\n            #[inline]\n            pub fn extract(self, index: usize) -> bool {\n                assert!(index < 
$elem_count);\n                unsafe { self.extract_unchecked(index) }\n            }\n\n            /// Extracts the value at `index`.\n            ///\n            /// # Safety\n            ///\n            /// If `index >= Self::lanes()` the behavior is undefined.\n            #[inline]\n            pub unsafe fn extract_unchecked(self, index: usize) -> bool {\n                use crate::llvm::simd_extract;\n                let x: $ielem_ty = simd_extract(self.0, index as u32);\n                x != 0\n            }\n\n            /// Returns a new vector where the value at `index` is replaced by\n            /// `new_value`.\n            ///\n            /// # Panics\n            ///\n            /// If `index >= Self::lanes()`.\n            #[inline]\n            #[must_use = \"replace does not modify the original value - \\\n                          it returns a new vector with the value at `index` \\\n                          replaced by `new_value`d\"\n            ]\n            pub fn replace(self, index: usize, new_value: bool) -> Self {\n                assert!(index < $elem_count);\n                unsafe { self.replace_unchecked(index, new_value) }\n            }\n\n            /// Returns a new vector where the value at `index` is replaced by\n            /// `new_value`.\n            ///\n            /// # Safety\n            ///\n            /// If `index >= Self::lanes()` the behavior is undefined.\n            #[inline]\n            #[must_use = \"replace_unchecked does not modify the original value - \\\n                          it returns a new vector with the value at `index` \\\n                          replaced by `new_value`d\"\n            ]\n            pub unsafe fn replace_unchecked(\n                self,\n                index: usize,\n                new_value: bool,\n            ) -> Self {\n                use crate::llvm::simd_insert;\n                Simd(simd_insert(self.0, index as u32,\n                                 
Self::bool_to_internal(new_value)))\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _minimal>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn minimal() {\n                        // TODO: test new\n\n                        // lanes:\n                        assert_eq!($elem_count, $id::lanes());\n\n                        // splat and extract / extract_unchecked:\n                        let vec = $id::splat(true);\n                        for i in 0..$id::lanes() {\n                            assert_eq!(true, vec.extract(i));\n                            assert_eq!(true,\n                                       unsafe { vec.extract_unchecked(i) }\n                            );\n                        }\n\n                        // replace / replace_unchecked\n                        let new_vec = vec.replace(0, false);\n                        for i in 0..$id::lanes() {\n                            if i == 0 {\n                                assert_eq!(false, new_vec.extract(i));\n                            } else {\n                                assert_eq!(true, new_vec.extract(i));\n                            }\n                        }\n                        let new_vec = unsafe {\n                            vec.replace_unchecked(0, false)\n                        };\n                        for i in 0..$id::lanes() {\n                            if i == 0 {\n                                assert_eq!(false, new_vec.extract(i));\n                            } else {\n                                assert_eq!(true, new_vec.extract(i));\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n            
        // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn extract_panic_oob() {\n                        let vec = $id::splat(false);\n                        let _ = vec.extract($id::lanes());\n                    }\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn replace_panic_oob() {\n                        let vec = $id::splat(false);\n                        let _ = vec.replace($id::lanes(), true);\n                    }\n                }\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/api/minimal/ptr.rs",
    "content": "//! Minimal API of pointer vectors.\n\nmacro_rules! impl_minimal_p {\n    ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident,\n     $usize_ty:ident, $isize_ty:ident | $ref:ident | $test_tt:tt\n     | $($elem_name:ident),+ | ($true:expr, $false:expr) |\n     $(#[$doc:meta])*) => {\n\n        $(#[$doc])*\n        pub type $id<T> = Simd<[$elem_ty; $elem_count]>;\n\n        impl<T> sealed::Simd for $id<T> {\n            type Element = $elem_ty;\n            const LANES: usize = $elem_count;\n            type LanesType = [u32; $elem_count];\n        }\n\n        impl<T> $id<T> {\n            /// Creates a new instance with each vector elements initialized\n            /// with the provided values.\n            #[inline]\n            #[allow(clippy::too_many_arguments)]\n            pub const fn new($($elem_name: $elem_ty),*) -> Self {\n                Simd(codegen::$id($($elem_name),*))\n            }\n\n            /// Returns the number of vector lanes.\n            #[inline]\n            pub const fn lanes() -> usize {\n                $elem_count\n            }\n\n            /// Constructs a new instance with each element initialized to\n            /// `value`.\n            #[inline]\n            pub const fn splat(value: $elem_ty) -> Self {\n                Simd(codegen::$id($({\n                    #[allow(non_camel_case_types, dead_code)]\n                    struct $elem_name;\n                    value\n                }),*))\n            }\n\n            /// Constructs a new instance with each element initialized to\n            /// `null`.\n            #[inline]\n            pub const fn null() -> Self {\n                Self::splat(crate::ptr::null_mut() as $elem_ty)\n            }\n\n            /// Returns a mask that selects those lanes that contain `null`\n            /// pointers.\n            #[inline]\n            pub fn is_null(self) -> $mask_ty {\n                self.eq(Self::null())\n            }\n\n            /// 
Extracts the value at `index`.\n            ///\n            /// # Panics\n            ///\n            /// If `index >= Self::lanes()`.\n            #[inline]\n            pub fn extract(self, index: usize) -> $elem_ty {\n                assert!(index < $elem_count);\n                unsafe { self.extract_unchecked(index) }\n            }\n\n            /// Extracts the value at `index`.\n            ///\n            /// # Safety\n            ///\n            /// If `index >= Self::lanes()` the behavior is undefined.\n            #[inline]\n            pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty {\n                use crate::llvm::simd_extract;\n                simd_extract(self.0, index as u32)\n            }\n\n            /// Returns a new vector where the value at `index` is replaced by\n            /// `new_value`.\n            ///\n            /// # Panics\n            ///\n            /// If `index >= Self::lanes()`.\n            #[inline]\n            #[must_use = \"replace does not modify the original value - \\\n                          it returns a new vector with the value at `index` \\\n                          replaced by `new_value`d\"\n            ]\n            #[allow(clippy::not_unsafe_ptr_arg_deref)]\n            pub fn replace(self, index: usize, new_value: $elem_ty) -> Self {\n                assert!(index < $elem_count);\n                unsafe { self.replace_unchecked(index, new_value) }\n            }\n\n            /// Returns a new vector where the value at `index` is replaced by `new_value`.\n            ///\n            /// # Safety\n            ///\n            /// If `index >= Self::lanes()` the behavior is undefined.\n            #[inline]\n            #[must_use = \"replace_unchecked does not modify the original value - \\\n                          it returns a new vector with the value at `index` \\\n                          replaced by `new_value`d\"\n            ]\n            pub unsafe fn 
replace_unchecked(\n                self,\n                index: usize,\n                new_value: $elem_ty,\n            ) -> Self {\n                use crate::llvm::simd_insert;\n                Simd(simd_insert(self.0, index as u32, new_value))\n            }\n        }\n\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _minimal>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn minimal() {\n                        // lanes:\n                        assert_eq!($elem_count, $id::<i32>::lanes());\n\n                        // splat and extract / extract_unchecked:\n                        let VAL7: <$id<i32> as sealed::Simd>::Element\n                            = $ref!(7);\n                        let VAL42: <$id<i32> as sealed::Simd>::Element\n                            = $ref!(42);\n                        let VEC: $id<i32> = $id::splat(VAL7);\n                        for i in 0..$id::<i32>::lanes() {\n                            assert_eq!(VAL7, VEC.extract(i));\n                            assert_eq!(\n                                VAL7, unsafe { VEC.extract_unchecked(i) }\n                            );\n                        }\n\n                        // replace / replace_unchecked\n                        let new_vec = VEC.replace(0, VAL42);\n                        for i in 0..$id::<i32>::lanes() {\n                            if i == 0 {\n                                assert_eq!(VAL42, new_vec.extract(i));\n                            } else {\n                                assert_eq!(VAL7, new_vec.extract(i));\n                            }\n                        }\n                        let new_vec = unsafe {\n                            VEC.replace_unchecked(0, VAL42)\n                        };\n                        for 
i in 0..$id::<i32>::lanes() {\n                            if i == 0 {\n                                assert_eq!(VAL42, new_vec.extract(i));\n                            } else {\n                                assert_eq!(VAL7, new_vec.extract(i));\n                            }\n                        }\n\n                        let mut n = $id::<i32>::null();\n                        assert_eq!(\n                            n,\n                            $id::<i32>::splat(unsafe { crate::mem::zeroed() })\n                        );\n                        assert!(n.is_null().all());\n                        n = n.replace(\n                            0, unsafe { crate::mem::transmute(1_isize) }\n                        );\n                        assert!(!n.is_null().all());\n                        if $id::<i32>::lanes() > 1 {\n                            assert!(n.is_null().any());\n                        } else {\n                            assert!(!n.is_null().any());\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn extract_panic_oob() {\n                        let VAL: <$id<i32> as sealed::Simd>::Element\n                            = $ref!(7);\n                        let VEC: $id<i32> = $id::splat(VAL);\n                        let _ = VEC.extract($id::<i32>::lanes());\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    
#[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn replace_panic_oob() {\n                        let VAL: <$id<i32> as sealed::Simd>::Element\n                            = $ref!(7);\n                        let VAL42: <$id<i32> as sealed::Simd>::Element\n                            = $ref!(42);\n                        let VEC: $id<i32> = $id::splat(VAL);\n                        let _ = VEC.replace($id::<i32>::lanes(), VAL42);\n                    }\n                }\n            }\n        }\n\n        impl<T> crate::fmt::Debug for $id<T> {\n            #[allow(clippy::missing_inline_in_public_items)]\n            fn fmt(&self, f: &mut crate::fmt::Formatter<'_>)\n                   -> crate::fmt::Result {\n                write!(\n                    f,\n                    \"{}<{}>(\",\n                    stringify!($id),\n                    crate::intrinsics::type_name::<T>()\n                )?;\n                for i in 0..$elem_count {\n                    if i > 0 {\n                        write!(f, \", \")?;\n                    }\n                    self.extract(i).fmt(f)?;\n                }\n                write!(f, \")\")\n            }\n        }\n\n         test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _fmt_debug>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn debug() {\n                        use arrayvec::{ArrayString,ArrayVec};\n                        type TinyString = ArrayString<[u8; 512]>;\n\n                        use crate::fmt::Write;\n                        let v = $id::<i32>::default();\n                        let mut s = TinyString::new();\n                        write!(&mut s, \"{:?}\", v).unwrap();\n\n                        let mut beg = TinyString::new();\n                        write!(&mut beg, \"{}<i32>(\", stringify!($id)).unwrap();\n                        assert!(\n                            s.starts_with(beg.as_str()),\n                            \"s = {} (should start with = {})\", s, beg\n                        );\n                        assert!(s.ends_with(\")\"));\n                        let s: ArrayVec<[TinyString; 64]>\n                            = s.replace(beg.as_str(), \"\")\n                            .replace(\")\", \"\").split(\",\")\n                            .map(|v| TinyString::from(v.trim()).unwrap())\n                            .collect();\n                        assert_eq!(s.len(), $id::<i32>::lanes());\n                        for (index, ss) in s.into_iter().enumerate() {\n                            let mut e = TinyString::new();\n                            write!(&mut e, \"{:?}\", v.extract(index)).unwrap();\n                            assert_eq!(ss, e);\n                        }\n                    }\n                }\n            }\n         }\n\n        impl<T> Default for $id<T> {\n            #[inline]\n            fn default() -> Self {\n                // FIXME: ptrs do not implement default\n                Self::null()\n            }\n        }\n\n        test_if!{\n            $test_tt:\n        
    paste::item! {\n                pub mod [<$id _default>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn default() {\n                        let a = $id::<i32>::default();\n                        for i in 0..$id::<i32>::lanes() {\n                            assert_eq!(\n                                a.extract(i), unsafe { crate::mem::zeroed() }\n                            );\n                        }\n                    }\n                }\n            }\n        }\n\n        impl<T> $id<T> {\n            /// Lane-wise equality comparison.\n            #[inline]\n            pub fn eq(self, other: Self) -> $mask_ty {\n                unsafe {\n                    use crate::llvm::simd_eq;\n                    let a: $usize_ty = crate::mem::transmute(self);\n                    let b: $usize_ty = crate::mem::transmute(other);\n                    Simd(simd_eq(a.0, b.0))\n                }\n            }\n\n            /// Lane-wise inequality comparison.\n            #[inline]\n            pub fn ne(self, other: Self) -> $mask_ty {\n                unsafe {\n                    use crate::llvm::simd_ne;\n                    let a: $usize_ty = crate::mem::transmute(self);\n                    let b: $usize_ty = crate::mem::transmute(other);\n                    Simd(simd_ne(a.0, b.0))\n                }\n            }\n\n            /// Lane-wise less-than comparison.\n            #[inline]\n            pub fn lt(self, other: Self) -> $mask_ty {\n                unsafe {\n                    use crate::llvm::simd_lt;\n                    let a: $usize_ty = crate::mem::transmute(self);\n                    let b: $usize_ty = crate::mem::transmute(other);\n                    Simd(simd_lt(a.0, b.0))\n                }\n            }\n\n            /// Lane-wise less-than-or-equals comparison.\n     
       #[inline]\n            pub fn le(self, other: Self) -> $mask_ty {\n                unsafe {\n                    use crate::llvm::simd_le;\n                    let a: $usize_ty = crate::mem::transmute(self);\n                    let b: $usize_ty = crate::mem::transmute(other);\n                    Simd(simd_le(a.0, b.0))\n                }\n            }\n\n            /// Lane-wise greater-than comparison.\n            #[inline]\n            pub fn gt(self, other: Self) -> $mask_ty {\n                unsafe {\n                    use crate::llvm::simd_gt;\n                    let a: $usize_ty = crate::mem::transmute(self);\n                    let b: $usize_ty = crate::mem::transmute(other);\n                    Simd(simd_gt(a.0, b.0))\n                }\n            }\n\n            /// Lane-wise greater-than-or-equals comparison.\n            #[inline]\n            pub fn ge(self, other: Self) -> $mask_ty {\n                unsafe {\n                    use crate::llvm::simd_ge;\n                    let a: $usize_ty = crate::mem::transmute(self);\n                    let b: $usize_ty = crate::mem::transmute(other);\n                    Simd(simd_ge(a.0, b.0))\n                }\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _cmp_vertical>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn cmp() {\n                        let a = $id::<i32>::null();\n                        let b = $id::<i32>::splat(unsafe {\n                            crate::mem::transmute(1_isize)\n                        });\n\n                        let r = a.lt(b);\n                        let e = $mask_ty::splat(true);\n                        assert!(r == e);\n                        let r = a.le(b);\n                        assert!(r == e);\n\n                        let e = $mask_ty::splat(false);\n                        let r = a.gt(b);\n                        assert!(r == e);\n                        let r = a.ge(b);\n                        assert!(r == e);\n                        let r = a.eq(b);\n                        assert!(r == e);\n\n                        let mut a = a;\n                        let mut b = b;\n                        let mut e = e;\n                        for i in 0..$id::<i32>::lanes() {\n                            if i % 2 == 0 {\n                                a = a.replace(\n                                    i,\n                                    unsafe { crate::mem::transmute(0_isize) }\n                                );\n                                b = b.replace(\n                                    i,\n                                    unsafe { crate::mem::transmute(1_isize) }\n                                );\n                                e = e.replace(i, true);\n                            } else {\n                                a = a.replace(\n                                    i,\n                                    unsafe { crate::mem::transmute(1_isize) }\n                                );\n                                b = b.replace(\n                
                    i,\n                                    unsafe { crate::mem::transmute(0_isize) }\n                                );\n                                e = e.replace(i, false);\n                            }\n                        }\n                        let r = a.lt(b);\n                        assert!(r == e);\n                    }\n                }\n            }\n        }\n\n        #[allow(clippy::partialeq_ne_impl)]\n        impl<T> crate::cmp::PartialEq<$id<T>> for $id<T> {\n            #[inline]\n            fn eq(&self, other: &Self) -> bool {\n                $id::<T>::eq(*self, *other).all()\n            }\n            #[inline]\n            fn ne(&self, other: &Self) -> bool {\n                $id::<T>::ne(*self, *other).any()\n            }\n        }\n\n        // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892\n        #[allow(clippy::partialeq_ne_impl)]\n        impl<T> crate::cmp::PartialEq<LexicographicallyOrdered<$id<T>>>\n            for LexicographicallyOrdered<$id<T>>\n        {\n            #[inline]\n            fn eq(&self, other: &Self) -> bool {\n                self.0 == other.0\n            }\n            #[inline]\n            fn ne(&self, other: &Self) -> bool {\n                self.0 != other.0\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _cmp_PartialEq>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn partial_eq() {\n                        let a = $id::<i32>::null();\n                        let b = $id::<i32>::splat(unsafe {\n                            crate::mem::transmute(1_isize)\n                        });\n\n                        assert!(a != b);\n                        assert!(!(a == b));\n                        assert!(a == a);\n                        assert!(!(a != a));\n\n                        if $id::<i32>::lanes() > 1 {\n                            let a = $id::<i32>::null().replace(0, unsafe {\n                                crate::mem::transmute(1_isize)\n                            });\n                            let b = $id::<i32>::splat(unsafe {\n                                crate::mem::transmute(1_isize)\n                            });\n\n                            assert!(a != b);\n                            assert!(!(a == b));\n                            assert!(a == a);\n                            assert!(!(a != a));\n                        }\n                    }\n                }\n            }\n        }\n\n        impl<T> crate::cmp::Eq for $id<T> {}\n        impl<T> crate::cmp::Eq for LexicographicallyOrdered<$id<T>> {}\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _cmp_eq>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn eq() {\n                        fn foo<E: crate::cmp::Eq>(_: E) {}\n                        let a = $id::<i32>::null();\n                        foo(a);\n                    }\n                }\n            }\n        }\n\n        impl<T> From<[$elem_ty; $elem_count]> for $id<T> {\n            #[inline]\n            fn from(array: [$elem_ty; $elem_count]) -> Self {\n                unsafe {\n                    // FIXME: unnecessary zeroing; better than UB.\n                    let mut u: Self = crate::mem::zeroed();\n                    crate::ptr::copy_nonoverlapping(\n                        &array as *const [$elem_ty; $elem_count] as *const u8,\n                        &mut u as *mut Self as *mut u8,\n                        crate::mem::size_of::<Self>()\n                    );\n                    u\n                }\n            }\n        }\n        impl<T> Into<[$elem_ty; $elem_count]> for $id<T> {\n            #[inline]\n            fn into(self) -> [$elem_ty; $elem_count] {\n                unsafe {\n                    // FIXME: unnecessary zeroing; better than UB.\n                    let mut u: [$elem_ty; $elem_count] = crate::mem::zeroed();\n                    crate::ptr::copy_nonoverlapping(\n                        &self as *const $id<T> as *const u8,\n                        &mut u as *mut [$elem_ty; $elem_count] as *mut u8,\n                        crate::mem::size_of::<Self>()\n                    );\n                    u\n                }\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _from>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn array() {\n                        let values = [1_i32; $elem_count];\n\n                        let mut vec: $id<i32> = Default::default();\n                        let mut array = [\n                            $id::<i32>::null().extract(0); $elem_count\n                        ];\n\n                        for i in 0..$elem_count {\n                            let ptr = &values[i] as *const i32 as *mut i32;\n                            vec = vec.replace(i, ptr);\n                            array[i] = ptr;\n                        }\n\n                        // FIXME: there is no impl of From<$id<T>> for [$elem_ty; N]\n                        // let a0 = From::from(vec);\n                        // assert_eq!(a0, array);\n                        #[allow(unused_assignments)]\n                        let mut a1 = array;\n                        a1 = vec.into();\n                        assert_eq!(a1, array);\n\n                        let v0: $id<i32> = From::from(array);\n                        assert_eq!(v0, vec);\n                        let v1: $id<i32> = array.into();\n                        assert_eq!(v1, vec);\n                    }\n                }\n            }\n        }\n\n        impl<T> $id<T> {\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Panics\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned\n            /// to an `align_of::<Self>()` boundary.\n            #[inline]\n            pub fn from_slice_aligned(slice: &[$elem_ty]) -> Self {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    let target_ptr = slice.as_ptr();\n               
     assert!(\n                        target_ptr.align_offset(crate::mem::align_of::<Self>())\n                            == 0\n                    );\n                    Self::from_slice_aligned_unchecked(slice)\n                }\n            }\n\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Panics\n            ///\n            /// If `slice.len() < Self::lanes()`.\n            #[inline]\n            pub fn from_slice_unaligned(slice: &[$elem_ty]) -> Self {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    Self::from_slice_unaligned_unchecked(slice)\n                }\n            }\n\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned\n            /// to an `align_of::<Self>()` boundary, the behavior is undefined.\n            #[inline]\n            pub unsafe fn from_slice_aligned_unchecked(slice: &[$elem_ty])\n                                                       -> Self {\n                #[allow(clippy::cast_ptr_alignment)]\n                *(slice.as_ptr().cast())\n            }\n\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` the behavior is undefined.\n            #[inline]\n            pub unsafe fn from_slice_unaligned_unchecked(\n                slice: &[$elem_ty],\n            ) -> Self {\n                use crate::mem::size_of;\n                let target_ptr = slice.as_ptr().cast();\n                let mut x = Self::splat(crate::ptr::null_mut() as $elem_ty);\n                let self_ptr = &mut x as *mut Self as *mut u8;\n                crate::ptr::copy_nonoverlapping(\n                    target_ptr,\n                    
self_ptr,\n                    size_of::<Self>(),\n                );\n                x\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _slice_from_slice>] {\n                    use super::*;\n                    use crate::iter::Iterator;\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn from_slice_unaligned() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n\n                        let mut unaligned = [\n                            non_null; $id::<i32>::lanes() + 1\n                        ];\n                        unaligned[0] = null;\n                        let vec = $id::<i32>::from_slice_unaligned(\n                            &unaligned[1..]\n                        );\n                        for (index, &b) in unaligned.iter().enumerate() {\n                            if index == 0 {\n                                assert_eq!(b, null);\n                            } else {\n                                assert_eq!(b, non_null);\n                                assert_eq!(b, vec.extract(index - 1));\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn from_slice_unaligned_fail() {\n                        let (_null, non_null) = ptr_vals!($id<i32>);\n                        let unaligned = [non_null; $id::<i32>::lanes() + 1];\n                        // the slice is not large enough => panic\n                        
let _vec = $id::<i32>::from_slice_unaligned(\n                            &unaligned[2..]\n                        );\n                    }\n\n                    union A {\n                        data: [<$id<i32> as sealed::Simd>::Element;\n                               2 * $id::<i32>::lanes()],\n                        _vec: $id<i32>,\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn from_slice_aligned() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n                        let mut aligned = A {\n                            data: [null; 2 * $id::<i32>::lanes()],\n                        };\n                        for i in\n                            $id::<i32>::lanes()..(2 * $id::<i32>::lanes()) {\n                            unsafe {\n                                aligned.data[i] = non_null;\n                            }\n                        }\n\n                        let vec = unsafe {\n                            $id::<i32>::from_slice_aligned(\n                                &aligned.data[$id::<i32>::lanes()..]\n                            )\n                        };\n                        for (index, &b) in unsafe {\n                            aligned.data.iter().enumerate()\n                        } {\n                            if index < $id::<i32>::lanes() {\n                                assert_eq!(b, null);\n                            } else {\n                                assert_eq!(b, non_null);\n                                assert_eq!(\n                                    b, vec.extract(index - $id::<i32>::lanes())\n                                );\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // 
#[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn from_slice_aligned_fail_lanes() {\n                        let (_null, non_null) = ptr_vals!($id<i32>);\n                        let aligned = A {\n                            data: [non_null; 2 * $id::<i32>::lanes()],\n                        };\n                        // the slice is not large enough => panic\n                        let _vec = unsafe {\n                            $id::<i32>::from_slice_aligned(\n                                &aligned.data[2 * $id::<i32>::lanes()..]\n                            )\n                        };\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn from_slice_aligned_fail_align() {\n                        unsafe {\n                            let (null, _non_null) = ptr_vals!($id<i32>);\n                            let aligned = A {\n                                data: [null; 2 * $id::<i32>::lanes()],\n                            };\n\n                            // get a pointer to the front of data\n                            let ptr = aligned.data.as_ptr();\n                            // offset pointer by one element\n                            let ptr = ptr.wrapping_add(1);\n\n                            if ptr.align_offset(\n                                crate::mem::align_of::<$id<i32>>()\n                            ) == 0 {\n                                // the pointer is 
properly aligned, so\n                                // from_slice_aligned won't fail here (e.g. this\n                                // can happen for i128x1). So we panic to make\n                                // the \"should_fail\" test pass:\n                                panic!(\"ok\");\n                            }\n\n                            // create a slice - this is safe, because the\n                            // elements of the slice exist, are properly\n                            // initialized, and properly aligned:\n                            let s = slice::from_raw_parts(\n                                ptr, $id::<i32>::lanes()\n                            );\n                            // this should always panic because the slice\n                            // alignment does not match the alignment\n                            // requirements for the vector type:\n                            let _vec = $id::<i32>::from_slice_aligned(s);\n                        }\n                    }\n                }\n            }\n        }\n\n        impl<T> $id<T> {\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Panics\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not\n            /// aligned to an `align_of::<Self>()` boundary.\n            #[inline]\n            pub fn write_to_slice_aligned(self, slice: &mut [$elem_ty]) {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    let target_ptr = slice.as_mut_ptr();\n                    assert!(\n                        target_ptr.align_offset(crate::mem::align_of::<Self>())\n                            == 0\n                    );\n                    self.write_to_slice_aligned_unchecked(slice);\n                }\n            }\n\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Panics\n            
///\n            /// If `slice.len() < Self::lanes()`.\n            #[inline]\n            pub fn write_to_slice_unaligned(self, slice: &mut [$elem_ty]) {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    self.write_to_slice_unaligned_unchecked(slice);\n                }\n            }\n\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not\n            /// aligned to an `align_of::<Self>()` boundary, the behavior is\n            /// undefined.\n            #[inline]\n            pub unsafe fn write_to_slice_aligned_unchecked(\n                self, slice: &mut [$elem_ty],\n            ) {\n                #[allow(clippy::cast_ptr_alignment)]\n                *(slice.as_mut_ptr().cast()) = self;\n            }\n\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` the behavior is undefined.\n            #[inline]\n            pub unsafe fn write_to_slice_unaligned_unchecked(\n                self, slice: &mut [$elem_ty],\n            ) {\n                let target_ptr = slice.as_mut_ptr().cast();\n                let self_ptr = &self as *const Self as *const u8;\n                crate::ptr::copy_nonoverlapping(\n                    self_ptr,\n                    target_ptr,\n                    crate::mem::size_of::<Self>(),\n                );\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _slice_write_to_slice>] {\n                    use super::*;\n                    use crate::iter::Iterator;\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn write_to_slice_unaligned() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n                        let mut unaligned = [null; $id::<i32>::lanes() + 1];\n                        let vec = $id::<i32>::splat(non_null);\n                        vec.write_to_slice_unaligned(&mut unaligned[1..]);\n                        for (index, &b) in unaligned.iter().enumerate() {\n                            if index == 0 {\n                                assert_eq!(b, null);\n                            } else {\n                                assert_eq!(b, non_null);\n                                assert_eq!(b, vec.extract(index - 1));\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn write_to_slice_unaligned_fail() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n                        let mut unaligned = [null; $id::<i32>::lanes() + 1];\n                        let vec = $id::<i32>::splat(non_null);\n                        // the slice is not large enough => panic\n                        vec.write_to_slice_unaligned(&mut unaligned[2..]);\n                    }\n\n                    union A {\n                        data: [<$id<i32> as sealed::Simd>::Element;\n                               2 * 
$id::<i32>::lanes()],\n                        _vec: $id<i32>,\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn write_to_slice_aligned() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n                        let mut aligned = A {\n                            data: [null; 2 * $id::<i32>::lanes()],\n                        };\n                        let vec = $id::<i32>::splat(non_null);\n                        unsafe {\n                            vec.write_to_slice_aligned(\n                                &mut aligned.data[$id::<i32>::lanes()..]\n                            )\n                        };\n                        for (index, &b) in\n                            unsafe { aligned.data.iter().enumerate() } {\n                            if index < $id::<i32>::lanes() {\n                                assert_eq!(b, null);\n                            } else {\n                                assert_eq!(b, non_null);\n                                assert_eq!(\n                                    b, vec.extract(index - $id::<i32>::lanes())\n                                );\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn write_to_slice_aligned_fail_lanes() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n                        let mut aligned = A {\n                            data: [null; 2 * $id::<i32>::lanes()],\n                        };\n  
                      let vec = $id::<i32>::splat(non_null);\n                        // the slice is not large enough => panic\n                        unsafe {\n                            vec.write_to_slice_aligned(\n                                &mut aligned.data[2 * $id::<i32>::lanes()..]\n                            )\n                        };\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn write_to_slice_aligned_fail_align() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n                        unsafe {\n                            let mut aligned = A {\n                                data: [null; 2 * $id::<i32>::lanes()],\n                            };\n\n                            // get a pointer to the front of data\n                            let ptr = aligned.data.as_mut_ptr();\n                            // offset pointer by one element\n                            let ptr = ptr.wrapping_add(1);\n\n                            if ptr.align_offset(\n                                crate::mem::align_of::<$id<i32>>()\n                            ) == 0 {\n                                // the pointer is properly aligned, so\n                                // write_to_slice_aligned won't fail here (e.g.\n                                // this can happen for i128x1). 
So we panic to\n                                // make the \"should_fail\" test pass:\n                                panic!(\"ok\");\n                            }\n\n                            // create a slice - this is safe, because the\n                            // elements of the slice exist, are properly\n                            // initialized, and properly aligned:\n                            let s = slice::from_raw_parts_mut(\n                                ptr, $id::<i32>::lanes()\n                            );\n                            // this should always panic because the slice\n                            // alignment does not match the alignment\n                            // requirements for the vector type:\n                            let vec = $id::<i32>::splat(non_null);\n                            vec.write_to_slice_aligned(s);\n                        }\n                    }\n                }\n            }\n        }\n\n        impl<T> crate::hash::Hash for $id<T> {\n            #[inline]\n            fn hash<H: crate::hash::Hasher>(&self, state: &mut H) {\n                let s: $usize_ty = unsafe { crate::mem::transmute(*self) };\n                s.hash(state)\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _hash>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn hash() {\n                        use crate::hash::{Hash, Hasher};\n                        #[allow(deprecated)]\n                        use crate::hash::{SipHasher13};\n\n                        let values = [1_i32; $elem_count];\n\n                        let mut vec: $id<i32> = Default::default();\n                        let mut array = [\n                            $id::<i32>::null().extract(0);\n                            $elem_count\n                        ];\n\n                        for i in 0..$elem_count {\n                            let ptr = &values[i] as *const i32 as *mut i32;\n                            vec = vec.replace(i, ptr);\n                            array[i] = ptr;\n                        }\n\n                        #[allow(deprecated)]\n                        let mut a_hash = SipHasher13::new();\n                        let mut v_hash = a_hash.clone();\n                        array.hash(&mut a_hash);\n                        vec.hash(&mut v_hash);\n                        assert_eq!(a_hash.finish(), v_hash.finish());\n                    }\n                }\n            }\n        }\n\n        impl<T> $id<T> {\n            /// Calculates the offset from a pointer.\n            ///\n            /// `count` is in units of `T`; e.g. 
a count of `3` represents a\n            /// pointer offset of `3 * size_of::<T>()` bytes.\n            ///\n            /// # Safety\n            ///\n            /// If any of the following conditions are violated, the result is\n            /// Undefined Behavior:\n            ///\n            /// * Both the starting and resulting pointer must be either in\n            /// bounds or one byte past the end of an allocated object.\n            ///\n            /// * The computed offset, in bytes, cannot overflow an `isize`.\n            ///\n            /// * The offset being in bounds cannot rely on \"wrapping around\"\n            /// the address space. That is, the infinite-precision sum, in bytes\n            /// must fit in a `usize`.\n            ///\n            /// The compiler and standard library generally tries to ensure\n            /// allocations never reach a size where an offset is a concern. For\n            /// instance, `Vec` and `Box` ensure they never allocate more than\n            /// `isize::MAX` bytes, so `vec.as_ptr().offset(vec.len() as isize)`\n            /// is always safe.\n            ///\n            /// Most platforms fundamentally can't even construct such an\n            /// allocation. For instance, no known 64-bit platform can ever\n            /// serve a request for 263 bytes due to page-table limitations or\n            /// splitting the address space. However, some 32-bit and 16-bit\n            /// platforms may successfully serve a request for more than\n            /// `isize::MAX` bytes with things like Physical Address Extension.\n            /// As such, memory acquired directly from allocators or memory\n            /// mapped files may be too large to handle with this function.\n            ///\n            /// Consider using `wrapping_offset` instead if these constraints\n            /// are difficult to satisfy. 
The only advantage of this method is\n            /// that it enables more aggressive compiler optimizations.\n            #[inline]\n            pub unsafe fn offset(self, count: $isize_ty) -> Self {\n                // FIXME: should use LLVM's `add nsw nuw`\n                self.wrapping_offset(count)\n            }\n\n            /// Calculates the offset from a pointer using wrapping arithmetic.\n            ///\n            /// `count` is in units of `T`; e.g. a count of `3` represents a\n            /// pointer offset of `3 * size_of::<T>()` bytes.\n            ///\n            /// # Safety\n            ///\n            /// The resulting pointer does not need to be in bounds, but it is\n            /// potentially hazardous to dereference (which requires unsafe).\n            ///\n            /// Always use `.offset(count)` instead when possible, because\n            /// offset allows the compiler to optimize better.\n            #[inline]\n            pub fn wrapping_offset(self, count: $isize_ty) -> Self {\n                unsafe {\n                    let x: $isize_ty = crate::mem::transmute(self);\n                    // note: {+,*} currently performs a `wrapping_{add, mul}`\n                    crate::mem::transmute(\n                        x + (count * crate::mem::size_of::<T>() as isize)\n                    )\n                }\n            }\n\n            /// Calculates the distance between two pointers.\n            ///\n            /// The returned value is in units of `T`: the distance in bytes is\n            /// divided by `mem::size_of::<T>()`.\n            ///\n            /// This function is the inverse of offset.\n            ///\n            /// # Safety\n            ///\n            /// If any of the following conditions are violated, the result is\n            /// Undefined Behavior:\n            ///\n            /// * Both the starting and other pointer must be either in bounds\n            /// or one byte past the end of the same 
allocated object.\n            ///\n            /// * The distance between the pointers, in bytes, cannot overflow\n            /// an `isize`.\n            ///\n            /// * The distance between the pointers, in bytes, must be an exact\n            /// multiple of the size of `T`.\n            ///\n            /// * The distance being in bounds cannot rely on \"wrapping around\"\n            /// the address space.\n            ///\n            /// The compiler and standard library generally try to ensure\n            /// allocations never reach a size where an offset is a concern. For\n            /// instance, `Vec` and `Box` ensure they never allocate more than\n            /// `isize::MAX` bytes, so `ptr_into_vec.offset_from(vec.as_ptr())`\n            /// is always safe.\n            ///\n            /// Most platforms fundamentally can't even construct such an\n            /// allocation. For instance, no known 64-bit platform can ever\n            /// serve a request for 263 bytes due to page-table limitations or\n            /// splitting the address space. However, some 32-bit and 16-bit\n            /// platforms may successfully serve a request for more than\n            /// `isize::MAX` bytes with things like Physical Address Extension.\n            /// As such, memory acquired directly from allocators or memory\n            /// mapped files may be too large to handle with this function.\n            ///\n            /// Consider using `wrapping_offset_from` instead if these constraints\n            /// are difficult to satisfy. 
The only advantage of this method is\n            /// that it enables more aggressive compiler optimizations.\n            #[inline]\n            pub unsafe fn offset_from(self, origin: Self) -> $isize_ty {\n                // FIXME: should use LLVM's `sub nsw nuw`.\n                self.wrapping_offset_from(origin)\n            }\n\n            /// Calculates the distance between two pointers.\n            ///\n            /// The returned value is in units of `T`: the distance in bytes is\n            /// divided by `mem::size_of::<T>()`.\n            ///\n            /// If the address different between the two pointers is not a\n            /// multiple of `mem::size_of::<T>()` then the result of the\n            /// division is rounded towards zero.\n            ///\n            /// Though this method is safe for any two pointers, note that its\n            /// result will be mostly useless if the two pointers aren't into\n            /// the same allocated object, for example if they point to two\n            /// different local variables.\n            #[inline]\n            pub fn wrapping_offset_from(self, origin: Self) -> $isize_ty {\n                let x: $isize_ty = unsafe { crate::mem::transmute(self) };\n                let y: $isize_ty = unsafe { crate::mem::transmute(origin) };\n                // note: {-,/} currently perform wrapping_{sub, div}\n                (y - x) / (crate::mem::size_of::<T>() as isize)\n            }\n\n            /// Calculates the offset from a pointer (convenience for\n            /// `.offset(count as isize)`).\n            ///\n            /// `count` is in units of `T`; e.g. 
a count of 3 represents a\n            /// pointer offset of `3 * size_of::<T>()` bytes.\n            ///\n            /// # Safety\n            ///\n            /// If any of the following conditions are violated, the result is\n            /// Undefined Behavior:\n            ///\n            /// * Both the starting and resulting pointer must be either in\n            /// bounds or one byte past the end of an allocated object.\n            ///\n            /// * The computed offset, in bytes, cannot overflow an `isize`.\n            ///\n            /// * The offset being in bounds cannot rely on \"wrapping around\"\n            /// the address space. That is, the infinite-precision sum must fit\n            /// in a `usize`.\n            ///\n            /// The compiler and standard library generally tries to ensure\n            /// allocations never reach a size where an offset is a concern. For\n            /// instance, `Vec` and `Box` ensure they never allocate more than\n            /// `isize::MAX` bytes, so `vec.as_ptr().add(vec.len())` is always\n            /// safe.\n            ///\n            /// Most platforms fundamentally can't even construct such an\n            /// allocation. For instance, no known 64-bit platform can ever\n            /// serve a request for 263 bytes due to page-table limitations or\n            /// splitting the address space. However, some 32-bit and 16-bit\n            /// platforms may successfully serve a request for more than\n            /// `isize::MAX` bytes with things like Physical Address Extension.\n            /// As such, memory acquired directly from allocators or memory\n            /// mapped files may be too large to handle with this function.\n            ///\n            /// Consider using `wrapping_offset` instead if these constraints\n            /// are difficult to satisfy. 
The only advantage of this method is\n            /// that it enables more aggressive compiler optimizations.\n            #[inline]\n            #[allow(clippy::should_implement_trait)]\n            pub unsafe fn add(self, count: $usize_ty) -> Self {\n                self.offset(count.cast())\n            }\n\n            /// Calculates the offset from a pointer (convenience for\n            /// `.offset((count as isize).wrapping_neg())`).\n            ///\n            /// `count` is in units of T; e.g. a `count` of 3 represents a\n            /// pointer offset of `3 * size_of::<T>()` bytes.\n            ///\n            /// # Safety\n            ///\n            /// If any of the following conditions are violated, the result is\n            /// Undefined Behavior:\n            ///\n            /// * Both the starting and resulting pointer must be either in\n            /// bounds or one byte past the end of an allocated object.\n            ///\n            /// * The computed offset cannot exceed `isize::MAX` **bytes**.\n            ///\n            /// * The offset being in bounds cannot rely on \"wrapping around\"\n            /// the address space. That is, the infinite-precision sum must fit\n            /// in a usize.\n            ///\n            /// The compiler and standard library generally tries to ensure\n            /// allocations never reach a size where an offset is a concern. For\n            /// instance, `Vec` and `Box` ensure they never allocate more than\n            /// `isize::MAX` bytes, so\n            /// `vec.as_ptr().add(vec.len()).sub(vec.len())` is always safe.\n            ///\n            /// Most platforms fundamentally can't even construct such an\n            /// allocation. For instance, no known 64-bit platform can ever\n            /// serve a request for 2<sup>63</sup> bytes due to page-table\n            /// limitations or splitting the address space. 
However, some 32-bit\n            /// and 16-bit platforms may successfully serve a request for more\n            /// than `isize::MAX` bytes with things like Physical Address\n            /// Extension. As such, memory acquired directly from allocators or\n            /// memory mapped files *may* be too large to handle with this\n            /// function.\n            ///\n            /// Consider using `wrapping_offset` instead if these constraints\n            /// are difficult to satisfy. The only advantage of this method is\n            /// that it enables more aggressive compiler optimizations.\n            #[inline]\n            #[allow(clippy::should_implement_trait)]\n            pub unsafe fn sub(self, count: $usize_ty) -> Self {\n                let x: $isize_ty = count.cast();\n                // note: - is currently wrapping_neg\n                self.offset(-x)\n            }\n\n            /// Calculates the offset from a pointer using wrapping arithmetic.\n            /// (convenience for `.wrapping_offset(count as isize)`)\n            ///\n            /// `count` is in units of T; e.g. a `count` of 3 represents a\n            /// pointer offset of `3 * size_of::<T>()` bytes.\n            ///\n            /// # Safety\n            ///\n            /// The resulting pointer does not need to be in bounds, but it is\n            /// potentially hazardous to dereference (which requires `unsafe`).\n            ///\n            /// Always use `.add(count)` instead when possible, because `add`\n            /// allows the compiler to optimize better.\n            #[inline]\n            pub fn wrapping_add(self, count: $usize_ty) -> Self {\n                self.wrapping_offset(count.cast())\n            }\n\n            /// Calculates the offset from a pointer using wrapping arithmetic.\n            /// (convenience for `.wrapping_offset((count as\n            /// isize).wrapping_sub())`)\n            ///\n            /// `count` is in units of T; e.g. 
a `count` of 3 represents a\n            /// pointer offset of `3 * size_of::<T>()` bytes.\n            ///\n            /// # Safety\n            ///\n            /// The resulting pointer does not need to be in bounds, but it is\n            /// potentially hazardous to dereference (which requires `unsafe`).\n            ///\n            /// Always use `.sub(count)` instead when possible, because `sub`\n            /// allows the compiler to optimize better.\n            #[inline]\n            pub fn wrapping_sub(self, count: $usize_ty) -> Self {\n                let x: $isize_ty = count.cast();\n                self.wrapping_offset(-1 * x)\n            }\n        }\n\n        impl<T> $id<T> {\n            /// Shuffle vector elements according to `indices`.\n            #[inline]\n            pub fn shuffle1_dyn<I>(self, indices: I) -> Self\n                where\n                Self: codegen::shuffle1_dyn::Shuffle1Dyn<Indices = I>,\n            {\n                codegen::shuffle1_dyn::Shuffle1Dyn::shuffle1_dyn(self, indices)\n            }\n        }\n\n        test_if! {\n                $test_tt:\n            paste::item! 
{\n                pub mod [<$id _shuffle1_dyn>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn shuffle1_dyn() {\n                        let (null, non_null) = ptr_vals!($id<i32>);\n\n                        // alternating = [non_null, null, non_null, null, ...]\n                        let mut alternating = $id::<i32>::splat(null);\n                        for i in 0..$id::<i32>::lanes() {\n                            if i % 2 == 0 {\n                                alternating = alternating.replace(i, non_null);\n                            }\n                        }\n\n                        type Indices = <$id<i32>\n                            as codegen::shuffle1_dyn::Shuffle1Dyn>::Indices;\n                        // even = [0, 0, 2, 2, 4, 4, ..]\n                        let even = {\n                            let mut v = Indices::splat(0);\n                            for i in 0..$id::<i32>::lanes() {\n                                if i % 2 == 0 {\n                                    v = v.replace(i, (i as u8).into());\n                                } else {\n                                v = v.replace(i, (i as u8 - 1).into());\n                                }\n                            }\n                            v\n                        };\n                        // odd = [1, 1, 3, 3, 5, 5, ...]\n                        let odd = {\n                            let mut v = Indices::splat(0);\n                            for i in 0..$id::<i32>::lanes() {\n                                if i % 2 != 0 {\n                                    v = v.replace(i, (i as u8).into());\n                                } else {\n                                    v = v.replace(i, (i as u8 + 1).into());\n                                }\n                            }\n                      
      v\n                        };\n\n                        assert_eq!(\n                            alternating.shuffle1_dyn(even),\n                            $id::<i32>::splat(non_null)\n                        );\n                        if $id::<i32>::lanes() > 1 {\n                            assert_eq!(\n                                alternating.shuffle1_dyn(odd),\n                                $id::<i32>::splat(null)\n                            );\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/minimal.rs",
    "content": "#[macro_use]\nmod iuf;\n#[macro_use]\nmod mask;\n#[macro_use]\nmod ptr;\n"
  },
  {
    "path": "src/api/ops/scalar_arithmetic.rs",
    "content": "//! Vertical (lane-wise) vector-scalar / scalar-vector arithmetic operations.\n\nmacro_rules! impl_ops_scalar_arithmetic {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::ops::Add<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn add(self, other: $elem_ty) -> Self {\n                self + $id::splat(other)\n            }\n        }\n        impl crate::ops::Add<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn add(self, other: $id) -> $id {\n                $id::splat(self) + other\n            }\n        }\n\n        impl crate::ops::Sub<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn sub(self, other: $elem_ty) -> Self {\n                self - $id::splat(other)\n            }\n        }\n        impl crate::ops::Sub<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn sub(self, other: $id) -> $id {\n                $id::splat(self) - other\n            }\n        }\n\n        impl crate::ops::Mul<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn mul(self, other: $elem_ty) -> Self {\n                self * $id::splat(other)\n            }\n        }\n        impl crate::ops::Mul<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn mul(self, other: $id) -> $id {\n                $id::splat(self) * other\n            }\n        }\n\n        impl crate::ops::Div<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn div(self, other: $elem_ty) -> Self {\n                self / $id::splat(other)\n            }\n        }\n        impl crate::ops::Div<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn div(self, other: $id) -> $id {\n                $id::splat(self) / other\n            }\n     
   }\n\n        impl crate::ops::Rem<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn rem(self, other: $elem_ty) -> Self {\n                self % $id::splat(other)\n            }\n        }\n        impl crate::ops::Rem<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn rem(self, other: $id) -> $id {\n                $id::splat(self) % other\n            }\n        }\n\n        impl crate::ops::AddAssign<$elem_ty> for $id {\n            #[inline]\n            fn add_assign(&mut self, other: $elem_ty) {\n                *self = *self + other;\n            }\n        }\n\n        impl crate::ops::SubAssign<$elem_ty> for $id {\n            #[inline]\n            fn sub_assign(&mut self, other: $elem_ty) {\n                *self = *self - other;\n            }\n        }\n\n        impl crate::ops::MulAssign<$elem_ty> for $id {\n            #[inline]\n            fn mul_assign(&mut self, other: $elem_ty) {\n                *self = *self * other;\n            }\n        }\n\n        impl crate::ops::DivAssign<$elem_ty> for $id {\n            #[inline]\n            fn div_assign(&mut self, other: $elem_ty) {\n                *self = *self / other;\n            }\n        }\n\n        impl crate::ops::RemAssign<$elem_ty> for $id {\n            #[inline]\n            fn rem_assign(&mut self, other: $elem_ty) {\n                *self = *self % other;\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _ops_scalar_arith>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn ops_scalar_arithmetic() {\n                        let zi = 0 as $elem_ty;\n                        let oi = 1 as $elem_ty;\n                        let ti = 2 as $elem_ty;\n                        let fi = 4 as $elem_ty;\n                        let z = $id::splat(zi);\n                        let o = $id::splat(oi);\n                        let t = $id::splat(ti);\n                        let f = $id::splat(fi);\n\n                        // add\n                        assert_eq!(zi + z, z);\n                        assert_eq!(z + zi, z);\n                        assert_eq!(oi + z, o);\n                        assert_eq!(o + zi, o);\n                        assert_eq!(ti + z, t);\n                        assert_eq!(t + zi, t);\n                        assert_eq!(ti + t, f);\n                        assert_eq!(t + ti, f);\n                        // sub\n                        assert_eq!(zi - z, z);\n                        assert_eq!(z - zi, z);\n                        assert_eq!(oi - z, o);\n                        assert_eq!(o - zi, o);\n                        assert_eq!(ti - z, t);\n                        assert_eq!(t - zi, t);\n                        assert_eq!(fi - t, t);\n                        assert_eq!(f - ti, t);\n                        assert_eq!(f - o - o, t);\n                        assert_eq!(f - oi - oi, t);\n                        // mul\n                        assert_eq!(zi * z, z);\n                        assert_eq!(z * zi, z);\n                        assert_eq!(zi * o, z);\n                        assert_eq!(z * oi, z);\n                        assert_eq!(zi * t, z);\n                        assert_eq!(z * ti, z);\n                        assert_eq!(oi * t, t);\n                        
assert_eq!(o * ti, t);\n                        assert_eq!(ti * t, f);\n                        assert_eq!(t * ti, f);\n                        // div\n                        assert_eq!(zi / o, z);\n                        assert_eq!(z / oi, z);\n                        assert_eq!(ti / o, t);\n                        assert_eq!(t / oi, t);\n                        assert_eq!(fi / o, f);\n                        assert_eq!(f / oi, f);\n                        assert_eq!(ti / t, o);\n                        assert_eq!(t / ti, o);\n                        assert_eq!(fi / t, t);\n                        assert_eq!(f / ti, t);\n                        // rem\n                        assert_eq!(oi % o, z);\n                        assert_eq!(o % oi, z);\n                        assert_eq!(fi % t, z);\n                        assert_eq!(f % ti, z);\n\n                        {\n                            let mut v = z;\n                            assert_eq!(v, z);\n                            v += oi; // add_assign\n                            assert_eq!(v, o);\n                            v -= oi; // sub_assign\n                            assert_eq!(v, z);\n                            v = t;\n                            v *= oi; // mul_assign\n                            assert_eq!(v, t);\n                            v *= ti;\n                            assert_eq!(v, f);\n                            v /= oi; // div_assign\n                            assert_eq!(v, f);\n                            v /= ti;\n                            assert_eq!(v, t);\n                            v %= ti; // rem_assign\n                            assert_eq!(v, z);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/scalar_bitwise.rs",
    "content": "//! Vertical (lane-wise) vector-scalar / scalar-vector bitwise operations.\n\nmacro_rules! impl_ops_scalar_bitwise {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $test_tt:tt |\n        ($true:expr, $false:expr)\n    ) => {\n        impl crate::ops::BitXor<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn bitxor(self, other: $elem_ty) -> Self {\n                self ^ $id::splat(other)\n            }\n        }\n        impl crate::ops::BitXor<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn bitxor(self, other: $id) -> $id {\n                $id::splat(self) ^ other\n            }\n        }\n\n        impl crate::ops::BitAnd<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn bitand(self, other: $elem_ty) -> Self {\n                self & $id::splat(other)\n            }\n        }\n        impl crate::ops::BitAnd<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn bitand(self, other: $id) -> $id {\n                $id::splat(self) & other\n            }\n        }\n\n        impl crate::ops::BitOr<$elem_ty> for $id {\n            type Output = Self;\n            #[inline]\n            fn bitor(self, other: $elem_ty) -> Self {\n                self | $id::splat(other)\n            }\n        }\n        impl crate::ops::BitOr<$id> for $elem_ty {\n            type Output = $id;\n            #[inline]\n            fn bitor(self, other: $id) -> $id {\n                $id::splat(self) | other\n            }\n        }\n\n        impl crate::ops::BitAndAssign<$elem_ty> for $id {\n            #[inline]\n            fn bitand_assign(&mut self, other: $elem_ty) {\n                *self = *self & other;\n            }\n        }\n        impl crate::ops::BitOrAssign<$elem_ty> for $id {\n            #[inline]\n            fn bitor_assign(&mut self, other: $elem_ty) {\n     
           *self = *self | other;\n            }\n        }\n        impl crate::ops::BitXorAssign<$elem_ty> for $id {\n            #[inline]\n            fn bitxor_assign(&mut self, other: $elem_ty) {\n                *self = *self ^ other;\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _ops_scalar_bitwise>] {\n                    use super::*;\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn ops_scalar_bitwise() {\n                        let zi = 0 as $elem_ty;\n                        let oi = 1 as $elem_ty;\n                        let ti = 2 as $elem_ty;\n                        let z = $id::splat(zi);\n                        let o = $id::splat(oi);\n                        let t = $id::splat(ti);\n\n                        // BitAnd:\n                        assert_eq!(oi & o, o);\n                        assert_eq!(o & oi, o);\n                        assert_eq!(oi & z, z);\n                        assert_eq!(o & zi, z);\n                        assert_eq!(zi & o, z);\n                        assert_eq!(z & oi, z);\n                        assert_eq!(zi & z, z);\n                        assert_eq!(z & zi, z);\n\n                        assert_eq!(ti & t, t);\n                        assert_eq!(t & ti, t);\n                        assert_eq!(ti & o, z);\n                        assert_eq!(t & oi, z);\n                        assert_eq!(oi & t, z);\n                        assert_eq!(o & ti, z);\n\n                        // BitOr:\n                        assert_eq!(oi | o, o);\n                        assert_eq!(o | oi, o);\n                        assert_eq!(oi | z, o);\n                        assert_eq!(o | zi, o);\n                        assert_eq!(zi | o, o);\n                        assert_eq!(z | oi, o);\n                        assert_eq!(zi | z, z);\n              
          assert_eq!(z | zi, z);\n\n                        assert_eq!(ti | t, t);\n                        assert_eq!(t | ti, t);\n                        assert_eq!(zi | t, t);\n                        assert_eq!(z | ti, t);\n                        assert_eq!(ti | z, t);\n                        assert_eq!(t | zi, t);\n\n                        // BitXOR:\n                        assert_eq!(oi ^ o, z);\n                        assert_eq!(o ^ oi, z);\n                        assert_eq!(zi ^ z, z);\n                        assert_eq!(z ^ zi, z);\n                        assert_eq!(zi ^ o, o);\n                        assert_eq!(z ^ oi, o);\n                        assert_eq!(oi ^ z, o);\n                        assert_eq!(o ^ zi, o);\n\n                        assert_eq!(ti ^ t, z);\n                        assert_eq!(t ^ ti, z);\n                        assert_eq!(ti ^ z, t);\n                        assert_eq!(t ^ zi, t);\n                        assert_eq!(zi ^ t, t);\n                        assert_eq!(z ^ ti, t);\n\n                        {\n                            // AndAssign:\n                            let mut v = o;\n                            v &= ti;\n                            assert_eq!(v, z);\n                        }\n                        {\n                            // OrAssign:\n                            let mut v = z;\n                            v |= oi;\n                            assert_eq!(v, o);\n                        }\n                        {\n                            // XORAssign:\n                            let mut v = z;\n                            v ^= oi;\n                            assert_eq!(v, o);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/scalar_mask_bitwise.rs",
    "content": "//! Vertical (lane-wise) vector-vector bitwise operations.\n\nmacro_rules! impl_ops_scalar_mask_bitwise {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $test_tt:tt |\n        ($true:expr, $false:expr)\n    ) => {\n        impl crate::ops::BitXor<bool> for $id {\n            type Output = Self;\n            #[inline]\n            fn bitxor(self, other: bool) -> Self {\n                self ^ $id::splat(other)\n            }\n        }\n        impl crate::ops::BitXor<$id> for bool {\n            type Output = $id;\n            #[inline]\n            fn bitxor(self, other: $id) -> $id {\n                $id::splat(self) ^ other\n            }\n        }\n\n        impl crate::ops::BitAnd<bool> for $id {\n            type Output = Self;\n            #[inline]\n            fn bitand(self, other: bool) -> Self {\n                self & $id::splat(other)\n            }\n        }\n        impl crate::ops::BitAnd<$id> for bool {\n            type Output = $id;\n            #[inline]\n            fn bitand(self, other: $id) -> $id {\n                $id::splat(self) & other\n            }\n        }\n\n        impl crate::ops::BitOr<bool> for $id {\n            type Output = Self;\n            #[inline]\n            fn bitor(self, other: bool) -> Self {\n                self | $id::splat(other)\n            }\n        }\n        impl crate::ops::BitOr<$id> for bool {\n            type Output = $id;\n            #[inline]\n            fn bitor(self, other: $id) -> $id {\n                $id::splat(self) | other\n            }\n        }\n\n        impl crate::ops::BitAndAssign<bool> for $id {\n            #[inline]\n            fn bitand_assign(&mut self, other: bool) {\n                *self = *self & other;\n            }\n        }\n        impl crate::ops::BitOrAssign<bool> for $id {\n            #[inline]\n            fn bitor_assign(&mut self, other: bool) {\n                *self = *self | other;\n            }\n        }\n  
      impl crate::ops::BitXorAssign<bool> for $id {\n            #[inline]\n            fn bitxor_assign(&mut self, other: bool) {\n                *self = *self ^ other;\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _ops_scalar_mask_bitwise>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn ops_scalar_mask_bitwise() {\n                        let ti = true;\n                        let fi = false;\n                        let t = $id::splat(ti);\n                        let f = $id::splat(fi);\n                        assert!(t != f);\n                        assert!(!(t == f));\n\n                        // BitAnd:\n                        assert_eq!(ti & f, f);\n                        assert_eq!(t & fi, f);\n                        assert_eq!(fi & t, f);\n                        assert_eq!(f & ti, f);\n                        assert_eq!(ti & t, t);\n                        assert_eq!(t & ti, t);\n                        assert_eq!(fi & f, f);\n                        assert_eq!(f & fi, f);\n\n                        // BitOr:\n                        assert_eq!(ti | f, t);\n                        assert_eq!(t | fi, t);\n                        assert_eq!(fi | t, t);\n                        assert_eq!(f | ti, t);\n                        assert_eq!(ti | t, t);\n                        assert_eq!(t | ti, t);\n                        assert_eq!(fi | f, f);\n                        assert_eq!(f | fi, f);\n\n                        // BitXOR:\n                        assert_eq!(ti ^ f, t);\n                        assert_eq!(t ^ fi, t);\n                        assert_eq!(fi ^ t, t);\n                        assert_eq!(f ^ ti, t);\n                        assert_eq!(ti ^ t, f);\n                        assert_eq!(t ^ ti, f);\n                     
   assert_eq!(fi ^ f, f);\n                        assert_eq!(f ^ fi, f);\n\n                        {\n                            // AndAssign:\n                            let mut v = f;\n                            v &= ti;\n                            assert_eq!(v, f);\n                        }\n                        {\n                            // OrAssign:\n                            let mut v = f;\n                            v |= ti;\n                            assert_eq!(v, t);\n                        }\n                        {\n                            // XORAssign:\n                            let mut v = f;\n                            v ^= ti;\n                            assert_eq!(v, t);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/scalar_shifts.rs",
    "content": "//! Vertical (lane-wise) vector-scalar shifts operations.\n\nmacro_rules! impl_ops_scalar_shifts {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::ops::Shl<u32> for $id {\n            type Output = Self;\n            #[inline]\n            fn shl(self, other: u32) -> Self {\n                self << $id::splat(other as $elem_ty)\n            }\n        }\n        impl crate::ops::Shr<u32> for $id {\n            type Output = Self;\n            #[inline]\n            fn shr(self, other: u32) -> Self {\n                self >> $id::splat(other as $elem_ty)\n            }\n        }\n\n        impl crate::ops::ShlAssign<u32> for $id {\n            #[inline]\n            fn shl_assign(&mut self, other: u32) {\n                *self = *self << other;\n            }\n        }\n        impl crate::ops::ShrAssign<u32> for $id {\n            #[inline]\n            fn shr_assign(&mut self, other: u32) {\n                *self = *self >> other;\n            }\n        }\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _ops_scalar_shifts>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg_attr(any(target_arch = \"s390x\", target_arch = \"sparc64\"),\n                               allow(unreachable_code, unused_variables)\n                    )]\n                    #[cfg(not(target_arch = \"aarch64\"))]\n                    //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317\n                    fn ops_scalar_shifts() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let f = $id::splat(4 as $elem_ty);\n\n                        {\n                            let zi = 0 as u32;\n                            let oi = 1 as u32;\n                            let ti = 2 as u32;\n                            let maxi\n                                = (mem::size_of::<$elem_ty>() * 8 - 1) as u32;\n\n                            // shr\n                            assert_eq!(z >> zi, z);\n                            assert_eq!(z >> oi, z);\n                            assert_eq!(z >> ti, z);\n                            assert_eq!(z >> ti, z);\n\n                            #[cfg(any(target_arch = \"s390x\", target_arch = \"sparc64\"))] {\n                                // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/13\n                                return;\n                            }\n\n                            assert_eq!(o >> zi, o);\n                            assert_eq!(t >> zi, t);\n                            assert_eq!(f >> zi, f);\n                            assert_eq!(f >> maxi, z);\n\n                            assert_eq!(o >> oi, z);\n                            assert_eq!(t >> oi, o);\n                           
 assert_eq!(t >> ti, z);\n                            assert_eq!(f >> oi, t);\n                            assert_eq!(f >> ti, o);\n                            assert_eq!(f >> maxi, z);\n\n                            // shl\n                            assert_eq!(z << zi, z);\n                            assert_eq!(o << zi, o);\n                            assert_eq!(t << zi, t);\n                            assert_eq!(f << zi, f);\n                            assert_eq!(f << maxi, z);\n\n                            assert_eq!(o << oi, t);\n                            assert_eq!(o << ti, f);\n                            assert_eq!(t << oi, f);\n\n                            {  // shr_assign\n                                let mut v = o;\n                                v >>= oi;\n                                assert_eq!(v, z);\n                            }\n                            {  // shl_assign\n                                let mut v = o;\n                                v <<= oi;\n                                assert_eq!(v, t);\n                            }\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_arithmetic.rs",
    "content": "//! Vertical (lane-wise) vector-vector arithmetic operations.\n\nmacro_rules! impl_ops_vector_arithmetic {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::ops::Add for $id {\n            type Output = Self;\n            #[inline]\n            fn add(self, other: Self) -> Self {\n                use crate::llvm::simd_add;\n                unsafe { Simd(simd_add(self.0, other.0)) }\n            }\n        }\n\n        impl crate::ops::Sub for $id {\n            type Output = Self;\n            #[inline]\n            fn sub(self, other: Self) -> Self {\n                use crate::llvm::simd_sub;\n                unsafe { Simd(simd_sub(self.0, other.0)) }\n            }\n        }\n\n        impl crate::ops::Mul for $id {\n            type Output = Self;\n            #[inline]\n            fn mul(self, other: Self) -> Self {\n                use crate::llvm::simd_mul;\n                unsafe { Simd(simd_mul(self.0, other.0)) }\n            }\n        }\n\n        impl crate::ops::Div for $id {\n            type Output = Self;\n            #[inline]\n            fn div(self, other: Self) -> Self {\n                use crate::llvm::simd_div;\n                unsafe { Simd(simd_div(self.0, other.0)) }\n            }\n        }\n\n        impl crate::ops::Rem for $id {\n            type Output = Self;\n            #[inline]\n            fn rem(self, other: Self) -> Self {\n                use crate::llvm::simd_rem;\n                unsafe { Simd(simd_rem(self.0, other.0)) }\n            }\n        }\n\n        impl crate::ops::AddAssign for $id {\n            #[inline]\n            fn add_assign(&mut self, other: Self) {\n                *self = *self + other;\n            }\n        }\n\n        impl crate::ops::SubAssign for $id {\n            #[inline]\n            fn sub_assign(&mut self, other: Self) {\n                *self = *self - other;\n            }\n        }\n\n        impl crate::ops::MulAssign for 
$id {\n            #[inline]\n            fn mul_assign(&mut self, other: Self) {\n                *self = *self * other;\n            }\n        }\n\n        impl crate::ops::DivAssign for $id {\n            #[inline]\n            fn div_assign(&mut self, other: Self) {\n                *self = *self / other;\n            }\n        }\n\n        impl crate::ops::RemAssign for $id {\n            #[inline]\n            fn rem_assign(&mut self, other: Self) {\n                *self = *self % other;\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! {\n               pub mod [<$id _ops_vector_arith>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn ops_vector_arithmetic() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let f = $id::splat(4 as $elem_ty);\n\n                        // add\n                        assert_eq!(z + z, z);\n                        assert_eq!(o + z, o);\n                        assert_eq!(t + z, t);\n                        assert_eq!(t + t, f);\n                        // sub\n                        assert_eq!(z - z, z);\n                        assert_eq!(o - z, o);\n                        assert_eq!(t - z, t);\n                        assert_eq!(f - t, t);\n                        assert_eq!(f - o - o, t);\n                        // mul\n                        assert_eq!(z * z, z);\n                        assert_eq!(z * o, z);\n                        assert_eq!(z * t, z);\n                        assert_eq!(o * t, t);\n                        assert_eq!(t * t, f);\n                        // div\n                        assert_eq!(z / o, z);\n                        assert_eq!(t / o, t);\n         
               assert_eq!(f / o, f);\n                        assert_eq!(t / t, o);\n                        assert_eq!(f / t, t);\n                        // rem\n                        assert_eq!(o % o, z);\n                        assert_eq!(f % t, z);\n\n                        {\n                            let mut v = z;\n                            assert_eq!(v, z);\n                            v += o; // add_assign\n                            assert_eq!(v, o);\n                            v -= o; // sub_assign\n                            assert_eq!(v, z);\n                            v = t;\n                            v *= o; // mul_assign\n                            assert_eq!(v, t);\n                            v *= t;\n                            assert_eq!(v, f);\n                            v /= o; // div_assign\n                            assert_eq!(v, f);\n                            v /= t;\n                            assert_eq!(v, t);\n                            v %= t; // rem_assign\n                            assert_eq!(v, z);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_bitwise.rs",
    "content": "//! Vertical (lane-wise) vector-vector bitwise operations.\n\nmacro_rules! impl_ops_vector_bitwise {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $test_tt:tt |\n        ($true:expr, $false:expr)\n    ) => {\n        impl crate::ops::Not for $id {\n            type Output = Self;\n            #[inline]\n            fn not(self) -> Self {\n                Self::splat($true) ^ self\n            }\n        }\n        impl crate::ops::BitXor for $id {\n            type Output = Self;\n            #[inline]\n            fn bitxor(self, other: Self) -> Self {\n                use crate::llvm::simd_xor;\n                unsafe { Simd(simd_xor(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::BitAnd for $id {\n            type Output = Self;\n            #[inline]\n            fn bitand(self, other: Self) -> Self {\n                use crate::llvm::simd_and;\n                unsafe { Simd(simd_and(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::BitOr for $id {\n            type Output = Self;\n            #[inline]\n            fn bitor(self, other: Self) -> Self {\n                use crate::llvm::simd_or;\n                unsafe { Simd(simd_or(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::BitAndAssign for $id {\n            #[inline]\n            fn bitand_assign(&mut self, other: Self) {\n                *self = *self & other;\n            }\n        }\n        impl crate::ops::BitOrAssign for $id {\n            #[inline]\n            fn bitor_assign(&mut self, other: Self) {\n                *self = *self | other;\n            }\n        }\n        impl crate::ops::BitXorAssign for $id {\n            #[inline]\n            fn bitxor_assign(&mut self, other: Self) {\n                *self = *self ^ other;\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _ops_vector_bitwise>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn ops_vector_bitwise() {\n\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let m = $id::splat(!z.extract(0));\n\n                        // Not:\n                        assert_eq!(!z, m);\n                        assert_eq!(!m, z);\n\n                        // BitAnd:\n                        assert_eq!(o & o, o);\n                        assert_eq!(o & z, z);\n                        assert_eq!(z & o, z);\n                        assert_eq!(z & z, z);\n\n                        assert_eq!(t & t, t);\n                        assert_eq!(t & o, z);\n                        assert_eq!(o & t, z);\n\n                        // BitOr:\n                        assert_eq!(o | o, o);\n                        assert_eq!(o | z, o);\n                        assert_eq!(z | o, o);\n                        assert_eq!(z | z, z);\n\n                        assert_eq!(t | t, t);\n                        assert_eq!(z | t, t);\n                        assert_eq!(t | z, t);\n\n                        // BitXOR:\n                        assert_eq!(o ^ o, z);\n                        assert_eq!(z ^ z, z);\n                        assert_eq!(z ^ o, o);\n                        assert_eq!(o ^ z, o);\n\n                        assert_eq!(t ^ t, z);\n                        assert_eq!(t ^ z, t);\n                        assert_eq!(z ^ t, t);\n\n                        {\n                            // AndAssign:\n                            let mut v = o;\n                            v &= t;\n                            assert_eq!(v, z);\n                        }\n                        {\n        
                    // OrAssign:\n                            let mut v = z;\n                            v |= o;\n                            assert_eq!(v, o);\n                        }\n                        {\n                            // XORAssign:\n                            let mut v = z;\n                            v ^= o;\n                            assert_eq!(v, o);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_float_min_max.rs",
    "content": "//! Vertical (lane-wise) vector `min` and `max` for floating-point vectors.\n\nmacro_rules! impl_ops_vector_float_min_max {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Minimum of two vectors.\n            ///\n            /// Returns a new vector containing the minimum value of each of\n            /// the input vector lanes.\n            #[inline]\n            pub fn min(self, x: Self) -> Self {\n                use crate::llvm::simd_fmin;\n                unsafe { Simd(simd_fmin(self.0, x.0)) }\n            }\n\n            /// Maximum of two vectors.\n            ///\n            /// Returns a new vector containing the maximum value of each of\n            /// the input vector lanes.\n            #[inline]\n            pub fn max(self, x: Self) -> Self {\n                use crate::llvm::simd_fmax;\n                unsafe { Simd(simd_fmax(self.0, x.0)) }\n            }\n        }\n        test_if!{\n            $test_tt:\n            paste::item! {\n                #[cfg(not(any(\n                    // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/223\n                    all(target_arch = \"mips\", target_endian = \"big\"),\n                    target_arch = \"mips64\",\n                )))]\n                pub mod [<$id _ops_vector_min_max>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn min_max() {\n                        let n = crate::$elem_ty::NAN;\n                        let o = $id::splat(1. as $elem_ty);\n                        let t = $id::splat(2. 
as $elem_ty);\n\n                        let mut m = o; // [1., 2., 1., 2., ...]\n                        let mut on = o;\n                        for i in 0..$id::lanes() {\n                            if i % 2 == 0 {\n                                m = m.replace(i, 2. as $elem_ty);\n                                on = on.replace(i, n);\n                            }\n                        }\n\n                        assert_eq!(o.min(t), o);\n                        assert_eq!(t.min(o), o);\n                        assert_eq!(m.min(o), o);\n                        assert_eq!(o.min(m), o);\n                        assert_eq!(m.min(t), m);\n                        assert_eq!(t.min(m), m);\n\n                        assert_eq!(o.max(t), t);\n                        assert_eq!(t.max(o), t);\n                        assert_eq!(m.max(o), m);\n                        assert_eq!(o.max(m), m);\n                        assert_eq!(m.max(t), t);\n                        assert_eq!(t.max(m), t);\n\n                        assert_eq!(on.min(o), o);\n                        assert_eq!(o.min(on), o);\n                        assert_eq!(on.max(o), o);\n                        assert_eq!(o.max(on), o);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_int_min_max.rs",
    "content": "//! Vertical (lane-wise) vector `min` and `max` for integer vectors.\n\nmacro_rules! impl_ops_vector_int_min_max {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Minimum of two vectors.\n            ///\n            /// Returns a new vector containing the minimum value of each of\n            /// the input vector lanes.\n            #[inline]\n            pub fn min(self, x: Self) -> Self {\n                self.lt(x).select(self, x)\n            }\n\n            /// Maximum of two vectors.\n            ///\n            /// Returns a new vector containing the maximum value of each of\n            /// the input vector lanes.\n            #[inline]\n            pub fn max(self, x: Self) -> Self {\n                self.gt(x).select(self, x)\n            }\n        }\n        test_if!{$test_tt:\n        paste::item! {\n            pub mod [<$id _ops_vector_min_max>] {\n                use super::*;\n                #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                fn min_max() {\n                    let o = $id::splat(1 as $elem_ty);\n                    let t = $id::splat(2 as $elem_ty);\n\n                    let mut m = o;\n                    for i in 0..$id::lanes() {\n                        if i % 2 == 0 {\n                            m = m.replace(i, 2 as $elem_ty);\n                        }\n                    }\n                    assert_eq!(o.min(t), o);\n                    assert_eq!(t.min(o), o);\n                    assert_eq!(m.min(o), o);\n                    assert_eq!(o.min(m), o);\n                    assert_eq!(m.min(t), m);\n                    assert_eq!(t.min(m), m);\n\n                    assert_eq!(o.max(t), t);\n                    assert_eq!(t.max(o), t);\n                    assert_eq!(m.max(o), m);\n                    assert_eq!(o.max(m), m);\n                    assert_eq!(m.max(t), 
t);\n                    assert_eq!(t.max(m), t);\n                }\n            }\n        }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_mask_bitwise.rs",
    "content": "//! Vertical (lane-wise) vector-vector bitwise operations.\n\nmacro_rules! impl_ops_vector_mask_bitwise {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $test_tt:tt |\n        ($true:expr, $false:expr)\n    ) => {\n        impl crate::ops::Not for $id {\n            type Output = Self;\n            #[inline]\n            fn not(self) -> Self {\n                Self::splat($true) ^ self\n            }\n        }\n        impl crate::ops::BitXor for $id {\n            type Output = Self;\n            #[inline]\n            fn bitxor(self, other: Self) -> Self {\n                use crate::llvm::simd_xor;\n                unsafe { Simd(simd_xor(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::BitAnd for $id {\n            type Output = Self;\n            #[inline]\n            fn bitand(self, other: Self) -> Self {\n                use crate::llvm::simd_and;\n                unsafe { Simd(simd_and(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::BitOr for $id {\n            type Output = Self;\n            #[inline]\n            fn bitor(self, other: Self) -> Self {\n                use crate::llvm::simd_or;\n                unsafe { Simd(simd_or(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::BitAndAssign for $id {\n            #[inline]\n            fn bitand_assign(&mut self, other: Self) {\n                *self = *self & other;\n            }\n        }\n        impl crate::ops::BitOrAssign for $id {\n            #[inline]\n            fn bitor_assign(&mut self, other: Self) {\n                *self = *self | other;\n            }\n        }\n        impl crate::ops::BitXorAssign for $id {\n            #[inline]\n            fn bitxor_assign(&mut self, other: Self) {\n                *self = *self ^ other;\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _ops_vector_mask_bitwise>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn ops_vector_mask_bitwise() {\n                        let t = $id::splat(true);\n                        let f = $id::splat(false);\n                        assert!(t != f);\n                        assert!(!(t == f));\n\n                        // Not:\n                        assert_eq!(!t, f);\n                        assert_eq!(t, !f);\n\n                        // BitAnd:\n                        assert_eq!(t & f, f);\n                        assert_eq!(f & t, f);\n                        assert_eq!(t & t, t);\n                        assert_eq!(f & f, f);\n\n                        // BitOr:\n                        assert_eq!(t | f, t);\n                        assert_eq!(f | t, t);\n                        assert_eq!(t | t, t);\n                        assert_eq!(f | f, f);\n\n                        // BitXOR:\n                        assert_eq!(t ^ f, t);\n                        assert_eq!(f ^ t, t);\n                        assert_eq!(t ^ t, f);\n                        assert_eq!(f ^ f, f);\n\n                        {\n                            // AndAssign:\n                            let mut v = f;\n                            v &= t;\n                            assert_eq!(v, f);\n                        }\n                        {\n                            // OrAssign:\n                            let mut v = f;\n                            v |= t;\n                            assert_eq!(v, t);\n                        }\n                        {\n                            // XORAssign:\n                            let mut v = f;\n                            v ^= t;\n                            assert_eq!(v, t);\n                        }\n                    }\n                }\n            
}\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_neg.rs",
    "content": "//! Vertical (lane-wise) vector `Neg`.\n\nmacro_rules! impl_ops_vector_neg {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::ops::Neg for $id {\n            type Output = Self;\n            #[inline]\n            fn neg(self) -> Self {\n                Self::splat(-1 as $elem_ty) * self\n            }\n        }\n        test_if!{\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _ops_vector_neg>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn neg() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let f = $id::splat(4 as $elem_ty);\n\n                        let nz = $id::splat(-(0 as $elem_ty));\n                        let no = $id::splat(-(1 as $elem_ty));\n                        let nt = $id::splat(-(2 as $elem_ty));\n                        let nf = $id::splat(-(4 as $elem_ty));\n\n                        assert_eq!(-z, nz);\n                        assert_eq!(-o, no);\n                        assert_eq!(-t, nt);\n                        assert_eq!(-f, nf);\n\n                        assert_eq!(z, -nz);\n                        assert_eq!(o, -no);\n                        assert_eq!(t, -nt);\n                        assert_eq!(f, -nf);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_rotates.rs",
    "content": "//! Vertical (lane-wise) vector rotates operations.\n#![allow(unused)]\n\nmacro_rules! impl_ops_vector_rotates {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Shifts the bits of each lane to the left by the specified\n            /// amount in the corresponding lane of `n`, wrapping the\n            /// truncated bits to the end of the resulting integer.\n            ///\n            /// Note: this is neither the same operation as `<<` nor equivalent\n            /// to `slice::rotate_left`.\n            #[inline]\n            pub fn rotate_left(self, n: $id) -> $id {\n                const LANE_WIDTH: $elem_ty =\n                    crate::mem::size_of::<$elem_ty>() as $elem_ty * 8;\n                // Protect against undefined behavior for over-long bit shifts\n                let n = n % LANE_WIDTH;\n                (self << n) | (self >> ((LANE_WIDTH - n) % LANE_WIDTH))\n            }\n\n            /// Shifts the bits of each lane to the right by the specified\n            /// amount in the corresponding lane of `n`, wrapping the\n            /// truncated bits to the beginning of the resulting integer.\n            ///\n            /// Note: this is neither the same operation as `>>` nor equivalent\n            /// to `slice::rotate_right`.\n            #[inline]\n            pub fn rotate_right(self, n: $id) -> $id {\n                const LANE_WIDTH: $elem_ty =\n                    crate::mem::size_of::<$elem_ty>() as $elem_ty * 8;\n                // Protect against undefined behavior for over-long bit shifts\n                let n = n % LANE_WIDTH;\n                (self >> n) | (self << ((LANE_WIDTH - n) % LANE_WIDTH))\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                // FIXME:\n                // https://github.com/rust-lang-nursery/packed_simd/issues/75\n                #[cfg(not(any(\n                    target_arch = \"s390x\",\n                    target_arch = \"sparc64\",\n                )))]\n                pub mod [<$id _ops_vector_rotate>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"aarch64\"))]\n                    //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317\n                    fn rotate_ops() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let f = $id::splat(4 as $elem_ty);\n\n                        let max = $id::splat(\n                            (mem::size_of::<$elem_ty>() * 8 - 1) as $elem_ty);\n\n                        // rotate_right\n                        assert_eq!(z.rotate_right(z), z);\n                        assert_eq!(z.rotate_right(o), z);\n                        assert_eq!(z.rotate_right(t), z);\n\n                        assert_eq!(o.rotate_right(z), o);\n                        assert_eq!(t.rotate_right(z), t);\n                        assert_eq!(f.rotate_right(z), f);\n                        assert_eq!(f.rotate_right(max), f << 1);\n\n                        assert_eq!(o.rotate_right(o), o << max);\n                        assert_eq!(t.rotate_right(o), o);\n                        assert_eq!(t.rotate_right(t), o << max);\n                        assert_eq!(f.rotate_right(o), t);\n                        assert_eq!(f.rotate_right(t), o);\n\n                        // rotate_left\n                        assert_eq!(z.rotate_left(z), z);\n                        assert_eq!(o.rotate_left(z), o);\n                        
assert_eq!(t.rotate_left(z), t);\n                        assert_eq!(f.rotate_left(z), f);\n                        assert_eq!(f.rotate_left(max), t);\n\n                        assert_eq!(o.rotate_left(o), t);\n                        assert_eq!(o.rotate_left(t), f);\n                        assert_eq!(t.rotate_left(o), f);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops/vector_shifts.rs",
    "content": "//! Vertical (lane-wise) vector-vector shifts operations.\n\nmacro_rules! impl_ops_vector_shifts {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl crate::ops::Shl<$id> for $id {\n            type Output = Self;\n            #[inline]\n            fn shl(self, other: Self) -> Self {\n                use crate::llvm::simd_shl;\n                unsafe { Simd(simd_shl(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::Shr<$id> for $id {\n            type Output = Self;\n            #[inline]\n            fn shr(self, other: Self) -> Self {\n                use crate::llvm::simd_shr;\n                unsafe { Simd(simd_shr(self.0, other.0)) }\n            }\n        }\n        impl crate::ops::ShlAssign<$id> for $id {\n            #[inline]\n            fn shl_assign(&mut self, other: Self) {\n                *self = *self << other;\n            }\n        }\n        impl crate::ops::ShrAssign<$id> for $id {\n            #[inline]\n            fn shr_assign(&mut self, other: Self) {\n                *self = *self >> other;\n            }\n        }\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _ops_vector_shifts>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg_attr(any(target_arch = \"s390x\", target_arch = \"sparc64\"),\n                               allow(unreachable_code, unused_variables)\n                    )]\n                    #[cfg(not(target_arch = \"aarch64\"))]\n                    //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317\n                    fn ops_vector_shifts() {\n                        let z = $id::splat(0 as $elem_ty);\n                        let o = $id::splat(1 as $elem_ty);\n                        let t = $id::splat(2 as $elem_ty);\n                        let f = $id::splat(4 as $elem_ty);\n\n                        let max =$id::splat(\n                            (mem::size_of::<$elem_ty>() * 8 - 1) as $elem_ty\n                        );\n\n                        // shr\n                        assert_eq!(z >> z, z);\n                        assert_eq!(z >> o, z);\n                        assert_eq!(z >> t, z);\n                        assert_eq!(z >> t, z);\n\n                        #[cfg(any(target_arch = \"s390x\", target_arch = \"sparc64\"))] {\n                            // FIXME: rust produces bad codegen for shifts:\n                            // https://github.com/rust-lang-nursery/packed_simd/issues/13\n                            return;\n                        }\n\n                        assert_eq!(o >> z, o);\n                        assert_eq!(t >> z, t);\n                        assert_eq!(f >> z, f);\n                        assert_eq!(f >> max, z);\n\n                        assert_eq!(o >> o, z);\n                        assert_eq!(t >> o, o);\n                        assert_eq!(t >> t, z);\n                        assert_eq!(f >> o, t);\n                        assert_eq!(f >> t, o);\n                    
    assert_eq!(f >> max, z);\n\n                        // shl\n                        assert_eq!(z << z, z);\n                        assert_eq!(o << z, o);\n                        assert_eq!(t << z, t);\n                        assert_eq!(f << z, f);\n                        assert_eq!(f << max, z);\n\n                        assert_eq!(o << o, t);\n                        assert_eq!(o << t, f);\n                        assert_eq!(t << o, f);\n\n                        {\n                            // shr_assign\n                            let mut v = o;\n                            v >>= o;\n                            assert_eq!(v, z);\n                        }\n                        {\n                            // shl_assign\n                            let mut v = o;\n                            v <<= o;\n                            assert_eq!(v, t);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ops.rs",
    "content": "//! Implementation of the `ops` traits\n#[macro_use]\nmod vector_mask_bitwise;\n#[macro_use]\nmod scalar_mask_bitwise;\n\n#[macro_use]\nmod vector_arithmetic;\n#[macro_use]\nmod scalar_arithmetic;\n\n#[macro_use]\nmod vector_bitwise;\n#[macro_use]\nmod scalar_bitwise;\n\n#[macro_use]\nmod vector_shifts;\n#[macro_use]\nmod scalar_shifts;\n\n#[macro_use]\nmod vector_rotates;\n\n#[macro_use]\nmod vector_neg;\n\n#[macro_use]\nmod vector_int_min_max;\n\n#[macro_use]\nmod vector_float_min_max;\n"
  },
  {
    "path": "src/api/ptr/gather_scatter.rs",
    "content": "//! Implements masked gather and scatters for vectors of pointers\n\nmacro_rules! impl_ptr_read {\n    ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident\n     | $test_tt:tt) => {\n        impl<T> $id<T>\n        where\n            [T; $elem_count]: sealed::SimdArray,\n        {\n            /// Reads selected vector elements from memory.\n            ///\n            /// Instantiates a new vector by reading the values from `self` for\n            /// those lanes whose `mask` is `true`, and using the elements of\n            /// `value` otherwise.\n            ///\n            /// No memory is accessed for those lanes of `self` whose `mask` is\n            /// `false`.\n            ///\n            /// # Safety\n            ///\n            /// This method is unsafe because it dereferences raw pointers. The\n            /// pointers must be aligned to `mem::align_of::<T>()`.\n            #[inline]\n            pub unsafe fn read<M>(\n                self,\n                mask: Simd<[M; $elem_count]>,\n                value: Simd<[T; $elem_count]>,\n            ) -> Simd<[T; $elem_count]>\n            where\n                M: sealed::Mask,\n                [M; $elem_count]: sealed::SimdArray,\n            {\n                use crate::llvm::simd_gather;\n                Simd(simd_gather(value.0, self.0, mask.0))\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! 
{\n                mod [<$id _read>] {\n                    use super::*;\n                    #[test]\n                    fn read() {\n                        let mut v = [0_i32; $elem_count];\n                        for i in 0..$elem_count {\n                            v[i] = i as i32;\n                        }\n\n                        let mut ptr = $id::<i32>::null();\n\n                        for i in 0..$elem_count {\n                            ptr = ptr.replace(i,\n                                &v[i] as *const i32 as *mut i32\n                            );\n                        }\n\n                        // all mask elements are true:\n                        let mask = $mask_ty::splat(true);\n                        let def = Simd::<[i32; $elem_count]>::splat(42_i32);\n                        let r: Simd<[i32; $elem_count]> = unsafe {\n                            ptr.read(mask, def)\n                        };\n                        assert_eq!(\n                            r,\n                            Simd::<[i32; $elem_count]>::from_slice_unaligned(\n                                &v\n                            )\n                        );\n\n                        let mut mask = mask;\n                        for i in 0..$elem_count {\n                            if i % 2 != 0 {\n                                mask = mask.replace(i, false);\n                            }\n                        }\n\n                        // even mask elements are true, odd ones are false:\n                        let r: Simd<[i32; $elem_count]> = unsafe {\n                            ptr.read(mask, def)\n                        };\n                        let mut e = v;\n                        for i in 0..$elem_count {\n                            if i % 2 != 0 {\n                                e[i] = 42;\n                            }\n                        }\n                        assert_eq!(\n                            r,\n            
                Simd::<[i32; $elem_count]>::from_slice_unaligned(\n                                &e\n                            )\n                        );\n\n                        // all mask elements are false:\n                        let mask = $mask_ty::splat(false);\n                        let def = Simd::<[i32; $elem_count]>::splat(42_i32);\n                        let r: Simd<[i32; $elem_count]> = unsafe {\n                            ptr.read(mask, def) }\n                        ;\n                        assert_eq!(r, def);\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! impl_ptr_write {\n    ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident\n     | $test_tt:tt) => {\n        impl<T> $id<T>\n        where\n            [T; $elem_count]: sealed::SimdArray,\n        {\n            /// Writes selected vector elements to memory.\n            ///\n            /// Writes the lanes of `values` for which the mask is `true` to\n            /// their corresponding memory addresses in `self`.\n            ///\n            /// No memory is accessed for those lanes of `self` whose `mask` is\n            /// `false`.\n            ///\n            /// Overlapping memory addresses of `self` are written to in order\n            /// from the least-significant to the most-significant element.\n            ///\n            /// # Safety\n            ///\n            /// This method is unsafe because it dereferences raw pointers. The\n            /// pointers must be aligned to `mem::align_of::<T>()`.\n            #[inline]\n            pub unsafe fn write<M>(self, mask: Simd<[M; $elem_count]>, value: Simd<[T; $elem_count]>)\n            where\n                M: sealed::Mask,\n                [M; $elem_count]: sealed::SimdArray,\n            {\n                use crate::llvm::simd_scatter;\n                simd_scatter(value.0, self.0, mask.0)\n            }\n        }\n\n        test_if! 
{\n            $test_tt:\n            paste::item! {\n                mod [<$id _write>] {\n                    use super::*;\n                    #[test]\n                    fn write() {\n                        // forty_two = [42, 42, 42, ...]\n                        let forty_two\n                            = Simd::<[i32; $elem_count]>::splat(42_i32);\n\n                        // This test will write to this array\n                        let mut arr = [0_i32; $elem_count];\n                        for i in 0..$elem_count {\n                            arr[i] = i as i32;\n                        }\n                        // arr = [0, 1, 2, ...]\n\n                        let mut ptr = $id::<i32>::null();\n                        for i in 0..$elem_count {\n                            ptr = ptr.replace(i, unsafe {\n                                arr.as_ptr().add(i) as *mut i32\n                            });\n                        }\n                        // ptr = [&arr[0], &arr[1], ...]\n\n                        // write `forty_two` to all elements of `v`\n                        {\n                            let backup = arr;\n                            unsafe {\n                                ptr.write($mask_ty::splat(true), forty_two)\n                            };\n                            assert_eq!(arr, [42_i32; $elem_count]);\n                            arr = backup;  // arr = [0, 1, 2, ...]\n                        }\n\n                        // write 42 to even elements of arr:\n                        {\n                            // set odd elements of the mask to false\n                            let mut mask = $mask_ty::splat(true);\n                            for i in 0..$elem_count {\n                                if i % 2 != 0 {\n                                    mask = mask.replace(i, false);\n                                }\n                            }\n                            // mask = [true, false, true, 
false, ...]\n\n                            // expected result r = [42, 1, 42, 3, 42, 5, ...]\n                            let mut r = arr;\n                            for i in 0..$elem_count {\n                                if i % 2 == 0 {\n                                    r[i] = 42;\n                                }\n                            }\n\n                            let backup = arr;\n                            unsafe { ptr.write(mask, forty_two) };\n                            assert_eq!(arr, r);\n                            arr = backup;  // arr = [0, 1, 2, 3, ...]\n                        }\n\n                        // write 42 to no elements of arr\n                        {\n                            let backup = arr;\n                            unsafe {\n                                ptr.write($mask_ty::splat(false), forty_two)\n                            };\n                            assert_eq!(arr, backup);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/ptr.rs",
    "content": "//! Vector of pointers\n\n#[macro_use]\nmod gather_scatter;\n"
  },
  {
    "path": "src/api/reductions/bitwise.rs",
    "content": "//! Implements portable horizontal bitwise vector reductions.\n#![allow(unused)]\n\nmacro_rules! impl_reduction_bitwise {\n    (\n        [$elem_ty:ident; $elem_count:expr]:\n        $id:ident | $ielem_ty:ident | $test_tt:tt |\n        ($convert:expr) |\n        ($true:expr, $false:expr)\n    ) => {\n        impl $id {\n            /// Lane-wise bitwise `and` of the vector elements.\n            ///\n            /// Note: if the vector has one lane, the first element of the\n            /// vector is returned.\n            #[inline]\n            pub fn and(self) -> $elem_ty {\n                #[cfg(not(target_arch = \"aarch64\"))]\n                {\n                    use crate::llvm::simd_reduce_and;\n                    let r: $ielem_ty = unsafe { simd_reduce_and(self.0) };\n                    $convert(r)\n                }\n                #[cfg(target_arch = \"aarch64\")]\n                {\n                    // FIXME: broken on aarch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    let mut x = self.extract(0) as $elem_ty;\n                    for i in 1..$id::lanes() {\n                        x &= self.extract(i) as $elem_ty;\n                    }\n                    x\n                }\n            }\n\n            /// Lane-wise bitwise `or` of the vector elements.\n            ///\n            /// Note: if the vector has one lane, the first element of the\n            /// vector is returned.\n            #[inline]\n            pub fn or(self) -> $elem_ty {\n                #[cfg(not(target_arch = \"aarch64\"))]\n                {\n                    use crate::llvm::simd_reduce_or;\n                    let r: $ielem_ty = unsafe { simd_reduce_or(self.0) };\n                    $convert(r)\n                }\n                #[cfg(target_arch = \"aarch64\")]\n                {\n                    // FIXME: broken on aarch64\n                    // 
https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    let mut x = self.extract(0) as $elem_ty;\n                    for i in 1..$id::lanes() {\n                        x |= self.extract(i) as $elem_ty;\n                    }\n                    x\n                }\n            }\n\n            /// Lane-wise bitwise `xor` of the vector elements.\n            ///\n            /// Note: if the vector has one lane, the first element of the\n            /// vector is returned.\n            #[inline]\n            pub fn xor(self) -> $elem_ty {\n                #[cfg(not(target_arch = \"aarch64\"))]\n                {\n                    use crate::llvm::simd_reduce_xor;\n                    let r: $ielem_ty = unsafe { simd_reduce_xor(self.0) };\n                    $convert(r)\n                }\n                #[cfg(target_arch = \"aarch64\")]\n                {\n                    // FIXME: broken on aarch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    let mut x = self.extract(0) as $elem_ty;\n                    for i in 1..$id::lanes() {\n                        x ^= self.extract(i) as $elem_ty;\n                    }\n                    x\n                }\n            }\n        }\n\n        test_if!{\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _reduction_bitwise>] {\n                    use super::*;\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn and() {\n                        let v = $id::splat($false);\n                        assert_eq!(v.and(), $false);\n                        let v = $id::splat($true);\n                        assert_eq!(v.and(), $true);\n                        let v = $id::splat($false);\n                        let v = v.replace(0, $true);\n                        if $id::lanes() > 1 {\n                            assert_eq!(v.and(), $false);\n                        } else {\n                            assert_eq!(v.and(), $true);\n                        }\n                        let v = $id::splat($true);\n                        let v = v.replace(0, $false);\n                        assert_eq!(v.and(), $false);\n\n                    }\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn or() {\n                        let v = $id::splat($false);\n                        assert_eq!(v.or(), $false);\n                        let v = $id::splat($true);\n                        assert_eq!(v.or(), $true);\n                        let v = $id::splat($false);\n                        let v = v.replace(0, $true);\n                        assert_eq!(v.or(), $true);\n                        let v = $id::splat($true);\n                        let v = v.replace(0, $false);\n                        if $id::lanes() > 1 {\n                            assert_eq!(v.or(), $true);\n                        } else {\n                            assert_eq!(v.or(), $false);\n                        }\n                    }\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n              
      fn xor() {\n                        let v = $id::splat($false);\n                        assert_eq!(v.xor(), $false);\n                        let v = $id::splat($true);\n                        if $id::lanes() > 1 {\n                            assert_eq!(v.xor(), $false);\n                        } else {\n                            assert_eq!(v.xor(), $true);\n                        }\n                        let v = $id::splat($false);\n                        let v = v.replace(0, $true);\n                        assert_eq!(v.xor(), $true);\n                        let v = $id::splat($true);\n                        let v = v.replace(0, $false);\n                        if $id::lanes() > 1 {\n                            assert_eq!(v.xor(), $true);\n                        } else {\n                            assert_eq!(v.xor(), $false);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/reductions/float_arithmetic.rs",
    "content": "//! Implements portable horizontal float vector arithmetic reductions.\n\nmacro_rules! impl_reduction_float_arithmetic {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Horizontal sum of the vector elements.\n            ///\n            /// The intrinsic performs a tree-reduction of the vector elements.\n            /// That is, for an 8 element vector:\n            ///\n            /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))\n            ///\n            /// If one of the vector element is `NaN` the reduction returns\n            /// `NaN`. The resulting `NaN` is not required to be equal to any\n            /// of the `NaN`s in the vector.\n            #[inline]\n            pub fn sum(self) -> $elem_ty {\n                #[cfg(not(target_arch = \"aarch64\"))]\n                {\n                    use crate::llvm::simd_reduce_add_ordered;\n                    unsafe { simd_reduce_add_ordered(self.0, 0 as $elem_ty) }\n                }\n                #[cfg(target_arch = \"aarch64\")]\n                {\n                    // FIXME: broken on AArch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    let mut x = self.extract(0) as $elem_ty;\n                    for i in 1..$id::lanes() {\n                        x += self.extract(i) as $elem_ty;\n                    }\n                    x\n                }\n            }\n\n            /// Horizontal product of the vector elements.\n            ///\n            /// The intrinsic performs a tree-reduction of the vector elements.\n            /// That is, for an 8 element vector:\n            ///\n            /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))\n            ///\n            /// If one of the vector element is `NaN` the reduction returns\n            /// `NaN`. 
The resulting `NaN` is not required to be equal to any\n            /// of the `NaN`s in the vector.\n            #[inline]\n            pub fn product(self) -> $elem_ty {\n                #[cfg(not(target_arch = \"aarch64\"))]\n                {\n                    use crate::llvm::simd_reduce_mul_ordered;\n                    unsafe { simd_reduce_mul_ordered(self.0, 1 as $elem_ty) }\n                }\n                #[cfg(target_arch = \"aarch64\")]\n                {\n                    // FIXME: broken on AArch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    let mut x = self.extract(0) as $elem_ty;\n                    for i in 1..$id::lanes() {\n                        x *= self.extract(i) as $elem_ty;\n                    }\n                    x\n                }\n            }\n        }\n\n        impl crate::iter::Sum for $id {\n            #[inline]\n            fn sum<I: Iterator<Item = $id>>(iter: I) -> $id {\n                iter.fold($id::splat(0.), crate::ops::Add::add)\n            }\n        }\n\n        impl crate::iter::Product for $id {\n            #[inline]\n            fn product<I: Iterator<Item = $id>>(iter: I) -> $id {\n                iter.fold($id::splat(1.), crate::ops::Mul::mul)\n            }\n        }\n\n        impl<'a> crate::iter::Sum<&'a $id> for $id {\n            #[inline]\n            fn sum<I: Iterator<Item = &'a $id>>(iter: I) -> $id {\n                iter.fold($id::splat(0.), |a, b| crate::ops::Add::add(a, *b))\n            }\n        }\n\n        impl<'a> crate::iter::Product<&'a $id> for $id {\n            #[inline]\n            fn product<I: Iterator<Item = &'a $id>>(iter: I) -> $id {\n                iter.fold($id::splat(1.), |a, b| crate::ops::Mul::mul(a, *b))\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! 
{\n                // Comparisons use integer casts within mantissa^1 range.\n                #[allow(clippy::float_cmp)]\n                pub mod [<$id _reduction_float_arith>] {\n                    use super::*;\n                    fn alternating(x: usize) -> $id {\n                        let mut v = $id::splat(1 as $elem_ty);\n                        for i in 0..$id::lanes() {\n                            if i % x == 0 {\n                                v = v.replace(i, 2 as $elem_ty);\n                            }\n                        }\n                        v\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn sum() {\n                        let v = $id::splat(0 as $elem_ty);\n                        assert_eq!(v.sum(), 0 as $elem_ty);\n                        let v = $id::splat(1 as $elem_ty);\n                        assert_eq!(v.sum(), $id::lanes() as $elem_ty);\n                        let v = alternating(2);\n                        assert_eq!(\n                            v.sum(),\n                            ($id::lanes() / 2 + $id::lanes()) as $elem_ty\n                        );\n                    }\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn product() {\n                        let v = $id::splat(0 as $elem_ty);\n                        assert_eq!(v.product(), 0 as $elem_ty);\n                        let v = $id::splat(1 as $elem_ty);\n                        assert_eq!(v.product(), 1 as $elem_ty);\n                        let f = match $id::lanes() {\n                            64 => 16,\n                            32 => 8,\n                            16 => 4,\n                            _ => 2,\n                        };\n                        let v = 
alternating(f);\n                        assert_eq!(\n                            v.product(),\n                            (2_usize.pow(($id::lanes() / f) as u32)\n                             as $elem_ty)\n                        );\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[allow(unreachable_code)]\n                    fn sum_nan() {\n                        // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732\n                        // https://github.com/rust-lang-nursery/packed_simd/issues/6\n                        return;\n\n                        let n0 = crate::$elem_ty::NAN;\n                        let v0 = $id::splat(-3.0);\n                        for i in 0..$id::lanes() {\n                            let mut v = v0.replace(i, n0);\n                            // If the vector contains a NaN the result is NaN:\n                            assert!(\n                                v.sum().is_nan(),\n                                \"nan at {} => {} | {:?}\",\n                                i,\n                                v.sum(),\n                                v\n                            );\n                            for j in 0..i {\n                                v = v.replace(j, n0);\n                                assert!(v.sum().is_nan());\n                            }\n                        }\n                        let v = $id::splat(n0);\n                        assert!(v.sum().is_nan(), \"all nans | {:?}\", v);\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[allow(unreachable_code)]\n                    fn product_nan() {\n                        // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732\n                        // 
https://github.com/rust-lang-nursery/packed_simd/issues/6\n                        return;\n\n                        let n0 = crate::$elem_ty::NAN;\n                        let v0 = $id::splat(-3.0);\n                        for i in 0..$id::lanes() {\n                            let mut v = v0.replace(i, n0);\n                            // If the vector contains a NaN the result is NaN:\n                            assert!(\n                                v.product().is_nan(),\n                                \"nan at {} => {} | {:?}\",\n                                i,\n                                v.product(),\n                                v\n                            );\n                            for j in 0..i {\n                                v = v.replace(j, n0);\n                                assert!(v.product().is_nan());\n                            }\n                        }\n                        let v = $id::splat(n0);\n                        assert!(v.product().is_nan(), \"all nans | {:?}\", v);\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[allow(unused, dead_code)]\n                    fn sum_roundoff() {\n                        // Performs a tree-reduction\n                        fn tree_reduce_sum(a: &[$elem_ty]) -> $elem_ty {\n                            assert!(!a.is_empty());\n                            if a.len() == 1 {\n                                a[0]\n                            } else if a.len() == 2 {\n                                a[0] + a[1]\n                            } else {\n                                let mid = a.len() / 2;\n                                let (left, right) = a.split_at(mid);\n                                tree_reduce_sum(left) + tree_reduce_sum(right)\n                            }\n                        }\n\n                    
    let mut start = crate::$elem_ty::EPSILON;\n                        let mut scalar_reduction = 0. as $elem_ty;\n\n                        let mut v = $id::splat(0. as $elem_ty);\n                        for i in 0..$id::lanes() {\n                            let c = if i % 2 == 0 { 1e3 } else { -1. };\n                            start *= ::core::$elem_ty::consts::PI * c;\n                            scalar_reduction += start;\n                            v = v.replace(i, start);\n                        }\n                        let simd_reduction = v.sum();\n\n                        let mut a = [0. as $elem_ty; $id::lanes()];\n                        v.write_to_slice_unaligned(&mut a);\n                        let tree_reduction = tree_reduce_sum(&a);\n\n                        // tolerate 1 ULP difference:\n                        let red_bits = simd_reduction.to_bits();\n                        let tree_bits = tree_reduction.to_bits();\n                        assert!(\n                            if red_bits > tree_bits {\n                                red_bits - tree_bits\n                            } else {\n                                tree_bits - red_bits\n                            } < 2,\n                            \"vector: {:?} | simd_reduction: {:?} | \\\ntree_reduction: {} | scalar_reduction: {}\",\n                            v,\n                            simd_reduction,\n                            tree_reduction,\n                            scalar_reduction\n                        );\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[allow(unused, dead_code)]\n                    fn product_roundoff() {\n                        use ::core::convert::TryInto;\n                        // Performs a tree-reduction\n                        fn tree_reduce_product(a: &[$elem_ty]) -> $elem_ty {\n 
                           assert!(!a.is_empty());\n                            if a.len() == 1 {\n                                a[0]\n                            } else if a.len() == 2 {\n                                a[0] * a[1]\n                            } else {\n                                let mid = a.len() / 2;\n                                let (left, right) = a.split_at(mid);\n                                tree_reduce_product(left)\n                                    * tree_reduce_product(right)\n                            }\n                        }\n\n                        let mut start = crate::$elem_ty::EPSILON;\n                        let mut scalar_reduction = 1. as $elem_ty;\n\n                        let mut v = $id::splat(0. as $elem_ty);\n                        for i in 0..$id::lanes() {\n                            let c = if i % 2 == 0 { 1e3 } else { -1. };\n                            start *= ::core::$elem_ty::consts::PI * c;\n                            scalar_reduction *= start;\n                            v = v.replace(i, start);\n                        }\n                        let simd_reduction = v.product();\n\n                        let mut a = [0. 
as $elem_ty; $id::lanes()];\n                        v.write_to_slice_unaligned(&mut a);\n                        let tree_reduction = tree_reduce_product(&a);\n\n                        // FIXME: Too imprecise, even only for product(f32x8).\n                        // Figure out how to narrow this down.\n                        let ulp_limit = $id::lanes() / 2;\n                        let red_bits = simd_reduction.to_bits();\n                        let tree_bits = tree_reduction.to_bits();\n                        assert!(\n                            if red_bits > tree_bits {\n                                red_bits - tree_bits\n                            } else {\n                                tree_bits - red_bits\n                            } < ulp_limit.try_into().unwrap(),\n                            \"vector: {:?} | simd_reduction: {:?} | \\\ntree_reduction: {} | scalar_reduction: {}\",\n                            v,\n                            simd_reduction,\n                            tree_reduction,\n                            scalar_reduction\n                        );\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/reductions/integer_arithmetic.rs",
    "content": "//! Implements portable horizontal integer vector arithmetic reductions.\n\nmacro_rules! impl_reduction_integer_arithmetic {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident\n     | $test_tt:tt) => {\n        impl $id {\n            /// Horizontal wrapping sum of the vector elements.\n            ///\n            /// The intrinsic performs a tree-reduction of the vector elements.\n            /// That is, for an 8 element vector:\n            ///\n            /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))\n            ///\n            /// If an operation overflows it returns the mathematical result\n            /// modulo `2^n` where `n` is the number of times it overflows.\n            #[inline]\n            pub fn wrapping_sum(self) -> $elem_ty {\n                #[cfg(not(target_arch = \"aarch64\"))]\n                {\n                    use crate::llvm::simd_reduce_add_ordered;\n                    let v: $ielem_ty = unsafe { simd_reduce_add_ordered(self.0, 0 as $ielem_ty) };\n                    v as $elem_ty\n                }\n                #[cfg(target_arch = \"aarch64\")]\n                {\n                    // FIXME: broken on AArch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    let mut x = self.extract(0) as $elem_ty;\n                    for i in 1..$id::lanes() {\n                        x = x.wrapping_add(self.extract(i) as $elem_ty);\n                    }\n                    x\n                }\n            }\n\n            /// Horizontal wrapping product of the vector elements.\n            ///\n            /// The intrinsic performs a tree-reduction of the vector elements.\n            /// That is, for an 8 element vector:\n            ///\n            /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))\n            ///\n            /// If an operation overflows it returns the mathematical result\n            /// modulo `2^n` where 
`n` is the number of times it overflows.\n            #[inline]\n            pub fn wrapping_product(self) -> $elem_ty {\n                #[cfg(not(target_arch = \"aarch64\"))]\n                {\n                    use crate::llvm::simd_reduce_mul_ordered;\n                    let v: $ielem_ty = unsafe { simd_reduce_mul_ordered(self.0, 1 as $ielem_ty) };\n                    v as $elem_ty\n                }\n                #[cfg(target_arch = \"aarch64\")]\n                {\n                    // FIXME: broken on AArch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    let mut x = self.extract(0) as $elem_ty;\n                    for i in 1..$id::lanes() {\n                        x = x.wrapping_mul(self.extract(i) as $elem_ty);\n                    }\n                    x\n                }\n            }\n        }\n\n        impl crate::iter::Sum for $id {\n            #[inline]\n            fn sum<I: Iterator<Item = $id>>(iter: I) -> $id {\n                iter.fold($id::splat(0), crate::ops::Add::add)\n            }\n        }\n\n        impl crate::iter::Product for $id {\n            #[inline]\n            fn product<I: Iterator<Item = $id>>(iter: I) -> $id {\n                iter.fold($id::splat(1), crate::ops::Mul::mul)\n            }\n        }\n\n        impl<'a> crate::iter::Sum<&'a $id> for $id {\n            #[inline]\n            fn sum<I: Iterator<Item = &'a $id>>(iter: I) -> $id {\n                iter.fold($id::splat(0), |a, b| crate::ops::Add::add(a, *b))\n            }\n        }\n\n        impl<'a> crate::iter::Product<&'a $id> for $id {\n            #[inline]\n            fn product<I: Iterator<Item = &'a $id>>(iter: I) -> $id {\n                iter.fold($id::splat(1), |a, b| crate::ops::Mul::mul(a, *b))\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$id _reduction_int_arith>] {\n                    use super::*;\n\n                    fn alternating(x: usize) -> $id {\n                        let mut v = $id::splat(1 as $elem_ty);\n                        for i in 0..$id::lanes() {\n                            if i % x == 0 {\n                                v = v.replace(i, 2 as $elem_ty);\n                            }\n                        }\n                        v\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn wrapping_sum() {\n                        let v = $id::splat(0 as $elem_ty);\n                        assert_eq!(v.wrapping_sum(), 0 as $elem_ty);\n                        let v = $id::splat(1 as $elem_ty);\n                        assert_eq!(v.wrapping_sum(), $id::lanes() as $elem_ty);\n                        let v = alternating(2);\n                        if $id::lanes() > 1 {\n                            assert_eq!(\n                                v.wrapping_sum(),\n                                ($id::lanes() / 2 + $id::lanes()) as $elem_ty\n                            );\n                        } else {\n                            assert_eq!(\n                                v.wrapping_sum(),\n                                2 as $elem_ty\n                            );\n                        }\n                    }\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn wrapping_sum_overflow() {\n                        let start = $elem_ty::max_value()\n                            - ($id::lanes() as $elem_ty / 2);\n\n                        let v = $id::splat(start as $elem_ty);\n                        let vwrapping_sum = v.wrapping_sum();\n\n                        let mut wrapping_sum = 
start;\n                        for _ in 1..$id::lanes() {\n                            wrapping_sum = wrapping_sum.wrapping_add(start);\n                        }\n                        assert_eq!(wrapping_sum, vwrapping_sum, \"v = {:?}\", v);\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn wrapping_product() {\n                        let v = $id::splat(0 as $elem_ty);\n                        assert_eq!(v.wrapping_product(), 0 as $elem_ty);\n                        let v = $id::splat(1 as $elem_ty);\n                        assert_eq!(v.wrapping_product(), 1 as $elem_ty);\n                        let f = match $id::lanes() {\n                            64 => 16,\n                            32 => 8,\n                            16 => 4,\n                            _ => 2,\n                        };\n                        let v = alternating(f);\n                        if $id::lanes() > 1 {\n                            assert_eq!(\n                                v.wrapping_product(),\n                                (2_usize.pow(($id::lanes() / f) as u32)\n                                 as $elem_ty)\n                            );\n                        } else {\n                            assert_eq!(\n                                v.wrapping_product(),\n                                2 as $elem_ty\n                            );\n                        }\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn wrapping_product_overflow() {\n                        let start = $elem_ty::max_value()\n                            - ($id::lanes() as $elem_ty / 2);\n\n                        let v = $id::splat(start as $elem_ty);\n                        let vmul = 
v.wrapping_product();\n\n                        let mut mul = start;\n                        for _ in 1..$id::lanes() {\n                            mul = mul.wrapping_mul(start);\n                        }\n                        assert_eq!(mul, vmul, \"v = {:?}\", v);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/reductions/mask.rs",
    "content": "//! Implements portable horizontal mask reductions.\n\nmacro_rules! impl_reduction_mask {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Are `all` vector lanes `true`?\n            #[inline]\n            pub fn all(self) -> bool {\n                unsafe { crate::codegen::reductions::mask::All::all(self) }\n            }\n            /// Is `any` vector lane `true`?\n            #[inline]\n            pub fn any(self) -> bool {\n                unsafe { crate::codegen::reductions::mask::Any::any(self) }\n            }\n            /// Are `all` vector lanes `false`?\n            #[inline]\n            pub fn none(self) -> bool {\n                !self.any()\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _reduction>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn all() {\n                        let a = $id::splat(true);\n                        assert!(a.all());\n                        let a = $id::splat(false);\n                        assert!(!a.all());\n\n                        if $id::lanes() > 1 {\n                            for i in 0..$id::lanes() {\n                                let mut a = $id::splat(true);\n                                a = a.replace(i, false);\n                                assert!(!a.all());\n                                let mut a = $id::splat(false);\n                                a = a.replace(i, true);\n                                assert!(!a.all());\n                            }\n                        }\n                    }\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn 
any() {\n                        let a = $id::splat(true);\n                        assert!(a.any());\n                        let a = $id::splat(false);\n                        assert!(!a.any());\n\n                        if $id::lanes() > 1 {\n                            for i in 0..$id::lanes() {\n                                let mut a = $id::splat(true);\n                                a = a.replace(i, false);\n                                assert!(a.any());\n                                let mut a = $id::splat(false);\n                                a = a.replace(i, true);\n                                assert!(a.any());\n                            }\n                        }\n                    }\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn none() {\n                        let a = $id::splat(true);\n                        assert!(!a.none());\n                        let a = $id::splat(false);\n                        assert!(a.none());\n\n                        if $id::lanes() > 1 {\n                            for i in 0..$id::lanes() {\n                                let mut a = $id::splat(true);\n                                a = a.replace(i, false);\n                                assert!(!a.none());\n                                let mut a = $id::splat(false);\n                                a = a.replace(i, true);\n                                assert!(!a.none());\n                            }\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/reductions/min_max.rs",
    "content": "//! Implements portable horizontal vector min/max reductions.\n\nmacro_rules! impl_reduction_min_max {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident\n     | $ielem_ty:ident | $test_tt:tt) => {\n        impl $id {\n            /// Largest vector element value.\n            #[inline]\n            pub fn max_element(self) -> $elem_ty {\n                #[cfg(not(any(\n                    target_arch = \"aarch64\",\n                    target_arch = \"arm\",\n                    target_arch = \"powerpc64\",\n                    target_arch = \"wasm32\",\n                )))]\n                {\n                    use crate::llvm::simd_reduce_max;\n                    let v: $ielem_ty = unsafe { simd_reduce_max(self.0) };\n                    v as $elem_ty\n                }\n                #[cfg(any(\n                    target_arch = \"aarch64\",\n                    target_arch = \"arm\",\n                    target_arch = \"powerpc64\",\n                    target_arch = \"wasm32\",\n                ))]\n                {\n                    // FIXME: broken on AArch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    // FIXME: broken on WASM32\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/91\n                    let mut x = self.extract(0);\n                    for i in 1..$id::lanes() {\n                        x = x.max(self.extract(i));\n                    }\n                    x\n                }\n            }\n\n            /// Smallest vector element value.\n            #[inline]\n            pub fn min_element(self) -> $elem_ty {\n                #[cfg(not(any(\n                    target_arch = \"aarch64\",\n                    target_arch = \"arm\",\n                    all(target_arch = \"x86\", not(target_feature = \"sse2\")),\n                    target_arch = \"powerpc64\",\n                    target_arch = \"wasm32\",\n                
),))]\n                {\n                    use crate::llvm::simd_reduce_min;\n                    let v: $ielem_ty = unsafe { simd_reduce_min(self.0) };\n                    v as $elem_ty\n                }\n                #[cfg(any(\n                    target_arch = \"aarch64\",\n                    target_arch = \"arm\",\n                    all(target_arch = \"x86\", not(target_feature = \"sse2\")),\n                    target_arch = \"powerpc64\",\n                    target_arch = \"wasm32\",\n                ))]\n                {\n                    // FIXME: broken on AArch64\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/15\n                    // FIXME: broken on i586-unknown-linux-gnu\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/22\n                    // FIXME: broken on WASM32\n                    // https://github.com/rust-lang-nursery/packed_simd/issues/91\n                    let mut x = self.extract(0);\n                    for i in 1..$id::lanes() {\n                        x = x.min(self.extract(i));\n                    }\n                    x\n                }\n            }\n        }\n        test_if! {$test_tt:\n        paste::item! 
{\n            // Comparisons use integer casts within mantissa^1 range.\n            #[allow(clippy::float_cmp)]\n            pub mod [<$id _reduction_min_max>] {\n                use super::*;\n                #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                pub fn max_element() {\n                    let v = $id::splat(0 as $elem_ty);\n                    assert_eq!(v.max_element(), 0 as $elem_ty);\n                    if $id::lanes() > 1 {\n                        let v = v.replace(1, 1 as $elem_ty);\n                        assert_eq!(v.max_element(), 1 as $elem_ty);\n                    }\n                    let v = v.replace(0, 2 as $elem_ty);\n                    assert_eq!(v.max_element(), 2 as $elem_ty);\n                }\n\n                #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                pub fn min_element() {\n                    let v = $id::splat(0 as $elem_ty);\n                    assert_eq!(v.min_element(), 0 as $elem_ty);\n                    if $id::lanes() > 1 {\n                        let v = v.replace(1, 1 as $elem_ty);\n                        assert_eq!(v.min_element(), 0 as $elem_ty);\n                    }\n                    let v = $id::splat(1 as $elem_ty);\n                    let v = v.replace(0, 2 as $elem_ty);\n                    if $id::lanes() > 1 {\n                        assert_eq!(v.min_element(), 1 as $elem_ty);\n                    } else {\n                        assert_eq!(v.min_element(), 2 as $elem_ty);\n                    }\n                    if $id::lanes() > 1 {\n                        let v = $id::splat(2 as $elem_ty);\n                        let v = v.replace(1, 1 as $elem_ty);\n                        assert_eq!(v.min_element(), 1 as $elem_ty);\n                    }\n                }\n            }\n        }\n        }\n   
 };\n}\n\nmacro_rules! test_reduction_float_min_max {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        test_if! {\n            $test_tt:\n            paste::item! {\n                // Comparisons use integer casts within mantissa^1 range.\n                #[allow(clippy::float_cmp)]\n                pub mod [<$id _reduction_min_max_nan>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn min_element_test() {\n                        let n = crate::$elem_ty::NAN;\n\n                        assert_eq!(n.min(-3.), -3.);\n                        assert_eq!((-3. as $elem_ty).min(n), -3.);\n\n                        let v0 = $id::splat(-3.);\n\n                        let target_with_broken_last_lane_nan = !cfg!(any(\n                            target_arch = \"arm\", target_arch = \"aarch64\",\n                            all(target_arch = \"x86\",\n                                not(target_feature = \"sse2\")\n                            ),\n                            target_arch = \"powerpc64\",\n                            target_arch = \"wasm32\",\n                        ));\n\n                        // The vector is initialized to `-3.`s: [-3, -3, -3, -3]\n                        for i in 0..$id::lanes() {\n                            // We replace the i-th element of the vector with\n                            // `NaN`: [-3, -3, -3, NaN]\n                            let mut v = v0.replace(i, n);\n\n                            // If the NaN is in the last place, the LLVM\n                            // implementation of these methods is broken on some\n                            // targets:\n                            if i == $id::lanes() - 1 &&\n                                target_with_broken_last_lane_nan {\n                                    
assert_eq!(v.min_element(), -3.,\n                                            \"[A]: nan at {} => {} | {:?}\",\n                                            i, v.min_element(), v);\n\n                                // If we replace all the elements in the vector\n                                // up-to the `i-th` lane with `NaN`s, the result\n                                // is still always `-3.` unless all elements of\n                                // the vector are `NaN`s:\n                                for j in 0..i {\n                                    v = v.replace(j, n);\n                                    if j == i-1 {\n                                        assert!(v.min_element().is_nan(),\n                                            \"[B]: nan at {} => {} | {:?}\",\n                                            i, v.min_element(), v);\n                                    } else {\n                                        assert_eq!(v.min_element(), -3.,\n                                            \"[B]: nan at {} => {} | {:?}\",\n                                            i, v.min_element(), v);\n                                    }\n                                }\n\n                                // We are done here, since we were in the last\n                                // lane which is the last iteration of the loop.\n                                break\n                            }\n\n                            // We are not in the last lane, and there is only\n                            // one `NaN` in the vector.\n\n                            // If the vector has one lane, the result is `NaN`:\n                            if $id::lanes() == 1 {\n                                assert!(v.min_element().is_nan(),\n                                        \"[C]: all nans | v={:?} | min={} | \\\nis_nan: {}\",\n                                        v, v.min_element(),\n                                        
v.min_element().is_nan()\n                                );\n\n                                // And we are done, since the vector only has\n                                // one lane anyways.\n                                break;\n                            }\n\n                            // The vector has more than one lane, since there is\n                            // only one `NaN` in the vector, the result is\n                            // always `-3`.\n                            assert_eq!(v.min_element(), -3.,\n                                       \"[D]: nan at {} => {} | {:?}\",\n                                       i, v.min_element(), v);\n\n                            // If we replace all the elements in the vector\n                            // up-to the `i-th` lane with `NaN`s, the result is\n                            // still always `-3.` unless all elements of the\n                            // vector are `NaN`s:\n                            for j in 0..i {\n                                v = v.replace(j, n);\n\n                                if i == $id::lanes() - 1 && j == i - 1 {\n                                    // All elements of the vector are `NaN`s,\n                                    // therefore the result is NaN as well.\n                                    //\n                                    // Note: the #lanes of the vector is > 1, so\n                                    // \"i - 1\" does not overflow.\n                                    assert!(v.min_element().is_nan(),\n                                            \"[E]: all nans | v={:?} | min={} | \\\nis_nan: {}\",\n                                            v, v.min_element(),\n                                            v.min_element().is_nan());\n                                } else {\n                                    // There are non-`NaN` elements in the\n                                    // vector, therefore the result is `-3.`:\n             
                       assert_eq!(v.min_element(), -3.,\n                                               \"[F]: nan at {} => {} | {:?}\",\n                                               i, v.min_element(), v);\n                                }\n                            }\n                        }\n\n                        // If the vector contains all NaNs the result is NaN:\n                        assert!($id::splat(n).min_element().is_nan(),\n                                \"all nans | v={:?} | min={} | is_nan: {}\",\n                                $id::splat(n), $id::splat(n).min_element(),\n                                $id::splat(n).min_element().is_nan());\n                    }\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn max_element_test() {\n                        let n = crate::$elem_ty::NAN;\n\n                        assert_eq!(n.max(-3.), -3.);\n                        assert_eq!((-3. 
as $elem_ty).max(n), -3.);\n\n                        let v0 = $id::splat(-3.);\n\n                        let target_with_broken_last_lane_nan = !cfg!(any(\n                            target_arch = \"arm\", target_arch = \"aarch64\",\n                            target_arch = \"powerpc64\", target_arch = \"wasm32\",\n                        ));\n\n                        // The vector is initialized to `-3.`s: [-3, -3, -3, -3]\n                        for i in 0..$id::lanes() {\n                            // We replace the i-th element of the vector with\n                            // `NaN`: [-3, -3, -3, NaN]\n                            let mut v = v0.replace(i, n);\n\n                            // If the NaN is in the last place, the LLVM\n                            // implementation of these methods is broken on some\n                            // targets:\n                            if i == $id::lanes() - 1 &&\n                              target_with_broken_last_lane_nan {\n                                assert_eq!(v.max_element(), -3.,\n                                        \"[A]: nan at {} => {} | {:?}\",\n                                        i, v.max_element(), v);\n\n                                // If we replace all the elements in the vector\n                                // up-to the `i-th` lane with `NaN`s, the result\n                                // is still always `-3.` unless all elements of\n                                // the vector are `NaN`s:\n                                for j in 0..i {\n                                    v = v.replace(j, n);\n                                    if j == i-1 {\n                                        assert!(v.min_element().is_nan(),\n                                        \"[B]: nan at {} => {} | {:?}\",\n                                        i, v.min_element(), v);\n                                    } else {\n                                        assert_eq!(v.max_element(), 
-3.,\n                                            \"[B]: nan at {} => {} | {:?}\",\n                                            i, v.max_element(), v);\n                                    }\n                                }\n\n                                // We are done here, since we were in the last\n                                // lane which is the last iteration of the loop.\n                                break\n                            }\n\n                            // We are not in the last lane, and there is only\n                            // one `NaN` in the vector.\n\n                            // If the vector has one lane, the result is `NaN`:\n                            if $id::lanes() == 1 {\n                                assert!(v.max_element().is_nan(),\n                                        \"[C]: all nans | v={:?} | min={} | \\\nis_nan: {}\",\n                                        v, v.max_element(),\n                                        v.max_element().is_nan());\n\n                                // And we are done, since the vector only has\n                                // one lane anyways.\n                                break;\n                            }\n\n                            // The vector has more than one lane, since there is\n                            // only one `NaN` in the vector, the result is\n                            // always `-3`.\n                            assert_eq!(v.max_element(), -3.,\n                                       \"[D]: nan at {} => {} | {:?}\",\n                                       i, v.max_element(), v);\n\n                            // If we replace all the elements in the vector\n                            // up-to the `i-th` lane with `NaN`s, the result is\n                            // still always `-3.` unless all elements of the\n                            // vector are `NaN`s:\n                            for j in 0..i {\n                               
 v = v.replace(j, n);\n\n                                if i == $id::lanes() - 1 && j == i - 1 {\n                                    // All elements of the vector are `NaN`s,\n                                    // therefore the result is NaN as well.\n                                    //\n                                    // Note: the #lanes of the vector is > 1, so\n                                    // \"i - 1\" does not overflow.\n                                    assert!(v.max_element().is_nan(),\n                                            \"[E]: all nans | v={:?} | max={} | \\\nis_nan: {}\",\n                                            v, v.max_element(),\n                                            v.max_element().is_nan());\n                                } else {\n                                    // There are non-`NaN` elements in the\n                                    // vector, therefore the result is `-3.`:\n                                    assert_eq!(v.max_element(), -3.,\n                                               \"[F]: nan at {} => {} | {:?}\",\n                                               i, v.max_element(), v);\n                                }\n                            }\n                        }\n\n                        // If the vector contains all NaNs the result is NaN:\n                        assert!($id::splat(n).max_element().is_nan(),\n                                \"all nans | v={:?} | max={} | is_nan: {}\",\n                                $id::splat(n), $id::splat(n).max_element(),\n                                $id::splat(n).max_element().is_nan());\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/reductions.rs",
    "content": "//! Reductions\n\n#[macro_use]\nmod float_arithmetic;\n#[macro_use]\nmod integer_arithmetic;\n#[macro_use]\nmod bitwise;\n#[macro_use]\nmod mask;\n#[macro_use]\nmod min_max;\n"
  },
  {
    "path": "src/api/select.rs",
    "content": "//! Implements mask's `select`.\n\n/// Implements mask select method\nmacro_rules! impl_select {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Selects elements of `a` and `b` using mask.\n            ///\n            /// The lanes of the result for which the mask is `true` contain\n            /// the values of `a`. The remaining lanes contain the values of\n            /// `b`.\n            #[inline]\n            pub fn select<T>(self, a: Simd<T>, b: Simd<T>) -> Simd<T>\n            where\n                T: sealed::SimdArray<NT = <[$elem_ty; $elem_count] as sealed::SimdArray>::NT>,\n            {\n                use crate::llvm::simd_select;\n                Simd(unsafe { simd_select(self.0, a.0, b.0) })\n            }\n        }\n\n        test_select!(bool, $id, $id, (false, true) | $test_tt);\n    };\n}\n\nmacro_rules! test_select {\n    (\n        $elem_ty:ident,\n        $mask_ty:ident,\n        $vec_ty:ident,($small:expr, $large:expr) |\n        $test_tt:tt\n    ) => {\n        test_if! {\n            $test_tt:\n            paste::item! 
{\n                pub mod [<$vec_ty _select>] {\n                    use super::*;\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn select() {\n                        let o = $small as $elem_ty;\n                        let t = $large as $elem_ty;\n\n                        let a = $vec_ty::splat(o);\n                        let b = $vec_ty::splat(t);\n                        let m = a.lt(b);\n                        assert_eq!(m.select(a, b), a);\n\n                        let m = b.lt(a);\n                        assert_eq!(m.select(b, a), a);\n\n                        let mut c = a;\n                        let mut d = b;\n                        let mut m_e = $mask_ty::splat(false);\n                        for i in 0..$vec_ty::lanes() {\n                            if i % 2 == 0 {\n                                let c_tmp = c.extract(i);\n                                c = c.replace(i, d.extract(i));\n                                d = d.replace(i, c_tmp);\n                            } else {\n                                m_e = m_e.replace(i, true);\n                            }\n                        }\n\n                        let m = c.lt(d);\n                        assert_eq!(m_e, m);\n                        assert_eq!(m.select(c, d), a);\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/shuffle.rs",
    "content": "//! Implements portable vector shuffles with immediate indices.\n\n// FIXME: comprehensive tests\n// https://github.com/rust-lang-nursery/packed_simd/issues/20\n\n/// Shuffles vector elements.\n///\n/// This macro returns a new vector that contains a shuffle of the elements in\n/// one (`shuffle!(vec, [indices...])`) or two (`shuffle!(vec0, vec1,\n/// [indices...])`) input vectors.\n///\n/// The type of `vec0` and `vec1` must be equal, and the element type of the\n/// resulting vector is the element type of the input vector.\n///\n/// The number of `indices` must be a power-of-two in range `[0, 64)`, since\n/// currently, the largest vector supported by the library has 64 lanes. The\n/// length of the resulting vector equals the number of indices provided.\n///\n/// The indices must be in range `[0, M * N)` where `M` is the number of input\n/// vectors (`1` or `2`) and `N` is the number of lanes of the input vectors.\n/// The indices `i` in range `[0, N)` refer to the `i`-th element of `vec0`,\n/// while the indices in range `[N, 2*N)` refer to the `i - N`-th element of\n/// `vec1`.\n///\n/// # Examples\n///\n/// Shuffling elements of two vectors:\n///\n/// ```\n/// # use packed_simd::*;\n/// # fn main() {\n/// // Shuffle allows reordering the elements:\n/// let x = i32x4::new(1, 2, 3, 4);\n/// let y = i32x4::new(5, 6, 7, 8);\n/// let r = shuffle!(x, y, [4, 0, 5, 1]);\n/// assert_eq!(r, i32x4::new(5, 1, 6, 2));\n///\n/// // The resulting vector can als be smaller than the input:\n/// let r = shuffle!(x, y, [1, 6]);\n/// assert_eq!(r, i32x2::new(2, 7));\n///\n/// // Or larger:\n/// let r = shuffle!(x, y, [1, 3, 4, 2, 1, 7, 2, 2]);\n/// assert_eq!(r, i32x8::new(2, 4, 5, 3, 2, 8, 3, 3));\n/// // At most 2 * the number of lanes in the input vector.\n/// # }\n/// ```\n///\n/// Shuffling elements of one vector:\n///\n/// ```\n/// # use packed_simd::*;\n/// # fn main() {\n/// // Shuffle allows reordering the elements of a vector:\n/// let x = i32x4::new(1, 
2, 3, 4);\n/// let r = shuffle!(x, [2, 1, 3, 0]);\n/// assert_eq!(r, i32x4::new(3, 2, 4, 1));\n///\n/// // The resulting vector can be smaller than the input:\n/// let r = shuffle!(x, [1, 3]);\n/// assert_eq!(r, i32x2::new(2, 4));\n///\n/// // Equal:\n/// let r = shuffle!(x, [1, 3, 2, 0]);\n/// assert_eq!(r, i32x4::new(2, 4, 3, 1));\n///\n/// // Or larger:\n/// let r = shuffle!(x, [1, 3, 2, 2, 1, 3, 2, 2]);\n/// assert_eq!(r, i32x8::new(2, 4, 3, 3, 2, 4, 3, 3));\n/// // At most 2 * the number of lanes in the input vector.\n/// # }\n/// ```\n#[macro_export]\nmacro_rules! shuffle {\n    ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr]) => {{\n        #[allow(unused_unsafe)]\n        unsafe {\n            $crate::Simd($crate::__shuffle_vector2::<{[$l0, $l1]}, _, _>(\n                $vec0.0,\n                $vec1.0,\n            ))\n        }\n    }};\n    ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr]) => {{\n        #[allow(unused_unsafe)]\n        unsafe {\n            $crate::Simd($crate::__shuffle_vector4::<{[$l0, $l1, $l2, $l3]}, _, _>(\n                $vec0.0,\n                $vec1.0,\n            ))\n        }\n    }};\n    ($vec0:expr, $vec1:expr,\n     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,\n      $l4:expr, $l5:expr, $l6:expr, $l7:expr]) => {{\n        #[allow(unused_unsafe)]\n        unsafe {\n            $crate::Simd($crate::__shuffle_vector8::<{[$l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7]}, _, _>(\n                $vec0.0,\n                $vec1.0,\n            ))\n        }\n    }};\n    ($vec0:expr, $vec1:expr,\n     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,\n      $l4:expr, $l5:expr, $l6:expr, $l7:expr,\n      $l8:expr, $l9:expr, $l10:expr, $l11:expr,\n      $l12:expr, $l13:expr, $l14:expr, $l15:expr]) => {{\n        #[allow(unused_unsafe)]\n        unsafe {\n            $crate::Simd($crate::__shuffle_vector16::<{\n                [\n                    $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10,\n                    $l11, 
$l12, $l13, $l14, $l15,\n                ]\n            }, _, _>(\n                $vec0.0,\n                $vec1.0,\n            ))\n        }\n    }};\n    ($vec0:expr, $vec1:expr,\n     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,\n      $l4:expr, $l5:expr, $l6:expr, $l7:expr,\n      $l8:expr, $l9:expr, $l10:expr, $l11:expr,\n      $l12:expr, $l13:expr, $l14:expr, $l15:expr,\n      $l16:expr, $l17:expr, $l18:expr, $l19:expr,\n      $l20:expr, $l21:expr, $l22:expr, $l23:expr,\n      $l24:expr, $l25:expr, $l26:expr, $l27:expr,\n      $l28:expr, $l29:expr, $l30:expr, $l31:expr]) => {{\n        #[allow(unused_unsafe)]\n        unsafe {\n            $crate::Simd($crate::__shuffle_vector32::<{\n                [\n                    $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10,\n                    $l11, $l12, $l13, $l14, $l15, $l16, $l17, $l18, $l19,\n                    $l20, $l21, $l22, $l23, $l24, $l25, $l26, $l27, $l28,\n                    $l29, $l30, $l31,\n                ]\n            }, _, _>(\n                $vec0.0,\n                $vec1.0,\n            ))\n        }\n    }};\n    ($vec0:expr, $vec1:expr,\n     [$l0:expr, $l1:expr, $l2:expr, $l3:expr,\n      $l4:expr, $l5:expr, $l6:expr, $l7:expr,\n      $l8:expr, $l9:expr, $l10:expr, $l11:expr,\n      $l12:expr, $l13:expr, $l14:expr, $l15:expr,\n      $l16:expr, $l17:expr, $l18:expr, $l19:expr,\n      $l20:expr, $l21:expr, $l22:expr, $l23:expr,\n      $l24:expr, $l25:expr, $l26:expr, $l27:expr,\n      $l28:expr, $l29:expr, $l30:expr, $l31:expr,\n      $l32:expr, $l33:expr, $l34:expr, $l35:expr,\n      $l36:expr, $l37:expr, $l38:expr, $l39:expr,\n      $l40:expr, $l41:expr, $l42:expr, $l43:expr,\n      $l44:expr, $l45:expr, $l46:expr, $l47:expr,\n      $l48:expr, $l49:expr, $l50:expr, $l51:expr,\n      $l52:expr, $l53:expr, $l54:expr, $l55:expr,\n      $l56:expr, $l57:expr, $l58:expr, $l59:expr,\n      $l60:expr, $l61:expr, $l62:expr, $l63:expr]) => {{\n        #[allow(unused_unsafe)]\n        
unsafe {\n            $crate::Simd($crate::__shuffle_vector64::<{[\n                $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10,\n                $l11, $l12, $l13, $l14, $l15, $l16, $l17, $l18, $l19,\n                $l20, $l21, $l22, $l23, $l24, $l25, $l26, $l27, $l28,\n                $l29, $l30, $l31, $l32, $l33, $l34, $l35, $l36, $l37,\n                $l38, $l39, $l40, $l41, $l42, $l43, $l44, $l45, $l46,\n                $l47, $l48, $l49, $l50, $l51, $l52, $l53, $l54, $l55,\n                $l56, $l57, $l58, $l59, $l60, $l61, $l62, $l63,\n            ]}, _, _>(\n                $vec0.0,\n                $vec1.0,\n            ))\n        }\n     }};\n    ($vec:expr, [$($l:expr),*]) => {\n        match $vec {\n            v => shuffle!(v, v, [$($l),*])\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/shuffle1_dyn.rs",
    "content": "//! Shuffle vector elements according to a dynamic vector of indices.\n\nmacro_rules! impl_shuffle1_dyn {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Shuffle vector elements according to `indices`.\n            #[inline]\n            pub fn shuffle1_dyn<I>(self, indices: I) -> Self\n            where\n                Self: codegen::shuffle1_dyn::Shuffle1Dyn<Indices = I>,\n            {\n                codegen::shuffle1_dyn::Shuffle1Dyn::shuffle1_dyn(self, indices)\n            }\n        }\n    };\n}\n\nmacro_rules! test_shuffle1_dyn {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _shuffle1_dyn>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn shuffle1_dyn() {\n                        let increasing = {\n                            let mut v = $id::splat(0 as $elem_ty);\n                            for i in 0..$id::lanes() {\n                                v = v.replace(i, i as $elem_ty);\n                            }\n                            v\n                        };\n                        let decreasing = {\n                            let mut v = $id::splat(0 as $elem_ty);\n                            for i in 0..$id::lanes() {\n                                v = v.replace(\n                                    i,\n                                    ($id::lanes() - 1 - i) as $elem_ty\n                                );\n                            }\n                            v\n                        };\n\n                        type Indices = <\n                            $id as codegen::shuffle1_dyn::Shuffle1Dyn\n                            >::Indices;\n                        let increasing_ids: 
Indices = increasing.cast();\n                        let decreasing_ids: Indices = decreasing.cast();\n\n                        assert_eq!(\n                            increasing.shuffle1_dyn(increasing_ids),\n                            increasing,\n                            \"(i,i)=>i\"\n                        );\n                        assert_eq!(\n                            decreasing.shuffle1_dyn(increasing_ids),\n                            decreasing,\n                            \"(d,i)=>d\"\n                        );\n                        assert_eq!(\n                            increasing.shuffle1_dyn(decreasing_ids),\n                            decreasing,\n                            \"(i,d)=>d\"\n                        );\n                        assert_eq!(\n                            decreasing.shuffle1_dyn(decreasing_ids),\n                            increasing,\n                            \"(d,d)=>i\"\n                        );\n\n                        for i in 0..$id::lanes() {\n                            let v_ids: Indices\n                                = $id::splat(i as $elem_ty).cast();\n                            assert_eq!(increasing.shuffle1_dyn(v_ids),\n                                       $id::splat(increasing.extract(i))\n                            );\n                            assert_eq!(decreasing.shuffle1_dyn(v_ids),\n                                       $id::splat(decreasing.extract(i))\n                            );\n                            assert_eq!(\n                                $id::splat(i as $elem_ty)\n                                    .shuffle1_dyn(increasing_ids),\n                                $id::splat(i as $elem_ty)\n                            );\n                            assert_eq!(\n                                $id::splat(i as $elem_ty)\n                                    .shuffle1_dyn(decreasing_ids),\n                                $id::splat(i as $elem_ty)\n         
                   );\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! test_shuffle1_dyn_mask {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _shuffle1_dyn>] {\n                    use super::*;\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn shuffle1_dyn() {\n                        // alternating = [true, false, true, false, ...]\n                        let mut alternating = $id::splat(false);\n                        for i in 0..$id::lanes() {\n                            if i % 2 == 0 {\n                                alternating = alternating.replace(i, true);\n                            }\n                        }\n\n                        type Indices = <\n                            $id as codegen::shuffle1_dyn::Shuffle1Dyn\n                            >::Indices;\n                        // even = [0, 0, 2, 2, 4, 4, ..]\n                        let even = {\n                            let mut v = Indices::splat(0);\n                            for i in 0..$id::lanes() {\n                                if i % 2 == 0 {\n                                    v = v.replace(i, (i as u8).into());\n                                } else {\n                                    v = v.replace(i, (i as u8 - 1).into());\n                                }\n                            }\n                            v\n                        };\n                        // odd = [1, 1, 3, 3, 5, 5, ...]\n                        let odd = {\n                            let mut v = Indices::splat(0);\n                            for i in 0..$id::lanes() {\n                                if i % 2 != 0 {\n                                    v = v.replace(i, (i as 
u8).into());\n                                } else {\n                                    v = v.replace(i, (i as u8 + 1).into());\n                                }\n                            }\n                            v\n                        };\n\n                        assert_eq!(\n                            alternating.shuffle1_dyn(even),\n                            $id::splat(true)\n                        );\n                        if $id::lanes() > 1 {\n                            assert_eq!(\n                                alternating.shuffle1_dyn(odd),\n                                $id::splat(false)\n                            );\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/slice/from_slice.rs",
    "content": "//! Implements methods to read a vector type from a slice.\n\nmacro_rules! impl_slice_from_slice {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Panics\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned\n            /// to an `align_of::<Self>()` boundary.\n            #[inline]\n            pub fn from_slice_aligned(slice: &[$elem_ty]) -> Self {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    let target_ptr = slice.as_ptr();\n                    assert_eq!(target_ptr.align_offset(crate::mem::align_of::<Self>()), 0);\n                    Self::from_slice_aligned_unchecked(slice)\n                }\n            }\n\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Panics\n            ///\n            /// If `slice.len() < Self::lanes()`.\n            #[inline]\n            pub fn from_slice_unaligned(slice: &[$elem_ty]) -> Self {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    Self::from_slice_unaligned_unchecked(slice)\n                }\n            }\n\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned\n            /// to an `align_of::<Self>()` boundary, the behavior is undefined.\n            #[inline]\n            pub unsafe fn from_slice_aligned_unchecked(slice: &[$elem_ty]) -> Self {\n                debug_assert!(slice.len() >= $elem_count);\n                let target_ptr = slice.as_ptr();\n                debug_assert_eq!(target_ptr.align_offset(crate::mem::align_of::<Self>()), 0);\n\n                
#[allow(clippy::cast_ptr_alignment)]\n                *(target_ptr as *const Self)\n            }\n\n            /// Instantiates a new vector with the values of the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` the behavior is undefined.\n            #[inline]\n            pub unsafe fn from_slice_unaligned_unchecked(slice: &[$elem_ty]) -> Self {\n                use crate::mem::size_of;\n                debug_assert!(slice.len() >= $elem_count);\n                let target_ptr = slice.as_ptr().cast();\n                let mut x = Self::splat(0 as $elem_ty);\n                let self_ptr = &mut x as *mut Self as *mut u8;\n                crate::ptr::copy_nonoverlapping(target_ptr, self_ptr, size_of::<Self>());\n                x\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                // Comparisons use integer casts within mantissa^1 range.\n                #[allow(clippy::float_cmp)]\n                pub mod [<$id _slice_from_slice>] {\n                    use super::*;\n                    use crate::iter::Iterator;\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn from_slice_unaligned() {\n                        let mut unaligned = [42 as $elem_ty; $id::lanes() + 1];\n                        unaligned[0] = 0 as $elem_ty;\n                        let vec = $id::from_slice_unaligned(&unaligned[1..]);\n                        for (index, &b) in unaligned.iter().enumerate() {\n                            if index == 0 {\n                                assert_eq!(b, 0 as $elem_ty);\n                            } else {\n                                assert_eq!(b, 42 as $elem_ty);\n                                assert_eq!(b, vec.extract(index - 1));\n                            }\n                        }\n  
                  }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn from_slice_unaligned_fail() {\n                        let mut unaligned = [42 as $elem_ty; $id::lanes() + 1];\n                        unaligned[0] = 0 as $elem_ty;\n                        // the slice is not large enough => panic\n                        let _vec = $id::from_slice_unaligned(&unaligned[2..]);\n                    }\n\n                    union A {\n                        data: [$elem_ty; 2 * $id::lanes()],\n                        _vec: $id,\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn from_slice_aligned() {\n                        let mut aligned = A {\n                            data: [0 as $elem_ty; 2 * $id::lanes()],\n                        };\n                        for i in $id::lanes()..(2 * $id::lanes()) {\n                            unsafe {\n                                aligned.data[i] = 42 as $elem_ty;\n                            }\n                        }\n\n                        let vec = unsafe {\n                            $id::from_slice_aligned(\n                                &aligned.data[$id::lanes()..]\n                            )\n                        };\n                        for (index, &b) in\n                            unsafe { aligned.data.iter().enumerate() } {\n                            if index < $id::lanes() {\n                                assert_eq!(b, 0 as $elem_ty);\n                            } else {\n                                
assert_eq!(b, 42 as $elem_ty);\n                                assert_eq!(\n                                    b, vec.extract(index - $id::lanes())\n                                );\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn from_slice_aligned_fail_lanes() {\n                        let aligned = A {\n                            data: [0 as $elem_ty; 2 * $id::lanes()],\n                        };\n                        let _vec = unsafe {\n                            $id::from_slice_aligned(\n                                &aligned.data[2 * $id::lanes()..]\n                            )\n                        };\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn from_slice_aligned_fail_align() {\n                        unsafe {\n                            let aligned = A {\n                                data: [0 as $elem_ty; 2 * $id::lanes()],\n                            };\n\n                            // get a pointer to the front of data\n                            let ptr: *const $elem_ty = aligned.data.as_ptr()\n                                as *const $elem_ty;\n                            // offset pointer by one element\n                            let ptr = 
ptr.wrapping_add(1);\n\n                            if ptr.align_offset(\n                                crate::mem::align_of::<$id>()\n                            ) == 0 {\n                                // the pointer is properly aligned, so\n                                // from_slice_aligned won't fail here (e.g. this\n                                // can happen for i128x1). So we panic to make\n                                // the \"should_fail\" test pass:\n                                panic!(\"ok\");\n                            }\n\n                            // create a slice - this is safe, because the\n                            // elements of the slice exist, are properly\n                            // initialized, and properly aligned:\n                            let s: &[$elem_ty] = slice::from_raw_parts(\n                                ptr, $id::lanes()\n                            );\n                            // this should always panic because the slice\n                            // alignment does not match the alignment\n                            // requirements for the vector type:\n                            let _vec = $id::from_slice_aligned(s);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/slice/write_to_slice.rs",
    "content": "//! Implements methods to write a vector type to a slice.\n\nmacro_rules! impl_slice_write_to_slice {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Panics\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not\n            /// aligned to an `align_of::<Self>()` boundary.\n            #[inline]\n            pub fn write_to_slice_aligned(self, slice: &mut [$elem_ty]) {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    let target_ptr = slice.as_mut_ptr();\n                    assert_eq!(target_ptr.align_offset(crate::mem::align_of::<Self>()), 0);\n                    self.write_to_slice_aligned_unchecked(slice);\n                }\n            }\n\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Panics\n            ///\n            /// If `slice.len() < Self::lanes()`.\n            #[inline]\n            pub fn write_to_slice_unaligned(self, slice: &mut [$elem_ty]) {\n                unsafe {\n                    assert!(slice.len() >= $elem_count);\n                    self.write_to_slice_unaligned_unchecked(slice);\n                }\n            }\n\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` or `&slice[0]` is not\n            /// aligned to an `align_of::<Self>()` boundary, the behavior is\n            /// undefined.\n            #[inline]\n            pub unsafe fn write_to_slice_aligned_unchecked(self, slice: &mut [$elem_ty]) {\n                debug_assert!(slice.len() >= $elem_count);\n                let target_ptr = slice.as_mut_ptr();\n                debug_assert_eq!(target_ptr.align_offset(crate::mem::align_of::<Self>()), 
0);\n\n                #[allow(clippy::cast_ptr_alignment)]\n                *(target_ptr as *mut Self) = self;\n            }\n\n            /// Writes the values of the vector to the `slice`.\n            ///\n            /// # Safety\n            ///\n            /// If `slice.len() < Self::lanes()` the behavior is undefined.\n            #[inline]\n            pub unsafe fn write_to_slice_unaligned_unchecked(self, slice: &mut [$elem_ty]) {\n                debug_assert!(slice.len() >= $elem_count);\n                let target_ptr = slice.as_mut_ptr().cast();\n                let self_ptr = &self as *const Self as *const u8;\n                crate::ptr::copy_nonoverlapping(self_ptr, target_ptr, crate::mem::size_of::<Self>());\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                // Comparisons use integer casts within mantissa^1 range.\n                #[allow(clippy::float_cmp)]\n                pub mod [<$id _slice_write_to_slice>] {\n                    use super::*;\n                    use crate::iter::Iterator;\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn write_to_slice_unaligned() {\n                        let mut unaligned = [0 as $elem_ty; $id::lanes() + 1];\n                        let vec = $id::splat(42 as $elem_ty);\n                        vec.write_to_slice_unaligned(&mut unaligned[1..]);\n                        for (index, &b) in unaligned.iter().enumerate() {\n                            if index == 0 {\n                                assert_eq!(b, 0 as $elem_ty);\n                            } else {\n                                assert_eq!(b, 42 as $elem_ty);\n                                
assert_eq!(b, vec.extract(index - 1));\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn write_to_slice_unaligned_fail() {\n                        let mut unaligned = [0 as $elem_ty; $id::lanes() + 1];\n                        let vec = $id::splat(42 as $elem_ty);\n                        vec.write_to_slice_unaligned(&mut unaligned[2..]);\n                    }\n\n                    union A {\n                        data: [$elem_ty; 2 * $id::lanes()],\n                        _vec: $id,\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn write_to_slice_aligned() {\n                        let mut aligned = A {\n                            data: [0 as $elem_ty; 2 * $id::lanes()],\n                        };\n                        let vec = $id::splat(42 as $elem_ty);\n                        unsafe {\n                            vec.write_to_slice_aligned(\n                                &mut aligned.data[$id::lanes()..]\n                            );\n                            for (idx, &b) in aligned.data.iter().enumerate() {\n                                if idx < $id::lanes() {\n                                    assert_eq!(b, 0 as $elem_ty);\n                                } else {\n                                    assert_eq!(b, 42 as $elem_ty);\n                                    assert_eq!(\n                                        b, vec.extract(idx - $id::lanes())\n                         
           );\n                                }\n                            }\n                        }\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn write_to_slice_aligned_fail_lanes() {\n                        let mut aligned = A {\n                            data: [0 as $elem_ty; 2 * $id::lanes()],\n                        };\n                        let vec = $id::splat(42 as $elem_ty);\n                        unsafe {\n                            vec.write_to_slice_aligned(\n                                &mut aligned.data[2 * $id::lanes()..]\n                            )\n                        };\n                    }\n\n                    // FIXME: wasm-bindgen-test does not support #[should_panic]\n                    // #[cfg_attr(not(target_arch = \"wasm32\"), test)]\n                    // #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    #[cfg(not(target_arch = \"wasm32\"))]\n                    #[test]\n                    #[should_panic]\n                    fn write_to_slice_aligned_fail_align() {\n                        unsafe {\n                            let mut aligned = A {\n                                data: [0 as $elem_ty; 2 * $id::lanes()],\n                            };\n\n                            // get a pointer to the front of data\n                            let ptr: *mut $elem_ty\n                                = aligned.data.as_mut_ptr() as *mut $elem_ty;\n                            // offset pointer by one element\n                            let ptr = ptr.wrapping_add(1);\n\n                            if 
ptr.align_offset(crate::mem::align_of::<$id>())\n                                == 0 {\n                                // the pointer is properly aligned, so\n                                // write_to_slice_aligned won't fail here (e.g.\n                                // this can happen for i128x1). So we panic to\n                                // make the \"should_fail\" test pass:\n                                panic!(\"ok\");\n                            }\n\n                            // create a slice - this is safe, because the\n                            // elements of the slice exist, are properly\n                            // initialized, and properly aligned:\n                            let s: &mut [$elem_ty]\n                                = slice::from_raw_parts_mut(ptr, $id::lanes());\n                            // this should always panic because the slice\n                            // alignment does not match the alignment\n                            // requirements for the vector type:\n                            let vec = $id::splat(42 as $elem_ty);\n                            vec.write_to_slice_aligned(s);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api/slice.rs",
    "content": "//! Slice from/to methods\n\n#[macro_use]\nmod from_slice;\n\n#[macro_use]\nmod write_to_slice;\n"
  },
  {
    "path": "src/api/swap_bytes.rs",
    "content": "//! Horizontal swap bytes\n\nmacro_rules! impl_swap_bytes {\n    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {\n        impl $id {\n            /// Reverses the byte order of the vector.\n            #[inline]\n            pub fn swap_bytes(self) -> Self {\n                super::codegen::swap_bytes::SwapBytes::swap_bytes(self)\n            }\n\n            /// Converts self to little endian from the target's endianness.\n            ///\n            /// On little endian this is a no-op. On big endian the bytes are\n            /// swapped.\n            #[inline]\n            pub fn to_le(self) -> Self {\n                #[cfg(target_endian = \"little\")]\n                {\n                    self\n                }\n                #[cfg(not(target_endian = \"little\"))]\n                {\n                    self.swap_bytes()\n                }\n            }\n\n            /// Converts self to big endian from the target's endianness.\n            ///\n            /// On big endian this is a no-op. On little endian the bytes are\n            /// swapped.\n            #[inline]\n            pub fn to_be(self) -> Self {\n                #[cfg(target_endian = \"big\")]\n                {\n                    self\n                }\n                #[cfg(not(target_endian = \"big\"))]\n                {\n                    self.swap_bytes()\n                }\n            }\n\n            /// Converts a vector from little endian to the target's endianness.\n            ///\n            /// On little endian this is a no-op. 
On big endian the bytes are\n            /// swapped.\n            #[inline]\n            pub fn from_le(x: Self) -> Self {\n                #[cfg(target_endian = \"little\")]\n                {\n                    x\n                }\n                #[cfg(not(target_endian = \"little\"))]\n                {\n                    x.swap_bytes()\n                }\n            }\n\n            /// Converts a vector from big endian to the target's endianness.\n            ///\n            /// On big endian this is a no-op. On little endian the bytes are\n            /// swapped.\n            #[inline]\n            pub fn from_be(x: Self) -> Self {\n                #[cfg(target_endian = \"big\")]\n                {\n                    x\n                }\n                #[cfg(not(target_endian = \"big\"))]\n                {\n                    x.swap_bytes()\n                }\n            }\n        }\n\n        test_if! {\n            $test_tt:\n            paste::item! {\n                pub mod [<$id _swap_bytes>] {\n                    use super::*;\n\n                    const BYTES: [u8; 64] = [\n                        0, 1, 2, 3, 4, 5, 6, 7,\n                        8, 9, 10, 11, 12, 13, 14, 15,\n                        16, 17, 18, 19, 20, 21, 22, 23,\n                        24, 25, 26, 27, 28, 29, 30, 31,\n                        32, 33, 34, 35, 36, 37, 38, 39,\n                        40, 41, 42, 43, 44, 45, 46, 47,\n                        48, 49, 50, 51, 52, 53, 54, 55,\n                        56, 57, 58, 59, 60, 61, 62, 63,\n                    ];\n\n                    macro_rules! 
swap {\n                        ($func: ident) => {{\n                            // catch possible future >512 vectors\n                            assert!(mem::size_of::<$id>() <= 64);\n\n                            let mut actual = BYTES;\n                            let elems: &mut [$elem_ty] = unsafe {\n                                slice::from_raw_parts_mut(\n                                    actual.as_mut_ptr() as *mut $elem_ty,\n                                    $id::lanes(),\n                                )\n                            };\n\n                            let vec = $id::from_slice_unaligned(elems);\n                            $id::$func(vec).write_to_slice_unaligned(elems);\n\n                            actual\n                        }};\n                    }\n\n                    macro_rules! test_swap {\n                        ($func: ident) => {{\n                            let actual = swap!($func);\n                            let expected =\n                                BYTES.iter().rev()\n                                .skip(64 - crate::mem::size_of::<$id>());\n                            assert!(actual.iter().zip(expected)\n                                    .all(|(x, y)| x == y));\n                        }};\n                    }\n\n                    macro_rules! 
test_no_swap {\n                        ($func: ident) => {{\n                            let actual = swap!($func);\n                            let expected = BYTES.iter()\n                                .take(mem::size_of::<$id>());\n\n                            assert!(actual.iter().zip(expected)\n                                    .all(|(x, y)| x == y));\n                        }};\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn swap_bytes() {\n                        test_swap!(swap_bytes);\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn to_le() {\n                        #[cfg(target_endian = \"little\")]\n                        {\n                            test_no_swap!(to_le);\n                        }\n                        #[cfg(not(target_endian = \"little\"))]\n                        {\n                            test_swap!(to_le);\n                        }\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn to_be() {\n                        #[cfg(target_endian = \"big\")]\n                        {\n                            test_no_swap!(to_be);\n                        }\n                        #[cfg(not(target_endian = \"big\"))]\n                        {\n                            test_swap!(to_be);\n                        }\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn from_le() {\n                        #[cfg(target_endian = \"little\")]\n                        {\n                            test_no_swap!(from_le);\n       
                 }\n                        #[cfg(not(target_endian = \"little\"))]\n                        {\n                            test_swap!(from_le);\n                        }\n                    }\n\n                    #[cfg_attr(not(target_arch = \"wasm32\"), test)] #[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\n                    fn from_be() {\n                        #[cfg(target_endian = \"big\")]\n                        {\n                            test_no_swap!(from_be);\n                        }\n                        #[cfg(not(target_endian = \"big\"))]\n                        {\n                            test_swap!(from_be);\n                        }\n                    }\n                }\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/api.rs",
    "content": "//! Implements the Simd<[T; N]> APIs\n\n#[macro_use]\nmod bitmask;\npub(crate) mod cast;\n#[macro_use]\nmod cmp;\n#[macro_use]\nmod default;\n#[macro_use]\nmod fmt;\n#[macro_use]\nmod from;\n#[macro_use]\nmod hash;\n#[macro_use]\nmod math;\n#[macro_use]\nmod minimal;\n#[macro_use]\nmod ops;\n#[macro_use]\nmod ptr;\n#[macro_use]\nmod reductions;\n#[macro_use]\nmod select;\n#[macro_use]\nmod shuffle;\n#[macro_use]\nmod shuffle1_dyn;\n#[macro_use]\nmod slice;\n#[macro_use]\nmod swap_bytes;\n#[macro_use]\nmod bit_manip;\n\n#[cfg(feature = \"into_bits\")]\npub(crate) mod into_bits;\n\nmacro_rules! impl_i {\n    ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident\n     | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),*\n     | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {\n        impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n                          | $($elem_ids),* | $(#[$doc])*);\n        impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)\n        );\n        impl_ops_scalar_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)\n        );\n        impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_int_min_max!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt\n        );\n        impl_reduction_integer_arithmetic!(\n            [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n        );\n        impl_reduction_min_max!(\n            [$elem_ty; $elem_n]: 
$tuple_id | $ielem_ty | $test_tt\n        );\n        impl_reduction_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n            | (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0)\n        );\n        impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 1));\n        impl_from_vectors!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*\n        );\n        impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_partial_eq!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)\n        );\n        impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));\n        impl_cmp_vertical!(\n            [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt\n        );\n        impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));\n        impl_bitmask!($tuple_id | $ibitmask_ty | (-1, 0) | $test_tt);\n\n        test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt);\n        test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        test_shuffle1_dyn!([$elem_ty; $elem_n]: 
$tuple_id | $test_tt);\n    }\n}\n\nmacro_rules! impl_u {\n    ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident\n     | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),*\n     | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {\n        impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n                          | $($elem_ids),* | $(#[$doc])*);\n        impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)\n        );\n        impl_ops_scalar_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)\n        );\n        impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_int_min_max!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt\n        );\n        impl_reduction_integer_arithmetic!(\n            [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n        );\n        impl_reduction_min_max!(\n            [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n        );\n        impl_reduction_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n            | (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0)\n        );\n        impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt 
| (1, 1));\n        impl_from_vectors!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*\n        );\n        impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_partial_eq!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 0)\n        );\n        impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));\n        impl_cmp_vertical!(\n            [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt\n        );\n        impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));\n        impl_bitmask!($tuple_id | $ibitmask_ty | ($ielem_ty::max_value(), 0) |\n                      $test_tt);\n\n        test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt);\n        test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n    }\n}\n\nmacro_rules! 
impl_f {\n    ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident\n     | $ielem_ty:ident | $test_tt:tt | $($elem_ids:ident),*\n     | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {\n        impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n                          | $($elem_ids),* | $(#[$doc])*);\n        impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_ops_vector_float_min_max!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt\n        );\n        impl_reduction_float_arithmetic!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_reduction_min_max!(\n            [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n        );\n        impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 1.));\n        impl_from_vectors!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*\n        );\n        impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_partial_eq!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 0.)\n        );\n        impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n\n        impl_float_consts!([$elem_ty; $elem_n]: $tuple_id);\n        impl_float_category!([$elem_ty; $elem_n]: $tuple_id, $mask_ty);\n\n        // floating-point math\n        impl_math_float_abs!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_cos!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_exp!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        
impl_math_float_ln!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_mul_add!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_mul_adde!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_powf!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_recpre!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_rsqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_sin!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_sqrt!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_sqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_math_float_tanh!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_vertical!(\n            [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1., 0.)\n                | $test_tt\n        );\n\n        test_select!($elem_ty, $mask_ty, $tuple_id, (1., 2.) | $test_tt);\n        test_reduction_float_min_max!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt\n        );\n        test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n    }\n}\n\nmacro_rules! 
impl_m {\n    ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident\n     | $ielem_ty:ident, $ibitmask_ty:ident\n     | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),*\n     | $(#[$doc:meta])*) => {\n        impl_minimal_mask!(\n            [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n            | $($elem_ids),* | $(#[$doc])*\n        );\n        impl_ops_vector_mask_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)\n        );\n        impl_ops_scalar_mask_bitwise!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)\n        );\n        impl_reduction_bitwise!(\n            [bool; $elem_n]: $tuple_id | $ielem_ty | $test_tt\n                | (|x|{ x != 0 }) | (true, false)\n        );\n        impl_reduction_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_fmt_debug!([bool; $elem_n]: $tuple_id | $test_tt);\n        impl_from_array!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt\n            | (crate::$elem_ty::new(true), true)\n        );\n        impl_from_vectors!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*\n        );\n        impl_default!([bool; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_partial_eq!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)\n        );\n        impl_cmp_eq!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)\n        );\n        impl_cmp_vertical!(\n            [$elem_ty; $elem_n]: $tuple_id, $tuple_id, true, (true, false)\n            | $test_tt\n        );\n        impl_select!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_cmp_ord!(\n            [$elem_ty; $elem_n]: $tuple_id | $test_tt | (false, true)\n        );\n        impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        impl_bitmask!($tuple_id | $ibitmask_ty | (true, false) | $test_tt);\n\n      
  test_cmp_partial_ord_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n        test_shuffle1_dyn_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt);\n    }\n}\n\nmacro_rules! impl_const_p {\n    ([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident,\n     $usize_ty:ident, $isize_ty:ident\n     | $test_tt:tt | $($elem_ids:ident),*\n     | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {\n        impl_minimal_p!(\n            [$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty\n                | ref_ | $test_tt | $($elem_ids),*\n                | (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])*\n        );\n        impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt);\n    }\n}\n\nmacro_rules! impl_mut_p {\n    ([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident,\n     $usize_ty:ident, $isize_ty:ident\n     | $test_tt:tt | $($elem_ids:ident),*\n     | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {\n        impl_minimal_p!(\n            [$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty\n                | ref_mut_ | $test_tt | $($elem_ids),*\n                | (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])*\n        );\n        impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt);\n        impl_ptr_write!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt);\n    }\n}\n"
  },
  {
    "path": "src/codegen/bit_manip.rs",
    "content": "//! LLVM bit manipulation intrinsics.\n#[rustfmt::skip]\n\npub(crate) use crate::*;\n\n#[allow(improper_ctypes, dead_code)]\nextern \"C\" {\n    #[link_name = \"llvm.ctlz.v2i8\"]\n    fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2;\n    #[link_name = \"llvm.ctlz.v4i8\"]\n    fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4;\n    #[link_name = \"llvm.ctlz.v8i8\"]\n    fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8;\n    #[link_name = \"llvm.ctlz.v16i8\"]\n    fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16;\n    #[link_name = \"llvm.ctlz.v32i8\"]\n    fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32;\n    #[link_name = \"llvm.ctlz.v64i8\"]\n    fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64;\n\n    #[link_name = \"llvm.ctlz.v2i16\"]\n    fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2;\n    #[link_name = \"llvm.ctlz.v4i16\"]\n    fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4;\n    #[link_name = \"llvm.ctlz.v8i16\"]\n    fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8;\n    #[link_name = \"llvm.ctlz.v16i16\"]\n    fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16;\n    #[link_name = \"llvm.ctlz.v32i16\"]\n    fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32;\n\n    #[link_name = \"llvm.ctlz.v2i32\"]\n    fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2;\n    #[link_name = \"llvm.ctlz.v4i32\"]\n    fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4;\n    #[link_name = \"llvm.ctlz.v8i32\"]\n    fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8;\n    #[link_name = \"llvm.ctlz.v16i32\"]\n    fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16;\n\n    #[link_name = \"llvm.ctlz.v2i64\"]\n    fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2;\n    #[link_name = \"llvm.ctlz.v4i64\"]\n    fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4;\n    #[link_name = \"llvm.ctlz.v8i64\"]\n    fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8;\n\n    #[link_name = 
\"llvm.ctlz.v1i128\"]\n    fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1;\n    #[link_name = \"llvm.ctlz.v2i128\"]\n    fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2;\n    #[link_name = \"llvm.ctlz.v4i128\"]\n    fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;\n\n    #[link_name = \"llvm.cttz.v2i8\"]\n    fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2;\n    #[link_name = \"llvm.cttz.v4i8\"]\n    fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4;\n    #[link_name = \"llvm.cttz.v8i8\"]\n    fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8;\n    #[link_name = \"llvm.cttz.v16i8\"]\n    fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16;\n    #[link_name = \"llvm.cttz.v32i8\"]\n    fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32;\n    #[link_name = \"llvm.cttz.v64i8\"]\n    fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64;\n\n    #[link_name = \"llvm.cttz.v2i16\"]\n    fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2;\n    #[link_name = \"llvm.cttz.v4i16\"]\n    fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4;\n    #[link_name = \"llvm.cttz.v8i16\"]\n    fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8;\n    #[link_name = \"llvm.cttz.v16i16\"]\n    fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16;\n    #[link_name = \"llvm.cttz.v32i16\"]\n    fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32;\n\n    #[link_name = \"llvm.cttz.v2i32\"]\n    fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2;\n    #[link_name = \"llvm.cttz.v4i32\"]\n    fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4;\n    #[link_name = \"llvm.cttz.v8i32\"]\n    fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8;\n    #[link_name = \"llvm.cttz.v16i32\"]\n    fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16;\n\n    #[link_name = \"llvm.cttz.v2i64\"]\n    fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2;\n    #[link_name = \"llvm.cttz.v4i64\"]\n    fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> 
u64x4;\n    #[link_name = \"llvm.cttz.v8i64\"]\n    fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8;\n\n    #[link_name = \"llvm.cttz.v1i128\"]\n    fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1;\n    #[link_name = \"llvm.cttz.v2i128\"]\n    fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2;\n    #[link_name = \"llvm.cttz.v4i128\"]\n    fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;\n\n    #[link_name = \"llvm.ctpop.v2i8\"]\n    fn ctpop_u8x2(x: u8x2) -> u8x2;\n    #[link_name = \"llvm.ctpop.v4i8\"]\n    fn ctpop_u8x4(x: u8x4) -> u8x4;\n    #[link_name = \"llvm.ctpop.v8i8\"]\n    fn ctpop_u8x8(x: u8x8) -> u8x8;\n    #[link_name = \"llvm.ctpop.v16i8\"]\n    fn ctpop_u8x16(x: u8x16) -> u8x16;\n    #[link_name = \"llvm.ctpop.v32i8\"]\n    fn ctpop_u8x32(x: u8x32) -> u8x32;\n    #[link_name = \"llvm.ctpop.v64i8\"]\n    fn ctpop_u8x64(x: u8x64) -> u8x64;\n\n    #[link_name = \"llvm.ctpop.v2i16\"]\n    fn ctpop_u16x2(x: u16x2) -> u16x2;\n    #[link_name = \"llvm.ctpop.v4i16\"]\n    fn ctpop_u16x4(x: u16x4) -> u16x4;\n    #[link_name = \"llvm.ctpop.v8i16\"]\n    fn ctpop_u16x8(x: u16x8) -> u16x8;\n    #[link_name = \"llvm.ctpop.v16i16\"]\n    fn ctpop_u16x16(x: u16x16) -> u16x16;\n    #[link_name = \"llvm.ctpop.v32i16\"]\n    fn ctpop_u16x32(x: u16x32) -> u16x32;\n\n    #[link_name = \"llvm.ctpop.v2i32\"]\n    fn ctpop_u32x2(x: u32x2) -> u32x2;\n    #[link_name = \"llvm.ctpop.v4i32\"]\n    fn ctpop_u32x4(x: u32x4) -> u32x4;\n    #[link_name = \"llvm.ctpop.v8i32\"]\n    fn ctpop_u32x8(x: u32x8) -> u32x8;\n    #[link_name = \"llvm.ctpop.v16i32\"]\n    fn ctpop_u32x16(x: u32x16) -> u32x16;\n\n    #[link_name = \"llvm.ctpop.v2i64\"]\n    fn ctpop_u64x2(x: u64x2) -> u64x2;\n    #[link_name = \"llvm.ctpop.v4i64\"]\n    fn ctpop_u64x4(x: u64x4) -> u64x4;\n    #[link_name = \"llvm.ctpop.v8i64\"]\n    fn ctpop_u64x8(x: u64x8) -> u64x8;\n\n    #[link_name = \"llvm.ctpop.v1i128\"]\n    fn ctpop_u128x1(x: u128x1) -> u128x1;\n    #[link_name = 
\"llvm.ctpop.v2i128\"]\n    fn ctpop_u128x2(x: u128x2) -> u128x2;\n    #[link_name = \"llvm.ctpop.v4i128\"]\n    fn ctpop_u128x4(x: u128x4) -> u128x4;\n}\n\npub(crate) trait BitManip {\n    fn ctpop(self) -> Self;\n    fn ctlz(self) -> Self;\n    fn cttz(self) -> Self;\n}\n\nmacro_rules! impl_bit_manip {\n    (inner: $ty:ident, $scalar:ty, $uty:ident,\n     $ctpop:ident, $ctlz:ident, $cttz:ident) => {\n        // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192\n        #[cfg(target_arch = \"s390x\")]\n        impl_bit_manip! { scalar: $ty, $scalar }\n        #[cfg(not(target_arch = \"s390x\"))]\n        impl BitManip for $ty {\n            #[inline]\n            fn ctpop(self) -> Self {\n                let y: $uty = self.cast();\n                unsafe { $ctpop(y).cast() }\n            }\n\n            #[inline]\n            fn ctlz(self) -> Self {\n                let y: $uty = self.cast();\n                // the ctxx intrinsics need compile-time constant\n                // `is_zero_undef`\n                unsafe { $ctlz(y, false).cast() }\n            }\n\n            #[inline]\n            fn cttz(self) -> Self {\n                let y: $uty = self.cast();\n                unsafe { $cttz(y, false).cast() }\n            }\n        }\n    };\n    (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => {\n        #[cfg(target_arch = \"s390x\")]\n        impl_bit_manip! 
{ scalar: $ty, $scalar }\n        #[cfg(not(target_arch = \"s390x\"))]\n        impl BitManip for $ty {\n            #[inline]\n            fn ctpop(self) -> Self {\n                let y: $uty = self.cast();\n                $uty::ctpop(y).cast()\n            }\n\n            #[inline]\n            fn ctlz(self) -> Self {\n                let y: $uty = self.cast();\n                $uty::ctlz(y).cast()\n            }\n\n            #[inline]\n            fn cttz(self) -> Self {\n                let y: $uty = self.cast();\n                $uty::cttz(y).cast()\n            }\n        }\n    };\n    (scalar: $ty:ident, $scalar:ty) => {\n        impl BitManip for $ty {\n            #[inline]\n            fn ctpop(self) -> Self {\n                let mut ones = self;\n                for i in 0..Self::lanes() {\n                    ones = ones.replace(i, self.extract(i).count_ones() as $scalar);\n                }\n                ones\n            }\n\n            #[inline]\n            fn ctlz(self) -> Self {\n                let mut lz = self;\n                for i in 0..Self::lanes() {\n                    lz = lz.replace(i, self.extract(i).leading_zeros() as $scalar);\n                }\n                lz\n            }\n\n            #[inline]\n            fn cttz(self) -> Self {\n                let mut tz = self;\n                for i in 0..Self::lanes() {\n                    tz = tz.replace(i, self.extract(i).trailing_zeros() as $scalar);\n                }\n                tz\n            }\n        }\n    };\n    ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty,\n     $ctpop:ident, $ctlz:ident, $cttz:ident) => {\n        impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz }\n        impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz }\n    };\n    (sized: $usize:ident, $uscalar:ty, $isize:ident,\n     $iscalar:ty, $ty:ident) => {\n        impl_bit_manip! { sized_inner: $usize, $uscalar, $ty }\n        impl_bit_manip! 
{ sized_inner: $isize, $iscalar, $ty }\n    };\n}\n\nimpl_bit_manip! { u8x2   ,   u8, i8x2, i8,   ctpop_u8x2,   ctlz_u8x2,   cttz_u8x2   }\nimpl_bit_manip! { u8x4   ,   u8, i8x4, i8,   ctpop_u8x4,   ctlz_u8x4,   cttz_u8x4   }\n#[cfg(not(target_arch = \"aarch64\"))] // see below\nimpl_bit_manip! { u8x8   ,   u8, i8x8, i8,   ctpop_u8x8,   ctlz_u8x8,   cttz_u8x8   }\nimpl_bit_manip! { u8x16  ,  u8, i8x16, i8,  ctpop_u8x16,  ctlz_u8x16,  cttz_u8x16  }\nimpl_bit_manip! { u8x32  ,  u8, i8x32, i8,  ctpop_u8x32,  ctlz_u8x32,  cttz_u8x32  }\nimpl_bit_manip! { u8x64  ,  u8, i8x64, i8,  ctpop_u8x64,  ctlz_u8x64,  cttz_u8x64  }\nimpl_bit_manip! { u16x2  ,  u16, i16x2, i16,  ctpop_u16x2,  ctlz_u16x2,  cttz_u16x2  }\nimpl_bit_manip! { u16x4  ,  u16, i16x4, i16,  ctpop_u16x4,  ctlz_u16x4,  cttz_u16x4  }\nimpl_bit_manip! { u16x8  ,  u16, i16x8, i16,  ctpop_u16x8,  ctlz_u16x8,  cttz_u16x8  }\nimpl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 }\nimpl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 }\nimpl_bit_manip! { u32x2  ,  u32, i32x2, i32,  ctpop_u32x2,  ctlz_u32x2,  cttz_u32x2  }\nimpl_bit_manip! { u32x4  ,  u32, i32x4, i32,  ctpop_u32x4,  ctlz_u32x4,  cttz_u32x4  }\nimpl_bit_manip! { u32x8  ,  u32, i32x8, i32,  ctpop_u32x8,  ctlz_u32x8,  cttz_u32x8  }\nimpl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 }\nimpl_bit_manip! { u64x2  ,  u64, i64x2, i64,  ctpop_u64x2,  ctlz_u64x2,  cttz_u64x2  }\nimpl_bit_manip! { u64x4  ,  u64, i64x4, i64,  ctpop_u64x4,  ctlz_u64x4,  cttz_u64x4  }\nimpl_bit_manip! { u64x8  ,  u64, i64x8, i64,  ctpop_u64x8,  ctlz_u64x8,  cttz_u64x8  }\nimpl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 }\nimpl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 }\nimpl_bit_manip! 
{ u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 }\n\n#[cfg(target_arch = \"aarch64\")]\nimpl BitManip for u8x8 {\n    #[inline]\n    fn ctpop(self) -> Self {\n        let y: u8x8 = self.cast();\n        unsafe { ctpop_u8x8(y).cast() }\n    }\n\n    #[inline]\n    fn ctlz(self) -> Self {\n        let y: u8x8 = self.cast();\n        unsafe { ctlz_u8x8(y, false).cast() }\n    }\n\n    #[inline]\n    fn cttz(self) -> Self {\n        // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191\n        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64\n        // intrinsics\n        let mut tz = self;\n        for i in 0..Self::lanes() {\n            tz = tz.replace(i, self.extract(i).trailing_zeros() as u8);\n        }\n        tz\n    }\n}\n#[cfg(target_arch = \"aarch64\")]\nimpl BitManip for i8x8 {\n    #[inline]\n    fn ctpop(self) -> Self {\n        let y: u8x8 = self.cast();\n        unsafe { ctpop_u8x8(y).cast() }\n    }\n\n    #[inline]\n    fn ctlz(self) -> Self {\n        let y: u8x8 = self.cast();\n        unsafe { ctlz_u8x8(y, false).cast() }\n    }\n\n    #[inline]\n    fn cttz(self) -> Self {\n        // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191\n        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64\n        // intrinsics\n        let mut tz = self;\n        for i in 0..Self::lanes() {\n            tz = tz.replace(i, self.extract(i).trailing_zeros() as i8);\n        }\n        tz\n    }\n}\n\ncfg_if! {\n    if #[cfg(target_pointer_width = \"8\")] {\n        impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 }\n        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 }\n        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 }\n    } else if #[cfg(target_pointer_width = \"16\")] {\n        impl_bit_manip! 
{ sized: usizex2, usize, isizex2, isize, u16x2 }\n        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 }\n        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 }\n    } else if #[cfg(target_pointer_width = \"32\")] {\n        impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 }\n        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 }\n        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 }\n    } else if #[cfg(target_pointer_width = \"64\")] {\n        impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 }\n        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 }\n        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 }\n    } else {\n        compile_error!(\"unsupported target_pointer_width\");\n    }\n}\n"
  },
  {
    "path": "src/codegen/llvm.rs",
    "content": "//! LLVM's platform intrinsics\n#![allow(dead_code)]\n\nuse crate::sealed::Shuffle;\n#[allow(unused_imports)] // FIXME: spurious warning?\nuse crate::sealed::Simd;\n\nextern \"platform-intrinsic\" {\n    fn simd_shuffle<T, I, U>(x: T, y: T, idx: I) -> U;\n}\n\n#[allow(clippy::missing_safety_doc)]\n#[inline]\npub unsafe fn __shuffle_vector2<const IDX: [u32; 2], T, U>(x: T, y: T) -> U\nwhere\n    T: Simd,\n    <T as Simd>::Element: Shuffle<[u32; 2], Output = U>,\n{\n    simd_shuffle(x, y, IDX)\n}\n\n#[allow(clippy::missing_safety_doc)]\n#[inline]\npub unsafe fn __shuffle_vector4<const IDX: [u32; 4], T, U>(x: T, y: T) -> U\nwhere\n    T: Simd,\n    <T as Simd>::Element: Shuffle<[u32; 4], Output = U>,\n{\n    simd_shuffle(x, y, IDX)\n}\n\n#[allow(clippy::missing_safety_doc)]\n#[inline]\npub unsafe fn __shuffle_vector8<const IDX: [u32; 8], T, U>(x: T, y: T) -> U\nwhere\n    T: Simd,\n    <T as Simd>::Element: Shuffle<[u32; 8], Output = U>,\n{\n    simd_shuffle(x, y, IDX)\n}\n\n#[allow(clippy::missing_safety_doc)]\n#[inline]\npub unsafe fn __shuffle_vector16<const IDX: [u32; 16], T, U>(x: T, y: T) -> U\nwhere\n    T: Simd,\n    <T as Simd>::Element: Shuffle<[u32; 16], Output = U>,\n{\n    simd_shuffle(x, y, IDX)\n}\n\n#[allow(clippy::missing_safety_doc)]\n#[inline]\npub unsafe fn __shuffle_vector32<const IDX: [u32; 32], T, U>(x: T, y: T) -> U\nwhere\n    T: Simd,\n    <T as Simd>::Element: Shuffle<[u32; 32], Output = U>,\n{\n    simd_shuffle(x, y, IDX)\n}\n\n#[allow(clippy::missing_safety_doc)]\n#[inline]\npub unsafe fn __shuffle_vector64<const IDX: [u32; 64], T, U>(x: T, y: T) -> U\nwhere\n    T: Simd,\n    <T as Simd>::Element: Shuffle<[u32; 64], Output = U>,\n{\n    simd_shuffle(x, y, IDX)\n}\n\nextern \"platform-intrinsic\" {\n    pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;\n    pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;\n    pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;\n    pub(crate) fn simd_le<T, U>(x: T, y: T) -> U;\n    pub(crate) fn 
simd_gt<T, U>(x: T, y: T) -> U;\n    pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;\n\n    pub(crate) fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;\n    pub(crate) fn simd_extract<T, U>(x: T, idx: u32) -> U;\n\n    pub(crate) fn simd_cast<T, U>(x: T) -> U;\n\n    pub(crate) fn simd_add<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_sub<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_mul<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_div<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_rem<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_shl<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_shr<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_and<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_or<T>(x: T, y: T) -> T;\n    pub(crate) fn simd_xor<T>(x: T, y: T) -> T;\n\n    pub(crate) fn simd_reduce_add_unordered<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_mul_unordered<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;\n    pub(crate) fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U;\n    pub(crate) fn simd_reduce_min<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_max<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_min_nanless<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_max_nanless<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_and<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_or<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_xor<T, U>(x: T) -> U;\n    pub(crate) fn simd_reduce_all<T>(x: T) -> bool;\n    pub(crate) fn simd_reduce_any<T>(x: T) -> bool;\n\n    pub(crate) fn simd_select<M, T>(m: M, a: T, b: T) -> T;\n\n    pub(crate) fn simd_fmin<T>(a: T, b: T) -> T;\n    pub(crate) fn simd_fmax<T>(a: T, b: T) -> T;\n\n    pub(crate) fn simd_fsqrt<T>(a: T) -> T;\n    pub(crate) fn simd_fma<T>(a: T, b: T, c: T) -> T;\n\n    pub(crate) fn simd_gather<T, P, M>(value: T, pointers: P, mask: M) -> T;\n    pub(crate) fn simd_scatter<T, P, M>(value: T, pointers: P, mask: M);\n\n    pub(crate) fn simd_bitmask<T, U>(value: T) -> U;\n}\n"
  },
  {
    "path": "src/codegen/math/float/abs.rs",
    "content": "//! Vertical floating-point `fabs`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors fabs\n\nuse crate::*;\n\npub(crate) trait Abs {\n    fn abs(self) -> Self;\n}\n\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.fabs.v2f32\"]\n    fn fabs_v2f32(x: f32x2) -> f32x2;\n    #[link_name = \"llvm.fabs.v4f32\"]\n    fn fabs_v4f32(x: f32x4) -> f32x4;\n    #[link_name = \"llvm.fabs.v8f32\"]\n    fn fabs_v8f32(x: f32x8) -> f32x8;\n    #[link_name = \"llvm.fabs.v16f32\"]\n    fn fabs_v16f32(x: f32x16) -> f32x16;\n    /* FIXME 64-bit fabsgle elem vectors\n    #[link_name = \"llvm.fabs.v1f64\"]\n    fn fabs_v1f64(x: f64x1) -> f64x1;\n     */\n    #[link_name = \"llvm.fabs.v2f64\"]\n    fn fabs_v2f64(x: f64x2) -> f64x2;\n    #[link_name = \"llvm.fabs.v4f64\"]\n    fn fabs_v4f64(x: f64x4) -> f64x4;\n    #[link_name = \"llvm.fabs.v8f64\"]\n    fn fabs_v8f64(x: f64x8) -> f64x8;\n\n    #[link_name = \"llvm.fabs.f32\"]\n    fn fabs_f32(x: f32) -> f32;\n    #[link_name = \"llvm.fabs.f64\"]\n    fn fabs_f64(x: f64) -> f64;\n}\n\ngen_unary_impl_table!(Abs, abs);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_unary!(f32x2[f32; 2]: fabs_f32);\n        impl_unary!(f32x4[f32; 4]: fabs_f32);\n        impl_unary!(f32x8[f32; 8]: fabs_f32);\n        impl_unary!(f32x16[f32; 16]: fabs_f32);\n\n        impl_unary!(f64x2[f64; 2]: fabs_f64);\n        impl_unary!(f64x4[f64; 4]: fabs_f64);\n        impl_unary!(f64x8[f64; 8]: fabs_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx2);\n\n                impl_unary!(f32x4: Sleef_fabsf4_avx2128);\n                impl_unary!(f32x8: Sleef_fabsf8_avx2);\n                impl_unary!(f64x2: Sleef_fabsd2_avx2128);\n                impl_unary!(f64x4: Sleef_fabsd4_avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx);\n\n                impl_unary!(f32x4: Sleef_fabsf4_sse4);\n                impl_unary!(f32x8: Sleef_fabsf8_avx);\n                impl_unary!(f64x2: Sleef_fabsd2_sse4);\n                impl_unary!(f64x4: Sleef_fabsd4_avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_fabsf4_sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_fabsd2_sse4);\n\n                impl_unary!(f32x4: Sleef_fabsf4_sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_fabsf4_sse4);\n                impl_unary!(f64x2: Sleef_fabsd2_sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_fabsd2_sse4);\n            } else {\n                impl_unary!(f32x2[f32; 2]: fabs_f32);\n                impl_unary!(f32x16: fabs_v16f32);\n                impl_unary!(f64x8: fabs_v8f64);\n\n                impl_unary!(f32x4: fabs_v4f32);\n                impl_unary!(f32x8: fabs_v8f32);\n                impl_unary!(f64x2: fabs_v2f64);\n                impl_unary!(f64x4: fabs_v4f64);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[f32; 2]: fabs_f32);\n        impl_unary!(f32x4: fabs_v4f32);\n 
       impl_unary!(f32x8: fabs_v8f32);\n        impl_unary!(f32x16: fabs_v16f32);\n\n        impl_unary!(f64x2: fabs_v2f64);\n        impl_unary!(f64x4: fabs_v4f64);\n        impl_unary!(f64x8: fabs_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/cos.rs",
    "content": "//! Vertical floating-point `cos`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vector cos\n\nuse crate::*;\n\npub(crate) trait Cos {\n    fn cos(self) -> Self;\n}\n\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.cos.v2f32\"]\n    fn cos_v2f32(x: f32x2) -> f32x2;\n    #[link_name = \"llvm.cos.v4f32\"]\n    fn cos_v4f32(x: f32x4) -> f32x4;\n    #[link_name = \"llvm.cos.v8f32\"]\n    fn cos_v8f32(x: f32x8) -> f32x8;\n    #[link_name = \"llvm.cos.v16f32\"]\n    fn cos_v16f32(x: f32x16) -> f32x16;\n    /* FIXME 64-bit cosgle elem vectors\n    #[link_name = \"llvm.cos.v1f64\"]\n    fn cos_v1f64(x: f64x1) -> f64x1;\n     */\n    #[link_name = \"llvm.cos.v2f64\"]\n    fn cos_v2f64(x: f64x2) -> f64x2;\n    #[link_name = \"llvm.cos.v4f64\"]\n    fn cos_v4f64(x: f64x4) -> f64x4;\n    #[link_name = \"llvm.cos.v8f64\"]\n    fn cos_v8f64(x: f64x8) -> f64x8;\n\n    #[link_name = \"llvm.cos.f32\"]\n    fn cos_f32(x: f32) -> f32;\n    #[link_name = \"llvm.cos.f64\"]\n    fn cos_f64(x: f64) -> f64;\n}\n\ngen_unary_impl_table!(Cos, cos);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_unary!(f32x2[f32; 2]: cos_f32);\n        impl_unary!(f32x4[f32; 4]: cos_f32);\n        impl_unary!(f32x8[f32; 8]: cos_f32);\n        impl_unary!(f32x16[f32; 16]: cos_f32);\n\n        impl_unary!(f64x2[f64; 2]: cos_f64);\n        impl_unary!(f64x4[f64; 4]: cos_f64);\n        impl_unary!(f64x8[f64; 8]: cos_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx2);\n\n                impl_unary!(f32x4: Sleef_cosf4_u10avx2128);\n                impl_unary!(f32x8: Sleef_cosf8_u10avx2);\n                impl_unary!(f64x2: Sleef_cosd2_u10avx2128);\n                impl_unary!(f64x4: Sleef_cosd4_u10avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx);\n\n                impl_unary!(f32x4: Sleef_cosf4_u10sse4);\n                impl_unary!(f32x8: Sleef_cosf8_u10avx);\n                impl_unary!(f64x2: Sleef_cosd2_u10sse4);\n                impl_unary!(f64x4: Sleef_cosd4_u10avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_cosf4_u10sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_cosd2_u10sse4);\n\n                impl_unary!(f32x4: Sleef_cosf4_u10sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_cosf4_u10sse4);\n                impl_unary!(f64x2: Sleef_cosd2_u10sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_cosd2_u10sse4);\n            } else {\n                impl_unary!(f32x2[f32; 2]: cos_f32);\n                impl_unary!(f32x16: cos_v16f32);\n                impl_unary!(f64x8: cos_v8f64);\n\n                impl_unary!(f32x4: cos_v4f32);\n                impl_unary!(f32x8: cos_v8f32);\n                impl_unary!(f64x2: cos_v2f64);\n                impl_unary!(f64x4: cos_v4f64);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[f32; 2]: cos_f32);\n        
impl_unary!(f32x4: cos_v4f32);\n        impl_unary!(f32x8: cos_v8f32);\n        impl_unary!(f32x16: cos_v16f32);\n\n        impl_unary!(f64x2: cos_v2f64);\n        impl_unary!(f64x4: cos_v4f64);\n        impl_unary!(f64x8: cos_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/cos_pi.rs",
    "content": "//! Vertical floating-point `cos`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors cos_pi\n\nuse crate::*;\n\npub(crate) trait CosPi {\n    fn cos_pi(self) -> Self;\n}\n\ngen_unary_impl_table!(CosPi, cos_pi);\n\nmacro_rules! impl_def {\n    ($vid:ident, $PI:path) => {\n        impl CosPi for $vid {\n            #[inline]\n            fn cos_pi(self) -> Self {\n                (self * Self::splat($PI)).cos()\n            }\n        }\n    };\n}\nmacro_rules! impl_def32 {\n    ($vid:ident) => {\n        impl_def!($vid, crate::f32::consts::PI);\n    };\n}\nmacro_rules! impl_def64 {\n    ($vid:ident) => {\n        impl_def!($vid, crate::f64::consts::PI);\n    };\n}\n\ncfg_if! {\n    if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! {\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx2);\n\n                impl_unary!(f32x4: Sleef_cospif4_u05avx2128);\n                impl_unary!(f32x8: Sleef_cospif8_u05avx2);\n                impl_unary!(f64x2: Sleef_cospid2_u05avx2128);\n                impl_unary!(f64x4: Sleef_cospid4_u05avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx);\n\n                impl_unary!(f32x4: Sleef_cospif4_u05sse4);\n                impl_unary!(f32x8: Sleef_cospif8_u05avx);\n                impl_unary!(f64x2: Sleef_cospid2_u05sse4);\n                impl_unary!(f64x4: Sleef_cospid4_u05avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4);\n           
     impl_unary!(f32x16[q => f32x4]: Sleef_cospif4_u05sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_cospid2_u05sse4);\n\n                impl_unary!(f32x4: Sleef_cospif4_u05sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_cospif4_u05sse4);\n                impl_unary!(f64x2: Sleef_cospid2_u05sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_cospid2_u05sse4);\n            } else {\n                impl_def32!(f32x2);\n                impl_def32!(f32x4);\n                impl_def32!(f32x8);\n                impl_def32!(f32x16);\n\n                impl_def64!(f64x2);\n                impl_def64!(f64x4);\n                impl_def64!(f64x8);\n            }\n        }\n    } else {\n        impl_def32!(f32x2);\n        impl_def32!(f32x4);\n        impl_def32!(f32x8);\n        impl_def32!(f32x16);\n\n        impl_def64!(f64x2);\n        impl_def64!(f64x4);\n        impl_def64!(f64x8);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/exp.rs",
    "content": "//! Vertical floating-point `exp`\n#![allow(unused)]\n\n// FIXME 64-bit expgle elem vectors misexpg\n\nuse crate::*;\n\npub(crate) trait Exp {\n    fn exp(self) -> Self;\n}\n\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.exp.v2f32\"]\n    fn exp_v2f32(x: f32x2) -> f32x2;\n    #[link_name = \"llvm.exp.v4f32\"]\n    fn exp_v4f32(x: f32x4) -> f32x4;\n    #[link_name = \"llvm.exp.v8f32\"]\n    fn exp_v8f32(x: f32x8) -> f32x8;\n    #[link_name = \"llvm.exp.v16f32\"]\n    fn exp_v16f32(x: f32x16) -> f32x16;\n    /* FIXME 64-bit expgle elem vectors\n    #[link_name = \"llvm.exp.v1f64\"]\n    fn exp_v1f64(x: f64x1) -> f64x1;\n     */\n    #[link_name = \"llvm.exp.v2f64\"]\n    fn exp_v2f64(x: f64x2) -> f64x2;\n    #[link_name = \"llvm.exp.v4f64\"]\n    fn exp_v4f64(x: f64x4) -> f64x4;\n    #[link_name = \"llvm.exp.v8f64\"]\n    fn exp_v8f64(x: f64x8) -> f64x8;\n\n    #[link_name = \"llvm.exp.f32\"]\n    fn exp_f32(x: f32) -> f32;\n    #[link_name = \"llvm.exp.f64\"]\n    fn exp_f64(x: f64) -> f64;\n}\n\ngen_unary_impl_table!(Exp, exp);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_unary!(f32x2[f32; 2]: exp_f32);\n        impl_unary!(f32x4[f32; 4]: exp_f32);\n        impl_unary!(f32x8[f32; 8]: exp_f32);\n        impl_unary!(f32x16[f32; 16]: exp_f32);\n\n        impl_unary!(f64x2[f64; 2]: exp_f64);\n        impl_unary!(f64x4[f64; 4]: exp_f64);\n        impl_unary!(f64x8[f64; 8]: exp_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx2);\n\n                impl_unary!(f32x4: Sleef_expf4_u10avx2128);\n                impl_unary!(f32x8: Sleef_expf8_u10avx2);\n                impl_unary!(f64x2: Sleef_expd2_u10avx2128);\n                impl_unary!(f64x4: Sleef_expd4_u10avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx);\n\n                impl_unary!(f32x4: Sleef_expf4_u10sse4);\n                impl_unary!(f32x8: Sleef_expf8_u10avx);\n                impl_unary!(f64x2: Sleef_expd2_u10sse4);\n                impl_unary!(f64x4: Sleef_expd4_u10avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse4);\n\n                impl_unary!(f32x4: Sleef_expf4_u10sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse4);\n                impl_unary!(f64x2: Sleef_expd2_u10sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse4);\n            } else if #[cfg(target_feature = \"sse2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse2);\n                impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse2);\n                impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse2);\n\n                impl_unary!(f32x4: Sleef_expf4_u10sse2);\n                impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse2);\n                impl_unary!(f64x2: Sleef_expd2_u10sse2);\n    
            impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse2);\n            } else {\n                impl_unary!(f32x2[f32; 2]: exp_f32);\n                impl_unary!(f32x16: exp_v16f32);\n                impl_unary!(f64x8: exp_v8f64);\n\n                impl_unary!(f32x4: exp_v4f32);\n                impl_unary!(f32x8: exp_v8f32);\n                impl_unary!(f64x2: exp_v2f64);\n                impl_unary!(f64x4: exp_v4f64);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[f32; 2]: exp_f32);\n        impl_unary!(f32x4: exp_v4f32);\n        impl_unary!(f32x8: exp_v8f32);\n        impl_unary!(f32x16: exp_v16f32);\n\n        impl_unary!(f64x2: exp_v2f64);\n        impl_unary!(f64x4: exp_v4f64);\n        impl_unary!(f64x8: exp_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/ln.rs",
    "content": "//! Vertical floating-point `ln`\n#![allow(unused)]\n\n// FIXME 64-bit lngle elem vectors mislng\n\nuse crate::*;\n\npub(crate) trait Ln {\n    fn ln(self) -> Self;\n}\n\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.log.v2f32\"]\n    fn ln_v2f32(x: f32x2) -> f32x2;\n    #[link_name = \"llvm.log.v4f32\"]\n    fn ln_v4f32(x: f32x4) -> f32x4;\n    #[link_name = \"llvm.log.v8f32\"]\n    fn ln_v8f32(x: f32x8) -> f32x8;\n    #[link_name = \"llvm.log.v16f32\"]\n    fn ln_v16f32(x: f32x16) -> f32x16;\n    /* FIXME 64-bit lngle elem vectors\n    #[link_name = \"llvm.log.v1f64\"]\n    fn ln_v1f64(x: f64x1) -> f64x1;\n     */\n    #[link_name = \"llvm.log.v2f64\"]\n    fn ln_v2f64(x: f64x2) -> f64x2;\n    #[link_name = \"llvm.log.v4f64\"]\n    fn ln_v4f64(x: f64x4) -> f64x4;\n    #[link_name = \"llvm.log.v8f64\"]\n    fn ln_v8f64(x: f64x8) -> f64x8;\n\n    #[link_name = \"llvm.log.f32\"]\n    fn ln_f32(x: f32) -> f32;\n    #[link_name = \"llvm.log.f64\"]\n    fn ln_f64(x: f64) -> f64;\n}\n\ngen_unary_impl_table!(Ln, ln);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_unary!(f32x2[f32; 2]: ln_f32);\n        impl_unary!(f32x4[f32; 4]: ln_f32);\n        impl_unary!(f32x8[f32; 8]: ln_f32);\n        impl_unary!(f32x16[f32; 16]: ln_f32);\n\n        impl_unary!(f64x2[f64; 2]: ln_f64);\n        impl_unary!(f64x4[f64; 4]: ln_f64);\n        impl_unary!(f64x8[f64; 8]: ln_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx2);\n\n                impl_unary!(f32x4: Sleef_logf4_u10avx2128);\n                impl_unary!(f32x8: Sleef_logf8_u10avx2);\n                impl_unary!(f64x2: Sleef_logd2_u10avx2128);\n                impl_unary!(f64x4: Sleef_logd4_u10avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx);\n\n                impl_unary!(f32x4: Sleef_logf4_u10sse4);\n                impl_unary!(f32x8: Sleef_logf8_u10avx);\n                impl_unary!(f64x2: Sleef_logd2_u10sse4);\n                impl_unary!(f64x4: Sleef_logd4_u10avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse4);\n\n                impl_unary!(f32x4: Sleef_logf4_u10sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse4);\n                impl_unary!(f64x2: Sleef_logd2_u10sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse4);\n            } else if #[cfg(target_feature = \"sse2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse2);\n                impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse2);\n                impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse2);\n\n                impl_unary!(f32x4: Sleef_logf4_u10sse2);\n                impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse2);\n                impl_unary!(f64x2: Sleef_logd2_u10sse2);\n    
            impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse2);\n            } else {\n                impl_unary!(f32x2[f32; 2]: ln_f32);\n                impl_unary!(f32x16: ln_v16f32);\n                impl_unary!(f64x8: ln_v8f64);\n\n                impl_unary!(f32x4: ln_v4f32);\n                impl_unary!(f32x8: ln_v8f32);\n                impl_unary!(f64x2: ln_v2f64);\n                impl_unary!(f64x4: ln_v4f64);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[f32; 2]: ln_f32);\n        impl_unary!(f32x4: ln_v4f32);\n        impl_unary!(f32x8: ln_v8f32);\n        impl_unary!(f32x16: ln_v16f32);\n\n        impl_unary!(f64x2: ln_v2f64);\n        impl_unary!(f64x4: ln_v4f64);\n        impl_unary!(f64x8: ln_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/macros.rs",
    "content": "//! Utility macros\n#![allow(unused)]\n\nmacro_rules! impl_unary_ {\n    // implementation mapping 1:1\n    (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    transmute($fun(transmute(self)))\n                }\n            }\n        }\n    };\n    // implementation mapping 1:1 for when `$fun` is a generic function\n    // like some of the fp math rustc intrinsics (e.g. `fn fun<T>(x: T) -> T`).\n    (gen | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    transmute($fun(self.0))\n                }\n            }\n        }\n    };\n    (scalar | $trait_id:ident, $trait_method:ident,\n     $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self) -> Self {\n                unsafe {\n                    union U {\n                        vec: $vec_id,\n                        scalars: [$sid; $scount],\n                    }\n                    let mut scalars = U { vec: self }.scalars;\n                    for i in &mut scalars {\n                        *i = $fun(*i);\n                    }\n                    U { scalars }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun twice on each of the vector halves:\n    (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vech_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self) -> Self {\n                unsafe {\n            
        use crate::mem::transmute;\n                    union U {\n                        vec: $vec_id,\n                        halves: [$vech_id; 2],\n                    }\n\n                    let mut halves = U { vec: self }.halves;\n\n                    *halves.get_unchecked_mut(0) = transmute($fun(transmute(*halves.get_unchecked(0))));\n                    *halves.get_unchecked_mut(1) = transmute($fun(transmute(*halves.get_unchecked(1))));\n\n                    U { halves }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun four times on each of the vector quarters:\n    (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vecq_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    union U {\n                        vec: $vec_id,\n                        quarters: [$vecq_id; 4],\n                    }\n\n                    let mut quarters = U { vec: self }.quarters;\n\n                    *quarters.get_unchecked_mut(0) = transmute($fun(transmute(*quarters.get_unchecked(0))));\n                    *quarters.get_unchecked_mut(1) = transmute($fun(transmute(*quarters.get_unchecked(1))));\n                    *quarters.get_unchecked_mut(2) = transmute($fun(transmute(*quarters.get_unchecked(2))));\n                    *quarters.get_unchecked_mut(3) = transmute($fun(transmute(*quarters.get_unchecked(3))));\n\n                    U { quarters }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun once on a vector twice as large:\n    (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vect_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self) -> Self {\n                unsafe {\n                    use 
crate::mem::{transmute, uninitialized};\n\n                    union U {\n                        vec: [$vec_id; 2],\n                        twice: $vect_id,\n                    }\n\n                    let twice = U { vec: [self, uninitialized()] }.twice;\n                    let twice = transmute($fun(transmute(twice)));\n\n                    *(U { twice }.vec.get_unchecked(0))\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! gen_unary_impl_table {\n    ($trait_id:ident, $trait_method:ident) => {\n        macro_rules! impl_unary {\n            ($vid:ident: $fun:ident) => {\n                impl_unary_!(vec | $trait_id, $trait_method, $vid, $fun);\n            };\n            ($vid:ident[g]: $fun:ident) => {\n                impl_unary_!(gen | $trait_id, $trait_method, $vid, $fun);\n            };\n            ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => {\n                impl_unary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun);\n            };\n            ($vid:ident[s]: $fun:ident) => {\n                impl_unary_!(scalar | $trait_id, $trait_method, $vid, $fun);\n            };\n            ($vid:ident[h => $vid_h:ident]: $fun:ident) => {\n                impl_unary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun);\n            };\n            ($vid:ident[q => $vid_q:ident]: $fun:ident) => {\n                impl_unary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun);\n            };\n            ($vid:ident[t => $vid_t:ident]: $fun:ident) => {\n                impl_unary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun);\n            };\n        }\n    };\n}\n\nmacro_rules! 
impl_tertiary_ {\n    // implementation mapping 1:1\n    (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self, z: Self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    transmute($fun(transmute(self), transmute(y), transmute(z)))\n                }\n            }\n        }\n    };\n    (scalar | $trait_id:ident, $trait_method:ident,\n     $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self, z: Self) -> Self {\n                unsafe {\n                    union U {\n                        vec: $vec_id,\n                        scalars: [$sid; $scount],\n                    }\n                    let mut x = U { vec: self }.scalars;\n                    let y = U { vec: y }.scalars;\n                    let z = U { vec: z }.scalars;\n                    for ((x, y), z) in x.iter_mut().zip(&y).zip(&z) {\n                        *x = $fun(*x, *y, *z);\n                    }\n                    U { scalars: x }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun twice on each of the vector halves:\n    (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vech_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self, z: Self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    union U {\n                        vec: $vec_id,\n                        halves: [$vech_id; 2],\n                    }\n\n                    let mut x_halves = U { vec: self }.halves;\n                    let y_halves = U { vec: y }.halves;\n                    let z_halves = U { vec: z }.halves;\n\n                
    *x_halves.get_unchecked_mut(0) = transmute($fun(\n                        transmute(*x_halves.get_unchecked(0)),\n                        transmute(*y_halves.get_unchecked(0)),\n                        transmute(*z_halves.get_unchecked(0)),\n                    ));\n                    *x_halves.get_unchecked_mut(1) = transmute($fun(\n                        transmute(*x_halves.get_unchecked(1)),\n                        transmute(*y_halves.get_unchecked(1)),\n                        transmute(*z_halves.get_unchecked(1)),\n                    ));\n\n                    U { halves: x_halves }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun four times on each of the vector quarters:\n    (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vecq_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self, z: Self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    union U {\n                        vec: $vec_id,\n                        quarters: [$vecq_id; 4],\n                    }\n\n                    let mut x_quarters = U { vec: self }.quarters;\n                    let y_quarters = U { vec: y }.quarters;\n                    let z_quarters = U { vec: z }.quarters;\n\n                    *x_quarters.get_unchecked_mut(0) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(0)),\n                        transmute(*y_quarters.get_unchecked(0)),\n                        transmute(*z_quarters.get_unchecked(0)),\n                    ));\n\n                    *x_quarters.get_unchecked_mut(1) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(1)),\n                        transmute(*y_quarters.get_unchecked(1)),\n                        transmute(*z_quarters.get_unchecked(1)),\n                    ));\n\n                    
*x_quarters.get_unchecked_mut(2) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(2)),\n                        transmute(*y_quarters.get_unchecked(2)),\n                        transmute(*z_quarters.get_unchecked(2)),\n                    ));\n\n                    *x_quarters.get_unchecked_mut(3) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(3)),\n                        transmute(*y_quarters.get_unchecked(3)),\n                        transmute(*z_quarters.get_unchecked(3)),\n                    ));\n\n                    U { quarters: x_quarters }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun once on a vector twice as large:\n    (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vect_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self, z: Self) -> Self {\n                unsafe {\n                    use crate::mem::{transmute, uninitialized};\n\n                    union U {\n                        vec: [$vec_id; 2],\n                        twice: $vect_id,\n                    }\n\n                    let x_twice = U { vec: [self, uninitialized()] }.twice;\n                    let y_twice = U { vec: [y, uninitialized()] }.twice;\n                    let z_twice = U { vec: [z, uninitialized()] }.twice;\n                    let twice: $vect_id =\n                        transmute($fun(transmute(x_twice), transmute(y_twice), transmute(z_twice)));\n\n                    *(U { twice }.vec.get_unchecked(0))\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! gen_tertiary_impl_table {\n    ($trait_id:ident, $trait_method:ident) => {\n        macro_rules! 
impl_tertiary {\n            ($vid:ident: $fun:ident) => {\n                impl_tertiary_!(vec | $trait_id, $trait_method, $vid, $fun);\n            };\n            ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => {\n                impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun);\n            };\n            ($vid:ident[s]: $fun:ident) => {\n                impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, $fun);\n            };\n            ($vid:ident[h => $vid_h:ident]: $fun:ident) => {\n                impl_tertiary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun);\n            };\n            ($vid:ident[q => $vid_q:ident]: $fun:ident) => {\n                impl_tertiary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun);\n            };\n            ($vid:ident[t => $vid_t:ident]: $fun:ident) => {\n                impl_tertiary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun);\n            };\n        }\n    };\n}\n\nmacro_rules! 
impl_binary_ {\n    // implementation mapping 1:1\n    (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    transmute($fun(transmute(self), transmute(y)))\n                }\n            }\n        }\n    };\n    (scalar | $trait_id:ident, $trait_method:ident,\n     $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self) -> Self {\n                unsafe {\n                    union U {\n                        vec: $vec_id,\n                        scalars: [$sid; $scount],\n                    }\n                    let mut x = U { vec: self }.scalars;\n                    let y = U { vec: y }.scalars;\n                    for (x, y) in x.iter_mut().zip(&y) {\n                        *x = $fun(*x, *y);\n                    }\n                    U { scalars: x }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun twice on each of the vector halves:\n    (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vech_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    union U {\n                        vec: $vec_id,\n                        halves: [$vech_id; 2],\n                    }\n\n                    let mut x_halves = U { vec: self }.halves;\n                    let y_halves = U { vec: y }.halves;\n\n                    *x_halves.get_unchecked_mut(0) = transmute($fun(\n                        transmute(*x_halves.get_unchecked(0)),\n                        
transmute(*y_halves.get_unchecked(0)),\n                    ));\n                    *x_halves.get_unchecked_mut(1) = transmute($fun(\n                        transmute(*x_halves.get_unchecked(1)),\n                        transmute(*y_halves.get_unchecked(1)),\n                    ));\n\n                    U { halves: x_halves }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun four times on each of the vector quarters:\n    (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vecq_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self) -> Self {\n                unsafe {\n                    use crate::mem::transmute;\n                    union U {\n                        vec: $vec_id,\n                        quarters: [$vecq_id; 4],\n                    }\n\n                    let mut x_quarters = U { vec: self }.quarters;\n                    let y_quarters = U { vec: y }.quarters;\n\n                    *x_quarters.get_unchecked_mut(0) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(0)),\n                        transmute(*y_quarters.get_unchecked(0)),\n                    ));\n\n                    *x_quarters.get_unchecked_mut(1) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(1)),\n                        transmute(*y_quarters.get_unchecked(1)),\n                    ));\n\n                    *x_quarters.get_unchecked_mut(2) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(2)),\n                        transmute(*y_quarters.get_unchecked(2)),\n                    ));\n\n                    *x_quarters.get_unchecked_mut(3) = transmute($fun(\n                        transmute(*x_quarters.get_unchecked(3)),\n                        transmute(*y_quarters.get_unchecked(3)),\n                    ));\n\n                    U { 
quarters: x_quarters }.vec\n                }\n            }\n        }\n    };\n    // implementation calling fun once on a vector twice as large:\n    (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident,\n     $vect_id:ident, $fun:ident) => {\n        impl $trait_id for $vec_id {\n            #[inline]\n            fn $trait_method(self, y: Self) -> Self {\n                unsafe {\n                    use crate::mem::{transmute, uninitialized};\n\n                    union U {\n                        vec: [$vec_id; 2],\n                        twice: $vect_id,\n                    }\n\n                    let x_twice = U { vec: [self, uninitialized()] }.twice;\n                    let y_twice = U { vec: [y, uninitialized()] }.twice;\n                    let twice: $vect_id = transmute($fun(transmute(x_twice), transmute(y_twice)));\n\n                    *(U { twice }.vec.get_unchecked(0))\n                }\n            }\n        }\n    };\n}\n\nmacro_rules! gen_binary_impl_table {\n    ($trait_id:ident, $trait_method:ident) => {\n        macro_rules! 
impl_binary {\n            ($vid:ident: $fun:ident) => {\n                impl_binary_!(vec | $trait_id, $trait_method, $vid, $fun);\n            };\n            ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => {\n                impl_binary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun);\n            };\n            ($vid:ident[s]: $fun:ident) => {\n                impl_binary_!(scalar | $trait_id, $trait_method, $vid, $fun);\n            };\n            ($vid:ident[h => $vid_h:ident]: $fun:ident) => {\n                impl_binary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun);\n            };\n            ($vid:ident[q => $vid_q:ident]: $fun:ident) => {\n                impl_binary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun);\n            };\n            ($vid:ident[t => $vid_t:ident]: $fun:ident) => {\n                impl_binary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun);\n            };\n        }\n    };\n}\n"
  },
  {
    "path": "src/codegen/math/float/mul_add.rs",
    "content": "//! Vertical floating-point `mul_add`\n#![allow(unused)]\nuse crate::*;\n\n// FIXME: 64-bit 1 element mul_add\n\npub(crate) trait MulAdd {\n    fn mul_add(self, y: Self, z: Self) -> Self;\n}\n\n#[cfg(not(target_arch = \"s390x\"))]\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.fma.v2f32\"]\n    fn fma_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2;\n    #[link_name = \"llvm.fma.v4f32\"]\n    fn fma_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4;\n    #[link_name = \"llvm.fma.v8f32\"]\n    fn fma_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8;\n    #[link_name = \"llvm.fma.v16f32\"]\n    fn fma_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16;\n    /* FIXME 64-bit single elem vectors\n    #[link_name = \"llvm.fma.v1f64\"]\n    fn fma_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1;\n    */\n    #[link_name = \"llvm.fma.v2f64\"]\n    fn fma_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2;\n    #[link_name = \"llvm.fma.v4f64\"]\n    fn fma_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4;\n    #[link_name = \"llvm.fma.v8f64\"]\n    fn fma_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8;\n}\n\ngen_tertiary_impl_table!(MulAdd, mul_add);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        macro_rules! impl_broken {\n            ($id:ident) => {\n                impl MulAdd for $id {\n                    #[inline]\n                    fn mul_add(self, y: Self, z: Self) -> Self {\n                        self * y + z\n                    }\n                }\n            };\n        }\n\n        impl_broken!(f32x2);\n        impl_broken!(f32x4);\n        impl_broken!(f32x8);\n        impl_broken!(f32x16);\n\n        impl_broken!(f64x2);\n        impl_broken!(f64x4);\n        impl_broken!(f64x8);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128);\n                impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2);\n                impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx2);\n\n                impl_tertiary!(f32x4: Sleef_fmaf4_avx2128);\n                impl_tertiary!(f32x8: Sleef_fmaf8_avx2);\n                impl_tertiary!(f64x2: Sleef_fmad2_avx2128);\n                impl_tertiary!(f64x4: Sleef_fmad4_avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4);\n                impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx);\n                impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx);\n\n                impl_tertiary!(f32x4: Sleef_fmaf4_sse4);\n                impl_tertiary!(f32x8: Sleef_fmaf8_avx);\n                impl_tertiary!(f64x2: Sleef_fmad2_sse4);\n                impl_tertiary!(f64x4: Sleef_fmad4_avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4);\n                impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4);\n                impl_tertiary!(f64x8[q => f64x2]: Sleef_fmad2_sse4);\n\n                impl_tertiary!(f32x4: Sleef_fmaf4_sse4);\n                impl_tertiary!(f32x8[h => f32x4]: Sleef_fmaf4_sse4);\n                impl_tertiary!(f64x2: Sleef_fmad2_sse4);\n                impl_tertiary!(f64x4[h => f64x2]: Sleef_fmad2_sse4);\n            } else {\n                impl_tertiary!(f32x2: fma_v2f32);\n                impl_tertiary!(f32x16: fma_v16f32);\n                impl_tertiary!(f64x8: fma_v8f64);\n\n                impl_tertiary!(f32x4: fma_v4f32);\n                impl_tertiary!(f32x8: fma_v8f32);\n                impl_tertiary!(f64x2: fma_v2f64);\n                impl_tertiary!(f64x4: fma_v4f64);\n            }\n        }\n    } else {\n        impl_tertiary!(f32x2: 
fma_v2f32);\n        impl_tertiary!(f32x4: fma_v4f32);\n        impl_tertiary!(f32x8: fma_v8f32);\n        impl_tertiary!(f32x16: fma_v16f32);\n        // impl_tertiary!(f64x1: fma_v1f64); // FIXME 64-bit single elem vectors\n        impl_tertiary!(f64x2: fma_v2f64);\n        impl_tertiary!(f64x4: fma_v4f64);\n        impl_tertiary!(f64x8: fma_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/mul_adde.rs",
    "content": "//! Approximation for floating-point `mul_add`\nuse crate::*;\n\n// FIXME: 64-bit 1 element mul_adde\n\npub(crate) trait MulAddE {\n    fn mul_adde(self, y: Self, z: Self) -> Self;\n}\n\n#[cfg(not(target_arch = \"s390x\"))]\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.fmuladd.v2f32\"]\n    fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2;\n    #[link_name = \"llvm.fmuladd.v4f32\"]\n    fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4;\n    #[link_name = \"llvm.fmuladd.v8f32\"]\n    fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8;\n    #[link_name = \"llvm.fmuladd.v16f32\"]\n    fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16;\n    /* FIXME 64-bit single elem vectors\n    #[link_name = \"llvm.fmuladd.v1f64\"]\n    fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1;\n    */\n    #[link_name = \"llvm.fmuladd.v2f64\"]\n    fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2;\n    #[link_name = \"llvm.fmuladd.v4f64\"]\n    fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4;\n    #[link_name = \"llvm.fmuladd.v8f64\"]\n    fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8;\n}\n\nmacro_rules! 
impl_mul_adde {\n    ($id:ident : $fn:ident) => {\n        impl MulAddE for $id {\n            #[inline]\n            fn mul_adde(self, y: Self, z: Self) -> Self {\n                #[cfg(not(target_arch = \"s390x\"))]\n                {\n                    use crate::mem::transmute;\n                    unsafe { transmute($fn(transmute(self), transmute(y), transmute(z))) }\n                }\n                #[cfg(target_arch = \"s390x\")]\n                {\n                    // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n                    self * y + z\n                }\n            }\n        }\n    };\n}\n\nimpl_mul_adde!(f32x2: fmuladd_v2f32);\nimpl_mul_adde!(f32x4: fmuladd_v4f32);\nimpl_mul_adde!(f32x8: fmuladd_v8f32);\nimpl_mul_adde!(f32x16: fmuladd_v16f32);\n// impl_mul_adde!(f64x1: fmuladd_v1f64); // FIXME 64-bit single elem vectors\nimpl_mul_adde!(f64x2: fmuladd_v2f64);\nimpl_mul_adde!(f64x4: fmuladd_v4f64);\nimpl_mul_adde!(f64x8: fmuladd_v8f64);\n"
  },
  {
    "path": "src/codegen/math/float/powf.rs",
    "content": "//! Vertical floating-point `powf`\n#![allow(unused)]\n\n// FIXME 64-bit powfgle elem vectors mispowfg\n\nuse crate::*;\n\npub(crate) trait Powf {\n    fn powf(self, x: Self) -> Self;\n}\n\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.pow.v2f32\"]\n    fn powf_v2f32(x: f32x2, y: f32x2) -> f32x2;\n    #[link_name = \"llvm.pow.v4f32\"]\n    fn powf_v4f32(x: f32x4, y: f32x4) -> f32x4;\n    #[link_name = \"llvm.pow.v8f32\"]\n    fn powf_v8f32(x: f32x8, y: f32x8) -> f32x8;\n    #[link_name = \"llvm.pow.v16f32\"]\n    fn powf_v16f32(x: f32x16, y: f32x16) -> f32x16;\n    /* FIXME 64-bit powfgle elem vectors\n    #[link_name = \"llvm.pow.v1f64\"]\n    fn powf_v1f64(x: f64x1, y: f64x1) -> f64x1;\n     */\n    #[link_name = \"llvm.pow.v2f64\"]\n    fn powf_v2f64(x: f64x2, y: f64x2) -> f64x2;\n    #[link_name = \"llvm.pow.v4f64\"]\n    fn powf_v4f64(x: f64x4, y: f64x4) -> f64x4;\n    #[link_name = \"llvm.pow.v8f64\"]\n    fn powf_v8f64(x: f64x8, y: f64x8) -> f64x8;\n\n    #[link_name = \"llvm.pow.f32\"]\n    fn powf_f32(x: f32, y: f32) -> f32;\n    #[link_name = \"llvm.pow.f64\"]\n    fn powf_f64(x: f64, y: f64) -> f64;\n}\n\ngen_binary_impl_table!(Powf, powf);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_binary!(f32x2[f32; 2]: powf_f32);\n        impl_binary!(f32x4[f32; 4]: powf_f32);\n        impl_binary!(f32x8[f32; 8]: powf_f32);\n        impl_binary!(f32x16[f32; 16]: powf_f32);\n\n        impl_binary!(f64x2[f64; 2]: powf_f64);\n        impl_binary!(f64x4[f64; 4]: powf_f64);\n        impl_binary!(f64x8[f64; 8]: powf_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10avx2128);\n                impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx2);\n                impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx2);\n\n                impl_binary!(f32x4: Sleef_powf4_u10avx2128);\n                impl_binary!(f32x8: Sleef_powf8_u10avx2);\n                impl_binary!(f64x2: Sleef_powd2_u10avx2128);\n                impl_binary!(f64x4: Sleef_powd4_u10avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4);\n                impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx);\n                impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx);\n\n                impl_binary!(f32x4: Sleef_powf4_u10sse4);\n                impl_binary!(f32x8: Sleef_powf8_u10avx);\n                impl_binary!(f64x2: Sleef_powd2_u10sse4);\n                impl_binary!(f64x4: Sleef_powd4_u10avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4);\n                impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse4);\n                impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse4);\n\n                impl_binary!(f32x4: Sleef_powf4_u10sse4);\n                impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse4);\n                impl_binary!(f64x2: Sleef_powd2_u10sse4);\n                impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse4);\n            } else if #[cfg(target_feature = \"sse2\")] {\n                impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse2);\n                impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse2);\n                impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse2);\n\n                impl_binary!(f32x4: Sleef_powf4_u10sse2);\n                impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse2);\n                impl_binary!(f64x2: 
Sleef_powd2_u10sse2);\n                impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse2);\n            } else {\n                impl_binary!(f32x2[f32; 2]: powf_f32);\n                impl_binary!(f32x4: powf_v4f32);\n                impl_binary!(f32x8: powf_v8f32);\n                impl_binary!(f32x16: powf_v16f32);\n\n                impl_binary!(f64x2: powf_v2f64);\n                impl_binary!(f64x4: powf_v4f64);\n                impl_binary!(f64x8: powf_v8f64);\n            }\n        }\n    } else {\n        impl_binary!(f32x2[f32; 2]: powf_f32);\n        impl_binary!(f32x4: powf_v4f32);\n        impl_binary!(f32x8: powf_v8f32);\n        impl_binary!(f32x16: powf_v16f32);\n\n        impl_binary!(f64x2: powf_v2f64);\n        impl_binary!(f64x4: powf_v4f64);\n        impl_binary!(f64x8: powf_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/sin.rs",
    "content": "//! Vertical floating-point `sin`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors sin\n\nuse crate::*;\n\npub(crate) trait Sin {\n    fn sin(self) -> Self;\n}\n\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.sin.v2f32\"]\n    fn sin_v2f32(x: f32x2) -> f32x2;\n    #[link_name = \"llvm.sin.v4f32\"]\n    fn sin_v4f32(x: f32x4) -> f32x4;\n    #[link_name = \"llvm.sin.v8f32\"]\n    fn sin_v8f32(x: f32x8) -> f32x8;\n    #[link_name = \"llvm.sin.v16f32\"]\n    fn sin_v16f32(x: f32x16) -> f32x16;\n    /* FIXME 64-bit single elem vectors\n    #[link_name = \"llvm.sin.v1f64\"]\n    fn sin_v1f64(x: f64x1) -> f64x1;\n     */\n    #[link_name = \"llvm.sin.v2f64\"]\n    fn sin_v2f64(x: f64x2) -> f64x2;\n    #[link_name = \"llvm.sin.v4f64\"]\n    fn sin_v4f64(x: f64x4) -> f64x4;\n    #[link_name = \"llvm.sin.v8f64\"]\n    fn sin_v8f64(x: f64x8) -> f64x8;\n\n    #[link_name = \"llvm.sin.f32\"]\n    fn sin_f32(x: f32) -> f32;\n    #[link_name = \"llvm.sin.f64\"]\n    fn sin_f64(x: f64) -> f64;\n}\n\ngen_unary_impl_table!(Sin, sin);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_unary!(f32x2[f32; 2]: sin_f32);\n        impl_unary!(f32x4[f32; 4]: sin_f32);\n        impl_unary!(f32x8[f32; 8]: sin_f32);\n        impl_unary!(f32x16[f32; 16]: sin_f32);\n\n        impl_unary!(f64x2[f64; 2]: sin_f64);\n        impl_unary!(f64x4[f64; 4]: sin_f64);\n        impl_unary!(f64x8[f64; 8]: sin_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx2);\n\n                impl_unary!(f32x4: Sleef_sinf4_u10avx2128);\n                impl_unary!(f32x8: Sleef_sinf8_u10avx2);\n                impl_unary!(f64x2: Sleef_sind2_u10avx2128);\n                impl_unary!(f64x4: Sleef_sind4_u10avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx);\n\n                impl_unary!(f32x4: Sleef_sinf4_u10sse4);\n                impl_unary!(f32x8: Sleef_sinf8_u10avx);\n                impl_unary!(f64x2: Sleef_sind2_u10sse4);\n                impl_unary!(f64x4: Sleef_sind4_u10avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_sinf4_u10sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_sind2_u10sse4);\n\n                impl_unary!(f32x4: Sleef_sinf4_u10sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_sinf4_u10sse4);\n                impl_unary!(f64x2: Sleef_sind2_u10sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_sind2_u10sse4);\n            } else {\n                impl_unary!(f32x2[f32; 2]: sin_f32);\n                impl_unary!(f32x16: sin_v16f32);\n                impl_unary!(f64x8: sin_v8f64);\n\n                impl_unary!(f32x4: sin_v4f32);\n                impl_unary!(f32x8: sin_v8f32);\n                impl_unary!(f64x2: sin_v2f64);\n                impl_unary!(f64x4: sin_v4f64);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[f32; 2]: sin_f32);\n        
impl_unary!(f32x4: sin_v4f32);\n        impl_unary!(f32x8: sin_v8f32);\n        impl_unary!(f32x16: sin_v16f32);\n\n        impl_unary!(f64x2: sin_v2f64);\n        impl_unary!(f64x4: sin_v4f64);\n        impl_unary!(f64x8: sin_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/sin_cos_pi.rs",
    "content": "//! Vertical floating-point `sin_cos`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors sin_cos\n\nuse crate::*;\n\npub(crate) trait SinCosPi: Sized {\n    type Output;\n    fn sin_cos_pi(self) -> Self::Output;\n}\n\nmacro_rules! impl_def {\n    ($vid:ident, $PI:path) => {\n        impl SinCosPi for $vid {\n            type Output = (Self, Self);\n            #[inline]\n            fn sin_cos_pi(self) -> Self::Output {\n                let v = self * Self::splat($PI);\n                (v.sin(), v.cos())\n            }\n        }\n    };\n}\n\nmacro_rules! impl_def32 {\n    ($vid:ident) => {\n        impl_def!($vid, crate::f32::consts::PI);\n    };\n}\nmacro_rules! impl_def64 {\n    ($vid:ident) => {\n        impl_def!($vid, crate::f64::consts::PI);\n    };\n}\n\nmacro_rules! impl_unary_t {\n    ($vid:ident: $fun:ident) => {\n        impl SinCosPi for $vid {\n            type Output = (Self, Self);\n            fn sin_cos_pi(self) -> Self::Output {\n                unsafe {\n                    use crate::mem::transmute;\n                    transmute($fun(transmute(self)))\n                }\n            }\n        }\n    };\n    ($vid:ident[t => $vid_t:ident]: $fun:ident) => {\n        impl SinCosPi for $vid {\n            type Output = (Self, Self);\n            fn sin_cos_pi(self) -> Self::Output {\n                unsafe {\n                    use crate::mem::{transmute, uninitialized};\n\n                    union U {\n                        vec: [$vid; 2],\n                        twice: $vid_t,\n                    }\n\n                    let twice = U { vec: [self, uninitialized()] }.twice;\n                    let twice = transmute($fun(transmute(twice)));\n\n                    union R {\n                        twice: ($vid_t, $vid_t),\n                        vecs: ([$vid; 2], [$vid; 2]),\n                    }\n                    let r = R { twice }.vecs;\n                    (*r.0.get_unchecked(0), *r.0.get_unchecked(1))\n        
        }\n            }\n        }\n    };\n    ($vid:ident[h => $vid_h:ident]: $fun:ident) => {\n        impl SinCosPi for $vid {\n            type Output = (Self, Self);\n            fn sin_cos_pi(self) -> Self::Output {\n                unsafe {\n                    use crate::mem::transmute;\n\n                    union U {\n                        vec: $vid,\n                        halves: [$vid_h; 2],\n                    }\n\n                    let halves = U { vec: self }.halves;\n\n                    let res_0: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(0))));\n                    let res_1: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(1))));\n\n                    union R {\n                        result: ($vid, $vid),\n                        halves: ([$vid_h; 2], [$vid_h; 2]),\n                    }\n                    R { halves: ([res_0.0, res_1.0], [res_0.1, res_1.1]) }.result\n                }\n            }\n        }\n    };\n    ($vid:ident[q => $vid_q:ident]: $fun:ident) => {\n        impl SinCosPi for $vid {\n            type Output = (Self, Self);\n            fn sin_cos_pi(self) -> Self::Output {\n                unsafe {\n                    use crate::mem::transmute;\n\n                    union U {\n                        vec: $vid,\n                        quarters: [$vid_q; 4],\n                    }\n\n                    let quarters = U { vec: self }.quarters;\n\n                    let res_0: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(0))));\n                    let res_1: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(1))));\n                    let res_2: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(2))));\n                    let res_3: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(3))));\n\n                    union R {\n                        result: ($vid, $vid),\n                       
 quarters: ([$vid_q; 4], [$vid_q; 4]),\n                    }\n                    R {\n                        quarters: (\n                            [res_0.0, res_1.0, res_2.0, res_3.0],\n                            [res_0.1, res_1.1, res_2.1, res_3.1],\n                        ),\n                    }\n                    .result\n                }\n            }\n        }\n    };\n}\n\ncfg_if! {\n    if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! {\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05avx2128);\n                impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx2);\n                impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx2);\n\n                impl_unary_t!(f32x4: Sleef_sincospif4_u05avx2128);\n                impl_unary_t!(f32x8: Sleef_sincospif8_u05avx2);\n                impl_unary_t!(f64x2: Sleef_sincospid2_u05avx2128);\n                impl_unary_t!(f64x4: Sleef_sincospid4_u05avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4);\n                impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx);\n                impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx);\n\n                impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4);\n                impl_unary_t!(f32x8: Sleef_sincospif8_u05avx);\n                impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4);\n                impl_unary_t!(f64x4: Sleef_sincospid4_u05avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4);\n                impl_unary_t!(f32x16[q => f32x4]: Sleef_sincospif4_u05sse4);\n                impl_unary_t!(f64x8[q => f64x2]: Sleef_sincospid2_u05sse4);\n\n                impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4);\n          
      impl_unary_t!(f32x8[h => f32x4]: Sleef_sincospif4_u05sse4);\n                impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4);\n                impl_unary_t!(f64x4[h => f64x2]: Sleef_sincospid2_u05sse4);\n            } else {\n                impl_def32!(f32x2);\n                impl_def32!(f32x4);\n                impl_def32!(f32x8);\n                impl_def32!(f32x16);\n\n                impl_def64!(f64x2);\n                impl_def64!(f64x4);\n                impl_def64!(f64x8);\n            }\n        }\n    } else {\n        impl_def32!(f32x2);\n        impl_def32!(f32x4);\n        impl_def32!(f32x8);\n        impl_def32!(f32x16);\n\n        impl_def64!(f64x2);\n        impl_def64!(f64x4);\n        impl_def64!(f64x8);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/sin_pi.rs",
    "content": "//! Vertical floating-point `sin_pi`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors sin_pi\n\nuse crate::*;\n\npub(crate) trait SinPi {\n    fn sin_pi(self) -> Self;\n}\n\ngen_unary_impl_table!(SinPi, sin_pi);\n\nmacro_rules! impl_def {\n    ($vid:ident, $PI:path) => {\n        impl SinPi for $vid {\n            #[inline]\n            fn sin_pi(self) -> Self {\n                (self * Self::splat($PI)).sin()\n            }\n        }\n    };\n}\nmacro_rules! impl_def32 {\n    ($vid:ident) => {\n        impl_def!($vid, crate::f32::consts::PI);\n    };\n}\nmacro_rules! impl_def64 {\n    ($vid:ident) => {\n        impl_def!($vid, crate::f64::consts::PI);\n    };\n}\n\ncfg_if! {\n    if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! {\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx2);\n\n                impl_unary!(f32x4: Sleef_sinpif4_u05avx2128);\n                impl_unary!(f32x8: Sleef_sinpif8_u05avx2);\n                impl_unary!(f64x2: Sleef_sinpid2_u05avx2128);\n                impl_unary!(f64x4: Sleef_sinpid4_u05avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx);\n\n                impl_unary!(f32x4: Sleef_sinpif4_u05sse4);\n                impl_unary!(f32x8: Sleef_sinpif8_u05avx);\n                impl_unary!(f64x2: Sleef_sinpid2_u05sse4);\n                impl_unary!(f64x4: Sleef_sinpid4_u05avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4);\n        
        impl_unary!(f32x16[q => f32x4]: Sleef_sinpif4_u05sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_sinpid2_u05sse4);\n\n                impl_unary!(f32x4: Sleef_sinpif4_u05sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_sinpif4_u05sse4);\n                impl_unary!(f64x2: Sleef_sinpid2_u05sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_sinpid2_u05sse4);\n            } else {\n                impl_def32!(f32x2);\n                impl_def32!(f32x4);\n                impl_def32!(f32x8);\n                impl_def32!(f32x16);\n\n                impl_def64!(f64x2);\n                impl_def64!(f64x4);\n                impl_def64!(f64x8);\n            }\n        }\n    } else {\n        impl_def32!(f32x2);\n        impl_def32!(f32x4);\n        impl_def32!(f32x8);\n        impl_def32!(f32x16);\n\n        impl_def64!(f64x2);\n        impl_def64!(f64x4);\n        impl_def64!(f64x8);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/sqrt.rs",
    "content": "//! Vertical floating-point `sqrt`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors sqrt\n\nuse crate::*;\n\npub(crate) trait Sqrt {\n    fn sqrt(self) -> Self;\n}\n\n#[allow(improper_ctypes)]\nextern \"C\" {\n    #[link_name = \"llvm.sqrt.v2f32\"]\n    fn sqrt_v2f32(x: f32x2) -> f32x2;\n    #[link_name = \"llvm.sqrt.v4f32\"]\n    fn sqrt_v4f32(x: f32x4) -> f32x4;\n    #[link_name = \"llvm.sqrt.v8f32\"]\n    fn sqrt_v8f32(x: f32x8) -> f32x8;\n    #[link_name = \"llvm.sqrt.v16f32\"]\n    fn sqrt_v16f32(x: f32x16) -> f32x16;\n    /* FIXME 64-bit sqrtgle elem vectors\n    #[link_name = \"llvm.sqrt.v1f64\"]\n    fn sqrt_v1f64(x: f64x1) -> f64x1;\n     */\n    #[link_name = \"llvm.sqrt.v2f64\"]\n    fn sqrt_v2f64(x: f64x2) -> f64x2;\n    #[link_name = \"llvm.sqrt.v4f64\"]\n    fn sqrt_v4f64(x: f64x4) -> f64x4;\n    #[link_name = \"llvm.sqrt.v8f64\"]\n    fn sqrt_v8f64(x: f64x8) -> f64x8;\n\n    #[link_name = \"llvm.sqrt.f32\"]\n    fn sqrt_f32(x: f32) -> f32;\n    #[link_name = \"llvm.sqrt.f64\"]\n    fn sqrt_f64(x: f64) -> f64;\n}\n\ngen_unary_impl_table!(Sqrt, sqrt);\n\ncfg_if! {\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_unary!(f32x2[f32; 2]: sqrt_f32);\n        impl_unary!(f32x4[f32; 4]: sqrt_f32);\n        impl_unary!(f32x8[f32; 8]: sqrt_f32);\n        impl_unary!(f32x16[f32; 16]: sqrt_f32);\n\n        impl_unary!(f64x2[f64; 2]: sqrt_f64);\n        impl_unary!(f64x4[f64; 4]: sqrt_f64);\n        impl_unary!(f64x8[f64; 8]: sqrt_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx2);\n\n                impl_unary!(f32x4: Sleef_sqrtf4_avx2128);\n                impl_unary!(f32x8: Sleef_sqrtf8_avx2);\n                impl_unary!(f64x2: Sleef_sqrtd2_avx2128);\n                impl_unary!(f64x4: Sleef_sqrtd4_avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx);\n\n                impl_unary!(f32x4: Sleef_sqrtf4_sse4);\n                impl_unary!(f32x8: Sleef_sqrtf8_avx);\n                impl_unary!(f64x2: Sleef_sqrtd2_sse4);\n                impl_unary!(f64x4: Sleef_sqrtd4_avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_sse4);\n\n                impl_unary!(f32x4: Sleef_sqrtf4_sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_sse4);\n                impl_unary!(f64x2: Sleef_sqrtd2_sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_sse4);\n            } else {\n                impl_unary!(f32x2[f32; 2]: sqrt_f32);\n                impl_unary!(f32x16: sqrt_v16f32);\n                impl_unary!(f64x8: sqrt_v8f64);\n\n                impl_unary!(f32x4: sqrt_v4f32);\n                impl_unary!(f32x8: sqrt_v8f32);\n                impl_unary!(f64x2: sqrt_v2f64);\n                impl_unary!(f64x4: sqrt_v4f64);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[f32; 2]: sqrt_f32);\n        impl_unary!(f32x4: sqrt_v4f32);\n 
       impl_unary!(f32x8: sqrt_v8f32);\n        impl_unary!(f32x16: sqrt_v16f32);\n\n        impl_unary!(f64x2: sqrt_v2f64);\n        impl_unary!(f64x4: sqrt_v4f64);\n        impl_unary!(f64x8: sqrt_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/sqrte.rs",
    "content": "//! Vertical floating-point `sqrt`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors sqrte\n\nuse crate::llvm::simd_fsqrt;\nuse crate::*;\n\npub(crate) trait Sqrte {\n    fn sqrte(self) -> Self;\n}\n\ngen_unary_impl_table!(Sqrte, sqrte);\n\ncfg_if! {\n    if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! {\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx2);\n\n                impl_unary!(f32x4: Sleef_sqrtf4_u35avx2128);\n                impl_unary!(f32x8: Sleef_sqrtf8_u35avx2);\n                impl_unary!(f64x2: Sleef_sqrtd2_u35avx2128);\n                impl_unary!(f64x4: Sleef_sqrtd4_u35avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx);\n\n                impl_unary!(f32x4: Sleef_sqrtf4_u35sse4);\n                impl_unary!(f32x8: Sleef_sqrtf8_u35avx);\n                impl_unary!(f64x2: Sleef_sqrtd2_u35sse4);\n                impl_unary!(f64x4: Sleef_sqrtd4_u35avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_u35sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_u35sse4);\n\n                impl_unary!(f32x4: Sleef_sqrtf4_u35sse4);\n                impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_u35sse4);\n                impl_unary!(f64x2: Sleef_sqrtd2_u35sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_u35sse4);\n            } else {\n                
impl_unary!(f32x2[g]: simd_fsqrt);\n                impl_unary!(f32x16[g]: simd_fsqrt);\n                impl_unary!(f64x8[g]: simd_fsqrt);\n\n                impl_unary!(f32x4[g]: simd_fsqrt);\n                impl_unary!(f32x8[g]: simd_fsqrt);\n                impl_unary!(f64x2[g]: simd_fsqrt);\n                impl_unary!(f64x4[g]: simd_fsqrt);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[g]: simd_fsqrt);\n        impl_unary!(f32x4[g]: simd_fsqrt);\n        impl_unary!(f32x8[g]: simd_fsqrt);\n        impl_unary!(f32x16[g]: simd_fsqrt);\n\n        impl_unary!(f64x2[g]: simd_fsqrt);\n        impl_unary!(f64x4[g]: simd_fsqrt);\n        impl_unary!(f64x8[g]: simd_fsqrt);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float/tanh.rs",
    "content": "//! Vertical floating-point `tanh`\n#![allow(unused)]\n\n// FIXME 64-bit 1 elem vectors tanh\n\n#[cfg(not(feature = \"std\"))]\nuse num_traits::Float;\n\nuse crate::*;\n\npub(crate) trait Tanh {\n    fn tanh(self) -> Self;\n}\n\nmacro_rules! define_tanh {\n    ($name:ident, $basetype:ty, $simdtype:ty, $lanes:expr, $trait:path) => {\n        fn $name(x: $simdtype) -> $simdtype {\n            use core::intrinsics::transmute;\n            let mut buf: [$basetype; $lanes] = unsafe { transmute(x) };\n            for elem in &mut buf {\n                *elem = <$basetype as $trait>::tanh(*elem);\n            }\n            unsafe { transmute(buf) }\n        }\n    };\n\n    (f32 => $name:ident, $type:ty, $lanes:expr) => {\n        define_tanh!($name, f32, $type, $lanes, Float);\n    };\n\n    (f64 => $name:ident, $type:ty, $lanes:expr) => {\n        define_tanh!($name, f64, $type, $lanes, Float);\n    };\n}\n\n// llvm does not seem to expose the hyperbolic versions of trigonometric\n// functions; we thus call the classical rust versions on all of them (which\n// stem from cmath).\ndefine_tanh!(f32 => tanh_v2f32, f32x2, 2);\ndefine_tanh!(f32 => tanh_v4f32, f32x4, 4);\ndefine_tanh!(f32 => tanh_v8f32, f32x8, 8);\ndefine_tanh!(f32 => tanh_v16f32, f32x16, 16);\n\ndefine_tanh!(f64 => tanh_v2f64, f64x2, 2);\ndefine_tanh!(f64 => tanh_v4f64, f64x4, 4);\ndefine_tanh!(f64 => tanh_v8f64, f64x8, 8);\n\nfn tanh_f32(x: f32) -> f32 {\n    Float::tanh(x)\n}\n\nfn tanh_f64(x: f64) -> f64 {\n    Float::tanh(x)\n}\n\ngen_unary_impl_table!(Tanh, tanh);\n\ncfg_if! 
{\n    if #[cfg(target_arch = \"s390x\")] {\n        // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14\n        impl_unary!(f32x2[f32; 2]: tanh_f32);\n        impl_unary!(f32x4[f32; 4]: tanh_f32);\n        impl_unary!(f32x8[f32; 8]: tanh_f32);\n        impl_unary!(f32x16[f32; 16]: tanh_f32);\n\n        impl_unary!(f64x2[f64; 2]: tanh_f64);\n        impl_unary!(f64x4[f64; 4]: tanh_f64);\n        impl_unary!(f64x8[f64; 8]: tanh_f64);\n    } else if #[cfg(all(target_arch = \"x86_64\", feature = \"sleef-sys\"))] {\n        use sleef_sys::*;\n        cfg_if! {\n            if #[cfg(target_feature = \"avx2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10avx2128);\n                impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx2);\n                impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx2);\n\n                impl_unary!(f32x4: Sleef_tanhf4_u10avx2128);\n                impl_unary!(f32x8: Sleef_tanhf8_u10avx2);\n                impl_unary!(f64x2: Sleef_tanhd2_u10avx2128);\n                impl_unary!(f64x4: Sleef_tanhd4_u10avx2);\n            } else if #[cfg(target_feature = \"avx\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4);\n                impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx);\n                impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx);\n\n                impl_unary!(f32x4: Sleef_tanhf4_u10sse4);\n                impl_unary!(f32x8: Sleef_tanhf8_u10avx);\n                impl_unary!(f64x2: Sleef_tanhd2_u10sse4);\n                impl_unary!(f64x4: Sleef_tanhd4_u10avx);\n            } else if #[cfg(target_feature = \"sse4.2\")] {\n                impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4);\n                impl_unary!(f32x16[q => f32x4]: Sleef_tanhf4_u10sse4);\n                impl_unary!(f64x8[q => f64x2]: Sleef_tanhd2_u10sse4);\n\n                impl_unary!(f32x4: Sleef_tanhf4_u10sse4);\n                impl_unary!(f32x8[h => f32x4]: 
Sleef_tanhf4_u10sse4);\n                impl_unary!(f64x2: Sleef_tanhd2_u10sse4);\n                impl_unary!(f64x4[h => f64x2]: Sleef_tanhd2_u10sse4);\n            } else {\n                impl_unary!(f32x2[f32; 2]: tanh_f32);\n                impl_unary!(f32x16: tanh_v16f32);\n                impl_unary!(f64x8: tanh_v8f64);\n\n                impl_unary!(f32x4: tanh_v4f32);\n                impl_unary!(f32x8: tanh_v8f32);\n                impl_unary!(f64x2: tanh_v2f64);\n                impl_unary!(f64x4: tanh_v4f64);\n            }\n        }\n    } else {\n        impl_unary!(f32x2[f32; 2]: tanh_f32);\n        impl_unary!(f32x4: tanh_v4f32);\n        impl_unary!(f32x8: tanh_v8f32);\n        impl_unary!(f32x16: tanh_v16f32);\n\n        impl_unary!(f64x2: tanh_v2f64);\n        impl_unary!(f64x4: tanh_v4f64);\n        impl_unary!(f64x8: tanh_v8f64);\n    }\n}\n"
  },
  {
    "path": "src/codegen/math/float.rs",
    "content": "//! Vertical floating-point math operations.\n#![allow(clippy::useless_transmute)]\n\n#[macro_use]\npub(crate) mod macros;\npub(crate) mod abs;\npub(crate) mod cos;\npub(crate) mod cos_pi;\npub(crate) mod exp;\npub(crate) mod ln;\npub(crate) mod mul_add;\npub(crate) mod mul_adde;\npub(crate) mod powf;\npub(crate) mod sin;\npub(crate) mod sin_cos_pi;\npub(crate) mod sin_pi;\npub(crate) mod sqrt;\npub(crate) mod sqrte;\npub(crate) mod tanh;\n"
  },
  {
    "path": "src/codegen/math.rs",
    "content": "//! Vertical math operations\n\npub(crate) mod float;\n"
  },
  {
    "path": "src/codegen/pointer_sized_int.rs",
    "content": "//! Provides `isize` and `usize`\n\nuse cfg_if::cfg_if;\n\ncfg_if! {\n    if #[cfg(target_pointer_width = \"8\")] {\n        pub(crate) type isize_ = i8;\n        pub(crate) type usize_ = u8;\n    } else if #[cfg(target_pointer_width = \"16\")] {\n        pub(crate) type isize_ = i16;\n        pub(crate) type usize_ = u16;\n    } else if #[cfg(target_pointer_width = \"32\")] {\n        pub(crate) type isize_ = i32;\n        pub(crate) type usize_ = u32;\n\n    } else if #[cfg(target_pointer_width = \"64\")] {\n        pub(crate) type isize_ = i64;\n        pub(crate) type usize_ = u64;\n    } else if #[cfg(target_pointer_width = \"64\")] {\n        pub(crate) type isize_ = i64;\n        pub(crate) type usize_ = u64;\n    } else if #[cfg(target_pointer_width = \"128\")] {\n        pub(crate) type isize_ = i128;\n        pub(crate) type usize_ = u128;\n    } else {\n        compile_error!(\"unsupported target_pointer_width\");\n    }\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/aarch64.rs",
    "content": "//! Mask reductions implementation for `aarch64` targets\n\n/// 128-bit wide vectors\nmacro_rules! aarch64_128_neon_impl {\n    ($id:ident, $vmin:ident, $vmax:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"neon\")]\n            unsafe fn all(self) -> bool {\n                use crate::arch::aarch64::$vmin;\n                $vmin(crate::mem::transmute(self)) != 0\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"neon\")]\n            unsafe fn any(self) -> bool {\n                use crate::arch::aarch64::$vmax;\n                $vmax(crate::mem::transmute(self)) != 0\n            }\n        }\n    };\n}\n\n/// 64-bit wide vectors\nmacro_rules! aarch64_64_neon_impl {\n    ($id:ident, $vec128:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"neon\")]\n            unsafe fn all(self) -> bool {\n                // Duplicates the 64-bit vector into a 128-bit one and\n                // calls all on that.\n                union U {\n                    halves: ($id, $id),\n                    vec: $vec128,\n                }\n                U { halves: (self, self) }.vec.all()\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"neon\")]\n            unsafe fn any(self) -> bool {\n                union U {\n                    halves: ($id, $id),\n                    vec: $vec128,\n                }\n                U { halves: (self, self) }.vec.any()\n            }\n        }\n    };\n}\n\n/// Mask reduction implementation for `aarch64` targets\nmacro_rules! 
impl_mask_reductions {\n    // 64-bit wide masks\n    (m8x8) => {\n        aarch64_64_neon_impl!(m8x8, m8x16);\n    };\n    (m16x4) => {\n        aarch64_64_neon_impl!(m16x4, m16x8);\n    };\n    (m32x2) => {\n        aarch64_64_neon_impl!(m32x2, m32x4);\n    };\n    // 128-bit wide masks\n    (m8x16) => {\n        aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8);\n    };\n    (m16x8) => {\n        aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16);\n    };\n    (m32x4) => {\n        aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32);\n    };\n    // Fallback to LLVM's default code-generation:\n    ($id:ident) => {\n        fallback_impl!($id);\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/arm.rs",
    "content": "//! Mask reductions implementation for `arm` targets\n\n/// Implementation for ARM + v7 + NEON for 64-bit or 128-bit wide vectors with\n/// more than two elements.\nmacro_rules! arm_128_v7_neon_impl {\n    ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"v7,neon\")]\n            unsafe fn all(self) -> bool {\n                use crate::arch::arm::$vpmin;\n                use crate::mem::transmute;\n                union U {\n                    halves: ($half, $half),\n                    vec: $id,\n                }\n                let halves = U { vec: self }.halves;\n                let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));\n                h.all()\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"v7,neon\")]\n            unsafe fn any(self) -> bool {\n                use crate::arch::arm::$vpmax;\n                use crate::mem::transmute;\n                union U {\n                    halves: ($half, $half),\n                    vec: $id,\n                }\n                let halves = U { vec: self }.halves;\n                let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));\n                h.any()\n            }\n        }\n    };\n}\n\n/// Mask reduction implementation for `arm` targets\nmacro_rules! impl_mask_reductions {\n    // 128-bit wide masks\n    (m8x16) => {\n        arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8);\n    };\n    (m16x8) => {\n        arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16);\n    };\n    (m32x4) => {\n        arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32);\n    };\n    // Fallback to LLVM's default code-generation:\n    ($id:ident) => {\n        fallback_impl!($id);\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/fallback.rs",
    "content": "//! Default mask reduction implementations.\n\n/// Default mask reduction implementation\nmacro_rules! impl_mask_reductions {\n    ($id:ident) => {\n        fallback_impl!($id);\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/fallback_impl.rs",
    "content": "//! Default implementation of a mask reduction for any target.\n\nmacro_rules! fallback_to_other_impl {\n    ($id:ident, $other:ident) => {\n        impl All for $id {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                let m: $other = crate::mem::transmute(self);\n                m.all()\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                let m: $other = crate::mem::transmute(self);\n                m.any()\n            }\n        }\n    };\n}\n\n/// Fallback implementation.\nmacro_rules! fallback_impl {\n    // 16-bit wide masks:\n    (m8x2) => {\n        impl All for m8x2 {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                let i: u16 = crate::mem::transmute(self);\n                i == u16::max_value()\n            }\n        }\n        impl Any for m8x2 {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                let i: u16 = crate::mem::transmute(self);\n                i != 0\n            }\n        }\n    };\n    // 32-bit wide masks\n    (m8x4) => {\n        impl All for m8x4 {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                let i: u32 = crate::mem::transmute(self);\n                i == u32::max_value()\n            }\n        }\n        impl Any for m8x4 {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                let i: u32 = crate::mem::transmute(self);\n                i != 0\n            }\n        }\n    };\n    (m16x2) => {\n        fallback_to_other_impl!(m16x2, m8x4);\n    };\n    // 64-bit wide masks:\n    (m8x8) => {\n        impl All for m8x8 {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                let i: u64 = crate::mem::transmute(self);\n                i == u64::max_value()\n            }\n        }\n        impl Any for m8x8 {\n            #[inline]\n            unsafe 
fn any(self) -> bool {\n                let i: u64 = crate::mem::transmute(self);\n                i != 0\n            }\n        }\n    };\n    (m16x4) => {\n        fallback_to_other_impl!(m16x4, m8x8);\n    };\n    (m32x2) => {\n        fallback_to_other_impl!(m32x2, m16x4);\n    };\n    // FIXME: 64x1 mask\n    // 128-bit wide masks:\n    (m8x16) => {\n        impl All for m8x16 {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                let i: u128 = crate::mem::transmute(self);\n                i == u128::max_value()\n            }\n        }\n        impl Any for m8x16 {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                let i: u128 = crate::mem::transmute(self);\n                i != 0\n            }\n        }\n    };\n    (m16x8) => {\n        fallback_to_other_impl!(m16x8, m8x16);\n    };\n    (m32x4) => {\n        fallback_to_other_impl!(m32x4, m16x8);\n    };\n    (m64x2) => {\n        fallback_to_other_impl!(m64x2, m32x4);\n    };\n    (m128x1) => {\n        fallback_to_other_impl!(m128x1, m64x2);\n    };\n    // 256-bit wide masks\n    (m8x32) => {\n        impl All for m8x32 {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                let i: [u128; 2] = crate::mem::transmute(self);\n                let o: [u128; 2] = [u128::max_value(); 2];\n                i == o\n            }\n        }\n        impl Any for m8x32 {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                let i: [u128; 2] = crate::mem::transmute(self);\n                let o: [u128; 2] = [0; 2];\n                i != o\n            }\n        }\n    };\n    (m16x16) => {\n        fallback_to_other_impl!(m16x16, m8x32);\n    };\n    (m32x8) => {\n        fallback_to_other_impl!(m32x8, m16x16);\n    };\n    (m64x4) => {\n        fallback_to_other_impl!(m64x4, m32x8);\n    };\n    (m128x2) => {\n        fallback_to_other_impl!(m128x2, m64x4);\n    };\n    // 512-bit wide 
masks\n    (m8x64) => {\n        impl All for m8x64 {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                let i: [u128; 4] = crate::mem::transmute(self);\n                let o: [u128; 4] = [u128::max_value(); 4];\n                i == o\n            }\n        }\n        impl Any for m8x64 {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                let i: [u128; 4] = crate::mem::transmute(self);\n                let o: [u128; 4] = [0; 4];\n                i != o\n            }\n        }\n    };\n    (m16x32) => {\n        fallback_to_other_impl!(m16x32, m8x64);\n    };\n    (m32x16) => {\n        fallback_to_other_impl!(m32x16, m16x32);\n    };\n    (m64x8) => {\n        fallback_to_other_impl!(m64x8, m32x16);\n    };\n    (m128x4) => {\n        fallback_to_other_impl!(m128x4, m64x8);\n    };\n    // Masks with pointer-sized elements\n    (msizex2) => {\n        cfg_if! {\n            if #[cfg(target_pointer_width = \"64\")] {\n                fallback_to_other_impl!(msizex2, m64x2);\n            } else if #[cfg(target_pointer_width = \"32\")] {\n                fallback_to_other_impl!(msizex2, m32x2);\n            } else {\n                compile_error!(\"unsupported target_pointer_width\");\n            }\n        }\n    };\n    (msizex4) => {\n        cfg_if! {\n            if #[cfg(target_pointer_width = \"64\")] {\n                fallback_to_other_impl!(msizex4, m64x4);\n            } else if #[cfg(target_pointer_width = \"32\")] {\n                fallback_to_other_impl!(msizex4, m32x4);\n            } else {\n                compile_error!(\"unsupported target_pointer_width\");\n            }\n        }\n    };\n    (msizex8) => {\n        cfg_if! 
{\n            if #[cfg(target_pointer_width = \"64\")] {\n                fallback_to_other_impl!(msizex8, m64x8);\n            } else if #[cfg(target_pointer_width = \"32\")] {\n                fallback_to_other_impl!(msizex8, m32x8);\n            } else {\n                compile_error!(\"unsupported target_pointer_width\");\n            }\n        }\n    };\n}\n\nmacro_rules! recurse_half {\n    ($vid:ident, $vid_h:ident) => {\n        impl All for $vid {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                union U {\n                    halves: ($vid_h, $vid_h),\n                    vec: $vid,\n                }\n                let halves = U { vec: self }.halves;\n                halves.0.all() && halves.1.all()\n            }\n        }\n        impl Any for $vid {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                union U {\n                    halves: ($vid_h, $vid_h),\n                    vec: $vid,\n                }\n                let halves = U { vec: self }.halves;\n                halves.0.any() || halves.1.any()\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/x86/avx.rs",
    "content": "//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX`\n\n/// `x86`/`x86_64` 256-bit `AVX` implementation\n/// FIXME: it might be faster here to do two `_mm_movmask_epi8`\n#[cfg(target_feature = \"avx\")]\nmacro_rules! x86_m8x32_avx_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"avx\")]\n            unsafe fn all(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_testc_si256;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_testc_si256;\n                _mm256_testc_si256(crate::mem::transmute(self), crate::mem::transmute($id::splat(true))) != 0\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"avx\")]\n            unsafe fn any(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_testz_si256;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_testz_si256;\n                _mm256_testz_si256(crate::mem::transmute(self), crate::mem::transmute(self)) == 0\n            }\n        }\n    };\n}\n\n/// `x86`/`x86_64` 256-bit m32x8 `AVX` implementation\nmacro_rules! x86_m32x8_avx_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn all(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_movemask_ps;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_movemask_ps;\n                // _mm256_movemask_ps(a) creates a 8bit mask containing the\n                // most significant bit of each lane of `a`. 
If all bits are\n                // set, then all 8 lanes of the mask are true.\n                _mm256_movemask_ps(crate::mem::transmute(self)) == 0b_1111_1111_i32\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn any(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_movemask_ps;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_movemask_ps;\n\n                _mm256_movemask_ps(crate::mem::transmute(self)) != 0\n            }\n        }\n    };\n}\n\n/// `x86`/`x86_64` 256-bit m64x4 `AVX` implementation\nmacro_rules! x86_m64x4_avx_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn all(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_movemask_pd;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_movemask_pd;\n                // _mm256_movemask_pd(a) creates a 4bit mask containing the\n                // most significant bit of each lane of `a`. If all bits are\n                // set, then all 4 lanes of the mask are true.\n                _mm256_movemask_pd(crate::mem::transmute(self)) == 0b_1111_i32\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn any(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_movemask_pd;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_movemask_pd;\n\n                _mm256_movemask_pd(crate::mem::transmute(self)) != 0\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/x86/avx2.rs",
    "content": "//! Mask reductions implementation for `x86` and `x86_64` targets with `AVX2`.\n#![allow(unused)]\n\n/// x86/x86_64 256-bit m8x32 AVX2 implementation\nmacro_rules! x86_m8x32_avx2_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"sse2\")]\n            unsafe fn all(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_movemask_epi8;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_movemask_epi8;\n                // _mm256_movemask_epi8(a) creates a 32bit mask containing the\n                // most significant bit of each byte of `a`. If all\n                // bits are set, then all 32 lanes of the mask are\n                // true.\n                _mm256_movemask_epi8(crate::mem::transmute(self)) == -1_i32\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"sse2\")]\n            unsafe fn any(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm256_movemask_epi8;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm256_movemask_epi8;\n\n                _mm256_movemask_epi8(crate::mem::transmute(self)) != 0\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/x86/sse.rs",
    "content": "//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE`.\n#![allow(unused)]\n\n/// `x86`/`x86_64` 128-bit `m32x4` `SSE` implementation\nmacro_rules! x86_m32x4_sse_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn all(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm_movemask_ps;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm_movemask_ps;\n                // _mm_movemask_ps(a) creates a 4bit mask containing the\n                // most significant bit of each lane of `a`. If all\n                // bits are set, then all 4 lanes of the mask are\n                // true.\n                _mm_movemask_ps(crate::mem::transmute(self)) == 0b_1111_i32\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn any(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm_movemask_ps;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm_movemask_ps;\n\n                _mm_movemask_ps(crate::mem::transmute(self)) != 0\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/x86/sse2.rs",
    "content": "//! Mask reductions implementation for `x86` and `x86_64` targets with `SSE2`.\n#![allow(unused)]\n\n/// `x86`/`x86_64` 128-bit m64x2 `SSE2` implementation\nmacro_rules! x86_m64x2_sse2_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn all(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm_movemask_pd;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm_movemask_pd;\n                // _mm_movemask_pd(a) creates a 2bit mask containing the\n                // most significant bit of each lane of `a`. If all\n                // bits are set, then all 2 lanes of the mask are\n                // true.\n                _mm_movemask_pd(crate::mem::transmute(self)) == 0b_11_i32\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"sse\")]\n            unsafe fn any(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm_movemask_pd;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm_movemask_pd;\n\n                _mm_movemask_pd(crate::mem::transmute(self)) != 0\n            }\n        }\n    };\n}\n\n/// `x86`/`x86_64` 128-bit m8x16 `SSE2` implementation\nmacro_rules! x86_m8x16_sse2_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            #[target_feature(enable = \"sse2\")]\n            unsafe fn all(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm_movemask_epi8;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm_movemask_epi8;\n                // _mm_movemask_epi8(a) creates a 16bit mask containing the\n                // most significant bit of each byte of `a`. 
If all\n                // bits are set, then all 16 lanes of the mask are\n                // true.\n                _mm_movemask_epi8(crate::mem::transmute(self)) == i32::from(u16::max_value())\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            #[target_feature(enable = \"sse2\")]\n            unsafe fn any(self) -> bool {\n                #[cfg(target_arch = \"x86\")]\n                use crate::arch::x86::_mm_movemask_epi8;\n                #[cfg(target_arch = \"x86_64\")]\n                use crate::arch::x86_64::_mm_movemask_epi8;\n\n                _mm_movemask_epi8(crate::mem::transmute(self)) != 0\n            }\n        }\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask/x86.rs",
    "content": "//! Mask reductions implementation for `x86` and `x86_64` targets\n\n#[cfg(target_feature = \"sse\")]\n#[macro_use]\nmod sse;\n\n#[cfg(target_feature = \"sse2\")]\n#[macro_use]\nmod sse2;\n\n#[cfg(target_feature = \"avx\")]\n#[macro_use]\nmod avx;\n\n#[cfg(target_feature = \"avx2\")]\n#[macro_use]\nmod avx2;\n\n/// x86 64-bit m8x8 implementation\nmacro_rules! x86_m8x8_impl {\n    ($id:ident) => {\n        fallback_impl!($id);\n    };\n}\n\n/// x86 128-bit m8x16 implementation\nmacro_rules! x86_m8x16_impl {\n    ($id:ident) => {\n        cfg_if! {\n            if #[cfg(target_feature = \"sse2\")] {\n                x86_m8x16_sse2_impl!($id);\n            } else {\n                fallback_impl!($id);\n            }\n        }\n    };\n}\n\n/// x86 128-bit m32x4 implementation\nmacro_rules! x86_m32x4_impl {\n    ($id:ident) => {\n        cfg_if! {\n            if #[cfg(target_feature = \"sse\")] {\n                x86_m32x4_sse_impl!($id);\n            } else {\n                fallback_impl!($id);\n            }\n        }\n    };\n}\n\n/// x86 128-bit m64x2 implementation\nmacro_rules! x86_m64x2_impl {\n    ($id:ident) => {\n        cfg_if! {\n            if #[cfg(target_feature = \"sse2\")] {\n                x86_m64x2_sse2_impl!($id);\n            } else if #[cfg(target_feature = \"sse\")] {\n                x86_m32x4_sse_impl!($id);\n            } else {\n                fallback_impl!($id);\n            }\n        }\n    };\n}\n\n/// x86 256-bit m8x32 implementation\nmacro_rules! x86_m8x32_impl {\n    ($id:ident, $half_id:ident) => {\n        cfg_if! 
{\n            if #[cfg(target_feature = \"avx2\")] {\n                x86_m8x32_avx2_impl!($id);\n            } else if #[cfg(target_feature = \"avx\")] {\n                x86_m8x32_avx_impl!($id);\n            } else if #[cfg(target_feature = \"sse2\")] {\n                recurse_half!($id, $half_id);\n            } else {\n                fallback_impl!($id);\n            }\n        }\n    };\n}\n\n/// x86 256-bit m32x8 implementation\nmacro_rules! x86_m32x8_impl {\n    ($id:ident, $half_id:ident) => {\n        cfg_if! {\n            if #[cfg(target_feature = \"avx\")] {\n                x86_m32x8_avx_impl!($id);\n            } else if #[cfg(target_feature = \"sse\")] {\n                recurse_half!($id, $half_id);\n            } else {\n                fallback_impl!($id);\n            }\n        }\n    };\n}\n\n/// x86 256-bit m64x4 implementation\nmacro_rules! x86_m64x4_impl {\n    ($id:ident, $half_id:ident) => {\n        cfg_if! {\n            if #[cfg(target_feature = \"avx\")] {\n                x86_m64x4_avx_impl!($id);\n            } else if #[cfg(target_feature = \"sse\")] {\n                recurse_half!($id, $half_id);\n            } else {\n                fallback_impl!($id);\n            }\n        }\n    };\n}\n\n/// Fallback implementation.\nmacro_rules! x86_intr_impl {\n    ($id:ident) => {\n        impl All for $id {\n            #[inline]\n            unsafe fn all(self) -> bool {\n                use crate::llvm::simd_reduce_all;\n                simd_reduce_all(self.0)\n            }\n        }\n        impl Any for $id {\n            #[inline]\n            unsafe fn any(self) -> bool {\n                use crate::llvm::simd_reduce_any;\n                simd_reduce_any(self.0)\n            }\n        }\n    };\n}\n\n/// Mask reduction implementation for `x86` and `x86_64` targets\nmacro_rules! 
impl_mask_reductions {\n    // 64-bit wide masks\n    (m8x8) => {\n        x86_m8x8_impl!(m8x8);\n    };\n    (m16x4) => {\n        x86_m8x8_impl!(m16x4);\n    };\n    (m32x2) => {\n        x86_m8x8_impl!(m32x2);\n    };\n    // 128-bit wide masks\n    (m8x16) => {\n        x86_m8x16_impl!(m8x16);\n    };\n    (m16x8) => {\n        x86_m8x16_impl!(m16x8);\n    };\n    (m32x4) => {\n        x86_m32x4_impl!(m32x4);\n    };\n    (m64x2) => {\n        x86_m64x2_impl!(m64x2);\n    };\n    (m128x1) => {\n        x86_intr_impl!(m128x1);\n    };\n    // 256-bit wide masks:\n    (m8x32) => {\n        x86_m8x32_impl!(m8x32, m8x16);\n    };\n    (m16x16) => {\n        x86_m8x32_impl!(m16x16, m16x8);\n    };\n    (m32x8) => {\n        x86_m32x8_impl!(m32x8, m32x4);\n    };\n    (m64x4) => {\n        x86_m64x4_impl!(m64x4, m64x2);\n    };\n    (m128x2) => {\n        x86_intr_impl!(m128x2);\n    };\n    (msizex2) => {\n        cfg_if! {\n            if #[cfg(target_pointer_width = \"64\")] {\n                fallback_to_other_impl!(msizex2, m64x2);\n            } else if #[cfg(target_pointer_width = \"32\")] {\n                fallback_to_other_impl!(msizex2, m32x2);\n            } else {\n                compile_error!(\"unsupported target_pointer_width\");\n            }\n        }\n    };\n    (msizex4) => {\n        cfg_if! {\n            if #[cfg(target_pointer_width = \"64\")] {\n                fallback_to_other_impl!(msizex4, m64x4);\n            } else if #[cfg(target_pointer_width = \"32\")] {\n                fallback_to_other_impl!(msizex4, m32x4);\n            } else {\n                compile_error!(\"unsupported target_pointer_width\");\n            }\n        }\n    };\n    (msizex8) => {\n        cfg_if! 
{\n            if #[cfg(target_pointer_width = \"64\")] {\n                fallback_to_other_impl!(msizex8, m64x8);\n            } else if #[cfg(target_pointer_width = \"32\")] {\n                fallback_to_other_impl!(msizex8, m32x8);\n            } else {\n                compile_error!(\"unsupported target_pointer_width\");\n            }\n        }\n    };\n\n    // Fallback to LLVM's default code-generation:\n    ($id:ident) => {\n        fallback_impl!($id);\n    };\n}\n"
  },
  {
    "path": "src/codegen/reductions/mask.rs",
    "content": "//! Code generation workaround for `all()` mask horizontal reduction.\n//!\n//! Works around [LLVM bug 36702].\n//!\n//! [LLVM bug 36702]: https://bugs.llvm.org/show_bug.cgi?id=36702\n#![allow(unused_macros)]\n\nuse crate::*;\n\npub(crate) trait All: crate::marker::Sized {\n    unsafe fn all(self) -> bool;\n}\n\npub(crate) trait Any: crate::marker::Sized {\n    unsafe fn any(self) -> bool;\n}\n\n#[macro_use]\nmod fallback_impl;\n\ncfg_if! {\n    if #[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))] {\n        #[macro_use]\n        mod x86;\n    } else if #[cfg(all(target_arch = \"arm\", target_feature = \"v7\",\n                        target_feature = \"neon\",\n                        any(feature = \"core_arch\", libcore_neon)))] {\n        #[macro_use]\n        mod arm;\n    } else if #[cfg(all(target_arch = \"aarch64\", target_feature = \"neon\"))] {\n        #[macro_use]\n        mod aarch64;\n    } else {\n        #[macro_use]\n        mod fallback;\n    }\n}\n\nimpl_mask_reductions!(m8x2);\nimpl_mask_reductions!(m8x4);\nimpl_mask_reductions!(m8x8);\nimpl_mask_reductions!(m8x16);\nimpl_mask_reductions!(m8x32);\nimpl_mask_reductions!(m8x64);\n\nimpl_mask_reductions!(m16x2);\nimpl_mask_reductions!(m16x4);\nimpl_mask_reductions!(m16x8);\nimpl_mask_reductions!(m16x16);\nimpl_mask_reductions!(m16x32);\n\nimpl_mask_reductions!(m32x2);\nimpl_mask_reductions!(m32x4);\nimpl_mask_reductions!(m32x8);\nimpl_mask_reductions!(m32x16);\n\n// FIXME: 64-bit single element vector\n// impl_mask_reductions!(m64x1);\nimpl_mask_reductions!(m64x2);\nimpl_mask_reductions!(m64x4);\nimpl_mask_reductions!(m64x8);\n\nimpl_mask_reductions!(m128x1);\nimpl_mask_reductions!(m128x2);\nimpl_mask_reductions!(m128x4);\n\nimpl_mask_reductions!(msizex2);\nimpl_mask_reductions!(msizex4);\nimpl_mask_reductions!(msizex8);\n"
  },
  {
    "path": "src/codegen/reductions.rs",
    "content": "pub(crate) mod mask;\n"
  },
  {
    "path": "src/codegen/shuffle.rs",
    "content": "//! Implementations of the `ShuffleResult` trait for the different numbers of\n//! lanes and vector element types.\n\nuse crate::masks::*;\nuse crate::sealed::{Seal, Shuffle};\n\nmacro_rules! impl_shuffle {\n    ($array:ty, $base:ty, $out:ty) => {\n        impl Seal<$array> for $base {}\n        impl Shuffle<$array> for $base {\n            type Output = $out;\n        }\n    };\n}\n\nimpl_shuffle! { [u32; 2], i8, crate::codegen::i8x2 }\nimpl_shuffle! { [u32; 4], i8, crate::codegen::i8x4 }\nimpl_shuffle! { [u32; 8], i8, crate::codegen::i8x8 }\nimpl_shuffle! { [u32; 16], i8, crate::codegen::i8x16 }\nimpl_shuffle! { [u32; 32], i8, crate::codegen::i8x32 }\nimpl_shuffle! { [u32; 64], i8, crate::codegen::i8x64 }\n\nimpl_shuffle! { [u32; 2], u8, crate::codegen::u8x2 }\nimpl_shuffle! { [u32; 4], u8, crate::codegen::u8x4 }\nimpl_shuffle! { [u32; 8], u8, crate::codegen::u8x8 }\nimpl_shuffle! { [u32; 16], u8, crate::codegen::u8x16 }\nimpl_shuffle! { [u32; 32], u8, crate::codegen::u8x32 }\nimpl_shuffle! { [u32; 64], u8, crate::codegen::u8x64 }\n\nimpl_shuffle! { [u32; 2], m8, crate::codegen::m8x2 }\nimpl_shuffle! { [u32; 4], m8, crate::codegen::m8x4 }\nimpl_shuffle! { [u32; 8], m8, crate::codegen::m8x8 }\nimpl_shuffle! { [u32; 16], m8, crate::codegen::m8x16 }\nimpl_shuffle! { [u32; 32], m8, crate::codegen::m8x32 }\nimpl_shuffle! { [u32; 64], m8, crate::codegen::m8x64 }\n\nimpl_shuffle! { [u32; 2], i16, crate::codegen::i16x2 }\nimpl_shuffle! { [u32; 4], i16, crate::codegen::i16x4 }\nimpl_shuffle! { [u32; 8], i16, crate::codegen::i16x8 }\nimpl_shuffle! { [u32; 16], i16, crate::codegen::i16x16 }\nimpl_shuffle! { [u32; 32], i16, crate::codegen::i16x32 }\n\nimpl_shuffle! { [u32; 2], u16, crate::codegen::u16x2 }\nimpl_shuffle! { [u32; 4], u16, crate::codegen::u16x4 }\nimpl_shuffle! { [u32; 8], u16, crate::codegen::u16x8 }\nimpl_shuffle! { [u32; 16], u16, crate::codegen::u16x16 }\nimpl_shuffle! { [u32; 32], u16, crate::codegen::u16x32 }\n\nimpl_shuffle! 
{ [u32; 2], m16, crate::codegen::m16x2 }\nimpl_shuffle! { [u32; 4], m16, crate::codegen::m16x4 }\nimpl_shuffle! { [u32; 8], m16, crate::codegen::m16x8 }\nimpl_shuffle! { [u32; 16], m16, crate::codegen::m16x16 }\n\nimpl_shuffle! { [u32; 2], i32, crate::codegen::i32x2 }\nimpl_shuffle! { [u32; 4], i32, crate::codegen::i32x4 }\nimpl_shuffle! { [u32; 8], i32, crate::codegen::i32x8 }\nimpl_shuffle! { [u32; 16], i32, crate::codegen::i32x16 }\n\nimpl_shuffle! { [u32; 2], u32, crate::codegen::u32x2 }\nimpl_shuffle! { [u32; 4], u32, crate::codegen::u32x4 }\nimpl_shuffle! { [u32; 8], u32, crate::codegen::u32x8 }\nimpl_shuffle! { [u32; 16], u32, crate::codegen::u32x16 }\n\nimpl_shuffle! { [u32; 2], f32, crate::codegen::f32x2 }\nimpl_shuffle! { [u32; 4], f32, crate::codegen::f32x4 }\nimpl_shuffle! { [u32; 8], f32, crate::codegen::f32x8 }\nimpl_shuffle! { [u32; 16], f32, crate::codegen::f32x16 }\n\nimpl_shuffle! { [u32; 2], m32, crate::codegen::m32x2 }\nimpl_shuffle! { [u32; 4], m32, crate::codegen::m32x4 }\nimpl_shuffle! { [u32; 8], m32, crate::codegen::m32x8 }\nimpl_shuffle! { [u32; 16], m32, crate::codegen::m32x16 }\n\n/* FIXME: 64-bit single element vector\nimpl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 }\n*/\nimpl_shuffle! { [u32; 2], i64, crate::codegen::i64x2 }\nimpl_shuffle! { [u32; 4], i64, crate::codegen::i64x4 }\nimpl_shuffle! { [u32; 8], i64, crate::codegen::i64x8 }\n\n/* FIXME: 64-bit single element vector\nimpl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 }\n*/\nimpl_shuffle! { [u32; 2], u64, crate::codegen::u64x2 }\nimpl_shuffle! { [u32; 4], u64, crate::codegen::u64x4 }\nimpl_shuffle! { [u32; 8], u64, crate::codegen::u64x8 }\n\n/* FIXME: 64-bit single element vector\nimpl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 }\n*/\nimpl_shuffle! { [u32; 2], f64, crate::codegen::f64x2 }\nimpl_shuffle! { [u32; 4], f64, crate::codegen::f64x4 }\nimpl_shuffle! { [u32; 8], f64, crate::codegen::f64x8 }\n\n/* FIXME: 64-bit single element vector\nimpl_shuffle! 
{ [u32; 1], i64, crate::codegen::i64x1 }\n*/\nimpl_shuffle! { [u32; 2], m64, crate::codegen::m64x2 }\nimpl_shuffle! { [u32; 4], m64, crate::codegen::m64x4 }\nimpl_shuffle! { [u32; 8], m64, crate::codegen::m64x8 }\n\nimpl_shuffle! { [u32; 2], isize, crate::codegen::isizex2 }\nimpl_shuffle! { [u32; 4], isize, crate::codegen::isizex4 }\nimpl_shuffle! { [u32; 8], isize, crate::codegen::isizex8 }\n\nimpl_shuffle! { [u32; 2], usize, crate::codegen::usizex2 }\nimpl_shuffle! { [u32; 4], usize, crate::codegen::usizex4 }\nimpl_shuffle! { [u32; 8], usize, crate::codegen::usizex8 }\n\nimpl_shuffle! { [u32; 2], msize, crate::codegen::msizex2 }\nimpl_shuffle! { [u32; 4], msize, crate::codegen::msizex4 }\nimpl_shuffle! { [u32; 8], msize, crate::codegen::msizex8 }\n\nimpl<T> Seal<[u32; 2]> for *const T {}\nimpl<T> Shuffle<[u32; 2]> for *const T {\n    type Output = crate::codegen::cptrx2<T>;\n}\nimpl<T> Seal<[u32; 4]> for *const T {}\nimpl<T> Shuffle<[u32; 4]> for *const T {\n    type Output = crate::codegen::cptrx4<T>;\n}\nimpl<T> Seal<[u32; 8]> for *const T {}\nimpl<T> Shuffle<[u32; 8]> for *const T {\n    type Output = crate::codegen::cptrx8<T>;\n}\n\nimpl<T> Seal<[u32; 2]> for *mut T {}\nimpl<T> Shuffle<[u32; 2]> for *mut T {\n    type Output = crate::codegen::mptrx2<T>;\n}\nimpl<T> Seal<[u32; 4]> for *mut T {}\nimpl<T> Shuffle<[u32; 4]> for *mut T {\n    type Output = crate::codegen::mptrx4<T>;\n}\nimpl<T> Seal<[u32; 8]> for *mut T {}\nimpl<T> Shuffle<[u32; 8]> for *mut T {\n    type Output = crate::codegen::mptrx8<T>;\n}\n\nimpl_shuffle! { [u32; 1], i128, crate::codegen::i128x1 }\nimpl_shuffle! { [u32; 2], i128, crate::codegen::i128x2 }\nimpl_shuffle! { [u32; 4], i128, crate::codegen::i128x4 }\n\nimpl_shuffle! { [u32; 1], u128, crate::codegen::u128x1 }\nimpl_shuffle! { [u32; 2], u128, crate::codegen::u128x2 }\nimpl_shuffle! { [u32; 4], u128, crate::codegen::u128x4 }\n\nimpl_shuffle! { [u32; 1], m128, crate::codegen::m128x1 }\nimpl_shuffle! 
{ [u32; 2], m128, crate::codegen::m128x2 }\nimpl_shuffle! { [u32; 4], m128, crate::codegen::m128x4 }\n"
  },
  {
    "path": "src/codegen/shuffle1_dyn.rs",
    "content": "//! Shuffle vector lanes with run-time indices.\n\nuse crate::*;\n\npub trait Shuffle1Dyn {\n    type Indices;\n    fn shuffle1_dyn(self, _: Self::Indices) -> Self;\n}\n\n// Fallback implementation\nmacro_rules! impl_fallback {\n    ($id:ident) => {\n        impl Shuffle1Dyn for $id {\n            type Indices = Self;\n            #[inline]\n            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                let mut result = Self::splat(0);\n                for i in 0..$id::lanes() {\n                    result = result.replace(i, self.extract(indices.extract(i) as usize));\n                }\n                result\n            }\n        }\n    };\n}\n\nmacro_rules! impl_shuffle1_dyn {\n    (u8x8) => {\n        cfg_if! {\n            if #[cfg(all(\n                any(\n                    all(target_arch = \"aarch64\", target_feature = \"neon\"),\n                    all(target_arch = \"doesnotexist\", target_feature = \"v7\",\n                        target_feature = \"neon\")\n                ),\n                any(feature = \"core_arch\", libcore_neon)\n            )\n            )] {\n                impl Shuffle1Dyn for u8x8 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        #[cfg(target_arch = \"aarch64\")]\n                        use crate::arch::aarch64::vtbl1_u8;\n                        #[cfg(target_arch = \"doesnotexist\")]\n                        use crate::arch::arm::vtbl1_u8;\n\n                        // This is safe because the binary is compiled with\n                        // neon enabled at compile-time and can therefore only\n                        // run on CPUs that have it enabled.\n                        unsafe {\n                            Simd(mem::transmute(\n                                vtbl1_u8(mem::transmute(self.0),\n                                        
crate::mem::transmute(indices.0))\n                            ))\n                        }\n                    }\n                }\n            } else {\n                impl_fallback!(u8x8);\n            }\n        }\n    };\n    (u8x16) => {\n        cfg_if! {\n            if #[cfg(all(any(target_arch = \"x86\", target_arch = \"x86_64\"),\n                         target_feature = \"ssse3\"))] {\n                impl Shuffle1Dyn for u8x16 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        #[cfg(target_arch = \"x86\")]\n                        use crate::arch::x86::_mm_shuffle_epi8;\n                        #[cfg(target_arch = \"x86_64\")]\n                        use crate::arch::x86_64::_mm_shuffle_epi8;\n                        // This is safe because the binary is compiled with\n                        // ssse3 enabled at compile-time and can therefore only\n                        // run on CPUs that have it enabled.\n                        unsafe {\n                            Simd(mem::transmute(\n                                _mm_shuffle_epi8(mem::transmute(self.0),\n                                                crate::mem::transmute(indices))\n                            ))\n                        }\n                    }\n                }\n            } else if #[cfg(all(target_arch = \"aarch64\", target_feature = \"neon\",\n                                any(feature = \"core_arch\", libcore_neon)))] {\n                impl Shuffle1Dyn for u8x16 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        use crate::arch::aarch64::vqtbl1q_u8;\n\n                        // This is safe because the binary is compiled with\n                        // neon enabled at compile-time and can therefore only\n      
                  // run on CPUs that have it enabled.\n                        unsafe {\n                            Simd(mem::transmute(\n                                vqtbl1q_u8(mem::transmute(self.0),\n                                          crate::mem::transmute(indices.0))\n                            ))\n                        }\n                    }\n                }\n            } else if #[cfg(all(target_arch = \"doesnotexist\", target_feature = \"v7\",\n                                target_feature = \"neon\",\n                                any(feature = \"core_arch\", libcore_neon)))] {\n                impl Shuffle1Dyn for u8x16 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        use crate::arch::arm::vtbl2_u8;\n\n                        // This is safe because the binary is compiled with\n                        // neon enabled at compile-time and can therefore only\n                        // run on CPUs that have it enabled.\n                        unsafe {\n                            union U {\n                                j: u8x16,\n                                s: (u8x8, u8x8),\n                            }\n\n                            let (i0, i1) = U { j: y }.s;\n\n                            let r0 = vtbl2_u8(\n                                mem::transmute(x),\n                                crate::mem::transmute(i0)\n                            );\n                            let r1 = vtbl2_u8(\n                                mem::transmute(x),\n                                crate::mem::transmute(i1)\n                            );\n\n                            let r = U { s: (r0, r1) }.j;\n\n                            Simd(mem::transmute(r))\n                        }\n                    }\n                }\n            } else {\n                impl_fallback!(u8x16);\n            }\n       
 }\n    };\n    (u16x8) => {\n        impl Shuffle1Dyn for u16x8 {\n            type Indices = Self;\n            #[inline]\n            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                let indices: u8x8 = (indices * 2).cast();\n                let indices: u8x16 = shuffle!(indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]);\n                let v = u8x16::new(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);\n                let indices = indices + v;\n                unsafe {\n                    let s: u8x16 = crate::mem::transmute(self);\n                    crate::mem::transmute(s.shuffle1_dyn(indices))\n                }\n            }\n        }\n    };\n    (u32x4) => {\n        cfg_if! {\n            if #[cfg(all(any(target_arch = \"x86\", target_arch = \"x86_64\"),\n                         target_feature = \"avx\"))] {\n                impl Shuffle1Dyn for u32x4 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        #[cfg(target_arch = \"x86\")]\n                        use crate::arch::x86::{_mm_permutevar_ps};\n                        #[cfg(target_arch = \"x86_64\")]\n                        use crate::arch::x86_64::{_mm_permutevar_ps};\n\n                        unsafe {\n                            crate::mem::transmute(\n                                _mm_permutevar_ps(\n                                    crate::mem::transmute(self.0),\n                                    crate::mem::transmute(indices.0)\n                                )\n                            )\n                        }\n                    }\n                }\n            } else {\n                impl Shuffle1Dyn for u32x4 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        let indices: u8x4 = 
(indices * 4).cast();\n                        let indices: u8x16 = shuffle!(\n                            indices,\n                            [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]\n                        );\n                        let v = u8x16::new(\n                            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3\n                        );\n                        let indices = indices + v;\n                        unsafe {\n                            let s: u8x16 =crate::mem::transmute(self);\n                           crate::mem::transmute(s.shuffle1_dyn(indices))\n                        }\n                    }\n                }\n            }\n        }\n    };\n    (u64x2) => {\n        cfg_if! {\n            if #[cfg(all(any(target_arch = \"x86\", target_arch = \"x86_64\"),\n                         target_feature = \"avx\"))] {\n                impl Shuffle1Dyn for u64x2 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        #[cfg(target_arch = \"x86\")]\n                        use crate::arch::x86::{_mm_permutevar_pd};\n                        #[cfg(target_arch = \"x86_64\")]\n                        use crate::arch::x86_64::{_mm_permutevar_pd};\n                        // _mm_permutevar_pd uses the _second_ bit of each\n                        // element to perform the selection, that is: 0b00 => 0,\n                        // 0b10 => 1:\n                        let indices = indices << 1;\n                        unsafe {\n                            crate::mem::transmute(\n                                _mm_permutevar_pd(\n                                    crate::mem::transmute(self),\n                                    crate::mem::transmute(indices)\n                                )\n                            )\n                        }\n                    }\n                }\n            } 
else {\n                impl Shuffle1Dyn for u64x2 {\n                    type Indices = Self;\n                    #[inline]\n                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                        let indices: u8x2 = (indices * 8).cast();\n                        let indices: u8x16 = shuffle!(\n                            indices,\n                            [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n                        );\n                        let v = u8x16::new(\n                            0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7\n                        );\n                        let indices = indices + v;\n                        unsafe {\n                            let s: u8x16 =crate::mem::transmute(self);\n                           crate::mem::transmute(s.shuffle1_dyn(indices))\n                        }\n                    }\n                }\n            }\n        }\n    };\n    (u128x1) => {\n        impl Shuffle1Dyn for u128x1 {\n            type Indices = Self;\n            #[inline]\n            fn shuffle1_dyn(self, _indices: Self::Indices) -> Self {\n                self\n            }\n        }\n    };\n    ($id:ident) => {\n        impl_fallback!($id);\n    };\n}\n\nimpl_shuffle1_dyn!(u8x2);\nimpl_shuffle1_dyn!(u8x4);\nimpl_shuffle1_dyn!(u8x8);\nimpl_shuffle1_dyn!(u8x16);\nimpl_shuffle1_dyn!(u8x32);\nimpl_shuffle1_dyn!(u8x64);\n\nimpl_shuffle1_dyn!(u16x2);\nimpl_shuffle1_dyn!(u16x4);\nimpl_shuffle1_dyn!(u16x8);\nimpl_shuffle1_dyn!(u16x16);\nimpl_shuffle1_dyn!(u16x32);\n\nimpl_shuffle1_dyn!(u32x2);\nimpl_shuffle1_dyn!(u32x4);\nimpl_shuffle1_dyn!(u32x8);\nimpl_shuffle1_dyn!(u32x16);\n\nimpl_shuffle1_dyn!(u64x2);\nimpl_shuffle1_dyn!(u64x4);\nimpl_shuffle1_dyn!(u64x8);\n\nimpl_shuffle1_dyn!(usizex2);\nimpl_shuffle1_dyn!(usizex4);\nimpl_shuffle1_dyn!(usizex8);\n\nimpl_shuffle1_dyn!(u128x1);\nimpl_shuffle1_dyn!(u128x2);\nimpl_shuffle1_dyn!(u128x4);\n\n// Implementation for non-unsigned vector 
types\nmacro_rules! impl_shuffle1_dyn_non_u {\n    ($id:ident, $uid:ident) => {\n        impl Shuffle1Dyn for $id {\n            type Indices = $uid;\n            #[inline]\n            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                unsafe {\n                    let u: $uid = crate::mem::transmute(self);\n                    crate::mem::transmute(u.shuffle1_dyn(indices))\n                }\n            }\n        }\n    };\n}\n\nimpl_shuffle1_dyn_non_u!(i8x2, u8x2);\nimpl_shuffle1_dyn_non_u!(i8x4, u8x4);\nimpl_shuffle1_dyn_non_u!(i8x8, u8x8);\nimpl_shuffle1_dyn_non_u!(i8x16, u8x16);\nimpl_shuffle1_dyn_non_u!(i8x32, u8x32);\nimpl_shuffle1_dyn_non_u!(i8x64, u8x64);\n\nimpl_shuffle1_dyn_non_u!(i16x2, u16x2);\nimpl_shuffle1_dyn_non_u!(i16x4, u16x4);\nimpl_shuffle1_dyn_non_u!(i16x8, u16x8);\nimpl_shuffle1_dyn_non_u!(i16x16, u16x16);\nimpl_shuffle1_dyn_non_u!(i16x32, u16x32);\n\nimpl_shuffle1_dyn_non_u!(i32x2, u32x2);\nimpl_shuffle1_dyn_non_u!(i32x4, u32x4);\nimpl_shuffle1_dyn_non_u!(i32x8, u32x8);\nimpl_shuffle1_dyn_non_u!(i32x16, u32x16);\n\nimpl_shuffle1_dyn_non_u!(i64x2, u64x2);\nimpl_shuffle1_dyn_non_u!(i64x4, u64x4);\nimpl_shuffle1_dyn_non_u!(i64x8, u64x8);\n\nimpl_shuffle1_dyn_non_u!(isizex2, usizex2);\nimpl_shuffle1_dyn_non_u!(isizex4, usizex4);\nimpl_shuffle1_dyn_non_u!(isizex8, usizex8);\n\nimpl_shuffle1_dyn_non_u!(i128x1, u128x1);\nimpl_shuffle1_dyn_non_u!(i128x2, u128x2);\nimpl_shuffle1_dyn_non_u!(i128x4, u128x4);\n\nimpl_shuffle1_dyn_non_u!(m8x2, u8x2);\nimpl_shuffle1_dyn_non_u!(m8x4, u8x4);\nimpl_shuffle1_dyn_non_u!(m8x8, u8x8);\nimpl_shuffle1_dyn_non_u!(m8x16, u8x16);\nimpl_shuffle1_dyn_non_u!(m8x32, u8x32);\nimpl_shuffle1_dyn_non_u!(m8x64, u8x64);\n\nimpl_shuffle1_dyn_non_u!(m16x2, u16x2);\nimpl_shuffle1_dyn_non_u!(m16x4, u16x4);\nimpl_shuffle1_dyn_non_u!(m16x8, u16x8);\nimpl_shuffle1_dyn_non_u!(m16x16, u16x16);\nimpl_shuffle1_dyn_non_u!(m16x32, u16x32);\n\nimpl_shuffle1_dyn_non_u!(m32x2, u32x2);\nimpl_shuffle1_dyn_non_u!(m32x4, 
u32x4);\nimpl_shuffle1_dyn_non_u!(m32x8, u32x8);\nimpl_shuffle1_dyn_non_u!(m32x16, u32x16);\n\nimpl_shuffle1_dyn_non_u!(m64x2, u64x2);\nimpl_shuffle1_dyn_non_u!(m64x4, u64x4);\nimpl_shuffle1_dyn_non_u!(m64x8, u64x8);\n\nimpl_shuffle1_dyn_non_u!(msizex2, usizex2);\nimpl_shuffle1_dyn_non_u!(msizex4, usizex4);\nimpl_shuffle1_dyn_non_u!(msizex8, usizex8);\n\nimpl_shuffle1_dyn_non_u!(m128x1, u128x1);\nimpl_shuffle1_dyn_non_u!(m128x2, u128x2);\nimpl_shuffle1_dyn_non_u!(m128x4, u128x4);\n\nimpl_shuffle1_dyn_non_u!(f32x2, u32x2);\nimpl_shuffle1_dyn_non_u!(f32x4, u32x4);\nimpl_shuffle1_dyn_non_u!(f32x8, u32x8);\nimpl_shuffle1_dyn_non_u!(f32x16, u32x16);\n\nimpl_shuffle1_dyn_non_u!(f64x2, u64x2);\nimpl_shuffle1_dyn_non_u!(f64x4, u64x4);\nimpl_shuffle1_dyn_non_u!(f64x8, u64x8);\n\n// Implementation for non-unsigned vector types\nmacro_rules! impl_shuffle1_dyn_ptr {\n    ($id:ident, $uid:ident) => {\n        impl<T> Shuffle1Dyn for $id<T> {\n            type Indices = $uid;\n            #[inline]\n            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {\n                unsafe {\n                    let u: $uid = crate::mem::transmute(self);\n                    crate::mem::transmute(u.shuffle1_dyn(indices))\n                }\n            }\n        }\n    };\n}\n\nimpl_shuffle1_dyn_ptr!(cptrx2, usizex2);\nimpl_shuffle1_dyn_ptr!(cptrx4, usizex4);\nimpl_shuffle1_dyn_ptr!(cptrx8, usizex8);\n\nimpl_shuffle1_dyn_ptr!(mptrx2, usizex2);\nimpl_shuffle1_dyn_ptr!(mptrx4, usizex4);\nimpl_shuffle1_dyn_ptr!(mptrx8, usizex8);\n"
  },
  {
    "path": "src/codegen/swap_bytes.rs",
    "content": "//! Horizontal swap bytes reductions.\n\n// FIXME: investigate using `llvm.bswap`\n// https://github.com/rust-lang-nursery/packed_simd/issues/19\n\nuse crate::*;\n\npub(crate) trait SwapBytes {\n    fn swap_bytes(self) -> Self;\n}\n\nmacro_rules! impl_swap_bytes {\n    (v16: $($id:ident,)+) => {\n        $(\n            impl SwapBytes for $id {\n                #[inline]\n                fn swap_bytes(self) -> Self {\n                    shuffle!(self, [1, 0])\n                }\n            }\n        )+\n    };\n    (v32: $($id:ident,)+) => {\n        $(\n            impl SwapBytes for $id {\n                #[inline]\n                #[allow(clippy::useless_transmute)]\n                fn swap_bytes(self) -> Self {\n                    unsafe {\n                        let bytes: u8x4 = crate::mem::transmute(self);\n                        let result: u8x4 = shuffle!(bytes, [3, 2, 1, 0]);\n                        crate::mem::transmute(result)\n                    }\n                }\n            }\n        )+\n    };\n    (v64: $($id:ident,)+) => {\n        $(\n            impl SwapBytes for $id {\n                #[inline]\n                #[allow(clippy::useless_transmute)]\n                fn swap_bytes(self) -> Self {\n                    unsafe {\n                        let bytes: u8x8 = crate::mem::transmute(self);\n                        let result: u8x8 = shuffle!(\n                            bytes, [7, 6, 5, 4, 3, 2, 1, 0]\n                        );\n                        crate::mem::transmute(result)\n                    }\n                }\n            }\n        )+\n    };\n    (v128: $($id:ident,)+) => {\n        $(\n            impl SwapBytes for $id {\n                #[inline]\n                #[allow(clippy::useless_transmute)]\n                fn swap_bytes(self) -> Self {\n                    unsafe {\n                        let bytes: u8x16 = crate::mem::transmute(self);\n                        let result: u8x16 = 
shuffle!(bytes, [\n                            15, 14, 13, 12, 11, 10, 9, 8,\n                            7, 6, 5, 4, 3, 2, 1, 0\n                        ]);\n                        crate::mem::transmute(result)\n                    }\n                }\n            }\n        )+\n    };\n    (v256: $($id:ident,)+) => {\n        $(\n            impl SwapBytes for $id {\n                #[inline]\n                #[allow(clippy::useless_transmute)]\n                fn swap_bytes(self) -> Self {\n                    unsafe {\n                        let bytes: u8x32 = crate::mem::transmute(self);\n                        let result: u8x32 = shuffle!(bytes, [\n                            31, 30, 29, 28, 27, 26, 25, 24,\n                            23, 22, 21, 20, 19, 18, 17, 16,\n                            15, 14, 13, 12, 11, 10, 9,  8,\n                            7,  6,  5,  4,  3,  2,  1,  0\n                        ]);\n                        crate::mem::transmute(result)\n                    }\n                }\n            }\n        )+\n    };\n    (v512: $($id:ident,)+) => {\n        $(\n            impl SwapBytes for $id {\n                #[inline]\n                #[allow(clippy::useless_transmute)]\n                fn swap_bytes(self) -> Self {\n                    unsafe {\n                        let bytes: u8x64 = crate::mem::transmute(self);\n                        let result: u8x64 = shuffle!(bytes, [\n                            63, 62, 61, 60, 59, 58, 57, 56,\n                            55, 54, 53, 52, 51, 50, 49, 48,\n                            47, 46, 45, 44, 43, 42, 41, 40,\n                            39, 38, 37, 36, 35, 34, 33, 32,\n                            31, 30, 29, 28, 27, 26, 25, 24,\n                            23, 22, 21, 20, 19, 18, 17, 16,\n                            15, 14, 13, 12, 11, 10, 9,  8,\n                            7,  6,  5,  4,  3,  2,  1,  0\n                        ]);\n                        
crate::mem::transmute(result)\n                    }\n                }\n            }\n        )+\n    };\n}\n\nimpl_swap_bytes!(v16: u8x2, i8x2,);\nimpl_swap_bytes!(v32: u8x4, i8x4, u16x2, i16x2,);\n// FIXME: 64-bit single element vector\nimpl_swap_bytes!(v64: u8x8, i8x8, u16x4, i16x4, u32x2, i32x2 /* u64x1, i64x1, */,);\n\nimpl_swap_bytes!(v128: u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, u128x1, i128x1,);\nimpl_swap_bytes!(v256: u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,);\n\nimpl_swap_bytes!(v512: u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,);\n\ncfg_if! {\n    if #[cfg(target_pointer_width = \"8\")] {\n        impl_swap_bytes!(v16: isizex2, usizex2,);\n        impl_swap_bytes!(v32: isizex4, usizex4,);\n        impl_swap_bytes!(v64: isizex8, usizex8,);\n    } else if #[cfg(target_pointer_width = \"16\")] {\n        impl_swap_bytes!(v32: isizex2, usizex2,);\n        impl_swap_bytes!(v64: isizex4, usizex4,);\n        impl_swap_bytes!(v128: isizex8, usizex8,);\n    } else if #[cfg(target_pointer_width = \"32\")] {\n        impl_swap_bytes!(v64: isizex2, usizex2,);\n        impl_swap_bytes!(v128: isizex4, usizex4,);\n        impl_swap_bytes!(v256: isizex8, usizex8,);\n    } else if #[cfg(target_pointer_width = \"64\")] {\n        impl_swap_bytes!(v128: isizex2, usizex2,);\n        impl_swap_bytes!(v256: isizex4, usizex4,);\n        impl_swap_bytes!(v512: isizex8, usizex8,);\n    } else {\n        compile_error!(\"unsupported target_pointer_width\");\n    }\n}\n"
  },
  {
    "path": "src/codegen/v128.rs",
    "content": "//! Internal 128-bit wide vector types\n\nuse crate::masks::*;\n\n#[rustfmt::skip]\nimpl_simd_array!(\n    [i8; 16]: i8x16 |\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [u8; 16]: u8x16 |\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [m8; 16]: m8x16 |\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8\n);\n\nimpl_simd_array!([i16; 8]: i16x8 | i16, i16, i16, i16, i16, i16, i16, i16);\nimpl_simd_array!([u16; 8]: u16x8 | u16, u16, u16, u16, u16, u16, u16, u16);\nimpl_simd_array!([m16; 8]: m16x8 | i16, i16, i16, i16, i16, i16, i16, i16);\n\nimpl_simd_array!([i32; 4]: i32x4 | i32, i32, i32, i32);\nimpl_simd_array!([u32; 4]: u32x4 | u32, u32, u32, u32);\nimpl_simd_array!([f32; 4]: f32x4 | f32, f32, f32, f32);\nimpl_simd_array!([m32; 4]: m32x4 | i32, i32, i32, i32);\n\nimpl_simd_array!([i64; 2]: i64x2 | i64, i64);\nimpl_simd_array!([u64; 2]: u64x2 | u64, u64);\nimpl_simd_array!([f64; 2]: f64x2 | f64, f64);\nimpl_simd_array!([m64; 2]: m64x2 | i64, i64);\n\nimpl_simd_array!([i128; 1]: i128x1 | i128);\nimpl_simd_array!([u128; 1]: u128x1 | u128);\nimpl_simd_array!([m128; 1]: m128x1 | i128);\n"
  },
  {
    "path": "src/codegen/v16.rs",
    "content": "//! Internal 16-bit wide vector types\n\nuse crate::masks::*;\n\nimpl_simd_array!([i8; 2]: i8x2 | i8, i8);\nimpl_simd_array!([u8; 2]: u8x2 | u8, u8);\nimpl_simd_array!([m8; 2]: m8x2 | i8, i8);\n"
  },
  {
    "path": "src/codegen/v256.rs",
    "content": "//! Internal 256-bit wide vector types\n\nuse crate::masks::*;\n\n#[rustfmt::skip]\nimpl_simd_array!(\n    [i8; 32]: i8x32 |\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [u8; 32]: u8x32 |\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [m8; 32]: m8x32 |\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [i16; 16]: i16x16 |\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [u16; 16]: u16x16 |\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [m16; 16]: m16x16 |\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16\n);\n\nimpl_simd_array!([i32; 8]: i32x8 | i32, i32, i32, i32, i32, i32, i32, i32);\nimpl_simd_array!([u32; 8]: u32x8 | u32, u32, u32, u32, u32, u32, u32, u32);\nimpl_simd_array!([f32; 8]: f32x8 | f32, f32, f32, f32, f32, f32, f32, f32);\nimpl_simd_array!([m32; 8]: m32x8 | i32, i32, i32, i32, i32, i32, i32, i32);\n\nimpl_simd_array!([i64; 4]: i64x4 | i64, i64, i64, i64);\nimpl_simd_array!([u64; 4]: u64x4 | u64, u64, u64, u64);\nimpl_simd_array!([f64; 4]: f64x4 | f64, f64, f64, f64);\nimpl_simd_array!([m64; 4]: m64x4 | i64, i64, i64, i64);\n\nimpl_simd_array!([i128; 2]: i128x2 | i128, i128);\nimpl_simd_array!([u128; 2]: u128x2 | u128, u128);\nimpl_simd_array!([m128; 2]: m128x2 | i128, i128);\n"
  },
  {
    "path": "src/codegen/v32.rs",
    "content": "//! Internal 32-bit wide vector types\n\nuse crate::masks::*;\n\nimpl_simd_array!([i8; 4]: i8x4 | i8, i8, i8, i8);\nimpl_simd_array!([u8; 4]: u8x4 | u8, u8, u8, u8);\nimpl_simd_array!([m8; 4]: m8x4 | i8, i8, i8, i8);\n\nimpl_simd_array!([i16; 2]: i16x2 | i16, i16);\nimpl_simd_array!([u16; 2]: u16x2 | u16, u16);\nimpl_simd_array!([m16; 2]: m16x2 | i16, i16);\n"
  },
  {
    "path": "src/codegen/v512.rs",
    "content": "//! Internal 512-bit wide vector types\n\nuse crate::masks::*;\n\n#[rustfmt::skip]\nimpl_simd_array!(\n    [i8; 64]: i8x64 |\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [u8; 64]: u8x64 |\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8,\n    u8, u8, u8, u8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [m8; 64]: m8x64 |\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8,\n    i8, i8, i8, i8\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [i16; 32]: i16x32 |\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [u16; 32]: u16x32 |\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16,\n    u16, u16, u16, u16\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [m16; 32]: m16x32 |\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, 
i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16,\n    i16, i16, i16, i16\n);\n\n#[rustfmt::skip]\nimpl_simd_array!(\n    [i32; 16]: i32x16 |\n    i32, i32, i32, i32,\n    i32, i32, i32, i32,\n    i32, i32, i32, i32,\n    i32, i32, i32, i32\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [u32; 16]: u32x16 |\n    u32, u32, u32, u32,\n    u32, u32, u32, u32,\n    u32, u32, u32, u32,\n    u32, u32, u32, u32\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [f32; 16]: f32x16 |\n    f32, f32, f32, f32,\n    f32, f32, f32, f32,\n    f32, f32, f32, f32,\n    f32, f32, f32, f32\n);\n#[rustfmt::skip]\nimpl_simd_array!(\n    [m32; 16]: m32x16 |\n    i32, i32, i32, i32,\n    i32, i32, i32, i32,\n    i32, i32, i32, i32,\n    i32, i32, i32, i32\n);\n\nimpl_simd_array!([i64; 8]: i64x8 | i64, i64, i64, i64, i64, i64, i64, i64);\nimpl_simd_array!([u64; 8]: u64x8 | u64, u64, u64, u64, u64, u64, u64, u64);\nimpl_simd_array!([f64; 8]: f64x8 | f64, f64, f64, f64, f64, f64, f64, f64);\nimpl_simd_array!([m64; 8]: m64x8 | i64, i64, i64, i64, i64, i64, i64, i64);\n\nimpl_simd_array!([i128; 4]: i128x4 | i128, i128, i128, i128);\nimpl_simd_array!([u128; 4]: u128x4 | u128, u128, u128, u128);\nimpl_simd_array!([m128; 4]: m128x4 | i128, i128, i128, i128);\n"
  },
  {
    "path": "src/codegen/v64.rs",
    "content": "//! Internal 64-bit wide vector types\n\nuse crate::masks::*;\n\nimpl_simd_array!([i8; 8]: i8x8 | i8, i8, i8, i8, i8, i8, i8, i8);\nimpl_simd_array!([u8; 8]: u8x8 | u8, u8, u8, u8, u8, u8, u8, u8);\nimpl_simd_array!([m8; 8]: m8x8 | i8, i8, i8, i8, i8, i8, i8, i8);\n\nimpl_simd_array!([i16; 4]: i16x4 | i16, i16, i16, i16);\nimpl_simd_array!([u16; 4]: u16x4 | u16, u16, u16, u16);\nimpl_simd_array!([m16; 4]: m16x4 | i16, i16, i16, i16);\n\nimpl_simd_array!([i32; 2]: i32x2 | i32, i32);\nimpl_simd_array!([u32; 2]: u32x2 | u32, u32);\nimpl_simd_array!([f32; 2]: f32x2 | f32, f32);\nimpl_simd_array!([m32; 2]: m32x2 | i32, i32);\n\nimpl_simd_array!([i64; 1]: i64x1 | i64);\nimpl_simd_array!([u64; 1]: u64x1 | u64);\nimpl_simd_array!([f64; 1]: f64x1 | f64);\nimpl_simd_array!([m64; 1]: m64x1 | i64);\n"
  },
  {
    "path": "src/codegen/vPtr.rs",
    "content": "//! Pointer vector types\n\nmacro_rules! impl_simd_ptr {\n    ([$ptr_ty:ty; $elem_count:expr]: $tuple_id:ident | $ty:ident\n     | $($tys:ty),*) => {\n        #[derive(Copy, Clone)]\n        #[repr(simd)]\n        pub struct $tuple_id<$ty>($(pub(crate) $tys),*);\n        //^^^^^^^ leaked through SimdArray\n\n        impl<$ty> crate::sealed::Seal for [$ptr_ty; $elem_count] {}\n        impl<$ty> crate::sealed::SimdArray for [$ptr_ty; $elem_count] {\n            type Tuple = $tuple_id<$ptr_ty>;\n            type T = $ptr_ty;\n            const N: usize = $elem_count;\n            type NT = [u32; $elem_count];\n        }\n\n        impl<$ty> crate::sealed::Seal for $tuple_id<$ptr_ty> {}\n        impl<$ty> crate::sealed::Simd for $tuple_id<$ptr_ty> {\n            type Element = $ptr_ty;\n            const LANES: usize = $elem_count;\n            type LanesType = [u32; $elem_count];\n        }\n\n    }\n}\n\nimpl_simd_ptr!([*const T; 2]: cptrx2 | T | T, T);\nimpl_simd_ptr!([*const T; 4]: cptrx4 | T | T, T, T, T);\nimpl_simd_ptr!([*const T; 8]: cptrx8 | T | T, T, T, T, T, T, T, T);\n\nimpl_simd_ptr!([*mut T; 2]: mptrx2 | T | T, T);\nimpl_simd_ptr!([*mut T; 4]: mptrx4 | T | T, T, T, T);\nimpl_simd_ptr!([*mut T; 8]: mptrx8 | T | T, T, T, T, T, T, T, T);\n"
  },
  {
    "path": "src/codegen/vSize.rs",
    "content": "//! Vector types with pointer-sized elements\n\nuse crate::codegen::pointer_sized_int::{isize_, usize_};\nuse crate::masks::*;\n\nimpl_simd_array!([isize; 2]: isizex2 | isize_, isize_);\nimpl_simd_array!([usize; 2]: usizex2 | usize_, usize_);\nimpl_simd_array!([msize; 2]: msizex2 | isize_, isize_);\n\nimpl_simd_array!([isize; 4]: isizex4 | isize_, isize_, isize_, isize_);\nimpl_simd_array!([usize; 4]: usizex4 | usize_, usize_, usize_, usize_);\nimpl_simd_array!([msize; 4]: msizex4 | isize_, isize_, isize_, isize_);\n\nimpl_simd_array!([isize; 8]: isizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_);\nimpl_simd_array!([usize; 8]: usizex8 | usize_, usize_, usize_, usize_, usize_, usize_, usize_, usize_);\nimpl_simd_array!([msize; 8]: msizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_);\n"
  },
  {
    "path": "src/codegen.rs",
    "content": "//! Code-generation utilities\n\npub(crate) mod bit_manip;\npub(crate) mod llvm;\npub(crate) mod math;\npub(crate) mod reductions;\npub(crate) mod shuffle;\npub(crate) mod shuffle1_dyn;\npub(crate) mod swap_bytes;\n\nmacro_rules! impl_simd_array {\n    ([$elem_ty:ident; $elem_count:expr]:\n     $tuple_id:ident | $($elem_tys:ident),*) => {\n        #[derive(Copy, Clone)]\n        #[repr(simd)]\n        pub struct $tuple_id($(pub(crate) $elem_tys),*);\n        //^^^^^^^ leaked through SimdArray\n\n        impl crate::sealed::Seal for [$elem_ty; $elem_count] {}\n\n        impl crate::sealed::SimdArray for [$elem_ty; $elem_count] {\n            type Tuple = $tuple_id;\n            type T = $elem_ty;\n            const N: usize = $elem_count;\n            type NT = [u32; $elem_count];\n        }\n\n        impl crate::sealed::Seal for $tuple_id {}\n        impl crate::sealed::Simd for $tuple_id {\n            type Element = $elem_ty;\n            const LANES: usize = $elem_count;\n            type LanesType = [u32; $elem_count];\n        }\n\n    }\n}\n\npub(crate) mod pointer_sized_int;\n\npub(crate) mod v16;\npub(crate) use self::v16::*;\n\npub(crate) mod v32;\npub(crate) use self::v32::*;\n\npub(crate) mod v64;\npub(crate) use self::v64::*;\n\npub(crate) mod v128;\npub(crate) use self::v128::*;\n\npub(crate) mod v256;\npub(crate) use self::v256::*;\n\npub(crate) mod v512;\npub(crate) use self::v512::*;\n\npub(crate) mod vSize;\npub(crate) use self::vSize::*;\n\npub(crate) mod vPtr;\npub(crate) use self::vPtr::*;\n"
  },
  {
    "path": "src/lib.rs",
    "content": "//! # Portable packed SIMD vectors\n//!\n//! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366:\n//! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) .\n//!\n//! The examples available in the\n//! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples)\n//! sub-directory of the crate showcase how to use the library in practice.\n//!\n//! ## Table of contents\n//!\n//! - [Introduction](#introduction)\n//! - [Vector types](#vector-types)\n//! - [Conditional operations](#conditional-operations)\n//! - [Conversions](#conversions)\n//! - [Hardware Features](#hardware-features)\n//! - [Performance guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)\n//!\n//! ## Introduction\n//!\n//! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N`\n//! elements of type `T` as well as many type aliases for this type: for\n//! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`.\n//!\n//! The operations on packed vectors are, by default, \"vertical\", that is, they\n//! are applied to each vector lane in isolation of the others:\n//!\n//! ```\n//! # use packed_simd::*;\n//! let a = i32x4::new(1, 2, 3, 4);\n//! let b = i32x4::new(5, 6, 7, 8);\n//! assert_eq!(a + b, i32x4::new(6, 8, 10, 12));\n//! ```\n//!\n//! Many \"horizontal\" operations are also provided:\n//!\n//! ```\n//! # use packed_simd::*;\n//! # let a = i32x4::new(1, 2, 3, 4);\n//! assert_eq!(a.wrapping_sum(), 10);\n//! ```\n//!\n//! In virtually all architectures vertical operations are fast, while\n//! horizontal operations are, by comparison, much slower. That is, the\n//! most portably-efficient way of performing a reduction over a slice\n//! is to collect the results into a vector using vertical operations,\n//! and performing a single horizontal operation at the end:\n//!\n//! ```\n//! # use packed_simd::*;\n//! fn reduce(x: &[i32]) -> i32 {\n//!     assert_eq!(x.len() % 4, 0);\n//!     
let mut sum = i32x4::splat(0); // [0, 0, 0, 0]\n//!     for i in (0..x.len()).step_by(4) {\n//!         sum += i32x4::from_slice_unaligned(&x[i..]);\n//!     }\n//!     sum.wrapping_sum()\n//! }\n//!\n//! let x = [0, 1, 2, 3, 4, 5, 6, 7];\n//! assert_eq!(reduce(&x), 28);\n//! ```\n//!\n//! ## Vector types\n//!\n//! The vector type aliases are named according to the following scheme:\n//!\n//! > `{element_type}x{number_of_lanes} == Simd<[element_type;\n//! number_of_lanes]>`\n//!\n//! where the following element types are supported:\n//!\n//! * `i{element_width}`: signed integer\n//! * `u{element_width}`: unsigned integer\n//! * `f{element_width}`: float\n//! * `m{element_width}`: mask (see below)\n//! * `*{const,mut} T`: `const` and `mut` pointers\n//!\n//! ## Basic operations\n//!\n//! ```\n//! # use packed_simd::*;\n//! // Sets all elements to `0`:\n//! let a = i32x4::splat(0);\n//!\n//! // Reads a vector from a slice:\n//! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5];\n//! let b = i32x4::from_slice_unaligned(&arr);\n//!\n//! // Reads the 4-th element of a vector:\n//! assert_eq!(b.extract(3), 1);\n//!\n//! // Returns a new vector where the 4-th element is replaced with `1`:\n//! let a = a.replace(3, 1);\n//! assert_eq!(a, b);\n//!\n//! // Writes a vector to a slice:\n//! let a = a.replace(2, 1);\n//! a.write_to_slice_unaligned(&mut arr[4..]);\n//! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]);\n//! ```\n//!\n//! ## Conditional operations\n//!\n//! One often needs to perform an operation on some lanes of the vector. Vector\n//! masks, like `m32x4`, allow selecting on which vector lanes an operation is\n//! to be performed:\n//!\n//! ```\n//! # use packed_simd::*;\n//! let a = i32x4::new(1, 1, 2, 2);\n//!\n//! // Add `1` to the first two lanes of the vector.\n//! let m = m16x4::new(true, true, false, false);\n//! let a = m.select(a + 1, a);\n//! assert_eq!(a, i32x4::splat(2));\n//! ```\n//!\n//! The elements of a vector mask are either `true` or `false`. Here `true`\n//! 
means that a lane is \"selected\", while `false` means that a lane is not\n//! selected.\n//!\n//! All vector masks implement a `mask.select(a: T, b: T) -> T` method that\n//! works on all vectors that have the same number of lanes as the mask. The\n//! resulting vector contains the elements of `a` for those lanes for which the\n//! mask is `true`, and the elements of `b` otherwise.\n//!\n//! The example constructs a mask with the first two lanes set to `true` and\n//! the last two lanes set to `false`. This selects the first two lanes of `a +\n//! 1` and the last two lanes of `a`, producing a vector where the first two\n//! lanes have been incremented by `1`.\n//!\n//! > note: mask `select` can be used on vector types that have the same number\n//! > of lanes as the mask. The example shows this by using [`m16x4`] instead\n//! > of [`m32x4`]. It is _typically_ more performant to use a mask element\n//! > width equal to the element width of the vectors being operated upon.\n//! > This is, however, not true for 512-bit wide vectors when targeting\n//! > AVX-512, where the most efficient masks use only 1-bit per element.\n//!\n//! All vertical comparison operations return masks:\n//!\n//! ```\n//! # use packed_simd::*;\n//! let a = i32x4::new(1, 1, 3, 3);\n//! let b = i32x4::new(2, 2, 0, 0);\n//!\n//! // ge: >= (Greater Equal; see also lt, le, gt, eq, ne).\n//! let m = a.ge(i32x4::splat(2));\n//!\n//! if m.any() {\n//!     // all / any / none allow coherent control flow\n//!     let d = m.select(a, b);\n//!     assert_eq!(d, i32x4::new(2, 2, 3, 3));\n//! }\n//! ```\n//!\n//! ## Conversions\n//!\n//! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for\n//!   vectors with the same number of lanes when the conversion is value\n//! preserving   (same as in `std`).\n//!\n//! * **safe bitwise conversions**: The cargo feature `into_bits` provides the\n//!   `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise\n//!   
`transmute`s when all bit patterns of the source type are valid bit\n//!   patterns of the target type and are also implemented for the\n//!   architecture-specific vector types of `std::arch`. For example, `let x:\n//!   u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit\n//!   patterns are valid `u8x8` bit patterns. However, the opposite is not\n//! true,   not all `u8x8` bit patterns are valid `m8x8` bit-patterns, so this\n//!   operation cannot be performed safely using `x.into_bits()`; one needs to\n//!   use `unsafe { crate::mem::transmute(x) }` for that, making sure that the\n//!   value in the `u8x8` is a valid bit-pattern of `m8x8`.\n//!\n//! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`]\n//! (`x.cast()`), just like `as`:\n//!\n//!   * casting integer vectors whose lane types have the same size (e.g.\n//! `i32xN`     -> `u32xN`) is a **no-op**,\n//!\n//!   * casting from a larger integer to a smaller integer (e.g. `u32xN` ->\n//! `u8xN`)     will **truncate**,\n//!\n//!   * casting from a smaller integer to a larger integer     (e.g. `u8xN` ->\n//!     `u32xN`) will:\n//!        * **zero-extend** if the source is unsigned, or\n//!        * **sign-extend** if the source is signed,\n//!\n//!   * casting from a float to an integer will **round the float towards\n//! zero**,\n//!\n//!   * casting from an integer to float will produce the floating point\n//!     representation of the integer, **rounding to nearest, ties to even**,\n//!\n//!   * casting from an `f32` to an `f64` is perfect and lossless,\n//!\n//!   * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.\n//!\n//!   Numeric casts are not very \"precise\": sometimes lossy, sometimes value\n//!   preserving, etc.\n//!\n//! ## Hardware Features\n//!\n//! This crate can use different hardware features based on your configured\n//! `RUSTFLAGS`. For example, with no configured `RUSTFLAGS`, `u64x8` on\n//! 
x86_64 will use SSE2 operations like `PCMPEQD`. If you configure\n//! `RUSTFLAGS='-C target-feature=+avx2,+avx'` on supported x86_64 hardware\n//! the same `u64x8` may use wider AVX2 operations like `VPCMPEQQ`. It is\n//! important for performance and for hardware support requirements that\n//! you choose an appropriate set of `target-feature` and `target-cpu`\n//! options during builds. For more information, see the [Performance\n//! guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)\n\n#![feature(\n    adt_const_params,\n    repr_simd,\n    rustc_attrs,\n    platform_intrinsics,\n    stdsimd,\n    arm_target_feature,\n    link_llvm_intrinsics,\n    core_intrinsics,\n    stmt_expr_attributes,\n    custom_inner_attributes,\n)]\n#![allow(non_camel_case_types, non_snake_case,\n        // FIXME: these types are unsound in C FFI already\n        // See https://github.com/rust-lang/rust/issues/53346\n        improper_ctypes_definitions,\n        incomplete_features,\n        clippy::cast_possible_truncation,\n        clippy::cast_lossless,\n        clippy::cast_possible_wrap,\n        clippy::cast_precision_loss,\n        // TODO: manually add the `#[must_use]` attribute where appropriate\n        clippy::must_use_candidate,\n        // This lint is currently broken for generic code\n        // See https://github.com/rust-lang/rust-clippy/issues/3410\n        clippy::use_self,\n        clippy::wrong_self_convention,\n        clippy::from_over_into,\n)]\n#![cfg_attr(test, feature(hashmap_internals))]\n#![cfg_attr(doc_cfg, feature(doc_cfg))]\n#![deny(rust_2018_idioms, clippy::missing_inline_in_public_items)]\n#![no_std]\n\nuse cfg_if::cfg_if;\n\ncfg_if! 
{\n    if #[cfg(feature = \"core_arch\")] {\n        #[allow(unused_imports)]\n        use core_arch as arch;\n    } else {\n        #[allow(unused_imports)]\n        use core::arch;\n    }\n}\n\n#[cfg(all(target_arch = \"wasm32\", test))]\nuse wasm_bindgen_test::*;\n\n#[allow(unused_imports)]\nuse core::{\n    /* arch (handled above), */ cmp, f32, f64, fmt, hash, hint, i128, i16, i32, i64, i8, intrinsics,\n    isize, iter, marker, mem, ops, ptr, slice, u128, u16, u32, u64, u8, usize,\n};\n\n#[macro_use]\nmod testing;\n#[macro_use]\nmod api;\nmod codegen;\nmod sealed;\n\npub use crate::sealed::{Mask, Shuffle, Simd as SimdVector, SimdArray};\n\n/// Packed SIMD vector type.\n///\n/// # Examples\n///\n/// ```\n/// # use packed_simd::Simd;\n/// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3);\n/// assert_eq!(v.extract(2), 2);\n/// ```\n#[repr(transparent)]\n#[derive(Copy, Clone)]\npub struct Simd<A: sealed::SimdArray>(\n    // FIXME: this type should be private,\n    // but it currently must be public for the\n    // `shuffle!` macro to work: it needs to\n    // access the internal `repr(simd)` type\n    // to call the shuffle intrinsics.\n    #[doc(hidden)] pub <A as sealed::SimdArray>::Tuple,\n);\n\nimpl<A: sealed::SimdArray> sealed::Seal for Simd<A> {}\n\n/// Wrapper over `T` implementing a lexicographical order via the `PartialOrd`\n/// and/or `Ord` traits.\n#[repr(transparent)]\n#[derive(Copy, Clone, Debug)]\n#[allow(clippy::missing_inline_in_public_items)]\npub struct LexicographicallyOrdered<T>(T);\n\nmod masks;\npub use self::masks::*;\n\nmod v16;\npub use self::v16::*;\n\nmod v32;\npub use self::v32::*;\n\nmod v64;\npub use self::v64::*;\n\nmod v128;\npub use self::v128::*;\n\nmod v256;\npub use self::v256::*;\n\nmod v512;\npub use self::v512::*;\n\nmod vSize;\npub use self::vSize::*;\n\nmod vPtr;\npub use self::vPtr::*;\n\npub use self::api::cast::*;\n\n#[cfg(feature = \"into_bits\")]\npub use self::api::into_bits::*;\n\n// Re-export the shuffle intrinsics required 
by the `shuffle!` macro.\n#[doc(hidden)]\npub use self::codegen::llvm::{\n    __shuffle_vector16, __shuffle_vector2, __shuffle_vector32, __shuffle_vector4, __shuffle_vector64,\n    __shuffle_vector8,\n};\n\npub(crate) mod llvm {\n    pub(crate) use crate::codegen::llvm::*;\n}\n"
  },
  {
    "path": "src/masks.rs",
    "content": "//! Mask types\n\nmacro_rules! impl_mask_ty {\n    ($id:ident : $elem_ty:ident | #[$doc:meta]) => {\n        #[$doc]\n        #[derive(Copy, Clone)]\n        pub struct $id($elem_ty);\n\n        impl crate::sealed::Seal for $id {}\n        impl crate::sealed::Mask for $id {\n            #[inline]\n            fn test(&self) -> bool {\n                $id::test(self)\n            }\n        }\n\n        impl $id {\n            /// Instantiate a mask with `value`\n            #[inline]\n            pub fn new(x: bool) -> Self {\n                if x {\n                    $id(!0)\n                } else {\n                    $id(0)\n                }\n            }\n            /// Test if the mask is set\n            #[inline]\n            pub fn test(&self) -> bool {\n                self.0 != 0\n            }\n        }\n\n        impl Default for $id {\n            #[inline]\n            fn default() -> Self {\n                $id(0)\n            }\n        }\n\n        #[allow(clippy::partialeq_ne_impl)]\n        impl PartialEq<$id> for $id {\n            #[inline]\n            fn eq(&self, other: &Self) -> bool {\n                self.0 == other.0\n            }\n            #[inline]\n            fn ne(&self, other: &Self) -> bool {\n                self.0 != other.0\n            }\n        }\n\n        impl Eq for $id {}\n\n        impl PartialOrd<$id> for $id {\n            #[inline]\n            fn partial_cmp(&self, other: &Self) -> Option<crate::cmp::Ordering> {\n                use crate::cmp::Ordering;\n                if self == other {\n                    Some(Ordering::Equal)\n                } else if self.0 > other.0 {\n                    // Note:\n                    //  * false = 0_i\n                    //  * true == !0_i == -1_i\n                    Some(Ordering::Less)\n                } else {\n                    Some(Ordering::Greater)\n                }\n            }\n\n            #[inline]\n            fn lt(&self, 
other: &Self) -> bool {\n                self.0 > other.0\n            }\n            #[inline]\n            fn gt(&self, other: &Self) -> bool {\n                self.0 < other.0\n            }\n            #[inline]\n            fn le(&self, other: &Self) -> bool {\n                self.0 >= other.0\n            }\n            #[inline]\n            fn ge(&self, other: &Self) -> bool {\n                self.0 <= other.0\n            }\n        }\n\n        impl Ord for $id {\n            #[inline]\n            fn cmp(&self, other: &Self) -> crate::cmp::Ordering {\n                match self.partial_cmp(other) {\n                    Some(x) => x,\n                    None => unsafe { crate::hint::unreachable_unchecked() },\n                }\n            }\n        }\n\n        impl crate::hash::Hash for $id {\n            #[inline]\n            fn hash<H: crate::hash::Hasher>(&self, state: &mut H) {\n                (self.0 != 0).hash(state);\n            }\n        }\n\n        impl crate::fmt::Debug for $id {\n            #[inline]\n            fn fmt(&self, fmtter: &mut crate::fmt::Formatter<'_>) -> Result<(), crate::fmt::Error> {\n                write!(fmtter, \"{}({})\", stringify!($id), self.0 != 0)\n            }\n        }\n    };\n}\n\nimpl_mask_ty!(m8: i8 | /// 8-bit wide mask.\n);\nimpl_mask_ty!(m16: i16 | /// 16-bit wide mask.\n);\nimpl_mask_ty!(m32: i32 | /// 32-bit wide mask.\n);\nimpl_mask_ty!(m64: i64 | /// 64-bit wide mask.\n);\nimpl_mask_ty!(m128: i128 | /// 128-bit wide mask.\n);\nimpl_mask_ty!(msize: isize | /// isize-wide mask.\n);\n"
  },
  {
    "path": "src/sealed.rs",
    "content": "//! Sealed traits\n\n/// A sealed trait, this is logically private to the crate\n/// and will prevent implementations from outside the crate\npub trait Seal<T = ()> {}\n\n/// Trait implemented by arrays that can be SIMD types.\npub trait SimdArray: Seal {\n    /// The type of the #[repr(simd)] type.\n    type Tuple: Copy + Clone;\n    /// The element type of the vector.\n    type T;\n    /// The number of elements in the array.\n    const N: usize;\n    /// The type: `[u32; Self::N]`.\n    type NT;\n}\n\n/// This trait is used to constrain the arguments\n/// and result type of the portable shuffles.\n#[doc(hidden)]\npub trait Shuffle<Lanes>: Seal<Lanes> {\n    // Lanes is a `[u32; N]` where `N` is the number of vector lanes\n\n    /// The result type of the shuffle.\n    type Output;\n}\n\n/// This trait is implemented by all SIMD vector types.\npub trait Simd: Seal {\n    /// Element type of the SIMD vector\n    type Element;\n    /// The number of elements in the SIMD vector.\n    const LANES: usize;\n    /// The type: `[u32; Self::LANES]`.\n    type LanesType;\n}\n\n/// This trait is implemented by all mask types\npub trait Mask: Seal {\n    fn test(&self) -> bool;\n}\n"
  },
  {
    "path": "src/testing/macros.rs",
    "content": "//! Testing macros\n\nmacro_rules! test_if {\n    ($cfg_tt:tt: $it:item) => {\n        #[cfg(any(\n                                                            // Test everything if:\n                                                            //\n                                                            // * tests are enabled,\n                                                            // * no features about exclusively testing\n                                                            //   specific vector classes are enabled\n                                                            all(test, not(any(\n                                                                test_v16,\n                                                                test_v32,\n                                                                test_v64,\n                                                                test_v128,\n                                                                test_v256,\n                                                                test_v512,\n                                                                test_none,  // disables all tests\n                                                            ))),\n                                                            // Test if:\n                                                            //\n                                                            // * tests are enabled\n                                                            // * a particular cfg token tree returns true\n                                                            all(test, $cfg_tt),\n                                                        ))]\n        $it\n    };\n}\n\n#[cfg(test)]\n#[allow(unused)]\nmacro_rules! ref_ {\n    ($anything:tt) => {\n        &$anything\n    };\n}\n\n#[cfg(test)]\n#[allow(unused)]\nmacro_rules! ref_mut_ {\n    ($anything:tt) => {\n        &mut $anything\n    };\n}\n"
  },
  {
    "path": "src/testing/utils.rs",
    "content": "//! Testing utilities\n\n#![allow(dead_code)]\n// FIXME: Or don't. But it's true this is a problematic comparison.\n#![allow(clippy::neg_cmp_op_on_partial_ord)]\n\nuse crate::{cmp::PartialOrd, fmt::Debug, LexicographicallyOrdered};\n\n/// Tests PartialOrd for `a` and `b` where `a < b` is true.\npub fn test_lt<T>(a: LexicographicallyOrdered<T>, b: LexicographicallyOrdered<T>)\nwhere\n    LexicographicallyOrdered<T>: Debug + PartialOrd,\n{\n    assert!(a < b, \"{:?}, {:?}\", a, b);\n    assert!(b > a, \"{:?}, {:?}\", a, b);\n\n    assert!(!(a == b), \"{:?}, {:?}\", a, b);\n    assert_ne!(a, b, \"{:?}, {:?}\", a, b);\n\n    assert!(a <= b, \"{:?}, {:?}\", a, b);\n    assert!(b >= a, \"{:?}, {:?}\", a, b);\n\n    // The elegance of the mathematical expression of irreflexivity is more\n    // than clippy can handle.\n    #[allow(clippy::eq_op)]\n    {\n        // Irreflexivity\n        assert!(!(a < a), \"{:?}, {:?}\", a, b);\n        assert!(!(b < b), \"{:?}, {:?}\", a, b);\n        assert!(!(a > a), \"{:?}, {:?}\", a, b);\n        assert!(!(b > b), \"{:?}, {:?}\", a, b);\n\n        assert!(a <= a, \"{:?}, {:?}\", a, b);\n        assert!(b <= b, \"{:?}, {:?}\", a, b);\n    }\n}\n\n/// Tests PartialOrd for `a` and `b` where `a <= b` is true.\npub fn test_le<T>(a: LexicographicallyOrdered<T>, b: LexicographicallyOrdered<T>)\nwhere\n    LexicographicallyOrdered<T>: Debug + PartialOrd,\n{\n    assert!(a <= b, \"{:?}, {:?}\", a, b);\n    assert!(b >= a, \"{:?}, {:?}\", a, b);\n\n    assert!(a <= b, \"{:?}, {:?}\", a, b);\n    assert!(b >= a, \"{:?}, {:?}\", a, b);\n\n    if a == b {\n        assert!(!(a < b), \"{:?}, {:?}\", a, b);\n        assert!(!(b > a), \"{:?}, {:?}\", a, b);\n\n        assert!(!(a != b), \"{:?}, {:?}\", a, b);\n    } else {\n        assert_ne!(a, b, \"{:?}, {:?}\", a, b);\n        test_lt(a, b);\n    }\n}\n\n/// Test PartialOrd::partial_cmp for `a` and `b` returning `Ordering`\npub fn test_cmp<T>(\n    a: LexicographicallyOrdered<T>,\n 
   b: LexicographicallyOrdered<T>,\n    o: Option<crate::cmp::Ordering>,\n) where\n    LexicographicallyOrdered<T>: PartialOrd + Debug,\n    T: Debug + crate::sealed::Simd + Copy + Clone,\n    <T as crate::sealed::Simd>::Element: Default + Copy + Clone + PartialOrd,\n{\n    assert!(T::LANES <= 64, \"array length in these two arrays needs updating\");\n    let mut arr_a: [T::Element; 64] = [Default::default(); 64];\n    let mut arr_b: [T::Element; 64] = [Default::default(); 64];\n\n    unsafe { crate::ptr::write_unaligned(arr_a.as_mut_ptr() as *mut LexicographicallyOrdered<T>, a) }\n    unsafe { crate::ptr::write_unaligned(arr_b.as_mut_ptr() as *mut LexicographicallyOrdered<T>, b) }\n    let expected = arr_a[0..T::LANES].partial_cmp(&arr_b[0..T::LANES]);\n    let result = a.partial_cmp(&b);\n    assert_eq!(expected, result, \"{:?}, {:?}\", a, b);\n    assert_eq!(o, result, \"{:?}, {:?}\", a, b);\n    match o {\n        Some(crate::cmp::Ordering::Less) => {\n            test_lt(a, b);\n            test_le(a, b);\n        }\n        Some(crate::cmp::Ordering::Greater) => {\n            test_lt(b, a);\n            test_le(b, a);\n        }\n        Some(crate::cmp::Ordering::Equal) => {\n            assert!(a == b, \"{:?}, {:?}\", a, b);\n            assert!(!(a != b), \"{:?}, {:?}\", a, b);\n            assert!(!(a < b), \"{:?}, {:?}\", a, b);\n            assert!(!(b < a), \"{:?}, {:?}\", a, b);\n            assert!(!(a > b), \"{:?}, {:?}\", a, b);\n            assert!(!(b > a), \"{:?}, {:?}\", a, b);\n\n            test_le(a, b);\n            test_le(b, a);\n        }\n        None => {\n            assert!(!(a == b), \"{:?}, {:?}\", a, b);\n            assert!(!(a != b), \"{:?}, {:?}\", a, b);\n            assert!(!(a < b), \"{:?}, {:?}\", a, b);\n            assert!(!(a > b), \"{:?}, {:?}\", a, b);\n            assert!(!(b < a), \"{:?}, {:?}\", a, b);\n            assert!(!(b > a), \"{:?}, {:?}\", a, b);\n            assert!(!(a <= b), \"{:?}, {:?}\", a, b);\n     
       assert!(!(b <= a), \"{:?}, {:?}\", a, b);\n            assert!(!(a >= b), \"{:?}, {:?}\", a, b);\n            assert!(!(b >= a), \"{:?}, {:?}\", a, b);\n        }\n    }\n}\n\n// Returns a tuple containing two distinct pointer values of the same type as\n// the element type of the Simd vector `$id`.\n#[allow(unused)]\nmacro_rules! ptr_vals {\n    ($id:ty) => {\n        // expands to an expression\n        #[allow(unused_unsafe)]\n        unsafe {\n            // all bits cleared\n            let clear: <$id as sealed::Simd>::Element = crate::mem::zeroed();\n            // all bits set\n            let set: <$id as sealed::Simd>::Element = crate::mem::transmute(-1_isize);\n            (clear, set)\n        }\n    };\n}\n"
  },
  {
    "path": "src/testing.rs",
    "content": "//! Testing macros and other utilities.\n\n#[macro_use]\nmod macros;\n\n#[cfg(test)]\n#[macro_use]\npub(crate) mod utils;\n"
  },
  {
    "path": "src/v128.rs",
    "content": "//! 128-bit wide vector types\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_i!([i8; 16]: i8x16, m8x16 | i8, u16 | test_v128 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: |\n        /// A 128-bit vector with 16 `i8` lanes.\n);\nimpl_u!([u8; 16]: u8x16, m8x16 | u8, u16 | test_v128 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: |\n        /// A 128-bit vector with 16 `u8` lanes.\n);\nimpl_m!([m8; 16]: m8x16 | i8, u16 | test_v128 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: m16x16 |\n        /// A 128-bit vector mask with 16 `m8` lanes.\n);\n\nimpl_i!([i16; 8]: i16x8, m16x8 | i16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: i8x8, u8x8 |\n        /// A 128-bit vector with 8 `i16` lanes.\n);\nimpl_u!([u16; 8]: u16x8, m16x8 | u16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: u8x8 |\n        /// A 128-bit vector with 8 `u16` lanes.\n);\nimpl_m!([m16; 8]: m16x8 | i16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: m8x8, m32x8 |\n        /// A 128-bit vector mask with 8 `m16` lanes.\n);\n\nimpl_i!([i32; 4]: i32x4, m32x4 | i32, u8 | test_v128 | x0, x1, x2, x3 |\n        From: i8x4, u8x4, i16x4, u16x4  |\n        /// A 128-bit vector with 4 `i32` lanes.\n);\nimpl_u!([u32; 4]: u32x4, m32x4 | u32, u8 | test_v128 | x0, x1, x2, x3 |\n        From: u8x4, u16x4 |\n        /// A 128-bit vector with 4 `u32` lanes.\n);\nimpl_f!([f32; 4]: f32x4, m32x4 | f32 | test_v128 | x0, x1, x2, x3 |\n        From: i8x4, u8x4, i16x4, u16x4 |\n        /// A 128-bit vector with 4 `f32` lanes.\n);\nimpl_m!([m32; 4]: m32x4 | i32, u8 | test_v128 | x0, x1, x2, x3 |\n        From: m8x4, m16x4, m64x4 |\n        /// A 128-bit vector mask with 4 `m32` lanes.\n);\n\nimpl_i!([i64; 2]: i64x2, m64x2 | i64, u8 | test_v128 | x0, x1 |\n        From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2 |\n     
   /// A 128-bit vector with 2 `i64` lanes.\n);\nimpl_u!([u64; 2]: u64x2, m64x2 | u64, u8 | test_v128 | x0, x1 |\n        From: u8x2, u16x2, u32x2 |\n        /// A 128-bit vector with 2 `u64` lanes.\n);\nimpl_f!([f64; 2]: f64x2, m64x2 | f64 | test_v128 | x0, x1 |\n        From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2, f32x2 |\n        /// A 128-bit vector with 2 `f64` lanes.\n);\nimpl_m!([m64; 2]: m64x2 | i64, u8 | test_v128 | x0, x1 |\n        From: m8x2, m16x2, m32x2, m128x2 |\n        /// A 128-bit vector mask with 2 `m64` lanes.\n);\n\nimpl_i!([i128; 1]: i128x1, m128x1 | i128, u8 | test_v128 | x0 |\n        From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1, i64x1, u64x1 */ | // FIXME: unary small vector types\n        /// A 128-bit vector with 1 `i128` lane.\n);\nimpl_u!([u128; 1]: u128x1, m128x1 | u128, u8 | test_v128 | x0 |\n        From: /*u8x1, u16x1, u32x1, u64x1 */ | // FIXME: unary small vector types\n        /// A 128-bit vector with 1 `u128` lane.\n);\nimpl_m!([m128; 1]: m128x1 | i128, u8 | test_v128 | x0 |\n        From: /*m8x1, m16x1, m32x1, m64x1 */ | // FIXME: unary small vector types\n        /// A 128-bit vector mask with 1 `m128` lane.\n);\n"
  },
  {
    "path": "src/v16.rs",
    "content": "//! 16-bit wide vector types\n\nuse crate::*;\n\nimpl_i!([i8; 2]: i8x2, m8x2 | i8, u8 | test_v16 | x0, x1 |\n        From: |\n        /// A 16-bit vector with 2 `i8` lanes.\n);\nimpl_u!([u8; 2]: u8x2, m8x2 | u8, u8 | test_v16 | x0, x1 |\n        From: |\n        /// A 16-bit vector with 2 `u8` lanes.\n);\nimpl_m!([m8; 2]: m8x2 | i8, u8 | test_v16 | x0, x1 |\n        From: m16x2, m32x2, m64x2, m128x2 |\n        /// A 16-bit vector mask with 2 `m8` lanes.\n);\n"
  },
  {
    "path": "src/v256.rs",
    "content": "//! 256-bit wide vector types\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_i!([i8; 32]: i8x32, m8x32 | i8, u32 | test_v256 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 |\n        From: |\n        /// A 256-bit vector with 32 `i8` lanes.\n);\nimpl_u!([u8; 32]: u8x32, m8x32 | u8, u32 | test_v256 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 |\n        From: |\n        /// A 256-bit vector with 32 `u8` lanes.\n);\nimpl_m!([m8; 32]: m8x32 | i8, u32 | test_v256 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 |\n        From:  |\n        /// A 256-bit vector mask with 32 `m8` lanes.\n);\n\nimpl_i!([i16; 16]: i16x16, m16x16 | i16, u16 | test_v256 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: i8x16, u8x16 |\n        /// A 256-bit vector with 16 `i16` lanes.\n);\nimpl_u!([u16; 16]: u16x16, m16x16 | u16, u16 | test_v256 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: u8x16 |\n        /// A 256-bit vector with 16 `u16` lanes.\n);\nimpl_m!([m16; 16]: m16x16 | i16, u16 | test_v256 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: m8x16 |\n        /// A 256-bit vector mask with 16 `m16` lanes.\n);\n\nimpl_i!([i32; 8]: i32x8, m32x8 | i32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7  |\n        From: i8x8, u8x8, i16x8, u16x8 |\n        /// A 256-bit vector with 8 `i32` lanes.\n);\nimpl_u!([u32; 8]: u32x8, m32x8 | u32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: u8x8, u16x8 |\n        /// A 256-bit vector with 8 `u32` 
lanes.\n);\nimpl_f!([f32; 8]: f32x8, m32x8 | f32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: i8x8, u8x8, i16x8, u16x8 |\n        /// A 256-bit vector with 8 `f32` lanes.\n);\nimpl_m!([m32; 8]: m32x8 | i32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: m8x8, m16x8 |\n        /// A 256-bit vector mask with 8 `m32` lanes.\n);\n\nimpl_i!([i64; 4]: i64x4, m64x4 | i64, u8 | test_v256 | x0, x1, x2, x3 |\n        From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4 |\n        /// A 256-bit vector with 4 `i64` lanes.\n);\nimpl_u!([u64; 4]: u64x4, m64x4 | u64, u8 | test_v256 | x0, x1, x2, x3 |\n        From: u8x4, u16x4, u32x4 |\n        /// A 256-bit vector with 4 `u64` lanes.\n);\nimpl_f!([f64; 4]: f64x4, m64x4 | f64 | test_v256 | x0, x1, x2, x3 |\n        From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4, f32x4 |\n        /// A 256-bit vector with 4 `f64` lanes.\n);\nimpl_m!([m64; 4]: m64x4 | i64, u8 | test_v256 | x0, x1, x2, x3 |\n        From: m8x4, m16x4, m32x4 |\n        /// A 256-bit vector mask with 4 `m64` lanes.\n);\n\nimpl_i!([i128; 2]: i128x2, m128x2 | i128, u8 | test_v256 | x0, x1 |\n        From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2, i64x2, u64x2 |\n        /// A 256-bit vector with 2 `i128` lanes.\n);\nimpl_u!([u128; 2]: u128x2, m128x2 | u128, u8 | test_v256 | x0, x1 |\n        From: u8x2, u16x2, u32x2, u64x2 |\n        /// A 256-bit vector with 2 `u128` lanes.\n);\nimpl_m!([m128; 2]: m128x2 | i128, u8 | test_v256 | x0, x1 |\n        From: m8x2, m16x2, m32x2, m64x2 |\n        /// A 256-bit vector mask with 2 `m128` lanes.\n);\n"
  },
  {
    "path": "src/v32.rs",
    "content": "//! 32-bit wide vector types\n\nuse crate::*;\n\nimpl_i!([i8; 4]: i8x4, m8x4 | i8, u8 | test_v32 | x0, x1, x2, x3 |\n        From: |\n        /// A 32-bit vector with 4 `i8` lanes.\n);\nimpl_u!([u8; 4]: u8x4, m8x4 | u8, u8 | test_v32 | x0, x1, x2, x3 |\n        From: |\n        /// A 32-bit vector with 4 `u8` lanes.\n);\nimpl_m!([m8; 4]: m8x4 | i8, u8 | test_v32 | x0, x1, x2, x3 |\n        From: m16x4, m32x4, m64x4 |\n        /// A 32-bit vector mask with 4 `m8` lanes.\n);\n\nimpl_i!([i16; 2]: i16x2, m16x2 | i16, u8 | test_v32 | x0, x1 |\n        From: i8x2, u8x2 |\n        /// A 32-bit vector with 2 `i16` lanes.\n);\nimpl_u!([u16; 2]: u16x2, m16x2 | u16, u8 | test_v32 | x0, x1 |\n        From: u8x2 |\n        /// A 32-bit vector with 2 `u16` lanes.\n);\nimpl_m!([m16; 2]: m16x2 | i16, u8 | test_v32 | x0, x1 |\n        From: m8x2, m32x2, m64x2, m128x2 |\n        /// A 32-bit vector mask with 2 `m16` lanes.\n);\n"
  },
  {
    "path": "src/v512.rs",
    "content": "//! 512-bit wide vector types\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_i!([i8; 64]: i8x64, m8x64 | i8, u64 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31,\n        x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47,\n        x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 |\n        From: |\n        /// A 512-bit vector with 64 `i8` lanes.\n);\nimpl_u!([u8; 64]: u8x64, m8x64 | u8, u64 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31,\n        x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47,\n        x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 |\n        From: |\n        /// A 512-bit vector with 64 `u8` lanes.\n);\nimpl_m!([m8; 64]: m8x64 | i8, u64 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31,\n        x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47,\n        x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 |\n        From:  |\n        /// A 512-bit vector mask with 64 `m8` lanes.\n);\n\nimpl_i!([i16; 32]: i16x32, m16x32 | i16, u32 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 |\n        From: i8x32, u8x32 |\n        /// A 512-bit vector with 32 `i16` lanes.\n);\nimpl_u!([u16; 32]: u16x32, m16x32 | u16, u32 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, 
x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 |\n        From: u8x32 |\n        /// A 512-bit vector with 32 `u16` lanes.\n);\nimpl_m!([m16; 32]: m16x32 | i16, u32 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15,\n        x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 |\n        From: m8x32 |\n        /// A 512-bit vector mask with 32 `m16` lanes.\n);\n\nimpl_i!([i32; 16]: i32x16, m32x16 | i32, u16 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: i8x16, u8x16, i16x16, u16x16 |\n        /// A 512-bit vector with 16 `i32` lanes.\n);\nimpl_u!([u32; 16]: u32x16, m32x16 | u32, u16 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: u8x16, u16x16 |\n        /// A 512-bit vector with 16 `u32` lanes.\n);\nimpl_f!([f32; 16]: f32x16, m32x16 | f32 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: i8x16, u8x16, i16x16, u16x16 |\n        /// A 512-bit vector with 16 `f32` lanes.\n);\nimpl_m!([m32; 16]: m32x16 | i32, u16 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 |\n        From: m8x16, m16x16 |\n        /// A 512-bit vector mask with 16 `m32` lanes.\n);\n\nimpl_i!([i64; 8]: i64x8, m64x8 | i64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: i8x8, u8x8, i16x8, u16x8, i32x8, u32x8 |\n        /// A 512-bit vector with 8 `i64` lanes.\n);\nimpl_u!([u64; 8]: u64x8, m64x8 | u64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: u8x8, u16x8, u32x8 |\n        /// A 512-bit vector with 8 `u64` lanes.\n);\nimpl_f!([f64; 8]: f64x8, m64x8 | f64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: i8x8, u8x8, i16x8, u16x8, i32x8, u32x8, f32x8 |\n        /// A 512-bit vector with 8 `f64` lanes.\n);\nimpl_m!([m64; 8]: m64x8 | i64, u8 | test_v512 | x0, x1, 
x2, x3, x4, x5, x6, x7 |\n        From: m8x8, m16x8, m32x8 |\n        /// A 512-bit vector mask with 8 `m64` lanes.\n);\n\nimpl_i!([i128; 4]: i128x4, m128x4 | i128, u8 | test_v512 | x0, x1, x2, x3 |\n        From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4, i64x4, u64x4 |\n        /// A 512-bit vector with 4 `i128` lanes.\n);\nimpl_u!([u128; 4]: u128x4, m128x4 | u128, u8 | test_v512 | x0, x1, x2, x3 |\n        From: u8x4, u16x4, u32x4, u64x4 |\n        /// A 512-bit vector with 4 `u128` lanes.\n);\nimpl_m!([m128; 4]: m128x4 | i128, u8 | test_v512 | x0, x1, x2, x3 |\n        From: m8x4, m16x4, m32x4, m64x4 |\n        /// A 512-bit vector mask with 4 `m128` lanes.\n);\n"
  },
  {
    "path": "src/v64.rs",
    "content": "//! 64-bit wide vector types\n#[rustfmt::skip]\n\nuse super::*;\n\nimpl_i!([i8; 8]: i8x8, m8x8 | i8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: |\n        /// A 64-bit vector with 8 `i8` lanes.\n);\nimpl_u!([u8; 8]: u8x8, m8x8 | u8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: |\n        /// A 64-bit vector with 8 `u8` lanes.\n);\nimpl_m!([m8; 8]: m8x8 | i8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: m16x8, m32x8 |\n        /// A 64-bit vector mask with 8 `m8` lanes.\n);\n\nimpl_i!([i16; 4]: i16x4, m16x4 | i16, u8 | test_v64 | x0, x1, x2, x3 |\n        From: i8x4, u8x4 |\n        /// A 64-bit vector with 4 `i16` lanes.\n);\nimpl_u!([u16; 4]: u16x4, m16x4 | u16, u8 | test_v64 | x0, x1, x2, x3 |\n        From: u8x4 |\n        /// A 64-bit vector with 4 `u16` lanes.\n);\nimpl_m!([m16; 4]: m16x4 | i16, u8 | test_v64 | x0, x1, x2, x3 |\n        From: m8x4, m32x4, m64x4 |\n        /// A 64-bit vector mask with 4 `m16` lanes.\n);\n\nimpl_i!([i32; 2]: i32x2, m32x2 | i32, u8 | test_v64 | x0, x1 |\n        From: i8x2, u8x2, i16x2, u16x2 |\n        /// A 64-bit vector with 2 `i32` lanes.\n);\nimpl_u!([u32; 2]: u32x2, m32x2 | u32, u8 | test_v64 | x0, x1 |\n        From: u8x2, u16x2 |\n        /// A 64-bit vector with 2 `u32` lanes.\n);\nimpl_m!([m32; 2]: m32x2 | i32, u8 | test_v64 | x0, x1 |\n        From: m8x2, m16x2, m64x2, m128x2 |\n        /// A 64-bit vector mask with 2 `m32` lanes.\n);\nimpl_f!([f32; 2]: f32x2, m32x2 | f32 | test_v64 | x0, x1 |\n        From: i8x2, u8x2, i16x2, u16x2 |\n        /// A 64-bit vector with 2 `f32` lanes.\n);\n\n/*\nimpl_i!([i64; 1]: i64x1, m64x1 | i64, u8 | test_v64 | x0 |\n        From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1*/ |  // FIXME: primitive to vector conversion\n        /// A 64-bit vector with 1 `i64` lanes.\n);\nimpl_u!([u64; 1]: u64x1, m64x1 | u64, u8 | test_v64 | x0 |\n        From: /*u8x1, u16x1, u32x1*/ | // FIXME: primitive to vector conversion\n  
      /// A 64-bit vector with 1 `u64` lanes.\n);\nimpl_m!([m64; 1]: m64x1 | i64, u8 | test_v64 | x0 |\n        From: /*m8x1, m16x1, m32x1, */ m128x1 | // FIXME: unary small vector types\n        /// A 64-bit vector mask with 1 `m64` lanes.\n);\nimpl_f!([f64; 1]: f64x1, m64x1 | f64 | test_v64 | x0 |\n        From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1, f32x1*/ | // FIXME: unary small vector types\n        /// A 64-bit vector with 1 `f64` lanes.\n);\n*/\n"
  },
  {
    "path": "src/vPtr.rs",
    "content": "//! Vectors of pointers\n#[rustfmt::skip]\n\nuse crate::*;\n\nimpl_const_p!(\n    [*const T; 2]: cptrx2, msizex2, usizex2, isizex2 | test_v128 | x0, x1 | From: |\n    /// A vector with 2 `*const T` lanes\n);\n\nimpl_mut_p!(\n    [*mut T; 2]: mptrx2, msizex2, usizex2, isizex2 | test_v128 | x0, x1 | From: |\n    /// A vector with 2 `*mut T` lanes\n);\n\nimpl_const_p!(\n    [*const T; 4]: cptrx4, msizex4, usizex4, isizex4 | test_v256 | x0, x1, x2, x3 | From: |\n    /// A vector with 4 `*const T` lanes\n);\n\nimpl_mut_p!(\n    [*mut T; 4]: mptrx4, msizex4, usizex4, isizex4 | test_v256 | x0, x1, x2, x3 | From: |\n    /// A vector with 4 `*mut T` lanes\n);\n\nimpl_const_p!(\n    [*const T; 8]: cptrx8, msizex8, usizex8, isizex8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: |\n    /// A vector with 8 `*const T` lanes\n);\n\nimpl_mut_p!(\n    [*mut T; 8]: mptrx8, msizex8, usizex8, isizex8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: |\n    /// A vector with 8 `*mut T` lanes\n);\n"
  },
  {
    "path": "src/vSize.rs",
    "content": "//! Vectors with pointer-sized elements\n\nuse crate::codegen::pointer_sized_int::{isize_, usize_};\nuse crate::*;\n\nimpl_i!([isize; 2]: isizex2, msizex2 | isize_, u8 | test_v128 |\n        x0, x1|\n        From: |\n        /// A vector with 2 `isize` lanes.\n);\n\nimpl_u!([usize; 2]: usizex2, msizex2 | usize_, u8 | test_v128 |\n        x0, x1|\n        From: |\n        /// A vector with 2 `usize` lanes.\n);\nimpl_m!([msize; 2]: msizex2 | isize_, u8 | test_v128 |\n        x0, x1 |\n        From: |\n        /// A vector mask with 2 `msize` lanes.\n);\n\nimpl_i!([isize; 4]: isizex4, msizex4 | isize_, u8 | test_v256 |\n        x0, x1, x2, x3 |\n        From: |\n        /// A vector with 4 `isize` lanes.\n);\nimpl_u!([usize; 4]: usizex4, msizex4 | usize_, u8 | test_v256 |\n        x0, x1, x2, x3|\n        From: |\n        /// A vector with 4 `usize` lanes.\n);\nimpl_m!([msize; 4]: msizex4 | isize_, u8 | test_v256 |\n        x0, x1, x2, x3 |\n        From: |\n        /// A vector mask with 4 `msize` lanes.\n);\n\nimpl_i!([isize; 8]: isizex8, msizex8 | isize_, u8 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: |\n        /// A vector with 8 `isize` lanes.\n);\nimpl_u!([usize; 8]: usizex8, msizex8 | usize_, u8 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: |\n        /// A vector with 8 `usize` lanes.\n);\nimpl_m!([msize; 8]: msizex8 | isize_, u8 | test_v512 |\n        x0, x1, x2, x3, x4, x5, x6, x7 |\n        From: |\n        /// A vector mask with 8 `msize` lanes.\n);\n"
  },
  {
    "path": "tests/endianness.rs",
    "content": "#[cfg(target_arch = \"wasm32\")]\nuse wasm_bindgen_test::*;\n\nuse packed_simd::*;\nuse std::{mem, slice};\n\n#[cfg_attr(not(target_arch = \"wasm32\"), test)]\n#[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\nfn endian_indexing() {\n    let v = i32x4::new(0, 1, 2, 3);\n    assert_eq!(v.extract(0), 0);\n    assert_eq!(v.extract(1), 1);\n    assert_eq!(v.extract(2), 2);\n    assert_eq!(v.extract(3), 3);\n}\n\n#[cfg_attr(not(target_arch = \"wasm32\"), test)]\n#[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\nfn endian_bitcasts() {\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let t: i16x8 = unsafe { mem::transmute(x) };\n    let e: i16x8 = if cfg!(target_endian = \"little\") {\n        i16x8::new(256, 770, 1284, 1798, 2312, 2826, 3340, 3854)\n    } else {\n        i16x8::new(1, 515, 1029, 1543, 2057, 2571, 3085, 3599)\n    };\n    assert_eq!(t, e);\n}\n\n#[cfg_attr(not(target_arch = \"wasm32\"), test)]\n#[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\nfn endian_casts() {\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let t: i16x16 = x.into(); // simd_cast\n    #[rustfmt::skip]\n    let e = i16x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    assert_eq!(t, e);\n}\n\n#[cfg_attr(not(target_arch = \"wasm32\"), test)]\n#[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\nfn endian_load_and_stores() {\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let mut y: [i16; 8] = [0; 8];\n    x.write_to_slice_unaligned(unsafe { slice::from_raw_parts_mut(&mut y as *mut _ as *mut i8, 16) });\n\n    let e: [i16; 8] = if cfg!(target_endian = \"little\") {\n        [256, 770, 1284, 1798, 2312, 2826, 3340, 3854]\n    } else {\n        [1, 515, 1029, 
1543, 2057, 2571, 3085, 3599]\n    };\n    assert_eq!(y, e);\n\n    let z = i8x16::from_slice_unaligned(unsafe { slice::from_raw_parts(&y as *const _ as *const i8, 16) });\n    assert_eq!(z, x);\n}\n\n#[cfg_attr(not(target_arch = \"wasm32\"), test)]\n#[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\nfn endian_array_union() {\n    union A {\n        data: [f32; 4],\n        vec: f32x4,\n    }\n    let x: [f32; 4] = unsafe { A { vec: f32x4::new(0., 1., 2., 3.) }.data };\n    // As all of these are integer values within the mantissa^1 range, it\n    // would be very unusual for them to actually fail to compare.\n    #[allow(clippy::float_cmp)]\n    {\n        assert_eq!(x[0], 0_f32);\n        assert_eq!(x[1], 1_f32);\n        assert_eq!(x[2], 2_f32);\n        assert_eq!(x[3], 3_f32);\n    }\n    let y: f32x4 = unsafe { A { data: [3., 2., 1., 0.] }.vec };\n    assert_eq!(y, f32x4::new(3., 2., 1., 0.));\n\n    union B {\n        data: [i8; 16],\n        vec: i8x16,\n    }\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let x: [i8; 16] = unsafe { B { vec: x }.data };\n\n    for (i, v) in x.iter().enumerate() {\n        assert_eq!(i as i8, *v);\n    }\n\n    #[rustfmt::skip]\n    let y = [\n        15, 14, 13, 12, 11, 19, 9, 8,\n        7, 6, 5, 4, 3, 2, 1, 0\n    ];\n    #[rustfmt::skip]\n    let e = i8x16::new(\n        15, 14, 13, 12, 11, 19, 9, 8,\n        7, 6, 5, 4, 3, 2, 1, 0\n    );\n    let z = unsafe { B { data: y }.vec };\n    assert_eq!(z, e);\n\n    union C {\n        data: [i16; 8],\n        vec: i8x16,\n    }\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let x: [i16; 8] = unsafe { C { vec: x }.data };\n\n    let e: [i16; 8] = if cfg!(target_endian = \"little\") {\n        [256, 770, 1284, 1798, 2312, 2826, 3340, 3854]\n    } else {\n        [1, 515, 1029, 1543, 2057, 2571, 3085, 
3599]\n    };\n    assert_eq!(x, e);\n}\n\n#[cfg_attr(not(target_arch = \"wasm32\"), test)]\n#[cfg_attr(target_arch = \"wasm32\", wasm_bindgen_test)]\nfn endian_tuple_access() {\n    type F32x4T = (f32, f32, f32, f32);\n    union A {\n        data: F32x4T,\n        vec: f32x4,\n    }\n    let x: F32x4T = unsafe { A { vec: f32x4::new(0., 1., 2., 3.) }.data };\n    // As all of these are integer values within the mantissa^1 range, it\n    // would be very unusual for them to actually fail to compare.\n    #[allow(clippy::float_cmp)]\n    {\n        assert_eq!(x.0, 0_f32);\n        assert_eq!(x.1, 1_f32);\n        assert_eq!(x.2, 2_f32);\n        assert_eq!(x.3, 3_f32);\n    }\n    let y: f32x4 = unsafe { A { data: (3., 2., 1., 0.) }.vec };\n    assert_eq!(y, f32x4::new(3., 2., 1., 0.));\n\n    #[rustfmt::skip]\n    type I8x16T = (i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8);\n    union B {\n        data: I8x16T,\n        vec: i8x16,\n    }\n\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let x: I8x16T = unsafe { B { vec: x }.data };\n\n    assert_eq!(x.0, 0);\n    assert_eq!(x.1, 1);\n    assert_eq!(x.2, 2);\n    assert_eq!(x.3, 3);\n    assert_eq!(x.4, 4);\n    assert_eq!(x.5, 5);\n    assert_eq!(x.6, 6);\n    assert_eq!(x.7, 7);\n    assert_eq!(x.8, 8);\n    assert_eq!(x.9, 9);\n    assert_eq!(x.10, 10);\n    assert_eq!(x.11, 11);\n    assert_eq!(x.12, 12);\n    assert_eq!(x.13, 13);\n    assert_eq!(x.14, 14);\n    assert_eq!(x.15, 15);\n\n    #[rustfmt::skip]\n    let y = (\n        15, 14, 13, 12, 11, 10, 9, 8,\n        7, 6, 5, 4, 3, 2, 1, 0\n    );\n    let z: i8x16 = unsafe { B { data: y }.vec };\n    #[rustfmt::skip]\n    let e = i8x16::new(\n        15, 14, 13, 12, 11, 10, 9, 8,\n        7, 6, 5, 4, 3, 2, 1, 0\n    );\n    assert_eq!(e, z);\n\n    #[rustfmt::skip]\n    type I16x8T = (i16, i16, i16, i16, i16, i16, i16, i16);\n    union C {\n        data: 
I16x8T,\n        vec: i8x16,\n    }\n\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let x: I16x8T = unsafe { C { vec: x }.data };\n\n    let e: [i16; 8] = if cfg!(target_endian = \"little\") {\n        [256, 770, 1284, 1798, 2312, 2826, 3340, 3854]\n    } else {\n        [1, 515, 1029, 1543, 2057, 2571, 3085, 3599]\n    };\n    assert_eq!(x.0, e[0]);\n    assert_eq!(x.1, e[1]);\n    assert_eq!(x.2, e[2]);\n    assert_eq!(x.3, e[3]);\n    assert_eq!(x.4, e[4]);\n    assert_eq!(x.5, e[5]);\n    assert_eq!(x.6, e[6]);\n    assert_eq!(x.7, e[7]);\n\n    #[rustfmt::skip]\n    #[repr(C)]\n    #[derive(Copy ,Clone)]\n    pub struct Tup(pub i8, pub i8, pub i16, pub i8, pub i8, pub i16,\n                   pub i8, pub i8, pub i16, pub i8, pub i8, pub i16);\n\n    union D {\n        data: Tup,\n        vec: i8x16,\n    }\n\n    #[rustfmt::skip]\n    let x = i8x16::new(\n        0, 1, 2, 3, 4, 5, 6, 7,\n        8, 9, 10, 11, 12, 13, 14, 15,\n    );\n    let x: Tup = unsafe { D { vec: x }.data };\n\n    let e: [i16; 12] = if cfg!(target_endian = \"little\") {\n        [0, 1, 770, 4, 5, 1798, 8, 9, 2826, 12, 13, 3854]\n    } else {\n        [0, 1, 515, 4, 5, 1543, 8, 9, 2571, 12, 13, 3599]\n    };\n    assert_eq!(x.0 as i16, e[0]);\n    assert_eq!(x.1 as i16, e[1]);\n    assert_eq!(x.2 as i16, e[2]);\n    assert_eq!(x.3 as i16, e[3]);\n    assert_eq!(x.4 as i16, e[4]);\n    assert_eq!(x.5 as i16, e[5]);\n    assert_eq!(x.6 as i16, e[6]);\n    assert_eq!(x.7 as i16, e[7]);\n    assert_eq!(x.8 as i16, e[8]);\n    assert_eq!(x.9 as i16, e[9]);\n    assert_eq!(x.10 as i16, e[10]);\n    assert_eq!(x.11 as i16, e[11]);\n}\n"
  },
  {
    "path": "verify/verify/Cargo.toml",
    "content": "[package]\nname = \"verify\"\nversion = \"0.1.0\"\nauthors = [\"gnzlbg <gonzalobg88@gmail.com>\"]\nedition = \"2018\"\n\n[dev-dependencies]\nstdarch-test = { git = \"https://github.com/rust-lang/stdarch.git\"  }\npacked_simd = { package = \"packed_simd\", path = \"../..\" }\ncfg-if = \"^0.1\"\npaste = \"^0.1.3\"\n"
  },
  {
    "path": "verify/verify/readme.md",
    "content": "# Machine code verification\n\n## Quick start\n\nTo run the verification tests run:\n\n```\ncargo test --release\n```\n\non this crate, optionally passing the required target features via `RUSTFLAGS`.\nFor example, `RUSTFLAGS=\"-C target-feature=+avx2\"`.\n\nThis crate only contains tests, and the tests only run in `--release` mode.\nTherefore, building this crate with anything different from `cargo test\n--release` does not make much sense.\n\n## How it works\n\nThis crate verifies the machine code generated for some of the portable packed\nvector APIs by disassembling the API at run-time and comparing the machine code\ngenerated against the desired one for a particular target and target features.\n\nThis is done by using the\n[`stdarch-test`](https://github.com/rust-lang/stdarch/tree/master/crates/stdarch-test)\ncrate, which exposes the `assert_instr` procedural macro. It is used like this:\n\n```rust\n// The verification functions must be #[inline]: \n#[inline]\n// Enable the target features required for the desired code generation\n// on the different targets:\n#[cfg_attr(\n    any(target_arch = \"x86\", target_arch = \"x86_64\"),\n    target_feature(enable = \"avx512f,avx512vl\")\n)]\n// Check that the disassembly contains a particular instruction:\n#[cfg_attr(\n    any(target_arch = \"x86\", target_arch = \"x86_64\"),\n    assert_instr(vpro)\n)]\nunsafe fn rotate_right_variable(x: u64x8, v: u64x8) -> u64x8 {\n    x.rotate_right(v)\n}\n```\n\nThe `assert_instr` procedural macro creates a test that contains a\n`#[inline(never)]` function that calls the API. It then gets a function pointer\nto this function, and calls `stdarch_test::assert` with it, the function name,\nand the expected assembly instruction. `stdarch_test` uses `objdump` or similar\nto disassemble itself, it then looks for the function address and name in the\ndisassembly, and verifies that the machine code for the function contains the\ninstruction.\n"
  },
  {
    "path": "verify/verify/rust-toolchain",
    "content": "nightly"
  },
  {
    "path": "verify/verify/src/api/math/float/mod.rs",
    "content": "mod mul_add;\n"
  },
  {
    "path": "verify/verify/src/api/math/float/mul_add.rs",
    "content": "#[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))]\nmod x86 {\n    mod f32x4 {\n        #![allow(unused)]\n        use packed_simd::*;\n        use stdarch_test::assert_instr;\n\n        #[inline]\n        #[target_feature(enable = \"sse,fma\")]\n        #[assert_instr(vfmadd)]\n        unsafe fn fused_multiply_add(a: f32x4, b: f32x4, c: f32x4) -> f32x4 {\n            a.mul_add(b, c)\n        }\n\n        #[inline]\n        #[target_feature(enable = \"sse,fma\")]\n        #[assert_instr(vfmsub)]\n        unsafe fn fused_multiply_sub(a: f32x4, b: f32x4, c: f32x4) -> f32x4 {\n            a.mul_add(b, -c)\n        }\n\n        #[inline]\n        #[target_feature(enable = \"sse,fma\")]\n        #[assert_instr(vfnmadd)]\n        unsafe fn fused_negate_multiply_add(\n            a: f32x4, b: f32x4, c: f32x4,\n        ) -> f32x4 {\n            a.mul_add(-b, c)\n        }\n\n        #[inline]\n        #[target_feature(enable = \"sse,fma\")]\n        #[assert_instr(vfnmsub)]\n        unsafe fn fused_negate_multiply_sub(\n            a: f32x4, b: f32x4, c: f32x4,\n        ) -> f32x4 {\n            a.mul_add(-b, -c)\n        }\n\n        #[inline]\n        #[target_feature(enable = \"sse,fma\")]\n        #[assert_instr(vfmaddsub)]\n        unsafe fn fused_multiply_add_sub(\n            a: f32x4, b: f32x4, c: f32x4,\n        ) -> f32x4 {\n            let add = a.mul_add(b, c);\n            let sub = a.mul_add(b, -c);\n\n            m32x4::new(false, true, false, true).select(add, sub)\n        }\n\n        #[inline]\n        #[target_feature(enable = \"sse,fma\")]\n        #[assert_instr(vfmsubadd)]\n        unsafe fn fused_multiply_sub_add(\n            a: f32x4, b: f32x4, c: f32x4,\n        ) -> f32x4 {\n            let add = a.mul_add(b, c);\n            let sub = a.mul_add(b, -c);\n\n            m32x4::new(true, false, true, false).select(add, sub)\n        }\n    }\n}\n"
  },
  {
    "path": "verify/verify/src/api/math.rs",
    "content": "mod float;\n"
  },
  {
    "path": "verify/verify/src/api/ops/vector_rotates/x86.rs",
    "content": "mod u64x8 {\n    #![allow(unused)]\n    use packed_simd::*;\n    use stdarch_test::assert_instr;\n\n    #[inline]\n    #[target_feature(enable = \"avx512f\")]\n    #[assert_instr(vpro)]\n    unsafe fn rotate_right_variable(x: u64x8, v: u64x8) -> u64x8 {\n        x.rotate_right(v)\n    }\n\n    #[inline]\n    #[target_feature(enable = \"avx512f\")]\n    #[assert_instr(vpro)]\n    unsafe fn rotate_left_variable(x: u64x8, v: u64x8) -> u64x8 {\n        x.rotate_left(v)\n    }\n\n    #[inline]\n    #[target_feature(enable = \"avx512f\")]\n    #[assert_instr(vpro)]\n    unsafe fn rotate_right(x: u64x8) -> u64x8 {\n        x.rotate_right(u64x8::splat(12))\n    }\n\n    #[inline]\n    #[target_feature(enable = \"avx512f\")]\n    #[assert_instr(vpro)]\n    unsafe fn rotate_left(x: u64x8) -> u64x8 {\n        x.rotate_left(u64x8::splat(12))\n    }\n\n    #[inline]\n    #[target_feature(enable = \"avx512f,avx512vl\")]\n    #[assert_instr(vpro)]\n    unsafe fn rotate_left_x2(x: u64x2) -> u64x2 {\n        x.rotate_left(u64x2::splat(12))\n    }\n}\n"
  },
  {
    "path": "verify/verify/src/api/ops/vector_rotates.rs",
    "content": "use cfg_if::cfg_if;\n\ncfg_if! {\n    if #[cfg(any(target_arch = \"x86\", target_arch = \"x86_64\"))] {\n        mod x86;\n    }\n}\n"
  },
  {
    "path": "verify/verify/src/api/ops.rs",
    "content": "mod vector_rotates;\n"
  },
  {
    "path": "verify/verify/src/api/reductions/mask/avx.rs",
    "content": "//! Verification of the mask reduction API for `x86`/`x86_64`+`AVX`\n\nuse packed_simd::*;\nuse stdarch_test::assert_instr;\n\nmacro_rules! verify {\n    ($id:ident => $instr:tt) => {\n        verify_mask!($id[\"avx\"] => $instr);\n    }\n}\n\n// 128-bit wide:\nverify!(m8x16 => vpmovmskb);\nverify!(m16x8 => vpmovmskb);\nverify!(m32x4 => vmovmskps);\nverify!(m64x2 => vmovmskpd);\n// FIXME: verify!(m128x1 => vmovmskpd);\n\n// 256-bit wide:\nverify!(m8x32 => vptest);\nverify!(m16x16 => vptest);\nverify!(m32x8 => vmovmskps);\nverify!(m64x4 => vmovmskpd);\n// FIXME: verify!(m128x2 => vmovmskpd);\n\n// FIXME: 512-bit wide masks\n"
  },
  {
    "path": "verify/verify/src/api/reductions/mask/avx2.rs",
    "content": "//! Verification of the mask reduction API for `x86`/`x86_64`+`AVX2`\n\nuse packed_simd::*;\nuse stdarch_test::assert_instr;\n\nmacro_rules! verify {\n    ($id:ident => $instr:tt) => {\n        verify_mask!($id[\"avx2\"] => $instr);\n    }\n}\n\n// 128-bit wide:\nverify!(m8x16 => vpmovmskb);\nverify!(m16x8 => vpmovmskb);\nverify!(m32x4 => vmovmskps);\nverify!(m64x2 => vmovmskpd);\n// FIXME: verify!(m128x1 => vmovmskpd);\n\n// 256-bit wide:\nverify!(m8x32 => vpmovmskb);\nverify!(m16x16 => vpmovmskb);\nverify!(m32x8 => vmovmskps);\nverify!(m64x4 => vmovmskpd);\n// FIXME: verify!(m128x2 => vmovmskpd);\n\n// FIXME: 512-bit wide masks\n"
  },
  {
    "path": "verify/verify/src/api/reductions/mask/sse.rs",
    "content": "//! Verification of the mask reduction API for `x86`/`x86_64`+`SSE`\n\n#![allow(unused)]\nuse packed_simd::*;\nuse stdarch_test::assert_instr;\n\nmacro_rules! verify {\n    ($id:ident => $instr:tt) => {\n        verify_mask!($id[\"sse\"] => $instr);\n    }\n}\n\n// 128-bit wide:\nverify!(m32x4 => movmskps);\nverify!(m64x2 => movmskps);\n// FIXME: verify!(m128x1 => movmskps);\n\n// 256-bit wide:\nverify!(m32x8 => movmskps);\nverify!(m64x4 => movmskps);\n// FIXME: verify!(m128x2 => movmskps);\n\n// FIXME: 512-bit wide masks\n"
  },
  {
    "path": "verify/verify/src/api/reductions/mask/sse2.rs",
    "content": "//! Verification of the mask reduction API for `x86`/`x86_64`+`SSE2`\n\nuse packed_simd::*;\nuse stdarch_test::assert_instr;\n\nmacro_rules! verify {\n    ($id:ident => $instr:tt) => {\n        verify_mask!($id[\"sse2\"] => $instr);\n    }\n}\n\n// 128-bit wide:\nverify!(m8x16 => pmovmskb);\nverify!(m16x8 => pmovmskb);\nverify!(m32x4 => movmskps);\nverify!(m64x2 => movmskpd);\n// FIXME: verify!(m128x1 => movmskpd);\n\n// 256-bit wide:\nverify!(m8x32 => pmovmskb);\nverify!(m16x16 => pmovmskb);\nverify!(m32x8 => movmskps);\nverify!(m64x4 => movmskpd);\n// FIXME: verify!(m128x2 => movmskpd);\n\n// FIXME: 512-bit wide masks\n"
  },
  {
    "path": "verify/verify/src/api/reductions/mask.rs",
    "content": "//! Verify the mask reduction API.\n\nuse cfg_if::cfg_if;\n\n#[allow(unused)]\nmacro_rules! verify_mask {\n    ($mask_id:ident[$target_feature:tt] => $all_instr:tt, $any_instr:tt,\n     $none_instr:tt) => {\n        paste::item! {\n            #[inline]\n            #[target_feature(enable = $target_feature)]\n            #[assert_instr($all_instr)]\n            pub unsafe fn [<$mask_id _all>](x: $mask_id) -> bool {\n                x.all()\n            }\n            #[inline]\n            #[target_feature(enable = $target_feature)]\n            #[assert_instr($any_instr)]\n            pub unsafe fn [<$mask_id _any>](x: $mask_id) -> bool {\n                x.any()\n            }\n            #[inline]\n            #[target_feature(enable = $target_feature)]\n            #[assert_instr($none_instr)]\n            pub unsafe fn [<$mask_id _none>](x: $mask_id) -> bool {\n                x.none()\n            }\n        }\n    };\n    ($mask_id:ident[$target_feature:tt] => $instr:tt) => {\n        verify_mask!($mask_id[$target_feature] => $instr, $instr, $instr);\n    };\n}\n\ncfg_if! {\n    if #[cfg(all(any(target_arch = \"x86\", target_arch = \"x86_64\")),\n             target_feature = \"sse\")] {\n        // FIXME: avx512\n        #[cfg(all(not(target_feature = \"avx512f\"), target_feature = \"avx2\"))]\n        mod avx2;\n        #[cfg(all(not(target_feature = \"avx2\"), target_feature = \"avx\"))]\n        mod avx;\n        #[cfg(all(not(target_feature = \"avx\"), target_feature = \"sse2\"))]\n        mod sse2;\n        #[cfg(all(not(target_feature = \"sse2\"), target_feature = \"sse\"))]\n        mod sse;\n    }\n}\n"
  },
  {
    "path": "verify/verify/src/api/reductions.rs",
    "content": "mod mask;\n"
  },
  {
    "path": "verify/verify/src/api.rs",
    "content": "use cfg_if::cfg_if;\n\ncfg_if! {\n    if #[cfg(debug_assertions)] {\n        compile_error!(\"the verify tests only run in --release mode\");\n    }\n}\n\nmod math;\nmod ops;\nmod reductions;\n"
  },
  {
    "path": "verify/verify/src/lib.rs",
    "content": "// FIXME: these types are unsound in C FFI already\n// See https://github.com/rust-lang/rust/issues/53346\n#![allow(improper_ctypes_definitions)]\n#![deny(rust_2018_idioms)]\n#![cfg_attr(test, feature(avx512_target_feature, abi_vectorcall))]\n\n#[cfg(test)]\nmod api;\n"
  }
]